Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 9 additions & 17 deletions invokeai/app/invocations/facetools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import cv2
import numpy as np
from mediapipe.python.solutions.face_mesh import FaceMesh # type: ignore[import]
from PIL import Image, ImageDraw, ImageFilter, ImageFont, ImageOps
from PIL.Image import Image as ImageType
from pydantic import field_validator
Expand All @@ -20,6 +19,7 @@
from invokeai.app.invocations.primitives import ImageOutput
from invokeai.app.services.image_records.image_records_common import ImageCategory
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.image_util.mediapipe_face.mediapipe_face_common import detect_face_landmarks


@invocation_output("face_mask_output")
Expand Down Expand Up @@ -194,23 +194,15 @@ def generate_face_box_mask(
# Convert RGBA to RGB by removing the alpha channel.
np_image = np_image[:, :, :3]

# Create a FaceMesh object for face landmark detection and mesh generation.
face_mesh = FaceMesh(
max_num_faces=999,
min_detection_confidence=minimum_confidence,
min_tracking_confidence=minimum_confidence,
)

# Detect the face landmarks and mesh in the input image.
results = face_mesh.process(np_image)
results = detect_face_landmarks(np_image, max_faces=999, min_confidence=minimum_confidence)

# Check if any face is detected.
if results.multi_face_landmarks: # type: ignore # this are via protobuf and not typed
if results:
# Search for the face_id in the detected faces.
for _face_id, face_landmarks in enumerate(results.multi_face_landmarks): # type: ignore #this are via protobuf and not typed
for _face_id, face_landmarks in enumerate(results):
# Get the bounding box of the face mesh.
x_coordinates = [landmark.x for landmark in face_landmarks.landmark]
y_coordinates = [landmark.y for landmark in face_landmarks.landmark]
x_coordinates = [landmark.x for landmark in face_landmarks]
y_coordinates = [landmark.y for landmark in face_landmarks]
x_min, x_max = min(x_coordinates), max(x_coordinates)
y_min, y_max = min(y_coordinates), max(y_coordinates)

Expand All @@ -219,13 +211,13 @@ def generate_face_box_mask(
mesh_height = int((y_max - y_min) * np_image.shape[0])

# Get the center of the face.
x_center = np.mean([landmark.x * np_image.shape[1] for landmark in face_landmarks.landmark])
y_center = np.mean([landmark.y * np_image.shape[0] for landmark in face_landmarks.landmark])
x_center = np.mean([landmark.x * np_image.shape[1] for landmark in face_landmarks])
y_center = np.mean([landmark.y * np_image.shape[0] for landmark in face_landmarks])

face_landmark_points = np.array(
[
[landmark.x * np_image.shape[1], landmark.y * np_image.shape[0]]
for landmark in face_landmarks.landmark
for landmark in face_landmarks
]
)

Expand Down
Binary file not shown.
269 changes: 178 additions & 91 deletions invokeai/backend/image_util/mediapipe_face/mediapipe_face_common.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,32 @@
from typing import Mapping
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Mapping, Sequence, TypeAlias

import cv2
import mediapipe as mp
import numpy
import numpy.typing as npt
from mediapipe.tasks import python as _mp_python # type: ignore[import]
from mediapipe.tasks.python import vision as _vision # type: ignore[import]
from mediapipe.tasks.python.components.containers.landmark import NormalizedLandmark # type: ignore[import]

mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_face_detection = mp.solutions.face_detection # Only for counting faces.
mp_face_mesh = mp.solutions.face_mesh
mp_face_connections = mp.solutions.face_mesh_connections.FACEMESH_TESSELATION
mp_hand_connections = mp.solutions.hands_connections.HAND_CONNECTIONS
mp_body_connections = mp.solutions.pose_connections.POSE_CONNECTIONS
mp_python: Any = _mp_python
vision: Any = _vision

DrawingSpec = mp.solutions.drawing_styles.DrawingSpec
PoseLandmark = mp.solutions.drawing_styles.PoseLandmark

@dataclass(frozen=True)
class DrawingSpec:
    """Immutable bundle of the colour and sizes used when drawing one annotation element."""

    # Three-int colour tuple; handed to cv2.line and written directly into image pixels.
    color: tuple[int, int, int]
    # Line thickness, forwarded to cv2.line.
    thickness: int
    # Radius setting carried with the spec; the drawing helpers visible here do not read it.
    circle_radius: int


# Type aliases shared by the detection and drawing helpers in this module. All four are declared
# with TypeAlias so that type checkers treat them uniformly as aliases.
FaceLandmarks: TypeAlias = Sequence[NormalizedLandmark]  # the landmarks of one detected face
FaceConnection: TypeAlias = tuple[int, int]  # (start, end) landmark indices of one mesh edge
ImageArray: TypeAlias = npt.NDArray[numpy.uint8]
DrawingSpecMap: TypeAlias = Mapping[int, DrawingSpec] | DrawingSpec  # per-landmark-index specs, or one spec

_FACE_LANDMARKER_MODEL_PATH = Path(__file__).with_name("face_landmarker.task")

min_face_size_pixels: int = 64
f_thick = 2
Expand All @@ -27,123 +41,196 @@
head_draw = DrawingSpec(color=(10, 200, 10), thickness=f_thick, circle_radius=f_rad)

# Map every drawn face-mesh edge to the spec of the facial feature it belongs to. Feature groups are
# applied in the order listed, so an edge that appears in two groups keeps the spec of the later one.
# The left/right iris connection groups are deliberately not included.
_face_landmark_connections = vision.FaceLandmarksConnections
face_connection_spec: dict[FaceConnection, DrawingSpec] = {}
for _feature_connections, _feature_spec in (
    (_face_landmark_connections.FACE_LANDMARKS_FACE_OVAL, head_draw),
    (_face_landmark_connections.FACE_LANDMARKS_LEFT_EYE, left_eye_draw),
    (_face_landmark_connections.FACE_LANDMARKS_LEFT_EYEBROW, left_eyebrow_draw),
    (_face_landmark_connections.FACE_LANDMARKS_RIGHT_EYE, right_eye_draw),
    (_face_landmark_connections.FACE_LANDMARKS_RIGHT_EYEBROW, right_eyebrow_draw),
    (_face_landmark_connections.FACE_LANDMARKS_LIPS, mouth_draw),
):
    for _connection in _feature_connections:
        face_connection_spec[(_connection.start, _connection.end)] = _feature_spec

# Landmark index -> spec; generate_annotation() passes this mapping to draw_pupils().
iris_landmark_spec = {468: right_iris_draw, 473: left_iris_draw}


def draw_pupils(image, landmark_list, drawing_spec, halfwidth: int = 2):
def _get_face_landmarker_model_path() -> str:
    """Return the location of the FaceLandmarker model file as a string.

    Raises:
        FileNotFoundError: If nothing exists at ``_FACE_LANDMARKER_MODEL_PATH``.
    """
    if _FACE_LANDMARKER_MODEL_PATH.exists():
        return str(_FACE_LANDMARKER_MODEL_PATH)

    # Fail early with an actionable message instead of passing a missing path on to the caller.
    raise FileNotFoundError(
        f"Missing vendored MediaPipe model asset at {_FACE_LANDMARKER_MODEL_PATH}. "
        "Reinstall the package or restore face_landmarker.task."
    )


def detect_face_landmarks(img_rgb: ImageArray, max_faces: int, min_confidence: float) -> list[FaceLandmarks]:
    """Run the MediaPipe FaceLandmarker once on a single image.

    Args:
        img_rgb: Image data, handed to MediaPipe tagged as ``mp.ImageFormat.SRGB``.
        max_faces: Passed to the landmarker as ``num_faces``.
        min_confidence: Single threshold used for the detection, presence and tracking confidences.

    Returns:
        The ``face_landmarks`` field of the detection result (one landmark sequence per face).

    Raises:
        FileNotFoundError: If the model asset is missing (raised by ``_get_face_landmarker_model_path``).
    """
    base_options = mp_python.BaseOptions(model_asset_path=_get_face_landmarker_model_path())
    landmarker_options = vision.FaceLandmarkerOptions(
        base_options=base_options,
        num_faces=max_faces,
        min_face_detection_confidence=min_confidence,
        min_face_presence_confidence=min_confidence,
        min_tracking_confidence=min_confidence,
    )
    frame = mp.Image(image_format=mp.ImageFormat.SRGB, data=img_rgb)

    # A fresh landmarker is created for every call; the with-block scopes its lifetime to this detection.
    with vision.FaceLandmarker.create_from_options(landmarker_options) as landmarker:
        detection = landmarker.detect(frame)

    return detection.face_landmarks


def _landmark_xy(landmark: NormalizedLandmark) -> tuple[float, float] | None:
if landmark.x is None or landmark.y is None:
return None

return landmark.x, landmark.y


def _landmark_to_pixel(landmark: NormalizedLandmark, image_rows: int, image_cols: int) -> tuple[int, int] | None:
coords = _landmark_xy(landmark)
if coords is None:
return None

x_coord, y_coord = coords

if x_coord < 0 or x_coord > 1 or y_coord < 0 or y_coord > 1:
return None

image_x = min(int(x_coord * (image_cols - 1)), image_cols - 1)
image_y = min(int(y_coord * (image_rows - 1)), image_rows - 1)
return image_x, image_y


def _draw_connections(image: ImageArray, landmark_list: FaceLandmarks, drawing_spec: Mapping[FaceConnection, DrawingSpec]) -> None:
if len(image.shape) != 3:
raise ValueError("Input image must be H,W,C.")

image_rows, image_cols, image_channels = image.shape
if image_channels != 3:
raise ValueError("Input image must contain three channel bgr data.")

for (start_idx, end_idx), spec in drawing_spec.items():
if start_idx >= len(landmark_list) or end_idx >= len(landmark_list):
continue

start_point = _landmark_to_pixel(landmark_list[start_idx], image_rows, image_cols)
end_point = _landmark_to_pixel(landmark_list[end_idx], image_rows, image_cols)
if start_point is None or end_point is None:
continue

cv2.line(image, start_point, end_point, spec.color, spec.thickness)


def draw_pupils(image: ImageArray, landmark_list: FaceLandmarks, drawing_spec: DrawingSpecMap, halfwidth: int = 2) -> None:
    """Paint a small filled square on ``image`` for each landmark that has a drawing spec.

    Pupils get this custom routine because MediaPipe's own landmark drawing requires a spec for
    every landmark, whereas only a few indices are drawn here.

    Args:
        image: Three-dimensional array with exactly three channels; modified in place.
        landmark_list: Landmarks of a single face.
        drawing_spec: Either a mapping from landmark index to spec (indices without an entry are
            skipped) or one spec applied to every landmark.
        halfwidth: Half the side length, in pixels, of the painted square.

    Raises:
        ValueError: If ``image`` is not three-dimensional or does not have three channels.
    """
    if image.ndim != 3:
        raise ValueError("Input image must be H,W,C.")
    rows, cols, channels = image.shape
    if channels != 3:  # BGR channels
        raise ValueError("Input image must contain three channel bgr data.")

    for index, landmark in enumerate(landmark_list):
        # Skip landmarks that report low visibility or low presence; unset values are not filtered.
        if landmark.visibility is not None and landmark.visibility < 0.9:
            continue
        if landmark.presence is not None and landmark.presence < 0.5:
            continue

        pixel = _landmark_to_pixel(landmark, rows, cols)
        if pixel is None:
            continue

        if isinstance(drawing_spec, Mapping):
            landmark_spec = drawing_spec.get(index)
            if landmark_spec is None:
                continue
            colour = landmark_spec.color
        else:
            colour = drawing_spec.color

        centre_x, centre_y = pixel
        # Clamp the square to the image so slices never wrap around through negative indices.
        top, bottom = max(centre_y - halfwidth, 0), min(centre_y + halfwidth, rows)
        left, right = max(centre_x - halfwidth, 0), min(centre_x + halfwidth, cols)
        image[top:bottom, left:right, :] = colour


def reverse_channels(image: ImageArray) -> ImageArray:
    """Swap the channel order of an image: RGB becomes BGR and BGR becomes RGB.

    The result is a reversed view along the third axis, so no pixel data is copied.
    """
    # Equivalent to image[:, :, ::-1]. Fancy indexing such as image[:, :, [2, 1, 0]] would give the
    # same values but makes a copy of the data.
    return numpy.flip(image, axis=2)


def generate_annotation(img_rgb: ImageArray, max_faces: int, min_confidence: float) -> ImageArray:
    """
    Detect up to 'max_faces' faces in the input image and render them as an annotation image.

    When the module-level min_face_size_pixels is positive, faces whose landmark bounding box is
    smaller than that many pixels on its shorter side are left out. The returned array has the same
    shape as the input; it is all zeros when no face is detected.
    """
    img_height, img_width, img_channels = img_rgb.shape
    assert img_channels == 3

    detected_faces = detect_face_landmarks(img_rgb, max_faces=max_faces, min_confidence=min_confidence)
    if not detected_faces:
        print("No faces detected in controlnet image for Mediapipe face annotator.")
        return numpy.zeros_like(img_rgb)

    # Filter faces that are too small
    kept_faces: list[FaceLandmarks] = []
    for face in detected_faces:
        # A face whose first landmark has no coordinates is dropped entirely.
        if _landmark_xy(face[0]) is None:
            continue

        # Bounding box over every landmark that actually has coordinates (normalized units).
        points = [xy for xy in map(_landmark_xy, face) if xy is not None]
        xs = [x for x, _ in points]
        ys = [y for _, y in points]

        if min_face_size_pixels > 0:
            width_pixels = abs(max(xs) - min(xs)) * img_width
            height_pixels = abs(max(ys) - min(ys)) * img_height
            if min(width_pixels, height_pixels) >= min_face_size_pixels:
                kept_faces.append(face)
        else:
            kept_faces.append(face)

    # Annotations are drawn in BGR, but a zero-filled canvas needs no flip before drawing starts.
    canvas = numpy.zeros_like(img_rgb)

    # Draw detected faces:
    for face in kept_faces:
        _draw_connections(canvas, face, face_connection_spec)
        draw_pupils(canvas, face, iris_landmark_spec, 2)

    # Flip BGR back to RGB; copy() turns the reversed view into a standalone array.
    return reverse_channels(canvas).copy()
Loading
Loading