diff --git a/invokeai/app/invocations/facetools.py b/invokeai/app/invocations/facetools.py index 1092a67ce95..a39d2ba9fe3 100644 --- a/invokeai/app/invocations/facetools.py +++ b/invokeai/app/invocations/facetools.py @@ -5,7 +5,6 @@ import cv2 import numpy as np -from mediapipe.python.solutions.face_mesh import FaceMesh # type: ignore[import] from PIL import Image, ImageDraw, ImageFilter, ImageFont, ImageOps from PIL.Image import Image as ImageType from pydantic import field_validator @@ -20,6 +19,7 @@ from invokeai.app.invocations.primitives import ImageOutput from invokeai.app.services.image_records.image_records_common import ImageCategory from invokeai.app.services.shared.invocation_context import InvocationContext +from invokeai.backend.image_util.mediapipe_face.mediapipe_face_common import detect_face_landmarks @invocation_output("face_mask_output") @@ -194,23 +194,15 @@ def generate_face_box_mask( # Convert RGBA to RGB by removing the alpha channel. np_image = np_image[:, :, :3] - # Create a FaceMesh object for face landmark detection and mesh generation. - face_mesh = FaceMesh( - max_num_faces=999, - min_detection_confidence=minimum_confidence, - min_tracking_confidence=minimum_confidence, - ) - - # Detect the face landmarks and mesh in the input image. - results = face_mesh.process(np_image) + results = detect_face_landmarks(np_image, max_faces=999, min_confidence=minimum_confidence) # Check if any face is detected. - if results.multi_face_landmarks: # type: ignore # this are via protobuf and not typed + if results: # Search for the face_id in the detected faces. - for _face_id, face_landmarks in enumerate(results.multi_face_landmarks): # type: ignore #this are via protobuf and not typed + for _face_id, face_landmarks in enumerate(results): # Get the bounding box of the face mesh. 
- x_coordinates = [landmark.x for landmark in face_landmarks.landmark] - y_coordinates = [landmark.y for landmark in face_landmarks.landmark] + x_coordinates = [landmark.x for landmark in face_landmarks] + y_coordinates = [landmark.y for landmark in face_landmarks] x_min, x_max = min(x_coordinates), max(x_coordinates) y_min, y_max = min(y_coordinates), max(y_coordinates) @@ -219,13 +211,13 @@ def generate_face_box_mask( mesh_height = int((y_max - y_min) * np_image.shape[0]) # Get the center of the face. - x_center = np.mean([landmark.x * np_image.shape[1] for landmark in face_landmarks.landmark]) - y_center = np.mean([landmark.y * np_image.shape[0] for landmark in face_landmarks.landmark]) + x_center = np.mean([landmark.x * np_image.shape[1] for landmark in face_landmarks]) + y_center = np.mean([landmark.y * np_image.shape[0] for landmark in face_landmarks]) face_landmark_points = np.array( [ [landmark.x * np_image.shape[1], landmark.y * np_image.shape[0]] - for landmark in face_landmarks.landmark + for landmark in face_landmarks ] ) diff --git a/invokeai/backend/image_util/mediapipe_face/face_landmarker.task b/invokeai/backend/image_util/mediapipe_face/face_landmarker.task new file mode 100644 index 00000000000..c50c845d113 Binary files /dev/null and b/invokeai/backend/image_util/mediapipe_face/face_landmarker.task differ diff --git a/invokeai/backend/image_util/mediapipe_face/mediapipe_face_common.py b/invokeai/backend/image_util/mediapipe_face/mediapipe_face_common.py index 4cf7a66cdc7..e15347ad235 100644 --- a/invokeai/backend/image_util/mediapipe_face/mediapipe_face_common.py +++ b/invokeai/backend/image_util/mediapipe_face/mediapipe_face_common.py @@ -1,18 +1,32 @@ -from typing import Mapping +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Mapping, Sequence, TypeAlias +import cv2 import mediapipe as mp import numpy +import numpy.typing as npt +from mediapipe.tasks import python as _mp_python # type: ignore[import] +from 
mediapipe.tasks.python import vision as _vision # type: ignore[import] +from mediapipe.tasks.python.components.containers.landmark import NormalizedLandmark # type: ignore[import] -mp_drawing = mp.solutions.drawing_utils -mp_drawing_styles = mp.solutions.drawing_styles -mp_face_detection = mp.solutions.face_detection # Only for counting faces. -mp_face_mesh = mp.solutions.face_mesh -mp_face_connections = mp.solutions.face_mesh_connections.FACEMESH_TESSELATION -mp_hand_connections = mp.solutions.hands_connections.HAND_CONNECTIONS -mp_body_connections = mp.solutions.pose_connections.POSE_CONNECTIONS +mp_python: Any = _mp_python +vision: Any = _vision -DrawingSpec = mp.solutions.drawing_styles.DrawingSpec -PoseLandmark = mp.solutions.drawing_styles.PoseLandmark + +@dataclass(frozen=True) +class DrawingSpec: + color: tuple[int, int, int] + thickness: int + circle_radius: int + + +FaceLandmarks = Sequence[NormalizedLandmark] +FaceConnection = tuple[int, int] +ImageArray: TypeAlias = npt.NDArray[numpy.uint8] +DrawingSpecMap: TypeAlias = Mapping[int, DrawingSpec] | DrawingSpec + +_FACE_LANDMARKER_MODEL_PATH = Path(__file__).with_name("face_landmarker.task") min_face_size_pixels: int = 64 f_thick = 2 @@ -27,27 +41,100 @@ head_draw = DrawingSpec(color=(10, 200, 10), thickness=f_thick, circle_radius=f_rad) # mp_face_mesh.FACEMESH_CONTOURS has all the items we care about. 
-face_connection_spec = {} -for edge in mp_face_mesh.FACEMESH_FACE_OVAL: +face_connection_spec: dict[FaceConnection, DrawingSpec] = {} +for connection in vision.FaceLandmarksConnections.FACE_LANDMARKS_FACE_OVAL: + edge = (connection.start, connection.end) face_connection_spec[edge] = head_draw -for edge in mp_face_mesh.FACEMESH_LEFT_EYE: +for connection in vision.FaceLandmarksConnections.FACE_LANDMARKS_LEFT_EYE: + edge = (connection.start, connection.end) face_connection_spec[edge] = left_eye_draw -for edge in mp_face_mesh.FACEMESH_LEFT_EYEBROW: +for connection in vision.FaceLandmarksConnections.FACE_LANDMARKS_LEFT_EYEBROW: + edge = (connection.start, connection.end) face_connection_spec[edge] = left_eyebrow_draw -# for edge in mp_face_mesh.FACEMESH_LEFT_IRIS: +# for edge in vision.FaceLandmarksConnections.FACE_LANDMARKS_LEFT_IRIS: # face_connection_spec[edge] = left_iris_draw -for edge in mp_face_mesh.FACEMESH_RIGHT_EYE: +for connection in vision.FaceLandmarksConnections.FACE_LANDMARKS_RIGHT_EYE: + edge = (connection.start, connection.end) face_connection_spec[edge] = right_eye_draw -for edge in mp_face_mesh.FACEMESH_RIGHT_EYEBROW: +for connection in vision.FaceLandmarksConnections.FACE_LANDMARKS_RIGHT_EYEBROW: + edge = (connection.start, connection.end) face_connection_spec[edge] = right_eyebrow_draw -# for edge in mp_face_mesh.FACEMESH_RIGHT_IRIS: +# for edge in vision.FaceLandmarksConnections.FACE_LANDMARKS_RIGHT_IRIS: # face_connection_spec[edge] = right_iris_draw -for edge in mp_face_mesh.FACEMESH_LIPS: +for connection in vision.FaceLandmarksConnections.FACE_LANDMARKS_LIPS: + edge = (connection.start, connection.end) face_connection_spec[edge] = mouth_draw iris_landmark_spec = {468: right_iris_draw, 473: left_iris_draw} -def draw_pupils(image, landmark_list, drawing_spec, halfwidth: int = 2): +def _get_face_landmarker_model_path() -> str: + if not _FACE_LANDMARKER_MODEL_PATH.exists(): + raise FileNotFoundError( + f"Missing vendored MediaPipe model asset at 
{_FACE_LANDMARKER_MODEL_PATH}. " + "Reinstall the package or restore face_landmarker.task." + ) + return str(_FACE_LANDMARKER_MODEL_PATH) + + +def detect_face_landmarks(img_rgb: ImageArray, max_faces: int, min_confidence: float) -> list[FaceLandmarks]: + options = vision.FaceLandmarkerOptions( + base_options=mp_python.BaseOptions(model_asset_path=_get_face_landmarker_model_path()), + num_faces=max_faces, + min_face_detection_confidence=min_confidence, + min_face_presence_confidence=min_confidence, + min_tracking_confidence=min_confidence, + ) + + mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=img_rgb) + with vision.FaceLandmarker.create_from_options(options) as landmarker: + results = landmarker.detect(mp_image) + + return results.face_landmarks + + +def _landmark_xy(landmark: NormalizedLandmark) -> tuple[float, float] | None: + if landmark.x is None or landmark.y is None: + return None + + return landmark.x, landmark.y + + +def _landmark_to_pixel(landmark: NormalizedLandmark, image_rows: int, image_cols: int) -> tuple[int, int] | None: + coords = _landmark_xy(landmark) + if coords is None: + return None + + x_coord, y_coord = coords + + if x_coord < 0 or x_coord > 1 or y_coord < 0 or y_coord > 1: + return None + + image_x = min(int(x_coord * (image_cols - 1)), image_cols - 1) + image_y = min(int(y_coord * (image_rows - 1)), image_rows - 1) + return image_x, image_y + + +def _draw_connections(image: ImageArray, landmark_list: FaceLandmarks, drawing_spec: Mapping[FaceConnection, DrawingSpec]) -> None: + if len(image.shape) != 3: + raise ValueError("Input image must be H,W,C.") + + image_rows, image_cols, image_channels = image.shape + if image_channels != 3: + raise ValueError("Input image must contain three channel bgr data.") + + for (start_idx, end_idx), spec in drawing_spec.items(): + if start_idx >= len(landmark_list) or end_idx >= len(landmark_list): + continue + + start_point = _landmark_to_pixel(landmark_list[start_idx], image_rows, image_cols) + 
end_point = _landmark_to_pixel(landmark_list[end_idx], image_rows, image_cols) + if start_point is None or end_point is None: + continue + + cv2.line(image, start_point, end_point, spec.color, spec.thickness) + + +def draw_pupils(image: ImageArray, landmark_list: FaceLandmarks, drawing_spec: DrawingSpecMap, halfwidth: int = 2) -> None: """We have a custom function to draw the pupils because the mp.draw_landmarks method requires a parameter for all landmarks. Until our PR is merged into mediapipe, we need this separate method.""" if len(image.shape) != 3: @@ -55,95 +142,95 @@ def draw_pupils(image, landmark_list, drawing_spec, halfwidth: int = 2): image_rows, image_cols, image_channels = image.shape if image_channels != 3: # BGR channels raise ValueError("Input image must contain three channel bgr data.") - for idx, landmark in enumerate(landmark_list.landmark): - if (landmark.HasField("visibility") and landmark.visibility < 0.9) or ( - landmark.HasField("presence") and landmark.presence < 0.5 + for idx, landmark in enumerate(landmark_list): + if (landmark.visibility is not None and landmark.visibility < 0.9) or ( + landmark.presence is not None and landmark.presence < 0.5 ): continue - if landmark.x >= 1.0 or landmark.x < 0 or landmark.y >= 1.0 or landmark.y < 0: + point = _landmark_to_pixel(landmark, image_rows, image_cols) + if point is None: continue - image_x = int(image_cols * landmark.x) - image_y = int(image_rows * landmark.y) + + image_x, image_y = point draw_color = None if isinstance(drawing_spec, Mapping): if drawing_spec.get(idx) is None: continue - else: - draw_color = drawing_spec[idx].color - elif isinstance(drawing_spec, DrawingSpec): + draw_color = drawing_spec[idx].color + else: draw_color = drawing_spec.color - image[image_y - halfwidth : image_y + halfwidth, image_x - halfwidth : image_x + halfwidth, :] = draw_color + + y_min = max(image_y - halfwidth, 0) + y_max = min(image_y + halfwidth, image_rows) + x_min = max(image_x - halfwidth, 0) + 
x_max = min(image_x + halfwidth, image_cols) + image[y_min:y_max, x_min:x_max, :] = draw_color -def reverse_channels(image): +def reverse_channels(image: ImageArray) -> ImageArray: """Given a numpy array in RGB form, convert to BGR. Will also convert from BGR to RGB.""" # im[:,:,::-1] is a neat hack to convert BGR to RGB by reversing the indexing order. # im[:,:,::[2,1,0]] would also work but makes a copy of the data. return image[:, :, ::-1] -def generate_annotation(img_rgb, max_faces: int, min_confidence: float): +def generate_annotation(img_rgb: ImageArray, max_faces: int, min_confidence: float) -> ImageArray: """ Find up to 'max_faces' inside the provided input image. If min_face_size_pixels is provided and nonzero it will be used to filter faces that occupy less than this many pixels in the image. """ - with mp_face_mesh.FaceMesh( - static_image_mode=True, - max_num_faces=max_faces, - refine_landmarks=True, - min_detection_confidence=min_confidence, - ) as facemesh: - img_height, img_width, img_channels = img_rgb.shape - assert img_channels == 3 - - results = facemesh.process(img_rgb).multi_face_landmarks - - if results is None: - print("No faces detected in controlnet image for Mediapipe face annotator.") - return numpy.zeros_like(img_rgb) - - # Filter faces that are too small - filtered_landmarks = [] - for lm in results: - landmarks = lm.landmark - face_rect = [ - landmarks[0].x, - landmarks[0].y, - landmarks[0].x, - landmarks[0].y, - ] # Left, up, right, down. 
- for i in range(len(landmarks)): - face_rect[0] = min(face_rect[0], landmarks[i].x) - face_rect[1] = min(face_rect[1], landmarks[i].y) - face_rect[2] = max(face_rect[2], landmarks[i].x) - face_rect[3] = max(face_rect[3], landmarks[i].y) - if min_face_size_pixels > 0: - face_width = abs(face_rect[2] - face_rect[0]) - face_height = abs(face_rect[3] - face_rect[1]) - face_width_pixels = face_width * img_width - face_height_pixels = face_height * img_height - face_size = min(face_width_pixels, face_height_pixels) - if face_size >= min_face_size_pixels: - filtered_landmarks.append(lm) - else: - filtered_landmarks.append(lm) - - # Annotations are drawn in BGR for some reason, but we don't need to flip a zero-filled image at the start. - empty = numpy.zeros_like(img_rgb) - - # Draw detected faces: - for face_landmarks in filtered_landmarks: - mp_drawing.draw_landmarks( - empty, - face_landmarks, - connections=face_connection_spec.keys(), - landmark_drawing_spec=None, - connection_drawing_spec=face_connection_spec, - ) - draw_pupils(empty, face_landmarks, iris_landmark_spec, 2) - - # Flip BGR back to RGB. - empty = reverse_channels(empty).copy() - - return empty + img_height, img_width, img_channels = img_rgb.shape + assert img_channels == 3 + + results = detect_face_landmarks(img_rgb, max_faces=max_faces, min_confidence=min_confidence) + + if len(results) == 0: + print("No faces detected in controlnet image for Mediapipe face annotator.") + return numpy.zeros_like(img_rgb) + + # Filter faces that are too small + filtered_landmarks: list[FaceLandmarks] = [] + for landmarks in results: + first_coords = _landmark_xy(landmarks[0]) + if first_coords is None: + continue + + face_rect = [ + first_coords[0], + first_coords[1], + first_coords[0], + first_coords[1], + ] # Left, up, right, down. 
+ for landmark in landmarks: + coords = _landmark_xy(landmark) + if coords is None: + continue + + face_rect[0] = min(face_rect[0], coords[0]) + face_rect[1] = min(face_rect[1], coords[1]) + face_rect[2] = max(face_rect[2], coords[0]) + face_rect[3] = max(face_rect[3], coords[1]) + if min_face_size_pixels > 0: + face_width = abs(face_rect[2] - face_rect[0]) + face_height = abs(face_rect[3] - face_rect[1]) + face_width_pixels = face_width * img_width + face_height_pixels = face_height * img_height + face_size = min(face_width_pixels, face_height_pixels) + if face_size >= min_face_size_pixels: + filtered_landmarks.append(landmarks) + else: + filtered_landmarks.append(landmarks) + + # Annotations are drawn in BGR for some reason, but we don't need to flip a zero-filled image at the start. + empty = numpy.zeros_like(img_rgb) + + # Draw detected faces: + for face_landmarks in filtered_landmarks: + _draw_connections(empty, face_landmarks, face_connection_spec) + draw_pupils(empty, face_landmarks, iris_landmark_spec, 2) + + # Flip BGR back to RGB. + empty = reverse_channels(empty).copy() + + return empty diff --git a/pyproject.toml b/pyproject.toml index 155471d9067..e21004593bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,22 +34,22 @@ classifiers = [ dependencies = [ # Core generation dependencies, pinned for reproducible builds. "accelerate", - "bitsandbytes; sys_platform!='darwin'", + "bitsandbytes>=0.48.0; sys_platform!='darwin'", # CUDA 130 supported as of 0.48.0 feb-2026, CUDA 131/132 slated for "future release" "compel==2.1.1", "diffusers[torch]==0.37.0", "gguf", - "mediapipe==0.10.14", # needed for "mediapipeface" controlnet model - "numpy<2.0.0", - "onnx==1.16.1", - "onnxruntime==1.19.2", + "mediapipe==0.10.35", # needed for "mediapipeface" controlnet model + "numpy<=2.4.6", # TODO: figure out why numpy<2.0.0 was pinned? 
+ "onnx==1.21.0", + "onnxruntime==1.26.0", # 1.27.0 will drop support for CUDA 12 "opencv-contrib-python", "safetensors", - "sentencepiece==0.2.0", # 0.2.1 coredumps windows when loading t5 tokenizer + "sentencepiece==0.2.1", # NOTE(review): 0.2.1 was previously reported to coredump on Windows when loading the t5 tokenizer; confirm this is fixed before keeping this pin "spandrel", - "torch~=2.7.0", # torch and related dependencies are loosely pinned, will respect requirement of `diffusers[torch]` + "torch~=2.12.0", # torch and related dependencies are loosely pinned, will respect requirement of `diffusers[torch]` "torchsde", # diffusers needs this for SDE solvers, but it is not an explicit dep of diffusers "torchvision", - "transformers>=4.56.0", + "transformers>=4.56.0", # Waiting on compel 2.2 to upgrade to 5.8.1 # Core application dependencies, pinned for reproducible builds. "fastapi-events", @@ -201,7 +201,7 @@ version = { attr = "invokeai.version.__version__" } "*.png", ] "invokeai.assets.fonts" = ["**/*.ttf"] -"invokeai.backend" = ["**.png", "**/*.icc"] +"invokeai.backend" = ["**.png", "**/*.icc", "**/*.task"] "invokeai.configs" = ["*.example", "**/*.yaml", "*.txt"] "invokeai.frontend.web.dist" = ["**"] "invokeai.frontend.web.static" = ["**"]