From 6aea186ee897fea8602ec46b941ef7185b64a5fe Mon Sep 17 00:00:00 2001
From: Henry Ruhs
Date: Mon, 22 Jan 2024 13:56:45 +0100
Subject: [PATCH] feat/yoloface (#334)

* added yolov8 to face_detector (#323)

* added yolov8 to face_detector

* added yolov8 to face_detector

* Initial cleanup and renaming

* Update README

* refactored detect_with_yoloface (#329)

* refactored detect_with_yoloface

* apply review

* Change order again

* Restore working code

* modified code (#330)

* refactored detect_with_yoloface

* apply review

* use temp_frame in detect_with_yoloface

* reorder

* modified

* reorder models

* Tiny cleanup

---------

Co-authored-by: tamoharu <133945583+tamoharu@users.noreply.github.com>
---
 README.md                   | 95 +++++++++++++++++++------------------
 facefusion/choices.py       |  2 +-
 facefusion/face_analyser.py | 55 ++++++++++++++++++++-
 facefusion/typing.py        |  2 +-
 4 files changed, 104 insertions(+), 50 deletions(-)

diff --git a/README.md b/README.md
index 1f7b530f..72ca078a 100644
--- a/README.md
+++ b/README.md
@@ -30,73 +30,74 @@ Run the command:
 python run.py [options]
 
 options:
-  -h, --help                                              show this help message and exit
-  -s SOURCE_PATHS, --source SOURCE_PATHS                  select a source image
-  -t TARGET_PATH, --target TARGET_PATH                    select a target image or video
-  -o OUTPUT_PATH, --output OUTPUT_PATH                    specify the output file or directory
-  -v, --version                                           show program's version number and exit
+  -h, --help                                              show this help message and exit
+  -s SOURCE_PATHS, --source SOURCE_PATHS                  select a source image
+  -t TARGET_PATH, --target TARGET_PATH                    select a target image or video
+  -o OUTPUT_PATH, --output OUTPUT_PATH                    specify the output file or directory
+  -v, --version                                           show program's version number and exit
 
 misc:
-  --skip-download                                         omit automate downloads and lookups
-  --headless                                              run the program in headless mode
-  --log-level {error,warn,info,debug}                     choose from the available log levels
+  --skip-download                                         omit automate downloads and lookups
+  --headless                                              run the program in headless mode
+  --log-level {error,warn,info,debug}                     choose from the available log levels
 
 execution:
-  --execution-providers EXECUTION_PROVIDERS [EXECUTION_PROVIDERS ...] choose from the available execution providers (choices: cpu, ...)
-  --execution-thread-count [1-128]                        specify the number of execution threads
-  --execution-queue-count [1-32]                          specify the number of execution queries
+  --execution-providers EXECUTION_PROVIDERS [EXECUTION_PROVIDERS ...] choose from the available execution providers (choices: cpu, ...)
+  --execution-thread-count [1-128]                        specify the number of execution threads
+  --execution-queue-count [1-32]                          specify the number of execution queries
 
 memory:
-  --video-memory-strategy {strict,moderate,tolerant}      specify strategy to handle the video memory
-  --system-memory-limit [0-128]                           specify the amount (gb) of system memory to be used
+  --video-memory-strategy {strict,moderate,tolerant}      specify strategy to handle the video memory
+  --system-memory-limit [0-128]                           specify the amount (gb) of system memory to be used
 
 face analyser:
-  --face-analyser-order {left-right,right-left,top-bottom,bottom-top,small-large,large-small,best-worst,worst-best} specify the order used for the face analyser
-  --face-analyser-age {child,teen,adult,senior}           specify the age used for the face analyser
-  --face-analyser-gender {male,female}                    specify the gender used for the face analyser
-  --face-detector-model {retinaface,yunet}                specify the model used for the face detector
-  --face-detector-size {160x160,320x320,480x480,512x512,640x640,768x768,960x960,1024x1024} specify the size threshold used for the face detector
-  --face-detector-score [0.0-1.0]                         specify the score threshold used for the face detector
+  --face-analyser-order {left-right,right-left,top-bottom,bottom-top,small-large,large-small,best-worst,worst-best} specify the order used for the face analyser
+  --face-analyser-age {child,teen,adult,senior}           specify the age used for the face analyser
+  --face-analyser-gender {male,female}                    specify the gender used for the face analyser
+  --face-detector-model {retinaface,yoloface,yunet}       specify the model used for the face detector
+  --face-detector-size {160x160,320x320,480x480,512x512,640x640,768x768,960x960,1024x1024} specify the size threshold used for the face detector
+  --face-detector-score [0.0-1.0]                         specify the score threshold used for the face detector
 
 face selector:
-  --face-selector-mode {reference,one,many}               specify the mode for the face selector
-  --reference-face-position REFERENCE_FACE_POSITION       specify the position of the reference face
-  --reference-face-distance [0.0-1.5]                     specify the distance between the reference face and the target face
-  --reference-frame-number REFERENCE_FRAME_NUMBER         specify the number of the reference frame
+  --face-selector-mode {reference,one,many}               specify the mode for the face selector
+  --reference-face-position REFERENCE_FACE_POSITION       specify the position of the reference face
+  --reference-face-distance [0.0-1.5]                     specify the distance between the reference face and the target face
+  --reference-frame-number REFERENCE_FRAME_NUMBER         specify the number of the reference frame
 
 face mask:
-  --face-mask-types FACE_MASK_TYPES [FACE_MASK_TYPES ...] choose from the available face mask types (choices: box, occlusion, region)
-  --face-mask-blur [0.0-1.0]                              specify the blur amount for face mask
-  --face-mask-padding FACE_MASK_PADDING [FACE_MASK_PADDING ...] specify the face mask padding (top, right, bottom, left) in percent
-  --face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose from the available face mask regions (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, eye-glasses, nose, mouth, upper-lip, lower-lip)
+  --face-mask-types FACE_MASK_TYPES [FACE_MASK_TYPES ...] choose from the available face mask types (choices: box, occlusion, region)
+  --face-mask-blur [0.0-1.0]                              specify the blur amount for face mask
+  --face-mask-padding FACE_MASK_PADDING [FACE_MASK_PADDING ...] specify the face mask padding (top, right, bottom, left) in percent
+  --face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose from the available face mask regions (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, eye-glasses, nose, mouth, upper-lip, lower-lip)
 
 frame extraction:
-  --trim-frame-start TRIM_FRAME_START                     specify the start frame for extraction
-  --trim-frame-end TRIM_FRAME_END                         specify the end frame for extraction
-  --temp-frame-format {jpg,png,bmp}                       specify the image format used for frame extraction
-  --temp-frame-quality [0-100]                            specify the image quality used for frame extraction
-  --keep-temp                                             retain temporary frames after processing
+  --trim-frame-start TRIM_FRAME_START                     specify the start frame for extraction
+  --trim-frame-end TRIM_FRAME_END                         specify the end frame for extraction
+  --temp-frame-format {jpg,png,bmp}                       specify the image format used for frame extraction
+  --temp-frame-quality [0-100]                            specify the image quality used for frame extraction
+  --keep-temp                                             retain temporary frames after processing
 
 output creation:
-  --output-image-quality [0-100]                          specify the quality used for the output image
-  --output-video-encoder {libx264,libx265,libvpx-vp9,h264_nvenc,hevc_nvenc} specify the encoder used for the output video
-  --output-video-preset {ultrafast,superfast,veryfast,faster,fast,medium,slow,slower,veryslow} specify the preset used for the output video
-  --output-video-quality [0-100]                          specify the quality used for the output video
-  --output-video-resolution OUTPUT_VIDEO_RESOLUTION       specify the resolution used for the output video
-  --output-video-fps OUTPUT_VIDEO_FPS                     specify the frames per second (fps) used for the output video
-  --skip-audio                                            omit audio from the target
+  --output-image-quality [0-100]                          specify the quality used for the output image
+  --output-video-encoder {libx264,libx265,libvpx-vp9,h264_nvenc,hevc_nvenc} specify the encoder used for the output video
+  --output-video-preset {ultrafast,superfast,veryfast,faster,fast,medium,slow,slower,veryslow} specify the preset used for the output video
+  --output-video-quality [0-100]                          specify the quality used for the output video
+  --output-video-resolution OUTPUT_VIDEO_RESOLUTION       specify the resolution used for the output video
+  --output-video-fps OUTPUT_VIDEO_FPS                     specify the frames per second (fps) used for the output video
+  --skip-audio                                            omit audio from the target
 
 frame processors:
-  --frame-processors FRAME_PROCESSORS [FRAME_PROCESSORS ...] choose from the available frame processors (choices: face_debugger, face_enhancer, face_swapper, frame_enhancer, ...)
-  --face-debugger-items FACE_DEBUGGER_ITEMS [FACE_DEBUGGER_ITEMS ...] specify the face debugger items (choices: bbox, kps, face-mask, score)
-  --face-enhancer-model {codeformer,gfpgan_1.2,gfpgan_1.3,gfpgan_1.4,gpen_bfr_256,gpen_bfr_512,restoreformer} choose the model for the frame processor
-  --face-enhancer-blend [0-100]                           specify the blend amount for the frame processor
-  --face-swapper-model {blendswap_256,inswapper_128,inswapper_128_fp16,simswap_256,simswap_512_unofficial} choose the model for the frame processor
-  --frame-enhancer-model {real_esrgan_x2plus,real_esrgan_x4plus,real_esrnet_x4plus} choose the model for the frame processor
-  --frame-enhancer-blend [0-100]                          specify the blend amount for the frame processor
+  --frame-processors FRAME_PROCESSORS [FRAME_PROCESSORS ...] choose from the available frame processors (choices: face_debugger, face_enhancer, face_swapper, frame_enhancer, ...)
+  --face-debugger-items FACE_DEBUGGER_ITEMS [FACE_DEBUGGER_ITEMS ...] specify the face debugger items (choices: bbox, kps, face-mask, score)
+  --face-enhancer-model {codeformer,gfpgan_1.2,gfpgan_1.3,gfpgan_1.4,gpen_bfr_256,gpen_bfr_512,restoreformer_plus_plus}
+                                                          choose the model for the frame processor
+  --face-enhancer-blend [0-100]                           specify the blend amount for the frame processor
+  --face-swapper-model {blendswap_256,inswapper_128,inswapper_128_fp16,simswap_256,simswap_512_unofficial} choose the model for the frame processor
+  --frame-enhancer-model {real_esrgan_x2plus,real_esrgan_x4plus,real_esrnet_x4plus} choose the model for the frame processor
+  --frame-enhancer-blend [0-100]                          specify the blend amount for the frame processor
 
 uis:
-  --ui-layouts UI_LAYOUTS [UI_LAYOUTS ...]                choose from the available ui layouts (choices: benchmark, webcam, default, ...)
+  --ui-layouts UI_LAYOUTS [UI_LAYOUTS ...]                choose from the available ui layouts (choices: benchmark, webcam, default, ...)
 
 ```
diff --git a/facefusion/choices.py b/facefusion/choices.py
index b2fb133f..124e697e 100755
--- a/facefusion/choices.py
+++ b/facefusion/choices.py
@@ -7,7 +7,7 @@ video_memory_strategies : List[VideoMemoryStrategy] = [ 'strict', 'moderate', 't
 face_analyser_orders : List[FaceAnalyserOrder] = [ 'left-right', 'right-left', 'top-bottom', 'bottom-top', 'small-large', 'large-small', 'best-worst', 'worst-best' ]
 face_analyser_ages : List[FaceAnalyserAge] = [ 'child', 'teen', 'adult', 'senior' ]
 face_analyser_genders : List[FaceAnalyserGender] = [ 'male', 'female' ]
-face_detector_models : List[str] = [ 'retinaface', 'yunet' ]
+face_detector_models : List[str] = [ 'retinaface', 'yoloface', 'yunet' ]
 face_detector_sizes : List[str] = [ '160x160', '320x320', '480x480', '512x512', '640x640', '768x768', '960x960', '1024x1024' ]
 face_selector_modes : List[FaceSelectorMode] = [ 'reference', 'one', 'many' ]
 face_mask_types : List[FaceMaskType] = [ 'box', 'occlusion', 'region' ]
diff --git a/facefusion/face_analyser.py b/facefusion/face_analyser.py
index ffb0741f..de756580 100644
--- a/facefusion/face_analyser.py
+++ b/facefusion/face_analyser.py
@@ -23,6 +23,11 @@ MODELS : ModelSet =\
 		'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/retinaface_10g.onnx',
 		'path': resolve_relative_path('../.assets/models/retinaface_10g.onnx')
 	},
+	'face_detector_yoloface':
+	{
+		'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/yoloface_8n.onnx',
+		'path': resolve_relative_path('../.assets/models/yoloface_8n.onnx')
+	},
 	'face_detector_yunet':
 	{
 		'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/yunet_2023mar.onnx',
@@ -58,6 +63,8 @@ def get_face_analyser() -> Any:
 		if FACE_ANALYSER is None:
 			if facefusion.globals.face_detector_model == 'retinaface':
 				face_detector = onnxruntime.InferenceSession(MODELS.get('face_detector_retinaface').get('path'), providers = apply_execution_provider_options(facefusion.globals.execution_providers))
+			if facefusion.globals.face_detector_model == 'yoloface':
+				face_detector = onnxruntime.InferenceSession(MODELS.get('face_detector_yoloface').get('path'), providers = apply_execution_provider_options(facefusion.globals.execution_providers))
 			if facefusion.globals.face_detector_model == 'yunet':
 				face_detector = cv2.FaceDetectorYN.create(MODELS.get('face_detector_yunet').get('path'), '', (0, 0))
 			if facefusion.globals.face_recognizer_model == 'arcface_blendswap':
@@ -88,6 +95,7 @@ def pre_check() -> bool:
 		model_urls =\
 		[
 			MODELS.get('face_detector_retinaface').get('url'),
+			MODELS.get('face_detector_yoloface').get('url'),
 			MODELS.get('face_detector_yunet').get('url'),
 			MODELS.get('face_recognizer_arcface_inswapper').get('url'),
 			MODELS.get('face_recognizer_arcface_simswap').get('url'),
@@ -104,10 +112,13 @@ def extract_faces(frame : Frame) -> List[Face]:
 	temp_frame_height, temp_frame_width, _ = temp_frame.shape
 	ratio_height = frame_height / temp_frame_height
 	ratio_width = frame_width / temp_frame_width
+	if facefusion.globals.face_detector_model == 'yoloface':
+		bbox_list, kps_list, score_list = detect_with_yoloface(temp_frame, temp_frame_height, temp_frame_width, face_detector_height, face_detector_width, ratio_height, ratio_width)
+		return create_faces(frame, bbox_list, kps_list, score_list)
 	if facefusion.globals.face_detector_model == 'retinaface':
 		bbox_list, kps_list, score_list = detect_with_retinaface(temp_frame, temp_frame_height, temp_frame_width, face_detector_height, face_detector_width, ratio_height, ratio_width)
 		return create_faces(frame, bbox_list, kps_list, score_list)
-	elif facefusion.globals.face_detector_model == 'yunet':
+	if facefusion.globals.face_detector_model == 'yunet':
 		bbox_list, kps_list, score_list = detect_with_yunet(temp_frame, temp_frame_height, temp_frame_width, ratio_height, ratio_width)
 		return create_faces(frame, bbox_list, kps_list, score_list)
 	return []
@@ -153,6 +164,48 @@ def detect_with_retinaface(temp_frame : Frame, temp_frame_height : int, temp_fra
 	return bbox_list, kps_list, score_list
 
 
+def detect_with_yoloface(temp_frame : Frame, temp_frame_height : int, temp_frame_width : int, face_detector_height : int, face_detector_width : int, ratio_height : float, ratio_width : float) -> Tuple[List[Bbox], List[Kps], List[Score]]:
+	face_detector = get_face_analyser().get('face_detector')
+	bbox_list = []
+	kps_list = []
+	score_list = []
+	offset_width = (face_detector_width - temp_frame_width) / 2
+	offset_height = (face_detector_height - temp_frame_height) / 2
+	temp_frame = cv2.copyMakeBorder(temp_frame, round(offset_height - 0.1), round(offset_height + 0.1), round(offset_width - 0.1), round(offset_width + 0.1), cv2.BORDER_CONSTANT, value = (114, 114, 114))
+	temp_frame = temp_frame.astype(numpy.float32) / 255.0
+	temp_frame = temp_frame[..., ::-1].transpose(2, 0, 1)
+	temp_frame = numpy.expand_dims(temp_frame, axis = 0)
+	temp_frame = numpy.ascontiguousarray(temp_frame)
+	with THREAD_SEMAPHORE:
+		detections = face_detector.run(None,
+		{
+			face_detector.get_inputs()[0].name: temp_frame
+		})
+	detections = numpy.squeeze(detections).T
+	bbox_raw, score_raw, kps_raw = numpy.split(detections, [ 4, 5 ], axis = 1)
+	keep_indices = numpy.where(score_raw > facefusion.globals.face_detector_score)[0]
+	if keep_indices.any():
+		bbox_raw, kps_raw, score_raw = bbox_raw[keep_indices], kps_raw[keep_indices], score_raw[keep_indices]
+		for bbox in bbox_raw:
+			bbox_list.append(numpy.array(
+			[
+				(bbox[0] - bbox[2] / 2 - offset_width) * ratio_width,
+				(bbox[1] - bbox[3] / 2 - offset_height) * ratio_height,
+				(bbox[0] + bbox[2] / 2 - offset_width) * ratio_width,
+				(bbox[1] + bbox[3] / 2 - offset_height) * ratio_height
+			]))
+		kps_raw[:, 0::3] = (kps_raw[:, 0::3] - offset_width) * ratio_width
+		kps_raw[:, 1::3] = (kps_raw[:, 1::3] - offset_height) * ratio_height
+		for kps in kps_raw:
+			indexes = numpy.arange(0, len(kps), 3)
+			temp_kps = []
+			for index in indexes:
+				temp_kps.append([kps[index], kps[index + 1]])
+			kps_list.append(numpy.array(temp_kps))
+		score_list = score_raw.ravel().tolist()
+	return bbox_list, kps_list, score_list
+
+
 def detect_with_yunet(temp_frame : Frame, temp_frame_height : int, temp_frame_width : int, ratio_height : float, ratio_width : float) -> Tuple[List[Bbox], List[Kps], List[Score]]:
 	face_detector = get_face_analyser().get('face_detector')
 	face_detector.setInputSize((temp_frame_width, temp_frame_height))
diff --git a/facefusion/typing.py b/facefusion/typing.py
index 7a2459ff..2b6a054a 100755
--- a/facefusion/typing.py
+++ b/facefusion/typing.py
@@ -42,7 +42,7 @@ FaceSelectorMode = Literal['reference', 'one', 'many']
 FaceAnalyserOrder = Literal['left-right', 'right-left', 'top-bottom', 'bottom-top', 'small-large', 'large-small', 'best-worst', 'worst-best']
 FaceAnalyserAge = Literal['child', 'teen', 'adult', 'senior']
 FaceAnalyserGender = Literal['male', 'female']
-FaceDetectorModel = Literal['retinaface', 'yunet']
+FaceDetectorModel = Literal['retinaface', 'yoloface', 'yunet']
 FaceRecognizerModel = Literal['arcface_blendswap', 'arcface_inswapper', 'arcface_simswap']
 FaceMaskType = Literal['box', 'occlusion', 'region']
 FaceMaskRegion = Literal['skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'eye-glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip']
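
With the patch applied, the YOLO-based detector is opt-in. A typical invocation might look like this; the source, target and output names are placeholders, while the flags and their choices come straight from the README help text above:

python run.py -s source.jpg -t target.mp4 -o output.mp4 --face-detector-model yoloface --face-detector-size 640x640 --face-detector-score 0.5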
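
To make the tensor flow in detect_with_yoloface easier to trace, here is a minimal standalone sketch of the same pre- and post-processing. It is an illustration under stated assumptions, not part of the patch: the model and image paths are placeholders, and the model output is assumed to squeeze to (20, N), with rows 0-3 holding center-x, center-y, width and height, row 4 the face score, and rows 5-19 five (x, y, confidence) landmark triplets; that layout is implied by the squeeze().T, the numpy.split at [ 4, 5 ] and the stride-3 landmark slicing above. The sketch reshapes the landmarks to (N, 5, 3) instead of stride slicing, which is a readability choice, not a behavioral change.

# Standalone sketch of the yoloface pre/post-processing (not part of the patch).
# Assumptions: 'yoloface_8n.onnx' and 'face.jpg' are placeholder paths and the model
# takes a 1x3x640x640 RGB blob, as suggested by the 640x640 detector size above.
import cv2
import numpy
import onnxruntime


def prepare(frame, size = 640):
	height, width = frame.shape[:2]
	scale = size / max(height, width)
	frame = cv2.resize(frame, (int(width * scale), int(height * scale)))
	offset_width = (size - frame.shape[1]) / 2
	offset_height = (size - frame.shape[0]) / 2
	# letterbox to size x size with the neutral gray used by the patch
	frame = cv2.copyMakeBorder(frame, round(offset_height - 0.1), round(offset_height + 0.1), round(offset_width - 0.1), round(offset_width + 0.1), cv2.BORDER_CONSTANT, value = (114, 114, 114))
	frame = frame.astype(numpy.float32) / 255.0
	frame = frame[..., ::-1].transpose(2, 0, 1) # BGR -> RGB, HWC -> CHW
	return numpy.ascontiguousarray(numpy.expand_dims(frame, axis = 0)), scale, offset_width, offset_height


def decode(detections, scale, offset_width, offset_height, score_threshold = 0.5):
	detections = numpy.squeeze(detections).T # (N, 20)
	bbox_raw, score_raw, kps_raw = numpy.split(detections, [ 4, 5 ], axis = 1)
	keep = numpy.where(score_raw.ravel() > score_threshold)[0]
	bbox_list = []
	for bbox in bbox_raw[keep]:
		# center/size -> corners, then undo the letterbox offset and the resize scale
		bbox_list.append(numpy.array(
		[
			(bbox[0] - bbox[2] / 2 - offset_width) / scale,
			(bbox[1] - bbox[3] / 2 - offset_height) / scale,
			(bbox[0] + bbox[2] / 2 - offset_width) / scale,
			(bbox[1] + bbox[3] / 2 - offset_height) / scale
		]))
	kps = kps_raw[keep].reshape(-1, 5, 3) # five (x, y, confidence) triplets per face
	kps[:, :, 0] = (kps[:, :, 0] - offset_width) / scale
	kps[:, :, 1] = (kps[:, :, 1] - offset_height) / scale
	return bbox_list, list(kps[:, :, :2]), score_raw.ravel()[keep].tolist()


session = onnxruntime.InferenceSession('yoloface_8n.onnx', providers = [ 'CPUExecutionProvider' ])
blob, scale, offset_width, offset_height = prepare(cv2.imread('face.jpg'))
detections = session.run(None, { session.get_inputs()[0].name: blob })
print(decode(detections, scale, offset_width, offset_height))

Note that the sketch folds the resize step into prepare(), whereas in the patch extract_faces() resizes the frame before calling the detector and hands over ratio_width and ratio_height; dividing by scale here is equivalent to multiplying by those ratios.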