From 6aea186ee897fea8602ec46b941ef7185b64a5fe Mon Sep 17 00:00:00 2001
From: Henry Ruhs
Date: Mon, 22 Jan 2024 13:56:45 +0100
Subject: [PATCH] feat/yoloface (#334)

* added yolov8 to face_detector (#323)

* added yolov8 to face_detector

* added yolov8 to face_detector

* Initial cleanup and renaming

* Update README

* refactored detect_with_yoloface (#329)

* refactored detect_with_yoloface

* apply review

* Change order again

* Restore working code

* modified code (#330)

* refactored detect_with_yoloface

* apply review

* use temp_frame in detect_with_yoloface

* reorder

* modified

* reorder models

* Tiny cleanup

---------

Co-authored-by: tamoharu <133945583+tamoharu@users.noreply.github.com>
---
 README.md                   | 95 +++++++++++++++++++------------------
 facefusion/choices.py       |  2 +-
 facefusion/face_analyser.py | 55 ++++++++++++++++++++-
 facefusion/typing.py        |  2 +-
 4 files changed, 104 insertions(+), 50 deletions(-)

diff --git a/README.md b/README.md
index 1f7b530f..72ca078a 100644
--- a/README.md
+++ b/README.md
@@ -30,73 +30,74 @@ Run the command:
 python run.py [options]
 
 options:
-  -h, --help                                              show this help message and exit
-  -s SOURCE_PATHS, --source SOURCE_PATHS                  select a source image
-  -t TARGET_PATH, --target TARGET_PATH                    select a target image or video
-  -o OUTPUT_PATH, --output OUTPUT_PATH                    specify the output file or directory
-  -v, --version                                           show program's version number and exit
+  -h, --help                                              show this help message and exit
+  -s SOURCE_PATHS, --source SOURCE_PATHS                  select a source image
+  -t TARGET_PATH, --target TARGET_PATH                    select a target image or video
+  -o OUTPUT_PATH, --output OUTPUT_PATH                    specify the output file or directory
+  -v, --version                                           show program's version number and exit
 
 misc:
-  --skip-download                                         omit automate downloads and lookups
-  --headless                                              run the program in headless mode
-  --log-level {error,warn,info,debug}                     choose from the available log levels
+  --skip-download                                         omit automate downloads and lookups
+  --headless                                              run the program in headless mode
+  --log-level {error,warn,info,debug}                     choose from the available log levels
 
 execution:
-  --execution-providers EXECUTION_PROVIDERS [EXECUTION_PROVIDERS ...] choose from the available execution providers (choices: cpu, ...)
-  --execution-thread-count [1-128]                        specify the number of execution threads
-  --execution-queue-count [1-32]                          specify the number of execution queries
+  --execution-providers EXECUTION_PROVIDERS [EXECUTION_PROVIDERS ...] choose from the available execution providers (choices: cpu, ...)
+  --execution-thread-count [1-128]                        specify the number of execution threads
+  --execution-queue-count [1-32]                          specify the number of execution queries
 
 memory:
-  --video-memory-strategy {strict,moderate,tolerant}      specify strategy to handle the video memory
-  --system-memory-limit [0-128]                           specify the amount (gb) of system memory to be used
+  --video-memory-strategy {strict,moderate,tolerant}      specify strategy to handle the video memory
+  --system-memory-limit [0-128]                           specify the amount (gb) of system memory to be used
 
 face analyser:
-  --face-analyser-order {left-right,right-left,top-bottom,bottom-top,small-large,large-small,best-worst,worst-best} specify the order used for the face analyser
-  --face-analyser-age {child,teen,adult,senior}           specify the age used for the face analyser
-  --face-analyser-gender {male,female}                    specify the gender used for the face analyser
-  --face-detector-model {retinaface,yunet}                specify the model used for the face detector
-  --face-detector-size {160x160,320x320,480x480,512x512,640x640,768x768,960x960,1024x1024} specify the size threshold used for the face detector
-  --face-detector-score [0.0-1.0]                         specify the score threshold used for the face detector
+  --face-analyser-order {left-right,right-left,top-bottom,bottom-top,small-large,large-small,best-worst,worst-best} specify the order used for the face analyser
+  --face-analyser-age {child,teen,adult,senior}           specify the age used for the face analyser
+  --face-analyser-gender {male,female}                    specify the gender used for the face analyser
+  --face-detector-model {retinaface,yoloface,yunet}       specify the model used for the face detector
+  --face-detector-size {160x160,320x320,480x480,512x512,640x640,768x768,960x960,1024x1024} specify the size threshold used for the face detector
+  --face-detector-score [0.0-1.0]                         specify the score threshold used for the face detector
 
 face selector:
-  --face-selector-mode {reference,one,many}               specify the mode for the face selector
-  --reference-face-position REFERENCE_FACE_POSITION       specify the position of the reference face
-  --reference-face-distance [0.0-1.5]                     specify the distance between the reference face and the target face
-  --reference-frame-number REFERENCE_FRAME_NUMBER         specify the number of the reference frame
+  --face-selector-mode {reference,one,many}               specify the mode for the face selector
+  --reference-face-position REFERENCE_FACE_POSITION       specify the position of the reference face
+  --reference-face-distance [0.0-1.5]                     specify the distance between the reference face and the target face
+  --reference-frame-number REFERENCE_FRAME_NUMBER         specify the number of the reference frame
 
 face mask:
-  --face-mask-types FACE_MASK_TYPES [FACE_MASK_TYPES ...] choose from the available face mask types (choices: box, occlusion, region)
-  --face-mask-blur [0.0-1.0]                              specify the blur amount for face mask
-  --face-mask-padding FACE_MASK_PADDING [FACE_MASK_PADDING ...] specify the face mask padding (top, right, bottom, left) in percent
-  --face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose from the available face mask regions (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, eye-glasses, nose, mouth, upper-lip, lower-lip)
+  --face-mask-types FACE_MASK_TYPES [FACE_MASK_TYPES ...] choose from the available face mask types (choices: box, occlusion, region)
+  --face-mask-blur [0.0-1.0]                              specify the blur amount for face mask
+  --face-mask-padding FACE_MASK_PADDING [FACE_MASK_PADDING ...] specify the face mask padding (top, right, bottom, left) in percent
+  --face-mask-regions FACE_MASK_REGIONS [FACE_MASK_REGIONS ...] choose from the available face mask regions (choices: skin, left-eyebrow, right-eyebrow, left-eye, right-eye, eye-glasses, nose, mouth, upper-lip, lower-lip)
 
 frame extraction:
-  --trim-frame-start TRIM_FRAME_START                     specify the start frame for extraction
-  --trim-frame-end TRIM_FRAME_END                         specify the end frame for extraction
-  --temp-frame-format {jpg,png,bmp}                       specify the image format used for frame extraction
-  --temp-frame-quality [0-100]                            specify the image quality used for frame extraction
-  --keep-temp                                             retain temporary frames after processing
+  --trim-frame-start TRIM_FRAME_START                     specify the start frame for extraction
+  --trim-frame-end TRIM_FRAME_END                         specify the end frame for extraction
+  --temp-frame-format {jpg,png,bmp}                       specify the image format used for frame extraction
+  --temp-frame-quality [0-100]                            specify the image quality used for frame extraction
+  --keep-temp                                             retain temporary frames after processing
 
 output creation:
-  --output-image-quality [0-100]                          specify the quality used for the output image
-  --output-video-encoder {libx264,libx265,libvpx-vp9,h264_nvenc,hevc_nvenc} specify the encoder used for the output video
-  --output-video-preset {ultrafast,superfast,veryfast,faster,fast,medium,slow,slower,veryslow} specify the preset used for the output video
-  --output-video-quality [0-100]                          specify the quality used for the output video
-  --output-video-resolution OUTPUT_VIDEO_RESOLUTION       specify the resolution used for the output video
-  --output-video-fps OUTPUT_VIDEO_FPS                     specify the frames per second (fps) used for the output video
-  --skip-audio                                            omit audio from the target
+  --output-image-quality [0-100]                          specify the quality used for the output image
+  --output-video-encoder {libx264,libx265,libvpx-vp9,h264_nvenc,hevc_nvenc} specify the encoder used for the output video
+  --output-video-preset {ultrafast,superfast,veryfast,faster,fast,medium,slow,slower,veryslow} specify the preset used for the output video
+  --output-video-quality [0-100]                          specify the quality used for the output video
+  --output-video-resolution OUTPUT_VIDEO_RESOLUTION       specify the resolution used for the output video
+  --output-video-fps OUTPUT_VIDEO_FPS                     specify the frames per second (fps) used for the output video
+  --skip-audio                                            omit audio from the target
 
 frame processors:
-  --frame-processors FRAME_PROCESSORS [FRAME_PROCESSORS ...] choose from the available frame processors (choices: face_debugger, face_enhancer, face_swapper, frame_enhancer, ...)
-  --face-debugger-items FACE_DEBUGGER_ITEMS [FACE_DEBUGGER_ITEMS ...] specify the face debugger items (choices: bbox, kps, face-mask, score)
-  --face-enhancer-model {codeformer,gfpgan_1.2,gfpgan_1.3,gfpgan_1.4,gpen_bfr_256,gpen_bfr_512,restoreformer} choose the model for the frame processor
-  --face-enhancer-blend [0-100]                           specify the blend amount for the frame processor
-  --face-swapper-model {blendswap_256,inswapper_128,inswapper_128_fp16,simswap_256,simswap_512_unofficial} choose the model for the frame processor
-  --frame-enhancer-model {real_esrgan_x2plus,real_esrgan_x4plus,real_esrnet_x4plus} choose the model for the frame processor
-  --frame-enhancer-blend [0-100]                          specify the blend amount for the frame processor
+  --frame-processors FRAME_PROCESSORS [FRAME_PROCESSORS ...] choose from the available frame processors (choices: face_debugger, face_enhancer, face_swapper, frame_enhancer, ...)
+  --face-debugger-items FACE_DEBUGGER_ITEMS [FACE_DEBUGGER_ITEMS ...] specify the face debugger items (choices: bbox, kps, face-mask, score)
+  --face-enhancer-model {codeformer,gfpgan_1.2,gfpgan_1.3,gfpgan_1.4,gpen_bfr_256,gpen_bfr_512,restoreformer_plus_plus}
+                                                          choose the model for the frame processor
+  --face-enhancer-blend [0-100]                           specify the blend amount for the frame processor
+  --face-swapper-model {blendswap_256,inswapper_128,inswapper_128_fp16,simswap_256,simswap_512_unofficial} choose the model for the frame processor
+  --frame-enhancer-model {real_esrgan_x2plus,real_esrgan_x4plus,real_esrnet_x4plus} choose the model for the frame processor
+  --frame-enhancer-blend [0-100]                          specify the blend amount for the frame processor
 
 uis:
-  --ui-layouts UI_LAYOUTS [UI_LAYOUTS ...]                choose from the available ui layouts (choices: benchmark, webcam, default, ...)
+  --ui-layouts UI_LAYOUTS [UI_LAYOUTS ...]                choose from the available ui layouts (choices: benchmark, webcam, default, ...)
 
 ```
diff --git a/facefusion/choices.py b/facefusion/choices.py
index b2fb133f..124e697e 100755
--- a/facefusion/choices.py
+++ b/facefusion/choices.py
@@ -7,7 +7,7 @@ video_memory_strategies : List[VideoMemoryStrategy] = [ 'strict', 'moderate', 't
 face_analyser_orders : List[FaceAnalyserOrder] = [ 'left-right', 'right-left', 'top-bottom', 'bottom-top', 'small-large', 'large-small', 'best-worst', 'worst-best' ]
 face_analyser_ages : List[FaceAnalyserAge] = [ 'child', 'teen', 'adult', 'senior' ]
 face_analyser_genders : List[FaceAnalyserGender] = [ 'male', 'female' ]
-face_detector_models : List[str] = [ 'retinaface', 'yunet' ]
+face_detector_models : List[str] = [ 'retinaface', 'yoloface', 'yunet' ]
 face_detector_sizes : List[str] = [ '160x160', '320x320', '480x480', '512x512', '640x640', '768x768', '960x960', '1024x1024' ]
 face_selector_modes : List[FaceSelectorMode] = [ 'reference', 'one', 'many' ]
 face_mask_types : List[FaceMaskType] = [ 'box', 'occlusion', 'region' ]
diff --git a/facefusion/face_analyser.py b/facefusion/face_analyser.py
index ffb0741f..de756580 100644
--- a/facefusion/face_analyser.py
+++ b/facefusion/face_analyser.py
@@ -23,6 +23,11 @@ MODELS : ModelSet =\
 		'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/retinaface_10g.onnx',
 		'path': resolve_relative_path('../.assets/models/retinaface_10g.onnx')
 	},
+	'face_detector_yoloface':
+	{
+		'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/yoloface_8n.onnx',
+		'path': resolve_relative_path('../.assets/models/yoloface_8n.onnx')
+	},
 	'face_detector_yunet':
 	{
 		'url': 'https://github.com/facefusion/facefusion-assets/releases/download/models/yunet_2023mar.onnx',
@@ -58,6 +63,8 @@ def get_face_analyser() -> Any:
 		if FACE_ANALYSER is None:
 			if facefusion.globals.face_detector_model == 'retinaface':
 				face_detector = onnxruntime.InferenceSession(MODELS.get('face_detector_retinaface').get('path'), providers = apply_execution_provider_options(facefusion.globals.execution_providers))
+			if facefusion.globals.face_detector_model == 'yoloface':
+				face_detector = onnxruntime.InferenceSession(MODELS.get('face_detector_yoloface').get('path'), providers = apply_execution_provider_options(facefusion.globals.execution_providers))
 			if facefusion.globals.face_detector_model == 'yunet':
 				face_detector = cv2.FaceDetectorYN.create(MODELS.get('face_detector_yunet').get('path'), '', (0, 0))
 			if facefusion.globals.face_recognizer_model == 'arcface_blendswap':
@@ -88,6 +95,7 @@ def pre_check() -> bool:
 		model_urls =\
 		[
 			MODELS.get('face_detector_retinaface').get('url'),
+			MODELS.get('face_detector_yoloface').get('url'),
 			MODELS.get('face_detector_yunet').get('url'),
 			MODELS.get('face_recognizer_arcface_inswapper').get('url'),
 			MODELS.get('face_recognizer_arcface_simswap').get('url'),
@@ -104,10 +112,13 @@ def extract_faces(frame : Frame) -> List[Face]:
 	temp_frame_height, temp_frame_width, _ = temp_frame.shape
 	ratio_height = frame_height / temp_frame_height
 	ratio_width = frame_width / temp_frame_width
+	if facefusion.globals.face_detector_model == 'yoloface':
+		bbox_list, kps_list, score_list = detect_with_yoloface(temp_frame, temp_frame_height, temp_frame_width, face_detector_height, face_detector_width, ratio_height, ratio_width)
+		return create_faces(frame, bbox_list, kps_list, score_list)
 	if facefusion.globals.face_detector_model == 'retinaface':
 		bbox_list, kps_list, score_list = detect_with_retinaface(temp_frame, temp_frame_height, temp_frame_width, face_detector_height, face_detector_width, ratio_height, ratio_width)
 		return create_faces(frame, bbox_list, kps_list, score_list)
-	elif facefusion.globals.face_detector_model == 'yunet':
+	if facefusion.globals.face_detector_model == 'yunet':
 		bbox_list, kps_list, score_list = detect_with_yunet(temp_frame, temp_frame_height, temp_frame_width, ratio_height, ratio_width)
 		return create_faces(frame, bbox_list, kps_list, score_list)
 	return []
@@ -153,6 +164,48 @@ def detect_with_retinaface(temp_frame : Frame, temp_frame_height : int, temp_fra
 	return bbox_list, kps_list, score_list
 
 
+def detect_with_yoloface(temp_frame : Frame, temp_frame_height : int, temp_frame_width : int, face_detector_height : int, face_detector_width : int, ratio_height : float, ratio_width : float) -> Tuple[List[Bbox], List[Kps], List[Score]]:
+	face_detector = get_face_analyser().get('face_detector')
+	bbox_list = []
+	kps_list = []
+	score_list = []
+	offset_width = (face_detector_width - temp_frame_width) / 2
+	offset_height = (face_detector_height - temp_frame_height) / 2
+	temp_frame = cv2.copyMakeBorder(temp_frame, round(offset_height - 0.1), round(offset_height + 0.1), round(offset_width - 0.1), round(offset_width + 0.1), cv2.BORDER_CONSTANT, value = (114, 114, 114))
+	temp_frame = temp_frame.astype(numpy.float32) / 255.0
+	temp_frame = temp_frame[..., ::-1].transpose(2, 0, 1)
+	temp_frame = numpy.expand_dims(temp_frame, axis = 0)
+	temp_frame = numpy.ascontiguousarray(temp_frame)
+	with THREAD_SEMAPHORE:
+		detections = face_detector.run(None,
+		{
+			face_detector.get_inputs()[0].name: temp_frame
+		})
+	detections = numpy.squeeze(detections).T
+	bbox_raw, score_raw, kps_raw = numpy.split(detections, [ 4, 5 ], axis = 1)
+	keep_indices = numpy.where(score_raw > facefusion.globals.face_detector_score)[0]
+	if keep_indices.any():
+		bbox_raw, kps_raw, score_raw = bbox_raw[keep_indices], kps_raw[keep_indices], score_raw[keep_indices]
+		for bbox in bbox_raw:
+			bbox_list.append(numpy.array(
+			[
+				(bbox[0] - bbox[2] / 2 - offset_width) * ratio_width,
+				(bbox[1] - bbox[3] / 2 - offset_height) * ratio_height,
+				(bbox[0] + bbox[2] / 2 - offset_width) * ratio_width,
+				(bbox[1] + bbox[3] / 2 - offset_height) * ratio_height
+			]))
+		kps_raw[:, 0::3] = (kps_raw[:, 0::3] - offset_width) * ratio_width
+		kps_raw[:, 1::3] = (kps_raw[:, 1::3] - offset_height) * ratio_height
+		for kps in kps_raw:
+			indexes = numpy.arange(0, len(kps), 3)
+			temp_kps = []
+			for index in indexes:
+				temp_kps.append([kps[index], kps[index + 1]])
+			kps_list.append(numpy.array(temp_kps))
+		score_list = score_raw.ravel().tolist()
+	return bbox_list, kps_list, score_list
+
+
 def detect_with_yunet(temp_frame : Frame, temp_frame_height : int, temp_frame_width : int, ratio_height : float, ratio_width : float) -> Tuple[List[Bbox], List[Kps], List[Score]]:
 	face_detector = get_face_analyser().get('face_detector')
 	face_detector.setInputSize((temp_frame_width, temp_frame_height))
diff --git a/facefusion/typing.py b/facefusion/typing.py
index 7a2459ff..2b6a054a 100755
--- a/facefusion/typing.py
+++ b/facefusion/typing.py
@@ -42,7 +42,7 @@ FaceSelectorMode = Literal['reference', 'one', 'many']
 FaceAnalyserOrder = Literal['left-right', 'right-left', 'top-bottom', 'bottom-top', 'small-large', 'large-small', 'best-worst', 'worst-best']
 FaceAnalyserAge = Literal['child', 'teen', 'adult', 'senior']
 FaceAnalyserGender = Literal['male', 'female']
-FaceDetectorModel = Literal['retinaface', 'yunet']
+FaceDetectorModel = Literal['retinaface', 'yoloface', 'yunet']
 FaceRecognizerModel = Literal['arcface_blendswap', 'arcface_inswapper', 'arcface_simswap']
 FaceMaskType = Literal['box', 'occlusion', 'region']
 FaceMaskRegion = Literal['skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'eye-glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip']
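
With the patch applied, the YOLO-based detector is opt-in. A typical invocation might look like this; the source, target and output names are placeholders, while the flags and their choices come straight from the README help text above:

python run.py -s source.jpg -t target.mp4 -o output.mp4 --face-detector-model yoloface --face-detector-size 640x640 --face-detector-score 0.5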
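
To make the tensor flow in detect_with_yoloface easier to trace, here is a minimal standalone sketch of the same pre- and post-processing. It is an illustration under stated assumptions, not part of the patch: the model and image paths are placeholders, and the model output is assumed to squeeze to (20, N), with rows 0-3 holding center-x, center-y, width and height, row 4 the face score, and rows 5-19 five (x, y, confidence) landmark triplets; that layout is implied by the squeeze().T, the numpy.split at [ 4, 5 ] and the stride-3 landmark slicing above. The sketch reshapes the landmarks to (N, 5, 3) instead of stride slicing, which is a readability choice, not a behavioral change.

# Standalone sketch of the yoloface pre/post-processing (not part of the patch).
# Assumptions: 'yoloface_8n.onnx' and 'face.jpg' are placeholder paths and the model
# takes a 1x3x640x640 RGB blob, as suggested by the 640x640 detector size above.
import cv2
import numpy
import onnxruntime


def prepare(frame, size = 640):
	height, width = frame.shape[:2]
	scale = size / max(height, width)
	frame = cv2.resize(frame, (int(width * scale), int(height * scale)))
	offset_width = (size - frame.shape[1]) / 2
	offset_height = (size - frame.shape[0]) / 2
	# letterbox to size x size with the neutral gray used by the patch
	frame = cv2.copyMakeBorder(frame, round(offset_height - 0.1), round(offset_height + 0.1), round(offset_width - 0.1), round(offset_width + 0.1), cv2.BORDER_CONSTANT, value = (114, 114, 114))
	frame = frame.astype(numpy.float32) / 255.0
	frame = frame[..., ::-1].transpose(2, 0, 1) # BGR -> RGB, HWC -> CHW
	return numpy.ascontiguousarray(numpy.expand_dims(frame, axis = 0)), scale, offset_width, offset_height


def decode(detections, scale, offset_width, offset_height, score_threshold = 0.5):
	detections = numpy.squeeze(detections).T # (N, 20)
	bbox_raw, score_raw, kps_raw = numpy.split(detections, [ 4, 5 ], axis = 1)
	keep = numpy.where(score_raw.ravel() > score_threshold)[0]
	bbox_list = []
	for bbox in bbox_raw[keep]:
		# center/size -> corners, then undo the letterbox offset and the resize scale
		bbox_list.append(numpy.array(
		[
			(bbox[0] - bbox[2] / 2 - offset_width) / scale,
			(bbox[1] - bbox[3] / 2 - offset_height) / scale,
			(bbox[0] + bbox[2] / 2 - offset_width) / scale,
			(bbox[1] + bbox[3] / 2 - offset_height) / scale
		]))
	kps = kps_raw[keep].reshape(-1, 5, 3) # five (x, y, confidence) triplets per face
	kps[:, :, 0] = (kps[:, :, 0] - offset_width) / scale
	kps[:, :, 1] = (kps[:, :, 1] - offset_height) / scale
	return bbox_list, list(kps[:, :, :2]), score_raw.ravel()[keep].tolist()


session = onnxruntime.InferenceSession('yoloface_8n.onnx', providers = [ 'CPUExecutionProvider' ])
blob, scale, offset_width, offset_height = prepare(cv2.imread('face.jpg'))
detections = session.run(None, { session.get_inputs()[0].name: blob })
print(decode(detections, scale, offset_width, offset_height))

Note that the sketch folds the resize step into prepare(), whereas in the patch extract_faces() resizes the frame before calling the detector and hands over ratio_width and ratio_height; dividing by scale here is equivalent to multiplying by those ratios.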