Face selector auto mode (#1137)

* introduce face selector auto mode

* introduce face selector auto mode

* introduce face selector auto mode

* correct way is to pass source_vision_frames
This commit is contained in:
Henry Ruhs
2026-06-01 10:40:33 +02:00
committed by GitHub
parent 162c764b35
commit 6cbe5af9a6
19 changed files with 101 additions and 52 deletions
+5 -3
View File
@@ -2,7 +2,7 @@ import logging
from typing import List, Sequence, get_args
from facefusion.common_helper import create_float_range, create_int_range
from facefusion.types import Angle, AudioEncoder, AudioFormat, AudioTypeSet, BenchmarkMode, BenchmarkResolution, BenchmarkSet, DownloadProvider, DownloadProviderSet, DownloadScope, EncoderSet, ExecutionProvider, ExecutionProviderSet, FaceDetectorModel, FaceDetectorSet, FaceLandmarkerModel, FaceMaskArea, FaceMaskAreaSet, FaceMaskRegion, FaceMaskRegionSet, FaceMaskType, FaceOccluderModel, FaceParserModel, FaceSelectorMode, FaceSelectorOrder, Gender, ImageFormat, ImageTypeSet, JobStatus, LogLevel, LogLevelSet, Race, Score, TempFrameFormat, UiWorkflow, VideoEncoder, VideoFormat, VideoMemoryStrategy, VideoPreset, VideoTypeSet, VoiceExtractorModel
from facefusion.types import Angle, AudioEncoder, AudioFormat, AudioTypeSet, BenchmarkMode, BenchmarkResolution, BenchmarkSet, DownloadProvider, DownloadProviderSet, DownloadScope, EncoderSet, ExecutionProvider, ExecutionProviderSet, FaceDetectorModel, FaceDetectorSet, FaceLandmarkerModel, FaceMaskArea, FaceMaskAreaSet, FaceMaskRegion, FaceMaskRegionSet, FaceMaskType, FaceOccluderModel, FaceParserModel, FaceSelectorGender, FaceSelectorMode, FaceSelectorOrder, FaceSelectorRace, Gender, ImageFormat, ImageTypeSet, JobStatus, LogLevel, LogLevelSet, Race, Score, TempFrameFormat, UiWorkflow, VideoEncoder, VideoFormat, VideoMemoryStrategy, VideoPreset, VideoTypeSet, VoiceExtractorModel
face_detector_set : FaceDetectorSet =\
{
@@ -16,8 +16,10 @@ face_detector_models : List[FaceDetectorModel] = list(get_args(FaceDetectorModel
face_landmarker_models : List[FaceLandmarkerModel] = list(get_args(FaceLandmarkerModel))
face_selector_modes : List[FaceSelectorMode] = list(get_args(FaceSelectorMode))
face_selector_orders : List[FaceSelectorOrder] = list(get_args(FaceSelectorOrder))
face_selector_genders : List[Gender] = list(get_args(Gender))
face_selector_races : List[Race] = list(get_args(Race))
genders : List[Gender] = list(get_args(Gender))
races : List[Race] = list(get_args(Race))
face_selector_genders : List[FaceSelectorGender] = list(get_args(FaceSelectorGender))
face_selector_races : List[FaceSelectorRace] = list(get_args(FaceSelectorRace))
face_occluder_models : List[FaceOccluderModel] = list(get_args(FaceOccluderModel))
face_parser_models : List[FaceParserModel] = list(get_args(FaceParserModel))
face_mask_types : List[FaceMaskType] = list(get_args(FaceMaskType))
+36 -13
View File
@@ -2,26 +2,33 @@ from typing import List
import numpy
import facefusion.choices
from facefusion import state_manager
from facefusion.common_helper import get_first
from facefusion.face_analyser import get_many_faces, get_one_face
from facefusion.types import Face, FaceSelectorOrder, Gender, Race, Score, VisionFrame
def select_faces(reference_vision_frame : VisionFrame, target_vision_frame : VisionFrame) -> List[Face]:
def select_faces(reference_vision_frame : VisionFrame, source_vision_frames : List[VisionFrame], target_vision_frame : VisionFrame) -> List[Face]:
source_faces = []
target_faces = get_many_faces([ target_vision_frame ])
if state_manager.get_item('face_selector_gender') == 'auto' or state_manager.get_item('face_selector_race') == 'auto':
source_faces = get_many_faces(source_vision_frames)
if state_manager.get_item('face_selector_mode') == 'many':
return sort_and_filter_faces(target_faces)
return sort_and_filter_faces(source_faces, target_faces)
if state_manager.get_item('face_selector_mode') == 'one':
target_face = get_one_face(sort_and_filter_faces(target_faces))
target_face = get_one_face(sort_and_filter_faces(source_faces, target_faces))
if target_face:
return [ target_face ]
if state_manager.get_item('face_selector_mode') == 'reference':
reference_faces = get_many_faces([ reference_vision_frame ])
reference_faces = sort_and_filter_faces(reference_faces)
reference_faces = sort_and_filter_faces(source_faces, reference_faces)
reference_face = get_one_face(reference_faces, state_manager.get_item('reference_face_position'))
if reference_face:
match_faces = find_match_faces([ reference_face ], target_faces, state_manager.get_item('reference_face_distance'))
return match_faces
@@ -53,17 +60,33 @@ def calculate_face_distance(face : Face, reference_face : Face) -> float:
return 0
def sort_and_filter_faces(source_faces : List[Face], target_faces : List[Face]) -> List[Face]:
	"""
	Sort the target faces and filter them by gender, race and age based on the face selector state.

	A gender or race selector set to 'auto' is resolved from the first source face in
	'large-small' order. A selector that is unset, or that stays 'auto' because no
	source face is available, applies no filter.

	:param source_faces: faces used to resolve the 'auto' selectors
	:param target_faces: faces to sort and filter
	:return: the sorted and filtered target faces, the input is returned as is when empty
	"""
	if target_faces:
		if state_manager.get_item('face_selector_order'):
			target_faces = sort_faces_by_order(target_faces, state_manager.get_item('face_selector_order'))

		face_selector_gender = state_manager.get_item('face_selector_gender')
		face_selector_race = state_manager.get_item('face_selector_race')

		# the parentheses matter: 'and' binds tighter than 'or', without them
		# a race of 'auto' would skip the source_faces check entirely
		if source_faces and (face_selector_gender == 'auto' or face_selector_race == 'auto'):
			source_face = get_first(sort_faces_by_order(source_faces, 'large-small'))

			if source_face:
				if face_selector_gender == 'auto':
					face_selector_gender = source_face.gender
				if face_selector_race == 'auto':
					face_selector_race = source_face.race

		# membership test drops both an unset selector and an unresolved 'auto'
		if face_selector_gender in facefusion.choices.genders:
			target_faces = filter_faces_by_gender(target_faces, face_selector_gender)
		if face_selector_race in facefusion.choices.races:
			target_faces = filter_faces_by_race(target_faces, face_selector_race)

		if state_manager.get_item('face_selector_age_start') or state_manager.get_item('face_selector_age_end'):
			target_faces = filter_faces_by_age(target_faces, state_manager.get_item('face_selector_age_start'), state_manager.get_item('face_selector_age_end'))

	return target_faces
def sort_faces_by_order(faces : List[Face], order : FaceSelectorOrder) -> List[Face]:
@@ -276,10 +276,11 @@ def normalize_extend_frame(extend_vision_frame : VisionFrame) -> VisionFrame:
def process_frame(inputs : AgeModifierInputs) -> ProcessorOutputs:
reference_vision_frame = inputs.get('reference_vision_frame')
source_vision_frames = inputs.get('source_vision_frames')
target_vision_frame = inputs.get('target_vision_frame')
temp_vision_frame = inputs.get('temp_vision_frame')
temp_vision_mask = inputs.get('temp_vision_mask')
target_faces = select_faces(reference_vision_frame, target_vision_frame)
target_faces = select_faces(reference_vision_frame, source_vision_frames, target_vision_frame)
if target_faces:
for target_face in target_faces:
@@ -1,4 +1,4 @@
from typing import Any, Literal, TypeAlias, TypedDict
from typing import Any, List, Literal, TypeAlias, TypedDict
from numpy.typing import NDArray
@@ -7,6 +7,7 @@ from facefusion.types import Mask, VisionFrame
AgeModifierInputs = TypedDict('AgeModifierInputs',
{
'reference_vision_frame' : VisionFrame,
'source_vision_frames' : List[VisionFrame],
'target_vision_frame' : VisionFrame,
'temp_vision_frame' : VisionFrame,
'temp_vision_mask' : Mask
@@ -411,10 +411,11 @@ def prepare_crop_mask(crop_source_mask : Mask, crop_target_mask : Mask) -> Mask:
def process_frame(inputs : DeepSwapperInputs) -> ProcessorOutputs:
reference_vision_frame = inputs.get('reference_vision_frame')
source_vision_frames = inputs.get('source_vision_frames')
target_vision_frame = inputs.get('target_vision_frame')
temp_vision_frame = inputs.get('temp_vision_frame')
temp_vision_mask = inputs.get('temp_vision_mask')
target_faces = select_faces(reference_vision_frame, target_vision_frame)
target_faces = select_faces(reference_vision_frame, source_vision_frames, target_vision_frame)
if target_faces:
for target_face in target_faces:
@@ -1,4 +1,4 @@
from typing import Any, TypeAlias, TypedDict
from typing import Any, List, TypeAlias, TypedDict
from numpy.typing import NDArray
@@ -7,6 +7,7 @@ from facefusion.types import Mask, VisionFrame
DeepSwapperInputs = TypedDict('DeepSwapperInputs',
{
'reference_vision_frame' : VisionFrame,
'source_vision_frames' : List[VisionFrame],
'target_vision_frame' : VisionFrame,
'temp_vision_frame' : VisionFrame,
'temp_vision_mask' : Mask
@@ -257,10 +257,11 @@ def normalize_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
def process_frame(inputs : ExpressionRestorerInputs) -> ProcessorOutputs:
reference_vision_frame = inputs.get('reference_vision_frame')
source_vision_frames = inputs.get('source_vision_frames')
target_vision_frame = inputs.get('target_vision_frame')
temp_vision_frame = inputs.get('temp_vision_frame')
temp_vision_mask = inputs.get('temp_vision_mask')
target_faces = select_faces(reference_vision_frame, target_vision_frame)
target_faces = select_faces(reference_vision_frame, source_vision_frames, target_vision_frame)
if target_faces:
for target_face in target_faces:
@@ -232,10 +232,11 @@ def calculate_scale(temp_vision_frame : VisionFrame) -> int:
def process_frame(inputs : FaceDebuggerInputs) -> ProcessorOutputs:
reference_vision_frame = inputs.get('reference_vision_frame')
source_vision_frames = inputs.get('source_vision_frames')
target_vision_frame = inputs.get('target_vision_frame')
temp_vision_frame = inputs.get('temp_vision_frame')
temp_vision_mask = inputs.get('temp_vision_mask')
target_faces = select_faces(reference_vision_frame, target_vision_frame)
target_faces = select_faces(reference_vision_frame, source_vision_frames, target_vision_frame)
if target_faces:
for target_face in target_faces:
@@ -1,10 +1,11 @@
from typing import Literal, TypedDict
from typing import List, Literal, TypedDict
from facefusion.types import Mask, VisionFrame
FaceDebuggerInputs = TypedDict('FaceDebuggerInputs',
{
'reference_vision_frame' : VisionFrame,
'source_vision_frames' : List[VisionFrame],
'target_vision_frame' : VisionFrame,
'temp_vision_frame' : VisionFrame,
'temp_vision_mask' : Mask
@@ -486,10 +486,11 @@ def normalize_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
def process_frame(inputs : FaceEditorInputs) -> ProcessorOutputs:
reference_vision_frame = inputs.get('reference_vision_frame')
source_vision_frames = inputs.get('source_vision_frames')
target_vision_frame = inputs.get('target_vision_frame')
temp_vision_frame = inputs.get('temp_vision_frame')
temp_vision_mask = inputs.get('temp_vision_mask')
target_faces = select_faces(reference_vision_frame, target_vision_frame)
target_faces = select_faces(reference_vision_frame, source_vision_frames, target_vision_frame)
if target_faces:
for target_face in target_faces:
@@ -1,10 +1,11 @@
from typing import Literal, TypedDict
from typing import List, Literal, TypedDict
from facefusion.types import Mask, VisionFrame
FaceEditorInputs = TypedDict('FaceEditorInputs',
{
'reference_vision_frame' : VisionFrame,
'source_vision_frames' : List[VisionFrame],
'target_vision_frame' : VisionFrame,
'temp_vision_frame' : VisionFrame,
'temp_vision_mask' : Mask
@@ -413,10 +413,11 @@ def blend_paste_frame(temp_vision_frame : VisionFrame, paste_vision_frame : Visi
def process_frame(inputs : FaceEnhancerInputs) -> ProcessorOutputs:
reference_vision_frame = inputs.get('reference_vision_frame')
source_vision_frames = inputs.get('source_vision_frames')
target_vision_frame = inputs.get('target_vision_frame')
temp_vision_frame = inputs.get('temp_vision_frame')
temp_vision_mask = inputs.get('temp_vision_mask')
target_faces = select_faces(reference_vision_frame, target_vision_frame)
target_faces = select_faces(reference_vision_frame, source_vision_frames, target_vision_frame)
if target_faces:
for target_face in target_faces:
@@ -1,4 +1,4 @@
from typing import Any, Literal, TypeAlias, TypedDict
from typing import Any, List, Literal, TypeAlias, TypedDict
from numpy.typing import NDArray
@@ -7,6 +7,7 @@ from facefusion.types import Mask, VisionFrame
FaceEnhancerInputs = TypedDict('FaceEnhancerInputs',
{
'reference_vision_frame' : VisionFrame,
'source_vision_frames' : List[VisionFrame],
'target_vision_frame' : VisionFrame,
'temp_vision_frame' : VisionFrame,
'temp_vision_mask' : Mask
@@ -775,7 +775,7 @@ def process_frame(inputs : FaceSwapperInputs) -> ProcessorOutputs:
temp_vision_frame = inputs.get('temp_vision_frame')
temp_vision_mask = inputs.get('temp_vision_mask')
source_face = extract_source_face(source_vision_frames)
target_faces = select_faces(reference_vision_frame, target_vision_frame)
target_faces = select_faces(reference_vision_frame, source_vision_frames, target_vision_frame)
if source_face and target_faces:
for target_face in target_faces:
@@ -282,11 +282,12 @@ def normalize_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
def process_frame(inputs : LipSyncerInputs) -> ProcessorOutputs:
reference_vision_frame = inputs.get('reference_vision_frame')
source_vision_frames = inputs.get('source_vision_frames')
source_voice_frame = inputs.get('source_voice_frame')
target_vision_frame = inputs.get('target_vision_frame')
temp_vision_frame = inputs.get('temp_vision_frame')
temp_vision_mask = inputs.get('temp_vision_mask')
target_faces = select_faces(reference_vision_frame, target_vision_frame)
target_faces = select_faces(reference_vision_frame, source_vision_frames, target_vision_frame)
if target_faces:
for target_face in target_faces:
@@ -1,4 +1,4 @@
from typing import Any, Literal, TypeAlias, TypedDict
from typing import Any, List, Literal, TypeAlias, TypedDict
from numpy.typing import NDArray
@@ -7,6 +7,7 @@ from facefusion.types import AudioFrame, Mask, VisionFrame
LipSyncerInputs = TypedDict('LipSyncerInputs',
{
'reference_vision_frame' : VisionFrame,
'source_vision_frames' : List[VisionFrame],
'source_voice_frame' : AudioFrame,
'target_vision_frame' : VisionFrame,
'temp_vision_frame' : VisionFrame,
+9 -4
View File
@@ -29,9 +29,14 @@ FaceScoreSet = TypedDict('FaceScoreSet',
'landmarker' : Score
})
Embedding : TypeAlias = NDArray[numpy.float64]
Gender = Literal['female', 'male']
Age : TypeAlias = range
Gender = Literal['female', 'male']
Race = Literal['white', 'black', 'latino', 'asian', 'indian', 'arabic']
FaceSelectorGender = Literal['auto', 'female', 'male']
FaceSelectorRace = Literal['auto', 'white', 'black', 'latino', 'asian', 'indian', 'arabic']
Face = namedtuple('Face',
[
'bounding_box',
@@ -40,8 +45,8 @@ Face = namedtuple('Face',
'angle',
'embedding',
'embedding_norm',
'gender',
'age',
'gender',
'race'
])
Language = Literal['en']
@@ -346,8 +351,8 @@ State = TypedDict('State',
'face_landmarker_score' : Score,
'face_selector_mode' : FaceSelectorMode,
'face_selector_order' : FaceSelectorOrder,
'face_selector_race' : Race,
'face_selector_gender' : Gender,
'face_selector_race' : FaceSelectorRace,
'face_selector_gender' : FaceSelectorGender,
'face_selector_age_start' : int,
'face_selector_age_end' : int,
'reference_face_position' : int,
+20 -14
View File
@@ -9,12 +9,12 @@ from facefusion import state_manager, translator
from facefusion.common_helper import calculate_float_step, calculate_int_step
from facefusion.face_analyser import get_many_faces
from facefusion.face_selector import sort_and_filter_faces
from facefusion.filesystem import is_image, is_video
from facefusion.types import FaceSelectorMode, FaceSelectorOrder, Gender, Race, VisionFrame
from facefusion.filesystem import filter_image_paths, is_image, is_video
from facefusion.types import FaceSelectorGender, FaceSelectorMode, FaceSelectorOrder, FaceSelectorRace, VisionFrame
from facefusion.uis.core import get_ui_component, get_ui_components, register_ui_component
from facefusion.uis.types import ComponentOptions
from facefusion.uis.ui_helper import convert_str_none
from facefusion.vision import fit_cover_frame, read_static_image, read_video_frame
from facefusion.vision import fit_cover_frame, read_static_image, read_static_images, read_video_frame
FACE_SELECTOR_MODE_DROPDOWN : Optional[gradio.Dropdown] = None
FACE_SELECTOR_ORDER_DROPDOWN : Optional[gradio.Dropdown] = None
@@ -42,12 +42,14 @@ def render() -> None:
'elem_classes': 'box-face-selector',
'visible': 'reference' in state_manager.get_item('face_selector_mode')
}
source_vision_frames = read_static_images(filter_image_paths(state_manager.get_item('source_paths')))
if is_image(state_manager.get_item('target_path')):
target_vision_frame = read_static_image(state_manager.get_item('target_path'))
reference_face_gallery_options['value'] = extract_gallery_frames(target_vision_frame)
reference_face_gallery_options['value'] = extract_gallery_frames(source_vision_frames, target_vision_frame)
if is_video(state_manager.get_item('target_path')):
target_vision_frame = read_video_frame(state_manager.get_item('target_path'), state_manager.get_item('reference_frame_number'))
reference_face_gallery_options['value'] = extract_gallery_frames(target_vision_frame)
reference_face_gallery_options['value'] = extract_gallery_frames(source_vision_frames, target_vision_frame)
FACE_SELECTOR_MODE_DROPDOWN = gradio.Dropdown(
label = translator.get('uis.face_selector_mode_dropdown'),
choices = facefusion.choices.face_selector_modes,
@@ -154,12 +156,12 @@ def update_face_selector_order(face_analyser_order : FaceSelectorOrder) -> gradi
return update_reference_position_gallery()
def update_face_selector_gender(face_selector_gender : FaceSelectorGender) -> gradio.Gallery:
	"""
	Store the chosen gender selector in the state and rebuild the reference gallery.

	:param face_selector_gender: dropdown value, passed through convert_str_none before storing
	:return: the gallery produced by update_reference_position_gallery
	"""
	normalized_gender = convert_str_none(face_selector_gender)
	state_manager.set_item('face_selector_gender', normalized_gender)
	return update_reference_position_gallery()
def update_face_selector_race(face_selector_race : FaceSelectorRace) -> gradio.Gallery:
	"""
	Store the chosen race selector in the state and rebuild the reference gallery.

	:param face_selector_race: dropdown value, passed through convert_str_none before storing
	:return: the gallery produced by update_reference_position_gallery
	"""
	normalized_race = convert_str_none(face_selector_race)
	state_manager.set_item('face_selector_race', normalized_race)
	return update_reference_position_gallery()
@@ -197,24 +199,27 @@ def clear_and_update_reference_position_gallery() -> gradio.Gallery:
def update_reference_position_gallery(frame_number : int = 0) -> gradio.Gallery:
	"""
	Rebuild the reference face gallery from the current source and target state.

	:param frame_number: video frame to read when the target path is a video
	:return: a gallery holding the extracted face crops, or an empty gallery (value None) when there are none
	"""
	face_crops = []
	source_vision_frames = read_static_images(filter_image_paths(state_manager.get_item('source_paths')))

	if is_image(state_manager.get_item('target_path')):
		image_frame = read_static_image(state_manager.get_item('target_path'))
		face_crops = extract_gallery_frames(source_vision_frames, image_frame)

	if is_video(state_manager.get_item('target_path')):
		video_frame = read_video_frame(state_manager.get_item('target_path'), frame_number)
		face_crops = extract_gallery_frames(source_vision_frames, video_frame)

	# an empty result is passed as None so the gallery is cleared
	return gradio.Gallery(value = face_crops if face_crops else None)
def extract_gallery_frames(target_vision_frame : VisionFrame) -> List[VisionFrame]:
def extract_gallery_frames(source_vision_frames : List[VisionFrame], target_vision_frame : VisionFrame) -> List[VisionFrame]:
gallery_vision_frames = []
faces = get_many_faces([ target_vision_frame ])
faces = sort_and_filter_faces(faces)
source_faces = get_many_faces(source_vision_frames)
target_faces = get_many_faces([ target_vision_frame ])
target_faces = sort_and_filter_faces(source_faces, target_faces)
for face in faces:
start_x, start_y, end_x, end_y = map(int, face.bounding_box)
for target_face in target_faces:
start_x, start_y, end_x, end_y = map(int, target_face.bounding_box)
padding_x = int((end_x - start_x) * 0.25)
padding_y = int((end_y - start_y) * 0.25)
start_x = max(0, start_x - padding_x)
@@ -225,4 +230,5 @@ def extract_gallery_frames(target_vision_frame : VisionFrame) -> List[VisionFram
crop_vision_frame = fit_cover_frame(crop_vision_frame, (128, 128))
crop_vision_frame = cv2.cvtColor(crop_vision_frame, cv2.COLOR_BGR2RGB)
gallery_vision_frames.append(crop_vision_frame)
return gallery_vision_frames
+4 -4
View File
@@ -231,7 +231,7 @@ def process_preview_frame(reference_vision_frame : VisionFrame, source_vision_fr
return numpy.hstack((temp_vision_frame, temp_vision_frame))
if preview_mode == 'face-by-face':
target_crop_vision_frame, output_crop_vision_frame = create_face_by_face(reference_vision_frame, target_vision_frame[:, :, :3], temp_vision_frame[:, :, :3])
target_crop_vision_frame, output_crop_vision_frame = create_face_by_face(reference_vision_frame, source_vision_frames, target_vision_frame[:, :, :3], temp_vision_frame[:, :, :3])
target_crop_vision_frame = obscure_frame(target_crop_vision_frame)
output_crop_vision_frame = obscure_frame(output_crop_vision_frame)
return numpy.hstack((target_crop_vision_frame, output_crop_vision_frame))
@@ -261,14 +261,14 @@ def process_preview_frame(reference_vision_frame : VisionFrame, source_vision_fr
return numpy.hstack((target_vision_frame, temp_vision_frame))
if preview_mode == 'face-by-face':
target_crop_vision_frame, output_crop_vision_frame = create_face_by_face(reference_vision_frame, target_vision_frame, temp_vision_frame)
target_crop_vision_frame, output_crop_vision_frame = create_face_by_face(reference_vision_frame, source_vision_frames, target_vision_frame, temp_vision_frame)
return numpy.hstack((target_crop_vision_frame, output_crop_vision_frame))
return temp_vision_frame
def create_face_by_face(reference_vision_frame : VisionFrame, target_vision_frame : VisionFrame, temp_vision_frame : VisionFrame) -> Tuple[VisionFrame, VisionFrame]:
target_faces = select_faces(reference_vision_frame[:, :, :3], target_vision_frame[:, :, :3])
def create_face_by_face(reference_vision_frame : VisionFrame, source_vision_frames : List[VisionFrame], target_vision_frame : VisionFrame, temp_vision_frame : VisionFrame) -> Tuple[VisionFrame, VisionFrame]:
target_faces = select_faces(reference_vision_frame[:, :, :3], source_vision_frames, target_vision_frame[:, :, :3])
target_face = get_one_face(target_faces)
if target_face: