from collections import namedtuple from datetime import datetime from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, TypeAlias, TypedDict import cv2 import numpy from numpy.typing import NDArray from onnxruntime import InferenceSession Scale : TypeAlias = float Score : TypeAlias = float Angle : TypeAlias = int Detection : TypeAlias = NDArray[Any] Prediction : TypeAlias = NDArray[Any] BoundingBox : TypeAlias = NDArray[Any] FaceLandmark5 : TypeAlias = NDArray[Any] FaceLandmark68 : TypeAlias = NDArray[Any] FaceLandmarkSet = TypedDict('FaceLandmarkSet', { '5' : FaceLandmark5, #type:ignore[valid-type] '5/68' : FaceLandmark5, #type:ignore[valid-type] '68' : FaceLandmark68, #type:ignore[valid-type] '68/5' : FaceLandmark68 #type:ignore[valid-type] }) FaceScoreSet = TypedDict('FaceScoreSet', { 'detector' : Score, 'landmarker' : Score }) Embedding : TypeAlias = NDArray[numpy.float64] Gender = Literal['female', 'male'] Age : TypeAlias = range Race = Literal['white', 'black', 'latino', 'asian', 'indian', 'arabic'] Face = namedtuple('Face', [ 'bounding_box', 'score_set', 'landmark_set', 'angle', 'embedding', 'embedding_norm', 'gender', 'age', 'race' ]) FaceSet : TypeAlias = Dict[str, List[Face]] FaceStore = TypedDict('FaceStore', { 'static_faces' : FaceSet }) Language = Literal['en'] Locales : TypeAlias = Dict[Language, Dict[str, Any]] LocalePoolSet : TypeAlias = Dict[str, Locales] WorkFlow = Literal['auto', 'audio-to-image:frames', 'audio-to-image:video', 'image-to-image', 'image-to-video', 'image-to-video:frames'] VideoCaptureSet : TypeAlias = Dict[str, cv2.VideoCapture] VideoWriterSet : TypeAlias = Dict[str, cv2.VideoWriter] CameraCaptureSet : TypeAlias = Dict[str, cv2.VideoCapture] VideoPoolSet = TypedDict('VideoPoolSet', { 'capture' : VideoCaptureSet, 'writer' : VideoWriterSet }) CameraPoolSet = TypedDict('CameraPoolSet', { 'capture' : CameraCaptureSet }) ColorMode = Literal['rgb', 'rgba'] VisionFrame : TypeAlias = NDArray[Any] Mask : TypeAlias = NDArray[Any] Points : TypeAlias = NDArray[Any] Distance : TypeAlias = NDArray[Any] Matrix : TypeAlias = NDArray[Any] Anchors : TypeAlias = NDArray[Any] Translation : TypeAlias = NDArray[Any] AudioBuffer : TypeAlias = bytes Audio : TypeAlias = NDArray[Any] AudioChunk : TypeAlias = NDArray[Any] AudioFrame : TypeAlias = NDArray[Any] Spectrogram : TypeAlias = NDArray[Any] Mel : TypeAlias = NDArray[Any] MelFilterBank : TypeAlias = NDArray[Any] Voice : TypeAlias = NDArray[Any] VoiceChunk : TypeAlias = NDArray[Any] Fps : TypeAlias = float Duration : TypeAlias = float Color : TypeAlias = Tuple[int, int, int, int] Padding : TypeAlias = Tuple[int, int, int, int] Margin : TypeAlias = Tuple[int, int, int, int] Orientation = Literal['landscape', 'portrait'] Resolution : TypeAlias = Tuple[int, int] Args : TypeAlias = Dict[str, Any] Scope : TypeAlias = Literal['api', 'cli', 'sys'] ArgsStore = TypedDict('ArgsStore', { 'api' : List[str], 'cli' : List[str], 'sys' : List[str] }) ProcessState = Literal['checking', 'processing', 'stopping', 'pending'] UpdateProgress : TypeAlias = Callable[[int], None] ProcessStep : TypeAlias = Callable[[str, int, Args], bool] Content : TypeAlias = Dict[str, Any] Token : TypeAlias = str SessionId : TypeAlias = str Session = TypedDict('Session', { 'access_token' : Token, 'refresh_token' : Token, 'created_at' : datetime, 'expires_at' : datetime }) Command : TypeAlias = str CommandSet : TypeAlias = Dict[str, List[Command]] WarpTemplate = Literal['arcface_112_v1', 'arcface_112_v2', 'arcface_128', 'dfl_whole_face', 'ffhq_512', 'mtcnn_512', 'styleganex_384'] WarpTemplateSet : TypeAlias = Dict[WarpTemplate, NDArray[Any]] ProcessMode = Literal['output', 'preview', 'stream'] ErrorCode = Literal[0, 1, 2, 3, 4] LogLevel = Literal['error', 'warn', 'info', 'debug'] LogLevelSet : TypeAlias = Dict[LogLevel, int] TableHeader : TypeAlias = str TableContent : TypeAlias = Any FaceDetectorModel = Literal['many', 'retinaface', 'scrfd', 'yolo_face', 'yunet'] FaceLandmarkerModel = Literal['many', '2dfan4', 'peppa_wutz'] FaceDetectorSet : TypeAlias = Dict[FaceDetectorModel, List[str]] FaceSelectorMode = Literal['many', 'one', 'reference'] FaceSelectorOrder = Literal['left-right', 'right-left', 'top-bottom', 'bottom-top', 'small-large', 'large-small', 'best-worst', 'worst-best'] FaceOccluderModel = Literal['many', 'xseg_1', 'xseg_2', 'xseg_3'] FaceParserModel = Literal['bisenet_resnet_18', 'bisenet_resnet_34'] FaceMaskType = Literal['box', 'occlusion', 'area', 'region'] FaceMaskArea = Literal['upper-face', 'lower-face', 'mouth'] FaceMaskRegion = Literal['skin', 'left-eyebrow', 'right-eyebrow', 'left-eye', 'right-eye', 'glasses', 'nose', 'mouth', 'upper-lip', 'lower-lip'] FaceMaskRegionSet : TypeAlias = Dict[FaceMaskRegion, int] FaceMaskAreaSet : TypeAlias = Dict[FaceMaskArea, List[int]] VoiceExtractorModel = Literal['kim_vocal_1', 'kim_vocal_2', 'uvr_mdxnet'] AudioFormat = Literal['flac', 'm4a', 'mp3', 'ogg', 'opus', 'wav'] ImageFormat = Literal['bmp', 'jpeg', 'png', 'tiff', 'webp'] VideoFormat = Literal['avi', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mxf', 'webm', 'wmv'] TempFrameFormat = Literal['bmp', 'jpeg', 'png', 'tiff'] AudioTypeSet : TypeAlias = Dict[AudioFormat, str] ImageTypeSet : TypeAlias = Dict[ImageFormat, str] VideoTypeSet : TypeAlias = Dict[VideoFormat, str] AudioEncoder = Literal['flac', 'aac', 'libmp3lame', 'libopus', 'libvorbis', 'pcm_s16le', 'pcm_s32le'] VideoEncoder = Literal['libx264', 'libx264rgb', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf', 'h264_qsv', 'hevc_qsv', 'h264_videotoolbox', 'hevc_videotoolbox', 'rawvideo'] EncoderSet = TypedDict('EncoderSet', { 'audio' : List[AudioEncoder], 'video' : List[VideoEncoder] }) VideoPreset = Literal['ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow'] AssetId : TypeAlias = str AssetType = Literal['source', 'target'] MediaType = Literal['image', 'video', 'audio'] AudioMetadata = TypedDict('AudioMetadata', { 'duration' : Duration, 'sample_rate': int, 'frame_total': int, 'channels': int, 'format': str }) ImageMetadata = TypedDict('ImageMetadata', { 'resolution' : Resolution }) VideoMetadata = TypedDict('VideoMetadata', { 'duration' : Duration, 'frame_total' : int, 'fps' : Fps, 'resolution' : Resolution }) AudioAsset = TypedDict('AudioAsset', { 'id' : AssetId, 'created_at' : datetime, 'expires_at' : datetime, 'type' : AssetType, 'media' : Literal['audio'], 'name' : str, 'format' : AudioFormat, 'size' : int, 'path' : str, 'metadata' : AudioMetadata }) ImageAsset = TypedDict('ImageAsset', { 'id' : AssetId, 'created_at' : datetime, 'expires_at' : datetime, 'type' : AssetType, 'media' : Literal['image'], 'name' : str, 'format' : ImageFormat, 'size' : int, 'path' : str, 'metadata' : ImageMetadata }) VideoAsset = TypedDict('VideoAsset', { 'id' : AssetId, 'created_at' : datetime, 'expires_at' : datetime, 'type' : AssetType, 'media' : Literal['video'], 'name' : str, 'format' : VideoFormat, 'size' : int, 'path' : str, 'metadata' : VideoMetadata }) AssetMetadata : TypeAlias = AudioMetadata | ImageMetadata | VideoMetadata Asset : TypeAlias = AudioAsset | ImageAsset | VideoAsset AssetStore : TypeAlias = Dict[SessionId, Dict[AssetId, Asset]] BenchmarkMode = Literal['warm', 'cold'] BenchmarkResolution = Literal['240p', '360p', '540p', '720p', '1080p', '1440p', '2160p'] BenchmarkSet : TypeAlias = Dict[BenchmarkResolution, str] BenchmarkCycleSet = TypedDict('BenchmarkCycleSet', { 'target_path' : str, 'cycle_count' : int, 'average_run' : float, 'fastest_run' : float, 'slowest_run' : float, 'relative_fps' : float }) WebcamMode = Literal['inline', 'udp', 'v4l2'] StreamMode = Literal['udp', 'v4l2'] ModelOptions : TypeAlias = Dict[str, Any] ModelSet : TypeAlias = Dict[str, ModelOptions] ModelInitializer : TypeAlias = NDArray[Any] ExecutionProvider = Literal['cuda', 'tensorrt', 'rocm', 'migraphx', 'coreml', 'openvino', 'qnn', 'directml', 'cpu'] ExecutionProviderValue = Literal['CPUExecutionProvider', 'CoreMLExecutionProvider', 'CUDAExecutionProvider', 'DmlExecutionProvider', 'OpenVINOExecutionProvider', 'MIGraphXExecutionProvider', 'QNNExecutionProvider', 'ROCMExecutionProvider', 'TensorrtExecutionProvider'] ExecutionProviderSet : TypeAlias = Dict[ExecutionProvider, ExecutionProviderValue] InferenceProvider : TypeAlias = Any InferenceOptionSet : TypeAlias = Dict[str, Any] ValueAndUnit = TypedDict('ValueAndUnit', { 'value' : int, 'unit' : str }) ExecutionDeviceFramework = TypedDict('ExecutionDeviceFramework', { 'name' : str, 'version' : str }) ExecutionDeviceProduct = TypedDict('ExecutionDeviceProduct', { 'vendor' : str, 'name' : str }) ExecutionDeviceVideoMemory = TypedDict('ExecutionDeviceVideoMemory', { 'total' : Optional[ValueAndUnit], 'free' : Optional[ValueAndUnit] }) ExecutionDeviceTemperature = TypedDict('ExecutionDeviceTemperature', { 'gpu' : Optional[ValueAndUnit], 'memory' : Optional[ValueAndUnit] }) ExecutionDeviceUtilization = TypedDict('ExecutionDeviceUtilization', { 'gpu' : Optional[ValueAndUnit], 'memory' : Optional[ValueAndUnit] }) ExecutionDevice = TypedDict('ExecutionDevice', { 'driver_version' : str, 'framework' : ExecutionDeviceFramework, 'product' : ExecutionDeviceProduct, 'video_memory' : ExecutionDeviceVideoMemory, 'temperature' : ExecutionDeviceTemperature, 'utilization' : ExecutionDeviceUtilization }) DownloadProvider = Literal['github', 'huggingface'] DownloadProviderValue = TypedDict('DownloadProviderValue', { 'urls' : List[str], 'path' : str }) DownloadProviderSet : TypeAlias = Dict[DownloadProvider, DownloadProviderValue] DownloadScope = Literal['lite', 'full'] Download = TypedDict('Download', { 'url' : str, 'path' : str }) DownloadSet : TypeAlias = Dict[str, Download] VideoMemoryStrategy = Literal['strict', 'moderate', 'tolerant'] AppContext = Literal['cli', 'api'] InferencePool : TypeAlias = Dict[str, InferenceSession] InferencePoolSet : TypeAlias = Dict[AppContext, Dict[str, InferencePool]] JobOutputSet : TypeAlias = Dict[str, List[str]] JobStatus = Literal['drafted', 'queued', 'completed', 'failed'] JobStepStatus = Literal['drafted', 'queued', 'started', 'completed', 'failed'] JobStep = TypedDict('JobStep', { 'args' : Args, 'status' : JobStepStatus }) Job = TypedDict('Job', { 'version' : str, 'date_created' : str, 'date_updated' : Optional[str], 'steps' : List[JobStep] }) JobSet : TypeAlias = Dict[str, Job] StateKey = Literal\ [ 'command', 'config_path', 'temp_path', 'jobs_path', 'source_paths', 'target_path', 'output_path', 'source_pattern', 'target_pattern', 'output_pattern', 'download_providers', 'download_scope', 'benchmark_mode', 'benchmark_resolutions', 'benchmark_cycle_count', 'face_detector_model', 'face_detector_size', 'face_detector_margin', 'face_detector_angles', 'face_detector_score', 'face_landmarker_model', 'face_landmarker_score', 'face_selector_mode', 'face_selector_order', 'face_selector_gender', 'face_selector_race', 'face_selector_age_start', 'face_selector_age_end', 'reference_face_position', 'reference_face_distance', 'reference_frame_number', 'face_occluder_model', 'face_parser_model', 'face_mask_types', 'face_mask_areas', 'face_mask_regions', 'face_mask_blur', 'face_mask_padding', 'voice_extractor_model', 'trim_frame_start', 'trim_frame_end', 'temp_frame_format', 'keep_temp', 'output_image_quality', 'output_image_scale', 'output_audio_encoder', 'output_audio_quality', 'output_audio_volume', 'output_video_encoder', 'output_video_preset', 'output_video_quality', 'output_video_scale', 'output_video_fps', 'processors', 'execution_device_ids', 'execution_providers', 'execution_thread_count', 'video_memory_strategy', 'log_level', 'halt_on_error', 'job_id', 'job_status', 'step_index' ] State = TypedDict('State', { 'command' : str, 'config_path' : str, 'temp_path' : str, 'jobs_path' : str, 'source_paths' : List[str], 'target_path' : str, 'output_path' : str, 'source_pattern' : str, 'target_pattern' : str, 'output_pattern' : str, 'download_providers' : List[DownloadProvider], 'download_scope' : DownloadScope, 'benchmark_mode' : BenchmarkMode, 'benchmark_resolutions' : List[BenchmarkResolution], 'benchmark_cycle_count' : int, 'face_detector_model' : FaceDetectorModel, 'face_detector_size' : str, 'face_detector_margin' : Margin, 'face_detector_angles' : List[Angle], 'face_detector_score' : Score, 'face_landmarker_model' : FaceLandmarkerModel, 'face_landmarker_score' : Score, 'face_selector_mode' : FaceSelectorMode, 'face_selector_order' : FaceSelectorOrder, 'face_selector_race' : Race, 'face_selector_gender' : Gender, 'face_selector_age_start' : int, 'face_selector_age_end' : int, 'reference_face_position' : int, 'reference_face_distance' : float, 'reference_frame_number' : int, 'face_occluder_model' : FaceOccluderModel, 'face_parser_model' : FaceParserModel, 'face_mask_types' : List[FaceMaskType], 'face_mask_areas' : List[FaceMaskArea], 'face_mask_regions' : List[FaceMaskRegion], 'face_mask_blur' : float, 'face_mask_padding' : Padding, 'voice_extractor_model' : VoiceExtractorModel, 'trim_frame_start' : int, 'trim_frame_end' : int, 'temp_frame_format' : TempFrameFormat, 'keep_temp' : bool, 'output_image_quality' : int, 'output_image_scale' : Scale, 'output_audio_encoder' : AudioEncoder, 'output_audio_quality' : int, 'output_audio_volume' : int, 'output_video_encoder' : VideoEncoder, 'output_video_preset' : VideoPreset, 'output_video_quality' : int, 'output_video_scale' : Scale, 'output_video_fps' : float, 'processors' : List[str], 'execution_device_ids' : List[int], 'execution_providers' : List[ExecutionProvider], 'execution_thread_count' : int, 'video_memory_strategy' : VideoMemoryStrategy, 'log_level' : LogLevel, 'halt_on_error' : bool, 'job_id' : str, 'job_status' : JobStatus, 'step_index' : int }) ApplyStateItem : TypeAlias = Callable[[Any, Any], None] StateSet : TypeAlias = Dict[AppContext, State]