Remove xml parsing for gpu metrics (#1030)

* remove xml parsing
* remove aitop
2026-04-29 13:05:59 +02:00 · 2026-02-07 00:15:30 +05:30
parent ba8093e844
commit a477b66d36
2 changed files with 74 additions and 53 deletions
@@ -1,10 +1,10 @@
 import os
 import shutil
 import subprocess
-import xml.etree.ElementTree as ElementTree
+
 from functools import lru_cache
 from typing import List, Optional
-
+import pynvml
 import onnxruntime

 import facefusion.choices
@@ -129,9 +129,8 @@ def resolve_openvino_device_type(execution_device_id : int) -> str:
 	return 'GPU.' + str(execution_device_id)


-def run_nvidia_smi() -> subprocess.Popen[bytes]:
-	commands = [ shutil.which('nvidia-smi'), '--query', '--xml-format' ]
-	return subprocess.Popen(commands, stdout = subprocess.PIPE)
+def resolve_cuda_driver_version(cuda_driver_version : int) -> str:
+	return '{}.{}'.format(cuda_driver_version // 1000, (cuda_driver_version % 1000) // 10)


@lru_cache()
@@ -143,52 +142,74 @@ def detect_execution_devices() -> List[ExecutionDevice]:
 	execution_devices : List[ExecutionDevice] = []

 	try:
-		output, _ = run_nvidia_smi().communicate()
-		root_element = ElementTree.fromstring(output)
-	except Exception:
-		root_element = ElementTree.Element('xml')
+		pynvml.nvmlInit()
+		device_count = pynvml.nvmlDeviceGetCount()

-	for gpu_element in root_element.findall('gpu'):
-		execution_devices.append(
-		{
-			'driver_version': root_element.findtext('driver_version'),
-			'framework':
+		for device_id in range(device_count):
+			handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+			product_name = pynvml.nvmlDeviceGetName(handle)
+			driver_version = pynvml.nvmlSystemGetDriverVersion()
+			cuda_driver_version = resolve_cuda_driver_version(pynvml.nvmlSystemGetCudaDriverVersion())
+			memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+			utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
+			temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
+			memory_total_mib = memory_info.total // (1024 * 1024)
+			memory_free_mib = memory_info.free // (1024 * 1024)
+			memory_used_mib = memory_info.used // (1024 * 1024)
+			memory_percent = memory_used_mib / memory_total_mib * 100
+
+			execution_devices.append(
 			{
-				'name': 'CUDA',
-				'version': root_element.findtext('cuda_version')
-			},
-			'product':
-			{
-				'vendor': 'NVIDIA',
-				'name': gpu_element.findtext('product_name').replace('NVIDIA', '').strip()
-			},
-			'video_memory':
-			{
-				'total': create_value_and_unit(gpu_element.findtext('fb_memory_usage/total')),
-				'free': create_value_and_unit(gpu_element.findtext('fb_memory_usage/free'))
-			},
-			'temperature':
-			{
-				'gpu': create_value_and_unit(gpu_element.findtext('temperature/gpu_temp')),
-				'memory': create_value_and_unit(gpu_element.findtext('temperature/memory_temp'))
-			},
-			'utilization':
-			{
-				'gpu': create_value_and_unit(gpu_element.findtext('utilization/gpu_util')),
-				'memory': create_value_and_unit(gpu_element.findtext('utilization/memory_util'))
-			}
-		})
+				'driver_version': driver_version,
+				'framework':
+				{
+					'name': 'CUDA',
+					'version': cuda_driver_version
+				},
+				'product':
+				{
+					'vendor': 'NVIDIA',
+					'name': product_name.replace('NVIDIA', '').strip()
+				},
+				'video_memory':
+				{
+					'total':
+					{
+						'value': int(memory_total_mib),
+						'unit': 'MiB'
+					},
+					'free':
+					{
+						'value': int(memory_free_mib),
+						'unit': 'MiB'
+					}
+				},
+				'temperature':
+				{
+					'gpu':
+					{
+						'value': int(temperature),
+						'unit': 'C'
+					},
+					'memory': None
+				},
+				'utilization':
+				{
+					'gpu':
+					{
+						'value': int(utilization.gpu),
+						'unit': '%'
+					},
+					'memory':
+					{
+						'value': int(memory_percent),
+						'unit': '%'
+					}
+				}
+			})
+
+		pynvml.nvmlShutdown()
+	except Exception:
+		pass

 	return execution_devices
-
-
-def create_value_and_unit(text : str) -> Optional[ValueAndUnit]:
-	if ' ' in text:
-		value, unit = text.split()
-
-		return\
-		{
-			'value': int(value),
-			'unit': str(unit)
-		}
-	return None
@@ -1,8 +1,8 @@
-numpy==2.2.1
+numpy==2.2.6
 onnx==1.20.1
 onnxruntime==1.24.3
-opencv-python==4.13.0.92
-psutil==7.1.3
+opencv-python==4.12.0.88
+nvidia-ml-py==13.590.48
 tqdm==4.67.3
 scipy==1.17.1
 starlette==0.50.0