Remove xml parsing for gpu metrics (#1030)

* remove xml parsing

* remove aitop
This commit is contained in:
Harisreedhar
2026-02-07 00:15:30 +05:30
committed by henryruhs
parent ba8093e844
commit a477b66d36
2 changed files with 74 additions and 53 deletions
+71 -50
View File
@@ -1,10 +1,10 @@
import os
import shutil
import subprocess
import xml.etree.ElementTree as ElementTree
from functools import lru_cache
from typing import List, Optional
import pynvml
import onnxruntime
import facefusion.choices
@@ -129,9 +129,8 @@ def resolve_openvino_device_type(execution_device_id : int) -> str:
return 'GPU.' + str(execution_device_id)
def run_nvidia_smi() -> subprocess.Popen[bytes]:
    """
    Start ``nvidia-smi`` in XML query mode and return the running process.

    The executable is looked up with ``shutil.which`` and the lookup result is
    handed to ``subprocess.Popen`` unchanged. Standard output is piped so the
    caller can read the report from the returned process object.
    """
    nvidia_smi_path = shutil.which('nvidia-smi')
    return subprocess.Popen([ nvidia_smi_path, '--query', '--xml-format' ], stdout = subprocess.PIPE)
def resolve_cuda_driver_version(cuda_driver_version : int) -> str:
    """
    Format an integer-encoded CUDA driver version as a 'major.minor' string.

    The integer is decoded as ``major * 1000 + minor * 10``, so ``12040``
    becomes ``'12.4'``.
    """
    major_version, remainder = divmod(cuda_driver_version, 1000)
    minor_version = remainder // 10
    return str(major_version) + '.' + str(minor_version)
@lru_cache()
@@ -143,52 +142,74 @@ def detect_execution_devices() -> List[ExecutionDevice]:
execution_devices : List[ExecutionDevice] = []
try:
output, _ = run_nvidia_smi().communicate()
root_element = ElementTree.fromstring(output)
except Exception:
root_element = ElementTree.Element('xml')
pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount()
for gpu_element in root_element.findall('gpu'):
execution_devices.append(
{
'driver_version': root_element.findtext('driver_version'),
'framework':
for device_id in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
product_name = pynvml.nvmlDeviceGetName(handle)
driver_version = pynvml.nvmlSystemGetDriverVersion()
cuda_driver_version = resolve_cuda_driver_version(pynvml.nvmlSystemGetCudaDriverVersion())
memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
memory_total_mib = memory_info.total // (1024 * 1024)
memory_free_mib = memory_info.free // (1024 * 1024)
memory_used_mib = memory_info.used // (1024 * 1024)
memory_percent = memory_used_mib / memory_total_mib * 100
execution_devices.append(
{
'name': 'CUDA',
'version': root_element.findtext('cuda_version')
},
'product':
{
'vendor': 'NVIDIA',
'name': gpu_element.findtext('product_name').replace('NVIDIA', '').strip()
},
'video_memory':
{
'total': create_value_and_unit(gpu_element.findtext('fb_memory_usage/total')),
'free': create_value_and_unit(gpu_element.findtext('fb_memory_usage/free'))
},
'temperature':
{
'gpu': create_value_and_unit(gpu_element.findtext('temperature/gpu_temp')),
'memory': create_value_and_unit(gpu_element.findtext('temperature/memory_temp'))
},
'utilization':
{
'gpu': create_value_and_unit(gpu_element.findtext('utilization/gpu_util')),
'memory': create_value_and_unit(gpu_element.findtext('utilization/memory_util'))
}
})
'driver_version': driver_version,
'framework':
{
'name': 'CUDA',
'version': cuda_driver_version
},
'product':
{
'vendor': 'NVIDIA',
'name': product_name.replace('NVIDIA', '').strip()
},
'video_memory':
{
'total':
{
'value': int(memory_total_mib),
'unit': 'MiB'
},
'free':
{
'value': int(memory_free_mib),
'unit': 'MiB'
}
},
'temperature':
{
'gpu':
{
'value': int(temperature),
'unit': 'C'
},
'memory': None
},
'utilization':
{
'gpu':
{
'value': int(utilization.gpu),
'unit': '%'
},
'memory':
{
'value': int(memory_percent),
'unit': '%'
}
}
})
pynvml.nvmlShutdown()
except Exception:
pass
return execution_devices
def create_value_and_unit(text : Optional[str]) -> Optional[ValueAndUnit]:
    """
    Parse a '<integer> <unit>' string such as '1024 MiB' into a value/unit mapping.

    :param text: raw text to parse; may be None or empty when the source field is missing
    :return: a dict with an integer 'value' and a string 'unit', or None when the
        text is missing, contains no space, does not split into exactly two
        tokens, or its first token is not an integer
    """
    # A missing field or a placeholder without a space (e.g. 'N/A') has nothing to parse.
    if not text or ' ' not in text:
        return None

    parts = text.split()

    # Anything other than exactly "<value> <unit>" is treated as unparseable
    # instead of failing on tuple unpacking.
    if len(parts) != 2:
        return None

    value, unit = parts

    try:
        number = int(value)
    except ValueError:
        # Non-numeric readings such as 'N/A C' map to None rather than raising.
        return None

    return\
    {
        'value': number,
        'unit': str(unit)
    }
+3 -3
View File
@@ -1,8 +1,8 @@
numpy==2.2.1
numpy==2.2.6
onnx==1.20.1
onnxruntime==1.24.3
opencv-python==4.13.0.92
psutil==7.1.3
opencv-python==4.12.0.88
nvidia-ml-py==13.590.48
tqdm==4.67.3
scipy==1.17.1
starlette==0.50.0