|
|
@ -76,14 +76,48 @@ def get_gpus_info(client, timeout, info_list:list=None): |
|
|
|
'free_mem': free_mem, |
|
|
|
'util_gpu': util_gpu, |
|
|
|
'util_mem': util_mem, |
|
|
|
'temperature': temperature |
|
|
|
'temperature': temperature, |
|
|
|
'users': {} |
|
|
|
}) |
|
|
|
|
|
|
|
# 读取用户使用信息 |
|
|
|
try: |
|
|
|
gpustat_cmd = 'gpustat --json' |
|
|
|
stdin, stdout, stderr = client.exec_command(gpustat_cmd, timeout=timeout) |
|
|
|
gpustat_output = stdout.read().decode() |
|
|
|
|
|
|
|
# 确保 gpustat 输出不是空的 |
|
|
|
if not gpustat_output: |
|
|
|
raise ValueError("gpustat did not return any output.") |
|
|
|
|
|
|
|
gpustat_info = json.loads(gpustat_output) |
|
|
|
|
|
|
|
# 确保解析的 gpustat 信息格式正确 |
|
|
|
if 'gpus' not in gpustat_info: |
|
|
|
raise ValueError("Parsed gpustat info does not contain 'gpus' key.") |
|
|
|
|
|
|
|
# 解析进程信息 ----------------------------- |
|
|
|
for gpu in gpustat_info['gpus']: |
|
|
|
idx = gpu['index'] |
|
|
|
processes = gpu.get('processes', []) # 使用 get() 方法避免 KeyError |
|
|
|
for process in processes: |
|
|
|
username = process['username'] |
|
|
|
gpu_memory_usage = process['gpu_memory_usage'] # 占用的显存 |
|
|
|
# 找到对应的 GPU,将用户及其显存使用情况记录下来 |
|
|
|
for gpu_result in result: |
|
|
|
if gpu_result['idx'] == idx: |
|
|
|
if username not in gpu_result['users']: |
|
|
|
gpu_result['users'][username] = 0 |
|
|
|
gpu_result['users'][username] += gpu_memory_usage |
|
|
|
except Exception as e: |
|
|
|
if info_list is not None: |
|
|
|
info_list.append(f'gpu user: {e}') |
|
|
|
|
|
|
|
return result |
|
|
|
except Exception as e: |
|
|
|
if info_list is not None: |
|
|
|
info_list.append(f'gpus: {e}') |
|
|
|
None |
|
|
|
return None |
|
|
|
|
|
|
|
def get_storage_info(client, timeout, path_list, info_list:list=None): |
|
|
|
try: |
|
|
@ -157,7 +191,6 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte |
|
|
|
client = paramiko.SSHClient() |
|
|
|
client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) |
|
|
|
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3) |
|
|
|
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' |
|
|
|
|
|
|
|
shared_data_list[server_title]['err_info'] = None |
|
|
|
re_try_count = 0 |
|
|
|