|
|
@ -349,74 +349,79 @@ def get_table_res(data_list): |
|
|
|
|
|
|
|
return ''.join(result_str) |
|
|
|
|
|
|
|
def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interval: float):
    """Poll one server's GPU status over SSH and publish it into shared state.

    NOTE(review): this definition is shadowed by a later definition of the
    same name (the re-connecting variant) in this file — confirm which one
    is intended to be live.

    Args:
        server: connection info with keys 'ip', 'port', 'username' and
            optionally 'password' / 'key_filename'.
        shared_data_list: cross-thread list of per-server status dicts;
            entry ``server_idx`` is mutated in place under ``data_list_lock``.
        server_idx: index of this server's slot in ``shared_data_list``.
        interval: seconds to sleep between polls.
    """
    client = None
    try:
        # Establish the SSH connection.
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None))
        cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'

        # Poll until the global run flag is cleared.
        while run_realtime:
            try:
                stdin, stdout, stderr = client.exec_command(cmd)
                lines = stdout.read().decode().split('\n')

                # Skip everything up to and including the CSV header row.
                start_idx = 0
                for i, line in enumerate(lines):
                    if line == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu':
                        start_idx = i + 1
                        break
                # Drop the trailing empty element produced by the final '\n'.
                lines = lines[start_idx:-1]

                # Parse one dict per GPU row.
                result = []
                for data in lines:
                    fields = data.split(', ')
                    gpu_name = fields[1]
                    # Shorten vendor prefixes, e.g. 'NVIDIA GeForce RTX ...' -> 'RTX ...'.
                    if gpu_name.startswith('NVIDIA '):
                        gpu_name = gpu_name[7:]
                    if gpu_name.startswith('GeForce '):
                        gpu_name = gpu_name[8:]
                    result.append({
                        'idx': int(fields[0]),
                        'gpu_name': gpu_name,
                        # Memory/util fields arrive as e.g. '24576 MiB' / '35 %';
                        # take the leading number only.
                        'total_mem': int(fields[2].split(' ')[0]),
                        'used_mem': int(fields[3].split(' ')[0]),
                        'free_mem': int(fields[4].split(' ')[0]),
                        'util_gpu': int(fields[5].split(' ')[0]),
                        'util_mem': int(fields[6].split(' ')[0]),
                        'temperature': int(fields[7]),
                    })

                with data_list_lock:
                    shared_data_list[server_idx]['info_list'] = result
                    shared_data_list[server_idx]['updated'] = True
            except Exception:
                # One failed poll: mark data stale and keep looping.
                # pop() with a default so a missing key cannot raise KeyError
                # inside the handler (it would before any successful poll).
                shared_data_list[server_idx].pop('info_list', None)

            time.sleep(interval)
    except Exception as e:
        # Connection-level failure: surface it to the UI and stop this worker.
        shared_data_list[server_idx]['err_info'] = str(e)
    finally:
        # Guarantee the SSH connection is released even on error (the original
        # only closed it on the success path, leaking the socket otherwise).
        if client is not None:
            client.close()
|
|
|
def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interval: float, re_connect_time: float = 5):
    """Poll one server's GPU status over SSH, reconnecting on failure.

    Repeatedly establishes an SSH connection and runs ``nvidia-smi`` every
    ``interval`` seconds, publishing parsed per-GPU stats into
    ``shared_data_list[server_idx]`` under ``data_list_lock``. On any
    connection-level error it records the error, waits ``re_connect_time``
    seconds, and retries while the global ``run_realtime`` flag is set.

    Args:
        server: connection info with keys 'ip', 'port', 'username' and
            optionally 'password' / 'key_filename'.
        shared_data_list: cross-thread list of per-server status dicts.
        server_idx: index of this server's slot in ``shared_data_list``.
        interval: seconds to sleep between polls (also sizes the SSH timeouts).
        re_connect_time: seconds to wait before a reconnection attempt.
    """
    # Initialize before the loop: the original only assigned this after a
    # successful connect, so a failure on the very first attempt raised
    # NameError in the except handler and killed the monitor thread.
    re_try_count = 0
    # Reconnection loop.
    while run_realtime:
        client = None
        try:
            # Establish the SSH connection (bounded by a timeout so a dead
            # host cannot block the thread forever).
            client = paramiko.SSHClient()
            client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*2.5)
            cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'

            # Successful connect: reset the retry counter.
            re_try_count = 0
            # Polling loop.
            while run_realtime:
                try:
                    stdin, stdout, stderr = client.exec_command(cmd, timeout=interval*2.5)
                    lines = stdout.read().decode().split('\n')

                    # Skip everything up to and including the CSV header row.
                    start_idx = 0
                    for i, line in enumerate(lines):
                        if line == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu':
                            start_idx = i + 1
                            break
                    # Drop the trailing empty element produced by the final '\n'.
                    lines = lines[start_idx:-1]

                    # Parse one dict per GPU row.
                    result = []
                    for data in lines:
                        fields = data.split(', ')
                        gpu_name = fields[1]
                        # Shorten vendor prefixes, e.g. 'NVIDIA GeForce RTX ...' -> 'RTX ...'.
                        if gpu_name.startswith('NVIDIA '):
                            gpu_name = gpu_name[7:]
                        if gpu_name.startswith('GeForce '):
                            gpu_name = gpu_name[8:]
                        result.append({
                            'idx': int(fields[0]),
                            'gpu_name': gpu_name,
                            # Memory/util fields arrive as e.g. '24576 MiB' / '35 %';
                            # take the leading number only.
                            'total_mem': int(fields[2].split(' ')[0]),
                            'used_mem': int(fields[3].split(' ')[0]),
                            'free_mem': int(fields[4].split(' ')[0]),
                            'util_gpu': int(fields[5].split(' ')[0]),
                            'util_mem': int(fields[6].split(' ')[0]),
                            'temperature': int(fields[7]),
                        })

                    with data_list_lock:
                        shared_data_list[server_idx]['info_list'] = result
                        shared_data_list[server_idx]['updated'] = True
                except Exception:
                    # One failed poll: mark data stale and keep looping.
                    # pop() with a default so a missing key cannot raise
                    # KeyError inside the handler (it would before any
                    # successful poll).
                    shared_data_list[server_idx].pop('info_list', None)

                time.sleep(interval)
        except Exception as e:
            # Connection-level failure: surface it, back off, then retry.
            shared_data_list[server_idx]['err_info'] = f'retry:{re_try_count}, {e}'
            time.sleep(re_connect_time)
            re_try_count += 1
        finally:
            # Guarantee the SSH connection is released on every path (the
            # original only closed it after a clean exit of the poll loop).
            if client is not None:
                client.close()
|
|
|
|
|
|
|
def realtime(args): |
|
|
|
global run_realtime |
|
|
|