Browse Source

增加了重连

master
lxbhahaha 1 year ago
parent
commit
f5bb6a2dc5
  1. 141
      check.py

141
check.py

@ -349,74 +349,79 @@ def get_table_res(data_list):
return ''.join(result_str) return ''.join(result_str)
# nvidia-smi query whose CSV output is parsed by _parse_gpu_query_output.
_GPU_QUERY_CMD = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
# Header line the parser looks for; data rows are the lines that follow it.
_GPU_QUERY_HEADER = 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu'


def _parse_gpu_query_output(output: str) -> list:
    """Parse the CSV text produced by ``_GPU_QUERY_CMD`` into per-GPU dicts.

    Everything up to and including the header line is skipped (if the header
    is not found, parsing starts at the first line). Blank lines are ignored,
    so the result does not depend on whether the text ends with a newline.

    Raises:
        ValueError, IndexError: if a row does not have the expected shape.
    """
    lines = output.split('\n')
    start_idx = 0
    for i, line in enumerate(lines):
        if line == _GPU_QUERY_HEADER:
            start_idx = i + 1
            break

    result = []
    for row in lines[start_idx:]:
        if not row.strip():
            continue
        fields = row.split(', ')

        # Shorten the GPU name by dropping the vendor / product-line prefixes.
        gpu_name = fields[1]
        if gpu_name.startswith('NVIDIA '):
            gpu_name = gpu_name[7:]
        if gpu_name.startswith('GeForce '):
            gpu_name = gpu_name[8:]

        # Memory and utilization fields look like "<number> <unit>"; keep the number.
        result.append({
            'idx': int(fields[0]),
            'gpu_name': gpu_name,
            'total_mem': int(fields[2].split(' ')[0]),
            'used_mem': int(fields[3].split(' ')[0]),
            'free_mem': int(fields[4].split(' ')[0]),
            'util_gpu': int(fields[5].split(' ')[0]),
            'util_mem': int(fields[6].split(' ')[0]),
            'temperature': int(fields[7]),
        })
    return result


def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interval: float, re_connect_time: float = 5):
    """Poll one server's GPU state over SSH, reconnecting after failures.

    Runs until the module-level ``run_realtime`` flag becomes false. Each
    successful poll stores the parsed GPU list in
    ``shared_data_list[server_idx]['info_list']`` and sets ``'updated'`` to
    True. When connecting or polling fails, the stale ``'info_list'`` is
    removed, the error is written to ``'err_info'``, the SSH client is closed,
    and a new connection is attempted after ``re_connect_time`` seconds.

    Args:
        server: connection settings; reads ``'ip'``, ``'port'``, ``'username'``
            and the optional ``'password'`` / ``'key_filename'`` keys.
        shared_data_list: list of per-server dicts shared with other threads.
        server_idx: index of this server's entry in ``shared_data_list``.
        interval: seconds to sleep between polls; the SSH connect and command
            timeouts are ``interval * 2.5``.
        re_connect_time: seconds to wait before reconnecting after a failure.
    """
    timeout = interval * 2.5
    # Initialised before the first attempt so the failure handler can always
    # read it, even when the very first connection attempt raises.
    re_try_count = 0

    # Connection loop: every iteration is one SSH session.
    while run_realtime:
        try:
            client = paramiko.SSHClient()
            try:
                client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
                client.connect(
                    server['ip'],
                    port=server['port'],
                    username=server['username'],
                    password=server.get('password', None),
                    key_filename=server.get('key_filename', None),
                    timeout=timeout,
                )
                # Connected: start counting retries from zero again.
                re_try_count = 0

                # Polling loop.
                while run_realtime:
                    try:
                        _stdin, stdout, _stderr = client.exec_command(_GPU_QUERY_CMD, timeout=timeout)
                        result = _parse_gpu_query_output(stdout.read().decode())
                    except Exception:
                        # Drop stale readings, then let the outer handler record
                        # the real error and reconnect. A default is passed to
                        # pop() so a missing key cannot mask that error.
                        with data_list_lock:
                            shared_data_list[server_idx].pop('info_list', None)
                        raise

                    with data_list_lock:
                        shared_data_list[server_idx]['info_list'] = result
                        shared_data_list[server_idx]['updated'] = True

                    time.sleep(interval)
            finally:
                # Always release the SSH client, including on the failure path,
                # so repeated reconnects do not leak connections.
                client.close()
        except Exception as e:
            # NOTE(review): 'err_info' is not cleared here after a later
            # successful reconnect - confirm the consumer handles that.
            shared_data_list[server_idx]['err_info'] = f'retry:{re_try_count}, {e}'
            time.sleep(re_connect_time)
            re_try_count += 1
def realtime(args): def realtime(args):
global run_realtime global run_realtime

Loading…
Cancel
Save