diff --git a/check.py b/check.py
index 64cdd26..c8bdf6c 100644
--- a/check.py
+++ b/check.py
@@ -349,74 +349,114 @@ def get_table_res(data_list):
     return ''.join(result_str)
 
 
-def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interval: float):
-    try:
-        # 建立SSH连接
-        client = paramiko.SSHClient()
-        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-        client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None))
-        cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
-
-        # 循环检测
-        while run_realtime:
-            try:
-                stdin, stdout, stderr = client.exec_command(cmd)
-                output = stdout.read().decode()
-                output = output.split('\n')
-                start_idx = 0
-                for i in range(len(output)):
-                    if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu':
-                        start_idx = i + 1
-                        break
-                output = output[start_idx:-1]
-                # 解析数据 -----------------------------
-                result = []
-                for data in output:
-                    data_list = data.split(', ')
-                    idx = int(data_list[0])
-                    gpu_name = data_list[1]
-                    total_mem = int(data_list[2].split(' ')[0])
-                    used_mem = int(data_list[3].split(' ')[0])
-                    free_mem = int(data_list[4].split(' ')[0])
-                    util_gpu = int(data_list[5].split(' ')[0])
-                    util_mem = int(data_list[6].split(' ')[0])
-                    temperature = int(data_list[7])
-
-                    # 简化GPU名称
-                    if gpu_name.startswith('NVIDIA '):
-                        gpu_name = gpu_name[7:]
-                    if gpu_name.startswith('GeForce '):
-                        gpu_name = gpu_name[8:]
-
-                    result.append({
-                        'idx': idx,
-                        'gpu_name': gpu_name,
-                        'total_mem': total_mem,
-                        'used_mem': used_mem,
-                        'free_mem': free_mem,
-                        'util_gpu': util_gpu,
-                        'util_mem': util_mem,
-                        'temperature': temperature
-                    })
-
-                # locked = False
-                with data_list_lock:
-                    # locked = True
-                    shared_data_list[server_idx]['info_list'] = result
-                    shared_data_list[server_idx]['updated'] = True
-                # locked = False
-            except:
-                shared_data_list[server_idx].pop('info_list')
-
-            time.sleep(interval)
-
-        # 关闭连接
-        client.close()
-    except Exception as e:
-        # if data_list_lock.locked and locked:
-        #     data_list_lock.release()
-        # print(e)
-        shared_data_list[server_idx]['err_info'] = str(e)
+def _parse_gpu_query(output: str) -> list:
+    """Parse the CSV text printed by the nvidia-smi query into per-GPU dicts.
+
+    Everything up to and including the CSV header line is skipped; if the
+    header is missing, parsing starts at the first line.  Blank lines are
+    ignored.
+
+    Args:
+        output: decoded stdout of the nvidia-smi command.
+
+    Returns:
+        One dict per GPU with the keys idx, gpu_name, total_mem, used_mem,
+        free_mem, util_gpu, util_mem and temperature.
+
+    Raises:
+        ValueError, IndexError: a data row is missing fields or holds a
+            non-integer value.
+    """
+    header = 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu'
+    lines = output.splitlines()
+    start_idx = lines.index(header) + 1 if header in lines else 0
+
+    result = []
+    for line in lines[start_idx:]:
+        if not line.strip():
+            continue
+        fields = line.split(', ')
+
+        # Shorten the GPU name
+        gpu_name = fields[1]
+        if gpu_name.startswith('NVIDIA '):
+            gpu_name = gpu_name[7:]
+        if gpu_name.startswith('GeForce '):
+            gpu_name = gpu_name[8:]
+
+        result.append({
+            'idx': int(fields[0]),
+            'gpu_name': gpu_name,
+            # Values carry a unit suffix (e.g. "1234 MiB", "56 %"); keep the number
+            'total_mem': int(fields[2].split(' ')[0]),
+            'used_mem': int(fields[3].split(' ')[0]),
+            'free_mem': int(fields[4].split(' ')[0]),
+            'util_gpu': int(fields[5].split(' ')[0]),
+            'util_mem': int(fields[6].split(' ')[0]),
+            'temperature': int(fields[7]),
+        })
+    return result
+
+
+def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interval: float, re_connect_time: float=5):
+    """Poll one server's GPU status over SSH until ``run_realtime`` is cleared.
+
+    Runs an nvidia-smi CSV query every ``interval`` seconds and publishes the
+    parsed rows in ``shared_data_list[server_idx]['info_list']``, setting
+    ``'updated'`` alongside.  Any failure (connect, command, timeout, parse)
+    removes the stale ``'info_list'``, is reported in ``'err_info'`` and is
+    followed by a fresh connection after ``re_connect_time`` seconds.
+
+    Args:
+        server: connection settings; 'ip', 'port' and 'username' are required,
+            'password' and 'key_filename' are optional.
+        shared_data_list: per-server result dicts; written under
+            ``data_list_lock``.
+        server_idx: index of this server's entry in ``shared_data_list``.
+        interval: seconds between polls; the connect and command timeouts are
+            2.5 times this value.
+        re_connect_time: seconds to wait before reconnecting after a failure.
+    """
+    cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
+
+    # Consecutive failures since the last successful connect.  Initialised here so
+    # the error handler can read it even when the very first connect fails.
+    re_try_count = 0
+
+    # Reconnect loop
+    while run_realtime:
+        client = paramiko.SSHClient()
+        try:
+            # Open the SSH connection
+            # NOTE(review): AutoAddPolicy trusts unknown host keys without verification.
+            client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+            client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*2.5)
+            re_try_count = 0
+
+            # Polling loop
+            while run_realtime:
+                _stdin, stdout, _stderr = client.exec_command(cmd, timeout=interval*2.5)
+                result = _parse_gpu_query(stdout.read().decode())
+                # NOTE(review): a previously stored 'err_info' is left in place here;
+                # confirm the reader prefers 'info_list' once polling recovers.
+                with data_list_lock:
+                    shared_data_list[server_idx]['info_list'] = result
+                    shared_data_list[server_idx]['updated'] = True
+                time.sleep(interval)
+        except Exception as e:
+            # Drop stale readings and surface the real error; the outer loop reconnects.
+            with data_list_lock:
+                shared_data_list[server_idx].pop('info_list', None)
+                shared_data_list[server_idx]['err_info'] = f'retry:{re_try_count}, {e}'
+            re_try_count += 1
+        finally:
+            # Always release the connection, including before a reconnect.
+            client.close()
+
+        # The polling loop only ends normally on shutdown, so reaching this point
+        # with run_realtime still set means the attempt failed: back off first.
+        if run_realtime:
+            time.sleep(re_connect_time)
 
 def realtime(args):
     global run_realtime