From 45d4481cd3d0069cab15f00de19eabcd69a81c87 Mon Sep 17 00:00:00 2001 From: lxbhahaha <1580622474@qq.com> Date: Wed, 27 Mar 2024 13:21:28 +0800 Subject: [PATCH] update --- check.py | 106 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/check.py b/check.py index f0e16d8..84be368 100644 --- a/check.py +++ b/check.py @@ -1,4 +1,3 @@ -import multiprocessing import threading import paramiko import argparse @@ -187,7 +186,7 @@ def print_table_res(data_list): # print('TODO') print(time.strftime("%Y%m-%d%H:%M:%S", time.localtime(time.time()))) - cell_width_list = [10, 20, 24, 15] + cell_width_list = [7, 20, 24, 15] len_last3 = cell_width_list[1] + cell_width_list[2] + cell_width_list[3] + 2 # 输出head ------------------------------------------ print_line('up', cell_width_list) @@ -275,7 +274,8 @@ def print_table_res(data_list): str_list.append(table_icon['vline']) str_list.append(clamp_str(title, cell_width_list[0], True)) str_list.append(table_icon['vline']) - str_list.append(clamp_str('erro', len_last3, True)) + err_info = data.get('err_info', 'error') + str_list.append(clamp_str(err_info, len_last3, True)) str_list.append(table_icon['vline']) print(''.join(str_list)) @@ -291,50 +291,53 @@ def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interv # 循环检测 while run_realtime: - stdin, stdout, stderr = client.exec_command(cmd) - output = stdout.read().decode() - output = output.split('\n') - start_idx = 0 - for i in range(len(output)): - if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu': - start_idx = i + 1 - break - output = output[start_idx:-1] - # 解析数据 ----------------------------- - result = [] - for data in output: - data_list = data.split(', ') - idx = int(data_list[0]) - gpu_name = data_list[1] - total_mem = int(data_list[2].split(' ')[0]) - used_mem = int(data_list[3].split(' ')[0]) - free_mem = int(data_list[4].split(' ')[0]) - util_gpu = int(data_list[5].split(' ')[0]) - util_mem = int(data_list[6].split(' ')[0]) - temperature = int(data_list[7]) - - # 简化GPU名称 - if gpu_name.startswith('NVIDIA '): - gpu_name = gpu_name[7:] - if gpu_name.startswith('GeForce '): - gpu_name = gpu_name[8:] - - result.append({ - 'idx': idx, - 'gpu_name': gpu_name, - 'total_mem': total_mem, - 'used_mem': used_mem, - 'free_mem': free_mem, - 'util_gpu': util_gpu, - 'util_mem': util_mem, - 'temperature': temperature - }) - - # locked = False - with data_list_lock: - # locked = True - shared_data_list[server_idx]['info_list'] = result - # locked = False + try: + stdin, stdout, stderr = client.exec_command(cmd) + output = stdout.read().decode() + output = output.split('\n') + start_idx = 0 + for i in range(len(output)): + if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu': + start_idx = i + 1 + break + output = output[start_idx:-1] + # 解析数据 ----------------------------- + result = [] + for data in output: + data_list = data.split(', ') + idx = int(data_list[0]) + gpu_name = data_list[1] + total_mem = int(data_list[2].split(' ')[0]) + used_mem = int(data_list[3].split(' ')[0]) + free_mem = int(data_list[4].split(' ')[0]) + util_gpu = int(data_list[5].split(' ')[0]) + util_mem = int(data_list[6].split(' ')[0]) + temperature = int(data_list[7]) + + # 简化GPU名称 + if gpu_name.startswith('NVIDIA '): + gpu_name = gpu_name[7:] + if gpu_name.startswith('GeForce '): + gpu_name = gpu_name[8:] + + result.append({ + 'idx': idx, + 'gpu_name': gpu_name, + 'total_mem': total_mem, + 'used_mem': used_mem, + 'free_mem': free_mem, + 'util_gpu': util_gpu, + 'util_mem': util_mem, + 'temperature': temperature + }) + + # locked = False + with data_list_lock: + # locked = True + shared_data_list[server_idx]['info_list'] = result + # locked = False + except: + shared_data_list[server_idx].pop('info_list') time.sleep(interval) @@ -343,15 +346,15 @@ def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interv except Exception as e: # if data_list_lock.locked and locked: # data_list_lock.release() - print(e) + # print(e) + shared_data_list[server_idx]['err_info'] = str(e) def realtime(args): global run_realtime try: parser = argparse.ArgumentParser() - parser.add_argument('-n', type=float, default=2, help='服务器多久刷新一次') - parser.add_argument('-f', type=float, default=2, help='显示多久刷新一次') + parser.add_argument('-n', type=float, default=2, help='多久刷新一次') parser.add_argument('-e', '--exclude', type=str, default='', help='不需要显示的服务器(title)用,分割') parser.add_argument('-t', '--table', action='store_true', help='以表格形式绘制') args = parser.parse_args(args) @@ -366,7 +369,6 @@ def realtime(args): server_list = json.load(f) # 共享list - manager = multiprocessing.Manager() data_list = [] run_realtime = True @@ -398,7 +400,7 @@ def realtime(args): # print(info_list) else: print('出错') - time.sleep(args.f) + time.sleep(args.n) run_realtime = False