|
|
@ -1,4 +1,3 @@ |
|
|
|
import multiprocessing |
|
|
|
import threading |
|
|
|
import paramiko |
|
|
|
import argparse |
|
|
@ -187,7 +186,7 @@ def print_table_res(data_list): |
|
|
|
# print('TODO') |
|
|
|
print(time.strftime("%Y%m-%d%H:%M:%S", time.localtime(time.time()))) |
|
|
|
|
|
|
|
cell_width_list = [10, 20, 24, 15] |
|
|
|
cell_width_list = [7, 20, 24, 15] |
|
|
|
len_last3 = cell_width_list[1] + cell_width_list[2] + cell_width_list[3] + 2 |
|
|
|
# 输出head ------------------------------------------ |
|
|
|
print_line('up', cell_width_list) |
|
|
@ -275,7 +274,8 @@ def print_table_res(data_list): |
|
|
|
str_list.append(table_icon['vline']) |
|
|
|
str_list.append(clamp_str(title, cell_width_list[0], True)) |
|
|
|
str_list.append(table_icon['vline']) |
|
|
|
str_list.append(clamp_str('erro', len_last3, True)) |
|
|
|
err_info = data.get('err_info', 'error') |
|
|
|
str_list.append(clamp_str(err_info, len_last3, True)) |
|
|
|
str_list.append(table_icon['vline']) |
|
|
|
print(''.join(str_list)) |
|
|
|
|
|
|
@ -291,50 +291,53 @@ def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interv |
|
|
|
|
|
|
|
# 循环检测 |
|
|
|
while run_realtime: |
|
|
|
stdin, stdout, stderr = client.exec_command(cmd) |
|
|
|
output = stdout.read().decode() |
|
|
|
output = output.split('\n') |
|
|
|
start_idx = 0 |
|
|
|
for i in range(len(output)): |
|
|
|
if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu': |
|
|
|
start_idx = i + 1 |
|
|
|
break |
|
|
|
output = output[start_idx:-1] |
|
|
|
# 解析数据 ----------------------------- |
|
|
|
result = [] |
|
|
|
for data in output: |
|
|
|
data_list = data.split(', ') |
|
|
|
idx = int(data_list[0]) |
|
|
|
gpu_name = data_list[1] |
|
|
|
total_mem = int(data_list[2].split(' ')[0]) |
|
|
|
used_mem = int(data_list[3].split(' ')[0]) |
|
|
|
free_mem = int(data_list[4].split(' ')[0]) |
|
|
|
util_gpu = int(data_list[5].split(' ')[0]) |
|
|
|
util_mem = int(data_list[6].split(' ')[0]) |
|
|
|
temperature = int(data_list[7]) |
|
|
|
|
|
|
|
# 简化GPU名称 |
|
|
|
if gpu_name.startswith('NVIDIA '): |
|
|
|
gpu_name = gpu_name[7:] |
|
|
|
if gpu_name.startswith('GeForce '): |
|
|
|
gpu_name = gpu_name[8:] |
|
|
|
|
|
|
|
result.append({ |
|
|
|
'idx': idx, |
|
|
|
'gpu_name': gpu_name, |
|
|
|
'total_mem': total_mem, |
|
|
|
'used_mem': used_mem, |
|
|
|
'free_mem': free_mem, |
|
|
|
'util_gpu': util_gpu, |
|
|
|
'util_mem': util_mem, |
|
|
|
'temperature': temperature |
|
|
|
}) |
|
|
|
|
|
|
|
# locked = False |
|
|
|
with data_list_lock: |
|
|
|
# locked = True |
|
|
|
shared_data_list[server_idx]['info_list'] = result |
|
|
|
# locked = False |
|
|
|
try: |
|
|
|
stdin, stdout, stderr = client.exec_command(cmd) |
|
|
|
output = stdout.read().decode() |
|
|
|
output = output.split('\n') |
|
|
|
start_idx = 0 |
|
|
|
for i in range(len(output)): |
|
|
|
if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu': |
|
|
|
start_idx = i + 1 |
|
|
|
break |
|
|
|
output = output[start_idx:-1] |
|
|
|
# 解析数据 ----------------------------- |
|
|
|
result = [] |
|
|
|
for data in output: |
|
|
|
data_list = data.split(', ') |
|
|
|
idx = int(data_list[0]) |
|
|
|
gpu_name = data_list[1] |
|
|
|
total_mem = int(data_list[2].split(' ')[0]) |
|
|
|
used_mem = int(data_list[3].split(' ')[0]) |
|
|
|
free_mem = int(data_list[4].split(' ')[0]) |
|
|
|
util_gpu = int(data_list[5].split(' ')[0]) |
|
|
|
util_mem = int(data_list[6].split(' ')[0]) |
|
|
|
temperature = int(data_list[7]) |
|
|
|
|
|
|
|
# 简化GPU名称 |
|
|
|
if gpu_name.startswith('NVIDIA '): |
|
|
|
gpu_name = gpu_name[7:] |
|
|
|
if gpu_name.startswith('GeForce '): |
|
|
|
gpu_name = gpu_name[8:] |
|
|
|
|
|
|
|
result.append({ |
|
|
|
'idx': idx, |
|
|
|
'gpu_name': gpu_name, |
|
|
|
'total_mem': total_mem, |
|
|
|
'used_mem': used_mem, |
|
|
|
'free_mem': free_mem, |
|
|
|
'util_gpu': util_gpu, |
|
|
|
'util_mem': util_mem, |
|
|
|
'temperature': temperature |
|
|
|
}) |
|
|
|
|
|
|
|
# locked = False |
|
|
|
with data_list_lock: |
|
|
|
# locked = True |
|
|
|
shared_data_list[server_idx]['info_list'] = result |
|
|
|
# locked = False |
|
|
|
except: |
|
|
|
shared_data_list[server_idx].pop('info_list') |
|
|
|
|
|
|
|
time.sleep(interval) |
|
|
|
|
|
|
@ -343,15 +346,15 @@ def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interv |
|
|
|
except Exception as e: |
|
|
|
# if data_list_lock.locked and locked: |
|
|
|
# data_list_lock.release() |
|
|
|
print(e) |
|
|
|
# print(e) |
|
|
|
shared_data_list[server_idx]['err_info'] = str(e) |
|
|
|
|
|
|
|
def realtime(args): |
|
|
|
global run_realtime |
|
|
|
|
|
|
|
try: |
|
|
|
parser = argparse.ArgumentParser() |
|
|
|
parser.add_argument('-n', type=float, default=2, help='服务器多久刷新一次') |
|
|
|
parser.add_argument('-f', type=float, default=2, help='显示多久刷新一次') |
|
|
|
parser.add_argument('-n', type=float, default=2, help='多久刷新一次') |
|
|
|
parser.add_argument('-e', '--exclude', type=str, default='', help='不需要显示的服务器(title)用,分割') |
|
|
|
parser.add_argument('-t', '--table', action='store_true', help='以表格形式绘制') |
|
|
|
args = parser.parse_args(args) |
|
|
@ -366,7 +369,6 @@ def realtime(args): |
|
|
|
server_list = json.load(f) |
|
|
|
|
|
|
|
# 共享list |
|
|
|
manager = multiprocessing.Manager() |
|
|
|
data_list = [] |
|
|
|
|
|
|
|
run_realtime = True |
|
|
@ -398,7 +400,7 @@ def realtime(args): |
|
|
|
# print(info_list) |
|
|
|
else: |
|
|
|
print('出错') |
|
|
|
time.sleep(args.f) |
|
|
|
time.sleep(args.n) |
|
|
|
|
|
|
|
run_realtime = False |
|
|
|
|
|
|
|