Browse Source

update

master
lxbhahaha 10 months ago
parent
commit
c116a81c2c
  1. 21
      app.py

21
app.py

@ -36,7 +36,7 @@ def connect_server():
#endregion
def get_gpus_info(client, timeout):
def get_gpus_info(client, timeout, info_list:list=None):
try:
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
@ -81,9 +81,11 @@ def get_gpus_info(client, timeout):
return result
except Exception as e:
if info_list is not None:
info_list.append(f'gpus: {e}')
None
def get_storage_info(client, timeout, path_list):
def get_storage_info(client, timeout, path_list, info_list:list=None):
try:
result = []
for target_path in path_list:
@ -100,9 +102,11 @@ def get_storage_info(client, timeout, path_list):
result.append(tmp_res)
return result
except Exception as e:
if info_list is not None:
info_list.append(f'storage: {e}')
return None
def get_memory_info(client, timeout):
def get_memory_info(client, timeout, info_list:list=None):
try:
stdin, stdout, stderr = client.exec_command('free', timeout=timeout)
output = stdout.read().decode().split('\n')[1]
@ -116,6 +120,8 @@ def get_memory_info(client, timeout):
return result
except Exception as e:
if info_list is not None:
info_list.append(f'memory: {e}')
return None
# Continuously fetch the information of a single server
@ -143,12 +149,13 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte
keep_run = True
while keep_run:
try:
error_info_list = []
# gpu 信息
gpu_info = get_gpus_info(client, interval*3)
gpu_info = get_gpus_info(client, interval*3, info_list=error_info_list)
# 存储空间信息
storage_info = get_storage_info(client, interval*3, server['storage_list'])
storage_info = get_storage_info(client, interval*3, server['storage_list'], info_list=error_info_list)
# 内存信息
memory_info = get_memory_info(client, interval*3)
memory_info = get_memory_info(client, interval*3, info_list=error_info_list)
# 记录信息
with data_list_lock:
@ -157,6 +164,8 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte
shared_data_list[server_title]['memory_info'] = memory_info
shared_data_list[server_title]['updated'] = True
shared_data_list[server_title]['maxGPU'] = len(gpu_info)
if len(error_info_list) > 0:
shared_data_list[server_title]['err_info'] = '\n'.join(error_info_list)
except Exception as e:
keep_run = False

Loading…
Cancel
Save