@@ -36,7 +36,10 @@ def connect_server():
 
 #endregion
 
-def get_gpus_info(client, timeout, info_list:list=None):
+def get_gpus_info(client, timeout, info_list:list=None, ignore_gpu=False):
+    if ignore_gpu:
+        return None
+
     try:
         cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
 
@@ -201,7 +204,7 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte
     try:
         error_info_list = []
         # GPU info
-        gpu_info = get_gpus_info(client, interval*3, info_list=error_info_list)
+        gpu_info = get_gpus_info(client, interval*3, info_list=error_info_list, ignore_gpu=server.get('ignore_gpu', False))
         # storage space info
         storage_info = get_storage_info(client, interval*3, server['storage_list'], info_list=error_info_list)
         # memory info
@@ -216,7 +219,7 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte
         shared_data_list[server_title]['memory_info'] = memory_info
         shared_data_list[server_title]['network_info'] = network_info
         shared_data_list[server_title]['updated'] = True
-        shared_data_list[server_title]['maxGPU'] = len(gpu_info)
+        shared_data_list[server_title]['maxGPU'] = len(gpu_info) if gpu_info is not None else 0
         if len(error_info_list) > 0:
             shared_data_list[server_title]['err_info'] = '\n'.join(error_info_list)
 
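For reference, a minimal sketch of how the new ignore_gpu flag is meant to flow from a server entry into the GPU query and the shared state. The server dict and the stub function below are illustrative assumptions, not part of the patch; only the early-return contract and the maxGPU fallback mirror the diff above.

    # Hypothetical server entry: 'ignore_gpu' is the new key, 'storage_list'
    # mirrors what keep_check_one already reads via server['storage_list'].
    server = {
        'storage_list': ['/'],
        'ignore_gpu': True,   # skip nvidia-smi entirely, e.g. on CPU-only hosts
    }

    def get_gpus_info_stub(client, timeout, info_list=None, ignore_gpu=False):
        # Same early-exit contract as the patched get_gpus_info: no remote
        # call is made and nothing is appended to info_list, just None back.
        if ignore_gpu:
            return None
        raise NotImplementedError('the real function runs nvidia-smi remotely')

    gpu_info = get_gpus_info_stub(None, 10, info_list=[],
                                  ignore_gpu=server.get('ignore_gpu', False))
    # Mirrors the patched maxGPU assignment: a None result collapses to 0 GPUs.
    max_gpu = len(gpu_info) if gpu_info is not None else 0
    assert max_gpu == 0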