diff --git a/app.py b/app.py index c67bda2..cc9bbcc 100644 --- a/app.py +++ b/app.py @@ -36,7 +36,7 @@ def connect_server(): #endregion -def get_gpus_info(client, timeout): +def get_gpus_info(client, timeout, info_list:list=None): try: cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' @@ -81,9 +81,11 @@ def get_gpus_info(client, timeout): return result except Exception as e: + if info_list is not None: + info_list.append(f'gpus: {e}') None -def get_storage_info(client, timeout, path_list): +def get_storage_info(client, timeout, path_list, info_list:list=None): try: result = [] for target_path in path_list: @@ -100,9 +102,11 @@ def get_storage_info(client, timeout, path_list): result.append(tmp_res) return result except Exception as e: + if info_list is not None: + info_list.append(f'storage: {e}') return None -def get_memory_info(client, timeout): +def get_memory_info(client, timeout, info_list:list=None): try: stdin, stdout, stderr = client.exec_command('free', timeout=timeout) output = stdout.read().decode().split('\n')[1] @@ -116,6 +120,8 @@ def get_memory_info(client, timeout): return result except Exception as e: + if info_list is not None: + info_list.append(f'memory: {e}') return None # 持续获取一个服务器的信息 @@ -143,12 +149,13 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte keep_run = True while keep_run: try: + error_info_list = [] # gpu 信息 - gpu_info = get_gpus_info(client, interval*3) + gpu_info = get_gpus_info(client, interval*3, info_list=error_info_list) # 存储空间信息 - storage_info = get_storage_info(client, interval*3, server['storage_list']) + storage_info = get_storage_info(client, interval*3, server['storage_list'], info_list=error_info_list) # 内存信息 - memory_info = get_memory_info(client, interval*3) + memory_info = get_memory_info(client, interval*3, info_list=error_info_list) # 记录信息 with data_list_lock: @@ -157,6 +164,8 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte shared_data_list[server_title]['memory_info'] = memory_info shared_data_list[server_title]['updated'] = True shared_data_list[server_title]['maxGPU'] = len(gpu_info) + if len(error_info_list) > 0: + shared_data_list[server_title]['err_info'] = '\n'.join(error_info_list) except Exception as e: keep_run = False