From bc1dbd4f5a09e6bde291d007e601c5d7c8b95c7a Mon Sep 17 00:00:00 2001 From: lxbhahaha <32586299+lxbhahaha@users.noreply.github.com> Date: Thu, 3 Oct 2024 19:43:09 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=AE=9E=E7=8E=B0=E6=96=B9?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app.py | 175 ++++++++++++++++++++++++++++------------------------- index.html | 7 ++- 2 files changed, 98 insertions(+), 84 deletions(-) diff --git a/app.py b/app.py index dc81a1e..c67bda2 100644 --- a/app.py +++ b/app.py @@ -37,80 +37,86 @@ def connect_server(): #endregion def get_gpus_info(client, timeout): - cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' - - stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout) - output = stdout.read().decode() - output = output.split('\n') - start_idx = 0 - for i in range(len(output)): - if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu': - start_idx = i + 1 - break - output = output[start_idx:-1] - # 解析数据 ----------------------------- - result = [] - for data in output: - data_list = data.split(', ') - idx = int(data_list[0]) - gpu_name = data_list[1] - total_mem = int(data_list[2].split(' ')[0]) - used_mem = int(data_list[3].split(' ')[0]) - free_mem = int(data_list[4].split(' ')[0]) - util_gpu = int(data_list[5].split(' ')[0]) - util_mem = int(data_list[6].split(' ')[0]) - temperature = int(data_list[7]) - - # 简化GPU名称 - if gpu_name.startswith('NVIDIA '): - gpu_name = gpu_name[7:] - if gpu_name.startswith('GeForce '): - gpu_name = gpu_name[8:] - - result.append({ - 'idx': idx, - 'gpu_name': gpu_name, - 'total_mem': total_mem, - 'used_mem': used_mem, - 'free_mem': free_mem, - 'util_gpu': util_gpu, - 'util_mem': util_mem, - 'temperature': temperature - }) + try: + cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' - return result + stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout) + output = stdout.read().decode() + output = output.split('\n') + start_idx = 0 + for i in range(len(output)): + if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu': + start_idx = i + 1 + break + output = output[start_idx:-1] + # 解析数据 ----------------------------- + result = [] + for data in output: + data_list = data.split(', ') + idx = int(data_list[0]) + gpu_name = data_list[1] + total_mem = int(data_list[2].split(' ')[0]) + used_mem = int(data_list[3].split(' ')[0]) + free_mem = int(data_list[4].split(' ')[0]) + util_gpu = int(data_list[5].split(' ')[0]) + util_mem = int(data_list[6].split(' ')[0]) + temperature = int(data_list[7]) + + # 简化GPU名称 + if gpu_name.startswith('NVIDIA '): + gpu_name = gpu_name[7:] + if gpu_name.startswith('GeForce '): + gpu_name = gpu_name[8:] + + result.append({ + 'idx': idx, + 'gpu_name': gpu_name, + 'total_mem': total_mem, + 'used_mem': used_mem, + 'free_mem': free_mem, + 'util_gpu': util_gpu, + 'util_mem': util_mem, + 'temperature': temperature + }) + + return result + except Exception as e: + None def get_storage_info(client, timeout, path_list): - result = [] + try: + result = [] + for target_path in path_list: + stdin, stdout, stderr = client.exec_command(f'df {target_path} | grep \'{target_path}\'', timeout=timeout) + output = stdout.read().decode() + if output == "": + continue + data = output.split() + tmp_res = { + "path": target_path, + "total": int(data[1]), + "available": int(data[3]) + } + result.append(tmp_res) + return result + except Exception as e: + return None - for target_path in path_list: - stdin, stdout, stderr = client.exec_command(f'df {target_path} | grep \'{target_path}\'', timeout=timeout) - output = stdout.read().decode() +def get_memory_info(client, timeout): + try: + stdin, stdout, stderr = client.exec_command('free', timeout=timeout) + output = stdout.read().decode().split('\n')[1] if output == "": - continue + return None data = output.split() - tmp_res = { - "path": target_path, + result = { "total": int(data[1]), - "available": int(data[3]) + "used": int(data[2]) } - result.append(tmp_res) - - return result -def get_memory_info(client, timeout): - - stdin, stdout, stderr = client.exec_command('free', timeout=timeout) - output = stdout.read().decode().split('\n')[1] - if output == "": + return result + except Exception as e: return None - data = output.split() - result = { - "total": int(data[1]), - "used": int(data[2]) - } - - return result # 持续获取一个服务器的信息 def keep_check_one(server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5): @@ -130,7 +136,7 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3) cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' - shared_data_list[server_title]['err_info'] = '' + shared_data_list[server_title]['err_info'] = None re_try_count = 0 # 循环检测 @@ -141,17 +147,17 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte gpu_info = get_gpus_info(client, interval*3) # 存储空间信息 storage_info = get_storage_info(client, interval*3, server['storage_list']) + # 内存信息 memory_info = get_memory_info(client, interval*3) - # locked = False + # 记录信息 with data_list_lock: - # locked = True shared_data_list[server_title]['gpu_info_list'] = gpu_info shared_data_list[server_title]['storage_info_list'] = storage_info shared_data_list[server_title]['memory_info'] = memory_info shared_data_list[server_title]['updated'] = True shared_data_list[server_title]['maxGPU'] = len(gpu_info) - # locked = False + except Exception as e: keep_run = False shared_data_list[server_title]['err_info'] = f'{e}' @@ -181,24 +187,27 @@ def filter_data(title_list: list): if title not in data_dict: server_data[title]['err_info'] = f'title \'{title}\' not exist!' continue - # 还没获取到数据 - gpu_info_list = data_dict[title].get('gpu_info_list', None) - if gpu_info_list is None: - err_info = data_dict[title].get('err_info', None) - if err_info is not None: - server_data[title]['err_info'] = data_dict[title]['err_info'] - else: - server_data[title]['err_info'] = f'\'{title}\' still empty.' - continue - # 记录数据 + # 记录数据 ---------------------------------------------------- data_updated = data_dict[title].get('updated', False) - err_info = data_dict[title].get('err_info', '') - server_data[title]['gpu_info_list'] = gpu_info_list - server_data[title]['storage_info_list'] = data_dict[title].get('storage_info_list', []) - server_data[title]['memory_info'] = data_dict[title].get('memory_info', {}) + # 是否更新 server_data[title]['updated'] = data_updated - server_data[title]['err_info'] = err_info + # 报错信息 + err_info = data_dict[title].get('err_info', None) + if err_info is not None: + server_data[title]['err_info'] = err_info + # 显卡 + gpu_info_list = data_dict[title].get('gpu_info_list', None) + if gpu_info_list is not None: + server_data[title]['gpu_info_list'] = gpu_info_list + # 硬盘 + storage_info_list = data_dict[title].get('storage_info_list', None) + if storage_info_list is not None: + server_data[title]['storage_info_list'] = storage_info_list + # 内存 + memory_info = data_dict[title].get('memory_info', None) + if memory_info is not None: + server_data[title]['memory_info'] = memory_info result['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') result['server_data'] = server_data return result diff --git a/index.html b/index.html index 9a07e84..92841de 100644 --- a/index.html +++ b/index.html @@ -174,7 +174,12 @@ + 'Utilization: ' + gpu.util_gpu + '%'; serverCard.appendChild(gpuInfo); }); - }else{ + } + if ('err_info' in serverData[key]) + { + // 分割线 + add_bar(serverCard); + let errInfo = document.createElement('div'); errInfo.classList.add('error-info'); errInfo.innerHTML = 'error info
' + serverData[key].err_info;