From a0f28e1a84f0fea8506bb68a9849b9bdd45224f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=B1=BC=E9=AA=A8=E5=89=AA?= <1580622474@qq.com> Date: Wed, 2 Oct 2024 22:44:38 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E6=AD=A5=E5=A2=9E=E5=8A=A0=E4=BA=86?= =?UTF-8?q?=E5=AD=98=E5=82=A8=E7=A9=BA=E9=97=B4=E7=9A=84=E6=98=BE=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app.py | 127 ++++++++++++++++++++++++++++++++++------------------- index.html | 19 +++++++- 2 files changed, 99 insertions(+), 47 deletions(-) diff --git a/app.py b/app.py index d233bd7..07de284 100644 --- a/app.py +++ b/app.py @@ -36,7 +36,76 @@ def connect_server(): #endregion +def get_gpus_info(client, timeout): + cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' + + stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout) + output = stdout.read().decode() + output = output.split('\n') + start_idx = 0 + for i in range(len(output)): + if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu': + start_idx = i + 1 + break + output = output[start_idx:-1] + # 解析数据 ----------------------------- + result = [] + for data in output: + data_list = data.split(', ') + idx = int(data_list[0]) + gpu_name = data_list[1] + total_mem = int(data_list[2].split(' ')[0]) + used_mem = int(data_list[3].split(' ')[0]) + free_mem = int(data_list[4].split(' ')[0]) + util_gpu = int(data_list[5].split(' ')[0]) + util_mem = int(data_list[6].split(' ')[0]) + temperature = int(data_list[7]) + + # 简化GPU名称 + if gpu_name.startswith('NVIDIA '): + gpu_name = gpu_name[7:] + if gpu_name.startswith('GeForce '): + gpu_name = gpu_name[8:] + + result.append({ + 'idx': idx, + 'gpu_name': gpu_name, + 'total_mem': total_mem, + 'used_mem': used_mem, + 'free_mem': free_mem, + 'util_gpu': util_gpu, + 'util_mem': util_mem, + 'temperature': temperature + }) + + return result + +def get_storage_info(client, timeout, path_list): + result = [] + + for target_path in path_list: + stdin, stdout, stderr = client.exec_command(f'df {target_path} | grep \'{target_path}\'', timeout=timeout) + output = stdout.read().decode() + if output == "": + continue + data = output.split() + tmp_res = { + "path": target_path, + "total": int(data[1]), + "available": int(data[3]) + } + result.append(tmp_res) + + return result + +# 持续获取一个服务器的信息 def keep_check_one(server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5): + # 处理一下需要检查的存储空间路径 + if not 'storage_list' in server: + server['storage_list'] = [] + if not '/' in server['storage_list']: + server['storage_list'].append('/') + re_try_count = 0 # 循环连接 while True: @@ -54,57 +123,24 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte keep_run = True while keep_run: try: - stdin, stdout, stderr = client.exec_command(cmd, timeout=interval*3) - output = stdout.read().decode() - output = output.split('\n') - start_idx = 0 - for i in range(len(output)): - if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu': - start_idx = i + 1 - break - output = output[start_idx:-1] - # 解析数据 ----------------------------- - result = [] - for data in output: - data_list = data.split(', ') - idx = int(data_list[0]) - gpu_name = data_list[1] - total_mem = int(data_list[2].split(' ')[0]) - used_mem = int(data_list[3].split(' ')[0]) - free_mem = int(data_list[4].split(' ')[0]) - util_gpu = int(data_list[5].split(' ')[0]) - util_mem = int(data_list[6].split(' ')[0]) - temperature = int(data_list[7]) - - # 简化GPU名称 - if gpu_name.startswith('NVIDIA '): - gpu_name = gpu_name[7:] - if gpu_name.startswith('GeForce '): - gpu_name = gpu_name[8:] - - result.append({ - 'idx': idx, - 'gpu_name': gpu_name, - 'total_mem': total_mem, - 'used_mem': used_mem, - 'free_mem': free_mem, - 'util_gpu': util_gpu, - 'util_mem': util_mem, - 'temperature': temperature - }) + # gpu 信息 + gpu_info = get_gpus_info(client, interval*3) + # 存储空间信息 + storage_info = get_storage_info(client, interval*3, server['storage_list']) # locked = False with data_list_lock: # locked = True - shared_data_list[server_title]['info_list'] = result + shared_data_list[server_title]['gpu_info_list'] = gpu_info + shared_data_list[server_title]['storage_info_list'] = storage_info shared_data_list[server_title]['updated'] = True - shared_data_list[server_title]['maxGPU'] = len(output) + shared_data_list[server_title]['maxGPU'] = len(gpu_info) # locked = False except Exception as e: keep_run = False shared_data_list[server_title]['err_info'] = f'{e}' - if 'info_list' in shared_data_list[server_title]: - shared_data_list[server_title].pop('info_list') + if 'gpu_info_list' in shared_data_list[server_title]: + shared_data_list[server_title].pop('gpu_info_list') time.sleep(interval) @@ -130,8 +166,8 @@ def filter_data(title_list: list): server_data[title]['err_info'] = f'title \'{title}\' not exist!' continue # 还没获取到数据 - info_list = data_dict[title].get('info_list', None) - if info_list is None: + gpu_info_list = data_dict[title].get('gpu_info_list', None) + if gpu_info_list is None: err_info = data_dict[title].get('err_info', None) if err_info is not None: server_data[title]['err_info'] = data_dict[title]['err_info'] @@ -142,7 +178,8 @@ def filter_data(title_list: list): # 记录数据 data_updated = data_dict[title].get('updated', False) err_info = data_dict[title].get('err_info', '') - server_data[title]['info_list'] = info_list + server_data[title]['gpu_info_list'] = gpu_info_list + server_data[title]['storage_info_list'] = data_dict[title].get('storage_info_list', []) server_data[title]['updated'] = data_updated server_data[title]['err_info'] = err_info result['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') diff --git a/index.html b/index.html index 851ea94..3819e89 100644 --- a/index.html +++ b/index.html @@ -79,8 +79,23 @@ serverName.textContent = key + updateFlag; serverCard.appendChild(serverName); - if ('info_list' in serverData[key]){ - serverData[key].info_list.forEach(function(gpu){ + // 存储空间 + if ('storage_info_list' in serverData[key]){ + let storageInfo = document.createElement('div'); + storageInfo.classList.add('storage-info'); + + for (let i = 0; i < serverData[key].storage_info_list.length; i++) { + let targetPath = serverData[key].storage_info_list[i].path; + let totalStorage = serverData[key].storage_info_list[i].total; + let availableStorage = serverData[key].storage_info_list[i].available; + storageInfo.innerHTML += targetPath + " : " + availableStorage + " / " + totalStorage + "
"; + } + + serverCard.appendChild(storageInfo); + } + // gpu + if ('gpu_info_list' in serverData[key]){ + serverData[key].gpu_info_list.forEach(function(gpu){ let gpuInfo = document.createElement('div'); gpuInfo.classList.add('gpu-info'); let colorDot = greenDot;