From bc1dbd4f5a09e6bde291d007e601c5d7c8b95c7a Mon Sep 17 00:00:00 2001
From: lxbhahaha <32586299+lxbhahaha@users.noreply.github.com>
Date: Thu, 3 Oct 2024 19:43:09 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=AE=9E=E7=8E=B0=E6=96=B9?=
=?UTF-8?q?=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
app.py | 175 ++++++++++++++++++++++++++++-------------------------
index.html | 7 ++-
2 files changed, 98 insertions(+), 84 deletions(-)
diff --git a/app.py b/app.py
index dc81a1e..c67bda2 100644
--- a/app.py
+++ b/app.py
@@ -37,80 +37,86 @@ def connect_server():
#endregion
def get_gpus_info(client, timeout):
- cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
-
- stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout)
- output = stdout.read().decode()
- output = output.split('\n')
- start_idx = 0
- for i in range(len(output)):
- if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu':
- start_idx = i + 1
- break
- output = output[start_idx:-1]
- # 解析数据 -----------------------------
- result = []
- for data in output:
- data_list = data.split(', ')
- idx = int(data_list[0])
- gpu_name = data_list[1]
- total_mem = int(data_list[2].split(' ')[0])
- used_mem = int(data_list[3].split(' ')[0])
- free_mem = int(data_list[4].split(' ')[0])
- util_gpu = int(data_list[5].split(' ')[0])
- util_mem = int(data_list[6].split(' ')[0])
- temperature = int(data_list[7])
-
- # 简化GPU名称
- if gpu_name.startswith('NVIDIA '):
- gpu_name = gpu_name[7:]
- if gpu_name.startswith('GeForce '):
- gpu_name = gpu_name[8:]
-
- result.append({
- 'idx': idx,
- 'gpu_name': gpu_name,
- 'total_mem': total_mem,
- 'used_mem': used_mem,
- 'free_mem': free_mem,
- 'util_gpu': util_gpu,
- 'util_mem': util_mem,
- 'temperature': temperature
- })
+ try:
+ cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
- return result
+ stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout)
+ output = stdout.read().decode()
+ output = output.split('\n')
+ start_idx = 0
+ for i in range(len(output)):
+ if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu':
+ start_idx = i + 1
+ break
+ output = output[start_idx:-1]
+ # 解析数据 -----------------------------
+ result = []
+ for data in output:
+ data_list = data.split(', ')
+ idx = int(data_list[0])
+ gpu_name = data_list[1]
+ total_mem = int(data_list[2].split(' ')[0])
+ used_mem = int(data_list[3].split(' ')[0])
+ free_mem = int(data_list[4].split(' ')[0])
+ util_gpu = int(data_list[5].split(' ')[0])
+ util_mem = int(data_list[6].split(' ')[0])
+ temperature = int(data_list[7])
+
+ # 简化GPU名称
+ if gpu_name.startswith('NVIDIA '):
+ gpu_name = gpu_name[7:]
+ if gpu_name.startswith('GeForce '):
+ gpu_name = gpu_name[8:]
+
+ result.append({
+ 'idx': idx,
+ 'gpu_name': gpu_name,
+ 'total_mem': total_mem,
+ 'used_mem': used_mem,
+ 'free_mem': free_mem,
+ 'util_gpu': util_gpu,
+ 'util_mem': util_mem,
+ 'temperature': temperature
+ })
+
+ return result
+ except Exception as e:
+ None
def get_storage_info(client, timeout, path_list):
- result = []
+ try:
+ result = []
+ for target_path in path_list:
+ stdin, stdout, stderr = client.exec_command(f'df {target_path} | grep \'{target_path}\'', timeout=timeout)
+ output = stdout.read().decode()
+ if output == "":
+ continue
+ data = output.split()
+ tmp_res = {
+ "path": target_path,
+ "total": int(data[1]),
+ "available": int(data[3])
+ }
+ result.append(tmp_res)
+ return result
+ except Exception as e:
+ return None
- for target_path in path_list:
- stdin, stdout, stderr = client.exec_command(f'df {target_path} | grep \'{target_path}\'', timeout=timeout)
- output = stdout.read().decode()
+def get_memory_info(client, timeout):
+ try:
+ stdin, stdout, stderr = client.exec_command('free', timeout=timeout)
+ output = stdout.read().decode().split('\n')[1]
if output == "":
- continue
+ return None
data = output.split()
- tmp_res = {
- "path": target_path,
+ result = {
"total": int(data[1]),
- "available": int(data[3])
+ "used": int(data[2])
}
- result.append(tmp_res)
-
- return result
-def get_memory_info(client, timeout):
-
- stdin, stdout, stderr = client.exec_command('free', timeout=timeout)
- output = stdout.read().decode().split('\n')[1]
- if output == "":
+ return result
+ except Exception as e:
return None
- data = output.split()
- result = {
- "total": int(data[1]),
- "used": int(data[2])
- }
-
- return result
# 持续获取一个服务器的信息
def keep_check_one(server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5):
@@ -130,7 +136,7 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3)
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
- shared_data_list[server_title]['err_info'] = ''
+ shared_data_list[server_title]['err_info'] = None
re_try_count = 0
# 循环检测
@@ -141,17 +147,17 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte
gpu_info = get_gpus_info(client, interval*3)
# 存储空间信息
storage_info = get_storage_info(client, interval*3, server['storage_list'])
+ # 内存信息
memory_info = get_memory_info(client, interval*3)
- # locked = False
+ # 记录信息
with data_list_lock:
- # locked = True
shared_data_list[server_title]['gpu_info_list'] = gpu_info
shared_data_list[server_title]['storage_info_list'] = storage_info
shared_data_list[server_title]['memory_info'] = memory_info
shared_data_list[server_title]['updated'] = True
shared_data_list[server_title]['maxGPU'] = len(gpu_info)
- # locked = False
+
except Exception as e:
keep_run = False
shared_data_list[server_title]['err_info'] = f'{e}'
@@ -181,24 +187,27 @@ def filter_data(title_list: list):
if title not in data_dict:
server_data[title]['err_info'] = f'title \'{title}\' not exist!'
continue
- # 还没获取到数据
- gpu_info_list = data_dict[title].get('gpu_info_list', None)
- if gpu_info_list is None:
- err_info = data_dict[title].get('err_info', None)
- if err_info is not None:
- server_data[title]['err_info'] = data_dict[title]['err_info']
- else:
- server_data[title]['err_info'] = f'\'{title}\' still empty.'
- continue
- # 记录数据
+ # 记录数据 ----------------------------------------------------
data_updated = data_dict[title].get('updated', False)
- err_info = data_dict[title].get('err_info', '')
- server_data[title]['gpu_info_list'] = gpu_info_list
- server_data[title]['storage_info_list'] = data_dict[title].get('storage_info_list', [])
- server_data[title]['memory_info'] = data_dict[title].get('memory_info', {})
+ # 是否更新
server_data[title]['updated'] = data_updated
- server_data[title]['err_info'] = err_info
+ # 报错信息
+ err_info = data_dict[title].get('err_info', None)
+ if err_info is not None:
+ server_data[title]['err_info'] = err_info
+ # 显卡
+ gpu_info_list = data_dict[title].get('gpu_info_list', None)
+ if gpu_info_list is not None:
+ server_data[title]['gpu_info_list'] = gpu_info_list
+ # 硬盘
+ storage_info_list = data_dict[title].get('storage_info_list', None)
+ if storage_info_list is not None:
+ server_data[title]['storage_info_list'] = storage_info_list
+ # 内存
+ memory_info = data_dict[title].get('memory_info', None)
+ if memory_info is not None:
+ server_data[title]['memory_info'] = memory_info
result['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
result['server_data'] = server_data
return result
diff --git a/index.html b/index.html
index 9a07e84..92841de 100644
--- a/index.html
+++ b/index.html
@@ -174,7 +174,12 @@
+ 'Utilization: ' + gpu.util_gpu + '%';
serverCard.appendChild(gpuInfo);
});
- }else{
+ }
+ if ('err_info' in serverData[key])
+ {
+ // 分割线
+ add_bar(serverCard);
+
let errInfo = document.createElement('div');
errInfo.classList.add('error-info');
errInfo.innerHTML = 'error info
' + serverData[key].err_info;