Browse Source

修改实现方式

master
lxbhahaha 10 months ago
parent
commit
a5631fe369
  1. 175
      app.py
  2. 7
      index.html

175
app.py

@ -37,80 +37,86 @@ def connect_server():
#endregion #endregion
def get_gpus_info(client, timeout):
    """Query ``nvidia-smi`` through an SSH client and return per-GPU stats.

    Args:
        client: Object exposing ``exec_command(cmd, timeout=...)`` that
            returns a ``(stdin, stdout, stderr)`` triple whose ``stdout``
            has a ``read()`` returning bytes (presumably a connected
            paramiko ``SSHClient`` -- confirm against the caller).
        timeout: Timeout value forwarded unchanged to ``exec_command``.

    Returns:
        A list with one dict per GPU, holding the keys ``idx``,
        ``gpu_name``, ``total_mem``, ``used_mem``, ``free_mem``,
        ``util_gpu``, ``util_mem`` and ``temperature`` (all ints except
        ``gpu_name``). Returns ``None`` when the command fails or its
        output cannot be parsed.
    """
    import logging

    cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
    header = ('index, name, memory.total [MiB], memory.used [MiB], '
              'memory.free [MiB], utilization.gpu [%], '
              'utilization.memory [%], temperature.gpu')
    try:
        _, stdout, _ = client.exec_command(cmd, timeout=timeout)
        lines = [line.strip() for line in stdout.read().decode().splitlines()]

        # Anything printed before the CSV header (banners, warnings) is
        # discarded; when no header is found every line is treated as data.
        if header in lines:
            lines = lines[lines.index(header) + 1:]

        # Parse the data rows -----------------------------
        result = []
        for line in lines:
            # Blank lines are skipped instead of slicing off the last
            # element, so the final GPU row survives output that has no
            # trailing newline.
            if not line:
                continue
            fields = line.split(', ')

            # Shorten the GPU name by dropping vendor / product-line prefixes.
            gpu_name = fields[1]
            for prefix in ('NVIDIA ', 'GeForce '):
                if gpu_name.startswith(prefix):
                    gpu_name = gpu_name[len(prefix):]

            result.append({
                'idx': int(fields[0]),
                'gpu_name': gpu_name,
                # Value columns look like "24576 MiB" / "5 %": keep the number.
                'total_mem': int(fields[2].split(' ')[0]),
                'used_mem': int(fields[3].split(' ')[0]),
                'free_mem': int(fields[4].split(' ')[0]),
                'util_gpu': int(fields[5].split(' ')[0]),
                'util_mem': int(fields[6].split(' ')[0]),
                'temperature': int(fields[7]),
            })
        return result
    except Exception:
        # Best-effort probe: callers receive None on any failure, but the
        # cause is kept available in the debug log rather than dropped.
        logging.getLogger(__name__).debug('get_gpus_info failed', exc_info=True)
        return None
def get_storage_info(client, timeout, path_list):
    """Collect ``df`` totals for each path in ``path_list`` over SSH.

    Args:
        client: Object exposing ``exec_command(cmd, timeout=...)`` that
            returns a ``(stdin, stdout, stderr)`` triple whose ``stdout``
            has a ``read()`` returning bytes (presumably a connected
            paramiko ``SSHClient`` -- confirm against the caller).
        timeout: Timeout value forwarded unchanged to ``exec_command``.
        path_list: Iterable of paths to query.

    Returns:
        A list of ``{"path", "total", "available"}`` dicts, one per path
        whose ``df`` line survived the ``grep`` filter. ``total`` and
        ``available`` are the second and fourth whitespace-separated
        columns of that output (units are whatever the remote ``df`` reports).
        Paths that produce no output are omitted. Returns ``None`` if any
        command or parse step raises.
    """
    import logging
    import shlex

    try:
        result = []
        for target_path in path_list:
            # Quote the path so spaces or shell metacharacters cannot break
            # (or inject into) the remote command line. For plain paths the
            # resulting command is equivalent to the unquoted original.
            quoted = shlex.quote(str(target_path))
            _, stdout, _ = client.exec_command(
                f'df {quoted} | grep {quoted}', timeout=timeout)
            fields = stdout.read().decode().split()
            # No matching line (empty or whitespace-only output): skip path.
            if not fields:
                continue
            result.append({
                "path": target_path,
                "total": int(fields[1]),
                "available": int(fields[3]),
            })
        return result
    except Exception:
        # Best-effort probe: callers receive None on any failure, but the
        # cause is kept available in the debug log rather than dropped.
        logging.getLogger(__name__).debug('get_storage_info failed', exc_info=True)
        return None
def get_memory_info(client, timeout):
    """Read total and used memory from the remote ``free`` command.

    Args:
        client: Object exposing ``exec_command(cmd, timeout=...)`` that
            returns a ``(stdin, stdout, stderr)`` triple whose ``stdout``
            has a ``read()`` returning bytes (presumably a connected
            paramiko ``SSHClient`` -- confirm against the caller).
        timeout: Timeout value forwarded unchanged to ``exec_command``.

    Returns:
        ``{"total": int, "used": int}`` taken from the second and third
        whitespace-separated columns of the second output line, or
        ``None`` when the command fails or the output is too short or
        not numeric.
    """
    import logging

    try:
        _, stdout, _ = client.exec_command('free', timeout=timeout)
        lines = stdout.read().decode().splitlines()
        # The values are read positionally from the second line (the first
        # line is the column header); bail out explicitly when it is missing
        # instead of relying on an IndexError.
        if len(lines) < 2:
            return None
        fields = lines[1].split()
        if len(fields) < 3:
            return None
        return {
            "total": int(fields[1]),
            "used": int(fields[2]),
        }
    except Exception:
        # Best-effort probe: callers receive None on any failure, but the
        # cause is kept available in the debug log rather than dropped.
        logging.getLogger(__name__).debug('get_memory_info failed', exc_info=True)
        return None
# 持续获取一个服务器的信息 # 持续获取一个服务器的信息
def keep_check_one(server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5): def keep_check_one(server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5):
@ -130,7 +136,7 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3) client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3)
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
shared_data_list[server_title]['err_info'] = '' shared_data_list[server_title]['err_info'] = None
re_try_count = 0 re_try_count = 0
# 循环检测 # 循环检测
@ -141,17 +147,17 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte
gpu_info = get_gpus_info(client, interval*3) gpu_info = get_gpus_info(client, interval*3)
# 存储空间信息 # 存储空间信息
storage_info = get_storage_info(client, interval*3, server['storage_list']) storage_info = get_storage_info(client, interval*3, server['storage_list'])
# 内存信息
memory_info = get_memory_info(client, interval*3) memory_info = get_memory_info(client, interval*3)
# locked = False # 记录信息
with data_list_lock: with data_list_lock:
# locked = True
shared_data_list[server_title]['gpu_info_list'] = gpu_info shared_data_list[server_title]['gpu_info_list'] = gpu_info
shared_data_list[server_title]['storage_info_list'] = storage_info shared_data_list[server_title]['storage_info_list'] = storage_info
shared_data_list[server_title]['memory_info'] = memory_info shared_data_list[server_title]['memory_info'] = memory_info
shared_data_list[server_title]['updated'] = True shared_data_list[server_title]['updated'] = True
shared_data_list[server_title]['maxGPU'] = len(gpu_info) shared_data_list[server_title]['maxGPU'] = len(gpu_info)
# locked = False
except Exception as e: except Exception as e:
keep_run = False keep_run = False
shared_data_list[server_title]['err_info'] = f'{e}' shared_data_list[server_title]['err_info'] = f'{e}'
@ -181,24 +187,27 @@ def filter_data(title_list: list):
if title not in data_dict: if title not in data_dict:
server_data[title]['err_info'] = f'title \'{title}\' not exist!' server_data[title]['err_info'] = f'title \'{title}\' not exist!'
continue continue
# 还没获取到数据
gpu_info_list = data_dict[title].get('gpu_info_list', None)
if gpu_info_list is None:
err_info = data_dict[title].get('err_info', None)
if err_info is not None:
server_data[title]['err_info'] = data_dict[title]['err_info']
else:
server_data[title]['err_info'] = f'\'{title}\' still empty.'
continue
# 记录数据 # 记录数据 ----------------------------------------------------
data_updated = data_dict[title].get('updated', False) data_updated = data_dict[title].get('updated', False)
err_info = data_dict[title].get('err_info', '') # 是否更新
server_data[title]['gpu_info_list'] = gpu_info_list
server_data[title]['storage_info_list'] = data_dict[title].get('storage_info_list', [])
server_data[title]['memory_info'] = data_dict[title].get('memory_info', {})
server_data[title]['updated'] = data_updated server_data[title]['updated'] = data_updated
server_data[title]['err_info'] = err_info # 报错信息
err_info = data_dict[title].get('err_info', None)
if err_info is not None:
server_data[title]['err_info'] = err_info
# 显卡
gpu_info_list = data_dict[title].get('gpu_info_list', None)
if gpu_info_list is not None:
server_data[title]['gpu_info_list'] = gpu_info_list
# 硬盘
storage_info_list = data_dict[title].get('storage_info_list', None)
if storage_info_list is not None:
server_data[title]['storage_info_list'] = storage_info_list
# 内存
memory_info = data_dict[title].get('memory_info', None)
if memory_info is not None:
server_data[title]['memory_info'] = memory_info
result['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') result['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
result['server_data'] = server_data result['server_data'] = server_data
return result return result

7
index.html

@ -174,7 +174,12 @@
+ 'Utilization: ' + gpu.util_gpu + '%'; + 'Utilization: ' + gpu.util_gpu + '%';
serverCard.appendChild(gpuInfo); serverCard.appendChild(gpuInfo);
}); });
}else{ }
if ('err_info' in serverData[key])
{
// 分割线
add_bar(serverCard);
let errInfo = document.createElement('div'); let errInfo = document.createElement('div');
errInfo.classList.add('error-info'); errInfo.classList.add('error-info');
errInfo.innerHTML = '<strong>error info</strong><br>' + serverData[key].err_info; errInfo.innerHTML = '<strong>error info</strong><br>' + serverData[key].err_info;

Loading…
Cancel
Save