5 changed files with 224 additions and 359 deletions
@ -1,319 +0,0 @@ |
|||||
from flask import Flask, jsonify |
|
||||
from datetime import datetime |
|
||||
from flask_cors import CORS |
|
||||
import threading |
|
||||
import paramiko |
|
||||
import json |
|
||||
import time |
|
||||
|
|
||||
#region 全局 |
|
||||
|
|
||||
app = Flask(__name__) |
|
||||
CORS(app) |
|
||||
port = 15002 |
|
||||
server_list_path = 'serverList.json' |
|
||||
data_list_lock = threading.Lock() |
|
||||
check_interval = 2 |
|
||||
# 共享list |
|
||||
data_dict = dict() |
|
||||
|
|
||||
#endregion |
|
||||
|
|
||||
#region 接口 |
|
||||
|
|
||||
# 测试用 |
|
||||
@app.route('/') |
|
||||
def hello(): |
|
||||
return 'hi. —— CheckGPUsWeb' |
|
||||
|
|
||||
@app.route('/all_data', methods=['GET']) |
|
||||
def get_data(): |
|
||||
return jsonify(get_all_data()) |
|
||||
|
|
||||
# 开始连接服务器 |
|
||||
def connect_server(): |
|
||||
pass |
|
||||
|
|
||||
#endregion |
|
||||
|
|
||||
def get_gpus_info(client, timeout, info_list:list=None, ignore_gpu=False): |
|
||||
if ignore_gpu: |
|
||||
return None |
|
||||
|
|
||||
try: |
|
||||
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' |
|
||||
|
|
||||
stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout) |
|
||||
output = stdout.read().decode() |
|
||||
output = output.split('\n') |
|
||||
start_idx = 0 |
|
||||
for i in range(len(output)): |
|
||||
if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu': |
|
||||
start_idx = i + 1 |
|
||||
break |
|
||||
output = output[start_idx:-1] |
|
||||
# 解析数据 ----------------------------- |
|
||||
result = [] |
|
||||
for data in output: |
|
||||
data_list = data.split(', ') |
|
||||
idx = int(data_list[0]) |
|
||||
gpu_name = data_list[1] |
|
||||
total_mem = int(data_list[2].split(' ')[0]) |
|
||||
used_mem = int(data_list[3].split(' ')[0]) |
|
||||
free_mem = int(data_list[4].split(' ')[0]) |
|
||||
util_gpu = int(data_list[5].split(' ')[0]) |
|
||||
util_mem = int(data_list[6].split(' ')[0]) |
|
||||
temperature = int(data_list[7]) |
|
||||
|
|
||||
# 简化GPU名称 |
|
||||
if gpu_name.startswith('NVIDIA '): |
|
||||
gpu_name = gpu_name[7:] |
|
||||
if gpu_name.startswith('GeForce '): |
|
||||
gpu_name = gpu_name[8:] |
|
||||
|
|
||||
result.append({ |
|
||||
'idx': idx, |
|
||||
'gpu_name': gpu_name, |
|
||||
'total_mem': total_mem, |
|
||||
'used_mem': used_mem, |
|
||||
'free_mem': free_mem, |
|
||||
'util_gpu': util_gpu, |
|
||||
'util_mem': util_mem, |
|
||||
'temperature': temperature, |
|
||||
'users': {} |
|
||||
}) |
|
||||
|
|
||||
# 读取用户使用信息 |
|
||||
try: |
|
||||
gpustat_cmd = 'gpustat --json' |
|
||||
stdin, stdout, stderr = client.exec_command(gpustat_cmd, timeout=timeout) |
|
||||
gpustat_output = stdout.read().decode() |
|
||||
|
|
||||
# 确保 gpustat 输出不是空的 |
|
||||
if not gpustat_output: |
|
||||
raise ValueError("gpustat did not return any output.") |
|
||||
|
|
||||
gpustat_info = json.loads(gpustat_output) |
|
||||
|
|
||||
# 确保解析的 gpustat 信息格式正确 |
|
||||
if 'gpus' not in gpustat_info: |
|
||||
raise ValueError("Parsed gpustat info does not contain 'gpus' key.") |
|
||||
|
|
||||
# 解析进程信息 ----------------------------- |
|
||||
for gpu in gpustat_info['gpus']: |
|
||||
idx = gpu['index'] |
|
||||
processes = gpu.get('processes', []) # 使用 get() 方法避免 KeyError |
|
||||
for process in processes: |
|
||||
username = process['username'] |
|
||||
gpu_memory_usage = process['gpu_memory_usage'] # 占用的显存 |
|
||||
# 找到对应的 GPU,将用户及其显存使用情况记录下来 |
|
||||
for gpu_result in result: |
|
||||
if gpu_result['idx'] == idx: |
|
||||
if username not in gpu_result['users']: |
|
||||
gpu_result['users'][username] = 0 |
|
||||
gpu_result['users'][username] += gpu_memory_usage |
|
||||
except Exception as e: |
|
||||
if info_list is not None: |
|
||||
info_list.append(f'gpu user: {e}') |
|
||||
|
|
||||
return result |
|
||||
except paramiko.ssh_exception.SSHException as e: |
|
||||
# ssh 的异常仍然抛出 |
|
||||
raise |
|
||||
except Exception as e: |
|
||||
if info_list is not None: |
|
||||
info_list.append(f'gpus: {e}') |
|
||||
return None |
|
||||
|
|
||||
def get_storage_info(client, timeout, path_list, info_list:list=None): |
|
||||
try: |
|
||||
result = [] |
|
||||
for target_path in path_list: |
|
||||
stdin, stdout, stderr = client.exec_command(f'df {target_path} | grep \'{target_path}\'', timeout=timeout) |
|
||||
output = stdout.read().decode() |
|
||||
if output == "": |
|
||||
continue |
|
||||
data = output.split() |
|
||||
tmp_res = { |
|
||||
"path": target_path, |
|
||||
"total": int(data[1]), |
|
||||
"available": int(data[3]) |
|
||||
} |
|
||||
result.append(tmp_res) |
|
||||
return result |
|
||||
except paramiko.ssh_exception.SSHException as e: |
|
||||
# ssh 的异常仍然抛出 |
|
||||
raise |
|
||||
except Exception as e: |
|
||||
if info_list is not None: |
|
||||
info_list.append(f'storage: {e}') |
|
||||
return None |
|
||||
|
|
||||
def get_memory_info(client, timeout, info_list:list=None): |
|
||||
try: |
|
||||
stdin, stdout, stderr = client.exec_command('free', timeout=timeout) |
|
||||
output = stdout.read().decode().split('\n')[1] |
|
||||
if output == "": |
|
||||
return None |
|
||||
data = output.split() |
|
||||
result = { |
|
||||
"total": int(data[1]), |
|
||||
"used": int(data[2]) |
|
||||
} |
|
||||
|
|
||||
return result |
|
||||
except paramiko.ssh_exception.SSHException as e: |
|
||||
# ssh 的异常仍然抛出 |
|
||||
raise |
|
||||
except Exception as e: |
|
||||
if info_list is not None: |
|
||||
info_list.append(f'memory: {e}') |
|
||||
return None |
|
||||
|
|
||||
def get_network_info(client, timeout, interface_name, info_list:list=None): |
|
||||
try: |
|
||||
if interface_name is None: |
|
||||
return None |
|
||||
stdin, stdout, stderr = client.exec_command(f'ifstat -i {interface_name} 0.1 1', timeout=timeout) |
|
||||
output = stdout.read().decode().split('\n')[2] |
|
||||
data = output.split() |
|
||||
result = { |
|
||||
"in": float(data[0]), |
|
||||
"out": float(data[1]) |
|
||||
} |
|
||||
return result |
|
||||
except paramiko.ssh_exception.SSHException as e: |
|
||||
# ssh 的异常仍然抛出 |
|
||||
raise |
|
||||
except Exception as e: |
|
||||
if info_list is not None: |
|
||||
info_list.append(f'network: {e}') |
|
||||
return None |
|
||||
|
|
||||
# 持续获取一个服务器的信息 |
|
||||
def keep_check_one(server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5): |
|
||||
# 处理一下需要检查的存储空间路径 |
|
||||
if not 'storage_list' in server: |
|
||||
server['storage_list'] = [] |
|
||||
if not '/' in server['storage_list']: |
|
||||
server['storage_list'].insert(0, '/') |
|
||||
|
|
||||
re_try_count = 0 |
|
||||
# 循环连接 |
|
||||
while True: |
|
||||
try: |
|
||||
# 建立SSH连接 |
|
||||
client = paramiko.SSHClient() |
|
||||
client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) |
|
||||
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3) |
|
||||
|
|
||||
shared_data_list[server_title]['err_info'] = None |
|
||||
re_try_count = 0 |
|
||||
|
|
||||
# 循环检测 |
|
||||
keep_run = True |
|
||||
while keep_run: |
|
||||
try: |
|
||||
error_info_list = [] |
|
||||
# gpu 信息 |
|
||||
gpu_info = get_gpus_info(client, interval*3, info_list=error_info_list, ignore_gpu=server.get('ignore_gpu', False)) |
|
||||
# 存储空间信息 |
|
||||
storage_info = get_storage_info(client, interval*3, server['storage_list'], info_list=error_info_list) |
|
||||
# 内存信息 |
|
||||
memory_info = get_memory_info(client, interval*3, info_list=error_info_list) |
|
||||
# 网络信息 |
|
||||
network_info = get_network_info(client, interval*3, server.get('network_interface_name', None), info_list=error_info_list) |
|
||||
|
|
||||
# 记录信息 |
|
||||
with data_list_lock: |
|
||||
shared_data_list[server_title]['gpu_info_list'] = gpu_info |
|
||||
shared_data_list[server_title]['storage_info_list'] = storage_info |
|
||||
shared_data_list[server_title]['memory_info'] = memory_info |
|
||||
shared_data_list[server_title]['network_info'] = network_info |
|
||||
shared_data_list[server_title]['updated'] = True |
|
||||
shared_data_list[server_title]['maxGPU'] = len(gpu_info) if gpu_info is not None else 0 |
|
||||
if len(error_info_list) > 0: |
|
||||
shared_data_list[server_title]['err_info'] = '\n'.join(error_info_list) |
|
||||
|
|
||||
except Exception as e: |
|
||||
keep_run = False |
|
||||
shared_data_list[server_title]['err_info'] = f'{e}' |
|
||||
if 'gpu_info_list' in shared_data_list[server_title]: |
|
||||
shared_data_list[server_title].pop('gpu_info_list') |
|
||||
|
|
||||
time.sleep(interval) |
|
||||
|
|
||||
# 关闭连接 |
|
||||
client.close() |
|
||||
except Exception as e: |
|
||||
shared_data_list[server_title]['err_info'] = f'retry:{re_try_count}, {e}' |
|
||||
time.sleep(re_connect_time) |
|
||||
re_try_count += 1 |
|
||||
|
|
||||
# 获取所有的服务器数据 |
|
||||
def get_all_data(): |
|
||||
return filter_data(list(data_dict.keys())) |
|
||||
|
|
||||
# 根据key过滤所需的服务器数据 |
|
||||
def filter_data(title_list: list): |
|
||||
result = dict() |
|
||||
server_data = dict() |
|
||||
for title in title_list: |
|
||||
server_data[title] = {} |
|
||||
# 不存在该title的数据 |
|
||||
if title not in data_dict: |
|
||||
server_data[title]['err_info'] = f'title \'{title}\' not exist!' |
|
||||
continue |
|
||||
|
|
||||
# 记录数据 ---------------------------------------------------- |
|
||||
data_updated = data_dict[title].get('updated', False) |
|
||||
# 是否更新 |
|
||||
server_data[title]['updated'] = data_updated |
|
||||
# 报错信息 |
|
||||
err_info = data_dict[title].get('err_info', None) |
|
||||
if err_info is not None: |
|
||||
server_data[title]['err_info'] = err_info |
|
||||
# 显卡 |
|
||||
gpu_info_list = data_dict[title].get('gpu_info_list', None) |
|
||||
if gpu_info_list is not None: |
|
||||
server_data[title]['gpu_info_list'] = gpu_info_list |
|
||||
# 硬盘 |
|
||||
storage_info_list = data_dict[title].get('storage_info_list', None) |
|
||||
if storage_info_list is not None: |
|
||||
server_data[title]['storage_info_list'] = storage_info_list |
|
||||
# 内存 |
|
||||
memory_info = data_dict[title].get('memory_info', None) |
|
||||
if memory_info is not None: |
|
||||
server_data[title]['memory_info'] = memory_info |
|
||||
# 网络 |
|
||||
network_info = data_dict[title].get('network_info', None) |
|
||||
if network_info is not None: |
|
||||
server_data[title]['network_info'] = network_info |
|
||||
|
|
||||
result['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
|
||||
result['server_data'] = server_data |
|
||||
return result |
|
||||
|
|
||||
def start_connect(): |
|
||||
# 加载json |
|
||||
with open(server_list_path, 'r') as f: |
|
||||
server_list = json.load(f) |
|
||||
|
|
||||
global data_dict |
|
||||
# 开启线程 |
|
||||
for i, server_data in enumerate(server_list): |
|
||||
data_dict[server_data['title']] = {} |
|
||||
data_dict[server_data['title']]['server_data'] = server_data |
|
||||
thread = threading.Thread(target=keep_check_one, args=(server_data, data_dict, server_data['title'], check_interval)) |
|
||||
thread.daemon = True |
|
||||
thread.start() |
|
||||
|
|
||||
print('start connect') |
|
||||
|
|
||||
# 测试 |
|
||||
def test(): |
|
||||
start_connect() |
|
||||
app.run(debug=True, host='127.0.0.1', port=port) |
|
||||
|
|
||||
if __name__ == '__main__': |
|
||||
test() |
|
@ -0,0 +1,13 @@ |
|||||
|
{ |
||||
|
"server_url": "http://127.0.0.1:15002", |
||||
|
"api_key": "default_key_000", |
||||
|
"interval": 3.0, |
||||
|
"note": "", |
||||
|
"enable": ["gpu", "cpu", "memory", "storage", "network"], |
||||
|
"storage_list":[ |
||||
|
"/", |
||||
|
"/media/D", |
||||
|
"/media/E", |
||||
|
"/media/F" |
||||
|
] |
||||
|
} |
@ -0,0 +1,45 @@ |
|||||
|
from flask import Flask, jsonify, request |
||||
|
from datetime import datetime |
||||
|
from flask_cors import CORS |
||||
|
import threading |
||||
|
import json |
||||
|
import time |
||||
|
|
||||
|
#region 全局 |
||||
|
|
||||
|
app = Flask(__name__) |
||||
|
CORS(app) |
||||
|
port = 15002 |
||||
|
server_list_path = 'serverList.json' |
||||
|
data_list_lock = threading.Lock() |
||||
|
check_interval = 2 |
||||
|
# 共享list |
||||
|
data_dict = dict() |
||||
|
|
||||
|
#endregion |
||||
|
|
||||
|
#region 接口 |
||||
|
|
||||
|
# 测试用 |
||||
|
@app.route('/api') |
||||
|
def hello(): |
||||
|
return 'hi. —— CheckGPUsWeb' |
||||
|
|
||||
|
@app.route('/api/get_data', methods=['GET']) |
||||
|
def get_data(): |
||||
|
return jsonify({}) |
||||
|
|
||||
|
@app.route('/api/update_data', methods=['POST']) |
||||
|
def receive_data(): |
||||
|
data = request.json |
||||
|
print(data) |
||||
|
return jsonify({"status": "success"}) |
||||
|
|
||||
|
#endregion |
||||
|
|
||||
|
# 测试 |
||||
|
def main(): |
||||
|
app.run(debug=False, host='127.0.0.1', port=port) |
||||
|
|
||||
|
if __name__ == '__main__': |
||||
|
main() |
Loading…
Reference in new issue