Browse Source

除了网络数据外都能获取了

v2
lxb 8 months ago
parent
commit
73d215bcf4
  1. 319
      app.py
  2. 180
      client.py
  3. 13
      client_config.json
  4. 26
      data_define/client_data_example.json
  5. 45
      server.py

319
app.py

@ -1,319 +0,0 @@
from flask import Flask, jsonify
from datetime import datetime
from flask_cors import CORS
import threading
import paramiko
import json
import time
#region 全局
app = Flask(__name__)
CORS(app)
port = 15002
server_list_path = 'serverList.json'
data_list_lock = threading.Lock()
check_interval = 2
# 共享list
data_dict = dict()
#endregion
#region 接口
# 测试用
@app.route('/')
def hello():
return 'hi. —— CheckGPUsWeb'
@app.route('/all_data', methods=['GET'])
def get_data():
return jsonify(get_all_data())
# 开始连接服务器
def connect_server():
pass
#endregion
def get_gpus_info(client, timeout, info_list:list=None, ignore_gpu=False):
if ignore_gpu:
return None
try:
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout)
output = stdout.read().decode()
output = output.split('\n')
start_idx = 0
for i in range(len(output)):
if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu':
start_idx = i + 1
break
output = output[start_idx:-1]
# 解析数据 -----------------------------
result = []
for data in output:
data_list = data.split(', ')
idx = int(data_list[0])
gpu_name = data_list[1]
total_mem = int(data_list[2].split(' ')[0])
used_mem = int(data_list[3].split(' ')[0])
free_mem = int(data_list[4].split(' ')[0])
util_gpu = int(data_list[5].split(' ')[0])
util_mem = int(data_list[6].split(' ')[0])
temperature = int(data_list[7])
# 简化GPU名称
if gpu_name.startswith('NVIDIA '):
gpu_name = gpu_name[7:]
if gpu_name.startswith('GeForce '):
gpu_name = gpu_name[8:]
result.append({
'idx': idx,
'gpu_name': gpu_name,
'total_mem': total_mem,
'used_mem': used_mem,
'free_mem': free_mem,
'util_gpu': util_gpu,
'util_mem': util_mem,
'temperature': temperature,
'users': {}
})
# 读取用户使用信息
try:
gpustat_cmd = 'gpustat --json'
stdin, stdout, stderr = client.exec_command(gpustat_cmd, timeout=timeout)
gpustat_output = stdout.read().decode()
# 确保 gpustat 输出不是空的
if not gpustat_output:
raise ValueError("gpustat did not return any output.")
gpustat_info = json.loads(gpustat_output)
# 确保解析的 gpustat 信息格式正确
if 'gpus' not in gpustat_info:
raise ValueError("Parsed gpustat info does not contain 'gpus' key.")
# 解析进程信息 -----------------------------
for gpu in gpustat_info['gpus']:
idx = gpu['index']
processes = gpu.get('processes', []) # 使用 get() 方法避免 KeyError
for process in processes:
username = process['username']
gpu_memory_usage = process['gpu_memory_usage'] # 占用的显存
# 找到对应的 GPU,将用户及其显存使用情况记录下来
for gpu_result in result:
if gpu_result['idx'] == idx:
if username not in gpu_result['users']:
gpu_result['users'][username] = 0
gpu_result['users'][username] += gpu_memory_usage
except Exception as e:
if info_list is not None:
info_list.append(f'gpu user: {e}')
return result
except paramiko.ssh_exception.SSHException as e:
# ssh 的异常仍然抛出
raise
except Exception as e:
if info_list is not None:
info_list.append(f'gpus: {e}')
return None
def get_storage_info(client, timeout, path_list, info_list:list=None):
try:
result = []
for target_path in path_list:
stdin, stdout, stderr = client.exec_command(f'df {target_path} | grep \'{target_path}\'', timeout=timeout)
output = stdout.read().decode()
if output == "":
continue
data = output.split()
tmp_res = {
"path": target_path,
"total": int(data[1]),
"available": int(data[3])
}
result.append(tmp_res)
return result
except paramiko.ssh_exception.SSHException as e:
# ssh 的异常仍然抛出
raise
except Exception as e:
if info_list is not None:
info_list.append(f'storage: {e}')
return None
def get_memory_info(client, timeout, info_list:list=None):
try:
stdin, stdout, stderr = client.exec_command('free', timeout=timeout)
output = stdout.read().decode().split('\n')[1]
if output == "":
return None
data = output.split()
result = {
"total": int(data[1]),
"used": int(data[2])
}
return result
except paramiko.ssh_exception.SSHException as e:
# ssh 的异常仍然抛出
raise
except Exception as e:
if info_list is not None:
info_list.append(f'memory: {e}')
return None
def get_network_info(client, timeout, interface_name, info_list:list=None):
try:
if interface_name is None:
return None
stdin, stdout, stderr = client.exec_command(f'ifstat -i {interface_name} 0.1 1', timeout=timeout)
output = stdout.read().decode().split('\n')[2]
data = output.split()
result = {
"in": float(data[0]),
"out": float(data[1])
}
return result
except paramiko.ssh_exception.SSHException as e:
# ssh 的异常仍然抛出
raise
except Exception as e:
if info_list is not None:
info_list.append(f'network: {e}')
return None
# 持续获取一个服务器的信息
def keep_check_one(server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5):
# 处理一下需要检查的存储空间路径
if not 'storage_list' in server:
server['storage_list'] = []
if not '/' in server['storage_list']:
server['storage_list'].insert(0, '/')
re_try_count = 0
# 循环连接
while True:
try:
# 建立SSH连接
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3)
shared_data_list[server_title]['err_info'] = None
re_try_count = 0
# 循环检测
keep_run = True
while keep_run:
try:
error_info_list = []
# gpu 信息
gpu_info = get_gpus_info(client, interval*3, info_list=error_info_list, ignore_gpu=server.get('ignore_gpu', False))
# 存储空间信息
storage_info = get_storage_info(client, interval*3, server['storage_list'], info_list=error_info_list)
# 内存信息
memory_info = get_memory_info(client, interval*3, info_list=error_info_list)
# 网络信息
network_info = get_network_info(client, interval*3, server.get('network_interface_name', None), info_list=error_info_list)
# 记录信息
with data_list_lock:
shared_data_list[server_title]['gpu_info_list'] = gpu_info
shared_data_list[server_title]['storage_info_list'] = storage_info
shared_data_list[server_title]['memory_info'] = memory_info
shared_data_list[server_title]['network_info'] = network_info
shared_data_list[server_title]['updated'] = True
shared_data_list[server_title]['maxGPU'] = len(gpu_info) if gpu_info is not None else 0
if len(error_info_list) > 0:
shared_data_list[server_title]['err_info'] = '\n'.join(error_info_list)
except Exception as e:
keep_run = False
shared_data_list[server_title]['err_info'] = f'{e}'
if 'gpu_info_list' in shared_data_list[server_title]:
shared_data_list[server_title].pop('gpu_info_list')
time.sleep(interval)
# 关闭连接
client.close()
except Exception as e:
shared_data_list[server_title]['err_info'] = f'retry:{re_try_count}, {e}'
time.sleep(re_connect_time)
re_try_count += 1
# 获取所有的服务器数据
def get_all_data():
return filter_data(list(data_dict.keys()))
# 根据key过滤所需的服务器数据
def filter_data(title_list: list):
result = dict()
server_data = dict()
for title in title_list:
server_data[title] = {}
# 不存在该title的数据
if title not in data_dict:
server_data[title]['err_info'] = f'title \'{title}\' not exist!'
continue
# 记录数据 ----------------------------------------------------
data_updated = data_dict[title].get('updated', False)
# 是否更新
server_data[title]['updated'] = data_updated
# 报错信息
err_info = data_dict[title].get('err_info', None)
if err_info is not None:
server_data[title]['err_info'] = err_info
# 显卡
gpu_info_list = data_dict[title].get('gpu_info_list', None)
if gpu_info_list is not None:
server_data[title]['gpu_info_list'] = gpu_info_list
# 硬盘
storage_info_list = data_dict[title].get('storage_info_list', None)
if storage_info_list is not None:
server_data[title]['storage_info_list'] = storage_info_list
# 内存
memory_info = data_dict[title].get('memory_info', None)
if memory_info is not None:
server_data[title]['memory_info'] = memory_info
# 网络
network_info = data_dict[title].get('network_info', None)
if network_info is not None:
server_data[title]['network_info'] = network_info
result['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
result['server_data'] = server_data
return result
def start_connect():
# 加载json
with open(server_list_path, 'r') as f:
server_list = json.load(f)
global data_dict
# 开启线程
for i, server_data in enumerate(server_list):
data_dict[server_data['title']] = {}
data_dict[server_data['title']]['server_data'] = server_data
thread = threading.Thread(target=keep_check_one, args=(server_data, data_dict, server_data['title'], check_interval))
thread.daemon = True
thread.start()
print('start connect')
# 测试
def test():
start_connect()
app.run(debug=True, host='127.0.0.1', port=port)
if __name__ == '__main__':
test()

180
client.py

@ -2,46 +2,174 @@ import os
import json
import time
import psutil
import requests
import subprocess
# region get data
def get_gpus_info():
# todo
pass
# 获取显卡相关信息
def get_gpus_info(error_dict):
result_list = list()
def get_cpus_info():
# cpu_usage_per_core = psutil.cpu_percent(interval=1, percpu=True)
# for i, usage in enumerate(cpu_usage_per_core):
# print(f"CPU核心 {i} 使用率: {usage}%")
try:
gpus_info = json.load(os.popen('gpustat --json'))
for gpu_info in gpus_info['gpus']:
# 处理一下
gpu_name = gpu_info['name']
gpu_name = gpu_name.replace('NVIDIA ', '').replace('GeForce ', '')
process_list = list()
for process_info in gpu_info['processes']:
process_list.append({
"user": process_info['username'],
"memory": process_info['gpu_memory_usage'],
"cmd": ' '.join(process_info["full_command"])
})
# 加到list中
result_list.append({
"idx": gpu_info['index'],
"name": gpu_name,
"temperature": gpu_info['temperature.gpu'],
"used_memory": gpu_info['memory.used'],
"total_memory": gpu_info['memory.total'],
"utilization": gpu_info['utilization.gpu'],
"process_list": process_list
})
except Exception as e:
error_dict['gpu'] = e
# print(psutil.sensors_temperatures())
# 获取逻辑核心数(超线程技术下的线程数)
logical_cores = psutil.cpu_count()
print(f"Logical cores: {logical_cores}")
return result_list
# 获取物理核心数(实际的CPU核心数)
physical_cores = psutil.cpu_count(logical=False)
print(f"Physical cores: {physical_cores}")
# 获取cpu相关信息
cpu_name = None
def get_cpu_info(error_dict):
result_dict = dict()
try:
# 获取cpu型号
global cpu_name
def get_cpu_name():
if cpu_name == None:
import re
# 执行lscpu命令并获取输出
result = subprocess.run(['lscpu'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
output = result.stdout
# 使用正则表达式匹配“Model name”或“型号名称”
model_name_match = re.search(r'Model name\s*:\s*(.+)', output)
if model_name_match:
return model_name_match.group(1).strip()
else:
# 如果没有找到“Model name”,则尝试匹配“型号名称”
model_name_match_cn = re.search(r'型号名称\s*:\s*(.+)', output)
if model_name_match_cn:
return model_name_match_cn.group(1).strip()
else:
return "CPU型号信息未找到"
else:
return cpu_name
cpu_name = get_cpu_name()
# 获取每个cpu的温度
temperature_list = list()
temperatures = psutil.sensors_temperatures()
if 'coretemp' in temperatures:
for entry in temperatures['coretemp']:
if entry.label.startswith('Package'):
temperature_list.append(entry.current)
# 记录信息
result_dict["name"] = cpu_name
result_dict["temperature_list"] = temperature_list
result_dict["core_avg_occupy"] = psutil.cpu_percent(interval=None, percpu=False)
result_dict["core_occupy_list"] = psutil.cpu_percent(interval=None, percpu=True)
except Exception as e:
error_dict['cpu'] = e
return result_dict
def get_storages_info():
# todo
pass
# 获取存储相关信息
def get_storages_info(error_dict, path_list):
result_list = list()
try:
for target_path in path_list:
data = subprocess.run(['df', target_path, '|', 'grep', target_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
data = data.split('\n')[1].split()
tmp_res = {
"path": target_path,
"total": int(data[1]),
"available": int(data[3])
}
result_list.append(tmp_res)
except Exception as e:
error_dict['storage'] = e
def get_memory_info():
# todo
pass
return result_list
# 获取内存相关信息
def get_memory_info(error_dict):
result_dict = dict()
try:
mem = psutil.virtual_memory()
result_dict["total"] = mem.total
result_dict["used"] = mem.used
except Exception as e:
error_dict['memory'] = e
return result_dict
def get_networks_info():
# todo
# 获取网络相关信息
def get_networks_info(error_dict):
# net_io = psutil.net_io_counters()
# print(net_io)
pass
# endregion
client_cfg = None
def collect_data():
result_dict = dict()
error_dict = dict()
# 根据设置采集信息
if 'gpu' in client_cfg['enable']:
result_dict['gpu_list'] = get_gpus_info(error_dict)
if 'cpu' in client_cfg['enable']:
result_dict['cpu'] = get_cpu_info(error_dict)
if 'storage' in client_cfg['enable']:
result_dict['storage_list'] = get_storages_info(error_dict, client_cfg['storage_list'])
if 'memory' in client_cfg['enable']:
result_dict['memory'] = get_memory_info(error_dict)
if 'network' in client_cfg['enable']:
result_dict['network_list'] = get_networks_info(error_dict)
# 记录其他信息
result_dict['update_time_stamp'] = int(time.time())
result_dict['error_dict'] = error_dict
result_dict['note'] = client_cfg['note']
result_dict['api_key'] = client_cfg['api_key']
return result_dict
def main():
get_cpus_info()
# cpu_usage_per_core = get_cpus_info()
# for i, usage in enumerate(cpu_usage_per_core):
# print(f"CPU核心 {i} 使用率: {usage}%")
# 加载配置文件
cfg_path = "client_config.json"
global client_cfg
with open(cfg_path, 'r') as f:
client_cfg = json.load(f)
# 持续发送
send_interval = client_cfg['interval']
api_url = client_cfg['server_url'] + '/api/update_data'
while True:
data = collect_data()
try:
response = requests.post(api_url, json=data)
except Exception as e:
print(e)
time.sleep(send_interval)
if __name__ == '__main__':
main()

13
client_config.json

@ -0,0 +1,13 @@
{
"server_url": "http://127.0.0.1:15002",
"api_key": "default_key_000",
"interval": 3.0,
"note": "",
"enable": ["gpu", "cpu", "memory", "storage", "network"],
"storage_list":[
"/",
"/media/D",
"/media/E",
"/media/F"
]
}

26
data_define/client_data_example.json

@ -21,20 +21,18 @@
]
}
],
"cpu_list":[
{
"idx": 0,
"name": "i5 6500",
"temperature": 50,
"core_avg_occupy": 31.25,
"core_occupy_list":[
12,
23,
0,
90
]
}
],
"cpu":
{
"name": "i5 6500",
"temperature_list": [50, 30],
"core_avg_occupy": 31.25,
"core_occupy_list":[
12,
23,
0,
90
]
},
"storage_list":[
{
"path": "/media/F",

45
server.py

@ -0,0 +1,45 @@
from flask import Flask, jsonify, request
from datetime import datetime
from flask_cors import CORS
import threading
import json
import time
#region 全局
app = Flask(__name__)
CORS(app)
port = 15002
server_list_path = 'serverList.json'
data_list_lock = threading.Lock()
check_interval = 2
# 共享list
data_dict = dict()
#endregion
#region 接口
# 测试用
@app.route('/api')
def hello():
return 'hi. —— CheckGPUsWeb'
@app.route('/api/get_data', methods=['GET'])
def get_data():
return jsonify({})
@app.route('/api/update_data', methods=['POST'])
def receive_data():
data = request.json
print(data)
return jsonify({"status": "success"})
#endregion
# 测试
def main():
app.run(debug=False, host='127.0.0.1', port=port)
if __name__ == '__main__':
main()
Loading…
Cancel
Save