diff --git a/app.py b/app.py deleted file mode 100644 index 8c445b6..0000000 --- a/app.py +++ /dev/null @@ -1,319 +0,0 @@ -from flask import Flask, jsonify -from datetime import datetime -from flask_cors import CORS -import threading -import paramiko -import json -import time - -#region 全局 - -app = Flask(__name__) -CORS(app) -port = 15002 -server_list_path = 'serverList.json' -data_list_lock = threading.Lock() -check_interval = 2 -# 共享list -data_dict = dict() - -#endregion - -#region 接口 - -# 测试用 -@app.route('/') -def hello(): - return 'hi. —— CheckGPUsWeb' - -@app.route('/all_data', methods=['GET']) -def get_data(): - return jsonify(get_all_data()) - -# 开始连接服务器 -def connect_server(): - pass - -#endregion - -def get_gpus_info(client, timeout, info_list:list=None, ignore_gpu=False): - if ignore_gpu: - return None - - try: - cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' - - stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout) - output = stdout.read().decode() - output = output.split('\n') - start_idx = 0 - for i in range(len(output)): - if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu': - start_idx = i + 1 - break - output = output[start_idx:-1] - # 解析数据 ----------------------------- - result = [] - for data in output: - data_list = data.split(', ') - idx = int(data_list[0]) - gpu_name = data_list[1] - total_mem = int(data_list[2].split(' ')[0]) - used_mem = int(data_list[3].split(' ')[0]) - free_mem = int(data_list[4].split(' ')[0]) - util_gpu = int(data_list[5].split(' ')[0]) - util_mem = int(data_list[6].split(' ')[0]) - temperature = int(data_list[7]) - - # 简化GPU名称 - if gpu_name.startswith('NVIDIA '): - gpu_name = gpu_name[7:] - if gpu_name.startswith('GeForce '): - gpu_name = gpu_name[8:] - - result.append({ - 'idx': idx, - 'gpu_name': gpu_name, - 'total_mem': total_mem, - 'used_mem': used_mem, - 'free_mem': free_mem, - 'util_gpu': util_gpu, - 'util_mem': util_mem, - 'temperature': temperature, - 'users': {} - }) - - # 读取用户使用信息 - try: - gpustat_cmd = 'gpustat --json' - stdin, stdout, stderr = client.exec_command(gpustat_cmd, timeout=timeout) - gpustat_output = stdout.read().decode() - - # 确保 gpustat 输出不是空的 - if not gpustat_output: - raise ValueError("gpustat did not return any output.") - - gpustat_info = json.loads(gpustat_output) - - # 确保解析的 gpustat 信息格式正确 - if 'gpus' not in gpustat_info: - raise ValueError("Parsed gpustat info does not contain 'gpus' key.") - - # 解析进程信息 ----------------------------- - for gpu in gpustat_info['gpus']: - idx = gpu['index'] - processes = gpu.get('processes', []) # 使用 get() 方法避免 KeyError - for process in processes: - username = process['username'] - gpu_memory_usage = process['gpu_memory_usage'] # 占用的显存 - # 找到对应的 GPU,将用户及其显存使用情况记录下来 - for gpu_result in result: - if gpu_result['idx'] == idx: - if username not in gpu_result['users']: - gpu_result['users'][username] = 0 - gpu_result['users'][username] += gpu_memory_usage - except Exception as e: - if info_list is not None: - info_list.append(f'gpu user: {e}') - - return result - except paramiko.ssh_exception.SSHException as e: - # ssh 的异常仍然抛出 - raise - except Exception as e: - if info_list is not None: - info_list.append(f'gpus: {e}') - return None - -def get_storage_info(client, timeout, path_list, info_list:list=None): - try: - result = [] - for target_path in path_list: - stdin, stdout, stderr = client.exec_command(f'df {target_path} | grep \'{target_path}\'', timeout=timeout) - output = stdout.read().decode() - if output == "": - continue - data = output.split() - tmp_res = { - "path": target_path, - "total": int(data[1]), - "available": int(data[3]) - } - result.append(tmp_res) - return result - except paramiko.ssh_exception.SSHException as e: - # ssh 的异常仍然抛出 - raise - except Exception as e: - if info_list is not None: - info_list.append(f'storage: {e}') - return None - -def get_memory_info(client, timeout, info_list:list=None): - try: - stdin, stdout, stderr = client.exec_command('free', timeout=timeout) - output = stdout.read().decode().split('\n')[1] - if output == "": - return None - data = output.split() - result = { - "total": int(data[1]), - "used": int(data[2]) - } - - return result - except paramiko.ssh_exception.SSHException as e: - # ssh 的异常仍然抛出 - raise - except Exception as e: - if info_list is not None: - info_list.append(f'memory: {e}') - return None - -def get_network_info(client, timeout, interface_name, info_list:list=None): - try: - if interface_name is None: - return None - stdin, stdout, stderr = client.exec_command(f'ifstat -i {interface_name} 0.1 1', timeout=timeout) - output = stdout.read().decode().split('\n')[2] - data = output.split() - result = { - "in": float(data[0]), - "out": float(data[1]) - } - return result - except paramiko.ssh_exception.SSHException as e: - # ssh 的异常仍然抛出 - raise - except Exception as e: - if info_list is not None: - info_list.append(f'network: {e}') - return None - -# 持续获取一个服务器的信息 -def keep_check_one(server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5): - # 处理一下需要检查的存储空间路径 - if not 'storage_list' in server: - server['storage_list'] = [] - if not '/' in server['storage_list']: - server['storage_list'].insert(0, '/') - - re_try_count = 0 - # 循环连接 - while True: - try: - # 建立SSH连接 - client = paramiko.SSHClient() - client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3) - - shared_data_list[server_title]['err_info'] = None - re_try_count = 0 - - # 循环检测 - keep_run = True - while keep_run: - try: - error_info_list = [] - # gpu 信息 - gpu_info = get_gpus_info(client, interval*3, info_list=error_info_list, ignore_gpu=server.get('ignore_gpu', False)) - # 存储空间信息 - storage_info = get_storage_info(client, interval*3, server['storage_list'], info_list=error_info_list) - # 内存信息 - memory_info = get_memory_info(client, interval*3, info_list=error_info_list) - # 网络信息 - network_info = get_network_info(client, interval*3, server.get('network_interface_name', None), info_list=error_info_list) - - # 记录信息 - with data_list_lock: - shared_data_list[server_title]['gpu_info_list'] = gpu_info - shared_data_list[server_title]['storage_info_list'] = storage_info - shared_data_list[server_title]['memory_info'] = memory_info - shared_data_list[server_title]['network_info'] = network_info - shared_data_list[server_title]['updated'] = True - shared_data_list[server_title]['maxGPU'] = len(gpu_info) if gpu_info is not None else 0 - if len(error_info_list) > 0: - shared_data_list[server_title]['err_info'] = '\n'.join(error_info_list) - - except Exception as e: - keep_run = False - shared_data_list[server_title]['err_info'] = f'{e}' - if 'gpu_info_list' in shared_data_list[server_title]: - shared_data_list[server_title].pop('gpu_info_list') - - time.sleep(interval) - - # 关闭连接 - client.close() - except Exception as e: - shared_data_list[server_title]['err_info'] = f'retry:{re_try_count}, {e}' - time.sleep(re_connect_time) - re_try_count += 1 - -# 获取所有的服务器数据 -def get_all_data(): - return filter_data(list(data_dict.keys())) - -# 根据key过滤所需的服务器数据 -def filter_data(title_list: list): - result = dict() - server_data = dict() - for title in title_list: - server_data[title] = {} - # 不存在该title的数据 - if title not in data_dict: - server_data[title]['err_info'] = f'title \'{title}\' not exist!' - continue - - # 记录数据 ---------------------------------------------------- - data_updated = data_dict[title].get('updated', False) - # 是否更新 - server_data[title]['updated'] = data_updated - # 报错信息 - err_info = data_dict[title].get('err_info', None) - if err_info is not None: - server_data[title]['err_info'] = err_info - # 显卡 - gpu_info_list = data_dict[title].get('gpu_info_list', None) - if gpu_info_list is not None: - server_data[title]['gpu_info_list'] = gpu_info_list - # 硬盘 - storage_info_list = data_dict[title].get('storage_info_list', None) - if storage_info_list is not None: - server_data[title]['storage_info_list'] = storage_info_list - # 内存 - memory_info = data_dict[title].get('memory_info', None) - if memory_info is not None: - server_data[title]['memory_info'] = memory_info - # 网络 - network_info = data_dict[title].get('network_info', None) - if network_info is not None: - server_data[title]['network_info'] = network_info - - result['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - result['server_data'] = server_data - return result - -def start_connect(): - # 加载json - with open(server_list_path, 'r') as f: - server_list = json.load(f) - - global data_dict - # 开启线程 - for i, server_data in enumerate(server_list): - data_dict[server_data['title']] = {} - data_dict[server_data['title']]['server_data'] = server_data - thread = threading.Thread(target=keep_check_one, args=(server_data, data_dict, server_data['title'], check_interval)) - thread.daemon = True - thread.start() - - print('start connect') - -# 测试 -def test(): - start_connect() - app.run(debug=True, host='127.0.0.1', port=port) - -if __name__ == '__main__': - test() \ No newline at end of file diff --git a/client.py b/client.py index c459b60..0f7b11d 100644 --- a/client.py +++ b/client.py @@ -2,46 +2,174 @@ import os import json import time import psutil +import requests +import subprocess # region get data -def get_gpus_info(): - # todo - pass +# 获取显卡相关信息 +def get_gpus_info(error_dict): + result_list = list() -def get_cpus_info(): - # cpu_usage_per_core = psutil.cpu_percent(interval=1, percpu=True) - # for i, usage in enumerate(cpu_usage_per_core): - # print(f"CPU核心 {i} 使用率: {usage}%") + try: + gpus_info = json.load(os.popen('gpustat --json')) + for gpu_info in gpus_info['gpus']: + # 处理一下 + gpu_name = gpu_info['name'] + gpu_name = gpu_name.replace('NVIDIA ', '').replace('GeForce ', '') + process_list = list() + for process_info in gpu_info['processes']: + process_list.append({ + "user": process_info['username'], + "memory": process_info['gpu_memory_usage'], + "cmd": ' '.join(process_info["full_command"]) + }) + + # 加到list中 + result_list.append({ + "idx": gpu_info['index'], + "name": gpu_name, + "temperature": gpu_info['temperature.gpu'], + "used_memory": gpu_info['memory.used'], + "total_memory": gpu_info['memory.total'], + "utilization": gpu_info['utilization.gpu'], + "process_list": process_list + }) + except Exception as e: + error_dict['gpu'] = e - # print(psutil.sensors_temperatures()) - # 获取逻辑核心数(超线程技术下的线程数) - logical_cores = psutil.cpu_count() - print(f"Logical cores: {logical_cores}") + return result_list - # 获取物理核心数(实际的CPU核心数) - physical_cores = psutil.cpu_count(logical=False) - print(f"Physical cores: {physical_cores}") +# 获取cpu相关信息 +cpu_name = None +def get_cpu_info(error_dict): + result_dict = dict() + + try: + # 获取cpu型号 + global cpu_name + def get_cpu_name(): + if cpu_name == None: + import re + # 执行lscpu命令并获取输出 + result = subprocess.run(['lscpu'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + output = result.stdout + + # 使用正则表达式匹配“Model name”或“型号名称” + model_name_match = re.search(r'Model name\s*:\s*(.+)', output) + if model_name_match: + return model_name_match.group(1).strip() + else: + # 如果没有找到“Model name”,则尝试匹配“型号名称” + model_name_match_cn = re.search(r'型号名称\s*:\s*(.+)', output) + if model_name_match_cn: + return model_name_match_cn.group(1).strip() + else: + return "CPU型号信息未找到" + else: + return cpu_name + cpu_name = get_cpu_name() + + # 获取每个cpu的温度 + temperature_list = list() + temperatures = psutil.sensors_temperatures() + if 'coretemp' in temperatures: + for entry in temperatures['coretemp']: + if entry.label.startswith('Package'): + temperature_list.append(entry.current) + + # 记录信息 + result_dict["name"] = cpu_name + result_dict["temperature_list"] = temperature_list + result_dict["core_avg_occupy"] = psutil.cpu_percent(interval=None, percpu=False) + result_dict["core_occupy_list"] = psutil.cpu_percent(interval=None, percpu=True) + + except Exception as e: + error_dict['cpu'] = e + + return result_dict -def get_storages_info(): - # todo - pass +# 获取存储相关信息 +def get_storages_info(error_dict, path_list): + result_list = list() + try: + for target_path in path_list: + data = subprocess.run(['df', target_path, '|', 'grep', target_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout + data = data.split('\n')[1].split() + tmp_res = { + "path": target_path, + "total": int(data[1]), + "available": int(data[3]) + } + result_list.append(tmp_res) + except Exception as e: + error_dict['storage'] = e -def get_memory_info(): - # todo - pass + return result_list + +# 获取内存相关信息 +def get_memory_info(error_dict): + result_dict = dict() + try: + mem = psutil.virtual_memory() + result_dict["total"] = mem.total + result_dict["used"] = mem.used + except Exception as e: + error_dict['memory'] = e + + return result_dict -def get_networks_info(): - # todo +# 获取网络相关信息 +def get_networks_info(error_dict): + # net_io = psutil.net_io_counters() + # print(net_io) pass # endregion +client_cfg = None + +def collect_data(): + result_dict = dict() + error_dict = dict() + + # 根据设置采集信息 + if 'gpu' in client_cfg['enable']: + result_dict['gpu_list'] = get_gpus_info(error_dict) + if 'cpu' in client_cfg['enable']: + result_dict['cpu'] = get_cpu_info(error_dict) + if 'storage' in client_cfg['enable']: + result_dict['storage_list'] = get_storages_info(error_dict, client_cfg['storage_list']) + if 'memory' in client_cfg['enable']: + result_dict['memory'] = get_memory_info(error_dict) + if 'network' in client_cfg['enable']: + result_dict['network_list'] = get_networks_info(error_dict) + + # 记录其他信息 + result_dict['update_time_stamp'] = int(time.time()) + result_dict['error_dict'] = error_dict + result_dict['note'] = client_cfg['note'] + result_dict['api_key'] = client_cfg['api_key'] + + return result_dict + def main(): - get_cpus_info() - # cpu_usage_per_core = get_cpus_info() - # for i, usage in enumerate(cpu_usage_per_core): - # print(f"CPU核心 {i} 使用率: {usage}%") + # 加载配置文件 + cfg_path = "client_config.json" + global client_cfg + with open(cfg_path, 'r') as f: + client_cfg = json.load(f) + + # 持续发送 + send_interval = client_cfg['interval'] + api_url = client_cfg['server_url'] + '/api/update_data' + while True: + data = collect_data() + try: + response = requests.post(api_url, json=data) + except Exception as e: + print(e) + time.sleep(send_interval) if __name__ == '__main__': main() \ No newline at end of file diff --git a/client_config.json b/client_config.json new file mode 100644 index 0000000..52f86e8 --- /dev/null +++ b/client_config.json @@ -0,0 +1,13 @@ +{ + "server_url": "http://127.0.0.1:15002", + "api_key": "default_key_000", + "interval": 3.0, + "note": "", + "enable": ["gpu", "cpu", "memory", "storage", "network"], + "storage_list":[ + "/", + "/media/D", + "/media/E", + "/media/F" + ] +} \ No newline at end of file diff --git a/data_define/client_data_example.json b/data_define/client_data_example.json index 4fc9a94..8535322 100644 --- a/data_define/client_data_example.json +++ b/data_define/client_data_example.json @@ -21,20 +21,18 @@ ] } ], - "cpu_list":[ - { - "idx": 0, - "name": "i5 6500", - "temperature": 50, - "core_avg_occupy": 31.25, - "core_occupy_list":[ - 12, - 23, - 0, - 90 - ] - } - ], + "cpu": + { + "name": "i5 6500", + "temperature_list": [50, 30], + "core_avg_occupy": 31.25, + "core_occupy_list":[ + 12, + 23, + 0, + 90 + ] + }, "storage_list":[ { "path": "/media/F", diff --git a/server.py b/server.py new file mode 100644 index 0000000..50fa5bd --- /dev/null +++ b/server.py @@ -0,0 +1,45 @@ +from flask import Flask, jsonify, request +from datetime import datetime +from flask_cors import CORS +import threading +import json +import time + +#region 全局 + +app = Flask(__name__) +CORS(app) +port = 15002 +server_list_path = 'serverList.json' +data_list_lock = threading.Lock() +check_interval = 2 +# 共享list +data_dict = dict() + +#endregion + +#region 接口 + +# 测试用 +@app.route('/api') +def hello(): + return 'hi. —— CheckGPUsWeb' + +@app.route('/api/get_data', methods=['GET']) +def get_data(): + return jsonify({}) + +@app.route('/api/update_data', methods=['POST']) +def receive_data(): + data = request.json + print(data) + return jsonify({"status": "success"}) + +#endregion + +# 测试 +def main(): + app.run(debug=False, host='127.0.0.1', port=port) + +if __name__ == '__main__': + main() \ No newline at end of file