除了网络数据外都能获取了

1 year ago · 73d215bcf4
5 changed files with 224 additions and 359 deletions
--- a/app.py
+++ b/app.py
@ -1,319 +0,0 @@
-from flask import Flask, jsonify
-from datetime import datetime
-from flask_cors import CORS
-import threading
-import paramiko
-import json
-import time
-
-#region 全局
-
-app = Flask(__name__)
-CORS(app)
-port = 15002
-server_list_path = 'serverList.json'
-data_list_lock = threading.Lock()
-check_interval = 2
-# 共享list
-data_dict = dict()
-
-#endregion
-
-#region 接口
-
-# 测试用
-@app.route('/')
-def hello():
-    return 'hi. —— CheckGPUsWeb'
-
-@app.route('/all_data', methods=['GET'])
-def get_data():
-    return jsonify(get_all_data())
-
-# 开始连接服务器
-def connect_server():
-    pass
-
-#endregion
-
-def get_gpus_info(client, timeout, info_list:list=None, ignore_gpu=False):
-    if ignore_gpu:
-        return None
-
-    try:
-        cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
-
-        stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout)
-        output = stdout.read().decode()
-        output = output.split('\n')
-        start_idx = 0
-        for i in range(len(output)):
-            if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu':
-                start_idx = i + 1
-                break
-        output = output[start_idx:-1]
-        # 解析数据 -----------------------------
-        result = []
-        for data in output:
-            data_list = data.split(', ')
-            idx = int(data_list[0])
-            gpu_name = data_list[1]
-            total_mem = int(data_list[2].split(' ')[0])
-            used_mem = int(data_list[3].split(' ')[0])
-            free_mem = int(data_list[4].split(' ')[0])
-            util_gpu = int(data_list[5].split(' ')[0])
-            util_mem = int(data_list[6].split(' ')[0])
-            temperature = int(data_list[7])
-
-            # 简化GPU名称
-            if gpu_name.startswith('NVIDIA '):
-                gpu_name = gpu_name[7:]
-            if gpu_name.startswith('GeForce '):
-                gpu_name = gpu_name[8:]
-
-            result.append({
-                'idx': idx,
-                'gpu_name': gpu_name,
-                'total_mem': total_mem,
-                'used_mem': used_mem,
-                'free_mem': free_mem,
-                'util_gpu': util_gpu,
-                'util_mem': util_mem,
-                'temperature': temperature,
-                'users': {}
-            })
-
-        # 读取用户使用信息
-        try:
-            gpustat_cmd = 'gpustat --json'
-            stdin, stdout, stderr = client.exec_command(gpustat_cmd, timeout=timeout)
-            gpustat_output = stdout.read().decode()
-
-            # 确保 gpustat 输出不是空的
-            if not gpustat_output:
-                raise ValueError("gpustat did not return any output.")
-
-            gpustat_info = json.loads(gpustat_output)
-
-            # 确保解析的 gpustat 信息格式正确
-            if 'gpus' not in gpustat_info:
-                raise ValueError("Parsed gpustat info does not contain 'gpus' key.")
-
-            # 解析进程信息 -----------------------------
-            for gpu in gpustat_info['gpus']:
-                idx = gpu['index']
-                processes = gpu.get('processes', [])  # 使用 get() 方法避免 KeyError
-                for process in processes:
-                    username = process['username']
-                    gpu_memory_usage = process['gpu_memory_usage']  # 占用的显存
-                    # 找到对应的 GPU，将用户及其显存使用情况记录下来
-                    for gpu_result in result:
-                        if gpu_result['idx'] == idx:
-                            if username not in gpu_result['users']:
-                                gpu_result['users'][username] = 0
-                            gpu_result['users'][username] += gpu_memory_usage
-        except Exception as e:
-            if info_list is not None:
-                info_list.append(f'gpu user: {e}')
-
-        return result
-    except paramiko.ssh_exception.SSHException as e:
-        # ssh 的异常仍然抛出
-        raise
-    except Exception as e:
-        if info_list is not None:
-            info_list.append(f'gpus: {e}')
-        return None
-
-def get_storage_info(client, timeout, path_list, info_list:list=None):
-    try:
-        result = []
-        for target_path in path_list:
-            stdin, stdout, stderr = client.exec_command(f'df {target_path} | grep \'{target_path}\'', timeout=timeout)
-            output = stdout.read().decode()
-            if output == "":
-                continue
-            data = output.split()
-            tmp_res = {
-                "path": target_path,
-                "total": int(data[1]),
-                "available": int(data[3])
-            }
-            result.append(tmp_res)
-        return result
-    except paramiko.ssh_exception.SSHException as e:
-        # ssh 的异常仍然抛出
-        raise
-    except Exception as e:
-        if info_list is not None:
-            info_list.append(f'storage: {e}')
-        return None
-
-def get_memory_info(client, timeout, info_list:list=None):
-    try:
-        stdin, stdout, stderr = client.exec_command('free', timeout=timeout)
-        output = stdout.read().decode().split('\n')[1]
-        if output == "":
-            return None
-        data = output.split()
-        result = {
-            "total": int(data[1]),
-            "used": int(data[2])
-        }
-
-        return result
-    except paramiko.ssh_exception.SSHException as e:
-        # ssh 的异常仍然抛出
-        raise
-    except Exception as e:
-        if info_list is not None:
-            info_list.append(f'memory: {e}')
-        return None
-
-def get_network_info(client, timeout, interface_name, info_list:list=None):
-    try:
-        if interface_name is None:
-            return None
-        stdin, stdout, stderr = client.exec_command(f'ifstat -i {interface_name} 0.1 1', timeout=timeout)
-        output = stdout.read().decode().split('\n')[2]
-        data = output.split()
-        result = {
-            "in": float(data[0]),
-            "out": float(data[1])
-        }
-        return result
-    except paramiko.ssh_exception.SSHException as e:
-        # ssh 的异常仍然抛出
-        raise
-    except Exception as e:
-        if info_list is not None:
-            info_list.append(f'network: {e}')
-        return None
-
-# 持续获取一个服务器的信息
-def keep_check_one(server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5):
-    # 处理一下需要检查的存储空间路径
-    if not 'storage_list' in server:
-        server['storage_list'] = []
-    if not '/' in server['storage_list']:
-        server['storage_list'].insert(0, '/')
-
-    re_try_count = 0
-    # 循环连接
-    while True:
-        try:
-            # 建立SSH连接
-            client = paramiko.SSHClient()
-            client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-            client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3)
-
-            shared_data_list[server_title]['err_info'] = None
-            re_try_count = 0
-
-            # 循环检测
-            keep_run = True
-            while keep_run:
-                try:
-                    error_info_list = []
-                    # gpu 信息
-                    gpu_info = get_gpus_info(client, interval*3, info_list=error_info_list, ignore_gpu=server.get('ignore_gpu', False))
-                    # 存储空间信息
-                    storage_info = get_storage_info(client, interval*3, server['storage_list'], info_list=error_info_list)
-                    # 内存信息
-                    memory_info = get_memory_info(client, interval*3, info_list=error_info_list)
-                    # 网络信息
-                    network_info = get_network_info(client, interval*3, server.get('network_interface_name', None), info_list=error_info_list)
-
-                    # 记录信息
-                    with data_list_lock:
-                        shared_data_list[server_title]['gpu_info_list'] = gpu_info
-                        shared_data_list[server_title]['storage_info_list'] = storage_info
-                        shared_data_list[server_title]['memory_info'] = memory_info
-                        shared_data_list[server_title]['network_info'] = network_info
-                        shared_data_list[server_title]['updated'] = True
-                        shared_data_list[server_title]['maxGPU'] = len(gpu_info) if gpu_info is not None else 0
-                        if len(error_info_list) > 0:
-                            shared_data_list[server_title]['err_info'] = '\n'.join(error_info_list)
-
-                except Exception as e:
-                    keep_run = False
-                    shared_data_list[server_title]['err_info'] = f'{e}'
-                    if 'gpu_info_list' in shared_data_list[server_title]:
-                        shared_data_list[server_title].pop('gpu_info_list')
-
-                time.sleep(interval)
-
-            # 关闭连接
-            client.close()
-        except Exception as e:
-            shared_data_list[server_title]['err_info'] = f'retry:{re_try_count}, {e}'
-        time.sleep(re_connect_time)
-        re_try_count += 1
-
-# 获取所有的服务器数据
-def get_all_data():
-    return filter_data(list(data_dict.keys()))
-
-# 根据key过滤所需的服务器数据
-def filter_data(title_list: list):
-    result = dict()
-    server_data = dict()
-    for title in title_list:
-        server_data[title] = {}
-        # 不存在该title的数据
-        if title not in data_dict:
-            server_data[title]['err_info'] = f'title \'{title}\' not exist!'
-            continue
-
-        # 记录数据 ----------------------------------------------------
-        data_updated = data_dict[title].get('updated', False)
-        # 是否更新
-        server_data[title]['updated'] = data_updated
-        # 报错信息
-        err_info = data_dict[title].get('err_info', None)
-        if err_info is not None:
-            server_data[title]['err_info'] = err_info
-        # 显卡
-        gpu_info_list = data_dict[title].get('gpu_info_list', None)
-        if gpu_info_list is not None:
-            server_data[title]['gpu_info_list'] = gpu_info_list
-        # 硬盘
-        storage_info_list = data_dict[title].get('storage_info_list', None)
-        if storage_info_list is not None:
-            server_data[title]['storage_info_list'] = storage_info_list
-        # 内存
-        memory_info = data_dict[title].get('memory_info', None)
-        if memory_info is not None:
-            server_data[title]['memory_info'] = memory_info
-        # 网络
-        network_info = data_dict[title].get('network_info', None)
-        if network_info is not None:
-            server_data[title]['network_info'] = network_info
-
-    result['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-    result['server_data'] = server_data
-    return result
-
-def start_connect():
-    # 加载json
-    with open(server_list_path, 'r') as f:
-        server_list = json.load(f)
-
-    global data_dict
-    # 开启线程
-    for i, server_data in enumerate(server_list):
-        data_dict[server_data['title']] = {}
-        data_dict[server_data['title']]['server_data'] = server_data
-        thread = threading.Thread(target=keep_check_one, args=(server_data, data_dict, server_data['title'], check_interval))
-        thread.daemon = True
-        thread.start()
-
-    print('start connect')
-
-# 测试
-def test():
-    start_connect()
-    app.run(debug=True, host='127.0.0.1', port=port)
-
-if __name__ == '__main__':
-    test()
--- a/client.py
+++ b/client.py
@ -2,46 +2,174 @@ import os
 import json
 import time
 import psutil
+import requests
+import subprocess

 # region get data

-def get_gpus_info():
-    # todo
-    pass
+# Collect GPU status via `gpustat --json`
+def get_gpus_info(error_dict):
+    """Return one dict per GPU reported by ``gpustat --json``.
+
+    Args:
+        error_dict: Mutable dict; on failure the error text is stored
+            under the ``'gpu'`` key.
+
+    Returns:
+        A list of dicts with keys ``idx``, ``name``, ``temperature``,
+        ``used_memory``, ``total_memory``, ``utilization`` and
+        ``process_list``. GPUs parsed before a failure are still returned.
+    """
+    result_list = list()

-def get_cpus_info():
-    # cpu_usage_per_core = psutil.cpu_percent(interval=1, percpu=True)
-    # for i, usage in enumerate(cpu_usage_per_core):
-    #     print(f"CPU核心 {i} 使用率: {usage}%")
+    try:
+        # Use the pipe as a context manager so it is closed even when the
+        # output cannot be parsed as JSON.
+        with os.popen('gpustat --json') as pipe:
+            gpus_info = json.load(pipe)
+        for gpu_info in gpus_info['gpus']:
+            # Drop the vendor/brand prefixes to keep the displayed name short
+            gpu_name = gpu_info['name'].replace('NVIDIA ', '').replace('GeForce ', '')
+            # NOTE(review): presumably gpustat may omit "processes" or report
+            # null when it cannot query them; both are treated as "no
+            # processes" here -- confirm against the gpustat version in use.
+            process_list = [
+                {
+                    "user": process_info['username'],
+                    "memory": process_info['gpu_memory_usage'],
+                    "cmd": ' '.join(process_info["full_command"])
+                }
+                for process_info in gpu_info.get('processes') or []
+            ]
+
+            # Append this GPU's record
+            result_list.append({
+                "idx": gpu_info['index'],
+                "name": gpu_name,
+                "temperature": gpu_info['temperature.gpu'],
+                "used_memory": gpu_info['memory.used'],
+                "total_memory": gpu_info['memory.total'],
+                "utilization": gpu_info['utilization.gpu'],
+                "process_list": process_list
+            })
+    except Exception as e:
+        # Store the message, not the exception object: the collected data is
+        # later serialized to JSON and exception instances are not serializable.
+        error_dict['gpu'] = str(e)
    
-    # print(psutil.sensors_temperatures())
-    # 获取逻辑核心数（超线程技术下的线程数）
-    logical_cores = psutil.cpu_count()
-    print(f"Logical cores: {logical_cores}")
+    return result_list

-    # 获取物理核心数（实际的CPU核心数）
-    physical_cores = psutil.cpu_count(logical=False)
-    print(f"Physical cores: {physical_cores}")
+# Collect CPU model name, package temperatures and utilisation
+cpu_name = None  # cached model name; filled by the first get_cpu_info() call
+
+def _read_cpu_name():
+    """Return the CPU model name parsed from the output of ``lscpu``."""
+    import re
+    # Run lscpu and capture its output
+    output = subprocess.run(['lscpu'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
+
+    # Match the "Model name" field or its localized label "型号名称", with
+    # either an ASCII or a full-width colon. Anchoring at the start of the
+    # line keeps fields such as "BIOS Model name" from matching.
+    model_name_match = re.search(r'^\s*(?:Model name|型号名称)\s*[:：]\s*(.+)$', output, re.MULTILINE)
+    if model_name_match:
+        return model_name_match.group(1).strip()
+    return "CPU型号信息未找到"
+
+def get_cpu_info(error_dict):
+    """Return CPU name, package temperatures and utilisation figures.
+
+    Args:
+        error_dict: Mutable dict; on failure the error text is stored
+            under the ``'cpu'`` key.
+
+    Returns:
+        A dict with keys ``name``, ``temperature_list``,
+        ``core_avg_occupy`` and ``core_occupy_list``; empty when any step
+        raised.
+    """
+    result_dict = dict()
+
+    try:
+        # The model name is looked up once and cached in the module global
+        global cpu_name
+        if cpu_name is None:
+            cpu_name = _read_cpu_name()
+
+        # One temperature per CPU package, taken from the 'coretemp' sensor
+        temperatures = psutil.sensors_temperatures()
+        temperature_list = [
+            entry.current
+            for entry in temperatures.get('coretemp', [])
+            if entry.label.startswith('Package')
+        ]
+
+        # Record the results. With interval=None psutil reports utilisation
+        # since the previous call, so the very first sample is not meaningful.
+        result_dict["name"] = cpu_name
+        result_dict["temperature_list"] = temperature_list
+        result_dict["core_avg_occupy"] = psutil.cpu_percent(interval=None, percpu=False)
+        result_dict["core_occupy_list"] = psutil.cpu_percent(interval=None, percpu=True)
+
+    except Exception as e:
+        # Store the message, not the exception object: the collected data is
+        # later serialized to JSON and exception instances are not serializable.
+        error_dict['cpu'] = str(e)
+
+    return result_dict

-def get_storages_info():
-    # todo
-    pass
+# Collect capacity figures for each configured path using `df`
+def get_storages_info(error_dict, path_list):
+    """Return total/available space for every path in ``path_list``.
+
+    Args:
+        error_dict: Mutable dict; when one or more paths fail, the joined
+            error text is stored under the ``'storage'`` key.
+        path_list: Paths to query with ``df``.
+
+    Returns:
+        A list of dicts with keys ``path``, ``total`` and ``available``
+        (in 1024-byte blocks, as printed by ``df -k``). Paths that could
+        not be queried are skipped instead of aborting the remaining ones.
+    """
+    result_list = list()
+    error_messages = []
+    for target_path in path_list:
+        try:
+            # No shell is involved, so a "| grep" pipeline cannot be used
+            # here: the extra words would be handed to df as file operands.
+            # -P keeps each filesystem on a single line, -k fixes the unit.
+            completed = subprocess.run(['df', '-Pk', target_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+            lines = completed.stdout.splitlines()
+            if completed.returncode != 0 or len(lines) < 2:
+                error_messages.append(f'{target_path}: {completed.stderr.strip() or "no df output"}')
+                continue
+            # Line 0 is the header; line 1 holds the figures for target_path
+            data = lines[1].split()
+            result_list.append({
+                "path": target_path,
+                "total": int(data[1]),
+                "available": int(data[3])
+            })
+        except Exception as e:
+            error_messages.append(f'{target_path}: {e}')
+    if error_messages:
+        # Plain text so the collected data stays JSON-serializable
+        error_dict['storage'] = '; '.join(error_messages)

-def get_memory_info():
-    # todo
-    pass
+    return result_list
+    
+# Collect physical memory usage
+def get_memory_info(error_dict):
+    """Return total and used memory as reported by ``psutil.virtual_memory()``.
+
+    Args:
+        error_dict: Mutable dict; on failure the error text is stored
+            under the ``'memory'`` key.
+
+    Returns:
+        A dict with keys ``total`` and ``used``; empty when psutil raised.
+    """
+    result_dict = dict()
+    try:
+        mem = psutil.virtual_memory()
+        result_dict["total"] = mem.total
+        result_dict["used"] = mem.used
+    except Exception as e:
+        # Store the message, not the exception object: the collected data is
+        # later serialized to JSON and exception instances are not serializable.
+        error_dict['memory'] = str(e)
+
+    return result_dict

-def get_networks_info():
-    # todo
+# Collect network information (not implemented yet)
+def get_networks_info(error_dict):
+    """Placeholder for network statistics collection.
+
+    Currently does nothing and implicitly returns ``None``; ``error_dict``
+    is not touched.
+    """
+    # NOTE(review): the commented-out lines below look like the intended
+    # starting point for the implementation -- confirm.
+    # net_io = psutil.net_io_counters()
+    # print(net_io)
    pass

 # endregion

+client_cfg = None  # client configuration dict; must be set before collect_data() is called
+
+def collect_data():
+    """Gather every enabled metric plus bookkeeping fields into one dict.
+
+    Reads the module-level ``client_cfg``: its ``enable`` entry selects which
+    collectors run, and ``storage_list``, ``note`` and ``api_key`` are read
+    from it as well.
+
+    Returns:
+        A dict holding the collected sections, the current Unix timestamp
+        (``update_time_stamp``), the collectors' ``error_dict``, and the
+        configured ``note`` and ``api_key``.
+    """
+    errors = dict()
+    payload = dict()
+    enabled = client_cfg['enable']
+
+    # Run only the collectors that the configuration switches on
+    if 'gpu' in enabled:
+        payload['gpu_list'] = get_gpus_info(errors)
+    if 'cpu' in enabled:
+        payload['cpu'] = get_cpu_info(errors)
+    if 'storage' in enabled:
+        storage_paths = client_cfg['storage_list']
+        payload['storage_list'] = get_storages_info(errors, storage_paths)
+    if 'memory' in enabled:
+        payload['memory'] = get_memory_info(errors)
+    if 'network' in enabled:
+        payload['network_list'] = get_networks_info(errors)
+
+    # Bookkeeping fields sent along with the metrics
+    payload.update({
+        'update_time_stamp': int(time.time()),
+        'error_dict': errors,
+        'note': client_cfg['note'],
+        'api_key': client_cfg['api_key'],
+    })
+
+    return payload
+
 def main():
-    get_cpus_info()
-    # cpu_usage_per_core = get_cpus_info()
-    # for i, usage in enumerate(cpu_usage_per_core):
-    #     print(f"CPU核心 {i} 使用率: {usage}%")
+    """Load ``client_config.json`` and post collected data to the server forever.
+
+    The configuration is stored in the module-level ``client_cfg``. Each
+    iteration collects data, POSTs it as JSON to
+    ``<server_url>/api/update_data`` and sleeps for ``interval`` seconds.
+    Send failures are printed and do not stop the loop.
+    """
+    # Load the configuration file
+    cfg_path = "client_config.json"
+    global client_cfg
+    with open(cfg_path, 'r', encoding='utf-8') as f:
+        client_cfg = json.load(f)
+
+    # Keep sending
+    send_interval = client_cfg['interval']
+    api_url = client_cfg['server_url'] + '/api/update_data'
+    # Without a timeout requests waits indefinitely on an unresponsive
+    # server, which would stall the whole reporting loop.
+    request_timeout = 10
+    while True:
+        data = collect_data()
+        try:
+            response = requests.post(api_url, json=data, timeout=request_timeout)
+            # Surface HTTP error statuses instead of silently ignoring them
+            response.raise_for_status()
+        except Exception as e:
+            print(e)
+        time.sleep(send_interval)

 if __name__ == '__main__':
    main()
--- a/client_config.json
+++ b/client_config.json
@ -0,0 +1,13 @@
+{
+    "server_url": "http://127.0.0.1:15002",
+    "api_key": "default_key_000",
+    "interval": 3.0,
+    "note": "",
+    "enable": ["gpu", "cpu", "memory", "storage", "network"],
+    "storage_list":[
+        "/",
+        "/media/D",
+        "/media/E",
+        "/media/F"
+    ]
+}
--- a/data_define/client_data_example.json
+++ b/data_define/client_data_example.json
@ -21,20 +21,18 @@
            ]
        }
    ],
-    "cpu_list":[
-        {
-            "idx": 0,
-            "name": "i5 6500",
-            "temperature": 50,
-            "core_avg_occupy": 31.25,
-            "core_occupy_list":[
-                12,
-                23,
-                0,
-                90
-            ]
-        }
-    ],
+    "cpu":
+    {
+        "name": "i5 6500",
+        "temperature_list": [50, 30],
+        "core_avg_occupy": 31.25,
+        "core_occupy_list":[
+            12,
+            23,
+            0,
+            90
+        ]
+    },
    "storage_list":[
        {
            "path": "/media/F",
--- a/server.py
+++ b/server.py
@ -0,0 +1,45 @@
+from flask import Flask, jsonify, request
+from datetime import datetime
+from flask_cors import CORS
+import threading
+import json
+import time
+
+#region globals
+
+app = Flask(__name__)
+CORS(app)  # wrap the app with flask_cors so cross-origin requests are allowed
+port = 15002  # port passed to app.run() in main()
+server_list_path = 'serverList.json'  # not referenced elsewhere in this file
+data_list_lock = threading.Lock()  # not referenced elsewhere in this file
+check_interval = 2  # not referenced elsewhere in this file
+# shared dict (not referenced elsewhere in this file)
+data_dict = dict()
+
+#endregion
+
+#region 接口
+
+# Used for testing
+@app.route('/api')
+def hello():
+    """Return a fixed greeting string."""
+    greeting = 'hi. —— CheckGPUsWeb'
+    return greeting
+
+@app.route('/api/get_data', methods=['GET'])
+def get_data():
+    """Respond with an empty JSON object."""
+    empty_payload = dict()
+    return jsonify(empty_payload)
+
+@app.route('/api/update_data', methods=['POST'])
+def receive_data():
+    """Print the JSON body of the request and acknowledge it."""
+    payload = request.json
+    print(payload)
+    ack = {"status": "success"}
+    return jsonify(ack)
+
+#endregion
+
+# Entry point used for testing
+def main():
+    """Start the Flask app on 127.0.0.1 at the module-level ``port``."""
+    run_options = {'debug': False, 'host': '127.0.0.1', 'port': port}
+    app.run(**run_options)
+
+if __name__ == '__main__':
+    main()