Tool_CheckGPUsWeb/active_connector.py


								from version import version

								import threading

								import paramiko

								import time

								import json

								import re


								class Connector:

								    def __init__(self, data_dict : dict, server_cfg : dict, note_dict : dict, lock : threading.Lock, connect_check_interval : float, reconnect_interval : float, multiple_of_timeout : float = 2):

								        self.data_dict = data_dict

								        self.tmp_data_dict = dict()

								        self.server_cfg = server_cfg

								        self.note_dict = note_dict

								        self.lock = lock

								        self.tmp_lock = threading.Lock()

								        self.connect_check_interval = connect_check_interval

								        self.reconnect_interval = reconnect_interval

								        self.multiple_of_timeout = multiple_of_timeout


								    def run(self):

								        # 开启查询线程

								        for i, server_data in enumerate(self.server_cfg):

								            self.tmp_data_dict[server_data['title']] = {}

								            # self.tmp_data_dict[server_data['title']]['server_data'] = server_data

								            thread_check = threading.Thread(target=self.__keep_check_one, args=(server_data, self.tmp_data_dict, server_data['title'], self.connect_check_interval, self.reconnect_interval))

								            thread_check.daemon = True

								            thread_check.start()


								        # 开启同步数据线程

								        thread_transmit = threading.Thread(target=self.__transmit_data, args=(self.connect_check_interval,))

								        thread_transmit.daemon = True

								        thread_transmit.start()


								    # 持续获取一个服务器的信息

								    def __keep_check_one(self, server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5):

								        # 处理一下需要检查的存储空间路径

								        if not 'storage_list' in server:

								            server['storage_list'] = []

								        if not '/' in server['storage_list']:

								            server['storage_list'].insert(0, '/')


								        re_try_count = 0

								        # 循环连接

								        while True:

								            try:

								                # 建立SSH连接

								                client = paramiko.SSHClient()

								                client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

								                client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*self.multiple_of_timeout)


								                with self.tmp_lock:

								                    if 'error_dict' in shared_data_list[server_title]:

								                        shared_data_list[server_title].pop('error_dict')

								                re_try_count = 0


								                # 循环检测

								                keep_run = True

								                while keep_run:

								                    try:

								                        error_info_dict = dict()

								                        # 网络 信息

								                        network_info = self.__get_network_info(client, interval*self.multiple_of_timeout, server.get('network_interface_name', None), error_info_dict)

								                        # CPU 信息

								                        cpu_info = self.__get_cpu_info(client, interval*self.multiple_of_timeout, error_info_dict)

								                        # 内存 信息

								                        memory_info = self.__get_memory_info(client, interval*self.multiple_of_timeout, error_info_dict)

								                        # 存储空间 信息

								                        storage_info = self.__get_storage_info(client, interval*self.multiple_of_timeout, server['storage_list'], error_info_dict)

								                        # GPU 信息

								                        gpu_info = self.__get_gpus_info(client, interval*self.multiple_of_timeout, error_info_dict, ignore_gpu=server.get('ignore_gpu', False))


								                        # 记录信息

								                        with self.tmp_lock:

								                            # 添加公告

								                            if server_title in self.note_dict:

								                                shared_data_list[server_title]['note'] = self.note_dict[server_title]

								                            shared_data_list[server_title]['interval'] = interval

								                            shared_data_list[server_title]['title'] = server_title

								                            shared_data_list[server_title]['version'] = version

								                            shared_data_list[server_title]['update_time_stamp'] = int(time.time())

								                            if cpu_info is not None:

								                                shared_data_list[server_title]['cpu'] = cpu_info

								                            if memory_info is not None:

								                                shared_data_list[server_title]['gpu_list'] = gpu_info

								                            if memory_info is not None:

								                                shared_data_list[server_title]['memory'] = memory_info

								                            if storage_info is not None:

								                                shared_data_list[server_title]['storage_list'] = storage_info

								                            if network_info is not None:

								                                shared_data_list[server_title]['network_list'] = [{

								                                    'default' : False,

								                                    'name' : server['network_interface_name'],

								                                    'in' : network_info['in'],

								                                    'out' : network_info['out'],

								                                }]

								                            if len(error_info_dict) > 0:

								                                shared_data_list[server_title]['error_dict'] = error_info_dict

								                    except Exception as e:

								                        keep_run = False

								                        with self.tmp_lock:

								                            shared_data_list[server_title]['error_dict']['connect'] = f'{e}'

								                            if 'gpu_list' in shared_data_list[server_title]:

								                                shared_data_list[server_title].pop('gpu_list')


								                    time.sleep(interval)


								                # 关闭连接

								                client.close()

								            except Exception as e:

								                if 'error_dict' not in shared_data_list[server_title]:

								                    shared_data_list[server_title]['error_dict'] = dict()

								                shared_data_list[server_title]['error_dict']['connect'] = f'retry:{re_try_count}, {e}'

								            time.sleep(re_connect_time)

								            re_try_count += 1


								    # 持续的将主动获取的服务器信息同步到最终信息中

								    def __transmit_data(self, interval):

								        # 等待一点时间再开始

								        time.sleep(interval * 0.5)

								        while True:

								            time.sleep(interval)

								            # 将临时字典中的内容移动到正式数据中

								            with self.tmp_lock:

								                with self.lock:

								                    for k, v in self.tmp_data_dict.items():

								                        self.data_dict[k] = v


								    #region 获取信息的方法


								    def __get_cpu_info(self, client, timeout, info_dict:dict=None):

								        def get_cpu_temp_via_sysfs(client):

								            try:

								                command = r'''

								                for zone in /sys/class/thermal/thermal_zone*; do

								                    if [ -f "$zone/type" ] && [ -f "$zone/temp" ]; then

								                        type=$(cat "$zone/type")

								                        temp=$(cat "$zone/temp")

								                        echo "$type:$temp"

								                    fi

								                done

								                '''

								                stdin, stdout, stderr = client.exec_command(command)

								                output = stdout.read().decode().strip()

								                temperatures = []

								                for line in output.splitlines():

								                    if ':' in line:

								                        type_str, temp_str = line.split(':', 1)

								                        if 'cpu' in type_str.lower() or 'pkg' in type_str.lower():

								                            try:

								                                temp = int(temp_str.strip())

								                                temperatures.append(round(temp / 1000, 1))  # 转为摄氏度

								                            except ValueError:

								                                continue

								                return temperatures

								            except Exception as e:

								                return []


								        def get_cpu_avg_usage_via_procstat(client, interval_sec=0.3):

								            def read_cpu_stat_line():

								                stdin, stdout, _ = client.exec_command("grep '^cpu ' /proc/stat")

								                line = stdout.read().decode().strip()

								                parts = list(map(int, line.split()[1:]))

								                idle = parts[3] + parts[4]  # idle + iowait

								                total = sum(parts)

								                return idle, total

								            try:

								                idle1, total1 = read_cpu_stat_line()

								                time.sleep(interval_sec)

								                idle2, total2 = read_cpu_stat_line()


								                total_delta = total2 - total1

								                idle_delta = idle2 - idle1


								                if total_delta == 0:

								                    return 0.0

								                usage = 100.0 * (1.0 - idle_delta / total_delta)

								                return round(usage, 1)

								            except Exception as e:

								                return None


								        try:

								            result = dict()

								            # 1. 获取 CPU 型号

								            stdin, stdout, stderr = client.exec_command("lscpu")

								            lscpu_output = stdout.read().decode()

								            model_match = re.search(r"Model name\s*:\s*(.+)", lscpu_output)

								            if model_match:

								                result["name"] = model_match.group(1).strip()

								            else:

								                # 如果没有找到“Model name”，则尝试匹配“型号名称”

								                model_name_match_cn = re.search(r'型号名称\s*：\s*(.+)', lscpu_output)

								                if model_name_match_cn:

								                    result["name"] = model_name_match_cn.group(1).strip()

								                else:

								                    result["name"] = "未知CPU"


								            # 2. 获取 CPU 温度

								            result["temperature_list"] = get_cpu_temp_via_sysfs(client)


								            # 3. 获取 CPU 占用率

								            result["core_avg_occupy"] = get_cpu_avg_usage_via_procstat(client)

								            result["core_occupy_list"] = [-1] # TODO 用现在这种方法去查询所有核心的消耗比较大，也没什么用处，暂时不查了


								            return result

								        except paramiko.ssh_exception.SSHException as e:

								            # ssh 的异常仍然抛出

								            raise

								        except Exception as e:

								            if info_dict is not None:

								                info_dict['cpu'] = f'{e}'

								            return None


								    def __get_memory_info(self, client, timeout, info_dict:dict=None):

								        try:

								            stdin, stdout, stderr = client.exec_command('free', timeout=timeout)

								            output = stdout.read().decode().split('\n')[1]

								            if output == "":

								                return None

								            data = output.split()

								            result = {

								                "total": int(data[1]),

								                "used": int(data[2])

								            }


								            return result

								        except paramiko.ssh_exception.SSHException as e:

								            # ssh 的异常仍然抛出

								            raise

								        except Exception as e:

								            if info_dict is not None:

								                info_dict['memory'] = f'{e}'

								            return None


								    def __get_storage_info(self, client, timeout, path_list, info_dict:dict=None):

								        try:

								            result = []

								            for target_path in path_list:

								                stdin, stdout, stderr = client.exec_command(f'df {target_path} | grep \'{target_path}\'', timeout=timeout)

								                output = stdout.read().decode()

								                if output == "":

								                    continue

								                data = output.split()

								                tmp_res = {

								                    "path": target_path,

								                    "total": int(data[1]),

								                    "available": int(data[3])

								                }

								                result.append(tmp_res)

								            return result

								        except paramiko.ssh_exception.SSHException as e:

								            # ssh 的异常仍然抛出

								            raise

								        except Exception as e:

								            if info_dict is not None:

								                info_dict['storage'] = f'{e}'

								            return None


								    def __get_network_info(self, client, timeout, interface_name, info_dict:dict=None):

								        try:

								            if interface_name is None:

								                return None

								            stdin, stdout, stderr = client.exec_command(f'ifstat -i {interface_name} 0.1 1', timeout=timeout)

								            output = stdout.read().decode().split('\n')[2]

								            data = output.split()

								            result = {

								                "in": float(data[0]),

								                "out": float(data[1])

								            }

								            return result

								        except paramiko.ssh_exception.SSHException as e:

								            # ssh 的异常仍然抛出

								            raise

								        except Exception as e:

								            if info_dict is not None:

								                info_dict['network'] = f'{e}'

								            return None


								    def __get_gpus_info(self, client, timeout, info_dict:dict=None, ignore_gpu=False):

								        if ignore_gpu:

								            return None


								        try:

								            stdin, stdout, stderr = client.exec_command("gpustat --json")

								            output = stdout.read().decode()

								            gpus_info = json.loads(output)


								            result = []

								            for gpu_info in gpus_info['gpus']:

								                # 处理一下

								                gpu_name = gpu_info['name'].replace('NVIDIA ', '').replace('GeForce ', '')

								                process_list = []

								                for process_info in gpu_info.get('processes', []):

								                    cmd = process_info.get('command', '')

								                    if 'full_command' in process_info:

								                        cmd = ' '.join(process_info["full_command"])

								                    process_list.append({

								                        "user": process_info.get('username'),

								                        "memory": process_info.get('gpu_memory_usage'),

								                        "cmd": cmd

								                    })

								                # 加到list中

								                result.append({

								                    "idx": gpu_info['index'],

								                    "name": gpu_name,

								                    "temperature": gpu_info['temperature.gpu'],

								                    "used_memory": gpu_info['memory.used'],

								                    "total_memory": gpu_info['memory.total'],

								                    "utilization": gpu_info['utilization.gpu'],

								                    "process_list": process_list

								                })


								            return result

								        except paramiko.ssh_exception.SSHException as e:

								            # ssh 的异常仍然抛出

								            raise

								        except Exception as e:

								            if info_dict is not None:

								                info_dict['gpu'] = f'{e}'

								            return None


								    #endregion