# Reconstructed from a whitespace-collapsed `git format-patch` mail:
#   commit 770a22908935ad013384587750f7ea0d8f2eff8b
#   From: 鱼骨剪 <1580622474@qq.com>
#   Date: Mon, 25 Mar 2024 22:32:10 +0800
#   Subject: [PATCH] update
# The patch created three files:
#   .gitignore              -> single entry `serverList.json`
#   check.py                -> this module
#   serverList_examlpe.json -> example server list (reproduced in the docstring below)
"""Interactive console tool that reports GPU usage of remote servers over SSH.

Each server listed in ``serverList.json`` (git-ignored, because it may hold
credentials) is queried by running ``nvidia-smi`` through paramiko.

Example server list (content of ``serverList_examlpe.json``)::

    [
        {
            "title": "233",
            "ip": "10.1.16.233",
            "port": 22,
            "username": "lxb",
            "key_filename": "/home/.ssh/id_rsa"
        },
        {
            "title": "76",
            "ip": "10.1.16.76",
            "port": 22,
            "username": "lxb",
            "key_filename": "/home/.ssh/id_rsa"
        },
        {
            "title": "174",
            "ip": "10.1.16.174",
            "port": 22,
            "username": "lxb",
            "key_filename": "/home/.ssh/id_rsa"
        }
    ]

An entry may carry ``password`` instead of (or in addition to) ``key_filename``.

Commands at the `` -> `` prompt: ``check``, ``realtime``/``rt``, ``list``,
``clear``/``cls``, ``help``, ``exit``.
"""
import multiprocessing  # noqa: F401 -- no longer used; kept so the import surface is unchanged
import threading
import argparse
import json
import time
import os

try:
    import paramiko
except ImportError:  # lets `help` / `list` work without paramiko; connecting reports the problem
    paramiko = None

COLOR_GREEN = '\033[0;32m'
COLOR_RED = '\033[0;31m'
COLOR_YELLOW = '\033[0;33m'
END_COLOR = '\033[0m'

server_list_path = 'serverList.json'
# Legacy stop flag, read by keep_check_one() only when no stop_event is passed.
run_realtime = False
# Guards every read/write of the per-server entries shared with worker threads.
data_list_lock = threading.Lock()

# Single definition of the query so the one-shot and realtime paths cannot drift.
NVIDIA_SMI_CMD = (
    'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,'
    'utilization.gpu,utilization.memory,temperature.gpu --format=csv'
)
_FIELD_COUNT = 8
# A GPU using less than this many MiB is shown as free.
_FREE_THRESHOLD_MIB = 500
# Below this used/total ratio a busy GPU is shown in yellow, otherwise red.
_BUSY_RATIO = 0.5
_NAME_PREFIXES = ('NVIDIA ', 'GeForce ')


def _leading_int(field: str) -> int:
    """Return the integer that starts *field* (``'24576 MiB'`` -> ``24576``).

    Raises:
        ValueError: if the field does not start with an integer.
    """
    return int(field.split(' ')[0])


def _int_or_none(field: str):
    """Like :func:`_leading_int`, but return ``None`` for values such as ``[N/A]``."""
    try:
        return _leading_int(field)
    except ValueError:
        return None


def parse_nvidia_smi_csv(text: str) -> list:
    """Parse the CSV text printed by ``NVIDIA_SMI_CMD`` into one dict per GPU.

    Any line whose first column is not a GPU index (the CSV header, blank
    lines, driver warnings) is skipped, so the result does not depend on the
    exact header wording or on a trailing newline.

    Args:
        text: decoded stdout of the nvidia-smi command.

    Returns:
        A list of dicts with the keys ``idx``, ``gpu_name``, ``total_mem``,
        ``used_mem``, ``free_mem``, ``util_gpu``, ``util_mem`` and
        ``temperature``. Memory values are ints (MiB). ``util_gpu``,
        ``util_mem`` and ``temperature`` are ints, or ``None`` when the
        reported value is not numeric.

    Raises:
        ValueError: if a memory column of a GPU row is not numeric.
    """
    result = []
    for line in text.splitlines():
        fields = [field.strip() for field in line.split(',')]
        if len(fields) != _FIELD_COUNT or not fields[0].isdigit():
            continue

        # Shorten the GPU name: 'NVIDIA GeForce RTX 3090' -> 'RTX 3090'.
        gpu_name = fields[1]
        for prefix in _NAME_PREFIXES:
            if gpu_name.startswith(prefix):
                gpu_name = gpu_name[len(prefix):]

        result.append({
            'idx': int(fields[0]),
            'gpu_name': gpu_name,
            'total_mem': _leading_int(fields[2]),
            'used_mem': _leading_int(fields[3]),
            'free_mem': _leading_int(fields[4]),
            'util_gpu': _int_or_none(fields[5]),
            'util_mem': _int_or_none(fields[6]),
            'temperature': _int_or_none(fields[7]),
        })
    return result


def _open_client(server: dict):
    """Open and return an SSH connection described by one server-list entry.

    Raises:
        RuntimeError: if paramiko is not installed.
        Exception: whatever ``SSHClient.connect`` raises; the client is closed first.
    """
    if paramiko is None:
        raise RuntimeError('paramiko is not installed (pip install paramiko)')
    client = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy trusts unknown host keys, which allows
    # man-in-the-middle attacks; kept because the original tool relies on it.
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        client.connect(
            server['ip'],
            port=server.get('port', 22),
            username=server['username'],
            password=server.get('password'),
            key_filename=server.get('key_filename'),
        )
    except Exception:
        client.close()
        raise
    return client


def _query_gpus(client) -> list:
    """Run nvidia-smi on an open SSH client and return the parsed GPU list."""
    _stdin, stdout, _stderr = client.exec_command(NVIDIA_SMI_CMD)
    return parse_nvidia_smi_csv(stdout.read().decode(errors='replace'))


def _load_server_list() -> list:
    """Load the server list from ``server_list_path``.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Explicit encoding: titles may be non-ASCII and the platform default varies.
    with open(server_list_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _clear_screen() -> None:
    """Clear the terminal using the platform's shell command."""
    os.system('cls' if os.name == 'nt' else 'clear')


def check_gpu_utilization(server: dict) -> list:
    """Connect to *server* once and return its GPU status.

    Args:
        server: one server-list entry (``ip``, ``username``, optional ``port``,
            ``password``, ``key_filename``).

    Returns:
        The list produced by :func:`parse_nvidia_smi_csv`.

    Raises:
        Exception: connection or parsing errors propagate to the caller; the
            SSH connection is closed in every case.
    """
    client = _open_client(server)
    try:
        return _query_gpus(client)
    finally:
        client.close()


def print_res(data_list) -> None:
    """Print one colored status line per GPU dict in *data_list*."""
    for data in data_list:
        idx = data['idx']
        gpu_name = data['gpu_name']
        used_mem = data['used_mem']
        total_mem = data['total_mem']
        util_gpu = data['util_gpu']

        # Right-align the used-memory number against the total-memory number.
        diff_len_space = ' ' * (len(str(total_mem)) - len(str(used_mem)))

        if used_mem < _FREE_THRESHOLD_MIB:
            status = COLOR_GREEN + '空闲' + END_COLOR
        elif used_mem / total_mem < _BUSY_RATIO:
            status = COLOR_YELLOW + '占用' + END_COLOR
        else:
            status = COLOR_RED + '占用' + END_COLOR

        util_text = 'N/A' if util_gpu is None else f'{util_gpu} %'
        print(f'{idx}: {status} - {gpu_name} - {diff_len_space}{used_mem} / {total_mem} MiB, '
              f'GPU Util: {util_text}')


def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interval: float,
                   stop_event=None) -> None:
    """Poll one server until asked to stop, publishing results to a shared list.

    Each successful poll stores the parsed GPU list in
    ``shared_data_list[server_idx]['info_list']``. On any failure the entry's
    ``info_list`` is set to ``None`` and the message is stored under
    ``'error'``, so the display shows the failure instead of stale data.

    Args:
        server: server-list entry to connect to.
        shared_data_list: list of per-server dicts shared with the display loop.
        server_idx: index of this server's dict in *shared_data_list*.
        interval: seconds to wait between polls.
        stop_event: optional ``threading.Event``; polling ends once it is set.
            When omitted, the module-level ``run_realtime`` flag is used.
    """
    def should_stop() -> bool:
        if stop_event is not None:
            return stop_event.is_set()
        return not run_realtime

    client = None
    try:
        client = _open_client(server)
        while not should_stop():
            result = _query_gpus(client)
            with data_list_lock:
                shared_data_list[server_idx]['info_list'] = result
                shared_data_list[server_idx].pop('error', None)

            if stop_event is not None:
                stop_event.wait(interval)  # wakes up early when the session ends
            else:
                time.sleep(interval)
    except Exception as e:  # thread boundary: report through the shared entry
        with data_list_lock:
            shared_data_list[server_idx]['info_list'] = None
            shared_data_list[server_idx]['error'] = str(e)
    finally:
        if client is not None:
            client.close()


def _render(snapshot: list) -> None:
    """Clear the screen and print the current state of every server entry."""
    _clear_screen()
    for data in snapshot:
        print(data['server_data']['title'] + ' ---------------')
        info_list = data.get('info_list')
        if info_list:
            print_res(info_list)
        elif data.get('error'):
            print('出错: ' + data['error'])
        else:
            print('出错')


def realtime(args) -> None:
    """Continuously display GPU status of all servers until Ctrl-C is pressed.

    Args:
        args: list of command-line style tokens; ``-n`` is the per-server poll
            interval, ``-f`` the screen refresh interval (both seconds) and
            ``-e``/``--exclude`` a comma-separated list of titles to skip.
    """
    global run_realtime

    parser = argparse.ArgumentParser()
    parser.add_argument('-n', type=float, default=2, help='服务器多久刷新一次')
    parser.add_argument('-f', type=float, default=2, help='显示多久刷新一次')
    parser.add_argument('-e', '--exclude', type=str, default='', help='不需要显示的服务器(title)用,分割')
    try:
        args = parser.parse_args(args)
    except SystemExit as exc:
        # argparse exits on invalid input and after printing -h; only the
        # former (non-zero code) is an error.
        if exc.code:
            print('参数有误!')
        return

    exclude_list = set(args.exclude.split(','))
    server_list = _load_server_list()

    # One event per session, so workers of an earlier session can never be
    # revived by this one.
    stop_event = threading.Event()
    data_list = []
    run_realtime = True

    for server_data in server_list:
        if server_data['title'] in exclude_list:
            continue
        data_list.append({'server_data': server_data})
        thread = threading.Thread(
            target=keep_check_one,
            args=(server_data, data_list, len(data_list) - 1, args.n, stop_event),
        )
        thread.daemon = True
        thread.start()

    try:
        while True:
            # Copy under the lock, draw outside it, so workers are never
            # blocked by terminal I/O.
            with data_list_lock:
                snapshot = [dict(entry) for entry in data_list]
            _render(snapshot)
            time.sleep(args.f)
    except KeyboardInterrupt:
        print()  # leave the "^C" on its own line and return to the prompt
    finally:
        stop_event.set()
        run_realtime = False


def check_all() -> None:
    """Query every server once and print its GPU status."""
    for server_data in _load_server_list():
        print('* ' + server_data['title'] + ' --------------------')
        try:
            res = check_gpu_utilization(server_data)
        except Exception as exc:  # one bad server must not abort the others
            print('连接出现问题.')
            print(f'  {type(exc).__name__}: {exc}')
        else:
            print_res(res)
        print()


def print_server_list() -> None:
    """Print title, address and user name of every configured server."""
    for s in _load_server_list():
        print('----------------------')
        print(f"title:{s['title']}")
        print(f"ip:{s['ip']}, port:{s.get('port', 22)}, usr:{s['username']}")


def main() -> None:
    """Run the interactive command loop until ``exit``, EOF or Ctrl-C."""
    while True:
        try:
            msg = input(' -> ')
        except (EOFError, KeyboardInterrupt):
            print()
            break

        # split() without an argument ignores repeated/leading/trailing spaces.
        parts = msg.split()
        cmd = parts[0] if parts else ''
        args = parts[1:]

        try:
            if cmd == 'check':
                check_all()
            elif cmd == 'clear' or cmd == 'cls':
                _clear_screen()
            elif cmd == 'realtime' or cmd == 'rt':
                realtime(args)
            elif cmd == 'list':
                print_server_list()
            elif cmd == 'exit':
                break
            elif cmd == 'help':
                print(f'使用方法: 需要把待查看的服务器信息放至{server_list_path}')
                print()
                print('check, 查看所有显卡状态')
                print('realtime, 实时显示(rt同)')
                print('list, 查看文件中记录的所有服务器信息')
                print('clear, 清空屏幕(cls同)')
                print('exit, 退出')
            else:
                print('没有该指令, 输入help查看帮助')
        except (FileNotFoundError, json.JSONDecodeError) as exc:
            # Missing or malformed server list: show the usage hint, keep the prompt alive.
            print(f'使用方法: 需要把待查看的服务器信息放至{server_list_path}')
            print(exc)


if __name__ == '__main__':
    main()