commit
770a229089
3 changed files with 301 additions and 0 deletions
@ -0,0 +1 @@ |
|||||
|
serverList.json |
@ -0,0 +1,277 @@ |
|||||
|
import multiprocessing |
||||
|
import threading |
||||
|
import paramiko |
||||
|
import argparse |
||||
|
import json |
||||
|
import time |
||||
|
import os |
||||
|
|
||||
|
# ANSI escape sequences used to colour the status column of the report.
COLOR_GREEN = '\033[0;32m'
COLOR_RED = '\033[0;31m'
COLOR_YELLOW = '\033[0;33m'
END_COLOR = '\033[0m'

# JSON file describing the servers to poll.
server_list_path = 'serverList.json'
# Polling threads keep running for as long as this flag is True.
run_realtime = False
# Guards the per-server result list shared by polling threads and the renderer.
data_list_lock = threading.Lock()

# Remote command whose CSV output is parsed by _parse_gpu_csv().
_GPU_QUERY_CMD = (
    'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,'
    'utilization.gpu,utilization.memory,temperature.gpu --format=csv'
)
# Number of comma-separated columns requested by _GPU_QUERY_CMD.
_GPU_FIELD_COUNT = 8
# A GPU using less memory than this (MiB) is reported as idle.
_IDLE_MEM_THRESHOLD_MIB = 500
# Vendor prefixes stripped (in this order) to shorten GPU names.
_GPU_NAME_PREFIXES = ('NVIDIA ', 'GeForce ')


def _leading_int(field):
    """Return the integer that starts *field*, e.g. ``'24576 MiB'`` -> 24576.

    Raises:
        ValueError: if the field does not start with an integer
            (for example ``'[N/A]'``).
    """
    return int(field.partition(' ')[0])


def _parse_gpu_csv(text):
    """Turn the CSV text printed by ``_GPU_QUERY_CMD`` into a list of dicts.

    Lines that are not data rows (the CSV header, blank lines, any other
    text) are skipped: a data row is a line with exactly
    ``_GPU_FIELD_COUNT`` comma-separated fields whose first field is a
    decimal GPU index. This does not rely on the exact header wording or on
    a trailing newline.

    Returns:
        One dict per GPU with the keys ``idx``, ``gpu_name``, ``total_mem``,
        ``used_mem``, ``free_mem``, ``util_gpu``, ``util_mem`` and
        ``temperature`` (all integers except ``gpu_name``).

    Raises:
        ValueError: if a numeric column of a data row is not an integer.
    """
    rows = []
    for line in text.splitlines():
        fields = [part.strip() for part in line.split(',')]
        if len(fields) != _GPU_FIELD_COUNT or not fields[0].isdigit():
            continue

        # Shorten the GPU name by dropping the vendor prefixes.
        gpu_name = fields[1]
        for prefix in _GPU_NAME_PREFIXES:
            if gpu_name.startswith(prefix):
                gpu_name = gpu_name[len(prefix):]

        rows.append({
            'idx': int(fields[0]),
            'gpu_name': gpu_name,
            'total_mem': _leading_int(fields[2]),
            'used_mem': _leading_int(fields[3]),
            'free_mem': _leading_int(fields[4]),
            'util_gpu': _leading_int(fields[5]),
            'util_mem': _leading_int(fields[6]),
            'temperature': int(fields[7]),
        })
    return rows


def _open_ssh_client(server):
    """Open an SSH connection described by a server entry and return the client.

    ``server`` must provide ``ip``, ``port`` and ``username``; ``password``
    and ``key_filename`` are optional. The client is closed again if the
    connection attempt fails, so the caller only owns a connected client.
    """
    client = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification; kept for compatibility, but consider known_hosts.
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        client.connect(
            server['ip'],
            port=server['port'],
            username=server['username'],
            password=server.get('password'),
            key_filename=server.get('key_filename'),
        )
    except Exception:
        client.close()
        raise
    return client


def check_gpu_utilization(server: dict):
    """Query one server once over SSH and return its GPU status rows.

    Args:
        server: server entry with ``ip``, ``port``, ``username`` and
            optionally ``password`` / ``key_filename``.

    Returns:
        A list of per-GPU dicts as produced by ``_parse_gpu_csv``; empty when
        the command printed no data rows.

    Raises:
        Whatever paramiko raises for connection/command failures, and
        ValueError when a numeric column cannot be parsed.
    """
    client = _open_ssh_client(server)
    try:
        _stdin, stdout, _stderr = client.exec_command(_GPU_QUERY_CMD)
        return _parse_gpu_csv(stdout.read().decode())
    finally:
        # Always release the connection, even when exec/parse fails.
        client.close()


def print_res(data_list):
    """Print one coloured status line per GPU dict in *data_list*.

    A GPU is shown as idle (green) below ``_IDLE_MEM_THRESHOLD_MIB`` of used
    memory, as busy in yellow when less than half of its memory is used, and
    as busy in red otherwise.
    """
    for gpu in data_list:
        used_mem = gpu['used_mem']
        total_mem = gpu['total_mem']

        if used_mem < _IDLE_MEM_THRESHOLD_MIB:
            colour, label = COLOR_GREEN, '空闲'
        elif used_mem / total_mem < 0.5:
            colour, label = COLOR_YELLOW, '占用'
        else:
            colour, label = COLOR_RED, '占用'
        status = colour + label + END_COLOR

        # Right-align the used amount to the width of the total amount.
        used_text = str(used_mem).rjust(len(str(total_mem)))
        idx = gpu['idx']
        gpu_name = gpu['gpu_name']
        util_gpu = gpu['util_gpu']
        print(f'{idx}: {status} - {gpu_name} - {used_text} / {total_mem} MiB, GPU Util: {util_gpu} %')


def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interval: float):
    """Poll one server repeatedly while the global ``run_realtime`` is True.

    Intended as a thread target. Each round runs ``_GPU_QUERY_CMD`` over a
    single long-lived SSH connection and stores the parsed rows in
    ``shared_data_list[server_idx]['info_list']`` under ``data_list_lock``.

    On any failure the error is printed, the stored ``info_list`` is cleared
    (so readers do not keep showing stale data as if it were live) and the
    function returns. The connection is always closed.

    Args:
        server: server entry passed to the SSH connection helper.
        shared_data_list: list of per-server dicts shared with the renderer.
        server_idx: index of this server's dict in ``shared_data_list``.
        interval: seconds to sleep between two polls.
    """
    client = None
    try:
        client = _open_ssh_client(server)
        while run_realtime:
            _stdin, stdout, _stderr = client.exec_command(_GPU_QUERY_CMD)
            result = _parse_gpu_csv(stdout.read().decode())
            with data_list_lock:
                shared_data_list[server_idx]['info_list'] = result
            time.sleep(interval)
    except Exception as e:
        with data_list_lock:
            if 0 <= server_idx < len(shared_data_list):
                shared_data_list[server_idx]['info_list'] = None
        print(e)
    finally:
        if client is not None:
            client.close()
||||
|
|
||||
|
def realtime(args):
    """Continuously display GPU status for every configured server.

    Starts one daemon polling thread per server (``keep_check_one``) and
    redraws the screen periodically until the user presses Ctrl-C, then
    stops the threads and returns to the caller.

    Args:
        args: list of command-line style tokens. Supported options:
            ``-n`` seconds between two polls of a server (default 2),
            ``-f`` seconds between two screen refreshes (default 2),
            ``-e/--exclude`` comma-separated server titles to skip.
    """
    global run_realtime

    parser = argparse.ArgumentParser()
    parser.add_argument('-n', type=float, default=2, help='服务器多久刷新一次')
    parser.add_argument('-f', type=float, default=2, help='显示多久刷新一次')
    parser.add_argument('-e', '--exclude', type=str, default='', help='不需要显示的服务器(title)用,分割')
    try:
        args = parser.parse_args(args)
    except SystemExit:
        # argparse signals invalid options (and -h) by exiting the process;
        # report the problem and go back to the prompt instead.
        print('参数有误!')
        return

    excluded_titles = set(args.exclude.split(','))

    # Load the server definitions.
    with open(server_list_path, 'r', encoding='utf-8') as f:
        server_list = json.load(f)

    # One dict per polled server, shared with the polling threads.
    data_list = []
    workers = []

    run_realtime = True
    try:
        # Start one polling thread per server that is not excluded.
        for server_data in server_list:
            if server_data['title'] in excluded_titles:
                continue
            data_list.append({'server_data': server_data})
            worker = threading.Thread(
                target=keep_check_one,
                args=(server_data, data_list, len(data_list) - 1, args.n),
            )
            worker.daemon = True
            worker.start()
            workers.append(worker)

        # Render loop; left with Ctrl-C.
        while True:
            # Copy what is needed under the lock, draw without holding it so
            # that slow terminal output never blocks the polling threads.
            with data_list_lock:
                snapshot = [
                    (entry['server_data']['title'], entry.get('info_list'))
                    for entry in data_list
                ]
            os.system('cls' if os.name == 'nt' else 'clear')
            for title, info_list in snapshot:
                print(title + ' ---------------')
                if info_list:
                    print_res(info_list)
                else:
                    print('出错')
            time.sleep(args.f)
    except KeyboardInterrupt:
        # Ctrl-C ends the live view; finish the interrupted line.
        print()
    finally:
        run_realtime = False
        # Give the workers a bounded amount of time to notice the flag, so a
        # quick re-entry cannot revive threads from this run.
        deadline = time.monotonic() + args.n + 1
        for worker in workers:
            worker.join(timeout=max(0.0, deadline - time.monotonic()))
||||
|
|
||||
|
def check_all():
    """Query every configured server once and print its GPU status.

    Servers are read from ``server_list_path``. A failure on one server is
    reported together with its cause and does not stop the remaining
    servers from being checked.
    """
    # Load the server definitions.
    with open(server_list_path, 'r', encoding='utf-8') as f:
        server_list = json.load(f)

    # Handle each server in turn.
    for server_data in server_list:
        print('* ' + server_data['title'] + ' --------------------')
        try:
            res = check_gpu_utilization(server_data)
            print_res(res)
        except Exception as e:
            # Best effort per server; Ctrl-C / SystemExit are not swallowed.
            print(f'连接出现问题. ({type(e).__name__}: {e})')
        print()
||||
|
|
||||
|
def print_server_list(path=None):
    """Print title, address and user of every server in the JSON list.

    Args:
        path: JSON file to read; defaults to the module-level
            ``server_list_path`` when omitted.
    """
    if path is None:
        path = server_list_path

    # Load the server definitions (JSON text is UTF-8).
    with open(path, 'r', encoding='utf-8') as f:
        server_list = json.load(f)

    for s in server_list:
        print('----------------------')
        print(f"title:{s['title']}")
        print(f"ip:{s['ip']}, port:{s['port']}, usr:{s['username']}")
||||
|
|
||||
|
def main():
    """Run the interactive prompt until ``exit``, end of input or Ctrl-C.

    Each input line is split on whitespace into a command and its
    arguments. Blank lines are ignored. Supported commands: ``check``,
    ``realtime``/``rt``, ``list``, ``clear``/``cls``, ``help`` and ``exit``.
    """
    # Read and dispatch commands.
    while True:
        try:
            msg = input(' -> ')
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C at the prompt: leave quietly.
            print()
            break

        # split() without an argument ignores leading/repeated whitespace,
        # so no empty tokens reach the command or its arguments.
        tokens = msg.split()
        if not tokens:
            continue
        cmd = tokens[0]
        args = tokens[1:]

        if cmd == 'check':
            check_all()
        elif cmd in ('clear', 'cls'):
            os.system('cls' if os.name == 'nt' else 'clear')
        elif cmd in ('realtime', 'rt'):
            realtime(args)
        elif cmd == 'list':
            print_server_list()
        elif cmd == 'exit':
            break
        elif cmd == 'help':
            print(f'使用方法: 需要把待查看的服务器信息放至{server_list_path}')
            print()
            print('check, 查看所有显卡状态')
            print('realtime, 实时显示(rt同)')
            print('list, 查看文件中记录的所有服务器信息')
            print('clear, 清空屏幕(cls同)')
            print('exit, 退出')
        else:
            print('没有该指令, 输入help查看帮助')


if __name__ == '__main__':
    main()
@ -0,0 +1,23 @@ |
|||||
|
[ |
||||
|
{ |
||||
|
"title": "233", |
||||
|
"ip": "10.1.16.233", |
||||
|
"port": 22, |
||||
|
"username": "lxb", |
||||
|
"key_filename": "/home/.ssh/id_rsa" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "76", |
||||
|
"ip": "10.1.16.76", |
||||
|
"port": 22, |
||||
|
"username": "lxb", |
||||
|
"key_filename": "/home/.ssh/id_rsa" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "174", |
||||
|
"ip": "10.1.16.174", |
||||
|
"port": 22, |
||||
|
"username": "lxb", |
||||
|
"key_filename": "/home/.ssh/id_rsa" |
||||
|
} |
||||
|
] |
Loading…
Reference in new issue