commit 770a229089 ("update") to master by 鱼骨剪, 1 year ago

Changed files:
  1. .gitignore               +1
  2. check.py                 +277
  3. serverList_examlpe.json  +23

.gitignore  +1

@@ -0,0 +1 @@
serverList.json

check.py  +277

@@ -0,0 +1,277 @@
import multiprocessing
import threading
import paramiko
import argparse
import json
import time
import os
COLOR_GREEN = '\033[0;32m'
COLOR_RED = '\033[0;31m'
COLOR_YELLOW = '\033[0;33m'
END_COLOR = '\033[0m'
server_list_path = 'serverList.json'
run_realtime = False
data_list_lock = threading.Lock()
# data_list = []

def check_gpu_utilization(server: dict):
    # Open an SSH connection to the server
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None))
    # Query GPU usage (nvidia-smi is used here)
    cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
    stdin, stdout, stderr = client.exec_command(cmd)
    output = stdout.read().decode()
    output = output.split('\n')
    # Skip everything up to and including the CSV header line
    start_idx = 0
    for i in range(len(output)):
        if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu':
            start_idx = i + 1
            break
    output = output[start_idx:-1]
    # Parse the data -----------------------------
    result = []
    for data in output:
        data_list = data.split(', ')
        idx = int(data_list[0])
        gpu_name = data_list[1]
        total_mem = int(data_list[2].split(' ')[0])
        used_mem = int(data_list[3].split(' ')[0])
        free_mem = int(data_list[4].split(' ')[0])
        util_gpu = int(data_list[5].split(' ')[0])
        util_mem = int(data_list[6].split(' ')[0])
        temperature = int(data_list[7])
        # Shorten the GPU name
        if gpu_name.startswith('NVIDIA '):
            gpu_name = gpu_name[7:]
        if gpu_name.startswith('GeForce '):
            gpu_name = gpu_name[8:]
        result.append({
            'idx': idx,
            'gpu_name': gpu_name,
            'total_mem': total_mem,
            'used_mem': used_mem,
            'free_mem': free_mem,
            'util_gpu': util_gpu,
            'util_mem': util_mem,
            'temperature': temperature
        })
    # Close the connection
    client.close()
    return result

def print_res(data_list):
    for data in data_list:
        idx = data['idx']
        gpu_name = data['gpu_name']
        used_mem = data['used_mem']
        total_mem = data['total_mem']
        util_gpu = data['util_gpu']
        # Pad so the "used / total" memory columns line up
        diff_len_space = ' ' * (len(str(total_mem)) - len(str(used_mem)))
        if used_mem < 500:
            status = COLOR_GREEN + 'free' + END_COLOR
        elif used_mem / total_mem < 0.5:
            status = COLOR_YELLOW + 'busy' + END_COLOR
        else:
            status = COLOR_RED + 'busy' + END_COLOR
        res = f'{idx}: {status} - {gpu_name} - {diff_len_space}{used_mem} / {total_mem} MiB, GPU Util: {util_gpu} %'
        print(res)

def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interval: float):
    try:
        # Open an SSH connection to the server
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None))
        cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
        # Poll until realtime mode is switched off
        while run_realtime:
            stdin, stdout, stderr = client.exec_command(cmd)
            output = stdout.read().decode()
            output = output.split('\n')
            start_idx = 0
            for i in range(len(output)):
                if output[i] == 'index, name, memory.total [MiB], memory.used [MiB], memory.free [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu':
                    start_idx = i + 1
                    break
            output = output[start_idx:-1]
            # Parse the data -----------------------------
            result = []
            for data in output:
                data_list = data.split(', ')
                idx = int(data_list[0])
                gpu_name = data_list[1]
                total_mem = int(data_list[2].split(' ')[0])
                used_mem = int(data_list[3].split(' ')[0])
                free_mem = int(data_list[4].split(' ')[0])
                util_gpu = int(data_list[5].split(' ')[0])
                util_mem = int(data_list[6].split(' ')[0])
                temperature = int(data_list[7])
                # Shorten the GPU name
                if gpu_name.startswith('NVIDIA '):
                    gpu_name = gpu_name[7:]
                if gpu_name.startswith('GeForce '):
                    gpu_name = gpu_name[8:]
                result.append({
                    'idx': idx,
                    'gpu_name': gpu_name,
                    'total_mem': total_mem,
                    'used_mem': used_mem,
                    'free_mem': free_mem,
                    'util_gpu': util_gpu,
                    'util_mem': util_mem,
                    'temperature': temperature
                })
            # Publish the result under the shared lock
            with data_list_lock:
                shared_data_list[server_idx]['info_list'] = result
            time.sleep(interval)
        # Close the connection
        client.close()
    except Exception as e:
        print(e)

def realtime(args):
    global run_realtime
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-n', type=float, default=2, help='how often each server is polled, in seconds')
        parser.add_argument('-f', type=float, default=2, help='how often the display is refreshed, in seconds')
        parser.add_argument('-e', '--exclude', type=str, default='', help='titles of servers to hide, separated by commas')
        args = parser.parse_args(args)
    except:
        print('Invalid arguments!')
        return
    exclude_list = args.exclude.split(',')
    # Load the JSON server list
    with open(server_list_path, 'r') as f:
        server_list = json.load(f)
    # Shared list of per-server results (a Manager is created here but the
    # polling threads actually share the plain Python list directly)
    manager = multiprocessing.Manager()
    data_list = []
    run_realtime = True
    # Start one polling thread per server
    idx = 0
    for i, server_data in enumerate(server_list):
        if server_data['title'] in exclude_list:
            continue
        data_list.append(dict())
        data_list[-1]['server_data'] = server_data
        thread = threading.Thread(target=keep_check_one, args=(server_data, data_list, idx, args.n))
        idx += 1
        thread.daemon = True
        thread.start()
    # Redraw the display until interrupted (Ctrl+C returns to the prompt)
    try:
        while True:
            with data_list_lock:
                os.system('cls' if os.name == 'nt' else 'clear')
                for data in data_list:
                    print(data['server_data']['title'] + ' ---------------')
                    info_list = data.get('info_list', None)
                    if info_list:
                        print_res(info_list)
                    else:
                        print('No data yet / connection error')
            time.sleep(args.f)
    except KeyboardInterrupt:
        pass
    run_realtime = False

def check_all():
    # Load the JSON server list
    with open(server_list_path, 'r') as f:
        server_list = json.load(f)
    # Query each server and print its GPUs
    for server_data in server_list:
        print('* ' + server_data['title'] + ' --------------------')
        try:
            res = check_gpu_utilization(server_data)
            print_res(res)
        except:
            print('Connection failed.')
        print()

def print_server_list():
    # Load the JSON server list
    with open(server_list_path, 'r') as f:
        server_list = json.load(f)
    for s in server_list:
        print('----------------------')
        print(f"title:{s['title']}")
        print(f"ip:{s['ip']}, port:{s['port']}, usr:{s['username']}")

def main():
    # Simple interactive command prompt
    while True:
        msg = input(' -> ')
        msg = msg.split(' ')
        cmd = msg[0]
        args = msg[1:]
        if cmd == 'check':
            check_all()
        elif cmd == 'clear' or cmd == 'cls':
            os.system('cls' if os.name == 'nt' else 'clear')
        elif cmd == 'realtime' or cmd == 'rt':
            realtime(args)
        elif cmd == 'list':
            print_server_list()
        elif cmd == 'exit':
            break
        elif cmd == 'help':
            print(f'Usage: put the servers you want to monitor into {server_list_path}')
            print()
            print('check, show the status of all GPUs')
            print('realtime, live view (same as rt)')
            print('list, show all servers recorded in the file')
            print('clear, clear the screen (same as cls)')
            print('exit, quit')
        else:
            print('Unknown command, type help for a list of commands')


if __name__ == '__main__':
    main()
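
The functions above can also be used without the interactive prompt. A minimal sketch, not part of the commit, that imports check.py and queries a single server directly; the host, user, and key path are placeholders copied from serverList_examlpe.json and assumed to be reachable and valid:

    import check

    server = {
        'title': '233',
        'ip': '10.1.16.233',
        'port': 22,
        'username': 'lxb',
        'key_filename': '/home/.ssh/id_rsa',  # a 'password' field is also honoured via server.get('password')
    }
    # One-shot query over SSH, printed in the same colored format as the prompt's check command
    check.print_res(check.check_gpu_utilization(server))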

serverList_examlpe.json  +23

@@ -0,0 +1,23 @@
[
    {
        "title": "233",
        "ip": "10.1.16.233",
        "port": 22,
        "username": "lxb",
        "key_filename": "/home/.ssh/id_rsa"
    },
    {
        "title": "76",
        "ip": "10.1.16.76",
        "port": 22,
        "username": "lxb",
        "key_filename": "/home/.ssh/id_rsa"
    },
    {
        "title": "174",
        "ip": "10.1.16.174",
        "port": 22,
        "username": "lxb",
        "key_filename": "/home/.ssh/id_rsa"
    }
]
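
The entries above authenticate with an SSH key. check.py also passes an optional "password" field to paramiko (via server.get('password', None)), so a password-based entry works as well. A hypothetical example in the same format; the host and credentials are placeholders, not part of the commit:

    {
        "title": "example",
        "ip": "192.0.2.10",
        "port": 22,
        "username": "lxb",
        "password": "<your password>"
    }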