|
@ -1,6 +1,7 @@ |
|
|
import threading |
|
|
import threading |
|
|
import paramiko |
|
|
import paramiko |
|
|
import argparse |
|
|
import argparse |
|
|
|
|
|
import socket |
|
|
import json |
|
|
import json |
|
|
import time |
|
|
import time |
|
|
import os |
|
|
import os |
|
@ -79,15 +80,15 @@ table_icon_3 = { |
|
|
|
|
|
|
|
|
table_icon = table_icon_3 |
|
|
table_icon = table_icon_3 |
|
|
|
|
|
|
|
|
def check_gpu_utilization(server:dict): |
|
|
def check_gpu_utilization(server:dict, timeout=2): |
|
|
# 建立SSH连接 |
|
|
# 建立SSH连接 |
|
|
client = paramiko.SSHClient() |
|
|
client = paramiko.SSHClient() |
|
|
client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) |
|
|
client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) |
|
|
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None)) |
|
|
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=timeout) |
|
|
|
|
|
|
|
|
# 执行命令查看显卡占用情况(这里以nvidia-smi为例) |
|
|
# 执行命令查看显卡占用情况(这里以nvidia-smi为例) |
|
|
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' |
|
|
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' |
|
|
stdin, stdout, stderr = client.exec_command(cmd) |
|
|
stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout) |
|
|
output = stdout.read().decode() |
|
|
output = stdout.read().decode() |
|
|
output = output.split('\n') |
|
|
output = output.split('\n') |
|
|
start_idx = 0 |
|
|
start_idx = 0 |
|
@ -493,8 +494,16 @@ def check_all(show_type='list'): |
|
|
try: |
|
|
try: |
|
|
res = check_gpu_utilization(server_data) |
|
|
res = check_gpu_utilization(server_data) |
|
|
print_res(res) |
|
|
print_res(res) |
|
|
except: |
|
|
except paramiko.AuthenticationException: |
|
|
print('连接出现问题.') |
|
|
print("Authentication failed.") |
|
|
|
|
|
except paramiko.SSHException as ssh_err: |
|
|
|
|
|
print(f"SSH error: {ssh_err}") |
|
|
|
|
|
except socket.timeout: |
|
|
|
|
|
print("Connection timed out.") |
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
print(f"{e}") |
|
|
|
|
|
# except: |
|
|
|
|
|
# print('连接出现问题.') |
|
|
print() |
|
|
print() |
|
|
elif show_type == 'table': |
|
|
elif show_type == 'table': |
|
|
pass |
|
|
pass |
|
|