Browse Source

增加了超时检测

master
lxbhahaha 1 year ago
parent
commit
59a0770936
  1. 19
      check.py

19
check.py

@@ -1,6 +1,7 @@
 import threading
 import paramiko
 import argparse
+import socket
 import json
 import time
 import os
@@ -79,15 +80,15 @@ table_icon_3 = {
 table_icon = table_icon_3
-def check_gpu_utilization(server:dict):
+def check_gpu_utilization(server:dict, timeout=2):
     # 建立SSH连接
     client = paramiko.SSHClient()
     client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-    client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None))
+    client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=timeout)
     # 执行命令查看显卡占用情况(这里以nvidia-smi为例)
     cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
-    stdin, stdout, stderr = client.exec_command(cmd)
+    stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout)
     output = stdout.read().decode()
     output = output.split('\n')
     start_idx = 0
@@ -493,8 +494,16 @@ def check_all(show_type='list'):
             try:
                 res = check_gpu_utilization(server_data)
                 print_res(res)
-            except:
-                print('连接出现问题.')
+            except paramiko.AuthenticationException:
+                print("Authentication failed.")
+            except paramiko.SSHException as ssh_err:
+                print(f"SSH error: {ssh_err}")
+            except socket.timeout:
+                print("Connection timed out.")
+            except Exception as e:
+                print(f"{e}")
+            # except:
+            # print('连接出现问题.')
             print()
     elif show_type == 'table':
         pass

Loading…
Cancel
Save