Browse Source

增加了重连

master
lxbhahaha 1 year ago
parent
commit
f5bb6a2dc5
  1. 13
      check.py

13
check.py

@@ -349,18 +349,21 @@ def get_table_res(data_list):
return ''.join(result_str)
def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interval: float):
def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interval: float, re_connect_time: float=5):
# 循环连接
while run_realtime:
try:
# 建立SSH连接
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None))
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*2.5)
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
re_try_count = 0
# 循环检测
while run_realtime:
try:
stdin, stdout, stderr = client.exec_command(cmd)
stdin, stdout, stderr = client.exec_command(cmd, timeout=interval*2.5)
output = stdout.read().decode()
output = output.split('\n')
start_idx = 0
@@ -416,7 +419,9 @@ def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interv
# if data_list_lock.locked and locked:
# data_list_lock.release()
# print(e)
shared_data_list[server_idx]['err_info'] = str(e)
shared_data_list[server_idx]['err_info'] = f'retry:{re_try_count}, {e}'
time.sleep(re_connect_time)
re_try_count += 1
def realtime(args):
global run_realtime

Loading…
Cancel
Save