Browse Source

update, fix bug

master
lxbhahaha 1 year ago
parent
commit
1406425315
  1. 27
      check.py

27
check.py

@@ -357,14 +357,17 @@ def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interv
 # 建立SSH连接
 client = paramiko.SSHClient()
 client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*2.5)
+client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3)
 cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
+shared_data_list[server_idx]['err_info'] = ' '
 re_try_count = 0
 # 循环检测
-while run_realtime:
+keep_run = True
+while run_realtime and keep_run:
 try:
-stdin, stdout, stderr = client.exec_command(cmd, timeout=interval*2.5)
+stdin, stdout, stderr = client.exec_command(cmd, timeout=interval*3)
 output = stdout.read().decode()
 output = output.split('\n')
 start_idx = 0
@@ -409,17 +412,21 @@ def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interv
 shared_data_list[server_idx]['info_list'] = result
 shared_data_list[server_idx]['updated'] = True
 # locked = False
-except:
-shared_data_list[server_idx].pop('info_list')
+except Exception as e:
+keep_run = False
+shared_data_list[server_idx]['err_info'] = f'{e}'
+if 'info_list' in shared_data_list[server_idx]:
+shared_data_list[server_idx].pop('info_list')
 time.sleep(interval)
 # 关闭连接
 client.close()
 except Exception as e:
-# if data_list_lock.locked and locked:
-# data_list_lock.release()
+# if re_try_count == 0:
+# shared_data_list[server_idx]['err_info'] = f'test:{type(e)}, {e}'
 # print(e)
+# else:
 shared_data_list[server_idx]['err_info'] = f'retry:{re_try_count}, {e}'
 time.sleep(re_connect_time)
 re_try_count += 1
@@ -512,7 +519,7 @@ def check_all(show_type='list'):
 except socket.timeout:
 print("Connection timed out.")
 except Exception as e:
-print(f"{e}")
+print(f"错误 {e}")
 # except:
 # print('连接出现问题.')
 print()

Loading…
Cancel
Save