diff --git a/check.py b/check.py index 2a563f1..cba5755 100644 --- a/check.py +++ b/check.py @@ -357,14 +357,17 @@ def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interv # 建立SSH连接 client = paramiko.SSHClient() client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*2.5) + client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3) cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' - + + shared_data_list[server_idx]['err_info'] = ' ' re_try_count = 0 + # 循环检测 - while run_realtime: + keep_run = True + while run_realtime and keep_run: try: - stdin, stdout, stderr = client.exec_command(cmd, timeout=interval*2.5) + stdin, stdout, stderr = client.exec_command(cmd, timeout=interval*3) output = stdout.read().decode() output = output.split('\n') start_idx = 0 @@ -409,17 +412,21 @@ def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interv shared_data_list[server_idx]['info_list'] = result shared_data_list[server_idx]['updated'] = True # locked = False - except: - shared_data_list[server_idx].pop('info_list') + except Exception as e: + keep_run = False + shared_data_list[server_idx]['err_info'] = f'{e}' + if 'info_list' in shared_data_list[server_idx]: + shared_data_list[server_idx].pop('info_list') time.sleep(interval) # 关闭连接 client.close() except Exception as e: - # if data_list_lock.locked and locked: - # data_list_lock.release() - # print(e) + # if re_try_count == 0: + # shared_data_list[server_idx]['err_info'] = f'test:{type(e)}, {e}' + # print(e) + # else: shared_data_list[server_idx]['err_info'] = f'retry:{re_try_count}, {e}' time.sleep(re_connect_time) re_try_count += 1 @@ -512,7 +519,7 @@ def check_all(show_type='list'): except socket.timeout: print("Connection timed out.") except Exception as e: - print(f"{e}") + print(f"错误 {e}") # except: # print('连接出现问题.') print()