|
@ -357,14 +357,17 @@ def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interv |
|
|
# 建立SSH连接 |
|
|
# 建立SSH连接 |
|
|
client = paramiko.SSHClient() |
|
|
client = paramiko.SSHClient() |
|
|
client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) |
|
|
client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) |
|
|
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*2.5) |
|
|
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3) |
|
|
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' |
|
|
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' |
|
|
|
|
|
|
|
|
|
|
|
shared_data_list[server_idx]['err_info'] = ' ' |
|
|
re_try_count = 0 |
|
|
re_try_count = 0 |
|
|
|
|
|
|
|
|
# 循环检测 |
|
|
# 循环检测 |
|
|
while run_realtime: |
|
|
keep_run = True |
|
|
|
|
|
while run_realtime and keep_run: |
|
|
try: |
|
|
try: |
|
|
stdin, stdout, stderr = client.exec_command(cmd, timeout=interval*2.5) |
|
|
stdin, stdout, stderr = client.exec_command(cmd, timeout=interval*3) |
|
|
output = stdout.read().decode() |
|
|
output = stdout.read().decode() |
|
|
output = output.split('\n') |
|
|
output = output.split('\n') |
|
|
start_idx = 0 |
|
|
start_idx = 0 |
|
@ -409,17 +412,21 @@ def keep_check_one(server: dict, shared_data_list: list, server_idx: int, interv |
|
|
shared_data_list[server_idx]['info_list'] = result |
|
|
shared_data_list[server_idx]['info_list'] = result |
|
|
shared_data_list[server_idx]['updated'] = True |
|
|
shared_data_list[server_idx]['updated'] = True |
|
|
# locked = False |
|
|
# locked = False |
|
|
except: |
|
|
except Exception as e: |
|
|
shared_data_list[server_idx].pop('info_list') |
|
|
keep_run = False |
|
|
|
|
|
shared_data_list[server_idx]['err_info'] = f'{e}' |
|
|
|
|
|
if 'info_list' in shared_data_list[server_idx]: |
|
|
|
|
|
shared_data_list[server_idx].pop('info_list') |
|
|
|
|
|
|
|
|
time.sleep(interval) |
|
|
time.sleep(interval) |
|
|
|
|
|
|
|
|
# 关闭连接 |
|
|
# 关闭连接 |
|
|
client.close() |
|
|
client.close() |
|
|
except Exception as e: |
|
|
except Exception as e: |
|
|
# if data_list_lock.locked and locked: |
|
|
# if re_try_count == 0: |
|
|
# data_list_lock.release() |
|
|
# shared_data_list[server_idx]['err_info'] = f'test:{type(e)}, {e}' |
|
|
# print(e) |
|
|
# print(e) |
|
|
|
|
|
# else: |
|
|
shared_data_list[server_idx]['err_info'] = f'retry:{re_try_count}, {e}' |
|
|
shared_data_list[server_idx]['err_info'] = f'retry:{re_try_count}, {e}' |
|
|
time.sleep(re_connect_time) |
|
|
time.sleep(re_connect_time) |
|
|
re_try_count += 1 |
|
|
re_try_count += 1 |
|
@ -512,7 +519,7 @@ def check_all(show_type='list'): |
|
|
except socket.timeout: |
|
|
except socket.timeout: |
|
|
print("Connection timed out.") |
|
|
print("Connection timed out.") |
|
|
except Exception as e: |
|
|
except Exception as e: |
|
|
print(f"{e}") |
|
|
print(f"错误 {e}") |
|
|
# except: |
|
|
# except: |
|
|
# print('连接出现问题.') |
|
|
# print('连接出现问题.') |
|
|
print() |
|
|
print() |
|
|