@@ -36,7 +36,7 @@ def connect_server():
 #endregion


-def get_gpus_info(client, timeout):
+def get_gpus_info(client, timeout, info_list:list=None):
     try:
         cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv'
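(Annotation, not part of the patch: the query above prints a CSV header row plus one row per GPU. A minimal sketch of how that text could be parsed locally; the helper name parse_gpu_csv is illustrative only.)

import csv, io

def parse_gpu_csv(output):
    # nvidia-smi --format=csv keeps units in the values (" MiB", " %"),
    # and pads fields after the comma, so strip surrounding whitespace.
    rows = csv.DictReader(io.StringIO(output))
    return [{k.strip(): v.strip() for k, v in row.items()} for row in rows]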
@@ -81,9 +81,11 @@ def get_gpus_info(client, timeout):
         return result
     except Exception as e:
+        if info_list is not None:
+            info_list.append(f'gpus: {e}')
         return None


-def get_storage_info(client, timeout, path_list):
+def get_storage_info(client, timeout, path_list, info_list:list=None):
     try:
         result = []
         for target_path in path_list:
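(Annotation: the new info_list argument is an optional out-parameter; callers that pass a list get a short error string appended on failure, while existing callers are unaffected. A hedged usage sketch, assuming an already-connected SSH client object as in the rest of the file.)

errors = []
info = get_gpus_info(client, timeout=30, info_list=errors)
if info is None and errors:
    print('GPU query failed:', errors[-1])   # e.g. "gpus: <exception message>"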
|
|
@@ -100,9 +102,11 @@ def get_storage_info(client, timeout, path_list):
             result.append(tmp_res)
         return result
     except Exception as e:
+        if info_list is not None:
+            info_list.append(f'storage: {e}')
         return None


-def get_memory_info(client, timeout):
+def get_memory_info(client, timeout, info_list:list=None):
     try:
         stdin, stdout, stderr = client.exec_command('free', timeout=timeout)
         output = stdout.read().decode().split('\n')[1]
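(Annotation: the second line of `free` is the "Mem:" row. A sketch of the kind of parsing the hidden lines presumably do; the sample values and field names below are assumptions based on the standard `free` column order.)

output = 'Mem:  131072000  65536000  32768000  1024  16384000  60000000'
fields = output.split()   # ['Mem:', total, used, free, shared, buff/cache, available], in KiB
total_kb, used_kb, free_kb = (int(x) for x in fields[1:4])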
|
|
@@ -116,6 +120,8 @@ def get_memory_info(client, timeout):
         return result
     except Exception as e:
+        if info_list is not None:
+            info_list.append(f'memory: {e}')
         return None


 # Continuously collect info from a single server
|
|
@@ -143,12 +149,13 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte
     keep_run = True
     while keep_run:
         try:
+            error_info_list = []
             # GPU info
-            gpu_info = get_gpus_info(client, interval*3)
+            gpu_info = get_gpus_info(client, interval*3, info_list=error_info_list)
             # storage space info
-            storage_info = get_storage_info(client, interval*3, server['storage_list'])
+            storage_info = get_storage_info(client, interval*3, server['storage_list'], info_list=error_info_list)
             # memory info
-            memory_info = get_memory_info(client, interval*3)
+            memory_info = get_memory_info(client, interval*3, info_list=error_info_list)

             # record the info
             with data_list_lock:
|
@@ -157,6 +164,8 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte
                 shared_data_list[server_title]['memory_info'] = memory_info
                 shared_data_list[server_title]['updated'] = True
                 shared_data_list[server_title]['maxGPU'] = len(gpu_info)
+                if len(error_info_list) > 0:
+                    shared_data_list[server_title]['err_info'] = '\n'.join(error_info_list)

         except Exception as e:
             keep_run = False
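(Annotation: with this change, whatever reads shared_data_list can surface collection problems alongside the metrics. A hedged sketch of such a consumer, reusing names from the patch; the print call stands in for however the UI actually renders the entry.)

with data_list_lock:
    entry = shared_data_list[server_title]
    if entry.get('err_info'):
        # one line per failed collector, e.g. "gpus: ...", "storage: ...", "memory: ..."
        print(f"[{server_title}] partial data:\n{entry['err_info']}")

Note that, in the hunk above, 'err_info' is only written when a collector fails; nothing shown here clears it on a clean pass, so a consumer may want to reset it after displaying.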