|
@ -36,25 +36,10 @@ def connect_server(): |
|
|
|
|
|
|
|
|
#endregion |
|
|
#endregion |
|
|
|
|
|
|
|
|
def keep_check_one(server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5): |
|
|
def get_gpus_info(client, timeout): |
|
|
re_try_count = 0 |
|
|
|
|
|
# 循环连接 |
|
|
|
|
|
while True: |
|
|
|
|
|
try: |
|
|
|
|
|
# 建立SSH连接 |
|
|
|
|
|
client = paramiko.SSHClient() |
|
|
|
|
|
client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) |
|
|
|
|
|
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3) |
|
|
|
|
|
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' |
|
|
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' |
|
|
|
|
|
|
|
|
shared_data_list[server_title]['err_info'] = '' |
|
|
stdin, stdout, stderr = client.exec_command(cmd, timeout=timeout) |
|
|
re_try_count = 0 |
|
|
|
|
|
|
|
|
|
|
|
# 循环检测 |
|
|
|
|
|
keep_run = True |
|
|
|
|
|
while keep_run: |
|
|
|
|
|
try: |
|
|
|
|
|
stdin, stdout, stderr = client.exec_command(cmd, timeout=interval*3) |
|
|
|
|
|
output = stdout.read().decode() |
|
|
output = stdout.read().decode() |
|
|
output = output.split('\n') |
|
|
output = output.split('\n') |
|
|
start_idx = 0 |
|
|
start_idx = 0 |
|
@ -93,18 +78,69 @@ def keep_check_one(server: dict, shared_data_list: dict, server_title: str, inte |
|
|
'temperature': temperature |
|
|
'temperature': temperature |
|
|
}) |
|
|
}) |
|
|
|
|
|
|
|
|
|
|
|
return result |
|
|
|
|
|
|
|
|
|
|
|
def get_storage_info(client, timeout, path_list):
    """Collect disk-usage figures for each requested path on a remote host.

    Runs ``df -Pk <path>`` through ``client.exec_command`` once per entry of
    ``path_list`` and parses the data row of the output.

    Args:
        client: Object exposing ``exec_command(cmd, timeout=...)`` that returns
            a ``(stdin, stdout, stderr)`` triple whose ``stdout.read()`` yields
            bytes.
        timeout: Forwarded as the ``timeout`` of every remote command.
        path_list: Iterable of path strings to query.

    Returns:
        A list of dicts with the keys ``"path"`` (the queried path, unchanged),
        ``"total"`` and ``"available"`` (integers taken from the second and
        fourth ``df`` columns, i.e. 1024-byte blocks with ``-k``). Paths for
        which ``df`` prints no parsable data row are left out.
    """
    # Imported locally so the function does not depend on a module-level import.
    import shlex

    result = []
    for target_path in path_list:
        # -P keeps every filesystem on a single line (no wrapping for long
        # device names) and -k fixes the unit to 1024-byte blocks. The path is
        # quoted so spaces or shell metacharacters cannot alter the command.
        command = f'df -Pk {shlex.quote(target_path)}'
        stdin, stdout, stderr = client.exec_command(command, timeout=timeout)
        output = stdout.read().decode(errors='replace')

        # Expect a header row followed by one data row. The previous approach
        # of grepping for the path lost the row whenever the path was a
        # subdirectory of the mount point that df reports.
        rows = [line for line in output.splitlines() if line.strip()]
        if len(rows) < 2:
            continue

        fields = rows[-1].split()
        try:
            total = int(fields[1])
            available = int(fields[3])
        except (IndexError, ValueError):
            # Unexpected df output: skip this path instead of failing the poll.
            continue

        result.append({
            "path": target_path,
            "total": total,
            "available": available,
        })

    return result
|
|
|
|
|
|
|
|
|
|
|
# 持续获取一个服务器的信息 |
|
|
|
|
|
def keep_check_one(server: dict, shared_data_list: dict, server_title: str, interval: float, re_connect_time: float=5): |
|
|
|
|
|
# 处理一下需要检查的存储空间路径 |
|
|
|
|
|
if not 'storage_list' in server: |
|
|
|
|
|
server['storage_list'] = [] |
|
|
|
|
|
if not '/' in server['storage_list']: |
|
|
|
|
|
server['storage_list'].append('/') |
|
|
|
|
|
|
|
|
|
|
|
re_try_count = 0 |
|
|
|
|
|
# 循环连接 |
|
|
|
|
|
while True: |
|
|
|
|
|
try: |
|
|
|
|
|
# 建立SSH连接 |
|
|
|
|
|
client = paramiko.SSHClient() |
|
|
|
|
|
client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) |
|
|
|
|
|
client.connect(server['ip'], port=server['port'], username=server['username'], password=server.get('password', None), key_filename=server.get('key_filename', None), timeout=interval*3) |
|
|
|
|
|
cmd = 'nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory,temperature.gpu --format=csv' |
|
|
|
|
|
|
|
|
|
|
|
shared_data_list[server_title]['err_info'] = '' |
|
|
|
|
|
re_try_count = 0 |
|
|
|
|
|
|
|
|
|
|
|
# 循环检测 |
|
|
|
|
|
keep_run = True |
|
|
|
|
|
while keep_run: |
|
|
|
|
|
try: |
|
|
|
|
|
# gpu 信息 |
|
|
|
|
|
gpu_info = get_gpus_info(client, interval*3) |
|
|
|
|
|
# 存储空间信息 |
|
|
|
|
|
storage_info = get_storage_info(client, interval*3, server['storage_list']) |
|
|
|
|
|
|
|
|
# locked = False |
|
|
# locked = False |
|
|
with data_list_lock: |
|
|
with data_list_lock: |
|
|
# locked = True |
|
|
# locked = True |
|
|
shared_data_list[server_title]['info_list'] = result |
|
|
shared_data_list[server_title]['gpu_info_list'] = gpu_info |
|
|
|
|
|
shared_data_list[server_title]['storage_info_list'] = storage_info |
|
|
shared_data_list[server_title]['updated'] = True |
|
|
shared_data_list[server_title]['updated'] = True |
|
|
shared_data_list[server_title]['maxGPU'] = len(output) |
|
|
shared_data_list[server_title]['maxGPU'] = len(gpu_info) |
|
|
# locked = False |
|
|
# locked = False |
|
|
except Exception as e: |
|
|
except Exception as e: |
|
|
keep_run = False |
|
|
keep_run = False |
|
|
shared_data_list[server_title]['err_info'] = f'{e}' |
|
|
shared_data_list[server_title]['err_info'] = f'{e}' |
|
|
if 'info_list' in shared_data_list[server_title]: |
|
|
if 'gpu_info_list' in shared_data_list[server_title]: |
|
|
shared_data_list[server_title].pop('info_list') |
|
|
shared_data_list[server_title].pop('gpu_info_list') |
|
|
|
|
|
|
|
|
time.sleep(interval) |
|
|
time.sleep(interval) |
|
|
|
|
|
|
|
@ -130,8 +166,8 @@ def filter_data(title_list: list): |
|
|
server_data[title]['err_info'] = f'title \'{title}\' not exist!' |
|
|
server_data[title]['err_info'] = f'title \'{title}\' not exist!' |
|
|
continue |
|
|
continue |
|
|
# 还没获取到数据 |
|
|
# 还没获取到数据 |
|
|
info_list = data_dict[title].get('info_list', None) |
|
|
gpu_info_list = data_dict[title].get('gpu_info_list', None) |
|
|
if info_list is None: |
|
|
if gpu_info_list is None: |
|
|
err_info = data_dict[title].get('err_info', None) |
|
|
err_info = data_dict[title].get('err_info', None) |
|
|
if err_info is not None: |
|
|
if err_info is not None: |
|
|
server_data[title]['err_info'] = data_dict[title]['err_info'] |
|
|
server_data[title]['err_info'] = data_dict[title]['err_info'] |
|
@ -142,7 +178,8 @@ def filter_data(title_list: list): |
|
|
# 记录数据 |
|
|
# 记录数据 |
|
|
data_updated = data_dict[title].get('updated', False) |
|
|
data_updated = data_dict[title].get('updated', False) |
|
|
err_info = data_dict[title].get('err_info', '') |
|
|
err_info = data_dict[title].get('err_info', '') |
|
|
server_data[title]['info_list'] = info_list |
|
|
server_data[title]['gpu_info_list'] = gpu_info_list |
|
|
|
|
|
server_data[title]['storage_info_list'] = data_dict[title].get('storage_info_list', []) |
|
|
server_data[title]['updated'] = data_updated |
|
|
server_data[title]['updated'] = data_updated |
|
|
server_data[title]['err_info'] = err_info |
|
|
server_data[title]['err_info'] = err_info |
|
|
result['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
|
|
result['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
|
|