1
0
mirror of https://github.com/newnius/YAO-agent.git synced 2025-06-07 13:51:56 +00:00

Optimize: launch pending tasks earlier when the current container exits during pre-scheduling; stop monitoring a container once it has exited and clear its stats

This commit is contained in:
Newnius 2020-07-23 17:10:23 +08:00
parent 9fcf5863e8
commit 6b7b1221a9

View File

@ -18,6 +18,7 @@ import random
import string import string
from pathlib import Path from pathlib import Path
import requests import requests
import traceback
ClientID = os.getenv('ClientID', 1) ClientID = os.getenv('ClientID', 1)
ClientHost = os.getenv('ClientHost', "localhost") ClientHost = os.getenv('ClientHost', "localhost")
@ -52,9 +53,9 @@ active_stats = {0: {
}} }}
def generate_token(stringLength=8): def generate_token(string_length=8):
letters = string.ascii_lowercase letters = string.ascii_lowercase
return ''.join(random.choice(letters) for i in range(stringLength)) return ''.join(random.choice(letters) for i in range(string_length))
def monitor_task(container_id): def monitor_task(container_id):
@ -130,6 +131,13 @@ def monitor_task(container_id):
maxCPU = utilCPU maxCPU = utilCPU
if mem > maxMem: if mem > maxMem:
maxMem = mem maxMem = mem
# When container exited, break & clear taskStats after 30s
if pid != 0 and container.status != 'running':
time.sleep(30)
taskStatsLock.acquire()
taskStats.pop(container_id, None)
taskStatsLock.release()
break
def launch_tasks(stats): def launch_tasks(stats):
@ -149,7 +157,7 @@ def launch_tasks(stats):
lock.acquire() lock.acquire()
for token, task in pending_tasks.items(): for token, task in pending_tasks.items():
if int(utils[task['gpus'][0]]) < 10 and counter[task['gpus'][0]] >= 2 \ if int(utils[task['gpus'][0]]) < 10 and counter[task['gpus'][0]] >= 2 \
and mem_frees[task['gpus'][0]] > task['gpu_mem']: and (mem_frees[task['gpus'][0]] > task['gpu_mem'] or mem_frees[task['gpus'][0]] < 100):
entries_to_remove.append(token) entries_to_remove.append(token)
for k in entries_to_remove: for k in entries_to_remove:
@ -298,7 +306,7 @@ class MyHandler(BaseHTTPRequestHandler):
path.mkdir(parents=True, exist_ok=True) path.mkdir(parents=True, exist_ok=True)
except OSError as e: except OSError as e:
print("Creation of the directory %s failed" % dfs_src) print("Creation of the directory %s failed" % dfs_src)
print(e) print(traceback.format_exc())
try: try:
# set PYTHONUNBUFFERED=1 to output immediately # set PYTHONUNBUFFERED=1 to output immediately