1
0
mirror of https://github.com/newnius/YAO-agent.git synced 2025-06-06 05:21:55 +00:00

Optimize: in pre-scheduling, launch earlier when the current container exits; stop the monitor once a container has exited and clear its stats

This commit is contained in:
Newnius 2020-07-23 17:10:23 +08:00
parent 9fcf5863e8
commit 6b7b1221a9

View File

@ -18,6 +18,7 @@ import random
import string
from pathlib import Path
import requests
import traceback
ClientID = os.getenv('ClientID', 1)
ClientHost = os.getenv('ClientHost', "localhost")
@ -52,9 +53,9 @@ active_stats = {0: {
}}
def generate_token(string_length=8):
    """Return a random token made of lowercase ASCII letters.

    Args:
        string_length: Number of characters in the token (default 8).
            A value of 0 or less yields an empty string.

    Returns:
        A string of ``string_length`` characters drawn from ``a``-``z``.

    NOTE(review): this uses the ``random`` module, which is not
    cryptographically secure. If these tokens are ever used as secrets,
    switch to the ``secrets`` module.
    """
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for _ in range(string_length))
def monitor_task(container_id):
@ -130,6 +131,13 @@ def monitor_task(container_id):
maxCPU = utilCPU
if mem > maxMem:
maxMem = mem
# Once the container has exited, wait 30s, remove its entry from taskStats, then stop monitoring
if pid != 0 and container.status != 'running':
time.sleep(30)
taskStatsLock.acquire()
taskStats.pop(container_id, None)
taskStatsLock.release()
break
def launch_tasks(stats):
@ -149,7 +157,7 @@ def launch_tasks(stats):
lock.acquire()
for token, task in pending_tasks.items():
if int(utils[task['gpus'][0]]) < 10 and counter[task['gpus'][0]] >= 2 \
and mem_frees[task['gpus'][0]] > task['gpu_mem']:
and (mem_frees[task['gpus'][0]] > task['gpu_mem'] or mem_frees[task['gpus'][0]] < 100):
entries_to_remove.append(token)
for k in entries_to_remove:
@ -298,7 +306,7 @@ class MyHandler(BaseHTTPRequestHandler):
path.mkdir(parents=True, exist_ok=True)
except OSError as e:
print("Creation of the directory %s failed" % dfs_src)
print(e)
print(traceback.format_exc())
try:
# set PYTHONUNBUFFERED=1 to output immediately