mirror of
https://github.com/newnius/YAO-agent.git
synced 2025-06-07 13:51:56 +00:00
Optimize: launch earlier when the current container exits in pre-schedule; stop the monitor once the container has exited and clear its stats
This commit is contained in:
parent
9fcf5863e8
commit
6b7b1221a9
16
agent.py
16
agent.py
@ -18,6 +18,7 @@ import random
|
|||||||
import string
|
import string
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import requests
|
import requests
|
||||||
|
import traceback
|
||||||
|
|
||||||
ClientID = os.getenv('ClientID', 1)
|
ClientID = os.getenv('ClientID', 1)
|
||||||
ClientHost = os.getenv('ClientHost', "localhost")
|
ClientHost = os.getenv('ClientHost', "localhost")
|
||||||
@ -52,9 +53,9 @@ active_stats = {0: {
|
|||||||
}}
|
}}
|
||||||
|
|
||||||
|
|
||||||
def generate_token(stringLength=8):
|
def generate_token(string_length=8):
|
||||||
letters = string.ascii_lowercase
|
letters = string.ascii_lowercase
|
||||||
return ''.join(random.choice(letters) for i in range(stringLength))
|
return ''.join(random.choice(letters) for i in range(string_length))
|
||||||
|
|
||||||
|
|
||||||
def monitor_task(container_id):
|
def monitor_task(container_id):
|
||||||
@ -130,6 +131,13 @@ def monitor_task(container_id):
|
|||||||
maxCPU = utilCPU
|
maxCPU = utilCPU
|
||||||
if mem > maxMem:
|
if mem > maxMem:
|
||||||
maxMem = mem
|
maxMem = mem
|
||||||
|
# When container exited, break & clear taskStats after 30s
|
||||||
|
if pid != 0 and container.status != 'running':
|
||||||
|
time.sleep(30)
|
||||||
|
taskStatsLock.acquire()
|
||||||
|
taskStats.pop(container_id, None)
|
||||||
|
taskStatsLock.release()
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
def launch_tasks(stats):
|
def launch_tasks(stats):
|
||||||
@ -149,7 +157,7 @@ def launch_tasks(stats):
|
|||||||
lock.acquire()
|
lock.acquire()
|
||||||
for token, task in pending_tasks.items():
|
for token, task in pending_tasks.items():
|
||||||
if int(utils[task['gpus'][0]]) < 10 and counter[task['gpus'][0]] >= 2 \
|
if int(utils[task['gpus'][0]]) < 10 and counter[task['gpus'][0]] >= 2 \
|
||||||
and mem_frees[task['gpus'][0]] > task['gpu_mem']:
|
and (mem_frees[task['gpus'][0]] > task['gpu_mem'] or mem_frees[task['gpus'][0]] < 100):
|
||||||
entries_to_remove.append(token)
|
entries_to_remove.append(token)
|
||||||
|
|
||||||
for k in entries_to_remove:
|
for k in entries_to_remove:
|
||||||
@ -298,7 +306,7 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
path.mkdir(parents=True, exist_ok=True)
|
path.mkdir(parents=True, exist_ok=True)
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
print("Creation of the directory %s failed" % dfs_src)
|
print("Creation of the directory %s failed" % dfs_src)
|
||||||
print(e)
|
print(traceback.format_exc())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# set PYTHONUNBUFFERED=1 to output immediately
|
# set PYTHONUNBUFFERED=1 to output immediately
|
||||||
|
Loading…
Reference in New Issue
Block a user