1
0
mirror of https://github.com/newnius/YAO-agent.git synced 2025-06-07 13:51:56 +00:00
This commit is contained in:
Newnius 2020-05-04 12:13:42 +08:00
parent dc3b04e581
commit 2a67b42c03

View File

@ -47,6 +47,7 @@ def launch_task_in_background(container, task_id):
def launch_tasks(stats): def launch_tasks(stats):
utils = {} utils = {}
mems = {}
for stat in stats: for stat in stats:
utils[stat['uuid']] = stat['utilization_gpu'] utils[stat['uuid']] = stat['utilization_gpu']
if int(stat['utilization_gpu']) < 60: if int(stat['utilization_gpu']) < 60:
@ -55,13 +56,15 @@ def launch_tasks(stats):
counter[stat['uuid']] += 1 counter[stat['uuid']] += 1
else: else:
counter[stat['uuid']] = 0 counter[stat['uuid']] = 0
mems[stat['uuid']] = stat['memory_free']
client = docker.from_env() client = docker.from_env()
container = client.containers.get('yao-agent-helper') container = client.containers.get('yao-agent-helper')
entries_to_remove = [] entries_to_remove = []
lock.acquire() lock.acquire()
for task_id, task in pending_tasks.items(): for task_id, task in pending_tasks.items():
if int(utils[task['gpus'][0]]) < 60 and counter[task['gpus'][0]] >= 2: if int(utils[task['gpus'][0]]) < 60 and counter[task['gpus'][0]] >= 2 \
and mems[task['gpus'][0]] > task['gpu_mem']:
entries_to_remove.append(task_id) entries_to_remove.append(task_id)
t = Thread(target=launch_task_in_background, name='launch_task', args=(container, task_id,)) t = Thread(target=launch_task_in_background, name='launch_task', args=(container, task_id,))
@ -151,6 +154,9 @@ class MyHandler(BaseHTTPRequestHandler):
docker_cpu_limit = form.getvalue('cpu_limit') docker_cpu_limit = form.getvalue('cpu_limit')
docker_network = form.getvalue('network') docker_network = form.getvalue('network')
docker_wait = form.getvalue('should_wait') docker_wait = form.getvalue('should_wait')
docker_output = form.getvalue('output_dir')
docker_hdfs_dir = form.getvalue('hdfs_dir')
docker_gpu_mem = form.getvalue('gpu_mem')
try: try:
script = " ".join([ script = " ".join([
@ -164,6 +170,9 @@ class MyHandler(BaseHTTPRequestHandler):
"--cpus " + docker_cpu_limit, "--cpus " + docker_cpu_limit,
"--env repo=" + docker_workspace, "--env repo=" + docker_workspace,
"--env should_wait=" + docker_wait, "--env should_wait=" + docker_wait,
"--env output_dir=" + docker_output,
"--env hdfs_dir=" + docker_hdfs_dir,
"--env gpu_mem=" + docker_gpu_mem,
docker_image, docker_image,
docker_cmd docker_cmd
]) ])
@ -174,7 +183,7 @@ class MyHandler(BaseHTTPRequestHandler):
msg = {"code": 0, "id": output.decode('utf-8').rstrip('\n')} msg = {"code": 0, "id": output.decode('utf-8').rstrip('\n')}
lock.acquire() lock.acquire()
pending_tasks[msg['id']] = {'gpus': str(docker_gpus).split(',')} pending_tasks[msg['id']] = {'gpus': str(docker_gpus).split(','), 'gpu_mem': int(docker_gpu_mem)}
lock.release() lock.release()
if exit_code != 0: if exit_code != 0:
msg["code"] = 1 msg["code"] = 1