mirror of
https://github.com/newnius/YAO-agent.git
synced 2025-06-07 13:51:56 +00:00
update
This commit is contained in:
parent
c409c279b7
commit
dc3b04e581
15
agent.py
15
agent.py
@ -27,6 +27,8 @@ pending_tasks = {}
|
|||||||
ver = 0
|
ver = 0
|
||||||
last_version = {}
|
last_version = {}
|
||||||
|
|
||||||
|
counter = {}
|
||||||
|
|
||||||
|
|
||||||
def launch_task_in_background(container, task_id):
|
def launch_task_in_background(container, task_id):
|
||||||
script = " ".join([
|
script = " ".join([
|
||||||
@ -47,13 +49,19 @@ def launch_tasks(stats):
|
|||||||
utils = {}
|
utils = {}
|
||||||
for stat in stats:
|
for stat in stats:
|
||||||
utils[stat['uuid']] = stat['utilization_gpu']
|
utils[stat['uuid']] = stat['utilization_gpu']
|
||||||
|
if int(stat['utilization_gpu']) < 60:
|
||||||
|
if stat['uuid'] not in counter:
|
||||||
|
counter[stat['uuid']] = 0
|
||||||
|
counter[stat['uuid']] += 1
|
||||||
|
else:
|
||||||
|
counter[stat['uuid']] = 0
|
||||||
|
|
||||||
client = docker.from_env()
|
client = docker.from_env()
|
||||||
container = client.containers.get('yao-agent-helper')
|
container = client.containers.get('yao-agent-helper')
|
||||||
entries_to_remove = []
|
entries_to_remove = []
|
||||||
lock.acquire()
|
lock.acquire()
|
||||||
for task_id, task in pending_tasks.items():
|
for task_id, task in pending_tasks.items():
|
||||||
if int(utils[task['gpus'][0]]) < 60:
|
if int(utils[task['gpus'][0]]) < 60 and counter[task['gpus'][0]] >= 2:
|
||||||
entries_to_remove.append(task_id)
|
entries_to_remove.append(task_id)
|
||||||
|
|
||||||
t = Thread(target=launch_task_in_background, name='launch_task', args=(container, task_id,))
|
t = Thread(target=launch_task_in_background, name='launch_task', args=(container, task_id,))
|
||||||
@ -142,6 +150,7 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
docker_mem_limit = form.getvalue('mem_limit')
|
docker_mem_limit = form.getvalue('mem_limit')
|
||||||
docker_cpu_limit = form.getvalue('cpu_limit')
|
docker_cpu_limit = form.getvalue('cpu_limit')
|
||||||
docker_network = form.getvalue('network')
|
docker_network = form.getvalue('network')
|
||||||
|
docker_wait = form.getvalue('should_wait')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
script = " ".join([
|
script = " ".join([
|
||||||
@ -154,6 +163,7 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
"--memory-reservation " + docker_mem_limit,
|
"--memory-reservation " + docker_mem_limit,
|
||||||
"--cpus " + docker_cpu_limit,
|
"--cpus " + docker_cpu_limit,
|
||||||
"--env repo=" + docker_workspace,
|
"--env repo=" + docker_workspace,
|
||||||
|
"--env should_wait=" + docker_wait,
|
||||||
docker_image,
|
docker_image,
|
||||||
docker_cmd
|
docker_cmd
|
||||||
])
|
])
|
||||||
@ -320,7 +330,8 @@ def report_msg(stats):
|
|||||||
for i in range(len(stats)):
|
for i in range(len(stats)):
|
||||||
if abs(last_version['status'][i]['memory_total'] - post_fields['status'][i]['memory_total']) > 0.0:
|
if abs(last_version['status'][i]['memory_total'] - post_fields['status'][i]['memory_total']) > 0.0:
|
||||||
flag = True
|
flag = True
|
||||||
if abs(last_version['status'][i]['memory_free'] - post_fields['status'][i]['memory_free']) / post_fields['status'][i]['memory_total'] > 0.05:
|
if abs(last_version['status'][i]['memory_free'] - post_fields['status'][i]['memory_free']) / \
|
||||||
|
post_fields['status'][i]['memory_total'] > 0.05:
|
||||||
flag = True
|
flag = True
|
||||||
if abs(last_version['status'][i]['utilization_gpu'] - post_fields['status'][i]['utilization_gpu']) > 25.0:
|
if abs(last_version['status'][i]['utilization_gpu'] - post_fields['status'][i]['utilization_gpu']) > 25.0:
|
||||||
flag = True
|
flag = True
|
||||||
|
Loading…
Reference in New Issue
Block a user