mirror of
https://github.com/newnius/YAO-agent.git
synced 2025-06-06 21:31:55 +00:00
update, support pre start
This commit is contained in:
parent
022212a6e0
commit
84a8867740
28
executor.py
28
executor.py
@ -1,4 +1,5 @@
|
|||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
|
import threading
|
||||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||||
import cgi
|
import cgi
|
||||||
import docker
|
import docker
|
||||||
@ -7,6 +8,29 @@ from urllib import parse
|
|||||||
|
|
||||||
PORT_NUMBER = 8000
|
PORT_NUMBER = 8000
|
||||||
|
|
||||||
|
lock = threading.Lock()
|
||||||
|
pending_tasks = {}
|
||||||
|
|
||||||
|
|
||||||
|
def launch_tasks(stats):
|
||||||
|
client = docker.from_env()
|
||||||
|
container = client.containers.get('yao-agent-helper')
|
||||||
|
entries_to_remove = []
|
||||||
|
lock.acquire()
|
||||||
|
for task_id, task in pending_tasks.items():
|
||||||
|
if stats[task['gpus'][0]]['utilization_gpu'] < 75:
|
||||||
|
entries_to_remove.append(task_id)
|
||||||
|
script = " ".join([
|
||||||
|
"docker exec",
|
||||||
|
id,
|
||||||
|
"pkill sleep"
|
||||||
|
])
|
||||||
|
container.exec_run('sh -c \'' + script + '\'')
|
||||||
|
|
||||||
|
for k in entries_to_remove:
|
||||||
|
pending_tasks.pop(k, None)
|
||||||
|
lock.release()
|
||||||
|
|
||||||
|
|
||||||
class MyHandler(BaseHTTPRequestHandler):
|
class MyHandler(BaseHTTPRequestHandler):
|
||||||
# Handler for the GET requests
|
# Handler for the GET requests
|
||||||
@ -104,6 +128,10 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
container = client.containers.get('yao-agent-helper')
|
container = client.containers.get('yao-agent-helper')
|
||||||
exit_code, output = container.exec_run('sh -c \'' + script + '\'')
|
exit_code, output = container.exec_run('sh -c \'' + script + '\'')
|
||||||
msg = {"code": 0, "id": output.decode('utf-8').rstrip('\n')}
|
msg = {"code": 0, "id": output.decode('utf-8').rstrip('\n')}
|
||||||
|
|
||||||
|
lock.acquire()
|
||||||
|
pending_tasks[msg['id']] = {'gpus': str(docker_gpus).split(',')}
|
||||||
|
lock.release()
|
||||||
if exit_code != 0:
|
if exit_code != 0:
|
||||||
msg["code"] = 1
|
msg["code"] = 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
14
monitor.py
14
monitor.py
@ -1,4 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
import threading
|
||||||
import time
|
import time
|
||||||
import subprocess
|
import subprocess
|
||||||
import json
|
import json
|
||||||
@ -8,6 +9,7 @@ from kafka import KafkaProducer
|
|||||||
import multiprocessing
|
import multiprocessing
|
||||||
import psutil
|
import psutil
|
||||||
import math
|
import math
|
||||||
|
from executor import launch_tasks
|
||||||
|
|
||||||
ClientID = os.getenv('ClientID', 1)
|
ClientID = os.getenv('ClientID', 1)
|
||||||
ClientHost = os.getenv('ClientHost', "localhost")
|
ClientHost = os.getenv('ClientHost', "localhost")
|
||||||
@ -15,13 +17,16 @@ KafkaBrokers = os.getenv('KafkaBrokers', 'localhost:9092').split(',')
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
interval = 10
|
interval = 5
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
status, msg_gpu = execute(['nvidia-smi', '-q', '-x', '-f', 'status.xml'])
|
status, msg_gpu = execute(['nvidia-smi', '-q', '-x', '-f', 'status.xml'])
|
||||||
if not status:
|
if not status:
|
||||||
print("execute failed, ", msg_gpu)
|
print("execute failed, ", msg_gpu)
|
||||||
report_msg()
|
stats = get_gpu_status()
|
||||||
|
report_msg(stats)
|
||||||
|
t = threading.Thread(target=launch_tasks, name='launch_tasks',args=(stats,))
|
||||||
|
t.start()
|
||||||
time.sleep(interval)
|
time.sleep(interval)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
@ -38,7 +43,7 @@ def execute(cmd):
|
|||||||
return False, e
|
return False, e
|
||||||
|
|
||||||
|
|
||||||
def report_msg():
|
def get_gpu_status():
|
||||||
DOMTree = xml.dom.minidom.parse("status.xml")
|
DOMTree = xml.dom.minidom.parse("status.xml")
|
||||||
collection = DOMTree.documentElement
|
collection = DOMTree.documentElement
|
||||||
gpus = collection.getElementsByTagName("gpu")
|
gpus = collection.getElementsByTagName("gpu")
|
||||||
@ -73,7 +78,10 @@ def report_msg():
|
|||||||
stat['power_draw'] = int(float(stat['power_draw'].split(' ')[0]))
|
stat['power_draw'] = int(float(stat['power_draw'].split(' ')[0]))
|
||||||
|
|
||||||
stats.append(stat)
|
stats.append(stat)
|
||||||
|
return stats
|
||||||
|
|
||||||
|
|
||||||
|
def report_msg(stats):
|
||||||
mem = psutil.virtual_memory()
|
mem = psutil.virtual_memory()
|
||||||
|
|
||||||
post_fields = {
|
post_fields = {
|
||||||
|
Loading…
Reference in New Issue
Block a user