mirror of
https://github.com/newnius/YAO-agent.git
synced 2025-06-07 22:01:55 +00:00
add event driven heart beats
This commit is contained in:
parent
afc5b55a93
commit
055d554a43
157
agent.py
157
agent.py
@ -14,61 +14,48 @@ from http.server import BaseHTTPRequestHandler, HTTPServer
|
|||||||
import cgi
|
import cgi
|
||||||
import docker
|
import docker
|
||||||
from urllib import parse
|
from urllib import parse
|
||||||
|
import random
|
||||||
|
import string
|
||||||
|
|
||||||
ClientID = os.getenv('ClientID', 1)
|
ClientID = os.getenv('ClientID', 1)
|
||||||
ClientHost = os.getenv('ClientHost', "localhost")
|
ClientHost = os.getenv('ClientHost', "localhost")
|
||||||
|
ClientExtHost = os.getenv('ClientExtHost', "localhost")
|
||||||
KafkaBrokers = os.getenv('KafkaBrokers', 'localhost:9092').split(',')
|
KafkaBrokers = os.getenv('KafkaBrokers', 'localhost:9092').split(',')
|
||||||
|
|
||||||
PORT_NUMBER = 8000
|
PORT = os.getenv('Port', 8000)
|
||||||
|
HeartbeatInterval = os.getenv('HeartbeatInterval', 5)
|
||||||
|
|
||||||
lock = Lock()
|
lock = Lock()
|
||||||
pending_tasks = {}
|
# pending_tasks = {}
|
||||||
|
id2token = {}
|
||||||
ver = 0
|
|
||||||
last_version = {}
|
|
||||||
|
|
||||||
counter = {}
|
counter = {}
|
||||||
|
|
||||||
|
|
||||||
def launch_task_in_background(container, task_id):
|
def generate_token(stringLength=8):
|
||||||
script = " ".join([
|
letters = string.ascii_lowercase
|
||||||
"docker exec",
|
return ''.join(random.choice(letters) for i in range(stringLength))
|
||||||
task_id,
|
|
||||||
"pkill",
|
|
||||||
"sleep"
|
|
||||||
])
|
|
||||||
|
|
||||||
while True:
|
|
||||||
code = container.exec_run('sh -c \'' + script + '\'').exit_code
|
|
||||||
if code == 0:
|
|
||||||
break
|
|
||||||
time.sleep(0.1)
|
|
||||||
|
|
||||||
|
|
||||||
def launch_tasks(stats):
|
def launch_tasks(stats):
|
||||||
utils = {}
|
utils = {}
|
||||||
mems = {}
|
mem_frees = {}
|
||||||
for stat in stats:
|
for stat in stats:
|
||||||
utils[stat['uuid']] = stat['utilization_gpu']
|
utils[stat['uuid']] = stat['utilization_gpu']
|
||||||
if int(stat['utilization_gpu']) < 60:
|
if int(stat['utilization_gpu']) < 10:
|
||||||
if stat['uuid'] not in counter:
|
if stat['uuid'] not in counter:
|
||||||
counter[stat['uuid']] = 0
|
counter[stat['uuid']] = 0
|
||||||
counter[stat['uuid']] += 1
|
counter[stat['uuid']] += 1
|
||||||
else:
|
else:
|
||||||
counter[stat['uuid']] = 0
|
counter[stat['uuid']] = 0
|
||||||
mems[stat['uuid']] = stat['memory_free']
|
mem_frees[stat['uuid']] = stat['memory_free']
|
||||||
|
|
||||||
client = docker.from_env()
|
|
||||||
container = client.containers.get('yao-agent-helper')
|
|
||||||
entries_to_remove = []
|
entries_to_remove = []
|
||||||
lock.acquire()
|
lock.acquire()
|
||||||
for task_id, task in pending_tasks.items():
|
for token, task in pending_tasks.items():
|
||||||
if int(utils[task['gpus'][0]]) < 60 and counter[task['gpus'][0]] >= 2 \
|
if int(utils[task['gpus'][0]]) < 10 and counter[task['gpus'][0]] >= 2 \
|
||||||
and mems[task['gpus'][0]] > task['gpu_mem']:
|
and mem_frees[task['gpus'][0]] > task['gpu_mem']:
|
||||||
entries_to_remove.append(task_id)
|
entries_to_remove.append(token)
|
||||||
|
|
||||||
t = Thread(target=launch_task_in_background, name='launch_task', args=(container, task_id,))
|
|
||||||
t.start()
|
|
||||||
|
|
||||||
for k in entries_to_remove:
|
for k in entries_to_remove:
|
||||||
pending_tasks.pop(k, None)
|
pending_tasks.pop(k, None)
|
||||||
@ -87,6 +74,33 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
self.end_headers()
|
self.end_headers()
|
||||||
self.wfile.write(bytes("pong", "utf-8"))
|
self.wfile.write(bytes("pong", "utf-8"))
|
||||||
|
|
||||||
|
if req.path == "/debug":
|
||||||
|
msg = {
|
||||||
|
'pending_tasks': pending_tasks,
|
||||||
|
'id2token': id2token
|
||||||
|
}
|
||||||
|
self.send_response(200)
|
||||||
|
self.send_header('Content-type', 'application/json')
|
||||||
|
self.end_headers()
|
||||||
|
self.wfile.write(bytes(json.dumps(msg), "utf-8"))
|
||||||
|
|
||||||
|
if req.path == "/can_run":
|
||||||
|
res = "1"
|
||||||
|
try:
|
||||||
|
token = query.get('token')[0]
|
||||||
|
for i in range(0, 50):
|
||||||
|
if token in pending_tasks:
|
||||||
|
res = "0"
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
time.sleep(0.1)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
self.send_response(200)
|
||||||
|
self.send_header('Content-type', 'application/json')
|
||||||
|
self.end_headers()
|
||||||
|
self.wfile.write(bytes(res, "utf-8"))
|
||||||
|
|
||||||
elif req.path == "/logs":
|
elif req.path == "/logs":
|
||||||
try:
|
try:
|
||||||
container_id = query.get('id')[0]
|
container_id = query.get('id')[0]
|
||||||
@ -118,8 +132,10 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
'hostname': container.attrs['Config']['Hostname'],
|
'hostname': container.attrs['Config']['Hostname'],
|
||||||
'state': container.attrs['State']
|
'state': container.attrs['State']
|
||||||
}
|
}
|
||||||
# if container_id in pending_tasks:
|
if container_id in id2token:
|
||||||
# status['status'] = 'ready'
|
token = id2token[container_id]
|
||||||
|
if token in pending_tasks:
|
||||||
|
status['status'] = 'ready'
|
||||||
if status['command'] is not None:
|
if status['command'] is not None:
|
||||||
status['command'] = ' '.join(container.attrs['Config']['Cmd'])
|
status['command'] = ' '.join(container.attrs['Config']['Cmd'])
|
||||||
msg = {'code': 0, 'status': status}
|
msg = {'code': 0, 'status': status}
|
||||||
@ -158,6 +174,7 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
docker_hdfs_address = form.getvalue('hdfs_address')
|
docker_hdfs_address = form.getvalue('hdfs_address')
|
||||||
docker_hdfs_dir = form.getvalue('hdfs_dir')
|
docker_hdfs_dir = form.getvalue('hdfs_dir')
|
||||||
docker_gpu_mem = form.getvalue('gpu_mem')
|
docker_gpu_mem = form.getvalue('gpu_mem')
|
||||||
|
token = generate_token(16)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
script = " ".join([
|
script = " ".join([
|
||||||
@ -171,6 +188,7 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
"--cpus " + docker_cpu_limit,
|
"--cpus " + docker_cpu_limit,
|
||||||
"--env repo=" + docker_workspace,
|
"--env repo=" + docker_workspace,
|
||||||
"--env should_wait=" + docker_wait,
|
"--env should_wait=" + docker_wait,
|
||||||
|
"--env should_cb=" + 'http://' + ClientExtHost + ':' + PORT + '/can_run?token=' + token,
|
||||||
"--env output_dir=" + docker_output,
|
"--env output_dir=" + docker_output,
|
||||||
"--env hdfs_address=" + docker_hdfs_address,
|
"--env hdfs_address=" + docker_hdfs_address,
|
||||||
"--env hdfs_dir=" + docker_hdfs_dir,
|
"--env hdfs_dir=" + docker_hdfs_dir,
|
||||||
@ -185,7 +203,8 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
msg = {"code": 0, "id": output.decode('utf-8').rstrip('\n')}
|
msg = {"code": 0, "id": output.decode('utf-8').rstrip('\n')}
|
||||||
|
|
||||||
lock.acquire()
|
lock.acquire()
|
||||||
pending_tasks[msg['id']] = {'gpus': str(docker_gpus).split(','), 'gpu_mem': int(docker_gpu_mem)}
|
pending_tasks[token] = {'gpus': str(docker_gpus).split(','), 'gpu_mem': int(docker_gpu_mem)}
|
||||||
|
id2token[msg['id']] = token
|
||||||
lock.release()
|
lock.release()
|
||||||
if exit_code != 0:
|
if exit_code != 0:
|
||||||
msg["code"] = 1
|
msg["code"] = 1
|
||||||
@ -249,9 +268,14 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
self.send_error(404, 'File Not Found: %s' % self.path)
|
self.send_error(404, 'File Not Found: %s' % self.path)
|
||||||
|
|
||||||
|
|
||||||
|
def event_trigger():
|
||||||
|
client = docker.from_env()
|
||||||
|
for event in client.events(decode=True, filters={'event': 'die'}):
|
||||||
|
Thread(target=report).start()
|
||||||
|
print(event)
|
||||||
|
|
||||||
|
|
||||||
def report():
|
def report():
|
||||||
interval = 3
|
|
||||||
while True:
|
|
||||||
try:
|
try:
|
||||||
status, msg_gpu = execute(['nvidia-smi', '-q', '-x', '-f', 'status.xml'])
|
status, msg_gpu = execute(['nvidia-smi', '-q', '-x', '-f', 'status.xml'])
|
||||||
if not status:
|
if not status:
|
||||||
@ -260,10 +284,14 @@ def report():
|
|||||||
report_msg(stats)
|
report_msg(stats)
|
||||||
t = Thread(target=launch_tasks, name='launch_tasks', args=(stats,))
|
t = Thread(target=launch_tasks, name='launch_tasks', args=(stats,))
|
||||||
t.start()
|
t.start()
|
||||||
time.sleep(interval)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
time.sleep(interval)
|
|
||||||
|
|
||||||
|
def reporter():
|
||||||
|
while True:
|
||||||
|
report()
|
||||||
|
time.sleep(HeartbeatInterval)
|
||||||
|
|
||||||
|
|
||||||
def execute(cmd):
|
def execute(cmd):
|
||||||
@ -315,10 +343,7 @@ def get_gpu_status():
|
|||||||
|
|
||||||
|
|
||||||
def report_msg(stats):
|
def report_msg(stats):
|
||||||
global last_version
|
|
||||||
global ver
|
|
||||||
mem = psutil.virtual_memory()
|
mem = psutil.virtual_memory()
|
||||||
|
|
||||||
post_fields = {
|
post_fields = {
|
||||||
'id': ClientID,
|
'id': ClientID,
|
||||||
'host': ClientHost,
|
'host': ClientHost,
|
||||||
@ -326,59 +351,29 @@ def report_msg(stats):
|
|||||||
'cpu_num': multiprocessing.cpu_count(),
|
'cpu_num': multiprocessing.cpu_count(),
|
||||||
'cpu_load': os.getloadavg()[0],
|
'cpu_load': os.getloadavg()[0],
|
||||||
'mem_total': math.floor(mem.total / (1024. ** 3)),
|
'mem_total': math.floor(mem.total / (1024. ** 3)),
|
||||||
'mem_available': math.floor(mem.available / (1024. ** 3))
|
'mem_available': math.floor(mem.available / (1024. ** 3)),
|
||||||
|
'version': time.time()
|
||||||
}
|
}
|
||||||
|
|
||||||
flag = False
|
|
||||||
if 'cpu_num' in last_version: # not null
|
|
||||||
if abs(last_version['cpu_num'] - post_fields['cpu_num']) > 0.0:
|
|
||||||
flag = True
|
|
||||||
if abs(last_version['cpu_load'] - post_fields['cpu_load']) / post_fields['cpu_num'] > 0.1:
|
|
||||||
flag = True
|
|
||||||
if abs(last_version['mem_total'] - post_fields['mem_total']) > 0.0:
|
|
||||||
flag = True
|
|
||||||
if abs(last_version['mem_available'] - post_fields['mem_available']) / post_fields['mem_available'] > 0.05:
|
|
||||||
flag = True
|
|
||||||
for i in range(len(stats)):
|
|
||||||
if abs(last_version['status'][i]['memory_total'] - post_fields['status'][i]['memory_total']) > 0.0:
|
|
||||||
flag = True
|
|
||||||
if abs(last_version['status'][i]['memory_free'] - post_fields['status'][i]['memory_free']) / \
|
|
||||||
post_fields['status'][i]['memory_total'] > 0.05:
|
|
||||||
flag = True
|
|
||||||
if abs(last_version['status'][i]['utilization_gpu'] - post_fields['status'][i]['utilization_gpu']) > 25.0:
|
|
||||||
flag = True
|
|
||||||
else:
|
|
||||||
flag = True
|
|
||||||
|
|
||||||
if flag:
|
|
||||||
ver = time.time()
|
|
||||||
last_version = post_fields
|
|
||||||
|
|
||||||
post_fields['version'] = ver
|
|
||||||
data = json.dumps(post_fields)
|
data = json.dumps(post_fields)
|
||||||
|
|
||||||
if flag:
|
|
||||||
print(ver)
|
|
||||||
print(post_fields)
|
|
||||||
|
|
||||||
producer = KafkaProducer(bootstrap_servers=KafkaBrokers)
|
producer = KafkaProducer(bootstrap_servers=KafkaBrokers)
|
||||||
future = producer.send('yao', value=data.encode(), partition=0)
|
future = producer.send('yao', value=data.encode(), partition=0)
|
||||||
result = future.get(timeout=10)
|
result = future.get(timeout=5)
|
||||||
# print(result)
|
|
||||||
|
|
||||||
|
|
||||||
def listener():
|
def listener():
|
||||||
|
global server
|
||||||
try:
|
try:
|
||||||
# Create a web server and define the handler to manage the
|
# Create a web server and define the handler to manage the
|
||||||
# incoming request
|
# incoming request
|
||||||
server = HTTPServer(('', PORT_NUMBER), MyHandler)
|
server = HTTPServer(('', PORT), MyHandler)
|
||||||
print('Started http server on port ', PORT_NUMBER)
|
print('Started http server on port ', PORT)
|
||||||
|
|
||||||
# Wait forever for incoming http requests
|
# Wait forever for incoming http requests
|
||||||
server.serve_forever()
|
server.serve_forever()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print('^C received, shutting down the web server')
|
print('^C received, shutting down the web server')
|
||||||
|
|
||||||
server.socket.close()
|
server.socket.close()
|
||||||
|
|
||||||
|
|
||||||
@ -386,10 +381,12 @@ if __name__ == '__main__':
|
|||||||
os.environ["TZ"] = 'Asia/Shanghai'
|
os.environ["TZ"] = 'Asia/Shanghai'
|
||||||
if hasattr(time, 'tzset'):
|
if hasattr(time, 'tzset'):
|
||||||
time.tzset()
|
time.tzset()
|
||||||
t1 = Thread(target=report)
|
t1 = Thread(target=reporter)
|
||||||
t2 = Thread(target=listener)
|
t2 = Thread(target=listener)
|
||||||
|
t3 = Thread(target=event_trigger)
|
||||||
t1.start()
|
t1.start()
|
||||||
t2.start()
|
t2.start()
|
||||||
while True:
|
t3.start()
|
||||||
time.sleep(5)
|
t1.join()
|
||||||
pass
|
t2.join()
|
||||||
|
t3.join()
|
||||||
|
39
mock.py
39
mock.py
@ -10,8 +10,8 @@ NUMS = os.getenv('NUMS', 1)
|
|||||||
|
|
||||||
ClientHost = os.getenv('ClientHost', "localhost")
|
ClientHost = os.getenv('ClientHost', "localhost")
|
||||||
KafkaBrokers = os.getenv('KafkaBrokers', 'localhost:9092').split(',')
|
KafkaBrokers = os.getenv('KafkaBrokers', 'localhost:9092').split(',')
|
||||||
|
PORT = os.getenv('Port', 8000)
|
||||||
PORT_NUMBER = 8000
|
HeartbeatInterval = os.getenv('HeartbeatInterval', 5)
|
||||||
|
|
||||||
|
|
||||||
class MyHandler(BaseHTTPRequestHandler):
|
class MyHandler(BaseHTTPRequestHandler):
|
||||||
@ -85,7 +85,6 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
|
|
||||||
|
|
||||||
def report(ClientID):
|
def report(ClientID):
|
||||||
interval = 3
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
stats = []
|
stats = []
|
||||||
@ -119,21 +118,21 @@ def report(ClientID):
|
|||||||
|
|
||||||
producer = KafkaProducer(bootstrap_servers=KafkaBrokers)
|
producer = KafkaProducer(bootstrap_servers=KafkaBrokers)
|
||||||
future = producer.send('yao', value=data.encode(), partition=0)
|
future = producer.send('yao', value=data.encode(), partition=0)
|
||||||
result = future.get(timeout=10)
|
result = future.get(timeout=5)
|
||||||
# print(result)
|
|
||||||
|
|
||||||
time.sleep(interval)
|
time.sleep(HeartbeatInterval)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
time.sleep(interval)
|
time.sleep(HeartbeatInterval)
|
||||||
|
|
||||||
|
|
||||||
def listener():
|
def listener():
|
||||||
|
global server
|
||||||
try:
|
try:
|
||||||
# Create a web server and define the handler to manage the
|
# Create a web server and define the handler to manage the
|
||||||
# incoming request
|
# incoming request
|
||||||
server = HTTPServer(('', PORT_NUMBER), MyHandler)
|
server = HTTPServer(('', PORT), MyHandler)
|
||||||
print('Started http server on port ', PORT_NUMBER)
|
print('Started http server on port ', PORT)
|
||||||
|
|
||||||
# Wait forever for incoming http requests
|
# Wait forever for incoming http requests
|
||||||
server.serve_forever()
|
server.serve_forever()
|
||||||
@ -147,14 +146,18 @@ if __name__ == '__main__':
|
|||||||
os.environ["TZ"] = 'Asia/Shanghai'
|
os.environ["TZ"] = 'Asia/Shanghai'
|
||||||
if hasattr(time, 'tzset'):
|
if hasattr(time, 'tzset'):
|
||||||
time.tzset()
|
time.tzset()
|
||||||
t1 = Thread(target=report)
|
threads = []
|
||||||
|
for clientID in range(0, int(NUMS)):
|
||||||
for i in range(0, int(NUMS)):
|
t = Thread(target=report, name=ClientHost + '_' + str(clientID), args=(ClientHost + '_' + str(clientID),))
|
||||||
t = Thread(target=report, name=ClientHost + '_' + str(i), args=(ClientHost + '_' + str(i),))
|
threads.append(t)
|
||||||
t.start()
|
|
||||||
|
|
||||||
t2 = Thread(target=listener)
|
t2 = Thread(target=listener)
|
||||||
t2.start()
|
threads.append(t2)
|
||||||
while True:
|
|
||||||
time.sleep(5)
|
# Start all threads
|
||||||
pass
|
for t in threads:
|
||||||
|
t.start()
|
||||||
|
|
||||||
|
# Wait for all of them to finish
|
||||||
|
for t in threads:
|
||||||
|
t.join()
|
||||||
|
Loading…
Reference in New Issue
Block a user