1
0
mirror of https://github.com/newnius/YAO-agent.git synced 2025-06-07 13:51:56 +00:00

update version based heart beats

This commit is contained in:
Newnius 2020-03-27 17:32:01 +08:00
parent b94e13d3e3
commit 6229798726

View File

@ -39,7 +39,7 @@ def launch_tasks(stats):
entries_to_remove = [] entries_to_remove = []
lock.acquire() lock.acquire()
for task_id, task in pending_tasks.items(): for task_id, task in pending_tasks.items():
if int(utils[task['gpus'][0]]) < 85: if int(utils[task['gpus'][0]]) < 70:
entries_to_remove.append(task_id) entries_to_remove.append(task_id)
script = " ".join([ script = " ".join([
"docker exec", "docker exec",
@ -307,12 +307,14 @@ def report_msg(stats):
flag = True flag = True
if abs(last_version['mem_available'] - post_fields['mem_available']) > 2.0: if abs(last_version['mem_available'] - post_fields['mem_available']) > 2.0:
flag = True flag = True
for stat in stats: for i in range(len(stats)):
if abs(last_version['status']['memory_total'] - stat['memory_total']) > 0.0: if abs(last_version['status'][i]['memory_total'] - post_fields['status'][i]['memory_total']) > 0.0:
flag = True flag = True
if abs(last_version['status']['memory_free'] - stat['memory_free']) > 512.0: if abs(last_version['status'][i]['memory_free'] - post_fields['status'][i]['memory_free']) > 512.0:
flag = True flag = True
if abs(last_version['status']['utilization_gpu'] - stat['utilization_gpu']) > 15.0: if abs(last_version['status'][i]['utilization_gpu'] - post_fields['status'][i]['utilization_gpu']) > 15.0:
flag = True
else:
flag = True flag = True
if flag: if flag:
@ -322,6 +324,10 @@ def report_msg(stats):
post_fields['flag'] = ver post_fields['flag'] = ver
data = json.dumps(post_fields) data = json.dumps(post_fields)
if flag:
print(ver)
print(post_fields)
producer = KafkaProducer(bootstrap_servers=KafkaBrokers) producer = KafkaProducer(bootstrap_servers=KafkaBrokers)
future = producer.send('yao', value=data.encode(), partition=0) future = producer.send('yao', value=data.encode(), partition=0)
result = future.get(timeout=10) result = future.get(timeout=10)