mirror of
https://github.com/newnius/YAO-agent.git
synced 2025-06-06 05:21:55 +00:00
update, add gpu stats for contaier
This commit is contained in:
parent
e614a15474
commit
f77465bb8b
55
agent.py
55
agent.py
@ -45,6 +45,12 @@ client = docker.from_env()
|
|||||||
taskStats = {}
|
taskStats = {}
|
||||||
taskStatsLock = Lock()
|
taskStatsLock = Lock()
|
||||||
|
|
||||||
|
active_stats = {0: {
|
||||||
|
'util': 0,
|
||||||
|
'mem_util': 0,
|
||||||
|
'mem': 0
|
||||||
|
}}
|
||||||
|
|
||||||
|
|
||||||
def generate_token(stringLength=8):
|
def generate_token(stringLength=8):
|
||||||
letters = string.ascii_lowercase
|
letters = string.ascii_lowercase
|
||||||
@ -54,12 +60,22 @@ def generate_token(stringLength=8):
|
|||||||
def monitor_task(container_id):
|
def monitor_task(container_id):
|
||||||
print(container_id)
|
print(container_id)
|
||||||
container = client.containers.get(container_id)
|
container = client.containers.get(container_id)
|
||||||
|
|
||||||
|
pid = 0
|
||||||
|
|
||||||
maxCPU = 0
|
maxCPU = 0
|
||||||
maxMem = 0
|
maxMem = 0
|
||||||
last_bw_rx = 0
|
last_bw_rx = 0
|
||||||
last_bw_tx = 0
|
last_bw_tx = 0
|
||||||
last_time = time.time() - 1
|
last_time = time.time() - 1
|
||||||
for statR in container.stats():
|
for statR in container.stats():
|
||||||
|
if pid == 0:
|
||||||
|
res = container.top()['Processes']
|
||||||
|
for x in res:
|
||||||
|
if "/workspace" in x[7] and int(x[1]) in active_stats:
|
||||||
|
pid = int(x[1])
|
||||||
|
break
|
||||||
|
|
||||||
stat = json.loads(statR)
|
stat = json.loads(statR)
|
||||||
# print(stat)
|
# print(stat)
|
||||||
if stat['read'] == '0001-01-01T00:00:00Z':
|
if stat['read'] == '0001-01-01T00:00:00Z':
|
||||||
@ -90,7 +106,16 @@ def monitor_task(container_id):
|
|||||||
bw_rx /= dur
|
bw_rx /= dur
|
||||||
bw_tx /= dur
|
bw_tx /= dur
|
||||||
|
|
||||||
taskStats[container_id] = {'cpu': utilCPU, 'mem': mem, 'bw_rx': bw_rx, 'bw_tx': bw_tx}
|
taskStats[container_id] = {
|
||||||
|
'cpu': utilCPU,
|
||||||
|
'mem': mem,
|
||||||
|
'bw_rx': bw_rx,
|
||||||
|
'bw_tx': bw_tx,
|
||||||
|
'gpu_util': active_stats[pid]['util'],
|
||||||
|
'gpu_mem_util': active_stats[pid]['mem_util'],
|
||||||
|
'gpu_mem': active_stats[pid]['mem'],
|
||||||
|
}
|
||||||
|
# print(taskStats[container_id])
|
||||||
# print(utilCPU, mem, maxCPU, maxMem, bw_rx, bw_tx)
|
# print(utilCPU, mem, maxCPU, maxMem, bw_rx, bw_tx)
|
||||||
taskStatsLock.release()
|
taskStatsLock.release()
|
||||||
if stat['preread'] == '0001-01-01T00:00:00Z':
|
if stat['preread'] == '0001-01-01T00:00:00Z':
|
||||||
@ -206,6 +231,10 @@ class MyHandler(BaseHTTPRequestHandler):
|
|||||||
status['mem'] = taskStats[container_id]['mem']
|
status['mem'] = taskStats[container_id]['mem']
|
||||||
status['bw_rx'] = taskStats[container_id]['bw_rx']
|
status['bw_rx'] = taskStats[container_id]['bw_rx']
|
||||||
status['bw_tx'] = taskStats[container_id]['bw_tx']
|
status['bw_tx'] = taskStats[container_id]['bw_tx']
|
||||||
|
status['bw_tx'] = taskStats[container_id]['bw_tx']
|
||||||
|
status['gpu_util'] = taskStats[container_id]['gpu_util']
|
||||||
|
status['gpu_mem_util'] = taskStats[container_id]['gpu_mem_util']
|
||||||
|
status['gpu_mem'] = taskStats[container_id]['gpu_mem']
|
||||||
taskStatsLock.release()
|
taskStatsLock.release()
|
||||||
if container_id in id2token:
|
if container_id in id2token:
|
||||||
token = id2token[container_id]
|
token = id2token[container_id]
|
||||||
@ -390,6 +419,29 @@ def reporter():
|
|||||||
time.sleep(HeartbeatInterval)
|
time.sleep(HeartbeatInterval)
|
||||||
|
|
||||||
|
|
||||||
|
def pmon():
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
status, msg_gpu = execute(['nvidia-smi', 'pmon', '-c', '1', '-s', 'um'])
|
||||||
|
if not status:
|
||||||
|
print("[WARN] execute failed, ", msg_gpu, status)
|
||||||
|
lists = msg_gpu.split('\n')
|
||||||
|
for p in lists:
|
||||||
|
if "#" not in p and "-" not in p:
|
||||||
|
tmp = p.split()
|
||||||
|
data = {
|
||||||
|
'idx': int(tmp[0]),
|
||||||
|
'pid': int(tmp[1]),
|
||||||
|
'util': int(tmp[3]),
|
||||||
|
'mem_util': int(tmp[4]),
|
||||||
|
'mem': int(tmp[7])
|
||||||
|
}
|
||||||
|
active_stats[int(tmp[1])] = data
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
time.sleep(HeartbeatInterval)
|
||||||
|
|
||||||
|
|
||||||
def execute(cmd):
|
def execute(cmd):
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
@ -482,6 +534,7 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
Thread(target=reporter).start()
|
Thread(target=reporter).start()
|
||||||
Thread(target=listener).start()
|
Thread(target=listener).start()
|
||||||
|
Thread(target=pmon).start()
|
||||||
if EnableEventTrigger == 'true':
|
if EnableEventTrigger == 'true':
|
||||||
print('start event trigger')
|
print('start event trigger')
|
||||||
Thread(target=event_trigger).start()
|
Thread(target=event_trigger).start()
|
||||||
|
49
test.py
49
test.py
@ -1,3 +1,4 @@
|
|||||||
|
import subprocess
|
||||||
import docker
|
import docker
|
||||||
|
|
||||||
|
|
||||||
@ -100,11 +101,53 @@ def create_container():
|
|||||||
def exec_run():
|
def exec_run():
|
||||||
client = docker.from_env()
|
client = docker.from_env()
|
||||||
container = client.containers.get('yao-agent-helper')
|
container = client.containers.get('yao-agent-helper')
|
||||||
exit_code, output = container.exec_run(cmd="sh -c 'docker run --gpus all --detach=True tensorflow/tensorflow:1.14.0-gpu nvidia-smi'")
|
exit_code, output = container.exec_run(
|
||||||
|
cmd="sh -c 'docker run --gpus all --detach=True tensorflow/tensorflow:1.14.0-gpu nvidia-smi'")
|
||||||
if exit_code == 0:
|
if exit_code == 0:
|
||||||
print(output.decode('utf-8').rstrip('\n'))
|
print(output.decode('utf-8').rstrip('\n'))
|
||||||
|
|
||||||
|
|
||||||
|
def report():
|
||||||
|
try:
|
||||||
|
status, msg_gpu = execute(['nvidia-smi', 'pmon', '-c', '1', '-s', 'um'])
|
||||||
|
if not status:
|
||||||
|
print("execute failed, ", msg_gpu, status)
|
||||||
|
lists = msg_gpu.split('\n')
|
||||||
|
for p in lists:
|
||||||
|
if "#" not in p and "-" not in p:
|
||||||
|
tmp = p.split()
|
||||||
|
data = {
|
||||||
|
'idx': int(tmp[0]),
|
||||||
|
'pid': int(tmp[1]),
|
||||||
|
'util': int(tmp[3]),
|
||||||
|
'mem_util': int(tmp[4]),
|
||||||
|
'mem': int(tmp[7])
|
||||||
|
}
|
||||||
|
print(data)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
|
||||||
|
def execute(cmd):
|
||||||
|
try:
|
||||||
|
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
|
if result.returncode == 0:
|
||||||
|
return True, result.stdout.decode('utf-8').rstrip('\n')
|
||||||
|
return False, result.stderr.decode('utf-8').rstrip('\n')
|
||||||
|
except Exception as e:
|
||||||
|
return False, e
|
||||||
|
|
||||||
|
|
||||||
|
def getPID(container_id):
|
||||||
|
client = docker.from_env()
|
||||||
|
container = client.containers.get(container_id)
|
||||||
|
res = container.top()['Processes']
|
||||||
|
for x in res:
|
||||||
|
if "/workspace" in x[7]:
|
||||||
|
print(res[1])
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
# create_network()
|
# create_network()
|
||||||
# list_networks()
|
# list_networks()
|
||||||
|
|
||||||
@ -112,4 +155,6 @@ def exec_run():
|
|||||||
# get_status('af121babda9b')
|
# get_status('af121babda9b')
|
||||||
# exec_run()
|
# exec_run()
|
||||||
# run()
|
# run()
|
||||||
create_container()
|
# create_container()
|
||||||
|
# report()
|
||||||
|
getPID('a6543cef3c85')
|
||||||
|
Loading…
Reference in New Issue
Block a user