update, add gpu stats for contaier

2025-12-15 13:56:44 +00:00 · 2020-06-24 17:10:26 +08:00
parent e614a15474
commit f77465bb8b
2 changed files with 101 additions and 3 deletions
--- a/agent.py
+++ b/agent.py
@@ -45,6 +45,12 @@ client = docker.from_env()
 taskStats = {}
 taskStatsLock = Lock()
 active_stats = {0: {
 	'util': 0,
 	'mem_util': 0,
 	'mem': 0
 }}
 def generate_token(stringLength=8):
 	letters = string.ascii_lowercase
@@ -54,12 +60,22 @@ def generate_token(stringLength=8):
 def monitor_task(container_id):
 	print(container_id)
 	container = client.containers.get(container_id)
 	pid = 0
 	maxCPU = 0
 	maxMem = 0
 	last_bw_rx = 0
 	last_bw_tx = 0
 	last_time = time.time() - 1
 	for statR in container.stats():
 		if pid == 0:
 			res = container.top()['Processes']
 			for x in res:
 				if "/workspace" in x[7] and int(x[1]) in active_stats:
 					pid = int(x[1])
 					break
 		stat = json.loads(statR)
 		# print(stat)
 		if stat['read'] == '0001-01-01T00:00:00Z':
@@ -90,7 +106,16 @@ def monitor_task(container_id):
 		bw_rx /= dur
 		bw_tx /= dur
-		taskStats[container_id] = {'cpu': utilCPU, 'mem': mem, 'bw_rx': bw_rx, 'bw_tx': bw_tx}
+		taskStats[container_id] = {
 			'cpu': utilCPU,
 			'mem': mem,
 			'bw_rx': bw_rx,
 			'bw_tx': bw_tx,
 			'gpu_util': active_stats[pid]['util'],
 			'gpu_mem_util': active_stats[pid]['mem_util'],
 			'gpu_mem': active_stats[pid]['mem'],
 		}
 		# print(taskStats[container_id])
 		# print(utilCPU, mem, maxCPU, maxMem, bw_rx, bw_tx)
 		taskStatsLock.release()
 		if stat['preread'] == '0001-01-01T00:00:00Z':
@@ -206,6 +231,10 @@ class MyHandler(BaseHTTPRequestHandler):
 						status['mem'] = taskStats[container_id]['mem']
 						status['bw_rx'] = taskStats[container_id]['bw_rx']
 						status['bw_tx'] = taskStats[container_id]['bw_tx']
 						status['bw_tx'] = taskStats[container_id]['bw_tx']
 						status['gpu_util'] = taskStats[container_id]['gpu_util']
 						status['gpu_mem_util'] = taskStats[container_id]['gpu_mem_util']
 						status['gpu_mem'] = taskStats[container_id]['gpu_mem']
 						taskStatsLock.release()
 					if container_id in id2token:
 						token = id2token[container_id]
@@ -390,6 +419,29 @@ def reporter():
 		time.sleep(HeartbeatInterval)
 def pmon():
 	while True:
 		try:
 			status, msg_gpu = execute(['nvidia-smi', 'pmon', '-c', '1', '-s', 'um'])
 			if not status:
 				print("[WARN] execute failed, ", msg_gpu, status)
 			lists = msg_gpu.split('\n')
 			for p in lists:
 				if "#" not in p and "-" not in p:
 					tmp = p.split()
 					data = {
 						'idx': int(tmp[0]),
 						'pid': int(tmp[1]),
 						'util': int(tmp[3]),
 						'mem_util': int(tmp[4]),
 						'mem': int(tmp[7])
 					}
 					active_stats[int(tmp[1])] = data
 		except Exception as e:
 			print(e)
 		time.sleep(HeartbeatInterval)
 def execute(cmd):
 	try:
 		result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -482,6 +534,7 @@ if __name__ == '__main__':
 	Thread(target=reporter).start()
 	Thread(target=listener).start()
 	Thread(target=pmon).start()
 	if EnableEventTrigger == 'true':
 		print('start event trigger')
 		Thread(target=event_trigger).start()
--- a/test.py
+++ b/test.py
@@ -1,3 +1,4 @@
 import subprocess
 import docker
@@ -100,11 +101,53 @@ def create_container():
 def exec_run():
 	client = docker.from_env()
 	container = client.containers.get('yao-agent-helper')
-	exit_code, output = container.exec_run(cmd="sh -c 'docker run --gpus all --detach=True tensorflow/tensorflow:1.14.0-gpu nvidia-smi'")
+	exit_code, output = container.exec_run(
 		cmd="sh -c 'docker run --gpus all --detach=True tensorflow/tensorflow:1.14.0-gpu nvidia-smi'")
 	if exit_code == 0:
 		print(output.decode('utf-8').rstrip('\n'))
 def report():
 	try:
 		status, msg_gpu = execute(['nvidia-smi', 'pmon', '-c', '1', '-s', 'um'])
 		if not status:
 			print("execute failed, ", msg_gpu, status)
 		lists = msg_gpu.split('\n')
 		for p in lists:
 			if "#" not in p and "-" not in p:
 				tmp = p.split()
 				data = {
 					'idx': int(tmp[0]),
 					'pid': int(tmp[1]),
 					'util': int(tmp[3]),
 					'mem_util': int(tmp[4]),
 					'mem': int(tmp[7])
 				}
 				print(data)
 	except Exception as e:
 		print(e)
 def execute(cmd):
 	try:
 		result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 		if result.returncode == 0:
 			return True, result.stdout.decode('utf-8').rstrip('\n')
 		return False, result.stderr.decode('utf-8').rstrip('\n')
 	except Exception as e:
 		return False, e
 def getPID(container_id):
 	client = docker.from_env()
 	container = client.containers.get(container_id)
 	res = container.top()['Processes']
 	for x in res:
 		if "/workspace" in x[7]:
 			print(res[1])
 			break
 # create_network()
 # list_networks()
@@ -112,4 +155,6 @@ def exec_run():
 # get_status('af121babda9b')
 # exec_run()
 # run()
-create_container()
+# create_container()
 # report()
 getPID('a6543cef3c85')