diff --git a/src/evaluator.go b/src/evaluator.go index 1381603..c7106ed 100644 --- a/src/evaluator.go +++ b/src/evaluator.go @@ -66,11 +66,6 @@ func (eva *Evaluator) add(node NodeStatus, task Task) { eva.totalWorker++ } - if task.IsPS { - //eva.costLoad += 1 - } else { - //eva.costLoad += 0.5 - } numberGPU := 1 for _, gpu := range node.Status { if gpu.MemoryAllocated != 0 { @@ -104,11 +99,6 @@ func (eva *Evaluator) remove(node NodeStatus, task Task) { eva.totalWorker-- } - if task.IsPS { - //eva.costLoad -= 1 - } else { - //eva.costLoad -= 0.5 - } numberGPU := 1 for _, gpu := range node.Status { if gpu.MemoryAllocated != 0 { diff --git a/src/job_manager.go b/src/job_manager.go index ac98a4b..57421f5 100644 --- a/src/job_manager.go +++ b/src/job_manager.go @@ -80,7 +80,7 @@ func (jm *JobManager) start() { v.Set("mem_limit", strconv.Itoa(jm.job.Tasks[index].Memory)+"m") v.Set("cpu_limit", strconv.Itoa(jm.job.Tasks[index].NumberCPU)) v.Set("network", jm.network) - v.Set("should_wait", "1") + v.Set("should_wait", "0") v.Set("output_dir", "/tmp/") v.Set("hdfs_address", "http://192.168.100.104:50070/") v.Set("hdfs_dir", "/user/yao/output/"+jm.job.Name) diff --git a/src/resource_pool.go b/src/resource_pool.go index 9da08fd..22d6bb5 100644 --- a/src/resource_pool.go +++ b/src/resource_pool.go @@ -896,7 +896,12 @@ func (pool *ResourcePool) releaseResource(job Job, agent NodeStatus) { seg.Lock.Lock() defer seg.Lock.Unlock() - node := seg.Nodes[agent.ClientID] + node, ok := seg.Nodes[agent.ClientID] + if !ok { + /* in case node is offline */ + /* TODO, update usingTotalGPU correctly */ + return + } for _, gpu := range agent.Status { for j := range node.Status { if gpu.UUID == node.Status[j].UUID {