diff --git a/src/job_manager.go b/src/job_manager.go index c9bcacb..f23a5d1 100644 --- a/src/job_manager.go +++ b/src/job_manager.go @@ -58,7 +58,7 @@ func (jm *JobManager) start() { log.Info("return resource ", tt.ClientID) jm.resources[i].ClientID = "null" for _, t := range tt.Status { - jm.scheduler.Detach(t.UUID, jm.job.Name) + jm.scheduler.Detach(t.UUID, jm.job) } } } @@ -170,7 +170,7 @@ func (jm *JobManager) start() { InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, res.Status[i]) } else { log.Info(jm.job.Name, "-", i, " ", res.Status[i].Status) - if exitCode, ok := res.Status[i].State["ExitCode"].(float64); ok { + if exitCode, ok := res.Status[i].State["ExitCode"].(float64); ok && !jm.job.Tasks[i].IsPS { if exitCode != 0 && !jm.killedFlag { log.Warn(jm.job.Name+"-"+jm.job.Tasks[i].Name+" exited unexpected, exitCode=", exitCode) jm.killedFlag = true @@ -195,7 +195,7 @@ func (jm *JobManager) start() { jm.resources[i].ClientID = "null" for _, t := range jm.resources[i].Status { - jm.scheduler.Detach(t.UUID, jm.job.Name) + jm.scheduler.Detach(t.UUID, jm.job) } InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, res.Status[i]) diff --git a/src/resource_pool.go b/src/resource_pool.go index a4253f4..98a046a 100644 --- a/src/resource_pool.go +++ b/src/resource_pool.go @@ -102,12 +102,12 @@ func (pool *ResourcePool) checkDeadNodes() { } pool.TotalGPUMu.Lock() + seg.Lock.Lock() if _, ok := seg.Nodes[k]; ok { pool.TotalGPU -= len(seg.Nodes[k].Status) } pool.TotalGPUMu.Unlock() - seg.Lock.Lock() delete(seg.Nodes, k) seg.Lock.Unlock() pool.versionsMu.Lock() @@ -361,7 +361,6 @@ func (pool *ResourcePool) list() MsgResource { start := pool.pools[0].Next for cur := start; ; { - log.Info(cur.ID) cur.Lock.Lock() for k, node := range cur.Nodes { nodes[k] = *node @@ -441,20 +440,20 @@ func (pool *ResourcePool) attach(GPU string, job string) { } } -func (pool *ResourcePool) detach(GPU string, jobName string) { +func (pool *ResourcePool) detach(GPU string, job Job) { pool.bindingsMu.Lock() defer pool.bindingsMu.Unlock() if _, ok := pool.bindings[GPU]; ok { if _, ok2 := pool.utils[GPU]; ok2 { - if len(pool.bindings[GPU]) == 1 { - InstanceOfOptimizer().feed(jobName, pool.utils[GPU]) + if len(pool.bindings[GPU]) == 1 && job.Status == Finished { + InstanceOfOptimizer().feed(job.Name, pool.utils[GPU]) } delete(pool.utils, GPU) } } if list, ok := pool.bindings[GPU]; ok { - delete(list, jobName) + delete(list, job.Name) } } diff --git a/src/scheduler.go b/src/scheduler.go index c5a0269..0b8e475 100644 --- a/src/scheduler.go +++ b/src/scheduler.go @@ -27,7 +27,7 @@ type Scheduler interface { Attach(GPU string, job string) - Detach(GPU string, job string) + Detach(GPU string, job Job) Enable() bool diff --git a/src/scheduler_FCFS.go b/src/scheduler_FCFS.go index 338a6a1..b4e2214 100644 --- a/src/scheduler_FCFS.go +++ b/src/scheduler_FCFS.go @@ -253,7 +253,7 @@ func (scheduler *SchedulerFCFS) Attach(GPU string, job string) { pool.attach(GPU, job) } -func (scheduler *SchedulerFCFS) Detach(GPU string, job string) { +func (scheduler *SchedulerFCFS) Detach(GPU string, job Job) { pool.detach(GPU, job) } diff --git a/src/scheduler_fair.go b/src/scheduler_fair.go index c5c5e03..880d18d 100644 --- a/src/scheduler_fair.go +++ b/src/scheduler_fair.go @@ -181,8 +181,8 @@ func (scheduler *SchedulerFair) Start() { if len(t) == 0 || !InstanceOfGroupManager().groups[t[0].Group].Reserved { continue } - log.Info(scheduler.queueUsingGPU) - log.Info(scheduler.queuesSchedulingCnt) + //log.Info(scheduler.queueUsingGPU) + //log.Info(scheduler.queuesSchedulingCnt) scheduler.queuesUsingGPUMu.Lock() if cnt, ok := scheduler.queuesSchedulingCnt[t[0].Group]; ok && cnt > 0 { scheduler.queuesUsingGPUMu.Unlock() @@ -495,7 +495,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node if len(candidates) > 0 { log.Info("allocationType is ", allocationType) - log.Info(candidates) + //log.Info(candidates) } /* assign */ @@ -586,7 +586,7 @@ func (scheduler *SchedulerFair) ReleaseResource(job Job, agent NodeStatus) { scheduler.UsingGPUMu.Unlock() log.Info(node.Status[j].UUID, " is released") } - log.Info(node.Status[j].MemoryAllocated) + //log.Info(node.Status[j].MemoryAllocated) } } } @@ -801,7 +801,7 @@ func (scheduler *SchedulerFair) Attach(GPU string, job string) { pool.attach(GPU, job) } -func (scheduler *SchedulerFair) Detach(GPU string, job string) { +func (scheduler *SchedulerFair) Detach(GPU string, job Job) { pool.detach(GPU, job) } diff --git a/src/scheduler_priority.go b/src/scheduler_priority.go index 7367517..4cbb025 100644 --- a/src/scheduler_priority.go +++ b/src/scheduler_priority.go @@ -277,7 +277,7 @@ func (scheduler *SchedulerPriority) Attach(GPU string, job string) { pool.attach(GPU, job) } -func (scheduler *SchedulerPriority) Detach(GPU string, job string) { +func (scheduler *SchedulerPriority) Detach(GPU string, job Job) { pool.detach(GPU, job) }