diff --git a/src/resource_pool.go b/src/resource_pool.go index 49fb6c7..c34dc35 100644 --- a/src/resource_pool.go +++ b/src/resource_pool.go @@ -911,6 +911,7 @@ func (pool *ResourcePool) releaseResource(job Job, agent NodeStatus) { /* in case node is offline */ if !ok { /* TODO, update usingTotalGPU correctly */ + log.Warn("node ", agent.ClientID, " not present") return } for _, gpu := range agent.Status { diff --git a/src/scheduler_capacity.go b/src/scheduler_capacity.go index 8cc9a6a..afb61e3 100644 --- a/src/scheduler_capacity.go +++ b/src/scheduler_capacity.go @@ -250,19 +250,20 @@ func (scheduler *SchedulerCapacity) AcquireResource(job Job) []NodeStatus { func (scheduler *SchedulerCapacity) ReleaseResource(job Job, agent NodeStatus) { InstanceOfResourcePool().releaseResource(job, agent) + + scheduler.resourceAllocationsMu.Lock() + if _, ok := scheduler.resourceAllocations[job.Group]; !ok { + scheduler.resourceAllocations[job.Group] = &ResourceCount{} + } + cnt, _ := scheduler.resourceAllocations[job.Group] + cnt.CPU -= agent.MemTotal + cnt.Memory -= agent.NumCPU + for _, v := range agent.Status { + cnt.NumberGPU -- + cnt.MemoryGPU -= v.MemoryTotal + } + scheduler.resourceAllocationsMu.Unlock() go func(res NodeStatus) { - scheduler.resourceAllocationsMu.Lock() - if _, ok := scheduler.resourceAllocations[job.Group]; !ok { - scheduler.resourceAllocations[job.Group] = &ResourceCount{} - } - cnt, _ := scheduler.resourceAllocations[job.Group] - cnt.CPU -= res.MemTotal - cnt.Memory -= res.NumCPU - for _, v := range res.Status { - cnt.NumberGPU -- - cnt.MemoryGPU -= v.MemoryTotal - } - scheduler.resourceAllocationsMu.Unlock() scheduler.UpdateNextQueue() }(agent) } diff --git a/src/scheduler_fair.go b/src/scheduler_fair.go index ffe0d26..4ee9895 100644 --- a/src/scheduler_fair.go +++ b/src/scheduler_fair.go @@ -276,17 +276,17 @@ func (scheduler *SchedulerFair) AcquireResource(job Job) []NodeStatus { func (scheduler *SchedulerFair) ReleaseResource(job Job, agent NodeStatus) { InstanceOfResourcePool().releaseResource(job, agent) - go func(res NodeStatus) { - scheduler.resourceAllocationsMu.Lock() - if _, ok := scheduler.resourceAllocations[job.Group]; !ok { - scheduler.resourceAllocations[job.Group] = &ResourceCount{} - } - cnt, _ := scheduler.resourceAllocations[job.Group] - cnt.CPU -= res.NumCPU - cnt.Memory -= res.MemTotal - cnt.NumberGPU -= len(res.Status) - scheduler.resourceAllocationsMu.Unlock() - }(agent) + + scheduler.resourceAllocationsMu.Lock() + if _, ok := scheduler.resourceAllocations[job.Group]; !ok { + scheduler.resourceAllocations[job.Group] = &ResourceCount{} + } + cnt, _ := scheduler.resourceAllocations[job.Group] + cnt.CPU -= agent.NumCPU + cnt.Memory -= agent.MemTotal + cnt.NumberGPU -= len(agent.Status) + scheduler.resourceAllocationsMu.Unlock() + go func() { scheduler.UpdateQuota() }()