From ff0c10647b8c38cd8c4887739b056f4cb2ef2703 Mon Sep 17 00:00:00 2001
From: Newnius
Date: Thu, 30 Apr 2020 18:39:47 +0800
Subject: [PATCH] record binding timestamps and refine pre-schedule GPU
 selection

Replace the boolean GPU-job binding map with one that records the Unix
timestamp of each attach, make the share/pre-schedule ratio checks
inclusive (>= instead of >), and rework the third scheduling round:
pre-scheduling now applies only to single-GPU tasks with a valid time
prediction, and a GPU becomes a candidate when its single bound job is
predicted to release it in time and enough GPU memory is free.
---
 src/resource_pool.go  | 10 +++++-----
 src/scheduler_fair.go | 24 +++++++++++++++++++-----
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/resource_pool.go b/src/resource_pool.go
index bd4b066..fd3638f 100644
--- a/src/resource_pool.go
+++ b/src/resource_pool.go
@@ -32,7 +32,7 @@ type ResourcePool struct {
 	counter      int
 	counterTotal int
 
-	bindings   map[string]map[string]bool
+	bindings   map[string]map[string]int
 	bindingsMu sync.Mutex
 
 	utils map[string][]int
@@ -59,7 +59,7 @@ func (pool *ResourcePool) start() {
 	pool.networksFree = map[string]bool{}
 	pool.versions = map[string]float64{}
 
-	pool.bindings = map[string]map[string]bool{}
+	pool.bindings = map[string]map[string]int{}
 	pool.utils = map[string][]int{}
 
 	pool.TotalGPU = 0
@@ -270,9 +270,9 @@ func (pool *ResourcePool) attach(GPU string, job string) {
 	pool.bindingsMu.Lock()
 	defer pool.bindingsMu.Unlock()
 	if _, ok := pool.bindings[GPU]; !ok {
-		pool.bindings[GPU] = map[string]bool{}
+		pool.bindings[GPU] = map[string]int{}
 	}
-	pool.bindings[GPU][job] = true
+	pool.bindings[GPU][job] = int(time.Now().Unix())
 
 	if _, ok := pool.utils[GPU]; !ok {
 		pool.utils[GPU] = []int{}
@@ -294,6 +294,6 @@ func (pool *ResourcePool) detach(GPU string, jobName string) {
 	}
 }
 
-func (pool *ResourcePool) getBindings() map[string]map[string]bool {
+func (pool *ResourcePool) getBindings() map[string]map[string]int {
 	return pool.bindings
 }
diff --git a/src/scheduler_fair.go b/src/scheduler_fair.go
index 4b1e8d3..eb1a630 100644
--- a/src/scheduler_fair.go
+++ b/src/scheduler_fair.go
@@ -205,7 +205,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
 	var candidates []NodeStatus
 
 	/* first, choose sharable GPUs */
-	if scheduler.enableShare && (pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) > scheduler.enableShareRatio) {
+	if scheduler.enableShare && (pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) >= scheduler.enableShareRatio) {
 		// check sharable
 		allocationType = 1
 		if util, valid := InstanceOfOptimizer().predictUtilGPU(job.Name); valid {
@@ -279,8 +279,9 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
 	}
 
 	/* third round, find gpu to be released */
-	if len(candidates) == 0 && len(job.Tasks) == 1 && scheduler.enablePreSchedule {
-		if pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) > scheduler.enablePreScheduleRatio {
+	if len(candidates) == 0 && len(job.Tasks) == 1 && task.NumberGPU == 1 && scheduler.enablePreSchedule {
+		estimate, valid := InstanceOfOptimizer().predictTime(job.Name)
+		if pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) >= scheduler.enablePreScheduleRatio && valid {
 			allocationType = 3
 			for i := 0; i < pool.poolsCount; i++ {
 				pool.poolsMu[(i+poolID)%pool.poolsCount].Lock()
@@ -288,8 +289,21 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
 				for _, node := range pool.pools[(i+poolID)%pool.poolsCount] {
 					var available []GPUStatus
 					for _, status := range node.Status {
-						if status.MemoryAllocated == 0 && status.MemoryUsed < 10 {
-							available = append(available, status)
+						bindings := pool.getBindings()
+						if tasks, ok := bindings[status.UUID]; ok {
+							if len(tasks) > 1 {
+								continue
+							}
+							for task_t, s := range tasks {
+								est, valid2 := InstanceOfOptimizer().predictTime(task_t)
+								if valid2 {
+									t := s
+									now := int(time.Now().Unix())
+									if now-t > est.Total-est.Post-estimate.Pre && status.MemoryFree > task.MemoryGPU {
+										available = append(available, status)
+									}
+								}
+							}
 						}
 					}
 					if len(available) >= task.NumberGPU {
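
Editorial note, not part of the patch: the core of the new third-round
condition is easier to read in isolation. The sketch below is a minimal,
hypothetical restatement of the timing check; the Estimate fields
(Total, Pre, Post) mirror what InstanceOfOptimizer().predictTime()
returns in this codebase, but the releasableInTime helper does not
exist in the repository, and the patch additionally requires enough
free GPU memory (status.MemoryFree > task.MemoryGPU).

package main

import (
	"fmt"
	"time"
)

// Estimate mirrors the predicted phase durations (in seconds) used by
// the patch: total runtime plus the pre- and post-processing phases.
type Estimate struct {
	Total, Pre, Post int
}

// releasableInTime reports whether a GPU bound to a running job is
// expected to become free before an incoming job actually needs it.
// attachedAt is the Unix timestamp now stored in pool.bindings by
// attach(); running/incoming are the two jobs' predicted times.
func releasableInTime(attachedAt int, running, incoming Estimate) bool {
	elapsed := int(time.Now().Unix()) - attachedAt
	// The running job should release the GPU about (Total - Post)
	// seconds after attach, while the incoming job needs the GPU only
	// after its Pre phase; pre-schedule once the elapsed time exceeds
	// the gap between those two points.
	return elapsed > running.Total-running.Post-incoming.Pre
}

func main() {
	running := Estimate{Total: 600, Pre: 60, Post: 30}
	incoming := Estimate{Total: 300, Pre: 45, Post: 20}
	attachedAt := int(time.Now().Unix()) - 540 // attached 9 minutes ago
	fmt.Println(releasableInTime(attachedAt, running, incoming)) // true: 540 > 600-30-45
}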