diff --git a/src/optimizer.go b/src/optimizer.go
index c9a8f70..1688224 100644
--- a/src/optimizer.go
+++ b/src/optimizer.go
@@ -33,7 +33,7 @@ func InstanceOfOptimizer() *Optimizer {
 	return optimizerInstance
 }
 
-func (optimizer *Optimizer) feed(job string, utils []int) {
+func (optimizer *Optimizer) feed(job string, utils []UtilGPUTimeSeries) {
 	log.Info("optimizer feed")
 	log.Info(job, utils)
 
@@ -44,12 +44,11 @@ func (optimizer *Optimizer) feed(job string, utils []int) {
 	go func() {
 		str := strings.Split(job, "-")
 		if len(str) == 2 {
-			preCnt := 0
 			jobName := str[0]
 			sum := 0
 			for i := 0; i < len(utils); i++ {
-				sum += utils[i]
+				sum += utils[i].Util
 			}
 			sum /= len(utils)
 			if _, ok := optimizer.jobUtilsGPU[jobName]; !ok {
@@ -59,27 +58,26 @@ func (optimizer *Optimizer) feed(job string, utils []int) {
 			t.Util = (t.Version*t.Util + sum) / (t.Version + 1)
 			t.Version++
 
+			preTime := 0
 			for i := 0; i < len(utils); i++ {
-				if utils[i] > 15 {
+				if utils[i].Util > 15 {
+					preTime = utils[i].Time - utils[0].Time
 					break
 				}
-				preCnt++
 			}
 
-			postCnt := 0
+			postTime := 0
 			for i := len(utils) - 1; i >= 0; i-- {
-				if utils[i] > 15 {
+				if utils[i].Util > 15 {
+					postTime = utils[len(utils)-1].Time - utils[i].Time
 					break
 				}
-				postCnt++
 			}
 
 			if _, ok := optimizer.predicts[jobName]; !ok {
 				optimizer.predicts[jobName] = &OptimizerJobExecutionTime{}
 			}
-			postTime := postCnt * optimizer.heartbeatInterval
-			preTime := preCnt * optimizer.heartbeatInterval
-			totalTime := len(utils) * optimizer.heartbeatInterval
+			totalTime := utils[len(utils)-1].Time - utils[0].Time
 
 			predict := optimizer.predicts[jobName]
 			predict.Pre = ((predict.Pre * predict.Version) + preTime) / (predict.Version + 1)
diff --git a/src/resource_pool.go b/src/resource_pool.go
index e00247a..9e9639d 100644
--- a/src/resource_pool.go
+++ b/src/resource_pool.go
@@ -34,7 +34,7 @@ type ResourcePool struct {
 	bindings   map[string]map[string]int
 	bindingsMu sync.Mutex
 
-	utils map[string][]int
+	utils map[string][]UtilGPUTimeSeries
 
 	TotalGPU int
 }
@@ -64,7 +64,7 @@ func (pool *ResourcePool) start() {
 	pool.versions = map[string]float64{}
 	pool.bindings = map[string]map[string]int{}
 
-	pool.utils = map[string][]int{}
+	pool.utils = map[string][]UtilGPUTimeSeries{}
 
 	pool.TotalGPU = 0
 
@@ -168,7 +168,8 @@ func (pool *ResourcePool) update(node NodeStatus) {
 	for _, gpu := range node.Status {
 		if _, ok := pool.bindings[gpu.UUID]; ok {
 			if len(pool.bindings[gpu.UUID]) == 1 {
-				pool.utils[gpu.UUID] = append(pool.utils[gpu.UUID], gpu.UtilizationGPU)
+				pool.utils[gpu.UUID] = append(pool.utils[gpu.UUID],
+					UtilGPUTimeSeries{Time: (int)(time.Now().Unix()), Util: gpu.UtilizationGPU})
 			}
 		}
 	}
@@ -279,7 +280,7 @@ func (pool *ResourcePool) attach(GPU string, job string) {
 	pool.bindings[GPU][job] = int(time.Now().Unix())
 
 	if _, ok := pool.utils[GPU]; !ok {
-		pool.utils[GPU] = []int{}
+		pool.utils[GPU] = []UtilGPUTimeSeries{}
 	}
 }
 
@@ -289,7 +290,7 @@ func (pool *ResourcePool) detach(GPU string, jobName string) {
 	if _, ok := pool.bindings[GPU]; ok {
 		if len(pool.bindings[GPU]) == 1 {
 			InstanceOfOptimizer().feed(jobName, pool.utils[GPU])
-			pool.utils[GPU] = []int{}
+			pool.utils[GPU] = []UtilGPUTimeSeries{}
 		}
 	}
 
@@ -313,6 +314,5 @@ func (pool *ResourcePool) pickNode(nodes []*NodeStatus) *NodeStatus {
 
 	/* sort */
-
 	return nodes[0]
 }
 
diff --git a/src/scheduler_fair.go b/src/scheduler_fair.go
index 4063516..7c7ec46 100644
--- a/src/scheduler_fair.go
+++ b/src/scheduler_fair.go
@@ -36,6 +39,9 @@ type SchedulerFair struct {
 
 	UsingGPU   int
 	UsingGPUMu sync.Mutex
+
+	allocatingGPU   int
+	allocatingGPUMu sync.Mutex
 }
 
 type FairJobSorter []Job
@@ -66,6 +69,7 @@ func (scheduler *SchedulerFair) Start() {
 
 	scheduler.enablePreScheduleRatio = 0.95
 	scheduler.UsingGPU = 0
+	scheduler.allocatingGPU = 0
 
 	scheduler.parallelism = 1
 
@@ -82,6 +86,7 @@ func (scheduler *SchedulerFair) Start() {
 				time.Sleep(time.Millisecond * 100)
 				continue
 			}
+			scheduler.schedulingJobsCnt++
 			scheduler.schedulingMu.Unlock()
 
 			scheduler.queueMu.Lock()
@@ -90,6 +95,24 @@ func (scheduler *SchedulerFair) Start() {
 			jm := JobManager{}
 			jm.job = scheduler.queues[queue][0]
 
+			cnt := 0
+			for _, task := range jm.job.Tasks {
+				cnt += task.NumberGPU
+			}
+			if scheduler.schedulingJobsCnt > 1 {
+				if (cnt+scheduler.allocatingGPU+1)*13 > (pool.TotalGPU-scheduler.UsingGPU)*10 {
+					scheduler.schedulingMu.Lock()
+					scheduler.schedulingJobsCnt--
+					scheduler.schedulingMu.Unlock()
+					continue
+				}
+			}
+			scheduler.allocatingGPUMu.Lock()
+			scheduler.allocatingGPU += cnt
+			scheduler.allocatingGPUMu.Unlock()
+			log.Info("allocatingGPU is ", scheduler.allocatingGPU)
+			log.Info("schedulingJobsCnt is ", scheduler.schedulingJobsCnt)
+
 			scheduler.queues[queue] = scheduler.queues[queue][1:]
 			jm.scheduler = scheduler
 			scheduler.jobs[jm.job.Name] = &jm
@@ -285,9 +308,9 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
 	if len(candidates) == 0 && len(job.Tasks) == 1 && task.NumberGPU == 1 && scheduler.enablePreSchedule {
 		estimate, valid := InstanceOfOptimizer().predictTime(job.Name)
 
-		log.Info(pool.TotalGPU)
-		log.Info(estimate, valid)
-		log.Info(scheduler.UsingGPU)
+		//log.Info(pool.TotalGPU)
+		//log.Info(estimate, valid)
+		//log.Info(scheduler.UsingGPU)
 		if pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) >= scheduler.enablePreScheduleRatio && valid {
 			allocationType = 3
 
@@ -363,6 +386,10 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
 		for _, t := range res.Status {
 			scheduler.Attach(t.UUID, job.Name)
 		}
+		scheduler.allocatingGPUMu.Lock()
+		scheduler.allocatingGPU -= task.NumberGPU
+		scheduler.allocatingGPUMu.Unlock()
+		log.Info("allocatingGPU is ", scheduler.allocatingGPU)
 	}
 
 	for i := range locks {
diff --git a/src/util.go b/src/util.go
index c4f0153..df973a1 100644
--- a/src/util.go
+++ b/src/util.go
@@ -166,6 +166,11 @@ type MsgGroupList struct {
 	Groups []Group `json:"groups"`
 }
 
+type UtilGPUTimeSeries struct {
+	Time int `json:"time"`
+	Util int `json:"util"`
+}
+
 type OptimizerJobExecutionTime struct {
 	Pre  int `json:"pre"`
 	Post int `json:"post"`
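Note on the optimizer change: the pre/post/total execution-time estimates are now derived from the timestamps carried in UtilGPUTimeSeries samples instead of heartbeat counts. The standalone sketch below only illustrates that logic; the package layout, the function name splitPhaseTimes, the sample data, and the empty-slice guard are assumptions for the sketch and are not part of this patch.

package main

import "fmt"

// UtilGPUTimeSeries mirrors the struct added in src/util.go.
type UtilGPUTimeSeries struct {
	Time int `json:"time"`
	Util int `json:"util"`
}

// splitPhaseTimes (illustrative name) mirrors the computation in Optimizer.feed:
// pre  = time from the first sample until utilization first exceeds 15%,
// post = time from the last sample above 15% until the final sample,
// total = span between the first and last samples.
func splitPhaseTimes(utils []UtilGPUTimeSeries) (pre, post, total int) {
	if len(utils) == 0 { // guard added only for this standalone sketch
		return 0, 0, 0
	}
	for i := 0; i < len(utils); i++ {
		if utils[i].Util > 15 {
			pre = utils[i].Time - utils[0].Time
			break
		}
	}
	for i := len(utils) - 1; i >= 0; i-- {
		if utils[i].Util > 15 {
			post = utils[len(utils)-1].Time - utils[i].Time
			break
		}
	}
	total = utils[len(utils)-1].Time - utils[0].Time
	return pre, post, total
}

func main() {
	// Hypothetical utilization trace: idle, busy, busy, idle.
	samples := []UtilGPUTimeSeries{
		{Time: 0, Util: 0}, {Time: 10, Util: 80},
		{Time: 20, Util: 75}, {Time: 30, Util: 5},
	}
	fmt.Println(splitPhaseTimes(samples)) // prints: 10 10 30
}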