Mirror of https://github.com/newnius/YAO-scheduler.git
Commit e880c21e82 (parent 1447192a45), 2020-04-30 23:06:12 +08:00
4 changed files with 50 additions and 20 deletions

View File

@@ -33,7 +33,7 @@ func InstanceOfOptimizer() *Optimizer {
 	return optimizerInstance
 }
 
-func (optimizer *Optimizer) feed(job string, utils []int) {
+func (optimizer *Optimizer) feed(job string, utils []UtilGPUTimeSeries) {
 	log.Info("optimizer feed")
 	log.Info(job, utils)
@@ -44,12 +44,11 @@ func (optimizer *Optimizer) feed(job string, utils []int) {
 	go func() {
 		str := strings.Split(job, "-")
 		if len(str) == 2 {
-			preCnt := 0
 			jobName := str[0]
 			sum := 0
 			for i := 0; i < len(utils); i++ {
-				sum += utils[i]
+				sum += utils[i].Util
 			}
 			sum /= len(utils)
 			if _, ok := optimizer.jobUtilsGPU[jobName]; !ok {
@@ -59,27 +58,26 @@ func (optimizer *Optimizer) feed(job string, utils []int) {
 			t.Util = (t.Version*t.Util + sum) / (t.Version + 1)
 			t.Version++
+			preTime := 0
 			for i := 0; i < len(utils); i++ {
-				if utils[i] > 15 {
+				if utils[i].Util > 15 {
+					preTime = utils[i].Time - utils[0].Time
 					break
 				}
-				preCnt++
 			}
-			postCnt := 0
+			postTime := 0
 			for i := len(utils) - 1; i >= 0; i-- {
-				if utils[i] > 15 {
+				if utils[i].Util > 15 {
+					postTime = utils[len(utils)-1].Time - utils[i].Time
 					break
 				}
-				postCnt++
 			}
 			if _, ok := optimizer.predicts[jobName]; !ok {
 				optimizer.predicts[jobName] = &OptimizerJobExecutionTime{}
 			}
-			postTime := postCnt * optimizer.heartbeatInterval
-			preTime := preCnt * optimizer.heartbeatInterval
-			totalTime := len(utils) * optimizer.heartbeatInterval
+			totalTime := utils[len(utils)-1].Time - utils[0].Time
 			predict := optimizer.predicts[jobName]
 			predict.Pre = ((predict.Pre * predict.Version) + preTime) / (predict.Version + 1)
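The effect of this change is that the optimizer now derives the pre-processing and post-processing phases from sample timestamps instead of counting heartbeats. A minimal standalone sketch of that computation (phaseTimes is a hypothetical helper, not a function in the repository; the 15% threshold is taken from the diff above):

package main

import "fmt"

// UtilGPUTimeSeries mirrors the struct added in this commit.
type UtilGPUTimeSeries struct {
	Time int `json:"time"`
	Util int `json:"util"`
}

// phaseTimes reproduces the new feed() arithmetic in isolation:
// preTime   = seconds from the first sample until utilization first exceeds 15%,
// postTime  = seconds from the last sample above 15% until the final sample,
// totalTime = span of the whole series.
func phaseTimes(utils []UtilGPUTimeSeries) (preTime, postTime, totalTime int) {
	if len(utils) == 0 {
		return 0, 0, 0
	}
	for i := 0; i < len(utils); i++ {
		if utils[i].Util > 15 {
			preTime = utils[i].Time - utils[0].Time
			break
		}
	}
	for i := len(utils) - 1; i >= 0; i-- {
		if utils[i].Util > 15 {
			postTime = utils[len(utils)-1].Time - utils[i].Time
			break
		}
	}
	totalTime = utils[len(utils)-1].Time - utils[0].Time
	return
}

func main() {
	samples := []UtilGPUTimeSeries{
		{Time: 0, Util: 0}, {Time: 5, Util: 80}, {Time: 10, Util: 75}, {Time: 15, Util: 0},
	}
	fmt.Println(phaseTimes(samples)) // 5 5 15
}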

View File

@@ -34,7 +34,7 @@ type ResourcePool struct {
 	bindings   map[string]map[string]int
 	bindingsMu sync.Mutex
-	utils      map[string][]int
+	utils      map[string][]UtilGPUTimeSeries
 	TotalGPU int
 }
@@ -64,7 +64,7 @@ func (pool *ResourcePool) start() {
 	pool.versions = map[string]float64{}
 	pool.bindings = map[string]map[string]int{}
-	pool.utils = map[string][]int{}
+	pool.utils = map[string][]UtilGPUTimeSeries{}
 	pool.TotalGPU = 0
@@ -168,7 +168,8 @@ func (pool *ResourcePool) update(node NodeStatus) {
 	for _, gpu := range node.Status {
 		if _, ok := pool.bindings[gpu.UUID]; ok {
 			if len(pool.bindings[gpu.UUID]) == 1 {
-				pool.utils[gpu.UUID] = append(pool.utils[gpu.UUID], gpu.UtilizationGPU)
+				pool.utils[gpu.UUID] = append(pool.utils[gpu.UUID],
+					UtilGPUTimeSeries{Time: (int)(time.Now().Unix()), Util: gpu.UtilizationGPU})
 			}
 		}
 	}
@@ -279,7 +280,7 @@ func (pool *ResourcePool) attach(GPU string, job string) {
 	pool.bindings[GPU][job] = int(time.Now().Unix())
 	if _, ok := pool.utils[GPU]; !ok {
-		pool.utils[GPU] = []int{}
+		pool.utils[GPU] = []UtilGPUTimeSeries{}
 	}
 }
@@ -289,7 +290,7 @@ func (pool *ResourcePool) detach(GPU string, jobName string) {
 	if _, ok := pool.bindings[GPU]; ok {
 		if len(pool.bindings[GPU]) == 1 {
 			InstanceOfOptimizer().feed(jobName, pool.utils[GPU])
-			pool.utils[GPU] = []int{}
+			pool.utils[GPU] = []UtilGPUTimeSeries{}
 		}
 	}
@@ -313,6 +314,5 @@ func (pool *ResourcePool) pickNode(nodes []*NodeStatus) *NodeStatus {
 	/* sort */
 	return nodes[0]
 }
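Taken together, the pool now records one timestamped utilization sample per node status update while exactly one job is bound to a GPU, and hands the whole series to the optimizer when that job detaches. A rough sketch of that lifecycle, with a hypothetical utilRecorder standing in for ResourcePool:

package main

import (
	"fmt"
	"time"
)

type UtilGPUTimeSeries struct {
	Time int `json:"time"`
	Util int `json:"util"`
}

// utilRecorder is a stand-in for the utils map kept on ResourcePool.
type utilRecorder struct {
	utils map[string][]UtilGPUTimeSeries
}

// record mirrors update(): append a timestamped sample, but only for GPUs
// that are currently being tracked (i.e. have an entry in the map).
func (r *utilRecorder) record(gpuUUID string, utilization int) {
	if _, ok := r.utils[gpuUUID]; ok {
		r.utils[gpuUUID] = append(r.utils[gpuUUID],
			UtilGPUTimeSeries{Time: int(time.Now().Unix()), Util: utilization})
	}
}

// flush mirrors detach(): hand the collected series to a consumer
// (the optimizer's feed() in the real code) and reset the slice.
func (r *utilRecorder) flush(gpuUUID string, feed func([]UtilGPUTimeSeries)) {
	feed(r.utils[gpuUUID])
	r.utils[gpuUUID] = []UtilGPUTimeSeries{}
}

func main() {
	r := &utilRecorder{utils: map[string][]UtilGPUTimeSeries{"GPU-0": {}}}
	r.record("GPU-0", 85)
	r.flush("GPU-0", func(s []UtilGPUTimeSeries) { fmt.Println("fed", len(s), "samples") })
}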

View File

@@ -36,6 +36,9 @@ type SchedulerFair struct {
 	UsingGPU   int
 	UsingGPUMu sync.Mutex
+	allocatingGPU   int
+	allocatingGPUMu sync.Mutex
 }
 
 type FairJobSorter []Job
@@ -66,6 +69,7 @@ func (scheduler *SchedulerFair) Start() {
 	scheduler.enablePreScheduleRatio = 0.95
 	scheduler.UsingGPU = 0
+	scheduler.allocatingGPU = 0
 	scheduler.parallelism = 1
@@ -82,6 +86,7 @@ func (scheduler *SchedulerFair) Start() {
 				time.Sleep(time.Millisecond * 100)
 				continue
 			}
 			scheduler.schedulingJobsCnt++
 			scheduler.schedulingMu.Unlock()
 			scheduler.queueMu.Lock()
@@ -90,6 +95,24 @@ func (scheduler *SchedulerFair) Start() {
 			jm := JobManager{}
 			jm.job = scheduler.queues[queue][0]
+			cnt := 0
+			for _, task := range jm.job.Tasks {
+				cnt += task.NumberGPU
+			}
+			if scheduler.schedulingJobsCnt > 1 {
+				if (cnt+scheduler.allocatingGPU+1)*13 > (pool.TotalGPU-scheduler.UsingGPU)*10 {
+					scheduler.schedulingMu.Lock()
+					scheduler.schedulingJobsCnt--
+					scheduler.schedulingMu.Unlock()
+					continue
+				}
+			}
+			scheduler.allocatingGPUMu.Lock()
+			scheduler.allocatingGPU += cnt
+			scheduler.allocatingGPUMu.Unlock()
+			log.Info("allocatingGPU is ", scheduler.allocatingGPU)
+			log.Info("schedulingJobsCnt is ", scheduler.schedulingJobsCnt)
 			scheduler.queues[queue] = scheduler.queues[queue][1:]
 			jm.scheduler = scheduler
 			scheduler.jobs[jm.job.Name] = &jm
@@ -285,9 +308,9 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
 	if len(candidates) == 0 && len(job.Tasks) == 1 && task.NumberGPU == 1 && scheduler.enablePreSchedule {
 		estimate, valid := InstanceOfOptimizer().predictTime(job.Name)
-		log.Info(pool.TotalGPU)
-		log.Info(estimate, valid)
-		log.Info(scheduler.UsingGPU)
+		//log.Info(pool.TotalGPU)
+		//log.Info(estimate, valid)
+		//log.Info(scheduler.UsingGPU)
 		if pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) >= scheduler.enablePreScheduleRatio && valid {
 			allocationType = 3
@@ -363,6 +386,10 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
 			for _, t := range res.Status {
 				scheduler.Attach(t.UUID, job.Name)
 			}
+			scheduler.allocatingGPUMu.Lock()
+			scheduler.allocatingGPU -= task.NumberGPU
+			scheduler.allocatingGPUMu.Unlock()
+			log.Info("allocatingGPU is ", scheduler.allocatingGPU)
 		}
 		for i := range locks {
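The new allocatingGPU counter tracks GPUs claimed by jobs that are still mid-scheduling but not yet attached, and the guard added to Start() defers a job whenever more than one job is being scheduled and the pending demand would not fit into roughly 10/13 (about 77%) of the currently free GPUs. A hypothetical predicate restating that inline condition (canSchedule does not exist in the repository):

package main

import "fmt"

// canSchedule restates the new guard: with more than one job being scheduled
// concurrently, the GPUs wanted by this job plus those already being allocated
// (plus one for slack) must not exceed 10/13 of the GPUs that are currently free.
func canSchedule(wantGPU, allocatingGPU, totalGPU, usingGPU, schedulingJobsCnt int) bool {
	if schedulingJobsCnt <= 1 {
		return true
	}
	return (wantGPU+allocatingGPU+1)*13 <= (totalGPU-usingGPU)*10
}

func main() {
	// 2 GPUs wanted, 3 already being allocated, 8 of 16 GPUs free:
	// (2+3+1)*13 = 78 <= (16-8)*10 = 80, so the job may proceed.
	fmt.Println(canSchedule(2, 3, 16, 8, 2)) // true
	// With one more GPU in use only 7 are free and the job is deferred.
	fmt.Println(canSchedule(2, 3, 16, 9, 2)) // false
}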

View File

@@ -166,6 +166,11 @@ type MsgGroupList struct {
 	Groups []Group `json:"groups"`
 }
 
+type UtilGPUTimeSeries struct {
+	Time int `json:"time"`
+	Util int `json:"util"`
+}
+
 type OptimizerJobExecutionTime struct {
 	Pre  int `json:"pre"`
 	Post int `json:"post"`
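For reference, the json tags on the new struct serialize each sample as an object with lowercase keys; a minimal check of that encoding (the sample values below are made up):

package main

import (
	"encoding/json"
	"fmt"
)

type UtilGPUTimeSeries struct {
	Time int `json:"time"`
	Util int `json:"util"`
}

func main() {
	// Illustrative values only: a Unix timestamp and a utilization percentage.
	b, err := json.Marshal([]UtilGPUTimeSeries{{Time: 1588259172, Util: 97}})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b)) // [{"time":1588259172,"util":97}]
}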