mirror of https://github.com/newnius/YAO-scheduler.git synced 2025-12-15 16:16:44 +00:00
2020-06-29 23:24:33 +08:00
parent 769fa702f3
commit 1f32eeea40
7 changed files with 332 additions and 353 deletions

@@ -336,7 +336,7 @@ func (pool *ResourcePool) update(node NodeStatus) {
         if _, ok := pool.bindings[gpu.UUID]; ok {
             if _, ok2 := pool.utils[gpu.UUID]; ok2 {
                 pool.utils[gpu.UUID] = append(pool.utils[gpu.UUID],
-                    UtilGPUTimeSeries{Time: (int)(time.Now().Unix()), Util: gpu.UtilizationGPU})
+                    UtilGPUTimeSeries{Time: time.Now().Unix(), Util: gpu.UtilizationGPU})
             }
         }
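
The only change here is dropping the (int) cast around time.Now().Unix(), which returns an int64. Since the struct definition itself is not part of this diff, a plausible reading is that the Time field of UtilGPUTimeSeries was widened from int to int64, roughly:

    // Sketch only: the field set is inferred from this hunk; any other
    // fields of the real struct are not shown in the commit.
    type UtilGPUTimeSeries struct {
        Time int64 // sample timestamp from time.Now().Unix(), previously int
        Util int   // GPU utilization reported in the NodeStatus heartbeat
    }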
@@ -743,58 +743,59 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
     if pool.TotalGPU == 0 {
         return []NodeStatus{}
     }
-    loadRatio := float64(pool.UsingGPU) / float64(pool.TotalGPU)
+    //loadRatio := float64(pool.UsingGPU) / float64(pool.TotalGPU)
     /* first, choose sharable GPUs */
+    /*
     if pool.enableShare && len(job.Tasks) == 1 && task.NumberGPU == 1 && loadRatio >= pool.enableShareRatio {
         // check sharable
         allocationType = 1
-        if util, valid := InstanceOfOptimizer().predictUtilGPU(job.Name); valid {
+        pred := InstanceOfOptimizer().PredictReq(job, "Worker")
-            for cur := start; ; {
-                if _, ok := locks[cur.ID]; !ok {
-                    cur.Lock.Lock()
-                    locks[cur.ID] = &cur.Lock
-                }
+        for cur := start; ; {
+            if _, ok := locks[cur.ID]; !ok {
+                cur.Lock.Lock()
+                locks[cur.ID] = &cur.Lock
+            }
-                for _, node := range cur.Nodes {
-                    var available []GPUStatus
-                    for _, status := range node.Status {
-                        if status.MemoryAllocated > 0 && status.MemoryTotal > task.MemoryGPU+status.MemoryAllocated {
+            for _, node := range cur.Nodes {
+                var available []GPUStatus
+                for _, status := range node.Status {
+                    if status.MemoryAllocated > 0 && status.MemoryTotal > task.MemoryGPU+status.MemoryAllocated {
-                            if jobs, ok := pool.bindings[status.UUID]; ok {
-                                totalUtil := util
-                                for job := range jobs {
-                                    if utilT, ok := InstanceOfOptimizer().predictUtilGPU(job); ok {
-                                        totalUtil += utilT
-                                    } else {
-                                        totalUtil += 100
-                                    }
-                                }
-                                if totalUtil < 100 {
-                                    available = append(available, status)
+                        if jobs, ok := pool.bindings[status.UUID]; ok {
+                            totalUtil := pred.UtilGPU
+                            for job := range jobs {
+                                if utilT, ok := InstanceOfOptimizer().predictUtilGPU(job); ok {
+                                    totalUtil += utilT
+                                } else {
+                                    totalUtil += 100
+                                }
+                            }
+                            if totalUtil < 100 {
+                                available = append(available, status)
+                            }
                         }
                     }
-                    if len(available) >= task.NumberGPU {
-                        candidates = append(candidates, *node)
-                        if len(candidates) >= len(job.Tasks)*3+5 {
-                            break
-                        }
-                    }
+                if len(available) >= task.NumberGPU {
+                    candidates = append(candidates, *node)
+                    if len(candidates) >= len(job.Tasks)*3+5 {
+                        break
+                    }
+                }
-                if len(candidates) >= len(job.Tasks)*3+5 {
-                    break
-                }
-                if cur.ID > cur.Next.ID {
-                    break
-                }
-                cur = cur.Next
-            }
+            if len(candidates) >= len(job.Tasks)*3+5 {
+                break
+            }
+            if cur.ID > cur.Next.ID {
+                break
+            }
+            cur = cur.Next
+        }
-            //log.Info(candidates)
-        }
+    */
+    //log.Info(candidates)
     /* second round, find vacant gpu */
     if len(candidates) == 0 {
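
The sharable-GPU pass above is both migrated to the new optimizer API and commented out in the same commit: the two-value predictUtilGPU(job.Name) lookup becomes a single PredictReq(job, "Worker") call whose result carries the predicted utilization. Only the call site and the pred.UtilGPU field are attested by the diff; a hedged sketch of what such an API shift could look like:

    // Hypothetical shapes. PredictReq(job, "Worker") and pred.UtilGPU appear
    // in the diff; the struct name and the method body are assumptions.
    type Prediction struct {
        UtilGPU int // predicted GPU utilization (percent) of one task
    }

    func (o *Optimizer) PredictReq(job Job, role string) Prediction {
        // Unlike the old predictUtilGPU, which returned (util, valid) and
        // made callers branch on valid, this always yields a usable value,
        // falling back to a pessimistic 100% when no history exists.
        if util, ok := o.predictUtilGPU(job.Name); ok {
            return Prediction{UtilGPU: util}
        }
        return Prediction{UtilGPU: 100}
    }

Either way, the sharing test is unchanged: a GPU stays a candidate only while the newcomer's predicted utilization plus that of every job already bound to the GPU (pool.bindings[status.UUID]) stays below 100.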
@@ -831,10 +832,11 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
     }
     /* third round, find gpu to be released */
+    /*
     if len(candidates) == 0 && len(job.Tasks) == 1 && task.NumberGPU == 1 && pool.enablePreSchedule {
-        estimate, valid := InstanceOfOptimizer().predictTime(job.Name)
+        estimate := InstanceOfOptimizer().PredictTime(job)
-        if loadRatio >= pool.enablePreScheduleRatio && valid {
+        if loadRatio >= pool.enablePreScheduleRatio {
             allocationType = 3
             for cur := start; ; {
                 if _, ok := locks[cur.ID]; !ok {
@@ -850,13 +852,11 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
continue
}
for taskT, s := range tasks {
est, valid2 := InstanceOfOptimizer().predictTime(taskT)
if valid2 {
now := (int)(time.Now().Unix())
log.Info(s, now, estimate, est)
if now-s > est.Total-est.Post-estimate.Pre-15 {
available = append(available, status)
}
est := InstanceOfOptimizer().PredictTime(taskT)
now := (int)(time.Now().Unix())
log.Info(s, now, estimate, est)
if now-s > est.Total-est.Post-estimate.Pre-15 {
available = append(available, status)
}
}
}
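
This hunk (and the one above it) applies the same migration to the timing predictor: predictTime's (estimate, valid) pair becomes PredictTime(job), which always returns an estimate, so the valid/valid2 guards disappear while the release test itself is unchanged. The field names Total, Pre and Post come straight from the diff; the struct below is otherwise an assumption, spelled out to make the inequality readable:

    // Assumed shape of what PredictTime returns; only the three fields
    // used by the diff are listed.
    type TimeEstimate struct {
        Pre   int // warm-up seconds before a job uses the GPU heavily
        Post  int // teardown seconds after the GPU-intensive phase ends
        Total int // predicted end-to-end runtime in seconds
    }

    // For an occupant task started at time s, with est = PredictTime(taskT)
    // and estimate = PredictTime(job), the condition
    //
    //     now-s > est.Total-est.Post-estimate.Pre-15
    //
    // rearranges to: the occupant's remaining GPU-busy time,
    // (est.Total-est.Post)-(now-s), is shorter than the newcomer's Pre
    // warm-up plus 15 seconds of slack, so the GPU should be free by the
    // time the new job actually needs it.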
@@ -879,6 +879,7 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
             //log.Info(candidates)
         }
     }
+    */
     if len(candidates) > 0 {
         log.Info("allocationType is ", allocationType)