mirror of https://github.com/newnius/YAO-scheduler.git synced 2025-12-12 23:36:44 +00:00
2020-05-05 15:44:48 +08:00
parent 3f8c4ed936
commit f34b157e0b
2 changed files with 22 additions and 5 deletions

View File

@@ -39,7 +39,7 @@ func InstanceOfOptimizer() *Optimizer {
func (optimizer *Optimizer) feed(job string, utils []UtilGPUTimeSeries) {
log.Info("optimizer feed")
-log.Info(job, utils)
+//log.Info(job, utils)
if len(utils) == 0 {
return
@@ -161,6 +161,7 @@ func (optimizer *Optimizer) feedData(job string, seq int, pre int, main int, pos
err := spider.do()
if err != nil {
+log.Warn(err)
return
}
@@ -170,6 +171,7 @@ func (optimizer *Optimizer) feedData(job string, seq int, pre int, main int, pos
}
resp.Body.Close()
if err != nil {
+log.Warn(err)
return
}
}
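
The two hunks above follow the same pattern: every error path in the metrics-fetching code now logs a warning before returning instead of failing silently. A minimal standalone sketch of that pattern, using the standard-library HTTP client and logger rather than the project's spider and log.Warn (the helper name is made up for illustration):

package example

import (
	"io"
	"log"
	"net/http"
)

// fetchBody mirrors the shape of the changed code: perform the request, and on
// every error path emit a warning first, then bail out early.
func fetchBody(url string) []byte {
	resp, err := http.Get(url)
	if err != nil {
		log.Println("warn:", err) // previously a silent return
		return nil
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Println("warn:", err)
		return nil
	}
	return body
}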

View File

@@ -131,7 +131,7 @@ func (scheduler *SchedulerFair) Start() {
scheduler.queuesUsingGPUMu.Unlock()
log.Info(cnt, reserved, pool.TotalGPU, scheduler.UsingGPU, scheduler.allocatingGPU)
-if cnt*10+(scheduler.allocatingGPU)*13 > (pool.TotalGPU-scheduler.UsingGPU-reserved)*10 {
+if scheduler.schedulingJobsCnt > 1 && (cnt*10+(scheduler.allocatingGPU)*13 > (pool.TotalGPU-scheduler.UsingGPU-reserved)*10) {
scheduler.schedulingMu.Lock()
scheduler.schedulingJobsCnt--
scheduler.schedulingMu.Unlock()
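
Restated as a standalone predicate, the tightened back-off condition looks roughly like the sketch below. The parameter names are guesses at what cnt, allocatingGPU and the other fields stand for; only the arithmetic is taken from the diff (the *10/*13 factors keep the 1.3x weighting of still-allocating GPUs in integer math).

package example

// shouldBackOff reports whether the current scheduling round should yield.
// After this commit it yields only when more than one job is being scheduled
// concurrently AND the weighted demand exceeds the GPUs left over once used
// and reserved ones are subtracted.
func shouldBackOff(schedulingJobs, requestedGPU, allocatingGPU, totalGPU, usingGPU, reservedGPU int) bool {
	return schedulingJobs > 1 &&
		requestedGPU*10+allocatingGPU*13 > (totalGPU-usingGPU-reservedGPU)*10
}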
@@ -371,7 +371,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
for _, node := range cur.Nodes {
var available []GPUStatus
for _, status := range node.Status {
-if status.MemoryTotal > task.MemoryGPU+status.MemoryAllocated && status.MemoryFree > task.MemoryGPU {
+if status.MemoryAllocated > 0 && status.MemoryTotal > task.MemoryGPU+status.MemoryAllocated && status.MemoryFree > task.MemoryGPU {
if jobs, ok := pool.bindings[status.UUID]; ok {
totalUtil := util
@@ -409,7 +409,22 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
}
/* second round, find vacant gpu */
-if len(candidates) == 0 {
+flag := false
+reserved := scheduler.reservedGPU
+scheduler.queuesUsingGPUMu.Lock()
+for g, v := range scheduler.queueUsingGPU {
+if InstanceOfGroupManager().groups[g].Reserved {
+reserved -= v
+}
+}
+scheduler.queuesUsingGPUMu.Unlock()
+if g, ok := InstanceOfGroupManager().groups[job.Group]; ok && g.Reserved && g.NumGPU > scheduler.queueUsingGPU[job.Group] {
+flag = true
+}
+if task.NumberGPU <= pool.TotalGPU-scheduler.UsingGPU-reserved {
+flag = true
+}
+if len(candidates) == 0 && !flag {
allocationType = 2
for cur := start; ; {
if _, ok := locks[cur.ID]; !ok {
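
Because the inserted guard is easy to misread inline, here is a simplified, self-contained restatement of what the new block computes; the types and the function signature are hypothetical, only the logic follows the diff.

package example

// group is a stripped-down stand-in for the scheduler's group records.
type group struct {
	Reserved bool
	NumGPU   int
}

// computeFlag restates the inserted block: first discount, from the globally
// reserved GPU count, the GPUs that reserved groups are already using; then
// raise the flag if the job's own group still has reserved headroom, or if the
// request fits into the pool once used and remaining reserved GPUs are
// excluded. In the diff, the vacant-GPU round now runs only while this flag is
// false and no sharing candidates were found.
func computeFlag(jobGroup string, taskNumGPU int,
	groups map[string]group, queueUsingGPU map[string]int,
	reservedGPU, totalGPU, usingGPU int) bool {

	flag := false
	reserved := reservedGPU
	for g, used := range queueUsingGPU {
		if groups[g].Reserved {
			reserved -= used
		}
	}
	if g, ok := groups[jobGroup]; ok && g.Reserved && g.NumGPU > queueUsingGPU[jobGroup] {
		flag = true
	}
	if taskNumGPU <= totalGPU-usingGPU-reserved {
		flag = true
	}
	return flag
}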
@@ -462,7 +477,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
for _, status := range node.Status {
bindings := pool.getBindings()
if tasks, ok := bindings[status.UUID]; ok {
-if len(tasks) > 1 {
+if len(tasks) > 1 || status.MemoryAllocated == 0 {
continue
}
for task_t, s := range tasks {
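
The last hunk tightens which GPUs are considered in this sharing round: a GPU is skipped if it already hosts more than one bound task, or if nothing is allocated on it at all. A small filter in the same spirit, with hypothetical types standing in for the pool's status and binding records:

package example

// gpuStatus is a minimal stand-in for the pool's per-GPU status record.
type gpuStatus struct {
	UUID            string
	MemoryAllocated int
}

// shareableGPUs keeps only GPUs that pass the tightened rule from the hunk:
// no more than one task bound to the GPU, and some memory already allocated
// on it.
func shareableGPUs(statuses []gpuStatus, bindings map[string][]string) []gpuStatus {
	var out []gpuStatus
	for _, s := range statuses {
		tasks, ok := bindings[s.UUID]
		if !ok {
			continue
		}
		if len(tasks) > 1 || s.MemoryAllocated == 0 {
			continue
		}
		out = append(out, s)
	}
	return out
}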