mirror of https://github.com/newnius/YAO-scheduler.git synced 2025-12-15 08:16:43 +00:00
2020-05-05 15:44:48 +08:00
parent 3f8c4ed936
commit f34b157e0b
2 changed files with 22 additions and 5 deletions

View File

@@ -39,7 +39,7 @@ func InstanceOfOptimizer() *Optimizer {
 func (optimizer *Optimizer) feed(job string, utils []UtilGPUTimeSeries) {
     log.Info("optimizer feed")
-    log.Info(job, utils)
+    //log.Info(job, utils)
     if len(utils) == 0 {
         return
@@ -161,6 +161,7 @@ func (optimizer *Optimizer) feedData(job string, seq int, pre int, main int, pos
     err := spider.do()
     if err != nil {
+        log.Warn(err)
         return
     }
@@ -170,6 +171,7 @@ func (optimizer *Optimizer) feedData(job string, seq int, pre int, main int, pos
     }
     resp.Body.Close()
     if err != nil {
+        log.Warn(err)
         return
     }
 }

View File

@@ -131,7 +131,7 @@ func (scheduler *SchedulerFair) Start() {
     scheduler.queuesUsingGPUMu.Unlock()
     log.Info(cnt, reserved, pool.TotalGPU, scheduler.UsingGPU, scheduler.allocatingGPU)
-    if cnt*10+(scheduler.allocatingGPU)*13 > (pool.TotalGPU-scheduler.UsingGPU-reserved)*10 {
+    if scheduler.schedulingJobsCnt > 1 && (cnt*10+(scheduler.allocatingGPU)*13 > (pool.TotalGPU-scheduler.UsingGPU-reserved)*10) {
         scheduler.schedulingMu.Lock()
         scheduler.schedulingJobsCnt--
         scheduler.schedulingMu.Unlock()
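
Note on the hunk above: the back-off condition now also requires more than one job to be in scheduling. Below is a minimal standalone Go sketch of that guard. The function name shouldBackOff, the reading of cnt as the GPUs requested by the current job, and all numbers in main are assumptions for illustration, not code from the repository.

package main

import "fmt"

// shouldBackOff mirrors the updated guard in SchedulerFair.Start: give the
// scheduling slot back only if another job is also being scheduled AND the
// weighted demand (requested GPUs plus 1.3x the GPUs still being allocated)
// exceeds the free, unreserved capacity. The *10 / *13 factors keep the 1.3
// weight in integer arithmetic.
func shouldBackOff(schedulingJobsCnt, cnt, allocatingGPU, totalGPU, usingGPU, reserved int) bool {
    return schedulingJobsCnt > 1 &&
        cnt*10+allocatingGPU*13 > (totalGPU-usingGPU-reserved)*10
}

func main() {
    // Illustrative numbers only: 2 jobs scheduling, 4 GPUs requested,
    // 2 still being allocated, 16 total, 8 in use, 2 reserved:
    // 40 + 26 = 66 > (16-8-2)*10 = 60, so the scheduler backs off.
    fmt.Println(shouldBackOff(2, 4, 2, 16, 8, 2)) // true
}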
@@ -371,7 +371,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
     for _, node := range cur.Nodes {
         var available []GPUStatus
         for _, status := range node.Status {
-            if status.MemoryTotal > task.MemoryGPU+status.MemoryAllocated && status.MemoryFree > task.MemoryGPU {
+            if status.MemoryAllocated > 0 && status.MemoryTotal > task.MemoryGPU+status.MemoryAllocated && status.MemoryFree > task.MemoryGPU {
                 if jobs, ok := pool.bindings[status.UUID]; ok {
                     totalUtil := util
@@ -409,7 +409,22 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
     }
     /* second round, find vacant gpu */
-    if len(candidates) == 0 {
+    flag := false
+    reserved := scheduler.reservedGPU
+    scheduler.queuesUsingGPUMu.Lock()
+    for g, v := range scheduler.queueUsingGPU {
+        if InstanceOfGroupManager().groups[g].Reserved {
+            reserved -= v
+        }
+    }
+    scheduler.queuesUsingGPUMu.Unlock()
+    if g, ok := InstanceOfGroupManager().groups[job.Group]; ok && g.Reserved && g.NumGPU > scheduler.queueUsingGPU[job.Group] {
+        flag = true
+    }
+    if task.NumberGPU <= pool.TotalGPU-scheduler.UsingGPU-reserved {
+        flag = true
+    }
+    if len(candidates) == 0 && !flag {
         allocationType = 2
         for cur := start; ; {
             if _, ok := locks[cur.ID]; !ok {
@@ -462,7 +477,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
     for _, status := range node.Status {
         bindings := pool.getBindings()
         if tasks, ok := bindings[status.UUID]; ok {
-            if len(tasks) > 1 {
+            if len(tasks) > 1 || status.MemoryAllocated == 0 {
                 continue
             }
             for task_t, s := range tasks {
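
Note on the @@ -409 hunk: it adds a flag that gates the "second round, find vacant gpu" pass, which now runs only when no shared-GPU candidates were found and the flag is false. The sketch below reproduces only that flag computation as standalone Go; the group type, the computeFlag name, the flattened parameters (no locking, no GroupManager singleton), and the values in main are simplified stand-ins for the scheduler's fields, not the repository's actual API.

package main

import "fmt"

// group mirrors just the fields the guard reads from the group manager.
type group struct {
    Reserved bool
    NumGPU   int
}

// computeFlag returns the value of flag as computed in the hunk: true when
// the job's group still has unused reserved quota, or when the task fits in
// the unreserved free capacity. In AcquireResource the vacant-GPU round
// (allocationType = 2) only runs when no candidates were found and this
// flag is false.
func computeFlag(
    taskNumGPU, totalGPU, usingGPU, reservedGPU int,
    groups map[string]group, queueUsingGPU map[string]int, jobGroup string,
) bool {
    // Effective reservation: GPUs promised to reserved groups, minus what
    // those groups already occupy.
    reserved := reservedGPU
    for g, used := range queueUsingGPU {
        if groups[g].Reserved {
            reserved -= used
        }
    }
    // The job's own group still has reserved quota left...
    if g, ok := groups[jobGroup]; ok && g.Reserved && g.NumGPU > queueUsingGPU[jobGroup] {
        return true
    }
    // ...or the unreserved part of the pool can already satisfy the task.
    return taskNumGPU <= totalGPU-usingGPU-reserved
}

func main() {
    groups := map[string]group{"vip": {Reserved: true, NumGPU: 4}}
    using := map[string]int{"vip": 2}
    // A 1-GPU task from group "vip": quota 4 > used 2, so flag is true and
    // the vacant-GPU round is not entered even with no shared candidates.
    fmt.Println(computeFlag(1, 8, 6, 4, groups, using, "vip")) // true
}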