1
0
mirror of https://github.com/newnius/YAO-scheduler.git synced 2025-12-15 08:16:43 +00:00
This commit is contained in:
2020-06-08 20:49:50 +08:00
parent e8c6ea53e2
commit 6f01eef504
3 changed files with 122 additions and 47 deletions

View File

@@ -61,6 +61,11 @@ type ResourcePool struct {
enableShareRatio float64
enablePreSchedule bool
enablePreScheduleRatio float64
enableBatch bool
batchJobs []Job
batchMu sync.Mutex
batchAllocations map[string][]NodeStatus
}
func (pool *ResourcePool) init(conf Configuration) {
@@ -83,6 +88,8 @@ func (pool *ResourcePool) init(conf Configuration) {
pool.enablePreSchedule = true
pool.enablePreScheduleRatio = 0.95
pool.enableBatch = false
/* init pools */
pool.poolsCount = 300
for i := 0; i < pool.poolsCount; i++ {
@@ -115,6 +122,31 @@ func (pool *ResourcePool) init(conf Configuration) {
go func() {
pool.saveStatusHistory()
}()
go func() {
time.Sleep(time.Second * 10)
pool.batchMu.Lock()
var tasks []Task
for _, job := range pool.batchJobs {
for _, task := range job.Tasks {
task.Job = job.Name
tasks = append(tasks, task)
}
}
job := Job{Tasks: tasks}
nodes := pool.doAcquireResource(job)
for i, task := range job.Tasks {
if _, ok := pool.batchAllocations[task.Job]; !ok {
pool.batchAllocations[task.Job] = []NodeStatus{}
}
pool.batchAllocations[task.Job] = append(pool.batchAllocations[task.Job], nodes[i])
}
if len(nodes) > 0 {
pool.batchJobs = []Job{}
}
pool.batchMu.Unlock()
}()
}
/* check dead nodes periodically */
@@ -634,6 +666,27 @@ func (pool *ResourcePool) pickNode(candidates []*NodeStatus, availableGPUs map[s
}
func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
if !pool.enableBatch {
for i := range job.Tasks {
job.Tasks[i].Job = job.Name
}
return pool.acquireResource(job)
}
for {
if _, ok := pool.batchAllocations[job.Name]; ok {
break
} else {
time.Sleep(time.Millisecond * 100)
}
}
pool.batchMu.Lock()
nodes := pool.batchAllocations[job.Name]
delete(pool.batchAllocations, job.Name)
pool.batchMu.Unlock()
return nodes
}
func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
if len(job.Tasks) == 0 {
return []NodeStatus{}
}
@@ -876,7 +929,7 @@ func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
}
}
for _, t := range res.Status {
pool.attach(t.UUID, job.Name)
pool.attach(t.UUID, task.Job)
}
for i := range job.Tasks {