mirror of https://github.com/newnius/YAO-scheduler.git synced 2025-06-07 14:21:55 +00:00

update fair

This commit is contained in:
Newnius 2020-05-28 09:40:10 +08:00
parent d1a9bad295
commit f6e0081d53
2 changed files with 23 additions and 45 deletions

Changed file 1 of 2:

@@ -309,12 +309,15 @@ func main() {
 	case "FCFS":
 		scheduler = &SchedulerFCFS{}
 		break
-	case "fair":
-		scheduler = &SchedulerCapacity{}
-		break
 	case "priority":
 		scheduler = &SchedulerPriority{}
 		break
+	case "capacity":
+		scheduler = &SchedulerCapacity{}
+		break
+	case "fair":
+		scheduler = &SchedulerFair{}
+		break
 	default:
 		scheduler = &SchedulerFCFS{}
 	}
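
For context on the hunk above: the switch in main() dispatches on the configured scheduler name, and this commit gives "fair" its own SchedulerFair while keeping the previous implementation reachable as "capacity". A minimal, self-contained sketch of that dispatch pattern follows; the Scheduler interface shape, the Name() method, and the stub types are illustrative assumptions, not the project's actual API — only the case labels and Scheduler* type names come from the hunk.

package main

import "fmt"

// Sketch only: stub types stand in for the repository's scheduler implementations.
type Scheduler interface{ Name() string }

type SchedulerFCFS struct{}
type SchedulerPriority struct{}
type SchedulerCapacity struct{}
type SchedulerFair struct{}

func (*SchedulerFCFS) Name() string     { return "FCFS" }
func (*SchedulerPriority) Name() string { return "priority" }
func (*SchedulerCapacity) Name() string { return "capacity" }
func (*SchedulerFair) Name() string     { return "fair" }

// newScheduler mirrors the switch in the hunk above: after this commit,
// "fair" maps to SchedulerFair and "capacity" keeps SchedulerCapacity.
func newScheduler(kind string) Scheduler {
	switch kind {
	case "FCFS":
		return &SchedulerFCFS{}
	case "priority":
		return &SchedulerPriority{}
	case "capacity":
		return &SchedulerCapacity{}
	case "fair":
		return &SchedulerFair{}
	default:
		return &SchedulerFCFS{} // unknown names fall back to FCFS
	}
}

func main() {
	fmt.Println(newScheduler("fair").Name()) // prints "fair"
}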

Changed file 2 of 2:

@@ -15,7 +15,7 @@ type SchedulerCapacity struct {
 	nextQueue string
 	jobs      map[string]*JobManager
 	queues    map[string][]Job
-	queueMu   sync.Mutex
+	queuesMu  sync.Mutex

 	schedulingJobs map[string]bool
 	schedulingMu   sync.Mutex
@@ -28,9 +28,6 @@ type SchedulerCapacity struct {
 	allocatingGPU   int
 	allocatingGPUMu sync.Mutex
-
-	queuesSchedulingCnt map[string]int
-	queuesUsingGPUMu    sync.Mutex
 }

 type FairJobSorter []Job
@@ -46,7 +43,7 @@ func (s FairJobSorter) Less(i, j int) bool {
 }

 func (scheduler *SchedulerCapacity) Start() {
-	log.Info("JS started")
+	log.Info("JS (capacity) started")
 	scheduler.jobs = map[string]*JobManager{}
 	scheduler.history = []*Job{}
@@ -57,12 +54,10 @@ func (scheduler *SchedulerCapacity) Start() {
 	scheduler.enabled = true
 	scheduler.schedulingJobs = map[string]bool{}
 	scheduler.allocatingGPU = 0
-	scheduler.queuesSchedulingCnt = map[string]int{}
 	scheduler.parallelism = 1

 	go func() {
-		/* capacity scheduler */
 		flag := true
 		for {
 			log.Debug("Scheduling")
@@ -82,7 +77,7 @@ func (scheduler *SchedulerCapacity) Start() {
 			}
 			scheduler.schedulingMu.Unlock()

-			scheduler.queueMu.Lock()
+			scheduler.queuesMu.Lock()
 			queue := scheduler.nextQueue
 			go func() {
 				scheduler.UpdateNextQueue()
@@ -99,7 +94,7 @@ func (scheduler *SchedulerCapacity) Start() {
 				pool := InstanceOfResourcePool()
 				log.Info(cnt, pool.TotalGPU, pool.UsingGPU, scheduler.allocatingGPU)
 				if len(scheduler.schedulingJobs) > 1 && (cnt*10+(scheduler.allocatingGPU)*13 > (pool.TotalGPU-pool.UsingGPU)*10) {
-					scheduler.queueMu.Unlock()
+					scheduler.queuesMu.Unlock()
 					continue
 				}
@@ -119,10 +114,6 @@ func (scheduler *SchedulerCapacity) Start() {
 				scheduler.history = append(scheduler.history, &jm.job)
 				scheduler.historyMu.Unlock()

-				scheduler.queuesUsingGPUMu.Lock()
-				scheduler.queuesSchedulingCnt[jm.job.Group]++
-				scheduler.queuesUsingGPUMu.Unlock()
-
 				scheduler.schedulingMu.Lock()
 				scheduler.schedulingJobs[jm.job.Name] = true
 				scheduler.schedulingMu.Unlock()
@@ -132,7 +123,7 @@ func (scheduler *SchedulerCapacity) Start() {
 			} else {
 				log.Debug("No more jobs to scheduling ", time.Now())
 			}
-			scheduler.queueMu.Unlock()
+			scheduler.queuesMu.Unlock()
 		}
 	}()
 }
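
Aside on the scheduling loop in the hunks above: the check `len(scheduler.schedulingJobs) > 1 && cnt*10+(scheduler.allocatingGPU)*13 > (pool.TotalGPU-pool.UsingGPU)*10` reads as a back-pressure guard. When more than one job is mid-scheduling and the requested GPUs plus a 1.3x margin on GPUs still being allocated exceed the GPUs currently free, the loop releases the queue lock and retries later. A self-contained sketch of that integer-only comparison; the wrapper function and variable names outside the expression are illustrative, only the formula comes from the diff.

package main

import "fmt"

// shouldThrottle mirrors the guard in the scheduling loop above: compare the
// job's GPU request (cnt) plus a 1.3x safety margin on GPUs still being
// allocated against the GPUs currently free, using integer math (x10 / x13)
// instead of floating point.
func shouldThrottle(schedulingJobs, cnt, allocatingGPU, totalGPU, usingGPU int) bool {
	return schedulingJobs > 1 && cnt*10+allocatingGPU*13 > (totalGPU-usingGPU)*10
}

func main() {
	// 4 GPUs requested, 2 still being allocated, 8 total, 3 in use -> 5 free.
	// 4*10 + 2*13 = 66 > 5*10 = 50, so with more than one job in flight we back off.
	fmt.Println(shouldThrottle(2, 4, 2, 8, 3)) // true
}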
@@ -147,16 +138,6 @@ func (scheduler *SchedulerCapacity) UpdateProgress(job Job, state State) {
 	switch state {
 	case Running:
-		scheduler.queuesUsingGPUMu.Lock()
-		if _, ok := scheduler.queuesSchedulingCnt[job.Group]; ok {
-			scheduler.queuesSchedulingCnt[job.Group]--
-			if scheduler.queuesSchedulingCnt[job.Group] < 0 {
-				scheduler.queuesSchedulingCnt[job.Group] = 0
-				log.Warn("scheduler.queuesSchedulingCnt less than 0", job.Group)
-			}
-		}
-		scheduler.queuesUsingGPUMu.Unlock()
-
 		for i := range scheduler.history {
 			if scheduler.history[i].Name == job.Name {
 				scheduler.history[i].Status = Running
@@ -192,8 +173,8 @@ func (scheduler *SchedulerCapacity) UpdateProgress(job Job, state State) {
 }

 func (scheduler *SchedulerCapacity) Schedule(job Job) {
-	scheduler.queueMu.Lock()
-	defer scheduler.queueMu.Unlock()
+	scheduler.queuesMu.Lock()
+	defer scheduler.queuesMu.Unlock()

 	queue := job.Group
 	_, ok := scheduler.queues[queue]
@@ -287,9 +268,9 @@ func (scheduler *SchedulerCapacity) ReleaseResource(job Job, agent NodeStatus) {
 }

 func (scheduler *SchedulerCapacity) QueryState(jobName string) MsgJobStatus {
-	scheduler.queueMu.Lock()
+	scheduler.queuesMu.Lock()
 	jm, ok := scheduler.jobs[jobName]
-	scheduler.queueMu.Unlock()
+	scheduler.queuesMu.Unlock()
 	if !ok {
 		return MsgJobStatus{Code: 1, Error: "Job not exist!"}
 	}
@@ -297,9 +278,9 @@ func (scheduler *SchedulerCapacity) QueryState(jobName string) MsgJobStatus {
 }

 func (scheduler *SchedulerCapacity) Stop(jobName string) MsgStop {
-	scheduler.queueMu.Lock()
+	scheduler.queuesMu.Lock()
 	jm, ok := scheduler.jobs[jobName]
-	scheduler.queueMu.Unlock()
+	scheduler.queuesMu.Unlock()
 	if !ok {
 		return MsgStop{Code: 1, Error: "Job not exist!"}
 	}
@@ -307,9 +288,9 @@ func (scheduler *SchedulerCapacity) Stop(jobName string) MsgStop {
 }

 func (scheduler *SchedulerCapacity) QueryLogs(jobName string, taskName string) MsgLog {
-	scheduler.queueMu.Lock()
+	scheduler.queuesMu.Lock()
 	jm, ok := scheduler.jobs[jobName]
-	scheduler.queueMu.Unlock()
+	scheduler.queuesMu.Unlock()
 	if !ok {
 		return MsgLog{Code: 1, Error: "Job not exist!"}
 	}
@@ -347,11 +328,11 @@ func (scheduler *SchedulerCapacity) Summary() MsgSummary {
 	}
 	scheduler.historyMu.Unlock()

-	scheduler.queueMu.Lock()
+	scheduler.queuesMu.Lock()
 	for _, v := range scheduler.queues {
 		tmp = append(tmp, v...)
 	}
-	scheduler.queueMu.Unlock()
+	scheduler.queuesMu.Unlock()

 	for _, job := range tmp {
 		switch job.Status {
@@ -383,7 +364,7 @@ func (scheduler *SchedulerCapacity) UpdateNextQueue() {
 	NumberGPU := float64(InstanceOfResourcePool().TotalGPU) + 0.00001

-	scheduler.queueMu.Lock()
+	scheduler.queuesMu.Lock()
 	for k, t := range scheduler.queues {
 		if len(t) == 0 {
 			continue
@@ -407,7 +388,7 @@ func (scheduler *SchedulerCapacity) UpdateNextQueue() {
 		}
 	}
 	scheduler.nextQueue = next
-	scheduler.queueMu.Unlock()
+	scheduler.queuesMu.Unlock()
 	log.Debug("updateNextQueue ->", next)
 }
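
The two hunks above only rename the mutex, but they show the shape of UpdateNextQueue: normalise by the total GPU count (with a small epsilon so the division never hits zero), skip empty queues, and pick a next queue. A plausible self-contained sketch of such a least-share selection follows; the usingGPU map, the exact share metric, and the job representation are assumptions, not the file's code.

package main

import "fmt"

// nextQueue picks the non-empty queue with the smallest share of total GPUs.
// Sketch of the selection pattern visible in the hunks above.
func nextQueue(queues map[string][]string, usingGPU map[string]int, totalGPU int) string {
	numberGPU := float64(totalGPU) + 0.00001 // epsilon avoids division by zero
	next := ""
	best := -1.0
	for name, jobs := range queues {
		if len(jobs) == 0 {
			continue // empty queues never become the next queue
		}
		share := float64(usingGPU[name]) / numberGPU
		if best < 0 || share < best {
			next, best = name, share
		}
	}
	return next
}

func main() {
	queues := map[string][]string{"a": {"job1"}, "b": {"job2"}, "c": {}}
	using := map[string]int{"a": 6, "b": 2}
	fmt.Println(nextQueue(queues, using, 8)) // "b": smallest GPU share among non-empty queues
}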
@@ -430,11 +411,5 @@ func (scheduler *SchedulerCapacity) UpdateParallelism(parallelism int) bool {
 }

 func (scheduler *SchedulerCapacity) updateGroup(group Group) bool {
-	num := 0
-	for _, g := range InstanceOfGroupManager().List().Groups {
-		if g.Reserved {
-			num += g.NumGPU
-		}
-	}
 	return true
 }