Mirror of https://github.com/newnius/YAO-scheduler.git (synced 2025-06-08 14:51:55 +00:00)
update scheduler_fair
This commit is contained in: parent e178154ca0, commit 74373fb950
@@ -54,8 +54,7 @@ func (gm *GroupManager) Remove(group Group) MsgGroupCreate {
 func (gm *GroupManager) List() MsgGroupList {
	defer gm.mu.Unlock()
	gm.mu.Lock()
-	// cannot change to `var`, since it would be json_encoded to null
-	result := []Group{}
+	var result []Group
	for _, v := range gm.groups {
		result = append(result, v)
	}

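Note on this hunk: it reverts `result := []Group{}` to `var result []Group` and drops the guard comment. As the removed comment warns, a nil slice marshals to JSON null, so an empty group list will now encode as null rather than []. A minimal standalone demonstration of the difference (the one-field Group type here is a stand-in for the real one):

package main

import (
	"encoding/json"
	"fmt"
)

type Group struct{ Name string }

func main() {
	var nilSlice []Group    // nil slice, as declared with `var`
	emptySlice := []Group{} // empty but non-nil slice

	a, _ := json.Marshal(nilSlice)
	b, _ := json.Marshal(emptySlice)
	fmt.Println(string(a)) // prints: null
	fmt.Println(string(b)) // prints: []
}
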
@@ -28,7 +28,7 @@ func (jm *JobManager) start() {
	for i := range jm.job.Tasks {
		var resource NodeStatus
		for {
-			resource = jm.scheduler.AcquireResource(jm.job.Tasks[i])
+			resource = jm.scheduler.AcquireResource(jm.job, jm.job.Tasks[i])
			if len(resource.Status) > 0 {
				break
			}

@@ -92,7 +92,7 @@ func (jm *JobManager) start() {
		/* save logs etc. */

		/* return resource */
-		jm.scheduler.ReleaseResource(jm.resources[i])
+		jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
		fmt.Println("return resource ", jm.resources[i].ClientID)
	}
}

@@ -175,7 +175,7 @@ func (jm *JobManager) stop() MsgStop {
	}

	for i := range jm.resources {
-		jm.scheduler.ReleaseResource(jm.resources[i])
+		jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
	}
	jm.scheduler.UpdateProgress(jm.job.Name, Stopped)
	return MsgStop{Code: 0}

@@ -7,9 +7,9 @@ type Scheduler interface {

	UpdateProgress(jobName string, state State)

-	AcquireResource(Task) NodeStatus
+	AcquireResource(Job, Task) NodeStatus

-	ReleaseResource(NodeStatus)
+	ReleaseResource(Job, NodeStatus)

	AcquireNetwork() string

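The interface change threads the owning Job through resource acquisition and release, so an implementation can attribute usage to job.Group; SchedulerFair does exactly that further down. A partial sketch of what a conforming implementation now looks like, assuming the repository's Job, Task and NodeStatus types; the DummyScheduler name and its bodies are hypothetical, only the two signatures come from this diff:

type DummyScheduler struct{}

// AcquireResource now receives the Job so the scheduler can charge
// the allocation to job.Group; this stub grants nothing.
func (s *DummyScheduler) AcquireResource(job Job, task Task) NodeStatus {
	return NodeStatus{}
}

// ReleaseResource receives the same Job so per-group accounting
// can be decremented symmetrically on release.
func (s *DummyScheduler) ReleaseResource(job Job, agent NodeStatus) {
}
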
@@ -83,7 +83,7 @@ func (scheduler *SchedulerFCFS) Schedule(job Job) {
	job.Status = Created
}

-func (scheduler *SchedulerFCFS) AcquireResource(task Task) NodeStatus {
+func (scheduler *SchedulerFCFS) AcquireResource(job Job, task Task) NodeStatus {
	pool.mu.Lock()
	defer pool.mu.Unlock()

@@ -114,7 +114,7 @@ func (scheduler *SchedulerFCFS) AcquireResource(task Task) NodeStatus {
	return res
}

-func (scheduler *SchedulerFCFS) ReleaseResource(agent NodeStatus) {
+func (scheduler *SchedulerFCFS) ReleaseResource(job Job, agent NodeStatus) {
	pool.mu.Lock()
	defer pool.mu.Unlock()
	nodes := pool.nodes[agent.ClientID]

@@ -5,15 +5,23 @@ import (
	"time"
	log "github.com/sirupsen/logrus"
	"sort"
)

+type ResourceCount struct {
+	NumberGPU int
+	MemoryGPU int
+	CPU       int
+	Memory    int
+}
+
 type SchedulerFair struct {
	history    []*Job
	queues     map[string][]Job
	mu         sync.Mutex
	scheduling sync.Mutex
	jobs       map[string]*JobManager
	nextQueue  string
+	resourceAllocations map[string]ResourceCount
}

 type FairJobSorter []Job

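ResourceCount aggregates what a group currently holds across four dimensions, and resourceAllocations maps each group name to its running total. A hypothetical illustration of the bookkeeping for one job in group "default" holding 4 CPU cores, 8192 MB of RAM and two GPUs; all numbers are invented, and units are assumed to be MB, matching MemTotal/MemoryTotal elsewhere in the pool:

resourceAllocations := map[string]ResourceCount{}

cnt := resourceAllocations["default"] // zero value on first use
cnt.CPU += 4                          // CPU cores held by the task
cnt.Memory += 8192                    // main memory, MB
for i := 0; i < 2; i++ {
	cnt.NumberGPU++        // one increment per allocated GPU
	cnt.MemoryGPU += 11441 // that GPU's total memory, MB
}
resourceAllocations["default"] = cnt // store the updated copy back

Because ResourceCount is a value type, the final write-back is what actually updates the map; this matters for the accounting goroutines added below.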
@@ -37,16 +45,16 @@ func (scheduler *SchedulerFair) Start() {
	scheduler.nextQueue = "default"
	scheduler.queues = map[string][]Job{}
	scheduler.queues["default"] = []Job{}
+	scheduler.resourceAllocations = map[string]ResourceCount{}

	go func() {
		for {
-			log.Info("Scheduling")
+			log.Debug("Scheduling")
			time.Sleep(time.Second * 5)
			scheduler.scheduling.Lock()
			scheduler.mu.Lock()
			queue := scheduler.nextQueue
			if len(scheduler.queues[queue]) > 0 {

				jm := JobManager{}
				jm.job = scheduler.queues[queue][0]

@@ -59,10 +67,12 @@ func (scheduler *SchedulerFair) Start() {

				go func() {
					jm.start()
-					scheduler.UpdateNextQueue()
				}()
			} else {
				scheduler.scheduling.Unlock()
+				go func() {
+					scheduler.UpdateNextQueue()
+				}()
			}
			scheduler.mu.Unlock()
		}

@@ -140,7 +150,7 @@ func (scheduler *SchedulerFair) Schedule(job Job) {
	job.Status = Created
}

-func (scheduler *SchedulerFair) AcquireResource(task Task) NodeStatus {
+func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
	pool.mu.Lock()
	defer pool.mu.Unlock()

@@ -156,6 +166,8 @@ func (scheduler *SchedulerFair) AcquireResource(task Task) NodeStatus {
		res.ClientID = id
		res.ClientHost = node.ClientHost
		res.Status = available[0:task.NumberGPU]
+		res.NumCPU = task.NumberCPU
+		res.MemTotal = task.Memory

		for i := range res.Status {
			for j := range node.Status {

@@ -168,10 +180,23 @@ func (scheduler *SchedulerFair) AcquireResource(task Task) NodeStatus {
			break
		}
	}
+	go func(res NodeStatus) {
+		if _, ok := scheduler.resourceAllocations[job.Group]; !ok {
+			scheduler.resourceAllocations[job.Group] = ResourceCount{}
+		}
+		cnt, _ := scheduler.resourceAllocations[job.Group]
+		cnt.CPU += res.NumCPU
+		cnt.Memory += res.MemTotal
+		for _, v := range res.Status {
+			cnt.NumberGPU++
+			cnt.MemoryGPU += v.MemoryTotal
+		}
+		scheduler.UpdateNextQueue()
+	}(res)
	return res
}

-func (scheduler *SchedulerFair) ReleaseResource(agent NodeStatus) {
+func (scheduler *SchedulerFair) ReleaseResource(job Job, agent NodeStatus) {
	pool.mu.Lock()
	defer pool.mu.Unlock()
	nodes := pool.nodes[agent.ClientID]

@@ -182,6 +207,19 @@ func (scheduler *SchedulerFair) ReleaseResource(agent NodeStatus) {
			}
		}
	}
+	go func(res NodeStatus) {
+		if _, ok := scheduler.resourceAllocations[job.Group]; !ok {
+			scheduler.resourceAllocations[job.Group] = ResourceCount{}
+		}
+		cnt, _ := scheduler.resourceAllocations[job.Group]
+		cnt.CPU -= res.NumCPU
+		cnt.Memory -= res.MemTotal
+		for _, v := range res.Status {
+			cnt.NumberGPU--
+			cnt.MemoryGPU -= v.MemoryTotal
+		}
+		scheduler.UpdateNextQueue()
+	}(agent)
}

 func (scheduler *SchedulerFair) QueryState(jobName string) MsgJobStatus {

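Both accounting goroutines share a caveat. The CPU and memory deltas come from res.NumCPU and res.MemTotal, which AcquireResource fills from the task's request, but `cnt, _ := scheduler.resourceAllocations[job.Group]` copies the map entry, and the adjusted counts are never stored back, so resourceAllocations never actually changes; the map is also touched from several goroutines without a lock. A minimal race-free sketch of the same bookkeeping; the helper name, the sign parameter, and the extra allocationMu mutex are assumptions, not part of this commit:

// updateAllocation adjusts a group's running totals by sign
// (+1 on acquire, -1 on release) and re-evaluates the next queue.
// allocationMu is an assumed additional mutex guarding the map.
func (scheduler *SchedulerFair) updateAllocation(group string, res NodeStatus, sign int) {
	scheduler.allocationMu.Lock()
	cnt := scheduler.resourceAllocations[group] // zero value when absent
	cnt.CPU += sign * res.NumCPU
	cnt.Memory += sign * res.MemTotal
	for _, v := range res.Status {
		cnt.NumberGPU += sign
		cnt.MemoryGPU += sign * v.MemoryTotal
	}
	scheduler.resourceAllocations[group] = cnt // write the updated copy back
	scheduler.allocationMu.Unlock()

	scheduler.UpdateNextQueue()
}

With such a helper, the two call sites would reduce to `go scheduler.updateAllocation(job.Group, res, +1)` and `go scheduler.updateAllocation(job.Group, agent, -1)`.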
@@ -285,15 +323,36 @@ func (scheduler *SchedulerFair) ReleaseNetwork(network string) {
}

 func (scheduler *SchedulerFair) UpdateNextQueue() {
-	flag := false
-	for k := range scheduler.queues {
-		if flag {
-			scheduler.nextQueue = k
-			return
-		}
-		if k == scheduler.nextQueue {
-			flag = true
+	next := "default"
+	quota := 9999.0
+
+	NumberGPU := 0.00001
+	MemoryGPU := 0.00001
+	CPU := 0.00001
+	Memory := 0.0001
+	for _, node := range pool.nodes {
+		CPU += float64(node.NumCPU)
+		Memory += float64(node.MemTotal)
+		for _, card := range node.Status {
+			NumberGPU += 1.0
+			MemoryGPU += float64(card.MemoryTotal)
		}
	}
-	scheduler.nextQueue = "default"
+
+	for k, v := range scheduler.resourceAllocations {
+		tmp := 0.0
+		tmp += float64(v.CPU) / CPU
+		tmp += float64(v.Memory) / Memory
+		tmp += float64(v.NumberGPU) / NumberGPU
+		tmp += float64(v.MemoryGPU) / MemoryGPU
+		tmp /= 4
+		if tmp < quota {
+			quota = tmp
+			next = k
+		}
+	}
+	scheduler.nextQueue = next
+	log.Info("updateNextQueue")
+	log.Info(scheduler.resourceAllocations)
+	log.Info("updateNextQueue ->", next)
}

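The rewritten UpdateNextQueue replaces the old round-robin walk with a fair-share pick: it totals pool capacity per dimension (the small epsilon initial values keep the divisions safe on an empty pool), normalizes each group's allocation per dimension, averages over the four dimensions, and selects the group with the smallest share. Note that the loop ranges over resourceAllocations, so a queue only competes once it has an accounting entry; until then "default" is chosen. A self-contained toy calculation with invented numbers, where only the formula mirrors the commit:

package main

import "fmt"

func main() {
	// Hypothetical pool totals: 64 CPUs, 256 GB RAM, 8 GPUs, 91528 MB GPU memory.
	totalCPU, totalMem, totalGPU, totalGPUMem := 64.0, 262144.0, 8.0, 91528.0

	// Per-group allocations: {CPU, Memory MB, NumberGPU, MemoryGPU MB}.
	alloc := map[string][4]float64{
		"default":  {16, 65536, 4, 45764},
		"research": {4, 16384, 1, 11441},
	}

	next, quota := "default", 9999.0
	for group, v := range alloc {
		share := (v[0]/totalCPU + v[1]/totalMem + v[2]/totalGPU + v[3]/totalGPUMem) / 4
		fmt.Printf("%s: share=%.3f\n", group, share) // default: 0.375, research: 0.094
		if share < quota {
			quota, next = share, group
		}
	}
	fmt.Println("next queue:", next) // research, the least-served group
}
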
@@ -107,7 +107,7 @@ func (scheduler *SchedulerPriority) Schedule(job Job) {
	job.Status = Created
}

-func (scheduler *SchedulerPriority) AcquireResource(task Task) NodeStatus {
+func (scheduler *SchedulerPriority) AcquireResource(job Job, task Task) NodeStatus {
	pool.mu.Lock()
	defer pool.mu.Unlock()

@@ -138,7 +138,7 @@ func (scheduler *SchedulerPriority) AcquireResource(task Task) NodeStatus {
	return res
}

-func (scheduler *SchedulerPriority) ReleaseResource(agent NodeStatus) {
+func (scheduler *SchedulerPriority) ReleaseResource(job Job, agent NodeStatus) {
	pool.mu.Lock()
	defer pool.mu.Unlock()
	nodes := pool.nodes[agent.ClientID]