package main

import (
    "sort"
    "sync"
    "time"

    log "github.com/sirupsen/logrus"
)
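
// ResourceCount tracks the resources currently allocated to a group (queue):
// number of GPUs, total GPU memory, CPU cores, and host memory.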
type ResourceCount struct {
    NumberGPU int
    MemoryGPU int
    CPU       int
    Memory    int
}
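
// SchedulerFair schedules jobs across per-group queues, choosing the next
// queue to serve by comparing each group's share of allocated cluster
// resources (see UpdateNextQueue).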
type SchedulerFair struct {
    history             []*Job
    queues              map[string][]Job
    mu                  sync.Mutex
    scheduling          sync.Mutex
    jobs                map[string]*JobManager
    nextQueue           string
    resourceAllocations map[string]*ResourceCount
    enabled             bool
}
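
// FairJobSorter orders jobs by creation time, oldest first.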
type FairJobSorter []Job

func (s FairJobSorter) Len() int {
    return len(s)
}

func (s FairJobSorter) Swap(i, j int) {
    s[i], s[j] = s[j], s[i]
}

func (s FairJobSorter) Less(i, j int) bool {
    return s[i].CreatedAt < s[j].CreatedAt
}
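
// Start initializes the scheduler state and launches the background
// scheduling loop, which every 100ms launches the job at the head of the
// currently selected queue while the scheduler is enabled.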
func (scheduler *SchedulerFair) Start() {
    scheduler.jobs = map[string]*JobManager{}
    scheduler.history = []*Job{}
    scheduler.nextQueue = "default"
    scheduler.queues = map[string][]Job{}
    scheduler.queues["default"] = []Job{}
    scheduler.resourceAllocations = map[string]*ResourceCount{}
    scheduler.enabled = true

    go func() {
        for {
            log.Debug("Scheduling")
            time.Sleep(time.Millisecond * 100)
            if !scheduler.enabled {
                continue
            }

            // scheduling stays locked until the launched job reports Running
            // (see UpdateProgress), so only one job is started at a time.
            scheduler.scheduling.Lock()
            scheduler.mu.Lock()
            queue := scheduler.nextQueue
            if len(scheduler.queues[queue]) > 0 {
                jm := JobManager{}
                jm.job = scheduler.queues[queue][0]
                scheduler.queues[queue] = scheduler.queues[queue][1:]
                jm.scheduler = scheduler
                scheduler.jobs[jm.job.Name] = &jm

                jm.job.Status = Starting
                scheduler.history = append(scheduler.history, &jm.job)

                go func() {
                    jm.start()
                }()
            } else {
                log.Info("No more jobs to schedule", time.Now())
                scheduler.scheduling.Unlock()
                go func() {
                    scheduler.UpdateNextQueue()
                }()
            }
            scheduler.mu.Unlock()
        }
    }()
}
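
// UpdateProgress records a job's state transition in the scheduling history
// and, when a job reaches Running, releases the scheduling lock so the next
// job can be launched.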
func (scheduler *SchedulerFair) UpdateProgress(jobName string, state State) {
    switch state {
    case Running:
        scheduler.scheduling.Unlock()

        for i := range scheduler.history {
            if scheduler.history[i].Name == jobName {
                scheduler.history[i].Status = Running
                scheduler.history[i].UpdatedAt = int(time.Now().Unix())
            }
        }
    case Finished:
        for i := range scheduler.history {
            if scheduler.history[i].Name == jobName {
                scheduler.history[i].Status = Finished
                scheduler.history[i].UpdatedAt = int(time.Now().Unix())
            }
        }
    case Stopped:
        for i := range scheduler.history {
            if scheduler.history[i].Name == jobName {
                scheduler.history[i].Status = Stopped
                scheduler.history[i].UpdatedAt = int(time.Now().Unix())
            }
        }
    }
}
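
// Schedule inserts a job into its group's queue, keeping the queue sorted by
// descending priority; jobs whose group has no registered queue fall back to
// the default queue.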
func (scheduler *SchedulerFair) Schedule(job Job) {
    scheduler.mu.Lock()
    defer scheduler.mu.Unlock()

    queue := job.Group
    _, ok := scheduler.queues[queue]
    if !ok {
        if InstanceOfGroupManager().get(queue) != nil {
            scheduler.queues[queue] = []Job{}
        } else {
            queue = "default"
        }
    }

    // binary search for the insertion index that keeps the queue sorted
    // by descending priority
    index := 0
    left := 0
    right := len(scheduler.queues[queue]) - 1
    for left <= right {
        mid := (left + right) / 2
        if scheduler.queues[queue][left].Priority < job.Priority {
            index = left
            break
        }
        if scheduler.queues[queue][right].Priority >= job.Priority {
            index = right + 1
            break
        }
        if scheduler.queues[queue][mid].Priority >= job.Priority {
            left = mid + 1
        } else {
            right = mid - 1
        }
    }

    // mark the job as Created before it is copied into the queue
    job.Status = Created
    scheduler.queues[queue] = append(scheduler.queues[queue], Job{})
    copy(scheduler.queues[queue][index+1:], scheduler.queues[queue][index:])
    scheduler.queues[queue][index] = job
}
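
// AcquireResource selects the first node with enough GPUs that have
// sufficient free memory for the task, reserves them on that node, and
// records the allocation against the job's group before returning the
// chosen node status.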
func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
    pool.mu.Lock()
    defer pool.mu.Unlock()

    res := NodeStatus{}
    for id, node := range pool.nodes {
        var available []GPUStatus
        for _, status := range node.Status {
            if status.MemoryTotal-status.MemoryAllocated >= task.MemoryGPU {
                available = append(available, status)
            }
        }
        if len(available) >= task.NumberGPU {
            res.ClientID = id
            res.ClientHost = node.ClientHost
            res.Status = available[0:task.NumberGPU]
            res.NumCPU = task.NumberCPU
            res.MemTotal = task.Memory

            for i := range res.Status {
                for j := range node.Status {
                    if res.Status[i].UUID == node.Status[j].UUID {
                        node.Status[j].MemoryAllocated += task.MemoryGPU
                        res.Status[i].MemoryTotal = task.MemoryGPU
                    }
                }
            }
            break
        }
    }

    go func(res NodeStatus) {
        if len(res.Status) == 0 {
            return
        }
        if _, ok := scheduler.resourceAllocations[job.Group]; !ok {
            scheduler.resourceAllocations[job.Group] = &ResourceCount{}
        }
        // account the allocation to the job's group
        cnt := scheduler.resourceAllocations[job.Group]
        cnt.CPU += res.NumCPU
        cnt.Memory += res.MemTotal
        for _, v := range res.Status {
            cnt.NumberGPU++
            cnt.MemoryGPU += v.MemoryTotal
        }
        scheduler.UpdateNextQueue()
    }(res)

    return res
}
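
// ReleaseResource returns the GPUs reserved for a job back to their node and
// subtracts the allocation from the job's group counters.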
func (scheduler *SchedulerFair) ReleaseResource(job Job, agent NodeStatus) {
    pool.mu.Lock()
    defer pool.mu.Unlock()
    node := pool.nodes[agent.ClientID]
    for _, gpu := range agent.Status {
        for j := range node.Status {
            if gpu.UUID == node.Status[j].UUID {
                node.Status[j].MemoryAllocated -= gpu.MemoryTotal
                if node.Status[j].MemoryAllocated < 0 {
                    // in case of error
                    log.Warn(node.ClientID, " released more GPU memory than allocated, resetting to 0")
                    node.Status[j].MemoryAllocated = 0
                }
            }
        }
    }

    go func(res NodeStatus) {
        if _, ok := scheduler.resourceAllocations[job.Group]; !ok {
            scheduler.resourceAllocations[job.Group] = &ResourceCount{}
        }
        cnt := scheduler.resourceAllocations[job.Group]
        cnt.CPU -= res.NumCPU
        cnt.Memory -= res.MemTotal
        for _, v := range res.Status {
            cnt.NumberGPU--
            cnt.MemoryGPU -= v.MemoryTotal
        }
        scheduler.UpdateNextQueue()
    }(agent)
}
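
// QueryState returns the current status of a job, or an error if the job is
// unknown to the scheduler.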
func (scheduler *SchedulerFair) QueryState(jobName string) MsgJobStatus {
    jm, ok := scheduler.jobs[jobName]
    if !ok {
        return MsgJobStatus{Code: 1, Error: "Job does not exist!"}
    }
    return jm.status()
}
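
// Stop asks the job's manager to stop the job.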
func (scheduler *SchedulerFair) Stop(jobName string) MsgStop {
    jm, ok := scheduler.jobs[jobName]
    if !ok {
        return MsgStop{Code: 1, Error: "Job does not exist!"}
    }
    return jm.stop()
}
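
// QueryLogs returns the logs of one task of a job.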
func (scheduler *SchedulerFair) QueryLogs(jobName string, taskName string) MsgLog {
    jm, ok := scheduler.jobs[jobName]
    if !ok {
        return MsgLog{Code: 1, Error: "Job does not exist!"}
    }
    return jm.logs(taskName)
}
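
// ListJobs returns all jobs known to the scheduler: the history of launched
// jobs followed by the queued jobs sorted by creation time.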
func (scheduler *SchedulerFair) ListJobs() MsgJobList {
    var jobs []Job
    for _, job := range scheduler.history {
        jobs = append(jobs, *job)
    }
    var tmp []Job
    for _, v := range scheduler.queues {
        tmp = append(tmp, v...)
    }
    sort.Sort(FairJobSorter(tmp))
    jobs = append(jobs, tmp...)
    return MsgJobList{Code: 0, Jobs: jobs}
}
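
// Summary reports aggregate counters: jobs by state (pending, running,
// finished) and the number of free and in-use GPUs across the pool.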
func (scheduler *SchedulerFair) Summary() MsgSummary {
    summary := MsgSummary{}
    summary.Code = 0

    finishedJobsCounter := 0
    runningJobsCounter := 0
    pendingJobsCounter := 0

    var tmp []Job
    for _, job := range scheduler.history {
        tmp = append(tmp, *job)
    }
    for _, v := range scheduler.queues {
        tmp = append(tmp, v...)
    }

    for _, job := range tmp {
        switch job.Status {
        case Created, Starting:
            pendingJobsCounter++
        case Running:
            runningJobsCounter++
        case Finished, Stopped:
            finishedJobsCounter++
        }
    }
    summary.JobsFinished = finishedJobsCounter
    summary.JobsPending = pendingJobsCounter
    summary.JobsRunning = runningJobsCounter

    FreeGPU := 0
    UsingGPU := 0

    for _, node := range pool.nodes {
        for j := range node.Status {
            if node.Status[j].MemoryAllocated == 0 {
                FreeGPU++
            } else {
                UsingGPU++
            }
        }
    }
    summary.FreeGPU = FreeGPU
    summary.UsingGPU = UsingGPU

    return summary
}
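
// AcquireNetwork requests a network name from the resource pool.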
func (scheduler *SchedulerFair) AcquireNetwork() string {
    return pool.acquireNetwork()
}
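
// ReleaseNetwork returns a network to the resource pool.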
func (scheduler *SchedulerFair) ReleaseNetwork(network string) {
    pool.releaseNetwork(network)
}
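
// UpdateNextQueue recomputes which non-empty queue should be served next by
// comparing each group's weighted share of allocated CPU, memory, GPU count,
// and GPU memory against the cluster totals; the queue with the smallest
// share is selected.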
func (scheduler *SchedulerFair) UpdateNextQueue() {
    next := "default"
    quota := 9999.0

    // cluster totals, seeded with small epsilons to avoid division by zero
    NumberGPU := 0.00001
    MemoryGPU := 0.00001
    CPU := 0.00001
    Memory := 0.00001
    for _, node := range pool.nodes {
        CPU += float64(node.NumCPU)
        Memory += float64(node.MemTotal)
        for _, card := range node.Status {
            NumberGPU += 1.0
            MemoryGPU += float64(card.MemoryTotal)
        }
    }

    for k, t := range scheduler.queues {
        if len(t) == 0 {
            continue
        }
        if _, ok := scheduler.resourceAllocations[k]; !ok {
            scheduler.resourceAllocations[k] = &ResourceCount{}
        }
        v := scheduler.resourceAllocations[k]

        // average the group's share across the four resource dimensions,
        // then divide by the group's weight so heavier groups are preferred
        tmp := 0.0
        tmp += float64(v.CPU) / CPU
        tmp += float64(v.Memory) / Memory
        tmp += float64(v.NumberGPU) / NumberGPU
        tmp += float64(v.MemoryGPU) / MemoryGPU
        tmp /= 4
        weight := 10
        if g, ok2 := InstanceOfGroupManager().groups[k]; ok2 {
            weight = g.Weight
        }
        tmp /= float64(weight)
        if tmp < quota {
            quota = tmp
            next = k
        }
    }
    scheduler.nextQueue = next
    log.Debug("updateNextQueue ->", next)
}
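
// Attach attaches a GPU to a job in the resource pool.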
func (scheduler *SchedulerFair) Attach(GPU string, job string) {
    pool.attach(GPU, job)
}
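
// Detach detaches a GPU from a job in the resource pool.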
func (scheduler *SchedulerFair) Detach(GPU string, job string) {
    pool.detach(GPU, job)
}
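
// Enable turns the scheduling loop back on.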
func (scheduler *SchedulerFair) Enable() bool {
    scheduler.enabled = true
    log.Info("scheduler is enabled")
    return true
}
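
// Disable pauses the scheduling loop; queued jobs remain queued until the
// scheduler is enabled again.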
func (scheduler *SchedulerFair) Disable() bool {
    scheduler.enabled = false
    log.Info("scheduler is disabled")
    return true
}