// YAO-scheduler/src/scheduler_fair.go
// Mirror of https://github.com/newnius/YAO-scheduler.git
package main

import (
	"math/rand"
	"sort"
	"sync"
	"time"

	log "github.com/sirupsen/logrus"
)

type ResourceCount struct {
	NumberGPU int
	MemoryGPU int
	CPU       int
	Memory    int
}
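// SchedulerFair schedules jobs from per-group queues. It tracks per-queue GPU
// usage and scheduling activity, and supports GPU sharing, pre-scheduling onto
// GPUs that are about to be freed, and reserved (capacity) queues.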
type SchedulerFair struct {
	history   []*Job
	historyMu sync.Mutex

	nextQueue string
	jobs      map[string]*JobManager
	queues    map[string][]Job
	queueMu   sync.Mutex

	schedulingJobsCnt int
	schedulingMu      sync.Mutex

	resourceAllocations   map[string]*ResourceCount
	resourceAllocationsMu sync.Mutex

	enabled     bool
	parallelism int

	enableShare            bool
	enableShareRatio       float64
	enablePreSchedule      bool
	enablePreScheduleRatio float64

	UsingGPU   int
	UsingGPUMu sync.Mutex

	allocatingGPU   int
	allocatingGPUMu sync.Mutex

	reservedGPU         int
	queuesSchedulingCnt map[string]int
	queueUsingGPU       map[string]int
	queuesUsingGPUMu    sync.Mutex

	mu sync.Mutex
}

type FairJobSorter []Job

func (s FairJobSorter) Len() int {
	return len(s)
}
func (s FairJobSorter) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}
func (s FairJobSorter) Less(i, j int) bool {
	return s[i].CreatedAt < s[j].CreatedAt
}
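// Start initializes the scheduler state and launches two scheduling loops:
// the fair-share loop, which pops jobs from the queue selected by
// UpdateNextQueue subject to GPU availability and the configured parallelism,
// and a capacity loop that serves queues whose groups have reserved GPUs.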
func (scheduler *SchedulerFair) Start() {
	log.Info("JS started")
	scheduler.jobs = map[string]*JobManager{}
	scheduler.history = []*Job{}
	scheduler.nextQueue = "default"
	scheduler.queues = map[string][]Job{}
	scheduler.queues["default"] = []Job{}
	scheduler.resourceAllocations = map[string]*ResourceCount{}
	scheduler.enabled = true
	scheduler.schedulingJobsCnt = 0
	scheduler.queueUsingGPU = map[string]int{}

	scheduler.enableShare = true
	scheduler.enableShareRatio = 0.75
	scheduler.enablePreSchedule = true
	scheduler.enablePreScheduleRatio = 0.95
	scheduler.UsingGPU = 0
	scheduler.allocatingGPU = 0
	scheduler.queuesSchedulingCnt = map[string]int{}

	scheduler.parallelism = 1

	go func() {
		/* fair scheduler */
		flag := true
		for {
			log.Debug("Scheduling")
			if !flag {
				time.Sleep(time.Millisecond * 100)
			}
			flag = false
			if !scheduler.enabled {
				time.Sleep(time.Millisecond * 100)
				continue
			}
			scheduler.schedulingMu.Lock()
			if scheduler.schedulingJobsCnt >= scheduler.parallelism {
				scheduler.schedulingMu.Unlock()
				time.Sleep(time.Millisecond * 100)
				continue
			}
			scheduler.schedulingJobsCnt++
			scheduler.schedulingMu.Unlock()

			scheduler.queueMu.Lock()
			queue := scheduler.nextQueue
			go func() {
				scheduler.UpdateNextQueue()
			}()
			if len(scheduler.queues[queue]) > 0 {
				jm := JobManager{}
				jm.job = scheduler.queues[queue][0]

				cnt := 0
				for _, task := range jm.job.Tasks {
					cnt += task.NumberGPU
				}
				reserved := scheduler.reservedGPU
				scheduler.queuesUsingGPUMu.Lock()
				for g, v := range scheduler.queueUsingGPU {
					if InstanceOfGroupManager().groups[g].Reserved {
						reserved -= v
					}
				}
				scheduler.queuesUsingGPUMu.Unlock()

				log.Info(cnt, reserved, pool.TotalGPU, scheduler.UsingGPU, scheduler.allocatingGPU)
				if scheduler.schedulingJobsCnt > 1 && (cnt*10+(scheduler.allocatingGPU)*13 > (pool.TotalGPU-scheduler.UsingGPU-reserved)*10) {
					scheduler.schedulingMu.Lock()
					scheduler.schedulingJobsCnt--
					scheduler.schedulingMu.Unlock()
					scheduler.queueMu.Unlock()
					continue
				}
				flag = true
				scheduler.allocatingGPUMu.Lock()
				scheduler.allocatingGPU += cnt
				scheduler.allocatingGPUMu.Unlock()
				log.Info("allocatingGPU is ", scheduler.allocatingGPU)
				log.Info("schedulingJobsCnt is ", scheduler.schedulingJobsCnt)

				scheduler.queues[queue] = scheduler.queues[queue][1:]
				jm.scheduler = scheduler
				scheduler.jobs[jm.job.Name] = &jm
				jm.job.Status = Starting
				scheduler.historyMu.Lock()
				scheduler.history = append(scheduler.history, &jm.job)
				scheduler.historyMu.Unlock()

				scheduler.queuesUsingGPUMu.Lock()
				scheduler.queuesSchedulingCnt[jm.job.Group]++
				scheduler.queuesUsingGPUMu.Unlock()

				go func() {
					jm.start()
				}()
			} else {
				log.Debug("No more jobs to schedule ", time.Now())
				scheduler.schedulingMu.Lock()
				scheduler.schedulingJobsCnt--
				scheduler.schedulingMu.Unlock()
			}
			scheduler.queueMu.Unlock()
		}
	}()

	/* schedule capacity queues */
	go func() {
		for {
			flag := false
			scheduler.queueMu.Lock()
			for q, t := range scheduler.queues {
				if len(t) == 0 || !InstanceOfGroupManager().groups[t[0].Group].Reserved {
					continue
				}
				//log.Info(scheduler.queueUsingGPU)
				//log.Info(scheduler.queuesSchedulingCnt)
				scheduler.queuesUsingGPUMu.Lock()
				if cnt, ok := scheduler.queuesSchedulingCnt[t[0].Group]; ok && cnt > 0 {
					scheduler.queuesUsingGPUMu.Unlock()
					continue
				}
				scheduler.queuesUsingGPUMu.Unlock()
				numberGPU := 0
				for _, v := range t[0].Tasks {
					numberGPU += v.NumberGPU
				}
				available := InstanceOfGroupManager().groups[t[0].Group].NumGPU
				scheduler.queuesUsingGPUMu.Lock()
				if cnt, ok := scheduler.queueUsingGPU[t[0].Group]; ok {
					available -= cnt
				}
				scheduler.queuesUsingGPUMu.Unlock()
				if pool.TotalGPU-scheduler.UsingGPU-scheduler.allocatingGPU*13/10 < 0 {
					continue
				}
				if numberGPU <= available {
					jm := JobManager{}
					jm.job = scheduler.queues[q][0]
					scheduler.schedulingMu.Lock()
					scheduler.schedulingJobsCnt++
					scheduler.schedulingMu.Unlock()
					scheduler.queuesUsingGPUMu.Lock()
					scheduler.queuesSchedulingCnt[jm.job.Group]++
					scheduler.queuesUsingGPUMu.Unlock()
					scheduler.allocatingGPUMu.Lock()
					scheduler.allocatingGPU += numberGPU
					scheduler.allocatingGPUMu.Unlock()
					log.Info("allocatingGPU is ", scheduler.allocatingGPU)
					log.Info("schedulingJobsCnt is ", scheduler.schedulingJobsCnt)
					scheduler.queues[q] = scheduler.queues[q][1:]
					jm.scheduler = scheduler
					scheduler.jobs[jm.job.Name] = &jm
					jm.job.Status = Starting
					scheduler.historyMu.Lock()
					scheduler.history = append(scheduler.history, &jm.job)
					scheduler.historyMu.Unlock()
					go func() {
						jm.start()
					}()
					flag = true
				}
			}
			scheduler.queueMu.Unlock()
			if !flag {
				time.Sleep(time.Millisecond * 100)
			}
		}
	}()
}
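// UpdateProgress records a job state change (Running, Finished, Stopped, or
// Failed) in the job history and, when a job enters Running, releases its slot
// in the scheduling counters.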
func (scheduler *SchedulerFair) UpdateProgress(job Job, state State) {
	scheduler.historyMu.Lock()
	defer scheduler.historyMu.Unlock()
	switch state {
	case Running:
		scheduler.schedulingMu.Lock()
		scheduler.schedulingJobsCnt--
		scheduler.schedulingMu.Unlock()
		scheduler.queuesUsingGPUMu.Lock()
		if _, ok := scheduler.queuesSchedulingCnt[job.Group]; ok {
			scheduler.queuesSchedulingCnt[job.Group]--
			if scheduler.queuesSchedulingCnt[job.Group] < 0 {
				scheduler.queuesSchedulingCnt[job.Group] = 0
				log.Warn("scheduler.queuesSchedulingCnt less than 0", job.Group)
			}
		}
		scheduler.queuesUsingGPUMu.Unlock()
		for i := range scheduler.history {
			if scheduler.history[i].Name == job.Name {
				scheduler.history[i].Status = Running
				scheduler.history[i].UpdatedAt = int(time.Now().Unix())
			}
		}
	case Finished:
		for i := range scheduler.history {
			if scheduler.history[i].Name == job.Name {
				scheduler.history[i].Status = Finished
				scheduler.history[i].UpdatedAt = int(time.Now().Unix())
			}
		}
	case Stopped:
		for i := range scheduler.history {
			if scheduler.history[i].Name == job.Name {
				scheduler.history[i].Status = Stopped
				scheduler.history[i].UpdatedAt = int(time.Now().Unix())
			}
		}
	case Failed:
		for i := range scheduler.history {
			if scheduler.history[i].Name == job.Name {
				scheduler.history[i].Status = Failed
				scheduler.history[i].UpdatedAt = int(time.Now().Unix())
			}
		}
	}
}
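// Schedule enqueues a job into its group's queue (falling back to "default"
// when the group is unknown), keeping the queue ordered by descending priority
// with FIFO order among equal priorities.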
func (scheduler *SchedulerFair) Schedule(job Job) {
	scheduler.queueMu.Lock()
	defer scheduler.queueMu.Unlock()

	queue := job.Group
	_, ok := scheduler.queues[queue]
	if !ok {
		if InstanceOfGroupManager().get(queue) != nil {
			scheduler.queues[queue] = []Job{}
		} else {
			queue = "default"
		}
	}

	index := 0
	left := 0
	right := len(scheduler.queues[queue]) - 1
	for left <= right {
		mid := (left + right) / 2
		if scheduler.queues[queue][left].Priority < job.Priority {
			index = left
			break
		}
		if scheduler.queues[queue][right].Priority >= job.Priority {
			index = right + 1
			break
		}
		if scheduler.queues[queue][mid].Priority >= job.Priority {
			left = mid + 1
		} else {
			right = mid - 1
		}
	}

	scheduler.queues[queue] = append(scheduler.queues[queue], Job{})
	copy(scheduler.queues[queue][index+1:], scheduler.queues[queue][index:])
	scheduler.queues[queue][index] = job
	job.Status = Created
}
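// AcquireResource picks a node and a set of GPUs for one task. It tries, in
// order: sharable GPUs (when utilization predictions allow co-location),
// fully vacant GPUs, and GPUs predicted to be released soon (pre-schedule).
// The chosen GPUs are bound to the job and usage counters are updated.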
func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []NodeStatus) NodeStatus {
	segID := rand.Intn(pool.poolsCount)
	if pool.TotalGPU < 100 {
		segID = 0
	}
	res := NodeStatus{}
	start := &pool.pools[segID]
	if start.Nodes == nil {
		start = start.Next
	}

	locks := map[int]*sync.Mutex{}

	allocationType := 0
	availableGPUs := map[string][]GPUStatus{}

	var candidates []*NodeStatus

	/* first, choose sharable GPUs */
	if scheduler.enableShare && (pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) >= scheduler.enableShareRatio) {
		// check sharable
		allocationType = 1
		if util, valid := InstanceOfOptimizer().predictUtilGPU(job.Name); valid {
			for cur := start; cur.ID < cur.Next.ID; {
				if _, ok := locks[cur.ID]; !ok {
					cur.Lock.Lock()
					locks[cur.ID] = &cur.Lock
				}
				for _, node := range cur.Nodes {
					var available []GPUStatus
					for _, status := range node.Status {
						if status.MemoryAllocated > 0 && status.MemoryTotal > task.MemoryGPU+status.MemoryAllocated {
							if jobs, ok := pool.bindings[status.UUID]; ok {
								totalUtil := util
								for job := range jobs {
									if utilT, ok := InstanceOfOptimizer().predictUtilGPU(job); ok {
										totalUtil += utilT
									} else {
										totalUtil += 100
									}
								}
								if totalUtil < 100 {
									available = append(available, status)
									availableGPUs[node.ClientID] = available
								}
							}
						}
					}
					if len(available) >= task.NumberGPU {
						candidates = append(candidates, node)
						if len(candidates) >= 8 {
							break
						}
					}
				}
				if len(candidates) >= 8 {
					break
				}
				cur = cur.Next
				if cur.ID == start.ID {
					break
				}
			}
		}
		//log.Info(candidates)
	}
	/* second round, find vacant gpu */
	flag := false
	reserved := scheduler.reservedGPU
	scheduler.queuesUsingGPUMu.Lock()
	for g, v := range scheduler.queueUsingGPU {
		if InstanceOfGroupManager().groups[g].Reserved {
			reserved -= v
		}
	}
	scheduler.queuesUsingGPUMu.Unlock()
	if g, ok := InstanceOfGroupManager().groups[job.Group]; ok && g.Reserved && g.NumGPU > scheduler.queueUsingGPU[job.Group] {
		flag = true
	}
	if task.NumberGPU <= pool.TotalGPU-scheduler.UsingGPU-reserved {
		flag = true
	}
	if len(candidates) == 0 && flag {
		allocationType = 2
		for cur := start; cur.ID < cur.Next.ID; {
			if _, ok := locks[cur.ID]; !ok {
				cur.Lock.Lock()
				locks[cur.ID] = &cur.Lock
			}
			for _, node := range cur.Nodes {
				var available []GPUStatus
				for _, status := range node.Status {
					if status.MemoryAllocated == 0 && status.MemoryUsed < 10 {
						available = append(available, status)
					}
				}
				if len(available) >= task.NumberGPU {
					candidates = append(candidates, node)
					availableGPUs[node.ClientID] = available
					if len(candidates) >= 8 {
						break
					}
				}
			}
			if len(candidates) >= 8 {
				break
			}
			cur = cur.Next
			if cur.ID == start.ID {
				break
			}
		}
		//log.Info(candidates)
	}
	/* third round, find gpu to be released */
	if len(candidates) == 0 && len(job.Tasks) == 1 && task.NumberGPU == 1 && scheduler.enablePreSchedule {
		estimate, valid := InstanceOfOptimizer().predictTime(job.Name)

		//log.Info(pool.TotalGPU)
		//log.Info(estimate, valid)
		//log.Info(scheduler.UsingGPU)

		if pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) >= scheduler.enablePreScheduleRatio && valid {
			allocationType = 3
			for cur := start; cur.ID < cur.Next.ID; {
				if _, ok := locks[cur.ID]; !ok {
					cur.Lock.Lock()
					locks[cur.ID] = &cur.Lock
				}
				for _, node := range cur.Nodes {
					var available []GPUStatus
					for _, status := range node.Status {
						bindings := pool.getBindings()
						if tasks, ok := bindings[status.UUID]; ok {
							if len(tasks) > 1 || status.MemoryAllocated == 0 {
								continue
							}
							for task_t, s := range tasks {
								est, valid2 := InstanceOfOptimizer().predictTime(task_t)
								if valid2 {
									now := int(time.Now().Unix())
									log.Info(s, now, estimate, est)
									if now-s > est.Total-est.Post-estimate.Pre-15 {
										available = append(available, status)
									}
								}
							}
						}
					}
					if len(available) >= task.NumberGPU {
						candidates = append(candidates, node)
						availableGPUs[node.ClientID] = available
						if len(candidates) >= 8 {
							break
						}
					}
				}
				if len(candidates) >= 8 {
					break
				}
				// advance to the next segment (same traversal as the earlier rounds)
				cur = cur.Next
				if cur.ID == start.ID {
					break
				}
			}
			//log.Info(candidates)
		}
	}
	if len(candidates) > 0 {
		log.Info("allocationType is ", allocationType)
		//log.Info(candidates)
	}

	/* assign */
	if len(candidates) > 0 {
		node := pool.pickNode(candidates, availableGPUs, task, job, nodes)
		res.ClientID = node.ClientID
		res.ClientHost = node.ClientHost
		res.Status = availableGPUs[node.ClientID][0:task.NumberGPU]
		res.NumCPU = task.NumberCPU
		res.MemTotal = task.Memory
		for i := range res.Status {
			for j := range node.Status {
				if res.Status[i].UUID == node.Status[j].UUID {
					if node.Status[j].MemoryAllocated == 0 {
						scheduler.UsingGPUMu.Lock()
						scheduler.UsingGPU++
						scheduler.UsingGPUMu.Unlock()
					}
					node.Status[j].MemoryAllocated += task.MemoryGPU
					res.Status[i].MemoryTotal = task.MemoryGPU
				}
			}
		}
		for _, t := range res.Status {
			scheduler.Attach(t.UUID, job.Name)
		}
		scheduler.queuesUsingGPUMu.Lock()
		scheduler.queueUsingGPU[job.Group] += task.NumberGPU
		scheduler.queuesUsingGPUMu.Unlock()
		scheduler.allocatingGPUMu.Lock()
		scheduler.allocatingGPU -= task.NumberGPU
		scheduler.allocatingGPUMu.Unlock()
		log.Info("allocatingGPU is ", scheduler.allocatingGPU)
	}

	for segID, lock := range locks {
		log.Debug("Unlock ", segID)
		lock.Unlock()
	}

	go func(res NodeStatus) {
		if len(res.Status) == 0 {
			return
		}
		scheduler.resourceAllocationsMu.Lock()
		if _, ok := scheduler.resourceAllocations[job.Group]; !ok {
			scheduler.resourceAllocations[job.Group] = &ResourceCount{}
		}
		cnt := scheduler.resourceAllocations[job.Group]
		cnt.CPU += res.NumCPU
		cnt.Memory += res.MemTotal
		for _, v := range res.Status {
			cnt.NumberGPU++
			cnt.MemoryGPU += v.MemoryTotal
		}
		scheduler.resourceAllocationsMu.Unlock()
		scheduler.UpdateNextQueue()
	}(res)
	return res
}
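// ReleaseResource returns the GPUs described by agent to the pool, updating
// per-GPU memory accounting, per-queue usage, and the group's resource totals.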
func (scheduler *SchedulerFair) ReleaseResource(job Job, agent NodeStatus) {
	segID := pool.getNodePool(agent.ClientID)
	seg := &pool.pools[segID]
	if seg.Nodes == nil {
		seg = seg.Next
	}
	seg.Lock.Lock()
	defer seg.Lock.Unlock()

	node := seg.Nodes[agent.ClientID]
	for _, gpu := range agent.Status {
		for j := range node.Status {
			if gpu.UUID == node.Status[j].UUID {
				node.Status[j].MemoryAllocated -= gpu.MemoryTotal
				if node.Status[j].MemoryAllocated < 0 {
					// in case of error
					log.Warn(node.ClientID, "More Memory Allocated")
					node.Status[j].MemoryAllocated = 0
				}
				if node.Status[j].MemoryAllocated == 0 {
					scheduler.UsingGPUMu.Lock()
					scheduler.UsingGPU--
					scheduler.UsingGPUMu.Unlock()
					log.Info(node.Status[j].UUID, " is released")
				}
				//log.Info(node.Status[j].MemoryAllocated)
			}
		}
	}
	scheduler.queuesUsingGPUMu.Lock()
	if _, ok := scheduler.queueUsingGPU[job.Group]; ok {
		scheduler.queueUsingGPU[job.Group] -= len(agent.Status)
		if scheduler.queueUsingGPU[job.Group] < 0 {
			log.Warn("queueUsingGPU exceeded ", scheduler.queueUsingGPU[job.Group])
			scheduler.queueUsingGPU[job.Group] = 0
		}
	}
	scheduler.queuesUsingGPUMu.Unlock()

	go func(res NodeStatus) {
		scheduler.resourceAllocationsMu.Lock()
		if _, ok := scheduler.resourceAllocations[job.Group]; !ok {
			scheduler.resourceAllocations[job.Group] = &ResourceCount{}
		}
		cnt := scheduler.resourceAllocations[job.Group]
		cnt.CPU -= res.NumCPU
		cnt.Memory -= res.MemTotal
		for _, v := range res.Status {
			cnt.NumberGPU--
			cnt.MemoryGPU -= v.MemoryTotal
		}
		scheduler.resourceAllocationsMu.Unlock()
		scheduler.UpdateNextQueue()
	}(agent)
}
func (scheduler *SchedulerFair) QueryState(jobName string) MsgJobStatus {
	scheduler.queueMu.Lock()
	jm, ok := scheduler.jobs[jobName]
	scheduler.queueMu.Unlock()
	if !ok {
		return MsgJobStatus{Code: 1, Error: "Job does not exist!"}
	}
	return jm.status()
}

func (scheduler *SchedulerFair) Stop(jobName string) MsgStop {
	scheduler.queueMu.Lock()
	jm, ok := scheduler.jobs[jobName]
	scheduler.queueMu.Unlock()
	if !ok {
		return MsgStop{Code: 1, Error: "Job does not exist!"}
	}
	return jm.stop()
}

func (scheduler *SchedulerFair) QueryLogs(jobName string, taskName string) MsgLog {
	scheduler.queueMu.Lock()
	jm, ok := scheduler.jobs[jobName]
	scheduler.queueMu.Unlock()
	if !ok {
		return MsgLog{Code: 1, Error: "Job does not exist!"}
	}
	return jm.logs(taskName)
}

func (scheduler *SchedulerFair) ListJobs() MsgJobList {
	var jobs []Job
	scheduler.historyMu.Lock()
	for _, job := range scheduler.history {
		jobs = append(jobs, *job)
	}
	scheduler.historyMu.Unlock()
	var tmp []Job
	for _, v := range scheduler.queues {
		tmp = append(tmp, v...)
	}
	sort.Sort(FairJobSorter(tmp))
	jobs = append(jobs, tmp...)
	return MsgJobList{Code: 0, Jobs: jobs}
}
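// Summary reports job counts by state plus the number of free and in-use GPUs
// across all pool segments.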
func (scheduler *SchedulerFair) Summary() MsgSummary {
	summary := MsgSummary{}
	summary.Code = 0

	finishedJobsCounter := 0
	runningJobsCounter := 0
	pendingJobsCounter := 0

	var tmp []Job
	scheduler.historyMu.Lock()
	for _, job := range scheduler.history {
		tmp = append(tmp, *job)
	}
	scheduler.historyMu.Unlock()
	scheduler.queueMu.Lock()
	for _, v := range scheduler.queues {
		tmp = append(tmp, v...)
	}
	scheduler.queueMu.Unlock()

	for _, job := range tmp {
		switch job.Status {
		case Created:
			pendingJobsCounter++
		case Starting:
			pendingJobsCounter++
		case Running:
			runningJobsCounter++
		case Finished:
			finishedJobsCounter++
		case Stopped:
			finishedJobsCounter++
		}
	}
	summary.JobsFinished = finishedJobsCounter
	summary.JobsPending = pendingJobsCounter
	summary.JobsRunning = runningJobsCounter

	FreeGPU := 0
	UsingGPU := 0
	start := pool.pools[0].Next
	for cur := start; ; {
		cur.Lock.Lock()
		for _, node := range cur.Nodes {
			for j := range node.Status {
				if node.Status[j].MemoryAllocated == 0 {
					FreeGPU++
				} else {
					UsingGPU++
				}
			}
		}
		cur.Lock.Unlock()
		cur = cur.Next
		if cur.ID == start.ID {
			break
		}
	}
	summary.FreeGPU = FreeGPU
	summary.UsingGPU = UsingGPU
	return summary
}

func (scheduler *SchedulerFair) AcquireNetwork() string {
	return pool.acquireNetwork()
}

func (scheduler *SchedulerFair) ReleaseNetwork(network string) {
	pool.releaseNetwork(network)
}
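// UpdateNextQueue recomputes which queue should be served next: for every
// non-empty queue it normalizes the group's allocated CPU, memory, and GPU
// share against the cluster totals, divides by the group's weight, and picks
// the queue with the smallest weighted share.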
func (scheduler *SchedulerFair) UpdateNextQueue() {
	next := "default"
	quota := 9999.0

	NumberGPU := 0.00001
	MemoryGPU := 0.00001
	CPU := 0.00001
	Memory := 0.0001
	start := pool.pools[0].Next
	for cur := start; ; {
		cur.Lock.Lock()
		for _, node := range cur.Nodes {
			CPU += float64(node.NumCPU)
			Memory += float64(node.MemTotal)
			for _, card := range node.Status {
				NumberGPU += 1.0
				MemoryGPU += float64(card.MemoryTotal)
			}
		}
		cur.Lock.Unlock()
		cur = cur.Next
		if cur == start {
			break
		}
	}

	scheduler.queueMu.Lock()
	for k, t := range scheduler.queues {
		if len(t) == 0 {
			continue
		}
		scheduler.resourceAllocationsMu.Lock()
		if _, ok := scheduler.resourceAllocations[k]; !ok {
			scheduler.resourceAllocations[k] = &ResourceCount{}
		}
		v := scheduler.resourceAllocations[k]
		tmp := 0.0
		tmp += float64(v.CPU) / CPU
		tmp += float64(v.Memory) / Memory
		tmp += float64(v.NumberGPU) / NumberGPU
		tmp += float64(v.MemoryGPU) / MemoryGPU
		scheduler.resourceAllocationsMu.Unlock()
		tmp /= 4

		weight := 10
		if g, ok2 := InstanceOfGroupManager().groups[k]; ok2 {
			weight = g.Weight
		}
		tmp /= float64(weight)

		if tmp < quota {
			quota = tmp
			next = k
		}
	}
	scheduler.nextQueue = next
	scheduler.queueMu.Unlock()
	log.Debug("updateNextQueue -> ", next)
}

func (scheduler *SchedulerFair) Attach(GPU string, job string) {
	pool.attach(GPU, job)
}

func (scheduler *SchedulerFair) Detach(GPU string, job Job) {
	pool.detach(GPU, job)
}

func (scheduler *SchedulerFair) Enable() bool {
	scheduler.enabled = true
	log.Info("scheduler is enabled ", time.Now())
	return true
}

func (scheduler *SchedulerFair) Disable() bool {
	scheduler.enabled = false
	log.Info("scheduler is disabled ", time.Now())
	return true
}

func (scheduler *SchedulerFair) UpdateParallelism(parallelism int) bool {
	scheduler.parallelism = parallelism
	log.Info("parallelism is updated to ", parallelism)
	return true
}

func (scheduler *SchedulerFair) SetShareRatio(ratio float64) bool {
	scheduler.enableShareRatio = ratio
	log.Info("enableShareRatio is updated to ", ratio)
	return true
}

func (scheduler *SchedulerFair) SetPreScheduleRatio(ratio float64) bool {
	scheduler.enablePreScheduleRatio = ratio
	log.Info("enablePreScheduleRatio is updated to ", ratio)
	return true
}
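// updateGroup refreshes the total number of GPUs reserved by capacity groups,
// which the scheduling loops subtract from the pool when admitting jobs from
// non-reserved queues.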
func (scheduler *SchedulerFair) updateGroup(group Group) bool {
	num := 0
	for _, g := range InstanceOfGroupManager().List().Groups {
		if g.Reserved {
			num += g.NumGPU
		}
	}
	scheduler.queuesUsingGPUMu.Lock()
	scheduler.reservedGPU = num
	scheduler.queuesUsingGPUMu.Unlock()
	return true
}