Mirror of https://github.com/newnius/YAO-scheduler.git (synced 2025-06-07 14:21:55 +00:00)

Commit 51976911e1 ("update")
Parent: eda386fdf6
@@ -51,6 +51,7 @@ func (jm *JobManager) start() {
 		/* sleep random Millisecond to avoid deadlock */
 		time.Sleep(time.Millisecond * time.Duration(500+rand.Intn(500)))
 	}
+	jm.job.StartedAt = time.Now().Unix()

 	if InstanceOfConfiguration().mock {
 		jm.scheduler.UpdateProgress(jm.job, Running)
@@ -46,7 +46,7 @@ type ResourcePool struct {
 	networksFree map[string]bool
 	networkMu    sync.Mutex

-	bindings   map[string]map[string]int
+	bindings   map[string]map[string]Job
 	bindingsMu sync.Mutex
 	utils      map[string][]UtilGPUTimeSeries

@@ -75,7 +75,7 @@ func (pool *ResourcePool) init(conf Configuration) {
 	pool.networks = map[string]bool{}
 	pool.networksFree = map[string]bool{}

-	pool.bindings = map[string]map[string]int{}
+	pool.bindings = map[string]map[string]Job{}
 	pool.utils = map[string][]UtilGPUTimeSeries{}

 	pool.TotalGPU = 0
@@ -555,7 +555,7 @@ func (pool *ResourcePool) releaseNetwork(network string) {
 	pool.networkMu.Unlock()
 }

-func (pool *ResourcePool) attach(GPU string, job string) {
+func (pool *ResourcePool) attach(GPU string, job Job) {
 	pool.subscriptionsMu.Lock()
 	defer pool.subscriptionsMu.Unlock()
 	pool.bindingsMu.Lock()
@@ -564,12 +564,12 @@ func (pool *ResourcePool) attach(GPU string, job string) {
 	if _, ok := pool.subscriptions[GPU]; !ok {
 		pool.subscriptions[GPU] = map[string]int{}
 	}
-	pool.subscriptions[GPU][job] = int(time.Now().Unix())
+	pool.subscriptions[GPU][job.Name] = int(time.Now().Unix())

 	if _, ok := pool.bindings[GPU]; !ok {
-		pool.bindings[GPU] = map[string]int{}
+		pool.bindings[GPU] = map[string]Job{}
 	}
-	pool.bindings[GPU][job] = int(time.Now().Unix())
+	pool.bindings[GPU][job.Name] = job

 	if _, ok := pool.utils[GPU]; !ok {
 		pool.utils[GPU] = []UtilGPUTimeSeries{}
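With this hunk, bindings stores the bound Job itself (keyed by GPU UUID, then job name) instead of an attach timestamp. Below is a minimal, self-contained sketch of that layout, not the project's real code: Job and ResourcePool are stripped down to the fields used here, and the real attach also updates subscriptions and utils under their own locks.

package main

import (
	"fmt"
	"sync"
	"time"
)

// Stand-ins for the real types; the actual Job and ResourcePool carry many
// more fields (tasks, subscriptions, utils, ...).
type Job struct {
	Name      string
	StartedAt int64
}

type ResourcePool struct {
	bindings   map[string]map[string]Job // GPU UUID -> job name -> Job
	bindingsMu sync.Mutex
}

// attach records the whole Job under the GPU it is bound to, so later passes
// can read fields such as StartedAt straight from the binding.
func (pool *ResourcePool) attach(GPU string, job Job) {
	pool.bindingsMu.Lock()
	defer pool.bindingsMu.Unlock()
	if _, ok := pool.bindings[GPU]; !ok {
		pool.bindings[GPU] = map[string]Job{}
	}
	pool.bindings[GPU][job.Name] = job
}

func main() {
	pool := &ResourcePool{bindings: map[string]map[string]Job{}}
	pool.attach("GPU-0", Job{Name: "job-a", StartedAt: time.Now().Unix()})
	for name, j := range pool.bindings["GPU-0"] {
		fmt.Println(name, "started at", j.StartedAt)
	}
}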
@@ -609,7 +609,7 @@ func (pool *ResourcePool) countGPU() (int, int) {
 	return pool.TotalGPU - pool.UsingGPU, pool.UsingGPU
 }

-func (pool *ResourcePool) getBindings() map[string]map[string]int {
+func (pool *ResourcePool) getBindings() map[string]map[string]Job {
 	return pool.bindings
 }

@@ -743,15 +743,14 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
 	if pool.TotalGPU == 0 {
 		return []NodeStatus{}
 	}
-	//loadRatio := float64(pool.UsingGPU) / float64(pool.TotalGPU)
-
+	loadRatio := float64(pool.UsingGPU) / float64(pool.TotalGPU)

 	/* first, choose sharable GPUs */
-	/*
 	if pool.enableShare && len(job.Tasks) == 1 && task.NumberGPU == 1 && loadRatio >= pool.enableShareRatio {
 		// check sharable
 		allocationType = 1
 		pred := InstanceOfOptimizer().PredictReq(job, "Worker")
+		availables := map[string][]GPUStatus{}
 		for cur := start; ; {
 			if _, ok := locks[cur.ID]; !ok {
 				cur.Lock.Lock()
@@ -765,14 +764,11 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {

 				if jobs, ok := pool.bindings[status.UUID]; ok {
 					totalUtil := pred.UtilGPU
-					for job := range jobs {
-						if utilT, ok := InstanceOfOptimizer().predictUtilGPU(job); ok {
+					for _, job := range jobs {
+						utilT := InstanceOfOptimizer().PredictReq(job, "Worker").UtilGPU
 						totalUtil += utilT
-						} else {
-							totalUtil += 100
-						}
 					}
-					if totalUtil < 100 {
+					if totalUtil < 200 {
 						available = append(available, status)
 					}
 				}
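In the re-enabled sharable-GPU pass, a GPU stays a candidate only while the incoming job's predicted utilization plus the predictions for every job already bound to it stays below the new threshold of 200; the old fallback of charging 100 for jobs without a prediction is gone. A rough, standalone sketch of that admission test follows, where predictUtil is a hypothetical stand-in for InstanceOfOptimizer().PredictReq(job, "Worker").UtilGPU and the numbers are made up for the example.

package main

import "fmt"

type Job struct{ Name string }

// predictUtil is a hypothetical stand-in for the optimizer's predicted GPU
// utilization (in percent) for a job's Worker task.
func predictUtil(job Job) int {
	return map[string]int{"job-a": 60, "job-b": 70}[job.Name]
}

// sharable sums the incoming job's predicted utilization with the predictions
// for every job already bound to the GPU and admits the GPU while the total
// stays below 200, mirroring the check in the re-enabled first round.
func sharable(incoming Job, bound map[string]Job) bool {
	totalUtil := predictUtil(incoming)
	for _, job := range bound {
		totalUtil += predictUtil(job)
	}
	return totalUtil < 200
}

func main() {
	bound := map[string]Job{"job-a": {Name: "job-a"}}
	fmt.Println(sharable(Job{Name: "job-b"}, bound)) // 60 + 70 = 130 < 200 -> true
}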
@@ -780,6 +776,7 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
 				}
 				if len(available) >= task.NumberGPU {
 					candidates = append(candidates, *node)
+					availables[node.ClientHost] = available
 					if len(candidates) >= len(job.Tasks)*3+5 {
 						break
 					}
@@ -793,8 +790,35 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
 			}
 			cur = cur.Next
 		}
+
+		if len(candidates) > 0 {
+			node := candidates[0]
+			res := NodeStatus{}
+			res.ClientID = node.ClientID
+			res.ClientHost = node.ClientHost
+			res.NumCPU = task.NumberCPU
+			res.MemTotal = task.Memory
+			res.Status = availables[node.ClientHost][0:task.NumberGPU]
+
+			for i := range res.Status {
+				for j := range node.Status {
+					if res.Status[i].UUID == node.Status[j].UUID {
+						if node.Status[j].MemoryAllocated == 0 {
+							pool.UsingGPUMu.Lock()
+							pool.UsingGPU++
+							pool.UsingGPUMu.Unlock()
+						}
+						node.Status[j].MemoryAllocated += task.MemoryGPU
+						res.Status[i].MemoryTotal = task.MemoryGPU
+					}
+				}
+			}
+			for _, t := range res.Status {
+				pool.attach(t.UUID, job)
+			}
+			return []NodeStatus{res}
+		}
 	}
-	*/
 	//log.Info(candidates)

 	/* second round, find vacant gpu */
@@ -832,12 +856,12 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
 	}

 	/* third round, find gpu to be released */
-	/*
 	if len(candidates) == 0 && len(job.Tasks) == 1 && task.NumberGPU == 1 && pool.enablePreSchedule {
 		estimate := InstanceOfOptimizer().PredictTime(job)

 		if loadRatio >= pool.enablePreScheduleRatio {
 			allocationType = 3
+			availables := map[string][]GPUStatus{}
 			for cur := start; ; {
 				if _, ok := locks[cur.ID]; !ok {
 					cur.Lock.Lock()
@@ -847,15 +871,14 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
 				var available []GPUStatus
 				for _, status := range node.Status {
 					bindings := pool.getBindings()
-					if tasks, ok := bindings[status.UUID]; ok {
-						if len(tasks) > 1 || status.MemoryAllocated == 0 {
+					if jobs, ok := bindings[status.UUID]; ok {
+						if len(jobs) > 1 || status.MemoryAllocated == 0 {
 							continue
 						}
-						for taskT, s := range tasks {
-							est := InstanceOfOptimizer().PredictTime(taskT)
-							now := (int)(time.Now().Unix())
-							log.Info(s, now, estimate, est)
-							if now-s > est.Total-est.Post-estimate.Pre-15 {
+						for _, jobT := range jobs {
+							est := InstanceOfOptimizer().PredictTime(jobT)
+							now := time.Now().Unix()
+							if int(now-jobT.StartedAt) > est.Total-est.Post-estimate.Pre-15 {
 								available = append(available, status)
 							}
 						}
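Because bindings now carry full Job values, the pre-schedule pass measures a resident job's elapsed time from job.StartedAt rather than the old per-binding timestamp, and keeps a GPU when now-StartedAt exceeds est.Total-est.Post-estimate.Pre-15. Below is a hedged sketch of just that predicate; Estimate and the numeric values are stand-ins for the optimizer's PredictTime output, not the project's real API.

package main

import (
	"fmt"
	"time"
)

type Job struct {
	Name      string
	StartedAt int64
}

// Estimate is a stand-in for the optimizer's predicted timing, all in
// seconds: total runtime plus pre- and post-processing phases.
type Estimate struct {
	Pre, Post, Total int
}

// aboutToFinish mirrors the re-enabled third-round test: the GPU is a
// candidate if the resident job is expected to reach its post-processing
// phase before the incoming job (whose estimate is `incoming`) finishes its
// own preparation phase, within a 15-second margin.
func aboutToFinish(resident Job, residentEst, incoming Estimate, now int64) bool {
	return int(now-resident.StartedAt) > residentEst.Total-residentEst.Post-incoming.Pre-15
}

func main() {
	now := time.Now().Unix()
	resident := Job{Name: "job-a", StartedAt: now - 550} // running for 550s
	residentEst := Estimate{Pre: 30, Post: 20, Total: 600}
	incoming := Estimate{Pre: 40, Post: 20, Total: 300}
	fmt.Println(aboutToFinish(resident, residentEst, incoming, now)) // 550 > 525 -> true
}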
@@ -863,6 +886,7 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
 					}
 					if len(available) >= task.NumberGPU {
 						candidates = append(candidates, *node)
+						availables[node.ClientHost] = available
 						if len(candidates) >= len(job.Tasks)*3+5 {
 							break
 						}
@@ -877,9 +901,35 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
 				cur = cur.Next
 			}
 			//log.Info(candidates)
+			if len(candidates) > 0 {
+				node := candidates[0]
+				res := NodeStatus{}
+				res.ClientID = node.ClientID
+				res.ClientHost = node.ClientHost
+				res.NumCPU = task.NumberCPU
+				res.MemTotal = task.Memory
+				res.Status = availables[node.ClientHost][0:task.NumberGPU]
+
+				for i := range res.Status {
+					for j := range node.Status {
+						if res.Status[i].UUID == node.Status[j].UUID {
+							if node.Status[j].MemoryAllocated == 0 {
+								pool.UsingGPUMu.Lock()
+								pool.UsingGPU++
+								pool.UsingGPUMu.Unlock()
+							}
+							node.Status[j].MemoryAllocated += task.MemoryGPU
+							res.Status[i].MemoryTotal = task.MemoryGPU
+						}
+					}
+				}
+				for _, t := range res.Status {
+					pool.attach(t.UUID, job)
+				}
+				return []NodeStatus{res}
+			}
 		}
 	}
-	*/

 	if len(candidates) > 0 {
 		log.Info("allocationType is ", allocationType)
@@ -967,7 +1017,7 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
 			}
 		}
 		for _, t := range res.Status {
-			pool.attach(t.UUID, task.Job)
+			pool.attach(t.UUID, job)
 		}

 		flag := false
@@ -18,6 +18,7 @@ type Job struct {
 	Priority JobPriority `json:"priority"`
 	RunBefore int `json:"run_before"`
 	CreatedAt int `json:"created_at"`
+	StartedAt int64 `json:"started_at"`
 	UpdatedAt int `json:"updated_at"`
 	CreatedBy int `json:"created_by"`
 	Locality int `json:"locality"`
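The new StartedAt field is an int64 assigned time.Now().Unix() in JobManager.start() (first hunk) and serialized as started_at. A tiny sketch of how the field round-trips through JSON, using only the fields touched in this diff and illustrative timestamps:

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// Only the fields relevant to this hunk; the real Job declares many more.
type Job struct {
	CreatedAt int   `json:"created_at"`
	StartedAt int64 `json:"started_at"`
	UpdatedAt int   `json:"updated_at"`
}

func main() {
	job := Job{CreatedAt: 1591500000, UpdatedAt: 1591500300}
	// JobManager.start() stamps the job once it actually launches.
	job.StartedAt = time.Now().Unix()
	out, _ := json.Marshal(job)
	fmt.Println(string(out)) // {"created_at":1591500000,"started_at":...,"updated_at":1591500300}
}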