1
0
mirror of https://github.com/newnius/YAO-scheduler.git synced 2025-12-13 07:46:43 +00:00
This commit is contained in:
2020-05-01 12:48:06 +08:00
parent e880c21e82
commit 72ba430f4d
3 changed files with 76 additions and 8 deletions

View File

@@ -9,6 +9,7 @@ import (
"math/rand"
"strconv"
"hash/fnv"
"sort"
)
type ResourcePool struct {
@@ -303,16 +304,82 @@ func (pool *ResourcePool) getBindings() map[string]map[string]int {
return pool.bindings
}
func (pool *ResourcePool) pickNode(nodes []*NodeStatus) *NodeStatus {
func (pool *ResourcePool) pickNode(candidates []*NodeStatus, availableGPUs map[string][]GPUStatus, task Task, job Job, nodes []NodeStatus) *NodeStatus {
/* shuffle */
r := rand.New(rand.NewSource(time.Now().Unix()))
for n := len(nodes); n > 0; n-- {
for n := len(candidates); n > 0; n-- {
randIndex := r.Intn(n)
nodes[n-1], nodes[randIndex] = nodes[randIndex], nodes[n-1]
candidates[n-1], candidates[randIndex] = candidates[randIndex], candidates[n-1]
}
/* sort */
// single node, single GPU
sort.Slice(candidates, func(a, b int) bool {
diffA := pool.GPUModelToPower(candidates[a].Status[0].ProductName) - pool.GPUModelToPower(task.ModelGPU)
diffB := pool.GPUModelToPower(candidates[b].Status[0].ProductName) - pool.GPUModelToPower(task.ModelGPU)
return nodes[0]
if diffA > 0 && diffB >= 0 && diffA > diffB {
return false //b
}
if diffA < 0 && diffB < 0 && diffA > diffB {
return false
}
if diffA < 0 && diffB >= 0 {
return false
}
if diffA == diffB {
if len(availableGPUs[candidates[a].ClientID]) == len(availableGPUs[candidates[b].ClientID]) {
return candidates[a].UtilCPU > candidates[b].UtilCPU
}
return len(availableGPUs[candidates[a].ClientID]) < len(availableGPUs[candidates[b].ClientID])
}
return true //a
})
var t []*NodeStatus
bestGPU := candidates[0].Status[0].ProductName
for _, node := range candidates {
if node.Status[0].ProductName != bestGPU {
break
}
t = append(t, node)
}
candidates = t
if (len(job.Tasks) == 1) && task.NumberGPU > 1 { //single node, multi GPUs
sort.Slice(candidates, func(a, b int) bool {
if len(availableGPUs[candidates[a].ClientID]) == len(availableGPUs[candidates[b].ClientID]) {
return candidates[a].UtilCPU > candidates[b].UtilCPU
}
return len(availableGPUs[candidates[a].ClientID]) < len(availableGPUs[candidates[b].ClientID])
})
}
if len(job.Tasks) > 1 { //multi nodes, multi GPUs
sort.Slice(candidates, func(a, b int) bool {
distanceA := 0
distanceB := 0
for _, node := range nodes {
if node.Rack != candidates[a].Rack {
distanceA += 10
}
if node.ClientID != candidates[a].ClientID {
distanceA += 1
}
if node.Rack != candidates[b].Rack {
distanceB += 10
}
if node.ClientID != candidates[b].ClientID {
distanceB += 1
}
}
if distanceA == distanceB {
return len(availableGPUs[candidates[a].ClientID]) > len(availableGPUs[candidates[b].ClientID])
}
return distanceA*job.Locality < distanceB*job.Locality
})
}
return candidates[0]
}

View File

@@ -330,10 +330,9 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
for task_t, s := range tasks {
est, valid2 := InstanceOfOptimizer().predictTime(task_t)
if valid2 {
t := s
now := (int)(time.Now().Unix())
log.Info(t, now, estimate, est)
if now-t > est.Total-est.Post-estimate.Pre && status.MemoryFree > task.MemoryGPU {
log.Info(s, now, estimate, est)
if now-s > est.Total-est.Post-estimate.Pre-15 && status.MemoryFree > task.MemoryGPU {
available = append(available, status)
}
}
@@ -363,7 +362,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
/* assign */
if len(candidates) > 0 {
node := pool.pickNode(candidates)
node := pool.pickNode(candidates, availableGPUs, task, job, nodes)
res.ClientID = node.ClientID
res.ClientHost = node.ClientHost
res.Status = availableGPUs[node.ClientID][0:task.NumberGPU]

View File

@@ -111,6 +111,7 @@ type GPUStatus struct {
type NodeStatus struct {
ClientID string `json:"id"`
ClientHost string `json:"host"`
Rack int `json:"rack"`
Version float64 `json:"version"`
NumCPU int `json:"cpu_num"`
UtilCPU float64 `json:"cpu_load"`
@@ -130,6 +131,7 @@ type Job struct {
CreatedAt int `json:"created_at"`
UpdatedAt int `json:"updated_at"`
CreatedBy int `json:"created_by"`
Locality int `json:"created_by"`
Status State `json:"status"`
}