mirror of
https://github.com/newnius/YAO-scheduler.git
synced 2025-12-15 16:16:44 +00:00
update
This commit is contained in:
@@ -9,6 +9,7 @@ import (
|
|||||||
"math/rand"
|
"math/rand"
|
||||||
"strconv"
|
"strconv"
|
||||||
"hash/fnv"
|
"hash/fnv"
|
||||||
|
"sort"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ResourcePool struct {
|
type ResourcePool struct {
|
||||||
@@ -303,16 +304,82 @@ func (pool *ResourcePool) getBindings() map[string]map[string]int {
|
|||||||
return pool.bindings
|
return pool.bindings
|
||||||
}
|
}
|
||||||
|
|
||||||
func (pool *ResourcePool) pickNode(nodes []*NodeStatus) *NodeStatus {
|
func (pool *ResourcePool) pickNode(candidates []*NodeStatus, availableGPUs map[string][]GPUStatus, task Task, job Job, nodes []NodeStatus) *NodeStatus {
|
||||||
|
|
||||||
/* shuffle */
|
/* shuffle */
|
||||||
r := rand.New(rand.NewSource(time.Now().Unix()))
|
r := rand.New(rand.NewSource(time.Now().Unix()))
|
||||||
for n := len(nodes); n > 0; n-- {
|
for n := len(candidates); n > 0; n-- {
|
||||||
randIndex := r.Intn(n)
|
randIndex := r.Intn(n)
|
||||||
nodes[n-1], nodes[randIndex] = nodes[randIndex], nodes[n-1]
|
candidates[n-1], candidates[randIndex] = candidates[randIndex], candidates[n-1]
|
||||||
}
|
}
|
||||||
|
|
||||||
/* sort */
|
/* sort */
|
||||||
|
// single node, single GPU
|
||||||
|
sort.Slice(candidates, func(a, b int) bool {
|
||||||
|
diffA := pool.GPUModelToPower(candidates[a].Status[0].ProductName) - pool.GPUModelToPower(task.ModelGPU)
|
||||||
|
diffB := pool.GPUModelToPower(candidates[b].Status[0].ProductName) - pool.GPUModelToPower(task.ModelGPU)
|
||||||
|
|
||||||
return nodes[0]
|
if diffA > 0 && diffB >= 0 && diffA > diffB {
|
||||||
|
return false //b
|
||||||
|
}
|
||||||
|
if diffA < 0 && diffB < 0 && diffA > diffB {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if diffA < 0 && diffB >= 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if diffA == diffB {
|
||||||
|
if len(availableGPUs[candidates[a].ClientID]) == len(availableGPUs[candidates[b].ClientID]) {
|
||||||
|
return candidates[a].UtilCPU > candidates[b].UtilCPU
|
||||||
|
}
|
||||||
|
return len(availableGPUs[candidates[a].ClientID]) < len(availableGPUs[candidates[b].ClientID])
|
||||||
|
}
|
||||||
|
return true //a
|
||||||
|
})
|
||||||
|
|
||||||
|
var t []*NodeStatus
|
||||||
|
bestGPU := candidates[0].Status[0].ProductName
|
||||||
|
for _, node := range candidates {
|
||||||
|
if node.Status[0].ProductName != bestGPU {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
t = append(t, node)
|
||||||
|
}
|
||||||
|
candidates = t
|
||||||
|
|
||||||
|
if (len(job.Tasks) == 1) && task.NumberGPU > 1 { //single node, multi GPUs
|
||||||
|
sort.Slice(candidates, func(a, b int) bool {
|
||||||
|
if len(availableGPUs[candidates[a].ClientID]) == len(availableGPUs[candidates[b].ClientID]) {
|
||||||
|
return candidates[a].UtilCPU > candidates[b].UtilCPU
|
||||||
|
}
|
||||||
|
return len(availableGPUs[candidates[a].ClientID]) < len(availableGPUs[candidates[b].ClientID])
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(job.Tasks) > 1 { //multi nodes, multi GPUs
|
||||||
|
sort.Slice(candidates, func(a, b int) bool {
|
||||||
|
distanceA := 0
|
||||||
|
distanceB := 0
|
||||||
|
for _, node := range nodes {
|
||||||
|
if node.Rack != candidates[a].Rack {
|
||||||
|
distanceA += 10
|
||||||
|
}
|
||||||
|
if node.ClientID != candidates[a].ClientID {
|
||||||
|
distanceA += 1
|
||||||
|
}
|
||||||
|
if node.Rack != candidates[b].Rack {
|
||||||
|
distanceB += 10
|
||||||
|
}
|
||||||
|
if node.ClientID != candidates[b].ClientID {
|
||||||
|
distanceB += 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if distanceA == distanceB {
|
||||||
|
return len(availableGPUs[candidates[a].ClientID]) > len(availableGPUs[candidates[b].ClientID])
|
||||||
|
}
|
||||||
|
return distanceA*job.Locality < distanceB*job.Locality
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
return candidates[0]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -330,10 +330,9 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
|
|||||||
for task_t, s := range tasks {
|
for task_t, s := range tasks {
|
||||||
est, valid2 := InstanceOfOptimizer().predictTime(task_t)
|
est, valid2 := InstanceOfOptimizer().predictTime(task_t)
|
||||||
if valid2 {
|
if valid2 {
|
||||||
t := s
|
|
||||||
now := (int)(time.Now().Unix())
|
now := (int)(time.Now().Unix())
|
||||||
log.Info(t, now, estimate, est)
|
log.Info(s, now, estimate, est)
|
||||||
if now-t > est.Total-est.Post-estimate.Pre && status.MemoryFree > task.MemoryGPU {
|
if now-s > est.Total-est.Post-estimate.Pre-15 && status.MemoryFree > task.MemoryGPU {
|
||||||
available = append(available, status)
|
available = append(available, status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -363,7 +362,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
|
|||||||
|
|
||||||
/* assign */
|
/* assign */
|
||||||
if len(candidates) > 0 {
|
if len(candidates) > 0 {
|
||||||
node := pool.pickNode(candidates)
|
node := pool.pickNode(candidates, availableGPUs, task, job, nodes)
|
||||||
res.ClientID = node.ClientID
|
res.ClientID = node.ClientID
|
||||||
res.ClientHost = node.ClientHost
|
res.ClientHost = node.ClientHost
|
||||||
res.Status = availableGPUs[node.ClientID][0:task.NumberGPU]
|
res.Status = availableGPUs[node.ClientID][0:task.NumberGPU]
|
||||||
|
|||||||
@@ -111,6 +111,7 @@ type GPUStatus struct {
|
|||||||
type NodeStatus struct {
|
type NodeStatus struct {
|
||||||
ClientID string `json:"id"`
|
ClientID string `json:"id"`
|
||||||
ClientHost string `json:"host"`
|
ClientHost string `json:"host"`
|
||||||
|
Rack int `json:"rack"`
|
||||||
Version float64 `json:"version"`
|
Version float64 `json:"version"`
|
||||||
NumCPU int `json:"cpu_num"`
|
NumCPU int `json:"cpu_num"`
|
||||||
UtilCPU float64 `json:"cpu_load"`
|
UtilCPU float64 `json:"cpu_load"`
|
||||||
@@ -130,6 +131,7 @@ type Job struct {
|
|||||||
CreatedAt int `json:"created_at"`
|
CreatedAt int `json:"created_at"`
|
||||||
UpdatedAt int `json:"updated_at"`
|
UpdatedAt int `json:"updated_at"`
|
||||||
CreatedBy int `json:"created_by"`
|
CreatedBy int `json:"created_by"`
|
||||||
|
Locality int `json:"created_by"`
|
||||||
Status State `json:"status"`
|
Status State `json:"status"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user