mirror of
https://github.com/newnius/YAO-scheduler.git
synced 2025-06-07 14:21:55 +00:00
update
This commit is contained in:
parent
e8c6ea53e2
commit
6f01eef504
@ -1,10 +1,12 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
type Evaluator struct {
|
type Evaluator struct {
|
||||||
domains map[string]map[string]int
|
domains map[string]map[string]map[string]int
|
||||||
racks map[string]map[string]int
|
racks map[string]map[string]map[string]int
|
||||||
nodes map[string]map[string]int
|
nodes map[string]map[string]map[string]int
|
||||||
upstreams map[string]string
|
//upstreams map[string]map[string]string
|
||||||
|
totalPSs map[string]int
|
||||||
|
totalWorkers map[string]int
|
||||||
totalPS int
|
totalPS int
|
||||||
totalWorker int
|
totalWorker int
|
||||||
|
|
||||||
@ -19,10 +21,12 @@ type Evaluator struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (eva *Evaluator) init(nodes []NodeStatus, tasks []Task) {
|
func (eva *Evaluator) init(nodes []NodeStatus, tasks []Task) {
|
||||||
eva.domains = map[string]map[string]int{}
|
eva.domains = map[string]map[string]map[string]int{}
|
||||||
eva.racks = map[string]map[string]int{}
|
eva.racks = map[string]map[string]map[string]int{}
|
||||||
eva.nodes = map[string]map[string]int{}
|
eva.nodes = map[string]map[string]map[string]int{}
|
||||||
eva.upstreams = map[string]string{}
|
//eva.upstreams = map[string]string{}
|
||||||
|
eva.totalPSs = map[string]int{}
|
||||||
|
eva.totalWorkers = map[string]int{}
|
||||||
eva.totalPS = 0
|
eva.totalPS = 0
|
||||||
eva.totalWorker = 0
|
eva.totalWorker = 0
|
||||||
eva.factorNode = 1.0
|
eva.factorNode = 1.0
|
||||||
@ -34,33 +38,42 @@ func (eva *Evaluator) init(nodes []NodeStatus, tasks []Task) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (eva *Evaluator) add(node NodeStatus, task Task) {
|
func (eva *Evaluator) add(node NodeStatus, task Task) {
|
||||||
|
if _, ok := eva.nodes[task.Job]; !ok {
|
||||||
|
eva.nodes[task.Job] = map[string]map[string]int{}
|
||||||
|
eva.racks[task.Job] = map[string]map[string]int{}
|
||||||
|
eva.domains[task.Job] = map[string]map[string]int{}
|
||||||
|
eva.totalPSs = map[string]int{}
|
||||||
|
eva.totalWorkers = map[string]int{}
|
||||||
|
}
|
||||||
/* update network cost */
|
/* update network cost */
|
||||||
if _, ok := eva.nodes[node.ClientID]; !ok {
|
if _, ok := eva.nodes[task.Job][node.ClientID]; !ok {
|
||||||
eva.nodes[node.ClientID] = map[string]int{"PS": 0, "Worker": 0}
|
eva.nodes[task.Job][node.ClientID] = map[string]int{"PS": 0, "Worker": 0}
|
||||||
}
|
}
|
||||||
if _, ok := eva.racks[node.Rack]; !ok {
|
if _, ok := eva.racks[task.Job][node.Rack]; !ok {
|
||||||
eva.racks[node.Rack] = map[string]int{"PS": 0, "Worker": 0}
|
eva.racks[task.Job][node.Rack] = map[string]int{"PS": 0, "Worker": 0}
|
||||||
}
|
}
|
||||||
if _, ok := eva.domains[node.Domain]; !ok {
|
if _, ok := eva.domains[task.Job][node.Domain]; !ok {
|
||||||
eva.domains[node.Domain] = map[string]int{"PS": 0, "Worker": 0}
|
eva.domains[task.Job][node.Domain] = map[string]int{"PS": 0, "Worker": 0}
|
||||||
}
|
}
|
||||||
if task.IsPS {
|
if task.IsPS {
|
||||||
eva.costNetwork += eva.factorNode * float64(eva.racks[node.Rack]["Worker"]-eva.nodes[node.ClientID]["Worker"])
|
eva.costNetwork += eva.factorNode * float64(eva.racks[task.Job][node.Rack]["Worker"]-eva.nodes[task.Job][node.ClientID]["Worker"])
|
||||||
eva.costNetwork += eva.factorRack * float64(eva.domains[node.Domain]["Worker"]-eva.racks[node.Rack]["Worker"])
|
eva.costNetwork += eva.factorRack * float64(eva.domains[task.Job][node.Domain]["Worker"]-eva.racks[task.Job][node.Rack]["Worker"])
|
||||||
eva.costNetwork += eva.factorDomain * float64(eva.totalWorker-eva.domains[node.Domain]["Worker"])
|
eva.costNetwork += eva.factorDomain * float64(eva.totalWorkers[task.Job]-eva.domains[task.Job][node.Domain]["Worker"])
|
||||||
|
|
||||||
eva.nodes[node.ClientID]["PS"]++
|
eva.nodes[task.Job][node.ClientID]["PS"]++
|
||||||
eva.racks[node.Rack]["PS"]++
|
eva.racks[task.Job][node.Rack]["PS"]++
|
||||||
eva.domains[node.Domain]["PS"]++
|
eva.domains[task.Job][node.Domain]["PS"]++
|
||||||
|
eva.totalPSs[task.Job]++
|
||||||
eva.totalPS++
|
eva.totalPS++
|
||||||
} else {
|
} else {
|
||||||
eva.costNetwork += eva.factorNode * float64(eva.racks[node.Rack]["PS"]-eva.nodes[node.ClientID]["PS"])
|
eva.costNetwork += eva.factorNode * float64(eva.racks[task.Job][node.Rack]["PS"]-eva.nodes[task.Job][node.ClientID]["PS"])
|
||||||
eva.costNetwork += eva.factorRack * float64(eva.domains[node.Domain]["PS"]-eva.racks[node.Rack]["PS"])
|
eva.costNetwork += eva.factorRack * float64(eva.domains[task.Job][node.Domain]["PS"]-eva.racks[task.Job][node.Rack]["PS"])
|
||||||
eva.costNetwork += eva.factorDomain * float64(eva.totalPS-eva.domains[node.Domain]["PS"])
|
eva.costNetwork += eva.factorDomain * float64(eva.totalPSs[task.Job]-eva.domains[task.Job][node.Domain]["PS"])
|
||||||
|
|
||||||
eva.nodes[node.ClientID]["Worker"]++
|
eva.nodes[task.Job][node.ClientID]["Worker"]++
|
||||||
eva.racks[node.Rack]["Worker"]++
|
eva.racks[task.Job][node.Rack]["Worker"]++
|
||||||
eva.domains[node.Domain]["Worker"]++
|
eva.domains[task.Job][node.Domain]["Worker"]++
|
||||||
|
eva.totalWorkers[task.Job]++
|
||||||
eva.totalWorker++
|
eva.totalWorker++
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -78,22 +91,24 @@ func (eva *Evaluator) add(node NodeStatus, task Task) {
|
|||||||
func (eva *Evaluator) remove(node NodeStatus, task Task) {
|
func (eva *Evaluator) remove(node NodeStatus, task Task) {
|
||||||
/* update network cost */
|
/* update network cost */
|
||||||
if task.IsPS {
|
if task.IsPS {
|
||||||
eva.costNetwork -= eva.factorNode * float64(eva.racks[node.Rack]["Worker"]-eva.nodes[node.ClientID]["Worker"])
|
eva.costNetwork -= eva.factorNode * float64(eva.racks[task.Job][node.Rack]["Worker"]-eva.nodes[task.Job][node.ClientID]["Worker"])
|
||||||
eva.costNetwork -= eva.factorRack * float64(eva.domains[node.Domain]["Worker"]-eva.racks[node.Rack]["Worker"])
|
eva.costNetwork -= eva.factorRack * float64(eva.domains[task.Job][node.Domain]["Worker"]-eva.racks[task.Job][node.Rack]["Worker"])
|
||||||
eva.costNetwork -= eva.factorDomain * float64(eva.totalWorker-eva.domains[node.Domain]["Worker"])
|
eva.costNetwork -= eva.factorDomain * float64(eva.totalWorkers[task.Job]-eva.domains[task.Job][node.Domain]["Worker"])
|
||||||
|
|
||||||
eva.nodes[node.ClientID]["PS"]--
|
eva.nodes[task.Job][node.ClientID]["PS"]--
|
||||||
eva.racks[node.Rack]["PS"]--
|
eva.racks[task.Job][node.Rack]["PS"]--
|
||||||
eva.domains[node.Domain]["PS"]--
|
eva.domains[task.Job][node.Domain]["PS"]--
|
||||||
|
eva.totalPSs[task.Job]--
|
||||||
eva.totalPS--
|
eva.totalPS--
|
||||||
} else {
|
} else {
|
||||||
eva.costNetwork -= eva.factorNode * float64(eva.racks[node.Rack]["PS"]-eva.nodes[node.ClientID]["PS"])
|
eva.costNetwork -= eva.factorNode * float64(eva.racks[task.Job][node.Rack]["PS"]-eva.nodes[task.Job][node.ClientID]["PS"])
|
||||||
eva.costNetwork -= eva.factorRack * float64(eva.domains[node.Domain]["PS"]-eva.racks[node.Rack]["PS"])
|
eva.costNetwork -= eva.factorRack * float64(eva.domains[task.Job][node.Domain]["PS"]-eva.racks[task.Job][node.Rack]["PS"])
|
||||||
eva.costNetwork -= eva.factorDomain * float64(eva.totalPS-eva.domains[node.Domain]["PS"])
|
eva.costNetwork -= eva.factorDomain * float64(eva.totalPSs[task.Job]-eva.domains[task.Job][node.Domain]["PS"])
|
||||||
|
|
||||||
eva.nodes[node.ClientID]["Worker"]--
|
eva.nodes[task.Job][node.ClientID]["Worker"]--
|
||||||
eva.racks[node.Rack]["Worker"]--
|
eva.racks[task.Job][node.Rack]["Worker"]--
|
||||||
eva.domains[node.Domain]["Worker"]--
|
eva.domains[task.Job][node.Domain]["Worker"]--
|
||||||
|
eva.totalWorkers[task.Job]--
|
||||||
eva.totalWorker--
|
eva.totalWorker--
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -109,13 +124,19 @@ func (eva *Evaluator) remove(node NodeStatus, task Task) {
|
|||||||
|
|
||||||
func (eva *Evaluator) calculate() float64 {
|
func (eva *Evaluator) calculate() float64 {
|
||||||
usingNodes := 0.0
|
usingNodes := 0.0
|
||||||
for _, pair := range eva.nodes {
|
for _, job := range eva.nodes {
|
||||||
|
for _, pair := range job {
|
||||||
if v, ok := pair["PS"]; ok && v > 0 {
|
if v, ok := pair["PS"]; ok && v > 0 {
|
||||||
usingNodes += 1.0
|
usingNodes += 1.0
|
||||||
} else if v, ok := pair["Worker"]; ok && v > 0 {
|
} else if v, ok := pair["Worker"]; ok && v > 0 {
|
||||||
usingNodes += 1.0
|
usingNodes += 1.0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if eva.totalPS+eva.totalWorker == 0 {
|
||||||
|
usingNodes = 1.0
|
||||||
|
} else {
|
||||||
usingNodes /= float64(eva.totalWorker + eva.totalPS)
|
usingNodes /= float64(eva.totalWorker + eva.totalPS)
|
||||||
|
}
|
||||||
return eva.costNetwork + eva.factorSpread*eva.costLoad/float64(eva.totalPS+eva.totalWorker) + usingNodes
|
return eva.costNetwork + eva.factorSpread*eva.costLoad/float64(eva.totalPS+eva.totalWorker) + usingNodes
|
||||||
}
|
}
|
||||||
|
@ -61,6 +61,11 @@ type ResourcePool struct {
|
|||||||
enableShareRatio float64
|
enableShareRatio float64
|
||||||
enablePreSchedule bool
|
enablePreSchedule bool
|
||||||
enablePreScheduleRatio float64
|
enablePreScheduleRatio float64
|
||||||
|
|
||||||
|
enableBatch bool
|
||||||
|
batchJobs []Job
|
||||||
|
batchMu sync.Mutex
|
||||||
|
batchAllocations map[string][]NodeStatus
|
||||||
}
|
}
|
||||||
|
|
||||||
func (pool *ResourcePool) init(conf Configuration) {
|
func (pool *ResourcePool) init(conf Configuration) {
|
||||||
@ -83,6 +88,8 @@ func (pool *ResourcePool) init(conf Configuration) {
|
|||||||
pool.enablePreSchedule = true
|
pool.enablePreSchedule = true
|
||||||
pool.enablePreScheduleRatio = 0.95
|
pool.enablePreScheduleRatio = 0.95
|
||||||
|
|
||||||
|
pool.enableBatch = false
|
||||||
|
|
||||||
/* init pools */
|
/* init pools */
|
||||||
pool.poolsCount = 300
|
pool.poolsCount = 300
|
||||||
for i := 0; i < pool.poolsCount; i++ {
|
for i := 0; i < pool.poolsCount; i++ {
|
||||||
@ -115,6 +122,31 @@ func (pool *ResourcePool) init(conf Configuration) {
|
|||||||
go func() {
|
go func() {
|
||||||
pool.saveStatusHistory()
|
pool.saveStatusHistory()
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
time.Sleep(time.Second * 10)
|
||||||
|
pool.batchMu.Lock()
|
||||||
|
var tasks []Task
|
||||||
|
for _, job := range pool.batchJobs {
|
||||||
|
for _, task := range job.Tasks {
|
||||||
|
task.Job = job.Name
|
||||||
|
tasks = append(tasks, task)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
job := Job{Tasks: tasks}
|
||||||
|
|
||||||
|
nodes := pool.doAcquireResource(job)
|
||||||
|
for i, task := range job.Tasks {
|
||||||
|
if _, ok := pool.batchAllocations[task.Job]; !ok {
|
||||||
|
pool.batchAllocations[task.Job] = []NodeStatus{}
|
||||||
|
}
|
||||||
|
pool.batchAllocations[task.Job] = append(pool.batchAllocations[task.Job], nodes[i])
|
||||||
|
}
|
||||||
|
if len(nodes) > 0 {
|
||||||
|
pool.batchJobs = []Job{}
|
||||||
|
}
|
||||||
|
pool.batchMu.Unlock()
|
||||||
|
}()
|
||||||
}
|
}
|
||||||
|
|
||||||
/* check dead nodes periodically */
|
/* check dead nodes periodically */
|
||||||
@ -634,6 +666,27 @@ func (pool *ResourcePool) pickNode(candidates []*NodeStatus, availableGPUs map[s
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
|
func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
|
||||||
|
if !pool.enableBatch {
|
||||||
|
for i := range job.Tasks {
|
||||||
|
job.Tasks[i].Job = job.Name
|
||||||
|
}
|
||||||
|
return pool.acquireResource(job)
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
if _, ok := pool.batchAllocations[job.Name]; ok {
|
||||||
|
break
|
||||||
|
} else {
|
||||||
|
time.Sleep(time.Millisecond * 100)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pool.batchMu.Lock()
|
||||||
|
nodes := pool.batchAllocations[job.Name]
|
||||||
|
delete(pool.batchAllocations, job.Name)
|
||||||
|
pool.batchMu.Unlock()
|
||||||
|
return nodes
|
||||||
|
}
|
||||||
|
|
||||||
|
func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
|
||||||
if len(job.Tasks) == 0 {
|
if len(job.Tasks) == 0 {
|
||||||
return []NodeStatus{}
|
return []NodeStatus{}
|
||||||
}
|
}
|
||||||
@ -876,7 +929,7 @@ func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
for _, t := range res.Status {
|
for _, t := range res.Status {
|
||||||
pool.attach(t.UUID, job.Name)
|
pool.attach(t.UUID, task.Job)
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := range job.Tasks {
|
for i := range job.Tasks {
|
||||||
|
@ -31,6 +31,7 @@ type Job struct {
|
|||||||
|
|
||||||
type Task struct {
|
type Task struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
|
Job string `json:"job_name"`
|
||||||
Image string `json:"image"`
|
Image string `json:"image"`
|
||||||
Cmd string `json:"cmd"`
|
Cmd string `json:"cmd"`
|
||||||
NumberCPU int `json:"cpu_number"`
|
NumberCPU int `json:"cpu_number"`
|
||||||
|
Loading…
Reference in New Issue
Block a user