Newnius 2020-05-25 11:35:44 +08:00
parent b63f1ba609
commit ad551109fe
10 changed files with 232 additions and 419 deletions

View File

@ -11,7 +11,7 @@ import (
var collectorInstance *Collector var collectorInstance *Collector
var collectorInstanceLock sync.Mutex var collectorInstanceLock sync.Mutex
func InstanceOfColector() *Collector { func InstanceOfCollector() *Collector {
defer collectorInstanceLock.Unlock() defer collectorInstanceLock.Unlock()
collectorInstanceLock.Lock() collectorInstanceLock.Lock()
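
The hunk above fixes the spelling of the collector's singleton accessor (`InstanceOfColector` → `InstanceOfCollector`). As context, a minimal sketch of the lazily-initialized, mutex-guarded singleton pattern that accessor follows; the `Collector` stub and the nil check are assumptions, since the hunk only shows the function head.

```go
package main

import (
	"fmt"
	"sync"
)

// Collector is a stub standing in for the scheduler's real Collector,
// whose fields are not part of this hunk.
type Collector struct{}

var (
	collectorInstance     *Collector
	collectorInstanceLock sync.Mutex
)

// InstanceOfCollector returns the process-wide Collector, creating it on
// first use. Deferring Unlock before calling Lock, as the hunk does, is
// valid Go: the deferred call only runs when the function returns.
func InstanceOfCollector() *Collector {
	defer collectorInstanceLock.Unlock()
	collectorInstanceLock.Lock()
	if collectorInstance == nil {
		collectorInstance = &Collector{}
	}
	return collectorInstance
}

func main() {
	fmt.Println(InstanceOfCollector() == InstanceOfCollector()) // true: same instance
}
```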

src/ga.go (305 changes)
View File

@ -1,142 +1,21 @@
package main package main
import ( import (
"fmt"
"math/rand" "math/rand"
"github.com/MaxHalford/eaopt" "github.com/MaxHalford/eaopt"
"time" "time"
"strconv" "strconv"
"math" "math"
log "github.com/sirupsen/logrus"
) )
type Evaluator struct { var nodesMap map[string]NodeStatus
domains map[string]map[string]int
racks map[string]map[string]int
nodes map[string]map[string]int
upstreams map[string]string
cost float64
totalPS int
totalWorker int
costNetwork float64
factorNode float64
factorRack float64
factorDomain float64
}
func (eva *Evaluator) init(nodes []Node, tasks []Task) {
eva.domains = map[string]map[string]int{}
eva.racks = map[string]map[string]int{}
eva.nodes = map[string]map[string]int{}
eva.upstreams = map[string]string{}
eva.totalPS = 0
eva.totalWorker = 0
eva.factorNode = 1.0
eva.factorRack = 4.0
eva.factorDomain = 40.0
eva.cost = 0.0
eva.costNetwork = 0.0
}
func (eva *Evaluator) add(node Node, task Task) {
/* update node load cost */
/* update network cost */
if _, ok := eva.nodes[node.ClientID]; !ok {
eva.nodes[node.ClientID] = map[string]int{"PS": 0, "Worker": 0}
}
if _, ok := eva.racks[node.Rack]; !ok {
eva.racks[node.Rack] = map[string]int{"PS": 0, "Worker": 0}
}
if _, ok := eva.domains[node.Domain]; !ok {
eva.domains[node.Domain] = map[string]int{"PS": 0, "Worker": 0}
}
if task.IsPS {
eva.costNetwork += eva.factorNode * float64(eva.racks[node.Rack]["Worker"]-eva.nodes[node.ClientID]["Worker"])
eva.costNetwork += eva.factorRack * float64(eva.domains[node.Domain]["Worker"]-eva.racks[node.Rack]["Worker"])
eva.costNetwork += eva.factorDomain * float64(eva.totalWorker-eva.domains[node.Domain]["Worker"])
eva.nodes[node.ClientID]["PS"]++
eva.racks[node.Rack]["PS"]++
eva.domains[node.Domain]["PS"]++
eva.totalPS++
} else {
eva.costNetwork += eva.factorNode * float64(eva.racks[node.Rack]["PS"]-eva.nodes[node.ClientID]["PS"])
eva.costNetwork += eva.factorRack * float64(eva.domains[node.Domain]["PS"]-eva.racks[node.Rack]["PS"])
eva.costNetwork += eva.factorDomain * float64(eva.totalPS-eva.domains[node.Domain]["PS"])
eva.nodes[node.ClientID]["Worker"]++
eva.racks[node.Rack]["Worker"]++
eva.domains[node.Domain]["Worker"]++
eva.totalWorker++
}
eva.cost = eva.costNetwork
}
func (eva *Evaluator) remove(node Node, task Task) {
if task.IsPS {
eva.costNetwork -= eva.factorNode * float64(eva.racks[node.Rack]["Worker"]-eva.nodes[node.ClientID]["Worker"])
eva.costNetwork -= eva.factorRack * float64(eva.domains[node.Domain]["Worker"]-eva.racks[node.Rack]["Worker"])
eva.costNetwork -= eva.factorDomain * float64(eva.totalWorker-eva.domains[node.Domain]["Worker"])
eva.nodes[node.ClientID]["PS"]--
eva.racks[node.Rack]["PS"]--
eva.domains[node.Domain]["PS"]--
eva.totalPS--
} else {
eva.costNetwork -= eva.factorNode * float64(eva.racks[node.Rack]["PS"]-eva.nodes[node.ClientID]["PS"])
eva.costNetwork -= eva.factorRack * float64(eva.domains[node.Domain]["PS"]-eva.racks[node.Rack]["PS"])
eva.costNetwork -= eva.factorDomain * float64(eva.totalPS-eva.domains[node.Domain]["PS"])
//fmt.Println(eva.totalWorker, eva.domains[node.Domain])
eva.nodes[node.ClientID]["Worker"]--
eva.racks[node.Rack]["Worker"]--
eva.domains[node.Domain]["Worker"]--
eva.totalWorker--
}
eva.cost = eva.costNetwork
}
func (eva *Evaluator) calculate() float64 {
return eva.cost
}
var nodesMap map[string]Node
var tasksMap map[string]Task var tasksMap map[string]Task
type Node struct { // A resource allocation
ClientID string `json:"id"`
Domain string `json:"domain"`
Rack string `json:"rack"`
Version float64 `json:"version"`
NumCPU int `json:"cpu_num"`
UtilCPU float64 `json:"cpu_load"`
MemTotal int `json:"mem_total"`
MemAvailable int `json:"mem_available"`
UsingBW float64 `json:"bw_using"`
TotalBW float64 `json:"bw_total"`
numberGPU int
//Status []GPUStatus `json:"status"`
}
type Task3 struct {
Name string `json:"name"`
Image string `json:"image"`
Cmd string `json:"cmd"`
NumberCPU int `json:"cpu_number"`
Memory int `json:"memory"`
NumberGPU int `json:"gpu_number"`
MemoryGPU int `json:"gpu_memory"`
IsPS bool `json:"is_ps"`
ModelGPU string `json:"gpu_model"`
}
// An valid allocation
type Allocation struct { type Allocation struct {
TasksOnNode map[string][]Task // tasks on nodes[id] TasksOnNode map[string][]Task // tasks on nodes[id]
Nodes map[string]Node Nodes map[string]NodeStatus
NodeIDs []string NodeIDs []string
Flags map[string]bool Flags map[string]bool
Evaluator Evaluator Evaluator Evaluator
@ -146,7 +25,13 @@ func randomFit(allocation Allocation, task Task) (string, bool) {
flag := false flag := false
nodeID := "" nodeID := ""
for nodeID = range allocation.Nodes { for nodeID = range allocation.Nodes {
if node, ok := allocation.Nodes[nodeID]; ok && len(allocation.TasksOnNode[nodeID]) < node.numberGPU { numberGPU := 0
for _, gpu := range allocation.Nodes[nodeID].Status {
if gpu.MemoryAllocated == 0 {
numberGPU += 1
}
}
if _, ok := allocation.Nodes[nodeID]; ok && len(allocation.TasksOnNode[nodeID]) < numberGPU {
flag = true flag = true
break break
} }
@ -158,7 +43,13 @@ func firstFit(allocation Allocation, task Task) (string, bool) {
flag := false flag := false
nodeID := "" nodeID := ""
for _, nodeID = range allocation.NodeIDs { for _, nodeID = range allocation.NodeIDs {
if node, ok := allocation.Nodes[nodeID]; ok && len(allocation.TasksOnNode[nodeID]) < node.numberGPU { numberGPU := 0
for _, gpu := range allocation.Nodes[nodeID].Status {
if gpu.MemoryAllocated == 0 {
numberGPU += 1
}
}
if _, ok := allocation.Nodes[nodeID]; ok && len(allocation.TasksOnNode[nodeID]) < numberGPU {
flag = true flag = true
break break
} }
@ -166,7 +57,7 @@ func firstFit(allocation Allocation, task Task) (string, bool) {
return nodeID, flag return nodeID, flag
} }
func fastBestFit(nodes []Node, tasks []Task) Allocation { func fastBestFit(nodes []NodeStatus, tasks []Task) Allocation {
eva := Evaluator{} eva := Evaluator{}
eva.init(nodes, tasks) eva.init(nodes, tasks)
@ -179,7 +70,13 @@ func fastBestFit(nodes []Node, tasks []Task) Allocation {
if _, ok := allocation.TasksOnNode[node.ClientID]; !ok { if _, ok := allocation.TasksOnNode[node.ClientID]; !ok {
allocation.TasksOnNode[node.ClientID] = []Task{} allocation.TasksOnNode[node.ClientID] = []Task{}
} }
if len(allocation.TasksOnNode[node.ClientID]) >= node.numberGPU { numberGPU := 0
for _, gpu := range allocation.Nodes[nodeID].Status {
if gpu.MemoryAllocated == 0 {
numberGPU += 1
}
}
if len(allocation.TasksOnNode[node.ClientID]) >= numberGPU {
continue continue
} }
eva.add(node, task) eva.add(node, task)
@ -200,7 +97,7 @@ func fastBestFit(nodes []Node, tasks []Task) Allocation {
eva.add(nodesMap[nodeID], task) eva.add(nodesMap[nodeID], task)
} }
} }
fmt.Println(eva.calculate()) log.Println(eva.calculate())
return allocation return allocation
} }
@ -209,12 +106,18 @@ func bestFit(allocation Allocation, task Task) (string, bool) {
nodeID := "" nodeID := ""
minCost := math.MaxFloat64 minCost := math.MaxFloat64
for _, id := range allocation.NodeIDs { for _, id := range allocation.NodeIDs {
if node, ok := allocation.Nodes[id]; ok && len(allocation.TasksOnNode[id]) < node.numberGPU { numberGPU := 0
for _, gpu := range allocation.Nodes[id].Status {
if gpu.MemoryAllocated == 0 {
numberGPU += 1
}
}
if _, ok := allocation.Nodes[id]; ok && len(allocation.TasksOnNode[id]) < numberGPU {
/* add */ /* add */
allocation.TasksOnNode[id] = append(allocation.TasksOnNode[id], task) allocation.TasksOnNode[id] = append(allocation.TasksOnNode[id], task)
/* evaluate */ /* evaluate */
cost := evaluatue(allocation) cost := evaluate(allocation)
/* revert */ /* revert */
idx := -1 idx := -1
@ -236,99 +139,6 @@ func bestFit(allocation Allocation, task Task) (string, bool) {
return nodeID, flag return nodeID, flag
} }
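
The hunks above make randomFit, firstFit, fastBestFit and bestFit each recompute the same free-GPU count inline. A hedged sketch of how that repeated loop could be pulled into one helper; `countFreeGPUs` is a hypothetical name, and the structs are reduced to the fields these hunks actually touch.

```go
package main

import "fmt"

// Reduced versions of the scheduler's types, limited to the fields the
// hunks above rely on; the real structs carry many more fields.
type GPUStatus struct {
	UUID            string
	MemoryTotal     int
	MemoryAllocated int
}

type NodeStatus struct {
	ClientID string
	Status   []GPUStatus
}

// countFreeGPUs returns how many GPUs on the node have no memory
// allocated yet, i.e. how many more tasks the node can take under the
// one-task-per-free-GPU rule used by the fit functions.
func countFreeGPUs(node NodeStatus) int {
	free := 0
	for _, gpu := range node.Status {
		if gpu.MemoryAllocated == 0 {
			free++
		}
	}
	return free
}

func main() {
	node := NodeStatus{
		ClientID: "node-1",
		Status: []GPUStatus{
			{UUID: "gpu-0", MemoryTotal: 11439, MemoryAllocated: 0},
			{UUID: "gpu-1", MemoryTotal: 11439, MemoryAllocated: 2048},
		},
	}
	fmt.Println(countFreeGPUs(node)) // 1
}
```

Each fit function could then compare `len(allocation.TasksOnNode[nodeID])` against `countFreeGPUs(allocation.Nodes[nodeID])` instead of repeating the loop.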
func evaluatue(allocation Allocation) float64 {
/* Calculate cost for network */
costNetwork := 0.0
domains := map[string]map[string]int{}
racks := map[string]map[string]int{}
upstreams := map[string]string{}
totalPS := 0
totalWorker := 0
taskToNode := map[string]string{}
for nodeID, tasks := range allocation.TasksOnNode {
numPS := 0
numWorker := 0
node := allocation.Nodes[nodeID]
for _, task := range tasks {
taskToNode[task.Name] = nodeID
if _, ok := domains[node.Domain]; !ok {
domains[node.Domain] = map[string]int{"PS": 0, "Worker": 0}
}
if _, ok := racks[node.Rack]; !ok {
racks[node.Rack] = map[string]int{"PS": 0, "Worker": 0}
}
if task.IsPS {
domains[node.Domain]["PS"]++
racks[node.Rack]["PS"]++
numPS++
totalPS++
} else {
domains[node.Domain]["Worker"]++
racks[node.Rack]["Worker"]++
numWorker++
totalWorker++
}
upstreams[node.Rack] = node.Domain
}
costNetwork -= float64(numPS * numWorker)
}
/* in the same domain */
for rackID, pair := range racks {
// in the same rack
costNetwork += float64(pair["PS"]*pair["Worker"]) * 1.0
// cross rack, but in the same domain
costNetwork += float64(pair["PS"]*(domains[upstreams[rackID]]["Worker"]-pair["Worker"])) * 4.0
}
/* across domain */
for _, pair := range domains {
costNetwork += float64(pair["PS"]*(totalWorker-pair["Worker"])) * 40.0
}
/* calculate cost for node fitness */
//cpu, memory, bw
costLB := 0.0
for nodeID, tasks := range allocation.TasksOnNode {
costCPU := 0.0
costMem := 0.0
costBW := 0.0
costGPU := 0.0
requestCPU := 0
requestMem := 0
requestBW := 0.0
requestGPU := 0
numberPS := 0
numberWorker := 0
for _, task := range tasks {
requestCPU += task.NumberCPU
requestMem += task.Memory
requestGPU += task.NumberGPU
if task.IsPS {
numberPS++
} else {
numberWorker++
}
}
requestBW = float64(numberPS*(totalWorker-numberWorker) + numberWorker*(totalPS-numberPS))
node := allocation.Nodes[nodeID]
costCPU += (float64(requestCPU) + node.UtilCPU) / float64(node.NumCPU) * 1.0
costMem += (float64(requestMem + (node.MemTotal - node.MemAvailable))) / float64(node.MemTotal) * 1.0
costBW += (float64(requestBW) + (node.TotalBW - node.UsingBW)) / node.TotalBW * 2.0
costGPU += (float64(requestGPU + node.numberGPU)) / float64(node.numberGPU) * 3.0
costLB += (costCPU + costMem + costBW + costGPU) / (1.0 + 1.0 + 2.0 + 3.0)
}
costLB /= float64(len(allocation.TasksOnNode))
costLB *= 100
//fmt.Println(costLB)
cost := 0.0*costLB + 1.0*costNetwork
return cost
}
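
Both the deleted `evaluatue` above and the incremental Evaluator it gives way to price PS-to-worker traffic by topological distance: co-located pairs are free, same-rack pairs cost 1, cross-rack pairs inside one domain cost 4, and cross-domain pairs cost 40. A self-contained sketch of that pairwise cost, written over explicit placements rather than the incremental counters the scheduler maintains.

```go
package main

import "fmt"

// placement records where a single task landed; Node/Rack/Domain mirror
// the topology fields used in the code above.
type placement struct {
	isPS   bool
	node   string
	rack   string
	domain string
}

// networkCost prices every PS<->worker pair by topological distance,
// using the same weights as the evaluator above: 0 on the same node,
// 1 within a rack, 4 across racks in one domain, 40 across domains.
func networkCost(tasks []placement) float64 {
	cost := 0.0
	for _, ps := range tasks {
		if !ps.isPS {
			continue
		}
		for _, w := range tasks {
			if w.isPS {
				continue
			}
			switch {
			case ps.node == w.node:
				// co-located: free
			case ps.rack == w.rack:
				cost += 1.0
			case ps.domain == w.domain:
				cost += 4.0
			default:
				cost += 40.0
			}
		}
	}
	return cost
}

func main() {
	tasks := []placement{
		{isPS: true, node: "n1", rack: "r1", domain: "d1"},
		{isPS: false, node: "n1", rack: "r1", domain: "d1"}, // same node: +0
		{isPS: false, node: "n2", rack: "r1", domain: "d1"}, // same rack: +1
		{isPS: false, node: "n3", rack: "r2", domain: "d1"}, // same domain: +4
		{isPS: false, node: "n4", rack: "r3", domain: "d2"}, // cross domain: +40
	}
	fmt.Println(networkCost(tasks)) // 45
}
```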
/* Evaluate the allocation */ /* Evaluate the allocation */
func (X Allocation) Evaluate() (float64, error) { func (X Allocation) Evaluate() (float64, error) {
if !X.Flags["valid"] { if !X.Flags["valid"] {
@ -336,7 +146,7 @@ func (X Allocation) Evaluate() (float64, error) {
return math.MaxFloat64, nil return math.MaxFloat64, nil
} }
costNetwork := evaluatue(X) costNetwork := evaluate(X)
cost := costNetwork cost := costNetwork
//fmt.Println(taskToNode, cost, len(X.Nodes)) //fmt.Println(taskToNode, cost, len(X.Nodes))
@ -443,7 +253,7 @@ func (X Allocation) Crossover(Y eaopt.Genome, rng *rand.Rand) {
idx := -1 idx := -1
nodeID2, ok := taskToNode[task.Name] nodeID2, ok := taskToNode[task.Name]
if !ok { if !ok {
fmt.Println("Error", taskToNode, X.TasksOnNode, task.Name) log.Println("Error", taskToNode, X.TasksOnNode, task.Name)
} }
for i, task2 := range X.TasksOnNode[nodeID2] { for i, task2 := range X.TasksOnNode[nodeID2] {
if task2.Name == task.Name { if task2.Name == task.Name {
@ -451,7 +261,7 @@ func (X Allocation) Crossover(Y eaopt.Genome, rng *rand.Rand) {
} }
} }
if idx == -1 { if idx == -1 {
fmt.Println("Error 2", taskToNode, X.TasksOnNode, task.Name) log.Println("Error 2", taskToNode, X.TasksOnNode, task.Name)
} }
//fmt.Println(X.TasksOnNode) //fmt.Println(X.TasksOnNode)
copy(X.TasksOnNode[nodeID2][idx:], X.TasksOnNode[nodeID2][idx+1:]) copy(X.TasksOnNode[nodeID2][idx:], X.TasksOnNode[nodeID2][idx+1:])
@ -496,7 +306,7 @@ func (X Allocation) Clone() eaopt.Genome {
if !X.Flags["valid"] { if !X.Flags["valid"] {
//fmt.Println(X.Valid) //fmt.Println(X.Valid)
} }
Y := Allocation{TasksOnNode: map[string][]Task{}, Nodes: map[string]Node{}, Flags: map[string]bool{"valid": X.Flags["valid"]}} Y := Allocation{TasksOnNode: map[string][]Task{}, Nodes: map[string]NodeStatus{}, Flags: map[string]bool{"valid": X.Flags["valid"]}}
for id, node := range X.Nodes { for id, node := range X.Nodes {
Y.Nodes[id] = node Y.Nodes[id] = node
Y.NodeIDs = append(Y.NodeIDs, node.ClientID) Y.NodeIDs = append(Y.NodeIDs, node.ClientID)
@ -512,9 +322,9 @@ func (X Allocation) Clone() eaopt.Genome {
} }
func VectorFactory(rng *rand.Rand) eaopt.Genome { func VectorFactory(rng *rand.Rand) eaopt.Genome {
allocation := Allocation{TasksOnNode: map[string][]Task{}, Nodes: map[string]Node{}, Flags: map[string]bool{"valid": true}} allocation := Allocation{TasksOnNode: map[string][]Task{}, Nodes: map[string]NodeStatus{}, Flags: map[string]bool{"valid": true}}
var nodes []Node var nodes []NodeStatus
var tasks []Task var tasks []Task
for _, node := range nodesMap { for _, node := range nodesMap {
@ -556,7 +366,7 @@ func VectorFactory(rng *rand.Rand) eaopt.Genome {
*/ */
allocation.TasksOnNode = fastBestFit(nodes, tasks).TasksOnNode allocation.TasksOnNode = fastBestFit(nodes, tasks).TasksOnNode
fmt.Println(time.Since(ts)) log.Println(time.Since(ts))
//fmt.Println("Best Fit") //fmt.Println("Best Fit")
} else if t%2 == 0 { } else if t%2 == 0 {
/* first-fit */ /* first-fit */
@ -582,23 +392,26 @@ func VectorFactory(rng *rand.Rand) eaopt.Genome {
return allocation return allocation
} }
func main3() { func testGA() {
numTask := 20 numTask := 20
nodesMap = map[string]Node{} nodesMap = map[string]NodeStatus{}
tasksMap = map[string]Task{} tasksMap = map[string]Task{}
for i := 0; i < numTask*3; i++ { for i := 0; i < numTask*3; i++ {
node := Node{ClientID: strconv.Itoa(i), Rack: strconv.Itoa(i % 40), Domain: strconv.Itoa(i % 4)} node := NodeStatus{ClientID: strconv.Itoa(i), Rack: strconv.Itoa(i % 40), Domain: strconv.Itoa(i % 4)}
node.NumCPU = 24 node.NumCPU = 24
node.MemTotal = 188 node.MemTotal = 188
node.TotalBW = 100 node.TotalBW = 100
node.numberGPU = rand.Intn(3) + 1 cnt := rand.Intn(3) + 1
for i := 0; i < cnt; i++ {
node.Status = append(node.Status, GPUStatus{MemoryTotal: 11439, MemoryAllocated: 0, UUID: node.ClientID + strconv.Itoa(i)})
}
nodesMap[strconv.Itoa(i)] = node nodesMap[strconv.Itoa(i)] = node
} }
for i := 0; i < numTask; i++ { for i := 0; i < numTask; i++ {
isPS := false isPS := false
if i>= 3 { if i >= 3 {
isPS = true isPS = true
} }
task := Task{Name: strconv.Itoa(i), IsPS: isPS} task := Task{Name: strconv.Itoa(i), IsPS: isPS}
@ -608,7 +421,7 @@ func main3() {
tasksMap[strconv.Itoa(i)] = task tasksMap[strconv.Itoa(i)] = task
} }
var nodes []Node var nodes []NodeStatus
var tasks []Task var tasks []Task
for _, node := range nodesMap { for _, node := range nodesMap {
@ -619,12 +432,12 @@ func main3() {
} }
s := time.Now() s := time.Now()
allocation := fastBestFit(nodes, tasks) allocation := fastBestFit(nodes, tasks)
fmt.Println(time.Since(s)) log.Println(time.Since(s))
// Instantiate a GA with a GAConfig // Instantiate a GA with a GAConfig
var ga, err = eaopt.NewDefaultGAConfig().NewGA() var ga, err = eaopt.NewDefaultGAConfig().NewGA()
if err != nil { if err != nil {
fmt.Println(err) log.Println(err)
return return
} }
@ -635,7 +448,7 @@ func main3() {
// Add a custom print function to track progress // Add a custom print function to track progress
ga.Callback = func(ga *eaopt.GA) { ga.Callback = func(ga *eaopt.GA) {
fmt.Printf("Best fitness at generation %d: %f\n", ga.Generations, ga.HallOfFame[0].Fitness) log.Printf("Best fitness at generation %d: %f\n", ga.Generations, ga.HallOfFame[0].Fitness)
} }
bestFitness := math.MaxFloat64 bestFitness := math.MaxFloat64
@ -647,7 +460,7 @@ func main3() {
gap := math.Abs(ga.HallOfFame[0].Fitness - bestFitness) gap := math.Abs(ga.HallOfFame[0].Fitness - bestFitness)
if gap <= 0.000001 || ga.HallOfFame[0].Fitness >= bestFitness { if gap <= 0.000001 || ga.HallOfFame[0].Fitness >= bestFitness {
if count >= 30 || time.Since(ts) > time.Second*30 { if count >= 30 || time.Since(ts) > time.Second*30 {
fmt.Println("Early Stop") log.Println("Early Stop")
return true return true
} else { } else {
count++ count++
@ -661,13 +474,13 @@ func main3() {
// Find the minimum // Find the minimum
err = ga.Minimize(VectorFactory) err = ga.Minimize(VectorFactory)
fmt.Println(time.Since(ts)) log.Println(time.Since(ts))
fmt.Println(ga.HallOfFame[0].Genome.(Allocation).TasksOnNode) log.Println(ga.HallOfFame[0].Genome.(Allocation).TasksOnNode)
//fmt.Println(ga.HallOfFame[0].Genome.(Allocation).Nodes) //fmt.Println(ga.HallOfFame[0].Genome.(Allocation).Nodes)
if err != nil { if err != nil {
fmt.Println(err) log.Println(err)
return return
} }
fmt.Println(allocation) log.Println(allocation)
} }
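
testGA above stops the search once the best fitness has stalled for roughly 30 generations or 30 seconds. A small, self-contained sketch of that stopping rule, factored out of the eaopt callback so it can be exercised on its own; the type and method names are illustrative, not taken from the repository.

```go
package main

import (
	"fmt"
	"math"
	"time"
)

// earlyStopper reproduces the stopping rule used in testGA above: give up
// once the best fitness has not improved for maxStall consecutive
// observations or for longer than maxWait.
type earlyStopper struct {
	best     float64
	stall    int
	started  time.Time
	maxStall int
	maxWait  time.Duration
}

func newEarlyStopper(maxStall int, maxWait time.Duration) *earlyStopper {
	return &earlyStopper{best: math.MaxFloat64, started: time.Now(), maxStall: maxStall, maxWait: maxWait}
}

// observe takes the best fitness of the current generation and reports
// true when the search should stop.
func (s *earlyStopper) observe(fitness float64) bool {
	if fitness < s.best-1e-6 {
		// still improving: remember the new best and reset the stall counter
		s.best = fitness
		s.stall = 0
		return false
	}
	s.stall++
	return s.stall >= s.maxStall || time.Since(s.started) > s.maxWait
}

func main() {
	stop := newEarlyStopper(30, 30*time.Second)
	fitnesses := []float64{100, 90, 85, 85, 85} // toy sequence of best-of-generation values
	for gen, f := range fitnesses {
		if stop.observe(f) {
			fmt.Println("early stop at generation", gen)
			break
		}
	}
	fmt.Println("best fitness:", stop.best)
}
```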

View File

@ -9,5 +9,3 @@ type Group struct {
CPU int `json:"quota_cpu"` CPU int `json:"quota_cpu"`
Memory int `json:"quota_mem"` Memory int `json:"quota_mem"`
} }

View File

@ -6,124 +6,109 @@ import (
"strings" "strings"
"io/ioutil" "io/ioutil"
"encoding/json" "encoding/json"
"strconv"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"sync"
"strconv"
"math/rand"
) )
type JobManager struct { type JobManager struct {
scheduler Scheduler scheduler Scheduler
job Job job Job
jobStatus JobStatus jobStatus JobStatus
resources []NodeStatus resources []NodeStatus
killedFlag bool resourcesMu sync.Mutex
isRunning bool isRunning bool
network string killFlag bool
network string
} }
func (jm *JobManager) start() { func (jm *JobManager) start() {
log.Info("start job ", jm.job.Name, time.Now()) log.Info("start job ", jm.job.Name, time.Now())
jm.isRunning = false jm.isRunning = false
jm.killFlag = false
jm.jobStatus = JobStatus{Name: jm.job.Name, tasks: map[string]TaskStatus{}} jm.jobStatus = JobStatus{Name: jm.job.Name, tasks: map[string]TaskStatus{}}
jm.network = InstanceOfResourcePool().acquireNetwork() /* register in JHL */
InstanceJobHistoryLogger().submitJob(jm.job) InstanceJobHistoryLogger().submitJob(jm.job)
/* request for private network */
jm.network = InstanceOfResourcePool().acquireNetwork()
/* request for resources */ /* request for resources */
for range jm.job.Tasks { //append would cause uncertain order
jm.resources = append(jm.resources, NodeStatus{ClientID: "null"})
}
var nodes []NodeStatus
for { for {
if jm.killedFlag { if jm.killFlag {
break break
} }
nodes = jm.scheduler.AcquireResource(jm.job) jm.resources = jm.scheduler.AcquireResource(jm.job)
if len(nodes) > 0 { if len(jm.resources) > 0 {
log.Info("Receive resource", jm.resources)
break break
} }
time.Sleep(time.Second * 1) /* sleep random Millisecond to avoid deadlock */
} time.Sleep(time.Millisecond * time.Duration(500+rand.Intn(500)))
log.Info("Receive resource", nodes)
jm.resources = nodes
for _, node := range nodes {
for _, t := range node.Status {
InstanceOfResourcePool().attach(t.UUID, jm.job.Name)
}
} }
if !jm.killedFlag { if !jm.killFlag {
/* switch to Running state */
jm.scheduler.UpdateProgress(jm.job, Running) jm.scheduler.UpdateProgress(jm.job, Running)
jm.isRunning = true
log.Info("ready to run job ", jm.job.Name, time.Now()) log.Info("ready to run job ", jm.job.Name, time.Now())
}
/* bring up containers */ /* bring up containers */
for i := range jm.job.Tasks { wg := sync.WaitGroup{}
if jm.killedFlag { for i := range jm.job.Tasks {
break wg.Add(1)
}
var GPUs []string go func(index int) {
for _, GPU := range jm.resources[i].Status { defer wg.Done()
GPUs = append(GPUs, GPU.UUID) var UUIDs []string
} for _, GPU := range jm.resources[index].Status {
UUIDs = append(UUIDs, GPU.UUID)
for attempt := 0; attempt < 3; attempt++ {
if attempt == 2 { //failed more than once /* attach to GPUs */
//for { InstanceOfResourcePool().attach(GPU.UUID, jm.job.Name)
// resource := jm.scheduler.AcquireResource(jm.job, jm.job.Tasks[i], jm.resources) }
// if len(resource.Status) > 0 { GPUs := strings.Join(UUIDs, ",")
// break
// } v := url.Values{}
time.Sleep(time.Second * 1) v.Set("image", jm.job.Tasks[index].Image)
break v.Set("cmd", jm.job.Tasks[index].Cmd)
//} v.Set("name", jm.job.Tasks[index].Name)
} v.Set("workspace", jm.job.Workspace)
v.Set("gpus", GPUs)
v := url.Values{} v.Set("mem_limit", strconv.Itoa(jm.job.Tasks[index].Memory)+"m")
v.Set("image", jm.job.Tasks[i].Image) v.Set("cpu_limit", strconv.Itoa(jm.job.Tasks[index].NumberCPU))
v.Set("cmd", jm.job.Tasks[i].Cmd) v.Set("network", jm.network)
v.Set("name", jm.job.Tasks[i].Name) v.Set("should_wait", "1")
v.Set("workspace", jm.job.Workspace) v.Set("output_dir", "/tmp/")
v.Set("gpus", strings.Join(GPUs, ",")) v.Set("hdfs_address", "http://192.168.100.104:50070/")
v.Set("mem_limit", strconv.Itoa(jm.job.Tasks[i].Memory)+"m") v.Set("hdfs_dir", "/user/yao/output/"+jm.job.Name)
v.Set("cpu_limit", strconv.Itoa(jm.job.Tasks[i].NumberCPU)) v.Set("gpu_mem", strconv.Itoa(jm.job.Tasks[index].MemoryGPU))
v.Set("network", jm.network)
v.Set("should_wait", "1") resp, err := doRequest("POST", "http://"+jm.resources[index].ClientHost+":8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
v.Set("output_dir", "/tmp/") if err != nil {
v.Set("hdfs_address", "http://192.168.100.104:50070/") log.Warn(err.Error())
v.Set("hdfs_dir", "/user/yao/output/"+jm.job.Name) return
v.Set("gpu_mem", strconv.Itoa(jm.job.Tasks[i].MemoryGPU)) }
resp, err := doRequest("POST", "http://"+jm.resources[i].ClientHost+":8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "") body, err := ioutil.ReadAll(resp.Body)
if err != nil { resp.Body.Close()
log.Warn(err.Error()) if err != nil {
continue log.Warn(err)
} return
}
body, err := ioutil.ReadAll(resp.Body)
resp.Body.Close() var res MsgCreate
if err != nil { err = json.Unmarshal([]byte(string(body)), &res)
log.Warn(err) if err != nil || res.Code != 0 {
continue log.Warn(res)
} return
}
var res MsgCreate jm.jobStatus.tasks[jm.job.Tasks[index].Name] = TaskStatus{Id: res.Id, Node: jm.resources[index].ClientHost}
err = json.Unmarshal([]byte(string(body)), &res) }(i)
if err != nil {
log.Warn(err)
continue
}
if res.Code != 0 {
log.Warn(res)
}
if res.Code == 0 {
jm.jobStatus.tasks[jm.job.Tasks[i].Name] = TaskStatus{Id: res.Id, Node: jm.resources[i].ClientHost}
break
}
} }
wg.Wait()
jm.isRunning = true
} }
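
The rewritten start() above brings up every task's container from its own goroutine and joins on a sync.WaitGroup, instead of the old serial loop with retries. A minimal sketch of that fan-out shape, with the POST to the node agent's /create endpoint replaced by a placeholder launch function.

```go
package main

import (
	"fmt"
	"sync"
)

// Task carries only the fields this sketch needs.
type Task struct {
	Name  string
	Image string
}

// launch stands in for the POST to <node>:8000/create in the hunk above;
// the real code builds url.Values and parses a MsgCreate response.
func launch(t Task) error {
	fmt.Println("starting container for", t.Name)
	return nil
}

// startAll brings up all tasks concurrently and returns once every
// goroutine has finished, mirroring the wg.Add/wg.Wait structure above.
func startAll(tasks []Task) {
	var wg sync.WaitGroup
	for i := range tasks {
		wg.Add(1)
		go func(index int) {
			defer wg.Done()
			if err := launch(tasks[index]); err != nil {
				fmt.Println("warn:", err) // the scheduler logs and returns here
			}
		}(i)
	}
	wg.Wait()
}

func main() {
	startAll([]Task{{Name: "ps-0"}, {Name: "worker-0"}, {Name: "worker-1"}})
}
```

Passing the loop index as an argument, as the hunk does with `go func(index int) {...}(i)`, keeps each goroutine from sharing the same loop variable.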
/* monitor job execution */ /* monitor job execution */
@ -134,87 +119,94 @@ func (jm *JobManager) start() {
} }
time.Sleep(time.Second * 25) time.Sleep(time.Second * 25)
} }
/* make sure resource are released */
jm.returnResource(jm.status().Status)
log.Info("JobMaster exited ", jm.job.Name)
} }
func (jm *JobManager) checkStatus(status []TaskStatus) bool { /* release all resource */
if !jm.isRunning { func (jm *JobManager) returnResource(status []TaskStatus) {
return false jm.resourcesMu.Lock()
defer jm.resourcesMu.Unlock()
if len(jm.resources) == 0 {
return
} }
flag := false /* return resource */
for i := range jm.resources {
jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
log.Info("return resource ", jm.resources[i].ClientID)
for _, t := range jm.resources[i].Status {
InstanceOfResourcePool().detach(t.UUID, jm.job)
}
InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
/* remove exited containers */
//v := url.Values{}
//v.Set("id", res.Status[i].Id)
//
//_, err := doRequest("POST", "http://"+res.Status[i].Node+":8000/remove", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
//if err != nil {
// log.Warn(err.Error())
// continue
//}
}
InstanceOfResourcePool().releaseNetwork(jm.network)
jm.resources = []NodeStatus{}
}
/* monitor all tasks */
func (jm *JobManager) checkStatus(status []TaskStatus) {
if !jm.isRunning {
return
}
flagRunning := false
onlyPS := true onlyPS := true
for i := range status { for i := range status {
if status[i].Status == "ready" { if status[i].Status == "ready" {
log.Debug(jm.job.Name, "-", i, " is ready to run") log.Debug(jm.job.Name, "-", i, " is ready to run")
flag = true flagRunning = true
if !jm.job.Tasks[i].IsPS {
onlyPS = false
}
} else if status[i].Status == "unknown" {
log.Debug(jm.job.Name, "-", i, " is starting")
flag = true
if !jm.job.Tasks[i].IsPS { if !jm.job.Tasks[i].IsPS {
onlyPS = false onlyPS = false
} }
} else if status[i].Status == "running" { } else if status[i].Status == "running" {
log.Debug(jm.job.Name, "-", i, " is running") log.Debug(jm.job.Name, "-", i, " is running")
flag = true flagRunning = true
if !jm.job.Tasks[i].IsPS { if !jm.job.Tasks[i].IsPS {
onlyPS = false onlyPS = false
} }
InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i]) InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
} else { } else {
log.Info(jm.job.Name, "-", i, " ", status[i].Status) log.Info(jm.job.Name, "-", i, " ", status[i].Status)
if exitCode, ok := status[i].State["ExitCode"].(float64); ok && !jm.job.Tasks[i].IsPS { if exitCode, ok := status[i].State["ExitCode"].(float64); ok && exitCode != 0 && !jm.killFlag {
if exitCode != 0 && !jm.killedFlag { log.Warn(jm.job.Name+"-"+jm.job.Tasks[i].Name+" exited unexpected, exitCode=", exitCode)
log.Warn(jm.job.Name+"-"+jm.job.Tasks[i].Name+" exited unexpected, exitCode=", exitCode) jm.isRunning = false
jm.killedFlag = true jm.stop(false)
jm.scheduler.UpdateProgress(jm.job, Failed) jm.scheduler.UpdateProgress(jm.job, Failed)
} jm.returnResource(status)
} break
/* remove exited containers */
//v := url.Values{}
//v.Set("id", res.Status[i].Id)
//
//_, err := doRequest("POST", "http://"+res.Status[i].Node+":8000/remove", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
//if err != nil {
// log.Warn(err.Error())
// continue
//}
/* return resource */
if jm.resources[i].ClientID != "null" {
jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
log.Info("return resource ", jm.resources[i].ClientID)
jm.resources[i].ClientID = "null"
for _, t := range jm.resources[i].Status {
InstanceOfResourcePool().detach(t.UUID, jm.job)
}
InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
} }
} }
} }
if flag && onlyPS { if jm.isRunning && onlyPS {
jm.stop()
log.Info("Only PS is running, stop ", jm.job.Name) log.Info("Only PS is running, stop ", jm.job.Name)
jm.killedFlag = false
}
if !flag {
jm.isRunning = false jm.isRunning = false
InstanceOfResourcePool().releaseNetwork(jm.network) jm.stop(false)
jm.scheduler.UpdateProgress(jm.job, Finished)
if !jm.killedFlag { jm.returnResource(status)
jm.scheduler.UpdateProgress(jm.job, Finished) }
log.Info("finish job ", jm.job.Name)
} if jm.isRunning && !flagRunning && !jm.killFlag {
log.Info("JobMaster exited ", jm.job.Name) jm.isRunning = false
jm.scheduler.UpdateProgress(jm.job, Finished)
jm.returnResource(status)
log.Info("finish job ", jm.job.Name)
} }
return flag
} }
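
checkStatus above now drives the whole job lifecycle: an unexpected non-zero exit fails the job, a job whose only live containers are parameter servers is wound down as finished, and resources are returned in each terminal branch. A rough, simplified condensation of that decision into a pure function; the `taskView` type and outcome strings are illustrative, and the real method also logs and triggers stop/returnResource as side effects.

```go
package main

import "fmt"

// taskView is a reduced view of one task's container status.
type taskView struct {
	status   string // "ready", "running", or a terminal state
	isPS     bool
	exitCode int
}

// jobOutcome condenses the branching in checkStatus above into a single
// decision over all task statuses.
func jobOutcome(tasks []taskView, killRequested bool) string {
	anyRunning := false
	onlyPS := true
	for _, t := range tasks {
		switch t.status {
		case "ready", "running":
			anyRunning = true
			if !t.isPS {
				onlyPS = false
			}
		default:
			if t.exitCode != 0 && !killRequested {
				return "Failed" // a container died unexpectedly
			}
		}
	}
	switch {
	case killRequested:
		return "Stopped"
	case !anyRunning, onlyPS:
		// nothing left, or only parameter servers remain: the job is done
		return "Finished"
	default:
		return "Running"
	}
}

func main() {
	tasks := []taskView{
		{status: "running", isPS: true},
		{status: "exited", isPS: false, exitCode: 0},
	}
	fmt.Println(jobOutcome(tasks, false)) // Finished: only the PS is still up
}
```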
/* fetch logs of task */
func (jm *JobManager) logs(taskName string) MsgLog { func (jm *JobManager) logs(taskName string) MsgLog {
spider := Spider{} spider := Spider{}
spider.Method = "GET" spider.Method = "GET"
@ -234,21 +226,22 @@ func (jm *JobManager) logs(taskName string) MsgLog {
body, err := ioutil.ReadAll(resp.Body) body, err := ioutil.ReadAll(resp.Body)
if err != nil { if err != nil {
return MsgLog{Code: 1, Error: err.Error()} return MsgLog{Code: 2, Error: err.Error()}
} }
var res MsgLog var res MsgLog
err = json.Unmarshal([]byte(string(body)), &res) err = json.Unmarshal([]byte(string(body)), &res)
if err != nil { if err != nil {
log.Println(err) log.Println(err)
return MsgLog{Code: 1, Error: "Unknown"} return MsgLog{Code: 3, Error: "Unknown"}
} }
return res return res
} }
/* fetch job tasks status */
func (jm *JobManager) status() MsgJobStatus { func (jm *JobManager) status() MsgJobStatus {
var tasksStatus []TaskStatus var tasksStatus []TaskStatus
for range jm.job.Tasks { for range jm.job.Tasks { //append would cause uncertain order
tasksStatus = append(tasksStatus, TaskStatus{}) tasksStatus = append(tasksStatus, TaskStatus{})
} }
@ -286,22 +279,23 @@ func (jm *JobManager) status() MsgJobStatus {
return MsgJobStatus{Status: tasksStatus} return MsgJobStatus{Status: tasksStatus}
} }
func (jm *JobManager) stop() MsgStop { /* force stop all containers */
jm.killedFlag = true func (jm *JobManager) stop(force bool) MsgStop {
go func() { /* kill at background */ for _, taskStatus := range jm.jobStatus.tasks {
for _, taskStatus := range jm.jobStatus.tasks { v := url.Values{}
v := url.Values{} v.Set("id", taskStatus.Id)
v.Set("id", taskStatus.Id)
_, err := doRequest("POST", "http://"+taskStatus.Node+":8000/stop", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "") _, err := doRequest("POST", "http://"+taskStatus.Node+":8000/stop", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
if err != nil { if err != nil {
log.Warn(err.Error()) log.Warn(err.Error())
continue continue
}
} }
}() }
jm.scheduler.UpdateProgress(jm.job, Stopped) if force {
log.Info("kill job, ", jm.job.Name) jm.killFlag = true
jm.scheduler.UpdateProgress(jm.job, Stopped)
log.Info("kill job, ", jm.job.Name)
}
return MsgStop{Code: 0} return MsgStop{Code: 0}
} }

View File

@ -292,7 +292,7 @@ func main() {
/* init components */ /* init components */
InstanceOfResourcePool().init(config) InstanceOfResourcePool().init(config)
InstanceOfColector().init(config) InstanceOfCollector().init(config)
InstanceJobHistoryLogger().init(config) InstanceJobHistoryLogger().init(config)
InstanceOfOptimizer().init(config) InstanceOfOptimizer().init(config)
InstanceOfGroupManager().init(config) InstanceOfGroupManager().init(config)

View File

@ -30,11 +30,13 @@ type NodeStatus struct {
ClientID string `json:"id"` ClientID string `json:"id"`
ClientHost string `json:"host"` ClientHost string `json:"host"`
Domain string `json:"domain"` Domain string `json:"domain"`
Rack int `json:"rack"` Rack string `json:"rack"`
Version float64 `json:"version"` Version float64 `json:"version"`
NumCPU int `json:"cpu_num"` NumCPU int `json:"cpu_num"`
UtilCPU float64 `json:"cpu_load"` UtilCPU float64 `json:"cpu_load"`
MemTotal int `json:"mem_total"` MemTotal int `json:"mem_total"`
MemAvailable int `json:"mem_available"` MemAvailable int `json:"mem_available"`
UsingBW float64 `json:"bw_using"`
TotalBW float64 `json:"bw_total"`
Status []GPUStatus `json:"status"` Status []GPUStatus `json:"status"`
} }
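
This hunk changes NodeStatus.Rack from int to string (matching the string rack IDs the evaluator keys on) and adds the bandwidth fields UsingBW/TotalBW. A minimal sketch of decoding a node heartbeat into the struct, limited to the scalar fields visible here; the payload and its values are made up for illustration.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// NodeStatus limited to the scalar fields visible in the hunk above; the
// real struct also carries the per-GPU Status slice.
type NodeStatus struct {
	ClientID     string  `json:"id"`
	ClientHost   string  `json:"host"`
	Domain       string  `json:"domain"`
	Rack         string  `json:"rack"`
	Version      float64 `json:"version"`
	NumCPU       int     `json:"cpu_num"`
	UtilCPU      float64 `json:"cpu_load"`
	MemTotal     int     `json:"mem_total"`
	MemAvailable int     `json:"mem_available"`
	UsingBW      float64 `json:"bw_using"`
	TotalBW      float64 `json:"bw_total"`
}

func main() {
	// Illustrative heartbeat payload; field names follow the json tags,
	// the values are invented.
	payload := []byte(`{"id":"node-1","host":"192.168.100.101","domain":"d1","rack":"r1",
		"version":1.0,"cpu_num":24,"cpu_load":0.3,"mem_total":188,"mem_available":120,
		"bw_using":10,"bw_total":100}`)

	var node NodeStatus
	if err := json.Unmarshal(payload, &node); err != nil {
		panic(err)
	}
	fmt.Printf("%s on %s: %d CPUs, rack %s\n", node.ClientID, node.ClientHost, node.NumCPU, node.Rack)
}
```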

View File

@ -807,6 +807,12 @@ func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
/* assign */ /* assign */
var ress []NodeStatus var ress []NodeStatus
if len(candidates) > 0 { if len(candidates) > 0 {
/*
for range job.Tasks { //append would cause uncertain order
resources = append(resources, NodeStatus{ClientID: "null"})
}
*/
var nodes []NodeStatus var nodes []NodeStatus
if len(job.Tasks) == 1 { if len(job.Tasks) == 1 {
node := pool.pickNode(candidates, availableGPUs, task, job, []NodeStatus{}) node := pool.pickNode(candidates, availableGPUs, task, job, []NodeStatus{})

View File

@ -407,7 +407,7 @@ func (scheduler *SchedulerFair) Stop(jobName string) MsgStop {
if !ok { if !ok {
return MsgStop{Code: 1, Error: "Job not exist!"} return MsgStop{Code: 1, Error: "Job not exist!"}
} }
return jm.stop() return jm.stop(true)
} }
func (scheduler *SchedulerFair) QueryLogs(jobName string, taskName string) MsgLog { func (scheduler *SchedulerFair) QueryLogs(jobName string, taskName string) MsgLog {

View File

@ -4,7 +4,7 @@ import (
"sync" "sync"
"time" "time"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
type SchedulerPriority struct { type SchedulerPriority struct {
history []*Job history []*Job

View File

@ -6,7 +6,7 @@ import (
"time" "time"
"io" "io"
"net/http" "net/http"
) )
type Configuration struct { type Configuration struct {
KafkaBrokers []string `json:"kafkaBrokers"` KafkaBrokers []string `json:"kafkaBrokers"`