Mirror of https://github.com/newnius/YAO-scheduler.git
Synced 2025-06-07 14:21:55 +00:00

Commit ad551109fe ("update"), parent b63f1ba609
@@ -11,7 +11,7 @@ import (
 var collectorInstance *Collector
 var collectorInstanceLock sync.Mutex
 
-func InstanceOfColector() *Collector {
+func InstanceOfCollector() *Collector {
     defer collectorInstanceLock.Unlock()
     collectorInstanceLock.Lock()
 
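The hunk above only fixes the typo in the singleton accessor's name (InstanceOfColector becomes InstanceOfCollector); the surrounding lines are unchanged context. For readers unfamiliar with the pattern, a minimal sketch of a mutex-guarded lazily-initialized singleton follows. The nil-check and construction lines are illustrative assumptions; only the signature and the two lock lines appear in this commit.

var collectorInstance *Collector
var collectorInstanceLock sync.Mutex

// InstanceOfCollector returns the process-wide Collector, creating it on first use.
// Sketch only: the body shown here is assumed, the commit itself just renames the function.
func InstanceOfCollector() *Collector {
    collectorInstanceLock.Lock()
    defer collectorInstanceLock.Unlock()
    if collectorInstance == nil {
        collectorInstance = &Collector{}
    }
    return collectorInstance
}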
src/ga.go (305 lines changed)
@@ -1,142 +1,21 @@
 package main
 
 import (
-    "fmt"
     "math/rand"
     "github.com/MaxHalford/eaopt"
     "time"
     "strconv"
     "math"
+    log "github.com/sirupsen/logrus"
 )
 
-type Evaluator struct {
-    domains     map[string]map[string]int
-    racks       map[string]map[string]int
-    nodes       map[string]map[string]int
-    upstreams   map[string]string
-    cost        float64
-    totalPS     int
-    totalWorker int
-
-    costNetwork float64
-
-    factorNode   float64
-    factorRack   float64
-    factorDomain float64
-}
-
-func (eva *Evaluator) init(nodes []Node, tasks []Task) {
-    eva.domains = map[string]map[string]int{}
-    eva.racks = map[string]map[string]int{}
-    eva.nodes = map[string]map[string]int{}
-    eva.upstreams = map[string]string{}
-    eva.totalPS = 0
-    eva.totalWorker = 0
-    eva.factorNode = 1.0
-    eva.factorRack = 4.0
-    eva.factorDomain = 40.0
-    eva.cost = 0.0
-    eva.costNetwork = 0.0
-}
-
-func (eva *Evaluator) add(node Node, task Task) {
-    /* update node load cost */
-
-    /* update network cost */
-    if _, ok := eva.nodes[node.ClientID]; !ok {
-        eva.nodes[node.ClientID] = map[string]int{"PS": 0, "Worker": 0}
-    }
-    if _, ok := eva.racks[node.Rack]; !ok {
-        eva.racks[node.Rack] = map[string]int{"PS": 0, "Worker": 0}
-    }
-    if _, ok := eva.domains[node.Domain]; !ok {
-        eva.domains[node.Domain] = map[string]int{"PS": 0, "Worker": 0}
-    }
-    if task.IsPS {
-        eva.costNetwork += eva.factorNode * float64(eva.racks[node.Rack]["Worker"]-eva.nodes[node.ClientID]["Worker"])
-        eva.costNetwork += eva.factorRack * float64(eva.domains[node.Domain]["Worker"]-eva.racks[node.Rack]["Worker"])
-        eva.costNetwork += eva.factorDomain * float64(eva.totalWorker-eva.domains[node.Domain]["Worker"])
-
-        eva.nodes[node.ClientID]["PS"]++
-        eva.racks[node.Rack]["PS"]++
-        eva.domains[node.Domain]["PS"]++
-        eva.totalPS++
-    } else {
-        eva.costNetwork += eva.factorNode * float64(eva.racks[node.Rack]["PS"]-eva.nodes[node.ClientID]["PS"])
-        eva.costNetwork += eva.factorRack * float64(eva.domains[node.Domain]["PS"]-eva.racks[node.Rack]["PS"])
-        eva.costNetwork += eva.factorDomain * float64(eva.totalPS-eva.domains[node.Domain]["PS"])
-
-        eva.nodes[node.ClientID]["Worker"]++
-        eva.racks[node.Rack]["Worker"]++
-        eva.domains[node.Domain]["Worker"]++
-        eva.totalWorker++
-    }
-    eva.cost = eva.costNetwork
-}
-
-func (eva *Evaluator) remove(node Node, task Task) {
-    if task.IsPS {
-        eva.costNetwork -= eva.factorNode * float64(eva.racks[node.Rack]["Worker"]-eva.nodes[node.ClientID]["Worker"])
-        eva.costNetwork -= eva.factorRack * float64(eva.domains[node.Domain]["Worker"]-eva.racks[node.Rack]["Worker"])
-        eva.costNetwork -= eva.factorDomain * float64(eva.totalWorker-eva.domains[node.Domain]["Worker"])
-
-        eva.nodes[node.ClientID]["PS"]--
-        eva.racks[node.Rack]["PS"]--
-        eva.domains[node.Domain]["PS"]--
-        eva.totalPS--
-    } else {
-        eva.costNetwork -= eva.factorNode * float64(eva.racks[node.Rack]["PS"]-eva.nodes[node.ClientID]["PS"])
-        eva.costNetwork -= eva.factorRack * float64(eva.domains[node.Domain]["PS"]-eva.racks[node.Rack]["PS"])
-        eva.costNetwork -= eva.factorDomain * float64(eva.totalPS-eva.domains[node.Domain]["PS"])
-
-        //fmt.Println(eva.totalWorker, eva.domains[node.Domain])
-
-        eva.nodes[node.ClientID]["Worker"]--
-        eva.racks[node.Rack]["Worker"]--
-        eva.domains[node.Domain]["Worker"]--
-        eva.totalWorker--
-    }
-    eva.cost = eva.costNetwork
-}
-
-func (eva *Evaluator) calculate() float64 {
-    return eva.cost
-}
-
-var nodesMap map[string]Node
-
+var nodesMap map[string]NodeStatus
 var tasksMap map[string]Task
 
-type Node struct {
-    ClientID     string  `json:"id"`
-    Domain       string  `json:"domain"`
-    Rack         string  `json:"rack"`
-    Version      float64 `json:"version"`
-    NumCPU       int     `json:"cpu_num"`
-    UtilCPU      float64 `json:"cpu_load"`
-    MemTotal     int     `json:"mem_total"`
-    MemAvailable int     `json:"mem_available"`
-    UsingBW      float64 `json:"bw_using"`
-    TotalBW      float64 `json:"bw_total"`
-    numberGPU    int
-    //Status []GPUStatus `json:"status"`
-}
-
-type Task3 struct {
-    Name      string `json:"name"`
-    Image     string `json:"image"`
-    Cmd       string `json:"cmd"`
-    NumberCPU int    `json:"cpu_number"`
-    Memory    int    `json:"memory"`
-    NumberGPU int    `json:"gpu_number"`
-    MemoryGPU int    `json:"gpu_memory"`
-    IsPS      bool   `json:"is_ps"`
-    ModelGPU  string `json:"gpu_model"`
-}
-
-// An valid allocation
+// A resource allocation
 type Allocation struct {
     TasksOnNode map[string][]Task // tasks on nodes[id]
-    Nodes       map[string]Node
+    Nodes       map[string]NodeStatus
     NodeIDs     []string
     Flags       map[string]bool
     Evaluator   Evaluator
@@ -146,7 +25,13 @@ func randomFit(allocation Allocation, task Task) (string, bool) {
     flag := false
     nodeID := ""
     for nodeID = range allocation.Nodes {
-        if node, ok := allocation.Nodes[nodeID]; ok && len(allocation.TasksOnNode[nodeID]) < node.numberGPU {
+        numberGPU := 0
+        for _, gpu := range allocation.Nodes[nodeID].Status {
+            if gpu.MemoryAllocated == 0 {
+                numberGPU += 0
+            }
+        }
+        if _, ok := allocation.Nodes[nodeID]; ok && len(allocation.TasksOnNode[nodeID]) < numberGPU {
             flag = true
             break
         }
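randomFit (and, in the following hunks, firstFit, fastBestFit and bestFit) now derives a node's capacity from its GPU list instead of the old Node.numberGPU field: it walks allocation.Nodes[nodeID].Status and counts GPUs whose MemoryAllocated is 0. A standalone sketch of that counting step is below. Note that the committed loop literally adds 0, so the helper here uses += 1 on the assumption that the intent is to count free GPUs; treat the helper, its name and that increment as assumptions, not as code from this commit.

// countFreeGPUs sketches the capacity check used after this commit:
// a GPU is treated as free when nothing is allocated on it.
func countFreeGPUs(node NodeStatus) int {
    numberGPU := 0
    for _, gpu := range node.Status {
        if gpu.MemoryAllocated == 0 {
            numberGPU += 1 // the diff as shown adds 0 here
        }
    }
    return numberGPU
}

// A node can then take another task only while it still has an unused GPU:
//   if len(allocation.TasksOnNode[nodeID]) < countFreeGPUs(allocation.Nodes[nodeID]) { ... }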
@@ -158,7 +43,13 @@ func firstFit(allocation Allocation, task Task) (string, bool) {
     flag := false
     nodeID := ""
     for _, nodeID = range allocation.NodeIDs {
-        if node, ok := allocation.Nodes[nodeID]; ok && len(allocation.TasksOnNode[nodeID]) < node.numberGPU {
+        numberGPU := 0
+        for _, gpu := range allocation.Nodes[nodeID].Status {
+            if gpu.MemoryAllocated == 0 {
+                numberGPU += 0
+            }
+        }
+        if _, ok := allocation.Nodes[nodeID]; ok && len(allocation.TasksOnNode[nodeID]) < numberGPU {
             flag = true
             break
         }
@@ -166,7 +57,7 @@ func firstFit(allocation Allocation, task Task) (string, bool) {
     return nodeID, flag
 }
 
-func fastBestFit(nodes []Node, tasks []Task) Allocation {
+func fastBestFit(nodes []NodeStatus, tasks []Task) Allocation {
     eva := Evaluator{}
     eva.init(nodes, tasks)
 
@@ -179,7 +70,13 @@ func fastBestFit(nodes []Node, tasks []Task) Allocation {
             if _, ok := allocation.TasksOnNode[node.ClientID]; !ok {
                 allocation.TasksOnNode[node.ClientID] = []Task{}
             }
-            if len(allocation.TasksOnNode[node.ClientID]) >= node.numberGPU {
+            numberGPU := 0
+            for _, gpu := range allocation.Nodes[nodeID].Status {
+                if gpu.MemoryAllocated == 0 {
+                    numberGPU += 0
+                }
+            }
+            if len(allocation.TasksOnNode[node.ClientID]) >= numberGPU {
                 continue
             }
             eva.add(node, task)
@@ -200,7 +97,7 @@ func fastBestFit(nodes []Node, tasks []Task) Allocation {
             eva.add(nodesMap[nodeID], task)
         }
     }
-    fmt.Println(eva.calculate())
+    log.Println(eva.calculate())
     return allocation
 }
 
@@ -209,12 +106,18 @@ func bestFit(allocation Allocation, task Task) (string, bool) {
     nodeID := ""
     minCost := math.MaxFloat64
     for _, id := range allocation.NodeIDs {
-        if node, ok := allocation.Nodes[id]; ok && len(allocation.TasksOnNode[id]) < node.numberGPU {
+        numberGPU := 0
+        for _, gpu := range allocation.Nodes[id].Status {
+            if gpu.MemoryAllocated == 0 {
+                numberGPU += 0
+            }
+        }
+        if _, ok := allocation.Nodes[id]; ok && len(allocation.TasksOnNode[id]) < numberGPU {
             /* add */
             allocation.TasksOnNode[id] = append(allocation.TasksOnNode[id], task)
 
             /* evaluate */
-            cost := evaluatue(allocation)
+            cost := evaluate(allocation)
 
             /* revert */
             idx := -1
@@ -236,99 +139,6 @@ func bestFit(allocation Allocation, task Task) (string, bool) {
     return nodeID, flag
 }
 
-func evaluatue(allocation Allocation) float64 {
-    /* Calculate cost for network */
-    costNetwork := 0.0
-    domains := map[string]map[string]int{}
-    racks := map[string]map[string]int{}
-    upstreams := map[string]string{}
-    totalPS := 0
-    totalWorker := 0
-
-    taskToNode := map[string]string{}
-    for nodeID, tasks := range allocation.TasksOnNode {
-        numPS := 0
-        numWorker := 0
-        node := allocation.Nodes[nodeID]
-        for _, task := range tasks {
-            taskToNode[task.Name] = nodeID
-
-            if _, ok := domains[node.Domain]; !ok {
-                domains[node.Domain] = map[string]int{"PS": 0, "Worker": 0}
-            }
-            if _, ok := racks[node.Rack]; !ok {
-                racks[node.Rack] = map[string]int{"PS": 0, "Worker": 0}
-            }
-
-            if task.IsPS {
-                domains[node.Domain]["PS"]++
-                racks[node.Rack]["PS"]++
-                numPS++
-                totalPS++
-            } else {
-                domains[node.Domain]["Worker"]++
-                racks[node.Rack]["Worker"]++
-                numWorker++
-                totalWorker++
-            }
-            upstreams[node.Rack] = node.Domain
-        }
-        costNetwork -= float64(numPS * numWorker)
-    }
-
-    /* in the same domain */
-    for rackID, pair := range racks {
-        // in the same rack
-        costNetwork += float64(pair["PS"]*pair["Worker"]) * 1.0
-        // cross rack, but in the same domain
-        costNetwork += float64(pair["PS"]*(domains[upstreams[rackID]]["Worker"]-pair["Worker"])) * 4.0
-    }
-
-    /* across domain */
-    for _, pair := range domains {
-        costNetwork += float64(pair["PS"]*(totalWorker-pair["Worker"])) * 40.0
-    }
-
-    /* calculate cost for node fitness */
-    //cpu, memory, bw
-    costLB := 0.0
-    for nodeID, tasks := range allocation.TasksOnNode {
-        costCPU := 0.0
-        costMem := 0.0
-        costBW := 0.0
-        costGPU := 0.0
-        requestCPU := 0
-        requestMem := 0
-        requestBW := 0.0
-        requestGPU := 0
-        numberPS := 0
-        numberWorker := 0
-        for _, task := range tasks {
-            requestCPU += task.NumberCPU
-            requestMem += task.Memory
-            requestGPU += task.NumberGPU
-            if task.IsPS {
-                numberPS++
-            } else {
-                numberWorker++
-            }
-        }
-        requestBW = float64(numberPS*(totalWorker-numberWorker) + numberWorker*(totalPS-numberPS))
-        node := allocation.Nodes[nodeID]
-        costCPU += (float64(requestCPU) + node.UtilCPU) / float64(node.NumCPU) * 1.0
-        costMem += (float64(requestMem + (node.MemTotal - node.MemAvailable))) / float64(node.MemTotal) * 1.0
-        costBW += (float64(requestBW) + (node.TotalBW - node.UsingBW)) / node.TotalBW * 2.0
-        costGPU += (float64(requestGPU + node.numberGPU)) / float64(node.numberGPU) * 3.0
-        costLB += (costCPU + costMem + costBW + costGPU) / (1.0 + 1.0 + 2.0 + 3.0)
-    }
-    costLB /= float64(len(allocation.TasksOnNode))
-    costLB *= 100
-    //fmt.Println(costLB)
-
-    cost := 0.0*costLB + 1.0*costNetwork
-    return cost
-}
-
 /* Evaluate the allocation */
 func (X Allocation) Evaluate() (float64, error) {
     if !X.Flags["valid"] {
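The deleted evaluatue function scored a placement by weighting parameter-server/worker traffic 1.0 inside a rack, 4.0 across racks within a domain, and 40.0 across domains, the same 1/4/40 factors the removed Evaluator carried (factorNode, factorRack, factorDomain). A small restatement of that weighting is kept below purely as a reference for what was dropped here; the helper name and its three inputs are illustrative assumptions, not code from the repository.

// crossTrafficCost restates the removed network-cost heuristic for a single PS:
// each worker in the same rack costs 1.0, each worker in another rack of the same
// domain costs 4.0, and each worker in another domain costs 40.0.
func crossTrafficCost(sameRackWorkers, sameDomainWorkers, totalWorkers int) float64 {
    cost := 1.0 * float64(sameRackWorkers)
    cost += 4.0 * float64(sameDomainWorkers-sameRackWorkers)
    cost += 40.0 * float64(totalWorkers-sameDomainWorkers)
    return cost
}

// Example: 2 workers beside the PS, 1 more elsewhere in the same domain, 1 in another
// domain gives crossTrafficCost(2, 3, 4) = 2*1 + 1*4 + 1*40 = 46.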
@@ -336,7 +146,7 @@ func (X Allocation) Evaluate() (float64, error) {
         return math.MaxFloat64, nil
     }
 
-    costNetwork := evaluatue(X)
+    costNetwork := evaluate(X)
 
     cost := costNetwork
     //fmt.Println(taskToNode, cost, len(X.Nodes))
@@ -443,7 +253,7 @@ func (X Allocation) Crossover(Y eaopt.Genome, rng *rand.Rand) {
             idx := -1
             nodeID2, ok := taskToNode[task.Name]
             if !ok {
-                fmt.Println("Error", taskToNode, X.TasksOnNode, task.Name)
+                log.Println("Error", taskToNode, X.TasksOnNode, task.Name)
             }
             for i, task2 := range X.TasksOnNode[nodeID2] {
                 if task2.Name == task.Name {
@@ -451,7 +261,7 @@ func (X Allocation) Crossover(Y eaopt.Genome, rng *rand.Rand) {
                 }
             }
             if idx == -1 {
-                fmt.Println("Error 2", taskToNode, X.TasksOnNode, task.Name)
+                log.Println("Error 2", taskToNode, X.TasksOnNode, task.Name)
             }
             //fmt.Println(X.TasksOnNode)
             copy(X.TasksOnNode[nodeID2][idx:], X.TasksOnNode[nodeID2][idx+1:])
@@ -496,7 +306,7 @@ func (X Allocation) Clone() eaopt.Genome {
     if !X.Flags["valid"] {
         //fmt.Println(X.Valid)
     }
-    Y := Allocation{TasksOnNode: map[string][]Task{}, Nodes: map[string]Node{}, Flags: map[string]bool{"valid": X.Flags["valid"]}}
+    Y := Allocation{TasksOnNode: map[string][]Task{}, Nodes: map[string]NodeStatus{}, Flags: map[string]bool{"valid": X.Flags["valid"]}}
     for id, node := range X.Nodes {
         Y.Nodes[id] = node
         Y.NodeIDs = append(Y.NodeIDs, node.ClientID)
@@ -512,9 +322,9 @@ func (X Allocation) Clone() eaopt.Genome {
 }
 
 func VectorFactory(rng *rand.Rand) eaopt.Genome {
-    allocation := Allocation{TasksOnNode: map[string][]Task{}, Nodes: map[string]Node{}, Flags: map[string]bool{"valid": true}}
+    allocation := Allocation{TasksOnNode: map[string][]Task{}, Nodes: map[string]NodeStatus{}, Flags: map[string]bool{"valid": true}}
 
-    var nodes []Node
+    var nodes []NodeStatus
     var tasks []Task
 
     for _, node := range nodesMap {
@@ -556,7 +366,7 @@ func VectorFactory(rng *rand.Rand) eaopt.Genome {
         */
 
         allocation.TasksOnNode = fastBestFit(nodes, tasks).TasksOnNode
-        fmt.Println(time.Since(ts))
+        log.Println(time.Since(ts))
         //fmt.Println("Best Fit")
     } else if t%2 == 0 {
         /* first-fit */
@@ -582,23 +392,26 @@ func VectorFactory(rng *rand.Rand) eaopt.Genome {
     return allocation
 }
 
-func main3() {
+func testGA() {
     numTask := 20
 
-    nodesMap = map[string]Node{}
+    nodesMap = map[string]NodeStatus{}
     tasksMap = map[string]Task{}
 
     for i := 0; i < numTask*3; i++ {
-        node := Node{ClientID: strconv.Itoa(i), Rack: strconv.Itoa(i % 40), Domain: strconv.Itoa(i % 4)}
+        node := NodeStatus{ClientID: strconv.Itoa(i), Rack: strconv.Itoa(i % 40), Domain: strconv.Itoa(i % 4)}
         node.NumCPU = 24
         node.MemTotal = 188
         node.TotalBW = 100
-        node.numberGPU = rand.Intn(3) + 1
+        cnt := rand.Intn(3) + 1
+        for i := 0; i < cnt; i++ {
+            node.Status = append(node.Status, GPUStatus{MemoryTotal: 11439, MemoryAllocated: 0, UUID: node.ClientID + strconv.Itoa(i)})
+        }
         nodesMap[strconv.Itoa(i)] = node
     }
     for i := 0; i < numTask; i++ {
         isPS := false
-        if i>= 3 {
+        if i >= 3 {
             isPS = true
         }
         task := Task{Name: strconv.Itoa(i), IsPS: isPS}
@@ -608,7 +421,7 @@ func main3() {
         tasksMap[strconv.Itoa(i)] = task
     }
 
-    var nodes []Node
+    var nodes []NodeStatus
     var tasks []Task
 
     for _, node := range nodesMap {
@@ -619,12 +432,12 @@ func main3() {
     }
     s := time.Now()
     allocation := fastBestFit(nodes, tasks)
-    fmt.Println(time.Since(s))
+    log.Println(time.Since(s))
 
     // Instantiate a GA with a GAConfig
     var ga, err = eaopt.NewDefaultGAConfig().NewGA()
     if err != nil {
-        fmt.Println(err)
+        log.Println(err)
         return
     }
 
@@ -635,7 +448,7 @@ func main3() {
 
     // Add a custom print function to track progress
     ga.Callback = func(ga *eaopt.GA) {
-        fmt.Printf("Best fitness at generation %d: %f\n", ga.Generations, ga.HallOfFame[0].Fitness)
+        log.Printf("Best fitness at generation %d: %f\n", ga.Generations, ga.HallOfFame[0].Fitness)
     }
 
     bestFitness := math.MaxFloat64
@@ -647,7 +460,7 @@ func main3() {
         gap := math.Abs(ga.HallOfFame[0].Fitness - bestFitness)
         if gap <= 0.000001 || ga.HallOfFame[0].Fitness >= bestFitness {
             if count >= 30 || time.Since(ts) > time.Second*30 {
-                fmt.Println("Early Stop")
+                log.Println("Early Stop")
                 return true
             } else {
                 count++
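The two hunks above keep the GA's progress reporting and stagnation check but route the output through logrus. For context, the surrounding (mostly unchanged) logic stops the search once the best fitness in ga.HallOfFame has not improved for roughly 30 generations or 30 seconds. A compact, self-contained sketch of that closure follows; the wrapper function, the reset branch and the wiring into the GA's early-stop hook are assumptions, since most of the closure sits outside these hunks.

// makeEarlyStop sketches the stagnation-based early stop used around these hunks.
// Only the inner condition and the "Early Stop" log line are taken from the diff.
func makeEarlyStop() func(ga *eaopt.GA) bool {
    bestFitness := math.MaxFloat64
    count := 0
    ts := time.Now()
    return func(ga *eaopt.GA) bool {
        gap := math.Abs(ga.HallOfFame[0].Fitness - bestFitness)
        if gap <= 0.000001 || ga.HallOfFame[0].Fitness >= bestFitness {
            if count >= 30 || time.Since(ts) > time.Second*30 {
                log.Println("Early Stop")
                return true
            } else {
                count++
            }
        } else {
            // assumed: remember the new best and reset the stagnation counter
            bestFitness = ga.HallOfFame[0].Fitness
            count = 0
        }
        return false
    }
}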
@@ -661,13 +474,13 @@ func main3() {
 
     // Find the minimum
     err = ga.Minimize(VectorFactory)
-    fmt.Println(time.Since(ts))
-    fmt.Println(ga.HallOfFame[0].Genome.(Allocation).TasksOnNode)
+    log.Println(time.Since(ts))
+    log.Println(ga.HallOfFame[0].Genome.(Allocation).TasksOnNode)
     //fmt.Println(ga.HallOfFame[0].Genome.(Allocation).Nodes)
     if err != nil {
-        fmt.Println(err)
+        log.Println(err)
         return
     }
 
-    fmt.Println(allocation)
+    log.Println(allocation)
 }
@@ -9,5 +9,3 @@ type Group struct {
     CPU    int `json:"quota_cpu"`
     Memory int `json:"quota_mem"`
 }
-
-
@@ -6,124 +6,109 @@ import (
     "strings"
     "io/ioutil"
     "encoding/json"
-    "strconv"
     log "github.com/sirupsen/logrus"
+    "sync"
+    "strconv"
+    "math/rand"
 )
 
 type JobManager struct {
     scheduler Scheduler
     job       Job
     jobStatus JobStatus
     resources []NodeStatus
-    killedFlag bool
+    resourcesMu sync.Mutex
     isRunning bool
-    network string
+    killFlag bool
+    network string
 }
 
 func (jm *JobManager) start() {
     log.Info("start job ", jm.job.Name, time.Now())
     jm.isRunning = false
+    jm.killFlag = false
     jm.jobStatus = JobStatus{Name: jm.job.Name, tasks: map[string]TaskStatus{}}
 
-    jm.network = InstanceOfResourcePool().acquireNetwork()
+    /* register in JHL */
     InstanceJobHistoryLogger().submitJob(jm.job)
 
+    /* request for private network */
+    jm.network = InstanceOfResourcePool().acquireNetwork()
+
     /* request for resources */
-    for range jm.job.Tasks { //append would cause uncertain order
-        jm.resources = append(jm.resources, NodeStatus{ClientID: "null"})
-    }
-
-    var nodes []NodeStatus
     for {
-        if jm.killedFlag {
+        if jm.killFlag {
             break
         }
-        nodes = jm.scheduler.AcquireResource(jm.job)
-        if len(nodes) > 0 {
+        jm.resources = jm.scheduler.AcquireResource(jm.job)
+        if len(jm.resources) > 0 {
+            log.Info("Receive resource", jm.resources)
             break
         }
-        time.Sleep(time.Second * 1)
-    }
-    log.Info("Receive resource", nodes)
-    jm.resources = nodes
-
-    for _, node := range nodes {
-        for _, t := range node.Status {
-            InstanceOfResourcePool().attach(t.UUID, jm.job.Name)
-        }
+        /* sleep random Millisecond to avoid deadlock */
+        time.Sleep(time.Millisecond * time.Duration(500+rand.Intn(500)))
     }
 
-    if !jm.killedFlag {
+    if !jm.killFlag {
+        /* switch to Running state */
         jm.scheduler.UpdateProgress(jm.job, Running)
-        jm.isRunning = true
         log.Info("ready to run job ", jm.job.Name, time.Now())
-    }
 
         /* bring up containers */
-    for i := range jm.job.Tasks {
-        if jm.killedFlag {
-            break
-        }
-        var GPUs []string
-        for _, GPU := range jm.resources[i].Status {
-            GPUs = append(GPUs, GPU.UUID)
-        }
-
-        for attempt := 0; attempt < 3; attempt++ {
-            if attempt == 2 { //failed more than once
-                //for {
-                //    resource := jm.scheduler.AcquireResource(jm.job, jm.job.Tasks[i], jm.resources)
-                //    if len(resource.Status) > 0 {
-                //        break
-                //    }
-                time.Sleep(time.Second * 1)
-                break
-                //}
-            }
-
-            v := url.Values{}
-            v.Set("image", jm.job.Tasks[i].Image)
-            v.Set("cmd", jm.job.Tasks[i].Cmd)
-            v.Set("name", jm.job.Tasks[i].Name)
-            v.Set("workspace", jm.job.Workspace)
-            v.Set("gpus", strings.Join(GPUs, ","))
-            v.Set("mem_limit", strconv.Itoa(jm.job.Tasks[i].Memory)+"m")
-            v.Set("cpu_limit", strconv.Itoa(jm.job.Tasks[i].NumberCPU))
-            v.Set("network", jm.network)
-            v.Set("should_wait", "1")
-            v.Set("output_dir", "/tmp/")
-            v.Set("hdfs_address", "http://192.168.100.104:50070/")
-            v.Set("hdfs_dir", "/user/yao/output/"+jm.job.Name)
-            v.Set("gpu_mem", strconv.Itoa(jm.job.Tasks[i].MemoryGPU))
-
-            resp, err := doRequest("POST", "http://"+jm.resources[i].ClientHost+":8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
-            if err != nil {
-                log.Warn(err.Error())
-                continue
-            }
-
-            body, err := ioutil.ReadAll(resp.Body)
-            resp.Body.Close()
-            if err != nil {
-                log.Warn(err)
-                continue
-            }
-
-            var res MsgCreate
-            err = json.Unmarshal([]byte(string(body)), &res)
-            if err != nil {
-                log.Warn(err)
-                continue
-            }
-            if res.Code != 0 {
-                log.Warn(res)
-            }
-            if res.Code == 0 {
-                jm.jobStatus.tasks[jm.job.Tasks[i].Name] = TaskStatus{Id: res.Id, Node: jm.resources[i].ClientHost}
-                break
-            }
+        wg := sync.WaitGroup{}
+        for i := range jm.job.Tasks {
+            wg.Add(1)
+
+            go func(index int) {
+                defer wg.Done()
+                var UUIDs []string
+                for _, GPU := range jm.resources[index].Status {
+                    UUIDs = append(UUIDs, GPU.UUID)
+
+                    /* attach to GPUs */
+                    InstanceOfResourcePool().attach(GPU.UUID, jm.job.Name)
+                }
+                GPUs := strings.Join(UUIDs, ",")
+
+                v := url.Values{}
+                v.Set("image", jm.job.Tasks[index].Image)
+                v.Set("cmd", jm.job.Tasks[index].Cmd)
+                v.Set("name", jm.job.Tasks[index].Name)
+                v.Set("workspace", jm.job.Workspace)
+                v.Set("gpus", GPUs)
+                v.Set("mem_limit", strconv.Itoa(jm.job.Tasks[index].Memory)+"m")
+                v.Set("cpu_limit", strconv.Itoa(jm.job.Tasks[index].NumberCPU))
+                v.Set("network", jm.network)
+                v.Set("should_wait", "1")
+                v.Set("output_dir", "/tmp/")
+                v.Set("hdfs_address", "http://192.168.100.104:50070/")
+                v.Set("hdfs_dir", "/user/yao/output/"+jm.job.Name)
+                v.Set("gpu_mem", strconv.Itoa(jm.job.Tasks[index].MemoryGPU))
+
+                resp, err := doRequest("POST", "http://"+jm.resources[index].ClientHost+":8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
+                if err != nil {
+                    log.Warn(err.Error())
+                    return
+                }
+
+                body, err := ioutil.ReadAll(resp.Body)
+                resp.Body.Close()
+                if err != nil {
+                    log.Warn(err)
+                    return
+                }
+
+                var res MsgCreate
+                err = json.Unmarshal([]byte(string(body)), &res)
+                if err != nil || res.Code != 0 {
+                    log.Warn(res)
+                    return
+                }
+                jm.jobStatus.tasks[jm.job.Tasks[index].Name] = TaskStatus{Id: res.Id, Node: jm.resources[index].ClientHost}
+            }(i)
         }
+        wg.Wait()
+        jm.isRunning = true
     }
 
     /* monitor job execution */
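The start() hunk above replaces the sequential, retrying container-creation loop with one goroutine per task, joined by a sync.WaitGroup, and only marks the job running after wg.Wait(). Stripped of the HTTP details, the concurrency shape is the standard fan-out/join shown below; launchTask and the task count are placeholders standing in for the per-task url.Values build and the POST to <host>:8000/create.

package main

import (
    "log"
    "sync"
)

// launchTask stands in for the per-task container creation done in each goroutine.
func launchTask(index int) error {
    log.Println("launching task", index)
    return nil
}

func bringUpAll(taskCount int) {
    wg := sync.WaitGroup{}
    for i := 0; i < taskCount; i++ {
        wg.Add(1)
        go func(index int) { // pass the loop variable explicitly, as the commit does
            defer wg.Done()
            if err := launchTask(index); err != nil {
                log.Println("task", index, "failed:", err)
                return
            }
        }(i)
    }
    wg.Wait() // every task has been attempted before the job is marked running
}

func main() {
    bringUpAll(4)
}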
@@ -134,87 +119,94 @@ func (jm *JobManager) start() {
         }
         time.Sleep(time.Second * 25)
     }
 
+    /* make sure resource are released */
+    jm.returnResource(jm.status().Status)
+    log.Info("JobMaster exited ", jm.job.Name)
 }
 
-func (jm *JobManager) checkStatus(status []TaskStatus) bool {
-    if !jm.isRunning {
-        return false
+/* release all resource */
+func (jm *JobManager) returnResource(status []TaskStatus) {
+    jm.resourcesMu.Lock()
+    defer jm.resourcesMu.Unlock()
+    if len(jm.resources) == 0 {
+        return
     }
-    flag := false
+    /* return resource */
+    for i := range jm.resources {
+        jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
+        log.Info("return resource ", jm.resources[i].ClientID)
+
+        for _, t := range jm.resources[i].Status {
+            InstanceOfResourcePool().detach(t.UUID, jm.job)
+        }
+
+        InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
+
+        /* remove exited containers */
+        //v := url.Values{}
+        //v.Set("id", res.Status[i].Id)
+        //
+        //_, err := doRequest("POST", "http://"+res.Status[i].Node+":8000/remove", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
+        //if err != nil {
+        //    log.Warn(err.Error())
+        //    continue
+        //}
+    }
+    InstanceOfResourcePool().releaseNetwork(jm.network)
+    jm.resources = []NodeStatus{}
+}
+
+/* monitor all tasks */
+func (jm *JobManager) checkStatus(status []TaskStatus) {
+    if !jm.isRunning {
+        return
+    }
+    flagRunning := false
     onlyPS := true
     for i := range status {
         if status[i].Status == "ready" {
             log.Debug(jm.job.Name, "-", i, " is ready to run")
-            flag = true
-            if !jm.job.Tasks[i].IsPS {
-                onlyPS = false
-            }
-        } else if status[i].Status == "unknown" {
-            log.Debug(jm.job.Name, "-", i, " is starting")
-            flag = true
+            flagRunning = true
             if !jm.job.Tasks[i].IsPS {
                 onlyPS = false
             }
         } else if status[i].Status == "running" {
             log.Debug(jm.job.Name, "-", i, " is running")
-            flag = true
+            flagRunning = true
             if !jm.job.Tasks[i].IsPS {
                 onlyPS = false
             }
             InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
         } else {
             log.Info(jm.job.Name, "-", i, " ", status[i].Status)
-            if exitCode, ok := status[i].State["ExitCode"].(float64); ok && !jm.job.Tasks[i].IsPS {
-                if exitCode != 0 && !jm.killedFlag {
-                    log.Warn(jm.job.Name+"-"+jm.job.Tasks[i].Name+" exited unexpected, exitCode=", exitCode)
-                    jm.killedFlag = true
+            if exitCode, ok := status[i].State["ExitCode"].(float64); ok && exitCode != 0 && !jm.killFlag {
+                log.Warn(jm.job.Name+"-"+jm.job.Tasks[i].Name+" exited unexpected, exitCode=", exitCode)
+                jm.isRunning = false
+                jm.stop(false)
                 jm.scheduler.UpdateProgress(jm.job, Failed)
-                }
-            }
+                jm.returnResource(status)
+                break
 
-            /* remove exited containers */
-            //v := url.Values{}
-            //v.Set("id", res.Status[i].Id)
-            //
-            //_, err := doRequest("POST", "http://"+res.Status[i].Node+":8000/remove", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
-            //if err != nil {
-            //    log.Warn(err.Error())
-            //    continue
-            //}
-
-            /* return resource */
-            if jm.resources[i].ClientID != "null" {
-                jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
-                log.Info("return resource ", jm.resources[i].ClientID)
-                jm.resources[i].ClientID = "null"
-
-                for _, t := range jm.resources[i].Status {
-                    InstanceOfResourcePool().detach(t.UUID, jm.job)
-                }
-
-                InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
             }
         }
     }
-    if flag && onlyPS {
-        jm.stop()
+    if jm.isRunning && onlyPS {
         log.Info("Only PS is running, stop ", jm.job.Name)
-        jm.killedFlag = false
-    }
 
-    if !flag {
         jm.isRunning = false
-        InstanceOfResourcePool().releaseNetwork(jm.network)
-        if !jm.killedFlag {
-            jm.scheduler.UpdateProgress(jm.job, Finished)
-            log.Info("finish job ", jm.job.Name)
-        }
-        log.Info("JobMaster exited ", jm.job.Name)
+        jm.stop(false)
+        jm.scheduler.UpdateProgress(jm.job, Finished)
+        jm.returnResource(status)
+    }
+
+    if jm.isRunning && !flagRunning && !jm.killFlag {
+        jm.isRunning = false
+        jm.scheduler.UpdateProgress(jm.job, Finished)
+        jm.returnResource(status)
+        log.Info("finish job ", jm.job.Name)
     }
-    return flag
 }
 
+/* fetch logs of task */
 func (jm *JobManager) logs(taskName string) MsgLog {
     spider := Spider{}
     spider.Method = "GET"
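The new returnResource is written so it can be called from several places (normal finish, task failure, and the deferred cleanup at the end of start()) without double-releasing: it takes resourcesMu, bails out if the slice is already empty, and clears jm.resources after detaching GPUs and releasing the private network. A minimal, self-contained sketch of that idempotent-release shape follows; the type and function names here are generic stand-ins, not the scheduler's real API.

package main

import (
    "log"
    "sync"
)

type releaser struct {
    mu        sync.Mutex
    resources []string // stand-in for []NodeStatus
    release   func(r string)
}

// returnAll releases everything exactly once; later calls are no-ops because the
// slice is cleared under the same mutex that guards the emptiness check.
func (j *releaser) returnAll() {
    j.mu.Lock()
    defer j.mu.Unlock()
    if len(j.resources) == 0 {
        return
    }
    for _, r := range j.resources {
        j.release(r)
    }
    j.resources = nil
}

func main() {
    j := &releaser{
        resources: []string{"node-a", "node-b"},
        release:   func(r string) { log.Println("released", r) },
    }
    j.returnAll()
    j.returnAll() // second call does nothing
}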
@@ -234,21 +226,22 @@ func (jm *JobManager) logs(taskName string) MsgLog {
 
     body, err := ioutil.ReadAll(resp.Body)
     if err != nil {
-        return MsgLog{Code: 1, Error: err.Error()}
+        return MsgLog{Code: 2, Error: err.Error()}
     }
 
     var res MsgLog
     err = json.Unmarshal([]byte(string(body)), &res)
     if err != nil {
         log.Println(err)
-        return MsgLog{Code: 1, Error: "Unknown"}
+        return MsgLog{Code: 3, Error: "Unknown"}
     }
     return res
 }
 
+/* fetch job tasks status */
 func (jm *JobManager) status() MsgJobStatus {
     var tasksStatus []TaskStatus
-    for range jm.job.Tasks {
+    for range jm.job.Tasks { //append would cause uncertain order
         tasksStatus = append(tasksStatus, TaskStatus{})
     }
 
@@ -286,22 +279,23 @@ func (jm *JobManager) status() MsgJobStatus {
     return MsgJobStatus{Status: tasksStatus}
 }
 
-func (jm *JobManager) stop() MsgStop {
-    jm.killedFlag = true
-    go func() { /* kill at background */
-        for _, taskStatus := range jm.jobStatus.tasks {
-            v := url.Values{}
-            v.Set("id", taskStatus.Id)
+/* force stop all containers */
+func (jm *JobManager) stop(force bool) MsgStop {
+    for _, taskStatus := range jm.jobStatus.tasks {
+        v := url.Values{}
+        v.Set("id", taskStatus.Id)
 
         _, err := doRequest("POST", "http://"+taskStatus.Node+":8000/stop", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
         if err != nil {
             log.Warn(err.Error())
             continue
-            }
         }
-    }()
+    }
 
-    jm.scheduler.UpdateProgress(jm.job, Stopped)
-    log.Info("kill job, ", jm.job.Name)
+    if force {
+        jm.killFlag = true
+        jm.scheduler.UpdateProgress(jm.job, Stopped)
+        log.Info("kill job, ", jm.job.Name)
+    }
     return MsgStop{Code: 0}
 }
@@ -292,7 +292,7 @@ func main() {
 
     /* init components */
     InstanceOfResourcePool().init(config)
-    InstanceOfColector().init(config)
+    InstanceOfCollector().init(config)
     InstanceJobHistoryLogger().init(config)
     InstanceOfOptimizer().init(config)
     InstanceOfGroupManager().init(config)
@@ -30,11 +30,13 @@ type NodeStatus struct {
     ClientID     string  `json:"id"`
     ClientHost   string  `json:"host"`
     Domain       string  `json:"domain"`
-    Rack         int     `json:"rack"`
+    Rack         string  `json:"rack"`
     Version      float64 `json:"version"`
     NumCPU       int     `json:"cpu_num"`
     UtilCPU      float64 `json:"cpu_load"`
     MemTotal     int     `json:"mem_total"`
     MemAvailable int     `json:"mem_available"`
+    UsingBW      float64 `json:"bw_using"`
+    TotalBW      float64 `json:"bw_total"`
     Status       []GPUStatus `json:"status"`
 }
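With this hunk a node reports its rack as a string and carries the bandwidth gauges that the GA and fit code read. A hypothetical literal showing the shape of the struct after the change; every value below is invented for illustration, only the field names and JSON tags come from the struct itself.

// Illustrative only: values are made up, the GPUStatus fields mirror those used in testGA.
node := NodeStatus{
    ClientID:     "node-1",
    ClientHost:   "192.168.0.10",
    Domain:       "d0",
    Rack:         "r3", // was an int before this commit
    Version:      1.0,
    NumCPU:       24,
    UtilCPU:      3.5,
    MemTotal:     188,
    MemAvailable: 120,
    UsingBW:      10.0,  // new field
    TotalBW:      100.0, // new field
    Status:       []GPUStatus{{UUID: "GPU-0", MemoryTotal: 11439, MemoryAllocated: 0}},
}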
@@ -807,6 +807,12 @@ func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
     /* assign */
     var ress []NodeStatus
     if len(candidates) > 0 {
+        /*
+            for range job.Tasks { //append would cause uncertain order
+                resources = append(resources, NodeStatus{ClientID: "null"})
+            }
+        */
+
         var nodes []NodeStatus
         if len(job.Tasks) == 1 {
             node := pool.pickNode(candidates, availableGPUs, task, job, []NodeStatus{})
@@ -407,7 +407,7 @@ func (scheduler *SchedulerFair) Stop(jobName string) MsgStop {
     if !ok {
         return MsgStop{Code: 1, Error: "Job not exist!"}
     }
-    return jm.stop()
+    return jm.stop(true)
 }
 
 func (scheduler *SchedulerFair) QueryLogs(jobName string, taskName string) MsgLog {
@@ -4,7 +4,7 @@ import (
     "sync"
     "time"
     log "github.com/sirupsen/logrus"
 )
 
 type SchedulerPriority struct {
     history []*Job
@@ -6,7 +6,7 @@ import (
     "time"
     "io"
     "net/http"
 )
 
 type Configuration struct {
     KafkaBrokers []string `json:"kafkaBrokers"`