mirror of https://github.com/newnius/YAO-scheduler.git synced 2025-12-15 08:16:43 +00:00
commit ec30e79c81
parent f7149310e8
2020-05-26 20:46:11 +08:00
5 changed files with 298 additions and 218 deletions


@@ -158,14 +158,14 @@ func (pool *ResourcePool) checkDeadNodes() {
 func (pool *ResourcePool) GPUModelToPower(model string) int {
 	mapper := map[string]int{
-		"K40": 1, "Tesla K40": 1,
-		"K80": 2, "Tesla K80": 2,
-		"P100": 3, "Tesla P100": 3,
+		"K40": 2, "Tesla K40": 2,
+		"K80": 3, "Tesla K80": 3,
+		"P100": 4, "Tesla P100": 4,
 	}
 	if power, err := mapper[model]; !err {
 		return power
 	}
-	return 0
+	return 1
 }
 func (pool *ResourcePool) getNodePool(name string) int {
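Note on the hunk above: every known model's relative power moves up by one (K40 = 2, K80 = 3, P100 = 4) and the fallback changes from 0 to 1, so an unrecognized GPU now ranks just below the weakest known card instead of at zero. A minimal standalone sketch of that lookup-with-default behaviour follows; the free-standing function is illustrative only, and it uses the conventional comma-ok check where the context line above names the flag err and negates it.

    package main

    import "fmt"

    // gpuModelToPower mirrors the mapping after this commit: known models map
    // to 2/3/4 and anything else falls back to 1, the lowest rank.
    func gpuModelToPower(model string) int {
        mapper := map[string]int{
            "K40": 2, "Tesla K40": 2,
            "K80": 3, "Tesla K80": 3,
            "P100": 4, "Tesla P100": 4,
        }
        if power, ok := mapper[model]; ok {
            return power
        }
        return 1
    }

    func main() {
        fmt.Println(gpuModelToPower("Tesla P100")) // 4
        fmt.Println(gpuModelToPower("GTX 1080"))   // 1: unknown model, new default
    }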
@@ -639,12 +639,16 @@ func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
 	locks := map[int]*sync.Mutex{}
 	allocationType := 0
 	availableGPUs := map[string][]GPUStatus{}
-	var candidates []*NodeStatus
+	var candidates []NodeStatus
+	if pool.TotalGPU == 0 {
+		return []NodeStatus{}
+	}
+	loadRatio := float64(pool.UsingGPU) / float64(pool.TotalGPU)
 	/* first, choose sharable GPUs */
-	if pool.enableShare && (pool.TotalGPU != 0 && len(job.Tasks) == 1 && task.NumberGPU == 1 && float64(pool.UsingGPU)/float64(pool.TotalGPU) >= pool.enableShareRatio) {
+	if pool.enableShare && len(job.Tasks) == 1 && task.NumberGPU == 1 && loadRatio >= pool.enableShareRatio {
 		// check sharable
 		allocationType = 1
 		if util, valid := InstanceOfOptimizer().predictUtilGPU(job.Name); valid {
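The hunk above makes acquireResource return immediately when the pool has no GPUs and computes the load ratio once into loadRatio, so the share threshold (and the pre-schedule threshold further down) compare against the same value and the inline TotalGPU != 0 guard against dividing by zero is no longer needed inside the conditions. A small sketch of that guard-then-compare pattern, with trimmed stand-in types that keep only the fields used here:

    package pool

    // resourcePool is a simplified stand-in for the scheduler's ResourcePool.
    type resourcePool struct {
        TotalGPU         int
        UsingGPU         int
        enableShare      bool
        enableShareRatio float64
    }

    // shouldTryShare reproduces the reshaped condition: an empty pool short-
    // circuits, otherwise the ratio is computed once and compared against the
    // share-enable threshold for single-task, single-GPU jobs.
    func (p *resourcePool) shouldTryShare(numTasks, numGPU int) bool {
        if p.TotalGPU == 0 {
            return false // nothing to allocate; mirrors the early return
        }
        loadRatio := float64(p.UsingGPU) / float64(p.TotalGPU)
        return p.enableShare && numTasks == 1 && numGPU == 1 && loadRatio >= p.enableShareRatio
    }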
@@ -671,13 +675,12 @@ func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
 						}
 						if totalUtil < 100 {
 							available = append(available, status)
+							availableGPUs[node.ClientID] = available
 						}
 					}
 				}
 			}
 			if len(available) >= task.NumberGPU {
-				candidates = append(candidates, node)
+				candidates = append(candidates, *node)
 				if len(candidates) >= len(job.Tasks)*3+5 {
 					break
 				}
@@ -711,8 +714,7 @@ func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
 				}
 			}
 			if len(available) >= task.NumberGPU {
-				candidates = append(candidates, node)
-				availableGPUs[node.ClientID] = available
+				candidates = append(candidates, *node)
 				if len(candidates) >= len(job.Tasks)*3+5 {
 					break
 				}
@@ -733,11 +735,7 @@ func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
 	if len(candidates) == 0 && len(job.Tasks) == 1 && task.NumberGPU == 1 && pool.enablePreSchedule {
 		estimate, valid := InstanceOfOptimizer().predictTime(job.Name)
-		//log.Info(pool.TotalGPU)
-		//log.Info(estimate, valid)
-		//log.Info(scheduler.UsingGPU)
-		if pool.TotalGPU != 0 && float64(pool.UsingGPU)/float64(pool.TotalGPU) >= pool.enablePreScheduleRatio && valid {
+		if loadRatio >= pool.enablePreScheduleRatio && valid {
 			allocationType = 3
 			for cur := start; ; {
 				if _, ok := locks[cur.ID]; !ok {
@@ -765,8 +763,7 @@ func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
 				}
 			}
 			if len(available) >= task.NumberGPU {
-				candidates = append(candidates, node)
-				availableGPUs[node.ClientID] = available
+				candidates = append(candidates, *node)
 				if len(candidates) >= len(job.Tasks)*3+5 {
 					break
 				}
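In all three candidate-collection passes touched above (sharable GPUs, whole idle GPUs, and the pre-schedule path), candidates changes from []*NodeStatus to []NodeStatus and the code appends *node, so the slice now holds copies of the node snapshots rather than pointers into live pool state, and the availableGPUs bookkeeping after the append is dropped or moved. A toy illustration of what that value copy does and does not isolate (trimmed stand-in types, not the scheduler's real structs):

    package main

    import "fmt"

    type gpuStatus struct{ MemoryAllocated int }

    type nodeStatus struct {
        ClientID string
        Status   []gpuStatus
    }

    func main() {
        node := &nodeStatus{ClientID: "n1", Status: []gpuStatus{{}}}

        byRef := []*nodeStatus{node} // pointer slice: aliases the live node
        byVal := []nodeStatus{*node} // value slice: stores a copy of the struct

        node.ClientID = "renamed"
        fmt.Println(byRef[0].ClientID) // "renamed": the alias sees the change
        fmt.Println(byVal[0].ClientID) // "n1": the copy does not

        // Caveat: the copied struct still shares the backing array of the
        // inner Status slice, which is presumably why the new assignment path
        // below deep-copies candidates with node.Copy() before fastBestFit.
        node.Status[0].MemoryAllocated = 128
        fmt.Println(byVal[0].Status[0].MemoryAllocated) // 128: shared backing array
    }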
@@ -792,44 +789,69 @@ func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
 	/* assign */
 	var ress []NodeStatus
 	if len(candidates) > 0 {
-		/*
-			for range job.Tasks { //append would cause uncertain order
-				resources = append(resources, NodeStatus{ClientID: "null"})
-			}
-		*/
-		var nodes []NodeStatus
-		if len(job.Tasks) == 1 {
-			node := pool.pickNode(candidates, availableGPUs, task, job, []NodeStatus{})
-			nodes = append(nodes, *node)
-		}
-		for _, node := range nodes {
-			res := NodeStatus{}
-			res.ClientID = node.ClientID
-			res.ClientHost = node.ClientHost
-			res.Status = availableGPUs[node.ClientID][0:task.NumberGPU]
-			res.NumCPU = task.NumberCPU
-			res.MemTotal = task.Memory
-			for i := range res.Status {
-				for j := range node.Status {
-					if res.Status[i].UUID == node.Status[j].UUID {
-						if node.Status[j].MemoryAllocated == 0 {
-							pool.UsingGPUMu.Lock()
-							pool.UsingGPU ++
-							pool.UsingGPUMu.Unlock()
-						}
-						node.Status[j].MemoryAllocated += task.MemoryGPU
-						res.Status[i].MemoryTotal = task.MemoryGPU
-					}
-				}
-			}
-			for _, t := range res.Status {
-				pool.attach(t.UUID, job.Name)
-			}
-			ress = append(ress, res)
-		}
+		for range job.Tasks { //append would cause uncertain order
+			ress = append(ress, NodeStatus{ClientID: "null"})
+		}
+		var nodesT []NodeStatus
+		for _, node := range candidates {
+			nodesT = append(nodesT, node.Copy())
+		}
+		allocation := fastBestFit(nodesT, job.Tasks)
+		if !allocation.Flags["valid"] {
+			return []NodeStatus{}
+		}
+		for nodeID, tasks := range allocation.TasksOnNode {
+			var node *NodeStatus
+			for i := range candidates {
+				if candidates[i].ClientID == nodeID {
+					node = &candidates[i]
+				}
+			}
+			var available []GPUStatus
+			for _, gpu := range node.Status {
+				if gpu.MemoryAllocated == 0 {
+					available = append(available, gpu)
+				}
+			}
+			for _, task := range tasks {
+				res := NodeStatus{}
+				res.ClientID = node.ClientID
+				res.ClientHost = node.ClientHost
+				res.NumCPU = task.NumberCPU
+				res.MemTotal = task.Memory
+				res.Status = available[0:task.NumberGPU]
+				available = available[task.NumberGPU:]
+				for i := range res.Status {
+					for j := range node.Status {
+						if res.Status[i].UUID == node.Status[j].UUID {
+							if node.Status[j].MemoryAllocated == 0 {
+								pool.UsingGPUMu.Lock()
+								pool.UsingGPU ++
+								pool.UsingGPUMu.Unlock()
+							}
+							node.Status[j].MemoryAllocated += task.MemoryGPU
+							res.Status[i].MemoryTotal = task.MemoryGPU
+						}
+					}
+				}
+				for _, t := range res.Status {
+					pool.attach(t.UUID, job.Name)
+				}
+				for i := range job.Tasks {
+					if job.Tasks[i].Name == task.Name {
+						ress[i] = res
+					}
+				}
+			}
+		}
 	}
 	for segID, lock := range locks {
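The rewritten assignment block above pre-fills ress with one placeholder per task so the result order no longer depends on append order, deep-copies the candidate nodes with node.Copy() before handing them to fastBestFit, rejects the job when the returned allocation is not flagged valid, and then materializes each placement, bumping UsingGPU only when a GPU goes from idle to allocated and writing each result back into ress by task name. A condensed sketch of the placeholder-and-write-back-by-name part of that flow; the Task/Allocation shapes and the node IDs below are assumptions modeled on how they are used in this diff, not the project's real types:

    package main

    import "fmt"

    type task struct{ Name string }

    // allocationPlan is a stand-in for fastBestFit's result: a validity flag
    // plus a map from node ID to the tasks placed on that node.
    type allocationPlan struct {
        Flags       map[string]bool
        TasksOnNode map[string][]task
    }

    // assign fills one slot per task up front and then writes each placement
    // into the slot matching the task's name, so map iteration order cannot
    // shuffle the result (the "append would cause uncertain order" comment).
    func assign(tasks []task, plan allocationPlan) []string {
        if !plan.Flags["valid"] {
            return nil // infeasible placement: hand back nothing
        }
        ress := make([]string, len(tasks)) // placeholders in task order
        for nodeID, placed := range plan.TasksOnNode {
            for _, t := range placed {
                for i := range tasks {
                    if tasks[i].Name == t.Name {
                        ress[i] = nodeID
                    }
                }
            }
        }
        return ress
    }

    func main() {
        tasks := []task{{Name: "worker0"}, {Name: "ps0"}}
        plan := allocationPlan{
            Flags:       map[string]bool{"valid": true},
            TasksOnNode: map[string][]task{"node-a": {tasks[1]}, "node-b": {tasks[0]}},
        }
        fmt.Println(assign(tasks, plan)) // [node-b node-a], aligned with task order
    }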