Mirror of https://github.com/newnius/YAO-scheduler.git
Commit dbd880c971 (parent 875e4665be), 2020-04-30 14:04:40 +08:00
4 changed files with 98 additions and 20 deletions

View File

@@ -139,6 +139,7 @@ func (jm *JobManager) start() {
if onlyPS {
jm.stop()
log.Info("Only PS is running, stop", jm.job.Name)
jm.killedFlag = false
break
}
if !flag {
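
The JobManager hunk adds a single line: when a job is stopped because only its parameter-server (PS) tasks are still running, `killedFlag` is reset so the stop is not recorded as a kill. Below is a minimal sketch of that pattern, assuming a simplified job-manager type whose only relevant fields are a name and `killedFlag`; everything else is illustrative, not the repository's code.

```go
package main

import "log"

// Hypothetical, simplified stand-in for the JobManager in the hunk above;
// only the fields this example needs are kept.
type jobManager struct {
	name       string
	killedFlag bool
}

func (jm *jobManager) stop() { log.Println("stopping", jm.name) }

// stopIfOnlyPS mirrors the pattern added by the hunk: when only the
// parameter-server tasks are left, the job is stopped and killedFlag is
// cleared so the stop is recorded as a normal completion, not a kill.
func (jm *jobManager) stopIfOnlyPS(onlyPS bool) bool {
	if onlyPS {
		jm.stop()
		log.Println("Only PS is running, stop", jm.name)
		jm.killedFlag = false
		return true
	}
	return false
}

func main() {
	jm := &jobManager{name: "demo-job", killedFlag: true}
	if jm.stopIfOnlyPS(true) {
		log.Println("killedFlag is now", jm.killedFlag)
	}
}
```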

View File

@@ -88,7 +88,7 @@ func (optimizer *Optimizer) feed(job string, utils []int) {
}()
}
func (optimizer *Optimizer) predictTime(job string, utils []int) (int, bool) {
func (optimizer *Optimizer) predictUtilGPU(job string) (int, bool) {
if _, err := optimizer.jobUtilsGPU[job]; err {
return 100, false
}
@@ -98,7 +98,7 @@ func (optimizer *Optimizer) predictTime(job string, utils []int) (int, bool) {
return optimizer.jobUtilsGPU[job], false
}
func (optimizer *Optimizer) predictUtilGPU(job string) (OptimizerJobExecutionTime, bool) {
func (optimizer *Optimizer) predictTime(job string) (OptimizerJobExecutionTime, bool) {
if _, err := optimizer.predicts[job]; err {
return OptimizerJobExecutionTime{}, false
}
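
The two Optimizer hunks swap the method names so that `predictUtilGPU(job)` is the one returning the cached per-job GPU utilization (an int) and `predictTime(job)` returns an `OptimizerJobExecutionTime`; both drop the unused `utils []int` parameter. At heart both methods are map lookups. The sketch below shows that lookup with the conventional comma-ok idiom; the hunk as rendered names the bool `err`, and the sketch's treatment of the bool as a plain cache-hit flag, the default of 100, and the reduced struct are assumptions about intent rather than a copy of the repository's logic.

```go
package main

import "fmt"

// OptimizerJobExecutionTime is reduced to a single field for illustration;
// the real struct in the repository carries more information.
type OptimizerJobExecutionTime struct {
	Total int
}

// Minimal stand-in for the Optimizer's two caches; the real type keeps
// more state and guards these maps against concurrent access.
type optimizer struct {
	jobUtilsGPU map[string]int
	predicts    map[string]OptimizerJobExecutionTime
}

// predictUtilGPU returns the recorded GPU utilization for a job, falling
// back to a conservative 100 (and a false flag) when nothing is cached.
func (o *optimizer) predictUtilGPU(job string) (int, bool) {
	if util, ok := o.jobUtilsGPU[job]; ok {
		return util, true
	}
	return 100, false
}

// predictTime returns the cached execution-time estimate for a job.
func (o *optimizer) predictTime(job string) (OptimizerJobExecutionTime, bool) {
	if t, ok := o.predicts[job]; ok {
		return t, true
	}
	return OptimizerJobExecutionTime{}, false
}

func main() {
	o := &optimizer{
		jobUtilsGPU: map[string]int{"job-a": 37},
		predicts:    map[string]OptimizerJobExecutionTime{"job-a": {Total: 1200}},
	}
	fmt.Println(o.predictUtilGPU("job-a")) // 37 true
	fmt.Println(o.predictUtilGPU("job-b")) // 100 false
	fmt.Println(o.predictTime("job-a"))    // {1200} true
}
```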

View File

@@ -37,6 +37,14 @@ type ResourcePool struct {
utils map[string][]int
}
func (pool *ResourcePool) GPUModelToPower(model string) int {
mapper := map[string]int{"k40": 1, "K80": 2, "P100": 3}
if power, err := mapper[model]; !err {
return power
}
return 0
}
func (pool *ResourcePool) getNodePool(name string) int {
h := fnv.New32a()
h.Write([]byte(name))
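
The ResourcePool hunk adds a `GPUModelToPower` helper that maps a GPU model name to a coarse ordinal rank, with 0 for unknown models, presumably so nodes can be compared by card generation. As rendered, the hunk mixes "k40"/"K80" capitalization and returns `power` only when the lookup fails (`!err`), which would always yield 0; the sketch below is a cleaned-up illustration using the conventional comma-ok check and consistent model names, plus a hypothetical usage example that sorts nodes by rank. It is not the repository's exact code.

```go
package main

import (
	"fmt"
	"sort"
)

// gpuModelToPower is a cleaned-up sketch of the helper in the hunk above:
// it maps a GPU model name to a coarse rank, with 0 for unknown models.
func gpuModelToPower(model string) int {
	mapper := map[string]int{"K40": 1, "K80": 2, "P100": 3}
	if power, ok := mapper[model]; ok {
		return power
	}
	return 0
}

func main() {
	// Hypothetical node list: order the most capable GPUs first so they can
	// be considered before older cards when placing a task.
	nodes := []struct {
		Host  string
		Model string
	}{
		{"n1", "K80"}, {"n2", "P100"}, {"n3", "K40"}, {"n4", "unknown"},
	}
	sort.Slice(nodes, func(i, j int) bool {
		return gpuModelToPower(nodes[i].Model) > gpuModelToPower(nodes[j].Model)
	})
	fmt.Println(nodes) // [{n2 P100} {n1 K80} {n3 K40} {n4 unknown}]
}
```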

View File

@@ -183,40 +183,109 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
poolID := rand.Intn(pool.poolsCount)
res := NodeStatus{}
var locks []sync.Mutex
var candidates []NodeStatus
/* first round, find vacant gpu */
for i := poolID; i < pool.poolsCount; i++ {
pool.poolsMu[i].Lock()
flag := false
for id, node := range pool.pools[i] {
locks = append(locks, pool.poolsMu[i])
for _, node := range pool.pools[i] {
var available []GPUStatus
for _, status := range node.Status {
if status.MemoryTotal-status.MemoryAllocated >= task.MemoryGPU {
if status.MemoryTotal >= task.MemoryGPU && status.MemoryUsed < 10 {
available = append(available, status)
}
}
if len(available) >= task.NumberGPU {
res.ClientID = id
res.ClientHost = node.ClientHost
res.Status = available[0:task.NumberGPU]
res.NumCPU = task.NumberCPU
res.MemTotal = task.Memory
tmp := NodeStatus{}
tmp.ClientID = node.ClientID
tmp.ClientHost = node.ClientHost
tmp.Status = available
tmp.NumCPU = node.NumCPU
tmp.MemTotal = node.MemAvailable
candidates = append(candidates, tmp)
if len(candidates) >= 8 {
break
}
}
}
if len(candidates) >= 8 {
break
}
}
log.Info(candidates)
for i := range res.Status {
for j := range node.Status {
if res.Status[i].UUID == node.Status[j].UUID {
node.Status[j].MemoryAllocated += task.MemoryGPU
res.Status[i].MemoryTotal = task.MemoryGPU
/* second round, find sharable gpu */
if len(candidates) == 0 {
// check sharable
if util, valid := InstanceOfOptimizer().predictUtilGPU(job.Name); valid {
for i := poolID; i < pool.poolsCount; i++ {
pool.poolsMu[i].Lock()
locks = append(locks, pool.poolsMu[i])
for _, node := range pool.pools[i] {
var available []GPUStatus
for _, status := range node.Status {
if status.MemoryTotal >= task.MemoryGPU+status.MemoryAllocated && status.MemoryFree > task.MemoryGPU {
if jobs, err := pool.bindings[status.UUID]; !err {
totalUtil := util
for job := range jobs {
if utilT, err := InstanceOfOptimizer().predictUtilGPU(job); !err {
totalUtil += utilT
}
}
if totalUtil < 100 {
available = append(available, status)
}
}
}
}
if len(available) >= task.NumberGPU {
tmp := NodeStatus{}
tmp.ClientID = node.ClientID
tmp.ClientHost = node.ClientHost
tmp.Status = available
tmp.NumCPU = node.NumCPU
tmp.MemTotal = node.MemAvailable
candidates = append(candidates, tmp)
if len(candidates) >= 8 {
break
}
}
}
flag = true
break
if len(candidates) >= 8 {
break
}
}
}
pool.poolsMu[i].Unlock()
if flag {
break
}
log.Info(candidates)
/*assign*/
if len(candidates) > 0 {
node := candidates[0]
res := NodeStatus{}
res.ClientID = node.ClientID
res.ClientHost = node.ClientHost
res.Status = candidates[0].Status[0:task.NumberGPU]
res.NumCPU = task.NumberCPU
res.MemTotal = task.Memory
for i := range res.Status {
for j := range node.Status {
if res.Status[i].UUID == node.Status[j].UUID {
node.Status[j].MemoryAllocated += task.MemoryGPU
res.Status[i].MemoryTotal = task.MemoryGPU
}
}
}
}
for _, lock := range locks {
lock.Unlock()
}
go func(res NodeStatus) {
if len(res.Status) == 0 {
return
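
The SchedulerFair hunk rewrites `AcquireResource` around a candidate list: a first pass collects up to 8 nodes with vacant GPUs (enough total memory and `MemoryUsed < 10`), a second pass runs only if that found nothing and admits GPUs that can be shared because the new job's predicted utilization plus that of the jobs already bound to the card stays below 100, and only then is the request carved out of the first candidate while all touched pool locks are held. Below is a minimal, self-contained sketch of just that selection order; the types, the `pickCandidates` name, and the simplified memory check are illustrative assumptions, and the locking, binding bookkeeping, and assignment step from the hunk are deliberately left out.

```go
package main

import "fmt"

// Simplified stand-ins for the GPUStatus / NodeStatus types in the hunk;
// only the fields the selection logic needs are kept.
type gpu struct {
	UUID        string
	MemoryTotal int
	MemoryUsed  int
	// Sum of predicted utilization of jobs already bound to this GPU.
	BoundUtil int
}

type node struct {
	Host string
	GPUs []gpu
}

// pickCandidates mirrors the two-round structure of the hunk: round one
// keeps nodes with enough vacant GPUs; round two runs only if round one
// found nothing and keeps nodes whose GPUs can be shared without the
// predicted total utilization reaching 100. At most limit nodes are kept.
func pickCandidates(nodes []node, needGPU, memGPU, predictedUtil, limit int) []node {
	var candidates []node

	// Round 1: vacant GPUs — enough total memory and (almost) no usage.
	for _, n := range nodes {
		count := 0
		for _, g := range n.GPUs {
			if g.MemoryTotal >= memGPU && g.MemoryUsed < 10 {
				count++
			}
		}
		if count >= needGPU {
			candidates = append(candidates, n)
			if len(candidates) >= limit {
				return candidates
			}
		}
	}
	if len(candidates) > 0 {
		return candidates
	}

	// Round 2: sharable GPUs — the new job's predicted utilization plus the
	// utilization of the jobs already on the card must stay below 100.
	// (The memory-headroom check is simplified compared to the hunk.)
	for _, n := range nodes {
		count := 0
		for _, g := range n.GPUs {
			if g.MemoryTotal >= memGPU && predictedUtil+g.BoundUtil < 100 {
				count++
			}
		}
		if count >= needGPU {
			candidates = append(candidates, n)
			if len(candidates) >= limit {
				return candidates
			}
		}
	}
	return candidates
}

func main() {
	nodes := []node{
		{Host: "n1", GPUs: []gpu{{UUID: "g1", MemoryTotal: 16000, MemoryUsed: 9000, BoundUtil: 60}}},
		{Host: "n2", GPUs: []gpu{{UUID: "g2", MemoryTotal: 16000, MemoryUsed: 8000, BoundUtil: 30}}},
	}
	// No GPU is vacant, so sharing is considered: a job with a predicted
	// utilization of 50 only fits on n2, where 50+30 stays below 100.
	fmt.Println(pickCandidates(nodes, 1, 4000, 50, 8))
}
```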