Mirror of https://github.com/newnius/YAO-scheduler.git, synced 2025-12-15 08:16:43 +00:00
update
@@ -139,6 +139,7 @@ func (jm *JobManager) start() {
 		if onlyPS {
 			jm.stop()
 			log.Info("Only PS is running, stop", jm.job.Name)
+			jm.killedFlag = false
 			break
 		}
 		if !flag {
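For orientation, the onlyPS branch above fires when only parameter servers are left running, and the commit now also clears killedFlag before stopping the job. A self-contained sketch of that kind of "only PS left" check, using hypothetical task fields (isPS, running) that do not appear in this diff:

package main

import "fmt"

// Hypothetical task view; the real JobManager state is not shown in this diff.
type taskView struct {
	isPS    bool
	running bool
}

// onlyPSRunning mirrors the kind of condition that could set onlyPS above:
// true when every task that is still running is a parameter server.
func onlyPSRunning(tasks []taskView) bool {
	for _, t := range tasks {
		if t.running && !t.isPS {
			return false
		}
	}
	return true
}

func main() {
	tasks := []taskView{{isPS: true, running: true}, {isPS: false, running: false}}
	fmt.Println(onlyPSRunning(tasks)) // true: only the PS is still running
}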
@@ -88,7 +88,7 @@ func (optimizer *Optimizer) feed(job string, utils []int) {
 	}()
 }
 
-func (optimizer *Optimizer) predictTime(job string, utils []int) (int, bool) {
+func (optimizer *Optimizer) predictUtilGPU(job string) (int, bool) {
 	if _, err := optimizer.jobUtilsGPU[job]; err {
 		return 100, false
 	}
@@ -98,7 +98,7 @@ func (optimizer *Optimizer) predictTime(job string, utils []int) (int, bool) {
 	return optimizer.jobUtilsGPU[job], false
 }
 
-func (optimizer *Optimizer) predictUtilGPU(job string) (OptimizerJobExecutionTime, bool) {
+func (optimizer *Optimizer) predictTime(job string) (OptimizerJobExecutionTime, bool) {
 	if _, err := optimizer.predicts[job]; err {
 		return OptimizerJobExecutionTime{}, false
 	}
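With this rename the two estimators now match their return types: predictUtilGPU yields an (int, bool) utilization estimate and predictTime yields an (OptimizerJobExecutionTime, bool) duration estimate. A short caller sketch, meant to live in the same package so it can reuse Job, log and InstanceOfOptimizer from this repository; the function and variable names here are illustrative:

// Sketch only: shows the call shapes implied by the renamed methods.
func logPredictions(job Job) {
	if util, ok := InstanceOfOptimizer().predictUtilGPU(job.Name); ok {
		log.Info("predicted GPU utilization for", job.Name, util)
	}
	if est, ok := InstanceOfOptimizer().predictTime(job.Name); ok {
		log.Info("predicted execution time for", job.Name, est)
	}
}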
@@ -37,6 +37,14 @@ type ResourcePool struct {
 	utils map[string][]int
 }
 
+func (pool *ResourcePool) GPUModelToPower(model string) int {
+	mapper := map[string]int{"k40": 1, "K80": 2, "P100": 3}
+	if power, err := mapper[model]; !err {
+		return power
+	}
+	return 0
+}
+
 func (pool *ResourcePool) getNodePool(name string) int {
 	h := fnv.New32a()
 	h.Write([]byte(name))
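A standalone sketch of the GPU model-to-power ranking introduced above, written with the conventional comma-ok map lookup; the k40/K80/P100 ranks come from the diff, everything else is illustrative:

package main

import "fmt"

// gpuModelToPower mirrors the mapping added above: a higher value means a more
// powerful GPU model. Unknown models fall back to 0.
func gpuModelToPower(model string) int {
	mapper := map[string]int{"k40": 1, "K80": 2, "P100": 3}
	if power, ok := mapper[model]; ok {
		return power
	}
	return 0
}

func main() {
	// e.g. prefer the node whose GPU model ranks higher
	fmt.Println(gpuModelToPower("P100") > gpuModelToPower("K80")) // true
}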
@@ -183,40 +183,109 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
 	poolID := rand.Intn(pool.poolsCount)
 	res := NodeStatus{}
 
+	var locks []sync.Mutex
+
+	var candidates []NodeStatus
+	/* first round, find vacant gpu */
 	for i := poolID; i < pool.poolsCount; i++ {
 		pool.poolsMu[i].Lock()
-		flag := false
-		for id, node := range pool.pools[i] {
+		locks = append(locks, pool.poolsMu[i])
+		for _, node := range pool.pools[i] {
 			var available []GPUStatus
 			for _, status := range node.Status {
-				if status.MemoryTotal-status.MemoryAllocated >= task.MemoryGPU {
+				if status.MemoryTotal >= task.MemoryGPU && status.MemoryUsed < 10 {
 					available = append(available, status)
 				}
 			}
 			if len(available) >= task.NumberGPU {
-				res.ClientID = id
-				res.ClientHost = node.ClientHost
-				res.Status = available[0:task.NumberGPU]
-				res.NumCPU = task.NumberCPU
-				res.MemTotal = task.Memory
-				for i := range res.Status {
-					for j := range node.Status {
-						if res.Status[i].UUID == node.Status[j].UUID {
-							node.Status[j].MemoryAllocated += task.MemoryGPU
-							res.Status[i].MemoryTotal = task.MemoryGPU
+				tmp := NodeStatus{}
+				tmp.ClientID = node.ClientID
+				tmp.ClientHost = node.ClientHost
+				tmp.Status = available
+				tmp.NumCPU = node.NumCPU
+				tmp.MemTotal = node.MemAvailable
+				candidates = append(candidates, tmp)
+				if len(candidates) >= 8 {
+					break
+				}
+			}
+		}
+		if len(candidates) >= 8 {
+			break
+		}
+	}
+	log.Info(candidates)
+
+	/* second round, find sharable gpu */
+	if len(candidates) == 0 {
+		// check sharable
+		if util, valid := InstanceOfOptimizer().predictUtilGPU(job.Name); valid {
+
+			for i := poolID; i < pool.poolsCount; i++ {
+				pool.poolsMu[i].Lock()
+				locks = append(locks, pool.poolsMu[i])
+				for _, node := range pool.pools[i] {
+					var available []GPUStatus
+					for _, status := range node.Status {
+						if status.MemoryTotal >= task.MemoryGPU+status.MemoryAllocated && status.MemoryFree > task.MemoryGPU {
+
+							if jobs, err := pool.bindings[status.UUID]; !err {
+								totalUtil := util
+								for job := range jobs {
+									if utilT, err := InstanceOfOptimizer().predictUtilGPU(job); !err {
+										totalUtil += utilT
+									}
+								}
+								if totalUtil < 100 {
+									available = append(available, status)
+								}
+							}
+						}
+					}
+					if len(available) >= task.NumberGPU {
+						tmp := NodeStatus{}
+						tmp.ClientID = node.ClientID
+						tmp.ClientHost = node.ClientHost
+						tmp.Status = available
+						tmp.NumCPU = node.NumCPU
+						tmp.MemTotal = node.MemAvailable
+						candidates = append(candidates, tmp)
+						if len(candidates) >= 8 {
+							break
 						}
 					}
 				}
-				flag = true
-				break
+				if len(candidates) >= 8 {
+					break
+				}
 			}
 		}
-		pool.poolsMu[i].Unlock()
-		if flag {
-			break
+	}
+	log.Info(candidates)
+
+	/*assign*/
+	if len(candidates) > 0 {
+		node := candidates[0]
+		res := NodeStatus{}
+		res.ClientID = node.ClientID
+		res.ClientHost = node.ClientHost
+		res.Status = candidates[0].Status[0:task.NumberGPU]
+		res.NumCPU = task.NumberCPU
+		res.MemTotal = task.Memory
+
+		for i := range res.Status {
+			for j := range node.Status {
+				if res.Status[i].UUID == node.Status[j].UUID {
+					node.Status[j].MemoryAllocated += task.MemoryGPU
+					res.Status[i].MemoryTotal = task.MemoryGPU
+				}
+			}
 		}
 	}
 
+	for _, lock := range locks {
+		lock.Unlock()
+	}
 	go func(res NodeStatus) {
 		if len(res.Status) == 0 {
 			return
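The heart of the second round above is an admission test: a GPU counts as sharable only when the incoming job's predicted utilization plus the predicted utilizations of the jobs already bound to that GPU stays below 100. A standalone sketch of that rule with the bindings map and the optimizer reduced to plain parameters; the names and the toy predictor are illustrative, not the scheduler's own:

package main

import "fmt"

// canShareGPU reports whether a GPU already running boundJobs can accept a new
// job, following the rule used above: total predicted utilization < 100.
// predictedUtil stands in for InstanceOfOptimizer().predictUtilGPU.
func canShareGPU(newJobUtil int, boundJobs []string, predictedUtil func(string) (int, bool)) bool {
	totalUtil := newJobUtil
	for _, job := range boundJobs {
		if util, ok := predictedUtil(job); ok {
			totalUtil += util
		}
	}
	return totalUtil < 100
}

func main() {
	// toy predictor: every known job is assumed to use 40% of a GPU
	predict := func(job string) (int, bool) { return 40, true }
	fmt.Println(canShareGPU(30, []string{"job-a"}, predict))          // 30+40 < 100  -> true
	fmt.Println(canShareGPU(30, []string{"job-a", "job-b"}, predict)) // 30+80 >= 100 -> false
}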