Mirror of https://github.com/newnius/YAO-scheduler.git, synced 2025-12-15 08:16:43 +00:00
Commit: update
@@ -32,7 +32,7 @@ type ResourcePool struct {
     counter      int
     counterTotal int

-    bindings   map[string]map[string]bool
+    bindings   map[string]map[string]int
     bindingsMu sync.Mutex
     utils      map[string][]int

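The first four hunks thread a single type change through ResourcePool: the inner value of `bindings` changes from a presence flag (`bool`) to the Unix time at which the job attached (`int`). Initialization, `attach`, and `getBindings` below follow suit; a sketch of what the timestamp enables appears after the `attach` hunk.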
@@ -59,7 +59,7 @@ func (pool *ResourcePool) start() {
     pool.networksFree = map[string]bool{}
     pool.versions = map[string]float64{}

-    pool.bindings = map[string]map[string]bool{}
+    pool.bindings = map[string]map[string]int{}
     pool.utils = map[string][]int{}

     pool.TotalGPU = 0
@@ -270,9 +270,9 @@ func (pool *ResourcePool) attach(GPU string, job string) {
     pool.bindingsMu.Lock()
     defer pool.bindingsMu.Unlock()
     if _, ok := pool.bindings[GPU]; !ok {
-        pool.bindings[GPU] = map[string]bool{}
+        pool.bindings[GPU] = map[string]int{}
     }
-    pool.bindings[GPU][job] = true
+    pool.bindings[GPU][job] = int(time.Now().Unix())

     if _, ok := pool.utils[GPU]; !ok {
         pool.utils[GPU] = []int{}
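The payoff of the type change is here: `pool.bindings[GPU][job]` now records when the job attached rather than merely that it attached. A minimal sketch, not part of the patch (the identifiers are made up), of how a binding's age falls out of the stored timestamp:

// Sketch only: the age of a binding is a subtraction once the attach
// time is stored instead of a bare true.
package main

import (
    "fmt"
    "time"
)

func main() {
    bindings := map[string]map[string]int{} // GPU UUID -> job -> attach time (Unix seconds)

    gpu, job := "GPU-0001", "job-a" // hypothetical identifiers
    if _, ok := bindings[gpu]; !ok { // mirrors the patched ResourcePool.attach
        bindings[gpu] = map[string]int{}
    }
    bindings[gpu][job] = int(time.Now().Unix())

    // Later, any reader can tell how long the job has been on the GPU:
    elapsed := int(time.Now().Unix()) - bindings[gpu][job]
    fmt.Printf("%s has been bound to %s for %ds\n", job, gpu, elapsed)
}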
@@ -294,6 +294,6 @@ func (pool *ResourcePool) detach(GPU string, jobName string) {
     }
 }

-func (pool *ResourcePool) getBindings() map[string]map[string]bool {
+func (pool *ResourcePool) getBindings() map[string]map[string]int {
     return pool.bindings
 }
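Note that `getBindings` still returns the live map without taking `bindingsMu`, so a caller iterating it (as the scheduler does below) can race with concurrent `attach`/`detach` calls. If a consistent view were wanted, a copy could be taken under the lock; a sketch under that assumption, with the hypothetical name `getBindingsSnapshot` (not in the repo):

func (pool *ResourcePool) getBindingsSnapshot() map[string]map[string]int {
    pool.bindingsMu.Lock()
    defer pool.bindingsMu.Unlock()
    snapshot := map[string]map[string]int{}
    for gpu, jobs := range pool.bindings {
        jobsCopy := map[string]int{}
        for job, attachedAt := range jobs {
            jobsCopy[job] = attachedAt
        }
        snapshot[gpu] = jobsCopy
    }
    return snapshot
}

The remaining hunks are in the fair scheduler, which consumes these timestamps.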
@@ -205,7 +205,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
     var candidates []NodeStatus

     /* first, choose sharable GPUs */
-    if scheduler.enableShare && (pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) > scheduler.enableShareRatio) {
+    if scheduler.enableShare && (pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) >= scheduler.enableShareRatio) {
         // check sharable
         allocationType = 1
         if util, valid := InstanceOfOptimizer().predictUtilGPU(job.Name); valid {
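The relaxation from `>` to `>=` matters exactly at the threshold: with, say, `enableShareRatio = 0.75` on an 8-GPU pool, 6 GPUs in use gives 6/8 = 0.75, which previously left sharing off and now switches it on.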
@@ -279,8 +279,9 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
     }

     /* third round, find gpu to be released */
-    if len(candidates) == 0 && len(job.Tasks) == 1 && scheduler.enablePreSchedule {
-        if pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) > scheduler.enablePreScheduleRatio {
+    if len(candidates) == 0 && len(job.Tasks) == 1 && task.NumberGPU == 1 && scheduler.enablePreSchedule {
+        estimate, valid := InstanceOfOptimizer().predictTime(job.Name)
+        if pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) >= scheduler.enablePreScheduleRatio && valid {
             allocationType = 3
             for i := 0; i < pool.poolsCount; i++ {
                 pool.poolsMu[(i+poolID)%pool.poolsCount].Lock()
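Two gates tighten here. Pre-scheduling is now limited to single-task jobs that want exactly one GPU (`task.NumberGPU == 1`), and the optimizer's run-time prediction for the incoming job is fetched up front: without a `valid` estimate there is nothing to compare against in the hunk below, so the whole third round is skipped.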
@@ -288,8 +289,21 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
                 for _, node := range pool.pools[(i+poolID)%pool.poolsCount] {
                     var available []GPUStatus
                     for _, status := range node.Status {
-                        if status.MemoryAllocated == 0 && status.MemoryUsed < 10 {
-                            available = append(available, status)
+                        bindings := pool.getBindings()
+                        if tasks, ok := bindings[status.UUID]; ok {
+                            if len(tasks) > 1 {
+                                continue
+                            }
+                            for task_t, s := range tasks {
+                                est, valid2 := InstanceOfOptimizer().predictTime(task_t)
+                                if valid2 {
+                                    t := s
+                                    now := (int)(time.Now().Unix())
+                                    if now-t > est.Total-est.Post-estimate.Pre && status.MemoryFree > task.MemoryGPU {
+                                        available = append(available, status)
+                                    }
+                                }
+                            }
                         }
                     }
                     if len(available) >= task.NumberGPU {
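The replaced test (`MemoryAllocated == 0 && MemoryUsed < 10`, i.e. only idle GPUs) gives way to a timing argument. For a GPU bound to exactly one job, let `t` be the attach time recorded above and `est` the predicted phase durations of that job; the GPU is offered to the incoming job when

    now - t > est.Total - est.Post - estimate.Pre

equivalently, when the incumbent's remaining time `est.Total - (now - t)` is below `est.Post + estimate.Pre`: by the time the new job finishes its pre-processing phase, the running job should be in its GPU-light post-processing phase or done. The spare memory must also fit (`status.MemoryFree > task.MemoryGPU`). A self-contained restatement of the predicate; the `Estimate` shape is assumed from the field accesses in the patch, the helper itself and all numbers are hypothetical, and memory is assumed to be in MiB:

package main

import (
    "fmt"
    "time"
)

// Estimate mirrors the fields the patch reads (Pre, Post, Total),
// assumed to be durations in seconds.
type Estimate struct {
    Pre, Post, Total int
}

// releasable reports whether a GPU held by a single running job can be
// handed to an incoming job: the incumbent should reach its post phase
// before the newcomer needs the GPU, and the free memory must suffice.
func releasable(attachedAt int, running, incoming Estimate, memFree, memNeed int) bool {
    now := int(time.Now().Unix())
    return now-attachedAt > running.Total-running.Post-incoming.Pre &&
        memFree > memNeed
}

func main() {
    // Hypothetical numbers: the incumbent attached 3000s ago and is
    // predicted to run 3600s total with a 300s post phase; the newcomer
    // needs 600s of pre-processing and 4096 MiB of GPU memory.
    attachedAt := int(time.Now().Unix()) - 3000
    running := Estimate{Pre: 120, Post: 300, Total: 3600}
    incoming := Estimate{Pre: 600}
    fmt.Println(releasable(attachedAt, running, incoming, 8192, 4096))
    // 3000 > 3600-300-600 = 2700 and 8192 > 4096, so this prints true.
}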