mirror of https://github.com/newnius/YAO-scheduler.git, synced 2025-12-12 23:36:44 +00:00
update
@@ -32,7 +32,7 @@ type ResourcePool struct {
     counter      int
     counterTotal int

-    bindings   map[string]map[string]bool
+    bindings   map[string]map[string]int
     bindingsMu sync.Mutex
     utils      map[string][]int

@@ -59,7 +59,7 @@ func (pool *ResourcePool) start() {
     pool.networksFree = map[string]bool{}
     pool.versions = map[string]float64{}

-    pool.bindings = map[string]map[string]bool{}
+    pool.bindings = map[string]map[string]int{}
     pool.utils = map[string][]int{}

     pool.TotalGPU = 0
@@ -270,9 +270,9 @@ func (pool *ResourcePool) attach(GPU string, job string) {
     pool.bindingsMu.Lock()
     defer pool.bindingsMu.Unlock()
     if _, ok := pool.bindings[GPU]; !ok {
-        pool.bindings[GPU] = map[string]bool{}
+        pool.bindings[GPU] = map[string]int{}
     }
-    pool.bindings[GPU][job] = true
+    pool.bindings[GPU][job] = int(time.Now().Unix())

     if _, ok := pool.utils[GPU]; !ok {
         pool.utils[GPU] = []int{}
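
Note: a minimal sketch, outside the diff itself, of how the timestamped bindings could be consumed. After this change the map no longer answers only "is this job bound to this GPU" but also "since when". ResourcePool below is cut down to just the two fields this needs, and heldFor is a hypothetical helper, not part of the repository.

package main

import (
    "fmt"
    "sync"
    "time"
)

// Reduced stand-in for the real ResourcePool: bindings maps
// GPU UUID -> job name -> attach time (Unix seconds), as in the diff.
type ResourcePool struct {
    bindings   map[string]map[string]int
    bindingsMu sync.Mutex
}

// attach mirrors the changed code: record *when* the job was bound
// instead of a plain true/false flag.
func (pool *ResourcePool) attach(GPU string, job string) {
    pool.bindingsMu.Lock()
    defer pool.bindingsMu.Unlock()
    if _, ok := pool.bindings[GPU]; !ok {
        pool.bindings[GPU] = map[string]int{}
    }
    pool.bindings[GPU][job] = int(time.Now().Unix())
}

// heldFor is a hypothetical helper: seconds since the job was attached.
func (pool *ResourcePool) heldFor(GPU string, job string) (int, bool) {
    pool.bindingsMu.Lock()
    defer pool.bindingsMu.Unlock()
    since, ok := pool.bindings[GPU][job]
    if !ok {
        return 0, false
    }
    return int(time.Now().Unix()) - since, true
}

func main() {
    pool := &ResourcePool{bindings: map[string]map[string]int{}}
    pool.attach("GPU-0", "job-a")
    time.Sleep(time.Second)
    if held, ok := pool.heldFor("GPU-0", "job-a"); ok {
        fmt.Printf("job-a has held GPU-0 for ~%d s\n", held)
    }
}
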
@@ -294,6 +294,6 @@ func (pool *ResourcePool) detach(GPU string, jobName string) {
        }
    }

-func (pool *ResourcePool) getBindings() map[string]map[string]bool {
+func (pool *ResourcePool) getBindings() map[string]map[string]int {
    return pool.bindings
}
@@ -205,7 +205,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
    var candidates []NodeStatus

    /* first, choose sharable GPUs */
-   if scheduler.enableShare && (pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) > scheduler.enableShareRatio) {
+   if scheduler.enableShare && (pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) >= scheduler.enableShareRatio) {
        // check sharable
        allocationType = 1
        if util, valid := InstanceOfOptimizer().predictUtilGPU(job.Name); valid {
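
Note: the only change in this hunk is '>' becoming '>=', so GPU sharing is now attempted exactly when cluster usage reaches the configured ratio rather than only after it is exceeded. A self-contained sketch of that gate follows; the function name and the flattened arguments are mine, not the scheduler's API.

package main

import "fmt"

// shareGateOpen restates the changed condition: sharing is considered
// once GPU usage reaches (>=, previously >) enableShareRatio.
func shareGateOpen(enableShare bool, usingGPU, totalGPU int, enableShareRatio float64) bool {
    return enableShare && totalGPU != 0 &&
        float64(usingGPU)/float64(totalGPU) >= enableShareRatio
}

func main() {
    // With the old strict '>' the boundary case (0.75 == 0.75) was skipped;
    // with '>=' sharing is attempted exactly at the threshold.
    fmt.Println(shareGateOpen(true, 3, 4, 0.75)) // true
    fmt.Println(shareGateOpen(true, 2, 4, 0.75)) // false
}
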
@@ -279,8 +279,9 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
    }

    /* third round, find gpu to be released */
-   if len(candidates) == 0 && len(job.Tasks) == 1 && scheduler.enablePreSchedule {
-       if pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) > scheduler.enablePreScheduleRatio {
+   if len(candidates) == 0 && len(job.Tasks) == 1 && task.NumberGPU == 1 && scheduler.enablePreSchedule {
+       estimate, valid := InstanceOfOptimizer().predictTime(job.Name)
+       if pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) >= scheduler.enablePreScheduleRatio && valid {
            allocationType = 3
            for i := 0; i < pool.poolsCount; i++ {
                pool.poolsMu[(i+poolID)%pool.poolsCount].Lock()
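
Note: this hunk tightens the pre-schedule round. It now also requires the incoming task to request a single GPU and a valid runtime prediction (predictTime) for the job, and the usage threshold check switches from '>' to '>='. A hedged restatement of the combined gate, with plain arguments instead of the scheduler's real fields:

package main

import "fmt"

// preScheduleGateOpen is a hypothetical extraction of the tightened
// third-round condition: single-task, single-GPU jobs only, usage at or
// above enablePreScheduleRatio, and a valid runtime prediction available.
func preScheduleGateOpen(noCandidates bool, numTasks, numberGPU int,
    enablePreSchedule bool, usingGPU, totalGPU int,
    ratio float64, predictionValid bool) bool {
    return noCandidates && numTasks == 1 && numberGPU == 1 && enablePreSchedule &&
        totalGPU != 0 && float64(usingGPU)/float64(totalGPU) >= ratio &&
        predictionValid
}

func main() {
    fmt.Println(preScheduleGateOpen(true, 1, 1, true, 7, 8, 0.8, true)) // true
    fmt.Println(preScheduleGateOpen(true, 1, 2, true, 7, 8, 0.8, true)) // false: multi-GPU task
}
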
@@ -288,8 +289,21 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
            for _, node := range pool.pools[(i+poolID)%pool.poolsCount] {
                var available []GPUStatus
                for _, status := range node.Status {
                    if status.MemoryAllocated == 0 && status.MemoryUsed < 10 {
-                       available = append(available, status)
+                       bindings := pool.getBindings()
+                       if tasks, ok := bindings[status.UUID]; ok {
+                           if len(tasks) > 1 {
+                               continue
+                           }
+                           for task_t, s := range tasks {
+                               est, valid2 := InstanceOfOptimizer().predictTime(task_t)
+                               if valid2 {
+                                   t := s
+                                   now := (int)(time.Now().Unix())
+                                   if now-t > est.Total-est.Post-estimate.Pre && status.MemoryFree > task.MemoryGPU {
+                                       available = append(available, status)
+                                   }
+                               }
+                           }
+                       }
                    }
                }
                if len(available) >= task.NumberGPU {
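
Note: in the third round the new code no longer treats every idle-looking GPU as immediately usable. A GPU that still has exactly one job bound counts only if that job is predicted to vacate it in time (its elapsed time exceeds its predicted total minus its post-processing phase, minus the incoming job's pre-processing phase) and enough GPU memory is free. A standalone sketch of that predicate under the diff's field names; the Estimate type and argument layout are assumptions.

package main

import (
    "fmt"
    "time"
)

// Estimate is a stand-in for the optimizer's predictTime result
// (field names follow the diff: Total, Pre, Post, in seconds).
type Estimate struct {
    Total, Pre, Post int
}

// releasableSoon restates the new check: the job bound at 'attachedAt'
// (Unix seconds) should be past Total - Post by the time the incoming
// job finishes its Pre phase, and free GPU memory must suffice.
func releasableSoon(attachedAt int, running Estimate, incoming Estimate,
    memoryFreeMB int, taskMemoryMB int) bool {
    now := int(time.Now().Unix())
    return now-attachedAt > running.Total-running.Post-incoming.Pre &&
        memoryFreeMB > taskMemoryMB
}

func main() {
    running := Estimate{Total: 600, Pre: 60, Post: 30}
    incoming := Estimate{Total: 300, Pre: 45, Post: 20}
    attachedAt := int(time.Now().Unix()) - 550 // job has run ~550 s of ~600 s
    fmt.Println(releasableSoon(attachedAt, running, incoming, 8000, 4000)) // true
}
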