mirror of https://github.com/newnius/YAO-scheduler.git
synced 2025-06-07 14:21:55 +00:00

update fair

This commit is contained in:
parent f43f4e24ad
commit 10a46937c9
@@ -52,6 +52,8 @@ type ResourcePool struct {
 	TotalGPU    int
 	TotalGPUMu  sync.Mutex
+	TotalCPU    int
+	TotalMemory int
 	UsingGPU    int
 	UsingGPUMu  sync.Mutex

@@ -73,6 +75,9 @@ func (pool *ResourcePool) init(conf Configuration) {
 	pool.TotalGPU = 0
 	pool.UsingGPU = 0
+
+	pool.TotalCPU = 0
+	pool.TotalMemory = 0

 	pool.enableShare = true
 	pool.enableShareRatio = 0.75
 	pool.enablePreSchedule = true

@@ -129,6 +134,8 @@ func (pool *ResourcePool) checkDeadNodes() {
 			pool.TotalGPUMu.Lock()
 			if _, ok := seg.Nodes[k]; ok {
 				pool.TotalGPU -= len(seg.Nodes[k].Status)
+				pool.TotalCPU -= seg.Nodes[k].NumCPU
+				pool.TotalMemory -= seg.Nodes[k].MemTotal
 			}
 			pool.TotalGPUMu.Unlock()
 			delete(seg.Nodes, k)

@@ -240,6 +247,8 @@ func (pool *ResourcePool) saveStatusHistory() {

 		pool.TotalGPUMu.Lock()
 		pool.TotalGPU = TotalGPU
+		pool.TotalCPU = TotalCPU
+		pool.TotalMemory = TotalMemGPU
 		pool.TotalGPUMu.Unlock()
 		time.Sleep(time.Second * 60)
 	}

@@ -307,6 +316,8 @@ func (pool *ResourcePool) update(node NodeStatus) {
 		/* TODO: double check node do belong to this seg */
 		pool.TotalGPUMu.Lock()
 		pool.TotalGPU += len(node.Status)
+		pool.TotalCPU += node.NumCPU
+		pool.TotalMemory += node.MemTotal
 		pool.TotalGPUMu.Unlock()
 		log.Info("node ", node.ClientID, " is online")
 	}
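Note: these ResourcePool hunks extend the pool so that aggregate CPU and memory capacity are tracked beside TotalGPU and adjusted when a node joins, reports status, or is declared dead, always under the same TotalGPUMu mutex. Below is a minimal, self-contained sketch of that pattern, not code from the repository: NodeStatus is trimmed to the three fields the diff touches, and nodeOnline/nodeDead are illustrative stand-ins for update() and checkDeadNodes().

// Sketch only: pool-wide GPU/CPU/memory totals maintained under one mutex,
// mirroring the bookkeeping added in this commit.
package main

import (
	"fmt"
	"sync"
)

type NodeStatus struct {
	Status   []string // stand-in for the per-GPU status list; len() gives the GPU count
	NumCPU   int
	MemTotal int
}

type ResourcePool struct {
	TotalGPU    int
	TotalCPU    int
	TotalMemory int
	TotalGPUMu  sync.Mutex
}

// nodeOnline mirrors ResourcePool.update: a node coming online adds its capacity.
func (p *ResourcePool) nodeOnline(n NodeStatus) {
	p.TotalGPUMu.Lock()
	defer p.TotalGPUMu.Unlock()
	p.TotalGPU += len(n.Status)
	p.TotalCPU += n.NumCPU
	p.TotalMemory += n.MemTotal
}

// nodeDead mirrors checkDeadNodes: a dead node's capacity is subtracted.
func (p *ResourcePool) nodeDead(n NodeStatus) {
	p.TotalGPUMu.Lock()
	defer p.TotalGPUMu.Unlock()
	p.TotalGPU -= len(n.Status)
	p.TotalCPU -= n.NumCPU
	p.TotalMemory -= n.MemTotal
}

func main() {
	pool := &ResourcePool{}
	n := NodeStatus{Status: []string{"gpu0", "gpu1"}, NumCPU: 32, MemTotal: 128 * 1024}
	pool.nodeOnline(n)
	fmt.Println(pool.TotalGPU, pool.TotalCPU, pool.TotalMemory) // 2 32 131072
	pool.nodeDead(n)
	fmt.Println(pool.TotalGPU, pool.TotalCPU, pool.TotalMemory) // 0 0 0
}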
@@ -107,9 +107,11 @@ func (scheduler *SchedulerFair) Start() {
 			if bestQueue != "" {
 				numberGPUtmp := 0
 				numberCPUtmp := 0
+				Memorytmp := 0
 				for _, task := range scheduler.queues[bestQueue][0].Tasks {
 					numberGPUtmp += task.NumberGPU
 					numberCPUtmp += task.NumberCPU
+					Memorytmp += task.Memory
 				}

 				log.Info("schedulingJobs are ", scheduler.schedulingJobs)

@@ -122,6 +124,7 @@ func (scheduler *SchedulerFair) Start() {
 				if quota, ok := scheduler.queuesQuota[bestQueue]; ok {
 					quota.NumberGPU -= numberGPUtmp * 100
 					quota.CPU -= numberCPUtmp * 100
+					quota.Memory -= Memorytmp
 				}
 				log.Info("After, ", scheduler.queuesQuota[bestQueue])

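Note: the *100 on the GPU and CPU decrements here, together with the /100 in UpdateQuota below, suggests that queue quotas are kept in hundredths of a device or core so that fractional shares survive integer arithmetic, while memory is charged unscaled. The snippet below only illustrates that inferred unit convention and is not code from the repository.

// Sketch of the quota bookkeeping this hunk implies (an inference from the
// *100 / /100 pairs in the diff).
package main

import "fmt"

type ResourceCount struct {
	NumberGPU int // hundredths of a GPU
	CPU       int // hundredths of a CPU core
	Memory    int // memory units, charged without scaling in the diff
}

func main() {
	// A queue starts with quota for 3.5 GPUs and 8 cores.
	quota := ResourceCount{NumberGPU: 350, CPU: 800, Memory: 4096}

	// Charging a job that needs 2 GPUs, 4 cores and 1024 memory units,
	// mirroring the decrements in Start():
	quota.NumberGPU -= 2 * 100
	quota.CPU -= 4 * 100
	quota.Memory -= 1024

	// Converting GPU quota back to whole devices, as UpdateQuota does with /100:
	fmt.Println(quota.NumberGPU/100, "whole GPUs of quota left") // 1
}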
@@ -300,25 +303,35 @@ func (scheduler *SchedulerFair) UpdateQuota() {

 	/* phase 1: DRF */
 	usingGPU := 0
+	usingCPU := 0
+	usingMemory := 0
 	allocatedGPU := 0
+	allocatedCPU := 0
+	allocatedMemory := 0
 	scheduler.resourceAllocationsMu.Lock()
 	for _, quota := range scheduler.resourceAllocations {
 		usingGPU += quota.NumberGPU
+		usingCPU += quota.CPU
+		usingMemory += quota.Memory
 	}
 	scheduler.resourceAllocationsMu.Unlock()

 	for _, quota := range scheduler.queuesQuota {
 		allocatedGPU += quota.NumberGPU
+		allocatedCPU += quota.CPU
+		allocatedMemory += quota.Memory
 	}

 	pool := InstanceOfResourcePool()

-	available := pool.TotalGPU - usingGPU - allocatedGPU/100
+	availableGPU := pool.TotalGPU - usingGPU - allocatedGPU/100
+	availableCPU := pool.TotalCPU - usingCPU - allocatedCPU/100
+	availableMemory := pool.TotalMemory - usingMemory - allocatedMemory
 	/* <0 means some nodes exited */
-	if available <= 0 {
+	if availableGPU <= 0 {
 		return
 	}
-	log.Info("Can allocate ", available)
+	log.Info("Can allocate ", availableGPU)
 	log.Info("Before ")
 	for queue, quota := range scheduler.queuesQuota {
 		log.Info("Queue<->", queue)
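Note: in this hunk the free capacity is the pool total, minus what running jobs hold, minus quota already promised to queues, with the promised GPU/CPU quota converted back from hundredths via /100; a negative result means nodes have left the cluster and UpdateQuota bails out. A small worked example of the GPU case, with invented numbers:

// Worked example of the availability formula in this hunk; all numbers are
// made up for illustration.
package main

import "fmt"

func main() {
	totalGPU := 16      // whole GPUs reported by the resource pool
	usingGPU := 6       // whole GPUs held by running jobs
	allocatedGPU := 400 // quota already promised to queues, in hundredths (= 4 GPUs)

	availableGPU := totalGPU - usingGPU - allocatedGPU/100
	fmt.Println(availableGPU) // 6: GPUs that can still be handed out as quota

	// If nodes exit, the same expression can go negative; UpdateQuota
	// returns early in that case instead of distributing more quota.
	if availableGPU <= 0 {
		return
	}
}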
@@ -326,20 +339,52 @@ func (scheduler *SchedulerFair) UpdateQuota() {
 		log.Info("CPU:", quota.CPU)
 		log.Info("Memory:", quota.Memory)
 	}
-	available *= 100
-	per := available / len(scheduler.queues)
-	for queue := range scheduler.queues {
+	availableGPU *= 100
+	availableCPU *= 100
+
+	var candidates []string
+	requests := map[string]ResourceCount{}
+	weights := 0
+
+	for queue, jobs := range scheduler.queues {
+		if len(jobs) > 0 {
+			candidates = append(candidates, queue)
+		}
+		weights += InstanceOfGroupManager().groups[queue].Weight
+		request := ResourceCount{}
+		for _, job := range jobs {
+			GPU := 0
+			CPU := 0
+			Memory := 0
+			for _, task := range job.Tasks {
+				GPU += task.NumberGPU
+				CPU += task.NumberCPU
+				Memory += task.Memory
+			}
+			request.NumberGPU += GPU
+			request.CPU += CPU
+			request.Memory += Memory
+		}
+		requests[queue] = request
+	}
+
+	per := availableGPU / weights
+	for _, queue := range candidates {
 		if _, ok := scheduler.queuesQuota[queue]; !ok {
 			scheduler.queuesQuota[queue] = &ResourceCount{}
 		}
+		weight := InstanceOfGroupManager().groups[queue].Weight
 		quota := scheduler.queuesQuota[queue]
-		quota.NumberGPU += per
-		available -= per
+		quota.NumberGPU += per * weight
+		availableGPU -= per * weight
+
+		quota.CPU += (requests[queue].CPU / requests[queue].NumberGPU) * per * weight
+		quota.Memory += (requests[queue].Memory / requests[queue].NumberGPU) * per * weight
 	}
-	if available > 0 {
+	if availableGPU > 0 {
 		for queue := range scheduler.queues {
 			quota := scheduler.queuesQuota[queue]
-			quota.NumberGPU += available
+			quota.NumberGPU += availableGPU
 			break
 		}
 	}
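Note: the reworked allocation phase distributes the spare GPU quota over the queues that actually have waiting jobs, in proportion to each group's weight, and then grants CPU quota in proportion to the queue's own CPU-per-GPU request ratio (memory follows the same pattern in the diff). Any remainder of the integer division is handed to one arbitrary queue so nothing is lost. A self-contained sketch of that arithmetic, not code from the repository: ResourceCount is reduced to the fields used here and a plain map stands in for InstanceOfGroupManager().groups.

// Minimal sketch of the weighted quota split introduced in this hunk.
package main

import "fmt"

type ResourceCount struct {
	NumberGPU int // hundredths of a GPU
	CPU       int // hundredths of a CPU core
}

func main() {
	availableGPU := 600 // 6 spare GPUs, already scaled by *100
	weights := map[string]int{"teamA": 2, "teamB": 1}
	// Aggregate demand of the jobs waiting in each queue.
	requests := map[string]ResourceCount{
		"teamA": {NumberGPU: 4, CPU: 16},
		"teamB": {NumberGPU: 2, CPU: 4},
	}
	quotas := map[string]*ResourceCount{}

	totalWeight := 0
	for _, w := range weights {
		totalWeight += w
	}

	per := availableGPU / totalWeight // GPU quota per unit of weight
	for queue, w := range weights {
		if _, ok := quotas[queue]; !ok {
			quotas[queue] = &ResourceCount{}
		}
		q := quotas[queue]
		q.NumberGPU += per * w
		availableGPU -= per * w
		// CPU quota follows the queue's own CPU-per-GPU request ratio.
		q.CPU += (requests[queue].CPU / requests[queue].NumberGPU) * per * w
	}

	// Hand any remainder of the integer division to one arbitrary queue,
	// as the diff does, so no quota is lost.
	if availableGPU > 0 {
		for queue := range quotas {
			quotas[queue].NumberGPU += availableGPU
			break
		}
	}

	for queue, q := range quotas {
		fmt.Printf("%s -> %.2f GPUs, %.2f cores of quota\n",
			queue, float64(q.NumberGPU)/100, float64(q.CPU)/100)
	}
}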