1
0
mirror of https://github.com/newnius/YAO-scheduler.git synced 2025-12-15 08:16:43 +00:00
This commit is contained in:
2020-05-23 21:06:31 +08:00
parent 655cf79c00
commit ea2718fe4f
3 changed files with 116 additions and 71 deletions

View File

@@ -38,6 +38,9 @@ type ResourcePool struct {
TotalGPU int
TotalGPUMu sync.Mutex
subscriptions map[string]map[string]int
subscriptionsMu sync.Mutex
}
func (pool *ResourcePool) start() {
@@ -50,6 +53,8 @@ func (pool *ResourcePool) start() {
pool.bindings = map[string]map[string]int{}
pool.utils = map[string][]UtilGPUTimeSeries{}
pool.subscriptions = map[string]map[string]int{}
pool.TotalGPU = 0
/* init pools */
@@ -233,6 +238,8 @@ func (pool *ResourcePool) update(node NodeStatus) {
/* init bindings */
go func(node NodeStatus) {
pool.subscriptionsMu.Lock()
defer pool.subscriptionsMu.Unlock()
pool.bindingsMu.Lock()
defer pool.bindingsMu.Unlock()
for _, gpu := range node.Status {
@@ -242,6 +249,12 @@ func (pool *ResourcePool) update(node NodeStatus) {
UtilGPUTimeSeries{Time: (int)(time.Now().Unix()), Util: gpu.UtilizationGPU})
}
}
if _, ok := pool.subscriptions[gpu.UUID]; ok {
for jobName := range pool.subscriptions[gpu.UUID] {
scheduler.QueryState(jobName)
}
}
}
pool.heartBeatMu.Lock()
pool.heartBeat[node.ClientID] = time.Now()
@@ -438,8 +451,16 @@ func (pool *ResourcePool) releaseNetwork(network string) {
}
func (pool *ResourcePool) attach(GPU string, job string) {
pool.subscriptionsMu.Lock()
defer pool.subscriptionsMu.Unlock()
pool.bindingsMu.Lock()
defer pool.bindingsMu.Unlock()
if _, ok := pool.subscriptions[GPU]; !ok {
pool.subscriptions[GPU] = map[string]int{}
}
pool.subscriptions[GPU][job] = int(time.Now().Unix())
if _, ok := pool.bindings[GPU]; !ok {
pool.bindings[GPU] = map[string]int{}
}
@@ -455,8 +476,15 @@ func (pool *ResourcePool) attach(GPU string, job string) {
}
func (pool *ResourcePool) detach(GPU string, job Job) {
pool.subscriptionsMu.Lock()
defer pool.subscriptionsMu.Unlock()
pool.bindingsMu.Lock()
defer pool.bindingsMu.Unlock()
if _, ok := pool.subscriptions[GPU]; ok {
delete(pool.subscriptions[GPU], job.Name)
}
if _, ok := pool.bindings[GPU]; ok {
if _, ok2 := pool.utils[GPU]; ok2 {
if len(pool.bindings[GPU]) == 1 && job.Status != Failed && job.Status != Stopped {