mirror of https://github.com/newnius/YAO-scheduler.git synced 2025-12-15 08:16:43 +00:00

fix some concurrency problems

2020-04-13 20:29:58 +08:00
parent 83b9a7faae
commit 490ce3f26a
3 changed files with 27 additions and 14 deletions
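All of the fixes below follow one pattern: a map that is read and written from more than one goroutine (heartBeat, bindings, nodes/versions) gets a dedicated sync.Mutex, and every access is wrapped in Lock/Unlock. The following is a minimal, self-contained sketch of that pattern; heartBeatPool and its methods are simplified illustrations of the idiom, not the project's actual code.

package main

import (
	"fmt"
	"sync"
	"time"
)

// heartBeatPool is a simplified stand-in for ResourcePool; the real struct
// has many more fields, but the locking pattern is the same.
type heartBeatPool struct {
	heartBeat   map[string]time.Time
	heartBeatMu sync.Mutex
}

// touch is what the update handler does for every node report: write under the lock.
func (p *heartBeatPool) touch(client string) {
	p.heartBeatMu.Lock()
	p.heartBeat[client] = time.Now()
	p.heartBeatMu.Unlock()
}

// expired is what the background loop does: range over the map under the
// same lock, so the iteration can never race with a concurrent touch.
func (p *heartBeatPool) expired(ttl time.Duration) []string {
	p.heartBeatMu.Lock()
	defer p.heartBeatMu.Unlock()
	var dead []string
	for k, v := range p.heartBeat {
		if v.Add(ttl).Before(time.Now()) {
			dead = append(dead, k)
		}
	}
	return dead
}

func main() {
	p := &heartBeatPool{heartBeat: map[string]time.Time{}}
	p.touch("node-1")
	fmt.Println(p.expired(30 * time.Second)) // empty: the heartbeat is fresh
}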

View File

@@ -19,7 +19,7 @@ type JobManager struct {
 }
 func (jm *JobManager) start() {
-	log.Info("start job ", jm.job.Name, time.Now())
+	log.Debug("start job ", jm.job.Name, time.Now())
 	jm.jobStatus = JobStatus{Name: jm.job.Name, tasks: map[string]TaskStatus{}}
 	network := jm.scheduler.AcquireNetwork()
@@ -49,7 +49,7 @@ func (jm *JobManager) start() {
 	}
 	jm.scheduler.UpdateProgress(jm.job.Name, Running)
-	log.Info("ready to run job ", jm.job.Name, time.Now())
+	log.Debug("ready to run job ", jm.job.Name, time.Now())
 	/* bring up containers */
 	for i := range jm.job.Tasks {

View File

@@ -16,7 +16,8 @@ type ResourcePool struct {
 	history      []PoolStatus
 	heartBeat    map[string]time.Time
+	heartBeatMu  sync.Mutex
 	networks     map[string]bool
 	networksFree map[string]bool
@@ -27,9 +28,9 @@ type ResourcePool struct {
 	counter      int
 	counterTotal int
 	bindings     map[string]map[string]bool
-	utils        map[string][]int
-	bindingMu    sync.Mutex
+	bindingsMu   sync.Mutex
+	utils        map[string][]int
 }
 func (pool *ResourcePool) start() {
@@ -46,12 +47,16 @@ func (pool *ResourcePool) start() {
 		pool.heartBeat = map[string]time.Time{}
 		for {
+			pool.heartBeatMu.Lock()
 			for k, v := range pool.heartBeat {
 				if v.Add(time.Second * 30).Before(time.Now()) {
+					pool.mu.Lock()
 					delete(pool.nodes, k)
 					delete(pool.versions, k)
+					pool.mu.Unlock()
 				}
 			}
+			pool.heartBeatMu.Unlock()
 			time.Sleep(time.Second * 10)
 		}
 	}()
@@ -73,6 +78,7 @@ func (pool *ResourcePool) start() {
 		UtilGPU := 0
 		TotalMemGPU := 0
 		AvailableMemGPU := 0
+		pool.mu.Lock()
 		for _, node := range pool.nodes {
 			UtilCPU += node.UtilCPU
 			TotalCPU += node.NumCPU
@@ -86,8 +92,10 @@ func (pool *ResourcePool) start() {
 				AvailableMemGPU += GPU.MemoryFree
 			}
 		}
+		size := len(pool.nodes)
+		pool.mu.Unlock()
 		summary.TimeStamp = time.Now().Format("2006-01-02 15:04:05")
-		summary.UtilCPU = UtilCPU / (float64(len(pool.nodes)) + 0.001)
+		summary.UtilCPU = UtilCPU / (float64(size) + 0.001)
 		summary.TotalCPU = TotalCPU
 		summary.TotalMem = TotalMem
 		summary.AvailableMem = AvailableMem
@@ -115,6 +123,8 @@ func (pool *ResourcePool) update(node NodeStatus) {
 	defer pool.mu.Unlock()
 	go func(node NodeStatus) {
+		pool.bindingsMu.Lock()
+		defer pool.bindingsMu.Unlock()
 		for _, gpu := range node.Status {
 			if _, ok := pool.bindings[gpu.UUID]; ok {
 				if len(pool.bindings[gpu.UUID]) == 1 {
@@ -122,10 +132,11 @@ func (pool *ResourcePool) update(node NodeStatus) {
 				}
 			}
 		}
+		pool.heartBeatMu.Lock()
+		pool.heartBeat[node.ClientID] = time.Now()
+		pool.heartBeatMu.Unlock()
 	}(node)
-	pool.heartBeat[node.ClientID] = time.Now()
 	pool.counterTotal++
 	if version, ok := pool.versions[node.ClientID]; ok && version == node.Version {
 		return
@@ -211,8 +222,8 @@ func (pool *ResourcePool) releaseNetwork(network string) {
 }
 func (pool *ResourcePool) attach(GPU string, job string) {
-	pool.bindingMu.Lock()
-	defer pool.bindingMu.Unlock()
+	pool.bindingsMu.Lock()
+	defer pool.bindingsMu.Unlock()
 	if _, ok := pool.bindings[GPU]; !ok {
 		pool.bindings[GPU] = map[string]bool{}
 	}
@@ -224,8 +235,8 @@ func (pool *ResourcePool) attach(GPU string, job string) {
 }
 func (pool *ResourcePool) detach(GPU string, jobName string) {
-	pool.bindingMu.Lock()
-	defer pool.bindingMu.Unlock()
+	pool.bindingsMu.Lock()
+	defer pool.bindingsMu.Unlock()
 	if _, ok := pool.bindings[GPU]; ok {
 		if len(pool.bindings[GPU]) == 1 {
 			InstanceOfOptimizer().feed(jobName, pool.utils[GPU])
@@ -240,4 +251,4 @@ func (pool *ResourcePool) detach(GPU string, jobName string) {
 func (pool *ResourcePool) getBindings() map[string]map[string]bool {
 	return pool.bindings
 }
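One detail worth calling out in the summary hunk above: len(pool.nodes) is now copied into size while pool.mu is held, and the later division uses that snapshot, so the map is never touched after the lock is released. Below is a hedged, self-contained sketch of this snapshot-under-lock idiom; nodePool and nodeStatus are illustrative names, not the project's types.

package main

import (
	"fmt"
	"sync"
)

type nodeStatus struct{ UtilCPU float64 }

// nodePool is a simplified stand-in for ResourcePool, keeping only the
// fields this sketch needs.
type nodePool struct {
	mu    sync.Mutex
	nodes map[string]nodeStatus
}

// avgUtilCPU aggregates while holding the lock, snapshots len(nodes) into
// size, then releases the lock before doing the arithmetic, as the diff does.
func (p *nodePool) avgUtilCPU() float64 {
	p.mu.Lock()
	total := 0.0
	for _, n := range p.nodes {
		total += n.UtilCPU
	}
	size := len(p.nodes) // snapshot taken under the lock
	p.mu.Unlock()
	// +0.001 mirrors the diff's guard against dividing by zero on an empty pool.
	return total / (float64(size) + 0.001)
}

func main() {
	p := &nodePool{nodes: map[string]nodeStatus{
		"n1": {UtilCPU: 40},
		"n2": {UtilCPU: 60},
	}}
	fmt.Printf("%.1f\n", p.avgUtilCPU()) // ~50.0
}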

View File

@@ -23,6 +23,7 @@ type SchedulerFair struct {
 	nextQueue           string
 	resourceAllocations map[string]*ResourceCount
 	enabled             bool
+	latestPoolIndex     int
 }
 type FairJobSorter []Job
@@ -71,6 +72,7 @@ func (scheduler *SchedulerFair) Start() {
 					jm.start()
 				}()
 			} else {
+				log.Info("No more jobs to scheduling")
 				scheduler.scheduling.Unlock()
 				go func() {
 					scheduler.UpdateNextQueue()