Mirror of https://github.com/newnius/YAO-scheduler.git, synced 2025-12-15 08:16:43 +00:00
fix some concurrent problems
@@ -19,7 +19,7 @@ type JobManager struct {
 }
 
 func (jm *JobManager) start() {
-	log.Info("start job ", jm.job.Name, time.Now())
+	log.Debug("start job ", jm.job.Name, time.Now())
 	jm.jobStatus = JobStatus{Name: jm.job.Name, tasks: map[string]TaskStatus{}}
 
 	network := jm.scheduler.AcquireNetwork()
@@ -49,7 +49,7 @@ func (jm *JobManager) start() {
 	}
 	jm.scheduler.UpdateProgress(jm.job.Name, Running)
 
-	log.Info("ready to run job ", jm.job.Name, time.Now())
+	log.Debug("ready to run job ", jm.job.Name, time.Now())
 
 	/* bring up containers */
 	for i := range jm.job.Tasks {
@@ -16,7 +16,8 @@ type ResourcePool struct {
 
 	history []PoolStatus
 
 	heartBeat map[string]time.Time
+	heartBeatMu sync.Mutex
 
 	networks map[string]bool
 	networksFree map[string]bool
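For context: Go maps are not safe for concurrent use, and an unsynchronized write during another goroutine's iteration aborts the process with "fatal error: concurrent map iteration and map write". A minimal sketch of the guarded pattern this new field enables — the wrapper type and main loop are illustrative, not the repo's code:

package main

import (
	"sync"
	"time"
)

type pool struct {
	heartBeat   map[string]time.Time
	heartBeatMu sync.Mutex
}

// update is the write side (in YAO, the node-status report path).
func (p *pool) update(id string) {
	p.heartBeatMu.Lock()
	p.heartBeat[id] = time.Now()
	p.heartBeatMu.Unlock()
}

// sweep is the read side (in YAO, the background eviction loop).
func (p *pool) sweep(ttl time.Duration) {
	p.heartBeatMu.Lock()
	defer p.heartBeatMu.Unlock()
	for id, t := range p.heartBeat {
		if time.Since(t) > ttl {
			delete(p.heartBeat, id) // deleting during range is legal in Go
		}
	}
}

func main() {
	p := &pool{heartBeat: map[string]time.Time{}}
	go func() {
		for { // sweeper runs concurrently with the writer, without racing
			p.sweep(30 * time.Second)
			time.Sleep(10 * time.Millisecond)
		}
	}()
	for i := 0; i < 1000; i++ {
		p.update("node-1")
	}
}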
@@ -27,9 +28,9 @@ type ResourcePool struct {
 	counter int
 	counterTotal int
 
 	bindings map[string]map[string]bool
-	utils map[string][]int
-	bindingMu sync.Mutex
+	bindingsMu sync.Mutex
+	utils map[string][]int
 }
 
 func (pool *ResourcePool) start() {
@@ -46,12 +47,16 @@ func (pool *ResourcePool) start() {
 		pool.heartBeat = map[string]time.Time{}
 
 		for {
+			pool.heartBeatMu.Lock()
 			for k, v := range pool.heartBeat {
 				if v.Add(time.Second * 30).Before(time.Now()) {
+					pool.mu.Lock()
 					delete(pool.nodes, k)
 					delete(pool.versions, k)
+					pool.mu.Unlock()
 				}
 			}
+			pool.heartBeatMu.Unlock()
 			time.Sleep(time.Second * 10)
 		}
 	}()
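The sweeper now holds heartBeatMu for the whole scan and takes pool.mu only around the node-map deletions. With two mutexes nested like this, every code path must acquire them in the same order, or two goroutines can deadlock. A simplified sketch of that ordering rule, with field roles assumed from the diff (mu guards the node map, heartBeatMu guards the heartbeat map and is acquired first):

package main

import (
	"fmt"
	"sync"
	"time"
)

type pool struct {
	mu          sync.Mutex
	heartBeatMu sync.Mutex
	nodes       map[string]string
	heartBeat   map[string]time.Time
}

// evictStale follows the same lock order as the sweeper above:
// heartBeatMu first, then mu, released innermost-first.
func (p *pool) evictStale(ttl time.Duration) {
	p.heartBeatMu.Lock()
	defer p.heartBeatMu.Unlock()
	for id, t := range p.heartBeat {
		if time.Since(t) > ttl {
			p.mu.Lock()
			delete(p.nodes, id)
			p.mu.Unlock()
			delete(p.heartBeat, id)
		}
	}
}

func main() {
	p := &pool{
		nodes:     map[string]string{"n1": "10.0.0.1"},
		heartBeat: map[string]time.Time{"n1": time.Now().Add(-time.Minute)},
	}
	p.evictStale(30 * time.Second)
	fmt.Println(len(p.nodes), len(p.heartBeat)) // 0 0
}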
@@ -73,6 +78,7 @@ func (pool *ResourcePool) start() {
 			UtilGPU := 0
 			TotalMemGPU := 0
 			AvailableMemGPU := 0
+			pool.mu.Lock()
 			for _, node := range pool.nodes {
 				UtilCPU += node.UtilCPU
 				TotalCPU += node.NumCPU
@@ -86,8 +92,10 @@ func (pool *ResourcePool) start() {
 					AvailableMemGPU += GPU.MemoryFree
 				}
 			}
+			size := len(pool.nodes)
+			pool.mu.Unlock()
 			summary.TimeStamp = time.Now().Format("2006-01-02 15:04:05")
-			summary.UtilCPU = UtilCPU / (float64(len(pool.nodes)) + 0.001)
+			summary.UtilCPU = UtilCPU / (float64(size) + 0.001)
 			summary.TotalCPU = TotalCPU
 			summary.TotalMem = TotalMem
 			summary.AvailableMem = AvailableMem
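Taking size := len(pool.nodes) while the lock is still held gives the later division a consistent snapshot instead of re-reading the map after unlock. A minimal sketch of the snapshot-under-lock idiom (illustrative type; the +0.001 mirrors the diff's divide-by-zero guard):

package main

import (
	"fmt"
	"sync"
)

type pool struct {
	mu    sync.Mutex
	nodes map[string]int // node -> CPU count
}

// summarize reads everything it needs under the lock, then releases it
// before the formatting work; size stays valid even if another goroutine
// mutates nodes afterwards.
func (p *pool) summarize() string {
	p.mu.Lock()
	total := 0
	for _, cpus := range p.nodes {
		total += cpus
	}
	size := len(p.nodes) // snapshot while still locked
	p.mu.Unlock()

	return fmt.Sprintf("%d nodes, %.2f CPUs/node",
		size, float64(total)/(float64(size)+0.001))
}

func main() {
	p := &pool{nodes: map[string]int{"n1": 8, "n2": 16}}
	fmt.Println(p.summarize()) // 2 nodes, 11.99 CPUs/node
}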
@@ -115,6 +123,8 @@ func (pool *ResourcePool) update(node NodeStatus) {
 	defer pool.mu.Unlock()
 
 	go func(node NodeStatus) {
+		pool.bindingsMu.Lock()
+		defer pool.bindingsMu.Unlock()
 		for _, gpu := range node.Status {
 			if _, ok := pool.bindings[gpu.UUID]; ok {
 				if len(pool.bindings[gpu.UUID]) == 1 {
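The new guard pairs Lock with defer Unlock, so the mutex is released on every return path of the goroutine, including a panic inside the loop. A small illustration of the idiom (names are hypothetical):

package main

import (
	"fmt"
	"sync"
)

type counter struct {
	mu sync.Mutex
	n  int
}

// add releases the mutex via defer, so an early return (or a panic in a
// longer body) can never leak the lock.
func (c *counter) add(delta int) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if delta == 0 {
		return // still unlocks
	}
	c.n += delta
}

func main() {
	var wg sync.WaitGroup
	c := &counter{}
	for i := 0; i < 100; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			c.add(1)
		}()
	}
	wg.Wait()
	fmt.Println(c.n) // 100
}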
@@ -122,10 +132,11 @@ func (pool *ResourcePool) update(node NodeStatus) {
 				}
 			}
 		}
+		pool.heartBeatMu.Lock()
+		pool.heartBeat[node.ClientID] = time.Now()
+		pool.heartBeatMu.Unlock()
 	}(node)
 
-	pool.heartBeat[node.ClientID] = time.Now()
-
 	pool.counterTotal++
 	if version, ok := pool.versions[node.ClientID]; ok && version == node.Version {
 		return
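Before this hunk, the heartbeat timestamp was written with no lock at all while the sweeper goroutine iterated the same map; the write now lives inside the goroutine under heartBeatMu. Go's race detector (go run -race, go test -race) flags exactly this shape. A deliberately racy reproduction of the old pattern, for illustration only — names echo the diff, and the program may crash or be reported by -race:

package main

import "time"

func main() {
	heartBeat := map[string]time.Time{}
	go func() {
		for {
			for range heartBeat { // sweeper side: iterates the map
			}
			time.Sleep(time.Millisecond)
		}
	}()
	for i := 0; i < 1000; i++ {
		heartBeat["node-1"] = time.Now() // handler side: unguarded write, data race
		time.Sleep(time.Microsecond)
	}
}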
@@ -211,8 +222,8 @@ func (pool *ResourcePool) releaseNetwork(network string) {
 }
 
 func (pool *ResourcePool) attach(GPU string, job string) {
-	pool.bindingMu.Lock()
-	defer pool.bindingMu.Unlock()
+	pool.bindingsMu.Lock()
+	defer pool.bindingsMu.Unlock()
 	if _, ok := pool.bindings[GPU]; !ok {
 		pool.bindings[GPU] = map[string]bool{}
 	}
@@ -224,8 +235,8 @@ func (pool *ResourcePool) attach(GPU string, job string) {
 }
 
 func (pool *ResourcePool) detach(GPU string, jobName string) {
-	pool.bindingMu.Lock()
-	defer pool.bindingMu.Unlock()
+	pool.bindingsMu.Lock()
+	defer pool.bindingsMu.Unlock()
 	if _, ok := pool.bindings[GPU]; ok {
 		if len(pool.bindings[GPU]) == 1 {
 			InstanceOfOptimizer().feed(jobName, pool.utils[GPU])
@@ -240,4 +251,4 @@ func (pool *ResourcePool) detach(GPU string, jobName string) {
 
 func (pool *ResourcePool) getBindings() map[string]map[string]bool {
 	return pool.bindings
 }
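One observation, not part of this commit: getBindings still returns the live map, which escapes bindingsMu, so callers iterate it unguarded. A defensive variant would copy under the lock — the helper name below is hypothetical, sketched against the fields shown above:

// Hypothetical helper, not in the commit: copies the bindings under the
// lock so callers can iterate without racing attach/detach.
func (pool *ResourcePool) getBindingsCopy() map[string]map[string]bool {
	pool.bindingsMu.Lock()
	defer pool.bindingsMu.Unlock()
	out := make(map[string]map[string]bool, len(pool.bindings))
	for gpu, jobs := range pool.bindings {
		jobsCopy := make(map[string]bool, len(jobs))
		for job, bound := range jobs {
			jobsCopy[job] = bound
		}
		out[gpu] = jobsCopy
	}
	return out
}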
@@ -23,6 +23,7 @@ type SchedulerFair struct {
 	nextQueue string
 	resourceAllocations map[string]*ResourceCount
 	enabled bool
+	latestPoolIndex int
 }
 
 type FairJobSorter []Job
@@ -71,6 +72,7 @@ func (scheduler *SchedulerFair) Start() {
 				jm.start()
 			}()
 		} else {
+			log.Info("No more jobs to scheduling")
 			scheduler.scheduling.Unlock()
 			go func() {
 				scheduler.UpdateNextQueue()