Mirror of https://github.com/newnius/YAO-scheduler.git, synced 2025-12-12 23:36:44 +00:00
update
@@ -58,7 +58,7 @@ func (jm *JobManager) start() {
 			log.Info("return resource ", tt.ClientID)
 			jm.resources[i].ClientID = "null"
 			for _, t := range tt.Status {
-				jm.scheduler.Detach(t.UUID, jm.job.Name)
+				jm.scheduler.Detach(t.UUID, jm.job)
 			}
 		}
 	}
@@ -170,7 +170,7 @@ func (jm *JobManager) start() {
 				InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, res.Status[i])
 			} else {
 				log.Info(jm.job.Name, "-", i, " ", res.Status[i].Status)
-				if exitCode, ok := res.Status[i].State["ExitCode"].(float64); ok {
+				if exitCode, ok := res.Status[i].State["ExitCode"].(float64); ok && !jm.job.Tasks[i].IsPS {
 					if exitCode != 0 && !jm.killedFlag {
 						log.Warn(jm.job.Name+"-"+jm.job.Tasks[i].Name+" exited unexpected, exitCode=", exitCode)
 						jm.killedFlag = true
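
The new "&& !jm.job.Tasks[i].IsPS" guard keeps a parameter-server task's exit code from failing the whole job: in a PS/worker setup the PS containers are typically torn down (and exit non-zero) once the workers finish. Below is a minimal sketch of the resulting condition; Task, shouldMarkFailed and the sample values are simplified illustrations, not the repo's code.

package main

import "fmt"

// Task is a simplified stand-in for the scheduler's task struct; only the
// fields this hunk touches are kept.
type Task struct {
	Name string
	IsPS bool
}

// shouldMarkFailed mirrors the updated condition: a non-zero exit code only
// marks the job as failed when the task is not a parameter server and the
// job has not already been killed.
func shouldMarkFailed(exitCode float64, task Task, killedFlag bool) bool {
	return !task.IsPS && exitCode != 0 && !killedFlag
}

func main() {
	fmt.Println(shouldMarkFailed(137, Task{Name: "ps0", IsPS: true}, false))      // false: PS exit ignored
	fmt.Println(shouldMarkFailed(137, Task{Name: "worker0", IsPS: false}, false)) // true: worker failure
}
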
@@ -195,7 +195,7 @@ func (jm *JobManager) start() {
 				jm.resources[i].ClientID = "null"
 
 				for _, t := range jm.resources[i].Status {
-					jm.scheduler.Detach(t.UUID, jm.job.Name)
+					jm.scheduler.Detach(t.UUID, jm.job)
 				}
 
 				InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, res.Status[i])
@@ -102,12 +102,12 @@ func (pool *ResourcePool) checkDeadNodes() {
 			}
 
 			pool.TotalGPUMu.Lock()
+			seg.Lock.Lock()
 			if _, ok := seg.Nodes[k]; ok {
 				pool.TotalGPU -= len(seg.Nodes[k].Status)
 			}
 			pool.TotalGPUMu.Unlock()
 
-			seg.Lock.Lock()
 			delete(seg.Nodes, k)
 			seg.Lock.Unlock()
 			pool.versionsMu.Lock()
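
Taking seg.Lock before reading seg.Nodes[k] means the segment stays locked for the whole read-then-delete sequence, so the GPU count that is subtracted refers to the same node entry that is deleted right after. A trimmed-down sketch of the resulting lock order follows; NodeStatus, PoolSegment and removeDeadNode are simplified assumptions, not the real structs.

package main

import (
	"fmt"
	"sync"
)

// NodeStatus and PoolSegment are stand-ins with only the fields this hunk touches.
type NodeStatus struct {
	Status []string // one entry per GPU on the node
}

type PoolSegment struct {
	Lock  sync.Mutex
	Nodes map[string]*NodeStatus
}

type ResourcePool struct {
	TotalGPUMu sync.Mutex
	TotalGPU   int
}

// removeDeadNode follows the lock order after this hunk: the segment lock is
// taken before the node entry is read and held until the entry is deleted,
// so the GPU count and the deletion see the same snapshot.
func (pool *ResourcePool) removeDeadNode(seg *PoolSegment, k string) {
	pool.TotalGPUMu.Lock()
	seg.Lock.Lock()
	if node, ok := seg.Nodes[k]; ok {
		pool.TotalGPU -= len(node.Status)
	}
	pool.TotalGPUMu.Unlock()

	delete(seg.Nodes, k)
	seg.Lock.Unlock()
}

func main() {
	pool := &ResourcePool{TotalGPU: 4}
	seg := &PoolSegment{Nodes: map[string]*NodeStatus{
		"node-1": {Status: []string{"GPU-0", "GPU-1"}},
	}}
	pool.removeDeadNode(seg, "node-1")
	fmt.Println(pool.TotalGPU, len(seg.Nodes)) // 2 0
}
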
@@ -361,7 +361,6 @@ func (pool *ResourcePool) list() MsgResource {
 
 	start := pool.pools[0].Next
 	for cur := start; ; {
-		log.Info(cur.ID)
 		cur.Lock.Lock()
 		for k, node := range cur.Nodes {
 			nodes[k] = *node
@@ -441,20 +440,20 @@ func (pool *ResourcePool) attach(GPU string, job string) {
 	}
 }
 
-func (pool *ResourcePool) detach(GPU string, jobName string) {
+func (pool *ResourcePool) detach(GPU string, job Job) {
 	pool.bindingsMu.Lock()
 	defer pool.bindingsMu.Unlock()
 	if _, ok := pool.bindings[GPU]; ok {
 		if _, ok2 := pool.utils[GPU]; ok2 {
-			if len(pool.bindings[GPU]) == 1 {
-				InstanceOfOptimizer().feed(jobName, pool.utils[GPU])
+			if len(pool.bindings[GPU]) == 1 && job.Status == Finished {
+				InstanceOfOptimizer().feed(job.Name, pool.utils[GPU])
 			}
 			delete(pool.utils, GPU)
 		}
 	}
 
 	if list, ok := pool.bindings[GPU]; ok {
-		delete(list, jobName)
+		delete(list, job.Name)
 	}
 }
 
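
pool.detach now receives the whole Job rather than just its name, so a GPU's utilization history is fed to the optimizer only when the last job bound to it actually reached the Finished state; killed or aborted jobs no longer contribute samples. A minimal sketch of that gate under assumed shapes follows; JobStatus, UtilSample and feedIfFinished are illustrative, not the repo's types.

package main

import "fmt"

// Assumed minimal shapes for this sketch; the real Job, bindings map and
// optimizer live elsewhere in the repo.
type JobStatus int

const (
	Running JobStatus = iota
	Finished
)

type Job struct {
	Name   string
	Status JobStatus
}

type UtilSample struct{ Util, Memory int }

// feedIfFinished mirrors the new condition in pool.detach: utilization
// samples are forwarded only when this job is the GPU's sole binding and it
// finished normally.
func feedIfFinished(bindings map[string]map[string]bool, utils map[string][]UtilSample, gpu string, job Job) bool {
	if len(bindings[gpu]) == 1 && job.Status == Finished {
		fmt.Println("feed optimizer:", job.Name, "->", len(utils[gpu]), "samples")
		return true
	}
	return false
}

func main() {
	bindings := map[string]map[string]bool{"GPU-0": {"resnet50": true}}
	utils := map[string][]UtilSample{"GPU-0": {{92, 8000}, {88, 7900}}}
	fmt.Println(feedIfFinished(bindings, utils, "GPU-0", Job{Name: "resnet50", Status: Finished})) // true
	fmt.Println(feedIfFinished(bindings, utils, "GPU-0", Job{Name: "resnet50", Status: Running}))  // false
}
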
@@ -27,7 +27,7 @@ type Scheduler interface {
 
 	Attach(GPU string, job string)
 
-	Detach(GPU string, job string)
+	Detach(GPU string, job Job)
 
 	Enable() bool
 
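
Because Detach is declared on the Scheduler interface, every implementation (FCFS, fair, priority, as the hunks below show) has to switch to the Job parameter in the same commit. Here is a small sketch of the updated contract and a call site; toyScheduler and the sample job are assumptions for illustration, not one of the repo's schedulers.

package main

import "fmt"

// Job is a stand-in with just the field the call sites below need.
type Job struct{ Name string }

// Scheduler lists only the two methods touched by this commit; the real
// interface declares more.
type Scheduler interface {
	Attach(GPU string, job string)
	Detach(GPU string, job Job)
}

type toyScheduler struct{}

func (s *toyScheduler) Attach(GPU string, job string) { fmt.Println("attach", GPU, "to", job) }
func (s *toyScheduler) Detach(GPU string, job Job)    { fmt.Println("detach", GPU, "from", job.Name) }

func main() {
	var s Scheduler = &toyScheduler{}
	s.Attach("GPU-0", "resnet50")
	// JobManager-style call sites now pass the whole Job value instead of its name.
	s.Detach("GPU-0", Job{Name: "resnet50"})
}
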
@@ -253,7 +253,7 @@ func (scheduler *SchedulerFCFS) Attach(GPU string, job string) {
 	pool.attach(GPU, job)
 }
 
-func (scheduler *SchedulerFCFS) Detach(GPU string, job string) {
+func (scheduler *SchedulerFCFS) Detach(GPU string, job Job) {
 	pool.detach(GPU, job)
 }
 
@@ -181,8 +181,8 @@ func (scheduler *SchedulerFair) Start() {
 			if len(t) == 0 || !InstanceOfGroupManager().groups[t[0].Group].Reserved {
 				continue
 			}
-			log.Info(scheduler.queueUsingGPU)
-			log.Info(scheduler.queuesSchedulingCnt)
+			//log.Info(scheduler.queueUsingGPU)
+			//log.Info(scheduler.queuesSchedulingCnt)
 			scheduler.queuesUsingGPUMu.Lock()
 			if cnt, ok := scheduler.queuesSchedulingCnt[t[0].Group]; ok && cnt > 0 {
 				scheduler.queuesUsingGPUMu.Unlock()
@@ -495,7 +495,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
 
 	if len(candidates) > 0 {
 		log.Info("allocationType is ", allocationType)
-		log.Info(candidates)
+		//log.Info(candidates)
 	}
 
 	/* assign */
@@ -586,7 +586,7 @@ func (scheduler *SchedulerFair) ReleaseResource(job Job, agent NodeStatus) {
 				scheduler.UsingGPUMu.Unlock()
 				log.Info(node.Status[j].UUID, " is released")
 			}
-			log.Info(node.Status[j].MemoryAllocated)
+			//log.Info(node.Status[j].MemoryAllocated)
 		}
 	}
 }
@@ -801,7 +801,7 @@ func (scheduler *SchedulerFair) Attach(GPU string, job string) {
 	pool.attach(GPU, job)
 }
 
-func (scheduler *SchedulerFair) Detach(GPU string, job string) {
+func (scheduler *SchedulerFair) Detach(GPU string, job Job) {
 	pool.detach(GPU, job)
 }
 
@@ -277,7 +277,7 @@ func (scheduler *SchedulerPriority) Attach(GPU string, job string) {
 	pool.attach(GPU, job)
 }
 
-func (scheduler *SchedulerPriority) Detach(GPU string, job string) {
+func (scheduler *SchedulerPriority) Detach(GPU string, job Job) {
 	pool.detach(GPU, job)
 }
 