mirror of
https://github.com/newnius/YAO-scheduler.git
synced 2025-06-06 22:01:55 +00:00
bugfix, set status to launching when task is launching
This commit is contained in:
parent
731f173503
commit
72d8083d04
@ -24,7 +24,7 @@ type JobManager struct {
|
|||||||
/* status */
|
/* status */
|
||||||
jobStatus JobStatus
|
jobStatus JobStatus
|
||||||
isRunning bool
|
isRunning bool
|
||||||
lastHeartBeat int64
|
lastHeartBeat map[string]int64
|
||||||
|
|
||||||
/* history info */
|
/* history info */
|
||||||
stats [][]TaskStatus
|
stats [][]TaskStatus
|
||||||
@ -33,7 +33,7 @@ type JobManager struct {
|
|||||||
func (jm *JobManager) start() {
|
func (jm *JobManager) start() {
|
||||||
log.Info("start job ", jm.job.Name, " at ", time.Now())
|
log.Info("start job ", jm.job.Name, " at ", time.Now())
|
||||||
jm.isRunning = true
|
jm.isRunning = true
|
||||||
jm.lastHeartBeat = time.Now().Unix()
|
jm.lastHeartBeat = map[string]int64{}
|
||||||
jm.jobStatus = JobStatus{Name: jm.job.Name, tasks: map[string]TaskStatus{}}
|
jm.jobStatus = JobStatus{Name: jm.job.Name, tasks: map[string]TaskStatus{}}
|
||||||
jm.resources = map[string]NodeStatus{}
|
jm.resources = map[string]NodeStatus{}
|
||||||
|
|
||||||
@ -52,7 +52,7 @@ func (jm *JobManager) start() {
|
|||||||
for i, node := range resources {
|
for i, node := range resources {
|
||||||
jm.resources[jm.job.Tasks[i].Name] = node
|
jm.resources[jm.job.Tasks[i].Name] = node
|
||||||
}
|
}
|
||||||
log.Info(jm.job.Name, " receive resource", jm.resources)
|
log.Info(jm.job.Name, " receive resource ", jm.resources)
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
/* sleep random Millisecond to avoid deadlock */
|
/* sleep random Millisecond to avoid deadlock */
|
||||||
@ -70,7 +70,7 @@ func (jm *JobManager) start() {
|
|||||||
jm.scheduler.UpdateProgress(jm.job, Finished)
|
jm.scheduler.UpdateProgress(jm.job, Finished)
|
||||||
}
|
}
|
||||||
jm.returnResource()
|
jm.returnResource()
|
||||||
log.Info("JobMaster exited ", jm.job.Name)
|
log.Info(jm.job.Name, "JobMaster exited ")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -154,6 +154,7 @@ func (jm *JobManager) start() {
|
|||||||
}
|
}
|
||||||
taskStatus := TaskStatus{Id: res.Id, Node: node.ClientHost, HostName: jm.job.Tasks[i].Name}
|
taskStatus := TaskStatus{Id: res.Id, Node: node.ClientHost, HostName: jm.job.Tasks[i].Name}
|
||||||
jm.jobStatus.tasks[task.Name] = taskStatus
|
jm.jobStatus.tasks[task.Name] = taskStatus
|
||||||
|
jm.lastHeartBeat[task.Name] = time.Now().Unix()
|
||||||
|
|
||||||
}(task, jm.resources[task.Name])
|
}(task, jm.resources[task.Name])
|
||||||
}
|
}
|
||||||
@ -163,6 +164,8 @@ func (jm *JobManager) start() {
|
|||||||
jm.isRunning = false
|
jm.isRunning = false
|
||||||
jm.scheduler.UpdateProgress(jm.job, Failed)
|
jm.scheduler.UpdateProgress(jm.job, Failed)
|
||||||
jm.stop()
|
jm.stop()
|
||||||
|
} else {
|
||||||
|
log.Info(jm.job.Name, " all tasks launched success")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -171,8 +174,11 @@ func (jm *JobManager) start() {
|
|||||||
if !jm.isRunning {
|
if !jm.isRunning {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
if time.Now().Unix()-jm.lastHeartBeat > 30 {
|
now := time.Now().Unix()
|
||||||
log.Warn(jm.job.Name, " heartbeat longer tha 30s")
|
for task, pre := range jm.lastHeartBeat {
|
||||||
|
if now-pre > 30 {
|
||||||
|
log.Warn(jm.job.Name, "-", task, " heartbeat longer tha 30s")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
time.Sleep(time.Second * 1)
|
time.Sleep(time.Second * 1)
|
||||||
}
|
}
|
||||||
@ -239,7 +245,7 @@ func (jm *JobManager) start() {
|
|||||||
// }(task)
|
// }(task)
|
||||||
//}
|
//}
|
||||||
|
|
||||||
log.Info("JobMaster exited ", jm.job.Name)
|
log.Info(jm.job.Name, " JobMaster exited ")
|
||||||
}
|
}
|
||||||
|
|
||||||
/* release all resource */
|
/* release all resource */
|
||||||
@ -268,7 +274,7 @@ func (jm *JobManager) checkStatus(status []TaskStatus) {
|
|||||||
flagRunning := false
|
flagRunning := false
|
||||||
onlyPS := true
|
onlyPS := true
|
||||||
for i := range status {
|
for i := range status {
|
||||||
if status[i].Status == "ready" || status[i].Status == "running" {
|
if status[i].Status == "ready" || status[i].Status == "running" || status[i].Status == "launching" {
|
||||||
flagRunning = true
|
flagRunning = true
|
||||||
if !jm.job.Tasks[i].IsPS {
|
if !jm.job.Tasks[i].IsPS {
|
||||||
onlyPS = false
|
onlyPS = false
|
||||||
@ -287,7 +293,7 @@ func (jm *JobManager) checkStatus(status []TaskStatus) {
|
|||||||
jm.scheduler.UpdateProgress(jm.job, Failed)
|
jm.scheduler.UpdateProgress(jm.job, Failed)
|
||||||
jm.stop()
|
jm.stop()
|
||||||
} else if jm.isRunning {
|
} else if jm.isRunning {
|
||||||
log.Info("Some instance exited, close others")
|
log.Info(jm.job.Name, " Some instance exited, close others")
|
||||||
jm.isRunning = false
|
jm.isRunning = false
|
||||||
jm.scheduler.UpdateProgress(jm.job, Finished)
|
jm.scheduler.UpdateProgress(jm.job, Finished)
|
||||||
jm.stop()
|
jm.stop()
|
||||||
@ -297,7 +303,7 @@ func (jm *JobManager) checkStatus(status []TaskStatus) {
|
|||||||
nodeID := jm.job.Tasks[i].Name
|
nodeID := jm.job.Tasks[i].Name
|
||||||
if _, ok := jm.resources[nodeID]; ok {
|
if _, ok := jm.resources[nodeID]; ok {
|
||||||
jm.scheduler.ReleaseResource(jm.job, jm.resources[nodeID])
|
jm.scheduler.ReleaseResource(jm.job, jm.resources[nodeID])
|
||||||
log.Info("return resource ", jm.resources[nodeID].ClientID)
|
log.Info(jm.job.Name, " return resource ", jm.resources[nodeID].ClientID)
|
||||||
|
|
||||||
for _, t := range jm.resources[nodeID].Status {
|
for _, t := range jm.resources[nodeID].Status {
|
||||||
InstanceOfResourcePool().detach(t.UUID, jm.job)
|
InstanceOfResourcePool().detach(t.UUID, jm.job)
|
||||||
@ -307,16 +313,17 @@ func (jm *JobManager) checkStatus(status []TaskStatus) {
|
|||||||
}
|
}
|
||||||
jm.resourcesMu.Unlock()
|
jm.resourcesMu.Unlock()
|
||||||
}
|
}
|
||||||
|
jm.lastHeartBeat[jm.job.Tasks[i].Name] = time.Now().Unix()
|
||||||
}
|
}
|
||||||
if flagRunning && onlyPS && jm.isRunning {
|
if flagRunning && onlyPS && jm.isRunning {
|
||||||
log.Info("Only PS is running, stop ", jm.job.Name)
|
log.Info(jm.job.Name, " Only PS is running, stop ")
|
||||||
jm.isRunning = false
|
jm.isRunning = false
|
||||||
jm.scheduler.UpdateProgress(jm.job, Finished)
|
jm.scheduler.UpdateProgress(jm.job, Finished)
|
||||||
jm.stop()
|
jm.stop()
|
||||||
}
|
}
|
||||||
|
|
||||||
if !flagRunning && jm.isRunning {
|
if !flagRunning && jm.isRunning {
|
||||||
log.Info("finish job ", jm.job.Name)
|
log.Info(jm.job.Name, " finish job ")
|
||||||
jm.isRunning = false
|
jm.isRunning = false
|
||||||
jm.scheduler.UpdateProgress(jm.job, Finished)
|
jm.scheduler.UpdateProgress(jm.job, Finished)
|
||||||
}
|
}
|
||||||
@ -367,6 +374,7 @@ func (jm *JobManager) status() MsgJobStatus {
|
|||||||
|
|
||||||
/* still in launching phase */
|
/* still in launching phase */
|
||||||
if len(taskStatus.Node) == 0 {
|
if len(taskStatus.Node) == 0 {
|
||||||
|
tasksStatus[i] = TaskStatus{Status: "launching", State: map[string]interface{}{"ExitCode": float64(0)}}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -25,7 +25,7 @@ func (logger *Logger) Debug(args ...interface{}) {
|
|||||||
if ok && details != nil {
|
if ok && details != nil {
|
||||||
module = details.Name()
|
module = details.Name()
|
||||||
}
|
}
|
||||||
args = append(args, module)
|
args = append(args, "<--"+module)
|
||||||
_log.Debug(args...)
|
_log.Debug(args...)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -36,7 +36,7 @@ func (logger *Logger) Info(args ...interface{}) {
|
|||||||
if ok && details != nil {
|
if ok && details != nil {
|
||||||
module = details.Name()
|
module = details.Name()
|
||||||
}
|
}
|
||||||
args = append(args, module)
|
args = append(args, "<--"+module)
|
||||||
_log.Info(args...)
|
_log.Info(args...)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -47,7 +47,7 @@ func (logger *Logger) Warn(args ...interface{}) {
|
|||||||
if ok && details != nil {
|
if ok && details != nil {
|
||||||
module = details.Name()
|
module = details.Name()
|
||||||
}
|
}
|
||||||
args = append(args, module)
|
args = append(args, "<--"+module)
|
||||||
_log.Warn(args...)
|
_log.Warn(args...)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -58,7 +58,7 @@ func (logger *Logger) Fatal(args ...interface{}) {
|
|||||||
if ok && details != nil {
|
if ok && details != nil {
|
||||||
module = details.Name()
|
module = details.Name()
|
||||||
}
|
}
|
||||||
args = append(args, module)
|
args = append(args, "<--"+module)
|
||||||
_log.Fatal(args...)
|
_log.Fatal(args...)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -69,7 +69,7 @@ func (logger *Logger) Fatalf(format string, args ...interface{}) {
|
|||||||
if ok && details != nil {
|
if ok && details != nil {
|
||||||
module = details.Name()
|
module = details.Name()
|
||||||
}
|
}
|
||||||
args = append(args, module)
|
args = append(args, "<--"+module)
|
||||||
_log.Fatalf(format, args...)
|
_log.Fatalf(format, args...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user