2019-03-04 09:19:55 +00:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"time"
|
2019-03-20 03:14:07 +00:00
|
|
|
"net/url"
|
|
|
|
"strings"
|
|
|
|
"io/ioutil"
|
|
|
|
"encoding/json"
|
2019-07-10 12:40:43 +00:00
|
|
|
log "github.com/sirupsen/logrus"
|
2020-05-25 03:35:44 +00:00
|
|
|
"sync"
|
|
|
|
"strconv"
|
|
|
|
"math/rand"
|
2020-05-04 06:10:29 +00:00
|
|
|
)
|
2019-03-04 09:19:55 +00:00
|
|
|
|
|
|
|
// JobManager drives the full lifecycle of one submitted job: acquiring
// resources from the scheduler, launching task containers on agent nodes,
// polling their status, and releasing resources when the job ends.
type JobManager struct {
	// scheduler grants/releases resources and receives progress updates.
	scheduler Scheduler
	// job is the job description this manager is responsible for.
	job Job
	// jobStatus tracks per-task container placement (id/node), keyed by task name.
	jobStatus JobStatus
	// resources holds the node allocations granted by the scheduler,
	// one entry per task (index-aligned with job.Tasks).
	resources []NodeStatus
	// resourcesMu guards resources, in particular the ClientID
	// "_released_" marker used to make release idempotent.
	resourcesMu sync.Mutex
	// isRunning is true while containers are up and being monitored.
	isRunning bool
	// killFlag is set when the job is stopped (failure, completion of a
	// sibling task, or an explicit kill) to break the control loops.
	killFlag bool
	// network is the private container network acquired for this job;
	// empty once released.
	network string
	// stats accumulates one status snapshot per poll, each snapshot
	// index-aligned with job.Tasks; fed to the optimizer at the end.
	stats [][]TaskStatus
}
|
|
|
|
|
2019-03-20 03:14:07 +00:00
|
|
|
func (jm *JobManager) start() {
|
2020-05-25 05:28:58 +00:00
|
|
|
log.Info("start job ", jm.job.Name, " at ", time.Now())
|
2020-05-23 16:47:40 +00:00
|
|
|
jm.isRunning = false
|
2020-05-25 03:35:44 +00:00
|
|
|
jm.killFlag = false
|
2019-03-25 07:36:30 +00:00
|
|
|
jm.jobStatus = JobStatus{Name: jm.job.Name, tasks: map[string]TaskStatus{}}
|
2019-03-20 03:14:07 +00:00
|
|
|
|
2020-05-25 03:35:44 +00:00
|
|
|
/* register in JHL */
|
2020-03-29 13:12:44 +00:00
|
|
|
InstanceJobHistoryLogger().submitJob(jm.job)
|
|
|
|
|
2020-05-25 03:35:44 +00:00
|
|
|
/* request for private network */
|
|
|
|
jm.network = InstanceOfResourcePool().acquireNetwork()
|
2020-05-01 06:06:12 +00:00
|
|
|
|
2020-05-25 03:35:44 +00:00
|
|
|
/* request for resources */
|
2020-05-24 13:07:02 +00:00
|
|
|
for {
|
2020-05-25 03:35:44 +00:00
|
|
|
if jm.killFlag {
|
2020-05-24 13:07:02 +00:00
|
|
|
break
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
2020-05-25 03:35:44 +00:00
|
|
|
jm.resources = jm.scheduler.AcquireResource(jm.job)
|
|
|
|
if len(jm.resources) > 0 {
|
2020-06-10 15:55:50 +00:00
|
|
|
log.Info(jm.job.Name, " receive resource", jm.resources)
|
2020-05-24 13:07:02 +00:00
|
|
|
break
|
2020-04-11 03:38:04 +00:00
|
|
|
}
|
2020-05-25 03:35:44 +00:00
|
|
|
/* sleep random Millisecond to avoid deadlock */
|
|
|
|
time.Sleep(time.Millisecond * time.Duration(500+rand.Intn(500)))
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
2020-06-30 08:16:30 +00:00
|
|
|
jm.job.StartedAt = time.Now().Unix()
|
2020-05-24 13:07:02 +00:00
|
|
|
|
2020-06-14 13:12:22 +00:00
|
|
|
if InstanceOfConfiguration().mock {
|
2020-06-14 13:14:43 +00:00
|
|
|
jm.scheduler.UpdateProgress(jm.job, Running)
|
2020-06-29 15:24:33 +00:00
|
|
|
jm.job.Status = Running
|
2020-06-14 13:12:22 +00:00
|
|
|
jm.isRunning = false
|
|
|
|
duration := InstanceOfMocker().GetDuration(jm.job, jm.resources)
|
|
|
|
log.Info("mock ", jm.job.Name, ", wait ", duration)
|
|
|
|
time.Sleep(time.Second * time.Duration(duration))
|
|
|
|
jm.returnResource([]TaskStatus{})
|
2020-06-14 13:37:56 +00:00
|
|
|
jm.scheduler.UpdateProgress(jm.job, Finished)
|
2020-06-29 15:24:33 +00:00
|
|
|
jm.job.Status = Finished
|
2020-06-14 13:12:22 +00:00
|
|
|
log.Info("JobMaster exited ", jm.job.Name)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2020-07-01 03:44:09 +00:00
|
|
|
isShare := false
|
|
|
|
isScheduleAhead := false
|
2020-05-25 03:35:44 +00:00
|
|
|
if !jm.killFlag {
|
|
|
|
/* switch to Running state */
|
2020-05-05 08:12:46 +00:00
|
|
|
jm.scheduler.UpdateProgress(jm.job, Running)
|
2020-06-29 15:24:33 +00:00
|
|
|
jm.job.Status = Running
|
2020-04-12 02:44:32 +00:00
|
|
|
|
2020-05-25 03:35:44 +00:00
|
|
|
/* bring up containers */
|
|
|
|
wg := sync.WaitGroup{}
|
|
|
|
for i := range jm.job.Tasks {
|
|
|
|
wg.Add(1)
|
2019-04-12 09:21:09 +00:00
|
|
|
|
2020-05-25 03:35:44 +00:00
|
|
|
go func(index int) {
|
|
|
|
defer wg.Done()
|
|
|
|
var UUIDs []string
|
2020-06-30 15:18:23 +00:00
|
|
|
shouldWait := "0"
|
2020-05-25 03:35:44 +00:00
|
|
|
for _, GPU := range jm.resources[index].Status {
|
|
|
|
UUIDs = append(UUIDs, GPU.UUID)
|
2020-06-30 15:18:23 +00:00
|
|
|
if GPU.MemoryUsed == GPU.MemoryTotal {
|
|
|
|
shouldWait = "1"
|
2020-07-01 03:44:09 +00:00
|
|
|
isScheduleAhead = true
|
|
|
|
} else if GPU.MemoryUsed > 0 {
|
|
|
|
isShare = true
|
2020-06-30 15:18:23 +00:00
|
|
|
}
|
2020-05-25 03:35:44 +00:00
|
|
|
/* attach to GPUs */
|
2020-06-30 08:17:20 +00:00
|
|
|
InstanceOfResourcePool().attach(GPU.UUID, jm.job)
|
2020-05-25 03:35:44 +00:00
|
|
|
}
|
|
|
|
GPUs := strings.Join(UUIDs, ",")
|
|
|
|
|
|
|
|
v := url.Values{}
|
|
|
|
v.Set("image", jm.job.Tasks[index].Image)
|
|
|
|
v.Set("cmd", jm.job.Tasks[index].Cmd)
|
|
|
|
v.Set("name", jm.job.Tasks[index].Name)
|
|
|
|
v.Set("workspace", jm.job.Workspace)
|
|
|
|
v.Set("gpus", GPUs)
|
|
|
|
v.Set("mem_limit", strconv.Itoa(jm.job.Tasks[index].Memory)+"m")
|
|
|
|
v.Set("cpu_limit", strconv.Itoa(jm.job.Tasks[index].NumberCPU))
|
|
|
|
v.Set("network", jm.network)
|
2020-06-30 15:18:23 +00:00
|
|
|
v.Set("should_wait", shouldWait)
|
2020-05-25 03:35:44 +00:00
|
|
|
v.Set("output_dir", "/tmp/")
|
2020-07-02 08:58:44 +00:00
|
|
|
v.Set("hdfs_address", InstanceOfConfiguration().HDFSAddress)
|
|
|
|
v.Set("hdfs_dir", InstanceOfConfiguration().HDFSBaseDir+jm.job.Name)
|
2020-05-25 03:35:44 +00:00
|
|
|
v.Set("gpu_mem", strconv.Itoa(jm.job.Tasks[index].MemoryGPU))
|
2020-07-02 09:14:32 +00:00
|
|
|
if InstanceOfConfiguration().DFSBaseDir != "" {
|
|
|
|
v.Set("dfs_src", InstanceOfConfiguration().DFSBaseDir+jm.job.Name+"/task-"+strconv.Itoa(index))
|
|
|
|
} else {
|
|
|
|
v.Set("dfs_src", "")
|
|
|
|
}
|
2020-06-03 01:26:22 +00:00
|
|
|
v.Set("dfs_dst", "/tmp")
|
2020-05-25 03:35:44 +00:00
|
|
|
|
|
|
|
resp, err := doRequest("POST", "http://"+jm.resources[index].ClientHost+":8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
|
|
|
|
if err != nil {
|
|
|
|
log.Warn(err.Error())
|
|
|
|
return
|
|
|
|
}
|
2019-03-20 03:14:07 +00:00
|
|
|
|
2020-05-25 03:35:44 +00:00
|
|
|
body, err := ioutil.ReadAll(resp.Body)
|
|
|
|
resp.Body.Close()
|
|
|
|
if err != nil {
|
|
|
|
log.Warn(err)
|
|
|
|
return
|
|
|
|
}
|
2019-03-20 03:14:07 +00:00
|
|
|
|
2020-05-25 03:35:44 +00:00
|
|
|
var res MsgCreate
|
|
|
|
err = json.Unmarshal([]byte(string(body)), &res)
|
|
|
|
if err != nil || res.Code != 0 {
|
|
|
|
log.Warn(res)
|
|
|
|
return
|
|
|
|
}
|
2020-06-22 12:51:29 +00:00
|
|
|
jm.jobStatus.tasks[jm.job.Tasks[index].Name] = TaskStatus{Id: res.Id, Node: jm.resources[index].ClientHost, HostName: jm.job.Tasks[i].Name}
|
2020-05-25 03:35:44 +00:00
|
|
|
}(i)
|
2020-05-02 14:35:31 +00:00
|
|
|
}
|
2020-05-25 03:35:44 +00:00
|
|
|
wg.Wait()
|
|
|
|
jm.isRunning = true
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
|
|
|
|
2019-03-04 09:19:55 +00:00
|
|
|
/* monitor job execution */
|
|
|
|
for {
|
2020-07-02 12:26:22 +00:00
|
|
|
//jm.status()
|
2020-07-02 13:07:42 +00:00
|
|
|
if !jm.isRunning || jm.killFlag {
|
2019-03-25 07:36:30 +00:00
|
|
|
break
|
|
|
|
}
|
2020-07-02 12:26:22 +00:00
|
|
|
time.Sleep(time.Second * 1)
|
2019-03-04 09:19:55 +00:00
|
|
|
}
|
2020-05-25 03:35:44 +00:00
|
|
|
|
2020-05-25 05:28:58 +00:00
|
|
|
/* make sure resources are released */
|
2020-06-29 15:31:01 +00:00
|
|
|
jm.returnResource(jm.status().Status)
|
|
|
|
|
|
|
|
/* feed data to optimizer */
|
2020-06-22 02:25:47 +00:00
|
|
|
var stats [][]TaskStatus
|
2020-06-22 13:06:39 +00:00
|
|
|
for _, vals := range jm.stats {
|
|
|
|
var stat []TaskStatus
|
|
|
|
for i, task := range jm.job.Tasks {
|
|
|
|
if task.IsPS {
|
|
|
|
stat = append(stat, vals[i])
|
|
|
|
}
|
2020-06-22 02:25:47 +00:00
|
|
|
}
|
2020-06-24 11:24:18 +00:00
|
|
|
if len(stat) > 0 {
|
|
|
|
stats = append(stats, stat)
|
|
|
|
}
|
2020-06-22 02:25:47 +00:00
|
|
|
}
|
2020-06-29 15:24:33 +00:00
|
|
|
InstanceOfOptimizer().FeedStats(jm.job, "PS", stats)
|
2020-06-22 02:25:47 +00:00
|
|
|
stats = [][]TaskStatus{}
|
2020-06-22 13:06:39 +00:00
|
|
|
for _, vals := range jm.stats {
|
|
|
|
var stat []TaskStatus
|
|
|
|
for i, task := range jm.job.Tasks {
|
|
|
|
if !task.IsPS {
|
|
|
|
stat = append(stat, vals[i])
|
|
|
|
}
|
2020-06-22 02:25:47 +00:00
|
|
|
}
|
2020-06-24 09:37:39 +00:00
|
|
|
if len(stat) > 0 {
|
|
|
|
stats = append(stats, stat)
|
|
|
|
}
|
2020-06-22 02:25:47 +00:00
|
|
|
}
|
2020-06-29 15:24:33 +00:00
|
|
|
InstanceOfOptimizer().FeedStats(jm.job, "Worker", stats)
|
2020-06-29 15:31:01 +00:00
|
|
|
|
2020-07-01 08:53:06 +00:00
|
|
|
if len(jm.job.Tasks) == 1 && !isShare && !isScheduleAhead && jm.job.Status == Finished {
|
2020-06-29 15:31:01 +00:00
|
|
|
InstanceOfOptimizer().FeedTime(jm.job, stats)
|
|
|
|
}
|
2020-05-25 03:35:44 +00:00
|
|
|
log.Info("JobMaster exited ", jm.job.Name)
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
|
|
|
|
2020-05-25 03:35:44 +00:00
|
|
|
/* release all resource */
|
|
|
|
func (jm *JobManager) returnResource(status []TaskStatus) {
|
|
|
|
jm.resourcesMu.Lock()
|
|
|
|
defer jm.resourcesMu.Unlock()
|
|
|
|
/* return resource */
|
|
|
|
for i := range jm.resources {
|
2020-05-27 12:39:54 +00:00
|
|
|
if jm.resources[i].ClientID == "_released_" {
|
|
|
|
continue
|
|
|
|
}
|
2020-05-25 03:35:44 +00:00
|
|
|
jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
|
2020-05-27 12:48:55 +00:00
|
|
|
log.Info("return resource again ", jm.resources[i].ClientID)
|
2020-05-28 09:02:08 +00:00
|
|
|
jm.resources[i].ClientID = "_released_"
|
2020-05-25 03:35:44 +00:00
|
|
|
|
|
|
|
for _, t := range jm.resources[i].Status {
|
|
|
|
InstanceOfResourcePool().detach(t.UUID, jm.job)
|
|
|
|
}
|
|
|
|
|
2020-06-14 13:12:22 +00:00
|
|
|
if !InstanceOfConfiguration().mock {
|
|
|
|
InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
|
|
|
|
}
|
2020-05-25 03:35:44 +00:00
|
|
|
|
|
|
|
/* remove exited containers */
|
|
|
|
//v := url.Values{}
|
|
|
|
//v.Set("id", res.Status[i].Id)
|
|
|
|
//
|
|
|
|
//_, err := doRequest("POST", "http://"+res.Status[i].Node+":8000/remove", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
|
|
|
|
//if err != nil {
|
|
|
|
// log.Warn(err.Error())
|
|
|
|
// continue
|
|
|
|
//}
|
|
|
|
}
|
2020-05-28 03:15:29 +00:00
|
|
|
if jm.network != "" {
|
|
|
|
InstanceOfResourcePool().releaseNetwork(jm.network)
|
|
|
|
jm.network = ""
|
|
|
|
}
|
2020-05-25 03:35:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* monitor all tasks */
|
|
|
|
func (jm *JobManager) checkStatus(status []TaskStatus) {
|
2020-05-23 16:47:40 +00:00
|
|
|
if !jm.isRunning {
|
2020-05-25 03:35:44 +00:00
|
|
|
return
|
2020-05-23 16:47:40 +00:00
|
|
|
}
|
2020-05-25 03:35:44 +00:00
|
|
|
flagRunning := false
|
2020-05-23 13:06:31 +00:00
|
|
|
onlyPS := true
|
2020-05-23 17:38:42 +00:00
|
|
|
for i := range status {
|
|
|
|
if status[i].Status == "ready" {
|
2020-05-23 13:06:31 +00:00
|
|
|
log.Debug(jm.job.Name, "-", i, " is ready to run")
|
2020-05-25 03:35:44 +00:00
|
|
|
flagRunning = true
|
2020-05-23 17:46:57 +00:00
|
|
|
if !jm.job.Tasks[i].IsPS {
|
|
|
|
onlyPS = false
|
|
|
|
}
|
2020-05-23 17:38:42 +00:00
|
|
|
} else if status[i].Status == "running" {
|
2020-05-23 13:06:31 +00:00
|
|
|
log.Debug(jm.job.Name, "-", i, " is running")
|
2020-05-25 03:35:44 +00:00
|
|
|
flagRunning = true
|
2020-05-23 13:06:31 +00:00
|
|
|
if !jm.job.Tasks[i].IsPS {
|
|
|
|
onlyPS = false
|
|
|
|
}
|
2020-05-23 17:38:42 +00:00
|
|
|
InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
|
2020-05-26 01:03:21 +00:00
|
|
|
} else if status[i].Status == "unknown" {
|
|
|
|
log.Warn(jm.job.Name, "-", i, " is unknown")
|
|
|
|
flagRunning = true
|
|
|
|
if !jm.job.Tasks[i].IsPS {
|
|
|
|
onlyPS = false
|
|
|
|
}
|
|
|
|
//InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
|
2020-05-23 13:06:31 +00:00
|
|
|
} else {
|
2020-05-31 10:55:07 +00:00
|
|
|
jm.resourcesMu.Lock()
|
2020-05-31 04:14:06 +00:00
|
|
|
if jm.resources[i].ClientID == "_released_" {
|
2020-05-31 10:55:07 +00:00
|
|
|
jm.resourcesMu.Unlock()
|
2020-05-31 04:14:06 +00:00
|
|
|
continue
|
|
|
|
}
|
2020-05-23 17:38:42 +00:00
|
|
|
log.Info(jm.job.Name, "-", i, " ", status[i].Status)
|
2020-05-25 03:35:44 +00:00
|
|
|
if exitCode, ok := status[i].State["ExitCode"].(float64); ok && exitCode != 0 && !jm.killFlag {
|
|
|
|
log.Warn(jm.job.Name+"-"+jm.job.Tasks[i].Name+" exited unexpected, exitCode=", exitCode)
|
|
|
|
jm.stop(false)
|
2020-05-25 05:28:58 +00:00
|
|
|
jm.killFlag = true
|
2020-05-25 03:35:44 +00:00
|
|
|
jm.scheduler.UpdateProgress(jm.job, Failed)
|
2020-06-29 15:24:33 +00:00
|
|
|
jm.job.Status = Failed
|
2020-06-09 15:57:52 +00:00
|
|
|
} else if !jm.killFlag {
|
2020-06-11 13:21:57 +00:00
|
|
|
log.Info("Some instance exited, close others")
|
2020-06-09 15:57:52 +00:00
|
|
|
jm.stop(false)
|
|
|
|
jm.killFlag = true
|
|
|
|
jm.scheduler.UpdateProgress(jm.job, Finished)
|
2020-06-29 15:24:33 +00:00
|
|
|
jm.job.Status = Finished
|
2020-05-23 13:06:31 +00:00
|
|
|
}
|
2020-05-27 12:39:54 +00:00
|
|
|
|
2020-05-28 02:59:20 +00:00
|
|
|
if jm.resources[i].ClientID != "_released_" {
|
|
|
|
jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
|
|
|
|
log.Info("return resource ", jm.resources[i].ClientID)
|
|
|
|
jm.resources[i].ClientID = "_released_"
|
2020-05-27 12:39:54 +00:00
|
|
|
|
2020-05-28 03:23:27 +00:00
|
|
|
for _, t := range jm.resources[i].Status {
|
|
|
|
InstanceOfResourcePool().detach(t.UUID, jm.job)
|
|
|
|
}
|
|
|
|
InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
|
|
|
|
}
|
2020-05-28 04:50:43 +00:00
|
|
|
jm.resourcesMu.Unlock()
|
2020-05-23 13:06:31 +00:00
|
|
|
}
|
|
|
|
}
|
2020-05-25 09:37:07 +00:00
|
|
|
if flagRunning && onlyPS && !jm.killFlag {
|
2020-05-23 13:06:31 +00:00
|
|
|
log.Info("Only PS is running, stop ", jm.job.Name)
|
2020-05-25 03:35:44 +00:00
|
|
|
jm.stop(false)
|
2020-05-25 09:37:07 +00:00
|
|
|
jm.killFlag = true
|
|
|
|
jm.scheduler.UpdateProgress(jm.job, Finished)
|
2020-06-29 15:24:33 +00:00
|
|
|
jm.job.Status = Finished
|
2020-05-25 05:28:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if !flagRunning && !jm.killFlag {
|
2020-05-25 03:35:44 +00:00
|
|
|
jm.scheduler.UpdateProgress(jm.job, Finished)
|
2020-06-29 15:24:33 +00:00
|
|
|
jm.job.Status = Finished
|
2020-05-25 05:28:58 +00:00
|
|
|
log.Info("finish job ", jm.job.Name)
|
2020-05-23 13:06:31 +00:00
|
|
|
}
|
2020-05-23 16:47:40 +00:00
|
|
|
|
2020-05-25 05:28:58 +00:00
|
|
|
if !flagRunning {
|
2020-05-23 16:47:40 +00:00
|
|
|
jm.isRunning = false
|
2020-05-25 03:35:44 +00:00
|
|
|
jm.returnResource(status)
|
2020-05-23 16:47:40 +00:00
|
|
|
}
|
2020-05-23 13:06:31 +00:00
|
|
|
}
|
|
|
|
|
2020-05-25 03:35:44 +00:00
|
|
|
/* fetch logs of task */
|
2019-03-20 03:14:07 +00:00
|
|
|
func (jm *JobManager) logs(taskName string) MsgLog {
|
|
|
|
spider := Spider{}
|
|
|
|
spider.Method = "GET"
|
2019-04-16 08:59:19 +00:00
|
|
|
spider.URL = "http://" + jm.jobStatus.tasks[taskName].Node + ":8000/logs?id=" + jm.jobStatus.tasks[taskName].Id
|
2019-03-20 03:14:07 +00:00
|
|
|
|
2019-08-01 06:32:27 +00:00
|
|
|
if _, ok := jm.jobStatus.tasks[taskName]; !ok {
|
|
|
|
return MsgLog{Code: -1, Error: "Task not exist"}
|
2019-08-01 06:26:05 +00:00
|
|
|
}
|
|
|
|
|
2019-03-20 03:14:07 +00:00
|
|
|
err := spider.do()
|
|
|
|
if err != nil {
|
|
|
|
return MsgLog{Code: 1, Error: err.Error()}
|
|
|
|
}
|
|
|
|
|
|
|
|
resp := spider.getResponse()
|
|
|
|
defer resp.Body.Close()
|
|
|
|
|
|
|
|
body, err := ioutil.ReadAll(resp.Body)
|
|
|
|
if err != nil {
|
2020-05-25 03:35:44 +00:00
|
|
|
return MsgLog{Code: 2, Error: err.Error()}
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
var res MsgLog
|
|
|
|
err = json.Unmarshal([]byte(string(body)), &res)
|
|
|
|
if err != nil {
|
|
|
|
log.Println(err)
|
2020-05-25 03:35:44 +00:00
|
|
|
return MsgLog{Code: 3, Error: "Unknown"}
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
|
|
|
return res
|
|
|
|
}
|
|
|
|
|
2020-05-25 03:35:44 +00:00
|
|
|
/* fetch job tasks status */
|
2019-03-20 03:14:07 +00:00
|
|
|
/* fetch job tasks status */
// status polls every task's agent node for its container status and
// returns one TaskStatus per task, index-aligned with jm.job.Tasks.
// Unreachable/unparsable agents yield Status "unknown" with a synthetic
// negative ExitCode; a non-zero agent reply code yields "notexist".
// While the job is running, each snapshot is also recorded in jm.stats
// and forwarded asynchronously to checkStatus.
func (jm *JobManager) status() MsgJobStatus {
	var tasksStatus []TaskStatus
	for range jm.job.Tasks { //append would cause uncertain order
		// Pre-fill with zero values so each poll result can be written
		// at its task's index.
		tasksStatus = append(tasksStatus, TaskStatus{})
	}

	for i, task := range jm.job.Tasks {
		taskStatus := jm.jobStatus.tasks[task.Name]

		/* still in launching phase */
		if len(taskStatus.Node) == 0 {
			continue
		}

		spider := Spider{}
		spider.Method = "GET"
		spider.URL = "http://" + taskStatus.Node + ":8000/status?id=" + taskStatus.Id

		err := spider.do()
		if err != nil {
			// Agent unreachable: synthesize "unknown" with ExitCode -1
			// (float64 to match JSON-decoded State values).
			log.Warn(err)
			tasksStatus[i] = TaskStatus{Status: "unknown", State: map[string]interface{}{"ExitCode": float64(-1)}}
			continue
		}

		resp := spider.getResponse()
		body, err := ioutil.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			tasksStatus[i] = TaskStatus{Status: "unknown", State: map[string]interface{}{"ExitCode": float64(-1)}}
			continue
		}

		var res MsgTaskStatus
		err = json.Unmarshal([]byte(string(body)), &res)
		if err != nil {
			log.Warn(err)
			tasksStatus[i] = TaskStatus{Status: "unknown", State: map[string]interface{}{"ExitCode": float64(-1)}}
			continue
		}
		// Agent-level code 2: transient error; report "unknown" (-2).
		if res.Code == 2 {
			tasksStatus[i] = TaskStatus{Status: "unknown", State: map[string]interface{}{"ExitCode": float64(-2)}}
			log.Warn(res.Error)
			continue
		}
		// Any other non-zero code: the container no longer exists.
		if res.Code != 0 {
			tasksStatus[i] = TaskStatus{Status: "notexist", State: map[string]interface{}{"ExitCode": float64(res.Code)}}
			continue
		}
		res.Status.Node = taskStatus.Node
		tasksStatus[i] = res.Status
	}
	// Stamp every entry with the poll time.
	for i := range jm.job.Tasks {
		tasksStatus[i].TimeStamp = time.Now().Unix()
	}

	if jm.isRunning {
		// Evaluate state transitions off the polling path.
		go func() {
			jm.checkStatus(tasksStatus)
		}()
		// Keep the snapshot for the optimizer (see start()).
		jm.stats = append(jm.stats, tasksStatus)
	}
	return MsgJobStatus{Status: tasksStatus}
}
|
2019-04-18 09:25:37 +00:00
|
|
|
|
2020-05-25 03:35:44 +00:00
|
|
|
/* force stop all containers */
|
|
|
|
func (jm *JobManager) stop(force bool) MsgStop {
|
|
|
|
for _, taskStatus := range jm.jobStatus.tasks {
|
2020-05-25 05:28:58 +00:00
|
|
|
/* stop at background */
|
|
|
|
go func(task TaskStatus) {
|
|
|
|
v := url.Values{}
|
|
|
|
v.Set("id", task.Id)
|
|
|
|
|
2020-05-28 06:57:30 +00:00
|
|
|
resp, err := doRequest("POST", "http://"+task.Node+":8000/stop", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
|
2020-05-25 05:28:58 +00:00
|
|
|
if err != nil {
|
|
|
|
log.Warn(err.Error())
|
|
|
|
}
|
2020-05-28 06:57:30 +00:00
|
|
|
body, err := ioutil.ReadAll(resp.Body)
|
|
|
|
resp.Body.Close()
|
|
|
|
if err != nil {
|
|
|
|
log.Warn(err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
var res MsgStop
|
|
|
|
err = json.Unmarshal([]byte(string(body)), &res)
|
|
|
|
if err != nil || res.Code != 0 {
|
|
|
|
log.Warn(res)
|
|
|
|
return
|
|
|
|
}
|
2020-05-28 12:16:24 +00:00
|
|
|
if res.Code != 0 {
|
|
|
|
log.Warn(res.Error)
|
|
|
|
}
|
2020-06-24 08:47:21 +00:00
|
|
|
log.Info(jm.job.Name, ":", task.HostName, " is killed:", task.Id)
|
2020-05-25 05:28:58 +00:00
|
|
|
}(taskStatus)
|
2020-05-25 03:35:44 +00:00
|
|
|
}
|
2019-04-18 09:25:37 +00:00
|
|
|
|
2020-06-03 09:26:11 +00:00
|
|
|
go func() {
|
|
|
|
if force {
|
|
|
|
jm.killFlag = true
|
|
|
|
jm.scheduler.UpdateProgress(jm.job, Stopped)
|
2020-06-29 15:24:33 +00:00
|
|
|
jm.job.Status = Stopped
|
2020-06-03 09:26:11 +00:00
|
|
|
log.Info("kill job, ", jm.job.Name)
|
|
|
|
}
|
|
|
|
}()
|
2019-04-18 09:25:37 +00:00
|
|
|
return MsgStop{Code: 0}
|
|
|
|
}
|