2019-03-04 09:19:55 +00:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"time"
|
2019-03-20 03:14:07 +00:00
|
|
|
"net/url"
|
|
|
|
"strings"
|
|
|
|
"io/ioutil"
|
|
|
|
"encoding/json"
|
2019-06-05 09:09:22 +00:00
|
|
|
"strconv"
|
2019-07-10 12:40:43 +00:00
|
|
|
log "github.com/sirupsen/logrus"
|
2019-03-04 09:19:55 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
type JobManager struct {
|
2019-08-01 07:03:56 +00:00
|
|
|
scheduler Scheduler
|
|
|
|
job Job
|
|
|
|
jobStatus JobStatus
|
|
|
|
resources []NodeStatus
|
|
|
|
killedFlag bool
|
2019-03-04 09:19:55 +00:00
|
|
|
}
|
|
|
|
|
2019-03-20 03:14:07 +00:00
|
|
|
func (jm *JobManager) start() {
|
2020-04-26 10:48:49 +00:00
|
|
|
log.Info("start job ", jm.job.Name, time.Now())
|
2019-03-25 07:36:30 +00:00
|
|
|
jm.jobStatus = JobStatus{Name: jm.job.Name, tasks: map[string]TaskStatus{}}
|
2019-03-20 03:14:07 +00:00
|
|
|
|
2019-07-10 12:40:43 +00:00
|
|
|
network := jm.scheduler.AcquireNetwork()
|
2019-06-05 09:09:22 +00:00
|
|
|
|
2020-03-29 13:12:44 +00:00
|
|
|
InstanceJobHistoryLogger().submitJob(jm.job)
|
|
|
|
|
2019-03-20 03:14:07 +00:00
|
|
|
/* request for resources */
|
2020-05-01 06:54:29 +00:00
|
|
|
for range jm.job.Tasks { //append would cause uncertain order
|
|
|
|
jm.resources = append(jm.resources, NodeStatus{ClientID: "null"})
|
2020-05-01 06:06:12 +00:00
|
|
|
}
|
|
|
|
|
2020-05-01 06:54:29 +00:00
|
|
|
start := time.Now().Unix()
|
|
|
|
for i := 0; i < len(jm.job.Tasks); i++ {
|
2019-04-16 08:59:19 +00:00
|
|
|
var resource NodeStatus
|
2019-03-20 03:14:07 +00:00
|
|
|
for {
|
2019-08-01 07:03:56 +00:00
|
|
|
if jm.killedFlag {
|
|
|
|
break
|
|
|
|
}
|
2020-05-01 06:54:29 +00:00
|
|
|
|
|
|
|
var tmp []NodeStatus
|
|
|
|
for _, t := range jm.resources {
|
|
|
|
if t.ClientID != "null" {
|
|
|
|
tmp = append(tmp, t)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
resource = jm.scheduler.AcquireResource(jm.job, jm.job.Tasks[i], tmp)
|
2019-03-20 03:14:07 +00:00
|
|
|
if len(resource.Status) > 0 {
|
|
|
|
break
|
|
|
|
}
|
2020-05-01 06:54:29 +00:00
|
|
|
|
|
|
|
if time.Now().Unix()-start > 30 {
|
|
|
|
log.Info("Wait too long, return all resource and retry")
|
|
|
|
for _, tt := range jm.resources {
|
|
|
|
if tt.ClientID != "null" {
|
|
|
|
jm.scheduler.ReleaseResource(jm.job, tt)
|
|
|
|
log.Info("return resource ", tt.ClientID)
|
|
|
|
jm.resources[i].ClientID = "null"
|
|
|
|
for _, t := range tt.Status {
|
|
|
|
jm.scheduler.Detach(t.UUID, jm.job.Name)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
i = -1
|
|
|
|
start = time.Now().Unix()
|
|
|
|
}
|
2020-05-02 13:42:27 +00:00
|
|
|
if i == -1 {
|
|
|
|
break
|
|
|
|
}
|
2019-08-01 06:16:44 +00:00
|
|
|
time.Sleep(time.Second * 1)
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
2020-05-01 06:54:29 +00:00
|
|
|
if len(resource.Status) > 0 {
|
|
|
|
log.Info("Receive resource", resource)
|
|
|
|
jm.resources[i] = resource
|
2020-04-11 03:38:04 +00:00
|
|
|
|
2020-05-01 06:54:29 +00:00
|
|
|
for _, t := range resource.Status {
|
|
|
|
jm.scheduler.Attach(t.UUID, jm.job.Name)
|
|
|
|
}
|
2020-04-11 03:38:04 +00:00
|
|
|
}
|
|
|
|
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
2020-05-02 16:16:28 +00:00
|
|
|
jm.scheduler.UpdateProgress(jm.job, Running)
|
2019-03-04 09:19:55 +00:00
|
|
|
|
2020-04-26 10:48:49 +00:00
|
|
|
log.Info("ready to run job ", jm.job.Name, time.Now())
|
2020-04-12 02:44:32 +00:00
|
|
|
|
2019-03-04 09:19:55 +00:00
|
|
|
/* bring up containers */
|
2019-03-20 03:14:07 +00:00
|
|
|
for i := range jm.job.Tasks {
|
2019-08-01 07:03:56 +00:00
|
|
|
if jm.killedFlag {
|
|
|
|
break
|
|
|
|
}
|
2019-04-12 09:21:09 +00:00
|
|
|
var GPUs []string
|
|
|
|
for _, GPU := range jm.resources[i].Status {
|
|
|
|
GPUs = append(GPUs, GPU.UUID)
|
|
|
|
}
|
|
|
|
|
2020-05-02 14:55:49 +00:00
|
|
|
for attempt := 0; attempt < 3; attempt++ {
|
2020-05-02 14:55:33 +00:00
|
|
|
if attempt == 2 { //failed more than once
|
|
|
|
//for {
|
|
|
|
// resource := jm.scheduler.AcquireResource(jm.job, jm.job.Tasks[i], jm.resources)
|
|
|
|
// if len(resource.Status) > 0 {
|
|
|
|
// break
|
|
|
|
// }
|
|
|
|
time.Sleep(time.Second * 1)
|
|
|
|
//}
|
|
|
|
}
|
|
|
|
|
2020-05-02 14:51:13 +00:00
|
|
|
v := url.Values{}
|
|
|
|
v.Set("image", jm.job.Tasks[i].Image)
|
|
|
|
v.Set("cmd", jm.job.Tasks[i].Cmd)
|
|
|
|
v.Set("name", jm.job.Tasks[i].Name)
|
|
|
|
v.Set("workspace", jm.job.Workspace)
|
|
|
|
v.Set("gpus", strings.Join(GPUs, ","))
|
|
|
|
v.Set("mem_limit", strconv.Itoa(jm.job.Tasks[i].Memory)+"m")
|
|
|
|
v.Set("cpu_limit", strconv.Itoa(jm.job.Tasks[i].NumberCPU))
|
|
|
|
v.Set("network", network)
|
|
|
|
v.Set("should_wait", "1")
|
|
|
|
|
|
|
|
resp, err := doRequest("POST", "http://"+jm.resources[i].ClientHost+":8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
|
|
|
|
if err != nil {
|
|
|
|
log.Warn(err.Error())
|
|
|
|
continue
|
|
|
|
}
|
2019-03-20 03:14:07 +00:00
|
|
|
|
2020-05-02 14:51:13 +00:00
|
|
|
body, err := ioutil.ReadAll(resp.Body)
|
|
|
|
resp.Body.Close()
|
|
|
|
if err != nil {
|
|
|
|
log.Warn(err)
|
|
|
|
continue
|
|
|
|
}
|
2019-03-20 03:14:07 +00:00
|
|
|
|
2020-05-02 14:51:13 +00:00
|
|
|
var res MsgCreate
|
|
|
|
err = json.Unmarshal([]byte(string(body)), &res)
|
|
|
|
if err != nil {
|
|
|
|
log.Warn(err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if res.Code != 0 {
|
|
|
|
log.Warn(res)
|
|
|
|
}
|
2020-05-02 14:52:01 +00:00
|
|
|
if res.Code == 0 {
|
|
|
|
jm.jobStatus.tasks[jm.job.Tasks[i].Name] = TaskStatus{Id: res.Id, Node: jm.resources[i].ClientHost}
|
|
|
|
break
|
|
|
|
}
|
2020-05-02 14:35:31 +00:00
|
|
|
}
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
|
|
|
|
2019-03-04 09:19:55 +00:00
|
|
|
/* monitor job execution */
|
|
|
|
for {
|
2019-03-20 03:14:07 +00:00
|
|
|
res := jm.status()
|
2019-03-25 07:36:30 +00:00
|
|
|
flag := false
|
2020-04-30 04:13:21 +00:00
|
|
|
onlyPS := true
|
2019-03-20 03:14:07 +00:00
|
|
|
for i := range res.Status {
|
2020-04-10 10:55:51 +00:00
|
|
|
if res.Status[i].Status == "ready" {
|
2020-04-12 03:47:20 +00:00
|
|
|
log.Debug(jm.job.Name, "-", i, " is ready to run")
|
2020-04-10 10:55:51 +00:00
|
|
|
flag = true
|
2020-04-30 06:35:58 +00:00
|
|
|
if !jm.job.Tasks[i].IsPS {
|
|
|
|
onlyPS = false
|
|
|
|
}
|
2020-04-10 10:55:51 +00:00
|
|
|
} else if res.Status[i].Status == "running" {
|
2020-04-12 03:47:20 +00:00
|
|
|
log.Debug(jm.job.Name, "-", i, " is running")
|
2019-03-25 07:36:30 +00:00
|
|
|
flag = true
|
2020-04-30 04:13:21 +00:00
|
|
|
if !jm.job.Tasks[i].IsPS {
|
|
|
|
onlyPS = false
|
|
|
|
}
|
2020-04-10 10:55:51 +00:00
|
|
|
InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, res.Status[i])
|
2019-03-20 03:14:07 +00:00
|
|
|
} else {
|
2020-04-12 03:47:20 +00:00
|
|
|
log.Info(jm.job.Name, "-", i, " ", res.Status[i].Status)
|
2019-03-20 03:14:07 +00:00
|
|
|
/* save logs etc. */
|
|
|
|
|
2019-12-04 11:07:38 +00:00
|
|
|
/* remove exited containers */
|
|
|
|
//v := url.Values{}
|
|
|
|
//v.Set("id", res.Status[i].Id)
|
|
|
|
//
|
|
|
|
//_, err := doRequest("POST", "http://"+res.Status[i].Node+":8000/remove", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
|
|
|
|
//if err != nil {
|
|
|
|
// log.Warn(err.Error())
|
|
|
|
// continue
|
|
|
|
//}
|
|
|
|
|
2019-03-20 03:14:07 +00:00
|
|
|
/* return resource */
|
2020-05-01 05:01:53 +00:00
|
|
|
if jm.resources[i].ClientID != "null" {
|
|
|
|
jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
|
|
|
|
log.Info("return resource ", jm.resources[i].ClientID)
|
|
|
|
jm.resources[i].ClientID = "null"
|
2020-03-29 13:12:44 +00:00
|
|
|
|
2020-05-01 05:01:53 +00:00
|
|
|
for _, t := range jm.resources[i].Status {
|
|
|
|
jm.scheduler.Detach(t.UUID, jm.job.Name)
|
|
|
|
}
|
2020-04-11 03:38:04 +00:00
|
|
|
|
2020-05-01 05:01:53 +00:00
|
|
|
InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, res.Status[i])
|
|
|
|
}
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
|
|
|
}
|
2020-04-30 07:50:05 +00:00
|
|
|
if flag && onlyPS {
|
2020-04-30 04:13:21 +00:00
|
|
|
jm.stop()
|
2020-05-01 06:24:49 +00:00
|
|
|
log.Info("Only PS is running, stop ", jm.job.Name)
|
2020-04-30 06:04:40 +00:00
|
|
|
jm.killedFlag = false
|
2020-04-30 04:13:21 +00:00
|
|
|
}
|
2019-03-25 07:36:30 +00:00
|
|
|
if !flag {
|
|
|
|
break
|
|
|
|
}
|
2019-03-20 03:14:07 +00:00
|
|
|
time.Sleep(time.Second * 10)
|
2019-03-04 09:19:55 +00:00
|
|
|
}
|
|
|
|
|
2019-07-10 12:40:43 +00:00
|
|
|
jm.scheduler.ReleaseNetwork(network)
|
2019-06-05 09:09:22 +00:00
|
|
|
|
2019-12-04 11:07:38 +00:00
|
|
|
if !jm.killedFlag {
|
2020-05-02 16:16:28 +00:00
|
|
|
jm.scheduler.UpdateProgress(jm.job, Finished)
|
2020-05-01 06:24:49 +00:00
|
|
|
log.Info("finish job ", jm.job.Name)
|
2019-12-04 11:07:38 +00:00
|
|
|
}
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (jm *JobManager) logs(taskName string) MsgLog {
|
|
|
|
spider := Spider{}
|
|
|
|
spider.Method = "GET"
|
2019-04-16 08:59:19 +00:00
|
|
|
spider.URL = "http://" + jm.jobStatus.tasks[taskName].Node + ":8000/logs?id=" + jm.jobStatus.tasks[taskName].Id
|
2019-03-20 03:14:07 +00:00
|
|
|
|
2019-08-01 06:32:27 +00:00
|
|
|
if _, ok := jm.jobStatus.tasks[taskName]; !ok {
|
|
|
|
return MsgLog{Code: -1, Error: "Task not exist"}
|
2019-08-01 06:26:05 +00:00
|
|
|
}
|
|
|
|
|
2019-03-20 03:14:07 +00:00
|
|
|
err := spider.do()
|
|
|
|
if err != nil {
|
|
|
|
return MsgLog{Code: 1, Error: err.Error()}
|
|
|
|
}
|
|
|
|
|
|
|
|
resp := spider.getResponse()
|
|
|
|
defer resp.Body.Close()
|
|
|
|
|
|
|
|
body, err := ioutil.ReadAll(resp.Body)
|
|
|
|
if err != nil {
|
|
|
|
return MsgLog{Code: 1, Error: err.Error()}
|
|
|
|
}
|
|
|
|
|
|
|
|
var res MsgLog
|
|
|
|
err = json.Unmarshal([]byte(string(body)), &res)
|
|
|
|
if err != nil {
|
|
|
|
log.Println(err)
|
|
|
|
return MsgLog{Code: 1, Error: "Unknown"}
|
|
|
|
}
|
|
|
|
return res
|
|
|
|
}
|
|
|
|
|
|
|
|
func (jm *JobManager) status() MsgJobStatus {
|
|
|
|
var tasksStatus []TaskStatus
|
2020-05-01 06:06:12 +00:00
|
|
|
for range jm.jobStatus.tasks {
|
|
|
|
tasksStatus = append(tasksStatus, TaskStatus{})
|
|
|
|
}
|
|
|
|
|
|
|
|
for i, task := range jm.job.Tasks {
|
|
|
|
taskStatus := jm.jobStatus.tasks[task.Name]
|
2019-03-20 03:14:07 +00:00
|
|
|
spider := Spider{}
|
|
|
|
spider.Method = "GET"
|
2019-04-16 08:59:19 +00:00
|
|
|
spider.URL = "http://" + taskStatus.Node + ":8000/status?id=" + taskStatus.Id
|
2019-03-20 03:14:07 +00:00
|
|
|
|
|
|
|
err := spider.do()
|
|
|
|
if err != nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
resp := spider.getResponse()
|
|
|
|
body, err := ioutil.ReadAll(resp.Body)
|
2019-11-11 07:33:04 +00:00
|
|
|
resp.Body.Close()
|
2019-03-20 03:14:07 +00:00
|
|
|
if err != nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
var res MsgTaskStatus
|
|
|
|
err = json.Unmarshal([]byte(string(body)), &res)
|
|
|
|
if err != nil {
|
|
|
|
continue
|
|
|
|
}
|
2020-03-29 13:12:44 +00:00
|
|
|
res.Status.Node = taskStatus.Node
|
2020-05-01 06:06:12 +00:00
|
|
|
tasksStatus[i] = res.Status
|
2019-03-20 03:14:07 +00:00
|
|
|
}
|
2019-03-04 09:19:55 +00:00
|
|
|
|
2019-03-20 03:14:07 +00:00
|
|
|
return MsgJobStatus{Status: tasksStatus}
|
2019-03-04 09:19:55 +00:00
|
|
|
}
|
2019-04-18 09:25:37 +00:00
|
|
|
|
|
|
|
func (jm *JobManager) stop() MsgStop {
|
2019-12-04 11:07:38 +00:00
|
|
|
jm.killedFlag = true
|
|
|
|
go func() { /* kill at background */
|
|
|
|
for _, taskStatus := range jm.jobStatus.tasks {
|
|
|
|
v := url.Values{}
|
|
|
|
v.Set("id", taskStatus.Id)
|
|
|
|
|
|
|
|
_, err := doRequest("POST", "http://"+taskStatus.Node+":8000/stop", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
|
|
|
|
if err != nil {
|
|
|
|
log.Warn(err.Error())
|
|
|
|
continue
|
|
|
|
}
|
2019-11-11 07:33:04 +00:00
|
|
|
}
|
2019-12-04 11:07:38 +00:00
|
|
|
}()
|
2019-04-18 09:25:37 +00:00
|
|
|
|
2020-05-02 16:16:28 +00:00
|
|
|
jm.scheduler.UpdateProgress(jm.job, Stopped)
|
2020-04-30 05:08:08 +00:00
|
|
|
log.Info("kill job, ", jm.job.Name)
|
2019-04-18 09:25:37 +00:00
|
|
|
return MsgStop{Code: 0}
|
|
|
|
}
|