1
0
mirror of https://github.com/newnius/YAO-scheduler.git synced 2025-06-08 06:41:56 +00:00
YAO-scheduler/src/job_manager.go

243 lines
5.7 KiB
Go
Raw Normal View History

2019-03-04 09:19:55 +00:00
package main
import (
"time"
2019-03-20 03:14:07 +00:00
"net/url"
"strings"
"io/ioutil"
"encoding/json"
2019-06-05 09:09:22 +00:00
"strconv"
2019-07-10 12:40:43 +00:00
log "github.com/sirupsen/logrus"
2019-03-04 09:19:55 +00:00
)
type JobManager struct {
2019-08-01 07:03:56 +00:00
scheduler Scheduler
job Job
jobStatus JobStatus
resources []NodeStatus
killedFlag bool
2019-03-04 09:19:55 +00:00
}
2019-03-20 03:14:07 +00:00
func (jm *JobManager) start() {
2020-04-26 10:48:49 +00:00
log.Info("start job ", jm.job.Name, time.Now())
2019-03-25 07:36:30 +00:00
jm.jobStatus = JobStatus{Name: jm.job.Name, tasks: map[string]TaskStatus{}}
2019-03-20 03:14:07 +00:00
2019-07-10 12:40:43 +00:00
network := jm.scheduler.AcquireNetwork()
2019-06-05 09:09:22 +00:00
2020-03-29 13:12:44 +00:00
InstanceJobHistoryLogger().submitJob(jm.job)
2019-03-20 03:14:07 +00:00
/* request for resources */
for i := range jm.job.Tasks {
2019-04-16 08:59:19 +00:00
var resource NodeStatus
2019-03-20 03:14:07 +00:00
for {
2019-08-01 07:03:56 +00:00
if jm.killedFlag {
break
}
2019-08-01 05:42:53 +00:00
resource = jm.scheduler.AcquireResource(jm.job, jm.job.Tasks[i])
2019-03-20 03:14:07 +00:00
if len(resource.Status) > 0 {
break
}
2019-08-01 06:16:44 +00:00
time.Sleep(time.Second * 1)
2019-03-20 03:14:07 +00:00
}
2020-04-30 06:11:18 +00:00
log.Info("Receive resource", resource)
2019-03-20 03:14:07 +00:00
jm.resources = append(jm.resources, resource)
2020-04-11 03:38:04 +00:00
for _, t := range resource.Status {
jm.scheduler.Attach(t.UUID, jm.job.Name)
}
2019-03-20 03:14:07 +00:00
}
2019-07-10 12:40:43 +00:00
jm.scheduler.UpdateProgress(jm.job.Name, Running)
2019-03-04 09:19:55 +00:00
2020-04-26 10:48:49 +00:00
log.Info("ready to run job ", jm.job.Name, time.Now())
2020-04-12 02:44:32 +00:00
2019-03-04 09:19:55 +00:00
/* bring up containers */
2019-03-20 03:14:07 +00:00
for i := range jm.job.Tasks {
2019-08-01 07:03:56 +00:00
if jm.killedFlag {
break
}
2019-04-12 09:21:09 +00:00
var GPUs []string
for _, GPU := range jm.resources[i].Status {
GPUs = append(GPUs, GPU.UUID)
}
2019-03-20 03:14:07 +00:00
v := url.Values{}
2019-04-12 09:21:09 +00:00
v.Set("image", jm.job.Tasks[i].Image)
2019-03-20 03:14:07 +00:00
v.Set("cmd", jm.job.Tasks[i].Cmd)
2019-04-12 09:21:09 +00:00
v.Set("name", jm.job.Tasks[i].Name)
v.Set("workspace", jm.job.Workspace)
v.Set("gpus", strings.Join(GPUs, ","))
2019-06-05 09:09:22 +00:00
v.Set("mem_limit", strconv.Itoa(jm.job.Tasks[i].Memory)+"m")
v.Set("cpu_limit", strconv.Itoa(jm.job.Tasks[i].NumberCPU))
v.Set("network", network)
2020-04-30 05:08:08 +00:00
v.Set("should_wait", "1")
2019-04-12 09:21:09 +00:00
2019-04-16 08:59:19 +00:00
resp, err := doRequest("POST", "http://"+jm.resources[i].ClientHost+":8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
2019-03-20 03:14:07 +00:00
if err != nil {
2019-07-10 12:40:43 +00:00
log.Warn(err.Error())
2019-11-11 07:33:04 +00:00
continue
2019-03-20 03:14:07 +00:00
}
body, err := ioutil.ReadAll(resp.Body)
2019-11-11 07:33:04 +00:00
resp.Body.Close()
2019-03-20 03:14:07 +00:00
if err != nil {
2019-07-10 12:40:43 +00:00
log.Warn(err)
2019-11-11 07:33:04 +00:00
continue
2019-03-20 03:14:07 +00:00
}
var res MsgCreate
err = json.Unmarshal([]byte(string(body)), &res)
if err != nil {
2019-07-10 12:40:43 +00:00
log.Warn(err)
2019-11-11 07:33:04 +00:00
continue
2019-03-20 03:14:07 +00:00
}
2019-04-16 08:59:19 +00:00
jm.jobStatus.tasks[jm.job.Tasks[i].Name] = TaskStatus{Id: res.Id, Node: jm.resources[i].ClientHost}
2019-03-20 03:14:07 +00:00
}
2019-03-04 09:19:55 +00:00
/* monitor job execution */
for {
2019-03-20 03:14:07 +00:00
res := jm.status()
2019-03-25 07:36:30 +00:00
flag := false
2020-04-30 04:13:21 +00:00
onlyPS := true
2019-03-20 03:14:07 +00:00
for i := range res.Status {
2020-04-10 10:55:51 +00:00
if res.Status[i].Status == "ready" {
2020-04-12 03:47:20 +00:00
log.Debug(jm.job.Name, "-", i, " is ready to run")
2020-04-10 10:55:51 +00:00
flag = true
2020-04-30 06:35:58 +00:00
if !jm.job.Tasks[i].IsPS {
onlyPS = false
}
2020-04-10 10:55:51 +00:00
} else if res.Status[i].Status == "running" {
2020-04-12 03:47:20 +00:00
log.Debug(jm.job.Name, "-", i, " is running")
2019-03-25 07:36:30 +00:00
flag = true
2020-04-30 04:13:21 +00:00
if !jm.job.Tasks[i].IsPS {
onlyPS = false
}
2020-04-10 10:55:51 +00:00
InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, res.Status[i])
2019-03-20 03:14:07 +00:00
} else {
2020-04-12 03:47:20 +00:00
log.Info(jm.job.Name, "-", i, " ", res.Status[i].Status)
2019-03-20 03:14:07 +00:00
/* save logs etc. */
2019-12-04 11:07:38 +00:00
/* remove exited containers */
//v := url.Values{}
//v.Set("id", res.Status[i].Id)
//
//_, err := doRequest("POST", "http://"+res.Status[i].Node+":8000/remove", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
//if err != nil {
// log.Warn(err.Error())
// continue
//}
2019-03-20 03:14:07 +00:00
/* return resource */
2019-08-01 05:42:53 +00:00
jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
2020-04-13 11:48:47 +00:00
log.Info("return resource ", jm.resources[i].ClientID)
2020-03-29 13:12:44 +00:00
2020-04-11 03:38:04 +00:00
for _, t := range jm.resources[i].Status {
2020-04-12 03:47:20 +00:00
jm.scheduler.Detach(t.UUID, jm.job.Name)
2020-04-11 03:38:04 +00:00
}
2020-03-29 13:12:44 +00:00
InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, res.Status[i])
2019-03-20 03:14:07 +00:00
}
}
2020-04-30 04:13:21 +00:00
if onlyPS {
jm.stop()
2020-04-30 05:08:08 +00:00
log.Info("Only PS is running, stop", jm.job.Name)
2020-04-30 06:04:40 +00:00
jm.killedFlag = false
2020-04-30 04:13:21 +00:00
break
}
2019-03-25 07:36:30 +00:00
if !flag {
break
}
2019-03-20 03:14:07 +00:00
time.Sleep(time.Second * 10)
2019-03-04 09:19:55 +00:00
}
2019-07-10 12:40:43 +00:00
jm.scheduler.ReleaseNetwork(network)
2019-06-05 09:09:22 +00:00
2019-12-04 11:07:38 +00:00
if !jm.killedFlag {
jm.scheduler.UpdateProgress(jm.job.Name, Finished)
log.Info("finish job", jm.job.Name)
}
2019-03-20 03:14:07 +00:00
}
func (jm *JobManager) logs(taskName string) MsgLog {
spider := Spider{}
spider.Method = "GET"
2019-04-16 08:59:19 +00:00
spider.URL = "http://" + jm.jobStatus.tasks[taskName].Node + ":8000/logs?id=" + jm.jobStatus.tasks[taskName].Id
2019-03-20 03:14:07 +00:00
2019-08-01 06:32:27 +00:00
if _, ok := jm.jobStatus.tasks[taskName]; !ok {
return MsgLog{Code: -1, Error: "Task not exist"}
2019-08-01 06:26:05 +00:00
}
2019-03-20 03:14:07 +00:00
err := spider.do()
if err != nil {
return MsgLog{Code: 1, Error: err.Error()}
}
resp := spider.getResponse()
defer resp.Body.Close()
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return MsgLog{Code: 1, Error: err.Error()}
}
var res MsgLog
err = json.Unmarshal([]byte(string(body)), &res)
if err != nil {
log.Println(err)
return MsgLog{Code: 1, Error: "Unknown"}
}
return res
}
func (jm *JobManager) status() MsgJobStatus {
var tasksStatus []TaskStatus
for _, taskStatus := range jm.jobStatus.tasks {
spider := Spider{}
spider.Method = "GET"
2019-04-16 08:59:19 +00:00
spider.URL = "http://" + taskStatus.Node + ":8000/status?id=" + taskStatus.Id
2019-03-20 03:14:07 +00:00
err := spider.do()
if err != nil {
continue
}
resp := spider.getResponse()
body, err := ioutil.ReadAll(resp.Body)
2019-11-11 07:33:04 +00:00
resp.Body.Close()
2019-03-20 03:14:07 +00:00
if err != nil {
continue
}
var res MsgTaskStatus
err = json.Unmarshal([]byte(string(body)), &res)
if err != nil {
continue
}
2020-03-29 13:12:44 +00:00
res.Status.Node = taskStatus.Node
2019-03-20 03:14:07 +00:00
tasksStatus = append(tasksStatus, res.Status)
}
2019-03-04 09:19:55 +00:00
2019-03-20 03:14:07 +00:00
return MsgJobStatus{Status: tasksStatus}
2019-03-04 09:19:55 +00:00
}
2019-04-18 09:25:37 +00:00
func (jm *JobManager) stop() MsgStop {
2019-12-04 11:07:38 +00:00
jm.killedFlag = true
go func() { /* kill at background */
for _, taskStatus := range jm.jobStatus.tasks {
v := url.Values{}
v.Set("id", taskStatus.Id)
_, err := doRequest("POST", "http://"+taskStatus.Node+":8000/stop", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
if err != nil {
log.Warn(err.Error())
continue
}
2019-11-11 07:33:04 +00:00
}
2019-12-04 11:07:38 +00:00
}()
2019-04-18 09:25:37 +00:00
2019-07-10 12:40:43 +00:00
jm.scheduler.UpdateProgress(jm.job.Name, Stopped)
2020-04-30 05:08:08 +00:00
log.Info("kill job, ", jm.job.Name)
2019-04-18 09:25:37 +00:00
return MsgStop{Code: 0}
}