mirror of https://github.com/newnius/YAO-scheduler.git

Commit: refactor
@@ -52,70 +52,72 @@ func InstanceOfConfiguration() *Configuration {
             EnablePreScheduleRatio: 1.5,
             PreScheduleExtraTime:   15,
         }
 
-        /* override conf value from env */
-        value := os.Getenv("KafkaBrokers")
-        if len(value) != 0 {
-            configurationInstance.KafkaBrokers = strings.Split(value, ",")
-        }
-        value = os.Getenv("KafkaTopic")
-        if len(value) != 0 {
-            configurationInstance.KafkaTopic = value
-        }
-        value = os.Getenv("SchedulerPolicy")
-        if len(value) != 0 {
-            configurationInstance.SchedulerPolicy = value
-        }
-        value = os.Getenv("ListenAddr")
-        if len(value) != 0 {
-            configurationInstance.ListenAddr = value
-        }
-        value = os.Getenv("HDFSAddress")
-        if len(value) != 0 {
-            configurationInstance.HDFSAddress = value
-        }
-        value = os.Getenv("HDFSBaseDir")
-        if len(value) != 0 {
-            configurationInstance.HDFSBaseDir = value
-        }
-        value = os.Getenv("DFSBaseDir")
-        if len(value) != 0 {
-            configurationInstance.DFSBaseDir = value
-        }
-        value = os.Getenv("EnableShareRatio")
-        if len(value) != 0 {
-            if val, err := strconv.ParseFloat(value, 32); err == nil {
-                configurationInstance.EnableShareRatio = val
-            }
-        }
-        value = os.Getenv("ShareMaxUtilization")
-        if len(value) != 0 {
-            if val, err := strconv.ParseFloat(value, 32); err == nil {
-                configurationInstance.ShareMaxUtilization = val
-            }
-        }
-        value = os.Getenv("EnablePreScheduleRatio")
-        if len(value) != 0 {
-            if val, err := strconv.ParseFloat(value, 32); err == nil {
-                configurationInstance.EnablePreScheduleRatio = val
-            }
-        }
-        value = os.Getenv("PreScheduleExtraTime")
-        if len(value) != 0 {
-            if val, err := strconv.Atoi(value); err == nil {
-                configurationInstance.PreScheduleExtraTime = val
-            }
-        }
-        value = os.Getenv("PreScheduleTimeout")
-        if len(value) != 0 {
-            if val, err := strconv.Atoi(value); err == nil {
-                configurationInstance.PreScheduleTimeout = val
-            }
-        }
     }
     return configurationInstance
 }
 
+/* read conf value from env */
+func (config *Configuration) InitFromEnv() {
+    value := os.Getenv("KafkaBrokers")
+    if len(value) != 0 {
+        configurationInstance.KafkaBrokers = strings.Split(value, ",")
+    }
+    value = os.Getenv("KafkaTopic")
+    if len(value) != 0 {
+        configurationInstance.KafkaTopic = value
+    }
+    value = os.Getenv("SchedulerPolicy")
+    if len(value) != 0 {
+        configurationInstance.SchedulerPolicy = value
+    }
+    value = os.Getenv("ListenAddr")
+    if len(value) != 0 {
+        configurationInstance.ListenAddr = value
+    }
+    value = os.Getenv("HDFSAddress")
+    if len(value) != 0 {
+        configurationInstance.HDFSAddress = value
+    }
+    value = os.Getenv("HDFSBaseDir")
+    if len(value) != 0 {
+        configurationInstance.HDFSBaseDir = value
+    }
+    value = os.Getenv("DFSBaseDir")
+    if len(value) != 0 {
+        configurationInstance.DFSBaseDir = value
+    }
+    value = os.Getenv("EnableShareRatio")
+    if len(value) != 0 {
+        if val, err := strconv.ParseFloat(value, 32); err == nil {
+            configurationInstance.EnableShareRatio = val
+        }
+    }
+    value = os.Getenv("ShareMaxUtilization")
+    if len(value) != 0 {
+        if val, err := strconv.ParseFloat(value, 32); err == nil {
+            configurationInstance.ShareMaxUtilization = val
+        }
+    }
+    value = os.Getenv("EnablePreScheduleRatio")
+    if len(value) != 0 {
+        if val, err := strconv.ParseFloat(value, 32); err == nil {
+            configurationInstance.EnablePreScheduleRatio = val
+        }
+    }
+    value = os.Getenv("PreScheduleExtraTime")
+    if len(value) != 0 {
+        if val, err := strconv.Atoi(value); err == nil {
+            configurationInstance.PreScheduleExtraTime = val
+        }
+    }
+    value = os.Getenv("PreScheduleTimeout")
+    if len(value) != 0 {
+        if val, err := strconv.Atoi(value); err == nil {
+            configurationInstance.PreScheduleTimeout = val
+        }
+    }
+}
+
 func (config *Configuration) SetMockEnabled(enabled bool) bool {
     config.mu.Lock()
     defer config.mu.Unlock()
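Note: this hunk moves the environment overrides out of InstanceOfConfiguration() into the new InitFromEnv() method, so they now run only when a caller invokes the method explicitly. A minimal sketch of the intended call order, mirroring the src/main.go change later in this commit (the bootstrapConfig wrapper itself is illustrative and not part of the repository):

    // Illustrative wiring only, reusing the repository's own types.
    func bootstrapConfig() *Configuration {
        config := InstanceOfConfiguration() // defaults only; env lookups no longer happen here
        config.InitFromEnv()                // apply KafkaBrokers, SchedulerPolicy, ... from the environment
        return config
    }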
@@ -64,7 +64,7 @@ func TestGA(t *testing.T) {
 
     allocation = InstanceOfAllocator().fastBestFit(nodes, tasks)
 
-    InstanceOfResourcePool().init(Configuration{})
+    InstanceOfResourcePool().Start()
     allocatedNodes := InstanceOfResourcePool().acquireResource(Job{Tasks: tasks})
     log.Info(allocatedNodes)
 }
@@ -21,7 +21,7 @@ func InstanceOfGroupManager() *GroupManager {
     return groupManagerInstance
 }
 
-func (gm *GroupManager) init(conf Configuration) {
+func (gm *GroupManager) Start() {
 
 }
 
@@ -27,7 +27,7 @@ func InstanceJobHistoryLogger() *JobHistoryLogger {
    return jobHistoryLoggerInstance
 }
 
-func (jhl *JobHistoryLogger) init(conf Configuration) {
+func (jhl *JobHistoryLogger) Start() {
    log.Info("jhl init")
    jhl.jobs = map[string]Job{}
    jhl.tasks = map[string][]TaskStatus{}
@@ -12,77 +12,85 @@ import (
 )
 
 type JobManager struct {
-    scheduler   Scheduler
-    job         Job
-    jobStatus   JobStatus
-    resources   []NodeStatus
+    /* meta */
+    scheduler Scheduler
+    job       Job
+
+    /* resource */
+    network     string
+    resources   map[string]NodeStatus
     resourcesMu sync.Mutex
-    isRunning   bool
-    killFlag    bool
 
-    network string
+    /* status */
+    jobStatus     JobStatus
+    isRunning     bool
+    lastHeartBeat int64
+
+    /* history info */
     stats [][]TaskStatus
 }
 
 func (jm *JobManager) start() {
     log.Info("start job ", jm.job.Name, " at ", time.Now())
-    jm.isRunning = false
-    jm.killFlag = false
+    jm.isRunning = true
+    jm.lastHeartBeat = time.Now().Unix()
     jm.jobStatus = JobStatus{Name: jm.job.Name, tasks: map[string]TaskStatus{}}
+    jm.resources = map[string]NodeStatus{}
 
     /* register in JHL */
     InstanceJobHistoryLogger().submitJob(jm.job)
 
-    /* request for private network */
-    jm.network = InstanceOfResourcePool().acquireNetwork()
-
     /* request for resources */
+    jm.resourcesMu.Lock()
+    jm.network = InstanceOfResourcePool().acquireNetwork()
     for {
-        if jm.killFlag {
+        if !jm.isRunning {
             break
         }
-        jm.resources = jm.scheduler.AcquireResource(jm.job)
-        if len(jm.resources) > 0 {
+        resources := jm.scheduler.AcquireResource(jm.job)
+        if len(resources) > 0 {
+            for i, node := range resources {
+                jm.resources[jm.job.Tasks[i].Name] = node
+            }
             log.Info(jm.job.Name, " receive resource", jm.resources)
             break
         }
         /* sleep random Millisecond to avoid deadlock */
         time.Sleep(time.Millisecond * time.Duration(500+rand.Intn(500)))
     }
-    jm.job.StartedAt = time.Now().Unix()
+    jm.resourcesMu.Unlock()
 
     if InstanceOfConfiguration().mock {
-        jm.scheduler.UpdateProgress(jm.job, Running)
-        jm.job.Status = Running
-        jm.isRunning = false
-        duration := InstanceOfMocker().GetDuration(jm.job, jm.resources)
-        log.Info("mock ", jm.job.Name, ", wait ", duration)
-        time.Sleep(time.Second * time.Duration(duration))
-        jm.returnResource([]TaskStatus{})
-        jm.scheduler.UpdateProgress(jm.job, Finished)
-        jm.job.Status = Finished
+        if jm.isRunning {
+            jm.scheduler.UpdateProgress(jm.job, Running)
+            duration := InstanceOfMocker().GetDuration(jm.job, jm.resources)
+            log.Info("mock ", jm.job.Name, ", wait ", duration)
+            time.Sleep(time.Second * time.Duration(duration))
+            jm.isRunning = false
+            jm.scheduler.UpdateProgress(jm.job, Finished)
+        }
+        jm.returnResource()
         log.Info("JobMaster exited ", jm.job.Name)
         return
     }
 
     isShare := false
     isScheduleAhead := false
-    if !jm.killFlag {
+    if jm.isRunning {
         /* switch to Running state */
         jm.scheduler.UpdateProgress(jm.job, Running)
-        jm.job.Status = Running
 
         /* bring up containers */
         wg := sync.WaitGroup{}
-        for i := range jm.job.Tasks {
+        success := true
+        for i, task := range jm.job.Tasks {
             wg.Add(1)
 
-            go func(index int) {
+            go func(task Task, node NodeStatus) {
                 defer wg.Done()
                 var UUIDs []string
                 shouldWait := "0"
-                for _, GPU := range jm.resources[index].Status {
+                for _, GPU := range node.Status {
                     UUIDs = append(UUIDs, GPU.UUID)
                     if GPU.MemoryUsed == GPU.MemoryTotal {
                         shouldWait = "1"
@@ -96,40 +104,44 @@ func (jm *JobManager) start() {
                GPUs := strings.Join(UUIDs, ",")
 
                v := url.Values{}
-               v.Set("image", jm.job.Tasks[index].Image)
-               v.Set("cmd", jm.job.Tasks[index].Cmd)
-               v.Set("name", jm.job.Tasks[index].Name)
+               v.Set("image", task.Image)
+               v.Set("cmd", task.Cmd)
+               v.Set("name", task.Name)
                v.Set("workspace", jm.job.Workspace)
                v.Set("gpus", GPUs)
-               v.Set("mem_limit", strconv.Itoa(jm.job.Tasks[index].Memory)+"m")
-               v.Set("cpu_limit", strconv.Itoa(jm.job.Tasks[index].NumberCPU))
+               v.Set("mem_limit", strconv.Itoa(task.Memory)+"m")
+               v.Set("cpu_limit", strconv.Itoa(task.NumberCPU))
                v.Set("network", jm.network)
                v.Set("should_wait", shouldWait)
                v.Set("output_dir", "/tmp/")
                v.Set("hdfs_address", InstanceOfConfiguration().HDFSAddress)
                v.Set("hdfs_dir", InstanceOfConfiguration().HDFSBaseDir+jm.job.Name)
-               v.Set("gpu_mem", strconv.Itoa(jm.job.Tasks[index].MemoryGPU))
+               v.Set("gpu_mem", strconv.Itoa(task.MemoryGPU))
                if InstanceOfConfiguration().DFSBaseDir != "" {
-                   v.Set("dfs_src", InstanceOfConfiguration().DFSBaseDir+jm.job.Name+"/task-"+strconv.Itoa(index))
+                   v.Set("dfs_src", InstanceOfConfiguration().DFSBaseDir+jm.job.Name+"/task-"+task.Name)
                } else {
                    v.Set("dfs_src", "")
                }
                v.Set("dfs_dst", "/tmp")
 
-               resp, err := doRequest("POST", "http://"+jm.resources[index].ClientHost+":8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
+               spider := Spider{}
+               spider.Method = "POST"
+               spider.URL = "http://" + node.ClientHost + ":8000/create"
+               spider.Data = v
+               spider.ContentType = "application/x-www-form-urlencoded"
+               err := spider.do()
                if err != nil {
                    log.Warn(err.Error())
-                   jm.job.Status = Failed
-                   jm.stop(false)
+                   success = false
                    return
                }
+               resp := spider.getResponse()
 
                body, err := ioutil.ReadAll(resp.Body)
                resp.Body.Close()
                if err != nil {
                    log.Warn(err)
-                   jm.job.Status = Failed
-                   jm.stop(false)
+                   success = false
                    return
                }
 
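Note: this commit replaces the old doRequest helper with the Spider type for the HTTP calls to agents. The pattern used above can be summarized by a small wrapper like the following sketch (postForm and its error handling are illustrative, not part of the commit):

    // Illustrative helper around the repository's Spider type.
    func postForm(targetURL string, form url.Values) ([]byte, error) {
        spider := Spider{}
        spider.Method = "POST"
        spider.URL = targetURL
        spider.Data = form
        spider.ContentType = "application/x-www-form-urlencoded"
        if err := spider.do(); err != nil {
            return nil, err
        }
        resp := spider.getResponse()
        defer resp.Body.Close()
        return ioutil.ReadAll(resp.Body) // read the agent's JSON reply
    }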
@@ -137,28 +149,36 @@ func (jm *JobManager) start() {
                err = json.Unmarshal([]byte(string(body)), &res)
                if err != nil || res.Code != 0 {
                    log.Warn(res)
-                   jm.job.Status = Failed
-                   jm.stop(false)
+                   success = false
                    return
                }
-               jm.jobStatus.tasks[jm.job.Tasks[index].Name] = TaskStatus{Id: res.Id, Node: jm.resources[index].ClientHost, HostName: jm.job.Tasks[i].Name}
-           }(i)
+               taskStatus := TaskStatus{Id: res.Id, Node: node.ClientHost, HostName: jm.job.Tasks[i].Name}
+               jm.jobStatus.tasks[task.Name] = taskStatus
+
+           }(task, jm.resources[task.Name])
        }
        wg.Wait()
-       jm.isRunning = true
+       /* start failed */
+       if !success {
+           jm.isRunning = false
+           jm.scheduler.UpdateProgress(jm.job, Failed)
+           jm.stop()
+       }
    }
 
    /* monitor job execution */
    for {
-       //jm.status()
-       if !jm.isRunning || jm.killFlag {
+       if !jm.isRunning {
            break
        }
+       if time.Now().Unix()-jm.lastHeartBeat > 30 {
+           log.Warn(jm.job.Name, " heartbeat longer than 30s")
+       }
        time.Sleep(time.Second * 1)
    }
 
-   /* make sure resources are released */
-   jm.returnResource(jm.status().Status)
+   /* release again to make sure resources are released */
+   jm.returnResource()
 
    /* feed data to optimizer */
    isExclusive := InstanceOfResourcePool().isExclusive(jm.job.Name)
@@ -197,47 +217,50 @@ func (jm *JobManager) start() {
    if len(jm.job.Tasks) == 1 && !isShare && !isScheduleAhead && jm.job.Status == Finished && isExclusive {
        InstanceOfOptimizer().FeedTime(jm.job, stats)
    }
 
+   /* clear, to reduce memory usage */
+   jm.stats = [][]TaskStatus{}
+
+   /* remove exited containers */
+   //for _, task := range jm.jobStatus.tasks {
+   //  go func(container TaskStatus) {
+   //      v := url.Values{}
+   //      v.Set("id", container.Id)
+   //
+   //      spider := Spider{}
+   //      spider.Method = "POST"
+   //      spider.URL = "http://" + container.Node + ":8000/remove"
+   //      spider.Data = v
+   //      spider.ContentType = "application/x-www-form-urlencoded"
+   //      err := spider.do()
+   //      if err != nil {
+   //          log.Warn(err.Error())
+   //      }
+   //  }(task)
+   //}
+
    log.Info("JobMaster exited ", jm.job.Name)
 }
 
 /* release all resource */
-func (jm *JobManager) returnResource(status []TaskStatus) {
+func (jm *JobManager) returnResource() {
    jm.resourcesMu.Lock()
    defer jm.resourcesMu.Unlock()
    /* return resource */
    for i := range jm.resources {
-       if jm.resources[i].ClientID == "_released_" {
-           continue
-       }
        jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
-       log.Info("return resource again ", jm.resources[i].ClientID)
-       jm.resources[i].ClientID = "_released_"
 
        for _, t := range jm.resources[i].Status {
            InstanceOfResourcePool().detach(t.UUID, jm.job)
        }
-
-       if !InstanceOfConfiguration().mock {
-           InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
-       }
-
-       /* remove exited containers */
-       //v := url.Values{}
-       //v.Set("id", res.Status[i].Id)
-       //
-       //_, err := doRequest("POST", "http://"+res.Status[i].Node+":8000/remove", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
-       //if err != nil {
-       //  log.Warn(err.Error())
-       //  continue
-       //}
    }
+   jm.resources = map[string]NodeStatus{}
    if jm.network != "" {
        InstanceOfResourcePool().releaseNetwork(jm.network)
        jm.network = ""
    }
 }
 
-/* monitor all tasks */
+/* monitor all tasks, update job status */
 func (jm *JobManager) checkStatus(status []TaskStatus) {
    if !jm.isRunning {
        return
@@ -245,77 +268,57 @@ func (jm *JobManager) checkStatus(status []TaskStatus) {
    flagRunning := false
    onlyPS := true
    for i := range status {
-       if status[i].Status == "ready" {
-           log.Debug(jm.job.Name, "-", i, " is ready to run")
-           flagRunning = true
-           if !jm.job.Tasks[i].IsPS {
-               onlyPS = false
-           }
-       } else if status[i].Status == "running" {
-           log.Debug(jm.job.Name, "-", i, " is running")
+       if status[i].Status == "ready" || status[i].Status == "running" {
            flagRunning = true
            if !jm.job.Tasks[i].IsPS {
                onlyPS = false
            }
            InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
        } else if status[i].Status == "unknown" {
-           log.Warn(jm.job.Name, "-", i, " is unknown")
            flagRunning = true
            if !jm.job.Tasks[i].IsPS {
                onlyPS = false
            }
-           //InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
        } else {
-           jm.resourcesMu.Lock()
-           if jm.resources[i].ClientID == "_released_" {
-               jm.resourcesMu.Unlock()
-               continue
-           }
            log.Info(jm.job.Name, "-", i, " ", status[i].Status)
-           if exitCode, ok := status[i].State["ExitCode"].(float64); ok && exitCode != 0 && !jm.killFlag {
+           if exitCode, ok := status[i].State["ExitCode"].(float64); ok && exitCode != 0 && jm.isRunning {
                log.Warn(jm.job.Name+"-"+jm.job.Tasks[i].Name+" exited unexpected, exitCode=", exitCode)
-               jm.stop(false)
-               jm.killFlag = true
+               jm.isRunning = false
                jm.scheduler.UpdateProgress(jm.job, Failed)
-               jm.job.Status = Failed
-           } else if !jm.killFlag {
+               jm.stop()
+           } else if jm.isRunning {
                log.Info("Some instance exited, close others")
-               jm.stop(false)
-               jm.killFlag = true
+               jm.isRunning = false
                jm.scheduler.UpdateProgress(jm.job, Finished)
-               jm.job.Status = Finished
+               jm.stop()
            }
 
-           if jm.resources[i].ClientID != "_released_" {
-               jm.scheduler.ReleaseResource(jm.job, jm.resources[i])
-               log.Info("return resource ", jm.resources[i].ClientID)
-               jm.resources[i].ClientID = "_released_"
+           jm.resourcesMu.Lock()
+           nodeID := jm.job.Tasks[i].Name
+           if _, ok := jm.resources[nodeID]; ok {
+               jm.scheduler.ReleaseResource(jm.job, jm.resources[nodeID])
+               log.Info("return resource ", jm.resources[nodeID].ClientID)
 
-               for _, t := range jm.resources[i].Status {
+               for _, t := range jm.resources[nodeID].Status {
                    InstanceOfResourcePool().detach(t.UUID, jm.job)
                }
                InstanceJobHistoryLogger().submitTaskStatus(jm.job.Name, status[i])
+               delete(jm.resources, nodeID)
            }
            jm.resourcesMu.Unlock()
        }
    }
-   if flagRunning && onlyPS && !jm.killFlag {
+   if flagRunning && onlyPS && jm.isRunning {
        log.Info("Only PS is running, stop ", jm.job.Name)
-       jm.stop(false)
-       jm.killFlag = true
+       jm.isRunning = false
        jm.scheduler.UpdateProgress(jm.job, Finished)
-       jm.job.Status = Finished
-   }
-
-   if !flagRunning && !jm.killFlag {
-       jm.scheduler.UpdateProgress(jm.job, Finished)
-       jm.job.Status = Finished
-       log.Info("finish job ", jm.job.Name)
+       jm.stop()
    }
 
-   if !flagRunning {
+   if !flagRunning && jm.isRunning {
+       log.Info("finish job ", jm.job.Name)
        jm.isRunning = false
-       jm.returnResource(status)
+       jm.scheduler.UpdateProgress(jm.job, Finished)
    }
 }
@@ -354,7 +357,8 @@ func (jm *JobManager) logs(taskName string) MsgLog {
 /* fetch job tasks status */
 func (jm *JobManager) status() MsgJobStatus {
    var tasksStatus []TaskStatus
-   for range jm.job.Tasks { //append would cause uncertain order
+   /* create slice ahead, since append would cause uncertain order */
+   for range jm.job.Tasks {
        tasksStatus = append(tasksStatus, TaskStatus{})
    }
 
@@ -418,18 +422,32 @@ func (jm *JobManager) status() MsgJobStatus {
    return MsgJobStatus{Status: tasksStatus}
 }
 
-/* force stop all containers */
-func (jm *JobManager) stop(force bool) MsgStop {
+func (jm *JobManager) stop() MsgStop {
+   if jm.isRunning {
+       jm.isRunning = false
+       jm.scheduler.UpdateProgress(jm.job, Stopped)
+       log.Info("kill job, ", jm.job.Name)
+   }
+
    for _, taskStatus := range jm.jobStatus.tasks {
        /* stop at background */
        go func(task TaskStatus) {
+           log.Info("kill ", jm.job.Name, "-", task.Id, " :", task.HostName)
            v := url.Values{}
            v.Set("id", task.Id)
 
-           resp, err := doRequest("POST", "http://"+task.Node+":8000/stop", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
+           spider := Spider{}
+           spider.Method = "POST"
+           spider.URL = "http://" + task.Node + ":8000/stop"
+           spider.Data = v
+           spider.ContentType = "application/x-www-form-urlencoded"
+
+           err := spider.do()
            if err != nil {
                log.Warn(err.Error())
+               return
            }
+           resp := spider.getResponse()
            body, err := ioutil.ReadAll(resp.Body)
            resp.Body.Close()
            if err != nil {
@@ -445,17 +463,7 @@ func (jm *JobManager) stop(force bool) MsgStop {
            if res.Code != 0 {
                log.Warn(res.Error)
            }
-           log.Info(jm.job.Name, ":", task.HostName, " is killed:", task.Id)
        }(taskStatus)
    }
-
-   go func() {
-       if force {
-           jm.killFlag = true
-           jm.scheduler.UpdateProgress(jm.job, Stopped)
-           jm.job.Status = Stopped
-           log.Info("kill job, ", jm.job.Name)
-       }
-   }()
    return MsgStop{Code: 0}
 }
@@ -5,7 +5,6 @@ import (
    "io"
    "sync"
    "runtime"
-   "fmt"
 )
 
 type Logger struct {
@@ -19,29 +18,59 @@ func (logger *Logger) Init() {
    logger.LoggerModuleDisabled = map[string]bool{}
 }
 
-func (logger *Logger) Debug(args ... interface{}) {
-   _log.Debug(args)
-}
-
-func (logger *Logger) Info(args ... interface{}) {
+func (logger *Logger) Debug(args ...interface{}) {
    pc, _, _, ok := runtime.Caller(1)
    details := runtime.FuncForPC(pc)
+   module := "unknown"
    if ok && details != nil {
-       fmt.Printf("called from %s\n", details.Name())
+       module = details.Name()
    }
-   _log.Info(args)
+   args = append(args, module)
+   _log.Debug(args...)
 }
 
-func (logger *Logger) Warn(args ... interface{}) {
-   _log.Warn(args)
+func (logger *Logger) Info(args ...interface{}) {
+   pc, _, _, ok := runtime.Caller(1)
+   details := runtime.FuncForPC(pc)
+   module := "unknown"
+   if ok && details != nil {
+       module = details.Name()
+   }
+   args = append(args, module)
+   _log.Info(args...)
 }
 
-func (logger *Logger) Fatal(args ... interface{}) {
-   _log.Fatal(args)
+func (logger *Logger) Warn(args ...interface{}) {
+   pc, _, _, ok := runtime.Caller(1)
+   details := runtime.FuncForPC(pc)
+   module := "unknown"
+   if ok && details != nil {
+       module = details.Name()
+   }
+   args = append(args, module)
+   _log.Warn(args...)
 }
 
-func (logger *Logger) Fatalf(format string, args ... interface{}) {
-   _log.Fatalf(format, args)
+func (logger *Logger) Fatal(args ...interface{}) {
+   pc, _, _, ok := runtime.Caller(1)
+   details := runtime.FuncForPC(pc)
+   module := "unknown"
+   if ok && details != nil {
+       module = details.Name()
+   }
+   args = append(args, module)
+   _log.Fatal(args...)
+}
+
+func (logger *Logger) Fatalf(format string, args ...interface{}) {
+   pc, _, _, ok := runtime.Caller(1)
+   details := runtime.FuncForPC(pc)
+   module := "unknown"
+   if ok && details != nil {
+       module = details.Name()
+   }
+   args = append(args, module)
+   _log.Fatalf(format, args...)
 }
 
 func (logger *Logger) SetOutput(f io.Writer) {
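Note: every logger method above now repeats the same runtime.Caller lookup to tag the message with the calling module. The repeated block is equivalent to a helper like the following sketch (callerModule is illustrative only; the commit keeps the lookup inlined in each method):

    // Illustrative extraction of the repeated lookup; not part of the commit.
    func callerModule() string {
        pc, _, _, ok := runtime.Caller(2) // skip callerModule and the Logger method itself
        details := runtime.FuncForPC(pc)
        module := "unknown"
        if ok && details != nil {
            module = details.Name()
        }
        return module
    }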
src/main.go (152 changed lines)
@@ -7,6 +7,7 @@ import (
    "strconv"
    "math/rand"
    "os"
+   "fmt"
 )
 
 var log Logger
@@ -15,6 +16,7 @@ var scheduler Scheduler
 
 func serverAPI(w http.ResponseWriter, r *http.Request) {
    switch r.URL.Query().Get("action") {
+   /* resource pool */
    case "agent_report":
        log.Debug("agent_report")
        msgAgentReport := MsgAgentReport{Code: 0}
@@ -50,6 +52,28 @@ func serverAPI(w http.ResponseWriter, r *http.Request) {
        w.Write(js)
        break
 
+   case "pool_status_history":
+       log.Debug("pool_status_history")
+       js, _ := json.Marshal(InstanceOfResourcePool().statusHistory())
+       w.Header().Set("Content-Type", "application/json")
+       w.Write(js)
+       break
+
+   case "get_counter":
+       log.Debug("get_counters")
+       js, _ := json.Marshal(InstanceOfResourcePool().getCounter())
+       w.Header().Set("Content-Type", "application/json")
+       w.Write(js)
+       break
+
+   case "debug_pool_dump":
+       log.Debug("debug_pool_dump")
+       js, _ := json.Marshal(InstanceOfResourcePool().Dump())
+       w.Header().Set("Content-Type", "application/json")
+       w.Write(js)
+       break
+
+   /* scheduler */
    case "job_submit":
        var job Job
        log.Debug("job_submit")
@@ -95,6 +119,42 @@ func serverAPI(w http.ResponseWriter, r *http.Request) {
        w.Write(js)
        break
 
+   case "job_stop":
+       log.Debug("job_stop")
+       js, _ := json.Marshal(scheduler.Stop(string(r.PostFormValue("id"))))
+       w.Header().Set("Content-Type", "application/json")
+       w.Write(js)
+       break
+
+   case "task_logs":
+       log.Debug("task_logs")
+       js, _ := json.Marshal(scheduler.QueryLogs(r.URL.Query().Get("job"), r.URL.Query().Get("task")))
+       w.Header().Set("Content-Type", "application/json")
+       w.Write(js)
+       break
+
+   case "jobs":
+       log.Debug("job_list")
+       js, _ := json.Marshal(scheduler.ListJobs())
+       w.Header().Set("Content-Type", "application/json")
+       w.Write(js)
+       break
+
+   case "debug_scheduler_dump":
+       log.Debug("debug_scheduler_dump")
+       js, _ := json.Marshal(scheduler.DebugDump())
+       w.Header().Set("Content-Type", "application/json")
+       w.Write(js)
+       break
+
+   case "summary":
+       log.Debug("summary")
+       js, _ := json.Marshal(scheduler.Summary())
+       w.Header().Set("Content-Type", "application/json")
+       w.Write(js)
+       break
+
+   /* optimizer */
    case "job_predict_req":
        log.Debug("job_predict_req")
        var job Job
@@ -143,48 +203,15 @@ func serverAPI(w http.ResponseWriter, r *http.Request) {
        w.Write(js)
        break
 
-   case "job_stop":
-       log.Debug("job_stop")
-       js, _ := json.Marshal(scheduler.Stop(string(r.PostFormValue("id"))))
-       w.Header().Set("Content-Type", "application/json")
-       w.Write(js)
-       break
-
-   case "task_logs":
-       log.Debug("task_logs")
-       js, _ := json.Marshal(scheduler.QueryLogs(r.URL.Query().Get("job"), r.URL.Query().Get("task")))
-       w.Header().Set("Content-Type", "application/json")
-       w.Write(js)
-       break
-
-   case "jobs":
-       log.Debug("job_list")
-       js, _ := json.Marshal(scheduler.ListJobs())
-       w.Header().Set("Content-Type", "application/json")
-       w.Write(js)
-       break
-
-   case "summary":
-       log.Debug("summary")
-       js, _ := json.Marshal(scheduler.Summary())
-       w.Header().Set("Content-Type", "application/json")
-       w.Write(js)
-       break
-
-   case "pool_status_history":
-       log.Debug("pool_status_history")
-       js, _ := json.Marshal(InstanceOfResourcePool().statusHistory())
-       w.Header().Set("Content-Type", "application/json")
-       w.Write(js)
-       break
-
-   case "get_counter":
-       log.Debug("get_counters")
-       js, _ := json.Marshal(InstanceOfResourcePool().getCounter())
+   /* job history logger */
+   case "jhl_job_status":
+       log.Debug("jhl_job_status")
+       js, _ := json.Marshal(InstanceJobHistoryLogger().getTaskStatus(r.URL.Query().Get("job")))
        w.Header().Set("Content-Type", "application/json")
        w.Write(js)
        break
 
+   /* group */
    case "group_list":
        log.Debug("group_list")
        js, _ := json.Marshal(InstanceOfGroupManager().List())
@@ -227,7 +254,6 @@ func serverAPI(w http.ResponseWriter, r *http.Request) {
        break
 
    case "group_remove":
-       /* TODO: rearrange jobs to other queues */
        log.Debug("group_remove")
        var group Group
        msg := MsgGroupCreate{Code: 0}
@@ -244,27 +270,7 @@ func serverAPI(w http.ResponseWriter, r *http.Request) {
        w.Write(js)
        break
 
-   case "jhl_job_status":
-       log.Debug("jhl_job_status")
-       js, _ := json.Marshal(InstanceJobHistoryLogger().getTaskStatus(r.URL.Query().Get("job")))
-       w.Header().Set("Content-Type", "application/json")
-       w.Write(js)
-       break
-
-   case "debug_scheduler_dump":
-       log.Debug("debug_scheduler_dump")
-       js, _ := json.Marshal(scheduler.DebugDump())
-       w.Header().Set("Content-Type", "application/json")
-       w.Write(js)
-       break
-
-   case "debug_pool_dump":
-       log.Debug("debug_pool_dump")
-       js, _ := json.Marshal(InstanceOfResourcePool().Dump())
-       w.Header().Set("Content-Type", "application/json")
-       w.Write(js)
-       break
-
+   /* configuration */
    case "conf_list":
        log.Debug("conf_list")
        var msg MsgConfList
@@ -308,8 +314,9 @@ func serverAPI(w http.ResponseWriter, r *http.Request) {
            break
 
        case "pool.batch.interval":
-           interval := str2int(value, 1)
+           if interval, err := strconv.Atoi(value); err == nil {
            ok = InstanceOfResourcePool().SetBatchInterval(interval)
+           }
            break
 
        /* scheduler.mock */
@@ -324,8 +331,9 @@ func serverAPI(w http.ResponseWriter, r *http.Request) {
 
        /* scheduler.parallelism */
        case "scheduler.parallelism":
-           parallelism, _ := strconv.Atoi(value)
+           if parallelism, err := strconv.Atoi(value); err == nil {
            ok = scheduler.UpdateParallelism(parallelism)
+           }
            break
 
        /* allocator.strategy */
@@ -333,6 +341,7 @@ func serverAPI(w http.ResponseWriter, r *http.Request) {
            ok = InstanceOfAllocator().updateStrategy(value)
            break
 
+       /* logger */
        case "logger.level":
            ok = log.SetLoggerLevel(value)
            break
@@ -350,7 +359,7 @@ func serverAPI(w http.ResponseWriter, r *http.Request) {
        msg.Code = 0
        if !ok {
            msg.Code = 1
-           msg.Error = "Option not exist or invalid value"
+           msg.Error = fmt.Sprintf("Option (%s) not exist or invalid value (%s)", option, value)
        }
        js, _ := json.Marshal(msg)
        w.Header().Set("Content-Type", "application/json")
@@ -381,15 +390,14 @@ func main() {
        log.SetOutput(f)
    }
 
-   /* parse configuration */
-   config := *InstanceOfConfiguration()
+   config := InstanceOfConfiguration()
+   config.InitFromEnv()
 
    /* init components */
-   InstanceOfResourcePool().init(config)
-   //InstanceOfCollector().init(config)
-   InstanceJobHistoryLogger().init(config)
-   InstanceOfOptimizer().Init(config)
-   InstanceOfGroupManager().init(config)
+   InstanceOfResourcePool().Start()
+   InstanceJobHistoryLogger().Start()
+   InstanceOfOptimizer().Start()
+   InstanceOfGroupManager().Start()
 
    switch config.SchedulerPolicy {
    case "FCFS":
@@ -23,7 +23,7 @@ func InstanceOfMocker() *Mocker {
    return MockerInstance
 }
 
-func (mocker *Mocker) GetDuration(job Job, nodes []NodeStatus) int {
+func (mocker *Mocker) GetDuration(job Job, nodes map[string]NodeStatus) int {
    str := strings.Split(job.Name, "-")
    duration := 300
 
@@ -37,11 +37,11 @@ func (mocker *Mocker) GetDuration(job Job, nodes []NodeStatus) int {
    } else if len(job.Tasks) == 3 {
        var psNodes []string
        var workerNodes []string
-       for i, task := range job.Tasks {
+       for _, task := range job.Tasks {
            if task.IsPS {
-               psNodes = append(psNodes, nodes[i].ClientHost)
+               psNodes = append(psNodes, nodes[task.Name].ClientHost)
            } else {
-               workerNodes = append(workerNodes, nodes[i].ClientHost)
+               workerNodes = append(workerNodes, nodes[task.Name].ClientHost)
            }
        }
        if psNodes[0] == workerNodes[0] {
@@ -50,7 +50,7 @@ func InstanceOfOptimizer() *Optimizer {
    return optimizerInstance
 }
 
-func (optimizer *Optimizer) Init(conf Configuration) {
+func (optimizer *Optimizer) Start() {
    log.Info("optimizer started")
 }
 
@@ -8,7 +8,7 @@ import (
 
 func TestPool(t *testing.T) {
    return
-   InstanceOfResourcePool().init(Configuration{})
+   InstanceOfResourcePool().Start()
 
    for j := 0; j < 100; j++ {
        for i := 0; i < 1000; i++ {
@@ -36,7 +36,7 @@ func TestPool(t *testing.T) {
 }
 
 func TestAllocate(t *testing.T) {
-   InstanceOfResourcePool().init(Configuration{})
+   InstanceOfResourcePool().Start()
 
    job := Job{Name: strconv.Itoa(int(time.Now().Unix() % 1000000000))}
    job.Group = "default"
@@ -4,7 +4,6 @@ import (
    "sync"
    "time"
    "net/url"
-   "strings"
    "math/rand"
    "strconv"
    "sort"
@@ -51,11 +50,11 @@ type ResourcePool struct {
    exclusiveJobs map[string]bool
 
    TotalGPU    int
-   TotalGPUMu  sync.Mutex
    TotalCPU    int
    TotalMemory int
+   TotalMu     sync.Mutex
    UsingGPU    int
-   UsingGPUMu  sync.Mutex
+   UsingMu     sync.Mutex
 
    enableBatch bool
    batchJobs   map[string]Job
@@ -64,7 +63,7 @@
    batchInterval int
 }
 
-func (pool *ResourcePool) init(conf Configuration) {
+func (pool *ResourcePool) Start() {
    log.Info("RM started ")
 
    pool.networks = map[string]bool{}
@@ -181,13 +180,13 @@ func (pool *ResourcePool) checkDeadNodes() {
            }
 
            seg.Lock.Lock()
-           pool.TotalGPUMu.Lock()
+           pool.TotalMu.Lock()
            if _, ok := seg.Nodes[k]; ok {
                pool.TotalGPU -= len(seg.Nodes[k].Status)
                pool.TotalCPU -= seg.Nodes[k].NumCPU
                pool.TotalMemory -= seg.Nodes[k].MemTotal
            }
-           pool.TotalGPUMu.Unlock()
+           pool.TotalMu.Unlock()
            delete(seg.Nodes, k)
            seg.Lock.Unlock()
            pool.versionsMu.Lock()
@@ -297,11 +296,11 @@ func (pool *ResourcePool) saveStatusHistory() {
        }
        pool.historyMu.Unlock()
 
-       pool.TotalGPUMu.Lock()
+       pool.TotalMu.Lock()
        pool.TotalGPU = TotalGPU
        pool.TotalCPU = TotalCPU
        pool.TotalMemory = TotalMemGPU
-       pool.TotalGPUMu.Unlock()
+       pool.TotalMu.Unlock()
        time.Sleep(time.Second * 60)
    }
 }
@@ -359,11 +358,11 @@ func (pool *ResourcePool) update(node NodeStatus) {
        }
    } else {
        /* TODO: double check node do belong to this seg */
-       pool.TotalGPUMu.Lock()
+       pool.TotalMu.Lock()
        pool.TotalGPU += len(node.Status)
        pool.TotalCPU += node.NumCPU
        pool.TotalMemory += node.MemTotal
-       pool.TotalGPUMu.Unlock()
+       pool.TotalMu.Unlock()
        log.Info("node ", node.ClientID, " is online")
    }
    seg.Nodes[node.ClientID] = &node
@@ -517,11 +516,17 @@ func (pool *ResourcePool) acquireNetwork() string {
    }
    v := url.Values{}
    v.Set("name", network)
-   resp, err := doRequest("POST", "http://yao-agent-master:8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
+   spider := Spider{}
+   spider.Method = "POST"
+   spider.URL = "http://yao-agent-master:8000/create"
+   spider.Data = v
+   spider.ContentType = "application/x-www-form-urlencoded"
+   err := spider.do()
    if err != nil {
        log.Warn(err.Error())
        continue
    }
+   resp := spider.getResponse()
    resp.Body.Close()
    pool.networksFree[network] = true
    pool.networks[network] = true
@@ -580,86 +585,6 @@ func (pool *ResourcePool) countGPU() (int, int) {
    return pool.TotalGPU - pool.UsingGPU, pool.UsingGPU
 }
 
-func (pool *ResourcePool) pickNode(candidates []*NodeStatus, availableGPUs map[string][]GPUStatus, task Task, job Job, nodes []NodeStatus) *NodeStatus {
-
-   /* shuffle */
-   r := rand.New(rand.NewSource(time.Now().Unix()))
-   for n := len(candidates); n > 0; n-- {
-       randIndex := r.Intn(n)
-       candidates[n-1], candidates[randIndex] = candidates[randIndex], candidates[n-1]
-   }
-
-   /* sort */
-   // single node, single GPU
-   sort.Slice(candidates, func(a, b int) bool {
-       diffA := pool.GPUModelToPower(candidates[a].Status[0].ProductName) - pool.GPUModelToPower(task.ModelGPU)
-       diffB := pool.GPUModelToPower(candidates[b].Status[0].ProductName) - pool.GPUModelToPower(task.ModelGPU)
-
-       if diffA > 0 && diffB >= 0 && diffA > diffB {
-           return false //b
-       }
-       if diffA < 0 && diffB < 0 && diffA > diffB {
-           return false
-       }
-       if diffA < 0 && diffB >= 0 {
-           return false
-       }
-       if diffA == diffB {
-           if len(availableGPUs[candidates[a].ClientID]) == len(availableGPUs[candidates[b].ClientID]) {
-               return candidates[a].UtilCPU > candidates[b].UtilCPU
-           }
-           return len(availableGPUs[candidates[a].ClientID]) < len(availableGPUs[candidates[b].ClientID])
-       }
-       return true //a
-   })
-
-   var t []*NodeStatus
-   bestGPU := candidates[0].Status[0].ProductName
-   for _, node := range candidates {
-       if node.Status[0].ProductName != bestGPU {
-           break
-       }
-       t = append(t, node)
-   }
-   candidates = t
-
-   if (len(job.Tasks) == 1) && task.NumberGPU > 1 { //single node, multi GPUs
-       sort.Slice(candidates, func(a, b int) bool {
-           if len(availableGPUs[candidates[a].ClientID]) == len(availableGPUs[candidates[b].ClientID]) {
-               return candidates[a].UtilCPU > candidates[b].UtilCPU
-           }
-           return len(availableGPUs[candidates[a].ClientID]) < len(availableGPUs[candidates[b].ClientID])
-       })
-   }
-
-   if len(job.Tasks) > 1 { //multi nodes, multi GPUs
-       sort.Slice(candidates, func(a, b int) bool {
-           distanceA := 0
-           distanceB := 0
-           for _, node := range nodes {
-               if node.Rack != candidates[a].Rack {
-                   distanceA += 10
-               }
-               if node.ClientID != candidates[a].ClientID {
-                   distanceA += 1
-               }
-               if node.Rack != candidates[b].Rack {
-                   distanceB += 10
-               }
-               if node.ClientID != candidates[b].ClientID {
-                   distanceB += 1
-               }
-           }
-           if distanceA == distanceB {
-               return len(availableGPUs[candidates[a].ClientID]) > len(availableGPUs[candidates[b].ClientID])
-           }
-           return distanceA*job.Locality < distanceB*job.Locality
-       })
-   }
-
-   return candidates[0]
-}
-
 func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
    for i := range job.Tasks {
        job.Tasks[i].Job = job.Name
@@ -671,6 +596,7 @@ func (pool *ResourcePool) acquireResource(job Job) []NodeStatus {
        pool.batchJobs[job.Name] = job
        pool.batchMu.Unlock()
        for {
+           /* wait until request is satisfied */
            pool.batchMu.Lock()
            if _, ok := pool.batchAllocations[job.Name]; ok {
                pool.batchMu.Unlock()
@@ -785,9 +711,9 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
                for j := range node.Status {
                    if res.Status[i].UUID == node.Status[j].UUID {
                        if node.Status[j].MemoryAllocated == 0 {
-                           pool.UsingGPUMu.Lock()
+                           pool.UsingMu.Lock()
                            pool.UsingGPU ++
-                           pool.UsingGPUMu.Unlock()
+                           pool.UsingMu.Unlock()
                        }
                        node.Status[j].MemoryAllocated += task.MemoryGPU
                        res.Status[i].MemoryTotal = task.MemoryGPU
@@ -895,9 +821,9 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
                for j := range node.Status {
                    if res.Status[i].UUID == node.Status[j].UUID {
                        if node.Status[j].MemoryAllocated == 0 {
-                           pool.UsingGPUMu.Lock()
+                           pool.UsingMu.Lock()
                            pool.UsingGPU ++
-                           pool.UsingGPUMu.Unlock()
+                           pool.UsingMu.Unlock()
                        }
                        node.Status[j].MemoryAllocated += task.MemoryGPU
                        res.Status[i].MemoryTotal = task.MemoryGPU
@@ -989,9 +915,9 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
                for j := range node.Status {
                    if res.Status[i].UUID == node.Status[j].UUID {
                        if node.Status[j].MemoryAllocated == 0 {
-                           pool.UsingGPUMu.Lock()
+                           pool.UsingMu.Lock()
                            pool.UsingGPU ++
-                           pool.UsingGPUMu.Unlock()
+                           pool.UsingMu.Unlock()
                        }
                        node.Status[j].MemoryAllocated += task.MemoryGPU
                        res.Status[i].MemoryTotal = task.MemoryGPU
@@ -1040,6 +966,11 @@ func (pool *ResourcePool) doAcquireResource(job Job) []NodeStatus {
    return ress
 }
 
+/*
+TODO:
+bug-1: node is offline, unable to retrieve allocation info
+bug-2: when node offline & back, allocation info is lost
+*/
 func (pool *ResourcePool) releaseResource(job Job, agent NodeStatus) {
    segID := pool.getNodePool(agent.ClientID)
    seg := pool.pools[segID]
@@ -1052,7 +983,7 @@ func (pool *ResourcePool) releaseResource(job Job, agent NodeStatus) {
    node, ok := seg.Nodes[agent.ClientID]
    /* in case node is offline */
    if !ok {
-       /* TODO, update usingTotalGPU correctly */
+       /* bug-1 */
        log.Warn("node ", agent.ClientID, " not present")
        return
    }
@@ -1060,19 +991,18 @@ func (pool *ResourcePool) releaseResource(job Job, agent NodeStatus) {
        for j := range node.Status {
            if gpu.UUID == node.Status[j].UUID {
                node.Status[j].MemoryAllocated -= gpu.MemoryTotal
+               log.Debug(node.Status[j].MemoryAllocated)
                if node.Status[j].MemoryAllocated < 0 {
-                   // in case of error
-                   /* Case 0: a node is offline and then online, the allocation info will be lost */
+                   /* bug-2: a node is offline and then online, the allocation info will be lost */
                    log.Warn(node.ClientID, " UUID=", gpu.UUID, " More Memory Allocated")
                    node.Status[j].MemoryAllocated = 0
                }
                if node.Status[j].MemoryAllocated == 0 {
-                   pool.UsingGPUMu.Lock()
+                   pool.UsingMu.Lock()
                    pool.UsingGPU--
-                   pool.UsingGPUMu.Unlock()
+                   pool.UsingMu.Unlock()
                    log.Info(node.Status[j].UUID, " is released")
                }
-               //log.Info(node.Status[j].MemoryAllocated)
            }
        }
    }
@@ -25,6 +25,7 @@ type Scheduler interface {
 
    UpdateParallelism(parallelism int) bool
 
+   /* TODO: rearrange jobs to other queues */
    updateGroup(group Group) bool
 
    DebugDump() map[string]interface{}
@@ -107,7 +107,7 @@ func (scheduler *SchedulerFCFS) Stop(jobName string) MsgStop {
    if !ok {
        return MsgStop{Code: 1, Error: "Job not exist!"}
    }
-   return jm.stop(true)
+   return jm.stop()
 }
 
 func (scheduler *SchedulerFCFS) QueryLogs(jobName string, taskName string) MsgLog {
@@ -272,7 +272,7 @@ func (scheduler *SchedulerCapacity) Stop(jobName string) MsgStop {
    if !ok {
        return MsgStop{Code: 1, Error: "Job not exist!"}
    }
-   return jm.stop(true)
+   return jm.stop()
 }
 
 func (scheduler *SchedulerCapacity) QueryLogs(jobName string, taskName string) MsgLog {
@@ -379,6 +379,7 @@ func (scheduler *SchedulerFair) UpdateProgress(job Job, state State) {
            if scheduler.history[i].Name == job.Name {
                scheduler.history[i].Status = Running
                scheduler.history[i].UpdatedAt = int(time.Now().Unix())
+               scheduler.history[i].StartedAt = time.Now().Unix()
            }
        }
        break
@@ -694,7 +695,7 @@ func (scheduler *SchedulerFair) Stop(jobName string) MsgStop {
    jm, ok := scheduler.jobs[jobName]
    scheduler.queuesMu.Unlock()
    if ok {
-       return jm.stop(true)
+       return jm.stop()
    } else {
        found := false
        for queue := range scheduler.queues {
@@ -802,6 +803,9 @@ func (scheduler *SchedulerFair) SetEnabled(enabled bool) bool {
 }
 
 func (scheduler *SchedulerFair) UpdateParallelism(parallelism int) bool {
+   if parallelism < 1 {
+       parallelism = 1
+   }
    scheduler.parallelism = parallelism
    log.Info("parallelism is updated to ", parallelism)
    return true
@@ -252,7 +252,7 @@ func (scheduler *SchedulerPriority) Stop(jobName string) MsgStop {
    if !ok {
        return MsgStop{Code: 1, Error: "Job not exist!"}
    }
-   return jm.stop(true)
+   return jm.stop()
 }
 
 func (scheduler *SchedulerPriority) QueryLogs(jobName string, taskName string) MsgLog {
@@ -15,7 +15,7 @@ type Spider struct {
    ContentType string
    Referer     string
    Data        url.Values
    Response    *http.Response
 }
 
 func (spider *Spider) do() error {
@@ -25,17 +25,21 @@ func (spider *Spider) do() error {
        return err
    }
 
-   if len(spider.ContentType) > 0 {
-       req.Header.Set("Content-Type", spider.ContentType)
+   if len(spider.ContentType) == 0 {
+       spider.ContentType = ""
    }
+   req.Header.Set("Content-Type", spider.ContentType)
 
+   /* set user-agent */
    if len(spider.UserAgent) == 0 {
-       req.Header.Set("User-Agent", getUA())
+       spider.UserAgent = spider.getUA()
    }
+   req.Header.Set("User-Agent", spider.UserAgent)
 
-   if len(spider.Referer) > 0 {
-       req.Header.Set("Referer", spider.Referer)
+   if len(spider.Referer) == 0 {
+       spider.Referer = ""
    }
+   req.Header.Set("Referer", spider.Referer)
 
    spider.Response, err = client.Do(req)
    if err != nil {
src/util.go (42 changed lines)
@@ -2,10 +2,6 @@ package main
 
 import (
    "strconv"
-   "math/rand"
-   "time"
-   "io"
-   "net/http"
 )
 
 type Job struct {
@@ -24,6 +20,7 @@ type Job struct {
    Locality  int   `json:"locality"`
    Status    State `json:"status"`
    NumberGPU int   `json:"number_GPU"`
+   Retries   int   `json:"retries"`
 }
 
 type Task struct {
@@ -65,43 +62,10 @@ type ResourceCount struct {
    Memory int
 }
 
-func str2int(str string, defaultValue int) int {
-   i, err := strconv.Atoi(str)
+func str2int(s string, defaultValue int) int {
+   i, err := strconv.Atoi(s)
    if err == nil {
        return i
    }
    return defaultValue
 }
-
-func getUA() string {
-   rand.Seed(time.Now().Unix())
-   UAs := []string{
-       "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0",
-       "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0",
-       "Mozilla/5.0 (X11; Linux i586; rv:63.0) Gecko/20100101 Firefox/63.0",
-       "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
-       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:10.0) Gecko/20100101 Firefox/62.0",
-       "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.13; ko; rv:1.9.1b2) Gecko/20081201 Firefox/60.0",
-       "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/58.0",
-       "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
-       "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14931",
-       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
-       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
-   }
-   return UAs[rand.Intn(len(UAs))]
-}
-
-func doRequest(method string, url string, r io.Reader, contentType string, referer string) (*http.Response, error) {
-   client := &http.Client{}
-   req, err := http.NewRequest(method, url, r)
-   if err != nil {
-       return nil, err
-   }
-
-   req.Header.Set("Content-Type", contentType)
-   req.Header.Set("User-Agent", getUA())
-   req.Header.Set("Referer", referer)
-
-   resp, err := client.Do(req)
-   return resp, err
-}