mirror of
https://github.com/newnius/YAO-scheduler.git
synced 2025-06-07 22:31:55 +00:00
update preempt
This commit is contained in:
parent
0d67d4558e
commit
6f0a9617e4
@ -86,3 +86,8 @@ GPU is occupied by which job(s)
|
|||||||
```
|
```
|
||||||
?action=debug_scheduler_dump
|
?action=debug_scheduler_dump
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**DescribeJob**
|
||||||
|
```
|
||||||
|
?action=debug_optimizer_describe_job&job=
|
||||||
|
```
|
@ -22,6 +22,8 @@ type JobManager struct {
|
|||||||
killFlag bool
|
killFlag bool
|
||||||
|
|
||||||
network string
|
network string
|
||||||
|
|
||||||
|
stats [][]TaskStatus
|
||||||
}
|
}
|
||||||
|
|
||||||
func (jm *JobManager) start() {
|
func (jm *JobManager) start() {
|
||||||
@ -320,6 +322,8 @@ func (jm *JobManager) status() MsgJobStatus {
|
|||||||
go func() {
|
go func() {
|
||||||
jm.checkStatus(tasksStatus)
|
jm.checkStatus(tasksStatus)
|
||||||
}()
|
}()
|
||||||
|
jm.stats = append(jm.stats, tasksStatus)
|
||||||
|
|
||||||
}
|
}
|
||||||
return MsgJobStatus{Status: tasksStatus}
|
return MsgJobStatus{Status: tasksStatus}
|
||||||
}
|
}
|
||||||
|
@ -16,4 +16,8 @@ type TaskStatus struct {
|
|||||||
FinishedAt string `json:"finished_at"`
|
FinishedAt string `json:"finished_at"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
State map[string]interface{} `json:"state"`
|
State map[string]interface{} `json:"state"`
|
||||||
|
UtilCPU float64 `json:"cpu"`
|
||||||
|
Mem float64 `json:"mem"`
|
||||||
|
BwRX float64 `json:"bw_rx"`
|
||||||
|
BWTx float64 `json:"bw_tx"`
|
||||||
}
|
}
|
||||||
|
@ -256,6 +256,15 @@ func serverAPI(w http.ResponseWriter, r *http.Request) {
|
|||||||
w.Write(js)
|
w.Write(js)
|
||||||
break
|
break
|
||||||
|
|
||||||
|
case "debug_optimizer_describe_job":
|
||||||
|
log.Debug("debug_optimizer_describe_job")
|
||||||
|
var job string
|
||||||
|
job = r.URL.Query().Get("job")
|
||||||
|
js, _ := json.Marshal(InstanceOfOptimizer().describe(job))
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
w.Write(js)
|
||||||
|
break
|
||||||
|
|
||||||
case "debug_optimizer_train_dl":
|
case "debug_optimizer_train_dl":
|
||||||
log.Debug("debug_optimizer_train_dl")
|
log.Debug("debug_optimizer_train_dl")
|
||||||
InstanceOfOptimizer().train(r.URL.Query().Get("job"))
|
InstanceOfOptimizer().train(r.URL.Query().Get("job"))
|
||||||
|
@ -8,6 +8,7 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"time"
|
"time"
|
||||||
|
"math"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Optimizer struct {
|
type Optimizer struct {
|
||||||
@ -19,6 +20,8 @@ type Optimizer struct {
|
|||||||
jobUtilsGPU map[string]*OptimizerUtilGPU
|
jobUtilsGPU map[string]*OptimizerUtilGPU
|
||||||
|
|
||||||
cache map[string]*OptimizerJobExecutionTime
|
cache map[string]*OptimizerJobExecutionTime
|
||||||
|
|
||||||
|
stats map[string]map[string]float64
|
||||||
}
|
}
|
||||||
|
|
||||||
var optimizerInstance *Optimizer
|
var optimizerInstance *Optimizer
|
||||||
@ -33,6 +36,7 @@ func InstanceOfOptimizer() *Optimizer {
|
|||||||
optimizerInstance.predicts = map[string]*OptimizerJobExecutionTime{}
|
optimizerInstance.predicts = map[string]*OptimizerJobExecutionTime{}
|
||||||
optimizerInstance.jobUtilsGPU = map[string]*OptimizerUtilGPU{}
|
optimizerInstance.jobUtilsGPU = map[string]*OptimizerUtilGPU{}
|
||||||
optimizerInstance.cache = map[string]*OptimizerJobExecutionTime{}
|
optimizerInstance.cache = map[string]*OptimizerJobExecutionTime{}
|
||||||
|
optimizerInstance.stats = map[string]map[string]float64{}
|
||||||
}
|
}
|
||||||
return optimizerInstance
|
return optimizerInstance
|
||||||
}
|
}
|
||||||
@ -41,6 +45,65 @@ func (optimizer *Optimizer) init(conf Configuration) {
|
|||||||
log.Info("optimizer started")
|
log.Info("optimizer started")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (optimizer *Optimizer) feedStats(job string, stats [][]TaskStatus) {
|
||||||
|
var UtilsCPU []float64
|
||||||
|
var Mems []float64
|
||||||
|
var BwRxs []float64
|
||||||
|
var BwTxs []float64
|
||||||
|
for _, stat := range stats {
|
||||||
|
for _, task := range stat {
|
||||||
|
UtilsCPU = append(UtilsCPU, task.UtilCPU)
|
||||||
|
Mems = append(Mems, task.Mem)
|
||||||
|
BwRxs = append(BwRxs, task.BwRX)
|
||||||
|
BwTxs = append(BwTxs, task.BWTx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
optimizer.stats[job] = map[string]float64{
|
||||||
|
"cpu": optimizer.mean(UtilsCPU),
|
||||||
|
"cpu_std": optimizer.std(UtilsCPU),
|
||||||
|
"mem": optimizer.max(Mems),
|
||||||
|
"bw_rx": optimizer.mean(BwRxs),
|
||||||
|
"bw_tx": optimizer.mean(BwTxs),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (optimizer *Optimizer) max(values []float64) float64 {
|
||||||
|
value := 0.0
|
||||||
|
for _, v := range values {
|
||||||
|
if v < value {
|
||||||
|
value = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
|
||||||
|
func (optimizer *Optimizer) mean(values []float64) float64 {
|
||||||
|
sum := 0.0
|
||||||
|
for _, v := range values {
|
||||||
|
sum += v
|
||||||
|
}
|
||||||
|
return sum / float64(len(values))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (optimizer *Optimizer) std(values []float64) float64 {
|
||||||
|
mean := optimizer.mean(values)
|
||||||
|
std := 0.0
|
||||||
|
for j := 0; j < len(values); j++ {
|
||||||
|
// The use of Pow math function func Pow(x, y float64) float64
|
||||||
|
std += math.Pow(values[j]-mean, 2)
|
||||||
|
}
|
||||||
|
// The use of Sqrt math function func Sqrt(x float64) float64
|
||||||
|
std = math.Sqrt(std / float64(len(values)))
|
||||||
|
return std
|
||||||
|
}
|
||||||
|
|
||||||
|
func (optimizer *Optimizer) describe(job string) map[string]float64 {
|
||||||
|
if stat, ok := optimizer.stats[job]; ok {
|
||||||
|
return stat
|
||||||
|
}
|
||||||
|
return map[string]float64{}
|
||||||
|
}
|
||||||
|
|
||||||
func (optimizer *Optimizer) feed(job string, utils []UtilGPUTimeSeries) {
|
func (optimizer *Optimizer) feed(job string, utils []UtilGPUTimeSeries) {
|
||||||
log.Info("optimizer feed")
|
log.Info("optimizer feed")
|
||||||
//log.Info(job, utils)
|
//log.Info(job, utils)
|
||||||
|
Loading…
Reference in New Issue
Block a user