mirror of
https://github.com/newnius/YAO-scheduler.git
synced 2025-12-15 08:16:43 +00:00
bugfix
This commit is contained in:
@@ -3,11 +3,14 @@ package main
|
|||||||
import (
|
import (
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
"sync"
|
"sync"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Optimizer struct {
|
type Optimizer struct {
|
||||||
scheduler Scheduler
|
scheduler Scheduler
|
||||||
killedFlag bool
|
killedFlag bool
|
||||||
|
|
||||||
|
predicts map[string]OptimizerJobExecutionTime
|
||||||
}
|
}
|
||||||
|
|
||||||
var optimizerInstance *Optimizer
|
var optimizerInstance *Optimizer
|
||||||
@@ -19,11 +22,42 @@ func InstanceOfOptimizer() *Optimizer {
|
|||||||
|
|
||||||
if optimizerInstance == nil {
|
if optimizerInstance == nil {
|
||||||
optimizerInstance = &Optimizer{}
|
optimizerInstance = &Optimizer{}
|
||||||
|
optimizerInstance.predicts = map[string]OptimizerJobExecutionTime{}
|
||||||
}
|
}
|
||||||
return optimizerInstance
|
return optimizerInstance
|
||||||
}
|
}
|
||||||
|
|
||||||
func (jhl *Optimizer) feed(job string, utils []int) {
|
func (optimizer *Optimizer) feed(job string, utils []int) {
|
||||||
log.Info("optimizer feed")
|
log.Info("optimizer feed")
|
||||||
log.Info(job, utils)
|
log.Info(job, utils)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
str := strings.Split(job, "-")
|
||||||
|
if len(str) == 2 {
|
||||||
|
preCnt := 0
|
||||||
|
for i := 0; i < len(utils); i++ {
|
||||||
|
if utils[i] > 15 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
preCnt++
|
||||||
|
}
|
||||||
|
|
||||||
|
postCnt := 0
|
||||||
|
for i := len(utils); i >= 0; i-- {
|
||||||
|
if utils[i] > 15 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
postCnt++
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, ok := optimizer.predicts[str[0]]; !ok {
|
||||||
|
optimizer.predicts[str[0]] = OptimizerJobExecutionTime{}
|
||||||
|
}
|
||||||
|
predict := optimizer.predicts[str[0]]
|
||||||
|
predict.Pre = ((predict.Pre * predict.Version) + preCnt) / (predict.Version + 1)
|
||||||
|
predict.Post = ((predict.Post * predict.Version) + postCnt) / (predict.Version + 1)
|
||||||
|
predict.Total = ((predict.Total * predict.Version) + len(utils)) / (predict.Version + 1)
|
||||||
|
predict.Version++
|
||||||
|
}
|
||||||
|
}()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,8 +27,9 @@ type ResourcePool struct {
|
|||||||
counter int
|
counter int
|
||||||
counterTotal int
|
counterTotal int
|
||||||
|
|
||||||
bindings map[string]map[string]bool
|
bindings map[string]map[string]bool
|
||||||
utils map[string][]int
|
utils map[string][]int
|
||||||
|
bindingMu sync.Mutex
|
||||||
}
|
}
|
||||||
|
|
||||||
func (pool *ResourcePool) start() {
|
func (pool *ResourcePool) start() {
|
||||||
@@ -210,6 +211,8 @@ func (pool *ResourcePool) releaseNetwork(network string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (pool *ResourcePool) attach(GPU string, job string) {
|
func (pool *ResourcePool) attach(GPU string, job string) {
|
||||||
|
pool.bindingMu.Lock()
|
||||||
|
defer pool.bindingMu.Unlock()
|
||||||
if _, ok := pool.bindings[GPU]; !ok {
|
if _, ok := pool.bindings[GPU]; !ok {
|
||||||
pool.bindings[GPU] = map[string]bool{}
|
pool.bindings[GPU] = map[string]bool{}
|
||||||
}
|
}
|
||||||
@@ -221,6 +224,8 @@ func (pool *ResourcePool) attach(GPU string, job string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (pool *ResourcePool) detach(GPU string, jobName string) {
|
func (pool *ResourcePool) detach(GPU string, jobName string) {
|
||||||
|
pool.bindingMu.Lock()
|
||||||
|
defer pool.bindingMu.Unlock()
|
||||||
if _, ok := pool.bindings[GPU]; ok {
|
if _, ok := pool.bindings[GPU]; ok {
|
||||||
if len(pool.bindings[GPU]) == 1 {
|
if len(pool.bindings[GPU]) == 1 {
|
||||||
InstanceOfOptimizer().feed(jobName, pool.utils[GPU])
|
InstanceOfOptimizer().feed(jobName, pool.utils[GPU])
|
||||||
|
|||||||
@@ -206,6 +206,10 @@ func (scheduler *SchedulerFair) ReleaseResource(job Job, agent NodeStatus) {
|
|||||||
for j := range nodes.Status {
|
for j := range nodes.Status {
|
||||||
if gpu.UUID == nodes.Status[j].UUID {
|
if gpu.UUID == nodes.Status[j].UUID {
|
||||||
nodes.Status[j].MemoryAllocated -= gpu.MemoryTotal
|
nodes.Status[j].MemoryAllocated -= gpu.MemoryTotal
|
||||||
|
if nodes.Status[j].MemoryAllocated < 0 {
|
||||||
|
// in case error
|
||||||
|
nodes.Status[j].MemoryAllocated = 0
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -376,4 +380,4 @@ func (scheduler *SchedulerFair) Attach(GPU string, job string) {
|
|||||||
|
|
||||||
func (scheduler *SchedulerFair) Detach(GPU string, job string) {
|
func (scheduler *SchedulerFair) Detach(GPU string, job string) {
|
||||||
pool.detach(GPU, job)
|
pool.detach(GPU, job)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -111,7 +111,7 @@ type GPUStatus struct {
|
|||||||
type NodeStatus struct {
|
type NodeStatus struct {
|
||||||
ClientID string `json:"id"`
|
ClientID string `json:"id"`
|
||||||
ClientHost string `json:"host"`
|
ClientHost string `json:"host"`
|
||||||
Version float64 `json:"version"`
|
Version float64 `json:"version"`
|
||||||
NumCPU int `json:"cpu_num"`
|
NumCPU int `json:"cpu_num"`
|
||||||
UtilCPU float64 `json:"cpu_load"`
|
UtilCPU float64 `json:"cpu_load"`
|
||||||
MemTotal int `json:"mem_total"`
|
MemTotal int `json:"mem_total"`
|
||||||
@@ -164,6 +164,13 @@ type MsgGroupList struct {
|
|||||||
Groups []Group `json:"groups"`
|
Groups []Group `json:"groups"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type OptimizerJobExecutionTime struct {
|
||||||
|
Pre int `json:"pre"`
|
||||||
|
Post int `json:"post"`
|
||||||
|
Total int `json:"total"`
|
||||||
|
Version int `json:"version"`
|
||||||
|
}
|
||||||
|
|
||||||
func str2int(str string, defaultValue int) int {
|
func str2int(str string, defaultValue int) int {
|
||||||
i, err := strconv.Atoi(str)
|
i, err := strconv.Atoi(str)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
|||||||
Reference in New Issue
Block a user