mirror of https://github.com/newnius/YAO-scheduler.git synced 2025-12-15 08:16:43 +00:00

add concurrent

commit 7e1c4349b9
parent 2705e12c52
Date: 2020-04-13 22:35:17 +08:00
5 changed files with 168 additions and 91 deletions

View File

@@ -207,7 +207,6 @@ func main() {
     InstanceJobHistoryLogger().init()

     pool = &ResourcePool{}
-    pool.nodes = make(map[string]NodeStatus)
     pool.start()

     switch config.SchedulerPolicy {

View File

@@ -8,11 +8,15 @@ import (
     log "github.com/sirupsen/logrus"
     "math/rand"
     "strconv"
+    "hash/fnv"
 )

 type ResourcePool struct {
-    mu    sync.Mutex
-    nodes map[string]NodeStatus
+    //mu    sync.Mutex
+    //nodes map[string]NodeStatus
+    pools      []map[string]NodeStatus
+    poolsMu    []sync.Mutex
+    poolsCount int

     history []PoolStatus
@@ -33,6 +37,12 @@ type ResourcePool struct {
     utils map[string][]int
 }

+func (pool *ResourcePool) getNodePool(name string) int {
+    h := fnv.New32a()
+    h.Write([]byte(name))
+    return int(h.Sum32()) % pool.poolsCount
+}
+
 func (pool *ResourcePool) start() {
     //TODO: retrieve networks from yao-agent-master in blocking io
     pool.networks = map[string]bool{}
@@ -42,6 +52,12 @@ func (pool *ResourcePool) start() {
     pool.bindings = map[string]map[string]bool{}
     pool.utils = map[string][]int{}

+    pool.poolsCount = 10
+    for i := 0; i < pool.poolsCount; i++ {
+        pool.pools = append(pool.pools, map[string]NodeStatus{})
+        pool.poolsMu = append(pool.poolsMu, sync.Mutex{})
+    }
+
     /* check dead nodes */
     go func() {
         pool.heartBeat = map[string]time.Time{}
@@ -50,10 +66,11 @@ func (pool *ResourcePool) start() {
             pool.heartBeatMu.Lock()
             for k, v := range pool.heartBeat {
                 if v.Add(time.Second * 30).Before(time.Now()) {
-                    pool.mu.Lock()
-                    delete(pool.nodes, k)
+                    poolID := pool.getNodePool(k)
+                    pool.poolsMu[poolID].Lock()
+                    delete(pool.pools[poolID], k)
                     delete(pool.versions, k)
-                    pool.mu.Unlock()
+                    pool.poolsMu[poolID].Unlock()
                 }
             }
             pool.heartBeatMu.Unlock()
@@ -78,24 +95,27 @@ func (pool *ResourcePool) start() {
             UtilGPU := 0
             TotalMemGPU := 0
             AvailableMemGPU := 0
-            pool.mu.Lock()
-            for _, node := range pool.nodes {
-                UtilCPU += node.UtilCPU
-                TotalCPU += node.NumCPU
-                TotalMem += node.MemTotal
-                AvailableMem += node.MemAvailable
+            nodesCount := 0
+            for i := 0; i < pool.poolsCount; i++ {
+                pool.poolsMu[i].Lock()
+                for _, node := range pool.pools[i] {
+                    UtilCPU += node.UtilCPU
+                    TotalCPU += node.NumCPU
+                    TotalMem += node.MemTotal
+                    AvailableMem += node.MemAvailable
                 for _, GPU := range node.Status {
                     UtilGPU += GPU.UtilizationGPU
                     TotalGPU ++
                     TotalMemGPU += GPU.MemoryTotal
                     AvailableMemGPU += GPU.MemoryFree
+                    }
                 }
+                nodesCount += len(pool.pools[i])
+                pool.poolsMu[i].Unlock()
             }
-            size := len(pool.nodes)
-            pool.mu.Unlock()
             summary.TimeStamp = time.Now().Format("2006-01-02 15:04:05")
-            summary.UtilCPU = UtilCPU / (float64(size) + 0.001)
+            summary.UtilCPU = UtilCPU / (float64(nodesCount) + 0.001)
             summary.TotalCPU = TotalCPU
             summary.TotalMem = TotalMem
             summary.AvailableMem = AvailableMem
@@ -119,8 +139,10 @@ func (pool *ResourcePool) start() {
 }

 func (pool *ResourcePool) update(node NodeStatus) {
-    pool.mu.Lock()
-    defer pool.mu.Unlock()
+    poolID := pool.getNodePool(node.ClientID)
+
+    pool.poolsMu[poolID].Lock()
+    defer pool.poolsMu[poolID].Unlock()

     go func(node NodeStatus) {
         pool.bindingsMu.Lock()
@@ -145,7 +167,7 @@ func (pool *ResourcePool) update(node NodeStatus) {
     log.Debug(node.Version, "!=", pool.versions[node.ClientID])
     pool.counter++

-    status, ok := pool.nodes[node.ClientID]
+    status, ok := pool.pools[poolID][node.ClientID]
     if ok {
         for i, GPU := range status.Status {
             if GPU.UUID == node.Status[i].UUID {
@@ -153,16 +175,17 @@ func (pool *ResourcePool) update(node NodeStatus) {
             }
         }
     }
-    pool.nodes[node.ClientID] = node
+    pool.pools[poolID][node.ClientID] = node
     pool.versions[node.ClientID] = node.Version
-    log.Debug(pool.nodes)
 }

 func (pool *ResourcePool) getByID(id string) NodeStatus {
-    pool.mu.Lock()
-    defer pool.mu.Unlock()
-    status, ok := pool.nodes[id]
+    poolID := pool.getNodePool(id)
+
+    pool.poolsMu[poolID].Lock()
+    defer pool.poolsMu[poolID].Unlock()
+    status, ok := pool.pools[poolID][id]
     if ok {
         return status
     }
@@ -170,7 +193,15 @@ func (pool *ResourcePool) getByID(id string) NodeStatus {
 }

 func (pool *ResourcePool) list() MsgResource {
-    return MsgResource{Code: 0, Resource: pool.nodes}
+    nodes := map[string]NodeStatus{}
+    for i := 0; i < pool.poolsCount; i++ {
+        pool.poolsMu[i].Lock()
+        for k, node := range pool.pools[i] {
+            nodes[k] = node
+        }
+        pool.poolsMu[i].Unlock()
+    }
+    return MsgResource{Code: 0, Resource: nodes}
 }

 func (pool *ResourcePool) statusHistory() MsgPoolStatusHistory {

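The change above shards the single nodes map into poolsCount sub-maps, each guarded by its own mutex, with a node assigned to a shard by an FNV-1a hash of its name, so heartbeats for different nodes rarely contend on the same lock. Below is a minimal, self-contained sketch of that pattern; the type and method names (ShardedPool, Update, Get) are illustrative, not the repository's own.

package main

import (
    "fmt"
    "hash/fnv"
    "sync"
)

// NodeStatus is a simplified stand-in for the scheduler's node record.
type NodeStatus struct {
    ClientID string
    NumCPU   int
}

// ShardedPool splits the node map into count shards, one mutex per shard.
type ShardedPool struct {
    shards   []map[string]NodeStatus
    shardsMu []sync.Mutex
    count    int
}

func NewShardedPool(count int) *ShardedPool {
    p := &ShardedPool{count: count}
    for i := 0; i < count; i++ {
        p.shards = append(p.shards, map[string]NodeStatus{})
        p.shardsMu = append(p.shardsMu, sync.Mutex{})
    }
    return p
}

// shardOf maps a node name to a shard via FNV-1a, mirroring getNodePool.
func (p *ShardedPool) shardOf(name string) int {
    h := fnv.New32a()
    h.Write([]byte(name))
    return int(h.Sum32()) % p.count
}

// Update locks only the shard that owns this node.
func (p *ShardedPool) Update(node NodeStatus) {
    id := p.shardOf(node.ClientID)
    p.shardsMu[id].Lock()
    defer p.shardsMu[id].Unlock()
    p.shards[id][node.ClientID] = node
}

// Get looks a node up in its shard under that shard's lock.
func (p *ShardedPool) Get(clientID string) (NodeStatus, bool) {
    id := p.shardOf(clientID)
    p.shardsMu[id].Lock()
    defer p.shardsMu[id].Unlock()
    n, ok := p.shards[id][clientID]
    return n, ok
}

func main() {
    pool := NewShardedPool(10)
    pool.Update(NodeStatus{ClientID: "node-1", NumCPU: 8})
    if n, ok := pool.Get("node-1"); ok {
        fmt.Println(n.ClientID, n.NumCPU)
    }
}

Whole-pool operations such as list() or Summary() still visit every shard, but they hold only one shard lock at a time instead of blocking all updates for the full scan.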
View File

@@ -4,6 +4,7 @@ import (
     "sync"
     "time"
     log "github.com/sirupsen/logrus"
+    "math/rand"
 )

 type SchedulerFCFS struct {
@@ -86,11 +87,12 @@ func (scheduler *SchedulerFCFS) Schedule(job Job) {
 }

 func (scheduler *SchedulerFCFS) AcquireResource(job Job, task Task) NodeStatus {
-    pool.mu.Lock()
-    defer pool.mu.Unlock()
+    poolID := rand.Intn(pool.poolsCount)
+    pool.poolsMu[poolID].Lock()
+    defer pool.poolsMu[poolID].Unlock()
     res := NodeStatus{}
-    for id, node := range pool.nodes {
+    for id, node := range pool.pools[poolID] {
         var available []GPUStatus
         for _, status := range node.Status {
             if status.MemoryTotal-status.MemoryAllocated >= task.MemoryGPU {
@@ -101,6 +103,8 @@ func (scheduler *SchedulerFCFS) AcquireResource(job Job, task Task) NodeStatus {
             res.ClientID = id
             res.ClientHost = node.ClientHost
             res.Status = available[0:task.NumberGPU]
+            res.NumCPU = task.NumberCPU
+            res.MemTotal = task.Memory

             for i := range res.Status {
                 for j := range node.Status {
@@ -117,13 +121,20 @@ func (scheduler *SchedulerFCFS) AcquireResource(job Job, task Task) NodeStatus {
 }

 func (scheduler *SchedulerFCFS) ReleaseResource(job Job, agent NodeStatus) {
-    pool.mu.Lock()
-    defer pool.mu.Unlock()
-    nodes := pool.nodes[agent.ClientID]
+    poolID := rand.Intn(pool.poolsCount)
+    pool.poolsMu[poolID].Lock()
+    defer pool.poolsMu[poolID].Unlock()
+    node := pool.pools[poolID][agent.ClientID]
     for _, gpu := range agent.Status {
-        for j := range nodes.Status {
-            if gpu.UUID == nodes.Status[j].UUID {
-                nodes.Status[j].MemoryAllocated -= gpu.MemoryTotal
+        for j := range node.Status {
+            if gpu.UUID == node.Status[j].UUID {
+                node.Status[j].MemoryAllocated -= gpu.MemoryTotal
+                if node.Status[j].MemoryAllocated < 0 {
+                    // in case of error
+                    log.Warn(node.ClientID, "More Memory Allocated")
+                    node.Status[j].MemoryAllocated = 0
+                }
             }
         }
     }
@@ -199,14 +210,18 @@ func (scheduler *SchedulerFCFS) Summary() MsgSummary {
     FreeGPU := 0
     UsingGPU := 0
-    for _, node := range pool.nodes {
-        for j := range node.Status {
-            if node.Status[j].MemoryAllocated == 0 {
-                FreeGPU++
-            } else {
-                UsingGPU++
+    for i := 0; i < pool.poolsCount; i++ {
+        pool.poolsMu[i].Lock()
+        for _, node := range pool.pools[i] {
+            for j := range node.Status {
+                if node.Status[j].MemoryAllocated == 0 {
+                    FreeGPU++
+                } else {
+                    UsingGPU++
+                }
             }
         }
+        pool.poolsMu[i].Unlock()
     }
     summary.FreeGPU = FreeGPU
     summary.UsingGPU = UsingGPU

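With the pool sharded, the FCFS scheduler (and the priority scheduler further below) no longer scans every node under one global lock: AcquireResource draws one shard with rand.Intn(pool.poolsCount) and searches only that shard for a node with enough free GPU memory. A rough sketch of that selection step follows, using a simplified GPU/node model and hypothetical names (pickGPUs, Shard) that are not taken from the repository.

package main

import (
    "fmt"
    "math/rand"
    "sync"
)

// Simplified stand-ins for the scheduler's GPU and node records.
type GPU struct {
    UUID            string
    MemoryTotal     int
    MemoryAllocated int
}

type Node struct {
    ClientID string
    Status   []GPU
}

type Shard struct {
    mu    sync.Mutex
    nodes map[string]Node
}

// pickGPUs locks one randomly chosen shard and returns the first node in it
// that has `count` GPUs with at least `memGPU` MiB free, marking them
// allocated; it reports false if that shard holds no fit.
func pickGPUs(shards []*Shard, count, memGPU int) (string, []GPU, bool) {
    s := shards[rand.Intn(len(shards))]
    s.mu.Lock()
    defer s.mu.Unlock()
    for id, node := range s.nodes {
        var available []GPU
        for _, g := range node.Status {
            if g.MemoryTotal-g.MemoryAllocated >= memGPU {
                available = append(available, g)
            }
        }
        if len(available) >= count {
            for j := range node.Status {
                for i := 0; i < count; i++ {
                    if node.Status[j].UUID == available[i].UUID {
                        node.Status[j].MemoryAllocated += memGPU
                    }
                }
            }
            s.nodes[id] = node
            return id, available[:count], true
        }
    }
    return "", nil, false
}

func main() {
    shard := &Shard{nodes: map[string]Node{
        "node-1": {ClientID: "node-1", Status: []GPU{{UUID: "gpu-0", MemoryTotal: 16000}}},
    }}
    if id, gpus, ok := pickGPUs([]*Shard{shard}, 1, 8000); ok {
        fmt.Println("placed on", id, "using", gpus[0].UUID)
    }
}

Searching a single random shard trades placement quality for lock granularity: a request may miss a fitting node that lives in another shard, but no allocation ever blocks heartbeats on the rest of the pool.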
View File

@@ -5,6 +5,7 @@ import (
     "time"
     log "github.com/sirupsen/logrus"
     "sort"
+    "math/rand"
 )

 type ResourceCount struct {
@@ -17,8 +18,9 @@ type ResourceCount struct {
 type SchedulerFair struct {
     history []*Job
     queues  map[string][]Job
-    mu         sync.Mutex
-    scheduling sync.Mutex
+    queueMu           sync.Mutex
+    schedulingMu      sync.Mutex
+    schedulingJobsCnt int
     jobs      map[string]*JobManager
     nextQueue string
     resourceAllocations map[string]*ResourceCount
@@ -46,6 +48,7 @@ func (scheduler *SchedulerFair) Start() {
     scheduler.queues["default"] = []Job{}
     scheduler.resourceAllocations = map[string]*ResourceCount{}
     scheduler.enabled = true
+    scheduler.schedulingJobsCnt = 0

     go func() {
         for {
@@ -54,8 +57,14 @@ func (scheduler *SchedulerFair) Start() {
             if !scheduler.enabled {
                 continue
             }
-            scheduler.scheduling.Lock()
-            scheduler.mu.Lock()
+            scheduler.schedulingMu.Lock()
+            if scheduler.schedulingJobsCnt >= pool.poolsCount {
+                scheduler.schedulingMu.Unlock()
+                continue
+            }
+            scheduler.schedulingJobsCnt++
+            scheduler.schedulingMu.Unlock()
+            scheduler.queueMu.Lock()
             queue := scheduler.nextQueue
             if len(scheduler.queues[queue]) > 0 {
                 jm := JobManager{}
@@ -72,13 +81,12 @@ func (scheduler *SchedulerFair) Start() {
                     jm.start()
                 }()
             } else {
-                log.Info("No more jobs to scheduling")
-                scheduler.scheduling.Unlock()
+                log.Info("No more jobs to scheduling", time.Now())
                 go func() {
                     scheduler.UpdateNextQueue()
                 }()
             }
-            scheduler.mu.Unlock()
+            scheduler.queueMu.Unlock()
         }
     }()
 }
@@ -86,7 +94,9 @@ func (scheduler *SchedulerFair) Start() {
 func (scheduler *SchedulerFair) UpdateProgress(jobName string, state State) {
     switch state {
     case Running:
-        scheduler.scheduling.Unlock()
+        scheduler.schedulingMu.Lock()
+        scheduler.schedulingJobsCnt--
+        scheduler.schedulingMu.Unlock()

         for i := range scheduler.history {
             if scheduler.history[i].Name == jobName {
@@ -115,8 +125,8 @@ func (scheduler *SchedulerFair) UpdateProgress(jobName string, state State) {
 }

 func (scheduler *SchedulerFair) Schedule(job Job) {
-    scheduler.mu.Lock()
-    defer scheduler.mu.Unlock()
+    scheduler.queueMu.Lock()
+    defer scheduler.queueMu.Unlock()

     queue := job.Group
     _, ok := scheduler.queues[queue]
@@ -156,11 +166,12 @@ func (scheduler *SchedulerFair) Schedule(job Job) {
 }

 func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
-    pool.mu.Lock()
-    defer pool.mu.Unlock()
+    poolID := rand.Intn(pool.poolsCount)
+    pool.poolsMu[poolID].Lock()
+    defer pool.poolsMu[poolID].Unlock()
     res := NodeStatus{}
-    for id, node := range pool.nodes {
+    for id, node := range pool.pools[poolID] {
         var available []GPUStatus
         for _, status := range node.Status {
             if status.MemoryTotal-status.MemoryAllocated >= task.MemoryGPU {
@@ -206,9 +217,11 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
 }

 func (scheduler *SchedulerFair) ReleaseResource(job Job, agent NodeStatus) {
-    pool.mu.Lock()
-    defer pool.mu.Unlock()
-    node := pool.nodes[agent.ClientID]
+    poolID := rand.Intn(pool.poolsCount)
+    pool.poolsMu[poolID].Lock()
+    defer pool.poolsMu[poolID].Unlock()
+    node := pool.pools[poolID][agent.ClientID]
     for _, gpu := range agent.Status {
         for j := range node.Status {
             if gpu.UUID == node.Status[j].UUID {
@@ -313,17 +326,19 @@ func (scheduler *SchedulerFair) Summary() MsgSummary {
     FreeGPU := 0
     UsingGPU := 0
-    pool.mu.Lock()
-    for _, node := range pool.nodes {
-        for j := range node.Status {
-            if node.Status[j].MemoryAllocated == 0 {
-                FreeGPU++
-            } else {
-                UsingGPU++
+    for i := 0; i < pool.poolsCount; i++ {
+        pool.poolsMu[i].Lock()
+        for _, node := range pool.pools[i] {
+            for j := range node.Status {
+                if node.Status[j].MemoryAllocated == 0 {
+                    FreeGPU++
+                } else {
+                    UsingGPU++
+                }
             }
         }
+        pool.poolsMu[i].Unlock()
     }
-    pool.mu.Unlock()
     summary.FreeGPU = FreeGPU
     summary.UsingGPU = UsingGPU
@@ -346,16 +361,18 @@ func (scheduler *SchedulerFair) UpdateNextQueue() {
     MemoryGPU := 0.00001
     CPU := 0.00001
     Memory := 0.0001
-    pool.mu.Lock()
-    for _, node := range pool.nodes {
-        CPU += float64(node.NumCPU)
-        Memory += float64(node.MemTotal)
-        for _, card := range node.Status {
-            NumberGPU += 1.0
-            MemoryGPU += float64(card.MemoryTotal)
+    for i := 0; i < pool.poolsCount; i++ {
+        pool.poolsMu[i].Lock()
+        for _, node := range pool.pools[i] {
+            CPU += float64(node.NumCPU)
+            Memory += float64(node.MemTotal)
+            for _, card := range node.Status {
+                NumberGPU += 1.0
+                MemoryGPU += float64(card.MemoryTotal)
+            }
         }
+        pool.poolsMu[i].Unlock()
     }
-    pool.mu.Unlock()

     for k, t := range scheduler.queues {
         if len(t) == 0 {

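SchedulerFair also drops the scheduling mutex that was held across an entire placement and instead caps in-flight scheduling with a counter: schedulingJobsCnt is incremented under schedulingMu before a job is handed to a JobManager and decremented when UpdateProgress sees the job reach Running. A minimal sketch of that gate, with illustrative names (slotGate, tryAcquire, release) that are not the repository's own:

package main

import (
    "fmt"
    "sync"
)

// slotGate limits how many jobs may be in the scheduling phase at once,
// mirroring the schedulingJobsCnt/schedulingMu pair in SchedulerFair.
type slotGate struct {
    mu    sync.Mutex
    inUse int
    limit int
}

// tryAcquire takes a slot if one is free and reports whether it succeeded;
// the caller skips this scheduling round when it returns false.
func (g *slotGate) tryAcquire() bool {
    g.mu.Lock()
    defer g.mu.Unlock()
    if g.inUse >= g.limit {
        return false
    }
    g.inUse++
    return true
}

// release frees a slot once the job reports Running.
func (g *slotGate) release() {
    g.mu.Lock()
    g.inUse--
    g.mu.Unlock()
}

func main() {
    gate := &slotGate{limit: 10}
    if gate.tryAcquire() {
        fmt.Println("dispatching job to a JobManager")
        gate.release()
    }
}

Because the mutex is held only around the counter update, several jobs can run their placement concurrently, up to the limit, instead of serializing behind one scheduling lock.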
View File

@@ -4,6 +4,7 @@ import (
     "sync"
     "time"
     log "github.com/sirupsen/logrus"
+    "math/rand"
 )

 type SchedulerPriority struct {
@@ -110,11 +111,12 @@ func (scheduler *SchedulerPriority) Schedule(job Job) {
 }

 func (scheduler *SchedulerPriority) AcquireResource(job Job, task Task) NodeStatus {
-    pool.mu.Lock()
-    defer pool.mu.Unlock()
+    poolID := rand.Intn(pool.poolsCount)
+    pool.poolsMu[poolID].Lock()
+    defer pool.poolsMu[poolID].Unlock()
     res := NodeStatus{}
-    for id, node := range pool.nodes {
+    for id, node := range pool.pools[poolID] {
         var available []GPUStatus
         for _, status := range node.Status {
             if status.MemoryTotal-status.MemoryAllocated >= task.MemoryGPU {
@@ -125,6 +127,8 @@ func (scheduler *SchedulerPriority) AcquireResource(job Job, task Task) NodeStatus {
             res.ClientID = id
             res.ClientHost = node.ClientHost
             res.Status = available[0:task.NumberGPU]
+            res.NumCPU = task.NumberCPU
+            res.MemTotal = task.Memory

             for i := range res.Status {
                 for j := range node.Status {
@@ -141,13 +145,20 @@ func (scheduler *SchedulerPriority) AcquireResource(job Job, task Task) NodeStatus {
 }

 func (scheduler *SchedulerPriority) ReleaseResource(job Job, agent NodeStatus) {
-    pool.mu.Lock()
-    defer pool.mu.Unlock()
-    nodes := pool.nodes[agent.ClientID]
+    poolID := rand.Intn(pool.poolsCount)
+    pool.poolsMu[poolID].Lock()
+    defer pool.poolsMu[poolID].Unlock()
+    node := pool.pools[poolID][agent.ClientID]
     for _, gpu := range agent.Status {
-        for j := range nodes.Status {
-            if gpu.UUID == nodes.Status[j].UUID {
-                nodes.Status[j].MemoryAllocated -= gpu.MemoryTotal
+        for j := range node.Status {
+            if gpu.UUID == node.Status[j].UUID {
+                node.Status[j].MemoryAllocated -= gpu.MemoryTotal
+                if node.Status[j].MemoryAllocated < 0 {
+                    // in case of error
+                    log.Warn(node.ClientID, "More Memory Allocated")
+                    node.Status[j].MemoryAllocated = 0
+                }
             }
         }
     }
@@ -223,14 +234,18 @@ func (scheduler *SchedulerPriority) Summary() MsgSummary {
     FreeGPU := 0
     UsingGPU := 0
-    for _, node := range pool.nodes {
-        for j := range node.Status {
-            if node.Status[j].MemoryAllocated == 0 {
-                FreeGPU++
-            } else {
-                UsingGPU++
+    for i := 0; i < pool.poolsCount; i++ {
+        pool.poolsMu[i].Lock()
+        for _, node := range pool.pools[i] {
+            for j := range node.Status {
+                if node.Status[j].MemoryAllocated == 0 {
+                    FreeGPU++
+                } else {
+                    UsingGPU++
+                }
             }
         }
+        pool.poolsMu[i].Unlock()
     }
     summary.FreeGPU = FreeGPU
     summary.UsingGPU = UsingGPU