Mirror of https://github.com/newnius/YAO-scheduler.git (synced 2025-12-15 08:16:43 +00:00)
add concurrent
@@ -207,7 +207,6 @@ func main() {
 	InstanceJobHistoryLogger().init()
 
 	pool = &ResourcePool{}
-	pool.nodes = make(map[string]NodeStatus)
 	pool.start()
 
 	switch config.SchedulerPolicy {
@@ -8,11 +8,15 @@ import (
 	log "github.com/sirupsen/logrus"
 	"math/rand"
 	"strconv"
+	"hash/fnv"
 )
 
 type ResourcePool struct {
-	mu sync.Mutex
-	nodes map[string]NodeStatus
+	//mu sync.Mutex
+	//nodes map[string]NodeStatus
+	pools []map[string]NodeStatus
+	poolsMu []sync.Mutex
+	poolsCount int
 
 	history []PoolStatus
 
@@ -33,6 +37,12 @@ type ResourcePool struct {
 	utils map[string][]int
 }
 
+func (pool *ResourcePool) getNodePool(name string) int {
+	h := fnv.New32a()
+	h.Write([]byte(name))
+	return int(h.Sum32()) % pool.poolsCount
+}
+
 func (pool *ResourcePool) start() {
 	//TODO: retrieve networks from yao-agent-master in blocking io
 	pool.networks = map[string]bool{}
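The two hunks above carry the core idea of the commit: the single `nodes` map protected by one `mu` is replaced by `poolsCount` smaller maps in `pools`, each guarded by its own mutex in `poolsMu`, and `getNodePool` assigns a node to a shard by taking an FNV-32a hash of its name modulo the shard count. Below is a minimal, self-contained sketch of that sharding pattern using the same standard-library calls (`hash/fnv`, `sync.Mutex`); the `shardedPool` type and its helper names are illustrative, not part of the repository.

package main

import (
	"fmt"
	"hash/fnv"
	"sync"
)

// NodeStatus stands in for the scheduler's node record; only the ID matters here.
type NodeStatus struct {
	ClientID string
}

// shardedPool spreads nodes over several maps so that heartbeats from many
// agents do not all serialize on a single mutex.
type shardedPool struct {
	pools      []map[string]NodeStatus
	poolsMu    []sync.Mutex
	poolsCount int
}

func newShardedPool(count int) *shardedPool {
	p := &shardedPool{poolsCount: count}
	for i := 0; i < count; i++ {
		p.pools = append(p.pools, map[string]NodeStatus{})
		p.poolsMu = append(p.poolsMu, sync.Mutex{})
	}
	return p
}

// getNodePool mirrors the commit: FNV-32a of the node name, modulo the shard count.
func (p *shardedPool) getNodePool(name string) int {
	h := fnv.New32a()
	h.Write([]byte(name))
	return int(h.Sum32()) % p.poolsCount
}

// update locks only the shard that owns the node.
func (p *shardedPool) update(node NodeStatus) {
	id := p.getNodePool(node.ClientID)
	p.poolsMu[id].Lock()
	defer p.poolsMu[id].Unlock()
	p.pools[id][node.ClientID] = node
}

// getByID reads from the owning shard under its lock.
func (p *shardedPool) getByID(name string) (NodeStatus, bool) {
	id := p.getNodePool(name)
	p.poolsMu[id].Lock()
	defer p.poolsMu[id].Unlock()
	node, ok := p.pools[id][name]
	return node, ok
}

func main() {
	p := newShardedPool(10)
	p.update(NodeStatus{ClientID: "node-1"})
	if node, ok := p.getByID("node-1"); ok {
		fmt.Println("found", node.ClientID, "in shard", p.getNodePool(node.ClientID))
	}
}

Because the shard is derived from the node name, updates, lookups, and the dead-node sweep all agree on which mutex protects a given node, while unrelated nodes can be touched in parallel.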
@@ -42,6 +52,12 @@ func (pool *ResourcePool) start() {
 	pool.bindings = map[string]map[string]bool{}
 	pool.utils = map[string][]int{}
 
+	pool.poolsCount = 10
+	for i := 0; i < pool.poolsCount; i++ {
+		pool.pools = append(pool.pools, map[string]NodeStatus{})
+		pool.poolsMu = append(pool.poolsMu, sync.Mutex{})
+	}
+
 	/* check dead nodes */
 	go func() {
 		pool.heartBeat = map[string]time.Time{}
@@ -50,10 +66,11 @@ func (pool *ResourcePool) start() {
 		pool.heartBeatMu.Lock()
 		for k, v := range pool.heartBeat {
 			if v.Add(time.Second * 30).Before(time.Now()) {
-				pool.mu.Lock()
-				delete(pool.nodes, k)
+				poolID := pool.getNodePool(k)
+				pool.poolsMu[poolID].Lock()
+				delete(pool.pools[poolID], k)
 				delete(pool.versions, k)
-				pool.mu.Unlock()
+				pool.poolsMu[poolID].Unlock()
 			}
 		}
 		pool.heartBeatMu.Unlock()
@@ -78,24 +95,27 @@ func (pool *ResourcePool) start() {
 		UtilGPU := 0
 		TotalMemGPU := 0
 		AvailableMemGPU := 0
-		pool.mu.Lock()
-		for _, node := range pool.nodes {
-			UtilCPU += node.UtilCPU
-			TotalCPU += node.NumCPU
-			TotalMem += node.MemTotal
-			AvailableMem += node.MemAvailable
+		nodesCount := 0
+		for i := 0; i < pool.poolsCount; i++ {
+			pool.poolsMu[i].Lock()
+			for _, node := range pool.pools[i] {
+				UtilCPU += node.UtilCPU
+				TotalCPU += node.NumCPU
+				TotalMem += node.MemTotal
+				AvailableMem += node.MemAvailable
 
 			for _, GPU := range node.Status {
 				UtilGPU += GPU.UtilizationGPU
 				TotalGPU ++
 				TotalMemGPU += GPU.MemoryTotal
 				AvailableMemGPU += GPU.MemoryFree
+				}
 			}
+			nodesCount += len(pool.pools[i])
+			pool.poolsMu[i].Unlock()
 		}
-		size := len(pool.nodes)
-		pool.mu.Unlock()
 		summary.TimeStamp = time.Now().Format("2006-01-02 15:04:05")
-		summary.UtilCPU = UtilCPU / (float64(size) + 0.001)
+		summary.UtilCPU = UtilCPU / (float64(nodesCount) + 0.001)
 		summary.TotalCPU = TotalCPU
 		summary.TotalMem = TotalMem
 		summary.AvailableMem = AvailableMem
@@ -119,8 +139,10 @@ func (pool *ResourcePool) start() {
 }
 
 func (pool *ResourcePool) update(node NodeStatus) {
-	pool.mu.Lock()
-	defer pool.mu.Unlock()
+	poolID := pool.getNodePool(node.ClientID)
+
+	pool.poolsMu[poolID].Lock()
+	defer pool.poolsMu[poolID].Unlock()
 
 	go func(node NodeStatus) {
 		pool.bindingsMu.Lock()
@@ -145,7 +167,7 @@ func (pool *ResourcePool) update(node NodeStatus) {
 	log.Debug(node.Version, "!=", pool.versions[node.ClientID])
 
 	pool.counter++
-	status, ok := pool.nodes[node.ClientID]
+	status, ok := pool.pools[poolID][node.ClientID]
 	if ok {
 		for i, GPU := range status.Status {
 			if GPU.UUID == node.Status[i].UUID {
@@ -153,16 +175,17 @@ func (pool *ResourcePool) update(node NodeStatus) {
 			}
 		}
 	}
-	pool.nodes[node.ClientID] = node
+	pool.pools[poolID][node.ClientID] = node
 	pool.versions[node.ClientID] = node.Version
-	log.Debug(pool.nodes)
 }
 
 func (pool *ResourcePool) getByID(id string) NodeStatus {
-	pool.mu.Lock()
-	defer pool.mu.Unlock()
+	poolID := pool.getNodePool(id)
 
-	status, ok := pool.nodes[id]
+	pool.poolsMu[poolID].Lock()
+	defer pool.poolsMu[poolID].Unlock()
+
+	status, ok := pool.pools[poolID][id]
 	if ok {
 		return status
 	}
@@ -170,7 +193,15 @@ func (pool *ResourcePool) getByID(id string) NodeStatus {
 }
 
 func (pool *ResourcePool) list() MsgResource {
-	return MsgResource{Code: 0, Resource: pool.nodes}
+	nodes := map[string]NodeStatus{}
+	for i := 0; i < pool.poolsCount; i++ {
+		pool.poolsMu[i].Lock()
+		for k, node := range pool.pools[i] {
+			nodes[k] = node
+		}
+		pool.poolsMu[i].Unlock()
+	}
+	return MsgResource{Code: 0, Resource: nodes}
 }
 
 func (pool *ResourcePool) statusHistory() MsgPoolStatusHistory {
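As `list()` above shows, read paths that need a whole-cluster view now walk every shard, holding each shard's lock only while copying or summing its entries into local variables. A rough sketch of that scan follows, with placeholder data (a map from node name to free GPU count) in place of the project's `NodeStatus`.

package main

import (
	"fmt"
	"sync"
)

func main() {
	const poolsCount = 10
	pools := make([]map[string]int, poolsCount) // node name -> free GPUs (placeholder)
	poolsMu := make([]sync.Mutex, poolsCount)
	for i := range pools {
		pools[i] = map[string]int{}
	}
	pools[3]["node-1"] = 4
	pools[7]["node-2"] = 2

	// Merge every shard into a private snapshot; each lock is held only
	// while its own shard is being read, never across the whole scan.
	snapshot := map[string]int{}
	freeGPU := 0
	for i := 0; i < poolsCount; i++ {
		poolsMu[i].Lock()
		for name, gpus := range pools[i] {
			snapshot[name] = gpus
			freeGPU += gpus
		}
		poolsMu[i].Unlock()
	}
	fmt.Println(len(snapshot), "nodes,", freeGPU, "free GPUs")
}

The merged snapshot can then be handed to callers without further locking, as `list()` does with the rebuilt `nodes` map, at the cost that shards scanned earlier may change while later ones are still being read.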
@@ -4,6 +4,7 @@ import (
 	"sync"
 	"time"
 	log "github.com/sirupsen/logrus"
+	"math/rand"
 )
 
 type SchedulerFCFS struct {
@@ -86,11 +87,12 @@ func (scheduler *SchedulerFCFS) Schedule(job Job) {
 }
 
 func (scheduler *SchedulerFCFS) AcquireResource(job Job, task Task) NodeStatus {
-	pool.mu.Lock()
-	defer pool.mu.Unlock()
+	poolID := rand.Intn(pool.poolsCount)
+	pool.poolsMu[poolID].Lock()
+	defer pool.poolsMu[poolID].Unlock()
 
 	res := NodeStatus{}
-	for id, node := range pool.nodes {
+	for id, node := range pool.pools[poolID] {
 		var available []GPUStatus
 		for _, status := range node.Status {
 			if status.MemoryTotal-status.MemoryAllocated >= task.MemoryGPU {
@@ -101,6 +103,8 @@ func (scheduler *SchedulerFCFS) AcquireResource(job Job, task Task) NodeStatus {
 			res.ClientID = id
 			res.ClientHost = node.ClientHost
 			res.Status = available[0:task.NumberGPU]
+			res.NumCPU = task.NumberCPU
+			res.MemTotal = task.Memory
 
 			for i := range res.Status {
 				for j := range node.Status {
@@ -117,13 +121,20 @@ func (scheduler *SchedulerFCFS) AcquireResource(job Job, task Task) NodeStatus {
 }
 
 func (scheduler *SchedulerFCFS) ReleaseResource(job Job, agent NodeStatus) {
-	pool.mu.Lock()
-	defer pool.mu.Unlock()
-	nodes := pool.nodes[agent.ClientID]
+	poolID := rand.Intn(pool.poolsCount)
+	pool.poolsMu[poolID].Lock()
+	defer pool.poolsMu[poolID].Unlock()
+
+	node := pool.pools[poolID][agent.ClientID]
 	for _, gpu := range agent.Status {
-		for j := range nodes.Status {
-			if gpu.UUID == nodes.Status[j].UUID {
-				nodes.Status[j].MemoryAllocated -= gpu.MemoryTotal
+		for j := range node.Status {
+			if gpu.UUID == node.Status[j].UUID {
+				node.Status[j].MemoryAllocated -= gpu.MemoryTotal
+				if node.Status[j].MemoryAllocated < 0 {
+					// in case of error
+					log.Warn(node.ClientID, "More Memory Allocated")
+					node.Status[j].MemoryAllocated = 0
+				}
 			}
 		}
 	}
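In the FCFS scheduler above (and, in later hunks, the fair and priority schedulers), `AcquireResource` and `ReleaseResource` no longer serialize on the global pool lock: they draw one shard with `rand.Intn(pool.poolsCount)`, lock only that shard, and search it for a GPU with enough free memory. The sketch below imitates that allocation path with simplified placeholder types; unlike the commit's `ReleaseResource`, which also picks a random shard, this version has the caller remember which shard it allocated from.

package main

import (
	"fmt"
	"math/rand"
	"sync"
)

type GPUStatus struct {
	UUID            string
	MemoryTotal     int
	MemoryAllocated int
}

type NodeStatus struct {
	ClientID string
	Status   []GPUStatus
}

const poolsCount = 10

var (
	pools   [poolsCount]map[string]*NodeStatus
	poolsMu [poolsCount]sync.Mutex
)

// acquire searches a single randomly chosen shard for a GPU with enough
// free memory and reserves it; concurrent schedulers rarely pick the same
// shard, so they rarely contend.
func acquire(memoryGPU int) (poolID int, nodeName, uuid string, ok bool) {
	poolID = rand.Intn(poolsCount)
	poolsMu[poolID].Lock()
	defer poolsMu[poolID].Unlock()
	for name, node := range pools[poolID] {
		for j := range node.Status {
			gpu := &node.Status[j]
			if gpu.MemoryTotal-gpu.MemoryAllocated >= memoryGPU {
				gpu.MemoryAllocated += memoryGPU
				return poolID, name, gpu.UUID, true
			}
		}
	}
	return 0, "", "", false
}

// release returns the memory, clamping at zero the way the commit does
// when the bookkeeping has drifted.
func release(poolID int, nodeName, uuid string, memoryGPU int) {
	poolsMu[poolID].Lock()
	defer poolsMu[poolID].Unlock()
	node, ok := pools[poolID][nodeName]
	if !ok {
		return
	}
	for j := range node.Status {
		if node.Status[j].UUID == uuid {
			node.Status[j].MemoryAllocated -= memoryGPU
			if node.Status[j].MemoryAllocated < 0 {
				node.Status[j].MemoryAllocated = 0
			}
		}
	}
}

func main() {
	for i := range pools {
		pools[i] = map[string]*NodeStatus{}
	}
	pools[2]["node-1"] = &NodeStatus{
		ClientID: "node-1",
		Status:   []GPUStatus{{UUID: "GPU-0", MemoryTotal: 16000}},
	}
	if poolID, name, uuid, ok := acquire(8000); ok {
		fmt.Println("allocated", uuid, "on", name)
		release(poolID, name, uuid, 8000)
	} else {
		fmt.Println("the randomly chosen shard had no free GPU; a real scheduler would retry")
	}
}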
@@ -199,14 +210,18 @@ func (scheduler *SchedulerFCFS) Summary() MsgSummary {
 	FreeGPU := 0
 	UsingGPU := 0
 
-	for _, node := range pool.nodes {
-		for j := range node.Status {
-			if node.Status[j].MemoryAllocated == 0 {
-				FreeGPU++
-			} else {
-				UsingGPU++
+	for i := 0; i < pool.poolsCount; i++ {
+		pool.poolsMu[i].Lock()
+		for _, node := range pool.pools[i] {
+			for j := range node.Status {
+				if node.Status[j].MemoryAllocated == 0 {
+					FreeGPU++
+				} else {
+					UsingGPU++
+				}
 			}
 		}
+		pool.poolsMu[i].Unlock()
 	}
 	summary.FreeGPU = FreeGPU
 	summary.UsingGPU = UsingGPU
@@ -5,6 +5,7 @@ import (
 	"time"
 	log "github.com/sirupsen/logrus"
 	"sort"
+	"math/rand"
 )
 
 type ResourceCount struct {
@@ -17,8 +18,9 @@ type ResourceCount struct {
 type SchedulerFair struct {
 	history []*Job
 	queues map[string][]Job
-	mu sync.Mutex
-	scheduling sync.Mutex
+	queueMu sync.Mutex
+	schedulingMu sync.Mutex
+	schedulingJobsCnt int
 	jobs map[string]*JobManager
 	nextQueue string
 	resourceAllocations map[string]*ResourceCount
@@ -46,6 +48,7 @@ func (scheduler *SchedulerFair) Start() {
 	scheduler.queues["default"] = []Job{}
 	scheduler.resourceAllocations = map[string]*ResourceCount{}
 	scheduler.enabled = true
+	scheduler.schedulingJobsCnt = 0
 
 	go func() {
 		for {
@@ -54,8 +57,14 @@ func (scheduler *SchedulerFair) Start() {
 			if !scheduler.enabled {
 				continue
 			}
-			scheduler.scheduling.Lock()
-			scheduler.mu.Lock()
+			scheduler.schedulingMu.Lock()
+			if scheduler.schedulingJobsCnt >= pool.poolsCount {
+				scheduler.schedulingMu.Unlock()
+				continue
+			}
+			scheduler.schedulingJobsCnt++
+			scheduler.schedulingMu.Unlock()
+			scheduler.queueMu.Lock()
 			queue := scheduler.nextQueue
 			if len(scheduler.queues[queue]) > 0 {
 				jm := JobManager{}
@@ -72,13 +81,12 @@ func (scheduler *SchedulerFair) Start() {
 					jm.start()
 				}()
 			} else {
-				log.Info("No more jobs to scheduling")
-				scheduler.scheduling.Unlock()
+				log.Info("No more jobs to scheduling", time.Now())
 				go func() {
 					scheduler.UpdateNextQueue()
 				}()
 			}
-			scheduler.mu.Unlock()
+			scheduler.queueMu.Unlock()
 		}
 	}()
 }
@@ -86,7 +94,9 @@ func (scheduler *SchedulerFair) Start() {
 func (scheduler *SchedulerFair) UpdateProgress(jobName string, state State) {
 	switch state {
 	case Running:
-		scheduler.scheduling.Unlock()
+		scheduler.schedulingMu.Lock()
+		scheduler.schedulingJobsCnt--
+		scheduler.schedulingMu.Unlock()
 
 		for i := range scheduler.history {
 			if scheduler.history[i].Name == jobName {
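The fair scheduler's hunks above also replace its old throttle. Previously a single `scheduling` mutex was held from the moment a job was dispatched until `UpdateProgress` saw it reach `Running`; now a counter, `schedulingJobsCnt`, is incremented under `schedulingMu` before dispatch, checked against `pool.poolsCount` as an upper bound, and decremented on the `Running` transition. A small, hypothetical limiter in the same spirit (the type name and the driver loop are illustrative):

package main

import (
	"fmt"
	"sync"
	"time"
)

// schedulingLimiter caps how many jobs may sit in the scheduling state at
// once, mirroring schedulingJobsCnt/schedulingMu in the commit.
type schedulingLimiter struct {
	mu    sync.Mutex
	count int
	limit int
}

// tryAcquire reports whether another scheduling attempt may start now.
func (l *schedulingLimiter) tryAcquire() bool {
	l.mu.Lock()
	defer l.mu.Unlock()
	if l.count >= l.limit {
		return false
	}
	l.count++
	return true
}

// release is called once a job has actually started running, the point at
// which the commit decrements the counter inside UpdateProgress.
func (l *schedulingLimiter) release() {
	l.mu.Lock()
	l.count--
	l.mu.Unlock()
}

func main() {
	limiter := &schedulingLimiter{limit: 10}
	var wg sync.WaitGroup
	for job := 0; job < 25; job++ {
		if !limiter.tryAcquire() {
			fmt.Println("job", job, "deferred; the loop would pick it up on a later tick")
			continue
		}
		wg.Add(1)
		go func(job int) {
			defer wg.Done()
			defer limiter.release()
			time.Sleep(10 * time.Millisecond) // stand-in for placing the job's tasks
			fmt.Println("job", job, "scheduled")
		}(job)
	}
	wg.Wait()
}

Compared with holding one mutex across the whole dispatch, the counter lets up to `pool.poolsCount` jobs be in flight at once while keeping the check-and-increment itself atomic.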
@@ -115,8 +125,8 @@ func (scheduler *SchedulerFair) UpdateProgress(jobName string, state State) {
 }
 
 func (scheduler *SchedulerFair) Schedule(job Job) {
-	scheduler.mu.Lock()
-	defer scheduler.mu.Unlock()
+	scheduler.queueMu.Lock()
+	defer scheduler.queueMu.Unlock()
 
 	queue := job.Group
 	_, ok := scheduler.queues[queue]
@@ -156,11 +166,12 @@ func (scheduler *SchedulerFair) Schedule(job Job) {
 }
 
 func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
-	pool.mu.Lock()
-	defer pool.mu.Unlock()
+	poolID := rand.Intn(pool.poolsCount)
+	pool.poolsMu[poolID].Lock()
+	defer pool.poolsMu[poolID].Unlock()
 
 	res := NodeStatus{}
-	for id, node := range pool.nodes {
+	for id, node := range pool.pools[poolID] {
 		var available []GPUStatus
 		for _, status := range node.Status {
 			if status.MemoryTotal-status.MemoryAllocated >= task.MemoryGPU {
@@ -206,9 +217,11 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task) NodeStatus {
 }
 
 func (scheduler *SchedulerFair) ReleaseResource(job Job, agent NodeStatus) {
-	pool.mu.Lock()
-	defer pool.mu.Unlock()
-	node := pool.nodes[agent.ClientID]
+	poolID := rand.Intn(pool.poolsCount)
+	pool.poolsMu[poolID].Lock()
+	defer pool.poolsMu[poolID].Unlock()
+
+	node := pool.pools[poolID][agent.ClientID]
 	for _, gpu := range agent.Status {
 		for j := range node.Status {
 			if gpu.UUID == node.Status[j].UUID {
@@ -313,17 +326,19 @@ func (scheduler *SchedulerFair) Summary() MsgSummary {
 	FreeGPU := 0
 	UsingGPU := 0
 
-	pool.mu.Lock()
-	for _, node := range pool.nodes {
-		for j := range node.Status {
-			if node.Status[j].MemoryAllocated == 0 {
-				FreeGPU++
-			} else {
-				UsingGPU++
+	for i := 0; i < pool.poolsCount; i++ {
+		pool.poolsMu[i].Lock()
+		for _, node := range pool.pools[i] {
+			for j := range node.Status {
+				if node.Status[j].MemoryAllocated == 0 {
+					FreeGPU++
+				} else {
+					UsingGPU++
+				}
 			}
 		}
+		pool.poolsMu[i].Unlock()
 	}
-	pool.mu.Unlock()
 	summary.FreeGPU = FreeGPU
 	summary.UsingGPU = UsingGPU
 
@@ -346,16 +361,18 @@ func (scheduler *SchedulerFair) UpdateNextQueue() {
 	MemoryGPU := 0.00001
 	CPU := 0.00001
 	Memory := 0.0001
-	pool.mu.Lock()
-	for _, node := range pool.nodes {
-		CPU += float64(node.NumCPU)
-		Memory += float64(node.MemTotal)
-		for _, card := range node.Status {
-			NumberGPU += 1.0
-			MemoryGPU += float64(card.MemoryTotal)
+	for i := 0; i < pool.poolsCount; i++ {
+		pool.poolsMu[i].Lock()
+		for _, node := range pool.pools[i] {
+			CPU += float64(node.NumCPU)
+			Memory += float64(node.MemTotal)
+			for _, card := range node.Status {
+				NumberGPU += 1.0
+				MemoryGPU += float64(card.MemoryTotal)
+			}
 		}
+		pool.poolsMu[i].Unlock()
 	}
-	pool.mu.Unlock()
 
 	for k, t := range scheduler.queues {
 		if len(t) == 0 {
@@ -4,6 +4,7 @@ import (
 	"sync"
 	"time"
 	log "github.com/sirupsen/logrus"
+	"math/rand"
 )
 
 type SchedulerPriority struct {
@@ -110,11 +111,12 @@ func (scheduler *SchedulerPriority) Schedule(job Job) {
 }
 
 func (scheduler *SchedulerPriority) AcquireResource(job Job, task Task) NodeStatus {
-	pool.mu.Lock()
-	defer pool.mu.Unlock()
+	poolID := rand.Intn(pool.poolsCount)
+	pool.poolsMu[poolID].Lock()
+	defer pool.poolsMu[poolID].Unlock()
 
 	res := NodeStatus{}
-	for id, node := range pool.nodes {
+	for id, node := range pool.pools[poolID] {
 		var available []GPUStatus
 		for _, status := range node.Status {
 			if status.MemoryTotal-status.MemoryAllocated >= task.MemoryGPU {
@@ -125,6 +127,8 @@ func (scheduler *SchedulerPriority) AcquireResource(job Job, task Task) NodeStat
 			res.ClientID = id
 			res.ClientHost = node.ClientHost
 			res.Status = available[0:task.NumberGPU]
+			res.NumCPU = task.NumberCPU
+			res.MemTotal = task.Memory
 
 			for i := range res.Status {
 				for j := range node.Status {
@@ -141,13 +145,20 @@ func (scheduler *SchedulerPriority) AcquireResource(job Job, task Task) NodeStat
 }
 
 func (scheduler *SchedulerPriority) ReleaseResource(job Job, agent NodeStatus) {
-	pool.mu.Lock()
-	defer pool.mu.Unlock()
-	nodes := pool.nodes[agent.ClientID]
+	poolID := rand.Intn(pool.poolsCount)
+	pool.poolsMu[poolID].Lock()
+	defer pool.poolsMu[poolID].Unlock()
+
+	node := pool.pools[poolID][agent.ClientID]
 	for _, gpu := range agent.Status {
-		for j := range nodes.Status {
-			if gpu.UUID == nodes.Status[j].UUID {
-				nodes.Status[j].MemoryAllocated -= gpu.MemoryTotal
+		for j := range node.Status {
+			if gpu.UUID == node.Status[j].UUID {
+				node.Status[j].MemoryAllocated -= gpu.MemoryTotal
+				if node.Status[j].MemoryAllocated < 0 {
+					// in case of error
+					log.Warn(node.ClientID, "More Memory Allocated")
+					node.Status[j].MemoryAllocated = 0
+				}
 			}
 		}
 	}
@@ -223,14 +234,18 @@ func (scheduler *SchedulerPriority) Summary() MsgSummary {
 	FreeGPU := 0
 	UsingGPU := 0
 
-	for _, node := range pool.nodes {
-		for j := range node.Status {
-			if node.Status[j].MemoryAllocated == 0 {
-				FreeGPU++
-			} else {
-				UsingGPU++
+	for i := 0; i < pool.poolsCount; i++ {
+		pool.poolsMu[i].Lock()
+		for _, node := range pool.pools[i] {
+			for j := range node.Status {
+				if node.Status[j].MemoryAllocated == 0 {
+					FreeGPU++
+				} else {
+					UsingGPU++
+				}
 			}
 		}
+		pool.poolsMu[i].Unlock()
 	}
 	summary.FreeGPU = FreeGPU
 	summary.UsingGPU = UsingGPU