// Source: YAO-scheduler/src/resource_pool.go
// (mirror of https://github.com/newnius/YAO-scheduler.git, synced 2025-06-07 22:31:55 +00:00)

package main
import (
	"hash/fnv"
	"math/rand"
	"net/url"
	"strconv"
	"strings"
	"sync"
	"time"

	log "github.com/sirupsen/logrus"
)
type ResourcePool struct {
2020-04-13 14:35:17 +00:00
//mu sync.Mutex
//nodes map[string]NodeStatus
pools []map[string]NodeStatus
poolsMu []sync.Mutex
poolsCount int
2019-04-29 09:05:15 +00:00
2019-04-29 12:57:32 +00:00
history []PoolStatus
2019-06-04 03:08:49 +00:00
2020-04-13 12:29:58 +00:00
heartBeat map[string]time.Time
heartBeatMu sync.Mutex
2019-06-05 09:09:22 +00:00
networks map[string]bool
networksFree map[string]bool
networkMu sync.Mutex
2020-03-29 13:12:44 +00:00
2020-04-10 08:50:36 +00:00
versions map[string]float64
2020-03-29 13:12:44 +00:00
counter int
counterTotal int
2020-04-11 03:38:04 +00:00
2020-04-13 12:29:58 +00:00
bindings map[string]map[string]bool
bindingsMu sync.Mutex
utils map[string][]int
2019-04-29 09:05:15 +00:00
}
// getNodePool returns the index of the shard that holds the node called name.
//
// The index is the 32-bit FNV-1a hash of name reduced modulo pool.poolsCount.
// The reduction is done on the unsigned hash value: converting the hash to int
// first would yield a negative index on platforms where int is 32 bits wide.
// pool.poolsCount must be positive.
func (pool *ResourcePool) getNodePool(name string) int {
	h := fnv.New32a()
	// hash.Hash documents that Write never returns an error.
	_, _ = h.Write([]byte(name))
	return int(h.Sum32() % uint32(pool.poolsCount))
}
func (pool *ResourcePool) start() {
2019-06-13 02:53:00 +00:00
//TODO: retrieve networks from yao-agent-master in blocking io
2019-06-05 09:09:22 +00:00
pool.networks = map[string]bool{}
pool.networksFree = map[string]bool{}
2020-04-10 08:50:36 +00:00
pool.versions = map[string]float64{}
2019-06-05 09:09:22 +00:00
2020-04-11 03:38:04 +00:00
pool.bindings = map[string]map[string]bool{}
2020-04-12 03:17:29 +00:00
pool.utils = map[string][]int{}
2020-04-11 03:38:04 +00:00
2020-04-13 15:03:34 +00:00
pool.poolsCount = 100
2020-04-13 14:35:17 +00:00
for i := 0; i < pool.poolsCount; i++ {
pool.pools = append(pool.pools, map[string]NodeStatus{})
pool.poolsMu = append(pool.poolsMu, sync.Mutex{})
}
2019-06-04 03:08:49 +00:00
/* check dead nodes */
2019-06-04 03:15:12 +00:00
go func() {
pool.heartBeat = map[string]time.Time{}
for {
2020-04-13 12:29:58 +00:00
pool.heartBeatMu.Lock()
2019-06-04 03:15:12 +00:00
for k, v := range pool.heartBeat {
if v.Add(time.Second * 30).Before(time.Now()) {
2020-04-13 14:35:17 +00:00
poolID := pool.getNodePool(k)
pool.poolsMu[poolID].Lock()
delete(pool.pools[poolID], k)
2020-03-29 13:12:44 +00:00
delete(pool.versions, k)
2020-04-13 14:35:17 +00:00
pool.poolsMu[poolID].Unlock()
2019-06-04 03:15:12 +00:00
}
}
2020-04-13 12:29:58 +00:00
pool.heartBeatMu.Unlock()
2019-06-04 03:15:12 +00:00
time.Sleep(time.Second * 10)
2019-06-04 03:08:49 +00:00
}
2019-06-04 03:15:12 +00:00
}()
2019-06-04 03:08:49 +00:00
/* save pool status periodically */
2019-04-29 09:05:15 +00:00
go func() {
2019-04-29 12:57:32 +00:00
/* waiting for data */
2019-05-06 07:36:31 +00:00
pool.history = []PoolStatus{}
2019-04-29 12:57:32 +00:00
time.Sleep(time.Second * 30)
2019-04-29 09:05:15 +00:00
for {
2019-04-29 12:57:32 +00:00
summary := PoolStatus{}
2019-04-29 09:05:15 +00:00
UtilCPU := 0.0
TotalCPU := 0
TotalMem := 0
AvailableMem := 0
TotalGPU := 0
UtilGPU := 0
TotalMemGPU := 0
AvailableMemGPU := 0
2020-04-13 14:35:17 +00:00
nodesCount := 0
for i := 0; i < pool.poolsCount; i++ {
pool.poolsMu[i].Lock()
for _, node := range pool.pools[i] {
UtilCPU += node.UtilCPU
TotalCPU += node.NumCPU
TotalMem += node.MemTotal
AvailableMem += node.MemAvailable
for _, GPU := range node.Status {
UtilGPU += GPU.UtilizationGPU
TotalGPU ++
TotalMemGPU += GPU.MemoryTotal
AvailableMemGPU += GPU.MemoryFree
}
2019-04-29 09:05:15 +00:00
}
2020-04-13 14:35:17 +00:00
nodesCount += len(pool.pools[i])
pool.poolsMu[i].Unlock()
2019-04-29 09:05:15 +00:00
}
2019-04-29 12:57:32 +00:00
summary.TimeStamp = time.Now().Format("2006-01-02 15:04:05")
2020-04-13 14:35:17 +00:00
summary.UtilCPU = UtilCPU / (float64(nodesCount) + 0.001)
2019-04-29 12:57:32 +00:00
summary.TotalCPU = TotalCPU
summary.TotalMem = TotalMem
summary.AvailableMem = AvailableMem
summary.TotalGPU = TotalGPU
2019-04-29 09:05:15 +00:00
if TotalGPU == 0 {
2019-04-29 12:57:32 +00:00
summary.UtilGPU = 0.0
2019-04-29 09:05:15 +00:00
} else {
2019-04-29 12:57:32 +00:00
summary.UtilGPU = UtilGPU / TotalGPU
2019-04-29 09:05:15 +00:00
}
2019-04-29 12:57:32 +00:00
summary.TotalMemGPU = TotalMemGPU
summary.AvailableMemGPU = AvailableMemGPU
2019-04-29 09:05:15 +00:00
pool.history = append(pool.history, summary)
if len(pool.history) > 60 {
2019-05-06 07:36:31 +00:00
pool.history = pool.history[len(pool.history)-60:]
2019-04-29 09:05:15 +00:00
}
time.Sleep(time.Second * 60)
}
}()
2019-03-04 09:19:55 +00:00
}
func (pool *ResourcePool) update(node NodeStatus) {
2020-04-13 14:35:17 +00:00
poolID := pool.getNodePool(node.ClientID)
pool.poolsMu[poolID].Lock()
defer pool.poolsMu[poolID].Unlock()
2019-03-04 09:19:55 +00:00
2020-04-12 03:13:23 +00:00
go func(node NodeStatus) {
2020-04-13 12:29:58 +00:00
pool.bindingsMu.Lock()
defer pool.bindingsMu.Unlock()
2020-04-12 03:13:23 +00:00
for _, gpu := range node.Status {
if _, ok := pool.bindings[gpu.UUID]; ok {
if len(pool.bindings[gpu.UUID]) == 1 {
pool.utils[gpu.UUID] = append(pool.utils[gpu.UUID], gpu.UtilizationGPU)
}
}
}
2020-04-13 12:29:58 +00:00
pool.heartBeatMu.Lock()
pool.heartBeat[node.ClientID] = time.Now()
pool.heartBeatMu.Unlock()
2020-04-12 03:13:23 +00:00
}(node)
2020-03-29 13:12:44 +00:00
pool.counterTotal++
if version, ok := pool.versions[node.ClientID]; ok && version == node.Version {
return
}
2020-04-12 02:44:32 +00:00
log.Debug(node.Version, "!=", pool.versions[node.ClientID])
2020-04-11 03:38:04 +00:00
2020-03-29 13:12:44 +00:00
pool.counter++
2020-04-13 14:35:17 +00:00
status, ok := pool.pools[poolID][node.ClientID]
2019-03-20 03:14:07 +00:00
if ok {
2019-04-16 08:59:19 +00:00
for i, GPU := range status.Status {
if GPU.UUID == node.Status[i].UUID {
node.Status[i].MemoryAllocated = GPU.MemoryAllocated
2019-03-20 03:14:07 +00:00
}
}
}
2020-04-13 14:35:17 +00:00
pool.pools[poolID][node.ClientID] = node
2020-03-29 13:12:44 +00:00
pool.versions[node.ClientID] = node.Version
2019-03-04 09:19:55 +00:00
}
func (pool *ResourcePool) getByID(id string) NodeStatus {
2020-04-13 14:35:17 +00:00
poolID := pool.getNodePool(id)
pool.poolsMu[poolID].Lock()
defer pool.poolsMu[poolID].Unlock()
2019-03-04 09:19:55 +00:00
2020-04-13 14:35:17 +00:00
status, ok := pool.pools[poolID][id]
2019-03-04 09:19:55 +00:00
if ok {
return status
}
2019-04-16 08:59:19 +00:00
return NodeStatus{}
2019-03-04 09:19:55 +00:00
}
func (pool *ResourcePool) list() MsgResource {
2020-04-13 14:35:17 +00:00
nodes := map[string]NodeStatus{}
for i := 0; i < pool.poolsCount; i++ {
pool.poolsMu[i].Lock()
for k, node := range pool.pools[i] {
nodes[k] = node
}
pool.poolsMu[i].Unlock()
}
return MsgResource{Code: 0, Resource: nodes}
2019-04-29 09:05:15 +00:00
}
// statusHistory wraps the recorded pool summaries (pool.history) in a
// MsgPoolStatusHistory whose Code is 0.
func (pool *ResourcePool) statusHistory() MsgPoolStatusHistory {
	var reply MsgPoolStatusHistory
	reply.Code = 0
	reply.Data = pool.history
	return reply
}
// getCounter reports the update counters as a map with the keys "counter"
// and "counterTotal".
func (pool *ResourcePool) getCounter() map[string]int {
	stats := make(map[string]int, 2)
	stats["counter"] = pool.counter
	stats["counterTotal"] = pool.counterTotal
	return stats
}
func (pool *ResourcePool) acquireNetwork() string {
2019-06-13 02:53:00 +00:00
pool.networkMu.Lock()
defer pool.networkMu.Unlock()
2019-06-05 09:09:22 +00:00
var network string
2020-04-13 11:41:28 +00:00
log.Debug(pool.networksFree)
2019-06-05 09:09:22 +00:00
if len(pool.networksFree) == 0 {
for {
2019-06-13 02:53:00 +00:00
for {
network = "yao-net-" + strconv.Itoa(rand.Intn(999999))
if _, ok := pool.networks[network]; !ok {
break
}
2019-06-05 09:09:22 +00:00
}
2019-06-13 02:53:00 +00:00
v := url.Values{}
v.Set("name", network)
2019-06-13 03:30:55 +00:00
resp, err := doRequest("POST", "http://yao-agent-master:8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
2019-06-13 02:53:00 +00:00
if err != nil {
log.Println(err.Error())
continue
}
defer resp.Body.Close()
pool.networksFree[network] = true
pool.networks[network] = true
break
2019-06-05 09:09:22 +00:00
}
}
2019-06-13 02:53:00 +00:00
2019-06-05 09:09:22 +00:00
for k := range pool.networksFree {
network = k
delete(pool.networksFree, k)
}
return network
}
// releaseNetwork marks network as free so that acquireNetwork can hand it out
// again.
//
// The unlock is deferred, as in the other methods of this type, so networkMu
// is released even if the map write panics (networksFree is nil until start
// has run).
func (pool *ResourcePool) releaseNetwork(network string) {
	pool.networkMu.Lock()
	defer pool.networkMu.Unlock()
	pool.networksFree[network] = true
}
func (pool *ResourcePool) attach(GPU string, job string) {
2020-04-13 12:29:58 +00:00
pool.bindingsMu.Lock()
defer pool.bindingsMu.Unlock()
2020-04-12 03:13:23 +00:00
if _, ok := pool.bindings[GPU]; !ok {
pool.bindings[GPU] = map[string]bool{}
}
pool.bindings[GPU][job] = true
if _, ok := pool.utils[GPU]; !ok {
pool.utils[GPU] = []int{}
2020-04-11 03:38:04 +00:00
}
}
func (pool *ResourcePool) detach(GPU string, jobName string) {
2020-04-13 12:29:58 +00:00
pool.bindingsMu.Lock()
defer pool.bindingsMu.Unlock()
2020-04-12 03:13:23 +00:00
if _, ok := pool.bindings[GPU]; ok {
if len(pool.bindings[GPU]) == 1 {
2020-04-12 03:14:53 +00:00
InstanceOfOptimizer().feed(jobName, pool.utils[GPU])
2020-04-12 03:13:23 +00:00
pool.utils[GPU] = []int{}
}
}
2020-04-11 03:38:04 +00:00
if list, ok := pool.bindings[GPU]; ok {
2020-04-12 03:13:23 +00:00
delete(list, jobName)
2020-04-11 03:38:04 +00:00
}
}
func (pool *ResourcePool) getBindings() map[string]map[string]bool {
2020-04-12 02:44:32 +00:00
return pool.bindings
2020-04-13 12:29:58 +00:00
}