package main

import (
	"hash/fnv"
	"math/rand"
	"net/url"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	log "github.com/sirupsen/logrus"
)

type ResourcePool struct {
	poolsCount int
	pools      []PoolSeg
	poolsMu    sync.Mutex

	history []PoolStatus

	heartBeat   map[string]time.Time
	heartBeatMu sync.Mutex

	networks     map[string]bool
	networksFree map[string]bool
	networkMu    sync.Mutex

	versions   map[string]float64
	versionsMu sync.Mutex

	counter      int
	counterTotal int

	bindings   map[string]map[string]int
	bindingsMu sync.Mutex
	utils      map[string][]UtilGPUTimeSeries

	TotalGPU int
}
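
/*
 * Design note (added for clarity): ResourcePool shards nodes over a fixed
 * ring of PoolSeg buckets, indexed by an FNV-1a hash of the node name.
 * A segment without a Nodes map defers to its Next pointer, which leads to
 * the nearest working segment, so lookups stay one hop while segments can
 * be split on demand (see scaleSeg). Per-segment locks keep status updates
 * from different nodes mostly contention-free.
 */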

func (pool *ResourcePool) start() {
	pool.networks = map[string]bool{}
	pool.networksFree = map[string]bool{}
	pool.versions = map[string]float64{}
	pool.bindings = map[string]map[string]int{}
	pool.utils = map[string][]UtilGPUTimeSeries{}
	pool.TotalGPU = 0

	/* init pools */
	pool.poolsCount = 300
	for i := 0; i < pool.poolsCount; i++ {
		pool.pools = append(pool.pools, PoolSeg{Lock: sync.Mutex{}, IsVirtual: true, ID: i})
	}
	/* make non-virtual segs */
	for i := 0; i < pool.poolsCount/3; i++ {
		pool.pools[rand.Intn(pool.poolsCount)].IsVirtual = false
	}
	/* make working segs */
	for i := 0; i < 10; i++ {
		pool.pools[rand.Intn(pool.poolsCount)].Nodes = map[string]*NodeStatus{}
	}
	/* init Next pointers: walk the ring backwards up to twice so every seg
	ends up pointing at the nearest working seg after it */
	var pre *PoolSeg
	for i := pool.poolsCount*2 - 1; ; i-- {
		if pool.pools[i%pool.poolsCount].Next != nil {
			break
		}
		pool.pools[i%pool.poolsCount].Next = pre
		if pool.pools[i%pool.poolsCount].Nodes != nil {
			pre = &pool.pools[i%pool.poolsCount]
		}
	}

	pool.heartBeat = map[string]time.Time{}
	go func() {
		pool.checkDeadNodes()
	}()

	pool.history = []PoolStatus{}
	go func() {
		pool.saveStatusHistory()
	}()
}
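
/*
Illustrative sketch (not part of the original code): after start() the Next
pointers of the working segs form a closed ring, which is the invariant the
traversals in saveStatusHistory and list depend on. A quick self-check:
*/
func (pool *ResourcePool) ringIsClosed() bool {
	start := pool.pools[0].Next
	for cur, hops := start.Next, 0; ; cur, hops = cur.Next, hops+1 {
		if cur == start {
			return true /* walked all the way around */
		}
		if hops > pool.poolsCount {
			return false /* unreachable if start() completed */
		}
	}
}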

/* check dead nodes periodically */
func (pool *ResourcePool) checkDeadNodes() {
	for {
		pool.heartBeatMu.Lock()
		for k, v := range pool.heartBeat {
			if v.Add(time.Second * 30).Before(time.Now()) {
				poolID := pool.getNodePool(k)
				seg := &pool.pools[poolID]
				if seg.Nodes == nil {
					seg = seg.Next
				}
				seg.Lock.Lock()
				delete(seg.Nodes, k)
				seg.Lock.Unlock()
				pool.versionsMu.Lock()
				delete(pool.versions, k)
				pool.versionsMu.Unlock()
				/* drop the stale heartbeat too, so the node is reported
				offline once instead of on every sweep */
				delete(pool.heartBeat, k)
				log.Info("node ", k, " is offline")
			}
		}
		pool.heartBeatMu.Unlock()
		time.Sleep(time.Second * 10)
	}
}
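
/* Timing note (added): with a 30s staleness threshold and a 10s sweep, a
   crashed node is evicted roughly 30-40s after its last report, provided
   update() keeps refreshing heartBeat while the node is alive. */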

func (pool *ResourcePool) GPUModelToPower(model string) int {
	mapper := map[string]int{
		"K40": 1, "Tesla K40": 1,
		"K80": 2, "Tesla K80": 2,
		"P100": 3, "Tesla P100": 3,
	}
	if power, ok := mapper[model]; ok {
		return power
	}
	/* unknown model */
	return 0
}
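
/* Usage sketch (hypothetical helper, not in the original): compare a node's
   GPU against the model a task requests; pickNode sorts candidates by the
   same power difference. */
func (pool *ResourcePool) gpuAtLeast(have string, want string) bool {
	return pool.GPUModelToPower(have) >= pool.GPUModelToPower(want)
}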

func (pool *ResourcePool) getNodePool(name string) int {
	h := fnv.New32a()
	h.Write([]byte(name))
	/* mod in uint32 space so the result is non-negative on every platform */
	return int(h.Sum32() % uint32(pool.poolsCount))
}
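
/*
Sketch (not in the original, which inlines this pattern): hashing yields a
slot, and a virtual slot defers to the next working seg. checkDeadNodes,
update, and getByID all repeat these three lines.
*/
func (pool *ResourcePool) lookupSeg(name string) *PoolSeg {
	seg := &pool.pools[pool.getNodePool(name)]
	if seg.Nodes == nil {
		seg = seg.Next
	}
	return seg
}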

/* save pool status periodically */
func (pool *ResourcePool) saveStatusHistory() {
	/* waiting for data */
	time.Sleep(time.Second * 30)
	for {
		summary := PoolStatus{}

		UtilCPU := 0.0
		TotalCPU := 0
		TotalMem := 0
		AvailableMem := 0

		TotalGPU := 0
		UtilGPU := 0
		TotalMemGPU := 0
		AvailableMemGPU := 0
		nodesCount := 0

		start := pool.pools[0].Next
		for cur := start; ; {
			cur.Lock.Lock()
			for _, node := range cur.Nodes {
				UtilCPU += node.UtilCPU
				TotalCPU += node.NumCPU
				TotalMem += node.MemTotal
				AvailableMem += node.MemAvailable

				for _, GPU := range node.Status {
					UtilGPU += GPU.UtilizationGPU
					TotalGPU++
					TotalMemGPU += GPU.MemoryTotal
					AvailableMemGPU += GPU.MemoryFree
				}
			}
			nodesCount += len(cur.Nodes)
			cur.Lock.Unlock()
			cur = cur.Next
			if cur == start {
				break
			}
		}
		summary.TimeStamp = time.Now().Format("2006-01-02 15:04:05")
		summary.UtilCPU = UtilCPU / (float64(nodesCount) + 0.001)
		summary.TotalCPU = TotalCPU
		summary.TotalMem = TotalMem
		summary.AvailableMem = AvailableMem
		summary.TotalGPU = TotalGPU
		if TotalGPU == 0 {
			summary.UtilGPU = 0.0
		} else {
			summary.UtilGPU = UtilGPU / TotalGPU
		}
		summary.TotalMemGPU = TotalMemGPU
		summary.AvailableMemGPU = AvailableMemGPU

		pool.history = append(pool.history, summary)

		if len(pool.history) > 60 {
			pool.history = pool.history[len(pool.history)-60:]
		}

		pool.TotalGPU = TotalGPU
		time.Sleep(time.Second * 60)
	}
}
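
/* Sketch (hypothetical helper): the sliding-window trim used above, which
   caps the history at the most recent n samples (n = 60 in the loop). */
func keepRecent(history []PoolStatus, n int) []PoolStatus {
	if len(history) > n {
		return history[len(history)-n:]
	}
	return history
}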

/* update node info */
func (pool *ResourcePool) update(node NodeStatus) {
	segID := pool.getNodePool(node.ClientID)
	seg := &pool.pools[segID]
	if seg.Nodes == nil {
		seg = seg.Next
	}
	seg.Lock.Lock()
	defer seg.Lock.Unlock()

	/* record utilization samples of bound GPUs & refresh heartbeat */
	go func(node NodeStatus) {
		pool.bindingsMu.Lock()
		defer pool.bindingsMu.Unlock()
		for _, gpu := range node.Status {
			if _, ok := pool.bindings[gpu.UUID]; ok {
				if len(pool.bindings[gpu.UUID]) == 1 {
					pool.utils[gpu.UUID] = append(pool.utils[gpu.UUID],
						UtilGPUTimeSeries{Time: int(time.Now().Unix()), Util: gpu.UtilizationGPU})
				}
			}
		}
		pool.heartBeatMu.Lock()
		pool.heartBeat[node.ClientID] = time.Now()
		pool.heartBeatMu.Unlock()
	}(node)

	pool.counterTotal++
	pool.versionsMu.Lock()
	version, ok := pool.versions[node.ClientID]
	pool.versionsMu.Unlock()
	if ok && version == node.Version {
		/* nothing changed since the last report */
		return
	}
	pool.counter++
	log.Debug(node.Version, "!=", version)

	status, ok := seg.Nodes[node.ClientID]
	if ok {
		/* retain allocation info */
		for i, GPU := range status.Status {
			if i < len(node.Status) && GPU.UUID == node.Status[i].UUID {
				node.Status[i].MemoryAllocated = GPU.MemoryAllocated
			}
		}
	}
	seg.Nodes[node.ClientID] = &node
	if len(seg.Nodes) > 10 {
		pool.scaleSeg(seg)
	}
	pool.versionsMu.Lock()
	pool.versions[node.ClientID] = node.Version
	pool.versionsMu.Unlock()
}
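
/* Note (added): agents appear to bump Version only when their status actually
   changes, so identical versions let update() return early; the
   counter/counterTotal pair (see getCounter) exposes the resulting skip rate. */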

/* split seg */
func (pool *ResourcePool) scaleSeg(seg *PoolSeg) {
	go func() {
		pool.poolsMu.Lock()
		defer pool.poolsMu.Unlock()

		var candidate *PoolSeg
		seg.Lock.Lock()

		/* find previous seg */
		var pre *PoolSeg
		for i := seg.ID + pool.poolsCount - 1; i >= 0; i-- {
			if pool.pools[i%pool.poolsCount].Next != seg {
				pre = &pool.pools[i%pool.poolsCount]
				break
			}
		}
		if pre == nil {
			/* seg is the only working seg; nothing to split towards */
			seg.Lock.Unlock()
			return
		}

		step := seg.ID - pre.ID
		if step < 0 {
			step += pool.poolsCount
		}

		/* find the non-virtual seg nearest to the middle */
		minDistance := step
		for i := 1; i < step; i++ {
			if !pool.pools[(i+pre.ID)%pool.poolsCount].IsVirtual {
				distance := i - step/2
				if distance < 0 {
					distance = -distance
				}
				if candidate == nil || distance < minDistance {
					candidate = &pool.pools[(i+pre.ID)%pool.poolsCount]
					minDistance = distance
				}
			}
		}

		/* update Next */
		if candidate != nil {
			distance := candidate.ID - seg.ID
			if distance < 0 {
				distance = -distance
			}
			for i := 0; i < distance; i++ {
				pool.pools[(i+pre.ID)%pool.poolsCount].Lock.Lock()
				pool.pools[(i+pre.ID)%pool.poolsCount].Next = candidate
				pool.pools[(i+pre.ID)%pool.poolsCount].Lock.Unlock()
			}
			candidate.Lock.Lock()
			candidate.Next = seg
			/* move nodes that now hash ahead of the new boundary */
			nodesToMove := map[string]*NodeStatus{}
			for _, node := range seg.Nodes {
				seg2ID := pool.getNodePool(node.ClientID)
				seg2 := &pool.pools[seg2ID]
				if seg2.Nodes == nil {
					seg2 = seg2.Next
				}
				if seg2 != seg {
					nodesToMove[node.ClientID] = node
				}
			}
			for _, node := range nodesToMove {
				delete(seg.Nodes, node.ClientID)
			}
			candidate.Nodes = nodesToMove
			candidate.Lock.Unlock()
		}
		seg.Lock.Unlock()
	}()
}
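
/* Sketch (hypothetical helper): forward distance around the ring, the metric
   the split above works with when it wraps indices modulo poolsCount. */
func ringDistance(from int, to int, size int) int {
	d := to - from
	if d < 0 {
		d += size
	}
	return d
}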

/* get node by ClientID */
func (pool *ResourcePool) getByID(id string) NodeStatus {
	poolID := pool.getNodePool(id)
	seg := &pool.pools[poolID]
	if seg.Nodes == nil {
		seg = seg.Next
	}
	seg.Lock.Lock()
	defer seg.Lock.Unlock()

	status, ok := seg.Nodes[id]
	if ok {
		return *status
	}
	return NodeStatus{}
}

/* get all nodes */
func (pool *ResourcePool) list() MsgResource {
	nodes := map[string]NodeStatus{}

	start := pool.pools[0].Next
	for cur := start; ; {
		cur.Lock.Lock()
		for k, node := range cur.Nodes {
			nodes[k] = *node
		}
		cur.Lock.Unlock()
		cur = cur.Next
		if cur == start {
			break
		}
	}
	return MsgResource{Code: 0, Resource: nodes}
}

func (pool *ResourcePool) statusHistory() MsgPoolStatusHistory {
	return MsgPoolStatusHistory{Code: 0, Data: pool.history}
}

func (pool *ResourcePool) getCounter() map[string]int {
	return map[string]int{"counter": pool.counter, "counterTotal": pool.counterTotal}
}

func (pool *ResourcePool) acquireNetwork() string {
	pool.networkMu.Lock()
	defer pool.networkMu.Unlock()
	var network string
	log.Debug(pool.networksFree)
	if len(pool.networksFree) == 0 {
		/* create a fresh overlay network under an unused name */
		for {
			for {
				network = "yao-net-" + strconv.Itoa(rand.Intn(999999))
				if _, ok := pool.networks[network]; !ok {
					break
				}
			}
			v := url.Values{}
			v.Set("name", network)
			resp, err := doRequest("POST", "http://yao-agent-master:8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
			if err != nil {
				log.Println(err.Error())
				continue
			}
			resp.Body.Close()
			pool.networksFree[network] = true
			pool.networks[network] = true
			break
		}
	}

	for k := range pool.networksFree {
		network = k
		delete(pool.networksFree, k)
		break
	}
	return network
}

func (pool *ResourcePool) releaseNetwork(network string) {
	pool.networkMu.Lock()
	pool.networksFree[network] = true
	pool.networkMu.Unlock()
}
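
/* Usage sketch (hypothetical wrapper, not in the original): pair acquire with
   release so overlay networks are recycled through networksFree rather than
   leaked. */
func withNetwork(pool *ResourcePool, run func(network string)) {
	network := pool.acquireNetwork()
	defer pool.releaseNetwork(network)
	run(network)
}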

func (pool *ResourcePool) attach(GPU string, job string) {
	pool.bindingsMu.Lock()
	defer pool.bindingsMu.Unlock()
	if _, ok := pool.bindings[GPU]; !ok {
		pool.bindings[GPU] = map[string]int{}
	}
	pool.bindings[GPU][job] = int(time.Now().Unix())

	if _, ok := pool.utils[GPU]; !ok {
		pool.utils[GPU] = []UtilGPUTimeSeries{}
	}
}

func (pool *ResourcePool) detach(GPU string, jobName string) {
	pool.bindingsMu.Lock()
	defer pool.bindingsMu.Unlock()
	if jobs, ok := pool.bindings[GPU]; ok {
		/* last job on this GPU: hand the utilization trace to the optimizer */
		if len(jobs) == 1 {
			InstanceOfOptimizer().feed(jobName, pool.utils[GPU])
			pool.utils[GPU] = []UtilGPUTimeSeries{}
		}
		delete(jobs, jobName)
	}
}
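
/* Lifecycle note (added): attach() registers a job on a GPU at placement time
   and detach() removes it on completion; while exactly one job holds a GPU,
   update() appends utilization samples that feed the optimizer on detach. */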

func (pool *ResourcePool) getBindings() map[string]map[string]int {
	return pool.bindings
}

func (pool *ResourcePool) pickNode(candidates []*NodeStatus, availableGPUs map[string][]GPUStatus, task Task, job Job, nodes []NodeStatus) *NodeStatus {
	/* shuffle, so equally ranked candidates are not always picked in the same order */
	r := rand.New(rand.NewSource(time.Now().Unix()))
	for n := len(candidates); n > 0; n-- {
		randIndex := r.Intn(n)
		candidates[n-1], candidates[randIndex] = candidates[randIndex], candidates[n-1]
	}

	/* sort */
	// single node, single GPU
	sort.Slice(candidates, func(a, b int) bool {
		diffA := pool.GPUModelToPower(candidates[a].Status[0].ProductName) - pool.GPUModelToPower(task.ModelGPU)
		diffB := pool.GPUModelToPower(candidates[b].Status[0].ProductName) - pool.GPUModelToPower(task.ModelGPU)

		if diffA > 0 && diffB >= 0 && diffA > diffB {
			return false /* both meet the request: prefer the closer match, b */
		}
		if diffA < 0 && diffB < 0 && diffA > diffB {
			return false /* both underpowered: prefer the less underpowered, b */
		}
		if diffA < 0 && diffB >= 0 {
			return false /* only b meets the request */
		}
		if diffA == diffB {
			/* tie on GPU power: prefer fewer free GPUs (tighter packing),
			then higher CPU load */
			if len(availableGPUs[candidates[a].ClientID]) == len(availableGPUs[candidates[b].ClientID]) {
				return candidates[a].UtilCPU > candidates[b].UtilCPU
			}
			return len(availableGPUs[candidates[a].ClientID]) < len(availableGPUs[candidates[b].ClientID])
		}
		return true /* otherwise prefer a */
	})

	/* keep only the nodes carrying the best-ranked GPU model */
	var t []*NodeStatus
	bestGPU := candidates[0].Status[0].ProductName
	for _, node := range candidates {
		if node.Status[0].ProductName != bestGPU {
			break
		}
		t = append(t, node)
	}
	candidates = t

	if len(job.Tasks) == 1 && task.NumberGPU > 1 { // single node, multi GPUs
		sort.Slice(candidates, func(a, b int) bool {
			if len(availableGPUs[candidates[a].ClientID]) == len(availableGPUs[candidates[b].ClientID]) {
				return candidates[a].UtilCPU > candidates[b].UtilCPU
			}
			return len(availableGPUs[candidates[a].ClientID]) < len(availableGPUs[candidates[b].ClientID])
		})
	}

	if len(job.Tasks) > 1 { // multi nodes, multi GPUs
		sort.Slice(candidates, func(a, b int) bool {
			distanceA := 0
			distanceB := 0
			for _, node := range nodes {
				if node.Rack != candidates[a].Rack {
					distanceA += 10
				}
				if node.ClientID != candidates[a].ClientID {
					distanceA += 1
				}
				if node.Rack != candidates[b].Rack {
					distanceB += 10
				}
				if node.ClientID != candidates[b].ClientID {
					distanceB += 1
				}
			}
			if distanceA == distanceB {
				return len(availableGPUs[candidates[a].ClientID]) > len(availableGPUs[candidates[b].ClientID])
			}
			return distanceA*job.Locality < distanceB*job.Locality
		})
	}

	return candidates[0]
}
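
/* Sketch (hypothetical helper, not in the original): the rack-aware distance
   computed inline above, factored out. Weights are those pickNode uses:
   10 per cross-rack peer, 1 per cross-host peer. */
func spreadScore(candidate *NodeStatus, placed []NodeStatus) int {
	score := 0
	for _, node := range placed {
		if node.Rack != candidate.Rack {
			score += 10
		}
		if node.ClientID != candidate.ClientID {
			score++
		}
	}
	return score
}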