Mirror of https://github.com/newnius/YAO-scheduler.git, synced 2025-12-13 07:46:43 +00:00

Commit: update
@@ -115,6 +115,7 @@ func (jm *JobManager) start() {
         v.Set("cpu_limit", strconv.Itoa(jm.job.Tasks[i].NumberCPU))
         v.Set("network", network)
         v.Set("should_wait", "1")
+        v.Set("HDFS_path", "1")
 
         resp, err := doRequest("POST", "http://"+jm.resources[i].ClientHost+":8000/create", strings.NewReader(v.Encode()), "application/x-www-form-urlencoded", "")
         if err != nil {
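
The new HDFS_path value rides along with the existing fields in the URL-encoded body of the POST to the agent's /create endpoint shown above. Below is a minimal, self-contained sketch of that request shape, using plain net/http instead of the repository's doRequest helper; the host name and the concrete field values are placeholders, not taken from the commit.

package main

import (
    "fmt"
    "net/http"
    "net/url"
    "strconv"
    "strings"
)

func main() {
    // Form values mirroring the ones set in JobManager.start();
    // the concrete values here are placeholders.
    v := url.Values{}
    v.Set("cpu_limit", strconv.Itoa(4))
    v.Set("network", "yao-net-1")
    v.Set("should_wait", "1")
    v.Set("HDFS_path", "1") // the field added by this commit

    // http://<agent>:8000/create is the endpoint used above; the host is hypothetical.
    resp, err := http.Post("http://agent-0:8000/create",
        "application/x-www-form-urlencoded", strings.NewReader(v.Encode()))
    if err != nil {
        fmt.Println("request failed:", err)
        return
    }
    defer resp.Body.Close()
    fmt.Println("status:", resp.Status)
}
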

(next file)

@@ -8,16 +8,14 @@ import (
     log "github.com/sirupsen/logrus"
     "math/rand"
     "strconv"
-    "hash/fnv"
     "sort"
+    "hash/fnv"
 )
 
 type ResourcePool struct {
-    //mu sync.Mutex
-    //nodes map[string]NodeStatus
-    pools   []map[string]NodeStatus
-    poolsMu []sync.Mutex
     poolsCount int
+    pools      []PoolSeg
+    poolsMu    sync.Mutex
 
     history []PoolStatus
 
@@ -29,6 +27,7 @@ type ResourcePool struct {
     networkMu sync.Mutex
 
     versions   map[string]float64
+    versionsMu sync.Mutex
 
     counter      int
     counterTotal int
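
The versionsMu added here guards the versions map, which later hunks read and write from concurrent paths (update and checkDeadNodes). A small stand-alone sketch of that guard pattern follows; the pool type below is a stand-in for illustration, not the repository's ResourcePool.

package main

import (
    "fmt"
    "sync"
)

type pool struct {
    versions   map[string]float64
    versionsMu sync.Mutex
}

// setVersion records the latest reported version for a node.
func (p *pool) setVersion(id string, v float64) {
    p.versionsMu.Lock()
    defer p.versionsMu.Unlock()
    p.versions[id] = v
}

// isStale reports whether an incoming report carries a version we already have.
func (p *pool) isStale(id string, v float64) bool {
    p.versionsMu.Lock()
    defer p.versionsMu.Unlock()
    old, ok := p.versions[id]
    return ok && old == v
}

func main() {
    p := &pool{versions: map[string]float64{}}
    p.setVersion("node-1", 3)
    fmt.Println(p.isStale("node-1", 3)) // true
}
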
@@ -40,6 +39,77 @@ type ResourcePool struct {
     TotalGPU int
 }
 
+func (pool *ResourcePool) start() {
+    pool.networks = map[string]bool{}
+    pool.networksFree = map[string]bool{}
+    pool.versions = map[string]float64{}
+
+    pool.bindings = map[string]map[string]int{}
+    pool.utils = map[string][]UtilGPUTimeSeries{}
+
+    pool.TotalGPU = 0
+
+    /* init pools */
+    pool.poolsCount = 300
+    for i := 0; i < pool.poolsCount; i++ {
+        pool.pools = append(pool.pools, PoolSeg{Lock: sync.Mutex{}, IsVirtual: true, ID: i})
+    }
+    /* make non-virtual seg */
+    for i := 0; i < pool.poolsCount/3; i++ {
+        pool.pools[rand.Intn(pool.poolsCount)].IsVirtual = false
+    }
+    /* make working srg */
+    for i := 0; i < 10; i++ {
+        pool.pools[rand.Intn(pool.poolsCount)].Nodes = map[string]*NodeStatus{}
+    }
+    /* init Next pointer */
+    var pre *PoolSeg
+    for i := pool.poolsCount*2 - 1; ; i-- {
+        if pool.pools[i%pool.poolsCount].Next != nil {
+            break
+        }
+        pool.pools[i%pool.poolsCount].Next = pre
+        if pool.pools[i%pool.poolsCount].Nodes != nil {
+            pre = &pool.pools[i%pool.poolsCount]
+        }
+    }
+
+    pool.heartBeat = map[string]time.Time{}
+    go func() {
+        pool.checkDeadNodes()
+    }()
+
+    pool.history = []PoolStatus{}
+    go func() {
+        pool.saveStatusHistory()
+    }()
+}
+
+/* check dead nodes periodically */
+func (pool *ResourcePool) checkDeadNodes() {
+    for {
+        pool.heartBeatMu.Lock()
+        for k, v := range pool.heartBeat {
+            if v.Add(time.Second * 30).Before(time.Now()) {
+                poolID := pool.getNodePool(k)
+                seg := &pool.pools[poolID]
+                if seg.Nodes == nil {
+                    seg = seg.Next
+                }
+                seg.Lock.Lock()
+                delete(seg.Nodes, k)
+                seg.Lock.Unlock()
+                pool.versionsMu.Lock()
+                delete(pool.versions, k)
+                pool.versionsMu.Unlock()
+                log.Info(" node ", k, " is offline")
+            }
+        }
+        pool.heartBeatMu.Unlock()
+        time.Sleep(time.Second * 10)
+    }
+}
+
 func (pool *ResourcePool) GPUModelToPower(model string) int {
     mapper := map[string]int{
         "K40": 1, "Tesla K40": 1,
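
The rewritten start() lays the pool out as a fixed ring of 300 segments: every slot begins as a virtual PoolSeg, a random third are marked non-virtual (eligible to be promoted later), ten get a Nodes map and become working segments, and each slot's Next is wired to the next working segment in ring order so a lookup that lands on an empty slot can hop forward. Below is a simplified, self-contained sketch of that wiring and of one lookup; the types and the deterministic slot choices are stand-ins for illustration, not the repository's code.

package main

import "fmt"

type seg struct {
    ID    int
    Nodes map[string]string // stand-in for map[string]*NodeStatus
    Next  *seg
}

func main() {
    const count = 12
    ring := make([]seg, count)
    for i := range ring {
        ring[i].ID = i
    }
    // Give a few segments a Nodes map; these are the "working" segments.
    for _, i := range []int{2, 7, 10} {
        ring[i].Nodes = map[string]string{}
    }

    // Wire every slot's Next to the next working segment in ring order,
    // walking the indices downward across two laps so the wrap-around is
    // covered (the same idea as the poolsCount*2 loop in start()).
    var pre *seg
    for i := count*2 - 1; i >= 0; i-- {
        if ring[i%count].Next != nil {
            break
        }
        ring[i%count].Next = pre
        if ring[i%count].Nodes != nil {
            pre = &ring[i%count]
        }
    }

    // A lookup that hashes to slot 4 hops to the working segment at 7.
    s := &ring[4]
    if s.Nodes == nil {
        s = s.Next
    }
    fmt.Println("slot 4 is served by segment", s.ID) // 7
}
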
@@ -58,47 +128,9 @@ func (pool *ResourcePool) getNodePool(name string) int {
     return int(h.Sum32()) % pool.poolsCount
 }
 
-func (pool *ResourcePool) start() {
-    //TODO: retrieve networks from yao-agent-master in blocking io
-    pool.networks = map[string]bool{}
-    pool.networksFree = map[string]bool{}
-    pool.versions = map[string]float64{}
-
-    pool.bindings = map[string]map[string]int{}
-    pool.utils = map[string][]UtilGPUTimeSeries{}
-
-    pool.TotalGPU = 0
-
-    pool.poolsCount = 100
-    for i := 0; i < pool.poolsCount; i++ {
-        pool.pools = append(pool.pools, map[string]NodeStatus{})
-        pool.poolsMu = append(pool.poolsMu, sync.Mutex{})
-    }
-
-    /* check dead nodes */
-    go func() {
-        pool.heartBeat = map[string]time.Time{}
-
-        for {
-            pool.heartBeatMu.Lock()
-            for k, v := range pool.heartBeat {
-                if v.Add(time.Second * 30).Before(time.Now()) {
-                    poolID := pool.getNodePool(k)
-                    pool.poolsMu[poolID].Lock()
-                    delete(pool.pools[poolID], k)
-                    delete(pool.versions, k)
-                    pool.poolsMu[poolID].Unlock()
-                }
-            }
-            pool.heartBeatMu.Unlock()
-            time.Sleep(time.Second * 10)
-        }
-    }()
-
 /* save pool status periodically */
-    go func() {
+func (pool *ResourcePool) saveStatusHistory() {
     /* waiting for data */
-    pool.history = []PoolStatus{}
     time.Sleep(time.Second * 30)
     for {
         summary := PoolStatus{}
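
The context line return int(h.Sum32()) % pool.poolsCount is the tail of getNodePool, which hashes a client name onto one of the ring slots with an FNV-32 hash from hash/fnv. Here is a sketch of the whole mapping; whether the repository uses fnv.New32 or fnv.New32a is not visible in this hunk, so New32a is an assumption, as is reusing the 300-slot count set in start().

package main

import (
    "fmt"
    "hash/fnv"
)

const poolsCount = 300 // matches the pool.poolsCount chosen in start()

// nodePool maps a node name onto a ring slot, in the spirit of getNodePool.
func nodePool(name string) int {
    h := fnv.New32a() // assumption: the exact FNV variant is not shown in the hunk
    h.Write([]byte(name))
    return int(h.Sum32()) % poolsCount
}

func main() {
    for _, name := range []string{"node-1", "node-2", "node-3"} {
        fmt.Printf("%s -> segment %d\n", name, nodePool(name))
    }
}
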
@@ -113,9 +145,11 @@ func (pool *ResourcePool) start() {
         TotalMemGPU := 0
         AvailableMemGPU := 0
         nodesCount := 0
-        for i := 0; i < pool.poolsCount; i++ {
-            pool.poolsMu[i].Lock()
-            for _, node := range pool.pools[i] {
+        start := pool.pools[0].Next
+        for cur := start; ; {
+            cur.Lock.Lock()
+            for _, node := range cur.Nodes {
                 UtilCPU += node.UtilCPU
                 TotalCPU += node.NumCPU
                 TotalMem += node.MemTotal
@@ -128,8 +162,12 @@ func (pool *ResourcePool) start() {
                     AvailableMemGPU += GPU.MemoryFree
                 }
             }
-            nodesCount += len(pool.pools[i])
-            pool.poolsMu[i].Unlock()
+            nodesCount += len(cur.Nodes)
+            cur.Lock.Unlock()
+            cur = cur.Next
+            if cur == start {
+                break
+            }
         }
         summary.TimeStamp = time.Now().Format("2006-01-02 15:04:05")
         summary.UtilCPU = UtilCPU / (float64(nodesCount) + 0.001)
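
The two hunks above show the traversal pattern this commit substitutes for the old index loop everywhere: begin at pools[0].Next, visit each working segment under its own lock, follow Next, and stop once the walk returns to the starting segment. A condensed, runnable sketch of the same pattern over stand-in types:

package main

import (
    "fmt"
    "sync"
)

type nodeStatus struct{ NumCPU int }

type poolSeg struct {
    Nodes map[string]*nodeStatus
    Lock  sync.Mutex
    Next  *poolSeg
}

// totalCPU walks the ring once, locking each working segment while reading it.
func totalCPU(start *poolSeg) int {
    sum := 0
    for cur := start; ; {
        cur.Lock.Lock()
        for _, n := range cur.Nodes {
            sum += n.NumCPU
        }
        cur.Lock.Unlock()
        cur = cur.Next
        if cur == start {
            break
        }
    }
    return sum
}

func main() {
    // Three working segments linked into a ring.
    a := &poolSeg{Nodes: map[string]*nodeStatus{"n1": {NumCPU: 8}}}
    b := &poolSeg{Nodes: map[string]*nodeStatus{"n2": {NumCPU: 16}}}
    c := &poolSeg{Nodes: map[string]*nodeStatus{"n3": {NumCPU: 4}}}
    a.Next, b.Next, c.Next = b, c, a
    fmt.Println("total CPUs:", totalCPU(a)) // 28
}
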
@@ -154,15 +192,19 @@ func (pool *ResourcePool) start() {
         pool.TotalGPU = TotalGPU
         time.Sleep(time.Second * 60)
     }
-    }()
 }
 
+/* update node info */
 func (pool *ResourcePool) update(node NodeStatus) {
-    poolID := pool.getNodePool(node.ClientID)
-    pool.poolsMu[poolID].Lock()
-    defer pool.poolsMu[poolID].Unlock()
+    segID := pool.getNodePool(node.ClientID)
+    seg := &pool.pools[segID]
+    if seg.Nodes == nil {
+        seg = seg.Next
+    }
+    seg.Lock.Lock()
+    defer seg.Lock.Unlock()
 
+    /* init bindings */
     go func(node NodeStatus) {
         pool.bindingsMu.Lock()
         defer pool.bindingsMu.Unlock()
@@ -180,46 +222,137 @@ func (pool *ResourcePool) update(node NodeStatus) {
     }(node)
 
     pool.counterTotal++
+    pool.versionsMu.Lock()
     if version, ok := pool.versions[node.ClientID]; ok && version == node.Version {
+        pool.versionsMu.Unlock()
         return
     }
+    pool.versionsMu.Unlock()
+    pool.counter++
     log.Debug(node.Version, "!=", pool.versions[node.ClientID])
 
-    pool.counter++
-    status, ok := pool.pools[poolID][node.ClientID]
+    status, ok := seg.Nodes[node.ClientID]
     if ok {
+        /* remain allocation info */
         for i, GPU := range status.Status {
             if GPU.UUID == node.Status[i].UUID {
                 node.Status[i].MemoryAllocated = GPU.MemoryAllocated
             }
         }
     }
-    pool.pools[poolID][node.ClientID] = node
+    seg.Nodes[node.ClientID] = &node
+    if len(seg.Nodes) > 10 {
+        pool.scaleSeg(seg)
+    }
     pool.versions[node.ClientID] = node.Version
 }
 
+/* spilt seg */
+func (pool *ResourcePool) scaleSeg(seg *PoolSeg) {
+    go func() {
+        pool.poolsMu.Lock()
+        defer pool.poolsMu.Unlock()
+
+        var candidate *PoolSeg
+        seg.Lock.Lock()
+
+        /* find previous seg */
+        var pre *PoolSeg
+        for i := seg.ID + pool.poolsCount - 1; i >= 0; i-- {
+            if pool.pools[i%pool.poolsCount].Next != seg {
+                pre = &pool.pools[i%pool.poolsCount]
+                break
+            }
+        }
+
+        step := seg.ID - pre.ID
+        if step < 0 {
+            step += pool.poolsCount
+        }
+
+        /* find seg in the nearest middle */
+        minDistance := step
+        for i := 1; i < step; i++ {
+            if !pool.pools[(i+pre.ID)%pool.poolsCount].IsVirtual {
+                distance := i - step/2
+                if distance < 0 {
+                    distance = -distance
+                }
+                if candidate == nil || distance < minDistance {
+                    candidate = &pool.pools[i]
+                    minDistance = distance
+                }
+            }
+        }
+
+        /* update Next */
+        if candidate != nil {
+            distance := candidate.ID - seg.ID
+            if distance < 0 {
+                distance = -distance
+            }
+            for i := 0; i < distance; i++ {
+                pool.pools[(i+pre.ID)%pool.poolsCount].Lock.Lock()
+                pool.pools[(i+pre.ID)%pool.poolsCount].Next = candidate
+                pool.pools[(i+pre.ID)%pool.poolsCount].Lock.Unlock()
+            }
+            candidate.Lock.Lock()
+            candidate.Next = seg
+            /* move nodes */
+            nodesToMove := map[string]*NodeStatus{}
+            for _, node := range seg.Nodes {
+                seg2ID := pool.getNodePool(node.ClientID)
+                seg2 := &pool.pools[seg2ID]
+                if seg2.Nodes == nil {
+                    seg2 = seg2.Next
+                }
+                if seg2 != seg {
+                    nodesToMove[node.ClientID] = node
+                }
+            }
+            for _, node := range nodesToMove {
+                delete(seg.Nodes, node.ClientID)
+            }
+            candidate.Nodes = nodesToMove
+            candidate.Lock.Unlock()
+        }
+        seg.Lock.Unlock()
+    }()
+}
+
+/* get node by ClientID */
 func (pool *ResourcePool) getByID(id string) NodeStatus {
     poolID := pool.getNodePool(id)
-    pool.poolsMu[poolID].Lock()
-    defer pool.poolsMu[poolID].Unlock()
-
-    status, ok := pool.pools[poolID][id]
+    seg := &pool.pools[poolID]
+    if seg.Nodes == nil {
+        seg = seg.Next
+    }
+    seg.Lock.Lock()
+    defer seg.Lock.Unlock()
+
+    status, ok := seg.Nodes[id]
     if ok {
-        return status
+        return *status
     }
     return NodeStatus{}
 }
 
+/* get all nodes */
 func (pool *ResourcePool) list() MsgResource {
     nodes := map[string]NodeStatus{}
-    for i := 0; i < pool.poolsCount; i++ {
-        pool.poolsMu[i].Lock()
-        for k, node := range pool.pools[i] {
-            nodes[k] = node
+    start := pool.pools[0].Next
+    for cur := start; ; {
+        cur.Lock.Lock()
+        cur.Lock.Unlock()
+
+        for k, node := range cur.Nodes {
+            nodes[k] = *node
+        }
+        cur = cur.Next
+        if cur == start {
+            break
         }
-        pool.poolsMu[i].Unlock()
     }
     return MsgResource{Code: 0, Resource: nodes}
 }
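
scaleSeg splits an over-full working segment: it locates the previous working segment, promotes a non-virtual slot near the midpoint, repoints the intervening Next pointers at it, and then moves every node whose hash no longer resolves to the old segment. The sketch below isolates only that last hand-off step (re-hash each node, keep it only if the ring still resolves to the old segment) over simplified stand-in types; the ring size, slot numbers and node names are invented, and the Next rewiring is done by hand rather than by scaleSeg's midpoint search.

package main

import (
    "fmt"
    "hash/fnv"
)

const ringSize = 12

type nodeStatus struct{ ClientID string }

type poolSeg struct {
    ID    int
    Nodes map[string]*nodeStatus
    Next  *poolSeg
}

// slotOf hashes a client ID onto the ring (FNV-32a here, as a stand-in for getNodePool).
func slotOf(id string) int {
    h := fnv.New32a()
    h.Write([]byte(id))
    return int(h.Sum32()) % ringSize
}

// owner resolves the working segment that serves a slot: the slot itself if it
// holds a Nodes map, otherwise the segment its Next pointer leads to.
func owner(ring []poolSeg, slot int) *poolSeg {
    s := &ring[slot]
    if s.Nodes == nil {
        s = s.Next
    }
    return s
}

// splitNodes re-hashes every node of the old segment after a new working
// segment has been wired into the ring and returns the ones that now resolve
// elsewhere -- the same rule scaleSeg uses to fill nodesToMove.
func splitNodes(ring []poolSeg, old *poolSeg) map[string]*nodeStatus {
    moved := map[string]*nodeStatus{}
    for id, n := range old.Nodes {
        if owner(ring, slotOf(n.ClientID)) != old {
            moved[id] = n
            delete(old.Nodes, id)
        }
    }
    return moved
}

func main() {
    ring := make([]poolSeg, ringSize)
    for i := range ring {
        ring[i].ID = i
    }
    // One working segment at slot 9 owns every node to begin with.
    old := &ring[9]
    old.Nodes = map[string]*nodeStatus{}
    for _, id := range []string{"node-a", "node-b", "node-c", "node-d", "node-e"} {
        old.Nodes[id] = &nodeStatus{ClientID: id}
    }

    // Promote slot 4 to a working segment and repoint the slots it now serves,
    // the way scaleSeg rewires Next before moving nodes.
    cand := &ring[4]
    cand.Nodes = map[string]*nodeStatus{}
    for i := 0; i < ringSize; i++ {
        if i <= 4 || i > 9 {
            ring[i].Next = cand
        } else {
            ring[i].Next = old
        }
    }

    moved := splitNodes(ring, old)
    cand.Nodes = moved
    fmt.Println("moved to segment 4:", len(moved), "- kept in segment 9:", len(old.Nodes))
}
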

(next file)

@@ -88,12 +88,14 @@ func (scheduler *SchedulerFCFS) Schedule(job Job) {
 }
 
 func (scheduler *SchedulerFCFS) AcquireResource(job Job, task Task, nodes []NodeStatus) NodeStatus {
-    poolID := rand.Intn(pool.poolsCount)
-    pool.poolsMu[poolID].Lock()
-    defer pool.poolsMu[poolID].Unlock()
+    segID := rand.Intn(pool.poolsCount)
+    seg := &pool.pools[segID]
+    if seg.Nodes == nil {
+        seg = seg.Next
+    }
 
     res := NodeStatus{}
-    for id, node := range pool.pools[poolID] {
+    for id, node := range seg.Nodes {
         var available []GPUStatus
         for _, status := range node.Status {
             if status.MemoryTotal-status.MemoryAllocated >= task.MemoryGPU {
@@ -122,11 +124,15 @@ func (scheduler *SchedulerFCFS) AcquireResource(job Job, task Task, nodes []Node
 }
 
 func (scheduler *SchedulerFCFS) ReleaseResource(job Job, agent NodeStatus) {
-    poolID := rand.Intn(pool.poolsCount)
-    pool.poolsMu[poolID].Lock()
-    defer pool.poolsMu[poolID].Unlock()
+    segID := pool.getNodePool(agent.ClientID)
+    seg := &pool.pools[segID]
+    if seg.Nodes == nil {
+        seg = seg.Next
+    }
+    seg.Lock.Lock()
+    defer seg.Lock.Unlock()
 
-    node := pool.pools[poolID][agent.ClientID]
+    node := seg.Nodes[agent.ClientID]
     for _, gpu := range agent.Status {
         for j := range node.Status {
             if gpu.UUID == node.Status[j].UUID {
@@ -211,9 +217,10 @@ func (scheduler *SchedulerFCFS) Summary() MsgSummary {
     FreeGPU := 0
     UsingGPU := 0
 
-    for i := 0; i < pool.poolsCount; i++ {
-        pool.poolsMu[i].Lock()
-        for _, node := range pool.pools[i] {
+    start := pool.pools[0].Next
+    for cur := start; ; {
+        cur.Lock.Lock()
+        for _, node := range cur.Nodes {
             for j := range node.Status {
                 if node.Status[j].MemoryAllocated == 0 {
                     FreeGPU++
@@ -222,7 +229,11 @@ func (scheduler *SchedulerFCFS) Summary() MsgSummary {
                 }
             }
         }
-        pool.poolsMu[i].Unlock()
+        cur.Lock.Unlock()
+        cur = cur.Next
+        if cur == start {
+            break
+        }
     }
     summary.FreeGPU = FreeGPU
     summary.UsingGPU = UsingGPU

(next file)

@@ -331,7 +331,7 @@ func (scheduler *SchedulerFair) Schedule(job Job) {
 }
 
 func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []NodeStatus) NodeStatus {
-    poolID := rand.Intn(pool.poolsCount)
+    segID := rand.Intn(pool.poolsCount)
     res := NodeStatus{}
 
     locks := map[int]sync.Mutex{}
@@ -347,13 +347,14 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
         allocationType = 1
         if util, valid := InstanceOfOptimizer().predictUtilGPU(job.Name); valid {
 
-            for i := 0; i < pool.poolsCount; i++ {
-                if _, ok := locks[(i+poolID)%pool.poolsCount]; !ok {
-                    pool.poolsMu[(i+poolID)%pool.poolsCount].Lock()
-                    locks[(i+poolID)%pool.poolsCount] = pool.poolsMu[(i+poolID)%pool.poolsCount]
+            start := pool.pools[segID].Next
+            for cur := start; ; {
+                if _, ok := locks[cur.ID]; !ok {
+                    cur.Lock.Lock()
+                    locks[cur.ID] = cur.Lock
                 }
 
-                for _, node := range pool.pools[(i+poolID)%pool.poolsCount] {
+                for _, node := range cur.Nodes {
                     var available []GPUStatus
                     for _, status := range node.Status {
                         if status.MemoryTotal > task.MemoryGPU+status.MemoryAllocated && status.MemoryFree > task.MemoryGPU {
@@ -375,7 +376,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
                         }
                     }
                     if len(available) >= task.NumberGPU {
-                        candidates = append(candidates, &node)
+                        candidates = append(candidates, node)
                         if len(candidates) >= 8 {
                             break
                         }
@@ -384,6 +385,10 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
                 if len(candidates) >= 8 {
                     break
                 }
+                cur = cur.Next
+                if cur == start {
+                    break
+                }
             }
         }
         //log.Info(candidates)
@@ -392,12 +397,13 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
     /* second round, find vacant gpu */
     if len(candidates) == 0 {
         allocationType = 2
-        for i := 0; i < pool.poolsCount; i++ {
-            if _, ok := locks[(i+poolID)%pool.poolsCount]; !ok {
-                pool.poolsMu[(i+poolID)%pool.poolsCount].Lock()
-                locks[(i+poolID)%pool.poolsCount] = pool.poolsMu[(i+poolID)%pool.poolsCount]
+        start := pool.pools[segID].Next
+        for cur := start; ; {
+            if _, ok := locks[cur.ID]; !ok {
+                cur.Lock.Lock()
+                locks[cur.ID] = cur.Lock
             }
-            for _, node := range pool.pools[(i+poolID)%pool.poolsCount] {
+            for _, node := range cur.Nodes {
                 var available []GPUStatus
                 for _, status := range node.Status {
                     if status.MemoryAllocated == 0 && status.MemoryUsed < 10 {
@@ -405,7 +411,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
                     }
                 }
                 if len(available) >= task.NumberGPU {
-                    candidates = append(candidates, &node)
+                    candidates = append(candidates, node)
                     availableGPUs[node.ClientID] = available
                     if len(candidates) >= 8 {
                         break
@@ -415,6 +421,10 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
             if len(candidates) >= 8 {
                 break
             }
+            cur = cur.Next
+            if cur == start {
+                break
+            }
         }
         //log.Info(candidates)
     }
@@ -429,12 +439,13 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
 
     if pool.TotalGPU != 0 && float64(scheduler.UsingGPU)/float64(pool.TotalGPU) >= scheduler.enablePreScheduleRatio && valid {
         allocationType = 3
-        for i := 0; i < pool.poolsCount; i++ {
-            if _, ok := locks[(i+poolID)%pool.poolsCount]; !ok {
-                pool.poolsMu[(i+poolID)%pool.poolsCount].Lock()
-                locks[(i+poolID)%pool.poolsCount] = pool.poolsMu[(i+poolID)%pool.poolsCount]
+        start := pool.pools[segID].Next
+        for cur := start; ; {
+            if _, ok := locks[cur.ID]; !ok {
+                cur.Lock.Lock()
+                locks[cur.ID] = cur.Lock
             }
-            for _, node := range pool.pools[(i+poolID)%pool.poolsCount] {
+            for _, node := range cur.Nodes {
                 var available []GPUStatus
                 for _, status := range node.Status {
                     bindings := pool.getBindings()
@@ -455,7 +466,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
                     }
                 }
                 if len(available) >= task.NumberGPU {
-                    candidates = append(candidates, &node)
+                    candidates = append(candidates, node)
                     availableGPUs[node.ClientID] = available
                     if len(candidates) >= 8 {
                         break
@@ -512,7 +523,7 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
     }
 
     for i := range locks {
-        pool.poolsMu[i].Unlock()
+        locks[i].Unlock()
     }
 
     go func(res NodeStatus) {
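
Because SchedulerFair.AcquireResource may walk several segments across its three rounds, it remembers each segment lock it takes in the locks map and, as the hunk above shows, releases them all after the candidate search. One detail worth flagging: as extracted here the map holds sync.Mutex values (locks[cur.ID] = cur.Lock), which copies the mutex; the conventional form keeps pointers so the final Unlock releases the very mutex that was locked. A sketch of that pointer-based variant over stand-in types (not the repository's code):

package main

import (
    "fmt"
    "sync"
)

type poolSeg struct {
    ID   int
    Lock sync.Mutex
}

// collect locks each visited segment exactly once, remembering a pointer to its mutex.
func collect(visits []*poolSeg) map[int]*sync.Mutex {
    locks := map[int]*sync.Mutex{}
    for _, cur := range visits {
        if _, ok := locks[cur.ID]; !ok {
            cur.Lock.Lock()
            locks[cur.ID] = &cur.Lock
        }
    }
    return locks
}

func main() {
    s1, s2 := &poolSeg{ID: 1}, &poolSeg{ID: 2}
    visits := []*poolSeg{s1, s2, s1} // a walk may pass the same segment twice
    locks := collect(visits)

    // ... scan each locked segment's nodes for candidate GPUs here ...

    for id, mu := range locks {
        mu.Unlock()
        fmt.Println("released segment", id)
    }
}
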
@@ -538,11 +549,15 @@ func (scheduler *SchedulerFair) AcquireResource(job Job, task Task, nodes []Node
 }
 
 func (scheduler *SchedulerFair) ReleaseResource(job Job, agent NodeStatus) {
-    poolID := pool.getNodePool(agent.ClientID)
-    pool.poolsMu[poolID].Lock()
-    defer pool.poolsMu[poolID].Unlock()
+    segID := pool.getNodePool(agent.ClientID)
+    seg := pool.pools[segID]
+    if seg.Nodes == nil {
+        seg = *seg.Next
+    }
+    seg.Lock.Lock()
+    defer seg.Lock.Unlock()
 
-    node := pool.pools[poolID][agent.ClientID]
+    node := seg.Nodes[agent.ClientID]
     for _, gpu := range agent.Status {
         for j := range node.Status {
             if gpu.UUID == node.Status[j].UUID {
@@ -678,9 +693,10 @@ func (scheduler *SchedulerFair) Summary() MsgSummary {
     FreeGPU := 0
     UsingGPU := 0
 
-    for i := 0; i < pool.poolsCount; i++ {
-        pool.poolsMu[i].Lock()
-        for _, node := range pool.pools[i] {
+    start := pool.pools[0].Next
+    for cur := start; ; {
+        cur.Lock.Lock()
+        for _, node := range cur.Nodes {
             for j := range node.Status {
                 if node.Status[j].MemoryAllocated == 0 {
                     FreeGPU++
@@ -689,7 +705,11 @@ func (scheduler *SchedulerFair) Summary() MsgSummary {
                 }
             }
         }
-        pool.poolsMu[i].Unlock()
+        cur.Lock.Unlock()
+        cur = cur.Next
+        if cur == start {
+            break
+        }
     }
     summary.FreeGPU = FreeGPU
     summary.UsingGPU = UsingGPU
@@ -713,9 +733,10 @@ func (scheduler *SchedulerFair) UpdateNextQueue() {
     MemoryGPU := 0.00001
     CPU := 0.00001
     Memory := 0.0001
-    for i := 0; i < pool.poolsCount; i++ {
-        pool.poolsMu[i].Lock()
-        for _, node := range pool.pools[i] {
+    start := pool.pools[0].Next
+    for cur := start; ; {
+        cur.Lock.Lock()
+        for _, node := range cur.Nodes {
             CPU += float64(node.NumCPU)
             Memory += float64(node.MemTotal)
             for _, card := range node.Status {
@@ -723,7 +744,11 @@ func (scheduler *SchedulerFair) UpdateNextQueue() {
                 MemoryGPU += float64(card.MemoryTotal)
             }
         }
-        pool.poolsMu[i].Unlock()
+        cur.Lock.Unlock()
+        cur = cur.Next
+        if cur == start {
+            break
+        }
     }
 
     scheduler.queueMu.Lock()

(next file)

@@ -112,12 +112,14 @@ func (scheduler *SchedulerPriority) Schedule(job Job) {
 }
 
 func (scheduler *SchedulerPriority) AcquireResource(job Job, task Task, nodes []NodeStatus) NodeStatus {
-    poolID := rand.Intn(pool.poolsCount)
-    pool.poolsMu[poolID].Lock()
-    defer pool.poolsMu[poolID].Unlock()
+    segID := rand.Intn(pool.poolsCount)
+    seg := &pool.pools[segID]
+    if seg.Nodes == nil {
+        seg = seg.Next
+    }
 
     res := NodeStatus{}
-    for id, node := range pool.pools[poolID] {
+    for id, node := range seg.Nodes {
         var available []GPUStatus
         for _, status := range node.Status {
             if status.MemoryTotal-status.MemoryAllocated >= task.MemoryGPU {
@@ -146,11 +148,15 @@ func (scheduler *SchedulerPriority) AcquireResource(job Job, task Task, nodes []
 }
 
 func (scheduler *SchedulerPriority) ReleaseResource(job Job, agent NodeStatus) {
-    poolID := rand.Intn(pool.poolsCount)
-    pool.poolsMu[poolID].Lock()
-    defer pool.poolsMu[poolID].Unlock()
+    segID := pool.getNodePool(agent.ClientID)
+    seg := &pool.pools[segID]
+    if seg.Nodes == nil {
+        seg = seg.Next
+    }
+    seg.Lock.Lock()
+    defer seg.Lock.Unlock()
 
-    node := pool.pools[poolID][agent.ClientID]
+    node := seg.Nodes[agent.ClientID]
     for _, gpu := range agent.Status {
         for j := range node.Status {
             if gpu.UUID == node.Status[j].UUID {
@@ -235,9 +241,10 @@ func (scheduler *SchedulerPriority) Summary() MsgSummary {
     FreeGPU := 0
     UsingGPU := 0
 
-    for i := 0; i < pool.poolsCount; i++ {
-        pool.poolsMu[i].Lock()
-        for _, node := range pool.pools[i] {
+    start := pool.pools[0].Next
+    for cur := start; ; {
+        cur.Lock.Lock()
+        for _, node := range cur.Nodes {
             for j := range node.Status {
                 if node.Status[j].MemoryAllocated == 0 {
                     FreeGPU++
@@ -246,7 +253,11 @@ func (scheduler *SchedulerPriority) Summary() MsgSummary {
                 }
             }
         }
-        pool.poolsMu[i].Unlock()
+        cur.Lock.Unlock()
+        cur = cur.Next
+        if cur == start {
+            break
+        }
     }
     summary.FreeGPU = FreeGPU
     summary.UsingGPU = UsingGPU

(next file)

@@ -6,6 +6,7 @@ import (
     "time"
     "io"
     "net/http"
+    "sync"
 )
 
 type Configuration struct {
@@ -195,6 +196,14 @@ type MsgOptimizerPredict struct {
     Post int `json:"post"`
 }
 
+type PoolSeg struct {
+    ID        int
+    Nodes     map[string]*NodeStatus
+    Lock      sync.Mutex
+    Next      *PoolSeg
+    IsVirtual bool
+}
+
 func str2int(str string, defaultValue int) int {
     i, err := strconv.Atoi(str)
     if err == nil {
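
PoolSeg keeps its nodes as map[string]*NodeStatus: elsewhere in the commit update() stores &node into the map and getByID returns *status, so the pool holds pointers internally and hands value copies back to callers. A trimmed-down sketch of that store-and-copy shape; the nodeStatus type here is a stand-in with only a couple of fields.

package main

import (
    "fmt"
    "sync"
)

type nodeStatus struct {
    ClientID string
    NumCPU   int
}

type poolSeg struct {
    Nodes map[string]*nodeStatus
    Lock  sync.Mutex
}

// update stores (or replaces) the segment's view of a node,
// like seg.Nodes[node.ClientID] = &node in the commit.
func (s *poolSeg) update(n nodeStatus) {
    s.Lock.Lock()
    defer s.Lock.Unlock()
    s.Nodes[n.ClientID] = &n
}

// getByID returns a copy of the stored record, like getByID's `return *status`.
func (s *poolSeg) getByID(id string) nodeStatus {
    s.Lock.Lock()
    defer s.Lock.Unlock()
    if st, ok := s.Nodes[id]; ok {
        return *st
    }
    return nodeStatus{}
}

func main() {
    seg := &poolSeg{Nodes: map[string]*nodeStatus{}}
    seg.update(nodeStatus{ClientID: "node-1", NumCPU: 32})

    got := seg.getByID("node-1")
    got.NumCPU = 0 // mutating the copy does not touch the pool's record
    fmt.Println(seg.getByID("node-1").NumCPU) // 32
}
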