Skip to content

Commit

Permalink
Merge pull request #111 from Cray-HPE/CASMTRIAGE-7594
Browse files Browse the repository at this point in the history
CASMTRIAGE-7594 - improve node acquisition and rebalance.
  • Loading branch information
dlaine-hpe authored Dec 19, 2024
2 parents e7d9e92 + 0c2192a commit 50b9ba1
Show file tree
Hide file tree
Showing 6 changed files with 281 additions and 81 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Fixed
- CASMTRIAGE-7594 - clean up resilience, rebalance nodes, and accept other worker nodes
- CASMCMS-9126 - watch permissions on log files to ensure they can be written to

## [2.6.0] - 2024-11-22
### Fixed
Expand Down
7 changes: 5 additions & 2 deletions src/console_node/consoleNodeMain.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ var httpListen string = ":26776"
var (
	// inShutdown signifies that the service is shutting down
	inShutdown bool

	// opService refers to the active OperatorService; it stays nil until
	// main assigns the result of NewOperatorService to it
	opService OperatorService
)

// identify what the name of this pod is
func setPodName() {
// The pod name is set as an env variable by the k8s system on pod
Expand Down Expand Up @@ -142,10 +145,10 @@ func main() {
setPodName()

// Construct services
operatorService := NewOperatorService()
opService = NewOperatorService()

// Find pod location in k8s, this must block and retry
setPodLocation(operatorService)
setPodLocation(opService)

// start the aggregation log
respinAggLog()
Expand Down
30 changes: 30 additions & 0 deletions src/console_node/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,36 @@ type NodeConsoleInfo struct {
NodeConsoleName string `json:"nodeconsolename"` // the pod console
}

// NodePodInfo holds information about the currently active console-node pods.
// getNumActiveNodePods decodes the console-data "activepods" reply into it.
type NodePodInfo struct {
	NumActivePods int `json:"numactivepods"` // count of active pods, read from the "numactivepods" JSON key
}

// Query the console-data pod to get the number of currently active console-node pods
func getNumActiveNodePods() (int, error) {
retVal := 1
// make the call to console-data
url := fmt.Sprintf("%s/activepods", dataAddrBase)
rb, _, err := getURL(url, nil)
if err != nil {
log.Printf("Error in console-data active pods query: %s", err)
return retVal, err
}

// process the return
var numPodsInfo NodePodInfo
if rb != nil {
// should be an array of nodeConsoleInfo structs
err := json.Unmarshal(rb, &numPodsInfo)
if err != nil {
log.Printf("Error unmarshalling active pods return data: %s", err)
return retVal, err
}
retVal = numPodsInfo.NumActivePods
}
return retVal, nil
}

// Function to acquire new consoles to monitor
func acquireNewNodes(numMtn, numRvr int, podLocation *PodLocationDataResponse) []nodeConsoleInfo {
// NOTE: in doGetNewNodes thread
Expand Down
24 changes: 24 additions & 0 deletions src/console_node/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ package main
import (
"bytes"
"crypto/sha256"
"fmt"
"io"
"log"
"os"
Expand Down Expand Up @@ -60,12 +61,35 @@ func checkForChanges() {
restartConman = true
}

// make sure that the log files still have the correct permissions
checkLogFiles()

//restart conman if necessary
if restartConman {
signalConmanTERM()
}
}

// checkLogFiles makes sure the conman log file of every node currently being
// monitored is both readable and writable by its owner, adding the missing
// owner permission bits when it is not.
//
// Files that cannot be stat'ed are skipped. A failure to change the
// permissions is logged and does not stop the remaining files being checked.
func checkLogFiles() {
	// gather the names of the current nodes
	nodes := getCurrNodeXnames()

	// check the owner read/write permissions of each node's log file
	for _, nn := range nodes {
		filename := fmt.Sprintf("/var/log/conman/console.%s", nn)
		fs, err := os.Stat(filename)
		if err != nil {
			// nothing to repair if the file cannot be examined
			continue
		}

		// Both owner bits must be present. Comparing the masked value to 0
		// would only catch files with neither bit set and would leave a
		// read-only (0400) or write-only (0200) file unrepaired.
		if fs.Mode()&0600 == 0600 {
			continue
		}

		log.Printf("Log file %s not user read/write - changing permissions", nn)
		if err := os.Chmod(filename, fs.Mode()|0600); err != nil {
			log.Printf("Error changing permissions on log file %s: %s", filename, err)
		}
	}
}

// function to continuously monitor for changes that require conman to restart
func doMonitor() {
// NOTE: this is intended to be constantly running in its own thread
Expand Down
Loading

0 comments on commit 50b9ba1

Please sign in to comment.