Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CASMTRIAGE-7594 - improve node acquisition and rebalance. #111

Merged
merged 1 commit into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Fixed
- CASMTRIAGE-7594 - clean up resilience, rebalance nodes, and accept other worker nodes
- CASMCMS-9126 - watch permissions on log files to ensure they can be written to

## [2.6.0] - 2024-11-22
### Fixed
Expand Down
7 changes: 5 additions & 2 deletions src/console_node/consoleNodeMain.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ var httpListen string = ":26776"
// inShutdown is the global flag signifying that the service is shutting down.
var inShutdown bool

// opService is the global handle to the active OperatorService; it is
// assigned in main when the services are constructed.
var opService OperatorService

// identify what the name of this pod is
func setPodName() {
// The pod name is set as an env variable by the k8s system on pod
Expand Down Expand Up @@ -142,10 +145,10 @@ func main() {
setPodName()

// Construct services
operatorService := NewOperatorService()
opService = NewOperatorService()

// Find pod location in k8s, this must block and retry
setPodLocation(operatorService)
setPodLocation(opService)

// start the aggregation log
respinAggLog()
Expand Down
30 changes: 30 additions & 0 deletions src/console_node/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,36 @@ type NodeConsoleInfo struct {
NodeConsoleName string `json:"nodeconsolename"` // the pod console
}

// NodePodInfo describes the set of currently active console-node pods. It is
// the structure the console-data "activepods" reply is decoded into.
type NodePodInfo struct {
	// NumActivePods is the number of console-node pods currently active.
	NumActivePods int `json:"numactivepods"`
}

// getNumActiveNodePods queries the console-data service for the number of
// currently active console-node pods.
//
// On success it returns the reported pod count and a nil error. If the query
// or the decoding of the reply fails, the fallback count of 1 is returned
// together with the error. A reply that carries no usable count (no body, a
// missing field, or a zero/negative value) also yields the fallback of 1 so
// that callers never receive a non-positive pod count.
func getNumActiveNodePods() (int, error) {
	// fallback used whenever console-data can not provide a usable answer
	const defaultNumPods = 1

	// make the call to console-data
	url := fmt.Sprintf("%s/activepods", dataAddrBase)
	rb, _, err := getURL(url, nil)
	if err != nil {
		log.Printf("Error in console-data active pods query: %s", err)
		return defaultNumPods, err
	}

	// no body returned - nothing to decode
	if rb == nil {
		return defaultNumPods, nil
	}

	// the reply should be a single NodePodInfo struct
	var numPodsInfo NodePodInfo
	if err := json.Unmarshal(rb, &numPodsInfo); err != nil {
		log.Printf("Error unmarshalling active pods return data: %s", err)
		return defaultNumPods, err
	}

	// json.Unmarshal leaves NumActivePods at zero when the field is absent,
	// so treat any non-positive count as "unknown" and keep the fallback.
	if numPodsInfo.NumActivePods < 1 {
		log.Printf("Invalid active pod count %d returned by console-data - using %d",
			numPodsInfo.NumActivePods, defaultNumPods)
		return defaultNumPods, nil
	}
	return numPodsInfo.NumActivePods, nil
}

// Function to acquire new consoles to monitor
func acquireNewNodes(numMtn, numRvr int, podLocation *PodLocationDataResponse) []nodeConsoleInfo {
// NOTE: in doGetNewNodes thread
Expand Down
24 changes: 24 additions & 0 deletions src/console_node/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ package main
import (
"bytes"
"crypto/sha256"
"fmt"
"io"
"log"
"os"
Expand Down Expand Up @@ -60,12 +61,35 @@ func checkForChanges() {
restartConman = true
}

// make sure that the log files still have the correct permissions
checkLogFiles()

//restart conman if necessary
if restartConman {
signalConmanTERM()
}
}

// function to check the permissions on the log files
func checkLogFiles() {
dlaine-hpe marked this conversation as resolved.
Show resolved Hide resolved
// gather the names of the current nodes
nodes := getCurrNodeXnames()

// check the write permissions of the log files
for _, nn := range nodes {
filename := fmt.Sprintf("/var/log/conman/console.%s", nn)
fs, err := os.Stat(filename)
if err != nil {
continue
}
if fs.Mode()&0600 == 0 {
log.Printf("Log file %s not user read/write - changing permissions", nn)
newMod := fs.Mode() | 0600
os.Chmod(filename, newMod)
}
}
}

// function to continuously monitor for changes that require conman to restart
func doMonitor() {
// NOTE: this is intended to be constantly running in its own thread
Expand Down
Loading
Loading