diff --git a/background_scripts/xdmod-supremm-admin b/background_scripts/xdmod-supremm-admin new file mode 100644 index 00000000..376e8189 --- /dev/null +++ b/background_scripts/xdmod-supremm-admin @@ -0,0 +1,278 @@ +#!/usr/bin/env php +crit(array( + 'message' => $e->getMessage(), + 'stacktrace' => $e->getTraceAsString(), + )); + } while ($e = $e->getPrevious()); + exit(1); +} + +/** + * Main function. + */ +function main() +{ + global $argv, $logger; + + $opts = array( + 'h' => 'help', + 'v' => 'verbose', + 'd' => 'debug', + 'q' => 'quiet', + 'f' => 'force', + 'r:' => 'resource:', + 'a:' => 'action:' + ); + + $shortOptions = implode('', array_keys($opts)); + $longOptions = array_values($opts); + + $args = getopt($shortOptions, $longOptions); + + if ($args === false) { + fwrite(STDERR, "Failed to parse arguments\n"); + exit(1); + } + + $help = false; + + $config = array( + 'logLevel' => -1, + 'action' => null, + 'resource' => null, + 'force' => false + ); + + foreach ($args as $key => $value) { + if (is_array($value)) { + fwrite(STDERR, "Multiple values not allowed for '$key'\n"); + exit(1); + } + + switch ($key) { + case 'h': + case 'help': + $help = true; + break; + case 'q': + case 'quiet': + $config['logLevel'] = max($config['logLevel'], Log::WARNING); + break; + case 'v': + case 'verbose': + $config['logLevel'] = max($config['logLevel'], Log::INFO); + break; + case 'd': + case 'debug': + $config['logLevel'] = max($config['logLevel'], Log::DEBUG); + break; + case 'f': + case 'force': + $config['force'] = true; + break; + case 'r': + case 'resource': + $config['resource'] = $value; + break; + case 'a': + case 'action': + $config['action'] = $value; + break; + default: + fwrite(STDERR, "Unexpected option '$key'\n"); + exit(1); + break; + } + } + + if ($help) { + displayHelpText(); + exit; + } + + if ($config['logLevel'] === -1) { $config['logLevel'] = Log::NOTICE; } + + $logConf = array( + 'file' => false, + 'mail' => false, + 'consoleLogLevel' => 
$config['logLevel'], + ); + + $logger = Log::factory('xdmod-supremm-admin', $logConf); + + $cmd = implode(' ', array_map('escapeshellarg', $argv)); + $logger->info("Command: $cmd"); + + switch ($config['action']) { + case 'truncate': + truncateAction($config); + break; + default: + $logger->crit('No action specified'); + displayHelpText(); + exit(1); + } + + exit; +} + +/** + * Truncate all the job tables. + */ +function truncateAction($config) +{ + global $logger; + + if ($config['resource'] === null) { + $logger->crit('No resource specified'); + exit(1); + } + + if (!$config['force'] && !confirm('Truncate all job data for resource ' . $config['resource'] . '?')) { + return; + } + + $db = DB::factory('datawarehouse'); + + $searchColumn = 'code'; + if (is_numeric($config['resource'])) { + $searchColumn = 'id'; + } + $resourceQuery = $db->prepare('SELECT id FROM `modw`.`resourcefact` WHERE ' . $searchColumn . ' = :resource'); + $resourceQuery->execute(array('resource' => $config['resource'])); + $result = $resourceQuery->fetchAll(PDO::FETCH_COLUMN, 0); + + if (count($result) !== 1) { + $logger->crit('Unable to find resource ' . $config['resource'] . ' in the database'); + return; + } + + $resource_id = $result[0]; + + $logger->notice('Reset job ingest status for job summaries in mongo for resource_id ' . $resource_id); + + $sdb = new \DataWarehouse\Query\SUPREMM\SupremmDbInterface(); + + $docs_updated = $sdb->updateEtlVersion($resource_id, 0); + if ($docs_updated === null) { + $logger->warning('No mongo data found for resource. No action taken'); + } else { + if ($docs_updated['ok']) { + $logger->notice('Job summary documents status reset for ' . $docs_updated['nModified'] . ' jobs'); + } else { + $logger->warning('Job summary document update failed ' . json_encode($docs_updated)); + } + } + + $logger->notice('Removing job data from sql database for resource_id ' . 
$resource_id); + + $tables = array( + 'executable', + 'cwd', + 'host', + 'jobhost', + 'job_errors' + ); + + foreach ($tables as $table) { + $rows = $db->execute('DELETE FROM `modw_supremm`.`' . $table . '` WHERE resource_id = :resource_id', array('resource_id' => $resource_id)); + $logger->notice('Deleted ' . $rows . ' rows from ' . $table); + } + + $multiDel = <<execute($multiDel, array('resource_id' => $resource_id)); + $logger->notice('Deleted ' . $rows . ' rows from job, job_name and job_peers tables.'); +} + +/** + * Prompt the user for confirmation. + * + * @param string $msg Confirmation message. + * + * @return bool True if the message is confirmed. + */ +function confirm($msg) +{ + $response = null; + + while ($response === null) { + $response = readline("$msg (yes/no): [no] "); + + if (!in_array($response, array('yes', 'no', ''))) { + echo "\n'$response' is not a valid option.\n\n"; + $response = null; + } + } + + return $response == 'yes'; +} + +/** + * Output help text to STDOUT. + */ +function displayHelpText() +{ + echo <<<'EOF' + +Perform administrative tasks. + +This command currently supports truncating all job data for a given resource. + +Usage: xdmod-supremm-admin -a ACTION [OPTS] + + -h, --help + Display this message and exit. + + -v, --verbose + Output info level logging. + + --debug + Output debug level logging. + + -q, --quiet + Output warning level logging. + + -f, --force + Force the action. You will not be prompted to confirm the + action requested. + + -a, --action=[ACTION] + Perform the requested ACTION. + +Actions: + + truncate --resource=RESOURCE + +The truncate action removes the data for a given resource + + Examples: + xdmod-admin --action=truncate --resource='hpc' + + This will remove all job data from the supremm realm for the resource 'hpc'. 
+ +EOF; +} + diff --git a/classes/DataWarehouse/Query/SUPREMM/SupremmDbInterface.php b/classes/DataWarehouse/Query/SUPREMM/SupremmDbInterface.php index 000e397c..8aa65324 100644 --- a/classes/DataWarehouse/Query/SUPREMM/SupremmDbInterface.php +++ b/classes/DataWarehouse/Query/SUPREMM/SupremmDbInterface.php @@ -41,6 +41,29 @@ public function __construct() { } + /** + * Update the etl ingest version number for all documents in mongo for a + * resource. + */ + public function updateEtlVersion($resource_id, $new_etl_version) { + + $resconf =& $this->getResourceConfig($resource_id); + + if( $resconf === null) { + return null; + } + + $collection = $resconf['handle']->selectCollection($resconf['collection']); + + $result = $collection->update( + array('processed.' . $this->getEtlUid() . '.version' => $this->etl_version), + array('$set' => array('processed.' . $this->getEtlUid() . '.version' => $new_etl_version)), + array('multiple' => true, 'socketTimeoutMS' => -1, 'wTimeoutMS' => -1) + ); + + return $result; + } + /** get the list of configured resources * @return array list of resource ids of the configured resources */ diff --git a/docs/supremm-install-overview.md b/docs/supremm-install-overview.md index e9397c70..2e39e508 100644 --- a/docs/supremm-install-overview.md +++ b/docs/supremm-install-overview.md @@ -36,7 +36,7 @@ identical to the version of the base XDMoD install. The versions of the summarization software and PCP software for a given XDMoD version are listed below. -### Open XDMoD 8.0.0 +### Open XDMoD 8.1.0 @@ -46,7 +46,7 @@ below. - + @@ -58,6 +58,10 @@ below. The SUPReMM software has been tested with MongoDB version 3.4.15. We expect that the software is compatible with any supported release version of MongoDB. +The summarization software should be compatible with the 4.x releases of PCP. +However the XDMoD team have not tested this configuration and will not be able +to provide support for it. 
+ System Requirements --------------------- diff --git a/docs/supremm-upgrade.md b/docs/supremm-upgrade.md index 8ff3e2a4..f50604d9 100644 --- a/docs/supremm-upgrade.md +++ b/docs/supremm-upgrade.md @@ -9,26 +9,55 @@ The Job Performance (SUPReMM) XDMoD module should be upgraded at the same time a software. The upgrade procedure is documented on the [XDMoD upgrade page](https://open.xdmod.org/upgrade.html). -**NOTE:** the recommended MySQL/MariaDB database settings have changed and must be updated. See -the [XDMoD Software Requirements](https://open.xdmod.org/8.0/software-requirements.html#mysql) for -details. - -7.5.1 to 8.0.0 Upgrade Notes +8.0.0 to 8.1.0 Upgrade Notes ---------------------------- - This upgrade includes database schema changes. - Modifies `modw_supremm` schema. - - Modifies `modw_aggregates` schema. -The `modw_supremm.batchscripts` table is deprecated and is replaced by the `modw_supremm.job_scripts` table. -The contents of the `batchscripts` table are migrated to the `job_scripts` table by -the `xdmod-upgrade` script. The `batchscripts` table is not deleted automatically, but can be safely -dropped from the database after a successful upgrade. +The `modw_supremm.job` and `modw_supremm.job_errors` tables have extra columns to store the job +energy metrics. These columns are added to the tables by the upgrade script. + +The `modw_supremm.jobhost` table stores information about the compute nodes for each job. Prior +to 8.1.0 this table used the job identifier provided by the resource manager as a unique identifier. +The data stored in this table is used to determine whether a job shared a compute node with +any other job. The consequence of this design is that the shared jobs and job peers data in XDMoD +would be incorrect if the resource manager re-used job identifiers. This has been observed when +the job identifier counter on the resource manager wraps around.
This update adds the job's end time +to the unique constraint and populates it based on the existing information in the datawarehouse. + +No further action is required if any of the following apply: +- The HPC resources are not configured to allow HPC jobs to share compute nodes +- The shared jobs flag is set to false for a resource +- The HPC resource manager job identifier is unique + +However, if there is data for an HPC resource that has non-unique job identifiers and shared +compute nodes then it will be necessary to re-ingest the job data to get correct information +into the database. This is done as follows: + +First the existing data for the resource must be removed from the database using the following +command: + + $ /usr/lib64/xdmod/xdmod-supremm-admin --action truncate --resource [RESOURCE] -d + +The amount of time this script takes depends on the number of jobs that have +information in the database and the performance of the MongoDB and MySQL +databases. In a test run for a resource that had approximately 2 million jobs it took +approximately 20 minutes to reset the status in MongoDB and 10 minutes to delete the +records from the MySQL database tables. + +Then the job data should be re-ingested: + + $ cd /usr/share/xdmod/etl/js + $ node etl.cluster.js --dataset=[RESOURCE] + +Then the shared jobs script should be run to reprocess all jobs for the resource: + + $ /usr/lib64/xdmod/supremm_sharedjobs.php --resource [RESOURCE] -a + +Finally the aggregation step should be run: -The `modw_supremm.jobstatus` table was used to track the aggregation status of each row -in the `modw_supremm.job` table. The `jobstatus` table is not longer used and is removed -by the upgrade. The `modw_supremm.job` table now has an extra column that stores the modification -time for each row. + $ aggregate_supremm.sh -The modifications to the `modw_aggregates` schema are made by the aggregation software. 
These run -automatically the first time `aggregate_supremm.sh` runs after the upgrade. +The debug flag `-d` may also be specified if you wish to track the progress of the +scripts.
Job Summarization 1.1.0 1.1.x Job Summarization 1.1.1 1.1.x
PCP 3.12.2 3.11.x - 3.12.x