Merge pull request #166 from jpwhite4/network
Added extra mappings
jpwhite4 authored Sep 19, 2019
2 parents 9dc19ad + 102c1c2 commit 046edc6
Showing 16 changed files with 603 additions and 167 deletions.
61 changes: 42 additions & 19 deletions background_scripts/xdmod-supremm-admin
@@ -121,7 +121,10 @@ function main()

switch ($config['action']) {
case 'truncate':
truncateAction($config);
truncateAction($config, true);
break;
case 'reset':
truncateAction($config, false);
break;
default:
$logger->crit('No action specified');
@@ -132,10 +135,12 @@
exit;
}


/**
* Truncate all the job tables.
* Reset the job status in mongo and optionally delete the contents
* of the SQL tables.
*/
function truncateAction($config)
function truncateAction($config, $deleteSqlData)
{
global $logger;

@@ -144,7 +149,9 @@ function truncateAction($config)
exit(1);
}

if (!$config['force'] && !confirm('Truncate all job data for resource ' . $config['resource'] . '?')) {
$verb = $deleteSqlData ? 'Truncate' : 'Reset';

if (!$config['force'] && !confirm($verb . ' all job data for resource ' . $config['resource'] . '?')) {
return;
}

@@ -180,31 +187,33 @@ function truncateAction($config)
}
}

$logger->notice('Removing job data from sql database for resource_id ' . $resource_id);
if ($deleteSqlData) {
$logger->notice('Removing job data from sql database for resource_id ' . $resource_id);

$tables = array(
'executable',
'cwd',
'host',
'jobhost',
'job_errors'
);
$tables = array(
'executable',
'cwd',
'host',
'jobhost',
'job_errors'
);

foreach ($tables as $table) {
$rows = $db->execute('DELETE FROM `modw_supremm`.`' . $table . '` WHERE resource_id = :resource_id', array('resource_id' => $resource_id));
$logger->notice('Deleted ' . $rows . ' rows from ' . $table);
}
foreach ($tables as $table) {
$rows = $db->execute('DELETE FROM `modw_supremm`.`' . $table . '` WHERE resource_id = :resource_id', array('resource_id' => $resource_id));
$logger->notice('Deleted ' . $rows . ' rows from ' . $table);
}

$multiDel = <<<EOF
$multiDel = <<<EOF
DELETE FROM `modw_supremm` . `job_name` , `modw_supremm` . `job_peers`
USING `modw_supremm`.`job`
LEFT JOIN `modw_supremm`.`job_name` ON `modw_supremm`.`job`.jobname_id = `modw_supremm`.`job_name`.id
LEFT JOIN `modw_supremm`.`job_peers` ON `modw_supremm`.`job`._id = `modw_supremm`.`job_peers`.job_id
WHERE
`modw_supremm`.`job`.resource_id = :resource_id
EOF;
$rows = $db->execute($multiDel, array('resource_id' => $resource_id));
$logger->notice('Deleted ' . $rows . ' rows from job_name and job_peers tables.');
$rows = $db->execute($multiDel, array('resource_id' => $resource_id));
$logger->notice('Deleted ' . $rows . ' rows from job_name and job_peers tables.');
}
}

/**
@@ -273,6 +282,20 @@ The truncate action removes the data for a given resource
This will remove all job data from the supremm realm for the resource 'hpc'.
reset --resource=RESOURCE
The reset action resets the ingest status in MongoDB for the jobs for a given
resource. This will cause all job data for the resource to be reingested next
time the aggregate_supremm script is run.
Examples:
xdmod-admin --action=reset --resource='hpc'
Reset the ingest status for all jobs for the 'hpc' resource in MongoDB that
have already been ingested.


EOF;
}

19 changes: 19 additions & 0 deletions configuration/etl/etl.d/supremm-migration-8_1_2-8_5_0.json
@@ -36,6 +36,25 @@
}
}
},
{
"name": "table-maintenance",
"description": "Modify job tables",
"class": "ManageTables",
"namespace": "ETL\\Maintenance",
"options_class": "MaintenanceOptions",
"definition_file_list": [
"supremm/job.json",
"supremm/job_errors.json"
],
"endpoints": {
"destination": {
"type": "mysql",
"name": "Datawarehouse",
"config": "datawarehouse",
"schema": "modw_supremm"
}
}
},
{
"name": "table-create",
"description": "Setup tables",
11 changes: 10 additions & 1 deletion configuration/supremm_resources.json
@@ -5,7 +5,16 @@
"resource_id": 1,
"datasetmap": "pcp",
"hardware": {
"gpfs": "gpfs0"
"gpfs": "gpfs0",
"network": [
"em1",
"eno1"
],
"mounts": {
"projects": "/projects",
"home": "/user",
"util": "/util"
}
}
}
]
87 changes: 82 additions & 5 deletions docs/supremm-configuration.md
@@ -96,7 +96,16 @@ the `resources.json` and `resource_specs.json` main configuration files
"enabled": true,
"datasetmap": "pcp",
"hardware": {
"gpfs": ""
"gpfs": "gpfs0",
"network": [
"em1",
"eno1"
],
"mounts": {
"projects": "/projects",
"home": "/user",
"util": "/util"
}
}
}
]
@@ -121,10 +130,78 @@ The `datasetmap` option allows the ingestion of Job Performance data from different
data sources. Currently PCP is the only supported data source.

The `hardware` property is used by the dataset mapping code to process PCP
metrics that have device-specific names. The only configurable mapping
in this release is the name of the GPFS mount point. If the resource has
a GPFS filesystem then set `hardware.gpfs` to the name of the GPFS mount point.
Set this to an empty string if there is no GPFS filesystem for the resource.
metrics that have device-specific names. There are configurable mappings
for Ethernet network devices, GPFS devices and mounted NFS filesystems.
The XDMoD statistics for each mapping setting are displayed in the table below.
<table>
<thead>
<tr>
<th>Configuration Property</th> <th>XDMoD Statistics</th> <th>XDMoD Group Bys</th> <th>Job Viewer Summary tab statistics</th>
</tr>
</thead>
<tbody>
<tr> <td><code>hardware.gpfs</code></td><td>"Average gpfs receive rate", "Average gpfs transmit rate"</td><td>"GPFS bytes received"</td><td>"Parallel filesystem gpfs &ast;"</td></tr>
<tr> <td><code>hardware.network</code></td><td>"Average eth0 receive rate", "Average eth0 transmit rate"</td><td> </td><td>"Net Eth0 &ast;"</td></tr>
<tr> <td><code>hardware.mounts.projects</code></td><td>"Avg /projects write rate"</td><td></td><td>"Mount point "projects" data &ast;"</td></tr>
<tr> <td><code>hardware.mounts.home</code></td><td>"Avg /home write rate"</td><td></td><td>"Mount point "home" data &ast;"</td></tr>
<tr> <td><code>hardware.mounts.util</code></td><td>"Avg /util write rate"</td><td></td><td>"Mount point "util" data &ast;"</td></tr>
</tbody>
</table>
<br />

The mapping allows multiple modes of operation. The mapping software
can compute the sum of all the statistics collected from the devices.
It can also be given a list of device names in priority order and will
use the statistics from the first device that is found. This feature
is particularly useful for heterogeneous clusters.
The list below describes the appropriate value to set in the configuration
file for a given scenario.

- Specify the name of the device as reported by the O/S on the compute nodes.
- Specify an empty string if the device is absent or you do not wish the
data to appear in XDMoD.
- Specify the string `all` if you would like the metric in XDMoD to be the
sum of all of the detected devices on the compute nodes.
- Specify a list of device names. The mapping software will use the first device
name in the list that is present in the summary statistics for each job.
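
A minimal sketch of this device-selection behaviour is shown below. It is not the
actual dataset-mapping code; the function name `resolveDevices` and its arguments
are hypothetical and are only intended to make the empty-string, `all`, and
first-match rules concrete.

```php
<?php
// Illustrative sketch only: resolve a "hardware" mapping value against the
// device names present in a job's summary statistics. $setting may be a
// single device name, the string "all", an empty string, or a
// priority-ordered array of device names.
function resolveDevices($setting, array $devicesInJob)
{
    if ($setting === '' || $setting === null) {
        return array();           // device absent or intentionally not mapped
    }
    if ($setting === 'all') {
        return $devicesInJob;     // sum the statistics over every detected device
    }
    foreach ((array) $setting as $name) {
        if (in_array($name, $devicesInJob, true)) {
            return array($name);  // first configured device present in the job wins
        }
    }
    return array();               // none of the configured devices were found
}
```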


#### Examples

An example hardware configuration setting is shown below:
```json
"hardware": {
"gpfs": "gpfs0.example.edu",
"network": [
"em1",
"eno1"
],
"mounts": {
"projects": "/projects",
"home": "/user",
"util": "/util"
}
}
```
In this example the mapping would work as follows:
- The various XDMoD GPFS parallel filesystem statistics would be based on the GPFS filesystem `gpfs0.example.edu` mounted on the compute nodes.
- The various XDMoD Ethernet statistics would be based on the data read from and written to the `em1` device on compute nodes that have an `em1` Ethernet device, and to the `eno1` device on compute
nodes that do not have an `em1` device but do have an `eno1` device.
- The XDMoD `projects` filesystem statistics would be from the statistics collected from the NFS filesystem mounted at `/projects` on the compute nodes.
- The XDMoD `home` filesystem statistics would be from the statistics collected from the NFS filesystem mounted at `/user` on the compute nodes.
- The XDMoD `util` filesystem statistics would be from the statistics collected from the NFS filesystem mounted at `/util` on the compute nodes.

```json
"hardware": {
"gpfs": "all",
"network": "eth1"
}
```
In this example the mapping would work as follows:
- The various XDMoD GPFS parallel filesystem statistics would be based on the sum of the statistics for all of the mounted GPFS filesystems.
- The various XDMoD Ethernet statistics would be based on the data read from and written to the `eth1` device on compute nodes.
- No data would be stored in XDMoD for NFS filesystems.
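
Continuing the illustration, the hypothetical `resolveDevices()` sketch from the
previous section would resolve the example settings as follows for a job whose
summary statistics report the devices `eth0`, `eth1` and `ib0`:

```php
<?php
// Assumes the hypothetical resolveDevices() sketch shown earlier is in scope.
$devicesInJob = array('eth0', 'eth1', 'ib0');

var_dump(resolveDevices('eth1', $devicesInJob));               // array('eth1')
var_dump(resolveDevices('all', $devicesInJob));                // eth0, eth1 and ib0 (summed)
var_dump(resolveDevices(array('em1', 'eno1'), $devicesInJob)); // array() - no configured device present
```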


### portal_settings.d/supremm.ini

54 changes: 17 additions & 37 deletions docs/supremm-upgrade.md
@@ -9,55 +9,35 @@ The Job Performance (SUPReMM) XDMoD module should be upgraded at the same time a
software. The upgrade procedure is documented on the [XDMoD upgrade
page](https://open.xdmod.org/upgrade.html).

8.0.0 to 8.1.0 Upgrade Notes
8.1.0 to 8.5.0 Upgrade Notes
----------------------------

- This upgrade includes database schema changes.
- Modifies `modw_supremm` schema.

The `modw_supremm.job` and `modw_supremm.job_error` table have extra columns to store the job
energy metrics. These columns are added to the tables by the upgrade script.
The `modw_supremm.job` and `modw_supremm.job_error` tables have extra columns to store metrics
about job I/O. These columns are added to the tables by the upgrade script.

The `modw_suprem.jobhosts` table stores information about the compute nodes for each job. Prior
to 8.1.0 this table used the job identifier provided by the resource manager as a unique identifier.
This data stored in this table is used to determine whether a job shared compute node with
any other job. The consequence of this design is that the shared jobs and job peers data in XDMoD
would be incorrect if the resource manager re-used job identifiers. This has been observed when
the job identifier counter on the resource manager wraps around. This update adds the job's end time
to the unique constraint and populates it based on the existing information in the datawarehouse.
See the [Configuration Guide](supremm-configuration.md#supremm_resourcesjson) for information
about how to define the data mapping for the new I/O metrics.

No further action is required if any of the following apply:
- The HPC resources are not configured to allow HPC jobs to share compute nodes
- The shared jobs flag is set to false for a resource
- The HPC resource manager job identifier is unique
Changes to the mapping only affect job metrics ingested after the configuration file
is modified. The metrics for jobs that have already been ingested are not automatically
updated. To update the data for existing jobs it is necessary to reset the job ingest
status and then run the ingest and aggregation script.

However, if there is data for an HPC resource that has non-unique job identifiers and shared
compute nodes then it will be necessary to re-ingest the job data to get correct information
into the database. This is done as follows:
**If you do not update the data mapping, then you do not need to perform these steps.**

First the existing data for the resource must be removed from the database using the following
command:
Resetting the job ingest status and re-ingesting the data is done as follows:

$ /usr/lib64/xdmod/xdmod-supremm-admin --action truncate --resource [RESOURCE] -d
1) Reset the job ingest status for all jobs on each HPC resource:

The amount of time this script takes depends on the number of jobs that have
information in the database and the performance of the MongoDB and MySQL
databases. In a test run for a resource that had approximately 2 million jobs it took
approximately 20 minutes to reset the status in MongoDB and 10 minutes to delete the
records from the MySQL database tables.
$ /usr/lib64/xdmod/xdmod-supremm-admin --action reset --resource [RESOURCE] -d

Then the job data should be re-ingested:
The amount of time this script takes depends on the number of jobs. In a test
run for a resource that had approximately 2 million jobs it took approximately
20 minutes to reset the status.

$ cd /usr/share/xdmod/etl/js
$ node --max-old-space-size=4096 etl.cluster.js --dataset=[RESOURCE]

Then the shared jobs script should be run to reprocess all jobs for the resource:

$ /usr/lib64/xdmod/supremm_sharedjobs.php --resource [RESOURCE] -a

Finally the aggregation step should be run:
2) Run the ingest and aggregation script:

$ aggregate_supremm.sh

The debug flag `-d` may also be specified if you wish to track the progress of the
scripts.