From 740b18daa001aa1d11ca0e6aa13132b4f442cfa9 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 19 Sep 2024 14:44:00 -0400 Subject: [PATCH 1/3] include validDataLocation in cached value/query --- src/main/java/edu/harvard/iq/dataverse/api/Metrics.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Metrics.java b/src/main/java/edu/harvard/iq/dataverse/api/Metrics.java index 452e5df9f9a..f36c514859e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Metrics.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Metrics.java @@ -206,12 +206,13 @@ public Response getDatasetsTimeSeriest(@Context Request req, @Context UriInfo ur return error(BAD_REQUEST, ia.getLocalizedMessage()); } String metricName = "datasets"; - JsonArray jsonArray = MetricsUtil.stringToJsonArray(metricsSvc.returnUnexpiredCacheAllTime(metricName, null, d)); + String validDataLocation = MetricsUtil.validateDataLocationStringType(dataLocation); + JsonArray jsonArray = MetricsUtil.stringToJsonArray(metricsSvc.returnUnexpiredCacheAllTime(metricName, validDataLocation, d)); if (null == jsonArray) { // run query and save - jsonArray = metricsSvc.getDatasetsTimeSeries(uriInfo, dataLocation, d); - metricsSvc.save(new Metric(metricName, null, null, d, jsonArray.toString())); + jsonArray = metricsSvc.getDatasetsTimeSeries(uriInfo, validDataLocation, d); + metricsSvc.save(new Metric(metricName, null, validDataLocation, d, jsonArray.toString())); } MediaType requestedType = getVariant(req, MediaType.valueOf(FileUtil.MIME_TYPE_CSV), MediaType.APPLICATION_JSON_TYPE); if ((requestedType != null) && (requestedType.equals(MediaType.APPLICATION_JSON_TYPE))) { From efb90050c69b6fcb37a0221694c9bfab873c4901 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 19 Sep 2024 14:44:49 -0400 Subject: [PATCH 2/3] fix queries w.r.t. minorversion ordering --- .../dataverse/metrics/MetricsServiceBean.java | 39 +++++++------------ 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/metrics/MetricsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/metrics/MetricsServiceBean.java index a74474efa15..5bdbeac031d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/metrics/MetricsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/metrics/MetricsServiceBean.java @@ -168,25 +168,12 @@ public long datasetsToMonth(String yyyymm, String dataLocation, Dataverse d) { } } - // Note that this SQL line in the code below: - // datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) - // behaves somewhat counter-intuitively if the versionnumber and/or - // minorversionnumber is/are NULL - it results in an empty string - // (NOT the string "{dataset_id}:", in other words). Some harvested - // versions do not have version numbers (only the ones harvested from - // other Dataverses!) It works fine - // for our purposes below, because we are simply counting the selected - // lines - i.e. we don't care if some of these lines are empty. - // But do not use this notation if you need the values returned to - // meaningfully identify the datasets! - - Query query = em.createNativeQuery( "select count(*)\n" + "from (\n" - + "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber))\n" + + "select DISTINCT ON (datasetversion.dataset_id) datasetversion.dataset_id \n" + "from datasetversion\n" + "join dataset on dataset.id = datasetversion.dataset_id\n" + ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n") @@ -194,7 +181,7 @@ public long datasetsToMonth(String yyyymm, String dataLocation, Dataverse d) { + ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n ") + "and \n" + dataLocationLine // be careful about adding more and statements after this line. - + "group by dataset_id \n" + + " order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc\n" +") sub_temp" ); logger.log(Level.FINE, "Metric query: {0}", query); @@ -207,15 +194,15 @@ public List datasetsBySubjectToMonth(String yyyymm, String dataLocatio // A published local datasets may have more than one released version! // So that's why we have to jump through some extra hoops below // in order to select the latest one: - String originClause = "(datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in\n" + + String originClause = "(datasetversion.id in\n" + "(\n" + - "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber))\n" + + "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id\n" + " from datasetversion\n" + " join dataset on dataset.id = datasetversion.dataset_id\n" + " where versionstate='RELEASED'\n" + " and dataset.harvestingclient_id is null\n" + " and date_trunc('month', releasetime) <= to_date('" + yyyymm + "','YYYY-MM')\n" + - " group by dataset_id\n" + + " order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc\n" + "))\n"; if (!DATA_LOCATION_LOCAL.equals(dataLocation)) { // Default api state is DATA_LOCATION_LOCAL @@ -273,7 +260,7 @@ public long datasetsPastDays(int days, String dataLocation, Dataverse d) { Query query = em.createNativeQuery( "select count(*)\n" + "from (\n" - + "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max\n" + + "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id\n" + "from datasetversion\n" + "join dataset on dataset.id = datasetversion.dataset_id\n" + ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n") @@ -281,7 +268,7 @@ public long datasetsPastDays(int days, String dataLocation, Dataverse d) { + ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n") + "and \n" + dataLocationLine // be careful about adding more and statements after this line. - + "group by dataset_id \n" + + " order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc \n" +") sub_temp" ); logger.log(Level.FINE, "Metric query: {0}", query); @@ -322,9 +309,9 @@ public long filesToMonth(String yyyymm, Dataverse d) { + "select count(*)\n" + "from filemetadata\n" + "join datasetversion on datasetversion.id = filemetadata.datasetversion_id\n" - + "where datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in \n" + + "where datasetversion.id in \n" + "(\n" - + "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max \n" + + "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id \n" + "from datasetversion\n" + "join dataset on dataset.id = datasetversion.dataset_id\n" + ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n") @@ -332,7 +319,7 @@ public long filesToMonth(String yyyymm, Dataverse d) { + ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n") + "and date_trunc('month', releasetime) <= to_date('" + yyyymm + "','YYYY-MM')\n" + "and dataset.harvestingclient_id is null\n" - + "group by dataset_id \n" + + "order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber \n" + ");" ); logger.log(Level.FINE, "Metric query: {0}", query); @@ -345,9 +332,9 @@ public long filesPastDays(int days, Dataverse d) { + "select count(*)\n" + "from filemetadata\n" + "join datasetversion on datasetversion.id = filemetadata.datasetversion_id\n" - + "where datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in \n" + + "where datasetversion.id in \n" + "(\n" - + "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max \n" + + "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id \n" + "from datasetversion\n" + "join dataset on dataset.id = datasetversion.dataset_id\n" + ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n") @@ -355,7 +342,7 @@ public long filesPastDays(int days, Dataverse d) { + "and releasetime > current_date - interval '" + days + "' day\n" + ((d == null) ? "" : "AND dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n") + "and dataset.harvestingclient_id is null\n" - + "group by dataset_id \n" + + "order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc \n" + ");" ); logger.log(Level.FINE, "Metric query: {0}", query); From 35c9b5abcbec7aaaa5f93209f16da7740dd6147d Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 19 Sep 2024 15:12:04 -0400 Subject: [PATCH 3/3] release note --- doc/release-notes/10379-MetricsBugsFixes.md | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 doc/release-notes/10379-MetricsBugsFixes.md diff --git a/doc/release-notes/10379-MetricsBugsFixes.md b/doc/release-notes/10379-MetricsBugsFixes.md new file mode 100644 index 00000000000..0ebc6d99f0b --- /dev/null +++ b/doc/release-notes/10379-MetricsBugsFixes.md @@ -0,0 +1,10 @@ + +### Metrics API Bug fixes + +Two bugs in the Metrics API have been fixed: + +- The /datasets and /datasets/byMonth endpoints could report incorrect values if/when they have been called using the dataLocation parameter (which allows getting metrics for local, remote (harvested), or all datasets) as the metrics cache was not storing different values for these cases. + +- Metrics endpoints who's calculation relied on finding the latest published datasetversion were incorrect if/when the minor version number was > 9. + +When deploying the new release, the [/api/admin/clearMetricsCache](https://guides.dataverse.org/en/latest/api/native-api.html#metrics) API should be called to remove old cached values that may be incorrect. \ No newline at end of file