bird-house · mishaschwartz · Jun 4, 2024 · Jun 7, 2024 · Jun 18, 2024 · Jun 18, 2024
@@ -15,7 +15,39 @@
 [Unreleased](https://github.com/bird-house/birdhouse-deploy/tree/master) (latest)
 ------------------------------------------------------------------------------------------------------------------
 
-[//]: # (list changes here, using '-' for each new entry, remove this when items are added)
+## Changes
+
+- Add the `prometheus-longterm-metrics` and `thanos` optional components
+
+  The `prometheus-longterm-metrics` component collects longterm monitoring metrics from the original prometheus instance
+  (the one created by the ``components/monitoring`` component).
+
+  Longterm metrics are any prometheus rule that has the label ``group: longterm-metrics`` or in other words are
+  selectable using prometheus's ``'{group="longterm-metrics"}'`` query filter. To add some default longterm metric
+  rules, also enable the `prometheus-longterm-rules` optional component. To see which rules it adds, see the
+  ``optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules`` file.
+
+  To configure this component:
+
+  * update the ``PROMETHEUS_LONGTERM_RETENTION_TIME`` variable to set how long the data will be kept by prometheus
+
+  Enabling the `prometheus-longterm-metrics` component creates the additional endpoint ``/prometheus-longterm-metrics``.
+
+  The `thanos` component enables better storage of longterm metrics collected by the 
+  ``optional-components/prometheus-longterm-metrics`` component. Data will be collected from the
+  ``prometheus-longterm-metrics`` and stored in an S3 object store indefinitely.
+
+  When enabling this component, please change the default values for the ``THANOS_MINIO_ROOT_USER`` and ``THANOS_MINIO_ROOT_PASSWORD``
+  by updating the ``env.local`` file. These set the login credentials for the root user that runs the 
+  [minio](https://min.io/) object store.
+
+  Enabling the `thanos` component creates the additional endpoints:
+
+  * ``/thanos-query``: a prometheus-like query interface to inspect the data stored by thanos
+  * ``/thanos-minio``: a minio web console to inspect the data stored by minio.
+
+- Update the prometheus version from `v2.19.0` to the current latest `v2.52.0`. This is required to support the interaction between 
+  prometheus and thanos.
 
 [2.5.3](https://github.com/bird-house/birdhouse-deploy/tree/2.5.3) (2024-09-11)
 ------------------------------------------------------------------------------------------------------------------

@@ -371,6 +371,7 @@ AlertManager for Alert Dashboard and Silencing
 .. image:: monitoring/images/alertmanager-dashboard.png
 .. image:: monitoring/images/alertmanager-silence-alert.png
 
+.. _monitoring-customize-the-component:
 
 Customizing the Component
 -------------------------
@@ -389,6 +390,23 @@ Customizing the Component
   Slack or other services accepting webhooks), ``ALERTMANAGER_EXTRA_RECEIVERS``.
 
 
+Longterm Storage of Prometheus Metrics
+--------------------------------------
+
+Prometheus stores metrics for 90 days by default. This may be sufficient for some use cases but you may wish to store
+some metrics for longer. In order to store certain metrics for longer than 90 days, you can enable the following
+additional components:
+
+- :ref:`prometheus-longterm-metrics`: a second Prometheus instance used to collect the metrics that you want to store longterm
+- :ref:`thanos`: a service that enables more efficient storage of the metrics collected by the :ref:`prometheus-longterm-metrics`
+  component.
+
+.. note::
+    A separate prometheus instance is necessary since the retention time for prometheus metrics is set at the 
+    instance level. This means that increasing the retention time must be done for all metrics at once which is undesirable
+    because you probably don't need to store every metric for a long period of time and you'll end up using a lot more
+    disk space than needed.
+
 Weaver
 ======
 

diff --git a/birdhouse/components/monitoring/default.env b/birdhouse/components/monitoring/default.env
@@ -8,7 +8,7 @@ export GRAFANA_VERSION="7.0.3"
 export GRAFANA_DOCKER=grafana/grafana
 export GRAFANA_IMAGE='${GRAFANA_DOCKER}:${GRAFANA_VERSION}'
 
-export PROMETHEUS_VERSION="v2.19.0"
+export PROMETHEUS_VERSION="v2.52.0"
 export PROMETHEUS_DOCKER=prom/prometheus
 export PROMETHEUS_IMAGE='${PROMETHEUS_DOCKER}:${PROMETHEUS_VERSION}'
 

@@ -574,6 +574,14 @@ export THREDDS_ADDITIONAL_CATALOG=""
 #export ALERTMANAGER_EXTRA_INHIBITION=""
 #export ALERTMANAGER_EXTRA_RECEIVERS=""
 
+# Below are for the prometheus-longterm-metrics optional component
+#export PROMETHEUS_LONGTERM_RETENTION_TIME=1y
+
+# Below are for the thanos optional component
+# Change these from the default for added security
+#export THANOS_MINIO_ROOT_USER="${__DEFAULT__THANOS_MINIO_ROOT_USER}"
+#export THANOS_MINIO_ROOT_PASSWORD="${__DEFAULT__THANOS_MINIO_ROOT_PASSWORD}"
+
 #############################################################################
 # Emu optional vars
 #############################################################################

@@ -443,3 +443,54 @@ How to enable X-Robots-Tag Header in ``env.local`` (a copy from `env.local.examp
 
     .. seealso::
         See the `env.local.example`_ file for more details about this ``BIRDHOUSE_PROXY_ROOT_LOCATION`` behaviour.
+
+.. _prometheus-longterm-metrics:
+
+Prometheus Long-term Metrics
+----------------------------
+
+This is a second prometheus instance that collects longterm monitoring metrics from the original prometheus instance
+(the one created by the ``components/monitoring`` component).
+
+Longterm metrics are any prometheus rule that has the label ``group: longterm-metrics`` or in other words are
+selectable using prometheus' ``'{group="longterm-metrics"}'`` query filter. To add some default longterm metrics rules
+also enable the ``prometheus-longterm-rules`` component.
+
+You may also choose to create your own set of rules instead of, or as well as, the default ones. See how to 
+:ref:`add additional rules here <monitoring-customize-the-component>`.
+
+To configure this component:
+
+    * update the ``PROMETHEUS_LONGTERM_RETENTION_TIME`` variable to set how long the data will be kept by prometheus
+
+Enabling this component creates the additional endpoint ``/prometheus-longterm-metrics``.
+
+.. _prometheus-longterm-rules:
+
+Prometheus Long-term Rules
+--------------------------
+
+This adds some default longterm metrics rules to the ``prometheus`` component for use by the
+``prometheus-longterm-metrics`` component. These rules all have the label ``group: longterm-metrics``.
+
+To see which rules are added, check out the
+``optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules`` file.
+
+.. _thanos:
+
+Thanos
+------
+
+This enables better storage of longterm metrics collected by the ``optional-components/prometheus-longterm-metrics``
+component. Data will be collected from the ``prometheus-longterm-metrics`` and stored in an S3 object store
+indefinitely.
+
+When enabling this component, please change the default values for the ``THANOS_MINIO_ROOT_USER`` and
+``THANOS_MINIO_ROOT_PASSWORD`` by updating the ``env.local`` file. These set the login credentials for the root user
+that runs the minio_ object store.
+
+Enabling this component creates the additional endpoints:
+
+    * ``/thanos-query``: a prometheus-like query interface to inspect the data stored by thanos
+    * ``/thanos-minio``: a minio_ web console to inspect the data stored by minio_.
+
+.. _minio: https://min.io/
@@ -0,0 +1,3 @@
+prometheus.yml
+config/magpie/config.yml
+config/proxy/conf.extra-service.d/monitoring.conf
@@ -0,0 +1,28 @@
+# Magpie provider and permission definitions for the prometheus-longterm-metrics service.
+providers:
+  prometheus-longterm-metrics:
+    # below URL is only used to fill in the required location in Magpie
+    # actual auth validation is performed with Twitcher 'verify' endpoint without accessing this proxied URL
+    url: http://proxy:80
+    title: PrometheusLongtermMetrics
+    public: true
+    c4i: false
+    type: api
+    sync_type: api
+
+# Create read and write permissions on the service for the administrators and monitoring groups.
+permissions:
+  - service: prometheus-longterm-metrics
+    permission: read
+    group: administrators
+    action: create
+  - service: prometheus-longterm-metrics
+    permission: write
+    group: administrators
+    action: create
+  - service: prometheus-longterm-metrics
+    permission: read
+    group: monitoring
+    action: create
+  - service: prometheus-longterm-metrics
+    permission: write
+    group: monitoring
+    action: create
@@ -0,0 +1,7 @@
+version: "3.4"
+
+services:
+  magpie:
+    volumes:
+      # The same config.yml holds both the `providers` and `permissions` sections, so it is mounted
+      # read-only into both the permissions and the providers config directories.
+      - ./optional-components/prometheus-longterm-metrics/config/magpie/config.yml:${MAGPIE_PERMISSIONS_CONFIG_PATH}/prometheus-longterm-metrics.yml:ro
+      - ./optional-components/prometheus-longterm-metrics/config/magpie/config.yml:${MAGPIE_PROVIDERS_CONFIG_PATH}/prometheus-longterm-metrics.yml:ro
@@ -0,0 +1,18 @@
+    # Public entry point: requests are forwarded to the prometheus-longterm-metrics container
+    # only after the auth subrequest below succeeds.
+    location /prometheus-longterm-metrics {
+        auth_request /secure-prometheus-longterm-metrics-auth;
+        auth_request_set $auth_status $upstream_status;
+        proxy_pass http://prometheus-longterm-metrics:9090;
+        proxy_set_header Host $host;
+    }
+
+    # Internal-only auth subrequest: asks the Twitcher verify endpoint whether the original request is allowed.
+    location = /secure-prometheus-longterm-metrics-auth {
+        internal;
+        proxy_pass https://${BIRDHOUSE_FQDN_PUBLIC}${TWITCHER_VERIFY_PATH}/prometheus-longterm-metrics$request_uri;
+        # the verification request is sent without the original request body
+        proxy_pass_request_body off;
+        proxy_set_header Host $host;
+        proxy_set_header Content-Length "";
+        proxy_set_header X-Original-URI $request_uri;
+        proxy_set_header X-Forwarded-Proto $real_scheme;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Host $host:$server_port;
+    }
@@ -0,0 +1,6 @@
+version: "3.4"
+
+services:
+  proxy:
+    volumes:
+      # Mount this component's nginx conf directory (read-only) under the proxy's conf.extra-service.d directory.
+      - ./optional-components/prometheus-longterm-metrics/config/proxy/conf.extra-service.d:/etc/nginx/conf.extra-service.d/prometheus-longterm-metrics:ro
@@ -0,0 +1,29 @@
+export PROMETHEUS_LONGTERM_VERSION='${PROMETHEUS_VERSION:-"v2.52.0"}'
+export PROMETHEUS_LONGTERM_DOCKER='${PROMETHEUS_DOCKER:-prom/prometheus}'
+export PROMETHEUS_LONGTERM_IMAGE='${PROMETHEUS_LONGTERM_DOCKER}:${PROMETHEUS_LONGTERM_VERSION}'
+
+export PROMETHEUS_LONGTERM_RETENTION_TIME=1y
+export PROMETHEUS_LONGTERM_SCRAPE_INTERVAL=1h
+
+# These are the prometheus defaults
+export PROMETHEUS_LONGTERM_TSDB_MIN_BLOCK_DURATION=2h
+export PROMETHEUS_LONGTERM_TSDB_MAX_BLOCK_DURATION=1d12h
+
+# These are the targets that
+export PROMETHEUS_LONGTERM_TARGETS='["prometheus:9090"]' # yaml list syntax
+
+OPTIONAL_VARS="
+  $OPTIONAL_VARS
+  \$PROMETHEUS_LONGTERM_SCRAPE_INTERVAL
+  \$PROMETHEUS_LONGTERM_TARGETS
+"
+
+export DELAYED_EVAL="
+  $DELAYED_EVAL
+  PROMETHEUS_LONGTERM_VERSION
+  PROMETHEUS_LONGTERM_DOCKER
+  PROMETHEUS_LONGTERM_IMAGE
+"
+
+# Note that this component does not depend explicitly on the `components/monitoring` component so that this can
+# theoretically be deployed on a different machine than the `prometheus` service. This is currently untested.
 prometheus: 
   image: ${PROMETHEUS_IMAGE} 
   container_name: prometheus 
   volumes: 
     - ./components/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro 
     - ./components/monitoring/prometheus.rules:/etc/prometheus/prometheus.rules:ro 
     - prometheus_persistence:/prometheus:rw 
   command: 
     # restore original CMD from image 
     - --config.file=/etc/prometheus/prometheus.yml 
     - --storage.tsdb.path=/prometheus 
     - --web.console.libraries=/usr/share/prometheus/console_libraries 
     - --web.console.templates=/usr/share/prometheus/consoles 
     # https://prometheus.io/docs/prometheus/latest/storage/ 
     - --storage.tsdb.retention.time=90d 
     # wrong default was http://container-hash:9090/ 
     - --web.external-url=https://${BIRDHOUSE_FQDN_PUBLIC}/prometheus/ 
   restart: always 
 services: 
   prometheus: 
     volumes: 
       - ./optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules:/etc/prometheus/prometheus-longterm-metrics.rules:ro 
 prometheus: 
   image: ${PROMETHEUS_IMAGE} 
   container_name: prometheus 
   volumes: 
     - ./components/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro 
     - ./components/monitoring/prometheus.rules:/etc/prometheus/prometheus.rules:ro 
     - prometheus_persistence:/prometheus:rw 
   command: 
     # restore original CMD from image 
     - --config.file=/etc/prometheus/prometheus.yml 
     - --storage.tsdb.path=/prometheus 
     - --web.console.libraries=/usr/share/prometheus/console_libraries 
     - --web.console.templates=/usr/share/prometheus/consoles 
     # https://prometheus.io/docs/prometheus/latest/storage/ 
     - --storage.tsdb.retention.time=90d 
     # wrong default was http://container-hash:9090/ 
     - --web.external-url=https://${BIRDHOUSE_FQDN_PUBLIC}/prometheus/ 
   restart: always 
 services: 
   prometheus: 
     volumes: 
       - ./optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules:/etc/prometheus/prometheus-longterm-metrics.rules:ro 
@@ -0,0 +1,32 @@
+version: "3.4"
+
+# Shared logging settings: json-file driver, at most 10 files of 50m each.
+x-logging:
+  &default-logging
+  driver: "json-file"
+  options:
+    max-size: "50m"
+    max-file: "10"
+
+services:
+  # Second prometheus instance dedicated to the longterm metrics.
+  prometheus-longterm-metrics:
+    image: ${PROMETHEUS_LONGTERM_IMAGE}
+    container_name: prometheus-longterm-metrics
+    volumes:
+      - ./optional-components/prometheus-longterm-metrics/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - prometheus_longterm_persistence:/prometheus:rw
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.path=/prometheus
+      - --web.console.libraries=/usr/share/prometheus/console_libraries
+      - --web.console.templates=/usr/share/prometheus/consoles
+      - --storage.tsdb.retention.time=${PROMETHEUS_LONGTERM_RETENTION_TIME}
+      # served behind the proxy under the /prometheus-longterm-metrics/ path
+      - --web.external-url=https://${BIRDHOUSE_FQDN_PUBLIC}/prometheus-longterm-metrics/
+      - --storage.tsdb.min-block-duration=${PROMETHEUS_LONGTERM_TSDB_MIN_BLOCK_DURATION}
+      - --storage.tsdb.max-block-duration=${PROMETHEUS_LONGTERM_TSDB_MAX_BLOCK_DURATION}
+    restart: always
+    logging: *default-logging
+
+volumes:
+  # Named volume that must already exist (it is not created by docker compose).
+  # The `external: {name: ...}` mapping form is deprecated as of compose file format 3.4;
+  # `external: true` with a sibling `name` is the supported equivalent.
+  prometheus_longterm_persistence:
+    external: true
+    name: prometheus_longterm_persistence
@@ -0,0 +1,3 @@
+#!/bin/sh -x
+
+docker volume create prometheus_longterm_persistence  # metrics db
@@ -0,0 +1,17 @@
+# Configuration of the longterm prometheus instance.
+# NOTE(review): the ${...} placeholders are presumably substituted from the environment before this file is
+# used (both variables are listed in OPTIONAL_VARS) - confirm.
+global:
+  external_labels:
+    instance_name: prometheus-longterm-metrics
+
+scrape_configs:
+  # Pull a subset of metrics from the target prometheus instance(s) through their federate endpoint.
+  - job_name: 'federate'
+    scrape_interval: ${PROMETHEUS_LONGTERM_SCRAPE_INTERVAL}
+
+    honor_labels: true
+    # the source prometheus is served under the /prometheus/ path prefix
+    metrics_path: '/prometheus/federate'
+
+    params:
+      # only series carrying the label group="longterm-metrics" are collected
+      'match[]':
+        - '{group="longterm-metrics"}'
+
+    static_configs:
+      # expected to expand to a yaml list, e.g. ["prometheus:9090"]
+      - targets: ${PROMETHEUS_LONGTERM_TARGETS}
@@ -0,0 +1,6 @@
+version: "3.4"
+
+services:
+  prometheus:
+    volumes:
+      # Add the default longterm rules file (read-only) to the existing prometheus service.
+      - ./optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules:/etc/prometheus/prometheus-longterm-metrics.rules:ro
@@ -0,0 +1,15 @@
+# Default recording rules whose results are labelled `group: longterm-metrics`.
+groups:
+    - name: longterm-metrics-hourly
+      interval: 1h
+      rules:
+        # fraction of the time, over the last hour, that the CPUs were working (averaged over all CPUs)
+        # 1 means all CPUs were working all the time, 0 means they were all idle all the time
+        # This is computed as 1 minus the idle fraction: averaging the non-idle series directly would also
+        # average over the CPU modes, so the result would be divided by the number of modes and never reach 1.
+        - record: instance:cpu_load:avg_rate1h
+          expr: 1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))
+          labels:
+            group: longterm-metrics
+        # average number of bytes per second that were sent or received over the network during the last hour,
+        # summed over all series of the instance
+        - record: instance:network_bytes_transmitted:sum_rate1h
+          expr: sum by(instance) (rate(node_network_transmit_bytes_total[1h]) + rate(node_network_receive_bytes_total[1h]))
+          labels:
+            group: longterm-metrics
@@ -0,0 +1,2 @@
+config/magpie/config.yml
+config/proxy/conf.extra-service.d/monitoring.conf
@@ -0,0 +1,28 @@
+# Magpie provider and permission definitions for the thanos service.
+providers:
+  thanos:
+    # below URL is only used to fill in the required location in Magpie
+    # actual auth validation is performed with Twitcher 'verify' endpoint without accessing this proxied URL
+    url: http://proxy:80
+    title: Thanos
+    public: true
+    c4i: false
+    type: api
+    sync_type: api
+
+# Create read and write permissions on the service for the administrators and monitoring groups.
+permissions:
+  - service: thanos
+    permission: read
+    group: administrators
+    action: create
+  - service: thanos
+    permission: write
+    group: administrators
+    action: create
+  - service: thanos
+    permission: read
+    group: monitoring
+    action: create
+  - service: thanos
+    permission: write
+    group: monitoring
+    action: create
@@ -0,0 +1,7 @@
+version: "3.4"
+
+services:
+  magpie:
+    volumes:
+      # The same config.yml is mounted read-only into both the permissions and the providers config directories.
+      - ./optional-components/thanos/config/magpie/config.yml:${MAGPIE_PERMISSIONS_CONFIG_PATH}/thanos.yml:ro
+      - ./optional-components/thanos/config/magpie/config.yml:${MAGPIE_PROVIDERS_CONFIG_PATH}/thanos.yml:ro