bird-house · mishaschwartz · Jun 4, 2024 · Jun 7, 2024 · Jun 18, 2024 · Jun 18, 2024
@@ -15,7 +15,42 @@
 [Unreleased](https://github.com/bird-house/birdhouse-deploy/tree/master) (latest)
 ------------------------------------------------------------------------------------------------------------------
 
-[//]: # (list changes here, using '-' for each new entry, remove this when items are added)
+## Changes
+
+- Add the `prometheus-longterm-metrics` and `thanos` optional components
+
+  The `prometheus-longterm-metrics` component collects longterm monitoring metrics from the original prometheus instance
+  (the one created by the ``components/monitoring`` component).
+
+  Longterm metrics are any prometheus rules that have the label ``group: longterm-metrics`` or, in other words, are
+  selectable using prometheus's ``'{group="longterm-metrics"}'`` query filter. To see which longterm metric rules are
+  added by default see the 
+  ``optional-components/prometheus-longterm-metrics/config/monitoring/prometheus.rules.template`` file.
+
+  If you do not want the default longterm-metric rules included, set the ``PROMETHEUS_LONGTERM_RULES_FILE`` variable to
+  anything other than ``True`` in your ``env.local`` file.
+
+  To configure this component:
+
+  * update the ``PROMETHEUS_LONGTERM_RETENTION_TIME`` variable to set how long the data will be kept by prometheus
+
+  Enabling the `prometheus-longterm-metrics` component creates the additional endpoint ``/prometheus-longterm-metrics``.
+
+  The `thanos` component enables better storage of longterm metrics collected by the 
+  ``optional-components/prometheus-longterm-metrics`` component. Data will be collected from the
+  ``prometheus-longterm-metrics`` and stored in an S3 object store indefinitely.
+
+  When enabling this component, please change the default values for the ``THANOS_MINIO_ROOT_USER`` and ``THANOS_MINIO_ROOT_PASSWORD``
+  by updating the ``env.local`` file. These set the login credentials for the root user that runs the 
+  [minio](https://min.io/) object store.
+
+  Enabling the `thanos` component creates the additional endpoints:
+
+  * ``/thanos-query``: a prometheus-like query interface to inspect the data stored by thanos
+  * ``/thanos-minio``: a minio web console to inspect the data stored by minio.
+
+  This also includes an update to the prometheus version from `v2.19.0` to the current latest `v2.52.0`. This is
+  required to support the interaction between prometheus and thanos.
 
 [2.4.0](https://github.com/bird-house/birdhouse-deploy/tree/2.4.0) (2024-06-04)
 ------------------------------------------------------------------------------------------------------------------

diff --git a/birdhouse/components/monitoring/default.env b/birdhouse/components/monitoring/default.env
@@ -8,7 +8,7 @@ export GRAFANA_VERSION="7.0.3"
 export GRAFANA_DOCKER=grafana/grafana
 export GRAFANA_IMAGE='${GRAFANA_DOCKER}:${GRAFANA_VERSION}'
 
-export PROMETHEUS_VERSION="v2.19.0"
+export PROMETHEUS_VERSION="v2.52.0"
 export PROMETHEUS_DOCKER=prom/prometheus
 export PROMETHEUS_IMAGE='${PROMETHEUS_DOCKER}:${PROMETHEUS_VERSION}'
 

@@ -574,6 +574,14 @@ export THREDDS_ADDITIONAL_CATALOG=""
 #export ALERTMANAGER_EXTRA_INHIBITION=""
 #export ALERTMANAGER_EXTRA_RECEIVERS=""
 
+# Below are for the prometheus-longterm-metrics optional component
+#export PROMETHEUS_LONGTERM_RETENTION_TIME=1y
+
+# Below are for the thanos optional component
+# Change these from the default for added security
+#export THANOS_MINIO_ROOT_USER="${__DEFAULT__THANOS_MINIO_ROOT_USER}"
+#export THANOS_MINIO_ROOT_PASSWORD="${__DEFAULT__THANOS_MINIO_ROOT_PASSWORD}"
+
 #############################################################################
 # Emu optional vars
 #############################################################################

@@ -443,3 +443,48 @@ How to enable X-Robots-Tag Header in ``env.local`` (a copy from `env.local.examp
 
     .. seealso::
         See the `env.local.example`_ file for more details about this ``BIRDHOUSE_PROXY_ROOT_LOCATION`` behaviour.
+
+Prometheus Long-term Metrics
+----------------------------
+
+This is a second prometheus instance that collects longterm monitoring metrics from the original prometheus instance
+(the one created by the ``components/monitoring`` component).
+
+Longterm metrics are any prometheus rules that have the label ``group: longterm-metrics`` or, in other words, are
+selectable using prometheus' ``'{group="longterm-metrics"}'`` query filter. To add some default longterm metrics rules
+also enable the ``prometheus-longterm-rules`` component.
+
+You may also choose to create your own set of rules in another component that you would like to use instead of the
+default ones.
+
+To configure this component:
+
+    * update the ``PROMETHEUS_LONGTERM_RETENTION_TIME`` variable to set how long the data will be kept by prometheus
+
+Enabling this component creates the additional endpoint ``/prometheus-longterm-metrics``.
+
+Prometheus Long-term Rules
+--------------------------
+
+This adds some default longterm metrics rules to the ``prometheus`` component for use by the
+``prometheus-longterm-metrics`` component. These rules all have the label ``group: longterm-metrics``.
+
+To see which rules are added, check out the
+``optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules`` file.
+
+Thanos
+------
+
+This enables better storage of longterm metrics collected by the ``optional-components/prometheus-longterm-metrics``
+component. Data will be collected from the ``prometheus-longterm-metrics`` and stored in an S3 object store
+indefinitely.
+
+When enabling this component, please change the default values for the ``THANOS_MINIO_ROOT_USER`` and
+``THANOS_MINIO_ROOT_PASSWORD`` by updating the ``env.local`` file. These set the login credentials for the root user
+that runs the minio_ object store.
+
+Enabling this component creates the additional endpoints:
+
+    * ``/thanos-query``: a prometheus-like query interface to inspect the data stored by thanos
+    * ``/thanos-minio``: a minio_ web console to inspect the data stored by minio_.
+
+.. _minio: https://min.io/
@@ -0,0 +1,3 @@
+prometheus.yml
+config/magpie/config.yml
+config/proxy/conf.extra-service.d/monitoring.conf
@@ -0,0 +1,28 @@
+providers:
+  prometheus-longterm-metrics:
+    # below URL is only used to fill in the required location in Magpie
+    # actual auth validation is performed with Twitcher 'verify' endpoint without accessing this proxied URL
+    url: http://proxy:80
+    title: PrometheusLongtermMetrics
+    public: true
+    c4i: false
+    type: api
+    sync_type: api
+
+permissions:
+  - service: prometheus-longterm-metrics
+    permission: read
+    group: administrators
+    action: create
+  - service: prometheus-longterm-metrics
+    permission: write
+    group: administrators
+    action: create
+  - service: prometheus-longterm-metrics
+    permission: read
+    group: monitoring
+    action: create
+  - service: prometheus-longterm-metrics
+    permission: write
+    group: monitoring
+    action: create
@@ -0,0 +1,7 @@
+version: "3.4"
+
+services:
+  magpie:
+    volumes:
+      - ./optional-components/prometheus-longterm-metrics/config/magpie/config.yml:${MAGPIE_PERMISSIONS_CONFIG_PATH}/prometheus-longterm-metrics.yml:ro
+      - ./optional-components/prometheus-longterm-metrics/config/magpie/config.yml:${MAGPIE_PROVIDERS_CONFIG_PATH}/prometheus-longterm-metrics.yml:ro
@@ -0,0 +1,18 @@
+    location /prometheus-longterm-metrics {
+        auth_request /secure-prometheus-longterm-metrics-auth;
+        auth_request_set $auth_status $upstream_status;
+        proxy_pass http://prometheus-longterm-metrics:9090;
+        proxy_set_header Host $host;
+    }
+
+    location = /secure-prometheus-longterm-metrics-auth {
+        internal;
+        proxy_pass https://${BIRDHOUSE_FQDN_PUBLIC}${TWITCHER_VERIFY_PATH}/prometheus-longterm-metrics$request_uri;
+        proxy_pass_request_body off;
+        proxy_set_header Host $host;
+        proxy_set_header Content-Length "";
+        proxy_set_header X-Original-URI $request_uri;
+        proxy_set_header X-Forwarded-Proto $real_scheme;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Host $host:$server_port;
+    }
@@ -0,0 +1,6 @@
+version: "3.4"
+
+services:
+  proxy:
+    volumes:
+      - ./optional-components/prometheus-longterm-metrics/config/proxy/conf.extra-service.d:/etc/nginx/conf.extra-service.d/prometheus-longterm-metrics:ro
@@ -0,0 +1,30 @@
+export PROMETHEUS_LONGTERM_VERSION='${PROMETHEUS_VERSION:-"v2.52.0"}'
+export PROMETHEUS_LONGTERM_DOCKER='${PROMETHEUS_DOCKER:-prom/prometheus}'
+export PROMETHEUS_LONGTERM_IMAGE='${PROMETHEUS_LONGTERM_DOCKER}:${PROMETHEUS_LONGTERM_VERSION}'
+
+export PROMETHEUS_LONGTERM_RETENTION_TIME=1y
+export PROMETHEUS_LONGTERM_SCRAPE_INTERVAL=1h
+
+# These are the prometheus defaults
+export PROMETHEUS_LONGTERM_TSDB_MIN_BLOCK_DURATION=2h
+export PROMETHEUS_LONGTERM_TSDB_MAX_BLOCK_DURATION=1d12h
+
+# These are the targets that the longterm prometheus instance collects (federates) metrics from
+export PROMETHEUS_LONGTERM_TARGETS='["prometheus:9090"]' # yaml list syntax
+
+OPTIONAL_VARS="
+  $OPTIONAL_VARS
+  \$PROMETHEUS_LONGTERM_SCRAPE_INTERVAL
+  \$PROMETHEUS_LONGTERM_TARGETS
+"
+
+export DELAYED_EVAL="
+  $DELAYED_EVAL
+  PROMETHEUS_LONGTERM_VERSION
+  PROMETHEUS_LONGTERM_DOCKER
+  PROMETHEUS_LONGTERM_IMAGE
+  PROMETHEUS_LONGTERM_RULES_FILE
+"
+
+# Note that this component does not depend explicitly on the `components/monitoring` component so that this can
+# theoretically be deployed on a different machine than the `prometheus` service. This is currently untested.
 prometheus: 
   image: ${PROMETHEUS_IMAGE} 
   container_name: prometheus 
   volumes: 
     - ./components/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro 
     - ./components/monitoring/prometheus.rules:/etc/prometheus/prometheus.rules:ro 
     - prometheus_persistence:/prometheus:rw 
   command: 
     # restore original CMD from image 
     - --config.file=/etc/prometheus/prometheus.yml 
     - --storage.tsdb.path=/prometheus 
     - --web.console.libraries=/usr/share/prometheus/console_libraries 
     - --web.console.templates=/usr/share/prometheus/consoles 
     # https://prometheus.io/docs/prometheus/latest/storage/ 
     - --storage.tsdb.retention.time=90d 
     # wrong default was http://container-hash:9090/ 
     - --web.external-url=https://${BIRDHOUSE_FQDN_PUBLIC}/prometheus/ 
   restart: always 
 services: 
   prometheus: 
     volumes: 
       - ./optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules:/etc/prometheus/prometheus-longterm-metrics.rules:ro 
 prometheus: 
   image: ${PROMETHEUS_IMAGE} 
   container_name: prometheus 
   volumes: 
     - ./components/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro 
     - ./components/monitoring/prometheus.rules:/etc/prometheus/prometheus.rules:ro 
     - prometheus_persistence:/prometheus:rw 
   command: 
     # restore original CMD from image 
     - --config.file=/etc/prometheus/prometheus.yml 
     - --storage.tsdb.path=/prometheus 
     - --web.console.libraries=/usr/share/prometheus/console_libraries 
     - --web.console.templates=/usr/share/prometheus/consoles 
     # https://prometheus.io/docs/prometheus/latest/storage/ 
     - --storage.tsdb.retention.time=90d 
     # wrong default was http://container-hash:9090/ 
     - --web.external-url=https://${BIRDHOUSE_FQDN_PUBLIC}/prometheus/ 
   restart: always 
 services: 
   prometheus: 
     volumes: 
       - ./optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules:/etc/prometheus/prometheus-longterm-metrics.rules:ro 
@@ -0,0 +1,32 @@
+version: "3.4"
+
+x-logging:
+  &default-logging
+  driver: "json-file"
+  options:
+    max-size: "50m"
+    max-file: "10"
+
+services:
+  prometheus-longterm-metrics:
+    image: ${PROMETHEUS_LONGTERM_IMAGE}
+    container_name: prometheus-longterm-metrics
+    volumes:
+      - ./optional-components/prometheus-longterm-metrics/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - prometheus_longterm_persistence:/prometheus:rw
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.path=/prometheus
+      - --web.console.libraries=/usr/share/prometheus/console_libraries
+      - --web.console.templates=/usr/share/prometheus/consoles
+      - --storage.tsdb.retention.time=${PROMETHEUS_LONGTERM_RETENTION_TIME}
+      - --web.external-url=https://${BIRDHOUSE_FQDN_PUBLIC}/prometheus-longterm-metrics/
+      - --storage.tsdb.min-block-duration=${PROMETHEUS_LONGTERM_TSDB_MIN_BLOCK_DURATION}
+      - --storage.tsdb.max-block-duration=${PROMETHEUS_LONGTERM_TSDB_MAX_BLOCK_DURATION}
+    restart: always
+    logging: *default-logging
+
+volumes:
+  prometheus_longterm_persistence:
+    external:
+      name: prometheus_longterm_persistence
@@ -0,0 +1,3 @@
+#!/bin/sh -x
+
+docker volume create prometheus_longterm_persistence  # metrics db
@@ -0,0 +1,17 @@
+global:
+  external_labels:
+    instance_name: prometheus-longterm-metrics
+
+scrape_configs:
+  - job_name: 'federate'
+    scrape_interval: ${PROMETHEUS_LONGTERM_SCRAPE_INTERVAL}
+
+    honor_labels: true
+    metrics_path: '/prometheus/federate'
+
+    params:
+      'match[]':
+        - '{group="longterm-metrics"}'
+
+    static_configs:
+      - targets: ${PROMETHEUS_LONGTERM_TARGETS}
@@ -0,0 +1,6 @@
+version: "3.4"
+
+services:
+  prometheus:
+    volumes:
+      - ./optional-components/prometheus-longterm-rules/config/monitoring/prometheus.rules:/etc/prometheus/prometheus-longterm-metrics.rules:ro
@@ -0,0 +1,15 @@
+groups:
+    - name: longterm-metrics-hourly
+      interval: 1h
+      rules:
+        # fraction of the time, over the last hour, that the CPUs were working (averaged over all CPUs of an instance)
+        # 1 means all CPUs were working all the time, 0 means they were all idle all the time
+        # The per-mode rates are summed for each CPU first: averaging directly over every (cpu, mode) series would
+        # divide the result by the number of non-idle modes, so the value could never reach 1.
+        - record: instance:cpu_load:avg_rate1h
+          expr: avg by(instance) (sum by(instance, cpu) (rate(node_cpu_seconds_total{mode!="idle"}[1h])))
+          labels:
+            group: longterm-metrics
+        # average number of bytes per second that were sent or received over the network during the last hour
+        # (rate() yields a per-second value; the result is summed over all matching series of an instance)
+        - record: instance:network_bytes_transmitted:sum_rate1h
+          expr: sum by(instance) (rate(node_network_transmit_bytes_total[1h]) + rate(node_network_receive_bytes_total[1h]))
+          labels:
+            group: longterm-metrics
@@ -0,0 +1,2 @@
+config/magpie/config.yml
+config/proxy/conf.extra-service.d/monitoring.conf
@@ -0,0 +1,28 @@
+providers:
+  thanos:
+    # below URL is only used to fill in the required location in Magpie
+    # actual auth validation is performed with Twitcher 'verify' endpoint without accessing this proxied URL
+    url: http://proxy:80
+    title: Thanos
+    public: true
+    c4i: false
+    type: api
+    sync_type: api
+
+permissions:
+  - service: thanos
+    permission: read
+    group: administrators
+    action: create
+  - service: thanos
+    permission: write
+    group: administrators
+    action: create
+  - service: thanos
+    permission: read
+    group: monitoring
+    action: create
+  - service: thanos
+    permission: write
+    group: monitoring
+    action: create
@@ -0,0 +1,7 @@
+version: "3.4"
+
+services:
+  magpie:
+    volumes:
+      - ./optional-components/thanos/config/magpie/config.yml:${MAGPIE_PERMISSIONS_CONFIG_PATH}/thanos.yml:ro
+      - ./optional-components/thanos/config/magpie/config.yml:${MAGPIE_PROVIDERS_CONFIG_PATH}/thanos.yml:ro
@@ -0,0 +1,38 @@
+    location /thanos-query {
+        auth_request /secure-thanos-auth;
+        auth_request_set $auth_status $upstream_status;
+        proxy_pass http://thanos-query:19192;
+        proxy_set_header Host $host;
+    }
+
+    # Proxy for the minio web console served under /thanos-minio/.
+    # Every request is first checked by the /secure-thanos-auth subrequest.
+    location /thanos-minio/ {
+        auth_request /secure-thanos-auth;
+        auth_request_set $auth_status $upstream_status;
+
+        # strip the /thanos-minio/ prefix before the request is passed to the upstream
+        rewrite   ^/thanos-minio/(.*) /$1 break;
+        proxy_pass http://thanos-minio:9001;
+
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+
+        # This allows WebSocket connections
+        # (each header is set exactly once so that the upstream does not receive duplicated Upgrade/Connection headers)
+        proxy_http_version 1.1;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection "upgrade";
+    }
+
+    location = /secure-thanos-auth {
+        internal;
+        proxy_pass https://${BIRDHOUSE_FQDN_PUBLIC}${TWITCHER_VERIFY_PATH}/thanos$request_uri;
+        proxy_pass_request_body off;
+        proxy_set_header Host $host;
+        proxy_set_header Content-Length "";
+        proxy_set_header X-Original-URI $request_uri;
+        proxy_set_header X-Forwarded-Proto $real_scheme;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Host $host:$server_port;
+    }
@@ -0,0 +1,6 @@
+version: "3.4"
+
+services:
+  proxy:
+    volumes:
+      - ./optional-components/thanos/config/proxy/conf.extra-service.d:/etc/nginx/conf.extra-service.d/thanos:ro