Add dynamic backend, closest to the origin, cache-status

This is the second iteration, which:
- uses dynamic backends
- changes the healthcheck to pre-warm the backends
- places the initial instance closest to the origin
- improves on the x-cache header via cache-status

We also made VARNISH_SIZE smaller so that it fits the default instance size. We are leaving room for "sidecar" processes (log drains, purge worker, etc.) and also for testing what happens when the memory fills up.

We did this together with James A Rosen & Matt Johnson on August 2, 2024. Will add the recording when it's public.

It's a follow-up to:
- thechangelog/changelog.com#518

Signed-off-by: Gerhard Lazu <[email protected]>
thechangelog · Aug 19, 2024 · 1b28535 · 1b28535
1 parent 17d3899
commit 1b28535
Show file tree

Hide file tree

Showing 2 changed files with 77 additions and 47 deletions.
diff --git a/default.vcl b/default.vcl
@@ -1,24 +1,56 @@
 # https://varnish-cache.org/docs/7.4/reference/vcl.html#versioning
 vcl 4.1;
 
+# Import std for duration comparisons & access to env vars
 import std;
 
-# Thanks Matt Johnson! 👋
-# - https://github.com/magento/magento2/blob/03621bbcd75cbac4ffa8266a51aa2606980f4830/app/code/Magento/PageCache/etc/varnish6.vcl
-# - https://abhishekjakhotiya.medium.com/magento-internals-cache-purging-and-cache-tags-bf7772e60797
+# Import vmod_dynamic to resolve backend hosts via DNS
+import dynamic;
+
+# Disable default backend, we are using dynamic backends **only**
+# (the backend for each request is picked explicitly in vcl_recv)
+backend default none;
+
+# Force IPv6 backends **only**
+# "::0"/0 covers the whole IPv6 address space; the dynamic director is given
+# this ACL as its whitelist, so it is intended to drop any IPv4 address that
+# DNS returns.
+acl ipv6_only { "::0"/0; }
+
+# Health probe attached to the backends created by the dynamic director.
+# Varnish requests /health from a backend on a fixed schedule and uses the
+# recent results to decide whether that backend is healthy.
+probe changelog_health {
+  .url = "/health";
+  # Probe every 5s; a probe that takes longer than 2s counts as failed
+  .interval = 5s;
+  .timeout = 2s;
+  # Healthy while at least 5 of the last 10 probes succeeded
+  .window = 10;
+  .threshold = 5;
+}
 
-backend default {
-  .host = "top1.nearest.of.changelog-2024-01-12.internal";
-  .host_header = "changelog-2024-01-12.fly.dev";
-  .port = "4000";
-  .first_byte_timeout = 5s;
-  .probe = {
-    .url = "/health";
-    .timeout = 2s;
-    .interval = 5s;
-    .window = 10;
-    .threshold = 5;
+# Set up a dynamic director: backends are created at runtime from DNS lookups
+# instead of being declared statically.
+sub vcl_init {
+  # Backend properties that the director accepts are listed in:
+  # https://github.com/nigoroll/libvmod-dynamic/blob/3697d6f195fe077fe213918b7b67f5da4efdede2/src/tbl/list_prop.h
+  #
+  # - ttl: delay between DNS lookups of the backend hostname
+  # - probe: health probe applied to each backend
+  # - host_header: Host header sent with backend requests
+  # - *_timeout: connect / first byte / between bytes limits per backend
+  # - whitelist: only resolved addresses matching this ACL are used
+  new changelog = dynamic.director(
+    ttl = 10s,
+    probe = changelog_health,
+    host_header = "changelog-2024-01-12.fly.dev",
+    first_byte_timeout = 5s,
+    connect_timeout = 5s,
+    between_bytes_timeout = 30s,
+    whitelist = ipv6_only
+  );
+}
+
+
+# NOTE: vcl_recv is called at the beginning of a request, after the complete
+# request has been received and parsed. Its purpose is to decide whether or not
+# to serve the request, how to do it, and, if applicable, which backend to use.
+sub vcl_recv {
+  # https://varnish-cache.org/docs/7.4/users-guide/purging.html
+  # NOTE(review): PURGE is honoured for any client; no ACL check is visible in
+  # this sub. Confirm that access is restricted elsewhere (e.g. network level).
+  if (req.method == "PURGE") {
+    return (purge);
+  }
+
+  # Implement a Varnish health-check
+  # Answered by Varnish itself with a synthetic 204, no backend is contacted.
+  # The URL is compared exactly, so "/varnish_status?x=1" does not match.
+  if (req.method == "GET" && req.url == "/varnish_status") {
+    return(synth(204));
   }
+
+  # Every other request is handed to a backend that the `changelog` dynamic
+  # director resolves for the origin's internal hostname on port 4000.
+  set req.backend_hint = changelog.backend("changelog-2024-01-12.internal", "4000");
 }
 
 # https://varnish-cache.org/docs/7.4/users-guide/vcl-grace.html
@@ -51,43 +83,34 @@ sub vcl_backend_response {
   # 🤔 QUESTION: Should we configure beresp.keep?
 }
 
-# NOTE: vcl_recv is called at the beginning of a request, after the complete
-# request has been received and parsed. Its purpose is to decide whether or not
-# to serve the request, how to do it, and, if applicable, which backend to use.
-sub vcl_recv {
-  # https://varnish-cache.org/docs/7.4/users-guide/purging.html
-  if (req.method == "PURGE") {
-    return (purge);
-  }
-
-  # Implement a Varnish health-check
-  if (req.method == "GET" && req.url == "/varnish_status") {
-    return(synth(204));
-  }
-}
 
 # https://gist.github.com/leotsem/1246511/824cb9027a0a65d717c83e678850021dad84688d#file-default-vcl-pl
 # https://varnish-cache.org/docs/7.4/reference/vcl-var.html#obj
 sub vcl_deliver {
+  # Build one cache-status response header: start with the cache name, then
+  # append a "; "-separated field for each fact below.
+  # NOTE(review): the header name matches RFC 9211 (Cache-Status), but fields
+  # such as grace, miss, stale, hits and region are custom additions. Confirm
+  # that no consumer expects strictly RFC 9211-conformant values.
+  set resp.http.cache-status = "Edge";
+
   # What is the remaining TTL for this object?
-  set resp.http.x-ttl = obj.ttl;
+  set resp.http.cache-status = resp.http.cache-status + "; ttl=" + obj.ttl;
   # What is the max object staleness permitted?
-  set resp.http.x-grace = obj.grace;
+  set resp.http.cache-status = resp.http.cache-status + "; grace=" + obj.grace;
 
   # Did the response come from Varnish or from the backend?
   if (obj.hits > 0) {
-    set resp.http.x-cache = "HIT";
+    set resp.http.cache-status = resp.http.cache-status + "; hit";
   } else {
-    set resp.http.x-cache = "MISS";
+    set resp.http.cache-status = resp.http.cache-status + "; miss";
   }
 
   # Is this object stale?
+  # Only reported for hits (negative remaining TTL), so "stale" always follows
+  # "hit" and never "miss".
-  if (obj.ttl < std.duration(integer=0)) {
-    set resp.http.x-cache = "STALE";
+  if (obj.hits > 0 && obj.ttl < std.duration(integer=0)) {
+    set resp.http.cache-status = resp.http.cache-status + "; stale";
   }
 
   # How many times has this response been served from Varnish?
-  set resp.http.x-cache-hits = obj.hits;
+  set resp.http.cache-status = resp.http.cache-status + "; hits=" + obj.hits;
+
+  # Which region is serving this request?
+  # Read from the FLY_REGION environment variable of the Varnish process.
+  set resp.http.cache-status = resp.http.cache-status + "; region=" + std.getenv("FLY_REGION");
 }
 
 # TODOS:
@@ -97,15 +120,16 @@ sub vcl_deliver {
 #   - QUESTION: Should the app control this via Surrogate-Control? Should we remove this header?
 #   - EXPLORE: varnishstat
 #   - EXPLORE: varnishtop
-#   - EXPLORE: varnishncsa -c -F '%m %U %H %{x-cache}o %{x-cache-hits}o'
+#   - EXPLORE: varnishncsa -c -F '%m %U %H %{cache-status}o'
 # - ✅ Serve stale content on backend error
 #   - https://varnish-cache.org/docs/7.4/users-guide/vcl-grace.html#misbehaving-servers
-# - If the backend gets restarted (e.g. new deploy), backend remains sick in Varnish
+# - ✅ Expose FLY_REGION=sjc env var as a custom header
+#   - https://github.com/varnish/docker-varnish/blob/45c6204864d46dbd9e18485c91f915f89f822859/old/debian/default.vcl#L35
+# - ✅ If the backend gets restarted (e.g. new deploy), backend remains sick in Varnish
 #   - https://info.varnish-software.com/blog/two-minute-tech-tuesdays-backend-health
 #   - EXPLORE: varnishlog -g raw -i backend_health
+#   - EXPLORE: varnishadm backend.list
 # - Implement If-Modified-Since? keep
-# - Expose FLY_REGION=sjc env var as a custom header
-#   - https://varnish-cache.org/lists/pipermail/varnish-misc/2019-September/026656.html
 # - Add Feeds backend: /feed -> https://feeds.changelog.place/feed.xml
 # - Store cache on disk? A pre-requisite for static backend 
 #   - https://varnish-cache.org/docs/trunk/users-guide/storage-backends.html#file
@@ -115,3 +139,7 @@ sub vcl_deliver {
 # - Run varnishncsa as a separate process (will need a supervisor + log drain)
 #   - https://info.varnish-software.com/blog/varnish-and-json-logging
 # - How to cache purge across all varnish instances?
+#
+# LINKS:
+# - https://github.com/magento/magento2/blob/03621bbcd75cbac4ffa8266a51aa2606980f4830/app/code/Magento/PageCache/etc/varnish6.vcl
+# - https://abhishekjakhotiya.medium.com/magento-internals-cache-purging-and-cache-tags-bf7772e60797
diff --git a/fly.toml b/fly.toml
@@ -1,14 +1,16 @@
 # Full app config reference: https://fly.io/docs/reference/configuration/
-app = "cdn-2024-01-26"
-# Closest to James
-primary_region = "sjc"
-# Secondary region will be "lhr", closest to Gerhard
+app = "cdn-2024-08-02"
+# Closest to the origin
+primary_region = "iad"
 
 kill_signal = "SIGTERM"
 kill_timeout = 30
 
 [env]
-VARNISH_SIZE="500M"
+# We want a low value so that:
+# - we leave room for "sidecar" processes (log drains, purge worker, etc.)
+# - it is easier to test what happens when the memory fills up
+VARNISH_SIZE="100M"
 
 [[vm]]
 size = "shared-cpu-1x"
@@ -23,11 +25,11 @@ protocol = "tcp"
 
 [[services.http_checks]]
 grace_period = "5s"
-interval = "5s"
+interval = "10s"
 method = "get"
-path = "/varnish_status"
+# Unlike /varnish_status, /health has no special case in vcl_recv, so this
+# check is expected to travel through Varnish to the origin backend (the
+# commit message describes this as pre-warming the backends).
-path = "/varnish_status"
+path = "/health"
 protocol = "http"
-timeout = "4s"
+timeout = "5s"
 
 [[services.ports]]
 handlers = ["tls", "http"]