diff --git a/.github/ISSUE_TEMPLATE/bug-template.md b/.github/ISSUE_TEMPLATE/bug-template.md new file mode 100644 index 000000000..cd6c7d48c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-template.md @@ -0,0 +1,46 @@ +--- +name: Bug template +about: Template that will help you to submit a visible and actionable bug report. +title: 'Bug: ' +labels: t/bug +assignees: '' + +--- + +## Environment + + + +## Steps to reproduce + + + +## Expected result + + + +## Actual result + + + +## Other logs, links + +- ... diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md new file mode 100644 index 000000000..5f666b0a7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -0,0 +1,51 @@ +--- +name: Epic Template +about: A set of related tasks contributing towards specific outcome, comprising of + more than 1 week of work. +title: 'Epic: ' +labels: t/Epic +assignees: '' + +--- + +## Motivation + + + +## DoD + + + + +## Implementation ideas + + + +TODO + + +## Tasks + +```[tasklist] +- [ ] ... +- [ ] List tasks as they're created for this Epic +``` + + +## Other related tasks, Epics, and links + +- diff --git a/.github/ISSUE_TEMPLATE/feature-request-template.md b/.github/ISSUE_TEMPLATE/feature-request-template.md new file mode 100644 index 000000000..5b4142de7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request-template.md @@ -0,0 +1,37 @@ +--- +name: Feature request template +about: Template that will help you to submit a visible and actionable feature request. +title: 'Feature: ' +labels: t/feature +assignees: '' + +--- + +## Problem description / Motivation + + + +## Feature idea(s) / DoD + + + +## Implementation ideas + + diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index 71130df80..52417781c 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -8,6 +8,7 @@ on: jobs: e2e-tests: strategy: + fail-fast: false matrix: cluster: - k3d @@ -64,8 +65,11 @@ jobs: - run: make ${{ matrix.cluster }}-setup - run: make deploy + timeout-minutes: 10 - run: make example-vms + timeout-minutes: 10 - run: make e2e + timeout-minutes: 15 - name: Get k8s logs and events if: always() diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index b9e07ab15..b6f4fd6f8 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -7,7 +7,8 @@ on: env: AGENT_IMAGE: "neondatabase/autoscaler-agent" SCHED_IMAGE: "neondatabase/autoscale-scheduler" - INFORMANT_IMAGE: "neondatabase/vm-informant" + MONITOR_IMAGE: "neondatabase/vm-monitor" + VM_MONITOR_BRANCH: "main" KUSTOMIZE_VERSION: "4.5.7" CONTROLLER_TOOLS_VERSION: "0.10.0" @@ -50,7 +51,10 @@ jobs: run: go build ./... - name: build binaries - run: make VM_INFORMANT_IMG=${{ env.INFORMANT_IMAGE }}:${{ steps.get_vcs_info.outputs.version }} build + run: | + make \ + VM_MONITOR_IMG=${{ env.MONITOR_IMAGE }}:${{ steps.get_vcs_info.outputs.version }} \ + build - name: docker - install qemu uses: docker/setup-qemu-action@v2 @@ -119,16 +123,17 @@ jobs: build-args: | GIT_INFO=${{ steps.get_vcs_info.outputs.git_info }} - - name: build and push vm-informant image + - name: build and push vm-monitor image uses: docker/build-push-action@v3 with: context: . 
platforms: linux/amd64 push: true - file: build/vm-informant/Dockerfile - tags: ${{ env.INFORMANT_IMAGE }}:${{ steps.get_vcs_info.outputs.version }} + file: build/vm-monitor/Dockerfile + tags: ${{ env.MONITOR_IMAGE }}:${{ steps.get_vcs_info.outputs.version }} build-args: | GIT_INFO=${{ steps.get_vcs_info.outputs.git_info }} + BRANCH=${{ env.VM_MONITOR_BRANCH }} - name: render kubernetes resources uses: stefanprodan/kube-tools@v1 @@ -149,12 +154,12 @@ jobs: kustomize build deploy/scheduler > rendered_manifests/autoscale-scheduler.yaml kustomize build deploy/agent > rendered_manifests/autoscaler-agent.yaml - # Because we want a docker image for the VM informant, the easiest way for us to also provide + # Because we want a docker image for the VM monitor, the easiest way for us to also provide # a binary is by just extracting it from the container image itself. - - name: extract vm-informant binary + - name: extract vm-monitor binary run: | - ID=$(docker create ${{ env.INFORMANT_IMAGE }}:${{ steps.get_vcs_info.outputs.version }}) - docker cp $ID:/usr/bin/vm-informant bin/vm-informant + ID=$(docker create ${{ env.MONITOR_IMAGE }}:${{ steps.get_vcs_info.outputs.version }}) + docker cp $ID:/usr/bin/vm-monitor bin/vm-monitor docker rm -f $ID - name: build and push cluster-autoscaler image @@ -173,10 +178,11 @@ jobs: files: | bin/vm-builder bin/vm-builder-generic - bin/vm-informant + bin/vm-monitor rendered_manifests/autoscale-scheduler.yaml rendered_manifests/autoscaler-agent.yaml rendered_manifests/neonvm.yaml rendered_manifests/multus.yaml rendered_manifests/multus-eks.yaml rendered_manifests/whereabouts.yaml + deploy/vmscrape.yaml diff --git a/.golangci.yml b/.golangci.yml index 40fa640ae..523d877c4 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -9,7 +9,7 @@ run: issues: exclude: # ChanMutex contains only a channel, which *is* safe to copy - - 'copylocks: return copies lock value: github\.com/neondatabase/autoscaling/pkg/util\.ChanMutex' + - 'copylocks: .* copies lock value.*: github\.com/neondatabase/autoscaling/pkg/util\.ChanMutex' output: format: colored-line-number @@ -54,8 +54,8 @@ linters-settings: exclude: - '^net/http\.(Client|Server)' - '^net\.TCPAddr$' - # metav1.{CreateOptions,GetOptions,ListOptions,WatchOptions,PatchOptions} - - '^k8s\.io/apimachinery/pkg/apis/meta/v1\.(Create|Get|List|Watch|Patch)Options$' + # metav1.{CreateOptions,GetOptions,ListOptions,WatchOptions,PatchOptions,DeleteOptions} + - '^k8s\.io/apimachinery/pkg/apis/meta/v1\.(Create|Get|List|Watch|Patch|Delete)Options$' - '^k8s\.io/apimachinery/pkg/apis/meta/v1\.ObjectMeta$' - '^k8s\.io/apimachinery/pkg/api/resource\.Quantity$' - '^github.com/prometheus/client_golang/prometheus(/.*)?\.\w+Opts$' diff --git a/ARCHITECTURE-network-diagram.org b/ARCHITECTURE-network-diagram.org index 5f836c820..9c565a8a8 100644 --- a/ARCHITECTURE-network-diagram.org +++ b/ARCHITECTURE-network-diagram.org @@ -32,29 +32,29 @@ awk '/#\+BEGIN_SRC/{flag=1;next}/#\+END_SRC/{flag=0}flag' ARCHITECTURE-network-d | +---------| autoscaler agent | | | | | | (one per K8s node) | - | +-----------------*--+ - | | | ^ random port - | | | | (per VM) - | | | | -+=================|==================================|===========|===|======+ -: K8s pod | | | | : -: QMP | | | | : -: 20183 V | | | : -: +---------------*----------------------------------|-----------|---|---+ : -: | | | | | : -: | QEMU process | | | | : -: | | | | | : -: | | | | | : -: | compute_ctl postgres metrics | informant | | | : -: | mgmt API postgres prometheus | informant | | | : -: | 
3080 5432 9100 V 10301 V | | : + | +--------------------+ + | | | + | | | + | | | ++=================|==================================|===========|==========+ +: K8s pod | | | : +: QMP | | | : +: 20183 V | | : +: +---------------*----------------------------------|-----------|-------+ : +: | | | | : +: | QEMU process | | | : +: | | | | : +: | | | | : +: | compute_ctl postgres metrics | monitor | | : +: | mgmt API postgres prometheus | websocket | | : +: | 3080 5432 9100 V 10301 V | : : +------------------------*-----------*-------------*-----------*-------+ : : | VM | : : | | : : | Inside the VM runs: | : : | - compute_ctl (listens on port 3080) | : +: | - VM monitor (port 10301 via websocket) | : : | - Postgres (port 5432) | : -: | - VM informant (port 10301) | : : | - vector (metrics on port 9100) | : : | | : : +----------------------------------------------------------------------+ : diff --git a/ARCHITECTURE-network-diagram.png b/ARCHITECTURE-network-diagram.png index 81c367cc9..e58c91c98 100644 Binary files a/ARCHITECTURE-network-diagram.png and b/ARCHITECTURE-network-diagram.png differ diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 44ab79110..337c5a2e5 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -18,7 +18,7 @@ This document should be up-to-date. If it isn't, that's a mistake (open an issue * [Agent-Scheduler protocol steps](#agent-scheduler-protocol-steps) * [Node pressure and watermarks](#node-pressure-and-watermarks) * [High-level consequences of the Agent-Scheduler protocol](#high-level-consequences-of-the-agent-scheduler-protocol) -* [Agent-Informant protocol details](#agent-informant-protocol-details) +* [Agent-Monitor protocol details](#agent-monitor-protocol-details) * [Footguns](#footguns) ## See also @@ -27,32 +27,38 @@ This isn't the only architecture document. You may also want to look at: * [`pkg/plugin/ARCHITECTURE.md`](pkg/plugin/ARCHITECTURE.md) — detail on the implementation of the scheduler plugin +* [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor) - +where the (VM) monitor, an autoscaling component that manages a Postgres, lives. ## High-level overview -At a high level, this repository provides three components: +At a high level, this repository provides two components: 1. A modified Kubernetes scheduler (using the [plugin interface]) — known as "the (scheduler) - plugin", `AutoscaleEnforcer`, `autscale-scheduler` + plugin", `AutoscaleEnforcer`, `autoscale-scheduler` 2. A daemonset responsible for making VM scaling decisions & checking with interested parties — known as `autoscaler-agent` or simply `agent` -3. A binary running inside of the VM to (a) provide metrics to the `autoscaler-agent`, (b) validate - that downscaling is ok, and (c) request immediate upscaling due to sharp changes in demand — - known as "the (VM) informant" + +A third component, a binary running inside of the VM to (a) handle being upscaled +(b) validate that downscaling is ok, and (c) request immediate upscaling due to sharp changes in demand +— known as "the (VM) monitor", lives in +[`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor) [plugin interface]: https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/ The scheduler plugin is responsible for handling resource requests from the `autoscaler-agent`, capping increases so that node resources aren't overcommitted. 
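The capping rule can be illustrated with a short sketch. This is not the plugin's real code or types (those live in `pkg/plugin`); it only shows the invariant being described: a permit never exceeds the node's remaining headroom.

```go
// Illustrative only — hypothetical types, not the scheduler plugin's API.
package main

import "fmt"

// resources is a simplified pair of the quantities the plugin reasons about.
type resources struct {
	VCPU int // vCPUs
	Mem  int // memory slots
}

// capGrant grants at most the requested increase, reduced so that the node's
// reserved total never exceeds its capacity.
func capGrant(requestedIncrease, reserved, capacity resources) resources {
	grant := requestedIncrease
	if avail := capacity.VCPU - reserved.VCPU; grant.VCPU > avail {
		grant.VCPU = avail
	}
	if avail := capacity.Mem - reserved.Mem; grant.Mem > avail {
		grant.Mem = avail
	}
	return grant
}

func main() {
	fmt.Println(capGrant(
		resources{VCPU: 4, Mem: 16},  // agent asks to grow by this much
		resources{VCPU: 14, Mem: 56}, // already reserved on the node
		resources{VCPU: 16, Mem: 64}, // node capacity
	)) // prints {2 8}: the increase is capped to the remaining headroom
}
```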
-The `autoscaler-agent` periodically reads from a metrics source in the VM (defined by the -_informant_) and makes scaling decisions about the _desired_ resource allocation. It then -requests these resources from the scheduler plugin, and submits a patch request for its NeonVM to -update the resources. +The `autoscaler-agent` periodically reads from a metrics source in the VM (currently vector's +`node_exporter`-like functionality) and makes scaling decisions about the _desired_ resource +allocation. It then requests these resources from the scheduler plugin, and submits a patch request +for its NeonVM to update the resources. -The VM informant provides is responsible for handling all of the functionality inside the VM that -the `autoscaler-agent` cannot. It provides metrics (or: informs the agent where it can find those) -and approves attempts to downscale resource usage (or: rejects them, if they're still in use). +The VM monitor is responsible for handling all of the resource management functionality inside +the VM that the `autoscaler-agent` cannot. This constitutes handling upscales (eg. increasing Postgres +file cache size), approving attempts to downscale resource usage (or: rejecting them, if those +resources are still in use), and requesting upscale when memory usage increases too rapidly for +metrics to catch. NeonVM is able to live-scale the resources given to a VM (i.e. CPU and memory _slots_) by handling patches to the Kubernetes VM object, which requires connecting to QEMU running on the outer @@ -66,15 +72,15 @@ discussed more in the [high-level consequences] section below. ## Network connections between components -![Diagram of network connections between the components listed above, in addition to the kubernetes API and Neon compute node. Directed arrows indicate which component initiates each TCP connection](ARCHITECTURE-network-diagram.png) +![Diagram of network connections between the components listed above, in addition to the Kubernetes API and Neon compute node. Directed arrows indicate which component initiates each TCP connection](ARCHITECTURE-network-diagram.png) [Diagram source](ARCHITECTURE-network-diagram.org) ## Repository structure * `build/` — scripts for building the scheduler (`autoscale-scheduler`) and `autoscaler-agent` * `cluster-autoscaler/` — patch and Dockerfile for building a NeonVM-compatible [cluster-autoscaler] -* `cmd/` — entrypoints for the `autoscaler-agent`, VM informant, and scheduler plugin. Very little - functionality implemented here. (See: `pkg/agent`, `pkg/informant`, and `pkg/plugin`) +* `cmd/` — entrypoints for the `autoscaler-agent` and scheduler plugin. Very little + functionality implemented here. (See: `pkg/agent` and `pkg/plugin`) * `deploy/` — YAML files used during cluster init. Of these, only the following two are manually written: * `deploy/autoscaler-agent.yaml` @@ -90,7 +96,6 @@ discussed more in the [high-level consequences] section below. independently used by multiple components. * `pkg/billing/` — consumption metrics API, primarily used in [`pkg/agent/billing.go`](pkg/agent/billing.go) - * `pkg/informant/` — implementation of the VM informant * `pkg/plugin/` — implementation of the scheduler plugin * `pkg/util/` — miscellaneous utilities that are too general to be included in `agent` or `plugin`. @@ -98,7 +103,7 @@ discussed more in the [high-level consequences] section below. 
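As a rough illustration of that goal-seeking (the real logic lives in `pkg/agent`, and the names below are made up), the desired allocation can be thought of as the smallest number of compute units that keeps the load average and memory usage under the configured fraction targets (`loadAverageFractionTarget` and `memoryUsageFractionTarget` in the agent config):

```go
// Rough sketch only — not the agent's actual algorithm or types.
package main

import (
	"fmt"
	"math"
)

type computeUnit struct {
	VCPU float64 // e.g. 0.25 vCPU per unit
	MemB float64 // bytes per unit, e.g. 1 GiB
}

// desiredUnits picks the smallest whole number of compute units that keeps
// both the 1-minute load average and memory usage under their targets.
func desiredUnits(load1, memUsedB float64, cu computeUnit, loadTarget, memTarget float64) int {
	byCPU := math.Ceil(load1 / (loadTarget * cu.VCPU))
	byMem := math.Ceil(memUsedB / (memTarget * cu.MemB))
	return int(math.Max(1, math.Max(byCPU, byMem)))
}

func main() {
	cu := computeUnit{VCPU: 0.25, MemB: 1 << 30}
	// load average 1.2 and 2.5 GiB in use, with the example config's targets
	fmt.Println(desiredUnits(1.2, 2.5*float64(1<<30), cu, 0.9, 0.75)) // 6
}
```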
* `scripts/patch-*.json` — patches for testing live-updating of a VM or config * `scripts/replace-scheduler.sh` — replaces the currently running scheduler, for quick redeploy * `scripts/repeat-delete-scheduler.sh` — repeatedly deletes the scheduler (which will be - recreated by the depoyment). For debugging. + recreated by the deployment). For debugging. * `scripts/run-bench.sh` — starts a CPU-intensive pgbench connected to a VM. Useful to watch the TPS and get confirmation that autoscaled CPUs are being used. * `scripts/scheduler-logs.sh` — convenience script to tail the scheduler's logs @@ -111,7 +116,7 @@ discussed more in the [high-level consequences] section below. * `vm-examples/` — collection of VMs: * `pg14-disk-test/` — VM with Postgres 14 and and ssh access * Refer to [`vm-examples/pg14-disk-test/README.md`](./vm-examples/pg14-disk-test) for more information. - * `postgres-minimal/` — Minimal postgres 15 VM used in e2e tests + * `postgres-minimal/` — Minimal Postgres 15 VM used in e2e tests [cluster-autoscaler]: https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler @@ -132,7 +137,7 @@ on each node, the scheduler can prevent ### Agent-Scheduler protocol steps -1. On startup (for a particular VM), the `autoscaler-agent` [connects to the VM informant] and +1. On startup (for a particular VM), the `autoscaler-agent` [connects to the VM monitor] and fetches some initial metrics. 2. After successfully receiving a response, the autoscaler-agent sends an `AgentRequest` with the metrics and current resource allocation (i.e. it does not request any scaling). @@ -159,7 +164,7 @@ on each node, the scheduler can prevent that scales those resources up. * This has the same connection flow as the earlier patch request. -[connects to the VM informant]: #agent-informant-protocol-details +[connects to the VM monitor]: #agent-monitor-protocol-details ### Node pressure and watermarks @@ -192,72 +197,48 @@ than the amount of pressure already accounted for. cause the scheduler to return `Permit`s that aren't a clean multiple of a compute unit. (e.g., nodes have mismatched memory vs CPU, or external pods / system reserved are mismatched) -## Agent-Informant protocol details - -A brief note before we get started: There are a lot of specific difficulties around making sure that -the informant is always talking to _some_ agent — ideally the most recent one. While _in theory_ -there should only ever be one, we might have `n=0` or `n>1` during rollouts of new versions. Our -process for handling this is not discussed here — this section only covers the communciations -between a single agent and informant. - -The relevant types for the agent-informant protocol are all in [`pkg/api/types.go`]. If using this -as a reference, it may be helpful to have that file open at the same time. - -[`pkg/api/types.go`]: ./pkg/api/types.go - -It may also be worth noting that this protocol is versioned. For an overview of version -compatibility and how it relates to releases of this repository, refer to -[`pkg/api/VERSIONING.md`](./pkg/api/VERSIONING.md). - -The protocol is as follows: - -1. On startup, the VM informant starts an HTTP server listening on `0.0.0.0:10301`. -2. On startup for this VM, the `autoscaler-agent` starts an HTTP server listening _some_ port -3. The agent sends an `AgentDesc` to the informant as a POST request on the `/register` endpoint. - Before responding: - 1. 
If the informant has already registered an agent with the same `AgentDesc.AgentID`, it - immediately responds with HTTP code 409. - 2. If the informant's protocol version doesn't match the `AgentDesc`'s min/max protocol - versions, it immediately responds with HTTP code 400. - 3. Using the provided `ServerAddr` from the agent's `AgentDesc`, the informant makes a GET - request on the agent's `/id` endpoint - 4. The agent responds immediately to the `/id` request with an `AgentMessage[AgentIdentification]`. - 5. If the agent's `AgentIdentification.AgentID` doesn't match the original `AgentDesc.AgentID`, - the informant responds with HTTP code 400. - 6. Otherwise, the informant responds with HTTP code 200, returning an `InformantDesc` describing - its capabilities and which protocol version to use. -4. Begin "normal operation". During this, there are a few types of requests made between the agent - and informant. Each party can make **only one request at a time**. The agent starts in the - "suspended" state. - 1. The informant's `/health-check` endpoint (via PUT), with `AgentIdentification`. This allows - the autoscaler-agent to check that the informant is up and running, and that it still - recognizes the agent. - 2. The informant's `/downscale` endpoint (via PUT), with `AgentResourceMessage`. This serves as the - agent _politely asking_ the informant to decrease resource usage to the specified amount. - The informant returns a `DownscaleResult` indicating whether it was able to downscale (it may - not, if e.g. memory usage is too high). - 3. The informant's `/upscale` endpoint (via PUT), with `AgentResourceMessage`. This serves as the agent - _notifying_ the informant that its resources have increased to the provided amount. - 4. The agent's `/suspend` endpoint (via POST), with `SuspendAgent`. This allows the informant to - inform the agent that it is no longer in use for the VM. While suspended, the agent **must - not** make any `downscale` or `upscale` requests. The informant **must not** double-suspend - an agent. - 5. The agent's `/resume` endpoint (via POST), with `ResumeAgent`. This allows the informant to - pick up communication with an agent that was previously suspended. The informant **must not** - double-resume an agent. - 6. The agent's `/id` endpoint (via GET) is also available during normal operation, and is used - as a health check by the informant. - 7. The agent's `/try-upscale` endpoint (via POST), with `MoreResources`. This allows the - informant to request more of a particular resource (e.g. memory). The agent MUST respond - immediately with an `AgentIdentification`. It MAY later send an `/upscale` request to the - informant once the requested increase in resources has been achieved. -5. If explicitly cut off, communication ends with the agent sending the original `AgentDesc` as a - DELETE request on the `/unregister` endpoint. The informant returns an `UnregisterAgent`. - -Broadly, agent<->informant connections are not expected to survive restarts of the informant (due -to failure, or otherwise). So, it is expected that *sometimes*, the informant will receive a request -for an agent that it has no connection to. When that happens, the informant MUST respond with HTTP -code 404, and the agent SHOULD try reconnecting. +## Agent-Monitor protocol details + +Agent-Monitor communication is carried out through a relatively simple _versioned_ protocol +over websocket. One party sends a message, the other responds. 
There are various +message types that each party sends, and all messages are annotated with an ID. +The allows a sender to recognize responses to its previous messages. If the +out-message has ID X, then the return message will also have ID X. + +Like the other protocols, relevant types are located in [`pkg/api/types.go`]. + +1. On startup, the VM monitor listens for websocket connections on `127.0.0.1:10369` +2. On startup, the agent connects to the monitor via websocket on `127.0.0.1:10369/monitor` +3. The agent then sends a `VersionRange[MonitorProtocolVersion]` with the range of + protocols it supports. +4. The monitor responds with the highest common version between the two. If there is no + compatible protocol, it returns an error. +5. From this point on, either party may initiate a transaction by sending a Message. +6. The other party responds with the appropriate message, with the same ID attached + so that the receiver knows it has received a response. + +Currently, the following interactions are supported: +``` +Monitor sends UpscaleRequest +Agent returns NotifyUpscale + +Agent sends TryDownscale +Monitor returns DownscaleResult + +Agent sends NotifyUpscale +Monitor returns UpscaleConfirmation + +Agent sends HealthCheck +Monitor returns HealthCheck +``` + +*Healthchecks*: the agent initiates a health check every 5 seconds. The monitor +simply returns with an ack. + +There are two additional messages types that either party may send: +- `InvalidMessage`: sent when either party fails to deserialize a message it received +- `InternalError`: used to indicate that an error occured while processing a request, + for example, if the monitor errors while trying to downscale ## Footguns diff --git a/LOGGING.md b/LOGGING.md index d56ffa8a4..ecfdfabb2 100644 --- a/LOGGING.md +++ b/LOGGING.md @@ -10,9 +10,8 @@ The following components have been updated to follow this document: - [x] autoscaler-agent - [x] autoscale-scheduler (scheduler plugin) -- [x] vm-informant - [ ] neonvm-controlller -- [ ] neonvm-runner +- [x] neonvm-runner ## Common keys @@ -48,9 +47,8 @@ the VM's resources, so they share a logger name (`agent.runner.main`). ## Logger naming conventions -- `component.*` — each component (e.g. "autoscaler-agent", "vm-informant", etc) has logger names +- `component.*` — each component (e.g. "autoscaler-agent") has logger names prefixed with the name of the component - - This is necessary so that compute logs can filter out the vm-informant. - `*.main` — if the bulk of the logic for something is in one straightforward loop (like `autoscaler-agent.runner.main`) - `*.klog` — for klog output that's been redirected diff --git a/Makefile b/Makefile index 8f352aeab..9e51582dc 100644 --- a/Makefile +++ b/Makefile @@ -6,10 +6,13 @@ IMG_VXLAN ?= vxlan-controller:dev # Autoscaler related images AUTOSCALER_SCHEDULER_IMG ?= autoscale-scheduler:dev AUTOSCALER_AGENT_IMG ?= autoscaler-agent:dev -VM_INFORMANT_IMG ?= vm-informant:dev +VM_MONITOR_IMG ?= vm-monitor:dev E2E_TESTS_VM_IMG ?= vm-postgres:15-bullseye PG14_DISK_TEST_IMG ?= pg14-disk-test:dev +# Which branch of neondatabase/neon to pull the vm-monitor from +VM_MONITOR_BRANCH ?= main + # kernel for guests VM_KERNEL_VERSION ?= "5.15.80" @@ -120,7 +123,7 @@ build: fmt vet bin/vm-builder bin/vm-builder-generic ## Build all neonvm binarie .PHONY: bin/vm-builder bin/vm-builder: ## Build vm-builder binary. 
- CGO_ENABLED=0 go build -o bin/vm-builder -ldflags "-X main.Version=${GIT_INFO} -X main.VMInformant=${VM_INFORMANT_IMG}" neonvm/tools/vm-builder/main.go + CGO_ENABLED=0 go build -o bin/vm-builder -ldflags "-X main.Version=${GIT_INFO} -X main.VMMonitor=${VM_MONITOR_IMG}" neonvm/tools/vm-builder/main.go .PHONY: bin/vm-builder-generic bin/vm-builder-generic: ## Build vm-builder-generic binary. @@ -130,20 +133,20 @@ bin/vm-builder-generic: ## Build vm-builder-generic binary. run: fmt vet ## Run a controller from your host. go run ./neonvm/main.go -.PHONY: vm-informant -vm-informant: ## Build vm-informant image +.PHONY: vm-monitor +vm-monitor: ## Build vm-monitor image docker buildx build \ - --tag $(VM_INFORMANT_IMG) \ + --tag $(VM_MONITOR_IMG) \ --load \ - --build-arg GIT_INFO=$(GIT_INFO) \ - --file build/vm-informant/Dockerfile \ + --build-arg BRANCH=$(VM_MONITOR_BRANCH) \ + --file build/vm-monitor/Dockerfile \ . # If you wish built the controller image targeting other platforms you can use the --platform flag. # (i.e. docker build --platform linux/arm64 ). However, you must enable docker buildKit for it. # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ .PHONY: docker-build -docker-build: docker-build-controller docker-build-runner docker-build-vxlan-controller docker-build-autoscaler-agent docker-build-scheduler vm-informant ## Build docker images for NeonVM controllers, NeonVM runner, autoscaler-agent, and scheduler +docker-build: docker-build-controller docker-build-runner docker-build-vxlan-controller docker-build-autoscaler-agent docker-build-scheduler vm-monitor ## Build docker images for NeonVM controllers, NeonVM runner, autoscaler-agent, scheduler, vm-monitor .PHONY: docker-push docker-push: docker-build ## Push docker images to docker registry @@ -152,7 +155,7 @@ docker-push: docker-build ## Push docker images to docker registry docker push -q $(IMG_VXLAN) docker push -q $(AUTOSCALER_SCHEDULER_IMG) docker push -q $(AUTOSCALER_AGENT_IMG) - docker push -q $(VM_INFORMANT_IMG) + docker push -q $(VM_MONITOR_IMG) .PHONY: docker-build-controller docker-build-controller: ## Build docker image for NeonVM controller @@ -185,11 +188,11 @@ docker-build-scheduler: ## Build docker image for (autoscaling) scheduler . .PHONY: docker-build-examples -docker-build-examples: vm-informant bin/vm-builder ## Build docker images for testing VMs - ./bin/vm-builder -src postgres:15-bullseye -dst $(E2E_TESTS_VM_IMG) +docker-build-examples: bin/vm-builder ## Build docker images for testing VMs + ./bin/vm-builder -src postgres:15-bullseye -dst $(E2E_TESTS_VM_IMG) -enable-monitor .PHONY: docker-build-pg14-disk-test -docker-build-pg14-disk-test: vm-informant bin/vm-builder-generic ## Build a VM image for testing +docker-build-pg14-disk-test: vm-monitor bin/vm-builder-generic ## Build a VM image for testing if [ -a 'vm-examples/pg14-disk-test/ssh_id_rsa' ]; then \ echo "Skipping keygen because 'ssh_id_rsa' already exists"; \ else \ diff --git a/README.md b/README.md index e74c364f3..4f5e004e2 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ Images are available as: |----------------|------------| | scheduler (and plugin) | `neondatabase/autoscale-scheduler` | | autoscaler-agent | `neondatabase/autoscaler-agent` | -| VM informant | `neondatabase/vm-informant` | +| VM monitor | `neondatabase/vm-monitor` | -The deployment files and a VM informant binary are attached to each release. +The deployment files and a VM monitor binary are attached to each release. 
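The Makefile above injects the monitor image name into `vm-builder` via `-ldflags "-X ..."`. As a reminder of how that mechanism works (a sketch only, not `vm-builder`'s actual `main.go`), the linker overwrites package-level string variables at build time:

```go
// Sketch of the -ldflags "-X ..." plumbing the Makefile relies on.
package main

import "fmt"

// Overridden at link time, e.g.:
//   go build -ldflags "-X main.Version=<git info> -X main.VMMonitor=<image>" ...
var (
	Version   = "unknown"
	VMMonitor = "vm-monitor:dev"
)

func main() {
	fmt.Printf("vm-builder %s (default monitor image: %s)\n", Version, VMMonitor)
}
```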
For information on inter-version compatibility, see [`pkg/api/VERSIONING.md`](./pkg/api/VERSIONING.md). @@ -41,7 +41,7 @@ settled on the following: demand is above a pre-configured threshold * Each K8s node has an `autoscaler-agent` pod that triggers scaling decisions and makes resource requests to the K8s scheduler on the VMs' behalf to reserve additional resources for them -* Each VM runs the _VM informant_ binary, which communicates to the autoscaler-agent so that it can +* Each compute node runs the _VM monitor binary, which communicates to the autoscaler-agent so that it can immediately respond to memory pressure by allocating more (among other things). Networking is preserved across migrations by giving each VM an additional IP address on a bridge diff --git a/build/vm-informant/Dockerfile b/build/vm-informant/Dockerfile deleted file mode 100644 index bfa31e893..000000000 --- a/build/vm-informant/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM golang:1.20-alpine AS builder -WORKDIR /workspace - -RUN apk add gcc musl-dev # gcc (and therefore musl-dev) is required for cgo extensions - -COPY go.mod go.mod -COPY go.sum go.sum -RUN go mod download - -COPY neonvm/apis neonvm/apis -COPY pkg/api pkg/api -COPY pkg/informant pkg/informant -COPY pkg/util pkg/util -COPY cmd/vm-informant cmd/vm-informant - -ARG GIT_INFO - -RUN --mount=type=cache,target=/root/.cache/go-build \ - go build -a \ - # future compat: don't modify go.mod if we have a vendor directory \ - -mod readonly \ - # -ldflags "-X ..." allows us to overwrite the value of a variable in a package \ - -ldflags "-X 'github.com/neondatabase/autoscaling/pkg/util.BuildGitInfo=$GIT_INFO'" \ - cmd/vm-informant/main.go - -FROM alpine -COPY --from=builder /workspace/main /usr/bin/vm-informant -ENTRYPOINT ["/usr/bin/vm-informant"] diff --git a/build/vm-monitor/Dockerfile b/build/vm-monitor/Dockerfile new file mode 100644 index 000000000..6e4b95052 --- /dev/null +++ b/build/vm-monitor/Dockerfile @@ -0,0 +1,21 @@ +FROM rust:1.70-alpine as builder +WORKDIR /workspace + +RUN apk add musl-dev git + +# Which branch to pull from +ARG BRANCH + +# Ensures we reclone upon new commits +# https://stackoverflow.com/questions/35134713 +ADD "https://api.github.com/repos/neondatabase/neon/commits/$BRANCH" latest_commit + +RUN git clone --depth 1 --branch $BRANCH https://github.com/neondatabase/neon.git +RUN cargo build --release --manifest-path neon/libs/vm_monitor/Cargo.toml +# Move binary so we can cargo clean +RUN mkdir -p /workspace/bin && cp /workspace/neon/target/release/vm-monitor /workspace/bin +# Cargo clean dramatically reduces the size of the image +RUN cargo clean --release --manifest-path neon/libs/vm_monitor/Cargo.toml + +FROM builder +COPY --from=builder /workspace/bin/vm-monitor /usr/bin/vm-monitor diff --git a/cmd/autoscale-scheduler/main.go b/cmd/autoscale-scheduler/main.go index c8bb5a5b1..0024a049c 100644 --- a/cmd/autoscale-scheduler/main.go +++ b/cmd/autoscale-scheduler/main.go @@ -23,7 +23,9 @@ import ( // all of the juicy bits are defined in pkg/plugin/ func main() { - logger := zap.Must(zap.NewProduction()).Named("autoscale-scheduler") + logConfig := zap.NewProductionConfig() + logConfig.Sampling = nil // Disable sampling, which the production config enables by default. 
+ logger := zap.Must(logConfig.Build()).Named("autoscale-scheduler") logger.Info("", zap.Any("buildInfo", util.GetBuildInfo())) if err := runProgram(logger); err != nil { diff --git a/cmd/autoscaler-agent/main.go b/cmd/autoscaler-agent/main.go index 42997c537..c6292877a 100644 --- a/cmd/autoscaler-agent/main.go +++ b/cmd/autoscaler-agent/main.go @@ -21,7 +21,9 @@ import ( ) func main() { - logger := zap.Must(zap.NewProduction()).Named("autoscaler-agent") + logConfig := zap.NewProductionConfig() + logConfig.Sampling = nil // Disable sampling, which the production config enables by default. + logger := zap.Must(logConfig.Build()).Named("autoscaler-agent") defer logger.Sync() //nolint:errcheck // what are we gonna do, log something about it? logger.Info("", zap.Any("buildInfo", util.GetBuildInfo())) diff --git a/cmd/vm-informant/main.go b/cmd/vm-informant/main.go deleted file mode 100644 index 93b773e15..000000000 --- a/cmd/vm-informant/main.go +++ /dev/null @@ -1,242 +0,0 @@ -package main - -import ( - "context" - "flag" - "fmt" - "net/http" - "os" - "os/exec" - "os/signal" - "syscall" - "time" - - "github.com/containerd/cgroups/v3/cgroup2" - "github.com/tychoish/fun/srv" - "go.uber.org/zap" - - "github.com/neondatabase/autoscaling/pkg/informant" - "github.com/neondatabase/autoscaling/pkg/util" -) - -const minSubProcessRestartInterval = 5 * time.Second - -func main() { - logger := zap.Must(zap.NewProduction()).Named("vm-informant") - defer logger.Sync() //nolint:errcheck // what are we gonna do, log something about it? - - logger.Info("", zap.Any("buildInfo", util.GetBuildInfo())) - - ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGTERM) - defer cancel() - ctx = srv.SetShutdownSignal(ctx) // allows workers to cause a shutdown - ctx = srv.WithOrchestrator(ctx) // creates and starts an orchestrator - ctx = srv.SetBaseContext(ctx) // sets a context for starting async work in request scopes - - orca := srv.GetOrchestrator(ctx) - - defer func() { - if err := orca.Service().Wait(); err != nil { - logger.Panic("Failed to shut down service", zap.Error(err)) - } - }() - - // Below, we want to be able to distinguish between absence of flags and presence of empty - // flags. The only way we can reliably do this is by setting defaults to a sentinel value that - // isn't possible to create otherwise. In this case, it's a string containing a null byte, which - // cannot be provided (due to C's null-terminated strings). 
- invalidArgValue := "\x00" - - var cgroupName string - var autoRestart bool - var pgConnStr string - flag.StringVar(&cgroupName, "cgroup", invalidArgValue, "Sets the cgroup to monitor (optional)") - flag.BoolVar(&autoRestart, "auto-restart", false, "Automatically cleanup and restart on failure or exit") - flag.StringVar(&pgConnStr, "pgconnstr", invalidArgValue, "Sets the postgres connection string to enable file cache (optional)") - - flag.Parse() - - // If we were asked to restart on failure, handle that separately: - if autoRestart { - logger = logger.Named("parent") - - var args []string - var cleanupHooks []func() - - if pgConnStr != invalidArgValue { - args = append(args, "-pgconnstr", pgConnStr) - } - if cgroupName != invalidArgValue { - args = append(args, "-cgroup", cgroupName) - cleanupHooks = append(cleanupHooks, func() { - logger.Info("cleanup hook: making sure cgroup is thawed", zap.String("cgroup", cgroupName)) - manager, err := cgroup2.Load(fmt.Sprint("/", cgroupName)) - if err != nil { - logger.Error("Error making cgroup handler", zap.Error(err)) - return - } - if err := manager.Thaw(); err != nil { - logger.Error("Error thawing cgroup", zap.Error(err)) - } - }) - } - - runRestartOnFailure(ctx, logger, args, cleanupHooks) - closer := srv.GetShutdownSignal(ctx) - // this cancels the process' underlying context - closer() - // this drops to the defer that waits for all services to shutdown - // will run now. - return - } - - var stateOpts []informant.NewStateOpts - - if cgroupName != invalidArgValue { - logger := logger.With(zap.String("cgroup", cgroupName)) - - cgroupConfig := informant.DefaultCgroupConfig - logger.Info("Selected cgroup, starting handler", zap.Any("config", cgroupConfig)) - cgroup, err := informant.NewCgroupManager(logger.Named("cgroup").Named("manager"), cgroupName) - if err != nil { - logger.Fatal("Error starting cgroup handler", zap.Error(err)) - } - - stateOpts = append(stateOpts, informant.WithCgroup(cgroup, cgroupConfig)) - } else { - logger.Info("No cgroup selected") - } - - if pgConnStr != invalidArgValue { - logger := logger.With(zap.String("fileCacheConnstr", pgConnStr)) - - fileCacheConfig := informant.DefaultFileCacheConfig - logger.Info("Selected postgres file cache", zap.Any("config", fileCacheConfig)) - stateOpts = append(stateOpts, informant.WithPostgresFileCache(pgConnStr, fileCacheConfig)) - } else { - logger.Info("No postgres file cache selected") - } - - agents := informant.NewAgentSet(logger) - state, err := informant.NewState(logger, agents, informant.DefaultStateConfig, stateOpts...) - if err != nil { - logger.Fatal("Error starting informant.NewState", zap.Error(err)) - } - - mux := http.NewServeMux() - hl := logger.Named("handle") - util.AddHandler(hl, mux, "/register", http.MethodPost, "AgentDesc", state.RegisterAgent) - util.AddHandler(hl, mux, "/health-check", http.MethodPut, "AgentIdentification", state.HealthCheck) - util.AddHandler(hl, mux, "/downscale", http.MethodPut, "AgentResourceMessage", state.TryDownscale) - util.AddHandler(hl, mux, "/upscale", http.MethodPut, "AgentResourceMessage", state.NotifyUpscale) - util.AddHandler(hl, mux, "/unregister", http.MethodDelete, "AgentDesc", state.UnregisterAgent) - - addr := "0.0.0.0:10301" - hl.Info("Starting server", zap.String("addr", addr)) - - // we create an http service and add it to the orchestrator, - // which will start it and manage its lifecycle. 
- if err := orca.Add(srv.HTTP("vm-informant-api", 5*time.Second, &http.Server{Addr: addr, Handler: mux})); err != nil { - logger.Fatal("Failed to add API server", zap.Error(err)) - } - - // we drop to the defers now, which will block until the signal - // handler is called. -} - -// runRestartOnFailure repeatedly calls this binary with the same flags, but with 'auto-restart' -// removed. -// -// We execute ourselves as a subprocess so that it's possible to appropriately cleanup after -// termination by various signals (or an unhandled panic!). This is worthwhile because we *really* -// don't want to leave the cgroup frozen while waiting to restart. -func runRestartOnFailure(ctx context.Context, logger *zap.Logger, args []string, cleanupHooks []func()) { - selfPath := os.Args[0] - timer := time.NewTimer(0) - defer timer.Stop() - - for { - startTime := time.Now() - sig := make(chan struct{}) - - func() { - pctx, pcancel := context.WithCancel(context.Background()) - defer pcancel() - - cmd := exec.Command(selfPath, args...) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - - logger.Info("Starting child vm-informant", zap.Any("args", args)) - err := cmd.Start() - if err == nil { - go func() { - defer close(sig) - - select { - case <-pctx.Done(): - return - case <-ctx.Done(): - if pctx.Err() != nil { - // the process has already returned - // and we don't need to signal it - return - } - if err := cmd.Process.Signal(syscall.SIGTERM); err != nil { - logger.Warn("Could not signal child vm-informant process", zap.Error(err)) - } - } - }() - - // this is blocking, but we should - // have killed the process in the - // wait goroutine, or the process would - // return normally. - err = cmd.Wait() - // stop the goroutine above, as the - // process has already returned. - pcancel() - } - - if err != nil { - logger.Error("Child vm-informrant exited with error", zap.Error(err)) - } else { - logger.Warn("Child vm-informant exited without error. 
This should not happen") - } - - for _, h := range cleanupHooks { - h() - } - }() - - select { - case <-ctx.Done(): - logger.Info("Received shutdown signal") - return - case <-sig: - dur := time.Since(startTime) - if dur < minSubProcessRestartInterval { - // drain the timer before resetting it, required by Timer.Reset:: - if !timer.Stop() { - <-timer.C - } - timer.Reset(minSubProcessRestartInterval - dur) - - logger.Info( - "Child vm-informant failed, respecting minimum delay before restart", - zap.Duration("delay", minSubProcessRestartInterval), - ) - select { - case <-ctx.Done(): - logger.Info("Received shutdown signal while delaying before restart", zap.Duration("delay", minSubProcessRestartInterval)) - return - case <-timer.C: - continue - } - } - - logger.Info("Restarting child vm-informant immediately") - continue - } - } -} diff --git a/deploy/agent/config_map.yaml b/deploy/agent/config_map.yaml index 280f8d2b5..5f06d0ff8 100644 --- a/deploy/agent/config_map.yaml +++ b/deploy/agent/config_map.yaml @@ -9,23 +9,23 @@ data: "scaling": { "requestTimeoutSeconds": 10, "defaultConfig": { - "loadAverageFractionTarget": 0.9 + "loadAverageFractionTarget": 0.9, + "memoryUsageFractionTarget": 0.75 } }, - "informant": { - "serverPort": 10301, - "retryServerMinWaitSeconds": 5, - "retryServerNormalWaitSeconds": 5, - "retryDeniedDownscaleSeconds": 5, - "retryFailedRequestSeconds": 3, - "registerRetrySeconds": 5, - "requestTimeoutSeconds": 1, - "registerTimeoutSeconds": 2, - "downscaleTimeoutSeconds": 2, - "unhealthyAfterSilenceDurationSeconds": 20, - "unhealthyStartupGracePeriodSeconds": 20 + "monitor": { + "serverPort": 10301, + "responseTimeoutSeconds": 5, + "connectionTimeoutSeconds": 4, + "connectionRetryMinWaitSeconds": 5, + "unhealthyAfterSilenceDurationSeconds": 20, + "unhealthyStartupGracePeriodSeconds": 20, + "maxHealthCheckSequentialFailuresSeconds": 30, + "retryDeniedDownscaleSeconds": 5, + "retryFailedRequestSeconds": 3 }, "metrics": { + "port": 9100, "loadMetricPrefix": "host_", "requestTimeoutSeconds": 2, "secondsBetweenRequests": 5 diff --git a/deploy/scheduler/config_map.yaml b/deploy/scheduler/config_map.yaml index 075823f92..95c9378b5 100644 --- a/deploy/scheduler/config_map.yaml +++ b/deploy/scheduler/config_map.yaml @@ -27,9 +27,12 @@ data: { "memBlockSize": "1Gi", "nodeDefaults": { - "cpu": { "watermark": 0.7, "system": "500m" }, - "memory": { "watermark": 0.7, "system": "0.5Gi" }, - "computeUnit": { "vCPUs": 0.25, "mem": 1 } + "cpu": { "watermark": 0.9 }, + "memory": { "watermark": 0.9 }, + "computeUnit": { "vCPUs": 0.25, "mem": 1 }, + "minUsageScore": 0.5, + "maxUsageScore": 0, + "scorePeak": 0.8 }, "nodeOverrides": [], "schedulerName": "autoscale-scheduler", @@ -37,5 +40,7 @@ data: "port": 10298, "timeoutSeconds": 5 }, - "doMigration": false + "migrationDeletionRetrySeconds": 5, + "doMigration": false, + "randomizeScores": true } diff --git a/doc/vm-builder/README.md b/doc/vm-builder/README.md new file mode 100644 index 000000000..bcef96602 --- /dev/null +++ b/doc/vm-builder/README.md @@ -0,0 +1,189 @@ +This README covers non-trivial implementation details of vm-builder / vm-builder-generic. + +What `vm-builder` Does +====================== + +vm-builder consumes a Docker image and turns it into a new docker image that runs the Docker container in a qemu VM. +The OS in the VM is a minimal Alpine Linux / busybox environment. +We use busybox `init` as the init system, configured through `/etc/inittab`. +Likewise, the `poweroff` command is provided by busybox. 
+ +We use a virtual CDROM to deliver the container launch command / entrypoint+arguments into the VM. +The script is called `vmstarter.sh`. +It is launched by the `vmstart` script which in turn is configured as a `respawn` service in the `inittab`. +After `vmstarter.sh` exits, `vmstart` exits, and then gets restarted by `respawn`. +This is a bit like docker in `--restart=always` mode. + +**Graceful shutdown** of the container-turned-VM is done through a virtual ACPI power button event. +`acpid` handles the ACPI events and we configure it to call the busybox `poweroff` command. + +Busybox Init & Shutdown +======================= + +The busybox `poweroff` command is integrated with the busybox `init` command as follows: + +0. Invoking busybox `poweroff` signals SIGUSR2 to the busybox `init` process. + The `init` process then does the following: +1. Stop waiting for child processes to exit, and stop restarting child + processes that are marked `respawn` in the inittab. +2. Run the `shutdown` directives in the inittab, in the order + in which they are specified. +3. Send SIGTERM to all processes. +4. Sleep 1 second. +5. (minor details omitted) +6. Call into kernel to poweroff. + +What follows are links to the busybox source code to "prove" the above. + +The `poweroff` command invoked by acpid is the busybox poweroff. +At runtime, we take the following branch: +https://github.com/brgl/busybox//blob/97e9a72c71d0238c9f241612ce4af923c16954c7/init/halt.c#L172-L173 +The `signals[which]` variable is `SIGUSR2` for the `poweroff` "applet". + +The code in `init` that handles `SIGUSR2` is the `check_delayed_signals` function that is called form inside `init`'s main loop. +Code taken at runtime when `poweroff` signals `SIGUSR2`: + +* main loop calls `check_delayed_signals`: https://github.com/brgl/busybox//blob/f35ad3bd1287627fc6ca7cc9c1f48b186257dd87/init/init.c#L1219 +* check_delayed_signals detects `SIGUSR2` was signalled and calls `halt_reboot_pwoff`, this call will never return: https://github.com/brgl/busybox//blob/f35ad3bd1287627fc6ca7cc9c1f48b186257dd87/init/init.c#L996-L1005 +* it calls `run_shutdown_and_kill_processes` https://github.com/brgl/busybox//blob/f35ad3bd1287627fc6ca7cc9c1f48b186257dd87/init/init.c#L821 +* Runs `shutdown` actions in the inittab: https://github.com/brgl/busybox//blob/f35ad3bd1287627fc6ca7cc9c1f48b186257dd87/init/init.c#L751-L754 +* SIGTERM, pause, SIGKILL (not relevant for as because we take down postgres & compute_ctl through the shutdown action added in this PR: https://github.com/brgl/busybox//blob/f35ad3bd1287627fc6ca7cc9c1f48b186257dd87/init/init.c#L758-L766 +* Log shutdown and call into kernel: https://github.com/brgl/busybox//blob/f35ad3bd1287627fc6ca7cc9c1f48b186257dd87/init/init.c#L832-L833 + + +The Role Of `vm-builder` in Neon Autoscaling +============================================ + +In Neon's autoscaling, we use `vm-builder` to turn the `neon.git` compute Docker image into a VM. +This means the `vmstarter.sh` will launch the `compute_ctl`, which in turn: +1. waits for a spec +2. gets basebackup from compute +3. launches Postgres +4. waits for Postgres to exit +5. does a sync safekeepers +6. exits itself. + +Neon Control Plane's `suspend_compute` relies on ACPI shutdown +signalling for graceful shutdown of the NeonVM. +If the NeonVM doesn't shut down timely, the pod that contains +the qemu process gets SIGKILLed. 
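The timing contract here is the usual "graceful, then forceful" pattern. The sketch below is not neonvm-runner's code; it only illustrates waiting out a grace period after the polite shutdown request before a hard kill:

```go
// Generic illustration of SIGTERM/ACPI-then-SIGKILL with a deadline — not the
// actual runner implementation.
package main

import (
	"os/exec"
	"syscall"
	"time"
)

func stopWithDeadline(cmd *exec.Cmd, grace time.Duration) error {
	_ = cmd.Process.Signal(syscall.SIGTERM) // the runner would send an ACPI power button event here

	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	select {
	case err := <-done:
		return err // guest powered off within the grace period
	case <-time.After(grace):
		_ = cmd.Process.Kill() // deadline exceeded: hard kill, like Kubernetes' SIGKILL
		return <-done
	}
}

func main() {
	cmd := exec.Command("sleep", "60")
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	_ = stopWithDeadline(cmd, time.Second)
}
```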
+ +What Happens On ACPI Shutdown +============================= + +Here is a mermaid diagram of what happens during shutdown: + + +```mermaid +sequenceDiagram + + participant k8s + participant vmrunner + participant Qemu + participant GuestKernel + participant acpid + participant poweroff + participant init + participant vmstart + participant vmshutdown + participant vmstart.allowed + participant flock as flock vmstart.lock + participant vmstarter.sh + + + + GuestKernel->>init: start + init->>vmstart.allowed: create + activate vmstart.allowed + + init->>vmstart: start + vmstart->>+flock: flock + flock->>vmstart.allowed: check existence + vmstart.allowed->>flock: . + + flock->>+vmstarter.sh: start and wait + Note over vmstarter.sh: workload is running + Note over vmstarter.sh: exits for whatever reason + vmstarter.sh->>-flock: exits + flock->>-vmstart: exits + vmstart->>init: exits + + Note over init: has not yet received shutdown request + Note over init: so, respawn + + init->>vmstart: start + vmstart->>+flock: . + flock->>vmstart.allowed: check existence + vmstart.allowed->>flock: . + + flock->>+vmstarter.sh: start and wait + Note over vmstarter.sh: workload is running + + k8s-)vmrunner: SIGTERM + Note over k8s: will SIGKILL the container,
including QEMU after timeout + vmrunner->>Qemu: send ACPI power button event + Qemu-)GuestKernel: ACPI power button event + GuestKernel->>acpid: . + acpid->>poweroff: . + poweroff->>init: SIGUSR2 + Note over init: will no longer respawn
but also not stop anything either + Note over init: run shutdown actions + init->>vmshutdown: start + vmshutdown->>vmstart.allowed: unlink + deactivate vmstart.allowed + Note over vmstart.allowed: vmstart's existence check
will fail from here on + loop Until we win the flock + vmshutdown-xflock: nonblock try acquire fails + vmshutdown-)vmstarter.sh: signal to shut down + end + vmstarter.sh->>-flock: eventually exits in
response to signal + flock->>-vmstart: exits + vmshutdown->>+flock: nonblock try acquire succeeds + flock->>-vmshutdown: exit immediately + + Note over vmshutdown: we acquired the flock once after removing vmstart.allowed.
This ensures vmstarter.sh is not running. + + vmshutdown->>init: exit + + Note over init: SIGTERM + Note over init: sleep 1 second + Note over init: kill everything + init->>GuestKernel: power off system call + Note over GuestKernel: powers off the machine +``` + +## How It Looks Inside The VM + +In a neon.git-compute-image-turned-vm image, running in staging, it looks like this + +``` +ps -eHo pid,command | cat +... +/neonvm/bin/sh /neonvm/bin/vmstart + 149 flock /neonvm/vmstart.lock -c test -e /neonvm/vmstart.allowed && /neonvm/bin/su-exec postgres /neonvm/bin/sh /neonvm/bin/vmstarter.sh + 150 /bin/sh -c test -e /neonvm/vmstart.allowed && /neonvm/bin/su-exec postgres /neonvm/bin/sh /neonvm/bin/vmstarter.sh + 151 /neonvm/bin/sh /neonvm/bin/vmstarter.sh + 152 /usr/local/bin/compute_ctl -D /var/db/postgres/compute/pgdata -b /usr/local/bin/postgres -C postgresql://cloud_admin@127.0.0.1/postgres?options=-c%20default_transaction_read_only=false --remote-ext-config {"bucket":"neon-dev-extensions-us-east-2","region":"us-east-2"} --compute-id compute-long-flower-94034268 --control-plane-uri http://neon-compute-api.aws.neon.build:9096 + 178 /usr/local/bin/postgres -D /var/db/postgres/compute/pgdata + 182 postgres: checkpointer + 183 postgres: background writer + 185 postgres: walwriter + 186 postgres: autovacuum launcher + 187 postgres: pg_cron launcher + 188 postgres: TimescaleDB Background Worker Launcher + 189 postgres: WAL proposer streaming 0/1FD62B0 + 190 postgres: Local free space monitor + 191 postgres: logical replication launcher + 201 postgres: cloud_admin postgres 127.0.0.1(33860) idle + 204 postgres: cloud_admin postgres ::1(53686) idle +... +``` + +## TLA+ Model Of Shutdown + +The `./shutdown/shutdown.tla` model is a PlusCal specification of the shutdown procedure. + +TLC model checker configuration: + +* Check for deadlocks, there shouldn't be any. +* Check temporal properties `TEMPORAL PROPERTIES` at the bottom of the spec. 
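The real `vmstart`/`vmshutdown` are shell scripts; the Go sketch below only models the handshake shown in the diagram and checked by the TLA+ spec: remove `vmstart.allowed`, then spin on a non-blocking `flock` (asking the workload to stop each time) until the lock is won, which proves no `vmstarter.sh` is running and none can start.

```go
// Model of the vmshutdown handshake (Linux flock) — illustrative, not the
// shell scripts shipped in the VM image.
package main

import (
	"os"
	"syscall"
	"time"
)

func vmshutdown(lockPath, allowedPath string, askWorkloadToStop func()) error {
	// 1. Inhibit new starts: vmstart checks this file while holding the lock.
	if err := os.Remove(allowedPath); err != nil && !os.IsNotExist(err) {
		return err
	}

	f, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o644)
	if err != nil {
		return err
	}
	defer f.Close()

	// 2. Until we win the lock, the current vmstarter.sh is still running;
	//    keep asking it to stop.
	for {
		err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
		if err == nil {
			// Holding the lock after removing vmstart.allowed guarantees that
			// no vmstarter.sh is running and none can start.
			return nil
		}
		if err != syscall.EWOULDBLOCK {
			return err
		}
		askWorkloadToStop() // e.g. pg_ctl stop / signal the workload
		time.Sleep(100 * time.Millisecond)
	}
}

func main() {
	_ = vmshutdown("/tmp/vmstart.lock", "/tmp/vmstart.allowed", func() {})
}
```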
diff --git a/doc/vm-builder/shutdown/shutdown.tla b/doc/vm-builder/shutdown/shutdown.tla new file mode 100644 index 000000000..9d8666481 --- /dev/null +++ b/doc/vm-builder/shutdown/shutdown.tla @@ -0,0 +1,515 @@ +----------------------------- MODULE vmshutdown ----------------------------- + +EXTENDS Sequences, Integers, TLC + +CONSTANT NULL + +(*--algorithm vmshutdown + +variables + start_allowed = TRUE, \* vmstart.allowed + start_allowed_locked = FALSE, \* vmstart.lock + + \* ACPI & unix signal delivery, modeled through variables that are polled/await'ed + shutdown_signal_received = FALSE, + postgres_running = NULL, + postgres_spawn_pending = NULL, + postgres_shutdown_request_pending = NULL, + postgres_next_pids = <<1,2>>, \* bound number of crashes + postgres_exited_pids = {}, + + machine_running = TRUE, + vmshutdown_exited = FALSE, + + \* for temporal invariants + vmstarter_sh_running = FALSE + + +fair process init = "init" +begin + init: + while ~shutdown_signal_received do + either + \* disable respawn loop & run vmshutdown + shutdown_signal_received := TRUE; + or + skip; + end either; + end while; + wait_for_vmshutdown: + await vmshutdown_exited; + poweroff_to_kernel: + machine_running := FALSE; +end process; + +fair process respawn_vmstart = "respawn_vmstart" +variables + respawn_current_postgres_pid = NULL +begin + init: + while ~shutdown_signal_received do + + respawn_flock_enter: + await start_allowed_locked = FALSE; + start_allowed_locked := TRUE; + respawn_check_start_allowed: + if start_allowed then + respawn_launch_vmstarter_sh: + vmstarter_sh_running := TRUE; + respawn_vmstarter_launch_postgres: + postgres_spawn_pending := Head(postgres_next_pids); + respawn_current_postgres_pid := postgres_spawn_pending; + postgres_next_pids := Tail(postgres_next_pids); + respawn_vmstarter_wait_postgres: + await respawn_current_postgres_pid \in postgres_exited_pids; + respawn_vmstarter_sh_exits: + vmstarter_sh_running := FALSE; + else + respawn_not_allowed: + skip; + end if; + respawn_flock_exit: + start_allowed_locked := FALSE; + end while; + +end process; + +fair process postgres = "postgres" +begin + init: + while machine_running do + postgres_wait_to_be_launched: + await ~machine_running \/ postgres_spawn_pending /= NULL; + if ~machine_running then + goto halt; + else + postgres_running := postgres_spawn_pending; + postgres_spawn_pending := NULL; + end if; + + postgres_await_shutdown_or_crash: + + \* bound number of crashes to pids left, otherwise we have infinite state space "until" shutdown signal gets delivered + if Len(postgres_next_pids) > 0 then + either + await postgres_shutdown_request_pending = postgres_running; + or + \* crash / exit on its own + skip; + end either; + else + await postgres_shutdown_request_pending = postgres_running; + end if; + postgres_exited_pids := postgres_exited_pids \union {postgres_running}; + postgres_running := NULL; + end while; + halt: + skip; +end process; + +fair process vmshutdown = "vmshutdown" +begin + init: + await shutdown_signal_received; + + vmshutdown_inhibit_new_starts: + start_allowed := FALSE; \* rm the vmstart.allowed file on disk + vmshutdown_kill_running_command: + \* if there was a command running from before vmshutdown_inhibit_new_starts, + \* it is holding the lock. 
+ if start_allowed_locked = TRUE then \* use trylock to implement this + vmshutdown_pg_ctl_stop: + \* the `if` models signal loss + if postgres_running /= NULL then + postgres_shutdown_request_pending := postgres_running; + end if; + goto vmshutdown_kill_running_command; + end if; + vmshutdown_done: + vmshutdown_exited := TRUE; + skip; +end process; + + +end algorithm; *) +\* BEGIN TRANSLATION (chksum(pcal) = "d013f716" /\ chksum(tla) = "e8963d9a") +\* Label init of process init at line 31 col 5 changed to init_ +\* Label init of process respawn_vmstart at line 50 col 5 changed to init_r +\* Label init of process postgres at line 80 col 5 changed to init_p +\* Label init of process vmshutdown at line 113 col 9 changed to init_v +VARIABLES start_allowed, start_allowed_locked, shutdown_signal_received, + postgres_running, postgres_spawn_pending, + postgres_shutdown_request_pending, postgres_next_pids, + postgres_exited_pids, machine_running, vmshutdown_exited, + vmstarter_sh_running, pc, respawn_current_postgres_pid + +vars == << start_allowed, start_allowed_locked, shutdown_signal_received, + postgres_running, postgres_spawn_pending, + postgres_shutdown_request_pending, postgres_next_pids, + postgres_exited_pids, machine_running, vmshutdown_exited, + vmstarter_sh_running, pc, respawn_current_postgres_pid >> + +ProcSet == {"init"} \cup {"respawn_vmstart"} \cup {"postgres"} \cup {"vmshutdown"} + +Init == (* Global variables *) + /\ start_allowed = TRUE + /\ start_allowed_locked = FALSE + /\ shutdown_signal_received = FALSE + /\ postgres_running = NULL + /\ postgres_spawn_pending = NULL + /\ postgres_shutdown_request_pending = NULL + /\ postgres_next_pids = <<1,2>> + /\ postgres_exited_pids = {} + /\ machine_running = TRUE + /\ vmshutdown_exited = FALSE + /\ vmstarter_sh_running = FALSE + (* Process respawn_vmstart *) + /\ respawn_current_postgres_pid = NULL + /\ pc = [self \in ProcSet |-> CASE self = "init" -> "init_" + [] self = "respawn_vmstart" -> "init_r" + [] self = "postgres" -> "init_p" + [] self = "vmshutdown" -> "init_v"] + +init_ == /\ pc["init"] = "init_" + /\ IF ~shutdown_signal_received + THEN /\ \/ /\ shutdown_signal_received' = TRUE + \/ /\ TRUE + /\ UNCHANGED shutdown_signal_received + /\ pc' = [pc EXCEPT !["init"] = "init_"] + ELSE /\ pc' = [pc EXCEPT !["init"] = "wait_for_vmshutdown"] + /\ UNCHANGED shutdown_signal_received + /\ UNCHANGED << start_allowed, start_allowed_locked, postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, postgres_next_pids, + postgres_exited_pids, machine_running, + vmshutdown_exited, vmstarter_sh_running, + respawn_current_postgres_pid >> + +wait_for_vmshutdown == /\ pc["init"] = "wait_for_vmshutdown" + /\ vmshutdown_exited + /\ pc' = [pc EXCEPT !["init"] = "poweroff_to_kernel"] + /\ UNCHANGED << start_allowed, start_allowed_locked, + shutdown_signal_received, + postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, + postgres_exited_pids, machine_running, + vmshutdown_exited, vmstarter_sh_running, + respawn_current_postgres_pid >> + +poweroff_to_kernel == /\ pc["init"] = "poweroff_to_kernel" + /\ machine_running' = FALSE + /\ pc' = [pc EXCEPT !["init"] = "Done"] + /\ UNCHANGED << start_allowed, start_allowed_locked, + shutdown_signal_received, + postgres_running, postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, postgres_exited_pids, + vmshutdown_exited, vmstarter_sh_running, + respawn_current_postgres_pid >> + +init == init_ \/ 
wait_for_vmshutdown \/ poweroff_to_kernel + +init_r == /\ pc["respawn_vmstart"] = "init_r" + /\ IF ~shutdown_signal_received + THEN /\ pc' = [pc EXCEPT !["respawn_vmstart"] = "respawn_flock_enter"] + ELSE /\ pc' = [pc EXCEPT !["respawn_vmstart"] = "Done"] + /\ UNCHANGED << start_allowed, start_allowed_locked, + shutdown_signal_received, postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, postgres_exited_pids, + machine_running, vmshutdown_exited, + vmstarter_sh_running, respawn_current_postgres_pid >> + +respawn_flock_enter == /\ pc["respawn_vmstart"] = "respawn_flock_enter" + /\ start_allowed_locked = FALSE + /\ start_allowed_locked' = TRUE + /\ pc' = [pc EXCEPT !["respawn_vmstart"] = "respawn_check_start_allowed"] + /\ UNCHANGED << start_allowed, shutdown_signal_received, + postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, + postgres_exited_pids, machine_running, + vmshutdown_exited, vmstarter_sh_running, + respawn_current_postgres_pid >> + +respawn_check_start_allowed == /\ pc["respawn_vmstart"] = "respawn_check_start_allowed" + /\ IF start_allowed + THEN /\ pc' = [pc EXCEPT !["respawn_vmstart"] = "respawn_launch_vmstarter_sh"] + ELSE /\ pc' = [pc EXCEPT !["respawn_vmstart"] = "respawn_not_allowed"] + /\ UNCHANGED << start_allowed, + start_allowed_locked, + shutdown_signal_received, + postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, + postgres_exited_pids, + machine_running, + vmshutdown_exited, + vmstarter_sh_running, + respawn_current_postgres_pid >> + +respawn_launch_vmstarter_sh == /\ pc["respawn_vmstart"] = "respawn_launch_vmstarter_sh" + /\ vmstarter_sh_running' = TRUE + /\ pc' = [pc EXCEPT !["respawn_vmstart"] = "respawn_vmstarter_launch_postgres"] + /\ UNCHANGED << start_allowed, + start_allowed_locked, + shutdown_signal_received, + postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, + postgres_exited_pids, + machine_running, + vmshutdown_exited, + respawn_current_postgres_pid >> + +respawn_vmstarter_launch_postgres == /\ pc["respawn_vmstart"] = "respawn_vmstarter_launch_postgres" + /\ postgres_spawn_pending' = Head(postgres_next_pids) + /\ respawn_current_postgres_pid' = postgres_spawn_pending' + /\ postgres_next_pids' = Tail(postgres_next_pids) + /\ pc' = [pc EXCEPT !["respawn_vmstart"] = "respawn_vmstarter_wait_postgres"] + /\ UNCHANGED << start_allowed, + start_allowed_locked, + shutdown_signal_received, + postgres_running, + postgres_shutdown_request_pending, + postgres_exited_pids, + machine_running, + vmshutdown_exited, + vmstarter_sh_running >> + +respawn_vmstarter_wait_postgres == /\ pc["respawn_vmstart"] = "respawn_vmstarter_wait_postgres" + /\ respawn_current_postgres_pid \in postgres_exited_pids + /\ pc' = [pc EXCEPT !["respawn_vmstart"] = "respawn_vmstarter_sh_exits"] + /\ UNCHANGED << start_allowed, + start_allowed_locked, + shutdown_signal_received, + postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, + postgres_exited_pids, + machine_running, + vmshutdown_exited, + vmstarter_sh_running, + respawn_current_postgres_pid >> + +respawn_vmstarter_sh_exits == /\ pc["respawn_vmstart"] = "respawn_vmstarter_sh_exits" + /\ vmstarter_sh_running' = FALSE + /\ pc' = [pc EXCEPT !["respawn_vmstart"] = "respawn_flock_exit"] + /\ UNCHANGED << start_allowed, + start_allowed_locked, + shutdown_signal_received, + 
postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, + postgres_exited_pids, + machine_running, + vmshutdown_exited, + respawn_current_postgres_pid >> + +respawn_not_allowed == /\ pc["respawn_vmstart"] = "respawn_not_allowed" + /\ TRUE + /\ pc' = [pc EXCEPT !["respawn_vmstart"] = "respawn_flock_exit"] + /\ UNCHANGED << start_allowed, start_allowed_locked, + shutdown_signal_received, + postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, + postgres_exited_pids, machine_running, + vmshutdown_exited, vmstarter_sh_running, + respawn_current_postgres_pid >> + +respawn_flock_exit == /\ pc["respawn_vmstart"] = "respawn_flock_exit" + /\ start_allowed_locked' = FALSE + /\ pc' = [pc EXCEPT !["respawn_vmstart"] = "init_r"] + /\ UNCHANGED << start_allowed, shutdown_signal_received, + postgres_running, postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, postgres_exited_pids, + machine_running, vmshutdown_exited, + vmstarter_sh_running, + respawn_current_postgres_pid >> + +respawn_vmstart == init_r \/ respawn_flock_enter + \/ respawn_check_start_allowed + \/ respawn_launch_vmstarter_sh + \/ respawn_vmstarter_launch_postgres + \/ respawn_vmstarter_wait_postgres + \/ respawn_vmstarter_sh_exits \/ respawn_not_allowed + \/ respawn_flock_exit + +init_p == /\ pc["postgres"] = "init_p" + /\ IF machine_running + THEN /\ pc' = [pc EXCEPT !["postgres"] = "postgres_wait_to_be_launched"] + ELSE /\ pc' = [pc EXCEPT !["postgres"] = "halt"] + /\ UNCHANGED << start_allowed, start_allowed_locked, + shutdown_signal_received, postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, postgres_exited_pids, + machine_running, vmshutdown_exited, + vmstarter_sh_running, respawn_current_postgres_pid >> + +postgres_wait_to_be_launched == /\ pc["postgres"] = "postgres_wait_to_be_launched" + /\ ~machine_running \/ postgres_spawn_pending /= NULL + /\ IF ~machine_running + THEN /\ pc' = [pc EXCEPT !["postgres"] = "halt"] + /\ UNCHANGED << postgres_running, + postgres_spawn_pending >> + ELSE /\ postgres_running' = postgres_spawn_pending + /\ postgres_spawn_pending' = NULL + /\ pc' = [pc EXCEPT !["postgres"] = "postgres_await_shutdown_or_crash"] + /\ UNCHANGED << start_allowed, + start_allowed_locked, + shutdown_signal_received, + postgres_shutdown_request_pending, + postgres_next_pids, + postgres_exited_pids, + machine_running, + vmshutdown_exited, + vmstarter_sh_running, + respawn_current_postgres_pid >> + +postgres_await_shutdown_or_crash == /\ pc["postgres"] = "postgres_await_shutdown_or_crash" + /\ IF Len(postgres_next_pids) > 0 + THEN /\ \/ /\ postgres_shutdown_request_pending = postgres_running + \/ /\ TRUE + ELSE /\ postgres_shutdown_request_pending = postgres_running + /\ postgres_exited_pids' = (postgres_exited_pids \union {postgres_running}) + /\ postgres_running' = NULL + /\ pc' = [pc EXCEPT !["postgres"] = "init_p"] + /\ UNCHANGED << start_allowed, + start_allowed_locked, + shutdown_signal_received, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, + machine_running, + vmshutdown_exited, + vmstarter_sh_running, + respawn_current_postgres_pid >> + +halt == /\ pc["postgres"] = "halt" + /\ TRUE + /\ pc' = [pc EXCEPT !["postgres"] = "Done"] + /\ UNCHANGED << start_allowed, start_allowed_locked, + shutdown_signal_received, postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, 
postgres_next_pids, + postgres_exited_pids, machine_running, + vmshutdown_exited, vmstarter_sh_running, + respawn_current_postgres_pid >> + +postgres == init_p \/ postgres_wait_to_be_launched + \/ postgres_await_shutdown_or_crash \/ halt + +init_v == /\ pc["vmshutdown"] = "init_v" + /\ shutdown_signal_received + /\ pc' = [pc EXCEPT !["vmshutdown"] = "vmshutdown_inhibit_new_starts"] + /\ UNCHANGED << start_allowed, start_allowed_locked, + shutdown_signal_received, postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, postgres_exited_pids, + machine_running, vmshutdown_exited, + vmstarter_sh_running, respawn_current_postgres_pid >> + +vmshutdown_inhibit_new_starts == /\ pc["vmshutdown"] = "vmshutdown_inhibit_new_starts" + /\ start_allowed' = FALSE + /\ pc' = [pc EXCEPT !["vmshutdown"] = "vmshutdown_kill_running_command"] + /\ UNCHANGED << start_allowed_locked, + shutdown_signal_received, + postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, + postgres_exited_pids, + machine_running, + vmshutdown_exited, + vmstarter_sh_running, + respawn_current_postgres_pid >> + +vmshutdown_kill_running_command == /\ pc["vmshutdown"] = "vmshutdown_kill_running_command" + /\ IF start_allowed_locked = TRUE + THEN /\ pc' = [pc EXCEPT !["vmshutdown"] = "vmshutdown_pg_ctl_stop"] + ELSE /\ pc' = [pc EXCEPT !["vmshutdown"] = "vmshutdown_done"] + /\ UNCHANGED << start_allowed, + start_allowed_locked, + shutdown_signal_received, + postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, + postgres_exited_pids, + machine_running, + vmshutdown_exited, + vmstarter_sh_running, + respawn_current_postgres_pid >> + +vmshutdown_pg_ctl_stop == /\ pc["vmshutdown"] = "vmshutdown_pg_ctl_stop" + /\ IF postgres_running /= NULL + THEN /\ postgres_shutdown_request_pending' = postgres_running + ELSE /\ TRUE + /\ UNCHANGED postgres_shutdown_request_pending + /\ pc' = [pc EXCEPT !["vmshutdown"] = "vmshutdown_kill_running_command"] + /\ UNCHANGED << start_allowed, start_allowed_locked, + shutdown_signal_received, + postgres_running, + postgres_spawn_pending, + postgres_next_pids, + postgres_exited_pids, + machine_running, vmshutdown_exited, + vmstarter_sh_running, + respawn_current_postgres_pid >> + +vmshutdown_done == /\ pc["vmshutdown"] = "vmshutdown_done" + /\ vmshutdown_exited' = TRUE + /\ TRUE + /\ pc' = [pc EXCEPT !["vmshutdown"] = "Done"] + /\ UNCHANGED << start_allowed, start_allowed_locked, + shutdown_signal_received, postgres_running, + postgres_spawn_pending, + postgres_shutdown_request_pending, + postgres_next_pids, postgres_exited_pids, + machine_running, vmstarter_sh_running, + respawn_current_postgres_pid >> + +vmshutdown == init_v \/ vmshutdown_inhibit_new_starts + \/ vmshutdown_kill_running_command + \/ vmshutdown_pg_ctl_stop \/ vmshutdown_done + +(* Allow infinite stuttering to prevent deadlock on termination. 
*) +Terminating == /\ \A self \in ProcSet: pc[self] = "Done" + /\ UNCHANGED vars + +Next == init \/ respawn_vmstart \/ postgres \/ vmshutdown + \/ Terminating + +Spec == /\ Init /\ [][Next]_vars + /\ WF_vars(init) + /\ WF_vars(respawn_vmstart) + /\ WF_vars(postgres) + /\ WF_vars(vmshutdown) + +Termination == <>(\A self \in ProcSet: pc[self] = "Done") + +\* END TRANSLATION + +\* TEMPORAL PROPERTIES: +\* If we signal ACPI shutdown, vmstart eventually stops running and never restarts +ShutdownSignalWorks == (shutdown_signal_received ~> ([](~vmstarter_sh_running))) +\* Before we signal ACPI shutdown, respawn works +RespawnBeforeShutdownCanRestartWithoutPendingShutdown == TRUE \* TODO: how to express this? + +============================================================================= +\* Modification History +\* Last modified Mon Sep 25 11:19:20 CEST 2023 by cs +\* Created Sun Sep 24 12:17:50 CEST 2023 by cs diff --git a/go.mod b/go.mod index 219dd64f7..b91fed8c1 100644 --- a/go.mod +++ b/go.mod @@ -2,6 +2,14 @@ module github.com/neondatabase/autoscaling go 1.20 +// Replace directives from github.com/cilium/cilium. Keep in sync when updating Cilium! +replace ( + github.com/miekg/dns => github.com/cilium/dns v1.1.51-0.20220729113855-5b94b11b46fc + github.com/optiopay/kafka => github.com/cilium/kafka v0.0.0-20180809090225-01ce283b732b + go.universe.tf/metallb => github.com/cilium/metallb v0.1.1-0.20220829170633-5d7dfb1129f7 + sigs.k8s.io/controller-tools => github.com/cilium/controller-tools v0.6.2 +) + replace ( k8s.io/api => k8s.io/api v0.25.11 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.25.11 @@ -41,12 +49,11 @@ require ( github.com/digitalocean/go-qemu v0.0.0-20220826173844-d5f5e3ceed89 github.com/docker/docker v20.10.24+incompatible github.com/docker/libnetwork v0.8.0-dev.2.0.20210525090646-64b7a4574d14 - github.com/elastic/go-sysinfo v1.9.0 github.com/google/uuid v1.3.0 github.com/k8snetworkplumbingwg/network-attachment-definition-client v1.4.0 github.com/k8snetworkplumbingwg/whereabouts v0.6.1 github.com/kdomanski/iso9660 v0.3.3 - github.com/lib/pq v1.10.7 + github.com/lithammer/shortuuid v3.0.0+incompatible github.com/onsi/ginkgo/v2 v2.6.1 github.com/onsi/gomega v1.24.2 github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 @@ -61,12 +68,13 @@ require ( k8s.io/client-go v0.25.11 k8s.io/klog/v2 v2.80.1 k8s.io/kubernetes v1.25.11 + nhooyr.io/websocket v1.8.7 sigs.k8s.io/controller-runtime v0.13.1 sigs.k8s.io/controller-tools v0.10.0 ) require ( - cloud.google.com/go/compute v1.14.0 // indirect + cloud.google.com/go/compute v1.15.1 // indirect cloud.google.com/go/compute/metadata v0.2.3 // indirect github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect github.com/Azure/go-autorest v14.2.0+incompatible // indirect @@ -90,7 +98,6 @@ require ( github.com/docker/distribution v2.8.2+incompatible // indirect github.com/docker/go-connections v0.4.0 // indirect github.com/docker/go-units v0.5.0 // indirect - github.com/elastic/go-windows v1.0.0 // indirect github.com/emicklei/go-restful/v3 v3.8.0 // indirect github.com/evanphx/json-patch v5.6.0+incompatible // indirect github.com/evanphx/json-patch/v5 v5.6.0 // indirect @@ -116,9 +123,9 @@ require ( github.com/imdario/mergo v0.3.12 // indirect github.com/inconshreveable/mousetrap v1.0.1 // indirect github.com/ishidawataru/sctp v0.0.0-20210707070123-9a39160e9062 // indirect - github.com/joeshaw/multierror v0.0.0-20140124173710-69b34d4ec901 // indirect github.com/josharian/intern 
v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/klauspost/compress v1.10.3 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-colorable v0.1.11 // indirect github.com/mattn/go-isatty v0.0.14 // indirect @@ -159,7 +166,7 @@ require ( golang.org/x/crypto v0.5.0 // indirect golang.org/x/mod v0.8.0 // indirect golang.org/x/net v0.8.0 // indirect - golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783 // indirect + golang.org/x/oauth2 v0.4.0 // indirect golang.org/x/sync v0.1.0 // indirect golang.org/x/sys v0.6.0 // indirect golang.org/x/term v0.6.0 // indirect @@ -168,15 +175,14 @@ require ( golang.org/x/tools v0.6.0 // indirect gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/genproto v0.0.0-20230106154932-a12b697841d9 // indirect - google.golang.org/grpc v1.51.0 // indirect + google.golang.org/genproto v0.0.0-20230110181048-76db0878b65f // indirect + google.golang.org/grpc v1.53.0 // indirect google.golang.org/protobuf v1.28.1 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - howett.net/plist v0.0.0-20181124034731-591f970eefbb // indirect - k8s.io/apiextensions-apiserver v0.26.1 // indirect + k8s.io/apiextensions-apiserver v0.25.11 // indirect k8s.io/cloud-provider v0.0.0 // indirect k8s.io/component-base v0.25.11 // indirect k8s.io/component-helpers v0.25.11 // indirect diff --git a/go.sum b/go.sum index 40a08697b..67ba25037 100644 --- a/go.sum +++ b/go.sum @@ -13,18 +13,31 @@ cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKV cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs= cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc= cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= +cloud.google.com/go v0.72.0/go.mod h1:M+5Vjvlc2wnp6tjzE102Dw08nGShTscUx2nZMufOKPI= +cloud.google.com/go v0.74.0/go.mod h1:VV1xSbzvo+9QJOxLDaJfTjx5e+MePCpCWwvftOeQmWk= +cloud.google.com/go v0.78.0/go.mod h1:QjdrLG0uq+YwhjoVOLsS1t7TW8fs36kLs4XO5R5ECHg= +cloud.google.com/go v0.79.0/go.mod h1:3bzgcEeQlzbuEAYu4mrWhKqWjmpprinYgKJLgKHnbb8= +cloud.google.com/go v0.81.0/go.mod h1:mk/AM35KwGk/Nm2YSeZbxXdrNK3KZOYHmLkOqC2V6E0= +cloud.google.com/go v0.83.0/go.mod h1:Z7MJUsANfY0pYPdw0lbnivPx4/vhy/e2FEkSkF7vAVY= +cloud.google.com/go v0.84.0/go.mod h1:RazrYuxIK6Kb7YrzzhPoLmCVzl7Sup4NrbKPg8KHSUM= +cloud.google.com/go v0.87.0/go.mod h1:TpDYlFy7vuLzZMMZ+B6iRiELaY7z/gJPaqbMx6mlWcY= +cloud.google.com/go v0.90.0/go.mod h1:kRX0mNRHe0e2rC6oNakvwQqzyDmg57xJ+SZU1eT2aDQ= +cloud.google.com/go v0.93.3/go.mod h1:8utlLll2EF5XMAV15woO4lSbWQlk8rer9aLOfLh7+YI= +cloud.google.com/go v0.94.1/go.mod h1:qAlAugsXlC+JWO+Bke5vCtc9ONxjQT3drlTTnAplMW4= +cloud.google.com/go v0.97.0/go.mod h1:GF7l59pYBVlXQIBLx3a761cZ41F9bBH3JUlihCt2Udc= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= 
-cloud.google.com/go/compute v1.14.0 h1:hfm2+FfxVmnRlh6LpB7cg1ZNU+5edAHmW679JePztk0= -cloud.google.com/go/compute v1.14.0/go.mod h1:YfLtxrj9sU4Yxv+sXzZkyPjEyPBZfXHUvjxega5vAdo= +cloud.google.com/go/compute v1.15.1 h1:7UGq3QknM33pw5xATlpzeoomNxsacIVvTqTTvbfajmE= +cloud.google.com/go/compute v1.15.1/go.mod h1:bjjoF/NtFUrkD/urWfdHaKuOPDR5nWIs63rR+SXhcpA= cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY= cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= +cloud.google.com/go/firestore v1.1.0/go.mod h1:ulACoGHTpvq5r8rxGJ4ddJZBZqakUQqClKRT5SZwBmk= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= @@ -59,9 +72,12 @@ github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbi github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg= github.com/Microsoft/go-winio v0.6.0/go.mod h1:cTAf44im0RAYeL23bpB+fzCyDH2MJiz2BO69KH/soAE= +github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I= github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= +github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= +github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= @@ -70,6 +86,12 @@ github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk5 github.com/alessio/shellescape v1.4.1 h1:V7yhSDDn8LP4lc4jS8pFkt0zCnzVJlG5JXy9BVKJUX0= github.com/alessio/shellescape v1.4.1/go.mod h1:PZAiSCk0LJaZkiCSkPv8qIobYglO3FPpyFjDCtHLS30= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= +github.com/antlr/antlr4/runtime/Go/antlr v0.0.0-20220418222510-f25a4f6275ed/go.mod h1:F7bn7fEU90QkQ3tnmaTx3LTKLEDqnwWODIYppRQ5hnY= +github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= +github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= +github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= +github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= 
github.com/benbjohnson/clock v1.0.3/go.mod h1:bGMdMPoPVvcYyt1gHDf4J2KE153Yf9BuiUKYMaxlTDM= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/benbjohnson/clock v1.3.0 h1:ip6w0uFQkncKQ979AypyG0ER7mqUSBdKLOgAle/AT8A= @@ -78,10 +100,15 @@ github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24 github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= +github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84= +github.com/bketelsen/crypt v0.0.4/go.mod h1:aI6NrJ0pMGgvZKL1iVgXLnfIFJtfV+bKCoqOes/6LfM= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/certifi/gocertifi v0.0.0-20191021191039-0944d244cd40/go.mod h1:sGbDF6GwGcLpkNXPUTkMRoywsNa/ol15pxFe6ERfguA= +github.com/certifi/gocertifi v0.0.0-20200922220541-2c3bb06c6054/go.mod h1:sGbDF6GwGcLpkNXPUTkMRoywsNa/ol15pxFe6ERfguA= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= @@ -92,30 +119,51 @@ github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5P github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/cilium/cilium v1.11.16 h1:7e4TAkMZeDJHbEBOoB1sBbp+fhGe0sdJYxiMIW4LIZ4= github.com/cilium/cilium v1.11.16/go.mod h1:79NKzx+ZKD6gcr3Y6RqdsoP4vzUj0CLFJUdQrFBClYw= +github.com/cilium/controller-tools v0.6.2 h1:oIkqAzqncKsm+lQFJVP6n+bqHOVs9nUZ06hgZ4PxlMM= +github.com/cilium/controller-tools v0.6.2/go.mod h1:oaeGpjXn6+ZSEIQkUe/+3I40PNiDYp9aeawbt3xTgJ8= +github.com/cilium/dns v1.1.51-0.20220729113855-5b94b11b46fc/go.mod h1:e3IlAVfNqAllflbibAZEWOXOQ+Ynzk/dDozDxY7XnME= github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4= github.com/cilium/ebpf v0.9.1/go.mod h1:+OhNOIXx/Fnu1IE8bJz2dzOA+VSfyTfdNUVdlQnxUFY= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/cncf/udpa/go v0.0.0-20200629203442-efcf912fb354/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= +github.com/cncf/udpa/go v0.0.0-20210930031921-04548b0d99d4/go.mod h1:6pvJx4me5XPnfI9Z40ddWsdw2W/uZgQLFXToKeRcDiI= github.com/cncf/xds/go v0.0.0-20210312221358-fbca930ec8ed/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20210805033703-aa0b78936158/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/cncf/xds/go v0.0.0-20210922020428-25de7278fc84/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= 
+github.com/cncf/xds/go v0.0.0-20211001041855-01bcc9b48dfe/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/cockroachdb/datadriven v0.0.0-20200714090401-bf6692d28da5/go.mod h1:h6jFvWxBdQXxjopDMZyH2UVceIRfR84bdzbkoKrsWNo= +github.com/cockroachdb/errors v1.2.4/go.mod h1:rQD95gz6FARkaKkQXUksEje/d9a6wBJoCr5oaCLELYA= +github.com/cockroachdb/logtags v0.0.0-20190617123548-eb05cc24525f/go.mod h1:i/u985jwjWRlyHXQbwatDASoW0RMlZ/3i9yJHE2xLkI= github.com/containerd/cgroups/v3 v3.0.1 h1:4hfGvu8rfGIwVIDd+nLzn/B9ZXx4BcCjzt5ToenJRaE= github.com/containerd/cgroups/v3 v3.0.1/go.mod h1:/vtwk1VXrtoa5AaZLkypuOJgA/6DyPMZHJPGQNtlHnw= github.com/containernetworking/cni v1.0.1 h1:9OIL/sZmMYDBe+G8svzILAlulUpaDTUjeAbtH/JNLBo= github.com/containernetworking/cni v1.0.1/go.mod h1:AKuhXbN5EzmD4yTNtfSsX3tPcmtrBI6QcRV0NiNt15Y= +github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= +github.com/coreos/etcd v3.3.13+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= github.com/coreos/go-iptables v0.6.0 h1:is9qnZMPYjLd8LYqmm/qlE+wwEgJIkTYdhV3rfZo4jk= github.com/coreos/go-iptables v0.6.0/go.mod h1:Qe8Bv2Xik5FyTXwgIbLAnv2sWSBmvWdFETJConOQ//Q= +github.com/coreos/go-oidc v2.1.0+incompatible/go.mod h1:CgnwVTmzoESiwO9qyAFEMiHoZ1nMCKZlZ9V6mm3/LKc= github.com/coreos/go-semver v0.3.0 h1:wkHLiw0WNATZnSG7epLsujiMCgPAc9xhjJ4tgnAxmfM= github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= +github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= +github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/cpuguy83/go-md2man/v2 v2.0.1/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= +github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/digitalocean/go-libvirt v0.0.0-20220804181439-8648fbde413e h1:SCnqm8SjSa0QqRxXbo5YY//S+OryeJioe17nK+iDZpg= github.com/digitalocean/go-libvirt v0.0.0-20220804181439-8648fbde413e/go.mod h1:o129ljs6alsIQTc8d6eweihqpmmrbxZ2g1jhgjhPykI= github.com/digitalocean/go-qemu v0.0.0-20220826173844-d5f5e3ceed89 h1:2/52ma1zkjfR9aIrAX1F9H24rpj+PkCDkAwhQgqVR/A= @@ -133,38 +181,47 @@ github.com/docker/libnetwork 
v0.8.0-dev.2.0.20210525090646-64b7a4574d14/go.mod h github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= -github.com/elastic/go-sysinfo v1.9.0 h1:usICqY/Nw4Mpn9f4LdtpFrKxXroJDe81GaxxUlCckIo= -github.com/elastic/go-sysinfo v1.9.0/go.mod h1:eBD1wEGVaRnRLGecc9iG1z8eOv5HnEdz9+nWd8UAxcE= -github.com/elastic/go-windows v1.0.0 h1:qLURgZFkkrYyTTkvYpsZIgf83AUsdIHfvlJaqaZ7aSY= -github.com/elastic/go-windows v1.0.0/go.mod h1:TsU0Nrp7/y3+VwE82FoZF8gC/XFg/Elz6CcloAxnPgU= +github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= github.com/emicklei/go-restful/v3 v3.8.0 h1:eCZ8ulSerjdAiaNpF7GxXIE7ZCMo1moN1qX+S609eVw= github.com/emicklei/go-restful/v3 v3.8.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/go-control-plane v0.9.7/go.mod h1:cwu0lG7PUMfa9snN8LXBig5ynNVH9qI8YYLbd1fK2po= github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/go-control-plane v0.9.9-0.20210217033140-668b12f5399d/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/go-control-plane v0.9.9-0.20210512163311-63b5d3c536b0/go.mod h1:hliV/p42l8fGbc6Y9bQ70uLwIvmJyVE5k4iMKlh8wCQ= github.com/envoyproxy/go-control-plane v0.9.10-0.20210907150352-cf90f659a021/go.mod h1:AFq3mo9L8Lqqiid3OhADV3RfLJnjiw63cSpi+fDTRC0= +github.com/envoyproxy/go-control-plane v0.10.2-0.20220325020618-49ff273808a1/go.mod h1:KJwIaB5Mv44NWtYuAOFCVOjcI94vtpEz2JU/D2v6IjE= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= +github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.6.0 h1:b91NhWfaz02IuVxO9faSllyAtNXHMPkC5J8sJCLunww= github.com/evanphx/json-patch/v5 v5.6.0/go.mod h1:G79N1coSVB93tBe7j6PhzjmR3/2VvlbKOFpnXhI9Bw4= github.com/fatih/camelcase v1.0.0/go.mod h1:yN2Sb0lFhZJUdVvtELVWefmrXpuZESvPmqwoZc+/fpc= +github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/fatih/color v1.12.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= github.com/fatih/color v1.13.0 h1:8LOYc1KYPPmyKMuN8QV2DNRWNbLo6LZ0iLs8+mlH53w= github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk= github.com/felixge/httpsnoop v1.0.1 h1:lvB5Jl89CsZtGIWuTcDM1E/vkVs49/Ml7JJe07l8SPQ= github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/flowstack/go-jsonschema v0.1.1/go.mod h1:yL7fNggx1o8rm9RlgXv7hTBWxdBM0rVwpMwimd3F3N0= github.com/form3tech-oss/jwt-go v3.2.3+incompatible 
h1:7ZaBxOI7TMoYBfyA3cQHErNNyAWIKUMIwqxEtgHOs5c= +github.com/form3tech-oss/jwt-go v3.2.3+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/frankban/quicktest v1.14.0 h1:+cqqvzZV87b4adx/5ayVOaYZ2CrvM4ejQvUdBzPPUss= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= +github.com/getkin/kin-openapi v0.76.0/go.mod h1:660oXbgy5JFMKreazJaQTw7o+X00qeSyhcnluiMv+Xg= +github.com/getsentry/raven-go v0.2.0/go.mod h1:KungGk8q33+aIAZUIVWZDr2OfAEBsO49PX4NzFV5kcQ= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= +github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE= +github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= +github.com/gin-gonic/gin v1.6.3 h1:ahKqKTFpO5KTPHxWZjEdPScmYaGtLo8Y4DMHoEsnp14= +github.com/gin-gonic/gin v1.6.3/go.mod h1:75u5sXoLsGZoRN5Sgbi1eraJ4GU3++wFwWzhwvtwp4M= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= @@ -176,6 +233,8 @@ github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9 github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= github.com/go-logfmt/logfmt v0.5.1/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= +github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= +github.com/go-logr/logr v0.2.0/go.mod h1:z6/tIYblkpsD+a4lm/fGIIU9mZ+XfAiaFtq7xTgseGU= github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= @@ -185,24 +244,45 @@ github.com/go-logr/zapr v1.2.3/go.mod h1:eIauM6P8qSvTw5o2ez6UEAfGjQKrxQTl5EoK+Qa github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= github.com/go-openapi/jsonpointer v0.19.5 h1:gZr+CIYByUqjcgeLXnQu2gHYQC9o73G2XUeOFYEICuY= github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= +github.com/go-openapi/jsonreference v0.19.3/go.mod h1:rjx6GuL8TTa9VaixXglHmQmIL98+wF9xc8zWvFonSJ8= +github.com/go-openapi/jsonreference v0.19.5/go.mod h1:RdybgQwPxbL4UEjuAruzK1x3nE69AqPYEJeo/TWfEeg= github.com/go-openapi/jsonreference v0.20.0 h1:MYlu0sBgChmCfJxxUKZ8g1cPWFOB37YSZqewK7OKeyA= github.com/go-openapi/jsonreference v0.20.0/go.mod h1:Ag74Ico3lPc+zR+qjn4XBUmXymS4zJbYVCZmcgkasdo= github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= +github.com/go-openapi/swag v0.19.14/go.mod h1:QYRuS/SOXUCsnplDa677K7+DxSOj6IPNl/eQntq43wQ= github.com/go-openapi/swag v0.21.1 h1:wm0rhTb5z7qpJRHBdPOMuY4QjVUMbF6/kwoYeRAOrKU= github.com/go-openapi/swag v0.21.1/go.mod h1:QYRuS/SOXUCsnplDa677K7+DxSOj6IPNl/eQntq43wQ= +github.com/go-playground/assert/v2 v2.0.1/go.mod 
h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= +github.com/go-playground/locales v0.13.0 h1:HyWk6mgj5qFqCT5fjGBuRArbVDfE4hi8+e8ceBS/t7Q= +github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= +github.com/go-playground/universal-translator v0.17.0 h1:icxd5fm+REJzpZx7ZfpaD876Lmtgy7VtROAbHHXk8no= +github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA= +github.com/go-playground/validator/v10 v10.2.0 h1:KgJ0snyC2R9VXYN2rneOtQcw5aHQB1Vv0sFl1UcHBOY= +github.com/go-playground/validator/v10 v10.2.0/go.mod h1:uOYAAleCW8F/7oMFd6aG0GOhaH6EGOAJShg8Id5JGkI= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= +github.com/gobuffalo/flect v0.2.3/go.mod h1:vmkQwuZYhN5Pc4ljYQZzP+1sq+NEkK+lh20jmEmX3jc= github.com/gobuffalo/flect v0.3.0 h1:erfPWM+K1rFNIQeRPdeEXxo8yFr/PO17lhRnS8FUrtk= github.com/gobuffalo/flect v0.3.0/go.mod h1:5pf3aGnsvqvCj50AVni7mJJF8ICxGZ8HomberC3pXLE= +github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0= +github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo= +github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8= +github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= +github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo= +github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.0.6 h1:mkgN1ofwASrYnJ5W6U/BxG15eXXXjirgZc7CLqkcaro= github.com/godbus/dbus/v5 v5.0.6/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= +github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= +github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang-jwt/jwt/v4 v4.0.0/go.mod h1:/xlHOz8bRuivTWchD4jCa+NbatV+wEUSzwAxVc6locg= github.com/golang-jwt/jwt/v4 v4.2.0 h1:besgBTC8w8HjP6NzQdxwKH9Z5oQMZ24ThTrHp3cZ8eU= github.com/golang-jwt/jwt/v4 v4.2.0/go.mod h1:/xlHOz8bRuivTWchD4jCa+NbatV+wEUSzwAxVc6locg= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= @@ -215,6 +295,8 @@ github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.4/go.mod 
h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= +github.com/golang/mock v1.5.0/go.mod h1:CWnOUgYIOo4TcNZ0wHX3YZCqsaM1I1Jvs6v3mP3KVu8= +github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -230,11 +312,16 @@ github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QD github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.1/go.mod h1:DopwsBzvsk0Fs44TXzsVbJyPhcCPeIwnvohx4u74HPM= github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.1 h1:gK4Kx5IaGY9CD5sPJ36FHiBJ6ZXl0kilRiiCj+jdYp4= +github.com/google/btree v1.0.1/go.mod h1:xXMiIv4Fb/0kKde4SpL7qlzvu5cMJDRkFDxJfI9uaxA= +github.com/google/cel-go v0.12.6/go.mod h1:Jk7ljRzLBhkmiAwBoUxB1sZSCVBAzkqPF25olK/iRDw= +github.com/google/gnostic v0.5.7-v3refs/go.mod h1:73MKFl6jIHelAJNaBGFzt3SPtZULs9dYrGFt8OiIsHQ= github.com/google/gnostic v0.6.9 h1:ZK/5VhkoX835RikCHpSUJV9a+S3e1zLh59YnyWeBW+0= github.com/google/gnostic v0.6.9/go.mod h1:Nm8234We1lq6iB9OmlgNv3nH91XLLVZHCDayfA3xq+E= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= @@ -244,15 +331,22 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/martian/v3 
v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= +github.com/google/martian/v3 v3.1.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= +github.com/google/martian/v3 v3.2.1/go.mod h1:oBOf6HBosgwRXnUGWUB05QECsc6uvmMiJ3+6W4l/CUk= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= @@ -260,42 +354,82 @@ github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hf github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20201023163331-3e6fc7fc9c4c/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20201203190320-1bf35d6f28c2/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20210122040257-d980be63207e/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20210226084205-cbba55b83ad5/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20210601050228-01bbb1931b22/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20210609004039-a478d1d731e9/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= +github.com/googleapis/gax-go/v2 v2.1.0/go.mod h1:Q3nei7sK6ybPYH7twZdmQpAd1MKb7pfu6SK+H1/DsU0= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= +github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/gorilla/websocket v1.4.2 h1:+/TMaTYc4QFitKJxsQ7Yye35DkWvkdLcvGKqM+x0Ufc= +github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 h1:+9834+KizmvFV7pXQGSXQTsaWhq2GjuNUt0aUU0YBYw= +github.com/grpc-ecosystem/go-grpc-middleware v1.3.0/go.mod h1:z0ButlSOZa5vEBq9m2m2hlwIgKw+rp3sdCBRoJY+30Y= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 
h1:Ovs26xHkKqVztRpIrF/92BcuyuQ/YW4NSIpoGtfXNho= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= +github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4M0+kPpLofRdBo= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= +github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q= +github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-cleanhttp v0.5.1/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= +github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= +github.com/hashicorp/go-msgpack v0.5.3/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= +github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= +github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU= +github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU= +github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4= +github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/go.net v0.0.1/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA0oac0k90= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64= +github.com/hashicorp/mdns v1.0.0/go.mod h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0mNTz8vQ= +github.com/hashicorp/memberlist v0.1.3/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I= +github.com/hashicorp/serf v0.8.2/go.mod h1:6hOLApaqBFA1NXqRQAsxw9QxuDEvNxSQRwA/JwenrHc= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= +github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= +github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= github.com/imdario/mergo v0.3.12 h1:b6R2BslTbIEToALKP7LxUvijTsNI9TAe80pLWN2g/HU= github.com/imdario/mergo v0.3.12/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= +github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/inconshreveable/mousetrap v1.0.1 h1:U3uMjPSQEBMNp1lFxmllqCPM6P5u/Xq7Pgzkat/bFNc= github.com/inconshreveable/mousetrap v1.0.1/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/ishidawataru/sctp v0.0.0-20210707070123-9a39160e9062 h1:G1+wBT0dwjIrBdLy0MIG0i+E4CQxEnedHXdauJEIH6g= github.com/ishidawataru/sctp v0.0.0-20210707070123-9a39160e9062/go.mod h1:co9pwDoBCm1kGxawmb4sPq0cSIOOWNPT4KnHotMP1Zg= github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= 
-github.com/joeshaw/multierror v0.0.0-20140124173710-69b34d4ec901 h1:rp+c0RAYOWj8l6qbCUTSiRLG/iKnW3K3/QfPPuSsBt4= -github.com/joeshaw/multierror v0.0.0-20140124173710-69b34d4ec901/go.mod h1:Z86h9688Y0wesXCyonoVr47MasHilkuLMqGhRZ4Hpak= +github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= github.com/jonboulle/clockwork v0.2.2 h1:UOGuzwb1PwsrDAObMuhUnj0p5ULPj8V/xJ7Kx9qUBdQ= +github.com/jonboulle/clockwork v0.2.2/go.mod h1:Pkfl5aHPm1nk2H9h0bjmnJD/BcgbGXUBGnn1kMkgxc8= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= +github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.11/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= +github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= github.com/k8snetworkplumbingwg/network-attachment-definition-client v1.4.0 h1:VzM3TYHDgqPkettiP6I6q2jOeQFL4nrJM+UcAc4f6Fs= @@ -304,10 +438,15 @@ github.com/k8snetworkplumbingwg/whereabouts v0.6.1 h1:3pfShDMF9+/7ijzKUPezoBqN2I github.com/k8snetworkplumbingwg/whereabouts v0.6.1/go.mod h1:FbmUjZg27cI6om0IAc+NV5Ur+IKwHyqdLaeR0SGfWJc= github.com/kdomanski/iso9660 v0.3.3 h1:cNwM9L2L1Hzc5hZWGy6fPJ92UyWDccaY69DmEPlfDNY= github.com/kdomanski/iso9660 v0.3.3/go.mod h1:K+UlIGxKgtrdAWyoigPnFbeQLVs/Xudz4iztWFThBwo= +github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= +github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.10.3 h1:OP96hzwJVBIHYU52pVTI6CczrxPvrGfgqF9N5eTO0Q8= +github.com/klauspost/compress v1.10.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= @@ -316,24 +455,42 @@ github.com/kr/pty v1.1.1/go.mod 
h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/lib/pq v1.10.7 h1:p7ZhMD+KsSRozJr34udlUrhboJwWAgCg34+/ZZNvZZw= -github.com/lib/pq v1.10.7/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y= +github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= +github.com/lithammer/shortuuid v3.0.0+incompatible h1:NcD0xWW/MZYXEHa6ITy6kaXN5nwm/V115vj2YXfhS0w= +github.com/lithammer/shortuuid v3.0.0+incompatible/go.mod h1:FR74pbAuElzOUuenUHTK2Tciko1/vKuIKS9dSkDrA4w= +github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= +github.com/magiconair/properties v1.8.5/go.mod h1:y3VJvCyxH9uVvJTWEGAELF3aiYNyPKd5NZ3oSwXrF60= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= +github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-colorable v0.1.9/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-colorable v0.1.11 h1:nQ+aFkoE2TMGc0b68U2OKSexC+eq46+XwZzWXHRmPYs= github.com/mattn/go-colorable v0.1.11/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4= +github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y= github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 h1:I0XW9+e1XWDxdcEniV4rQAIOPUGDq67JSCiRCgGCZLI= github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= +github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/mitchellh/go-testing-interface v1.0.0/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI= +github.com/mitchellh/gox v0.4.0/go.mod h1:Sd9lOJ0+aimLBi73mGofS1ycjY8lL3uZM3JPS42BGNg= +github.com/mitchellh/iochan v1.0.0/go.mod h1:JwYml1nuB7xOzsp52dPpHFffvOCDupsG0QubkSMEySY= +github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/mitchellh/mapstructure v1.4.1/go.mod 
h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= github.com/moby/sys/mountinfo v0.6.2 h1:BzJjoreD5BMFNmD9Rus6gdd1pLuecOFPt8wC+Vygl78= github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= +github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6/go.mod h1:E2VnQOmVuvZB6UYnnDB0qG5Nq/1tD9acaOpo6xmt0Kw= github.com/moby/term v0.0.0-20221205130635-1aeaba878587 h1:HfkjXDfhgVaN5rmueG8cL8KKeFNecRCXFhaJ2qZ5SKA= github.com/moby/term v0.0.0-20221205130635-1aeaba878587/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -344,21 +501,33 @@ github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3Rllmb github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= +github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= +github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= +github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= github.com/onsi/ginkgo v1.13.0/go.mod h1:+REjRxOmWfHCjfv9TTWB1jD1Frx4XydAD3zm1lskyM0= +github.com/onsi/ginkgo v1.16.4/go.mod h1:dX+/inL/fNMqNlz0e9LfyB9TswhZpCVdJM/Z6Vvnwo0= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= +github.com/onsi/ginkgo/v2 v2.1.3/go.mod h1:vw5CSIxN1JObi/U8gcbwft7ZxR2dgaR70JSE3/PpL4c= +github.com/onsi/ginkgo/v2 v2.1.4/go.mod h1:um6tUpWM/cxCK3/FK8BXqEiUMUwRgSM4JXG47RKZmLU= +github.com/onsi/ginkgo/v2 v2.1.6/go.mod h1:MEH45j8TBi6u9BMogfbp0stKC5cdGjumZj5Y7AG4VIk= github.com/onsi/ginkgo/v2 v2.6.1 h1:1xQPCjcqYw/J5LchOcp4/2q/jzJFjiAOc25chhnDw+Q= github.com/onsi/ginkgo/v2 v2.6.1/go.mod h1:yjiuMwPokqY1XauOgju45q3sJt6VzQ/Fict1LFVcsAo= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= +github.com/onsi/gomega v1.14.0/go.mod h1:cIuvLEne0aoVhAgh/O6ac0Op8WWw9H6eYCriF+tEHG0= +github.com/onsi/gomega v1.17.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAlGdZY= +github.com/onsi/gomega v1.19.0/go.mod 
h1:LY+I3pBVzYsTBU1AnDwOSxaYi9WoWiqgwooUqq9yPro= +github.com/onsi/gomega v1.20.1/go.mod h1:DtrZpjmvpn2mPm4YWQa0/ALMDj9v4YxLgojwPeREyVo= github.com/onsi/gomega v1.24.2 h1:J/tulyYK6JwBldPViHJReihxxZ+22FHs0piGjQAvoUE= github.com/onsi/gomega v1.24.2/go.mod h1:gs3J10IS7Z7r7eXRoNJIrNqU4ToQukCJhFtKrWgHWnk= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= @@ -369,13 +538,22 @@ github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3 github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU= github.com/opencontainers/selinux v1.10.0/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI= +github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= +github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= +github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= +github.com/pelletier/go-toml v1.9.3/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= +github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZI= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= +github.com/pquerna/cachecontrol v0.1.0/go.mod h1:NrUG3Z7Rdu85UNR3vm7SOsl1nFIeSiQnrHV5K9mBcUI= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= +github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= @@ -389,6 +567,8 @@ github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1: github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= +github.com/prometheus/common v0.0.0-20181113130724-41aa239b4cce/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= +github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= @@ -396,28 +576,54 @@ github.com/prometheus/common v0.32.1/go.mod 
h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+ github.com/prometheus/common v0.37.0 h1:ccBbHCgIiT9uSoFY0vX8H3zsNR5eLt17/RQLUvn8pXE= github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= +github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo= github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= +github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= +github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8= +github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/sclevine/agouti v3.0.0+incompatible/go.mod h1:b4WX9W9L1sfQKXeJf1mUTLZKJ48R1S7H23Ji7oFO5Bw= +github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= +github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= +github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= +github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= +github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= github.com/soheilhy/cmux v0.1.5 h1:jjzc5WVemNEDTLwv9tlmemhC73tI08BNOIGwBOo10Js= +github.com/soheilhy/cmux v0.1.5/go.mod h1:T7TcVDs9LWfQgPlPsdngu6I6QIoyIFZDDC6sNE1GqG0= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= +github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= +github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= +github.com/spf13/afero v1.6.0/go.mod 
h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= +github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= +github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= +github.com/spf13/cobra v1.1.3/go.mod h1:pGADOWyqRD/YMrPZigI/zbliZ2wVD/23d+is3pSWzOo= +github.com/spf13/cobra v1.2.1/go.mod h1:ExllRjgxM/piMAM+3tAZvg8fsklGAf3tPfi+i8t68Nk= +github.com/spf13/cobra v1.4.0/go.mod h1:Wo4iy3BUC+X2Fybo0PDqwJIv3dNRiZLHQymsfxlB84g= github.com/spf13/cobra v1.6.1 h1:o94oiPyS4KD1mPy2fmcYYHHfCxLqYjJOhGsCHFZtEzA= github.com/spf13/cobra v1.6.1/go.mod h1:IOw/AERYS7UzyrGinqmz6HLUo219MORXGxhbaJUqzrY= +github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= +github.com/spf13/jwalterweatherman v1.1.0/go.mod h1:aNWZUN0dPAAO/Ljvb5BEdw96iTZ0EXowPYD95IqWIGo= +github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/viper v1.7.0/go.mod h1:8WkrPz2fc9jxqZNCJI/76HCieCp4Q8HaLFoCha5qpdg= +github.com/spf13/viper v1.8.1/go.mod h1:o0Pch8wJ9BVSWGQMbra6iw0oQ5oktSIBaujf1rJH9Ns= github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -433,9 +639,16 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= +github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802 h1:uruHq4dN7GR16kFc5fp3d1RIYzJW5onx8Ybykw2YQFA= +github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tychoish/fun v0.8.5 h1:8uTFk2fG8mxDyRmqMj6llKE8+vTuQRclUkl0/tyYwAU= github.com/tychoish/fun v0.8.5/go.mod h1:84A+BwGecz23UotmbB4mtvVS5ZcsZpspecduxpwF/XM= +github.com/ugorji/go v1.1.7 h1:/68gy2h+1mWMrwZFeD1kQialdSzAb432dtpeJ42ovdo= +github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= +github.com/ugorji/go/codec v1.1.7 h1:2SvQaVZ1ouYrrKKwoSk2pzd4A9evlKJb9oTL+OaLUSs= +github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= github.com/vishvananda/netlink v1.1.1-0.20220125195016-0639e7e787ba h1:MU5oPE25XZhDS8Z0xFG0/1ERBEu5rZIw62TImubLusU= github.com/vishvananda/netlink v1.1.1-0.20220125195016-0639e7e787ba/go.mod h1:twkDnbuQxJYemMlGd4JFIcuhgX83tXhKS2B/PRMpOho= github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0= @@ -445,27 +658,44 @@ github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2 github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= 
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 h1:eY9dn8+vbi4tKz5Qo6v2eYzo7kUS51QINcR5jNpbZS8= +github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= +github.com/yuin/goldmark v1.4.1/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.etcd.io/bbolt v1.3.6 h1:/ecaJf0sk1l4l6V4awd65v2C3ILy7MSj+s/x1ADCIMU= +go.etcd.io/bbolt v1.3.6/go.mod h1:qXsaaIqmgQH0T+OPdb99Bf+PKfBBQVAdyD6TY9G8XM4= +go.etcd.io/etcd/api/v3 v3.5.0/go.mod h1:cbVKeC6lCfl7j/8jBhAK6aIYO9XOjdptoxU/nLQcPvs= +go.etcd.io/etcd/api/v3 v3.5.4/go.mod h1:5GB2vv4A4AOn3yk7MftYGHkUfGtDHnEraIjym4dYz5A= go.etcd.io/etcd/api/v3 v3.5.6 h1:Cy2qx3npLcYqTKqGJzMypnMv2tiRyifZJ17BlWIWA7A= go.etcd.io/etcd/api/v3 v3.5.6/go.mod h1:KFtNaxGDw4Yx/BA4iPPwevUTAuqcsPxzyX8PHydchN8= +go.etcd.io/etcd/client/pkg/v3 v3.5.0/go.mod h1:IJHfcCEKxYu1Os13ZdwCwIUTUVGYTSAM3YSwc9/Ac1g= +go.etcd.io/etcd/client/pkg/v3 v3.5.4/go.mod h1:IJHfcCEKxYu1Os13ZdwCwIUTUVGYTSAM3YSwc9/Ac1g= go.etcd.io/etcd/client/pkg/v3 v3.5.6 h1:TXQWYceBKqLp4sa87rcPs11SXxUA/mHwH975v+BDvLU= go.etcd.io/etcd/client/pkg/v3 v3.5.6/go.mod h1:ggrwbk069qxpKPq8/FKkQ3Xq9y39kbFR4LnKszpRXeQ= +go.etcd.io/etcd/client/v2 v2.305.0/go.mod h1:h9puh54ZTgAKtEbut2oe9P4L/oqKCVB6xsXlzd7alYQ= go.etcd.io/etcd/client/v2 v2.305.4 h1:Dcx3/MYyfKcPNLpR4VVQUP5KgYrBeJtktBwEKkw08Ao= +go.etcd.io/etcd/client/v2 v2.305.4/go.mod h1:Ud+VUwIi9/uQHOMA+4ekToJ12lTxlv0zB/+DHwTGEbU= +go.etcd.io/etcd/client/v3 v3.5.4/go.mod h1:ZaRkVgBZC+L+dLCjTcF1hRXpgZXQPOvnA/Ak/gq3kiY= go.etcd.io/etcd/client/v3 v3.5.6 h1:coLs69PWCXE9G4FKquzNaSHrRyMCAXwF+IX1tAPVO8E= go.etcd.io/etcd/client/v3 v3.5.6/go.mod h1:f6GRinRMCsFVv9Ht42EyY7nfsVGwrNO0WEoS2pRKzQk= go.etcd.io/etcd/pkg/v3 v3.5.4 h1:V5Dvl7S39ZDwjkKqJG2BfXgxZ3QREqqKifWQgIw5IM0= +go.etcd.io/etcd/pkg/v3 v3.5.4/go.mod h1:OI+TtO+Aa3nhQSppMbwE4ld3uF1/fqqwbpfndbbrEe0= go.etcd.io/etcd/raft/v3 v3.5.4 h1:YGrnAgRfgXloBNuqa+oBI/aRZMcK/1GS6trJePJ/Gqc= +go.etcd.io/etcd/raft/v3 v3.5.4/go.mod h1:SCuunjYvZFC0fBX0vxMSPjuZmpcSk+XaAcMrD6Do03w= go.etcd.io/etcd/server/v3 v3.5.4 h1:CMAZd0g8Bn5NRhynW6pKhc4FRg41/0QYy3d7aNm9874= +go.etcd.io/etcd/server/v3 v3.5.4/go.mod h1:S5/YTU15KxymM5l3T6b09sNOHPXqGYIZStpuuGbb65c= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= +go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= go.opentelemetry.io/contrib v0.20.0 h1:ubFQUn0VCZ0gPwIoJfBJVpeBlyRMxu8Mm/huKWYd9p0= go.opentelemetry.io/contrib v0.20.0/go.mod h1:G/EtFaa6qaN7+LxqfIAT3GiZa7Wv5DTBUzl5H4LY0Kc= 
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.20.0 h1:sO4WKdPAudZGKPcpZT4MJn6JaDmpyLrMPDGGyA1SttE= @@ -490,26 +720,34 @@ go.opentelemetry.io/otel/trace v0.20.0 h1:1DL6EXUdcg95gukhuRRvLDO/4X5THh/5dIV52l go.opentelemetry.io/otel/trace v0.20.0/go.mod h1:6GjCW8zgDjwGHGa6GkyeB8+/5vjT16gUEi0Nf1iBdgw= go.opentelemetry.io/proto/otlp v0.7.0 h1:rwOQPCuKAKmwGKq2aVNnYIibI6wnV7EvzgfTCzcdGg8= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= +go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/atomic v1.10.0 h1:9qC72Qh0+3MqyJbAn8YU5xVq1frD8bn3JtD2oXtafVQ= go.uber.org/atomic v1.10.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk= +go.uber.org/goleak v1.2.0/go.mod h1:XJYK+MuIchqpmGmUSAzotztawfKvYLUIgg7guXrwVUo= +go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= +go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo= go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211215153901-e495a2d5b3d3/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.0.0-20220315160706-3147a52a75dd/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.5.0 h1:U/0M97KRkSFvyD/3FSmdP5W5swImpNgle/EHFhOsQPE= golang.org/x/crypto v0.5.0/go.mod h1:NK/OQwhpMQP3MwtdjgLlYHnH9ebylxKWv3e0fK+mkQU= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -536,6 +774,7 @@ golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHl golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod 
h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/lint v0.0.0-20201208152925-83fdc39ff7b5/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/lint v0.0.0-20210508222113-6edffad5e616/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= @@ -545,13 +784,20 @@ golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzB golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3/go.mod h1:3p9vT2HGsQu2K1YbXdKPJLVgG5VJdoTa1poYQBtP1AY= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0 h1:LUYupSeNrTNCGzR/hVBk2NHZO4hXcVaW1k4Qx7rjPx8= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181023162649-9b4f9f5ad519/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181201002055-351d144fa1fc/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= @@ -563,6 +809,7 @@ golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net 
v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -579,13 +826,27 @@ golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81R golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20201031054903-ff519b6c9102/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20201209123823-ac852fbbde11/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210316092652-d523dce5a7f4/go.mod h1:RBQZq4jEuRlivfhVLdyRGr576XBO4/greRjx4P4O3yc= golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= +golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= +golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20210726213435-c6fcb2dbf985/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20211015210444-4f30a5c0130f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= +golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= @@ -593,10 +854,21 @@ golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4Iltr golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20200902213428-5d25da1a8d43/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 
v0.0.0-20201109201403-9fd604954f58/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20201208152858-08078c50e5b5/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20210218202405-ba52d332ba99/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20210220000619-9bb904979d93/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20210313182246-cd4f82c27b84/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20210402161424-2e8d93401602/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20210628180205-a41e5a781914/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20210805134026-6f1e6394065a/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20210819190943-2bc19b11175f/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= -golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783 h1:nt+Q6cXKz4MosCSpnbMtqiQ8Oz0pxTef2B4Vca2lvfk= -golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= +golang.org/x/oauth2 v0.4.0 h1:NF0gk8LVPg1Ml7SSbGyySuoxdsXitj7TvgvuRxIMc/M= +golang.org/x/oauth2 v0.4.0/go.mod h1:RznEsdpjGAINPTOF0UH/t+xJ75L18YO3Ho6Pyn+uRec= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -608,11 +880,15 @@ golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20181026203630-95b1ffbd15a5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -626,6 +902,7 @@ golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -650,27 +927,48 @@ golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200905004654-be1d3432aa8f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200923182605-d9f96fdee20d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201201145000-ef89a241ccb3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210104204734-6f8348627aad/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210220050731-9a76102bfb43/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210305230114-8fe3ee5dd75b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210315160823-c6e025ad8005/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210514084401-e8d321eab015/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210603125802-9665404d3644/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys 
v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210823070655-63515b42dcdf/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210908233432-aa78b53d3365/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211019181941-9d821ace8654/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220319134239-a9b59b0215f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220422013727-9388b58f7150/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0 h1:clScbb1cHjoCkyRbWwBEUZ5H/tIFu5TAXIqaZD0Gcjw= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -678,32 +976,41 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time 
v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20220609170525-579cf78fd858 h1:Dpdu/EMxGMFgq0CeYMh4fazTD2vtlZRYE7wyynxJb9U= golang.org/x/time v0.0.0-20220609170525-579cf78fd858/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190624222133-a101b041ded4/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191112195655-aa38f8e97acc/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= @@ -723,6 +1030,7 @@ golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjs golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= golang.org/x/tools v0.0.0-20200331025713-a30bf2db82d4/go.mod 
h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8= golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200505023115-26f46d2f7ef8/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200618134242-20370b0cb4b2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= @@ -730,9 +1038,22 @@ golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roY golang.org/x/tools v0.0.0-20200729194436-6467de6f59a7/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= golang.org/x/tools v0.0.0-20200804011535-6c149bb5ef0d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20200904185747-39188db58858/go.mod h1:Cj7w3i3Rnn0Xh82ur9kSqwfTHTeVxaDqrfMjpcNT6bE= +golang.org/x/tools v0.0.0-20201110124207-079ba7bd75cd/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.0.0-20201201161351-ac6f37ff4c2a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.0.0-20201208233053-a543418bbed2/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.0.0-20210105154028-b0ab187a4818/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= +golang.org/x/tools v0.1.3/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= +golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= +golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= +golang.org/x/tools v0.1.6-0.20210726203631-07bc1bf47fb2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= +golang.org/x/tools v0.1.10/go.mod h1:Uh6Zz+xoGYZom868N8YTex3t7RhtHDBrE8Gzo9bV56E= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -757,6 +1078,19 @@ google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0M google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc= +google.golang.org/api v0.35.0/go.mod h1:/XrVsuzM0rZmrsbjJutiuftIzeuTQcEeaYcSk/mQ1dg= +google.golang.org/api v0.36.0/go.mod h1:+z5ficQTmoYpPn8LCUNVpK5I7hwkpjbcgqA7I34qYtE= +google.golang.org/api v0.40.0/go.mod h1:fYKFpnQN0DsDSKRVRcQSDQNtqWPfM9i+zNPxepjRCQ8= +google.golang.org/api v0.41.0/go.mod h1:RkxM5lITDfTzmyKFPt+wGrCJbVfniCr2ool8kTBzRTU= +google.golang.org/api 
v0.43.0/go.mod h1:nQsDGjRXMo4lvh5hP0TKqF244gqhGcr/YSIykhUk/94= +google.golang.org/api v0.44.0/go.mod h1:EBOGZqzyhtvMDoxwS97ctnh0zUmYY6CxqXsc1AvkYD8= +google.golang.org/api v0.47.0/go.mod h1:Wbvgpq1HddcWVtzsVLyfLp8lDg6AA241LmgIL59tHXo= +google.golang.org/api v0.48.0/go.mod h1:71Pr1vy+TAZRPkPs/xlCf5SsU8WjuAWv1Pfjbtukyy4= +google.golang.org/api v0.50.0/go.mod h1:4bNT5pAuq5ji4SRZm+5QIkjny9JAyVD/3gaSihNefaw= +google.golang.org/api v0.51.0/go.mod h1:t4HdrdoNgyN5cbEfm7Lum0lcLDLiise1F8qDKX00sOU= +google.golang.org/api v0.54.0/go.mod h1:7C4bFFOvVDGXjfDTAsgGwDgAxRDeQ4X8NvUedIt6z3k= +google.golang.org/api v0.55.0/go.mod h1:38yMfeP1kfjsl8isn0tliTjIb1rJXcQi4UXlbqivdVE= +google.golang.org/api v0.57.0/go.mod h1:dVPlbZyBo2/OjBpmvNdpn2GRm6rPy75jyU7bmhdrMgI= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -786,6 +1120,7 @@ google.golang.org/genproto v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfG google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200423170343-7949de9c1215/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200513103714-09dca8ec2884/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= @@ -795,10 +1130,36 @@ google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7Fc google.golang.org/genproto v0.0.0-20200729003335-053ba62fc06f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20200804131852-c06518451d9c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20200825200019-8632dd797987/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20200904004341-0bd0a958aa1d/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20201019141844-1ed22bb0c154/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20201109203340-2640f1f9cdfb/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20201201144952-b05cb90ed32e/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20201210142538-e3217bee35cc/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20201214200347-8c77b98c765d/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20210222152913-aa3ee6e6a81c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20210303154014-9728d6b83eeb/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20210310155132-4ce2db91004e/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20210319143718-93e7006c17a6/go.mod 
h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20210402141018-6c239bbf2bb1/go.mod h1:9lPAdzaEmUacj36I+k7YKbEc5CXzPIeORRgDAUOu28A= +google.golang.org/genproto v0.0.0-20210513213006-bf773b8c8384/go.mod h1:P3QM42oQyzQSnHPnZ/vqoCdDmzH28fzWByN9asMeM8A= google.golang.org/genproto v0.0.0-20210602131652-f16073e35f0c/go.mod h1:UODoCrxHCcBojKKwX1terBiRUaqAsFqJiF615XL43r0= +google.golang.org/genproto v0.0.0-20210604141403-392c879c8b08/go.mod h1:UODoCrxHCcBojKKwX1terBiRUaqAsFqJiF615XL43r0= +google.golang.org/genproto v0.0.0-20210608205507-b6d2f5bf0d7d/go.mod h1:UODoCrxHCcBojKKwX1terBiRUaqAsFqJiF615XL43r0= +google.golang.org/genproto v0.0.0-20210624195500-8bfb893ecb84/go.mod h1:SzzZ/N+nwJDaO1kznhnlzqS8ocJICar6hYhVyhi++24= +google.golang.org/genproto v0.0.0-20210713002101-d411969a0d9a/go.mod h1:AxrInvYm1dci+enl5hChSFPOmmUF1+uAa/UsgNRWd7k= +google.golang.org/genproto v0.0.0-20210716133855-ce7ef5c701ea/go.mod h1:AxrInvYm1dci+enl5hChSFPOmmUF1+uAa/UsgNRWd7k= +google.golang.org/genproto v0.0.0-20210728212813-7823e685a01f/go.mod h1:ob2IJxKrgPT52GcgX759i1sleT07tiKowYBGbczaW48= +google.golang.org/genproto v0.0.0-20210805201207-89edb61ffb67/go.mod h1:ob2IJxKrgPT52GcgX759i1sleT07tiKowYBGbczaW48= +google.golang.org/genproto v0.0.0-20210813162853-db860fec028c/go.mod h1:cFeNkxwySK631ADgubI+/XFU/xp8FD5KIVV4rj8UC5w= +google.golang.org/genproto v0.0.0-20210821163610-241b8fcbd6c8/go.mod h1:eFjDcFEctNawg4eG61bRv87N7iHBWyVhJu7u1kqDUXY= +google.golang.org/genproto v0.0.0-20210828152312-66f60bf46e71/go.mod h1:eFjDcFEctNawg4eG61bRv87N7iHBWyVhJu7u1kqDUXY= +google.golang.org/genproto v0.0.0-20210831024726-fe130286e0e2/go.mod h1:eFjDcFEctNawg4eG61bRv87N7iHBWyVhJu7u1kqDUXY= +google.golang.org/genproto v0.0.0-20210903162649-d08c68adba83/go.mod h1:eFjDcFEctNawg4eG61bRv87N7iHBWyVhJu7u1kqDUXY= +google.golang.org/genproto v0.0.0-20210924002016-3dee208752a0/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20220107163113-42d7afdf6368/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= -google.golang.org/genproto v0.0.0-20230106154932-a12b697841d9 h1:3wPBShTLWQnEkZ9VW/HZZ8zT/9LLtleBtq7l8SKtJIA= -google.golang.org/genproto v0.0.0-20230106154932-a12b697841d9/go.mod h1:RGgjbofJ8xD9Sq1VVhDM1Vok1vRONV+rg+CjzG4SZKM= +google.golang.org/genproto v0.0.0-20220502173005-c8bf987b8c21/go.mod h1:RAyBrSAP7Fh3Nc84ghnVLDPuV51xc9agzmm4Ph6i0Q4= +google.golang.org/genproto v0.0.0-20230110181048-76db0878b65f h1:BWUVssLB0HVOSY78gIdvk1dTVYtT1y8SBWtPYuTJ/6w= +google.golang.org/genproto v0.0.0-20230110181048-76db0878b65f/go.mod h1:RGgjbofJ8xD9Sq1VVhDM1Vok1vRONV+rg+CjzG4SZKM= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -811,14 +1172,25 @@ google.golang.org/grpc v1.28.0/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKa google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/grpc v1.31.1/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= google.golang.org/grpc v1.33.1/go.mod h1:fr5YgcSWrqhRRxogOsw7RzIpsmvOZ6IcH4kBYTpR3n0= +google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= +google.golang.org/grpc v1.34.0/go.mod 
h1:WotjhfgOW/POjDeRt8vscBtXq+2VjORFy659qA51WJ8= +google.golang.org/grpc v1.35.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= +google.golang.org/grpc v1.36.1/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.37.0/go.mod h1:NREThFqKR1f3iQ6oBuvc5LadQuXVGo9rkm5ZGrQdJfM= +google.golang.org/grpc v1.37.1/go.mod h1:NREThFqKR1f3iQ6oBuvc5LadQuXVGo9rkm5ZGrQdJfM= google.golang.org/grpc v1.38.0/go.mod h1:NREThFqKR1f3iQ6oBuvc5LadQuXVGo9rkm5ZGrQdJfM= +google.golang.org/grpc v1.39.0/go.mod h1:PImNr+rS9TWYb2O4/emRugxiyHZ5JyHW5F+RPnDzfrE= +google.golang.org/grpc v1.39.1/go.mod h1:PImNr+rS9TWYb2O4/emRugxiyHZ5JyHW5F+RPnDzfrE= google.golang.org/grpc v1.40.0/go.mod h1:ogyxbiOoUXAkP+4+xa6PZSE9DZgIHtSpzjDTB9KAK34= google.golang.org/grpc v1.41.0/go.mod h1:U3l9uK9J0sini8mHphKoXyaqDA/8VyGnDee1zzIUK6k= -google.golang.org/grpc v1.51.0 h1:E1eGv1FTqoLIdnBCZufiSHgKjlqG6fKFf6pPWtMTh8U= -google.golang.org/grpc v1.51.0/go.mod h1:wgNDFcnuBGmxLKI/qn4T+m5BtEBYXJPvibbUPsAIPww= +google.golang.org/grpc v1.46.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACuMGWk= +google.golang.org/grpc v1.47.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACuMGWk= +google.golang.org/grpc v1.53.0 h1:LAv2ds7cmFV/XTS3XG1NneeENYrXGmorPxsBbptIjNc= +google.golang.org/grpc v1.53.0/go.mod h1:OnIrk0ipVdj4N5d9IUoFUx72/VlD7+jUsHwZgwSMQpw= +google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= @@ -832,6 +1204,7 @@ google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlba google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.28.1 h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= @@ -844,10 +1217,15 @@ gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/ini.v1 v1.51.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/ini.v1 v1.62.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/natefinch/lumberjack.v2 v2.0.0 h1:1Lc07Kr7qY4U2YPouBjpCLxpiyxIVoxqXgkXLknAOE8= gopkg.in/natefinch/lumberjack.v2 v2.0.0/go.mod h1:l0ndWWf7gzL7RNwBG7wST/UCcT4T24xpD6X8LsfU/+k= +gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo= +gopkg.in/square/go-jose.v2 v2.2.2/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 
h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= @@ -862,7 +1240,9 @@ gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= gotest.tools/v3 v3.0.3 h1:4AuOwCGf4lLR9u3YOe2awrHygurzhO/HeQ6laiA6Sx0= +gotest.tools/v3 v3.0.3/go.mod h1:Z7Lb0S5l+klDB31fvDQX8ss/FlKDxtlFlw3Oa8Ymbl8= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= @@ -870,8 +1250,6 @@ honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWh honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= -howett.net/plist v0.0.0-20181124034731-591f970eefbb h1:jhnBjNi9UFpfpl8YZhA9CrOqpnJdvzuiHsl/dnxl11M= -howett.net/plist v0.0.0-20181124034731-591f970eefbb/go.mod h1:vMygbs4qMhSZSc4lCUl2OEE+rDiIIJAIdR4m7MiMcm0= k8s.io/api v0.25.11 h1:4mjYDfE3yp22jrytjH0knwgzjXKkxHX4D01ZCAazvZM= k8s.io/api v0.25.11/go.mod h1:bK4UvD4bthtutNlvensrfBX21PRQ/vs2cIYggHkOOAo= k8s.io/apiextensions-apiserver v0.25.11 h1:qZY0kCt0tW3QHPKcogp3k4zrlZhe9f8H6EJOr7sNRbA= @@ -884,12 +1262,18 @@ k8s.io/client-go v0.25.11 h1:DJQ141UsbNRI6wYSlcYLP5J5BW5Wq7Bgm42Ztq2SW70= k8s.io/client-go v0.25.11/go.mod h1:41Xs7p1SfhoReUnmjjYCfCNWFiq4xSkexwJfbxF2F7A= k8s.io/cloud-provider v0.25.11 h1:t/mMWKvO52IrznQ5dAziigNt+EzXuM9jWfisEmAaaYQ= k8s.io/cloud-provider v0.25.11/go.mod h1:9xL8k1YZsU6dCN3djftvum0y84rwYW+xorF+8LFs5Ho= +k8s.io/code-generator v0.25.11/go.mod h1:FA5a4rk4tMTCgmiDeNdRjml+AGvm72SwZYwD5lBrezY= k8s.io/component-base v0.25.11 h1:3QmISCE9n9CJkVpTA4spQO1IZCrLlOwbKdzSN9dqZZA= k8s.io/component-base v0.25.11/go.mod h1:wFR4pfB+xTc6FBak+RoWRNeTmelGE4XWJP/xVOvn3vM= k8s.io/component-helpers v0.25.11 h1:NO8FqIZd0LgEYiNhzFcwlEa6P8/8lX366r00niFB2XY= k8s.io/component-helpers v0.25.11/go.mod h1:TeIbtyuelY6lnG6F3Uu+/lzMp31TEg/YtyuYWBNTVHY= k8s.io/csi-translation-lib v0.25.11 h1:JgpoBenEAfCjpbfwjCPvL8bI/P9un+BQUV/uNxZnhP0= k8s.io/csi-translation-lib v0.25.11/go.mod h1:Ff2gRYDRoGkoIoosW3jcZ6Q1T0MO+iZEGO21RSVKWbs= +k8s.io/gengo v0.0.0-20210813121822-485abfe95c7c/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAEV2be7d5xI0vBa/VySYy3E= +k8s.io/gengo v0.0.0-20211129171323-c02415ce4185/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAEV2be7d5xI0vBa/VySYy3E= +k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE= +k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y= +k8s.io/klog/v2 
v2.70.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= k8s.io/klog/v2 v2.80.1 h1:atnLQ121W371wYYFawwYx1aEY2eUfs4l3J72wtgAwV4= k8s.io/klog/v2 v2.80.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 h1:MQ8BAZPZlWk3S9K4a9NCkIFQtZShWqoha7snGixVgEA= @@ -900,8 +1284,12 @@ k8s.io/kubernetes v1.25.11 h1:vl9UYkjHuWOyk1EAfnzVlakFCziEBMazLthW/YuHb8M= k8s.io/kubernetes v1.25.11/go.mod h1:uokqZvgUrcgwuapBSvrq9+y5TMXsvm68qgiRiidZs2A= k8s.io/mount-utils v0.25.11 h1:WFzlMxcML7xXDHuVDzqcJpl1xF4P6hwrnbHTruNBWno= k8s.io/mount-utils v0.25.11/go.mod h1:IM9QOFh15E1a4Nb6Rcn8FJ9Z1PbBpuyAPCty/qvKSAw= +k8s.io/utils v0.0.0-20210802155522-efc7438f0176/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= +k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= k8s.io/utils v0.0.0-20221107191617-1a15be271d1d h1:0Smp/HP1OH4Rvhe+4B8nWGERtlqAGSftbSbbmm45oFs= k8s.io/utils v0.0.0-20221107191617-1a15be271d1d/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +nhooyr.io/websocket v1.8.7 h1:usjR2uOr/zjjkVMy0lW+PPohFok7PCow5sDjLgX4P4g= +nhooyr.io/websocket v1.8.7/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= @@ -909,8 +1297,6 @@ sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.37 h1:fAPTNEpzQMOLM sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.37/go.mod h1:vfnxT4FXNT8eGvO+xi/DsyC/qHmdujqwrUa1WSspCsk= sigs.k8s.io/controller-runtime v0.13.1 h1:tUsRCSJVM1QQOOeViGeX3GMT3dQF1eePPw6sEE3xSlg= sigs.k8s.io/controller-runtime v0.13.1/go.mod h1:Zbz+el8Yg31jubvAEyglRZGdLAjplZl+PgtYNI6WNTI= -sigs.k8s.io/controller-tools v0.10.0 h1:0L5DTDTFB67jm9DkfrONgTGmfc/zYow0ZaHyppizU2U= -sigs.k8s.io/controller-tools v0.10.0/go.mod h1:uvr0EW6IsprfB0jpQq6evtKy+hHyHCXNfdWI5ONPx94= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= diff --git a/neonvm/apis/neonvm/v1/virtualmachine_types.go b/neonvm/apis/neonvm/v1/virtualmachine_types.go index 6629b7a39..d5cf8b1b6 100644 --- a/neonvm/apis/neonvm/v1/virtualmachine_types.go +++ b/neonvm/apis/neonvm/v1/virtualmachine_types.go @@ -69,11 +69,12 @@ type VirtualMachineSpec struct { // +optional TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds"` - NodeSelector map[string]string `json:"nodeSelector,omitempty"` - Affinity *corev1.Affinity `json:"affinity,omitempty"` - Tolerations []corev1.Toleration `json:"tolerations,omitempty"` - SchedulerName string `json:"schedulerName,omitempty"` - PodResources corev1.ResourceRequirements `json:"podResources,omitempty"` + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + Affinity *corev1.Affinity `json:"affinity,omitempty"` + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + SchedulerName string `json:"schedulerName,omitempty"` + ServiceAccountName string `json:"serviceAccountName,omitempty"` + PodResources corev1.ResourceRequirements `json:"podResources,omitempty"` // +kubebuilder:default:=Always // +optional @@ -83,6 +84,8 @@ type VirtualMachineSpec struct { Guest Guest `json:"guest"` + 
ExtraInitContainers []corev1.Container `json:"extraInitContainers,omitempty"` + // List of disk that can be mounted by virtual machine. // +optional Disks []Disk `json:"disks,omitempty"` @@ -97,7 +100,7 @@ type VirtualMachineSpec struct { // Use KVM acceleation // +kubebuilder:default:=true // +optional - EnableAcceleration bool `json:"enableAcceleration"` + EnableAcceleration *bool `json:"enableAcceleration,omitempty"` } // +kubebuilder:validation:Enum=Always;OnFailure;Never @@ -308,6 +311,8 @@ type DiskSource struct { type EmptyDiskSource struct { Size resource.Quantity `json:"size"` + // Discard enables the "discard" mount option for the filesystem + Discard bool `json:"discard,omitempty"` } type TmpfsDiskSource struct { diff --git a/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go b/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go index 5131cb359..24ee8e058 100644 --- a/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go +++ b/neonvm/apis/neonvm/v1/zz_generated.deepcopy.go @@ -625,6 +625,13 @@ func (in *VirtualMachineSpec) DeepCopyInto(out *VirtualMachineSpec) { copy(*out, *in) } in.Guest.DeepCopyInto(&out.Guest) + if in.ExtraInitContainers != nil { + in, out := &in.ExtraInitContainers, &out.ExtraInitContainers + *out = make([]corev1.Container, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } if in.Disks != nil { in, out := &in.Disks, &out.Disks *out = make([]Disk, len(*in)) @@ -642,6 +649,11 @@ func (in *VirtualMachineSpec) DeepCopyInto(out *VirtualMachineSpec) { *out = new(bool) **out = **in } + if in.EnableAcceleration != nil { + in, out := &in.EnableAcceleration, &out.EnableAcceleration + *out = new(bool) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineSpec. diff --git a/neonvm/config/common/crd/bases/vm.neon.tech_virtualmachines.yaml b/neonvm/config/common/crd/bases/vm.neon.tech_virtualmachines.yaml index b34bf889b..f8dc1e5f9 100644 --- a/neonvm/config/common/crd/bases/vm.neon.tech_virtualmachines.yaml +++ b/neonvm/config/common/crd/bases/vm.neon.tech_virtualmachines.yaml @@ -963,6 +963,10 @@ spec: description: EmptyDisk represents a temporary empty qcow2 disk that shares a vm's lifetime. properties: + discard: + description: Discard enables the "discard" mount option + for the filesystem + type: boolean size: anyOf: - type: integer @@ -1070,6 +1074,1238 @@ spec: default: true description: Use KVM acceleation type: boolean + extraInitContainers: + items: + description: A single application container that you want to run + within a pod. + properties: + args: + description: 'Arguments to the entrypoint. The container image''s + CMD is used if this is not provided. Variable references $(VAR_NAME) + are expanded using the container''s environment. If a variable + cannot be resolved, the reference in the input string will + be unchanged. Double $$ are reduced to a single $, which allows + for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references + will never be expanded, regardless of whether the variable + exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed within a shell. + The container image''s ENTRYPOINT is used if this is not provided. 
+ Variable references $(VAR_NAME) are expanded using the container''s + environment. If a variable cannot be resolved, the reference + in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: + i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether + the variable exists or not. Cannot be updated. More info: + https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables to set in the container. + Cannot be updated. + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. Must be + a C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in + the container and any service environment variables. + If a variable cannot be resolved, the reference in the + input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) + syntax: i.e. "$$(VAR_NAME)" will produce the string + literal "$(VAR_NAME)". Escaped references will never + be expanded, regardless of whether the variable exists + or not. Defaults to "".' + type: string + valueFrom: + description: Source for the environment variable's value. + Cannot be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the ConfigMap or + its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + fieldRef: + description: 'Selects a field of the pod: supports + metadata.name, metadata.namespace, `metadata.labels['''']`, + `metadata.annotations['''']`, spec.nodeName, + spec.serviceAccountName, status.hostIP, status.podIP, + status.podIPs.' + properties: + apiVersion: + description: Version of the schema the FieldPath + is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the + specified API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + resourceFieldRef: + description: 'Selects a resource of the container: + only resources limits and requests (limits.cpu, + limits.memory, limits.ephemeral-storage, requests.cpu, + requests.memory and requests.ephemeral-storage) + are currently supported.' 
+ properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the + exposed resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + secretKeyRef: + description: Selects a key of a secret in the pod's + namespace + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + required: + - name + type: object + type: array + envFrom: + description: List of sources to populate environment variables + in the container. The keys defined within a source must be + a C_IDENTIFIER. All invalid keys will be reported as an event + when the container is starting. When a key exists in multiple + sources, the value associated with the last source will take + precedence. Values defined by an Env with a duplicate key + will take precedence. Cannot be updated. + items: + description: EnvFromSource represents the source of a set + of ConfigMaps + properties: + configMapRef: + description: The ConfigMap to select from + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the ConfigMap must be + defined + type: boolean + type: object + x-kubernetes-map-type: atomic + prefix: + description: An optional identifier to prepend to each + key in the ConfigMap. Must be a C_IDENTIFIER. + type: string + secretRef: + description: The Secret to select from + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + type: object + type: array + image: + description: 'Container image name. More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config management + to default or override container images in workload controllers + like Deployments and StatefulSets.' + type: string + imagePullPolicy: + description: 'Image pull policy. One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent + otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Actions that the management system should take + in response to container lifecycle events. Cannot be updated. 
+ properties: + postStart: + description: 'PostStart is called immediately after a container + is created. If the handler fails, the container is terminated + and restarted according to its restart policy. Other management + of the container blocks until the hook completes. More + info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's + filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, you need + to explicitly call out to that shell. Exit status + of 0 is treated as live/healthy and non-zero is + unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. You probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name. This will + be canonicalized upon output, so case-variant + names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access + on the container. Number must be in the range + 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the + host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: Deprecated. TCPSocket is NOT supported + as a LifecycleHandler and kept for the backward compatibility. + There are no validation of this field and lifecycle + hooks will fail in runtime when tcp handler is specified. + properties: + host: + description: 'Optional: Host name to connect to, + defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access + on the container. Number must be in the range + 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + preStop: + description: 'PreStop is called immediately before a container + is terminated due to an API request or management event + such as liveness/startup probe failure, preemption, resource + contention, etc. The handler is not called if the container + crashes or exits. The Pod''s termination grace period + countdown begins before the PreStop hook is executed. + Regardless of the outcome of the handler, the container + will eventually terminate within the Pod''s termination + grace period (unless delayed by finalizers). Other management + of the container blocks until the hook completes or until + the termination grace period is reached. 
More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' + properties: + exec: + description: Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's + filesystem. The command is simply exec'd, it is + not run inside a shell, so traditional shell instructions + ('|', etc) won't work. To use a shell, you need + to explicitly call out to that shell. Exit status + of 0 is treated as live/healthy and non-zero is + unhealthy. + items: + type: string + type: array + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. You probably want to set "Host" in + httpHeaders instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name. This will + be canonicalized upon output, so case-variant + names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access + on the container. Number must be in the range + 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the + host. Defaults to HTTP. + type: string + required: + - port + type: object + tcpSocket: + description: Deprecated. TCPSocket is NOT supported + as a LifecycleHandler and kept for the backward compatibility. + There are no validation of this field and lifecycle + hooks will fail in runtime when tcp handler is specified. + properties: + host: + description: 'Optional: Host name to connect to, + defaults to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access + on the container. Number must be in the range + 1 to 65535. Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + type: object + type: object + livenessProbe: + description: 'Periodic probe of container liveness. Container + will be restarted if the probe fails. Cannot be updated. More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for the + command is root ('/') in the container's filesystem. + The command is simply exec'd, it is not run inside + a shell, so traditional shell instructions ('|', etc) + won't work. To use a shell, you need to explicitly + call out to that shell. Exit status of 0 is treated + as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe + to be considered failed after having succeeded. Defaults + to 3. Minimum value is 1. 
+ format: int32 + type: integer + grpc: + description: GRPC specifies an action involving a GRPC port. + This is a beta field and requires enabling GRPCContainerProbe + feature gate. + properties: + port: + description: Port number of the gRPC service. Number + must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: "Service is the name of the service to + place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + \n If this is not specified, the default behavior + is defined by gRPC." + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the + pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name. This will + be canonicalized upon output, so case-variant + names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on + the container. Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has + started before liveness probes are initiated. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe + to be considered successful after having failed. Defaults + to 1. Must be 1 for liveness and startup. Minimum value + is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action involving a TCP + port. + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on + the container. Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs + to terminate gracefully upon probe failure. The grace + period is the duration in seconds after the processes + running in the pod are sent a termination signal and the + time when the processes are forcibly halted with a kill + signal. Set this value longer than the expected cleanup + time for your process. If this value is nil, the pod's + terminationGracePeriodSeconds will be used. Otherwise, + this value overrides the value provided by the pod spec. + Value must be non-negative integer. 
The value zero indicates + stop immediately via the kill signal (no opportunity to + shut down). This is a beta field and requires enabling + ProbeTerminationGracePeriod feature gate. Minimum value + is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times + out. Defaults to 1 second. Minimum value is 1. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + name: + description: Name of the container specified as a DNS_LABEL. + Each container in a pod must have a unique name (DNS_LABEL). + Cannot be updated. + type: string + ports: + description: List of ports to expose from the container. Not + specifying a port here DOES NOT prevent that port from being + exposed. Any port which is listening on the default "0.0.0.0" + address inside a container will be accessible from the network. + Modifying this array with strategic merge patch may corrupt + the data. For more information See https://github.com/kubernetes/kubernetes/issues/108255. + Cannot be updated. + items: + description: ContainerPort represents a network port in a + single container. + properties: + containerPort: + description: Number of port to expose on the pod's IP + address. This must be a valid port number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external port to. + type: string + hostPort: + description: Number of port to expose on the host. If + specified, this must be a valid port number, 0 < x < + 65536. If HostNetwork is specified, this must match + ContainerPort. Most containers do not need this. + format: int32 + type: integer + name: + description: If specified, this must be an IANA_SVC_NAME + and unique within the pod. Each named port in a pod + must have a unique name. Name for the port that can + be referred to by services. + type: string + protocol: + default: TCP + description: Protocol for port. Must be UDP, TCP, or SCTP. + Defaults to "TCP". + type: string + required: + - containerPort + type: object + type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map + readinessProbe: + description: 'Periodic probe of container service readiness. + Container will be removed from service endpoints if the probe + fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for the + command is root ('/') in the container's filesystem. + The command is simply exec'd, it is not run inside + a shell, so traditional shell instructions ('|', etc) + won't work. To use a shell, you need to explicitly + call out to that shell. Exit status of 0 is treated + as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe + to be considered failed after having succeeded. Defaults + to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving a GRPC port. + This is a beta field and requires enabling GRPCContainerProbe + feature gate. + properties: + port: + description: Port number of the gRPC service. 
Number + must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: "Service is the name of the service to + place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + \n If this is not specified, the default behavior + is defined by gRPC." + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the + pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name. This will + be canonicalized upon output, so case-variant + names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on + the container. Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has + started before liveness probes are initiated. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe + to be considered successful after having failed. Defaults + to 1. Must be 1 for liveness and startup. Minimum value + is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action involving a TCP + port. + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on + the container. Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs + to terminate gracefully upon probe failure. The grace + period is the duration in seconds after the processes + running in the pod are sent a termination signal and the + time when the processes are forcibly halted with a kill + signal. Set this value longer than the expected cleanup + time for your process. If this value is nil, the pod's + terminationGracePeriodSeconds will be used. Otherwise, + this value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates + stop immediately via the kill signal (no opportunity to + shut down). This is a beta field and requires enabling + ProbeTerminationGracePeriod feature gate. Minimum value + is 1. spec.terminationGracePeriodSeconds is used if unset. 
+ format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times + out. Defaults to 1 second. Minimum value is 1. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + resources: + description: 'Compute Resources required by this container. + Cannot be updated. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount of compute + resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum amount of compute + resources required. If Requests is omitted for a container, + it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. More info: + https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + securityContext: + description: 'SecurityContext defines the security options the + container should be run with. If set, the fields of SecurityContext + override the equivalent fields of PodSecurityContext. More + info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/' + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation controls whether + a process can gain more privileges than its parent process. + This bool directly controls if the no_new_privs flag will + be set on the container process. AllowPrivilegeEscalation + is true always when the container is: 1) run as Privileged + 2) has CAP_SYS_ADMIN Note that this field cannot be set + when spec.os.name is windows.' + type: boolean + capabilities: + description: The capabilities to add/drop when running containers. + Defaults to the default set of capabilities granted by + the container runtime. Note that this field cannot be + set when spec.os.name is windows. + properties: + add: + description: Added capabilities + items: + description: Capability represent POSIX capabilities + type + type: string + type: array + drop: + description: Removed capabilities + items: + description: Capability represent POSIX capabilities + type + type: string + type: array + type: object + privileged: + description: Run container in privileged mode. Processes + in privileged containers are essentially equivalent to + root on the host. Defaults to false. Note that this field + cannot be set when spec.os.name is windows. + type: boolean + procMount: + description: procMount denotes the type of proc mount to + use for the containers. The default is DefaultProcMount + which uses the container runtime defaults for readonly + paths and masked paths. This requires the ProcMountType + feature flag to be enabled. Note that this field cannot + be set when spec.os.name is windows. + type: string + readOnlyRootFilesystem: + description: Whether this container has a read-only root + filesystem. Default is false. 
Note that this field cannot + be set when spec.os.name is windows. + type: boolean + runAsGroup: + description: The GID to run the entrypoint of the container + process. Uses runtime default if unset. May also be set + in PodSecurityContext. If set in both SecurityContext + and PodSecurityContext, the value specified in SecurityContext + takes precedence. Note that this field cannot be set when + spec.os.name is windows. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run as a + non-root user. If true, the Kubelet will validate the + image at runtime to ensure that it does not run as UID + 0 (root) and fail to start the container if it does. If + unset or false, no such validation will be performed. + May also be set in PodSecurityContext. If set in both + SecurityContext and PodSecurityContext, the value specified + in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container + process. Defaults to user specified in image metadata + if unspecified. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. Note + that this field cannot be set when spec.os.name is windows. + format: int64 + type: integer + seLinuxOptions: + description: The SELinux context to be applied to the container. + If unspecified, the container runtime will allocate a + random SELinux context for each container. May also be + set in PodSecurityContext. If set in both SecurityContext + and PodSecurityContext, the value specified in SecurityContext + takes precedence. Note that this field cannot be set when + spec.os.name is windows. + properties: + level: + description: Level is SELinux level label that applies + to the container. + type: string + role: + description: Role is a SELinux role label that applies + to the container. + type: string + type: + description: Type is a SELinux type label that applies + to the container. + type: string + user: + description: User is a SELinux user label that applies + to the container. + type: string + type: object + seccompProfile: + description: The seccomp options to use by this container. + If seccomp options are provided at both the pod & container + level, the container options override the pod options. + Note that this field cannot be set when spec.os.name is + windows. + properties: + localhostProfile: + description: localhostProfile indicates a profile defined + in a file on the node should be used. The profile + must be preconfigured on the node to work. Must be + a descending path, relative to the kubelet's configured + seccomp profile location. Must only be set if type + is "Localhost". + type: string + type: + description: "type indicates which kind of seccomp profile + will be applied. Valid options are: \n Localhost - + a profile defined in a file on the node should be + used. RuntimeDefault - the container runtime default + profile should be used. Unconfined - no profile should + be applied." + type: string + required: + - type + type: object + windowsOptions: + description: The Windows specific settings applied to all + containers. If unspecified, the options from the PodSecurityContext + will be used. If set in both SecurityContext and PodSecurityContext, + the value specified in SecurityContext takes precedence. + Note that this field cannot be set when spec.os.name is + linux. 
+ properties: + gmsaCredentialSpec: + description: GMSACredentialSpec is where the GMSA admission + webhook (https://github.com/kubernetes-sigs/windows-gmsa) + inlines the contents of the GMSA credential spec named + by the GMSACredentialSpecName field. + type: string + gmsaCredentialSpecName: + description: GMSACredentialSpecName is the name of the + GMSA credential spec to use. + type: string + hostProcess: + description: HostProcess determines if a container should + be run as a 'Host Process' container. This field is + alpha-level and will only be honored by components + that enable the WindowsHostProcessContainers feature + flag. Setting this field without the feature flag + will result in errors when validating the Pod. All + of a Pod's containers must have the same effective + HostProcess value (it is not allowed to have a mix + of HostProcess containers and non-HostProcess containers). In + addition, if HostProcess is true then HostNetwork + must also be set to true. + type: boolean + runAsUserName: + description: The UserName in Windows to run the entrypoint + of the container process. Defaults to the user specified + in image metadata if unspecified. May also be set + in PodSecurityContext. If set in both SecurityContext + and PodSecurityContext, the value specified in SecurityContext + takes precedence. + type: string + type: object + type: object + startupProbe: + description: 'StartupProbe indicates that the Pod has successfully + initialized. If specified, no other probes are executed until + this completes successfully. If this probe fails, the Pod + will be restarted, just as if the livenessProbe failed. This + can be used to provide different probe parameters at the beginning + of a Pod''s lifecycle, when it might take a long time to load + data or warm a cache, than during steady-state operation. + This cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + properties: + exec: + description: Exec specifies the action to take. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for the + command is root ('/') in the container's filesystem. + The command is simply exec'd, it is not run inside + a shell, so traditional shell instructions ('|', etc) + won't work. To use a shell, you need to explicitly + call out to that shell. Exit status of 0 is treated + as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + type: object + failureThreshold: + description: Minimum consecutive failures for the probe + to be considered failed after having succeeded. Defaults + to 3. Minimum value is 1. + format: int32 + type: integer + grpc: + description: GRPC specifies an action involving a GRPC port. + This is a beta field and requires enabling GRPCContainerProbe + feature gate. + properties: + port: + description: Port number of the gRPC service. Number + must be in the range 1 to 65535. + format: int32 + type: integer + service: + description: "Service is the name of the service to + place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). + \n If this is not specified, the default behavior + is defined by gRPC." + type: string + required: + - port + type: object + httpGet: + description: HTTPGet specifies the http request to perform. + properties: + host: + description: Host name to connect to, defaults to the + pod IP. You probably want to set "Host" in httpHeaders + instead. 
+ type: string + httpHeaders: + description: Custom headers to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name. This will + be canonicalized upon output, so case-variant + names will be understood as the same header. + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: object + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: integer + - type: string + description: Name or number of the port to access on + the container. Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + type: object + initialDelaySeconds: + description: 'Number of seconds after the container has + started before liveness probes are initiated. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe + to be considered successful after having failed. Defaults + to 1. Must be 1 for liveness and startup. Minimum value + is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocket specifies an action involving a TCP + port. + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: integer + - type: string + description: Number or name of the port to access on + the container. Number must be in the range 1 to 65535. + Name must be an IANA_SVC_NAME. + x-kubernetes-int-or-string: true + required: + - port + type: object + terminationGracePeriodSeconds: + description: Optional duration in seconds the pod needs + to terminate gracefully upon probe failure. The grace + period is the duration in seconds after the processes + running in the pod are sent a termination signal and the + time when the processes are forcibly halted with a kill + signal. Set this value longer than the expected cleanup + time for your process. If this value is nil, the pod's + terminationGracePeriodSeconds will be used. Otherwise, + this value overrides the value provided by the pod spec. + Value must be non-negative integer. The value zero indicates + stop immediately via the kill signal (no opportunity to + shut down). This is a beta field and requires enabling + ProbeTerminationGracePeriod feature gate. Minimum value + is 1. spec.terminationGracePeriodSeconds is used if unset. + format: int64 + type: integer + timeoutSeconds: + description: 'Number of seconds after which the probe times + out. Defaults to 1 second. Minimum value is 1. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + type: object + stdin: + description: Whether this container should allocate a buffer + for stdin in the container runtime. If this is not set, reads + from stdin in the container will always result in EOF. Default + is false. 
+ type: boolean + stdinOnce: + description: Whether the container runtime should close the + stdin channel after it has been opened by a single attach. + When stdin is true the stdin stream will remain open across + multiple attach sessions. If stdinOnce is set to true, stdin + is opened on container start, is empty until the first client + attaches to stdin, and then remains open and accepts data + until the client disconnects, at which time stdin is closed + and remains closed until the container is restarted. If this + flag is false, a container processes that reads from stdin + will never receive an EOF. Default is false + type: boolean + terminationMessagePath: + description: 'Optional: Path at which the file to which the + container''s termination message will be written is mounted + into the container''s filesystem. Message written is intended + to be brief final status, such as an assertion failure message. + Will be truncated by the node if greater than 4096 bytes. + The total message length across all containers will be limited + to 12kb. Defaults to /dev/termination-log. Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination message should be + populated. File will use the contents of terminationMessagePath + to populate the container status message on both success and + failure. FallbackToLogsOnError will use the last chunk of + container log output if the termination message file is empty + and the container exited with an error. The log output is + limited to 2048 bytes or 80 lines, whichever is smaller. Defaults + to File. Cannot be updated. + type: string + tty: + description: Whether this container should allocate a TTY for + itself, also requires 'stdin' to be true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block devices to be + used by the container. + items: + description: volumeDevice describes a mapping of a raw block + device within a container. + properties: + devicePath: + description: devicePath is the path inside of the container + that the device will be mapped to. + type: string + name: + description: name must match the name of a persistentVolumeClaim + in the pod + type: string + required: + - devicePath + - name + type: object + type: array + volumeMounts: + description: Pod volumes to mount into the container's filesystem. + Cannot be updated. + items: + description: VolumeMount describes a mounting of a Volume + within a container. + properties: + mountPath: + description: Path within the container at which the volume + should be mounted. Must not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how mounts are + propagated from the host to container and the other + way around. When not set, MountPropagationNone is used. + This field is beta in 1.10. + type: string + name: + description: This must match the Name of a Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write otherwise + (false or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume from which the container's + volume should be mounted. Defaults to "" (volume's root). + type: string + subPathExpr: + description: Expanded path within the volume from which + the container's volume should be mounted. Behaves similarly + to SubPath but environment variable references $(VAR_NAME) + are expanded using the container's environment. Defaults + to "" (volume's root). 
SubPathExpr and SubPath are mutually + exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + workingDir: + description: Container's working directory. If not specified, + the container runtime's default will be used, which might + be configured in the container image. Cannot be updated. + type: string + required: + - name + type: object + type: array extraNetwork: description: Extra network interface attached to network provided by Mutlus CNI. @@ -1280,6 +2516,8 @@ spec: type: string service_links: type: boolean + serviceAccountName: + type: string terminationGracePeriodSeconds: default: 5 format: int64 diff --git a/neonvm/config/common/rbac/virtualmachine_editor_role.yaml b/neonvm/config/common/rbac/virtualmachine_editor_role.yaml index 3712a9e56..72f38313c 100644 --- a/neonvm/config/common/rbac/virtualmachine_editor_role.yaml +++ b/neonvm/config/common/rbac/virtualmachine_editor_role.yaml @@ -9,6 +9,8 @@ metadata: app.kubernetes.io/created-by: neonvm app.kubernetes.io/part-of: neonvm app.kubernetes.io/managed-by: kustomize + rbac.authorization.k8s.io/aggregate-to-edit: "true" + rbac.authorization.k8s.io/aggregate-to-admin: "true" name: virtualmachine-editor-role rules: - apiGroups: diff --git a/neonvm/config/common/rbac/virtualmachine_viewer_role.yaml b/neonvm/config/common/rbac/virtualmachine_viewer_role.yaml index 509d66eb9..9becd630f 100644 --- a/neonvm/config/common/rbac/virtualmachine_viewer_role.yaml +++ b/neonvm/config/common/rbac/virtualmachine_viewer_role.yaml @@ -9,6 +9,9 @@ metadata: app.kubernetes.io/created-by: neonvm app.kubernetes.io/part-of: neonvm app.kubernetes.io/managed-by: kustomize + rbac.authorization.k8s.io/aggregate-to-view: "true" + rbac.authorization.k8s.io/aggregate-to-edit: "true" + rbac.authorization.k8s.io/aggregate-to-admin: "true" name: virtualmachine-viewer-role rules: - apiGroups: diff --git a/neonvm/config/common/rbac/virtualmachinemigration_editor_role.yaml b/neonvm/config/common/rbac/virtualmachinemigration_editor_role.yaml index c6ed235f8..9bc05af8f 100644 --- a/neonvm/config/common/rbac/virtualmachinemigration_editor_role.yaml +++ b/neonvm/config/common/rbac/virtualmachinemigration_editor_role.yaml @@ -9,6 +9,8 @@ metadata: app.kubernetes.io/created-by: neonvm app.kubernetes.io/part-of: neonvm app.kubernetes.io/managed-by: kustomize + rbac.authorization.k8s.io/aggregate-to-edit: "true" + rbac.authorization.k8s.io/aggregate-to-admin: "true" name: virtualmachinemigration-editor-role rules: - apiGroups: diff --git a/neonvm/config/common/rbac/virtualmachinemigration_viewer_role.yaml b/neonvm/config/common/rbac/virtualmachinemigration_viewer_role.yaml index f8b7eb1c9..48d97f97d 100644 --- a/neonvm/config/common/rbac/virtualmachinemigration_viewer_role.yaml +++ b/neonvm/config/common/rbac/virtualmachinemigration_viewer_role.yaml @@ -9,6 +9,9 @@ metadata: app.kubernetes.io/created-by: neonvm app.kubernetes.io/part-of: neonvm app.kubernetes.io/managed-by: kustomize + rbac.authorization.k8s.io/aggregate-to-view: "true" + rbac.authorization.k8s.io/aggregate-to-edit: "true" + rbac.authorization.k8s.io/aggregate-to-admin: "true" name: virtualmachinemigration-viewer-role rules: - apiGroups: diff --git a/neonvm/controllers/virtualmachine_controller.go b/neonvm/controllers/virtualmachine_controller.go index 5888ddd00..fae0ccd60 100644 --- a/neonvm/controllers/virtualmachine_controller.go +++ b/neonvm/controllers/virtualmachine_controller.go @@ -379,7 +379,7 @@ func (r *VirtualMachineReconciler) doReconcile(ctx 
context.Context, virtualmachi if err != nil && apierrors.IsNotFound(err) { // lost runner pod for running VirtualMachine ? r.Recorder.Event(virtualmachine, "Warning", "NotFound", - fmt.Sprintf("runner pod %s not fodund", + fmt.Sprintf("runner pod %s not found", virtualmachine.Status.PodName)) virtualmachine.Status.Phase = vmv1.VmFailed meta.SetStatusCondition(&virtualmachine.Status.Conditions, @@ -409,7 +409,7 @@ func (r *VirtualMachineReconciler) doReconcile(ctx context.Context, virtualmachi virtualmachine.Status.Node = vmRunner.Spec.NodeName // get CPU details from QEMU and update status - cpuSlotsPlugged, _, err := QmpGetCpus(virtualmachine) + cpuSlotsPlugged, _, err := QmpGetCpus(QmpAddr(virtualmachine)) if err != nil { log.Error(err, "Failed to get CPU details from VirtualMachine", "VirtualMachine", virtualmachine.Name) return err @@ -452,7 +452,7 @@ func (r *VirtualMachineReconciler) doReconcile(ctx context.Context, virtualmachi } // get Memory details from hypervisor and update VM status - memorySize, err := QmpGetMemorySize(virtualmachine) + memorySize, err := QmpGetMemorySize(QmpAddr(virtualmachine)) if err != nil { log.Error(err, "Failed to get Memory details from VirtualMachine", "VirtualMachine", virtualmachine.Name) return err @@ -508,12 +508,67 @@ func (r *VirtualMachineReconciler) doReconcile(ctx context.Context, virtualmachi } case vmv1.VmScaling: + // Check that runner pod is still ok + vmRunner := &corev1.Pod{} + err := r.Get(ctx, types.NamespacedName{Name: virtualmachine.Status.PodName, Namespace: virtualmachine.Namespace}, vmRunner) + if err != nil && apierrors.IsNotFound(err) { + // lost runner pod for running VirtualMachine ? + r.Recorder.Event(virtualmachine, "Warning", "NotFound", + fmt.Sprintf("runner pod %s not found", + virtualmachine.Status.PodName)) + virtualmachine.Status.Phase = vmv1.VmFailed + meta.SetStatusCondition(&virtualmachine.Status.Conditions, + metav1.Condition{Type: typeDegradedVirtualMachine, + Status: metav1.ConditionTrue, + Reason: "Reconciling", + Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) not found", virtualmachine.Status.PodName, virtualmachine.Name)}) + } else if err != nil { + log.Error(err, "Failed to get runner Pod") + return err + } + + // Update the metadata (including "usage" annotation) before anything else, so that it + // will be correctly set even if the rest of the reconcile operation fails. 
+ if err := updatePodMetadataIfNecessary(ctx, r.Client, virtualmachine, vmRunner); err != nil { + log.Error(err, "Failed to sync pod labels and annotations", "VirtualMachine", virtualmachine.Name) + } + + // runner pod found, check that it's still up: + switch vmRunner.Status.Phase { + case corev1.PodSucceeded: + virtualmachine.Status.Phase = vmv1.VmSucceeded + meta.SetStatusCondition(&virtualmachine.Status.Conditions, + metav1.Condition{Type: typeAvailableVirtualMachine, + Status: metav1.ConditionFalse, + Reason: "Reconciling", + Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) succeeded", virtualmachine.Status.PodName, virtualmachine.Name)}) + return nil + case corev1.PodFailed: + virtualmachine.Status.Phase = vmv1.VmFailed + meta.SetStatusCondition(&virtualmachine.Status.Conditions, + metav1.Condition{Type: typeDegradedVirtualMachine, + Status: metav1.ConditionTrue, + Reason: "Reconciling", + Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) failed", virtualmachine.Status.PodName, virtualmachine.Name)}) + return nil + case corev1.PodUnknown: + virtualmachine.Status.Phase = vmv1.VmPending + meta.SetStatusCondition(&virtualmachine.Status.Conditions, + metav1.Condition{Type: typeAvailableVirtualMachine, + Status: metav1.ConditionUnknown, + Reason: "Reconciling", + Message: fmt.Sprintf("Pod (%s) for VirtualMachine (%s) in Unknown phase", virtualmachine.Status.PodName, virtualmachine.Name)}) + return nil + default: + // do nothing + } + cpuScaled := false ramScaled := false // do hotplug/unplug CPU // firstly get current state from QEMU - cpuSlotsPlugged, _, err := QmpGetCpus(virtualmachine) + cpuSlotsPlugged, _, err := QmpGetCpus(QmpAddr(virtualmachine)) if err != nil { log.Error(err, "Failed to get CPU details from VirtualMachine", "VirtualMachine", virtualmachine.Name) return err @@ -524,7 +579,7 @@ func (r *VirtualMachineReconciler) doReconcile(ctx context.Context, virtualmachi if specCPU > pluggedCPU { // going to plug one CPU log.Info("Plug one more CPU into VM") - if err := QmpPlugCpu(virtualmachine); err != nil { + if err := QmpPlugCpu(QmpAddr(virtualmachine)); err != nil { return err } r.Recorder.Event(virtualmachine, "Normal", "ScaleUp", @@ -533,7 +588,7 @@ func (r *VirtualMachineReconciler) doReconcile(ctx context.Context, virtualmachi } else if specCPU < pluggedCPU { // going to unplug one CPU log.Info("Unplug one CPU from VM") - if err := QmpUnplugCpu(virtualmachine); err != nil { + if err := QmpUnplugCpu(QmpAddr(virtualmachine)); err != nil { return err } r.Recorder.Event(virtualmachine, "Normal", "ScaleDown", @@ -546,7 +601,7 @@ func (r *VirtualMachineReconciler) doReconcile(ctx context.Context, virtualmachi // do hotplug/unplug Memory // firstly get current state from QEMU - memoryDevices, err := QmpQueryMemoryDevices(virtualmachine) + memoryDevices, err := QmpQueryMemoryDevices(QmpAddr(virtualmachine)) memoryPluggedSlots := *virtualmachine.Spec.Guest.MemorySlots.Min + int32(len(memoryDevices)) if err != nil { log.Error(err, "Failed to get Memory details from VirtualMachine", "VirtualMachine", virtualmachine.Name) @@ -565,7 +620,7 @@ func (r *VirtualMachineReconciler) doReconcile(ctx context.Context, virtualmachi } else if *virtualmachine.Spec.Guest.MemorySlots.Use < memoryPluggedSlots { // going to unplug one Memory Slot log.Info("Unplug one Memory module from VM") - if err := QmpUnplugMemory(virtualmachine); err != nil { + if err := QmpUnplugMemory(QmpAddr(virtualmachine)); err != nil { // special case ! 
// error means VM hadn't memory devices available for unplug // need set .memorySlots.Use back to real value @@ -991,6 +1046,7 @@ func podSpec(virtualmachine *vmv1.VirtualMachine) (*corev1.Pod, error) { NodeSelector: virtualmachine.Spec.NodeSelector, ImagePullSecrets: virtualmachine.Spec.ImagePullSecrets, Tolerations: virtualmachine.Spec.Tolerations, + ServiceAccountName: virtualmachine.Spec.ServiceAccountName, SchedulerName: virtualmachine.Spec.SchedulerName, Affinity: affinity, InitContainers: []corev1.Container{ @@ -1043,6 +1099,14 @@ func podSpec(virtualmachine *vmv1.VirtualMachine) (*corev1.Pod, error) { "-vmspec", base64.StdEncoding.EncodeToString(vmSpecJson), "-vmstatus", base64.StdEncoding.EncodeToString(vmStatusJson), }, + Env: []corev1.EnvVar{{ + Name: "K8S_POD_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.name", + }, + }, + }}, VolumeMounts: []corev1.VolumeMount{ { Name: "virtualmachineimages", @@ -1081,12 +1145,16 @@ func podSpec(virtualmachine *vmv1.VirtualMachine) (*corev1.Pod, error) { }, } + // Add any InitContainers that were specified by the spec + pod.Spec.InitContainers = append(pod.Spec.InitContainers, virtualmachine.Spec.ExtraInitContainers...) + // allow access to /dev/kvm and /dev/vhost-net devices by generic-device-plugin for kubelet if pod.Spec.Containers[0].Resources.Limits == nil { pod.Spec.Containers[0].Resources.Limits = corev1.ResourceList{} } pod.Spec.Containers[0].Resources.Limits["neonvm/vhost-net"] = resource.MustParse("1") - if virtualmachine.Spec.EnableAcceleration { + // NB: EnableAcceleration guaranteed non-nil because the k8s API server sets the default for us. + if *virtualmachine.Spec.EnableAcceleration { pod.Spec.Containers[0].Resources.Limits["neonvm/kvm"] = resource.MustParse("1") } diff --git a/neonvm/controllers/virtualmachine_qmp_queries.go b/neonvm/controllers/virtualmachine_qmp_queries.go index 9e7a2f2e2..2b44fb1ba 100644 --- a/neonvm/controllers/virtualmachine_qmp_queries.go +++ b/neonvm/controllers/virtualmachine_qmp_queries.go @@ -81,22 +81,11 @@ type MigrationInfo struct { } `json:"compression"` } -func QmpConnect(virtualmachine *vmv1.VirtualMachine) (*qmp.SocketMonitor, error) { - ip := virtualmachine.Status.PodIP - port := virtualmachine.Spec.QMP - - mon, err := qmp.NewSocketMonitor("tcp", fmt.Sprintf("%s:%d", ip, port), 2*time.Second) - if err != nil { - return nil, err - } - if err := mon.Connect(); err != nil { - return nil, err - } - - return mon, nil +func QmpAddr(vm *vmv1.VirtualMachine) (ip string, port int32) { + return vm.Status.PodIP, vm.Spec.QMP } -func QmpConnectByIP(ip string, port int32) (*qmp.SocketMonitor, error) { +func QmpConnect(ip string, port int32) (*qmp.SocketMonitor, error) { mon, err := qmp.NewSocketMonitor("tcp", fmt.Sprintf("%s:%d", ip, port), 2*time.Second) if err != nil { return nil, err @@ -108,8 +97,8 @@ func QmpConnectByIP(ip string, port int32) (*qmp.SocketMonitor, error) { return mon, nil } -func QmpGetCpus(virtualmachine *vmv1.VirtualMachine) ([]QmpCpuSlot, []QmpCpuSlot, error) { - mon, err := QmpConnect(virtualmachine) +func QmpGetCpus(ip string, port int32) ([]QmpCpuSlot, []QmpCpuSlot, error) { + mon, err := QmpConnect(ip, port) if err != nil { return nil, nil, err } @@ -137,37 +126,8 @@ func QmpGetCpus(virtualmachine *vmv1.VirtualMachine) ([]QmpCpuSlot, []QmpCpuSlot return plugged, empty, nil } -func QmpGetCpusFromRunner(ip string, port int32) ([]QmpCpuSlot, []QmpCpuSlot, error) { - mon, err := QmpConnectByIP(ip, port) - if err != nil { - 
return nil, nil, err - } - defer mon.Disconnect() - - qmpcmd := []byte(`{"execute": "query-hotpluggable-cpus"}`) - raw, err := mon.Run(qmpcmd) - if err != nil { - return nil, nil, err - } - - var result QmpCpus - json.Unmarshal(raw, &result) - - plugged := []QmpCpuSlot{} - empty := []QmpCpuSlot{} - for _, entry := range result.Return { - if entry.QomPath != nil { - plugged = append(plugged, QmpCpuSlot{Core: entry.Props.CoreId, QOM: *entry.QomPath, Type: entry.Type}) - } else { - empty = append(empty, QmpCpuSlot{Core: entry.Props.CoreId, QOM: "", Type: entry.Type}) - } - } - - return plugged, empty, nil -} - -func QmpPlugCpu(virtualmachine *vmv1.VirtualMachine) error { - _, empty, err := QmpGetCpus(virtualmachine) +func QmpPlugCpu(ip string, port int32) error { + _, empty, err := QmpGetCpus(ip, port) if err != nil { return err } @@ -175,7 +135,7 @@ func QmpPlugCpu(virtualmachine *vmv1.VirtualMachine) error { return errors.New("no empty slots for CPU hotplug") } - mon, err := QmpConnect(virtualmachine) + mon, err := QmpConnect(ip, port) if err != nil { return err } @@ -193,35 +153,8 @@ func QmpPlugCpu(virtualmachine *vmv1.VirtualMachine) error { return nil } -func QmpPlugCpuToRunner(ip string, port int32) error { - _, empty, err := QmpGetCpusFromRunner(ip, port) - if err != nil { - return err - } - if len(empty) == 0 { - return errors.New("no empty slots for CPU hotplug") - } - - mon, err := QmpConnectByIP(ip, port) - if err != nil { - return err - } - defer mon.Disconnect() - - // empty list reversed, first cpu slot in the end of list and last cpu slot in the beginning - slot := empty[len(empty)-1] - qmpcmd := []byte(fmt.Sprintf(`{"execute": "device_add", "arguments": {"id": "cpu%d", "driver": "%s", "core-id": %d, "socket-id": 0, "thread-id": 0}}`, slot.Core, slot.Type, slot.Core)) - - _, err = mon.Run(qmpcmd) - if err != nil { - return err - } - - return nil -} - -func QmpUnplugCpu(virtualmachine *vmv1.VirtualMachine) error { - plugged, _, err := QmpGetCpus(virtualmachine) +func QmpUnplugCpu(ip string, port int32) error { + plugged, _, err := QmpGetCpus(ip, port) if err != nil { return err } @@ -239,7 +172,7 @@ func QmpUnplugCpu(virtualmachine *vmv1.VirtualMachine) error { return errors.New("there are no unpluggable CPUs") } - mon, err := QmpConnect(virtualmachine) + mon, err := QmpConnect(ip, port) if err != nil { return err } @@ -257,11 +190,11 @@ func QmpUnplugCpu(virtualmachine *vmv1.VirtualMachine) error { } func QmpSyncCpuToTarget(vm *vmv1.VirtualMachine, migration *vmv1.VirtualMachineMigration) error { - plugged, _, err := QmpGetCpus(vm) + plugged, _, err := QmpGetCpus(QmpAddr(vm)) if err != nil { return err } - pluggedInTarget, _, err := QmpGetCpusFromRunner(migration.Status.TargetPodIP, vm.Spec.QMP) + pluggedInTarget, _, err := QmpGetCpus(migration.Status.TargetPodIP, vm.Spec.QMP) if err != nil { return err } @@ -270,7 +203,7 @@ func QmpSyncCpuToTarget(vm *vmv1.VirtualMachine, migration *vmv1.VirtualMachineM return nil } - target, err := QmpConnectByIP(migration.Status.TargetPodIP, vm.Spec.QMP) + target, err := QmpConnect(migration.Status.TargetPodIP, vm.Spec.QMP) if err != nil { return err } @@ -296,25 +229,8 @@ searchForEmpty: return nil } -func QmpQueryMemoryDevices(virtualmachine *vmv1.VirtualMachine) ([]QmpMemoryDevice, error) { - mon, err := QmpConnect(virtualmachine) - if err != nil { - return nil, err - } - defer mon.Disconnect() - - var result QmpMemoryDevices - cmd := []byte(`{"execute": "query-memory-devices"}`) - raw, err := mon.Run(cmd) - if err != nil { - return nil, 
err - } - json.Unmarshal(raw, &result) - return result.Return, nil -} - -func QmpQueryMemoryDevicesFromRunner(ip string, port int32) ([]QmpMemoryDevice, error) { - mon, err := QmpConnectByIP(ip, port) +func QmpQueryMemoryDevices(ip string, port int32) ([]QmpMemoryDevice, error) { + mon, err := QmpConnect(ip, port) if err != nil { return nil, err } @@ -334,7 +250,7 @@ func QmpPlugMemory(virtualmachine *vmv1.VirtualMachine) error { // slots - number of pluggable memory slots (Max - Min) slots := *virtualmachine.Spec.Guest.MemorySlots.Max - *virtualmachine.Spec.Guest.MemorySlots.Min - memoryDevices, err := QmpQueryMemoryDevices(virtualmachine) + memoryDevices, err := QmpQueryMemoryDevices(QmpAddr(virtualmachine)) if err != nil { return err } @@ -345,7 +261,7 @@ func QmpPlugMemory(virtualmachine *vmv1.VirtualMachine) error { return errors.New("no empty slots for Memory hotplug") } - mon, err := QmpConnect(virtualmachine) + mon, err := QmpConnect(QmpAddr(virtualmachine)) if err != nil { return err } @@ -382,16 +298,16 @@ func QmpPlugMemory(virtualmachine *vmv1.VirtualMachine) error { } func QmpSyncMemoryToTarget(vm *vmv1.VirtualMachine, migration *vmv1.VirtualMachineMigration) error { - memoryDevices, err := QmpQueryMemoryDevices(vm) + memoryDevices, err := QmpQueryMemoryDevices(QmpAddr(vm)) if err != nil { return err } - memoryDevicesInTarget, err := QmpQueryMemoryDevicesFromRunner(migration.Status.TargetPodIP, vm.Spec.QMP) + memoryDevicesInTarget, err := QmpQueryMemoryDevices(migration.Status.TargetPodIP, vm.Spec.QMP) if err != nil { return err } - target, err := QmpConnectByIP(migration.Status.TargetPodIP, vm.Spec.QMP) + target, err := QmpConnect(migration.Status.TargetPodIP, vm.Spec.QMP) if err != nil { return err } @@ -432,13 +348,13 @@ func QmpSyncMemoryToTarget(vm *vmv1.VirtualMachine, migration *vmv1.VirtualMachi } func QmpPlugMemoryToRunner(ip string, port int32, size int64) error { - memoryDevices, err := QmpQueryMemoryDevicesFromRunner(ip, port) + memoryDevices, err := QmpQueryMemoryDevices(ip, port) if err != nil { return err } plugged := int32(len(memoryDevices)) - mon, err := QmpConnectByIP(ip, port) + mon, err := QmpConnect(ip, port) if err != nil { return err } @@ -469,8 +385,8 @@ func QmpPlugMemoryToRunner(ip string, port int32, size int64) error { return nil } -func QmpUnplugMemory(virtualmachine *vmv1.VirtualMachine) error { - memoryDevices, err := QmpQueryMemoryDevices(virtualmachine) +func QmpUnplugMemory(ip string, port int32) error { + memoryDevices, err := QmpQueryMemoryDevices(ip, port) if err != nil { return err } @@ -479,7 +395,7 @@ func QmpUnplugMemory(virtualmachine *vmv1.VirtualMachine) error { return errors.New("there are no unpluggable Memory slots") } - mon, err := QmpConnect(virtualmachine) + mon, err := QmpConnect(ip, port) if err != nil { return err } @@ -517,8 +433,8 @@ func QmpUnplugMemory(virtualmachine *vmv1.VirtualMachine) error { return merr } -func QmpGetMemorySize(virtualmachine *vmv1.VirtualMachine) (*resource.Quantity, error) { - mon, err := QmpConnect(virtualmachine) +func QmpGetMemorySize(ip string, port int32) (*resource.Quantity, error) { + mon, err := QmpConnect(ip, port) if err != nil { return nil, err } @@ -654,9 +570,9 @@ func QmpStartMigration(virtualmachine *vmv1.VirtualMachine, virtualmachinemigrat return nil } -func QmpGetMigrationInfo(virtualmachine *vmv1.VirtualMachine) (MigrationInfo, error) { +func QmpGetMigrationInfo(ip string, port int32) (MigrationInfo, error) { empty := MigrationInfo{} - mon, err := QmpConnect(virtualmachine) + 
mon, err := QmpConnect(ip, port) if err != nil { return empty, err } @@ -674,8 +590,8 @@ func QmpGetMigrationInfo(virtualmachine *vmv1.VirtualMachine) (MigrationInfo, er return result.Return, nil } -func QmpCancelMigration(virtualmachine *vmv1.VirtualMachine) error { - mon, err := QmpConnect(virtualmachine) +func QmpCancelMigration(ip string, port int32) error { + mon, err := QmpConnect(ip, port) if err != nil { return err } @@ -691,7 +607,7 @@ func QmpCancelMigration(virtualmachine *vmv1.VirtualMachine) error { } func QmpQuit(ip string, port int32) error { - mon, err := QmpConnectByIP(ip, port) + mon, err := QmpConnect(ip, port) if err != nil { return err } diff --git a/neonvm/controllers/virtualmachinemigration_controller.go b/neonvm/controllers/virtualmachinemigration_controller.go index 01e251e0a..07f3beee7 100644 --- a/neonvm/controllers/virtualmachinemigration_controller.go +++ b/neonvm/controllers/virtualmachinemigration_controller.go @@ -371,7 +371,7 @@ func (r *VirtualMachineMigrationReconciler) Reconcile(ctx context.Context, req c } // retrieve migration statistics - migrationInfo, err := QmpGetMigrationInfo(vm) + migrationInfo, err := QmpGetMigrationInfo(QmpAddr(vm)) if err != nil { log.Error(err, "Failed to get migration info") return ctrl.Result{}, err @@ -475,7 +475,7 @@ func (r *VirtualMachineMigrationReconciler) Reconcile(ctx context.Context, req c // seems migration still going on, just update status with migration progress once per second time.Sleep(time.Second) // re-retrieve migration statistics - migrationInfo, err = QmpGetMigrationInfo(vm) + migrationInfo, err = QmpGetMigrationInfo(QmpAddr(vm)) if err != nil { log.Error(err, "Failed to re-get migration info") return ctrl.Result{}, err @@ -576,7 +576,7 @@ func (r *VirtualMachineMigrationReconciler) doFinalizerOperationsForVirtualMachi // try to cancel migration log.Info("Canceling migration") - if err := QmpCancelMigration(vm); err != nil { + if err := QmpCancelMigration(QmpAddr(vm)); err != nil { // inform about error but not return error to avoid stuckness in reconciliation cycle log.Error(err, "Migration canceling failed") } diff --git a/neonvm/runner/main.go b/neonvm/runner/main.go index b4933b0bd..017aa4f00 100644 --- a/neonvm/runner/main.go +++ b/neonvm/runner/main.go @@ -12,7 +12,6 @@ import ( "bytes" "flag" "fmt" - "log" "math" "net" "os" @@ -36,6 +35,7 @@ import ( "github.com/kdomanski/iso9660" "github.com/opencontainers/runtime-spec/specs-go" "github.com/vishvananda/netlink" + "go.uber.org/zap" "k8s.io/apimachinery/pkg/api/resource" vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" @@ -46,11 +46,12 @@ const ( QEMU_BIN = "qemu-system-x86_64" QEMU_IMG_BIN = "qemu-img" kernelPath = "/vm/kernel/vmlinuz" - kernelCmdline = "init=/neonvm/bin/init memhp_default_state=online_movable console=ttyS1 loglevel=7 root=/dev/vda rw" + kernelCmdline = "panic=-1 init=/neonvm/bin/init memhp_default_state=online_movable console=ttyS1 loglevel=7 root=/dev/vda rw" - rootDiskPath = "/vm/images/rootdisk.qcow2" - runtimeDiskPath = "/vm/images/runtime.iso" - mountedDiskPath = "/vm/images" + rootDiskPath = "/vm/images/rootdisk.qcow2" + runtimeDiskPath = "/vm/images/runtime.iso" + mountedDiskPath = "/vm/images" + qmpUnixSocketForSigtermHandler = "/vm/qmp-sigterm.sock" defaultNetworkBridgeName = "br-def" defaultNetworkTapName = "tap-def" @@ -68,6 +69,17 @@ const ( // in microseconds. 
Min 1000 microseconds, max 1 second cgroupPeriod = uint64(100000) cgroupMountPoint = "/sys/fs/cgroup" + + // cpuLimitOvercommitFactor sets the amount above the VM's spec.guest.cpus.use that we set the + // QEMU cgroup's CPU limit to. e.g. if cpuLimitOvercommitFactor = 3 and the VM is using 0.5 + // CPUs, we set the cgroup to limit QEMU+VM to 1.5 CPUs. + // + // This exists because setting the cgroup exactly equal to the VM's CPU value is overly + // pessimistic, results in a lot of unused capacity on the host, and particularly impacts + // operations that parallelize between the VM and QEMU, like heavy disk access. + // + // See also: https://neondb.slack.com/archives/C03TN5G758R/p1693462680623239 + cpuLimitOvercommitFactor = 4 ) var ( @@ -210,7 +222,14 @@ func createISO9660runtime(diskPath string, command []string, args []string, env mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mkdir -p %s`, disk.MountPath)) switch { case disk.EmptyDisk != nil: - mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mount $(/neonvm/bin/blkid -L %s) %s`, disk.Name, disk.MountPath)) + opts := "" + if disk.EmptyDisk.Discard { + opts = "-o discard" + } + + mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mount %s $(/neonvm/bin/blkid -L %s) %s`, opts, disk.Name, disk.MountPath)) + // Note: chmod must be after mount, otherwise it gets overwritten by mount. + mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/chmod 0777 %s`, disk.MountPath)) case disk.ConfigMap != nil || disk.Secret != nil: mounts = append(mounts, fmt.Sprintf(`/neonvm/bin/mount -o ro,mode=0644 $(/neonvm/bin/blkid -L %s) %s`, disk.Name, disk.MountPath)) case disk.Tmpfs != nil: @@ -330,7 +349,7 @@ func createQCOW2(diskName string, diskPath string, diskSize *resource.Quantity, return nil } -func createISO9660FromPath(diskName string, diskPath string, contentPath string) error { +func createISO9660FromPath(logger *zap.Logger, diskName string, diskPath string, contentPath string) error { writer, err := iso9660.NewWriter() if err != nil { return err @@ -370,7 +389,7 @@ func createISO9660FromPath(diskName string, diskPath string, contentPath string) continue } - log.Printf("adding file: %s\n", outputPath) + logger.Info("adding file to ISO9660 disk", zap.String("path", outputPath)) fileToAdd, err := os.Open(fileName) if err != nil { return err @@ -427,28 +446,35 @@ func checkDevTun() bool { } func main() { + logger := zap.Must(zap.NewProduction()).Named("neonvm-runner") + var vmSpecDump string var vmStatusDump string flag.StringVar(&vmSpecDump, "vmspec", vmSpecDump, "Base64 encoded VirtualMachine json specification") flag.StringVar(&vmStatusDump, "vmstatus", vmStatusDump, "Base64 encoded VirtualMachine json status") flag.Parse() + selfPodName, ok := os.LookupEnv("K8S_POD_NAME") + if !ok { + logger.Fatal("environment variable K8S_POD_NAME missing") + } + vmSpecJson, err := base64.StdEncoding.DecodeString(vmSpecDump) if err != nil { - log.Fatalf("Failed to decode VirtualMachine Spec dump: %s", err) + logger.Fatal("Failed to decode VirtualMachine Spec dump", zap.Error(err)) } vmStatusJson, err := base64.StdEncoding.DecodeString(vmStatusDump) if err != nil { - log.Fatalf("Failed to decode VirtualMachine Status dump: %s", err) + logger.Fatal("Failed to decode VirtualMachine Status dump", zap.Error(err)) } vmSpec := &vmv1.VirtualMachineSpec{} if err := json.Unmarshal(vmSpecJson, vmSpec); err != nil { - log.Fatalf("Failed to unmarshal VM Spec: %s", err) + logger.Fatal("Failed to unmarshal VM spec", zap.Error(err)) } vmStatus := &vmv1.VirtualMachineStatus{} if err 
:= json.Unmarshal(vmStatusJson, vmStatus); err != nil { - log.Fatalf("Failed to unmarshal VM Status: %s", err) + logger.Fatal("Failed to unmarshal VM Status", zap.Error(err)) } qemuCPUs := processCPUs(vmSpec.Guest.CPUs) @@ -468,7 +494,7 @@ func main() { // create iso9660 disk with runtime options (command, args, envs, mounts) if err = createISO9660runtime(runtimeDiskPath, vmSpec.Guest.Command, vmSpec.Guest.Args, vmSpec.Guest.Env, vmSpec.Disks); err != nil { - log.Fatalln(err) + logger.Fatal("Failed to create iso9660 disk", zap.Error(err)) } // resize rootDisk image of size specified and new size more than current @@ -478,7 +504,7 @@ func main() { // get current disk size by qemu-img info command qemuImgOut, err := exec.Command(QEMU_IMG_BIN, "info", "--output=json", rootDiskPath).Output() if err != nil { - log.Fatalln(err) + logger.Fatal("could not get root image size", zap.Error(err)) } imageSize := QemuImgOutputPartial{} json.Unmarshal(qemuImgOut, &imageSize) @@ -487,12 +513,12 @@ func main() { // going to resize if !vmSpec.Guest.RootDisk.Size.IsZero() { if vmSpec.Guest.RootDisk.Size.Cmp(*imageSizeQuantity) == 1 { - log.Printf("resizing rootDisk from %s to %s\n", imageSizeQuantity.String(), vmSpec.Guest.RootDisk.Size.String()) + logger.Info(fmt.Sprintf("resizing rootDisk from %s to %s", imageSizeQuantity.String(), vmSpec.Guest.RootDisk.Size.String())) if err := execFg(QEMU_IMG_BIN, "resize", rootDiskPath, fmt.Sprintf("%d", vmSpec.Guest.RootDisk.Size.Value())); err != nil { - log.Fatal(err) + logger.Fatal("Failed to resize rootDisk", zap.Error(err)) } } else { - log.Printf("rootDisk.size (%s) should be more than size in image (%s)\n", vmSpec.Guest.RootDisk.Size.String(), imageSizeQuantity.String()) + logger.Info(fmt.Sprintf("rootDisk.size (%s) is less than than image size (%s)", vmSpec.Guest.RootDisk.Size.String(), imageSizeQuantity.String())) } } @@ -509,6 +535,7 @@ func main() { "-serial", "stdio", "-msg", "timestamp=on", "-qmp", fmt.Sprintf("tcp:0.0.0.0:%d,server,wait=off", vmSpec.QMP), + "-qmp", fmt.Sprintf("unix:%s,server,wait=off", qmpUnixSocketForSigtermHandler), } // disk details @@ -517,18 +544,22 @@ func main() { for _, disk := range vmSpec.Disks { switch { case disk.EmptyDisk != nil: - log.Printf("creating QCOW2 image '%s' with empty ext4 filesystem", disk.Name) + logger.Info("creating QCOW2 image with empty ext4 filesystem", zap.String("diskName", disk.Name)) dPath := fmt.Sprintf("%s/%s.qcow2", mountedDiskPath, disk.Name) if err := createQCOW2(disk.Name, dPath, &disk.EmptyDisk.Size, nil); err != nil { - log.Fatalln(err) + logger.Fatal("Failed to create QCOW2 image", zap.Error(err)) } - qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=%s,file=%s,if=virtio,media=disk,cache=none", disk.Name, dPath)) + discard := "" + if disk.EmptyDisk.Discard { + discard = ",discard=unmap" + } + qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=%s,file=%s,if=virtio,media=disk,cache=none%s", disk.Name, dPath, discard)) case disk.ConfigMap != nil || disk.Secret != nil: dPath := fmt.Sprintf("%s/%s.qcow2", mountedDiskPath, disk.Name) mnt := fmt.Sprintf("/vm/mounts%s", disk.MountPath) - log.Printf("creating iso9660 image '%s' for '%s' from path '%s'", dPath, disk.Name, mnt) - if err := createISO9660FromPath(disk.Name, dPath, mnt); err != nil { - log.Fatalln(err) + logger.Info("creating iso9660 image", zap.String("diskPath", dPath), zap.String("diskName", disk.Name), zap.String("mountPath", mnt)) + if err := createISO9660FromPath(logger, disk.Name, dPath, mnt); err != nil { + logger.Fatal("Failed 
to create ISO9660 image", zap.Error(err)) } qemuCmd = append(qemuCmd, "-drive", fmt.Sprintf("id=%s,file=%s,if=virtio,media=cdrom,cache=none", disk.Name, dPath)) default: @@ -537,9 +568,12 @@ func main() { } // cpu details - if vmSpec.EnableAcceleration && checkKVM() { - log.Println("using KVM acceleration") + // NB: EnableAcceleration guaranteed non-nil because the k8s API server sets the default for us. + if *vmSpec.EnableAcceleration && checkKVM() { + logger.Info("using KVM acceleration") qemuCmd = append(qemuCmd, "-enable-kvm") + } else { + logger.Warn("not using KVM acceleration") } qemuCmd = append(qemuCmd, "-cpu", "max") qemuCmd = append(qemuCmd, "-smp", strings.Join(cpus, ",")) @@ -548,9 +582,9 @@ func main() { qemuCmd = append(qemuCmd, "-m", strings.Join(memory, ",")) // default (pod) net details - macDefault, err := defaultNetwork(defaultNetworkCIDR, vmSpec.Guest.Ports) + macDefault, err := defaultNetwork(logger, defaultNetworkCIDR, vmSpec.Guest.Ports) if err != nil { - log.Fatalf("can not setup default network: %s", err) + logger.Fatal("cannot set up default network", zap.Error(err)) } qemuCmd = append(qemuCmd, "-netdev", fmt.Sprintf("tap,id=default,ifname=%s,script=no,downscript=no,vhost=on", defaultNetworkTapName)) qemuCmd = append(qemuCmd, "-device", fmt.Sprintf("virtio-net-pci,netdev=default,mac=%s", macDefault.String())) @@ -559,7 +593,7 @@ func main() { if vmSpec.ExtraNetwork != nil && vmSpec.ExtraNetwork.Enable { macOverlay, err := overlayNetwork(vmSpec.ExtraNetwork.Interface) if err != nil { - log.Fatalf("can not setup overlay network: %s", err) + logger.Fatal("cannot set up overlay network", zap.Error(err)) } qemuCmd = append(qemuCmd, "-netdev", fmt.Sprintf("tap,id=overlay,ifname=%s,script=no,downscript=no,vhost=on", overlayNetworkTapName)) qemuCmd = append(qemuCmd, "-device", fmt.Sprintf("virtio-net-pci,netdev=overlay,mac=%s", macOverlay.String())) @@ -578,40 +612,56 @@ func main() { qemuCmd = append(qemuCmd, "-incoming", fmt.Sprintf("tcp:0:%d", vmv1.MigrationPort)) } - // leading slash is important - cgroupPath := fmt.Sprintf("/%s-vm-runner", vmStatus.PodName) + selfCgroupPath, err := getSelfCgroupPath(logger) + if err != nil { + logger.Fatal("Failed to get self cgroup path", zap.Error(err)) + } + // Sometimes we'll get just '/' as our cgroup path. If that's the case, we should reset it so + // that the cgroup '/neonvm-qemu-...' still works. + if selfCgroupPath == "/" { + selfCgroupPath = "" + } + // ... but also we should have some uniqueness just in case, so we're not sharing a root level + // cgroup if that *is* what's happening. This *should* only be relevant for local clusters. + // + // We don't want to just use the VM spec's .status.PodName because during migrations that will + // be equal to the source pod, not this one, which may be... somewhat confusing. 
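	// For illustration (editor's note, with made-up example values): if /proc/self/cgroup resolves
	// to something like "/kubepods/burstable/pod<uid>/<container-id>" and this runner pod is named
	// "vm-foo-x7k2p", the QEMU cgroup path computed below becomes
	// ".../<container-id>/neonvm-qemu-vm-foo-x7k2p".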
+ cgroupPath := fmt.Sprintf("%s/neonvm-qemu-%s", selfCgroupPath, selfPodName) + + logger.Info("Determined QEMU cgroup path", zap.String("path", cgroupPath)) - if err := setCgroupLimit(qemuCPUs.use, cgroupPath); err != nil { - log.Fatalf("Failed to set cgroup limit: %s", err) + if err := setCgroupLimit(logger, qemuCPUs.use, cgroupPath); err != nil { + logger.Fatal("Failed to set cgroup limit", zap.Error(err)) } - defer cleanupCgroup(cgroupPath) ctx, cancel := context.WithCancel(context.Background()) wg := sync.WaitGroup{} wg.Add(1) - go terminateQemuOnSigterm(ctx, vmSpec.QMP, &wg) + go terminateQemuOnSigterm(ctx, logger, &wg) wg.Add(1) - go listenForCPUChanges(ctx, vmSpec.RunnerPort, cgroupPath, &wg) + go listenForCPUChanges(ctx, logger, vmSpec.RunnerPort, cgroupPath, &wg) args := append([]string{"-g", fmt.Sprintf("cpu:%s", cgroupPath), QEMU_BIN}, qemuCmd...) - log.Printf("using cgexec args: %v", args) + logger.Info("calling cgexec", zap.Strings("args", args)) if err := execFg("cgexec", args...); err != nil { - log.Printf("Qemu exited: %s", err) + logger.Error("QEMU exited with error", zap.Error(err)) + } else { + logger.Info("QEMU exited without error") } cancel() wg.Wait() } -func handleCPUChange(w http.ResponseWriter, r *http.Request, cgroupPath string) { +func handleCPUChange(logger *zap.Logger, w http.ResponseWriter, r *http.Request, cgroupPath string) { if r.Method != "POST" { - log.Printf("unexpected method: %s\n", r.Method) + logger.Error("unexpected method", zap.String("method", r.Method)) w.WriteHeader(400) return } body, err := io.ReadAll(r.Body) if err != nil { - log.Printf("could not read body: %s\n", err) + logger.Error("could not read body", zap.Error(err)) w.WriteHeader(400) return } @@ -619,16 +669,16 @@ func handleCPUChange(w http.ResponseWriter, r *http.Request, cgroupPath string) parsed := api.VCPUChange{} err = json.Unmarshal(body, &parsed) if err != nil { - log.Printf("could not parse body: %s\n", err) + logger.Error("could not parse body", zap.Error(err)) w.WriteHeader(400) return } // update cgroup - log.Printf("got CPU update %v", parsed.VCPUs.AsFloat64()) - err = setCgroupLimit(parsed.VCPUs, cgroupPath) + logger.Info("got CPU update", zap.Float64("CPU", parsed.VCPUs.AsFloat64())) + err = setCgroupLimit(logger, parsed.VCPUs, cgroupPath) if err != nil { - log.Printf("could not set cgroup limit: %s\n", err) + logger.Error("could not set cgroup limit", zap.Error(err)) w.WriteHeader(500) return } @@ -636,23 +686,23 @@ func handleCPUChange(w http.ResponseWriter, r *http.Request, cgroupPath string) w.WriteHeader(200) } -func handleCPUCurrent(w http.ResponseWriter, r *http.Request, cgroupPath string) { +func handleCPUCurrent(logger *zap.Logger, w http.ResponseWriter, r *http.Request, cgroupPath string) { if r.Method != "GET" { - log.Printf("unexpected method: %s\n", r.Method) + logger.Error("unexpected method", zap.String("method", r.Method)) w.WriteHeader(400) return } cpus, err := getCgroupQuota(cgroupPath) if err != nil { - log.Printf("could not get cgroup quota: %s\n", err) + logger.Error("could not get cgroup quota", zap.Error(err)) w.WriteHeader(500) return } resp := api.VCPUCgroup{VCPUs: *cpus} body, err := json.Marshal(resp) if err != nil { - log.Printf("could not marshal body: %s\n", err) + logger.Error("could not marshal body", zap.Error(err)) w.WriteHeader(500) return } @@ -661,14 +711,17 @@ func handleCPUCurrent(w http.ResponseWriter, r *http.Request, cgroupPath string) w.Write(body) } -func listenForCPUChanges(ctx context.Context, port int32, cgroupPath 
string, wg *sync.WaitGroup) { +func listenForCPUChanges(ctx context.Context, logger *zap.Logger, port int32, cgroupPath string, wg *sync.WaitGroup) { defer wg.Done() mux := http.NewServeMux() + loggerHandlers := logger.Named("http-handlers") + cpuChangeLogger := loggerHandlers.Named("cpu_change") mux.HandleFunc("/cpu_change", func(w http.ResponseWriter, r *http.Request) { - handleCPUChange(w, r, cgroupPath) + handleCPUChange(cpuChangeLogger, w, r, cgroupPath) }) + cpuCurrentLogger := loggerHandlers.Named("cpu_current") mux.HandleFunc("/cpu_current", func(w http.ResponseWriter, r *http.Request) { - handleCPUCurrent(w, r, cgroupPath) + handleCPUCurrent(cpuCurrentLogger, w, r, cgroupPath) }) server := http.Server{ Addr: fmt.Sprintf("0.0.0.0:%d", port), @@ -684,23 +737,141 @@ func listenForCPUChanges(ctx context.Context, port int32, cgroupPath string, wg select { case err := <-errChan: if errors.Is(err, http.ErrServerClosed) { - log.Println("cpu_change server closed") + logger.Info("cpu_change server closed") } else if err != nil { - log.Fatalf("error starting server: %s\n", err) + logger.Fatal("cpu_change exited with error", zap.Error(err)) } case <-ctx.Done(): err := server.Shutdown(context.Background()) - log.Printf("shut down cpu_change server: %v", err) + logger.Info("shut down cpu_change server", zap.Error(err)) + } +} + +func getSelfCgroupPath(logger *zap.Logger) (string, error) { + // There's some fun stuff here. For general information, refer to `man 7 cgroups` - specifically + // the section titled "/proc files" - for "/proc/cgroups" and "/proc/pid/cgroup". + // + // In general, the idea is this: If we start QEMU outside of the cgroup for the container we're + // running in, we run into multiple problems - it won't show up in metrics, and we'll have to + // clean up the cgroup ourselves. (not good!). + // + // So we'd like to start it in the same cgroup - the question is just how to find the name of + // the cgroup we're running in. Thankfully, this is visible in `/proc/self/cgroup`! + // The only difficulty is the file format. + // + // In cgroup v1 (which is what we have on EKS [as of 2023-07]), the contents of + // /proc//cgroup tend to look like: + // + // 11:cpuset:/path/to/cgroup + // 10:perf_event:/path/to/cgroup + // 9:hugetlb:/path/to/cgroup + // 8:blkio:/path/to/cgroup + // 7:pids:/path/to/cgroup + // 6:freezer:/path/to/cgroup + // 5:memory:/path/to/cgroup + // 4:net_cls,net_prio:/path/to/cgroup + // 3:cpu,cpuacct:/path/to/cgroup + // 2:devices:/path/to/cgroup + // 1:name=systemd:/path/to/cgroup + // + // For cgroup v2, we have: + // + // 0::/path/to/cgroup + // + // The file format is defined to have 3 fields, separated by colons. The first field gives the + // Hierarchy ID, which is guaranteed to be 0 if the cgroup is part of a cgroup v2 ("unified") + // hierarchy. + // The second field is a comma-separated list of the controllers. Or, if it's cgroup v2, nothing. + // The third field is the "pathname" of the cgroup *in its hierarchy*, relative to the mount + // point of the hierarchy. + // + // So we're looking for EITHER: + // 1. an entry like ':,cpu,:/path/to/cgroup (cgroup v1); OR + // 2. an entry like '0::/path/to/cgroup', and we'll return the path (cgroup v2) + // We primarily care about the 'cpu' controller, so for cgroup v1, we'll search for that instead + // of e.g. "name=systemd", although it *really* shouldn't matter because the paths will be the + // same anyways. 
+ // + // Now: Technically it's possible to run a "hybrid" system with both cgroup v1 and v2 + // hierarchies. If this is the case, it's possible for /proc/self/cgroup to show *some* v1 + // hierarchies attached, in addition to the v2 "unified" hierarchy, for the same cgroup. To + // handle this, we should look for a cgroup v1 "cpu" controller, and if we can't find it, try + // for the cgroup v2 unified entry. + // + // As far as I (@sharnoff) can tell, the only case where that might actually get messed up is if + // the CPU controller isn't available for the cgroup we're running in, in which case there's + // nothing we can do about it! (other than e.g. using a cgroup higher up the chain, which would + // be really bad tbh). + + // --- + // On to the show! + + procSelfCgroupContents, err := os.ReadFile("/proc/self/cgroup") + if err != nil { + return "", fmt.Errorf("failed to read /proc/self/cgroup: %w", err) + } + logger.Info("Read /proc/self/cgroup", zap.String("contents", string(procSelfCgroupContents))) + + // Collect all candidate paths from the lines of the file. If there isn't exactly one, + // something's wrong and we should make an error. + var v1Candidates []string + var v2Candidates []string + for lineno, line := range strings.Split(string(procSelfCgroupContents), "\n") { + if line == "" { + continue + } + + // Split into the three ':'-delimited fields + fields := strings.Split(line, ":") + if len(fields) != 3 { + return "", fmt.Errorf("line %d of /proc/self/cgroup did not have 3 colon-delimited fields", lineno+1) + } + + id := fields[0] + controllers := fields[1] + path := fields[2] + if id == "0" { + v2Candidates = append(v2Candidates, path) + continue + } + + // It's not cgroup v2, otherwise id would have been 0. So, check if the comma-separated list + // of controllers contains 'cpu' as an entry. + for _, c := range strings.Split(controllers, ",") { + if c == "cpu" { + v1Candidates = append(v1Candidates, path) + break // ... and then continue to the next loop iteration + } + } + } + + var errMsg string + + // Check v1, then v2 + if len(v1Candidates) == 1 { + return v1Candidates[0], nil + } else if len(v1Candidates) != 0 { + errMsg = "More than one applicable cgroup v1 entry in /proc/self/cgroup" + } else if len(v2Candidates) == 1 { + return v2Candidates[0], nil + } else if len(v2Candidates) != 0 { + errMsg = "More than one applicable cgroup v2 entry in /proc/self/cgroup" + } else { + errMsg = "Couldn't find applicable entry in /proc/self/cgroup" } + + return "", errors.New(errMsg) } -func setCgroupLimit(r vmv1.MilliCPU, cgroupPath string) error { +func setCgroupLimit(logger *zap.Logger, r vmv1.MilliCPU, cgroupPath string) error { + r *= cpuLimitOvercommitFactor + isV2 := cgroups.Mode() == cgroups.Unified period := cgroupPeriod // quota may be greater than period if the cgroup is allowed // to use more than 100% of a CPU. 
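	// Worked example (editor's note, using the constants defined above): a VM with
	// spec.guest.cpus.use = 500m gives r = 500 MilliCPU; after the overcommit factor of 4 this
	// becomes r = 2000, so quota = 2000/1000 * 100000 = 200000 against a period of 100000,
	// i.e. QEMU plus the VM may use up to 2 full CPUs. getCgroupQuota divides by the same
	// factor on the way back out, so it reports 500m again.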
quota := int64(float64(r) / float64(1000) * float64(cgroupPeriod)) - log.Printf("setting cgroup to %v %v\n", quota, period) + logger.Info(fmt.Sprintf("setting cgroup CPU limit %v %v", quota, period)) if isV2 { resources := cgroup2.Resources{ CPU: &cgroup2.CPU{ @@ -726,23 +897,6 @@ func setCgroupLimit(r vmv1.MilliCPU, cgroupPath string) error { return nil } -func cleanupCgroup(cgroupPath string) error { - isV2 := cgroups.Mode() == cgroups.Unified - if isV2 { - control, err := cgroup2.Load(cgroupPath) - if err != nil { - return err - } - return control.Delete() - } else { - control, err := cgroup1.Load(cgroup1.StaticPath(cgroupPath)) - if err != nil { - return err - } - return control.Delete() - } -} - func getCgroupQuota(cgroupPath string) (*vmv1.MilliCPU, error) { isV2 := cgroups.Mode() == cgroups.Unified var path string @@ -765,6 +919,7 @@ func getCgroupQuota(cgroupPath string) (*vmv1.MilliCPU, error) { return nil, err } cpu := vmv1.MilliCPU(uint32(quota * 1000 / cgroupPeriod)) + cpu /= cpuLimitOvercommitFactor return &cpu, nil } @@ -793,9 +948,11 @@ func processCPUs(cpus vmv1.CPUs) QemuCPUs { } } -func terminateQemuOnSigterm(ctx context.Context, qmpPort int32, wg *sync.WaitGroup) { +func terminateQemuOnSigterm(ctx context.Context, logger *zap.Logger, wg *sync.WaitGroup) { + logger = logger.Named("terminate-qemu-on-sigterm") + defer wg.Done() - log.Println("watching OS signals") + logger.Info("watching OS signals") c := make(chan os.Signal, 1) // we need to reserve to buffer size 1, so the notifier are not blocked signal.Notify(c, os.Interrupt, syscall.SIGTERM) select { @@ -803,16 +960,16 @@ func terminateQemuOnSigterm(ctx context.Context, qmpPort int32, wg *sync.WaitGro case <-ctx.Done(): } - log.Println("got signal, sending powerdown command to QEMU") + logger.Info("got signal, sending powerdown command to QEMU") - mon, err := qmp.NewSocketMonitor("tcp", fmt.Sprintf("127.0.0.1:%d", qmpPort), 2*time.Second) + mon, err := qmp.NewSocketMonitor("unix", qmpUnixSocketForSigtermHandler, 2*time.Second) if err != nil { - log.Println(err) + logger.Error("failed to connect to QEMU monitor", zap.Error(err)) return } if err := mon.Connect(); err != nil { - log.Println(err) + logger.Error("failed to start monitor connection", zap.Error(err)) return } defer mon.Disconnect() @@ -820,11 +977,11 @@ func terminateQemuOnSigterm(ctx context.Context, qmpPort int32, wg *sync.WaitGro qmpcmd := []byte(`{"execute": "system_powerdown"}`) _, err = mon.Run(qmpcmd) if err != nil { - log.Println(err) + logger.Error("failed to execute system_powerdown command", zap.Error(err)) return } - log.Println("system_powerdown command sent to QEMU") + logger.Info("system_powerdown command sent to QEMU") return } @@ -867,25 +1024,28 @@ func execFg(name string, arg ...string) error { return nil } -func defaultNetwork(cidr string, ports []vmv1.Port) (mac.MAC, error) { +func defaultNetwork(logger *zap.Logger, cidr string, ports []vmv1.Port) (mac.MAC, error) { // gerenare random MAC for default Guest interface mac, err := mac.GenerateRandMAC() if err != nil { + logger.Fatal("could not generate random MAC", zap.Error(err)) return nil, err } // create an configure linux bridge - log.Printf("setup bridge interface %s", defaultNetworkBridgeName) + logger.Info("setup bridge interface", zap.String("name", defaultNetworkBridgeName)) bridge := &netlink.Bridge{ LinkAttrs: netlink.LinkAttrs{ Name: defaultNetworkBridgeName, }, } if err := netlink.LinkAdd(bridge); err != nil { + logger.Fatal("could not create bridge interface", zap.Error(err)) 
return nil, err } ipPod, ipVm, mask, err := calcIPs(cidr) if err != nil { + logger.Fatal("could not parse IP", zap.Error(err)) return nil, err } bridgeAddr := &netlink.Addr{ @@ -895,15 +1055,17 @@ func defaultNetwork(cidr string, ports []vmv1.Port) (mac.MAC, error) { }, } if err := netlink.AddrAdd(bridge, bridgeAddr); err != nil { + logger.Fatal("could not parse IP", zap.Error(err)) return nil, err } if err := netlink.LinkSetUp(bridge); err != nil { + logger.Fatal("could not set up bridge", zap.Error(err)) return nil, err } // create an configure TAP interface if !checkDevTun() { - log.Printf("create /dev/net/tun") + logger.Info("create /dev/net/tun") if err := execFg("mkdir", "-p", "/dev/net"); err != nil { return nil, err } @@ -915,7 +1077,7 @@ func defaultNetwork(cidr string, ports []vmv1.Port) (mac.MAC, error) { } } - log.Printf("setup tap interface %s", defaultNetworkTapName) + logger.Info("setup tap interface", zap.String("name", defaultNetworkTapName)) tap := &netlink.Tuntap{ LinkAttrs: netlink.LinkAttrs{ Name: defaultNetworkTapName, @@ -924,30 +1086,35 @@ func defaultNetwork(cidr string, ports []vmv1.Port) (mac.MAC, error) { Flags: netlink.TUNTAP_DEFAULTS, } if err := netlink.LinkAdd(tap); err != nil { + logger.Error("could not add tap device", zap.Error(err)) return nil, err } if err := netlink.LinkSetMaster(tap, bridge); err != nil { + logger.Error("could not set up tap as master", zap.Error(err)) return nil, err } if err := netlink.LinkSetUp(tap); err != nil { + logger.Error("could not set up tap device", zap.Error(err)) return nil, err } // setup masquerading outgoing (from VM) traffic - log.Println("setup masquerading for outgoing traffic") + logger.Info("setup masquerading for outgoing traffic") if err := execFg("iptables", "-t", "nat", "-A", "POSTROUTING", "-o", "eth0", "-j", "MASQUERADE"); err != nil { + logger.Error("could not setup masquerading for outgoing traffic", zap.Error(err)) return nil, err } // pass incoming traffic to .Guest.Spec.Ports into VM for _, port := range ports { - log.Printf("setup DNAT for incoming traffic, port %d", port.Port) + logger.Info(fmt.Sprintf("setup DNAT for incoming traffic, port %d", port.Port)) iptablesArgs := []string{ "-t", "nat", "-A", "PREROUTING", "-i", "eth0", "-p", fmt.Sprint(port.Protocol), "--dport", fmt.Sprint(port.Port), "-j", "DNAT", "--to", fmt.Sprintf("%s:%d", ipVm.String(), port.Port), } if err := execFg("iptables", iptablesArgs...); err != nil { + logger.Error("could not set up DNAT for incoming traffic", zap.Error(err)) return nil, err } } @@ -955,13 +1122,14 @@ func defaultNetwork(cidr string, ports []vmv1.Port) (mac.MAC, error) { // get dns details from /etc/resolv.conf resolvConf, err := getResolvConf() if err != nil { + logger.Error("could not get DNS details", zap.Error(err)) return nil, err } dns := getNameservers(resolvConf.Content, types.IP)[0] dnsSearch := strings.Join(getSearchDomains(resolvConf.Content), ",") // prepare dnsmask command line (instead of config file) - log.Printf("run dnsmqsq for interface %s", defaultNetworkBridgeName) + logger.Info("run dnsmasq for interface", zap.String("name", defaultNetworkBridgeName)) dnsMaskCmd := []string{ "--port=0", "--bind-interfaces", @@ -977,6 +1145,7 @@ func defaultNetwork(cidr string, ports []vmv1.Port) (mac.MAC, error) { // run dnsmasq for default Guest interface if err := execFg("dnsmasq", dnsMaskCmd...); err != nil { + logger.Error("could not run dnsmasq", zap.Error(err)) return nil, err } diff --git a/neonvm/tools/vm-builder-generic/main.go 
b/neonvm/tools/vm-builder-generic/main.go index d7110d668..a13f1066c 100644 --- a/neonvm/tools/vm-builder-generic/main.go +++ b/neonvm/tools/vm-builder-generic/main.go @@ -44,12 +44,14 @@ RUN set -e \ su-exec \ e2fsprogs-extra \ blkid \ + flock \ && mv /sbin/acpid /neonvm/bin/ \ && mv /sbin/udevd /neonvm/bin/ \ && mv /sbin/agetty /neonvm/bin/ \ && mv /sbin/su-exec /neonvm/bin/ \ && mv /usr/sbin/resize2fs /neonvm/bin/resize2fs \ && mv /sbin/blkid /neonvm/bin/blkid \ + && mv /usr/bin/flock /neonvm/bin/flock \ && mkdir -p /neonvm/lib \ && cp -f /lib/ld-musl-x86_64.so.1 /neonvm/lib/ \ && cp -f /lib/libblkid.so.1.1.0 /neonvm/lib/libblkid.so.1 \ @@ -77,9 +79,9 @@ RUN set -e \ ADD inittab /neonvm/bin/inittab ADD vminit /neonvm/bin/vminit ADD vmstart /neonvm/bin/vmstart +ADD vmshutdown /neonvm/bin/vmshutdown ADD vmacpi /neonvm/acpi/vmacpi -ADD powerdown /neonvm/bin/powerdown -RUN chmod +rx /neonvm/bin/vminit /neonvm/bin/vmstart /neonvm/bin/powerdown +RUN chmod +rx /neonvm/bin/vminit /neonvm/bin/vmstart /neonvm/bin/vmshutdown FROM vm-runtime AS builder ARG DISK_SIZE @@ -143,26 +145,37 @@ fi /neonvm/bin/chmod +x /neonvm/bin/vmstarter.sh -/neonvm/bin/su-exec {{.User}} /neonvm/bin/sh /neonvm/bin/vmstarter.sh +/neonvm/bin/flock -o /neonvm/vmstart.lock -c 'test -e /neonvm/vmstart.allowed && /neonvm/bin/su-exec {{.User}} /neonvm/bin/sh /neonvm/bin/vmstarter.sh' ` scriptInitTab = ` ::sysinit:/neonvm/bin/vminit +::once:/neonvm/bin/touch /neonvm/vmstart.allowed ::respawn:/neonvm/bin/udhcpc -t 1 -T 1 -A 1 -f -i eth0 -O 121 -O 119 -s /neonvm/bin/udhcpc.script ::respawn:/neonvm/bin/udevd ::respawn:/neonvm/bin/acpid -f -c /neonvm/acpi ::respawn:/neonvm/bin/vmstart ttyS0::respawn:/neonvm/bin/agetty --8bits --local-line --noissue --noclear --noreset --host console --login-program /neonvm/bin/login --login-pause --autologin root 115200 ttyS0 linux +::shutdown:/neonvm/bin/vmshutdown ` scriptVmAcpi = ` event=button/power -action=/neonvm/bin/powerdown +action=/neonvm/bin/poweroff ` - scriptPowerDown = `#!/neonvm/bin/sh - -/neonvm/bin/poweroff + scriptVmShutdown = `#!/neonvm/bin/sh +rm /neonvm/vmstart.allowed +if [ -e /neonvm/vmstart.allowed ]; then + echo "Error: could not remove vmstart.allowed marker, might hang indefinitely during shutdown" 1>&2 +fi +# we inhibited new command starts, but there may still be a command running +while ! /neonvm/bin/flock -n /neonvm/vmstart.lock true; do + # TODO: should be sufficient to keep track of the vmstarter.sh pid and signal it. + echo "Warning: no generic mechanism to signal graceful shutdown request to vmstarter.sh" 1>&2 + exit 2 +done +echo "vmstart workload shut down cleanly" 1>&2 ` scriptVmInit = `#!/neonvm/bin/sh @@ -187,6 +200,18 @@ chmod 1777 /dev/shm mount -t proc proc /proc mount -t sysfs sysfs /sys mount -t cgroup2 cgroup2 /sys/fs/cgroup + +# Allow all users to move processes to/from the root cgroup. +# +# This is required in order to be able to 'cgexec' anything, if the entrypoint is not being run as +# root, because moving tasks betweeen one cgroup and another *requires write access to the +# cgroup.procs file of the common ancestor*, and because the entrypoint isn't already in a cgroup, +# any new tasks are automatically placed in the top-level cgroup. +# +# This *would* be bad for security, if we relied on cgroups for security; but instead because they +# are just used for cooperative signaling, this should be mostly ok. 
+chmod go+w /sys/fs/cgroup/cgroup.procs + mount -t devpts -o noexec,nosuid devpts /dev/pts mount -t tmpfs -o noexec,nosuid,nodev shm-tmpfs /dev/shm @@ -366,9 +391,9 @@ func main() { }{ {"Dockerfile", dockerfileVmBuilder}, {"vmstart", scriptVmStart}, + {"vmshutdown", scriptVmShutdown}, {"inittab", scriptInitTab}, {"vmacpi", scriptVmAcpi}, - {"powerdown", scriptPowerDown}, {"vminit", scriptVmInit}, } diff --git a/neonvm/tools/vm-builder/main.go b/neonvm/tools/vm-builder/main.go index 3f1045202..58ef05a3e 100644 --- a/neonvm/tools/vm-builder/main.go +++ b/neonvm/tools/vm-builder/main.go @@ -22,16 +22,14 @@ import ( // vm-builder --src alpine:3.16 --dst vm-alpine:dev --file vm-alpine.qcow2 -var entrypointPrefix = []string{"/usr/bin/cgexec", "-g", "memory:neon-postgres"} - const ( dockerfileVmBuilder = ` -FROM {{.InformantImage}} as informant +FROM {{.MonitorImage}} as monitor # Build cgroup-tools # # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically -# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant +# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. FROM debian:bullseye-slim as libcgroup-builder ENV LIBCGROUP_VERSION v2.0.3 @@ -89,7 +87,7 @@ RUN set -e \ FROM {{.RootDiskImage}} AS rootdisk USER root -RUN adduser --system --disabled-login --no-create-home --home /nonexistent --gecos "informant user" --shell /bin/false vm-informant +RUN adduser --system --disabled-login --no-create-home --home /nonexistent --gecos "monitor user" --shell /bin/false vm-monitor # tweak nofile limits RUN set -e \ @@ -108,7 +106,7 @@ RUN set -e \ USER postgres -COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant +COPY --from=monitor /usr/bin/vm-monitor /usr/local/bin/vm-monitor COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ @@ -133,12 +131,14 @@ RUN set -e \ su-exec \ e2fsprogs-extra \ blkid \ + flock \ && mv /sbin/acpid /neonvm/bin/ \ && mv /sbin/udevd /neonvm/bin/ \ && mv /sbin/agetty /neonvm/bin/ \ && mv /sbin/su-exec /neonvm/bin/ \ && mv /usr/sbin/resize2fs /neonvm/bin/resize2fs \ && mv /sbin/blkid /neonvm/bin/blkid \ + && mv /usr/bin/flock /neonvm/bin/flock \ && mkdir -p /neonvm/lib \ && cp -f /lib/ld-musl-x86_64.so.1 /neonvm/lib/ \ && cp -f /lib/libblkid.so.1.1.0 /neonvm/lib/libblkid.so.1 \ @@ -171,10 +171,10 @@ RUN set -e \ ADD inittab /neonvm/bin/inittab ADD vminit /neonvm/bin/vminit ADD vmstart /neonvm/bin/vmstart +ADD vmshutdown /neonvm/bin/vmshutdown ADD vmacpi /neonvm/acpi/vmacpi ADD vector.yaml /neonvm/config/vector.yaml -ADD powerdown /neonvm/bin/powerdown -RUN chmod +rx /neonvm/bin/vminit /neonvm/bin/vmstart /neonvm/bin/powerdown +RUN chmod +rx /neonvm/bin/vminit /neonvm/bin/vmstart /neonvm/bin/vmshutdown FROM vm-runtime AS builder ARG DISK_SIZE @@ -235,32 +235,42 @@ fi /neonvm/bin/chmod +x /neonvm/bin/vmstarter.sh -/neonvm/bin/su-exec {{.User}} /neonvm/bin/sh /neonvm/bin/vmstarter.sh +/neonvm/bin/flock -o /neonvm/vmstart.lock -c 'test -e /neonvm/vmstart.allowed && /neonvm/bin/su-exec {{.User}} /neonvm/bin/sh /neonvm/bin/vmstarter.sh' ` scriptInitTab = ` ::sysinit:/neonvm/bin/vminit ::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664 +::once:/neonvm/bin/touch /neonvm/vmstart.allowed ::respawn:/neonvm/bin/udhcpc -t 1 -T 1 -A 1 -f -i eth0 -O 121 
-O 119 -s /neonvm/bin/udhcpc.script ::respawn:/neonvm/bin/udevd ::respawn:/neonvm/bin/acpid -f -c /neonvm/acpi ::respawn:/neonvm/bin/vector -c /neonvm/config/vector.yaml --config-dir /etc/vector ::respawn:/neonvm/bin/vmstart -::respawn:su -p vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres{{if .FileCache}} --pgconnstr="dbname=postgres user=cloud_admin sslmode=disable"{{end}}' +{{if .EnableMonitor}} +::respawn:su -p vm-monitor -c 'RUST_LOG=info /usr/local/bin/vm-monitor --addr "0.0.0.0:10301" --cgroup=neon-postgres{{if .FileCache}} --pgconnstr="host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable"{{end}}' +{{end}} ::respawn:su -p nobody -c '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' ::respawn:su -p nobody -c 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter --auto-discover-databases --exclude-databases=template0,template1' ttyS0::respawn:/neonvm/bin/agetty --8bits --local-line --noissue --noclear --noreset --host console --login-program /neonvm/bin/login --login-pause --autologin root 115200 ttyS0 linux +::shutdown:/neonvm/bin/vmshutdown ` scriptVmAcpi = ` event=button/power -action=/neonvm/bin/powerdown +action=/neonvm/bin/poweroff ` - scriptPowerDown = `#!/neonvm/bin/sh - -su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' -/neonvm/bin/poweroff + scriptVmShutdown = `#!/neonvm/bin/sh +rm /neonvm/vmstart.allowed +if [ -e /neonvm/vmstart.allowed ]; then + echo "Error: could not remove vmstart.allowed marker, might hang indefinitely during shutdown" 1>&2 +fi +# we inhibited new command starts, but there may still be a command running +while ! /neonvm/bin/flock -n /neonvm/vmstart.lock true; do + su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' +done +echo "vmstart workload shut down cleanly" 1>&2 ` scriptVmInit = `#!/neonvm/bin/sh @@ -285,6 +295,18 @@ chmod 1777 /dev/shm mount -t proc proc /proc mount -t sysfs sysfs /sys mount -t cgroup2 cgroup2 /sys/fs/cgroup + +# Allow all users to move processes to/from the root cgroup. +# +# This is required in order to be able to 'cgexec' anything, if the entrypoint is not being run as +# root, because moving tasks betweeen one cgroup and another *requires write access to the +# cgroup.procs file of the common ancestor*, and because the entrypoint isn't already in a cgroup, +# any new tasks are automatically placed in the top-level cgroup. +# +# This *would* be bad for security, if we relied on cgroups for security; but instead because they +# are just used for cooperative signaling, this should be mostly ok. 
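# For illustration (editor's note): without the chmod below, a non-root entrypoint that runs e.g.
# `cgexec -g memory:neon-postgres <cmd>` would fail, because moving the child into that cgroup
# means writing its PID into a cgroup.procs file, and that write is only permitted with write
# access to the common ancestor's cgroup.procs -- here, the root cgroup's.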
+chmod go+w /sys/fs/cgroup/cgroup.procs + mount -t devpts -o noexec,nosuid devpts /dev/pts mount -t tmpfs -o noexec,nosuid,nodev shm-tmpfs /dev/shm @@ -329,7 +351,7 @@ sinks: group neon-postgres { perm { admin { - uid = vm-informant; + uid = {{.CgroupUID}}; } task { gid = users; @@ -357,18 +379,20 @@ default_pool_size=16 ) var ( - Version string - VMInformant string - - srcImage = flag.String("src", "", `Docker image used as source for virtual machine disk image: --src=alpine:3.16`) - dstImage = flag.String("dst", "", `Docker image with resulting disk image: --dst=vm-alpine:3.16`) - size = flag.String("size", "1G", `Size for disk image: --size=1G`) - outFile = flag.String("file", "", `Save disk image as file: --file=vm-alpine.qcow2`) - quiet = flag.Bool("quiet", false, `Show less output from the docker build process`) - forcePull = flag.Bool("pull", false, `Pull src image even if already present locally`) - informant = flag.String("informant", VMInformant, `vm-informant docker image`) - fileCache = flag.Bool("enable-file-cache", false, `enables the vm-informant's file cache integration`) - version = flag.Bool("version", false, `Print vm-builder version`) + Version string + VMMonitor string + + srcImage = flag.String("src", "", `Docker image used as source for virtual machine disk image: --src=alpine:3.16`) + dstImage = flag.String("dst", "", `Docker image with resulting disk image: --dst=vm-alpine:3.16`) + size = flag.String("size", "1G", `Size for disk image: --size=1G`) + outFile = flag.String("file", "", `Save disk image as file: --file=vm-alpine.qcow2`) + quiet = flag.Bool("quiet", false, `Show less output from the docker build process`) + forcePull = flag.Bool("pull", false, `Pull src image even if already present locally`) + monitor = flag.String("monitor", VMMonitor, `vm-monitor docker image`) + enableMonitor = flag.Bool("enable-monitor", false, `start the vm-monitor during VM startup`) + fileCache = flag.Bool("enable-file-cache", false, `enables the vm-monitor's file cache integration`) + cgroupUID = flag.String("cgroup-uid", "vm-monitor", `specifies the user that owns the neon-postgres cgroup`) + version = flag.Bool("version", false, `Print vm-builder version`) ) type dockerMessage struct { @@ -419,13 +443,15 @@ func AddTemplatedFileToTar(tw *tar.Writer, tmplArgs any, filename string, tmplSt } type TemplatesContext struct { - User string - Entrypoint []string - Cmd []string - Env []string - RootDiskImage string - InformantImage string - FileCache bool + User string + Entrypoint []string + Cmd []string + Env []string + RootDiskImage string + MonitorImage string + FileCache bool + EnableMonitor bool + CgroupUID string } func main() { @@ -507,12 +533,14 @@ func main() { } tmplArgs := TemplatesContext{ - Entrypoint: append(entrypointPrefix, imageSpec.Config.Entrypoint...), - Cmd: imageSpec.Config.Cmd, - Env: imageSpec.Config.Env, - RootDiskImage: *srcImage, - InformantImage: *informant, - FileCache: *fileCache, + Entrypoint: imageSpec.Config.Entrypoint, + Cmd: imageSpec.Config.Cmd, + Env: imageSpec.Config.Env, + RootDiskImage: *srcImage, + MonitorImage: *monitor, + FileCache: *fileCache, + EnableMonitor: *enableMonitor, + CgroupUID: *cgroupUID, } if len(imageSpec.Config.User) != 0 { @@ -531,9 +559,9 @@ func main() { }{ {"Dockerfile", dockerfileVmBuilder}, {"vmstart", scriptVmStart}, + {"vmshutdown", scriptVmShutdown}, {"inittab", scriptInitTab}, {"vmacpi", scriptVmAcpi}, - {"powerdown", scriptPowerDown}, {"vminit", scriptVmInit}, {"cgconfig.conf", configCgroup}, {"vector.yaml", 
configVector}, diff --git a/pkg/agent/billing/billing.go b/pkg/agent/billing/billing.go index 7243c77b4..6a72476a0 100644 --- a/pkg/agent/billing/billing.go +++ b/pkg/agent/billing/billing.go @@ -19,12 +19,14 @@ import ( ) type Config struct { - URL string `json:"url"` - CPUMetricName string `json:"cpuMetricName"` - ActiveTimeMetricName string `json:"activeTimeMetricName"` - CollectEverySeconds uint `json:"collectEverySeconds"` - PushEverySeconds uint `json:"pushEverySeconds"` - PushTimeoutSeconds uint `json:"pushTimeoutSeconds"` + URL string `json:"url"` + CPUMetricName string `json:"cpuMetricName"` + ActiveTimeMetricName string `json:"activeTimeMetricName"` + CollectEverySeconds uint `json:"collectEverySeconds"` + AccumulateEverySeconds uint `json:"accumulateEverySeconds"` + PushEverySeconds uint `json:"pushEverySeconds"` + PushRequestTimeoutSeconds uint `json:"pushRequestTimeoutSeconds"` + MaxBatchSize uint `json:"maxBatchSize"` } type metricsState struct { @@ -85,8 +87,8 @@ func RunBillingMetricsCollector( defer collectTicker.Stop() // Offset by half a second, so it's a bit more deterministic. time.Sleep(500 * time.Millisecond) - pushTicker := time.NewTicker(time.Second * time.Duration(conf.PushEverySeconds)) - defer pushTicker.Stop() + accumulateTicker := time.NewTicker(time.Second * time.Duration(conf.AccumulateEverySeconds)) + defer accumulateTicker.Stop() state := metricsState{ historical: make(map[metricsKey]vmMetricsHistory), @@ -95,8 +97,25 @@ func RunBillingMetricsCollector( pushWindowStart: time.Now(), } - state.collect(conf, store, metrics) - batch := client.NewBatch() + queueWriter, queueReader := newEventQueue[*billing.IncrementalEvent](metrics.queueSizeCurrent) + + // Start the sender + signalDone, thisThreadFinished := util.NewCondChannelPair() + defer signalDone.Send() + sender := eventSender{ + client: client, + config: conf, + metrics: metrics, + queue: queueReader, + collectorFinished: thisThreadFinished, + lastSendDuration: 0, + } + go sender.senderLoop(logger.Named("send")) + + // The rest of this function is to do with collection + logger = logger.Named("collect") + + state.collect(logger, conf, store, metrics) for { select { @@ -106,40 +125,17 @@ func RunBillingMetricsCollector( err := errors.New("VM store stopped but background context is still live") logger.Panic("Validation check failed", zap.Error(err)) } - state.collect(conf, store, metrics) - case <-pushTicker.C: + state.collect(logger, conf, store, metrics) + case <-accumulateTicker.C: logger.Info("Creating billing batch") - state.drainAppendToBatch(logger, conf, batch) - metrics.batchSizeCurrent.Set(float64(batch.Count())) - logger.Info("Pushing billing events", zap.Int("count", batch.Count())) - _ = logger.Sync() // Sync before making the network request, so we guarantee logs for the action - if err := pushBillingEvents(conf, batch); err != nil { - metrics.sendErrorsTotal.Inc() - logger.Error("Failed to push billing events", zap.Error(err)) - continue - } - // Sending was successful; clear the batch. - // - // Don't reset metrics.batchSizeCurrent because it stores the *most recent* batch size. - // (The "current" suffix refers to the fact the metric is a gague, not a counter) - batch = client.NewBatch() + state.drainEnqueue(logger, conf, client.Hostname(), queueWriter) case <-backgroundCtx.Done(): - // If we're being shut down, push the latests events we have before returning. 
- logger.Info("Creating final billing batch") - state.drainAppendToBatch(logger, conf, batch) - metrics.batchSizeCurrent.Set(float64(batch.Count())) - logger.Info("Pushing final billing events", zap.Int("count", batch.Count())) - _ = logger.Sync() // Sync before making the network request, so we guarantee logs for the action - if err := pushBillingEvents(conf, batch); err != nil { - metrics.sendErrorsTotal.Inc() - logger.Error("Failed to push billing events", zap.Error(err)) - } return } } } -func (s *metricsState) collect(conf *Config, store VMStoreForNode, metrics PromMetrics) { +func (s *metricsState) collect(logger *zap.Logger, conf *Config, store VMStoreForNode, metrics PromMetrics) { now := time.Now() metricsBatch := metrics.forBatch() @@ -147,9 +143,14 @@ func (s *metricsState) collect(conf *Config, store VMStoreForNode, metrics PromM old := s.present s.present = make(map[metricsKey]vmMetricsInstant) - vmsOnThisNode := store.ListIndexed(func(i *VMNodeIndex) []*vmapi.VirtualMachine { - return i.List() - }) + var vmsOnThisNode []*vmapi.VirtualMachine + if store.Failing() { + logger.Error("VM store is currently stopped. No events will be recorded") + } else { + vmsOnThisNode = store.ListIndexed(func(i *VMNodeIndex) []*vmapi.VirtualMachine { + return i.List() + }) + } for _, vm := range vmsOnThisNode { endpointID, isEndpoint := vm.Labels[EndpointLabel] metricsBatch.inc(isEndpointFlag(isEndpoint), autoscalingEnabledFlag(api.HasAutoscalingEnabled(vm)), vm.Status.Phase) @@ -248,9 +249,10 @@ func (s *metricsTimeSlice) tryMerge(next metricsTimeSlice) bool { return merged } -func logAddedEvent(logger *zap.Logger, event billing.IncrementalEvent) billing.IncrementalEvent { +func logAddedEvent(logger *zap.Logger, event *billing.IncrementalEvent) *billing.IncrementalEvent { logger.Info( "Adding event to batch", + zap.String("IdempotencyKey", event.IdempotencyKey), zap.String("EndpointID", event.EndpointID), zap.String("MetricName", event.MetricName), zap.Int("Value", event.Value), @@ -258,42 +260,35 @@ func logAddedEvent(logger *zap.Logger, event billing.IncrementalEvent) billing.I return event } -// drainAppendToBatch clears the current history, adding it as events to the batch -func (s *metricsState) drainAppendToBatch(logger *zap.Logger, conf *Config, batch *billing.Batch) { +// drainEnqueue clears the current history, adding it as events to the queue +func (s *metricsState) drainEnqueue(logger *zap.Logger, conf *Config, hostname string, queue eventQueuePusher[*billing.IncrementalEvent]) { now := time.Now() for key, history := range s.historical { history.finalizeCurrentTimeSlice() - batch.AddIncrementalEvent(logAddedEvent(logger, billing.IncrementalEvent{ + queue.enqueue(logAddedEvent(logger, billing.Enrich(hostname, &billing.IncrementalEvent{ MetricName: conf.CPUMetricName, - Type: "", // set in batch method - IdempotencyKey: "", // set in batch method + Type: "", // set by billing.Enrich + IdempotencyKey: "", // set by billing.Enrich EndpointID: key.endpointID, // TODO: maybe we should store start/stop time in the vmMetricsHistory object itself? // That way we can be aligned to collection, rather than pushing. 
StartTime: s.pushWindowStart, StopTime: now, Value: int(math.Round(history.total.cpu)), - })) - batch.AddIncrementalEvent(logAddedEvent(logger, billing.IncrementalEvent{ + }))) + queue.enqueue(logAddedEvent(logger, billing.Enrich(hostname, &billing.IncrementalEvent{ MetricName: conf.ActiveTimeMetricName, - Type: "", // set in batch method - IdempotencyKey: "", // set in batch method + Type: "", // set by billing.Enrich + IdempotencyKey: "", // set by billing.Enrich EndpointID: key.endpointID, StartTime: s.pushWindowStart, StopTime: now, Value: int(math.Round(history.total.activeTime.Seconds())), - })) + }))) } s.pushWindowStart = now s.historical = make(map[metricsKey]vmMetricsHistory) } - -func pushBillingEvents(conf *Config, batch *billing.Batch) error { - ctx, cancel := context.WithTimeout(context.TODO(), time.Second*time.Duration(conf.PushTimeoutSeconds)) - defer cancel() - - return batch.Send(ctx) -} diff --git a/pkg/agent/billing/prommetrics.go b/pkg/agent/billing/prommetrics.go index be973ebca..9e7a3f503 100644 --- a/pkg/agent/billing/prommetrics.go +++ b/pkg/agent/billing/prommetrics.go @@ -13,7 +13,8 @@ import ( type PromMetrics struct { vmsProcessedTotal *prometheus.CounterVec vmsCurrent *prometheus.GaugeVec - batchSizeCurrent prometheus.Gauge + queueSizeCurrent prometheus.Gauge + lastSendDuration prometheus.Gauge sendErrorsTotal prometheus.Counter } @@ -33,10 +34,16 @@ func NewPromMetrics() PromMetrics { }, []string{"is_endpoint", "autoscaling_enabled", "phase"}, ), - batchSizeCurrent: prometheus.NewGauge( + queueSizeCurrent: prometheus.NewGauge( prometheus.GaugeOpts{ - Name: "autoscaling_agent_billing_batch_size", - Help: "Size of the billing subsystem's most recent batch", + Name: "autoscaling_agent_billing_queue_size", + Help: "Size of the billing subsystem's queue of unsent events", + }, + ), + lastSendDuration: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "autoscaling_agent_billing_last_send_duration_seconds", + Help: "Duration, in seconds, that it took to send the latest set of billing events (or current time if ongoing)", }, ), sendErrorsTotal: prometheus.NewCounter( @@ -51,7 +58,7 @@ func NewPromMetrics() PromMetrics { func (m PromMetrics) MustRegister(reg *prometheus.Registry) { reg.MustRegister(m.vmsProcessedTotal) reg.MustRegister(m.vmsCurrent) - reg.MustRegister(m.batchSizeCurrent) + reg.MustRegister(m.queueSizeCurrent) reg.MustRegister(m.sendErrorsTotal) } diff --git a/pkg/agent/billing/queue.go b/pkg/agent/billing/queue.go new file mode 100644 index 000000000..1e3f5db00 --- /dev/null +++ b/pkg/agent/billing/queue.go @@ -0,0 +1,79 @@ +package billing + +// Implementation of the event queue for mediating event generation and event sending. +// +// The "public" (ish - it's all one package) types are eventQueuePuller and eventQueuePusher, two +// halves of the same queue. Each half is only safe for use from a single thread, but *together* +// they can be used in separate threads. + +import ( + "sync" + + "github.com/prometheus/client_golang/prometheus" + "golang.org/x/exp/slices" + + "github.com/neondatabase/autoscaling/pkg/util" +) + +// this is generic just so there's less typing - "billing.IncrementalEvent" is long! 
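// Intended usage, roughly (editor's sketch; maxBatchSize and sendSucceeded are illustrative
// placeholders, and the real sender lives in send.go):
//
//	push, pull := newEventQueue[*billing.IncrementalEvent](metrics.queueSizeCurrent)
//
//	// collector goroutine:
//	push.enqueue(event)
//
//	// sender goroutine:
//	chunk := pull.get(maxBatchSize) // peek at up to maxBatchSize queued events
//	if sendSucceeded(chunk) {
//		pull.drop(len(chunk)) // remove them only once the send has succeeded
//	}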
+type eventQueueInternals[E any] struct { + mu sync.Mutex + items []E + sizeGauge prometheus.Gauge +} + +type eventQueuePuller[E any] struct { + internals *eventQueueInternals[E] +} + +type eventQueuePusher[E any] struct { + internals *eventQueueInternals[E] +} + +func newEventQueue[E any](sizeGauge prometheus.Gauge) (eventQueuePusher[E], eventQueuePuller[E]) { + internals := &eventQueueInternals[E]{ + mu: sync.Mutex{}, + items: nil, + sizeGauge: sizeGauge, + } + return eventQueuePusher[E]{internals}, eventQueuePuller[E]{internals} +} + +// NB: must hold mu +func (qi *eventQueueInternals[E]) updateGauge() { + qi.sizeGauge.Set(float64(len(qi.items))) +} + +func (q eventQueuePusher[E]) enqueue(events ...E) { + q.internals.mu.Lock() + defer q.internals.mu.Unlock() + + q.internals.items = append(q.internals.items, events...) + q.internals.updateGauge() +} + +func (q eventQueuePuller[E]) size() int { + q.internals.mu.Lock() + defer q.internals.mu.Unlock() + + return len(q.internals.items) +} + +func (q eventQueuePuller[E]) get(limit int) []E { + q.internals.mu.Lock() + defer q.internals.mu.Unlock() + + count := util.Min(limit, len(q.internals.items)) + // NOTE: this kind of access escaping the mutex is only sound because this access is only + // granted to the puller, and there's only one puller, and it isn't sound to use the output of a + // previous get() after calling drop(). + return q.internals.items[:count] +} + +func (q eventQueuePuller[E]) drop(count int) { + q.internals.mu.Lock() + defer q.internals.mu.Unlock() + + q.internals.items = slices.Replace(q.internals.items, 0, count) + q.internals.updateGauge() +} diff --git a/pkg/agent/billing/send.go b/pkg/agent/billing/send.go new file mode 100644 index 000000000..6a1751fe6 --- /dev/null +++ b/pkg/agent/billing/send.go @@ -0,0 +1,161 @@ +package billing + +// Logic responsible for sending billing events by repeatedly pulling from the eventQueue + +import ( + "context" + "time" + + "go.uber.org/zap" + + "github.com/neondatabase/autoscaling/pkg/billing" + "github.com/neondatabase/autoscaling/pkg/util" +) + +type eventSender struct { + client billing.Client + config *Config + metrics PromMetrics + queue eventQueuePuller[*billing.IncrementalEvent] + collectorFinished util.CondChannelReceiver + + // lastSendDuration tracks the "real" last full duration of (eventSender).sendAllCurrentEvents(). + // + // It's separate from metrics.lastSendDuration because (a) we'd like to include the duration of + // ongoing calls to sendAllCurrentEvents, but (b) we don't want the bias towards lower durations + // that comes with that. + // + // Here's some more detail: + // + // To make sure that long-running sendAllCurrentEvents() loops show up in the metrics while + // they're still running, we want to periodically update metrics.lastSendDuration before the + // loop has finished. A side-effect of doing this naively is that the gauge will sometimes + // return durations that are much shorter than the *actual* previous send loop duration. + // + // In order to fix this, we store that *actual* previous duration in this field, but and only + // update the metric when either (a) the loop is done, or (b) the duration so far is already + // longer than the previous one. + // + // This means that we remove the bias towards shorter durations, at the expense of sometimes + // returning higher durations for too long. IMO that's ok, and we'd rather have our metrics give + // a pessimistic but more accurate view. 
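The pusher/puller split above is easiest to see with a small usage sketch. The following is a hypothetical package-internal test (not part of this change; the test name and gauge name are made up) showing the intended flow: the collector enqueues through the pusher, the sender peeks with `get`, and only `drop`s items once they have actually been sent.

```go
package billing

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
)

// TestEventQueueSketch is a hypothetical test demonstrating the two halves of the queue.
func TestEventQueueSketch(t *testing.T) {
	gauge := prometheus.NewGauge(prometheus.GaugeOpts{Name: "example_queue_size"})
	push, pull := newEventQueue[int](gauge)

	// collector side: add events
	push.enqueue(1, 2, 3)
	if pull.size() != 3 {
		t.Fatalf("expected 3 queued items, got %d", pull.size())
	}

	// sender side: peek at up to 2 items without removing them ...
	chunk := pull.get(2)
	// ... send chunk somewhere ...
	// ... and only drop them once the send succeeded.
	pull.drop(len(chunk))

	if pull.size() != 1 {
		t.Fatalf("expected 1 remaining item, got %d", pull.size())
	}
}
```

As the comment in `get` notes, the slice returned by `get` must not be reused after `drop` has been called.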
+ lastSendDuration time.Duration +} + +func (s eventSender) senderLoop(logger *zap.Logger) { + ticker := time.NewTicker(time.Second * time.Duration(s.config.PushEverySeconds)) + defer ticker.Stop() + + for { + final := false + + select { + case <-s.collectorFinished.Recv(): + logger.Info("Received notification that collector finished") + final = true + case <-ticker.C: + } + + s.sendAllCurrentEvents(logger) + + if final { + logger.Info("Ending events sender loop") + return + } + } +} + +func (s eventSender) sendAllCurrentEvents(logger *zap.Logger) { + logger.Info("Pushing all available events") + + if s.queue.size() == 0 { + logger.Info("No billing events to push") + s.lastSendDuration = 0 + s.metrics.lastSendDuration.Set(1e-6) // small value, to indicate that nothing happened + return + } + + total := 0 + startTime := time.Now() + + // while there's still events in the queue, send them + // + // If events are being added to the queue faster than we can send them, this loop will not + // terminate. For the most part, that's ok: worst-case, we miss the collectorFinished + // notification, which isn't the end of the world. Any long-running call to this function will + // be reported by s.metrics.lastSendDuration as we go (provided the request timeout isn't too + // long). + for { + if size := s.queue.size(); size != 0 { + logger.Info("Current queue size is non-zero", zap.Int("queueSize", size)) + } + + chunk := s.queue.get(int(s.config.MaxBatchSize)) + count := len(chunk) + if count == 0 { + totalTime := time.Since(startTime) + s.lastSendDuration = totalTime + s.metrics.lastSendDuration.Set(totalTime.Seconds()) + + logger.Info( + "All available events have been sent", + zap.Int("total", total), + zap.Duration("totalTime", totalTime), + ) + return + } + + traceID := s.client.GenerateTraceID() + + logger.Info( + "Pushing billing events", + zap.String("traceID", string(traceID)), + zap.Int("count", count), + ) + + reqStart := time.Now() + err := func() error { + reqCtx, cancel := context.WithTimeout(context.TODO(), time.Second*time.Duration(s.config.PushRequestTimeoutSeconds)) + defer cancel() + + return billing.Send(reqCtx, s.client, traceID, chunk) + }() + reqDuration := time.Since(reqStart) + + if err != nil { + // Something went wrong and we're going to abandon attempting to push any further + // events. + logger.Error( + "Failed to push billing events", + zap.String("traceID", string(traceID)), + zap.Int("count", count), + zap.Duration("after", reqDuration), + zap.Int("total", total), + zap.Duration("totalTime", time.Since(startTime)), + zap.Error(err), + ) + s.metrics.sendErrorsTotal.Inc() + s.lastSendDuration = 0 + s.metrics.lastSendDuration.Set(0.0) // use 0 as a flag that something went wrong; there's no valid time here. 
+ return + } + + s.queue.drop(count) // mark len(chunk) as successfully processed + total += len(chunk) + currentTotalTime := time.Since(startTime) + + logger.Info( + "Successfully pushed some billing events", + zap.String("traceID", string(traceID)), + zap.Int("count", count), + zap.Duration("after", reqDuration), + zap.Int("total", total), + zap.Duration("totalTime", currentTotalTime), + ) + + if currentTotalTime > s.lastSendDuration { + s.lastSendDuration = currentTotalTime + s.metrics.lastSendDuration.Set(currentTotalTime.Seconds()) + } + } +} diff --git a/pkg/agent/config.go b/pkg/agent/config.go index 8ab7d0329..ae13327e2 100644 --- a/pkg/agent/config.go +++ b/pkg/agent/config.go @@ -14,12 +14,41 @@ import ( type Config struct { DumpState *DumpStateConfig `json:"dumpState"` Scaling ScalingConfig `json:"scaling"` - Informant InformantConfig `json:"informant"` Metrics MetricsConfig `json:"metrics"` Scheduler SchedulerConfig `json:"scheduler"` + Monitor MonitorConfig `json:"monitor"` Billing *billing.Config `json:"billing,omitempty"` } +type MonitorConfig struct { + ResponseTimeoutSeconds uint `json:"responseTimeoutSeconds"` + // ConnectionTimeoutSeconds gives how long we may take to connect to the + // monitor before cancelling. + ConnectionTimeoutSeconds uint `json:"connectionTimeoutSeconds"` + // ConnectionRetryMinWaitSeconds gives the minimum amount of time we must wait between attempts + // to connect to the vm-monitor, regardless of whether they're successful. + ConnectionRetryMinWaitSeconds uint `json:"connectionRetryMinWaitSeconds"` + // ServerPort is the port that the dispatcher serves from + ServerPort uint16 `json:"serverPort"` + // UnhealthyAfterSilenceDurationSeconds gives the duration, in seconds, after which failing to + // receive a successful request from the monitor indicates that it is probably unhealthy. + UnhealthyAfterSilenceDurationSeconds uint `json:"unhealthyAfterSilenceDurationSeconds"` + // UnhealthyStartupGracePeriodSeconds gives the duration, in seconds, after which we will no + // longer excuse total VM monitor failures - i.e. when unhealthyAfterSilenceDurationSeconds + // kicks in. + UnhealthyStartupGracePeriodSeconds uint `json:"unhealthyStartupGracePeriodSeconds"` + // MaxHealthCheckSequentialFailuresSeconds gives the duration, in seconds, after which we + // should restart the connection to the vm-monitor if health checks aren't succeeding. + MaxHealthCheckSequentialFailuresSeconds uint `json:"maxHealthCheckSequentialFailuresSeconds"` + + // RetryFailedRequestSeconds gives the duration, in seconds, that we must wait before retrying a + // request that previously failed. 
+ RetryFailedRequestSeconds uint `json:"retryFailedRequestSeconds"` + // RetryDeniedDownscaleSeconds gives the duration, in seconds, that we must wait before retrying + // a downscale request that was previously denied + RetryDeniedDownscaleSeconds uint `json:"retryDeniedDownscaleSeconds"` +} + // DumpStateConfig configures the endpoint to dump all internal state type DumpStateConfig struct { // Port is the port to serve on @@ -38,54 +67,10 @@ type ScalingConfig struct { DefaultConfig api.ScalingConfig `json:"defaultConfig"` } -type InformantConfig struct { - // ServerPort is the port that the VM informant serves from - ServerPort uint16 `json:"serverPort"` - - // RetryServerMinWaitSeconds gives the minimum duration, in seconds, that we must wait between the - // start of one InformantServer and the next - // - // This "minimum wait" is only used when thethe - RetryServerMinWaitSeconds uint `json:"retryServerMinWaitSeconds"` - // RetryServerNormalWaitSeconds gives the typical duration, in seconds, that we wait between an - // InformantServer failing and our retry. - RetryServerNormalWaitSeconds uint `json:"retryServerNormalWaitSeconds"` - // RegisterRetrySeconds gives the duration, in seconds, to wait between retrying a failed - // register request. - RegisterRetrySeconds uint `json:"registerRetrySeconds"` - - // RetryFailedRequestSeconds gives the duration, in seconds, that we must wait before retrying a - // request that previously failed. - RetryFailedRequestSeconds uint `json:"retryFailedRequestSeconds"` - // RetryDeniedDownscaleSeconds gives the duration, in seconds, that we must wait before retrying - // a downscale request that was previously denied - RetryDeniedDownscaleSeconds uint `json:"retryDeniedDownscaleSeconds"` - - // RequestTimeoutSeconds gives the timeout for any individual request to the informant, except - // for those with separately-defined values below. - RequestTimeoutSeconds uint `json:"requestTimeoutSeconds"` - // RegisterTimeoutSeconds gives the timeout duration, in seconds, for a register request. - // - // This is a separate field from RequestTimeoutSeconds because registering may require that the - // informant suspend a previous agent, which could take longer. - RegisterTimeoutSeconds uint `json:"registerTimeoutSeconds"` - // DownscaleTimeoutSeconds gives the timeout duration, in seconds, for a downscale request. - // - // This is a separate field from RequestTimeoutSeconds it's possible that downscaling may - // require some non-trivial work that we want to allow to complete. - DownscaleTimeoutSeconds uint `json:"downscaleTimeoutSeconds"` - - // UnhealthyAfterSilenceDurationSeconds gives the duration, in seconds, after which failing to - // receive a successful request from the informant indicates that it is probably unhealthy. - UnhealthyAfterSilenceDurationSeconds uint `json:"unhealthyAfterSilenceDurationSeconds"` - // UnhealthyStartupGracePeriodSeconds gives the duration, in seconds, after which we will no - // longer excuse total VM informant failures - i.e. when unhealthyAfterSilenceDurationSeconds - // kicks in. - UnhealthyStartupGracePeriodSeconds uint `json:"unhealthyStartupGracePeriodSeconds"` -} - // MetricsConfig defines a few parameters for metrics requests to the VM type MetricsConfig struct { + // Port is the port that VMs are expected to provide metrics on + Port uint16 `json:"port"` // LoadMetricPrefix is the prefix at the beginning of the load metrics that we use. 
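For reference, here is a minimal sketch of the new monitor config block, written as a Go literal and marshalled to show the camelCase keys that the struct tags above define. The numeric values are placeholders rather than recommended defaults, and each of them must be non-zero to pass the validation changes further down.

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/neondatabase/autoscaling/pkg/agent"
)

func main() {
	// Placeholder values, for illustration only.
	monitor := agent.MonitorConfig{
		ResponseTimeoutSeconds:                  5,
		ConnectionTimeoutSeconds:                4,
		ConnectionRetryMinWaitSeconds:           5,
		ServerPort:                              10301,
		UnhealthyAfterSilenceDurationSeconds:    20,
		UnhealthyStartupGracePeriodSeconds:      20,
		MaxHealthCheckSequentialFailuresSeconds: 30,
		RetryFailedRequestSeconds:               3,
		RetryDeniedDownscaleSeconds:             30,
	}
	out, err := json.MarshalIndent(monitor, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out)) // prints the camelCase keys defined by the struct tags
}
```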
For // node_exporter, this is "node_", and for vector it's "host_" LoadMetricPrefix string `json:"loadMetricPrefix"` @@ -141,35 +126,34 @@ func (c *Config) validate() error { zeroTmpl = "field %q cannot be zero" ) + erc.Whenf(ec, c.Billing != nil && c.Billing.ActiveTimeMetricName == "", emptyTmpl, ".billing.activeTimeMetricName") + erc.Whenf(ec, c.Billing != nil && c.Billing.CPUMetricName == "", emptyTmpl, ".billing.cpuMetricName") + erc.Whenf(ec, c.Billing != nil && c.Billing.CollectEverySeconds == 0, zeroTmpl, ".billing.collectEverySeconds") + erc.Whenf(ec, c.Billing != nil && c.Billing.AccumulateEverySeconds == 0, zeroTmpl, ".billing.accumulateEverySeconds") + erc.Whenf(ec, c.Billing != nil && c.Billing.PushEverySeconds == 0, zeroTmpl, ".billing.pushEverySeconds") + erc.Whenf(ec, c.Billing != nil && c.Billing.PushRequestTimeoutSeconds == 0, zeroTmpl, ".billing.pushRequestTimeoutSeconds") + erc.Whenf(ec, c.Billing != nil && c.Billing.MaxBatchSize == 0, zeroTmpl, ".billing.maxBatchSize") + erc.Whenf(ec, c.Billing != nil && c.Billing.URL == "", emptyTmpl, ".billing.url") erc.Whenf(ec, c.DumpState != nil && c.DumpState.Port == 0, zeroTmpl, ".dumpState.port") erc.Whenf(ec, c.DumpState != nil && c.DumpState.TimeoutSeconds == 0, zeroTmpl, ".dumpState.timeoutSeconds") + erc.Whenf(ec, c.Metrics.Port == 0, zeroTmpl, ".metrics.port") + erc.Whenf(ec, c.Metrics.LoadMetricPrefix == "", emptyTmpl, ".metrics.loadMetricPrefix") + erc.Whenf(ec, c.Metrics.SecondsBetweenRequests == 0, zeroTmpl, ".metrics.secondsBetweenRequests") erc.Whenf(ec, c.Scaling.RequestTimeoutSeconds == 0, zeroTmpl, ".scaling.requestTimeoutSeconds") + erc.Whenf(ec, c.Monitor.ResponseTimeoutSeconds == 0, zeroTmpl, ".monitor.responseTimeoutSeconds") + erc.Whenf(ec, c.Monitor.ConnectionTimeoutSeconds == 0, zeroTmpl, ".monitor.connectionTimeoutSeconds") + erc.Whenf(ec, c.Monitor.ConnectionRetryMinWaitSeconds == 0, zeroTmpl, ".monitor.connectionRetryMinWaitSeconds") + erc.Whenf(ec, c.Monitor.ServerPort == 0, zeroTmpl, ".monitor.serverPort") + erc.Whenf(ec, c.Monitor.UnhealthyAfterSilenceDurationSeconds == 0, zeroTmpl, ".monitor.unhealthyAfterSilenceDurationSeconds") + erc.Whenf(ec, c.Monitor.UnhealthyStartupGracePeriodSeconds == 0, zeroTmpl, ".monitor.unhealthyStartupGracePeriodSeconds") + erc.Whenf(ec, c.Monitor.MaxHealthCheckSequentialFailuresSeconds == 0, zeroTmpl, ".monitor.maxHealthCheckSequentialFailuresSeconds") + erc.Whenf(ec, c.Monitor.RetryFailedRequestSeconds == 0, zeroTmpl, ".monitor.retryFailedRequestSeconds") + erc.Whenf(ec, c.Monitor.RetryDeniedDownscaleSeconds == 0, zeroTmpl, ".monitor.retryDeniedDownscaleSeconds") // add all errors if there are any: https://github.com/neondatabase/autoscaling/pull/195#discussion_r1170893494 ec.Add(c.Scaling.DefaultConfig.Validate()) - erc.Whenf(ec, c.Informant.ServerPort == 0, zeroTmpl, ".informant.serverPort") - erc.Whenf(ec, c.Informant.RetryServerMinWaitSeconds == 0, zeroTmpl, ".informant.retryServerMinWaitSeconds") - erc.Whenf(ec, c.Informant.RetryServerNormalWaitSeconds == 0, zeroTmpl, ".informant.retryServerNormalWaitSeconds") - erc.Whenf(ec, c.Informant.RegisterRetrySeconds == 0, zeroTmpl, ".informant.registerRetrySeconds") - erc.Whenf(ec, c.Informant.RetryFailedRequestSeconds == 0, zeroTmpl, ".informant.retryFailedRequestSeconds") - erc.Whenf(ec, c.Informant.RetryDeniedDownscaleSeconds == 0, zeroTmpl, ".informant.retryDeniedDownscaleSeconds") - erc.Whenf(ec, c.Informant.RequestTimeoutSeconds == 0, zeroTmpl, ".informant.requestTimeoutSeconds") - erc.Whenf(ec, 
c.Informant.RegisterTimeoutSeconds == 0, zeroTmpl, ".informant.registerTimeoutSeconds") - erc.Whenf(ec, c.Informant.DownscaleTimeoutSeconds == 0, zeroTmpl, ".informant.downscaleTimeoutSeconds") - erc.Whenf(ec, c.Informant.UnhealthyAfterSilenceDurationSeconds == 0, zeroTmpl, ".informant.unhealthyAfterSilenceDurationSeconds") - erc.Whenf(ec, c.Informant.UnhealthyStartupGracePeriodSeconds == 0, zeroTmpl, ".informant.unhealthyStartupGracePeriodSeconds") - erc.Whenf(ec, c.Metrics.LoadMetricPrefix == "", emptyTmpl, ".metrics.loadMetricPrefix") - erc.Whenf(ec, c.Metrics.RequestTimeoutSeconds == 0, zeroTmpl, ".metrics.requestTimeoutSeconds") - erc.Whenf(ec, c.Metrics.SecondsBetweenRequests == 0, zeroTmpl, ".metrics.secondsBetweenRequests") - erc.Whenf(ec, c.Scheduler.SchedulerName == "", emptyTmpl, ".scheduler.schedulerName") - // note: c.Scheduler.RequestTimeoutSeconds == 0 is valid - erc.Whenf(ec, c.Scheduler.RequestAtLeastEverySeconds == 0, zeroTmpl, ".scheduler.requestAtLeastEverySeconds") erc.Whenf(ec, c.Scheduler.RequestPort == 0, zeroTmpl, ".scheduler.requestPort") - erc.Whenf(ec, c.Billing != nil && c.Billing.URL == "", emptyTmpl, ".billing.url") - erc.Whenf(ec, c.Billing != nil && c.Billing.CPUMetricName == "", emptyTmpl, ".billing.cpuMetricName") - erc.Whenf(ec, c.Billing != nil && c.Billing.ActiveTimeMetricName == "", emptyTmpl, ".billing.activeTimeMetricName") - erc.Whenf(ec, c.Billing != nil && c.Billing.CollectEverySeconds == 0, zeroTmpl, ".billing.collectEverySeconds") - erc.Whenf(ec, c.Billing != nil && c.Billing.PushEverySeconds == 0, zeroTmpl, ".billing.pushEverySeconds") - erc.Whenf(ec, c.Billing != nil && c.Billing.PushTimeoutSeconds == 0, zeroTmpl, ".billing.pushTimeoutSeconds") + erc.Whenf(ec, c.Scheduler.RequestTimeoutSeconds == 0, zeroTmpl, ".scheduler.requestTimeoutSeconds") + erc.Whenf(ec, c.Scheduler.SchedulerName == "", emptyTmpl, ".scheduler.schedulerName") return ec.Resolve() } diff --git a/pkg/agent/core/action.go b/pkg/agent/core/action.go index 990064d62..7314f3894 100644 --- a/pkg/agent/core/action.go +++ b/pkg/agent/core/action.go @@ -7,11 +7,11 @@ import ( ) type ActionSet struct { - Wait *ActionWait `json:"wait,omitempty"` - PluginRequest *ActionPluginRequest `json:"pluginRequest,omitempty"` - NeonVMRequest *ActionNeonVMRequest `json:"neonvmRequest,omitempty"` - InformantDownscale *ActionInformantDownscale `json:"informantDownscale,omitempty"` - InformantUpscale *ActionInformantUpscale `json:"informantUpscale,omitempty"` + Wait *ActionWait `json:"wait,omitempty"` + PluginRequest *ActionPluginRequest `json:"pluginRequest,omitempty"` + NeonVMRequest *ActionNeonVMRequest `json:"neonvmRequest,omitempty"` + MonitorDownscale *ActionMonitorDownscale `json:"monitorDownscale,omitempty"` + MonitorUpscale *ActionMonitorUpscale `json:"monitorUpscale,omitempty"` } type ActionWait struct { @@ -29,12 +29,12 @@ type ActionNeonVMRequest struct { Target api.Resources `json:"target"` } -type ActionInformantDownscale struct { +type ActionMonitorDownscale struct { Current api.Resources `json:"current"` Target api.Resources `json:"target"` } -type ActionInformantUpscale struct { +type ActionMonitorUpscale struct { Current api.Resources `json:"current"` Target api.Resources `json:"target"` } diff --git a/pkg/agent/core/dumpstate.go b/pkg/agent/core/dumpstate.go index b36874a6a..03c06dce6 100644 --- a/pkg/agent/core/dumpstate.go +++ b/pkg/agent/core/dumpstate.go @@ -19,23 +19,23 @@ func shallowCopy[T any](ptr *T) *T { // StateDump provides introspection into the current values of the 
fields of State type StateDump struct { - Config Config `json:"config"` - VM api.VmInfo `json:"vm"` - Plugin pluginStateDump `json:"plugin"` - Informant informantStateDump `json:"informant"` - NeonVM neonvmStateDump `json:"neonvm"` - Metrics *api.Metrics `json:"metrics"` + Config Config `json:"config"` + VM api.VmInfo `json:"vm"` + Plugin pluginStateDump `json:"plugin"` + Monitor monitorStateDump `json:"monitor"` + NeonVM neonvmStateDump `json:"neonvm"` + Metrics *api.Metrics `json:"metrics"` } // Dump produces a JSON-serializable representation of the State func (s *State) Dump() StateDump { return StateDump{ - Config: s.config, - VM: s.vm, - Plugin: s.plugin.dump(), - Informant: s.informant.dump(), - NeonVM: s.neonvm.dump(), - Metrics: shallowCopy(s.metrics), + Config: s.config, + VM: s.vm, + Plugin: s.plugin.dump(), + Monitor: s.monitor.dump(), + NeonVM: s.neonvm.dump(), + Metrics: shallowCopy(s.metrics), } } @@ -69,17 +69,17 @@ func (s *pluginState) dump() pluginStateDump { } } -type informantStateDump struct { - Active bool `json:"active"` - OngoingRequest *OngoingInformantRequestDump `json:"ongoingRequest"` - RequestedUpscale *requestedUpscaleDump `json:"requestedUpscale"` - DeniedDownscale *deniedDownscaleDump `json:"deniedDownscale"` - Approved *api.Resources `json:"approved"` - DownscaleFailureAt *time.Time `json:"downscaleFailureAt"` - UpscaleFailureAt *time.Time `json:"upscaleFailureAt"` +type monitorStateDump struct { + Active bool `json:"active"` + OngoingRequest *OngoingMonitorRequestDump `json:"ongoingRequest"` + RequestedUpscale *requestedUpscaleDump `json:"requestedUpscale"` + DeniedDownscale *deniedDownscaleDump `json:"deniedDownscale"` + Approved *api.Resources `json:"approved"` + DownscaleFailureAt *time.Time `json:"downscaleFailureAt"` + UpscaleFailureAt *time.Time `json:"upscaleFailureAt"` } -type OngoingInformantRequestDump struct { - Kind informantRequestKind `json:"kind"` +type OngoingMonitorRequestDump struct { + Kind monitorRequestKind `json:"kind"` } type requestedUpscaleDump struct { At time.Time `json:"at"` @@ -91,7 +91,7 @@ type deniedDownscaleDump struct { Requested api.Resources `json:"requested"` } -func (s *informantState) dump() informantStateDump { +func (s *monitorState) dump() monitorStateDump { var requestedUpscale *requestedUpscaleDump if s.requestedUpscale != nil { requestedUpscale = &requestedUpscaleDump{ @@ -109,14 +109,14 @@ func (s *informantState) dump() informantStateDump { } } - var ongoingRequest *OngoingInformantRequestDump + var ongoingRequest *OngoingMonitorRequestDump if s.ongoingRequest != nil { - ongoingRequest = &OngoingInformantRequestDump{ + ongoingRequest = &OngoingMonitorRequestDump{ Kind: s.ongoingRequest.kind, } } - return informantStateDump{ + return monitorStateDump{ Active: s.active, OngoingRequest: ongoingRequest, RequestedUpscale: requestedUpscale, diff --git a/pkg/agent/core/state.go b/pkg/agent/core/state.go index 287a92a48..637c8eb01 100644 --- a/pkg/agent/core/state.go +++ b/pkg/agent/core/state.go @@ -13,7 +13,7 @@ package core // // That said, there's still some tricky semantics we want to maintain. Internally, the // autoscaler-agent must be designed around eventual consistency, but the API we expose to the -// vm-informant is strictly synchonous. As such, there's some subtle logic to make sure that we're +// vm-monitor is strictly synchonous. As such, there's some subtle logic to make sure that we're // not violating our own guarantees. 
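One way to see the effect of the dumpstate.go rename above is to serialize a dump: the JSON now carries a "monitor" object (per the struct tags) where it previously had "informant". The helper below is hypothetical and included only for illustration.

```go
package example

import (
	"encoding/json"
	"fmt"

	"github.com/neondatabase/autoscaling/pkg/agent/core"
)

// printStateDump is a hypothetical helper: the output now contains a "monitor" key,
// with fields like "active" and "ongoingRequest", instead of the old "informant" key.
func printStateDump(state *core.State) {
	data, err := json.MarshalIndent(state.Dump(), "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(data))
}
```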
// // --- @@ -40,12 +40,12 @@ type Config struct { // plugin, even if nothing's changed. PluginRequestTick time.Duration - // InformantDeniedDownscaleCooldown gives the time we must wait between making duplicate - // downscale requests to the vm-informant where the previous failed. - InformantDeniedDownscaleCooldown time.Duration + // MonitorDeniedDownscaleCooldown gives the time we must wait between making duplicate + // downscale requests to the vm-monitor where the previous failed. + MonitorDeniedDownscaleCooldown time.Duration - // InformantRetryWait gives the amount of time to wait to retry after a *failed* request. - InformantRetryWait time.Duration + // MonitorRetryWait gives the amount of time to wait to retry after a *failed* request. + MonitorRetryWait time.Duration // Warn provides an outlet for (*State).Next() to give warnings about conditions that are // impeding its ability to execute. (e.g. "wanted to do X but couldn't because of Y") @@ -68,8 +68,8 @@ type State struct { // plugin records all state relevant to communications with the scheduler plugin plugin pluginState - // informant records all state relevant to communications with the vm-informant - informant informantState + // monitor records all state relevant to communications with the vm-monitor + monitor monitorState // neonvm records all state relevant to the NeonVM k8s API neonvm neonvmState @@ -98,15 +98,15 @@ type pluginRequested struct { resources api.Resources } -type informantState struct { - // active is true iff the agent is currently "confirmed" and not "suspended" by the informant. +type monitorState struct { + // active is true iff the agent is currently "confirmed" and not "suspended" by the monitor. // Otherwise, we shouldn't be making any kind of scaling requests. active bool - ongoingRequest *ongoingInformantRequest + ongoingRequest *ongoingMonitorRequest // requestedUpscale, if not nil, stores the most recent *unresolved* upscaling requested by the - // vm-informant, along with the time at which it occurred. + // vm-monitor, along with the time at which it occurred. requestedUpscale *requestedUpscale // deniedDownscale, if not nil, stores the result of the lastest denied /downscale request. @@ -120,15 +120,15 @@ type informantState struct { upscaleFailureAt *time.Time } -type ongoingInformantRequest struct { - kind informantRequestKind +type ongoingMonitorRequest struct { + kind monitorRequestKind } -type informantRequestKind string +type monitorRequestKind string const ( - informantRequestKindDownscale informantRequestKind = "downscale" - informantRequestKindUpscale informantRequestKind = "upscale" + monitorRequestKindDownscale monitorRequestKind = "downscale" + monitorRequestKindUpscale monitorRequestKind = "upscale" ) type requestedUpscale struct { @@ -160,7 +160,7 @@ func NewState(vm api.VmInfo, config Config) *State { lastRequest: nil, permit: nil, }, - informant: informantState{ + monitor: monitorState{ active: false, ongoingRequest: nil, requestedUpscale: nil, @@ -185,30 +185,12 @@ func (s *State) NextActions(now time.Time) ActionSet { using := s.vm.Using() - var desiredResources api.Resources + desiredResources := s.DesiredResourcesFromMetricsOrRequestedUpscaling() - if s.informant.active { - desiredResources = s.desiredResourcesFromMetricsOrRequestedUpscaling() - } else { - // If we're not deemed "active" by the informant, then we shouldn't be making any kind of - // scaling requests on its behalf. 
- // - // We'll still talk to the scheduler to inform it about the current resource usage though, - // to mitigate any reliability issues - much of the informant is built (as of 2023-07-09) - // under the assumption that we could, in theory, have multiple autoscaler-agents on the - // same node at the same time. That's... not really true, so an informant that isn't - // "active" is more likely to just be crash-looping due to a bug. - // - // *In theory* if we had mutliple autoscaler-agents talking to a single informant, this - // would be incorrect; we'd override another one's scaling requests. But this should be - // fine. - desiredResources = using - } - - desiredResourcesApprovedByInformant := s.boundResourcesByInformantApproved(desiredResources) + desiredResourcesApprovedByMonitor := s.boundResourcesByMonitorApproved(desiredResources) desiredResourcesApprovedByPlugin := s.boundResourcesByPluginApproved(desiredResources) - // NB: informant approved provides a lower bound - approvedDesiredResources := desiredResourcesApprovedByPlugin.Max(desiredResourcesApprovedByInformant) + // NB: monitor approved provides a lower bound + approvedDesiredResources := desiredResourcesApprovedByPlugin.Max(desiredResourcesApprovedByMonitor) ongoingNeonVMRequest := s.neonvm.ongoingRequested != nil @@ -248,7 +230,7 @@ func (s *State) NextActions(now time.Time) ActionSet { // ... Otherwise, we should try requesting something new form it. actions.PluginRequest = &ActionPluginRequest{ LastPermit: s.plugin.permit, - Target: desiredResourcesApprovedByInformant, + Target: desiredResourcesApprovedByMonitor, Metrics: s.metrics, } } @@ -285,63 +267,63 @@ func (s *State) NextActions(now time.Time) ActionSet { } } - // We should make an upscale request to the informant if we've upscaled and the informant + // We should make an upscale request to the monitor if we've upscaled and the monitor // doesn't know about it. 
- wantInformantUpscaleRequest := s.informant.approved != nil && *s.informant.approved != desiredResources.Max(*s.informant.approved) + wantMonitorUpscaleRequest := s.monitor.approved != nil && *s.monitor.approved != desiredResources.Max(*s.monitor.approved) // However, we may need to wait before retrying (or for any ongoing requests to finish) - makeInformantUpscaleRequest := wantInformantUpscaleRequest && - s.informant.active && - s.informant.ongoingRequest == nil && - (s.informant.upscaleFailureAt == nil || - now.Sub(*s.informant.upscaleFailureAt) >= s.config.InformantRetryWait) - if wantInformantUpscaleRequest { - if makeInformantUpscaleRequest { - actions.InformantUpscale = &ActionInformantUpscale{ - Current: *s.informant.approved, - Target: desiredResources.Max(*s.informant.approved), + makeMonitorUpscaleRequest := wantMonitorUpscaleRequest && + s.monitor.active && + s.monitor.ongoingRequest == nil && + (s.monitor.upscaleFailureAt == nil || + now.Sub(*s.monitor.upscaleFailureAt) >= s.config.MonitorRetryWait) + if wantMonitorUpscaleRequest { + if makeMonitorUpscaleRequest { + actions.MonitorUpscale = &ActionMonitorUpscale{ + Current: *s.monitor.approved, + Target: desiredResources.Max(*s.monitor.approved), } - } else if !s.informant.active { + } else if !s.monitor.active { s.config.Warn("Wanted to send informant upscale request, but not active") - } else if s.informant.ongoingRequest != nil && s.informant.ongoingRequest.kind != informantRequestKindUpscale { - s.config.Warn("Wanted to send informant upscale request, but waiting other ongoing %s request", s.informant.ongoingRequest.kind) - } else if s.informant.ongoingRequest == nil { + } else if s.monitor.ongoingRequest != nil && s.monitor.ongoingRequest.kind != monitorRequestKindUpscale { + s.config.Warn("Wanted to send informant upscale request, but waiting other ongoing %s request", s.monitor.ongoingRequest.kind) + } else if s.monitor.ongoingRequest == nil { s.config.Warn("Wanted to send informant upscale request, but waiting on retry rate limit") } } - // We should make a downscale request to the informant if we want to downscale but haven't been + // We should make a downscale request to the monitor if we want to downscale but haven't been // approved for it. 
- var resourcesForInformantDownscale api.Resources - if s.informant.approved != nil { - resourcesForInformantDownscale = desiredResources.Min(*s.informant.approved) + var resourcesForMonitorDownscale api.Resources + if s.monitor.approved != nil { + resourcesForMonitorDownscale = desiredResources.Min(*s.monitor.approved) } else { - resourcesForInformantDownscale = desiredResources.Min(using) + resourcesForMonitorDownscale = desiredResources.Min(using) } - wantInformantDownscaleRequest := s.informant.approved != nil && *s.informant.approved != resourcesForInformantDownscale - if s.informant.approved == nil && resourcesForInformantDownscale != using { + wantMonitorDownscaleRequest := s.monitor.approved != nil && *s.monitor.approved != resourcesForMonitorDownscale + if s.monitor.approved == nil && resourcesForMonitorDownscale != using { s.config.Warn("Wanted to send informant downscale request, but haven't yet gotten information about its resources") } // However, we may need to wait before retrying (or for any ongoing requests to finish) - makeInformantDownscaleRequest := wantInformantDownscaleRequest && - s.informant.active && - s.informant.ongoingRequest == nil && - (s.informant.deniedDownscale == nil || - s.informant.deniedDownscale.requested != desiredResources.Min(using) || - now.Sub(s.informant.deniedDownscale.at) >= s.config.InformantDeniedDownscaleCooldown) && - (s.informant.downscaleFailureAt == nil || - now.Sub(*s.informant.downscaleFailureAt) >= s.config.InformantRetryWait) - - if wantInformantDownscaleRequest { - if makeInformantDownscaleRequest { - actions.InformantDownscale = &ActionInformantDownscale{ - Current: *s.informant.approved, - Target: resourcesForInformantDownscale, + makeMonitorDownscaleRequest := wantMonitorDownscaleRequest && + s.monitor.active && + s.monitor.ongoingRequest == nil && + (s.monitor.deniedDownscale == nil || + s.monitor.deniedDownscale.requested != desiredResources.Min(using) || + now.Sub(s.monitor.deniedDownscale.at) >= s.config.MonitorDeniedDownscaleCooldown) && + (s.monitor.downscaleFailureAt == nil || + now.Sub(*s.monitor.downscaleFailureAt) >= s.config.MonitorRetryWait) + + if wantMonitorDownscaleRequest { + if makeMonitorDownscaleRequest { + actions.MonitorDownscale = &ActionMonitorDownscale{ + Current: *s.monitor.approved, + Target: resourcesForMonitorDownscale, } - } else if !s.informant.active { + } else if !s.monitor.active { s.config.Warn("Wanted to send informant downscale request, but not active") - } else if s.informant.ongoingRequest != nil && s.informant.ongoingRequest.kind != informantRequestKindDownscale { - s.config.Warn("Wanted to send informant downscale request, but waiting on other ongoing %s request", s.informant.ongoingRequest.kind) - } else if s.informant.ongoingRequest == nil { + } else if s.monitor.ongoingRequest != nil && s.monitor.ongoingRequest.kind != monitorRequestKindDownscale { + s.config.Warn("Wanted to send informant downscale request, but waiting on other ongoing %s request", s.monitor.ongoingRequest.kind) + } else if s.monitor.ongoingRequest == nil { s.config.Warn("Wanted to send informant downscale request, but waiting on retry rate limit") } } @@ -349,7 +331,7 @@ func (s *State) NextActions(now time.Time) ActionSet { // --- and that's all the request types! --- // If there's anything waiting, we should also note how long we should wait for. - // There's two components we could be waiting on: the scheduler plugin, and the vm-informant. 
+ // There's two components we could be waiting on: the scheduler plugin, and the vm-monitor. maximumDuration := time.Duration(int64(uint64(1)<<63 - 1)) requiredWait := maximumDuration @@ -365,28 +347,28 @@ func (s *State) NextActions(now time.Time) ActionSet { requiredWait = util.Min(requiredWait, now.Sub(s.plugin.lastRequest.at)) } - // For the vm-informant: + // For the vm-monitor: // if we wanted to make EITHER a downscale or upscale request, but we previously couldn't - // because of retry timeouts, we should wait for s.config.InformantRetryWait before trying + // because of retry timeouts, we should wait for s.config.MonitorRetryWait before trying // again. // OR if we wanted to downscale but got denied, we should wait for - // s.config.InformantDownscaleCooldown before retrying. - if s.informant.ongoingRequest == nil { + // s.config.MonitorDownscaleCooldown before retrying. + if s.monitor.ongoingRequest == nil { // Retry upscale on failure - if wantInformantUpscaleRequest && s.informant.upscaleFailureAt != nil { - if wait := now.Sub(*s.informant.upscaleFailureAt); wait >= s.config.InformantRetryWait { + if wantMonitorUpscaleRequest && s.monitor.upscaleFailureAt != nil { + if wait := now.Sub(*s.monitor.upscaleFailureAt); wait >= s.config.MonitorRetryWait { requiredWait = util.Min(requiredWait, wait) } } // Retry downscale on failure - if wantInformantDownscaleRequest && s.informant.downscaleFailureAt != nil { - if wait := now.Sub(*s.informant.downscaleFailureAt); wait >= s.config.InformantRetryWait { + if wantMonitorDownscaleRequest && s.monitor.downscaleFailureAt != nil { + if wait := now.Sub(*s.monitor.downscaleFailureAt); wait >= s.config.MonitorRetryWait { requiredWait = util.Min(requiredWait, wait) } } // Retry downscale if denied - if wantInformantDownscaleRequest && s.informant.deniedDownscale != nil && resourcesForInformantDownscale == s.informant.deniedDownscale.requested { - if wait := now.Sub(s.informant.deniedDownscale.at); wait >= s.config.InformantDeniedDownscaleCooldown { + if wantMonitorDownscaleRequest && s.monitor.deniedDownscale != nil && resourcesForMonitorDownscale == s.monitor.deniedDownscale.requested { + if wait := now.Sub(s.monitor.deniedDownscale.at); wait >= s.config.MonitorDeniedDownscaleCooldown { requiredWait = util.Min(requiredWait, wait) } } @@ -408,7 +390,7 @@ func (s *State) scalingConfig() api.ScalingConfig { } } -func (s *State) desiredResourcesFromMetricsOrRequestedUpscaling() api.Resources { +func (s *State) DesiredResourcesFromMetricsOrRequestedUpscaling() api.Resources { // There's some annoying edge cases that this function has to be able to handle properly. For // the sake of completeness, they are: // @@ -445,9 +427,9 @@ func (s *State) desiredResourcesFromMetricsOrRequestedUpscaling() api.Resources // resources for the desired "goal" compute units var goalResources api.Resources - // If there's no constraints from s.metrics or s.informant.requestedUpscale, then we'd prefer to + // If there's no constraints from s.metrics or s.monitor.requestedUpscale, then we'd prefer to // keep things as-is, rather than scaling down (because otherwise goalCU = 0). - if s.metrics == nil && s.informant.requestedUpscale == nil { + if s.metrics == nil && s.monitor.requestedUpscale == nil { goalResources = s.vm.Using() } else { goalResources = s.plugin.computeUnit.Mul(uint16(goalCU)) @@ -476,12 +458,12 @@ func (s *State) desiredResourcesFromMetricsOrRequestedUpscaling() api.Resources // NB: we could just use s.plugin.computeUnit, but that's sometimes nil. 
This way, it's clear that // it's the caller's responsibility to ensure that s.plugin.computeUnit != nil. func (s *State) requiredCUForRequestedUpscaling(computeUnit api.Resources) uint32 { - if s.informant.requestedUpscale == nil { + if s.monitor.requestedUpscale == nil { return 0 } var required uint32 - requested := s.informant.requestedUpscale.requested + requested := s.monitor.requestedUpscale.requested // note: floor(x / M) + 1 gives the minimum integer value greater than x / M. @@ -495,10 +477,10 @@ func (s *State) requiredCUForRequestedUpscaling(computeUnit api.Resources) uint3 return required } -func (s *State) boundResourcesByInformantApproved(resources api.Resources) api.Resources { +func (s *State) boundResourcesByMonitorApproved(resources api.Resources) api.Resources { var lowerBound api.Resources - if s.informant.approved != nil { - lowerBound = *s.informant.approved + if s.monitor.approved != nil { + lowerBound = *s.monitor.approved } else { lowerBound = s.vm.Using() } @@ -598,17 +580,17 @@ func (h PluginHandle) RequestSuccessful(now time.Time, resp api.PluginResponse) return nil } -// InformantHandle provides write access to the vm-informant pieces of an UpdateState -type InformantHandle struct { +// MonitorHandle provides write access to the vm-monitor pieces of an UpdateState +type MonitorHandle struct { s *State } -func (s *State) Informant() InformantHandle { - return InformantHandle{s} +func (s *State) Monitor() MonitorHandle { + return MonitorHandle{s} } -func (h InformantHandle) Reset() { - h.s.informant = informantState{ +func (h MonitorHandle) Reset() { + h.s.monitor = monitorState{ active: false, ongoingRequest: nil, requestedUpscale: nil, @@ -619,61 +601,56 @@ func (h InformantHandle) Reset() { } } -func (h InformantHandle) Active(active bool) { - h.s.informant.active = active -} - -func (h InformantHandle) SuccessfullyRegistered() { - using := h.s.vm.Using() - h.s.informant.approved = &using // TODO: this is racy (although... informant synchronization should help *some* with this?) 
+func (h MonitorHandle) Active(active bool) { + h.s.monitor.active = active } -func (h InformantHandle) UpscaleRequested(now time.Time, resources api.MoreResources) { - h.s.informant.requestedUpscale = &requestedUpscale{ +func (h MonitorHandle) UpscaleRequested(now time.Time, resources api.MoreResources) { + h.s.monitor.requestedUpscale = &requestedUpscale{ at: now, - base: h.s.vm.Using(), // TODO: this is racy (maybe the resources were different when the informant originally made the request) + base: h.s.vm.Using(), // TODO: this is racy (maybe the resources were different when the monitor originally made the request) requested: resources, } } -func (h InformantHandle) StartingUpscaleRequest(now time.Time) { - h.s.informant.ongoingRequest = &ongoingInformantRequest{kind: informantRequestKindUpscale} - h.s.informant.upscaleFailureAt = nil +func (h MonitorHandle) StartingUpscaleRequest(now time.Time) { + h.s.monitor.ongoingRequest = &ongoingMonitorRequest{kind: monitorRequestKindUpscale} + h.s.monitor.upscaleFailureAt = nil } -func (h InformantHandle) UpscaleRequestSuccessful(now time.Time, resources api.Resources) { - h.s.informant.ongoingRequest = nil - h.s.informant.approved = &resources +func (h MonitorHandle) UpscaleRequestSuccessful(now time.Time, resources api.Resources) { + h.s.monitor.ongoingRequest = nil + h.s.monitor.approved = &resources } -func (h InformantHandle) UpscaleRequestFailed(now time.Time) { - h.s.informant.ongoingRequest = nil - h.s.informant.upscaleFailureAt = &now +func (h MonitorHandle) UpscaleRequestFailed(now time.Time) { + h.s.monitor.ongoingRequest = nil + h.s.monitor.upscaleFailureAt = &now } -func (h InformantHandle) StartingDownscaleRequest(now time.Time) { - h.s.informant.ongoingRequest = &ongoingInformantRequest{kind: informantRequestKindDownscale} - h.s.informant.downscaleFailureAt = nil +func (h MonitorHandle) StartingDownscaleRequest(now time.Time) { + h.s.monitor.ongoingRequest = &ongoingMonitorRequest{kind: monitorRequestKindDownscale} + h.s.monitor.downscaleFailureAt = nil } -func (h InformantHandle) DownscaleRequestAllowed(now time.Time, requested api.Resources) { - h.s.informant.ongoingRequest = nil - h.s.informant.approved = &requested - h.s.informant.deniedDownscale = nil +func (h MonitorHandle) DownscaleRequestAllowed(now time.Time, requested api.Resources) { + h.s.monitor.ongoingRequest = nil + h.s.monitor.approved = &requested + h.s.monitor.deniedDownscale = nil } -// Downscale request was successful but the informant denied our request. -func (h InformantHandle) DownscaleRequestDenied(now time.Time, requested api.Resources) { - h.s.informant.ongoingRequest = nil - h.s.informant.deniedDownscale = &deniedDownscale{ +// Downscale request was successful but the monitor denied our request. 
+func (h MonitorHandle) DownscaleRequestDenied(now time.Time, requested api.Resources) { + h.s.monitor.ongoingRequest = nil + h.s.monitor.deniedDownscale = &deniedDownscale{ at: now, requested: requested, } } -func (h InformantHandle) DownscaleRequestFailed(now time.Time) { - h.s.informant.ongoingRequest = nil - h.s.informant.downscaleFailureAt = &now +func (h MonitorHandle) DownscaleRequestFailed(now time.Time) { + h.s.monitor.ongoingRequest = nil + h.s.monitor.downscaleFailureAt = &now } type NeonVMHandle struct { diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go new file mode 100644 index 000000000..8abc19c7e --- /dev/null +++ b/pkg/agent/core/state_test.go @@ -0,0 +1,83 @@ +package core_test + +import ( + "testing" + "time" + + "k8s.io/apimachinery/pkg/api/resource" + + "github.com/neondatabase/autoscaling/pkg/agent/core" + "github.com/neondatabase/autoscaling/pkg/api" +) + +func Test_desiredVMState(t *testing.T) { + cases := []struct { + name string + + // helpers for setting fields (ish) of State: + metrics api.Metrics + vmUsing api.Resources + requestedUpscale api.MoreResources + + // expected output from (*State).DesiredResourcesFromMetricsOrRequestedUpscaling() + expected api.Resources + }{ + { + name: "BasicScaleup", + metrics: api.Metrics{ + LoadAverage1Min: 0.30, + LoadAverage5Min: 0.0, // unused + MemoryUsageBytes: 0.0, + }, + vmUsing: api.Resources{VCPU: 250, Mem: 1}, + requestedUpscale: api.MoreResources{Cpu: false, Memory: false}, + + expected: api.Resources{VCPU: 500, Mem: 2}, + }, + } + + for _, c := range cases { + state := core.NewState( + api.VmInfo{ + Name: "test", + Namespace: "test", + Cpu: api.VmCpuInfo{ + Min: 250, + Use: c.vmUsing.VCPU, + Max: 1000, + }, + Mem: api.VmMemInfo{ + SlotSize: resource.NewQuantity(1<<30 /* 1 Gi */, resource.BinarySI), // unused, doesn't actually matter. + Min: 1, + Use: c.vmUsing.Mem, + Max: 4, + }, + // remaining fields are also unused: + ScalingConfig: nil, + AlwaysMigrate: false, + ScalingEnabled: true, + }, + core.Config{ + DefaultScalingConfig: api.ScalingConfig{ + LoadAverageFractionTarget: 0.5, + MemoryUsageFractionTarget: 0.5, + }, + // these don't really matter, because we're not using (*State).NextActions() + PluginRequestTick: time.Second, + MonitorDeniedDownscaleCooldown: time.Second, + MonitorRetryWait: time.Second, + Warn: nil, + }, + ) + + // set the metrics + state.UpdateMetrics(c.metrics) + + t.Run(c.name, func(t *testing.T) { + actual := state.DesiredResourcesFromMetricsOrRequestedUpscaling() + if actual != c.expected { + t.Errorf("expected output %+v but got %+v", c.expected, actual) + } + }) + } +} diff --git a/pkg/agent/dispatcher.go b/pkg/agent/dispatcher.go new file mode 100644 index 000000000..ac61f2567 --- /dev/null +++ b/pkg/agent/dispatcher.go @@ -0,0 +1,678 @@ +package agent + +// The Dispatcher is our interface with the monitor. We interact via a websocket +// connection through a simple RPC-style protocol. + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "sync" + "sync/atomic" + "time" + + "go.uber.org/zap" + "nhooyr.io/websocket" + "nhooyr.io/websocket/wsjson" + + "github.com/neondatabase/autoscaling/pkg/api" + "github.com/neondatabase/autoscaling/pkg/util" +) + +const ( + MinMonitorProtocolVersion api.MonitorProtoVersion = api.MonitorProtoV1_0 + MaxMonitorProtocolVersion api.MonitorProtoVersion = api.MonitorProtoV1_0 +) + +// This struct represents the result of a dispatcher.Call. 
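Taken together, the MonitorHandle methods above are meant to be driven in a simple sequence around each request. The sketch below is a simplified, hypothetical caller (the function name and the send callback are assumptions; the real runner logic is more involved) showing that sequence for a downscale request.

```go
package example

import (
	"time"

	"github.com/neondatabase/autoscaling/pkg/agent/core"
	"github.com/neondatabase/autoscaling/pkg/api"
)

// doDownscale sketches the intended MonitorHandle call sequence: mark the request as
// started, perform it via the supplied callback, then record exactly one outcome.
func doDownscale(state *core.State, target api.Resources, send func() (denied bool, err error)) {
	h := state.Monitor()
	h.StartingDownscaleRequest(time.Now())

	denied, err := send() // e.g. a downscale request sent through the dispatcher

	switch {
	case err != nil:
		h.DownscaleRequestFailed(time.Now())
	case denied:
		h.DownscaleRequestDenied(time.Now(), target)
	default:
		h.DownscaleRequestAllowed(time.Now(), target)
	}
}
```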
Because the SignalSender +// passed in can only be generic over one type - we have this mock enum. Only +// one field should ever be non-nil, and it should always be clear which field +// is readable. For example, the caller of dispatcher.call(HealthCheck { .. }) +// should only read the healthcheck field. +type MonitorResult struct { + Result *api.DownscaleResult + Confirmation *api.UpscaleConfirmation + HealthCheck *api.HealthCheck +} + +// The Dispatcher is the main object managing the websocket connection to the +// monitor. For more information on the protocol, see pkg/api/types.go +type Dispatcher struct { + // The underlying connection we are managing + conn *websocket.Conn + + // When someone sends a message, the dispatcher will attach a transaction id + // to it so that it knows when a response is back. When it receives a message + // with the same transaction id, it knows that that is the repsonse to the original + // message and will send it down the SignalSender so the original sender can use it. + waiters map[uint64]util.SignalSender[waiterResult] + + // lock guards mutating the waiters, exitError, and (closing) exitSignal field. + // conn and lastTransactionID are all thread safe. + // runner, exit, and protoVersion are never modified. + lock sync.Mutex + + // The runner that this dispatcher is part of + runner *Runner + + exit func(status websocket.StatusCode, err error) + + exitError error + exitSignal chan struct{} + + // lastTransactionID is the last transaction id. When we need a new one + // we simply bump it and take the new number. + // + // In order to prevent collisions between the IDs generated here vs by + // the monitor, we only generate even IDs, and the monitor only generates + // odd ones. So generating a new value is done by adding 2. + lastTransactionID atomic.Uint64 + + protoVersion api.MonitorProtoVersion +} + +type waiterResult struct { + err error + res *MonitorResult +} + +// Create a new Dispatcher, establishing a connection with the vm-monitor and setting up all the +// background threads to manage the connection. +func NewDispatcher( + ctx context.Context, + logger *zap.Logger, + addr string, + runner *Runner, + sendUpscaleRequested func(request api.MoreResources, withLock func()), +) (_finalDispatcher *Dispatcher, _ error) { + // Create a new root-level context for this Dispatcher so that we can cancel if need be + ctx, cancelRootContext := context.WithCancel(ctx) + defer func() { + // cancel on failure or panic + if _finalDispatcher == nil { + cancelRootContext() + } + }() + + connectTimeout := time.Second * time.Duration(runner.global.config.Monitor.ConnectionTimeoutSeconds) + conn, protoVersion, err := connectToMonitor(ctx, logger, addr, connectTimeout) + if err != nil { + return nil, err + } + + disp := &Dispatcher{ + conn: conn, + waiters: make(map[uint64]util.SignalSender[waiterResult]), + runner: runner, + lock: sync.Mutex{}, + exit: nil, // set below + exitError: nil, + exitSignal: make(chan struct{}), + lastTransactionID: atomic.Uint64{}, // Note: initialized to 0, so it's even, as required. 
+ protoVersion: *protoVersion, + } + disp.exit = func(status websocket.StatusCode, err error) { + disp.lock.Lock() + defer disp.lock.Unlock() + + if disp.Exited() { + return + } + + close(disp.exitSignal) + disp.exitError = err + cancelRootContext() + + var closeReason string + if err != nil { + closeReason = err.Error() + } else { + closeReason = "normal exit" + } + + // Run the actual websocket closing in a separate goroutine so we don't block while holding + // the lock. It can take up to 10s to close: + // + // > [Close] will write a WebSocket close frame with a timeout of 5s and then wait 5s for + // > the peer to send a close frame. + // + // This *potentially* runs us into race issues, but those are probably less bad to deal + // with, tbh. + go disp.conn.Close(status, closeReason) + } + + go func() { + <-ctx.Done() + disp.exit(websocket.StatusNormalClosure, nil) + }() + + msgHandlerLogger := logger.Named("message-handler") + runner.spawnBackgroundWorker(ctx, msgHandlerLogger, "vm-monitor message handler", func(c context.Context, l *zap.Logger) { + disp.run(c, l, sendUpscaleRequested) + }) + runner.spawnBackgroundWorker(ctx, logger.Named("health-checks"), "vm-monitor health checks", func(ctx context.Context, logger *zap.Logger) { + timeout := time.Second * time.Duration(runner.global.config.Monitor.ResponseTimeoutSeconds) + // FIXME: make this duration configurable + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + // if we've had sequential failures for more than + var firstSequentialFailure *time.Time + continuedFailureAbortTimeout := time.Second * time.Duration(runner.global.config.Monitor.MaxHealthCheckSequentialFailuresSeconds) + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + + _, err := disp.Call(ctx, logger, timeout, "HealthCheck", api.HealthCheck{}) + if err != nil { + logger.Warn("vm-monitor health check failed", zap.Error(err)) + + if firstSequentialFailure == nil { + now := time.Now() + firstSequentialFailure = &now + } else if since := time.Since(*firstSequentialFailure); since > continuedFailureAbortTimeout { + err := fmt.Errorf("vm-monitor has been failing health checks for at least %s", continuedFailureAbortTimeout) + logger.Error(fmt.Sprintf("%s, triggering connection restart", err.Error())) + disp.exit(websocket.StatusInternalError, err) + } + } else { + // health check was successful, so reset the sequential failures count + firstSequentialFailure = nil + + runner.status.update(runner.global, func(s podStatus) podStatus { + now := time.Now() + s.lastSuccessfulMonitorComm = &now + return s + }) + } + } + }) + return disp, nil +} + +func connectToMonitor( + ctx context.Context, + logger *zap.Logger, + addr string, + timeout time.Duration, +) (_ *websocket.Conn, _ *api.MonitorProtoVersion, finalErr error) { + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + logger.Info("Connecting to vm-monitor via websocket", zap.String("addr", addr)) + + // We do not need to close the response body according to docs. + // Doing so causes memory bugs. 
+ c, _, err := websocket.Dial(ctx, addr, nil) //nolint:bodyclose // see comment above + if err != nil { + return nil, nil, fmt.Errorf("error establishing websocket connection to %s: %w", addr, err) + } + + // If we return early, make sure we close the websocket + var failureReason websocket.StatusCode + defer func() { + if finalErr != nil { + if failureReason == 0 { + failureReason = websocket.StatusInternalError + } + c.Close(failureReason, finalErr.Error()) + } + }() + + versionRange := api.VersionRange[api.MonitorProtoVersion]{ + Min: MinMonitorProtocolVersion, + Max: MaxMonitorProtocolVersion, + } + logger.Info("Sending protocol version range", zap.Any("range", versionRange)) + + // Figure out protocol version + err = wsjson.Write(ctx, c, versionRange) + if err != nil { + return nil, nil, fmt.Errorf("error sending protocol range to monitor: %w", err) + } + + logger.Info("Reading monitor version response") + var resp api.MonitorProtocolResponse + err = wsjson.Read(ctx, c, &resp) + if err != nil { + logger.Error("Failed to read monitor response", zap.Error(err)) + failureReason = websocket.StatusProtocolError + return nil, nil, fmt.Errorf("Error reading vm-monitor response during protocol handshake: %w", err) + } + + logger.Info("Got monitor version response", zap.Any("response", resp)) + if resp.Error != nil { + logger.Error("Got error response from vm-monitor", zap.Any("response", resp), zap.String("error", *resp.Error)) + failureReason = websocket.StatusProtocolError + return nil, nil, fmt.Errorf("Monitor returned error during protocol handshake: %q", *resp.Error) + } + + logger.Info("negotiated protocol version with monitor", zap.Any("response", resp), zap.String("version", resp.Version.String())) + return c, &resp.Version, nil +} + +// ExitSignal returns a channel that is closed when the Dispatcher is no longer running +func (disp *Dispatcher) ExitSignal() <-chan struct{} { + return disp.exitSignal +} + +// Exited returns whether the Dispatcher is no longer running +// +// Exited will return true iff the channel returned by ExitSignal is closed. +func (disp *Dispatcher) Exited() bool { + select { + case <-disp.exitSignal: + return true + default: + return false + } +} + +// ExitError returns the error that caused the dispatcher to exit, if there was one +func (disp *Dispatcher) ExitError() error { + disp.lock.Lock() + defer disp.lock.Unlock() + return disp.exitError +} + +// Send a message down the connection. Only call this method with types that +// SerializeMonitorMessage can handle. +func (disp *Dispatcher) send(ctx context.Context, logger *zap.Logger, id uint64, message any) error { + data, err := api.SerializeMonitorMessage(message, id) + if err != nil { + return fmt.Errorf("error serializing message: %w", err) + } + // wsjson.Write serializes whatever is passed in, and go serializes []byte + // by base64 encoding it, so use RawMessage to avoid serializing to []byte + // (done by SerializeMonitorMessage), and then base64 encoding again + raw := json.RawMessage(data) + logger.Info("sending message to monitor", zap.ByteString("message", raw)) + return wsjson.Write(ctx, disp.conn, &raw) +} + +// registerWaiter registers a util.SignalSender to get notified when a +// message with the given id arrives. +func (disp *Dispatcher) registerWaiter(id uint64, sender util.SignalSender[waiterResult]) { + disp.lock.Lock() + defer disp.lock.Unlock() + disp.waiters[id] = sender +} + +// unregisterWaiter deletes a preexisting waiter without interacting with it. 
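The health-check worker above already uses this request path; as a more isolated sketch, a one-off call could look like the following (written as if inside pkg/agent; the helper name and the 5-second timeout are arbitrary). Only the result field matching the request type is expected to be populated.

```go
package agent

import (
	"context"
	"errors"
	"time"

	"go.uber.org/zap"

	"github.com/neondatabase/autoscaling/pkg/api"
)

// checkMonitorHealth is a hypothetical helper showing the request/response pattern:
// Call blocks until the monitor replies or the timeout fires.
func checkMonitorHealth(ctx context.Context, logger *zap.Logger, disp *Dispatcher) error {
	res, err := disp.Call(ctx, logger, 5*time.Second, "HealthCheck", api.HealthCheck{})
	if err != nil {
		return err
	}
	if res.HealthCheck == nil {
		return errors.New("expected a HealthCheck response")
	}
	return nil
}
```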
+func (disp *Dispatcher) unregisterWaiter(id uint64) { + disp.lock.Lock() + defer disp.lock.Unlock() + delete(disp.waiters, id) +} + +// Make a request to the monitor and wait for a response. The value passed as message must be a +// valid value to send to the monitor. See the docs for SerializeMonitorMessage for more. +// +// This function must NOT be called while holding disp.runner.lock. +func (disp *Dispatcher) Call( + ctx context.Context, + logger *zap.Logger, + timeout time.Duration, + messageType string, + message any, +) (*MonitorResult, error) { + id := disp.lastTransactionID.Add(2) + sender, receiver := util.NewSingleSignalPair[waiterResult]() + + status := "internal error" + defer func() { + disp.runner.global.metrics.monitorRequestsOutbound.WithLabelValues(messageType, status).Inc() + }() + + // register the waiter *before* sending, so that we avoid a potential race where we'd get a + // reply to the message before being ready to receive it. + disp.registerWaiter(id, sender) + err := disp.send(ctx, logger, id, message) + if err != nil { + logger.Error("failed to send message", zap.Any("message", message), zap.Error(err)) + disp.unregisterWaiter(id) + status = "[error: failed to send]" + return nil, err + } + + timer := time.NewTimer(timeout) + defer timer.Stop() + + select { + case result := <-receiver.Recv(): + if result.err != nil { + status = fmt.Sprintf("[error: %s]", result.err) + return nil, errors.New("monitor experienced an internal error") + } + + status = "ok" + return result.res, nil + case <-timer.C: + err := fmt.Errorf("timed out waiting %v for monitor response", timeout) + disp.unregisterWaiter(id) + status = "[error: timed out waiting for response]" + return nil, err + } +} + +func extractField[T any](data map[string]interface{}, key string) (*T, error) { + field, ok := data[key] + if !ok { + return nil, fmt.Errorf("data had no key %q", key) + } + + coerced, ok := field.(T) + if !ok { + return nil, fmt.Errorf("data[%q] was not of type %T", key, *new(T)) + } + + return &coerced, nil +} + +type messageHandlerFuncs struct { + handleUpscaleRequest func(api.UpscaleRequest) + handleUpscaleConfirmation func(api.UpscaleConfirmation, uint64) error + handleDownscaleResult func(api.DownscaleResult, uint64) error + handleMonitorError func(api.InternalError, uint64) error + handleHealthCheck func(api.HealthCheck, uint64) error +} + +// Handle messages from the monitor. Make sure that all message types the monitor +// can send are included in the inner switch statement. +func (disp *Dispatcher) HandleMessage( + ctx context.Context, + logger *zap.Logger, + handlers messageHandlerFuncs, +) error { + // Deserialization has several steps: + // 1. Deserialize into an unstructured map[string]interface{} + // 2. Read the `type` field to know the type of the message + // 3. Then try to to deserialize again, but into that specific type + // 4. All message also come with an integer id under the key `id` + + // wsjson.Read tries to deserialize the message. If we were to read to a + // []byte, it would base64 encode it as part of deserialization. 
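The base64 point made in the comments around send and HandleMessage comes straight from encoding/json: a []byte is marshalled as a base64 string, while json.RawMessage is embedded verbatim. A standalone demonstration, unrelated to this codebase:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Shows why the dispatcher uses json.RawMessage instead of []byte: plain byte slices
// get base64-encoded by encoding/json, raw messages are passed through untouched.
func main() {
	payload := []byte(`{"type":"HealthCheck","id":4}`)

	asBytes, _ := json.Marshal(payload)
	asRaw, _ := json.Marshal(json.RawMessage(payload))

	fmt.Println(string(asBytes)) // a quoted base64 string ("eyJ0eXBl...")
	fmt.Println(string(asRaw))   // {"type":"HealthCheck","id":4}
}
```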
json.RawMessage + // avoids this, and we manually deserialize later + var message json.RawMessage + if err := wsjson.Read(ctx, disp.conn, &message); err != nil { + return fmt.Errorf("Error receiving message: %w", err) + } + logger.Info("(pre-decoding): received a message", zap.ByteString("message", message)) + + var unstructured map[string]interface{} + if err := json.Unmarshal(message, &unstructured); err != nil { + return fmt.Errorf("Error deserializing message: %q", string(message)) + } + + typeStr, err := extractField[string](unstructured, "type") + if err != nil { + return fmt.Errorf("Error extracting 'type' field: %w", err) + } + + // Go thinks all json numbers are float64, so we first deserialize to that to + // avoid the type error, then cast to uint64 + f, err := extractField[float64](unstructured, "id") + if err != nil { + return fmt.Errorf("Error extracting 'id' field: %w", err) + } + id := uint64(*f) + + var rootErr error + + // now that we have the waiter's ID, make sure that if there's some failure past this point, we + // propagate that along to the monitor and remove it + defer func() { + // speculatively determine the root error, to send that along to the instance of Call + // waiting for it. + var err error + + panicPayload := recover() + if panicPayload != nil { + err = errors.New("panicked") + } else if rootErr != nil { + err = rootErr + } else { + // if HandleMessage bailed without panicking or setting rootErr, but *also* without + // sending a message to the waiter, we should make sure that *something* gets sent, so + // the message doesn't just time out. But we don't have more information, so the error + // is still just "unknown". + err = errors.New("unknown") + } + + disp.lock.Lock() + defer disp.lock.Unlock() + if sender, ok := disp.waiters[id]; ok { + sender.Send(waiterResult{err: err, res: nil}) + delete(disp.waiters, id) + } else if rootErr != nil { + // we had some error while handling the message with this ID, and there wasn't a + // corresponding waiter. 
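The `float64` detour above exists because `encoding/json` decodes every JSON number into `float64` when the target is `interface{}`. A small sketch of the same two-stage decode; the message literal and the concrete struct are stand-ins for the real `api` types:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	msg := []byte(`{"type":"UpscaleConfirmation","id":7}`)

	// Stage 1: decode into an unstructured map just to peek at "type" and "id".
	var unstructured map[string]interface{}
	if err := json.Unmarshal(msg, &unstructured); err != nil {
		panic(err)
	}

	typeStr := unstructured["type"].(string)
	// With an interface{} target, every JSON number comes back as float64,
	// hence the explicit cast before using the id as a uint64.
	id := uint64(unstructured["id"].(float64))
	fmt.Println(typeStr, id) // UpscaleConfirmation 7

	// Stage 2: decode the same bytes again, into the concrete type for "type".
	var conf struct {
		ID uint64 `json:"id"`
	}
	if err := json.Unmarshal(msg, &conf); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", conf) // {ID:7}
}
```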
We should make note of this in the metrics: + status := fmt.Sprintf("[error: %s]", rootErr) + disp.runner.global.metrics.monitorRequestsInbound.WithLabelValues(*typeStr, status) + } + + // resume panicking if we were before + if panicPayload != nil { + panic(panicPayload) + } + }() + + // Helper function to handle common unmarshalling logic + unmarshal := func(value any) error { + if err := json.Unmarshal(message, value); err != nil { + rootErr = errors.New("Failed unmarshaling JSON") + err := fmt.Errorf("Error unmarshaling %s: %w", *typeStr, err) + logger.Error(rootErr.Error(), zap.Error(err)) + // we're already on the error path anyways + _ = disp.send(ctx, logger, id, api.InvalidMessage{Error: err.Error()}) + return err + } + + return nil + } + + switch *typeStr { + case "UpscaleRequest": + var req api.UpscaleRequest + if err := unmarshal(&req); err != nil { + return err + } + handlers.handleUpscaleRequest(req) + return nil + case "UpscaleConfirmation": + var confirmation api.UpscaleConfirmation + if err := unmarshal(&confirmation); err != nil { + return err + } + return handlers.handleUpscaleConfirmation(confirmation, id) + case "DownscaleResult": + var res api.DownscaleResult + if err := unmarshal(&res); err != nil { + return err + } + return handlers.handleDownscaleResult(res, id) + case "InternalError": + var monitorErr api.InternalError + if err := unmarshal(&monitorErr); err != nil { + return err + } + return handlers.handleMonitorError(monitorErr, id) + case "HealthCheck": + var healthCheck api.HealthCheck + if err := unmarshal(&healthCheck); err != nil { + return err + } + return handlers.handleHealthCheck(healthCheck, id) + case "InvalidMessage": + var warning api.InvalidMessage + if err := unmarshal(&warning); err != nil { + return err + } + logger.Warn("Received notification we sent an invalid message", zap.Any("warning", warning)) + return nil + default: + rootErr = errors.New("Received unknown message type") + return disp.send( + ctx, + logger, + id, + api.InvalidMessage{Error: fmt.Sprintf("Received message of unknown type: %q", *typeStr)}, + ) + } +} + +// Long running function that orchestrates all requests/responses. +func (disp *Dispatcher) run(ctx context.Context, logger *zap.Logger, upscaleRequester func(_ api.MoreResources, withLock func())) { + logger.Info("Starting message handler") + + // Utility for logging + returning an error when we get a message with an + // id we're unaware of. Note: unknownMessage is not a message type. + handleUnkownMessage := func(messageType string, id uint64) error { + fmtString := "Received %s with id %d but no record of previous message with that id" + msg := fmt.Sprintf(fmtString, messageType, id) + logger.Warn(msg, zap.Uint64("id", id)) + return disp.send(ctx, logger, id, api.InvalidMessage{Error: msg}) + } + + // Does not take a message id because we don't know when the agent will + // upscale. The monitor will get the result back as a NotifyUpscale message + // from us, with a new id. + handleUpscaleRequest := func(req api.UpscaleRequest) { + // TODO: it shouldn't be this function's responsibility to update metrics. 
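The deferred block above guarantees that a registered waiter always hears *something*, even if handling panicked, and then re-raises the panic so it isn't swallowed. A stripped-down sketch of that recover, notify, re-panic shape, with generic names rather than the dispatcher's actual fields:

```go
package main

import (
	"errors"
	"fmt"
)

// handle runs fn, guaranteeing that cleanup sees either fn's error or a
// placeholder for a panic, and that any panic keeps propagating afterwards.
func handle(fn func() error, cleanup func(error)) error {
	var rootErr error

	defer func() {
		err := rootErr
		panicPayload := recover()
		if panicPayload != nil {
			err = errors.New("panicked")
		} else if err == nil {
			// Nothing went wrong that we know of, but we still have to say
			// *something* so a waiter doesn't just time out.
			err = errors.New("unknown")
		}

		cleanup(err)

		// Resume panicking if we were before, so the caller still sees it.
		if panicPayload != nil {
			panic(panicPayload)
		}
	}()

	rootErr = fn()
	return rootErr
}

func main() {
	_ = handle(
		func() error { return errors.New("bad message") },
		func(err error) { fmt.Println("cleanup saw:", err) },
	)
}
```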
+ defer func() { + disp.runner.global.metrics.monitorRequestsInbound.WithLabelValues("UpscaleRequest", "ok") + }() + + resourceReq := api.MoreResources{ + Cpu: false, + Memory: true, + } + + upscaleRequester(resourceReq, func() { + logger.Info("Updating requested upscale", zap.Any("requested", resourceReq)) + }) + } + handleUpscaleConfirmation := func(_ api.UpscaleConfirmation, id uint64) error { + disp.lock.Lock() + defer disp.lock.Unlock() + + sender, ok := disp.waiters[id] + if ok { + logger.Info("vm-monitor confirmed upscale", zap.Uint64("id", id)) + sender.Send(waiterResult{ + err: nil, + res: &MonitorResult{ + Confirmation: &api.UpscaleConfirmation{}, + Result: nil, + HealthCheck: nil, + }, + }) + // Don't forget to delete the waiter + delete(disp.waiters, id) + return nil + } else { + return handleUnkownMessage("UpscaleConfirmation", id) + } + } + handleDownscaleResult := func(res api.DownscaleResult, id uint64) error { + disp.lock.Lock() + defer disp.lock.Unlock() + + sender, ok := disp.waiters[id] + if ok { + logger.Info("vm-monitor returned downscale result", zap.Uint64("id", id), zap.Any("result", res)) + sender.Send(waiterResult{ + err: nil, + res: &MonitorResult{ + Result: &res, + Confirmation: nil, + HealthCheck: nil, + }, + }) + // Don't forget to delete the waiter + delete(disp.waiters, id) + return nil + } else { + return handleUnkownMessage("DownscaleResult", id) + } + } + handleMonitorError := func(err api.InternalError, id uint64) error { + disp.lock.Lock() + defer disp.lock.Unlock() + + sender, ok := disp.waiters[id] + if ok { + logger.Warn( + "vm-monitor experienced an internal error", + zap.Uint64("id", id), + zap.String("error", err.Error), + ) + // Indicate to the receiver that an error occured + sender.Send(waiterResult{ + err: errors.New("vm-monitor internal error"), + res: nil, + }) + // Don't forget to delete the waiter + delete(disp.waiters, id) + return nil + } else { + return handleUnkownMessage("MonitorError", id) + } + } + handleHealthCheck := func(confirmation api.HealthCheck, id uint64) error { + disp.lock.Lock() + defer disp.lock.Unlock() + + sender, ok := disp.waiters[id] + if ok { + logger.Info("vm-monitor responded to health check", zap.Uint64("id", id)) + // Indicate to the receiver that an error occured + sender.Send(waiterResult{ + err: nil, + res: &MonitorResult{ + HealthCheck: &api.HealthCheck{}, + Result: nil, + Confirmation: nil, + }, + }) + // Don't forget to delete the waiter + delete(disp.waiters, id) + return nil + } else { + return handleUnkownMessage("HealthCheck", id) + } + } + + handlers := messageHandlerFuncs{ + handleUpscaleRequest: handleUpscaleRequest, + handleUpscaleConfirmation: handleUpscaleConfirmation, + handleDownscaleResult: handleDownscaleResult, + handleMonitorError: handleMonitorError, + handleHealthCheck: handleHealthCheck, + } + + for { + err := disp.HandleMessage(ctx, logger, handlers) + if err != nil { + if ctx.Err() != nil { + // The context is already cancelled, so this error is mostly likely + // expected. For example, if the context is cancelled because the + // runner exited, we should expect to fail to read off the connection, + // which is closed by the server exit. 
+ logger.Warn("Error handling message", zap.Error(err)) + } else { + logger.Error("Error handling message, shutting down connection", zap.Error(err)) + err = fmt.Errorf("Error handling message: %w", err) + // note: in theory we *could* be more descriptive with these statuses, but the only + // consumer of this API is the vm-monitor, and it doesn't check those. + disp.exit(websocket.StatusInternalError, err) + } + return + } + } +} diff --git a/pkg/agent/execbridge.go b/pkg/agent/execbridge.go index 9eba23565..50d56a64c 100644 --- a/pkg/agent/execbridge.go +++ b/pkg/agent/execbridge.go @@ -16,9 +16,9 @@ import ( ) var ( - _ executor.PluginInterface = (*execPluginInterface)(nil) - _ executor.NeonVMInterface = (*execNeonVMInterface)(nil) - _ executor.InformantInterface = (*execInformantInterface)(nil) + _ executor.PluginInterface = (*execPluginInterface)(nil) + _ executor.NeonVMInterface = (*execNeonVMInterface)(nil) + _ executor.MonitorInterface = (*execMonitorInterface)(nil) ) ///////////////////////////////////////////////////////////// @@ -119,46 +119,53 @@ func (iface *execNeonVMInterface) Request(ctx context.Context, logger *zap.Logge } //////////////////////////////////////////////////// -// Informant-related interface and implementation // +// Monitor-related interface and implementation // //////////////////////////////////////////////////// -type execInformantInterface struct { - runner *Runner - core *executor.ExecutorCore +type execMonitorInterface struct { + runner *Runner + core *executor.ExecutorCore + requestLock util.ChanMutex } -func makeInformantInterface(r *Runner, core *executor.ExecutorCore) *execInformantInterface { - return &execInformantInterface{runner: r, core: core} +func makeMonitorInterface(r *Runner, core *executor.ExecutorCore) *execMonitorInterface { + return &execMonitorInterface{runner: r, core: core, requestLock: util.NewChanMutex()} } -// EmptyID implements executor.InformantInterface -func (iface *execInformantInterface) EmptyID() string { +// EmptyID implements executor.MonitorInterface +func (iface *execMonitorInterface) EmptyID() string { return "" } -func (iface *execInformantInterface) GetHandle() executor.InformantHandle { - server := iface.runner.server.Load() +func (iface *execMonitorInterface) GetHandle() executor.MonitorHandle { + dispatcher := iface.runner.monitor.Load() - if server == nil || server.ExitStatus() != nil { + if dispatcher == nil || dispatcher.Exited() { return nil } - return &execInformantHandle{server: server} + return &execMonitorHandle{ + runner: iface.runner, + dispatcher: dispatcher, + requestLock: iface.requestLock, + } } -type execInformantHandle struct { - server *InformantServer +type execMonitorHandle struct { + runner *Runner + dispatcher *Dispatcher + requestLock util.ChanMutex } -func (h *execInformantHandle) ID() string { - return h.server.desc.AgentID.String() +func (h *execMonitorHandle) ID() string { + panic("todo") } -func (h *execInformantHandle) RequestLock() util.ChanMutex { - return h.server.requestLock +func (h *execMonitorHandle) RequestLock() util.ChanMutex { + return h.requestLock } -func (h *execInformantHandle) Downscale( +func (h *execMonitorHandle) Downscale( ctx context.Context, logger *zap.Logger, current api.Resources, @@ -167,33 +174,33 @@ func (h *execInformantHandle) Downscale( // Check validity of the message we're sending if target.HasFieldGreaterThan(current) { innerMsg := fmt.Errorf("%+v has field greater than %+v", target, current) - panic(fmt.Errorf("(*execInformantHandle).Downscale() 
called with target greater than current: %w", innerMsg)) + panic(fmt.Errorf("(*execMonitorHandle).Downscale() called with target greater than current: %w", innerMsg)) } - h.server.runner.recordResourceChange(current, target, h.server.runner.global.metrics.informantRequestedChange) + h.runner.recordResourceChange(current, target, h.runner.global.metrics.monitorRequestedChange) - result, err := h.server.Downscale(ctx, logger, target) + result, err := doMonitorDownscale(ctx, logger, h.dispatcher, target) if err != nil && result.Ok { - h.server.runner.recordResourceChange(current, target, h.server.runner.global.metrics.informantApprovedChange) + h.runner.recordResourceChange(current, target, h.runner.global.metrics.monitorApprovedChange) } return result, err } -func (h *execInformantHandle) Upscale(ctx context.Context, logger *zap.Logger, current, target api.Resources) error { +func (h *execMonitorHandle) Upscale(ctx context.Context, logger *zap.Logger, current, target api.Resources) error { // Check validity of the message we're sending if target.HasFieldLessThan(current) { innerMsg := fmt.Errorf("%+v has field less than %+v", target, current) - panic(fmt.Errorf("(*execInformantHandle).Upscale() called with target less than current: %w", innerMsg)) + panic(fmt.Errorf("(*execMonitorHandle).Upscale() called with target less than current: %w", innerMsg)) } - h.server.runner.recordResourceChange(current, target, h.server.runner.global.metrics.informantRequestedChange) + h.runner.recordResourceChange(current, target, h.runner.global.metrics.monitorRequestedChange) - err := h.server.Upscale(ctx, logger, target) + err := doMonitorUpscale(ctx, logger, h.dispatcher, target) if err != nil { - h.server.runner.recordResourceChange(current, target, h.server.runner.global.metrics.informantApprovedChange) + h.runner.recordResourceChange(current, target, h.runner.global.metrics.monitorApprovedChange) } return err diff --git a/pkg/agent/executor/core.go b/pkg/agent/executor/core.go index 5ae6cfd10..81c9573ec 100644 --- a/pkg/agent/executor/core.go +++ b/pkg/agent/executor/core.go @@ -31,9 +31,9 @@ type ExecutorCore struct { } type ClientSet struct { - Plugin PluginInterface - NeonVM NeonVMInterface - Informant InformantInterface + Plugin PluginInterface + NeonVM NeonVMInterface + Monitor MonitorInterface } func NewExecutorCore(stateLogger *zap.Logger, vm api.VmInfo, config core.Config) *ExecutorCore { @@ -123,33 +123,23 @@ func (c ExecutorCoreUpdater) SchedulerGone(withLock func()) { }) } -func (c ExecutorCoreUpdater) ResetInformant(withLock func()) { +func (c ExecutorCoreUpdater) ResetMonitor(withLock func()) { c.core.update(func(state *core.State) { - state.Informant().Reset() + state.Monitor().Reset() withLock() }) } func (c ExecutorCoreUpdater) UpscaleRequested(resources api.MoreResources, withLock func()) { c.core.update(func(state *core.State) { - state.Informant().UpscaleRequested(time.Now(), resources) + state.Monitor().UpscaleRequested(time.Now(), resources) withLock() }) } -func (c ExecutorCoreUpdater) InformantRegistered(active bool, withLock func()) { +func (c ExecutorCoreUpdater) MonitorActive(active bool, withLock func()) { c.core.update(func(state *core.State) { - state.Informant().SuccessfullyRegistered() - if active { - state.Informant().Active(active) - } - withLock() - }) -} - -func (c ExecutorCoreUpdater) InformantActive(active bool, withLock func()) { - c.core.update(func(state *core.State) { - state.Informant().Active(active) + state.Monitor().Active(active) withLock() }) } diff --git 
a/pkg/agent/executor/exec_informant.go b/pkg/agent/executor/exec_monitor.go similarity index 66% rename from pkg/agent/executor/exec_informant.go rename to pkg/agent/executor/exec_monitor.go index 6f758d079..817a6213c 100644 --- a/pkg/agent/executor/exec_informant.go +++ b/pkg/agent/executor/exec_monitor.go @@ -12,19 +12,19 @@ import ( "github.com/neondatabase/autoscaling/pkg/util" ) -type InformantInterface interface { +type MonitorInterface interface { EmptyID() string - GetHandle() InformantHandle + GetHandle() MonitorHandle } -type InformantHandle interface { +type MonitorHandle interface { ID() string RequestLock() util.ChanMutex Downscale(_ context.Context, _ *zap.Logger, current, target api.Resources) (*api.DownscaleResult, error) Upscale(_ context.Context, _ *zap.Logger, current, target api.Resources) error } -func (c *ExecutorCoreWithClients) DoInformantDownscales(ctx context.Context, logger *zap.Logger) { +func (c *ExecutorCoreWithClients) DoMonitorDownscales(ctx context.Context, logger *zap.Logger) { var ( updates util.BroadcastReceiver = c.updates.NewReceiver() requestLock util.ChanMutex = util.NewChanMutex() @@ -42,10 +42,10 @@ func (c *ExecutorCoreWithClients) DoInformantDownscales(ctx context.Context, log // meant to be called while holding c's lock idUnchanged := func(current string) bool { - if h := c.clients.Informant.GetHandle(); h != nil { + if h := c.clients.Monitor.GetHandle(); h != nil { return current == h.ID() } else { - return current == c.clients.Informant.EmptyID() + return current == c.clients.Monitor.EmptyID() } } @@ -63,7 +63,7 @@ func (c *ExecutorCoreWithClients) DoInformantDownscales(ctx context.Context, log } // Wait until we're supposed to make a request. - if last.actions.InformantDownscale == nil { + if last.actions.MonitorDownscale == nil { select { case <-ctx.Done(): return @@ -73,12 +73,12 @@ func (c *ExecutorCoreWithClients) DoInformantDownscales(ctx context.Context, log } } - action := *last.actions.InformantDownscale + action := *last.actions.MonitorDownscale - informant := c.clients.Informant.GetHandle() + monitor := c.clients.Monitor.GetHandle() - if informant != nil { - requestLock = informant.RequestLock() + if monitor != nil { + requestLock = monitor.RequestLock() // Try to acquire the request lock, but if something happens while we're waiting, we'll // abort & retry on the next loop iteration (or maybe not, if last.actions changed). @@ -95,16 +95,16 @@ func (c *ExecutorCoreWithClients) DoInformantDownscales(ctx context.Context, log var startTime time.Time c.update(func(state *core.State) { - logger.Info("Starting informant downscale request", zap.Any("action", action)) + logger.Info("Starting vm-monitor downscale request", zap.Any("action", action)) startTime = time.Now() - state.Informant().StartingDownscaleRequest(startTime) + state.Monitor().StartingDownscaleRequest(startTime) }) - result, err := doSingleInformantDownscaleRequest(ctx, ifaceLogger, informant, action) + result, err := doSingleMonitorDownscaleRequest(ctx, ifaceLogger, monitor, action) endTime := time.Now() c.update(func(state *core.State) { - unchanged := idUnchanged(informant.ID()) + unchanged := idUnchanged(monitor.ID()) logFields := []zap.Field{ zap.Any("action", action), zap.Duration("duration", endTime.Sub(startTime)), @@ -112,9 +112,9 @@ func (c *ExecutorCoreWithClients) DoInformantDownscales(ctx context.Context, log } if err != nil { - logger.Error("Informant downscale request failed", append(logFields, zap.Error(err))...) 
+ logger.Error("vm-monitor downscale request failed", append(logFields, zap.Error(err))...) if unchanged { - state.Informant().DownscaleRequestFailed(endTime) + state.Monitor().DownscaleRequestFailed(endTime) } return } @@ -122,34 +122,34 @@ func (c *ExecutorCoreWithClients) DoInformantDownscales(ctx context.Context, log logFields = append(logFields, zap.Any("response", result)) if !result.Ok { - logger.Warn("Informant denied downscale", logFields...) + logger.Warn("vm-monitor denied downscale", logFields...) if unchanged { - state.Informant().DownscaleRequestDenied(endTime, action.Target) + state.Monitor().DownscaleRequestDenied(endTime, action.Target) } } else { - logger.Info("Informant approved downscale", logFields...) + logger.Info("vm-monitor approved downscale", logFields...) if unchanged { - state.Informant().DownscaleRequestAllowed(endTime, action.Target) + state.Monitor().DownscaleRequestAllowed(endTime, action.Target) } } }) } } -func doSingleInformantDownscaleRequest( +func doSingleMonitorDownscaleRequest( ctx context.Context, logger *zap.Logger, - iface InformantHandle, - action core.ActionInformantDownscale, + iface MonitorHandle, + action core.ActionMonitorDownscale, ) (*api.DownscaleResult, error) { if iface == nil { - return nil, errors.New("No currently active informant") + return nil, errors.New("No currently active vm-monitor connection") } return iface.Downscale(ctx, logger, action.Current, action.Target) } -func (c *ExecutorCoreWithClients) DoInformantUpscales(ctx context.Context, logger *zap.Logger) { +func (c *ExecutorCoreWithClients) DoMonitorUpscales(ctx context.Context, logger *zap.Logger) { var ( updates util.BroadcastReceiver = c.updates.NewReceiver() requestLock util.ChanMutex = util.NewChanMutex() @@ -167,10 +167,10 @@ func (c *ExecutorCoreWithClients) DoInformantUpscales(ctx context.Context, logge // meant to be called while holding c's lock idUnchanged := func(current string) bool { - if h := c.clients.Informant.GetHandle(); h != nil { + if h := c.clients.Monitor.GetHandle(); h != nil { return current == h.ID() } else { - return current == c.clients.Informant.EmptyID() + return current == c.clients.Monitor.EmptyID() } } @@ -188,7 +188,7 @@ func (c *ExecutorCoreWithClients) DoInformantUpscales(ctx context.Context, logge } // Wait until we're supposed to make a request. - if last.actions.InformantUpscale == nil { + if last.actions.MonitorUpscale == nil { select { case <-ctx.Done(): return @@ -198,12 +198,12 @@ func (c *ExecutorCoreWithClients) DoInformantUpscales(ctx context.Context, logge } } - action := *last.actions.InformantUpscale + action := *last.actions.MonitorUpscale - informant := c.clients.Informant.GetHandle() + monitor := c.clients.Monitor.GetHandle() - if informant != nil { - requestLock = informant.RequestLock() + if monitor != nil { + requestLock = monitor.RequestLock() // Try to acquire the request lock, but if something happens while we're waiting, we'll // abort & retry on the next loop iteration (or maybe not, if last.actions changed). 
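Both executor loops share one shape: wait until the core state yields an action, perform the request outside any state lock, and record the outcome only if the vm-monitor connection is still the one the request was sent to (the `idUnchanged` guard). A schematic version with plain channels; the types are placeholders, not the real `core.State` or executor interfaces:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

type action struct{ target int }

type state struct {
	pending   *action // nil means "nothing to do right now"
	monitorID string
}

func run(ctx context.Context, updates <-chan state, currentID func() string) {
	for {
		var st state
		select {
		case <-ctx.Done():
			return
		case st = <-updates:
		}

		if st.pending == nil {
			continue // wait for the next update
		}

		id := st.monitorID
		fmt.Printf("performing request for target=%d against monitor %q\n", st.pending.target, id)
		// ... the actual request would happen here, without holding any state lock ...

		// Only record the outcome if the same monitor connection is still current;
		// otherwise the result belongs to a connection that has since been replaced.
		if currentID() == id {
			fmt.Println("recording result")
		} else {
			fmt.Println("monitor changed mid-request; dropping result")
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()

	updates := make(chan state, 1)
	updates <- state{pending: &action{target: 2}, monitorID: "conn-1"}

	run(ctx, updates, func() string { return "conn-1" })
}
```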
@@ -220,16 +220,16 @@ func (c *ExecutorCoreWithClients) DoInformantUpscales(ctx context.Context, logge var startTime time.Time c.update(func(state *core.State) { - logger.Info("Starting informant upscale request", zap.Any("action", action)) + logger.Info("Starting vm-monitor upscale request", zap.Any("action", action)) startTime = time.Now() - state.Informant().StartingUpscaleRequest(startTime) + state.Monitor().StartingUpscaleRequest(startTime) }) - err := doSingleInformantUpscaleRequest(ctx, ifaceLogger, informant, action) + err := doSingleMonitorUpscaleRequest(ctx, ifaceLogger, monitor, action) endTime := time.Now() c.update(func(state *core.State) { - unchanged := idUnchanged(informant.ID()) + unchanged := idUnchanged(monitor.ID()) logFields := []zap.Field{ zap.Any("action", action), zap.Duration("duration", endTime.Sub(startTime)), @@ -237,29 +237,29 @@ func (c *ExecutorCoreWithClients) DoInformantUpscales(ctx context.Context, logge } if err != nil { - logger.Error("Informant upscale request failed", append(logFields, zap.Error(err))...) + logger.Error("vm-monitor upscale request failed", append(logFields, zap.Error(err))...) if unchanged { - state.Informant().UpscaleRequestFailed(endTime) + state.Monitor().UpscaleRequestFailed(endTime) } return } - logger.Info("Informant upscale request successful", logFields...) + logger.Info("vm-monitor upscale request successful", logFields...) if unchanged { - state.Informant().UpscaleRequestSuccessful(endTime, action.Target) + state.Monitor().UpscaleRequestSuccessful(endTime, action.Target) } }) } } -func doSingleInformantUpscaleRequest( +func doSingleMonitorUpscaleRequest( ctx context.Context, logger *zap.Logger, - iface InformantHandle, - action core.ActionInformantUpscale, + iface MonitorHandle, + action core.ActionMonitorUpscale, ) error { if iface == nil { - return errors.New("No currently active informant") + return errors.New("No currently active vm-monitor connection") } return iface.Upscale(ctx, logger, action.Current, action.Target) diff --git a/pkg/agent/globalstate.go b/pkg/agent/globalstate.go index db877e634..34e17996a 100644 --- a/pkg/agent/globalstate.go +++ b/pkg/agent/globalstate.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "strconv" "sync" "sync/atomic" "time" @@ -115,14 +116,22 @@ func (s *agentState) handleEvent(ctx context.Context, logger *zap.Logger, event switch event.kind { case vmEventDeleted: state.stop() - delete(s.pods, podName) + // mark the status as deleted, so that it gets removed from metrics. 
+ state.status.update(s, func(stat podStatus) podStatus { + stat.deleted = true + delete(s.pods, podName) // Do the removal while synchronized, because we can :) + return stat + }) case vmEventUpdated: - state.status.mu.Lock() - defer state.status.mu.Unlock() - - state.status.vmInfo = event.vmInfo - state.status.endpointID = event.endpointID - state.vmInfoUpdated.Send() + state.status.update(s, func(stat podStatus) podStatus { + now := time.Now() + stat.vmInfo = event.vmInfo + stat.endpointID = event.endpointID + stat.endpointAssignedAt = &now + state.vmInfoUpdated.Send() + + return stat + }) case vmEventAdded: s.handleVMEventAdded(ctx, event, podName) default: @@ -137,17 +146,28 @@ func (s *agentState) handleVMEventAdded( ) { runnerCtx, cancelRunnerContext := context.WithCancel(ctx) - status := &podStatus{ - mu: sync.Mutex{}, - endState: nil, - previousEndStates: nil, - vmInfo: event.vmInfo, - endpointID: event.endpointID, - - startTime: time.Now(), - lastSuccessfulInformantComm: nil, + now := time.Now() + + status := &lockedPodStatus{ + mu: sync.Mutex{}, + podStatus: podStatus{ + deleted: false, + endState: nil, + previousEndStates: nil, + vmInfo: event.vmInfo, + endpointID: event.endpointID, + endpointAssignedAt: &now, + state: "", // Explicitly set state to empty so that the initial state update does no decrement + stateUpdatedAt: now, + + startTime: now, + lastSuccessfulMonitorComm: nil, + }, } + // Empty update to trigger updating metrics and state. + status.update(s, func(s podStatus) podStatus { return s }) + restartCount := 0 runner := s.newRunner(event.vmInfo, podName, event.podIP, restartCount) runner.status = status @@ -183,7 +203,7 @@ func (s *agentState) TriggerRestartIfNecessary(runnerCtx context.Context, logger // 2. Wait for a random amount of time (between RunnerRestartMinWaitSeconds and RunnerRestartMaxWaitSeconds) // 3. 
Restart the Runner (if it still should be restarted) - status, ok := func() (*podStatus, bool) { + status, ok := func() (*lockedPodStatus, bool) { s.lock.Lock() defer s.lock.Unlock() // note: pod.status has a separate lock, so we're ok to release s.lock @@ -248,7 +268,7 @@ func (s *agentState) TriggerRestartIfNecessary(runnerCtx context.Context, logger r := util.NewTimeRange(time.Second, RunnerRestartMinWaitSeconds, RunnerRestartMaxWaitSeconds) waitDuration = r.Random() logger.Info( - "Runner was not runnign for long, restarting after delay", + "Runner was not running for long, restarting after delay", zap.Duration("totalRuntime", totalRuntime), zap.Duration("delay", waitDuration), ) @@ -287,34 +307,34 @@ func (s *agentState) TriggerRestartIfNecessary(runnerCtx context.Context, logger return } - pod.status.mu.Lock() - defer pod.status.mu.Unlock() - - // Runner was already restarted - if pod.status.endState == nil { - addedInfo := "this generally shouldn't happen, but could if there's a new pod with the same name" - logCancel(logger.Warn, fmt.Errorf("Runner was already restarted (%s)", addedInfo)) - return - } + pod.status.update(s, func(status podStatus) podStatus { + // Runner was already restarted + if status.endState == nil { + addedInfo := "this generally shouldn't happen, but could if there's a new pod with the same name" + logCancel(logger.Warn, fmt.Errorf("Runner was already restarted (%s)", addedInfo)) + return status + } - logger.Info("Restarting runner", zap.String("exitKind", string(exitKind)), zap.Duration("delay", time.Since(endTime))) - s.metrics.runnerRestarts.Inc() + logger.Info("Restarting runner", zap.String("exitKind", string(exitKind)), zap.Duration("delay", time.Since(endTime))) + s.metrics.runnerRestarts.Inc() - restartCount := len(pod.status.previousEndStates) + 1 - runner := s.newRunner(pod.status.vmInfo, podName, podIP, restartCount) - runner.status = pod.status + restartCount := len(status.previousEndStates) + 1 + runner := s.newRunner(status.vmInfo, podName, podIP, restartCount) + runner.status = pod.status - txVMUpdate, rxVMUpdate := util.NewCondChannelPair() - // note: pod is *podState, so we don't need to re-assign to the map. - pod.vmInfoUpdated = txVMUpdate - pod.runner = runner + txVMUpdate, rxVMUpdate := util.NewCondChannelPair() + // note: pod is *podState, so we don't need to re-assign to the map. 
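The restart path sleeps a randomized delay between the configured minimum and maximum before re-spawning the Runner, which avoids synchronized restart storms across many VMs. `util.NewTimeRange(...).Random()` isn't defined in this diff, so the sketch below simply assumes it means a uniformly random duration in that range, and the constant values are placeholders:

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// Placeholder values; the real constants live elsewhere in the agent.
const (
	runnerRestartMinWaitSeconds = 5
	runnerRestartMaxWaitSeconds = 10
)

// randomDelay picks a uniformly random duration in [min, max] seconds,
// mirroring what a TimeRange.Random() helper is assumed to do.
func randomDelay(minSeconds, maxSeconds int) time.Duration {
	span := maxSeconds - minSeconds + 1
	return time.Duration(minSeconds+rand.Intn(span)) * time.Second
}

func main() {
	wait := randomDelay(runnerRestartMinWaitSeconds, runnerRestartMaxWaitSeconds)
	fmt.Println("restarting runner after", wait)
}
```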
+ pod.vmInfoUpdated = txVMUpdate + pod.runner = runner - pod.status.previousEndStates = append(pod.status.previousEndStates, *pod.status.endState) - pod.status.endState = nil - pod.status.startTime = time.Now() + status.previousEndStates = append(status.previousEndStates, *status.endState) + status.endState = nil + status.startTime = time.Now() - runnerLogger := s.loggerForRunner(pod.status.vmInfo.NamespacedName(), podName) - runner.Spawn(runnerCtx, runnerLogger, rxVMUpdate) + runnerLogger := s.loggerForRunner(status.vmInfo.NamespacedName(), podName) + runner.Spawn(runnerCtx, runnerLogger, rxVMUpdate) + return status + }) }() } @@ -338,12 +358,10 @@ func (s *agentState) newRunner(vmInfo api.VmInfo, podName util.NamespacedName, p lastMetrics: nil, scheduler: atomic.Pointer[Scheduler]{}, - server: atomic.Pointer[InformantServer]{}, - informant: nil, + monitor: atomic.Pointer[Dispatcher]{}, computeUnit: nil, lastApproved: nil, lastSchedulerError: nil, - lastInformantError: nil, backgroundWorkerCount: atomic.Int64{}, backgroundPanic: make(chan error), @@ -355,7 +373,7 @@ type podState struct { stop context.CancelFunc runner *Runner - status *podStatus + status *lockedPodStatus vmInfoUpdated util.CondChannelSender } @@ -381,16 +399,23 @@ func (p *podState) dump(ctx context.Context) podStateDump { } } -type podStatus struct { +type lockedPodStatus struct { mu sync.Mutex + podStatus +} + +type podStatus struct { startTime time.Time + // if true, the corresponding podState is no longer included in the global pod map + deleted bool + // if non-nil, the runner is finished endState *podStatusEndState previousEndStates []podStatusEndState - lastSuccessfulInformantComm *time.Time + lastSuccessfulMonitorComm *time.Time // vmInfo stores the latest information about the VM, as given by the global VM watcher. // @@ -400,6 +425,12 @@ type podStatus struct { // endpointID, if non-empty, stores the ID of the endpoint associated with the VM endpointID string + + // NB: this value, once non-nil, is never changed. + endpointAssignedAt *time.Time + + state runnerMetricState + stateUpdatedAt time.Time } type podStatusDump struct { @@ -408,11 +439,15 @@ type podStatusDump struct { EndState *podStatusEndState `json:"endState"` PreviousEndStates []podStatusEndState `json:"previousEndStates"` - LastSuccessfulInformantComm *time.Time `json:"lastSuccessfulInformantComm"` + LastSuccessfulMonitorComm *time.Time `json:"lastSuccessfulMonitorComm"` VMInfo api.VmInfo `json:"vmInfo"` - EndpointID string `json:"endpointID"` + EndpointID string `json:"endpointID"` + EndpointAssignedAt *time.Time `json:"endpointAssignedAt"` + + State runnerMetricState `json:"state"` + StateUpdatedAt time.Time `json:"stateUpdatedAt"` } type podStatusEndState struct { @@ -431,21 +466,115 @@ const ( podStatusExitCanceled podStatusExitKind = "canceled" // top-down signal that the Runner should stop. ) -func (s *podStatus) informantIsUnhealthy(config *Config) bool { +func (s *lockedPodStatus) update(global *agentState, with func(podStatus) podStatus) { s.mu.Lock() defer s.mu.Unlock() - startupGracePeriod := time.Second * time.Duration(config.Informant.UnhealthyStartupGracePeriodSeconds) - unhealthySilencePeriod := time.Second * time.Duration(config.Informant.UnhealthyAfterSilenceDurationSeconds) + newStatus := with(s.podStatus) + now := time.Now() + + // Calculate the new state: + var newState runnerMetricState + if s.deleted { + // If deleted, don't change anything. 
+ } else if s.endState != nil { + switch s.endState.ExitKind { + case podStatusExitCanceled: + // If canceled, don't change the state. + newState = s.state + case podStatusExitErrored: + newState = runnerMetricStateErrored + case podStatusExitPanicked: + newState = runnerMetricStatePanicked + } + } else if newStatus.monitorStuckAt(global.config).Before(now) { + newState = runnerMetricStateStuck + } else { + newState = runnerMetricStateOk + } + + if !newStatus.deleted { + newStatus.state = newState + newStatus.stateUpdatedAt = now + } - if s.lastSuccessfulInformantComm == nil { - return time.Since(s.startTime) >= startupGracePeriod + // Update the metrics: + // Note: s.state is initialized to the empty string to signify that it's not yet represented in + // the metrics. + if !s.deleted && s.state != "" { + oldIsEndpoint := strconv.FormatBool(s.endpointID != "") + global.metrics.runnersCount.WithLabelValues(oldIsEndpoint, string(s.state)).Dec() + } + + if !newStatus.deleted && newStatus.state != "" { + newIsEndpoint := strconv.FormatBool(newStatus.endpointID != "") + global.metrics.runnersCount.WithLabelValues(newIsEndpoint, string(newStatus.state)).Inc() + } + + s.podStatus = newStatus +} + +// monitorStuckAt returns the time at which the Runner will be marked "stuck" +func (s podStatus) monitorStuckAt(config *Config) time.Time { + startupGracePeriod := time.Second * time.Duration(config.Monitor.UnhealthyStartupGracePeriodSeconds) + unhealthySilencePeriod := time.Second * time.Duration(config.Monitor.UnhealthyAfterSilenceDurationSeconds) + + if s.lastSuccessfulMonitorComm == nil { + start := s.startTime + + // For endpoints, we should start the grace period from when the VM was *assigned* the + // endpoint, rather than when the VM was created. + if s.endpointID != "" { + start = *s.endpointAssignedAt + } + + return start.Add(startupGracePeriod) } else { - return time.Since(*s.lastSuccessfulInformantComm) >= unhealthySilencePeriod + return s.lastSuccessfulMonitorComm.Add(unhealthySilencePeriod) + } +} + +func (s *lockedPodStatus) periodicallyRefreshState(ctx context.Context, logger *zap.Logger, global *agentState) { + maxUpdateSeconds := util.Min( + global.config.Monitor.UnhealthyStartupGracePeriodSeconds, + global.config.Monitor.UnhealthyAfterSilenceDurationSeconds, + ) + // make maxTick a bit less than maxUpdateSeconds for the benefit of consistency and having + // relatively frequent log messages if things are stuck. + maxTick := time.Second * time.Duration(maxUpdateSeconds/2) + + timer := time.NewTimer(0) + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-timer.C: + } + + // use s.update to trigger re-evaluating the metrics, and simultaneously reset the timer to + // the next point in time at which the state might have changed, so that we minimize the + // time between the VM meeting the conditions for being "stuck" and us recognizing it. 
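`update()` keeps the `runnersCount` gauge consistent by decrementing the old `(is_endpoint, state)` label pair and incrementing the new one inside the same critical section, with the empty-string state meaning "not yet counted". A small sketch of that bookkeeping with client_golang; the gauge name, labels, and state values here are illustrative:

```go
package main

import (
	"fmt"
	"strconv"
	"sync"

	"github.com/prometheus/client_golang/prometheus"
)

type status struct {
	state      string // "" means "not represented in the gauge yet"
	isEndpoint bool
}

type lockedStatus struct {
	mu sync.Mutex
	status
}

var runnersCount = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{Name: "example_runners_count"},
	[]string{"is_endpoint", "state"},
)

// update applies `with` under the lock and moves the gauge from the old
// (is_endpoint, state) label pair to the new one, so the totals stay consistent.
func (s *lockedStatus) update(with func(status) status) {
	s.mu.Lock()
	defer s.mu.Unlock()

	newStatus := with(s.status)

	if s.state != "" {
		runnersCount.WithLabelValues(strconv.FormatBool(s.isEndpoint), s.state).Dec()
	}
	if newStatus.state != "" {
		runnersCount.WithLabelValues(strconv.FormatBool(newStatus.isEndpoint), newStatus.state).Inc()
	}

	s.status = newStatus
}

func main() {
	s := &lockedStatus{}
	s.update(func(st status) status { st.state = "ok"; return st })
	s.update(func(st status) status { st.state = "stuck"; return st })
	fmt.Println("final state:", s.state)
}
```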
+ s.update(global, func(stat podStatus) podStatus { + stuckAt := stat.monitorStuckAt(global.config) + now := time.Now() + if stuckAt.Before(now) && stat.state != runnerMetricStateErrored && stat.state != runnerMetricStatePanicked { + if stat.endpointID != "" { + logger.Warn("Runner with endpoint is currently stuck", zap.String("endpointID", stat.endpointID)) + } else { + logger.Warn("Runner without endpoint is currently stuck") + } + timer.Reset(maxTick) + } else { + timer.Reset(util.Min(maxTick, stuckAt.Sub(now))) + } + return stat + }) } } -func (s *podStatus) dump() podStatusDump { +func (s *lockedPodStatus) dump() podStatusDump { s.mu.Lock() defer s.mu.Unlock() @@ -463,10 +592,14 @@ func (s *podStatus) dump() podStatusDump { PreviousEndStates: previousEndStates, // FIXME: api.VmInfo contains a resource.Quantity - is that safe to copy by value? - VMInfo: s.vmInfo, - EndpointID: s.endpointID, - StartTime: s.startTime, + VMInfo: s.vmInfo, + EndpointID: s.endpointID, + EndpointAssignedAt: s.endpointAssignedAt, // ok to share the pointer, because it's not updated + StartTime: s.startTime, + + State: s.state, + StateUpdatedAt: s.stateUpdatedAt, - LastSuccessfulInformantComm: s.lastSuccessfulInformantComm, + LastSuccessfulMonitorComm: s.lastSuccessfulMonitorComm, } } diff --git a/pkg/agent/informant.go b/pkg/agent/informant.go deleted file mode 100644 index 9b38895da..000000000 --- a/pkg/agent/informant.go +++ /dev/null @@ -1,1075 +0,0 @@ -package agent - -import ( - "bytes" - "context" - "encoding/json" - "errors" - "fmt" - "io" - "net" - "net/http" - "strconv" - "strings" - "sync/atomic" - "time" - - "github.com/google/uuid" - "github.com/tychoish/fun/srv" - "go.uber.org/zap" - - "github.com/neondatabase/autoscaling/pkg/api" - "github.com/neondatabase/autoscaling/pkg/util" -) - -// The autoscaler-agent currently supports v1.0 to v2.0 of the agent<->informant protocol. -// -// If you update either of these values, make sure to also update VERSIONING.md. -const ( - MinInformantProtocolVersion api.InformantProtoVersion = api.InformantProtoV1_0 - MaxInformantProtocolVersion api.InformantProtoVersion = api.InformantProtoV2_0 -) - -type InformantServer struct { - // runner is the Runner currently responsible for this InformantServer. We must acquire its lock - // before making any updates to other fields of this struct - runner *Runner - - // desc is the AgentDesc describing this VM informant server. This field is immutable. - desc api.AgentDesc - - seqNum uint64 - // receivedIDCheck is true if the server has received at least one successful request at the /id - // endpoint by the expected IP address of the VM - // - // This field is used to check for protocol violations (i.e. responding to /register without - // checking with /id), and *may* help prevent certain IP-spoofing based attacks - although the - // security implications are entirely speculation. - receivedIDCheck bool - - // madeContact is true if any request to the VM informant could have interacted with it. - // - // If madeContact is false, then mode is guaranteed to be InformantServerUnconfirmed, so - // madeContact only needs to be set on /register requests (because all others require a - // successful register first). - // - // This field MUST NOT be updated without holding BOTH runner.lock and requestLock. - // - // This field MAY be read while holding EITHER runner.lock OR requestLock. 
- madeContact bool - - // protoVersion gives the version of the agent<->informant protocol currently in use, if the - // server has been confirmed. - // - // In other words, this field is not nil if and only if mode is not InformantServerUnconfirmed. - protoVersion *api.InformantProtoVersion - - // mode indicates whether the informant has marked the connection as resumed or not - // - // This field MUST NOT be updated without holding BOTH runner.lock AND requestLock. - // - // This field MAY be read while holding EITHER runner.lock OR requestLock. - mode InformantServerMode - - // callbacks provide an abstraction for - callbacks informantStateCallbacks - - // requestLock guards requests to the VM informant to make sure that only one request is being - // made at a time. - // - // If both requestLock and runner.lock are required, then requestLock MUST be acquired before - // runner.lock. - requestLock util.ChanMutex - - // exitStatus holds some information about why the server exited - exitStatus atomic.Pointer[InformantServerExitStatus] - - // exit signals that the server should shut down, and sets exitStatus to status. - // - // This function MUST be called while holding runner.lock. - exit func(status InformantServerExitStatus) -} - -type InformantServerMode string - -const ( - InformantServerUnconfirmed InformantServerMode = "unconfirmed" - InformantServerSuspended InformantServerMode = "suspended" - InformantServerRunning InformantServerMode = "running" -) - -// InformantServerState is the serializable state of the InformantServer, produced by calls to the -// Runner's State() method. -type InformantServerState struct { - Desc api.AgentDesc `json:"desc"` - SeqNum uint64 `json:"seqNum"` - ReceivedIDCheck bool `json:"receivedIDCheck"` - MadeContact bool `json:"madeContact"` - ProtoVersion *api.InformantProtoVersion `json:"protoVersion"` - Mode InformantServerMode `json:"mode"` - ExitStatus *InformantServerExitStatus `json:"exitStatus"` -} - -type InformantServerExitStatus struct { - // Err is the error, if any, that caused the server to exit. This is only non-nil when context - // used to start the server becomes canceled (i.e. the Runner is exiting). - Err error - // RetryShouldFix is true if simply retrying should resolve err. This is true when e.g. the - // informant responds with a 404 to a downscale or upscale request - it might've restarted, so - // we just need to re-register. - RetryShouldFix bool -} - -// NewInformantServer starts an InformantServer, returning it and a signal receiver that will be -// signalled when it exits. 
-func NewInformantServer( - ctx context.Context, - logger *zap.Logger, - runner *Runner, - callbacks informantStateCallbacks, -) (*InformantServer, util.SignalReceiver, error) { - // Manually start the TCP listener so that we can see the port it's assigned - addr := net.TCPAddr{IP: net.IPv4zero, Port: 0 /* 0 means it'll be assigned any(-ish) port */} - listener, err := net.ListenTCP("tcp", &addr) - if err != nil { - return nil, util.SignalReceiver{}, fmt.Errorf("Error listening on TCP: %w", err) - } - - // Get back the assigned port - var serverAddr string - switch addr := listener.Addr().(type) { - case *net.TCPAddr: - serverAddr = fmt.Sprintf("%s:%d", runner.global.podIP, addr.Port) - default: - panic(errors.New("unexpected net.Addr type")) - } - - server := &InformantServer{ - runner: runner, - desc: api.AgentDesc{ - AgentID: uuid.New(), - ServerAddr: serverAddr, - MinProtoVersion: MinInformantProtocolVersion, - MaxProtoVersion: MaxInformantProtocolVersion, - }, - seqNum: 0, - receivedIDCheck: false, - madeContact: false, - protoVersion: nil, - mode: InformantServerUnconfirmed, - callbacks: callbacks, - requestLock: util.NewChanMutex(), - exitStatus: atomic.Pointer[InformantServerExitStatus]{}, - exit: nil, // see below. - } - - logger = logger.With(zap.Object("server", server.desc)) - logger.Info("Starting Informant server") - - mux := http.NewServeMux() - util.AddHandler(logger, mux, "/id", http.MethodGet, "struct{}", server.handleID) - util.AddHandler(logger, mux, "/resume", http.MethodPost, "ResumeAgent", server.handleResume) - util.AddHandler(logger, mux, "/suspend", http.MethodPost, "SuspendAgent", server.handleSuspend) - util.AddHandler(logger, mux, "/try-upscale", http.MethodPost, "MoreResourcesRequest", server.handleTryUpscale) - httpServer := &http.Server{Handler: mux} - - sendFinished, recvFinished := util.NewSingleSignalPair() - backgroundCtx, cancelBackground := context.WithCancel(ctx) - - // note: docs for server.exit guarantee this function is called while holding runner.lock. - server.exit = func(status InformantServerExitStatus) { - sendFinished.Send() - cancelBackground() - - // Set server.exitStatus if isn't already - if swapped := server.exitStatus.CompareAndSwap(nil, &status); swapped { - logFunc := logger.Warn - if status.RetryShouldFix { - logFunc = logger.Info - } - - logFunc("Informant server exiting", zap.Bool("retry", status.RetryShouldFix), zap.Error(status.Err)) - } - - // we need to spawn these in separate threads so the caller doesn't block while holding - // runner.lock - runner.spawnBackgroundWorker(srv.GetBaseContext(ctx), logger, "InformantServer shutdown", func(_ context.Context, logger *zap.Logger) { - // we want shutdown to (potentially) live longer than the request which - // made it, but having a timeout is still good. - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - if err := httpServer.Shutdown(ctx); err != nil { - logger.Warn("Error shutting down InformantServer", zap.Error(err)) - } - }) - if server.madeContact { - // only unregister the server if we could have plausibly contacted the informant - runner.spawnBackgroundWorker(srv.GetBaseContext(ctx), logger, "InformantServer unregister", func(_ context.Context, logger *zap.Logger) { - // we want shutdown to (potentially) live longer than the request which - // made it, but having a timeout is still good. 
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - if err := server.unregisterFromInformant(ctx, logger); err != nil { - logger.Warn("Error unregistering", zap.Error(err)) - } - }) - } - } - - // Deadlock checker for server.requestLock - // - // FIXME: make these timeouts/delays separately defined constants, or configurable - deadlockChecker := server.requestLock.DeadlockChecker(5*time.Second, time.Second) - runner.spawnBackgroundWorker(backgroundCtx, logger, "InformantServer deadlock checker", ignoreLogger(deadlockChecker)) - - // Main thread running the server. After httpServer.Serve() completes, we do some error - // handling, but that's about it. - runner.spawnBackgroundWorker(ctx, logger, "InformantServer", func(c context.Context, logger *zap.Logger) { - if err := httpServer.Serve(listener); !errors.Is(err, http.ErrServerClosed) { - logger.Error("InformantServer exited with unexpected error", zap.Error(err)) - } - - // set server.exitStatus if it isn't already -- generally this should only occur if err - // isn't http.ErrServerClosed, because other server exits should be controlled by - server.exitStatus.CompareAndSwap(nil, &InformantServerExitStatus{ - Err: fmt.Errorf("Unexpected exit: %w", err), - RetryShouldFix: false, - }) - }) - - // Thread waiting for the context to be canceled so we can use it to shut down the server - runner.spawnBackgroundWorker(ctx, logger, "InformantServer shutdown waiter", func(context.Context, *zap.Logger) { - // Wait until parent context OR server's context is done. - <-backgroundCtx.Done() - server.exit(InformantServerExitStatus{Err: nil, RetryShouldFix: false}) - }) - - runner.spawnBackgroundWorker(backgroundCtx, logger, "InformantServer health-checker", func(c context.Context, logger *zap.Logger) { - // FIXME: make this duration configurable - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - for { - select { - case <-c.Done(): - return - case <-ticker.C: - } - - var done bool - func() { - server.requestLock.Lock() - defer server.requestLock.Unlock() - - // If we've already registered with the informant, and it doesn't support health - // checks, exit. - if server.protoVersion != nil && !server.protoVersion.AllowsHealthCheck() { - logger.Info("Aborting future informant health checks because it does not support them") - done = true - return - } - - if _, err := server.HealthCheck(c, logger); err != nil { - logger.Warn("Informant health check failed", zap.Error(err)) - } - }() - if done { - return - } - } - }) - - return server, recvFinished, nil -} - -var ( - InformantServerAlreadyExitedError error = errors.New("Informant server has already exited") - InformantServerSuspendedError error = errors.New("Informant server is currently suspended") - InformantServerUnconfirmedError error = errors.New("Informant server has not yet been confirmed") - InformantServerNotCurrentError error = errors.New("Informant server has been replaced") -) - -// IsNormalInformantError returns true if the error is one of the "expected" errors that can occur -// in valid exchanges - due to unavoidable raciness or otherwise. 
-func IsNormalInformantError(err error) bool { - return errors.Is(err, InformantServerAlreadyExitedError) || - errors.Is(err, InformantServerSuspendedError) || - errors.Is(err, InformantServerUnconfirmedError) || - errors.Is(err, InformantServerNotCurrentError) -} - -// valid checks if the InformantServer is good to use for communication, returning an error if not -// -// This method can return errors for a number of unavoidably-racy protocol states - errors from this -// method should be handled as unusual, but not unexpected. Any error returned will be one of -// InformantServer{AlreadyExited,Suspended,Confirmed}Error. -// -// This method MUST be called while holding s.runner.lock. -func (s *InformantServer) valid() error { - if s.exitStatus.Load() != nil { - return InformantServerAlreadyExitedError - } - - switch s.mode { - case InformantServerRunning: - // all good; one more check - case InformantServerUnconfirmed: - return InformantServerUnconfirmedError - case InformantServerSuspended: - return InformantServerSuspendedError - default: - panic(fmt.Errorf("Unexpected InformantServerMode %q", s.mode)) - } - - if s.runner.server.Load() != s { - return InformantServerNotCurrentError - } - return nil -} - -// ExitStatus returns the InformantServerExitStatus associated with the server, if it has been -// instructed to exit -func (s *InformantServer) ExitStatus() *InformantServerExitStatus { - return s.exitStatus.Load() -} - -// setLastInformantError is a helper method to abbreviate setting the Runner's lastInformantError -// field. If runnerLocked is true, s.runner.lock will be acquired. -// -// This method MUST be called while holding s.requestLock AND EITHER holding s.runner.lock OR -// runnerLocked MUST be true. -func (s *InformantServer) setLastInformantError(err error, runnerLocked bool) { - if !runnerLocked { - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - } - - if s.runner.server.Load() == s { - s.runner.lastInformantError = err - } -} - -// RegisterWithInformant sends a /register request to the VM Informant -// -// If called after a prior success, this method will panic. If the server has already exited, this -// method will return InformantServerAlreadyExitedError. -// -// On certain errors, this method will force the server to exit. This can be checked by calling -// s.ExitStatus() and checking for a non-nil result. -// -// This method MUST NOT be called while holding s.requestLock OR s.runner.lock. 
-func (s *InformantServer) RegisterWithInformant(ctx context.Context, logger *zap.Logger) error { - logger = logger.With(zap.Object("server", s.desc)) - - s.requestLock.Lock() - defer s.requestLock.Unlock() - - // Check the current state: - err := func() error { - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - - switch s.mode { - case InformantServerUnconfirmed: - // good; this is what we're expecting - case InformantServerRunning, InformantServerSuspended: - panic(fmt.Errorf("Register called while InformantServer is already registered (mode = %q)", s.mode)) - default: - panic(fmt.Errorf("Unexpected InformantServerMode %q", s.mode)) - } - - if s.ExitStatus() != nil { - err := InformantServerAlreadyExitedError - s.setLastInformantError(err, true) - return err - } - - return nil - }() - if err != nil { - return err - } - - // Make the request: - timeout := time.Second * time.Duration(s.runner.global.config.Informant.RegisterTimeoutSeconds) - resp, statusCode, err := doInformantRequest[api.AgentDesc, api.InformantDesc]( - ctx, logger, s, timeout, http.MethodPost, "/register", &s.desc, - ) - // Do some stuff with the lock acquired: - func() { - maybeMadeContact := statusCode != 0 || ctx.Err() != nil - - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - - // Record whether we might've contacted the informant: - s.madeContact = maybeMadeContact - - if err != nil { - s.setLastInformantError(fmt.Errorf("Register request failed: %w", err), true) - - // If the informant responds that it's our fault, or it had an internal failure, we know - // that: - // 1. Neither should happen under normal operation, and - // 2. Restarting the server is *more likely* to fix it than continuing - // We shouldn't *assume* that restarting will actually fix it though, so we'll still set - // RetryShouldFix = false. - if 400 <= statusCode && statusCode <= 599 { - s.exit(InformantServerExitStatus{ - Err: err, - RetryShouldFix: false, - }) - } - } - }() - - if err != nil { - return err // the errors returned by doInformantRequest are descriptive enough. - } - - if err := validateInformantDesc(&s.desc, resp); err != nil { - err = fmt.Errorf("Received bad InformantDesc: %w", err) - s.setLastInformantError(err, false) - return err - } - - // Now that we know it's valid, set s.runner.informant ... - err = func() error { - // ... but only if the server is still current. We're ok setting it if the server isn't - // running, because it's good to have the information there. - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - - logger.Info( - "Informant server mode updated", - zap.String("action", "register"), - zap.String("oldMode", string(s.mode)), - zap.String("newMode", string(InformantServerSuspended)), - ) - - s.mode = InformantServerSuspended - s.protoVersion = &resp.ProtoVersion - - if s.runner.server.Load() == s { - // signal we've changed the informant, and do the logging while we're at it, so there's - // a synchronous record of what happened. 
- s.callbacks.registered(false, func() { - oldInformant := s.runner.informant - s.runner.informant = resp - - if oldInformant == nil { - logger.Info("Registered with informant", zap.Any("informant", *resp)) - } else if *oldInformant != *resp { - logger.Info( - "Re-registered with informant, InformantDesc changed", - zap.Any("oldInformant", *oldInformant), - zap.Any("informant", *resp), - ) - } else { - logger.Info("Re-registered with informant; InformantDesc unchanged", zap.Any("informant", *oldInformant)) - } - }) - } else { - logger.Warn("Registering with informant completed but the server has already been replaced") - } - - // we also want to do a quick protocol check here as well - if !s.receivedIDCheck { - // protocol violation - err := errors.New("Informant responded to /register with 200 without requesting /id") - s.setLastInformantError(fmt.Errorf("Protocol violation: %w", err), true) - logger.Error("Protocol violation", zap.Error(err)) - s.exit(InformantServerExitStatus{ - Err: err, - RetryShouldFix: false, - }) - return errors.New("Protocol violation") // we already logged it; don't double-log a long message - } - - return nil - }() - - if err != nil { - return err - } - - // Record that this request was handled without error - s.setLastInformantError(nil, false) - return nil -} - -// validateInformantDesc checks that the provided api.InformantDesc is valid and matches with an -// InformantServer's api.AgentDesc -func validateInformantDesc(server *api.AgentDesc, informant *api.InformantDesc) error { - // To quote the docs for api.InformantDesc.ProtoVersion: - // - // > If the VM informant does not use a protocol version within [the agent's] bounds, then it - // > MUST respond with an error status code. - // - // So if we're asked to validate the response, mismatch *should* have already been handled. - goodProtoVersion := server.MinProtoVersion <= informant.ProtoVersion && - informant.ProtoVersion <= server.MaxProtoVersion - - if !goodProtoVersion { - return fmt.Errorf( - "Unexpected protocol version: should be between %d and %d, but got %d", - server.MinProtoVersion, server.MaxProtoVersion, informant.ProtoVersion, - ) - } - - // To quote the docs for api.InformantMetricsMethod: - // - // > At least one method *must* be provided in an InformantDesc, and more than one method gives - // > the autoscaler-agent freedom to choose. - // - // We just need to check that there aren't none. - hasMetricsMethod := informant.MetricsMethod.Prometheus != nil - if !hasMetricsMethod { - return errors.New("No known metrics method given") - } - - return nil -} - -// unregisterFromInformant is an internal-ish function that sends an /unregister request to the VM -// informant -// -// Because sending an /unregister request is generally out of courtesy on exit, this method is more -// permissive about server state, and is typically called with a different Context from what would -// normally be expected. -// -// This method is only expected to be called by s.exit; calling this method before s.exitStatus has -// been set will likely cause the server to restart. -// -// This method MUST NOT be called while holding s.requestLock OR s.runner.lock. -func (s *InformantServer) unregisterFromInformant(ctx context.Context, logger *zap.Logger) error { - // note: Because this method is typically called during shutdown, we don't set - // s.runner.lastInformantError or call s.exit, even though other request helpers do. 
- - logger = logger.With(zap.Object("server", s.desc)) - - s.requestLock.Lock() - defer s.requestLock.Unlock() - - logger.Info("Sending unregister request to informant") - - // Make the request: - timeout := time.Second * time.Duration(s.runner.global.config.Informant.RegisterTimeoutSeconds) - resp, _, err := doInformantRequest[api.AgentDesc, api.UnregisterAgent]( - ctx, logger, s, timeout, http.MethodDelete, "/unregister", &s.desc, - ) - if err != nil { - return err // the errors returned by doInformantRequest are descriptive enough. - } - - logger.Info("Unregister request successful", zap.Any("response", *resp)) - return nil -} - -// doInformantRequest makes a single HTTP request to the VM informant, doing only the validation -// required to JSON decode the response -// -// The returned int gives the status code of the response. It is possible for a response with status -// 200 to still yield an error - either because of a later IO failure or bad JSON. -// -// If an error occurs before we get a response, the status code will be 0. -// -// This method MUST be called while holding s.requestLock. If not, the program will silently violate -// the protocol guarantees. -func doInformantRequest[Q any, R any]( - ctx context.Context, - logger *zap.Logger, - s *InformantServer, - timeout time.Duration, - method string, - path string, - reqData *Q, -) (_ *R, statusCode int, _ error) { - result := "" - defer func() { - s.runner.global.metrics.informantRequestsOutbound.WithLabelValues(result).Inc() - }() - - reqBody, err := json.Marshal(reqData) - if err != nil { - return nil, statusCode, fmt.Errorf("Error encoding request JSON: %w", err) - } - - reqCtx, cancel := context.WithTimeout(ctx, timeout) - defer cancel() - - url := s.informantURL(path) - request, err := http.NewRequestWithContext(reqCtx, method, url, bytes.NewReader(reqBody)) - if err != nil { - return nil, statusCode, fmt.Errorf("Error building request to %q: %w", url, err) - } - request.Header.Set("content-type", "application/json") - - logger.Info("Sending informant request", zap.String("url", url), zap.Any("request", reqData)) - - response, err := http.DefaultClient.Do(request) - if err != nil { - result = fmt.Sprintf("[error doing request: %s]", util.RootError(err)) - return nil, statusCode, fmt.Errorf("Error doing request: %w", err) - } - defer response.Body.Close() - - statusCode = response.StatusCode - result = strconv.Itoa(statusCode) - - respBody, err := io.ReadAll(response.Body) - if err != nil { - return nil, statusCode, fmt.Errorf("Error reading body for response: %w", err) - } - - if statusCode != 200 { - return nil, statusCode, fmt.Errorf( - "Received response status %d body %q", statusCode, string(respBody), - ) - } - - var respData R - if err := json.Unmarshal(respBody, &respData); err != nil { - return nil, statusCode, fmt.Errorf("Bad JSON response: %w", err) - } - - logger.Info("Got informant response", zap.String("url", url), zap.Any("response", respData)) - - return &respData, statusCode, nil -} - -// fetchAndIncrementSequenceNumber increments the sequence number and returns it -// -// This method MUST be called while holding s.runner.lock. 
-func (s *InformantServer) incrementSequenceNumber() uint64 { - s.seqNum += 1 - return s.seqNum -} - -// informantURL creates a string representing the URL for a request to the VM informant, given the -// path to use -func (s *InformantServer) informantURL(path string) string { - if !strings.HasPrefix(path, "/") { - panic(errors.New("informant URL path must start with '/'")) - } - - ip := s.runner.podIP - port := s.runner.global.config.Informant.ServerPort - return fmt.Sprintf("http://%s:%d/%s", ip, port, path[1:]) -} - -// handleID handles a request on the server's /id endpoint. This method should not be called outside -// of that context. -// -// Returns: response body (if successful), status code, error (if unsuccessful) -func (s *InformantServer) handleID(ctx context.Context, _ *zap.Logger, body *struct{}) (_ *api.AgentIdentificationMessage, code int, _ error) { - defer func() { - s.runner.global.metrics.informantRequestsInbound.WithLabelValues("/id", strconv.Itoa(code)).Inc() - }() - - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - - s.receivedIDCheck = true - - if s.ExitStatus() != nil { - return nil, 404, errors.New("Server has already exited") - } - - // Update our record of the last successful time we heard from the informant, if the server is - // currently enabled. This allows us to detect cases where the informant is not currently - // communicating back to the agent - OR when the informant never /resume'd the agent. - if s.mode == InformantServerRunning { - s.runner.setStatus(func(s *podStatus) { - now := time.Now() - s.lastSuccessfulInformantComm = &now - }) - } - - return &api.AgentIdentificationMessage{ - Data: api.AgentIdentification{AgentID: s.desc.AgentID}, - SequenceNumber: s.incrementSequenceNumber(), - }, 200, nil -} - -// handleResume handles a request on the server's /resume endpoint. This method should not be called -// outside of that context. -// -// Returns: response body (if successful), status code, error (if unsuccessful) -func (s *InformantServer) handleResume( - ctx context.Context, logger *zap.Logger, body *api.ResumeAgent, -) (_ *api.AgentIdentificationMessage, code int, _ error) { - defer func() { - s.runner.global.metrics.informantRequestsInbound.WithLabelValues("/resume", strconv.Itoa(code)).Inc() - }() - - if body.ExpectedID != s.desc.AgentID { - logger.Warn("Request AgentID not found, server has a different one") - return nil, 404, fmt.Errorf("AgentID %q not found", body.ExpectedID) - } - - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - - if s.ExitStatus() != nil { - return nil, 404, errors.New("Server has already exited") - } - - // FIXME: Our handling of the protocol here is racy (because we might receive a /resume request - // before we've processed the response from our /register request). However, that's *probably* - // actually an issue with the protocol itself, rather than our handling. - - switch s.mode { - case InformantServerSuspended: - s.mode = InformantServerRunning - s.callbacks.setActive(true, func() { - logger.Info( - "Informant server mode updated", - zap.String("action", "resume"), - zap.String("oldMode", string(InformantServerSuspended)), - zap.String("newMode", string(InformantServerRunning)), - ) - }) - case InformantServerRunning: - internalErr := errors.New("Got /resume request for server, but it is already running") - logger.Warn("Protocol violation", zap.Error(internalErr)) - - // To be nice, we'll restart the server. We don't want to make a temporary error permanent. 
- s.exit(InformantServerExitStatus{ - Err: internalErr, - RetryShouldFix: true, - }) - - return nil, 400, errors.New("Cannot resume agent that is already running") - case InformantServerUnconfirmed: - internalErr := errors.New("Got /resume request for server, but it is unconfirmed") - logger.Warn("Protocol violation", zap.Error(internalErr)) - - // To be nice, we'll restart the server. We don't want to make a temporary error permanent. - s.exit(InformantServerExitStatus{ - Err: internalErr, - RetryShouldFix: true, - }) - - return nil, 400, errors.New("Cannot resume agent that is not yet registered") - default: - panic(fmt.Errorf("Unexpected InformantServerMode %q", s.mode)) - } - - return &api.AgentIdentificationMessage{ - Data: api.AgentIdentification{AgentID: s.desc.AgentID}, - SequenceNumber: s.incrementSequenceNumber(), - }, 200, nil -} - -// handleSuspend handles a request on the server's /suspend endpoint. This method should not be -// called outside of that context. -// -// Returns: response body (if successful), status code, error (if unsuccessful) -func (s *InformantServer) handleSuspend( - ctx context.Context, logger *zap.Logger, body *api.SuspendAgent, -) (_ *api.AgentIdentificationMessage, code int, _ error) { - defer func() { - s.runner.global.metrics.informantRequestsInbound.WithLabelValues("/suspend", strconv.Itoa(code)).Inc() - }() - - if body.ExpectedID != s.desc.AgentID { - logger.Warn("Request AgentID not found, server has a different one") - return nil, 404, fmt.Errorf("AgentID %q not found", body.ExpectedID) - } - - s.runner.lock.Lock() - locked := true - defer func() { - if locked { - s.runner.lock.Unlock() - } - }() - - if s.ExitStatus() != nil { - return nil, 404, errors.New("Server has already exited") - } - - switch s.mode { - case InformantServerRunning: - s.mode = InformantServerSuspended - s.callbacks.setActive(false, func() { - logger.Info( - "Informant server mode updated", - zap.String("action", "suspend"), - zap.String("oldMode", string(InformantServerRunning)), - zap.String("newMode", string(InformantServerSuspended)), - ) - }) - case InformantServerSuspended: - internalErr := errors.New("Got /suspend request for server, but it is already suspended") - logger.Warn("Protocol violation", zap.Error(internalErr)) - - // To be nice, we'll restart the server. We don't want to make a temporary error permanent. - s.exit(InformantServerExitStatus{ - Err: internalErr, - RetryShouldFix: true, - }) - - return nil, 400, errors.New("Cannot suspend agent that is already suspended") - case InformantServerUnconfirmed: - internalErr := errors.New("Got /suspend request for server, but it is unconfirmed") - logger.Warn("Protocol violation", zap.Error(internalErr)) - - // To be nice, we'll restart the server. We don't want to make a temporary error permanent. - s.exit(InformantServerExitStatus{ - Err: internalErr, - RetryShouldFix: true, - }) - - return nil, 400, errors.New("Cannot suspend agent that is not yet registered") - } - - locked = false - s.runner.lock.Unlock() - - // Acquire s.runner.requestLock so that when we return, we can guarantee that any future - // requests to NeonVM or the scheduler will first observe that the informant is suspended and - // exit early, before actually making the request. 
- if err := s.runner.requestLock.TryLock(ctx); err != nil { - err = fmt.Errorf("Context expired while trying to acquire requestLock: %w", err) - logger.Error("Failed to synchronize on requestLock", zap.Error(err)) - return nil, 500, err - } - s.runner.requestLock.Unlock() // don't actually hold the lock, we're just using it as a barrier. - - return &api.AgentIdentificationMessage{ - Data: api.AgentIdentification{AgentID: s.desc.AgentID}, - SequenceNumber: s.incrementSequenceNumber(), - }, 200, nil -} - -// handleTryUpscale handles a request on the server's /try-upscale endpoint. This method should not -// be called outside of that context. -// -// Returns: response body (if successful), status code, error (if unsuccessful) -func (s *InformantServer) handleTryUpscale( - ctx context.Context, - logger *zap.Logger, - body *api.MoreResourcesRequest, -) (_ *api.AgentIdentificationMessage, code int, _ error) { - defer func() { - s.runner.global.metrics.informantRequestsInbound.WithLabelValues("/upscale", strconv.Itoa(code)).Inc() - }() - - if body.ExpectedID != s.desc.AgentID { - logger.Warn("Request AgentID not found, server has a different one") - return nil, 404, fmt.Errorf("AgentID %q not found", body.ExpectedID) - } - - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - - if s.ExitStatus() != nil { - return nil, 404, errors.New("Server has already exited") - } - - switch s.mode { - case InformantServerRunning: - if !s.protoVersion.HasTryUpscale() { - err := fmt.Errorf("/try-upscale not supported for protocol version %v", *s.protoVersion) - return nil, 400, err - } - - s.callbacks.upscaleRequested(body.MoreResources, func() { - if !body.MoreResources.Cpu && !body.MoreResources.Memory { - logger.Warn("Received try-upscale request that has no resources selected") - } - - logger.Info( - "Updating requested upscale", - zap.Any("requested", body.MoreResources), - ) - }) - - return &api.AgentIdentificationMessage{ - Data: api.AgentIdentification{AgentID: s.desc.AgentID}, - SequenceNumber: s.incrementSequenceNumber(), - }, 200, nil - case InformantServerSuspended: - internalErr := errors.New("Got /try-upscale request for server, but server is suspended") - logger.Warn("Protocol violation", zap.Error(internalErr)) - - // To be nice, we'll restart the server. We don't want to make a temporary error permanent. - s.exit(InformantServerExitStatus{ - Err: internalErr, - RetryShouldFix: true, - }) - - return nil, 400, errors.New("Cannot process upscale while suspended") - case InformantServerUnconfirmed: - internalErr := errors.New("Got /try-upscale request for server, but server is suspended") - logger.Warn("Protocol violation", zap.Error(internalErr)) - - // To be nice, we'll restart the server. We don't want to make a temporary error permanent. - s.exit(InformantServerExitStatus{ - Err: internalErr, - RetryShouldFix: true, - }) - - return nil, 400, errors.New("Cannot process upscale while unconfirmed") - default: - panic(fmt.Errorf("unexpected server mode: %q", s.mode)) - } -} - -// HealthCheck makes a request to the informant's /health-check endpoint, using the server's ID. -// -// This method MUST be called while holding i.server.requestLock AND NOT i.server.runner.lock. 
-func (s *InformantServer) HealthCheck(ctx context.Context, logger *zap.Logger) (*api.InformantHealthCheckResp, error) { - err := func() error { - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - return s.valid() - }() - // NB: we want to continue to perform health checks even if the informant server is not properly - // available for *normal* use. - // - // We only need to check for InformantServerSuspendedError because - // InformantServerUnconfirmedError will be handled by the retryRegister loop in - // serveInformantLoop. - if err != nil && !errors.Is(err, InformantServerSuspendedError) { - return nil, err - } - - logger = logger.With(zap.Object("server", s.desc)) - - timeout := time.Second * time.Duration(s.runner.global.config.Informant.RequestTimeoutSeconds) - id := api.AgentIdentification{AgentID: s.desc.AgentID} - - logger.Info("Sending health-check", zap.Any("id", id)) - resp, statusCode, err := doInformantRequest[api.AgentIdentification, api.InformantHealthCheckResp]( - ctx, logger, s, timeout, http.MethodPut, "/health-check", &id, - ) - if err != nil { - func() { - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - - s.setLastInformantError(fmt.Errorf("Health-check request failed: %w", err), true) - - if 400 <= statusCode && statusCode <= 599 { - s.exit(InformantServerExitStatus{ - Err: err, - RetryShouldFix: statusCode == 404, - }) - } - }() - return nil, err - } - - logger.Info("Received OK health-check result") - return resp, nil -} - -// Downscale makes a request to the informant's /downscale endpoint with the api.Resources -// -// This method MUST NOT be called while holding i.server.runner.lock. -func (s *InformantServer) Downscale(ctx context.Context, logger *zap.Logger, to api.Resources) (*api.DownscaleResult, error) { - err := func() error { - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - return s.valid() - }() - if err != nil { - return nil, err - } - - logger = logger.With(zap.Object("server", s.desc)) - - logger.Info("Sending downscale", zap.Object("target", to)) - - timeout := time.Second * time.Duration(s.runner.global.config.Informant.DownscaleTimeoutSeconds) - id := api.AgentIdentification{AgentID: s.desc.AgentID} - rawResources := to.ConvertToRaw(s.runner.vm.Mem.SlotSize) - - var statusCode int - var resp *api.DownscaleResult - if s.protoVersion.SignsResourceUpdates() { - signedRawResources := api.ResourceMessage{RawResources: rawResources, Id: id} - reqData := api.AgentResourceMessage{Data: signedRawResources, SequenceNumber: s.incrementSequenceNumber()} - resp, statusCode, err = doInformantRequest[api.AgentResourceMessage, api.DownscaleResult]( - ctx, logger, s, timeout, http.MethodPut, "/downscale", &reqData, - ) - } else { - resp, statusCode, err = doInformantRequest[api.RawResources, api.DownscaleResult]( - ctx, logger, s, timeout, http.MethodPut, "/downscale", &rawResources, - ) - } - if err != nil { - func() { - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - - s.setLastInformantError(fmt.Errorf("Downscale request failed: %w", err), true) - - if 400 <= statusCode && statusCode <= 599 { - s.exit(InformantServerExitStatus{ - Err: err, - RetryShouldFix: statusCode == 404, - }) - } - }() - return nil, err - } - - logger.Info("Received downscale result") // already logged by doInformantRequest - return resp, nil -} - -func (s *InformantServer) Upscale(ctx context.Context, logger *zap.Logger, to api.Resources) error { - err := func() error { - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - return s.valid() - }() - if err != nil { - 
return err - } - - logger = logger.With(zap.Object("server", s.desc)) - - logger.Info("Sending upscale", zap.Object("target", to)) - - timeout := time.Second * time.Duration(s.runner.global.config.Informant.DownscaleTimeoutSeconds) - id := api.AgentIdentification{AgentID: s.desc.AgentID} - rawResources := to.ConvertToRaw(s.runner.vm.Mem.SlotSize) - - var statusCode int - if s.protoVersion.SignsResourceUpdates() { - signedRawResources := api.ResourceMessage{RawResources: rawResources, Id: id} - reqData := api.AgentResourceMessage{Data: signedRawResources, SequenceNumber: s.incrementSequenceNumber()} - _, statusCode, err = doInformantRequest[api.AgentResourceMessage, struct{}]( - ctx, logger, s, timeout, http.MethodPut, "/upscale", &reqData, - ) - } else { - _, statusCode, err = doInformantRequest[api.RawResources, struct{}]( - ctx, logger, s, timeout, http.MethodPut, "/upscale", &rawResources, - ) - } - if err != nil { - func() { - s.runner.lock.Lock() - defer s.runner.lock.Unlock() - - s.setLastInformantError(fmt.Errorf("Downscale request failed: %w", err), true) - - if 400 <= statusCode && statusCode <= 599 { - s.exit(InformantServerExitStatus{ - Err: err, - RetryShouldFix: statusCode == 404, - }) - } - }() - return err - } - - logger.Info("Received successful upscale result") - return nil -} diff --git a/pkg/agent/prommetrics.go b/pkg/agent/prommetrics.go index b68770eeb..cf9fdd09c 100644 --- a/pkg/agent/prommetrics.go +++ b/pkg/agent/prommetrics.go @@ -12,14 +12,15 @@ type PromMetrics struct { schedulerRequestedChange resourceChangePair schedulerApprovedChange resourceChangePair - informantRequestsOutbound *prometheus.CounterVec - informantRequestsInbound *prometheus.CounterVec - informantRequestedChange resourceChangePair - informantApprovedChange resourceChangePair + monitorRequestsOutbound *prometheus.CounterVec + monitorRequestsInbound *prometheus.CounterVec + monitorRequestedChange resourceChangePair + monitorApprovedChange resourceChangePair neonvmRequestsOutbound *prometheus.CounterVec neonvmRequestedChange resourceChangePair + runnersCount *prometheus.GaugeVec runnerFatalErrors prometheus.Counter runnerThreadPanics prometheus.Counter runnerStarts prometheus.Counter @@ -37,6 +38,15 @@ const ( directionValueDec = "dec" ) +type runnerMetricState string + +const ( + runnerMetricStateOk runnerMetricState = "ok" + runnerMetricStateStuck runnerMetricState = "stuck" + runnerMetricStateErrored runnerMetricState = "errored" + runnerMetricStatePanicked runnerMetricState = "panicked" +) + func makePrometheusParts(globalstate *agentState) (PromMetrics, *prometheus.Registry) { reg := prometheus.NewRegistry() @@ -92,49 +102,49 @@ func makePrometheusParts(globalstate *agentState) (PromMetrics, *prometheus.Regi )), }, - // ---- INFORMANT ---- - informantRequestsOutbound: util.RegisterMetric(reg, prometheus.NewCounterVec( + // ---- MONITOR ---- + monitorRequestsOutbound: util.RegisterMetric(reg, prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "autoscaling_agent_informant_outbound_requests_total", - Help: "Number of attempted HTTP requests to vm-informants by autoscaler-agents", + Name: "autoscaling_agent_monitor_outbound_requests_total", + Help: "Number of attempted HTTP requests to vm-monitors by autoscaler-agents", }, - []string{"code"}, + []string{"endpoint", "code"}, )), - informantRequestsInbound: util.RegisterMetric(reg, prometheus.NewCounterVec( + monitorRequestsInbound: util.RegisterMetric(reg, prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: 
"autoscaling_agent_informant_inbound_requests_total", - Help: "Number of HTTP requests from vm-informants received by autoscaler-agents", + Name: "autoscaling_agent_monitor_inbound_requests_total", + Help: "Number of HTTP requests from vm-monitors received by autoscaler-agents", }, []string{"endpoint", "code"}, )), - informantRequestedChange: resourceChangePair{ + monitorRequestedChange: resourceChangePair{ cpu: util.RegisterMetric(reg, prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "autoscaling_agent_informant_requested_cpu_change_total", - Help: "Total change in CPU requested from the informant(s)", + Name: "autoscaling_agent_monitor_requested_cpu_change_total", + Help: "Total change in CPU requested from the vm-monitor(s)", }, []string{directionLabel}, )), mem: util.RegisterMetric(reg, prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "autoscaling_agent_informant_requested_mem_change_total", - Help: "Total change in memory (in MiB) requested from the informant(s)", + Name: "autoscaling_agent_monitor_requested_mem_change_total", + Help: "Total change in memory (in MiB) requested from the vm-monitor(s)", }, []string{directionLabel}, )), }, - informantApprovedChange: resourceChangePair{ + monitorApprovedChange: resourceChangePair{ cpu: util.RegisterMetric(reg, prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "autoscaling_agent_informant_approved_cpu_change_total", - Help: "Total change in CPU approved by the informant(s)", + Name: "autoscaling_agent_monitor_approved_cpu_change_total", + Help: "Total change in CPU approved by the vm-monitor(s)", }, []string{directionLabel}, )), mem: util.RegisterMetric(reg, prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "autoscaling_agent_informant_approved_mem_change_total", - Help: "Total change in memory (in MiB) approved by the informant(s)", + Name: "autoscaling_agent_monitor_approved_mem_change_total", + Help: "Total change in memory (in MiB) approved by the vm-monitor(s)", }, []string{directionLabel}, )), @@ -168,6 +178,14 @@ func makePrometheusParts(globalstate *agentState) (PromMetrics, *prometheus.Regi }, // ---- RUNNER LIFECYCLE ---- + runnersCount: util.RegisterMetric(reg, prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "autoscaling_agent_runners_current", + Help: "Number of per-VM runners, with associated metadata", + }, + // NB: is_endpoint ∈ ("true", "false"), state ∈ runnerMetricState = ("ok", "stuck", "errored", "panicked") + []string{"is_endpoint", "state"}, + )), runnerFatalErrors: util.RegisterMetric(reg, prometheus.NewCounter( prometheus.CounterOpts{ Name: "autoscaling_agent_runner_fatal_errors_total", @@ -201,9 +219,9 @@ func makePrometheusParts(globalstate *agentState) (PromMetrics, *prometheus.Regi // scheduler: metrics.schedulerRequestedChange, metrics.schedulerApprovedChange, - // informant: - metrics.informantRequestedChange, - metrics.informantApprovedChange, + // monitor: + metrics.monitorRequestedChange, + metrics.monitorApprovedChange, // neonvm: metrics.neonvmRequestedChange, } @@ -214,114 +232,16 @@ func makePrometheusParts(globalstate *agentState) (PromMetrics, *prometheus.Regi } } - // the remaining metrics are computed at scrape time by prom: - // register them directly. 
- reg.MustRegister(prometheus.NewGaugeFunc( - prometheus.GaugeOpts{ - Name: "autoscaling_errored_vm_runners_current", - Help: "Number of VMs whose per-VM runner has panicked (and not restarted)", - }, - func() float64 { - globalstate.lock.Lock() - defer globalstate.lock.Unlock() - - count := 0 - - for _, p := range globalstate.pods { - func() { - p.status.mu.Lock() - defer p.status.mu.Unlock() - - if p.status.endState != nil && p.status.endState.ExitKind == podStatusExitErrored { - count += 1 - } - }() - } - - return float64(count) - }, - )) - - reg.MustRegister(prometheus.NewGaugeFunc( - prometheus.GaugeOpts{ - Name: "autoscaling_panicked_vm_runners_current", - Help: "Number of VMs whose per-VM runner has panicked (and not restarted)", - }, - func() float64 { - globalstate.lock.Lock() - defer globalstate.lock.Unlock() - - count := 0 - - for _, p := range globalstate.pods { - func() { - p.status.mu.Lock() - defer p.status.mu.Unlock() - - if p.status.endState != nil && p.status.endState.ExitKind == podStatusExitPanicked { - count += 1 - } - }() - } - - return float64(count) - }, - )) - - reg.MustRegister(prometheus.NewGaugeFunc( - prometheus.GaugeOpts{ - Name: "autoscaling_agent_tracked_vms_current", - Help: "Number of autoscaling-enabled non-migrating VMs on the autoscaler-agent's node", - }, - func() float64 { - globalstate.lock.Lock() - defer globalstate.lock.Unlock() - - return float64(len(globalstate.pods)) - }, - )) - - reg.MustRegister(prometheus.NewGaugeFunc( - prometheus.GaugeOpts{ - Name: "autoscaling_vms_unsuccessful_communication_with_informant_current", - Help: "Number of VMs whose vm-informants aren't successfully communicating with the autoscaler-agent", - }, - func() float64 { - globalstate.lock.Lock() - defer globalstate.lock.Unlock() - - count := 0 - - for _, p := range globalstate.pods { - if p.status.informantIsUnhealthy(globalstate.config) { - count++ - } - } - - return float64(count) - }, - )) - - reg.MustRegister(prometheus.NewGaugeFunc( - prometheus.GaugeOpts{ - Name: "autoscaling_billed_vms_unsuccessful_communication_with_informant_current", - Help: "Number of VMs *getting billed* whose vm-informants aren't successfully communicating with the autoscaler-agent", - }, - func() float64 { - globalstate.lock.Lock() - defer globalstate.lock.Unlock() - - count := 0 - - for _, p := range globalstate.pods { - if p.status.endpointID != "" && p.status.informantIsUnhealthy(globalstate.config) { - count++ - } - } - - return float64(count) - }, - )) + runnerStates := []runnerMetricState{ + runnerMetricStateOk, + runnerMetricStateStuck, + runnerMetricStateErrored, + runnerMetricStatePanicked, + } + for _, s := range runnerStates { + metrics.runnersCount.WithLabelValues("true", string(s)).Set(0.0) + metrics.runnersCount.WithLabelValues("false", string(s)).Set(0.0) + } return metrics, reg } diff --git a/pkg/agent/runner.go b/pkg/agent/runner.go index f14a8ec81..f463f92bb 100644 --- a/pkg/agent/runner.go +++ b/pkg/agent/runner.go @@ -12,7 +12,7 @@ package agent // 1. It should be OK to panic, if an error is truly unrecoverable // 2. A single Runner's panic shouldn't bring down the entire autoscaler-agent¹ // 3. We want to expose a State() method to view (almost) all internal state -// 4. Some high-level actions (e.g., HTTP request to Informant; update VM to desired state) require +// 4. Some high-level actions (e.g., call to vm-monitor; update VM to desired state) require // that we have *at most* one such action running at a time. 
// // There are a number of possible solutions to this set of goals. All reasonable solutions require @@ -24,9 +24,7 @@ package agent // * "track scheduler" // * "get metrics" // * "handle VM resources" - using metrics, calculates target resources level and contacts -// scheduler, informant, and NeonVM -- the "scaling" part of "autoscaling". -// * "informant server loop" - keeps Runner.informant and Runner.server up-to-date. -// * ... and a few more. +// scheduler, vm-monitor, and NeonVM -- the "scaling" part of "autoscaling". // * Each thread makes *synchronous* HTTP requests while holding the necessary lock to prevent any other // thread from making HTTP requests to the same entity. For example: // * All requests to NeonVM and the scheduler plugin are guarded by Runner.requestLock, which @@ -84,7 +82,7 @@ type Runner struct { global *agentState // status provides the high-level status of the Runner. Reading or updating the status requires // holding podStatus.lock. Updates are typically done handled by the setStatus method. - status *podStatus + status *lockedPodStatus // shutdown provides a clean way to trigger all background Runner threads to shut down. shutdown // is set exactly once, by (*Runner).Run @@ -134,15 +132,9 @@ type Runner struct { // Each scheduler's info field is immutable. When a scheduler is replaced, only the pointer // value here is updated; the original Scheduler remains unchanged. scheduler atomic.Pointer[Scheduler] - server atomic.Pointer[InformantServer] - // informant holds the most recent InformantDesc that an InformantServer has received in its - // normal operation. If there has been at least one InformantDesc received, this field will not - // be nil. - // - // This field really should not be used except for providing RunnerState. The correct interface - // is through server.Informant(), which does all the appropriate error handling if the - // connection to the informant is not in a suitable state. - informant *api.InformantDesc + // monitor, if non nil, stores the current Dispatcher in use for communicating with the + // vm-monitor + monitor atomic.Pointer[Dispatcher] // computeUnit is the latest Compute Unit reported by a scheduler. It may be nil, if we haven't // been able to contact one yet. // @@ -157,12 +149,6 @@ type Runner struct { // to the current scheduler. This field is not nil only when scheduler is not nil. lastSchedulerError error - // lastInformantError provides the error that occurred - if any - during the most recent request - // to the VM informant. - // - // This field MUST NOT be updated without holding BOTH lock AND server.requestLock. - lastInformantError error - // backgroundWorkerCount tracks the current number of background workers. 
It is exclusively // updated by r.spawnBackgroundWorker backgroundWorkerCount atomic.Int64 @@ -205,22 +191,19 @@ type Scheduler struct { // fatal is used for signalling that fatalError has been set (and so we should look for a new // scheduler) - fatal util.SignalSender + fatal util.SignalSender[struct{}] } // RunnerState is the serializable state of the Runner, extracted by its State method type RunnerState struct { - PodIP string `json:"podIP"` - VM api.VmInfo `json:"vm"` - LastMetrics *api.Metrics `json:"lastMetrics"` - Scheduler *SchedulerState `json:"scheduler"` - Server *InformantServerState `json:"server"` - Informant *api.InformantDesc `json:"informant"` - ComputeUnit *api.Resources `json:"computeUnit"` - LastApproved *api.Resources `json:"lastApproved"` - LastSchedulerError error `json:"lastSchedulerError"` - LastInformantError error `json:"lastInformantError"` - BackgroundWorkerCount int64 `json:"backgroundWorkerCount"` + PodIP string `json:"podIP"` + VM api.VmInfo `json:"vm"` + LastMetrics *api.Metrics `json:"lastMetrics"` + Scheduler *SchedulerState `json:"scheduler"` + ComputeUnit *api.Resources `json:"computeUnit"` + LastApproved *api.Resources `json:"lastApproved"` + LastSchedulerError error `json:"lastSchedulerError"` + BackgroundWorkerCount int64 `json:"backgroundWorkerCount"` SchedulerRespondedWithMigration bool `json:"migrationStarted"` } @@ -247,28 +230,12 @@ func (r *Runner) State(ctx context.Context) (*RunnerState, error) { } } - var serverState *InformantServerState - if server := r.server.Load(); server != nil { - serverState = &InformantServerState{ - Desc: server.desc, - SeqNum: server.seqNum, - ReceivedIDCheck: server.receivedIDCheck, - MadeContact: server.madeContact, - ProtoVersion: server.protoVersion, - Mode: server.mode, - ExitStatus: server.exitStatus.Load(), - } - } - return &RunnerState{ LastMetrics: r.lastMetrics, Scheduler: scheduler, - Server: serverState, - Informant: r.informant, ComputeUnit: r.computeUnit, LastApproved: r.lastApproved, LastSchedulerError: r.lastSchedulerError, - LastInformantError: r.lastInformantError, VM: r.vm, PodIP: r.podIP, BackgroundWorkerCount: r.backgroundWorkerCount.Load(), @@ -283,12 +250,13 @@ func (r *Runner) Spawn(ctx context.Context, logger *zap.Logger, vmInfoUpdated ut defer func() { if err := recover(); err != nil { now := time.Now() - r.setStatus(func(stat *podStatus) { + r.status.update(r.global, func(stat podStatus) podStatus { stat.endState = &podStatusEndState{ ExitKind: podStatusExitPanicked, Error: fmt.Errorf("Runner %v panicked: %v", r.vm.NamespacedName(), err), Time: now, } + return stat }) } @@ -303,12 +271,13 @@ func (r *Runner) Spawn(ctx context.Context, logger *zap.Logger, vmInfoUpdated ut exitKind = podStatusExitErrored r.global.metrics.runnerFatalErrors.Inc() } - r.setStatus(func(stat *podStatus) { + r.status.update(r.global, func(stat podStatus) podStatus { stat.endState = &podStatusEndState{ ExitKind: exitKind, Error: err, Time: endTime, } + return stat }) if err != nil { @@ -319,12 +288,6 @@ func (r *Runner) Spawn(ctx context.Context, logger *zap.Logger, vmInfoUpdated ut }() } -func (r *Runner) setStatus(with func(*podStatus)) { - r.status.mu.Lock() - defer r.status.mu.Unlock() - with(r.status) -} - // Run is the main entrypoint to the long-running per-VM pod tasks func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util.CondChannelReceiver) error { ctx, r.shutdown = context.WithCancel(ctx) @@ -349,10 +312,10 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, 
vmInfoUpdated util coreExecLogger := execLogger.Named("core") executorCore := executor.NewExecutorCore(coreExecLogger.Named("state"), r.vm, executor.Config{ - DefaultScalingConfig: r.global.config.Scaling.DefaultConfig, - PluginRequestTick: time.Second * time.Duration(r.global.config.Scheduler.RequestAtLeastEverySeconds), - InformantDeniedDownscaleCooldown: time.Second * time.Duration(r.global.config.Informant.RetryDeniedDownscaleSeconds), - InformantRetryWait: time.Second * time.Duration(r.global.config.Informant.RetryFailedRequestSeconds), + DefaultScalingConfig: r.global.config.Scaling.DefaultConfig, + PluginRequestTick: time.Second * time.Duration(r.global.config.Scheduler.RequestAtLeastEverySeconds), + MonitorDeniedDownscaleCooldown: time.Second * time.Duration(r.global.config.Monitor.RetryDeniedDownscaleSeconds), + MonitorRetryWait: time.Second * time.Duration(r.global.config.Monitor.RetryFailedRequestSeconds), Warn: func(msg string, args ...any) { coreExecLogger.Warn(fmt.Sprintf(msg, args...)) }, @@ -360,13 +323,13 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util pluginIface := makePluginInterface(r, executorCore) neonvmIface := makeNeonVMInterface(r) - informantIface := makeInformantInterface(r, executorCore) + monitorIface := makeMonitorInterface(r, executorCore) // "ecwc" stands for "ExecutorCoreWithClients" ecwc := executorCore.WithClients(executor.ClientSet{ - Plugin: pluginIface, - NeonVM: neonvmIface, - Informant: informantIface, + Plugin: pluginIface, + NeonVM: neonvmIface, + Monitor: monitorIface, }) logger.Info("Starting background workers") @@ -375,6 +338,9 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util mainDeadlockChecker := r.lock.DeadlockChecker(250*time.Millisecond, time.Second) reqDeadlockChecker := r.requestLock.DeadlockChecker(5*time.Second, time.Second) + r.spawnBackgroundWorker(ctx, logger, "podStatus updater", func(c context.Context, l *zap.Logger) { + r.status.periodicallyRefreshState(c, l, r.global) + }) r.spawnBackgroundWorker(ctx, logger, "deadlock checker (main)", ignoreLogger(mainDeadlockChecker)) r.spawnBackgroundWorker(ctx, logger, "deadlock checker (request lock)", ignoreLogger(reqDeadlockChecker)) r.spawnBackgroundWorker(ctx, logger, "track scheduler", func(c context.Context, l *zap.Logger) { @@ -382,40 +348,29 @@ func (r *Runner) Run(ctx context.Context, logger *zap.Logger, vmInfoUpdated util ecwc.Updater().NewScheduler(withLock) }) }) - sendInformantUpd, recvInformantUpd := util.NewCondChannelPair() r.spawnBackgroundWorker(ctx, logger, "get metrics", func(c context.Context, l *zap.Logger) { - r.getMetricsLoop(c, l, recvInformantUpd, func(metrics api.Metrics, withLock func()) { + r.getMetricsLoop(c, l, func(metrics api.Metrics, withLock func()) { ecwc.Updater().UpdateMetrics(metrics, withLock) }) }) - r.spawnBackgroundWorker(ctx, logger, "informant server loop", func(c context.Context, l *zap.Logger) { - r.serveInformantLoop( - c, - l, - informantStateCallbacks{ - resetInformant: func(withLock func()) { - ecwc.Updater().ResetInformant(withLock) - }, - upscaleRequested: func(request api.MoreResources, withLock func()) { - ecwc.Updater().UpscaleRequested(request, withLock) - }, - registered: func(active bool, withLock func()) { - ecwc.Updater().InformantRegistered(active, func() { - sendInformantUpd.Send() - withLock() - }) - }, - setActive: func(active bool, withLock func()) { - ecwc.Updater().InformantActive(active, withLock) - }, + r.spawnBackgroundWorker(ctx, 
logger.Named("vm-monitor"), "vm-monitor reconnection loop", func(c context.Context, l *zap.Logger) { + r.connectToMonitorLoop(c, l, monitorStateCallbacks{ + reset: func(withLock func()) { + ecwc.Updater().ResetMonitor(withLock) }, - ) + upscaleRequested: func(request api.MoreResources, withLock func()) { + ecwc.Updater().UpscaleRequested(request, withLock) + }, + setActive: func(active bool, withLock func()) { + ecwc.Updater().MonitorActive(active, withLock) + }, + }) }) r.spawnBackgroundWorker(ctx, execLogger.Named("sleeper"), "executor: sleeper", ecwc.DoSleeper) r.spawnBackgroundWorker(ctx, execLogger.Named("plugin"), "executor: plugin", ecwc.DoPluginRequests) r.spawnBackgroundWorker(ctx, execLogger.Named("neonvm"), "executor: neonvm", ecwc.DoNeonVMRequests) - r.spawnBackgroundWorker(ctx, execLogger.Named("informant-downscale"), "executor: informant downscale", ecwc.DoInformantDownscales) - r.spawnBackgroundWorker(ctx, execLogger.Named("informant-upscale"), "executor: informant upscale", ecwc.DoInformantUpscales) + r.spawnBackgroundWorker(ctx, execLogger.Named("vm-monitor-downscale"), "executor: vm-monitor downscale", ecwc.DoMonitorDownscales) + r.spawnBackgroundWorker(ctx, execLogger.Named("vm-monitor-upscale"), "executor: vm-monitor upscale", ecwc.DoMonitorUpscales) // Note: Run doesn't terminate unless the parent context is cancelled - either because the VM // pod was deleted, or the autoscaler-agent is exiting. @@ -494,7 +449,6 @@ func (r *Runner) spawnBackgroundWorker(ctx context.Context, logger *zap.Logger, func (r *Runner) getMetricsLoop( ctx context.Context, logger *zap.Logger, - updatedInformant util.CondChannelReceiver, newMetrics func(metrics api.Metrics, withLock func()), ) { timeout := time.Second * time.Duration(r.global.config.Metrics.RequestTimeoutSeconds) @@ -504,7 +458,7 @@ func (r *Runner) getMetricsLoop( minWaitDuration := time.Second for { - metrics, err := r.doMetricsRequestIfEnabled(ctx, logger, timeout, updatedInformant.Consume) + metrics, err := r.doMetricsRequest(ctx, logger, timeout) if err != nil { logger.Error("Error making metrics request", zap.Error(err)) goto next @@ -526,158 +480,121 @@ func (r *Runner) getMetricsLoop( case <-minWait: } - // After waiting for the required minimum, allow shortcutting the normal wait if the - // informant was updated select { case <-ctx.Done(): return - case <-updatedInformant.Recv(): - logger.Info("Shortcutting normal metrics wait because informant was updated") case <-waitBetween: } - } } -type informantStateCallbacks struct { - resetInformant func(withLock func()) +type monitorStateCallbacks struct { + reset func(withLock func()) upscaleRequested func(request api.MoreResources, withLock func()) - registered func(active bool, withLock func()) setActive func(active bool, withLock func()) } -// serveInformantLoop repeatedly creates an InformantServer to handle communications with the VM -// informant -// -// This function directly sets the value of r.server and indirectly sets r.informant. 
-func (r *Runner) serveInformantLoop( +// connectToMonitorLoop does lifecycle management of the (re)connection to the vm-monitor +func (r *Runner) connectToMonitorLoop( ctx context.Context, logger *zap.Logger, - callbacks informantStateCallbacks, + callbacks monitorStateCallbacks, ) { - // variables set & accessed across loop iterations - var ( - normalRetryWait <-chan time.Time - minRetryWait <-chan time.Time - lastStart time.Time - ) + addr := fmt.Sprintf("ws://%s:%d/monitor", r.podIP, r.global.config.Monitor.ServerPort) - // Loop-invariant duration constants - minWait := time.Second * time.Duration(r.global.config.Informant.RetryServerMinWaitSeconds) - normalWait := time.Second * time.Duration(r.global.config.Informant.RetryServerNormalWaitSeconds) - retryRegister := time.Second * time.Duration(r.global.config.Informant.RegisterRetrySeconds) + minWait := time.Second * time.Duration(r.global.config.Monitor.ConnectionRetryMinWaitSeconds) + var lastStart time.Time -retryServer: - for { - if normalRetryWait != nil { - logger.Info("Retrying informant server after delay", zap.Duration("delay", normalWait)) - select { - case <-ctx.Done(): - return - case <-normalRetryWait: + for i := 0; ; i += 1 { + // Remove any prior Dispatcher from the Runner + if i != 0 { + func() { + r.lock.Lock() + defer r.lock.Unlock() + callbacks.reset(func() { + r.monitor.Store(nil) + logger.Info("Reset previous vm-monitor connection") + }) + }() + } + + // If the context was canceled, don't restart + if err := ctx.Err(); err != nil { + action := "attempt" + if i != 0 { + action = "retry " } + logger.Info( + fmt.Sprintf("Aborting vm-monitor connection %s because context is already canceled", action), + zap.Error(err), + ) + return } - if minRetryWait != nil { - select { - case <-minRetryWait: - logger.Info("Retrying informant server") - default: + // Delayed restart management, long because of friendly logging: + if i != 0 { + endTime := time.Now() + runtime := endTime.Sub(lastStart) + + if runtime > minWait { + logger.Info( + "Immediately retrying connection to vm-monitor", + zap.String("addr", addr), + zap.Duration("totalRuntime", runtime), + ) + } else { + delay := minWait - runtime logger.Info( - "Informant server ended quickly. 
Respecting minimum delay before restart", - zap.Duration("activeTime", time.Since(lastStart)), zap.Duration("delay", minWait), + "Connection to vm-monitor was not live for long, retrying after delay", + zap.Duration("delay", delay), + zap.Duration("totalRuntime", runtime), ) + select { + case <-time.After(delay): + logger.Info( + "Retrying connection to vm-monitor", + zap.Duration("delay", delay), + zap.Duration("waitTime", time.Since(endTime)), + zap.String("addr", addr), + ) case <-ctx.Done(): + logger.Info( + "Canceling retrying connection to vm-monitor", + zap.Duration("delay", delay), + zap.Duration("waitTime", time.Since(endTime)), + zap.Error(ctx.Err()), + ) return - case <-minRetryWait: } } + } else { + logger.Info("Connecting to vm-monitor", zap.String("addr", addr)) } - normalRetryWait = nil // only "long wait" if an error occurred - minRetryWait = time.After(minWait) - lastStart = time.Now() - - server, exited, err := NewInformantServer(ctx, logger, r, callbacks) - if ctx.Err() != nil { - if err != nil { - logger.Warn("Error starting informant server (but context canceled)", zap.Error(err)) - } - return - } else if err != nil { - normalRetryWait = time.After(normalWait) - logger.Error("Error starting informant server", zap.Error(err)) - continue retryServer + dispatcher, err := NewDispatcher(ctx, logger, addr, r, callbacks.upscaleRequested) + if err != nil { + logger.Error("Failed to connect to vm-monitor", zap.String("addr", addr), zap.Error(err)) + continue } - // Update r.server: + // Update runner to the new dispatcher func() { r.lock.Lock() defer r.lock.Unlock() - - var kind string - if r.server.Load() == nil { - kind = "Setting" - } else { - kind = "Updating" - } - - logger.Info(fmt.Sprintf("%s initial informant server", kind), zap.Object("server", server.desc)) - r.server.Store(server) + callbacks.setActive(true, func() { + r.monitor.Store(dispatcher) + logger.Info("Connected to vm-monitor") + }) }() - logger.Info("Registering with informant") - - // Try to register with the informant: - retryRegister: - for { - err := server.RegisterWithInformant(ctx, logger) - if err == nil { - break // all good; wait for the server to finish. - } else if ctx.Err() != nil { - if err != nil { - logger.Warn("Error registering with informant (but context cancelled)", zap.Error(err)) - } - return - } - - logger.Warn("Error registering with informant", zap.Error(err)) - - // Server exited; can't just retry registering. - if server.ExitStatus() != nil { - normalRetryWait = time.After(normalWait) - continue retryServer - } - - // Wait before retrying registering - logger.Info("Retrying registering with informant after delay", zap.Duration("delay", retryRegister)) - select { - case <-time.After(retryRegister): - continue retryRegister - case <-ctx.Done(): - return - } - } - - // Wait for the server to finish - select { - case <-ctx.Done(): - return - case <-exited.Recv(): - } + // Wait until the dispatcher is no longer running, either due to error or because the + // root-level Runner context was canceled. 
+ <-dispatcher.ExitSignal() - // Server finished - exitStatus := server.ExitStatus() - if exitStatus == nil { - panic(errors.New("Informant server signalled end but ExitStatus() == nil")) + if err := dispatcher.ExitError(); err != nil { + logger.Error("Dispatcher for vm-monitor connection exited due to error", zap.Error(err)) } - - if !exitStatus.RetryShouldFix { - normalRetryWait = time.After(normalWait) - } - - continue retryServer } } @@ -696,7 +613,7 @@ func (r *Runner) trackSchedulerLoop( minWait time.Duration = 5 * time.Second // minimum time we have to wait between scheduler starts okForNew <-chan time.Time // channel that sends when we've waited long enough for a new scheduler currentInfo schedwatch.SchedulerInfo - fatal util.SignalReceiver + fatal util.SignalReceiver[struct{}] failed bool ) @@ -714,7 +631,7 @@ startScheduler: failed = false // Set the current scheduler - fatal = func() util.SignalReceiver { + fatal = func() util.SignalReceiver[struct{}] { logger := logger.With(zap.Object("scheduler", currentInfo)) verb := "Setting" @@ -722,7 +639,7 @@ startScheduler: verb = "Updating" } - sendFatal, recvFatal := util.NewSingleSignalPair() + sendFatal, recvFatal := util.NewSingleSignalPair[struct{}]() sched := &Scheduler{ runner: r, @@ -830,65 +747,13 @@ waitForNewScheduler: // Lower-level implementation functions // ////////////////////////////////////////// -// doMetricsRequestIfEnabled makes a single metrics request to the VM informant, returning it -// -// This method expects that the Runner is not locked. -func (r *Runner) doMetricsRequestIfEnabled( +// doMetricsRequest makes a single metrics request to the VM +func (r *Runner) doMetricsRequest( ctx context.Context, logger *zap.Logger, timeout time.Duration, - clearNewInformantSignal func(), ) (*api.Metrics, error) { - logger.Info("Attempting metrics request") - - // FIXME: the region where the lock is held should be extracted into a separate method, called - // something like buildMetricsRequest(). - - r.lock.Lock() - locked := true - defer func() { - if locked { - r.lock.Unlock() - } - }() - - // Only clear the signal once we've locked, so that we're not racing. - // - // We don't *need* to do this, but its only cost is a small amount of code complexity, and it's - // nice to have have the guarantees around not racing. 
- clearNewInformantSignal() - - if server := r.server.Load(); server == nil || server.mode != InformantServerRunning { - var state = "unset" - if server != nil { - state = string(server.mode) - } - - logger.Info(fmt.Sprintf("Cannot make metrics request because informant server is %s", state)) - return nil, nil - } - - if r.informant == nil { - panic(errors.New("r.informant == nil but r.server.mode == InformantServerRunning")) - } - - var url string - var handle func(body []byte) (*api.Metrics, error) - - switch { - case r.informant.MetricsMethod.Prometheus != nil: - url = fmt.Sprintf("http://%s:%d/metrics", r.podIP, r.informant.MetricsMethod.Prometheus.Port) - handle = func(body []byte) (*api.Metrics, error) { - m, err := api.ReadMetrics(body, r.global.config.Metrics.LoadMetricPrefix) - if err != nil { - err = fmt.Errorf("Error reading metrics from prometheus output: %w", err) - } - return &m, err - } - default: - // Ok to panic here because this should be handled by the informant server - panic(errors.New("server's InformantDesc has unknown metrics method")) - } + url := fmt.Sprintf("http://%s:%d/metrics", r.podIP, r.global.config.Metrics.Port) reqCtx, cancel := context.WithTimeout(ctx, timeout) defer cancel() @@ -898,10 +763,6 @@ func (r *Runner) doMetricsRequestIfEnabled( panic(fmt.Errorf("Error constructing metrics request to %q: %w", url, err)) } - // Unlock while we perform the request: - locked = false - r.lock.Unlock() - logger.Info("Making metrics request to VM", zap.String("url", url)) resp, err := http.DefaultClient.Do(req) @@ -921,7 +782,12 @@ func (r *Runner) doMetricsRequestIfEnabled( return nil, fmt.Errorf("Unsuccessful response status %d: %s", resp.StatusCode, string(body)) } - return handle(body) + m, err := api.ReadMetrics(body, r.global.config.Metrics.LoadMetricPrefix) + if err != nil { + return nil, fmt.Errorf("Error reading metrics from prometheus output: %w", err) + } + + return &m, nil } func (r *Runner) doNeonVMRequest(ctx context.Context, target api.Resources) error { @@ -990,6 +856,44 @@ func (r *Runner) recordResourceChange(current, target api.Resources, metrics res } } +func doMonitorDownscale( + ctx context.Context, + logger *zap.Logger, + dispatcher *Dispatcher, + target api.Resources, +) (*api.DownscaleResult, error) { + r := dispatcher.runner + rawResources := target.ConvertToAllocation(r.vm.Mem.SlotSize) + + timeout := time.Second * time.Duration(r.global.config.Monitor.ResponseTimeoutSeconds) + + res, err := dispatcher.Call(ctx, logger, timeout, "DownscaleRequest", api.DownscaleRequest{ + Target: rawResources, + }) + if err != nil { + return nil, err + } + + return res.Result, nil +} + +func doMonitorUpscale( + ctx context.Context, + logger *zap.Logger, + dispatcher *Dispatcher, + target api.Resources, +) error { + r := dispatcher.runner + rawResources := target.ConvertToAllocation(r.vm.Mem.SlotSize) + + timeout := time.Second * time.Duration(r.global.config.Monitor.ResponseTimeoutSeconds) + + _, err := dispatcher.Call(ctx, logger, timeout, "UpscaleNotification", api.UpscaleNotification{ + Granted: rawResources, + }) + return err +} + // DoRequest sends a request to the scheduler and does not validate the response. 
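As a point of reference for the two vm-monitor helpers added above, here is a minimal sketch of how they might be driven from elsewhere in the agent. It is not part of this patch: the function name is invented, it assumes the surrounding pkg/agent package context (the Runner, Dispatcher, and api types introduced in this diff, plus that package's existing imports), and the Ok/Status fields on api.DownscaleResult are an assumption about that type's shape rather than something shown here.

```go
// tryMonitorDownscale is a hypothetical caller of doMonitorDownscale (not part
// of this patch). It fetches whichever Dispatcher connectToMonitorLoop most
// recently stored on the Runner, asks the vm-monitor to downscale, and honors
// its verdict.
func tryMonitorDownscale(ctx context.Context, logger *zap.Logger, r *Runner, target api.Resources) error {
	dispatcher := r.monitor.Load()
	if dispatcher == nil {
		return errors.New("no live vm-monitor connection")
	}

	result, err := doMonitorDownscale(ctx, logger, dispatcher, target)
	if err != nil {
		return fmt.Errorf("downscale request failed: %w", err)
	}

	// Assumption: api.DownscaleResult exposes an Ok flag and a human-readable
	// Status; the vm-monitor is allowed to deny the request.
	if !result.Ok {
		return fmt.Errorf("vm-monitor denied downscale: %s", result.Status)
	}
	return nil
}
```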
func (s *Scheduler) DoRequest( ctx context.Context, diff --git a/pkg/agent/schedwatch/trackcurrent.go b/pkg/agent/schedwatch/trackcurrent.go index 2e42e30a3..ace82c25f 100644 --- a/pkg/agent/schedwatch/trackcurrent.go +++ b/pkg/agent/schedwatch/trackcurrent.go @@ -24,7 +24,7 @@ type SchedulerWatch struct { cmd chan<- watchCmd using chan<- SchedulerInfo - stop util.SignalSender + stop util.SignalSender[struct{}] stopEventStream func() } @@ -42,7 +42,7 @@ func (w SchedulerWatch) Using(sched SchedulerInfo) { func (w SchedulerWatch) Stop() { w.stopEventStream() - w.stop.Send() + w.stop.Send(struct{}{}) } const schedulerNamespace string = "kube-system" @@ -69,7 +69,7 @@ func WatchSchedulerUpdates( deleted := make(chan SchedulerInfo) cmd := make(chan watchCmd) using := make(chan SchedulerInfo) - stopSender, stopListener := util.NewSingleSignalPair() + stopSender, stopListener := util.NewSingleSignalPair[struct{}]() state := schedulerWatchState{ queue: make([]WatchEvent, 0, 1), @@ -132,7 +132,7 @@ type schedulerWatchState struct { cmd <-chan watchCmd using <-chan SchedulerInfo - stop util.SignalReceiver + stop util.SignalReceiver[struct{}] logger *zap.Logger } diff --git a/pkg/api/VERSIONING.md b/pkg/api/VERSIONING.md index 899410d13..bbbd45f24 100644 --- a/pkg/api/VERSIONING.md +++ b/pkg/api/VERSIONING.md @@ -9,40 +9,14 @@ The table below should provide the necessary information. For each release, it g supported protocol versions by each component. The topmost line - "Current" - refers to the latest commit in this repository, possibly unreleased. -## agent<->informant protocol +## agent<->monitor protocol -| Release | autoscaler-agent | VM informant | -|---------|------------------|--------------| -| _Current_ | v1.0 - v2.0 | v2.0 - v2.0 | -| v0.11.0 | v1.0 - v2.0 | v2.0 - v2.0 | -| v0.10.0 | **v1.0 - v2.0** | **v2.0 - v2.0** | -| v0.9.0 | v1.0 - v1.2 | v1.1 - v1.2 | -| v0.8.0 | v1.0 - v1.2 | v1.1 - v1.2 | -| v0.7.2 | v1.0 - v1.2 | v1.1 - v1.2 | -| v0.7.1 | v1.0 - v1.2 | v1.1 - v1.2 | -| v0.7.0 | **v1.0 - v1.2** | **v1.1 - v1.2** | -| v0.6.0 | v1.0 - v1.1 | v1.1 only | -| v0.5.2 | v1.0 - v1.1 | v1.1 only | -| v0.5.1 | v1.0 - v1.1 | v1.1 only | -| v0.5.0 | v1.0 - v1.1 | v1.1 only | -| v0.1.17 | v1.0 - v1.1 | v1.1 only | -| v0.1.16 | v1.0 - v1.1 | v1.1 only | -| v0.1.15 | v1.0 - v1.1 | v1.1 only | -| v0.1.14 | v1.0 - v1.1 | v1.1 only | -| v0.1.13 | v1.0 - v1.1 | v1.1 only | -| v0.1.12 | v1.0 - v1.1 | v1.1 only | -| v0.1.11 | v1.0 - v1.1 | v1.1 only | -| v0.1.10 | v1.0 - v1.1 | v1.1 only | -| v0.1.9 | v1.0 - v1.1 | v1.1 only | -| v0.1.8 | v1.0 - v1.1 | v1.1 only | -| v0.1.7 | v1.0 - v1.1 | v1.1 only | -| v0.1.6 | v1.0 - v1.1 | v1.1 only | -| v0.1.5 | v1.0 - v1.1 | v1.1 only | -| v0.1.4 | **v1.0 - v1.1** | **v1.1** only | -| v0.1.3 | v1.0 only | v1.0 only | -| 0.1.2 | v1.0 only | v1.0 only | -| 0.1.1 | v1.0 only | v1.0 only | -| 0.1.0 | **v1.0** only | **v1.0** only | +| Release | autoscaler-agent | VM monitor | +|---------|------------------|------------| +| _Current_ | v1.0 only | v1.0 only | +| v0.17.0 | v1.0 only | v1.0 only | +| v0.16.0 | v1.0 only | v1.0 only | +| v0.15.0 | **v1.0** only | **v1.0** only | ## agent<->scheduler plugin protocol @@ -54,6 +28,19 @@ number. 
| Release | autoscaler-agent | Scheduler plugin | |---------|------------------|------------------| | _Current_ | v2.0 only | v1.0-v2.0 | +| v0.17.0 | v2.0 only | v1.0-v2.0 | +| v0.16.0 | v2.0 only | v1.0-v2.0 | +| v0.15.0 | v2.0 only | v1.0-v2.0 | +| v0.14.2 | v2.0 only | v1.0-v2.0 | +| v0.14.1 | v2.0 only | v1.0-v2.0 | +| v0.14.0 | v2.0 only | v1.0-v2.0 | +| v0.13.3 | v2.0 only | v1.0-v2.0 | +| v0.13.2 | v2.0 only | v1.0-v2.0 | +| v0.13.1 | v2.0 only | v1.0-v2.0 | +| v0.13.0 | v2.0 only | v1.0-v2.0 | +| v0.12.2 | v2.0 only | v1.0-v2.0 | +| v0.12.1 | v2.0 only | v1.0-v2.0 | +| v0.12.0 | v2.0 only | v1.0-v2.0 | | v0.11.0 | v2.0 only | v1.0-v2.0 | | v0.10.0 | v2.0 only | v1.0-v2.0 | | v0.9.0 | v2.0 only | v1.0-v2.0 | @@ -90,6 +77,19 @@ Note: Components v0.6.0 and below did not have a versioned protocol between the | Release | controller | runner | |---------|------------|--------| | _Current_ | 0 - 1 | 1 | +| v0.17.0 | 0 - 1 | 1 | +| v0.16.0 | 0 - 1 | 1 | +| v0.15.0 | 0 - 1 | 1 | +| v0.14.2 | 0 - 1 | 1 | +| v0.14.1 | 0 - 1 | 1 | +| v0.14.0 | 0 - 1 | 1 | +| v0.13.3 | 0 - 1 | 1 | +| v0.13.2 | 0 - 1 | 1 | +| v0.13.1 | 0 - 1 | 1 | +| v0.13.0 | 0 - 1 | 1 | +| v0.12.2 | 0 - 1 | 1 | +| v0.12.1 | 0 - 1 | 1 | +| v0.12.0 | 0 - 1 | 1 | | v0.11.0 | 0 - 1 | 1 | | v0.10.0 | 0 - 1 | 1 | | v0.9.0 | 0 - 1 | 1 | diff --git a/pkg/api/metrics.go b/pkg/api/metrics.go index 946799236..458be045b 100644 --- a/pkg/api/metrics.go +++ b/pkg/api/metrics.go @@ -11,8 +11,9 @@ import ( // Metrics gives the information pulled from node_exporter that the scheduler may use to prioritize // which pods it should migrate. type Metrics struct { - LoadAverage1Min float32 `json:"loadAvg1M"` - LoadAverage5Min float32 `json:"loadAvg5M"` + LoadAverage1Min float32 `json:"loadAvg1M"` + LoadAverage5Min float32 `json:"loadAvg5M"` + MemoryUsageBytes float32 `json:"memoryUsageBytes"` } // ReadMetrics generates Metrics from node_exporter output, or returns error on failure @@ -61,5 +62,17 @@ func ReadMetrics(nodeExporterOutput []byte, loadPrefix string) (m Metrics, err e return } + availableMem, err := getField(loadPrefix+"memory_available_bytes", "") + if err != nil { + return + } + totalMem, err := getField(loadPrefix+"memory_total_bytes", "") + if err != nil { + return + } + + // Add an extra 100 MiB to account for kernel memory usage + m.MemoryUsageBytes = totalMem - availableMem + 100*(1<<20) + return } diff --git a/pkg/api/types.go b/pkg/api/types.go index 77b3aac0b..c629405bf 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -1,10 +1,11 @@ package api import ( + "encoding/json" "errors" "fmt" + "reflect" - "github.com/google/uuid" "go.uber.org/zap/zapcore" "k8s.io/apimachinery/pkg/api/resource" @@ -232,11 +233,11 @@ func (r Resources) IncreaseFrom(old Resources) MoreResources { } } -// ConvertToRaw produces the RawResources equivalent to these Resources with the given slot size -func (r Resources) ConvertToRaw(memSlotSize *resource.Quantity) RawResources { - return RawResources{ - Cpu: r.VCPU.ToResourceQuantity(), - Memory: resource.NewQuantity(int64(r.Mem)*memSlotSize.Value(), resource.BinarySI), +// ConvertToRaw produces the Allocation equivalent to these Resources with the given slot size +func (r Resources) ConvertToAllocation(memSlotSize *resource.Quantity) Allocation { + return Allocation{ + Cpu: r.VCPU.ToResourceQuantity().AsApproximateFloat64(), + Mem: uint64(int64(r.Mem) * memSlotSize.Value()), } } @@ -273,251 +274,11 @@ type PluginResponse struct { // TODO: fill this with more information as required type 
MigrateResponse struct{} -/////////////////////////// -// VM Informant Messages // -/////////////////////////// - -// InformantProtoVersion represents a single version of the agent<->informant protocol -// -// Each version of the agent<->informant protocol is named independently from releases of the -// repository containing this code. Names follow semver, although this does not necessarily -// guarantee support - for example, the VM informant may only support versions above v1.1. -// -// Version compatibility is documented in the neighboring file VERSIONING.md. -type InformantProtoVersion uint32 - -const ( - // InformantProtoV1_0 represents v1.0 of the agent<->informant protocol - the initial version. - // - // Last used in release version 0.1.2. - InformantProtoV1_0 InformantProtoVersion = iota + 1 // +1 so we start from 1 - - // InformantProtoV1_1 represents v1.1 of the agent<->informant protocol. - // - // Changes from v1.0: - // - // * Adds /try-upscale endpoint to the autoscaler-agent. - // - // Last used in release version v0.6.0. - InformantProtoV1_1 - - // InformantProtoV1_2 represents v1.2 of the agent<->informant protocol. - // - // Changes from v1.1: - // - // * Adds /health-check endpoint to the vm-informant. - // - // Last used in release version v0.9.0 - InformantProtoV1_2 - - // InformantProtoV2_0 represents v2.0 of the agent<->informant protocol. - // - // Changes from v1.2: - // - // * Agents now return a AgentResourceMessage when notifying VM's of changes - // in resources on their /upscale and /downscale endpoints. Since - // RawResources (the response type in previous protocols) is not - // deserializable out of an AgentResourceMessage, this is a breaking - // change. - // - // Currently the latest version. - InformantProtoV2_0 - - // latestInformantProtoVersion represents the latest version of the agent<->informant protocol - // - // This value is kept private because it should not be used externally; any desired - // functionality that could be implemented with it should instead be a method on - // InformantProtoVersion. - latestInformantProtoVersion InformantProtoVersion = iota // excluding +1 makes it equal to previous -) - -func (v InformantProtoVersion) String() string { - var zero InformantProtoVersion - - switch v { - case zero: - return "" - case InformantProtoV1_0: - return "v1.0" - case InformantProtoV1_1: - return "v1.1" - case InformantProtoV1_2: - return "v1.2" - case InformantProtoV2_0: - return "v2.0" - default: - diff := v - latestInformantProtoVersion - return fmt.Sprintf("", latestInformantProtoVersion, diff) - } -} - -// IsValid returns whether the protocol version is valid. The zero value is not valid. -func (v InformantProtoVersion) IsValid() bool { - return uint(v) != 0 -} - -// HasTryUpscale returns whether this version of the protocol has the /try-upscale endpoint -// -// This is true for version v1.1 and greater. -func (v InformantProtoVersion) HasTryUpscale() bool { - return v >= InformantProtoV1_1 -} - -// AllowsHealthCheck returns whether this version of the protocol has the informant's /health-check -// endpoint -// -// This is true for version v1.2 and greater. 
-func (v InformantProtoVersion) AllowsHealthCheck() bool { - return v >= InformantProtoV1_2 -} - -// SignsResourceUpdates returns whether agents inform VMs of resource updates with an -// AgentResourceMessage in this version of the protocol -// -// This is true for version v2.0 and greater -func (v InformantProtoVersion) SignsResourceUpdates() bool { - return v >= InformantProtoV2_0 -} - -// AgentMessage is used for (almost) every message sent from the autoscaler-agent to the VM -// informant, and serves to wrap the type T with a SequenceNumber -// -// The SequenceNumber provides a total ordering of states, even if the ordering of HTTP requests and -// responses are out of order. Fundamentally this is required because we have bidirectional -// communication between the autoscaler-agent and VM informant — without it, we run the risk of racy -// behavior, which could *actually* result in data corruption. -type AgentMessage[T any] struct { - // Data is the content of the request or response - Data T `json:"data"` - - // SequenceNumber is a unique-per-instance monotonically increasing number passed in each - // non-initial message from the autoscaler-agent to the VM informant, both requests and - // responses. - SequenceNumber uint64 `json:"sequenceNumber"` -} - -// AgentDesc is the first message sent from an autoscaler-agent to a VM informant, describing some -// information about the autoscaler-agent -// -// Each time an autoscaler-agent (re)connects to a VM informant, it sends an AgentDesc to the -// "/register" endpoint. -// -// For more information on the agent<->informant protocol, refer to the top-level ARCHITECTURE.md -type AgentDesc struct { - // AgentID is a unique UUID for the current instance of the autoscaler-agent - // - // This is helpful so that we can distinguish between (incorrect) duplicate calls to /register - // and (correct) re-registering of an agent. - AgentID uuid.UUID `json:"agentID"` - - // ServeAddr gives the unique (per instance) - ServerAddr string `json:"agentServeAddr"` - - // MinProtoVersion is the minimum version of the agent<->informant protocol that the - // autoscaler-agent supports - // - // Protocol versions are always non-zero. - // - // AgentDesc must always have MinProtoVersion <= MaxProtoVersion. - MinProtoVersion InformantProtoVersion `json:"minProtoVersion"` - // MaxProtoVersion is the maximum version of the agent<->informant protocol that the - // autoscaler-agent supports, inclusive. - // - // Protocol versions are always non-zero. - // - // AgentDesc must always have MinProtoVersion <= MaxProtoVersion. - MaxProtoVersion InformantProtoVersion `json:"maxProtoVersion"` -} - -// MarshalLogObject implements zapcore.ObjectMarshaler, so that Resources can be used with zap.Object -func (d AgentDesc) MarshalLogObject(enc zapcore.ObjectEncoder) error { - enc.AddString("agentID", d.AgentID.String()) - enc.AddString("agentServeAddr", string(d.ServerAddr)) - enc.AddString("minProtoVersion", d.MinProtoVersion.String()) - enc.AddString("maxProtoVersion", d.MaxProtoVersion.String()) - return nil -} - -// ProtocolRange returns a VersionRange from d.MinProtoVersion to d.MaxProtoVersion. 
-func (d AgentDesc) ProtocolRange() VersionRange[InformantProtoVersion] { - return VersionRange[InformantProtoVersion]{ - Min: d.MinProtoVersion, - Max: d.MaxProtoVersion, - } -} - -type AgentIdentificationMessage = AgentMessage[AgentIdentification] - -// AgentIdentification affirms the AgentID of the autoscaler-agent in its initial response to a VM -// informant, on the /id endpoint. This response is always wrapped in an AgentMessage. A type alias -// for this is provided as AgentIdentificationMessage, for convenience. -type AgentIdentification struct { - // AgentID is the same AgentID as given in the AgentDesc initially provided to the VM informant - AgentID uuid.UUID `json:"agentID"` -} - -// InformantDesc describes the capabilities of a VM informant, in response to an autoscaler-agent's -// request on the /register endpoint -// -// For more information on the agent<->informant protocol, refer to the top-level ARCHITECTURE.md -type InformantDesc struct { - // ProtoVersion is the version of the agent<->informant protocol that the VM informant has - // selected - // - // If an autoscaler-agent is successfully registered, a well-behaved VM informant MUST respond - // with a ProtoVersion within the bounds of the agent's declared minimum and maximum protocol - // versions. If the VM informant does not use a protocol version within those bounds, then it - // MUST respond with an error status code. - ProtoVersion InformantProtoVersion `json:"protoVersion"` - - // MetricsMethod tells the autoscaler-agent how to fetch metrics from the VM - MetricsMethod InformantMetricsMethod `json:"metricsMethod"` -} - -// InformantMetricsMethod collects the options for ways the VM informant can report metrics -// -// At least one method *must* be provided in an InformantDesc, and more than one method gives the -// autoscaler-agent freedom to choose. -// -// We use this type so it's easier to ensure backwards compatibility with previous versions of the -// VM informant — at least during the rollout of new autoscaler-agent or VM informant versions. -type InformantMetricsMethod struct { - // Prometheus describes prometheus-format metrics, typically not through the informant itself - Prometheus *MetricsMethodPrometheus `json:"prometheus,omitempty"` -} - -// MetricsMethodPrometheus describes VM informant's metrics in the prometheus format, made available -// on a particular port -type MetricsMethodPrometheus struct { - Port uint16 `json:"port"` -} - -// InformantHealthCheckResp is the result of a successful request to a VM informant's /health-check -// endpoint. -type InformantHealthCheckResp struct{} - -// UnregisterAgent is the result of a successful request to a VM informant's /unregister endpoint -type UnregisterAgent struct { - // WasActive indicates whether the unregistered autoscaler-agent was the one in-use by the VM - // informant - WasActive bool `json:"wasActive"` -} - -// MoreResourcesRequest is the request type wrapping MoreResources that's sent by the VM informant -// to the autoscaler-agent's /try-upscale endpoint when the VM is urgently in need of more -// resources. 
-type MoreResourcesRequest struct { - MoreResources - - // ExpectedID is the expected AgentID of the autoscaler-agent - ExpectedID uuid.UUID `json:"expectedID"` -} - // MoreResources holds the data associated with a MoreResourcesRequest type MoreResources struct { - // Cpu is true if the VM informant is requesting more CPU + // Cpu is true if the vm-monitor is requesting more CPU Cpu bool `json:"cpu"` - // Memory is true if the VM informant is requesting more memory + // Memory is true if the vm-monitor is requesting more memory Memory bool `json:"memory"` } @@ -537,48 +298,6 @@ func (m MoreResources) And(cmp MoreResources) MoreResources { } } -// RawResources signals raw resource amounts, and is primarily used in communications with the VM -// informant because it doesn't know about things like memory slots. -// -// This is used in protocol versions <2. In later versions, AgentResourceMessage is used. -type RawResources struct { - Cpu *resource.Quantity `json:"cpu"` - Memory *resource.Quantity `json:"memory"` -} - -type AgentResourceMessage = AgentMessage[ResourceMessage] - -// Similar to RawResources, stores raw resource amounts. However, it also stores the ID of the agent -// notifying the VM of a change in resources. In protocol versions 2 and on, agents notify VM's of -// changes to their available resources with an AgentResourceMessage. This allows VM informants to verify -// the authenticity of the agent responding. -type ResourceMessage struct { - RawResources - Id AgentIdentification `json:"id"` -} - -// DownscaleResult is used by the VM informant to return whether it downscaled successfully, and -// some indication of its status when doing so -type DownscaleResult struct { - Ok bool - Status string -} - -// SuspendAgent is sent from the VM informant to the autoscaler-agent when it has been contacted by -// a new autoscaler-agent and wishes to switch to that instead -// -// Instead of just cutting off any connection(s) to the agent, the informant keeps it around in case -// the new one fails and it needs to fall back to the old one. -type SuspendAgent struct { - ExpectedID uuid.UUID `json:"expectedID"` -} - -// ResumeAgent is sent from the VM informant to the autoscaler-agent to resume contact when it was -// previously suspended. -type ResumeAgent struct { - ExpectedID uuid.UUID `json:"expectedID"` -} - //////////////////////////////////// // Controller <-> Runner Messages // //////////////////////////////////// @@ -606,3 +325,158 @@ const ( func (v RunnerProtoVersion) SupportsCgroupFractionalCPU() bool { return v >= RunnerProtoV1 } + +//////////////////////////////////// +// Agent <-> Monitor Messages // +//////////////////////////////////// + +// Represents the resources that a VM has been granted +type Allocation struct { + // Number of vCPUs + Cpu float64 `json:"cpu"` + + // Number of bytes + Mem uint64 `json:"mem"` +} + +// ** Types sent by monitor ** + +// This type is sent to the agent as a way to request immediate upscale. +// Since the agent cannot control if the agent will choose to upscale the VM, +// it does not return anything. If an upscale is granted, the agent will notify +// the monitor via an UpscaleConfirmation +type UpscaleRequest struct{} + +// This type is sent to the agent to confirm it successfully upscaled, meaning +// it increased its filecache and/or cgroup memory limits. The agent does not +// need to respond. +type UpscaleConfirmation struct{} + +// This type is sent to the agent to indicate if downscaling was successful. 
The
+// agent does not need to respond.
+type DownscaleResult struct {
+	Ok     bool
+	Status string
+}
+
+// ** Types sent by agent **
+
+// This type is sent to the monitor to inform it that it has been granted a greater
+// allocation. Once the monitor is done applying this new allocation (i.e., increasing
+// file cache size, cgroup memory limits) it should reply with an UpscaleConfirmation.
+type UpscaleNotification struct {
+	Granted Allocation `json:"granted"`
+}
+
+// This type is sent to the monitor as a request to downscale its resource usage.
+// Once the monitor has downscaled or failed to do so, it should respond with a
+// DownscaleResult.
+type DownscaleRequest struct {
+	Target Allocation `json:"target"`
+}
+
+// ** Types shared by agent and monitor **
+
+// This type can be sent by either party whenever they receive a message they
+// cannot deserialize properly.
+type InvalidMessage struct {
+	Error string `json:"error"`
+}
+
+// This type can be sent by either party to signal that an error occurred while carrying
+// out the other party's request, for example, the monitor erroring while trying
+// to downscale. The receiving party can then log the error or propagate it as they
+// see fit.
+type InternalError struct {
+	Error string `json:"error"`
+}
+
+// This type is sent as part of a bidirectional heartbeat between the monitor and
+// agent. The check is initiated by the agent.
+type HealthCheck struct{}
+
+// This function is used to prepare a message for serialization. Any data passed
+// to the monitor should be serialized with this function. As of protocol v1.0,
+// the following types may be sent to the monitor, and thus passed in:
+// - DownscaleRequest
+// - UpscaleNotification
+// - InvalidMessage
+// - InternalError
+// - HealthCheck
+func SerializeMonitorMessage(content any, id uint64) ([]byte, error) {
+	// The final type that gets sent over the wire
+	type Bundle struct {
+		Content any    `json:"content"`
+		Type    string `json:"type"`
+		Id      uint64 `json:"id"`
+	}
+
+	var typeStr string
+	switch content.(type) {
+	case DownscaleRequest:
+		typeStr = "DownscaleRequest"
+	case UpscaleNotification:
+		typeStr = "UpscaleNotification"
+	case InvalidMessage:
+		typeStr = "InvalidMessage"
+	case InternalError:
+		typeStr = "InternalError"
+	case HealthCheck:
+		typeStr = "HealthCheck"
+	default:
+		return nil, fmt.Errorf("unknown message type \"%s\"", reflect.TypeOf(content))
+	}
+
+	return json.Marshal(Bundle{
+		Content: content,
+		Type:    typeStr,
+		Id:      id,
+	})
+}
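// Editorial note, not part of this patch: a minimal sketch of how an agent-side caller
// might use SerializeMonitorMessage, and the wire format it produces. The message id (42)
// and the resource values are arbitrary; error handling is elided to a panic for brevity.
func ExampleSerializeMonitorMessage() {
	b, err := SerializeMonitorMessage(DownscaleRequest{
		Target: Allocation{Cpu: 0.25, Mem: 1 << 30},
	}, 42)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b))
	// Output: {"content":{"target":{"cpu":0.25,"mem":1073741824}},"type":"DownscaleRequest","id":42}
}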
+
+// MonitorProtoVersion represents a single version of the agent<->monitor protocol
+//
+// Each version of the agent<->monitor protocol is named independently from releases of the
+// repository containing this code. Names follow semver, although this does not necessarily
+// guarantee support - for example, the monitor may only support versions above v1.1.
+//
+// Version compatibility is documented in the neighboring file VERSIONING.md.
+type MonitorProtoVersion uint32
+
+const (
+	// MonitorProtoV1_0 represents v1.0 of the agent<->monitor protocol - the initial version.
+	//
+	// Currently the latest version.
+	MonitorProtoV1_0 = iota + 1
+
+	// latestMonitorProtoVersion represents the latest version of the agent<->monitor protocol
+	//
+	// This value is kept private because it should not be used externally; any desired
+	// functionality that could be implemented with it should instead be a method on
+	// MonitorProtoVersion.
+	latestMonitorProtoVersion MonitorProtoVersion = iota // excluding +1 makes it equal to previous
+)
+
+func (v MonitorProtoVersion) String() string {
+	var zero MonitorProtoVersion
+
+	switch v {
+	case zero:
+		return "<invalid: zero>"
+	case MonitorProtoV1_0:
+		return "v1.0"
+	default:
+		diff := v - latestMonitorProtoVersion
+		return fmt.Sprintf("<unknown = %v + %d>", latestMonitorProtoVersion, diff)
+	}
+}
+
+// Sent back by the monitor after figuring out what protocol version we should use
+type MonitorProtocolResponse struct {
+	// If `Error` is nil, contains the value of the settled-on protocol version.
+	// Otherwise, will be set to 0 (MonitorProtoVersion's zero value).
+	Version MonitorProtoVersion `json:"version,omitempty"`
+
+	// Will be nil if no error occurred.
+	Error *string `json:"error,omitempty"`
+}
diff --git a/pkg/api/versionutils.go b/pkg/api/versionutils.go
index 1a79d8035..217cd5cb4 100644
--- a/pkg/api/versionutils.go
+++ b/pkg/api/versionutils.go
@@ -13,9 +13,12 @@ import (
 // VersionRange is a helper type to represent a range of versions.
 //
 // The bounds are inclusive, representing all versions v with Min <= v <= Max.
+//
+// This type is sent directly to the monitor during the creation of a new
+// Dispatcher as part of figuring out which protocol to use.
 type VersionRange[V constraints.Ordered] struct {
-	Min V
-	Max V
+	Min V `json:"min"`
+	Max V `json:"max"`
 }
 
 func (r VersionRange[V]) String() string {
diff --git a/pkg/api/vminfo.go b/pkg/api/vminfo.go
index ad5377a7a..8e2f37af7 100644
--- a/pkg/api/vminfo.go
+++ b/pkg/api/vminfo.go
@@ -213,7 +213,7 @@ func ExtractVmInfo(logger *zap.Logger, vm *vmapi.VirtualMachine) (*VmInfo, error
 }
 
 func (vm VmInfo) EqualScalingBounds(cmp VmInfo) bool {
-	return vm.Min() != cmp.Min() || vm.Max() != cmp.Max()
+	return vm.Min() == cmp.Min() && vm.Max() == cmp.Max()
 }
 
 func (vm *VmInfo) applyBounds(b ScalingBounds) {
@@ -277,6 +277,11 @@ type ScalingConfig struct {
 	// CPU,
 	// scaling CPU to make this happen.
 	LoadAverageFractionTarget float64 `json:"loadAverageFractionTarget"`
+
+	// MemoryUsageFractionTarget sets the desired fraction of current memory that
+	// we would like to be using. For example, with a value of 0.7, on a 4GB VM
+	// we'd like to be using 2.8GB of memory.
+	MemoryUsageFractionTarget float64 `json:"memoryUsageFractionTarget"`
 }
 
 func (c *ScalingConfig) Validate() error {
@@ -288,6 +293,10 @@ func (c *ScalingConfig) Validate() error {
 	erc.Whenf(ec, c.LoadAverageFractionTarget < 0.0, "%s must be set to value >= 0", ".loadAverageFractionTarget")
 	erc.Whenf(ec, c.LoadAverageFractionTarget >= 2.0, "%s must be set to value < 2 ", ".loadAverageFractionTarget")
+
+	// Make sure c.MemoryUsageFractionTarget is between 0 and 1
+	erc.Whenf(ec, c.MemoryUsageFractionTarget < 0.0, "%s must be set to value >= 0", ".memoryUsageFractionTarget")
+	erc.Whenf(ec, c.MemoryUsageFractionTarget >= 1.0, "%s must be set to value < 1 ", ".memoryUsageFractionTarget")
+
 	// heads-up! some functions elsewhere depend on the concrete return type of this function.
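	// Editorial note, not part of this patch: with the checks above, a ScalingConfig like
	//
	//	{"loadAverageFractionTarget": 0.9, "memoryUsageFractionTarget": 0.75}
	//
	// passes validation, while a memoryUsageFractionTarget of 1.0 or more (or any negative
	// value) is rejected.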
return ec.Resolve() } diff --git a/pkg/api/vminfo_test.go b/pkg/api/vminfo_test.go index a78d6d414..5404acacc 100644 --- a/pkg/api/vminfo_test.go +++ b/pkg/api/vminfo_test.go @@ -29,12 +29,13 @@ func TestFormatting(t *testing.T) { }, ScalingConfig: &api.ScalingConfig{ LoadAverageFractionTarget: 0.7, + MemoryUsageFractionTarget: 0.7, }, AlwaysMigrate: false, ScalingEnabled: true, }) - defaultFormat := "{Name:foo Namespace:bar Cpu:{Min:1 Max:5 Use:3.75} Mem:{Min:2 Max:6 Use:4 SlotSize:1Gi} ScalingConfig:&{LoadAverageFractionTarget:0.7} AlwaysMigrate:false ScalingEnabled:true}" - goSyntaxRepr := `api.VmInfo{Name:"foo", Namespace:"bar", Cpu:api.VmCpuInfo{Min:api.MilliCPU(1000), Max:api.MilliCPU(5000), Use:api.MilliCPU(3750)}, Mem:api.VmMemInfo{Min:2, Max:6, Use:4, SlotSize:&resource.Quantity{i:resource.int64Amount{value:1073741824, scale:0}, d:resource.infDecAmount{Dec:(*inf.Dec)(nil)}, s:"1Gi", Format:"BinarySI"}}, ScalingConfig:&api.ScalingConfig{LoadAverageFractionTarget:0.7}, AlwaysMigrate:false, ScalingEnabled:true}` + defaultFormat := "{Name:foo Namespace:bar Cpu:{Min:1 Max:5 Use:3.75} Mem:{Min:2 Max:6 Use:4 SlotSize:1Gi} ScalingConfig:&{LoadAverageFractionTarget:0.7 MemoryUsageFractionTarget:0.7} AlwaysMigrate:false ScalingEnabled:true}" + goSyntaxRepr := `api.VmInfo{Name:"foo", Namespace:"bar", Cpu:api.VmCpuInfo{Min:api.MilliCPU(1000), Max:api.MilliCPU(5000), Use:api.MilliCPU(3750)}, Mem:api.VmMemInfo{Min:2, Max:6, Use:4, SlotSize:&resource.Quantity{i:resource.int64Amount{value:1073741824, scale:0}, d:resource.infDecAmount{Dec:(*inf.Dec)(nil)}, s:"1Gi", Format:"BinarySI"}}, ScalingConfig:&api.ScalingConfig{LoadAverageFractionTarget:0.7, MemoryUsageFractionTarget:0.7}, AlwaysMigrate:false, ScalingEnabled:true}` cases := []struct { name string expected string diff --git a/pkg/billing/client.go b/pkg/billing/client.go index 2514744f2..6dfcf0334 100644 --- a/pkg/billing/client.go +++ b/pkg/billing/client.go @@ -11,6 +11,7 @@ import ( "time" "github.com/google/uuid" + "github.com/lithammer/shortuuid" ) type Client struct { @@ -27,58 +28,53 @@ func NewClient(url string, c *http.Client) Client { return Client{BaseURL: url, httpc: c, hostname: hostname} } -func (c Client) NewBatch() *Batch { return &Batch{c: c, events: nil} } - -type Batch struct { - // Q: does this need a mutex? 
- c Client - events []any -} - -// Count returns the number of events in the batch -func (b *Batch) Count() int { - return len(b.events) +func (c Client) Hostname() string { + return c.hostname } -func (b *Batch) idempotenize(key string) string { - if key != "" { - return key - } +type TraceID string - return fmt.Sprintf("Host<%s>:ID<%s>:T<%s>", b.c.hostname, uuid.NewString(), time.Now().Format(time.RFC3339)) +func (c Client) GenerateTraceID() TraceID { + return TraceID(shortuuid.New()) } -func (b *Batch) AddAbsoluteEvent(e AbsoluteEvent) { - e.Type = "absolute" - e.IdempotencyKey = b.idempotenize(e.IdempotencyKey) - b.events = append(b.events, &e) -} +// Enrich sets the event's Type and IdempotencyKey fields, so that users of this API don't need to +// manually set them +func Enrich[E Event](hostname string, event E) E { + event.setType() + + key := event.getIdempotencyKey() + if *key == "" { + *key = fmt.Sprintf("Host<%s>:ID<%s>:T<%s>", hostname, uuid.NewString(), time.Now().Format(time.RFC3339)) + } -func (b *Batch) AddIncrementalEvent(e IncrementalEvent) { - e.Type = "incremental" - e.IdempotencyKey = b.idempotenize(e.IdempotencyKey) - b.events = append(b.events, &e) + return event } -func (b *Batch) Send(ctx context.Context) error { - if len(b.events) == 0 { +// Send attempts to push the events to the remote endpoint. +// +// On failure, the error is guaranteed to be one of: JSONError, RequestError, or +// UnexpectedStatusCodeError. +func Send[E Event](ctx context.Context, client Client, traceID TraceID, events []E) error { + if len(events) == 0 { return nil } payload, err := json.Marshal(struct { - Events []any `json:"events"` - }{Events: b.events}) + Events []E `json:"events"` + }{Events: events}) if err != nil { return err } - r, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("%s/usage_events", b.c.BaseURL), bytes.NewReader(payload)) + r, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("%s/usage_events", client.BaseURL), bytes.NewReader(payload)) if err != nil { return err } r.Header.Set("content-type", "application/json") + r.Header.Set("x-trace-id", string(traceID)) - resp, err := b.c.httpc.Do(r) + resp, err := client.httpc.Do(r) if err != nil { return err } @@ -87,9 +83,8 @@ func (b *Batch) Send(ctx context.Context) error { // theoretically if wanted/needed, we should use an http handler that // does the retrying, to avoid writing that logic here. if resp.StatusCode != http.StatusOK { - return fmt.Errorf("got code %d, posting %d events", resp.StatusCode, len(b.events)) + return fmt.Errorf("got code %d, posting %d events", resp.StatusCode, len(events)) } - b.events = nil return nil } diff --git a/pkg/billing/model.go b/pkg/billing/model.go index c3fcedb53..f66072bfb 100644 --- a/pkg/billing/model.go +++ b/pkg/billing/model.go @@ -4,6 +4,32 @@ import ( "time" ) +type Event interface { + *AbsoluteEvent | *IncrementalEvent + + // eventMethods must be separate from Event so that we can assert that *AbsoluteEvent and + // *IncrementalEvent both implement it - Go does not allow converting to a value of type Event + // because it contains "*AbsoluteEvent | *IncrementalEvent", and such constraints can only be + // used inside of generics. + eventMethods +} + +// eventMethods is a requirement for Event, but exists separately so that we can assert that the +// event types implement it. 
+// +// The reason this interface even exists in the first place is because we're not allowed to assume +// that a type E implementing Event actually has the common fields from AbsoluteEvent and +// IncrementalEvent, even though it's constrained to either of those types. +type eventMethods interface { + setType() + getIdempotencyKey() *string +} + +var ( + _ eventMethods = (*AbsoluteEvent)(nil) + _ eventMethods = (*IncrementalEvent)(nil) +) + type AbsoluteEvent struct { IdempotencyKey string `json:"idempotency_key"` MetricName string `json:"metric"` @@ -14,6 +40,16 @@ type AbsoluteEvent struct { Value int `json:"value"` } +// setType implements eventMethods +func (e *AbsoluteEvent) setType() { + e.Type = "absolute" +} + +// getIdempotencyKey implements eventMethods +func (e *AbsoluteEvent) getIdempotencyKey() *string { + return &e.IdempotencyKey +} + type IncrementalEvent struct { IdempotencyKey string `json:"idempotency_key"` MetricName string `json:"metric"` @@ -23,3 +59,13 @@ type IncrementalEvent struct { StopTime time.Time `json:"stop_time"` Value int `json:"value"` } + +// setType implements eventMethods +func (e *IncrementalEvent) setType() { + e.Type = "incremental" +} + +// getIdempotencyKey implements eventMethods +func (e *IncrementalEvent) getIdempotencyKey() *string { + return &e.IdempotencyKey +} diff --git a/pkg/informant/agent.go b/pkg/informant/agent.go deleted file mode 100644 index 87ba98f6f..000000000 --- a/pkg/informant/agent.go +++ /dev/null @@ -1,875 +0,0 @@ -package informant - -// This file contains the "client" methods for communicating with an autoscaler-agent - -import ( - "bytes" - "context" - "encoding/json" - "errors" - "fmt" - "io" - "net/http" - "sync" - "time" - - "github.com/google/uuid" - "go.uber.org/zap" - "go.uber.org/zap/zapcore" - "golang.org/x/exp/slices" - - "github.com/neondatabase/autoscaling/pkg/api" - "github.com/neondatabase/autoscaling/pkg/util" -) - -// The VM informant currently supports v1.1 and v1.2 of the agent<->informant protocol. -// -// If you update either of these values, make sure to also update VERSIONING.md. -const ( - MinProtocolVersion api.InformantProtoVersion = api.InformantProtoV2_0 - MaxProtocolVersion api.InformantProtoVersion = api.InformantProtoV2_0 -) - -// AgentSet is the global state handling various autoscaler-agents that we could connect to -type AgentSet struct { - lock util.ChanMutex - - baseLogger *zap.Logger - - // current is the agent we're currently communciating with. If there is none, then this value is - // nil - // - // This value may (temporarily) be nil even when there are other agents waiting in byIDs/byTime, - // because we rely on tryNewAgents to handle setting the value here. - current *Agent - - // wantsMemoryUpscale is true if the most recent (internal) request for immediate upscaling has - // not yet been answered (externally) by notification of an upscale from the autoscaler-agent. - wantsMemoryUpscale bool - - // byIDs stores all of the agents, indexed by their unique IDs - byIDs map[uuid.UUID]*Agent - // byTime stores all of the *successfully registered* agents, sorted in increasing order of - // their initial /register request. Agents that we're currently in the process of handling will - // be present in byIDs, but not here. - byTime []*Agent - - tryNewAgent chan<- struct{} -} - -type Agent struct { - // lock is required for accessing the mutable fields of this struct: parent and lastSeqNumber. - lock sync.Mutex - - baseLogger *zap.Logger - - // parent is the AgentSet containing this Agent. 
It is always non-nil, up until this Agent is - // unregistered with EnsureUnregistered() - parent *AgentSet - - // suspended is true if this Agent was last sent a request on /suspend. This is only ever set by - suspended bool - - // unregistered signalled when the agent is unregistered (due to an error or an /unregister - // request) - unregistered util.SignalReceiver - // Sending half of unregistered — only used by EnsureUnregistered() - signalUnregistered util.SignalSender - - id uuid.UUID - serverAddr string - - protoVersion api.InformantProtoVersion - - // all sends on requestQueue are made through the doRequest method; all receives are made from - // the runHandler background task. - requestQueue chan agentRequest - lastSeqNumber uint64 -} - -type agentRequest struct { - ctx context.Context - done util.SignalSender - doRequest func(context.Context, *http.Client) -} - -// NewAgentSet creates a new AgentSet and starts the necessary background tasks -// -// On completion, the background tasks should be ended with the Stop method. -func NewAgentSet(logger *zap.Logger) *AgentSet { - tryNewAgent := make(chan struct{}) - - agents := &AgentSet{ - lock: util.NewChanMutex(), - baseLogger: logger.Named("agent-set"), - current: nil, - wantsMemoryUpscale: false, - byIDs: make(map[uuid.UUID]*Agent), - byTime: []*Agent{}, - tryNewAgent: tryNewAgent, - } - - go agents.lock.DeadlockChecker(CheckDeadlockTimeout, CheckDeadlockDelay)(context.TODO()) - go agents.tryNewAgents(agents.baseLogger.Named("try-new-agents"), tryNewAgent) - return agents -} - -// Helper function to construct a zap.Field giving the necessary context for a particular -// autoscaler-agent -func agentZapField(id uuid.UUID, addr string) zap.Field { - return zap.Object("agent", zapcore.ObjectMarshalerFunc(func(enc zapcore.ObjectEncoder) error { - enc.AddString("id", id.String()) - enc.AddString("addr", addr) - return nil - })) -} - -// abbreviation for agentZapField(a.id, a.serveAddr) for when you're working with an Agent object directly -func (a *Agent) zapField() zap.Field { - return agentZapField(a.id, a.serverAddr) -} - -func (s *AgentSet) tryNewAgents(logger *zap.Logger, signal <-chan struct{}) { - // note: we don't close this. Sending stops when the context is done, and every read from this - // channel also handles the context being cancelled. - aggregate := make(chan struct{}) - - // Helper function to coalesce repeated incoming signals into a single output, so that we don't - // block anything from sending on signal - go func() { - noSignal: - <-signal - - yesSignal: - select { - case <-signal: - goto yesSignal - case aggregate <- struct{}{}: - goto noSignal - } - }() - - for { - <-aggregate - - // Loop through applicable Agents - loopThroughAgents: - for { - // Remove any duplicate signals from aggregate if there are any - select { - case <-aggregate: - default: - } - - candidate := func() *Agent { - s.lock.Lock() - defer s.lock.Unlock() - - if len(s.byTime) == 0 || s.current == s.byTime[len(s.byTime)-1] { - return nil - } - - return s.byTime[len(s.byTime)-1] - }() - - // If there's no remaining candidates, stop trying. - if candidate == nil { - break loopThroughAgents - } - - // Do we need to resume the agent? We will use this later - shouldResume := func() bool { - candidate.lock.Lock() - defer candidate.lock.Unlock() - - wasSuspended := candidate.suspended - candidate.suspended = false - return !wasSuspended - }() - - // Get the current agent, which we would like to replace with the candidate. 
- // We should suspend the old agent. - oldCurrent := func() (old *Agent) { - s.lock.Lock() - defer s.lock.Unlock() - - if s.current != nil { - s.current.suspended = true - } - - return s.current - }() - - if oldCurrent != nil { - handleError := func(err error) { - if errors.Is(err, context.Canceled) { - return - } - - logger.Warn("Error suspending previous Agent", oldCurrent.zapField(), zap.Error(err)) - } - - // Suspend the old agent - oldCurrent.Suspend(logger, AgentSuspendTimeout, handleError) - } - - if shouldResume { - if err := candidate.Resume(logger, AgentResumeTimeout); err != nil { - // From Resume(): - // - // > If the Agent becomes unregistered [ ... ] this method will return - // > context.Canceled - if err == context.Canceled { //nolint:errorlint // explicit error value guarantee from Resume() - continue loopThroughAgents - } - - // From Resume(): - // - // > If the request fails, the Agent will be unregistered - // - // We don't have to worry about anything extra here; just keep trying. - if err != nil { - logger.Warn("Error on Agent resume", candidate.zapField(), zap.Error(err)) - continue loopThroughAgents - } - } - } - - // Set the new agent, and do an upscale if it was requested. - func() { - s.lock.Lock() - defer s.lock.Unlock() - - s.current = candidate - - if s.wantsMemoryUpscale { - s.current.SpawnRequestUpscale(logger, AgentUpscaleTimeout, func(err error) { - if errors.Is(err, context.Canceled) { - return - } - - // note: explicitly refer to candidate here instead of s.current, because - // the value of s.current could have changed by the time this function is - // called. - logger.Error("Error requesting upscale from Agent", candidate.zapField(), zap.Error(err)) - }) - } - }() - } - } -} - -// RegisterNewAgent instantiates our local information about the autsocaler-agent -// -// Returns: protocol version, status code, error (if unsuccessful) -func (s *AgentSet) RegisterNewAgent(logger *zap.Logger, info *api.AgentDesc) (api.InformantProtoVersion, int, error) { - expectedRange := api.VersionRange[api.InformantProtoVersion]{ - Min: MinProtocolVersion, - Max: MaxProtocolVersion, - } - - descProtoRange := info.ProtocolRange() - - protoVersion, matches := expectedRange.LatestSharedVersion(descProtoRange) - if !matches { - return 0, 400, fmt.Errorf( - "Protocol version mismatch: Need %v but got %v", expectedRange, descProtoRange, - ) - } - - unregisterSend, unregisterRecv := util.NewSingleSignalPair() - - agent := &Agent{ - lock: sync.Mutex{}, - - baseLogger: s.baseLogger.Named("agent").With(agentZapField(info.AgentID, info.ServerAddr)), - parent: s, - - suspended: false, - unregistered: unregisterRecv, - signalUnregistered: unregisterSend, - - id: info.AgentID, - serverAddr: info.ServerAddr, - - protoVersion: protoVersion, - - lastSeqNumber: 0, - requestQueue: make(chan agentRequest), - } - - // Try to add the agent, if we can. 
- isDuplicate := func() bool { - s.lock.Lock() - defer s.lock.Unlock() - - if _, ok := s.byIDs[info.AgentID]; ok { - return true - } - - s.byIDs[info.AgentID] = agent - return false - }() - - if isDuplicate { - return 0, 409, fmt.Errorf("Agent with ID %s is already registered", info.AgentID) - } - - go agent.runHandler() - go agent.runBackgroundChecker() - - if err := agent.CheckID(logger, AgentBackgroundCheckTimeout); err != nil { - return 0, 400, fmt.Errorf( - "Error checking ID for agent %s/%s: %w", agent.serverAddr, agent.id, err, - ) - } - - // note: At this point, the agent has been appropriately established, but we haven't added it to - // the AgentSet's list of successfully registered Agents - func() { - // We have to acquire a lock on the Agent state here so that we don't have a race from a - // concurrent call to EnsureUnregistered(). - agent.lock.Lock() - defer agent.lock.Unlock() - - if agent.parent == nil { - // Something caused the Agent to be unregistered. We don't know what, but it wasn't the - // fault of this request. Because there's no strict happens-before relation here, we can - // pretend like the error happened after the request was fully handled, and return a - // success. - logger.Warn("Agent was unregistered before register completed", agent.zapField()) - return - } - - s.lock.Lock() - defer s.lock.Unlock() - - s.byTime = append(s.byTime, agent) - s.tryNewAgent <- struct{}{} - }() - - return protoVersion, 200, nil -} - -// RequestUpscale requests an immediate upscale for more memory, if there's an agent currently -// enabled -// -// If there's no current agent, then RequestUpscale marks the upscale as desired, and will request -// upscaling from the next agent we connect to. -func (s *AgentSet) RequestUpscale(logger *zap.Logger) { - // FIXME: we should assign a timeout to these upscale requests, so that we don't continue trying - // to upscale after the demand has gone away. - - agent := func() *Agent { - s.lock.Lock() - defer s.lock.Unlock() - - // If we already have an ongoing request, don't create a new one. - if s.wantsMemoryUpscale { - return nil - } - - s.wantsMemoryUpscale = true - return s.current - }() - - if agent == nil { - return - } - - // FIXME: it's possible to block for an unbounded amount of time waiting for the request to get - // picked up by the message queue. We *do* want backpressure here, but we should ideally have a - // way to cancel an attempted request if it's taking too long. - agent.SpawnRequestUpscale(logger, AgentUpscaleTimeout, func(err error) { - if errors.Is(err, context.Canceled) { - return - } - - s.baseLogger.Error("Error requesting upscale from current Agent", agent.zapField(), zap.Error(err)) - }) -} - -// ReceivedUpscale marks any desired upscaling from a prior s.RequestUpscale() as resolved -// -// Typically, (*CgroupState).ReceivedUpscale() is also called alongside this method. -func (s *AgentSet) ReceivedUpscale() { - s.lock.Lock() - defer s.lock.Unlock() - - s.wantsMemoryUpscale = false -} - -// Returns the current agent, which can be nil -func (s *AgentSet) Current() *Agent { - s.lock.Lock() - defer s.lock.Unlock() - return s.current -} - -// Returns the id of the AgentSet's current agent as a string. 
If the current agent is nil, -// returns "" -func (s *AgentSet) CurrentIdStr() string { - if current := s.Current(); current == nil { - return "" - } else { - return current.id.String() - } -} - -// Get returns the requested Agent, if it exists -func (s *AgentSet) Get(id uuid.UUID) (_ *Agent, ok bool) { - s.lock.Lock() - defer s.lock.Unlock() - - agent, ok := s.byIDs[id] - return agent, ok -} - -// runHandler receives inputs from the requestSet and dispatches them -func (a *Agent) runHandler() { - logger := a.baseLogger.Named("request-dispatcher") - - client := http.Client{ - CheckRedirect: func(req *http.Request, via []*http.Request) error { - err := fmt.Errorf("Unexpected redirect getting %s", req.URL) - logger.Warn(err.Error()) - return err - }, - } - - defer client.CloseIdleConnections() - - for { - // Ignore items in the requestQueue if the Agent's been unregistered. - select { - case <-a.unregistered.Recv(): - return - default: - } - - select { - case <-a.unregistered.Recv(): - return - case req := <-a.requestQueue: - func() { - reqCtx, cancel := context.WithCancel(req.ctx) - defer cancel() - - done := make(chan struct{}) - go func() { - defer req.done.Send() - defer close(done) - req.doRequest(reqCtx, &client) - }() - - select { - case <-a.unregistered.Recv(): - cancel() - // Even if we've just cancelled it, we have to wait on done so that we know the - // http.Client won't be used by other goroutines - <-done - case <-done: - } - }() - } - } -} - -// runBackgroundChecker performs periodic checks that the Agent is still available -func (a *Agent) runBackgroundChecker() { - logger := a.baseLogger.Named("background-checker") - - for { - select { - case <-a.unregistered.Recv(): - return - case <-time.After(AgentBackgroundCheckDelay): - // all good - } - - done := func() bool { - if err := a.CheckID(logger, AgentBackgroundCheckTimeout); err != nil { - // If this request was cancelled (because the agent was unregistered), we're done. - // We can't check a.unregistered because CheckID will already unregister on failure - // anyways. - if errors.Is(err, context.Canceled) { - return true - } - - logger.Warn("Agent background check failed", zap.Error(err)) - return true - } - - return false - }() - - if done { - return - } - } -} - -// doRequest is the generic wrapper around requests to the autoscaler-agent to ensure that we're -// only sending one at a time AND we appropriately keep track of sequence numbers. -// -// We can only send one at a time because http.Client isn't thread-safe, and we want to re-use it -// between requests so that we can keep the TCP connections alive. -// -// There are no guarantees made about the equality or content of errors returned from this function. 
-func doRequest[B any, R any]( - agent *Agent, - timeout time.Duration, - method string, - path string, - body *B, -) (_ *R, old bool, _ error) { - return doRequestWithStartSignal[B, R]( - agent, timeout, nil, method, path, body, - ) -} - -func doRequestWithStartSignal[B any, R any]( - agent *Agent, - timeout time.Duration, - start *util.SignalSender, - method string, - path string, - body *B, -) (_ *R, old bool, _ error) { - logger := agent.baseLogger.Named("http") - - outerContext, cancel := context.WithTimeout(context.TODO(), timeout) - defer cancel() - - var ( - responseBody api.AgentMessage[R] - oldSeqNum bool - requestErr error - ) - - sendDone, recvDone := util.NewSingleSignalPair() - - url := fmt.Sprintf("http://%s%s", agent.serverAddr, path) - - req := agentRequest{ - ctx: outerContext, - done: sendDone, - doRequest: func(ctx context.Context, client *http.Client) { - bodyBytes, err := json.Marshal(body) - if err != nil { - requestErr = fmt.Errorf("Error encoding JSON body: %w", err) - return - } - - req, err := http.NewRequestWithContext(ctx, method, url, bytes.NewReader(bodyBytes)) - if err != nil { - requestErr = fmt.Errorf("Error creating request: %w", err) - return - } - - logger.Info("Sending request to agent", zap.String("path", path), zap.Any("request", body)) - - resp, err := client.Do(req) - if err != nil { - requestErr = err - return - } - - defer resp.Body.Close() - - respBodyBytes, err := io.ReadAll(resp.Body) - if err != nil { - requestErr = fmt.Errorf("Error reading response body: %w", err) - return - } - if resp.StatusCode != 200 { - requestErr = fmt.Errorf( - "Unsuccessful response status %d: %s", - resp.StatusCode, string(respBodyBytes), - ) - return - } - if err := json.Unmarshal(respBodyBytes, &responseBody); err != nil { - requestErr = fmt.Errorf("Error reading response as JSON: %w", err) - return - } - - logger.Info("Received response from agent", zap.String("path", path), zap.Any("response", responseBody)) - - if responseBody.SequenceNumber == 0 { - requestErr = errors.New("Got invalid sequence number 0") - return - } - - // Acquire the Agent's lock so we can check the sequence number - agent.lock.Lock() - defer agent.lock.Unlock() - - if agent.lastSeqNumber < responseBody.SequenceNumber { - agent.lastSeqNumber = responseBody.SequenceNumber - } else { - oldSeqNum = true - } - }, - } - - // Try to queue the request - select { - case <-outerContext.Done(): - // Timeout reached - return nil, false, outerContext.Err() - case <-agent.unregistered.Recv(): - return nil, false, context.Canceled - case agent.requestQueue <- req: - // Continue as normal - } - - if start != nil { - start.Send() - } - - // At this point, runHandler is appropriately handling the request, and will call - // sendDone.Send() the attempt at the request is finished. We don't need to worry about handling - // timeouts & unregistered Agents ourselves. - <-recvDone.Recv() - - if requestErr != nil { - return nil, oldSeqNum, requestErr - } else { - return &responseBody.Data, oldSeqNum, nil - } -} - -// EnsureUnregistered unregisters the Agent if it is currently registered, signalling the AgentSet -// to use a new Agent if it isn't already -// -// Returns whether the agent was the current Agent in use. 
-func (a *Agent) EnsureUnregistered(logger *zap.Logger) (wasCurrent bool) { - logger = logger.With(a.zapField()) - - a.lock.Lock() - defer a.lock.Unlock() - - if a.parent == nil { - return - } - - logger.Info("Unregistering agent") - - a.signalUnregistered.Send() - - a.parent.lock.Lock() - defer a.parent.lock.Unlock() - - if _, ok := a.parent.byIDs[a.id]; ok { - delete(a.parent.byIDs, a.id) - } else { - logger.DPanic("Invalid state. Ignoring and continuing.", zap.String("error", "agent is registered but not in parent's agents map")) - } - - if idx := slices.Index(a.parent.byTime, a); idx >= 0 { - a.parent.byTime = slices.Delete(a.parent.byTime, idx, idx+1) - } - - if a.parent.current == a { - wasCurrent = true - a.parent.current = nil - a.parent.tryNewAgent <- struct{}{} - } - - a.parent = nil - - return -} - -// CheckID checks that the Agent's ID matches what's expected -// -// If the agent has already been registered, then a failure in this method will unregister the -// agent. -// -// If the Agent is unregistered before the call to CheckID() completes, the request will be cancelled -// and this method will return context.Canceled. -func (a *Agent) CheckID(logger *zap.Logger, timeout time.Duration) error { - // Quick unregistered check: - select { - case <-a.unregistered.Recv(): - logger.Warn("CheckID called for Agent that is already unregistered (probably *not* a race?)", a.zapField()) - return context.Canceled - default: - } - - body := struct{}{} - id, _, err := doRequest[struct{}, api.AgentIdentification](a, timeout, http.MethodGet, "/id", &body) - - select { - case <-a.unregistered.Recv(): - return context.Canceled - default: - } - - if err != nil { - a.EnsureUnregistered(logger) - return err - } - - if id.AgentID != a.id { - a.EnsureUnregistered(logger) - return fmt.Errorf("Bad agent identification: expected %q but got %q", a.id, id.AgentID) - } - - return nil -} - -// Suspend signals to the Agent that it is not *currently* in use, sending a request to its /suspend -// endpoint -// -// If the Agent is unregistered before the call to Suspend() completes, the request will be -// cancelled and this method will return context.Canceled. -// -// If the request fails, the Agent will be unregistered. -func (a *Agent) Suspend(logger *zap.Logger, timeout time.Duration, handleError func(error)) { - // Quick unregistered check: - select { - case <-a.unregistered.Recv(): - logger.Warn("Suspend called for Agent that is already unregistered (probably *not* a race?)", a.zapField()) - handleError(context.Canceled) - return - default: - } - - body := api.SuspendAgent{ExpectedID: a.id} - id, _, err := doRequest[api.SuspendAgent, api.AgentIdentification]( - a, timeout, http.MethodPost, "/suspend", &body, - ) - - select { - case <-a.unregistered.Recv(): - handleError(context.Canceled) - return - default: - } - - if err != nil { - a.EnsureUnregistered(logger) - handleError(err) - return - } - - if id.AgentID != a.id { - a.EnsureUnregistered(logger) - handleError(fmt.Errorf("Bad agent identification: expected %q but got %q", a.id, id.AgentID)) - return - } - - a.suspended = false -} - -// Resume attempts to restore the Agent as the current one in use, sending a request to its /resume -// endpoint -// -// If the Agent is unregistered before the call to Resume() completes, the request will be cancelled -// and this method will return context.Canceled. -// -// If the request fails, the Agent will be unregistered. 
-func (a *Agent) Resume(logger *zap.Logger, timeout time.Duration) error { - // Quick unregistered check: - select { - case <-a.unregistered.Recv(): - logger.Warn("Resume called for Agent that is already unregistered (probably *not* a race?)", a.zapField()) - return context.Canceled - default: - } - - body := api.ResumeAgent{ExpectedID: a.id} - id, _, err := doRequest[api.ResumeAgent, api.AgentIdentification]( - a, timeout, http.MethodPost, "/resume", &body, - ) - - select { - case <-a.unregistered.Recv(): - return context.Canceled - default: - } - - if err != nil { - a.EnsureUnregistered(logger) - return err - } - - if id.AgentID != a.id { - a.EnsureUnregistered(logger) - return fmt.Errorf("Bad agent identification: expected %q but got %q", a.id, id.AgentID) - } - - return nil -} - -// SpawnRequestUpscale requests that the Agent increase the resource allocation to this VM -// -// This method blocks until the request is picked up by the message queue, and returns without -// waiting for the request to complete (it'll do that on its own). -// -// The timeout applies only once the request is in-flight. -// -// This method MUST NOT be called while holding a.parent.lock; if that happens, it may deadlock. -func (a *Agent) SpawnRequestUpscale(logger *zap.Logger, timeout time.Duration, handleError func(error)) { - // Quick unregistered check - select { - case <-a.unregistered.Recv(): - logger.Warn("RequestUpscale called for Agent that is already unregistered (probably *not* a race?)", a.zapField()) - handleError(context.Canceled) - return - default: - } - - sendDone, recvDone := util.NewSingleSignalPair() - - go func() { - // If we exit early, signal that we're done. - defer sendDone.Send() - - unsetWantsUpscale := func() { - // Unset s.wantsMemoryUpscale if the agent is still current. We want to allow further - // requests to try again. - a.parent.lock.Lock() - defer a.parent.lock.Unlock() - - if a.parent.current == a { - a.parent.wantsMemoryUpscale = false - } - } - - body := api.MoreResourcesRequest{ - MoreResources: api.MoreResources{Cpu: false, Memory: true}, - ExpectedID: a.id, - } - // Pass the signal sender into doRequestWithStartSignal so that the signalling on - // start-of-handling is done for us. - id, _, err := doRequestWithStartSignal[api.MoreResourcesRequest, api.AgentIdentification]( - a, timeout, &sendDone, http.MethodPost, "/try-upscale", &body, - ) - - select { - case <-a.unregistered.Recv(): - handleError(context.Canceled) - return - default: - } - - if err != nil { - unsetWantsUpscale() - a.EnsureUnregistered(logger) - handleError(err) - return - } - - if id.AgentID != a.id { - unsetWantsUpscale() - a.EnsureUnregistered(logger) - handleError(fmt.Errorf("Bad agent identification: expected %q but got %q", a.id, id.AgentID)) - return - } - }() - - <-recvDone.Recv() -} diff --git a/pkg/informant/cgroup.go b/pkg/informant/cgroup.go deleted file mode 100644 index 9b1d8fcde..000000000 --- a/pkg/informant/cgroup.go +++ /dev/null @@ -1,312 +0,0 @@ -package informant - -// Informant-specific usage and logic around cgroups, using CgroupManager. - -import ( - "fmt" - "sync" - "time" - - sysinfo "github.com/elastic/go-sysinfo" - sysinfotypes "github.com/elastic/go-sysinfo/types" - "go.uber.org/zap" - - "github.com/neondatabase/autoscaling/pkg/util" -) - -// CgroupState provides the high-level cgroup handling logic, building upon the low-level plumbing -// provided by CgroupManager. 
-type CgroupState struct { - // updateMemLimitsLock guards access to setting the cgroup's memory.high and memory.max - updateMemLimitsLock sync.Mutex - - mgr *CgroupManager - config CgroupConfig - - upscaleEventsSendr util.CondChannelSender - upscaleEventsRecvr util.CondChannelReceiver - - requestUpscale func(*zap.Logger) -} - -// CgroupConfig provides some configuration options for State cgroup handling -type CgroupConfig struct { - // OOMBufferBytes gives the target difference between the total memory reserved for the cgroup - // and the value of the cgroup's memory.high. - // - // In other words, memory.high + OOMBufferBytes will equal the total memory that the cgroup may - // use (equal to system memory, minus whatever's taken out for the file cache). - OOMBufferBytes uint64 - - // MemoryHighBufferBytes gives the amount of memory, in bytes, below a proposed new value for - // memory.high that the cgroup's memory usage must be for us to downscale - // - // In other words, we can downscale only when: - // - // memory.current + MemoryHighBufferBytes < (proposed) memory.high - // - // TODO: there's some minor issues with this approach -- in particular, that we might have - // memory in use by the kernel's page cache that we're actually ok with getting rid of. - MemoryHighBufferBytes uint64 - - // MaxUpscaleWaitMillis gives the maximum duration, in milliseconds, that we're allowed to pause - // the cgroup for while waiting for the autoscaler-agent to upscale us - MaxUpscaleWaitMillis uint - - // DoNotFreezeMoreOftenThanMillis gives a required minimum time, in milliseconds, that we must - // wait before re-freezing the cgroup while waiting for the autoscaler-agent to upscale us. - DoNotFreezeMoreOftenThanMillis uint - - // MemoryHighIncreaseByBytes gives the amount of memory, in bytes, that we should periodically - // increase memory.high by while waiting for the autoscaler-agent to upscale us. - // - // This exists to avoid the excessive throttling that happens when a cgroup is above its - // memory.high for too long. See more here: - // https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217 - MemoryHighIncreaseByBytes uint64 - - // MemoryHighIncreaseEveryMillis gives the period, in milliseconds, at which we should - // repeatedly increase the value of the cgroup's memory.high while we're waiting on upscaling - // and memory.high is still being hit. - // - // Technically speaking, this actually serves as a rate limit to moderate responding to - // memory.high events, but these are roughly equivalent if the process is still allocating - // memory. - MemoryHighIncreaseEveryMillis uint -} - -// ReceivedUpscale notifies s.upscaleEventsRecvr -// -// Typically, (*AgentSet).ReceivedUpscale() is also called alongside this method. -func (s *CgroupState) ReceivedUpscale() { - s.upscaleEventsSendr.Send() -} - -// mib is a helper function to format a quantity of bytes as a string -func mib(bytes uint64) string { - return fmt.Sprintf("%g MiB", float64(bytes)/float64(1<<20)) -} - -// setMemoryLimits updates the cgroup's value of memory.high and memory.max, according to the memory -// made available to the cgroup. -// -// This method MUST be called while holding s.updateMemLimitsLock. 
-func (s *CgroupState) setMemoryLimits(logger *zap.Logger, availableMemory uint64) error { - newMemHigh := s.config.calculateMemoryHighValue(availableMemory) - - logger.Info("Setting cgroup memory.high", - zap.String("availableMemory", mib(availableMemory)), - zap.String("target", mib(newMemHigh)), - ) - - s.mgr.MemoryHighEvent.Consume() - - memLimits := memoryLimits{ - highBytes: newMemHigh, - maxBytes: availableMemory, - } - if err := s.mgr.SetMemLimits(memLimits); err != nil { - return fmt.Errorf("Error setting cgroup %q memory limits: %w", s.mgr.name, err) - } - - logger.Info("Successfully set cgroup memory limits") - return nil -} - -// handleCgroupSignals is an internal function that handles "memory high" signals from the cgroup -func (s *CgroupState) handleCgroupSignalsLoop(logger *zap.Logger, config CgroupConfig) { - // FIXME: we should have "proper" error handling instead of just panicking. It's hard to - // determine what the correct behavior should be if a cgroup operation fails, though. - - waitingOnUpscale := false - - waitToIncreaseMemoryHigh := time.NewTimer(0) - waitToFreeze := time.NewTimer(0) - - // hey! Before reading this function, have a read through the fields of CgroupConfig - it'll be - // hard to understand the control flow that's going on here without that. - for { - // Wait for a new signal - select { - case err := <-s.mgr.ErrCh: - panic(fmt.Errorf("Error listening for cgroup signals: %w", err)) - case <-s.upscaleEventsRecvr.Recv(): - logger.Info("Received upscale event") - s.mgr.MemoryHighEvent.Consume() - - // note: Don't reset the timers. We still want to be precise about our rate limit, if - // upscale events are happening very frequently. - - case <-s.mgr.MemoryHighEvent.Recv(): - select { - case <-waitToFreeze.C: - var err error - - // Freeze the cgroup and request more memory (maybe duplicate - that'll be handled - // internally so we're not spamming the agent) - waitingOnUpscale, err = s.handleMemoryHighEvent(logger, config) - if err != nil { - panic(fmt.Errorf("Error handling memory high event: %w", err)) - } - waitToFreeze.Reset(time.Duration(config.DoNotFreezeMoreOftenThanMillis) * time.Millisecond) - default: - if !waitingOnUpscale { - logger.Info("Received memory.high event, but too soon to re-freeze. Requesting upscaling") - - // Too soon after the last freeze, but there's currently no unsatisfied - // upscaling requests. 
We should send a new one: - func() { - s.updateMemLimitsLock.Lock() - defer s.updateMemLimitsLock.Unlock() - - // Double-check we haven't already been upscaled (can happen if the agent - // independently decides to upscale us again) - select { - case <-s.upscaleEventsRecvr.Recv(): - logger.Info("No need to request upscaling because we were already upscaled") - return - default: - s.requestUpscale(logger) - } - }() - } else { - // Maybe increase memory.high to reduce throttling: - select { - case <-waitToIncreaseMemoryHigh.C: - logger.Info("Received memory.high event, too soon to re-freeze, but increasing memory.high") - - func() { - s.updateMemLimitsLock.Lock() - defer s.updateMemLimitsLock.Unlock() - - // Double-check we haven't already been upscaled (can happen if the - // agent independently decides to upscale us again) - select { - case <-s.upscaleEventsRecvr.Recv(): - logger.Info("No need to update memory.high because we were already upscaled") - return - default: - s.requestUpscale(logger) - } - - memHigh, err := s.mgr.FetchMemoryHighBytes() - if err != nil { - panic(fmt.Errorf("Error fetching memory.high: %w", err)) - } else if memHigh == nil { - panic(fmt.Errorf("memory.high is unset (equal to 'max') but should have been set to a value already")) - } - - newMemHigh := *memHigh + config.MemoryHighIncreaseByBytes - logger.Info( - "Updating memory.high", - zap.String("current", mib(*memHigh)), - zap.String("target", mib(newMemHigh)), - ) - - if err := s.mgr.SetMemHighBytes(newMemHigh); err != nil { - panic(fmt.Errorf("Error setting memory limits: %w", err)) - } - }() - - waitToIncreaseMemoryHigh.Reset(time.Duration(config.MemoryHighIncreaseEveryMillis) * time.Millisecond) - default: - // Can't do anything. - } - } - } - } - } -} - -// handleMemoryHighEvent performs the "freeze cgroup, request upscale, thaw cgroup" operation, in -// response to a "memory high" event for the cgroup -// -// This method waits on s.agents.UpscaleEvents(), so incorrect behavior will occur if it's called at -// the same time as anything else that waits on the upscale events. For that reason, both this -// function and s.setMemoryHigh() are dispatched from within s.handleCgroupSignalsLoop(). -func (s *CgroupState) handleMemoryHighEvent(logger *zap.Logger, config CgroupConfig) (waitingOnUpscale bool, _ error) { - locked := true - s.updateMemLimitsLock.Lock() - defer func() { - if locked { - s.updateMemLimitsLock.TryLock() - } - }() - - // If we've actually already received an upscale event, then we should ignore this memory.high - // event for the time being: - select { - case <-s.upscaleEventsRecvr.Recv(): - logger.Info("Skipping memory.high event because there was an upscale event") - return false, nil - default: - } - - logger.Info("Received memory high event. Freezing cgroup") - - // Immediately freeze the cgroup before doing anything else. 
- if err := s.mgr.Freeze(); err != nil { - return false, fmt.Errorf("Error freezing cgroup: %w", err) - } - - startTime := time.Now() - - // Start a timer for the maximum time we'll leave the cgroup frozen for: - maxWaitBeforeThaw := time.Millisecond * time.Duration(config.MaxUpscaleWaitMillis) - mustThaw := time.After(maxWaitBeforeThaw) - - logger.Info(fmt.Sprintf("Sending request for immediate upscaling, waiting for at most %s", maxWaitBeforeThaw)) - - s.requestUpscale(logger) - - // Unlock before waiting: - locked = false - s.updateMemLimitsLock.Unlock() - - var upscaled bool - - select { - case <-s.upscaleEventsRecvr.Recv(): - totalWait := time.Since(startTime) - logger.Info("Received notification that upscale occurred", zap.Duration("totalWait", totalWait)) - upscaled = true - case <-mustThaw: - totalWait := time.Since(startTime) - logger.Info("Timed out waiting for upscale", zap.Duration("totalWait", totalWait)) - } - - logger.Info("Thawing cgroup") - if err := s.mgr.Thaw(); err != nil { - return false, fmt.Errorf("Error thawing cgroup: %w", err) - } - - s.mgr.MemoryHighEvent.Consume() - - return !upscaled, nil -} - -// calculateMemoryHighValue calculates the new value for the cgroup's memory.high based on the total -// system memory. -func (c *CgroupConfig) calculateMemoryHighValue(totalSystemMem uint64) uint64 { - return util.SaturatingSub(totalSystemMem, c.OOMBufferBytes) -} - -// getCgroupCurrentMemory fetches the current total memory usgae of the cgroup, in bytes -func (s *CgroupState) getCurrentMemory() (uint64, error) { - return s.mgr.CurrentMemoryUsage() -} - -// getTotalSystemMemory fetches the system's total memory, in bytes -func getTotalSystemMemory() (*sysinfotypes.HostMemoryInfo, error) { - host, err := sysinfo.Host() - if err != nil { - return nil, fmt.Errorf("Error getting host info: %w", err) - } - - mem, err := host.Memory() - if err != nil { - return nil, fmt.Errorf("Error getting host memory info: %w", err) - } - - return mem, nil -} diff --git a/pkg/informant/cgroupmanager.go b/pkg/informant/cgroupmanager.go deleted file mode 100644 index 375d1fd80..000000000 --- a/pkg/informant/cgroupmanager.go +++ /dev/null @@ -1,331 +0,0 @@ -package informant - -// A lightweight wrapper around cgroup2.Manager, with a mix of convenience and extra functionality. - -import ( - "errors" - "fmt" - "os" - "path/filepath" - "strconv" - "strings" - "sync/atomic" - "time" - - cgroups "github.com/containerd/cgroups/v3" - "github.com/containerd/cgroups/v3/cgroup2" - "go.uber.org/zap" - "go.uber.org/zap/zapcore" - - "github.com/neondatabase/autoscaling/pkg/util" -) - -type CgroupManager struct { - MemoryHighEvent util.CondChannelReceiver - ErrCh <-chan error - - name string - manager *cgroup2.Manager -} - -func NewCgroupManager(logger *zap.Logger, groupName string) (*CgroupManager, error) { - mode := cgroups.Mode() - if mode != cgroups.Unified && mode != cgroups.Hybrid { - var modeString string - switch mode { - case cgroups.Unavailable: - modeString = "Unavailable" - case cgroups.Legacy: - modeString = "cgroups v1 ONLY" - default: - panic(fmt.Errorf("unexpected cgroups mode value %d", mode)) - } - - return nil, fmt.Errorf("cgroups v2 are not enabled, mode = %q", modeString) - } - - // note: cgroup2.Load expects the cgroup "path" to start with '/', rooted at "/sys/fs/cgroup" - // - // The final path of the cgroup will be "/sys/fs/cgroup" + , where is what we give - // cgroup2.Load(). 
- manager, err := cgroup2.Load(fmt.Sprint("/", groupName)) - if err != nil { - return nil, fmt.Errorf("Error loading cgroup: %w", err) - } - sendEvent, recvEvent := util.NewCondChannelPair() - - highEventCount := &atomic.Uint64{} - errCh := make(chan error, 1) - - cgm := &CgroupManager{ - MemoryHighEvent: recvEvent, - ErrCh: errCh, - name: groupName, - manager: manager, - } - - // Long-running handler task for memory events - go func() { - // FIXME: make this configurable - minWaitDuration := time.Second - var minWait <-chan time.Time - - // Restart the event loop whenever it gets closed. - // - // This can happen, for instance, when the last task in the cgroup ends. - for { - if minWait != nil { - select { - case <-minWait: - default: - logger.Warn( - "Respecting minimum wait delay before restarting memory.events listener", - zap.Duration("delay", minWaitDuration), - ) - <-minWait - } - logger.Info("Restarting memory.events listener") - } - - minWait = time.After(minWaitDuration) - - // FIXME: There's currently no way to stop the goroutine spawned by EventChan, so it - // doesn't yet make sense to provide a way to cancel the goroutine to handle its events. - // Eventually, we should either patch containerd/cgroups or write our own implementation - // here. - memEvents, eventErrCh := manager.EventChan() - - select { - case event := <-memEvents: - // This is *kind of* on the hot path — we actually do want this to be pretty quick. - // So it makes reasonable sense to use zap.Object instead of zap.Any, event though - // there's some boilerplate required for it. - logger.Info("New memory.events", zap.Object("events", marshalMemoryEvents(event))) - highCount := event.High - oldHighCount := util.AtomicMax(highEventCount, highCount) - - if highCount > oldHighCount { - sendEvent.Send() - } - case err, ok := <-eventErrCh: - if err == nil && !ok { - errCh <- errors.New("Memory event channel closed without error") - } else { - errCh <- fmt.Errorf("Error while waiting for memory events: %w", err) - } - return - } - } - }() - - // Fetch the current "memory high" count - current, err := parseMemoryEvents(logger, groupName) - if err != nil { - return nil, fmt.Errorf("Error getting current memory events: %w", err) - } - - logger.Info("Initial memory.events", zap.Object("events", marshalMemoryEvents(*current))) - - util.AtomicMax(highEventCount, current.High) - recvEvent.Consume() // Clear events - - return cgm, nil -} - -func marshalMemoryEvents(events cgroup2.Event) zapcore.ObjectMarshalerFunc { - return zapcore.ObjectMarshalerFunc(func(enc zapcore.ObjectEncoder) error { - // NB: we're using lower snake-case names that are present in the actual - // memory.events file, instead of the field names from cgroup2.Event - enc.AddUint64("low", events.Low) - enc.AddUint64("high", events.High) - enc.AddUint64("max", events.Max) - enc.AddUint64("oom", events.OOM) - enc.AddUint64("oom_kill", events.OOMKill) - return nil - }) -} - -// TODO: no way to do this with github.com/containerd/cgroups ? Seems like that should be -// exposed to the user... We *can* just parse it directly, but it's a bit annoying. 
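The event loop above only signals the handler when the `high` counter in `memory.events` actually increases, using `util.AtomicMax` to keep a monotonic record of the largest count seen. The helper below shows one plausible way such an atomic-max can be written (a compare-and-swap loop); it is a sketch, not the repository's `util.AtomicMax`.

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// atomicMax stores max(current, candidate) and returns the previous value, so the
// caller can tell whether the counter actually increased.
func atomicMax(v *atomic.Uint64, candidate uint64) (old uint64) {
	for {
		old = v.Load()
		if candidate <= old {
			return old
		}
		if v.CompareAndSwap(old, candidate) {
			return old
		}
	}
}

func main() {
	var highEventCount atomic.Uint64

	for _, observed := range []uint64{3, 3, 5} {
		old := atomicMax(&highEventCount, observed)
		if observed > old {
			fmt.Println("memory.high count rose to", observed, "- signal the handler")
		} else {
			fmt.Println("no new memory.high events (count still", old, ")")
		}
	}
}
```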
-func parseMemoryEvents(logger *zap.Logger, groupName string) (*cgroup2.Event, error) { - path := cgroupPath(groupName, "memory.events") - content, err := os.ReadFile(path) - if err != nil { - return nil, fmt.Errorf("Error reading file at %q: %w", path, err) - } - - // note: When we read the memory.events file, it tends to look something like: - // - // low 1 - // high 5 - // max 3 - // oom 1 - // oom_kill 0 - // - // (numbers are made up) - // - // This map represents the field names we know about. Newer versions of the Linux kernel *might* - // add new fields, but that'll probably happen slowly, so we emit warnings only when the field - // name isn't recognized. For each entry in the map: v is the value of the field, set is true if - // we've already parsed the value, and required is true if we need the value in order to build a - // cgroup2.Event. - valueMap := map[string]struct { - v uint64 - set bool - required bool - }{ - "low": {0, false, true}, - "high": {0, false, true}, - "max": {0, false, true}, - "oom": {0, false, true}, - "oom_kill": {0, false, true}, - "oom_group_kill": {0, false, false}, // Added in 5.17 - } - - lines := strings.Split(strings.TrimSpace(string(content)), "\n") - for i, line := range lines { - fields := strings.Fields(line) - if len(fields) != 2 { - return nil, fmt.Errorf( - "Line %d of %q is not expected format: has %d fields", i, path, len(fields), - ) - } - - name := fields[0] - value, err := strconv.ParseUint(fields[1], 10, 64) - if err != nil { - return nil, fmt.Errorf( - "Error parsing field on line %d of %q as integer: %w", i, path, err, - ) - } - - pair, ok := valueMap[name] - if !ok { - logger.Warn("Unrecognized memory.events field (is the kernel new?)", zap.String("field", name)) - continue - } else if pair.set { - return nil, fmt.Errorf("Duplicate field %q", name) - } - pair.v = value - pair.set = true - valueMap[name] = pair - } - - var unset []string - - // Check if there's any unset fields - for name, pair := range valueMap { - if !pair.set && pair.required { - unset = append(unset, name) - } - } - - if len(unset) != 0 { - return nil, fmt.Errorf("Some required fields not provided: %+v", unset) - } - - return &cgroup2.Event{ - Low: valueMap["low"].v, - High: valueMap["high"].v, - Max: valueMap["max"].v, - OOM: valueMap["oom"].v, - OOMKill: valueMap["oom_kill"].v, - }, nil -} - -// TODO: Open a PR in github.com/containerd/cgroups to expose this publicly. This function is -// *basically* just copied from there. -func fetchState(groupName string) (cgroup2.State, error) { - path := cgroupPath(groupName, "cgroup.freeze") - content, err := os.ReadFile(path) - if err != nil { - return cgroup2.Unknown, fmt.Errorf("Error reading file at %q: %w", path, err) - } - switch strings.TrimSpace(string(content)) { - case "1": - return cgroup2.Frozen, nil - case "0": - return cgroup2.Thawed, nil - default: - return cgroup2.Unknown, errors.New("Unexpected file content") - } -} - -// TODO: not great that we're implementing this function ourselves. It's required for fetchState and -// parseMemoryEvents, which we'd also like to get rid of. -func cgroupPath(groupName string, file string) string { - // note: it's ok to use slashes, because this can only run on linux anyways. - return filepath.Join("/sys/fs/cgroup", groupName, file) //nolint:gocritic // see comment above. -} - -type memoryLimits struct { - highBytes uint64 - maxBytes uint64 -} - -// SetMemLimits sets the cgroup's memory.high and memory.max to the values provided by the -// memoryLimits. 
-func (c *CgroupManager) SetMemLimits(limits memoryLimits) error { - // convert uint64 -> int64 so we can produce pointers - hb := int64(limits.highBytes) - mb := int64(limits.maxBytes) - return c.manager.Update(&cgroup2.Resources{ - Memory: &cgroup2.Memory{High: &hb, Max: &mb}, - }) -} - -func (c *CgroupManager) SetMemHighBytes(bytes uint64) error { - high := int64(bytes) - return c.manager.Update(&cgroup2.Resources{ - Memory: &cgroup2.Memory{ - High: &high, - }, - }) -} - -func (c *CgroupManager) FetchMemoryHighBytes() (*uint64, error) { - path := cgroupPath(c.name, "memory.high") - content, err := os.ReadFile(path) - if err != nil { - return nil, fmt.Errorf("Error reading file at %q: %w", path, err) - } - - stringContent := strings.TrimSpace(string(content)) - if stringContent == "max" { - return nil, nil - } - - amount, err := strconv.ParseUint(stringContent, 10, 64) - if err != nil { - return nil, fmt.Errorf("Error parsing as uint64: %w", err) - } - return &amount, nil -} - -// FetchState returns a cgroup2.State indicating whether the cgroup is currently frozen -func (c *CgroupManager) FetchState() (cgroup2.State, error) { - return fetchState(c.name) -} - -// CurrentMemoryUsage returns the value at memory.current -- the cgroup's current memory usage. -func (c *CgroupManager) CurrentMemoryUsage() (uint64, error) { - path := cgroupPath(c.name, "memory.current") - content, err := os.ReadFile(path) - if err != nil { - return 0, fmt.Errorf("Error reading file at %q: %w", path, err) - } - - amount, err := strconv.ParseUint(strings.TrimSpace(string(content)), 10, 64) - if err != nil { - return 0, fmt.Errorf("Error parsing as uint64: %w", err) - } - return amount, nil -} - -func (c *CgroupManager) Freeze() error { - return c.manager.Freeze() -} - -func (c *CgroupManager) Thaw() error { - return c.manager.Thaw() -} diff --git a/pkg/informant/consts.go b/pkg/informant/consts.go deleted file mode 100644 index 76045d232..000000000 --- a/pkg/informant/consts.go +++ /dev/null @@ -1,48 +0,0 @@ -package informant - -// Assorted constants that aren't worth having a configuration file for - -import ( - "time" -) - -const ( - PrometheusPort uint16 = 9100 - - CheckDeadlockDelay time.Duration = 1 * time.Second - CheckDeadlockTimeout time.Duration = 250 * time.Millisecond - - AgentBackgroundCheckDelay time.Duration = 10 * time.Second - AgentBackgroundCheckTimeout time.Duration = 250 * time.Millisecond - - AgentResumeTimeout time.Duration = 100 * time.Millisecond - AgentSuspendTimeout time.Duration = 5 * time.Second // may take a while; it /suspend intentionally waits - AgentUpscaleTimeout time.Duration = 400 * time.Millisecond // does not include waiting for /upscale response -) - -var ( - // DefaultStateConfig is the default state passed to NewState - DefaultStateConfig StateConfig = StateConfig{ - SysBufferBytes: 100 * (1 << 20), // 100 MiB - } - - // DefaultCgroupConfig is the default CgroupConfig used for cgroup interaction logic - DefaultCgroupConfig CgroupConfig = CgroupConfig{ - OOMBufferBytes: 100 * (1 << 20), // 100 MiB - MemoryHighBufferBytes: 100 * (1 << 20), // 100 MiB - // while waiting for upscale, don't freeze for more than 20ms every 1s - MaxUpscaleWaitMillis: 20, // 20ms - DoNotFreezeMoreOftenThanMillis: 1000, // 1s - // while waiting for upscale, increase memory.high by 10 MiB every 25ms - MemoryHighIncreaseByBytes: 10 * (1 << 20), // 10 MiB - MemoryHighIncreaseEveryMillis: 25, // 25ms - } - - // DefaultFileCacheConfig is the default FileCacheConfig used for managing the file cache - 
DefaultFileCacheConfig FileCacheConfig = FileCacheConfig{ - InMemory: true, - ResourceMultiplier: 0.75, // 75% - MinRemainingAfterCache: 640 * (1 << 20), // 640 MiB ; 640 = 512 + 128 - SpreadFactor: 0.1, // ensure any increase in file cache size is split 90-10 with 10% to other memory - } -) diff --git a/pkg/informant/endpoints.go b/pkg/informant/endpoints.go deleted file mode 100644 index a0ddd6ffd..000000000 --- a/pkg/informant/endpoints.go +++ /dev/null @@ -1,493 +0,0 @@ -package informant - -// This file contains the high-level handlers for various HTTP endpoints - -import ( - "context" - "errors" - "fmt" - "strings" - "sync" - "time" - - "go.uber.org/zap" - - "github.com/neondatabase/autoscaling/pkg/api" - "github.com/neondatabase/autoscaling/pkg/util" -) - -// State is the global state of the informant -type State struct { - config StateConfig - agents *AgentSet - cgroup *CgroupState - fileCache *FileCacheState - - // memReservedForFileCache stores the amount of memory that's currently reserved for the file - // cache. - // - // This field is mostly used during initialization, where it allows us to pass state from the - // file cache's startup hook to the cgroup's hook. - // - // There's definitely better ways of doing this, but the solution we have will work for now. - memReservedForFileCache uint64 -} - -type StateConfig struct { - // SysBufferBytes gives the estimated amount of memory, in bytes, that the kernel uses before - // handing out the rest to userspace. This value is the estimated difference between the - // *actual* physical memory and the amount reported by `grep MemTotal /proc/meminfo`. - // - // For more information, refer to `man 5 proc`, which defines MemTotal as "Total usable RAM - // (i.e., physical RAM minus a few reserved bits and the kernel binary code)". - // - // We only use SysBufferBytes when calculating the system memory from the *external* memory - // size, rather than the self-reported memory size, according to the kernel. - // - // TODO: this field is only necessary while we still have to trust the autoscaler-agent's - // upscale resource amounts (because we might not *actually* have been upscaled yet). This field - // should be removed once we have a better solution there. - SysBufferBytes uint64 -} - -// NewStateOpts are individual options provided to NewState -type NewStateOpts struct { - kind newStateOptKind - setFields func(*State) - post func(_ *zap.Logger, s *State, memTotal uint64) error -} - -type newStateOptKind int - -const ( - optCgroup newStateOptKind = iota - optFileCache -) - -// NewState instantiates a new State object, starting whatever background processes might be -// required -// -// Optional configuration may be provided by NewStateOpts - see WithCgroup and -// WithPostgresFileCache. -func NewState(logger *zap.Logger, agents *AgentSet, config StateConfig, opts ...NewStateOpts) (*State, error) { - if config.SysBufferBytes == 0 { - panic("invalid StateConfig: SysBufferBytes cannot be zero") - } - - s := &State{ - config: config, - agents: agents, - cgroup: nil, - fileCache: nil, - memReservedForFileCache: 0, - } - for _, opt := range opts { - opt.setFields(s) - } - - memInfo, err := getTotalSystemMemory() - if err != nil { - return nil, fmt.Errorf("Error getting system meminfo: %w", err) - } - - // We need to process file cache initialization before cgroup initialization, so that the memory - // allocated to the file cache is appropriately taken into account when we decide the cgroup's - // memory limits. 
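The ordering matters because the cgroup's limits are computed from whatever memory remains after the file cache takes its share. Here is a worked example using the default constants quoted just above; the 4 GiB total is a made-up figure for illustration only.

```go
package main

import "fmt"

const mib = 1 << 20

// Defaults from the deleted informant package, reproduced for a worked example.
const (
	oomBufferBytes         = 100 * mib // DefaultCgroupConfig.OOMBufferBytes
	resourceMultiplier     = 0.75      // DefaultFileCacheConfig.ResourceMultiplier
	minRemainingAfterCache = 640 * mib // DefaultFileCacheConfig.MinRemainingAfterCache
	spreadFactor           = 0.1       // DefaultFileCacheConfig.SpreadFactor
)

// calculateCacheSize mirrors FileCacheConfig.CalculateCacheSize as quoted later in
// this diff: take the smaller of the "spread" and "multiplier" lines, rounded down
// to a whole mebibyte.
func calculateCacheSize(total uint64) uint64 {
	if total <= minRemainingAfterCache {
		return 0
	}
	available := total - minRemainingAfterCache
	sizeFromSpread := uint64(float64(available) / (1.0 + spreadFactor))
	sizeFromNormal := uint64(float64(total) * resourceMultiplier)
	size := sizeFromSpread
	if sizeFromNormal < size {
		size = sizeFromNormal
	}
	return size / mib * mib
}

func main() {
	var total uint64 = 4096 * mib // illustrative 4 GiB of system memory

	fileCache := calculateCacheSize(total) // 3072 MiB with the defaults above
	cgroupAvailable := total - fileCache   // 1024 MiB left for the cgroup
	// The real code subtracts the OOM buffer with a saturating subtraction.
	memoryHigh := cgroupAvailable - oomBufferBytes // 924 MiB

	fmt.Printf("file cache: %d MiB, cgroup memory.high: %d MiB\n",
		fileCache/mib, memoryHigh/mib)
}
```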
- // - // TODO: this should be made cleaner, but it's mostly ok when there's only two options. - for _, kind := range []newStateOptKind{optFileCache, optCgroup} { - for _, opt := range opts { - if opt.kind == kind { - if err := opt.post(logger, s, memInfo.Total); err != nil { - return nil, err - } - } - } - } - - return s, nil -} - -// WithCgroup creates a NewStateOpts that sets its CgroupHandler -// -// This function will panic if the provided CgroupConfig is invalid. -func WithCgroup(cgm *CgroupManager, config CgroupConfig) NewStateOpts { - if config.OOMBufferBytes == 0 { - panic("invalid CgroupConfig: OOMBufferBytes == 0") - } else if config.MaxUpscaleWaitMillis == 0 { - panic("invalid CgroupConfig: MaxUpscaleWaitMillis == 0") - } - - return NewStateOpts{ - kind: optCgroup, - setFields: func(s *State) { - if s.cgroup != nil { - panic("WithCgroupHandler option provided more than once") - } - - upscaleEventsSendr, upscaleEventsRecvr := util.NewCondChannelPair() - s.cgroup = &CgroupState{ - updateMemLimitsLock: sync.Mutex{}, - mgr: cgm, - config: config, - upscaleEventsSendr: upscaleEventsSendr, - upscaleEventsRecvr: upscaleEventsRecvr, - requestUpscale: func(l *zap.Logger) { s.agents.RequestUpscale(l) }, - } - }, - post: func(logger *zap.Logger, s *State, memTotal uint64) error { - logger = logger.With(zap.String("cgroup", s.cgroup.mgr.name)) - - available := memTotal - s.memReservedForFileCache - - // FIXME: This is technically racy across restarts. The sequence would be: - // 1. Respond "ok" to a downscale request - // 2. Restart - // 3. Read system memory - // 4. Get downscaled (as approved earlier) - // A potential way to fix this would be writing to a file to record approved downscale - // operations. - if err := s.cgroup.setMemoryLimits(logger, available); err != nil { - return fmt.Errorf("Error setting initial cgroup memory limits: %w", err) - } - go s.cgroup.handleCgroupSignalsLoop(logger.Named("signal-handler"), config) - return nil - }, - } -} - -// WithPostgresFileCache creates a NewStateOpts that enables connections to the postgres file cache -func WithPostgresFileCache(connStr string, config FileCacheConfig) NewStateOpts { - if err := config.Validate(); err != nil { - panic(fmt.Errorf("invalid FileCacheConfig: %w", err)) - } - - return NewStateOpts{ - kind: optFileCache, - setFields: func(s *State) { - if s.fileCache != nil { - panic("WithPostgresFileCache option provided more than once") - } - - s.fileCache = &FileCacheState{ - connStr: connStr, - config: config, - } - }, - post: func(logger *zap.Logger, s *State, memTotal uint64) error { - if !config.InMemory { - panic("file cache not in-memory unimplemented") - } - - // FIXME: make the timeout configurable - ctx, cancel := context.WithTimeout(context.TODO(), time.Second) - defer cancel() - - // Check that we have permissions to set the file cache's size. - size, err := s.fileCache.GetFileCacheSize(ctx) - if err != nil { - return fmt.Errorf("Error getting file cache size: %w", err) - } - - newSize := s.fileCache.config.CalculateCacheSize(memTotal) - logger.Info("Setting initial file cache size", zap.String("current", mib(size)), zap.String("target", mib(newSize))) - - // note: Even if newSize == size, we want to explicitly set it *anwyays*, just to verify - // that we have the necessary permissions to do so. 
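For reference, the file-cache round-trip that this startup hook relies on boils down to three statements against Postgres: read the limit via `pg_size_bytes(current_setting(...))`, write it back in whole megabytes with `ALTER SYSTEM`, and reload the configuration. A condensed sketch follows, with a placeholder connection string.

```go
package main

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"time"

	_ "github.com/lib/pq"
)

func main() {
	// Placeholder connection string, not something defined by this repository.
	db, err := sql.Open("postgres", "host=localhost dbname=postgres sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	// Read the current limit, converted to bytes by pg_size_bytes.
	var sizeInBytes uint64
	err = db.QueryRowContext(ctx,
		`SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit'));`,
	).Scan(&sizeInBytes)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("current file cache limit (bytes):", sizeInBytes)

	// Setting the same value back still verifies we have ALTER SYSTEM permissions.
	newSizeMB := sizeInBytes / (1 << 20)
	if _, err := db.ExecContext(ctx,
		fmt.Sprintf(`ALTER SYSTEM SET neon.file_cache_size_limit = %d;`, newSizeMB),
	); err != nil {
		log.Fatal(err)
	}
	// pg_reload_conf is required for the setting change to take effect.
	if _, err := db.ExecContext(ctx, `SELECT pg_reload_conf();`); err != nil {
		log.Fatal(err)
	}
}
```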
- - actualSize, err := s.fileCache.SetFileCacheSize(ctx, logger, newSize) - if err != nil { - return fmt.Errorf("Error setting file cache size: %w", err) - } - s.memReservedForFileCache = actualSize - - return nil - }, - } -} - -// RegisterAgent registers a new or updated autoscaler-agent -// -// Returns: body (if successful), status code, error (if unsuccessful) -func (s *State) RegisterAgent(ctx context.Context, logger *zap.Logger, info *api.AgentDesc) (*api.InformantDesc, int, error) { - logger = logger.With(agentZapField(info.AgentID, info.ServerAddr)) - - protoVersion, status, err := s.agents.RegisterNewAgent(logger, info) - if err != nil { - return nil, status, err - } - - desc := api.InformantDesc{ - ProtoVersion: protoVersion, - MetricsMethod: api.InformantMetricsMethod{ - Prometheus: &api.MetricsMethodPrometheus{Port: PrometheusPort}, - }, - } - - return &desc, 200, nil -} - -// HealthCheck is a dummy endpoint that allows the autoscaler-agent to check that (a) the informant -// is up and running, and (b) the agent is still registered. -// -// Returns: body (if successful), status code, error (if unsuccessful) -func (s *State) HealthCheck(ctx context.Context, logger *zap.Logger, info *api.AgentIdentification) (*api.InformantHealthCheckResp, int, error) { - agent, ok := s.agents.Get(info.AgentID) - if !ok { - return nil, 404, fmt.Errorf("No Agent with ID %s registered", info.AgentID) - } else if !agent.protoVersion.AllowsHealthCheck() { - return nil, 400, fmt.Errorf("health checks are not supported in protocol version %v", agent.protoVersion) - } - - return &api.InformantHealthCheckResp{}, 200, nil -} - -// TryDownscale tries to downscale the VM's current resource usage, returning whether the proposed -// amount is ok -// -// Returns: body (if successful), status code and error (if unsuccessful) -func (s *State) TryDownscale(ctx context.Context, logger *zap.Logger, target *api.AgentResourceMessage) (*api.DownscaleResult, int, error) { - currentId := s.agents.CurrentIdStr() - incomingId := target.Data.Id.AgentID.String() - - // First verify agent's authenticity before doing anything. - // Note: if the current agent is nil, its id string will be "", which - // does not match any valid UUID - if incomingId != currentId { - return nil, 400, fmt.Errorf("Agent ID %s is not the active Agent", incomingId) - } - - // Helper functions for abbreviating returns. - resultFromStatus := func(ok bool, status string) (*api.DownscaleResult, int, error) { - return &api.DownscaleResult{Ok: ok, Status: status}, 200, nil - } - internalError := func(err error) (*api.DownscaleResult, int, error) { - logger.Error("Internal error handling downscale request", zap.Error(err)) - return nil, 500, errors.New("Internal error") - } - - // If we aren't interacting with something that should be adjusted, then we don't need to do anything. - if s.cgroup == nil && s.fileCache == nil { - logger.Info("No action needed for downscale (no cgroup or file cache enabled)") - return resultFromStatus(true, "No action taken (no cgroup or file cache enabled)") - } - - requestedMem := uint64(target.Data.Memory.Value()) - usableSystemMemory := util.SaturatingSub(requestedMem, s.config.SysBufferBytes) - - // Get the file cache's expected contribution to the memory usage - var expectedFileCacheMemUsage uint64 - if s.fileCache != nil && s.fileCache.config.InMemory { - expectedFileCacheMemUsage = s.fileCache.config.CalculateCacheSize(usableSystemMemory) - } - - mib := float64(1 << 20) // 1 MiB = 2^20 bytes. 
We'll use this for pretty-printing. - - // Check whether this downscaling would be ok for the cgroup. - // - // Also, lock changing the cgroup between the initial calculations and later using them. - var newCgroupMemHigh uint64 - if s.cgroup != nil { - s.cgroup.updateMemLimitsLock.Lock() - defer s.cgroup.updateMemLimitsLock.Unlock() - - newCgroupMemHigh = s.cgroup.config.calculateMemoryHighValue(usableSystemMemory - expectedFileCacheMemUsage) - - current, err := s.cgroup.getCurrentMemory() - if err != nil { - return internalError(fmt.Errorf("Error fetching getting cgroup memory: %w", err)) - } - - // For an explanation, refer to the documentation of CgroupConfig.MemoryHighBufferBytes - // - // TODO: this should be a method on (*CgroupConfig). - if newCgroupMemHigh < current+s.cgroup.config.MemoryHighBufferBytes { - verdict := "Calculated memory.high too low" - status := fmt.Sprintf( - "%s: %g MiB (new high) < %g MiB (current usage) + %g MiB (buffer)", - verdict, - float64(newCgroupMemHigh)/mib, float64(current)/mib, - float64(s.cgroup.config.MemoryHighBufferBytes)/mib, - ) - - return resultFromStatus(false, status) - } - } - - var statusParts []string - - var fileCacheMemUsage uint64 - - // The downscaling has been approved. Downscale the file cache, then the cgroup. - if s.fileCache != nil && s.fileCache.config.InMemory { - if !s.fileCache.config.InMemory { - panic("file cache not in-memory unimplemented") - } - - // FIXME: make the timeout configurablek - dbCtx, cancel := context.WithTimeout(ctx, time.Second) // for talking to the DB - defer cancel() - - actualUsage, err := s.fileCache.SetFileCacheSize(dbCtx, logger, expectedFileCacheMemUsage) - if err != nil { - return internalError(fmt.Errorf("Error setting file cache size: %w", err)) - } - - fileCacheMemUsage = actualUsage - status := fmt.Sprintf("Set file cache size to %g MiB", float64(actualUsage)/mib) - statusParts = append(statusParts, status) - } - - if s.cgroup != nil { - availableMemory := usableSystemMemory - fileCacheMemUsage - - if fileCacheMemUsage != expectedFileCacheMemUsage { - newCgroupMemHigh = s.cgroup.config.calculateMemoryHighValue(availableMemory) - } - - memLimits := memoryLimits{ - highBytes: newCgroupMemHigh, - maxBytes: availableMemory, - } - - // TODO: see similar note above. We shouldn't call methods on s.cgroup.mgr from here. - if err := s.cgroup.mgr.SetMemLimits(memLimits); err != nil { - return internalError(fmt.Errorf("Error setting cgroup memory.high: %w", err)) - } - - status := fmt.Sprintf( - "Set cgroup memory.high to %g MiB, of new max %g MiB", - float64(newCgroupMemHigh)/mib, float64(availableMemory)/mib, - ) - statusParts = append(statusParts, status) - } - - return resultFromStatus(true, strings.Join(statusParts, "; ")) -} - -// NotifyUpscale signals that the VM's resource usage has been increased to the new amount -// -// Returns: body (if successful), status code and error (if unsuccessful) -func (s *State) NotifyUpscale( - ctx context.Context, - logger *zap.Logger, - newResources *api.AgentResourceMessage, -) (*struct{}, int, error) { - // FIXME: we shouldn't just trust what the agent says - // - // Because of race conditions like in , - // it's possible for us to receive a notification on /upscale *before* NeonVM actually adds the - // memory. - // - // So until the race condition described in #23 is fixed, we have to just trust that the agent - // is telling the truth, *especially because it might not be*. 
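To make the `TryDownscale` cgroup check above concrete, here is a worked example for a VM with no file cache enabled, using the 100 MiB defaults for `SysBufferBytes`, `OOMBufferBytes`, and `MemoryHighBufferBytes`; the requested size and current usage are invented numbers.

```go
package main

import "fmt"

const mib = 1 << 20

// 100 MiB defaults from the deleted informant package.
const (
	sysBufferBytes        = 100 * mib
	oomBufferBytes        = 100 * mib
	memoryHighBufferBytes = 100 * mib
)

// approveDownscale mirrors the cgroup check in TryDownscale above for a VM with no
// file cache: the proposed memory.high must leave MemoryHighBufferBytes of headroom
// above current usage, otherwise the downscale is refused. (The real code uses
// util.SaturatingSub for the subtractions.)
func approveDownscale(requestedMem, currentUsage uint64) (bool, string) {
	usable := requestedMem - sysBufferBytes
	newMemHigh := usable - oomBufferBytes
	if newMemHigh < currentUsage+memoryHighBufferBytes {
		return false, fmt.Sprintf(
			"Calculated memory.high too low: %d MiB (new high) < %d MiB (current) + %d MiB (buffer)",
			newMemHigh/mib, currentUsage/mib, memoryHighBufferBytes/mib,
		)
	}
	return true, fmt.Sprintf("ok: new memory.high = %d MiB", newMemHigh/mib)
}

func main() {
	requested := uint64(2048 * mib) // downscale target of 2 GiB

	for _, usage := range []uint64{1700 * mib, 1800 * mib} {
		ok, status := approveDownscale(requested, usage)
		fmt.Printf("current usage %d MiB -> approve=%v (%s)\n", usage/mib, ok, status)
	}
}
```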
- - currentId := s.agents.CurrentIdStr() - incomingId := newResources.Data.Id.AgentID.String() - - // First verify agent's authenticity before doing anything. - // Note: if the current agent is nil, its id string will be "", which - // does not match any valid UUID - if incomingId != currentId { - return nil, 400, fmt.Errorf("Agent ID %s is not the active Agent", incomingId) - } - - // Helper function for abbreviating returns. - internalError := func(err error) (*struct{}, int, error) { - logger.Error("Error handling upscale request", zap.Error(err)) - return nil, 500, errors.New("Internal error") - } - - if s.cgroup == nil && s.fileCache == nil { - logger.Info("No action needed for upscale (no cgroup or file cache enabled)") - return &struct{}{}, 200, nil - } - - newMem := uint64(newResources.Data.Memory.Value()) - usableSystemMemory := util.SaturatingSub(newMem, s.config.SysBufferBytes) - - if s.cgroup != nil { - s.cgroup.updateMemLimitsLock.Lock() - defer s.cgroup.updateMemLimitsLock.Unlock() - } - - s.agents.ReceivedUpscale() - - // Get the file cache's expected contribution to the memory usage - var fileCacheMemUsage uint64 - if s.fileCache != nil { - logger := logger.With(zap.String("fileCacheConnstr", s.fileCache.connStr)) - - if !s.fileCache.config.InMemory { - panic("file cache not in-memory unimplemented") - } - - // FIXME: make the timeout configurable - dbCtx, cancel := context.WithTimeout(ctx, time.Second) // for talking to the DB - defer cancel() - - // Update the size of the file cache - expectedUsage := s.fileCache.config.CalculateCacheSize(usableSystemMemory) - - logger.Info("Updating file cache size", zap.String("target", mib(expectedUsage)), zap.String("totalMemory", mib(newMem))) - - actualUsage, err := s.fileCache.SetFileCacheSize(dbCtx, logger, expectedUsage) - if err != nil { - return internalError(fmt.Errorf("Error setting file cache size: %w", err)) - } - - if actualUsage != expectedUsage { - logger.Warn( - "File cache size was set to a different value than we wanted", - zap.String("target", mib(expectedUsage)), - zap.String("actual", mib(actualUsage)), - ) - } - - fileCacheMemUsage = actualUsage - } - - if s.cgroup != nil { - logger := logger.With(zap.String("cgroup", s.cgroup.mgr.name)) - - availableMemory := usableSystemMemory - fileCacheMemUsage - - newMemHigh := s.cgroup.config.calculateMemoryHighValue(availableMemory) - logger.Info("Updating cgroup memory.high", zap.String("target", mib(newMemHigh)), zap.String("totalMemory", mib(newMem))) - - memLimits := memoryLimits{ - highBytes: newMemHigh, - maxBytes: availableMemory, - } - - if err := s.cgroup.mgr.SetMemLimits(memLimits); err != nil { - return internalError(fmt.Errorf("Error setting cgroup memory.high: %w", err)) - } - - s.cgroup.upscaleEventsSendr.Send() - } - - return &struct{}{}, 200, nil -} - -// UnregisterAgent unregisters the autoscaler-agent given by info, if it is currently registered -// -// If a different autoscaler-agent is currently registered, this method will do nothing. 
-// -// Returns: body (if successful), status code and error (if unsuccessful) -func (s *State) UnregisterAgent(ctx context.Context, logger *zap.Logger, info *api.AgentDesc) (*api.UnregisterAgent, int, error) { - agent, ok := s.agents.Get(info.AgentID) - if !ok { - return nil, 404, fmt.Errorf("No agent with ID %q", info.AgentID) - } else if agent.serverAddr != info.ServerAddr { - // On our side, log the address we're expecting, but don't give that to the client - logger.Warn(fmt.Sprintf( - "Agent serverAddr is incorrect, got %q but expected %q", - info.ServerAddr, agent.serverAddr, - )) - return nil, 400, fmt.Errorf("Agent serverAddr is incorrect, got %q", info.ServerAddr) - } - - wasActive := agent.EnsureUnregistered(logger) - return &api.UnregisterAgent{WasActive: wasActive}, 200, nil -} diff --git a/pkg/informant/filecache.go b/pkg/informant/filecache.go deleted file mode 100644 index 7653d2eb1..000000000 --- a/pkg/informant/filecache.go +++ /dev/null @@ -1,187 +0,0 @@ -package informant - -// Integration with Neon's postgres local file cache - -import ( - "context" - "database/sql" - "fmt" - - _ "github.com/lib/pq" - "go.uber.org/zap" - - "github.com/neondatabase/autoscaling/pkg/util" -) - -type FileCacheState struct { - connStr string - config FileCacheConfig -} - -type FileCacheConfig struct { - // InMemory indicates whether the file cache is *actually* stored in memory (e.g. by writing to - // a tmpfs or shmem file). If true, the size of the file cache will be counted against the - // memory available for the cgroup. - InMemory bool - - // ResourceMultiplier gives the size of the file cache, in terms of the size of the resource it - // consumes (currently: only memory) - // - // For example, setting ResourceMultiplier = 0.75 gives the cache a target size of 75% of total - // resources. - // - // This value must be strictly between 0 and 1. - ResourceMultiplier float64 - - // MinRemainingAfterCache gives the required minimum amount of memory, in bytes, that must - // remain available after subtracting the file cache. - // - // This value must be non-zero. - MinRemainingAfterCache uint64 - - // SpreadFactor controls the rate of increase in the file cache's size as it grows from zero - // (when total resources equals MinRemainingAfterCache) to the desired size based on - // ResourceMultiplier. - // - // A SpreadFactor of zero means that all additional resources will go to the cache until it - // reaches the desired size. Setting SpreadFactor to N roughly means "for every 1 byte added to - // the cache's size, N bytes are reserved for the rest of the system, until the cache gets to - // its desired size". - // - // This value must be >= 0, and must retain an increase that is more than what would be given by - // ResourceMultiplier. For example, setting ResourceMultiplier = 0.75 but SpreadFactor = 1 would - // be invalid, because SpreadFactor would induce only 50% usage - never reaching the 75% as - // desired by ResourceMultiplier. - // - // SpreadFactor is too large if (SpreadFactor+1) * ResourceMultiplier is >= 1. - SpreadFactor float64 -} - -func (c *FileCacheConfig) Validate() error { - // Check single-field validity - if !(0.0 < c.ResourceMultiplier && c.ResourceMultiplier < 1.0) { - return fmt.Errorf("ResourceMultiplier must be between 0.0 and 1.0, exclusive. 
Got %g", c.ResourceMultiplier) - } else if !(c.SpreadFactor >= 0.0) { - return fmt.Errorf("SpreadFactor must be >= 0, got: %g", c.SpreadFactor) - } else if c.MinRemainingAfterCache == 0 { - return fmt.Errorf("MinRemainingAfterCache must not be 0") - } - - // Check that ResourceMultiplier and SpreadFactor are valid w.r.t. each other. - // - // As shown in CalculateCacheSize, we have two lines resulting from ResourceMultiplier and - // SpreadFactor, respectively. They are: - // - // total MinRemainingAfterCache - // size = —————————————————— - ———————————————————————— - // SpreadFactor + 1 SpreadFactor + 1 - // - // and - // - // size = ResourceMultiplier × total - // - // .. where 'total' is the total resources. These are isomorphic to the typical 'y = mx + b' - // form, with y = "size" and x = "total". - // - // These lines intersect at: - // - // MinRemainingAfterCache - // ————————————————————————————————————————————— - // 1 - ResourceMultiplier × (SpreadFactor + 1) - // - // We want to ensure that this value (a) exists, and (b) is >= MinRemainingAfterCache. This is - // guaranteed when 'ResourceMultiplier × (SpreadFactor + 1)' is less than 1. - // (We also need it to be >= 0, but that's already guaranteed.) - - intersectFactor := c.ResourceMultiplier * (c.SpreadFactor + 1) - if !(intersectFactor < 1.0) { - return fmt.Errorf("incompatible ResourceMultiplier and SpreadFactor") - } - - return nil -} - -// CalculateCacheSize returns the desired size of the cache, given the total memory. -func (c *FileCacheConfig) CalculateCacheSize(total uint64) uint64 { - available := util.SaturatingSub(total, c.MinRemainingAfterCache) - - if available == 0 { - return 0 - } - - sizeFromSpread := uint64(util.Max(0, int64(float64(available)/(1.0+c.SpreadFactor)))) - // ^^^^^^^^^^^^^^^^^^^^^^^^ make sure we don't overflow from floating-point ops - sizeFromNormal := uint64(float64(total) * c.ResourceMultiplier) - - byteSize := util.Min(sizeFromSpread, sizeFromNormal) - var mib uint64 = 1 << 20 // 1 MiB = 1^20 bytes. - - // The file cache operates in units of mebibytes, so the sizes we produce should be rounded to a - // mebibyte. We round down to be conservative. - return byteSize / mib * mib -} - -// GetFileCacheSize returns the current size of the file cache, in bytes -func (s *FileCacheState) GetFileCacheSize(ctx context.Context) (uint64, error) { - db, err := sql.Open("postgres", s.connStr) - if err != nil { - return 0, fmt.Errorf("Error connecting to postgres: %w", err) - } - defer db.Close() - - // The file cache GUC variable is in MiB, but the conversion with pg_size_bytes means that the - // end result we get is in bytes. - var sizeInBytes uint64 - if err := db.QueryRowContext(ctx, `SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit'));`).Scan(&sizeInBytes); err != nil { - return 0, fmt.Errorf("Error querying file cache size: %w", err) - } - - return sizeInBytes, nil -} - -// SetFileCacheSize sets the size of the file cache, returning the actual size it was set to -func (s *FileCacheState) SetFileCacheSize(ctx context.Context, logger *zap.Logger, sizeInBytes uint64) (uint64, error) { - db, err := sql.Open("postgres", s.connStr) - if err != nil { - return 0, fmt.Errorf("Error connecting to postgres: %w", err) - } - defer db.Close() - - logger.Info("Fetching maximum file cache size") - - var maxSizeInBytes uint64 - err = db.QueryRowContext(ctx, `SELECT pg_size_bytes(current_setting('neon.max_file_cache_size'));`). 
- Scan(&maxSizeInBytes) - if err != nil { - return 0, fmt.Errorf("Error querying max file cache size: %w", err) - } - - var maybeCapped string - if sizeInBytes > maxSizeInBytes { - sizeInBytes = maxSizeInBytes - maybeCapped = " (capped by maximum size)" - } - - logger.Info( - fmt.Sprintf("Updating file cache size %s", maybeCapped), - zap.String("size", mib(sizeInBytes)), - zap.String("max", mib(maxSizeInBytes)), - ) - - // note: even though the normal ways to get the cache size produce values with trailing "MB" - // (hence why we call pg_size_bytes in GetFileCacheSize's query), the format it expects to set - // the value is "integer number of MB" without trailing units. For some reason, this *really* - // wasn't working with normal arguments, so that's why we're constructing the query here. - sizeInMB := sizeInBytes / (1 << 20) - setQuery := fmt.Sprintf(`ALTER SYSTEM SET neon.file_cache_size_limit = %d;`, sizeInMB) - if _, err := db.ExecContext(ctx, setQuery); err != nil { - return 0, fmt.Errorf("Error changing cache setting: %w", err) - } - - // must use pg_reload_conf to have the settings change take effect - if _, err := db.ExecContext(ctx, `SELECT pg_reload_conf();`); err != nil { - return 0, fmt.Errorf("Error reloading config: %w", err) - } - - return sizeInMB * (1 << 20), nil -} diff --git a/pkg/plugin/config.go b/pkg/plugin/config.go index 40f3ff093..79a349758 100644 --- a/pkg/plugin/config.go +++ b/pkg/plugin/config.go @@ -7,6 +7,8 @@ import ( "math" "os" + "golang.org/x/exp/slices" + "k8s.io/apimachinery/pkg/api/resource" vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" @@ -39,12 +41,39 @@ type Config struct { // version handled. SchedulerName string `json:"schedulerName"` + // RandomizeScores, if true, will cause the scheduler to score a node with a random number in + // the range [minScore + 1, trueScore], instead of the trueScore + RandomizeScores bool `json:"randomizeScores"` + + // MigrationDeletionRetrySeconds gives the duration, in seconds, we should wait between retrying + // a failed attempt to delete a VirtualMachineMigration that's finished. + MigrationDeletionRetrySeconds uint `json:"migrationDeletionRetrySeconds"` + // DoMigration, if provided, allows VM migration to be disabled // // This flag is intended to be temporary, just until NeonVM supports mgirations and we can // re-enable it. DoMigration *bool `json:"doMigration"` + // K8sNodeGroupLabel, if provided, gives the label to use when recording k8s node groups in the + // metrics (like for autoscaling_plugin_node_{cpu,mem}_resources_current) + K8sNodeGroupLabel string `json:"k8sNodeGroupLabel"` + + // K8sAvailabilityZoneLabel, if provided, gives the label to use when recording nodes' + // availability zones in the metrics (like for autoscaling_plugin_node_{cpu,mem}_resources_current) + K8sAvailabilityZoneLabel string `json:"k8sAvailabilityZoneLabel"` + + // IgnoreNamespaces, if provided, gives a list of namespaces that the plugin should completely + // ignore, as if pods from those namespaces do not exist. + // + // This is specifically designed for our "overprovisioning" namespace, which creates paused pods + // to trigger cluster-autoscaler. + // + // The only exception to this rule is during Filter method calls, where we do still count the + // resources from such pods. The reason to do that is so that these overprovisioning pods can be + // evicted, which will allow cluster-autoscaler to trigger scale-up. 
+ IgnoreNamespaces []string `json:"ignoreNamespaces"` + // DumpState, if provided, enables a server to dump internal state DumpState *dumpStateConfig `json:"dumpState"` @@ -61,6 +90,28 @@ type nodeConfig struct { Cpu resourceConfig `json:"cpu"` Memory resourceConfig `json:"memory"` ComputeUnit api.Resources `json:"computeUnit"` + + // Details about node scoring: + // See also: https://www.desmos.com/calculator/wg8s0yn63s + // In the desmos, the value f(x,s) gives the score (from 0 to 1) of a node that's x amount full + // (where x is a fraction from 0 to 1), with a total size that is equal to the maximum size node + // times s (i.e. s (or: "scale") gives the ratio between this nodes's size and the biggest one). + + // MinUsageScore gives the ratio of the score at the minimum usage (i.e. 0) relative to the + // score at the midpoint, which will have the maximum. + // + // This corresponds to y₀ in the desmos link above. + MinUsageScore float64 `json:"minUsageScore"` + // MaxUsageScore gives the ratio of the score at the maximum usage (i.e. full) relative to the + // score at the midpoint, which will have the maximum. + // + // This corresponds to y₁ in the desmos link above. + MaxUsageScore float64 `json:"maxUsageScore"` + // ScorePeak gives the fraction at which the "target" or highest score should be, with the score + // sloping down on either side towards MinUsageScore at 0 and MaxUsageScore at 1. + // + // This corresponds to xₚ in the desmos link. + ScorePeak float64 `json:"scorePeak"` } // resourceConfig configures the amount of a particular resource we're willing to allocate to VMs, @@ -74,9 +125,6 @@ type resourceConfig struct { // The word "watermark" was originally used by @zoete as a temporary stand-in term during a // meeting, and so it has intentionally been made permanent to spite the concept of "temporary" 😛 Watermark float32 `json:"watermark,omitempty"` - // System is the absolute amount of the resource allocated to non-user node functions, like - // Kubernetes daemons - System resource.Quantity `json:"system,omitempty"` } func (c *Config) migrationEnabled() bool { @@ -115,6 +163,10 @@ func (c *Config) validate() (string, error) { } } + if c.MigrationDeletionRetrySeconds == 0 { + return "migrationDeletionRetrySeconds", errors.New("value must be > 0") + } + return "", nil } @@ -131,32 +183,34 @@ func (s *overrideSet) validate() (string, error) { } func (c *nodeConfig) validate() (string, error) { - if path, err := c.Cpu.validate(false); err != nil { + if path, err := c.Cpu.validate(); err != nil { return fmt.Sprintf("cpu.%s", path), err } - if path, err := c.Memory.validate(true); err != nil { + if path, err := c.Memory.validate(); err != nil { return fmt.Sprintf("memory.%s", path), err } if err := c.ComputeUnit.ValidateNonZero(); err != nil { return "computeUnit", err } + if c.MinUsageScore < 0 || c.MinUsageScore > 1 { + return "minUsageScore", errors.New("value must be between 0 and 1, inclusive") + } else if c.MaxUsageScore < 0 || c.MaxUsageScore > 1 { + return "maxUsageScore", errors.New("value must be between 0 and 1, inclusive") + } else if c.ScorePeak < 0 || c.ScorePeak > 1 { + return "scorePeak", errors.New("value must be between 0 and 1, inclusive") + } + return "", nil } -func (c *resourceConfig) validate(isMemory bool) (string, error) { +func (c *resourceConfig) validate() (string, error) { if c.Watermark <= 0.0 { return "watermark", errors.New("value must be > 0") } else if c.Watermark > 1.0 { return "watermark", errors.New("value must be <= 1") } - if 
c.System.Value() <= 0 { - return "system", errors.New("value must be > 0") - } else if isMemory && c.System.Value() < math.MaxInt64 && c.System.MilliValue()%1000 != 0 { - return "system", errors.New("value cannot have milli-precision") - } - return "", nil } @@ -191,6 +245,11 @@ func ReadConfig(path string) (*Config, error) { // HELPER METHODS FOR USING CONFIGS // ////////////////////////////////////// +// ignoredNamespace returns whether items in the namespace should be treated as if they don't exist +func (c *Config) ignoredNamespace(namespace string) bool { + return slices.Contains(c.IgnoreNamespaces, namespace) +} + // forNode returns the individual nodeConfig for a node with a particular name, taking override // settings into account func (c *Config) forNode(nodeName string) *nodeConfig { @@ -206,32 +265,13 @@ func (c *Config) forNode(nodeName string) *nodeConfig { } func (c *nodeConfig) vCpuLimits(total *resource.Quantity) (_ nodeResourceState[vmapi.MilliCPU], margin *resource.Quantity, _ error) { - // We check both Value and MilliValue here in case the value overflows an int64 when - // multiplied by 1000, which is possible if c.Cpu.System is not in units of milli-CPU - if c.Cpu.System.Value() > total.Value() || c.Cpu.System.MilliValue() > total.MilliValue() { - err := fmt.Errorf("desired system vCPU %v greater than node total %v", &c.Cpu.System, total) - return nodeResourceState[vmapi.MilliCPU]{}, nil, err - } + totalMilli := total.MilliValue() - totalRounded := total.MilliValue() / 1000 - - // system CPU usage isn't measured directly, but as the number of additional *full* CPUs - // reserved for system functions *that we'd otherwise have available*. - // - // So if c.Cpu.System is less than the difference between total.MilliValue() and - // 1000*total.Value(), then systemCpus will be zero. - systemCpus := totalRounded - (total.MilliValue()-c.Cpu.System.MilliValue())/1000 - - reservableCpus := totalRounded - systemCpus - unreservableCpuMillis := total.MilliValue() - 1000*reservableCpus - - margin = resource.NewMilliQuantity(unreservableCpuMillis, c.Cpu.System.Format) - margin.Sub(c.Cpu.System) + margin = resource.NewMilliQuantity(0, total.Format) return nodeResourceState[vmapi.MilliCPU]{ - Total: vmapi.MilliCPU(totalRounded * 1000), - System: vmapi.MilliCPU(systemCpus * 1000), - Watermark: vmapi.MilliCPU(c.Cpu.Watermark * float32(reservableCpus) * 1000), + Total: vmapi.MilliCPU(totalMilli), + Watermark: vmapi.MilliCPU(c.Cpu.Watermark * float32(totalMilli)), Reserved: 0, Buffer: 0, CapacityPressure: 0, @@ -243,13 +283,7 @@ func (c *nodeConfig) memoryLimits( total *resource.Quantity, slotSize *resource.Quantity, ) (_ nodeResourceState[uint16], margin *resource.Quantity, _ error) { - if c.Memory.System.Cmp(*total) == 1 /* if c.Memory.System > total */ { - err := fmt.Errorf( - "desired system memory %v greater than node total %v", - &c.Memory.System, total, - ) - return nodeResourceState[uint16]{}, nil, err - } else if slotSize.Cmp(*total) == 1 /* if slotSize > total */ { + if slotSize.Cmp(*total) == 1 /* if slotSize > total */ { err := fmt.Errorf("slotSize %v greater than node total %v", slotSize, total) return nodeResourceState[uint16]{}, nil, err } @@ -260,23 +294,13 @@ func (c *nodeConfig) memoryLimits( return nodeResourceState[uint16]{}, nil, err } - // systemSlots isn't measured directly, but as the number of additional slots reserved for - // system functions *that we'd otherwise have available*. 
- // - // So if c.Memory.System is less than the leftover space between totalSlots*slotSize and total, - // then systemSlots will be zero. - systemSlots := totalSlots - (total.Value()-c.Memory.System.Value())/slotSize.Value() - - reservableSlots := totalSlots - systemSlots - unreservable := total.Value() - slotSize.Value()*reservableSlots + unreservable := total.Value() - slotSize.Value()*totalSlots margin = resource.NewQuantity(unreservable, total.Format) - margin.Sub(c.Memory.System) return nodeResourceState[uint16]{ Total: uint16(totalSlots), - System: uint16(systemSlots), - Watermark: uint16(c.Memory.Watermark * float32(reservableSlots)), + Watermark: uint16(c.Memory.Watermark * float32(totalSlots)), Reserved: 0, Buffer: 0, CapacityPressure: 0, diff --git a/pkg/plugin/dumpstate.go b/pkg/plugin/dumpstate.go index 0f9da2570..d4e297e56 100644 --- a/pkg/plugin/dumpstate.go +++ b/pkg/plugin/dumpstate.go @@ -99,6 +99,8 @@ type keyed[K any, V any] struct { } type pluginStateDump struct { + OngoingMigrationDeletions []keyed[util.NamespacedName, int] `json:"ongoingMigrationDeletions"` + Nodes []keyed[string, nodeStateDump] `json:"nodes"` VMPods []podNameAndPointer `json:"vmPods"` @@ -118,13 +120,15 @@ type podNameAndPointer struct { type pointerString string type nodeStateDump struct { - Obj pointerString `json:"obj"` - Name string `json:"name"` - VCPU nodeResourceState[vmapi.MilliCPU] `json:"vCPU"` - MemSlots nodeResourceState[uint16] `json:"memSlots"` - Pods []keyed[util.NamespacedName, podStateDump] `json:"pods"` - OtherPods []keyed[util.NamespacedName, otherPodStateDump] `json:"otherPods"` - Mq []*podNameAndPointer `json:"mq"` + Obj pointerString `json:"obj"` + Name string `json:"name"` + NodeGroup string `json:"nodeGroup"` + AvailabilityZone string `json:"availabilityZone"` + VCPU nodeResourceState[vmapi.MilliCPU] `json:"vCPU"` + MemSlots nodeResourceState[uint16] `json:"memSlots"` + Pods []keyed[util.NamespacedName, podStateDump] `json:"pods"` + OtherPods []keyed[util.NamespacedName, otherPodStateDump] `json:"otherPods"` + Mq []*podNameAndPointer `json:"mq"` } type podStateDump struct { @@ -189,7 +193,14 @@ func (s *pluginState) dump(ctx context.Context) (*pluginStateDump, error) { return kvx.Key < kvy.Key }) + ongoingMigrationDeletions := make([]keyed[util.NamespacedName, int], 0, len(s.ongoingMigrationDeletions)) + for k, count := range s.ongoingMigrationDeletions { + ongoingMigrationDeletions = append(ongoingMigrationDeletions, keyed[util.NamespacedName, int]{Key: k, Value: count}) + } + sortSliceByPodName(ongoingMigrationDeletions, func(kv keyed[util.NamespacedName, int]) util.NamespacedName { return kv.Key }) + return &pluginStateDump{ + OngoingMigrationDeletions: ongoingMigrationDeletions, Nodes: nodes, VMPods: vmPods, OtherPods: otherPods, @@ -223,13 +234,15 @@ func (s *nodeState) dump() nodeStateDump { } return nodeStateDump{ - Obj: makePointerString(s), - Name: s.name, - VCPU: s.vCPU, - MemSlots: s.memSlots, - Pods: pods, - OtherPods: otherPods, - Mq: mq, + Obj: makePointerString(s), + Name: s.name, + NodeGroup: s.nodeGroup, + AvailabilityZone: s.availabilityZone, + VCPU: s.vCPU, + MemSlots: s.memSlots, + Pods: pods, + OtherPods: otherPods, + Mq: mq, } } diff --git a/pkg/plugin/plugin.go b/pkg/plugin/plugin.go index f6e9eb47b..ad9521f2c 100644 --- a/pkg/plugin/plugin.go +++ b/pkg/plugin/plugin.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "math/rand" "time" "github.com/tychoish/fun/pubsub" @@ -25,6 +26,7 @@ import ( const Name = "AutoscaleEnforcer" const LabelVM = 
vmapi.VirtualMachineNameLabel +const LabelPluginCreatedMigration = "autoscaling.neon.tech/created-by-scheduler" const ConfigMapNamespace = "kube-system" const ConfigMapName = "scheduler-plugin-config" const ConfigMapKey = "autoscaler-enforcer-config.json" @@ -100,8 +102,9 @@ func makeAutoscaleEnforcerPlugin( vmClient: vmClient, // remaining fields are set by p.readClusterState and p.makePrometheusRegistry state: pluginState{ //nolint:exhaustruct // see above. - lock: util.NewChanMutex(), - conf: config, + lock: util.NewChanMutex(), + ongoingMigrationDeletions: make(map[util.NamespacedName]int), + conf: config, }, metrics: PromMetrics{}, //nolint:exhaustruct // set by makePrometheusRegistry vmStore: IndexedVMStore{}, //nolint:exhaustruct // set below @@ -130,6 +133,9 @@ func makeAutoscaleEnforcerPlugin( }, } pwc := podWatchCallbacks{ + submitPodStarted: func(logger *zap.Logger, pod *corev1.Pod) { + pushToQueue(logger, func() { p.handlePodStarted(hlogger, pod) }) + }, submitVMDeletion: func(logger *zap.Logger, pod util.NamespacedName) { pushToQueue(logger, func() { p.handleVMDeletion(hlogger, pod) }) }, @@ -154,6 +160,13 @@ func makeAutoscaleEnforcerPlugin( pushToQueue(logger, func() { p.handleNonAutoscalingUsageChange(hlogger, vm, podName) }) }, } + mwc := migrationWatchCallbacks{ + submitMigrationFinished: func(vmm *vmapi.VirtualMachineMigration) { + // When cleaning up migrations, we don't want to process those events synchronously. + // So instead, we'll spawn a goroutine to delete the completed migration. + go p.cleanupMigration(hlogger, vmm) + }, + } watchMetrics := watch.NewMetrics("autoscaling_plugin_watchers") @@ -166,16 +179,24 @@ func makeAutoscaleEnforcerPlugin( p.nodeStore = watch.NewIndexedStore(nodeStore, watch.NewFlatNameIndex[corev1.Node]()) logger.Info("Starting pod watcher") - if err := p.watchPodEvents(ctx, logger, watchMetrics, pwc); err != nil { + podStore, err := p.watchPodEvents(ctx, logger, watchMetrics, pwc) + if err != nil { return nil, fmt.Errorf("Error starting pod watcher: %w", err) } + podIndex := watch.NewIndexedStore(podStore, watch.NewNameIndex[corev1.Pod]()) + logger.Info("Starting VM watcher") - vmStore, err := p.watchVMEvents(ctx, logger, watchMetrics, vwc) + vmStore, err := p.watchVMEvents(ctx, logger, watchMetrics, vwc, podIndex) if err != nil { return nil, fmt.Errorf("Error starting VM watcher: %w", err) } + logger.Info("Starting VM Migration watcher") + if _, err := p.watchMigrationEvents(ctx, logger, watchMetrics, mwc); err != nil { + return nil, fmt.Errorf("Error starting VM Migration watcher: %w", err) + } + p.vmStore = watch.NewIndexedStore(vmStore, watch.NewNameIndex[vmapi.VirtualMachine]()) // makePrometheusRegistry sets p.metrics, which we need to do before calling readClusterState, @@ -190,14 +211,15 @@ func makeAutoscaleEnforcerPlugin( } go func() { - iter := queue.Iterator() - for iter.Next(ctx) { - callback := iter.Value() + for { + callback, err := queue.Wait(ctx) // NB: Wait pulls from the front of the queue + if err != nil { + logger.Info("Stopped waiting on pod/VM queue", zap.Error(err)) + break + } + callback() } - if err := iter.Close(); err != nil { - logger.Info("Stopped waiting on pod/VM queue", zap.Error(err)) - } }() if err := util.StartPrometheusMetricsServer(ctx, logger.Named("prometheus"), 9100, promReg); err != nil { @@ -233,7 +255,7 @@ func (e *AutoscaleEnforcer) Name() string { // getVmInfo is a helper for the plugin-related functions // // This function returns nil, nil if the pod is not associated with a NeonVM virtual 
machine. -func getVmInfo(logger *zap.Logger, vmStore IndexedVMStore, pod *corev1.Pod) (*api.VmInfo, error) { +func (e *AutoscaleEnforcer) getVmInfo(logger *zap.Logger, pod *corev1.Pod, action string) (*api.VmInfo, error) { var vmName util.NamespacedName vmName.Namespace = pod.Namespace @@ -247,7 +269,7 @@ func getVmInfo(logger *zap.Logger, vmStore IndexedVMStore, pod *corev1.Pod) (*ap return index.Get(vmName.Namespace, vmName.Name) } - vm, ok := vmStore.GetIndexed(accessor) + vm, ok := e.vmStore.GetIndexed(accessor) if !ok { logger.Warn( "VM is missing from local store. Relisting", @@ -264,13 +286,13 @@ func getVmInfo(logger *zap.Logger, vmStore IndexedVMStore, pod *corev1.Pod) (*ap defer timer.Stop() select { - case <-vmStore.Relist(): + case <-e.vmStore.Relist(): case <-timer.C: return nil, fmt.Errorf("Timed out waiting on VM store relist (timeout = %s)", timeout) } // retry fetching the VM, now that we know it's been synced. - vm, ok = vmStore.GetIndexed(accessor) + vm, ok = e.vmStore.GetIndexed(accessor) if !ok { // if the VM is still not present after relisting, then either it's already been deleted // or there's a deeper problem. @@ -280,6 +302,15 @@ func getVmInfo(logger *zap.Logger, vmStore IndexedVMStore, pod *corev1.Pod) (*ap vmInfo, err := api.ExtractVmInfo(logger, vm) if err != nil { + e.handle.EventRecorder().Eventf( + vm, // regarding + pod, // related + "Warning", // eventtype + "ExtractVmInfo", // reason + action, // action + "Failed to extract autoscaling info about VM: %s", // node + err, + ) return nil, fmt.Errorf("Error extracting VM info: %w", err) } @@ -310,9 +341,11 @@ func (e *AutoscaleEnforcer) PreFilter( state *framework.CycleState, pod *corev1.Pod, ) (_ *framework.PreFilterResult, status *framework.Status) { - e.metrics.pluginCalls.WithLabelValues("PreFilter").Inc() + ignored := e.state.conf.ignoredNamespace(pod.Namespace) + + e.metrics.IncMethodCall("PreFilter", ignored) defer func() { - e.metrics.IncFailIfNotSuccess("PreFilter", status) + e.metrics.IncFailIfNotSuccess("PreFilter", ignored, status) }() return nil, nil @@ -338,11 +371,16 @@ func (e *AutoscaleEnforcer) PostFilter( pod *corev1.Pod, filteredNodeStatusMap framework.NodeToStatusMap, ) (_ *framework.PostFilterResult, status *framework.Status) { - e.metrics.pluginCalls.WithLabelValues("PostFilter").Inc() + ignored := e.state.conf.ignoredNamespace(pod.Namespace) + + e.metrics.IncMethodCall("PostFilter", ignored) defer func() { - e.metrics.IncFailIfNotSuccess("PostFilter", status) + e.metrics.IncFailIfNotSuccess("PostFilter", ignored, status) }() + logger := e.logger.With(zap.String("method", "Filter"), util.PodNameFields(pod)) + logger.Error("Pod rejected by all Filter method calls") + return nil, nil // PostFilterResult is optional, nil Status is success. } @@ -355,9 +393,11 @@ func (e *AutoscaleEnforcer) Filter( pod *corev1.Pod, nodeInfo *framework.NodeInfo, ) (status *framework.Status) { - e.metrics.pluginCalls.WithLabelValues("Filter").Inc() + ignored := e.state.conf.ignoredNamespace(pod.Namespace) + + e.metrics.IncMethodCall("Filter", ignored) defer func() { - e.metrics.IncFailIfNotSuccess("Filter", status) + e.metrics.IncFailIfNotSuccess("Filter", ignored, status) }() nodeName := nodeInfo.Node().Name // TODO: nodes also have namespaces? are they used at all? 
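The relist fallback in `getVmInfo` above follows a common pattern: kick off a refresh of the store, then wait for it with a bounded timeout so a scheduler callback can never hang indefinitely. Below is a minimal sketch of that pattern; the plain channel stands in for `vmStore.Relist()`.

```go
package main

import (
	"fmt"
	"time"
)

// waitForRelist waits for a refresh signal (here modelled as a channel that is
// closed when the refresh finishes), but gives up after a timeout instead of
// blocking forever.
func waitForRelist(relisted <-chan struct{}, timeout time.Duration) error {
	timer := time.NewTimer(timeout)
	defer timer.Stop()

	select {
	case <-relisted:
		return nil
	case <-timer.C:
		return fmt.Errorf("timed out waiting on store relist (timeout = %s)", timeout)
	}
}

func main() {
	relisted := make(chan struct{})
	go func() {
		time.Sleep(10 * time.Millisecond) // pretend the relist takes a little while
		close(relisted)
	}()

	if err := waitForRelist(relisted, 5*time.Second); err != nil {
		fmt.Println("relist failed:", err)
		return
	}
	fmt.Println("store synced; retry the lookup")
}
```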
@@ -365,7 +405,11 @@ func (e *AutoscaleEnforcer) Filter( logger := e.logger.With(zap.String("method", "Filter"), zap.String("node", nodeName), util.PodNameFields(pod)) logger.Info("Handling Filter request") - vmInfo, err := getVmInfo(logger, e.vmStore, pod) + if ignored { + logger.Warn("Received Filter request for pod in ignored namespace, continuing anyways.") + } + + vmInfo, err := e.getVmInfo(logger, pod, "Filter") if err != nil { logger.Error("Error getting VM info for Pod", zap.Error(err)) return framework.NewStatus( @@ -429,17 +473,82 @@ func (e *AutoscaleEnforcer) Filter( otherResources.MarginCPU = node.otherResources.MarginCPU otherResources.MarginMemory = node.otherResources.MarginMemory + // As we process all pods, we should record all the pods that aren't present in both nodeInfo + // and e.state's maps, so that we can log any inconsistencies instead of silently using + // *potentially* bad data. Some differences are expected, but on the whole this extra + // information should be helpful. + missedPods := make(map[util.NamespacedName]struct{}) + for name := range node.pods { + missedPods[name] = struct{}{} + } + for name := range node.otherPods { + missedPods[name] = struct{}{} + } + + var includedIgnoredPods []util.NamespacedName + for _, podInfo := range nodeInfo.Pods { pn := util.NamespacedName{Name: podInfo.Pod.Name, Namespace: podInfo.Pod.Namespace} if podState, ok := e.state.podMap[pn]; ok { totalNodeVCPU += podState.vCPU.Reserved totalNodeMem += podState.memSlots.Reserved + delete(missedPods, pn) } else if otherPodState, ok := e.state.otherPods[pn]; ok { oldRes := otherResources otherResources = oldRes.addPod(&e.state.conf.MemSlotSize, otherPodState.resources) totalNodeVCPU += otherResources.ReservedCPU - oldRes.ReservedCPU totalNodeMem += otherResources.ReservedMemSlots - oldRes.ReservedMemSlots + delete(missedPods, pn) + } else { + name := util.GetNamespacedName(podInfo.Pod) + + if util.PodCompleted(podInfo.Pod) { + logger.Warn( + "Skipping completed Pod in Filter node's pods", + zap.Object("pod", name), + zap.String("phase", string(podInfo.Pod.Status.Phase)), + ) + continue + } + + if !e.state.conf.ignoredNamespace(podInfo.Pod.Namespace) { + // FIXME: this gets us duplicated "pod" fields. Not great. But we're using + // logger.With pretty pervasively, and it's hard to avoid this while using that. + // For now, we can get around this by including the pod name in an error. + logger.Error( + "Unknown-but-not-ignored Pod in Filter node's pods", + zap.Object("pod", name), + zap.Error(fmt.Errorf("Pod %v is unknown but not ignored", name)), + ) + } else { + includedIgnoredPods = append(includedIgnoredPods, name) + } + + // We *also* need to count pods in ignored namespaces + resources, err := extractPodOtherPodResourceState(podInfo.Pod) + if err != nil { + // FIXME: Same duplicate "pod" field issue as above; same temporary solution. 
+ logger.Error( + "Error extracting resource state for non-VM Pod", + zap.Object("pod", name), + zap.Error(fmt.Errorf("Error extracting resource state for %v: %w", name, err)), + ) + continue + } + + oldRes := otherResources + otherResources = oldRes.addPod(&e.state.conf.MemSlotSize, resources) + totalNodeVCPU += otherResources.ReservedCPU - oldRes.ReservedCPU + totalNodeMem += otherResources.ReservedMemSlots - oldRes.ReservedMemSlots + } + } + + if len(missedPods) != 0 { + var missedPodsList []util.NamespacedName + for name := range missedPods { + missedPodsList = append(missedPodsList, name) } + logger.Warn("Some known Pods weren't included in Filter NodeInfo", zap.Objects("missedPods", missedPodsList)) } nodeTotalReservableCPU := node.totalReservableCPU() @@ -506,14 +615,18 @@ func (e *AutoscaleEnforcer) Filter( } var message string + var logFunc func(string, ...zap.Field) if allowing { message = "Allowing Pod" + logFunc = logger.Info } else { message = "Rejecting Pod" + logFunc = logger.Warn } - logger.Info( + logFunc( message, + zap.Objects("includedIgnoredPods", includedIgnoredPods), zap.Object("verdict", verdictSet{ cpu: cpuMsg, mem: memMsg, @@ -545,9 +658,11 @@ func (e *AutoscaleEnforcer) Score( pod *corev1.Pod, nodeName string, ) (_ int64, status *framework.Status) { - e.metrics.pluginCalls.WithLabelValues("Score").Inc() + ignored := e.state.conf.ignoredNamespace(pod.Namespace) + + e.metrics.IncMethodCall("Score", ignored) defer func() { - e.metrics.IncFailIfNotSuccess("Score", status) + e.metrics.IncFailIfNotSuccess("Score", ignored, status) }() logger := e.logger.With(zap.String("method", "Score"), zap.String("node", nodeName), util.PodNameFields(pod)) @@ -555,7 +670,7 @@ func (e *AutoscaleEnforcer) Score( scoreLen := framework.MaxNodeScore - framework.MinNodeScore - vmInfo, err := getVmInfo(logger, e.vmStore, pod) + vmInfo, err := e.getVmInfo(logger, pod, "Score") if err != nil { logger.Error("Error getting VM info for Pod", zap.Error(err)) return 0, framework.NewStatus(framework.Error, "Error getting info for pod") @@ -583,32 +698,122 @@ func (e *AutoscaleEnforcer) Score( (vmInfo.Cpu.Use > node.remainingReservableCPU() || vmInfo.Mem.Use > node.remainingReservableMemSlots()) if noRoom { - return framework.MinNodeScore, nil + score := framework.MinNodeScore + logger.Warn("No room on node, giving minimum score (typically handled by Filter method)", zap.Int64("score", score)) + return score, nil } - totalMilliCpu := int64(node.totalReservableCPU()) - totalMem := int64(node.totalReservableMemSlots()) - maxTotalMilliCpu := int64(e.state.maxTotalReservableCPU) - maxTotalMem := int64(e.state.maxTotalReservableMemSlots) + cpuRemaining := node.remainingReservableCPU() + cpuTotal := node.totalReservableCPU() + memRemaining := node.remainingReservableMemSlots() + memTotal := node.totalReservableMemSlots() + + cpuFraction := 1 - cpuRemaining.AsFloat64()/cpuTotal.AsFloat64() + memFraction := 1 - float64(memRemaining)/float64(memTotal) + cpuScale := node.totalReservableCPU().AsFloat64() / e.state.maxTotalReservableCPU.AsFloat64() + memScale := float64(node.totalReservableMemSlots()) / float64(e.state.maxTotalReservableMemSlots) + + nodeConf := e.state.conf.forNode(nodeName) + + // Refer to the comments in nodeConfig for more. 
Also, see: https://www.desmos.com/calculator/wg8s0yn63s + calculateScore := func(fraction, scale float64) (float64, int64) { + y0 := nodeConf.MinUsageScore + y1 := nodeConf.MaxUsageScore + xp := nodeConf.ScorePeak + + score := float64(1) // if fraction == nodeConf.ScorePeak + if fraction < nodeConf.ScorePeak { + score = y0 + (1-y0)/xp*fraction + } else if fraction > nodeConf.ScorePeak { + score = y1 + (1-y1)/(1-xp)*(1-fraction) + } - // The ordering of multiplying before dividing is intentional; it allows us to get an exact - // result, because scoreLen and total will both be small (i.e. their product fits within an int64) - scoreCpu := framework.MinNodeScore + scoreLen*totalMilliCpu/maxTotalMilliCpu - scoreMem := framework.MinNodeScore + scoreLen*totalMem/maxTotalMem + score *= scale - // return the minimum of the two resources scores - if scoreCpu < scoreMem { - return scoreCpu, nil - } else { - return scoreMem, nil + return score, framework.MinNodeScore + int64(float64(scoreLen)*score) } + + cpuFScore, cpuIScore := calculateScore(cpuFraction, cpuScale) + memFScore, memIScore := calculateScore(memFraction, memScale) + + score := util.Min(cpuIScore, memIScore) + logger.Info( + "Scored pod placement for node", + zap.Int64("score", score), + zap.Object("verdict", verdictSet{ + cpu: fmt.Sprintf( + "%d remaining reservable of %d total => fraction=%g, scale=%g => score=(%g :: %d)", + cpuRemaining, cpuTotal, cpuFraction, cpuScale, cpuFScore, cpuIScore, + ), + mem: fmt.Sprintf( + "%d remaining reservable of %d total => fraction=%g, scale=%g => score=(%g :: %d)", + memRemaining, memTotal, memFraction, memScale, memFScore, memIScore, + ), + }), + ) + + return score, nil } -// ScoreExtensions is required for framework.ScorePlugin, and can return nil if it's not used -func (e *AutoscaleEnforcer) ScoreExtensions() framework.ScoreExtensions { +// NormalizeScore weights scores uniformly in the range [minScore, trueScore], where +// minScore is framework.MinNodeScore + 1. +func (e *AutoscaleEnforcer) NormalizeScore( + ctx context.Context, + state *framework.CycleState, + pod *corev1.Pod, + scores framework.NodeScoreList, +) (status *framework.Status) { + ignored := e.state.conf.ignoredNamespace(pod.Namespace) + + e.metrics.IncMethodCall("NormalizeScore", ignored) + defer func() { + e.metrics.IncFailIfNotSuccess("NormalizeScore", ignored, status) + }() + + logger := e.logger.With(zap.String("method", "NormalizeScore"), util.PodNameFields(pod)) + logger.Info("Handling NormalizeScore request") + + for _, node := range scores { + nodeScore := node.Score + nodeName := node.Name + + // rand.Intn will panic if we pass in 0 + if nodeScore == 0 { + logger.Info("Ignoring node as it was assigned a score of 0", zap.String("node", nodeName)) + continue + } + + // This is different from framework.MinNodeScore. We use framework.MinNodeScore + // to indicate that a pod should not be placed on a node. 
The lowest + // actual score we assign a node is thus framework.MinNodeScore + 1 + minScore := framework.MinNodeScore + 1 + + // We want to pick a score in the range [minScore, score], so use + // score _+ 1_ - minscore, as rand.Intn picks a number in the _half open_ + // range [0, n) + newScore := int64(rand.Intn(int(nodeScore+1-minScore))) + minScore + logger.Info( + "Randomly choosing newScore from range [minScore, trueScore]", + zap.String("node", nodeName), + zap.Int64("newScore", newScore), + zap.Int64("minScore", minScore), + zap.Int64("trueScore", nodeScore), + ) + node.Score = newScore + } return nil } +// ScoreExtensions is required for framework.ScorePlugin, and can return nil if it's not used. +// However, we do use it, to randomize scores. +func (e *AutoscaleEnforcer) ScoreExtensions() framework.ScoreExtensions { + if e.state.conf.RandomizeScores { + return e + } else { + return nil + } +} + // Reserve signals to our plugin that a particular pod will (probably) be bound to a node, giving us // a chance to both (a) reserve the resources it needs within the node and (b) reject the pod if // there aren't enough. @@ -620,9 +825,11 @@ func (e *AutoscaleEnforcer) Reserve( pod *corev1.Pod, nodeName string, ) (status *framework.Status) { - e.metrics.pluginCalls.WithLabelValues("Reserve").Inc() + ignored := e.state.conf.ignoredNamespace(pod.Namespace) + + e.metrics.IncMethodCall("Reserve", ignored) defer func() { - e.metrics.IncFailIfNotSuccess("Reserve", status) + e.metrics.IncFailIfNotSuccess("Reserve", ignored, status) }() pName := util.GetNamespacedName(pod) @@ -633,7 +840,13 @@ func (e *AutoscaleEnforcer) Reserve( logger.Info("Handling Reserve request") - vmInfo, err := getVmInfo(logger, e.vmStore, pod) + if ignored { + // Generally, we shouldn't be getting plugin requests for resources that are ignored. + logger.Warn("Ignoring Reserve request for pod in ignored namespace") + return nil // success; allow the Pod onto the node. 
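// ----- Illustrative sketch (editor's addition, not part of the patch) -----
// The Score method above replaces the old "fuller node => higher score" line with a piecewise
// linear curve that peaks at ScorePeak and is then weighted by the node's share of the largest
// node's capacity. This standalone version of that calculation uses made-up values for
// MinUsageScore (y0), MaxUsageScore (y1) and ScorePeak (xp); they are not defaults from the repo.
package main

import "fmt"

// usageScore is 1.0 at fraction == xp, falls off linearly to y0 at fraction == 0 and to y1 at
// fraction == 1, and is then scaled by the node's capacity relative to the biggest node.
func usageScore(fraction, scale, y0, y1, xp float64) float64 {
	score := 1.0 // value when fraction == xp
	if fraction < xp {
		score = y0 + (1-y0)/xp*fraction
	} else if fraction > xp {
		score = y1 + (1-y1)/(1-xp)*(1-fraction)
	}
	return score * scale
}

func main() {
	// Example: peak at 50% usage, endpoint scores of 0.5 (empty node) and 0.0 (full node), and a
	// node with 80% of the capacity of the largest node in the cluster.
	const scale, y0, y1, xp = 0.8, 0.5, 0.0, 0.5
	for _, frac := range []float64{0, 0.25, 0.5, 0.75, 1} {
		f := usageScore(frac, scale, y0, y1, xp)
		// Mapping to the framework's integer range, assuming the upstream MinNodeScore=0 and
		// MaxNodeScore=100 constants.
		fmt.Printf("fraction=%.2f -> score=%.3f -> int score=%d\n", frac, f, int64(100*f))
	}
}
// ----- end sketch -----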
+ } + + vmInfo, err := e.getVmInfo(logger, pod, "Reserve") if err != nil { logger.Error("Error getting VM info for pod", zap.Error(err)) return framework.NewStatus( @@ -737,12 +950,12 @@ func (e *AutoscaleEnforcer) Reserve( } cpuVerdict := fmt.Sprintf( - "need %v vCPU (%v -> %v raw), have %v available (%s)", - addCpu, &oldNodeRes.RawCPU, &newNodeRes.RawCPU, node.remainingReservableCPU(), cpuShortVerdict, + "need %v (%v -> %v raw), %v of %v used, so %v available (%s)", + addCpu, &oldNodeRes.RawCPU, &newNodeRes.RawCPU, node.vCPU.Reserved, node.totalReservableCPU(), node.remainingReservableCPU(), cpuShortVerdict, ) memVerdict := fmt.Sprintf( - "need %v mem slots (%v -> %v raw), have %d available (%s)", - addMem, &oldNodeRes.RawMemory, &newNodeRes.RawMemory, node.remainingReservableMemSlots(), memShortVerdict, + "need %v (%v -> %v raw), %v of %v used, so %v available (%s)", + addMem, &oldNodeRes.RawMemory, &newNodeRes.RawMemory, node.memSlots.Reserved, node.totalReservableMemSlots(), node.remainingReservableMemSlots(), memShortVerdict, ) logger.Error( @@ -819,8 +1032,14 @@ func (e *AutoscaleEnforcer) Reserve( memShortVerdict = "OK" } - cpuVerdict := fmt.Sprintf("need %v vCPU, have %v available (%s)", vmInfo.Cpu.Use, node.remainingReservableCPU(), cpuShortVerdict) - memVerdict := fmt.Sprintf("need %v mem slots, have %v available (%s)", vmInfo.Mem.Use, node.remainingReservableMemSlots(), memShortVerdict) + cpuVerdict := fmt.Sprintf( + "need %v, %v of %v used, so %v available (%s)", + vmInfo.Cpu.Use, node.vCPU.Reserved, node.totalReservableCPU(), node.remainingReservableCPU(), cpuShortVerdict, + ) + memVerdict := fmt.Sprintf( + "need %v, %v of %v used, so %v available (%s)", + vmInfo.Mem.Use, node.memSlots.Reserved, node.totalReservableMemSlots(), node.remainingReservableMemSlots(), memShortVerdict, + ) logger.Error( "Can't reserve VM pod (not enough resources)", @@ -846,13 +1065,20 @@ func (e *AutoscaleEnforcer) Unreserve( pod *corev1.Pod, nodeName string, ) { - e.metrics.pluginCalls.WithLabelValues("Unreserve").Inc() + ignored := e.state.conf.ignoredNamespace(pod.Namespace) + e.metrics.IncMethodCall("Unreserve", ignored) podName := util.GetNamespacedName(pod) logger := e.logger.With(zap.String("method", "Unreserve"), zap.String("node", nodeName), util.PodNameFields(pod)) logger.Info("Handling Unreserve request") + if ignored { + // Generally, we shouldn't be getting plugin requests for resources that are ignored. 
+ logger.Warn("Ignoring Unreserve request for pod in ignored namespace") + return + } + e.state.lock.Lock() defer e.state.lock.Unlock() @@ -898,7 +1124,7 @@ func (e *AutoscaleEnforcer) Unreserve( ps.node.updateMetrics(e.metrics, e.state.memSlotSizeBytes()) } else { - logger.Warn("Cannot find pod in podMap in otherPods") + logger.Warn("Cannot find pod in podMap or otherPods") return } } diff --git a/pkg/plugin/prommetrics.go b/pkg/plugin/prommetrics.go index 870479d9b..28ef2f46b 100644 --- a/pkg/plugin/prommetrics.go +++ b/pkg/plugin/prommetrics.go @@ -3,6 +3,8 @@ package plugin // defines prometheus metrics and provides the server, via (*AutoscaleEnforcer).startPrometheusServer() import ( + "strconv" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/collectors" @@ -18,6 +20,10 @@ type PromMetrics struct { validResourceRequests *prometheus.CounterVec nodeCPUResources *prometheus.GaugeVec nodeMemResources *prometheus.GaugeVec + migrationCreations prometheus.Counter + migrationDeletions *prometheus.CounterVec + migrationCreateFails prometheus.Counter + migrationDeleteFails *prometheus.CounterVec } func (p *AutoscaleEnforcer) makePrometheusRegistry() *prometheus.Registry { @@ -38,14 +44,14 @@ func (p *AutoscaleEnforcer) makePrometheusRegistry() *prometheus.Registry { Name: "autoscaling_plugin_extension_calls_total", Help: "Number of calls to scheduler plugin extension points", }, - []string{"method"}, + []string{"method", "ignored_namespace"}, )), pluginCallFails: util.RegisterMetric(reg, prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "autoscaling_plugin_extension_call_fails_total", Help: "Number of unsuccessful calls to scheduler plugin extension points", }, - []string{"method", "status"}, + []string{"method", "ignored_namespace", "status"}, )), resourceRequests: util.RegisterMetric(reg, prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -66,24 +72,54 @@ func (p *AutoscaleEnforcer) makePrometheusRegistry() *prometheus.Registry { Name: "autoscaling_plugin_node_cpu_resources_current", Help: "Current amount of CPU for 'nodeResourceState' fields", }, - []string{"node", "field"}, + []string{"node", "node_group", "availability_zone", "field"}, )), nodeMemResources: util.RegisterMetric(reg, prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "autoscaling_plugin_node_mem_resources_current", Help: "Current amount of memory (in bytes) for 'nodeResourceState' fields", }, - []string{"node", "field"}, + []string{"node", "node_group", "availability_zone", "field"}, + )), + migrationCreations: util.RegisterMetric(reg, prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "autoscaling_plugin_migrations_created_total", + Help: "Number of successful VirtualMachineMigration Create requests by the plugin", + }, + )), + migrationDeletions: util.RegisterMetric(reg, prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "autoscaling_plugin_migrations_deleted_total", + Help: "Number of successful VirtualMachineMigration Delete requests by the plugin", + }, + []string{"phase"}, + )), + migrationCreateFails: util.RegisterMetric(reg, prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "autoscaling_plugin_migration_create_fails_total", + Help: "Number of failed VirtualMachineMigration Create requests by the plugin", + }, + )), + migrationDeleteFails: util.RegisterMetric(reg, prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "autoscaling_plugin_migration_delete_fails_total", + Help: "Number of failed VirtualMachineMigration Delete requests 
by the plugin", + }, + []string{"phase"}, )), } return reg } -func (m *PromMetrics) IncFailIfNotSuccess(method string, status *framework.Status) { +func (m *PromMetrics) IncMethodCall(method string, ignored bool) { + m.pluginCalls.WithLabelValues(method, strconv.FormatBool(ignored)).Inc() +} + +func (m *PromMetrics) IncFailIfNotSuccess(method string, ignored bool, status *framework.Status) { if !status.IsSuccess() { return } - m.pluginCallFails.WithLabelValues(method, status.Code().String()) + m.pluginCallFails.WithLabelValues(method, strconv.FormatBool(ignored), status.Code().String()) } diff --git a/pkg/plugin/run.go b/pkg/plugin/run.go index 90ec312ba..33870d94b 100644 --- a/pkg/plugin/run.go +++ b/pkg/plugin/run.go @@ -180,11 +180,18 @@ func (e *AutoscaleEnforcer) handleAgentRequest( var migrateDecision *api.MigrateResponse if mustMigrate { - migrateDecision = &api.MigrateResponse{} - err = e.state.startMigration(context.Background(), logger, pod, e.vmClient) + created, err := e.startMigration(context.Background(), logger, pod) if err != nil { return nil, 500, fmt.Errorf("Error starting migration for pod %v: %w", pod.name, err) } + + // We should only signal to the autoscaler-agent that we've started migrating if we actually + // *created* the migration. We're not *supposed* to receive requests for a VM that's already + // migrating, so receiving one means that *something*'s gone wrong. If that's on us, we + // should try to avoid + if created { + migrateDecision = &api.MigrateResponse{} + } } resp := api.PluginResponse{ diff --git a/pkg/plugin/state.go b/pkg/plugin/state.go index 66c5772da..6dc8cedfd 100644 --- a/pkg/plugin/state.go +++ b/pkg/plugin/state.go @@ -19,7 +19,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" vmapi "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" - vmclient "github.com/neondatabase/autoscaling/neonvm/client/clientset/versioned" "github.com/neondatabase/autoscaling/pkg/api" "github.com/neondatabase/autoscaling/pkg/util" @@ -33,6 +32,8 @@ import ( type pluginState struct { lock util.ChanMutex + ongoingMigrationDeletions map[util.NamespacedName]int + podMap map[util.NamespacedName]*podState nodeMap map[string]*nodeState @@ -60,6 +61,12 @@ type nodeState struct { // name is the name of the node, guaranteed by kubernetes to be unique name string + // nodeGroup, if present, gives the node group that this node belongs to. + nodeGroup string + + // availabilityZone, if present, gives the availability zone that this node is in. 
+ availabilityZone string + // vCPU tracks the state of vCPU resources -- what's available and how vCPU nodeResourceState[vmapi.MilliCPU] // memSlots tracks the state of memory slots -- what's available and how @@ -90,7 +97,6 @@ type nodeResourceStateField[T any] struct { func (s *nodeResourceState[T]) fields() []nodeResourceStateField[T] { return []nodeResourceStateField[T]{ {"Total", s.Total}, - {"System", s.System}, {"Watermark", s.Watermark}, {"Reserved", s.Reserved}, {"Buffer", s.Buffer}, @@ -100,15 +106,21 @@ func (s *nodeResourceState[T]) fields() []nodeResourceStateField[T] { } func (s *nodeState) updateMetrics(metrics PromMetrics, memSlotSizeBytes uint64) { - s.vCPU.updateMetrics(metrics.nodeCPUResources, s.name, vmapi.MilliCPU.AsFloat64) - s.memSlots.updateMetrics(metrics.nodeMemResources, s.name, func(memSlots uint16) float64 { + s.vCPU.updateMetrics(metrics.nodeCPUResources, s.name, s.nodeGroup, s.availabilityZone, vmapi.MilliCPU.AsFloat64) + s.memSlots.updateMetrics(metrics.nodeMemResources, s.name, s.nodeGroup, s.availabilityZone, func(memSlots uint16) float64 { return float64(uint64(memSlots) * memSlotSizeBytes) // convert memSlots -> bytes }) } -func (s *nodeResourceState[T]) updateMetrics(metric *prometheus.GaugeVec, nodeName string, convert func(T) float64) { +func (s *nodeResourceState[T]) updateMetrics( + metric *prometheus.GaugeVec, + nodeName string, + nodeGroup string, + availabilityZone string, + convert func(T) float64, +) { for _, f := range s.fields() { - metric.WithLabelValues(nodeName, f.valueName).Set(convert(f.value)) + metric.WithLabelValues(nodeName, nodeGroup, availabilityZone, f.valueName).Set(convert(f.value)) } } @@ -118,7 +130,7 @@ func (s *nodeState) removeMetrics(metrics PromMetrics) { for _, g := range gauges { for _, f := range fields { - g.DeleteLabelValues(s.name, f.valueName) + g.DeleteLabelValues(s.name, s.nodeGroup, s.availabilityZone, f.valueName) } } } @@ -127,23 +139,15 @@ func (s *nodeState) removeMetrics(metrics PromMetrics) { type nodeResourceState[T any] struct { // Total is the Total amount of T available on the node. This value does not change. Total T `json:"total"` - // System is the amount of T pre-reserved for system functions, and cannot be handed out to pods - // on the node. This amount CAN change on config updates, which may result in more of T than - // we'd like being already provided to the pods. - // - // This is equivalent to the value of this resource's resourceConfig.System, rounded up to the - // nearest size of the units of T. - System T `json:"system"` // Watermark is the amount of T reserved to pods above which we attempt to reduce usage via // migration. Watermark T `json:"watermark"` // Reserved is the current amount of T reserved to pods. It SHOULD be less than or equal to - // (Total - System), and we take active measures reduce it once it is above watermark. + // Total), and we take active measures reduce it once it is above Watermark. // // Reserved MAY be greater than Total on scheduler restart (because of buffering with VM scaling // maximums), but (Reserved - Buffer) MUST be less than Total. In general, (Reserved - Buffer) - // SHOULD be less than or equal to (Total - System), but this can be temporarily violated after - // restart or config change. + // SHOULD be less than or equal to Total, but this can be temporarily violated after restart. // // For more information, refer to the ARCHITECTURE.md file in this directory. 
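// ----- Illustrative sketch (editor's addition, not part of the patch) -----
// The per-node gauges above gain node_group and availability_zone labels, populated from the
// K8sNodeGroupLabel / K8sAvailabilityZoneLabel node labels (empty string when missing). Below is
// a minimal standalone example of what one such series looks like; the metric and label names
// match the diff, while the sample values are invented and their units depend on the converter
// passed to updateMetrics.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	reg := prometheus.NewRegistry()
	nodeCPU := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "autoscaling_plugin_node_cpu_resources_current",
		Help: "Current amount of CPU for 'nodeResourceState' fields",
	}, []string{"node", "node_group", "availability_zone", "field"})
	reg.MustRegister(nodeCPU)

	// One time series per (node, field) pair, now also keyed by group and zone.
	nodeCPU.WithLabelValues("node-a", "workers", "us-east-1a", "Total").Set(8)
	nodeCPU.WithLabelValues("node-a", "workers", "us-east-1a", "Reserved").Set(3.5)

	families, _ := reg.Gather()
	for _, mf := range families {
		fmt.Printf("%s: %d series\n", mf.GetName(), len(mf.GetMetric()))
	}
}
// ----- end sketch -----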
// @@ -183,7 +187,7 @@ type nodeOtherResourceState struct { ReservedMemSlots uint16 `json:"reservedMemSlots"` // MarginCPU and MarginMemory track the amount of other resources we can get "for free" because - // they were left out when rounding the System usage to fit in integer units of CPUs or memory + // they were left out when rounding the Total usage to fit in integer units of CPUs or memory // slots // // These values are both only changed by configuration changes. @@ -377,27 +381,25 @@ func (r *nodeOtherResourceState) calculateReserved(memSlotSize *resource.Quantit } } -// totalReservableCPU returns the amount of node CPU that may be allocated to VM pods -- i.e., -// excluding the CPU pre-reserved for system tasks. +// totalReservableCPU returns the amount of node CPU that may be allocated to VM pods func (s *nodeState) totalReservableCPU() vmapi.MilliCPU { - return s.vCPU.Total - s.vCPU.System + return s.vCPU.Total } -// totalReservableMemSlots returns the number of memory slots that may be allocated to VM pods -- -// i.e., excluding the memory pre-reserved for system tasks. +// totalReservableMemSlots returns the number of memory slots that may be allocated to VM pods func (s *nodeState) totalReservableMemSlots() uint16 { - return s.memSlots.Total - s.memSlots.System + return s.memSlots.Total } // remainingReservableCPU returns the remaining CPU that can be allocated to VM pods func (s *nodeState) remainingReservableCPU() vmapi.MilliCPU { - return s.totalReservableCPU() - s.vCPU.Reserved + return util.SaturatingSub(s.totalReservableCPU(), s.vCPU.Reserved) } // remainingReservableMemSlots returns the remaining number of memory slots that can be allocated to // VM pods func (s *nodeState) remainingReservableMemSlots() uint16 { - return s.totalReservableMemSlots() - s.memSlots.Reserved + return util.SaturatingSub(s.totalReservableMemSlots(), s.memSlots.Reserved) } // tooMuchPressure is used to signal whether the node should start migrating pods out in order to @@ -615,12 +617,32 @@ func buildInitialNodeState(logger *zap.Logger, node *corev1.Node, conf *Config) return nil, fmt.Errorf("Error calculating memory slot limits for node %s: %w", node.Name, err) } + var nodeGroup string + if conf.K8sNodeGroupLabel != "" { + var ok bool + nodeGroup, ok = node.Labels[conf.K8sNodeGroupLabel] + if !ok { + logger.Warn("Node does not have node group label", zap.String("label", conf.K8sNodeGroupLabel)) + } + } + + var availabilityZone string + if conf.K8sAvailabilityZoneLabel != "" { + var ok bool + availabilityZone, ok = node.Labels[conf.K8sAvailabilityZoneLabel] + if !ok { + logger.Warn("Node does not have availability zone label", zap.String("label", conf.K8sAvailabilityZoneLabel)) + } + } + n := &nodeState{ - name: node.Name, - vCPU: vCPU, - memSlots: memSlots, - pods: make(map[util.NamespacedName]*podState), - otherPods: make(map[util.NamespacedName]*otherPodState), + name: node.Name, + nodeGroup: nodeGroup, + availabilityZone: availabilityZone, + vCPU: vCPU, + memSlots: memSlots, + pods: make(map[util.NamespacedName]*podState), + otherPods: make(map[util.NamespacedName]*otherPodState), otherResources: nodeOtherResourceState{ RawCPU: resource.Quantity{}, RawMemory: resource.Quantity{}, @@ -666,28 +688,14 @@ func extractPodOtherPodResourceState(pod *corev1.Pod) (podOtherResourceState, er var cpu resource.Quantity var mem resource.Quantity - for i, container := range pod.Spec.Containers { - // For each resource, use requests if it's provided, or fallback on the limit. 
- - cpuRequest := container.Resources.Requests.Cpu() - cpuLimit := container.Resources.Limits.Cpu() - if cpuRequest.IsZero() && cpuLimit.IsZero() { - err := fmt.Errorf("containers[%d] (%q) missing resources.requests.cpu AND resources.limits.cpu", i, container.Name) - return podOtherResourceState{}, err - } else if cpuRequest.IsZero() /* && !cpuLimit.IsZero() */ { - cpuRequest = cpuLimit - } - cpu.Add(*cpuRequest) - - memRequest := container.Resources.Requests.Memory() - memLimit := container.Resources.Limits.Memory() - if memRequest.IsZero() && memLimit.IsZero() { - err := fmt.Errorf("containers[%d] (%q) missing resources.limits.memory", i, container.Name) - return podOtherResourceState{}, err - } else if memRequest.IsZero() /* && !memLimit.IsZero() */ { - memRequest = memLimit - } - mem.Add(*memRequest) + for _, container := range pod.Spec.Containers { + // For each resource, add the requests, if they're provided. We use this because it matches + // what cluster-autoscaler uses. + // + // NB: .Cpu() returns a pointer to a value equal to zero if the resource is not present. So + // we can just add it either way. + cpu.Add(*container.Resources.Requests.Cpu()) + mem.Add(*container.Resources.Requests.Memory()) } return podOtherResourceState{RawCPU: cpu, RawMemory: mem}, nil @@ -733,6 +741,90 @@ func (e *AutoscaleEnforcer) handleNodeDeletion(logger *zap.Logger, nodeName stri logger.Info("Deleted node") } +func (e *AutoscaleEnforcer) handlePodStarted(logger *zap.Logger, pod *corev1.Pod) { + podName := util.GetNamespacedName(pod) + nodeName := pod.Spec.NodeName + + logger = logger.With( + zap.String("action", "Pod started"), + zap.Object("pod", podName), + zap.String("node", nodeName), + ) + + if pod.Spec.SchedulerName == e.state.conf.SchedulerName { + logger.Info("Got non-VM pod start event for pod assigned to this scheduler; nothing to do") + return + } + + logger.Info("Handling non-VM pod start event") + + podResources, err := extractPodOtherPodResourceState(pod) + if err != nil { + logger.Error("Error extracting resource state for non-VM pod", zap.Error(err)) + return + } + + e.state.lock.Lock() + defer e.state.lock.Unlock() + + if _, ok := e.state.otherPods[podName]; ok { + logger.Info("Pod is already known") // will happen during startup + return + } + + // Pod is not known - let's get information about the node! + node, err := e.state.getOrFetchNodeState(context.TODO(), logger, e.metrics, e.nodeStore, nodeName) + if err != nil { + logger.Error("Failed to state for node", zap.Error(err)) + } + + // TODO: this is pretty similar to the Reserve method. Maybe we should join them into one. 
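// ----- Illustrative sketch (editor's addition, not part of the patch) -----
// The remainingReservable* helpers above switch to util.SaturatingSub because Reserved can
// temporarily exceed Total (e.g. right after a scheduler restart, per the comments in the diff).
// util.SaturatingSub itself isn't shown; presumably it clamps at zero instead of wrapping, along
// the lines of this sketch.
package main

import "fmt"

type unsigned interface {
	~uint | ~uint16 | ~uint32 | ~uint64
}

// saturatingSub returns a - b, or 0 when b > a, avoiding unsigned wrap-around.
func saturatingSub[T unsigned](a, b T) T {
	if b > a {
		return 0
	}
	return a - b
}

func main() {
	var total, reserved uint16 = 16, 20        // reserved > total, as can happen after a restart
	fmt.Println(saturatingSub(total, reserved)) // 0, rather than the wrapped value
	fmt.Println(total - reserved)               // 65532: what plain subtraction would call "remaining"
}
// ----- end sketch -----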
+ oldNodeRes := node.otherResources + newNodeRes := node.otherResources.addPod(&e.state.conf.MemSlotSize, podResources) + + addCPU := newNodeRes.ReservedCPU - oldNodeRes.ReservedCPU + addMem := newNodeRes.ReservedMemSlots - oldNodeRes.ReservedMemSlots + + oldNodeCPUReserved := node.vCPU.Reserved + oldNodeMemReserved := node.memSlots.Reserved + + node.otherResources = newNodeRes + node.vCPU.Reserved += addCPU + node.memSlots.Reserved += addMem + + ps := &otherPodState{ + name: podName, + node: node, + resources: podResources, + } + node.otherPods[podName] = ps + e.state.otherPods[podName] = ps + + cpuVerdict := fmt.Sprintf( + "node reserved %d -> %d / %d, node other resources %d -> %d rounded (%v -> %v raw, %v margin)", + oldNodeCPUReserved, node.vCPU.Reserved, node.vCPU.Total, oldNodeRes.ReservedCPU, newNodeRes.ReservedCPU, &oldNodeRes.RawCPU, &newNodeRes.RawCPU, newNodeRes.MarginCPU, + ) + memVerdict := fmt.Sprintf( + "node reserved %d -> %d / %d, node other resources %d -> %d slots (%v -> %v raw, %v margin)", + oldNodeMemReserved, node.memSlots.Reserved, node.memSlots.Total, oldNodeRes.ReservedMemSlots, newNodeRes.ReservedMemSlots, &oldNodeRes.RawMemory, &newNodeRes.RawMemory, newNodeRes.MarginMemory, + ) + + log := logger.Info + if node.vCPU.Reserved > node.vCPU.Total || node.memSlots.Reserved > node.memSlots.Total { + log = logger.Warn + } + + log( + "Handled new non-VM pod", + zap.Object("verdict", verdictSet{ + cpu: cpuVerdict, + mem: memVerdict, + }), + ) + + node.updateMetrics(e.metrics, e.state.memSlotSizeBytes()) +} + // This method is /basically/ the same as e.Unreserve, but the API is different and it has different // logs, so IMO it's worthwhile to have this separate. func (e *AutoscaleEnforcer) handleVMDeletion(logger *zap.Logger, podName util.NamespacedName) { @@ -986,6 +1078,101 @@ func (e *AutoscaleEnforcer) handleNonAutoscalingUsageChange(logger *zap.Logger, ) } +// NB: expected to be run in its own thread. +func (e *AutoscaleEnforcer) cleanupMigration(logger *zap.Logger, vmm *vmapi.VirtualMachineMigration) { + vmmName := util.GetNamespacedName(vmm) + + logger = logger.With( + // note: use the "virtualmachinemigration" key here for just the name, because it mirrors + // what we log in startMigration. + zap.Object("virtualmachinemigration", vmmName), + // also include the VM, for better association. + zap.Object("virtualmachine", util.NamespacedName{ + Name: vmm.Spec.VmName, + Namespace: vmm.Namespace, + }), + ) + // Failed migrations should be noisy. Everything to do with cleaning up a failed migration + // should be logged at "Warn" or higher. + var logInfo func(string, ...zap.Field) + if vmm.Status.Phase == vmapi.VmmSucceeded { + logInfo = logger.Info + } else { + logInfo = logger.Warn + } + logInfo( + "Going to delete VirtualMachineMigration", + // Explicitly include "phase" here because we have metrics for it. + zap.String("phase", string(vmm.Status.Phase)), + // ... and then log the rest of the information about the migration: + zap.Any("spec", vmm.Spec), + zap.Any("status", vmm.Status), + ) + + // mark the operation as ongoing + func() { + e.state.lock.Lock() + defer e.state.lock.Unlock() + + newCount := e.state.ongoingMigrationDeletions[vmmName] + 1 + if newCount != 1 { + // context included by logger + logger.Error( + "More than one ongoing deletion for VirtualMachineMigration", + zap.Int("count", newCount), + ) + } + e.state.ongoingMigrationDeletions[vmmName] = newCount + }() + // ... 
and remember to clean up when we're done: + defer func() { + e.state.lock.Lock() + defer e.state.lock.Unlock() + + newCount := e.state.ongoingMigrationDeletions[vmmName] - 1 + if newCount == 0 { + delete(e.state.ongoingMigrationDeletions, vmmName) + } else { + // context included by logger + logger.Error( + "More than one ongoing deletion for VirtualMachineMigration", + zap.Int("count", newCount), + ) + e.state.ongoingMigrationDeletions[vmmName] = newCount + } + }() + + // Continually retry the operation, until we're successful (or the VM doesn't exist anymore) + + retryWait := time.Second * time.Duration(e.state.conf.MigrationDeletionRetrySeconds) + + for { + logInfo("Attempting to delete VirtualMachineMigration") + err := e.vmClient.NeonvmV1(). + VirtualMachineMigrations(vmmName.Namespace). + Delete(context.TODO(), vmmName.Name, metav1.DeleteOptions{}) + if err == nil /* NB! This condition is inverted! */ { + logInfo("Successfully deleted VirtualMachineMigration") + e.metrics.migrationDeletions.WithLabelValues(string(vmm.Status.Phase)).Inc() + return + } else if apierrors.IsNotFound(err) { + logger.Warn("Deletion was handled for us; VirtualMachineMigration no longer exists") + return + } + + logger.Error( + "Failed to delete VirtualMachineMigration, will try again after delay", + zap.Duration("delay", retryWait), + zap.Error(err), + ) + e.metrics.migrationDeleteFails.WithLabelValues(string(vmm.Status.Phase)).Inc() + + // retry after a delay + time.Sleep(retryWait) + continue + } +} + func (s *podState) isBetterMigrationTarget(other *podState) bool { // TODO: this deprioritizes VMs whose metrics we can't collect. Maybe we don't want that? if s.metrics == nil || other.metrics == nil { @@ -1000,14 +1187,14 @@ func (s *podState) isBetterMigrationTarget(other *podState) bool { // send requests to the API server // // A lock will ALWAYS be held on return from this function. -func (s *pluginState) startMigration(ctx context.Context, logger *zap.Logger, pod *podState, vmClient *vmclient.Clientset) error { +func (e *AutoscaleEnforcer) startMigration(ctx context.Context, logger *zap.Logger, pod *podState) (created bool, _ error) { if pod.currentlyMigrating() { - return fmt.Errorf("Pod is already migrating") + return false, fmt.Errorf("Pod is already migrating") } // Unlock to make the API request(s), then make sure we're locked on return. - s.lock.Unlock() - defer s.lock.Lock() + e.state.lock.Unlock() + defer e.state.lock.Lock() vmmName := util.NamespacedName{ Name: fmt.Sprintf("schedplugin-%s", pod.vmName.Name), @@ -1024,17 +1211,23 @@ func (s *pluginState) startMigration(ctx context.Context, logger *zap.Logger, po // We technically don't *need* this additional request here (because we can check the return // from the Create request with apierrors.IsAlreadyExists). However: the benefit we get from // this is that the logs are significantly clearer. - _, err := vmClient.NeonvmV1(). + _, err := e.vmClient.NeonvmV1(). VirtualMachineMigrations(pod.name.Namespace). Get(ctx, vmmName.Name, metav1.GetOptions{}) if err == nil { logger.Warn("VirtualMachineMigration already exists, nothing to do") - return nil + return false, nil } else if !apierrors.IsNotFound(err) { // We're *expecting* to get IsNotFound = true; if err != nil and isn't NotFound, then // there's some unexpected error. 
logger.Error("Unexpected error doing Get request to check if migration already exists", zap.Error(err)) - return fmt.Errorf("Error checking if migration exists: %w", err) + return false, fmt.Errorf("Error checking if migration exists: %w", err) + } + + gitVersion := util.GetBuildInfo().GitInfo + // FIXME: make this not depend on GetBuildInfo() internals. + if gitVersion == "" { + gitVersion = "unknown" } vmm := &vmapi.VirtualMachineMigration{ @@ -1043,6 +1236,14 @@ func (s *pluginState) startMigration(ctx context.Context, logger *zap.Logger, po // should do if that happens. Name: vmmName.Name, Namespace: pod.name.Namespace, + Labels: map[string]string{ + // NB: There's requirements on what constitutes a valid label. Thankfully, the + // output of `git describe` always will. + // + // See also: + // https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set + LabelPluginCreatedMigration: gitVersion, + }, }, Spec: vmapi.VirtualMachineMigrationSpec{ VmName: pod.vmName.Name, @@ -1059,15 +1260,17 @@ func (s *pluginState) startMigration(ctx context.Context, logger *zap.Logger, po } logger.Info("Migration doesn't already exist, creating one for VM", zap.Any("spec", vmm.Spec)) - _, err = vmClient.NeonvmV1().VirtualMachineMigrations(pod.name.Namespace).Create(ctx, vmm, metav1.CreateOptions{}) + _, err = e.vmClient.NeonvmV1().VirtualMachineMigrations(pod.name.Namespace).Create(ctx, vmm, metav1.CreateOptions{}) if err != nil { + e.metrics.migrationCreateFails.Inc() // log here, while the logger's fields are in scope logger.Error("Unexpected error doing Create request for new migration", zap.Error(err)) - return fmt.Errorf("Error creating migration: %w", err) + return false, fmt.Errorf("Error creating migration: %w", err) } + e.metrics.migrationCreations.Inc() logger.Info("VM migration request successful") - return nil + return true, nil } // readClusterState sets the initial node and pod maps for the plugin's state, getting its @@ -1154,8 +1357,8 @@ func (p *AutoscaleEnforcer) readClusterState(ctx context.Context, logger *zap.Lo skippedVms += 1 } - if pod.Spec.SchedulerName != p.state.conf.SchedulerName { - logSkip("Spec.SchedulerName %q != our config.SchedulerName %q", pod.Spec.SchedulerName, p.state.conf.SchedulerName) + if p.state.conf.ignoredNamespace(pod.Namespace) { + logSkip("VM is in ignored namespace") continue } else if pod.Spec.NodeName == "" { logSkip("VM pod's Spec.NodeName = \"\" (maybe it hasn't been scheduled yet?)") @@ -1236,7 +1439,9 @@ func (p *AutoscaleEnforcer) readClusterState(ctx context.Context, logger *zap.Lo oldNodeMemBuffer := ns.memSlots.Buffer ns.vCPU.Reserved += ps.vCPU.Reserved + ns.vCPU.Buffer += ps.vCPU.Buffer ns.memSlots.Reserved += ps.memSlots.Reserved + ns.memSlots.Buffer += ps.memSlots.Buffer cpuVerdict := fmt.Sprintf( "pod = %v/%v (node %v -> %v / %v, %v -> %v buffer)", @@ -1255,6 +1460,8 @@ func (p *AutoscaleEnforcer) readClusterState(ctx context.Context, logger *zap.Lo }), ) + ns.updateMetrics(p.metrics, p.state.memSlotSizeBytes()) + ns.pods[podName] = ps p.state.podMap[podName] = ps } @@ -1288,8 +1495,8 @@ func (p *AutoscaleEnforcer) readClusterState(ctx context.Context, logger *zap.Lo if _, ok := p.state.podMap[podName]; ok { continue - } else if pod.Spec.SchedulerName != p.state.conf.SchedulerName { - logSkip("Spec.SchedulerName %q != our config.SchedulerName %q", pod.Spec.SchedulerName, p.state.conf.SchedulerName) + } else if p.state.conf.ignoredNamespace(pod.Namespace) { + logSkip("non-VM pod is in ignored 
namespace") continue } @@ -1347,6 +1554,8 @@ func (p *AutoscaleEnforcer) readClusterState(ctx context.Context, logger *zap.Lo }), ) + ns.updateMetrics(p.metrics, p.state.memSlotSizeBytes()) + ns.otherPods[podName] = ps p.state.otherPods[podName] = ps } diff --git a/pkg/plugin/trans.go b/pkg/plugin/trans.go index 9734f40ba..6cc6482a2 100644 --- a/pkg/plugin/trans.go +++ b/pkg/plugin/trans.go @@ -85,7 +85,7 @@ func (s verdictSet) MarshalLogObject(enc zapcore.ObjectEncoder) error { // // A pretty-formatted summary of the outcome is returned as the verdict, for logging. func (r resourceTransition[T]) handleRequested(requested T, startingMigration bool, onlyThousands bool) (verdict string) { - totalReservable := r.node.Total - r.node.System + totalReservable := r.node.Total // note: it's possible to temporarily have reserved > totalReservable, after loading state or // config change; we have to use SaturatingSub here to account for that. remainingReservable := util.SaturatingSub(totalReservable, r.oldNode.reserved) @@ -184,15 +184,20 @@ func (r resourceTransition[T]) handleRequested(requested T, startingMigration bo } fmtString := "Register %d%s -> %d%s (pressure %d -> %d); " + - "node reserved %d -> %d (of %d), " + + "node reserved %d%s -> %d%s (of %d), " + "node capacityPressure %d -> %d (%d -> %d spoken for)" - var buffer string + var podBuffer string + var oldNodeBuffer string + var newNodeBuffer string if r.pod.Buffer != 0 { - buffer = fmt.Sprintf(" (buffer %d)", r.pod.Buffer) + podBuffer = fmt.Sprintf(" [buffer %d]", r.pod.Buffer) + oldNodeBuffer = fmt.Sprintf(" [buffer %d]", r.oldNode.buffer) r.node.Buffer -= r.pod.Buffer r.pod.Buffer = 0 + + newNodeBuffer = fmt.Sprintf(" [buffer %d]", r.node.Buffer) } var wanted string @@ -203,9 +208,9 @@ func (r resourceTransition[T]) handleRequested(requested T, startingMigration bo verdict = fmt.Sprintf( fmtString, // Register %d%s -> %d%s (pressure %d -> %d) - r.oldPod.reserved, buffer, r.pod.Reserved, wanted, r.oldPod.capacityPressure, r.pod.CapacityPressure, - // node reserved %d -> %d (of %d) - r.oldNode.reserved, r.node.Reserved, totalReservable, + r.oldPod.reserved, podBuffer, r.pod.Reserved, wanted, r.oldPod.capacityPressure, r.pod.CapacityPressure, + // node reserved %d%s -> %d%s (of %d) + r.oldNode.reserved, oldNodeBuffer, r.node.Reserved, newNodeBuffer, totalReservable, // node capacityPressure %d -> %d (%d -> %d spoken for) r.oldNode.capacityPressure, r.node.CapacityPressure, r.oldNode.pressureAccountedFor, r.node.PressureAccountedFor, ) @@ -223,12 +228,23 @@ func (r resourceTransition[T]) handleDeleted(currentlyMigrating bool) (verdict s r.node.PressureAccountedFor -= r.pod.Reserved + r.pod.CapacityPressure } - fmtString := "pod had %d; node reserved %d -> %d, " + + var podBuffer string + var oldNodeBuffer string + var newNodeBuffer string + if r.pod.Buffer != 0 { + r.node.Buffer -= r.pod.Buffer + + podBuffer = fmt.Sprintf(" [buffer %d]", r.pod.Buffer) + oldNodeBuffer = fmt.Sprintf(" [buffer %d]", r.oldNode.buffer) + newNodeBuffer = fmt.Sprintf(" [buffer %d]", r.node.Buffer) + } + + fmtString := "pod had %d%s; node reserved %d%s -> %d%s, " + "node capacityPressure %d -> %d (%d -> %d spoken for)" verdict = fmt.Sprintf( fmtString, - // pod had %d; node reserved %d -> %d - r.pod.Reserved, r.oldNode.reserved, r.node.Reserved, + // pod had %d%s; node reserved %d%s -> %d%s + r.pod.Reserved, podBuffer, r.oldNode.reserved, oldNodeBuffer, r.node.Reserved, newNodeBuffer, // node capacityPressure %d -> %d (%d -> %d spoken for) 
r.oldNode.capacityPressure, r.node.CapacityPressure, r.oldNode.pressureAccountedFor, r.node.PressureAccountedFor, ) @@ -261,14 +277,19 @@ func (r resourceTransition[T]) handleAutoscalingDisabled() (verdict string) { r.node.CapacityPressure -= r.pod.CapacityPressure r.pod.CapacityPressure = 0 + var nodeBufferChange string + if r.oldPod.buffer != 0 { + nodeBufferChange = fmt.Sprintf(" [buffer %d -> %d]", r.oldNode.buffer, r.node.Buffer) + } + fmtString := "pod had buffer %d, capacityPressure %d; " + - "node reserved %d -> %d, capacityPressure %d -> %d" + "node reserved %d -> %d%s, capacityPressure %d -> %d" verdict = fmt.Sprintf( fmtString, // pod had buffer %d, capacityPressure %d; r.oldPod.buffer, r.oldPod.capacityPressure, - // node reserved %d -> %d, capacityPressure %d -> %d - r.oldNode.reserved, r.node.Reserved, r.oldNode.capacityPressure, r.node.CapacityPressure, + // node reserved %d -> %d%s, capacityPressure %d -> %d + r.oldNode.reserved, r.node.Reserved, nodeBufferChange, r.oldNode.capacityPressure, r.node.CapacityPressure, ) return verdict } diff --git a/pkg/plugin/watch.go b/pkg/plugin/watch.go index de1b649ad..e28055d29 100644 --- a/pkg/plugin/watch.go +++ b/pkg/plugin/watch.go @@ -63,6 +63,7 @@ func (e *AutoscaleEnforcer) watchNodeEvents( } type podWatchCallbacks struct { + submitPodStarted func(*zap.Logger, *corev1.Pod) submitVMDeletion func(*zap.Logger, util.NamespacedName) submitPodDeletion func(*zap.Logger, util.NamespacedName) submitPodStartMigration func(_ *zap.Logger, podName, migrationName util.NamespacedName, source bool) @@ -81,10 +82,10 @@ func (e *AutoscaleEnforcer) watchPodEvents( parentLogger *zap.Logger, metrics watch.Metrics, callbacks podWatchCallbacks, -) error { +) (*watch.Store[corev1.Pod], error) { logger := parentLogger.Named("pod-watch") - _, err := watch.Watch( + return watch.Watch( ctx, logger.Named("watch"), e.handle.ClientSet().CoreV1().Pods(corev1.NamespaceAll), @@ -106,14 +107,49 @@ func (e *AutoscaleEnforcer) watchPodEvents( watch.InitModeSync, // note: doesn't matter, because AddFunc = nil. metav1.ListOptions{}, watch.HandlerFuncs[*corev1.Pod]{ + AddFunc: func(pod *corev1.Pod, preexisting bool) { + name := util.GetNamespacedName(pod) + + if e.state.conf.ignoredNamespace(pod.Namespace) { + logger.Info("Received add event for ignored pod", zap.Object("pod", name)) + return + } + + _, isVM := pod.Labels[LabelVM] + + // Generate events for all non-VM pods that are running + if !isVM && pod.Status.Phase == corev1.PodRunning { + if !preexisting { + // Generally pods shouldn't be immediately running, so we log this as a + // warning. If it was preexisting, then it'll be handled on the initial + // cluster read already (but we generate the events anyways so that we + // definitely don't miss anything). + logger.Warn("Received add event for new non-VM pod already running", zap.Object("pod", name)) + } + callbacks.submitPodStarted(logger, pod) + } + }, UpdateFunc: func(oldPod *corev1.Pod, newPod *corev1.Pod) { name := util.GetNamespacedName(newPod) + if e.state.conf.ignoredNamespace(newPod.Namespace) { + logger.Info("Received update event for ignored pod", zap.Object("pod", name)) + return + } + + _, isVM := newPod.Labels[LabelVM] + + // Check if a non-VM pod is now running. 
+ if !isVM && oldPod.Status.Phase == corev1.PodPending && newPod.Status.Phase == corev1.PodRunning { + logger.Info("Received update event for non-VM pod now running", zap.Object("pod", name)) + callbacks.submitPodStarted(logger, newPod) + } + // Check if pod is "completed" - handle that the same as deletion. if !util.PodCompleted(oldPod) && util.PodCompleted(newPod) { logger.Info("Received update event for completion of pod", zap.Object("pod", name)) - if _, ok := newPod.Labels[LabelVM]; ok { + if isVM { callbacks.submitVMDeletion(logger, name) } else { callbacks.submitPodDeletion(logger, name) @@ -136,6 +172,11 @@ func (e *AutoscaleEnforcer) watchPodEvents( DeleteFunc: func(pod *corev1.Pod, mayBeStale bool) { name := util.GetNamespacedName(pod) + if e.state.conf.ignoredNamespace(pod.Namespace) { + logger.Info("Received delete event for ignored pod", zap.Object("pod", name)) + return + } + if util.PodCompleted(pod) { logger.Info("Received delete event for completed pod", zap.Object("pod", name)) } else { @@ -149,7 +190,6 @@ func (e *AutoscaleEnforcer) watchPodEvents( }, }, ) - return err } // tryMigrationOwnerReference returns the name of the owning migration, if this pod *is* owned by a @@ -228,6 +268,7 @@ func (e *AutoscaleEnforcer) watchVMEvents( parentLogger *zap.Logger, metrics watch.Metrics, callbacks vmWatchCallbacks, + podIndex watch.IndexedStore[corev1.Pod, *watch.NameIndex[corev1.Pod]], ) (*watch.Store[vmapi.VirtualMachine], error) { logger := parentLogger.Named("vm-watch") @@ -252,14 +293,39 @@ func (e *AutoscaleEnforcer) watchVMEvents( metav1.ListOptions{}, watch.HandlerFuncs[*vmapi.VirtualMachine]{ UpdateFunc: func(oldVM, newVM *vmapi.VirtualMachine) { - oldInfo, err := api.ExtractVmInfo(logger, oldVM) - if err != nil { - logger.Error("Failed to extract VM info in update for old VM", util.VMNameFields(oldVM), zap.Error(err)) + if e.state.conf.ignoredNamespace(newVM.Namespace) { + logger.Info("Received update event for ignored VM", util.VMNameFields(newVM)) return } + newInfo, err := api.ExtractVmInfo(logger, newVM) if err != nil { + // Try to get the runner pod associated with the VM, if we can, but don't worry + // about it if we can't. + var runnerPod *corev1.Pod + if podName := newVM.Status.PodName; podName != "" { + // NB: index.Get returns nil if not found, so we only have a non-nil + // runnerPod if it's currently known. + runnerPod, _ = podIndex.GetIndexed(func(index *watch.NameIndex[corev1.Pod]) (*corev1.Pod, bool) { + return index.Get(newVM.Namespace, podName) + }) + } + logger.Error("Failed to extract VM info in update for new VM", util.VMNameFields(newVM), zap.Error(err)) + e.handle.EventRecorder().Eventf( + newVM, // regarding + runnerPod, // related + "Warning", // eventtype + "ExtractVmInfo", // reason + "HandleVmUpdate", // action + "Failed to extract autoscaling info about VM: %s", // note + err, + ) + return + } + oldInfo, err := api.ExtractVmInfo(logger, oldVM) + if err != nil { + logger.Error("Failed to extract VM info in update for old VM", util.VMNameFields(oldVM), zap.Error(err)) return } @@ -298,3 +364,71 @@ func (e *AutoscaleEnforcer) watchVMEvents( }, ) } + +type migrationWatchCallbacks struct { + submitMigrationFinished func(*vmapi.VirtualMachineMigration) +} + +// watchMigrationEvents *only* looks at migrations that were created by the scheduler plugin (or a +// previous version of it). 
+// +// We use this to trigger cleaning up migrations once they're finished, because they don't +// auto-delete, and our deterministic naming means that each we won't be able to create a new +// migration for the same VM until the old one's gone. +// +// Tracking whether a migration was created by the scheduler plugin is done by adding the label +// 'autoscaling.neon.tech/created-by-scheduler' to every migration we create. +func (e *AutoscaleEnforcer) watchMigrationEvents( + ctx context.Context, + parentLogger *zap.Logger, + metrics watch.Metrics, + callbacks migrationWatchCallbacks, +) (*watch.Store[vmapi.VirtualMachineMigration], error) { + logger := parentLogger.Named("vmm-watch") + + return watch.Watch( + ctx, + logger.Named("watch"), + e.vmClient.NeonvmV1().VirtualMachineMigrations(corev1.NamespaceAll), + watch.Config{ + ObjectNameLogField: "virtualmachinemigration", + Metrics: watch.MetricsConfig{ + Metrics: metrics, + Instance: "VirtualMachineMigrations", + }, + // FIXME: make these durations configurable. + RetryRelistAfter: util.NewTimeRange(time.Second, 3, 5), + RetryWatchAfter: util.NewTimeRange(time.Second, 3, 5), + }, + watch.Accessors[*vmapi.VirtualMachineMigrationList, vmapi.VirtualMachineMigration]{ + Items: func(list *vmapi.VirtualMachineMigrationList) []vmapi.VirtualMachineMigration { return list.Items }, + }, + watch.InitModeSync, + metav1.ListOptions{ + // NB: Including just the label itself means that we select for objects that *have* the + // label, without caring about the actual value. + // + // See also: + // https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#set-based-requirement + LabelSelector: LabelPluginCreatedMigration, + }, + watch.HandlerFuncs[*vmapi.VirtualMachineMigration]{ + UpdateFunc: func(oldObj, newObj *vmapi.VirtualMachineMigration) { + if e.state.conf.ignoredNamespace(newObj.Namespace) { + logger.Info( + "Received update event for ignored VM Migration", + zap.Object("virtualmachinemigration", util.GetNamespacedName(newObj)), + ) + return + } + + shouldDelete := newObj.Status.Phase != oldObj.Status.Phase && + (newObj.Status.Phase == vmapi.VmmSucceeded || newObj.Status.Phase == vmapi.VmmFailed) + + if shouldDelete { + callbacks.submitMigrationFinished(newObj) + } + }, + }, + ) +} diff --git a/pkg/util/buildinfo.go b/pkg/util/buildinfo.go index c69e9b02d..207813e08 100644 --- a/pkg/util/buildinfo.go +++ b/pkg/util/buildinfo.go @@ -33,6 +33,8 @@ func GetBuildInfo() BuildInfo { } } + // FIXME: the "" string is depended upon by the plugin's VirtualMachineMigration + // creation process. We should expose something better here. 
gitInfo := BuildGitInfo if BuildGitInfo == "" { gitInfo = "" diff --git a/pkg/util/signal.go b/pkg/util/signal.go index f83707d22..45b9b691f 100644 --- a/pkg/util/signal.go +++ b/pkg/util/signal.go @@ -7,32 +7,39 @@ import ( "sync" ) -func NewSingleSignalPair() (SignalSender, SignalReceiver) { - sigCh := make(chan struct{}) +func NewSingleSignalPair[T any]() (SignalSender[T], SignalReceiver[T]) { + sigCh := make(chan T, 1) once := &sync.Once{} closeSigCh := func() { once.Do(func() { close(sigCh) }) } - return SignalSender{send: closeSigCh}, SignalReceiver{sigCh: sigCh, closeSigCh: closeSigCh} + return SignalSender[T]{ + send: func(data T) { + once.Do(func() { + sigCh <- data + close(sigCh) + }) + }, + }, SignalReceiver[T]{sigCh: sigCh, closeSigCh: closeSigCh} } -type SignalSender struct { - send func() +type SignalSender[T any] struct { + send func(T) } -type SignalReceiver struct { - sigCh chan struct{} +type SignalReceiver[T any] struct { + sigCh chan T closeSigCh func() } -func (s SignalSender) Send() { - s.send() +func (s SignalSender[T]) Send(data T) { + s.send(data) } -func (s SignalReceiver) Recv() chan struct{} { +func (s SignalReceiver[T]) Recv() <-chan T { return s.sigCh } -func (s SignalReceiver) Close() { +func (s SignalReceiver[T]) Close() { s.closeSigCh() } diff --git a/pkg/util/watch/watch.go b/pkg/util/watch/watch.go index 859c38b96..d0c9ee080 100644 --- a/pkg/util/watch/watch.go +++ b/pkg/util/watch/watch.go @@ -146,7 +146,7 @@ func Watch[C Client[L], L metav1.ListMetaAccessor, T any, P Object[T]]( // the initial list opts.ResourceVersion = initialList.GetListMeta().GetResourceVersion() - sendStop, stopSignal := util.NewSingleSignalPair() + sendStop, stopSignal := util.NewSingleSignalPair[struct{}]() store := Store[T]{ mutex: sync.Mutex{}, @@ -356,6 +356,7 @@ func Watch[C Client[L], L metav1.ListMetaAccessor, T any, P Object[T]]( retryAfter := config.RetryRelistAfter.Random() logger.Info("Retrying relist after delay", zap.Duration("delay", retryAfter)) + store.failing.Store(true) config.Metrics.failing() select { @@ -371,6 +372,7 @@ func Watch[C Client[L], L metav1.ListMetaAccessor, T any, P Object[T]]( } } + store.failing.Store(false) config.Metrics.unfailing() // err == nil, process relistList @@ -452,6 +454,7 @@ func Watch[C Client[L], L metav1.ListMetaAccessor, T any, P Object[T]]( retryAfter := config.RetryWatchAfter.Random() logger.Info("Retrying re-watch after delay", zap.Duration("delay", retryAfter)) + store.failing.Store(true) config.Metrics.failing() select { @@ -468,6 +471,7 @@ func Watch[C Client[L], L metav1.ListMetaAccessor, T any, P Object[T]]( } // err == nil + store.failing.Store(false) config.Metrics.unfailing() break newWatcher } @@ -554,8 +558,9 @@ type Store[T any] struct { nextIndexID uint64 indexes map[uint64]Index[T] - stopSignal util.SignalSender + stopSignal util.SignalSender[struct{}] stopped atomic.Bool + failing atomic.Bool } // Relist triggers re-listing the WatchStore, returning a channel that will be closed once the @@ -573,10 +578,14 @@ func (w *Store[T]) Relist() <-chan struct{} { } func (w *Store[T]) Stop() { - w.stopSignal.Send() + w.stopSignal.Send(struct{}{}) w.stopped.Store(true) } +func (w *Store[T]) Failing() bool { + return w.failing.Load() +} + func (w *Store[T]) Stopped() bool { return w.stopped.Load() } diff --git a/tests/e2e/autoscaling/00-create-vm.yaml b/tests/e2e/autoscaling/00-create-vm.yaml index b9152e95a..006406785 100644 --- a/tests/e2e/autoscaling/00-create-vm.yaml +++ b/tests/e2e/autoscaling/00-create-vm.yaml @@ 
-51,7 +51,7 @@ spec: port: 5432 - name: host-metrics port: 9100 - - name: informant + - name: monitor port: 10301 extraNetwork: enable: true diff --git a/vm-deploy.yaml b/vm-deploy.yaml index d7a69403c..93fa211cd 100644 --- a/vm-deploy.yaml +++ b/vm-deploy.yaml @@ -23,4 +23,4 @@ spec: - port: 22 # ssh - port: 5432 # postgres - port: 9100 # metrics - - port: 10301 # informant + - port: 10301 # monitor diff --git a/vm-examples/pg14-disk-test/Dockerfile.vmdata b/vm-examples/pg14-disk-test/Dockerfile.vmdata index ff4d04b79..f15bd1ada 100644 --- a/vm-examples/pg14-disk-test/Dockerfile.vmdata +++ b/vm-examples/pg14-disk-test/Dockerfile.vmdata @@ -1,5 +1,5 @@ -FROM vm-informant:dev as informant -# ^ don't do anything with this; we just want it around for later use. +FROM vm-monitor:dev as monitor +# ^ don't do anything with these; we just want it around for later use. # Build the allocation tester: FROM alpine:3.16 AS allocate-loop-builder @@ -33,12 +33,10 @@ RUN echo '::sysinit:/usr/sbin/cgconfigparser -l /etc/cgconfig.conf -s 1664' >> / # Add the allocate-loop tester COPY --from=allocate-loop-builder /bin/allocate-loop /bin/allocate-loop -# Add the vm-informant -COPY --from=informant /usr/bin/vm-informant /bin/vm-informant -RUN adduser vm-informant --disabled-password --no-create-home -# note: Use 'respawn' and '--auto-restart' so that the logs are noisy if the arguments are bad, -# but we still have proper handling around cgroups, etc. -RUN echo "::respawn:su vm-informant -c '/bin/vm-informant --auto-restart --cgroup=neon-test'" >> /etc/inittab +# Add the vm-monitor +COPY --from=monitor /usr/bin/vm-monitor /bin/vm-monitor +RUN adduser vm-monitor --disabled-password --no-create-home +RUN echo "::respawn:su vm-monitor -c 'RUST_LOG=info /bin/vm-monitor --cgroup=neon-test --addr=\"0.0.0.0:10301\"'" >> /etc/inittab # Install vector.dev binary RUN set -e \ diff --git a/vm-examples/pg14-disk-test/cgconfig.conf b/vm-examples/pg14-disk-test/cgconfig.conf index 290630bca..24514a3ac 100644 --- a/vm-examples/pg14-disk-test/cgconfig.conf +++ b/vm-examples/pg14-disk-test/cgconfig.conf @@ -4,7 +4,7 @@ group neon-test { perm { admin { - uid = vm-informant; + uid = vm-monitor; } task { gid = users; diff --git a/vm-examples/postgres-minimal/Dockerfile b/vm-examples/postgres-minimal/Dockerfile index 84d406471..0ce897bfa 100644 --- a/vm-examples/postgres-minimal/Dockerfile +++ b/vm-examples/postgres-minimal/Dockerfile @@ -1,5 +1,5 @@ -FROM vm-informant:dev as informant -# ^ don't do anything with this; we just want it around for later use. +FROM vm-monitor:dev as monitor +# ^ don't do anything with these; we just want it around for later use. FROM postgres:15-bullseye @@ -18,9 +18,7 @@ RUN set -e \ ADD cgconfig.conf /etc/cgconfig.conf RUN echo '::sysinit:/usr/sbin/cgconfigparser -l /etc/cgconfig.conf -s 1664' >> /etc/inittab -# Add the vm-informant -COPY --from=informant /usr/bin/vm-informant /bin/vm-informant -RUN adduser vm-informant --disabled-password --no-create-home -# note: Use 'respawn' and '--auto-restart' so that the logs are noisy if the arguments are bad, -# but we still have proper handling around cgroups, etc. 
-RUN echo "::respawn:su vm-informant -c '/bin/vm-informant --auto-restart --cgroup=neon-test'" >> /etc/inittab +# Add the vm-monitor +COPY --from=monitor /usr/bin/vm-monitor /bin/vm-monitor +RUN adduser vm-monitor --disabled-password --no-create-home +RUN echo "::respawn:su vm-monitor -c 'RUST_LOG=info /bin/vm-monitor --cgroup=neon-test'" >> /etc/inittab diff --git a/vm-examples/postgres-minimal/cgconfig.conf b/vm-examples/postgres-minimal/cgconfig.conf index 290630bca..24514a3ac 100644 --- a/vm-examples/postgres-minimal/cgconfig.conf +++ b/vm-examples/postgres-minimal/cgconfig.conf @@ -4,7 +4,7 @@ group neon-test { perm { admin { - uid = vm-informant; + uid = vm-monitor; } task { gid = users;
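// ----- Illustrative sketch (editor's addition, not part of the patch) -----
// Usage of the now-generic single-use signal pair from the pkg/util/signal.go hunk above. The
// type definitions are reproduced from the diff so the example compiles on its own; the string
// payload is made up for illustration -- in this patch, the watch package instantiates the pair
// with struct{} to keep the old "signal only" behaviour.
package main

import (
	"fmt"
	"sync"
)

type SignalSender[T any] struct{ send func(T) }

type SignalReceiver[T any] struct {
	sigCh      chan T
	closeSigCh func()
}

func NewSingleSignalPair[T any]() (SignalSender[T], SignalReceiver[T]) {
	sigCh := make(chan T, 1)
	once := &sync.Once{}
	closeSigCh := func() { once.Do(func() { close(sigCh) }) }
	return SignalSender[T]{
		send: func(data T) {
			once.Do(func() {
				sigCh <- data
				close(sigCh)
			})
		},
	}, SignalReceiver[T]{sigCh: sigCh, closeSigCh: closeSigCh}
}

func (s SignalSender[T]) Send(data T)      { s.send(data) }
func (s SignalReceiver[T]) Recv() <-chan T { return s.sigCh }
func (s SignalReceiver[T]) Close()         { s.closeSigCh() }

func main() {
	sender, receiver := NewSingleSignalPair[string]()
	sender.Send("stop requested")
	sender.Send("ignored")         // only the first Send is delivered; later Sends are no-ops
	fmt.Println(<-receiver.Recv()) // "stop requested"
	receiver.Close()               // safe after Send: the shared sync.Once makes this a no-op
}
// ----- end sketch -----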