From bd12a8bed3e466480315bf4b9459173cb7d8e34f Mon Sep 17 00:00:00 2001 From: "Dmitry K." Date: Tue, 24 Sep 2024 13:11:55 -0700 Subject: [PATCH 01/27] --wip-- [skip ci] --- go.mod | 6 ++---- .../scripts/upgrades/authz_cancel_upgrade_tx.json | 10 ++++++++++ tools/scripts/upgrades/upgrade_tx.json | 15 +++++++++++++++ x/tokenomics/types/tx.pb.go | 1 - 4 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 tools/scripts/upgrades/authz_cancel_upgrade_tx.json create mode 100644 tools/scripts/upgrades/upgrade_tx.json diff --git a/go.mod b/go.mod index 91de15f1d..9f47b177c 100644 --- a/go.mod +++ b/go.mod @@ -79,10 +79,7 @@ require ( gopkg.in/yaml.v2 v2.4.0 ) -require ( - cosmossdk.io/x/tx v0.13.4 - github.com/jhump/protoreflect v1.16.0 -) +require github.com/jhump/protoreflect v1.16.0 require ( buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.34.2-20240508200655-46a4cf4ba109.2 // indirect @@ -95,6 +92,7 @@ require ( connectrpc.com/connect v1.16.2 // indirect connectrpc.com/otelconnect v0.7.0 // indirect cosmossdk.io/collections v0.4.0 // indirect + cosmossdk.io/x/tx v0.13.4 // indirect filippo.io/edwards25519 v1.0.0 // indirect github.com/99designs/go-keychain v0.0.0-20191008050251-8e49817e8af4 // indirect github.com/99designs/keyring v1.2.1 // indirect diff --git a/tools/scripts/upgrades/authz_cancel_upgrade_tx.json b/tools/scripts/upgrades/authz_cancel_upgrade_tx.json new file mode 100644 index 000000000..014eaac60 --- /dev/null +++ b/tools/scripts/upgrades/authz_cancel_upgrade_tx.json @@ -0,0 +1,10 @@ +{ + "body": { + "messages": [ + { + "@type": "/cosmos.upgrade.v1beta1.MsgCancelUpgrade", + "authority": "pokt10d07y265gmmuvt4z0w9aw880jnsr700j8yv32t" + } + ] + } +} \ No newline at end of file diff --git a/tools/scripts/upgrades/upgrade_tx.json b/tools/scripts/upgrades/upgrade_tx.json new file mode 100644 index 000000000..c945229d9 --- /dev/null +++ b/tools/scripts/upgrades/upgrade_tx.json @@ -0,0 +1,15 @@ +{ + "body": { + "messages": [ + { + 
"@type": "/cosmos.upgrade.v1beta1.MsgSoftwareUpgrade", + "authority": "pokt10d07y265gmmuvt4z0w9aw880jnsr700j8yv32t", + "plan": { + "name": "v0.0.9", + "height": "15510", + "info": "{\"binaries\":{\"linux/amd64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_linux_amd64.tar.gz?checksum=sha256:ab5b99ca0bc4bfbdd7031378d5a01c2a9f040ff310b745866a4dee7e62321c94\",\"linux/arm64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_linux_arm64.tar.gz?checksum=sha256:4b68c2ad326da055d43af1ad1a580158cec0f229d2ec6d9e18280d065260b622\",\"darwin/amd64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_darwin_amd64.tar.gz?checksum=sha256:c81aabddeb190044b979412e5a518bbf5c88305272f72a47e32e13aa765c3330\",\"darwin/arm64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_darwin_arm64.tar.gz?checksum=sha256:e683c55ac13902d107d7a726ed4a5c5affb2af1be3c67dd131ec2072a2cfbcb2\"}}" + } + } + ] + } +} \ No newline at end of file diff --git a/x/tokenomics/types/tx.pb.go b/x/tokenomics/types/tx.pb.go index e4fec264c..9f18a148c 100644 --- a/x/tokenomics/types/tx.pb.go +++ b/x/tokenomics/types/tx.pb.go @@ -125,7 +125,6 @@ type MsgUpdateParam struct { // specified in the `Params` message in `proof/params.proto.` Name string `protobuf:"bytes,2,opt,name=name,proto3" json:"name,omitempty"` // Types that are valid to be assigned to AsType: - // // *MsgUpdateParam_AsString // *MsgUpdateParam_AsInt64 // *MsgUpdateParam_AsBytes From 4ea092e675fddc3577107a998580bda95ac47e03 Mon Sep 17 00:00:00 2001 From: "Dmitry K." 
Date: Tue, 24 Sep 2024 16:45:02 -0700 Subject: [PATCH 02/27] document learnings and more checks --- app/upgrades.go | 1 + app/upgrades/historical.go | 7 +++ .../docs/protocol/upgrades/release_process.md | 7 +++ .../protocol/upgrades/upgrade_procedure.md | 44 ++++++++++++++++++- ...upgrade_tx.json => upgrade_tx_v0.0.9.json} | 0 5 files changed, 57 insertions(+), 2 deletions(-) rename tools/scripts/upgrades/{upgrade_tx.json => upgrade_tx_v0.0.9.json} (100%) diff --git a/app/upgrades.go b/app/upgrades.go index a2af2973e..5043bd078 100644 --- a/app/upgrades.go +++ b/app/upgrades.go @@ -11,6 +11,7 @@ import ( // so `cosmovisor` can automatically pull the binary from GitHub. var allUpgrades = []upgrades.Upgrade{ upgrades.Upgrade_0_0_4, + upgrades.Upgrade_0_0_9, } // setUpgrades sets upgrade handlers for all upgrades and executes KVStore migration if an upgrade plan file exists. diff --git a/app/upgrades/historical.go b/app/upgrades/historical.go index fe06f7096..2e71f0430 100644 --- a/app/upgrades/historical.go +++ b/app/upgrades/historical.go @@ -86,3 +86,10 @@ var Upgrade_0_0_4 = Upgrade{ // No changes to the KVStore in this upgrade. StoreUpgrades: storetypes.StoreUpgrades{}, } + +// Upgrade_0_0_9 is a small upgrade on TestNet. +var Upgrade_0_0_9 = Upgrade{ + PlanName: "v0.0.9", + CreateUpgradeHandler: defaultUpgradeHandler, + StoreUpgrades: storetypes.StoreUpgrades{}, +} diff --git a/docusaurus/docs/protocol/upgrades/release_process.md b/docusaurus/docs/protocol/upgrades/release_process.md index 2845f4c84..556f14fb2 100644 --- a/docusaurus/docs/protocol/upgrades/release_process.md +++ b/docusaurus/docs/protocol/upgrades/release_process.md @@ -59,6 +59,13 @@ You can find an example [here](https://github.com/pokt-network/poktroll/releases ```text ## Protocol Upgrades + + - **Planned Upgrade:** ❌ Not applicable for this release. - **Breaking Change:** ❌ Not applicable for this release. - **Manual Intervention Required:** ✅ Yes, but only for Alpha TestNet participants. 
If you are participating, please follow the [instructions provided here](https://dev.poktroll.com/operate/quickstart/docker_compose_walkthrough#restarting-a-full-node-after-re-genesis-) for restarting your full node after re-genesis. diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index b1098f509..38bbeb5cd 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -12,7 +12,9 @@ This page describes the protocol upgrade process, which is internal to the proto - [When is an Upgrade Warranted?](#when-is-an-upgrade-warranted) - [Implementing the Upgrade](#implementing-the-upgrade) - [Writing an Upgrade Transaction](#writing-an-upgrade-transaction) + - [Validate the URLs](#validate-the-urls) - [Submitting the upgrade on-chain](#submitting-the-upgrade-on-chain) +- [Cancelling the upgrade plan](#cancelling-the-upgrade-plan) - [Testing the Upgrade](#testing-the-upgrade) - [LocalNet](#localnet) - [DevNet](#devnet) @@ -39,7 +41,7 @@ An upgrade is necessary whenever there's an API, State Machine, or other Consens 1. When a new version includes a consensus-breaking change, plan for the next protocol upgrade: - If there's a change to a specific module, bump that module's consensus version. - Note any potential parameter changes to include in the upgrade. -2. Create a new upgrade in `app/upgrades`: +2. Create a new upgrade in `app/upgrades`. **THIS MUST BE DONE** even if there are no state changes. - Refer to `historical.go` for past upgrades and examples. - Consult Cosmos-sdk documentation on upgrades for additional guidance [here](https://docs.cosmos.network/main/build/building-apps/app-upgrade) and [here](https://docs.cosmos.network/main/build/modules/upgrade). 
@@ -69,12 +71,42 @@ An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/bl - `height`: The height at which an upgrade should be executed and the node will be restarted. - `info`: While this field can theoretically contain any information about the upgrade, in practice, `cosmovisor`uses it to obtain information about the binaries. When`cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is optional). +### Validate the URLs + +The URLs of the binaries contain checksums. It is important to make sure they are correct, otherwise Cosmovisor won't be able +to download the binaries and go through the upgrade. Here's a little command that uses `jq` and `go-getter` (same library used by Cosmovisor - so it is a good test). + +:::tip + +Go-getter can be installed using the following command: + +```bash +go install github.com/hashicorp/go-getter/cmd/go-getter@latest +``` + +::: + +```bash +jq -r '.body.messages[0].plan.info | fromjson | .binaries[]' PATH_TO_UPGRADE_TRANSACTION_JSON | while IFS= read -r url; do + go-getter "$url" . +done +``` + +The output should look like this: + +```text +2024/09/24 12:40:40 success! +2024/09/24 12:40:42 success! +2024/09/24 12:40:44 success! +2024/09/24 12:40:46 success! +``` + ## Submitting the upgrade on-chain The `MsgSoftwareUpgrade` can be submitted using the following command: ```bash -poktrolld tx authz exec PATH_TO_TRANSACTION_JSON --from pnf +poktrolld tx authz exec PATH_TO_UPGRADE_TRANSACTION_JSON --from pnf ``` If the transaction has been accepted, upgrade plan can be viewed with this command: @@ -83,6 +115,14 @@ If the transaction has been accepted, upgrade plan can be viewed with this comma poktrolld query upgrade plan ``` +## Cancelling the upgrade plan + +It is possible to cancel the upgrade before the upgrade plan height is reached. 
To do so, execute the following transaction: + +```bash +poktrolld tx authz exec tools/scripts/upgrades/authz_cancel_upgrade_tx.json --gas=auto --from pnf +``` + ## Testing the Upgrade :::warning diff --git a/tools/scripts/upgrades/upgrade_tx.json b/tools/scripts/upgrades/upgrade_tx_v0.0.9.json similarity index 100% rename from tools/scripts/upgrades/upgrade_tx.json rename to tools/scripts/upgrades/upgrade_tx_v0.0.9.json From 3d86aef7d3d95dc8d815d7f5254d3a640e27ef9c Mon Sep 17 00:00:00 2001 From: "Dmitry K." Date: Wed, 25 Sep 2024 10:44:29 -0700 Subject: [PATCH 03/27] ca-certs are needed for relayminer --- Dockerfile.release | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Dockerfile.release b/Dockerfile.release index 07ccbb433..efd5d2f44 100644 --- a/Dockerfile.release +++ b/Dockerfile.release @@ -3,6 +3,11 @@ FROM debian:bookworm ARG TARGETARCH +# Install necessary packages. +RUN apt-get update && \ + apt-get install -y --no-install-recommends ca-certificates && \ + rm -rf /var/lib/apt/lists/* + # Use `1025` G/UID so users can switch between this and `heighliner` image without a need to chown the files. RUN groupadd -g 1025 pocket && useradd -u 1025 -g pocket -m -s /sbin/nologin pocket From cb073bd027ff4260c220c61ba7c243ed5f61bfe8 Mon Sep 17 00:00:00 2001 From: DK Date: Mon, 14 Oct 2024 17:57:55 -0700 Subject: [PATCH 04/27] LocalNet upgrade procedure --- .../docs/protocol/upgrades/upgrade_list.md | 14 ++-- .../protocol/upgrades/upgrade_procedure.md | 68 ++++++++++++++----- tools/installer/full-node.sh | 11 +-- 3 files changed, 66 insertions(+), 27 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/upgrade_list.md b/docusaurus/docs/protocol/upgrades/upgrade_list.md index f4b2c8d19..ff88e7584 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_list.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_list.md @@ -26,12 +26,14 @@ This table is currently incomplete and does not include all protocol upgrades. 
O -| Version | Planned | Breaking | Requires Manual Intervention | Upgrade Height | -| ------------------------------------------------------------------------ | :-----: | :------: | :---------------------------------: | -------------- | -| [`v0.0.7`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.7) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.6`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.6) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.5`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.5) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.4`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.4) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| Version | Planned | Breaking | Requires Manual Intervention | Upgrade Height | +| ---------------------------------------------------------------------------- | :-----: | :------: | :---------------------------------: | -------------- | +| [`v0.0.9-3`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9-3) | ❌ | ✅ | ✅ (Alpha TestNet Participants Only) | `17102` | +| [`v0.0.9`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| [`v0.0.8`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.8) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| [`v0.0.6`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.6) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| [`v0.0.5`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.5) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| [`v0.0.4`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.4) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | ## MainNet diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 38bbeb5cd..25c5ffc6e 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md 
+++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -12,11 +12,13 @@ This page describes the protocol upgrade process, which is internal to the proto - [When is an Upgrade Warranted?](#when-is-an-upgrade-warranted) - [Implementing the Upgrade](#implementing-the-upgrade) - [Writing an Upgrade Transaction](#writing-an-upgrade-transaction) - - [Validate the URLs](#validate-the-urls) + - [Validate the URLs (live network only)](#validate-the-urls-live-network-only) - [Submitting the upgrade on-chain](#submitting-the-upgrade-on-chain) - [Cancelling the upgrade plan](#cancelling-the-upgrade-plan) - [Testing the Upgrade](#testing-the-upgrade) - [LocalNet](#localnet) + - [TLDR](#tldr) + - [Full example](#full-example) - [DevNet](#devnet) - [TestNet](#testnet) - [Mainnet](#mainnet) @@ -69,9 +71,9 @@ An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/bl - `name`: Name of the upgrade. It should match the `VersionName` of `upgrades.Upgrade`. - `height`: The height at which an upgrade should be executed and the node will be restarted. -- `info`: While this field can theoretically contain any information about the upgrade, in practice, `cosmovisor`uses it to obtain information about the binaries. When`cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is optional). +- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. When `cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is also optional). We only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. -### Validate the URLs +### Validate the URLs (live network only) The URLs of the binaries contain checksums.
It is important to make sure they are correct, otherwise Cosmovisor won't be able to download the binaries and go through the upgrade. Here's a little command that uses `jq` and `go-getter` (same library used by Cosmovisor - so it is a good test). @@ -87,7 +89,7 @@ go install github.com/hashicorp/go-getter/cmd/go-getter@latest ::: ```bash -jq -r '.body.messages[0].plan.info | fromjson | .binaries[]' PATH_TO_UPGRADE_TRANSACTION_JSON | while IFS= read -r url; do +jq -r '.body.messages[0].plan.info | fromjson | .binaries[]' $PATH_TO_UPGRADE_TRANSACTION_JSON | while IFS= read -r url; do go-getter "$url" . done ``` @@ -106,7 +108,7 @@ The output should look like this: The `MsgSoftwareUpgrade` can be submitted using the following command: ```bash -poktrolld tx authz exec PATH_TO_UPGRADE_TRANSACTION_JSON --from pnf +poktrolld tx authz exec $PATH_TO_UPGRADE_TRANSACTION_JSON --from=pnf ``` If the transaction has been accepted, upgrade plan can be viewed with this command: @@ -120,7 +122,7 @@ poktrolld query upgrade plan It is possible to cancel the upgrade before the upgrade plan height is reached. To do so, execute the following transaction: ```bash -poktrolld tx authz exec tools/scripts/upgrades/authz_cancel_upgrade_tx.json --gas=auto --from pnf +poktrolld tx authz exec tools/scripts/upgrades/authz_cancel_upgrade_tx.json --gas=auto --from=pnf ``` ## Testing the Upgrade @@ -131,18 +133,50 @@ Note that for local testing, `cosmovisor` won't pull the binary from the info fi ### LocalNet -LocalNet currently does not support `cosmovisor` and automatic upgrades. However, we have provided scripts to facilitate local testing in the `tools/scripts/upgrades` directory: +LocalNet does not support `cosmovisor` and automatic upgrades at the moment. But we don't need it to simulate and test the upgrade procedure. 
+ +#### TLDR + +In short, the procedure is: +- Pull git repo with old version (separate directory) +- Download release binary of the old version +- Wipe localnet data and generate genesis using OLD version +- Start node using OLD binary +- Write and submit an upgrade transaction on-chain +- When the Upgrade Plan height is reached, stop the old node and run the new binary +- Observe the behavior + +#### Full example + +As we are testing an upgrade, we need to have a network that first runs on the old version. So it is a good idea to have a LocalNet running using a binary from the [previous release you wish to upgrade **FROM**](https://github.com/pokt-network/poktroll/releases). We also want to provision the network using this version, which requires us to pull the specific git tag. + +1. Make a note of the version you want to test an upgrade **FROM**. This will be the **OLD** version. For example, let's imagine we're upgrading from `v0.0.9`. +2. Pull a new `poktroll` repo (will be used as an "old" version): + ```bash + git clone https://github.com/pokt-network/poktroll.git poktroll-upgrade-old + cd poktroll-upgrade-old + git checkout v0.0.9 + + # Download the v0.0.9 binary: https://github.com/pokt-network/poktroll/releases + # CHANGE POKTROLLD_VERSION and ARCH + curl -L "https://github.com/pokt-network/poktroll/releases/download/${POKTROLLD_VERSION}/poktroll_linux_${ARCH}.tar.gz" | tar -zxvf - -C . + + # Validate the version + ./poktrolld version + 0.0.9 + ``` +3. Stop LocalNet: `make localnet_down` +4. Reset the data: `./poktrolld comet unsafe-reset-all` +5. Create new genesis using old version (from `poktroll-upgrade-old` dir): `make localnet_regenesis` +6. Start the network: `./poktrolld start` +7. [Write](#writing-an-upgrade-transaction) and [Submit](#submitting-the-upgrade-on-chain) a transaction (e.g. `poktrolld tx authz exec tools/scripts/upgrades/local_test_v0.0.9-2.json --from=pnf`) +8. Verify the plan is active: `poktrolld query upgrade plan` +9.
Wait until the height is reached and the old node dies due to the error: `ERR UPGRADE "v0.0.9-2" NEEDED at height`, which is expected. +10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade network **TO**. +11. In the **NEW VERSION GIT REPO** you can build binaries using `go_develop`, `ignite_release` and `ignite_release_extract_binaries` make targets. +12. Start the new version (from the **NEW VERSION REPO**: `./release_binaries/poktroll_darwin_arm64 start`) +13. Observe the behavior. Your node should go through the upgrade process and start using the new version. -1. Modify `tools/scripts/upgrades/authz_upgrade_tx_example_v0.0.4_height_30.json` to reflect the name of the upgrade and the height at which it should be scheduled. - -2. Check and update the `tools/scripts/upgrades/cosmovisor-start-node.sh` to point to the correct binaries: - - - The old binary should be compiled to work before the upgrade. - - The new binary should contain the upgrade logic to be executed immediately after the node is started using the new binary. - -3. Run `bash tools/scripts/upgrades/cosmovisor-start-node.sh` to wipe the `~/.poktroll` directory and place binaries in the correct locations. - -4. Execute the transaction as shown in [Submitting the upgrade on-chain](#submitting-the-upgrade-on-chain) section above. ### DevNet diff --git a/tools/installer/full-node.sh b/tools/installer/full-node.sh index 6faca639b..7ee542336 100644 --- a/tools/installer/full-node.sh +++ b/tools/installer/full-node.sh @@ -94,7 +94,7 @@ setup_env_vars() { echo "export DAEMON_HOME=\$HOME/.poktroll" >> \$HOME/.profile echo "export DAEMON_RESTART_AFTER_UPGRADE=true" >> \$HOME/.profile echo "export DAEMON_ALLOW_DOWNLOAD_BINARIES=true" >> \$HOME/.profile - echo "export UNSAFE_SKIP_BACKUP=true" >> \$HOME/.profile + echo "export UNSAFE_SKIP_BACKUP=false" >> \$HOME/.profile source \$HOME/.profile EOF print_color $GREEN "Environment variables set up successfully." 
@@ -138,12 +138,15 @@ setup_poktrolld() { exit 1 fi - # Use the direct download link for the latest release - LATEST_RELEASE_URL="https://github.com/pokt-network/poktroll/releases/latest/download/poktroll_linux_${ARCH}.tar.gz" + # Get the version genesis started from + POKTROLLD_VERSION=$(curl -s https://raw.githubusercontent.com/pokt-network/pocket-network-genesis/master/poktrolld/testnet-validated.init-version) + + # Use the direct download link for the correct release + RELEASE_URL="https://github.com/pokt-network/poktroll/releases/download/${POKTROLLD_VERSION}/poktroll_linux_${ARCH}.tar.gz" sudo -u "$POKTROLL_USER" bash << EOF mkdir -p \$HOME/.poktroll/cosmovisor/genesis/bin - curl -L "$LATEST_RELEASE_URL" | tar -zxvf - -C \$HOME/.poktroll/cosmovisor/genesis/bin + curl -L "$RELEASE_URL" | tar -zxvf - -C \$HOME/.poktroll/cosmovisor/genesis/bin chmod +x \$HOME/.poktroll/cosmovisor/genesis/bin/poktrolld ln -sf \$HOME/.poktroll/cosmovisor/genesis/bin/poktrolld \$HOME/bin/poktrolld source \$HOME/.profile From 3fe1feaba7897760e4f6dbf2f5f013c6d9f6d34e Mon Sep 17 00:00:00 2001 From: DK Date: Tue, 15 Oct 2024 16:19:42 -0700 Subject: [PATCH 05/27] --wip-- [skip ci] --- app/upgrades/historical.go | 2 + .../protocol/upgrades/consensus_failure.md | 15 ++++++ .../protocol/upgrades/contigency_plans.md | 49 +++++++++++++++++++ .../docs/protocol/upgrades/release_process.md | 6 --- docusaurus/yarn.lock | 16 +++--- 5 files changed, 74 insertions(+), 14 deletions(-) create mode 100644 docusaurus/docs/protocol/upgrades/consensus_failure.md create mode 100644 docusaurus/docs/protocol/upgrades/contigency_plans.md diff --git a/app/upgrades/historical.go b/app/upgrades/historical.go index 2e71f0430..bb9eccccc 100644 --- a/app/upgrades/historical.go +++ b/app/upgrades/historical.go @@ -11,6 +11,7 @@ package upgrades import ( "context" + "fmt" storetypes "cosmossdk.io/store/types" upgradetypes "cosmossdk.io/x/upgrade/types" @@ -28,6 +29,7 @@ func defaultUpgradeHandler( configurator 
module.Configurator, ) upgradetypes.UpgradeHandler { return func(ctx context.Context, plan upgradetypes.Plan, vm module.VersionMap) (module.VersionMap, error) { + fmt.Println("Starting the migration in defaultUpgradeHandler.") return mm.RunMigrations(ctx, configurator, vm) } } diff --git a/docusaurus/docs/protocol/upgrades/consensus_failure.md b/docusaurus/docs/protocol/upgrades/consensus_failure.md new file mode 100644 index 000000000..229dd4544 --- /dev/null +++ b/docusaurus/docs/protocol/upgrades/consensus_failure.md @@ -0,0 +1,15 @@ +--- +title: Consensus failure recovery plan +sidebar_position: 6 +--- + +# Consensus Failure Recovery Plan + + + +## Common consensus failure errors + + + +- `wrong Block.Header.AppHash` - the data in block is different between nodes. Can be investigated by comparing the data dir - [more information here](../../develop/developer_guide/chain_halt_troubleshooting.md). + diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md new file mode 100644 index 000000000..90b3e8544 --- /dev/null +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -0,0 +1,49 @@ +--- +title: Failed upgrade contingency plan +sidebar_position: 5 +--- + +# Contingency plans + + +There's always a chance the upgrade will fail. We prepared some contingency plans, so we can try to recover without +significant downtime. + +:::tip + +This documentation covers failed upgrade contingency for `poktroll` - `cosmos-sdk` based chain. While this can be helpful for other blockchain networks, it is not guaranteed to work for other chains. 
+ +::: + +- [Option 0: the bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) +- [Option 1: The upgrade height is reached and the migration didn't start](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start) +- [Option 2: The migration is stuck](#option-2-the-migration-is-stuck) +- [Option 3: The network is stuck at the future height after the upgrade](#option-3-the-network-is-stuck-at-the-future-height-after-the-upgrade) + + +## Option 0: the bug is discovered before the upgrade height is reached + +Cancel the upgrade plan: [how](./upgrade_procedure.md#cancelling-the-upgrade-plan). + +## Option 1: The upgrade height is reached and the migration didn't start + +If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. This will skip the upgrade process, but it's important that all nodes on the network do this at the same time. + +`--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations and the chain continues as if the upgrade plan was never set. The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. + +:::caution +`--unsafe-skip-upgrade` needs to be documented and added to the scripts so next time somebody tries to sync the network from the genesis - they will automatically skip the failed upgrade. +::: + +## Option 2: The migration is stuck + +If the migration is stuck there's always a chance the state has been mutated for the upgrade but the migration didn't complete. In such case, we need to: + +- Rollback validators to the backup (snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`). 
+- Skip the upgrade handler and store migrations with `--unsafe-skip-upgrade=$upgradeHeightNumber`. +- Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so next time somebody tries to sync the network from the genesis - they will automatically skip the failed upgrade. +- Resolve the issue with an upgrade and schedule another plan. + +## Option 3: The network is stuck at the future height after the upgrade + +This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [here](./consensus_failure.md) for more information on how to handle such issues. diff --git a/docusaurus/docs/protocol/upgrades/release_process.md b/docusaurus/docs/protocol/upgrades/release_process.md index 556f14fb2..4a756d5a5 100644 --- a/docusaurus/docs/protocol/upgrades/release_process.md +++ b/docusaurus/docs/protocol/upgrades/release_process.md @@ -16,12 +16,6 @@ sidebar_position: 4 This document is for the Pocket Network protocol team's internal use only. ::: -- [1. Determine if the Release is Consensus-Breaking](#1-determine-if-the-release-is-consensus-breaking) -- [2. Create a GitHub Release](#2-create-a-github-release) - - [Legend](#legend) -- [3. Write an Upgrade Plan](#3-write-an-upgrade-plan) -- [4. Issue Upgrade on TestNet](#4-issue-upgrade-on-testnet) -- [5. Issue Upgrade on MainNet](#5-issue-upgrade-on-mainnet) ### 1. 
Determine if the Release is Consensus-Breaking diff --git a/docusaurus/yarn.lock b/docusaurus/yarn.lock index 93cee387e..5c2dcf935 100644 --- a/docusaurus/yarn.lock +++ b/docusaurus/yarn.lock @@ -1810,15 +1810,10 @@ dependencies: "@types/mdx" "^2.0.0" -"@node-rs/jieba-linux-x64-gnu@1.10.0": +"@node-rs/jieba-darwin-arm64@1.10.0": version "1.10.0" - resolved "https://registry.npmjs.org/@node-rs/jieba-linux-x64-gnu/-/jieba-linux-x64-gnu-1.10.0.tgz" - integrity sha512-rS5Shs8JITxJjFIjoIZ5a9O+GO21TJgKu03g2qwFE3QaN5ZOvXtz+/AqqyfT4GmmMhCujD83AGqfOGXDmItF9w== - -"@node-rs/jieba-linux-x64-musl@1.10.0": - version "1.10.0" - resolved "https://registry.npmjs.org/@node-rs/jieba-linux-x64-musl/-/jieba-linux-x64-musl-1.10.0.tgz" - integrity sha512-BvSiF2rR8Birh2oEVHcYwq0WGC1cegkEdddWsPrrSmpKmukJE2zyjcxaOOggq2apb8fIRsjyeeUh6X3R5AgjvA== + resolved "https://registry.npmjs.org/@node-rs/jieba-darwin-arm64/-/jieba-darwin-arm64-1.10.0.tgz" + integrity sha512-IhR5r+XxFcfhVsF93zQ3uCJy8ndotRntXzoW/JCyKqOahUo/ITQRT6vTKHKMyD9xNmjl222OZonBSo2+mlI2fQ== "@node-rs/jieba@^1.6.0": version "1.10.0" @@ -4619,6 +4614,11 @@ fs.realpath@^1.0.0: resolved "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz" integrity sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw== +fsevents@~2.3.2: + version "2.3.3" + resolved "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz" + integrity sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw== + function-bind@^1.1.2: version "1.1.2" resolved "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz" From 525eb5783fbf3085d5c21923edce4a9b8b9cccfd Mon Sep 17 00:00:00 2001 From: DK Date: Wed, 16 Oct 2024 18:03:07 -0700 Subject: [PATCH 06/27] --wip-- [skip ci] --- .../chain_halt_troubleshooting.md | 14 +++- .../recovery_from_chain_halt.md | 68 +++++++++++++++++++ .../protocol/upgrades/consensus_failure.md | 15 ---- .../protocol/upgrades/contigency_plans.md | 7 
+- .../docs/protocol/upgrades/upgrade_list.md | 21 ++---- 5 files changed, 92 insertions(+), 33 deletions(-) create mode 100644 docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md delete mode 100644 docusaurus/docs/protocol/upgrades/consensus_failure.md diff --git a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md index 72da1f4f3..3ded9ff73 100644 --- a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md +++ b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md @@ -8,13 +8,15 @@ title: Chain Halt Troubleshooting - [Understanding Chain Halts](#understanding-chain-halts) - [Definition and Causes](#definition-and-causes) - [Impact on Network](#impact-on-network) -- [Troubleshooting Process](#troubleshooting-process) +- [`wrong Block.Header.AppHash` Troubleshooting Process](#wrong-blockheaderapphash-troubleshooting-process) - [Step 1: Identifying the Issue](#step-1-identifying-the-issue) - [Step 2: Collecting Node Data](#step-2-collecting-node-data) - [Step 3: Analyzing Discrepancies](#step-3-analyzing-discrepancies) - [Step 4: Decoding and Interpreting Data](#step-4-decoding-and-interpreting-data) - [Step 5: Comparing Records](#step-5-comparing-records) - [Step 6: Investigation and Resolution](#step-6-investigation-and-resolution) +- [`wrong Block.Header.LastResultsHash`](#wrong-blockheaderlastresultshash) +- [Syncing from genesis](#syncing-from-genesis) ## Understanding Chain Halts @@ -40,7 +42,7 @@ Chain halts can have severe consequences for the network: Given these impacts, swift and effective troubleshooting is crucial to maintain network health and user trust. -## Troubleshooting Process +## `wrong Block.Header.AppHash` Troubleshooting Process ### Step 1: Identifying the Issue @@ -94,3 +96,11 @@ Based on the identified discrepancies: 2. Develop a fix or patch to address the issue. 3. 
If necessary, initiate discussions with the validator community to reach social consensus on how to proceed. 4. Implement the agreed-upon solution and monitor the network closely during and after the fix. + +## `wrong Block.Header.LastResultsHash` + +Errors such as `reactor validation error: wrong Block.Header.LastResultsHash.` are most likely to come from the non-deterministic gas calculation. That can happen when the node runs on a different version. The solution is to use the correct binary version. + +## Syncing from genesis + +If you're encountering any of the errors mentioned above while trying to sync the historical blocks - make sure you're running correct version of the binary in accordance with this table [Upgrade List](../../protocol/upgrades/upgrade_list.md). diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md new file mode 100644 index 000000000..29cd949d3 --- /dev/null +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -0,0 +1,68 @@ +--- +sidebar_position: 7 +title: Chain Halt Recovery +--- + +## Chain Halt Recovery + +This document describes how to recover from a chain halt. This document assumes the cause of the chain halt has been identified and the new release has been created and verified to work. + +:::tip +See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more information on identifying the cause of a chain halt. +::: + +- [Background](#background) +- [Halt during network upgrade](#halt-during-network-upgrade) +- [Replacing the binary manually (preferred)](#replacing-the-binary-manually-preferred) +- [Rollback, fork and upgrade](#rollback-fork-and-upgrade) + + +## Background + +Pocket network is built on top of `cosmos-sdk` which utilizes CometBFT consensus engine. 
Bezantine Fault Tolerant (BFT) consensus algorithm requires that at least 2/3 of validators are online and voting for the same block to reach a consensus. In order to get the chain moving, we need the majority of validators to participate to get the chain moving. + +## Halt during network upgrade + +If the halt is caused by the network upgrade, it is possible the solution can be as simple as skipping an upgrade (`unsafe-skip-upgrade`) and creating a new (fixed) one. Read more about [upgrade contingency plans](../../protocol/upgrades/contigency_plans.md). + + +## Replacing the binary manually (preferred) + +**This is preferred way of resolving the consensus-breaking issues**. + +Since the chain is not moving, it would be impossible to issue an automatic upgrade with an upgrade plan. Instead, we need to gather a social consensus to manually replace the binary and get the chain moving. + +Currently this means breaking a way to sync the network from genesis without human interaction, but there are some plans to make the process less painful in the future. + + + + + +## Rollback, fork and upgrade + +:::info + +This part is relevant for Pocket Network Shannon release only, as we do not rely on `x/gov` module for upgrades in Shannon. Instead, our DAO can issue upgrade transactions on the Pocket Network chain directly. Conventional `cosmos-sdk` upgrade process would require to go through the voting process to issue an upgrade. + +::: + +Perfrorming a rollback basically means forking the network at the older height. Modern CometBFT versions are incredibly hard to fork. As a result, **it is not recommended to perform rollbacks** unless absolutely necessary. If we do decide to go ahead with a rollback, these are the steps: + +- Prepare and verify the new version that addresses the consensus-breaking issue. +- [Create a release](../../protocol/upgrades/release_process.md). 
+- [Prepare an upgrade transaction](../../protocol/upgrades/upgrade_procedure.md#writing-an-upgrade-transaction) to the new version. +- Get the state of the validators on the network to **three blocks** prior to the consensus-breaking issue. + - For example, if there was an issue at height `103`, we need to get the state to the height of `100`. At `101` we will submit an upgrade transaction so the chain upgrades on `102` and avoids the issue at height `103`. + - Can be done in two ways: + - `poktrolld rollback --hard` until the command responds with the desired block number. **OR,** + - The node can be restored from the snapshot and started with `--halt-height=100` parameter so it only syncs up to ceirtan height and then gracefully shuts down. +- **Make sure all validators use the same data directory** or have been rolled back to the same height. +- **Isolate validators from the other nodes** that have not been rolled back to the older state. If that means using a firewall or isolating from the internet - this is the way. Validators should be able to only gossip blocks between themselves. **Having at least one node that has knowledge of the forking ledger can jeopardize the whole process**. In particular, the following errors are the sign of the nodes populating existing blocks: + - `found conflicting vote from ourselves; did you unsafe_reset a validator?` + - `conflicting votes from validator` +- Start the network and perform an upgrade (following the example aboce): + - We would not be able to submit an transaction at `100` (this needs to be investigated, but for some reason we were not able to) due to `signature verification failed; please verify account number (0) and chain-id (poktroll): (unable to verify single signer signature): unauthorized`. + - On block `101`, we will submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. 
+ - `x/upgrade` performs an upgrade in the `EndBlocker` of the block `102` and waits for the node operator or `cosmovisor` to replace the binary. +- The network should go through successfull upgrade and climb to the next block. +- After the chain has been reached over the hight of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can peform the rollback or use a snapshot as well. diff --git a/docusaurus/docs/protocol/upgrades/consensus_failure.md b/docusaurus/docs/protocol/upgrades/consensus_failure.md deleted file mode 100644 index 229dd4544..000000000 --- a/docusaurus/docs/protocol/upgrades/consensus_failure.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: Consensus failure recovery plan -sidebar_position: 6 ---- - -# Consensus Failure Recovery Plan - - - -## Common consensus failure errors - - - -- `wrong Block.Header.AppHash` - the data in block is different between nodes. Can be investigated by comparing the data dir - [more information here](../../develop/developer_guide/chain_halt_troubleshooting.md). - diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 90b3e8544..7029bf3ef 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -27,12 +27,14 @@ Cancel the upgrade plan: [how](./upgrade_procedure.md#cancelling-the-upgrade-pla ## Option 1: The upgrade height is reached and the migration didn't start -If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. This will skip the upgrade process, but it's important that all nodes on the network do this at the same time. 
+If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. This will skip the upgrade process, allowing the chain to continue going and the protocol team to plan another release `--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations and the chain continues as if the upgrade plan was never set. The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. :::caution `--unsafe-skip-upgrade` needs to be documented and added to the scripts so next time somebody tries to sync the network from the genesis - they will automatically skip the failed upgrade. + + ::: ## Option 2: The migration is stuck @@ -46,4 +48,5 @@ If the migration is stuck there's always a chance the state has been mutated for ## Option 3: The network is stuck at the future height after the upgrade -This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [here](./consensus_failure.md) for more information on how to handle such issues. +This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. + diff --git a/docusaurus/docs/protocol/upgrades/upgrade_list.md b/docusaurus/docs/protocol/upgrades/upgrade_list.md index ff88e7584..1df1dd52a 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_list.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_list.md @@ -8,7 +8,7 @@ sidebar_position: 1 The tables below provide a list of past and upcoming protocol upgrades. 
For more detailed information about what upgrades are, how they work, and what changes they bring to the protocol, please refer to our [upgrade overview page](./protocol_upgrades.md). - [Legend](#legend) -- [TestNet](#testnet) +- [Alpha TestNet](#alpha-testnet) - [MainNet](#mainnet) ## Legend @@ -18,22 +18,15 @@ The tables below provide a list of past and upcoming protocol upgrades. For more - ❓ - Unknown/To Be Determined - ⚠️ - Warning/Caution Required -## TestNet - -:::warning -This table is currently incomplete and does not include all protocol upgrades. Our recent TestNet upgrades, which were performed via a regenesis, are not listed here. -::: +## Alpha TestNet -| Version | Planned | Breaking | Requires Manual Intervention | Upgrade Height | -| ---------------------------------------------------------------------------- | :-----: | :------: | :---------------------------------: | -------------- | -| [`v0.0.9-3`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9-3) | ❌ | ✅ | ✅ (Alpha TestNet Participants Only) | `17102` | -| [`v0.0.9`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.8`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.8) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.6`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.6) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.5`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.5) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | -| [`v0.0.4`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.4) | ❓ | ❓ | ✅ (Alpha TestNet Participants Only) | ❓ | +| Version | Planned | Breaking | Requires Manual Intervention | Upgrade Height | +| ---------------------------------------------------------------------------- | :-----: | :------: | :-------------------------------: | -------------- | +| 
[`v0.0.9-3`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9-3) | ❌ | ✅ | ⚠️ Alpha TestNet Participants Only | `17102` | +| [`v0.0.9`](https://github.com/pokt-network/poktroll/releases/tag/v0.0.9) | ❓ | ❓ | N/A: genesis version | ❓ | + ## MainNet From 4e8c7dda086ae6356c5ba587627288f8e09eeb78 Mon Sep 17 00:00:00 2001 From: DK Date: Wed, 16 Oct 2024 18:15:45 -0700 Subject: [PATCH 07/27] spell checking --- .../chain_halt_troubleshooting.md | 2 +- .../recovery_from_chain_halt.md | 14 +++++------ .../protocol/upgrades/contigency_plans.md | 25 ++++++++----------- .../protocol/upgrades/upgrade_procedure.md | 11 ++++---- 4 files changed, 24 insertions(+), 28 deletions(-) diff --git a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md index 3ded9ff73..e97f535db 100644 --- a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md +++ b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md @@ -103,4 +103,4 @@ Errors such as `reactor validation error: wrong Block.Header.LastResultsHash.` a ## Syncing from genesis -If you're encountering any of the errors mentioned above while trying to sync the historical blocks - make sure you're running correct version of the binary in accordance with this table [Upgrade List](../../protocol/upgrades/upgrade_list.md). +If you're encountering any of the errors mentioned above while trying to sync the historical blocks - make sure you're running the correct version of the binary in accordance with this table [Upgrade List](../../protocol/upgrades/upgrade_list.md). 
\ No newline at end of file diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 29cd949d3..b56b0e725 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -19,7 +19,7 @@ See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more infor ## Background -Pocket network is built on top of `cosmos-sdk` which utilizes CometBFT consensus engine. Bezantine Fault Tolerant (BFT) consensus algorithm requires that at least 2/3 of validators are online and voting for the same block to reach a consensus. In order to get the chain moving, we need the majority of validators to participate to get the chain moving. +Pocket network is built on top of `cosmos-sdk` which utilizes CometBFT consensus engine. Byzantine Fault Tolerant (BFT) consensus algorithm requires that at least 2/3 of validators are online and voting for the same block to reach a consensus. In order to get the chain moving, we need the majority of validators to participate to get the chain moving. ## Halt during network upgrade @@ -46,7 +46,7 @@ This part is relevant for Pocket Network Shannon release only, as we do not rely ::: -Perfrorming a rollback basically means forking the network at the older height. Modern CometBFT versions are incredibly hard to fork. As a result, **it is not recommended to perform rollbacks** unless absolutely necessary. If we do decide to go ahead with a rollback, these are the steps: +Performing a rollback basically means forking the network at the older height. Modern CometBFT versions are incredibly hard to fork. As a result, **it is not recommended to perform rollbacks** unless absolutely necessary. If we do decide to go ahead with a rollback, these are the steps: - Prepare and verify the new version that addresses the consensus-breaking issue. 
- [Create a release](../../protocol/upgrades/release_process.md). @@ -55,14 +55,14 @@ Perfrorming a rollback basically means forking the network at the older height. - For example, if there was an issue at height `103`, we need to get the state to the height of `100`. At `101` we will submit an upgrade transaction so the chain upgrades on `102` and avoids the issue at height `103`. - Can be done in two ways: - `poktrolld rollback --hard` until the command responds with the desired block number. **OR,** - - The node can be restored from the snapshot and started with `--halt-height=100` parameter so it only syncs up to ceirtan height and then gracefully shuts down. + - The node can be restored from the snapshot and started with `--halt-height=100` parameter so it only syncs up to certain height and then gracefully shuts down. - **Make sure all validators use the same data directory** or have been rolled back to the same height. - **Isolate validators from the other nodes** that have not been rolled back to the older state. If that means using a firewall or isolating from the internet - this is the way. Validators should be able to only gossip blocks between themselves. **Having at least one node that has knowledge of the forking ledger can jeopardize the whole process**. In particular, the following errors are the sign of the nodes populating existing blocks: - `found conflicting vote from ourselves; did you unsafe_reset a validator?` - `conflicting votes from validator` -- Start the network and perform an upgrade (following the example aboce): - - We would not be able to submit an transaction at `100` (this needs to be investigated, but for some reason we were not able to) due to `signature verification failed; please verify account number (0) and chain-id (poktroll): (unable to verify single signer signature): unauthorized`. 
+- Start the network and perform an upgrade (following the example above): + - We would not be able to submit a transaction at `100` (this needs to be investigated, but for some reason we were not able to) due to `signature verification failed; please verify account number (0) and chain-id (poktroll): (unable to verify single signer signature): unauthorized`. - On block `101`, we will submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. - `x/upgrade` performs an upgrade in the `EndBlocker` of the block `102` and waits for the node operator or `cosmovisor` to replace the binary. -- The network should go through successfull upgrade and climb to the next block. -- After the chain has been reached over the hight of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can peform the rollback or use a snapshot as well. +- The network should go through successful upgrade and climb to the next block. +- After the chain has been reached over the height of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can perform the rollback or use a snapshot as well. \ No newline at end of file diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 7029bf3ef..75438c9be 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -5,48 +5,45 @@ sidebar_position: 5 # Contingency plans - -There's always a chance the upgrade will fail. We prepared some contingency plans, so we can try to recover without -significant downtime. +There's always a chance the upgrade will fail. We have prepared some contingency plans, so we can recover without significant downtime. :::tip -This documentation covers failed upgrade contingency for `poktroll` - `cosmos-sdk` based chain. 
While this can be helpful for other blockchain networks, it is not guaranteed to work for other chains. +This documentation covers failed upgrade contingency for `poktroll` - a `cosmos-sdk` based chain. While this can be helpful for other blockchain networks, it is not guaranteed to work for other chains. ::: -- [Option 0: the bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) +- [Option 0: The bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) - [Option 1: The upgrade height is reached and the migration didn't start](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start) - [Option 2: The migration is stuck](#option-2-the-migration-is-stuck) - [Option 3: The network is stuck at the future height after the upgrade](#option-3-the-network-is-stuck-at-the-future-height-after-the-upgrade) -## Option 0: the bug is discovered before the upgrade height is reached +## Option 0: The bug is discovered before the upgrade height is reached Cancel the upgrade plan: [how](./upgrade_procedure.md#cancelling-the-upgrade-plan). ## Option 1: The upgrade height is reached and the migration didn't start -If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. This will skip the upgrade process, allowing the chain to continue going and the protocol team to plan another release +If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. 
This will skip the upgrade process, allowing the chain to continue and the protocol team to plan another release. -`--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations and the chain continues as if the upgrade plan was never set. The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. +`--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations, and the chain continues as if the upgrade plan was never set. The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. :::caution -`--unsafe-skip-upgrade` needs to be documented and added to the scripts so next time somebody tries to sync the network from the genesis - they will automatically skip the failed upgrade. +`--unsafe-skip-upgrade` needs to be documented and added to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. ::: ## Option 2: The migration is stuck -If the migration is stuck there's always a chance the state has been mutated for the upgrade but the migration didn't complete. In such case, we need to: +If the migration is stuck, there's always a chance the state has been mutated for the upgrade but the migration didn't complete. In such a case, we need to: -- Rollback validators to the backup (snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`). +- Roll back validators to the backup (a snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`). - Skip the upgrade handler and store migrations with `--unsafe-skip-upgrade=$upgradeHeightNumber`. -- Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so next time somebody tries to sync the network from the genesis - they will automatically skip the failed upgrade. 
+- Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. - Resolve the issue with an upgrade and schedule another plan. ## Option 3: The network is stuck at the future height after the upgrade -This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. - +This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. \ No newline at end of file diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 25c5ffc6e..a5f990f54 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -71,7 +71,7 @@ An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/bl - `name`: Name of the upgrade. It should match the `VersionName` of `upgrades.Upgrade`. - `height`: The height at which an upgrade should be executed and the node will be restarted. -- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. When`cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is also optional). We only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. +- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. 
When `cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is also optional). We only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. ### Validate the URLs (live network only) @@ -111,7 +111,7 @@ The `MsgSoftwareUpgrade` can be submitted using the following command: poktrolld tx authz exec $PATH_TO_UPGRADE_TRANSACTION_JSON --from=pnf ``` -If the transaction has been accepted, upgrade plan can be viewed with this command: +If the transaction has been accepted, the upgrade plan can be viewed with this command: ```bash poktrolld query upgrade plan @@ -148,7 +148,7 @@ In short, the procedure is: #### Full example -As we are testing an upgrade, we need to have a network that first runs on the old version. So it is a good idea to have a LocalNet running using a binary from the [previous release you with to upgrade **FROM**](https://github.com/pokt-network/poktroll/releases). We also want to provision the network using this version, which requires us to pull the specific git tag. +As we are testing an upgrade, we need to have a network that first runs on the old version. So it is a good idea to have a LocalNet running using a binary from the [previous release you wish to upgrade **FROM**](https://github.com/pokt-network/poktroll/releases). We also want to provision the network using this version, which requires us to pull the specific git tag. 1. Make a note of the version you want to test an upgrade **FROM**. This will be the **OLD** version. For example, let's imagine we're upgrading from `v0.0.9`. 2. Pull a new `poktroll` repo (will be used as an "old" version): @@ -172,12 +172,11 @@ As we are testing an upgrade, we need to have a network that first runs on the o 7. [Write](#writing-an-upgrade-transaction) and [Submit](#submitting-the-upgrade-on-chain) a transaction (e.g. 
`poktrolld tx authz exec tools/scripts/upgrades/local_test_v0.0.9-2.json --from=pnf`) 8. Verify the plan is active: `poktrolld query upgrade plan` 9. Wait until the height is reached and the old node dies due to the error: `ERR UPGRADE "v0.0.9-2" NEEDED at height`, which is expected. -10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade network **TO**. +10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade the network **TO**. 11. In the **NEW VERSION GIT REPO** you can build binaries using `go_develop`, `ignite_release` and `ignite_release_extract_binaries` make targets. 12. Start the new version (from the **NEW VERSION REPO**: `./release_binaries/poktroll_darwin_arm64 start`) 13. Observe the behavior. Your node should go through the upgrade process and start using the new version. - ### DevNet DevNets currently do not support `cosmovisor`. @@ -200,4 +199,4 @@ If you are a member of Grove, you can find the instructions to access the infras ### Mainnet -The Mainnet upgrade process is to be determined. We aim to develop and implement improved tooling for this environment. +The Mainnet upgrade process is to be determined. We aim to develop and implement improved tooling for this environment. 
\ No newline at end of file From dd631e60a3501f10f6678a1d140ea8db938aa4d6 Mon Sep 17 00:00:00 2001 From: Dmitry K Date: Thu, 17 Oct 2024 13:35:53 -0700 Subject: [PATCH 08/27] Empty commit From 83a24aa26637a6947e46d3ef59622b32538ca34a Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Mon, 21 Oct 2024 13:00:44 -0400 Subject: [PATCH 09/27] Partial review --- api/poktroll/application/event.pulsar.go | 2 +- .../chain_halt_troubleshooting.md | 21 +++++++--- .../recovery_from_chain_halt.md | 39 ++++++++++++------- go.mod | 6 --- go.sum | 2 - x/tokenomics/types/tx.pb.go | 1 - 6 files changed, 41 insertions(+), 30 deletions(-) diff --git a/api/poktroll/application/event.pulsar.go b/api/poktroll/application/event.pulsar.go index 043aa2264..290bbde55 100644 --- a/api/poktroll/application/event.pulsar.go +++ b/api/poktroll/application/event.pulsar.go @@ -3,11 +3,11 @@ package application import ( _ "cosmossdk.io/api/cosmos/base/v1beta1" + _ "github.com/pokt-network/poktroll/api/poktroll/shared" fmt "fmt" _ "github.com/cosmos/cosmos-proto" runtime "github.com/cosmos/cosmos-proto/runtime" _ "github.com/cosmos/gogoproto/gogoproto" - _ "github.com/pokt-network/poktroll/api/poktroll/shared" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoiface "google.golang.org/protobuf/runtime/protoiface" protoimpl "google.golang.org/protobuf/runtime/protoimpl" diff --git a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md index e97f535db..4f5796793 100644 --- a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md +++ b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md @@ -8,14 +8,14 @@ title: Chain Halt Troubleshooting - [Understanding Chain Halts](#understanding-chain-halts) - [Definition and Causes](#definition-and-causes) - [Impact on Network](#impact-on-network) -- [`wrong Block.Header.AppHash` Troubleshooting 
Process](#wrong-blockheaderapphash-troubleshooting-process) +- [Troubleshooting `wrong Block.Header.AppHash`](#troubleshooting-wrong-blockheaderapphash) - [Step 1: Identifying the Issue](#step-1-identifying-the-issue) - [Step 2: Collecting Node Data](#step-2-collecting-node-data) - [Step 3: Analyzing Discrepancies](#step-3-analyzing-discrepancies) - [Step 4: Decoding and Interpreting Data](#step-4-decoding-and-interpreting-data) - [Step 5: Comparing Records](#step-5-comparing-records) - [Step 6: Investigation and Resolution](#step-6-investigation-and-resolution) -- [`wrong Block.Header.LastResultsHash`](#wrong-blockheaderlastresultshash) +- [Troubleshooting `wrong Block.Header.LastResultsHash`](#troubleshooting-wrong-blockheaderlastresultshash) - [Syncing from genesis](#syncing-from-genesis) ## Understanding Chain Halts @@ -42,7 +42,7 @@ Chain halts can have severe consequences for the network: Given these impacts, swift and effective troubleshooting is crucial to maintain network health and user trust. -## `wrong Block.Header.AppHash` Troubleshooting Process +## Troubleshooting `wrong Block.Header.AppHash` ### Step 1: Identifying the Issue @@ -97,10 +97,19 @@ Based on the identified discrepancies: 3. If necessary, initiate discussions with the validator community to reach social consensus on how to proceed. 4. Implement the agreed-upon solution and monitor the network closely during and after the fix. -## `wrong Block.Header.LastResultsHash` +## Troubleshooting `wrong Block.Header.LastResultsHash` -Errors such as `reactor validation error: wrong Block.Header.LastResultsHash.` are most likely to come from the non-deterministic gas calculation. That can happen when the node runs on a different version. The solution is to use the correct binary version. +Errors like the following can occur from using the incorrect binary version at a certain height. + +```bash +reactor validation error: wrong Block.Header.LastResultsHash. 
+``` + +The solution is to use the correct binary version to sync the full node at the correct height. + +Tools like [cosmovisor](https://docs.cosmos.network/v0.45/run-node/cosmovisor.html) make it easier +to sync a node from genesis, using the appropriate binary for each range of block heights. ## Syncing from genesis -If you're encountering any of the errors mentioned above while trying to sync the historical blocks - make sure you're running the correct version of the binary in accordance with this table [Upgrade List](../../protocol/upgrades/upgrade_list.md). \ No newline at end of file +If you're encountering any of the errors mentioned above while trying to sync the historical blocks - make sure you're running the correct version of the binary in accordance with this table [Upgrade List](../../protocol/upgrades/upgrade_list.md). diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index b56b0e725..13e9991cb 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -5,40 +5,51 @@ title: Chain Halt Recovery ## Chain Halt Recovery -This document describes how to recover from a chain halt. This document assumes the cause of the chain halt has been identified and the new release has been created and verified to work. +This document describes how to recover from a chain halt. It assumes the cause of +the chain halt has been identified, the new release has been created, and verified +to function correctly. :::tip See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more information on identifying the cause of a chain halt.
::: - [Background](#background) -- [Halt during network upgrade](#halt-during-network-upgrade) -- [Replacing the binary manually (preferred)](#replacing-the-binary-manually-preferred) -- [Rollback, fork and upgrade](#rollback-fork-and-upgrade) - +- [Resolving halts during a network upgrade](#resolving-halts-during-a-network-upgrade) + - [Manual binary replacement (preferred)](#manual-binary-replacement-preferred) + - [Rollback, fork and upgrade](#rollback-fork-and-upgrade) ## Background -Pocket network is built on top of `cosmos-sdk` which utilizes CometBFT consensus engine. Byzantine Fault Tolerant (BFT) consensus algorithm requires that at least 2/3 of validators are online and voting for the same block to reach a consensus. In order to get the chain moving, we need the majority of validators to participate to get the chain moving. +Pocket network is built on top of `cosmos-sdk`, which utilizes the CometBFT consensus engine. +Byzantine Fault Tolerant (BFT) consensus algorithm requires that **at least** 2/3 of Validators +are online and voting for the same block to reach a consensus. In order to maintain liveness +and avoid a chain-halt, we need the majority (> 2/3) of Validators to participate +and use the same version of the software. + +## Resolving halts during a network upgrade -## Halt during network upgrade +If the halt is caused by the network upgrade, it is possible the solution can be as simple as +skipping an upgrade (i.e. `unsafe-skip-upgrade`) and creating a new (fixed) upgrade. -If the halt is caused by the network upgrade, it is possible the solution can be as simple as skipping an upgrade (`unsafe-skip-upgrade`) and creating a new (fixed) one. Read more about [upgrade contingency plans](../../protocol/upgrades/contigency_plans.md). +Read more about [upgrade contingency plans](../../protocol/upgrades/contigency_plans.md). 
+### Manual binary replacement (preferred) -## Replacing the binary manually (preferred) +:::note **This is preferred way of resolving the consensus-breaking issues**. -Since the chain is not moving, it would be impossible to issue an automatic upgrade with an upgrade plan. Instead, we need to gather a social consensus to manually replace the binary and get the chain moving. +::: -Currently this means breaking a way to sync the network from genesis without human interaction, but there are some plans to make the process less painful in the future. +Since the chain is not moving, **it is impossible** to issue an automatic upgrade with an upgrade plan. +Instead, we need **social consensus** to manually replace the binary and get the chain moving. - +Currently this involves synching the network from genesis breaking a way to sync the network from genesis without human interaction, but there are some plans to make the process less painful in the future. + -## Rollback, fork and upgrade +### Rollback, fork and upgrade :::info @@ -65,4 +76,4 @@ Performing a rollback basically means forking the network at the older height. M - On block `101`, we will submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. - `x/upgrade` performs an upgrade in the `EndBlocker` of the block `102` and waits for the node operator or `cosmovisor` to replace the binary. - The network should go through successful upgrade and climb to the next block. -- After the chain has been reached over the height of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can perform the rollback or use a snapshot as well. \ No newline at end of file +- After the chain has been reached over the height of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can perform the rollback or use a snapshot as well. 
diff --git a/go.mod b/go.mod index c9f3d9997..91de15f1d 100644 --- a/go.mod +++ b/go.mod @@ -79,15 +79,10 @@ require ( gopkg.in/yaml.v2 v2.4.0 ) -<<<<<<< HEAD -require github.com/jhump/protoreflect v1.16.0 -======= require ( cosmossdk.io/x/tx v0.13.4 github.com/jhump/protoreflect v1.16.0 - go.uber.org/mock v0.4.0 ) ->>>>>>> main require ( buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.34.2-20240508200655-46a4cf4ba109.2 // indirect @@ -100,7 +95,6 @@ require ( connectrpc.com/connect v1.16.2 // indirect connectrpc.com/otelconnect v0.7.0 // indirect cosmossdk.io/collections v0.4.0 // indirect - cosmossdk.io/x/tx v0.13.4 // indirect filippo.io/edwards25519 v1.0.0 // indirect github.com/99designs/go-keychain v0.0.0-20191008050251-8e49817e8af4 // indirect github.com/99designs/keyring v1.2.1 // indirect diff --git a/go.sum b/go.sum index b35917516..62dc61bb0 100644 --- a/go.sum +++ b/go.sum @@ -1213,8 +1213,6 @@ go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0 go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= -go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU= -go.uber.org/mock v0.4.0/go.mod h1:a6FSlNadKUHUa9IP5Vyt1zh4fC7uAwxMutEAscFbkZc= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= diff --git a/x/tokenomics/types/tx.pb.go b/x/tokenomics/types/tx.pb.go index e4fec264c..9f18a148c 100644 --- a/x/tokenomics/types/tx.pb.go +++ b/x/tokenomics/types/tx.pb.go @@ -125,7 +125,6 @@ type MsgUpdateParam struct { // specified in the `Params` message in `proof/params.proto.` Name string `protobuf:"bytes,2,opt,name=name,proto3" 
json:"name,omitempty"` // Types that are valid to be assigned to AsType: - // // *MsgUpdateParam_AsString // *MsgUpdateParam_AsInt64 // *MsgUpdateParam_AsBytes From 50cc08e0a6c130bbf9c8e939f90fc51402d17d75 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Mon, 21 Oct 2024 13:55:48 -0400 Subject: [PATCH 10/27] Partial review --- .../recovery_from_chain_halt.md | 74 +++++++++++++------ .../docs/protocol/upgrades/release_process.md | 6 +- 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 13e9991cb..82c432f7b 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -17,6 +17,8 @@ See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more infor - [Resolving halts during a network upgrade](#resolving-halts-during-a-network-upgrade) - [Manual binary replacement (preferred)](#manual-binary-replacement-preferred) - [Rollback, fork and upgrade](#rollback-fork-and-upgrade) + - [Step 5: Data rollback - retrieving snapshot at a specific height](#step-5-data-rollback---retrieving-snapshot-at-a-specific-height) + - [Step 6: Validator Isolation - risk mitigation](#step-6-validator-isolation---risk-mitigation) ## Background @@ -37,7 +39,7 @@ Read more about [upgrade contingency plans](../../protocol/upgrades/contigency_p :::note -**This is preferred way of resolving the consensus-breaking issues**. +This is preferred way of resolving the consensus-breaking issues. ::: @@ -53,27 +55,55 @@ Currently this involves synching the network from genesis breaking a way to sync :::info -This part is relevant for Pocket Network Shannon release only, as we do not rely on `x/gov` module for upgrades in Shannon. Instead, our DAO can issue upgrade transactions on the Pocket Network chain directly. 
Conventional `cosmos-sdk` upgrade process would require to go through the voting process to issue an upgrade. +These instructions are only relevant to Pocket Network's Shannon release. + +We do not currently use `x/gov` and on-chain voting for upgrades. + +Instead, our DAO votes on upgrades off-chain and the Foundation executes +transactions on their behalf. ::: -Performing a rollback basically means forking the network at the older height. Modern CometBFT versions are incredibly hard to fork. As a result, **it is not recommended to perform rollbacks** unless absolutely necessary. If we do decide to go ahead with a rollback, these are the steps: - -- Prepare and verify the new version that addresses the consensus-breaking issue. -- [Create a release](../../protocol/upgrades/release_process.md). -- [Prepare an upgrade transaction](../../protocol/upgrades/upgrade_procedure.md#writing-an-upgrade-transaction) to the new version. -- Get the state of the validators on the network to **three blocks** prior to the consensus-breaking issue. - - For example, if there was an issue at height `103`, we need to get the state to the height of `100`. At `101` we will submit an upgrade transaction so the chain upgrades on `102` and avoids the issue at height `103`. - - Can be done in two ways: - - `poktrolld rollback --hard` until the command responds with the desired block number. **OR,** - - The node can be restored from the snapshot and started with `--halt-height=100` parameter so it only syncs up to certain height and then gracefully shuts down. -- **Make sure all validators use the same data directory** or have been rolled back to the same height. -- **Isolate validators from the other nodes** that have not been rolled back to the older state. If that means using a firewall or isolating from the internet - this is the way. Validators should be able to only gossip blocks between themselves. 
**Having at least one node that has knowledge of the forking ledger can jeopardize the whole process**. In particular, the following errors are the sign of the nodes populating existing blocks: - - `found conflicting vote from ourselves; did you unsafe_reset a validator?` - - `conflicting votes from validator` -- Start the network and perform an upgrade (following the example above): - - We would not be able to submit a transaction at `100` (this needs to be investigated, but for some reason we were not able to) due to `signature verification failed; please verify account number (0) and chain-id (poktroll): (unable to verify single signer signature): unauthorized`. - - On block `101`, we will submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. - - `x/upgrade` performs an upgrade in the `EndBlocker` of the block `102` and waits for the node operator or `cosmovisor` to replace the binary. -- The network should go through successful upgrade and climb to the next block. -- After the chain has been reached over the height of the previous ledger (`104`+), validators can open the gates for other full nodes to join the network again. Full nodes can perform the rollback or use a snapshot as well. +**Performing a rollback is analogous to forking the network at the older height.** + +This should be avoided unless absolutely necessary. + +However, if necessary, the instructions to follow are: + +1. Prepare & verify a new binary that addresses the consensus-breaking issue. +2. [Create a release](../../protocol/upgrades/release_process.md). +3. [Prepare an upgrade transaction](../../protocol/upgrades/upgrade_procedure.md#writing-an-upgrade-transaction) to the new version. +4. Get the Validator set off the network **3 blocks** prior to the height of the chain halt. 
For example: + - Assume an issue at height `103` + - Get the validator set at height `100` + - Submit an upgrade transaction at `101` + - Upgrade the chain at height `102` + - Avoid the issue at height `103` +5. Ensure all validators rolled back to the same height and use the same snapshot + - The snapshot should be imported into each Validator's data directory + - This is necessary to ensure data continuity and prevent forks. +6. Isolate the validator set from full nodes. + - This is necessary to avoid full nodes from gossiping blocks that have been rolled back. + - This may require using a firewall or a private network + - Validators should only be gossip blocks amongst themselves. +7. Start the network and perform the upgrade. For example, reiterating the process above: + - Start all Validators at height `100` + - On block `101`, submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. + - `x/upgrade` will perform the upgrade in the `EndBlocker` of block `102` + - If using `cosmosvisor`, the node will wait to replace the binary +8. Wait for the network to reach the height of the previous ledger (`104`+) +9. Allow validators to open their network to full nodes again. + - Note that full nodes will need to perform the rollback or use a snapshot as well. + +#### Step 5: Data rollback - retrieving snapshot at a specific height + +There are two ways to get a snapshot from a prior height: + +1. Use `poktrolld rollback --hard` repeately until the command responds with the desired block number. +2. Use a snapshot and start the node with `--halt-height=100` parameter so it only syncs up to certain height and then gracefully shuts down. + +#### Step 6: Validator Isolation - risk mitigation + +- Having at least one node that has knowledge of the forking ledger can jeopardize the whole process. 
In particular, the following errors are the sign of the nodes populating existing blocks: +- `found conflicting vote from ourselves; did you unsafe_reset a validator?` +- `conflicting votes from validator` diff --git a/docusaurus/docs/protocol/upgrades/release_process.md b/docusaurus/docs/protocol/upgrades/release_process.md index 4a756d5a5..398d56c05 100644 --- a/docusaurus/docs/protocol/upgrades/release_process.md +++ b/docusaurus/docs/protocol/upgrades/release_process.md @@ -16,7 +16,6 @@ sidebar_position: 4 This document is for the Pocket Network protocol team's internal use only. ::: - ### 1. Determine if the Release is Consensus-Breaking :::note @@ -54,10 +53,8 @@ You can find an example [here](https://github.com/pokt-network/poktroll/releases ## Protocol Upgrades - **Planned Upgrade:** ❌ Not applicable for this release. @@ -66,6 +63,7 @@ such as https://github.com/pokt-network/poktroll/blob/main/app/upgrades/historic - **Upgrade Height:** ❌ Not applicable for this release. ## What's Changed + ``` From 3bddc15c1eb10b9310edbb39a20968be651c3e98 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Mon, 21 Oct 2024 14:12:16 -0400 Subject: [PATCH 11/27] Partial review --- .../protocol/upgrades/contigency_plans.md | 51 ++++++++++++------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 75438c9be..3ab90d586 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -3,47 +3,64 @@ title: Failed upgrade contingency plan sidebar_position: 5 --- -# Contingency plans - -There's always a chance the upgrade will fail. We have prepared some contingency plans, so we can recover without significant downtime. - :::tip -This documentation covers failed upgrade contingency for `poktroll` - a `cosmos-sdk` based chain. 
While this can be helpful for other blockchain networks, it is not guaranteed to work for other chains. +This documentation covers failed upgrade contingency for `poktroll` - a `cosmos-sdk` based chain. + +While this can be helpful for other blockchain networks, it is not guaranteed to work for other chains. ::: +## Contingency plans + +There's always a chance the upgrade will fail. + +This document is intended to help you recover without significant downtime. + - [Option 0: The bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) - [Option 1: The upgrade height is reached and the migration didn't start](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start) - [Option 2: The migration is stuck](#option-2-the-migration-is-stuck) - [Option 3: The network is stuck at the future height after the upgrade](#option-3-the-network-is-stuck-at-the-future-height-after-the-upgrade) +### Option 0: The bug is discovered before the upgrade height is reached + +**Cancel the upgrade plan!!** -## Option 0: The bug is discovered before the upgrade height is reached +See the instructions on [how to do that here](./upgrade_procedure.md#cancelling-the-upgrade-plan). -Cancel the upgrade plan: [how](./upgrade_procedure.md#cancelling-the-upgrade-plan). +### Option 1: The upgrade height is reached and the migration didn't start -## Option 1: The upgrade height is reached and the migration didn't start +If the nodes on the network stopped at the upgrade height and the migration did not +start yet (i.e. there are no logs indicating the upgrade handler and store migrations are being executed), +we must gather social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag.
-If the nodes on the network stopped at the upgrade height and the migration did not start yet (there are no logs indicating the upgrade handler and store migrations are being executed), we should gather a social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. This will skip the upgrade process, allowing the chain to continue and the protocol team to plan another release. +This will skip the upgrade process, allowing the chain to continue and the protocol team to plan another release. -`--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations, and the chain continues as if the upgrade plan was never set. The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. +`--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations. +The chain continues as if the upgrade plan was never set. +The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. :::caution + `--unsafe-skip-upgrade` needs to be documented and added to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. - + + ::: -## Option 2: The migration is stuck +### Option 2: The migration is stuck + +If the migration is stuck, there's always a chance the state has been mutated for +the upgrade but the migration didn't complete. -If the migration is stuck, there's always a chance the state has been mutated for the upgrade but the migration didn't complete. In such a case, we need to: +In such a case, we need to: -- Roll back validators to the backup (a snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`). +- Roll back validators to the backup (a snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`). 
- Skip the upgrade handler and store migrations with `--unsafe-skip-upgrade=$upgradeHeightNumber`. -- Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. +- Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so the next time somebody tries to sync the network from genesis + they will automatically skip the failed upgrade. - Resolve the issue with an upgrade and schedule another plan. -## Option 3: The network is stuck at the future height after the upgrade +### Option 3: The network is stuck at the future height after the upgrade -This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. \ No newline at end of file +This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. 
From 498d9d8f53e62e5e8e7e4837833c1977ca1777f8 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Mon, 21 Oct 2024 14:26:41 -0400 Subject: [PATCH 12/27] Partial review --- .../protocol/upgrades/upgrade_procedure.md | 134 ++++++++++++------ 1 file changed, 91 insertions(+), 43 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index a5f990f54..668bee067 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -17,8 +17,8 @@ This page describes the protocol upgrade process, which is internal to the proto - [Cancelling the upgrade plan](#cancelling-the-upgrade-plan) - [Testing the Upgrade](#testing-the-upgrade) - [LocalNet](#localnet) - - [TLDR](#tldr) - - [Full example](#full-example) + - [LocalNet Upgrade tl;dr](#localnet-upgrade-tldr) + - [LocalNet Upgrade Full Example Walkthrough](#localnet-upgrade-full-example-walkthrough) - [DevNet](#devnet) - [TestNet](#testnet) - [Mainnet](#mainnet) @@ -71,22 +71,20 @@ An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/bl - `name`: Name of the upgrade. It should match the `VersionName` of `upgrades.Upgrade`. - `height`: The height at which an upgrade should be executed and the node will be restarted. -- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. When `cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is also optional). We only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. +- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. -### Validate the URLs (live network only) +:::tip -The URLs of the binaries contain checksums. 
It is important to make sure they are correct, otherwise Cosmovisor won't be able -to download the binaries and go through the upgrade. Here's a little command that uses `jq` and `go-getter` (same library used by Cosmovisor - so it is a good test). +When `cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is also optional). We only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. -:::tip +::: -Go-getter can be installed using the following command: +### Validate the URLs (live network only) -```bash -go install github.com/hashicorp/go-getter/cmd/go-getter@latest -``` +The URLs of the binaries contain checksums. It is critical to ensure they are correct. +Otherwise Cosmovisor won't be able to download the binaries and go through the upgrade. -::: +The command below (using `jq` and `go-getter`, the same library used by Cosmovisor) can be used to validate the URLs: ```bash jq -r '.body.messages[0].plan.info | fromjson | .binaries[]' $PATH_TO_UPGRADE_TRANSACTION_JSON | while IFS= read -r url; do @@ -103,6 +101,16 @@ The output should look like this: 2024/09/24 12:40:46 success! ``` +:::tip + +`go-getter` can be installed using the following command: + +```bash +go install github.com/hashicorp/go-getter/cmd/go-getter@latest +``` + +::: + ## Submitting the upgrade on-chain The `MsgSoftwareUpgrade` can be submitted using the following command: @@ -133,48 +141,88 @@ Note that for local testing, `cosmovisor` won't pull the binary from the info fi ### LocalNet -LocalNet does not support `cosmovisor` and automatic upgrades at the moment. But we don't need it to simulate and test the upgrade procedure. +LocalNet **DOES NOT** support `cosmovisor` and automatic upgrades at the moment. + +However, **IT IS NOT NEEDED** to simulate and test the upgrade procedure. + +#### LocalNet Upgrade tl;dr -#### TLDR +1.
Pull git repo with old version (separate directory) +2. Download release binary of the old version +3. Wipe LocalNet data and generate genesis using OLD version +4. Start node using the OLD binary +5. Write and submit an upgrade transaction on-chain +6. When the Upgrade Plan height is reached, stop the old node and run the new binary +7. Observe the behavior -In short, the procedure is: -- Pull git repo with old version (separate directory) -- Download release binary of the old version -- Wipe localnet data and generate genesis using OLD version -- Start node using OLD binary -- Write and submit an upgrade transaction on-chain -- When the Upgrade Plan height is reached, stop the old node and run the new binary -- Observe the behavior +#### LocalNet Upgrade Full Example Walkthrough -#### Full example +Testing an upgrade requires a network running on an old version. -As we are testing an upgrade, we need to have a network that first runs on the old version. So it is a good idea to have a LocalNet running using a binary from the [previous release you wish to upgrade **FROM**](https://github.com/pokt-network/poktroll/releases). We also want to provision the network using this version, which requires us to pull the specific git tag. +Ensure LocalNet is running using a binary from the [previous release you wish to upgrade **FROM**](https://github.com/pokt-network/poktroll/releases). We also want to provision the network using this version, which requires us to pull the specific git tag. 1. Make a note of the version you want to test an upgrade **FROM**. This will be the **OLD** version. For example, let's imagine we're upgrading from `v0.0.9`. 2.
Pull a new `poktroll` repo (will be used as an "old" version): - ```bash - git clone https://github.com/pokt-network/poktroll.git poktroll-upgrade-old - cd poktroll-upgrade-old - git checkout v0.0.9 - # Download the v0.0.9 binary: https://github.com/pokt-network/poktroll/releases - # CHANGE POKTROLLD_VERSION and ARCH - curl -L "https://github.com/pokt-network/poktroll/releases/download/${POKTROLLD_VERSION}/poktroll_linux_${ARCH}.tar.gz" | tar -zxvf - -C . + ```bash + git clone https://github.com/pokt-network/poktroll.git poktroll-upgrade-old + cd poktroll-upgrade-old + git checkout v0.0.9 + + # Download the v0.0.9 binary: https://github.com/pokt-network/poktroll/releases + # CHANGE POKTROLLD_VERSION and ARCH + curl -L "https://github.com/pokt-network/poktroll/releases/download/${POKTROLLD_VERSION}/poktroll_linux_${ARCH}.tar.gz" | tar -zxvf - -C . + + # Validate the version + ./poktrolld version + 0.0.9 + ``` + +3. Stop LocalNet + + ```bash + make localnet_down + ``` + +4. Reset the data + + ```bash + ./poktrolld comet unsafe-reset-all + ``` + +5. Create new genesis using old version (from `poktroll-upgrade-old` dir) + + ```bash + make localnet_regenesis + ``` + +6. Start the network + + ```bash + ./poktrolld start + ``` + +7. [Write](#writing-an-upgrade-transaction) and [Submit](#submitting-the-upgrade-on-chain) a transaction. For example: + + ```bash + poktrolld tx authz exec tools/scripts/upgrades/local_test_v0.0.9-2.json --from=pnf + ``` + +8. Verify the plan is active + + ```bash + poktrolld query upgrade plan + ``` - # Validate the version - ./poktrolld version - 0.0.9 - ``` -3. Stop LocalNet: `make localnet_down` -4. Reset the data: `./poktrolld comet unsafe-reset-all` -5. Create new genesis using old version (from `poktroll-upgrade-old` dir): `make localnet_regenesis` -6. Start the network: `./poktrolld start` -7. [Write](#writing-an-upgrade-transaction) and [Submit](#submitting-the-upgrade-on-chain) a transaction (e.g.
`poktrolld tx authz exec tools/scripts/upgrades/local_test_v0.0.9-2.json --from=pnf`) -8. Verify the plan is active: `poktrolld query upgrade plan` 9. Wait until the height is reached and the old node dies due to the error: `ERR UPGRADE "v0.0.9-2" NEEDED at height`, which is expected. 10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade the network **TO**. 11. In the **NEW VERSION GIT REPO** you can build binaries using `go_develop`, `ignite_release` and `ignite_release_extract_binaries` make targets. -12. Start the new version (from the **NEW VERSION REPO**: `./release_binaries/poktroll_darwin_arm64 start`) +12. Start the new version from the **NEW VERSION REPO**: + + ```bash + ./release_binaries/poktroll_darwin_arm64 start + ``` + 13. Observe the behavior. Your node should go through the upgrade process and start using the new version. ### DevNet @@ -199,4 +247,4 @@ If you are a member of Grove, you can find the instructions to access the infras ### Mainnet -The Mainnet upgrade process is to be determined. We aim to develop and implement improved tooling for this environment. \ No newline at end of file +The Mainnet upgrade process is to be determined. We aim to develop and implement improved tooling for this environment. 
From be35f1a80cf9f4c98e9f3a63b5b7889cb0f72b01 Mon Sep 17 00:00:00 2001 From: Dmitry K Date: Wed, 23 Oct 2024 17:41:04 -0700 Subject: [PATCH 13/27] requested changes --- .../recovery_from_chain_halt.md | 100 ++++++++++++++---- .../protocol/upgrades/contigency_plans.md | 23 +++- .../protocol/upgrades/upgrade_procedure.md | 5 +- tools/installer/full-node.sh | 3 +- 4 files changed, 105 insertions(+), 26 deletions(-) diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 82c432f7b..03a399052 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -18,7 +18,7 @@ See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more infor - [Manual binary replacement (preferred)](#manual-binary-replacement-preferred) - [Rollback, fork and upgrade](#rollback-fork-and-upgrade) - [Step 5: Data rollback - retrieving snapshot at a specific height](#step-5-data-rollback---retrieving-snapshot-at-a-specific-height) - - [Step 6: Validator Isolation - risk mitigation](#step-6-validator-isolation---risk-mitigation) + - [Step 6: Validator Isolation - risks](#step-6-validator-isolation---risks) ## Background @@ -39,17 +39,49 @@ Read more about [upgrade contingency plans](../../protocol/upgrades/contigency_p :::note -This is preferred way of resolving the consensus-breaking issues. +This is the preferred way of resolving consensus-breaking issues. -::: - -Since the chain is not moving, **it is impossible** to issue an automatic upgrade with an upgrade plan. +**Significant side effect**: this breaks an ability to sync from genesis **without manual interventions**. +For example, when a consensus-breaking issue occurs on a node that is synching from the first block, node operators need +to manually replace the binary with the new one. 
There are efforts underway to mitigate this issue, including +configuration for `cosmovisor` that could automate the process. -Instead, we need **social consensus** to manually replace the binary and get the chain moving. + -Currently this involves synching the network from genesis breaking a way to sync the network from genesis without human interaction, but there are some plans to make the process less painful in the future. +::: - +Since the chain is not moving, **it is impossible** to issue an automatic upgrade with an upgrade plan. Instead, +we need **social consensus** to manually replace the binary and get the chain moving. + +1. Prepare and verify a new binary that addresses the consensus-breaking issue. +2. Reach out to the community and validators so they can upgrade the binary manually. + :::warning UNKNOWN, NEED TO INVESTIGATE + + We might need to coordinate the timing of when the nodes should be started. In the Tendermint version of Pocket Network + (Morse), this was necessary to sync consensus rounds and steps, getting the chain moving. It might not be a + requirement anymore, but we need to double-check. [More information](https://docs.cometbft.com/v1.0/spec/consensus/consensus). +3. Update [the documentation](../../protocol/upgrades/upgrade_list.md) to include the range of heights at which the binary needs + to be replaced. Consider a configuration change for `cosmovisor` so it would automatically replace the binary when + synching from genesis.
+ + +```mermaid +sequenceDiagram + participant DevTeam + participant Community + participant Validators + participant Documentation + participant Network + + DevTeam->>DevTeam: Prepare and verify new binary + DevTeam->>Community: Announce new binary and instructions + DevTeam->>Validators: Notify validators to upgrade manually + Validators->>Validators: Manually replace the binary + Validators->>Network: Restart nodes with new binary + DevTeam->>Documentation: Update upgrade documentation + Validators->>Network: Network resumes operation + +``` ### Rollback, fork and upgrade @@ -79,10 +111,10 @@ However, if necessary, the instructions to follow are: - Submit an upgrade transaction at `101` - Upgrade the chain at height `102` - Avoid the issue at height `103` -5. Ensure all validators rolled back to the same height and use the same snapshot +5. Ensure all validators rolled back to the same height and use the same snapshot - ([how to get the snapshot](#step-5-data-rollback---retrieving-snapshot-at-a-specific-height)) - The snapshot should be imported into each Validator's data directory - This is necessary to ensure data continuity and prevent forks. -6. Isolate the validator set from full nodes. +6. Isolate the validator set from full nodes - ([why this is necessary](#step-6-validator-isolation---risks)) - This is necessary to avoid full nodes from gossiping blocks that have been rolled back. - This may require using a firewall or a private network - Validators should only be gossip blocks amongst themselves. @@ -95,15 +127,47 @@ However, if necessary, the instructions to follow are: 9. Allow validators to open their network to full nodes again. - Note that full nodes will need to perform the rollback or use a snapshot as well. 
+```mermaid +sequenceDiagram + participant DevTeam + participant Foundation + participant Validators + participant FullNodes + participant Network + + DevTeam->>DevTeam: Prepare & verify new binary + DevTeam->>DevTeam: Create a release + Validators->>Validators: Roll back to height before issue or import snapshot + Validators->>Validators: Isolate from Full Nodes + Foundation->>Validators: Distribute upgrade transaction + Validators->>Network: Start network and perform upgrade + Validators->>Network: Wait until over consensus-breaking height + Validators->>FullNodes: Open network connections + FullNodes->>Network: Sync with updated network + Validators->>Network: Network resumes operation + +``` + #### Step 5: Data rollback - retrieving snapshot at a specific height There are two ways to get a snapshot from a prior height: -1. Use `poktrolld rollback --hard` repeately until the command responds with the desired block number. -2. Use a snapshot and start the node with `--halt-height=100` parameter so it only syncs up to certain height and then gracefully shuts down. - -#### Step 6: Validator Isolation - risk mitigation - -- Having at least one node that has knowledge of the forking ledger can jeopardize the whole process. In particular, the following errors are the sign of the nodes populating existing blocks: -- `found conflicting vote from ourselves; did you unsafe_reset a validator?` -- `conflicting votes from validator` +1. Execute + ```bash + poktrolld rollback --hard + ``` + repeatedly until the command responds with the desired block number. +2. Use a snapshot and start the node with `--halt-height=100` parameter so it only syncs up to a certain height and then + gracefully shuts down. Add this argument to `poktrolld start` like this: + ```bash + poktrolld start --halt-height=100 + ``` + + +#### Step 6: Validator Isolation - risks + +Having at least one node that has knowledge of the forking ledger can jeopardize the whole process. 
In particular, the +following errors in logs are the sign of the nodes populating existing blocks: + - `found conflicting vote from ourselves; did you unsafe_reset a validator?` + - `conflicting votes from validator` + diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 3ab90d586..32b254cad 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -20,11 +20,12 @@ This document is intended to help you recover without significant downtime. - [Option 0: The bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) - [Option 1: The upgrade height is reached and the migration didn't start](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start) - [Option 2: The migration is stuck](#option-2-the-migration-is-stuck) + - [Documentation and scripts to update](#documentation-and-scripts-to-update) - [Option 3: The network is stuck at the future height after the upgrade](#option-3-the-network-is-stuck-at-the-future-height-after-the-upgrade) ### Option 0: The bug is discovered before the upgrade height is reached -**Cancel the upgrade plan!!** +**Cancel the upgrade plan!** See the instructions of [how to do that here](./upgrade_procedure.md#cancelling-the-upgrade-plan). @@ -55,12 +56,24 @@ the upgrade but the migration didn't complete. In such a case, we need to: -- Roll back validators to the backup (a snapshot is taken by `cosmovisor` automatically prior to upgrade, if `UNSAFE_SKIP_BACKUP` is set to `false`). -- Skip the upgrade handler and store migrations with `--unsafe-skip-upgrade=$upgradeHeightNumber`. -- Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. +- Roll back validators to the backup. 
A snapshot is taken by `cosmovisor` automatically prior to upgrade, + if `UNSAFE_SKIP_BACKUP` is set to `false` (which is a default and recommended value - + [more information](https://docs.cosmos.network/main/build/tooling/cosmovisor#command-line-arguments-and-environment-variables)). +- All full nodes and validators on the network: skip the upgrade handler and store migrations be adding `--unsafe-skip-upgrade=$upgradeHeightNumber` + argument to your `poktroll start` command. Like this: + ```bash + poktrolld start --unsafe-skip-upgrade=$upgradeHeightNumber + ``` +- Protocol team: document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so the next time somebody + tries to sync the network from genesis they will automatically skip the failed upgrade. [Documentation and scripts to update](#documentation-and-scripts-to-update) - Resolve the issue with an upgrade and schedule another plan. +#### Documentation and scripts to update + +- The [upgrade list](./upgrade_list.md) should reflect a failed upgrade and provide a range of heights that served by each version. +- Systemd service should include`--unsafe-skip-upgrade=$upgradeHeightNumber` argument in its start command [here](https://github.com/pokt-network/poktroll/blob/main/tools/installer/full-node.sh). +- [Helm chart](https://github.com/pokt-network/helm-charts/blob/main/charts/poktrolld/templates/StatefulSet.yaml) (consider exposing via a `values.yaml` file) + ### Option 3: The network is stuck at the future height after the upgrade This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. 
diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 668bee067..76ec0a824 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -215,7 +215,8 @@ Ensure LocalNet is running using a binary from the [previous release you wish to ``` 9. Wait until the height is reached and the old node dies due to the error: `ERR UPGRADE "v0.0.9-2" NEEDED at height`, which is expected. -10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade the network **TO**. +10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade the network **TO**. It might be a + `poktroll` repo you working on or a release tag. 11. In the **NEW VERSION GIT REPO** you can build binaries using `go_develop`, `ignite_release` and `ignite_release_extract_binaries` make targets. 12. Start the new version from the **NEW VERSION REPO**: @@ -223,7 +224,7 @@ Ensure LocalNet is running using a binary from the [previous release you wish to ./release_binaries/poktroll_darwin_arm64 start ``` -13. Observe the behavior. Your node should go through the upgrade process and start using the new version. +13. Observe the output. Your node should go through the upgrade process and start using the new version. ### DevNet diff --git a/tools/installer/full-node.sh b/tools/installer/full-node.sh index 7ee542336..610df7a83 100644 --- a/tools/installer/full-node.sh +++ b/tools/installer/full-node.sh @@ -138,7 +138,8 @@ setup_poktrolld() { exit 1 fi - # Get the version genesis started from + # Get the version genesis started from. We can't just use `latest` as the new binary won't sync from genesis. + # We need to start syncing from scratch using the version that was used when the network started. 
POKTROLLD_VERSION=$(curl -s https://raw.githubusercontent.com/pokt-network/pocket-network-genesis/master/poktrolld/testnet-validated.init-version) # Use the direct download link for the correct release From 7ce8194fbb05db0988b875d35e56c6a9b42913af Mon Sep 17 00:00:00 2001 From: Dmitry K Date: Mon, 18 Nov 2024 16:10:43 -0800 Subject: [PATCH 14/27] change localnet upgrade docs --- .../protocol/upgrades/upgrade_procedure.md | 90 ++++++++----------- 1 file changed, 36 insertions(+), 54 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 76ec0a824..016c88d10 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -17,8 +17,7 @@ This page describes the protocol upgrade process, which is internal to the proto - [Cancelling the upgrade plan](#cancelling-the-upgrade-plan) - [Testing the Upgrade](#testing-the-upgrade) - [LocalNet](#localnet) - - [LocalNet Upgrade tl;dr](#localnet-upgrade-tldr) - - [LocalNet Upgrade Full Example Walkthrough](#localnet-upgrade-full-example-walkthrough) + - [LocalNet Upgrade Cheat Sheet](#localnet-upgrade-cheat-sheet) - [DevNet](#devnet) - [TestNet](#testnet) - [Mainnet](#mainnet) @@ -136,7 +135,7 @@ poktrolld tx authz exec tools/scripts/upgrades/authz_cancel_upgrade_tx.json --ga ## Testing the Upgrade :::warning -Note that for local testing, `cosmovisor` won't pull the binary from the info field. +Note that for local testing, `cosmovisor` won't pull the binary from the upgrade Plan's info field. ::: ### LocalNet @@ -145,86 +144,69 @@ LocalNet **DOES NOT** support `cosmovisor` and automatic upgrades at the moment. However, **IT IS NOT NEEDED** to simulate and test the upgrade procedure. -#### LocalNet Upgrade tl;dr +#### LocalNet Upgrade Cheat Sheet -1. Pull git repo with old version (separate directory) -2. Download release binary of the old version -3. 
Wipe LocalNet data and generate genesis using OLD version -4. Start node using anOLD binary -5. Write and submit an upgrade transaction on-chain -6. When the Upgrade Plan height is reached, stop the old node and run the new binary -7. Observe the behavior +For a hypothetical scenario to upgrade from `0.1` to `0.2`: -#### LocalNet Upgrade Full Example Walkthrough - -Testing an upgrade requires a network running on an old version. - -Ensure LocalNet is running using a binary from the [previous release you wish to upgrade **FROM**](https://github.com/pokt-network/poktroll/releases). We also want to provision the network using this version, which requires us to pull the specific git tag. - -1. Make a note of the version you want to test an upgrade **FROM**. This will be the **OLD** version. For example, let's imagine we're upgrading from `v0.0.9`. -2. Pull a new `poktroll` repo (will be used as an "old" version): +1. **Stop LocalNet** to prevent interference. Pull the `poktroll` repo into two separate directories. Let's name them `old` and `new`. It is recommended to open at least two tabs/shell panels in each directory for easier switching between directories. +2. **(`old` repo)** - Check out the old version. For the test to be accurate, we need to upgrade from the correct version. ```bash - git clone https://github.com/pokt-network/poktroll.git poktroll-upgrade-old - cd poktroll-upgrade-old - git checkout v0.0.9 - - # Download the v0.0.9 binary: https://github.com/pokt-network/poktroll/releases - # CHANGE POKTROLLD_VERSION and ARCH - curl -L "https://github.com/pokt-network/poktroll/releases/download/${POKTROLLD_VERSION}/poktroll_linux_${ARCH}.tar.gz" | tar -zxvf - -C . - - # Validate the version - ./poktrolld version - 0.0.9 + git checkout v0.1 ``` -3. Stop LocalNet - +3. **(`new` repo)** ```bash - make localnet_down + git checkout -b branch_to_test ``` + Replace `branch_to_test` with the actual branch you want to test. 
**Note:** This branch should have an upgrade implemented - [Implementing the Upgrade](#implementing-the-upgrade). Here, the upgrade should be named `v0.2`. -4. Reset the data - +4. **(BOTH repos)** - We'll use binaries from both versions - old and new. ```bash - ./poktrolld comet unsafe-reset-all + make go_develop ignite_release ignite_release_extract_binaries ``` + :::note + The binary produced by these commands in the old repo should result in the same binary as it was downloaded from [production releases](https://github.com/pokt-network/poktroll/releases), however you might consider using + ::: -5. Create new genesis using old version (from `poktroll-upgrade-old` dir) - +5. **(`old` repo)** - Clean up and generate an empty genesis using the old version. ```bash - make localnet_regenesis + rm -rf ~/.poktroll && ./release_binaries/poktroll_darwin_arm64 comet unsafe-reset-all && make localnet_regenesis ``` -6. Start the network +6. **(`old` repo)** Write and save [an upgrade transaction](#writing-an-upgrade-transaction) for `v0.2`. The upgrade plan should be named after the version to which you're upgrading. +7. **(`old` repo)** Start the node: ```bash - ./poktrolld start + ./release_binaries/poktroll_darwin_arm64 start ``` + The validator node should run and produce blocks as expected. -7. [Write](#writing-an-upgrade-transaction) and [Submit](#submitting-the-upgrade-on-chain) a transaction. For example: - +8. **(`old` repo)** Submit the upgrade transaction. **Note:** The upgrade height in the transaction should be higher than the current block height. Adjust and submit if necessary: ```bash - poktrolld tx authz exec tools/scripts/upgrades/local_test_v0.0.9-2.json --from=pnf` + ./release_binaries/poktroll_darwin_arm64 tx authz exec tools/scripts/upgrades/local_test_v0.2.json --from=pnf ``` - -8. Verify the plan is active - + Replace the path to the JSON transaction with your prepared upgrade transaction. 
Verify the upgrade plan was submitted and accepted: ```bash - poktrolld query upgrade plan + ./release_binaries/poktroll_darwin_arm64 query upgrade plan ``` -9. Wait until the height is reached and the old node dies due to the error: `ERR UPGRADE "v0.0.9-2" NEEDED at height`, which is expected. -10. At this point, switch to the repo with the **NEW** version - the code you wish to upgrade the network **TO**. It might be a - `poktroll` repo you working on or a release tag. -11. In the **NEW VERSION GIT REPO** you can build binaries using `go_develop`, `ignite_release` and `ignite_release_extract_binaries` make targets. -12. Start the new version from the **NEW VERSION REPO**: +9. Wait for the upgrade height to be reached on the old version. The old version should stop working since it has no knowledge of the `v0.2` upgrade. This simulates a real-world scenario. Stop the old node, and switch to the new version. +10. **(`new` repo)** ```bash ./release_binaries/poktroll_darwin_arm64 start ``` -13. Observe the output. Your node should go through the upgrade process and start using the new version. +11. **Observe the output:** + - A successful upgrade should output `applying upgrade "v0.2" at height: 20 module=x/upgrade`. + - The node on the new version should continue producing blocks. + - If there were errors during the upgrade, investigate and address them. + +12. **(`new` repo, optional**) - If parameters were changed during the upgrade, test if these changes were applied. 
For example: + ```bash + ./release_binaries/poktroll_darwin_arm64 q application params + ``` ### DevNet From a2a03ba38e124232f999dfa3458ba0e78652b535 Mon Sep 17 00:00:00 2001 From: Dmitry K Date: Mon, 18 Nov 2024 16:41:31 -0800 Subject: [PATCH 15/27] more requested changes --- .../protocol/upgrades/contigency_plans.md | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 32b254cad..6b6d35041 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -20,8 +20,8 @@ This document is intended to help you recover without significant downtime. - [Option 0: The bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) - [Option 1: The upgrade height is reached and the migration didn't start](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start) - [Option 2: The migration is stuck](#option-2-the-migration-is-stuck) - - [Documentation and scripts to update](#documentation-and-scripts-to-update) - [Option 3: The network is stuck at the future height after the upgrade](#option-3-the-network-is-stuck-at-the-future-height-after-the-upgrade) +- [Documentation and scripts to update](#documentation-and-scripts-to-update) ### Option 0: The bug is discovered before the upgrade height is reached @@ -43,7 +43,7 @@ The upgrade needs to be fixed, and then a new plan needs to be submitted to the :::caution -`--unsafe-skip-upgrade` needs to be documented and added to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. 
+`--unsafe-skip-upgrade` needs to be documented in the list of upgrades and added to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. [Documentation and scripts to update](#documentation-and-scripts-to-update) @@ -56,24 +56,26 @@ the upgrade but the migration didn't complete. In such a case, we need to: -- Roll back validators to the backup. A snapshot is taken by `cosmovisor` automatically prior to upgrade, - if `UNSAFE_SKIP_BACKUP` is set to `false` (which is a default and recommended value - +- Roll back validators to the backup. A snapshot is taken by `cosmovisor` automatically prior to upgrade when `UNSAFE_SKIP_BACKUP` is set to `false` (which is the default and recommended value - [more information](https://docs.cosmos.network/main/build/tooling/cosmovisor#command-line-arguments-and-environment-variables)). -- All full nodes and validators on the network: skip the upgrade handler and store migrations be adding `--unsafe-skip-upgrade=$upgradeHeightNumber` +- All full nodes and validators on the network: skip the upgrade by adding `--unsafe-skip-upgrade=$upgradeHeightNumber` argument to your `poktroll start` command. Like this: ```bash - poktrolld start --unsafe-skip-upgrade=$upgradeHeightNumber + poktrolld start --unsafe-skip-upgrade=$upgradeHeightNumber # ... the rest of the arguments ``` -- Protocol team: document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts so the next time somebody +- Protocol team: document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts (such as docker-compose and cosmovisor installer) so the next time somebody tries to sync the network from genesis they will automatically skip the failed upgrade. [Documentation and scripts to update](#documentation-and-scripts-to-update) - Resolve the issue with an upgrade and schedule another plan. 
-#### Documentation and scripts to update - -- The [upgrade list](./upgrade_list.md) should reflect a failed upgrade and provide a range of heights that served by each version. -- Systemd service should include`--unsafe-skip-upgrade=$upgradeHeightNumber` argument in its start command [here](https://github.com/pokt-network/poktroll/blob/main/tools/installer/full-node.sh). -- [Helm chart](https://github.com/pokt-network/helm-charts/blob/main/charts/poktrolld/templates/StatefulSet.yaml) (consider exposing via a `values.yaml` file) + ### Option 3: The network is stuck at the future height after the upgrade This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. + +### Documentation and scripts to update + +- The [upgrade list](./upgrade_list.md) should reflect a failed upgrade and provide a range of heights served by each version. +- Systemd service should include the `--unsafe-skip-upgrade=$upgradeHeightNumber` argument in its start command [here](https://github.com/pokt-network/poktroll/blob/main/tools/installer/full-node.sh). 
+- [Helm chart](https://github.com/pokt-network/helm-charts/blob/main/charts/poktrolld/templates/StatefulSet.yaml) (consider exposing via a `values.yaml` file) +- [docker-compose](https://github.com/pokt-network/poktroll-docker-compose-example/tree/main/scripts) example \ No newline at end of file From f5a6d0e163f0b2d115a235ec7806ed4d74ffbff8 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Tue, 19 Nov 2024 15:56:26 -0800 Subject: [PATCH 16/27] WIP --- .../protocol/upgrades/upgrade_procedure.md | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 016c88d10..913da0948 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -151,25 +151,31 @@ For a hypothetical scenario to upgrade from `0.1` to `0.2`: 1. **Stop LocalNet** to prevent interference. Pull the `poktroll` repo into two separate directories. Let's name them `old` and `new`. It is recommended to open at least two tabs/shell panels in each directory for easier switching between directories. 2. **(`old` repo)** - Check out the old version. For the test to be accurate, we need to upgrade from the correct version. + ```bash git checkout v0.1 ``` -3. **(`new` repo)** +3. **(`new` repo)** + ```bash git checkout -b branch_to_test ``` + Replace `branch_to_test` with the actual branch you want to test. **Note:** This branch should have an upgrade implemented - [Implementing the Upgrade](#implementing-the-upgrade). Here, the upgrade should be named `v0.2`. 4. **(BOTH repos)** - We'll use binaries from both versions - old and new. 
+ ```bash make go_develop ignite_release ignite_release_extract_binaries ``` + :::note - The binary produced by these commands in the old repo should result in the same binary as it was downloaded from [production releases](https://github.com/pokt-network/poktroll/releases), however you might consider using + The binary produced by these commands in the old repo should result in the same binary as it was downloaded from [production releases](https://github.com/pokt-network/poktroll/releases), however you might consider using ::: 5. **(`old` repo)** - Clean up and generate an empty genesis using the old version. + ```bash rm -rf ~/.poktroll && ./release_binaries/poktroll_darwin_arm64 comet unsafe-reset-all && make localnet_regenesis ``` @@ -177,28 +183,35 @@ For a hypothetical scenario to upgrade from `0.1` to `0.2`: 6. **(`old` repo)** Write and save [an upgrade transaction](#writing-an-upgrade-transaction) for `v0.2`. The upgrade plan should be named after the version to which you're upgrading. 7. **(`old` repo)** Start the node: + ```bash ./release_binaries/poktroll_darwin_arm64 start ``` + The validator node should run and produce blocks as expected. 8. **(`old` repo)** Submit the upgrade transaction. **Note:** The upgrade height in the transaction should be higher than the current block height. Adjust and submit if necessary: + ```bash ./release_binaries/poktroll_darwin_arm64 tx authz exec tools/scripts/upgrades/local_test_v0.2.json --from=pnf ``` + Replace the path to the JSON transaction with your prepared upgrade transaction. Verify the upgrade plan was submitted and accepted: + ```bash ./release_binaries/poktroll_darwin_arm64 query upgrade plan ``` 9. Wait for the upgrade height to be reached on the old version. The old version should stop working since it has no knowledge of the `v0.2` upgrade. This simulates a real-world scenario. Stop the old node, and switch to the new version. -10. **(`new` repo)** +10. 
**(`new` repo)** + ```bash ./release_binaries/poktroll_darwin_arm64 start ``` 11. **Observe the output:** + - A successful upgrade should output `applying upgrade "v0.2" at height: 20 module=x/upgrade`. - The node on the new version should continue producing blocks. - If there were errors during the upgrade, investigate and address them. From 7113cde83627fe9964b1cd944f20d1169d3f595e Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Wed, 20 Nov 2024 11:38:55 -0800 Subject: [PATCH 17/27] WIP review --- api/poktroll/application/event.pulsar.go | 2 +- api/poktroll/application/types.pulsar.go | 2 +- api/poktroll/supplier/event.pulsar.go | 2 +- api/poktroll/supplier/tx.pulsar.go | 2 +- api/poktroll/tokenomics/event.pulsar.go | 2 +- api/poktroll/tokenomics/types.pulsar.go | 2 +- .../develop/developer_guide/adding_params.md | 8 ++------ .../recovery_from_chain_halt.md | 16 ++++++++------- .../protocol/upgrades/upgrade_procedure.md | 20 ++++++++++++++----- 9 files changed, 32 insertions(+), 24 deletions(-) diff --git a/api/poktroll/application/event.pulsar.go b/api/poktroll/application/event.pulsar.go index 292a46750..09bac55c2 100644 --- a/api/poktroll/application/event.pulsar.go +++ b/api/poktroll/application/event.pulsar.go @@ -3,11 +3,11 @@ package application import ( _ "cosmossdk.io/api/cosmos/base/v1beta1" + _ "github.com/pokt-network/poktroll/api/poktroll/shared" fmt "fmt" _ "github.com/cosmos/cosmos-proto" runtime "github.com/cosmos/cosmos-proto/runtime" _ "github.com/cosmos/gogoproto/gogoproto" - _ "github.com/pokt-network/poktroll/api/poktroll/shared" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoiface "google.golang.org/protobuf/runtime/protoiface" protoimpl "google.golang.org/protobuf/runtime/protoimpl" diff --git a/api/poktroll/application/types.pulsar.go b/api/poktroll/application/types.pulsar.go index ca032ba67..f482e77bb 100644 --- a/api/poktroll/application/types.pulsar.go +++ b/api/poktroll/application/types.pulsar.go @@ -3,11 
+3,11 @@ package application import ( v1beta1 "cosmossdk.io/api/cosmos/base/v1beta1" + shared "github.com/pokt-network/poktroll/api/poktroll/shared" fmt "fmt" _ "github.com/cosmos/cosmos-proto" runtime "github.com/cosmos/cosmos-proto/runtime" _ "github.com/cosmos/gogoproto/gogoproto" - shared "github.com/pokt-network/poktroll/api/poktroll/shared" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoiface "google.golang.org/protobuf/runtime/protoiface" protoimpl "google.golang.org/protobuf/runtime/protoimpl" diff --git a/api/poktroll/supplier/event.pulsar.go b/api/poktroll/supplier/event.pulsar.go index 8558365eb..22dcc4989 100644 --- a/api/poktroll/supplier/event.pulsar.go +++ b/api/poktroll/supplier/event.pulsar.go @@ -3,11 +3,11 @@ package supplier import ( _ "cosmossdk.io/api/cosmos/base/v1beta1" + shared "github.com/pokt-network/poktroll/api/poktroll/shared" fmt "fmt" _ "github.com/cosmos/cosmos-proto" runtime "github.com/cosmos/cosmos-proto/runtime" _ "github.com/cosmos/gogoproto/gogoproto" - shared "github.com/pokt-network/poktroll/api/poktroll/shared" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoiface "google.golang.org/protobuf/runtime/protoiface" protoimpl "google.golang.org/protobuf/runtime/protoimpl" diff --git a/api/poktroll/supplier/tx.pulsar.go b/api/poktroll/supplier/tx.pulsar.go index 7a709f7f0..1d2057b01 100644 --- a/api/poktroll/supplier/tx.pulsar.go +++ b/api/poktroll/supplier/tx.pulsar.go @@ -5,11 +5,11 @@ import ( _ "cosmossdk.io/api/amino" v1beta1 "cosmossdk.io/api/cosmos/base/v1beta1" _ "cosmossdk.io/api/cosmos/msg/v1" + shared "github.com/pokt-network/poktroll/api/poktroll/shared" fmt "fmt" _ "github.com/cosmos/cosmos-proto" runtime "github.com/cosmos/cosmos-proto/runtime" _ "github.com/cosmos/gogoproto/gogoproto" - shared "github.com/pokt-network/poktroll/api/poktroll/shared" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoiface "google.golang.org/protobuf/runtime/protoiface" 
protoimpl "google.golang.org/protobuf/runtime/protoimpl" diff --git a/api/poktroll/tokenomics/event.pulsar.go b/api/poktroll/tokenomics/event.pulsar.go index e62c41344..af0c412dc 100644 --- a/api/poktroll/tokenomics/event.pulsar.go +++ b/api/poktroll/tokenomics/event.pulsar.go @@ -3,10 +3,10 @@ package tokenomics import ( v1beta1 "cosmossdk.io/api/cosmos/base/v1beta1" + proof "github.com/pokt-network/poktroll/api/poktroll/proof" fmt "fmt" runtime "github.com/cosmos/cosmos-proto/runtime" _ "github.com/cosmos/gogoproto/gogoproto" - proof "github.com/pokt-network/poktroll/api/poktroll/proof" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoiface "google.golang.org/protobuf/runtime/protoiface" protoimpl "google.golang.org/protobuf/runtime/protoimpl" diff --git a/api/poktroll/tokenomics/types.pulsar.go b/api/poktroll/tokenomics/types.pulsar.go index 9ca1a54f9..e48600e95 100644 --- a/api/poktroll/tokenomics/types.pulsar.go +++ b/api/poktroll/tokenomics/types.pulsar.go @@ -3,11 +3,11 @@ package tokenomics import ( v1beta1 "cosmossdk.io/api/cosmos/base/v1beta1" + proof "github.com/pokt-network/poktroll/api/poktroll/proof" fmt "fmt" _ "github.com/cosmos/cosmos-proto" runtime "github.com/cosmos/cosmos-proto/runtime" _ "github.com/cosmos/gogoproto/gogoproto" - proof "github.com/pokt-network/poktroll/api/poktroll/proof" protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoiface "google.golang.org/protobuf/runtime/protoiface" protoimpl "google.golang.org/protobuf/runtime/protoimpl" diff --git a/docusaurus/docs/develop/developer_guide/adding_params.md b/docusaurus/docs/develop/developer_guide/adding_params.md index 0843a6d81..48f540848 100644 --- a/docusaurus/docs/develop/developer_guide/adding_params.md +++ b/docusaurus/docs/develop/developer_guide/adding_params.md @@ -7,11 +7,7 @@ title: Adding On-Chain Module Parameters - [Step-by-Step Instructions](#step-by-step-instructions) - [1. 
Define the Parameter in the Protocol Buffers File](#1-define-the-parameter-in-the-protocol-buffers-file) - - [2 Update the Parameter E2E Tests](#2-update-the-parameter-e2e-tests) - - [2.1 Scenario Example](#21-scenario-example) - - [2.2 Scenario Outline Example](#22-scenario-outline-example) - - [2.3 Step Definition Helpers Example](#23-step-definition-helpers-example) - - [2.4 Update switch statement to support new param](#24-update-switch-statement-to-support-new-param) + - [2 Update the Parameter Integration Tests](#2-update-the-parameter-integration-tests) - [3. Update the Default Parameter Values](#3-update-the-default-parameter-values) - [4. Add Parameter Default to Genesis Configuration](#4-add-parameter-default-to-genesis-configuration) - [5. Modify the Makefile](#5-modify-the-makefile) @@ -149,7 +145,7 @@ with the default value for the new parameter. "authority": "pokt10d07y265gmmuvt4z0w9aw880jnsr700j8yv32t", "params": { "proof_request_probability": "0.25", - "proof_requirement_threshold": { + "proof_requirement_threshold": { "denom": "upokt", "amount": "20000000" }, diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 03a399052..7d1f82f8b 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -51,10 +51,10 @@ configuration for `cosmovisor` that could automate the process. ::: Since the chain is not moving, **it is impossible** to issue an automatic upgrade with an upgrade plan. Instead, -we need **social consensus** to manually replace the binary and get the chain moving. +we need **social consensus** to manually replace the binary and get the chain moving. 1. Prepare and verify a new binary that addresses the consensus-breaking issue. -2. Reach out to the community and validators so they can upgrade the binary manually. +2. 
Reach out to the community and validators so they can upgrade the binary manually. :::warning UNKNOWN, NEED TO INVESTIGATE We might need to coordinate the timing of when the nodes should be started. In Tendermint version of Pocket Network @@ -64,7 +64,6 @@ we need **social consensus** to manually replace the binary and get the chain mo to be repleced. Consider a configuration change for `cosmovisor` so it would automatically replace the binary when synching from genesis. - ```mermaid sequenceDiagram participant DevTeam @@ -153,21 +152,24 @@ sequenceDiagram There are two ways to get a snapshot from a prior height: 1. Execute + ```bash poktrolld rollback --hard ``` + repeately until the command responds with the desired block number. + 2. Use a snapshot and start the node with `--halt-height=100` parameter so it only syncs up to certain height and then gracefully shuts down. Add this argument to `poktrolld start` like this: + ```bash poktrolld start --halt-height=100 ``` - #### Step 6: Validator Isolation - risks Having at least one node that has knowledge of the forking ledger can jeopardize the whole process. In particular, the following errors in logs are the sign of the nodes populating existing blocks: - - `found conflicting vote from ourselves; did you unsafe_reset a validator?` - - `conflicting votes from validator` - + +- `found conflicting vote from ourselves; did you unsafe_reset a validator?` +- `conflicting votes from validator` diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 913da0948..278ae4421 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -42,10 +42,14 @@ An upgrade is necessary whenever there's an API, State Machine, or other Consens 1. 
When a new version includes a consensus-breaking change, plan for the next protocol upgrade: - If there's a change to a specific module, bump that module's consensus version. - Note any potential parameter changes to include in the upgrade. -2. Create a new upgrade in `app/upgrades`. **THIS MUST BE DONE** even if there are no state changes. +2. Create a new upgrade in `app/upgrades`. - Refer to `historical.go` for past upgrades and examples. - Consult Cosmos-sdk documentation on upgrades for additional guidance [here](https://docs.cosmos.network/main/build/building-apps/app-upgrade) and [here](https://docs.cosmos.network/main/build/modules/upgrade). +:::info +Creating a new upgrade plan MUST BE DONE even if there are no state changes. +::: + ## Writing an Upgrade Transaction An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/blob/0fda53f265de4bcf4be1a13ea9fad450fc2e66d4/x/upgrade/proto/cosmos/upgrade/v1beta1/upgrade.proto#L14) with specific details about the upgrade. This information helps schedule the upgrade on the network and provides necessary data for automatic upgrades via `Cosmovisor`. A typical upgrade transaction will look like the following: @@ -83,7 +87,7 @@ When `cosmovisor` is configured to automatically download binaries, it will pull The URLs of the binaries contain checksums. It is critical to ensure they are correct. Otherwise Cosmovisor won't be able to download the binaries and go through the upgrade. -The command below (using toold build by the authors of Cosmosvisor) can be used to achieve the above: +The command below (using tools built by the authors of Cosmovisor) can be used to achieve the above: ```bash jq -r '.body.messages[0].plan.info | fromjson | .binaries[]' $PATH_TO_UPGRADE_TRANSACTION_JSON | while IFS= read -r url; do @@ -162,7 +166,12 @@ For a hypothetical scenario to upgrade from `0.1` to `0.2`: git checkout -b branch_to_test ``` - Replace `branch_to_test` with the actual branch you want to test.
**Note:** This branch should have an upgrade implemented - [Implementing the Upgrade](#implementing-the-upgrade). Here, the upgrade should be named `v0.2`. + Replace `branch_to_test` with the actual branch you want to test. + + :::note + This branch should have an upgrade implemented per the docs in [Implementing the Upgrade](#implementing-the-upgrade). + Here, the upgrade should be named `v0.2`. + ::: 4. **(BOTH repos)** - We'll use binaries from both versions - old and new. @@ -171,7 +180,7 @@ For a hypothetical scenario to upgrade from `0.1` to `0.2`: ``` :::note - The binary produced by these commands in the old repo should result in the same binary as it was downloaded from [production releases](https://github.com/pokt-network/poktroll/releases), however you might consider using + The binary produced by these commands in the old repo should result in the same binary as it was downloaded from [production releases](https://github.com/pokt-network/poktroll/releases), however you might consider using a different version. ::: 5. **(`old` repo)** - Clean up and generate an empty genesis using the old version. @@ -190,7 +199,7 @@ For a hypothetical scenario to upgrade from `0.1` to `0.2`: The validator node should run and produce blocks as expected. -8. **(`old` repo)** Submit the upgrade transaction. **Note:** The upgrade height in the transaction should be higher than the current block height. Adjust and submit if necessary: +8. **(`old` repo)** Submit the upgrade transaction. **NOTE THAT** the upgrade height in the transaction should be higher than the current block height. Adjust and submit if necessary: ```bash ./release_binaries/poktroll_darwin_arm64 tx authz exec tools/scripts/upgrades/local_test_v0.2.json --from=pnf @@ -217,6 +226,7 @@ For a hypothetical scenario to upgrade from `0.1` to `0.2`: - If there were errors during the upgrade, investigate and address them. 12. 
**(`new` repo, optional**) - If parameters were changed during the upgrade, test if these changes were applied. For example: + ```bash ./release_binaries/poktroll_darwin_arm64 q application params ``` From ec4314c94d6d1512f375feb31e2cc81d8157d6ef Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Tue, 26 Nov 2024 14:50:59 -0800 Subject: [PATCH 18/27] Update docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md Co-authored-by: Bryan White --- .../docs/develop/developer_guide/recovery_from_chain_halt.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 7d1f82f8b..3b5d14733 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -116,7 +116,7 @@ However, if necessary, the instructions to follow are: 6. Isolate the validator set from full nodes - ([why this is necessary](#step-6-validator-isolation---risks)) - This is necessary to avoid full nodes from gossiping blocks that have been rolled back. - This may require using a firewall or a private network - - Validators should only be gossip blocks amongst themselves. + - Validators should only be permitted to gossip blocks amongst themselves. 7. Start the network and perform the upgrade. For example, reiterating the process above: - Start all Validators at height `100` - On block `101`, submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. 
From 988622095483240eef312c445ead755ca20d6714 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Tue, 26 Nov 2024 14:51:13 -0800 Subject: [PATCH 19/27] Update docusaurus/docs/protocol/upgrades/contigency_plans.md Co-authored-by: Bryan White --- docusaurus/docs/protocol/upgrades/contigency_plans.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 6b6d35041..f79bda7de 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -33,7 +33,7 @@ See the instructions of [how to do that here](./upgrade_procedure.md#cancelling- If the nodes on the network stopped at the upgrade height and the migration did not start yet (i.e. there are no logs indicating the upgrade handler and store migrations are being executed), -we mist gather social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. +we MUST gather social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. This will skip the upgrade process, allowing the chain to continue and the protocol team to plan another release. 
From 80c3e382eaa6d56a947fb4c0dee11e4e7075aa56 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Tue, 26 Nov 2024 14:51:28 -0800 Subject: [PATCH 20/27] Update docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md Co-authored-by: Bryan White --- .../docs/develop/developer_guide/recovery_from_chain_halt.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 3b5d14733..4a3dcd0c2 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -23,7 +23,7 @@ See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more infor ## Background Pocket network is built on top of `cosmos-sdk`, which utilizes the CometBFT consensus engine. -Byzantine Fault Tolerant (BFT) consensus algorithm requires that **at least** 2/3 of Validators +Comet's Byzantine Fault Tolerant (BFT) consensus algorithm requires that **at least** 2/3 of Validators are online and voting for the same block to reach a consensus. In order to maintain liveness and avoid a chain-halt, we need the majority (> 2/3) of Validators to participate and use the same version of the software. 
From d6f6cedfa8daca22409080a43f0217b81a120aff Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Tue, 26 Nov 2024 14:53:14 -0800 Subject: [PATCH 21/27] Apply suggestions from code review Co-authored-by: Bryan White --- .../recovery_from_chain_halt.md | 22 +++++++++---------- .../protocol/upgrades/contigency_plans.md | 7 +++--- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 4a3dcd0c2..cce21d185 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -5,8 +5,8 @@ title: Chain Halt Recovery ## Chain Halt Recovery -This document describes how to recover from a chain halt. It assumes the cause of -the chain halt has been identified, the new release has been created, and verified +This document describes how to recover from a chain halt. It assumes that the cause of +the chain halt has been identified, and that the new release has been created and verified function correctly. :::tip @@ -111,18 +111,18 @@ However, if necessary, the instructions to follow are: - Upgrade the chain at height `102` - Avoid the issue at height `103` 5. Ensure all validators rolled back to the same height and use the same snapshot - ([how to get the snapshot](#step-5-data-rollback---retrieving-snapshot-at-a-specific-height)) - - The snapshot should be imported into each Validator's data directory + - The snapshot should be imported into each Validator's data directory. - This is necessary to ensure data continuity and prevent forks. -6. Isolate the validator set from full nodes - ([why this is necessary](#step-6-validator-isolation---risks)) +6. Isolate the validator set from full nodes - ([why this is necessary](#step-6-validator-isolation---risks)). 
- This is necessary to avoid full nodes from gossiping blocks that have been rolled back. - - This may require using a firewall or a private network + - This may require using a firewall or a private network. - Validators should only be permitted to gossip blocks amongst themselves. -7. Start the network and perform the upgrade. For example, reiterating the process above: - - Start all Validators at height `100` +7. Start the validator set and perform the upgrade. For example, reiterating the process above: + - Start all Validators at height `100`. - On block `101`, submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. - - `x/upgrade` will perform the upgrade in the `EndBlocker` of block `102` - - If using `cosmosvisor`, the node will wait to replace the binary -8. Wait for the network to reach the height of the previous ledger (`104`+) + - `x/upgrade` will perform the upgrade in the `EndBlocker` of block `102`. + - If using `cosmosvisor`, the node will wait to replace the binary. +8. Wait for the network to reach the height of the previous ledger (`104`+). 9. Allow validators to open their network to full nodes again. - Note that full nodes will need to perform the rollback or use a snapshot as well. @@ -169,7 +169,7 @@ There are two ways to get a snapshot from a prior height: #### Step 6: Validator Isolation - risks Having at least one node that has knowledge of the forking ledger can jeopardize the whole process. 
In particular, the -following errors in logs are the sign of the nodes populating existing blocks: +following errors in logs are the sign of the nodes syncing blocks from the wrong fork: - `found conflicting vote from ourselves; did you unsafe_reset a validator?` - `conflicting votes from validator` diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index f79bda7de..dea402ad4 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -51,19 +51,18 @@ The upgrade needs to be fixed, and then a new plan needs to be submitted to the ### Option 2: The migration is stuck -If the migration is stuck, there's always a chance the state has been mutated for -the upgrade but the migration didn't complete. +If the migration is stuck, there's always a chance the upgrade handler was executed on-chain as scheduled, but the migration didn't complete. In such a case, we need to: - Roll back validators to the backup. A snapshot is taken by `cosmovisor` automatically prior to upgrade when`UNSAFE_SKIP_BACKUP` is set to `false` (which is a default and recommended value - [more information](https://docs.cosmos.network/main/build/tooling/cosmovisor#command-line-arguments-and-environment-variables)). -- All full nodes and validators on the network: skip the upgrade by adding `--unsafe-skip-upgrade=$upgradeHeightNumber` +- **All full nodes and validators**: skip the upgrade by adding `--unsafe-skip-upgrade=$upgradeHeightNumber` argument to your `poktroll start` command. Like this: ```bash poktrolld start --unsafe-skip-upgrade=$upgradeHeightNumber # ... 
the rest of the arguments ``` -- Protocol team: document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts (such as docker-compose and cosmovisor installer) so the next time somebody +- **Protocol team**: document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts (such as docker-compose and cosmovisor installer) so the next time somebody tries to sync the network from genesis they will automatically skip the failed upgrade. [Documentation and scripts to update](#documentation-and-scripts-to-update) - Resolve the issue with an upgrade and schedule another plan. From a18a1978b4f6db0c2d5aa784320f3bb1f73bd733 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Tue, 26 Nov 2024 14:53:35 -0800 Subject: [PATCH 22/27] Fix merge conflict --- docusaurus/docs/develop/developer_guide/adding_params.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docusaurus/docs/develop/developer_guide/adding_params.md b/docusaurus/docs/develop/developer_guide/adding_params.md index 405d9b3ec..df8bf2c7d 100644 --- a/docusaurus/docs/develop/developer_guide/adding_params.md +++ b/docusaurus/docs/develop/developer_guide/adding_params.md @@ -32,8 +32,10 @@ title: Adding On-Chain Module Parameters - [6.1 Add a valid param](#61-add-a-valid-param) - [6.2 Check for `as_` on `MsgUpdateParam`](#62-check-for-as_type-on-msgupdateparam) - [6.3 Update the module's `ModuleParamConfig`](#63-update-the-modules-moduleparamconfig) - - [7. Update the Makefile and Supporting JSON Files](#7-update-the-makefile-and-supporting-json-files) - [7.1 Update the Makefile](#71-update-the-makefile) - [7.2 Create a new JSON File for the Individual Parameter Update](#72-create-a-new-json-file-for-the-individual-parameter-update) - [7.3 Update the JSON File for Updating All Parameters for the Module](#73-update-the-json-file-for-updating-all-parameters-for-the-module) - > > > > > > > main + - [7. 
Update the Makefile and Supporting JSON Files](#7-update-the-makefile-and-supporting-json-files) + - [7.1 Update the Makefile](#71-update-the-makefile) + - [7.2 Create a new JSON File for the Individual Parameter Update](#72-create-a-new-json-file-for-the-individual-parameter-update) + - [7.3 Update the JSON File for Updating All Parameters for the Module](#73-update-the-json-file-for-updating-all-parameters-for-the-module) Adding a new on-chain module parameter involves multiple steps to ensure that the parameter is properly integrated into the system. This guide will walk you through From b7e5151d2e62329913c20bda4e7c4d9de639f301 Mon Sep 17 00:00:00 2001 From: Dmitry K Date: Wed, 27 Nov 2024 14:59:38 -0800 Subject: [PATCH 23/27] requested changes --- app/upgrades/historical.go | 5 +- .../chain_halt_troubleshooting.md | 2 +- .../recovery_from_chain_halt.md | 82 ++++++++++--------- .../protocol/upgrades/contigency_plans.md | 18 ++-- .../protocol/upgrades/upgrade_procedure.md | 6 +- makefiles/localnet.mk | 8 ++ 6 files changed, 71 insertions(+), 50 deletions(-) diff --git a/app/upgrades/historical.go b/app/upgrades/historical.go index 29a7c1ede..35393ad02 100644 --- a/app/upgrades/historical.go +++ b/app/upgrades/historical.go @@ -11,13 +11,13 @@ package upgrades import ( "context" - "fmt" storetypes "cosmossdk.io/store/types" upgradetypes "cosmossdk.io/x/upgrade/types" "github.com/cosmos/cosmos-sdk/types/module" consensusparamtypes "github.com/cosmos/cosmos-sdk/x/consensus/types" + cosmostypes "github.com/cosmos/cosmos-sdk/types" "github.com/pokt-network/poktroll/app/keepers" ) @@ -30,7 +30,8 @@ func defaultUpgradeHandler( configurator module.Configurator, ) upgradetypes.UpgradeHandler { return func(ctx context.Context, plan upgradetypes.Plan, vm module.VersionMap) (module.VersionMap, error) { - fmt.Println("Starting the migration in defaultUpgradeHandler.") + logger := cosmostypes.UnwrapSDKContext(ctx).Logger() + logger.Info("Starting the migration in 
defaultUpgradeHandler") return mm.RunMigrations(ctx, configurator, vm) } } diff --git a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md index 4f5796793..5b32f5cda 100644 --- a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md +++ b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md @@ -108,7 +108,7 @@ reactor validation error: wrong Block.Header.LastResultsHash. The solution is to use the correct binary version to sync the full node at the correct height. Tools like [cosmosvisor](https://docs.cosmos.network/v0.45/run-node/cosmovisor.html) make it easier -to sync a node from genesis, using the appropriate binary for each range of block heights. +to sync a node from genesis by automatically using the appropriate binary for each range of block heights. ## Syncing from genesis diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 03a399052..9da256d06 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -5,9 +5,9 @@ title: Chain Halt Recovery ## Chain Halt Recovery -This document describes how to recover from a chain halt. It assumes the cause of -the chain halt has been identified, the new release has been created, and verified -function correctly. +This document describes how to recover from a chain halt. It assumes that the cause of +the chain halt has been identified, and that the new release has been created and verified +to function correctly. :::tip See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more information on identifying the cause of a chain halt. 
@@ -23,7 +23,7 @@ See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more infor ## Background Pocket network is built on top of `cosmos-sdk`, which utilizes the CometBFT consensus engine. -Byzantine Fault Tolerant (BFT) consensus algorithm requires that **at least** 2/3 of Validators +Comet's Byzantine Fault Tolerant (BFT) consensus algorithm requires that **at least** 2/3 of Validators are online and voting for the same block to reach a consensus. In order to maintain liveness and avoid a chain-halt, we need the majority (> 2/3) of Validators to participate and use the same version of the software. @@ -78,7 +78,7 @@ sequenceDiagram DevTeam->>Validators: Notify validators to upgrade manually Validators->>Validators: Manually replace the binary Validators->>Network: Restart nodes with new binary - DevTeam->>Documentation: Update upgrade documentation + DevTeam->>Documentation: Update documentation (GitHub Release and Upgrade List to include instructions) Validators->>Network: Network resumes operation ``` @@ -90,64 +90,72 @@ sequenceDiagram These instructions are only relevant to Pocket Network's Shannon release. We do not currently use `x/gov` and on-chain voting for upgrades. - -Instead, our DAO votes on upgrades off-chain and the Foundation executes -transactions on their behalf. +Instead, all participants in our DAO vote on upgrades off-chain, and the Foundation +executes transactions on their behalf. ::: **Performing a rollback is analogous to forking the network at the older height.** -This should be avoided unless absolutely necessary. +:::warning + +This should be avoided or more testing is required. In our tests, the full nodes were +propagating the existing blocks signed by the Validators, making it hard to rollback. + +::: However, if necessary, the instructions to follow are: 1. Prepare & verify a new binary that addresses the consensus-breaking issue. 2. [Create a release](../../protocol/upgrades/release_process.md). 3. 
[Prepare an upgrade transaction](../../protocol/upgrades/upgrade_procedure.md#writing-an-upgrade-transaction) to the new version. -4. Get the Validator set off the network **3 blocks** prior to the height of the chain halt. For example: - - Assume an issue at height `103` - - Get the validator set at height `100` - - Submit an upgrade transaction at `101` - - Upgrade the chain at height `102` - - Avoid the issue at height `103` +4. Disconnect the `Validator set` from the rest of the network **3 blocks** prior to the height of the chain halt. For example: + - Assume an issue at height `103`. + - Revert the `validator set` to height `100`. + - Submit an upgrade transaction at `101`. + - Upgrade the chain at height `102`. + - Avoid the issue at height `103`. 5. Ensure all validators rolled back to the same height and use the same snapshot - ([how to get the snapshot](#step-5-data-rollback---retrieving-snapshot-at-a-specific-height)) - - The snapshot should be imported into each Validator's data directory + - The snapshot should be imported into each Validator's data directory. - This is necessary to ensure data continuity and prevent forks. -6. Isolate the validator set from full nodes - ([why this is necessary](#step-6-validator-isolation---risks)) +6. Isolate the `validator set` from full nodes - ([why this is necessary](#step-6-validator-isolation---risks)). - This is necessary to avoid full nodes from gossiping blocks that have been rolled back. - - This may require using a firewall or a private network - - Validators should only be gossip blocks amongst themselves. -7. Start the network and perform the upgrade. For example, reiterating the process above: - - Start all Validators at height `100` + - This may require using a firewall or a private network. + - Validators should only be permitted to gossip blocks amongst themselves. +7. Start the `validator set` and perform the upgrade. 
For example, reiterating the process above: + - Start all Validators at height `100`. - On block `101`, submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. - - `x/upgrade` will perform the upgrade in the `EndBlocker` of block `102` - - If using `cosmosvisor`, the node will wait to replace the binary -8. Wait for the network to reach the height of the previous ledger (`104`+) + - `x/upgrade` will perform the upgrade in the `EndBlocker` of block `102`. + - The node will stop climbing with an error waiting for the upgrade to be performed. + - Cosmovisor deployments automatically replace the binary. + - Manual deployments will require a manual replacement at this point. + - Start the node back up. +8. Wait for the network to reach the height of the previous ledger (`104`+). 9. Allow validators to open their network to full nodes again. - - Note that full nodes will need to perform the rollback or use a snapshot as well. - + - **Note**: full nodes will need to perform the rollback or use a snapshot as well. 
```mermaid sequenceDiagram participant DevTeam participant Foundation participant Validators participant FullNodes - participant Network + %% participant Network DevTeam->>DevTeam: Prepare & verify new binary DevTeam->>DevTeam: Create a release Validators->>Validators: Roll back to height before issue or import snapshot Validators->>Validators: Isolate from Full Nodes Foundation->>Validators: Distribute upgrade transaction - Validators->>Network: Start network and perform upgrade - Validators->>Network: Wait until over consensus-breaking height - Validators->>FullNodes: Open network connections - FullNodes->>Network: Sync with updated network - Validators->>Network: Network resumes operation - + Validators->>Validators: Start network and perform upgrade + + break + Validators->>Validators: Wait until previously problematic height elapses + end + + Validators-->FullNodes: Open network connections + FullNodes->>Validators: Sync with updated network + note over Validators,FullNodes: Network resumes operation ``` - #### Step 5: Data rollback - retrieving snapshot at a specific height There are two ways to get a snapshot from a prior height: @@ -156,8 +164,8 @@ There are two ways to get a snapshot from a prior height: ```bash poktrolld rollback --hard ``` - repeately until the command responds with the desired block number. -2. Use a snapshot and start the node with `--halt-height=100` parameter so it only syncs up to certain height and then + repeatedly, until the command responds with the desired block number. +2. Use a snapshot from below the halt height (e.g. `100`) and start the node with `--halt-height=100` parameter so it only syncs up to certain height and then gracefully shuts down.
Add this argument to `poktrolld start` like this: ```bash poktrolld start --halt-height=100 @@ -167,7 +175,7 @@ There are two ways to get a snapshot from a prior height: #### Step 6: Validator Isolation - risks Having at least one node that has knowledge of the forking ledger can jeopardize the whole process. In particular, the -following errors in logs are the sign of the nodes populating existing blocks: +following errors in logs are the sign of the nodes syncing blocks from the wrong fork: - `found conflicting vote from ourselves; did you unsafe_reset a validator?` - `conflicting votes from validator` diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 6b6d35041..8b0aed5fd 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -18,7 +18,7 @@ There's always a chance the upgrade will fail. This document is intended to help you recover without significant downtime. - [Option 0: The bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) -- [Option 1: The upgrade height is reached and the migration didn't start](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start) +- [Option 1: The upgrade height is reached and the migration didn't start (halted)](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start-halted) - [Option 2: The migration is stuck](#option-2-the-migration-is-stuck) - [Option 3: The network is stuck at the future height after the upgrade](#option-3-the-network-is-stuck-at-the-future-height-after-the-upgrade) - [Documentation and scripts to update](#documentation-and-scripts-to-update) @@ -29,11 +29,15 @@ This document is intended to help you recover without significant downtime. See the instructions of [how to do that here](./upgrade_procedure.md#cancelling-the-upgrade-plan). 
-### Option 1: The upgrade height is reached and the migration didn't start +### Option 1: The upgrade height is reached and the migration didn't start (halted) + +This is unlikely to happen. Possible cases are if the name of the upgrade handler is +different from the one specified in the upgrade plan, or if the binary suggested by +the upgrade plan is wrong. If the nodes on the network stopped at the upgrade height and the migration did not start yet (i.e. there are no logs indicating the upgrade handler and store migrations are being executed), -we mist gather social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. +we **MUST** gather social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. This will skip the upgrade process, allowing the chain to continue and the protocol team to plan another release. @@ -56,16 +60,16 @@ the upgrade but the migration didn't complete. In such a case, we need to: -- Roll back validators to the backup. A snapshot is taken by `cosmovisor` automatically prior to upgrade when`UNSAFE_SKIP_BACKUP` is set to `false` (which is a default and recommended value - +- **All full nodes and validators**: Roll back validators to the backup. A snapshot is taken by `cosmovisor` automatically prior to upgrade when`UNSAFE_SKIP_BACKUP` is set to `false` (which is a default and recommended value - [more information](https://docs.cosmos.network/main/build/tooling/cosmovisor#command-line-arguments-and-environment-variables)). -- All full nodes and validators on the network: skip the upgrade by adding `--unsafe-skip-upgrade=$upgradeHeightNumber` +- **All full nodes and validators**: skip the upgrade by adding `--unsafe-skip-upgrade=$upgradeHeightNumber` argument to your `poktroll start` command. Like this: ```bash poktrolld start --unsafe-skip-upgrade=$upgradeHeightNumber # ... 
the rest of the arguments ``` -- Protocol team: document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts (such as docker-compose and cosmovisor installer) so the next time somebody +- **Protocol team**: document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts (such as docker-compose and cosmovisor installer) so the next time somebody tries to sync the network from genesis they will automatically skip the failed upgrade. [Documentation and scripts to update](#documentation-and-scripts-to-update) -- Resolve the issue with an upgrade and schedule another plan. +- **Protocol team**: Resolve the issue with an upgrade and schedule another plan. diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 016c88d10..19b74a79a 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -126,10 +126,10 @@ poktrolld query upgrade plan ## Cancelling the upgrade plan -It is possible to cancel the upgrade before the upgrade plan height is reached. To do so, execute the following transaction: +It is possible to cancel the upgrade before the upgrade plan height is reached. 
To do so, execute the following make target: ```bash -poktrolld tx authz exec tools/scripts/upgrades/authz_cancel_upgrade_tx.json --gas=auto --from=pnf +make localnet_cancel_upgrade ``` ## Testing the Upgrade @@ -166,7 +166,7 @@ For a hypothetical scenario to upgrade from `0.1` to `0.2`: make go_develop ignite_release ignite_release_extract_binaries ``` :::note - The binary produced by these commands in the old repo should result in the same binary as it was downloaded from [production releases](https://github.com/pokt-network/poktroll/releases), however you might consider using + The binary produced by these commands in the old repo should result in the same binary as it was downloaded from [production releases](https://github.com/pokt-network/poktroll/releases). You can use them as an alternative to building the binary from source. ::: 5. **(`old` repo)** - Clean up and generate an empty genesis using the old version. diff --git a/makefiles/localnet.mk b/makefiles/localnet.mk index 1df6113cf..00579ff03 100644 --- a/makefiles/localnet.mk +++ b/makefiles/localnet.mk @@ -29,3 +29,11 @@ localnet_regenesis: check_yq warn_message_acc_initialize_pubkeys ## Regenerate t .PHONY: cosmovisor_start_node cosmovisor_start_node: # Starts the node using cosmovisor that waits for an upgrade plan bash tools/scripts/upgrades/cosmovisor-start-node.sh + +.PHONY: localnet_cancel_upgrade +localnet_cancel_upgrade: ## Cancels the planed upgrade on local node + poktrolld tx authz exec tools/scripts/upgrades/authz_cancel_upgrade_tx.json --gas=auto --from=pnf + +.PHONY: localnet_show_upgrade_plan +localnet_show_upgrade_plan: ## Shows the upgrade plan on local node + poktrolld query upgrade plan From 8e4ef7b474c1fd467155a0a63a0e05e08b607409 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Wed, 11 Dec 2024 17:19:07 -0800 Subject: [PATCH 24/27] Review docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md --- .../recovery_from_chain_halt.md | 99 +++++++++++-------- 1 file 
changed, 57 insertions(+), 42 deletions(-) diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md index 2bca4d598..d1ca5a069 100644 --- a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -5,20 +5,24 @@ title: Chain Halt Recovery ## Chain Halt Recovery -This document describes how to recover from a chain halt. It assumes that the cause of -the chain halt has been identified, and that the new release has been created and verified -to function correctly. +This document describes how to recover from a chain halt. + +It assumes that the cause of the chain halt has been identified, and that the +new release has been created and verified to function correctly. :::tip + See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more information on identifying the cause of a chain halt. + ::: - [Background](#background) - [Resolving halts during a network upgrade](#resolving-halts-during-a-network-upgrade) - [Manual binary replacement (preferred)](#manual-binary-replacement-preferred) - [Rollback, fork and upgrade](#rollback-fork-and-upgrade) - - [Step 5: Data rollback - retrieving snapshot at a specific height](#step-5-data-rollback---retrieving-snapshot-at-a-specific-height) - - [Step 6: Validator Isolation - risks](#step-6-validator-isolation---risks) + - [Troubleshooting](#troubleshooting) + - [Data rollback - retrieving snapshot at a specific height (step 5)](#data-rollback---retrieving-snapshot-at-a-specific-height-step-5) + - [Validator Isolation - risks (step 6)](#validator-isolation---risks-step-6) ## Background @@ -46,23 +50,28 @@ For example, when a consensus-breaking issue occurs on a node that is synching f to manually replace the binary with the new one. 
There are efforts underway to mitigate this issue, including configuration for `cosmovisor` that could automate the process. - + ::: Since the chain is not moving, **it is impossible** to issue an automatic upgrade with an upgrade plan. Instead, we need **social consensus** to manually replace the binary and get the chain moving. +The steps to doing so are: + 1. Prepare and verify a new binary that addresses the consensus-breaking issue. 2. Reach out to the community and validators so they can upgrade the binary manually. - :::warning UNKNOWN, NEED TO INVESTIGATE - - We might need to coordinate the timing of when the nodes should be started. In Tendermint version of Pocket Network - (Morse), this was necessary to sync consensus rounds and steps, getting the chain moving. It might not be a - requirement anymore, but we need to double-check. [More information](https://docs.cometbft.com/v1.0/spec/consensus/consensus). 3. Update [the documentation](../../protocol/upgrades/upgrade_list.md) to include a range a height when the binary needs - to be repleced. Consider a configuration change for `cosmovisor` so it would automatically replace the binary when - synching from genesis. + to be replaced. + +:::warning + +TODO_MAINNET(@okdas): + +1. **For step 2**: Investigate if the CometBFT rounds/steps need to be aligned as in Morse chain halts. See [this ref](https://docs.cometbft.com/v1.0/spec/consensus/consensus). +2. **For step 3**: Add `cosmovisor` documentation so its configured to automatically replace the binary when synching from genesis. 
+ +::: ```mermaid sequenceDiagram @@ -72,13 +81,13 @@ sequenceDiagram participant Documentation participant Network - DevTeam->>DevTeam: Prepare and verify new binary - DevTeam->>Community: Announce new binary and instructions - DevTeam->>Validators: Notify validators to upgrade manually - Validators->>Validators: Manually replace the binary - Validators->>Network: Restart nodes with new binary - DevTeam->>Documentation: Update documentation (GitHub Release and Upgrade List to include instructions) - Validators->>Network: Network resumes operation + DevTeam->>DevTeam: 1. Prepare and verify new binary + DevTeam->>Community: 2. Announce new binary and instructions + DevTeam->>Validators: 2. Notify validators to upgrade manually + Validators->>Validators: 2. Manually replace the binary + Validators->>Network: 2. Restart nodes with new binary + DevTeam->>Documentation: 3. Update documentation (GitHub Release and Upgrade List to include instructions) + Validators-->>Network: Network resumes operation ``` @@ -88,14 +97,12 @@ sequenceDiagram These instructions are only relevant to Pocket Network's Shannon release. -We do not currently use `x/gov` and on-chain voting for upgrades. +We do not currently use `x/gov` or on-chain voting for upgrades. Instead, all participants in our DAO vote on upgrades off-chain, and the Foundation executes transactions on their behalf. ::: -**Performing a rollback is analogous to forking the network at the older height.** - :::warning This should be avoided or more testing is required. In our tests, the full nodes were @@ -103,6 +110,8 @@ propagating the existing blocks signed by the Validators, making it hard to roll ::: +**Performing a rollback is analogous to forking the network at the older height.** + However, if necessary, the instructions to follow are: 1. Prepare & verify a new binary that addresses the consensus-breaking issue. 
@@ -114,10 +123,10 @@ However, if necessary, the instructions to follow are: - Submit an upgrade transaction at `101`. - Upgrade the chain at height `102`. - Avoid the issue at height `103`. -5. Ensure all validators rolled back to the same height and use the same snapshot - ([how to get the snapshot](#step-5-data-rollback---retrieving-snapshot-at-a-specific-height)) +5. Ensure all validators rolled back to the same height and use the same snapshot ([how to get a snapshot](#data-rollback---retrieving-snapshot-at-a-specific-height-step-5)) - The snapshot should be imported into each Validator's data directory. - This is necessary to ensure data continuity and prevent forks. -6. Isolate the `validator set` from full nodes - ([why this is necessary](#step-6-validator-isolation---risks)). +6. Isolate the `validator set` from full nodes - ([why this is necessary](#validator-isolation---risks-step-6)). - This is necessary to avoid full nodes from gossiping blocks that have been rolled back. - This may require using a firewall or a private network. - Validators should only be permitted to gossip blocks amongst themselves. @@ -128,10 +137,11 @@ However, if necessary, the instructions to follow are: - The node will stop climbing with an error waiting for the upgrade to be performed. - Cosmovisor deployments automatically replace the binary. - Manual deployments will require a manual replacement at this point. - - Start the node back up. + - Start the node back up. 8. Wait for the network to reach the height of the previous ledger (`104`+). 9. Allow validators to open their network to full nodes again. - **Note**: full nodes will need to perform the rollback or use a snapshot as well. 
+ ```mermaid sequenceDiagram participant DevTeam @@ -140,22 +150,25 @@ sequenceDiagram participant FullNodes %% participant Network - DevTeam->>DevTeam: Prepare & verify new binary - DevTeam->>DevTeam: Create a release - Validators->>Validators: Roll back to height before issue or import snapshot - Validators->>Validators: Isolate from Full Nodes - Foundation->>Validators: Distribute upgrade transaction - Validators->>Validators: Start network and perform upgrade - + DevTeam->>DevTeam: 1. Prepare & verify new binary + DevTeam->>DevTeam: 2 & 3. Create a release & prepare upgrade transaction + Validators->>Validators: 4 & 5. Roll back to height before issue or import snapshot + Validators->>Validators: 6. Isolate from Full Nodes + Foundation->>Validators: 7. Distribute upgrade transaction + Validators->>Validators: 7. Start network and perform upgrade + break - Validators->>Validators: Wait until previously problematic height elapses + Validators->>Validators: 8. Wait until previously problematic height elapses end - - Validators-->FullNodes: Open network connections - FullNodes->>Validators: Sync with updated network + + Validators-->FullNodes: 9. Open network connections + FullNodes-->>Validators: 9. Sync with updated network note over Validators,FullNodes: Network resumes operation ``` -#### Step 5: Data rollback - retrieving snapshot at a specific height + +### Troubleshooting + +#### Data rollback - retrieving snapshot at a specific height (step 5) There are two ways to get a snapshot from a prior height: @@ -164,7 +177,9 @@ There are two ways to get a snapshot from a prior height: ```bash poktrolld rollback --hard ``` + repeately, until the command responds with the desired block number. + 2. Use a snapshot from below the halt height (e.g. `100`) and start the node with `--halt-height=100` parameter so it only syncs up to certain height and then gracefully shuts down. 
Add this argument to `poktrolld start` like this: @@ -172,10 +187,10 @@ There are two ways to get a snapshot from a prior height: poktrolld start --halt-height=100 ``` -#### Step 6: Validator Isolation - risks +#### Validator Isolation - risks (step 6) Having at least one node that has knowledge of the forking ledger can jeopardize the whole process. In particular, the following errors in logs are the sign of the nodes syncing blocks from the wrong fork: - - `found conflicting vote from ourselves; did you unsafe_reset a validator?` - - `conflicting votes from validator` - + +- `found conflicting vote from ourselves; did you unsafe_reset a validator?` +- `conflicting votes from validator` From 58a7493f6d00ca9e0b7bfba4e267e26c4eeb2382 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Thu, 12 Dec 2024 09:55:29 -0800 Subject: [PATCH 25/27] WIP review on docusaurus/docs/protocol/upgrades/contigency_plans.md --- .../protocol/upgrades/contigency_plans.md | 55 +++++++++++-------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index 16c88f26a..b23c00148 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -18,9 +18,9 @@ There's always a chance the upgrade will fail. This document is intended to help you recover without significant downtime. 
- [Option 0: The bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) -- [Option 1: The upgrade height is reached and the migration didn't start (halted)](#option-1-the-upgrade-height-is-reached-and-the-migration-didnt-start-halted) -- [Option 2: The migration is stuck](#option-2-the-migration-is-stuck) -- [Option 3: The network is stuck at the future height after the upgrade](#option-3-the-network-is-stuck-at-the-future-height-after-the-upgrade) +- [Option 1: The migration didn't start (i.e. migration halt)](#option-1-the-migration-didnt-start-ie-migration-halt) +- [Option 2: The migration is stuck (i.e. incomplete/partial migration)](#option-2-the-migration-is-stuck-ie-incompletepartial-migration) +- [Option 3: The migration succeed but the network is stuck (i.e. migration had a bug)](#option-3-the-migration-succeed-but-the-network-is-stuck-ie-migration-had-a-bug) - [Documentation and scripts to update](#documentation-and-scripts-to-update) ### Option 0: The bug is discovered before the upgrade height is reached @@ -29,11 +29,13 @@ This document is intended to help you recover without significant downtime. See the instructions of [how to do that here](./upgrade_procedure.md#cancelling-the-upgrade-plan). -### Option 1: The upgrade height is reached and the migration didn't start (halted) +### Option 1: The migration didn't start (i.e. migration halt) -This is unlikely to happen. Possible cases are if the name of the upgrade handler is -different from the one specified in the upgrade plan, or if the binary suggested by -the upgrade plan is wrong. +**This is unlikely to happen.** + +Possible reasons for this are if the name of the upgrade handler is different +from the one specified in the upgrade plan, or if the binary suggested by the +upgrade plan is wrong. If the nodes on the network stopped at the upgrade height and the migration did not start yet (i.e. 
there are no logs indicating the upgrade handler and store migrations are being executed), @@ -47,32 +49,37 @@ The upgrade needs to be fixed, and then a new plan needs to be submitted to the :::caution -`--unsafe-skip-upgrade` needs to be documented in the list of upgrades and added to the scripts so the next time somebody tries to sync the network from genesis - they will automatically skip the failed upgrade. [Documentation and scripts to update](#documentation-and-scripts-to-update) +`--unsafe-skip-upgrade` needs to be documented in the list of upgrades and added +to the scripts so the next time somebody tries to sync the network from genesis, +they will automatically skip the failed upgrade. +[Documentation and scripts to update](#documentation-and-scripts-to-update) - + ::: -### Option 2: The migration is stuck +### Option 2: The migration is stuck (i.e. incomplete/partial migration) If the migration is stuck, there's always a chance the upgrade handler was executed on-chain as scheduled, but the migration didn't complete. -In such a case, we need to: +In such a case, we need: -- **All full nodes and validators**: Roll back validators to the backup. A snapshot is taken by `cosmovisor` automatically prior to upgrade when`UNSAFE_SKIP_BACKUP` is set to `false` (which is a default and recommended value - - [more information](https://docs.cosmos.network/main/build/tooling/cosmovisor#command-line-arguments-and-environment-variables)). -- **All full nodes and validators**: skip the upgrade by adding `--unsafe-skip-upgrade=$upgradeHeightNumber` - argument to your `poktroll start` command. Like this: - ```bash - poktrolld start --unsafe-skip-upgrade=$upgradeHeightNumber # ... the rest of the arguments - ``` -- **Protocol team**: document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts (such as docker-compose and cosmovisor installer) so the next time somebody - tries to sync the network from genesis they will automatically skip the failed upgrade. 
[Documentation and scripts to update](#documentation-and-scripts-to-update) -- **Protocol team**: Resolve the issue with an upgrade and schedule another plan. +- **All full nodes and validators**: Roll back validators to the backup + - A snapshot is taken by `cosmovisor` automatically prior to upgrade when `UNSAFE_SKIP_BACKUP` is set to `false` (the default recommended value; + [more information](https://docs.cosmos.network/main/build/tooling/cosmovisor#command-line-arguments-and-environment-variables)) +- **All full nodes and validators**: skip the upgrade + - Add the `--unsafe-skip-upgrade=$upgradeHeightNumber` argument to `poktroll start` command like so: + ```bash + poktrolld start --unsafe-skip-upgrade=$upgradeHeightNumber # ... the rest of the arguments + ``` +- **Protocol team**: document the failed upgrade + - document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts (such as docker-compose and cosmovisor installer) + - The next time somebody tries to sync the network from genesis they will automatically skip the failed upgrade; see [documentation and scripts to update](#documentation-and-scripts-to-update) +- **Protocol team**: Resolve the issue with an upgrade and schedule a new plan. - + -### Option 3: The network is stuck at the future height after the upgrade +### Option 3: The migration succeed but the network is stuck (i.e. migration had a bug) This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. @@ -81,4 +88,4 @@ This should be treated as a consensus or non-determinism bug that is unrelated t - The [upgrade list](./upgrade_list.md) should reflect a failed upgrade and provide a range of heights that served by each version. 
- Systemd service should include`--unsafe-skip-upgrade=$upgradeHeightNumber` argument in its start command [here](https://github.com/pokt-network/poktroll/blob/main/tools/installer/full-node.sh). - [Helm chart](https://github.com/pokt-network/helm-charts/blob/main/charts/poktrolld/templates/StatefulSet.yaml) (consider exposing via a `values.yaml` file) -- [docker-compose](https://github.com/pokt-network/poktroll-docker-compose-example/tree/main/scripts) example \ No newline at end of file +- [docker-compose](https://github.com/pokt-network/poktroll-docker-compose-example/tree/main/scripts) example From 9661c94cc6995a7fa64de44344277a22b2a4a00d Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Thu, 12 Dec 2024 12:48:11 -0800 Subject: [PATCH 26/27] Finished reviewing docusaurus/docs/protocol/upgrades/contigency_plans.md --- .../protocol/upgrades/contigency_plans.md | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md index b23c00148..260f37823 100644 --- a/docusaurus/docs/protocol/upgrades/contigency_plans.md +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -21,7 +21,7 @@ This document is intended to help you recover without significant downtime. - [Option 1: The migration didn't start (i.e. migration halt)](#option-1-the-migration-didnt-start-ie-migration-halt) - [Option 2: The migration is stuck (i.e. incomplete/partial migration)](#option-2-the-migration-is-stuck-ie-incompletepartial-migration) - [Option 3: The migration succeed but the network is stuck (i.e. 
migration had a bug)](#option-3-the-migration-succeed-but-the-network-is-stuck-ie-migration-had-a-bug) -- [Documentation and scripts to update](#documentation-and-scripts-to-update) +- [MANDATORY Checklist of Documentation \& Scripts to Update](#mandatory-checklist-of-documentation--scripts-to-update) ### Option 0: The bug is discovered before the upgrade height is reached @@ -65,17 +65,26 @@ If the migration is stuck, there's always a chance the upgrade handler was execu In such a case, we need: - **All full nodes and validators**: Roll back validators to the backup + - A snapshot is taken by `cosmovisor` automatically prior to upgrade when `UNSAFE_SKIP_BACKUP` is set to `false` (the default recommended value; [more information](https://docs.cosmos.network/main/build/tooling/cosmovisor#command-line-arguments-and-environment-variables)) + - **All full nodes and validators**: skip the upgrade + - Add the `--unsafe-skip-upgrade=$upgradeHeightNumber` argument to `poktroll start` command like so: + ```bash poktrolld start --unsafe-skip-upgrade=$upgradeHeightNumber # ... the rest of the arguments ``` + +- **Protocol team**: Resolve the issue with an upgrade and schedule a new plan. + + - The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. + - **Protocol team**: document the failed upgrade - - document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts (such as docker-compose and cosmovisor installer) + + - Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts (such as docker-compose and cosmovisor installer) - The next time somebody tries to sync the network from genesis they will automatically skip the failed upgrade; see [documentation and scripts to update](#documentation-and-scripts-to-update) -- **Protocol team**: Resolve the issue with an upgrade and schedule a new plan. 
@@ -83,9 +92,9 @@ In such a case, we need: This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. -### Documentation and scripts to update +### MANDATORY Checklist of Documentation & Scripts to Update -- The [upgrade list](./upgrade_list.md) should reflect a failed upgrade and provide a range of heights that served by each version. -- Systemd service should include`--unsafe-skip-upgrade=$upgradeHeightNumber` argument in its start command [here](https://github.com/pokt-network/poktroll/blob/main/tools/installer/full-node.sh). -- [Helm chart](https://github.com/pokt-network/helm-charts/blob/main/charts/poktrolld/templates/StatefulSet.yaml) (consider exposing via a `values.yaml` file) -- [docker-compose](https://github.com/pokt-network/poktroll-docker-compose-example/tree/main/scripts) example +- [ ] The [upgrade list](./upgrade_list.md) should reflect a failed upgrade and provide a range of heights that were served by each version. +- [ ] Systemd service should include the `--unsafe-skip-upgrade=$upgradeHeightNumber` argument in its start command [here](https://github.com/pokt-network/poktroll/blob/main/tools/installer/full-node.sh).
+- [ ] The [Helm chart](https://github.com/pokt-network/helm-charts/blob/main/charts/poktrolld/templates/StatefulSet.yaml) should point to the latest version; consider exposing via a `values.yaml` file +- [ ] The [docker-compose](https://github.com/pokt-network/poktroll-docker-compose-example/tree/main/scripts) examples should point to the latest version From 2ce49453f54730d81c686cb9582ece3f8a92c525 Mon Sep 17 00:00:00 2001 From: Daniel Olshansky Date: Thu, 12 Dec 2024 13:38:16 -0800 Subject: [PATCH 27/27] Finish reviewing docusaurus/docs/protocol/upgrades/upgrade_procedure.md --- .../protocol/upgrades/upgrade_procedure.md | 66 ++++++++++++------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 6adc42aa3..6cc8dc31c 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -6,7 +6,11 @@ sidebar_position: 2 # Upgrade procedure :::warning -This page describes the protocol upgrade process, which is internal to the protocol team. If you're interested in upgrading your Pocket Network node, please check our [releases page](https://github.com/pokt-network/poktroll/releases) for upgrade instructions and changelogs. + +This page describes the protocol upgrade process, intended for the protocol team's internal use. + +If you're interested in upgrading your Pocket Network node, please check our [releases page](https://github.com/pokt-network/poktroll/releases) for upgrade instructions and changelogs.
+ ::: - [When is an Upgrade Warranted?](#when-is-an-upgrade-warranted) @@ -16,15 +20,18 @@ This page describes the protocol upgrade process, which is internal to the proto - [Submitting the upgrade on-chain](#submitting-the-upgrade-on-chain) - [Cancelling the upgrade plan](#cancelling-the-upgrade-plan) - [Testing the Upgrade](#testing-the-upgrade) - - [LocalNet](#localnet) + - [LocalNet Upgrades](#localnet-upgrades) - [LocalNet Upgrade Cheat Sheet](#localnet-upgrade-cheat-sheet) - - [DevNet](#devnet) - - [TestNet](#testnet) - - [Mainnet](#mainnet) + - [DevNet Upgrades](#devnet-upgrades) + - [TestNet Upgrades](#testnet-upgrades) + - [Mainnet Upgrades](#mainnet-upgrades) ## Overview -When a consensus-breaking change is made to the protocol, we must carefully evaluate and implement an upgrade path that allows existing nodes to transition safely from one software version to another without disruption. This process involves several key steps: +When a consensus-breaking change is made to the protocol, we must carefully evaluate and implement an upgrade path that +allows existing nodes to transition safely from one software version to another without disruption. + +This process involves several key steps: 1. **Proposal**: The DAO drafts an upgrade proposal using our off-chain governance system. 2. **Implementation**: The proposed changes are implemented in the codebase. @@ -39,20 +46,34 @@ An upgrade is necessary whenever there's an API, State Machine, or other Consens ## Implementing the Upgrade -1. When a new version includes a consensus-breaking change, plan for the next protocol upgrade: - - If there's a change to a specific module, bump that module's consensus version. +1. When a new version includes a `consensus-breaking` change, plan for the next protocol upgrade: + + - If there's a change to a specific module -> bump that module's consensus version. - Note any potential parameter changes to include in the upgrade. -2. Create a new upgrade in `app/upgrades`. + +2. 
Create a new upgrade in `app/upgrades`: - Refer to `historical.go` for past upgrades and examples. - - Consult Cosmos-sdk documentation on upgrades for additional guidance [here](https://docs.cosmos.network/main/build/building-apps/app-upgrade) and [here](https://docs.cosmos.network/main/build/modules/upgrade). + - Consult Cosmos-sdk documentation on upgrades for additional guidance on [building-apps/app-upgrade](https://docs.cosmos.network/main/build/building-apps/app-upgrade) and [modules/upgrade](https://docs.cosmos.network/main/build/modules/upgrade). :::info -Creating a new upgrade plan MUST BE DONE even if there are no state changes. + +Creating a new upgrade plan **MUST BE DONE** even if there are no state changes. + ::: ## Writing an Upgrade Transaction -An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/blob/0fda53f265de4bcf4be1a13ea9fad450fc2e66d4/x/upgrade/proto/cosmos/upgrade/v1beta1/upgrade.proto#L14) with specific details about the upgrade. This information helps schedule the upgrade on the network and provides necessary data for automatic upgrades via `Cosmovisor`. A typical upgrade transaction will look like the following: +An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/blob/0fda53f265de4bcf4be1a13ea9fad450fc2e66d4/x/upgrade/proto/cosmos/upgrade/v1beta1/upgrade.proto#L14) with specific details about the upgrade. + +This information helps schedule the upgrade on the network and provides necessary data for automatic upgrades via `Cosmovisor`. + +A typical upgrade transaction includes: + +- `name`: Name of the upgrade. It should match the `VersionName` of `upgrades.Upgrade`. +- `height`: The height at which an upgrade should be executed and the node will be restarted. +- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. 
+ +And looks like the following as an example: ```json { @@ -72,13 +93,12 @@ An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/bl } ``` -- `name`: Name of the upgrade. It should match the `VersionName` of `upgrades.Upgrade`. -- `height`: The height at which an upgrade should be executed and the node will be restarted. -- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. - :::tip -When `cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is also optional). We only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. +When `cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in +the object above and perform a hash verification (which is also optional). + +**NOTE THAT** we only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. ::: @@ -142,7 +162,7 @@ make localnet_cancel_upgrade Note that for local testing, `cosmovisor` won't pull the binary from the upgrade Plan's info field. ::: -### LocalNet +### LocalNet Upgrades LocalNet **DOES NOT** support `cosmovisor` and automatic upgrades at the moment. @@ -231,13 +251,13 @@ For a hypothetical scenario to upgrade from `0.1` to `0.2`: ./release_binaries/poktroll_darwin_arm64 q application params ``` -### DevNet +### DevNet Upgrades DevNets currently do not support `cosmovisor`. We use Kubernetes to manage software versions, including validators. Introducing another component to manage versions would be complex, requiring a re-architecture of our current solution to accommodate this change. -### TestNet +### TestNet Upgrades We currently deploy TestNet validators using Kubernetes with helm charts, which prevents us from managing the validator with `cosmovisor`.
We do not control what other TestNet participants are running. However, if participants have deployed their nodes using the [cosmovisor guide](../../operate/run_a_node/full_node_walkthrough.md), their nodes will upgrade automatically. @@ -248,9 +268,11 @@ Until we transition to [cosmos-operator](https://github.com/strangelove-ventures 3. Monitor validator node(s) as they start and begin producing blocks. :::tip -If you are a member of Grove, you can find the instructions to access the infrastructure [here](https://www.notion.so/buildwithgrove/How-to-re-genesis-a-Shannon-TestNet-a6230dd8869149c3a4c21613e3cfad15?pvs=4). + +If you are a member of Grove, you can find the instructions to access the infrastructure [on notion](https://www.notion.so/buildwithgrove/How-to-re-genesis-a-Shannon-TestNet-a6230dd8869149c3a4c21613e3cfad15?pvs=4). + ::: -### Mainnet +### Mainnet Upgrades The Mainnet upgrade process is to be determined. We aim to develop and implement improved tooling for this environment.