diff --git a/Makefile b/Makefile index b464b13a109..a93b8fbe1ee 100644 --- a/Makefile +++ b/Makefile @@ -91,12 +91,14 @@ PROFILES_FORGE_STRESS_RTS := forge-stress-pre-rtsA4m forge-stress-pre-rtsA64m fo PROFILES_CHAINSYNC := chainsync-early-byron chainsync-early-byron-notracer chainsync-early-byron-oldtracing PROFILES_CHAINSYNC += chainsync-early-alonzo chainsync-early-alonzo-notracer chainsync-early-alonzo-oldtracing chainsync-early-alonzo-p2p PROFILES_VENDOR := dish dish-plutus dish-10M dish-10M-plutus -# "qa" and "perf" namespaces for cardano world (world.dev.cardano.org) Nomad -# Not all local profiles are compatible (yet) with a cloud run +# Cardano World (world.dev.cardano.org) Nomad cluster's "qa" class nodes # Cloud version of "default", "ci-test" and "ci-bench" -PROFILES_CW_QA := default-cw-qa ci-test-cw-qa ci-bench-cw-qa -# The 52+explorer profile -PROFILES_CW_PERF := default-cw-perf ci-test-cw-perf ci-bench-cw-perf cw-perf-value +# Not all local profiles are compatible or tested (yet) with a cloud runs +PROFILES_NOMAD_CW_QA := default-nomadcwqa ci-test-nomadcwqa ci-bench-nomadcwqa oldtracing-nomadcwqa ci-test-oldtracing-nomadcwqa ci-bench-oldtracing-nomadcwqa +# The dedicated P&T Nomad cluster on AWS +# Cloud version of "default", "ci-test" and "ci-bench" plus value (52+explorer) +# Not all local profiles are compatible or tested (yet) with a cloud runs +PROFILES_NOMAD_PERF := default-nomadperf ci-test-nomadperf ci-bench-nomadperf value-nomadperf oldtracing-nomadperf ci-test-oldtracing-nomadperf ci-bench-oldtracing-nomadperf value-oldtracing-nomadperf LOCAL_PROFILES += $(PROFILES_BASE) LOCAL_PROFILES += $(PROFILES_FAST) @@ -112,7 +114,7 @@ LOCAL_PROFILES += $(PROFILES_FORGE_STRESS_PRE) LOCAL_PROFILES += $(PROFILES_FORGE_STRESS_RTS) LOCAL_PROFILES += $(PROFILES_CHAINSYNC) LOCAL_PROFILES += $(PROFILES_VENDOR) -CLOUD_PROFILES += $(PROFILES_CW_QA) $(PROFILES_CW_PERF) +CLOUD_PROFILES += $(PROFILES_NOMAD_CW_QA) $(PROFILES_NOMAD_PERF) ## Note: to enable a shell for a profile, just add its name (one of names from 'make ps') to SHELL_PROFILES diff --git a/nix/workbench/backend/nomad-job.nix b/nix/workbench/backend/nomad-job.nix index ac61073d19d..4d2767d807a 100644 --- a/nix/workbench/backend/nomad-job.nix +++ b/nix/workbench/backend/nomad-job.nix @@ -16,17 +16,22 @@ let - # Task's (Container or chroot) defaults: + # Filesystem # - ## Default values below are stored in the job's "meta" stanza to be able to - ## overrided them with 'jq' from a workbench shell. These values in "meta" - ## are used to programatically create a "template" with "env = true;" so they - ## are automagically reachable as envars inside the Task's entrypoint and - ## 'supervisord' programs. - ## Values go: Nix (defaults) -> meta -> template -> envars + # Nomad creates a working directory for each allocation on a client. This + # directory can be found in the Nomad data_dir at ./alloc/«alloc_id». The + # allocation working directory is where Nomad creates task directories and + # directories shared between tasks, write logs for tasks, and downloads + # artifacts or templates. + # https://developer.hashicorp.com/nomad/docs/concepts/filesystem # - ## See ./oci-images.nix for further details if using the `podman` driver. - ## For the `exec` driver almost everything is here. 
+ # For example: + ## - Driver "exec" ("chroot" isolation): + ## - - NOMAD_ALLOC_DIR=/alloc + ## - - NOMAD_TASK_DIR=/local + ## - Driver "raw_exec" ("none" isolation): + ## - - NOMAD_ALLOC_DIR=DATA-DIR/alloc/XXXXXXXX/alloc + ## - - NOMAD_TASK_DIR=DATA-DIR/alloc/XXXXXXXX/TASK-NAME/local # # Templates are rendered into the task working directory. Drivers without # filesystem isolation (such as raw_exec) or drivers that build a chroot in @@ -38,20 +43,28 @@ let ## - https://developer.hashicorp.com/nomad/docs/job-specification/template#template-destinations ## - https://developer.hashicorp.com/nomad/docs/runtime/environment#task-directories ## - https://developer.hashicorp.com/nomad/docs/concepts/filesystem - task_workdir = if execTaskDriver - # A `work_dir` stanza is comming (?): - # https://github.com/hashicorp/nomad/pull/10984 - # TODO: Try with ''${NOMAD_TASK_DIR}'' in both! - then "/local" - # This value must also be used inside the `podman` `config` stanza. - else "/local" - ; - # Usually "*/local/run/current" - task_statedir = "${task_workdir}${if stateDir == "" then "" else ("/" + stateDir)}"; + + # Task's filesystem / working directory (maybe container or chroot) defaults: + # + # When using the isolated fork task driver ("exec") + ## Default values below are stored in the job's "meta" stanza to be able to + ## overrided them with 'jq' from a workbench shell. These values in "meta" + ## are used to programatically create a "template" with "env = true;" so they + ## are automagically reachable as envars inside the Task's entrypoint and + ## 'supervisord' programs. + ## Values go: Nix (defaults) -> meta -> template -> envars + # + ## See ./oci-images.nix for further details if using the `podman` driver. + ## For the `exec` driver almost everything is here. + # + # A symlink to the supervisord nix-installed inside the OCI image/chroot. - # We need to be able to `nomad exec supervisorctl ...` , for these the path + # We need to be able to `nomad exec supervisorctl ...` , for this the path # of the installed supervisor binaries is needed. - task_supervisor_nix = "${task_statedir}/supervisor/nix-store"; + task_supervisor_nix = "${stateDir}/supervisor/nix-store"; + # Location of the supervisord config file inside the container. + # This file can be mounted as a volume or created as a template. + task_supervisord_conf = "${stateDir}/supervisor/supervisord.conf"; # The URL to the listening inet or socket of the supervisord server: # The problem is that if we use "127.0.0.1:9001" as parameter (without the # "http" part) the container returns: @@ -63,11 +76,8 @@ let # the container I get (from journald): # Nov 02 11:44:36 hostname cluster-18f3852f-e067-6394-8159-66a7b8da2ecc[1088457]: Error: Cannot open an HTTP server: socket.error reported -2 # Nov 02 11:44:36 hostname cluster-18f3852f-e067-6394-8159-66a7b8da2ecc[1088457]: For help, use /nix/store/izqhlj5i1x9ldyn43d02kcy4mafmj3ci-python3.9-supervisor-4.2.4/bin/supervisord -h - unixHttpServerPort = "/tmp/supervisor.sock"; + unixHttpServerPort = "/tmp/supervisor-{{ env \"NOMAD_TASK_NAME\" }}.sock"; task_supervisord_url = "unix://${unixHttpServerPort}"; - # Location of the supervisord config file inside the container. - # This file can be mounted as a volume or created as a template. 
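A practical consequence of the per-Task socket name above: once the job is running, each Task's supervisord can be reached individually from outside the allocation. A minimal sketch, assuming the `exec` driver (so the Task directory is visible as `/local`), a stateDir of `run/current`, a Task named `node-0` and a placeholder allocation ID:

  # Hypothetical allocation ID; the workbench resolves it with
  # `wb_nomad job task-name-allocation-id`.
  ALLOC_ID="xxxxxxxx"
  # Use the supervisor symlink created by the entrypoint to query the
  # per-task control socket rendered from the template above.
  nomad alloc exec -task node-0 "${ALLOC_ID}" \
    /local/run/current/supervisor/nix-store/bin/supervisorctl \
      --serverurl unix:///tmp/supervisor-node-0.sock status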
- task_supervisord_conf = "${task_statedir}/supervisor/supervisord.conf"; task_supervisord_loglevel = "info"; entrypoint = @@ -76,21 +86,37 @@ let supervisor = containerSpecs.containerPkgs.supervisor.nix-store-path; in escapeTemplate '' - # Store the entrypoint's envars in a file for debugging purposes. - ${coreutils}/bin/env > /local/entrypoint.env + # Store entrypoint's envars and "uname" in a file for debugging purposes. + ${coreutils}/bin/env > "''${NOMAD_TASK_DIR}"/entrypoint.env + ${coreutils}/bin/uname -a > "''${NOMAD_TASK_DIR}"/entrypoint.uname + ${coreutils}/bin/cat /proc/cpuinfo > "''${NOMAD_TASK_DIR}"/entrypoint.cpuinfo + # Directories map to use when `nomad fs` and `nomad alloc exec` + SUPERVISOR_NIX="''${NOMAD_TASK_DIR}/${task_supervisor_nix}" + SUPERVISOR_CONF="''${NOMAD_TASK_DIR}/${task_supervisord_conf}" + echo \ + "{ \ + \"nomad\": { \ + \"alloc\": \"''${NOMAD_ALLOC_DIR}\" \ + , \"task\": \"''${NOMAD_TASK_DIR}\" \ + } \ + , \"workbench\": { \ + \"state\": \"''${NOMAD_TASK_DIR}/${stateDir}\" \ + } \ + , \"supervisor\": { \ + \"nix\": \"''${SUPERVISOR_NIX}\" \ + , \"config\": \"''${SUPERVISOR_CONF}\" \ + , \"socket\": \"${unixHttpServerPort}\" \ + , \"url\": \"${task_supervisord_url}\" \ + } \ + }" \ + > "''${NOMAD_TASK_DIR}"/entrypoint.dirs # Only needed for "exec" ? if test "''${TASK_DRIVER}" = "exec" then - cd "''${TASK_WORKDIR}" + cd "''${NOMAD_TASK_DIR}" fi - # The SUPERVISOR_NIX variable must be set - [ -z "''${SUPERVISOR_NIX:-}" ] && echo "SUPERVISOR_NIX env var must be set -- aborting" && exit 1 - - # The SUPERVISORD_CONFIG variable must be set - [ -z "''${SUPERVISORD_CONFIG:-}" ] && echo "SUPERVISORD_CONFIG env var must be set -- aborting" && exit 1 - # Create a symlink to 'supervisor' Nix Store folder so we can call it from # 'ssh' or 'nomad exec' without having it in PATH or knowing the currently # running version. But first check if it already exists to be able to @@ -107,9 +133,9 @@ let LOGLEVEL="''${SUPERVISORD_LOGLEVEL:-info}" # Start `supervisord` on the foreground. - # Avoid buffer related problems with stdout and stderr disabling buffering + # Make sure it never runs in unbuffered mode: # https://docs.python.org/3/using/cmdline.html#envvar-PYTHONUNBUFFERED - PYTHONUNBUFFERED=TRUE ${supervisor}/bin/supervisord --nodaemon --configuration "''${SUPERVISORD_CONFIG}" --loglevel="''${LOGLEVEL}" + PYTHONUNBUFFERED="" ${supervisor}/bin/supervisord --nodaemon --configuration "''${SUPERVISOR_CONF}" --loglevel="''${LOGLEVEL}" '' ; @@ -178,7 +204,7 @@ let # multiple times to define additional constraints. # Cloud runs set the distinct hosts constraint here but local runs can't # because we are only starting one Nomad client. - constraint = null; + constraint = null; # Values are appended inside the workbench (bash). # The reschedule stanza specifies the group's rescheduling strategy. If # specified at the job level, the configuration will apply to all groups @@ -199,12 +225,8 @@ let # Specifies a key-value map that annotates with user-defined metadata. meta = { # Only top level "KEY=STRING" are allowed, no child objects/attributes! 
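Since the entrypoint now writes its debugging artifacts into `NOMAD_TASK_DIR`, they can be read back without entering the Task. A hedged sketch, assuming the same placeholder allocation ID and a Task named `node-0` (under `nomad alloc fs` the task directory is exposed as `<task-name>/local`):

  ALLOC_ID="xxxxxxxx"  # placeholder
  # Environment the entrypoint started with.
  nomad alloc fs "${ALLOC_ID}" node-0/local/entrypoint.env
  # The directories map written by the entrypoint (alloc/task dirs,
  # workbench state dir and supervisor socket/config locations).
  nomad alloc fs "${ALLOC_ID}" node-0/local/entrypoint.dirs | jq .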
+        WORKBENCH_STATEDIR = stateDir;
         TASK_DRIVER = if execTaskDriver then "exec" else "podman";
-        TASK_WORKDIR = task_workdir;
-        TASK_STATEDIR = task_statedir;
-        SUPERVISOR_NIX = task_supervisor_nix;
-        SUPERVISORD_URL = task_supervisord_url;
-        SUPERVISORD_CONFIG = task_supervisord_conf;
         SUPERVISORD_LOGLEVEL = task_supervisord_loglevel;
         ONE_TRACER_PER_NODE = oneTracerPerNode;
       };
@@ -271,12 +293,12 @@ let
       constraint = {
         attribute = "\${node.class}";
         operator = "=";
-        # For cloud benchmarking, dedicated static machines in the "perf"
-        # class are used. We mimic that for local/test runs.
+        # Cloud jobs can run in the dedicated P&T Nomad cluster on AWS or in
+        # Cardano World Nomad cluster's "qa" class nodes.
         # This default is just a precaution, like the top level namespace,
-        # because there are also available "qa" Class nodes but usage of these
-        # must be limited to short test and "infra" Class nodes are used for
-        # HA jobs and must be avoided entirely.
+        # because "qa" Class nodes' usage must be limited to short tests and
+        # "infra" Class nodes, which are used for HA jobs, must be avoided
+        # entirely.
         value = "perf";
       };
@@ -410,9 +432,9 @@ let
       };
       # Sensible defaults to run cloud version of "default", "ci-test" and
-      # "ci-bench" in cardano-world qa class Nomad nodes.
-      # For benchmarking dedicated static machines in the "perf" class are
-      # used and this value should be updated accordingly.
+      # "ci-bench" in Cardano World Nomad cluster's "qa" class nodes.
+      # For benchmarking, the dedicated P&T Nomad cluster on AWS is used and
+      # this value should be updated accordingly.
       resources = {
         # Task can only ask for 'cpu' or 'cores' resource but not both.
         cores = 2; # cpu = 512;
@@ -448,10 +470,10 @@ let
        # address of an AWS EC2 instance set this to
        # ${attr.unique.platform.aws.public-ipv4}.
        address =
-          # When using Cardano World (nomad.world.dev.cardano.org) "perf"
-          # class nodes we use public IPs/routing, all the other cloud runs
-          # are behind a VPC/firewall. Local runs just use 12.0.0.1.
+          # When using the dedicated P&T Nomad cluster on AWS we use public
+          # IPs/routing; all the other cloud runs are behind a VPC/firewall.
+          # Local runs just use 127.0.0.1.
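The remaining "meta" values are the ones a workbench shell is expected to patch with `jq` before the job is submitted, as the comments above describe. A hedged sketch of that kind of patching (the file name is a placeholder and the exact position of "meta" inside the job JSON is an assumption):

  job_file="nomad-job.json"                            # placeholder path
  job_name="$(jq -r '.job | keys[0]' "${job_file}")"   # same lookup used by nomad.sh below
  jq --arg name "${job_name}" \
     '.job[$name].meta.SUPERVISORD_LOGLEVEL = "debug"' \
     "${job_file}" > "${job_file}".tmp \
  && mv "${job_file}".tmp "${job_file}"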
+ if lib.strings.hasInfix "-nomadperf" profileData.profileName then "\${attr.unique.platform.aws.public-ipv4}" else "" ; @@ -511,11 +533,7 @@ let # https://developer.hashicorp.com/nomad/docs/runtime/environment data = '' TASK_DRIVER="{{ env "NOMAD_META_TASK_DRIVER" }}" - TASK_WORKDIR="{{ env "NOMAD_META_TASK_WORKDIR" }}" - TASK_STATEDIR="{{ env "NOMAD_META_TASK_STATEDIR" }}" - SUPERVISOR_NIX="{{ env "NOMAD_META_SUPERVISOR_NIX" }}" - SUPERVISORD_URL="{{ env "NOMAD_META_SUPERVISORD_URL" }}" - SUPERVISORD_CONFIG="{{ env "NOMAD_META_SUPERVISORD_CONFIG" }}" + WORKBENCH_STATEDIR="{{ env "NOMAD_META_WORKBENCH_STATEDIR" }}" SUPERVISORD_LOGLEVEL="{{ env "NOMAD_META_SUPERVISORD_LOGLEVEL" }}" ''; # Specifies the behavior Nomad should take if the rendered @@ -529,7 +547,7 @@ let ## Make the profile.json file available (mainly for healthchecks) { env = false; - destination = "${task_statedir}/profile.json"; + destination = "local/${stateDir}/profile.json"; data = escapeTemplate (__readFile profileData.JSON.outPath); change_mode = "noop"; @@ -538,7 +556,7 @@ let ## Make the node-specs.json file available (mainly for healthchecks) { env = false; - destination = "${task_statedir}/node-specs.json"; + destination = "local/${stateDir}/node-specs.json"; data = escapeTemplate (__readFile profileData.node-specs.JSON.outPath); change_mode = "noop"; @@ -547,7 +565,7 @@ let # entrypoint { env = false; - destination = "${task_workdir}/entrypoint.sh"; + destination = "local/entrypoint.sh"; data = entrypoint; change_mode = "noop"; error_on_missing_key = true; @@ -555,7 +573,7 @@ let # Dynamically generated addresses for debugging purposes { env = false; - destination = "${task_workdir}/networking.json"; + destination = "local/networking.json"; data = '' { {{- $first := true -}} @@ -598,7 +616,7 @@ let ## supervisord configuration file. { env = false; - destination = "${task_supervisord_conf}"; + destination = "local/${task_supervisord_conf}"; data = escapeTemplate (__readFile ( let supervisorConf = import ./supervisor-conf.nix { inherit pkgs lib stateDir; @@ -630,7 +648,7 @@ let ## Tracer start.sh script. { env = false; - destination = "${task_statedir}/tracer/start.sh"; + destination = "local/${stateDir}/tracer/start.sh"; data = escapeTemplate profileData.tracer-service.start.value; change_mode = "noop"; @@ -640,7 +658,7 @@ let ## Tracer configuration file. { env = false; - destination = "${task_statedir}/tracer/config.json"; + destination = "local/${stateDir}/tracer/config.json"; data = escapeTemplate (lib.generators.toJSON {} # TODO / FIXME: Ugly config patching! (lib.attrsets.recursiveUpdate @@ -669,7 +687,7 @@ let ## Node start.sh script. { env = false; - destination = "${task_statedir}/${nodeSpec.name}/start.sh"; + destination = "local/${stateDir}/${nodeSpec.name}/start.sh"; data = escapeTemplate ( let scriptValue = profileData.node-services."${nodeSpec.name}".start.value; in if execTaskDriver @@ -687,7 +705,7 @@ let ## Node configuration file. { env = false; - destination = "${task_statedir}/${nodeSpec.name}/config.json"; + destination = "local/${stateDir}/${nodeSpec.name}/config.json"; data = escapeTemplate (lib.generators.toJSON {} profileData.node-services."${nodeSpec.name}".config.value); change_mode = "noop"; @@ -696,7 +714,7 @@ let ## Node topology file. 
{ env = false; - destination = "${task_statedir}/${nodeSpec.name}/topology.json"; + destination = "local/${stateDir}/${nodeSpec.name}/topology.json"; data = escapeTemplate ( let topology = profileData.node-services."${nodeSpec.name}".topology; in if execTaskDriver @@ -713,7 +731,7 @@ let ## Generator start.sh script. { env = false; - destination = "${task_statedir}/generator/start.sh"; + destination = "local/${stateDir}/generator/start.sh"; data = escapeTemplate profileData.generator-service.start.value; change_mode = "noop"; @@ -723,7 +741,7 @@ let ## Generator configuration file. { env = false; - destination = "${task_statedir}/generator/run-script.json"; + destination = "local/${stateDir}/generator/run-script.json"; data = escapeTemplate ( let runScript = profileData.generator-service.config; in if execTaskDriver @@ -740,7 +758,7 @@ let ## healthcheck start.sh script. { env = false; - destination = "${task_statedir}/healthcheck/start.sh"; + destination = "local/${stateDir}/healthcheck/start.sh"; data = escapeTemplate profileData.healthcheck-service.start.value; change_mode = "noop"; @@ -765,7 +783,7 @@ let ## ssh start.sh script. { env = false; - destination = "${task_statedir}/ssh/start.sh"; + destination = "local/${stateDir}/ssh/start.sh"; data = escapeTemplate ssh-service.start.value; change_mode = "noop"; error_on_missing_key = true; @@ -774,15 +792,15 @@ let ## ssh config file. { env = false; - destination = "${task_statedir}/ssh/sshd_config"; + destination = "local/${stateDir}/ssh/sshd_config"; data = escapeTemplate ssh-service.config.value; change_mode = "noop"; error_on_missing_key = true; perms = "744"; # Only for every "start.sh" script. Default: "644" } # The deployer script must add the templates for the private keys: - # - ${task_statedir}/ssh/sshd.id_ed25519 - # - ${task_statedir}/ssh/nobody.id_ed25519.pub + # - local/${stateDir}/ssh/sshd.id_ed25519 + # - local/${stateDir}/ssh/nobody.id_ed25519.pub ] )) ; @@ -817,7 +835,7 @@ let command = "${containerSpecs.containerPkgs.bashInteractive.nix-store-path}/bin/bash"; - args = ["${task_workdir}/entrypoint.sh"]; + args = ["local/entrypoint.sh"]; nix_installables = (lib.attrsets.mapAttrsToList @@ -839,7 +857,7 @@ let command = "${containerSpecs.containerPkgs.bashInteractive.nix-store-path}/bin/bash"; - args = ["${task_workdir}/entrypoint.sh"]; + args = ["local/entrypoint.sh"]; # The image to run. Accepted transports are docker (default if # missing), oci-archive and docker-archive. Images reference as @@ -884,8 +902,7 @@ let # The working directory for the container. Defaults to the # default set in the image. - #working_dir = ''{{ env "NOMAD_META_TASK_WORKDIR" }}''; - working_dir = task_workdir; + working_dir = "local/"; }; } @@ -1178,15 +1195,20 @@ let # Port string from ''--port ${toString nodeSpec.port}'' ] - # On cloud deployments to SRE-managed Nomad, that uses AWS, the hosts at - # Linux level may not be aware of the EIP public address they have so we - # can't bind to the public IP (that we can resolve to using templates). - # I prefer being more specific but the "all-weather" alternative is to - # bind to 0.0.0.0 instead of the private IP, just in case the Nomad Client - # was not started with the correct `-network-interface XX` parameter. + # On cloud deployments to SRE-managed / dedicated P&T Nomad cluster, that + # uses AWS, the hosts at Linux level may not be aware of the EIP public + # address they have so we can't bind to the public IP (that we can resolve + # to using templates). 
The only options available are to bind to the + # "all-weather" 0.0.0.0 or use the private IP provided by AWS. We use the + # latter in case the Nomad Client was not started with the correct + # `-network-interface XX` parameter. [ # Address string to - ''--host-addr 0.0.0.0'' + ( + if lib.strings.hasInfix "-nomadperf" profileData.profileName + then ''--host-addr {{ env "attr.unique.platform.aws.local-ipv4" }}'' + else ''--host-addr 0.0.0.0'' + ) # Alternatives (may not work): #''--host-addr {{ env "NOMAD_HOST_IP_${servicePortName}" }}'' #''--host-addr {{ env "NOMAD_IP_${servicePortName}" }}'' diff --git a/nix/workbench/backend/nomad.sh b/nix/workbench/backend/nomad.sh index ecd66a40a44..ad30fc2e8af 100644 --- a/nix/workbench/backend/nomad.sh +++ b/nix/workbench/backend/nomad.sh @@ -26,7 +26,7 @@ backend_nomad() { case "$op" in ############################################################################ - # Functions to configure cluster: + # Functions to configure the cluster and start the Nomad job: # - setenv-defaults BACKEND-DIR # - allocate-run RUN-DIR # - allocate-run-directory-nomad RUN-DIR (Nomad only) @@ -42,26 +42,30 @@ backend_nomad() { ############################################################################ # * Functions in the backend "interface" must use `fatal` when errors! + # Completely overrided by each sub-backend (exec.sh, podman.sh and cloud.sh) setenv-defaults ) + # The impossible just happened? fatal "Function \"setenv-defaults\" is Nomad backend specific" ;; - # After `allocate-run` the Nomad is running waiting for the genesis to be - # deployed and tracer/cardano-nodes/generator to be started. + # After `allocate-run` the Nomad job is running (supervisord) waiting for + # genesis to be deployed and tracer/cardano-nodes/generator to be started. # - # "generator", "tracer" and "node" folders contents (start.sh, config files, - # etc) are included in the Nomad Job spec file as "template" stanzas and are - # materialized inside the container when the job is started. This is how it - # works for every environment combination (podman/exec-(local/cloud)). + # "generator", "tracer", "node" and "healthcheck" folder contents (start.sh, + # config files, etc) are included in the Nomad Job spec file as "template" + # stanzas and are materialized inside the container when the job is started. + # This is how it works for every environment combination + # (podman/exec-(local/cloud)). # - # But "genesis" and "CARDANO_MAINNET_MIRROR" are the exceptions: + # But "genesis" and "CARDANO_MAINNET_MIRROR" are the deployment exceptions: # - "CARDANO_MAINNET_MIRROR": is added as a Nix dependency using the # `nix_installables` stanza when using the "exec" driver and is mounted as a # local volume for "podman" that currently is only allowed to run locally. # - "genesis": it's too big for a "template" stanza so we are mounting it - # locally for "podman" and uploading it to a cloud storage to download using - # "nomad exec" when the "exec" task driver is used, the latter means - # creating an HTTP server for local runs and using Amazon S2 for cloud runs. + # locally for "podman" and uploading it to a cloud storage service to + # download it using "nomad exec" when the "exec" task driver is used, the + # latter means creating an HTTP server for local runs and using Amazon S2 + # for cloud runs. allocate-run ) local usage="USAGE: wb backend $op RUN-DIR" local dir=${1:?$usage}; shift @@ -108,8 +112,10 @@ backend_nomad() { do mkdir "${dir}"/nomad/"${node}" done + # A "tracer"(s) is optional. 
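The "genesis" exception described above implies a pack, upload, fetch and unpack round trip. Only the fetch/unpack half (`deploy-genesis-wget`) is part of this file; a hedged sketch of the packing half the sub-backends are expected to perform before publishing the archive over HTTP or S3, mirroring the flags used by the unpack step further down (paths are placeholders):

  # Pack the run's genesis directory with zstd so the Tasks can later unpack
  # it with `tar --use-compress-program=zstd --one-top-level=.../genesis`.
  tar --create \
      --use-compress-program=zstd \
      --file="${dir}"/genesis.tar.zst \
      --directory="${dir}"/genesis \
      .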
if jqtest ".node.tracer" "${dir}"/profile.json then + # Create a unique dedicated tracer only when not one-tracer-per-node. if ! test "${one_tracer_per_node}" = "true" then mkdir "${dir}"/nomad/tracer @@ -177,10 +183,10 @@ backend_nomad() { local nodes=($(jq_tolist keys "${dir}"/node-specs.json)) for node in ${nodes[*]} do - # Files "start.sh" and "topology.sh" that usually go in here are copied - # from the Task/container once it's started because the contents are - # created or patched using Nomad's "template" stanza in the job spec - # and we want to hold a copy of what was actually run. + # Files "start.sh", "config.json" and "topology.sh" that usually go in + # here are copied from the Task/container once it's started because the + # contents are created or patched using Nomad's "template" stanza in the + # job spec and we want to hold a copy of what was actually run. mkdir "${dir}"/"${node}" done ;; @@ -189,8 +195,9 @@ backend_nomad() { local usage="USAGE: wb backend $op RUN-DIR" local dir=${1:?$usage}; shift # Not much to do! - # Generator always runs inside Task/container "node-0" for local runs + # Generator actually runs inside Task/container "node-0" for local runs # and "explorer" for cloud runs. + # See: `local generator_task=$(envjqr 'generator_task_name')` mkdir -p "${dir}"/generator ;; @@ -202,17 +209,17 @@ backend_nomad() { local nodes=($(jq_tolist keys "${dir}"/node-specs.json)) for node in ${nodes[*]} do - # Files "start.sh" and "topology.sh" that usually go in here are copied - # from the Task/container once it's started because the contents are - # created or patched using Nomad's "template" stanza in the job spec - # and we want to hold a copy of what was actually run. + # File "start.sh" that usually goes in here is copied from the + # Task/container once it's started because the contents are created or + # patched using Nomad's "template" stanza in the job spec and we want to + # hold a copy of what was actually run. mkdir "${dir}"/healthcheck/"${node}" done ;; # Change the Nomad job name to the current run tag. This allows to run # multiple clusters simulatenously (as long as the network isolation mode - # and/or topology.json allows no port clashing) + # and/or topology.json is designed for no port clashing) allocate-run-nomad-job-patch-name ) local usage="USAGE: wb backend $op RUN-DIR JOB-NAME" local dir=${1:?$usage}; shift @@ -277,10 +284,11 @@ backend_nomad() { ############################################################################ # Functions to start the cluster: - # - is-running RUN-DIR - # - start-cluster RUN-DIR - # - start-config-download RUN-DIR (Nomad only) - # - deploy-genesis-wget RUN-DIR (Nomad only) + # - is-running RUN-DIR + # - start-cluster RUN-DIR + # - start-config-download RUN-DIR (Nomad only) + # - start-services-download RUN-DIR (Nomad only) + # - deploy-genesis-wget RUN-DIR (Nomad only) ############################################################################ # * Functions in the backend "interface" must use `fatal` when errors! @@ -322,71 +330,153 @@ backend_nomad() { local usage="USAGE: wb backend $op RUN-DIR" local dir=${1:?$usage}; shift touch "${dir}"/starting - if \ - ! backend_nomad start-nomad-job "${dir}" \ - || \ - # TODO: Send the download everything job to the background ??? - ! backend_nomad start-config-download "${dir}" + if ! backend_nomad start-nomad-job "${dir}" then - backend_nomad stop-nomad-job "${dir}" fatal "Backend start failed!" + else + # Don't send to the background! + if ! 
backend_nomad start-config-download "${dir}" || ! backend_nomad start-services-download "${dir}"
+      then
+        msg "$(yellow "We don't want to start a job/cluster if we are not able to download what has been deployed (copies of dynamically generated files)")"
+        backend_nomad stop-nomad-job "${dir}" || msg "stop-nomad-job failed!"
+        fatal "Backend start failed!"
+      fi
     fi
     rm "${dir}"/starting; touch "${dir}"/started
   ;;
-  # Downloads all Nomad job deployed files, the "template" stanzas.
+  # Downloads all Nomad Job deployed files, the "template" stanzas that suffer
+  # interpolation, and the entrypoint script generated files.
   start-config-download )
     local usage="USAGE: wb backend $op RUN-DIR"
     local dir=${1:?$usage}; shift
     local one_tracer_per_node=$(envjqr 'one_tracer_per_node')
+    # Do not download everything in parallel; first the entrypoints' files and
+    # then the templates:
+    # Error querying allocation: Unexpected response code: 429 (Your IP is issuing too many concurrent connections, please rate limit your calls)
+
+    # First entrypoints.
+    local jobs_entrypoints=()
+    msg "Fetch entrypoints' generated files ..."
+    # For every node (not including a possible tracer Task) ...
+    local nodes=($(jq_tolist keys "$dir"/node-specs.json))
+    for node in ${nodes[*]}
+    do
+      # The entrypoint generated files of the Task where the node runs.
+      backend_nomad download-config-entrypoint "${dir}" "${node}" &
+      jobs_entrypoints+=("$!")
+    done
+    if ! test "${one_tracer_per_node}" = "true"
+    then
+      # The entrypoint generated files of the Task where the tracer runs.
+      backend_nomad download-config-entrypoint "${dir}" "tracer" &
+      jobs_entrypoints+=("$!")
+    fi
+    # Wait and check!
+    if test -n "${jobs_entrypoints}"
+    then
+      if ! wait_kill_em_all "${jobs_entrypoints[@]}"
+      then
+        msg "$(red "Config downloads failed!")"
+        return 1
+      else
+        msg "Finished fetching entrypoints' generated files"
+      fi
+    fi
+
+    # Last, the Tasks' template stanzas.
     msg "Fetch Nomad generated files ..."
-    local jobs_array=()
-    # Only used for debugging!
-    backend_nomad download-config-generator "${dir}" &
-    jobs_array+=("$!")
-    # For every node ...
+    local jobs_tasks=()
+    # The `tx-generator` config files, running in one of the Tasks where
+    # `cardano-node` is deployed.
+    backend_nomad download-config-generator "${dir}" &
+    jobs_tasks+=("$!")
+    # For every node (not including a possible tracer Task) ...
     local nodes=($(jq_tolist keys "$dir"/node-specs.json))
     for node in ${nodes[*]}
     do
-      # Only used for debugging!
-      backend_nomad download-config-node "${dir}" "${node}" &
-      jobs_array+=("$!")
+      # `cardano-node` config files.
+      backend_nomad download-config-node "${dir}" "${node}" &
+      jobs_tasks+=("$!")
     done
-    # This same script looks for the socket path inside the tracer config
     if test "${one_tracer_per_node}" = "true"
     then
       local nodes=($(jq_tolist keys "$dir"/node-specs.json))
       for node in ${nodes[*]}
       do
-        backend_nomad download-config-tracer "${dir}" "${node}" &
-        jobs_array+=("$!")
+        # `cardano-tracer` config files, running in one of the Tasks where
+        # `cardano-node` is deployed.
+        backend_nomad download-config-tracer "${dir}" "${node}" &
+        jobs_tasks+=("$!")
       done
     else
-      backend_nomad download-config-tracer "${dir}" "tracer" &
-      jobs_array+=("$!")
+      # The tracer runs as an extra Nomad Task.
+      backend_nomad download-config-tracer "${dir}" "tracer" &
+      jobs_tasks+=("$!")
     fi
-    # For every node ...
+    # For every node (not including a possible tracer Task) ...
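These download loops rely on two workbench helpers that are not part of this hunk: `wait_kill_em_all` (fail fast and kill the sibling jobs) and `wait_all` (wait for every job, remembering failures). Their real definitions live elsewhere in the workbench; the following is only a minimal sketch of the fail-fast variant, under that assumed semantics, to make the control flow easier to follow:

  # Hedged sketch, not the workbench implementation.
  wait_kill_em_all_sketch () {
    local pids=("$@") pid
    for pid in "${pids[@]}"
    do
      if ! wait "${pid}"
      then
        # One background job failed: kill the rest and report the failure.
        kill "${pids[@]}" 2>/dev/null || true
        return 1
      fi
    done
  }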
local nodes=($(jq_tolist keys "$dir"/node-specs.json)) for node in ${nodes[*]} do # Only used for debugging! backend_nomad download-config-healthcheck "${dir}" "${node}" & + jobs_tasks+=("$!") + done + # Wait and check! + if test -n "${jobs_tasks}" + then + if ! wait_kill_em_all "${jobs_tasks[@]}" + then + msg "$(red "Config downloads failed!")" + return 1 + else + msg "Finished fetching Nomad generated files" + fi + fi + ;; + + # Downloads all Nomad services definitions, the "service" stanzas. + start-services-download ) + local usage="USAGE: wb backend $op RUN-DIR" + local dir=${1:?$usage}; shift + local one_tracer_per_node=$(envjqr 'one_tracer_per_node') + + msg "Fetch Nomad services definitions ..." + local jobs_array=() + # For every node ... + local i_array=($(jq_tolist 'map(.i)' "${dir}"/node-specs.json)) + for i in ${i_array[*]} + do + local node_name + node_name=$(jq -r "map(select( .i == ${i} ))[0].name" "${dir}"/node-specs.json) + nomad service info -json "perfnode${i}" > "${dir}"/nomad/"${node_name}"/service-info.json & jobs_array+=("$!") done + if ! test "${one_tracer_per_node}" = "true" + then + nomad service info -json "perftracer" > "${dir}"/nomad/tracer/service-info.json & + jobs_array+=("$!") + fi # Wait and check! if test -n "${jobs_array}" then - if ! wait_fail_any "${jobs_array[@]}" + if ! wait_kill_em_all "${jobs_array[@]}" then - backend_nomad stop-nomad-job "${dir}" - fatal "Downloads failed!" + msg "$(red "Service definitions downloads failed!")" + return 1 else - msg "Finished fetching Nomad generated files" + msg "Finished fetching Nomad services definitions" fi fi ;; + # Completely overrided by each sub-backend (exec.sh, podman.sh and cloud.sh) + deploy-genesis ) + # The impossible just happened? + fatal "Function \"deploy-genesis\" is Nomad backend specific" + ;; + # Called by the sub-backends, don't use `fatal` and let them do the cleaning deploy-genesis-wget ) local usage="USAGE: wb backend $op RUN-DIR" @@ -401,19 +491,23 @@ backend_nomad() { for node in ${nodes[*]} do msg "$(blue Downloading) $(yellow "\"${uri}\"") from $(yellow "node \"${node}\"") ..." - backend_nomad task-exec "${dir}" "${node}" \ - "${wget_path}"/bin/wget \ - --output-document=/local/run/current/genesis.tar.zst \ - "${uri}" \ - --no-verbose \ - > /dev/null \ + # When executing commands the directories used depend on the filesystem + # isolation mode (AKA chroot or not). + local state_dir + state_dir="$(backend_nomad task-workbench-state-dir "${dir}" "${node}")" + backend_nomad task-exec "${dir}" "${node}" \ + "${wget_path}"/bin/wget \ + --output-document="${state_dir}"/genesis.tar.zst \ + "${uri}" \ + --no-verbose \ + > /dev/null \ & uploads_array+=("$!") done # Wait and check! if test -n "${uploads_array}" then - if ! wait_fail_any "${uploads_array[@]}" + if ! wait_kill_em_all "${uploads_array[@]}" then msg "$(red "Failed to upload some genesis files")" return 1 @@ -425,11 +519,15 @@ backend_nomad() { local unpacks_array=() for node in ${nodes[*]} do + # When executing commands the directories used depend on the + # filesystem isolation mode (AKA chroot or not). 
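Once `start-services-download` has stored the `service-info.json` files, the address each node registered can be read back locally instead of querying Nomad again. A hedged sketch (the run directory is a placeholder and the field names follow Nomad's service registration API, which is an assumption here):

  # Print "service-name address:port" for node-0 of the current run.
  jq -r '.[] | "\(.ServiceName) \(.Address):\(.Port)"' \
    run/current/nomad/node-0/service-info.json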
+ local state_dir + state_dir="$(backend_nomad task-workbench-state-dir "${dir}" "${node}")" backend_nomad task-exec "${dir}" "${node}" \ - ${tar_path}/bin/tar --extract \ + "${tar_path}"/bin/tar --extract \ --use-compress-program="${zstd_path}"/bin/zstd \ - --file=/local/run/current/genesis.tar.zst \ - --one-top-level=/local/run/current/genesis \ + --file="${state_dir}"/genesis.tar.zst \ + --one-top-level="${state_dir}"/genesis \ --same-permissions \ --no-same-owner \ --numeric-owner \ @@ -441,7 +539,7 @@ backend_nomad() { # Wait and check! if test -n "${unpacks_array}" then - if ! wait_fail_any "${unpacks_array[@]}" + if ! wait_kill_em_all "${unpacks_array[@]}" then msg "$(red "Failed to unpack some genesis files")" return 1 @@ -510,6 +608,7 @@ backend_nomad() { then if test "${nomad_agents_were_already_running}" = "false" then + # No checks, `agents stop` checks for errors and uses msg to show them! wb_nomad agents stop \ "${server_name}" "${client_name}" "${nomad_task_driver}" fi @@ -561,9 +660,11 @@ backend_nomad() { local nomad_agents_were_already_running=$(envjqr 'nomad_agents_were_already_running') local nomad_job_name=$(jq -r ". [\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json) touch "${dir}"/nomad/stopped - wb_nomad job stop "${dir}/nomad/nomad-job.json" "${nomad_job_name}" > "${dir}/nomad/job.stop.stdout" 2> "$dir/nomad/job.stop.stderr" || true + # No checks, `job stop` checks for errors and uses msg to show them! + wb_nomad job stop "${dir}/nomad/nomad-job.json" "${nomad_job_name}" > "${dir}/nomad/job.stop.stdout" 2> "$dir/nomad/job.stop.stderr" if test "${nomad_agents_were_already_running}" = "false" then + # No checks, `agents stop` checks for errors and uses msg to show them! wb_nomad agents stop \ "${server_name}" "${client_name}" "${nomad_task_driver}" fi @@ -656,7 +757,8 @@ backend_nomad() { # - stop-all-nodes RUN-DIR (Nomad only) # - stop-all-tracers RUN-DIR (Nomad only) # - fetch-logs RUN-DIR - # - stop-cluster RUN-DIR + # - stop-cluster-internal RUN-DIR + # - stop-cluster-local RUN-DIR # - cleanup-cluster RUN-DIR ############################################################################ # * Functions in the backend "interface" must use `fatal` when errors! @@ -674,7 +776,7 @@ backend_nomad() { backend_nomad stop-all-healthchecks "${dir}" "${node}" & jobs_healthchecks_array+=("$!") done - if ! wait_fail_any "${jobs_healthchecks_array[@]}" + if ! wait_all "${jobs_healthchecks_array[@]}" then msg "$(red "Failed to stop healthcheck(s)")" fi @@ -689,7 +791,7 @@ backend_nomad() { backend_nomad stop-all-nodes "${dir}" "${node}" & jobs_nodes_array+=("$!") done - if ! wait_fail_any "${jobs_nodes_array[@]}" + if ! wait_all "${jobs_nodes_array[@]}" then msg "$(red "Failed to stop node(s)")" fi @@ -705,7 +807,7 @@ backend_nomad() { backend_nomad stop-all-tracers "${dir}" "${node}" & jobs_tracers_array+=("$!") done - if ! wait_fail_any "${jobs_tracers_array[@]}" + if ! wait_all "${jobs_tracers_array[@]}" then msg "$(red "Failed to stop tracer(s)")" fi @@ -859,9 +961,15 @@ backend_nomad() { fi ;; - # All or clean up everything! - # Called by `scenario.sh` without exit trap (`scenario_setup_exit_trap`)! + # Completely overrided by each sub-backend (exec.sh, podman.sh and cloud.sh) stop-cluster ) + # The impossible just happened? + fatal "Function \"stop-cluster\" is Nomad backend specific" + ;; + + # All or clean up everything! + # Called after `scenario.sh` without exit trap (`scenario_setup_exit_trap`)! 
+ stop-cluster-internal ) local usage="USAGE: wb backend $op RUN-DIR" local dir=${1:?$usage}; shift local nomad_job_name=$(jq -r ". [\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json) @@ -875,8 +983,17 @@ backend_nomad() { fi msg "$(blue Stopping) Nomad $(yellow "Job \"${nomad_job_name}\"")..." - # TODO: Show output or do something if it fails? - wb_nomad job stop "${dir}/nomad/nomad-job.json" "${nomad_job_name}" > "$dir/nomad/job.stop.stdout" 2> "$dir/nomad/job.stop.stderr" || true + # No checks, `job stop` checks for errors and uses msg to show them! + wb_nomad job stop "${dir}/nomad/nomad-job.json" "${nomad_job_name}" > "$dir/nomad/job.stop.stdout" 2> "$dir/nomad/job.stop.stderr" + + rm "${dir}"/stopping; touch "${dir}"/stopped + ;; + + # All or clean up everything! + # Called after `scenario.sh` without exit trap (`scenario_setup_exit_trap`)! + stop-cluster-local ) + local usage="USAGE: wb backend $op RUN-DIR" + local dir=${1:?$usage}; shift local nomad_agents_were_already_running=$(envjqr 'nomad_agents_were_already_running') if test "$nomad_agents_were_already_running" = "false" @@ -884,6 +1001,7 @@ backend_nomad() { local nomad_server_name=$(envjqr 'nomad_server_name') local nomad_client_name=$(envjqr 'nomad_client_name') local nomad_task_driver=$(envjqr 'nomad_task_driver') + # No checks, `agents stop` checks for errors and uses msg to show them! wb_nomad agents stop \ "${nomad_server_name}" "${nomad_client_name}" "${nomad_task_driver}" fi @@ -893,8 +1011,6 @@ backend_nomad() { local oci_image_was_already_available=$(envjqr 'oci_image_was_already_available') #TODO: Remove it? - - rm "${dir}"/stopping; touch "${dir}"/stopped ;; # Called by `scenario.sh` without exit trap (`scenario_setup_exit_trap`)! @@ -910,17 +1026,9 @@ backend_nomad() { # Download healthcheck(s) logs. ########################################## ########################################################################## - # Remove "live" symlinks before downloading the "originals" - if test "${nomad_environment}" != "cloud" - then - for node in $(jq_tolist 'keys' "${dir}"/node-specs.json) - do - rm -f "${dir}"/healthcheck/"${node}"/{stdout,stderr,exit_code} - rm -f "${dir}"/supervisor/"${node}"/supervisord.log - done - fi # Download retry "infinite" loop. local healthchecks_array + # Fetch the nodes that don't have all the log files in its directory healthchecks_array="$(jq_tolist 'keys' "$dir"/node-specs.json)" while test -n "${healthchecks_array:-}" do @@ -932,9 +1040,9 @@ backend_nomad() { done if test -n "${Healthchecks_jobs_array:-}" # If = () "unbound variable" error then - # Wait until all jobs finish, don't use `wait_fail_any` that kills - # Returns the exit code of the last job, ignore it! - wait "${Healthchecks_jobs_array[@]}" || true + # Wait until all jobs finish, don't use `wait_kill_em_all` that kills + # Returns the exit code of the last failed job, we ignore it! + wait_all "${Healthchecks_jobs_array[@]}" || true fi # Fetch the nodes that don't have all the log files in its directory healthchecks_array="$(backend_nomad fetch-logs-healthchecks "${dir}")" @@ -947,12 +1055,6 @@ backend_nomad() { msg "$(green "Finished downloading Healthcheck(s) logs")" # Download generator logs. 
############################################### ########################################################################## - # Remove "live" symlinks before downloading the "originals" - if test "${nomad_environment}" != "cloud" - then - rm -f "${dir}"/generator/{stdout,stderr,exit_code} - rm -f "${dir}"/supervisor/"${generator_task}"/supervisord.log - fi # Download retry "infinite" loop. while ! backend_nomad download-logs-generator "${dir}" "${generator_task}" do @@ -962,16 +1064,6 @@ backend_nomad() { msg "$(green "Finished downloading \"generator\" logs")" # Download node(s) logs. ################################################# ########################################################################## - # Remove "live" symlinks before downloading the "originals" - if test "${nomad_environment}" != "cloud" - then - for node in $(jq_tolist 'keys' "$dir"/node-specs.json) - do - rm -f "${dir}"/"${node}"/{stdout,stderr,exit_code} - rm -f "${dir}"/nomad/"${node}"/{stdout,stderr} - rm -f "${dir}"/supervisor/"${node}"/supervisord.log - done - fi # Download retry "infinite" loop. local nodes_array nodes_array="$(jq_tolist 'keys' "$dir"/node-specs.json)" @@ -985,9 +1077,9 @@ backend_nomad() { done if test -n "${nodes_jobs_array:-}" # If = () "unbound variable" error then - # Wait until all jobs finish, don't use `wait_fail_any` that kills - # Returns the exit code of the last job, ignore it! - wait "${nodes_jobs_array[@]}" || true + # Wait until all jobs finish, don't use `wait_kill_em_all` that kills + # Returns the exit code of the last failed job, we ignore it! + wait_all "${nodes_jobs_array[@]}" || true fi # Fetch the nodes that don't have all the log files in its directory nodes_array="$(backend_nomad fetch-logs-nodes "${dir}")" @@ -1002,28 +1094,11 @@ backend_nomad() { ########################################################################## if jqtest ".node.tracer" "${dir}"/profile.json then - # Remove "live" symlinks before downloading the "originals" - if test "${nomad_environment}" != "cloud" - then - if test "${one_tracer_per_node}" = "true" - then - for node in $(jq_tolist 'keys' "${dir}"/node-specs.json) - do - rm -f "${dir}"/tracer/"${node}"/{stdout,stderr,exit_code} - done - else - # When "local" and "podman" "tracer" folder is mounted - if ! test "${nomad_task_driver}" = "podman" - then - rm -f "${dir}"/tracer/{stdout,stderr,exit_code} - fi - rm -f "${dir}"/supervisor/tracer/supervisord.log - fi - fi # Download retry "infinite" loop. if test "${one_tracer_per_node}" = "true" then local tracers_array + # Fetch the nodes that don't have all the log files in its directory tracers_array="$(jq_tolist 'keys' "$dir"/node-specs.json)" while test -n "${tracers_array:-}" do @@ -1035,9 +1110,9 @@ backend_nomad() { done if test -n "${tracers_jobs_array:-}" # If = () "unbound variable" error then - # Wait until all jobs finish, don't use `wait_fail_any` that kills - # Returns the exit code of the last job, ignore it! - wait "${tracers_jobs_array[@]}" || true + # Wait until all jobs finish, don't use `wait_kill_em_all` that kills + # Returns the exit code of the last failed job, we ignore it! + wait_all "${tracers_jobs_array[@]}" || true fi # Fetch the nodes that don't have all the log files in its directory tracers_array="$(backend_nomad fetch-logs-tracers "${dir}")" @@ -1076,6 +1151,34 @@ backend_nomad() { fi fi fi + # Download entrypoint(s) logs. 
########################################### + ########################################################################## + # Download retry "infinite" loop. + local entrypoints_array + entrypoints_array="$(jq_tolist 'keys' "$dir"/node-specs.json)" + while test -n "${entrypoints_array:-}" + do + local entrypoints_jobs_array=() + for node in ${entrypoints_array[*]} + do + backend_nomad download-logs-entrypoint "${dir}" "${node}" & + entrypoints_jobs_array+=("$!") + done + if test -n "${entrypoints_jobs_array:-}" # If = () "unbound variable" error + then + # Wait until all jobs finish, don't use `wait_kill_em_all` that kills + # Returns the exit code of the last failed job, we ignore it! + wait_all "${entrypoints_jobs_array[@]}" || true + fi + # Fetch the nodes that don't have all the log files in its directory + entrypoints_array="$(backend_nomad fetch-logs-entrypoints "${dir}")" + if test -n "${entrypoints_array:-}" + then + msg "Retrying entrypoint(s) [${entrypoints_array[@]}] logs download" + read -p "Hit enter to continue ..." + fi + done + msg "$(green "Finished downloading entrypoint(s) logs")" # TODO: Check downloads # ls run/current/nomad/{node-{0..51},explorer}/{stdout,stderr} || msg "" @@ -1087,8 +1190,8 @@ backend_nomad() { msg "$(green "Finished fetching logs")" ;; - # Array of nodes that don't have all the log files in its directory - fetch-logs-nodes ) + # Array of entrypoints that don't have all the required log files in its directory + fetch-logs-entrypoints ) local usage="USAGE: wb backend $op RUN-DIR" local dir=${1:?$usage}; shift local nodes_array=() @@ -1096,18 +1199,6 @@ backend_nomad() { do local node_ok="true" # Check the existance of all the wanted files: - if ! test -f "${dir}"/"${node}"/exit_code - then - node_ok="false" - fi - if ! test -f "${dir}"/"${node}"/stdout - then - node_ok="false" - fi - if ! test -f "${dir}"/"${node}"/stderr - then - node_ok="false" - fi if ! test -f "${dir}"/nomad/"${node}"/stdout then node_ok="false" @@ -1140,92 +1231,132 @@ backend_nomad() { ;; # Only to be called with one_tracer_per_node = true - # Array of tracers' nodes that don't have all the log files in its directory + # Array of tracers' nodes that don't have all the required log files in its directory fetch-logs-tracers ) local usage="USAGE: wb backend $op RUN-DIR" local dir=${1:?$usage}; shift local tracers_array=() for node in $(jq_tolist 'keys' "${dir}"/node-specs.json) do - local tracer_ok="true" - # Check the existance of all the wanted files: - if ! test -f "${dir}"/tracer/"${node}"/exit_code - then - tracer_ok="false" - fi - if ! test -f "${dir}"/tracer/"${node}"/stdout - then - tracer_ok="false" - fi - if ! test -f "${dir}"/tracer/"${node}"/stderr - then - tracer_ok="false" - fi - # Below like errors can end in truncated files, a proper flag is used! - # failed to exec into task: read tcp 10.0.0.115:33840->3.72.231.105:443: read: connection reset by peer - # tar: Unexpected EOF in archive - # tar: Unexpected EOF in archive - # tar: Error is not recoverable: exiting now - if test -f "${dir}"/nomad/"${node}"/download_failed + # Only if the tracer was started. + if test -f "${dir}"/tracer/"${node}"/started then - tracer_ok="false" + local tracer_ok="true" + # Check the existance of all the wanted files: + if ! test -f "${dir}"/tracer/"${node}"/exit_code + then + tracer_ok="false" + fi + if ! test -f "${dir}"/tracer/"${node}"/stdout + then + tracer_ok="false" + fi + if ! 
test -f "${dir}"/tracer/"${node}"/stderr + then + tracer_ok="false" + fi + # Below like errors can end in truncated files, a proper flag is used! + # failed to exec into task: read tcp 10.0.0.115:33840->3.72.231.105:443: read: connection reset by peer + # tar: Unexpected EOF in archive + # tar: Unexpected EOF in archive + # tar: Error is not recoverable: exiting now + if test -f "${dir}"/nomad/"${node}"/download_failed + then + tracer_ok="false" + fi + # If any error add this node to the array + if test "${tracer_ok}" = "false" + then + tracers_array+=("${node}") + fi fi - # If any error add this node to the array - if test "${tracer_ok}" = "false" + done + # Return array + echo "${tracers_array[@]}" + ;; + + # Array of nodes that don't have all the required log files in its directory + fetch-logs-nodes ) + local usage="USAGE: wb backend $op RUN-DIR" + local dir=${1:?$usage}; shift + local nodes_array=() + for node in $(jq_tolist 'keys' "${dir}"/node-specs.json) + do + # Only if the node was started. + if test -f "${dir}"/"${node}"/started then - tracers_array+=("${node}") + local node_ok="true" + # Check the existance of all the wanted files: + if ! test -f "${dir}"/"${node}"/exit_code + then + node_ok="false" + fi + if ! test -f "${dir}"/"${node}"/stdout + then + node_ok="false" + fi + if ! test -f "${dir}"/"${node}"/stderr + then + node_ok="false" + fi + # Below like errors can end in truncated files, a proper flag is used! + # failed to exec into task: read tcp 10.0.0.115:33840->3.72.231.105:443: read: connection reset by peer + # tar: Unexpected EOF in archive + # tar: Unexpected EOF in archive + # tar: Error is not recoverable: exiting now + if test -f "${dir}"/nomad/"${node}"/download_failed + then + node_ok="false" + fi + # If any error add this node to the array + if test "${node_ok}" = "false" + then + nodes_array+=("${node}") + fi fi done # Return array - echo "${tracers_array[@]}" + echo "${nodes_array[@]}" ;; - # Array of nodes that don't have all the log files in its directory + # Array of nodes that don't have all the required log files in its directory fetch-logs-healthchecks ) local usage="USAGE: wb backend $op RUN-DIR" local dir=${1:?$usage}; shift local healthchecks_array=() for node in $(jq_tolist 'keys' "${dir}"/node-specs.json) do - local healthcheck_ok="true" - # Check the existance of all the wanted files: - if ! test -f "${dir}"/healthcheck/"${node}"/exit_code - then - healthcheck_ok="false" - fi - if ! test -f "${dir}"/healthcheck/"${node}"/stdout - then - healthcheck_ok="false" - fi - if ! test -f "${dir}"/healthcheck/"${node}"/stderr - then - healthcheck_ok="false" - fi - if ! test -f "${dir}"/nomad/"${node}"/stdout - then - healthcheck_ok="false" - fi - if ! test -f "${dir}"/nomad/"${node}"/stderr - then - healthcheck_ok="false" - fi - if ! test -f "${dir}"/supervisor/"${node}"/supervisord.log + # Only if the healthcheck was started. + if test -f "${dir}"/healthcheck/"${node}"/started then - healthcheck_ok="false" - fi - # Below like errors can end in truncated files, a proper flag is used! 
- # failed to exec into task: read tcp 10.0.0.115:33840->3.72.231.105:443: read: connection reset by peer - # tar: Unexpected EOF in archive - # tar: Unexpected EOF in archive - # tar: Error is not recoverable: exiting now - if test -f "${dir}"/healthcheck/"${node}"/download_failed - then - healthcheck_ok="false" - fi - # If any error add this healthcheck to the array - if test "${healthcheck_ok}" = "false" - then - healthchecks_array+=("${node}") + local healthcheck_ok="true" + # Check the existance of all the wanted files: + if ! test -f "${dir}"/healthcheck/"${node}"/exit_code + then + healthcheck_ok="false" + fi + if ! test -f "${dir}"/healthcheck/"${node}"/stdout + then + healthcheck_ok="false" + fi + if ! test -f "${dir}"/healthcheck/"${node}"/stderr + then + healthcheck_ok="false" + fi + # Below like errors can end in truncated files, a proper flag is used! + # failed to exec into task: read tcp 10.0.0.115:33840->3.72.231.105:443: read: connection reset by peer + # tar: Unexpected EOF in archive + # tar: Unexpected EOF in archive + # tar: Error is not recoverable: exiting now + if test -f "${dir}"/healthcheck/"${node}"/download_failed + then + healthcheck_ok="false" + fi + # If any error add this healthcheck to the array + if test "${healthcheck_ok}" = "false" + then + healthchecks_array+=("${node}") + fi fi done # Return array @@ -1282,7 +1413,7 @@ backend_nomad() { # Wait and check! if test -n "${jobs_array}" then - if ! wait_fail_any "${jobs_array[@]}" + if ! wait_kill_em_all "${jobs_array[@]}" then # Don't use fatal here, let `start` decide! msg "$(red "Failed to start tracer(s)")" @@ -1329,7 +1460,7 @@ backend_nomad() { # Wait and check! if test -n "${jobs_array}" then - if ! wait_fail_any "${jobs_array[@]}" + if ! wait_kill_em_all "${jobs_array[@]}" then fatal "Failed to start node(s)" else @@ -1375,7 +1506,7 @@ backend_nomad() { # Wait and check! if test -n "${jobs_array}" then - if ! wait_fail_any "${jobs_array[@]}" + if ! wait_kill_em_all "${jobs_array[@]}" then fatal "Failed to start healthcheck(s)" return 1 @@ -1422,7 +1553,8 @@ backend_nomad() { then msg "$(red "FATAL: Program \"tracer\" (inside \"${task}\") startup failed")" # TODO: Let the download fail when everything fails? - backend_nomad download-logs-tracer "${dir}" "${task}" || true + backend_nomad download-logs-entrypoint "${dir}" "${task}" || true + backend_nomad download-logs-tracer "${dir}" "${task}" || true if test "$one_tracer_per_node" = "true" || test "${task}" != "tracer" then # Should show the output/log of `supervisord` (runs as "entrypoint"). @@ -1550,7 +1682,8 @@ backend_nomad() { then msg "$(red "FATAL: Program \"${node}\" (always inside \"${node}\") startup failed")" # TODO: Let the download fail when everything fails? - backend_nomad download-logs-node "${dir}" "${node}" || true + backend_nomad download-logs-entrypoint "${dir}" "${node}" || true + backend_nomad download-logs-node "${dir}" "${node}" || true # Should show the output/log of `supervisord` (runs as "entrypoint"). msg "$(yellow "${dir}/nomad/${node}/stdout:")" cat \ @@ -1604,7 +1737,6 @@ backend_nomad() { touch "${dir}"/"${node}"/started else # Failed to start, mostly timeout before listening socket was found. - backend_nomad stop-cluster "${dir}" fatal "Node \"${node}\" startup did not succeed" fi fi @@ -1625,7 +1757,8 @@ backend_nomad() { then msg "$(red "FATAL: Program \"generator\" (inside \"${generator_task}\") startup failed")" # TODO: Let the download fail when everything fails? 
- backend_nomad download-logs-generator "${dir}" "${generator_task}" || true + backend_nomad download-logs-entrypoint "${dir}" "${generator_task}" || true + backend_nomad download-logs-generator "${dir}" "${generator_task}" || true # Should show the output/log of `supervisord` (runs as "entrypoint"). msg "$(yellow "${dir}/nomad/${generator_task}/stdout:")" cat \ @@ -1686,6 +1819,7 @@ backend_nomad() { then msg "$(red "FATAL: Program \"healthcheck\" inside Nomad Task \"${task}\" startup failed")" # TODO: Let the download fail when everything fails? + backend_nomad download-logs-entrypoint "${dir}" "${task}" || true backend_nomad download-logs-healthcheck "${dir}" "${task}" || true # Should show the output/log of `supervisord` (runs as "entrypoint"). msg "$(yellow "${dir}/nomad/${task}/stdout:")" @@ -1774,28 +1908,21 @@ backend_nomad() { fi done else + local socket_name if test "${one_tracer_per_node}" = "true" || test "${task}" != "tracer" then - local socket_path_relative=$(jq -r '.network.contents' "${dir}/tracer/${task}/config.json") - local socket_path_absolute=/"${task}"/local/run/current/tracer/"${socket_path_relative}" + socket_name=$(jq -r '.network.contents' "${dir}/tracer/${task}/config.json") else - local socket_path_relative=$(jq -r '.network.contents' "${dir}/tracer/config.json") - local socket_path_absolute=/tracer/local/run/current/tracer/"${socket_path_relative}" + socket_name=$(jq -r '.network.contents' "${dir}/tracer/config.json") fi # Wait for tracer socket - #local socket_path_absolute="$dir/tracer/$node/$socket_path_relative" msg "$(blue Waiting) ${patience}s for socket of supervisord $(yellow "program \"tracer\"") inside Nomad $(yellow "Task \"${task}\"") ..." local i=0 - # while test ! -S "$socket_path_absolute" - local task_alloc_id - task_alloc_id=$(wb_nomad job task-name-allocation-id \ - "${dir}/nomad/nomad-job.json" \ - "${task}") # Always keep checking that the supervisord program is still running! while \ - backend_nomad is-task-program-running "${dir}" "${task}" tracer \ - && \ - ! nomad alloc fs -stat -H "${task_alloc_id}" "${socket_path_absolute}" | grep --quiet "application/octet-stream" + backend_nomad is-task-program-running "${dir}" "${task}" tracer \ + && \ + ! backend_nomad task-file-stat "${dir}" "${task}" run/current/tracer/"${socket_name}" | grep --quiet "application/octet-stream" do printf "%3d" $i; sleep 1 i=$((i+1)) if test "${i}" -ge "${patience}" @@ -1825,7 +1952,7 @@ backend_nomad() { if ! backend_nomad task-program-stop "${dir}" "${node}" "${node}" then - msg "$(yellow "WARNING: Program \"healthcheck\" inside Task \"${task}\" failed to stop")" + msg "$(yellow "WARNING: Program \"${node}\" inside Task \"${task}\" failed to stop")" else touch "${dir}"/"${node}"/stopped fi @@ -1837,20 +1964,14 @@ backend_nomad() { local dir=${1:?$usage}; shift local node=${1:-$(dirname $CARDANO_NODE_SOCKET_PATH | xargs basename)}; shift - local socket=$(backend_nomad get-node-socket-path "${dir}" ${node}) - local socket_path_absolute=/"${node}"/local/run/current/"${node}"/node.socket local patience=$(jq '.analysis.cluster_startup_overhead_s | ceil' ${dir}/profile.json) msg "$(blue Waiting) ${patience}s for socket of supervisord $(yellow "program \"${node}\"") inside Nomad $(yellow "Task \"${node}\"") ..." local i=0 - local node_alloc_id - node_alloc_id=$(wb_nomad job task-name-allocation-id \ - "$dir/nomad/nomad-job.json" \ - "${node}") # Always keep checking that the supervisord program is still running! 
      while \
-        backend_nomad is-task-program-running "${dir}" "${node}" "${node}" \
-        && \
-        ! nomad alloc fs -stat -H "${node_alloc_id}" "${socket_path_absolute}" 2>/dev/null | grep --quiet "application/octet-stream"
+        backend_nomad is-task-program-running "${dir}" "${node}" "${node}" \
+        && \
+        ! backend_nomad task-file-stat "${dir}" "${node}" run/current/"${node}"/node.socket 2>/dev/null | grep --quiet "application/octet-stream"
      # TODO: Add the "timer" `printf "%3d" $i;` but for concurrent processes!
      do
        sleep 1
@@ -1908,7 +2029,9 @@ backend_nomad() {
   ;;

   wait-pools-stopped )
-    local usage="USAGE: wb backend $op RUN-DIR"
+    local usage="USAGE: wb backend $op SLEEP-SECONDS RUN-DIR"
+    # This parameter is added by the Nomad backend being used.
+    local sleep_seconds=${1:?$usage}; shift
     local dir=${1:?$usage}; shift
     local generator_task=$(envjqr 'generator_task_name')
@@ -1922,6 +2045,23 @@
        && \
        backend_nomad is-task-program-running "${dir}" "node-${pool_ix}" "node-${pool_ix}" 5 > /dev/null
      do
+       # Always check that, if present, the explorer node is doing OK!
+       if \
+            test "${generator_task}" = "explorer" \
+         && \
+            ! test -f "${dir}"/healthcheck/explorer/quit \
+         && \
+            ! backend_nomad is-task-program-running "${dir}" "explorer" healthcheck 5
+       then
+         if backend_nomad is-task-program-failed "${dir}" "explorer" healthcheck 5
+         then
+           touch "${dir}"/healthcheck/explorer/quit
+           # Show the warning and continue with the counter
+           echo -ne "\n"
+           msg "$(yellow "WARNING: supervisord program \"healthcheck\" inside Nomad Task \"explorer\" quit with an error exit code")"
+           msg_ne "nomad: $(blue Waiting) until all pool nodes are stopped: 000000"
+         fi
+       fi
        # Always check that a started generator has not FAILED!
        if \
            test -f "${dir}"/generator/started \
@@ -1963,7 +2103,10 @@
        local elapsed="$(($(date +%s) - start_time))"
        echo -ne "\b\b\b\b\b\b"
        printf "%6d" "${elapsed}"
-       sleep 1
+       # This time is different between local and cloud backends to avoid
+       # unnecessary Nomad-specific traffic and, at the same time, be less
+       # sensitive to network failures.
+       sleep "${sleep_seconds}"
      done # While
      if ! test -f "${dir}"/flag/cluster-stopping
      then
@@ -2037,20 +2180,12 @@
     local dir=${1:?$usage}; shift
     local task=${1:?$usage}; shift
     local download_ok="true"
-    # Should show the output/log of `supervisord` (runs as "entrypoint").
-    msg "$(blue Fetching) $(yellow "entrypoint's stdout and stderr") of Nomad $(yellow "Task \"${task}\"") ..."
-    backend_nomad task-entrypoint-stdout "${dir}" "${task}" \
-      > "${dir}"/nomad/"${task}"/stdout \
-      || download_ok="false"
-    backend_nomad task-entrypoint-stderr "${dir}" "${task}" \
-      > "${dir}"/nomad/"${task}"/stderr \
-      || download_ok="false"
-    # If the entrypoint was ran till the end, this file should be available!
-    msg "$(blue Fetching) $(yellow supervisord.log) of Nomad $(yellow "Task \"${task}\"") ..."
-    backend_nomad task-file-contents "${dir}" "${task}" \
-      /local/run/current/supervisor/supervisord.log \
-      > "${dir}"/supervisor/"${task}"/supervisord.log \
-      || download_ok="false"
+    # Remove "live" symlinks before downloading the "originals"
+    local nomad_environment=$(envjqr 'nomad_environment')
+    if test "${nomad_environment}" != "cloud"
+    then
+      rm -f "${dir}"/healthcheck/"${task}"/{stdout,stderr,exit_code}
+    fi
     # Downloads "exit_code", "stdout", "stderr" and GHC files.
     # Depending on when the start command failed, logs may not be available!
backend_nomad download-zstd-healthcheck "${dir}" "${task}" \ @@ -2081,20 +2216,12 @@ backend_nomad() { local dir=${1:?$usage}; shift local task=${1:?$usage}; shift local download_ok="true" - # Should show the output/log of `supervisord` (runs as "entrypoint"). - msg "$(blue Fetching) $(yellow "entrypoint's stdout and stderr") of Nomad $(yellow "Task \"${task}\"") ..." - backend_nomad task-entrypoint-stdout "${dir}" "${task}" \ - > "${dir}"/nomad/"${task}"/stdout \ - || download_ok="false" - backend_nomad task-entrypoint-stderr "${dir}" "${task}" \ - > "${dir}"/nomad/"${task}"/stderr \ - || download_ok="false" - # If the entrypoint was ran till the end, this file should be available! - msg "$(blue Fetching) $(yellow supervisord.log) of Nomad $(yellow "Task \"${task}\"") ..." - backend_nomad task-file-contents "${dir}" "${task}" \ - /local/run/current/supervisor/supervisord.log \ - > "${dir}"/supervisor/"${task}"/supervisord.log \ - || download_ok="false" + # Remove "live" symlinks before downloading the "originals" + local nomad_environment=$(envjqr 'nomad_environment') + if test "${nomad_environment}" != "cloud" + then + rm -f "${dir}"/generator/{stdout,stderr,exit_code} + fi # Downloads "exit_code", "stdout", "stderr" and GHC files. # Depending on when the start command failed, logs may not be available! backend_nomad download-zstd-generator "${dir}" "${task}" \ @@ -2115,20 +2242,12 @@ backend_nomad() { local dir=${1:?$usage}; shift local node=${1:?$usage}; shift local download_ok="true" - # Should show the output/log of `supervisord` (runs as "entrypoint"). - msg "$(blue Fetching) $(yellow "entrypoint's stdout and stderr") of Nomad $(yellow "Task \"${node}\"") ..." - backend_nomad task-entrypoint-stdout "${dir}" "${node}" \ - > "${dir}"/nomad/"${node}"/stdout \ - || download_ok="false" - backend_nomad task-entrypoint-stderr "${dir}" "${node}" \ - > "${dir}"/nomad/"${node}"/stderr \ - || download_ok="false" - # If the entrypoint was ran till the end, this file should be available! - msg "$(blue Fetching) $(yellow supervisord.log) of Nomad $(yellow "Task \"${node}\"") ..." - backend_nomad task-file-contents "${dir}" "${node}" \ - /local/run/current/supervisor/supervisord.log \ - > "${dir}"/supervisor/"${node}"/supervisord.log \ - || download_ok="false" + # Remove "live" symlinks before downloading the "originals" + local nomad_environment=$(envjqr 'nomad_environment') + if test "${nomad_environment}" != "cloud" + then + rm -f "${dir}"/"${node}"/{stdout,stderr,exit_code} + fi # Downloads "exit_code", "stdout", "stderr" and GHC files. # Depending on when the start command failed, logs may not be available! backend_nomad download-zstd-node "${dir}" "${node}" \ @@ -2162,6 +2281,24 @@ backend_nomad() { # supervisord program and no entrypoints' logs are downloaded here # because they should be downloaded by the main supervisord program. local one_tracer_per_node=$(envjqr 'one_tracer_per_node') + # Remove "live" symlinks before downloading the "originals" + local nomad_environment=$(envjqr 'nomad_environment') + if test "${nomad_environment}" != "cloud" + then + if test "${one_tracer_per_node}" = "true" + then + for node in $(jq_tolist 'keys' "${dir}"/node-specs.json) + do + rm -f "${dir}"/tracer/"${node}"/{stdout,stderr,exit_code} + done + else + # When "local" and "podman" "tracer" folder is mounted + if ! 
test "${nomad_task_driver}" = "podman" + then + rm -f "${dir}"/tracer/{stdout,stderr,exit_code} + fi + fi + fi if test "${one_tracer_per_node}" = "true" || test "${task}" != "tracer" then # Downloads "exit_code", "stdout", "stderr" and GHC files. @@ -2185,20 +2322,6 @@ backend_nomad() { fi else local download_ok="true" - # Should show the output/log of `supervisord` (runs as "entrypoint"). - msg "$(blue Fetching) $(yellow "entrypoint's stdout and stderr") of Nomad $(yellow "Task \"tracer\"") ..." - backend_nomad task-entrypoint-stdout "${dir}" "tracer" \ - > "${dir}"/nomad/tracer/stdout \ - || download_ok="false" - backend_nomad task-entrypoint-stderr "${dir}" "tracer" \ - > "${dir}"/nomad/tracer/stderr \ - || download_ok="false" - # If the entrypoint was ran till the end, this file should be available! - msg "$(blue Fetching) $(yellow supervisord.log) of Nomad $(yellow "Task \"tracer\"") ..." - backend_nomad task-file-contents "${dir}" "tracer" \ - /local/run/current/supervisor/supervisord.log \ - > "${dir}"/supervisor/tracer/supervisord.log \ - || download_ok="false" # When "local" and "podman" "tracer" folder is mounted local nomad_task_driver=$(envjqr 'nomad_task_driver') if ! test "${nomad_task_driver}" = "podman" @@ -2228,6 +2351,53 @@ backend_nomad() { fi ;; + # For debugging when something fails, downloads and prints details! + download-logs-entrypoint ) + local usage="USAGE: wb backend pass $op RUN-DIR NODE-NAME" + local dir=${1:?$usage}; shift + local node=${1:?$usage}; shift + local download_ok="true" + # Remove "live" symlinks before downloading the "originals" + local nomad_environment=$(envjqr 'nomad_environment') + if test "${nomad_environment}" != "cloud" + then + rm -f "${dir}"/nomad/"${node}"/{stdout,stderr} + rm -f "${dir}"/supervisor/"${node}"/supervisord.log + fi + # Should show the output/log of `supervisord` (runs as "entrypoint"). + msg "$(blue Fetching) $(yellow "entrypoint's stdout and stderr") of Nomad $(yellow "Task \"${node}\"") ..." + backend_nomad task-entrypoint-stdout "${dir}" "${node}" \ + > "${dir}"/nomad/"${node}"/stdout \ + || download_ok="false" + backend_nomad task-entrypoint-stderr "${dir}" "${node}" \ + > "${dir}"/nomad/"${node}"/stderr \ + || download_ok="false" + # If the entrypoint was ran till the end, this file should be available! + msg "$(blue Fetching) $(yellow supervisord.log) of Nomad $(yellow "Task \"${node}\"") ..." + backend_nomad task-file-contents "${dir}" "${node}" \ + run/current/supervisor/supervisord.log \ + > "${dir}"/supervisor/"${node}"/supervisord.log \ + || download_ok="false" + # Return + if test "${download_ok}" = "false" + then + msg "$(red "Failed to download \"${node}\" entrypoint files from \"${node}\"")" + # Below like errors can end in truncated files, a proper flag is needed! 
+ # failed to exec into task: read tcp 10.0.0.115:33840->3.72.231.105:443: read: connection reset by peer + # tar: Unexpected EOF in archive + # tar: Unexpected EOF in archive + # tar: Error is not recoverable: exiting now + touch "${dir}"/nomad/"${node}"/download_failed + return 1 + else + if test -f "${dir}"/nomad/"${node}"/download_failed + then + rm "${dir}"/nomad/"${node}"/download_failed + fi + return 0 + fi + ;; + download-zstd-healthcheck ) local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME" local dir=${1:?$usage}; shift @@ -2342,6 +2512,40 @@ backend_nomad() { fi ;; + download-config-entrypoint ) + local usage="USAGE: wb backend pass $op RUN-DIR NODE-NAME" + local dir=${1:?$usage}; shift + local task=${1:?$usage}; shift + # Dynamically modified file, store to be able to debug! + backend_nomad task-file-contents "${dir}" "${task}" \ + entrypoint.sh \ + > "${dir}"/nomad/"${task}"/entrypoint.sh + # Dynamically generated file with the envars of the entrypoint! + backend_nomad task-file-contents "${dir}" "${task}" \ + entrypoint.env \ + > "${dir}"/nomad/"${task}"/entrypoint.env + # Dynamically generated file with system info! + backend_nomad task-file-contents "${dir}" "${task}" \ + entrypoint.uname \ + > "${dir}"/nomad/"${task}"/entrypoint.uname + # Dynamically generated file with cpu info! + backend_nomad task-file-contents "${dir}" "${task}" \ + entrypoint.cpuinfo \ + > "${dir}"/nomad/"${task}"/entrypoint.cpuinfo + # Dynamically generated file with directories info! + backend_nomad task-file-contents "${dir}" "${task}" \ + entrypoint.dirs \ + > "${dir}"/nomad/"${task}"/entrypoint.dirs + # This Task's supervisor files + backend_nomad task-file-contents "${dir}" "${task}" \ + run/current/supervisor/supervisord.conf \ + > "${dir}"/supervisor/"${task}"/supervisord.conf + # Dynamically generated file with all the services/addresses found! + backend_nomad task-file-contents "${dir}" "${task}" \ + networking.json \ + > "${dir}"/nomad/"${task}"/networking.json + ;; + download-config-generator ) local usage="USAGE: wb backend pass $op RUN-DIR" local dir=${1:?$usage}; shift @@ -2349,10 +2553,10 @@ backend_nomad() { # Generator runs inside task/supervisord "${generator_task}" # Node files that may suffer interpolation/sed replace. backend_nomad task-file-contents "${dir}" "${generator_task}" \ - /local/run/current/generator/start.sh \ + run/current/generator/start.sh \ > "${dir}"/generator/start.sh backend_nomad task-file-contents "${dir}" "${generator_task}" \ - /local/run/current/generator/run-script.json \ + run/current/generator/run-script.json \ > "${dir}"/generator/run-script.json ;; @@ -2362,30 +2566,14 @@ backend_nomad() { local node=${1:?$usage}; shift # Node files that may suffer interpolation/sed replace. backend_nomad task-file-contents "${dir}" "${node}" \ - /local/run/current/"${node}"/start.sh \ + run/current/"${node}"/start.sh \ > "${dir}"/"${node}"/start.sh backend_nomad task-file-contents "${dir}" "${node}" \ - /local/run/current/"${node}"/config.json \ + run/current/"${node}"/config.json \ > "${dir}"/"${node}"/config.json backend_nomad task-file-contents "${dir}" "${node}" \ - /local/run/current/"${node}"/topology.json \ + run/current/"${node}"/topology.json \ > "${dir}"/"${node}"/topology.json - # This Task's supervisor files - backend_nomad task-file-contents "${dir}" "${node}" \ - /local/run/current/supervisor/supervisord.conf \ - > "${dir}"/supervisor/"${node}"/supervisord.conf - # Dynamically modified file, store to be able to debug! 
- backend_nomad task-file-contents "${dir}" "${node}" \ - /local/entrypoint.sh \ - > "${dir}"/nomad/"${node}"/entrypoint.sh - # Dynamically generated file with the envars of the entrypoint! - backend_nomad task-file-contents "${dir}" "${node}" \ - /local/entrypoint.env \ - > "${dir}"/nomad/"${node}"/entrypoint.env - # Dynamically generated file with all the services/addresses found! - backend_nomad task-file-contents "${dir}" "${node}" \ - /local/networking.json \ - > "${dir}"/nomad/"${node}"/networking.json ;; download-config-tracer ) @@ -2399,41 +2587,25 @@ backend_nomad() { then # Node files that may suffer interpolation/sed replace. backend_nomad task-file-contents "${dir}" "${task}" \ - /local/run/current/tracer/start.sh \ + run/current/tracer/start.sh \ > "${dir}"/tracer/"${task}"/start.sh backend_nomad task-file-contents "${dir}" "${task}" \ - /local/run/current/tracer/config.json \ + run/current/tracer/config.json \ > "${dir}"/tracer/"${task}"/config.json else - local nomad_task_driver=$(envjqr 'nomad_task_driver') # When "local" and "podman" "tracer" folder is mounted and contents # created locally by the workbench (obtained from the profile services). + local nomad_task_driver=$(envjqr 'nomad_task_driver') if ! test "${nomad_task_driver}" = "podman" then # Node files that may suffer interpolation/sed replace. backend_nomad task-file-contents "${dir}" "tracer" \ - /local/run/current/tracer/start.sh \ + run/current/tracer/start.sh \ > "${dir}"/tracer/start.sh backend_nomad task-file-contents "${dir}" "tracer" \ - /local/run/current/tracer/config.json \ + run/current/tracer/config.json \ > "${dir}"/tracer/config.json fi - # This Task's supervisor files - backend_nomad task-file-contents "${dir}" "tracer" \ - /local/run/current/supervisor/supervisord.conf \ - > "${dir}"/supervisor/tracer/supervisord.conf - # Dynamically modified file, store to be able to debug! - backend_nomad task-file-contents "${dir}" "tracer" \ - /local/entrypoint.sh \ - > "${dir}"/nomad/tracer/entrypoint.sh - # Dynamically generated file with the envars of the entrypoint! - backend_nomad task-file-contents "${dir}" "tracer" \ - /local/entrypoint.env \ - > "${dir}"/nomad/tracer/entrypoint.env - # Dynamically generated file with all the services/addresses found! - backend_nomad task-file-contents "${dir}" "tracer" \ - /local/networking.json \ - > "${dir}"/nomad/tracer/networking.json fi fi ;; @@ -2443,12 +2615,12 @@ backend_nomad() { local dir=${1:?$usage}; shift local node=${1:?$usage}; shift backend_nomad task-file-contents "${dir}" "${node}" \ - /local/run/current/healthcheck/start.sh \ + run/current/healthcheck/start.sh \ > "${dir}"/healthcheck/"${node}"/start.sh ;; - ## Nomad job tasks supervisord queries - ###################################### + ## Nomad Job's Tasks supervisord queries + ######################################## task-program-start ) local usage="USAGE: wb backend pass $op RUN-DIR SUPERVISOR-PROGRAM" @@ -2548,6 +2720,7 @@ backend_nomad() { fi ;; + # First `is-task-program-running` and later this function for the status. # Don't use fatal with no strikes, the exit trap uses it to stop everything! is-task-program-failed ) local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME SUPERVISOR-PROGRAM" @@ -2562,7 +2735,7 @@ backend_nomad() { :> "${stderr_file}" local exit_code if ! 
exit_code=$(backend_nomad task-file-contents "${dir}" "${task}" \ - /local/run/current/"${program}"/exit_code 2> "${stderr_file}") + run/current/"${program}"/exit_code 2> "${stderr_file}") then # Command returned "false" if test -n "${strikes}" @@ -2611,25 +2784,25 @@ backend_nomad() { # The `supervisord` binary is nix-installed inside the container but not # added to $PATH (resides in /nix/store) - local container_supervisor_nix=$(jq -r '.containerPkgs.supervisor."nix-store-path"' "$dir"/container-specs.json) + local container_supervisor_nix="$(jq -r .supervisor.nix ${dir}/nomad/${task}/entrypoint.dirs)" # The `--serverurl` argument is needed in every call to `nomad exec`. # Uusually a socket/file decided between the container and the Job file. - local container_supervisord_url=$(jq -r .supervisord.url "$dir"/container-specs.json) + local container_supervisord_url="$(jq -r .supervisor.url ${dir}/nomad/${task}/entrypoint.dirs)" # The container needs to know where the `supervisord` config file is # located so it can be started. - local container_supervisord_conf=$(jq -r .supervisord.conf "$dir"/container-specs.json) + local container_supervisord_conf="$(jq -r .supervisor.config ${dir}/nomad/${task}/entrypoint.dirs)" # Returns "PROGRAM-NAME: ERROR (no such file)" when `supervisord` is not # able to find the command defined in "command=XXX" for PROGRAM-NAME # "[program:PROGRAM-NAME]" - backend_nomad task-exec "$dir" "$task" \ - "$container_supervisor_nix"/bin/supervisorctl \ - --serverurl "$container_supervisord_url" \ - --configuration "$container_supervisord_conf" \ - "$action" $@ + backend_nomad task-exec "${dir}" "${task}" \ + "${container_supervisor_nix}"/bin/supervisorctl \ + --serverurl "${container_supervisord_url}" \ + --configuration "${container_supervisord_conf}" \ + "${action}" $@ ;; - ## Nomad job tasks exec queries - ############################### + ## Nomad Job's Tasks exec queries + ################################# task-exec ) local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME CMD" @@ -2637,9 +2810,10 @@ backend_nomad() { local task=${1:?$usage}; shift local task_alloc_id - task_alloc_id=$(wb_nomad job task-name-allocation-id \ - "${dir}/nomad/nomad-job.json" \ - "${task}") + task_alloc_id="$(wb_nomad job task-name-allocation-id \ + "$dir/nomad/nomad-job.json" \ + "${task}" \ + )" # If you run it without `-i=false -t=false` supervisord starts an # interactive shell (output "supervisor>") and breaks the whole script # expecting you to hit enter on every call! @@ -2649,6 +2823,7 @@ backend_nomad() { "$@" ;; + # Generic function, tries all known runtime log files names. task-exec-program-run-files-tar-zstd ) local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME" local dir=${1:?$usage}; shift @@ -2659,34 +2834,39 @@ backend_nomad() { local find_path="$(jq -r ".containerPkgs.findutils.\"nix-store-path\"" "${dir}"/container-specs.json)"/bin/find local tar_path="$(jq -r ".containerPkgs.gnutar.\"nix-store-path\"" "${dir}"/container-specs.json)"/bin/tar local cat_path="$(jq -r ".containerPkgs.coreutils.\"nix-store-path\"" "${dir}"/container-specs.json)"/bin/cat - local prog_dir=/local/run/current/"${program}"/ + # When executing commands the directories used depend on the filesystem + # isolation mode (AKA chroot or not). 
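# A rough sketch of what `task-workbench-state-dir` resolves to (paths are
# illustrative): with the "exec" driver (chroot isolation) the state dir is
# the in-chroot path
#   /local/run/current
# while with the "raw_exec" driver (no isolation) it is the task directory as
# seen from the host, something like
#   <nomad_data_dir>/alloc/<alloc_id>/<task_name>/local/run/current
# Either way the value is read from ".workbench.state" in the task's
# "entrypoint.dirs" file, so the find/tar pipeline below does not need to
# care about the isolation mode.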
+ local state_dir + state_dir="$(backend_nomad task-workbench-state-dir "${dir}" "${task}")" + local prog_dir="${state_dir}"/"${program}"/ # TODO: Add compression, either "--zstd" or "--xz" # tar (child): zstd: Cannot exec: No such file or directory # tar (child): Error is not recoverable: exiting now # tar (child): xz: Cannot exec: No such file or directory # tar (child): Error is not recoverable: exiting now # Code example of the files needed: https://github.com/input-output-hk/cardano-ops/blob/bench-master/bench/bench.sh#L646-L670 - backend_nomad task-exec "${dir}" "${task}" \ - "${bash_path}" -c \ - " \ - \"${find_path}\" \"${prog_dir}\" \ - -mindepth 1 -maxdepth 1 -type f \ - \( \ - -name "exit_code" \ - -o -name "stdout" \ - -o -name "stderr" \ - -o -name "*.prof" \ - -o -name "*.eventlog" \ - -o -name "*.gcstats" \ - -o -name "*.log" \ - -o -name "start.sh.debug" \ - \) \ - -printf \"%P\\n\" \ - | \ - \"${tar_path}\" --create \ - --directory=\"${prog_dir}\" --files-from=- \ - | \ - \"${cat_path}\" \ + backend_nomad task-exec "${dir}" "${task}" \ + "${bash_path}" -c \ + " \ + \"${find_path}\" \"${prog_dir}\" \ + -mindepth 1 -maxdepth 1 -type f \ + \( \ + -name "exit_code" \ + -o -name "stdout" \ + -o -name "stderr" \ + -o -name "*.prof" \ + -o -name "*.eventlog" \ + -o -name "*.gcstats" \ + -o -name "*.log" \ + -o -name "protocol-parameters-queried.json" \ + -o -name "start.sh.debug" \ + \) \ + -printf \"%P\\n\" \ + | \ + \"${tar_path}\" --create \ + --directory=\"${prog_dir}\" --files-from=- \ + | \ + \"${cat_path}\" \ " ;; @@ -2701,7 +2881,11 @@ backend_nomad() { local cat_path="$(jq -r ".containerPkgs.coreutils.\"nix-store-path\"" "${dir}"/container-specs.json)"/bin/cat # TODO: Fetch the logRoot local log_root="$(jq -r ".containerPkgs.findutils.\"nix-store-path\"" "${dir}"/container-specs.json)"/bin/find - local tracer_dir=/local/run/current/tracer/ + # When executing commands the directories used depend on the filesystem + # isolation mode (AKA chroot or not). + local state_dir + state_dir="$(backend_nomad task-workbench-state-dir "${dir}" "${task}")" + local tracer_dir="${state_dir}"/tracer/ # TODO: Add compression, either "--zstd" or "--xz" # tar (child): zstd: Cannot exec: No such file or directory # tar (child): Error is not recoverable: exiting now @@ -2721,8 +2905,39 @@ backend_nomad() { " ;; - ## Nomad job tasks file queries - ############################### + ## Nomad Job's Tasks file queries + ################################# + + # Always use this functions as entrypoint to everything filesystem related + # because Nomad has an specific folder hierarchy and I want to have an + # abstraction over that. + # Also Nomad has different filesystem isolation modes but right now all + # Nomad `logs` and `fs` commands are using the same directory as its root + # directory, the allocation's working directory, so it's usefull mostly to + # execute commands inside the Job's Tasks (`nomad exec`). + # + # Allocation's/task's working directory: + # https://developer.hashicorp.com/nomad/docs/concepts/filesystem + ### <>/alloc/<> + ### alloc/ (envar `NOMAD_ALLOC_DIR`) + ### data/ + ### logs/ + ### tmp/ + ### <>/ + ### local/ (envar `NOMAD_TASK_DIR` ) + ### private/ + ### secrets/ + ### tmp/ + + # Our "local/run/current" prefix that can be used with `nomad exec` without + # worrying if running isolated or not. 
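# The per-task "entrypoint.dirs" file (fetched by "download-config-entrypoint"
# above) is what backs this abstraction. A minimal sketch of the JSON shape
# assumed by the jq queries in this file (key names come from those queries,
# values are only illustrative):
#
#   {
#     "workbench":  { "state": "/local/run/current" }
#   , "supervisor": {
#       "nix":    "/nix/store/...-supervisor-4.2.4"
#     , "url":    "unix:///tmp/supervisor-node-0.sock"
#     , "config": "/local/run/current/supervisor/supervisord.conf"
#     }
#   }
#
# `task-workbench-state-dir` reads ".workbench.state" and the supervisorctl
# wrapper reads ".supervisor.nix", ".supervisor.url" and ".supervisor.config".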
+ task-workbench-state-dir ) + local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME" + local dir=${1:?$usage}; shift + local task=${1:?$usage}; shift + + jq -r .workbench.state "${dir}"/nomad/"${task}"/entrypoint.dirs + ;; task-entrypoint-stdout ) local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME" @@ -2730,11 +2945,14 @@ backend_nomad() { local task=${1:?$usage}; shift local task_alloc_id - task_alloc_id=$(wb_nomad job task-name-allocation-id \ - "$dir/nomad/nomad-job.json" \ - "${task}") - nomad alloc logs \ - "${task_alloc_id}" "${task}" + task_alloc_id="$(wb_nomad job task-name-allocation-id \ + "$dir/nomad/nomad-job.json" \ + "${task}" \ + )" + # Log commands don't need a folder argument. + nomad alloc logs \ + "${task_alloc_id}" \ + "${task}" ;; task-entrypoint-stderr ) @@ -2743,11 +2961,32 @@ backend_nomad() { local task=${1:?$usage}; shift local task_alloc_id - task_alloc_id=$(wb_nomad job task-name-allocation-id \ - "$dir/nomad/nomad-job.json" \ - "${task}") + task_alloc_id="$(wb_nomad job task-name-allocation-id \ + "$dir/nomad/nomad-job.json" \ + "${task}" \ + )" + # Log commands don't need a folder argument. nomad alloc logs -stderr \ - "${task_alloc_id}" "${task}" + "${task_alloc_id}" \ + "${task}" + ;; + + # Machine friendly output of file stat information, instead of displaying + # the file, or listing the directory. + task-file-stat ) + local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME PATH" + local dir=${1:?$usage}; shift + local task=${1:?$usage}; shift + local path=${1:?$usage}; shift + + local task_alloc_id + task_alloc_id="$(wb_nomad job task-name-allocation-id \ + "$dir/nomad/nomad-job.json" \ + "${task}" \ + )" + nomad alloc fs -stat -H \ + "${task_alloc_id}" \ + /"${task}"/local/"${path}" ;; task-file-contents ) @@ -2757,11 +2996,18 @@ backend_nomad() { local path=${1:?$usage}; shift local task_alloc_id - task_alloc_id=$(wb_nomad job task-name-allocation-id \ - "$dir/nomad/nomad-job.json" \ - "${task}") - nomad alloc fs "${task_alloc_id}" \ - /"${task}""${path}" \ + task_alloc_id="$(wb_nomad job task-name-allocation-id \ + "$dir/nomad/nomad-job.json" \ + "${task}" \ + )" + # Always adds as prefix the `NOMAD_TASK_DIR`, "TASK-NAME/local" inside the + # `NOMAD_ALLOC_DIR` ("DATA-DIR/alloc/XXXXXXXX/alloc"). + # If running the `exec` (isolated) or `raw_exec` (no isolation) the root + # directory of `nomad alloc fs` is always the same, the task's working + # directory. + nomad alloc fs \ + "${task_alloc_id}" \ + /"${task}"/local/"${path}" ;; * ) @@ -3430,6 +3676,12 @@ plugin "exec" { # https://docs.docker.com/engine/reference/run/#runtime-privilege-and-linux-capabilities allow_caps = [ "kill", "mknod", "net_bind_service" ] } +plugin "raw_exec" { + config = { + enabled = true + no_cgroups = true + } +} EOF fi diff --git a/nix/workbench/backend/nomad/cloud.nix b/nix/workbench/backend/nomad/cloud.nix index 83b9529c5f2..9124a02074a 100644 --- a/nix/workbench/backend/nomad/cloud.nix +++ b/nix/workbench/backend/nomad/cloud.nix @@ -25,8 +25,14 @@ let validateNodeSpecs = { nodeSpecsValue }: let - # SRE is using these 3 Nomad "datacenters" (how they are called in Nomad) - datacenters = [ "eu-central-1" "us-east-2" "ap-southeast-2" ]; + # There's a region mismatch between the workbench (specifically Haskell + # code in cardano-topology) and Cardano World's Nomad cluster that both + # use "us-east-2" while our dedicated Nomad cluster is using "us-east-1", + # what SRE deployed. 
+ # - Cardano World cluster: "eu-central-1", "us-east-2" + # - Workbench (Nix level): "eu-central-1", "us-east-2", and "ap-southeast-2" + # - Dedicated P&T cluster: "eu-central-1", "us-east-1", and "ap-southeast-2" + datacenters = [ "eu-central-1" "us-east-1" "us-east-2" "ap-southeast-2" ]; regions = lib.attrsets.mapAttrsToList (name: value: value.region) nodeSpecsValue diff --git a/nix/workbench/backend/nomad/cloud.sh b/nix/workbench/backend/nomad/cloud.sh index bc81cdffdd8..6cb82d970e6 100644 --- a/nix/workbench/backend/nomad/cloud.sh +++ b/nix/workbench/backend/nomad/cloud.sh @@ -13,7 +13,7 @@ backend_nomadcloud() { # Can be: # nomadpodman (Using podman Task Driver in the cloud is not planned) # nomadexec (Starts Nomad Agents supporting the "nix_installable" stanza) - # nomadcloud (IOG Nomad Agents and Amazon S3 with credentials from Vault) + # nomadcloud (SRE managed Nomad Agents on Amazon S3 (dedicated or not)) echo 'nomadcloud' ;; @@ -42,113 +42,127 @@ backend_nomadcloud() { ;; allocate-run ) - allocate-run-nomadcloud "$@" + allocate-run-nomadcloud "$@" # Does a pre allocation before calling the default/common allocation. - backend_nomad allocate-run "$@" + backend_nomad allocate-run "$@" ;; + # Called by `run.sh` without exit trap (unlike `scenario_setup_exit_trap`)! + start-cluster ) + backend_nomad start-cluster "$@" + # If value profile on the dedicated P&T Nomad cluster on AWS extra checks + # to make sure the topology that was deployed is the correct one. + if \ + test "${WB_SHELL_PROFILE:0:15}" = 'value-nomadperf' \ + || \ + test "${WB_SHELL_PROFILE:0:26}" = 'value-oldtracing-nomadperf' + then + # Show a big warning but let the run continue! + check-deployment "${dir}" + fi + ;; + + # Called by `run.sh` without exit trap (unlike `scenario_setup_exit_trap`)! deploy-genesis ) # It "overrides" completely `backend_nomad`'s `deploy-genesis`. - deploy-genesis-nomadcloud "$@" + deploy-genesis-nomadcloud "$@" + ;; + + wait-pools-stopped ) + # It passes the sleep time (in seconds) required argument. + # This time is different between local and cloud backends to avoid + # unnecesary Nomad specific traffic (~99% happens waiting for node-0, the + # first one it waits to stop inside a loop) and at the same time be less + # sensitive to network failures. + backend_nomad wait-pools-stopped 60 "$@" ;; fetch-logs ) - # Only if running on "perf" exclusive nodes we use SSH, if not - # `nomad exec`, because we need to have an exclusive port open for us. - if echo "${WB_SHELL_PROFILE}" | grep --quiet "cw-perf" + # Only if running on the dedicated P&T Nomad cluster on AWS we use SSH, if + # not `nomad exec`, because we need to have a dedicated port open for us. + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" then - fetch-logs-nomadcloud "$@" + # It "overrides" completely `backend_nomad`'s `fetch-logs`. + fetch-logs-nomadcloud "$@" else - backend_nomad fetch-logs "$@" + # Generic backend sub-commands, shared code between Nomad sub-backends. + backend_nomad fetch-logs "$@" + fi + ;; + + # All or clean up everything! + # Called after `scenario.sh` without an exit trap! + stop-cluster ) + # Only when running on dedicated P&T Nomad cluster job is kept running! + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" + then + # It "overrides" completely `backend_nomad`'s `stop-cluster`. + local usage="USAGE: wb backend $op RUN-DIR" + local dir=${1:?$usage}; shift + local nomad_job_name=$(jq -r ". 
[\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json) + msg "$(yellow "Cloud runs DO NOT automatically stop and purge Nomad jobs")" + msg "$(yellow "To stop the Nomad job use:")" + msg "$(yellow "wb nomad job stop ${dir}/nomad/nomad-job.json ${nomad_job_name}")" + msg "$(yellow "(With the same NOMAD_ADDR, NOMAD_NAMESPACE and NOMAD_TOKEN used for start-cluster)")" + else + # Shared code between Nomad sub-backends that internally only takes care + # of the Nomad job. + backend_nomad stop-cluster-internal "$@" fi ;; # Generic backend sub-commands, shared code between Nomad sub-backends. describe-run ) - backend_nomad describe-run "$@" + backend_nomad describe-run "$@" ;; is-running ) - backend_nomad is-running "$@" - ;; - - start-cluster ) - backend_nomad start-cluster "$@" - # start-ssh - # Only if running on "perf" exclusive nodes we use SSH, if not - # `nomad exec`, because we need to have an exclusive port open for us. - if echo "${WB_SHELL_PROFILE}" | grep --quiet "cw-perf" - then - local jobs_array=() - local nodes=($(jq_tolist keys "${dir}"/node-specs.json)) - for node in ${nodes[*]} - do - # TODO: Do it in parallel ? - backend_nomad task-program-start "${dir}" "${node}" ssh & - jobs_array+=("$!") - done - # Wait and check! - if test -n "${jobs_array}" - then - if ! wait_fail_any "${jobs_array[@]}" - then - fatal "Failed to start ssh server(s)" - fi - fi - fi + backend_nomad is-running "$@" ;; start-tracers ) - backend_nomad start-tracers "$@" + backend_nomad start-tracers "$@" ;; start-nodes ) - backend_nomad start-nodes "$@" + backend_nomad start-nodes "$@" ;; start-generator ) - backend_nomad start-generator "$@" + backend_nomad start-generator "$@" ;; start-healthchecks ) - backend_nomad start-healthchecks "$@" + backend_nomad start-healthchecks "$@" ;; start-node ) - backend_nomad start-node "$@" + backend_nomad start-node "$@" ;; stop-node ) - backend_nomad stop-node "$@" + backend_nomad stop-node "$@" ;; get-node-socket-path ) - backend_nomad get-node-socket-path "$@" + backend_nomad get-node-socket-path "$@" ;; wait-node ) - backend_nomad wait-node "$@" + backend_nomad wait-node "$@" ;; wait-node-stopped ) - backend_nomad wait-node-stopped "$@" - ;; - - wait-pools-stopped ) - backend_nomad wait-pools-stopped "$@" + backend_nomad wait-node-stopped "$@" ;; stop-all ) - backend_nomad stop-all "$@" - ;; - - stop-cluster ) - backend_nomad stop-cluster "$@" + backend_nomad stop-all "$@" ;; cleanup-cluster ) - backend_nomad cleanup-cluster "$@" + backend_nomad cleanup-cluster "$@" ;; * ) @@ -159,72 +173,184 @@ backend_nomadcloud() { } -# Sets jq envars "profile_container_specs_file" ,"nomad_environment", -# "nomad_task_driver" and "one_tracer_per_node". +# Sets jq envars ("profile_container_specs_file" ,"nomad_environment", +# "nomad_task_driver" and "one_tracer_per_node") and checks Nomad envars +# (NOMAD_ADDR, NOMAD_NAMESPACE, NOMAD_TOKEN). setenv-defaults-nomadcloud() { local backend_dir="${1}" local profile_container_specs_file profile_container_specs_file="${backend_dir}"/container-specs.json + # Nomad cloud profiles only available for Cardano World "qa" nodes + # ("-nomadcwqa") or the P&T dedicated Nomad cluster ("-nomadperf") + if \ + ! echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadcwqa" \ + && \ + ! 
echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" + then + fatal "Unknown profile for Nomad Cloud: \"${WB_SHELL_PROFILE}\"" + fi + + ############## + # NOMAD_ADDR # + ############## if test -z "${NOMAD_ADDR+set}" then - # The variable is not set, not set but empty, just not set! - msg $(yellow "WARNING: Nomad address \"NOMAD_ADDR\" envar is not set") - # TODO: New Nomad cluster:export NOMAD_ADDR=http://10.200.0.1:4646 - export NOMAD_ADDR="https://nomad.world.dev.cardano.org" - msg $(blue "INFO: Setting \"NOMAD_ADDR\" to the SRE provided address for \"Performance and Tracing\" (\"${NOMAD_ADDR}\")") + # The variable is not set, it's not set to an empty value, just not set! + ######################################################################## + msg $(blue "INFO: Nomad address \"NOMAD_ADDR\" envar is not set") + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadcwqa" + then + export NOMAD_ADDR="https://nomad.world.dev.cardano.org" + fi + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" + then + export NOMAD_ADDR="http://10.200.0.1:4646" + fi + msg $(yellow "WARNING: Setting \"NOMAD_ADDR\" to the SRE provided address for \"Performance and Tracing\" (\"${NOMAD_ADDR}\")") else # The variable is set and maybe empty! + ###################################### msg $(blue "INFO: Nomad address \"NOMAD_ADDR\" envar is \"${NOMAD_ADDR}\"") - if test "${NOMAD_ADDR}" != "https://nomad.world.dev.cardano.org" + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadcwqa" then - msg $(yellow "WARNING: Nomad address \"NOMAD_ADDR\" envar is not \"https://nomad.world.dev.cardano.org\"") + if test "${NOMAD_ADDR}" != "https://nomad.world.dev.cardano.org" + then + fatal "Nomad address \"NOMAD_ADDR\" envar is not \"https://nomad.world.dev.cardano.org\"" + fi + fi + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" + then + if test "${NOMAD_ADDR}" != "http://10.200.0.1:4646" + then + fatal "Nomad address \"NOMAD_ADDR\" envar is not \"http://10.200.0.1:4646\"" + fi fi fi - # The abscence of `NOMAD_NAMESPACE` or `NOMAD_TOKEN` needs confirmation + ################### + # NOMAD_NAMESPACE # + ################### if test -z ${NOMAD_NAMESPACE+set} then - # The variable is not set, not set but empty, just not set! - msg $(yellow "WARNING: Nomad namespace \"NOMAD_NAMESPACE\" envar is not set") - # TODO: New Nomad cluster: export NOMAD_NAMESPACE="" - export NOMAD_NAMESPACE="perf" - msg $(blue "INFO: Setting \"NOMAD_NAMESPACE\" to the SRE provided namespace for \"Performance and Tracing\" (\"${NOMAD_NAMESPACE}\")") + msg $(blue "INFO: Nomad namespace \"NOMAD_NAMESPACE\" envar is not set") + # The variable is not set, it's not set to an empty value, just not set! + ######################################################################## + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadcwqa" + then + export NOMAD_NAMESPACE="perf" + msg $(yellow "WARNING: Setting \"NOMAD_NAMESPACE\" to the SRE provided namespace for \"Performance and Tracing\" (\"${NOMAD_NAMESPACE}\")") + fi + # We don't use namespaces for the P&T cluster. Nothing else to do! else # The variable is set and maybe empty! 
+ ###################################### msg $(blue "INFO: Nomad namespace \"NOMAD_NAMESPACE\" envar is \"${NOMAD_NAMESPACE}\"") - if test "${NOMAD_NAMESPACE}" != "perf" + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadcwqa" + then + if test "${NOMAD_NAMESPACE}" != "perf" + then + fatal "Nomad namespace \"NOMAD_NAMESPACE\" envar is not \"perf\"" + fi + fi + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" then - msg $(yellow "WARNING: Nomad namespace \"NOMAD_NAMESPACE\" envar is not \"perf\"") + if test "${NOMAD_NAMESPACE}" != "" + then + fatal "Nomad namespace \"NOMAD_NAMESPACE\" envar is not empty" + fi fi fi + ############### + # NOMAD_TOKEN # + ############### if test -z "${NOMAD_TOKEN+set}" then - # The variable is not set, not set but empty, just not set! - msg $(yellow "WARNING: Nomad token \"NOMAD_TOKEN\" envar is not set") - msg $(yellow "If you need to fetch a NOMAD_TOKEN for world.dev.cardano.org provide an empty string") + msg $(blue "INFO: Nomad token \"NOMAD_TOKEN\" envar is not set") + # The variable is not set, it's not set to an empty value, just not set! + ######################################################################## + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadcwqa" + then + export NOMAD_TOKEN="$(wb_nomad vault world nomad-token)" + msg $(yellow "WARNING: Fetching a \"NOMAD_TOKEN\" from SRE provided Vault for \"Performance and Tracing\"") + fi + # We don't use tokens for the P&T cluster. Nothing else to do! else # The variable is set and maybe empty! - if test -z "${NOMAD_TOKEN}" + ###################################### + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadcwqa" then - msg $(blue "INFO: Fetching a \"NOMAD_TOKEN\" from SRE provided Vault for \"Performance and Tracing\"") - export NOMAD_TOKEN="$(wb_nomad vault world nomad-token)" - else - msg $(blue "INFO: Using provided Nomad token \"NOMAD_TOKEN\" envar") + if test -z "${NOMAD_TOKEN}" + then + msg $(red "FATAL: Nomad token \"NOMAD_TOKEN\" envar is empty") + fatal "If you need to fetch a NOMAD_TOKEN for world.dev.cardano.org don't set the envar" + else + msg $(blue "INFO: Using provided Nomad token \"NOMAD_TOKEN\" envar") + fi + fi + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" + then + if test -n "${NOMAD_TOKEN}" + then + fatal "A non-empty Nomad token \"NOMAD_TOKEN\" envar was provided but none is needed" + fi fi fi + # Check all the AWS S3 envars needed for the HTTP PUT request # Using same names as the AWS CLI # https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html if test -z "${AWS_ACCESS_KEY_ID:-}" || test -z "${AWS_SECRET_ACCESS_KEY:-}" then - msg $(yellow "WARNING: Amazon S3 \"AWS_ACCESS_KEY_ID\" or \"AWS_SECRET_ACCESS_KEY\" envar is not set") - msg $(blue "INFO: Fetching \"AWS_ACCESS_KEY_ID\" and \"AWS_SECRET_ACCESS_KEY\" from SRE provided Vault for \"Performance and Tracing\"") + msg $(blue "INFO: Amazon S3 \"AWS_ACCESS_KEY_ID\" or \"AWS_SECRET_ACCESS_KEY\" envar is not set") + msg $(yellow "WARNING: Fetching \"AWS_ACCESS_KEY_ID\" and \"AWS_SECRET_ACCESS_KEY\" from SRE provided Vault for \"Performance and Tracing\"") local aws_credentials aws_credentials="$(wb_nomad vault world aws-s3-credentials)" export AWS_ACCESS_KEY_ID=$(echo "${aws_credentials}" | jq -r .data.access_key) export AWS_SECRET_ACCESS_KEY=$(echo "${aws_credentials}" | jq -r .data.secret_key) fi +} + +# Sub-backend specific allocs and calls `backend_nomad`'s `allocate-run`. 
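# For reference, a minimal environment sketch matching the envar checks done
# by setenv-defaults-nomadcloud above (all values are taken from those checks,
# nothing here is extra configuration):
#
#   Dedicated P&T cluster ("*-nomadperf" profiles):
#     export NOMAD_ADDR=http://10.200.0.1:4646
#     (NOMAD_NAMESPACE and NOMAD_TOKEN are left unset, neither is used)
#
#   Cardano World "qa" class ("*-nomadcwqa" profiles):
#     export NOMAD_ADDR=https://nomad.world.dev.cardano.org
#     export NOMAD_NAMESPACE=perf
#     (leave NOMAD_TOKEN unset so it is fetched from Vault)
#
#   In both cases AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are fetched from
#   Vault when not already set.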
+allocate-run-nomadcloud() { + local usage="USAGE: wb backend $op RUN-DIR" + local dir=${1:?$usage}; shift + + # Copy the container specs file (container-specs.json) + # This is the output file of the Nix derivation + local profile_container_specs_file=$(envjqr 'profile_container_specs_file') + # Create a nicely sorted and indented copy + jq . "${profile_container_specs_file}" > "${dir}"/container-specs.json + + # Create nomad folder and copy the Nomad job spec file to run. + mkdir -p "${dir}"/nomad + # Select which version of the Nomad job spec file we are running and + # create a nicely sorted and indented copy in "nomad/nomad-job.json". + # Only if running on the dedicated P&T Nomad Cluster we use SSH, if not + # `nomad exec`, because we need to have an exclusive port open for us + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" + then + jq -r ".nomadJob.cloud.ssh" \ + "${dir}"/container-specs.json \ + > "${dir}"/nomad/nomad-job.json + else + # This avoids building some extra non-needed dependencies. + jq -r ".nomadJob.cloud.nomadExec" \ + "${dir}"/container-specs.json \ + > "${dir}"/nomad/nomad-job.json + fi + # The job file is "slightly" modified (jq) to suit the running environment. + if test -n "${NOMAD_NAMESPACE:-}" + then + # This sets only the global namespace, the job level namespace. Not groups! + backend_nomad allocate-run-nomad-job-patch-namespace "${dir}" "${NOMAD_NAMESPACE}" + else + # Empty the global namespace + backend_nomad allocate-run-nomad-job-patch-namespace "${dir}" + fi + # Will set the flake URIs from ".installable" in container-specs.json + backend_nomad allocate-run-nomad-job-patch-nix "${dir}" + # The Nomad job spec will contain links ("nix_installables" stanza) to # the Nix Flake outputs it needs inside the container, these are # refereced with a GitHub commit ID inside the "container-specs" file. @@ -267,61 +393,21 @@ setenv-defaults-nomadcloud() { fatal "Could not fetch commit info from GitHub (\`curl\` error)" fi fi + # There are so many assumptions that I like having the user confirm them! read -p "Hit enter to continue ..." -} - -# Sub-backend specific allocs and calls `backend_nomad`'s `allocate-run`. -allocate-run-nomadcloud() { - local usage="USAGE: wb backend $op RUN-DIR" - local dir=${1:?$usage}; shift - - # Copy the container specs file (container-specs.json) - # This is the output file of the Nix derivation - local profile_container_specs_file=$(envjqr 'profile_container_specs_file') - # Create a nicely sorted and indented copy - jq . "${profile_container_specs_file}" > "${dir}"/container-specs.json - - # Create nomad folder and copy the Nomad job spec file to run. - mkdir -p "${dir}"/nomad - # Select which version of the Nomad job spec file we are running and - # create a nicely sorted and indented copy it "nomad/nomad-job.json". - # Only if running on "perf" exclusive nodes we use SSH, if not `nomad exec`, - # because we need to have an exclusive port open for us. - if echo "${WB_SHELL_PROFILE}" | grep --quiet "cw-perf" - then - jq -r ".nomadJob.cloud.ssh" \ - "${dir}"/container-specs.json \ - > "${dir}"/nomad/nomad-job.json - else - jq -r ".nomadJob.cloud.nomadExec" \ - "${dir}"/container-specs.json \ - > "${dir}"/nomad/nomad-job.json - fi - # The job file is "slightly" modified (jq) to suit the running environment. - if test -n "${NOMAD_NAMESPACE:-}" - then - # This sets only the global namespace, the job level namespace. Not groups! 
- backend_nomad allocate-run-nomad-job-patch-namespace "${dir}" "${NOMAD_NAMESPACE}" - else - # Empty the global namespace - backend_nomad allocate-run-nomad-job-patch-namespace "${dir}" - fi - # Will set the flake URIs from ".installable" in container-specs.json - backend_nomad allocate-run-nomad-job-patch-nix "${dir}" # Set the placement info and resources accordingly local nomad_job_name nomad_job_name=$(jq -r ". [\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json) - ########################################################################## - # Profile name dependent changes ######################################### - ########################################################################## - # "cw-perf-*" profiles are profiles that only run on Cardano World's Nomad - # Nodes of class "perf". - # Other cloud profiles are for example "ci-test-cw-qa", "ci-test-cw-perf", - # "ci-test-cw-qa", "ci-test-cw-perf". "qa" means that they run on Nomad - # nodes that belong to the "qa" class, runs on these should be limited to - # short tests and must never use the "infra" class where HA jobs runs. + ############################################################################## + # Profile name dependent changes ############################################# + ############################################################################## + # "*-nomadperf" profiles only run on the dedicated P&T Nomad Cluster on AWS. + # "*-nomadcwqa" (for example "ci-test-nomadcwqa" or "default-nomadcwqa") means + # that they run on Cardano World Nomad cluster's nodes that belong to the "qa" + # class, runs on these should be limited to short tests and must never use the + # "infra" class where HA jobs runs. if test -z "${WB_SHELL_PROFILE:-}" then fatal "Envar \"WB_SHELL_PROFILE\" is empty!" @@ -329,20 +415,44 @@ allocate-run-nomadcloud() { ######################################################################## # Fix for region mismatches ############################################ ######################################################################## - # We use "us-east-2" and they use "us-east-1" - jq \ - ".[\"job\"][\"${nomad_job_name}\"][\"datacenters\"] |= [\"eu-central-1\", \"us-east-1\", \"ap-southeast-2\"]" \ - "${dir}"/nomad/nomad-job.json \ - | \ + # If value profile, "value-nomadperf", topology was imported from + # cardano-ops / nixops that was already using "us-east-1", but not + # "default-nomadperf", "ci-test-nomadperf" and "ci-bench" that is generated + # by `cardano-topology` (Haskell project in bench/). + # - Cardano World cluster: "eu-central-1", "us-east-2" + # - Workbench (Nix level): "eu-central-1", "us-east-2", and "ap-southeast-2" + # - Dedicated P&T cluster: "eu-central-1", "us-east-1", and "ap-southeast-2" + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" + then + jq \ + " \ + .[\"job\"][\"${nomad_job_name}\"][\"datacenters\"] \ + |= \ + [\"eu-central-1\", \"us-east-1\", \"ap-southeast-2\"] \ + " \ + "${dir}"/nomad/nomad-job.json \ + | \ sponge "${dir}"/nomad/nomad-job.json - jq \ - ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries( if (.value.affinity.value == \"us-east-2\") then (.value.affinity.value |= \"us-east-1\") else (.) end )" \ - "${dir}"/nomad/nomad-job.json \ - | \ + # Nix creates a Nomad Job file with affinities taken from node-specs.json + jq \ + " \ + .[\"job\"][\"${nomad_job_name}\"][\"group\"] \ + |= with_entries( \ + if (.value.affinity.value == \"us-east-2\") \ + then \ + (.value.affinity.value |= \"us-east-1\") \ + else \ + (.) 
\ + end \ + ) \ + " \ + "${dir}"/nomad/nomad-job.json \ + | \ sponge "${dir}"/nomad/nomad-job.json - ######################################################################## - # Unique placement: #################################################### - ######################################################################## + fi + ############################################################################ + # Unique placement: ######################################################## + ############################################################################ ## "distinct_hosts": Instructs the scheduler to not co-locate any groups ## on the same machine. When specified as a job constraint, it applies ## to all groups in the job. When specified as a group constraint, the @@ -359,25 +469,29 @@ allocate-run-nomadcloud() { } ] ' - # Adds it as a job level contraint. + # Adds it as an extra job level contraint. jq \ --argjson job_constraints_array "${job_constraints_array}" \ - ".[\"job\"][\"${nomad_job_name}\"].constraint |= \$job_constraints_array" \ + " \ + .[\"job\"][\"${nomad_job_name}\"].constraint \ + |= \ + (. + \$job_constraints_array) \ + " \ "${dir}"/nomad/nomad-job.json \ | \ sponge "${dir}"/nomad/nomad-job.json - ######################################################################## - # Node class: ########################################################## - ######################################################################## + ############################################################################ + # Node class: ############################################################## + ############################################################################ local group_constraints_array - # "perf" class nodes are the default unless the profile name contains - # "cw-qa", we try to limit the usage of Nomad nodes that are not dedicated - # Perf team nodes. - # But also, we have to be careful that "perf" runs do not overlap. We are - # making sure "perf" class nodes runs can't clash because service names - # and resources definitions currently won't allow that to happen but a new - # "perf" run may still mess up a previously running cluster. - if echo "${WB_SHELL_PROFILE}" | grep --quiet "cw-qa" + # Nomad nodes that belong to the "perf" class are the default in the Job + # definition and it stays like that unless the profile name contains + # "-nomadcwqa", in this case we limit the usage of to "qa" class nodes (CI + # dedicated) that are available for short runs. + # We have also have to be careful that runs do not overlap. This is + # automatically enforced because service names and resources definitions + # currently won't allow that to happen. + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadcwqa" then # Using "qa" class distinct nodes. Only "short" test allowed here. group_constraints_array=' @@ -391,7 +505,8 @@ allocate-run-nomadcloud() { ' elif test -n "${NOMAD_NAMESPACE:-}" then - # Using Performance & Tracing exclusive "perf" class distinct nodes! + # Use what was provided. + # If no namespace all group level constraints will be emptied! group_constraints_array=" \ [ \ { \ @@ -402,32 +517,40 @@ allocate-run-nomadcloud() { ] \ " fi - # It there something to change related to group constraints ? + # Is there something to change related to group constraints ? # Sets or deletes all groups level constraints. if test -n "${group_constraints_array:-}" then - # Adds it as a group level contraint to all groups. 
+ # Adds it as a group level contraint to all groups replacing default ones. jq \ --argjson group_constraints_array "${group_constraints_array}" \ - ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries(.value.constraint = \$group_constraints_array)" \ + " \ + .[\"job\"][\"${nomad_job_name}\"][\"group\"] \ + |= \ + with_entries(.value.constraint = \$group_constraints_array) \ + " \ "${dir}"/nomad/nomad-job.json \ | \ sponge "${dir}"/nomad/nomad-job.json else # Else, empties all group level constraints, like previous namespaces. jq \ - ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries(.value.constraint = null)" \ + " \ + .[\"job\"][\"${nomad_job_name}\"][\"group\"] \ + |= \ + with_entries(.value.constraint = null) \ + " \ "${dir}"/nomad/nomad-job.json \ | \ sponge "${dir}"/nomad/nomad-job.json fi - ######################################################################## - # Memory/resources: #################################################### - ######################################################################## + ############################################################################ + # Memory/resources: ######################################################## + ############################################################################ # Set the resources, only for perf exlusive cloud runs! - # When not "perf", when "cw-qa", only "short" tests are allowed on - # whatever resources we are given. - if echo "${WB_SHELL_PROFILE}" | grep --quiet "cw-perf" + # When not "-nomadperf", when "-nomadcwqa", only "short" tests are allowed + # on whatever resources we are given. + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" then # Producer nodes use this specs, make sure they are available! # AWS: @@ -443,27 +566,30 @@ allocate-run-nomadcloud() { ## - memory.totalbytes = 16300142592 ## Pesimistic: 1,798 MiB / 15,545 MiB Total ## Optimistic: 1,396 MiB / 15,545 MiB Total + # + # WARNING: Don't use more than roughly 15400, for example 15432, because + # some clients show a couple bytes less available. local producer_resources='{ "cores": 8 - , "memory": 13000 - , "memory_max": 15000 + , "memory": 15400 + , "memory_max": 32000 }' # Set this for every non-explorer node jq \ --argjson producer_resources "${producer_resources}" \ - " \ - .[\"job\"][\"${nomad_job_name}\"][\"group\"] \ - |= \ - with_entries( \ - if ( .key != \"explorer\" ) \ - then ( \ - .value.task \ - |= \ + " \ + .[\"job\"][\"${nomad_job_name}\"][\"group\"] \ + |= \ + with_entries( \ + if ( .key != \"explorer\" ) \ + then ( \ + .value.task \ + |= \ with_entries( .value.resources = \$producer_resources ) \ - ) else ( \ - . \ - ) end \ - ) \ + ) else ( \ + . \ + ) end \ + ) \ " \ "${dir}"/nomad/nomad-job.json \ | \ @@ -484,12 +610,23 @@ allocate-run-nomadcloud() { # client named "ip-10-24-30-90.eu-central-1.compute.internal" local explorer_resources='{ "cores": 16 - , "memory": 29000 - , "memory_max": 31000 + , "memory": 32000 + , "memory_max": 64000 }' + # TODO/MAYBE: When not "value" profile, let the explorer run in any node? + # resource wise. So more than one "ci-test", "ci-bench", "default" profile + # can be run at the same time. This will need some changes to Nomad + # services names (currently all "perfnode#"). + # WARNING: By always using/placing the explorer node in the only machine + # with more memory, we are sure runs do not overlap and no ports, etc are + # clashing and interfering with benchmarks results! 
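# A quick way to double-check the resources that end up in the job spec after
# the producer patch above and the explorer patch below (an ad-hoc inspection
# sketch, not part of the run flow):
#
#   jq '."job"[]."group" | map_values(."task" | map_values(.resources))' \
#     "${dir}"/nomad/nomad-job.json
#
# It should report 16 cores / 32000 MiB of memory for the "explorer" group
# and 8 cores / 15400 MiB for every producer group.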
jq \ --argjson resources "${explorer_resources}" \ - ".[\"job\"][\"${nomad_job_name}\"][\"group\"][\"explorer\"][\"task\"] |= with_entries( .value.resources = \$resources )" \ + " \ + .[\"job\"][\"${nomad_job_name}\"][\"group\"][\"explorer\"][\"task\"] \ + |= \ + with_entries( .value.resources = \$resources ) \ + " \ "${dir}"/nomad/nomad-job.json \ | \ sponge "${dir}"/nomad/nomad-job.json @@ -497,8 +634,9 @@ allocate-run-nomadcloud() { ############################################################################ # SSH Server: ############################################################## ############################################################################ - if echo "${WB_SHELL_PROFILE}" | grep --quiet "cw-perf" + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" then + # Get or create the keys for the SSH servers and add them as templates. local template_json_srv template_json_usr template_json_srv="$( \ ssh-key-template \ @@ -522,10 +660,15 @@ allocate-run-nomadcloud() { tasks_array=$(jq -S -r ".[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"][\"task\"] | keys | join (\" \")" "${dir}"/nomad/nomad-job.json) for task_name in ${tasks_array[*]} do + # Append the new templates. jq \ --argjson template_json_srv "${template_json_srv}" \ --argjson template_json_usr "${template_json_usr}" \ - ".[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"][\"task\"][\"${task_name}\"][\"template\"] |= ( . + [\$template_json_srv, \$template_json_usr])" \ + " \ + .[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"][\"task\"][\"${task_name}\"][\"template\"] \ + |= \ + ( . + [\$template_json_srv, \$template_json_usr]) \ + " \ "${dir}"/nomad/nomad-job.json \ | \ sponge "${dir}"/nomad/nomad-job.json @@ -535,21 +678,27 @@ allocate-run-nomadcloud() { ######################################################################## # Reproducibility: ##################################################### ######################################################################## - # If value profile on "perf", using always the same placement! + # If value profile on "-nomadperf", using always the same placement! # This means node-N always runs on the same Nomad Client/AWS EC2 machine - if test "${WB_SHELL_PROFILE:0:13}" = 'cw-perf-value' + if \ + test "${WB_SHELL_PROFILE:0:15}" = 'value-nomadperf' \ + || \ + test "${WB_SHELL_PROFILE:0:26}" = 'value-oldtracing-nomadperf' then # A file with all the available Nomad Clients is needed! # This files is a list of Nomad Clients with a minimun of ".id", # ".datacenter", ".attributes.platform.aws["instance-type"]", # ".attributes.platform.aws.placement["availability-zone"]", # ".attributes.unique.platform.aws["instance-id"]", - # ".attributes.unique.platform.aws.["public-ipv4"]" and - # ".attributes.unique.platform.aws.mac". + # ".attributes.unique.platform.aws.["public-ipv4"]" + # ".attributes.unique.platform.aws.mac", ".attributes.cpu.modelname" and + # ".attributes.kernel.version". if test -z "${NOMAD_CLIENTS_FILE:-}" || ! test -f "${NOMAD_CLIENTS_FILE}" then fatal "No \"\$NOMAD_CLIENTS_FILE\". For reproducible builds provide this file that ensures cluster nodes are always placed on the same machines, or create a new one with 'wb nomad nodes' if Nomad Clients have suffered changes and runs fail with \"placement errors\"" fi + # Keep a copy of the file used for this run! 
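# A sketch of the minimum shape expected for each entry of the
# NOMAD_CLIENTS_FILE (field names are exactly the ones queried below, all
# values are illustrative):
#
#   {
#     "id": "12345678-1234-1234-1234-123456789012"
#   , "name": "ip-10-24-30-90.eu-central-1.compute.internal"
#   , "datacenter": "eu-central-1"
#   , "attributes": {
#       "cpu":    { "modelname": "AMD EPYC 7R13 Processor" }
#     , "kernel": { "version": "5.10.0" }
#     , "platform": { "aws": {
#         "instance-type": "c5.2xlarge"
#       , "placement": { "availability-zone": "eu-central-1a" }
#       } }
#     , "unique": { "platform": { "aws": {
#         "instance-id": "i-0123456789abcdef0"
#       , "public-ipv4": "3.72.0.1"
#       , "mac": "06:00:00:00:00:01"
#       } } }
#     }
#   }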
+ cp "${NOMAD_CLIENTS_FILE}" "${dir}"/nomad/clients.json # For each (instance-type, datacener/region) we look incrementally for # the unique AWS EC2 "instance-id" only after ordering the Nomad # Clients by its unique Nomad provided "id". @@ -557,11 +706,13 @@ allocate-run-nomadcloud() { # For each Nomad Job Group local groups_array # Keys MUST be sorted to always get the same order for the same profile! - groups_array=$(jq -S -r ".[\"job\"][\"${nomad_job_name}\"][\"group\"] | keys | sort | join (\" \")" "${dir}"/nomad/nomad-job.json) + # Bash's `sort --version-sort` to correctly sort "node-20" and "node-9". + readarray -t groups_array < <(jq -S -r ".[\"job\"][\"${nomad_job_name}\"][\"group\"] | keys | .[]" "${dir}"/nomad/nomad-job.json | sort --version-sort) for group_name in ${groups_array[*]} do - # Obtain the datacenter as Nomad sees it, not as an AWS attributes. + # Obtain the datacenter as Nomad sees it, not as an AWS attribute. # For example "eu-central-1" instead of "eu-central-1a". + # These values were corrected above. local datacenter datacenter=$(jq \ -r \ @@ -571,7 +722,8 @@ allocate-run-nomadcloud() { # For each Nomad Job Group Task local tasks_array # Keys MUST be sorted to always get the same order for the same profile! - tasks_array=$(jq -S -r ".[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"][\"task\"] | keys | sort | join (\" \")" "${dir}"/nomad/nomad-job.json) + # Bash's `sort --version-sort` to correctly sort "node-20" and "node-9". + readarray -t tasks_array < <(jq -S -r ".[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"][\"task\"] | keys | .[]" "${dir}"/nomad/nomad-job.json | sort --version-sort) for task_name in ${tasks_array[*]} do local count instance_type @@ -600,10 +752,14 @@ allocate-run-nomadcloud() { fi # Get the actual client for this datacenter and instance type. local actual_client + # Sort first by name so if a Nomad client gets redeployed, replaced + # by a new one with the same name, only that Task is placed in a + # different EC2 machine instead of having random changes depending on + # where the new UUID lands on the clients NOMAD_CLIENTS_FILE file. actual_client=$(jq \ " . \ | \ - sort_by(.id) \ + sort_by(.name, .id) \ | \ map(select(.datacenter == \"${datacenter}\")) \ | \ @@ -613,7 +769,7 @@ allocate-run-nomadcloud() { " \ "${NOMAD_CLIENTS_FILE}" \ ) - local instance_id availability_zone public_ipv4 mac_address + local instance_id availability_zone public_ipv4 mac_address cpu_model kernel_version instance_id="$( \ echo "${actual_client}" \ | \ @@ -638,6 +794,18 @@ allocate-run-nomadcloud() { jq -r \ '.attributes.unique.platform.aws.mac' \ )" + cpu_model="$( \ + echo "${actual_client}" \ + | \ + jq -r \ + '.attributes.cpu.modelname' \ + )" + kernel_version="$( \ + echo "${actual_client}" \ + | \ + jq -r \ + '.attributes.kernel.version' \ + )" # Pin the actual node to an specific Nomad Client / AWS instance # by appending below constraints to the already there group # constraints. @@ -671,17 +839,69 @@ allocate-run-nomadcloud() { \"attribute\": \"\${attr.unique.platform.aws.mac}\" \ , \"value\": \"${mac_address}\" \ } \ + , + { \ + \"attribute\": \"\${attr.cpu.modelname}\" \ + , \"value\": \"${cpu_model}\" \ + } \ + , + { \ + \"attribute\": \"\${attr.kernel.version}\" \ + , \"value\": \"${kernel_version}\" \ + } \ ] \ " jq \ --argjson group_constraints_array_plus "${group_constraints_array_plus}" \ - ".[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"][\"constraint\"] |= ( . 
+ \$group_constraints_array_plus)" \ + " \ + .[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"][\"constraint\"] \ + |= \ + ( . + \$group_constraints_array_plus) \ + " \ "${dir}"/nomad/nomad-job.json \ | \ sponge "${dir}"/nomad/nomad-job.json done done + # Else, if not value profile but still the P&T exclusive cluster, it's not + # always the same exact placement, we just make sure regions are OK + # When not "-nomadperf", when "-nomadcwqa", only "short" tests are allowed + # on whatever resources we are given, regions are only an affinity. + elif echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" + then + local groups_array + groups_array=$(jq -S -r ".[\"job\"][\"${nomad_job_name}\"][\"group\"] | keys | sort | join (\" \")" "${dir}"/nomad/nomad-job.json) + for group_name in ${groups_array[*]} + do + # Obtain the datacenter as Nomad sees it, not as an AWS attribute. + # For example "eu-central-1" instead of "eu-central-1a". + # These values were corrected above. + local datacenter + datacenter=$(jq \ + -r \ + ".[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"].affinity.value" \ + "${dir}"/nomad/nomad-job.json \ + ) + local group_constraints_array_plus=" + [ \ + { \"attribute\": \"\${node.datacenter}\" \ + , \"value\": \"${datacenter}\" \ + } \ + ] \ + " + jq \ + --argjson group_constraints_array_plus "${group_constraints_array_plus}" \ + " \ + .[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"][\"constraint\"] \ + |= \ + ( . + \$group_constraints_array_plus) \ + " \ + "${dir}"/nomad/nomad-job.json \ + | \ + sponge "${dir}"/nomad/nomad-job.json + done fi + ############################################################################ fi # Store a summary of the job. @@ -718,6 +938,92 @@ allocate-run-nomadcloud() { read -p "Hit enter to continue ..." } +check-deployment() { + local usage="USAGE: check-deployment RUN-DIR" + local dir=${1:?$usage}; shift + + # Can only be created if `"${dir}"/nomad/clients.json` exists + # (Requested by `allocate-run`). + msg "Creating ex-post node-specs.json and topology.json files ..." + local jobs_array=() + # A node-specs.json like file with only "i", "name", "region", "port" + # but adds a "nomad-client" object with "id", "name", "az" and "ip". + wb_nomad job node-specs "${dir}"/nomad/nomad-job.json \ + > "${dir}"/nomad/node-specs.json \ + & + jobs_array+=("$!") + # A topology.json like file with only "nodeId", "name", "region" and + # the list of "producers" that is re-constructed. + wb_nomad job topology "${dir}"/nomad/nomad-job.json \ + > "${dir}"/nomad/topology.json \ + & + jobs_array+=("$!") + if ! wait_kill_em_all "${jobs_array[@]}" + then + return 1 + fi + + # An easy to compare .csv version of the topology. + jq -r \ + ' + .coreNodes as $nodes | $nodes | map( + .name as $name + | .region as $region + | .producers | map( . as $prodName | + $name + + "," + + $region[0:2] + + "," + + $prodName + + "," + + ($nodes | map(select(.name == $prodName))[0] | .region[0:2]) + ) + ) | .[] | .[] + ' \ + "${dir}"/nomad/topology.json \ + | sort --version-sort \ + > "${dir}"/nomad/topology.csv + + local node_specs_filter=' + map( {i: .i, name: .name, region: .region[0:2], port: .port} ) + | sort_by(.i) + ' + local node_specs_ante node_specs_post + node_specs_ante="$(jq "${node_specs_filter}" "${dir}"/node-specs.json)" + node_specs_post="$(jq "${node_specs_filter}" "${dir}"/nomad/node-specs.json)" + if ! 
test "${node_specs_ante}" = "${node_specs_post}" + then + echo "${node_specs_ante}" > "${dir}"/node-specs.ante.json + echo "${node_specs_post}" > "${dir}"/node-specs.post.json + diff --side-by-side "${dir}"/node-specs.ante.json "${dir}"/node-specs.post.json + msg "$(red "----------")" + msg "$(red "REQUESTED AND DEPLOYED node-specs.json DO NOT MATCH")" + msg "$(red "----------")" + fi + local topology_filter=' + .coreNodes + | map({ + name: .name + , nodeId: .nodeId + , region: .region[0:2] + , producers: (.producers | sort) + }) + ' + local topology_ante topology_post + topology_ante="$(jq "${topology_filter}" "${dir}"/topology.json)" + topology_post="$(jq "${topology_filter}" "${dir}"/nomad/topology.json)" + if ! test "${topology_ante}" = "${topology_post}" + then + echo "${topology_ante}" > "${dir}"/topology.ante.json + echo "${topology_post}" > "${dir}"/topology.post.json + diff --side-by-side "${dir}"/topology.ante.json "${dir}"/topology.post.json + msg "$(red "----------")" + msg "$(red "REQUESTED AND DEPLOYED topology.json DO NOT MATCH")" + msg "$(red "----------")" + fi +} + +# Called by `run.sh` without exit trap (unlike `scenario_setup_exit_trap`)! deploy-genesis-nomadcloud() { local usage="USAGE: wb backend $op RUN-DIR" local dir=${1:?$usage}; shift @@ -760,13 +1066,8 @@ deploy-genesis-nomadcloud() { msg "$(green "File \"${genesis_file_name}\" uploaded successfully")" else msg "$(red "FATAL: Upload to Amazon S3 failed")" - local nomad_agents_were_already_running=$(envjqr 'nomad_agents_were_already_running') - if test "${nomad_agents_were_already_running}" = "false" - then - wb_nomad agents stop "${server_name}" "${client_name}" "exec" - fi # Already "fatal" -> ignore errors! - backend_nomad stop-nomad-job "${dir}" || true + backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")" fatal "Failed to upload genesis" fi @@ -777,7 +1078,7 @@ deploy-genesis-nomadcloud() { # File kept for debugging! msg "$(red "FATAL: deploy-genesis-wget \"${dir}\" \"${uri}\"")" # Already "fatal" -> ignore errors! - backend_nomad stop-nomad-job "${dir}" || true + backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")" fatal "Deploy of genesis \"${uri}\" failed" else msg "$(green "Genesis \"${uri}\" deployed successfully")" @@ -801,6 +1102,48 @@ fetch-logs-nomadcloud() { local dir=${1:?$usage}; shift msg "Fetch logs ..." + + msg "First start the sandboxed SSH servers ..." + # Only if running on dedicated P&T Nomad cluster on AWS we use SSH, if not + # `nomad exec`, because we need to have an exclusive port open for us. + if echo "${WB_SHELL_PROFILE}" | grep --quiet "\-nomadperf" + then + local jobs_array=() + local nodes=($(jq_tolist keys "${dir}"/node-specs.json)) + for node in ${nodes[*]} + do + # TODO: Do it in parallel ? + backend_nomad task-program-start "${dir}" "${node}" ssh & + jobs_array+=("$!") + done + # Wait and check! + if test -n "${jobs_array}" + then + if ! wait_kill_em_all "${jobs_array[@]}" + then + fatal "Failed to start ssh server(s)" + else + msg "Sandboxed ssh server(s) should be now ready" + # Make sure the SSH config file used to connect is already created. + # Ugly but if `ssh` is called inmediately after `wb nomad ssh config` + # race conditions can happen because the file contents are still in the + # cache. 
+ local ssh_config_path + ssh_config_path="$(wb nomad ssh config)" + msg "Used ssh config file: $(realpath ${ssh_config_path})" + fi + fi + fi + + fetch-logs-nomadcloud-retry "${dir}" + + msg "Sandboxed SSH servers will be kept running for debugging purposes" +} + +fetch-logs-nomadcloud-retry() { + local usage="USAGE: wb backend $op RUN-DIR" + local dir=${1:?$usage}; shift + local jobs_array=() for node in $(jq_tolist 'keys' "${dir}"/node-specs.json) do @@ -814,14 +1157,14 @@ fetch-logs-nomadcloud() { done if test -n "${jobs_array:-}" # If = () "unbound variable" error then - # Wait until all jobs finish, don't use `wait_fail_any` that kills - # Returns the exit code of the last job, ignore it! - if ! wait "${jobs_array[@]}" + # Wait until all jobs finish, don't use `wait_kill_em_all` that kills. + # Returns the exit code of the last failed job, we ignore it! + if ! wait_all "${jobs_array[@]}" then msg "$(red "Failed to fetch some logs")" msg "Check files \"${dir}/nomad/NODE/download_ok\" and \"${dir}/nomad/NODE/download_failed\"" read -p "Hit enter to retry ..." - fetch-logs-nomadcloud "${dir}" + fetch-logs-nomadcloud-retry "${dir}" else msg "$(green "Finished fetching logs")" fi @@ -843,12 +1186,14 @@ fetch-logs-nomadcloud-node() { | \ jq -r .Attributes[\"unique.platform.aws.public-ipv4\"] \ )" - local ssh_command="ssh -F $(wb nomad ssh config) -p 32000 -l nobody" + local ssh_config_path ssh_command + ssh_config_path="$(wb nomad ssh config)" + ssh_command="ssh -F ${ssh_config_path} -p 32000 -l nobody" local node_ok="true" - # Download healthcheck(s) logs. ############################################ - ############################################################################ + # Download healthcheck(s) logs. ############################################## + ############################################################################## msg "$(blue "Fetching") $(yellow "program \"healthcheck\"") run files from $(yellow "\"${node}\" (\"${public_ipv4}\")") ..." - if ! rsync -au -e "${ssh_command}" \ + if ! rsync -e "${ssh_command}" -au \ -f'- start.sh' \ "${public_ipv4}":/local/run/current/healthcheck/ \ "${dir}"/healthcheck/"${node}"/ @@ -857,12 +1202,12 @@ fetch-logs-nomadcloud-node() { touch "${dir}"/nomad/"${node}"/download_failed msg "$(red Error fetching) $(yellow "program \"healthcheck\"") $(red "run files from") $(yellow "\"${node}\" (\"${public_ipv4}\")") ..." fi - # Download generator logs. ################################################# - ############################################################################ + # Download generator logs. ################################################### + ############################################################################## if test "${node}" = "explorer" then msg "$(blue Fetching) $(yellow "program \"generator\"") run files from $(yellow "\"${node}\" (\"${public_ipv4}\")") ..." - if ! rsync -au -e "${ssh_command}" \ + if ! rsync -e "${ssh_command}" -au \ -f'- start.sh' -f'- run-script.json' \ "${public_ipv4}":/local/run/current/generator/ \ "${dir}"/generator/ @@ -872,10 +1217,10 @@ fetch-logs-nomadcloud-node() { msg "$(red Error fetching) $(yellow "program \"generator\"") $(red "run files from") $(yellow "\"${node}\" (\"${public_ipv4}\")") ..." fi fi - # Download node(s) logs. ################################################### - ############################################################################ + # Download node(s) logs. 
##################################################### + ############################################################################## msg "$(blue Fetching) $(yellow "program \"node\"") run files from $(yellow "\"${node}\" (\"${public_ipv4}\")") ..." - if ! rsync -au -e "${ssh_command}" \ + if ! rsync -e "${ssh_command}" -au \ -f'- start.sh' -f'- config.json' -f'- topology.json' \ -f'- node.socket' -f'- db/' \ "${public_ipv4}":/local/run/current/"${node}"/ \ @@ -885,10 +1230,10 @@ fetch-logs-nomadcloud-node() { touch "${dir}"/nomad/"${node}"/download_failed msg "$(red Error fetching) $(yellow "program \"node\"") $(red "run files from") $(yellow "\"${node}\" (\"${public_ipv4}\")") ..." fi - # Download tracer(s) logs. ############################################### - ########################################################################## + # Download tracer(s) logs. ################################################### + ############################################################################## msg "$(blue Fetching) $(yellow "program \"tracer\"") run files from $(yellow "\"${node}\" (\"${public_ipv4}\")") ..." - if ! rsync -au -e "${ssh_command}" \ + if ! rsync -e "${ssh_command}" -au \ -f'- start.sh' -f'- config.json' \ -f'- tracer.socket' -f'- logRoot/' \ "${public_ipv4}":/local/run/current/tracer/ \ @@ -923,7 +1268,7 @@ ssh-key-template() { ' { "env": false - , "destination": ("/local/run/current/ssh/" + $key_name) + , "destination": ("local/run/current/ssh/" + $key_name) , "data": $key_data , "change_mode": "noop" , "error_on_missing_key": true diff --git a/nix/workbench/backend/nomad/exec.sh b/nix/workbench/backend/nomad/exec.sh index a8cd3edec16..26966077694 100644 --- a/nix/workbench/backend/nomad/exec.sh +++ b/nix/workbench/backend/nomad/exec.sh @@ -13,7 +13,7 @@ backend_nomadexec() { # Can be: # nomadpodman (Using podman Task Driver in the cloud is not planned) # nomadexec (Starts Nomad Agents supporting the "nix_installable" stanza) - # nomadcloud (IOG Nomad Agents and Amazon S3 with credentials from Vault) + # nomadcloud (SRE managed Nomad Agents on Amazon S3 (dedicated or not)) echo 'nomadexec' ;; @@ -42,84 +42,97 @@ backend_nomadexec() { ;; allocate-run ) - allocate-run-nomadexec "$@" + allocate-run-nomadexec "$@" # Does a pre allocation before calling the default/common allocation. - backend_nomad allocate-run "$@" + backend_nomad allocate-run "$@" ;; + # Called by `run.sh` without exit trap (unlike `scenario_setup_exit_trap`)! deploy-genesis ) # It "overrides" completely `backend_nomad`'s `deploy-genesis`. - deploy-genesis-nomadexec "$@" + deploy-genesis-nomadexec "$@" + ;; + + wait-pools-stopped ) + # It passes the sleep time (in seconds) required argument. + # This time is different between local and cloud backends to avoid + # unnecesary Nomad specific traffic (~99% happens waiting for node-0, the + # first one it waits to stop inside a loop) and at the same time be less + # sensitive to network failures. + backend_nomad wait-pools-stopped 1 "$@" + ;; + + # All or clean up everything! + # Called after `scenario.sh` without an exit trap! + stop-cluster ) + # Shared code between Nomad sub-backends that internally only takes care + # of the Nomad job. + backend_nomad stop-cluster-internal "$@" + # Takes care of any Nomad agents (server and client(s)) that were setup + # locally for only this run. + backend_nomad stop-cluster-local "$@" ;; # Generic backend sub-commands, shared code between Nomad sub-backends. 
describe-run ) - backend_nomad describe-run "$@" + backend_nomad describe-run "$@" ;; is-running ) - backend_nomad is-running "$@" + backend_nomad is-running "$@" ;; start-cluster ) - backend_nomad start-cluster "$@" + backend_nomad start-cluster "$@" ;; start-tracers ) - backend_nomad start-tracers "$@" + backend_nomad start-tracers "$@" ;; start-nodes ) - backend_nomad start-nodes "$@" + backend_nomad start-nodes "$@" ;; start-generator ) - backend_nomad start-generator "$@" + backend_nomad start-generator "$@" ;; start-healthchecks ) - backend_nomad start-healthchecks "$@" + backend_nomad start-healthchecks "$@" ;; start-node ) - backend_nomad start-node "$@" + backend_nomad start-node "$@" ;; stop-node ) - backend_nomad stop-node "$@" + backend_nomad stop-node "$@" ;; get-node-socket-path ) - backend_nomad get-node-socket-path "$@" + backend_nomad get-node-socket-path "$@" ;; wait-node ) - backend_nomad wait-node "$@" + backend_nomad wait-node "$@" ;; wait-node-stopped ) - backend_nomad wait-node-stopped "$@" - ;; - - wait-pools-stopped ) - backend_nomad wait-pools-stopped "$@" + backend_nomad wait-node-stopped "$@" ;; stop-all ) - backend_nomad stop-all "$@" + backend_nomad stop-all "$@" ;; fetch-logs ) - backend_nomad fetch-logs "$@" - ;; - - stop-cluster ) - backend_nomad stop-cluster "$@" + backend_nomad fetch-logs "$@" ;; cleanup-cluster ) - backend_nomad cleanup-cluster "$@" + backend_nomad cleanup-cluster "$@" ;; * ) @@ -165,9 +178,10 @@ allocate-run-nomadexec() { ## Empty the global namespace. Local runs ignore "${NOMAD_NAMESPACE:-}" backend_nomad allocate-run-nomad-job-patch-namespace "${dir}" # Will set the /nix/store paths from ".nix-store-path" in container-specs.json - backend_nomad allocate-run-nomad-job-patch-nix "${dir}" +# backend_nomad allocate-run-nomad-job-patch-nix "${dir}" } +# Called by `run.sh` without exit trap (unlike `scenario_setup_exit_trap`)! deploy-genesis-nomadexec() { local usage="USAGE: wb backend $op RUN-DIR" local dir=${1:?$usage}; shift @@ -185,8 +199,8 @@ deploy-genesis-nomadexec() { if test "${nomad_agents_were_already_running}" = "false" then msg "$(red "Startup of webfs failed, cleaning up ...")" + backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")" wb_nomad agents stop "${server_name}" "${client_name}" "exec" - backend_nomad stop-nomad-job "${dir}" fi fatal "Failed to start a local HTTP server" fi @@ -199,8 +213,8 @@ deploy-genesis-nomadexec() { if test "${nomad_agents_were_already_running}" = "false" then msg "$(red "Startup of webfs failed, cleaning up ...")" + backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")" wb_nomad agents stop "${server_name}" "${client_name}" "exec" - backend_nomad stop-nomad-job "${dir}" fi fatal "Failed to add genesis file to local HTTP server" else @@ -210,6 +224,9 @@ deploy-genesis-nomadexec() { local uri="http://127.0.0.1:12000/${nomad_job_name}.tar.zst" if ! 
backend_nomad deploy-genesis-wget "${dir}" "${uri}" then + msg "$(red "Deploy of genesis failed, cleaning up ...")" + backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")" + wb_nomad agents stop "${server_name}" "${client_name}" "exec" fatal "Deploy of genesis \"${uri}\" failed" else msg "$(green "Genesis \"${uri}\" deployed successfully")" diff --git a/nix/workbench/backend/nomad/podman.sh b/nix/workbench/backend/nomad/podman.sh index 0ea6111555c..d6a3f661af2 100644 --- a/nix/workbench/backend/nomad/podman.sh +++ b/nix/workbench/backend/nomad/podman.sh @@ -13,7 +13,7 @@ backend_nomadpodman() { # Can be: # nomadpodman (Using podman Task Driver in the cloud is not planned) # nomadexec (Starts Nomad Agents supporting the "nix_installable" stanza) - # nomadcloud (IOG Nomad Agents and Amazon S3 with credentials from Vault) + # nomadcloud (SRE managed Nomad Agents on Amazon S3 (dedicated or not)) echo 'nomadpodman' ;; @@ -42,84 +42,97 @@ backend_nomadpodman() { ;; allocate-run ) - allocate-run-nomadpodman "$@" + allocate-run-nomadpodman "$@" # Does a pre allocation before calling the default/common allocation. - backend_nomad allocate-run "$@" + backend_nomad allocate-run "$@" ;; + # Called by `run.sh` without exit trap (unlike `scenario_setup_exit_trap`)! deploy-genesis ) # It "overrides" completely `backend_nomad`'s `deploy-genesis`. - deploy-genesis-nomadpodman "$@" + deploy-genesis-nomadpodman "$@" + ;; + + wait-pools-stopped ) + # It passes the sleep time (in seconds) required argument. + # This time is different between local and cloud backends to avoid + # unnecesary Nomad specific traffic (~99% happens waiting for node-0, the + # first one it waits to stop inside a loop) and at the same time be less + # sensitive to network failures. + backend_nomad wait-pools-stopped 1 "$@" + ;; + + # All or clean up everything! + # Called after `scenario.sh` without an exit trap! + stop-cluster ) + # Shared code between Nomad sub-backends that internally only takes care + # of the Nomad job. + backend_nomad stop-cluster-internal "$@" + # Takes care of any Nomad agents (server and client(s)) that were setup + # locally for only this run. + backend_nomad stop-cluster-local "$@" ;; # Generic backend sub-commands, shared code between Nomad sub-backends. 
describe-run ) - backend_nomad describe-run "$@" + backend_nomad describe-run "$@" ;; is-running ) - backend_nomad is-running "$@" + backend_nomad is-running "$@" ;; start-cluster ) - backend_nomad start-cluster "$@" + backend_nomad start-cluster "$@" ;; start-tracers ) - backend_nomad start-tracers "$@" + backend_nomad start-tracers "$@" ;; start-nodes ) - backend_nomad start-nodes "$@" + backend_nomad start-nodes "$@" ;; start-generator ) - backend_nomad start-generator "$@" + backend_nomad start-generator "$@" ;; start-healthchecks ) - backend_nomad start-healthchecks "$@" + backend_nomad start-healthchecks "$@" ;; start-node ) - backend_nomad start-node "$@" + backend_nomad start-node "$@" ;; stop-node ) - backend_nomad stop-node "$@" + backend_nomad stop-node "$@" ;; get-node-socket-path ) - backend_nomad get-node-socket-path "$@" + backend_nomad get-node-socket-path "$@" ;; wait-node ) - backend_nomad wait-node "$@" + backend_nomad wait-node "$@" ;; wait-node-stopped ) - backend_nomad wait-node-stopped "$@" - ;; - - wait-pools-stopped ) - backend_nomad wait-pools-stopped "$@" + backend_nomad wait-node-stopped "$@" ;; stop-all ) - backend_nomad stop-all "$@" + backend_nomad stop-all "$@" ;; fetch-logs ) - backend_nomad fetch-logs "$@" - ;; - - stop-cluster ) - backend_nomad stop-cluster "$@" + backend_nomad fetch-logs "$@" ;; cleanup-cluster ) - backend_nomad cleanup-cluster "$@" + backend_nomad cleanup-cluster "$@" ;; * ) @@ -176,7 +189,7 @@ allocate-run-nomadpodman() { nomad_job_file_create_mounts "${dir}" } -# It "overrides" completely `backend_nomad`'s `deploy-genesis`. +# Called by `run.sh` without exit trap (unlike `scenario_setup_exit_trap`)! deploy-genesis-nomadpodman() { local usage="USAGE: wb backend $op RUN-DIR" local dir=${1:?$usage}; shift diff --git a/nix/workbench/backend/supervisor.sh b/nix/workbench/backend/supervisor.sh index 0ce53974d9f..123380a6021 100755 --- a/nix/workbench/backend/supervisor.sh +++ b/nix/workbench/backend/supervisor.sh @@ -176,9 +176,9 @@ EOF local usage="USAGE: wb backend $op RUN-DIR" local dir=${1:?$usage}; shift - # Avoid buffer related problems with stdout and stderr disabling buffering + # Make sure it never runs in unbuffered mode: # https://docs.python.org/3/using/cmdline.html#envvar-PYTHONUNBUFFERED - if ! PYTHONUNBUFFERED=TRUE supervisord --config "$dir"/supervisor/supervisord.conf $@ >"$dir"/supervisor/stderr 2>"$dir"/supervisor/stderr + if ! PYTHONUNBUFFERED="" supervisord --config "$dir"/supervisor/supervisord.conf $@ >"$dir"/supervisor/stderr 2>"$dir"/supervisor/stderr then progress "supervisor" "$(red fatal: failed to start) $(white supervisord)" echo "$(red supervisord.conf) --------------------------------" >&2 cat "$dir"/supervisor/supervisord.conf diff --git a/nix/workbench/lib.sh b/nix/workbench/lib.sh index 080f97d86bd..966a8d59a21 100644 --- a/nix/workbench/lib.sh +++ b/nix/workbench/lib.sh @@ -243,38 +243,71 @@ git_repo_commit_description() { } || echo "unknown-not-a-git-repo" } -# Wait for any job to fail or all to be OK! -wait_fail_any () { - local processes=("$@") - # There are any processes left? - if test -n "${processes[*]:-}" +# Waits for all jobs to finish independent of their exit status! +# Returns the first error code obtained if any one fails. +wait_all () { + wait_internal 0 "false" "$@" +} + +# Waits for any job to fail or all to be OK! +# All processes are killed as soon as one fails! +# Returns the first error code obtained if any one fails. 
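A usage sketch of these helpers; `wait_kill_em_all` and the shared `wait_internal` they delegate to are defined immediately below, and the per-node worker used here is hypothetical:

    jobs_array=()
    for node in node-0 node-1 node-2
    do
      fetch_one_node "${node}" &          # hypothetical background worker
      jobs_array+=("$!")
    done
    # (wait_all "${jobs_array[@]}" would instead let every job finish and only
    # then report the first non-zero exit status.)
    if ! wait_kill_em_all "${jobs_array[@]}"
    then
      echo "at least one job failed" >&2
    fi

    # The Bash building block both rely on: `wait -n` returns as soon as ONE
    # of the listed jobs exits, and `-p VAR` (Bash >= 5.1) records which one.
    sleep 3 & p1=$!
    sleep 1 & p2=$!
    wait -n -p exited_pid "${p1}" "${p2}"
    echo "first to exit: ${exited_pid}"   # expected: the PID of `sleep 1`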
+wait_kill_em_all () { + # We are scanning the scene in the city tonite ... searching, seek and destroy + wait_internal 0 "true" "$@" +} + +# Returns 0/success if no process fails, else returns the first error code +# obtained that is not zero. +wait_internal () { + # The initial status for recursion, on first call it should always be zero! + local initial_exit_status=${1}; shift + # Should all processes be killed as soon as one fails? Else waits for all + # processes to finish independent of their exit status. + local kill_em_all=${1}; shift + # Array of processes IDs or a jobs specifications. + # If ID is a job specification, waits for all processes in that job's pipeline + local processes_ids=("$@") + # Are there any processes left to wait for ? + if test -n "${processes_ids[*]:-}" then local wait_exit_status - local exited_process - wait -n -p exited_process "${processes[@]}" + local exited_process_id + # Wait for a single job from the list of processes and returns its exit + # status and the process or job identifier of the job for which the exit + # status is returned is assigned to the variable provided by `-p VAR`. + wait -n -p exited_process_id "${processes_ids[@]}" wait_exit_status=$? - # New array without the exited process - local processes_p=() - for p in "${processes[@]}" + # Only if the exit status to return is still zero we care about the + # new exit status. + if test "${initial_exit_status}" -eq 0 + then + initial_exit_status="${wait_exit_status}" + fi + # Create a wew array without the newly exited process. + local processes_ids_p=() + for p in "${processes_ids[@]}" do - if test "${p}" != "${exited_process}" + if test "${p}" != "${exited_process_id}" then - processes_p+=("${p}") + processes_ids_p+=("${p}") fi done - # Something else to wait for? - if test -n "${processes_p[*]:-}" + # Are there still any processes left to wait for ? + if test -n "${processes_ids_p[*]:-}" then # Keep waiting or kill 'em all ?' - if test "${wait_exit_status}" -eq 0 + if ! test "${wait_exit_status}" -eq 0 && test "${kill_em_all}" = "true" then - wait_fail_any "${processes_p[@]}" - else kill "${processes_p[@]}" 2>/dev/null || true return "${wait_exit_status}" + else + # Recursion, wiiiiiiiii! + wait_internal \ + "${initial_exit_status}" "${kill_em_all}" "${processes_ids_p[@]}" fi else - return "${wait_exit_status}" + return "${initial_exit_status}" fi else return 0 diff --git a/nix/workbench/nomad.sh b/nix/workbench/nomad.sh index a051f5ec283..fb2c673b2d2 100644 --- a/nix/workbench/nomad.sh +++ b/nix/workbench/nomad.sh @@ -25,6 +25,8 @@ usage_nomad() { Creates a JSON array with all the SRE's perf nodes in a format that can be used to ensure cloud runs are reproducible. + Needed envars (NOMAD_TOKEN, NOMAD_ADDR or NOMAD_NAMESPACE) + must be provided by the user. $(helpcmd agents start SERVER-NAME CLIENT-NAME TASK-DRIVER-NAME) Start a default 1 server 1 client Nomad cluster. @@ -337,7 +339,7 @@ wb_nomad() { local key_path="${ssh_dir}"/server.id_ed25519 if ! test -f "${key_path}" then - ssh-keygen -t ed25519 -f "${key_path}" -C "" -N "" + ssh-keygen -t ed25519 -f "${key_path}" -C "" -N "" >/dev/null fi echo "${key_path}" ;; @@ -346,7 +348,7 @@ wb_nomad() { local key_path="${ssh_dir}"/user.id_ed25519 if ! test -f "${key_path}" then - ssh-keygen -t ed25519 -f "${key_path}" -C "" -N "" + ssh-keygen -t ed25519 -f "${key_path}" -C "" -N "" >/dev/null fi echo "${key_path}" ;; @@ -362,21 +364,22 @@ wb_nomad() { if ! 
test -f "${file_path}" then cat > "${file_path}" << EOL -StrictHostKeyChecking accept-new -GlobalKnownHostsFile $(wb nomad ssh known_hosts) -UserKnownHostsFile $(wb nomad ssh known_hosts) -PasswordAuthentication no -PubKeyAuthentication yes -PreferredAuthentications publickey -IdentitiesOnly yes -IdentityFile $(wb nomad ssh key user) -Compression yes -TCPKeepAlive no -ServerAliveInterval 15 -ServerAliveCountMax 4 -ControlMaster auto -ControlPath ${ssh_dir}/%h-%p-%r -ControlPersist 15 +Host * + StrictHostKeyChecking accept-new + GlobalKnownHostsFile $(wb nomad ssh known_hosts) + UserKnownHostsFile $(wb nomad ssh known_hosts) + PasswordAuthentication no + PubKeyAuthentication yes + PreferredAuthentications publickey + IdentitiesOnly yes + IdentityFile $(wb nomad ssh key user) + Compression yes + TCPKeepAlive no + ServerAliveInterval 15 + ServerAliveCountMax 4 + ControlMaster auto + ControlPath ${ssh_dir}/%h-%p-%r + ControlPersist 15 EOL fi echo "${file_path}" @@ -401,12 +404,11 @@ EOL ################################################################################ nodes ) local usage="USAGE: wb nomad ${op}" - local nomad_address="https://nomad.world.dev.cardano.org" - local nomad_token - nomad_token=$(wb_nomad vault world nomad-token) - # Fetch the status of all nodes of class "perf" + # Fetch the status of all nodes that are in the "ready" state. + # If a node is removed status is "down" and will still show its details. + # Not using cardano specific filters anymore (-filter 'NodeClass=="perf"'). local perf_nodes - perf_nodes="$(NOMAD_TOKEN="${nomad_token}" NOMAD_NAMESPACE=perf nomad node status -address="${nomad_address}" -filter 'NodeClass=="perf"' -json)" + perf_nodes="$(nomad node status -filter 'Status=="ready"' -json)" # Create the base JSON string but without the "attributes" because those # are only available when fetching the status of individual nodes. 
local nodes_json @@ -433,7 +435,7 @@ EOL do # Fetch the attributes local node_attributes - node_attributes="$(NOMAD_TOKEN="${nomad_token}" NOMAD_NAMESPACE=perf nomad node status -address="${nomad_address}" -json "${node_id}" | jq .Attributes)" + node_attributes="$(nomad node status -json "${node_id}" | jq .Attributes)" # Add the attributes of this node to the JSON string nodes_json="$( \ echo "${nodes_json}" \ @@ -446,7 +448,23 @@ EOL .attributes \ |= \ { \ - \"os\": { \ + \"cpu\": { \ + \"arch\": \$attrs[\"cpu.arch\"] \ + , \"frequency\": \$attrs[\"cpu.frequency\"] \ + , \"modelname\": \$attrs[\"cpu.modelname\"] \ + , \"numcores\": \$attrs[\"cpu.numcores\"] \ + , \"reservablecores\": \$attrs[\"cpu.reservablecores\"] \ + , \"totalcompute\": \$attrs[\"cpu.totalcompute\"] \ + } \ + , \"kernel\": { \ + \"arch\": \$attrs[\"kernel.arch\"] \ + , \"name\": \$attrs[\"kernel.name\"] \ + , \"version\": \$attrs[\"kernel.version\"] \ + } \ + , \"memory\": { \ + \"totalbytes\": \$attrs[\"memory.totalbytes\"] \ + } \ + , \"os\": { \ \"name\": \$attrs[\"os.name\"] \ , \"version\": \$attrs[\"os.version\"] \ } \ @@ -559,13 +577,20 @@ EOL # https://support.hashicorp.com/hc/en-us/articles/360000654467-Removing-Orphaned-Mounts-from-Nomad-Allocation-Directory nomad system gc 2>&1 >/dev/null || true # Stop client - wb_nomad client stop "${client_name}" || true + wb_nomad client stop "${client_name}" \ + || \ + msg "$(red "Failed to stop Nomad client \"${client_name}\"")" + # Stop driver(s) if test "${task_driver}" = "podman" then - wb_nomad plugin nomad-driver-podman stop || true + wb_nomad plugin nomad-driver-podman stop \ + || \ + msg "$(red "Failed to stop nomad-driver-podman")" fi # Stop server - wb_nomad server stop "${server_name}" || true + wb_nomad server stop "${server_name}" \ + || \ + msg "$(red "Failed to stop Nomad server \"${server_name}\"")" ;; ####### agents -> * )########################################################### * ) @@ -1430,7 +1455,7 @@ EOF local job_file=${1:?$usage}; shift local job_name=${1:?$usage}; shift # Post a Nomad job without "monitor" (`-detach`) mode! - # I don't want to have `nomad` process attached to my terminal, + # I don't want to have a `nomad` process attached to my terminal, # funny things are happening with the workbench's log output! ### -detach ### Return immediately instead of entering monitor mode. After job @@ -1514,7 +1539,7 @@ EOF & jobs_array+=("$!") # Wait for all processes to finish or kill them if at least one fails! - wait_fail_any "${jobs_array[@]}" || touch "${job_file}.run/job.error" + wait_kill_em_all "${jobs_array[@]}" || touch "${job_file}.run/job.error" # Check for every possible error local return_code=0 # Any failed evaluation(s)? @@ -1623,7 +1648,7 @@ EOF jobs_array+=("$!") done # Wait for all processes to finish or kill them if at least one fails! - if ! wait_fail_any "${jobs_array[@]}" || test -f "${job_file}.run/evaluations.error" + if ! wait_kill_em_all "${jobs_array[@]}" || test -f "${job_file}.run/evaluations.error" then "${msgoff}" || msg "$(red "Exiting monitor of Nomad Evaluation(s) [${ids_array[@]}] due to errors")" # Send fatal job error signal after printing this error's messages! @@ -1689,7 +1714,7 @@ EOF jobs_array+=("$!") done # Wait for all processes to finish or kill them if at least one fails! - if ! wait_fail_any "${jobs_array[@]}" || test -f "${job_file}.run/allocations.error" + if ! 
wait_kill_em_all "${jobs_array[@]}" || test -f "${job_file}.run/allocations.error" then "${msgoff}" || msg "$(red "Exiting monitor of Nomad Allocation(s) [${ids_array[@]}] due to errors")" # Send fatal job error signal after printing this error's messages! @@ -1756,7 +1781,7 @@ EOF jobs_array+=("$!") done # Wait for all processes to finish or kill them if at least one fails! - if ! wait_fail_any "${jobs_array[@]}" || test -f "${job_file}.run/tasks.error" + if ! wait_kill_em_all "${jobs_array[@]}" || test -f "${job_file}.run/tasks.error" then "${msgoff}" || msg "$(red "Exiting monitor of Nomad Allocation \"${alloc_id}\" Task(s) [${ids_array[@]}] due to errors")" # Send fatal job error signal after printing this error's messages! @@ -2076,7 +2101,7 @@ EOF local usage="USAGE:wb nomad ${op} ${subop} JOB-FILE TASK-NAME" local job_file=${1:?$usage}; shift local task_name=${1:?$usage}; shift - jq -r '.ID' "${job_file}.run/task.${task_name}.final.json" + jq -r '.ID' "${job_file}".run/task.${task_name}.final.json ;; ####### job -> stop )########################################################### stop ) @@ -2084,7 +2109,268 @@ EOF local job_file=${1:?$usage}; shift local job_name=${1:?$usage}; shift # Do the prune, purge, garbage collect thing! - nomad job stop -global -no-shutdown-delay -purge -yes -verbose "${job_name}" + nomad job stop -global -no-shutdown-delay -purge -yes -verbose "${job_name}" || msg "$(red "Failed to stop Nomad job")" + ;; +####### job -> node-specs )##################################################### + node-specs ) + # Creates an ex-post "node-specs.json" like file. + # It uses Nomad tasks and allocations data plus the files that were + # actually deployed, these last ones because parts of them are + # dynamically generated using Nomad templates. + local usage="USAGE:wb nomad ${op} ${subop} JOB-FILE" + local job_file=${1:?$usage}; shift + # The nodes/clients file must exists! + local clients_file_path="$(dirname "${job_file}")"/clients.json + local node_specs_path="$(dirname "${job_file}")"/../node-specs.json + # Top object start + ################## + echo "{" + # Grab all the "i" properties from inside each "node-i" object + # Why "i" and not "name"? `jq` sorts like this: "node-49", "node-5", + # "node-50". + local node_specs_is + node_specs_is=$(jq --raw-output \ + 'map(.i) | join (" ")' \ + "${node_specs_path}" \ + ) + local first_node="true" + for node_i in ${node_specs_is[*]} + do + # Nomad Job Tasks' names are taken from the `node-specs.json` file. + # Task names are of the form "node-0", "node-1", "node-10" (not + # "node-04"). + local task_name + task_name=$(jq --raw-output \ + "map(select(.i == ${node_i})) | .[] | .name" \ + "${node_specs_path}" \ + ) + # Node open "{" + ############### + # If not the first one "," + if test "${first_node}" == "true" + then + first_node="false" + echo " \"${task_name}\": {" + else + echo " , \"${task_name}\": {" + fi + # Fetch from the allocation data the Nomad Client ID, Name and + # Datacenter/region were this Task was deployed. + local nomad_client_id + nomad_client_id=$(jq --raw-output \ + .NodeID \ + "${job_file}".run/task."${task_name}".final.json \ + ) + local nomad_client_name + nomad_client_name=$(jq --raw-output \ + .NodeName \ + "${job_file}".run/task."${task_name}".final.json \ + ) + local nomad_client_datacenter + nomad_client_datacenter=$(jq --raw-output \ + ". | map(select(.id == \"${nomad_client_id}\")) | .[0] | .datacenter" \ + "${clients_file_path}" \ + ) + # With the Nomad Client data now fetch AZ and port. 
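For orientation, each entry this subcommand ends up printing (assembled by the echo calls below) should look roughly like the following; every value here is made up:

    jq --null-input '
      { "node-0":
        { "i": 0
        , "name": "node-0"
        , "region": "eu-central-1"
        , "port": 30000
        , "nomad-client":
          { "id": "5eb0e745-0000-0000-0000-000000000000"
          , "name": "ip-10-24-12-34.eu-central-1.compute.internal"
          , "az": "eu-central-1a"
          , "ip": "203.0.113.10"
          }
        }
      }'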
+ local nomad_client_az # Client's AWS AZ were this task was deployed! + nomad_client_az=$(jq --raw-output \ + ". | map(select(.id == \"${nomad_client_id}\")) | .[0] | .attributes.platform.aws.placement[\"availability-zone\"]" \ + "${clients_file_path}" \ + ) + local nomad_task_port # Task's reserved port number! + nomad_task_port=$(jq --raw-output \ + .Resources.Networks[0].ReservedPorts[0].Value \ + "${job_file}".run/task."${task_name}".final.json \ + ) + local nomad_task_ip + nomad_task_ip=$(jq --raw-output \ + ". | map(select(.name == \"${nomad_client_name}\")) | .[0] | .attributes.unique.platform.aws[\"public-ipv4\"]" \ + "${clients_file_path}" \ + ) + # Same data as "node-specs.json". + ################################# + echo " \"i\": ${node_i}" + echo " , \"name\": \"${task_name}\"" + echo " , \"region\": \"${nomad_client_datacenter}\"" + echo " , \"port\": ${nomad_task_port}" + # Extra Nomad client data. + ########################## + echo " , \"nomad-client\": {" + echo " \"id\": \"${nomad_client_id}\"" + echo " , \"name\": \"${nomad_client_name}\"" + echo " , \"az\": \"${nomad_client_az}\"" + echo " , \"ip\": \"${nomad_task_ip}\"" + echo " }" + # Node close "}" + ################ + echo " }" + done + # Top object end + ################ + echo "}" + ;; +####### job -> topology )####################################################### + topology ) + # Creates an ex-post "topology.json" like file. + # It uses Nomad tasks and allocations data plus the files that were + # actually deployed, these last ones because parts of them are + # dynamically generated using Nomad templates. + # The "producers" list of each node is re-constructed using the Nomad + # services definitions. + local usage="USAGE:wb nomad ${op} ${subop} JOB-FILE" + local job_file=${1:?$usage}; shift + # The nodes/clients file must exists! + local clients_file_path="$(dirname "${job_file}")"/clients.json + local topology_path="$(dirname "${job_file}")"/../topology.json + # Helper, called for "coreNodes" and "relayNodes" separately. + topology-node-helper() { + local task_name="$1" + local node_i="$2" + # Fetch from the allocation data the Nomad Client ID and + # Datacenter/region were this Task was deployed. + local nomad_client_id + nomad_client_id=$(jq --raw-output \ + .NodeID \ + "${job_file}".run/task."${task_name}".final.json \ + ) + local nomad_client_datacenter + nomad_client_datacenter=$(jq --raw-output \ + ". | map(select(.id == \"${nomad_client_id}\")) | .[0] | .datacenter" \ + "${clients_file_path}" \ + ) + # Same data as "topology.json". + ############################### + echo " \"name\": \"${task_name}\"" + echo " , \"nodeId\": ${node_i}" + echo " , \"region\": \"${nomad_client_datacenter}\"" + # producers start + ################# + echo " , \"producers\": [" + local node_topology_file_path + node_topology_file_path="$(dirname "${job_file}")"/../"${task_name}"/topology.json + # Grab producers from the fetched, after deployment, "topology.json" files + local node_producers + node_producers=$(jq .Producers "${node_topology_file_path}") + local node_producers_keys + node_producers_keys=$(echo "${node_producers}" | jq --raw-output 'keys | join (" ")') + local node_producer_i=0 + for node_producer_key in ${node_producers_keys[*]} + do + # The topology file, as used by the node, is already formated as + # {"addr":XX,"port":YY} were the XX and YY values were resolved + # using Nomad templates. 
+ local producer_addr producer_port + producer_addr=$(echo "${node_producers}" | jq -r ".[${node_producer_key}] | .addr") + producer_port=$(echo "${node_producers}" | jq -r ".[${node_producer_key}] | .port") + # From the public IP and port look for the node that was deployed + # with this values by searching thorugh the services definitions. + local node_specs_path="$(dirname "${job_file}")"/../node-specs.json + local node_specs_names + node_specs_names=$(jq --raw-output \ + 'map(.name) | join (" ")' \ + "${node_specs_path}" \ + ) + for node_name in ${node_specs_names[*]} + do + if jq -e "any(select(.Address == \"${producer_addr}\" and .Port == ${producer_port}))" "$(dirname "${job_file}")"/"${node_name}"/service-info.json >/dev/null + then + if test "${node_producer_i}" == "0" + then + echo " \"${node_name}\"" + else + echo " , \"${node_name}\"" + fi + node_producer_i=$((node_producer_i + 1)) + fi + done + done + # producers end + ############### + echo " ]" + } + # Top object start + ################## + echo "{" + # coreNodes start + ################# + echo " \"coreNodes\": [" + # Grab all the "nodeId" properties from inside each array's objects + # Why "nodeId" and not "name"? `jq` sorts like this: "node-49", + # "node-5", "node-50". + local coreNodes_is + coreNodes_is=$(jq --raw-output \ + '.coreNodes | map(.nodeId) | join (" ")' \ + "${topology_path}" \ + ) + local first_coreNode="true" + for node_i in ${coreNodes_is[*]} + do + # Nomad Job Tasks' names are taken from the `topology.json` file. + # Task names are of the form "node-0", "node-1", "node-10" (not "node-04"). + local task_name + task_name=$(jq --raw-output \ + ".coreNodes | map(select(.nodeId == ${node_i})) | .[] | .name" \ + "${topology_path}" \ + ) + # Node open "{" + ############### + # If not the first one "," + if test "${first_coreNode}" == "true" + then + first_coreNode="false" + echo " {" + else + echo " , {" + fi + topology-node-helper "${task_name}" "${node_i}" + # Node close "}" + ################ + echo " }" + done + # coreNodes end + ############### + echo " ]" + # relayNodes start + ################## + echo " , \"relayNodes\": [" + # Grab all the "i" properties from inside each "node-i" object + # Why "i" and not name? `jq` sorts like this: "node-49", "node-5", "node-50" + local relayNodes_is + relayNodes_is=$(jq --raw-output \ + '.relayNodes | map(.nodeId) | join (" ")' \ + "${topology_path}" \ + ) + local first_relayNode="true" + for node_i in ${relayNodes_is[*]} + do + # Nomad Job Tasks' names are taken from the `topology.json` file. + # Task names are of the form "node-0", "node-1", "node-10" (not "node-04"). 
+ local task_name + task_name=$(jq --raw-output \ + ".relayNodes | map(select(.nodeId == ${node_i})) | .[] | .name" \ + "${topology_path}" \ + ) + # Node open "{" + ############### + # If not the first one "," + if test "${first_relayNode}" == "true" + then + first_relayNode="false" + echo " {" + else + echo " , {" + fi + topology-node-helper "${task_name}" "${node_i}" + # Node close "}" + ################ + echo " }" + done + # relayNodes end + ################ + echo " ]" + # Top object end + ################ + echo "}" ;; ####### job -> * )############################################################## * ) diff --git a/nix/workbench/profile/pparams/delta-mimicops.jq b/nix/workbench/profile/pparams/delta-mimicops.jq new file mode 100644 index 00000000000..9c7926794ee --- /dev/null +++ b/nix/workbench/profile/pparams/delta-mimicops.jq @@ -0,0 +1,118 @@ +def delta: +{ + "alonzo": { + "coinsPerUTxOByte": 4310 + }, + "shelley": { + "protocolVersion": { + "major": 5 + }, + }, + "costModels": { + "PlutusV1": { + "addInteger-cpu-arguments-intercept": 205665, + "addInteger-cpu-arguments-slope": 812, + "appendByteString-cpu-arguments-intercept": 1000, + "appendByteString-cpu-arguments-slope": 571, + "appendString-cpu-arguments-intercept": 1000, + "appendString-cpu-arguments-slope": 24177, + "appendString-memory-arguments-intercept": 4, + "bData-cpu-arguments": 1000, + "blake2b-cpu-arguments-intercept": 117366, + "blake2b-cpu-arguments-slope": 10475, + "blake2b-memory-arguments": 4, + "cekApplyCost-exBudgetCPU": 23000, + "cekBuiltinCost-exBudgetCPU": 23000, + "cekConstCost-exBudgetCPU": 23000, + "cekDelayCost-exBudgetCPU": 23000, + "cekForceCost-exBudgetCPU": 23000, + "cekLamCost-exBudgetCPU": 23000, + "cekVarCost-exBudgetCPU": 23000, + "chooseData-cpu-arguments": 19537, + "chooseList-cpu-arguments": 175354, + "chooseUnit-cpu-arguments": 46417, + "chooseUnit-memory-arguments": 4, + "consByteString-cpu-arguments-intercept": 221973, + "consByteString-cpu-arguments-slope": 511, + "constrData-cpu-arguments": 89141, + "decodeUtf8-cpu-arguments-intercept": 497525, + "decodeUtf8-cpu-arguments-slope": 14068, + "decodeUtf8-memory-arguments-intercept": 4, + "decodeUtf8-memory-arguments-slope": 2, + "divideInteger-cpu-arguments-constant": 196500, + "divideInteger-cpu-arguments-model-arguments-intercept": 453240, + "divideInteger-cpu-arguments-model-arguments-slope": 220, + "encodeUtf8-cpu-arguments-intercept": 1000, + "encodeUtf8-cpu-arguments-slope": 28662, + "encodeUtf8-memory-arguments-intercept": 4, + "encodeUtf8-memory-arguments-slope": 2, + "equalsByteString-cpu-arguments-constant": 245000, + "equalsByteString-cpu-arguments-intercept": 216773, + "equalsByteString-cpu-arguments-slope": 62, + "equalsData-cpu-arguments-intercept": 1060367, + "equalsData-cpu-arguments-slope": 12586, + "equalsInteger-cpu-arguments-intercept": 208512, + "equalsInteger-cpu-arguments-slope": 421, + "equalsString-cpu-arguments-constant": 187000, + "equalsString-cpu-arguments-intercept": 1000, + "equalsString-cpu-arguments-slope": 52998, + "fstPair-cpu-arguments": 80436, + "headList-cpu-arguments": 43249, + "iData-cpu-arguments": 1000, + "ifThenElse-cpu-arguments": 80556, + "indexByteString-cpu-arguments": 57667, + "indexByteString-memory-arguments": 4, + "lengthOfByteString-cpu-arguments": 1000, + "lengthOfByteString-memory-arguments": 10, + "lessThanByteString-cpu-arguments-intercept": 197145, + "lessThanByteString-cpu-arguments-slope": 156, + "lessThanEqualsByteString-cpu-arguments-intercept": 197145, + 
"lessThanEqualsByteString-cpu-arguments-slope": 156, + "lessThanEqualsInteger-cpu-arguments-intercept": 204924, + "lessThanEqualsInteger-cpu-arguments-slope": 473, + "lessThanInteger-cpu-arguments-intercept": 208896, + "lessThanInteger-cpu-arguments-slope": 511, + "listData-cpu-arguments": 52467, + "mapData-cpu-arguments": 64832, + "mkCons-cpu-arguments": 65493, + "mkNilData-cpu-arguments": 22558, + "mkNilPairData-cpu-arguments": 16563, + "mkPairData-cpu-arguments": 76511, + "modInteger-cpu-arguments-constant": 196500, + "modInteger-cpu-arguments-model-arguments-intercept": 453240, + "modInteger-cpu-arguments-model-arguments-slope": 220, + "multiplyInteger-cpu-arguments-intercept": 69522, + "multiplyInteger-cpu-arguments-slope": 11687, + "nullList-cpu-arguments": 60091, + "quotientInteger-cpu-arguments-constant": 196500, + "quotientInteger-cpu-arguments-model-arguments-intercept": 453240, + "quotientInteger-cpu-arguments-model-arguments-slope": 220, + "remainderInteger-cpu-arguments-constant": 196500, + "remainderInteger-cpu-arguments-model-arguments-intercept": 453240, + "remainderInteger-cpu-arguments-model-arguments-slope": 220, + "sha2_256-cpu-arguments-intercept": 806990, + "sha2_256-cpu-arguments-slope": 30482, + "sha3_256-cpu-arguments-intercept": 1927926, + "sha3_256-cpu-arguments-slope": 82523, + "sliceByteString-cpu-arguments-intercept": 265318, + "sliceByteString-cpu-arguments-slope": 0, + "sliceByteString-memory-arguments-intercept": 4, + "sliceByteString-memory-arguments-slope": 0, + "sndPair-cpu-arguments": 85931, + "subtractInteger-cpu-arguments-intercept": 205665, + "subtractInteger-cpu-arguments-slope": 812, + "tailList-cpu-arguments": 41182, + "trace-cpu-arguments": 212342, + "unBData-cpu-arguments": 31220, + "unConstrData-cpu-arguments": 32696, + "unIData-cpu-arguments": 43357, + "unListData-cpu-arguments": 32247, + "unMapData-cpu-arguments": 38314, + "verifyEd25519Signature-cpu-arguments-intercept": 9462713, + "verifyEd25519Signature-cpu-arguments-slope": 1021, + "verifyEd25519Signature-memory-arguments": 10 + }, + "PlutusV2": { + } + } +}; diff --git a/nix/workbench/profile/prof1-variants.jq b/nix/workbench/profile/prof1-variants.jq index 574a26a5e3f..49e7e812718 100644 --- a/nix/workbench/profile/prof1-variants.jq +++ b/nix/workbench/profile/prof1-variants.jq @@ -175,15 +175,24 @@ def all_profile_variants: , topology: "torus" , with_explorer: true } - } as $cardano_world_qa + } as $nomad_cardano_world_qa | - # "perf" class Nomad Nodes in ["eu-central-1", "us-east-2", "ap-southeast-2"] datacenters + # nomad_perf using cardano-ops "dense" topology + # Can only be used with the 52 + explorer value profile! 
+ { composition: + { locations: ["EU", "US", "AP"] + , topology: "dense" + , with_explorer: true + } + } as $nomad_perf_dense + | + # P&T Nomad cluster Nodes in ["eu-central-1", "us-east-2", "ap-southeast-2"] datacenters { composition: { locations: ["EU", "US", "AP"] , topology: "torus" , with_explorer: true } - } as $cardano_world_perf + } as $nomad_perf_torus | ## ### Definition vocabulary: filtering @@ -266,7 +275,7 @@ def all_profile_variants: ) as $current_tps_saturation_value | ({}| .generator.tps = 12 - ) as $cw_perf_tps_saturation_value + ) as $nomad_perf_tps_saturation_value | ({}| .generator.tps = 9 ) as $model_tps_saturation_value @@ -365,6 +374,11 @@ def all_profile_variants: | .genesis.pparamsEpoch = timeline::lastKnownEpoch | .genesis.pparamsOverlays = ["v8-preview", "doublebudget"] ) as $costmodel_v8_preview_doubleb + | + ({} + | .genesis.pparamsEpoch = timeline::lastKnownEpoch + | .genesis.pparamsOverlays = ["mimic-ops"] + ) as $mimic_ops_params ## ### Definition vocabulary: node config variants ## @@ -442,9 +456,9 @@ def all_profile_variants: { scenario: "fixed-loaded" }) as $scenario_fixed_loaded | - ($model_timescale * $cw_perf_tps_saturation_value * + ($model_timescale * $nomad_perf_tps_saturation_value * { scenario: "fixed-loaded" - }) as $scenario_cw_perf + }) as $scenario_nomad_perf | ($model_timescale * $model_tps_saturation_value * { scenario: "fixed-loaded" @@ -489,7 +503,7 @@ def all_profile_variants: , desc: "Small dataset, honest 15 epochs duration" }) as $plutuscall_base | - ($scenario_cw_perf * $compose_fiftytwo * $dataset_oct2021 * $for_7ep * + ($scenario_nomad_perf * $compose_fiftytwo * $dataset_oct2021 * $for_7ep * { node: { shutdown_on_slot_synced: 56000 } @@ -504,7 +518,7 @@ def all_profile_variants: , max_block_size: 88000 } , desc: "AWS c5-2xlarge cluster dataset, 7 epochs" - }) as $cw_perf_base + }) as $nomad_perf_base | ($scenario_model * $quadruplet * $dataset_current * $for_7ep * { node: @@ -596,13 +610,13 @@ def all_profile_variants: , { name: "default" , desc: "Default, as per nix/workbench/profile/prof0-defaults.jq" } - , $cardano_world_qa * - { name: "default-cw-qa" - , desc: "Default, but on Cardano World QA" + , $nomad_cardano_world_qa * + { name: "default-nomadcwqa" + , desc: "Default on Cardano World QA" } - , $cardano_world_perf * - { name: "default-cw-perf" - , desc: "Default, but on Cardano World perf" + , $nomad_perf_torus * + { name: "default-nomadperf" + , desc: "Default on P&T exclusive cluster" } , $plutus_base * $costmodel_v8_preview * $plutus_loop_counter * { name: "plutus" @@ -620,6 +634,14 @@ def all_profile_variants: { name: "oldtracing" , desc: "Default in legacy tracing mode" } + , $nomad_cardano_world_qa * $old_tracing * + { name: "oldtracing-nomadcwqa" + , desc: "Default in legacy tracing mode on Cardano World QA" + } + , $nomad_perf_torus * $old_tracing * + { name: "oldtracing-nomadperf" + , desc: "Default in legacy tracing mode on P&T exclusive cluster" + } , $scenario_idle * { name: "idle" , desc: "Idle scenario: start nodes & detach from tty; no cluster termination" @@ -662,13 +684,21 @@ def all_profile_variants: , $citest_base * $with_rtview * { name: "ci-test-rtview" } - , $citest_base * $cardano_world_qa * - { name: "ci-test-cw-qa" - , desc: "ci-test, but on Cardano World QA" + , $citest_base * $nomad_cardano_world_qa * + { name: "ci-test-nomadcwqa" + , desc: "ci-test on Cardano World QA" + } + , $citest_base * $nomad_cardano_world_qa * $old_tracing * + { name: "ci-test-oldtracing-nomadcwqa" + , desc: "ci-test in 
legacy tracing mode on Cardano World QA" + } + , $citest_base * $nomad_perf_torus * + { name: "ci-test-nomadperf" + , desc: "ci-test on P&T exclusive cluster" } - , $citest_base * $cardano_world_perf * - { name: "ci-test-cw-perf" - , desc: "ci-test, but on Cardano World perf" + , $citest_base * $nomad_perf_torus * $old_tracing * + { name: "ci-test-oldtracing-nomadperf" + , desc: "ci-test in legacy tracing mode on P&T exclusive cluster" } ## CI variants: bench duration, 15 blocks @@ -693,13 +723,21 @@ def all_profile_variants: , $cibench_base * $with_rtview * { name: "ci-bench-rtview" } - , $cibench_base * $cardano_world_qa * - { name: "ci-bench-cw-qa" - , desc: "ci-bench but on Cardano World QA" + , $cibench_base * $nomad_cardano_world_qa * + { name: "ci-bench-nomadcwqa" + , desc: "ci-bench on Cardano World QA" } - , $cibench_base * $cardano_world_perf * - { name: "ci-bench-cw-perf" - , desc: "ci-bench but on Cardano World perf" + , $cibench_base * $nomad_cardano_world_qa * $old_tracing * + { name: "ci-bench-oldtracing-nomadcwqa" + , desc: "ci-bench in legacy tracing mode on Cardano World QA" + } + , $cibench_base * $nomad_perf_torus * + { name: "ci-bench-nomadperf" + , desc: "ci-bench on P&T exclusive cluster" + } + , $cibench_base * $nomad_perf_torus * $old_tracing * + { name: "ci-bench-oldtracing-nomadperf" + , desc: "ci-bench in legacy tracing mode on P&T exclusive cluster" } ## CI variants: test duration, 3 blocks, dense10 @@ -755,9 +793,12 @@ def all_profile_variants: { name: "plutuscall-secp-schnorr-double" } -## Cardano World QA cluster: 52 nodes, 3 regions, value variant - , $cw_perf_base * $cardano_world_perf * $costmodel_v8_preview * - { name: "cw-perf-value" +## P&T Nomad cluster: 52 nodes, 3 regions, value variant + , $nomad_perf_base * $nomad_perf_dense * $costmodel_v8_preview * + { name: "value-nomadperf" + } + , $nomad_perf_base * $nomad_perf_dense * $costmodel_v8_preview * $old_tracing * + { name: "value-oldtracing-nomadperf" } ## Model value variant: 7 epochs (128GB RAM needed; 16GB for testing locally) diff --git a/nix/workbench/profile/prof2-pparams.jq b/nix/workbench/profile/prof2-pparams.jq index da7fbbbf24a..abda8805553 100644 --- a/nix/workbench/profile/prof2-pparams.jq +++ b/nix/workbench/profile/prof2-pparams.jq @@ -4,6 +4,7 @@ import "epoch-timeline" as timeline; import "delta-blockbudget" as blockbudget; import "delta-v8-preview" as v8preview; +import "delta-mimicops" as mimicops; def filterMapPParams(flt; map): timeline::epochs @@ -26,6 +27,7 @@ def overlays: { "doublebudget": blockbudget::delta_doublebudget , "stepshalf": blockbudget::delta_stepshalf , "v8-preview": v8preview::delta + , "mimic-ops": mimicops::delta }; def pParamsWithOverlays(epoch; overlay_names): diff --git a/nix/workbench/scenario.sh b/nix/workbench/scenario.sh index b042f2e036a..90744743511 100644 --- a/nix/workbench/scenario.sh +++ b/nix/workbench/scenario.sh @@ -127,6 +127,7 @@ scenario_exit_trap() { backend stop-all "$__scenario_exit_trap_dir" backend fetch-logs "$__scenario_exit_trap_dir" backend stop-cluster "$__scenario_exit_trap_dir" + msg "scenario: $(with_color yellow exit trap finished)" } scenario_setup_exit_trap() { diff --git a/nix/workbench/service/healthcheck.nix b/nix/workbench/service/healthcheck.nix index 7b7e2a0eeee..71916342f9b 100644 --- a/nix/workbench/service/healthcheck.nix +++ b/nix/workbench/service/healthcheck.nix @@ -3,6 +3,7 @@ , backend , profile , nodeSpecs +, eventlogged ? 
true }: with pkgs.lib; @@ -27,6 +28,7 @@ let value = '' #!${bashInteractive}/bin/sh + ###################################################################### # Set script globals ################################################# ###################################################################### @@ -59,8 +61,8 @@ let ${coreutils}/bin/echo "- active_slots_coeff: ''${active_slots_coeff}" ${coreutils}/bin/echo "- active_slots: ''${active_slots}" - # Fetch node names (Including "explorer" nodes) - ############################################### + # Fetch all node names (Including "explorer" nodes) + ################################################### node_specs_nodes=$(${jq}/bin/jq --raw-output \ "keys | join (\" \")" \ @@ -74,11 +76,11 @@ let ${coreutils}/bin/echo "- Nodes: [''${node_specs_nodes[*]}]" ${coreutils}/bin/echo "- Pools: ''${node_specs_pools}" - # Look for available nodes and allocate healthcheck - ################################################### + # Look for deployed nodes and allocate healthcheck + ################################################## nodes=() - now=$(${coreutils}/bin/date +%s) + started_time=$(${coreutils}/bin/date +%s) for node in ''${node_specs_nodes[*]} do if test -d "../''${node}" @@ -86,9 +88,8 @@ let nodes+=("''${node}") # Create healthcheck directory inside available node directories ${coreutils}/bin/mkdir "../''${node}/healthcheck" - # TODO: Store the `+RTS --info` # Save the starting time - ${coreutils}/bin/echo "''${now}" > "../''${node}/healthcheck/start_time" + ${coreutils}/bin/echo "''${started_time}" > "../''${node}/healthcheck/start_time" fi done ${coreutils}/bin/echo "Found deployed nodes:" @@ -96,6 +97,7 @@ let # Look for the generator ######################## + generator=0 if test -d "../generator" then @@ -106,68 +108,119 @@ let ${coreutils}/bin/echo "Found no deployed generator" fi + ###################################################################### # Main ############################################################### ###################################################################### # The main function, called at the end of the file/script. function healthcheck() { - # Ignore PIPE "errors", mixing 'jq', 'tac', 'grep' and/or 'head' - # will evetually throw a PIPE exception (see jq_node_stdout_last). - trap "${coreutils}/bin/echo \"trap PIPE\" >&2" PIPE msg "Started!" - # Do a one and only networking/latency test! + # Do a one and only connectivity/latency test! + ############################################## + for node in ''${nodes[*]} do - latency_topology_producers "''${node}" + + # TODO: A couple of simple pings + # latency_topology_producers "''${node}" + + # Cardano cluster connectivity (cardano-ping) + connectivity_topology_producers "''${node}" + + # Store the `+RTS --info` + ${if eventlogged + then pkgs.cardanoNodePackages.cardano-node.passthru.eventlogged + else pkgs.cardanoNodePackages.cardano-node + }/bin/cardano-node +RTS --info > "../''${node}/healthcheck/rts.info" + done - # Start the healthcheck infinite loop - while true - do + # Start individual nodes' healthchecks + ###################################### - # First available nodes - for node in ''${nodes[*]} + # Check that all available nodes are synced and past slot zero! + for node in ''${nodes[*]} + do + # Returns false if not synced and true when synced. + # Will exit with an error after a defined time has passed. + while ! 
healthcheck_node_synced "''${node}" do - healthcheck_node "''${node}" + ${coreutils}/bin/sleep 1 done + msg "Node "\"''${node}\"" is now synced!" + done - # Then generator if available - if test "''${generator}" != "0" - then - healthcheck_generator - fi + # Ignore PIPE "errors", mixing 'jq', 'tac', 'grep' and/or 'head' + # will eventually throw a PIPE exception (see jq_node_stdout_last). + trap "${coreutils}/bin/echo \"trap PIPE\" >&2" PIPE - if test "''${#nodes[@]}" = "1" - then - # This healthcheck run is monitoring only one node - # This is the case for all Nomad runs, either local or cloud - ${coreutils}/bin/sleep 10 - else - # This healthcheck run is monitoring many nodes - # Local/supervisord uses one healthcheck for the entire cluster - ${coreutils}/bin/sleep 1 - fi + # This is an "explorer" node (only one node and generator). + # If not an explorer node we don't keep unwanted stuff running! + if test "''${#nodes[@]}" = "1" && test "''${nodes[0]}" = "explorer" && test "''${generator}" != "0" + then + # Start a healthcheck infinite loop + while true + do - done + # TODO: An array from node-specs.json with nodes like + # '"kind": "explorer"' and '"isProducer": false' + + for node in ''${nodes[*]} + do + # TODO: Check forges? + # healthcheck_node_forge "''${node}" + # Checks that blocks are being transmitted + healthcheck_node_block "''${node}" + # TODO: Right now there are no traces with a "txIds" to check. + # healthcheck_node_txs "''${node}" + done + + # Then generator if available + if test "''${generator}" != "0" + then + healthcheck_generator + fi + + if test "''${#nodes[@]}" = "1" + then + # This healthcheck run is monitoring only one node + # This is the case for all Nomad runs, either local or cloud + ${coreutils}/bin/sleep 10 + else + # This healthcheck run is monitoring many nodes + # Local/supervisord uses one healthcheck for the entire cluster + ${coreutils}/bin/sleep 1 + fi + + done + else + # Seconds supervisor needs to consider the start successful + ${coreutils}/bin/sleep 5 + msg "Done, bye!" + fi trap - PIPE + } - # Latency ############################################################ + ###################################################################### + # Network ############################################################ ###################################################################### - function latency_topology_producers() { + # TODO: latency_topology_producers "''${node}" + + function connectivity_topology_producers() { local node=$1 - msg "Latencies using 'cardano-cli ping' of \"''${node}\"'s Producers" + msg "Connectivity using 'cardano-cli ping' of \"''${node}\"'s Producers" local topology_path="../''${node}/topology.json" local keys=$(${jq}/bin/jq --raw-output '.Producers | keys | join (" ")' "''${topology_path}") for key in ''${keys[*]} do local host=$(${jq}/bin/jq --raw-output ".Producers[''${key}].addr" "''${topology_path}") local port=$(${jq}/bin/jq --raw-output ".Producers[''${key}].port" "''${topology_path}") - msg "'cardano-cli ping' of \"''${host}:''${port}\"" + msg "Executing 'cardano-cli ping' to \"''${host}:''${port}\"" # If the ping fails the whole script must fail!
${cardano-cli}/bin/cardano-cli ping \ --magic "''${network_magic}" \ @@ -178,10 +231,11 @@ let done } + ###################################################################### # Node ############################################################### ###################################################################### - function healthcheck_node() { + function healthcheck_node_synced() { local node=$1 # Checks if the node has not exited with errors if assert_program_running "''${node}" @@ -206,6 +260,8 @@ let if test $((now - start_time)) -ge 180 then exit_healthcheck "''${node}: More than 3m waiting for slot 0" + else + false fi else # The node is now synced! #################################### @@ -215,9 +271,7 @@ let else # The node was already flagged as synced! ###################### ################################################################ - # TODO: healthcheck_node_forge "''${node}" - healthcheck_node_block "''${node}" - healthcheck_node_txs "''${node}" + true fi fi } @@ -403,12 +457,15 @@ let fi } + ###################################################################### # Helper/auxiliary functions! ######################################## ###################################################################### - # The "at" time has format "2023-05-16 19:57:19.0231Z" and I can't - # parse it using any standard `date` format so I'm stripping the - # milliseconds part and converting to Unix time (Integer). + # The "at" time has format "2023-05-16 19:57:19.0231Z" for the new + # tracing system and "2023-10-04T21:06:21.03Z" for the old tracing + # system and I can't parse it using any standard `date` format so I'm + # stripping the milliseconds part and converting to Unix time + # (Integer). function msg_unix_time() { local msg=$1 echo "''${msg}" \ @@ -417,7 +474,10 @@ let ' .at[:20] | - strptime("%Y-%m-%d %H:%M:%S.") + if .[10:11] == "T" + then (. | strptime("%Y-%m-%dT%H:%M:%S.")) + else (. | strptime("%Y-%m-%d %H:%M:%S.")) + end | mktime ' @@ -441,10 +501,10 @@ let # # Also, the stdout of the node starts with some text that's echoed by # node's start.sh script that is not valid JSON, so we `grep` for - # "{"at": ... }". We still need to check the exit code of these - # functions just in case because if it fails a query may have failed - # and the healthcheck should fail. This is tricky because when 'jq' - # finishes and exists `tac` or `grep` may throw the following error: + # "{"... }". We still need to check the exit code of these functions + # just in case because if it fails a query may have failed and the + # healthcheck should fail. 
This is tricky because when 'jq' finishes + and exits, `tac` or `grep` may throw the following error: # "writing output failed: Broken pipe" # # Finally filter for "null" inputs that are the output of 'nth(0;...)' @@ -460,7 +520,7 @@ let ans="$( \ { ${coreutils}/bin/tac "''${stdout_path}" 2>/dev/null; } \ | \ - { ${grep}/bin/grep -E "^{\"at\":.*}$" 2>/dev/null; } \ + { ${grep}/bin/grep -E "^{\".*}$" 2>/dev/null; } \ | \ ${jq}/bin/jq \ --compact-output \ @@ -583,62 +643,123 @@ let fi } - # Block sent: - # { - # "at": "2023-05-17 16:00:57.0222Z", - # "ns": "BlockFetch.Server.SendBlock", - # "data": { - # "block": "dde....414", - # "kind": "BlockFetchServer" - # }, - # "sev": "Info", - # "thread": "67", - # "host": "localhost" - # } - # Block received: - # { - # "at": "2023-05-17 16:00:57.0246Z", - # "ns": "BlockFetch.Remote.Receive.Block", - # "data": { - # "kind": "Recv", - # "msg": { - # "agency": "ServerAgency TokStreaming", - # "blockHash": "dde....414", - # "blockSize": 64334, - # "kind": "MsgBlock", - # "txIds": [ - # "60e....6ec", - # ... - # "95d....165" - # ] - # }, - # "peer": { - # "connectionId": "127.0.0.1:37117 127.0.0.1:30000" - # } - # }, - # "sev": "Info", - # "thread": "77", - # "host": "localhost" - # } + ##################### + # Old Tracing System: + ##################### + ## Block received: + ## { + ## "app": [], + ## "at": "2023-10-04T21:06:21.03Z", + ## "data": { + ## "block": "944....10a", + ## "delay": 0.029996935, + ## "kind": "CompletedBlockFetch", + ## "peer": { + ## "local": { + ## "addr": "10.0.0.1", + ## "port": "41179" + ## }, + ## "remote": { + ## "addr": "10.0.0.52", + ## "port": "30051" + ## } + ## }, + ## "size": 863 + ## }, + ## "env": "8.2.1:00000", + ## "host": "client-e", + ## "loc": null, + ## "msg": "", + ## "ns": [ + ## "cardano.node.BlockFetchClient" + ## ], + ## "pid": "89", + ## "sev": "Info", + ## "thread": "245" + ## } + ## Block sent: + ## { + ## "app": [], + ## "at": "2023-10-04T21:06:08.16Z", + ## "data": { + ## "block": "b30....c60", + ## "kind": "TraceBlockFetchServerSendBlock", + ## "peer": { + ## "local": { + ## "addr": "10.0.0.1", + ## "port": "30000" + ## }, + ## "remote": { + ## "addr": "10.0.0.52", + ## "port": "37949" + ## } + ## } + ## }, + ## "env": "8.2.1:00000", + ## "host": "client-e", + ## "loc": null, + ## "msg": "", + ## "ns": [ + ## "cardano.node.BlockFetchServer" + ## ], + ## "pid": "89", + ## "sev": "Info", + ## "thread": "202" + ## } + ##################### + # New Tracing System: + ##################### + ## Block received: + ## { + ## "at": "2023-11-02 18:49:00.2802Z", + ## "ns": "BlockFetch.Client.CompletedBlockFetch", + ## "data": { + ## "block": "18c....33d", + ## "delay": 0.279370276, + ## "kind": "CompletedBlockFetch", + ## "peer": { + ## "connectionId": "10.0.0.3:42527 10.0.0.1:30001" + ## }, + ## "size": 13484 + ## }, + ## "sev": "Info", + ## "thread": "85", + ## "host": "host.compute.internal" + ## } + ## Block sent: + ## { + ## "at": "2023-11-02 18:49:00.1283Z", + ## "ns": "BlockFetch.Server.SendBlock", + ## "data": { + ## "block": "18c....33d", + ## "kind": "BlockFetchServer", + ## "peer": { + ## "connectionId": "0.0.0.0:30001 10.0.0.3:42527" + ## } + ## }, + ## "sev": "Info", + ## "thread": "84", + ## "host": "host.compute.internal" + ## } function last_block_transmitted() { local node=$1 if ! jq_node_stdout_last "''${node}" \ ' ( - (.ns == "BlockFetch.Server.SendBlock") + (.data.block != null) and + (.data.kind? == "CompletedBlockFetch") + ) or ( (.data.block? != null) and - (.data.kind?
== "BlockFetchServer") + (.data.kind? == "TraceBlockFetchServerSendBlock") ) or ( - (.ns == "BlockFetch.Remote.Receive.Block") - and - (.data.kind? == "Recv") + (.ns == "BlockFetch.Server.SendBlock") and - (.data.msg?.blockHash != null) + (.data.block != null) and - (.data.msg?.kind == "MsgBlock") + (.data.kind? == "BlockFetchServer") ) ' then @@ -646,35 +767,37 @@ let fi } - function last_block_sent() { + function last_block_received() { local node=$1 if ! jq_node_stdout_last "''${node}" \ ' - (.ns == "BlockFetch.Server.SendBlock") - and - (.data.block? != null) + (.data.block != null) and - (.data.kind? == "BlockFetchServer") + (.data.kind? == "CompletedBlockFetch") ' then - exit_22 "jq error: last_block_sent: ''${node}" + exit_22 "jq error: last_block_received: ''${node}" fi } - function last_block_received() { + function last_block_sent() { local node=$1 if ! jq_node_stdout_last "''${node}" \ ' - (.ns == "BlockFetch.Remote.Receive.Block") - and - (.data.kind? == "Recv") - and - (.data.msg?.blockHash != null) - and - (.data.msg?.kind == "MsgBlock") + ( + (.data.block? != null) + and + (.data.kind? == "TraceBlockFetchServerSendBlock") + ) or ( + (.ns == "BlockFetch.Server.SendBlock") + and + (.data.block != null) + and + (.data.kind? == "BlockFetchServer") + ) ' then - exit_22 "jq error: last_block_received: ''${node}" + exit_22 "jq error: last_block_sent: ''${node}" fi } @@ -706,8 +829,6 @@ let local node=$1 if ! jq_node_stdout_last "''${node}" \ ' - (.ns == "BlockFetch.Remote.Receive.Block") - and (.data.kind? == "Recv") and (.data.msg?.blockHash != null) diff --git a/nix/workbench/topology/bench-dense-52.csv b/nix/workbench/topology/bench-dense-52.csv new file mode 100644 index 00000000000..b54dcfd2a57 --- /dev/null +++ b/nix/workbench/topology/bench-dense-52.csv @@ -0,0 +1,312 @@ +node-0,eu,node-1,ap +node-0,eu,node-2,us +node-0,eu,node-3,eu +node-0,eu,node-18,eu +node-0,eu,node-36,eu +node-0,eu,node-51,eu +node-1,ap,node-0,eu +node-1,ap,node-2,us +node-1,ap,node-4,ap +node-1,ap,node-16,ap +node-1,ap,node-34,ap +node-1,ap,node-49,ap +node-2,us,node-0,eu +node-2,us,node-1,ap +node-2,us,node-5,us +node-2,us,node-17,us +node-2,us,node-35,us +node-2,us,node-50,us +node-3,eu,node-0,eu +node-3,eu,node-4,ap +node-3,eu,node-5,us +node-3,eu,node-6,eu +node-3,eu,node-21,eu +node-3,eu,node-39,eu +node-4,ap,node-1,ap +node-4,ap,node-3,eu +node-4,ap,node-5,us +node-4,ap,node-7,ap +node-4,ap,node-19,ap +node-4,ap,node-37,ap +node-5,us,node-2,us +node-5,us,node-3,eu +node-5,us,node-4,ap +node-5,us,node-8,us +node-5,us,node-20,us +node-5,us,node-38,us +node-6,eu,node-3,eu +node-6,eu,node-7,ap +node-6,eu,node-8,us +node-6,eu,node-9,eu +node-6,eu,node-24,eu +node-6,eu,node-42,eu +node-7,ap,node-4,ap +node-7,ap,node-6,eu +node-7,ap,node-8,us +node-7,ap,node-10,ap +node-7,ap,node-22,ap +node-7,ap,node-40,ap +node-8,us,node-5,us +node-8,us,node-6,eu +node-8,us,node-7,ap +node-8,us,node-11,us +node-8,us,node-23,us +node-8,us,node-41,us +node-9,eu,node-6,eu +node-9,eu,node-10,ap +node-9,eu,node-11,us +node-9,eu,node-12,eu +node-9,eu,node-27,eu +node-9,eu,node-45,eu +node-10,ap,node-7,ap +node-10,ap,node-9,eu +node-10,ap,node-11,us +node-10,ap,node-13,ap +node-10,ap,node-25,ap +node-10,ap,node-43,ap +node-11,us,node-8,us +node-11,us,node-9,eu +node-11,us,node-10,ap +node-11,us,node-14,us +node-11,us,node-26,us +node-11,us,node-44,us +node-12,eu,node-9,eu +node-12,eu,node-13,ap +node-12,eu,node-14,us +node-12,eu,node-15,eu +node-12,eu,node-30,eu +node-12,eu,node-48,eu +node-13,ap,node-10,ap 
+node-13,ap,node-12,eu +node-13,ap,node-14,us +node-13,ap,node-16,ap +node-13,ap,node-28,ap +node-13,ap,node-46,ap +node-14,us,node-11,us +node-14,us,node-12,eu +node-14,us,node-13,ap +node-14,us,node-17,us +node-14,us,node-29,us +node-14,us,node-47,us +node-15,eu,node-12,eu +node-15,eu,node-16,ap +node-15,eu,node-17,us +node-15,eu,node-18,eu +node-15,eu,node-33,eu +node-15,eu,node-51,eu +node-16,ap,node-13,ap +node-16,ap,node-15,eu +node-16,ap,node-17,us +node-16,ap,node-19,ap +node-16,ap,node-31,ap +node-16,ap,node-49,ap +node-17,us,node-14,us +node-17,us,node-15,eu +node-17,us,node-16,ap +node-17,us,node-20,us +node-17,us,node-32,us +node-17,us,node-50,us +node-18,eu,node-0,eu +node-18,eu,node-15,eu +node-18,eu,node-19,ap +node-18,eu,node-20,us +node-18,eu,node-21,eu +node-18,eu,node-36,eu +node-19,ap,node-1,ap +node-19,ap,node-16,ap +node-19,ap,node-18,eu +node-19,ap,node-20,us +node-19,ap,node-22,ap +node-19,ap,node-34,ap +node-20,us,node-2,us +node-20,us,node-17,us +node-20,us,node-18,eu +node-20,us,node-19,ap +node-20,us,node-23,us +node-20,us,node-35,us +node-21,eu,node-3,eu +node-21,eu,node-18,eu +node-21,eu,node-22,ap +node-21,eu,node-23,us +node-21,eu,node-24,eu +node-21,eu,node-39,eu +node-22,ap,node-4,ap +node-22,ap,node-19,ap +node-22,ap,node-21,eu +node-22,ap,node-23,us +node-22,ap,node-25,ap +node-22,ap,node-37,ap +node-23,us,node-5,us +node-23,us,node-20,us +node-23,us,node-21,eu +node-23,us,node-22,ap +node-23,us,node-26,us +node-23,us,node-38,us +node-24,eu,node-6,eu +node-24,eu,node-21,eu +node-24,eu,node-25,ap +node-24,eu,node-26,us +node-24,eu,node-27,eu +node-24,eu,node-42,eu +node-25,ap,node-7,ap +node-25,ap,node-22,ap +node-25,ap,node-24,eu +node-25,ap,node-26,us +node-25,ap,node-28,ap +node-25,ap,node-40,ap +node-26,us,node-8,us +node-26,us,node-23,us +node-26,us,node-24,eu +node-26,us,node-25,ap +node-26,us,node-29,us +node-26,us,node-41,us +node-27,eu,node-9,eu +node-27,eu,node-24,eu +node-27,eu,node-28,ap +node-27,eu,node-29,us +node-27,eu,node-30,eu +node-27,eu,node-45,eu +node-28,ap,node-10,ap +node-28,ap,node-25,ap +node-28,ap,node-27,eu +node-28,ap,node-29,us +node-28,ap,node-31,ap +node-28,ap,node-43,ap +node-29,us,node-11,us +node-29,us,node-26,us +node-29,us,node-27,eu +node-29,us,node-28,ap +node-29,us,node-32,us +node-29,us,node-44,us +node-30,eu,node-12,eu +node-30,eu,node-27,eu +node-30,eu,node-31,ap +node-30,eu,node-32,us +node-30,eu,node-33,eu +node-30,eu,node-48,eu +node-31,ap,node-13,ap +node-31,ap,node-28,ap +node-31,ap,node-30,eu +node-31,ap,node-32,us +node-31,ap,node-34,ap +node-31,ap,node-46,ap +node-32,us,node-14,us +node-32,us,node-29,us +node-32,us,node-30,eu +node-32,us,node-31,ap +node-32,us,node-35,us +node-32,us,node-47,us +node-33,eu,node-15,eu +node-33,eu,node-30,eu +node-33,eu,node-34,ap +node-33,eu,node-35,us +node-33,eu,node-36,eu +node-33,eu,node-51,eu +node-34,ap,node-16,ap +node-34,ap,node-31,ap +node-34,ap,node-33,eu +node-34,ap,node-35,us +node-34,ap,node-37,ap +node-34,ap,node-49,ap +node-35,us,node-17,us +node-35,us,node-32,us +node-35,us,node-33,eu +node-35,us,node-34,ap +node-35,us,node-38,us +node-35,us,node-50,us +node-36,eu,node-0,eu +node-36,eu,node-18,eu +node-36,eu,node-33,eu +node-36,eu,node-37,ap +node-36,eu,node-38,us +node-36,eu,node-39,eu +node-37,ap,node-1,ap +node-37,ap,node-19,ap +node-37,ap,node-34,ap +node-37,ap,node-36,eu +node-37,ap,node-38,us +node-37,ap,node-40,ap +node-38,us,node-2,us +node-38,us,node-20,us +node-38,us,node-35,us +node-38,us,node-36,eu +node-38,us,node-37,ap +node-38,us,node-41,us 
+node-39,eu,node-3,eu +node-39,eu,node-21,eu +node-39,eu,node-36,eu +node-39,eu,node-40,ap +node-39,eu,node-41,us +node-39,eu,node-42,eu +node-40,ap,node-4,ap +node-40,ap,node-22,ap +node-40,ap,node-37,ap +node-40,ap,node-39,eu +node-40,ap,node-41,us +node-40,ap,node-43,ap +node-41,us,node-5,us +node-41,us,node-23,us +node-41,us,node-38,us +node-41,us,node-39,eu +node-41,us,node-40,ap +node-41,us,node-44,us +node-42,eu,node-6,eu +node-42,eu,node-24,eu +node-42,eu,node-39,eu +node-42,eu,node-43,ap +node-42,eu,node-44,us +node-42,eu,node-45,eu +node-43,ap,node-7,ap +node-43,ap,node-25,ap +node-43,ap,node-40,ap +node-43,ap,node-42,eu +node-43,ap,node-44,us +node-43,ap,node-46,ap +node-44,us,node-8,us +node-44,us,node-26,us +node-44,us,node-41,us +node-44,us,node-42,eu +node-44,us,node-43,ap +node-44,us,node-47,us +node-45,eu,node-9,eu +node-45,eu,node-27,eu +node-45,eu,node-42,eu +node-45,eu,node-46,ap +node-45,eu,node-47,us +node-45,eu,node-48,eu +node-46,ap,node-10,ap +node-46,ap,node-28,ap +node-46,ap,node-43,ap +node-46,ap,node-45,eu +node-46,ap,node-47,us +node-46,ap,node-49,ap +node-47,us,node-11,us +node-47,us,node-29,us +node-47,us,node-44,us +node-47,us,node-45,eu +node-47,us,node-46,ap +node-47,us,node-50,us +node-48,eu,node-12,eu +node-48,eu,node-30,eu +node-48,eu,node-45,eu +node-48,eu,node-49,ap +node-48,eu,node-50,us +node-48,eu,node-51,eu +node-49,ap,node-1,ap +node-49,ap,node-13,ap +node-49,ap,node-31,ap +node-49,ap,node-46,ap +node-49,ap,node-48,eu +node-49,ap,node-50,us +node-50,us,node-2,us +node-50,us,node-14,us +node-50,us,node-32,us +node-50,us,node-47,us +node-50,us,node-48,eu +node-50,us,node-49,ap +node-51,eu,node-0,eu +node-51,eu,node-1,ap +node-51,eu,node-2,us +node-51,eu,node-15,eu +node-51,eu,node-33,eu +node-51,eu,node-48,eu diff --git a/nix/workbench/topology/bench-dense-52.json b/nix/workbench/topology/bench-dense-52.json new file mode 100644 index 00000000000..e87785f6ec5 --- /dev/null +++ b/nix/workbench/topology/bench-dense-52.json @@ -0,0 +1 @@ 
+{"coreNodes":[{"name":"node-0","nodeId":0,"org":"IOHK","pools":null,"producers":["node-1","node-2","node-3","node-51","node-18","node-36"],"region":"eu-central-1"},{"name":"node-3","nodeId":3,"org":"IOHK","pools":2,"producers":["node-4","node-5","node-6","node-0","node-21","node-39"],"region":"eu-central-1"},{"name":"node-6","nodeId":6,"org":"IOHK","pools":2,"producers":["node-7","node-8","node-9","node-3","node-24","node-42"],"region":"eu-central-1"},{"name":"node-9","nodeId":9,"org":"IOHK","pools":2,"producers":["node-10","node-11","node-12","node-6","node-27","node-45"],"region":"eu-central-1"},{"name":"node-12","nodeId":12,"org":"IOHK","pools":2,"producers":["node-13","node-14","node-15","node-9","node-30","node-48"],"region":"eu-central-1"},{"name":"node-15","nodeId":15,"org":"IOHK","pools":2,"producers":["node-16","node-17","node-18","node-12","node-33","node-51"],"region":"eu-central-1"},{"name":"node-18","nodeId":18,"org":"IOHK","pools":2,"producers":["node-19","node-20","node-21","node-15","node-36","node-0"],"region":"eu-central-1"},{"name":"node-21","nodeId":21,"org":"IOHK","pools":2,"producers":["node-22","node-23","node-24","node-18","node-39","node-3"],"region":"eu-central-1"},{"name":"node-24","nodeId":24,"org":"IOHK","pools":2,"producers":["node-25","node-26","node-27","node-21","node-42","node-6"],"region":"eu-central-1"},{"name":"node-27","nodeId":27,"org":"IOHK","pools":2,"producers":["node-28","node-29","node-30","node-24","node-45","node-9"],"region":"eu-central-1"},{"name":"node-30","nodeId":30,"org":"IOHK","pools":2,"producers":["node-31","node-32","node-33","node-27","node-48","node-12"],"region":"eu-central-1"},{"name":"node-33","nodeId":33,"org":"IOHK","pools":2,"producers":["node-34","node-35","node-36","node-30","node-51","node-15"],"region":"eu-central-1"},{"name":"node-36","nodeId":36,"org":"IOHK","pools":2,"producers":["node-37","node-38","node-39","node-33","node-0","node-18"],"region":"eu-central-1"},{"name":"node-39","nodeId":39,"org":"IOHK","pools":2,"producers":["node-40","node-41","node-42","node-36","node-3","node-21"],"region":"eu-central-1"},{"name":"node-42","nodeId":42,"org":"IOHK","pools":2,"producers":["node-43","node-44","node-45","node-39","node-6","node-24"],"region":"eu-central-1"},{"name":"node-45","nodeId":45,"org":"IOHK","pools":2,"producers":["node-46","node-47","node-48","node-42","node-9","node-27"],"region":"eu-central-1"},{"name":"node-48","nodeId":48,"org":"IOHK","pools":2,"producers":["node-49","node-50","node-51","node-45","node-12","node-30"],"region":"eu-central-1"},{"name":"node-51","nodeId":51,"org":"IOHK","pools":2,"producers":["node-1","node-2","node-0","node-48","node-15","node-33"],"region":"eu-central-1"},{"name":"node-1","nodeId":1,"org":"IOHK","pools":1,"producers":["node-2","node-0","node-4","node-49","node-16","node-34"],"region":"ap-southeast-2"},{"name":"node-4","nodeId":4,"org":"IOHK","pools":2,"producers":["node-5","node-3","node-7","node-1","node-19","node-37"],"region":"ap-southeast-2"},{"name":"node-7","nodeId":7,"org":"IOHK","pools":2,"producers":["node-8","node-6","node-10","node-4","node-22","node-40"],"region":"ap-southeast-2"},{"name":"node-10","nodeId":10,"org":"IOHK","pools":2,"producers":["node-11","node-9","node-13","node-7","node-25","node-43"],"region":"ap-southeast-2"},{"name":"node-13","nodeId":13,"org":"IOHK","pools":2,"producers":["node-14","node-12","node-16","node-10","node-28","node-46"],"region":"ap-southeast-2"},{"name":"node-16","nodeId":16,"org":"IOHK","pools":2,"producers":["node-17","node
-15","node-19","node-13","node-31","node-49"],"region":"ap-southeast-2"},{"name":"node-19","nodeId":19,"org":"IOHK","pools":2,"producers":["node-20","node-18","node-22","node-16","node-34","node-1"],"region":"ap-southeast-2"},{"name":"node-22","nodeId":22,"org":"IOHK","pools":2,"producers":["node-23","node-21","node-25","node-19","node-37","node-4"],"region":"ap-southeast-2"},{"name":"node-25","nodeId":25,"org":"IOHK","pools":2,"producers":["node-26","node-24","node-28","node-22","node-40","node-7"],"region":"ap-southeast-2"},{"name":"node-28","nodeId":28,"org":"IOHK","pools":2,"producers":["node-29","node-27","node-31","node-25","node-43","node-10"],"region":"ap-southeast-2"},{"name":"node-31","nodeId":31,"org":"IOHK","pools":2,"producers":["node-32","node-30","node-34","node-28","node-46","node-13"],"region":"ap-southeast-2"},{"name":"node-34","nodeId":34,"org":"IOHK","pools":2,"producers":["node-35","node-33","node-37","node-31","node-49","node-16"],"region":"ap-southeast-2"},{"name":"node-37","nodeId":37,"org":"IOHK","pools":2,"producers":["node-38","node-36","node-40","node-34","node-1","node-19"],"region":"ap-southeast-2"},{"name":"node-40","nodeId":40,"org":"IOHK","pools":2,"producers":["node-41","node-39","node-43","node-37","node-4","node-22"],"region":"ap-southeast-2"},{"name":"node-43","nodeId":43,"org":"IOHK","pools":2,"producers":["node-44","node-42","node-46","node-40","node-7","node-25"],"region":"ap-southeast-2"},{"name":"node-46","nodeId":46,"org":"IOHK","pools":2,"producers":["node-47","node-45","node-49","node-43","node-10","node-28"],"region":"ap-southeast-2"},{"name":"node-49","nodeId":49,"org":"IOHK","pools":2,"producers":["node-50","node-48","node-1","node-46","node-13","node-31"],"region":"ap-southeast-2"},{"name":"node-2","nodeId":2,"org":"IOHK","pools":2,"producers":["node-0","node-1","node-5","node-50","node-17","node-35"],"region":"us-east-1"},{"name":"node-5","nodeId":5,"org":"IOHK","pools":2,"producers":["node-3","node-4","node-8","node-2","node-20","node-38"],"region":"us-east-1"},{"name":"node-8","nodeId":8,"org":"IOHK","pools":2,"producers":["node-6","node-7","node-11","node-5","node-23","node-41"],"region":"us-east-1"},{"name":"node-11","nodeId":11,"org":"IOHK","pools":2,"producers":["node-9","node-10","node-14","node-8","node-26","node-44"],"region":"us-east-1"},{"name":"node-14","nodeId":14,"org":"IOHK","pools":2,"producers":["node-12","node-13","node-17","node-11","node-29","node-47"],"region":"us-east-1"},{"name":"node-17","nodeId":17,"org":"IOHK","pools":2,"producers":["node-15","node-16","node-20","node-14","node-32","node-50"],"region":"us-east-1"},{"name":"node-20","nodeId":20,"org":"IOHK","pools":2,"producers":["node-18","node-19","node-23","node-17","node-35","node-2"],"region":"us-east-1"},{"name":"node-23","nodeId":23,"org":"IOHK","pools":2,"producers":["node-21","node-22","node-26","node-20","node-38","node-5"],"region":"us-east-1"},{"name":"node-26","nodeId":26,"org":"IOHK","pools":2,"producers":["node-24","node-25","node-29","node-23","node-41","node-8"],"region":"us-east-1"},{"name":"node-29","nodeId":29,"org":"IOHK","pools":2,"producers":["node-27","node-28","node-32","node-26","node-44","node-11"],"region":"us-east-1"},{"name":"node-32","nodeId":32,"org":"IOHK","pools":2,"producers":["node-30","node-31","node-35","node-29","node-47","node-14"],"region":"us-east-1"},{"name":"node-35","nodeId":35,"org":"IOHK","pools":2,"producers":["node-33","node-34","node-38","node-32","node-50","node-17"],"region":"us-east-1"},{"name":"node-38","nodeId":3
8,"org":"IOHK","pools":2,"producers":["node-36","node-37","node-41","node-35","node-2","node-20"],"region":"us-east-1"},{"name":"node-41","nodeId":41,"org":"IOHK","pools":2,"producers":["node-39","node-40","node-44","node-38","node-5","node-23"],"region":"us-east-1"},{"name":"node-44","nodeId":44,"org":"IOHK","pools":2,"producers":["node-42","node-43","node-47","node-41","node-8","node-26"],"region":"us-east-1"},{"name":"node-47","nodeId":47,"org":"IOHK","pools":2,"producers":["node-45","node-46","node-50","node-44","node-11","node-29"],"region":"us-east-1"},{"name":"node-50","nodeId":50,"org":"IOHK","pools":2,"producers":["node-48","node-49","node-2","node-47","node-14","node-32"],"region":"us-east-1"}],"relayNodes":[{"name":"explorer","nodeId":52,"org":"IOHK","producers":["node-0","node-1","node-2","node-3","node-4","node-5","node-6","node-7","node-8","node-9","node-10","node-11","node-12","node-13","node-14","node-15","node-16","node-17","node-18","node-19","node-20","node-21","node-22","node-23","node-24","node-25","node-26","node-27","node-28","node-29","node-30","node-31","node-32","node-33","node-34","node-35","node-36","node-37","node-38","node-39","node-40","node-41","node-42","node-43","node-44","node-45","node-46","node-47","node-48","node-49","node-50","node-51"],"region":"eu-central-1"}]} diff --git a/nix/workbench/topology/bench-dense-52.nix b/nix/workbench/topology/bench-dense-52.nix new file mode 100644 index 00000000000..8968e86f394 --- /dev/null +++ b/nix/workbench/topology/bench-dense-52.nix @@ -0,0 +1,19 @@ +{ coreNodes = [ { name = "node-0"; nodeId = 0; org = "IOHK"; pools = null; producers = [ "node-1" "node-2" "node-3" "node-51" "node-18" "node-36" ]; region = "eu-central-1"; } { name = "node-3"; nodeId = 3; org = "IOHK"; pools = 2; producers = [ "node-4" "node-5" "node-6" "node-0" "node-21" "node-39" ]; region = "eu-central-1"; } { name = "node-6"; nodeId = 6; org = "IOHK"; pools = 2; producers = [ "node-7" "node-8" "node-9" "node-3" "node-24" "node-42" ]; region = "eu-central-1"; } { name = "node-9"; nodeId = 9; org = "IOHK"; pools = 2; producers = [ "node-10" "node-11" "node-12" "node-6" "node-27" "node-45" ]; region = "eu-central-1"; } { name = "node-12"; nodeId = 12; org = "IOHK"; pools = 2; producers = [ "node-13" "node-14" "node-15" "node-9" "node-30" "node-48" ]; region = "eu-central-1"; } { name = "node-15"; nodeId = 15; org = "IOHK"; pools = 2; producers = [ "node-16" "node-17" "node-18" "node-12" "node-33" "node-51" ]; region = "eu-central-1"; } { name = "node-18"; nodeId = 18; org = "IOHK"; pools = 2; producers = [ "node-19" "node-20" "node-21" "node-15" "node-36" "node-0" ]; region = "eu-central-1"; } { name = "node-21"; nodeId = 21; org = "IOHK"; pools = 2; producers = [ "node-22" "node-23" "node-24" "node-18" "node-39" "node-3" ]; region = "eu-central-1"; } { name = "node-24"; nodeId = 24; org = "IOHK"; pools = 2; producers = [ "node-25" "node-26" "node-27" "node-21" "node-42" "node-6" ]; region = "eu-central-1"; } { name = "node-27"; nodeId = 27; org = "IOHK"; pools = 2; producers = [ "node-28" "node-29" "node-30" "node-24" "node-45" "node-9" ]; region = "eu-central-1"; } { name = "node-30"; nodeId = 30; org = "IOHK"; pools = 2; producers = [ "node-31" "node-32" "node-33" "node-27" "node-48" "node-12" ]; region = "eu-central-1"; } { name = "node-33"; nodeId = 33; org = "IOHK"; pools = 2; producers = [ "node-34" "node-35" "node-36" "node-30" "node-51" "node-15" ]; region = "eu-central-1"; } { name = "node-36"; nodeId = 36; org = "IOHK"; pools = 2; 
producers = [ "node-37" "node-38" "node-39" "node-33" "node-0" "node-18" ]; region = "eu-central-1"; } { name = "node-39"; nodeId = 39; org = "IOHK"; pools = 2; producers = [ "node-40" "node-41" "node-42" "node-36" "node-3" "node-21" ]; region = "eu-central-1"; } { name = "node-42"; nodeId = 42; org = "IOHK"; pools = 2; producers = [ "node-43" "node-44" "node-45" "node-39" "node-6" "node-24" ]; region = "eu-central-1"; } { name = "node-45"; nodeId = 45; org = "IOHK"; pools = 2; producers = [ "node-46" "node-47" "node-48" "node-42" "node-9" "node-27" ]; region = "eu-central-1"; } { name = "node-48"; nodeId = 48; org = "IOHK"; pools = 2; producers = [ "node-49" "node-50" "node-51" "node-45" "node-12" "node-30" ]; region = "eu-central-1"; } { name = "node-51"; nodeId = 51; org = "IOHK"; pools = 2; producers = [ "node-1" "node-2" "node-0" "node-48" "node-15" "node-33" ]; region = "eu-central-1"; } { name = "node-1"; nodeId = 1; org = "IOHK"; pools = 1; producers = [ "node-2" "node-0" "node-4" "node-49" "node-16" "node-34" ]; region = "ap-southeast-2"; } { name = "node-4"; nodeId = 4; org = "IOHK"; pools = 2; producers = [ "node-5" "node-3" "node-7" "node-1" "node-19" "node-37" ]; region = "ap-southeast-2"; } { name = "node-7"; nodeId = 7; org = "IOHK"; pools = 2; producers = [ "node-8" "node-6" "node-10" "node-4" "node-22" "node-40" ]; region = "ap-southeast-2"; } { name = "node-10"; nodeId = 10; org = "IOHK"; pools = 2; producers = [ "node-11" "node-9" "node-13" "node-7" "node-25" "node-43" ]; region = "ap-southeast-2"; } { name = "node-13"; nodeId = 13; org = "IOHK"; pools = 2; producers = [ "node-14" "node-12" "node-16" "node-10" "node-28" "node-46" ]; region = "ap-southeast-2"; } { name = "node-16"; nodeId = 16; org = "IOHK"; pools = 2; producers = [ "node-17" "node-15" "node-19" "node-13" "node-31" "node-49" ]; region = "ap-southeast-2"; } { name = "node-19"; nodeId = 19; org = "IOHK"; pools = 2; producers = [ "node-20" "node-18" "node-22" "node-16" "node-34" "node-1" ]; region = "ap-southeast-2"; } { name = "node-22"; nodeId = 22; org = "IOHK"; pools = 2; producers = [ "node-23" "node-21" "node-25" "node-19" "node-37" "node-4" ]; region = "ap-southeast-2"; } { name = "node-25"; nodeId = 25; org = "IOHK"; pools = 2; producers = [ "node-26" "node-24" "node-28" "node-22" "node-40" "node-7" ]; region = "ap-southeast-2"; } { name = "node-28"; nodeId = 28; org = "IOHK"; pools = 2; producers = [ "node-29" "node-27" "node-31" "node-25" "node-43" "node-10" ]; region = "ap-southeast-2"; } { name = "node-31"; nodeId = 31; org = "IOHK"; pools = 2; producers = [ "node-32" "node-30" "node-34" "node-28" "node-46" "node-13" ]; region = "ap-southeast-2"; } { name = "node-34"; nodeId = 34; org = "IOHK"; pools = 2; producers = [ "node-35" "node-33" "node-37" "node-31" "node-49" "node-16" ]; region = "ap-southeast-2"; } { name = "node-37"; nodeId = 37; org = "IOHK"; pools = 2; producers = [ "node-38" "node-36" "node-40" "node-34" "node-1" "node-19" ]; region = "ap-southeast-2"; } { name = "node-40"; nodeId = 40; org = "IOHK"; pools = 2; producers = [ "node-41" "node-39" "node-43" "node-37" "node-4" "node-22" ]; region = "ap-southeast-2"; } { name = "node-43"; nodeId = 43; org = "IOHK"; pools = 2; producers = [ "node-44" "node-42" "node-46" "node-40" "node-7" "node-25" ]; region = "ap-southeast-2"; } { name = "node-46"; nodeId = 46; org = "IOHK"; pools = 2; producers = [ "node-47" "node-45" "node-49" "node-43" "node-10" "node-28" ]; region = "ap-southeast-2"; } { name = "node-49"; nodeId = 49; org = "IOHK"; 
pools = 2; producers = [ "node-50" "node-48" "node-1" "node-46" "node-13" "node-31" ]; region = "ap-southeast-2"; } { name = "node-2"; nodeId = 2; org = "IOHK"; pools = 2; producers = [ "node-0" "node-1" "node-5" "node-50" "node-17" "node-35" ]; region = "us-east-1"; } { name = "node-5"; nodeId = 5; org = "IOHK"; pools = 2; producers = [ "node-3" "node-4" "node-8" "node-2" "node-20" "node-38" ]; region = "us-east-1"; } { name = "node-8"; nodeId = 8; org = "IOHK"; pools = 2; producers = [ "node-6" "node-7" "node-11" "node-5" "node-23" "node-41" ]; region = "us-east-1"; } { name = "node-11"; nodeId = 11; org = "IOHK"; pools = 2; producers = [ "node-9" "node-10" "node-14" "node-8" "node-26" "node-44" ]; region = "us-east-1"; } { name = "node-14"; nodeId = 14; org = "IOHK"; pools = 2; producers = [ "node-12" "node-13" "node-17" "node-11" "node-29" "node-47" ]; region = "us-east-1"; } { name = "node-17"; nodeId = 17; org = "IOHK"; pools = 2; producers = [ "node-15" "node-16" "node-20" "node-14" "node-32" "node-50" ]; region = "us-east-1"; } { name = "node-20"; nodeId = 20; org = "IOHK"; pools = 2; producers = [ "node-18" "node-19" "node-23" "node-17" "node-35" "node-2" ]; region = "us-east-1"; } { name = "node-23"; nodeId = 23; org = "IOHK"; pools = 2; producers = [ "node-21" "node-22" "node-26" "node-20" "node-38" "node-5" ]; region = "us-east-1"; } { name = "node-26"; nodeId = 26; org = "IOHK"; pools = 2; producers = [ "node-24" "node-25" "node-29" "node-23" "node-41" "node-8" ]; region = "us-east-1"; } { name = "node-29"; nodeId = 29; org = "IOHK"; pools = 2; producers = [ "node-27" "node-28" "node-32" "node-26" "node-44" "node-11" ]; region = "us-east-1"; } { name = "node-32"; nodeId = 32; org = "IOHK"; pools = 2; producers = [ "node-30" "node-31" "node-35" "node-29" "node-47" "node-14" ]; region = "us-east-1"; } { name = "node-35"; nodeId = 35; org = "IOHK"; pools = 2; producers = [ "node-33" "node-34" "node-38" "node-32" "node-50" "node-17" ]; region = "us-east-1"; } { name = "node-38"; nodeId = 38; org = "IOHK"; pools = 2; producers = [ "node-36" "node-37" "node-41" "node-35" "node-2" "node-20" ]; region = "us-east-1"; } { name = "node-41"; nodeId = 41; org = "IOHK"; pools = 2; producers = [ "node-39" "node-40" "node-44" "node-38" "node-5" "node-23" ]; region = "us-east-1"; } { name = "node-44"; nodeId = 44; org = "IOHK"; pools = 2; producers = [ "node-42" "node-43" "node-47" "node-41" "node-8" "node-26" ]; region = "us-east-1"; } { name = "node-47"; nodeId = 47; org = "IOHK"; pools = 2; producers = [ "node-45" "node-46" "node-50" "node-44" "node-11" "node-29" ]; region = "us-east-1"; } { name = "node-50"; nodeId = 50; org = "IOHK"; pools = 2; producers = [ "node-48" "node-49" "node-2" "node-47" "node-14" "node-32" ]; region = "us-east-1"; } ]; + relayNodes = [ + { + name = "explorer"; + nodeId = 52; + org = "IOHK"; + region = "eu-central-1"; + producers = + [ + "node-0" "node-1" "node-2" "node-3" "node-4" "node-5" "node-6" "node-7" "node-8" "node-9" + "node-10" "node-11" "node-12" "node-13" "node-14" "node-15" "node-16" "node-17" "node-18" "node-19" + "node-20" "node-21" "node-22" "node-23" "node-24" "node-25" "node-26" "node-27" "node-28" "node-29" + "node-30" "node-31" "node-32" "node-33" "node-34" "node-35" "node-36" "node-37" "node-38" "node-39" + "node-40" "node-41" "node-42" "node-43" "node-44" "node-45" "node-46" "node-47" "node-48" "node-49" + "node-50" "node-51" + ]; + } + ]; +} diff --git a/nix/workbench/topology/topology.jq b/nix/workbench/topology/topology.jq index 
608956fe391..8d69eac7b81 100644 --- a/nix/workbench/topology/topology.jq +++ b/nix/workbench/topology/topology.jq @@ -1,6 +1,7 @@ def loopback_node_topology_from_nixops_topology($topo; $i): - $topo.coreNodes[$i].producers as $producers - | ($producers | map(ltrimstr("node-") | fromjson)) as $prod_indices + # DON'T ASSUME NODES ARE ORDERED INSIDE THE GLOBAL TOPOLOGY FILE!!!!!!!!!!!! + ($topo.coreNodes | map(select(.nodeId == $i)) | .[0] | .producers) as $producers + | ($producers | map(ltrimstr("node-") | fromjson)) as $prod_indices | { Producers: ( $prod_indices | map diff --git a/nix/workbench/topology/topology.sh b/nix/workbench/topology/topology.sh index 48cd5ddd231..3b25c2e2b05 100644 --- a/nix/workbench/topology/topology.sh +++ b/nix/workbench/topology/topology.sh @@ -47,55 +47,181 @@ case "${op}" in local profile_json=${1:?$usage} local outdir=${2:?$usage} + local topology_name=$(jq '.composition.topology' --raw-output "$profile_json") local n_hosts=$(jq .composition.n_hosts "$profile_json") ## 0. Generate: # - mkdir -p "$outdir" - args=( --topology-output "$outdir"/topology.json - --dot-output "$outdir"/topology.dot - $(jq '.composition.topology - ' --raw-output "$profile_json") - --size $n_hosts - $(jq '.composition.locations - | map("--loc " + .) - | join(" ") - ' --raw-output "$profile_json") - ) - if jqtest .composition.with_explorer $profile_json - then args+=('--with-explorer') + if \ + test "${topology_name}" = "dense" \ + && test "${n_hosts}" = 52 \ + && jqtest .composition.with_explorer "${profile_json}" + then + # If it's the value profile's 52-node dense topology we just copy it as + # it was imported from cardano-ops when switching to Nomad. + # The other difference is that the .dot file is generated here + # instead of by `cardano-topology`. + progress "topology" "Copying cardano-ops 52 nodes + explorer \"dense\" topology" + cp \ + "$(dirname "$(readlink -f "$0")")"/topology/bench-dense-52.json \ + "${outdir}"/topology.json + topology dot "${outdir}"/topology.json > "${outdir}"/topology.dot + else + mkdir -p "$outdir" + args=( --topology-output "$outdir"/topology.json + --dot-output "$outdir"/topology.dot + "$topology_name" + --size $n_hosts + $(jq '.composition.locations + | map("--loc " + .) + | join(" ") + ' --raw-output "$profile_json") + ) + if jqtest .composition.with_explorer $profile_json + then args+=('--with-explorer') + fi + progress "topology" "cardano-topology ${args[*]}" + cardano-topology "${args[@]}" + # Patch the nixops topology with the density information: + # This is only needed here; the dense topology was already imported + # from nixops / cardano-ops. + jq --slurpfile prof "$profile_json" ' + def nixops_topology_set_pool_density($topo; $density): + $topo * + { coreNodes: + ( .coreNodes + | map + ( . * + { pools: + (if .pools == null then 0 else + if .pools == 1 then 1 else + ([$density, 1] | max) end end) + } + ) + ) + }; + + nixops_topology_set_pool_density(.; $prof[0].dense_pool_density) + ' "$outdir"/topology.json | + sponge "$outdir"/topology.json fi - progress "topology" "cardano-topology ${args[*]}" - cardano-topology "${args[@]}" - ## 1. Render PDF: + ## 1. Render GraphViz topology PDF: # neato -s120 -Tpdf \ "$outdir"/topology.dot > "$outdir"/topology.pdf - - ## 2. Patch the nixops topology with the density information: - # - jq --slurpfile prof "$profile_json" ' - def nixops_topology_set_pool_density($topo; $density): - $topo * - { coreNodes: - ( .coreNodes - | map - ( .
* - { pools: - (if .pools == null then 0 else - if .pools == 1 then 1 else - ([$density, 1] | max) end end) - } - ) - ) - }; - - nixops_topology_set_pool_density(.; $prof[0].dense_pool_density) - ' "$outdir"/topology.json | - sponge "$outdir"/topology.json ;; + dot ) + local usage="USAGE: wb topology dot TOPOLOGY-JSON" + local topology_file=${1:?$usage} + + # Top object start + ################## + echo "digraph dense {" + echo " splines=true;" + echo " overlap=false;" + # Add each node to its corresponding location array. + local lo_array=() eu_array=() us_array=() ap_array=() + # Grab all the "i" properties from inside each "node-i" object + # Why "i" and not name? `jq` sorts like: "node-49", "node-5", "node-50" + local nodes_array + nodes_array=$(jq --raw-output '.coreNodes | map(.nodeId) | sort | join (" ")' "${topology_file}") + for node_i in ${nodes_array[*]} + do + # Fetch this node's JSON object description. + local node + node=$(jq -r ".coreNodes | map(select( .nodeId == ${node_i} )) | .[0]" "${topology_file}") + local region + region="$(echo "${node}" | jq -r .region)" + local color + if echo "${region}" | grep --quiet "eu-central-" + then + color="blue" + eu_array+=("${node_i}") + elif echo "${region}" | grep --quiet "us-east-" + then + color="red" + us_array+=("${node_i}") + elif echo "${region}" | grep --quiet "ap-southeast-" + then + color="green" + ap_array+=("${node_i}") + else + color="black" + lo_array+=("${node_i}") + fi + done + # Output a GraphViz "subgraph" for each of the regions. + echo " subgraph eu {" + echo " label = \"EU\";" + echo " cluster=true;" + for node_i in ${eu_array[*]} + do + local color="blue" + echo " \"node-${node_i}\" [fillcolor=${color}, style=filled];" + done + echo " }" + echo " subgraph us {" + echo " label = \"US\";" + echo " cluster=true;" + for node_i in ${us_array[*]} + do + local color="red" + echo " \"node-${node_i}\" [fillcolor=${color}, style=filled];" + done + echo " }" + echo " subgraph ap {" + echo " label = \"AP\";" + echo " cluster=true;" + for node_i in ${ap_array[*]} + do + local color="green" + echo " \"node-${node_i}\" [fillcolor=${color}, style=filled];" + done + echo " }" + # Output each node's connections with its corresponding color. + for node_i in ${eu_array[*]} + do + local node + node=$(jq -r ".coreNodes | map(select( .nodeId == ${node_i} )) | .[0]" "${topology_file}") + local producers_array + producers_array=$(echo "${node}" | jq --raw-output '.producers | sort | join (" ")') + local color="blue" + for producer in ${producers_array[*]} + do + echo " \"node-${node_i}\" -> \"${producer}\" [color=${color}];" + done + done + for node_i in ${us_array[*]} + do + local node + node=$(jq -r ".coreNodes | map(select( .nodeId == ${node_i} )) | .[0]" "${topology_file}") + local producers_array + producers_array=$(echo "${node}" | jq --raw-output '.producers | sort | join (" ")') + local color="red" + for producer in ${producers_array[*]} + do + echo " \"node-${node_i}\" -> \"${producer}\" [color=${color}];" + done + done + for node_i in ${ap_array[*]} + do + local node + node=$(jq -r ".coreNodes | map(select( .nodeId == ${node_i} )) | .[0]" "${topology_file}") + local producers_array + producers_array=$(echo "${node}" | jq --raw-output '.producers | sort | join (" ")') + local color="green" + for producer in ${producers_array[*]} + do + echo " \"node-${node_i}\" -> \"${producer}\" [color=${color}];" + done + done + # Top object end + ################ + echo "}" + ;; + # For the value profile returns: # { # "0":1,