From 3750b7d23217919254b4a8e670397a5b3207cf3f Mon Sep 17 00:00:00 2001 From: jbtrystram Date: Sun, 3 Nov 2024 18:53:46 +0100 Subject: [PATCH] tests/ignition/kdump: add a remote NFS kdump test This way we have good coverage of most-used kdump features. Some context on the NFS kdump configuration: coreos/fedora-coreos-tracker#1729 This was previously merged in [1] then reverted in [2] because the nfs server container was not multi-arch, causing the pipeline to trip on it. It's also not functioning on systemd 256 (so anything f41 and above), see [3] This requires https://github.com/coreos/coreos-assembler/pull/3917 for the multi-arch container, and https://github.com/coreos/coreos-assembler/issues/3921 [1] https://github.com/coreos/coreos-assembler/commit/b10d8dcfe730d2e7c8a5366bfcddc7f3081be203 [2] https://github.com/coreos/coreos-assembler/commit/af1468c421fdedfe6c62e731eaac7fd4df720fc7 [3] https://github.com/rhkdump/kdump-utils/issues/52 --- mantle/kola/tests/ignition/kdump.go | 184 ++++++++++++++++++++++++---- 1 file changed, 159 insertions(+), 25 deletions(-) diff --git a/mantle/kola/tests/ignition/kdump.go b/mantle/kola/tests/ignition/kdump.go index e7266b2aa6..b7ffbebac7 100644 --- a/mantle/kola/tests/ignition/kdump.go +++ b/mantle/kola/tests/ignition/kdump.go @@ -28,6 +28,63 @@ func init() { Tags: []string{"kdump", kola.SkipBaseChecksTag, kola.NeedsInternetTag}, Platforms: []string{"qemu"}, }) + register.RegisterTest(&register.Test{ + Run: kdumpNFSTest, + ClusterSize: 0, + Name: `kdump.crash.nfs`, + Description: "Verifies kdump logs are exported to NFS destination", + Tags: []string{"kdump", kola.SkipBaseChecksTag, kola.NeedsInternetTag}, + Platforms: []string{"qemu"}, + }) +} + +// This function tests the remote kdump feature by: +// - making sure kdump is ready +// - crashing the machine +// - monitoring the expected vmcore path +func testRemoteKdump(c cluster.TestCluster, kdump_machine platform.Machine, remote_machine platform.Machine, crash_path string) { + + // 
Wait for kdump to become active + // 3 minutes should be enough to generate the kdump initramfs + err := util.Retry(12, 15*time.Second, func() error { + + kdump_status, err := c.SSH(kdump_machine, "systemctl is-active kdump.service") + + if err != nil { + return err + } else if string(kdump_status) == "inactive" { + return fmt.Errorf("Kdump.service is not ready: %s.", string(kdump_status)) + } + return nil + }) + if err != nil { + c.Fatalf("Timed out while waiting for kdump.service to be ready: %v", err) + } + + // crash the kernel + // use systemd-run because directly calling `echo c > ...` will always + // throw an error as the kernel immediately hangs. + _, err = c.SSH(kdump_machine, "sudo systemd-run sh -c 'sleep 5 && echo c > /proc/sysrq-trigger'") + if err != nil { + c.Fatalf("failed to queue kernel crash: %v", err) + } + + // Wait for kdump to create vmcore dump on the remote host + err = util.Retry(8, 10*time.Second, func() error { + + // Look for the crash files created on the remote machine + logs, err := c.SSH(remote_machine, fmt.Sprintf("find %s -type f -name vmcore*", crash_path)) + + if err != nil { + return fmt.Errorf("failed to search for vmcore: %w", err) + } else if logs == nil { + return fmt.Errorf("No vmcore created on remote host") + } + return nil + }) + if err != nil { + c.Fatalf("Timed out while waiting for kdump to create vmcore files: %v", err) + } } // The destination VM for kdump logs @@ -180,45 +237,122 @@ kernel_arguments: c.Fatalf("Unable to create test machine: %v", err) } - // Wait for kdump to become active - // 3 minutes should be enough to generate the kdump initramfs - err = util.Retry(12, 15*time.Second, func() error { + testRemoteKdump(c, kdump_machine, ssh_host.Machine, "/home/core/crash") +} - kdump_status, err := c.SSH(kdump_machine, "systemctl is-active kdump.service") +// The destination VM for kdump logs over NFS +type NfsServer struct { + Machine platform.Machine + MachineAddress string +} - if err != nil { - return err - 
} else if string(kdump_status) == "inactive" { - return fmt.Errorf(fmt.Sprintf("Kdump.service is not ready: %s.", string(kdump_status))) - } - return nil - }) +func setupNFSMachine(c cluster.TestCluster) NfsServer { + var m platform.Machine + var err error + + options := platform.QemuMachineOptions{ + HostForwardPorts: []platform.HostForwardPort{ + {Service: "ssh", HostPort: 0, GuestPort: 22}, + // Kdump NFS option does not allow a custom port + {Service: "nfs", HostPort: 2049, GuestPort: 2049}, + }, + } + + nfs_server_butane := conf.Butane(`variant: fcos +version: 1.5.0 +storage: + files: + - path: /etc/containers/systemd/nfs.container + overwrite: true + contents: + inline: | + [Container] + Image=quay.io/coreos-assembler/nfs + Volume=/var/nfs:/export + Network=host + PodmanArgs=--privileged + [Install] + WantedBy=default.target + directories: + - path: /var/nfs/crash`) + + // start the machine + switch c := c.Cluster.(type) { + // These cases have to be separated because when put together to the same case statement + // the golang compiler no longer checks that the individual types in the case have the + // NewMachineWithQemuOptions function, but rather whether platform.Cluster + // does which fails + case *qemu.Cluster: + m, err = c.NewMachineWithQemuOptions(nfs_server_butane, options) + default: + panic("unreachable") + } if err != nil { - c.Fatalf("Timed out while waiting for kdump.service to be ready: %v", err) + c.Fatal(err) } - // crash the kernel - // use systemd-run because direclty calling `echo c...` will alaways - // throw an error as the kernel immediately hangs. 
- _, err = c.SSH(kdump_machine, "sudo systemd-run sh -c 'sleep 5 && echo c > /proc/sysrq-trigger'") + return NfsServer{ + Machine: m, + MachineAddress: "10.0.2.2", + } +} + +func kdumpNFSTest(c cluster.TestCluster) { + nfs_host := setupNFSMachine(c) + + butane := conf.Butane(fmt.Sprintf(`variant: fcos +version: 1.5.0 +storage: + files: + - path: /etc/kdump.conf + overwrite: true + contents: + inline: | + nfs %s:/ + path /crash + core_collector makedumpfile -l --message-level 1 -d 31 + extra_bins /sbin/mount.nfs + extra_modules nfs nfsv3 nfs_layout_nfsv41_files blocklayoutdriver nfs_layout_flexfiles nfs_layout_nfsv41_files +systemd: + units: + - name: kdump.service + enabled: true + dropins: + - name: debug.conf + contents: | + [Service] + Environment="debug=1" +kernel_arguments: + should_exist: + - crashkernel=512M`, + nfs_host.MachineAddress)) + + opts := platform.MachineOptions{ + MinMemory: 2048, + } + + kdump_machine, err := c.NewMachineWithOptions(butane, opts) if err != nil { - c.Fatalf("failed to queue kernel crash: %v", err) + c.Fatalf("Unable to create test machine: %v", err) } - // Wait for kdump to create vmcore dump on the remote host - err = util.Retry(5, 10*time.Second, func() error { + // XXX Refactor this + // Wait for nfs server to become active + // 1 minute should be enough to pull the container image + err = util.Retry(4, 15*time.Second, func() error { - // Look for the crash files created on the SSH machine - logs, err := c.SSH(ssh_host.Machine, "find /home/core/crash -type f -name vmcore*") + nfs_status, err := c.SSH(nfs_host.Machine, "systemctl is-active nfs.service") if err != nil { - return fmt.Errorf("failed to search for vmcore: %w", err) - } else if logs == nil { - return fmt.Errorf("No vmcore created on remote SSH host") + return err + } else if string(nfs_status) == "inactive" { + return fmt.Errorf("nfs.service is not ready: %s.", string(nfs_status)) } return nil }) if err != nil { - c.Fatalf("Timed out while waiting for kdump to 
create vmcore files: %v", err) + c.Fatalf("Timed out while waiting for nfs.service to be ready: %v", err) } + + testRemoteKdump(c, kdump_machine, nfs_host.Machine, "/var/nfs/crash") }