Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tests: add kdump over NFS #3922

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 159 additions & 25 deletions mantle/kola/tests/ignition/kdump.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,63 @@ func init() {
Tags: []string{"kdump", kola.SkipBaseChecksTag, kola.NeedsInternetTag},
Platforms: []string{"qemu"},
})
register.RegisterTest(&register.Test{
Run: kdumpNFSTest,
ClusterSize: 0,
Name: `kdump.crash.nfs`,
Description: "Verifies kdump logs are exported to NFS destination",
Tags: []string{"kdump", kola.SkipBaseChecksTag, kola.NeedsInternetTag},
Platforms: []string{"qemu"},
})
}

// testRemoteKdump tests the remote kdump feature by:
//   - waiting for kdump.service on kdump_machine to become active
//   - crashing the kernel of kdump_machine
//   - polling remote_machine until a vmcore file appears under crash_path
//
// Every failure aborts the test through c.Fatalf.
func testRemoteKdump(c cluster.TestCluster, kdump_machine platform.Machine, remote_machine platform.Machine, crash_path string) {
	// Wait for kdump to become active.
	// 3 minutes (12 attempts, 15s apart) should be enough to generate the
	// kdump initramfs.
	err := util.Retry(12, 15*time.Second, func() error {
		status, err := c.SSH(kdump_machine, "systemctl is-active kdump.service")
		if err != nil {
			return err
		}
		if string(status) == "inactive" {
			return fmt.Errorf("Kdump.service is not ready: %s.", string(status))
		}
		return nil
	})
	if err != nil {
		c.Fatalf("Timed out while waiting for kdump.service to be ready: %v", err)
	}

	// Crash the kernel.
	// Use systemd-run because directly calling `echo c > ...` will always
	// throw an error as the kernel immediately hangs.
	_, err = c.SSH(kdump_machine, "sudo systemd-run sh -c 'sleep 5 && echo c > /proc/sysrq-trigger'")
	if err != nil {
		c.Fatalf("failed to queue kernel crash: %v", err)
	}

	// Wait for kdump to create the vmcore dump on the remote host.
	err = util.Retry(8, 10*time.Second, func() error {
		// Look for the crash files created on the remote machine. The
		// pattern is single-quoted so that the remote shell hands it to
		// find untouched instead of expanding it against its own working
		// directory.
		logs, err := c.SSH(remote_machine, fmt.Sprintf("find %s -type f -name 'vmcore*'", crash_path))
		if err != nil {
			return fmt.Errorf("failed to search for vmcore: %w", err)
		}
		// Test the length rather than nil-ness so an empty, non-nil
		// result is also treated as "nothing found yet".
		if len(logs) == 0 {
			return fmt.Errorf("No vmcore created on remote host")
		}
		return nil
	})
	if err != nil {
		c.Fatalf("Timed out while waiting for kdump to create vmcore files: %v", err)
	}
}

// The destination VM for kdump logs
Expand Down Expand Up @@ -180,45 +237,122 @@ kernel_arguments:
c.Fatalf("Unable to create test machine: %v", err)
}

// Wait for kdump to become active
// 3 minutes should be enough to generate the kdump initramfs
err = util.Retry(12, 15*time.Second, func() error {
testRemoteKdump(c, kdump_machine, ssh_host.Machine, "/home/core/crash")
}

kdump_status, err := c.SSH(kdump_machine, "systemctl is-active kdump.service")
// NfsServer describes the destination VM for kdump logs sent over NFS.
type NfsServer struct {
// Machine is the VM running the NFS server; the test runs SSH commands
// against it to check the service and to look for vmcore files.
Machine platform.Machine
// MachineAddress is the address written into the crashing machine's
// kdump.conf as the NFS host (the part before ":/" on the `nfs` line).
MachineAddress string
}

if err != nil {
return err
} else if string(kdump_status) == "inactive" {
return fmt.Errorf(fmt.Sprintf("Kdump.service is not ready: %s.", string(kdump_status)))
}
return nil
})
// setupNFSMachine boots the VM that acts as the NFS destination for kdump
// and returns it wrapped in an NfsServer.
//
// The machine is created with QEMU options that forward ssh (guest port 22)
// and nfs (host port 2049 to guest port 2049), and with a Butane config that
// writes /etc/containers/systemd/nfs.container (image
// quay.io/coreos-assembler/nfs, /var/nfs mounted at /export, host networking,
// privileged) and creates the /var/nfs/crash directory.
//
// Only *qemu.Cluster is supported; any other cluster type panics.
//
// NOTE(review): this span looks like a rendered diff in which removed and
// added lines are interleaved (see the notes below) — confirm against the
// real file before relying on the exact statements.
func setupNFSMachine(c cluster.TestCluster) NfsServer {
var m platform.Machine
var err error

options := platform.QemuMachineOptions{
HostForwardPorts: []platform.HostForwardPort{
// HostPort 0 for ssh — presumably lets the platform pick a free host port; TODO confirm
{Service: "ssh", HostPort: 0, GuestPort: 22},
// Kdump NFS option does not allow a custom port
{Service: "nfs", HostPort: 2049, GuestPort: 2049},
},
}

nfs_server_butane := conf.Butane(`variant: fcos
version: 1.5.0
storage:
files:
- path: /etc/containers/systemd/nfs.container
overwrite: true
contents:
inline: |
[Container]
Image=quay.io/coreos-assembler/nfs
Volume=/var/nfs:/export
Network=host
PodmanArgs=--privileged
[Install]
WantedBy=default.target
directories:
- path: /var/nfs/crash`)

// start the machine
// Note that `c` is shadowed inside the switch by the concrete cluster type.
switch c := c.Cluster.(type) {
// These cases have to be separated because when put together to the same case statement
// the golang compiler no longer checks that the individual types in the case have the
// NewMachineWithQemuOptions function, but rather whether platform.Cluster
// does which fails
case *qemu.Cluster:
m, err = c.NewMachineWithQemuOptions(nfs_server_butane, options)
default:
panic("unreachable")
}
if err != nil {
// NOTE(review): the Fatalf below mentions kdump.service, which does not
// match a machine-creation failure; it is presumably a removed diff line
// and c.Fatal(err) the intended one — confirm.
c.Fatalf("Timed out while waiting for kdump.service to be ready: %v", err)
c.Fatal(err)
}

// crash the kernel
// use systemd-run because directly calling `echo c...` will always
// throw an error as the kernel immediately hangs.
// NOTE(review): kdump_machine is not declared in this function; this
// statement is presumably a removed diff line — confirm.
_, err = c.SSH(kdump_machine, "sudo systemd-run sh -c 'sleep 5 && echo c > /proc/sysrq-trigger'")
// The address is hard-coded — presumably the host side of QEMU user-mode
// networking, reachable thanks to the 2049 port forward above; TODO confirm.
return NfsServer{
Machine: m,
MachineAddress: "10.0.2.2",
}
}

// kdumpNFSTest checks that kdump can export a vmcore to an NFS destination.
//
// It creates the NFS server machine with setupNFSMachine, then boots a second
// machine (MinMemory 2048) whose /etc/kdump.conf points at
// `nfs <server address>:/` with `path /crash`, with kdump.service enabled and
// crashkernel=512M set. It then waits for nfs.service on the server and hands
// over to testRemoteKdump, which looks for the dump under /var/nfs/crash on
// the server.
//
// NOTE(review): this span looks like a rendered diff in which removed and
// added lines are interleaved (duplicate Fatalf calls, two util.Retry
// openers, references to ssh_host) — confirm against the real file before
// relying on the exact statements.
func kdumpNFSTest(c cluster.TestCluster) {
nfs_host := setupNFSMachine(c)

// The single %s in the config below is filled with the NFS server address.
butane := conf.Butane(fmt.Sprintf(`variant: fcos
version: 1.5.0
storage:
files:
- path: /etc/kdump.conf
overwrite: true
contents:
inline: |
nfs %s:/
path /crash
core_collector makedumpfile -l --message-level 1 -d 31
extra_bins /sbin/mount.nfs
extra_modules nfs nfsv3 nfs_layout_nfsv41_files blocklayoutdriver nfs_layout_flexfiles nfs_layout_nfsv41_files
systemd:
units:
- name: kdump.service
enabled: true
dropins:
- name: debug.conf
contents: |
[Service]
Environment="debug=1"
kernel_arguments:
should_exist:
- crashkernel=512M`,
nfs_host.MachineAddress))

opts := platform.MachineOptions{
MinMemory: 2048,
}

kdump_machine, err := c.NewMachineWithOptions(butane, opts)
if err != nil {
// NOTE(review): the first Fatalf talks about queuing a kernel crash, which
// does not match a machine-creation failure; presumably a removed diff line.
c.Fatalf("failed to queue kernel crash: %v", err)
c.Fatalf("Unable to create test machine: %v", err)
}

// Wait for kdump to create vmcore dump on the remote host
// NOTE(review): this Retry opener is never closed on its own and is
// presumably a removed diff line; the Retry(4, 15s) below is the NFS wait.
err = util.Retry(5, 10*time.Second, func() error {
// XXX Refactor this
// Wait for nfs server to become active
// 1 minute (4 attempts, 15s apart) should be enough to pull the container image
err = util.Retry(4, 15*time.Second, func() error {

// Look for the crash files created on the SSH machine
// NOTE(review): ssh_host is not declared in this function; presumably a
// removed diff line.
logs, err := c.SSH(ssh_host.Machine, "find /home/core/crash -type f -name vmcore*")
nfs_status, err := c.SSH(nfs_host.Machine, "systemctl is-active nfs.service")

if err != nil {
return fmt.Errorf("failed to search for vmcore: %w", err)
} else if logs == nil {
return fmt.Errorf("No vmcore created on remote SSH host")
return err
} else if string(nfs_status) == "inactive" {
return fmt.Errorf("nfs.service is not ready: %s.", string(nfs_status))
}
return nil
})
if err != nil {
c.Fatalf("Timed out while waiting for kdump to create vmcore files: %v", err)
c.Fatalf("Timed out while waiting for nfs.service to be ready: %v", err)
}

// The NFS container mounts /var/nfs at /export, so a dump written to
// `path /crash` is looked for under /var/nfs/crash on the server.
testRemoteKdump(c, kdump_machine, nfs_host.Machine, "/var/nfs/crash")
}
Loading