Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: use single SSH connection for lifetime of microvm #4955

Merged
merged 10 commits into from
Dec 12, 2024
11 changes: 10 additions & 1 deletion tests/framework/microvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ def __init__(
self.mem_size_bytes = None
self.cpu_template_name = None

self._connections = []

self._pre_cmd = []
if numa_node:
node_str = str(numa_node)
Expand Down Expand Up @@ -282,6 +284,10 @@ def kill(self):
for monitor in self.monitors:
monitor.stop()

# Kill all background SSH connections
for connection in self._connections:
connection.close()

roypat marked this conversation as resolved.
Show resolved Hide resolved
# We start with vhost-user backends,
# because if we stop Firecracker first, the backend will want
# to exit as well and this will cause a race condition.
Expand Down Expand Up @@ -1007,13 +1013,16 @@ def ssh_iface(self, iface_idx=0):
"""Return a cached SSH connection on a given interface id."""
guest_ip = list(self.iface.values())[iface_idx]["iface"].guest_ip
self.ssh_key = Path(self.ssh_key)
return net_tools.SSHConnection(
connection = net_tools.SSHConnection(
netns=self.netns.id,
ssh_key=self.ssh_key,
user="root",
host=guest_ip,
control_path=Path(self.chroot()) / f"ssh-{iface_idx}.sock",
ShadowCurse marked this conversation as resolved.
Show resolved Hide resolved
on_error=self._dump_debug_information,
)
self._connections.append(connection)
return connection

@property
def ssh(self):
Expand Down
2 changes: 1 addition & 1 deletion tests/framework/microvm_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,6 @@ def trace_cmd_guest(self, fns, cmd, port=4321):
print("guest> trace-cmd record")
host_ip = self.vm.iface["eth0"]["iface"].host_ip
_guest_ps = self.vm.ssh.run(
f"trace-cmd record -N {host_ip}:{port} -p function {" ".join(fns)} {cmd}"
f"trace-cmd record -N {host_ip}:{port} -p function {' '.join(fns)} {cmd}"
)
return list(Path(".").glob("trace.*.dat"))
6 changes: 6 additions & 0 deletions tests/framework/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,12 @@ def run_cmd(cmd, check=False, shell=True, cwd=None, timeout=None) -> CommandRetu
stdout, stderr = proc.communicate(timeout=timeout)
except subprocess.TimeoutExpired:
proc.kill()

# Sometimes stdout/stderr are passed on to children, in which case killing
# the parent won't close them and communicate will still hang.
proc.stdout.close()
proc.stderr.close()

stdout, stderr = proc.communicate()

# Log the message with one call so that multiple statuses
Expand Down
129 changes: 84 additions & 45 deletions tests/host_tools/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,35 @@
"""Utilities for test host microVM network setup."""

import ipaddress
import os
import random
import re
import signal
import string
import subprocess
from dataclasses import dataclass, field
from pathlib import Path

from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
from tenacity import retry, stop_after_attempt, wait_fixed

from framework import utils
from framework.utils import Timeout


class SSHConnection:
"""
SSHConnection encapsulates functionality for microVM SSH interaction.

This class should be instantiated as part of the ssh fixture with the
This class should be instantiated as part of the ssh fixture with
the hostname obtained from the MAC address, the username for logging into
the image and the path of the ssh key.

This translates into an SSH connection as follows:
ssh -i ssh_key_path username@hostname
Establishes a ControlMaster upon construction, which is then re-used
for all subsequent SSH interactions.
"""

def __init__(self, netns, ssh_key: Path, host, user, *, on_error=None):
def __init__(
self, netns, ssh_key: Path, control_path: Path, host, user, *, on_error=None
):
"""Instantiate a SSH client and connect to a microVM."""
self.netns = netns
self.ssh_key = ssh_key
Expand All @@ -37,22 +42,13 @@ def __init__(self, netns, ssh_key: Path, host, user, *, on_error=None):
assert (ssh_key.stat().st_mode & 0o777) == 0o400
self.host = host
self.user = user
self._control_path = control_path

self._on_error = None

self.options = [
"-o",
"LogLevel=ERROR",
"-o",
"ConnectTimeout=1",
"-o",
"StrictHostKeyChecking=no",
"-o",
"UserKnownHostsFile=/dev/null",
"-o",
"PreferredAuthentications=publickey",
"-i",
str(self.ssh_key),
f"ControlPath={self._control_path}",
]

# _init_connection loops until it can connect to the guest
Expand Down Expand Up @@ -96,39 +92,103 @@ def scp_get(self, remote_path, local_path, recursive=False):
self._scp(self.remote_path(remote_path), local_path, opts)

@retry(
retry=retry_if_exception_type(ChildProcessError),
wait=wait_fixed(0.5),
wait=wait_fixed(1),
stop=stop_after_attempt(20),
reraise=True,
)
def _init_connection(self):
"""Create an initial SSH client connection (retry until it works).
"""Initialize the persistent background connection which will be used
to execute all commands sent via this `SSHConnection` object.

Since we're connecting to a microVM we just started, we'll probably
have to wait for it to boot up and start the SSH server.
We'll keep trying to execute a remote command that can't fail
(`/bin/true`), until we get a successful (0) exit code.
"""
self.check_output("true", timeout=100, debug=True)
assert not self._control_path.exists()

# Sadly, we cannot get debug output from this command (e.g. `-vvv`),
# because passing -vvv causes the daemonized ssh to hold on to stderr,
# and inside utils.run_cmd we're using subprocess.communicate, which
# only returns once stderr gets closed (which would thus result in an
# indefinite hang).
establish_cmd = [
"ssh",
# Only need to pass the ssh key here, as all multiplexed
# connections won't have to re-authenticate
"-i",
str(self.ssh_key),
"-o",
"StrictHostKeyChecking=no",
"-o",
"ConnectTimeout=2",
# Set up a persistent background connection
"-o",
"ControlMaster=auto",
"-o",
"ControlPersist=yes",
*self.options,
self.user_host,
"/usr/bin/true",
]

# don't set a low timeout here, because otherwise we might get into a race condition
# where ssh already forked off the persisted connection daemon, but gets killed here
# before exiting itself. In that case, self._control_path will exist, and the retry
# will hit the assert at the start of this function.
self._exec(establish_cmd, check=True)

def run(self, cmd_string, timeout=None, *, check=False, debug=False):
def _check_liveness(self) -> int:
"""Checks whether the ControlPersist connection is still alive"""
check_cmd = ["ssh", "-O", "check", *self.options, self.user_host]

_, _, stderr = self._exec(check_cmd, check=True)

pid_match = re.match(r"Master running \(pid=(\d+)\)", stderr)

assert pid_match, f"SSH ControlMaster connection not alive anymore: {stderr}"

return int(pid_match.group(1))

def close(self):
"""Closes the ControlPersist connection"""
master_pid = self._check_liveness()

stop_cmd = ["ssh", "-O", "stop", *self.options, self.user_host]

_, _, stderr = self._exec(stop_cmd, check=True)

assert "Stop listening request sent" in stderr

try:
with Timeout(5):
utils.wait_process_termination(master_pid)
except TimeoutError:
# for some reason it won't exit, let's force it...
# if this also fails, when during teardown we'll get an error about
# "found a process with supposedly dead Firecracker's jailer ID"
os.kill(master_pid, signal.SIGKILL)

def run(self, cmd_string, timeout=100, *, check=False, debug=False):
"""
Execute the command passed as a string in the ssh context.

If `debug` is set, pass `-vvv` to `ssh`. Note that this will clobber stderr.
"""
self._check_liveness()

command = ["ssh", *self.options, self.user_host, cmd_string]

if debug:
command.insert(1, "-vvv")

return self._exec(command, timeout, check=check)

def check_output(self, cmd_string, timeout=None, *, debug=False):
def check_output(self, cmd_string, timeout=100, *, debug=False):
"""Same as `run`, but raises an exception on non-zero return code of remote command"""
return self.run(cmd_string, timeout, check=True, debug=debug)

def _exec(self, cmd, timeout=None, check=False):
def _exec(self, cmd, timeout=100, check=False):
"""Private function that handles the ssh client invocation."""
if self.netns is not None:
cmd = ["ip", "netns", "exec", self.netns] + cmd
Expand All @@ -141,27 +201,6 @@ def _exec(self, cmd, timeout=None, check=False):

raise

# pylint:disable=invalid-name
roypat marked this conversation as resolved.
Show resolved Hide resolved
def Popen(
self,
cmd: str,
stdin=subprocess.DEVNULL,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
**kwargs,
) -> subprocess.Popen:
"""Execute the command in the guest and return a Popen object.

pop = uvm.ssh.Popen("while true; do echo $(date -Is) $RANDOM; sleep 1; done")
pop.stdout.read(16)
"""
cmd = ["ssh", *self.options, self.user_host, cmd]
if self.netns is not None:
cmd = ["ip", "netns", "exec", self.netns] + cmd
return subprocess.Popen(
cmd, stdin=stdin, stdout=stdout, stderr=stderr, **kwargs
)


def mac_from_ip(ip_address):
"""Create a MAC address based on the provided IP.
Expand Down
10 changes: 1 addition & 9 deletions tests/integration_tests/functional/test_balloon.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,18 +62,10 @@ def lower_ssh_oom_chance(ssh_connection):

def make_guest_dirty_memory(ssh_connection, amount_mib=32):
"""Tell the guest, over ssh, to dirty `amount` pages of memory."""
logger = logging.getLogger("make_guest_dirty_memory")

lower_ssh_oom_chance(ssh_connection)

cmd = f"/usr/local/bin/fillmem {amount_mib}"
try:
exit_code, stdout, stderr = ssh_connection.run(cmd, timeout=1.0)
# add something to the logs for troubleshooting
if exit_code != 0:
logger.error("while running: %s", cmd)
logger.error("stdout: %s", stdout)
logger.error("stderr: %s", stderr)
_ = ssh_connection.run(f"/usr/local/bin/fillmem {amount_mib}", timeout=1.0)
except TimeoutExpired:
# It's ok if this expires. Sometimes the SSH connection
# gets killed by the OOM killer *after* the fillmem program
Expand Down
11 changes: 4 additions & 7 deletions tests/integration_tests/functional/test_net.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ def test_high_ingress_traffic(uvm_plain_any):
time.sleep(1)

# Start iperf3 client on the host. Send 1Gbps UDP traffic.
# If the net device breaks, iperf will freeze. We have to use a timeout.
# If the net device breaks, iperf will freeze, and we'll hit the pytest timeout
utils.check_output(
"timeout 31 {} {} -c {} -u -V -b 1000000000 -t 30".format(
"{} {} -c {} -u -V -b 1000000000 -t 30".format(
roypat marked this conversation as resolved.
Show resolved Hide resolved
test_microvm.netns.cmd_prefix(),
IPERF_BINARY_HOST,
guest_ip,
Expand Down Expand Up @@ -112,11 +112,8 @@ def test_tap_offload(uvm_any):
)

# Try to send a UDP message from host with UDP offload enabled
cmd = f"ip netns exec {vm.ssh_iface().netns} python3 ./host_tools/udp_offload.py {vm.ssh_iface().host} {port}"
ret = utils.run_cmd(cmd)

# Check that the transmission was successful
assert ret.returncode == 0, f"{ret.stdout=} {ret.stderr=}"
cmd = f"ip netns exec {vm.ssh.netns} python3 ./host_tools/udp_offload.py {vm.ssh.host} {port}"
utils.check_output(cmd)

# Check that the server received the message
ret = vm.ssh.run(f"cat {out_filename}")
Expand Down
10 changes: 4 additions & 6 deletions tests/integration_tests/functional/test_pause_resume.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import platform
import time
from subprocess import TimeoutExpired

import pytest

Expand Down Expand Up @@ -52,17 +53,13 @@ def test_pause_resume(uvm_nano):
microvm.flush_metrics()

# Verify guest is no longer active.
with pytest.raises(ChildProcessError):
microvm.ssh.check_output("true")
with pytest.raises(TimeoutExpired):
microvm.ssh.check_output("true", timeout=1)

# Verify emulation was indeed paused and no events from either
# guest or host side were handled.
verify_net_emulation_paused(microvm.flush_metrics())

# Verify guest is no longer active.
with pytest.raises(ChildProcessError):
microvm.ssh.check_output("true")

# Pausing the microVM when it is already `Paused` is allowed
# (microVM remains in `Paused` state).
microvm.api.vm.patch(state="Paused")
Expand All @@ -71,6 +68,7 @@ def test_pause_resume(uvm_nano):
microvm.api.vm.patch(state="Resumed")

# Verify guest is active again.
microvm.ssh.check_output("true")

# Resuming the microVM when it is already `Resumed` is allowed
# (microVM remains in the running state).
Expand Down
Loading
Loading