From c681101d2d3dc66e634fad2db388907f550a8ad7 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Oct 2023 10:35:34 +0000 Subject: [PATCH 1/4] fix(test): balloon deflate on OOM Balloon devices have a feature where they can start deflating when the guest is in an OOM situation. We have a test that ensures this functionality works as expected. The test creates a microVM with a balloon device enabled, it inflates the balloon and then invokes a process in the microVM that exhausts the remaining microVM memory. The expectation is that the OOM killer will kick in and reap that process. The test relies on observing the process that fills up the memory to be killed in order to succeed. However, we do not really have control on what process the OOM will decide to kill, in low memory situations. This makes the test failing intermittently. This commit, changes the test to instead look into balloon statistics. Conceptually this makes sense; we don't want to test the OOM killer functionality, we want to ensure that the balloon device gives back memory to the VM in low memory situations. The balloon statistics can give us this information. Signed-off-by: Babis Chalios --- .../functional/test_balloon.py | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py index 785f4c52a81..bb48512b489 100644 --- a/tests/integration_tests/functional/test_balloon.py +++ b/tests/integration_tests/functional/test_balloon.py @@ -52,17 +52,20 @@ def lower_ssh_oom_chance(ssh_connection): for pid in stdout.split(" "): cmd = f"choom -n -1000 -p {pid}" - ssh_connection.run(cmd) + exit_code, stdout, stderr = ssh_connection.run(cmd) + if exit_code != 0: + logger.error("while running: %s", cmd) + logger.error("stdout: %s", stdout) + logger.error("stderr: %s", stderr) -def make_guest_dirty_memory(ssh_connection, should_oom=False, amount_mib=32): +def make_guest_dirty_memory(ssh_connection, amount_mib=32): """Tell the guest, over ssh, to dirty `amount` pages of memory.""" logger = logging.getLogger("make_guest_dirty_memory") lower_ssh_oom_chance(ssh_connection) - # Aim OOM at fillmem process - cmd = f"choom -n 1000 -- /usr/local/bin/fillmem {amount_mib}" + cmd = f"/usr/local/bin/fillmem {amount_mib}" exit_code, stdout, stderr = ssh_connection.run(cmd) # add something to the logs for troubleshooting if exit_code != 0: @@ -80,12 +83,6 @@ def make_guest_dirty_memory(ssh_connection, should_oom=False, amount_mib=32): break tries -= 1 - if should_oom: - assert "OOM Killer stopped the program with signal 9, exit code 0" in stdout - else: - assert exit_code == 0, stderr - assert "Memory filling was successful" in stdout, stdout - def _test_rss_memory_lower(test_microvm, stable_delta=1): """Check inflating the balloon makes guest use less rss memory.""" @@ -181,11 +178,11 @@ def test_deflate_on_oom(test_microvm_with_api, deflate_on_oom): deflate_on_oom=True - should not result in an OOM kill + should result in balloon_stats['actual_mib'] be reduced deflate_on_oom=False - should result in an OOM kill + should result in balloon_stats['actual_mib'] remain the same """ test_microvm = test_microvm_with_api test_microvm.spawn() @@ -194,7 +191,7 @@ def test_deflate_on_oom(test_microvm_with_api, deflate_on_oom): # Add a deflated memory balloon. test_microvm.api.balloon.put( - amount_mib=0, deflate_on_oom=deflate_on_oom, stats_polling_interval_s=0 + amount_mib=0, deflate_on_oom=deflate_on_oom, stats_polling_interval_s=1 ) # Start the microvm. @@ -213,8 +210,17 @@ def test_deflate_on_oom(test_microvm_with_api, deflate_on_oom): # This call will internally wait for rss to become stable. _ = get_stable_rss_mem_by_pid(firecracker_pid) - # Check that using memory leads an out of memory error (or not). - make_guest_dirty_memory(test_microvm.ssh, should_oom=not deflate_on_oom) + # Check that using memory leads to the balloon device automatically + # deflate (or not). + balloon_size_before = test_microvm.api.balloon_stats.get().json()["actual_mib"] + make_guest_dirty_memory(test_microvm.ssh) + + balloon_size_after = test_microvm.api.balloon_stats.get().json()["actual_mib"] + print(f"size before: {balloon_size_before} size after: {balloon_size_after}") + if deflate_on_oom: + assert balloon_size_after < balloon_size_before, "Balloon did not deflate" + else: + assert balloon_size_after >= balloon_size_before, "Balloon deflated" # pylint: disable=C0103 From 199c7c1ad6e17f4b3c19ce55c0ca038a6553e6aa Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Oct 2023 13:24:18 +0000 Subject: [PATCH 2/4] fix(test): replace "pidof sshd" for /run/sshd.pid In test_balloon.py we are trying to find the PID of the SSH daemon process, so that we later change its OOM score, so that it does not get killed when we (deliberately) exhaust the available memory in a microVM. We use "pidof sshd" to do that. However, pidof returns the PIDs of all threads of the process. Now sshd launches a new thread for every new SSH connection. Later when we iterate over these PIDs to change their OOM score not all of the threads might be there, so the choom will fail for these. This is not really a problem, but it some times leads to misleading error messages. This commit drops the use of "pidof" in favour of reading the daemon's PID from "/run/sshd.pid". Signed-off-by: Babis Chalios --- tests/integration_tests/functional/test_balloon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py index bb48512b489..7f7ec7dd928 100644 --- a/tests/integration_tests/functional/test_balloon.py +++ b/tests/integration_tests/functional/test_balloon.py @@ -42,7 +42,7 @@ def lower_ssh_oom_chance(ssh_connection): """Lure OOM away from ssh process""" logger = logging.getLogger("lower_ssh_oom_chance") - cmd = "pidof sshd" + cmd = "cat /run/sshd.pid" exit_code, stdout, stderr = ssh_connection.run(cmd) # add something to the logs for troubleshooting if exit_code != 0: From 51d7ec9e11997b055810be29ab1294aa3049b2bb Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Oct 2023 13:36:20 +0000 Subject: [PATCH 3/4] fix(test): avoid running ssh after exhausting memory In test_balloon.py::test_deflate_on_oom we are exhausting the memory of the microVM trying to trigger the OOM killer. This commit removes SSH commands after launching the memory hogger inside the microVM, to avoid hang connections due to the OOM killer killing sshd. Signed-off-by: Babis Chalios --- tests/integration_tests/functional/test_balloon.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py index 7f7ec7dd928..8ec591b5836 100644 --- a/tests/integration_tests/functional/test_balloon.py +++ b/tests/integration_tests/functional/test_balloon.py @@ -73,15 +73,7 @@ def make_guest_dirty_memory(ssh_connection, amount_mib=32): logger.error("stdout: %s", stdout) logger.error("stderr: %s", stderr) - cmd = "cat /tmp/fillmem_output.txt" - tries = 3 - while tries > 0: - # it may take a bit of time to dirty the memory and the OOM to kick-in - time.sleep(0.5) - _, stdout, _ = ssh_connection.run(cmd) - if stdout != "": - break - tries -= 1 + time.sleep(5) def _test_rss_memory_lower(test_microvm, stable_delta=1): From 57ed015ad05a1c143fdb1510febb0bd3642e932f Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Oct 2023 14:31:18 +0000 Subject: [PATCH 4/4] feat(test): add optional timeout in SSH connections We have a mechanism that allows us to run a command inside a microVM. This mechanism ends up using Popen.communicate to retrieve the output of the SSH command. Popen.communicate comes with a timeout variable that we were not using. However, it is useful in cases where we don't want to wait for the result of the command we execute in the microVM. This commit extends our SSH mechanism to accept the timeout argument. Then, it uses the timeout in the test_balloon.py::test_deflate_on_oom when launching the fillmem process. fillmem drains the memory of the VM, which sometimes results in the SSH connection hanging. Signed-off-by: Babis Chalios --- tests/framework/utils.py | 12 ++++------- tests/host_tools/network.py | 9 ++++---- .../functional/test_balloon.py | 21 +++++++++++++------ 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/tests/framework/utils.py b/tests/framework/utils.py index a6d79613513..802e002b12c 100644 --- a/tests/framework/utils.py +++ b/tests/framework/utils.py @@ -448,7 +448,7 @@ def get_free_mem_ssh(ssh_connection): raise Exception("Available memory not found in `/proc/meminfo") -def run_cmd_sync(cmd, ignore_return_code=False, no_shell=False, cwd=None): +def run_cmd_sync(cmd, ignore_return_code=False, no_shell=False, cwd=None, timeout=None): """ Execute a given command. @@ -469,7 +469,7 @@ def run_cmd_sync(cmd, ignore_return_code=False, no_shell=False, cwd=None): ) # Capture stdout/stderr - stdout, stderr = proc.communicate() + stdout, stderr = proc.communicate(timeout=timeout) output_message = f"\n[{proc.pid}] Command:\n{cmd}" # Append stdout/stderr to the output message @@ -493,18 +493,14 @@ def run_cmd_sync(cmd, ignore_return_code=False, no_shell=False, cwd=None): return CommandReturn(proc.returncode, stdout.decode(), stderr.decode()) -def run_cmd(cmd, ignore_return_code=False, no_shell=False, cwd=None): +def run_cmd(cmd, **kwargs): """ Run a command using the sync function that logs the output. :param cmd: command to run - :param ignore_return_code: whether a non-zero return code should be ignored - :param noshell: don't run the command in a sub-shell :returns: tuple of (return code, stdout, stderr) """ - return run_cmd_sync( - cmd=cmd, ignore_return_code=ignore_return_code, no_shell=no_shell, cwd=cwd - ) + return run_cmd_sync(cmd, **kwargs) def eager_map(func, iterable): diff --git a/tests/host_tools/network.py b/tests/host_tools/network.py index 3ef2332e748..60f1624fb90 100644 --- a/tests/host_tools/network.py +++ b/tests/host_tools/network.py @@ -89,7 +89,7 @@ def _init_connection(self): if ecode != 0: raise ConnectionError - def run(self, cmd_string): + def run(self, cmd_string, timeout=None): """Execute the command passed as a string in the ssh context.""" return self._exec( [ @@ -97,10 +97,11 @@ def run(self, cmd_string): *self.options, f"{self.user}@{self.host}", cmd_string, - ] + ], + timeout, ) - def _exec(self, cmd): + def _exec(self, cmd, timeout=None): """Private function that handles the ssh client invocation.""" # TODO: If a microvm runs in a particular network namespace, we have to @@ -111,7 +112,7 @@ def _exec(self, cmd): if self.netns_file_path is not None: ctx = Namespace(self.netns_file_path, "net") with ctx: - return utils.run_cmd(cmd, ignore_return_code=True) + return utils.run_cmd(cmd, ignore_return_code=True, timeout=timeout) def mac_from_ip(ip_address): diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py index 8ec591b5836..040dae5ea0c 100644 --- a/tests/integration_tests/functional/test_balloon.py +++ b/tests/integration_tests/functional/test_balloon.py @@ -4,6 +4,7 @@ import logging import time +from subprocess import TimeoutExpired import pytest from retry import retry @@ -66,12 +67,20 @@ def make_guest_dirty_memory(ssh_connection, amount_mib=32): lower_ssh_oom_chance(ssh_connection) cmd = f"/usr/local/bin/fillmem {amount_mib}" - exit_code, stdout, stderr = ssh_connection.run(cmd) - # add something to the logs for troubleshooting - if exit_code != 0: - logger.error("while running: %s", cmd) - logger.error("stdout: %s", stdout) - logger.error("stderr: %s", stderr) + try: + exit_code, stdout, stderr = ssh_connection.run(cmd, timeout=1.0) + # add something to the logs for troubleshooting + if exit_code != 0: + logger.error("while running: %s", cmd) + logger.error("stdout: %s", stdout) + logger.error("stderr: %s", stderr) + + cmd = "cat /tmp/fillmem_output.txt" + except TimeoutExpired: + # It's ok if this expires. Some times the SSH connection + # gets killed by the OOM killer *after* the fillmem program + # started. As a result, we can ignore timeouts here. + pass time.sleep(5)