From c681101d2d3dc66e634fad2db388907f550a8ad7 Mon Sep 17 00:00:00 2001
From: Babis Chalios <bchalios@amazon.es>
Date: Thu, 5 Oct 2023 10:35:34 +0000
Subject: [PATCH 1/4] fix(test): balloon deflate on OOM

Balloon devices have a feature where they can start deflating when the
guest is in an OOM situation. We have a test that ensures this
functionality works as expected. The test creates a microVM with a
balloon device enabled, it inflates the balloon and then invokes a
process in the microVM that exhausts the remaining microVM memory. The
expectation is that the OOM killer will kick in and reap that process.
The test relies on observing the process that fills up the memory to be
killed in order to succeed.

However, we do not really have control on what process the OOM will
decide to kill, in low memory situations. This makes the test failing
intermittently.

This commit, changes the test to instead look into balloon statistics.
Conceptually this makes sense; we don't want to test the OOM killer
functionality, we want to ensure that the balloon device gives back
memory to the VM in low memory situations. The balloon statistics can
give us this information.

Signed-off-by: Babis Chalios <bchalios@amazon.es>
---
 .../functional/test_balloon.py                | 36 +++++++++++--------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
index 785f4c52a81..bb48512b489 100644
--- a/tests/integration_tests/functional/test_balloon.py
+++ b/tests/integration_tests/functional/test_balloon.py
@@ -52,17 +52,20 @@ def lower_ssh_oom_chance(ssh_connection):
 
     for pid in stdout.split(" "):
         cmd = f"choom -n -1000 -p {pid}"
-        ssh_connection.run(cmd)
+        exit_code, stdout, stderr = ssh_connection.run(cmd)
+        if exit_code != 0:
+            logger.error("while running: %s", cmd)
+            logger.error("stdout: %s", stdout)
+            logger.error("stderr: %s", stderr)
 
 
-def make_guest_dirty_memory(ssh_connection, should_oom=False, amount_mib=32):
+def make_guest_dirty_memory(ssh_connection, amount_mib=32):
     """Tell the guest, over ssh, to dirty `amount` pages of memory."""
     logger = logging.getLogger("make_guest_dirty_memory")
 
     lower_ssh_oom_chance(ssh_connection)
 
-    # Aim OOM at fillmem process
-    cmd = f"choom -n 1000 -- /usr/local/bin/fillmem {amount_mib}"
+    cmd = f"/usr/local/bin/fillmem {amount_mib}"
     exit_code, stdout, stderr = ssh_connection.run(cmd)
     # add something to the logs for troubleshooting
     if exit_code != 0:
@@ -80,12 +83,6 @@ def make_guest_dirty_memory(ssh_connection, should_oom=False, amount_mib=32):
             break
         tries -= 1
 
-    if should_oom:
-        assert "OOM Killer stopped the program with signal 9, exit code 0" in stdout
-    else:
-        assert exit_code == 0, stderr
-        assert "Memory filling was successful" in stdout, stdout
-
 
 def _test_rss_memory_lower(test_microvm, stable_delta=1):
     """Check inflating the balloon makes guest use less rss memory."""
@@ -181,11 +178,11 @@ def test_deflate_on_oom(test_microvm_with_api, deflate_on_oom):
 
     deflate_on_oom=True
 
-      should not result in an OOM kill
+      should result in balloon_stats['actual_mib'] be reduced
 
     deflate_on_oom=False
 
-      should result in an OOM kill
+      should result in balloon_stats['actual_mib'] remain the same
     """
     test_microvm = test_microvm_with_api
     test_microvm.spawn()
@@ -194,7 +191,7 @@ def test_deflate_on_oom(test_microvm_with_api, deflate_on_oom):
 
     # Add a deflated memory balloon.
     test_microvm.api.balloon.put(
-        amount_mib=0, deflate_on_oom=deflate_on_oom, stats_polling_interval_s=0
+        amount_mib=0, deflate_on_oom=deflate_on_oom, stats_polling_interval_s=1
     )
 
     # Start the microvm.
@@ -213,8 +210,17 @@ def test_deflate_on_oom(test_microvm_with_api, deflate_on_oom):
     # This call will internally wait for rss to become stable.
     _ = get_stable_rss_mem_by_pid(firecracker_pid)
 
-    # Check that using memory leads an out of memory error (or not).
-    make_guest_dirty_memory(test_microvm.ssh, should_oom=not deflate_on_oom)
+    # Check that using memory leads to the balloon device automatically
+    # deflate (or not).
+    balloon_size_before = test_microvm.api.balloon_stats.get().json()["actual_mib"]
+    make_guest_dirty_memory(test_microvm.ssh)
+
+    balloon_size_after = test_microvm.api.balloon_stats.get().json()["actual_mib"]
+    print(f"size before: {balloon_size_before} size after: {balloon_size_after}")
+    if deflate_on_oom:
+        assert balloon_size_after < balloon_size_before, "Balloon did not deflate"
+    else:
+        assert balloon_size_after >= balloon_size_before, "Balloon deflated"
 
 
 # pylint: disable=C0103

From 199c7c1ad6e17f4b3c19ce55c0ca038a6553e6aa Mon Sep 17 00:00:00 2001
From: Babis Chalios <bchalios@amazon.es>
Date: Thu, 5 Oct 2023 13:24:18 +0000
Subject: [PATCH 2/4] fix(test): replace "pidof sshd" for /run/sshd.pid

In test_balloon.py we are trying to find the PID of the SSH daemon
process, so that we later change its OOM score, so that it does not get
killed when we (deliberately) exhaust the available memory in a microVM.
We use "pidof sshd" to do that. However, pidof returns the PIDs of all
threads of the process. Now sshd launches a new thread for every new SSH
connection. Later when we iterate over these PIDs to change their OOM
score not all of the threads might be there, so the choom will fail for
these. This is not really a problem, but it some times leads to
misleading error messages.

This commit drops the use of "pidof" in favour of reading the daemon's
PID from "/run/sshd.pid".

Signed-off-by: Babis Chalios <bchalios@amazon.es>
---
 tests/integration_tests/functional/test_balloon.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
index bb48512b489..7f7ec7dd928 100644
--- a/tests/integration_tests/functional/test_balloon.py
+++ b/tests/integration_tests/functional/test_balloon.py
@@ -42,7 +42,7 @@ def lower_ssh_oom_chance(ssh_connection):
     """Lure OOM away from ssh process"""
     logger = logging.getLogger("lower_ssh_oom_chance")
 
-    cmd = "pidof sshd"
+    cmd = "cat /run/sshd.pid"
     exit_code, stdout, stderr = ssh_connection.run(cmd)
     # add something to the logs for troubleshooting
     if exit_code != 0:

From 51d7ec9e11997b055810be29ab1294aa3049b2bb Mon Sep 17 00:00:00 2001
From: Babis Chalios <bchalios@amazon.es>
Date: Thu, 5 Oct 2023 13:36:20 +0000
Subject: [PATCH 3/4] fix(test): avoid running ssh after exhausting memory

In test_balloon.py::test_deflate_on_oom we are exhausting the memory of
the microVM trying to trigger the OOM killer. This commit removes SSH
commands after launching the memory hogger inside the microVM, to avoid
hang connections due to the OOM killer killing sshd.

Signed-off-by: Babis Chalios <bchalios@amazon.es>
---
 tests/integration_tests/functional/test_balloon.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
index 7f7ec7dd928..8ec591b5836 100644
--- a/tests/integration_tests/functional/test_balloon.py
+++ b/tests/integration_tests/functional/test_balloon.py
@@ -73,15 +73,7 @@ def make_guest_dirty_memory(ssh_connection, amount_mib=32):
         logger.error("stdout: %s", stdout)
         logger.error("stderr: %s", stderr)
 
-    cmd = "cat /tmp/fillmem_output.txt"
-    tries = 3
-    while tries > 0:
-        # it may take a bit of time to dirty the memory and the OOM to kick-in
-        time.sleep(0.5)
-        _, stdout, _ = ssh_connection.run(cmd)
-        if stdout != "":
-            break
-        tries -= 1
+    time.sleep(5)
 
 
 def _test_rss_memory_lower(test_microvm, stable_delta=1):

From 57ed015ad05a1c143fdb1510febb0bd3642e932f Mon Sep 17 00:00:00 2001
From: Babis Chalios <bchalios@amazon.es>
Date: Thu, 5 Oct 2023 14:31:18 +0000
Subject: [PATCH 4/4] feat(test): add optional timeout in SSH connections

We have a mechanism that allows us to run a command inside a microVM.
This mechanism ends up using Popen.communicate to retrieve the output of
the SSH command. Popen.communicate comes with a timeout variable that we
were not using. However, it is useful in cases where we don't want to
wait for the result of the command we execute in the microVM.

This commit extends our SSH mechanism to accept the timeout argument.
Then, it uses the timeout in the test_balloon.py::test_deflate_on_oom
when launching the fillmem process. fillmem drains the memory of the VM,
which sometimes results in the SSH connection hanging.

Signed-off-by: Babis Chalios <bchalios@amazon.es>
---
 tests/framework/utils.py                      | 12 ++++-------
 tests/host_tools/network.py                   |  9 ++++----
 .../functional/test_balloon.py                | 21 +++++++++++++------
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/tests/framework/utils.py b/tests/framework/utils.py
index a6d79613513..802e002b12c 100644
--- a/tests/framework/utils.py
+++ b/tests/framework/utils.py
@@ -448,7 +448,7 @@ def get_free_mem_ssh(ssh_connection):
     raise Exception("Available memory not found in `/proc/meminfo")
 
 
-def run_cmd_sync(cmd, ignore_return_code=False, no_shell=False, cwd=None):
+def run_cmd_sync(cmd, ignore_return_code=False, no_shell=False, cwd=None, timeout=None):
     """
     Execute a given command.
 
@@ -469,7 +469,7 @@ def run_cmd_sync(cmd, ignore_return_code=False, no_shell=False, cwd=None):
         )
 
     # Capture stdout/stderr
-    stdout, stderr = proc.communicate()
+    stdout, stderr = proc.communicate(timeout=timeout)
 
     output_message = f"\n[{proc.pid}] Command:\n{cmd}"
     # Append stdout/stderr to the output message
@@ -493,18 +493,14 @@ def run_cmd_sync(cmd, ignore_return_code=False, no_shell=False, cwd=None):
     return CommandReturn(proc.returncode, stdout.decode(), stderr.decode())
 
 
-def run_cmd(cmd, ignore_return_code=False, no_shell=False, cwd=None):
+def run_cmd(cmd, **kwargs):
     """
     Run a command using the sync function that logs the output.
 
     :param cmd: command to run
-    :param ignore_return_code: whether a non-zero return code should be ignored
-    :param noshell: don't run the command in a sub-shell
     :returns: tuple of (return code, stdout, stderr)
     """
-    return run_cmd_sync(
-        cmd=cmd, ignore_return_code=ignore_return_code, no_shell=no_shell, cwd=cwd
-    )
+    return run_cmd_sync(cmd, **kwargs)
 
 
 def eager_map(func, iterable):
diff --git a/tests/host_tools/network.py b/tests/host_tools/network.py
index 3ef2332e748..60f1624fb90 100644
--- a/tests/host_tools/network.py
+++ b/tests/host_tools/network.py
@@ -89,7 +89,7 @@ def _init_connection(self):
         if ecode != 0:
             raise ConnectionError
 
-    def run(self, cmd_string):
+    def run(self, cmd_string, timeout=None):
         """Execute the command passed as a string in the ssh context."""
         return self._exec(
             [
@@ -97,10 +97,11 @@ def run(self, cmd_string):
                 *self.options,
                 f"{self.user}@{self.host}",
                 cmd_string,
-            ]
+            ],
+            timeout,
         )
 
-    def _exec(self, cmd):
+    def _exec(self, cmd, timeout=None):
         """Private function that handles the ssh client invocation."""
 
         # TODO: If a microvm runs in a particular network namespace, we have to
@@ -111,7 +112,7 @@ def _exec(self, cmd):
         if self.netns_file_path is not None:
             ctx = Namespace(self.netns_file_path, "net")
         with ctx:
-            return utils.run_cmd(cmd, ignore_return_code=True)
+            return utils.run_cmd(cmd, ignore_return_code=True, timeout=timeout)
 
 
 def mac_from_ip(ip_address):
diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
index 8ec591b5836..040dae5ea0c 100644
--- a/tests/integration_tests/functional/test_balloon.py
+++ b/tests/integration_tests/functional/test_balloon.py
@@ -4,6 +4,7 @@
 
 import logging
 import time
+from subprocess import TimeoutExpired
 
 import pytest
 from retry import retry
@@ -66,12 +67,20 @@ def make_guest_dirty_memory(ssh_connection, amount_mib=32):
     lower_ssh_oom_chance(ssh_connection)
 
     cmd = f"/usr/local/bin/fillmem {amount_mib}"
-    exit_code, stdout, stderr = ssh_connection.run(cmd)
-    # add something to the logs for troubleshooting
-    if exit_code != 0:
-        logger.error("while running: %s", cmd)
-        logger.error("stdout: %s", stdout)
-        logger.error("stderr: %s", stderr)
+    try:
+        exit_code, stdout, stderr = ssh_connection.run(cmd, timeout=1.0)
+        # add something to the logs for troubleshooting
+        if exit_code != 0:
+            logger.error("while running: %s", cmd)
+            logger.error("stdout: %s", stdout)
+            logger.error("stderr: %s", stderr)
+
+        cmd = "cat /tmp/fillmem_output.txt"
+    except TimeoutExpired:
+        # It's ok if this expires. Some times the SSH connection
+        # gets killed by the OOM killer *after* the fillmem program
+        # started. As a result, we can ignore timeouts here.
+        pass
 
     time.sleep(5)