From 2f52798cee29585b3b3c5ee275d9a32f34d1d7ff Mon Sep 17 00:00:00 2001
From: Babis Chalios <bchalios@amazon.es>
Date: Thu, 5 Oct 2023 10:35:34 +0000
Subject: [PATCH] fix(test): balloon deflate on OOM

Balloon devices have a feature where they can start deflating when the
guest is in an OOM situation. We have a test that ensures this
functionality works as expected. The test creates a microVM with a
balloon device enabled, inflates the balloon, and then invokes a
process in the microVM that exhausts the remaining microVM memory. The
expectation is that the OOM killer will kick in and reap that process.
The test succeeds only if it observes that the process filling up the
memory was killed.

However, we do not really have control over which process the OOM
killer will decide to kill in low-memory situations. This makes the
test fail intermittently.

This commit changes the test to look at balloon statistics instead.
Conceptually this makes sense; we don't want to test the OOM killer
functionality, we want to ensure that the balloon device gives back
memory to the VM in low memory situations. The balloon statistics can
give us this information.

Signed-off-by: Babis Chalios <bchalios@amazon.es>
---
 .../functional/test_balloon.py                | 48 +++++++------------
 1 file changed, 16 insertions(+), 32 deletions(-)

diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
index 785f4c52a81f..a4bab4b3d90e 100644
--- a/tests/integration_tests/functional/test_balloon.py
+++ b/tests/integration_tests/functional/test_balloon.py
@@ -38,31 +38,12 @@ def get_rss_from_pmap():
     return second_rss
 
 
-def lower_ssh_oom_chance(ssh_connection):
-    """Lure OOM away from ssh process"""
-    logger = logging.getLogger("lower_ssh_oom_chance")
-
-    cmd = "pidof sshd"
-    exit_code, stdout, stderr = ssh_connection.run(cmd)
-    # add something to the logs for troubleshooting
-    if exit_code != 0:
-        logger.error("while running: %s", cmd)
-        logger.error("stdout: %s", stdout)
-        logger.error("stderr: %s", stderr)
-
-    for pid in stdout.split(" "):
-        cmd = f"choom -n -1000 -p {pid}"
-        ssh_connection.run(cmd)
-
-
-def make_guest_dirty_memory(ssh_connection, should_oom=False, amount_mib=32):
+def make_guest_dirty_memory(ssh_connection, amount_mib=32):
     """Tell the guest, over ssh, to dirty `amount` pages of memory."""
     logger = logging.getLogger("make_guest_dirty_memory")
 
-    lower_ssh_oom_chance(ssh_connection)
-
     # Aim OOM at fillmem process
-    cmd = f"choom -n 1000 -- /usr/local/bin/fillmem {amount_mib}"
+    cmd = f"/usr/local/bin/fillmem {amount_mib}"
     exit_code, stdout, stderr = ssh_connection.run(cmd)
     # add something to the logs for troubleshooting
     if exit_code != 0:
@@ -80,12 +61,6 @@ def make_guest_dirty_memory(ssh_connection, should_oom=False, amount_mib=32):
             break
         tries -= 1
 
-    if should_oom:
-        assert "OOM Killer stopped the program with signal 9, exit code 0" in stdout
-    else:
-        assert exit_code == 0, stderr
-        assert "Memory filling was successful" in stdout, stdout
-
 
 def _test_rss_memory_lower(test_microvm, stable_delta=1):
     """Check inflating the balloon makes guest use less rss memory."""
@@ -181,11 +156,11 @@ def test_deflate_on_oom(test_microvm_with_api, deflate_on_oom):
 
     deflate_on_oom=True
 
-      should not result in an OOM kill
+      should result in balloon_stats['actual_mib'] be reduced
 
     deflate_on_oom=False
 
-      should result in an OOM kill
+      should result in balloon_stats['actual_mib'] remain the same
     """
     test_microvm = test_microvm_with_api
     test_microvm.spawn()
@@ -194,7 +169,7 @@ def test_deflate_on_oom(test_microvm_with_api, deflate_on_oom):
 
     # Add a deflated memory balloon.
     test_microvm.api.balloon.put(
-        amount_mib=0, deflate_on_oom=deflate_on_oom, stats_polling_interval_s=0
+        amount_mib=0, deflate_on_oom=deflate_on_oom, stats_polling_interval_s=1
     )
 
     # Start the microvm.
@@ -213,8 +188,17 @@ def test_deflate_on_oom(test_microvm_with_api, deflate_on_oom):
     # This call will internally wait for rss to become stable.
     _ = get_stable_rss_mem_by_pid(firecracker_pid)
 
-    # Check that using memory leads an out of memory error (or not).
-    make_guest_dirty_memory(test_microvm.ssh, should_oom=not deflate_on_oom)
+    # Check that using memory leads to the balloon device automatically
+    # deflate (or not).
+    balloon_size_before = test_microvm.api.balloon_stats.get().json()["actual_mib"]
+    make_guest_dirty_memory(test_microvm.ssh)
+
+    balloon_size_after = test_microvm.api.balloon_stats.get().json()["actual_mib"]
+    print(f"size before: {balloon_size_before} size after: {balloon_size_after}")
+    if deflate_on_oom:
+        assert balloon_size_after < balloon_size_before, "Balloon did not deflate"
+    else:
+        assert balloon_size_after >= balloon_size_before, "Balloon deflated"
 
 
 # pylint: disable=C0103