diff --git a/shinigami/__init__.py b/shinigami/__init__.py index 7be8d6b..4652006 100644 --- a/shinigami/__init__.py +++ b/shinigami/__init__.py @@ -1,10 +1,6 @@ """Shinigami is a command-line application for killing errant processes on Slurm based compute nodes. The application scans for and terminates any running processes not associated with a currently running Slurm job. - -Individual users and groups can be whitelisted in the application settings file -via UID and GID values. Specific compute nodes can also be ignored using basic -string matching. See the `settings` module for more details. """ import importlib.metadata diff --git a/shinigami/cli.py b/shinigami/cli.py index 326c50a..f6b5371 100644 --- a/shinigami/cli.py +++ b/shinigami/cli.py @@ -1,4 +1,4 @@ -"""The executable application and its command-line interface.""" +"""The executable application and its command line interface.""" import asyncio import inspect diff --git a/shinigami/utils.py b/shinigami/utils.py index b1c2c09..d9c982d 100755 --- a/shinigami/utils.py +++ b/shinigami/utils.py @@ -10,15 +10,18 @@ import asyncssh import pandas as pd +# Technically the init process ID may vary with the system +# architecture, but 1 is an almost universal default INIT_PROCESS_ID = 1 +# Custom type hints Whitelist = Collection[Union[int, Tuple[int, int]]] -def id_in_whitelist(id_value: int, whitelist: Whitelist) -> bool: +def _id_in_whitelist(id_value: int, whitelist: Whitelist) -> bool: """Return whether an ID is in a list of ID value definitions - The `whitelist` of ID values can contain a mix of integers and tuples + The `whitelist` of ID values can contain a mix of integers and tuples of integer ranges. For example, [0, 1, (2, 9), 10] includes all IDs from zero through ten. @@ -58,11 +61,14 @@ def get_nodes(cluster: str, ignore_nodes: Collection[str] = tuple()) -> set: raise RuntimeError(stderr) all_nodes = stdout.decode().strip().split('\n') - return set(node for node in all_nodes if node not in ignore_nodes) + return set(all_nodes) - set(ignore_nodes) async def get_remote_processes(conn: asyncssh.SSHClientConnection) -> pd.DataFrame: - """Fetch running process data from the remote machine + """Fetch running process data from a remote machine + + The returned DataFrame is guaranteed to have columns `PID`, `PPID`, `PGID`, + `UID`, and `CND`. Args: conn: Open SSH connection to the machine @@ -82,8 +88,10 @@ def include_orphaned_processes(df: pd.DataFrame) -> pd.DataFrame: Given a DataFrame with system process data, return a subset of the data containing processes parented by `INIT_PROCESS_ID`. + See the `get_remote_processes` function for the assumed DataFrame data model. + Args: - df: A DataFrame with a `PPID` column + df: A DataFrame with process data Returns: A copy of the given DataFrame @@ -98,23 +106,30 @@ def include_user_whitelist(df: pd.DataFrame, uid_whitelist: Whitelist) -> pd.Dat Given a DataFrame with system process data, return a subset of the data containing processes owned by the given user IDs. + See the `get_remote_processes` function for the assumed DataFrame data model. + Args: - df: A DataFrame with a `UID` column + df: A DataFrame with process data uid_whitelist: List of user IDs to whitelist Returns: A copy of the given DataFrame """ - whitelist_index = df['UID'].apply(id_in_whitelist, whitelist=uid_whitelist) + whitelist_index = df['UID'].apply(_id_in_whitelist, whitelist=uid_whitelist) return df[whitelist_index] def exclude_active_slurm_users(df: pd.DataFrame) -> pd.DataFrame: """Filter a DataFrame to exclude user IDs tied to a running slurm job + Given a DataFrame with system process data, return a subset of the data + that excludes processes owned by users running a `slurmd` command. + + See the `get_remote_processes` function for the assumed DataFrame data model. + Args: - df: A DataFrame with `UID` and `CMD` columns + df: A DataFrame with process data Returns: A copy of the given DataFrame @@ -136,7 +151,7 @@ async def terminate_errant_processes( Args: node: The DNS resolvable name of the node to terminate processes on - uid_whitelist: Do not terminate processes owned by the given UID + uid_whitelist: Do not terminate processes owned by the given UIDs ssh_limit: Semaphore object used to limit concurrent SSH connections ssh_options: Options for configuring the outbound SSH connection debug: Log which process to terminate but do not terminate them diff --git a/tests/cli/test_base_parser.py b/tests/cli/test_base_parser.py index 39899b6..39239c7 100644 --- a/tests/cli/test_base_parser.py +++ b/tests/cli/test_base_parser.py @@ -6,7 +6,7 @@ class BaseParsing(TestCase): - """Test custom parsing login encapsulated by the `BaseParser` class""" + """Test custom parsing logic encapsulated by the `BaseParser` class""" def test_errors_raise_system_exit(self) -> None: """Test error messages are raised as `SystemExit` instances""" diff --git a/tests/utils/test_exclude_active_slurm_users.py b/tests/utils/test_exclude_active_slurm_users.py index 604125e..322de10 100644 --- a/tests/utils/test_exclude_active_slurm_users.py +++ b/tests/utils/test_exclude_active_slurm_users.py @@ -11,31 +11,34 @@ class ExcludeSlurmUsers(unittest.TestCase): """Test the identification of slurm users from a DataFrame of process data""" def test_exclude_slurm_users(self) -> None: + """Test slurm processes are excluded from the returned DataFrame""" + + # User 1002 has two processes. + # They BOTH should be excluded because ONE of them is a slurm process. input_df = pd.DataFrame({ - 'UID': [1001, 1002, 1003, 1004, 1005], - 'CMD': ['process_1', 'slurmd', 'process_3', 'process_4', 'process_5']}) + 'UID': [1001, 1002, 1002, 1003, 1004], + 'CMD': ['process 1', 'slurmd ...', 'process 3', 'process 4', 'process5']}) - expected_df = input_df.loc[[0, 2, 3, 4]] + expected_df = input_df.loc[[0, 3, 4]] returned_df = exclude_active_slurm_users(input_df) - pd.testing.assert_frame_equal(returned_df, expected_df) - self.assertIsNot(returned_df, input_df) def test_no_slurm_users(self) -> None: + """Test the returned dataframe matches the input dataframe when there are no slurm processes""" + input_df = pd.DataFrame({ 'UID': [1001, 1002, 1003, 1004, 1005], - 'CMD': ['process_1', 'process_2', 'process_3', 'process_4', 'process_5']}) + 'CMD': ['process1', 'process2', 'process3', 'process4', 'process5']}) returned_df = exclude_active_slurm_users(input_df) - pd.testing.assert_frame_equal(returned_df, input_df) - self.assertIsNot(returned_df, input_df) def test_all_slurm_users(self) -> None: + """Test the returned dataframe is empty when all process container `slurmd`""" + input_df = pd.DataFrame({ - 'UID': [1001, 1002, 1003, 1004, 1005], - 'CMD': ['slurmd', 'slurmd', 'slurmd', 'slurmd', 'slurmd']}) + 'UID': [1001, 1002, 1003], + 'CMD': ['slurmd', 'prefix slurmd', 'slurmd postfix']}) returned_df = exclude_active_slurm_users(input_df) self.assertTrue(returned_df.empty) - self.assertIsNot(returned_df, input_df) diff --git a/tests/utils/test_include_orphaned_processes.py b/tests/utils/test_include_orphaned_processes.py index 5aaa991..ad81b67 100644 --- a/tests/utils/test_include_orphaned_processes.py +++ b/tests/utils/test_include_orphaned_processes.py @@ -15,35 +15,23 @@ def test_orphaned_processes_are_returned(self) -> None: input_df = pd.DataFrame({ 'PID': [1, 2, 3, 4, 5], - 'PPID': [0, INIT_PROCESS_ID, INIT_PROCESS_ID, 2000, 2000], - 'PGID': [0, 0, 0, 0, 0], - 'UID': [0, 1, 1, 2, 2], - 'CMD': ['init', 'process_1', 'process_2', 'process_3', 'process_4']}) - - # Manually define the data subset (including the index) expected in the function return - expected_output_df = pd.DataFrame({ - 'PID': [2, 3], - 'PPID': [INIT_PROCESS_ID, INIT_PROCESS_ID], - 'PGID': [0, 0], - 'UID': [1, 1], - 'CMD': ['process_1', 'process_2']}, - index=[1, 2]) - - pd.testing.assert_frame_equal( - include_orphaned_processes(input_df), - expected_output_df) + 'PPID': [0, INIT_PROCESS_ID, INIT_PROCESS_ID, 2000, 2000] + }) + + expected_df = input_df.loc[[1, 2]] + returned_df = include_orphaned_processes(input_df) + pd.testing.assert_frame_equal(returned_df, expected_df) def test_no_orphaned_processes(self) -> None: """Test the returned DataFrame is empty when no orphaned processes are present""" input_df = pd.DataFrame({ 'PID': [1, 2, 3, 4, 5], - 'PPID': [0, 2, 3, 4, 4], - 'PGID': [0, 0, 0, 0, 0], - 'UID': [0, 1, 1, 2, 2], - 'CMD': ['init', 'process_1', 'process_2', 'process_3', 'process_4']}) + 'PPID': [0, 2, 3, 4, 4] + }) - self.assertTrue(include_orphaned_processes(input_df).empty) + returned_df = include_orphaned_processes(input_df) + self.assertTrue(returned_df.empty) def test_all_orphaned_processes(self) -> None: """Test the returned DataFrame matches the input DataFrame when all processes are orphaned""" @@ -51,12 +39,7 @@ def test_all_orphaned_processes(self) -> None: input_df = pd.DataFrame({ 'PID': [2, 3, 4, 5], 'PPID': [INIT_PROCESS_ID, INIT_PROCESS_ID, INIT_PROCESS_ID, INIT_PROCESS_ID], - 'PGID': [0, 0, 0, 0], - 'UID': [1, 1, 2, 2], - 'CMD': ['process_1', 'process_2', 'process_3', 'process_4']}) + }) returned_df = include_orphaned_processes(input_df) pd.testing.assert_frame_equal(returned_df, input_df) - - # Make sure the return value is an equivalent copy and not just the original DataFrame - self.assertIsNot(returned_df, input_df) diff --git a/tests/utils/test_include_user_whitelist.py b/tests/utils/test_include_user_whitelist.py index f9f92e9..12ee5eb 100644 --- a/tests/utils/test_include_user_whitelist.py +++ b/tests/utils/test_include_user_whitelist.py @@ -4,7 +4,7 @@ import pandas as pd -from shinigami.utils import INIT_PROCESS_ID, include_user_whitelist +from shinigami.utils import include_user_whitelist class UserIDs(TestCase): @@ -17,12 +17,7 @@ def setUp(self) -> None: processes, and multiple user IDs. """ - self.testing_data = pd.DataFrame({ - 'PID': [INIT_PROCESS_ID, 100, 101, 102, 103], - 'PPID': [0, INIT_PROCESS_ID, INIT_PROCESS_ID, 100, 200], - 'PGID': [0, 1, 1, 2, 3], - 'UID': [0, 123, 123, 456, 789], - 'CMD': ['init', 'process_1', 'process_2', 'process_3', 'process_4']}) + self.testing_data = pd.DataFrame({'UID': [0, 123, 123, 456, 789]}) def test_empty_whitelist(self) -> None: """Test the returned DataFrame is empty when the whitelist is empty"""