fix redaction so playback on different system can work

cloudtruth · Sep 24, 2020 · 70ad4fe · 70ad4fe
1 parent 93cbdc6
commit 70ad4fe
Show file tree

Hide file tree

Showing 4 changed files with 84 additions and 85 deletions.
diff --git a/interposer/recorder.py b/interposer/recorder.py
@@ -102,13 +102,18 @@ def tearDownClass(cls) -> None:
 
         super().tearDownClass()
 
-    def redact(self, secret: str) -> str:
+    def redact(self, secret: str, replacement: str = "*") -> str:
         """
         Redact a secret in playback mode, and keep track of the secret in
         recording mode.  This allows tests to use secrets quite normally.
         Callers just have to remember to run secrets through redact().
+
+        It is recommended you provide a unique replacment string (it does
+        not need to be the same length as the secret), for each secret
+        to disambiguate them in the same run, especially for secrets of the
+        same length.
         """
-        return self.tapedeck.redact(secret)
+        return self.tapedeck.redact(secret, replacement=replacement)
 
 
 class TapeDeckCallHandler(CallHandler):

diff --git a/interposer/tapedeck.py b/interposer/tapedeck.py
@@ -21,7 +21,6 @@
 from typing import Callable
 from typing import Dict
 from typing import Optional
-from typing import Set
 
 import yaml
 
@@ -172,13 +171,11 @@ def __init__(self, deck: Path, mode: Mode) -> None:
         self.deck = deck
         self.file_format = None
         self.mode = mode
-        self.redactions: Set[str] = set()
-
-        self._logger = logging.getLogger(__name__)
 
         # call ordinal key (channel name) and value (ordinal number)
         self._call_ordinals: Dict[str, int] = {}
-
+        self._logger = logging.getLogger(__name__)
+        self._redactions: Dict[str, str] = dict()
         # the open file resource
         self._tape = None
 
@@ -233,6 +230,8 @@ def open(self) -> None:
         if self._tape:
             raise TapeDeckOpenError()
 
+        self._reset()
+
         if self.mode == Mode.Playback:
             self._tape = shelve.open(
                 str(self.deck), flag="r", protocol=self.PICKLE_PROTOCOL
@@ -253,12 +252,6 @@ def open(self) -> None:
             self._tape[self.LABEL_FILE_FORMAT] = self.CURRENT_FILE_FORMAT
             self.file_format = self.CURRENT_FILE_FORMAT
 
-        # ensure if close() then open() is called we reset the ordinals to zero
-        self._call_ordinals = dict()
-        self.redactions = set()
-
-        self._call_ordinals = {}
-
         self._log(
             logging.DEBUG,
             "open",
@@ -282,6 +275,8 @@ def close(self) -> None:
                 f"{self.deck} for {self.mode} using file format {self.file_format}",
             )
 
+        self._reset()
+
     def record(
         self,
         context: CallContext,
@@ -348,29 +343,45 @@ def playback(self, context: CallContext, channel: str = "default") -> Any:
             self._log_ex("playback", context, payload.ex)
             raise payload.ex
 
-    def redact(self, secret: str) -> str:
+    def redact(self, secret: str, replacement: str = "*") -> str:
         """
         Auto-track secrets for redaction.
 
-        In recording mode this returns the secret and makes sure the secret
-        never makes it into the recording; instead a redacted version does,
-        which is done before call contexts get hashed so the hashed context
-        is of the redacted context.  The results and exceptions are also
-        redacted before storage.
+        Tracks the secret for redaction of content being written.  For example
+        if a secret exists inside an object call result, it will be
+        overwritten with an equal length string containing the replacement,
+        wrapping around if necessary.  The replacement can be a single
+        character or a string which will get duplicated (or truncated) to
+        fit to the required size.
+
+        It is recommended that for each secret a different replacement is
+        used, since equal length secrets with the same replacement will yield
+        equal redacted secrets.
 
-        In playback mode the redacted secret is returned so the lookup finds
-        the redacted context.
+        During recording this method updates the redaction dict internally
+        and returns the original secret (so the remainder of the recording
+        can proceed), however call signatures will have the replacement in
+        place of the secret.  In playback mode when called, this method
+        will return the replacement that was used during recording so the
+        call can be found.
         """
         if not isinstance(secret, str):
             raise TypeError("secret must be a string")
+        if not isinstance(replacement, str):
+            raise TypeError("replacement must be a string")
         if not secret:
             raise AttributeError("secret cannot be an empty string")
+        if not replacement:
+            raise AttributeError("replacement cannot be an empty string")
+
+        wantlen = len(secret)
+        obfu = (replacement * math.ceil(wantlen / len(replacement)))[:wantlen]
 
         if self.mode == Mode.Recording:
-            self.redactions.add(secret)
+            self._redactions[secret] = obfu
             return secret
         else:
-            return self._obscured(secret)
+            return obfu
 
     def _advance(self, context: CallContext, channel: str) -> str:
         """
@@ -482,9 +493,8 @@ def _log(self, level: int, category: str, action: str, msg: str) -> None:
         Common funnel for logs.
         """
         msg = f"TAPE: {category}({action}): {msg}"
-        for secret in self.redactions:
-            redacted_secret = self._redact(secret)
-            msg = msg.replace(secret, redacted_secret)
+        for secret, replacement in self._redactions.items():
+            msg = msg.replace(secret, replacement)
         self._logger.log(level, msg)
 
     def _log_ex(self, action: str, context: CallContext, ex: Exception) -> None:
@@ -519,42 +529,21 @@ def _log_result(self, action: str, context: CallContext, result: Any) -> None:
         if self._logger.isEnabledFor(self.DEBUG_WITH_RESULTS):
             context.meta[self.LABEL_TAPE].pop(self.LABEL_RESULT)
 
-    def _obscured(self, secret: str) -> str:
-        """
-        Create a redaction for a secret that is based on the content
-        of the secret in order to disambiguate it from other redactions
-        in the same recording, but still be redacted.
-        """
-        wantlen = len(secret)
-        uniq = sha256(secret.encode()).hexdigest()
-        if len(uniq) < wantlen:
-            uniq *= math.ceil(wantlen / len(uniq))
-        if wantlen > 31:
-            # bookmarks make redactions easier to spot in logs
-            uniq = ":::REDACTED:::" + uniq[: wantlen - 28] + ":::REDACTED:::"
-        elif wantlen > 10:
-            uniq = ":::" + uniq[: wantlen - 6] + ":::"
-        else:
-            uniq = uniq[:wantlen]
-        return uniq
-
     def _redact(self, entity: Any, return_bytes: bool = False) -> Any:
         """
         Redacts any known secrets in an object by converting it to pickled
         binary form, then doing a binary secret replacement, then unpickling.
 
         This is used before we hash contexts and before we store results to
         make sure there are no secrets in the recording.  The secrets must
-        be fed to us from the consumer (self.redactions).
+        be fed to us from the consumer (self._redactions).
 
         Raises:
             PicklingError if something in the context cannot be pickled.
         """
         raw = pickle.dumps(entity, protocol=self.PICKLE_PROTOCOL)
-        for secret in self.redactions:
-            binary_secret = secret.encode()
-            redacted_secret = self._obscured(secret).encode()
-            raw = raw.replace(binary_secret, redacted_secret)
+        for secret, replacement in self._redactions.items():
+            raw = raw.replace(secret.encode(), replacement.encode())
         return pickle.loads(raw) if not return_bytes else raw  # nosec
 
     def _reduce_call(self, context: CallContext) -> Callable:
@@ -581,3 +570,9 @@ def _reduce_call(self, context: CallContext) -> Callable:
         result = context.call
         context.call = sig
         return result
+
+    def _reset(self) -> None:
+        """ Clean out stuff at open and close. """
+        self.file_format = None
+        self._call_ordinals = dict()
+        self._redactions = dict()
diff --git a/tests/recorder_test.py b/tests/recorder_test.py
@@ -282,15 +282,15 @@ def tearDownClass(cls) -> None:
         Delete the recording file since we tested both modes in one test.
         """
         datafile = Path(str(cls.tapedeck.deck) + ".gz")
-        redactions = cls.tapedeck.redactions
+        redactions = cls.tapedeck._redactions
         super().tearDownClass()
         if datafile.exists():
             with gzip.open(datafile, "rb") as fin:
                 raw = fin.read()
-                for secret in redactions:
+                for secret, replacement in redactions.items():
                     assert (
                         secret.encode() not in raw
-                    ), "a secret leaked into the recording"
+                    ), "a secret leaked into the recording!"
             datafile.unlink()
         os.environ.pop("RECORDING")
 
@@ -315,25 +315,22 @@ def test_secrets(self):
         # post-processing redaction list and after the recording file is closed
         # the secrets get redacted; in playback mode this redacts it immediately
         # so the playback matches the recording
-        assert not self.tapedeck.redactions
-        uut = cls(self.redact(self.token))
-        assert self.tapedeck.redactions
-        assert uut.get_token() == self.redact(self.token)
+        assert not self.tapedeck._redactions
+        uut = cls(self.redact(self.token, replacement="TOKEN"))
+        assert self.tapedeck._redactions
+        assert uut.get_token() == self.redact(self.token, replacement="TOKEN")
         assert self.redact(self.token) == self.token
 
         self.tapedeck.close()  # applies redaction to recording
         self.tapedeck.mode = Mode.Playback
         self.tapedeck.open()
 
-        assert not self.tapedeck.redactions
-        uut = cls(self.redact(self.token))
-        foo = self.redact(self.token)
-        assert foo.startswith(":::REDACTED:::")
-        assert foo.endswith(":::REDACTED:::")
-        assert len(foo) == len(self.token)
-        assert ":" not in foo[14:-14]
+        assert not self.tapedeck._redactions
+        uut = cls(self.redact(self.token, replacement="TOKEN"))
+        foo = self.redact(self.token, replacement="TOKEN")
+        assert foo == "TOKENTOKENTOKENTOKENTOKENTOKENTOKENT"
 
-        # most importantly, the value that was recorded was also redacted
+        # most importantly, the object initializer argument was redacted
         assert uut.get_token() == foo
         with self.assertRaises(RecordedCallNotFoundError):
             uut.get_token()

diff --git a/tests/tapedeck_test.py b/tests/tapedeck_test.py
@@ -172,42 +172,44 @@ def test_recording_too_old(self):
         finally:
             TapeDeck.EARLIEST_FILE_FORMAT_SUPPORTED = save
 
-    def test_redacted_obscured(self):
+    def test_redact(self):
         """
         Tests logic that makes an obscure but unique redaction string.
         """
-        uut = TapeDeck(self.datadir / "recording", Mode.Playback)
+        uut = TapeDeck(self.datadir / "recording", Mode.Recording)
 
         with self.assertRaises(TypeError):
             uut.redact(None)
         with self.assertRaises(TypeError):
             uut.redact(42)
         with self.assertRaises(AttributeError):
             uut.redact("")
+        with self.assertRaises(TypeError):
+            uut.redact("foo", replacement=None)
+        with self.assertRaises(TypeError):
+            uut.redact("foo", replacement=42)
+        with self.assertRaises(AttributeError):
+            uut.redact("foo", replacement="")
 
-        for i in range(1, 512):
-            assert len(uut._obscured("*" * i)) == i
+        assert uut.redact("foo") == "foo"  # recording mode
+        assert uut._redactions.get("foo") == "***"  # default
 
-        token = str(uuid.uuid4())
-        token2 = str(uuid.uuid4())
+        assert uut.redact("foo", replacement="#") == "foo"  # recording mode
+        assert uut._redactions.get("foo") == "###"  # default
 
-        # tokens up to length 10 are just hash
-        assert uut._obscured("A") != uut._obscured("Z")
-        assert ":" not in uut._obscured("123456789")
+        assert uut.redact("foo", replacement="BARSAM") == "foo"  # recording mode
+        assert uut._redactions.get("foo") == "BAR"  # default
 
-        # tokens up to length 32 have small bookmarks for logging standout
-        foo = uut._obscured("*" * 31)
-        assert foo.startswith(":::")
-        assert foo.endswith(":::")
-        assert ":" not in foo[3:-3]
+        assert (
+            uut.redact("fudge brownie", replacement="BARS") == "fudge brownie"
+        )  # recording mode
+        assert uut._redactions.get("fudge brownie") == "BARSBARSBARSB"  # default
 
-        # longer secrets have :::REDACTED::: bookmarks and still unique
-        bar = uut._obscured(token)
-        assert bar.startswith(":::REDACTED:::")
-        assert bar.endswith(":::REDACTED:::")
-        assert ":" not in bar[14:-14]
+        uut.mode = Mode.Playback
 
-        assert uut._obscured(token) != uut._obscured(token2)
+        assert (
+            uut.redact("fudge brownie", replacement="BARS") == "BARSBARSBARSB"
+        )  # playback mode
 
     def test_recording_secrets(self):
         """ Tests automatic redaction of known secrets and use in playback """