Test v3.14 roundtrip #34

Merged
merged 4 commits on Dec 3, 2024
Changes from all commits
3 changes: 3 additions & 0 deletions .github/workflows/release.yml
@@ -110,3 +110,6 @@ jobs:
uses: pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
# Don't fail when this is another push to the same version. If we put
# the git hash in the version string, this could be removed.
skip-existing: true
2 changes: 2 additions & 0 deletions README.md
@@ -11,7 +11,9 @@ To convert rm files to other formats, you can use [rmc](https://github.com/rickl
### Unreleased

Fixes:

- Fix AssertionError when some ids are missing in a `CrdtSequence` ([#36](https://github.com/ricklupton/rmscene/pull/36))
- Store any unparsed data in blocks as raw bytes to allow for round-trip saving of files written in a newer format than the parsing code knows about.

### v0.6.0

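The changelog entry above describes the behaviour this PR tests: blocks read from a file written by newer reMarkable software keep any bytes the parser did not understand, so writing them back reproduces the original file. A minimal round-trip sketch, assuming `read_blocks` and `write_blocks` are importable from the package root (they live in `rmscene.scene_stream`; the exact `write_blocks` signature is an assumption here):

```python
from io import BytesIO

from rmscene import read_blocks, write_blocks  # package-level re-exports assumed

# "Color_and_tool_v3.14.4.rm" is the new test file added by this PR; any
# v6-format .rm file works the same way.
with open("Color_and_tool_v3.14.4.rm", "rb") as f:
    original = f.read()

# Unknown block types and trailing bytes are kept as raw data rather than
# being dropped or raising an error.
blocks = list(read_blocks(BytesIO(original)))

# Writing the same blocks back should reproduce the input byte-for-byte.
output = BytesIO()
write_blocks(output, blocks)
assert output.getvalue() == original
```
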
72 changes: 45 additions & 27 deletions src/rmscene/scene_stream.py
@@ -62,11 +62,44 @@ def lookup(cls, block_type: int) -> tp.Optional[tp.Type[Block]]:
return match
return None

@classmethod
def read(cls, reader: TaggedBlockReader) -> Optional[Block]:
"""
Maybe parse a block from the reader stream.
"""
with reader.read_block() as block_info:
if block_info is None:
return

block_type = Block.lookup(block_info.block_type)
if block_type:
try:
block = block_type.from_stream(reader)
except Exception as e:
_logger.warning("Error reading block: %s", e)
reader.data.data.seek(block_info.offset)
data = reader.data.read_bytes(block_info.size)
block = UnreadableBlock(str(e), data, block_info)
else:
msg = (
f"Unknown block type {block_info.block_type}. "
f"Skipping {block_info.size} bytes."
)
_logger.warning(msg)
data = reader.data.read_bytes(block_info.size)
block = UnreadableBlock(msg, data, block_info)

# Keep any unparsed extra data
block.extra_data = block_info.extra_data
return block

def write(self, writer: TaggedBlockWriter):
"""Write the block header and content to the stream."""
min_version, current_version = self.version_info(writer)
with writer.write_block(self.get_block_type(), min_version, current_version):
self.to_stream(writer)
# Write any leftover extra data that wasn't parsed
writer.data.write_bytes(self.extra_data)

@classmethod
@abstractmethod
@@ -421,6 +454,7 @@ def line_to_stream(line: si.Line, writer: TaggedBlockWriter, version: int = 2):
class SceneItemBlock(Block):
parent_id: CrdtId
item: CrdtSequenceItem
extra_value_data: bytes = b""

ITEM_TYPE: tp.ClassVar[int] = 0

@@ -457,16 +491,16 @@ def from_stream(cls, stream: TaggedBlockReader) -> SceneItemBlock:
item_type = stream.data.read_uint8()
assert item_type == subclass.ITEM_TYPE
value = subclass.value_from_stream(stream)
# Keep known extra data
extra_data = block_info.extra_data
# Keep known extra data from within the value subblock
extra_value_data = block_info.extra_data
else:
value = None
extra_data = b""
extra_value_data = b""

return subclass(
parent_id,
CrdtSequenceItem(item_id, left_id, right_id, deleted_length, value),
extra_data=extra_data,
extra_value_data=extra_value_data,
)

def to_stream(self, writer: TaggedBlockWriter):
@@ -482,7 +516,7 @@ def to_stream(self, writer: TaggedBlockWriter):
writer.data.write_uint8(self.ITEM_TYPE)
self.value_to_stream(writer, self.item.value)

writer.data.write_bytes(self.extra_data)
writer.data.write_bytes(self.extra_value_data)

@classmethod
@abstractmethod
@@ -795,28 +829,12 @@ def _read_blocks(stream: TaggedBlockReader) -> Iterator[Block]:
Parse blocks from reMarkable v6 file.
"""
while True:
with stream.read_block() as block_info:
if block_info is None:
# no more blocks
return

block_type = Block.lookup(block_info.block_type)
if block_type:
try:
yield block_type.from_stream(stream)
except Exception as e:
_logger.warning("Error reading block: %s", e)
stream.data.data.seek(block_info.offset)
data = stream.data.read_bytes(block_info.size)
yield UnreadableBlock(str(e), data, block_info)
else:
msg = (
f"Unknown block type {block_info.block_type}. "
f"Skipping {block_info.size} bytes."
)
_logger.warning(msg)
data = stream.data.read_bytes(block_info.size)
yield UnreadableBlock(msg, data, block_info)
maybe_block = Block.read(stream)
if maybe_block:
yield maybe_block
else:
# no more blocks
return


def read_blocks(data: tp.BinaryIO) -> Iterator[Block]:
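The scene_stream.py changes above move block-level error handling out of `_read_blocks` into the new `Block.read` classmethod: unknown block types and parse failures come back as `UnreadableBlock` instances holding the raw bytes, and any trailing bytes a known block did not consume are stored on `block.extra_data`. A short sketch of how a caller might inspect this after reading a newer-format file (the file name is hypothetical, and the `error` attribute name on `UnreadableBlock` is assumed from the constructor call in the diff):

```python
from rmscene.scene_stream import UnreadableBlock, read_blocks

with open("page_from_newer_firmware.rm", "rb") as f:  # hypothetical file name
    for block in read_blocks(f):
        if isinstance(block, UnreadableBlock):
            # Could not be parsed, but the raw bytes were kept, so the block
            # can still be written back out unchanged.
            print("unreadable block:", block.error)  # "error" attribute assumed
        elif block.extra_data:
            # Parsed successfully, but the file contained trailing bytes this
            # version of the parser does not yet understand.
            print(type(block).__name__, "kept", len(block.extra_data), "extra bytes")
```
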
98 changes: 68 additions & 30 deletions tests/test_scene_stream.py
@@ -33,21 +33,22 @@ def _hex_lines(b, n=32):
]


@pytest.mark.parametrize(
"test_file,version",
[
("Normal_AB.rm", "3.0"),
("Normal_A_stroke_2_layers.rm", "3.0"),
("Normal_A_stroke_2_layers_v3.2.2.rm", "3.2.2"),
("Normal_A_stroke_2_layers_v3.3.2.rm", "3.3.2"),
("Bold_Heading_Bullet_Normal.rm", "3.0"),
("Lines_v2.rm", "3.1"),
("Lines_v2_updated.rm", "3.2"), # extra 7fXXXX part of Line data was added
("Wikipedia_highlighted_p1.rm", "3.1"),
("Wikipedia_highlighted_p2.rm", "3.1"),
("With_SceneInfo_Block.rm", "3.4"), # XXX version?
],
)
TEST_FILES_AND_VERSIONS = [
("Normal_AB.rm", "3.0"),
("Normal_A_stroke_2_layers.rm", "3.0"),
("Normal_A_stroke_2_layers_v3.2.2.rm", "3.2.2"),
("Normal_A_stroke_2_layers_v3.3.2.rm", "3.3.2"),
("Bold_Heading_Bullet_Normal.rm", "3.0"),
("Lines_v2.rm", "3.1"),
("Lines_v2_updated.rm", "3.2"), # extra 7fXXXX part of Line data was added
("Wikipedia_highlighted_p1.rm", "3.1"),
("Wikipedia_highlighted_p2.rm", "3.1"),
("With_SceneInfo_Block.rm", "3.4"), # XXX version?
("Color_and_tool_v3.14.4.rm", "3.14"),
]


@pytest.mark.parametrize("test_file,version", TEST_FILES_AND_VERSIONS)
def test_full_roundtrip(test_file, version):
with open(DATA_PATH / test_file, "rb") as f:
data = f.read()
@@ -67,6 +68,28 @@ def test_full_roundtrip(test_file, version):
assert _hex_lines(input_buf.getvalue()) == _hex_lines(output_buf.getvalue())


# FIXME: remove xfail when parsing updated

TEST_FILES_FOR_FULL_PARSING = [
pytest.param(
filename,
marks=pytest.mark.xfail if filename == "Color_and_tool_v3.14.4.rm" else [],
)
for filename, _ in TEST_FILES_AND_VERSIONS
]


@pytest.mark.parametrize("test_file", TEST_FILES_FOR_FULL_PARSING)
def test_files_fully_parsed(test_file):
with open(DATA_PATH / test_file, "rb") as f:
result = list(read_blocks(f))

# Check that none of the blocks are unreadable and that none have extra data
for block in result:
assert not isinstance(block, UnreadableBlock)
assert not block.extra_data


def test_normal_ab():
with open(DATA_PATH / "Normal_AB.rm", "rb") as f:
result = list(read_blocks(f))
@@ -234,16 +257,13 @@ def test_blocks_roundtrip(block):
writer = TaggedBlockWriter(buf)
reader = TaggedBlockReader(buf)

# Use 4 as a fallback -- it only matters for the SceneItem blocks
block_type = getattr(block, "BLOCK_TYPE", 4)
with writer.write_block(block_type, 1, 1):
block.to_stream(writer)

block.write(writer)
buf.seek(0)
logger.info("After writing block %s", type(block))
logger.info("Buffer: %s", buf.getvalue().hex())
with reader.read_block():
block2 = block.from_stream(reader)

block2 = Block.read(reader)

assert block2 == block


@@ -259,7 +279,28 @@ def test_write_blocks():
assert buf.getvalue()[43:].hex() == "05000000000101001f01012101"


def test_blocks_keep_unknown_data():
def test_blocks_keep_unknown_data_in_main_block():
# The "E1 FF" represents new, unknown data -- note that this might need
# to be changed in future if the next id starts to actually be used in a
# future update!
data_hex = """
21000000 0000010D
1C 06000000
1F 0000
2F 0000
2C 05000000
1F 0000 21 01
3C 05000000
1F 0000 21 01
E1 FF
"""
buf = BytesIO(HEADER_V6 + bytes.fromhex(data_hex))
block = next(read_blocks(buf))
assert isinstance(block, SceneInfo)
assert block.extra_data == bytes.fromhex("E1 FF")


def test_blocks_keep_unknown_data_in_value_subblock():
# The "8f 010f" represents new, unknown data -- note that this might need
# to be changed in future if the next id starts to actually be used in a
# future update!
@@ -286,7 +327,7 @@
buf = BytesIO(HEADER_V6 + bytes.fromhex(data_hex))
block = next(read_blocks(buf))
assert isinstance(block, SceneLineItemBlock)
assert block.extra_data == bytes.fromhex("8f 0101")
assert block.extra_value_data == bytes.fromhex("8f 0101")


def test_error_in_block_contained():
@@ -339,6 +380,7 @@ def test_error_in_block_contained():
author_ids_block_strategy = st.builds(
AuthorIdsBlock,
st.dictionaries(st.integers(min_value=0, max_value=65535), st.uuids()),
extra_data=st.binary(),
)

block_strategy = st.one_of(
@@ -355,15 +397,11 @@ def test_blocks_roundtrip_2(block):
writer = TaggedBlockWriter(buf)
reader = TaggedBlockReader(buf)

# Mock header
with writer.write_block(4, 1, 1):
block.to_stream(writer)

block.write(writer)
buf.seek(0)
logger.info("After writing block %s", type(block))
logger.info("Buffer: %s", buf.getvalue().hex())
with reader.read_block():
block2 = block.from_stream(reader)
block2 = Block.read(reader)
assert block2 == block

