Implement text compression

mdsteele · Aug 28, 2024 · b9e0f37 · b9e0f37
1 parent c67d8f3
commit b9e0f37
Show file tree

Hide file tree

Showing 78 changed files with 3,143 additions and 3,306 deletions.
diff --git a/Makefile b/Makefile
@@ -29,6 +29,7 @@ MUSIC_OUT_DIR = $(OUTDIR)/music
 PCM_OUT_DIR = $(OUTDIR)/pcm
 ROOM_OUT_DIR = $(OUTDIR)/rooms
 SIM65_OUT_DIR = $(OUTDIR)/sim65
+TEXT_OUT_DIR = $(OUTDIR)/text
 TILE_OUT_DIR = $(OUTDIR)/tiles
 TSET_OUT_DIR = $(OUTDIR)/tilesets
 
@@ -67,6 +68,13 @@ ROOM_ROOM_FILES := \
   $(patsubst src/rooms/%.bg,$(ROOM_OUT_DIR)/%.room,$(ROOM_BG_FILES))
 ROOM_LIB_FILE = $(LIB_OUT_DIR)/rooms.lib
 
+# Text source files (found recursively under src/text, in sorted order), and
+# the assembly, object, and library files that are generated from them.
+TEXT_TXT_FILES := $(shell find src/text -name '*.txt' | sort)
+TEXT_ASM_FILES := \
+  $(patsubst src/text/%.txt,$(TEXT_OUT_DIR)/%.asm,$(TEXT_TXT_FILES))
+TEXT_OBJ_FILES := \
+  $(patsubst $(TEXT_OUT_DIR)/%.asm,$(TEXT_OUT_DIR)/%.o,$(TEXT_ASM_FILES))
+TEXT_LIB_FILE = $(LIB_OUT_DIR)/text.lib
+
 TILE_AHI_FILES := $(shell find src/tiles -name '*.ahi' | sort)
 TILE_CHR_FILES := \
   $(patsubst src/tiles/%.ahi,$(TILE_OUT_DIR)/%.chr,$(TILE_AHI_FILES))
@@ -214,6 +222,12 @@ $(MUSIC_OUT_DIR)/%.asm: src/music/%.sng $(SNG2ASM)
 	@$(SNG2ASM) < $< > $@
 .SECONDARY: $(MUSIC_ASM_FILES)
 
+# Generate an assembly file from each text source file by running text2asm.py.
+# The output is first written to a temporary file and only renamed into place
+# on success, so that a failed run of the script cannot leave behind a
+# truncated target that looks up to date on the next build.
+$(TEXT_OUT_DIR)/%.asm: src/text/%.txt build/text2asm.py
+	@echo "Generating $@"
+	@mkdir -p $(@D)
+	@python3 build/text2asm.py $< > $@.tmp || (rm -f $@.tmp; false)
+	@mv -f $@.tmp $@
+.SECONDARY: $(TEXT_ASM_FILES)
+
 $(TSET_OUT_DIR)/%.asm: src/tilesets/%.bg $(BG2TSET) $(TILE_AHI_FILES)
 	@echo "Generating $@"
 	@mkdir -p $(@D)
@@ -317,6 +331,10 @@ $(MUSIC_OUT_DIR)/%.o: $(MUSIC_OUT_DIR)/%.asm $(INC_FILES)
 	$(compile-asm)
 .SECONDARY: $(MUSIC_OBJ_FILES)
 
+# Build an object file from each generated text assembly file, using the same
+# compile-asm recipe as the other generated assembly files.  The object files
+# also depend on $(INC_FILES).
+$(TEXT_OUT_DIR)/%.o: $(TEXT_OUT_DIR)/%.asm $(INC_FILES)
+	$(compile-asm)
+.SECONDARY: $(TEXT_OBJ_FILES)
+
 $(TSET_OUT_DIR)/%.o: $(TSET_OUT_DIR)/%.asm $(INC_FILES)
 	$(compile-asm)
 .SECONDARY: $(TSET_OBJ_FILES)
@@ -333,6 +351,9 @@ $(MUSIC_LIB_FILE): $(MUSIC_OBJ_FILES)
 $(ROOM_LIB_FILE): $(ROOM_OBJ_FILES)
 	$(update-archive)
 
+# Build the text library from all the text object files, using the same
+# update-archive recipe as the other library files.
+$(TEXT_LIB_FILE): $(TEXT_OBJ_FILES)
+	$(update-archive)
+
 $(TSET_LIB_FILE): $(TSET_OBJ_FILES)
 	$(update-archive)
 
@@ -341,13 +362,14 @@ $(TSET_LIB_FILE): $(TSET_OBJ_FILES)
 
 $(ROM_BIN_FILE) $(ROM_LABEL_FILE): \
   tests/lint.py $(ROM_CFG_FILE) $(ROM_OBJ_FILES) \
-  $(MUSIC_LIB_FILE) $(ROOM_LIB_FILE) $(TSET_LIB_FILE)
+  $(MUSIC_LIB_FILE) $(ROOM_LIB_FILE) $(TEXT_LIB_FILE) $(TSET_LIB_FILE)
 	python3 tests/lint.py
 	@echo "Linking $@"
 	@mkdir -p $(@D)
 	@ld65 -Ln $(ROM_LABEL_FILE) -m $(ROM_MAP_FILE) -o $@ \
 	      -C $(ROM_CFG_FILE) $(ROM_OBJ_FILES) \
-	      $(MUSIC_LIB_FILE) $(ROOM_LIB_FILE) $(TSET_LIB_FILE)
+	      $(MUSIC_LIB_FILE) $(ROOM_LIB_FILE) $(TEXT_LIB_FILE) \
+	      $(TSET_LIB_FILE)
 $(ROM_LABEL_FILE): $(ROM_BIN_FILE)
 
 #=============================================================================#

diff --git a/build/text2asm.py b/build/text2asm.py
@@ -0,0 +1,212 @@
+#=============================================================================#
+# Copyright 2022 Matthew D. Steele <[email protected]>                    #
+#                                                                             #
+# This file is part of Annalog.                                               #
+#                                                                             #
+# Annalog is free software: you can redistribute it and/or modify it under    #
+# the terms of the GNU General Public License as published by the Free        #
+# Software Foundation, either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# Annalog is distributed in the hope that it will be useful, but WITHOUT ANY  #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more      #
+# details.                                                                    #
+#                                                                             #
+# You should have received a copy of the GNU General Public License along     #
+# with Annalog.  If not, see <http://www.gnu.org/licenses/>.                  #
+#=============================================================================#
+
+import os
+import sys
+
+#=============================================================================#
+
+# The maximum number of entries in the pair dictionary.  compress_text() emits
+# dictionary entry i as the single byte 0x80 + i, so the entries occupy bytes
+# 0x80 through 0xfc.  (Presumably bytes 0xfd and above are reserved for other
+# purposes -- TODO confirm against src/dialog.inc.)
+MAX_PAIRS = 0xfd - 0x80
+
+HEADER = """\
+;;; This file was generated by text2asm.
+
+.INCLUDE "../../src/charmap.inc"
+.INCLUDE "../../src/dialog.inc"
+
+;;;=========================================================================;;;
+
+"""
+
+FOOTER = """\
+
+;;;=========================================================================;;;
+"""
+
+#=============================================================================#
+
+def parse_text(data):
+    """Parses the raw source string for one text into a list of tokens.
+
+    The input may contain three kinds of items:
+      * `{Name}`: a named constant, yielding a `('c', 'Name')` token that
+        counts as one byte of original size.
+      * `[XXYY]`: a forced pair of two hex bytes, yielding a
+        `('p', (0xXX, 0xYY))` token that counts as two bytes.
+      * Any other character: accumulated into a run of literal characters,
+        yielding a `('b', string)` token that counts one byte per character.
+
+    Returns a `(tokens, original_size)` tuple.
+
+    Raises ValueError if a `{` or `[` is never closed (previously, this case
+    would loop forever) or if a forced pair does not contain valid hex digits.
+    """
+    text = []
+    current_chars = []
+    original_size = 0
+    def finish_string():
+        # Flush any pending literal characters as a single 'b' token.
+        if not current_chars: return
+        string = ''.join(current_chars)
+        current_chars.clear()
+        text.append(('b', string))
+    # Scan with an index rather than repeatedly slicing off the front of the
+    # string, which would copy the remaining data once per character.
+    pos = 0
+    while pos < len(data):
+        char = data[pos]
+        if char == '{':
+            end = data.find('}', pos)
+            if end < 0:
+                raise ValueError('unterminated "{": ' + repr(data[pos:]))
+            finish_string()
+            text.append(('c', data[pos + 1:end]))
+            original_size += 1
+            pos = end + 1
+        elif char == '[':
+            end = data.find(']', pos)
+            if end < 0:
+                raise ValueError('unterminated "[": ' + repr(data[pos:]))
+            finish_string()
+            pair = data[pos + 1:end]
+            assert len(pair) == 4
+            text.append(('p', (int(pair[:2], 16), int(pair[2:], 16))))
+            original_size += 2
+            pos = end + 1
+        else:
+            current_chars.append(char)
+            original_size += 1
+            pos += 1
+    finish_string()
+    return (text, original_size)
+
+def parse_input_file(filepath):
+    """Reads a text source file and parses every text defined in it.
+
+    Outside of a text, blank lines and lines starting with `#` are skipped,
+    and a line of the form `@Name` begins a new text called `Name`; any other
+    line raises ValueError.  Inside a text, the lines are concatenated, with a
+    `$` inserted between consecutive lines, until a line ending in `#` or `%`
+    finishes the text.
+
+    Returns a `(texts, original_size)` tuple, where `texts` maps each text
+    name to its token list from parse_text(), and `original_size` is the sum
+    of the sizes reported by parse_text() for all the texts.
+    """
+    texts = {}
+    current_text_name = None
+    current_text_data = ''
+    original_size = 0
+    # Use a context manager so that the file is closed promptly, even if
+    # parsing raises an exception partway through.
+    with open(filepath) as input_file:
+        for line in input_file:
+            line = line.rstrip('\n')
+            if current_text_name is not None:
+                current_text_data += line
+                if line.endswith('#') or line.endswith('%'):
+                    (text, size) = parse_text(current_text_data)
+                    texts[current_text_name] = text
+                    original_size += size
+                    current_text_name = None
+                    current_text_data = ''
+                else:
+                    # The text continues on the next line; join the two lines
+                    # with a `$` character.
+                    current_text_data += '$'
+            else:
+                if not line: continue
+                elif line.startswith('#'): continue
+                elif line.startswith('@'):
+                    current_text_name = line[1:]
+                else:
+                    raise ValueError('bad line: ' + repr(line))
+    # The file must not end in the middle of a text.
+    assert current_text_name is None
+    return (texts, original_size)
+
+def compute_pairs(texts):
+    """Chooses the dictionary of two-byte pairs used to compress the texts.
+
+    `texts` maps text names to token lists as produced by parse_text().  Every
+    forced pair (a `'p'` token) is always included.  The remaining dictionary
+    slots, up to MAX_PAIRS entries in total, are filled with the most frequent
+    adjacent character pairs found within the literal (`'b'`) strings.
+
+    Returns a list of pairs: first the forced pairs (tuples of two ints) in
+    sorted order, then the chosen character pairs (two-character strings) in
+    decreasing order of frequency.
+
+    Raises ValueError if there are more forced pairs than MAX_PAIRS, since
+    they could not all be given a dictionary entry.
+    """
+    forced_pairs = set()
+    pair_counts = {}
+    for text in texts.values():
+        for kind, value in text:
+            if kind == 'p':
+                forced_pairs.add(value)
+            elif kind == 'b':
+                for i in range(len(value) - 1):
+                    pair = value[i:i + 2]
+                    pair_counts[pair] = pair_counts.get(pair, 0) + 1
+    # Without this check, the slice bound below would go negative and silently
+    # return a dictionary with more than MAX_PAIRS entries.
+    if len(forced_pairs) > MAX_PAIRS:
+        raise ValueError(
+            f'too many forced pairs: {len(forced_pairs)} > {MAX_PAIRS}')
+    sorted_counts = sorted(pair_counts.items(), key=lambda item: -item[1])
+    # A dictionary entry costs two bytes and each use of it saves one byte, so
+    # a pair is only worth including if it occurs more than twice.
+    best_pairs = [pair for pair, count in sorted_counts if count > 2]
+    return sorted(forced_pairs) + best_pairs[:MAX_PAIRS - len(forced_pairs)]
+
+def compress_text(text, dictionary):
+    """Compresses one parsed text and formats it as assembly source.
+
+    `text` is a token list as produced by parse_text(), and `dictionary` maps
+    each pair (a two-character string, or a tuple of two ints for a forced
+    pair) to its index in the pair list.  Literal strings are scanned greedily
+    from left to right; each pair of characters found in the dictionary is
+    replaced by the single byte `0x80 + index`, and all other characters are
+    emitted as-is.
+
+    Returns a `(asm, compressed_size)` tuple, where `asm` is a string of zero
+    or more `.byte` lines and `compressed_size` is the number of bytes that
+    those lines encode.
+
+    Raises KeyError if the text contains a forced pair that is not in
+    `dictionary`.
+    """
+    compressed_size = 0
+    result_lines = []
+    current_line = []
+    current_chars = []
+    def finish_string():
+        # Flush any pending literal characters as one quoted string operand.
+        if not current_chars: return
+        string = ''.join(current_chars)
+        current_chars.clear()
+        current_line.append(f'"{string}"')
+    def finish_line():
+        # Flush all pending operands as one `.byte` line.
+        finish_string()
+        if not current_line: return
+        line = ', '.join(current_line)
+        current_line.clear()
+        result_lines.append(f'    .byte {line}\n')
+    for kind, value in text:
+        if kind == 'b':
+            while len(value) >= 2:
+                pair = value[:2]
+                i = dictionary.get(pair)
+                if i is not None:
+                    # Replace the two characters with a one-byte dictionary
+                    # reference.
+                    value = value[2:]
+                    finish_string()
+                    current_line.append(f'${i + 0x80:02x}')
+                    compressed_size += 1
+                    # Start a new `.byte` line after each `$` character, even
+                    # if the `$` was absorbed into a dictionary pair.
+                    if '$' in pair: finish_line()
+                else:
+                    char = value[0]
+                    value = value[1:]
+                    current_chars.append(char)
+                    compressed_size += 1
+                    if char == '$': finish_line()
+            # At most one character is left over, which cannot form a pair.
+            if value:
+                current_chars.append(value)
+                compressed_size += len(value)
+                if value.endswith('$'): finish_line()
+        elif kind == 'c':
+            # A named constant is emitted verbatim as a one-byte operand.
+            finish_string()
+            current_line.append(value)
+            compressed_size += 1
+        elif kind == 'p':
+            # A forced pair is always emitted as its dictionary reference.
+            i = dictionary[value]
+            finish_string()
+            current_line.append(f'${i + 0x80:02x}')
+            compressed_size += 1
+        else: assert False
+    finish_line()
+    return (''.join(result_lines), compressed_size)
+
+def write_output_file(bank, pairs, texts, original_data_size):
+    """Writes the generated assembly file for one text bank to stdout.
+
+    The output consists of the file header, a `.SEGMENT` directive for
+    `PRGA_<bank>`, an exported table containing the two bytes of each entry of
+    `pairs` (as returned by compute_pairs()), one exported data block for each
+    entry of `texts` (in sorted order by name) containing its compressed
+    bytes, a comment summarizing the space saved, and the file footer.
+
+    `original_data_size` is the total uncompressed size of the texts, as
+    returned by parse_input_file().  The reported compressed size includes the
+    two bytes used by each entry in the pair table.
+    """
+    compressed_data_size = 0
+    sys.stdout.write(HEADER)
+    sys.stdout.write(f'.SEGMENT "PRGA_{bank}"\n\n')
+    sys.stdout.write(f'.EXPORT DataA_{bank}_Strings_u8_arr2_arr\n')
+    sys.stdout.write(f'.PROC DataA_{bank}_Strings_u8_arr2_arr\n')
+    for pair in pairs:
+        if isinstance(pair, str):
+            # A character pair chosen by frequency.
+            sys.stdout.write(f'    .byte "{pair}"\n')
+        else:
+            # A forced pair, given as a tuple of two byte values.
+            sys.stdout.write(f'    .byte ${pair[0]:02x}, ${pair[1]:02x}\n')
+        compressed_data_size += 2
+    sys.stdout.write('.ENDPROC\n')
+    dictionary = {pair: i for i, pair in enumerate(pairs)}
+    for name, text in sorted(texts.items()):
+        sys.stdout.write(f'\n.EXPORT DataA_{bank}_{name}_u8_arr\n')
+        sys.stdout.write(f'.PROC DataA_{bank}_{name}_u8_arr\n')
+        (compressed_asm, compressed_size) = compress_text(text, dictionary)
+        sys.stdout.write(compressed_asm)
+        sys.stdout.write('.ENDPROC\n')
+        compressed_data_size += compressed_size
+    sys.stdout.write(f'\n;;;   Original size = {original_data_size:4x}\n')
+    sys.stdout.write(f';;; Compressed size = {compressed_data_size:4x}\n')
+    saved = original_data_size - compressed_data_size
+    sys.stdout.write(f';;;     Bytes saved = {saved:4x}\n')
+    # An input file that defines no texts has an original size of zero; report
+    # 0% saved in that case rather than dividing by zero.
+    if original_data_size > 0:
+        ratio = compressed_data_size / original_data_size
+        percent = int(round(100 * (1 - ratio)))
+    else:
+        percent = 0
+    sys.stdout.write(f';;;   Percent saved = {percent:3d}%\n')
+    sys.stdout.write(FOOTER)
+
+def run(filepath):
+    """Converts the text source file at `filepath` to assembly on stdout.
+
+    The bank name used in the generated segment and symbol names is the
+    capitalized base name of the file, without its extension.
+    """
+    stem, _extension = os.path.splitext(os.path.basename(filepath))
+    bank_name = stem.capitalize()
+    parsed_texts, total_size = parse_input_file(filepath)
+    chosen_pairs = compute_pairs(parsed_texts)
+    write_output_file(bank_name, chosen_pairs, parsed_texts, total_size)
+
+#=============================================================================#
+
+# When executed as a script, convert the text source file named by the first
+# command-line argument.
+if __name__ == '__main__':
+    run(sys.argv[1])
+
+#=============================================================================#
diff --git a/src/dialog.asm b/src/dialog.asm
@@ -159,6 +159,7 @@
 .IMPORT FuncA_Dialog_PlaySfxDialogText
 .IMPORT FuncA_Dialog_PlaySfxQuestMarker
 .IMPORT FuncA_Objects_DrawObjectsForRoom
+.IMPORT FuncM_CopyDialogText
 .IMPORT FuncM_DrawObjectsForRoomAndProcessFrame
 .IMPORT FuncM_ScrollTowardsAvatar
 .IMPORT FuncM_ScrollTowardsGoal
@@ -422,24 +423,6 @@ _Finish:
     jmp_prga MainA_Pause_Papers
 .ENDPROC
 
-;;; Given the bank/pointer returned by FuncA_Dialog_GetNextDialogTextPointer,
-;;; switches the PRGA bank and copies the dialog text into
-;;; Ram_DialogText_u8_arr.
-;;; @param T2 The PRGA bank that contains the dialog text.
-;;; @param T1T0 A pointer to the start of the dialog text.
-.EXPORT FuncM_CopyDialogText
-.PROC FuncM_CopyDialogText
-    main_prga T2
-    ldy #$ff
-    @loop:
-    iny
-    lda (T1T0), y
-    sta Ram_DialogText_u8_arr, y
-    cmp #kDialogTextNewline + 1
-    blt @loop
-    rts
-.ENDPROC
-
 ;;;=========================================================================;;;
 
 .SEGMENT "PRGA_Dialog"

diff --git a/src/linker.cfg b/src/linker.cfg
@@ -250,21 +250,20 @@ SEGMENTS {
   PRGA_Text0:        load=PRGA_01, type=ro;
   PRGA_Text1:        load=PRGA_03, type=ro;
   PRGA_Text2:        load=PRGA_05, type=ro;
-  PRGA_Text3:        load=PRGA_07, type=ro;
-  PRGA_Actor:        load=PRGA_09, type=ro;
-  PRGA_Avatar:       load=PRGA_0B, type=ro;
-  PRGA_Console:      load=PRGA_0D, type=ro;
-  PRGA_Cutscene:     load=PRGA_0F, type=ro;
-  PRGA_Death:        load=PRGA_11, type=ro;
-  PRGA_Dialog:       load=PRGA_13, type=ro;
-  PRGA_Machine:      load=PRGA_15, type=ro;
-  PRGA_Objects:      load=PRGA_17, type=ro;
-  PRGA_Pause:        load=PRGA_19, type=ro;
-  PRGA_Room:         load=PRGA_1B, type=ro;
-  PRGA_Terrain:      load=PRGA_1D, type=ro;
-  PRGA_Pcm0:         load=PRGA_1F, type=ro;
-  PRGA_Pcm1:         load=PRGA_21, type=ro;
-  PRGA_Pcm2:         load=PRGA_23, type=ro;
+  PRGA_Actor:        load=PRGA_07, type=ro;
+  PRGA_Avatar:       load=PRGA_09, type=ro;
+  PRGA_Console:      load=PRGA_0B, type=ro;
+  PRGA_Cutscene:     load=PRGA_0D, type=ro;
+  PRGA_Death:        load=PRGA_0F, type=ro;
+  PRGA_Dialog:       load=PRGA_11, type=ro;
+  PRGA_Machine:      load=PRGA_13, type=ro;
+  PRGA_Objects:      load=PRGA_15, type=ro;
+  PRGA_Pause:        load=PRGA_17, type=ro;
+  PRGA_Room:         load=PRGA_19, type=ro;
+  PRGA_Terrain:      load=PRGA_1B, type=ro;
+  PRGA_Pcm0:         load=PRGA_1D, type=ro;
+  PRGA_Pcm1:         load=PRGA_1F, type=ro;
+  PRGA_Pcm2:         load=PRGA_21, type=ro;
   # Fixed-bank PRG segments:
   PRG8:              load=PRG8,    type=ro;
   PRGE_Pcm:          load=PRGE,    type=ro, align=$100;