EleutherAI · Lewington-pitsos · Jul 18, 2024 · Jul 18, 2024
diff --git a/sae/data.py b/sae/data.py
@@ -20,6 +20,7 @@ def chunk_and_tokenize(
     max_seq_len: int = 2048,
     return_final_batch: bool = False,
     load_from_cache_file: bool = True,
+    batch_size: int = 2048,
 ) -> T:
     """Perform GPT-style chunking and tokenization on a dataset.
 
@@ -43,7 +44,7 @@ def chunk_and_tokenize(
         The chunked and tokenized dataset.
     """
 
-    def _tokenize_fn(x: dict[str, list]):
+    def _tokenize_fn(x: dict[str, list], leftovers: list=[]):
         chunk_size = min(tokenizer.model_max_length, max_seq_len)
         sep = tokenizer.eos_token or "<|endoftext|>"
         joined_text = sep.join([""] + x[text_key])
@@ -67,10 +68,29 @@ def _tokenize_fn(x: dict[str, list]):
             ]
             output = {"input_ids": chunks}
 
-        if not return_final_batch:
-            # We know that the last sample will almost always be less than the max
-            # number of tokens, and we don't want to pad, so we just drop it.
-            output = {k: v[:-1] for k, v in output.items()}
+        if (not return_final_batch) and len(output["input_ids"][-1]) != chunk_size:
+            # we do not pad so if the last batch is smaller than the required
+            # batch size we either lengthen it using leftover batches or put
+            # it in the basket of leftovers
+            final_chunk = output["input_ids"].pop()
+
+            while len(final_chunk) < chunk_size:
+                if len(leftovers) == 0:
+                    leftovers.append(final_chunk)
+                    break
+
+                leftover = leftovers.pop()
+                final_chunk.extend([tokenizer.eos_token_id] + leftover)
+            else:
+                new_leftover = final_chunk[chunk_size:]
+                final_chunk = final_chunk[:chunk_size]
+                output["input_ids"].append(final_chunk)
+
+                if len(new_leftover) > 0:
+                    leftovers.append(new_leftover)
+
+            output = {k: v[:len(output['input_ids'])] for k, v in output.items()}
+
 
         output_batch_size = len(output["input_ids"])
 
@@ -89,10 +109,11 @@ def _tokenize_fn(x: dict[str, list]):
         # since we always throw away the last element of the batch we
         # want to keep the batch size as large as possible
         batched=True,
-        batch_size=2048,
+        batch_size=batch_size,
         num_proc=num_proc,
         remove_columns=get_columns_all_equal(data),
         load_from_cache_file=load_from_cache_file,
+        fn_kwargs={} if return_final_batch else {"leftovers": []}
     )
     return data.with_format(format, columns=["input_ids"])
 

diff --git a/tests/test_chunk_and_tokenize.py b/tests/test_chunk_and_tokenize.py
@@ -0,0 +1,36 @@
+import pytest
+from transformers import GPT2TokenizerFast
+from datasets import Dataset
+from sae.data import chunk_and_tokenize
+
+@pytest.fixture
+def setup_data():
+    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+    data = Dataset.from_dict({
+        "text": ["This is a very short sentence.",] * 2000 + \
+        ["This is a sentence which is just a little bit longer, you better have a way of dealing with it homeslice."] * 3
+
+    })
+    return tokenizer, data
+
+def test_chunk_and_tokenize(setup_data):
+    tokenizer, data = setup_data
+
+    # Perform chunking and tokenization
+    max_seq_len = 10  # Setting a small max_seq_len for testing overflow
+    tokenized_data = chunk_and_tokenize(
+        data,
+        tokenizer,
+        max_seq_len=max_seq_len,
+        num_proc=2,
+        batch_size=32,
+    )
+
+    # Verify the output
+    input_ids = tokenized_data["input_ids"]
+    input_id_lengths = [len(ids) for ids in input_ids] 
+
+    assert all([l == max_seq_len for l in input_id_lengths]), f"All input_ids should have max_seq_len, got {input_id_lengths}"
+    assert len(input_ids[-1]) <= max_seq_len, "Last input_ids should be <= max_seq_len"
+    assert len(input_ids) >= 1610, f"Expected at least 1610 input_ids, got {len(input_ids)}"
+