karpathy · fraserlove · Jul 3, 2024 · Jul 4, 2024
diff --git a/train_gpt2.py b/train_gpt2.py
@@ -217,7 +217,9 @@ def __init__(self, B, T, process_rank, num_processes, split):
         self.T = T
         self.process_rank = process_rank
         self.num_processes = num_processes
+        self.split = split
         assert split in {'train', 'val'}
+        self.rng = np.random.default_rng(1337)
 
         # get the shard filenames
         data_root = "edu_fineweb10B"
@@ -231,10 +233,22 @@ def __init__(self, B, T, process_rank, num_processes, split):
             print(f"found {len(shards)} shards for split {split}")
         self.reset()
 
+    def load_shard(self, filename):
+        """Load a shard; on the train split shuffle its <|endoftext|>-terminated documents, keeping any unterminated tail."""
+        shard = load_tokens(filename)
+        if self.split == "train":
+            bounds = [0] + (torch.where(shard == enc.eot_token)[0] + 1).tolist() + [len(shard)]  # cuts just past each EOT, plus shard end
+            documents = [shard[a:b] for a, b in zip(bounds, bounds[1:]) if a < b]  # a == b when the shard ends on EOT (or is empty)
+            self.rng.shuffle(documents)
+            shard = torch.cat(documents) if documents else shard  # torch.cat rejects an empty list
+        return shard
+
     def reset(self):
         # state, init at shard zero
         self.current_shard = 0
-        self.tokens = load_tokens(self.shards[self.current_shard])
+        if self.split == "train":  # only the train split gets a new shard order; other splits keep theirs
+            self.rng.shuffle(self.shards)  # in place; self.rng is created from a fixed seed in __init__
+        self.tokens = self.load_shard(self.shards[self.current_shard])  # load_shard also shuffles documents within a train shard
         self.current_position = self.B * self.T * self.process_rank
 
     def next_batch(self):
@@ -246,9 +260,13 @@ def next_batch(self):
         self.current_position += B * T * self.num_processes
         # if loading the next batch would be out of bounds, advance to next shard
         if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens):
-            self.current_shard = (self.current_shard + 1) % len(self.shards)
-            self.tokens = load_tokens(self.shards[self.current_shard])
-            self.current_position = B * T * self.process_rank
+            self.current_shard += 1
+            # all shards consumed: reset() restarts at shard zero and, for the train split, reshuffles the shard order
+            if self.current_shard == len(self.shards):
+                self.reset()
+            else:
+                self.tokens = self.load_shard(self.shards[self.current_shard])
+                self.current_position = B * T * self.process_rank
         return x, y
 
 # -----------------------------------------------------------------------------
@@ -518,4 +536,4 @@ def get_lr(it):
             f.write(f"{step} train {loss_accum.item():.6f}\n")
 
 if ddp:
-    destroy_process_group()
+    destroy_process_group()