From c6da2330515c650466e719750c4f0e33815f4142 Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 13 Feb 2024 13:43:10 -0800 Subject: [PATCH] don't raise on hf datasets with no validation set --- src/levanter/data/text.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 00e17eb58..d347e9793 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -551,7 +551,15 @@ class LMDatasetSourceConfig: def get_shard_source(self, split) -> Optional[ShardedDataset[str]]: if self.id is not None: - ds = WrappedHFDataset(self.id, split=split, name=self.name, streaming=self.stream) + try: + ds = WrappedHFDataset(self.id, split=split, name=self.name, streaming=self.stream) + except ValueError as e: + # if the message starts with Bad split, then just return None + if str(e).startswith("Bad split"): + logger.warning(f"Splits {split} not found for {self.id} {self.name}") + return None + else: + raise if len(ds.shard_names) == 0: return None