Skip to content

Commit

Permalink
don't raise on hf datasets with no validation set
Browse files Browse the repository at this point in the history
  • Loading branch information
dlwh committed Feb 13, 2024
1 parent fabce9a commit c6da233
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion src/levanter/data/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,15 @@ class LMDatasetSourceConfig:

def get_shard_source(self, split) -> Optional[ShardedDataset[str]]:
if self.id is not None:
ds = WrappedHFDataset(self.id, split=split, name=self.name, streaming=self.stream)
try:
ds = WrappedHFDataset(self.id, split=split, name=self.name, streaming=self.stream)
except ValueError as e:
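# A ValueError whose message starts with "Bad split" is treated as "this dataset has no such split" (e.g. no validation set): warn and return None instead of raising.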
if str(e).startswith("Bad split"):
logger.warning(f"Splits {split} not found for {self.id} {self.name}")
return None
else:
raise

if len(ds.shard_names) == 0:
return None
Expand Down

0 comments on commit c6da233

Please sign in to comment.