From e2c02e4e8b65db791df2a4396b3e8773fe7b4243 Mon Sep 17 00:00:00 2001 From: Ivan Zhou Date: Sun, 28 Jan 2024 22:20:00 +0000 Subject: [PATCH] rpv1 mixture --- config/data/rpv1_llama.yaml | 458 +++++++++++++++++++----------------- src/levanter/data/text.py | 5 +- 2 files changed, 245 insertions(+), 218 deletions(-) diff --git a/config/data/rpv1_llama.yaml b/config/data/rpv1_llama.yaml index c77fd46cf..545e6759e 100644 --- a/config/data/rpv1_llama.yaml +++ b/config/data/rpv1_llama.yaml @@ -1,218 +1,244 @@ -cache_dir: gs://levanter-data/tokenized/redpajama_v1_llama/ +cache_dir: gs://levanter-data/tokenized/redpajama_v1_llama_mixture rows_per_chunk: 4096 tokenizer: "meta-llama/Llama-2-7b-hf" -train_urls: - - gs://levanter-data/dev/redpajama/2019-30/en_head_{0000..0055}.json.gz.dedup.classifier.jsonl.zst - - gs://levanter-data/dev/redpajama/2019-30/en_middle_{0060..0096}.json.gz.dedup.classifier.jsonl.zst - - gs://levanter-data/dev/redpajama/2020-05/en_head_{0000..0068}.json.gz.dedup.classifier.jsonl.zst - - gs://levanter-data/dev/redpajama/2020-05/en_middle_{0000..0128}.json.gz.dedup.classifier.jsonl.zst - - gs://levanter-data/dev/redpajama/2021-04/en_head_{0000..0072}.json.gz.dedup.classifier.jsonl.zst - - gs://levanter-data/dev/redpajama/2021-04/en_middle_{0000..0102}.json.gz.dedup.classifier.jsonl.zst - - gs://levanter-data/dev/redpajama/2022-05/en_head_{0000..0059}.json.gz.dedup.classifier.jsonl.zst - - gs://levanter-data/dev/redpajama/2022-05/en_middle_{0000..0096}.json.gz.dedup.classifier.jsonl.zst - - gs://levanter-data/dev/redpajama/2023-06/en_head_{0000..0069}.json.gz.dedup.classifier.jsonl.zst - - gs://levanter-data/dev/redpajama/2023-06/en_middle_{0000..0104}.json.gz.dedup.classifier.jsonl.zst - - gs://levanter-data/dev/redpajama/arxiv/arxiv_023827cd-7ee8-42e6-aa7b-661731f4c70f.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_024de5df-1b7f-447c-8c3a-51407d8d6732.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_03232e26-be3f-4a28-a5d2-ee1d8c0e9831.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_034e819a-cfcb-43c6-ad25-0232ad48823c.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_077ae8de-a68e-47e7-95a6-6d82f8f4eeb9.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_0af50072-df4c-4084-a833-cebbd046e70e.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_0de84cfc-c080-471f-b139-1bf061db4feb.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_0fbdd8ad-32d8-4228-9a40-e09dde689760.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_11c659c1-ffbf-4455-abfd-058f6bbf4bb2.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_1958455d-6543-4307-a081-d86ce0637f9a.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_1982fb29-c4ed-4dd3-855c-666e63bc62d9.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_1caed86f-5625-4941-bdc1-cc57e4fec1cd.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_1d3a0cd6-f0e6-4106-a080-524a4bd50016.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_29d54f5a-1dd0-4e9a-b783-fb2eec9db072.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_29fd3d99-53fb-43e2-a4a5-2fd01bf77258.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_2b224cd9-286e-46ac-8c4e-c1e3befc8760.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_2c131fca-2a05-4d5f-a805-59d2af3477e2.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_2f28f1a7-6972-48ad-8997-65a5d52e4f1c.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_30440198-cd90-48c6-82c1-ea871b8c21c5.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_39367d6c-d7d4-45fc-a929-8a17184d1744.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_393d19f2-1cd1-421f-be8a-78d955fdf602.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_3a5d4f93-97ec-483a-88ef-324df9651b3f.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_3c89ea11-69ff-4049-b775-f0c785997909.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_3d5a011a-4bbe-4585-a2bd-ff3e943c8671.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_3f805f4b-6f7f-42a8-a006-47c1e0401bd7.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_3f9eb7ad-f266-4154-8d4d-54deeffde075.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_400748d3-0076-4a04-8a1c-6055ba0b5a2d.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_44e19375-3995-4dff-a3b6-8a25247a165c.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_4a8cf52f-81d0-4875-9528-466b1cbc71e1.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_4cc7015c-c39a-4bf6-9686-c00b3343edd9.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_50757a42-079b-41ec-bcca-73759faffd62.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_575ae832-e770-4a89-bfa7-c56f16dbca69.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_580be642-bb73-4d0d-8b5e-f494722934cd.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_5a02d9ee-12a0-437d-808f-d26f0eb2012b.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_5d8d402b-8277-480a-b5fa-71169726864f.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_5ee33ef7-455e-4fd5-9512-c4771dd802c1.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_610c82ed-b9ee-449c-83b0-601205f3a74a.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_629fe3ca-075f-4663-9b81-b807f3b42bf2.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_64e5075e-e87e-4b2a-9e38-e5c102f6f2b1.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_65dd2ff6-dae3-4a60-90d3-c3d7349fc92f.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_6719ecd2-fe34-4078-a584-320d921cbf6f.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_6938ee72-43ee-4ade-8840-151a402383b0.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_73241940-66c1-481c-b53a-f5e8b9afe9fa.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_751370b5-c7cb-44d8-a039-1468ee6747ab.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_75af5d17-5ebb-4460-9f2a-dc9fe880a936.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_79d50803-f7d9-4aa8-bf1a-d807980a40c6.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_7b26046f-7c8d-405b-911b-df51e1a069fa.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_7d1d69dc-bc8e-4817-9cab-afdc002ab7c4.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_7ea7a996-b1bb-4773-a36a-461dce2de861.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_8232f276-9e3f-463a-9350-362de1b501d1.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_8509f5a7-64a8-4813-92dc-f6eb53e3aacc.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_85b4c166-469d-449c-ab3d-5214c1d80246.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_872b620a-b4fd-45d3-92bc-ff0584447705.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_88f24f8d-16d3-4a21-894d-192033d0fa67.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_8e6bd730-0f10-49d9-9b02-5ce16da47483.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_8ede1b71-6846-439a-acba-86a57cfec3d2.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_8f74f6ba-1c53-42d5-a3c7-e4ef46a71133.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_90fa9c2b-25b0-47b7-af2b-a683356e543b.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_92ec488a-287d-4bf0-977b-6998cf0cf476.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_94a393a1-3b23-4961-a0a6-70bad5b4979c.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_94b9df70-a95f-4545-be3a-5a34f7b09fb3.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_95ffc9e1-c505-4a3b-8fb0-cbc98b8703e1.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_98e718fd-5b0e-439f-a00c-57b61e06b395.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_9f50a028-2586-4e0d-bcfd-d9d2d74e8953.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_9f8d7d10-dda7-4e44-b00c-811635a199c8.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_a055bd62-1ec2-47cf-bad2-321e3d4f053f.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_a1e3430e-ef5c-4a86-914d-88e8fb7818c0.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_a2b4cb3d-bea3-478e-82a2-77c00a827250.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_a647274d-799d-4e7a-a485-b8632a87061e.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_a94ea420-99ae-4d58-9cdc-d4666e3322a7.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_ab7c0034-7fc1-4fa8-bae3-e97b85fc16a4.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_b0732cff-657e-4e69-87b8-66e8025bf441.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_b1f3b912-a2ab-43bd-8811-43d84b422506.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_b6c2e4d3-d215-4d99-891f-d20b997d4d5a.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_bbfabe6b-b9bc-476b-b8f0-7d6c47e9d2be.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_bf11ef29-a3f9-4a2d-9dbf-ebcc56d39fdb.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_c05da42b-4939-4f55-867c-16cf6d228e60.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_c1fc3dd5-861f-4b8d-b7a2-eb8f6887b33b.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_c44164b4-0770-48d0-87db-590ca529032a.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_c6b43cda-ca5c-4855-9c08-0f8264cab1af.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_c6e57a16-4879-4dcf-b591-503cfb46a360.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_ca6e3842-6ca4-4230-84fa-376a3374c380.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_caf769e4-7308-4419-9114-900ca213682a.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_d0da78e9-3dcf-4a46-85c1-f23ed00178bc.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_d386838c-a51e-4839-9f27-8495b2466e49.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_d41edf9f-0ebb-4866-b3fe-50785746b36b.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_d6a7fc44-b584-4dd8-9de2-e981afe0bb4a.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_d7c49dbb-c008-47fc-9cbe-8d5695842d21.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_db3b4be7-4e98-4fe9-96bf-05a5788815e3.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_db981f69-9eca-4031-8565-318b949efbfe.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_dbd4105f-7cbb-4483-a7b2-96b17b7fb594.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_de42e348-b333-4d35-b883-9bfc94f29822.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_de744938-fa6c-45dd-b600-428dd7c63a73.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_e8f25867-697d-4f52-84e1-e50a95bc182b.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_eb4e26d4-6625-4f8a-b5fe-6f3a9b8a4b79.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_f141b736-5ce4-4f18-bb29-704227ca4bd1.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_f50efdc6-f88e-4fa6-9ef6-dd1d8314bb36.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_f7680c03-70df-4781-a98d-c88695f92f04.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_fbc62949-624d-4943-9731-f5c46242ba55.jsonl - - gs://levanter-data/dev/redpajama/arxiv/arxiv_fd572627-cce7-4667-a684-fef096dfbeb7.jsonl - - gs://levanter-data/dev/redpajama/book/book.jsonl - - gs://levanter-data/dev/redpajama/c4/c4-train.{00000..01023}-of-01024.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_08cdfa755e6d4d89b673d5bd1acee5f6.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_0f27d10d846a473b96070c3394832f32.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_0f979046c8e64e0fb5843d2634a9957d.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_10f129bfd0af45caa9cd72aa9d863ec5.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_11a1943edfa349c7939382799599eed6.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_17197bd2478044bebd9ff4634b6dfcee.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_1d750c0ce39d40c6bc20bad9469e5a99.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_21078cf63afb4d9eb4a7876f726a7226.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_216883d3a669406699428bc485a4c228.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_24278a707deb445b8e4f59c83dd67910.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_25989b8233a04ac791b0eccd502e0c7a.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_26a6fa61c5eb4bb885e7bc643e285f0e.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_275100c1a44f4451b0343373ebc5637a.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_27f05c041a1c401783f90b9415e40e4b.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_28106e66bfd94978abbc15ec845aeddb.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_2afd093fefad4c8da76cc539e8fb6137.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_30d01c4edab64866bda8c609e90b4f4e.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_34b0785b77814b7583433ddb27a61ae0.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_34e78793c1e94eeebd92852399097596.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_366012588bef4d749bbbea76ae701141.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_36c2a955ddd84672bbc778aa4ad2fbaf.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_3be6a5cc7428401393c23e5516a43537.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_3cc6cab9266746a6befa23648aa43119.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_3f09f384f0734a4b912bb75db3f812bc.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_478afe8aaccb43e6be2de7e34e041ef3.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_483f11d6bc864f7fbfbe63bdf3583ce2.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_4b6883dc304c4e799620ec95b96dc91a.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_4bebabdbd8544da7a2071864ccf81f2e.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_4d44caf43c154ae4aaeab36eab0221c9.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_4f98dc136ba94eeaa1ba6c974814b33c.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_4ff3604761614a3db550ef758e6457b5.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_50423e84046948b4a2a70e7e4538e12d.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_58fad5994b4446c6bceb33453484acb4.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_5abf005e4e634f1dbfa8bd20b5687092.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_5f2b3517159b426bb1a9e81ca189abcd.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_610e908cafaa4d53958de50ad700822a.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_6353ab14cb8f4623a7d75678d9e7f44e.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_64747f25102740bab0ab54559569342a.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_6742e99802894114a3ba44841f49b168.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_677e10f9b0af4c489e60670352e7e224.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_68534e6a093744fa9f38fa1a9cf51232.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_6adcf92fb2ee48059cb60579f2e931f7.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_70f0aa43987643a7874286bca4faa66b.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_7462f1a34f594e9f8334d9a0cbbf80e7.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_78a13a25258f4d24923702c07445e20e.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_78e3afc1cfee41fbb7eaae2e5bfaa17b.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_791ed086a57f4879bb1596bed6d37bb3.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_7b5cc18857a34a0981b54f082de55cf8.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_7c54b908a2df4ec2ba316d2081fc674e.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_7edd0af0b61c426e93e8bd3f549a8f78.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_7fa232e63a5d44a88a267181e9ac47b4.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_81138c01f3a84f7fa8b56bf3d8fa35ce.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_82a9a647b0eb4eb080d7ac15a13c765b.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_835ac26c7a70447b97f4ec38bcb969ed.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_8bc3c4fae78c41d999b2ae6d97cce96c.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_8dc430f8f7114f018440c7b3d990e602.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_8e1db2cf6c98420a88bad52fd57f4aa7.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_919156910e704e6ca52e0d4880cdbb63.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_944beac1347f491faa88f43c25d26fe4.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_96175b6e4c764abfbbf14e78d4fd6464.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_977d1fcdab92452d9dc38b2f4a99389b.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_9eea501af8544d0b88f0b002850829d4.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_a26c51ffe2924fd8ad6694c6aa0eacc5.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_a3a01a2f81ed4cd2afb30e175200b48f.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_a3d3e6f3f7d5495ca9ecf94d808fd350.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_a777da5620f1467f8df3616b17d533dc.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_a8fcae75b0c3410faabcff02f0056a36.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_aabc2d54d1c946908d3400228e0f238c.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_ad160286662849f49d1e6de27c0f1d15.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_adda41d791974289aff042e2e3d07ec3.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_ae1813abc63f4b1998dfa608e7fe5588.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_af7b386db97e4211a7378be08d7b3f4f.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_b15c575fa9f8465d98f70ba2f2f73c6e.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_b40763d0b9ce4e0d8fb5f519f1f49f8c.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_b56453718c5f46efa9c46feb194b0d6e.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_b6225e159b86432d9fa5bf226bb51393.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_b821f640b8f14ed588bf48ae13f44098.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_bfbcad9633f04601ba2f824d082eaacf.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_c6cfd16905814d7c955df1f4754a8b11.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_c82c6775f0b74dbdae4524bb9aebf0ef.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_ca73c69896b34adbbe468d78d9f134bc.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_cac864a472b948b0bfe18c8e9a19aeb5.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_cb330e8dc8ac411eba2dc8676c9c4403.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_d0a054a678fc4c38b496d10e91a2c735.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_d148857715424aabbd32b8ffe56c4082.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_d33701435f964c90a86c22e204dd5fde.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_d448c2553f474193a1224df1c38f74d4.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_d8d45c58819948c9b352d74383944c4a.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_db611a692a704f6db3c18e77f79fd2f0.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_dcaa5c8b729b4fb599399dbf4557e43e.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_dda863d833614d04b96bbe21b161768d.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_eabf427d56184fb89f9b5f27e73f7988.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_eee6980922ca4a14b0c3341fa8a904d9.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_f478ff72b57f4f4283c22ac22ae84134.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_f931feb0e85940879d194c0e20d9e28a.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_f9bbe6a065004a4c8e018c6ad63063b2.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_fbfc552e48164acda6605fa31fc2f563.sampled.jsonl - - gs://levanter-data/dev/redpajama/github/filtered_fc817e4c16494957bcb89b166e91434f.sampled.jsonl - - gs://levanter-data/dev/redpajama/stackexchange/stackexchange.jsonl - - gs://levanter-data/dev/redpajama/wikipedia/wiki.jsonl -validation_urls: - - gs://levanter-data/dev/redpajama/arxiv/arxiv_023827cd-7ee8-42e6-aa7b-661731f4c70f.jsonl +configs: + arxiv: + train_urls: + - gs://levanter-data/dev/redpajama/arxiv/arxiv_023827cd-7ee8-42e6-aa7b-661731f4c70f.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_024de5df-1b7f-447c-8c3a-51407d8d6732.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_03232e26-be3f-4a28-a5d2-ee1d8c0e9831.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_034e819a-cfcb-43c6-ad25-0232ad48823c.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_077ae8de-a68e-47e7-95a6-6d82f8f4eeb9.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_0af50072-df4c-4084-a833-cebbd046e70e.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_0de84cfc-c080-471f-b139-1bf061db4feb.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_0fbdd8ad-32d8-4228-9a40-e09dde689760.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_11c659c1-ffbf-4455-abfd-058f6bbf4bb2.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_1958455d-6543-4307-a081-d86ce0637f9a.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_1982fb29-c4ed-4dd3-855c-666e63bc62d9.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_1caed86f-5625-4941-bdc1-cc57e4fec1cd.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_1d3a0cd6-f0e6-4106-a080-524a4bd50016.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_29d54f5a-1dd0-4e9a-b783-fb2eec9db072.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_29fd3d99-53fb-43e2-a4a5-2fd01bf77258.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_2b224cd9-286e-46ac-8c4e-c1e3befc8760.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_2c131fca-2a05-4d5f-a805-59d2af3477e2.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_2f28f1a7-6972-48ad-8997-65a5d52e4f1c.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_30440198-cd90-48c6-82c1-ea871b8c21c5.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_39367d6c-d7d4-45fc-a929-8a17184d1744.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_393d19f2-1cd1-421f-be8a-78d955fdf602.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_3a5d4f93-97ec-483a-88ef-324df9651b3f.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_3c89ea11-69ff-4049-b775-f0c785997909.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_3d5a011a-4bbe-4585-a2bd-ff3e943c8671.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_3f805f4b-6f7f-42a8-a006-47c1e0401bd7.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_3f9eb7ad-f266-4154-8d4d-54deeffde075.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_400748d3-0076-4a04-8a1c-6055ba0b5a2d.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_44e19375-3995-4dff-a3b6-8a25247a165c.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_4a8cf52f-81d0-4875-9528-466b1cbc71e1.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_4cc7015c-c39a-4bf6-9686-c00b3343edd9.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_50757a42-079b-41ec-bcca-73759faffd62.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_575ae832-e770-4a89-bfa7-c56f16dbca69.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_580be642-bb73-4d0d-8b5e-f494722934cd.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_5a02d9ee-12a0-437d-808f-d26f0eb2012b.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_5d8d402b-8277-480a-b5fa-71169726864f.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_5ee33ef7-455e-4fd5-9512-c4771dd802c1.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_610c82ed-b9ee-449c-83b0-601205f3a74a.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_629fe3ca-075f-4663-9b81-b807f3b42bf2.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_64e5075e-e87e-4b2a-9e38-e5c102f6f2b1.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_65dd2ff6-dae3-4a60-90d3-c3d7349fc92f.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_6719ecd2-fe34-4078-a584-320d921cbf6f.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_6938ee72-43ee-4ade-8840-151a402383b0.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_73241940-66c1-481c-b53a-f5e8b9afe9fa.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_751370b5-c7cb-44d8-a039-1468ee6747ab.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_75af5d17-5ebb-4460-9f2a-dc9fe880a936.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_79d50803-f7d9-4aa8-bf1a-d807980a40c6.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_7b26046f-7c8d-405b-911b-df51e1a069fa.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_7d1d69dc-bc8e-4817-9cab-afdc002ab7c4.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_7ea7a996-b1bb-4773-a36a-461dce2de861.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_8232f276-9e3f-463a-9350-362de1b501d1.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_8509f5a7-64a8-4813-92dc-f6eb53e3aacc.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_85b4c166-469d-449c-ab3d-5214c1d80246.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_872b620a-b4fd-45d3-92bc-ff0584447705.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_88f24f8d-16d3-4a21-894d-192033d0fa67.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_8e6bd730-0f10-49d9-9b02-5ce16da47483.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_8ede1b71-6846-439a-acba-86a57cfec3d2.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_8f74f6ba-1c53-42d5-a3c7-e4ef46a71133.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_90fa9c2b-25b0-47b7-af2b-a683356e543b.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_92ec488a-287d-4bf0-977b-6998cf0cf476.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_94a393a1-3b23-4961-a0a6-70bad5b4979c.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_94b9df70-a95f-4545-be3a-5a34f7b09fb3.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_95ffc9e1-c505-4a3b-8fb0-cbc98b8703e1.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_98e718fd-5b0e-439f-a00c-57b61e06b395.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_9f50a028-2586-4e0d-bcfd-d9d2d74e8953.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_9f8d7d10-dda7-4e44-b00c-811635a199c8.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_a055bd62-1ec2-47cf-bad2-321e3d4f053f.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_a1e3430e-ef5c-4a86-914d-88e8fb7818c0.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_a2b4cb3d-bea3-478e-82a2-77c00a827250.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_a647274d-799d-4e7a-a485-b8632a87061e.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_a94ea420-99ae-4d58-9cdc-d4666e3322a7.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_ab7c0034-7fc1-4fa8-bae3-e97b85fc16a4.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_b0732cff-657e-4e69-87b8-66e8025bf441.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_b1f3b912-a2ab-43bd-8811-43d84b422506.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_b6c2e4d3-d215-4d99-891f-d20b997d4d5a.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_bbfabe6b-b9bc-476b-b8f0-7d6c47e9d2be.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_bf11ef29-a3f9-4a2d-9dbf-ebcc56d39fdb.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_c05da42b-4939-4f55-867c-16cf6d228e60.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_c1fc3dd5-861f-4b8d-b7a2-eb8f6887b33b.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_c44164b4-0770-48d0-87db-590ca529032a.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_c6b43cda-ca5c-4855-9c08-0f8264cab1af.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_c6e57a16-4879-4dcf-b591-503cfb46a360.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_ca6e3842-6ca4-4230-84fa-376a3374c380.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_caf769e4-7308-4419-9114-900ca213682a.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_d0da78e9-3dcf-4a46-85c1-f23ed00178bc.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_d386838c-a51e-4839-9f27-8495b2466e49.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_d41edf9f-0ebb-4866-b3fe-50785746b36b.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_d6a7fc44-b584-4dd8-9de2-e981afe0bb4a.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_d7c49dbb-c008-47fc-9cbe-8d5695842d21.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_db3b4be7-4e98-4fe9-96bf-05a5788815e3.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_db981f69-9eca-4031-8565-318b949efbfe.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_dbd4105f-7cbb-4483-a7b2-96b17b7fb594.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_de42e348-b333-4d35-b883-9bfc94f29822.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_de744938-fa6c-45dd-b600-428dd7c63a73.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_e8f25867-697d-4f52-84e1-e50a95bc182b.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_eb4e26d4-6625-4f8a-b5fe-6f3a9b8a4b79.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_f141b736-5ce4-4f18-bb29-704227ca4bd1.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_f50efdc6-f88e-4fa6-9ef6-dd1d8314bb36.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_f7680c03-70df-4781-a98d-c88695f92f04.jsonl + - gs://levanter-data/dev/redpajama/arxiv/arxiv_fbc62949-624d-4943-9731-f5c46242ba55.jsonl + validation_urls: + - gs://levanter-data/dev/redpajama/arxiv/arxiv_fd572627-cce7-4667-a684-fef096dfbeb7.jsonl + book: + train_urls: + - gs://levanter-data/dev/redpajama/book/book.jsonl + c4: + train_urls: + - gs://levanter-data/dev/redpajama/c4/c4-train.{00000..01022}-of-01024.jsonl + validation_urls: + - https://data.together.xyz/redpajama-data-1T/v1.0.0/c4/c4-train.01023-of-01024.jsonl + common_crawl: + train_urls: + - gs://levanter-data/dev/redpajama/2019-30/en_head_{0000..0055}.json.gz.dedup.classifier.jsonl.zst + - gs://levanter-data/dev/redpajama/2019-30/en_middle_{0060..0096}.json.gz.dedup.classifier.jsonl.zst + - gs://levanter-data/dev/redpajama/2020-05/en_head_{0000..0068}.json.gz.dedup.classifier.jsonl.zst + - gs://levanter-data/dev/redpajama/2020-05/en_middle_{0000..0128}.json.gz.dedup.classifier.jsonl.zst + - gs://levanter-data/dev/redpajama/2021-04/en_head_{0000..0072}.json.gz.dedup.classifier.jsonl.zst + - gs://levanter-data/dev/redpajama/2021-04/en_middle_{0000..0102}.json.gz.dedup.classifier.jsonl.zst + - gs://levanter-data/dev/redpajama/2022-05/en_head_{0000..0059}.json.gz.dedup.classifier.jsonl.zst + - gs://levanter-data/dev/redpajama/2022-05/en_middle_{0000..0096}.json.gz.dedup.classifier.jsonl.zst + - gs://levanter-data/dev/redpajama/2023-06/en_head_{0000..0069}.json.gz.dedup.classifier.jsonl.zst + - gs://levanter-data/dev/redpajama/2023-06/en_middle_{0000..0103}.json.gz.dedup.classifier.jsonl.zst + validation_urls: + - https://data.together.xyz/redpajama-data-1T/v1.0.0/common_crawl/2023-06/en_middle_0104.json.gz.dedup.classifier.jsonl.zst + github: + train_urls: + - gs://levanter-data/dev/redpajama/github/filtered_08cdfa755e6d4d89b673d5bd1acee5f6.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_0f27d10d846a473b96070c3394832f32.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_0f979046c8e64e0fb5843d2634a9957d.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_10f129bfd0af45caa9cd72aa9d863ec5.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_11a1943edfa349c7939382799599eed6.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_17197bd2478044bebd9ff4634b6dfcee.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_1d750c0ce39d40c6bc20bad9469e5a99.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_21078cf63afb4d9eb4a7876f726a7226.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_216883d3a669406699428bc485a4c228.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_24278a707deb445b8e4f59c83dd67910.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_25989b8233a04ac791b0eccd502e0c7a.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_26a6fa61c5eb4bb885e7bc643e285f0e.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_275100c1a44f4451b0343373ebc5637a.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_27f05c041a1c401783f90b9415e40e4b.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_28106e66bfd94978abbc15ec845aeddb.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_2afd093fefad4c8da76cc539e8fb6137.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_30d01c4edab64866bda8c609e90b4f4e.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_34b0785b77814b7583433ddb27a61ae0.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_34e78793c1e94eeebd92852399097596.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_366012588bef4d749bbbea76ae701141.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_36c2a955ddd84672bbc778aa4ad2fbaf.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_3be6a5cc7428401393c23e5516a43537.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_3cc6cab9266746a6befa23648aa43119.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_3f09f384f0734a4b912bb75db3f812bc.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_478afe8aaccb43e6be2de7e34e041ef3.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_483f11d6bc864f7fbfbe63bdf3583ce2.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_4b6883dc304c4e799620ec95b96dc91a.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_4bebabdbd8544da7a2071864ccf81f2e.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_4d44caf43c154ae4aaeab36eab0221c9.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_4f98dc136ba94eeaa1ba6c974814b33c.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_4ff3604761614a3db550ef758e6457b5.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_50423e84046948b4a2a70e7e4538e12d.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_58fad5994b4446c6bceb33453484acb4.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_5abf005e4e634f1dbfa8bd20b5687092.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_5f2b3517159b426bb1a9e81ca189abcd.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_610e908cafaa4d53958de50ad700822a.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_6353ab14cb8f4623a7d75678d9e7f44e.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_64747f25102740bab0ab54559569342a.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_6742e99802894114a3ba44841f49b168.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_677e10f9b0af4c489e60670352e7e224.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_68534e6a093744fa9f38fa1a9cf51232.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_6adcf92fb2ee48059cb60579f2e931f7.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_70f0aa43987643a7874286bca4faa66b.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_7462f1a34f594e9f8334d9a0cbbf80e7.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_78a13a25258f4d24923702c07445e20e.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_78e3afc1cfee41fbb7eaae2e5bfaa17b.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_791ed086a57f4879bb1596bed6d37bb3.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_7b5cc18857a34a0981b54f082de55cf8.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_7c54b908a2df4ec2ba316d2081fc674e.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_7edd0af0b61c426e93e8bd3f549a8f78.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_7fa232e63a5d44a88a267181e9ac47b4.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_81138c01f3a84f7fa8b56bf3d8fa35ce.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_82a9a647b0eb4eb080d7ac15a13c765b.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_835ac26c7a70447b97f4ec38bcb969ed.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_8bc3c4fae78c41d999b2ae6d97cce96c.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_8dc430f8f7114f018440c7b3d990e602.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_8e1db2cf6c98420a88bad52fd57f4aa7.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_919156910e704e6ca52e0d4880cdbb63.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_944beac1347f491faa88f43c25d26fe4.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_96175b6e4c764abfbbf14e78d4fd6464.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_977d1fcdab92452d9dc38b2f4a99389b.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_9eea501af8544d0b88f0b002850829d4.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_a26c51ffe2924fd8ad6694c6aa0eacc5.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_a3a01a2f81ed4cd2afb30e175200b48f.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_a3d3e6f3f7d5495ca9ecf94d808fd350.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_a777da5620f1467f8df3616b17d533dc.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_a8fcae75b0c3410faabcff02f0056a36.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_aabc2d54d1c946908d3400228e0f238c.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_ad160286662849f49d1e6de27c0f1d15.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_adda41d791974289aff042e2e3d07ec3.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_ae1813abc63f4b1998dfa608e7fe5588.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_af7b386db97e4211a7378be08d7b3f4f.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_b15c575fa9f8465d98f70ba2f2f73c6e.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_b40763d0b9ce4e0d8fb5f519f1f49f8c.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_b56453718c5f46efa9c46feb194b0d6e.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_b6225e159b86432d9fa5bf226bb51393.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_b821f640b8f14ed588bf48ae13f44098.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_bfbcad9633f04601ba2f824d082eaacf.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_c6cfd16905814d7c955df1f4754a8b11.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_c82c6775f0b74dbdae4524bb9aebf0ef.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_ca73c69896b34adbbe468d78d9f134bc.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_cac864a472b948b0bfe18c8e9a19aeb5.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_cb330e8dc8ac411eba2dc8676c9c4403.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_d0a054a678fc4c38b496d10e91a2c735.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_d148857715424aabbd32b8ffe56c4082.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_d33701435f964c90a86c22e204dd5fde.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_d448c2553f474193a1224df1c38f74d4.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_d8d45c58819948c9b352d74383944c4a.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_db611a692a704f6db3c18e77f79fd2f0.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_dcaa5c8b729b4fb599399dbf4557e43e.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_dda863d833614d04b96bbe21b161768d.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_eabf427d56184fb89f9b5f27e73f7988.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_eee6980922ca4a14b0c3341fa8a904d9.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_f478ff72b57f4f4283c22ac22ae84134.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_f931feb0e85940879d194c0e20d9e28a.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_f9bbe6a065004a4c8e018c6ad63063b2.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_fbfc552e48164acda6605fa31fc2f563.sampled.jsonl + - gs://levanter-data/dev/redpajama/github/filtered_fc817e4c16494957bcb89b166e91434f.sampled.jsonl + StackExchange: + train_urls: + - gs://levanter-data/dev/redpajama/stackexchange/stackexchange.jsonl + wikipedia: + train_urls: + - gs://levanter-data/dev/redpajama/wikipedia/wiki.jsonl +train_weights: + arxiv: 28 + book: 26 + c4: 175 + common_crawl: 878 + github: 59 + StackExchange: 20 + wikipedia: 24 +stop_strategy: all_exhausted \ No newline at end of file diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 8ac061eb3..d83146244 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -21,7 +21,7 @@ import haliax as hax from haliax import Axis -from levanter.data.mixture import MixtureDataset +from levanter.data.mixture import MixtureDataset, StopStrategy # intercept the logging nonsense here from levanter.logging import silence_transformer_nag # noqa @@ -639,6 +639,7 @@ class LMMixtureDatasetConfig(LMTaskConfig): """ configuration of each dataset source (urls, hf dataset id, etc.) """ train_weights: Dict[str, float] = field(default_factory=dict) """ weights for each dataset source. They will be normalized to sum to 1. """ + stop_strategy: str = StopStrategy.FIRST_STOP_STRATEGY def __post_init__(self): if len(self.configs) == 0: @@ -655,7 +656,7 @@ def train_set( ) -> ShardableDataset[np.ndarray]: doc_caches = self.build_caches("train", monitors=monitors) token_datasets = {name: TokenSeqDataset(cache, seq_len, stride=None) for name, cache in doc_caches.items()} - return MixtureDataset(datasets=token_datasets, weights=self.train_weights) + return MixtureDataset(datasets=token_datasets, weights=self.train_weights, stop_strategy=self.stop_strategy) def validation_sets( self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True