diff --git a/configs/dolma2-resharding/ablation.log b/configs/dolma2-resharding/ablation.log new file mode 100644 index 00000000..a5427c72 --- /dev/null +++ b/configs/dolma2-resharding/ablation.log @@ -0,0 +1,1702 @@ +Sum of weights is 0.9886, rounding up to 1 +Subset Name : s2pdf:electronics_and_hardware +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/electronics_and_hardware +Natural tokens : 7.02 B +Desired tokens : 4.34 B +Target ratio : 0.0007 +Repetition factor : 1 + +Subset Name : s2pdf:science_math_and_technology +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/science_math_and_technology +Natural tokens : 414.21 B +Desired tokens : 479.91 B +Target ratio : 0.0809 +Repetition factor : 2 + +Subset Name : all-dressed:entertainment:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0014 +Natural tokens : 40.30 B +Desired tokens : 0.17 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:fashion_and_beauty:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0020 +Natural tokens : 16.52 B +Desired tokens : 0.45 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:health:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0018 +Natural tokens : 54.39 B +Desired tokens : 123.70 B +Target ratio : 0.0209 +Repetition factor : 3 + +Subset Name : all-dressed:home_and_hobbies:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0020 +Natural tokens : 44.67 B +Desired tokens : 32.98 B +Target ratio : 0.0056 +Repetition factor : 1 + +Subset Name : 
all-dressed:literature:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0015 +Natural tokens : 26.05 B +Desired tokens : 0.67 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:religion:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0018 +Natural tokens : 31.17 B +Desired tokens : 16.12 B +Target ratio : 0.0027 +Repetition factor : 1 + +Subset Name : all-dressed:social_life:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0014 +Natural tokens : 22.74 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:transportation:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0013 +Natural tokens : 12.18 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : s2pdf:games +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/games +Natural tokens : 2.41 B +Desired tokens : 0.19 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : s2pdf:religion +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/religion +Natural tokens : 24.32 B +Desired tokens : 4.06 B +Target ratio : 0.0007 +Repetition factor : 1 + +Subset Name : all-dressed:art_and_design:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0014 +Natural tokens : 11.28 B +Desired tokens : 0.01 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:crime_and_law:vigintile_0017 +Base URI : 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0017 +Natural tokens : 19.18 B +Desired tokens : 3.45 B +Target ratio : 0.0006 +Repetition factor : 1 + +Subset Name : all-dressed:electronics_and_hardware:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0013 +Natural tokens : 11.85 B +Desired tokens : 0.14 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:fashion_and_beauty:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0012 +Natural tokens : 11.75 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:finance_and_business:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0020 +Natural tokens : 50.20 B +Desired tokens : 133.28 B +Target ratio : 0.0225 +Repetition factor : 3 + +Subset Name : all-dressed:games:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0015 +Natural tokens : 23.61 B +Desired tokens : 4.10 B +Target ratio : 0.0007 +Repetition factor : 1 + +Subset Name : all-dressed:history_and_geography:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0014 +Natural tokens : 13.14 B +Desired tokens : 0.02 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:industrial:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0012 +Natural tokens : 7.09 B 
+Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:politics:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0012 +Natural tokens : 39.05 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:religion:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0016 +Natural tokens : 26.98 B +Desired tokens : 0.19 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:social_life:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0015 +Natural tokens : 25.20 B +Desired tokens : 0.01 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:software_development:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0012 +Natural tokens : 10.58 B +Desired tokens : 12.52 B +Target ratio : 0.0021 +Repetition factor : 2 + +Subset Name : all-dressed:transportation:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0015 +Natural tokens : 14.37 B +Desired tokens : 0.05 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : s2pdf:education_and_jobs +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/education_and_jobs +Natural tokens : 133.65 B +Desired tokens : 41.88 B +Target ratio : 0.0071 +Repetition factor : 1 + +Subset Name : stack-edu:Cpp +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Cpp +Natural tokens : 12.53 B +Desired tokens : 49.04 B +Target ratio : 0.0083 +Repetition 
factor : 4 + +Subset Name : all-dressed:adult_content:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0014 +Natural tokens : 6.01 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:crime_and_law:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0012 +Natural tokens : 12.60 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:education_and_jobs:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0018 +Natural tokens : 30.67 B +Desired tokens : 39.14 B +Target ratio : 0.0066 +Repetition factor : 2 + +Subset Name : all-dressed:finance_and_business:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0017 +Natural tokens : 61.29 B +Desired tokens : 8.82 B +Target ratio : 0.0015 +Repetition factor : 1 + +Subset Name : all-dressed:games:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0013 +Natural tokens : 20.82 B +Desired tokens : 0.61 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:health:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0020 +Natural tokens : 40.46 B +Desired tokens : 281.14 B +Target ratio : 0.0474 +Repetition factor : 7 + +Subset Name : all-dressed:home_and_hobbies:vigintile_0014 +Base URI : 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0014 +Natural tokens : 39.06 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:industrial:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0018 +Natural tokens : 9.60 B +Desired tokens : 14.13 B +Target ratio : 0.0024 +Repetition factor : 2 + +Subset Name : all-dressed:software:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0012 +Natural tokens : 12.76 B +Desired tokens : 0.41 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:software_development:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0020 +Natural tokens : 12.04 B +Desired tokens : 100.76 B +Target ratio : 0.0170 +Repetition factor : 9 + +Subset Name : all-dressed:transportation:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0016 +Natural tokens : 15.59 B +Desired tokens : 0.24 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : s2pdf:art_and_design +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/art_and_design +Natural tokens : 6.70 B +Desired tokens : 0.77 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : s2pdf:literature +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/literature +Natural tokens : 31.36 B +Desired tokens : 7.27 B +Target ratio : 0.0012 +Repetition factor : 1 + +Subset Name : stack-edu:Python +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Python 
+Natural tokens : 18.02 B +Desired tokens : 74.80 B +Target ratio : 0.0126 +Repetition factor : 5 + +Subset Name : all-dressed:art_and_design:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0012 +Natural tokens : 8.95 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:education_and_jobs:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0012 +Natural tokens : 21.33 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:electronics_and_hardware:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0017 +Natural tokens : 16.53 B +Desired tokens : 13.18 B +Target ratio : 0.0022 +Repetition factor : 1 + +Subset Name : all-dressed:fashion_and_beauty:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0014 +Natural tokens : 14.44 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:games:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0018 +Natural tokens : 29.32 B +Desired tokens : 82.13 B +Target ratio : 0.0139 +Repetition factor : 3 + +Subset Name : all-dressed:history_and_geography:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0017 +Natural tokens : 19.46 B +Desired tokens : 4.30 B +Target ratio : 0.0007 +Repetition factor : 1 + +Subset Name : all-dressed:industrial:vigintile_0015 +Base 
URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0015 +Natural tokens : 8.62 B +Desired tokens : 0.11 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:politics:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0016 +Natural tokens : 52.06 B +Desired tokens : 0.56 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:science_math_and_technology:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0016 +Natural tokens : 40.51 B +Desired tokens : 112.08 B +Target ratio : 0.0189 +Repetition factor : 3 + +Subset Name : all-dressed:software:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0013 +Natural tokens : 13.89 B +Desired tokens : 0.98 B +Target ratio : 0.0002 +Repetition factor : 1 + +Subset Name : all-dressed:sports_and_fitness:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0011 +Natural tokens : 29.75 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:transportation:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0018 +Natural tokens : 16.76 B +Desired tokens : 8.24 B +Target ratio : 0.0014 +Repetition factor : 1 + +Subset Name : s2pdf:health +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/health +Natural tokens : 105.24 B +Desired tokens : 95.45 B +Target ratio : 0.0161 +Repetition factor : 1 + +Subset Name : stack-edu:Go 
+Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Go +Natural tokens : 1.40 B +Desired tokens : 5.37 B +Target ratio : 0.0009 +Repetition factor : 4 + +Subset Name : all-dressed:adult_content:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0013 +Natural tokens : 5.55 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:crime_and_law:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0013 +Natural tokens : 13.68 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:electronics_and_hardware:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0012 +Natural tokens : 10.99 B +Desired tokens : 0.05 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:fashion_and_beauty:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0017 +Natural tokens : 22.34 B +Desired tokens : 0.01 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:food_and_dining:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0014 +Natural tokens : 17.83 B +Desired tokens : 0.02 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:health:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0012 +Natural tokens : 37.34 B +Desired tokens : 0.05 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset 
Name : all-dressed:literature:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0014 +Natural tokens : 22.48 B +Desired tokens : 0.19 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:religion:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0020 +Natural tokens : 22.15 B +Desired tokens : 71.14 B +Target ratio : 0.0120 +Repetition factor : 4 + +Subset Name : all-dressed:software:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0020 +Natural tokens : 12.83 B +Desired tokens : 96.35 B +Target ratio : 0.0162 +Repetition factor : 8 + +Subset Name : all-dressed:sports_and_fitness:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0017 +Natural tokens : 41.14 B +Desired tokens : 2.45 B +Target ratio : 0.0004 +Repetition factor : 1 + +Subset Name : all-dressed:travel_and_tourism:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0011 +Natural tokens : 15.42 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : s2pdf:finance_and_business +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/finance_and_business +Natural tokens : 59.19 B +Desired tokens : 11.06 B +Target ratio : 0.0019 +Repetition factor : 1 + +Subset Name : s2pdf:travel_and_tourism +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/travel_and_tourism +Natural tokens : 2.04 B +Desired tokens : 0.08 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : stack-edu:PHP 
+Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/PHP +Natural tokens : 7.40 B +Desired tokens : 24.81 B +Target ratio : 0.0042 +Repetition factor : 4 + +Subset Name : all-dressed:art_and_design:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0013 +Natural tokens : 9.97 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:crime_and_law:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0020 +Natural tokens : 16.64 B +Desired tokens : 104.06 B +Target ratio : 0.0175 +Repetition factor : 7 + +Subset Name : all-dressed:electronics_and_hardware:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0016 +Natural tokens : 15.89 B +Desired tokens : 4.02 B +Target ratio : 0.0007 +Repetition factor : 1 + +Subset Name : all-dressed:fashion_and_beauty:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0013 +Natural tokens : 13.10 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:finance_and_business:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0018 +Natural tokens : 58.25 B +Desired tokens : 39.14 B +Target ratio : 0.0066 +Repetition factor : 1 + +Subset Name : all-dressed:games:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0014 +Natural tokens : 22.28 B +Desired tokens : 1.61 B +Target ratio : 0.0003 +Repetition factor : 
1 + +Subset Name : all-dressed:history_and_geography:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0013 +Natural tokens : 11.40 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:home_and_hobbies:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0015 +Natural tokens : 44.02 B +Desired tokens : 0.02 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:literature:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0020 +Natural tokens : 33.14 B +Desired tokens : 220.00 B +Target ratio : 0.0371 +Repetition factor : 7 + +Subset Name : all-dressed:religion:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0017 +Natural tokens : 29.54 B +Desired tokens : 1.69 B +Target ratio : 0.0003 +Repetition factor : 1 + +Subset Name : all-dressed:social_life:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0013 +Natural tokens : 20.53 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:software:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0017 +Natural tokens : 17.16 B +Desired tokens : 30.09 B +Target ratio : 0.0051 +Repetition factor : 2 + +Subset Name : all-dressed:sports_and_fitness:vigintile_0014 +Base URI : 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0014 +Natural tokens : 36.41 B +Desired tokens : 0.02 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : s2pdf:crime_and_law +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/crime_and_law +Natural tokens : 41.59 B +Desired tokens : 12.37 B +Target ratio : 0.0021 +Repetition factor : 1 + +Subset Name : stack-edu:Markdown +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Markdown +Natural tokens : 28.92 B +Desired tokens : 68.05 B +Target ratio : 0.0115 +Repetition factor : 3 + +Subset Name : all-dressed:adult_content:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0017 +Natural tokens : 10.17 B +Desired tokens : 0.43 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:crime_and_law:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0015 +Natural tokens : 16.10 B +Desired tokens : 0.10 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:electronics_and_hardware:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0011 +Natural tokens : 10.17 B +Desired tokens : 0.01 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:entertainment:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0017 +Natural tokens : 53.47 B +Desired tokens : 15.47 B +Target ratio : 0.0026 +Repetition factor : 1 + +Subset Name : all-dressed:finance_and_business:vigintile_0013 +Base URI : 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0013 +Natural tokens : 55.13 B +Desired tokens : 0.01 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:food_and_dining:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0015 +Natural tokens : 19.55 B +Desired tokens : 0.09 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:health:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0014 +Natural tokens : 44.87 B +Desired tokens : 0.63 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:home_and_hobbies:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0013 +Natural tokens : 35.01 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:literature:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0016 +Natural tokens : 30.08 B +Desired tokens : 2.51 B +Target ratio : 0.0004 +Repetition factor : 1 + +Subset Name : all-dressed:politics:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0018 +Natural tokens : 54.90 B +Desired tokens : 19.78 B +Target ratio : 0.0033 +Repetition factor : 1 + +Subset Name : all-dressed:science_math_and_technology:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0018 +Natural tokens : 45.02 B +Desired tokens : 271.35 B 
+Target ratio : 0.0458 +Repetition factor : 7 + +Subset Name : all-dressed:software:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0014 +Natural tokens : 15.67 B +Desired tokens : 2.42 B +Target ratio : 0.0004 +Repetition factor : 1 + +Subset Name : all-dressed:sports_and_fitness:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0012 +Natural tokens : 32.14 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:transportation:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0020 +Natural tokens : 14.35 B +Desired tokens : 34.23 B +Target ratio : 0.0058 +Repetition factor : 3 + +Subset Name : s2pdf:food_and_dining +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/food_and_dining +Natural tokens : 2.25 B +Desired tokens : 0.05 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : s2pdf:software +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/software +Natural tokens : 8.81 B +Desired tokens : 9.72 B +Target ratio : 0.0016 +Repetition factor : 2 + +Subset Name : stack-edu:SQL +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/SQL +Natural tokens : 7.06 B +Desired tokens : 7.46 B +Target ratio : 0.0013 +Repetition factor : 2 + +Subset Name : all-dressed:art_and_design:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0016 +Natural tokens : 13.48 B +Desired tokens : 0.18 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:education_and_jobs:vigintile_0014 +Base URI : 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0014 +Natural tokens : 25.40 B +Desired tokens : 0.05 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:entertainment:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0012 +Natural tokens : 32.80 B +Desired tokens : 0.01 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:food_and_dining:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0013 +Natural tokens : 16.21 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:health:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0011 +Natural tokens : 34.13 B +Desired tokens : 0.01 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:history_and_geography:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0020 +Natural tokens : 13.17 B +Desired tokens : 87.37 B +Target ratio : 0.0147 +Repetition factor : 7 + +Subset Name : all-dressed:industrial:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0016 +Natural tokens : 9.30 B +Desired tokens : 0.53 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:politics:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0017 +Natural tokens : 54.98 B +Desired tokens : 3.40 B +Target ratio : 0.0006 
+Repetition factor : 1 + +Subset Name : all-dressed:science_math_and_technology:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0017 +Natural tokens : 45.61 B +Desired tokens : 186.06 B +Target ratio : 0.0314 +Repetition factor : 5 + +Subset Name : all-dressed:software:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0015 +Natural tokens : 16.48 B +Desired tokens : 5.71 B +Target ratio : 0.0010 +Repetition factor : 1 + +Subset Name : all-dressed:sports_and_fitness:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0013 +Natural tokens : 34.06 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:travel_and_tourism:vigintile_0010 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0010 +Natural tokens : 14.43 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : s2pdf:home_and_hobbies +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/home_and_hobbies +Natural tokens : 3.79 B +Desired tokens : 0.66 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : s2pdf:software_development +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/software_development +Natural tokens : 40.25 B +Desired tokens : 69.68 B +Target ratio : 0.0117 +Repetition factor : 2 + +Subset Name : stack-edu:TypeScript +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/TypeScript +Natural tokens : 2.50 B +Desired tokens : 9.38 B +Target ratio : 0.0016 +Repetition factor : 4 + +Subset Name : 
all-dressed:art_and_design:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0018 +Natural tokens : 15.98 B +Desired tokens : 10.52 B +Target ratio : 0.0018 +Repetition factor : 1 + +Subset Name : all-dressed:education_and_jobs:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0016 +Natural tokens : 29.41 B +Desired tokens : 1.29 B +Target ratio : 0.0002 +Repetition factor : 1 + +Subset Name : all-dressed:entertainment:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0013 +Natural tokens : 36.52 B +Desired tokens : 0.05 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:fashion_and_beauty:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0015 +Natural tokens : 16.42 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:food_and_dining:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0012 +Natural tokens : 14.67 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:games:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0020 +Natural tokens : 25.85 B +Desired tokens : 183.47 B +Target ratio : 0.0309 +Repetition factor : 8 + +Subset Name : all-dressed:history_and_geography:vigintile_0018 +Base URI : 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0018 +Natural tokens : 20.08 B +Desired tokens : 29.84 B +Target ratio : 0.0050 +Repetition factor : 2 + +Subset Name : all-dressed:industrial:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0017 +Natural tokens : 9.76 B +Desired tokens : 2.84 B +Target ratio : 0.0005 +Repetition factor : 1 + +Subset Name : all-dressed:politics:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0015 +Natural tokens : 49.64 B +Desired tokens : 0.10 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:science_math_and_technology:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0014 +Natural tokens : 31.86 B +Desired tokens : 41.08 B +Target ratio : 0.0069 +Repetition factor : 2 + +Subset Name : all-dressed:social_life:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0018 +Natural tokens : 31.39 B +Desired tokens : 3.54 B +Target ratio : 0.0006 +Repetition factor : 1 + +Subset Name : all-dressed:software:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0018 +Natural tokens : 16.59 B +Desired tokens : 63.99 B +Target ratio : 0.0108 +Repetition factor : 4 + +Subset Name : all-dressed:sports_and_fitness:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0016 +Natural tokens : 40.32 B +Desired tokens : 0.46 B 
+Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:travel_and_tourism:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0013 +Natural tokens : 17.96 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : s2pdf:history_and_geography +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/history_and_geography +Natural tokens : 25.52 B +Desired tokens : 4.02 B +Target ratio : 0.0007 +Repetition factor : 1 + +Subset Name : s2pdf:transportation +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/transportation +Natural tokens : 16.52 B +Desired tokens : 1.79 B +Target ratio : 0.0003 +Repetition factor : 1 + +Subset Name : stack-edu:Rust +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Rust +Natural tokens : 1.42 B +Desired tokens : 5.73 B +Target ratio : 0.0010 +Repetition factor : 5 + +Subset Name : all-dressed:adult_content:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0020 +Natural tokens : 0.47 B +Desired tokens : 0.62 B +Target ratio : 0.0001 +Repetition factor : 2 + +Subset Name : all-dressed:crime_and_law:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0011 +Natural tokens : 11.66 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:education_and_jobs:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0017 +Natural tokens : 30.13 B +Desired tokens : 7.04 B +Target ratio : 0.0012 +Repetition factor : 1 + +Subset Name : 
all-dressed:finance_and_business:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0012 +Natural tokens : 49.98 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:food_and_dining:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0017 +Natural tokens : 20.98 B +Desired tokens : 3.02 B +Target ratio : 0.0005 +Repetition factor : 1 + +Subset Name : all-dressed:home_and_hobbies:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0016 +Natural tokens : 50.70 B +Desired tokens : 0.14 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:literature:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0017 +Natural tokens : 37.19 B +Desired tokens : 12.00 B +Target ratio : 0.0020 +Repetition factor : 1 + +Subset Name : all-dressed:religion:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0013 +Natural tokens : 19.33 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:science_math_and_technology:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0020 +Natural tokens : 28.83 B +Desired tokens : 235.43 B +Target ratio : 0.0397 +Repetition factor : 9 + +Subset Name : all-dressed:software:vigintile_0016 +Base URI : 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0016 +Natural tokens : 17.08 B +Desired tokens : 13.47 B +Target ratio : 0.0023 +Repetition factor : 1 + +Subset Name : all-dressed:sports_and_fitness:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0015 +Natural tokens : 38.65 B +Desired tokens : 0.09 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:travel_and_tourism:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0012 +Natural tokens : 16.70 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : wikipedia +Base URI : s3://ai2-llm/preprocessed/wikipedia-dolma-0823/allenai/dolma2-tokenizer +Natural tokens : 3.69 B +Desired tokens : 2.50 B +Target ratio : 0.0004 +Repetition factor : 1 + +Subset Name : stack-edu:C +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/C +Natural tokens : 4.74 B +Desired tokens : 16.58 B +Target ratio : 0.0028 +Repetition factor : 4 + +Subset Name : stack-edu:Ruby +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Ruby +Natural tokens : 1.39 B +Desired tokens : 5.37 B +Target ratio : 0.0009 +Repetition factor : 4 + +Subset Name : all-dressed:adult_content:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0018 +Natural tokens : 8.42 B +Desired tokens : 4.08 B +Target ratio : 0.0007 +Repetition factor : 1 + +Subset Name : all-dressed:crime_and_law:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0018 +Natural tokens : 20.39 B +Desired tokens 
: 23.72 B +Target ratio : 0.0040 +Repetition factor : 2 + +Subset Name : all-dressed:electronics_and_hardware:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0014 +Natural tokens : 13.07 B +Desired tokens : 0.40 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:finance_and_business:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0014 +Natural tokens : 57.29 B +Desired tokens : 0.07 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:food_and_dining:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0020 +Natural tokens : 15.72 B +Desired tokens : 45.88 B +Target ratio : 0.0077 +Repetition factor : 3 + +Subset Name : all-dressed:health:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0013 +Natural tokens : 41.01 B +Desired tokens : 0.18 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:home_and_hobbies:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0012 +Natural tokens : 31.47 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:literature:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0013 +Natural tokens : 19.52 B +Desired tokens : 0.05 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:science_math_and_technology:vigintile_0012 +Base URI : 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0012 +Natural tokens : 25.66 B +Desired tokens : 10.92 B +Target ratio : 0.0018 +Repetition factor : 1 + +Subset Name : all-dressed:social_life:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0020 +Natural tokens : 25.94 B +Desired tokens : 16.85 B +Target ratio : 0.0028 +Repetition factor : 1 + +Subset Name : all-dressed:software_development:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0015 +Natural tokens : 13.75 B +Desired tokens : 51.31 B +Target ratio : 0.0087 +Repetition factor : 4 + +Subset Name : all-dressed:transportation:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0012 +Natural tokens : 11.31 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:travel_and_tourism:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0015 +Natural tokens : 21.54 B +Desired tokens : 0.01 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : s2pdf:fashion_and_beauty +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/fashion_and_beauty +Natural tokens : 0.54 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 0 + +Subset Name : s2pdf:sports_and_fitness +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/sports_and_fitness +Natural tokens : 5.28 B +Desired tokens : 2.88 B +Target ratio : 0.0005 +Repetition factor : 1 + +Subset Name : stack-edu:JavaScript +Base URI : 
s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/JavaScript +Natural tokens : 8.89 B +Desired tokens : 35.62 B +Target ratio : 0.0060 +Repetition factor : 5 + +Subset Name : all-dressed:adult_content:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0016 +Natural tokens : 8.84 B +Desired tokens : 0.03 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:crime_and_law:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0014 +Natural tokens : 14.76 B +Desired tokens : 0.02 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:electronics_and_hardware:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0018 +Natural tokens : 16.54 B +Desired tokens : 41.69 B +Target ratio : 0.0070 +Repetition factor : 3 + +Subset Name : all-dressed:fashion_and_beauty:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0016 +Natural tokens : 19.16 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:food_and_dining:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0011 +Natural tokens : 13.46 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:games:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0016 +Natural tokens : 24.96 B +Desired tokens : 10.49 B +Target ratio : 0.0018 +Repetition factor : 1 + +Subset Name 
: all-dressed:history_and_geography:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0016 +Natural tokens : 17.00 B +Desired tokens : 0.61 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:industrial:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0014 +Natural tokens : 8.09 B +Desired tokens : 0.02 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:politics:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0014 +Natural tokens : 46.26 B +Desired tokens : 0.02 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:science_math_and_technology:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0013 +Natural tokens : 28.57 B +Desired tokens : 23.93 B +Target ratio : 0.0040 +Repetition factor : 1 + +Subset Name : all-dressed:social_life:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0016 +Natural tokens : 28.51 B +Desired tokens : 0.07 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:software_development:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0013 +Natural tokens : 11.77 B +Desired tokens : 24.93 B +Target ratio : 0.0042 +Repetition factor : 3 + +Subset Name : all-dressed:sports_and_fitness:vigintile_0020 +Base URI : 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0020 +Natural tokens : 33.45 B +Desired tokens : 43.58 B +Target ratio : 0.0073 +Repetition factor : 2 + +Subset Name : all-dressed:travel_and_tourism:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0016 +Natural tokens : 23.12 B +Desired tokens : 0.07 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : arxiv +Base URI : s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated-0625_tokenized/arxiv/train/allenai/dolma2-tokenizer +Natural tokens : 21.38 B +Desired tokens : 49.70 B +Target ratio : 0.0084 +Repetition factor : 3 + +Subset Name : s2pdf:industrial +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/industrial +Natural tokens : 28.25 B +Desired tokens : 42.87 B +Target ratio : 0.0072 +Repetition factor : 2 + +Subset Name : all-dressed:art_and_design:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0020 +Natural tokens : 12.03 B +Desired tokens : 44.37 B +Target ratio : 0.0075 +Repetition factor : 4 + +Subset Name : all-dressed:education_and_jobs:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0013 +Natural tokens : 23.44 B +Desired tokens : 0.01 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:electronics_and_hardware:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0020 +Natural tokens : 13.58 B +Desired tokens : 95.05 B +Target ratio : 0.0160 +Repetition factor : 8 + +Subset Name : 
all-dressed:entertainment:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0020 +Natural tokens : 52.26 B +Desired tokens : 333.58 B +Target ratio : 0.0563 +Repetition factor : 7 + +Subset Name : all-dressed:finance_and_business:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0016 +Natural tokens : 62.36 B +Desired tokens : 1.80 B +Target ratio : 0.0003 +Repetition factor : 1 + +Subset Name : all-dressed:food_and_dining:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0018 +Natural tokens : 19.26 B +Desired tokens : 14.00 B +Target ratio : 0.0024 +Repetition factor : 1 + +Subset Name : all-dressed:health:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0016 +Natural tokens : 51.03 B +Desired tokens : 8.28 B +Target ratio : 0.0014 +Repetition factor : 1 + +Subset Name : all-dressed:home_and_hobbies:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0017 +Natural tokens : 56.59 B +Desired tokens : 1.08 B +Target ratio : 0.0002 +Repetition factor : 1 + +Subset Name : all-dressed:literature:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0018 +Natural tokens : 44.84 B +Desired tokens : 72.46 B +Target ratio : 0.0122 +Repetition factor : 2 + +Subset Name : all-dressed:politics:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0020 +Natural tokens : 
44.35 B +Desired tokens : 74.04 B +Target ratio : 0.0125 +Repetition factor : 2 + +Subset Name : all-dressed:social_life:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0012 +Natural tokens : 18.94 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:software_development:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0018 +Natural tokens : 16.23 B +Desired tokens : 114.28 B +Target ratio : 0.0193 +Repetition factor : 8 + +Subset Name : all-dressed:transportation:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0014 +Natural tokens : 13.13 B +Desired tokens : 0.01 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:travel_and_tourism:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0017 +Natural tokens : 25.21 B +Desired tokens : 0.39 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : finemath-3plus +Base URI : s3://ai2-llm/preprocessed/olmo3-final/math/allenai/dolma2-tokenizer/finemath_3plus_all +Natural tokens : 34.06 B +Desired tokens : 152.00 B +Target ratio : 0.0256 +Repetition factor : 5 + +Subset Name : stack-edu:CSharp +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/CSharp +Natural tokens : 7.20 B +Desired tokens : 25.13 B +Target ratio : 0.0042 +Repetition factor : 4 + +Subset Name : stack-edu:Swift +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Swift +Natural tokens : 1.51 B +Desired tokens : 5.80 B +Target ratio : 0.0010 +Repetition factor : 4 + +Subset Name : 
all-dressed:art_and_design:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0015 +Natural tokens : 12.28 B +Desired tokens : 0.03 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:education_and_jobs:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0011 +Natural tokens : 19.74 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:electronics_and_hardware:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0015 +Natural tokens : 14.39 B +Desired tokens : 1.21 B +Target ratio : 0.0002 +Repetition factor : 1 + +Subset Name : all-dressed:entertainment:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0018 +Natural tokens : 56.97 B +Desired tokens : 79.80 B +Target ratio : 0.0135 +Repetition factor : 2 + +Subset Name : all-dressed:finance_and_business:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0015 +Natural tokens : 60.68 B +Desired tokens : 0.34 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:games:vigintile_0012 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0012 +Natural tokens : 19.20 B +Desired tokens : 0.19 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:health:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0017 
+Natural tokens : 53.83 B +Desired tokens : 32.30 B +Target ratio : 0.0054 +Repetition factor : 1 + +Subset Name : all-dressed:industrial:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0011 +Natural tokens : 6.79 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:industrial:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0020 +Natural tokens : 8.38 B +Desired tokens : 53.40 B +Target ratio : 0.0090 +Repetition factor : 7 + +Subset Name : all-dressed:religion:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0015 +Natural tokens : 24.51 B +Desired tokens : 0.02 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:software_development:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0016 +Natural tokens : 14.85 B +Desired tokens : 69.78 B +Target ratio : 0.0118 +Repetition factor : 5 + +Subset Name : all-dressed:transportation:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0011 +Natural tokens : 10.61 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:travel_and_tourism:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0018 +Natural tokens : 25.14 B +Desired tokens : 2.12 B +Target ratio : 0.0004 +Repetition factor : 1 + +Subset Name : s2pdf:adult_content +Base URI : 
s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/adult_content +Natural tokens : 0.30 B +Desired tokens : 0.08 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : s2pdf:politics +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/politics +Natural tokens : 38.46 B +Desired tokens : 14.65 B +Target ratio : 0.0025 +Repetition factor : 1 + +Subset Name : stack-edu:Shell +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Shell +Natural tokens : 2.54 B +Desired tokens : 10.44 B +Target ratio : 0.0018 +Repetition factor : 5 + +Subset Name : all-dressed:art_and_design:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0017 +Natural tokens : 14.73 B +Desired tokens : 1.27 B +Target ratio : 0.0002 +Repetition factor : 1 + +Subset Name : all-dressed:education_and_jobs:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0015 +Natural tokens : 27.77 B +Desired tokens : 0.24 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:entertainment:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0015 +Natural tokens : 44.76 B +Desired tokens : 0.71 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:fashion_and_beauty:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0018 +Natural tokens : 20.64 B +Desired tokens : 0.10 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:games:vigintile_0017 +Base URI : 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0017 +Natural tokens : 26.81 B +Desired tokens : 28.28 B +Target ratio : 0.0048 +Repetition factor : 2 + +Subset Name : all-dressed:history_and_geography:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0015 +Natural tokens : 14.93 B +Desired tokens : 0.10 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:industrial:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0013 +Natural tokens : 7.49 B +Desired tokens : 0.01 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:politics:vigintile_0013 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0013 +Natural tokens : 42.27 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:science_math_and_technology:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0015 +Natural tokens : 35.84 B +Desired tokens : 67.66 B +Target ratio : 0.0114 +Repetition factor : 2 + +Subset Name : all-dressed:software:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0011 +Natural tokens : 11.70 B +Desired tokens : 0.14 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:software_development:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0017 +Natural tokens : 15.98 B +Desired tokens : 
92.20 B +Target ratio : 0.0155 +Repetition factor : 6 + +Subset Name : all-dressed:transportation:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0017 +Natural tokens : 16.49 B +Desired tokens : 1.43 B +Target ratio : 0.0002 +Repetition factor : 1 + +Subset Name : all-dressed:travel_and_tourism:vigintile_0020 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0020 +Natural tokens : 19.93 B +Desired tokens : 7.32 B +Target ratio : 0.0012 +Repetition factor : 1 + +Subset Name : s2pdf:entertainment +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/entertainment +Natural tokens : 5.86 B +Desired tokens : 1.76 B +Target ratio : 0.0003 +Repetition factor : 1 + +Subset Name : s2pdf:social_life +Base URI : s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/social_life +Natural tokens : 3.11 B +Desired tokens : 0.17 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : stack-edu:Java +Base URI : s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Java +Natural tokens : 31.35 B +Desired tokens : 65.31 B +Target ratio : 0.0110 +Repetition factor : 3 + +Subset Name : all-dressed:adult_content:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0015 +Natural tokens : 7.05 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:crime_and_law:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0016 +Natural tokens : 17.73 B +Desired tokens : 0.54 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:education_and_jobs:vigintile_0020 +Base URI : 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0020 +Natural tokens : 25.14 B +Desired tokens : 145.66 B +Target ratio : 0.0246 +Repetition factor : 6 + +Subset Name : all-dressed:entertainment:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0016 +Natural tokens : 49.56 B +Desired tokens : 3.20 B +Target ratio : 0.0005 +Repetition factor : 1 + +Subset Name : all-dressed:finance_and_business:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0011 +Natural tokens : 46.08 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 + +Subset Name : all-dressed:food_and_dining:vigintile_0016 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0016 +Natural tokens : 20.67 B +Desired tokens : 0.53 B +Target ratio : 0.0001 +Repetition factor : 1 + +Subset Name : all-dressed:health:vigintile_0015 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0015 +Natural tokens : 47.63 B +Desired tokens : 2.21 B +Target ratio : 0.0004 +Repetition factor : 1 + +Subset Name : all-dressed:home_and_hobbies:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0018 +Natural tokens : 54.82 B +Desired tokens : 7.41 B +Target ratio : 0.0012 +Repetition factor : 1 + +Subset Name : all-dressed:politics:vigintile_0011 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0011 +Natural tokens : 35.73 B +Desired tokens : 0.00 B +Target ratio : 
# NOTE(review): these physical lines also carried non-Python residue of the collapsed diff
# (the tail of ablation.log and the diff header for ablation.py). It is preserved below.
# 0.0000 +Repetition factor : 1 +
# +Subset Name : all-dressed:religion:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0014 +Natural tokens : 22.03 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 +
# +Subset Name : all-dressed:social_life:vigintile_0017 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0017 +Natural tokens : 31.04 B +Desired tokens : 0.49 B +Target ratio : 0.0001 +Repetition factor : 1 +
# +Subset Name : all-dressed:software_development:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0014 +Natural tokens : 12.69 B +Desired tokens : 36.86 B +Target ratio : 0.0062 +Repetition factor : 3 +
# +Subset Name : all-dressed:sports_and_fitness:vigintile_0018 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0018 +Natural tokens : 40.28 B +Desired tokens : 12.37 B +Target ratio : 0.0021 +Repetition factor : 1 +
# +Subset Name : all-dressed:travel_and_tourism:vigintile_0014 +Base URI : s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0014 +Natural tokens : 19.31 B +Desired tokens : 0.00 B +Target ratio : 0.0000 +Repetition factor : 1 +
# diff --git a/configs/dolma2-resharding/ablation.py b/configs/dolma2-resharding/ablation.py new file mode 100755 index 00000000..e1753729 --- /dev/null +++ b/configs/dolma2-resharding/ablation.py @@ -0,0 +1,208 @@
#!/usr/bin/env -S uv run
# /// script
# requires-python = ">=3.8"
# dependencies = [
#     "boto3",
#     "tqdm",
#     "pyyaml",
# ]
# ///
"""Build ``ablation.yaml``: a data-mix config listing every source with its target ratio.

For each weighted subset the script measures how many tokens exist under the subset's S3
prefix, derives how many times the subset must be repeated to reach its share of
``TOKEN_TARGET``, prints a short report, and finally writes all sources (sorted by name)
to ``ablation.yaml`` next to this file.
"""

import json
import math
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse

import boto3
import tqdm
import yaml

# URI template per domain; ``{topic}`` / ``{quality}`` are filled from the WeightConfig.
BASE_URLS = {
    "all-dressed": "s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/{topic}/{quality}",
    "s2pdf": "s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/{topic}",
    "stack-edu": "s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/{topic}",
    "finemath-3plus": "s3://ai2-llm/preprocessed/olmo3-final/math/allenai/dolma2-tokenizer/finemath_3plus_all",
    "wikipedia": "s3://ai2-llm/preprocessed/wikipedia-dolma-0823/allenai/dolma2-tokenizer",
    "arxiv": "s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated-0625_tokenized/arxiv/train/allenai/dolma2-tokenizer",
}


def get_size_of_prefix(prefix: str, ext: str = ".npy") -> int:
    """Return the total size in bytes of all objects under an S3 prefix ending in ``ext``.

    Args:
        prefix: ``s3://bucket/key/prefix`` URI to list.
        ext: Only keys ending with this suffix are counted.

    Returns:
        Sum of the ``Size`` fields of the matching objects (0 if nothing matches).
    """
    parsed = urlparse(prefix)
    bucket, key_prefix = parsed.netloc, parsed.path.lstrip("/")
    s3 = boto3.client("s3")

    # The paginator follows continuation tokens itself, so a truncated page can never
    # cause the first page to be requested again.
    total_size = 0
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=key_prefix):
        for obj in page.get("Contents", []):
            key = obj.get("Key")
            if key is None or not key.endswith(ext):
                continue
            if "Size" not in obj:
                continue
            total_size += int(obj["Size"])

    return total_size


@dataclass(frozen=True)
class WeightConfig:
    """One weighted subset of the mix, identified as ``domain[:topic[:quality]]``."""

    domain: str
    # Optional[...] rather than ``str | None``: dataclass annotations are evaluated when the
    # class is created, and the script header allows Python 3.8 where ``|`` on types fails.
    topic: Optional[str]
    quality: Optional[str]
    weight: float

    @classmethod
    def from_dict(cls, d: dict) -> "WeightConfig":
        """Build a config from ``{"domain": "domain[:topic[:quality]]", "weight": w}``.

        Raises:
            ValueError: If the domain string has more than three ``:``-separated parts.
        """
        parts = d["domain"].split(":")
        if len(parts) > 3:
            raise ValueError(f"Invalid domain: {d['domain']}")
        # Pad the missing trailing components with None.
        domain, topic, quality = parts + [None] * (3 - len(parts))
        return cls(domain=domain, topic=topic, quality=quality, weight=d["weight"])

    @property
    def uri(self) -> str:
        """S3 base URI of this subset, from the domain's template in ``BASE_URLS``.

        Raises:
            ValueError: If the domain is unknown, or its template needs a topic/quality
                that this config does not carry.
        """
        try:
            template = BASE_URLS[self.domain]
        except KeyError as exc:
            raise ValueError(f"Invalid domain: {self.domain}") from exc

        fmt = {}
        if self.topic is not None:
            fmt["topic"] = self.topic
        if self.quality is not None:
            fmt["quality"] = self.quality

        try:
            return template.format(**fmt)
        except KeyError as exc:
            # The domain is valid; it is the placeholder value that is missing.
            raise ValueError(
                f"Domain {self.domain!r} requires a value for {exc.args[0]!r} (got {self.name!r})"
            ) from exc

    @property
    def name(self) -> str:
        """``domain:topic:quality`` with leading/trailing ``:`` stripped."""
        name = f"{self.domain}:{self.topic or ''}:{self.quality or ''}"
        return name.strip(":")


TOKEN_TARGET = 5_929_970_906_676
# TOKEN_TARGET = 6_000_000_000_000


cross_source_pstar = {
    "finemath-3plus": 0.025340376929054265,
    "arxiv": 0.008284928006565282,
    "wikipedia": 0.000416156026289699,
    "all-dressed": 0.752076181317783,
    "s2pdf": 0.134320140174659,
    "stack-edu": 0.06816936506111054,
}


def make_one_config(weight_config: WeightConfig, token_target: int) -> dict:
    """Measure one subset on S3 and build its source entry for the mix config.

    Args:
        weight_config: Subset to describe.
        token_target: Total number of tokens in the final mix.

    Returns:
        Dict with ``name``, ``target_ratio``, ``repetition_factor`` and ``paths``.

    Raises:
        ValueError: If tokens are requested from a prefix that holds none.
    """
    base_uri = weight_config.uri
    actual_size = get_size_of_prefix(base_uri.rstrip("/") + "/") // 4  # 4 bytes per token
    desired_size = token_target * weight_config.weight

    if actual_size == 0:
        # Avoid an opaque ZeroDivisionError for an empty or mistyped prefix.
        if desired_size > 0:
            raise ValueError(
                f"No tokens found under {base_uri}; cannot provide {desired_size:.0f} desired tokens"
            )
        sample_rate = 0
    else:
        sample_rate = math.ceil(desired_size / actual_size)

    # NOTE(review): label text is reproduced as it appears in the whitespace-collapsed
    # source; any column alignment inside these literals could not be recovered — confirm.
    msg = (
        f"Subset Name : {weight_config.name}\n"
        f"Base URI : {base_uri}\n"
        f"Natural tokens : {actual_size / 1000 ** 3:6.2f} B\n"
        f"Desired tokens : {desired_size / 1000 ** 3:6.2f} B\n"
        f"Target ratio : {weight_config.weight:6.4f}\n"
        f"Repetition factor : {sample_rate}\n"
    )
    print(msg)
    return {
        "name": weight_config.name,
        "target_ratio": weight_config.weight,
        "repetition_factor": sample_rate,
        "paths": [base_uri.rstrip("/") + "/*.npy"],
    }


def main():
    """Load the weights, size every subset in parallel and write ``ablation.yaml``."""
    config_dir = Path(__file__).parent
    weights = []

    # load everything from here except pes2o (not used) and dclm (we will load from snazzy2)
    raw_weights_path = config_dir / "s2pdf/full_pstar_7rep_dclm_stackedu_conditional.json"
    with open(raw_weights_path, "r") as f:
        other_weights = [WeightConfig.from_dict(w) for w in json.load(f)]
    weights.extend([w for w in other_weights if w.domain in cross_source_pstar])

    # snazzy2 keys are "topic/quality"; its values are scaled by the all-dressed share.
    snazzy2_weights_path = config_dir / "all-dressed/vigintiles/snazzy2.json"
    with open(snazzy2_weights_path, "r") as f:
        for key, value in json.load(f).items():
            topic_quality = key.split("/")
            weights.append(
                WeightConfig(
                    domain="all-dressed",
                    topic=topic_quality[0],
                    quality=topic_quality[1],
                    weight=value * cross_source_pstar["all-dressed"],
                )
            )

    # gotta round up to 1 the weights (the ``0 <`` guard avoids dividing by a zero sum)
    sum_weights = sum(w.weight for w in weights)
    if 0 < sum_weights < 1:
        print(f"Sum of weights is {sum_weights:.4f}, rounding up to 1")
        weights = [
            WeightConfig(domain=w.domain, topic=w.topic, quality=w.quality, weight=w.weight / sum_weights)
            for w in weights
        ]

    with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        # with ThreadPoolExecutor(max_workers=1) as executor:
        # Remember which config each future belongs to, so a failure is reported against
        # the subset that actually failed rather than the last one submitted.
        future_to_config = {
            executor.submit(make_one_config, weight_config=weight_config, token_target=TOKEN_TARGET): weight_config
            for weight_config in weights
        }

        sources = []
        for future in as_completed(future_to_config):
            try:
                sources.append(future.result())
            except Exception as e:
                print(f"Error making config for {future_to_config[future].name}: {e}")
                for pending in future_to_config:
                    pending.cancel()
                raise

    # remove sources with target ratio = 0
    sources = [source for source in sources if source["target_ratio"] > 0]

    # sort sources by name
    sources = sorted(sources, key=lambda x: x["name"])

    # make and write config
    config = {"dataset": {"sources": sources}}
    with open(config_dir / "ablation.yaml", "w") as f:
        yaml.dump(config, f)


if __name__ == "__main__":
    main()
# diff --git
a/configs/dolma2-resharding/ablation.yaml b/configs/dolma2-resharding/ablation.yaml new file mode 100644 index 00000000..111ca598 --- /dev/null +++ b/configs/dolma2-resharding/ablation.yaml @@ -0,0 +1,1212 @@ +dataset: + sources: + - name: all-dressed:adult_content:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 4.888839929381333e-09 + - name: all-dressed:adult_content:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 3.8301684076646866e-08 + - name: all-dressed:adult_content:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 3.2633541449838395e-07 + - name: all-dressed:adult_content:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 4.2238103056761175e-06 + - name: all-dressed:adult_content:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 7.183538475035353e-05 + - name: all-dressed:adult_content:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0018/*.npy + repetition_factor: 1 + target_ratio: 0.0006887648064833394 + - name: all-dressed:adult_content:vigintile_0020 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/adult_content/vigintile_0020/*.npy + repetition_factor: 2 + target_ratio: 0.00010518633531009835 + - name: all-dressed:art_and_design:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 2.7338444340700146e-08 + - name: all-dressed:art_and_design:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 1.5752358277003603e-07 + - name: all-dressed:art_and_design:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 8.544745357955255e-07 + - name: all-dressed:art_and_design:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 4.789544027301923e-06 + - name: all-dressed:art_and_design:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 3.0126198178097416e-05 + - name: all-dressed:art_and_design:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.00021432697874750596 + - name: all-dressed:art_and_design:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0018/*.npy + 
repetition_factor: 1 + target_ratio: 0.0017738840082224846 + - name: all-dressed:art_and_design:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/art_and_design/vigintile_0020/*.npy + repetition_factor: 4 + target_ratio: 0.007482897953957075 + - name: all-dressed:crime_and_law:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0011/*.npy + repetition_factor: 1 + target_ratio: 3.1843518796017836e-08 + - name: all-dressed:crime_and_law:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 1.6240528550191057e-07 + - name: all-dressed:crime_and_law:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 7.186002308764538e-07 + - name: all-dressed:crime_and_law:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 3.2699623043109103e-06 + - name: all-dressed:crime_and_law:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 1.622023563703666e-05 + - name: all-dressed:crime_and_law:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 9.186450471671149e-05 + - name: all-dressed:crime_and_law:vigintile_0017 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.0005811430580008507 + - name: all-dressed:crime_and_law:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0018/*.npy + repetition_factor: 2 + target_ratio: 0.0039993215237273255 + - name: all-dressed:crime_and_law:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/crime_and_law/vigintile_0020/*.npy + repetition_factor: 7 + target_ratio: 0.017547466376744446 + - name: all-dressed:education_and_jobs:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0011/*.npy + repetition_factor: 1 + target_ratio: 7.190331913413103e-08 + - name: all-dressed:education_and_jobs:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 3.6802198414206233e-07 + - name: all-dressed:education_and_jobs:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 1.6717823868102729e-06 + - name: all-dressed:education_and_jobs:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 7.827556652622969e-06 + - name: all-dressed:education_and_jobs:vigintile_0015 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 4.043419163991613e-05 + - name: all-dressed:education_and_jobs:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 0.00021681288030676236 + - name: all-dressed:education_and_jobs:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.0011872890309069996 + - name: all-dressed:education_and_jobs:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0018/*.npy + repetition_factor: 2 + target_ratio: 0.00660080301434597 + - name: all-dressed:education_and_jobs:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/education_and_jobs/vigintile_0020/*.npy + repetition_factor: 6 + target_ratio: 0.024562949301373947 + - name: all-dressed:electronics_and_hardware:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0011/*.npy + repetition_factor: 1 + target_ratio: 2.3601137945874096e-06 + - name: all-dressed:electronics_and_hardware:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 8.12119291511839e-06 + - name: all-dressed:electronics_and_hardware:vigintile_0013 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 2.2948165390448923e-05 + - name: all-dressed:electronics_and_hardware:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 6.6918565885042e-05 + - name: all-dressed:electronics_and_hardware:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 0.00020470337336571178 + - name: all-dressed:electronics_and_hardware:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 0.0006772920048798669 + - name: all-dressed:electronics_and_hardware:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.0022221442368570254 + - name: all-dressed:electronics_and_hardware:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0018/*.npy + repetition_factor: 3 + target_ratio: 0.007031127113728502 + - name: all-dressed:electronics_and_hardware:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/electronics_and_hardware/vigintile_0020/*.npy + repetition_factor: 8 + target_ratio: 0.016028371484787234 + - name: all-dressed:entertainment:vigintile_0012 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 1.6842087834446143e-06 + - name: all-dressed:entertainment:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 7.599518963122566e-06 + - name: all-dressed:entertainment:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 2.9506361314362786e-05 + - name: all-dressed:entertainment:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 0.0001201768017185984 + - name: all-dressed:entertainment:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 0.0005403263934865682 + - name: all-dressed:entertainment:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.0026096107870991537 + - name: all-dressed:entertainment:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0018/*.npy + repetition_factor: 2 + target_ratio: 0.013456776776588273 + - name: all-dressed:entertainment:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/entertainment/vigintile_0020/*.npy + 
repetition_factor: 7 + target_ratio: 0.05625406846559406 + - name: all-dressed:fashion_and_beauty:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 2.785031307544406e-10 + - name: all-dressed:fashion_and_beauty:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 1.5389339982501387e-09 + - name: all-dressed:fashion_and_beauty:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 7.433302054150595e-09 + - name: all-dressed:fashion_and_beauty:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 4.091445795912251e-08 + - name: all-dressed:fashion_and_beauty:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 2.828985342715727e-07 + - name: all-dressed:fashion_and_beauty:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 2.524458708644147e-06 + - name: all-dressed:fashion_and_beauty:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0018/*.npy + repetition_factor: 1 + target_ratio: 1.7553352575144054e-05 + - name: 
all-dressed:fashion_and_beauty:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/fashion_and_beauty/vigintile_0020/*.npy + repetition_factor: 1 + target_ratio: 7.645962766034606e-05 + - name: all-dressed:finance_and_business:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0011/*.npy + repetition_factor: 1 + target_ratio: 7.793007389932867e-08 + - name: all-dressed:finance_and_business:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 4.407793505683075e-07 + - name: all-dressed:finance_and_business:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 2.263288237085406e-06 + - name: all-dressed:finance_and_business:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 1.1155486757002212e-05 + - name: all-dressed:finance_and_business:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 5.7962090491908636e-05 + - name: all-dressed:finance_and_business:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 0.00030338069947890466 + - name: all-dressed:finance_and_business:vigintile_0017 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.001487985472882501 + - name: all-dressed:finance_and_business:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0018/*.npy + repetition_factor: 1 + target_ratio: 0.006600242312734452 + - name: all-dressed:finance_and_business:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/finance_and_business/vigintile_0020/*.npy + repetition_factor: 3 + target_ratio: 0.022476124807886654 + - name: all-dressed:food_and_dining:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0011/*.npy + repetition_factor: 1 + target_ratio: 2.413497438902672e-08 + - name: all-dressed:food_and_dining:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 1.2535431372720965e-07 + - name: all-dressed:food_and_dining:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 5.829967440155756e-07 + - name: all-dressed:food_and_dining:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 2.8631759064815014e-06 + - name: all-dressed:food_and_dining:vigintile_0015 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 1.5744036308134813e-05 + - name: all-dressed:food_and_dining:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 8.995655348331771e-05 + - name: all-dressed:food_and_dining:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.0005098762206111881 + - name: all-dressed:food_and_dining:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0018/*.npy + repetition_factor: 1 + target_ratio: 0.0023602414178112217 + - name: all-dressed:food_and_dining:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/food_and_dining/vigintile_0020/*.npy + repetition_factor: 3 + target_ratio: 0.0077365360946123 + - name: all-dressed:games:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 3.132299499403478e-05 + - name: all-dressed:games:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 0.00010259516564574387 + - name: all-dressed:games:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 
0.000272038106309843 + - name: all-dressed:games:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 0.0006914857086765009 + - name: all-dressed:games:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 0.0017686297285715818 + - name: all-dressed:games:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0017/*.npy + repetition_factor: 2 + target_ratio: 0.004769341807850287 + - name: all-dressed:games:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0018/*.npy + repetition_factor: 3 + target_ratio: 0.013850029723686728 + - name: all-dressed:games:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/games/vigintile_0020/*.npy + repetition_factor: 8 + target_ratio: 0.030939537656638916 + - name: all-dressed:health:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0011/*.npy + repetition_factor: 1 + target_ratio: 2.22661357759855e-06 + - name: all-dressed:health:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 8.917280330579426e-06 + - name: all-dressed:health:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 3.0476749411771047e-05 
+ - name: all-dressed:health:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 0.00010640822039670898 + - name: all-dressed:health:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 0.00037241309229369085 + - name: all-dressed:health:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 0.0013964930748848329 + - name: all-dressed:health:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.005446395769128015 + - name: all-dressed:health:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0018/*.npy + repetition_factor: 3 + target_ratio: 0.020860419027065622 + - name: all-dressed:health:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/health/vigintile_0020/*.npy + repetition_factor: 7 + target_ratio: 0.047410735510919094 + - name: all-dressed:history_and_geography:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 5.931708027603779e-07 + - name: all-dressed:history_and_geography:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0014/*.npy + 
repetition_factor: 1 + target_ratio: 3.328393927100208e-06 + - name: all-dressed:history_and_geography:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 1.755010000800208e-05 + - name: all-dressed:history_and_geography:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 0.00010260692269515485 + - name: all-dressed:history_and_geography:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.0007246250612928564 + - name: all-dressed:history_and_geography:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0018/*.npy + repetition_factor: 2 + target_ratio: 0.005032289677021166 + - name: all-dressed:history_and_geography:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/history_and_geography/vigintile_0020/*.npy + repetition_factor: 7 + target_ratio: 0.014734049685859936 + - name: all-dressed:home_and_hobbies:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 2.32342685927812e-08 + - name: all-dressed:home_and_hobbies:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 1.2699833027312767e-07 + - 
name: all-dressed:home_and_hobbies:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 6.253146258919168e-07 + - name: all-dressed:home_and_hobbies:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 3.444076499192907e-06 + - name: all-dressed:home_and_hobbies:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 2.300523921936357e-05 + - name: all-dressed:home_and_hobbies:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.00018170275142159205 + - name: all-dressed:home_and_hobbies:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0018/*.npy + repetition_factor: 1 + target_ratio: 0.0012489999951922233 + - name: all-dressed:home_and_hobbies:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/home_and_hobbies/vigintile_0020/*.npy + repetition_factor: 1 + target_ratio: 0.0055610452446265635 + - name: all-dressed:industrial:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0011/*.npy + repetition_factor: 1 + target_ratio: 4.241776954213832e-08 + - name: all-dressed:industrial:vigintile_0012 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 2.099379407983175e-07 + - name: all-dressed:industrial:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 8.820859319761287e-07 + - name: all-dressed:industrial:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 3.844879183700919e-06 + - name: all-dressed:industrial:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 1.778667991668441e-05 + - name: all-dressed:industrial:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 8.9601128783755e-05 + - name: all-dressed:industrial:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.0004784750791783453 + - name: all-dressed:industrial:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0018/*.npy + repetition_factor: 2 + target_ratio: 0.002382559589853123 + - name: all-dressed:industrial:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/industrial/vigintile_0020/*.npy + repetition_factor: 7 + target_ratio: 0.009005084668450607 + - 
name: all-dressed:literature:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 8.326296490784335e-06 + - name: all-dressed:literature:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 3.276405956771744e-05 + - name: all-dressed:literature:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 0.00011345845687141338 + - name: all-dressed:literature:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 0.00042381017046076144 + - name: all-dressed:literature:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.0020243606298684775 + - name: all-dressed:literature:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0018/*.npy + repetition_factor: 2 + target_ratio: 0.01221999609757044 + - name: all-dressed:literature:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/literature/vigintile_0020/*.npy + repetition_factor: 7 + target_ratio: 0.037098992273756556 + - name: all-dressed:politics:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0011/*.npy + 
repetition_factor: 1 + target_ratio: 2.3921881789641743e-08 + - name: all-dressed:politics:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 1.3120319068176834e-07 + - name: all-dressed:politics:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 6.259754990176311e-07 + - name: all-dressed:politics:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 3.160156359000192e-06 + - name: all-dressed:politics:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 1.7005120087388388e-05 + - name: all-dressed:politics:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 9.49365445631273e-05 + - name: all-dressed:politics:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.0005738849599741415 + - name: all-dressed:politics:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0018/*.npy + repetition_factor: 1 + target_ratio: 0.003335069880738106 + - name: all-dressed:politics:vigintile_0020 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/politics/vigintile_0020/*.npy + repetition_factor: 2 + target_ratio: 0.012486076496178722 + - name: all-dressed:religion:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 7.830740322575062e-08 + - name: all-dressed:religion:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 5.920201221495388e-07 + - name: all-dressed:religion:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 4.138011329254504e-06 + - name: all-dressed:religion:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 3.192726020817335e-05 + - name: all-dressed:religion:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.0002848061306883643 + - name: all-dressed:religion:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0018/*.npy + repetition_factor: 1 + target_ratio: 0.00271857064196512 + - name: all-dressed:religion:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/religion/vigintile_0020/*.npy + repetition_factor: 4 + target_ratio: 0.011996132020233329 + - name: 
all-dressed:science_math_and_technology:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 0.001840657391646038 + - name: all-dressed:science_math_and_technology:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 0.00403489495332744 + - name: all-dressed:science_math_and_technology:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0014/*.npy + repetition_factor: 2 + target_ratio: 0.006927785817951225 + - name: all-dressed:science_math_and_technology:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0015/*.npy + repetition_factor: 2 + target_ratio: 0.011409304595613911 + - name: all-dressed:science_math_and_technology:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0016/*.npy + repetition_factor: 3 + target_ratio: 0.01889999948798935 + - name: all-dressed:science_math_and_technology:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0017/*.npy + repetition_factor: 5 + target_ratio: 0.03137604129783345 + - name: all-dressed:science_math_and_technology:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0018/*.npy + repetition_factor: 7 + target_ratio: 
0.04575976322109039 + - name: all-dressed:science_math_and_technology:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/science_math_and_technology/vigintile_0020/*.npy + repetition_factor: 9 + target_ratio: 0.039700939950123366 + - name: all-dressed:social_life:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 1.008833225274497e-08 + - name: all-dressed:social_life:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 5.675053715042752e-08 + - name: all-dressed:social_life:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 2.933706058412839e-07 + - name: all-dressed:social_life:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 1.6363855076583667e-06 + - name: all-dressed:social_life:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 1.0967468851145498e-05 + - name: all-dressed:social_life:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 8.20226443725068e-05 + - name: all-dressed:social_life:vigintile_0018 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0018/*.npy + repetition_factor: 1 + target_ratio: 0.0005977415498580481 + - name: all-dressed:social_life:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/social_life/vigintile_0020/*.npy + repetition_factor: 1 + target_ratio: 0.0028418989191311664 + - name: all-dressed:software:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0011/*.npy + repetition_factor: 1 + target_ratio: 2.2839745846807322e-05 + - name: all-dressed:software:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 6.830269326270995e-05 + - name: all-dressed:software:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 0.00016445483035412023 + - name: all-dressed:software:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 0.0004089096381021688 + - name: all-dressed:software:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 0.0009630819714032733 + - name: all-dressed:software:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 0.0022723537856840868 + - name: 
all-dressed:software:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0017/*.npy + repetition_factor: 2 + target_ratio: 0.00507494466013384 + - name: all-dressed:software:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0018/*.npy + repetition_factor: 4 + target_ratio: 0.010790717266264151 + - name: all-dressed:software:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software/vigintile_0020/*.npy + repetition_factor: 8 + target_ratio: 0.016247208751460924 + - name: all-dressed:software_development:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0012/*.npy + repetition_factor: 2 + target_ratio: 0.00211152363562115 + - name: all-dressed:software_development:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0013/*.npy + repetition_factor: 3 + target_ratio: 0.004204587995968386 + - name: all-dressed:software_development:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0014/*.npy + repetition_factor: 3 + target_ratio: 0.006215235231054335 + - name: all-dressed:software_development:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0015/*.npy + repetition_factor: 4 + target_ratio: 0.00865243002881247 + - name: all-dressed:software_development:vigintile_0016 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0016/*.npy + repetition_factor: 5 + target_ratio: 0.011767588484419219 + - name: all-dressed:software_development:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0017/*.npy + repetition_factor: 6 + target_ratio: 0.015548116255494376 + - name: all-dressed:software_development:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0018/*.npy + repetition_factor: 8 + target_ratio: 0.019272171199262066 + - name: all-dressed:software_development:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/software_development/vigintile_0020/*.npy + repetition_factor: 9 + target_ratio: 0.01699143990064301 + - name: all-dressed:sports_and_fitness:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0011/*.npy + repetition_factor: 1 + target_ratio: 2.180769628965081e-08 + - name: all-dressed:sports_and_fitness:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 1.2288280963650494e-07 + - name: all-dressed:sports_and_fitness:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 5.843831365114651e-07 + - name: all-dressed:sports_and_fitness:vigintile_0014 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 2.8433144587316595e-06 + - name: all-dressed:sports_and_fitness:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 1.4570347677272436e-05 + - name: all-dressed:sports_and_fitness:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 7.7685069302949e-05 + - name: all-dressed:sports_and_fitness:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.00041362770350551466 + - name: all-dressed:sports_and_fitness:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0018/*.npy + repetition_factor: 1 + target_ratio: 0.00208665184724219 + - name: all-dressed:sports_and_fitness:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/sports_and_fitness/vigintile_0020/*.npy + repetition_factor: 2 + target_ratio: 0.007348434929935755 + - name: all-dressed:transportation:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0011/*.npy + repetition_factor: 1 + target_ratio: 1.6072073396166637e-08 + - name: all-dressed:transportation:vigintile_0012 + paths: + - 
s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 8.056169810448865e-08 + - name: all-dressed:transportation:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 3.4838157442686463e-07 + - name: all-dressed:transportation:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 1.5552384914936164e-06 + - name: all-dressed:transportation:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 7.634317117830071e-06 + - name: all-dressed:transportation:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 4.131300247531798e-05 + - name: all-dressed:transportation:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 0.00024046218774652196 + - name: all-dressed:transportation:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0018/*.npy + repetition_factor: 1 + target_ratio: 0.0013900991604417362 + - name: all-dressed:transportation:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/transportation/vigintile_0020/*.npy + 
repetition_factor: 3 + target_ratio: 0.005772716742589373 + - name: all-dressed:travel_and_tourism:vigintile_0010 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0010/*.npy + repetition_factor: 1 + target_ratio: 1.7898672063057278e-09 + - name: all-dressed:travel_and_tourism:vigintile_0011 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0011/*.npy + repetition_factor: 1 + target_ratio: 7.977095884988518e-09 + - name: all-dressed:travel_and_tourism:vigintile_0012 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0012/*.npy + repetition_factor: 1 + target_ratio: 3.064594823726175e-08 + - name: all-dressed:travel_and_tourism:vigintile_0013 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0013/*.npy + repetition_factor: 1 + target_ratio: 1.1866832367487384e-07 + - name: all-dressed:travel_and_tourism:vigintile_0014 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0014/*.npy + repetition_factor: 1 + target_ratio: 4.851389710979032e-07 + - name: all-dressed:travel_and_tourism:vigintile_0015 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0015/*.npy + repetition_factor: 1 + target_ratio: 2.2720423588267026e-06 + - name: all-dressed:travel_and_tourism:vigintile_0016 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0016/*.npy + repetition_factor: 1 + target_ratio: 1.144348956981328e-05 + - name: 
all-dressed:travel_and_tourism:vigintile_0017 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0017/*.npy + repetition_factor: 1 + target_ratio: 6.5840416584949e-05 + - name: all-dressed:travel_and_tourism:vigintile_0018 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0018/*.npy + repetition_factor: 1 + target_ratio: 0.0003570687130693643 + - name: all-dressed:travel_and_tourism:vigintile_0020 + paths: + - s3://ai2-llm/preprocessed/cc_all_dressed/all_dressed_v3/dclm_plus2_vigilantes/allenai/dolma2-tokenizer/travel_and_tourism/vigintile_0020/*.npy + repetition_factor: 1 + target_ratio: 0.0012343655701362514 + - name: arxiv + paths: + - s3://ai2-llm/preprocessed/proof-pile-2/v0_decontaminated-0625_tokenized/arxiv/train/allenai/dolma2-tokenizer/*.npy + repetition_factor: 3 + target_ratio: 0.008380404721315968 + - name: finemath-3plus + paths: + - s3://ai2-llm/preprocessed/olmo3-final/math/allenai/dolma2-tokenizer/finemath_3plus_all/*.npy + repetition_factor: 5 + target_ratio: 0.02563240311658576 + - name: s2pdf:adult_content + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/adult_content/*.npy + repetition_factor: 1 + target_ratio: 1.3656629559769624e-05 + - name: s2pdf:art_and_design + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/art_and_design/*.npy + repetition_factor: 1 + target_ratio: 0.00012905849230145726 + - name: s2pdf:crime_and_law + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/crime_and_law/*.npy + repetition_factor: 1 + target_ratio: 0.002085553656353873 + - name: s2pdf:education_and_jobs + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/education_and_jobs/*.npy + repetition_factor: 1 + target_ratio: 0.007061668460823319 + - name: 
s2pdf:electronics_and_hardware + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/electronics_and_hardware/*.npy + repetition_factor: 1 + target_ratio: 0.000731433680997109 + - name: s2pdf:entertainment + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/entertainment/*.npy + repetition_factor: 1 + target_ratio: 0.00029630860599163123 + - name: s2pdf:finance_and_business + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/finance_and_business/*.npy + repetition_factor: 1 + target_ratio: 0.0018648174819328963 + - name: s2pdf:food_and_dining + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/food_and_dining/*.npy + repetition_factor: 1 + target_ratio: 8.727144539129813e-06 + - name: s2pdf:games + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/games/*.npy + repetition_factor: 1 + target_ratio: 3.200622017333524e-05 + - name: s2pdf:health + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/health/*.npy + repetition_factor: 1 + target_ratio: 0.016095671450555054 + - name: s2pdf:history_and_geography + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/history_and_geography/*.npy + repetition_factor: 1 + target_ratio: 0.0006773186703135859 + - name: s2pdf:home_and_hobbies + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/home_and_hobbies/*.npy + repetition_factor: 1 + target_ratio: 0.00011111602187736472 + - name: s2pdf:industrial + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/industrial/*.npy + repetition_factor: 2 + target_ratio: 0.007229448538187444 + - name: s2pdf:literature + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/literature/*.npy + repetition_factor: 1 + target_ratio: 0.001226137550015088 + - name: s2pdf:politics + paths: + - 
s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/politics/*.npy + repetition_factor: 1 + target_ratio: 0.002470961950416769 + - name: s2pdf:religion + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/religion/*.npy + repetition_factor: 1 + target_ratio: 0.0006838781239016469 + - name: s2pdf:science_math_and_technology + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/science_math_and_technology/*.npy + repetition_factor: 2 + target_ratio: 0.08093020521453764 + - name: s2pdf:social_life + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/social_life/*.npy + repetition_factor: 1 + target_ratio: 2.9149146621417673e-05 + - name: s2pdf:software + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/software/*.npy + repetition_factor: 2 + target_ratio: 0.0016390539494377276 + - name: s2pdf:software_development + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/software_development/*.npy + repetition_factor: 2 + target_ratio: 0.0117499427779504 + - name: s2pdf:sports_and_fitness + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/sports_and_fitness/*.npy + repetition_factor: 1 + target_ratio: 0.00048648419991604544 + - name: s2pdf:transportation + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/transportation/*.npy + repetition_factor: 1 + target_ratio: 0.00030174140097848746 + - name: s2pdf:travel_and_tourism + paths: + - s3://ai2-llm/preprocessed/olmo3-final/s2pdfs/allenai/dolma2-tokenizer/travel_and_tourism/*.npy + repetition_factor: 1 + target_ratio: 1.3725629142896785e-05 + - name: stack-edu:C + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/C/*.npy + repetition_factor: 4 + target_ratio: 0.002795807312960203 + - name: stack-edu:CSharp + paths: + - 
s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/CSharp/*.npy + repetition_factor: 4 + target_ratio: 0.004237706219785314 + - name: stack-edu:Cpp + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Cpp/*.npy + repetition_factor: 4 + target_ratio: 0.008269371013923662 + - name: stack-edu:Go + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Go/*.npy + repetition_factor: 4 + target_ratio: 0.0009061832150359205 + - name: stack-edu:Java + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Java/*.npy + repetition_factor: 3 + target_ratio: 0.011013308721776234 + - name: stack-edu:JavaScript + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/JavaScript/*.npy + repetition_factor: 5 + target_ratio: 0.006007351797701659 + - name: stack-edu:Markdown + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Markdown/*.npy + repetition_factor: 3 + target_ratio: 0.01147515525991462 + - name: stack-edu:PHP + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/PHP/*.npy + repetition_factor: 4 + target_ratio: 0.004184271880663751 + - name: stack-edu:Python + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Python/*.npy + repetition_factor: 5 + target_ratio: 0.012613504498479729 + - name: stack-edu:Ruby + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Ruby/*.npy + repetition_factor: 4 + target_ratio: 0.0009059591750292334 + - name: stack-edu:Rust + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Rust/*.npy + repetition_factor: 5 + target_ratio: 0.0009669958455076481 + - name: stack-edu:SQL + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/SQL/*.npy + repetition_factor: 2 + target_ratio: 0.0012576977261663565 + - name: stack-edu:Shell + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Shell/*.npy + repetition_factor: 5 + 
target_ratio: 0.0017613321092862328 + - name: stack-edu:Swift + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/Swift/*.npy + repetition_factor: 4 + target_ratio: 0.0009777644855408437 + - name: stack-edu:TypeScript + paths: + - s3://ai2-llm/preprocessed/stack-edu/allenai/dolma2-tokenizer/TypeScript/*.npy + repetition_factor: 4 + target_ratio: 0.001582549472273524 + - name: wikipedia + paths: + - s3://ai2-llm/preprocessed/wikipedia-dolma-0823/allenai/dolma2-tokenizer/*.npy + repetition_factor: 1 + target_ratio: 0.00042095186883442045