class DataMixBase(StrEnum):
    """
    Common parent for enumerations whose members name a data mix.

    Concrete enumerations list their mixes as members and provide
    :meth:`build` to turn a member into the data files it refers to.
    """

    @abstractmethod
    def build(self, base_dir: str, tokenizer: str) -> Tuple[List[str], List[str]]:
        """
        Resolve this mix into its data files.

        :param base_dir: Root location the mix is stored under, e.g.
            ``"s3://ai2-llm"`` or ``"/weka/oe-training-default/ai2-llm"``.
        :param tokenizer: The tokenizer identifier.

        :returns: A ``(paths, labels)`` pair: the paths/URLs of the tokenized
            numpy data files in the mix, and a list of corresponding labels.
        :raises NotImplementedError: Always; this base implementation is a
            placeholder.
        """
        raise NotImplementedError
class DataMix(DataMixBase):
    """
    An enumeration of data mix names.

    Every member's value is the name of a mix. A few alternative names are
    also accepted on lookup (see :meth:`_missing_`), and :meth:`build` expands
    a member into the list of data files described by the mix's manifest.
    """

    # Pretraining mixes
    OLMoE_mix_0824 = "OLMoE-mix-0824"
    dolma17 = "dolma17"
    OLMo_mix_0625 = "OLMo-mix-0625"
    OLMo_mix_0625_150Bsample = "OLMo-mix-0625-150Bsample"
    OLMo_mix_0625_700Bsample = "OLMo-mix-0625-700Bsample"
    OLMo_mix_0625_official = "OLMo-mix-0625-official"
    OLMo_mix_0925 = "OLMo-mix-0925"
    OLMo_mix_0925_official = "OLMo-mix-0925-official"

    # Midtraining mixes
    OLMo_midtraining_mix_0625_100B = "OLMo-midtraining-mix-0625-100B"
    OLMo_midtraining_mix_0925_ingredient1_100B = "OLMo-midtraining-mix-0925-ingredient1-100B"
    OLMo_midtraining_mix_0925_ingredient2_100B = "OLMo-midtraining-mix-0925-ingredient2-100B"

    # Long-context extension mixes
    OLMo_longmino_mix_0625 = "OLMo-longmino-mix-0625"
    OLMo_longmino_mix_0925 = "OLMo-longmino-mix-0925"

    # Validation mixes
    v3_small_ppl_validation = "v3-small-ppl-validation"
    code_fresh_ppl_validation = "code-fresh-ppl-validation"

    @classmethod
    def _missing_(cls, value: object) -> "DataMix | None":
        """
        Handle alias lookups.

        Called by the enum machinery when ``value`` matches no member value.

        :param value: The value that failed the regular lookup.

        :returns: The member an alias points to, or ``None`` when ``value`` is
            not a known alias.
        """
        # Kept local to the method: a plain dict assigned in the class body
        # would be picked up as an enum member.
        aliases = {
            "dolma3-0625-6T-mix": "OLMo-mix-0625",
            "dolma3-0925-6T-mix": "OLMo-mix-0925",
            # NOTE(review): this "0925" alias resolves to a "0625" sample --
            # confirm that is intended.
            "dolma3-0925-150B-mix": "OLMo-mix-0625-150Bsample",
        }

        if not isinstance(value, str) or value not in aliases:
            return None

        real_value = aliases[value]
        return next((member for member in cls if member.value == real_value), None)

    def build(self, base_dir: str, tokenizer: str) -> Tuple[List[str], List[str]]:
        """
        Construct the data mix from its manifest.

        The manifest is obtained through ``_get_data_mix_path(self)``. Blank
        lines and lines starting with ``#`` are skipped; every other line must
        be ``label,path`` where ``path`` contains the ``{TOKENIZER}``
        placeholder.

        :param base_dir: Where the mix is stored. A trailing ``/`` is added
            when missing, and the result is prefixed to every path.
        :param tokenizer: The tokenizer identifier. Some mix/tokenizer
            combinations are stored under a different name, which is
            substituted here; otherwise the identifier is used as given.

        :returns: A list of paths/URLs to the data files in the mix and the
            list of corresponding labels, in manifest order.
        :raises ValueError: If a manifest line does not have exactly two
            comma-separated fields or its path lacks ``{TOKENIZER}``.
        """
        if not base_dir.endswith("/"):
            base_dir = base_dir + "/"

        # Map the tokenizer identifier to the name the data was saved under.
        # NOTE(review): nesting reconstructed from whitespace-stripped source;
        # the final ``elif`` is taken to be the fallback for all other mixes.
        tokenizer_id: str = tokenizer
        if self == DataMix.v3_small_ppl_validation:
            if tokenizer == TokenizerName.gpt_neox_olmo_dolma_v1_5:
                tokenizer_id = "gptneox20b"
            elif tokenizer == TokenizerName.dolma2:
                tokenizer_id = "dolma2-tokenizer"
        elif self == DataMix.code_fresh_ppl_validation:
            if tokenizer == TokenizerName.dolma2:
                tokenizer_id = "dolma2-tokenizer"
        elif self == DataMix.OLMo_mix_0625:
            if tokenizer == TokenizerName.dolma2_sigdig:
                tokenizer_id = "dolma2-tokenizer-sigdig"
        elif self in (
            # Mixes used for OLMo3 training are saved with "dolma3-tokenizer" tokenizer,
            # which is exactly the same as "dolma2-tokenizer" but with a different name.
            DataMix.OLMo_mix_0625_official,
            DataMix.OLMo_mix_0925_official,
            DataMix.OLMo_midtraining_mix_0625_100B,
            DataMix.OLMo_midtraining_mix_0925_ingredient1_100B,
            DataMix.OLMo_midtraining_mix_0925_ingredient2_100B,
            DataMix.OLMo_longmino_mix_0625,
            DataMix.OLMo_longmino_mix_0925,
        ):
            if tokenizer == TokenizerName.dolma2:
                tokenizer_id = "allenai/dolma3-tokenizer"
        elif tokenizer == TokenizerName.gpt_neox_olmo_dolma_v1_5:
            tokenizer_id = "gpt-neox-olmo-dolma-v1_5"

        paths: List[str] = []
        labels: List[str] = []
        with _get_data_mix_path(self) as mix_path, mix_path.open() as f:
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                # Validate the shape before unpacking so a malformed line is
                # reported with the mix name and line number rather than a
                # bare tuple-unpacking error.
                fields = line.split(",")
                if len(fields) != 2 or "{TOKENIZER}" not in fields[1]:
                    raise ValueError(f"line {line_num} in data mix '{self}' is invalid")

                label, path = fields
                path = path.replace("{TOKENIZER}", tokenizer_id)
                paths.append(f"{base_dir}{path}")
                labels.append(label)

        return paths, labels