[docs]classTokenizerName(StrEnum):""" An enumeration of tokenizer identifiers commonly used OLMo researchers. """dolma2="allenai/dolma2-tokenizer"""" The dolma2 tokenizer. """dolma2_sigdig="allenai/dolma2-tokenizer-sigdig"""" The R2L dolma2 tokenizer. """gpt_neox_olmo_dolma_v1_5="allenai/gpt-neox-olmo-dolma-v1_5"""" A modified GPT NeoX tokenizer. """gpt2="gpt2"""" The base GPT2 tokenizer. """
@dataclass
class TokenizerConfig(Config):
    """
    A configuration class that represents a tokenizer.

    Besides the plain constructor, the class provides one factory classmethod per
    :class:`TokenizerName` member, plus :meth:`from_hf` for building a config from
    a HuggingFace repo.
    """

    vocab_size: int
    """
    The vocab size.
    """

    eos_token_id: int
    """
    The end-of-sentence token ID.
    """

    pad_token_id: int
    """
    The padding token ID.
    """

    bos_token_id: Optional[int] = None
    """
    The begin-of-sentence token ID.
    """

    identifier: Optional[str] = None
    """
    The identifier of the tokenizer. Could be a path or HuggingFace identifier.
    """

    def padded_vocab_size(self, pad_multiple: int = 128) -> int:
        """
        Returns the vocab size padded to be a multiple of ``pad_multiple``.

        This is useful to set model embeddings to this number to increase throughput.

        :param pad_multiple: The multiple to round the vocab size up to.

        :returns: The smallest multiple of ``pad_multiple`` that is greater than or
            equal to ``vocab_size`` (for a positive ``pad_multiple``).
        """
        # Integer ceiling division, then scale back up to the padded size.
        return pad_multiple * ((self.vocab_size + pad_multiple - 1) // pad_multiple)

    @classmethod
    def dolma2(cls) -> "TokenizerConfig":
        """
        Get a :data:`~TokenizerName.dolma2` tokenizer config.
        """
        return cls(
            vocab_size=100278,
            eos_token_id=100257,
            pad_token_id=100277,
            identifier=TokenizerName.dolma2,
        )

    @classmethod
    def dolma2_sigdig(cls) -> "TokenizerConfig":
        """
        Get a :data:`~TokenizerName.dolma2_sigdig` tokenizer config.

        Same IDs as :meth:`dolma2`, except that ``bos_token_id`` is also set
        (to the same ID as ``eos_token_id``).
        """
        return cls(
            vocab_size=100278,
            eos_token_id=100257,
            pad_token_id=100277,
            bos_token_id=100257,
            identifier=TokenizerName.dolma2_sigdig,
        )

    @classmethod
    def gpt_neox_olmo_dolma_v1_5(cls) -> "TokenizerConfig":
        """
        Get a :data:`~TokenizerName.gpt_neox_olmo_dolma_v1_5` tokenizer config.
        """
        return cls(
            vocab_size=50280,
            eos_token_id=50279,
            pad_token_id=1,
            identifier=TokenizerName.gpt_neox_olmo_dolma_v1_5,
        )

    @classmethod
    def gpt2(cls) -> "TokenizerConfig":
        """
        Get a :data:`~TokenizerName.gpt2` tokenizer config.

        The EOS, BOS and padding IDs are all the same token ID here.
        """
        return cls(
            vocab_size=50257,
            eos_token_id=50256,
            bos_token_id=50256,
            pad_token_id=50256,
            identifier=TokenizerName.gpt2,
        )

    @classmethod
    def from_hf(cls, identifier: str) -> "TokenizerConfig":
        """
        Initialize a tokenizer config from a model on HuggingFace.

        Reads ``config.json`` from the repo, falling back to ``tokenizer_config.json``
        when ``config.json`` is not found. The padding token ID defaults to the EOS
        token ID when the file does not provide one.

        :param identifier: The HF model identifier, e.g. "meta-llama/Llama-3.2-1B".

        :raises KeyError: If the loaded file has no ``vocab_size`` or ``eos_token_id`` entry.
        """
        import json

        from cached_path import cached_path

        try:
            config_path = cached_path(f"hf://{identifier}/config.json")
        except FileNotFoundError:
            config_path = cached_path(f"hf://{identifier}/tokenizer_config.json")

        with config_path.open() as f:
            config = json.load(f)

        vocab_size = config["vocab_size"]
        eos_token_id = config["eos_token_id"]

        # Fall back to the EOS ID both when the key is missing and when it is
        # present but ``null``; otherwise ``None`` would leak into the ``int``
        # field ``pad_token_id``.
        pad_token_id = config.get("pad_token_id")
        if pad_token_id is None:
            pad_token_id = eos_token_id

        return cls(
            vocab_size=vocab_size,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            bos_token_id=config.get("bos_token_id"),
            identifier=identifier,
        )