"""Enumerations of named data mixes (module ``olmo_core.data.mixes``)."""

import os
from abc import abstractmethod
from contextlib import contextmanager
from pathlib import Path
from typing import Generator, List, Tuple

from olmo_core.config import StrEnum

from ..tokenizer import TokenizerName

__all__ = ["DataMixBase", "DataMix"]



class DataMixBase(StrEnum):
    """
    Base class for enumeration of data mixes.

    Each member names a mix; subclasses implement :meth:`build` to turn a
    member into the concrete list of data files (and their labels).
    """

    @abstractmethod
    def build(self, base_dir: str, tokenizer: str) -> Tuple[List[str], List[str]]:
        """
        Construct the data mix.

        :param base_dir: Where the mix is stored, e.g. "s3://ai2-llm" or "/weka/oe-training-default/ai2-llm".
        :param tokenizer: The tokenizer identifier.

        :returns: A list of paths/URLs to the tokenized numpy data files in the mix and list
            of corresponding labels.

        :raises NotImplementedError: Always, in this base implementation; subclasses
            must override this method.
        """
        raise NotImplementedError





class DataMix(DataMixBase):
    """
    An enumeration of data mix names.

    Each member's value is the base name of a packaged ``data/mixes/<value>.txt``
    file that :meth:`build` reads to produce the list of data files.
    """

    # Pretraining mixes
    OLMoE_mix_0824 = "OLMoE-mix-0824"
    dolma17 = "dolma17"
    OLMo_mix_0625 = "OLMo-mix-0625"
    OLMo_mix_0625_150Bsample = "OLMo-mix-0625-150Bsample"
    OLMo_mix_0625_700Bsample = "OLMo-mix-0625-700Bsample"
    OLMo_mix_0625_official = "OLMo-mix-0625-official"
    OLMo_mix_0925 = "OLMo-mix-0925"
    OLMo_mix_0925_official = "OLMo-mix-0925-official"

    # Midtraining mixes
    OLMo_midtraining_mix_0625_100B = "OLMo-midtraining-mix-0625-100B"
    OLMo_midtraining_mix_0925_ingredient1_100B = "OLMo-midtraining-mix-0925-ingredient1-100B"
    OLMo_midtraining_mix_0925_ingredient2_100B = "OLMo-midtraining-mix-0925-ingredient2-100B"

    # Long-context extension mixes
    OLMo_longmino_mix_0625 = "OLMo-longmino-mix-0625"
    OLMo_longmino_mix_0925 = "OLMo-longmino-mix-0925"

    # Validation mixes
    v3_small_ppl_validation = "v3-small-ppl-validation"
    code_fresh_ppl_validation = "code-fresh-ppl-validation"

    @classmethod
    def _missing_(cls, value: object) -> "DataMix | None":
        """
        Handle alias lookups.

        Called by the enum machinery when ``value`` matches no member value.

        :param value: The value that failed the normal lookup.

        :returns: The member an alias points to, or ``None`` if ``value`` is not a
            known alias.
        """
        # The mapping lives inside the method because a plain class-level
        # attribute on an enum would itself be turned into a member.
        aliases = {
            "dolma3-0625-6T-mix": "OLMo-mix-0625",
            "dolma3-0925-6T-mix": "OLMo-mix-0925",
            # NOTE(review): this alias says "0925" but resolves to the 0625
            # 150B sample -- confirm that this is intentional.
            "dolma3-0925-150B-mix": "OLMo-mix-0625-150Bsample",
        }

        if not isinstance(value, str) or value not in aliases:
            return None

        # Resolve the alias to the member carrying the real value, if any.
        real_value = aliases[value]
        return next((member for member in cls if member.value == real_value), None)

    def build(self, base_dir: str, tokenizer: str) -> Tuple[List[str], List[str]]:
        """
        Construct the data mix from its packaged ``.txt`` listing.

        Each non-blank, non-comment (``#``) line of the listing must have the form
        ``label,path`` with exactly one comma, and ``path`` must contain the
        ``{TOKENIZER}`` placeholder, which is replaced by a mix-specific tokenizer
        directory name.

        :param base_dir: Where the mix is stored, e.g. "s3://ai2-llm" or "/weka/oe-training-default/ai2-llm".
            A trailing "/" is added if missing.
        :param tokenizer: The tokenizer identifier.

        :returns: A list of paths/URLs to the tokenized numpy data files in the mix and list
            of corresponding labels.

        :raises ValueError: If a line of the listing is malformed.
        """
        if not base_dir.endswith("/"):
            base_dir = base_dir + "/"

        # Some mixes were stored under a directory name that differs from the
        # tokenizer identifier; translate it here. Unmatched combinations fall
        # through and use the identifier unchanged.
        tokenizer_id: str = tokenizer
        if self == DataMix.v3_small_ppl_validation:
            if tokenizer == TokenizerName.gpt_neox_olmo_dolma_v1_5:
                tokenizer_id = "gptneox20b"
            elif tokenizer == TokenizerName.dolma2:
                tokenizer_id = "dolma2-tokenizer"
        elif self == DataMix.code_fresh_ppl_validation:
            if tokenizer == TokenizerName.dolma2:
                tokenizer_id = "dolma2-tokenizer"
        elif self == DataMix.OLMo_mix_0625:
            if tokenizer == TokenizerName.dolma2_sigdig:
                tokenizer_id = "dolma2-tokenizer-sigdig"
        elif self in (
            # Mixes used for OLMo3 training are saved with "dolma3-tokenizer" tokenizer,
            # which is exactly the same as "dolma2-tokenizer" but with a different name.
            DataMix.OLMo_mix_0625_official,
            DataMix.OLMo_mix_0925_official,
            DataMix.OLMo_midtraining_mix_0625_100B,
            DataMix.OLMo_midtraining_mix_0925_ingredient1_100B,
            DataMix.OLMo_midtraining_mix_0925_ingredient2_100B,
            DataMix.OLMo_longmino_mix_0625,
            DataMix.OLMo_longmino_mix_0925,
        ):
            if tokenizer == TokenizerName.dolma2:
                tokenizer_id = "allenai/dolma3-tokenizer"
        elif tokenizer == TokenizerName.gpt_neox_olmo_dolma_v1_5:
            tokenizer_id = "gpt-neox-olmo-dolma-v1_5"

        paths: List[str] = []
        labels: List[str] = []
        with _get_data_mix_path(self) as mix_path:
            with mix_path.open() as f:
                for line_num, line in enumerate(f, start=1):
                    line = line.strip()
                    if not line or line.startswith("#"):
                        continue
                    # Exactly one comma is required. A missing or extra comma is
                    # reported with the line number instead of surfacing as an
                    # opaque tuple-unpacking error.
                    label, sep, path = line.partition(",")
                    if not sep or "," in path or "{TOKENIZER}" not in path:
                        raise ValueError(f"line {line_num} in data mix '{self}' is invalid")
                    path = path.replace("{TOKENIZER}", tokenizer_id)
                    paths.append(f"{base_dir}{path}")
                    labels.append(label)
        return paths, labels




@contextmanager
def _get_data_mix_path(name: str) -> Generator[Path, None, None]:
    """
    Yield a filesystem path to the packaged listing file of a data mix.

    The file is looked up as ``data/mixes/<basename of name>.txt`` inside the
    ``olmo_core`` package. Only the base name of ``name`` is used, so any
    directory components in it are dropped.

    :param name: The data mix name.

    :returns: A context manager yielding the :class:`~pathlib.Path` of the file;
        use the path inside the ``with`` block.
    """
    import importlib_resources

    package_root = importlib_resources.files("olmo_core")
    mix_resource = package_root.joinpath(f"data/mixes/{os.path.basename(name)}.txt")
    with importlib_resources.as_file(mix_resource) as mix_file:
        yield mix_file