import logging
import re
from typing import Any, Dict, List
import torch
from transformers import PretrainedConfig
from olmo_core.doc_utils import beta_feature
from olmo_core.nn.conversion.state_converter import StateConverter
from olmo_core.nn.conversion.state_mapping import (
StateMappingTemplate,
StateType,
TemplatePlaceholder,
)
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Short aliases for the template placeholders that are interpolated into the
# key templates defined in the mapping tables of this module.
LAYER = TemplatePlaceholder.LAYER
EXPERT = TemplatePlaceholder.EXPERT
#: Map of Hugging Face weight keys to OLMo Core weight keys, used to determine how HF state
#: maps to OLMo Core state. Different HF models may use different names for a given OLMo
#: Core state. You may configure this to change how HF state maps to OLMo Core state.
#:
#: This map only captures one-to-one mappings from HF to OLMo Core. For many-to-many mappings
#: or mappings that require additional manipulation of state, see
#: :data:`HF_TO_OLMO_CORE_TEMPLATE_MAPPINGS`. If a given HF key can refer to different OLMo Core
#: states depending on the HF model, see :data:`MODEL_TYPE_SPECIFIC_HF_TO_OLMO_CORE_WEIGHT_MAPPINGS`.
#:
#: Each entry is wrapped in a ``StateMappingTemplate`` with ``state_type=StateType.weight`` by
#: ``_get_hf_model_to_olmo_core_one_to_one_templates``.
HF_TO_OLMO_CORE_WEIGHT_MAPPINGS: Dict[str, str] = {
"model.embed_tokens.weight": "embeddings.weight",
"model.norm.weight": "lm_head.norm.weight",
"lm_head.weight": "lm_head.w_out.weight",
# Attention.
f"model.layers.{LAYER}.self_attn.q_proj.weight": f"blocks.{LAYER}.attention.w_q.weight",
f"model.layers.{LAYER}.self_attn.k_proj.weight": f"blocks.{LAYER}.attention.w_k.weight",
f"model.layers.{LAYER}.self_attn.v_proj.weight": f"blocks.{LAYER}.attention.w_v.weight",
f"model.layers.{LAYER}.self_attn.o_proj.weight": f"blocks.{LAYER}.attention.w_out.weight",
# MLP.
f"model.layers.{LAYER}.mlp.gate_proj.weight": f"blocks.{LAYER}.feed_forward.w1.weight",
f"model.layers.{LAYER}.mlp.down_proj.weight": f"blocks.{LAYER}.feed_forward.w2.weight",
f"model.layers.{LAYER}.mlp.up_proj.weight": f"blocks.{LAYER}.feed_forward.w3.weight",
# Layer norms.
# Both ``input_layernorm`` and ``post_attention_layernorm`` default to ``attention_norm``;
# the "llama", "gemma3_text" and "qwen3" entries of
# MODEL_TYPE_SPECIFIC_HF_TO_OLMO_CORE_WEIGHT_MAPPINGS override ``post_attention_layernorm``.
f"model.layers.{LAYER}.input_layernorm.weight": f"blocks.{LAYER}.attention_norm.weight",
f"model.layers.{LAYER}.post_attention_layernorm.weight": f"blocks.{LAYER}.attention_norm.weight",
f"model.layers.{LAYER}.post_feedforward_layernorm.weight": f"blocks.{LAYER}.feed_forward_norm.weight",
f"model.layers.{LAYER}.self_attn.q_norm.weight": f"blocks.{LAYER}.attention.q_norm.weight",
f"model.layers.{LAYER}.self_attn.k_norm.weight": f"blocks.{LAYER}.attention.k_norm.weight",
# MoEMLP.
# NOTE(review): the router and expert keys below also appear in
# HF_TO_OLMO_CORE_TEMPLATE_MAPPINGS, whose templates are appended after the one-to-one
# templates in ``_get_converter_from_hf`` -- confirm how ``StateConverter`` resolves the
# duplicated source keys.
f"model.layers.{LAYER}.mlp.gate.weight": f"blocks.{LAYER}.feed_forward_moe.router.weight",
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.gate_proj.weight": f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w1",
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.down_proj.weight": f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w2",
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.up_proj.weight": f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w3",
}
#: Map of Hugging Face module keys to OLMo Core module keys, used to determine how HF state
#: maps to OLMo Core state. Different HF models may use different names for a given OLMo
#: Core state. You may configure this to change how HF state maps to OLMo Core state.
#:
#: This map only captures one-to-one mappings from HF to OLMo Core. For many-to-many mappings
#: or mappings that require additional manipulation of state, see
#: :data:`HF_TO_OLMO_CORE_TEMPLATE_MAPPINGS`. If a given HF key can refer to different OLMo Core
#: states depending on the HF model, see :data:`MODEL_TYPE_SPECIFIC_HF_TO_OLMO_CORE_MODULE_MAPPINGS`.
#:
#: Each entry is wrapped in a ``StateMappingTemplate`` with ``state_type=StateType.module`` by
#: ``_get_hf_model_to_olmo_core_one_to_one_templates``.
HF_TO_OLMO_CORE_MODULE_MAPPINGS: Dict[str, str] = {
"model.embed_tokens": "embeddings",
"model.norm": "lm_head.norm",
"lm_head": "lm_head.w_out",
# Attention.
f"model.layers.{LAYER}.self_attn.q_proj": f"blocks.{LAYER}.attention.w_q",
f"model.layers.{LAYER}.self_attn.k_proj": f"blocks.{LAYER}.attention.w_k",
f"model.layers.{LAYER}.self_attn.v_proj": f"blocks.{LAYER}.attention.w_v",
f"model.layers.{LAYER}.self_attn.o_proj": f"blocks.{LAYER}.attention.w_out",
# MLP.
f"model.layers.{LAYER}.mlp.gate_proj": f"blocks.{LAYER}.feed_forward.w1",
f"model.layers.{LAYER}.mlp.down_proj": f"blocks.{LAYER}.feed_forward.w2",
f"model.layers.{LAYER}.mlp.up_proj": f"blocks.{LAYER}.feed_forward.w3",
# Layer norms.
f"model.layers.{LAYER}.input_layernorm": f"blocks.{LAYER}.attention_norm",
f"model.layers.{LAYER}.post_attention_layernorm": f"blocks.{LAYER}.attention_norm",
f"model.layers.{LAYER}.post_feedforward_layernorm": f"blocks.{LAYER}.feed_forward_norm",
f"model.layers.{LAYER}.self_attn.q_norm": f"blocks.{LAYER}.attention.q_norm",
f"model.layers.{LAYER}.self_attn.k_norm": f"blocks.{LAYER}.attention.k_norm",
# MoEMLP.
f"model.layers.{LAYER}.mlp": f"blocks.{LAYER}.feed_forward_moe",
f"model.layers.{LAYER}.post_moe_norm": f"blocks.{LAYER}.feed_forward_moe_norm",
f"model.layers.{LAYER}.mlp.gate": f"blocks.{LAYER}.feed_forward_moe.router",
# Indices are not part of the original OLMo Core state but can be introduced during conversion to aid debugging.
f"model.layers.{LAYER}.mlp.gate.indices": f"blocks.{LAYER}.feed_forward_moe.router.indices",
f"model.layers.{LAYER}.mlp.shared_mlp": f"blocks.{LAYER}.feed_forward_moe.shared_mlp",
f"model.layers.{LAYER}.mlp.shared_mlp.gate_proj": f"blocks.{LAYER}.feed_forward_moe.shared_mlp.w1",
f"model.layers.{LAYER}.mlp.shared_mlp.down_proj": f"blocks.{LAYER}.feed_forward_moe.shared_mlp.w2",
f"model.layers.{LAYER}.mlp.shared_mlp.up_proj": f"blocks.{LAYER}.feed_forward_moe.shared_mlp.w3",
}
#: Map of HF model type to a map of Hugging Face weight keys to OLMo Core weight keys. This map
#: captures overrides of the standard one-to-one mappings in :data:`HF_TO_OLMO_CORE_WEIGHT_MAPPINGS`,
#: in case a given HF key can refer to different OLMo Core states depending on the HF model
#: architecture. You may configure this to change how HF state maps to OLMo Core state.
#:
#: The entries for the requested model type are merged over the standard mappings by HF key in
#: ``_get_hf_model_to_olmo_core_one_to_one_templates``: an HF key that already exists is replaced,
#: and a new HF key (e.g. ``pre_feedforward_layernorm`` for ``gemma3_text``) is added.
MODEL_TYPE_SPECIFIC_HF_TO_OLMO_CORE_WEIGHT_MAPPINGS: Dict[str, Dict[str, str]] = {
"llama": {
f"model.layers.{LAYER}.post_attention_layernorm.weight": f"blocks.{LAYER}.feed_forward_norm.weight"
},
"gemma3_text": {
f"model.layers.{LAYER}.post_attention_layernorm.weight": f"blocks.{LAYER}.post_attention_norm.weight",
f"model.layers.{LAYER}.pre_feedforward_layernorm.weight": f"blocks.{LAYER}.feed_forward_norm.weight",
f"model.layers.{LAYER}.post_feedforward_layernorm.weight": f"blocks.{LAYER}.post_feed_forward_norm.weight",
},
"qwen3": {
f"model.layers.{LAYER}.post_attention_layernorm.weight": f"blocks.{LAYER}.feed_forward_norm.weight"
},
}
#: Map of HF model type to a map of Hugging Face module keys to OLMo Core module keys. This map
#: captures overrides of the standard one-to-one mappings in :data:`HF_TO_OLMO_CORE_MODULE_MAPPINGS`,
#: in case a given HF key can refer to different OLMo Core states depending on the HF model
#: architecture. You may configure this to change how HF state maps to OLMo Core state.
#:
#: The entries for the requested model type are merged over the standard mappings by HF key in
#: ``_get_hf_model_to_olmo_core_one_to_one_templates``: an HF key that already exists is replaced,
#: and a new HF key is added.
MODEL_TYPE_SPECIFIC_HF_TO_OLMO_CORE_MODULE_MAPPINGS: Dict[str, Dict[str, str]] = {
"llama": {
f"model.layers.{LAYER}.post_attention_layernorm": f"blocks.{LAYER}.feed_forward_norm"
},
"gemma3_text": {
f"model.layers.{LAYER}.post_attention_layernorm": f"blocks.{LAYER}.post_attention_norm",
f"model.layers.{LAYER}.pre_feedforward_layernorm": f"blocks.{LAYER}.feed_forward_norm",
f"model.layers.{LAYER}.post_feedforward_layernorm": f"blocks.{LAYER}.post_feed_forward_norm",
},
"qwen3": {
f"model.layers.{LAYER}.post_attention_layernorm": f"blocks.{LAYER}.feed_forward_norm"
},
}
#: Map of Hugging Face keys to OLMo Core mapping templates, used to determine how HF state
#: maps to OLMo Core state. Different HF models may use different names for a given OLMo
#: Core state. You may configure this to change how HF state maps to OLMo Core state.
#:
#: This map captures many-to-many mappings from HF to OLMo Core and mappings that require
#: additional manipulation of state (e.g. merging dimensions).
#: For simple one-to-one mappings from HF to OLMo Core, see
#: :data:`HF_TO_OLMO_CORE_WEIGHT_MAPPINGS` and :data:`HF_TO_OLMO_CORE_MODULE_MAPPINGS`.
#:
#: Only the values of this map are consumed by ``_get_converter_from_hf``; they are appended
#: after the one-to-one templates.
HF_TO_OLMO_CORE_TEMPLATE_MAPPINGS: Dict[str, StateMappingTemplate] = {
# Per-expert HF projection weights -> a single OLMo Core expert weight per layer.
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.gate_proj.weight": StateMappingTemplate(
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.gate_proj.weight",
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w1",
source_key_per_placeholder=TemplatePlaceholder.EXPERT,
source_concat_dim=1,
dims_permutation=(1, 0),
),
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.down_proj.weight": StateMappingTemplate(
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.down_proj.weight",
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w2",
source_key_per_placeholder=TemplatePlaceholder.EXPERT,
source_concat_dim=1,
dims_permutation=(1, 0),
),
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.up_proj.weight": StateMappingTemplate(
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.up_proj.weight",
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w3",
source_key_per_placeholder=TemplatePlaceholder.EXPERT,
source_concat_dim=1,
dims_permutation=(1, 0),
),
# Router weight: dims 0 and 1 are flattened on the way to OLMo Core.
f"model.layers.{LAYER}.mlp.gate.weight": StateMappingTemplate(
f"model.layers.{LAYER}.mlp.gate.weight",
f"blocks.{LAYER}.feed_forward_moe.router.weight",
flatten_dims=(0, 1),
),
}
#: Map of OLMo Core weight keys to Hugging Face weight keys, used to determine how OLMo Core state
#: maps to HF state. You may configure this to change how OLMo Core state maps to HF state.
#:
#: This map only captures one-to-one mappings from OLMo Core to HF. For many-to-many mappings
#: or mappings that require additional manipulation of state, see :data:`OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS`.
#:
#: Each entry is wrapped in a ``StateMappingTemplate`` with ``state_type=StateType.weight`` by
#: ``_get_converter_to_hf``.
OLMO_CORE_TO_HF_WEIGHT_MAPPINGS: Dict[str, str] = {
"embeddings.weight": "model.embed_tokens.weight",
"lm_head.norm.weight": "model.norm.weight",
"lm_head.w_out.weight": "lm_head.weight",
# Attention.
f"blocks.{LAYER}.attention.w_q.weight": f"model.layers.{LAYER}.self_attn.q_proj.weight",
f"blocks.{LAYER}.attention.w_k.weight": f"model.layers.{LAYER}.self_attn.k_proj.weight",
f"blocks.{LAYER}.attention.w_v.weight": f"model.layers.{LAYER}.self_attn.v_proj.weight",
f"blocks.{LAYER}.attention.w_out.weight": f"model.layers.{LAYER}.self_attn.o_proj.weight",
# MLP.
f"blocks.{LAYER}.feed_forward.w1.weight": f"model.layers.{LAYER}.mlp.gate_proj.weight",
f"blocks.{LAYER}.feed_forward.w2.weight": f"model.layers.{LAYER}.mlp.down_proj.weight",
f"blocks.{LAYER}.feed_forward.w3.weight": f"model.layers.{LAYER}.mlp.up_proj.weight",
# Layer norms.
f"blocks.{LAYER}.attention_norm.weight": f"model.layers.{LAYER}.post_attention_layernorm.weight",
f"blocks.{LAYER}.feed_forward_norm.weight": f"model.layers.{LAYER}.post_feedforward_layernorm.weight",
f"blocks.{LAYER}.attention.q_norm.weight": f"model.layers.{LAYER}.self_attn.q_norm.weight",
f"blocks.{LAYER}.attention.k_norm.weight": f"model.layers.{LAYER}.self_attn.k_norm.weight",
# MoEMLP.
# The router and expert keys below are replaced by the entries of
# OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS with the same keys when ``_get_converter_to_hf``
# merges the two maps.
f"blocks.{LAYER}.feed_forward_moe.router.weight": f"model.layers.{LAYER}.mlp.gate.weight",
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w1": f"model.layers.{LAYER}.mlp.experts.{EXPERT}.gate_proj.weight",
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w2": f"model.layers.{LAYER}.mlp.experts.{EXPERT}.down_proj.weight",
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w3": f"model.layers.{LAYER}.mlp.experts.{EXPERT}.up_proj.weight",
}
#: Map of OLMo Core module keys to Hugging Face module keys, used to determine how OLMo Core state
#: maps to HF state. You may configure this to change how OLMo Core state maps to HF state.
#:
#: This map only captures one-to-one mappings from OLMo Core to HF. For many-to-many mappings
#: or mappings that require additional manipulation of state, see :data:`OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS`.
#:
#: Each entry is wrapped in a ``StateMappingTemplate`` with ``state_type=StateType.module`` by
#: ``_get_converter_to_hf``.
OLMO_CORE_TO_HF_MODULE_MAPPINGS: Dict[str, str] = {
"embeddings": "model.embed_tokens",
"lm_head.norm": "model.norm",
"lm_head.w_out": "lm_head",
# Attention.
f"blocks.{LAYER}.attention.w_q": f"model.layers.{LAYER}.self_attn.q_proj",
f"blocks.{LAYER}.attention.w_k": f"model.layers.{LAYER}.self_attn.k_proj",
f"blocks.{LAYER}.attention.w_v": f"model.layers.{LAYER}.self_attn.v_proj",
f"blocks.{LAYER}.attention.w_out": f"model.layers.{LAYER}.self_attn.o_proj",
# MLP.
f"blocks.{LAYER}.feed_forward": f"model.layers.{LAYER}.mlp",
f"blocks.{LAYER}.feed_forward.w1": f"model.layers.{LAYER}.mlp.gate_proj",
f"blocks.{LAYER}.feed_forward.w2": f"model.layers.{LAYER}.mlp.down_proj",
f"blocks.{LAYER}.feed_forward.w3": f"model.layers.{LAYER}.mlp.up_proj",
# Layer norms.
f"blocks.{LAYER}.attention_norm": f"model.layers.{LAYER}.post_attention_layernorm",
f"blocks.{LAYER}.feed_forward_norm": f"model.layers.{LAYER}.post_feedforward_layernorm",
f"blocks.{LAYER}.attention.q_norm": f"model.layers.{LAYER}.self_attn.q_norm",
f"blocks.{LAYER}.attention.k_norm": f"model.layers.{LAYER}.self_attn.k_norm",
# MoEMLP.
f"blocks.{LAYER}.feed_forward_moe": f"model.layers.{LAYER}.mlp",
f"blocks.{LAYER}.feed_forward_moe_norm": f"model.layers.{LAYER}.post_moe_norm",
f"blocks.{LAYER}.feed_forward_moe.router": f"model.layers.{LAYER}.mlp.gate",
# Indices are not part of the original OLMo Core state but can be introduced during conversion to aid debugging.
f"blocks.{LAYER}.feed_forward_moe.router.indices": f"model.layers.{LAYER}.mlp.gate.indices",
f"blocks.{LAYER}.feed_forward_moe.shared_mlp": f"model.layers.{LAYER}.mlp.shared_mlp",
f"blocks.{LAYER}.feed_forward_moe.shared_mlp.w1": f"model.layers.{LAYER}.mlp.shared_mlp.gate_proj",
f"blocks.{LAYER}.feed_forward_moe.shared_mlp.w2": f"model.layers.{LAYER}.mlp.shared_mlp.down_proj",
f"blocks.{LAYER}.feed_forward_moe.shared_mlp.w3": f"model.layers.{LAYER}.mlp.shared_mlp.up_proj",
}
#: Map of OLMo Core keys to Hugging Face mapping templates, used to determine how OLMo Core state
#: maps to HF state. You may configure this to change how OLMo Core state maps to HF state.
#:
#: This map captures many-to-many mappings from OLMo Core to HF and mappings that require
#: additional manipulation of state (e.g. merging dimensions).
#: For simple one-to-one mappings from OLMo Core to HF, see
#: :data:`OLMO_CORE_TO_HF_WEIGHT_MAPPINGS` and :data:`OLMO_CORE_TO_HF_MODULE_MAPPINGS`.
#:
#: ``_get_converter_to_hf`` merges this map over the one-to-one mappings by OLMo Core key, so an
#: entry here replaces a one-to-one entry with the same key.
OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS: Dict[str, StateMappingTemplate] = {
# A single OLMo Core expert weight per layer -> per-expert HF projection weights.
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w1": StateMappingTemplate(
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w1",
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.gate_proj.weight",
dest_key_per_placeholder=TemplatePlaceholder.EXPERT,
dims_permutation=(1, 0),
dest_chunk_dim=1,
),
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w2": StateMappingTemplate(
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w2",
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.down_proj.weight",
dest_key_per_placeholder=TemplatePlaceholder.EXPERT,
dims_permutation=(1, 0),
dest_chunk_dim=1,
),
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w3": StateMappingTemplate(
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w3",
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.up_proj.weight",
dest_key_per_placeholder=TemplatePlaceholder.EXPERT,
dims_permutation=(1, 0),
dest_chunk_dim=1,
),
# Router weight: dim 0 is unflattened using the EXPERT placeholder on the way to HF.
f"blocks.{LAYER}.feed_forward_moe.router.weight": StateMappingTemplate(
f"blocks.{LAYER}.feed_forward_moe.router.weight",
f"model.layers.{LAYER}.mlp.gate.weight",
unflatten_dim=(0, (TemplatePlaceholder.EXPERT, -1)),
),
}
#: Map of HF model type to a map of OLMo Core keys to Hugging Face mapping templates, used to
#: determine how OLMo Core state maps to HF state. This map captures overrides of the standard
#: mappings in :data:`OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS`, in case a given OLMo Core key can refer to
#: different HF states depending on the HF model. You may configure this to change how OLMo Core
#: state maps to HF state.
#:
#: ``_get_converter_to_hf`` merges the entries for the requested model type last, so they take
#: precedence over every other mapping with the same OLMo Core key.
MODEL_TYPE_SPECIFIC_OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS: Dict[
str, Dict[str, StateMappingTemplate]
] = {
# NOTE(review): as written, the "flex_olmo" templates repeat the defaults in
# OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS argument for argument -- confirm whether they are
# meant to differ.
"flex_olmo": {
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w1": StateMappingTemplate(
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w1",
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.gate_proj.weight",
dest_key_per_placeholder=TemplatePlaceholder.EXPERT,
dims_permutation=(1, 0),
dest_chunk_dim=1,
),
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w2": StateMappingTemplate(
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w2",
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.down_proj.weight",
dest_key_per_placeholder=TemplatePlaceholder.EXPERT,
dims_permutation=(1, 0),
dest_chunk_dim=1,
),
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w3": StateMappingTemplate(
f"blocks.{LAYER}.feed_forward_moe.experts.mlp.w3",
f"model.layers.{LAYER}.mlp.experts.{EXPERT}.up_proj.weight",
dest_key_per_placeholder=TemplatePlaceholder.EXPERT,
dims_permutation=(1, 0),
dest_chunk_dim=1,
),
f"blocks.{LAYER}.feed_forward_moe.router.weight": StateMappingTemplate(
f"blocks.{LAYER}.feed_forward_moe.router.weight",
f"model.layers.{LAYER}.mlp.gate.weight",
unflatten_dim=(0, (TemplatePlaceholder.EXPERT, -1)),
),
}
}
def _get_hf_model_to_olmo_core_one_to_one_templates(
    model_type: str | None = None,
) -> List[StateMappingTemplate]:
    """
    Build the one-to-one HF -> OLMo Core mapping templates.

    Templates are collected in a dict keyed by HF key, in this order: the standard weight
    mappings, the standard module mappings, then the weight and module overrides registered
    for ``model_type`` (if any). A later entry with the same HF key replaces the earlier one.

    :param model_type: The HF model type whose overrides should be applied. Model types with
        no registered overrides (including ``None``) get the standard mappings only.
    :returns: The resulting templates, in insertion order.
    """
    templates_by_hf_key: Dict[str, StateMappingTemplate] = {}

    def _register(key_map: Dict[str, str], state_type: StateType) -> None:
        # Assigning an existing key replaces its template but keeps its original position.
        for source_key, dest_key in key_map.items():
            templates_by_hf_key[source_key] = StateMappingTemplate(
                source_key, dest_key, state_type=state_type
            )

    _register(HF_TO_OLMO_CORE_WEIGHT_MAPPINGS, StateType.weight)
    _register(HF_TO_OLMO_CORE_MODULE_MAPPINGS, StateType.module)

    per_model_overrides = (
        (MODEL_TYPE_SPECIFIC_HF_TO_OLMO_CORE_WEIGHT_MAPPINGS, StateType.weight),
        (MODEL_TYPE_SPECIFIC_HF_TO_OLMO_CORE_MODULE_MAPPINGS, StateType.module),
    )
    for overrides_by_model, state_type in per_model_overrides:
        if model_type in overrides_by_model:
            _register(overrides_by_model[model_type], state_type)

    return list(templates_by_hf_key.values())
def _get_converter_from_hf(model_type: str | None = None) -> StateConverter:
    """
    Build the HF -> OLMo Core :class:`StateConverter`.

    The converter receives the one-to-one templates for ``model_type`` followed by the
    templates in :data:`HF_TO_OLMO_CORE_TEMPLATE_MAPPINGS`.

    :param model_type: The HF model type used to select model-specific one-to-one overrides.
    """
    one_to_one_templates = _get_hf_model_to_olmo_core_one_to_one_templates(model_type)
    return StateConverter(
        [*one_to_one_templates, *HF_TO_OLMO_CORE_TEMPLATE_MAPPINGS.values()]
    )
[docs]
@beta_feature
def get_converter_from_hf(model_type: str | None = None) -> StateConverter:
    """
    Return the :class:`StateConverter` that maps Hugging Face state to OLMo Core state.

    :param model_type: The model type of the HF model, used to select model-specific
        mapping overrides. ``None`` selects the standard mappings only.
    """
    converter = _get_converter_from_hf(model_type=model_type)
    return converter
def _apply_gemma3_norm_transform(state: Dict[str, Any]) -> Dict[str, Any]:
"""
Transform Gemma 3 norm weights from HF format to OLMo format.
HF Gemma 3 uses zero-initialized RMSNorm weights with `hidden_states * (1 + weight)`,
while OLMo uses ones-initialized weights with `hidden_states * weight`.
This function adds 1 to all norm weights to convert between the two conventions.
"""
norm_patterns = [
"attention_norm.weight",
"post_attention_norm.weight",
"feed_forward_norm.weight",
"post_feed_forward_norm.weight",
"lm_head.norm.weight",
"q_norm.weight",
"k_norm.weight",
]
for key, value in state.items():
if any(pattern in key for pattern in norm_patterns):
if isinstance(value, torch.Tensor):
state[key] = value + 1.0
return state
def _convert_state(
    config: PretrainedConfig,
    state: Dict[str, Any],
    converter: StateConverter,
) -> Dict[str, Any]:
    """
    Run ``converter`` over ``state`` with placeholder bounds read from ``config``.

    :param config: The Hugging Face config. Must define ``num_hidden_layers``; ``num_experts``
        is optional.
    :param state: The state dict to convert.
    :param converter: The converter to apply.
    :returns: The result of ``converter.convert``.
    :raises ValueError: If ``config`` has no ``num_hidden_layers`` attribute.
    """
    if not hasattr(config, "num_hidden_layers"):
        raise ValueError(f"Number of hidden layers missing in HF config: {config}")

    num_layers: int = config.num_hidden_layers
    placeholder_bounds = {TemplatePlaceholder.LAYER: num_layers}

    # The expert bound is only set for a truthy expert count (neither missing, None nor 0).
    num_experts: int | None = getattr(config, "num_experts", None)
    if num_experts:
        placeholder_bounds[TemplatePlaceholder.EXPERT] = num_experts

    return converter.convert(state, placeholder_bounds)
[docs]
@beta_feature
def convert_state_from_hf(
    config: PretrainedConfig,
    hf_state: Dict[str, Any],
    *,
    model_type: str | None = None,
) -> Dict[str, Any]:
    """
    Converts a model state dict in Hugging Face transformers format into an unsharded state dict of
    OLMo Core format.

    For ``model_type="gemma3_text"`` the converted norm weights are additionally shifted by
    ``_apply_gemma3_norm_transform``.

    :param config: The Hugging Face config for the model
    :param hf_state: A model state dict in HF format.
    :param model_type: The model type of the HF model.
    """
    olmo_core_state = _convert_state(
        config, hf_state, _get_converter_from_hf(model_type=model_type)
    )
    if model_type != "gemma3_text":
        return olmo_core_state
    return _apply_gemma3_norm_transform(olmo_core_state)
def _get_converter_to_hf(model_type: str | None = None) -> StateConverter:
    """
    Build the OLMo Core -> HF :class:`StateConverter`.

    Templates are merged by OLMo Core key in this order, with later sources replacing earlier
    entries that share a key: module mappings, weight mappings,
    :data:`OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS`, then the overrides registered for ``model_type``.

    :param model_type: The HF model type whose template overrides should be applied, if any.
    """
    one_to_one: Dict[str, StateMappingTemplate] = {}
    for key_map, state_type in (
        (OLMO_CORE_TO_HF_MODULE_MAPPINGS, StateType.module),
        (OLMO_CORE_TO_HF_WEIGHT_MAPPINGS, StateType.weight),
    ):
        for olmo_core_key, hf_key in key_map.items():
            one_to_one[olmo_core_key] = StateMappingTemplate(
                olmo_core_key, hf_key, state_type=state_type
            )

    model_overrides = (
        MODEL_TYPE_SPECIFIC_OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS.get(model_type, {})
        if model_type
        else {}
    )
    # Dict unpacking keeps first-insertion order and lets later sources win on key clashes.
    merged = {**one_to_one, **OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS, **model_overrides}
    return StateConverter(list(merged.values()))
[docs]
@beta_feature
def get_converter_to_hf(model_type: str | None = None) -> StateConverter:
    """
    Return the :class:`StateConverter` that maps OLMo Core state to Hugging Face state.

    :param model_type: The model type of the HF model, used to select model-specific
        template overrides. ``None`` selects the standard mappings only.
    """
    converter = _get_converter_to_hf(model_type)
    return converter
[docs]
@beta_feature
def convert_state_to_hf(
    config: PretrainedConfig, olmo_core_state: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Converts an *unsharded* model state dict of OLMo Core format into Hugging Face transformers format.

    The converter is selected from ``config.model_type`` when the config defines it.

    :param config: The Hugging Face config for the model
    :param olmo_core_state: An unsharded OLMo Core model state dict. None of the states can be
        :class:`DTensor` or :class:`ShardedTensor`
    """
    hf_model_type = getattr(config, "model_type", None)
    return _convert_state(config, olmo_core_state, _get_converter_to_hf(hf_model_type))
# ---------------------------------------------------------------------------
# Hybrid (GDN + attention) per-layer key maps.
#
# These can't use the LAYER template because GDN and attention layers sharing
# the same OLMo-core prefix (``blocks.{i}.attention.*``) need different HF
# prefixes (``linear_attn.*`` vs ``self_attn.*``).
# ---------------------------------------------------------------------------
#: GDN layers: OLMo-core ``blocks.{i}.attention.*`` -> HF ``model.layers.{i}.linear_attn.*``.
#: These layers use pre-norm in HF (input_layernorm before the sequence mixer).
#:
#: Keys are the part of the OLMo-core key after ``blocks.{i}.``; values are the part of the HF
#: key after ``model.layers.{i}.`` (see :func:`convert_hybrid_state_to_hf`).
HYBRID_GDN_LAYER_KEY_MAP: Dict[str, str] = {
"attention.w_q.weight": "linear_attn.q_proj.weight",
"attention.w_k.weight": "linear_attn.k_proj.weight",
"attention.w_v.weight": "linear_attn.v_proj.weight",
"attention.w_a.weight": "linear_attn.a_proj.weight",
"attention.w_b.weight": "linear_attn.b_proj.weight",
"attention.w_g.weight": "linear_attn.g_proj.weight",
"attention.w_out.weight": "linear_attn.o_proj.weight",
"attention.q_conv1d.weight": "linear_attn.q_conv1d.weight",
"attention.k_conv1d.weight": "linear_attn.k_conv1d.weight",
"attention.v_conv1d.weight": "linear_attn.v_conv1d.weight",
"attention.o_norm.weight": "linear_attn.o_norm.weight",
"attention.A_log": "linear_attn.A_log",
"attention.dt_bias": "linear_attn.dt_bias",
# Norms.
"attention_norm.weight": "input_layernorm.weight",
"feed_forward_norm.weight": "post_attention_layernorm.weight",
# MLP.
"feed_forward.w1.weight": "mlp.gate_proj.weight",
"feed_forward.w2.weight": "mlp.down_proj.weight",
"feed_forward.w3.weight": "mlp.up_proj.weight",
}
#: Attention layers: OLMo-core ``blocks.{i}.attention.*`` -> HF ``model.layers.{i}.self_attn.*``.
#: These layers use post-norm in HF (layernorm after the sequence mixer and after the MLP).
#:
#: Keys are the part of the OLMo-core key after ``blocks.{i}.``; values are the part of the HF
#: key after ``model.layers.{i}.`` (see :func:`convert_hybrid_state_to_hf`).
HYBRID_ATTN_LAYER_KEY_MAP: Dict[str, str] = {
"attention.w_q.weight": "self_attn.q_proj.weight",
"attention.w_k.weight": "self_attn.k_proj.weight",
"attention.w_v.weight": "self_attn.v_proj.weight",
"attention.w_out.weight": "self_attn.o_proj.weight",
"attention.q_norm.weight": "self_attn.q_norm.weight",
"attention.k_norm.weight": "self_attn.k_norm.weight",
# Norms.
"attention_norm.weight": "post_attention_layernorm.weight",
"feed_forward_norm.weight": "post_feedforward_layernorm.weight",
# MLP.
"feed_forward.w1.weight": "mlp.gate_proj.weight",
"feed_forward.w2.weight": "mlp.down_proj.weight",
"feed_forward.w3.weight": "mlp.up_proj.weight",
}
#: Non-block keys shared across all hybrid models. Keys are full OLMo-core keys and values
#: are full HF keys.
HYBRID_SHARED_KEY_MAP: Dict[str, str] = {
"embeddings.weight": "model.embed_tokens.weight",
"lm_head.norm.weight": "model.norm.weight",
"lm_head.w_out.weight": "lm_head.weight",
}
# Matches ``blocks.<layer index>.<suffix>``: group 1 is the decimal layer index and group 2
# is everything after it, which is looked up in the per-layer key maps.
_HYBRID_BLOCK_KEY_RE = re.compile(r"^blocks\.(\d+)\.(.+)$")
[docs]
@beta_feature
def convert_hybrid_state_to_hf(
    state_dict: Dict[str, Any],
    layer_types: List[str],
) -> Dict[str, Any]:
    """
    Convert an OLMo-core hybrid state dict to HF ``olmo_hybrid`` format.

    Uses :data:`HYBRID_SHARED_KEY_MAP` for non-block keys, and per-layer
    :data:`HYBRID_GDN_LAYER_KEY_MAP` / :data:`HYBRID_ATTN_LAYER_KEY_MAP`
    based on *layer_types*. Values are passed through unchanged; only keys are renamed.

    :param state_dict: An unsharded OLMo-core model state dict.
    :param layer_types: Per-layer type list (``"linear_attention"`` or ``"full_attention"``).
        Any type other than ``"linear_attention"`` is converted with the attention key map.
    :returns: A new dict holding the same values under their HF keys.
    :raises KeyError: If a key is neither a shared key nor a ``blocks.<i>.<suffix>`` key, or
        if its suffix has no entry in the key map selected for its layer.
    :raises IndexError: If a key refers to a layer index that *layer_types* does not cover.
    """
    hf_state: Dict[str, Any] = {}
    for olmo_key, value in state_dict.items():
        # Try shared (non-block) keys first.
        if olmo_key in HYBRID_SHARED_KEY_MAP:
            hf_state[HYBRID_SHARED_KEY_MAP[olmo_key]] = value
            continue

        m = _HYBRID_BLOCK_KEY_RE.match(olmo_key)
        if m is None:
            raise KeyError(f"Unmapped key: {olmo_key}")
        layer_idx = int(m.group(1))
        suffix = m.group(2)

        # Look the layer type up once, and report which key overran *layer_types* instead of
        # surfacing a bare "list index out of range".
        try:
            layer_type = layer_types[layer_idx]
        except IndexError as err:
            raise IndexError(
                f"Key {olmo_key!r} refers to layer {layer_idx}, but only "
                f"{len(layer_types)} layer types were provided"
            ) from err

        key_map = (
            HYBRID_GDN_LAYER_KEY_MAP
            if layer_type == "linear_attention"
            else HYBRID_ATTN_LAYER_KEY_MAP
        )
        if suffix not in key_map:
            raise KeyError(
                f"Unmapped block suffix for layer {layer_idx} "
                f"(type={layer_type!r}): {olmo_key}"
            )
        hf_state[f"model.layers.{layer_idx}.{key_map[suffix]}"] = value
    return hf_state