Index _ | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | Y | Z _ __add__() (olmo_core.data.composable.InstanceSource method) (olmo_core.data.composable.InstanceSourceConfig method) (olmo_core.data.composable.TokenSource method) (olmo_core.data.composable.TokenSourceConfig method) __call__() (olmo_core.data.collator.DataCollator method) __getitem__() (olmo_core.data.composable.ConcatAndChunkInstanceSource method) (olmo_core.data.composable.ConcatenatedInstanceSource method) (olmo_core.data.composable.InstanceSource method) (olmo_core.data.composable.MixingInstanceSource method) (olmo_core.data.composable.PackingInstanceSource method) (olmo_core.data.composable.RandomInstanceSource method) (olmo_core.data.composable.SamplingInstanceSource method) (olmo_core.data.composable.SlicedInstanceSource method) (olmo_core.data.composable.TokenSource method) (olmo_core.data.numpy_dataset.NumpyDatasetBase method) (olmo_core.data.numpy_dataset.NumpyFSLDataset method) (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDataset method) (olmo_core.data.numpy_dataset.NumpyPackedFSLDataset method) (olmo_core.data.numpy_dataset.NumpyPaddedFSLDataset method) (olmo_core.data.numpy_dataset.NumpyVSLDataset method) __iter__() (olmo_core.data.composable.InstanceSource method) (olmo_core.data.data_loader.DataLoaderBase method) (olmo_core.data.data_loader.TextDataLoaderBase method) __len__() (olmo_core.data.composable.ConcatAndChunkInstanceSource method) (olmo_core.data.composable.ConcatenatedInstanceSource method) (olmo_core.data.composable.InstanceSource method) (olmo_core.data.composable.MixingInstanceSource method) (olmo_core.data.composable.PackingInstanceSource method) (olmo_core.data.composable.RandomInstanceSource method) (olmo_core.data.composable.SamplingInstanceSource method) (olmo_core.data.composable.SlicedInstanceSource method) (olmo_core.data.composable.TokenSource method) (olmo_core.data.data_loader.DataLoaderBase method) (olmo_core.data.numpy_dataset.NumpyDatasetBase method) (olmo_core.data.numpy_dataset.NumpyFSLDataset method) (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDataset method) (olmo_core.data.numpy_dataset.NumpyPackedFSLDataset method) (olmo_core.data.numpy_dataset.NumpyVSLDataset method) __mul__() (olmo_core.data.composable.InstanceSource method) (olmo_core.data.composable.InstanceSourceConfig method) (olmo_core.data.composable.TokenSource method) (olmo_core.data.composable.TokenSourceConfig method) _iter_batches() (olmo_core.data.data_loader.DataLoaderBase method) (olmo_core.data.data_loader.NumpyDataLoaderBase method) A ABFRoPEScalingConfig (class in olmo_core.nn.rope) activation (olmo_core.nn.feed_forward.FeedForwardConfig attribute) activation_memory_budget (olmo_core.train.train_module.TransformerActivationCheckpointingConfig attribute) ActivationFunction (class in olmo_core.nn.feed_forward) active (olmo_core.train.callbacks.ProfilerCallback attribute) AdamConfig (class in olmo_core.optim) AdamWConfig (class in olmo_core.optim) add_cached_path_clients() (in module olmo_core.io) add_callback() (olmo_core.train.Trainer method) (olmo_core.train.TrainerConfig method) add_callbacks() (olmo_core.train.TrainerConfig method) adjust_lr (olmo_core.optim.MuonConfig attribute) all (olmo_core.train.callbacks.CometNotificationSetting attribute) (olmo_core.train.callbacks.SlackNotificationSetting attribute) all_gather() (in module olmo_core.distributed.utils) all_gather_object() (in module olmo_core.distributed.utils) all_non_permanent (olmo_core.train.callbacks.CheckpointRemovalStrategy attribute) all_ranks (olmo_core.utils.LogFilterType attribute) all_reduce_value() (in module olmo_core.distributed.utils) allow_dirty (olmo_core.launch.beaker.BeakerLaunchConfig attribute) allow_neg_eigval (olmo_core.nn.attention.GatedDeltaNetConfig attribute) always (olmo_core.train.LoadStrategy attribute) AOFloat8LinearConfig (class in olmo_core.float8) AOFloat8LinearRecipe (class in olmo_core.float8) AOMXLinearConfig (class in olmo_core.float8) apply() (olmo_core.config.Config method) apply_activation_checkpointing() (olmo_core.nn.transformer.Transformer method) apply_compile() (olmo_core.nn.transformer.NormalizedTransformer method) (olmo_core.nn.transformer.Transformer method) apply_cp() (olmo_core.nn.attention.Attention method) (olmo_core.nn.attention.AttentionBackend method) (olmo_core.nn.attention.TEAttentionBackend method) (olmo_core.nn.transformer.Transformer method) apply_ddp() (olmo_core.nn.transformer.Transformer method) apply_ep() (olmo_core.nn.moe.MoEBase method) apply_float8_linear() (olmo_core.float8.Float8Config method) apply_fp8() (olmo_core.nn.transformer.Transformer method) apply_fsdp() (olmo_core.nn.transformer.Transformer method) apply_pp() (olmo_core.nn.transformer.Transformer method) apply_to_tensors() (in module olmo_core.utils) apply_tp() (olmo_core.nn.transformer.NormalizedTransformer method) (olmo_core.nn.transformer.Transformer method) as_config_dict() (olmo_core.config.Config method) as_dict() (olmo_core.config.Config method) as_np_dtype() (olmo_core.data.types.NumpyDatasetDType method) assert_supported() (olmo_core.nn.attention.AttentionBackend class method) (olmo_core.nn.attention.FlashAttention2Backend class method) (olmo_core.nn.attention.FlashAttention3Backend class method) (olmo_core.nn.attention.FlashAttention4Backend class method) (olmo_core.nn.attention.TEAttentionBackend class method) (olmo_core.nn.attention.TorchAttentionBackend class method) assert_supports_kv_cache() (olmo_core.nn.attention.AttentionBackend class method) (olmo_core.nn.attention.FlashAttention2Backend class method) (olmo_core.nn.attention.FlashAttention3Backend class method) (olmo_core.nn.attention.FlashAttention4Backend class method) (olmo_core.nn.attention.TEAttentionBackend class method) (olmo_core.nn.attention.TorchAttentionBackend class method) assert_supports_packed_qkv() (olmo_core.nn.attention.AttentionBackend class method) (olmo_core.nn.attention.FlashAttention2Backend class method) (olmo_core.nn.attention.FlashAttention3Backend class method) (olmo_core.nn.attention.FlashAttention4Backend class method) (olmo_core.nn.attention.TEAttentionBackend class method) (olmo_core.nn.attention.TorchAttentionBackend class method) assert_supports_ring_cp() (olmo_core.nn.attention.AttentionBackend class method) (olmo_core.nn.attention.FlashAttention2Backend class method) (olmo_core.nn.attention.FlashAttention3Backend class method) (olmo_core.nn.attention.FlashAttention4Backend class method) (olmo_core.nn.attention.TorchAttentionBackend class method) assert_supports_swa() (olmo_core.nn.attention.AttentionBackend class method) (olmo_core.nn.attention.FlashAttention2Backend class method) (olmo_core.nn.attention.FlashAttention3Backend class method) (olmo_core.nn.attention.FlashAttention4Backend class method) (olmo_core.nn.attention.TEAttentionBackend class method) (olmo_core.nn.attention.TorchAttentionBackend class method) assert_supports_ulysses_cp() (olmo_core.nn.attention.AttentionBackend class method) (olmo_core.nn.attention.FlashAttention2Backend class method) (olmo_core.nn.attention.FlashAttention3Backend class method) (olmo_core.nn.attention.FlashAttention4Backend class method) (olmo_core.nn.attention.TorchAttentionBackend class method) async_bookkeeping (olmo_core.train.Trainer attribute) async_save_model_and_optim_state() (in module olmo_core.distributed.checkpoint) async_save_state_dict() (in module olmo_core.distributed.checkpoint) Attention (class in olmo_core.nn.attention) attention (olmo_core.nn.transformer.TransformerBlockConfig attribute) attention_mask_to_cache_leftpad() (in module olmo_core.data.utils) attention_rescale_factor (olmo_core.nn.rope.ABFRoPEScalingConfig attribute) (olmo_core.nn.rope.PIRoPEScalingConfig attribute) (olmo_core.nn.rope.StepwiseRoPEScalingConfig attribute) attention_residual_alpha (olmo_core.nn.transformer.TransformerBlockConfig attribute) AttentionBackend (class in olmo_core.nn.attention) AttentionBackendName (class in olmo_core.nn.attention) AttentionConfig (class in olmo_core.nn.attention) AttentionType (class in olmo_core.nn.attention) auto_resume (olmo_core.train.callbacks.CometCallback attribute) aws_config_secret (olmo_core.launch.beaker.BeakerLaunchConfig attribute) aws_credentials_secret (olmo_core.launch.beaker.BeakerLaunchConfig attribute) B backend (olmo_core.model_ladder.ModelLadder attribute) backend_supports_cpu() (in module olmo_core.distributed.utils) backend_supports_cuda() (in module olmo_core.distributed.utils) balanced (olmo_core.data.numpy_dataset.VSLGrowthCurriculum attribute) barrier() (in module olmo_core.distributed.utils) BasicTrainModule (class in olmo_core.train.train_module) batch_shard() (olmo_core.nn.attention.RingAttentionLlama3LoadBalancer method) (olmo_core.nn.attention.RingAttentionLoadBalancer method) (olmo_core.nn.attention.RingAttentionZigZagLoadBalancer method) (olmo_core.nn.attention.UlyssesLoadBalancer method) batch_shard_by_document() (olmo_core.nn.attention.RingAttentionLlama3LoadBalancer method) (olmo_core.nn.attention.RingAttentionLoadBalancer method) (olmo_core.nn.attention.RingAttentionZigZagLoadBalancer method) (olmo_core.nn.attention.UlyssesLoadBalancer method) batch_size_unit (olmo_core.train.train_module.EvalBatchSpec attribute) batch_sizes (olmo_core.train.callbacks.BatchSizeSchedulerCallback attribute) batches_in_epoch() (olmo_core.data.composable.ComposableDataLoader method) (olmo_core.data.data_loader.DataLoaderBase method) batches_processed (olmo_core.data.data_loader.DataLoaderBase attribute) BatchSizeSchedulerCallback (class in olmo_core.train.callbacks) beaker_image (olmo_core.launch.beaker.BeakerLaunchConfig attribute) BeakerCallback (class in olmo_core.train.callbacks) BeakerEnvSecret (class in olmo_core.launch.beaker) BeakerEnvVar (class in olmo_core.launch.beaker) BeakerInsufficientResourcesError BeakerLaunchConfig (class in olmo_core.launch.beaker) BeakerWekaBucket (class in olmo_core.launch.beaker) beta_fast (olmo_core.nn.rope.YaRNRoPEScalingConfig attribute) beta_slow (olmo_core.nn.rope.YaRNRoPEScalingConfig attribute) betas (olmo_core.optim.DionConfig attribute) (olmo_core.optim.MuonConfig attribute) block_ephemeral_checkpoints() (olmo_core.train.callbacks.Callback method) block_interval (olmo_core.train.train_module.TransformerActivationCheckpointingConfig attribute) block_size (olmo_core.float8.AOMXLinearConfig attribute) blocks (olmo_core.train.train_module.TransformerDataParallelWrappingStrategy attribute) bookkeeping_device (olmo_core.train.Trainer property) bookkeeping_pg (olmo_core.train.Trainer property) bookkeeping_soft_timeout (olmo_core.train.Trainer attribute) bos_token_id (olmo_core.data.tokenizer.TokenizerConfig attribute) broadcast_object() (in module olmo_core.distributed.utils) bucket_documents() (in module olmo_core.data.utils) budget (olmo_core.launch.beaker.BeakerLaunchConfig attribute) (olmo_core.train.train_module.TransformerActivationCheckpointingMode attribute) build() (olmo_core.data.composable.ComposableDataLoaderConfig method) (olmo_core.data.composable.ConcatAndChunkInstanceSourceConfig method) (olmo_core.data.composable.ConcatenatedDocumentSourceConfig method) (olmo_core.data.composable.ConcatenatedInstanceSourceConfig method) (olmo_core.data.composable.ConcatenatedTokenSourceConfig method) (olmo_core.data.composable.DocumentSourceConfig method) (olmo_core.data.composable.InstanceSourceConfig method) (olmo_core.data.composable.MixingDocumentSourceConfig method) (olmo_core.data.composable.MixingInstanceSourceConfig method) (olmo_core.data.composable.MixingTokenSourceConfig method) (olmo_core.data.composable.NumpyDocumentSourceConfig method) (olmo_core.data.composable.NumpyDocumentSourceMixConfig method) (olmo_core.data.composable.PackingInstanceSourceConfig method) (olmo_core.data.composable.RandomInstanceSourceConfig method) (olmo_core.data.composable.SamplingDocumentSourceConfig method) (olmo_core.data.composable.SamplingInstanceSourceConfig method) (olmo_core.data.composable.SamplingTokenSourceConfig method) (olmo_core.data.composable.SplitInstanceSourceConfig method) (olmo_core.data.composable.SplitTokenSourceConfig method) (olmo_core.data.composable.TokenSourceConfig method) (olmo_core.data.data_loader.NumpyDataLoaderConfig method) (olmo_core.data.mixes.DataMix method) (olmo_core.data.mixes.DataMixBase method) (olmo_core.data.numpy_dataset.NumpyDatasetConfig method) (olmo_core.data.numpy_dataset.NumpyFSLDatasetConfig method) (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDatasetConfig method) (olmo_core.data.numpy_dataset.NumpyPackedFSLDatasetConfig method) (olmo_core.data.numpy_dataset.NumpyPaddedFSLDatasetConfig method) (olmo_core.data.numpy_dataset.NumpyVSLDatasetConfig method) (olmo_core.data.numpy_dataset.VSLCurriculumConfig method) (olmo_core.generate.generation_module.TransformerGenerationModuleConfig method) (olmo_core.nn.attention.AttentionConfig method) (olmo_core.nn.attention.GatedDeltaNetConfig method) (olmo_core.nn.attention.RingAttentionLoadBalancerType method) (olmo_core.nn.feed_forward.FeedForwardConfig method) (olmo_core.nn.layer_norm.LayerNormConfig method) (olmo_core.nn.lm_head.LMHeadConfig method) (olmo_core.nn.moe.MoEConfig method) (olmo_core.nn.moe.MoERouterConfig method) (olmo_core.nn.rope.RoPEConfig method) (olmo_core.nn.transformer.TransformerBlockConfig method) (olmo_core.nn.transformer.TransformerConfig method) (olmo_core.optim.MatrixAwareOptimConfig method) (olmo_core.optim.OptimConfig method) (olmo_core.train.callbacks.CallbackConfig method) (olmo_core.train.TrainerConfig method) build_groups() (olmo_core.optim.MatrixAwareOptimConfig method) (olmo_core.optim.MuonConfig method) (olmo_core.optim.OptimConfig method) build_parallelism_config() (olmo_core.optim.DionConfig method) (olmo_core.optim.MuonConfig method) build_train_module() (olmo_core.model_ladder.ModelConfigurator method) (olmo_core.model_ladder.TransformerModelConfigurator method) build_world_mesh() (in module olmo_core.distributed.parallel) C Callback (class in olmo_core.train.callbacks) CallbackConfig (class in olmo_core.train.callbacks) callbacks (olmo_core.train.Trainer attribute) cancel_after_first_eval (olmo_core.train.callbacks.EvaluatorCallback attribute) cancel_check_interval (olmo_core.train.callbacks.CometCallback attribute) (olmo_core.train.callbacks.WandBCallback attribute) (olmo_core.train.Trainer attribute) cancel_run() (olmo_core.train.Trainer method) cancel_tags (olmo_core.train.callbacks.CometCallback attribute) (olmo_core.train.callbacks.WandBCallback attribute) cautious_wd (olmo_core.optim.MuonConfig attribute) ce_loss (olmo_core.nn.lm_head.LMOutputWithLoss attribute) check_if_canceled() (olmo_core.train.Trainer method) checkpoint_loaded (olmo_core.train.Trainer property) Checkpointer (class in olmo_core.train) checkpointer (olmo_core.train.Trainer attribute) CheckpointerCallback (class in olmo_core.train.callbacks) CheckpointerConfig (class in olmo_core.train) CheckpointRemovalStrategy (class in olmo_core.train.callbacks) children() (olmo_core.data.composable.ConcatAndChunkInstanceSource method) (olmo_core.data.composable.ConcatenatedDocumentSource method) (olmo_core.data.composable.ConcatenatedInstanceSource method) (olmo_core.data.composable.ConcatenatedTokenSource method) (olmo_core.data.composable.InMemoryDocumentSource method) (olmo_core.data.composable.InMemoryTokenSource method) (olmo_core.data.composable.MixingDocumentSource method) (olmo_core.data.composable.MixingInstanceSource method) (olmo_core.data.composable.MixingTokenSource method) (olmo_core.data.composable.NumpyDocumentSource method) (olmo_core.data.composable.PackingInstanceSource method) (olmo_core.data.composable.RandomInstanceSource method) (olmo_core.data.composable.SamplingDocumentSource method) (olmo_core.data.composable.SamplingInstanceSource method) (olmo_core.data.composable.SamplingTokenSource method) (olmo_core.data.composable.SlicedInstanceSource method) (olmo_core.data.composable.SlicedTokenSource method) (olmo_core.data.composable.SourceABC method) chinchilla_multiple (olmo_core.model_ladder.WSDSChinchillaRunConfigurator attribute) chinchilla_tokens() (olmo_core.train.Duration class method) chunk_size_bytes (olmo_core.distributed.checkpoint.UnshardStrategy attribute) chunked() (in module olmo_core.data.utils) chunks (olmo_core.distributed.checkpoint.UnshardStrategyType attribute) chunks() (olmo_core.distributed.checkpoint.UnshardStrategy class method) chunks_per_doc (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDatasetConfig attribute) CLASS_NAME_FIELD (olmo_core.config.Config attribute) clear_directory() (in module olmo_core.io) close() (olmo_core.train.callbacks.Callback method) clusters (olmo_core.launch.beaker.BeakerLaunchConfig attribute) cmd (olmo_core.launch.beaker.BeakerLaunchConfig attribute) collator (olmo_core.data.data_loader.TextDataLoaderBase attribute) CometCallback (class in olmo_core.train.callbacks) CometNotificationSetting (class in olmo_core.train.callbacks) common_work_dir (olmo_core.data.composable.SourceABC property) compile (olmo_core.optim.OptimConfig attribute) complex (olmo_core.nn.rope.RoPEType attribute) ComplexRotaryEmbedding (class in olmo_core.nn.rope) ComposableDataLoader (class in olmo_core.data.composable) ComposableDataLoaderConfig (class in olmo_core.data.composable) compute() (olmo_core.eval.metrics.MeanMetric method) (olmo_core.eval.metrics.Metric method) compute_metrics() (olmo_core.eval.evaluator.Evaluator method) (olmo_core.eval.lm_evaluator.LMEvaluator method) compute_scaled_inv_freq() (olmo_core.nn.rope.ABFRoPEScalingConfig method) (olmo_core.nn.rope.PIRoPEScalingConfig method) (olmo_core.nn.rope.RoPEScalingConfig method) (olmo_core.nn.rope.StepwiseRoPEScalingConfig method) (olmo_core.nn.rope.YaRNRoPEScalingConfig method) ConcatAndChunkInstanceSource (class in olmo_core.data.composable) ConcatAndChunkInstanceSourceConfig (class in olmo_core.data.composable) ConcatenatedDocumentSource (class in olmo_core.data.composable) ConcatenatedDocumentSourceConfig (class in olmo_core.data.composable) ConcatenatedInstanceSource (class in olmo_core.data.composable) ConcatenatedInstanceSourceConfig (class in olmo_core.data.composable) ConcatenatedTokenSource (class in olmo_core.data.composable) ConcatenatedTokenSourceConfig (class in olmo_core.data.composable) Config (class in olmo_core.config) (olmo_core.data.composable.ComposableDataLoader attribute) (olmo_core.data.composable.ConcatAndChunkInstanceSource attribute) (olmo_core.data.composable.ConcatenatedDocumentSource attribute) (olmo_core.data.composable.ConcatenatedInstanceSource attribute) (olmo_core.data.composable.ConcatenatedTokenSource attribute) (olmo_core.data.composable.MixingDocumentSource attribute) (olmo_core.data.composable.MixingDocumentSourceSpec attribute) (olmo_core.data.composable.MixingInstanceSource attribute) (olmo_core.data.composable.MixingInstanceSourceSpec attribute) (olmo_core.data.composable.MixingTokenSource attribute) (olmo_core.data.composable.MixingTokenSourceSpec attribute) (olmo_core.data.composable.NumpyDocumentSource attribute) (olmo_core.data.composable.PackingInstanceSource attribute) (olmo_core.data.composable.RandomInstanceSource attribute) (olmo_core.data.composable.SamplingDocumentSource attribute) (olmo_core.data.composable.SamplingInstanceSource attribute) (olmo_core.data.composable.SamplingTokenSource attribute) config (olmo_core.train.callbacks.BeakerCallback attribute) (olmo_core.train.callbacks.CometCallback attribute) (olmo_core.train.callbacks.ConfigSaverCallback property) (olmo_core.train.callbacks.WandBCallback attribute) ConfigSaverCallback (class in olmo_core.train.callbacks) configure_checkpoint_intervals() (olmo_core.model_ladder.RunConfigurator method) (olmo_core.model_ladder.WSDSChinchillaRunConfigurator method) configure_duration() (olmo_core.model_ladder.RunConfigurator method) (olmo_core.model_ladder.WSDSChinchillaRunConfigurator method) configure_lr_scheduler() (olmo_core.model_ladder.RunConfigurator method) (olmo_core.model_ladder.WSDSChinchillaRunConfigurator method) configure_minimal_device_mesh_spec() (olmo_core.model_ladder.ModelConfigurator method) (olmo_core.model_ladder.TransformerModelConfigurator method) configure_model() (olmo_core.model_ladder.ModelConfigurator method) (olmo_core.model_ladder.Olmo3ModelConfigurator method) configure_optimizer() (olmo_core.model_ladder.RunConfigurator method) (olmo_core.model_ladder.WSDSChinchillaRunConfigurator method) configure_rank_microbatch_size() (olmo_core.model_ladder.ModelConfigurator method) (olmo_core.model_ladder.TransformerModelConfigurator method) configure_target_batch_size() (olmo_core.model_ladder.RunConfigurator method) (olmo_core.model_ladder.WSDSChinchillaRunConfigurator method) ConsoleLoggerCallback (class in olmo_core.train.callbacks) ConstantScheduler (class in olmo_core.optim) ConstantWithWarmup (class in olmo_core.optim) contains_checkpoint() (olmo_core.train.Checkpointer class method) conv_bias (olmo_core.nn.attention.GatedDeltaNetConfig attribute) conv_size (olmo_core.nn.attention.GatedDeltaNetConfig attribute) convert() (olmo_core.nn.conversion.StateConverter method) convert_checkpoint_to_hf() (in module olmo_core.nn.hf) convert_duration_to_steps() (olmo_core.train.Trainer method) convert_hybrid_state_to_hf() (in module olmo_core.nn.hf) convert_state_from_hf() (in module olmo_core.nn.hf) convert_state_to_hf() (in module olmo_core.nn.hf) copy() (olmo_core.config.Config method) copy_dir() (in module olmo_core.io) copy_file() (in module olmo_core.io) CosWithWarmup (class in olmo_core.optim) CosWithWarmupAndLinearDecay (class in olmo_core.optim) cp (olmo_core.distributed.parallel.MeshDimName attribute) create_optimizer() (olmo_core.optim.DionConfig method) (olmo_core.optim.MatrixAwareOptimConfig method) (olmo_core.optim.MuonConfig method) cross_entropy_loss() (in module olmo_core.nn.functional) cuda_sync_debug_mode() (in module olmo_core.utils) cute_rms (olmo_core.nn.layer_norm.LayerNormType attribute) CuTeRMSNorm (class in olmo_core.nn.layer_norm) D data_loader (olmo_core.model_ladder.ModelLadder attribute) (olmo_core.train.Trainer attribute) DataCollator (class in olmo_core.data.collator) DataLoaderBase (class in olmo_core.data.data_loader) DataMix (class in olmo_core.data.mixes) DataMixBase (class in olmo_core.data.mixes) DataParallelConfig (class in olmo_core.distributed.parallel) DataParallelType (class in olmo_core.distributed.parallel) debug (olmo_core.train.callbacks.HFConverterCallback attribute) decay_fraction (olmo_core.model_ladder.WSDSChinchillaRunConfigurator attribute) default (olmo_core.nn.attention.AttentionType attribute) (olmo_core.nn.feed_forward.FeedForwardType attribute) (olmo_core.nn.layer_norm.LayerNormType attribute) (olmo_core.nn.lm_head.LMHeadType attribute) (olmo_core.nn.lm_head.LMLossImplementation attribute) (olmo_core.nn.moe.MoERouterType attribute) (olmo_core.nn.moe.MoEType attribute) (olmo_core.nn.rope.RoPEType attribute) (olmo_core.nn.transformer.TransformerBlockType attribute) (olmo_core.nn.transformer.TransformerType attribute) default_env_vars (olmo_core.launch.beaker.BeakerLaunchConfig property) default_group_overrides() (olmo_core.optim.DionConfig method) (olmo_core.optim.MatrixAwareOptimConfig method) (olmo_core.optim.MuonConfig method) default_scaled (olmo_core.nn.transformer.TransformerBlockType attribute) degree (olmo_core.distributed.parallel.ExpertParallelConfig attribute) (olmo_core.distributed.parallel.PipelineParallelConfig attribute) (olmo_core.distributed.parallel.TensorParallelConfig attribute) description (olmo_core.launch.beaker.BeakerLaunchConfig attribute) deserialize_from_tensor() (in module olmo_core.io) dest_chunk_dim (olmo_core.nn.conversion.StateMapping attribute) (olmo_core.nn.conversion.StateMappingTemplate attribute) dest_key_per_placeholder (olmo_core.nn.conversion.StateMappingTemplate attribute) dest_keys (olmo_core.nn.conversion.StateMapping attribute) dest_template_keys (olmo_core.nn.conversion.StateMappingTemplate attribute) deterministic_glob_directory() (in module olmo_core.io) device (olmo_core.train.callbacks.HFConverterCallback attribute) (olmo_core.train.Trainer attribute) device_type (olmo_core.model_ladder.ModelLadder attribute) DeviceMeshSpec (class in olmo_core.model_ladder) dims_permutation (olmo_core.nn.conversion.StateMapping attribute) (olmo_core.nn.conversion.StateMappingTemplate attribute) DionConfig (class in olmo_core.optim) dir (olmo_core.model_ladder.ModelLadder attribute) dir_is_checkpoint() (olmo_core.train.Checkpointer class method) dir_is_empty() (in module olmo_core.io) display() (olmo_core.model_ladder.RunCheckpointInfo method) do_n_at_a_time() (in module olmo_core.distributed.utils) do_sample (olmo_core.generate.generation_module.GenerationConfig attribute) docs_per_instance (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDatasetConfig attribute) DocumentSource (class in olmo_core.data.composable) DocumentSourceConfig (class in olmo_core.data.composable) dolma2 (olmo_core.data.tokenizer.TokenizerName attribute) dolma2() (olmo_core.data.tokenizer.TokenizerConfig class method) dolma2_sigdig (olmo_core.data.tokenizer.TokenizerName attribute) dolma2_sigdig() (olmo_core.data.tokenizer.TokenizerConfig class method) DownstreamEvaluatorCallbackConfig (class in olmo_core.train.callbacks) dp (olmo_core.distributed.parallel.MeshDimName attribute) dp_process_group (olmo_core.generate.generation_module.GenerationModule property) (olmo_core.generate.generation_module.TransformerGenerationModule property) (olmo_core.train.train_module.TrainModule property) (olmo_core.train.train_module.TransformerPipelineTrainModule property) (olmo_core.train.train_module.TransformerTrainModule property) (olmo_core.train.Trainer attribute) dp_replicate (olmo_core.distributed.parallel.MeshDimName attribute) dp_shard (olmo_core.distributed.parallel.MeshDimName attribute) dp_world_size (olmo_core.model_ladder.DeviceMeshSpec attribute) DPMeshDimName (class in olmo_core.distributed.parallel) dropless (olmo_core.nn.moe.MoEType attribute) DroplessMoE (class in olmo_core.nn.moe) DroplessMoEMLP (class in olmo_core.nn.moe) dropout (olmo_core.nn.transformer.TransformerBlockConfig attribute) dry_run() (olmo_core.launch.beaker.BeakerLaunchConfig method) (olmo_core.model_ladder.ModelLadder method) DType (class in olmo_core.config) dtype (olmo_core.data.composable.NumpyDocumentSourceConfigBase attribute) (olmo_core.data.numpy_dataset.NumpyDatasetBase property) (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) (olmo_core.generate.generation_module.TransformerGenerationModuleConfig attribute) (olmo_core.nn.attention.GatedDeltaNetConfig attribute) (olmo_core.train.callbacks.HFConverterCallback attribute) due() (olmo_core.train.Duration method) dump_gradients (olmo_core.train.callbacks.GAPMonitorCallback attribute) dump_gradients_end_step (olmo_core.train.callbacks.GAPMonitorCallback attribute) dump_gradients_save_first_n (olmo_core.train.callbacks.GAPMonitorCallback attribute) dump_gradients_start_step (olmo_core.train.callbacks.GAPMonitorCallback attribute) dump_gradients_step_interval (olmo_core.train.callbacks.GAPMonitorCallback attribute) Duration (class in olmo_core.train) DurationUnit (class in olmo_core.train) E elem_dtype (olmo_core.float8.AOMXLinearConfig attribute) elem_dtype_grad_output_override (olmo_core.float8.AOMXLinearConfig attribute) elem_dtype_weight_override (olmo_core.float8.AOMXLinearConfig attribute) elementwise (olmo_core.nn.attention.GateGranularity attribute) enable_async (olmo_core.distributed.parallel.TensorParallelConfig attribute) enable_cuda_sync_events (olmo_core.train.callbacks.ProfilerCallback attribute) enabled (olmo_core.train.callbacks.CometCallback attribute) (olmo_core.train.callbacks.GAPMonitorCallback attribute) (olmo_core.train.callbacks.HFConverterCallback attribute) (olmo_core.train.callbacks.ProfilerCallback attribute) (olmo_core.train.callbacks.SlackNotifierCallback attribute) (olmo_core.train.callbacks.StabilityMonitorCallback attribute) (olmo_core.train.callbacks.WandBCallback attribute) end (olmo_core.data.utils.RepetitionTuple attribute) end_only (olmo_core.train.callbacks.CometNotificationSetting attribute) (olmo_core.train.callbacks.SlackNotificationSetting attribute) entity (olmo_core.train.callbacks.WandBCallback attribute) env_secrets (olmo_core.launch.beaker.BeakerLaunchConfig attribute) env_vars (olmo_core.launch.beaker.BeakerLaunchConfig attribute) eos_token_id (olmo_core.data.tokenizer.TokenizerConfig attribute) (olmo_core.generate.generation_module.GenerationConfig attribute) ep (olmo_core.distributed.parallel.MeshDimName attribute) ephemeral_cooldown (olmo_core.train.callbacks.CheckpointerCallback attribute) ephemeral_only (olmo_core.train.callbacks.CheckpointRemovalStrategy attribute) ephemeral_save_interval (olmo_core.train.callbacks.CheckpointerCallback attribute) epoch (olmo_core.data.data_loader.DataLoaderBase property) (olmo_core.train.Trainer attribute) epochs (olmo_core.train.DurationUnit attribute) epochs() (olmo_core.train.Duration class method) eval_batch() (olmo_core.train.train_module.BasicTrainModule method) (olmo_core.train.train_module.TrainModule method) eval_batch_spec (olmo_core.train.train_module.BasicTrainModule property) (olmo_core.train.train_module.TrainModule property) (olmo_core.train.train_module.TransformerPipelineTrainModule property) (olmo_core.train.train_module.TransformerTrainModule property) eval_duration (olmo_core.train.callbacks.EvaluatorCallback attribute) eval_interval (olmo_core.train.callbacks.EvaluatorCallback attribute) eval_on_finish (olmo_core.train.callbacks.EvaluatorCallback attribute) eval_on_startup (olmo_core.train.callbacks.EvaluatorCallback attribute) EvalBatchSizeUnit (class in olmo_core.train.train_module) EvalBatchSpec (class in olmo_core.train.train_module) Evaluator (class in olmo_core.eval.evaluator) EvaluatorCallback (class in olmo_core.train.callbacks) evaluators (olmo_core.train.callbacks.EvaluatorCallback attribute) excepthook() (in module olmo_core.utils) exists (olmo_core.model_ladder.RunCheckpointInfo attribute) expand_glob (olmo_core.data.composable.NumpyDocumentSourceConfig attribute) (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) expand_v (olmo_core.nn.attention.GatedDeltaNetConfig attribute) EXPERT (olmo_core.nn.conversion.TemplatePlaceholder attribute) ExpertParallelConfig (class in olmo_core.distributed.parallel) ExponentialScheduler (class in olmo_core.optim) extra_repr() (olmo_core.nn.layer_norm.LayerNorm method) (olmo_core.nn.moe.MoELinearRouter method) (olmo_core.nn.moe.MoEMLP method) F factor (olmo_core.nn.rope.PIRoPEScalingConfig attribute) (olmo_core.nn.rope.StepwiseRoPEScalingConfig attribute) (olmo_core.nn.rope.YaRNRoPEScalingConfig attribute) failure_only (olmo_core.train.callbacks.CometNotificationSetting attribute) (olmo_core.train.callbacks.SlackNotificationSetting attribute) failure_tag (olmo_core.train.callbacks.CometCallback attribute) fan_in (olmo_core.nn.transformer.InitMethod attribute) feed_forward (olmo_core.nn.transformer.TransformerBlockConfig attribute) feed_forward_moe (olmo_core.nn.transformer.TransformerBlockConfig attribute) feed_forward_residual_alpha (olmo_core.nn.transformer.TransformerBlockConfig attribute) FeedForward (class in olmo_core.nn.feed_forward) FeedForwardConfig (class in olmo_core.nn.feed_forward) FeedForwardType (class in olmo_core.nn.feed_forward) file_exists() (in module olmo_core.io) file_sizes (olmo_core.data.numpy_dataset.NumpyDatasetBase property) (olmo_core.data.numpy_dataset.NumpyFSLDataset property) filter_warnings() (in module olmo_core.utils) final_metrics_fname (olmo_core.train.callbacks.MetricSaverCallback attribute) find_checkpoints() (olmo_core.train.Checkpointer class method) find_end_first_consecutive_true() (in module olmo_core.data.utils) find_periodic_sequences() (in module olmo_core.data.utils) find_start_last_consecutive_true() (in module olmo_core.data.utils) fine_grained (olmo_core.train.train_module.TransformerDataParallelWrappingStrategy attribute) fingerprint (olmo_core.data.composable.ConcatAndChunkInstanceSource property) (olmo_core.data.composable.ConcatenatedInstanceSource property) (olmo_core.data.composable.ConcatenatedTokenSource property) (olmo_core.data.composable.InMemoryDocumentSource property) (olmo_core.data.composable.InMemoryTokenSource property) (olmo_core.data.composable.MixingDocumentSource property) (olmo_core.data.composable.MixingInstanceSource property) (olmo_core.data.composable.MixingTokenSource property) (olmo_core.data.composable.NumpyDocumentSource property) (olmo_core.data.composable.PackingInstanceSource property) (olmo_core.data.composable.RandomInstanceSource property) (olmo_core.data.composable.SamplingDocumentSource property) (olmo_core.data.composable.SamplingInstanceSource property) (olmo_core.data.composable.SamplingTokenSource property) (olmo_core.data.composable.SlicedInstanceSource property) (olmo_core.data.composable.SlicedTokenSource property) (olmo_core.data.composable.SourceABC property) (olmo_core.data.numpy_dataset.NumpyDatasetBase property) fingerprint_fields (olmo_core.data.numpy_dataset.NumpyDatasetBase property) (olmo_core.data.numpy_dataset.NumpyFSLDataset property) (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDataset property) (olmo_core.data.numpy_dataset.NumpyPackedFSLDataset property) (olmo_core.data.numpy_dataset.NumpyPaddedFSLDataset property) (olmo_core.data.numpy_dataset.NumpyVSLDataset property) fingerprint_version (olmo_core.data.numpy_dataset.NumpyDatasetBase property) fit() (olmo_core.train.Trainer method) fixed_fields (olmo_core.optim.OptimConfig attribute) fixed_sequence_length (olmo_core.train.train_module.EvalBatchSpec attribute) fixed_steps (olmo_core.train.callbacks.CheckpointerCallback attribute) (olmo_core.train.callbacks.EvaluatorCallback attribute) (olmo_core.train.callbacks.MetricSaverCallback attribute) flash_2 (olmo_core.nn.attention.AttentionBackendName attribute) flash_3 (olmo_core.nn.attention.AttentionBackendName attribute) flash_4 (olmo_core.nn.attention.AttentionBackendName attribute) FlashAttention2Backend (class in olmo_core.nn.attention) FlashAttention3Backend (class in olmo_core.nn.attention) FlashAttention4Backend (class in olmo_core.nn.attention) flatten (olmo_core.optim.MuonConfig attribute) flatten_dict() (in module olmo_core.utils) flatten_dims (olmo_core.nn.conversion.StateMapping attribute) (olmo_core.nn.conversion.StateMappingTemplate attribute) flatten_mesh() (in module olmo_core.distributed.parallel) Float8Config (class in olmo_core.float8) follow (olmo_core.launch.beaker.BeakerLaunchConfig attribute) force_full_attention_on_first_layer (olmo_core.nn.attention.SlidingWindowAttentionConfig attribute) force_full_attention_on_last_layer (olmo_core.nn.attention.SlidingWindowAttentionConfig attribute) foreach (olmo_core.optim.SkipStepAdamWConfig attribute) format_int() (in module olmo_core.utils) forward() (olmo_core.nn.attention.Attention method) (olmo_core.nn.attention.AttentionBackend method) (olmo_core.nn.attention.FlashAttention2Backend method) (olmo_core.nn.attention.FlashAttention3Backend method) (olmo_core.nn.attention.FlashAttention4Backend method) (olmo_core.nn.attention.FusedAttention method) (olmo_core.nn.attention.GatedDeltaNet method) (olmo_core.nn.attention.NormalizedAttention method) (olmo_core.nn.attention.TEAttentionBackend method) (olmo_core.nn.attention.TorchAttentionBackend method) (olmo_core.nn.feed_forward.FeedForward method) (olmo_core.nn.feed_forward.NormalizedFeedForward method) (olmo_core.nn.layer_norm.CuTeRMSNorm method) (olmo_core.nn.layer_norm.FusedRMSNorm method) (olmo_core.nn.layer_norm.L2Norm method) (olmo_core.nn.layer_norm.LayerNorm method) (olmo_core.nn.layer_norm.QwenRMSNorm method) (olmo_core.nn.layer_norm.RMSNorm method) (olmo_core.nn.lm_head.LMHead method) (olmo_core.nn.lm_head.NormalizedLMHead method) (olmo_core.nn.moe.DroplessMoEMLP method) (olmo_core.nn.moe.MoEBase method) (olmo_core.nn.moe.MoEMLP method) (olmo_core.nn.moe.MoERouter method) (olmo_core.nn.rope.ComplexRotaryEmbedding method) (olmo_core.nn.rope.FusedRotaryEmbedding method) (olmo_core.nn.rope.RotaryEmbedding method) (olmo_core.nn.transformer.LayerNormScaledTransformerBlock method) (olmo_core.nn.transformer.MoEHybridTransformerBlockBase method) (olmo_core.nn.transformer.MoEReorderedNormTransformerBlock method) (olmo_core.nn.transformer.MoETransformerBlock method) (olmo_core.nn.transformer.NormalizedTransformerBlock method) (olmo_core.nn.transformer.PeriNormTransformerBlock method) (olmo_core.nn.transformer.ReorderedNormTransformerBlock method) (olmo_core.nn.transformer.Transformer method) (olmo_core.nn.transformer.TransformerBlock method) (olmo_core.nn.transformer.TransformerBlockBase method) fragment (olmo_core.data.composable.LongDocStrategy attribute) (olmo_core.data.types.LongDocStrategy attribute) from_checkpoint() (olmo_core.generate.generation_module.TransformerGenerationModule class method) from_data_mix() (olmo_core.data.numpy_dataset.NumpyDatasetConfig class method) from_dict() (olmo_core.config.Config class method) from_hf() (olmo_core.data.tokenizer.TokenizerConfig class method) from_npy() (olmo_core.data.composable.ConcatAndChunkInstanceSourceConfig class method) (olmo_core.data.composable.PackingInstanceSourceConfig class method) from_numpy_dataset() (olmo_core.eval.lm_evaluator.LMEvaluator class method) from_source_groups() (olmo_core.data.composable.NumpyDocumentSourceConfig class method) from_src_mix() (olmo_core.data.numpy_dataset.NumpyFSLDatasetConfig class method) fs_local_rank (olmo_core.data.composable.SourceABC property) full (olmo_core.train.train_module.TransformerActivationCheckpointingMode attribute) (olmo_core.train.train_module.TransformerDataParallelWrappingStrategy attribute) full_precision (olmo_core.nn.attention.GateConfig attribute) (olmo_core.nn.rope.RoPEConfig attribute) fused (olmo_core.nn.attention.AttentionType attribute) (olmo_core.nn.rope.RoPEType attribute) fused_linear (olmo_core.nn.lm_head.LMLossImplementation attribute) fused_linear_cross_entropy_loss() (in module olmo_core.nn.functional) fused_rms (olmo_core.nn.layer_norm.LayerNormType attribute) FusedAttention (class in olmo_core.nn.attention) FusedRMSNorm (class in olmo_core.nn.layer_norm) FusedRotaryEmbedding (class in olmo_core.nn.rope) G GAPMonitorCallback (class in olmo_core.train.callbacks) GarbageCollectorCallback (class in olmo_core.train.callbacks) GateConfig (class in olmo_core.nn.attention) GatedDeltaNet (class in olmo_core.nn.attention) GatedDeltaNetConfig (class in olmo_core.nn.attention) GateGranularity (class in olmo_core.nn.attention) gc_cuda() (in module olmo_core.utils) gelu_tanh (olmo_core.nn.feed_forward.ActivationFunction attribute) gemma3_12B() (olmo_core.nn.transformer.TransformerConfig class method) gemma3_1B() (olmo_core.nn.transformer.TransformerConfig class method) gemma3_27B() (olmo_core.nn.transformer.TransformerConfig class method) gemma3_4B() (olmo_core.nn.transformer.TransformerConfig class method) gemma3_like() (olmo_core.nn.transformer.TransformerConfig class method) generate_batch() (olmo_core.generate.generation_module.TransformerGenerationModule method) generate_doc_lengths (olmo_core.data.numpy_dataset.NumpyFSLDatasetConfig attribute) (olmo_core.data.numpy_dataset.NumpyPackedFSLDatasetConfig attribute) generate_uuid() (in module olmo_core.utils) GenerationConfig (class in olmo_core.generate.generation_module) GenerationModule (class in olmo_core.generate.generation_module) get_attention_rescale_factor() (olmo_core.nn.rope.YaRNRoPEScalingConfig method) get_buffers() (olmo_core.nn.rope.ComplexRotaryEmbedding method) (olmo_core.nn.rope.FusedRotaryEmbedding method) (olmo_core.nn.rope.RotaryEmbedding method) (olmo_core.nn.rope.RotaryEmbeddingBase method) get_bytes_range() (in module olmo_core.io) get_checkpoint_metadata() (in module olmo_core.distributed.checkpoint) get_checkpoints() (olmo_core.model_ladder.ModelLadder method) get_converter_from_hf() (in module olmo_core.nn.hf) get_converter_to_hf() (in module olmo_core.nn.hf) get_cp_mesh() (in module olmo_core.distributed.parallel) get_cumulative_document_lengths() (in module olmo_core.data.utils) get_default_device() (in module olmo_core.utils) get_default_thread_count() (in module olmo_core.utils) get_device_mesh_info() (in module olmo_core.distributed.parallel) get_document_indices() (in module olmo_core.data.utils) get_document_lengths() (in module olmo_core.data.utils) get_document_offsets() (olmo_core.data.composable.ConcatenatedDocumentSource method) (olmo_core.data.composable.DocumentSource method) (olmo_core.data.composable.InMemoryDocumentSource method) (olmo_core.data.composable.MixingDocumentSource method) (olmo_core.data.composable.NumpyDocumentSource method) (olmo_core.data.composable.SamplingDocumentSource method) get_dp_mesh() (in module olmo_core.distributed.parallel) get_dp_model_mesh() (in module olmo_core.distributed.parallel) get_dp_process_group() (in module olmo_core.distributed.parallel) get_element_size() (in module olmo_core.utils) get_ep_mesh() (in module olmo_core.distributed.parallel) get_expert_logits() (olmo_core.nn.moe.MoELinearRouter method) (olmo_core.nn.moe.MoERouter method) get_file_size() (in module olmo_core.io) get_fs_local_rank() (in module olmo_core.distributed.utils) get_global_rank() (in module olmo_core.distributed.utils) get_hf_config() (in module olmo_core.nn.hf) get_hybrid_hf_config() (in module olmo_core.nn.hf) get_hybrid_layer_types() (in module olmo_core.nn.hf) get_instance_bucket() (olmo_core.data.numpy_dataset.NumpyVSLDataset method) get_instance_buckets() (olmo_core.data.numpy_dataset.NumpyVSLDataset method) get_instance_lengths() (olmo_core.data.numpy_dataset.NumpyVSLDataset method) get_local_rank() (in module olmo_core.distributed.utils) get_local_world_size() (in module olmo_core.distributed.utils) get_lr() (olmo_core.optim.ConstantScheduler method) (olmo_core.optim.ConstantWithWarmup method) (olmo_core.optim.CosWithWarmup method) (olmo_core.optim.CosWithWarmupAndLinearDecay method) (olmo_core.optim.ExponentialScheduler method) (olmo_core.optim.HalfCosWithWarmup method) (olmo_core.optim.InvSqrtWithWarmup method) (olmo_core.optim.LinearWithWarmup method) (olmo_core.optim.PowerLR method) (olmo_core.optim.Scheduler method) (olmo_core.optim.SequentialScheduler method) (olmo_core.optim.WSD method) (olmo_core.optim.WSDS method) get_mappings() (olmo_core.nn.conversion.StateConverter method) get_mesh_coordinates() (in module olmo_core.distributed.utils) get_metric() (olmo_core.train.Trainer method) get_metrics() (olmo_core.model_ladder.ModelLadder method) get_mock_batch() (olmo_core.data.composable.ComposableDataLoader method) (olmo_core.data.data_loader.DataLoaderBase method) (olmo_core.data.data_loader.NumpyDataLoaderBase method) get_model_config() (olmo_core.model_ladder.ModelLadder method) get_node_hostname() (in module olmo_core.distributed.utils) get_num_devices() (olmo_core.model_ladder.ModelLadder method) get_num_nodes() (in module olmo_core.distributed.utils) get_num_params() (olmo_core.model_ladder.ModelLadder method) get_parent() (in module olmo_core.io) get_paths_and_tokens_for_source() (olmo_core.data.source_mixture.SourceMixtureDatasetConfig method) get_pp_mesh() (in module olmo_core.distributed.parallel) get_pp_stage_mesh() (in module olmo_core.distributed.parallel) get_rank() (in module olmo_core.distributed.utils) get_replicate_and_shard_degree() (olmo_core.distributed.parallel.DataParallelConfig method) get_rope_buffers() (olmo_core.nn.transformer.Transformer method) get_save_folder() (olmo_core.model_ladder.ModelLadder method) get_step_factor() (olmo_core.optim.SkipStepOptimizer method) get_token_range() (olmo_core.data.composable.ConcatenatedTokenSource method) (olmo_core.data.composable.InMemoryTokenSource method) (olmo_core.data.composable.MixingDocumentSource method) (olmo_core.data.composable.MixingTokenSource method) (olmo_core.data.composable.NumpyDocumentSource method) (olmo_core.data.composable.SamplingDocumentSource method) (olmo_core.data.composable.SamplingTokenSource method) (olmo_core.data.composable.SlicedTokenSource method) (olmo_core.data.composable.TokenSource method) get_tp_mesh() (in module olmo_core.distributed.parallel) get_window_size() (olmo_core.nn.attention.SlidingWindowAttentionConfig method) get_world_mesh() (in module olmo_core.distributed.parallel) get_world_size() (in module olmo_core.distributed.utils) git (olmo_core.launch.beaker.BeakerLaunchConfig attribute) glob() (olmo_core.data.numpy_dataset.NumpyDatasetConfig class method) glob_directory() (in module olmo_core.io) global_batch_size (olmo_core.data.source_mixture.SourceMixtureDatasetConfig attribute) (olmo_core.train.Trainer property) global_num_flops_in_batch() (olmo_core.train.train_module.BasicTrainModule method) (olmo_core.train.train_module.TrainModule method) global_num_tokens_in_batch() (olmo_core.data.data_loader.DataLoaderBase method) (olmo_core.data.data_loader.TextDataLoaderBase method) global_step (olmo_core.train.Trainer attribute) global_train_petaflops (olmo_core.train.Trainer attribute) global_train_tokens_seen (olmo_core.train.Trainer attribute) google_credentials_secret (olmo_core.launch.beaker.BeakerLaunchConfig attribute) gpt2 (olmo_core.data.tokenizer.TokenizerName attribute) gpt2() (olmo_core.data.tokenizer.TokenizerConfig class method) gpt_neox_olmo_dolma_v1_5 (olmo_core.data.tokenizer.TokenizerName attribute) gpt_neox_olmo_dolma_v1_5() (olmo_core.data.tokenizer.TokenizerConfig class method) gpu_types (olmo_core.launch.beaker.BeakerLaunchConfig attribute) GPUMemoryMonitorCallback (class in olmo_core.train.callbacks) granularity (olmo_core.nn.attention.GateConfig attribute) greedy_selection() (in module olmo_core.generate.sampling) group (olmo_core.train.callbacks.WandBCallback attribute) group_consecutive_values() (in module olmo_core.data.utils) group_overrides (olmo_core.optim.OptimConfig attribute) grow_linear (olmo_core.data.numpy_dataset.VSLCurriculumType attribute) grow_p2 (olmo_core.data.numpy_dataset.VSLCurriculumType attribute) H HalfCosWithWarmup (class in olmo_core.optim) hard_stop (olmo_core.train.Trainer attribute) has_callback() (olmo_core.train.Trainer method) has_flash_attn() (in module olmo_core.utils) head_dim (olmo_core.nn.attention.GatedDeltaNetConfig attribute) head_stride (olmo_core.nn.attention.RingContextParallelStyle attribute) headwise (olmo_core.nn.attention.GateGranularity attribute) HF_TO_OLMO_CORE_MODULE_MAPPINGS (in module olmo_core.nn.hf.convert) HF_TO_OLMO_CORE_TEMPLATE_MAPPINGS (in module olmo_core.nn.hf.convert) HF_TO_OLMO_CORE_WEIGHT_MAPPINGS (in module olmo_core.nn.hf.convert) HFConverterCallback (class in olmo_core.train.callbacks) high_freq_proportion (olmo_core.nn.rope.StepwiseRoPEScalingConfig attribute) host_networking (olmo_core.launch.beaker.BeakerLaunchConfig attribute) hostnames (olmo_core.launch.beaker.BeakerLaunchConfig attribute) I identifier (olmo_core.data.tokenizer.TokenizerConfig attribute) if_available (olmo_core.train.LoadStrategy attribute) ignore_fingerprint_mismatch (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) include_instance_metadata (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) info_value_of_dtype() (in module olmo_core.utils) init_client() (in module olmo_core.io) init_distributed() (in module olmo_core.distributed.utils) init_kv_cache_manager() (olmo_core.nn.attention.Attention method) init_weights() (olmo_core.nn.transformer.NormalizedTransformer method) (olmo_core.nn.transformer.Transformer method) InitMethod (class in olmo_core.nn.transformer) InMemoryDocumentSource (class in olmo_core.data.composable) InMemoryTokenSource (class in olmo_core.data.composable) input_ids (olmo_core.data.composable.Instance attribute) (olmo_core.data.composable.TokenRange attribute) install_excepthook() (in module olmo_core.utils) Instance (class in olmo_core.data.composable) instance (olmo_core.nn.moe.MoELoadBalancingLossGranularity attribute) instance_filter_config (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) instance_sources (olmo_core.model_ladder.ModelLadder attribute) InstanceFilterConfig (class in olmo_core.data.composable) instances (olmo_core.train.train_module.EvalBatchSizeUnit attribute) instances_per_bucket (olmo_core.data.numpy_dataset.NumpyVSLDataset property) InstanceSource (class in olmo_core.data.composable) InstanceSourceConfig (class in olmo_core.data.composable) inter_source (olmo_core.data.composable.ShuffleStrategy attribute) interleaved_source (olmo_core.data.composable.ShuffleStrategy attribute) interleaving_exempt_paths (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDatasetConfig attribute) interval (olmo_core.train.callbacks.GAPMonitorCallback attribute) intra_source (olmo_core.data.composable.ShuffleStrategy attribute) InvSqrtWithWarmup (class in olmo_core.optim) is_distributed() (in module olmo_core.distributed.utils) is_leaf (olmo_core.data.composable.SourceABC property) is_olmo_hybrid_model() (in module olmo_core.nn.hf) is_running_in_beaker() (in module olmo_core.launch.beaker) is_running_in_beaker_batch_job() (in module olmo_core.launch.beaker) is_url() (in module olmo_core.io) iter_document_indices() (in module olmo_core.data.utils) iter_document_indices_with_max_sequence_length() (in module olmo_core.data.utils) J join_path() (in module olmo_core.io) K kernel_preference (olmo_core.float8.AOMXLinearConfig attribute) L l2_norm (olmo_core.nn.layer_norm.LayerNormType attribute) (olmo_core.train.ReduceType attribute) L2Norm (class in olmo_core.nn.layer_norm) label (olmo_core.data.composable.MixingDocumentSourceConfig attribute) (olmo_core.data.composable.MixingDocumentSourceSpec attribute) (olmo_core.data.composable.MixingInstanceSourceConfig attribute) (olmo_core.data.composable.MixingInstanceSourceSpec attribute) (olmo_core.data.composable.MixingTokenSourceConfig attribute) (olmo_core.data.composable.MixingTokenSourceSpec attribute) (olmo_core.data.composable.NumpyDocumentSourceConfigBase attribute) (olmo_core.data.composable.SourceABC property) label_mask (olmo_core.data.composable.Instance attribute) (olmo_core.data.composable.TokenRange attribute) label_mask_paths (olmo_core.data.composable.NumpyDocumentSourceConfig attribute) (olmo_core.data.numpy_dataset.NumpyFSLDatasetConfig attribute) (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDatasetConfig attribute) (olmo_core.data.numpy_dataset.NumpyPackedFSLDatasetConfig attribute) (olmo_core.data.numpy_dataset.NumpyPaddedFSLDatasetConfig attribute) latest (olmo_core.train.MetricMergeStrategy attribute) latest_checkpoint() (olmo_core.train.Checkpointer class method) launch() (olmo_core.launch.beaker.BeakerLaunchConfig method) launch_timeout (olmo_core.launch.beaker.BeakerLaunchConfig attribute) LAYER (olmo_core.nn.conversion.TemplatePlaceholder attribute) layer_norm (olmo_core.nn.transformer.TransformerBlockConfig attribute) LayerNorm (class in olmo_core.nn.layer_norm) LayerNormConfig (class in olmo_core.nn.layer_norm) LayerNormScaledTransformerBlock (class in olmo_core.nn.transformer) LayerNormType (class in olmo_core.nn.layer_norm) LinearWithWarmup (class in olmo_core.optim) Lion (class in olmo_core.optim) LionConfig (class in olmo_core.optim) list_directory() (in module olmo_core.io) ListCheckpointerCallback (class in olmo_core.train.callbacks) llama (olmo_core.nn.transformer.InitMethod attribute) llama2_13B() (olmo_core.nn.transformer.TransformerConfig class method) llama2_1B() (olmo_core.nn.transformer.TransformerConfig class method) llama2_26B() (olmo_core.nn.transformer.TransformerConfig class method) llama2_271M() (olmo_core.nn.transformer.TransformerConfig class method) llama2_70B() (olmo_core.nn.transformer.TransformerConfig class method) llama2_7B() (olmo_core.nn.transformer.TransformerConfig class method) llama3 (olmo_core.nn.attention.RingAttentionLoadBalancerType attribute) llama3_1B() (olmo_core.nn.transformer.TransformerConfig class method) llama3_405B() (olmo_core.nn.transformer.TransformerConfig class method) llama3_70B() (olmo_core.nn.transformer.TransformerConfig class method) llama3_8B() (olmo_core.nn.transformer.TransformerConfig class method) llama_depth (olmo_core.nn.transformer.InitMethod attribute) llama_like() (olmo_core.nn.transformer.TransformerConfig class method) LMEvaluator (class in olmo_core.eval.lm_evaluator) LMEvaluatorCallbackConfig (class in olmo_core.train.callbacks) LMHead (class in olmo_core.nn.lm_head) LMHeadConfig (class in olmo_core.nn.lm_head) LMHeadType (class in olmo_core.nn.lm_head) LMLossImplementation (class in olmo_core.nn.lm_head) LMOutputWithLoss (class in olmo_core.nn.lm_head) load() (olmo_core.train.Checkpointer method) load_array_slice() (in module olmo_core.data.utils) load_array_slice_into_tensor() (in module olmo_core.data.utils) load_balancer (olmo_core.nn.attention.RingContextParallelStyle attribute) load_checkpoint() (olmo_core.generate.generation_module.TransformerGenerationModule method) (olmo_core.train.Trainer method) load_config() (in module olmo_core.nn.hf) load_hf_model() (in module olmo_core.nn.hf) load_keys() (in module olmo_core.distributed.checkpoint) load_model_and_optim_state() (in module olmo_core.distributed.checkpoint) load_optim_state (olmo_core.train.Trainer attribute) load_path (olmo_core.train.Trainer attribute) load_state_dict() (in module olmo_core.distributed.checkpoint) (olmo_core.data.composable.ComposableDataLoader method) (olmo_core.data.data_loader.DataLoaderBase method) (olmo_core.data.data_loader.NumpyDataLoaderBase method) (olmo_core.data.data_loader.NumpyFSLDataLoader method) (olmo_core.data.data_loader.NumpyVSLDataLoader method) (olmo_core.generate.generation_module.GenerationModule method) (olmo_core.generate.generation_module.TransformerGenerationModule method) (olmo_core.train.callbacks.Callback method) (olmo_core.train.train_module.BasicTrainModule method) (olmo_core.train.train_module.TrainModule method) (olmo_core.train.Trainer method) load_strategy (olmo_core.train.Trainer attribute) load_trainer_state (olmo_core.train.Trainer attribute) LoadStrategy (class in olmo_core.train) local_batch (olmo_core.nn.moe.MoELoadBalancingLossGranularity attribute) local_rank0_only (olmo_core.utils.LogFilterType attribute) log_extra_field() (in module olmo_core.utils) log_interval (olmo_core.train.callbacks.ConsoleLoggerCallback attribute) (olmo_core.train.callbacks.EvaluatorCallback attribute) log_metrics() (olmo_core.train.callbacks.Callback method) LogFilterType (class in olmo_core.utils) logging_configured() (in module olmo_core.utils) logits (olmo_core.nn.lm_head.LMOutputWithLoss attribute) long_doc_strategy (olmo_core.data.composable.NumpyDocumentSourceConfigBase attribute) (olmo_core.data.numpy_dataset.NumpyPackedFSLDatasetConfig attribute) LongDocStrategy (class in olmo_core.data.composable) (class in olmo_core.data.types) loss (olmo_core.nn.lm_head.LMOutputWithLoss attribute) low_freq_proportion (olmo_core.nn.rope.StepwiseRoPEScalingConfig attribute) lr (olmo_core.optim.DionConfig attribute) (olmo_core.optim.MuonConfig attribute) (olmo_core.optim.NoOpConfig attribute) lr_multiplier (olmo_core.model_ladder.WSDSChinchillaRunConfigurator attribute) M map() (olmo_core.data.numpy_dataset.NumpyDatasetBase method) mark_dynamic() (in module olmo_core.utils) MatrixAwareOptimConfig (class in olmo_core.optim) max (olmo_core.train.MetricMergeStrategy attribute) (olmo_core.train.ReduceType attribute) max_devices (olmo_core.model_ladder.ModelLadder attribute) max_document_length (olmo_core.data.composable.NumpyDocumentSourceConfigBase attribute) max_duration (olmo_core.train.Trainer attribute) max_length (olmo_core.generate.generation_module.GenerationConfig attribute) max_new_tokens (olmo_core.generate.generation_module.GenerationConfig attribute) max_repetition_factor (olmo_core.data.composable.MixingDocumentSourceSpec attribute) (olmo_core.data.composable.MixingInstanceSourceSpec attribute) (olmo_core.data.composable.MixingTokenSourceSpec attribute) max_repetition_ratio (olmo_core.data.source_mixture.SourceMixtureConfig attribute) max_sequence_length (olmo_core.data.composable.InstanceSource property) (olmo_core.data.numpy_dataset.NumpyDatasetBase property) (olmo_core.data.numpy_dataset.NumpyFSLDataset property) (olmo_core.data.numpy_dataset.NumpyFSLDatasetBase property) (olmo_core.data.numpy_dataset.NumpyVSLDataset property) (olmo_core.data.numpy_dataset.NumpyVSLDatasetConfig attribute) (olmo_core.train.callbacks.HFConverterCallback attribute) (olmo_core.train.train_module.EvalBatchSpec attribute) max_source_fraction (olmo_core.data.source_mixture.SourceMixtureConfig attribute) max_steps (olmo_core.train.Trainer property) max_target_sequence_length (olmo_core.data.numpy_dataset.NumpyFSLDatasetConfig attribute) max_tokens (olmo_core.train.Trainer property) maybe_load_checkpoint() (olmo_core.train.Trainer method) mean (olmo_core.train.MetricMergeStrategy attribute) (olmo_core.train.ReduceType attribute) MeanMetric (class in olmo_core.eval.metrics) melt_batch() (in module olmo_core.data.utils) memmap_to_write() (in module olmo_core.data.utils) merge() (olmo_core.config.Config method) merge_interval (olmo_core.train.callbacks.ModelMergeCallback attribute) merge_last_n_steps (olmo_core.train.callbacks.ModelMergeCallback attribute) merge_state_dicts() (in module olmo_core.distributed.checkpoint) merge_step (olmo_core.train.callbacks.ModelMergeCallback attribute) MeshDimName (class in olmo_core.distributed.parallel) metadata (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) Metric (class in olmo_core.eval.metrics) MetricMergeStrategy (class in olmo_core.train) metrics (olmo_core.train.callbacks.ConsoleLoggerCallback attribute) (olmo_core.train.callbacks.MetricSaverCallback property) metrics_collect_interval (olmo_core.train.Trainer attribute) metrics_log_interval (olmo_core.train.callbacks.ConsoleLoggerCallback attribute) metrics_path (olmo_core.model_ladder.RunCheckpointInfo attribute) metrics_to_capture (olmo_core.train.callbacks.MetricSaverCallback attribute) MetricSaverCallback (class in olmo_core.train.callbacks) min (olmo_core.train.MetricMergeStrategy attribute) min_sequence_length (olmo_core.data.numpy_dataset.NumpyVSLDatasetConfig attribute) min_value_of_dtype() (in module olmo_core.utils) mix (olmo_core.data.composable.NumpyDocumentSourceMixConfig attribute) (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) mix_base_dir (olmo_core.data.composable.NumpyDocumentSourceMixConfig attribute) (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) MixConfig (olmo_core.data.composable.NumpyDocumentSource attribute) MixingDocumentSource (class in olmo_core.data.composable) MixingDocumentSourceConfig (class in olmo_core.data.composable) MixingDocumentSourceSpec (class in olmo_core.data.composable) MixingDocumentSourceSpecConfig (class in olmo_core.data.composable) MixingInstanceSource (class in olmo_core.data.composable) MixingInstanceSourceConfig (class in olmo_core.data.composable) MixingInstanceSourceSpec (class in olmo_core.data.composable) MixingInstanceSourceSpecConfig (class in olmo_core.data.composable) MixingTokenSource (class in olmo_core.data.composable) MixingTokenSourceConfig (class in olmo_core.data.composable) MixingTokenSourceSpec (class in olmo_core.data.composable) MixingTokenSourceSpecConfig (class in olmo_core.data.composable) mode (olmo_core.train.train_module.TransformerActivationCheckpointingConfig attribute) model_configurator (olmo_core.model_ladder.ModelLadder attribute) model_construction_kwargs (olmo_core.model_ladder.Olmo3ModelConfigurator attribute) MODEL_TYPE_SPECIFIC_HF_TO_OLMO_CORE_MODULE_MAPPINGS (in module olmo_core.nn.hf.convert) MODEL_TYPE_SPECIFIC_HF_TO_OLMO_CORE_WEIGHT_MAPPINGS (in module olmo_core.nn.hf.convert) MODEL_TYPE_SPECIFIC_OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS (in module olmo_core.nn.hf.convert) ModelConfigurator (class in olmo_core.model_ladder) ModelLadder (class in olmo_core.model_ladder) ModelMergeCallback (class in olmo_core.train.callbacks) module olmo_core.config olmo_core.data olmo_core.data.collator olmo_core.data.composable olmo_core.data.data_loader olmo_core.data.mixes olmo_core.data.numpy_dataset olmo_core.data.source_mixture olmo_core.data.tokenizer olmo_core.data.types olmo_core.data.utils olmo_core.distributed olmo_core.distributed.checkpoint olmo_core.distributed.parallel olmo_core.distributed.utils olmo_core.eval olmo_core.eval.evaluator olmo_core.eval.lm_evaluator olmo_core.eval.metrics olmo_core.exceptions olmo_core.float8 olmo_core.fs_cache olmo_core.generate olmo_core.generate.generation_module olmo_core.generate.sampling olmo_core.generate.utils olmo_core.io olmo_core.launch olmo_core.launch.beaker olmo_core.model_ladder olmo_core.nn olmo_core.nn.attention olmo_core.nn.conversion olmo_core.nn.feed_forward olmo_core.nn.functional olmo_core.nn.hf olmo_core.nn.layer_norm olmo_core.nn.lm_head olmo_core.nn.moe olmo_core.nn.rope olmo_core.nn.transformer olmo_core.optim olmo_core.testing olmo_core.train olmo_core.train.callbacks olmo_core.train.train_module olmo_core.utils modules (olmo_core.train.train_module.TransformerActivationCheckpointingConfig attribute) modules_to_ignore (olmo_core.float8.Float8Config attribute) moe (olmo_core.nn.transformer.TransformerBlockType attribute) (olmo_core.nn.transformer.TransformerType attribute) moe_capacity_factor (olmo_core.train.callbacks.HFConverterCallback attribute) moe_hybrid (olmo_core.nn.transformer.TransformerBlockType attribute) moe_hybrid_reordered_norm (olmo_core.nn.transformer.TransformerBlockType attribute) moe_reordered_norm (olmo_core.nn.transformer.TransformerBlockType attribute) MoEBase (class in olmo_core.nn.moe) MoEConfig (class in olmo_core.nn.moe) MoEHybridReorderedNormTransformerBlock (class in olmo_core.nn.transformer) MoEHybridTransformerBlock (class in olmo_core.nn.transformer) MoEHybridTransformerBlockBase (class in olmo_core.nn.transformer) MoELinearRouter (class in olmo_core.nn.moe) MoELoadBalancingLossGranularity (class in olmo_core.nn.moe) MoEMLP (class in olmo_core.nn.moe) MoEReorderedNormTransformerBlock (class in olmo_core.nn.transformer) MoERouter (class in olmo_core.nn.moe) MoERouterConfig (class in olmo_core.nn.moe) MoERouterGatingFunction (class in olmo_core.nn.moe) MoERouterType (class in olmo_core.nn.moe) MoETransformer (class in olmo_core.nn.transformer) MoETransformerBlock (class in olmo_core.nn.transformer) MoEType (class in olmo_core.nn.moe) monitor (olmo_core.train.callbacks.GAPMonitorCallback attribute) MonkeyPatcherCallback (class in olmo_core.train.callbacks) move_to_device() (in module olmo_core.utils) mu (olmo_core.optim.DionConfig attribute) (olmo_core.optim.MuonConfig attribute) multi_thread_pool (olmo_core.train.Trainer property) muon_beta2 (olmo_core.optim.NorMuonConfig attribute) MuonConfig (class in olmo_core.optim) mxfp8_cast_kernel_choice (olmo_core.float8.AOMXLinearConfig attribute) mxfp8_cublas_rceil() (olmo_core.float8.AOMXLinearConfig class method) N n_heads (olmo_core.nn.attention.GatedDeltaNetConfig attribute) n_v_heads (olmo_core.nn.attention.GatedDeltaNetConfig attribute) name (olmo_core.distributed.checkpoint.UnshardStrategy attribute) (olmo_core.launch.beaker.BeakerLaunchConfig attribute) (olmo_core.model_ladder.ModelLadder attribute) (olmo_core.model_ladder.RunCheckpointInfo attribute) (olmo_core.nn.attention.AttentionConfig attribute) (olmo_core.nn.feed_forward.FeedForwardConfig attribute) (olmo_core.nn.layer_norm.LayerNormConfig attribute) (olmo_core.nn.lm_head.LMHeadConfig attribute) (olmo_core.nn.moe.MoEConfig attribute) (olmo_core.nn.moe.MoERouterConfig attribute) (olmo_core.nn.rope.RoPEConfig attribute) (olmo_core.nn.transformer.TransformerBlockConfig attribute) (olmo_core.train.callbacks.CometCallback attribute) (olmo_core.train.callbacks.SlackNotifierCallback attribute) (olmo_core.train.callbacks.WandBCallback attribute) natural (olmo_core.data.numpy_dataset.VSLCurriculumType attribute) nesterov (olmo_core.optim.MuonConfig attribute) never (olmo_core.train.callbacks.CheckpointRemovalStrategy attribute) (olmo_core.train.LoadStrategy attribute) ngpt_1B() (olmo_core.nn.transformer.TransformerConfig class method) ngpt_271M() (olmo_core.nn.transformer.TransformerConfig class method) ngpt_like() (olmo_core.nn.transformer.TransformerConfig class method) no_checkpoints (olmo_core.train.Trainer attribute) no_evals (olmo_core.train.Trainer attribute) no_global_rope (olmo_core.nn.rope.RoPEConfig attribute) none (olmo_core.train.callbacks.CometNotificationSetting attribute) (olmo_core.train.callbacks.SlackNotificationSetting attribute) NoOpConfig (class in olmo_core.optim) NoOpOptimizer (class in olmo_core.optim) norm_eps (olmo_core.nn.attention.GatedDeltaNetConfig attribute) normal (olmo_core.nn.transformer.InitMethod attribute) normalize_matrices() (olmo_core.nn.attention.NormalizedAttention method) (olmo_core.nn.feed_forward.NormalizedFeedForward method) (olmo_core.nn.transformer.NormalizedTransformer method) (olmo_core.nn.transformer.NormalizedTransformerBlock method) normalize_path() (in module olmo_core.io) normalized (olmo_core.nn.attention.AttentionType attribute) (olmo_core.nn.feed_forward.FeedForwardType attribute) (olmo_core.nn.lm_head.LMHeadType attribute) (olmo_core.nn.transformer.InitMethod attribute) (olmo_core.nn.transformer.TransformerBlockType attribute) (olmo_core.nn.transformer.TransformerType attribute) NormalizedAttention (class in olmo_core.nn.attention) NormalizedFeedForward (class in olmo_core.nn.feed_forward) NormalizedLMHead (class in olmo_core.nn.lm_head) NormalizedTransformer (class in olmo_core.nn.transformer) NormalizedTransformerBlock (class in olmo_core.nn.transformer) NorMuonConfig (class in olmo_core.optim) notes (olmo_core.train.callbacks.WandBCallback attribute) notifications (olmo_core.train.callbacks.CometCallback attribute) (olmo_core.train.callbacks.SlackNotifierCallback attribute) num_active_non_embedding_params (olmo_core.nn.transformer.TransformerConfig property) num_active_params (olmo_core.nn.transformer.TransformerConfig property) num_cycles (olmo_core.data.numpy_dataset.VSLGrowthCurriculum attribute) num_execution_units (olmo_core.launch.beaker.BeakerLaunchConfig attribute) num_flops_per_token() (olmo_core.nn.attention.Attention method) (olmo_core.nn.attention.GatedDeltaNet method) (olmo_core.nn.transformer.Transformer method) (olmo_core.train.train_module.BasicTrainModule method) (olmo_core.train.train_module.TrainModule method) (olmo_core.train.train_module.TransformerPipelineTrainModule method) (olmo_core.train.train_module.TransformerTrainModule method) num_gpus (olmo_core.launch.beaker.BeakerLaunchConfig attribute) num_instances (olmo_core.data.composable.MixingInstanceSourceConfig attribute) num_nodes (olmo_core.launch.beaker.BeakerLaunchConfig attribute) num_non_embedding_params (olmo_core.nn.transformer.TransformerConfig property) num_params (olmo_core.nn.transformer.TransformerConfig property) num_params() (olmo_core.nn.attention.AttentionConfig method) (olmo_core.nn.attention.GatedDeltaNetConfig method) (olmo_core.nn.feed_forward.FeedForwardConfig method) (olmo_core.nn.layer_norm.LayerNormConfig method) (olmo_core.nn.lm_head.LMHeadConfig method) (olmo_core.nn.moe.MoERouterConfig method) num_tokens (olmo_core.data.composable.ConcatenatedTokenSource property) (olmo_core.data.composable.InMemoryTokenSource property) (olmo_core.data.composable.InstanceSource property) (olmo_core.data.composable.MixingDocumentSource property) (olmo_core.data.composable.MixingDocumentSourceConfig attribute) (olmo_core.data.composable.MixingInstanceSourceConfig attribute) (olmo_core.data.composable.MixingTokenSource property) (olmo_core.data.composable.MixingTokenSourceConfig attribute) (olmo_core.data.composable.NumpyDocumentSource property) (olmo_core.data.composable.RandomInstanceSource property) (olmo_core.data.composable.SamplingDocumentSource property) (olmo_core.data.composable.SamplingTokenSource property) (olmo_core.data.composable.SlicedTokenSource property) (olmo_core.data.composable.SourceABC property) (olmo_core.data.numpy_dataset.NumpyDatasetBase property) (olmo_core.data.numpy_dataset.NumpyFSLDataset property) NumpyDataLoaderBase (class in olmo_core.data.data_loader) NumpyDataLoaderConfig (class in olmo_core.data.data_loader) NumpyDatasetBase (class in olmo_core.data.numpy_dataset) NumpyDatasetConfig (class in olmo_core.data.numpy_dataset) NumpyDatasetDType (class in olmo_core.data.types) NumpyDocumentSource (class in olmo_core.data.composable) NumpyDocumentSourceConfig (class in olmo_core.data.composable) NumpyDocumentSourceConfigBase (class in olmo_core.data.composable) NumpyDocumentSourceMixConfig (class in olmo_core.data.composable) NumpyFSLDataLoader (class in olmo_core.data.data_loader) NumpyFSLDataset (class in olmo_core.data.numpy_dataset) NumpyFSLDatasetBase (class in olmo_core.data.numpy_dataset) NumpyFSLDatasetConfig (class in olmo_core.data.numpy_dataset) NumpyFSLDatasetMixture (class in olmo_core.data.numpy_dataset) NumpyInterleavedFSLDataset (class in olmo_core.data.numpy_dataset) NumpyInterleavedFSLDatasetConfig (class in olmo_core.data.numpy_dataset) NumpyPackedFSLDataset (class in olmo_core.data.numpy_dataset) NumpyPackedFSLDatasetConfig (class in olmo_core.data.numpy_dataset) NumpyPaddedFSLDataset (class in olmo_core.data.numpy_dataset) NumpyPaddedFSLDatasetConfig (class in olmo_core.data.numpy_dataset) NumpyVSLDataLoader (class in olmo_core.data.data_loader) NumpyVSLDataset (class in olmo_core.data.numpy_dataset) NumpyVSLDatasetConfig (class in olmo_core.data.numpy_dataset) O offsets (olmo_core.data.numpy_dataset.NumpyFSLDataset property) (olmo_core.data.numpy_dataset.NumpyPaddedFSLDataset property) (olmo_core.data.numpy_dataset.NumpyVSLDataset property) old_context_len (olmo_core.nn.rope.StepwiseRoPEScalingConfig attribute) (olmo_core.nn.rope.YaRNRoPEScalingConfig attribute) oldest (olmo_core.train.MetricMergeStrategy attribute) olmo2_100M() (olmo_core.nn.transformer.TransformerConfig class method) olmo2_13B() (olmo_core.nn.transformer.TransformerConfig class method) olmo2_1B() (olmo_core.nn.transformer.TransformerConfig class method) olmo2_1B_v2() (olmo_core.nn.transformer.TransformerConfig class method) olmo2_32B() (olmo_core.nn.transformer.TransformerConfig class method) olmo2_3B() (olmo_core.nn.transformer.TransformerConfig class method) olmo2_7B() (olmo_core.nn.transformer.TransformerConfig class method) olmo3_100M() (olmo_core.nn.transformer.TransformerConfig class method) olmo3_13B() (olmo_core.nn.transformer.TransformerConfig class method) olmo3_190M() (olmo_core.nn.transformer.TransformerConfig class method) olmo3_1B() (olmo_core.nn.transformer.TransformerConfig class method) olmo3_32B() (olmo_core.nn.transformer.TransformerConfig class method) olmo3_370M() (olmo_core.nn.transformer.TransformerConfig class method) olmo3_3B() (olmo_core.nn.transformer.TransformerConfig class method) olmo3_600M() (olmo_core.nn.transformer.TransformerConfig class method) olmo3_760M() (olmo_core.nn.transformer.TransformerConfig class method) olmo3_7B() (olmo_core.nn.transformer.TransformerConfig class method) Olmo3ModelConfigurator (class in olmo_core.model_ladder) olmo_core.config module olmo_core.data module olmo_core.data.collator module olmo_core.data.composable module olmo_core.data.data_loader module olmo_core.data.mixes module olmo_core.data.numpy_dataset module olmo_core.data.source_mixture module olmo_core.data.tokenizer module olmo_core.data.types module olmo_core.data.utils module olmo_core.distributed module olmo_core.distributed.checkpoint module olmo_core.distributed.parallel module olmo_core.distributed.utils module olmo_core.eval module olmo_core.eval.evaluator module olmo_core.eval.lm_evaluator module olmo_core.eval.metrics module olmo_core.exceptions module olmo_core.float8 module olmo_core.fs_cache module olmo_core.generate module olmo_core.generate.generation_module module olmo_core.generate.sampling module olmo_core.generate.utils module olmo_core.io module olmo_core.launch module olmo_core.launch.beaker module olmo_core.model_ladder module olmo_core.nn module olmo_core.nn.attention module olmo_core.nn.conversion module olmo_core.nn.feed_forward module olmo_core.nn.functional module olmo_core.nn.hf module olmo_core.nn.layer_norm module olmo_core.nn.lm_head module olmo_core.nn.moe module olmo_core.nn.rope module olmo_core.nn.transformer module olmo_core.optim module olmo_core.testing module olmo_core.train module olmo_core.train.callbacks module olmo_core.train.train_module module olmo_core.utils module OLMO_CORE_TO_HF_MODULE_MAPPINGS (in module olmo_core.nn.hf.convert) OLMO_CORE_TO_HF_TEMPLATE_MAPPINGS (in module olmo_core.nn.hf.convert) OLMO_CORE_TO_HF_WEIGHT_MAPPINGS (in module olmo_core.nn.hf.convert) OLMoBeakerExperimentFailedError OLMoCheckpointError OLMoCLIError OLMoConfigurationError OLMoCoreBeakerImage (class in olmo_core.launch.beaker) OLMoEnvironmentError OLMoError OLMoInvalidRangeRequestError OLMoNetworkError OLMoThreadError OLMoUploadError OLMoUserError on_attach() (olmo_core.train.train_module.BasicTrainModule method) (olmo_core.train.train_module.TrainModule method) on_error() (olmo_core.train.callbacks.Callback method) one_file (olmo_core.distributed.checkpoint.UnshardStrategyType attribute) one_file() (olmo_core.distributed.checkpoint.UnshardStrategy class method) one_file_per_tensor (olmo_core.distributed.checkpoint.UnshardStrategyType attribute) one_file_per_tensor() (olmo_core.distributed.checkpoint.UnshardStrategy class method) optim_step() (olmo_core.train.train_module.BasicTrainModule method) (olmo_core.train.train_module.TrainModule method) OptimConfig (class in olmo_core.optim) OptimGroupOverride (class in olmo_core.optim) optimizer() (olmo_core.optim.AdamConfig class method) (olmo_core.optim.AdamWConfig class method) (olmo_core.optim.DionConfig class method) (olmo_core.optim.LionConfig class method) (olmo_core.optim.MatrixAwareOptimConfig class method) (olmo_core.optim.MuonConfig class method) (olmo_core.optim.NoOpConfig class method) (olmo_core.optim.NorMuonConfig class method) (olmo_core.optim.OptimConfig class method) (olmo_core.optim.SkipStepAdamWConfig class method) (olmo_core.optim.SkipStepLionConfig class method) opts (olmo_core.optim.OptimGroupOverride attribute) output_folder (olmo_core.train.callbacks.HFConverterCallback attribute) output_suffix (olmo_core.train.callbacks.ModelMergeCallback attribute) override_decay (olmo_core.optim.SequentialScheduler attribute) P pack_documents_into_instances() (in module olmo_core.data.utils) PackingInstanceSource (class in olmo_core.data.composable) PackingInstanceSourceConfig (class in olmo_core.data.composable) pad_token_id (olmo_core.data.tokenizer.TokenizerConfig attribute) (olmo_core.generate.generation_module.GenerationConfig attribute) padded_vocab_size() (olmo_core.data.tokenizer.TokenizerConfig method) params (olmo_core.optim.OptimGroupOverride attribute) path (olmo_core.model_ladder.RunCheckpointInfo attribute) paths (olmo_core.data.numpy_dataset.NumpyDatasetBase property) (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) (olmo_core.data.source_mixture.SourceMixtureConfig attribute) pattern (olmo_core.nn.attention.SlidingWindowAttentionConfig attribute) peri_norm (olmo_core.nn.transformer.TransformerBlockType attribute) PeriNormTransformerBlock (class in olmo_core.nn.transformer) period (olmo_core.data.utils.RepetitionTuple attribute) persist_working_file() (olmo_core.train.Trainer method) persist_working_subdir() (olmo_core.train.Trainer method) PipelineParallelConfig (class in olmo_core.distributed.parallel) PipelineSchedule (class in olmo_core.distributed.parallel) PipelineScheduleType (class in olmo_core.distributed.parallel) PipelineSplitStyle (class in olmo_core.distributed.parallel) PIRoPEScalingConfig (class in olmo_core.nn.rope) plot_lr_schedule() (olmo_core.model_ladder.RunConfigurator method) (olmo_core.model_ladder.WSDSChinchillaRunConfigurator method) post_attach() (olmo_core.train.callbacks.Callback method) post_batch() (olmo_core.nn.moe.MoEBase method) (olmo_core.nn.transformer.MoETransformer method) (olmo_core.nn.transformer.Transformer method) post_checkpoint_loaded() (olmo_core.train.callbacks.Callback method) post_checkpoint_saved() (olmo_core.train.callbacks.Callback method) post_epoch() (olmo_core.train.callbacks.Callback method) post_optim_step() (olmo_core.nn.transformer.NormalizedTransformer method) (olmo_core.nn.transformer.Transformer method) post_setup (olmo_core.launch.beaker.BeakerLaunchConfig attribute) post_step() (olmo_core.train.callbacks.Callback method) post_train() (olmo_core.train.callbacks.Callback method) post_train_batch() (olmo_core.train.callbacks.Callback method) PowerLR (class in olmo_core.optim) pp (olmo_core.distributed.parallel.MeshDimName attribute) pre_epoch() (olmo_core.train.callbacks.Callback method) pre_load_batch() (olmo_core.train.callbacks.Callback method) pre_log_metrics() (olmo_core.train.callbacks.Callback method) pre_optim_step() (olmo_core.train.callbacks.Callback method) pre_setup (olmo_core.launch.beaker.BeakerLaunchConfig attribute) pre_step() (olmo_core.train.callbacks.Callback method) pre_train() (olmo_core.train.callbacks.Callback method) (olmo_core.train.train_module.TrainModule method) pre_train_checkpoint (olmo_core.train.callbacks.CheckpointerCallback attribute) preemptible (olmo_core.launch.beaker.BeakerLaunchConfig attribute) prepare() (olmo_core.data.numpy_dataset.NumpyDatasetBase method) (olmo_core.data.numpy_dataset.NumpyFSLDataset method) (olmo_core.data.numpy_dataset.NumpyFSLDatasetMixture method) (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDataset method) (olmo_core.data.numpy_dataset.NumpyPackedFSLDataset method) (olmo_core.data.numpy_dataset.NumpyPaddedFSLDataset method) (olmo_core.data.numpy_dataset.NumpyVSLDataset method) prepare_cli_environment() (in module olmo_core.utils) prepare_experts_for_ddp() (olmo_core.nn.moe.MoEBase method) prepare_experts_for_fsdp() (olmo_core.nn.moe.MoEBase method) prepare_training_environment() (in module olmo_core.train) priority (olmo_core.launch.beaker.BeakerLaunchConfig attribute) (olmo_core.train.callbacks.BeakerCallback attribute) (olmo_core.train.callbacks.Callback attribute) (olmo_core.train.callbacks.CheckpointerCallback attribute) (olmo_core.train.callbacks.GPUMemoryMonitorCallback attribute) (olmo_core.train.callbacks.HFConverterCallback attribute) (olmo_core.train.callbacks.ModelMergeCallback attribute) (olmo_core.train.callbacks.SpeedMonitorCallback attribute) processes (olmo_core.data.source_mixture.SourceMixtureDatasetConfig attribute) profile_memory (olmo_core.train.callbacks.ProfilerCallback attribute) ProfilerCallback (class in olmo_core.train.callbacks) project (olmo_core.model_ladder.ModelLadder attribute) (olmo_core.train.callbacks.CometCallback attribute) (olmo_core.train.callbacks.WandBCallback attribute) prune_state_dict() (in module olmo_core.distributed.checkpoint) Q qwen_rms (olmo_core.nn.layer_norm.LayerNormType attribute) QwenRMSNorm (class in olmo_core.nn.layer_norm) R random_split() (olmo_core.data.composable.InstanceSource method) (olmo_core.data.composable.InstanceSourceConfig method) RandomInstanceSource (class in olmo_core.data.composable) RandomInstanceSourceConfig (class in olmo_core.data.composable) rank (olmo_core.data.composable.SourceABC property) rank0_only (olmo_core.utils.LogFilterType attribute) rank_batch_size (olmo_core.data.data_loader.DataLoaderBase property) (olmo_core.train.train_module.EvalBatchSpec attribute) (olmo_core.train.Trainer property) rank_completion_order() (olmo_core.distributed.parallel.PipelineParallelConfig method) rank_fraction (olmo_core.optim.DionConfig attribute) rank_microbatch_size (olmo_core.model_ladder.TransformerModelConfigurator attribute) rank_multiple_of (olmo_core.optim.DionConfig attribute) ranks (olmo_core.train.callbacks.ProfilerCallback attribute) ratio (olmo_core.data.composable.MixingDocumentSourceSpec attribute) (olmo_core.data.composable.MixingInstanceSourceSpec attribute) (olmo_core.data.composable.MixingTokenSourceSpec attribute) record_ce_loss() (olmo_core.train.train_module.TrainModule method) (olmo_core.train.Trainer method) record_metric() (olmo_core.train.train_module.TrainModule method) (olmo_core.train.Trainer method) ReduceType (class in olmo_core.train) registered_base (olmo_core.data.composable.ComposableDataLoaderConfig attribute) (olmo_core.data.data_loader.NumpyDataLoaderConfig attribute) (olmo_core.nn.attention.AttentionConfig attribute) (olmo_core.nn.attention.GatedDeltaNetConfig attribute) (olmo_core.optim.AdamConfig attribute) (olmo_core.optim.AdamWConfig attribute) (olmo_core.optim.ConstantScheduler attribute) (olmo_core.optim.ConstantWithWarmup attribute) (olmo_core.optim.CosWithWarmup attribute) (olmo_core.optim.CosWithWarmupAndLinearDecay attribute) (olmo_core.optim.DionConfig attribute) (olmo_core.optim.ExponentialScheduler attribute) (olmo_core.optim.HalfCosWithWarmup attribute) (olmo_core.optim.InvSqrtWithWarmup attribute) (olmo_core.optim.LinearWithWarmup attribute) (olmo_core.optim.LionConfig attribute) (olmo_core.optim.MuonConfig attribute) (olmo_core.optim.NoOpConfig attribute) (olmo_core.optim.NorMuonConfig attribute) (olmo_core.optim.PowerLR attribute) (olmo_core.optim.SequentialScheduler attribute) (olmo_core.optim.SkipStepAdamWConfig attribute) (olmo_core.optim.SkipStepLionConfig attribute) (olmo_core.optim.WSD attribute) (olmo_core.optim.WSDS attribute) Registrable (class in olmo_core.config) remove (olmo_core.train.callbacks.CheckpointerCallback attribute) remove_file() (in module olmo_core.io) render_mixture_outcome_tables() (olmo_core.data.source_mixture.SourceMixtureDatasetConfig method) render_tables (olmo_core.data.source_mixture.SourceMixtureDatasetConfig attribute) reordered_norm (olmo_core.nn.transformer.TransformerBlockType attribute) ReorderedNormTransformerBlock (class in olmo_core.nn.transformer) repeat (olmo_core.train.callbacks.ProfilerCallback attribute) RepetitionTuple (class in olmo_core.data.utils) replace() (olmo_core.config.Config method) replicate (olmo_core.distributed.parallel.DPMeshDimName attribute) requested_tokens (olmo_core.data.source_mixture.SourceMixtureDatasetConfig attribute) reset() (olmo_core.data.data_loader.DataLoaderBase method) (olmo_core.data.data_loader.TextDataLoaderBase method) (olmo_core.eval.metrics.MeanMetric method) (olmo_core.eval.metrics.Metric method) reset_composable_seed() (in module olmo_core.data.composable) reset_metrics() (olmo_core.eval.evaluator.Evaluator method) (olmo_core.eval.lm_evaluator.LMEvaluator method) reset_parameters() (olmo_core.nn.lm_head.NormalizedLMHead method) reshuffle() (olmo_core.data.composable.ComposableDataLoader method) (olmo_core.data.data_loader.DataLoaderBase method) (olmo_core.data.data_loader.NumpyDataLoaderBase method) resize() (olmo_core.data.composable.InstanceSource method) (olmo_core.data.composable.InstanceSourceConfig method) (olmo_core.data.composable.TokenSource method) (olmo_core.data.composable.TokenSourceConfig method) resize_by_docs() (olmo_core.data.composable.DocumentSource method) (olmo_core.data.composable.DocumentSourceConfig method) resolved_paths (olmo_core.data.source_mixture.SourceMixtureConfig property) resource_path() (in module olmo_core.io) result_dir (olmo_core.launch.beaker.BeakerLaunchConfig attribute) (olmo_core.train.callbacks.BeakerCallback attribute) retries (olmo_core.launch.beaker.BeakerLaunchConfig attribute) RingAttentionLlama3LoadBalancer (class in olmo_core.nn.attention) RingAttentionLoadBalancer (class in olmo_core.nn.attention) RingAttentionLoadBalancerType (class in olmo_core.nn.attention) RingAttentionZigZagLoadBalancer (class in olmo_core.nn.attention) RingContextParallelStyle (class in olmo_core.nn.attention) rms (olmo_core.nn.layer_norm.LayerNormType attribute) RMSNorm (class in olmo_core.nn.layer_norm) rolling_interval_length (olmo_core.optim.NoOpConfig attribute) (olmo_core.optim.SkipStepAdamWConfig attribute) rolling_window (olmo_core.train.callbacks.StabilityMonitorCallback attribute) RoPEConfig (class in olmo_core.nn.rope) RoPEScalingConfig (class in olmo_core.nn.rope) RoPEType (class in olmo_core.nn.rope) RotaryEmbedding (class in olmo_core.nn.rope) RotaryEmbeddingBase (class in olmo_core.nn.rope) roundrobin() (in module olmo_core.utils) run() (olmo_core.model_ladder.ModelLadder method) run_benchmark() (olmo_core.model_ladder.ModelLadder method) run_bookkeeping_op() (olmo_core.train.Trainer method) run_configurator (olmo_core.model_ladder.ModelLadder attribute) run_distributed_test() (in module olmo_core.testing) RunCheckpointInfo (class in olmo_core.model_ladder) RunConfigurator (class in olmo_core.model_ladder) S same_storage() (in module olmo_core.utils) sample() (olmo_core.data.composable.InstanceSource method) (olmo_core.data.composable.InstanceSourceConfig method) (olmo_core.data.composable.TokenSource method) (olmo_core.data.composable.TokenSourceConfig method) sample_by_docs() (olmo_core.data.composable.DocumentSource method) (olmo_core.data.composable.DocumentSourceConfig method) SamplingDocumentSource (class in olmo_core.data.composable) SamplingDocumentSourceConfig (class in olmo_core.data.composable) SamplingInstanceSource (class in olmo_core.data.composable) SamplingInstanceSourceConfig (class in olmo_core.data.composable) SamplingTokenSource (class in olmo_core.data.composable) SamplingTokenSourceConfig (class in olmo_core.data.composable) save() (olmo_core.train.Checkpointer method) save_async (olmo_core.train.callbacks.CheckpointerCallback attribute) save_async() (olmo_core.train.Checkpointer method) save_checkpoint() (olmo_core.train.Trainer method) save_checkpoint_async() (olmo_core.train.Trainer method) save_folder (olmo_core.train.Trainer attribute) save_hf_hybrid_model() (in module olmo_core.nn.hf) save_hf_model() (in module olmo_core.nn.hf) save_interval (olmo_core.train.callbacks.CheckpointerCallback attribute) (olmo_core.train.callbacks.ListCheckpointerCallback attribute) (olmo_core.train.callbacks.MetricSaverCallback attribute) save_model_and_optim_state() (in module olmo_core.distributed.checkpoint) save_overwrite (olmo_core.train.Trainer attribute) save_state_dict() (in module olmo_core.distributed.checkpoint) scale_calculation_mode (olmo_core.float8.AOMXLinearConfig attribute) scaling (olmo_core.nn.rope.RoPEConfig attribute) schedule (olmo_core.distributed.parallel.PipelineParallelConfig attribute) (olmo_core.train.callbacks.BatchSizeSchedulerCallback attribute) Scheduler (class in olmo_core.optim) schedulers_max (olmo_core.optim.SequentialScheduler attribute) SchedulerUnits (class in olmo_core.optim) seed (olmo_core.data.composable.MixingDocumentSourceConfig attribute) (olmo_core.data.composable.MixingInstanceSourceConfig attribute) (olmo_core.data.composable.MixingTokenSourceConfig attribute) (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDatasetConfig attribute) (olmo_core.data.source_mixture.SourceMixtureDatasetConfig attribute) (olmo_core.model_ladder.ModelLadder attribute) seed_all() (in module olmo_core.utils) segment_documents_into_instances() (in module olmo_core.data.utils) SegmentTreeNode (class in olmo_core.data.utils) select_next_token() (in module olmo_core.generate.sampling) selected_blocks (olmo_core.train.train_module.TransformerActivationCheckpointingMode attribute) selected_modules (olmo_core.train.train_module.TransformerActivationCheckpointingMode attribute) selected_ops (olmo_core.train.train_module.TransformerActivationCheckpointingMode attribute) selective_log_softmax() (in module olmo_core.generate.utils) sequence_length (olmo_core.data.composable.InstanceSource property) (olmo_core.data.numpy_dataset.NumpyFSLDatasetConfig attribute) (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDatasetConfig attribute) (olmo_core.data.numpy_dataset.NumpyPackedFSLDatasetConfig attribute) (olmo_core.data.numpy_dataset.NumpyPaddedFSLDatasetConfig attribute) (olmo_core.model_ladder.ModelLadder attribute) sequence_mixer (olmo_core.nn.transformer.TransformerBlockConfig attribute) SequenceLengthSchedulerCallback (class in olmo_core.train.callbacks) SequentialScheduler (class in olmo_core.optim) serialize_to_tensor() (in module olmo_core.io) set_composable_seed() (in module olmo_core.data.composable) set_env_variables() (in module olmo_core.utils) set_lr() (olmo_core.optim.Scheduler method) setup_logging() (in module olmo_core.utils) shard (olmo_core.distributed.parallel.DPMeshDimName attribute) shared_filesystem (olmo_core.launch.beaker.BeakerLaunchConfig attribute) shared_memory (olmo_core.launch.beaker.BeakerLaunchConfig attribute) short_str (olmo_core.data.numpy_dataset.VSLCurriculum property) (olmo_core.data.numpy_dataset.VSLGrowLinearCurriculum property) (olmo_core.data.numpy_dataset.VSLGrowP2Curriculum property) (olmo_core.data.numpy_dataset.VSLNaturalCurriculum property) should_use_swa() (olmo_core.nn.attention.SlidingWindowAttentionConfig method) ShuffleStrategy (class in olmo_core.data.composable) sigma_factor (olmo_core.optim.NoOpConfig attribute) (olmo_core.optim.SkipStepAdamWConfig attribute) silu (olmo_core.nn.feed_forward.ActivationFunction attribute) single_thread_pool (olmo_core.train.Trainer property) sizes (olmo_core.model_ladder.ModelLadder attribute) skip_first (olmo_core.train.callbacks.ProfilerCallback attribute) SkipStepAdamW (class in olmo_core.optim) SkipStepAdamWConfig (class in olmo_core.optim) SkipStepLion (class in olmo_core.optim) SkipStepLionConfig (class in olmo_core.optim) SkipStepOptimizer (class in olmo_core.optim) slack_notifications (olmo_core.launch.beaker.BeakerLaunchConfig attribute) SlackNotificationSetting (class in olmo_core.train.callbacks) SlackNotifierCallback (class in olmo_core.train.callbacks) SlicedInstanceSource (class in olmo_core.data.composable) SlicedTokenSource (class in olmo_core.data.composable) SlidingWindowAttentionConfig (class in olmo_core.nn.attention) source (olmo_core.data.composable.MixingDocumentSourceSpec attribute) (olmo_core.data.composable.MixingInstanceSourceSpec attribute) (olmo_core.data.composable.MixingTokenSourceSpec attribute) source_concat_dim (olmo_core.nn.conversion.StateMapping attribute) (olmo_core.nn.conversion.StateMappingTemplate attribute) source_group_size (olmo_core.data.composable.NumpyDocumentSourceConfigBase attribute) (olmo_core.data.numpy_dataset.NumpyPackedFSLDatasetConfig attribute) source_key_per_placeholder (olmo_core.nn.conversion.StateMappingTemplate attribute) source_keys (olmo_core.nn.conversion.StateMapping attribute) source_list (olmo_core.data.source_mixture.SourceMixtureDatasetConfig attribute) source_mixture_config (olmo_core.data.numpy_dataset.NumpyFSLDatasetConfig attribute) source_name (olmo_core.data.source_mixture.SourceMixtureConfig attribute) source_paths (olmo_core.data.composable.NumpyDocumentSourceConfig attribute) source_permutation_seed (olmo_core.data.composable.NumpyDocumentSourceConfigBase attribute) (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) source_specs (olmo_core.data.composable.MixingDocumentSourceConfig attribute) (olmo_core.data.composable.MixingInstanceSourceConfig attribute) (olmo_core.data.composable.MixingTokenSourceConfig attribute) source_template_keys (olmo_core.nn.conversion.StateMappingTemplate attribute) SourceABC (class in olmo_core.data.composable) SourceMixtureConfig (class in olmo_core.data.source_mixture) SourceMixtureDatasetConfig (class in olmo_core.data.source_mixture) SourceMixtureList (class in olmo_core.data.source_mixture) Spec (olmo_core.data.composable.MixingDocumentSource attribute) (olmo_core.data.composable.MixingInstanceSource attribute) (olmo_core.data.composable.MixingTokenSource attribute) SpeedMonitorCallback (class in olmo_core.train.callbacks) split() (olmo_core.data.composable.InstanceSource method) (olmo_core.data.composable.InstanceSourceConfig method) (olmo_core.data.composable.TokenSource method) (olmo_core.data.composable.TokenSourceConfig method) split_batch() (in module olmo_core.data.utils) split_by_source() (olmo_core.data.composable.NumpyDocumentSource method) split_points (olmo_core.train.train_module.TransformerPipelineParallelConfig attribute) SplitInstanceSourceConfig (class in olmo_core.data.composable) SplitTokenSourceConfig (class in olmo_core.data.composable) StabilityMonitorCallback (class in olmo_core.train.callbacks) stable (olmo_core.launch.beaker.OLMoCoreBeakerImage attribute) stable_cu128 (olmo_core.launch.beaker.OLMoCoreBeakerImage attribute) stable_cu130 (olmo_core.launch.beaker.OLMoCoreBeakerImage attribute) stage_ids_this_rank() (olmo_core.distributed.parallel.PipelineParallelConfig method) start (olmo_core.data.utils.RepetitionTuple attribute) (olmo_core.train.StepSkipRange attribute) state_dict() (olmo_core.data.composable.ComposableDataLoader method) (olmo_core.data.data_loader.DataLoaderBase method) (olmo_core.data.data_loader.NumpyDataLoaderBase method) (olmo_core.data.data_loader.NumpyFSLDataLoader method) (olmo_core.data.data_loader.NumpyVSLDataLoader method) (olmo_core.generate.generation_module.TransformerGenerationModule method) (olmo_core.train.callbacks.Callback method) (olmo_core.train.train_module.BasicTrainModule method) (olmo_core.train.train_module.TrainModule method) (olmo_core.train.Trainer method) state_dict_to_load() (olmo_core.train.train_module.TrainModule method) state_dict_to_save() (olmo_core.train.train_module.TrainModule method) StateConverter (class in olmo_core.nn.conversion) StateMapping (class in olmo_core.nn.conversion) StateMappingTemplate (class in olmo_core.nn.conversion) step (olmo_core.model_ladder.RunCheckpointInfo attribute) step() (olmo_core.distributed.parallel.PipelineSchedule method) (olmo_core.optim.Lion method) (olmo_core.optim.NoOpOptimizer method) (olmo_core.optim.SkipStepAdamW method) (olmo_core.optim.SkipStepLion method) step_increment_bugfix (olmo_core.optim.SkipStepAdamWConfig attribute) step_metrics_fname (olmo_core.train.callbacks.MetricSaverCallback attribute) step_skipped (olmo_core.optim.NoOpOptimizer property) (olmo_core.optim.SkipStepAdamW property) (olmo_core.optim.SkipStepLion property) (olmo_core.optim.SkipStepOptimizer property) step_soft_timeout (olmo_core.launch.beaker.BeakerLaunchConfig attribute) step_timeout (olmo_core.launch.beaker.BeakerLaunchConfig attribute) stepped_schedule (olmo_core.model_ladder.WSDSChinchillaRunConfigurator attribute) steps (olmo_core.train.DurationUnit attribute) steps() (olmo_core.train.Duration class method) steps_per_epoch (olmo_core.train.Trainer property) steps_to_skip (olmo_core.train.Trainer attribute) StepSkipRange (class in olmo_core.train) StepwiseRoPEScalingConfig (class in olmo_core.nn.rope) stop (olmo_core.train.StepSkipRange attribute) stop_token_ids (olmo_core.generate.generation_module.GenerationConfig attribute) StrEnum (class in olmo_core.config) style (olmo_core.distributed.parallel.PipelineParallelConfig attribute) sum (olmo_core.train.MetricMergeStrategy attribute) (olmo_core.train.ReduceType attribute) synchronize_flag() (in module olmo_core.distributed.utils) synchronize_value() (in module olmo_core.distributed.utils) system_python (olmo_core.launch.beaker.BeakerLaunchConfig attribute) T table_to_text() (olmo_core.data.source_mixture.SourceMixtureDatasetConfig method) tags (olmo_core.launch.beaker.BeakerLaunchConfig attribute) (olmo_core.train.callbacks.CometCallback attribute) (olmo_core.train.callbacks.WandBCallback attribute) target_ratio (olmo_core.data.source_mixture.SourceMixtureConfig attribute) task_name (olmo_core.launch.beaker.BeakerLaunchConfig attribute) tch2100_cu128 (olmo_core.launch.beaker.OLMoCoreBeakerImage attribute) tch270_cu128 (olmo_core.launch.beaker.OLMoCoreBeakerImage attribute) tch271_cu126 (olmo_core.launch.beaker.OLMoCoreBeakerImage attribute) tch271_cu128 (olmo_core.launch.beaker.OLMoCoreBeakerImage attribute) tch280_cu128 (olmo_core.launch.beaker.OLMoCoreBeakerImage attribute) tch291_cu128 (olmo_core.launch.beaker.OLMoCoreBeakerImage attribute) tch291_cu129 (olmo_core.launch.beaker.OLMoCoreBeakerImage attribute) te (olmo_core.nn.attention.AttentionBackendName attribute) teardown_training_environment() (in module olmo_core.train) TEAttentionBackend (class in olmo_core.nn.attention) temperature (olmo_core.generate.generation_module.GenerationConfig attribute) TemplatePlaceholder (class in olmo_core.nn.conversion) TensorParallelConfig (class in olmo_core.distributed.parallel) TextDataLoaderBase (class in olmo_core.data.data_loader) theta (olmo_core.nn.rope.RoPEConfig attribute) threaded_generator() (in module olmo_core.utils) threshold_std (olmo_core.train.callbacks.StabilityMonitorCallback attribute) times (olmo_core.data.utils.RepetitionTuple attribute) to_hf_config() (olmo_core.nn.rope.ABFRoPEScalingConfig method) (olmo_core.nn.rope.PIRoPEScalingConfig method) (olmo_core.nn.rope.RoPEScalingConfig method) (olmo_core.nn.rope.StepwiseRoPEScalingConfig method) (olmo_core.nn.rope.YaRNRoPEScalingConfig method) tokenizer (olmo_core.data.composable.NumpyDocumentSourceConfigBase attribute) (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) (olmo_core.model_ladder.ModelLadder attribute) tokenizer_id (olmo_core.train.callbacks.HFConverterCallback attribute) TokenizerConfig (class in olmo_core.data.tokenizer) TokenizerLike (class in olmo_core.data.tokenizer) TokenizerName (class in olmo_core.data.tokenizer) TokenRange (class in olmo_core.data.composable) tokens (olmo_core.model_ladder.RunCheckpointInfo attribute) (olmo_core.train.DurationUnit attribute) (olmo_core.train.train_module.EvalBatchSizeUnit attribute) tokens() (olmo_core.train.Duration class method) tokens_per_batch (olmo_core.train.Trainer property) tokens_per_epoch (olmo_core.train.Trainer property) tokens_per_param (olmo_core.model_ladder.WSDSChinchillaRunConfigurator attribute) tokens_processed (olmo_core.data.data_loader.TextDataLoaderBase attribute) TokenSource (class in olmo_core.data.composable) TokenSourceConfig (class in olmo_core.data.composable) top_k (olmo_core.generate.generation_module.GenerationConfig attribute) top_k_filtering() (in module olmo_core.generate.sampling) top_p (olmo_core.generate.generation_module.GenerationConfig attribute) top_p_filtering() (in module olmo_core.generate.sampling) torch (olmo_core.nn.attention.AttentionBackendName attribute) TorchAttentionBackend (class in olmo_core.nn.attention) torchrun (olmo_core.launch.beaker.BeakerLaunchConfig attribute) total_batches (olmo_core.data.composable.ComposableDataLoader property) (olmo_core.data.data_loader.DataLoaderBase property) (olmo_core.data.data_loader.NumpyFSLDataLoader property) (olmo_core.data.data_loader.NumpyVSLDataLoader property) (olmo_core.eval.evaluator.Evaluator property) total_size (olmo_core.data.data_loader.NumpyFSLDataLoader property) tp (olmo_core.distributed.parallel.MeshDimName attribute) train_batch() (olmo_core.train.train_module.BasicTrainModule method) (olmo_core.train.train_module.TrainModule method) train_module (olmo_core.train.Trainer attribute) Trainer (class in olmo_core.train) trainer (olmo_core.train.train_module.TrainModule property) TrainerConfig (class in olmo_core.train) TrainModule (class in olmo_core.train.train_module) TrainModuleConfig (class in olmo_core.train.train_module) Transformer (class in olmo_core.nn.transformer) TransformerActivationCheckpointingConfig (class in olmo_core.train.train_module) TransformerActivationCheckpointingMode (class in olmo_core.train.train_module) TransformerBlock (class in olmo_core.nn.transformer) TransformerBlockBase (class in olmo_core.nn.transformer) TransformerBlockConfig (class in olmo_core.nn.transformer) TransformerBlockType (class in olmo_core.nn.transformer) TransformerConfig (class in olmo_core.nn.transformer) TransformerContextParallelConfig (class in olmo_core.train.train_module) TransformerDataParallelConfig (class in olmo_core.train.train_module) TransformerDataParallelWrappingStrategy (class in olmo_core.train.train_module) TransformerExpertParallelConfig (class in olmo_core.train.train_module) TransformerGenerationModule (class in olmo_core.generate.generation_module) TransformerGenerationModuleConfig (class in olmo_core.generate.generation_module) TransformerModelConfigurator (class in olmo_core.model_ladder) TransformerPipelineParallelConfig (class in olmo_core.train.train_module) TransformerPipelineTrainModule (class in olmo_core.train.train_module) TransformerPipelineTrainModuleConfig (class in olmo_core.train.train_module) TransformerSize (class in olmo_core.model_ladder) TransformerTensorParallelConfig (class in olmo_core.train.train_module) TransformerTrainModule (class in olmo_core.train.train_module) TransformerTrainModuleConfig (class in olmo_core.train.train_module) TransformerType (class in olmo_core.nn.transformer) truncate (olmo_core.data.composable.LongDocStrategy attribute) (olmo_core.data.types.LongDocStrategy attribute) truncate_batch() (in module olmo_core.data.utils) U ulysses (olmo_core.nn.attention.RingAttentionLoadBalancerType attribute) UlyssesContextParallelStyle (class in olmo_core.nn.attention) UlyssesLoadBalancer (class in olmo_core.nn.attention) unblock_ephemeral_checkpoints() (olmo_core.train.callbacks.Callback method) unflatten_dim (olmo_core.nn.conversion.StateMapping attribute) (olmo_core.nn.conversion.StateMappingTemplate attribute) unit (olmo_core.train.Duration attribute) unshard_checkpoint() (in module olmo_core.distributed.checkpoint) UnshardStrategy (class in olmo_core.distributed.checkpoint) UnshardStrategyType (class in olmo_core.distributed.checkpoint) update() (olmo_core.eval.metrics.MeanMetric method) (olmo_core.eval.metrics.Metric method) update_metrics() (olmo_core.eval.evaluator.Evaluator method) (olmo_core.eval.lm_evaluator.LMEvaluator method) upload() (in module olmo_core.io) use_cache (olmo_core.generate.generation_module.GenerationConfig attribute) use_triton (olmo_core.optim.MuonConfig attribute) V validate (olmo_core.train.callbacks.HFConverterCallback attribute) validate() (olmo_core.config.Config method) (olmo_core.data.numpy_dataset.NumpyFSLDatasetConfig method) (olmo_core.data.numpy_dataset.NumpyInterleavedFSLDatasetConfig method) (olmo_core.data.numpy_dataset.NumpyPackedFSLDatasetConfig method) (olmo_core.data.numpy_dataset.NumpyPaddedFSLDatasetConfig method) (olmo_core.data.numpy_dataset.NumpyVSLDatasetConfig method) (olmo_core.data.numpy_dataset.VSLCurriculumConfig method) (olmo_core.data.source_mixture.SourceMixtureConfig method) (olmo_core.data.source_mixture.SourceMixtureDatasetConfig method) (olmo_core.data.source_mixture.SourceMixtureList method) (olmo_core.float8.Float8Config method) (olmo_core.generate.generation_module.GenerationConfig method) validate_env_vars() (in module olmo_core.distributed.utils) value (olmo_core.train.Duration attribute) visualize() (olmo_core.data.composable.InstanceSource method) vocab_size (olmo_core.data.tokenizer.TokenizerConfig attribute) vsl_curriculum (olmo_core.data.numpy_dataset.NumpyVSLDatasetConfig attribute) VSLCurriculum (class in olmo_core.data.numpy_dataset) VSLCurriculumConfig (class in olmo_core.data.numpy_dataset) VSLCurriculumType (class in olmo_core.data.numpy_dataset) VSLGrowLinearCurriculum (class in olmo_core.data.numpy_dataset) VSLGrowP2Curriculum (class in olmo_core.data.numpy_dataset) VSLGrowthCurriculum (class in olmo_core.data.numpy_dataset) VSLNaturalCurriculum (class in olmo_core.data.numpy_dataset) W wait (olmo_core.train.callbacks.ProfilerCallback attribute) wait_for() (in module olmo_core.utils) WandBCallback (class in olmo_core.train.callbacks) warmup (olmo_core.train.callbacks.ProfilerCallback attribute) warmup_cache() (olmo_core.nn.attention.AttentionBackend method) (olmo_core.nn.attention.TorchAttentionBackend method) (olmo_core.nn.rope.ComplexRotaryEmbedding method) (olmo_core.nn.rope.FusedRotaryEmbedding method) (olmo_core.nn.rope.RotaryEmbedding method) (olmo_core.nn.rope.RotaryEmbeddingBase method) warn (olmo_core.train.MetricMergeStrategy attribute) webhook_url (olmo_core.train.callbacks.SlackNotifierCallback attribute) weight_decay (olmo_core.optim.DionConfig attribute) (olmo_core.optim.MuonConfig attribute) weka_buckets (olmo_core.launch.beaker.BeakerLaunchConfig attribute) window_size (olmo_core.train.callbacks.StabilityMonitorCallback attribute) with_callback() (olmo_core.train.TrainerConfig method) with_callbacks() (olmo_core.train.TrainerConfig method) with_recommended_evals() (olmo_core.train.TrainerConfig method) with_rope_scaling() (olmo_core.nn.transformer.TransformerConfig method) with_stack (olmo_core.train.callbacks.ProfilerCallback attribute) work_dir (olmo_core.data.composable.SourceABC property) (olmo_core.data.numpy_dataset.NumpyDatasetConfig attribute) (olmo_core.train.Trainer attribute) work_dir_set (olmo_core.data.numpy_dataset.NumpyDatasetBase property) workspace (olmo_core.launch.beaker.BeakerLaunchConfig attribute) (olmo_core.train.callbacks.CometCallback attribute) world_size (olmo_core.model_ladder.DeviceMeshSpec attribute) wrap_numpy_dataset() (olmo_core.data.data_loader.NumpyDataLoaderBase class method) wrapping_strategy (olmo_core.train.train_module.TransformerDataParallelConfig attribute) write_array_to_disk() (in module olmo_core.data.utils) write_document_indices() (in module olmo_core.data.utils) write_file() (olmo_core.train.Checkpointer method) (olmo_core.train.Trainer method) WSD (class in olmo_core.optim) WSDS (class in olmo_core.optim) WSDSChinchillaRunConfigurator (class in olmo_core.model_ladder) Y YaRNRoPEScalingConfig (class in olmo_core.nn.rope) Z z_loss (olmo_core.nn.lm_head.LMOutputWithLoss attribute) zero_grads() (olmo_core.train.train_module.BasicTrainModule method) (olmo_core.train.train_module.TrainModule method) zig_zag (olmo_core.nn.attention.RingAttentionLoadBalancerType attribute)