diff --git a/configs/llm-360-amber1.yaml b/configs/llm-360-amber1.yaml new file mode 100644 index 000000000..17951ce85 --- /dev/null +++ b/configs/llm-360-amber1.yaml @@ -0,0 +1,598 @@ +run_name: amberish1-base +seed: 6198 +dry_run: false + +wandb: + name: ${run_name} + project: olmo-small + group: amberish1-base + +model: + d_model: 2048 + n_heads: 16 + n_layers: 16 + mlp_ratio: 8 + # mlp_hidden_size: 22016 + weight_tying: false + alibi: false + rope: true + flash_attention: true + attention_dropout: 0.0 + attention_layer_norm: false + clip_qkv: null + include_bias: false + block_type: sequential + layer_norm_type: rms + layer_norm_with_affine: false + layer_norm_eps: 1.0e-6 # mantissa dot so YAML 1.1 loaders also resolve this as a float + bias_for_layer_norm: false + attention_layer_norm_with_affine: false + activation_type: swiglu + residual_dropout: 0.0 + embedding_dropout: 0.0 + max_sequence_length: 2048 + vocab_size: 32000 + embedding_size: 32000 # equal to vocab_size + eos_token_id: 2 + pad_token_id: 2 # same id as eos_token_id + init_device: meta + init_fn: normal + init_std: 0.02 + init_cutoff_factor: 3 + +compile: null + +optimizer: + name: adamw + learning_rate: 4.0e-4 + weight_decay: 0.1 + decay_norm_and_bias: true + decay_embeddings: false + eps: 1.0E-08 + betas: + - 0.9 + - 0.95 + metrics_log_interval: 10 + +scheduler: + name: cosine_with_warmup + units: tokens # presumably the unit of t_warmup and t_max; confirm against the scheduler + t_warmup: 4587520000 + t_max: 1.25e+12 # signed exponent so YAML 1.1 loaders also resolve this as a float + alpha_f: 0.1 + warmup_min_lr: 0 + +tokenizer: + identifier: huggyllama/llama-7b + truncate_direction: right + +save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/dustins-stability/${run_name} # same /weka/oe-training-default/ai2-llm root as every other path in this file +# remote_save_folder: /weka/oe-training-default/ai2-llm/checkpoints/OLMo-small/${run_name} +save_overwrite: true + +save_interval: 1000 +save_interval_ephemeral: null +save_num_checkpoints_to_keep: -1 +sharded_checkpointer: olmo_core + +save_interval_unsharded: null +save_num_unsharded_checkpoints_to_keep: -1 + +load_path: null + +max_duration: 2ep +global_train_batch_size: 1120 +device_train_microbatch_size: 4 + +precision: amp_bf16 + 
+fsdp: + wrapping_strategy: by_block_and_size + sharding_strategy: SHARD_GRAD_OP + precision: mixed + +max_grad_norm: 1.0 +max_grad_norm_ratio: null + +speed_monitor: + window_size: 1 + +eval_interval: 1000 +eval_subset_num_batches: -1 +device_eval_batch_size: ${device_train_microbatch_size} # interpolated from device_train_microbatch_size +evaluators: + # - label: all-small-ppl-validation + # data: + # num_workers: 0 + # drop_last: true + # datasets: + # c4_en-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy + # dolma_books-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy + # dolma_common-crawl-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy + # dolma_pes2o-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy + # dolma_reddit-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy + # dolma_stack-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy + # dolma_wiki-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy + # ice-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy + # m2d2_s2orc-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy + # pile-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy + # wikitext_103-validation: + # - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy + + ########################## + # Downstream evaluations 
# + ########################## + - label: piqa + type: downstream + + - label: hellaswag + type: downstream + + - label: winogrande + type: downstream + + - label: openbook_qa + type: downstream + + - label: boolq + type: downstream + + - label: sciq + type: downstream + + - label: arc_easy + type: downstream + + - label: arc_challenge + type: downstream + + - label: copa + type: downstream + + #- label: rte + # type: downstream + + #- label: commitment_bank + # type: downstream + + #- label: sst2 + # type: downstream + + - label: commonsense_qa + type: downstream + + - label: social_iqa + type: downstream + + # NOTE(review): originally "Doesn't work from cache." but basic_arithmetic is enabled further down this list; confirm which is intended. + # - label: basic_arithmetic + # type: downstream + + - label: mmlu_stem_var + type: downstream + + - label: mmlu_humanities_var + type: downstream + + - label: mmlu_social_sciences_var + type: downstream + + - label: mmlu_other_var + type: downstream + + - label: mmlu_stem_mc_5shot + type: downstream + + - label: mmlu_humanities_mc_5shot + type: downstream + + - label: mmlu_social_sciences_mc_5shot + type: downstream + + - label: mmlu_other_mc_5shot + type: downstream + + - label: mmlu_stem_mc_5shot_test + type: downstream + + - label: mmlu_humanities_mc_5shot_test + type: downstream + + - label: mmlu_social_sciences_mc_5shot_test + type: downstream + + - label: mmlu_other_mc_5shot_test + type: downstream + + - label: basic_arithmetic + type: downstream + + - label: trivia_qa_wiki_ppl + type: downstream + + - label: natural_qs_open_ppl + type: downstream + + - label: arc_easy_ppl + type: downstream + +data: + pad_direction: right + num_workers: 32 + drop_last: true + pin_memory: true + prefetch_factor: 8 + persistent_workers: true + timeout: 0 + paths: + ######### Amber ######### + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_000.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_001.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_002.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_003.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_004.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_005.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_006.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_007.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_008.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_009.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_010.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_011.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_012.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_013.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_014.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_015.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_016.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_017.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_018.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_019.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_020.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_021.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_022.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_023.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_024.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_025.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_026.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_027.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_028.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_029.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_030.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_031.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_032.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_033.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_034.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_035.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_036.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_037.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_038.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_039.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_040.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_041.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_042.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_043.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_044.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_045.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_046.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_047.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_048.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_049.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_050.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_051.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_052.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_053.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_054.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_055.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_056.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_057.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_058.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_059.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_060.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_061.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_062.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_063.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_064.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_065.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_066.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_067.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_068.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_069.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_070.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_071.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_072.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_073.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_074.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_075.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_076.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_077.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_078.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_079.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_080.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_081.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_082.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_083.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_084.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_085.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_086.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_087.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_088.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_089.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_090.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_091.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_092.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_093.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_094.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_095.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_096.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_097.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_098.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_099.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_100.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_101.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_102.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_103.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_104.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_105.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_106.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_107.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_108.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_109.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_110.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_111.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_112.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_113.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_114.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_115.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_116.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_117.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_118.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_119.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_120.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_121.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_122.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_123.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_124.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_125.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_126.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_127.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_128.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_129.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_130.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_131.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_132.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_133.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_134.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_135.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_136.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_137.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_138.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_139.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_140.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_141.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_142.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_143.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_144.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_145.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_146.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_147.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_148.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_149.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_150.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_151.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_152.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_153.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_154.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_155.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_156.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_157.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_158.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_159.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_160.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_161.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_162.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_163.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_164.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_165.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_166.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_167.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_168.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_169.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_170.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_171.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_172.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_173.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_174.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_175.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_176.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_177.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_178.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_179.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_180.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_181.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_182.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_183.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_184.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_185.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_186.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_187.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_188.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_189.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_190.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_191.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_192.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_193.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_194.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_195.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_196.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_197.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_198.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_199.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_200.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_201.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_202.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_203.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_204.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_205.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_206.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_207.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_208.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_209.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_210.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_211.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_212.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_213.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_214.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_215.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_216.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_217.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_218.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_219.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_220.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_221.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_222.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_223.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_224.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_225.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_226.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_227.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_228.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_229.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_230.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_231.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_232.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_233.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_234.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_235.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_236.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_237.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_238.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_239.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_240.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_241.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_242.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_243.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_244.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_245.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_246.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_247.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_248.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_249.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_250.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_251.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_252.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_253.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_254.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_255.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_256.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_257.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_258.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_259.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_260.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_261.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_262.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_263.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_264.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_265.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_266.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_267.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_268.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_269.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_270.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_271.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_272.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_273.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_274.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_275.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_276.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_277.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_278.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_279.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_280.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_281.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_282.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_283.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_284.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_285.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_286.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_287.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_288.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_289.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_290.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_291.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_292.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_293.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_294.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_295.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_296.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_297.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_298.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_299.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_300.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_301.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_302.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_303.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_304.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_305.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_306.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_307.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_308.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_309.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_310.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_311.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_312.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_313.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_314.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_315.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_316.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_317.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_318.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_319.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_320.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_321.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_322.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_323.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_324.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_325.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_326.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_327.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_328.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_329.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_330.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_331.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_332.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_333.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_334.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_335.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_336.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_337.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_338.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_339.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_340.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_341.npy + - 
/weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_342.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_343.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_344.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_345.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_346.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_347.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_348.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_349.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_350.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_351.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_352.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_353.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_354.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_355.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_356.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_357.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_358.npy + - /weka/oe-training-default/ai2-llm/preprocessed/llm-360-amber-datasets/olmo_npy/train_359.npy \ No newline at end of file diff --git a/olmo/config.py b/olmo/config.py index 7c294b2db..96c761899 100644 --- a/olmo/config.py +++ b/olmo/config.py @@ -349,6 +349,8 @@ class ModelConfig(BaseConfig): to ``False``. 
""" + layer_norm_eps: float = 1e-05 + attention_layer_norm_with_affine: bool = True """ Toggle affine transform for the QK norms. diff --git a/olmo/model.py b/olmo/model.py index 65c430e78..f902c2463 100644 --- a/olmo/model.py +++ b/olmo/model.py @@ -136,11 +136,10 @@ def __init__( *, size: Optional[int] = None, elementwise_affine: Optional[bool] = True, - eps: float = 1e-05, ): super().__init__() self.config = config - self.eps = eps + self.eps = config.layer_norm_eps self.normalized_shape = (size or config.d_model,) if elementwise_affine or (elementwise_affine is None and self.config.layer_norm_with_affine): self.weight = nn.Parameter(torch.ones(self.normalized_shape, device=config.init_device)) @@ -199,9 +198,8 @@ def __init__( size: Optional[int] = None, low_precision: bool = False, elementwise_affine: Optional[bool] = None, - eps: float = 1e-05, ): - super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps) + super().__init__(config, size=size, elementwise_affine=elementwise_affine) self.low_precision = low_precision def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -230,9 +228,8 @@ def __init__( config: ModelConfig, size: Optional[int] = None, elementwise_affine: Optional[bool] = None, - eps: float = 1e-5, ): - super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps) + super().__init__(config, size=size, elementwise_affine=elementwise_affine) def forward(self, x: torch.Tensor) -> torch.Tensor: with torch.autocast(enabled=False, device_type=x.device.type): diff --git a/pyproject.toml b/pyproject.toml index 18314fc64..2d8927fd1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ requires-python = ">=3.8" license = { file = "LICENSE" } dependencies = [ "numpy", - "torch>=2.1,<2.3", + "torch>=2.1,<=2.3", "ai2-olmo-core==0.1.0", "omegaconf", "rich", diff --git a/scripts/beaker/llamaish7-normal-launch.sh b/scripts/beaker/llamaish7-normal-launch.sh new file mode 100755 index 
000000000..4938615ac --- /dev/null +++ b/scripts/beaker/llamaish7-normal-launch.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=64 + +gantry run \ + --workspace ai2/OLMo-training \ + --task-name llamaish7-normal \ + --description "OLMo medium - 7B - Llamaish Normal" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + --propagate-failure \ + --synchronized-start-timeout 15m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \ + --shared-memory 10GiB \ + --venv base \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/llamaish7-normal.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/llm-360-amber1-launch.sh b/scripts/beaker/llm-360-amber1-launch.sh new file mode 100755 index 000000000..3ac881088 --- /dev/null +++ b/scripts/beaker/llm-360-amber1-launch.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=4 + +gantry run \ + --workspace ai2/OLMo-training \ + --task-name amberish1-base \ + --description "OLMo small - 1B - Amberish with Amber data" \ + --priority urgent \ + --preemptible \ + --beaker-image petew/olmo-torch23-gantry \ + --cluster ai2/jupiter-cirrascale-2 \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --no-nfs \ + --weka oe-training-default:/weka/oe-training-default \ + --propagate-failure \ + --synchronized-start-timeout 20m \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env-secret 
WANDB_API_KEY=DUSTINS_WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=DUSTINS_AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=DUSTINS_AWS_SECRET_ACCESS_KEY \ + --shared-memory 10GiB \ + --venv base \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/llm-360-amber1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK" diff --git a/scripts/beaker/llm-360-amber1.sh b/scripts/beaker/llm-360-amber1.sh new file mode 100755 index 000000000..1cd2cf315 --- /dev/null +++ b/scripts/beaker/llm-360-amber1.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +BEAKER_REPLICA_RANK=$1 +shift + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +curl "https://storage.googleapis.com/hf-cache/huggingface_cache_v4.tar.gz" | tar --keep-newer-files -xzf - +popd +export HF_DATASETS_OFFLINE=1 + + +torchrun \ + --nnodes ${NUM_NODES}:${NUM_NODES} \ + --nproc-per-node 8 \ + --rdzv_id=12347 \ + --rdzv_backend=static \ + --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ + --node_rank=$BEAKER_REPLICA_RANK \ + --rdzv_conf="read_timeout=420" \ + scripts/train.py \ + configs/llm-360-amber1.yaml \ + --gen1_gc_interval=null \ + --save_folder=runs/ \ + --save_interval=1000 \ + --eval_interval=1000 \ + --optimizer.metrics_log_interval=1 \ + --save_overwrite \ + --save_num_checkpoints_to_keep=3 \ + '--load_path=s3://ai2-llm/checkpoints/OLMo-small/${run_name}/step69750/'