Commit e0aea51 by peter-sk (parent: 0d1a559)

configs for reproducibility
README.md CHANGED
@@ -12,13 +12,24 @@ pipeline_tag: text-generation
---
# Munin-7B-Open-pt

Munin-7B-Open-pt is a 7 billion parameter language model continually pre-trained from [Comma v0.1-2T](https://huggingface.co/common-pile/comma-v0.1-2t/) on 30B tokens drawn from a mix of [DynaWord](https://huggingface.co/datasets/danish-foundation-models/danish-dynaword) and [the Comma v0.1 dataset](https://huggingface.co/datasets/common-pile/comma_v0.1_training_dataset), both comprising only public domain and openly licensed data.

Munin-7B-Open-pt is a base model that can be used as the starting point for fine-tuning and post-training. It has not been instruction-tuned and cannot be expected to function directly as a chat model.

## Training details

Munin-7B-Open-pt has been trained using the [maester](https://github.com/rlrs/maester) framework developed as part of the [Danish Foundation Models project](https://foundationmodels.dk/). All training was performed on a single 8x Nvidia B200 node (the first of its kind in Denmark).

The training was performed in three stages, with the data-mix (open-stageK.py) and maester (open-stageK.toml) configuration files available in each stage's subfolder. The three pre-training stages are detailed in the following table:

| Stage | Batch size | Steps | HF path | Data mix | Comments |
|-|-|-|-|-|-|
| stage1 | 262,144 tok | 37,852 | [subfolder="stage1"](https://huggingface.co/danish-foundation-models/munin-7b-open-pt/tree/main/stage1) | 2/3 [DynaWord](https://huggingface.co/datasets/danish-foundation-models/danish-dynaword/tree/9e230b35e31a510e5ab909112ad5bfc9463b2c23); <br> 1/3 [Common-Pile](https://huggingface.co/common-pile/comma_v0.1_training_dataset/5afc546db324e7f39f297ba757c9a60547151e7c/) | Excludes depbank, jvj, nordjyllandnews, synne from DynaWord; <br> uses subsets and weighting from the [Comma-v0.1-2T](https://huggingface.co/common-pile/comma-v0.1-2t) cooldown phase for Common-Pile; LR schedule with 1000 steps warmup, constant 1e-5, 1000 steps cooldown |
| stage2 | 524,288 tok | 18,926 | [subfolder="stage2"](https://huggingface.co/danish-foundation-models/munin-7b-open-pt/tree/main/stage2) | 2/3 [DynaWord](https://huggingface.co/datasets/danish-foundation-models/danish-dynaword/tree/9e230b35e31a510e5ab909112ad5bfc9463b2c23); <br> 1/3 [Common-Pile](https://huggingface.co/common-pile/comma_v0.1_training_dataset/5afc546db324e7f39f297ba757c9a60547151e7c/) | Excludes depbank, jvj, nordjyllandnews, synne from DynaWord; <br> uses subsets and weighting from the [Comma-v0.1-2T](https://huggingface.co/common-pile/comma-v0.1-2t) cooldown phase for Common-Pile; LR schedule with 500 steps warmup, constant 1e-5, 500 steps cooldown |
| stage3 | 524,288 tok | 18,926 | [subfolder="stage3"](https://huggingface.co/danish-foundation-models/munin-7b-open-pt/tree/main/stage3) | 2/3 [DynaWord](https://huggingface.co/datasets/danish-foundation-models/danish-dynaword/tree/9e230b35e31a510e5ab909112ad5bfc9463b2c23); <br> 1/3 [Common-Pile](https://huggingface.co/common-pile/comma_v0.1_training_dataset/5afc546db324e7f39f297ba757c9a60547151e7c/) | Excludes depbank, jvj, nordjyllandnews, synne from DynaWord; <br> uses subsets and weighting from the [Comma-v0.1-2T](https://huggingface.co/common-pile/comma-v0.1-2t) cooldown phase for Common-Pile; LR schedule with 500 steps warmup, square root decay from 1e-5 |

## Limitations

Munin-7B-Open-pt was trained only on Danish- and English-language data and on code from the 15 programming languages covered by the [stack-edu classifiers](https://huggingface.co/collections/HuggingFaceTB/the-ultimate-collection-of-code-classifiers-67b5aa3eb8994a4b71453005).
It will likely perform poorly on other natural languages and programming languages.

As a base model, Munin-7B-Open-pt has not been aligned for safety and may, for example, reflect social biases present in its training data or provide toxic or harmful information.
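## Example usage

A minimal loading sketch, assuming the per-stage subfolders hold checkpoints in standard Hugging Face format and that the Comma v0.1 tokenizer named in the training configs is the one to pair with them (illustrative only, not an official snippet from this repository):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumptions: stage checkpoints load via the standard `subfolder` argument,
# and the tokenizer comes from the Comma v0.1-2T repo named in the configs.
tokenizer = AutoTokenizer.from_pretrained("common-pile/comma-v0.1-2t")
model = AutoModelForCausalLM.from_pretrained(
    "danish-foundation-models/munin-7b-open-pt", subfolder="stage1"
)

inputs = tokenizer("Danmark er", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Dropping the `subfolder` argument should load whatever checkpoint is stored at the repository root.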
stage1/open-stage1.py ADDED
@@ -0,0 +1,82 @@
dyna_train = {
    "adl": 1.0,
    "ai-aktindsigt": 1.0,
    "botxt": 1.0,
    "cellar": 1.0,
    "dannet": 1.0,
    "danske-taler": 1.0,
    "domsdatabasen": 1.0,
    "enevaeldens_nyheder": 1.0,
    "ep": 1.0,
    "eur-lex-sum-da": 1.0,
    "fm-udgivelser": 1.0,
    "ft": 1.0,
    "grundtvig": 1.0,
    "gutenberg": 1.0,
    "health_hovedstaden": 1.0,
    "hest": 1.0,
    "historical-danish-handwriting": 1.0,
    "memo": 1.0,
    "miljoeportalen": 1.0,
    "naat": 1.0,
    "ncc_books": 1.0,
    "ncc_maalfrid": 1.0,
    "ncc_newspaper": 1.0,
    "ncc_parliament": 1.0,
    "nota": 1.0,
    "opensubtitles": 1.0,
    "relig": 1.0,
    "retsinformationdk": 1.0,
    "skat": 1.0,
    "retspraksis": 1.0,
    "spont": 1.0,
    "tv2r": 1.0,
    "wiki-comments": 1.0,
    "wikibooks": 1.0,
    "wikipedia": 1.0,
    "wikisource": 1.0,
}

dyna_test = {
    "depbank": 1.0,
    "jvj": 1.0,
    "nordjyllandnews": 1.0,
    "synne": 1.0,
}

cp_train = {
    "arxiv_papers": 0.5,
    "cccc": 0.3,
    "data_provenance_initiative": 2,
    "doab": 2,
    "foodista": 2,
    "libretexts": 2,
    "news": 2,
    "oercommons": 2,
    "peS2o": 0.1,
    "pressbooks": 2,
    "public_domain_review": 2,
    "python_enhancement_proposals": 2,
    "stackexchange": 0.25,
    "stackv2_edu": 0.1,
    "wikimedia": 0.4,
}

sources = {
    "dyna": {
        "uri": "hf://datasets/danish-foundation-models/danish-dynaword/data/{key}/*.parquet",
        "format": "parquet",
        "shards": 1,
        "shard_index": 0,
        "train": dyna_train,
        "test": dyna_test,
    },
    "cp": {
        "uri": "hf://datasets/common-pile/comma_v0.1_training_dataset/{key}/*.jsonl.gz",
        "format": "json",
        "shards": 16,
        "shard_index": 0,
        "train": cp_train,
        "test": {},
    },
}
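The open-stageK.py files are plain Python data-mix definitions: per-subset sampling weights for DynaWord and Common-Pile, plus `sources` entries whose `uri` templates are expanded per subset key (the Common-Pile side is split into 16 shards; stages 1-3 use shard 0, 1, and 2 respectively). The maester tooling that consumes these files is not part of this commit, so the following is only a rough sketch of the template expansion, not the actual pipeline:

```python
# Illustration only: expand the {key} placeholders in `sources` into concrete
# Hugging Face globs paired with their sampling weights. Assumes the config
# above is on disk as stage1/open-stage1.py.
import runpy

cfg = runpy.run_path("stage1/open-stage1.py")
for name, src in cfg["sources"].items():
    for key, weight in src["train"].items():
        print(f"{name}\tweight={weight}\t{src['uri'].format(key=key)}")
# e.g. dyna  weight=1.0  hf://datasets/danish-foundation-models/danish-dynaword/data/adl/*.parquet
```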
stage1/open-stage1.toml ADDED
@@ -0,0 +1,42 @@
model_name = "llama3"
flavor = "Comma7B"
tokenizer_name = "common-pile/comma-v0.1-2t"

# job
job_name = "munin-7b-open-stage1"
wandb_project = "munin-7b-open-stage1"
enable_wandb = false

# parallelism
num_nodes = 1
data_parallel_shard_degree = 8
data_parallel_replicate_degree = 1

# training settings
train_batch_size = 8
seq_len = 4096
train_num_steps = 37852
scheduler = "linear_warmup_constant_sqrt_decay"
warmup_steps = 1000
cooldown_steps = 1000
checkpoint_interval = 1000
forced_load_path = "/work/training/maester/comma-v0.1-2t-dcp/"
compile = true
enable_cut_cross_entropy = false
ac_mode = "none"
selective_ac_option = "op"

[dataset]
bos_token = 2
eos_token = 1
data_dirs = [
    "/work/production/data/munin-open-dyna-0-of-1-cp-0-of-16-train/",
]
dataset_weights = "1.0"

[opt_cfg] # must specify *all* fields here, will not merge with defaults
lr = 1e-5
betas = [0.9, 0.95]
weight_decay = 0.1
eps = 1e-9
fused = true
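For a cross-check against the batch sizes in the README table, here is a small sketch of the per-step token count. It assumes (my reading, not stated in the config) that `train_batch_size` counts sequences per data-parallel shard; that reading reproduces the reported numbers exactly:

```python
# Stage 1 values from open-stage1.toml.
seq_len = 4096
train_batch_size = 8          # sequences per data-parallel shard (assumption)
dp_shards = 8                 # data_parallel_shard_degree
grad_accum = 1                # stages 2-3 set gradient_accumulation_steps = 2

tokens_per_step = seq_len * train_batch_size * dp_shards * grad_accum
print(tokens_per_step)        # 262144 -> the "262,144 tok" batch size for stage1
print(tokens_per_step * 2)    # 524288 -> matches stages 2 and 3
```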
stage2/open-stage2.py ADDED
@@ -0,0 +1,82 @@
dyna_train = {
    "adl": 1.0,
    "ai-aktindsigt": 1.0,
    "botxt": 1.0,
    "cellar": 1.0,
    "dannet": 1.0,
    "danske-taler": 1.0,
    "domsdatabasen": 1.0,
    "enevaeldens_nyheder": 1.0,
    "ep": 1.0,
    "eur-lex-sum-da": 1.0,
    "fm-udgivelser": 1.0,
    "ft": 1.0,
    "grundtvig": 1.0,
    "gutenberg": 1.0,
    "health_hovedstaden": 1.0,
    "hest": 1.0,
    "historical-danish-handwriting": 1.0,
    "memo": 1.0,
    "miljoeportalen": 1.0,
    "naat": 1.0,
    "ncc_books": 1.0,
    "ncc_maalfrid": 1.0,
    "ncc_newspaper": 1.0,
    "ncc_parliament": 1.0,
    "nota": 1.0,
    "opensubtitles": 1.0,
    "relig": 1.0,
    "retsinformationdk": 1.0,
    "skat": 1.0,
    "retspraksis": 1.0,
    "spont": 1.0,
    "tv2r": 1.0,
    "wiki-comments": 1.0,
    "wikibooks": 1.0,
    "wikipedia": 1.0,
    "wikisource": 1.0,
}

dyna_test = {
    "depbank": 1.0,
    "jvj": 1.0,
    "nordjyllandnews": 1.0,
    "synne": 1.0,
}

cp_train = {
    "arxiv_papers": 0.5,
    "cccc": 0.3,
    "data_provenance_initiative": 2,
    "doab": 2,
    "foodista": 2,
    "libretexts": 2,
    "news": 2,
    "oercommons": 2,
    "peS2o": 0.1,
    "pressbooks": 2,
    "public_domain_review": 2,
    "python_enhancement_proposals": 2,
    "stackexchange": 0.25,
    "stackv2_edu": 0.1,
    "wikimedia": 0.4,
}

sources = {
    "dyna": {
        "uri": "hf://datasets/danish-foundation-models/danish-dynaword/data/{key}/*.parquet",
        "format": "parquet",
        "shards": 1,
        "shard_index": 0,
        "train": dyna_train,
        "test": dyna_test,
    },
    "cp": {
        "uri": "hf://datasets/common-pile/comma_v0.1_training_dataset/{key}/*.jsonl.gz",
        "format": "json",
        "shards": 16,
        "shard_index": 1,
        "train": cp_train,
        "test": {},
    },
}
stage2/open-stage2.toml ADDED
@@ -0,0 +1,44 @@
model_name = "llama3"
flavor = "Comma7B"
tokenizer_name = "common-pile/comma-v0.1-2t"

# job
job_name = "munin-7b-open-stage2"
wandb_project = "munin-7b-open-stage2"
enable_wandb = false

# parallelism
num_nodes = 1
data_parallel_shard_degree = 8
data_parallel_replicate_degree = 1

# training settings
train_batch_size = 8
gradient_accumulation_steps = 2
gradient_accumulation_sync_each_step = true
seq_len = 4096
train_num_steps = 18926 # 37852 // 2
scheduler = "linear_warmup_constant_sqrt_decay"
warmup_steps = 500
cooldown_steps = 500
checkpoint_interval = 1000
forced_load_path = "/work/training/maester/jobs/munin-7b-open-stage1/checkpoints/step-37852/"
compile = true
enable_cut_cross_entropy = false
ac_mode = "none"
selective_ac_option = "op"

[dataset]
bos_token = 2
eos_token = 1
data_dirs = [
    "/work/production/data/dsk-open-dyna-0-of-1-cp-1-of-16-train/",
]
dataset_weights = "1.0"

[opt_cfg] # must specify *all* fields here, will not merge with defaults
lr = 1e-5
betas = [0.9, 0.95]
weight_decay = 0.1
eps = 1e-9
fused = true
stage3/open-stage3.py ADDED
@@ -0,0 +1,82 @@
dyna_train = {
    "adl": 1.0,
    "ai-aktindsigt": 1.0,
    "botxt": 1.0,
    "cellar": 1.0,
    "dannet": 1.0,
    "danske-taler": 1.0,
    "domsdatabasen": 1.0,
    "enevaeldens_nyheder": 1.0,
    "ep": 1.0,
    "eur-lex-sum-da": 1.0,
    "fm-udgivelser": 1.0,
    "ft": 1.0,
    "grundtvig": 1.0,
    "gutenberg": 1.0,
    "health_hovedstaden": 1.0,
    "hest": 1.0,
    "historical-danish-handwriting": 1.0,
    "memo": 1.0,
    "miljoeportalen": 1.0,
    "naat": 1.0,
    "ncc_books": 1.0,
    "ncc_maalfrid": 1.0,
    "ncc_newspaper": 1.0,
    "ncc_parliament": 1.0,
    "nota": 1.0,
    "opensubtitles": 1.0,
    "relig": 1.0,
    "retsinformationdk": 1.0,
    "skat": 1.0,
    "retspraksis": 1.0,
    "spont": 1.0,
    "tv2r": 1.0,
    "wiki-comments": 1.0,
    "wikibooks": 1.0,
    "wikipedia": 1.0,
    "wikisource": 1.0,
}

dyna_test = {
    "depbank": 1.0,
    "jvj": 1.0,
    "nordjyllandnews": 1.0,
    "synne": 1.0,
}

cp_train = {
    "arxiv_papers": 0.5,
    "cccc": 0.3,
    "data_provenance_initiative": 2,
    "doab": 2,
    "foodista": 2,
    "libretexts": 2,
    "news": 2,
    "oercommons": 2,
    "peS2o": 0.1,
    "pressbooks": 2,
    "public_domain_review": 2,
    "python_enhancement_proposals": 2,
    "stackexchange": 0.25,
    "stackv2_edu": 0.1,
    "wikimedia": 0.4,
}

sources = {
    "dyna": {
        "uri": "hf://datasets/danish-foundation-models/danish-dynaword/data/{key}/*.parquet",
        "format": "parquet",
        "shards": 1,
        "shard_index": 0,
        "train": dyna_train,
        "test": dyna_test,
    },
    "cp": {
        "uri": "hf://datasets/common-pile/comma_v0.1_training_dataset/{key}/*.jsonl.gz",
        "format": "json",
        "shards": 16,
        "shard_index": 2,
        "train": cp_train,
        "test": {},
    },
}
stage3/open-stage3.toml ADDED
@@ -0,0 +1,44 @@
model_name = "llama3"
flavor = "Comma7B"
tokenizer_name = "common-pile/comma-v0.1-2t"

# job
job_name = "munin-7b-open-stage3"
wandb_project = "munin-7b-open-stage3"
enable_wandb = false

# parallelism
num_nodes = 1
data_parallel_shard_degree = 8
data_parallel_replicate_degree = 1

# training settings
train_batch_size = 8
gradient_accumulation_steps = 2
gradient_accumulation_sync_each_step = true
seq_len = 4096
train_num_steps = 18926 # 37852 // 2
scheduler = "linear_warmup_constant_sqrt_decay"
warmup_steps = 500
cooldown_steps = 18426
checkpoint_interval = 1000
forced_load_path = "/work/training/maester/jobs/munin-7b-open-stage2/checkpoints/step-18926/"
compile = true
enable_cut_cross_entropy = false
ac_mode = "none"
selective_ac_option = "op"

[dataset]
bos_token = 2
eos_token = 1
data_dirs = [
    "/work/production/data/dsk-open-dyna-0-of-1-cp-2-of-16-train/",
]
dataset_weights = "1.0"

[opt_cfg] # must specify *all* fields here, will not merge with defaults
lr = 1e-5
betas = [0.9, 0.95]
weight_decay = 0.1
eps = 1e-9
fused = true
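Finally, a quick arithmetic check (mine, not from the model card) that the three stages together account for the roughly 30B tokens stated in the README:

```python
# Tokens per stage = steps * global batch size in tokens (from the README table).
stage1 = 37_852 * 262_144
stage2 = 18_926 * 524_288
stage3 = 18_926 * 524_288

print(stage1, stage2, stage3)    # 9,922,674,688 tokens each (~9.9B)
print(stage1 + stage2 + stage3)  # 29,768,024,064 tokens, i.e. roughly 30B
```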