smikulas committed
Commit 7c74201 · 1 Parent(s): ccceab9

Baseline Qwen3-0.6B-Base
.gitignore ADDED
@@ -0,0 +1 @@
+ .DS_Store
README.md ADDED
@@ -0,0 +1,58 @@
+ ---
+ license: apache-2.0
+ library_name: transformers
+ pipeline_tag: text-generation
+ ---
+ # Qwen3-0.6B-Base
+
+ ## Qwen3 Highlights
+
+ Qwen3 is the latest generation of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models.
+ Building upon extensive advancements in training data, model architecture, and optimization techniques, Qwen3 delivers the following key improvements over the previously released Qwen2.5:
+
+ - **Expanded Higher-Quality Pre-training Corpus:** Qwen3 is pre-trained on 36 trillion tokens across 119 languages, tripling the language coverage of Qwen2.5, with a much richer mix of high-quality data, including coding, STEM, reasoning, book, multilingual, and synthetic data.
+ - **Training Techniques and Model Architecture:** Qwen3 incorporates a series of training techniques and architectural refinements, including global-batch load balancing loss for MoE models and qk layernorm for all models (sketched after this list), leading to improved stability and overall performance.
+ - **Three-stage Pre-training:** Stage 1 focuses on broad language modeling and general knowledge acquisition, Stage 2 improves reasoning skills such as STEM, coding, and logical reasoning, and Stage 3 enhances long-context comprehension by extending training sequence lengths up to 32k tokens.
+ - **Scaling Law Guided Hyperparameter Tuning:** Through comprehensive scaling law studies across the three-stage pre-training pipeline, Qwen3 systematically tunes critical hyperparameters, such as the learning rate scheduler and batch size, separately for dense and MoE models, resulting in better training dynamics and final performance across different model scales.
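+
+ As a rough illustration of the qk layernorm mentioned above, the sketch below applies an RMSNorm over the head dimension to the query and key vectors of every head before attention scores are computed, which keeps the attention logits bounded and stabilizes training. This is a minimal sketch under our own assumptions (module names, shapes, and the absence of rotary embeddings are all illustrative), not Qwen's actual implementation:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ # Illustrative qk-layernorm attention (requires PyTorch >= 2.4 for nn.RMSNorm).
+ # Output projection and rotary embeddings are omitted for brevity.
+ class QKNormAttention(nn.Module):
+     def __init__(self, hidden_size: int, num_heads: int, head_dim: int):
+         super().__init__()
+         self.num_heads, self.head_dim = num_heads, head_dim
+         self.q_proj = nn.Linear(hidden_size, num_heads * head_dim, bias=False)
+         self.k_proj = nn.Linear(hidden_size, num_heads * head_dim, bias=False)
+         self.v_proj = nn.Linear(hidden_size, num_heads * head_dim, bias=False)
+         self.q_norm = nn.RMSNorm(head_dim)  # normalizes each head's query vector
+         self.k_norm = nn.RMSNorm(head_dim)  # normalizes each head's key vector
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         b, t, _ = x.shape
+         q = self.q_proj(x).view(b, t, self.num_heads, self.head_dim)
+         k = self.k_proj(x).view(b, t, self.num_heads, self.head_dim)
+         v = self.v_proj(x).view(b, t, self.num_heads, self.head_dim)
+         q, k = self.q_norm(q), self.k_norm(k)  # the qk layernorm step
+         q, k, v = (z.transpose(1, 2) for z in (q, k, v))
+         out = nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
+         return out.transpose(1, 2).reshape(b, t, -1)
+ ```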
+
+ ## Model Overview
+
+ **Qwen3-0.6B-Base** has the following features:
+ - Type: Causal Language Models
+ - Training Stage: Pretraining
+ - Number of Parameters: 0.6B
+ - Number of Parameters (Non-Embedding): 0.44B
+ - Number of Layers: 28
+ - Number of Attention Heads (GQA): 16 for Q and 8 for KV
+ - Context Length: 32,768
+
+ For more details, including benchmark evaluation, hardware requirements, and inference performance, please refer to our [blog](https://qwenlm.github.io/blog/qwen3/), [GitHub](https://github.com/QwenLM/Qwen3), and [Documentation](https://qwen.readthedocs.io/en/latest/).
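+
+ The figures above can be cross-checked against the shipped `config.json`. A minimal sketch (the `Qwen/Qwen3-0.6B-Base` repo id is an assumption; substitute this repository's own id to inspect the local checkpoint):
+
+ ```python
+ from transformers import AutoConfig
+
+ # Fetches config.json only; no weights are downloaded.
+ config = AutoConfig.from_pretrained("Qwen/Qwen3-0.6B-Base")
+ print(config.num_hidden_layers)        # 28
+ print(config.num_attention_heads)      # 16 query heads
+ print(config.num_key_value_heads)      # 8 KV heads (GQA)
+ print(config.max_position_embeddings)  # 32768
+ ```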
+
+ ## Requirements
+
+ The code for Qwen3 is included in the latest Hugging Face `transformers`, and we advise you to use the latest version of `transformers`.
+
+ With `transformers<4.51.0`, you will encounter the following error:
+ ```
+ KeyError: 'qwen3'
+ ```
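+
+ A minimal usage sketch (not part of the original card; it assumes `transformers>=4.51.0` and the upstream `Qwen/Qwen3-0.6B-Base` checkpoint id):
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_id = "Qwen/Qwen3-0.6B-Base"  # assumed upstream id
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+
+ # A base model does plain text completion; no chat template is applied.
+ inputs = tokenizer("The capital of France is", return_tensors="pt")
+ outputs = model.generate(**inputs, max_new_tokens=32)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```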
+
+ ## Evaluation & Performance
+
+ Detailed evaluation results are reported in this [📑 blog](https://qwenlm.github.io/blog/qwen3/).
+
+ ### Citation
+
+ If you find our work helpful, feel free to cite us.
+
+ ```
+ @misc{qwen3technicalreport,
+       title={Qwen3 Technical Report},
+       author={Qwen Team},
+       year={2025},
+       eprint={2505.09388},
+       archivePrefix={arXiv},
+       primaryClass={cs.CL},
+       url={https://arxiv.org/abs/2505.09388},
+ }
+ ```
added_tokens.json DELETED
@@ -1,28 +0,0 @@
- {
-   "</think>": 151668,
-   "</tool_call>": 151658,
-   "</tool_response>": 151666,
-   "<think>": 151667,
-   "<tool_call>": 151657,
-   "<tool_response>": 151665,
-   "<|box_end|>": 151649,
-   "<|box_start|>": 151648,
-   "<|endoftext|>": 151643,
-   "<|file_sep|>": 151664,
-   "<|fim_middle|>": 151660,
-   "<|fim_pad|>": 151662,
-   "<|fim_prefix|>": 151659,
-   "<|fim_suffix|>": 151661,
-   "<|im_end|>": 151645,
-   "<|im_start|>": 151644,
-   "<|image_pad|>": 151655,
-   "<|object_ref_end|>": 151647,
-   "<|object_ref_start|>": 151646,
-   "<|quad_end|>": 151651,
-   "<|quad_start|>": 151650,
-   "<|repo_name|>": 151663,
-   "<|video_pad|>": 151656,
-   "<|vision_end|>": 151653,
-   "<|vision_pad|>": 151654,
-   "<|vision_start|>": 151652
- }
config.json CHANGED
@@ -1,12 +1,32 @@
  {
    "architectures": [
-     "AutoModelForCausalLM"
+     "Qwen3ForCausalLM"
    ],
-   "model_type": "qwen",
-   "auto_map": {
-     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel",
-     "AutoTokenizer": "tokenization_qwen.QWenTokenizer"
-   },
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151643,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_type": "qwen3",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.51.0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936,
    "rag_config": {
      "retriever_type": "faiss",
      "embedding_model": "smikulas/MNLP_M2_document_encoder",
faiss_index.bin DELETED
Binary file (4.65 kB)
 
generation_config.json CHANGED
@@ -1,6 +1,7 @@
  {
    "bos_token_id": 151643,
+   "do_sample": false,
    "eos_token_id": 151643,
    "max_new_tokens": 2048,
-   "transformers_version": "4.51.3"
- }
+   "transformers_version": "4.37.0"
+ }
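
The `do_sample` and `max_new_tokens` fields above become the checkpoint's default decoding settings: `generate()` reads them from `generation_config.json` unless overridden per call. A minimal sketch of that behavior (`your-org/Qwen3-0.6B-Base` is a placeholder id for this checkpoint):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model_id = "your-org/Qwen3-0.6B-Base"  # placeholder repo id
gen_config = GenerationConfig.from_pretrained(model_id)
print(gen_config.do_sample, gen_config.max_new_tokens)  # expect: False 2048

model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("Hello", return_tensors="pt")

# Per-call keyword arguments override the file defaults (greedy -> sampling here):
outputs = model.generate(**inputs, do_sample=True, temperature=0.7, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```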
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d1881eca1d5c57b04779f2c2755b1992d4064877f0f76ffaa79fe805b11808b9
- size 2384234968
+ oid sha256:cd2a512003e2f9f3cd3c32a9c3573f820bb28c940f73c57b1ddaa983d9223eba
+ size 1192135096
rag_documents.jsonl DELETED
@@ -1,3 +0,0 @@
- {"text": "The Transformer architecture introduced self-attention to model dependencies without regard to distance."}
- {"text": "Retrieval-Augmented Generation (RAG) enhances generation by retrieving documents relevant to the input query."}
- {"text": "BERT is a transformer model pre-trained on a large corpus and fine-tuned for specific NLP tasks."}
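
The deleted `rag_documents.jsonl` and `faiss_index.bin` were the retrieval-side artifacts behind the `rag_config` block that `config.json` still carries. A minimal sketch of how such an index is typically built (assumptions not confirmed by this repo: the document encoder loads as a sentence-transformers model, and the index is a flat inner-product FAISS index):

```python
import json

import faiss
from sentence_transformers import SentenceTransformer

# Encode each JSONL document with the configured embedding model.
encoder = SentenceTransformer("smikulas/MNLP_M2_document_encoder")
with open("rag_documents.jsonl") as f:
    docs = [json.loads(line)["text"] for line in f]
embeddings = encoder.encode(docs, normalize_embeddings=True)

# Normalized vectors + inner product = cosine-similarity search.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
faiss.write_index(index, "faiss_index.bin")
```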
special_tokens_map.json DELETED
@@ -1,31 +0,0 @@
- {
-   "additional_special_tokens": [
-     "<|im_start|>",
-     "<|im_end|>",
-     "<|object_ref_start|>",
-     "<|object_ref_end|>",
-     "<|box_start|>",
-     "<|box_end|>",
-     "<|quad_start|>",
-     "<|quad_end|>",
-     "<|vision_start|>",
-     "<|vision_end|>",
-     "<|vision_pad|>",
-     "<|image_pad|>",
-     "<|video_pad|>"
-   ],
-   "eos_token": {
-     "content": "<|endoftext|>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": {
-     "content": "<|endoftext|>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   }
- }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
- size 11422654
+ oid sha256:c0382117ea329cdf097041132f6d735924b697924d6f6fc3945713e96ce87539
+ size 7031645
tokenizer_config.json CHANGED
@@ -231,10 +231,9 @@
    "clean_up_tokenization_spaces": false,
    "eos_token": "<|endoftext|>",
    "errors": "replace",
-   "extra_special_tokens": {},
    "model_max_length": 131072,
    "pad_token": "<|endoftext|>",
    "split_special_tokens": false,
    "tokenizer_class": "Qwen2Tokenizer",
    "unk_token": null
- }
+ }