smikulas committed
Commit 7c74201 · 1 Parent(s): ccceab9

Baseline Qwen3-0.6B-Base
.gitignore ADDED
@@ -0,0 +1 @@
+ .DS_Store
README.md ADDED
@@ -0,0 +1,58 @@
+ ---
+ license: apache-2.0
+ library_name: transformers
+ pipeline_tag: text-generation
+ ---
+ # Qwen3-0.6B-Base
+
+ ## Qwen3 Highlights
+
+ Qwen3 is the latest generation of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models.
+ Building upon extensive advancements in training data, model architecture, and optimization techniques, Qwen3 delivers the following key improvements over the previously released Qwen2.5:
+
+ - **Expanded Higher-Quality Pre-training Corpus:** Qwen3 is pre-trained on 36 trillion tokens across 119 languages, tripling the language coverage of Qwen2.5, with a much richer mix of high-quality data, including coding, STEM, reasoning, book, multilingual, and synthetic data.
+ - **Training Techniques and Model Architecture:** Qwen3 incorporates a series of training techniques and architectural refinements, including global-batch load balancing loss for MoE models and qk layernorm for all models (sketched after this list), leading to improved stability and overall performance.
+ - **Three-stage Pre-training:** Stage 1 focuses on broad language modeling and general knowledge acquisition, Stage 2 improves reasoning skills such as STEM, coding, and logical reasoning, and Stage 3 enhances long-context comprehension by extending training sequence lengths up to 32k tokens.
+ - **Scaling Law Guided Hyperparameter Tuning:** Through comprehensive scaling law studies across the three-stage pre-training pipeline, Qwen3 systematically tunes critical hyperparameters, such as the learning rate scheduler and batch size, separately for dense and MoE models, resulting in better training dynamics and final performance across different model scales.
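+
+ As a rough illustration of the qk layernorm mentioned above, the sketch below applies an RMSNorm over the head dimension to the query and key vectors of every head before attention scores are computed, which keeps the attention logits bounded and stabilizes training. This is a minimal sketch under our own assumptions (module names, shapes, and the absence of rotary embeddings are all illustrative), not Qwen's actual implementation:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ # Illustrative qk-layernorm attention (requires PyTorch >= 2.4 for nn.RMSNorm).
+ # Output projection and rotary embeddings are omitted for brevity.
+ class QKNormAttention(nn.Module):
+     def __init__(self, hidden_size: int, num_heads: int, head_dim: int):
+         super().__init__()
+         self.num_heads, self.head_dim = num_heads, head_dim
+         self.q_proj = nn.Linear(hidden_size, num_heads * head_dim, bias=False)
+         self.k_proj = nn.Linear(hidden_size, num_heads * head_dim, bias=False)
+         self.v_proj = nn.Linear(hidden_size, num_heads * head_dim, bias=False)
+         self.q_norm = nn.RMSNorm(head_dim)  # normalizes each head's query vector
+         self.k_norm = nn.RMSNorm(head_dim)  # normalizes each head's key vector
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         b, t, _ = x.shape
+         q = self.q_proj(x).view(b, t, self.num_heads, self.head_dim)
+         k = self.k_proj(x).view(b, t, self.num_heads, self.head_dim)
+         v = self.v_proj(x).view(b, t, self.num_heads, self.head_dim)
+         q, k = self.q_norm(q), self.k_norm(k)  # the qk layernorm step
+         q, k, v = (z.transpose(1, 2) for z in (q, k, v))
+         out = nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
+         return out.transpose(1, 2).reshape(b, t, -1)
+ ```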
+
+ ## Model Overview
+
+ **Qwen3-0.6B-Base** has the following features:
+ - Type: Causal Language Models
+ - Training Stage: Pretraining
+ - Number of Parameters: 0.6B
+ - Number of Parameters (Non-Embedding): 0.44B
+ - Number of Layers: 28
+ - Number of Attention Heads (GQA): 16 for Q and 8 for KV
+ - Context Length: 32,768
+
+ For more details, including benchmark evaluation, hardware requirements, and inference performance, please refer to our [blog](https://qwenlm.github.io/blog/qwen3/), [GitHub](https://github.com/QwenLM/Qwen3), and [Documentation](https://qwen.readthedocs.io/en/latest/).
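+
+ The figures above can be cross-checked against the shipped `config.json`. A minimal sketch (the `Qwen/Qwen3-0.6B-Base` repo id is an assumption; substitute this repository's own id to inspect the local checkpoint):
+
+ ```python
+ from transformers import AutoConfig
+
+ # Fetches config.json only; no weights are downloaded.
+ config = AutoConfig.from_pretrained("Qwen/Qwen3-0.6B-Base")
+ print(config.num_hidden_layers)        # 28
+ print(config.num_attention_heads)      # 16 query heads
+ print(config.num_key_value_heads)      # 8 KV heads (GQA)
+ print(config.max_position_embeddings)  # 32768
+ ```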
+
+ ## Requirements
+
+ The code for Qwen3 is included in the latest Hugging Face `transformers`, and we advise you to use the latest version of `transformers`.
+
+ With `transformers<4.51.0`, you will encounter the following error:
+ ```
+ KeyError: 'qwen3'
+ ```
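+
+ A minimal usage sketch (not part of the original card; it assumes `transformers>=4.51.0` and the upstream `Qwen/Qwen3-0.6B-Base` checkpoint id):
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_id = "Qwen/Qwen3-0.6B-Base"  # assumed upstream id
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+
+ # A base model does plain text completion; no chat template is applied.
+ inputs = tokenizer("The capital of France is", return_tensors="pt")
+ outputs = model.generate(**inputs, max_new_tokens=32)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```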
+
+ ## Evaluation & Performance
+
+ Detailed evaluation results are reported in this [📑 blog](https://qwenlm.github.io/blog/qwen3/).
+
+ ### Citation
+
+ If you find our work helpful, feel free to cite us.
+
+ ```
+ @misc{qwen3technicalreport,
+       title={Qwen3 Technical Report},
+       author={Qwen Team},
+       year={2025},
+       eprint={2505.09388},
+       archivePrefix={arXiv},
+       primaryClass={cs.CL},
+       url={https://arxiv.org/abs/2505.09388},
+ }
+ ```
added_tokens.json DELETED
@@ -1,28 +0,0 @@
- {
-   "</think>": 151668,
-   "</tool_call>": 151658,
-   "</tool_response>": 151666,
-   "<think>": 151667,
-   "<tool_call>": 151657,
-   "<tool_response>": 151665,
-   "<|box_end|>": 151649,
-   "<|box_start|>": 151648,
-   "<|endoftext|>": 151643,
-   "<|file_sep|>": 151664,
-   "<|fim_middle|>": 151660,
-   "<|fim_pad|>": 151662,
-   "<|fim_prefix|>": 151659,
-   "<|fim_suffix|>": 151661,
-   "<|im_end|>": 151645,
-   "<|im_start|>": 151644,
-   "<|image_pad|>": 151655,
-   "<|object_ref_end|>": 151647,
-   "<|object_ref_start|>": 151646,
-   "<|quad_end|>": 151651,
-   "<|quad_start|>": 151650,
-   "<|repo_name|>": 151663,
-   "<|video_pad|>": 151656,
-   "<|vision_end|>": 151653,
-   "<|vision_pad|>": 151654,
-   "<|vision_start|>": 151652
- }
config.json CHANGED
@@ -1,12 +1,32 @@
  {
    "architectures": [
-     "AutoModelForCausalLM"
+     "Qwen3ForCausalLM"
    ],
-   "model_type": "qwen",
-   "auto_map": {
-     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel",
-     "AutoTokenizer": "tokenization_qwen.QWenTokenizer"
-   },
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151643,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_type": "qwen3",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.51.0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936,
    "rag_config": {
      "retriever_type": "faiss",
      "embedding_model": "smikulas/MNLP_M2_document_encoder",
faiss_index.bin DELETED
Binary file (4.65 kB)
 
generation_config.json CHANGED
@@ -1,6 +1,7 @@
  {
    "bos_token_id": 151643,
+   "do_sample": false,
    "eos_token_id": 151643,
    "max_new_tokens": 2048,
-   "transformers_version": "4.51.3"
- }
+   "transformers_version": "4.37.0"
+ }
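
The `do_sample` and `max_new_tokens` fields above become the checkpoint's default decoding settings: `generate()` reads them from `generation_config.json` unless overridden per call. A minimal sketch of that behavior (`your-org/Qwen3-0.6B-Base` is a placeholder id for this checkpoint):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model_id = "your-org/Qwen3-0.6B-Base"  # placeholder repo id
gen_config = GenerationConfig.from_pretrained(model_id)
print(gen_config.do_sample, gen_config.max_new_tokens)  # expect: False 2048

model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("Hello", return_tensors="pt")

# Per-call keyword arguments override the file defaults (greedy -> sampling here):
outputs = model.generate(**inputs, do_sample=True, temperature=0.7, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```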
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d1881eca1d5c57b04779f2c2755b1992d4064877f0f76ffaa79fe805b11808b9
- size 2384234968
+ oid sha256:cd2a512003e2f9f3cd3c32a9c3573f820bb28c940f73c57b1ddaa983d9223eba
+ size 1192135096
rag_documents.jsonl DELETED
@@ -1,3 +0,0 @@
- {"text": "The Transformer architecture introduced self-attention to model dependencies without regard to distance."}
- {"text": "Retrieval-Augmented Generation (RAG) enhances generation by retrieving documents relevant to the input query."}
- {"text": "BERT is a transformer model pre-trained on a large corpus and fine-tuned for specific NLP tasks."}
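
The deleted `rag_documents.jsonl` and `faiss_index.bin` were the retrieval-side artifacts behind the `rag_config` block that `config.json` still carries. A minimal sketch of how such an index is typically built (assumptions not confirmed by this repo: the document encoder loads as a sentence-transformers model, and the index is a flat inner-product FAISS index):

```python
import json

import faiss
from sentence_transformers import SentenceTransformer

# Encode each JSONL document with the configured embedding model.
encoder = SentenceTransformer("smikulas/MNLP_M2_document_encoder")
with open("rag_documents.jsonl") as f:
    docs = [json.loads(line)["text"] for line in f]
embeddings = encoder.encode(docs, normalize_embeddings=True)

# Normalized vectors + inner product = cosine-similarity search.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
faiss.write_index(index, "faiss_index.bin")
```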
special_tokens_map.json DELETED
@@ -1,31 +0,0 @@
- {
-   "additional_special_tokens": [
-     "<|im_start|>",
-     "<|im_end|>",
-     "<|object_ref_start|>",
-     "<|object_ref_end|>",
-     "<|box_start|>",
-     "<|box_end|>",
-     "<|quad_start|>",
-     "<|quad_end|>",
-     "<|vision_start|>",
-     "<|vision_end|>",
-     "<|vision_pad|>",
-     "<|image_pad|>",
-     "<|video_pad|>"
-   ],
-   "eos_token": {
-     "content": "<|endoftext|>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": {
-     "content": "<|endoftext|>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   }
- }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
- size 11422654
+ oid sha256:c0382117ea329cdf097041132f6d735924b697924d6f6fc3945713e96ce87539
+ size 7031645
tokenizer_config.json CHANGED
@@ -231,10 +231,9 @@
    "clean_up_tokenization_spaces": false,
    "eos_token": "<|endoftext|>",
    "errors": "replace",
-   "extra_special_tokens": {},
    "model_max_length": 131072,
    "pad_token": "<|endoftext|>",
    "split_special_tokens": false,
    "tokenizer_class": "Qwen2Tokenizer",
    "unk_token": null
- }
+ }