Robertp423 committed
Commit 2d9cfed · verified · 1 Parent(s): f102bd1

Upload 4 files

Files changed (4)
  1. README.md +7 -75
  2. config.json +2 -1
  3. special_tokens_map.json +2 -2
  4. tokenizer_config.json +18 -4
README.md CHANGED
@@ -5,89 +5,21 @@ OpenSeek-Small-v1 is the initial production model of the OpenSeek project.
  - Utilizes DeepSeek-V3-like MoE architecture.
  - Comprises 1.4 billion total parameters, with 0.4 billion activated parameters.
  - Trained on 720 billion tokens.
- - Demonstrates superior efficiency compared to 1-billion-parameter models.
+ - Completely broken in stock form.

- ## Training Data
- - 0.72 trillion tokens of high-quality pretraining data; the ratio for each domain is as follows:
- | Name | Ratio |
- |-------------------------------------------|---------|
- | Nemotron-CC-high-actual-actual-high | 1.26 |
- | Nemotron-CC-high-actual-actual-low | 0.67 |
- | Nemotron-CC-high-actual-actual-mid | 2.05 |
- | Nemotron-CC-high-synthetic-distill-high | 1.59 |
- | Nemotron-CC-high-synthetic-distill-low | 0.64 |
- | Nemotron-CC-high-synthetic-distill-mid | 2.32 |
- | Nemotron-CC-high-synthetic-diverse_qa_pairs-high | 4.67 |
- | Nemotron-CC-high-synthetic-diverse_qa_pairs-low | 2.16 |
- | Nemotron-CC-high-synthetic-diverse_qa_pairs-mid | 7.58 |
- | Nemotron-CC-high-synthetic-extract_knowledge-high | 6.43 |
- | Nemotron-CC-high-synthetic-extract_knowledge-low | 0.07 |
- | Nemotron-CC-high-synthetic-extract_knowledge-mid | 2.22 |
- | Nemotron-CC-high-synthetic-knowledge_list-high | 1.88 |
- | Nemotron-CC-high-synthetic-knowledge_list-low | 0.74 |
- | Nemotron-CC-high-synthetic-knowledge_list-mid | 3.20 |
- | Nemotron-CC-high-synthetic-wrap_medium-high | 3.89 |
- | Nemotron-CC-high-synthetic-wrap_medium-low | 0.65 |
- | Nemotron-CC-high-synthetic-wrap_medium-mid | 6.18 |
- | Nemotron-CC-low-synthetic-wrap_medium-high | 0.17 |
- | Nemotron-CC-low-synthetic-wrap_medium-low | 0.30 |
- | Nemotron-CC-low-synthetic-wrap_medium-mid | 1.08 |
- | Nemotron-CC-medium-actual-actual-high | 2.20 |
- | Nemotron-CC-medium-actual-actual-low | 4.48 |
- | Nemotron-CC-medium-actual-actual-mid | 7.76 |
- | arxiv | 0.32 |
- | books | 1.98 |
- | code | 3.43 |
- | cot_synthesis_CC | 9.82 |
- | cot_synthesis_OpenSource | 0.46 |
- | cot_synthesis_arxiv | 4.15 |
- | cot_synthesis_code | 1.32 |
- | cot_synthesis_math | 2.19 |
- | cot_synthesis_wiki | 0.83 |
- | math | 0.83 |
- | pes2o | 0.31 |
- | stack | 0.19 |
- | wiki | 0.29 |
- | zh_cc | 9.65 |
-
- ## Wandb
- Our training curves have been recorded in Weights & Biases [wandb](https://wandb.ai/openseek-team/OpenSeek-Small-v1).
-
- ## Evaluation
- | Category | Metrics (shots) | Llama-3.2-1B | Qwen2.5-1.5B | Qwen2.5-0.5B | OLMo-1B-0724 | OpenSeek-Small-v1 |
- |------------------------------|-------------------|--------------|--------------|--------------|---------------|-------------------|
- | **English-Commonsense Reasoning** | HellaSwag (5-shot) | 0.4830 | 0.5007 | 0.4007 | 0.4909 | 0.3893 |
- | | TruthfulQA (0-shot) | 0.3773 | 0.4663 | 0.3986 | 0.4029 | 0.3990 |
- | | Winogrande (5-shot) | 0.6212 | 0.6448 | 0.5683 | 0.6290 | 0.5541 |
- | | CommonsenseQA (5-shot) | 0.3120 | 0.7445 | 0.5487 | 0.1949 | 0.2048 |
- | | PIQA (5-shot) | 0.7514 | 0.7612 | 0.7111 | 0.7459 | 0.7203 |
- | | OpenBookQA (5-shot) | 0.2960 | 0.3340 | 0.2720 | 0.3080 | 0.2560 |
- | | BoolQ (5-shot) | 0.6590 | 0.7774 | 0.6572 | 0.6508 | 0.6165 |
- | **English-Problem-Solving** | ARC Easy (5-shot) | 0.6940 | 0.8043 | 0.6780 | 0.6111 | 0.6237 |
- | | ARC Challenge (5-shot) | 0.3532 | 0.4846 | 0.3370 | 0.3063 | 0.3157 |
- | | MMLU (5-shot) | 0.3124 | 0.6165 | 0.4818 | 0.2869 | 0.2654 |
- | **English-Mathematics** | GSM8K (5-shot) | 0.0637 | 0.6194 | 0.3495 | 0.0159 | 0.0182 |
- | | Minerva Math (4-shot) | 0.0180 | 0.2876 | 0.1160 | 0.0182 | 0.0010 |
- | **Chinese** | CEval (5-shot) | 0.2779 | 0.6954 | 0.5423 | 0.2340 | 0.2422 |
- | | CMMLU (5-shot) | 0.2687 | 0.6882 | 0.5300 | 0.2570 | 0.2468 |
- | **Average Metrics** | **Average-English (w/o Math)** | 0.4859 | 0.6134 | 0.5053 | 0.4627 | 0.4345 |
- | | **Average-English** | 0.4118 | 0.5868 | 0.4599 | 0.3884 | 0.3637 |
- | | **Average-Chinese** | 0.2733 | 0.6918 | 0.5362 | 0.2455 | 0.2445 |
- | | **Average** | 0.3920 | 0.6018 | 0.4708 | 0.3680 | 0.3466 |
- | | **Average (w/o Math)** | 0.4505 | 0.6265 | 0.5105 | 0.4265 | 0.4028 |
-
- OpenSeek-Small-v1 demonstrates superior efficiency compared to 1-billion-parameter models.
-
- <img src="logC_vs_Metric_Average_scatter_plot.png" alt="logC_vs_Metric_Average" width="400"/>
+ Key fixes in this repository:
+ - Fixed broken position embeddings.
+ - Fixed fundamental incompatibilities between the DeepSeek-V3 model and the Qwen tokenizer.

  ## Usage Instructions
  ```python
  from transformers import AutoModelForCausalLM, AutoTokenizer

- model = AutoModelForCausalLM.from_pretrained("BAAI/OpenSeek-Small-v1", trust_remote_code=True)
- tokenizer = AutoTokenizer.from_pretrained("BAAI/OpenSeek-Small-v1", trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained("Robertp423/OpenSeek-Fixed", trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained("Robertp423/OpenSeek-Fixed", trust_remote_code=True)

  inputs = tokenizer("The future of AI is", return_tensors="pt")
+ inputs.pop("token_type_ids", None)  # Critical fix: the Qwen tokenizer emits token_type_ids the model does not accept
  outputs = model.generate(**inputs, max_length=50)
  print(tokenizer.decode(outputs[0]))
  ```
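The usage snippet above can be packaged as a reusable call. The following is a minimal sketch, assuming the Robertp423/OpenSeek-Fixed repository id shown in the README and that its remote code loads cleanly; the generate_text helper is illustrative only and not part of the repository.

```python
# Minimal sketch of the README workaround; generate_text is a hypothetical helper.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("Robertp423/OpenSeek-Fixed", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("Robertp423/OpenSeek-Fixed", trust_remote_code=True)

def generate_text(prompt: str, max_length: int = 50) -> str:
    inputs = tokenizer(prompt, return_tensors="pt")
    # Drop token_type_ids: the Qwen tokenizer returns them, but the DeepSeek-V3
    # forward pass does not accept them (the "Critical fix" from the README).
    inputs.pop("token_type_ids", None)
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0])

print(generate_text("The future of AI is"))
```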
config.json CHANGED
@@ -19,7 +19,6 @@
  "initializer_range": 0.006,
  "intermediate_size": 7168,
  "kv_lora_rank": 512,
- "position_embedding": "rope_64dim",
  "max_position_embeddings": 4096,
  "model_type": "deepseek_v3",
  "moe_intermediate_size": 896,
@@ -34,6 +33,8 @@
  "num_key_value_heads": 10,
  "num_nextn_predict_layers": 1,
  "pretraining_tp": 1,
+ "problematic_params": ["token_type_ids"],
+ "position_embedding_type": "rope",
  "q_lora_rank": null,
  "qk_nope_head_dim": 128,
  "qk_rope_head_dim": 64,
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
  {
  "bos_token": "<|extra_203|>",
  "eos_token": "<|extra_204|>",
- "unk_token": "[UNK]",
- "pad_token": "[PAD]"
+ "unk_token": "<|endoftext|>",
+ "pad_token": "<|endoftext|>"
  }
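With the BERT-style [UNK]/[PAD] entries replaced by <|endoftext|>, the unknown and padding tokens now point at a special token the Qwen tokenizer actually defines. A minimal check, assuming the Robertp423/OpenSeek-Fixed repository id from the README:

```python
# Sketch: verify the special-token remapping after this commit (assumed repo id).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Robertp423/OpenSeek-Fixed", trust_remote_code=True)

print(tokenizer.bos_token)  # <|extra_203|>
print(tokenizer.eos_token)  # <|extra_204|>
print(tokenizer.unk_token)  # <|endoftext|> (was [UNK])
print(tokenizer.pad_token)  # <|endoftext|> (was [PAD])
```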
tokenizer_config.json CHANGED
@@ -1,11 +1,25 @@
  {
- "model_max_length": 8192,
+ "model_max_length": 4096,
  "tokenizer_class": "QWenTokenizer",
  "auto_map": {
  "AutoTokenizer": [
  "tokenization_qwen.QWenTokenizer",
  null
- ]
+ ]
  },
- "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
- }
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "added_tokens_map": {
+ "<|endoftext|>": {
+ "content": "<|endoftext|>",
+ "single_word": false
+ },
+ "<|im_start|>": {
+ "content": "<|im_start|>",
+ "single_word": false
+ },
+ "<|im_end|>": {
+ "content": "<|im_end|>",
+ "single_word": false
+ }
+ }
+ }
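The chat_template kept here follows the ChatML-style <|im_start|>/<|im_end|> format, and model_max_length is now aligned with the model's max_position_embeddings of 4096. A minimal sketch of rendering a prompt with it, assuming the Robertp423/OpenSeek-Fixed repository id from the README and a transformers release recent enough to expose apply_chat_template:

```python
# Sketch: render a ChatML-style prompt with the tokenizer's chat_template (assumed repo id).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Robertp423/OpenSeek-Fixed", trust_remote_code=True)
print(tokenizer.model_max_length)  # 4096 after this commit

messages = [{"role": "user", "content": "Summarize the OpenSeek project in one sentence."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# Expected shape of the rendered prompt, per the template above:
# <|im_start|>system
# You are a helpful assistant<|im_end|>
# <|im_start|>user
# Summarize the OpenSeek project in one sentence.<|im_end|>
# <|im_start|>assistant
```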