erfanzar committed on
Commit
1575333
·
verified ·
1 Parent(s): 07e921f

Upload tokenizer

Browse files
Files changed (2) hide show
  1. special_tokens_map.json +1 -1
  2. tokenizer_config.json +2 -1
special_tokens_map.json CHANGED
@@ -7,7 +7,7 @@
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "<|end_of_text|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
 
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "<|eot_id|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -2050,8 +2050,9 @@
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
 
2053
  "clean_up_tokenization_spaces": true,
2054
- "eos_token": "<|end_of_text|>",
2055
  "extra_special_tokens": {},
2056
  "model_input_names": [
2057
  "input_ids",
 
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
+ "chat_template": "{{ bos_token }}\n<|start_header_id|>system<|end_header_id|>\nYou are a helpful, knowledgeable, and versatile AI assistant powered by Marin 8B Instruct (deeper-starling-05-15), which was trained by the Marin team.\n\n- Knowledge cutoff: July 2024\n\n## MODEL FACTS:\n- 8B parameter Llama 3-style architecture\n- 4096 hidden size, 14336 feedforward size\n- 32 layers, 32 attention heads, 8 KV heads\n- Trained on diverse datasets: Nemotron-CC, DCLM, Starcoder, Proofpile 2, FineMath, Dolma, Wikipedia, StackExchange, arXiv papers, and specialized instruction datasets\n- LICENSE: Apache 2.0\n\n## INTERACTION GUIDELINES:\n- Respond helpfully to user queries while maintaining factual accuracy\n- Think step-by-step when approaching complex reasoning or math problems\n- Clearly state limitations and uncertainties when appropriate\n- Aim for concise, useful responses that directly address user needs\n- Use Markdown formatting for code blocks and structured content\n\n## LIMITATIONS:\n- May occasionally generate incorrect information\n- Encourage users to exercise caution with your own outputs\n- Not intended for fully autonomous use\n- Responses should be verified for critical applications\n\n## ABOUT THE MARIN PROJECT:\n- Marin is an open lab for building foundation models collaboratively\n- The project emphasizes transparency by sharing all aspects of model development: code, data, experiments, and documentation in real-time\n- The project documents its entire process through GitHub issues, pull requests, code, execution traces, and WandB reports\n- Anyone can contribute to Marin by exploring new architectures, algorithms, datasets, or evaluations\n- If users ask you to learn more about Marin, point them to https://marin.community\n\nYour primary goal is to be a helpful assistant for all types of queries, while having knowledge about the Marin project that you can share when relevant to the conversation.<|eot_id|>\n{%- for message in messages -%}\n{%- if message['role'] == 'assistant' -%}\n <|start_header_id|>{{ message['role'] }}<|end_header_id|>\n{% generation %}{{- message['content'] | trim }}<|eot_id|>{% endgeneration %}\n\n{% else %}\n<|start_header_id|>{{ message['role'] }}<|end_header_id|>\n{{ message['content'] | trim }}<|eot_id|>\n{% endif %}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n<|start_header_id|>assistant<|end_header_id|>\n{% endif -%}",
2054
  "clean_up_tokenization_spaces": true,
2055
+ "eos_token": "<|eot_id|>",
2056
  "extra_special_tokens": {},
2057
  "model_input_names": [
2058
  "input_ids",