Upload tokenizer

Browse files

Files changed (5) hide show

.gitattributes +1 -0
chat_template.jinja +44 -0
special_tokens_map.json +33 -0
tokenizer.json +3 -0
tokenizer_config.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,44 @@

+{# ───── header (system message) ─────
+   Emits one fixed ChatML-style system turn ahead of the conversation:
+   "<|im_start|>system" + newline, the metadata/instruction text, then
+   "<|im_end|>" + newline. Every expression below uses whitespace-stripping
+   delimiters, so the template's own indentation and line breaks are dropped;
+   only the newlines inside the string literals are rendered.
+   NOTE(review): the "Objective: ..." literal has no trailing newline, so it
+   renders fused with the next literal ("...DisinformationYou are an
+   assistant..."). The prompt text also contains the typos "places,cor" and
+   "seemlessly". Left byte-identical here because the model was presumably
+   trained against this exact text; confirm before correcting.
+   NOTE(review): no BOS token is emitted and turns end with "<|im_end|>",
+   while special_tokens_map.json declares <bos>/<eos>; verify that the
+   tokenizer config registers the ChatML markers and the intended EOS. #}
+{{- "<|im_start|>system
+" -}}
+  {{- "## Metadata
+" -}}
+  {{- "Role: Climate Expert
+" -}}
+  {{- "Objective: Detecting Climate Related Disinformation" -}}
+  {{- "You are an assistant helping editors to moderate TV and radio content. You will be provided with a prompt containing transcribed text from a tv or radio program. Bear in mind that the transcript may be missing punctuation and may be of very low quality, with incorrect vocabulary, cuts in the wrong places,cor may include some phonetic transcription. Even if the text is not in english, analyze it seemlessly.
+Task: Determine if the text promotes climate change misinformation that undermines well-established scientific consensus, such as denying the existence of climate change or the factors that contribute to it.
+Instructions:
+1) Your verdict should be true or false.
+2) Format your verdict always using XML tags like this: <misinformation>verdict</misinformation>" -}}
+  {{- "
+" -}}
+{{- "<|im_end|>
+" -}}
+{# ───── main loop ─────
+   Renders every user/assistant message as
+   "<|im_start|>" + role + newline + content + "<|im_end|>" + newline.
+   Messages with any other role (including "system") produce no output, and
+   non-string content is rendered as an empty string. Leading newlines are
+   removed from assistant content.
+   Output expressions use whitespace-stripping delimiters so the template's
+   indentation and line breaks never reach the rendered prompt. This matters
+   inside the generation block: its tags do not strip surrounding whitespace,
+   so a plain expression there would render stray spaces and newlines around
+   the assistant turn, unlike the generation-prompt branch. #}
+{%- for message in messages -%}
+    {%- set content = message.content if message.content is string else "" -%}
+    {%- if message.role == "user" -%}
+        {{- "<|im_start|>" + "user
+"  + content + "<|im_end|>
+" -}}
+    {%- elif message.role == "assistant" -%}
+        {% generation %}
+        {{- "<|im_start|>assistant
+" + content.lstrip("
+") + "<|im_end|>
+" -}}
+        {% endgeneration %}
+    {%- endif -%}
+{%- endfor -%}
+{# ───── generation prompt ─────
+   When add_generation_prompt is truthy, open an assistant turn for the model
+   to complete; otherwise render nothing. #}
+{{- "<|im_start|>assistant
+" if add_generation_prompt else "" -}}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff