Upload 9 files
Browse files- .gitattributes +1 -0
- genai_config.json +56 -0
- model.data +3 -0
- model.onnx +3 -0
- prompts.txt +2 -0
- special_tokens_map.json +6 -0
- tokenization_chatglm.py +506 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +117 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
model.data filter=lfs diff=lfs merge=lfs -text
|
genai_config.json
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"bos_token_id": 0,
|
4 |
+
"context_length": 8192,
|
5 |
+
"decoder": {
|
6 |
+
"session_options": {
|
7 |
+
"log_id": "onnxruntime-genai",
|
8 |
+
"custom_ops_library": "onnxruntime_vitis_ai_custom_ops.dll",
|
9 |
+
"provider_options": [
|
10 |
+
{
|
11 |
+
"VitisAI": {
|
12 |
+
"config_file": ".\\libs\\vaip_llm.json"
|
13 |
+
}
|
14 |
+
}
|
15 |
+
]
|
16 |
+
},
|
17 |
+
"filename": "model.onnx",
|
18 |
+
"head_size": 128,
|
19 |
+
"hidden_size": 4096,
|
20 |
+
"inputs": {
|
21 |
+
"input_ids": "input_ids",
|
22 |
+
"attention_mask": "attention_mask",
|
23 |
+
"past_key_names": "past_key_values.%d.key",
|
24 |
+
"past_value_names": "past_key_values.%d.value"
|
25 |
+
},
|
26 |
+
"outputs": {
|
27 |
+
"logits": "logits",
|
28 |
+
"present_key_names": "present.%d.key",
|
29 |
+
"present_value_names": "present.%d.value"
|
30 |
+
},
|
31 |
+
"num_attention_heads": 32,
|
32 |
+
"num_hidden_layers": 28,
|
33 |
+
"num_key_value_heads": 2
|
34 |
+
},
|
35 |
+
"eos_token_id": 2,
|
36 |
+
"pad_token_id": 0,
|
37 |
+
"type": "chatglm",
|
38 |
+
"vocab_size": 65024
|
39 |
+
},
|
40 |
+
"search": {
|
41 |
+
"diversity_penalty": 0.0,
|
42 |
+
"do_sample": false,
|
43 |
+
"early_stopping": true,
|
44 |
+
"length_penalty": 1.0,
|
45 |
+
"max_length": 8192,
|
46 |
+
"min_length": 0,
|
47 |
+
"no_repeat_ngram_size": 0,
|
48 |
+
"num_beams": 1,
|
49 |
+
"num_return_sequences": 1,
|
50 |
+
"past_present_share_buffer": true,
|
51 |
+
"repetition_penalty": 1.0,
|
52 |
+
"temperature": 1.0,
|
53 |
+
"top_k": 1,
|
54 |
+
"top_p": 1.0
|
55 |
+
}
|
56 |
+
}
|
model.data
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a8e60c0f2bd5314b3b51269d293f6a0f7775020cf6e9c1d8bbf057c48e10ff64
|
3 |
+
size 3791874048
|
model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:67931c5992f4721670bc6f73491da788c977f2546dedb76697d497d6cecd54a7
|
3 |
+
size 213850
|
prompts.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
2048------------------------------
|
2 |
+
In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and suggest the most likely diagnosis. It could also recommend a personalized treatment plan based on the patientΓÇÖs genetic makeup, lifestyle, and other factors. This level of precision medicine has the potential to improve patient outcomes significantly. However, the implementation of AI in healthcare is not without its challenges. One of the main concerns is the ethical implications of using AI in such a sensitive field. For instance, who is responsible if an AI system makes a wrong diagnosis? How do we ensure that AI systems are not biased in their decision-making processes? Moreover, there are concerns about patient privacy and the security of sensitive medical data. 
Another challenge is the integration of AI systems into existing healthcare infrastructure. Many healthcare systems are already burdened with outdated technology and limited resources. Integrating advanced AI systems into these environments requires significant investment and training. Additionally, there is the question of how AI will impact the roles of healthcare professionals. While AI can assist doctors and nurses in their work, there is concern that it could also lead to job displacement. Despite these challenges, the potential benefits of AI in healthcare are immense. For example, AI-powered robots are already being used in some hospitals to assist with surgeries. These robots can perform delicate procedures with a level of precision that is difficult for humans to achieve. AI is also being used to develop new drugs and treatment plans. By analyzing the molecular structure of diseases, AI can help researchers identify potential treatments faster than traditional methods. Moreover, AI has In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. 
This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and suggest the most likely diagnosis. It could also recommend a personalized treatment plan based on the patientΓÇÖs genetic makeup, lifestyle, and other factors. This level of precision medicine has the potential to improve patient outcomes significantly. However, the implementation of AI in healthcare is not without its challenges. One of the main concerns is the ethical implications of using AI in such a sensitive field. For instance, who is responsible if an AI system makes a wrong diagnosis? How do we ensure that AI systems are not biased in their decision-making processes? Moreover, there are concerns about patient privacy and the security of sensitive medical data. Another challenge is the integration of AI systems into existing healthcare infrastructure. Many healthcare systems are already burdened with outdated technology and limited resources. Integrating advanced AI systems into these environments requires significant investment and training. Additionally, there is the question of how AI will impact the roles of healthcare professionals. While AI can assist doctors and nurses in their work, there is concern that it could also lead to job displacement. Despite these challenges, the potential benefits of AI in healthcare are immense. For example, AI-powered robots are already being used in some hospitals to assist with surgeries. These robots can perform delicate procedures with a level of precision that is difficult for humans to achieve. AI is also being used to develop new drugs and treatment plans. 
By analyzing the molecular structure of diseases, AI can help researchers identify potential treatments faster than traditional methods. Moreover, AI has In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and suggest the most likely diagnosis. It could also recommend a personalized treatment plan based on the patientΓÇÖs genetic makeup, lifestyle, and other factors. This level of precision medicine has the potential to improve patient outcomes significantly. However, the implementation of AI in healthcare is not without its challenges. One of the main concerns is the ethical implications of using AI in such a sensitive field. For instance, who is responsible if an AI system makes a wrong diagnosis? How do we ensure that AI systems are not biased in their decision-making processes? 
Moreover, there are concerns about patient privacy and the security of sensitive medical data. Another challenge is the integration of AI systems into existing healthcare infrastructure. Many healthcare systems are already burdened with outdated technology and limited resources. Integrating advanced AI systems into these environments requires significant investment and training. Additionally, there is the question of how AI will impact the roles of healthcare professionals. While AI can assist doctors and nurses in their work, there is concern that it could also lead to job displacement. Despite these challenges, the potential benefits of AI in healthcare are immense. For example, AI-powered robots are already being used in some hospitals to assist with surgeries. These robots can perform delicate procedures with a level of precision that is difficult for humans to achieve. AI is also being used to develop new drugs and treatment plans. By analyzing the molecular structure of diseases, AI can help researchers identify potential treatments faster than traditional methods. Moreover, AI has In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. 
AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and suggest the most likely diagnosis. It could also recommend a personalized treatment plan based on the patientΓÇÖs genetic makeup, lifestyle, and other factors. This level of precision medicine has the potential to improve patient outcomes significantly. 
However, the implementation of AI in healthcare is not without its challenges. One of the main concerns is the ethical implications of using AI in such a sensitive field. For instance, who is responsible if an AI system makes a wrong diagnosis? How do we ensure that AI systems are not biased in their decision-making processes? Moreover, there are concerns about patient privacy and the security of sensitive medical data. Another challenge is the integration of AI systems into existing healthcare infrastructure. Many healthcare systems are already burdened with outdated technology and limited resources. Integrating advanced AI systems into these environments requires significant investment and training. Additionally, there is the question of how AI will impact the roles of healthcare professionals. While AI can assist doctors and nurses in their work, there is concern that it could also lead to job displacement. Despite these challenges, the potential benefits of AI in healthcare are immense. For example, AI-powered robots are already being used in some hospitals to assist with surgeries. These robots can perform delicate procedures with a level of precision that is difficult for humans to achieve. AI is also being used to develop new drugs and treatment plans. By analyzing the molecular structure of diseases, AI can help researchers identify potential treatments faster than traditional methods. Moreover, AI has In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. 
One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and suggest the most likely diagnosis. It could also recommend a personalized treatment plan based on the patientΓÇÖs genetic makeup, lifestyle, and other factors. This level of precision medicine has the potential to improve patient outcomes significantly. However, the implementation of AI in healthcare is not without its challenges. One of the main concerns is the ethical implications of using AI in such a sensitive field. For instance, who is responsible if an AI system makes a wrong diagnosis? How do we ensure that AI systems are not biased in their decision-making processes? Moreover, there are concerns about patient privacy and the security of sensitive medical data. Another challenge is the integration of AI systems into existing healthcare infrastructure. Many healthcare systems are already burdened with outdated technology and limited resources. Integrating advanced AI systems into these environments requires significant investment and training. Additionally, there is the question of how AI will impact the roles of healthcare professionals. While AI can assist doctors and nurses in their work, there is concern that it could also lead to job displacement. 
Despite these challenges, the potential benefits of AI in healthcare are immense. For example, AI-powered robots are already being used in some hospitals to assist with surgeries. These robots can perform delicate procedures with a level of precision that is difficult for humans to achieve. AI is also being used to develop new drugs and treatment plans. By analyzing the molecular structure of diseases, AI can help researchers identify potential treatments faster than traditional methods. Moreover, AI has In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and suggest the most likely diagnosis. It could also recommend a personalized treatment plan based on the patientΓÇÖs genetic makeup, lifestyle, and other factors. 
This level of precision medicine has the potential to improve patient outcomes significantly. However, the implementation of AI in healthcare is not without its challenges. One of the main concerns is the ethical implications of using AI in such a sensitive field. For instance, who is responsible if an AI system makes a wrong diagnosis? How do we ensure that AI systems are not biased in their decision-making processes? Moreover, there are concerns about patient privacy and the security of sensitive medical data. Another challenge is the integration of AI systems into existing healthcare infrastructure. Many healthcare systems are already burdened with outdated technology and limited resources. Integrating advanced AI systems into these environments requires significant investment and training. Additionally, there is the question of how AI will impact the roles of healthcare professionals. While AI can assist doctors and nurses in their work, there is concern that it could also lead to job displacement. Despite these challenges, the potential benefits of AI in healthcare are immense. For example, AI-powered robots are already being used in some hospitals to assist with surgeries. These robots can perform delicate procedures with a level of precision that is difficult for humans to achieve. AI is also being used to develop new drugs and treatment plans. By analyzing the molecular structure of diseases, AI can help researchers identify potential treatments faster than traditional methods. Moreover, AI has In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. 
Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. 
For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and suggest the most likely diagnosis. It could also recommend a personalized treatment plan based on the patientΓÇÖs genetic makeup, lifestyle, and other factors. This level of precision medicine has the potential to improve patient outcomes significantly. However, the implementation of AI in healthcare is not without its challenges. One of the main concerns is the ethical implications of using AI in such a sensitive field. For instance, who is responsible if an AI system makes a wrong diagnosis? How do we ensure that AI systems are not biased in their decision-making processes? Moreover, there are concerns about patient privacy and the security of sensitive medical data. Another challenge is the integration of AI systems into existing healthcare infrastructure. Many healthcare systems are already burdened with outdated technology and limited resources. Integrating advanced AI systems into these environments requires significant investment and training. Additionally, there is the question of how AI will impact the roles of healthcare professionals. While AI can assist doctors and nurses in their work, there is concern that it could also lead to job displacement. Despite these challenges, the potential benefits of AI in healthcare are immense. For example, AI-powered robots are already being used in some hospitals to assist with surgeries. These robots can perform delicate procedures with a level of precision that is difficult for humans to achieve. AI is also being used to develop new drugs and treatment plans. By analyzing the molecular structure of diseases, AI can help researchers identify potential treatments faster than traditional methods. 
Moreover, AI has In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and suggest the most likely diagnosis. It could also recommend a personalized treatment plan based on the patientΓÇÖs genetic makeup, lifestyle, and other factors. This level of precision medicine has the potential to improve patient outcomes significantly. However, the implementation of AI in healthcare is not without its challenges. One of the main concerns is the ethical implications of using AI in such a sensitive field. For instance, who is responsible if an AI system makes a wrong diagnosis? How do we ensure that AI systems are not biased in their decision-making processes? Moreover, there are concerns about patient privacy and the security of sensitive medical data. 
Another challenge is the integration of AI systems into existing healthcare infrastructure. Many healthcare systems are already burdened with outdated technology and limited resources. Integrating advanced AI systems into these environments requires significant investment and training. Additionally, there is the question of how AI will impact the roles of healthcare professionals. While AI can assist doctors and nurses in their work, there is concern that it could also lead to job displacement. Despite these challenges, the potential benefits of AI in healthcare are immense. For example, AI-powered robots are already being used in some hospitals to assist with surgeries. These robots can perform delicate procedures with a level of precision that is difficult for humans to achieve. AI is also being used to develop new drugs and treatment plans. By analyzing the molecular structure of diseases, AI can help researchers identify potential treatments faster than traditional methods. Moreover, AI has In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. 
This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and suggest the most likely diagnosis. It could also recommend a personalized treatment plan based on the patientΓÇÖs genetic makeup, lifestyle, and other factors. This level of precision medicine has the potential to improve patient outcomes significantly. However, the implementation of AI in healthcare is not without its challenges. One of the main concerns is the ethical implications of using AI in such a sensitive field. For instance, who is responsible if an AI system makes a wrong diagnosis? How do we ensure that AI systems are not biased in their decision-making processes? Moreover, there are concerns about patient privacy and the security of sensitive medical data. Another challenge is the integration of AI systems into existing healthcare infrastructure. Many healthcare systems are already burdened with outdated technology and limited resources. Integrating advanced AI systems into these environments requires significant investment and training. Additionally, there is the question of how AI will impact the roles of healthcare professionals. While AI can assist doctors and nurses in their work, there is concern that it could also lead to job displacement. Despite these challenges, the potential benefits of AI in healthcare are immense. For example, AI-powered robots are already being used in some hospitals to assist with surgeries. These robots can perform delicate procedures with a level of precision that is difficult for humans to achieve. AI is also being used to develop new drugs and treatment plans. 
By analyzing the molecular structure of diseases, AI can help researchers identify potential treatments faster than traditional methods. Moreover, AI has In recent years, artificial intelligence (AI) has revolutionized numerous industries, with healthcare being one of the most promising fields. The integration of AI in healthcare systems has the potential to transform patient care, diagnostics, and treatment plans. Imagine a world where AI-powered algorithms can predict diseases before they manifest, provide personalized treatment plans based on genetic information, and even assist in complex surgeries with unparalleled precision. One of the most significant advantages of AI in healthcare is its ability to analyze vast amounts of data quickly and accurately. Traditional methods of data analysis in healthcare often involve manual processes that are time-consuming and prone to human error. AI, on the other hand, can sift through millions of patient records, medical images, and research papers in a fraction of the time it would take a human. This ability to process and analyze big data allows for more accurate diagnoses and more effective treatment plans. For example, consider the case of a patient presenting with symptoms that could indicate several different conditions. An AI system could analyze the patientΓÇÖs medical history, compare it with millions of other cases, and
|
special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<s>",
|
3 |
+
"eos_token": "</s>",
|
4 |
+
"pad_token": "<unk>",
|
5 |
+
"unk_token": "<unk>"
|
6 |
+
}
|
tokenization_chatglm.py
ADDED
@@ -0,0 +1,506 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
from typing import List, Optional, Union, Dict
|
5 |
+
from sentencepiece import SentencePieceProcessor
|
6 |
+
from transformers import AddedToken, PreTrainedTokenizer, PreTrainedTokenizerFast
|
7 |
+
from transformers.convert_slow_tokenizer import (
|
8 |
+
SLOW_TO_FAST_CONVERTERS,
|
9 |
+
SpmConverter,
|
10 |
+
decoders,
|
11 |
+
normalizers,
|
12 |
+
pre_tokenizers,
|
13 |
+
processors,
|
14 |
+
)
|
15 |
+
from transformers.utils import logging, PaddingStrategy
|
16 |
+
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
|
17 |
+
|
18 |
+
|
19 |
+
logger = logging.get_logger(__name__)
|
20 |
+
|
21 |
+
# Control tokens that SPTokenizer appends after the SentencePiece vocabulary.
# Ids are assigned consecutively in exactly this order, so the order matters.
ADDITIONAL_SPECIAL_TOKENS = [
    # mask tokens
    "[MASK]", "[gMASK]", "[sMASK]",
    # sop / eop markers
    "<!sop!>", "<!eop!>",
    # conversation role markers
    "<|system|>", "<|user|>", "<|assistant|>", "<|observation|>",
]

# Tokens placed in front of every sequence by build_inputs_with_special_tokens
# (slow tokenizer) and by the TemplateProcessing post-processor (fast tokenizer).
PREFIX_TOKENS = [
    "[gMASK]",
    "<!sop!>",
]

# Marker string that the fast tokenizer's normalizer rewrites to "▁".
DUMMY_PREFIX_INDICATOR_FOR_FAST = "<!dummy-prefix!>"
|
35 |
+
|
36 |
+
|
37 |
+
class SPTokenizer:
    """SentencePiece wrapper that adds the ChatGLM control tokens.

    The control tokens listed in ``ADDITIONAL_SPECIAL_TOKENS`` receive
    consecutive ids starting right after the SentencePiece vocabulary.
    """

    def __init__(self, model_path: str):
        # The SentencePiece model file has to exist before it can be loaded.
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # Ids come from the SentencePiece model; padding reuses the unk id.
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.unk_id()
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

        special_tokens = ADDITIONAL_SPECIAL_TOKENS
        first_special_id = self.n_words
        # token -> id and id -> token tables for the appended control tokens
        self.special_tokens = {
            token: first_special_id + offset
            for offset, token in enumerate(special_tokens)
        }
        self.index_special_tokens = {
            first_special_id + offset: token
            for offset, token in enumerate(special_tokens)
        }
        self.n_words = first_special_id + len(special_tokens)
        # Alternation of the escaped control tokens, for apply_chat_template
        self.role_special_token_expression = "|".join(map(re.escape, special_tokens))

    def tokenize(self, s: str, encode_special_tokens=False):
        """Split ``s`` into SentencePiece pieces.

        With ``encode_special_tokens`` set, control tokens found in ``s`` are
        kept as single pieces and only the text between them is passed to
        SentencePiece.
        """
        if not encode_special_tokens:
            return self.sp_model.EncodeAsPieces(s)

        pieces = []
        cursor = 0
        for found in re.finditer(self.role_special_token_expression, s):
            begin, end = found.span()
            if cursor < begin:
                pieces.extend(self.sp_model.EncodeAsPieces(s[cursor:begin]))
            pieces.append(s[begin:end])
            cursor = end
        if cursor < len(s):
            pieces.extend(self.sp_model.EncodeAsPieces(s[cursor:]))
        return pieces

    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
        """Encode ``s`` to ids, optionally wrapping it in the bos / eos ids."""
        assert type(s) is str
        ids = self.sp_model.encode(s)
        if bos:
            ids = [self.bos_id, *ids]
        if eos:
            ids = [*ids, self.eos_id]
        return ids

    def decode(self, t: List[int]) -> str:
        """Decode ids to text, emitting control tokens as their literal strings."""
        chunks = []
        pending = []  # ordinary ids waiting to be decoded together
        for token_id in t:
            if token_id not in self.index_special_tokens:
                pending.append(token_id)
                continue
            # A control token ends the current run of ordinary ids.
            if pending:
                chunks.append(self.sp_model.decode(pending))
                pending = []
            chunks.append(self.index_special_tokens[token_id])
        if pending:
            chunks.append(self.sp_model.decode(pending))
        return "".join(chunks)

    def decode_tokens(self, tokens: List[str]) -> str:
        """Join SentencePiece pieces back into text."""
        return self.sp_model.DecodePieces(tokens)

    def convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        try:
            return self.special_tokens[token]
        except KeyError:
            return self.sp_model.PieceToId(token)

    def convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.index_special_tokens:
            return self.index_special_tokens[index]
        # bos / eos / pad ids and out-of-range ids map to the empty string.
        reserved_ids = (self.eos_id, self.bos_id, self.pad_id)
        if index in reserved_ids or not (0 <= index < self.sp_model.vocab_size()):
            return ""
        return self.sp_model.IdToPiece(index)
|
114 |
+
|
115 |
+
|
116 |
+
class ChatGLMTokenizer(PreTrainedTokenizer):
    """Slow ChatGLM tokenizer built on top of ``SPTokenizer``.

    Besides ``input_ids`` and ``attention_mask`` it produces ``position_ids``,
    and only left padding is implemented (see ``_pad``).
    """

    vocab_files_names = {"vocab_file": "tokenizer.model"}
    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(
        self,
        vocab_file,
        padding_side="left",
        clean_up_tokenization_spaces=False,
        encode_special_tokens=False,
        **kwargs
    ):
        """
        Args:
            vocab_file: Path to the SentencePiece model file.
            padding_side: Forwarded to the base class; `_pad` only supports "left".
            clean_up_tokenization_spaces: Forwarded to the base class.
            encode_special_tokens: When true, control tokens written literally in
                the input text are kept as single tokens by `_tokenize`.
        """
        # These attributes are set before the base constructor runs; keep this
        # order (presumably the base class consults the vocabulary during init).
        self.name = "GLMTokenizer"
        self.vocab_file = vocab_file
        self.tokenizer = SPTokenizer(vocab_file)
        # "<unk>" and "<pad>" intentionally share the SentencePiece unk id.
        self.special_tokens = {
            "<bos>": self.tokenizer.bos_id,
            "<eos>": self.tokenizer.eos_id,
            "<unk>": self.tokenizer.pad_id,
            "<pad>": self.tokenizer.pad_id
        }
        self.encode_special_tokens = encode_special_tokens

        super().__init__(
            padding_side=padding_side,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs
        )

    def get_command(self, token):
        """Return the id of a command token.

        Looks first in this tokenizer's ``special_tokens`` table and then in the
        control tokens of the wrapped ``SPTokenizer``; an unknown token fails the
        assertion.
        """
        if token in self.special_tokens:
            return self.special_tokens[token]
        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
        return self.tokenizer.special_tokens[token]

    @property
    def unk_token(self) -> str:
        return self.tokenizer.sp_model.IdToPiece(self.get_command("<unk>"))

    @property
    def pad_token(self) -> str:
        return self.tokenizer.sp_model.IdToPiece(self.get_command("<pad>"))

    @property
    def eos_token(self) -> str:
        return self.tokenizer.sp_model.IdToPiece(self.get_command("<eos>"))

    @property
    def unk_token_id(self) -> int:
        return self.get_command("<unk>")

    @property
    def pad_token_id(self) -> int:
        return self.get_command("<pad>")

    @property
    def eos_token_id(self):
        return self.get_command("<eos>")

    # The setters below deliberately ignore the assigned value and only warn:
    # the unk / pad / eos tokens always come from the SentencePiece model.
    @unk_token.setter
    def unk_token(self, value):
        logger.warning("Setting unk_token is not supported, use the default one.")

    @pad_token.setter
    def pad_token(self, value):
        logger.warning("Setting pad_token is not supported, use the default one.")

    @eos_token.setter
    def eos_token(self, value):
        logger.warning("Setting eos_token is not supported, use the default one.")

    @property
    def vocab_size(self):
        """Number of ids, including the appended control tokens."""
        return self.tokenizer.n_words

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        """Tokenize ``text`` into pieces; extra keyword arguments are ignored."""
        return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.tokenizer.convert_token_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.tokenizer.convert_id_to_token(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join a list of pieces back into a single string."""
        return self.tokenizer.decode_tokens(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Copy the SentencePiece model file to ``save_directory``.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary. If it is not an
                existing directory it is used as the target file path itself.
            filename_prefix (`str`, *optional*):
                Accepted for interface compatibility; not used here.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, self.vocab_files_names["vocab_file"]
            )
        else:
            vocab_file = save_directory

        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def get_prefix_tokens(self):
        """Return the ids of the tokens in ``PREFIX_TOKENS``."""
        return list(map(self.get_command, PREFIX_TOKENS))

    def build_single_message(self, role, metadata, message):
        """Encode one chat message as: role token, ``metadata`` + newline, message."""
        assert role in ["system", "user", "assistant", "observation"], role
        role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n")
        message_tokens = self.tokenizer.encode(message)
        tokens = role_tokens + message_tokens
        return tokens

    def build_chat_input(self, query, history=None, role="user"):
        """Build model inputs for ``query`` preceded by the chat ``history``.

        Each history item is a dict with ``role`` and ``content`` and optional
        ``metadata``; a system item may carry ``tools``, which are appended to
        its content as indented JSON. The sequence ends with the assistant role
        token so that generation continues as the assistant.
        """
        if history is None:
            history = []
        input_ids = []
        for item in history:
            content = item["content"]
            if item["role"] == "system" and "tools" in item:
                content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
            input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content))
        input_ids.extend(self.build_single_message(role, "", query))
        input_ids.extend([self.get_command("<|assistant|>")])
        return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding the
        ChatGLM prefix tokens:

        - single sequence: `[gMASK] <!sop!> X`
        - pair of sequences: `[gMASK] <!sop!> A B <eos>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
        return token_ids_0

    def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            return_attention_mask: Optional[bool] = None,
            padding_side: Optional[str] = None,
    ) -> dict:
        """
        Left-pad a single encoded example, filling ``attention_mask`` and
        ``position_ids`` along the way.

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs for one example.
            max_length: target length when padding (ignored for
                `PaddingStrategy.LONGEST`, which uses the input's own length).
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST: pad to the length of this input
                - PaddingStrategy.MAX_LENGTH: pad to `max_length`
                - PaddingStrategy.DO_NOT_PAD: do not pad
            pad_to_multiple_of: (optional) if set, `max_length` is rounded up to a
                multiple of this value.
            return_attention_mask:
                Accepted for interface compatibility; an attention mask is always
                produced.
            padding_side:
                (optional) side override passed by newer `transformers` releases.
                It is placed last so existing positional callers keep working.
                When `None`, `self.padding_side` is used. Only "left" is
                supported.
        """
        # Newer transformers versions call _pad(..., padding_side=...); accepting
        # the keyword avoids a TypeError while keeping the left-only contract.
        effective_side = self.padding_side if padding_side is None else padding_side
        assert effective_side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        # Round max_length up to the next multiple of pad_to_multiple_of.
        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            # Padding positions get attention 0 and position id 0.
            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
            if "position_ids" in encoded_inputs:
                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input

        return encoded_inputs
|
350 |
+
|
351 |
+
|
352 |
+
class ChatGLMTokenizerFast(PreTrainedTokenizerFast):
    """Fast counterpart of ``ChatGLMTokenizer`` that builds chat prompts as text."""

    # multiple breaking changes, no backward-compatibility
    slow_tokenizer_class = ChatGLMTokenizer
    vocab_files_names = {
        **ChatGLMTokenizer.vocab_files_names,
        **PreTrainedTokenizerFast.vocab_files_names,
    }

    def __init__(self, **kwargs):
        # Fill in defaults only for options the caller did not supply.
        fallback_options = (
            ("clean_up_tokenization_spaces", False),
            ("bos_token", "<s>"),
            ("eos_token", "</s>"),
            ("unk_token", "<unk>"),
            ("pad_token", "<unk>"),
        )
        for option, value in fallback_options:
            kwargs.setdefault(option, value)
        super().__init__(**kwargs)

    @property
    def dummy_prefix_indicator(self):
        """Marker string inserted in front of each text segment of a prompt."""
        return DUMMY_PREFIX_INDICATOR_FOR_FAST

    @property
    def can_save_slow_tokenizer(self) -> bool:
        # multiple breaking changes
        return False

    def save_pretrained(self, *args, **kwargs):
        """Save the tokenizer, warning first when the slow tokenizer cannot be saved."""
        slow_saving_supported = self.can_save_slow_tokenizer
        if not slow_saving_supported:
            logger.warning(
                f"{type(self).__name__} does not support saving slow tokenizer. "
                "Saving it at the same directory may break the original tokenizer. "
                "Please keep a backup beforehand."
            )

        return super().save_pretrained(*args, **kwargs)

    def build_single_message_prompt(self, role, metadata, message):
        """Render one chat message: role token, marked metadata line, marked message."""
        assert role in ["system", "user", "assistant", "observation"], role
        marker = self.dummy_prefix_indicator
        segments = (
            f"<|{role}|>",
            f"{marker}{metadata}\n",
            f"{marker}{message}",
        )
        return "".join(segments)

    def build_chat_prompt(self, query, history=None, role="user", metadata=""):
        """Render ``history`` followed by ``query`` and a trailing assistant token."""

        def render(item):
            # System messages may carry tool definitions, appended as indented JSON.
            content = item["content"]
            if item["role"] == "system" and "tools" in item:
                content += "\n" + json.dumps(
                    item["tools"], indent=4, ensure_ascii=False
                )
            return self.build_single_message_prompt(
                item["role"], item.get("metadata", ""), content
            )

        parts = [render(item) for item in history or []]
        parts.append(self.build_single_message_prompt(role, metadata, query))
        parts.append("<|assistant|>")
        return "".join(parts)

    def build_chat_input(self, *args, **kwargs):
        """Build the chat prompt and encode it as a batch of one."""
        prompt = self.build_chat_prompt(*args, **kwargs)
        return self.batch_encode_plus([prompt], return_tensors="pt")
|
422 |
+
|
423 |
+
|
424 |
+
# Call register_for_auto_class() on both tokenizer classes, slow one first.
for _registered_tokenizer_cls in (ChatGLMTokenizer, ChatGLMTokenizerFast):
    _registered_tokenizer_cls.register_for_auto_class()
del _registered_tokenizer_cls
|
426 |
+
|
427 |
+
|
428 |
+
class ChatGLMTokenizerConverter(SpmConverter):
    """Slow-to-fast converter that assembles the `tokenizers` pipeline for ChatGLM."""

    handle_byte_fallback = True

    def normalizer(self, proto):
        # Both the dummy-prefix marker and plain spaces are rewritten to "▁".
        marker_to_meta = normalizers.Replace(
            pattern=DUMMY_PREFIX_INDICATOR_FOR_FAST, content="▁"
        )
        space_to_meta = normalizers.Replace(pattern=" ", content="▁")
        return normalizers.Sequence([marker_to_meta, space_to_meta])

    def pre_tokenizer(self, replacement, add_prefix_space):
        # NOTE: Metaspace is deliberately not used here: it would split a run of
        # spaces into separate "▁" tokens instead of keeping them merged in one.
        return pre_tokenizers.Split(DUMMY_PREFIX_INDICATOR_FOR_FAST, "merged_with_next")

    def decoder(self, replacement, add_prefix_space):
        stages = [
            decoders.ByteFallback(),
            decoders.Metaspace(replacement="▁", add_prefix_space=True),
        ]
        return decoders.Sequence(stages)

    def tokenizer(self, proto):
        """Build the base tokenizer and register every special token on it."""
        base = super().tokenizer(proto)

        base.model.byte_fallback = True

        # The first three ids are expected to be the SentencePiece control tokens.
        assert base.token_to_id("<unk>") == 0
        assert base.token_to_id("<s>") == 1
        assert base.token_to_id("</s>") == 2

        all_special = ["<unk>", "<s>", "</s>"]
        all_special.extend(ADDITIONAL_SPECIAL_TOKENS)
        base.add_special_tokens(
            [AddedToken(token, special=True) for token in all_special]
        )

        return base

    def converted(self):
        """Return the converted tokenizer with the prefix-token post-processor attached."""
        fast = super().converted()

        # Post processors
        prefix_ids = [fast.token_to_id(token) for token in PREFIX_TOKENS]
        assert all(i is not None for i in prefix_ids)
        prefix_template = " ".join(PREFIX_TOKENS)

        template_special_tokens = list(frozenset(zip(PREFIX_TOKENS, prefix_ids)))

        # The pair template references "</s>", so it has to be declared as well.
        if "</s>" not in PREFIX_TOKENS:
            eos_token_id = fast.token_to_id("</s>")
            assert eos_token_id is not None
            template_special_tokens.append(("</s>", eos_token_id))

        prefix_processor = processors.TemplateProcessing(
            single=f"{prefix_template} $A",
            pair=f"{prefix_template} $A $B:1 </s>:1",
            special_tokens=template_special_tokens,
        )
        # Chain after any post-processor the base converter already installed.
        existing = fast.post_processor
        if existing is None:
            fast.post_processor = prefix_processor
        else:
            fast.post_processor = processors.Sequence([existing, prefix_processor])

        return fast
|
504 |
+
|
505 |
+
|
506 |
+
# Register the converter under the slow tokenizer's class name.
SLOW_TO_FAST_CONVERTERS.update(
    {ChatGLMTokenizer.__name__: ChatGLMTokenizerConverter}
)
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
|
3 |
+
size 1018370
|
tokenizer_config.json
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "<unk>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "<s>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"2": {
|
20 |
+
"content": "</s>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"64789": {
|
28 |
+
"content": "[MASK]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"64790": {
|
36 |
+
"content": "[gMASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
},
|
43 |
+
"64791": {
|
44 |
+
"content": "[sMASK]",
|
45 |
+
"lstrip": false,
|
46 |
+
"normalized": false,
|
47 |
+
"rstrip": false,
|
48 |
+
"single_word": false,
|
49 |
+
"special": true
|
50 |
+
},
|
51 |
+
"64792": {
|
52 |
+
"content": "<!sop!>",
|
53 |
+
"lstrip": false,
|
54 |
+
"normalized": false,
|
55 |
+
"rstrip": false,
|
56 |
+
"single_word": false,
|
57 |
+
"special": true
|
58 |
+
},
|
59 |
+
"64793": {
|
60 |
+
"content": "<!eop!>",
|
61 |
+
"lstrip": false,
|
62 |
+
"normalized": false,
|
63 |
+
"rstrip": false,
|
64 |
+
"single_word": false,
|
65 |
+
"special": true
|
66 |
+
},
|
67 |
+
"64794": {
|
68 |
+
"content": "<|system|>",
|
69 |
+
"lstrip": false,
|
70 |
+
"normalized": false,
|
71 |
+
"rstrip": false,
|
72 |
+
"single_word": false,
|
73 |
+
"special": true
|
74 |
+
},
|
75 |
+
"64795": {
|
76 |
+
"content": "<|user|>",
|
77 |
+
"lstrip": false,
|
78 |
+
"normalized": false,
|
79 |
+
"rstrip": false,
|
80 |
+
"single_word": false,
|
81 |
+
"special": true
|
82 |
+
},
|
83 |
+
"64796": {
|
84 |
+
"content": "<|assistant|>",
|
85 |
+
"lstrip": false,
|
86 |
+
"normalized": false,
|
87 |
+
"rstrip": false,
|
88 |
+
"single_word": false,
|
89 |
+
"special": true
|
90 |
+
},
|
91 |
+
"64797": {
|
92 |
+
"content": "<|observation|>",
|
93 |
+
"lstrip": false,
|
94 |
+
"normalized": false,
|
95 |
+
"rstrip": false,
|
96 |
+
"single_word": false,
|
97 |
+
"special": true
|
98 |
+
}
|
99 |
+
},
|
100 |
+
"auto_map": {
|
101 |
+
"AutoTokenizer": [
|
102 |
+
"tokenization_chatglm.ChatGLMTokenizer",
|
103 |
+
"tokenization_chatglm.ChatGLMTokenizerFast"
|
104 |
+
]
|
105 |
+
},
|
106 |
+
"bos_token": "<s>",
|
107 |
+
"chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|> \n {{ message['content'] }}{% else %}<|{{ message['role'] }}|> \n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
|
108 |
+
"clean_up_tokenization_spaces": false,
|
109 |
+
"do_lower_case": false,
|
110 |
+
"eos_token": "</s>",
|
111 |
+
"model_max_length": 1000000000000000019884624838656,
|
112 |
+
"pad_token": "<unk>",
|
113 |
+
"padding_side": "left",
|
114 |
+
"remove_space": false,
|
115 |
+
"tokenizer_class": "ChatGLMTokenizer",
|
116 |
+
"unk_token": "<unk>"
|
117 |
+
}
|