Upload folder using huggingface_hub
- .gitattributes +1 -0
- CPIS 498 - Final Presentation Template .pptx +0 -0
- CPIS 498 - Poster Template.pptx +0 -0
- README.md +2 -8
- Senior Project - Final Report Example.pdf +3 -0
- app.py +6 -0
- chapter1_2.py +201 -0
- chapter3.py +247 -0
- chapter4.py +4 -0
- chat.py +66 -0
- get-pip.py +0 -0
- gradio_cached_examples/16/log.csv +4 -0
- test-trainer/checkpoint-500/config.json +27 -0
- test-trainer/checkpoint-500/model.safetensors +3 -0
- test-trainer/checkpoint-500/optimizer.pt +3 -0
- test-trainer/checkpoint-500/rng_state.pth +3 -0
- test-trainer/checkpoint-500/scheduler.pt +3 -0
- test-trainer/checkpoint-500/special_tokens_map.json +7 -0
- test-trainer/checkpoint-500/tokenizer.json +0 -0
- test-trainer/checkpoint-500/tokenizer_config.json +55 -0
- test-trainer/checkpoint-500/trainer_state.json +27 -0
- test-trainer/checkpoint-500/training_args.bin +3 -0
- test-trainer/checkpoint-500/vocab.txt +0 -0
- test-trainer/runs/Jan27_22-08-26_DESKTOP-KTM59NT/events.out.tfevents.1706382507.DESKTOP-KTM59NT.3612.0 +3 -0
- test-trainer/runs/Jan28_06-39-34_DESKTOP-KTM59NT/events.out.tfevents.1706413175.DESKTOP-KTM59NT.17992.0 +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Senior[[:space:]]Project[[:space:]]-[[:space:]]Final[[:space:]]Report[[:space:]]Example.pdf filter=lfs diff=lfs merge=lfs -text
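Note: the [[:space:]] sequences in the added line are not corruption; git lfs track escapes literal spaces in tracked paths this way, so the pattern matches "Senior Project - Final Report Example.pdf" and routes that PDF through Git LFS.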
CPIS 498 - Final Presentation Template .pptx
ADDED
Binary file (185 kB).
CPIS 498 - Poster Template.pptx
ADDED
Binary file (141 kB).
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: gray
-colorTo: blue
+title: echo-chatbot
+app_file: app.py
 sdk: gradio
 sdk_version: 4.16.0
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Senior Project - Final Report Example.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:491805426b30d23083763aaaf20194ae41426d6d9bdd1f17c9f9d3f5781c7370
+size 1393475
app.py
ADDED
@@ -0,0 +1,6 @@
+import gradio as gr
+
+def slow_echo(message, history):
+    return message
+
+demo = gr.ChatInterface(slow_echo).queue().launch()
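Note on app.py: despite its name, slow_echo returns the full reply in one step, so nothing streams in the UI. gr.ChatInterface also accepts generator functions; a minimal streaming sketch (an illustration, not part of this commit):

    import time
    import gradio as gr

    def slow_echo(message, history):
        # Yield progressively longer prefixes so the chat UI shows a typing effect.
        for i in range(len(message)):
            time.sleep(0.05)
            yield "You typed: " + message[: i + 1]

    gr.ChatInterface(slow_echo).queue().launch()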
chapter1_2.py
ADDED
@@ -0,0 +1,201 @@
+# https://huggingface.co/MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli
+# from transformers import pipeline
+# classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
+# sequence_to_classify = "Angela Merkel is a politician in Germany and leader of the CDU"
+# candidate_labels = ["politics", "economy", "entertainment", "environment"]
+# output = classifier(sequence_to_classify, candidate_labels, multi_label=False)
+# print(output)
+
+# from transformers import pipeline
+
+# generator = pipeline("text-generation", model="distilgpt2")
+# output = generator("In this course, we will teach you how to")
+# print(output)
+
+# https://huggingface.co/bigscience/bloom-560m
+# from transformers import AutoTokenizer, AutoModelForCausalLM
+# import transformers
+# import torch
+
+# model = "bigscience/bloom-560m"
+
+# tokenizer = AutoTokenizer.from_pretrained(model)
+# pipeline = transformers.pipeline(
+#     "text-generation",
+#     model=model,
+#     tokenizer=tokenizer,
+#     torch_dtype=torch.bfloat16,
+#     trust_remote_code=True,
+#     device_map="auto",
+# )
+# sequences = pipeline(
+#     "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
+#     max_length=200,
+#     do_sample=True,
+#     top_k=10,
+#     num_return_sequences=1,
+#     eos_token_id=tokenizer.eos_token_id,
+# )
+# for seq in sequences:
+#     print(f"Result: {seq['generated_text']}")
+
+# https://huggingface.co/bert-base-uncased
+# from transformers import pipeline
+# unmasker = pipeline('fill-mask', model='bert-base-multilingual-cased')
+# output = unmasker("tu es [MASK] homme?")
+
+
+# named entity recognition
+# from transformers import pipeline
+
+# ner = pipeline("ner", grouped_entities=True)
+# output = ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")
+
+# https://huggingface.co/facebook/bart-large-cnn
+from transformers import pipeline
+
+# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+# output = summarizer(
+#     """
+#     America has changed dramatically during recent years. Not only has the number of
+#     graduates in traditional engineering disciplines such as mechanical, civil,
+#     electrical, chemical, and aeronautical engineering declined, but in most of
+#     the premier American universities engineering curricula now concentrate on
+#     and encourage largely the study of engineering science. As a result, there
+#     are declining offerings in engineering subjects dealing with infrastructure,
+#     the environment, and related issues, and greater concentration on high
+#     technology subjects, largely supporting increasingly complex scientific
+#     developments. While the latter is important, it should not be at the expense
+#     of more traditional engineering.
+
+#     Rapidly developing economies such as China and India, as well as other
+#     industrial countries in Europe and Asia, continue to encourage and advance
+#     the teaching of engineering. Both China and India, respectively, graduate
+#     six and eight times as many traditional engineers as does the United States.
+#     Other industrial countries at minimum maintain their output, while America
+#     suffers an increasingly serious decline in the number of engineering graduates
+#     and a lack of well-educated engineers.
+#     """
+# )
+
+# from transformers import pipeline
+
+# translator = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
+# output = translator("屌")
+
+# print(output)
+
+# from transformers import AutoTokenizer
+
+# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+# sequence = "Using a Transformer network is simple"
+# tokens = tokenizer.tokenize(sequence)
+# print(tokens)
+# ids = tokenizer.convert_tokens_to_ids(tokens)
+# print(ids)
+# decoded_string = tokenizer.decode(ids)
+# print(decoded_string)
+# print("----------------------")
+
+# sequence = "Using a Transform network are simple"
+# tokens = tokenizer.tokenize(sequence)
+# print(tokens)
+# ids = tokenizer.convert_tokens_to_ids(tokens)
+# print(ids)
+# decoded_string = tokenizer.decode(ids)
+# print(decoded_string)
+
+# import torch
+# from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
+
+# sequence = "I’ve been waiting for a HuggingFace course my whole life."
+# tokens = tokenizer.tokenize(sequence)
+# print(tokens)
+# sequence1_ids = tokenizer.convert_tokens_to_ids(tokens)
+# print(sequence1_ids)
+
+# sequence = "I hate this so much!"
+# tokens = tokenizer.tokenize(sequence)
+# print(tokens)
+# sequence2_ids = tokenizer.convert_tokens_to_ids(tokens)
+# print(sequence2_ids)
+
+# sequence1_ids = [[200, 200, 200]]
+# sequence2_ids = [[200, 200]]
+# batched_ids = [
+#     [1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
+#     [1045, 5223, 2023, 2061, 2172, 999, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id],
+# ]
+
+# attention_mask = [
+#     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+#     [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+# ]
+
+# outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
+# print(outputs.logits)
+
+# from transformers import AutoTokenizer
+
+# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+# sequence = "I've been waiting for a HuggingFace course my whole life."
+
+# model_inputs = tokenizer(sequence)
+
+# print(model_inputs)
+
+# sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
+# Will pad the sequences up to the maximum sequence length
+# model_inputs = tokenizer(sequences, padding="longest")
+# print(model_inputs)
+# print("-------------------------")
+
+# Will pad the sequences up to the specified max length
+# model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
+# print(model_inputs)
+
+# from transformers import AutoTokenizer
+# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+# sequence = "I've been waiting for a HuggingFace course my whole life."
+
+# model_inputs = tokenizer(sequence)
+# print("model_inputs = tokenizer(sequence)")
+# print(model_inputs)
+# print(model_inputs["input_ids"])
+
+# tokens = tokenizer.tokenize(sequence)
+# print("tokens = tokenizer.tokenize(sequence)")
+# print(tokens)
+# ids = tokenizer.convert_tokens_to_ids(tokens)
+# print(sequence)
+# print(ids)
+
+# import torch
+# from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
+# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
+# sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
+
+# tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
+# output = model(**tokens)
+# print(output)
+
+from transformers import AutoTokenizer, AutoModel
+
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+model = AutoModel.from_pretrained("gpt2")
+
+encoded = tokenizer("Hey!", return_tensors="pt")
+result = model(**encoded)
+print(result)
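Note on chapter1_2.py: the only active code loads GPT-2 through AutoModel, which has no language-modeling head, so print(result) dumps raw last-layer hidden states rather than text. Generating a continuation would use the causal-LM head instead; a minimal sketch (an illustration, not part of this commit):

    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    encoded = tokenizer("Hey!", return_tensors="pt")
    # generate() decodes new tokens; max_new_tokens bounds the continuation length
    generated = model.generate(**encoded, max_new_tokens=20)
    print(tokenizer.decode(generated[0], skip_special_tokens=True))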
chapter3.py
ADDED
@@ -0,0 +1,247 @@
+# import torch
+# from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
+
+# # Same as before
+# checkpoint = "bert-base-uncased"
+# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
+# sequences = [
+#     "I've been waiting for a HuggingFace course my whole life.",
+#     "This course is amazing!",
+# ]
+# batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
+
+# # This is new
+# batch["labels"] = torch.tensor([1, 1])
+
+# optimizer = AdamW(model.parameters())
+# loss = model(**batch).loss
+# loss.backward()
+# optimizer.step()
+
+from datasets import load_dataset
+
+# raw_datasets = load_dataset("glue", "sst2")
+# raw_datasets
+# raw_train_dataset = raw_datasets["train"]
+# output = raw_train_dataset[0]['sentence']
+# print(output)
+
+# raw_train_dataset = raw_datasets["validation"]
+# output = raw_train_dataset[87]
+
+# print(raw_train_dataset.features)
+
+# from transformers import AutoTokenizer
+
+# checkpoint = "bert-base-uncased"
+# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+# print(tokenizer(output))
+# inputs = tokenizer(output)
+# print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
+
+# inputs = tokenizer("This is the first sentence.")
+# print(inputs)
+# print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
+# # tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
+# # tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+
+# # inputs = tokenizer("This is the first sentence.", "This is the second one.")
+# # inputs = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
+# # print(inputs)
+
+# def tokenize_function(example):
+#     return tokenizer(example["sentence"], truncation=True)
+
+# tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
+# print(tokenized_datasets)
+
+# from transformers import DataCollatorWithPadding
+
+# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+# samples = tokenized_datasets["train"][:8]
+# samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
+
+# print([len(x) for x in samples["input_ids"]])
+
+# batch = data_collator(samples)
+# print(batch)
+# print({k: v.shape for k, v in batch.items()})
+
+# # Try it yourself
+from datasets import load_dataset
+
+raw_datasets = load_dataset("glue", "sst2")
+raw_train_dataset = raw_datasets["train"]
+output = raw_train_dataset[0]['sentence']
+# print(output)
+
+from transformers import AutoTokenizer
+
+checkpoint = "bert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+# print(tokenizer(output))
+inputs = tokenizer(output)
+# print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
+
+tokenized_dataset = tokenizer(
+    output,
+    padding=True,
+    truncation=True,
+)
+
+def tokenize_function(example):
+    return tokenizer(example["sentence"], truncation=True)
+
+tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
+# print(tokenized_datasets)
+
+
+# from datasets import load_dataset
+# from transformers import AutoTokenizer, DataCollatorWithPadding
+
+# raw_datasets = load_dataset("glue", "mrpc")
+# checkpoint = "bert-base-uncased"
+# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+
+# def tokenize_function(example):
+#     return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
+
+
+# tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
+# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+# from transformers import TrainingArguments
+
+# training_args = TrainingArguments("test-trainer")
+
+# from transformers import AutoModelForSequenceClassification
+
+# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+
+# from transformers import Trainer
+
+# trainer = Trainer(
+#     model,
+#     training_args,
+#     train_dataset=tokenized_datasets["train"],
+#     eval_dataset=tokenized_datasets["validation"],
+#     data_collator=data_collator,
+#     tokenizer=tokenizer,
+# )
+# predictions = trainer.predict(tokenized_datasets["validation"])
+# print(predictions.predictions.shape, predictions.label_ids.shape)
+
+# import numpy as np
+
+# preds = np.argmax(predictions.predictions, axis=-1)
+
+# import evaluate
+
+# metric = evaluate.load("glue", "mrpc")
+# metric.compute(predictions=preds, references=predictions.label_ids)
+
+# def compute_metrics(eval_preds):
+#     metric = evaluate.load("glue", "mrpc")
+#     logits, labels = eval_preds
+#     predictions = np.argmax(logits, axis=-1)
+#     return metric.compute(predictions=predictions, references=labels)
+
+# training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
+# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+
+# trainer = Trainer(
+#     model,
+#     training_args,
+#     train_dataset=tokenized_datasets["train"],
+#     eval_dataset=tokenized_datasets["validation"],
+#     data_collator=data_collator,
+#     tokenizer=tokenizer,
+#     compute_metrics=compute_metrics,
+# )
+# trainer.train()
+from transformers import AutoTokenizer, DataCollatorWithPadding
+
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
+tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
+tokenized_datasets.set_format("torch")
+tokenized_datasets["train"].column_names
+
+from torch.utils.data import DataLoader
+
+train_dataloader = DataLoader(
+    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
+)
+eval_dataloader = DataLoader(
+    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
+)
+for batch in train_dataloader:
+    break
+output = {k: v.shape for k, v in batch.items()}
+# print(output)
+
+from transformers import AutoModelForSequenceClassification
+
+model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+
+outputs = model(**batch)
+# print(outputs.loss, outputs.logits.shape)
+
+from transformers import AdamW
+
+optimizer = AdamW(model.parameters(), lr=5e-5)
+
+from transformers import get_scheduler
+
+num_epochs = 3
+num_training_steps = num_epochs * len(train_dataloader)
+lr_scheduler = get_scheduler(
+    "linear",
+    optimizer=optimizer,
+    num_warmup_steps=0,
+    num_training_steps=num_training_steps,
+)
+print(num_training_steps)
+
+# The training loop
+import torch
+
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+model.to(device)
+# print(device)
+
+
+from tqdm.auto import tqdm
+
+progress_bar = tqdm(range(num_training_steps))
+
+model.train()
+for epoch in range(num_epochs):
+    for batch in train_dataloader:
+        batch = {k: v.to(device) for k, v in batch.items()}
+        outputs = model(**batch)
+        loss = outputs.loss
+        loss.backward()
+
+        optimizer.step()
+        lr_scheduler.step()
+        optimizer.zero_grad()
+        progress_bar.update(1)
+
+# The evaluation loop
+import evaluate
+
+metric = evaluate.load("glue", "mrpc")
+model.eval()
+for batch in eval_dataloader:
+    batch = {k: v.to(device) for k, v in batch.items()}
+    with torch.no_grad():
+        outputs = model(**batch)
+
+    logits = outputs.logits
+    predictions = torch.argmax(logits, dim=-1)
+    metric.add_batch(predictions=predictions, references=batch["labels"])
+
+metric.compute()
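Note on chapter3.py: the active code trains on GLUE SST-2 (single sentences, which is why only the "sentence" column is removed), but the evaluation loop loads the MRPC metric and discards the value returned by metric.compute(). A matched metric would use the sst2 config; a minimal, self-contained sketch of the evaluate API (an illustration, not part of this commit):

    import evaluate

    # Load the metric that matches the dataset actually used above (SST-2 reports accuracy).
    metric = evaluate.load("glue", "sst2")
    metric.add_batch(predictions=[1, 0, 1], references=[1, 0, 0])
    print(metric.compute())  # roughly {'accuracy': 0.667} for this toy batch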
chapter4.py
ADDED
@@ -0,0 +1,4 @@
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+tokenizer = AutoTokenizer.from_pretrained("camembert-base", force_download=True, resume_download=False)
+model = AutoModelForMaskedLM.from_pretrained("camembert-base")
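Note on chapter4.py: force_download=True makes every run re-fetch the tokenizer files, which is mainly useful for repairing a corrupted cache. Once downloaded, the checkpoint can be exercised with the fill-mask task it was trained for; a minimal sketch (an illustration, not part of this commit):

    from transformers import pipeline

    # camembert-base is a French masked LM; <mask> is its mask token
    camembert_fill_mask = pipeline("fill-mask", model="camembert-base")
    results = camembert_fill_mask("Le camembert est <mask> :)")
    print(results[0]["token_str"], results[0]["score"])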
chat.py
ADDED
@@ -0,0 +1,66 @@
+import random
+
+def random_response(message, history):
+    return random.choice(["Yes", "No"])
+
+import time
+import gradio as gr
+
+def yes_man(message, history):
+    if message.endswith("?"):
+        return "Yes"
+    else:
+        return "Ask me anything!"
+
+def echo(message, history, system_prompt, tokens):
+    response = f"System prompt: {system_prompt}\n Message: {message}."
+    for i in range(min(len(response), int(tokens))):
+        time.sleep(0.05)
+        yield response[: i+1]
+
+# from langchain.chat_models import ChatOpenAI
+# from langchain.schema import AIMessage, HumanMessage
+# import openai
+# import gradio as gr
+# import os
+
+# os.environ["OPENAI_API_KEY"] = "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"  # Replace with your key
+
+# llm = ChatOpenAI(temperature=1.0, model='gpt-3.5-turbo-0613')
+
+# def predict(message, history):
+#     history_langchain_format = []
+#     for human, ai in history:
+#         history_langchain_format.append(HumanMessage(content=human))
+#         history_langchain_format.append(AIMessage(content=ai))
+#     history_langchain_format.append(HumanMessage(content=message))
+#     gpt_response = llm(history_langchain_format)
+#     return gpt_response.content
+
+# gr.ChatInterface(predict).launch()
+
+import openai
+import gradio as gr
+
+openai.api_key = "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"  # Replace with your key
+
+from langchain.chat_models import ChatOpenAI
+from langchain.schema import AIMessage, HumanMessage
+import openai
+import gradio as gr
+import os
+
+os.environ["OPENAI_API_KEY"] = "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"
+
+llm = ChatOpenAI(temperature=1.0, model='gpt-3.5-turbo-0613')
+
+def predict(message, history):
+    history_langchain_format = []
+    for human, ai in history:
+        history_langchain_format.append(HumanMessage(content=human))
+        history_langchain_format.append(AIMessage(content=ai))
+    history_langchain_format.append(HumanMessage(content=message))
+    gpt_response = llm(history_langchain_format)
+    return gpt_response.content
+
+gr.ChatInterface(predict).launch()
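Note on chat.py: the OpenAI key is hardcoded three times (and openai/gradio are imported twice); a key committed like this must be treated as leaked and revoked. A safer pattern reads it from the environment; a minimal sketch (an illustration, not part of this commit):

    import os

    # Fail fast if the key is absent instead of shipping it in source control.
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key is None:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable")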
get-pip.py
ADDED
The diff for this file is too large to render.
gradio_cached_examples/16/log.csv
ADDED
@@ -0,0 +1,4 @@
+component 0,flag,username,timestamp
+"[[""Hello"",""Ask me anything!""]]",,,2024-01-28 21:12:34.005450
+"[[""Want a fuck?"",""Yes""]]",,,2024-01-28 21:12:34.005450
+"[[""Why so fucking sexy?"",""Yes""]]",,,2024-01-28 21:12:34.021247
test-trainer/checkpoint-500/config.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "_name_or_path": "bert-base-uncased",
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.37.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}
test-trainer/checkpoint-500/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26d438974d3ca04d8c051e587f62520fc01652131e9b1e0ba7811f4cbbc47510
+size 437958648
test-trainer/checkpoint-500/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3a88fbdf3bc025085a10bc736f4c05b4cd1e0fafdf303bccf70e2272e5de73e
+size 876032762
test-trainer/checkpoint-500/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8ce914157a8fa18ed7f5bc895c6169e23a29396d283307ed2eadabfbf64eece
+size 13990
test-trainer/checkpoint-500/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdb639f9f8f7e062a303535330b8bbcc38edb9ca16539b24483e41c8655ec97d
+size 1064
test-trainer/checkpoint-500/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
test-trainer/checkpoint-500/tokenizer.json
ADDED
The diff for this file is too large to render.
test-trainer/checkpoint-500/tokenizer_config.json
ADDED
@@ -0,0 +1,55 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
test-trainer/checkpoint-500/trainer_state.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0893246187363834,
+  "eval_steps": 500,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.09,
+      "learning_rate": 3.184458968772695e-05,
+      "loss": 0.4912,
+      "step": 500
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 1377,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "total_flos": 147381246548880.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
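Note on trainer_state.json: the logged values check out against the default Trainer settings. With max_steps = 1377 over 3 epochs (459 steps per epoch at batch size 8, about 3668 examples, the MRPC train split) and a linear decay from the default learning rate of 5e-5, the rate at step 500 is 5e-5 * (1377 - 500) / 1377, roughly 3.1845e-05, matching learning_rate; likewise 500 / 459 is roughly 1.089, matching the logged epoch.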
test-trainer/checkpoint-500/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5817e64c3b9b34fc6bf81ac86329df992d56705557d2276ff5fae264646f7b08
+size 4728
test-trainer/checkpoint-500/vocab.txt
ADDED
The diff for this file is too large to render.
test-trainer/runs/Jan27_22-08-26_DESKTOP-KTM59NT/events.out.tfevents.1706382507.DESKTOP-KTM59NT.3612.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f1636178a7430729e8acf5a52f10f6c9661d1c0295049a135bfb233d27c43c7
+size 4492
test-trainer/runs/Jan28_06-39-34_DESKTOP-KTM59NT/events.out.tfevents.1706413175.DESKTOP-KTM59NT.17992.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dd42909f5f35f4ff8a387e96c95ae5809f9d8e3ba2923a207e71e71dc70407d
+size 4335