hirosan6595 committed
Commit 3f2b93b · verified · 1 Parent(s): 986b76e

Update README.md

Files changed (1):
  1. README.md +26 -281

README.md CHANGED
@@ -16,285 +16,43 @@ language:
  - **Developed by:** HiroSan6595
  - **License:** apache-2.0
  - **Finetuned from model :** llm-jp/llm-jp-3-13b
-
  This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
-
  [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)

- ```python
- !pip uninstall unsloth -y
- !pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
- !pip install --upgrade torch
- !pip install --upgrade xformers
- !pip install ipywidgets --upgrade
- import torch
- if torch.cuda.get_device_capability()[0] >= 8:
-     !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"
- HF_TOKEN = "My Token"
- from unsloth import FastLanguageModel
- import torch
- max_seq_length = 1024
- dtype = None
- load_in_4bit = True
-
- model_id = "llm-jp/llm-jp-3-13b"
- new_model_id = "llm-jp-3-13b-it-j"
- model, tokenizer = FastLanguageModel.from_pretrained(
-     model_name=model_id,
-     dtype=dtype,
-     load_in_4bit=load_in_4bit,
-     trust_remote_code=True,
- )
- model = FastLanguageModel.get_peft_model(
-     model,
-     r = 32,
-     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
-     lora_alpha = 32,
-     lora_dropout = 0.05,
-     bias = "none",
-     use_gradient_checkpointing = "unsloth",
-     random_state = 3407,
-     use_rslora = False,
-     loftq_config = None,
-     max_seq_length = max_seq_length,
- )
- from datasets import load_dataset, DatasetDict
- dataset2 = load_dataset("DeL-TaiseiOzaki/Tengentoppa-sft-v2.0")
- sampledds = dataset2["train"].shuffle(seed=60).select(range(3000))
- dataset2 = DatasetDict({
-     "train": sampledds
- })
- prompt = """### 指示
- {}
- ### 回答
- {}"""
-
- # formatting_prompts_func: build the training text from each instruction/output pair
- EOS_TOKEN = tokenizer.eos_token
- def formatting_prompts_func(examples):
-     input = examples["instruction"]
-     output = examples["output"]
-     text = prompt.format(input, output) + EOS_TOKEN
-     return { "formatted_text" : text }
-
- dataset = dataset2.map(
-     formatting_prompts_func,
-     num_proc = 4,
- )
- print(dataset["train"]["formatted_text"][2500])
- from trl import SFTTrainer
- from transformers import TrainingArguments
- from unsloth import is_bfloat16_supported
-
- trainer = SFTTrainer(
-     model = model,
-     tokenizer = tokenizer,
-     train_dataset = dataset["train"],
-     max_seq_length = max_seq_length,
-     dataset_text_field = "formatted_text",
-     packing = False,
-     args = TrainingArguments(
-         per_device_train_batch_size = 2,
-         gradient_accumulation_steps = 4,
-         num_train_epochs = 1,
-         logging_steps = 10,
-         warmup_steps = 10,
-         save_steps = 100,
-         save_total_limit = 2,
-         max_steps = -1,
-         learning_rate = 2e-4,
-         fp16 = not is_bfloat16_supported(),
-         bf16 = is_bfloat16_supported(),
-         group_by_length = True,
-         seed = 3407,
-         output_dir = "outputs",
-         report_to = "none",
-     ),
- )
-
- trainer_stats = trainer.train()
- import json
- from tqdm import tqdm
- FastLanguageModel.for_inference(model)
-
- # `datasets` is the list of elyza-tasks-100-TV_0.jsonl tasks, loaded as in the next block
- results = []
- for dt in tqdm(datasets):
-     input = dt["input"]

-     prompt = f"""### 指示\n{input}\n### 回答\n"""
-
-     inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
-
-     outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True, do_sample=False, repetition_penalty=1.2)
-     prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).split('\n### 回答')[-1]

-     results.append({"task_id": dt["task_id"], "input": input, "output": prediction})
- with open(f"{new_model_id}_output1210a.jsonl", 'w', encoding='utf-8') as f:
-     for result in results:
-         json.dump(result, f, ensure_ascii=False)
-         f.write('\n')
- ```
- """Python
138
- !pip install -U bitsandbytes
139
- !pip install -U transformers
140
- !pip install -U accelerate
141
- !pip install -U datasets
142
  !pip install -U peft
143
- !pip install ipywidgets --upgrade
144
- from transformers import (
145
- AutoModelForCausalLM,
146
- AutoTokenizer,
147
- BitsAndBytesConfig,
148
- )
149
  from peft import PeftModel
150
  import torch
151
- from tqdm import tqdm
152
  import json
153
- HF_TOKEN="My Token"
154
- bnb_config = BitsAndBytesConfig(
155
- load_in_4bit=True,
156
- bnb_4bit_quant_type="nf4",
157
- bnb_4bit_compute_dtype=torch.bfloat16,
158
- )
159
- model = AutoModelForCausalLM.from_pretrained(
160
- base_model_id,
161
- quantization_config=bnb_config,
162
- device_map="auto",
163
- token = HF_TOKEN
164
- )
165
- tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True, token = HF_TOKEN)
166
- model = PeftModel.from_pretrained(model, adapter_id, token = HF_TOKEN)
167
- datasets = []
168
- with open("/content/drive/MyDrive/Student_LLM/05FinalReport/elyza-tasks-100-TV_0.jsonl", "r") as f:
169
- item = ""
170
- for line in f:
171
- line = line.strip()
172
- item += line
173
- if item.endswith("}"):
174
- datasets.append(json.loads(item))
175
- item = ""
176
-
177
- results = []
178
- for data in tqdm(datasets):
179
-
180
- input = data["input"]
181
-
182
- prompt = f"""### 指示
183
- {input}
184
- ### 回答
185
- """
186
-
187
- tokenized_input = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
188
- attention_mask = torch.ones_like(tokenized_input)
189
- with torch.no_grad():
190
- outputs = model.generate(
191
- tokenized_input,
192
- attention_mask=attention_mask,
193
- max_new_tokens=100,
194
- do_sample=False,
195
- repetition_penalty=1.2,
196
- pad_token_id=tokenizer.eos_token_id
197
- )[0]
198
- output = tokenizer.decode(outputs[tokenized_input.size(1):], skip_special_tokens=True)
199
-
200
- results.append({"task_id": data["task_id"], "input": input, "output": output})
201
  import re
202
- jsonl_id = re.sub(".*/", "", adapter_id)
203
- with open(f"./{jsonl_id}-outputs1209n5.jsonl", 'w', encoding='utf-8') as f:
204
- for result in results:
205
- json.dump(result, f, ensure_ascii=False) # ensure_ascii=False for handling non-ASCII characters
206
- f.write('\n')
207
- """
208
- """Python
209
- !pip install unsloth
210
- # Also get the latest nightly Unsloth!
211
- !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
212
- from unsloth import PatchDPOTrainer
213
- PatchDPOTrainer()
214
- from unsloth import FastLanguageModel
215
- import torch
216
- max_seq_length = 2048 # C
217
- dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
218
- load_in_4bit = True
219
- HF_TOKEN = "MyToken"
220
- model, tokenizer = FastLanguageModel.from_pretrained(
221
- model_name = "HiroSan6595/llm-jp-3-13b-it-j_lora", # 自分がUnslothを使ってFTして、loraだけアップロードしているモデル
222
- max_seq_length = max_seq_length,
223
- dtype = dtype,
224
- load_in_4bit = load_in_4bit,
225
- token = HF_TOKEN
226
- )
227
- from huggingface_hub import notebook_login
228
- notebook_login()
229
- from datasets import load_dataset
230
-
231
- # データセットをロード
232
- ds = load_dataset("weblab-GENIAC/aya-ja-nemotron-dpo-masked")
233
-
234
- # フィルタリング関数を定義
235
- def filter_short_examples(example):
236
- return (
237
- len(example['prompt']) <= 4000 and
238
- len(example['chosen']) <= 4000 and
239
- len(example['rejected']) <= 4000
240
- )
241
 
242
- # トレーニングデータをフィルタリング
243
- filtered_train = ds['train'].filter(filter_short_examples)
244
-
245
- # データセットをトレーニング用と評価用に分割 (80%をトレーニング用、20%を評価用)
246
- train_size = int(0.8 * len(filtered_train)) # トレーニングデータのサイズ
247
- eval_size = len(filtered_train) - train_size # 評価データのサイズ
248
-
249
- # インデックスを順序通りに生成 (ランダム性なし)
250
- train_indices = list(range(train_size)) # トレーニング用インデックス
251
- eval_indices = list(range(train_size, len(filtered_train))) # 評価用インデックス
252
 
253
- # トレーニングデータと評価データを選択
254
- train_dataset = filtered_train.select(train_indices)
255
- eval_dataset = filtered_train.select(eval_indices)
256
 
257
- # データセットのサイズを出力
258
- print(f"トレーニングデータセットのサイズ: {len(train_dataset)}")
259
- print(f"評価デ
260
- from unsloth import PatchDPOTrainer = train_dataset.select(range(1000))
261
- use_dataset
262
- PatchDPOTrainer()
263
- from transformers import TrainingArguments
264
- from trl import DPOTrainer, DPOConfig
265
- from unsloth import is_bfloat16_supported
266
 
267
- dpo_trainer = DPOTrainer(
268
- model = model,
269
- ref_model = None,
270
- args = DPOConfig(
271
- per_device_train_batch_size = 2,
272
- gradient_accumulation_steps = 4,
273
- warmup_ratio = 0.1,
274
- num_train_epochs = 2, #1->2
275
- learning_rate = 2e-5, #5e-6 -> 2e-5
276
- fp16 = not is_bfloat16_supported(),
277
- bf16 = is_bfloat16_supported(),
278
- logging_steps = 1,
279
- optim = "adamw_8bit",
280
- weight_decay = 0.0,
281
- lr_scheduler_type = "linear",
282
- seed = 24, #42-> 24
283
- output_dir = "outputs",
284
- report_to = "none", # Use this for WandB etc
285
- ),
286
- beta = 0.1,
287
- train_dataset = use_dataset, #raw_datasets["train"],
288
- # eval_dataset = raw_datasets["test"],
289
- tokenizer = tokenizer,
290
- max_length = 2048,
291
- max_prompt_length = 1024,
292
  )
293
- dpo_trainer.train()
294
- import json
295
- new_model_id = "llm-jp-3-13b-it-j_dpo2"
296
  datasets = []
297
- with open("/content/drive/MyDrive/Student_LLM/05FinalReport/elyza-tasks-100-TV_0.jsonl", "r") as f:
298
  item = ""
299
  for line in f:
300
  line = line.strip()
@@ -303,10 +61,6 @@ with open("/content/drive/MyDrive/Student_LLM/05FinalReport/elyza-tasks-100-TV_0
303
  datasets.append(json.loads(item))
304
  item = ""
305
 
306
- # 学習したモデルを用いてタスクを実行
307
- from tqdm import tqdm
308
-
309
- # 推論するためにモデルのモードを変更
310
  FastLanguageModel.for_inference(model)
311
 
312
  results = []
@@ -317,24 +71,15 @@ for dt in tqdm(datasets):
317
 
318
  inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
319
 
320
- outputs = model.generate(**inputs, max_new_tokens = 2048, use_cache = True, do_sample=False, repetition_penalty=1.2)
321
  prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).split('\n### 回答')[-1]
322
 
323
  results.append({"task_id": dt["task_id"], "input": input, "output": prediction})
324
 
325
- # jsonlで保存
326
- with open(f"{new_model_id}_output1211dpo2.jsonl", 'w', encoding='utf-8') as f:
 
327
  for result in results:
328
  json.dump(result, f, ensure_ascii=False)
329
  f.write('\n')
330
- model.push_to_hub_merged(
331
- "llm-jp-3-13b-it-j_dpo2",#保存するモデルの名前
332
- tokenizer=tokenizer,
333
- save_method="lora",#loraだけ保存
334
- token=HF_TOKEN,
335
- private=True
336
- )
337
- """
338
-
339
-
340
-
 
  - **Developed by:** HiroSan6595
  - **License:** apache-2.0
  - **Finetuned from model :** llm-jp/llm-jp-3-13b

  This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.

  [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)

+ LLM-JP-3-13B Fine-tuned Model
+ Usage
+ The following is a basic usage example: load the base model with the LoRA adapter in 4-bit, run the elyza-tasks-100-TV tasks, and save the outputs as JSONL.

+ """python
28
+ !pip install unsloth
29
+ !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
30
+ !pip install -U torch
 
 
 
 
 
 
 
31
  !pip install -U peft
32
+
33
+ from unsloth import FastLanguageModel
 
 
 
 
34
  from peft import PeftModel
35
  import torch
 
36
  import json
37
+ from tqdm import tqdm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
+ model_id = "llm-jp/llm-jp-3-13b"
41
+ adapter_id = "HiroSan6595/llm-jp-3-13b-it-j_dpo2"
 
 
 
 
 
 
 
 
42
 
43
+ HF_TOKEN = "有効なHuggingFaceトークン"
 
 
44
 
45
+ dtype = None
46
+ load_in_4bit = True
 
 
 
 
 
 
 
47
 
48
+ model, tokenizer = FastLanguageModel.from_pretrained(
49
+ model_name=model_id,
50
+ dtype=dtype,
51
+ load_in_4bit=load_in_4bit,
52
+ trust_remote_code=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  )
 
 
 
54
  datasets = []
+ with open("path to elyza-tasks-100-TV_0.jsonl", "r") as f:
      item = ""
      for line in f:
          line = line.strip()
              datasets.append(json.loads(item))
              item = ""

  FastLanguageModel.for_inference(model)

  results = []

      inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)

+     outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True, do_sample=False, repetition_penalty=1.2)
      prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).split('\n### 回答')[-1]

      results.append({"task_id": dt["task_id"], "input": input, "output": prediction})

+ json_file_id = re.sub(".*/", "", adapter_id)  # use the adapter name in the output file name
+ with open(f"path to {json_file_id}_output.jsonl", 'w', encoding='utf-8') as f:
      for result in results:
          json.dump(result, f, ensure_ascii=False)
          f.write('\n')
+ ```