"""Generate model answers for a Vietnamese truthful-QA spreadsheet and write
question/answer/gold records to a JSONL file."""

import sys

import jsonlines
import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

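# Assumed invocation (this guard is not in the original script): fail fast with
# a usage hint instead of an IndexError when the CLI arguments are missing.
if len(sys.argv) != 3:
    print(f"Usage: python {sys.argv[0]} <model_name_or_path> <output.jsonl>")
    sys.exit(1)
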
MODEL_NAME = sys.argv[1]        # Hugging Face model ID or local checkpoint path
INPUT_FILENAME = "./Vietnamese truthful QA results.xlsx"
OUTPUT_FILENAME = sys.argv[2]   # destination JSONL file
MAX_NEW_TOKENS = 512

writer = jsonlines.open(OUTPUT_FILENAME, "w")

try:
    df = pd.read_excel(INPUT_FILENAME)
except FileNotFoundError:
    print(f"Error: The file '{INPUT_FILENAME}' was not found.")
    print("Please make sure your XLSX file is in the same directory as the script.")
    sys.exit(1)
except Exception as e:
    print(f"An error occurred while reading the Excel file: {e}")
    sys.exit(1)

if "Question" not in df.columns or "Ground truth" not in df.columns: |
|
print("Error: Required columns 'Question' and/or 'Ground truth' not found.") |
|
print(f"Available columns are: {list(df.columns)}") |
|
exit() |
|
|
|
# Keep only the two columns the script needs.
df_processed = df[["Question", "Ground truth"]].copy()

print(f"Loading model '{MODEL_NAME}' and tokenizer...") |
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
print(f"Using device: {device}") |
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE: flash_attention_2 requires the flash-attn package and a supported GPU;
# drop the attn_implementation argument to fall back to the default attention.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model.to(device)

# Some checkpoints ship without a pad token; reuse EOS so generate() can pad.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

print("Model and tokenizer loaded successfully.")

answers = []
total_questions = len(df_processed)
print(f"Generating answers for {total_questions} questions...")

# total= gives tqdm a proper progress bar for the enumerate iterator.
for i, question in tqdm(enumerate(df_processed["Question"]), total=total_questions):
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": question},
    ]
    # Render the chat template to a prompt string, then tokenize it.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # Greedy decoding (do_sample=False) keeps the answers deterministic.
    output_sequences = model.generate(
        **model_inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Decode only the newly generated tokens (everything after the prompt).
    full_text = tokenizer.decode(
        output_sequences[0][model_inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )
    answer = full_text.strip()
    gold = df_processed["Ground truth"].iloc[i]
    answers.append(answer)
    print(f"Processed question {i + 1}/{total_questions}\nAnswer: {answer}\nGold: {gold}")
    writer.write({
        "question": question,
        "answer": answer,
        "gold": gold,
    })

writer.close()
print(f"Done. Wrote {total_questions} answers to '{OUTPUT_FILENAME}'.")
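
# Example invocation (the script filename and model ID below are placeholders,
# not from the original source; any chat model with a chat template should work):
#   python generate_answers.py Qwen/Qwen2.5-7B-Instruct qwen_answers.jsonl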