{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 4 documents.\n"
]
}
],
"source": [
"from llama_index.core import SimpleDirectoryReader\n",
"# Read the chunk folder into llama_index documents.\n",
"# filename_as_id=True presumably keys each document by its file name -- TODO confirm against the llama_index docs.\n",
"chunk_reader = SimpleDirectoryReader(\n",
"    r\"C:\\Users\\agshi\\Downloads\\output_chunks\",\n",
"    filename_as_id=True,\n",
")\n",
"documents = chunk_reader.load_data()\n",
"print(f\"Loaded {len(documents)} documents.\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.llms.ollama import Ollama\n",
"\n",
"# Ollama-backed LLM client.\n",
"# NOTE(review): `llm` is rebound to a Gemini client in a later cell, so this\n",
"# instance is only used if that cell is skipped.\n",
"llm = Ollama(\n",
"    model=\"llama3:8b\",\n",
"    request_timeout=120.0,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import SimpleDirectoryReader\n",
"from llama_index.core.llama_dataset.generator import RagDatasetGenerator\n",
"from llama_index.llms.gemini import Gemini\n",
"import os\n",
"from llama_index.core import Settings\n",
"from llama_index.llms.groq import Groq\n",
"import nest_asyncio\n",
"# nest_asyncio patches asyncio so event loops can be nested; presumably needed\n",
"# because the notebook kernel already runs its own loop -- TODO confirm.\n",
"nest_asyncio.apply()\n",
"\n",
"# LLM that question_dataset_generator below picks up as a global.\n",
"llm = Gemini(\n",
"    model=\"models/gemini-pro\",\n",
"    temperature=0,\n",
")\n",
"\n",
"def _is_label_line(text):\n",
"    \"\"\"Return True for bold markdown label lines such as '**Question 1:**'.\n",
"\n",
"    The outputs saved in this notebook show the generator returning such\n",
"    labels as separate entries next to the real question text.\n",
"    \"\"\"\n",
"    stripped = (text or \"\").strip()\n",
"    return stripped.startswith(\"**\") and stripped.endswith(\":**\")\n",
"\n",
"\n",
"def question_dataset_generator(document, num_questions_per_chunk=2):\n",
"    \"\"\"Generate evaluation questions for a batch of documents.\n",
"\n",
"    Args:\n",
"        document: Sequence of llama_index documents (despite the singular\n",
"            name); passed to RagDatasetGenerator.from_documents.\n",
"        num_questions_per_chunk: Number of questions requested per node.\n",
"\n",
"    Returns:\n",
"        list[str]: The generated queries with bare label lines\n",
"        (e.g. '**Question 1:**') removed.\n",
"    \"\"\"\n",
"    dataset_generator = RagDatasetGenerator.from_documents(\n",
"        documents=document,\n",
"        llm=llm,  # module-level LLM configured earlier in this cell\n",
"        num_questions_per_chunk=num_questions_per_chunk,\n",
"        show_progress=True,\n",
"    )\n",
"\n",
"    rag_dataset = dataset_generator.generate_questions_from_nodes()\n",
"    # Drop heading-only entries so every returned item is an actual question.\n",
"    return [\n",
"        example.query\n",
"        for example in rag_dataset.examples\n",
"        if not _is_label_line(example.query)\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "122068b898824caabf36017c8ef57733",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Parsing nodes: 0%| | 0/2 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 4/4 [00:10<00:00, 2.70s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"['**Question 1:**', 'What are the key provisions of Part 1 of the Act, and how do they extend support in response to the COVID-19 pandemic?', '**Question 1:**', 'What are the two main types of leave of absence provided for under Part 4 of the Canada Labour Code amendments?', '**Question 1:**', 'What is the definition of \"base percentage\" in the Income Tax Act, as amended by the Act in the context information?', '**Question 1:**', 'What are the specific dates assigned to the twenty-third to twenty-eighth qualifying periods in the amended definition of \"current reference period\"?']\n",
"2\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6446e58ae076476589737522e642d5aa",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Parsing nodes: 0%| | 0/2 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 5/5 [00:10<00:00, 2.15s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"['**Question 1:**', 'What are the conditions that must be met for an agreement to be considered valid under the definition of \"executive compensation repayment amount\"?', '**Question 1:**', 'How is the executive remuneration of an eligible entity calculated for the 2021 calendar year?', '**Question 1:**', 'What is the definition of \"G\" in the context of executive remuneration?', '**Question 1:**', 'What is the definition of \"prior reference period\" in subsection 125.7(1) of the Act, as amended by the provided legislation?', '**Question 1 (Comprehension):**', 'Explain the changes made to the definition of \"qualifying recovery entity\" in subsection 125.7(1) of the Act.']\n"
]
}
],
"source": [
"import time\n",
"# Number of documents sent to the generator per call, and the pause between calls.\n",
"BATCH_SIZE = 2\n",
"PAUSE_SECONDS = 60  # presumably to stay under the LLM API rate limit -- TODO confirm\n",
"\n",
"questions = []\n",
"for i in range(0, len(documents), BATCH_SIZE):\n",
"    print(i)\n",
"    document_pair = documents[i:i + BATCH_SIZE]  # Take BATCH_SIZE documents at a time\n",
"    question = question_dataset_generator(document_pair)\n",
"    print(question)\n",
"    questions.extend(question)\n",
"    # Only wait when another batch follows; sleeping after the last one is wasted time.\n",
"    if i + BATCH_SIZE < len(documents):\n",
"        time.sleep(PAUSE_SECONDS)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['**Question 1:**',\n",
" 'What are the key provisions of Part 1 of the Act, and how do they extend support in response to the COVID-19 pandemic?',\n",
" '**Question 1:**',\n",
" 'What are the two main types of leave of absence provided for under Part 4 of the Canada Labour Code amendments?',\n",
" '**Question 1:**',\n",
" 'What is the definition of \"base percentage\" in the Income Tax Act, as amended by the Act in the context information?',\n",
" '**Question 1:**',\n",
" 'What are the specific dates assigned to the twenty-third to twenty-eighth qualifying periods in the amended definition of \"current reference period\"?',\n",
" '**Question 1:**',\n",
" 'What are the conditions that must be met for an agreement to be considered valid under the definition of \"executive compensation repayment amount\"?',\n",
" '**Question 1:**',\n",
" 'How is the executive remuneration of an eligible entity calculated for the 2021 calendar year?',\n",
" '**Question 1:**',\n",
" 'What is the definition of \"G\" in the context of executive remuneration?',\n",
" '**Question 1:**',\n",
" 'What is the definition of \"prior reference period\" in subsection 125.7(1) of the Act, as amended by the provided legislation?',\n",
" '**Question 1 (Comprehension):**',\n",
" 'Explain the changes made to the definition of \"qualifying recovery entity\" in subsection 125.7(1) of the Act.']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the question list accumulated by the generation loop.\n",
"questions"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Questions list saved as pickle file.\n"
]
}
],
"source": [
"import pickle\n",
"# Persist the generated questions to disk as a pickle.\n",
"pickle_file_path = r'C:\\Users\\agshi\\Downloads\\question_list.pkl'\n",
"with open(pickle_file_path, 'wb') as pickle_out:\n",
"    pickle_out.write(pickle.dumps(questions))\n",
"\n",
"print(\"Questions list saved as pickle file.\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded questions: ['**Question 1:**', 'What are the key provisions of Part 1 of the Act, and how do they extend support in response to the COVID-19 pandemic?', '**Question 1:**', 'What are the two main types of leave of absence provided for under Part 4 of the Canada Labour Code amendments?', '**Question 1:**', 'What is the definition of \"base percentage\" in the Income Tax Act, as amended by the Act in the context information?', '**Question 1:**', 'What are the specific dates assigned to the twenty-third to twenty-eighth qualifying periods in the amended definition of \"current reference period\"?', '**Question 1:**', 'What are the conditions that must be met for an agreement to be considered valid under the definition of \"executive compensation repayment amount\"?', '**Question 1:**', 'How is the executive remuneration of an eligible entity calculated for the 2021 calendar year?', '**Question 1:**', 'What is the definition of \"G\" in the context of executive remuneration?', '**Question 1:**', 'What is the definition of \"prior reference period\" in subsection 125.7(1) of the Act, as amended by the provided legislation?', '**Question 1 (Comprehension):**', 'Explain the changes made to the definition of \"qualifying recovery entity\" in subsection 125.7(1) of the Act.']\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick here for more info. \n",
"\u001b[1;31mView Jupyter log for further details."
]
}
],
"source": [
"import pickle\n",
"\n",
"# Path to the pickle file\n",
"pickle_file_path = r'C:\\Users\\agshi\\Downloads\\question_list.pkl'\n",
"\n",
"# Read the raw bytes, then unpickle them. Unpickling can execute arbitrary\n",
"# code, so only load files that are trusted.\n",
"with open(pickle_file_path, 'rb') as pickle_in:\n",
"    loaded_questions = pickle.loads(pickle_in.read())\n",
"\n",
"print(\"Loaded questions:\", loaded_questions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core.evaluation import BatchEvalRunner\n",
"from llama_index.core.evaluation import RelevancyEvaluator\n",
"from llama_index.core.evaluation import FaithfulnessEvaluator\n",
"from llama_index.llms.groq import Groq\n",
"import os\n",
"from llama_index.llms.gemini import Gemini\n",
"\n",
"# The API key is taken from the environment (or prompted for) instead of being\n",
"# committed in the notebook. The key that used to be hard-coded here has been\n",
"# exposed and should be revoked.\n",
"if not os.environ.get(\"GOOGLE_API_KEY\"):\n",
"    from getpass import getpass\n",
"\n",
"    os.environ[\"GOOGLE_API_KEY\"] = getpass(\"GOOGLE_API_KEY: \")\n",
"\n",
"llm = Gemini(model=\"models/gemini-pro\", temperature=0)\n",
"# Both evaluators share this Gemini client.\n",
"relevancy_evaluator = RelevancyEvaluator(llm=llm)\n",
"faithfulness_evaluator = FaithfulnessEvaluator(llm=llm)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def extract_elements(eval_result):\n",
"    \"\"\"Flatten one evaluation result into a single-row DataFrame.\n",
"\n",
"    Args:\n",
"        eval_result: Object exposing every attribute named in ``field_names``.\n",
"\n",
"    Returns:\n",
"        pandas.DataFrame with one row and one column per attribute, in the\n",
"        order listed in ``field_names``.\n",
"    \"\"\"\n",
"    field_names = (\n",
"        \"contexts\",\n",
"        \"response\",\n",
"        \"passing\",\n",
"        \"feedback\",\n",
"        \"score\",\n",
"        \"pairwise_source\",\n",
"        \"invalid_result\",\n",
"        \"invalid_reason\",\n",
"    )\n",
"\n",
"    # Each value is wrapped in a one-element list so it fills exactly one cell,\n",
"    # even when the value is itself a list (e.g. contexts).\n",
"    single_row = {name: [getattr(eval_result, name)] for name in field_names}\n",
"    return pd.DataFrame(single_row)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'query_engine' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[11], line 8\u001b[0m\n\u001b[0;32m 6\u001b[0m llm \u001b[38;5;241m=\u001b[39m Gemini(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodels/gemini-pro\u001b[39m\u001b[38;5;124m\"\u001b[39m, temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m question \u001b[38;5;129;01min\u001b[39;00m questions:\n\u001b[1;32m----> 8\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mquery_engine\u001b[49m\u001b[38;5;241m.\u001b[39mquery(question)\n\u001b[0;32m 10\u001b[0m \u001b[38;5;66;03m# Evaluate faithfulness\u001b[39;00m\n\u001b[0;32m 11\u001b[0m eval_result \u001b[38;5;241m=\u001b[39m faithfulness_evaluator\u001b[38;5;241m.\u001b[39mevaluate_response(query\u001b[38;5;241m=\u001b[39mquestion, response\u001b[38;5;241m=\u001b[39mresponse)\n",
"\u001b[1;31mNameError\u001b[0m: name 'query_engine' is not defined"
]
}
],
"source": [
"import time\n",
"import pandas as pd\n",
"# The API key is taken from the environment (or prompted for) instead of being\n",
"# committed in the notebook. The key that used to be hard-coded here has been\n",
"# exposed and should be revoked.\n",
"if not os.environ.get(\"GOOGLE_API_KEY\"):\n",
"    from getpass import getpass\n",
"\n",
"    os.environ[\"GOOGLE_API_KEY\"] = getpass(\"GOOGLE_API_KEY: \")\n",
"\n",
"# NOTE(review): the evaluators were built from the earlier `llm` object, so\n",
"# rebinding the name here does not change which client they hold.\n",
"llm = Gemini(model=\"models/gemini-pro\", temperature=0)\n",
"\n",
"PAUSE_SECONDS = 60  # presumably to stay under the LLM API rate limit -- TODO confirm\n",
"\n",
"faithfulness_df = pd.DataFrame()\n",
"relevancy_df = pd.DataFrame()\n",
"faithfulness_rows = []\n",
"relevancy_rows = []\n",
"\n",
"# NOTE(review): `query_engine` is not defined in any cell of this notebook (the\n",
"# saved output of this cell is a NameError); it must be created before running.\n",
"try:\n",
"    for position, question in enumerate(questions, start=1):\n",
"        response = query_engine.query(question)\n",
"\n",
"        # Evaluate faithfulness\n",
"        eval_result = faithfulness_evaluator.evaluate_response(query=question, response=response)\n",
"        faithfulness_rows.append(extract_elements(eval_result))\n",
"\n",
"        # Evaluate relevancy\n",
"        eval_result = relevancy_evaluator.evaluate_response(query=question, response=response)\n",
"        relevancy_rows.append(extract_elements(eval_result))\n",
"\n",
"        # Only wait when another question follows.\n",
"        if position < len(questions):\n",
"            time.sleep(PAUSE_SECONDS)\n",
"finally:\n",
"    # Concatenate once instead of growing the frames inside the loop; doing it\n",
"    # in `finally` keeps the rows collected so far if the loop is interrupted.\n",
"    if faithfulness_rows:\n",
"        faithfulness_df = pd.concat(faithfulness_rows, ignore_index=True)\n",
"    if relevancy_rows:\n",
"        relevancy_df = pd.concat(relevancy_rows, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Content split into 21 chunks.\n",
"Chunk 1 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_1.txt\n",
"Chunk 2 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_2.txt\n",
"Chunk 3 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_3.txt\n",
"Chunk 4 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_4.txt\n",
"Chunk 5 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_5.txt\n",
"Chunk 6 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_6.txt\n",
"Chunk 7 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_7.txt\n",
"Chunk 8 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_8.txt\n",
"Chunk 9 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_9.txt\n",
"Chunk 10 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_10.txt\n",
"Chunk 11 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_11.txt\n",
"Chunk 12 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_12.txt\n",
"Chunk 13 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_13.txt\n",
"Chunk 14 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_14.txt\n",
"Chunk 15 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_15.txt\n",
"Chunk 16 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_16.txt\n",
"Chunk 17 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_17.txt\n",
"Chunk 18 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_18.txt\n",
"Chunk 19 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_19.txt\n",
"Chunk 20 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_20.txt\n",
"Chunk 21 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_21.txt\n"
]
}
],
"source": [
"import os\n",
"\n",
"# Step 1: Read the content of the text file\n",
"input_file_path = r\"C:\\Users\\agshi\\Downloads\\discussion_data-20240911T083316Z-001\\discussion_data\\C-2_4.txt\"\n",
"with open(input_file_path, 'r', encoding='utf-8') as file:\n",
"    content = file.read()\n",
"\n",
"# Step 2: Cut the text into consecutive fixed-size character windows\n",
"# (the last window may be shorter).\n",
"chunk_size = 5000  # characters per chunk\n",
"chunk_starts = range(0, len(content), chunk_size)\n",
"chunks = [content[start:start + chunk_size] for start in chunk_starts]\n",
"\n",
"print(f\"Content split into {len(chunks)} chunks.\")\n",
"\n",
"# Step 3: Write each chunk to its own file in the output folder.\n",
"# NOTE(review): the document-loading cell at the top of the notebook reads from this same folder.\n",
"output_folder = r'C:\\Users\\agshi\\Downloads\\output_chunks'\n",
"os.makedirs(output_folder, exist_ok=True)\n",
"\n",
"# File names are built from the input file's base name plus a 1-based chunk number.\n",
"input_file_name = os.path.basename(input_file_path)\n",
"file_base_name = os.path.splitext(input_file_name)[0]\n",
"\n",
"for i, chunk in enumerate(chunks):\n",
"    chunk_file_name = f\"{file_base_name}_chunk_{i+1}.txt\"\n",
"    output_file_path = os.path.join(output_folder, chunk_file_name)\n",
"\n",
"    with open(output_file_path, 'w', encoding='utf-8') as output_file:\n",
"        output_file.write(chunk)\n",
"\n",
"    print(f\"Chunk {i+1} saved as: {output_file_path}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}