"from llama_index.core import SimpleDirectoryReader\n",
"documents = SimpleDirectoryReader(r\"C:\\Users\\agshi\\Downloads\\output_chunks\", filename_as_id=True).load_data()\n",
"print(f\"Loaded {len(documents)} documents.\")"
"from llama_index.llms.ollama import Ollama\n",
"llm = Ollama(model=\"llama3:8b\", request_timeout=120.0)"
"from llama_index.core import SimpleDirectoryReader\n",
"from llama_index.core.llama_dataset.generator import RagDatasetGenerator\n",
"from llama_index.llms.gemini import Gemini\n",
"import os\n",
"from llama_index.core import Settings\n",
"from llama_index.llms.groq import Groq\n",
"import nest_asyncio\n",
"# create llm\n",
"llm = Gemini(model=\"models/gemini-pro\", temperature=0)\n",
"def question_dataset_generator(document):\n",
" dataset_generator = RagDatasetGenerator.from_documents(\n",
" documents=document,\n",
" llm=llm,\n",
" num_questions_per_chunk=2,  # set the number of questions per node\n",
" show_progress=True\n",
" )\n",
" rag_dataset = dataset_generator.generate_questions_from_nodes()\n",
" question = [e.query for e in rag_dataset.examples]\n",
" return question"
"100%|██████████| 4/4 [00:10<00:00, 2.70s/it]\n"
"name": "stdout",
"output_type": "stream",
"text": [
"['**Question 1:**', 'What are the key provisions of Part 1 of the Act, and how do they extend support in response to the COVID-19 pandemic?', '**Question 1:**', 'What are the two main types of leave of absence provided for under Part 4 of the Canada Labour Code amendments?', '**Question 1:**', 'What is the definition of \"base percentage\" in the Income Tax Act, as amended by the Act in the context information?', '**Question 1:**', 'What are the specific dates assigned to the twenty-third to twenty-eighth qualifying periods in the amended definition of \"current reference period\"?']\n",
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 5/5 [00:10<00:00, 2.15s/it]\n"
"name": "stdout",
"output_type": "stream",
"text": [
"['**Question 1:**', 'What are the conditions that must be met for an agreement to be considered valid under the definition of \"executive compensation repayment amount\"?', '**Question 1:**', 'How is the executive remuneration of an eligible entity calculated for the 2021 calendar year?', '**Question 1:**', 'What is the definition of \"G\" in the context of executive remuneration?', '**Question 1:**', 'What is the definition of \"prior reference period\" in subsection 125.7(1) of the Act, as amended by the provided legislation?', '**Question 1 (Comprehension):**', 'Explain the changes made to the definition of \"qualifying recovery entity\" in subsection 125.7(1) of the Act.']\n"
"source": [
"import time\n",
"questions = []\n",
"for i in range(0, len(documents), 2):\n",
" print(i)\n",
" document_pair = documents[i:i+2] # Take two documents at a time\n",
" question = question_dataset_generator(document_pair)\n",
" print(question)\n",
" questions.extend(question)\n",
" time.sleep(60)\n"
"data": {
"text/plain": [
"['**Question 1:**',\n",
" 'What are the key provisions of Part 1 of the Act, and how do they extend support in response to the COVID-19 pandemic?',\n",
" '**Question 1:**',\n",
" 'What are the two main types of leave of absence provided for under Part 4 of the Canada Labour Code amendments?',\n",
" '**Question 1:**',\n",
" 'What is the definition of \"base percentage\" in the Income Tax Act, as amended by the Act in the context information?',\n",
" '**Question 1:**',\n",
" 'What are the specific dates assigned to the twenty-third to twenty-eighth qualifying periods in the amended definition of \"current reference period\"?',\n",
" '**Question 1:**',\n",
" 'What are the conditions that must be met for an agreement to be considered valid under the definition of \"executive compensation repayment amount\"?',\n",
" '**Question 1:**',\n",
" 'How is the executive remuneration of an eligible entity calculated for the 2021 calendar year?',\n",
" '**Question 1:**',\n",
" 'What is the definition of \"G\" in the context of executive remuneration?',\n",
" '**Question 1:**',\n",
" 'What is the definition of \"prior reference period\" in subsection 125.7(1) of the Act, as amended by the provided legislation?',\n",
" '**Question 1 (Comprehension):**',\n",
" 'Explain the changes made to the definition of \"qualifying recovery entity\" in subsection 125.7(1) of the Act.']"
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
"source": [
"name": "stdout",
"output_type": "stream",
"text": [
"Questions list saved as pickle file.\n"
"source": [
"import pickle\n",
"pickle_file_path = r'C:\\Users\\agshi\\Downloads\\question_list.pkl'\n",
"with open(pickle_file_path, 'wb') as file:\n",
" pickle.dump(questions, file)\n",
"print(\"Questions list saved as pickle file.\")"
"Loaded questions: ['**Question 1:**', 'What are the key provisions of Part 1 of the Act, and how do they extend support in response to the COVID-19 pandemic?', '**Question 1:**', 'What are the two main types of leave of absence provided for under Part 4 of the Canada Labour Code amendments?', '**Question 1:**', 'What is the definition of \"base percentage\" in the Income Tax Act, as amended by the Act in the context information?', '**Question 1:**', 'What are the specific dates assigned to the twenty-third to twenty-eighth qualifying periods in the amended definition of \"current reference period\"?', '**Question 1:**', 'What are the conditions that must be met for an agreement to be considered valid under the definition of \"executive compensation repayment amount\"?', '**Question 1:**', 'How is the executive remuneration of an eligible entity calculated for the 2021 calendar year?', '**Question 1:**', 'What is the definition of \"G\" in the context of executive remuneration?', '**Question 1:**', 'What is the definition of \"prior reference period\" in subsection 125.7(1) of the Act, as amended by the provided legislation?', '**Question 1 (Comprehension):**', 'Explain the changes made to the definition of \"qualifying recovery entity\" in subsection 125.7(1) of the Act.']\n"
"source": [
"import pickle\n",
"# Path to the pickle file\n",
"pickle_file_path = r'C:\\Users\\agshi\\Downloads\\question_list.pkl'\n",
"# Load the question list from the pickle file\n",
"with open(pickle_file_path, 'rb') as file:\n",
" loaded_questions = pickle.load(file)\n",
"print(\"Loaded questions:\", loaded_questions)"
"from llama_index.core.evaluation import BatchEvalRunner\n",
"from llama_index.core.evaluation import RelevancyEvaluator\n",
"from llama_index.core.evaluation import FaithfulnessEvaluator\n",
"from llama_index.llms.groq import Groq\n",
"import os\n",
"from llama_index.llms.gemini import Gemini\n",
"os.environ[\"GOOGLE_API_KEY\"] = \"[REDACTED]\"\n",
"llm = Gemini(model=\"models/gemini-pro\", temperature=0)\n",
"relevancy_evaluator = RelevancyEvaluator(llm=llm)\n",
"faithfulness_evaluator = FaithfulnessEvaluator(llm=llm)"
"def extract_elements(eval_result):\n",
" # Dictionary to store the extracted elements\n",
" extracted_data = {\n",
" \"contexts\": [eval_result.contexts],\n",
" \"response\": [eval_result.response],\n",
" \"passing\": [eval_result.passing],\n",
" \"feedback\": [eval_result.feedback],\n",
" \"score\": [eval_result.score],\n",
" \"pairwise_source\": [eval_result.pairwise_source],\n",
" \"invalid_result\": [eval_result.invalid_result],\n",
" \"invalid_reason\": [eval_result.invalid_reason]\n",
" }\n",
" # Convert the dictionary into a DataFrame\n",
" df = pd.DataFrame(extracted_data)\n",
" return df"
"import time\n",
"import pandas as pd\n",
"faithfulness_df = pd.DataFrame()\n",
"relevancy_df = pd.DataFrame()\n",
"os.environ[\"GOOGLE_API_KEY\"] = \"[REDACTED]\"\n",
"llm = Gemini(model=\"models/gemini-pro\", temperature=0)\n",
"for question in questions:\n",
" response = query_engine.query(question)\n",
" # Evaluate faithfulness\n",
" eval_result = faithfulness_evaluator.evaluate_response(query=question, response=response)\n",
" faithfulness_elements = extract_elements(eval_result)\n",
" faithfulness_df = pd.concat([faithfulness_df, faithfulness_elements], ignore_index=True)\n",
" # Evaluate relevancy\n",
" eval_result = relevancy_evaluator.evaluate_response( query=question, response=response)\n",
" relevancy_elements = extract_elements(eval_result)\n",
" relevancy_df = pd.concat([relevancy_df,relevancy_elements], ignore_index=True)\n",
" time.sleep(60)"
"Content split into 21 chunks.\n",
"Chunk 1 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_1.txt\n",
"Chunk 2 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_2.txt\n",
"Chunk 3 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_3.txt\n",
"Chunk 4 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_4.txt\n",
"Chunk 5 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_5.txt\n",
"Chunk 6 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_6.txt\n",
"Chunk 7 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_7.txt\n",
"Chunk 8 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_8.txt\n",
"Chunk 9 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_9.txt\n",
"Chunk 10 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_10.txt\n",
"Chunk 11 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_11.txt\n",
"Chunk 12 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_12.txt\n",
"Chunk 13 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_13.txt\n",
"Chunk 14 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_14.txt\n",
"Chunk 15 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_15.txt\n",
"Chunk 16 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_16.txt\n",
"Chunk 17 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_17.txt\n",
"Chunk 18 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_18.txt\n",
"Chunk 19 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_19.txt\n",
"Chunk 20 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_20.txt\n",
"Chunk 21 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_21.txt\n"
"source": [
"import os\n",
"# Step 1: Read the content of the text file\n",
"input_file_path = r\"C:\\Users\\agshi\\Downloads\\discussion_data-20240911T083316Z-001\\discussion_data\\C-2_4.txt\"\n",
"with open(input_file_path, 'r', encoding='utf-8') as file:\n",
" content = file.read()\n",
"# Step 2: Split the content into chunks based on a defined chunk size\n",
"chunk_size = 5000 # Chunk size in characters; adjust as needed\n",
"chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]\n",
"print(f\"Content split into {len(chunks)} chunks.\")\n",
"# Step 3: Save each chunk into a new file in the output folder\n",
"output_folder = r'C:\\Users\\agshi\\Downloads\\output_chunks'\n",
"os.makedirs(output_folder, exist_ok=True)\n",
"# Extract the base name of the input file (without extension)\n",
"file_base_name = os.path.splitext(os.path.basename(input_file_path))[0]\n",
"# Save each chunk with the file name and chunk number\n",
"for i, chunk in enumerate(chunks):\n",
" output_file_path = os.path.join(output_folder, f\"{file_base_name}_chunk_{i+1}.txt\")\n",
" \n",
" # Write each chunk into a separate file\n",
" with open(output_file_path, 'w', encoding='utf-8') as output_file:\n",
" output_file.write(chunk)\n",
" \n",
" print(f\"Chunk {i+1} saved as: {output_file_path}\")"
