{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 4 documents.\n"
     ]
    }
   ],
   "source": [
    "from llama_index.core import SimpleDirectoryReader\n",
    "# Load every chunk file from the output folder, using file names as document IDs.\n",
    "chunk_reader = SimpleDirectoryReader(\n",
    "    r\"C:\\Users\\agshi\\Downloads\\output_chunks\",\n",
    "    filename_as_id=True,\n",
    ")\n",
    "documents = chunk_reader.load_data()\n",
    "print(f\"Loaded {len(documents)} documents.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.llms.ollama import Ollama\n",
    "\n",
    "# Ollama client for the llama3:8b model; each request may take up to 120 seconds.\n",
    "llm = Ollama(\n",
    "    model=\"llama3:8b\",\n",
    "    request_timeout=120.0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core import SimpleDirectoryReader\n",
    "from llama_index.core.llama_dataset.generator import RagDatasetGenerator\n",
    "from llama_index.llms.gemini import Gemini\n",
    "import os\n",
    "from llama_index.core import Settings\n",
    "from llama_index.llms.groq import Groq\n",
    "import nest_asyncio\n",
    "# Allow asyncio event loops to be nested inside the notebook's already-running loop.\n",
    "nest_asyncio.apply()\n",
    "\n",
    "# Gemini Pro client used for question generation (temperature fixed at 0).\n",
    "llm = Gemini(\n",
    "    model=\"models/gemini-pro\",\n",
    "    temperature=0,\n",
    ")\n",
    "\n",
    "def question_dataset_generator(document):\n",
    "    \"\"\"Generate evaluation questions for a batch of documents.\n",
    "\n",
    "    Args:\n",
    "        document: Sequence of documents handed to\n",
    "            ``RagDatasetGenerator.from_documents``.\n",
    "\n",
    "    Returns:\n",
    "        list[str]: The generated queries, excluding entries that consist only\n",
    "        of a bold Markdown label such as ``**Question 1:**``.\n",
    "    \"\"\"\n",
    "    import re\n",
    "\n",
    "    dataset_generator = RagDatasetGenerator.from_documents(\n",
    "        documents=document,\n",
    "        llm=llm,  # module-level LLM defined in this cell\n",
    "        num_questions_per_chunk=2,  # number of questions requested per node\n",
    "        show_progress=True,\n",
    "    )\n",
    "    rag_dataset = dataset_generator.generate_questions_from_nodes()\n",
    "\n",
    "    # The recorded runs contain examples whose query is just a heading like\n",
    "    # '**Question 1:**'; those are not questions and must not be evaluated.\n",
    "    heading_only = re.compile(r\"\\*\\*[^*]*\\*\\*\")\n",
    "    return [\n",
    "        example.query\n",
    "        for example in rag_dataset.examples\n",
    "        if not heading_only.fullmatch(example.query.strip())\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "122068b898824caabf36017c8ef57733",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Parsing nodes:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 4/4 [00:10<00:00,  2.70s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['**Question 1:**', 'What are the key provisions of Part 1 of the Act, and how do they extend support in response to the COVID-19 pandemic?', '**Question 1:**', 'What are the two main types of leave of absence provided for under Part 4 of the Canada Labour Code amendments?', '**Question 1:**', 'What is the definition of \"base percentage\" in the Income Tax Act, as amended by the Act in the context information?', '**Question 1:**', 'What are the specific dates assigned to the twenty-third to twenty-eighth qualifying periods in the amended definition of \"current reference period\"?']\n",
      "2\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6446e58ae076476589737522e642d5aa",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Parsing nodes:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5/5 [00:10<00:00,  2.15s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['**Question 1:**', 'What are the conditions that must be met for an agreement to be considered valid under the definition of \"executive compensation repayment amount\"?', '**Question 1:**', 'How is the executive remuneration of an eligible entity calculated for the 2021 calendar year?', '**Question 1:**', 'What is the definition of \"G\" in the context of executive remuneration?', '**Question 1:**', 'What is the definition of \"prior reference period\" in subsection 125.7(1) of the Act, as amended by the provided legislation?', '**Question 1 (Comprehension):**', 'Explain the changes made to the definition of \"qualifying recovery entity\" in subsection 125.7(1) of the Act.']\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "BATCH_SIZE = 2       # documents sent to the generator per call\n",
    "PAUSE_SECONDS = 60   # wait between consecutive generator calls\n",
    "\n",
    "questions = []\n",
    "for i in range(0, len(documents), BATCH_SIZE):\n",
    "    # Pause only between batches: there is nothing to wait for after the last one.\n",
    "    if i:\n",
    "        time.sleep(PAUSE_SECONDS)\n",
    "    print(i)\n",
    "    document_pair = documents[i:i + BATCH_SIZE]  # Take BATCH_SIZE documents at a time\n",
    "    question = question_dataset_generator(document_pair)\n",
    "    print(question)\n",
    "    questions.extend(question)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['**Question 1:**',\n",
       " 'What are the key provisions of Part 1 of the Act, and how do they extend support in response to the COVID-19 pandemic?',\n",
       " '**Question 1:**',\n",
       " 'What are the two main types of leave of absence provided for under Part 4 of the Canada Labour Code amendments?',\n",
       " '**Question 1:**',\n",
       " 'What is the definition of \"base percentage\" in the Income Tax Act, as amended by the Act in the context information?',\n",
       " '**Question 1:**',\n",
       " 'What are the specific dates assigned to the twenty-third to twenty-eighth qualifying periods in the amended definition of \"current reference period\"?',\n",
       " '**Question 1:**',\n",
       " 'What are the conditions that must be met for an agreement to be considered valid under the definition of \"executive compensation repayment amount\"?',\n",
       " '**Question 1:**',\n",
       " 'How is the executive remuneration of an eligible entity calculated for the 2021 calendar year?',\n",
       " '**Question 1:**',\n",
       " 'What is the definition of \"G\" in the context of executive remuneration?',\n",
       " '**Question 1:**',\n",
       " 'What is the definition of \"prior reference period\" in subsection 125.7(1) of the Act, as amended by the provided legislation?',\n",
       " '**Question 1 (Comprehension):**',\n",
       " 'Explain the changes made to the definition of \"qualifying recovery entity\" in subsection 125.7(1) of the Act.']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "questions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Questions list saved as pickle file.\n"
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "# Serialize the question list first, then write the bytes to disk in one go.\n",
    "pickle_file_path = r'C:\\Users\\agshi\\Downloads\\question_list.pkl'\n",
    "serialized_questions = pickle.dumps(questions)\n",
    "with open(pickle_file_path, 'wb') as file:\n",
    "    file.write(serialized_questions)\n",
    "\n",
    "print(\"Questions list saved as pickle file.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded questions: ['**Question 1:**', 'What are the key provisions of Part 1 of the Act, and how do they extend support in response to the COVID-19 pandemic?', '**Question 1:**', 'What are the two main types of leave of absence provided for under Part 4 of the Canada Labour Code amendments?', '**Question 1:**', 'What is the definition of \"base percentage\" in the Income Tax Act, as amended by the Act in the context information?', '**Question 1:**', 'What are the specific dates assigned to the twenty-third to twenty-eighth qualifying periods in the amended definition of \"current reference period\"?', '**Question 1:**', 'What are the conditions that must be met for an agreement to be considered valid under the definition of \"executive compensation repayment amount\"?', '**Question 1:**', 'How is the executive remuneration of an eligible entity calculated for the 2021 calendar year?', '**Question 1:**', 'What is the definition of \"G\" in the context of executive remuneration?', '**Question 1:**', 'What is the definition of \"prior reference period\" in subsection 125.7(1) of the Act, as amended by the provided legislation?', '**Question 1 (Comprehension):**', 'Explain the changes made to the definition of \"qualifying recovery entity\" in subsection 125.7(1) of the Act.']\n"
     ]
    },
    {
     "ename": "",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "\n",
    "# Location of the pickled question list\n",
    "pickle_file_path = r'C:\\Users\\agshi\\Downloads\\question_list.pkl'\n",
    "\n",
    "# Read the raw bytes from disk, then unpickle the stored object\n",
    "with open(pickle_file_path, 'rb') as file:\n",
    "    raw_bytes = file.read()\n",
    "loaded_questions = pickle.loads(raw_bytes)\n",
    "\n",
    "print(\"Loaded questions:\", loaded_questions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core.evaluation import BatchEvalRunner\n",
    "from llama_index.core.evaluation import RelevancyEvaluator\n",
    "from llama_index.core.evaluation import FaithfulnessEvaluator\n",
    "from llama_index.llms.groq import Groq\n",
    "import os\n",
    "from llama_index.llms.gemini import Gemini\n",
    "\n",
    "# Never hard-code API keys in a notebook: take the key from the environment\n",
    "# and prompt for it (without echoing) only when it is missing.\n",
    "if not os.environ.get(\"GOOGLE_API_KEY\"):\n",
    "    from getpass import getpass\n",
    "    os.environ[\"GOOGLE_API_KEY\"] = getpass(\"GOOGLE_API_KEY: \")\n",
    "\n",
    "# One Gemini client is shared by both evaluators.\n",
    "llm = Gemini(model=\"models/gemini-pro\", temperature=0)\n",
    "relevancy_evaluator = RelevancyEvaluator(llm=llm)\n",
    "faithfulness_evaluator = FaithfulnessEvaluator(llm=llm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_elements(eval_result):\n",
    "    \"\"\"Turn one evaluation result into a single-row DataFrame.\n",
    "\n",
    "    Args:\n",
    "        eval_result: Object exposing the attributes listed in ``field_names``.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame: One row with one column per extracted attribute.\n",
    "    \"\"\"\n",
    "    field_names = (\n",
    "        \"contexts\",\n",
    "        \"response\",\n",
    "        \"passing\",\n",
    "        \"feedback\",\n",
    "        \"score\",\n",
    "        \"pairwise_source\",\n",
    "        \"invalid_result\",\n",
    "        \"invalid_reason\",\n",
    "    )\n",
    "\n",
    "    # Each value is wrapped in a one-element list so the frame has exactly one\n",
    "    # row, even when the attribute itself is a list.\n",
    "    single_row = {name: [getattr(eval_result, name)] for name in field_names}\n",
    "    return pd.DataFrame(single_row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'query_engine' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[11], line 8\u001b[0m\n\u001b[0;32m      6\u001b[0m llm \u001b[38;5;241m=\u001b[39m Gemini(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodels/gemini-pro\u001b[39m\u001b[38;5;124m\"\u001b[39m, temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m      7\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m question \u001b[38;5;129;01min\u001b[39;00m questions:\n\u001b[1;32m----> 8\u001b[0m     response \u001b[38;5;241m=\u001b[39m \u001b[43mquery_engine\u001b[49m\u001b[38;5;241m.\u001b[39mquery(question)\n\u001b[0;32m     10\u001b[0m     \u001b[38;5;66;03m# Evaluate faithfulness\u001b[39;00m\n\u001b[0;32m     11\u001b[0m     eval_result \u001b[38;5;241m=\u001b[39m faithfulness_evaluator\u001b[38;5;241m.\u001b[39mevaluate_response(query\u001b[38;5;241m=\u001b[39mquestion, response\u001b[38;5;241m=\u001b[39mresponse)\n",
      "\u001b[1;31mNameError\u001b[0m: name 'query_engine' is not defined"
     ]
    }
   ],
   "source": [
    "import time\n",
    "import pandas as pd\n",
    "# Never hard-code API keys in a notebook: take the key from the environment\n",
    "# and prompt for it (without echoing) only when it is missing.\n",
    "if not os.environ.get(\"GOOGLE_API_KEY\"):\n",
    "    from getpass import getpass\n",
    "    os.environ[\"GOOGLE_API_KEY\"] = getpass(\"GOOGLE_API_KEY: \")\n",
    "llm = Gemini(model=\"models/gemini-pro\", temperature=0)\n",
    "\n",
    "# No cell in this notebook defines `query_engine`, so build one over the loaded\n",
    "# documents unless it already exists in the session.\n",
    "if \"query_engine\" not in globals():\n",
    "    from llama_index.core import VectorStoreIndex\n",
    "    # NOTE(review): indexing uses the embedding model configured in llama_index\n",
    "    # Settings - confirm which embedding model is intended here.\n",
    "    query_engine = VectorStoreIndex.from_documents(documents).as_query_engine(llm=llm)\n",
    "\n",
    "PAUSE_SECONDS = 60  # wait between consecutive questions\n",
    "\n",
    "# Collect the one-row frames and concatenate once at the end instead of\n",
    "# re-copying a growing DataFrame on every iteration.\n",
    "faithfulness_frames = []\n",
    "relevancy_frames = []\n",
    "for position, question in enumerate(questions):\n",
    "    # Pause only between questions, not after the last one.\n",
    "    if position:\n",
    "        time.sleep(PAUSE_SECONDS)\n",
    "\n",
    "    response = query_engine.query(question)\n",
    "\n",
    "    # Evaluate faithfulness\n",
    "    eval_result = faithfulness_evaluator.evaluate_response(query=question, response=response)\n",
    "    faithfulness_frames.append(extract_elements(eval_result))\n",
    "\n",
    "    # Evaluate relevancy\n",
    "    eval_result = relevancy_evaluator.evaluate_response(query=question, response=response)\n",
    "    relevancy_frames.append(extract_elements(eval_result))\n",
    "\n",
    "# pd.concat raises on an empty list, so fall back to an empty frame.\n",
    "faithfulness_df = pd.concat(faithfulness_frames, ignore_index=True) if faithfulness_frames else pd.DataFrame()\n",
    "relevancy_df = pd.concat(relevancy_frames, ignore_index=True) if relevancy_frames else pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Content split into 21 chunks.\n",
      "Chunk 1 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_1.txt\n",
      "Chunk 2 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_2.txt\n",
      "Chunk 3 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_3.txt\n",
      "Chunk 4 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_4.txt\n",
      "Chunk 5 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_5.txt\n",
      "Chunk 6 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_6.txt\n",
      "Chunk 7 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_7.txt\n",
      "Chunk 8 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_8.txt\n",
      "Chunk 9 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_9.txt\n",
      "Chunk 10 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_10.txt\n",
      "Chunk 11 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_11.txt\n",
      "Chunk 12 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_12.txt\n",
      "Chunk 13 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_13.txt\n",
      "Chunk 14 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_14.txt\n",
      "Chunk 15 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_15.txt\n",
      "Chunk 16 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_16.txt\n",
      "Chunk 17 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_17.txt\n",
      "Chunk 18 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_18.txt\n",
      "Chunk 19 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_19.txt\n",
      "Chunk 20 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_20.txt\n",
      "Chunk 21 saved as: C:\\Users\\agshi\\Downloads\\output_chunks\\C-2_4_chunk_21.txt\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Step 1: Load the whole input text file into memory\n",
    "input_file_path = r\"C:\\Users\\agshi\\Downloads\\discussion_data-20240911T083316Z-001\\discussion_data\\C-2_4.txt\"\n",
    "with open(input_file_path, 'r', encoding='utf-8') as file:\n",
    "    content = file.read()\n",
    "\n",
    "# Step 2: Cut the text into consecutive fixed-size character windows\n",
    "# (the final window may be shorter than chunk_size)\n",
    "chunk_size = 5000  # characters per chunk; adjust as needed\n",
    "chunks = []\n",
    "for start in range(0, len(content), chunk_size):\n",
    "    chunks.append(content[start:start + chunk_size])\n",
    "\n",
    "print(f\"Content split into {len(chunks)} chunks.\")\n",
    "\n",
    "# Step 3: Make sure the output folder exists before writing into it\n",
    "output_folder = r'C:\\Users\\agshi\\Downloads\\output_chunks'\n",
    "os.makedirs(output_folder, exist_ok=True)\n",
    "\n",
    "# Output files are named after the input file's name without its extension\n",
    "input_file_name = os.path.basename(input_file_path)\n",
    "file_base_name = os.path.splitext(input_file_name)[0]\n",
    "\n",
    "# Write every chunk to its own numbered file (numbering starts at 1)\n",
    "for i, chunk in enumerate(chunks):\n",
    "    output_file_path = os.path.join(output_folder, f\"{file_base_name}_chunk_{i+1}.txt\")\n",
    "\n",
    "    with open(output_file_path, 'w', encoding='utf-8') as output_file:\n",
    "        output_file.write(chunk)\n",
    "\n",
    "    print(f\"Chunk {i+1} saved as: {output_file_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}