Commit 050f867 · committed by FauziIsyrinApridal
1 Parent(s): 61960d8
Fix the semester filter and the department (jurusan) scraper; add BeautifulSoup

Files changed:
- app/(main)/components/RagDashboard.tsx  +4 -1
- components/SemesterFilter.tsx  +40 -49
- requirements.txt  +1 -1
- scrapping/jurusan_scrap.py  +609 -469
app/(main)/components/RagDashboard.tsx
CHANGED
@@ -189,7 +189,10 @@ export default function RagDashboard() {
 189           {isRefreshing ? "Refreshing..." : "Refresh"}
 190         </Button>
 191
-192         <SemesterFilter
+192         <SemesterFilter
+193           dates={ragData.map((file) => file.created_at)}
+194           onFilterChange={handleSemesterFilterChange}
+195         />
 196
 197         <DropdownMenu>
 198           <DropdownMenuTrigger asChild>
components/SemesterFilter.tsx
CHANGED
@@ -1,12 +1,11 @@
-import { useState
+import { useState } from "react";
 import { ChevronDown, Filter, Calendar } from "lucide-react";

-// Define the props interface for the component
 interface SemesterFilterProps {
-
+  dates: string[];
+  onFilterChange?: (semesterId: string) => void;
 }

-// Define the semester option type
 interface SemesterOption {
   id: string;
   label: string;
@@ -14,40 +13,57 @@ interface SemesterOption {
 }

 export default function SemesterFilter({
+  dates,
   onFilterChange,
 }: SemesterFilterProps) {
   const [semesterFilter, setSemesterFilter] = useState("all");
   const [isFilterOpen, setIsFilterOpen] = useState(false);

-  //
-  const
-
-
-
-
-
-
-
-  };
-
-  const academicYears =
+  // Extract unique academic years from dates
+  const extractAcademicYears = (dates: string[]): string[] => {
+    const yearSet = new Set<string>();
+
+    dates.forEach((dateStr) => {
+      const date = new Date(dateStr);
+      const year = date.getFullYear();
+      const month = date.getMonth() + 1;
+
+      let startYear: number;
+      if (month >= 9) {
+        // Odd semester: starts in September
+        startYear = year;
+      } else {
+        // Even semester: January–August of next year
+        startYear = year - 1;
+      }
+
+      const academicYear = `${startYear}/${startYear + 1}`;
+      yearSet.add(academicYear);
+    });
+
+    if (yearSet.size === 0) {
+      const currentYear = new Date().getFullYear();
+      yearSet.add(`${currentYear - 2}/${currentYear - 1}`);
+      yearSet.add(`${currentYear - 1}/${currentYear}`);
+      yearSet.add(`${currentYear}/${currentYear + 1}`);
+    }
+
+    return Array.from(yearSet).sort((a, b) => (a > b ? -1 : 1));
+  };
+
+  const academicYears = extractAcademicYears(dates);

   // Generate semester options
   const semesterOptions: SemesterOption[] = [];
   academicYears.forEach((academicYear) => {
     const [startYear, endYear] = academicYear.split("/");
+
     semesterOptions.push({
       id: `odd-${academicYear}`,
       label: `Ganjil ${academicYear}`,
       description: `September ${startYear} - January ${endYear}`,
     });

-    // Even semester (February - August of endYear)
     semesterOptions.push({
       id: `even-${academicYear}`,
       label: `Genap ${academicYear}`,
@@ -55,28 +71,6 @@ export default function SemesterFilter({
     });
   });

-  // Check if a document falls within a specific semester
-  const isInSemester = (date: string, semesterId: string) => {
-    if (semesterId === "all") return true;
-
-    const [type, academicYear] = semesterId.split("-");
-    const [startYear, endYear] = academicYear.split("/");
-    const docDate = new Date(date);
-    const docMonth = docDate.getMonth() + 1; // 1-12
-    const docYear = docDate.getFullYear();
-
-    if (type === "odd") {
-      // Odd semester: September (9) - January (1) of next year
-      return (
-        (docYear === parseInt(startYear) && docMonth >= 9 && docMonth <= 12) ||
-        (docYear === parseInt(endYear) && docMonth === 1)
-      );
-    } else {
-      // Even semester: February (2) - August (8)
-      return docYear === parseInt(endYear) && docMonth >= 2 && docMonth <= 8;
-    }
-  };
-
   const handleFilterClick = () => {
     setIsFilterOpen(!isFilterOpen);
   };
@@ -84,14 +78,11 @@ export default function SemesterFilter({
   const handleSemesterSelect = (semesterId: string) => {
     setSemesterFilter(semesterId);
     setIsFilterOpen(false);
-
-    // Call the onFilterChange prop if it exists
     if (onFilterChange) {
       onFilterChange(semesterId);
     }
   };

-  // Get display text for current filter
   const getCurrentFilterText = () => {
     if (semesterFilter === "all") return "All Semesters";

@@ -118,7 +109,7 @@ export default function SemesterFilter({
   {isFilterOpen && (
     <div className="absolute z-50 mt-2 min-w-[240px] rounded-md border border-gray-200 bg-white shadow-lg">
       <div className="py-1">
-        {/* All
+        {/* All Semesters */}
         <div
           onClick={() => handleSemesterSelect("all")}
           className={`cursor-pointer px-4 py-2 hover:bg-gray-100 ${
@@ -128,15 +119,15 @@ export default function SemesterFilter({
         All Semesters
       </div>

-
-      <div className="my-1 border-t border-gray-200"></div>
+      <div className="my-1 border-t border-gray-200" />

-      {/*
+      {/* Semester Options Grouped by Academic Year */}
       {academicYears.map((year, yearIndex) => (
         <div key={year}>
           {yearIndex > 0 && (
-            <div className="my-1 border-t border-gray-200"
+            <div className="my-1 border-t border-gray-200" />
           )}
+
           <div className="px-4 py-2 text-xs font-semibold text-gray-500">
             Academic Year {year}
           </div>
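For reference, the rule the new extractAcademicYears helper applies is: a date in September or later opens a new academic year, while January–August belongs to the year that started the previous September. The same mapping sketched in Python (the rest of this commit is Python; this is an illustration of the rule, not code from the repository):

    from datetime import datetime

    def academic_year(date_str: str) -> str:
        # Mirrors the TypeScript helper: month >= 9 opens a new academic year.
        d = datetime.fromisoformat(date_str)
        start = d.year if d.month >= 9 else d.year - 1
        return f"{start}/{start + 1}"

    print(academic_year("2024-10-05"))  # -> 2024/2025 (odd / Ganjil semester)
    print(academic_year("2025-03-12"))  # -> 2024/2025 (even / Genap semester)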
requirements.txt
CHANGED
@@ -3,4 +3,4 @@ supabase
 3  python-dotenv
 4  requests
 5  instaloader
-6
+6  BeautifulSoup
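Note on the new dependency: the scraper below imports the parser via `from bs4 import BeautifulSoup`, and on PyPI that import is provided by the `beautifulsoup4` distribution (the bare `BeautifulSoup` name resolves to the legacy 3.x release). A minimal sanity check, assuming `beautifulsoup4` is what ends up installed:

    # pip install beautifulsoup4
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>hello</p>", "html.parser")
    print(soup.p.get_text())  # -> hello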
scrapping/jurusan_scrap.py
CHANGED
@@ -8,12 +8,14 @@ from datetime import datetime
|
|
8 |
from collections import defaultdict
|
9 |
from supabase import create_client
|
10 |
from dotenv import load_dotenv
|
|
|
|
|
11 |
|
12 |
# Load environment variables
|
13 |
load_dotenv()
|
14 |
|
15 |
class PNPDepartmentSpider(scrapy.Spider):
|
16 |
-
name = '
|
17 |
|
18 |
DEPARTMENTS = {
|
19 |
'akt.pnp.ac.id': 'Akuntansi',
|
@@ -27,50 +29,88 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
27 |
|
28 |
start_urls = [f'https://{domain}' for domain in DEPARTMENTS.keys()]
|
29 |
visited_urls = set()
|
|
|
30 |
|
31 |
custom_settings = {
|
32 |
-
'DOWNLOAD_DELAY':
|
33 |
'ROBOTSTXT_OBEY': True,
|
34 |
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
35 |
'LOG_LEVEL': 'INFO',
|
36 |
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
|
37 |
-
|
38 |
-
'RETRY_TIMES':
|
39 |
'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
|
40 |
-
'HTTPCACHE_ENABLED': True
|
|
|
|
|
|
|
41 |
}
|
42 |
|
43 |
def __init__(self, *args, **kwargs):
|
44 |
super().__init__(*args, **kwargs)
|
45 |
-
self.
|
46 |
-
|
47 |
-
os.getenv("NEXT_PUBLIC_SUPABASE_SERVICE_KEY")
|
48 |
-
)
|
49 |
-
self.storage_bucket = os.getenv("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
|
50 |
self.department_data = defaultdict(lambda: defaultdict(list))
|
51 |
self.study_programs = defaultdict(list)
|
52 |
self.department_info = defaultdict(dict)
|
53 |
|
54 |
def start_requests(self):
|
|
|
55 |
for url in self.start_urls:
|
56 |
yield scrapy.Request(
|
57 |
url=url,
|
58 |
callback=self.parse_department_homepage,
|
59 |
errback=self.handle_error,
|
60 |
-
headers={'Accept': 'text/html,application/xhtml+xml'}
|
|
|
61 |
)
|
62 |
|
63 |
def parse_department_homepage(self, response):
|
|
|
|
|
|
|
64 |
domain = urlparse(response.url).netloc
|
65 |
department = self.DEPARTMENTS.get(domain, domain)
|
66 |
self.visited_urls.add(response.url)
|
67 |
|
68 |
-
self.logger.info(f"Processing department
|
69 |
|
70 |
-
# Extract homepage content
|
71 |
homepage_content = self.extract_content(response)
|
72 |
if homepage_content:
|
73 |
-
page_title =
|
74 |
self.save_page_content(
|
75 |
response.url,
|
76 |
page_title,
|
@@ -79,11 +119,12 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
79 |
'Beranda',
|
80 |
homepage_content
|
81 |
)
|
|
|
82 |
|
83 |
-
# Process navigation
|
84 |
nav_elements = self.extract_navigation(response)
|
85 |
for nav_item in nav_elements:
|
86 |
-
if not nav_item['link']
|
87 |
continue
|
88 |
|
89 |
full_url = response.urljoin(nav_item['link'])
|
@@ -98,12 +139,15 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
98 |
'category': category,
|
99 |
'department': department,
|
100 |
'domain': domain,
|
101 |
-
'menu_path': nav_item['text']
|
|
|
|
|
102 |
},
|
103 |
-
errback=self.handle_error
|
|
|
104 |
)
|
105 |
|
106 |
-
#
|
107 |
study_program_links = self.extract_study_program_links(response)
|
108 |
for prog in study_program_links:
|
109 |
if prog['link'] not in self.visited_urls:
|
@@ -113,12 +157,14 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
113 |
meta={
|
114 |
'page_title': prog['title'],
|
115 |
'department': department,
|
116 |
-
'domain': domain
|
|
|
|
|
117 |
},
|
118 |
errback=self.handle_error
|
119 |
)
|
120 |
|
121 |
-
#
|
122 |
vision_mission_links = self.extract_vision_mission_links(response)
|
123 |
for vm_link in vision_mission_links:
|
124 |
if vm_link['link'] not in self.visited_urls:
|
@@ -128,96 +174,168 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
128 |
meta={
|
129 |
'page_title': vm_link['title'],
|
130 |
'department': department,
|
131 |
-
'domain': domain
|
|
|
|
|
132 |
},
|
133 |
errback=self.handle_error
|
134 |
)
|
135 |
|
136 |
def extract_navigation(self, response):
|
137 |
-
"""
|
|
|
138 |
nav_items = []
|
139 |
|
140 |
-
#
|
141 |
-
|
142 |
-
'
|
143 |
-
'
|
144 |
-
'
|
145 |
-
'
|
146 |
-
'
|
|
|
|
|
147 |
]
|
148 |
|
149 |
-
for
|
150 |
-
|
151 |
-
|
152 |
-
link
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
156 |
nav_items.append({
|
157 |
-
'text': text
|
158 |
-
'link':
|
159 |
})
|
160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
return nav_items
|
162 |
|
163 |
def extract_study_program_links(self, response):
|
164 |
-
"""
|
165 |
program_links = []
|
166 |
|
167 |
-
#
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
182 |
program_links.append({
|
183 |
-
'title': text.
|
184 |
-
'link': response.urljoin(
|
185 |
})
|
186 |
-
|
187 |
-
# Fallback logic for websites that use a different structure
|
188 |
-
if not program_links:
|
189 |
-
program_links = super().extract_study_program_links(response)
|
190 |
|
191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
|
193 |
def extract_vision_mission_links(self, response):
|
194 |
-
"""
|
195 |
vm_links = []
|
196 |
|
197 |
-
# Terms
|
198 |
-
|
199 |
|
200 |
-
#
|
201 |
-
for term in
|
|
|
202 |
for link in response.css(f'a:contains("{term}")'):
|
203 |
text = self.clean_text(' '.join(link.css('::text').getall()))
|
204 |
url = link.css('::attr(href)').get()
|
205 |
|
206 |
if text and url:
|
207 |
vm_links.append({
|
208 |
-
'title': text
|
209 |
'link': response.urljoin(url)
|
210 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
return vm_links
|
213 |
|
214 |
def parse_content_page(self, response):
|
215 |
-
"""
|
|
|
216 |
meta = response.meta
|
217 |
self.visited_urls.add(response.url)
|
218 |
|
219 |
-
# Extract content
|
220 |
-
content = self.
|
221 |
|
222 |
if content:
|
223 |
self.save_page_content(
|
@@ -229,43 +347,180 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
229 |
content,
|
230 |
meta.get('menu_path', '')
|
231 |
)
|
232 |
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
|
|
|
|
|
|
|
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
meta={
|
249 |
-
'page_title': link_text,
|
250 |
-
'category': meta['category'], # Keep parent category
|
251 |
-
'department': meta['department'],
|
252 |
-
'domain': meta['domain'],
|
253 |
-
'menu_path': f"{meta.get('menu_path', '')} > {link_text}"
|
254 |
-
},
|
255 |
-
errback=self.handle_error
|
256 |
-
)
|
257 |
|
258 |
def parse_study_program(self, response):
|
259 |
-
"""
|
|
|
260 |
meta = response.meta
|
261 |
self.visited_urls.add(response.url)
|
262 |
|
263 |
department = meta['department']
|
264 |
-
program_title =
|
265 |
|
266 |
-
# Extract program details
|
267 |
program_details = self.extract_program_details(response)
|
268 |
-
|
269 |
|
270 |
# Add to the study programs collection
|
271 |
self.study_programs[department].append({
|
@@ -275,7 +530,7 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
275 |
})
|
276 |
|
277 |
# Also save as a regular page
|
278 |
-
content = self.
|
279 |
if content:
|
280 |
self.save_page_content(
|
281 |
response.url,
|
@@ -285,16 +540,18 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
285 |
'Program_Studi',
|
286 |
content
|
287 |
)
|
|
|
288 |
|
289 |
def extract_program_details(self, response):
|
290 |
-
"""Enhanced program details extraction
|
291 |
-
details = {}
|
|
|
292 |
|
293 |
-
#
|
294 |
degree_sources = [
|
295 |
-
|
296 |
-
|
297 |
-
' '.join(
|
298 |
]
|
299 |
|
300 |
degree_pattern = re.compile(
|
@@ -307,124 +564,84 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
307 |
details['degree'] = match.group(1).upper()
|
308 |
break
|
309 |
|
310 |
-
# Extract accreditation status
|
311 |
-
|
312 |
-
'
|
313 |
-
'
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
|
324 |
return details
|
325 |
|
326 |
def parse_vision_mission(self, response):
|
327 |
-
"""
|
|
|
328 |
meta = response.meta
|
329 |
self.visited_urls.add(response.url)
|
330 |
department = meta['department']
|
331 |
|
332 |
-
|
333 |
-
|
334 |
|
335 |
-
#
|
336 |
-
|
337 |
-
|
338 |
-
'h4:contains("Visi") + p', '.visi p', '#visi p',
|
339 |
-
'h2:contains("Vision") + p', 'h3:contains("Vision") + p',
|
340 |
-
'strong:contains("Visi") + p', 'b:contains("Visi") + p'
|
341 |
-
]
|
342 |
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
if vision_text:
|
349 |
-
break
|
350 |
-
except:
|
351 |
-
continue
|
352 |
-
|
353 |
-
# If still not found, try looking for paragraphs after headings
|
354 |
-
if not vision_text:
|
355 |
-
for heading in response.css('h1, h2, h3, h4, h5, h6'):
|
356 |
-
heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
|
357 |
-
if heading_text and ('visi' in heading_text.lower() or 'vision' in heading_text.lower()):
|
358 |
-
# Try to get the next paragraph
|
359 |
-
next_p = heading.xpath('following-sibling::p[1]')
|
360 |
-
if next_p:
|
361 |
-
vision_text = self.clean_text(' '.join(next_p.css('::text').getall()))
|
362 |
-
break
|
363 |
-
|
364 |
-
# Look for mission section using similar approach
|
365 |
-
mission_selectors = [
|
366 |
-
'h2:contains("Misi") + p', 'h3:contains("Misi") + p',
|
367 |
-
'h4:contains("Misi") + p', '.misi p', '#misi p',
|
368 |
-
'h2:contains("Mission") + p', 'h3:contains("Mission") + p',
|
369 |
-
'strong:contains("Misi") + p', 'b:contains("Misi") + p'
|
370 |
-
]
|
371 |
-
|
372 |
-
for selector in mission_selectors:
|
373 |
-
try:
|
374 |
-
mission = response.css(selector).get()
|
375 |
-
if mission:
|
376 |
-
mission_text = self.clean_text(scrapy.Selector(text=mission).css('::text').get(''))
|
377 |
-
if mission_text:
|
378 |
-
break
|
379 |
-
except:
|
380 |
-
continue
|
381 |
-
|
382 |
-
# If still not found, try looking for paragraphs after headings
|
383 |
-
if not mission_text:
|
384 |
-
for heading in response.css('h1, h2, h3, h4, h5, h6'):
|
385 |
-
heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
|
386 |
-
if heading_text and ('misi' in heading_text.lower() or 'mission' in heading_text.lower()):
|
387 |
-
# Try to get the next paragraph
|
388 |
-
next_p = heading.xpath('following-sibling::p[1]')
|
389 |
-
if next_p:
|
390 |
-
mission_text = self.clean_text(' '.join(next_p.css('::text').getall()))
|
391 |
-
break
|
392 |
-
|
393 |
-
# Try to find mission list items
|
394 |
-
mission_list_items = []
|
395 |
-
for list_selector in ['h2:contains("Misi") ~ ul li', 'h3:contains("Misi") ~ ul li',
|
396 |
-
'h4:contains("Misi") ~ ul li', '.misi ul li', '#misi ul li',
|
397 |
-
'h2:contains("Mission") ~ ul li', 'h3:contains("Mission") ~ ul li']:
|
398 |
-
try:
|
399 |
-
items = response.css(f'{list_selector}::text').getall()
|
400 |
-
if items:
|
401 |
-
mission_list_items = [self.clean_text(item) for item in items if self.clean_text(item)]
|
402 |
-
if mission_list_items:
|
403 |
-
break
|
404 |
-
except:
|
405 |
-
continue
|
406 |
|
407 |
-
# Store
|
408 |
-
if vision_text or mission_text or
|
409 |
if vision_text:
|
410 |
self.department_info[department]['vision'] = vision_text
|
411 |
if mission_text:
|
412 |
self.department_info[department]['mission'] = mission_text
|
413 |
-
if
|
414 |
-
self.department_info[department]['mission_items'] =
|
415 |
|
416 |
-
# Save as separate file
|
417 |
self.save_vision_mission(
|
418 |
department,
|
419 |
meta['domain'],
|
420 |
vision_text,
|
421 |
mission_text,
|
422 |
-
|
423 |
response.url
|
424 |
)
|
425 |
|
426 |
# Also save as a regular page
|
427 |
-
content = self.
|
428 |
if content:
|
429 |
self.save_page_content(
|
430 |
response.url,
|
@@ -434,371 +651,294 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
434 |
'Profil',
|
435 |
content
|
436 |
)
|
|
|
437 |
|
438 |
-
def
|
439 |
-
"""
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
# Extract paragraphs
|
470 |
-
for p in main_content.css('p'):
|
471 |
-
text = self.clean_text(' '.join(p.css('::text').getall()))
|
472 |
-
if text and len(text) > 20: # Minimum meaningful length
|
473 |
-
# Add any links found in this paragraph
|
474 |
-
links = []
|
475 |
-
for a in p.css('a'):
|
476 |
-
link_text = self.clean_text(' '.join(a.css('::text').getall()))
|
477 |
-
link_url = a.css('::attr(href)').get()
|
478 |
-
if link_text and link_url:
|
479 |
-
links.append(f"{link_text} (Link: {response.urljoin(link_url)})")
|
480 |
-
|
481 |
-
paragraph = text
|
482 |
-
if links:
|
483 |
-
paragraph += f" | Links: {'; '.join(links)}"
|
484 |
-
|
485 |
-
content["paragraphs"].append(paragraph)
|
486 |
-
|
487 |
-
# Extract list items
|
488 |
-
for li in main_content.css('li'):
|
489 |
-
text = self.clean_text(' '.join(li.css('::text').getall()))
|
490 |
-
if text and len(text) > 10:
|
491 |
-
content["paragraphs"].append(f"• {text}")
|
492 |
-
|
493 |
-
# If no structured text elements found, try general text extraction
|
494 |
-
if not content["paragraphs"]:
|
495 |
-
# Get all text nodes within divs but not within scripts or styles
|
496 |
-
for div in main_content.css('div'):
|
497 |
-
text = self.clean_text(' '.join(div.xpath('./text()').getall()))
|
498 |
-
if text and len(text) > 30:
|
499 |
-
content["paragraphs"].append(text)
|
500 |
-
|
501 |
-
# Extract tables
|
502 |
-
for table in main_content.css('table'):
|
503 |
-
rows = []
|
504 |
-
|
505 |
-
# Get header if it exists
|
506 |
-
headers = []
|
507 |
-
for th in table.css('thead th, tr th'):
|
508 |
-
header_text = self.clean_text(' '.join(th.css('::text').getall()))
|
509 |
-
if header_text:
|
510 |
-
headers.append(header_text)
|
511 |
-
|
512 |
-
if headers:
|
513 |
-
rows.append(" - ".join(headers))
|
514 |
-
|
515 |
-
# Get table body rows
|
516 |
-
for tr in table.css('tbody tr, tr'):
|
517 |
-
if tr.css('th') and not tr.css('td'):
|
518 |
-
continue # Skip header rows already processed
|
519 |
-
|
520 |
-
cells = []
|
521 |
-
for td in tr.css('td'):
|
522 |
-
cell_text = self.clean_text(' '.join(td.css('::text').getall()))
|
523 |
-
link = td.css('a::attr(href)').get()
|
524 |
-
if link:
|
525 |
-
cell_text += f" (Link: {response.urljoin(link)})"
|
526 |
-
if cell_text:
|
527 |
-
cells.append(cell_text)
|
528 |
-
else:
|
529 |
-
cells.append(" ") # Empty cell placeholder
|
530 |
-
|
531 |
-
if cells:
|
532 |
-
rows.append(" - ".join(cells))
|
533 |
-
|
534 |
-
if len(rows) > 1: # Only add if we have meaningful table
|
535 |
-
content["tables"].append("\n".join(rows))
|
536 |
-
|
537 |
-
# Extract downloads and files
|
538 |
-
for link in main_content.css('a[href]'):
|
539 |
-
href = link.css('::attr(href)').get()
|
540 |
-
if not href:
|
541 |
-
continue
|
542 |
-
|
543 |
-
link_text = self.clean_text(' '.join(link.css('::text').getall()))
|
544 |
-
if not link_text:
|
545 |
-
link_text = "Unduhan"
|
546 |
-
|
547 |
-
# Match common document formats
|
548 |
-
if re.search(r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)$', href.lower()):
|
549 |
-
# Extract file extension for better categorization
|
550 |
-
file_ext = href.split('.')[-1].lower()
|
551 |
-
content["files"].append({
|
552 |
-
"title": link_text,
|
553 |
-
"url": urljoin(response.url, href),
|
554 |
-
"type": file_ext
|
555 |
-
})
|
556 |
-
|
557 |
-
return content if any(value for value in content.values()) else None
|
558 |
|
559 |
def save_page_content(self, url, title, department, domain, category, content, menu_path=''):
|
560 |
-
"""Save
|
561 |
if not content or not title:
|
562 |
return
|
563 |
|
564 |
-
#
|
565 |
safe_title = re.sub(r'[^\w\s-]', '', title).strip().lower()
|
566 |
-
safe_title = re.sub(r'[-\s]+', '-', safe_title)
|
567 |
-
|
568 |
-
# Prepare the content
|
569 |
-
formatted_content = f"""# {title}
|
570 |
-
|
571 |
-
URL: {url}
|
572 |
-
Tanggal: {datetime.now().strftime('%d %B %Y')}
|
573 |
-
Jurusan: {department}
|
574 |
-
Kategori: {category}
|
575 |
-
"""
|
576 |
-
|
577 |
-
if menu_path:
|
578 |
-
formatted_content += f"Navigasi: {menu_path}\n"
|
579 |
-
|
580 |
-
formatted_content += "\n## Konten\n\n"
|
581 |
-
if content["paragraphs"]:
|
582 |
-
formatted_content += "\n".join(content["paragraphs"])
|
583 |
-
|
584 |
-
if content["tables"]:
|
585 |
-
formatted_content += "\n\n## Tabel Data\n\n"
|
586 |
-
for i, table in enumerate(content["tables"]):
|
587 |
-
formatted_content += f"### Tabel {i+1}\n{table}\n\n"
|
588 |
-
|
589 |
-
if content["files"]:
|
590 |
-
formatted_content += "\n\n## Berkas\n\n"
|
591 |
-
for file in content["files"]:
|
592 |
-
formatted_content += f"- {file['title']} [{file['type']}]: {file['url']}\n"
|
593 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
594 |
-
|
595 |
-
filename = f"{department}_{safe_title}_{timestamp}.txt"
|
596 |
|
597 |
-
# Upload file to Supabase
|
598 |
try:
|
599 |
-
|
600 |
-
|
|
|
|
|
|
|
601 |
upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
|
602 |
path=filename,
|
603 |
-
file=
|
604 |
-
file_options={"content-type": "
|
605 |
)
|
606 |
|
607 |
self.logger.info(f"Successfully uploaded {filename}")
|
608 |
|
609 |
-
# Store in our collection
|
610 |
self.department_data[department][category].append({
|
611 |
'title': title,
|
612 |
'url': url,
|
613 |
-
'filename': filename
|
|
|
614 |
})
|
615 |
|
616 |
except Exception as e:
|
617 |
self.logger.error(f"Upload failed for {filename}: {str(e)}")
|
|
|
618 |
|
619 |
def save_vision_mission(self, department, domain, vision, mission, mission_items, url):
|
620 |
-
"""Save vision & mission
|
621 |
-
filename = f"{department}_Visi_Misi.
|
622 |
-
|
623 |
-
content =
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
|
628 |
-
|
629 |
-
""
|
630 |
-
|
631 |
-
|
632 |
-
content += f"## Visi\n\n{vision}\n\n"
|
633 |
-
|
634 |
-
if mission:
|
635 |
-
content += f"## Misi\n\n{mission}\n\n"
|
636 |
-
|
637 |
-
if mission_items:
|
638 |
-
if not mission: # Only add header if not already added
|
639 |
-
content += "## Misi\n\n"
|
640 |
-
for i, item in enumerate(mission_items, 1):
|
641 |
-
content += f"{i}. {item}\n"
|
642 |
|
643 |
try:
|
644 |
-
|
645 |
-
|
646 |
-
|
647 |
-
except:
|
648 |
-
pass
|
649 |
-
|
650 |
upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
|
651 |
path=filename,
|
652 |
-
file=
|
653 |
-
file_options={"content-type": "
|
654 |
)
|
655 |
|
656 |
-
self.logger.info(f"Successfully uploaded {
|
657 |
except Exception as e:
|
658 |
-
self.logger.error(f"
|
659 |
-
|
|
|
660 |
def clean_text(self, text):
|
661 |
-
"""
|
662 |
if not text:
|
663 |
return ""
|
664 |
|
665 |
# Normalize unicode characters
|
666 |
text = unicodedata.normalize('NFKC', text)
|
667 |
|
668 |
-
#
|
669 |
-
text = re.sub(r'\
|
670 |
|
671 |
-
#
|
672 |
-
text = re.sub(r'
|
673 |
|
674 |
-
# Remove
|
675 |
-
text =
|
676 |
|
677 |
-
return text
|
678 |
|
679 |
def determine_category(self, menu_text):
|
680 |
-
"""
|
681 |
menu_lower = menu_text.lower()
|
682 |
|
683 |
-
|
684 |
-
|
685 |
-
'
|
686 |
-
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
}
|
696 |
|
697 |
-
|
698 |
-
|
699 |
-
if any(term in menu_lower for term in terms):
|
700 |
return category
|
701 |
|
702 |
-
# Default category if no match
|
703 |
return 'Lainnya'
|
704 |
|
705 |
def is_social_media_link(self, url):
|
706 |
-
"""Check if URL is
|
707 |
-
|
708 |
-
'facebook.com', 'twitter.com', 'instagram.com',
|
709 |
-
'youtube.com', 'linkedin.com', '
|
710 |
-
'
|
711 |
]
|
712 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
713 |
|
714 |
def is_unwanted_url(self, url):
|
715 |
-
"""
|
716 |
-
|
717 |
-
if re.search(r'\.(jpg|jpeg|png|gif|svg|ico|css|js)$', url.lower()):
|
718 |
return True
|
719 |
|
720 |
-
# Skip certain URL patterns
|
721 |
unwanted_patterns = [
|
722 |
-
'
|
723 |
-
'wp
|
724 |
-
'javascript
|
725 |
-
'
|
726 |
]
|
727 |
|
728 |
-
|
|
|
729 |
|
730 |
def handle_error(self, failure):
|
731 |
-
"""
|
732 |
url = failure.request.url
|
733 |
-
|
734 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
735 |
|
736 |
def closed(self, reason):
|
737 |
-
"""
|
738 |
-
self.logger.info("Spider closed. Generating
|
739 |
|
740 |
-
#
|
741 |
-
|
742 |
-
pages_count = sum(len(cat_data) for dept_data in self.department_data.values()
|
743 |
-
for cat_data in dept_data.values())
|
744 |
|
745 |
-
self.logger.info(f"
|
746 |
-
|
|
|
|
|
|
|
747 |
|
748 |
-
|
749 |
-
|
|
|
750 |
|
751 |
-
|
752 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
753 |
|
754 |
-
def
|
755 |
-
"""Generate
|
756 |
-
content = "
|
757 |
-
content += f"**
|
758 |
|
759 |
# Create reverse mapping from department name to domain
|
760 |
reverse_departments = {v: k for k, v in self.DEPARTMENTS.items()}
|
761 |
|
762 |
for department, programs in self.study_programs.items():
|
763 |
-
# Get domain from reverse mapping
|
764 |
domain = reverse_departments.get(department, '')
|
765 |
-
website_url = f'https://{domain}' if domain else '
|
766 |
|
767 |
content += f"## {department.replace('_', ' ')}\n"
|
768 |
-
content += f"**Website**: {website_url}\n
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
769 |
|
|
|
770 |
if programs:
|
|
|
771 |
for prog in programs:
|
772 |
-
content += f"
|
773 |
-
content += f"-
|
774 |
-
content += f"-
|
775 |
-
content += f"-
|
776 |
|
777 |
if 'description' in prog['details']:
|
778 |
-
|
779 |
-
content += f"\n**Deskripsi**:\n{desc}\n"
|
780 |
-
|
781 |
-
content += "\n"
|
782 |
else:
|
783 |
-
content += "### Belum ada informasi program studi\n"
|
784 |
|
785 |
content += "\n---\n\n"
|
786 |
|
787 |
-
# Upload
|
788 |
-
filename = "
|
789 |
try:
|
790 |
-
self.supabase.storage.from_(self.storage_bucket).remove(filename)
|
791 |
self.supabase.storage.from_(self.storage_bucket).upload(
|
792 |
path=filename,
|
793 |
file=content.encode('utf-8'),
|
794 |
-
file_options={"content-type": "text/
|
795 |
)
|
796 |
-
self.logger.info("
|
797 |
except Exception as e:
|
798 |
-
self.logger.error(f"
|
799 |
|
800 |
|
801 |
-
# Main execution
|
802 |
if __name__ == "__main__":
|
803 |
process = CrawlerProcess()
|
804 |
process.crawl(PNPDepartmentSpider)
|
|
|
8 |
from collections import defaultdict
|
9 |
from supabase import create_client
|
10 |
from dotenv import load_dotenv
|
11 |
+
import logging
|
12 |
+
from bs4 import BeautifulSoup
|
13 |
|
14 |
# Load environment variables
|
15 |
load_dotenv()
|
16 |
|
17 |
class PNPDepartmentSpider(scrapy.Spider):
|
18 |
+
name = 'optimized_pnp_department_spider'
|
19 |
|
20 |
DEPARTMENTS = {
|
21 |
'akt.pnp.ac.id': 'Akuntansi',
|
|
|
29 |
|
30 |
start_urls = [f'https://{domain}' for domain in DEPARTMENTS.keys()]
|
31 |
visited_urls = set()
|
32 |
+
failed_urls = set()
|
33 |
|
34 |
custom_settings = {
|
35 |
+
'DOWNLOAD_DELAY': 1.5,
|
36 |
'ROBOTSTXT_OBEY': True,
|
37 |
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
38 |
'LOG_LEVEL': 'INFO',
|
39 |
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
|
40 |
+
'DOWNLOAD_TIMEOUT': 30,
|
41 |
+
'RETRY_TIMES': 2,
|
42 |
'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
|
43 |
+
'HTTPCACHE_ENABLED': True,
|
44 |
+
'HTTPCACHE_EXPIRATION_SECS': 86400, # Cache for 1 day
|
45 |
+
'DEPTH_LIMIT': 3,
|
46 |
+
'DEPTH_PRIORITY': 1
|
47 |
}
|
48 |
|
49 |
def __init__(self, *args, **kwargs):
|
50 |
super().__init__(*args, **kwargs)
|
51 |
+
self.setup_logging()
|
52 |
+
self.setup_supabase()
|
|
|
|
|
|
|
53 |
self.department_data = defaultdict(lambda: defaultdict(list))
|
54 |
self.study_programs = defaultdict(list)
|
55 |
self.department_info = defaultdict(dict)
|
56 |
+
self.session_stats = {
|
57 |
+
'total_pages': 0,
|
58 |
+
'successful_pages': 0,
|
59 |
+
'failed_pages': 0
|
60 |
+
}
|
61 |
+
|
62 |
+
def setup_logging(self):
|
63 |
+
"""Configure advanced logging"""
|
64 |
+
logging.basicConfig(
|
65 |
+
level=logging.INFO,
|
66 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
67 |
+
handlers=[
|
68 |
+
logging.FileHandler('pnp_spider.log'),
|
69 |
+
logging.StreamHandler()
|
70 |
+
]
|
71 |
+
)
|
72 |
+
self.logger = logging.getLogger(self.name)
|
73 |
+
|
74 |
+
def setup_supabase(self):
|
75 |
+
"""Initialize Supabase client with error handling"""
|
76 |
+
try:
|
77 |
+
self.supabase = create_client(
|
78 |
+
os.getenv("NEXT_PUBLIC_SUPABASE_URL"),
|
79 |
+
os.getenv("NEXT_PUBLIC_SUPABASE_SERVICE_KEY")
|
80 |
+
)
|
81 |
+
self.storage_bucket = os.getenv("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
|
82 |
+
# Test connection
|
83 |
+
self.supabase.storage.list_buckets()
|
84 |
+
self.logger.info("Successfully connected to Supabase")
|
85 |
+
except Exception as e:
|
86 |
+
self.logger.error(f"Supabase connection failed: {str(e)}")
|
87 |
+
raise
|
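setup_supabase reads three environment variables by name. A small pre-flight check (a sketch, not part of the commit) can fail fast before the spider is constructed; the variable names come from the code above, the values are deployment-specific:

    import os
    from dotenv import load_dotenv

    load_dotenv()
    required = [
        "NEXT_PUBLIC_SUPABASE_URL",
        "NEXT_PUBLIC_SUPABASE_SERVICE_KEY",
        "NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET",
    ]
    missing = [name for name in required if not os.getenv(name)]
    if missing:
        raise SystemExit(f"Missing environment variables: {', '.join(missing)}")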
88 |
|
89 |
def start_requests(self):
|
90 |
+
"""Initialize requests with better error handling"""
|
91 |
for url in self.start_urls:
|
92 |
yield scrapy.Request(
|
93 |
url=url,
|
94 |
callback=self.parse_department_homepage,
|
95 |
errback=self.handle_error,
|
96 |
+
headers={'Accept': 'text/html,application/xhtml+xml'},
|
97 |
+
meta={'retry_count': 0}
|
98 |
)
|
99 |
|
100 |
def parse_department_homepage(self, response):
|
101 |
+
"""Enhanced department homepage parsing"""
|
102 |
+
self.session_stats['total_pages'] += 1
|
103 |
+
|
104 |
domain = urlparse(response.url).netloc
|
105 |
department = self.DEPARTMENTS.get(domain, domain)
|
106 |
self.visited_urls.add(response.url)
|
107 |
|
108 |
+
self.logger.info(f"Processing department: {department} ({domain})")
|
109 |
|
110 |
+
# Extract homepage content
|
111 |
homepage_content = self.extract_content(response)
|
112 |
if homepage_content:
|
113 |
+
page_title = self.extract_page_title(response)
|
114 |
self.save_page_content(
|
115 |
response.url,
|
116 |
page_title,
|
|
|
119 |
'Beranda',
|
120 |
homepage_content
|
121 |
)
|
122 |
+
self.session_stats['successful_pages'] += 1
|
123 |
|
124 |
+
# Process navigation with improved detection
|
125 |
nav_elements = self.extract_navigation(response)
|
126 |
for nav_item in nav_elements:
|
127 |
+
if not self.should_follow_link(nav_item['link'], response.url):
|
128 |
continue
|
129 |
|
130 |
full_url = response.urljoin(nav_item['link'])
|
|
|
139 |
'category': category,
|
140 |
'department': department,
|
141 |
'domain': domain,
|
142 |
+
'menu_path': nav_item['text'],
|
143 |
+
'retry_count': 0,
|
144 |
+
'depth': response.meta.get('depth', 0) + 1
|
145 |
},
|
146 |
+
errback=self.handle_error,
|
147 |
+
priority=2 if 'prodi' in nav_item['text'].lower() else 1
|
148 |
)
|
149 |
|
150 |
+
# Process study programs with better detection
|
151 |
study_program_links = self.extract_study_program_links(response)
|
152 |
for prog in study_program_links:
|
153 |
if prog['link'] not in self.visited_urls:
|
|
|
157 |
meta={
|
158 |
'page_title': prog['title'],
|
159 |
'department': department,
|
160 |
+
'domain': domain,
|
161 |
+
'retry_count': 0,
|
162 |
+
'priority': 3 # Higher priority for program pages
|
163 |
},
|
164 |
errback=self.handle_error
|
165 |
)
|
166 |
|
167 |
+
# Process vision & mission with better detection
|
168 |
vision_mission_links = self.extract_vision_mission_links(response)
|
169 |
for vm_link in vision_mission_links:
|
170 |
if vm_link['link'] not in self.visited_urls:
|
|
|
174 |
meta={
|
175 |
'page_title': vm_link['title'],
|
176 |
'department': department,
|
177 |
+
'domain': domain,
|
178 |
+
'retry_count': 0,
|
179 |
+
'priority': 2
|
180 |
},
|
181 |
errback=self.handle_error
|
182 |
)
|
183 |
|
184 |
+
def should_follow_link(self, link, base_url):
|
185 |
+
"""Determine if a link should be followed"""
|
186 |
+
if not link or link.startswith('#') or link.startswith('javascript:'):
|
187 |
+
return False
|
188 |
+
|
189 |
+
parsed_link = urlparse(link)
|
190 |
+
parsed_base = urlparse(base_url)
|
191 |
+
|
192 |
+
# Skip if different domain
|
193 |
+
if parsed_link.netloc and parsed_link.netloc != parsed_base.netloc:
|
194 |
+
return False
|
195 |
+
|
196 |
+
# Skip unwanted file types
|
197 |
+
if re.search(r'\.(jpg|jpeg|png|gif|pdf|docx?|xlsx?|pptx?|zip|rar)$', link.lower()):
|
198 |
+
return False
|
199 |
+
|
200 |
+
# Skip admin/login pages
|
201 |
+
if any(x in link.lower() for x in ['wp-admin', 'wp-login', 'admin', 'login']):
|
202 |
+
return False
|
203 |
+
|
204 |
+
return True
|
205 |
+
|
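Illustrative expectations for should_follow_link, assuming `spider` is an already-constructed PNPDepartmentSpider instance (a sketch; the URLs are made up):

    base = "https://akt.pnp.ac.id/"
    assert spider.should_follow_link("#section", base) is False                   # in-page anchor
    assert spider.should_follow_link("javascript:void(0)", base) is False
    assert spider.should_follow_link("/files/brosur.pdf", base) is False          # document, not a page
    assert spider.should_follow_link("https://facebook.com/pnp", base) is False   # other domain
    assert spider.should_follow_link("/program-studi", base) is True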
206 |
+
def extract_page_title(self, response):
|
207 |
+
"""Extract page title with multiple fallbacks"""
|
208 |
+
title = response.css('h1::text, h1.page-title::text, h1.entry-title::text').get()
|
209 |
+
if not title:
|
210 |
+
title = response.css('title::text').get()
|
211 |
+
if not title:
|
212 |
+
title = response.url.split('/')[-1].replace('-', ' ').title()
|
213 |
+
return self.clean_text(title or 'Untitled Page')
|
214 |
+
|
215 |
def extract_navigation(self, response):
|
216 |
+
"""Improved navigation extraction with BeautifulSoup"""
|
217 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
218 |
nav_items = []
|
219 |
|
220 |
+
# Common navigation patterns
|
221 |
+
nav_patterns = [
|
222 |
+
{'name': 'nav'},  # 'name' targets the <nav> element itself; a 'tag' keyword would filter on a tag="..." attribute
|
223 |
+
{'class': 'navbar'},
|
224 |
+
{'class': 'navigation'},
|
225 |
+
{'class': 'main-menu'},
|
226 |
+
{'id': 'menu'},
|
227 |
+
{'class': 'primary-menu'},
|
228 |
+
{'role': 'navigation'}
|
229 |
]
|
230 |
|
231 |
+
for pattern in nav_patterns:
|
232 |
+
nav = soup.find(**pattern)
|
233 |
+
if nav:
|
234 |
+
for link in nav.find_all('a', href=True):
|
235 |
+
text = self.clean_text(link.get_text())
|
236 |
+
href = link['href']
|
237 |
+
|
238 |
+
if text and href and len(text) > 1 and not self.is_social_media_link(href):
|
239 |
nav_items.append({
|
240 |
+
'text': text,
|
241 |
+
'link': href
|
242 |
})
|
243 |
|
244 |
+
# Fallback to CSS selectors if BeautifulSoup finds nothing
|
245 |
+
if not nav_items:
|
246 |
+
for link in response.css('a'):
|
247 |
+
text = self.clean_text(' '.join(link.css('::text').getall()))
|
248 |
+
href = link.css('::attr(href)').get()
|
249 |
+
|
250 |
+
if text and href and len(text) > 1 and not self.is_social_media_link(href):
|
251 |
+
nav_items.append({
|
252 |
+
'text': text,
|
253 |
+
'link': href
|
254 |
+
})
|
255 |
+
|
256 |
return nav_items
|
257 |
|
258 |
def extract_study_program_links(self, response):
|
259 |
+
"""Enhanced study program link extraction"""
|
260 |
program_links = []
|
261 |
|
262 |
+
# Try BeautifulSoup first
|
263 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
264 |
+
|
265 |
+
# Look for common patterns in menu items
|
266 |
+
program_texts = ['program studi', 'prodi', 'jurusan', 'program pendidikan']
|
267 |
+
|
268 |
+
for text in program_texts:
|
269 |
+
menu_items = soup.find_all(lambda tag: tag.name == 'a' and text in tag.get_text().lower())
|
270 |
+
for item in menu_items:
|
271 |
+
href = item.get('href')
|
272 |
+
if href:
|
273 |
+
program_links.append({
|
274 |
+
'title': self.clean_text(item.get_text()),
|
275 |
+
'link': response.urljoin(href)
|
276 |
+
})
|
277 |
+
|
278 |
+
# Fallback to XPath if needed
|
279 |
+
if not program_links:
|
280 |
+
xpath = "//a[contains(translate(., 'PROGRAMSTUDI', 'programstudi'), 'program studi') or contains(., 'Prodi')]"
|
281 |
+
for link in response.xpath(xpath):
|
282 |
program_links.append({
|
283 |
+
'title': self.clean_text(''.join(link.xpath('.//text()').getall())),
|
284 |
+
'link': response.urljoin(link.xpath('@href').get())
|
285 |
})
|
|
|
|
|
|
|
|
|
286 |
|
287 |
+
# Deduplicate
|
288 |
+
seen = set()
|
289 |
+
unique_links = []
|
290 |
+
for prog in program_links:
|
291 |
+
if prog['link'] not in seen:
|
292 |
+
seen.add(prog['link'])
|
293 |
+
unique_links.append(prog)
|
294 |
+
|
295 |
+
return unique_links
|
296 |
|
297 |
def extract_vision_mission_links(self, response):
|
298 |
+
"""Improved vision & mission link detection"""
|
299 |
vm_links = []
|
300 |
|
301 |
+
# Terms in multiple languages
|
302 |
+
terms = ['visi', 'misi', 'vision', 'mission', 'tujuan', 'goal']
|
303 |
|
304 |
+
# Check both link text and URLs
|
305 |
+
for term in terms:
|
306 |
+
# Links containing the term in text
|
307 |
for link in response.css(f'a:contains("{term}")'):
|
308 |
text = self.clean_text(' '.join(link.css('::text').getall()))
|
309 |
url = link.css('::attr(href)').get()
|
310 |
|
311 |
if text and url:
|
312 |
vm_links.append({
|
313 |
+
'title': text,
|
314 |
'link': response.urljoin(url)
|
315 |
})
|
316 |
+
|
317 |
+
# Links with term in URL
|
318 |
+
for link in response.css(f'a[href*="{term}"]'):
|
319 |
+
if link not in vm_links:
|
320 |
+
text = self.clean_text(' '.join(link.css('::text').getall()))
|
321 |
+
url = link.css('::attr(href)').get()
|
322 |
+
|
323 |
+
if text and url:
|
324 |
+
vm_links.append({
|
325 |
+
'title': text,
|
326 |
+
'link': response.urljoin(url)
|
327 |
+
})
|
328 |
|
329 |
return vm_links
|
330 |
|
331 |
def parse_content_page(self, response):
|
332 |
+
"""Enhanced content page parsing"""
|
333 |
+
self.session_stats['total_pages'] += 1
|
334 |
meta = response.meta
|
335 |
self.visited_urls.add(response.url)
|
336 |
|
337 |
+
# Extract content with improved methods
|
338 |
+
content = self.extract_structured_content(response)
|
339 |
|
340 |
if content:
|
341 |
self.save_page_content(
|
|
|
347 |
content,
|
348 |
meta.get('menu_path', '')
|
349 |
)
|
350 |
+
self.session_stats['successful_pages'] += 1
|
351 |
+
|
352 |
+
# Extract and follow internal links with better filtering
|
353 |
+
if response.meta.get('depth', 0) < 3: # Limit depth
|
354 |
+
internal_links = self.extract_internal_links(response, meta['domain'])
|
355 |
+
for link in internal_links:
|
356 |
+
if link['url'] not in self.visited_urls:
|
357 |
+
yield scrapy.Request(
|
358 |
+
url=link['url'],
|
359 |
+
callback=self.parse_content_page,
|
360 |
+
meta={
|
361 |
+
'page_title': link['text'],
|
362 |
+
'category': meta['category'], # Inherit parent category
|
363 |
+
'department': meta['department'],
|
364 |
+
'domain': meta['domain'],
|
365 |
+
'menu_path': f"{meta.get('menu_path', '')} > {link['text']}",
|
366 |
+
'retry_count': 0,
|
367 |
+
'depth': response.meta.get('depth', 0) + 1
|
368 |
+
},
|
369 |
+
errback=self.handle_error,
|
370 |
+
priority=1
|
371 |
+
)
|
372 |
+
|
373 |
+
def extract_structured_content(self, response):
|
374 |
+
"""Extract content in a more structured way using both CSS and XPath"""
|
375 |
+
content = {
|
376 |
+
"metadata": {
|
377 |
+
"title": self.extract_page_title(response),
|
378 |
+
"url": response.url,
|
379 |
+
"timestamp": datetime.now().isoformat(),
|
380 |
+
"department": response.meta.get('department', ''),
|
381 |
+
"domain": response.meta.get('domain', '')
|
382 |
+
},
|
383 |
+
"sections": [],
|
384 |
+
"files": [],
|
385 |
+
"tables": []
|
386 |
+
}
|
387 |
+
|
388 |
+
# Use BeautifulSoup for better HTML parsing
|
389 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
390 |
+
|
391 |
+
# Remove unwanted elements
|
392 |
+
for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'form']):
|
393 |
+
element.decompose()
|
394 |
+
|
395 |
+
# Extract main content areas
|
396 |
+
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile('content|main')) or soup
|
397 |
+
|
398 |
+
# Process headings and content hierarchy
|
399 |
+
current_section = {}
|
400 |
+
for element in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table']):
|
401 |
+
if element.name.startswith('h'):
|
402 |
+
# If we have a current section with content, add it first
|
403 |
+
if current_section and current_section.get('content'):
|
404 |
+
content['sections'].append(current_section)
|
405 |
+
|
406 |
+
# Start new section
|
407 |
+
current_section = {
|
408 |
+
"heading": self.clean_text(element.get_text()),
|
409 |
+
"level": int(element.name[1]),
|
410 |
+
"content": []
|
411 |
+
}
|
412 |
+
else:
|
413 |
+
if not current_section:
|
414 |
+
current_section = {
|
415 |
+
"heading": "Content",
|
416 |
+
"level": 2,
|
417 |
+
"content": []
|
418 |
+
}
|
419 |
+
|
420 |
+
if element.name == 'p':
|
421 |
+
text = self.clean_text(element.get_text())
|
422 |
+
if text and len(text) > 20:
|
423 |
+
current_section['content'].append({
|
424 |
+
"type": "paragraph",
|
425 |
+
"text": text
|
426 |
+
})
|
427 |
+
elif element.name in ['ul', 'ol']:
|
428 |
+
items = [self.clean_text(li.get_text()) for li in element.find_all('li')]
|
429 |
+
if items:
|
430 |
+
current_section['content'].append({
|
431 |
+
"type": "list",
|
432 |
+
"style": "ordered" if element.name == 'ol' else "unordered",
|
433 |
+
"items": items
|
434 |
+
})
|
435 |
+
elif element.name == 'table':
|
436 |
+
table_data = self.extract_table_data(element)
|
437 |
+
if table_data:
|
438 |
+
content['tables'].append(table_data)
|
439 |
+
|
440 |
+
# Add the last section if it exists
|
441 |
+
if current_section and current_section.get('content'):
|
442 |
+
content['sections'].append(current_section)
|
443 |
+
|
444 |
+
# Extract files and downloads
|
445 |
+
for link in main_content.find_all('a', href=True):
|
446 |
+
href = link['href']
|
447 |
+
if re.search(r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)$', href.lower()):
|
448 |
+
content['files'].append({
|
449 |
+
"title": self.clean_text(link.get_text()) or "Unduhan",
|
450 |
+
"url": response.urljoin(href),
|
451 |
+
"type": href.split('.')[-1].lower()
|
452 |
+
})
|
453 |
+
|
454 |
+
return content if (content['sections'] or content['tables']) else None
|
455 |
+
|
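Rough shape of the document extract_structured_content builds and save_page_content later uploads as JSON; the field names come from the code above, the values here are illustrative only:

    example = {
        "metadata": {
            "title": "Program Studi",
            "url": "https://akt.pnp.ac.id/program-studi",
            "timestamp": "2025-01-01T00:00:00",
            "department": "Akuntansi",
            "domain": "akt.pnp.ac.id",
        },
        "sections": [
            {
                "heading": "Program Studi",
                "level": 2,
                "content": [
                    {"type": "paragraph", "text": "..."},
                    {"type": "list", "style": "unordered", "items": ["D3 Akuntansi"]},
                ],
            },
        ],
        "files": [{"title": "Brosur", "url": "https://akt.pnp.ac.id/brosur.pdf", "type": "pdf"}],
        "tables": [],
    }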
456 |
+
def extract_table_data(self, table_element):
|
457 |
+
"""Extract structured table data"""
|
458 |
+
table_data = {
|
459 |
+
"headers": [],
|
460 |
+
"rows": []
|
461 |
+
}
|
462 |
+
|
463 |
+
# Extract headers from thead if exists
|
464 |
+
thead = table_element.find('thead')
|
465 |
+
if thead:
|
466 |
+
for th in thead.find_all(['th', 'td']):
|
467 |
+
table_data['headers'].append(self.clean_text(th.get_text()))
|
468 |
+
|
469 |
+
# Extract rows from tbody or directly from table
|
470 |
+
tbody = table_element.find('tbody') or table_element
|
471 |
+
for tr in tbody.find_all('tr'):
|
472 |
+
row = []
|
473 |
+
for cell in tr.find_all(['td', 'th']):
|
474 |
+
# Handle cell content with possible links
|
475 |
+
cell_text = self.clean_text(cell.get_text())
|
476 |
+
links = [{'text': self.clean_text(a.get_text()), 'url': a['href']}
|
477 |
+
for a in cell.find_all('a', href=True)]
|
478 |
+
|
479 |
+
row.append({
|
480 |
+
"text": cell_text,
|
481 |
+
"links": links
|
482 |
+
})
|
483 |
+
|
484 |
+
if row:
|
485 |
+
table_data['rows'].append(row)
|
486 |
+
|
487 |
+
return table_data if table_data['rows'] else None
|
488 |
+
|
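A quick check of extract_table_data on a small table, assuming `spider` is a constructed PNPDepartmentSpider instance (a sketch):

    from bs4 import BeautifulSoup

    html = """<table><thead><tr><th>Prodi</th><th>Jenjang</th></tr></thead>
    <tbody><tr><td>Akuntansi</td><td>D3</td></tr></tbody></table>"""
    table = BeautifulSoup(html, "html.parser").find("table")
    print(spider.extract_table_data(table))
    # -> {'headers': ['Prodi', 'Jenjang'],
    #     'rows': [[{'text': 'Akuntansi', 'links': []}, {'text': 'D3', 'links': []}]]}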
489 |
+
def extract_internal_links(self, response, domain):
|
490 |
+
"""Extract internal links with better filtering"""
|
491 |
+
internal_links = []
|
492 |
|
493 |
+
for link in response.css('a[href]'):
|
494 |
+
text = self.clean_text(' '.join(link.css('::text').getall()))
|
495 |
+
href = link.css('::attr(href)').get()
|
496 |
+
|
497 |
+
if not text or not href:
|
498 |
+
continue
|
499 |
+
|
500 |
+
full_url = response.urljoin(href)
|
501 |
+
parsed_url = urlparse(full_url)
|
502 |
|
503 |
+
# Only follow links from the same domain
|
504 |
+
if parsed_url.netloc == domain:
|
505 |
+
if not self.is_unwanted_url(full_url) and full_url not in self.visited_urls:
|
506 |
+
internal_links.append({
|
507 |
+
'text': text,
|
508 |
+
'url': full_url
|
509 |
+
})
|
510 |
+
|
511 |
+
return internal_links
|
512 |
|
513 |
def parse_study_program(self, response):
|
514 |
+
"""Enhanced study program parsing"""
|
515 |
+
self.session_stats['total_pages'] += 1
|
516 |
meta = response.meta
|
517 |
self.visited_urls.add(response.url)
|
518 |
|
519 |
department = meta['department']
|
520 |
+
program_title = self.extract_page_title(response)
|
521 |
|
522 |
+
# Extract program details with improved methods
|
523 |
program_details = self.extract_program_details(response)
|
|
|
524 |
|
525 |
# Add to the study programs collection
|
526 |
self.study_programs[department].append({
|
|
|
530 |
})
|
531 |
|
532 |
# Also save as a regular page
|
533 |
+
content = self.extract_structured_content(response)
|
534 |
if content:
|
535 |
self.save_page_content(
|
536 |
response.url,
|
|
|
540 |
'Program_Studi',
|
541 |
content
|
542 |
)
|
543 |
+
self.session_stats['successful_pages'] += 1
|
544 |
|
545 |
def extract_program_details(self, response):
|
546 |
+
"""Enhanced program details extraction"""
|
547 |
+
details = {}
|
548 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
549 |
|
550 |
+
# Degree detection from multiple sources
|
551 |
degree_sources = [
|
552 |
+
soup.title.string if soup.title else None,
|
553 |
+
soup.h1.get_text() if soup.h1 else None,
|
554 |
+
' '.join(soup.find(class_=re.compile('breadcrumb')).stripped_strings) if soup.find(class_=re.compile('breadcrumb')) else None
|
555 |
]
|
556 |
|
557 |
degree_pattern = re.compile(
|
|
|
564 |
details['degree'] = match.group(1).upper()
|
565 |
break
|
566 |
|
567 |
+
# Extract accreditation status with better pattern matching
|
568 |
+
accreditation_texts = [
|
569 |
+
'akreditasi',
|
570 |
+
'peringkat',
|
571 |
+
'status akreditasi',
|
572 |
+
'sertifikasi'
|
573 |
+
]
|
574 |
+
|
575 |
+
for text in accreditation_texts:
|
576 |
+
element = soup.find(string=re.compile(text, re.IGNORECASE))
|
577 |
+
if element:
|
578 |
+
# Look for the accreditation value in nearby elements
|
579 |
+
parent = element.find_parent()
|
580 |
+
siblings = [sib for sib in parent.next_siblings if isinstance(sib, str) or sib.name]
|
581 |
+
|
582 |
+
for sib in siblings:
|
583 |
+
if isinstance(sib, str):
|
584 |
+
if match := re.search(r'[A-Z]', sib):
|
585 |
+
details['accreditation'] = match.group()
|
586 |
+
break
|
587 |
+
elif sib.name:
|
588 |
+
if match := re.search(r'[A-Z]', sib.get_text()):
|
589 |
+
details['accreditation'] = match.group()
|
590 |
+
break
|
591 |
+
|
592 |
+
if 'accreditation' in details:
|
593 |
+
break
|
594 |
+
|
595 |
+
# Extract description from the first meaningful paragraph
|
596 |
+
for p in soup.find_all('p'):
|
597 |
+
text = self.clean_text(p.get_text())
|
598 |
+
if text and len(text) > 50 and not any(x in text.lower() for x in ['copyright', 'hak cipta']):
|
599 |
+
details['description'] = text
|
600 |
+
break
|
601 |
+
|
602 |
return details
|
603 |
|
604 |
def parse_vision_mission(self, response):
|
605 |
+
"""Enhanced vision & mission parsing"""
|
606 |
+
self.session_stats['total_pages'] += 1
|
607 |
meta = response.meta
|
608 |
self.visited_urls.add(response.url)
|
609 |
department = meta['department']
|
610 |
|
611 |
+
# Use BeautifulSoup for better content extraction
|
612 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
613 |
|
614 |
+
# Find vision and mission sections
|
615 |
+
vision_text = self.find_section_text(soup, ['visi', 'vision'])
|
616 |
+
mission_text = self.find_section_text(soup, ['misi', 'mission'])
|
|
617 |
|
618 |
+
# Find mission items if presented as list
|
619 |
+
mission_items = []
|
620 |
+
mission_list = self.find_mission_list(soup)
|
621 |
+
if mission_list:
|
622 |
+
mission_items = [self.clean_text(li.get_text()) for li in mission_list.find_all('li')]
|
|
623 |
|
624 |
+
# Store in department info
|
625 |
+
if vision_text or mission_text or mission_items:
|
626 |
if vision_text:
|
627 |
self.department_info[department]['vision'] = vision_text
|
628 |
if mission_text:
|
629 |
self.department_info[department]['mission'] = mission_text
|
630 |
+
if mission_items:
|
631 |
+
self.department_info[department]['mission_items'] = mission_items
|
632 |
|
633 |
+
# Save as separate file
|
634 |
self.save_vision_mission(
|
635 |
department,
|
636 |
meta['domain'],
|
637 |
vision_text,
|
638 |
mission_text,
|
639 |
+
mission_items,
|
640 |
response.url
|
641 |
)
|
642 |
|
643 |
# Also save as a regular page
|
644 |
+
content = self.extract_structured_content(response)
|
645 |
if content:
|
646 |
self.save_page_content(
|
647 |
response.url,
|
|
|
651 |
'Profil',
|
652 |
content
|
653 |
)
|
654 |
+
self.session_stats['successful_pages'] += 1
|
655 |
|
656 |
+
def find_section_text(self, soup, keywords):
|
657 |
+
"""Find section text based on keywords"""
|
658 |
+
for keyword in keywords:
|
659 |
+
# Look for headings containing the keyword
|
660 |
+
for heading in soup.find_all(['h1', 'h2', 'h3', 'h4']):
|
661 |
+
if keyword.lower() in heading.get_text().lower():
|
662 |
+
# Get the next paragraph or div
|
663 |
+
next_node = heading.next_sibling
|
664 |
+
while next_node:
|
665 |
+
if next_node.name in ['p', 'div']:
|
666 |
+
text = self.clean_text(next_node.get_text())
|
667 |
+
if text:
|
668 |
+
return text
|
669 |
+
next_node = next_node.next_sibling
|
670 |
+
|
671 |
+
return None
|
672 |
+
|
673 |
+
def find_mission_list(self, soup):
|
674 |
+
"""Find mission items presented as list"""
|
675 |
+
for keyword in ['misi', 'mission']:
|
676 |
+
# Look for headings containing the keyword
|
677 |
+
for heading in soup.find_all(['h1', 'h2', 'h3', 'h4']):
|
678 |
+
if keyword.lower() in heading.get_text().lower():
|
679 |
+
# Find the next ul or ol element
|
680 |
+
next_node = heading.next_sibling
|
681 |
+
while next_node:
|
682 |
+
if next_node.name in ['ul', 'ol']:
|
683 |
+
return next_node
|
684 |
+
next_node = next_node.next_sibling
|
685 |
+
|
686 |
+
return None
|
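How the two helpers above behave on a typical profile-page fragment, assuming `spider` is a constructed PNPDepartmentSpider instance (a sketch):

    from bs4 import BeautifulSoup

    html = "<h2>Visi</h2><p>Menjadi jurusan unggul.</p><h2>Misi</h2><ul><li>Mendidik</li><li>Meneliti</li></ul>"
    soup = BeautifulSoup(html, "html.parser")

    print(spider.find_section_text(soup, ["visi", "vision"]))   # -> "Menjadi jurusan unggul."
    items = spider.find_mission_list(soup)
    print([li.get_text() for li in items.find_all("li")])       # -> ['Mendidik', 'Meneliti']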
687 |
|
688 |
def save_page_content(self, url, title, department, domain, category, content, menu_path=''):
|
689 |
+
"""Save page content with improved formatting"""
|
690 |
if not content or not title:
|
691 |
return
|
692 |
|
693 |
+
# Generate filename with department prefix
|
694 |
safe_title = re.sub(r'[^\w\s-]', '', title).strip().lower()
|
695 |
+
safe_title = re.sub(r'[-\s]+', '-', safe_title)[:100] # Limit length
|
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = f"{department}_{safe_title}_{timestamp}.json"
 
         try:
+            # Convert content to JSON string
+            import json
+            content_str = json.dumps(content, ensure_ascii=False, indent=2)
+
+            # Upload to Supabase
             upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
                 path=filename,
+                file=content_str.encode('utf-8'),
+                file_options={"content-type": "application/json", "x-upsert": "true"}
             )
 
             self.logger.info(f"Successfully uploaded {filename}")
 
+            # Store in our collection
             self.department_data[department][category].append({
                 'title': title,
                 'url': url,
+                'filename': filename,
+                'timestamp': timestamp
             })
 
         except Exception as e:
             self.logger.error(f"Upload failed for {filename}: {str(e)}")
+            self.failed_urls.add(url)
 
     def save_vision_mission(self, department, domain, vision, mission, mission_items, url):
+        """Save vision & mission with improved formatting"""
+        filename = f"{department}_Visi_Misi.json"
+
+        content = {
+            "department": department,
+            "domain": domain,
+            "url": url,
+            "timestamp": datetime.now().isoformat(),
+            "vision": vision,
+            "mission": mission,
+            "mission_items": mission_items
+        }
 
         try:
+            import json
+            content_str = json.dumps(content, ensure_ascii=False, indent=2)
+
             upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
                 path=filename,
+                file=content_str.encode('utf-8'),
+                file_options={"content-type": "application/json", "x-upsert": "true"}
             )
 
+            self.logger.info(f"Successfully uploaded vision & mission for {department}")
         except Exception as e:
+            self.logger.error(f"Failed to upload vision & mission for {department}: {str(e)}")
+            self.failed_urls.add(url)
+
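For context, a minimal sketch (not part of this commit) of the Supabase Storage upsert pattern that both save_page_content() and save_vision_mission() rely on. The environment variable names, bucket name, and payload below are placeholders; the spider reads its own configuration elsewhere in the file.

import json
import os
from supabase import create_client

# Placeholders: real values come from the Space's environment/secrets.
supabase = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])

payload = {"department": "Contoh_Jurusan", "vision": "...", "mission_items": []}
supabase.storage.from_("documents").upload(
    path="Contoh_Jurusan_Visi_Misi.json",
    file=json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8"),
    # "x-upsert": "true" lets a re-crawl overwrite the previously stored object
    file_options={"content-type": "application/json", "x-upsert": "true"},
)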
     def clean_text(self, text):
+        """Improved text cleaning with normalization"""
         if not text:
             return ""
 
         # Normalize unicode characters
         text = unicodedata.normalize('NFKC', text)
 
+        # Remove control characters
+        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
 
+        # Replace multiple spaces/newlines with single space
+        text = re.sub(r'\s+', ' ', text)
 
+        # Remove leading/trailing whitespace
+        text = text.strip()
 
+        return text
 
     def determine_category(self, menu_text):
+        """Enhanced category determination"""
         menu_lower = menu_text.lower()
 
+        category_mapping = [
+            (['beranda', 'home', 'utama'], 'Beranda'),
+            (['profil', 'profile', 'tentang', 'about', 'sejarah', 'history'], 'Profil'),
+            (['program', 'studi', 'prodi', 'jurusan', 'kurikulum'], 'Program_Studi'),
+            (['dosen', 'staff', 'pengajar', 'lecturer'], 'Dosen'),
+            (['penelitian', 'research', 'publikasi', 'jurnal'], 'Penelitian'),
+            (['mahasiswa', 'student', 'alumni'], 'Mahasiswa'),
+            (['fasilitas', 'lab', 'laboratorium'], 'Fasilitas'),
+            (['pengumuman', 'berita', 'news', 'agenda'], 'Informasi'),
+            (['kerjasama', 'partnership', 'mitra'], 'Kerjasama'),
+            (['dokumen', 'download', 'unduhan'], 'Dokumen')
+        ]
 
+        for keywords, category in category_mapping:
+            if any(keyword in menu_lower for keyword in keywords):
                 return category
 
         return 'Lainnya'
 
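A small sketch (not part of this commit) illustrating the first-match semantics of the mapping above: ordering matters, so a menu label like "Profil Jurusan" resolves to 'Profil' even though "jurusan" also appears under 'Program_Studi'. The mapping here is deliberately trimmed and the helper name is hypothetical.

category_mapping = [
    (['profil', 'tentang', 'sejarah'], 'Profil'),
    (['program', 'studi', 'prodi', 'jurusan'], 'Program_Studi'),
]

def categorize(menu_text: str) -> str:
    # First keyword group that matches wins; fall back to 'Lainnya'.
    menu_lower = menu_text.lower()
    for keywords, category in category_mapping:
        if any(keyword in menu_lower for keyword in keywords):
            return category
    return 'Lainnya'

print(categorize("Profil Jurusan"))     # Profil
print(categorize("Program Studi D3"))   # Program_Studi
print(categorize("Kalender Akademik"))  # Lainnya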
     def is_social_media_link(self, url):
+        """Check if URL is social media with better pattern matching"""
+        social_domains = [
+            'facebook.com', 'twitter.com', 'instagram.com',
+            'youtube.com', 'linkedin.com', 'tiktok.com',
+            'whatsapp.com', 'wa.me', 'telegram.me'
         ]
+
+        if not url:
+            return False
+
+        parsed = urlparse(url.lower())
+        if not parsed.netloc:
+            return False
+
+        return any(domain in parsed.netloc for domain in social_domains)
 
     def is_unwanted_url(self, url):
+        """Improved unwanted URL detection"""
+        if not url:
             return True
 
         unwanted_patterns = [
+            r'\.(jpg|jpeg|png|gif|svg|ico|css|js|pdf|docx?|xlsx?|pptx?|zip|rar)$',
+            r'(login|logout|signin|signup|register|admin|wp-|/wp/|wordpress|comment|feed|rss|atom)',
+            r'(javascript:|mailto:|tel:|#)',
+            r'(page/\d+|tag/|author/|archive/|category/|search|kalender|ajax|api)'
         ]
 
+        url_lower = url.lower()
+        return any(re.search(pattern, url_lower) for pattern in unwanted_patterns)
 
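A standalone sketch (not part of this commit) exercising the same unwanted-URL patterns on two illustrative URLs; both URLs are placeholders, not real department pages.

import re

unwanted_patterns = [
    r'\.(jpg|jpeg|png|gif|svg|ico|css|js|pdf|docx?|xlsx?|pptx?|zip|rar)$',
    r'(login|logout|signin|signup|register|admin|wp-|/wp/|wordpress|comment|feed|rss|atom)',
    r'(javascript:|mailto:|tel:|#)',
    r'(page/\d+|tag/|author/|archive/|category/|search|kalender|ajax|api)',
]

def is_unwanted(url: str) -> bool:
    # Same check the spider performs: any pattern hit marks the URL as unwanted.
    url_lower = url.lower()
    return any(re.search(p, url_lower) for p in unwanted_patterns)

print(is_unwanted("https://example.ac.id/wp-content/brosur.pdf"))  # True (file extension and wp- both match)
print(is_unwanted("https://example.ac.id/profil-jurusan"))         # False (no pattern matches)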
     def handle_error(self, failure):
+        """Enhanced error handling with retry logic"""
         url = failure.request.url
+        meta = failure.request.meta
+        retry_count = meta.get('retry_count', 0)
+
+        self.logger.error(f"Request failed ({retry_count}): {url} - {str(failure.value)}")
+        self.session_stats['failed_pages'] += 1
+        self.failed_urls.add(url)
+
+        # Retry logic
+        if retry_count < self.custom_settings.get('RETRY_TIMES', 2):
+            self.logger.info(f"Retrying {url} (attempt {retry_count + 1})")
+            new_request = failure.request.copy()
+            new_request.meta['retry_count'] = retry_count + 1
+            new_request.dont_filter = True
+            return new_request
 
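A sketch (not part of this commit) of how an errback like handle_error() is typically attached to requests in Scrapy; returning a copied Request from the errback schedules it again, which is the retry approach used above. The spider name, URL, and retry cap below are illustrative only.

import scrapy

class RetryDemoSpider(scrapy.Spider):
    name = "retry_demo"

    def start_requests(self):
        # Placeholder URL; the real spider iterates its department domains.
        yield scrapy.Request(
            "https://example.org/",
            callback=self.parse,
            errback=self.handle_error,
        )

    def parse(self, response):
        self.logger.info("fetched %s", response.url)

    def handle_error(self, failure):
        # Returning a copied Request from the errback re-queues it.
        request = failure.request.copy()
        request.meta["retry_count"] = request.meta.get("retry_count", 0) + 1
        request.dont_filter = True
        if request.meta["retry_count"] <= 2:
            return request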
     def closed(self, reason):
+        """Enhanced closing method with comprehensive reporting"""
+        self.logger.info("Spider closed. Generating final reports...")
+
+        # Generate summary statistics
+        summary = {
+            "total_pages": self.session_stats['total_pages'],
+            "successful_pages": self.session_stats['successful_pages'],
+            "failed_pages": self.session_stats['failed_pages'],
+            "success_rate": (self.session_stats['successful_pages'] / self.session_stats['total_pages']) * 100 if self.session_stats['total_pages'] > 0 else 0,
+            "departments_crawled": len(self.department_data),
+            "programs_found": sum(len(progs) for progs in self.study_programs.values()),
+            "failed_urls": list(self.failed_urls)
+        }
+
+        # Upload summary report
+        self.upload_summary_report(summary)
 
+        # Generate department summary
+        self.generate_department_summary()
 
+        self.logger.info(f"Crawling completed. Success rate: {summary['success_rate']:.2f}%")
+
+    def upload_summary_report(self, summary):
+        """Upload comprehensive summary report"""
+        filename = "crawling_summary_report.json"
 
+        try:
+            import json
+            content = json.dumps(summary, indent=2)
 
+            self.supabase.storage.from_(self.storage_bucket).upload(
+                path=filename,
+                file=content.encode('utf-8'),
+                file_options={"content-type": "application/json", "x-upsert": "true"}
+            )
+            self.logger.info("Successfully uploaded summary report")
+        except Exception as e:
+            self.logger.error(f"Failed to upload summary report: {str(e)}")
 
+    def generate_department_summary(self):
+        """Generate detailed department summary"""
+        content = "# Laporan Lengkap Jurusan Politeknik Negeri Padang\n\n"
+        content += f"**Tanggal**: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n"
 
         # Create reverse mapping from department name to domain
         reverse_departments = {v: k for k, v in self.DEPARTMENTS.items()}
 
         for department, programs in self.study_programs.items():
             domain = reverse_departments.get(department, '')
+            website_url = f'https://{domain}' if domain else ''
 
             content += f"## {department.replace('_', ' ')}\n"
+            content += f"**Website**: {website_url}\n"
+
+            # Add vision and mission if available
+            if department in self.department_info:
+                if 'vision' in self.department_info[department]:
+                    content += f"\n### Visi\n{self.department_info[department]['vision']}\n"
+
+                if 'mission' in self.department_info[department]:
+                    content += f"\n### Misi\n{self.department_info[department]['mission']}\n"
+                elif 'mission_items' in self.department_info[department]:
+                    content += "\n### Misi\n"
+                    for i, item in enumerate(self.department_info[department]['mission_items'], 1):
+                        content += f"{i}. {item}\n"
 
+            # Add study programs
             if programs:
+                content += "\n### Program Studi\n"
                 for prog in programs:
+                    content += f"- **{prog['title']}**\n"
+                    content += f" - Jenjang: {prog['details'].get('degree', 'N/A')}\n"
+                    content += f" - Akreditasi: {prog['details'].get('accreditation', 'N/A')}\n"
+                    content += f" - URL: {prog['url']}\n"
 
                     if 'description' in prog['details']:
+                        content += f" - Deskripsi: {prog['details']['description']}\n"
             else:
+                content += "\n### Belum ada informasi program studi\n"
 
             content += "\n---\n\n"
 
+        # Upload department summary
+        filename = "department_summary_report.md"
         try:
             self.supabase.storage.from_(self.storage_bucket).upload(
                 path=filename,
                 file=content.encode('utf-8'),
+                file_options={"content-type": "text/markdown", "x-upsert": "true"}
             )
+            self.logger.info("Successfully uploaded department summary report")
         except Exception as e:
+            self.logger.error(f"Failed to upload department summary: {str(e)}")
 
 
 if __name__ == "__main__":
     process = CrawlerProcess()
     process.crawl(PNPDepartmentSpider)
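The visible hunk ends at process.crawl(...). For reference, a standalone run of this spider would normally also call process.start(), which blocks until the crawl finishes; whether the actual file does this outside the shown diff is not visible here, so treat the block below as a sketch.

from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    # PNPDepartmentSpider is the spider class defined in jurusan_scrap.py.
    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(PNPDepartmentSpider)
    process.start()  # block until crawling finishes and closed() has run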