FauziIsyrinApridal commited on
Commit
050f867
·
1 Parent(s): 61960d8

perbaikan semester filter dan jurusan scrap tambahkan beatifulsoup

Browse files
app/(main)/components/RagDashboard.tsx CHANGED
@@ -189,7 +189,10 @@ export default function RagDashboard() {
189
  {isRefreshing ? "Refreshing..." : "Refresh"}
190
  </Button>
191
 
192
- <SemesterFilter onFilterChange={handleSemesterFilterChange} />
 
 
 
193
 
194
  <DropdownMenu>
195
  <DropdownMenuTrigger asChild>
 
189
  {isRefreshing ? "Refreshing..." : "Refresh"}
190
  </Button>
191
 
192
+ <SemesterFilter
193
+ dates={ragData.map((file) => file.created_at)}
194
+ onFilterChange={handleSemesterFilterChange}
195
+ />
196
 
197
  <DropdownMenu>
198
  <DropdownMenuTrigger asChild>
components/SemesterFilter.tsx CHANGED
@@ -1,12 +1,11 @@
1
- import { useState, useEffect } from "react";
2
  import { ChevronDown, Filter, Calendar } from "lucide-react";
3
 
4
- // Define the props interface for the component
5
  interface SemesterFilterProps {
6
- onFilterChange?: (semesterId: string) => void; // Make it optional with ?
 
7
  }
8
 
9
- // Define the semester option type
10
  interface SemesterOption {
11
  id: string;
12
  label: string;
@@ -14,40 +13,57 @@ interface SemesterOption {
14
  }
15
 
16
  export default function SemesterFilter({
 
17
  onFilterChange,
18
  }: SemesterFilterProps) {
19
  const [semesterFilter, setSemesterFilter] = useState("all");
20
  const [isFilterOpen, setIsFilterOpen] = useState(false);
21
 
22
- // Get current year
23
- const currentYear = new Date().getFullYear();
24
-
25
- // Generate academic years (e.g., 2023/2024, 2024/2025)
26
- const generateAcademicYears = (
27
- startYear: number,
28
- endYear: number,
29
- ): string[] => {
30
- const years: string[] = [];
31
- for (let year = startYear; year <= endYear; year++) {
32
- years.push(`${year}/${year + 1}`);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  }
34
- return years.reverse(); // Most recent first okay
 
35
  };
36
 
37
- const academicYears = generateAcademicYears(currentYear - 3, currentYear);
38
 
39
  // Generate semester options
40
  const semesterOptions: SemesterOption[] = [];
41
  academicYears.forEach((academicYear) => {
42
  const [startYear, endYear] = academicYear.split("/");
43
- // Odd semester (September startYear - January endYear)
44
  semesterOptions.push({
45
  id: `odd-${academicYear}`,
46
  label: `Ganjil ${academicYear}`,
47
  description: `September ${startYear} - January ${endYear}`,
48
  });
49
 
50
- // Even semester (February - August of endYear)
51
  semesterOptions.push({
52
  id: `even-${academicYear}`,
53
  label: `Genap ${academicYear}`,
@@ -55,28 +71,6 @@ export default function SemesterFilter({
55
  });
56
  });
57
 
58
- // Check if a document falls within a specific semester
59
- const isInSemester = (date: string, semesterId: string) => {
60
- if (semesterId === "all") return true;
61
-
62
- const [type, academicYear] = semesterId.split("-");
63
- const [startYear, endYear] = academicYear.split("/");
64
- const docDate = new Date(date);
65
- const docMonth = docDate.getMonth() + 1; // 1-12
66
- const docYear = docDate.getFullYear();
67
-
68
- if (type === "odd") {
69
- // Odd semester: September (9) - January (1) of next year
70
- return (
71
- (docYear === parseInt(startYear) && docMonth >= 9 && docMonth <= 12) ||
72
- (docYear === parseInt(endYear) && docMonth === 1)
73
- );
74
- } else {
75
- // Even semester: February (2) - August (8)
76
- return docYear === parseInt(endYear) && docMonth >= 2 && docMonth <= 8;
77
- }
78
- };
79
-
80
  const handleFilterClick = () => {
81
  setIsFilterOpen(!isFilterOpen);
82
  };
@@ -84,14 +78,11 @@ export default function SemesterFilter({
84
  const handleSemesterSelect = (semesterId: string) => {
85
  setSemesterFilter(semesterId);
86
  setIsFilterOpen(false);
87
-
88
- // Call the onFilterChange prop if it exists
89
  if (onFilterChange) {
90
  onFilterChange(semesterId);
91
  }
92
  };
93
 
94
- // Get display text for current filter
95
  const getCurrentFilterText = () => {
96
  if (semesterFilter === "all") return "All Semesters";
97
 
@@ -118,7 +109,7 @@ export default function SemesterFilter({
118
  {isFilterOpen && (
119
  <div className="absolute z-50 mt-2 min-w-[240px] rounded-md border border-gray-200 bg-white shadow-lg">
120
  <div className="py-1">
121
- {/* All option */}
122
  <div
123
  onClick={() => handleSemesterSelect("all")}
124
  className={`cursor-pointer px-4 py-2 hover:bg-gray-100 ${
@@ -128,15 +119,15 @@ export default function SemesterFilter({
128
  All Semesters
129
  </div>
130
 
131
- {/* Divider */}
132
- <div className="my-1 border-t border-gray-200"></div>
133
 
134
- {/* Academic Years and Semesters */}
135
  {academicYears.map((year, yearIndex) => (
136
  <div key={year}>
137
  {yearIndex > 0 && (
138
- <div className="my-1 border-t border-gray-200"></div>
139
  )}
 
140
  <div className="px-4 py-2 text-xs font-semibold text-gray-500">
141
  Academic Year {year}
142
  </div>
 
1
+ import { useState } from "react";
2
  import { ChevronDown, Filter, Calendar } from "lucide-react";
3
 
 
4
  interface SemesterFilterProps {
5
+ dates: string[];
6
+ onFilterChange?: (semesterId: string) => void;
7
  }
8
 
 
9
  interface SemesterOption {
10
  id: string;
11
  label: string;
 
13
  }
14
 
15
  export default function SemesterFilter({
16
+ dates,
17
  onFilterChange,
18
  }: SemesterFilterProps) {
19
  const [semesterFilter, setSemesterFilter] = useState("all");
20
  const [isFilterOpen, setIsFilterOpen] = useState(false);
21
 
22
+ // Extract unique academic years from dates
23
+ const extractAcademicYears = (dates: string[]): string[] => {
24
+ const yearSet = new Set<string>();
25
+
26
+ dates.forEach((dateStr) => {
27
+ const date = new Date(dateStr);
28
+ const year = date.getFullYear();
29
+ const month = date.getMonth() + 1;
30
+
31
+ let startYear: number;
32
+ if (month >= 9) {
33
+ // Odd semester: starts in September
34
+ startYear = year;
35
+ } else {
36
+ // Even semester: January–August of next year
37
+ startYear = year - 1;
38
+ }
39
+
40
+ const academicYear = `${startYear}/${startYear + 1}`;
41
+ yearSet.add(academicYear);
42
+ });
43
+
44
+ if (yearSet.size === 0) {
45
+ const currentYear = new Date().getFullYear();
46
+ yearSet.add(`${currentYear - 2}/${currentYear - 1}`);
47
+ yearSet.add(`${currentYear - 1}/${currentYear}`);
48
+ yearSet.add(`${currentYear}/${currentYear + 1}`);
49
  }
50
+
51
+ return Array.from(yearSet).sort((a, b) => (a > b ? -1 : 1));
52
  };
53
 
54
+ const academicYears = extractAcademicYears(dates);
55
 
56
  // Generate semester options
57
  const semesterOptions: SemesterOption[] = [];
58
  academicYears.forEach((academicYear) => {
59
  const [startYear, endYear] = academicYear.split("/");
60
+
61
  semesterOptions.push({
62
  id: `odd-${academicYear}`,
63
  label: `Ganjil ${academicYear}`,
64
  description: `September ${startYear} - January ${endYear}`,
65
  });
66
 
 
67
  semesterOptions.push({
68
  id: `even-${academicYear}`,
69
  label: `Genap ${academicYear}`,
 
71
  });
72
  });
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  const handleFilterClick = () => {
75
  setIsFilterOpen(!isFilterOpen);
76
  };
 
78
  const handleSemesterSelect = (semesterId: string) => {
79
  setSemesterFilter(semesterId);
80
  setIsFilterOpen(false);
 
 
81
  if (onFilterChange) {
82
  onFilterChange(semesterId);
83
  }
84
  };
85
 
 
86
  const getCurrentFilterText = () => {
87
  if (semesterFilter === "all") return "All Semesters";
88
 
 
109
  {isFilterOpen && (
110
  <div className="absolute z-50 mt-2 min-w-[240px] rounded-md border border-gray-200 bg-white shadow-lg">
111
  <div className="py-1">
112
+ {/* All Semesters */}
113
  <div
114
  onClick={() => handleSemesterSelect("all")}
115
  className={`cursor-pointer px-4 py-2 hover:bg-gray-100 ${
 
119
  All Semesters
120
  </div>
121
 
122
+ <div className="my-1 border-t border-gray-200" />
 
123
 
124
+ {/* Semester Options Grouped by Academic Year */}
125
  {academicYears.map((year, yearIndex) => (
126
  <div key={year}>
127
  {yearIndex > 0 && (
128
+ <div className="my-1 border-t border-gray-200" />
129
  )}
130
+
131
  <div className="px-4 py-2 text-xs font-semibold text-gray-500">
132
  Academic Year {year}
133
  </div>
requirements.txt CHANGED
@@ -3,4 +3,4 @@ supabase
3
  python-dotenv
4
  requests
5
  instaloader
6
-
 
3
  python-dotenv
4
  requests
5
  instaloader
6
+ BeautifulSoup
scrapping/jurusan_scrap.py CHANGED
@@ -8,12 +8,14 @@ from datetime import datetime
8
  from collections import defaultdict
9
  from supabase import create_client
10
  from dotenv import load_dotenv
 
 
11
 
12
  # Load environment variables
13
  load_dotenv()
14
 
15
  class PNPDepartmentSpider(scrapy.Spider):
16
- name = 'improved_pnp_department_spider'
17
 
18
  DEPARTMENTS = {
19
  'akt.pnp.ac.id': 'Akuntansi',
@@ -27,50 +29,88 @@ class PNPDepartmentSpider(scrapy.Spider):
27
 
28
  start_urls = [f'https://{domain}' for domain in DEPARTMENTS.keys()]
29
  visited_urls = set()
 
30
 
31
  custom_settings = {
32
- 'DOWNLOAD_DELAY': 2.0,
33
  'ROBOTSTXT_OBEY': True,
34
  'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
35
  'LOG_LEVEL': 'INFO',
36
  'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
37
- "DOWNLOAD_TIMEOUT": 100,
38
- 'RETRY_TIMES': 3,
39
  'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
40
- 'HTTPCACHE_ENABLED': True
 
 
 
41
  }
42
 
43
  def __init__(self, *args, **kwargs):
44
  super().__init__(*args, **kwargs)
45
- self.supabase = create_client(
46
- os.getenv("NEXT_PUBLIC_SUPABASE_URL"),
47
- os.getenv("NEXT_PUBLIC_SUPABASE_SERVICE_KEY")
48
- )
49
- self.storage_bucket = os.getenv("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
50
  self.department_data = defaultdict(lambda: defaultdict(list))
51
  self.study_programs = defaultdict(list)
52
  self.department_info = defaultdict(dict)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  def start_requests(self):
 
55
  for url in self.start_urls:
56
  yield scrapy.Request(
57
  url=url,
58
  callback=self.parse_department_homepage,
59
  errback=self.handle_error,
60
- headers={'Accept': 'text/html,application/xhtml+xml'}
 
61
  )
62
 
63
  def parse_department_homepage(self, response):
 
 
 
64
  domain = urlparse(response.url).netloc
65
  department = self.DEPARTMENTS.get(domain, domain)
66
  self.visited_urls.add(response.url)
67
 
68
- self.logger.info(f"Processing department homepage: {domain} - {department}")
69
 
70
- # Extract homepage content first
71
  homepage_content = self.extract_content(response)
72
  if homepage_content:
73
- page_title = response.css('h1::text, .site-title::text, title::text').get('Homepage').strip()
74
  self.save_page_content(
75
  response.url,
76
  page_title,
@@ -79,11 +119,12 @@ class PNPDepartmentSpider(scrapy.Spider):
79
  'Beranda',
80
  homepage_content
81
  )
 
82
 
83
- # Process navigation menu
84
  nav_elements = self.extract_navigation(response)
85
  for nav_item in nav_elements:
86
- if not nav_item['link'] or nav_item['link'].startswith('#'):
87
  continue
88
 
89
  full_url = response.urljoin(nav_item['link'])
@@ -98,12 +139,15 @@ class PNPDepartmentSpider(scrapy.Spider):
98
  'category': category,
99
  'department': department,
100
  'domain': domain,
101
- 'menu_path': nav_item['text']
 
 
102
  },
103
- errback=self.handle_error
 
104
  )
105
 
106
- # Find and process all study program links
107
  study_program_links = self.extract_study_program_links(response)
108
  for prog in study_program_links:
109
  if prog['link'] not in self.visited_urls:
@@ -113,12 +157,14 @@ class PNPDepartmentSpider(scrapy.Spider):
113
  meta={
114
  'page_title': prog['title'],
115
  'department': department,
116
- 'domain': domain
 
 
117
  },
118
  errback=self.handle_error
119
  )
120
 
121
- # Find and process vision & mission specifically
122
  vision_mission_links = self.extract_vision_mission_links(response)
123
  for vm_link in vision_mission_links:
124
  if vm_link['link'] not in self.visited_urls:
@@ -128,96 +174,168 @@ class PNPDepartmentSpider(scrapy.Spider):
128
  meta={
129
  'page_title': vm_link['title'],
130
  'department': department,
131
- 'domain': domain
 
 
132
  },
133
  errback=self.handle_error
134
  )
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  def extract_navigation(self, response):
137
- """Extract navigation elements from page"""
 
138
  nav_items = []
139
 
140
- # Try multiple selectors that commonly contain navigation
141
- nav_selectors = [
142
- 'nav a', '.navbar a', '.navigation a', '.main-menu a', '.nav a',
143
- '#menu a', '.menu a', 'header a', '.navbar-collapse a',
144
- 'ul.nav a', '.dropdown-menu a', '.megamenu a',
145
- '#main-menu a', '.main-navigation a', '#primary-menu a',
146
- '.top-menu a', '.primary-menu a', '#nav a'
 
 
147
  ]
148
 
149
- for selector in nav_selectors:
150
- for item in response.css(selector):
151
- text = self.clean_text(' '.join(item.css('::text').getall()))
152
- link = item.css('::attr(href)').get()
153
-
154
- if text and link and len(text.strip()) > 1:
155
- if not self.is_social_media_link(link) and not self.is_unwanted_url(link):
 
156
  nav_items.append({
157
- 'text': text.strip(),
158
- 'link': response.urljoin(link)
159
  })
160
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  return nav_items
162
 
163
  def extract_study_program_links(self, response):
164
- """Extract study program links from complex WordPress navigation menus"""
165
  program_links = []
166
 
167
- # XPath untuk menemukan menu 'Program Studi' dan semua submenu-nya
168
- base_xpath = """
169
- //li[contains(@class, 'wp-block-navigation-submenu')]
170
- [.//span[contains(translate(., 'PROGRAMSTUDI', 'programstudi'), 'program studi')]]
171
- //ul[@class='wp-block-navigation__submenu-container']
172
- /li[contains(@class, 'wp-block-navigation-submenu')]
173
- /a[contains(@class, 'wp-block-navigation-item__content')]
174
- """
175
-
176
- # Ambil semua link program studi utama
177
- for link in response.xpath(base_xpath):
178
- text = self.clean_text(''.join(link.xpath('.//span[@class="wp-block-navigation-item__label"]//text()').getall()))
179
- url = link.xpath('@href').get()
180
-
181
- if text and url:
 
 
 
 
 
182
  program_links.append({
183
- 'title': text.strip(),
184
- 'link': response.urljoin(url)
185
  })
186
-
187
- # Logika fallback untuk website yang menggunakan struktur berbeda
188
- if not program_links:
189
- program_links = super().extract_study_program_links(response)
190
 
191
- return program_links
 
 
 
 
 
 
 
 
192
 
193
  def extract_vision_mission_links(self, response):
194
- """Extract links specifically for vision & mission"""
195
  vm_links = []
196
 
197
- # Terms related to vision & mission
198
- vm_terms = ['visi', 'misi', 'vision', 'mission', 'visi-misi', 'visi & misi', 'visi dan misi']
199
 
200
- # Look for links containing these terms
201
- for term in vm_terms:
 
202
  for link in response.css(f'a:contains("{term}")'):
203
  text = self.clean_text(' '.join(link.css('::text').getall()))
204
  url = link.css('::attr(href)').get()
205
 
206
  if text and url:
207
  vm_links.append({
208
- 'title': text.strip(),
209
  'link': response.urljoin(url)
210
  })
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
  return vm_links
213
 
214
  def parse_content_page(self, response):
215
- """Process regular content pages"""
 
216
  meta = response.meta
217
  self.visited_urls.add(response.url)
218
 
219
- # Extract content from this page
220
- content = self.extract_content(response)
221
 
222
  if content:
223
  self.save_page_content(
@@ -229,43 +347,180 @@ class PNPDepartmentSpider(scrapy.Spider):
229
  content,
230
  meta.get('menu_path', '')
231
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
- # Look for additional sub-links within this page
234
- content_links = response.css('article a, .content a, .entry-content a, .post-content a, main a')
235
- for link in content_links:
236
- link_text = self.clean_text(' '.join(link.css('::text').getall()))
237
- link_url = link.css('::attr(href)').get()
 
 
 
 
238
 
239
- if link_text and link_url and len(link_text) > 3:
240
- # Only follow internal links
241
- parsed_url = urlparse(response.urljoin(link_url))
242
- if parsed_url.netloc == meta['domain']:
243
- full_url = response.urljoin(link_url)
244
- if full_url not in self.visited_urls and not self.is_unwanted_url(full_url):
245
- yield scrapy.Request(
246
- url=full_url,
247
- callback=self.parse_content_page,
248
- meta={
249
- 'page_title': link_text,
250
- 'category': meta['category'], # Keep parent category
251
- 'department': meta['department'],
252
- 'domain': meta['domain'],
253
- 'menu_path': f"{meta.get('menu_path', '')} > {link_text}"
254
- },
255
- errback=self.handle_error
256
- )
257
 
258
  def parse_study_program(self, response):
259
- """Process study program pages specifically"""
 
260
  meta = response.meta
261
  self.visited_urls.add(response.url)
262
 
263
  department = meta['department']
264
- program_title = meta['page_title']
265
 
266
- # Extract program details
267
  program_details = self.extract_program_details(response)
268
-
269
 
270
  # Add to the study programs collection
271
  self.study_programs[department].append({
@@ -275,7 +530,7 @@ class PNPDepartmentSpider(scrapy.Spider):
275
  })
276
 
277
  # Also save as a regular page
278
- content = self.extract_content(response)
279
  if content:
280
  self.save_page_content(
281
  response.url,
@@ -285,16 +540,18 @@ class PNPDepartmentSpider(scrapy.Spider):
285
  'Program_Studi',
286
  content
287
  )
 
288
 
289
  def extract_program_details(self, response):
290
- """Enhanced program details extraction with better degree detection"""
291
- details = {} # Initialize details as empty dict
 
292
 
293
- # Improved degree detection from multiple sources
294
  degree_sources = [
295
- response.css('title::text').get(),
296
- response.css('h1::text').get(),
297
- ' '.join(response.css('.breadcrumb ::text').getall())
298
  ]
299
 
300
  degree_pattern = re.compile(
@@ -307,124 +564,84 @@ class PNPDepartmentSpider(scrapy.Spider):
307
  details['degree'] = match.group(1).upper()
308
  break
309
 
310
- # Extract accreditation status
311
- accreditation = response.xpath(
312
- '//span[contains(translate(., "ABCDE", "abcde"), "akreditasi")]'
313
- '/following-sibling::span/text()'
314
- ).get()
315
-
316
- if accreditation:
317
- details['accreditation'] = self.clean_text(accreditation)
318
-
319
- # Extract description from the first paragraph
320
- first_paragraph = response.css('p::text').get()
321
- if first_paragraph:
322
- details['description'] = self.clean_text(first_paragraph)
323
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  return details
325
 
326
  def parse_vision_mission(self, response):
327
- """Special handler for vision & mission pages"""
 
328
  meta = response.meta
329
  self.visited_urls.add(response.url)
330
  department = meta['department']
331
 
332
- vision_text = ""
333
- mission_text = ""
334
 
335
- # Look for vision section
336
- vision_selectors = [
337
- 'h2:contains("Visi") + p', 'h3:contains("Visi") + p',
338
- 'h4:contains("Visi") + p', '.visi p', '#visi p',
339
- 'h2:contains("Vision") + p', 'h3:contains("Vision") + p',
340
- 'strong:contains("Visi") + p', 'b:contains("Visi") + p'
341
- ]
342
 
343
- for selector in vision_selectors:
344
- try:
345
- vision = response.css(selector).get()
346
- if vision:
347
- vision_text = self.clean_text(scrapy.Selector(text=vision).css('::text').get(''))
348
- if vision_text:
349
- break
350
- except:
351
- continue
352
-
353
- # If still not found, try looking for paragraphs after headings
354
- if not vision_text:
355
- for heading in response.css('h1, h2, h3, h4, h5, h6'):
356
- heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
357
- if heading_text and ('visi' in heading_text.lower() or 'vision' in heading_text.lower()):
358
- # Try to get the next paragraph
359
- next_p = heading.xpath('following-sibling::p[1]')
360
- if next_p:
361
- vision_text = self.clean_text(' '.join(next_p.css('::text').getall()))
362
- break
363
-
364
- # Look for mission section using similar approach
365
- mission_selectors = [
366
- 'h2:contains("Misi") + p', 'h3:contains("Misi") + p',
367
- 'h4:contains("Misi") + p', '.misi p', '#misi p',
368
- 'h2:contains("Mission") + p', 'h3:contains("Mission") + p',
369
- 'strong:contains("Misi") + p', 'b:contains("Misi") + p'
370
- ]
371
-
372
- for selector in mission_selectors:
373
- try:
374
- mission = response.css(selector).get()
375
- if mission:
376
- mission_text = self.clean_text(scrapy.Selector(text=mission).css('::text').get(''))
377
- if mission_text:
378
- break
379
- except:
380
- continue
381
-
382
- # If still not found, try looking for paragraphs after headings
383
- if not mission_text:
384
- for heading in response.css('h1, h2, h3, h4, h5, h6'):
385
- heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
386
- if heading_text and ('misi' in heading_text.lower() or 'mission' in heading_text.lower()):
387
- # Try to get the next paragraph
388
- next_p = heading.xpath('following-sibling::p[1]')
389
- if next_p:
390
- mission_text = self.clean_text(' '.join(next_p.css('::text').getall()))
391
- break
392
-
393
- # Try to find mission list items
394
- mission_list_items = []
395
- for list_selector in ['h2:contains("Misi") ~ ul li', 'h3:contains("Misi") ~ ul li',
396
- 'h4:contains("Misi") ~ ul li', '.misi ul li', '#misi ul li',
397
- 'h2:contains("Mission") ~ ul li', 'h3:contains("Mission") ~ ul li']:
398
- try:
399
- items = response.css(f'{list_selector}::text').getall()
400
- if items:
401
- mission_list_items = [self.clean_text(item) for item in items if self.clean_text(item)]
402
- if mission_list_items:
403
- break
404
- except:
405
- continue
406
 
407
- # Store vision and mission in department info
408
- if vision_text or mission_text or mission_list_items:
409
  if vision_text:
410
  self.department_info[department]['vision'] = vision_text
411
  if mission_text:
412
  self.department_info[department]['mission'] = mission_text
413
- if mission_list_items:
414
- self.department_info[department]['mission_items'] = mission_list_items
415
 
416
- # Save as separate file for vision-mission
417
  self.save_vision_mission(
418
  department,
419
  meta['domain'],
420
  vision_text,
421
  mission_text,
422
- mission_list_items,
423
  response.url
424
  )
425
 
426
  # Also save as a regular page
427
- content = self.extract_content(response)
428
  if content:
429
  self.save_page_content(
430
  response.url,
@@ -434,371 +651,294 @@ class PNPDepartmentSpider(scrapy.Spider):
434
  'Profil',
435
  content
436
  )
 
437
 
438
- def extract_content(self, response):
439
- """Extract content from a page in a structured format"""
440
- content = {"paragraphs": [], "tables": [], "files": []}
441
-
442
- # First try to find the main content areas
443
- content_selectors = [
444
- 'div.entry-content', 'article.post', 'main.site-main',
445
- 'div.content', 'div.main-content', 'div#content', 'div.page-content',
446
- 'article', '.post-content', '.entry-content', '.content',
447
- '.page-content', 'main', '#content', '.main-content',
448
- '.article-content', '.single-content'
449
- ]
450
-
451
- main_content = None
452
- for selector in content_selectors:
453
- elements = response.css(selector)
454
- if elements:
455
- main_content = elements
456
- break
457
-
458
- # If no primary content found, use body
459
- if not main_content:
460
- main_content = response.css('body')
461
-
462
- # Extract headings and paragraphs
463
- for heading in main_content.css('h1, h2, h3, h4, h5, h6'):
464
- heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
465
- if heading_text and len(heading_text) > 3:
466
- heading_tag = heading.root.tag
467
- content["paragraphs"].append(f"[{heading_tag.upper()}] {heading_text}")
468
-
469
- # Extract paragraphs
470
- for p in main_content.css('p'):
471
- text = self.clean_text(' '.join(p.css('::text').getall()))
472
- if text and len(text) > 20: # Minimum meaningful length
473
- # Add any links found in this paragraph
474
- links = []
475
- for a in p.css('a'):
476
- link_text = self.clean_text(' '.join(a.css('::text').getall()))
477
- link_url = a.css('::attr(href)').get()
478
- if link_text and link_url:
479
- links.append(f"{link_text} (Link: {response.urljoin(link_url)})")
480
-
481
- paragraph = text
482
- if links:
483
- paragraph += f" | Links: {'; '.join(links)}"
484
-
485
- content["paragraphs"].append(paragraph)
486
-
487
- # Extract list items
488
- for li in main_content.css('li'):
489
- text = self.clean_text(' '.join(li.css('::text').getall()))
490
- if text and len(text) > 10:
491
- content["paragraphs"].append(f"• {text}")
492
-
493
- # If no structured text elements found, try general text extraction
494
- if not content["paragraphs"]:
495
- # Get all text nodes within divs but not within scripts or styles
496
- for div in main_content.css('div'):
497
- text = self.clean_text(' '.join(div.xpath('./text()').getall()))
498
- if text and len(text) > 30:
499
- content["paragraphs"].append(text)
500
-
501
- # Extract tables
502
- for table in main_content.css('table'):
503
- rows = []
504
-
505
- # Get header if it exists
506
- headers = []
507
- for th in table.css('thead th, tr th'):
508
- header_text = self.clean_text(' '.join(th.css('::text').getall()))
509
- if header_text:
510
- headers.append(header_text)
511
-
512
- if headers:
513
- rows.append(" - ".join(headers))
514
-
515
- # Get table body rows
516
- for tr in table.css('tbody tr, tr'):
517
- if tr.css('th') and not tr.css('td'):
518
- continue # Skip header rows already processed
519
-
520
- cells = []
521
- for td in tr.css('td'):
522
- cell_text = self.clean_text(' '.join(td.css('::text').getall()))
523
- link = td.css('a::attr(href)').get()
524
- if link:
525
- cell_text += f" (Link: {response.urljoin(link)})"
526
- if cell_text:
527
- cells.append(cell_text)
528
- else:
529
- cells.append(" ") # Empty cell placeholder
530
-
531
- if cells:
532
- rows.append(" - ".join(cells))
533
-
534
- if len(rows) > 1: # Only add if we have meaningful table
535
- content["tables"].append("\n".join(rows))
536
-
537
- # Extract downloads and files
538
- for link in main_content.css('a[href]'):
539
- href = link.css('::attr(href)').get()
540
- if not href:
541
- continue
542
-
543
- link_text = self.clean_text(' '.join(link.css('::text').getall()))
544
- if not link_text:
545
- link_text = "Unduhan"
546
-
547
- # Match common document formats
548
- if re.search(r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)$', href.lower()):
549
- # Extract file extension for better categorization
550
- file_ext = href.split('.')[-1].lower()
551
- content["files"].append({
552
- "title": link_text,
553
- "url": urljoin(response.url, href),
554
- "type": file_ext
555
- })
556
-
557
- return content if any(value for value in content.values()) else None
558
 
559
  def save_page_content(self, url, title, department, domain, category, content, menu_path=''):
560
- """Save a page's content as a formatted text file"""
561
  if not content or not title:
562
  return
563
 
564
- # Clean up title for filename
565
  safe_title = re.sub(r'[^\w\s-]', '', title).strip().lower()
566
- safe_title = re.sub(r'[-\s]+', '-', safe_title)
567
-
568
- # Prepare the content
569
- formatted_content = f"""# {title}
570
-
571
- URL: {url}
572
- Tanggal: {datetime.now().strftime('%d %B %Y')}
573
- Jurusan: {department}
574
- Kategori: {category}
575
- """
576
-
577
- if menu_path:
578
- formatted_content += f"Navigasi: {menu_path}\n"
579
-
580
- formatted_content += "\n## Konten\n\n"
581
- if content["paragraphs"]:
582
- formatted_content += "\n".join(content["paragraphs"])
583
-
584
- if content["tables"]:
585
- formatted_content += "\n\n## Tabel Data\n\n"
586
- for i, table in enumerate(content["tables"]):
587
- formatted_content += f"### Tabel {i+1}\n{table}\n\n"
588
-
589
- if content["files"]:
590
- formatted_content += "\n\n## Berkas\n\n"
591
- for file in content["files"]:
592
- formatted_content += f"- {file['title']} [{file['type']}]: {file['url']}\n"
593
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
594
- # Generate filename with department prefix
595
- filename = f"{department}_{safe_title}_{timestamp}.txt"
596
 
597
- # Upload file to Supabase
598
  try:
599
-
600
-
 
 
 
601
  upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
602
  path=filename,
603
- file=formatted_content.encode('utf-8'),
604
- file_options={"content-type": "text/plain", "x-upsert": "true"}
605
  )
606
 
607
  self.logger.info(f"Successfully uploaded {filename}")
608
 
609
- # Store in our collection for later summaries
610
  self.department_data[department][category].append({
611
  'title': title,
612
  'url': url,
613
- 'filename': filename
 
614
  })
615
 
616
  except Exception as e:
617
  self.logger.error(f"Upload failed for {filename}: {str(e)}")
 
618
 
619
  def save_vision_mission(self, department, domain, vision, mission, mission_items, url):
620
- """Save vision & mission as a separate well-formatted file"""
621
- filename = f"{department}_Visi_Misi.txt"
622
-
623
- content = f"""# Visi dan Misi {department}
624
-
625
- URL: {url}
626
- Tanggal: {datetime.now().strftime('%d %B %Y')}
627
- Jurusan: {department}
628
-
629
- """
630
-
631
- if vision:
632
- content += f"## Visi\n\n{vision}\n\n"
633
-
634
- if mission:
635
- content += f"## Misi\n\n{mission}\n\n"
636
-
637
- if mission_items:
638
- if not mission: # Only add header if not already added
639
- content += "## Misi\n\n"
640
- for i, item in enumerate(mission_items, 1):
641
- content += f"{i}. {item}\n"
642
 
643
  try:
644
- # Remove existing file if it exists
645
- try:
646
- self.supabase.storage.from_(self.storage_bucket).remove(filename)
647
- except:
648
- pass
649
-
650
  upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
651
  path=filename,
652
- file=content.encode('utf-8'),
653
- file_options={"content-type": "text/plain", "x-upsert": "true"}
654
  )
655
 
656
- self.logger.info(f"Successfully uploaded {filename}")
657
  except Exception as e:
658
- self.logger.error(f"Upload failed for {filename}: {str(e)}")
659
-
 
660
  def clean_text(self, text):
661
- """Clean and normalize text"""
662
  if not text:
663
  return ""
664
 
665
  # Normalize unicode characters
666
  text = unicodedata.normalize('NFKC', text)
667
 
668
- # Replace multiple spaces with single space
669
- text = re.sub(r'\s+', ' ', text)
670
 
671
- # Remove special characters and non-printable characters
672
- text = re.sub(r'[^\x20-\x7E\s\u00A0-\u00FF\u0100-\u017F]', '', text)
673
 
674
- # Remove multiple periods
675
- text = re.sub(r'\.{2,}', ' ', text)
676
 
677
- return text.strip()
678
 
679
  def determine_category(self, menu_text):
680
- """Determine content category based on menu text"""
681
  menu_lower = menu_text.lower()
682
 
683
- # Define category mappings
684
- categories = {
685
- 'Beranda': ['beranda', 'home', 'utama', 'main', 'index'],
686
- 'Profil': ['profil', 'profile', 'tentang', 'about', 'visi', 'misi', 'sejarah', 'history', 'struktur', 'organisasi', 'pimpinan', 'sambutan'],
687
- 'Program_Studi': ['program', 'studi', 'prodi', 'd3', 'd4', 'diploma', 'sarjana', 'akademik', 'jurusan', 'kurikulum'],
688
- 'Dosen': ['dosen', 'staff', 'tenaga', 'pengajar', 'lecturer', 'faculty'],
689
- 'Penelitian': ['penelitian', 'research', 'jurnal', 'karya', 'ilmiah', 'publikasi', 'paper'],
690
- 'Mahasiswa': ['mahasiswa', 'student', 'alumni', 'lulusan', 'graduate', 'kegiatan', 'activity', 'kemahasiswaan'],
691
- 'Fasilitas': ['fasilitas', 'facility', 'lab', 'laboratorium', 'gedung', 'building', 'sarana', 'prasarana'],
692
- 'Informasi': ['informasi', 'info', 'pengumuman', 'announcement', 'agenda', 'berita', 'news', 'event'],
693
- 'Kerjasama': ['kerjasama', 'cooperation', 'mitra', 'partner', 'industri', 'industry', 'collaboration'],
694
- 'Dokumen': ['dokumen', 'document', 'unduhan', 'download', 'berkas', 'file']
695
- }
696
 
697
- # Check each category
698
- for category, terms in categories.items():
699
- if any(term in menu_lower for term in terms):
700
  return category
701
 
702
- # Default category if no match
703
  return 'Lainnya'
704
 
705
  def is_social_media_link(self, url):
706
- """Check if URL is for social media"""
707
- social_patterns = [
708
- 'facebook.com', 'twitter.com', 'instagram.com',
709
- 'youtube.com', 'linkedin.com', 'pinterest.com',
710
- 'tiktok.com', 'wa.me', 'whatsapp.com', 't.me'
711
  ]
712
- return any(pattern in url.lower() for pattern in social_patterns)
 
 
 
 
 
 
 
 
713
 
714
  def is_unwanted_url(self, url):
715
- """Check if URL should be skipped"""
716
- # Skip certain file types
717
- if re.search(r'\.(jpg|jpeg|png|gif|svg|ico|css|js)$', url.lower()):
718
  return True
719
 
720
- # Skip certain URL patterns
721
  unwanted_patterns = [
722
- 'login', 'logout', 'signin', 'signup', 'register', 'admin',
723
- 'wp-', '/wp/', 'wordpress', 'comment', 'feed', 'rss', 'atom',
724
- 'javascript:', 'mailto:', 'tel:', 'page/', '/tag/', '/author/',
725
- '/archive/', '/category/', '/search', 'kalender', '/ajax/', '/api/'
726
  ]
727
 
728
- return any(pattern in url.lower() for pattern in unwanted_patterns)
 
729
 
730
  def handle_error(self, failure):
731
- """Handle request errors"""
732
  url = failure.request.url
733
- self.visited_urls.add(url) # Mark as visited to prevent retries
734
- self.logger.error(f"Request failed: {url} - {str(failure.value)}")
 
 
 
 
 
 
 
 
 
 
 
 
735
 
736
  def closed(self, reason):
737
- """Finalize processing when spider is closed"""
738
- self.logger.info("Spider closed. Generating summary report...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739
 
740
- # Log statistics
741
- departments_count = len(self.department_data)
742
- pages_count = sum(len(cat_data) for dept_data in self.department_data.values()
743
- for cat_data in dept_data.values())
744
 
745
- self.logger.info(f"Crawled {departments_count} departments and {pages_count} pages")
746
- self.logger.info(f"Found {len(self.study_programs)} departments with programs")
 
 
 
747
 
748
- for dept, programs in self.study_programs.items():
749
- self.logger.info(f"{dept}: {len(programs)} programs")
 
750
 
751
- # Generate and upload the summary file
752
- self.generate_summary_file()
 
 
 
 
 
 
753
 
754
- def generate_summary_file(self):
755
- """Generate comprehensive summary with program metadata"""
756
- content = """# Daftar Lengkap Jurusan dan Program Studi Politeknik Negeri Padang\n\n"""
757
- content += f"**Terakhir diperbarui**: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n"
758
 
759
  # Create reverse mapping from department name to domain
760
  reverse_departments = {v: k for k, v in self.DEPARTMENTS.items()}
761
 
762
  for department, programs in self.study_programs.items():
763
- # Get domain from reverse mapping
764
  domain = reverse_departments.get(department, '')
765
- website_url = f'https://{domain}' if domain else 'URL tidak ditemukan'
766
 
767
  content += f"## {department.replace('_', ' ')}\n"
768
- content += f"**Website**: {website_url}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
769
 
 
770
  if programs:
 
771
  for prog in programs:
772
- content += f"### {prog['title']}\n"
773
- content += f"- **Jenjang**: {prog['details'].get('degree', 'N/A')}\n"
774
- content += f"- **Akreditasi**: {prog['details'].get('accreditation', 'N/A')}\n"
775
- content += f"- **URL**: {prog['url']}\n"
776
 
777
  if 'description' in prog['details']:
778
- desc = prog['details']['description']
779
- content += f"\n**Deskripsi**:\n{desc}\n"
780
-
781
- content += "\n"
782
  else:
783
- content += "### Belum ada informasi program studi\n"
784
 
785
  content += "\n---\n\n"
786
 
787
- # Upload to Supabase
788
- filename = "Daftar_Jurusan_dan_Prodi_Politeknik_Negeri_Padang.txt"
789
  try:
790
- self.supabase.storage.from_(self.storage_bucket).remove(filename)
791
  self.supabase.storage.from_(self.storage_bucket).upload(
792
  path=filename,
793
  file=content.encode('utf-8'),
794
- file_options={"content-type": "text/plain", "x-upsert": "true"}
795
  )
796
- self.logger.info("Ringkasan jurusan berhasil diunggah")
797
  except Exception as e:
798
- self.logger.error(f"Gagal mengupload file ringkasan: {str(e)}")
799
 
800
 
801
- # Main execution
802
  if __name__ == "__main__":
803
  process = CrawlerProcess()
804
  process.crawl(PNPDepartmentSpider)
 
8
  from collections import defaultdict
9
  from supabase import create_client
10
  from dotenv import load_dotenv
11
+ import logging
12
+ from bs4 import BeautifulSoup
13
 
14
  # Load environment variables
15
  load_dotenv()
16
 
17
  class PNPDepartmentSpider(scrapy.Spider):
18
+ name = 'optimized_pnp_department_spider'
19
 
20
  DEPARTMENTS = {
21
  'akt.pnp.ac.id': 'Akuntansi',
 
29
 
30
  start_urls = [f'https://{domain}' for domain in DEPARTMENTS.keys()]
31
  visited_urls = set()
32
+ failed_urls = set()
33
 
34
  custom_settings = {
35
+ 'DOWNLOAD_DELAY': 1.5,
36
  'ROBOTSTXT_OBEY': True,
37
  'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
38
  'LOG_LEVEL': 'INFO',
39
  'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
40
+ 'DOWNLOAD_TIMEOUT': 30,
41
+ 'RETRY_TIMES': 2,
42
  'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
43
+ 'HTTPCACHE_ENABLED': True,
44
+ 'HTTPCACHE_EXPIRATION_SECS': 86400, # Cache for 1 day
45
+ 'DEPTH_LIMIT': 3,
46
+ 'DEPTH_PRIORITY': 1
47
  }
48
 
49
  def __init__(self, *args, **kwargs):
50
  super().__init__(*args, **kwargs)
51
+ self.setup_logging()
52
+ self.setup_supabase()
 
 
 
53
  self.department_data = defaultdict(lambda: defaultdict(list))
54
  self.study_programs = defaultdict(list)
55
  self.department_info = defaultdict(dict)
56
+ self.session_stats = {
57
+ 'total_pages': 0,
58
+ 'successful_pages': 0,
59
+ 'failed_pages': 0
60
+ }
61
+
62
+ def setup_logging(self):
63
+ """Configure advanced logging"""
64
+ logging.basicConfig(
65
+ level=logging.INFO,
66
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
67
+ handlers=[
68
+ logging.FileHandler('pnp_spider.log'),
69
+ logging.StreamHandler()
70
+ ]
71
+ )
72
+ self.logger = logging.getLogger(self.name)
73
+
74
+ def setup_supabase(self):
75
+ """Initialize Supabase client with error handling"""
76
+ try:
77
+ self.supabase = create_client(
78
+ os.getenv("NEXT_PUBLIC_SUPABASE_URL"),
79
+ os.getenv("NEXT_PUBLIC_SUPABASE_SERVICE_KEY")
80
+ )
81
+ self.storage_bucket = os.getenv("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
82
+ # Test connection
83
+ self.supabase.storage.list_buckets()
84
+ self.logger.info("Successfully connected to Supabase")
85
+ except Exception as e:
86
+ self.logger.error(f"Supabase connection failed: {str(e)}")
87
+ raise
88
 
89
  def start_requests(self):
90
+ """Initialize requests with better error handling"""
91
  for url in self.start_urls:
92
  yield scrapy.Request(
93
  url=url,
94
  callback=self.parse_department_homepage,
95
  errback=self.handle_error,
96
+ headers={'Accept': 'text/html,application/xhtml+xml'},
97
+ meta={'retry_count': 0}
98
  )
99
 
100
  def parse_department_homepage(self, response):
101
+ """Enhanced department homepage parsing"""
102
+ self.session_stats['total_pages'] += 1
103
+
104
  domain = urlparse(response.url).netloc
105
  department = self.DEPARTMENTS.get(domain, domain)
106
  self.visited_urls.add(response.url)
107
 
108
+ self.logger.info(f"Processing department: {department} ({domain})")
109
 
110
+ # Extract homepage content
111
  homepage_content = self.extract_content(response)
112
  if homepage_content:
113
+ page_title = self.extract_page_title(response)
114
  self.save_page_content(
115
  response.url,
116
  page_title,
 
119
  'Beranda',
120
  homepage_content
121
  )
122
+ self.session_stats['successful_pages'] += 1
123
 
124
+ # Process navigation with improved detection
125
  nav_elements = self.extract_navigation(response)
126
  for nav_item in nav_elements:
127
+ if not self.should_follow_link(nav_item['link'], response.url):
128
  continue
129
 
130
  full_url = response.urljoin(nav_item['link'])
 
139
  'category': category,
140
  'department': department,
141
  'domain': domain,
142
+ 'menu_path': nav_item['text'],
143
+ 'retry_count': 0,
144
+ 'depth': response.meta.get('depth', 0) + 1
145
  },
146
+ errback=self.handle_error,
147
+ priority=2 if 'prodi' in nav_item['text'].lower() else 1
148
  )
149
 
150
+ # Process study programs with better detection
151
  study_program_links = self.extract_study_program_links(response)
152
  for prog in study_program_links:
153
  if prog['link'] not in self.visited_urls:
 
157
  meta={
158
  'page_title': prog['title'],
159
  'department': department,
160
+ 'domain': domain,
161
+ 'retry_count': 0,
162
+ 'priority': 3 # Higher priority for program pages
163
  },
164
  errback=self.handle_error
165
  )
166
 
167
+ # Process vision & mission with better detection
168
  vision_mission_links = self.extract_vision_mission_links(response)
169
  for vm_link in vision_mission_links:
170
  if vm_link['link'] not in self.visited_urls:
 
174
  meta={
175
  'page_title': vm_link['title'],
176
  'department': department,
177
+ 'domain': domain,
178
+ 'retry_count': 0,
179
+ 'priority': 2
180
  },
181
  errback=self.handle_error
182
  )
183
 
184
+ def should_follow_link(self, link, base_url):
185
+ """Determine if a link should be followed"""
186
+ if not link or link.startswith('#') or link.startswith('javascript:'):
187
+ return False
188
+
189
+ parsed_link = urlparse(link)
190
+ parsed_base = urlparse(base_url)
191
+
192
+ # Skip if different domain
193
+ if parsed_link.netloc and parsed_link.netloc != parsed_base.netloc:
194
+ return False
195
+
196
+ # Skip unwanted file types
197
+ if re.search(r'\.(jpg|jpeg|png|gif|pdf|docx?|xlsx?|pptx?|zip|rar)$', link.lower()):
198
+ return False
199
+
200
+ # Skip admin/login pages
201
+ if any(x in link.lower() for x in ['wp-admin', 'wp-login', 'admin', 'login']):
202
+ return False
203
+
204
+ return True
205
+
206
+ def extract_page_title(self, response):
207
+ """Extract page title with multiple fallbacks"""
208
+ title = response.css('h1::text, h1.page-title::text, h1.entry-title::text').get()
209
+ if not title:
210
+ title = response.css('title::text').get()
211
+ if not title:
212
+ title = response.url.split('/')[-1].replace('-', ' ').title()
213
+ return self.clean_text(title or 'Untitled Page')
214
+
215
  def extract_navigation(self, response):
216
+ """Improved navigation extraction with BeautifulSoup"""
217
+ soup = BeautifulSoup(response.text, 'html.parser')
218
  nav_items = []
219
 
220
+ # Common navigation patterns
221
+ nav_patterns = [
222
+ {'tag': 'nav'},
223
+ {'class': 'navbar'},
224
+ {'class': 'navigation'},
225
+ {'class': 'main-menu'},
226
+ {'id': 'menu'},
227
+ {'class': 'primary-menu'},
228
+ {'role': 'navigation'}
229
  ]
230
 
231
+ for pattern in nav_patterns:
232
+ nav = soup.find(**pattern)
233
+ if nav:
234
+ for link in nav.find_all('a', href=True):
235
+ text = self.clean_text(link.get_text())
236
+ href = link['href']
237
+
238
+ if text and href and len(text) > 1 and not self.is_social_media_link(href):
239
  nav_items.append({
240
+ 'text': text,
241
+ 'link': href
242
  })
243
 
244
+ # Fallback to CSS selectors if BeautifulSoup finds nothing
245
+ if not nav_items:
246
+ for link in response.css('a'):
247
+ text = self.clean_text(' '.join(link.css('::text').getall()))
248
+ href = link.css('::attr(href)').get()
249
+
250
+ if text and href and len(text) > 1 and not self.is_social_media_link(href):
251
+ nav_items.append({
252
+ 'text': text,
253
+ 'link': href
254
+ })
255
+
256
  return nav_items
257
 
258
  def extract_study_program_links(self, response):
259
+ """Enhanced study program link extraction"""
260
  program_links = []
261
 
262
+ # Try BeautifulSoup first
263
+ soup = BeautifulSoup(response.text, 'html.parser')
264
+
265
+ # Look for common patterns in menu items
266
+ program_texts = ['program studi', 'prodi', 'jurusan', 'program pendidikan']
267
+
268
+ for text in program_texts:
269
+ menu_items = soup.find_all(lambda tag: tag.name == 'a' and text in tag.get_text().lower())
270
+ for item in menu_items:
271
+ href = item.get('href')
272
+ if href:
273
+ program_links.append({
274
+ 'title': self.clean_text(item.get_text()),
275
+ 'link': response.urljoin(href)
276
+ })
277
+
278
+ # Fallback to XPath if needed
279
+ if not program_links:
280
+ xpath = "//a[contains(translate(., 'PROGRAMSTUDI', 'programstudi'), 'program studi') or contains(., 'Prodi')]"
281
+ for link in response.xpath(xpath):
282
  program_links.append({
283
+ 'title': self.clean_text(''.join(link.xpath('.//text()').getall())),
284
+ 'link': response.urljoin(link.xpath('@href').get())
285
  })
 
 
 
 
286
 
287
+ # Deduplicate
288
+ seen = set()
289
+ unique_links = []
290
+ for prog in program_links:
291
+ if prog['link'] not in seen:
292
+ seen.add(prog['link'])
293
+ unique_links.append(prog)
294
+
295
+ return unique_links
296
 
297
  def extract_vision_mission_links(self, response):
298
+ """Improved vision & mission link detection"""
299
  vm_links = []
300
 
301
+ # Terms in multiple languages
302
+ terms = ['visi', 'misi', 'vision', 'mission', 'tujuan', 'goal']
303
 
304
+ # Check both link text and URLs
305
+ for term in terms:
306
+ # Links containing the term in text
307
  for link in response.css(f'a:contains("{term}")'):
308
  text = self.clean_text(' '.join(link.css('::text').getall()))
309
  url = link.css('::attr(href)').get()
310
 
311
  if text and url:
312
  vm_links.append({
313
+ 'title': text,
314
  'link': response.urljoin(url)
315
  })
316
+
317
+ # Links with term in URL
318
+ for link in response.css(f'a[href*="{term}"]'):
319
+ if link not in vm_links:
320
+ text = self.clean_text(' '.join(link.css('::text').getall()))
321
+ url = link.css('::attr(href)').get()
322
+
323
+ if text and url:
324
+ vm_links.append({
325
+ 'title': text,
326
+ 'link': response.urljoin(url)
327
+ })
328
 
329
  return vm_links
330
 
331
  def parse_content_page(self, response):
332
+ """Enhanced content page parsing"""
333
+ self.session_stats['total_pages'] += 1
334
  meta = response.meta
335
  self.visited_urls.add(response.url)
336
 
337
+ # Extract content with improved methods
338
+ content = self.extract_structured_content(response)
339
 
340
  if content:
341
  self.save_page_content(
 
347
  content,
348
  meta.get('menu_path', '')
349
  )
350
+ self.session_stats['successful_pages'] += 1
351
+
352
+ # Extract and follow internal links with better filtering
353
+ if response.meta.get('depth', 0) < 3: # Limit depth
354
+ internal_links = self.extract_internal_links(response, meta['domain'])
355
+ for link in internal_links:
356
+ if link['url'] not in self.visited_urls:
357
+ yield scrapy.Request(
358
+ url=link['url'],
359
+ callback=self.parse_content_page,
360
+ meta={
361
+ 'page_title': link['text'],
362
+ 'category': meta['category'], # Inherit parent category
363
+ 'department': meta['department'],
364
+ 'domain': meta['domain'],
365
+ 'menu_path': f"{meta.get('menu_path', '')} > {link['text']}",
366
+ 'retry_count': 0,
367
+ 'depth': response.meta.get('depth', 0) + 1
368
+ },
369
+ errback=self.handle_error,
370
+ priority=1
371
+ )
372
+
373
+ def extract_structured_content(self, response):
374
+ """Extract content in a more structured way using both CSS and XPath"""
375
+ content = {
376
+ "metadata": {
377
+ "title": self.extract_page_title(response),
378
+ "url": response.url,
379
+ "timestamp": datetime.now().isoformat(),
380
+ "department": response.meta.get('department', ''),
381
+ "domain": response.meta.get('domain', '')
382
+ },
383
+ "sections": [],
384
+ "files": [],
385
+ "tables": []
386
+ }
387
+
388
+ # Use BeautifulSoup for better HTML parsing
389
+ soup = BeautifulSoup(response.text, 'html.parser')
390
+
391
+ # Remove unwanted elements
392
+ for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'form']):
393
+ element.decompose()
394
+
395
+ # Extract main content areas
396
+ main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile('content|main')) or soup
397
+
398
+ # Process headings and content hierarchy
399
+ current_section = {}
400
+ for element in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table']):
401
+ if element.name.startswith('h'):
402
+ # If we have a current section with content, add it first
403
+ if current_section and current_section.get('content'):
404
+ content['sections'].append(current_section)
405
+
406
+ # Start new section
407
+ current_section = {
408
+ "heading": self.clean_text(element.get_text()),
409
+ "level": int(element.name[1]),
410
+ "content": []
411
+ }
412
+ else:
413
+ if not current_section:
414
+ current_section = {
415
+ "heading": "Content",
416
+ "level": 2,
417
+ "content": []
418
+ }
419
+
420
+ if element.name == 'p':
421
+ text = self.clean_text(element.get_text())
422
+ if text and len(text) > 20:
423
+ current_section['content'].append({
424
+ "type": "paragraph",
425
+ "text": text
426
+ })
427
+ elif element.name in ['ul', 'ol']:
428
+ items = [self.clean_text(li.get_text()) for li in element.find_all('li')]
429
+ if items:
430
+ current_section['content'].append({
431
+ "type": "list",
432
+ "style": "ordered" if element.name == 'ol' else "unordered",
433
+ "items": items
434
+ })
435
+ elif element.name == 'table':
436
+ table_data = self.extract_table_data(element)
437
+ if table_data:
438
+ content['tables'].append(table_data)
439
+
440
+ # Add the last section if it exists
441
+ if current_section and current_section.get('content'):
442
+ content['sections'].append(current_section)
443
+
444
+ # Extract files and downloads
445
+ for link in main_content.find_all('a', href=True):
446
+ href = link['href']
447
+ if re.search(r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)$', href.lower()):
448
+ content['files'].append({
449
+ "title": self.clean_text(link.get_text()) or "Unduhan",
450
+ "url": response.urljoin(href),
451
+ "type": href.split('.')[-1].lower()
452
+ })
453
+
454
+ return content if (content['sections'] or content['tables']) else None
455
+
456
+ def extract_table_data(self, table_element):
457
+ """Extract structured table data"""
458
+ table_data = {
459
+ "headers": [],
460
+ "rows": []
461
+ }
462
+
463
+ # Extract headers from thead if exists
464
+ thead = table_element.find('thead')
465
+ if thead:
466
+ for th in thead.find_all(['th', 'td']):
467
+ table_data['headers'].append(self.clean_text(th.get_text()))
468
+
469
+ # Extract rows from tbody or directly from table
470
+ tbody = table_element.find('tbody') or table_element
471
+ for tr in tbody.find_all('tr'):
472
+ row = []
473
+ for cell in tr.find_all(['td', 'th']):
474
+ # Handle cell content with possible links
475
+ cell_text = self.clean_text(cell.get_text())
476
+ links = [{'text': self.clean_text(a.get_text()), 'url': a['href']}
477
+ for a in cell.find_all('a', href=True)]
478
+
479
+ row.append({
480
+ "text": cell_text,
481
+ "links": links
482
+ })
483
+
484
+ if row:
485
+ table_data['rows'].append(row)
486
+
487
+ return table_data if table_data['rows'] else None
488
+
489
+ def extract_internal_links(self, response, domain):
490
+ """Extract internal links with better filtering"""
491
+ internal_links = []
492
 
493
+ for link in response.css('a[href]'):
494
+ text = self.clean_text(' '.join(link.css('::text').getall()))
495
+ href = link.css('::attr(href)').get()
496
+
497
+ if not text or not href:
498
+ continue
499
+
500
+ full_url = response.urljoin(href)
501
+ parsed_url = urlparse(full_url)
502
 
503
+ # Only follow links from the same domain
504
+ if parsed_url.netloc == domain:
505
+ if not self.is_unwanted_url(full_url) and full_url not in self.visited_urls:
506
+ internal_links.append({
507
+ 'text': text,
508
+ 'url': full_url
509
+ })
510
+
511
+ return internal_links
 
 
 
 
 
 
 
 
 
512
 
513
  def parse_study_program(self, response):
514
+ """Enhanced study program parsing"""
515
+ self.session_stats['total_pages'] += 1
516
  meta = response.meta
517
  self.visited_urls.add(response.url)
518
 
519
  department = meta['department']
520
+ program_title = self.extract_page_title(response)
521
 
522
+ # Extract program details with improved methods
523
  program_details = self.extract_program_details(response)
 
524
 
525
  # Add to the study programs collection
526
  self.study_programs[department].append({
 
530
  })
531
 
532
  # Also save as a regular page
533
+ content = self.extract_structured_content(response)
534
  if content:
535
  self.save_page_content(
536
  response.url,
 
540
  'Program_Studi',
541
  content
542
  )
543
+ self.session_stats['successful_pages'] += 1
544
 
545
  def extract_program_details(self, response):
546
+ """Enhanced program details extraction"""
547
+ details = {}
548
+ soup = BeautifulSoup(response.text, 'html.parser')
549
 
550
+ # Degree detection from multiple sources
551
  degree_sources = [
552
+ soup.title.string if soup.title else None,
553
+ soup.h1.get_text() if soup.h1 else None,
554
+ ' '.join(soup.find(class_=re.compile('breadcrumb')).stripped_strings) if soup.find(class_=re.compile('breadcrumb')) else None
555
  ]
556
 
557
  degree_pattern = re.compile(
 
564
  details['degree'] = match.group(1).upper()
565
  break
566
 
567
+ # Extract accreditation status with better pattern matching
568
+ accreditation_texts = [
569
+ 'akreditasi',
570
+ 'peringkat',
571
+ 'status akreditasi',
572
+ 'sertifikasi'
573
+ ]
574
+
575
+ for text in accreditation_texts:
576
+ element = soup.find(string=re.compile(text, re.IGNORECASE))
577
+ if element:
578
+ # Look for the accreditation value in nearby elements
579
+ parent = element.find_parent()
580
+ siblings = [sib for sib in parent.next_siblings if isinstance(sib, str) or sib.name]
581
+
582
+ for sib in siblings:
583
+ if isinstance(sib, str):
584
+ if match := re.search(r'[A-Z]', sib):
585
+ details['accreditation'] = match.group()
586
+ break
587
+ elif sib.name:
588
+ if match := re.search(r'[A-Z]', sib.get_text()):
589
+ details['accreditation'] = match.group()
590
+ break
591
+
592
+ if 'accreditation' in details:
593
+ break
594
+
595
+ # Extract description from the first meaningful paragraph
596
+ for p in soup.find_all('p'):
597
+ text = self.clean_text(p.get_text())
598
+ if text and len(text) > 50 and not any(x in text.lower() for x in ['copyright', 'hak cipta']):
599
+ details['description'] = text
600
+ break
601
+
602
  return details
603
 
604
  def parse_vision_mission(self, response):
605
+ """Enhanced vision & mission parsing"""
606
+ self.session_stats['total_pages'] += 1
607
  meta = response.meta
608
  self.visited_urls.add(response.url)
609
  department = meta['department']
610
 
611
+ # Use BeautifulSoup for better content extraction
612
+ soup = BeautifulSoup(response.text, 'html.parser')
613
 
614
+ # Find vision and mission sections
615
+ vision_text = self.find_section_text(soup, ['visi', 'vision'])
616
+ mission_text = self.find_section_text(soup, ['misi', 'mission'])
 
 
 
 
617
 
618
+ # Find mission items if presented as list
619
+ mission_items = []
620
+ mission_list = self.find_mission_list(soup)
621
+ if mission_list:
622
+ mission_items = [self.clean_text(li.get_text()) for li in mission_list.find_all('li')]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
 
624
+ # Store in department info
625
+ if vision_text or mission_text or mission_items:
626
  if vision_text:
627
  self.department_info[department]['vision'] = vision_text
628
  if mission_text:
629
  self.department_info[department]['mission'] = mission_text
630
+ if mission_items:
631
+ self.department_info[department]['mission_items'] = mission_items
632
 
633
+ # Save as separate file
634
  self.save_vision_mission(
635
  department,
636
  meta['domain'],
637
  vision_text,
638
  mission_text,
639
+ mission_items,
640
  response.url
641
  )
642
 
643
  # Also save as a regular page
644
+ content = self.extract_structured_content(response)
645
  if content:
646
  self.save_page_content(
647
  response.url,
 
651
  'Profil',
652
  content
653
  )
654
+ self.session_stats['successful_pages'] += 1
655
 
656
+ def find_section_text(self, soup, keywords):
657
+ """Find section text based on keywords"""
658
+ for keyword in keywords:
659
+ # Look for headings containing the keyword
660
+ for heading in soup.find_all(['h1', 'h2', 'h3', 'h4']):
661
+ if keyword.lower() in heading.get_text().lower():
662
+ # Get the next paragraph or div
663
+ next_node = heading.next_sibling
664
+ while next_node:
665
+ if next_node.name in ['p', 'div']:
666
+ text = self.clean_text(next_node.get_text())
667
+ if text:
668
+ return text
669
+ next_node = next_node.next_sibling
670
+
671
+ return None
672
+
673
+ def find_mission_list(self, soup):
674
+ """Find mission items presented as list"""
675
+ for keyword in ['misi', 'mission']:
676
+ # Look for headings containing the keyword
677
+ for heading in soup.find_all(['h1', 'h2', 'h3', 'h4']):
678
+ if keyword.lower() in heading.get_text().lower():
679
+ # Find the next ul or ol element
680
+ next_node = heading.next_sibling
681
+ while next_node:
682
+ if next_node.name in ['ul', 'ol']:
683
+ return next_node
684
+ next_node = next_node.next_sibling
685
+
686
+ return None
 
687
 
688
  def save_page_content(self, url, title, department, domain, category, content, menu_path=''):
689
+ """Save page content with improved formatting"""
690
  if not content or not title:
691
  return
692
 
693
+ # Generate filename with department prefix
694
  safe_title = re.sub(r'[^\w\s-]', '', title).strip().lower()
695
+ safe_title = re.sub(r'[-\s]+', '-', safe_title)[:100] # Limit length
696
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
697
+ filename = f"{department}_{safe_title}_{timestamp}.json"
 
698
 
 
699
  try:
700
+ # Convert content to JSON string
701
+ import json
702
+ content_str = json.dumps(content, ensure_ascii=False, indent=2)
703
+
704
+ # Upload to Supabase
705
  upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
706
  path=filename,
707
+ file=content_str.encode('utf-8'),
708
+ file_options={"content-type": "application/json", "x-upsert": "true"}
709
  )
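+ # The x-upsert option lets repeated crawls overwrite an existing object instead of failing on a duplicate path.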
710
 
711
  self.logger.info(f"Successfully uploaded {filename}")
712
 
713
+ # Store in our collection
714
  self.department_data[department][category].append({
715
  'title': title,
716
  'url': url,
717
+ 'filename': filename,
718
+ 'timestamp': timestamp
719
  })
720
 
721
  except Exception as e:
722
  self.logger.error(f"Upload failed for {filename}: {str(e)}")
723
+ self.failed_urls.add(url)
724
 
725
  def save_vision_mission(self, department, domain, vision, mission, mission_items, url):
726
+ """Save vision & mission with improved formatting"""
727
+ filename = f"{department}_Visi_Misi.json"
728
+
729
+ content = {
730
+ "department": department,
731
+ "domain": domain,
732
+ "url": url,
733
+ "timestamp": datetime.now().isoformat(),
734
+ "vision": vision,
735
+ "mission": mission,
736
+ "mission_items": mission_items
737
+ }
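+ # Fixed filename (no timestamp) plus x-upsert keeps a single, current vision & mission file per department.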
738
 
739
  try:
740
+ import json
741
+ content_str = json.dumps(content, ensure_ascii=False, indent=2)
742
+
743
  upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
744
  path=filename,
745
+ file=content_str.encode('utf-8'),
746
+ file_options={"content-type": "application/json", "x-upsert": "true"}
747
  )
748
 
749
+ self.logger.info(f"Successfully uploaded vision & mission for {department}")
750
  except Exception as e:
751
+ self.logger.error(f"Failed to upload vision & mission for {department}: {str(e)}")
752
+ self.failed_urls.add(url)
753
+
754
  def clean_text(self, text):
755
+ """Improved text cleaning with normalization"""
756
  if not text:
757
  return ""
758
 
759
  # Normalize unicode characters
760
  text = unicodedata.normalize('NFKC', text)
761
 
762
+ # Remove control characters
763
+ text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
764
 
765
+ # Replace multiple spaces/newlines with single space
766
+ text = re.sub(r'\s+', ' ', text)
767
 
768
+ # Remove leading/trailing whitespace
769
+ text = text.strip()
770
 
771
+ return text
772
 
773
  def determine_category(self, menu_text):
774
+ """Enhanced category determination"""
775
  menu_lower = menu_text.lower()
776
 
777
+ category_mapping = [
778
+ (['beranda', 'home', 'utama'], 'Beranda'),
779
+ (['profil', 'profile', 'tentang', 'about', 'sejarah', 'history'], 'Profil'),
780
+ (['program', 'studi', 'prodi', 'jurusan', 'kurikulum'], 'Program_Studi'),
781
+ (['dosen', 'staff', 'pengajar', 'lecturer'], 'Dosen'),
782
+ (['penelitian', 'research', 'publikasi', 'jurnal'], 'Penelitian'),
783
+ (['mahasiswa', 'student', 'alumni'], 'Mahasiswa'),
784
+ (['fasilitas', 'lab', 'laboratorium'], 'Fasilitas'),
785
+ (['pengumuman', 'berita', 'news', 'agenda'], 'Informasi'),
786
+ (['kerjasama', 'partnership', 'mitra'], 'Kerjasama'),
787
+ (['dokumen', 'download', 'unduhan'], 'Dokumen')
788
+ ]
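+ # The first keyword group that matches wins, so more specific groups should stay above broader ones.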
 
789
 
790
+ for keywords, category in category_mapping:
791
+ if any(keyword in menu_lower for keyword in keywords):
 
792
  return category
793
 
 
794
  return 'Lainnya'
795
 
796
  def is_social_media_link(self, url):
797
+ """Check if URL is social media with better pattern matching"""
798
+ social_domains = [
799
+ 'facebook.com', 'twitter.com', 'instagram.com',
800
+ 'youtube.com', 'linkedin.com', 'tiktok.com',
801
+ 'whatsapp.com', 'wa.me', 'telegram.me'
802
  ]
803
+
804
+ if not url:
805
+ return False
806
+
807
+ parsed = urlparse(url.lower())
808
+ if not parsed.netloc:
809
+ return False
810
+
811
+ return any(domain in parsed.netloc for domain in social_domains)
812
 
813
  def is_unwanted_url(self, url):
814
+ """Improved unwanted URL detection"""
815
+ if not url:
 
816
  return True
817
 
 
818
  unwanted_patterns = [
819
+ r'\.(jpg|jpeg|png|gif|svg|ico|css|js|pdf|docx?|xlsx?|pptx?|zip|rar)$',
820
+ r'(login|logout|signin|signup|register|admin|wp-|/wp/|wordpress|comment|feed|rss|atom)',
821
+ r'(javascript:|mailto:|tel:|#)',
822
+ r'(page/\d+|tag/|author/|archive/|category/|search|kalender|ajax|api)'
823
  ]
824
 
825
+ url_lower = url.lower()
826
+ return any(re.search(pattern, url_lower) for pattern in unwanted_patterns)
827
 
828
  def handle_error(self, failure):
829
+ """Enhanced error handling with retry logic"""
830
  url = failure.request.url
831
+ meta = failure.request.meta
832
+ retry_count = meta.get('retry_count', 0)
833
+
834
+ self.logger.error(f"Request failed ({retry_count}): {url} - {str(failure.value)}")
835
+ self.session_stats['failed_pages'] += 1
836
+ self.failed_urls.add(url)
837
+
838
+ # Retry logic
839
+ if retry_count < self.custom_settings.get('RETRY_TIMES', 2):
840
+ self.logger.info(f"Retrying {url} (attempt {retry_count + 1})")
841
+ new_request = failure.request.copy()
842
+ new_request.meta['retry_count'] = retry_count + 1
843
+ new_request.dont_filter = True
844
+ return new_request
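+ # Scrapy processes errback results like callback output, so returning the copied request re-queues the failed URL.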
845
 
846
  def closed(self, reason):
847
+ """Enhanced closing method with comprehensive reporting"""
848
+ self.logger.info("Spider closed. Generating final reports...")
849
+
850
+ # Generate summary statistics
851
+ summary = {
852
+ "total_pages": self.session_stats['total_pages'],
853
+ "successful_pages": self.session_stats['successful_pages'],
854
+ "failed_pages": self.session_stats['failed_pages'],
855
+ "success_rate": (self.session_stats['successful_pages'] / self.session_stats['total_pages']) * 100 if self.session_stats['total_pages'] > 0 else 0,
856
+ "departments_crawled": len(self.department_data),
857
+ "programs_found": sum(len(progs) for progs in self.study_programs.values()),
858
+ "failed_urls": list(self.failed_urls)
859
+ }
860
+
861
+ # Upload summary report
862
+ self.upload_summary_report(summary)
863
 
864
+ # Generate department summary
865
+ self.generate_department_summary()
 
 
866
 
867
+ self.logger.info(f"Crawling completed. Success rate: {summary['success_rate']:.2f}%")
868
+
869
+ def upload_summary_report(self, summary):
870
+ """Upload comprehensive summary report"""
871
+ filename = "crawling_summary_report.json"
872
 
873
+ try:
874
+ import json
875
+ content = json.dumps(summary, ensure_ascii=False, indent=2)
876
 
877
+ self.supabase.storage.from_(self.storage_bucket).upload(
878
+ path=filename,
879
+ file=content.encode('utf-8'),
880
+ file_options={"content-type": "application/json", "x-upsert": "true"}
881
+ )
882
+ self.logger.info("Successfully uploaded summary report")
883
+ except Exception as e:
884
+ self.logger.error(f"Failed to upload summary report: {str(e)}")
885
 
886
+ def generate_department_summary(self):
887
+ """Generate detailed department summary"""
888
+ content = "# Laporan Lengkap Jurusan Politeknik Negeri Padang\n\n"
889
+ content += f"**Tanggal**: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n"
890
 
891
  # Create reverse mapping from department name to domain
892
  reverse_departments = {v: k for k, v in self.DEPARTMENTS.items()}
893
 
894
  for department, programs in self.study_programs.items():
 
895
  domain = reverse_departments.get(department, '')
896
+ website_url = f'https://{domain}' if domain else ''
897
 
898
  content += f"## {department.replace('_', ' ')}\n"
899
+ content += f"**Website**: {website_url}\n"
900
+
901
+ # Add vision and mission if available
902
+ if department in self.department_info:
903
+ if 'vision' in self.department_info[department]:
904
+ content += f"\n### Visi\n{self.department_info[department]['vision']}\n"
905
+
906
+ if 'mission' in self.department_info[department]:
907
+ content += f"\n### Misi\n{self.department_info[department]['mission']}\n"
908
+ elif 'mission_items' in self.department_info[department]:
909
+ content += "\n### Misi\n"
910
+ for i, item in enumerate(self.department_info[department]['mission_items'], 1):
911
+ content += f"{i}. {item}\n"
912
 
913
+ # Add study programs
914
  if programs:
915
+ content += "\n### Program Studi\n"
916
  for prog in programs:
917
+ content += f"- **{prog['title']}**\n"
918
+ content += f" - Jenjang: {prog['details'].get('degree', 'N/A')}\n"
919
+ content += f" - Akreditasi: {prog['details'].get('accreditation', 'N/A')}\n"
920
+ content += f" - URL: {prog['url']}\n"
921
 
922
  if 'description' in prog['details']:
923
+ content += f" - Deskripsi: {prog['details']['description']}\n"
924
  else:
925
+ content += "\n### Belum ada informasi program studi\n"
926
 
927
  content += "\n---\n\n"
928
 
929
+ # Upload department summary
930
+ filename = "department_summary_report.md"
931
  try:
 
932
  self.supabase.storage.from_(self.storage_bucket).upload(
933
  path=filename,
934
  file=content.encode('utf-8'),
935
+ file_options={"content-type": "text/markdown", "x-upsert": "true"}
936
  )
937
+ self.logger.info("Successfully uploaded department summary report")
938
  except Exception as e:
939
+ self.logger.error(f"Failed to upload department summary: {str(e)}")
940
 
941
 
 
942
  if __name__ == "__main__":
943
  process = CrawlerProcess()
944
  process.crawl(PNPDepartmentSpider)