import os
import re
from datetime import datetime
from io import StringIO

import scrapy
from scrapy.crawler import CrawlerProcess
from supabase import create_client


class PnpSpider(scrapy.Spider):
    name = 'pnp_spider'
    allowed_domains = ['presensi.pnp.ac.id', 'elektro.pnp.ac.id']
    start_urls = [
        'https://presensi.pnp.ac.id/',
        'https://elektro.pnp.ac.id/jadwal-perkuliahan-jurusan-teknik-elektro/jadwal-perkuliahan-program-studi-teknik-listrik/'
    ]
    excluded_departments = ['elektronika', 'telkom', 'listrik']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialize Supabase client from environment variables
        url = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
        key = os.environ.get("NEXT_PUBLIC_SUPABASE_SERVICE_KEY")
        self.supabase = create_client(url, key)
        self.storage_bucket = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
        self.file_buffers = {}  # Dictionary of jurusan_id -> StringIO document buffer
        self.current_date = datetime.now().strftime("%Y-%m-%d")

    def closed(self, reason):
        print(f"Spider closing with reason: {reason}")
        print(f"Uploading {len(self.file_buffers)} files to Supabase...")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        for jurusan_id, buffer in self.file_buffers.items():
            filename = f"{jurusan_id}_{timestamp}.txt"
            content = buffer.getvalue()
            print(f"Uploading {filename} with content length: {len(content)}")
            success = self.upload_to_supabase(filename, content)
            if success:
                print(f"✅ Successfully uploaded {filename}")
            else:
                print(f"❌ Failed to upload {filename}")
            buffer.close()

    def upload_to_supabase(self, filename, content):
        """Upload content directly to Supabase Storage"""
        try:
            self.supabase.storage.from_(self.storage_bucket).upload(
                path=filename,
                file=content.encode('utf-8'),
                file_options={"content-type": "text/plain"}
            )
            return True
        except Exception as e:
            print(f"Upload error: {str(e)}")
            return False
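
    # Note (assumption about the Supabase storage API, not stated in this code):
    # uploading to a path that already exists is typically rejected unless an
    # upsert option is passed. The timestamped filenames built in closed() make
    # collisions unlikely, which is why a plain upload() is used here.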

    def parse(self, response):
        if 'elektro.pnp.ac.id' in response.url:
            jurusan_id = 'teknik_elektro'
            jurusan_name = 'Jurusan Teknik Elektro'
            return self.parse_elektro_page(response, jurusan_id, jurusan_name)

        print("Memulai scraping dari halaman utama...")
        jurusan_links = set(response.xpath('//article[contains(@class, "section")]//a/@href').getall())
        for link in jurusan_links:
            if any(excluded in link.lower() for excluded in self.excluded_departments):
                continue
            jurusan_url = response.urljoin(link)
            jurusan_id = self.extract_jurusan_id(link)
            yield scrapy.Request(jurusan_url,
                                 callback=self.parse_jurusan,
                                 meta={'jurusan_id': jurusan_id})

    def parse_elektro_page(self, response, jurusan_id, jurusan_name):
        if jurusan_id not in self.file_buffers:
            self.initialize_document_buffer(jurusan_id, jurusan_name)
        output_buffer = self.file_buffers[jurusan_id]

        tables = response.xpath('//table')
        if not tables:
            return

        for table_idx, table in enumerate(tables):
            caption_text = self.get_table_caption(table, table_idx)
            class_info = self.clean_class_info(caption_text, table)
            if not class_info:
                continue
            self.write_section_header(output_buffer, class_info)

            days = table.xpath('.//thead//th[@class="xAxis"]/text()').getall() or \
                   table.xpath('.//thead//th[contains(@class, "xAxis")]/text()').getall()
            time_slots = table.xpath('.//tbody//th[@class="yAxis"]/text()').getall() or \
                         table.xpath('.//tbody//th[contains(@class, "yAxis")]/text()').getall()
            if not days or not time_slots:
                continue

            schedule_grid = self.build_schedule_grid(days, time_slots)
            self.process_table_rows(table, schedule_grid, days, time_slots)
            self.write_schedule_to_buffer(output_buffer, schedule_grid, days, time_slots)

    def initialize_document_buffer(self, jurusan_id, jurusan_name):
        """Initialize a new document with proper title and metadata"""
        self.file_buffers[jurusan_id] = StringIO()
        buffer = self.file_buffers[jurusan_id]
        # Write document title and metadata
        buffer.write(f"# Jadwal Perkuliahan {jurusan_name}\n\n")
        buffer.write(f"**Jurusan:** {jurusan_name}\n")
        buffer.write(f"**Tanggal Update:** {self.current_date}\n")
        buffer.write("**Sumber:** Politeknik Negeri Padang\n\n")
        buffer.write("---\n\n")

    def get_table_caption(self, table, table_idx):
        """Extract and clean table caption text"""
        caption = table.xpath('.//caption//text()').getall()
        caption_text = ' '.join(caption).strip()
        if not caption_text:
            caption_text = table.xpath('preceding::h2[1]//text()|preceding::h3[1]//text()|preceding::h4[1]//text()').get()
            caption_text = caption_text.strip() if caption_text else f"Jadwal Kelas {table_idx + 1}"
        return caption_text

    def clean_class_info(self, caption_text, table):
        """Combine and clean class information"""
        thead_class_info = ' '.join(table.xpath('.//thead/tr[1]//text()').getall()).strip()
        class_info = f"{caption_text} {thead_class_info}" if thead_class_info else caption_text
        return re.sub(r'\s+', ' ', class_info).strip()

    def write_section_header(self, buffer, class_info):
        """Write a section header for each class schedule"""
        buffer.write(f"## Jadwal Perkuliahan {class_info}\n\n")
        buffer.write("Berikut adalah jadwal perkuliahan untuk kelas tersebut, diurutkan berdasarkan hari dan waktu:\n\n")

    def build_schedule_grid(self, days, time_slots):
        """Initialize the schedule grid structure"""
        return {day: {time: 'kosong' for time in time_slots} for day in days}

    def process_table_rows(self, table, schedule_grid, days, time_slots):
        """Process table rows respecting rowspans and colspans"""
        rows = table.xpath('.//tbody/tr[not(contains(@class, "foot"))]')
        active_rowspans = {}

        for row_idx, row in enumerate(rows):
            if row_idx >= len(time_slots):
                continue
            current_time = time_slots[row_idx]
            filled_columns = set()

            # Apply active rowspans carried over from earlier rows
            self.apply_active_rowspans(active_rowspans, schedule_grid, days, current_time, filled_columns, row_idx)

            # Process current row cells
            cells = row.xpath('./td')
            col_idx = 0
            for cell in cells:
                while col_idx < len(days) and col_idx in filled_columns:
                    col_idx += 1
                if col_idx >= len(days):
                    break
                cell_content = self.process_cell_content(cell)
                rowspan = int(cell.xpath('./@rowspan').get() or 1)
                colspan = int(cell.xpath('./@colspan').get() or 1)
                self.update_schedule_grid(schedule_grid, days, current_time, col_idx, colspan, cell_content)
                self.update_active_rowspans(active_rowspans, row_idx, col_idx, colspan, rowspan, cell_content)
                col_idx += colspan

    def apply_active_rowspans(self, active_rowspans, schedule_grid, days, current_time, filled_columns, row_idx):
        """Apply content from cells with rowspan to the current row"""
        rowspans_to_remove = []
        for (rs_col_idx, rs_row_start_idx), (rowspan_left, content) in active_rowspans.items():
            if rowspan_left > 0 and rs_col_idx < len(days):
                day = days[rs_col_idx]
                schedule_grid[day][current_time] = content
                filled_columns.add(rs_col_idx)
                active_rowspans[(rs_col_idx, rs_row_start_idx)] = (rowspan_left - 1, content)
                if rowspan_left - 1 <= 0:
                    rowspans_to_remove.append((rs_col_idx, rs_row_start_idx))
        for key in rowspans_to_remove:
            del active_rowspans[key]

    def process_cell_content(self, cell):
        """Extract and clean cell content"""
        content = ' '.join(cell.xpath('.//text()').getall()).strip()
        return 'kosong' if not content or content == '---' else content

    def update_schedule_grid(self, schedule_grid, days, current_time, col_idx, colspan, content):
        """Update schedule grid with cell content"""
        for c in range(colspan):
            current_col_idx = col_idx + c
            if current_col_idx < len(days):
                schedule_grid[days[current_col_idx]][current_time] = content

    def update_active_rowspans(self, active_rowspans, row_idx, col_idx, colspan, rowspan, content):
        """Track cells with rowspan for future rows"""
        if rowspan > 1:
            for c in range(colspan):
                active_rowspans[(col_idx + c, row_idx)] = (rowspan - 1, content)
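
    # Illustrative example of the rowspan bookkeeping above: a cell with
    # rowspan="3" in the column for "Senin" is written into the grid for its own
    # time slot by process_table_rows; apply_active_rowspans then copies the same
    # content into the next two time slots on the following rows and marks that
    # column as filled so later cells are shifted past it. (The day name here is
    # illustrative, not scraped data.)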

    def format_course_entry(self, time_slots, course_info):
        """Format a course entry for optimal RAG retrieval"""
        # Parse course information
        parts = course_info.split()
        course_code = parts[0] if parts and len(parts[0]) == 7 and parts[0][:3].isalpha() and parts[0][3:].isdigit() else ""
        course_name = ""
        lecturer = ""
        room = ""

        # Extract course name, lecturer, and room
        if "_" in course_info:
            # Format: COURSE_CODE Course_Name_P Lecturer Room
            course_parts = course_info.split("_P")
            if len(course_parts) > 1:
                course_name = course_parts[0].replace(course_code, "").strip()
                remaining = course_parts[1].strip().split()
                lecturer = " ".join(remaining[:-1])
                room = remaining[-1] if remaining else ""
        else:
            # Alternative format
            course_name = " ".join(parts[1:-2]) if len(parts) > 3 else course_info.replace(course_code, "").strip()
            lecturer = parts[-2] if len(parts) > 1 else ""
            room = parts[-1] if parts else ""

        # Format time range
        time_range = self.format_time_range(time_slots)

        # Create structured information
        return {
            "time_range": time_range,
            "course_code": course_code,
            "course_name": course_name,
            "lecturer": lecturer,
            "room": room
        }

    def write_schedule_to_buffer(self, buffer, schedule_grid, days, time_slots):
        """Write each day's schedule, merging consecutive slots with the same course"""
        for day in days:
            current_course = None
            current_times = []
            day_schedule = []
            for time_slot in time_slots:
                course = schedule_grid[day][time_slot]
                if course == current_course:
                    current_times.append(time_slot)
                else:
                    if current_course and current_course.lower() != 'kosong':
                        time_range = self.format_time_range(current_times)
                        entry = f"- {day} {time_range} | {current_course}"
                        day_schedule.append(entry)
                    current_course = course
                    current_times = [time_slot]
            # Add the final entry
            if current_course and current_course.lower() != 'kosong':
                time_range = self.format_time_range(current_times)
                entry = f"- {day} {time_range} | {current_course}"
                day_schedule.append(entry)
            # Write the results to the buffer
            for entry in day_schedule:
                buffer.write(entry + "\n")
            buffer.write("\n")  # blank line between days

    def format_time_range(self, time_slots):
        """Format multiple time slots into a readable range"""
        if len(time_slots) == 1:
            return time_slots[0]
        first_start = time_slots[0].split('-')[0].strip()
        last_end = time_slots[-1].split('-')[-1].strip()
        return f"{first_start} - {last_end}"

    def extract_jurusan_id(self, link):
        match = re.search(r'department\?dep=(\d+)', link)
        return match.group(1) if match else f"unknown_{hash(link) % 1000}"
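
    # A link such as "department?dep=5" (hypothetical value) yields the id "5";
    # links that do not match the pattern fall back to an "unknown_<n>"
    # placeholder derived from hash(link). Note that Python salts string hashes
    # per interpreter run, so that fallback id is not stable across runs.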

    def parse_jurusan(self, response):
        jurusan_id = response.meta.get('jurusan_id')
        jurusan_name = self.extract_title_jurusan_name(response)
        groups_days_horizontal_link = response.xpath('//td/a[contains(@href, "groups_days_horizontal") and not(contains(@href, "subgroups_days_horizontal"))]/@href').get()
        if groups_days_horizontal_link:
            groups_days_horizontal_url = response.urljoin(groups_days_horizontal_link)
            safe_jurusan_name = re.sub(r'[^\w\-_\. ]', '_', jurusan_name)
            yield scrapy.Request(groups_days_horizontal_url,
                                 callback=self.parse_jadwal,
                                 meta={'jurusan_id': safe_jurusan_name, 'jurusan_name': jurusan_name})

    def parse_jadwal(self, response):
        jurusan_id = response.meta.get('jurusan_id')
        jurusan_name = response.meta.get('jurusan_name')
        if jurusan_id not in self.file_buffers:
            self.initialize_document_buffer(jurusan_id, jurusan_name)
        output_buffer = self.file_buffers[jurusan_id]

        tables = response.xpath('//table[contains(@id, "table_")]') or response.xpath('//table')
        for table in tables:
            caption_text = self.get_table_caption(table, 0)
            class_info = self.clean_class_info(caption_text, table)
            if not class_info:
                continue
            self.write_section_header(output_buffer, class_info)

            days = table.xpath('.//thead//th[@class="xAxis"]/text()').getall()
            time_slots = table.xpath('.//tbody/tr[not(contains(@class, "foot"))]/th[@class="yAxis"]/text()').getall()
            if not days or not time_slots:
                continue

            schedule_grid = self.build_schedule_grid(days, time_slots)
            self.process_table_rows(table, schedule_grid, days, time_slots)
            self.write_schedule_to_buffer(output_buffer, schedule_grid, days, time_slots)

    def extract_title_jurusan_name(self, response):
        title = response.xpath('//title/text()').get()
        return title.strip() if title else f"Jurusan_{response.meta.get('jurusan_id')}"
if __name__ == "__main__": | |
process = CrawlerProcess(settings={ | |
"LOG_LEVEL": "INFO", | |
"USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", | |
"DOWNLOAD_DELAY": 1, | |
"AUTOTHROTTLE_ENABLED": True, | |
}) | |
process.crawl(PnpSpider) | |
process.start() |
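
# Usage note: the spider expects NEXT_PUBLIC_SUPABASE_URL,
# NEXT_PUBLIC_SUPABASE_SERVICE_KEY and NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET to be
# set in the environment. With those in place, running this file directly crawls
# the timetable pages and, when the spider closes, uploads one
# "<jurusan_id>_<timestamp>.txt" document per department to the configured
# storage bucket.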