Commit 050f867 · committed by FauziIsyrinApridal
1 Parent(s): 61960d8
Fix the semester filter and the department (jurusan) scraper; add BeautifulSoup

Files changed:
- app/(main)/components/RagDashboard.tsx  +4 -1
- components/SemesterFilter.tsx  +40 -49
- requirements.txt  +1 -1
- scrapping/jurusan_scrap.py  +609 -469
app/(main)/components/RagDashboard.tsx
CHANGED
@@ -189,7 +189,10 @@ export default function RagDashboard() {
 189           {isRefreshing ? "Refreshing..." : "Refresh"}
 190         </Button>
 191
-192         <SemesterFilter
+192         <SemesterFilter
+193           dates={ragData.map((file) => file.created_at)}
+194           onFilterChange={handleSemesterFilterChange}
+195         />
 196
 197         <DropdownMenu>
 198           <DropdownMenuTrigger asChild>
components/SemesterFilter.tsx
CHANGED
@@ -1,12 +1,11 @@
-import { useState
+import { useState } from "react";
 import { ChevronDown, Filter, Calendar } from "lucide-react";

-// Define the props interface for the component
 interface SemesterFilterProps {
-
+  dates: string[];
+  onFilterChange?: (semesterId: string) => void;
 }

-// Define the semester option type
 interface SemesterOption {
   id: string;
   label: string;
@@ -14,40 +13,57 @@ interface SemesterOption {
 }

 export default function SemesterFilter({
+  dates,
   onFilterChange,
 }: SemesterFilterProps) {
   const [semesterFilter, setSemesterFilter] = useState("all");
   const [isFilterOpen, setIsFilterOpen] = useState(false);

-  //
-  const
-
-
-
-
-
-
-
-  };
-
-  const academicYears =
+  // Extract unique academic years from dates
+  const extractAcademicYears = (dates: string[]): string[] => {
+    const yearSet = new Set<string>();
+
+    dates.forEach((dateStr) => {
+      const date = new Date(dateStr);
+      const year = date.getFullYear();
+      const month = date.getMonth() + 1;
+
+      let startYear: number;
+      if (month >= 9) {
+        // Odd semester: starts in September
+        startYear = year;
+      } else {
+        // Even semester: January–August of next year
+        startYear = year - 1;
+      }
+
+      const academicYear = `${startYear}/${startYear + 1}`;
+      yearSet.add(academicYear);
+    });
+
+    if (yearSet.size === 0) {
+      const currentYear = new Date().getFullYear();
+      yearSet.add(`${currentYear - 2}/${currentYear - 1}`);
+      yearSet.add(`${currentYear - 1}/${currentYear}`);
+      yearSet.add(`${currentYear}/${currentYear + 1}`);
+    }
+
+    return Array.from(yearSet).sort((a, b) => (a > b ? -1 : 1));
+  };
+
+  const academicYears = extractAcademicYears(dates);

   // Generate semester options
   const semesterOptions: SemesterOption[] = [];
   academicYears.forEach((academicYear) => {
     const [startYear, endYear] = academicYear.split("/");
+
     semesterOptions.push({
       id: `odd-${academicYear}`,
       label: `Ganjil ${academicYear}`,
       description: `September ${startYear} - January ${endYear}`,
     });

-    // Even semester (February - August of endYear)
     semesterOptions.push({
       id: `even-${academicYear}`,
       label: `Genap ${academicYear}`,
@@ -55,28 +71,6 @@ export default function SemesterFilter({
     });
   });

-  // Check if a document falls within a specific semester
-  const isInSemester = (date: string, semesterId: string) => {
-    if (semesterId === "all") return true;
-
-    const [type, academicYear] = semesterId.split("-");
-    const [startYear, endYear] = academicYear.split("/");
-    const docDate = new Date(date);
-    const docMonth = docDate.getMonth() + 1; // 1-12
-    const docYear = docDate.getFullYear();
-
-    if (type === "odd") {
-      // Odd semester: September (9) - January (1) of next year
-      return (
-        (docYear === parseInt(startYear) && docMonth >= 9 && docMonth <= 12) ||
-        (docYear === parseInt(endYear) && docMonth === 1)
-      );
-    } else {
-      // Even semester: February (2) - August (8)
-      return docYear === parseInt(endYear) && docMonth >= 2 && docMonth <= 8;
-    }
-  };
-
   const handleFilterClick = () => {
     setIsFilterOpen(!isFilterOpen);
   };
@@ -84,14 +78,11 @@ export default function SemesterFilter({
   const handleSemesterSelect = (semesterId: string) => {
     setSemesterFilter(semesterId);
     setIsFilterOpen(false);
-
-    // Call the onFilterChange prop if it exists
     if (onFilterChange) {
       onFilterChange(semesterId);
     }
   };

-  // Get display text for current filter
   const getCurrentFilterText = () => {
     if (semesterFilter === "all") return "All Semesters";

@@ -118,7 +109,7 @@ export default function SemesterFilter({
   {isFilterOpen && (
     <div className="absolute z-50 mt-2 min-w-[240px] rounded-md border border-gray-200 bg-white shadow-lg">
       <div className="py-1">
-        {/* All
+        {/* All Semesters */}
         <div
           onClick={() => handleSemesterSelect("all")}
           className={`cursor-pointer px-4 py-2 hover:bg-gray-100 ${
@@ -128,15 +119,15 @@ export default function SemesterFilter({
         All Semesters
       </div>

-
-      <div className="my-1 border-t border-gray-200"></div>
+      <div className="my-1 border-t border-gray-200" />

-      {/*
+      {/* Semester Options Grouped by Academic Year */}
       {academicYears.map((year, yearIndex) => (
         <div key={year}>
           {yearIndex > 0 && (
-            <div className="my-1 border-t border-gray-200"
+            <div className="my-1 border-t border-gray-200" />
           )}
+
           <div className="px-4 py-2 text-xs font-semibold text-gray-500">
             Academic Year {year}
           </div>
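For reference, the rule the new extractAcademicYears helper applies is: a date in September or later opens a new academic year, while January–August belongs to the year that started the previous September. The same mapping sketched in Python (the rest of this commit is Python; this is an illustration of the rule, not code from the repository):

    from datetime import datetime

    def academic_year(date_str: str) -> str:
        # Mirrors the TypeScript helper: month >= 9 opens a new academic year.
        d = datetime.fromisoformat(date_str)
        start = d.year if d.month >= 9 else d.year - 1
        return f"{start}/{start + 1}"

    print(academic_year("2024-10-05"))  # -> 2024/2025 (odd / Ganjil semester)
    print(academic_year("2025-03-12"))  # -> 2024/2025 (even / Genap semester)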
requirements.txt
CHANGED
@@ -3,4 +3,4 @@ supabase
 3  python-dotenv
 4  requests
 5  instaloader
-6
+6  BeautifulSoup
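Note on the new dependency: the scraper below imports the parser via `from bs4 import BeautifulSoup`, and on PyPI that import is provided by the `beautifulsoup4` distribution (the bare `BeautifulSoup` name resolves to the legacy 3.x release). A minimal sanity check, assuming `beautifulsoup4` is what ends up installed:

    # pip install beautifulsoup4
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>hello</p>", "html.parser")
    print(soup.p.get_text())  # -> hello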
scrapping/jurusan_scrap.py
CHANGED
@@ -8,12 +8,14 @@ from datetime import datetime
|
|
8 |
from collections import defaultdict
|
9 |
from supabase import create_client
|
10 |
from dotenv import load_dotenv
|
|
|
|
|
11 |
|
12 |
# Load environment variables
|
13 |
load_dotenv()
|
14 |
|
15 |
class PNPDepartmentSpider(scrapy.Spider):
|
16 |
-
name = '
|
17 |
|
18 |
DEPARTMENTS = {
|
19 |
'akt.pnp.ac.id': 'Akuntansi',
|
@@ -27,50 +29,88 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
27 |
|
28 |
start_urls = [f'https://{domain}' for domain in DEPARTMENTS.keys()]
|
29 |
visited_urls = set()
|
|
|
30 |
|
31 |
custom_settings = {
|
32 |
-
'DOWNLOAD_DELAY':
|
33 |
'ROBOTSTXT_OBEY': True,
|
34 |
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
35 |
'LOG_LEVEL': 'INFO',
|
36 |
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
|
37 |
-
|
38 |
-
'RETRY_TIMES':
|
39 |
'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
|
40 |
-
'HTTPCACHE_ENABLED': True
|
|
|
|
|
|
|
41 |
}
|
42 |
|
43 |
def __init__(self, *args, **kwargs):
|
44 |
super().__init__(*args, **kwargs)
|
45 |
-
self.
|
46 |
-
|
47 |
-
os.getenv("NEXT_PUBLIC_SUPABASE_SERVICE_KEY")
|
48 |
-
)
|
49 |
-
self.storage_bucket = os.getenv("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
|
50 |
self.department_data = defaultdict(lambda: defaultdict(list))
|
51 |
self.study_programs = defaultdict(list)
|
52 |
self.department_info = defaultdict(dict)
|
53 |
|
54 |
def start_requests(self):
|
|
|
55 |
for url in self.start_urls:
|
56 |
yield scrapy.Request(
|
57 |
url=url,
|
58 |
callback=self.parse_department_homepage,
|
59 |
errback=self.handle_error,
|
60 |
-
headers={'Accept': 'text/html,application/xhtml+xml'}
|
|
|
61 |
)
|
62 |
|
63 |
def parse_department_homepage(self, response):
|
|
|
|
|
|
|
64 |
domain = urlparse(response.url).netloc
|
65 |
department = self.DEPARTMENTS.get(domain, domain)
|
66 |
self.visited_urls.add(response.url)
|
67 |
|
68 |
-
self.logger.info(f"Processing department
|
69 |
|
70 |
-
# Extract homepage content
|
71 |
homepage_content = self.extract_content(response)
|
72 |
if homepage_content:
|
73 |
-
page_title =
|
74 |
self.save_page_content(
|
75 |
response.url,
|
76 |
page_title,
|
@@ -79,11 +119,12 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
79 |
'Beranda',
|
80 |
homepage_content
|
81 |
)
|
|
|
82 |
|
83 |
-
# Process navigation
|
84 |
nav_elements = self.extract_navigation(response)
|
85 |
for nav_item in nav_elements:
|
86 |
-
if not nav_item['link']
|
87 |
continue
|
88 |
|
89 |
full_url = response.urljoin(nav_item['link'])
|
@@ -98,12 +139,15 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
98 |
'category': category,
|
99 |
'department': department,
|
100 |
'domain': domain,
|
101 |
-
'menu_path': nav_item['text']
|
|
|
|
|
102 |
},
|
103 |
-
errback=self.handle_error
|
|
|
104 |
)
|
105 |
|
106 |
-
#
|
107 |
study_program_links = self.extract_study_program_links(response)
|
108 |
for prog in study_program_links:
|
109 |
if prog['link'] not in self.visited_urls:
|
@@ -113,12 +157,14 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
113 |
meta={
|
114 |
'page_title': prog['title'],
|
115 |
'department': department,
|
116 |
-
'domain': domain
|
|
|
|
|
117 |
},
|
118 |
errback=self.handle_error
|
119 |
)
|
120 |
|
121 |
-
#
|
122 |
vision_mission_links = self.extract_vision_mission_links(response)
|
123 |
for vm_link in vision_mission_links:
|
124 |
if vm_link['link'] not in self.visited_urls:
|
@@ -128,96 +174,168 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
128 |
meta={
|
129 |
'page_title': vm_link['title'],
|
130 |
'department': department,
|
131 |
-
'domain': domain
|
|
|
|
|
132 |
},
|
133 |
errback=self.handle_error
|
134 |
)
|
135 |
|
136 |
def extract_navigation(self, response):
|
137 |
-
"""
|
|
|
138 |
nav_items = []
|
139 |
|
140 |
-
#
|
141 |
-
|
142 |
-
'
|
143 |
-
'
|
144 |
-
'
|
145 |
-
'
|
146 |
-
'
|
|
|
|
|
147 |
]
|
148 |
|
149 |
-
for
|
150 |
-
|
151 |
-
|
152 |
-
link
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
156 |
nav_items.append({
|
157 |
-
'text': text
|
158 |
-
'link':
|
159 |
})
|
160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
return nav_items
|
162 |
|
163 |
def extract_study_program_links(self, response):
|
164 |
-
"""
|
165 |
program_links = []
|
166 |
|
167 |
-
#
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
182 |
program_links.append({
|
183 |
-
'title': text.
|
184 |
-
'link': response.urljoin(
|
185 |
})
|
186 |
-
|
187 |
-
# Fallback logic for websites that use a different structure
|
188 |
-
if not program_links:
|
189 |
-
program_links = super().extract_study_program_links(response)
|
190 |
|
191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
|
193 |
def extract_vision_mission_links(self, response):
|
194 |
-
"""
|
195 |
vm_links = []
|
196 |
|
197 |
-
# Terms
|
198 |
-
|
199 |
|
200 |
-
#
|
201 |
-
for term in
|
|
|
202 |
for link in response.css(f'a:contains("{term}")'):
|
203 |
text = self.clean_text(' '.join(link.css('::text').getall()))
|
204 |
url = link.css('::attr(href)').get()
|
205 |
|
206 |
if text and url:
|
207 |
vm_links.append({
|
208 |
-
'title': text
|
209 |
'link': response.urljoin(url)
|
210 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
return vm_links
|
213 |
|
214 |
def parse_content_page(self, response):
|
215 |
-
"""
|
|
|
216 |
meta = response.meta
|
217 |
self.visited_urls.add(response.url)
|
218 |
|
219 |
-
# Extract content
|
220 |
-
content = self.
|
221 |
|
222 |
if content:
|
223 |
self.save_page_content(
|
@@ -229,43 +347,180 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
229 |
content,
|
230 |
meta.get('menu_path', '')
|
231 |
)
|
232 |
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
|
|
|
|
|
|
|
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
meta={
|
249 |
-
'page_title': link_text,
|
250 |
-
'category': meta['category'], # Keep parent category
|
251 |
-
'department': meta['department'],
|
252 |
-
'domain': meta['domain'],
|
253 |
-
'menu_path': f"{meta.get('menu_path', '')} > {link_text}"
|
254 |
-
},
|
255 |
-
errback=self.handle_error
|
256 |
-
)
|
257 |
|
258 |
def parse_study_program(self, response):
|
259 |
-
"""
|
|
|
260 |
meta = response.meta
|
261 |
self.visited_urls.add(response.url)
|
262 |
|
263 |
department = meta['department']
|
264 |
-
program_title =
|
265 |
|
266 |
-
# Extract program details
|
267 |
program_details = self.extract_program_details(response)
|
268 |
-
|
269 |
|
270 |
# Add to the study programs collection
|
271 |
self.study_programs[department].append({
|
@@ -275,7 +530,7 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
275 |
})
|
276 |
|
277 |
# Also save as a regular page
|
278 |
-
content = self.
|
279 |
if content:
|
280 |
self.save_page_content(
|
281 |
response.url,
|
@@ -285,16 +540,18 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
285 |
'Program_Studi',
|
286 |
content
|
287 |
)
|
|
|
288 |
|
289 |
def extract_program_details(self, response):
|
290 |
-
"""Enhanced program details extraction
|
291 |
-
details = {}
|
|
|
292 |
|
293 |
-
#
|
294 |
degree_sources = [
|
295 |
-
|
296 |
-
|
297 |
-
' '.join(
|
298 |
]
|
299 |
|
300 |
degree_pattern = re.compile(
|
@@ -307,124 +564,84 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
307 |
details['degree'] = match.group(1).upper()
|
308 |
break
|
309 |
|
310 |
-
# Extract accreditation status
|
311 |
-
|
312 |
-
'
|
313 |
-
'
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
|
324 |
return details
|
325 |
|
326 |
def parse_vision_mission(self, response):
|
327 |
-
"""
|
|
|
328 |
meta = response.meta
|
329 |
self.visited_urls.add(response.url)
|
330 |
department = meta['department']
|
331 |
|
332 |
-
|
333 |
-
|
334 |
|
335 |
-
#
|
336 |
-
|
337 |
-
|
338 |
-
'h4:contains("Visi") + p', '.visi p', '#visi p',
|
339 |
-
'h2:contains("Vision") + p', 'h3:contains("Vision") + p',
|
340 |
-
'strong:contains("Visi") + p', 'b:contains("Visi") + p'
|
341 |
-
]
|
342 |
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
if vision_text:
|
349 |
-
break
|
350 |
-
except:
|
351 |
-
continue
|
352 |
-
|
353 |
-
# If still not found, try looking for paragraphs after headings
|
354 |
-
if not vision_text:
|
355 |
-
for heading in response.css('h1, h2, h3, h4, h5, h6'):
|
356 |
-
heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
|
357 |
-
if heading_text and ('visi' in heading_text.lower() or 'vision' in heading_text.lower()):
|
358 |
-
# Try to get the next paragraph
|
359 |
-
next_p = heading.xpath('following-sibling::p[1]')
|
360 |
-
if next_p:
|
361 |
-
vision_text = self.clean_text(' '.join(next_p.css('::text').getall()))
|
362 |
-
break
|
363 |
-
|
364 |
-
# Look for mission section using similar approach
|
365 |
-
mission_selectors = [
|
366 |
-
'h2:contains("Misi") + p', 'h3:contains("Misi") + p',
|
367 |
-
'h4:contains("Misi") + p', '.misi p', '#misi p',
|
368 |
-
'h2:contains("Mission") + p', 'h3:contains("Mission") + p',
|
369 |
-
'strong:contains("Misi") + p', 'b:contains("Misi") + p'
|
370 |
-
]
|
371 |
-
|
372 |
-
for selector in mission_selectors:
|
373 |
-
try:
|
374 |
-
mission = response.css(selector).get()
|
375 |
-
if mission:
|
376 |
-
mission_text = self.clean_text(scrapy.Selector(text=mission).css('::text').get(''))
|
377 |
-
if mission_text:
|
378 |
-
break
|
379 |
-
except:
|
380 |
-
continue
|
381 |
-
|
382 |
-
# If still not found, try looking for paragraphs after headings
|
383 |
-
if not mission_text:
|
384 |
-
for heading in response.css('h1, h2, h3, h4, h5, h6'):
|
385 |
-
heading_text = self.clean_text(' '.join(heading.css('::text').getall()))
|
386 |
-
if heading_text and ('misi' in heading_text.lower() or 'mission' in heading_text.lower()):
|
387 |
-
# Try to get the next paragraph
|
388 |
-
next_p = heading.xpath('following-sibling::p[1]')
|
389 |
-
if next_p:
|
390 |
-
mission_text = self.clean_text(' '.join(next_p.css('::text').getall()))
|
391 |
-
break
|
392 |
-
|
393 |
-
# Try to find mission list items
|
394 |
-
mission_list_items = []
|
395 |
-
for list_selector in ['h2:contains("Misi") ~ ul li', 'h3:contains("Misi") ~ ul li',
|
396 |
-
'h4:contains("Misi") ~ ul li', '.misi ul li', '#misi ul li',
|
397 |
-
'h2:contains("Mission") ~ ul li', 'h3:contains("Mission") ~ ul li']:
|
398 |
-
try:
|
399 |
-
items = response.css(f'{list_selector}::text').getall()
|
400 |
-
if items:
|
401 |
-
mission_list_items = [self.clean_text(item) for item in items if self.clean_text(item)]
|
402 |
-
if mission_list_items:
|
403 |
-
break
|
404 |
-
except:
|
405 |
-
continue
|
406 |
|
407 |
-
# Store
|
408 |
-
if vision_text or mission_text or
|
409 |
if vision_text:
|
410 |
self.department_info[department]['vision'] = vision_text
|
411 |
if mission_text:
|
412 |
self.department_info[department]['mission'] = mission_text
|
413 |
-
if
|
414 |
-
self.department_info[department]['mission_items'] =
|
415 |
|
416 |
-
# Save as separate file
|
417 |
self.save_vision_mission(
|
418 |
department,
|
419 |
meta['domain'],
|
420 |
vision_text,
|
421 |
mission_text,
|
422 |
-
|
423 |
response.url
|
424 |
)
|
425 |
|
426 |
# Also save as a regular page
|
427 |
-
content = self.
|
428 |
if content:
|
429 |
self.save_page_content(
|
430 |
response.url,
|
@@ -434,371 +651,294 @@ class PNPDepartmentSpider(scrapy.Spider):
|
|
434 |
'Profil',
|
435 |
content
|
436 |
)
|
|
|
437 |
|
438 |
-
def
|
439 |
-
"""
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
# Extract paragraphs
|
470 |
-
for p in main_content.css('p'):
|
471 |
-
text = self.clean_text(' '.join(p.css('::text').getall()))
|
472 |
-
if text and len(text) > 20: # Minimum meaningful length
|
473 |
-
# Add any links found in this paragraph
|
474 |
-
links = []
|
475 |
-
for a in p.css('a'):
|
476 |
-
link_text = self.clean_text(' '.join(a.css('::text').getall()))
|
477 |
-
link_url = a.css('::attr(href)').get()
|
478 |
-
if link_text and link_url:
|
479 |
-
links.append(f"{link_text} (Link: {response.urljoin(link_url)})")
|
480 |
-
|
481 |
-
paragraph = text
|
482 |
-
if links:
|
483 |
-
paragraph += f" | Links: {'; '.join(links)}"
|
484 |
-
|
485 |
-
content["paragraphs"].append(paragraph)
|
486 |
-
|
487 |
-
# Extract list items
|
488 |
-
for li in main_content.css('li'):
|
489 |
-
text = self.clean_text(' '.join(li.css('::text').getall()))
|
490 |
-
if text and len(text) > 10:
|
491 |
-
content["paragraphs"].append(f"• {text}")
|
492 |
-
|
493 |
-
# If no structured text elements found, try general text extraction
|
494 |
-
if not content["paragraphs"]:
|
495 |
-
# Get all text nodes within divs but not within scripts or styles
|
496 |
-
for div in main_content.css('div'):
|
497 |
-
text = self.clean_text(' '.join(div.xpath('./text()').getall()))
|
498 |
-
if text and len(text) > 30:
|
499 |
-
content["paragraphs"].append(text)
|
500 |
-
|
501 |
-
# Extract tables
|
502 |
-
for table in main_content.css('table'):
|
503 |
-
rows = []
|
504 |
-
|
505 |
-
# Get header if it exists
|
506 |
-
headers = []
|
507 |
-
for th in table.css('thead th, tr th'):
|
508 |
-
header_text = self.clean_text(' '.join(th.css('::text').getall()))
|
509 |
-
if header_text:
|
510 |
-
headers.append(header_text)
|
511 |
-
|
512 |
-
if headers:
|
513 |
-
rows.append(" - ".join(headers))
|
514 |
-
|
515 |
-
# Get table body rows
|
516 |
-
for tr in table.css('tbody tr, tr'):
|
517 |
-
if tr.css('th') and not tr.css('td'):
|
518 |
-
continue # Skip header rows already processed
|
519 |
-
|
520 |
-
cells = []
|
521 |
-
for td in tr.css('td'):
|
522 |
-
cell_text = self.clean_text(' '.join(td.css('::text').getall()))
|
523 |
-
link = td.css('a::attr(href)').get()
|
524 |
-
if link:
|
525 |
-
cell_text += f" (Link: {response.urljoin(link)})"
|
526 |
-
if cell_text:
|
527 |
-
cells.append(cell_text)
|
528 |
-
else:
|
529 |
-
cells.append(" ") # Empty cell placeholder
|
530 |
-
|
531 |
-
if cells:
|
532 |
-
rows.append(" - ".join(cells))
|
533 |
-
|
534 |
-
if len(rows) > 1: # Only add if we have meaningful table
|
535 |
-
content["tables"].append("\n".join(rows))
|
536 |
-
|
537 |
-
# Extract downloads and files
|
538 |
-
for link in main_content.css('a[href]'):
|
539 |
-
href = link.css('::attr(href)').get()
|
540 |
-
if not href:
|
541 |
-
continue
|
542 |
-
|
543 |
-
link_text = self.clean_text(' '.join(link.css('::text').getall()))
|
544 |
-
if not link_text:
|
545 |
-
link_text = "Unduhan"
|
546 |
-
|
547 |
-
# Match common document formats
|
548 |
-
if re.search(r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)$', href.lower()):
|
549 |
-
# Extract file extension for better categorization
|
550 |
-
file_ext = href.split('.')[-1].lower()
|
551 |
-
content["files"].append({
|
552 |
-
"title": link_text,
|
553 |
-
"url": urljoin(response.url, href),
|
554 |
-
"type": file_ext
|
555 |
-
})
|
556 |
-
|
557 |
-
return content if any(value for value in content.values()) else None
|
558 |
|
559 |
def save_page_content(self, url, title, department, domain, category, content, menu_path=''):
|
560 |
-
"""Save
|
561 |
if not content or not title:
|
562 |
return
|
563 |
|
564 |
-
#
|
565 |
safe_title = re.sub(r'[^\w\s-]', '', title).strip().lower()
|
566 |
-
safe_title = re.sub(r'[-\s]+', '-', safe_title)
|
567 |
-
|
568 |
-
# Prepare the content
|
569 |
-
formatted_content = f"""# {title}
|
570 |
-
|
571 |
-
URL: {url}
|
572 |
-
Tanggal: {datetime.now().strftime('%d %B %Y')}
|
573 |
-
Jurusan: {department}
|
574 |
-
Kategori: {category}
|
575 |
-
"""
|
576 |
-
|
577 |
-
if menu_path:
|
578 |
-
formatted_content += f"Navigasi: {menu_path}\n"
|
579 |
-
|
580 |
-
formatted_content += "\n## Konten\n\n"
|
581 |
-
if content["paragraphs"]:
|
582 |
-
formatted_content += "\n".join(content["paragraphs"])
|
583 |
-
|
584 |
-
if content["tables"]:
|
585 |
-
formatted_content += "\n\n## Tabel Data\n\n"
|
586 |
-
for i, table in enumerate(content["tables"]):
|
587 |
-
formatted_content += f"### Tabel {i+1}\n{table}\n\n"
|
588 |
-
|
589 |
-
if content["files"]:
|
590 |
-
formatted_content += "\n\n## Berkas\n\n"
|
591 |
-
for file in content["files"]:
|
592 |
-
formatted_content += f"- {file['title']} [{file['type']}]: {file['url']}\n"
|
593 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
594 |
-
|
595 |
-
filename = f"{department}_{safe_title}_{timestamp}.txt"
|
596 |
|
597 |
-
# Upload file to Supabase
|
598 |
try:
|
599 |
-
|
600 |
-
|
|
|
|
|
|
|
601 |
upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
|
602 |
path=filename,
|
603 |
-
file=
|
604 |
-
file_options={"content-type": "
|
605 |
)
|
606 |
|
607 |
self.logger.info(f"Successfully uploaded {filename}")
|
608 |
|
609 |
-
# Store in our collection
|
610 |
self.department_data[department][category].append({
|
611 |
'title': title,
|
612 |
'url': url,
|
613 |
-
'filename': filename
|
|
|
614 |
})
|
615 |
|
616 |
except Exception as e:
|
617 |
self.logger.error(f"Upload failed for {filename}: {str(e)}")
|
|
|
618 |
|
619 |
def save_vision_mission(self, department, domain, vision, mission, mission_items, url):
|
620 |
-
"""Save vision & mission
|
621 |
-
filename = f"{department}_Visi_Misi.
|
622 |
-
|
623 |
-
content =
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
|
628 |
-
|
629 |
-
""
|
630 |
-
|
631 |
-
|
632 |
-
content += f"## Visi\n\n{vision}\n\n"
|
633 |
-
|
634 |
-
if mission:
|
635 |
-
content += f"## Misi\n\n{mission}\n\n"
|
636 |
-
|
637 |
-
if mission_items:
|
638 |
-
if not mission: # Only add header if not already added
|
639 |
-
content += "## Misi\n\n"
|
640 |
-
for i, item in enumerate(mission_items, 1):
|
641 |
-
content += f"{i}. {item}\n"
|
642 |
|
643 |
try:
|
644 |
-
|
645 |
-
|
646 |
-
|
647 |
-
except:
|
648 |
-
pass
|
649 |
-
|
650 |
upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
|
651 |
path=filename,
|
652 |
-
file=
|
653 |
-
file_options={"content-type": "
|
654 |
)
|
655 |
|
656 |
-
self.logger.info(f"Successfully uploaded {
|
657 |
except Exception as e:
|
658 |
-
self.logger.error(f"
|
659 |
-
|
|
|
660 |
def clean_text(self, text):
|
661 |
-
"""
|
662 |
if not text:
|
663 |
return ""
|
664 |
|
665 |
# Normalize unicode characters
|
666 |
text = unicodedata.normalize('NFKC', text)
|
667 |
|
668 |
-
#
|
669 |
-
text = re.sub(r'\
|
670 |
|
671 |
-
#
|
672 |
-
text = re.sub(r'
|
673 |
|
674 |
-
# Remove
|
675 |
-
text =
|
676 |
|
677 |
-
return text
|
678 |
|
679 |
def determine_category(self, menu_text):
|
680 |
-
"""
|
681 |
menu_lower = menu_text.lower()
|
682 |
|
683 |
-
|
684 |
-
|
685 |
-
'
|
686 |
-
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
}
|
696 |
|
697 |
-
|
698 |
-
|
699 |
-
if any(term in menu_lower for term in terms):
|
700 |
return category
|
701 |
|
702 |
-
# Default category if no match
|
703 |
return 'Lainnya'
|
704 |
|
705 |
def is_social_media_link(self, url):
|
706 |
-
"""Check if URL is
|
707 |
-
|
708 |
-
'facebook.com', 'twitter.com', 'instagram.com',
|
709 |
-
'youtube.com', 'linkedin.com', '
|
710 |
-
'
|
711 |
]
|
712 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
713 |
|
714 |
def is_unwanted_url(self, url):
|
715 |
-
"""
|
716 |
-
|
717 |
-
if re.search(r'\.(jpg|jpeg|png|gif|svg|ico|css|js)$', url.lower()):
|
718 |
return True
|
719 |
|
720 |
-
# Skip certain URL patterns
|
721 |
unwanted_patterns = [
|
722 |
-
'
|
723 |
-
'wp
|
724 |
-
'javascript
|
725 |
-
'
|
726 |
]
|
727 |
|
728 |
-
|
|
|
729 |
|
730 |
def handle_error(self, failure):
|
731 |
-
"""
|
732 |
url = failure.request.url
|
733 |
-
|
734 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
735 |
|
736 |
def closed(self, reason):
|
737 |
-
"""
|
738 |
-
self.logger.info("Spider closed. Generating
|
739 |
|
740 |
-
#
|
741 |
-
|
742 |
-
pages_count = sum(len(cat_data) for dept_data in self.department_data.values()
|
743 |
-
for cat_data in dept_data.values())
|
744 |
|
745 |
-
self.logger.info(f"
|
746 |
-
|
|
|
|
|
|
|
747 |
|
748 |
-
|
749 |
-
|
|
|
750 |
|
751 |
-
|
752 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
753 |
|
754 |
-
def
|
755 |
-
"""Generate
|
756 |
-
content = "
|
757 |
-
content += f"**
|
758 |
|
759 |
# Create reverse mapping from department name to domain
|
760 |
reverse_departments = {v: k for k, v in self.DEPARTMENTS.items()}
|
761 |
|
762 |
for department, programs in self.study_programs.items():
|
763 |
-
# Get domain from reverse mapping
|
764 |
domain = reverse_departments.get(department, '')
|
765 |
-
website_url = f'https://{domain}' if domain else '
|
766 |
|
767 |
content += f"## {department.replace('_', ' ')}\n"
|
768 |
-
content += f"**Website**: {website_url}\n
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
769 |
|
|
|
770 |
if programs:
|
|
|
771 |
for prog in programs:
|
772 |
-
content += f"
|
773 |
-
content += f"-
|
774 |
-
content += f"-
|
775 |
-
content += f"-
|
776 |
|
777 |
if 'description' in prog['details']:
|
778 |
-
|
779 |
-
content += f"\n**Deskripsi**:\n{desc}\n"
|
780 |
-
|
781 |
-
content += "\n"
|
782 |
else:
|
783 |
-
content += "### Belum ada informasi program studi\n"
|
784 |
|
785 |
content += "\n---\n\n"
|
786 |
|
787 |
-
# Upload
|
788 |
-
filename = "
|
789 |
try:
|
790 |
-
self.supabase.storage.from_(self.storage_bucket).remove(filename)
|
791 |
self.supabase.storage.from_(self.storage_bucket).upload(
|
792 |
path=filename,
|
793 |
file=content.encode('utf-8'),
|
794 |
-
file_options={"content-type": "text/
|
795 |
)
|
796 |
-
self.logger.info("
|
797 |
except Exception as e:
|
798 |
-
self.logger.error(f"
|
799 |
|
800 |
|
801 |
-
# Main execution
|
802 |
if __name__ == "__main__":
|
803 |
process = CrawlerProcess()
|
804 |
process.crawl(PNPDepartmentSpider)
|
|
|
8 |
from collections import defaultdict
|
9 |
from supabase import create_client
|
10 |
from dotenv import load_dotenv
|
11 |
+
import logging
|
12 |
+
from bs4 import BeautifulSoup
|
13 |
|
14 |
# Load environment variables
|
15 |
load_dotenv()
|
16 |
|
17 |
class PNPDepartmentSpider(scrapy.Spider):
|
18 |
+
name = 'optimized_pnp_department_spider'
|
19 |
|
20 |
DEPARTMENTS = {
|
21 |
'akt.pnp.ac.id': 'Akuntansi',
|
|
|
29 |
|
30 |
start_urls = [f'https://{domain}' for domain in DEPARTMENTS.keys()]
|
31 |
visited_urls = set()
|
32 |
+
failed_urls = set()
|
33 |
|
34 |
custom_settings = {
|
35 |
+
'DOWNLOAD_DELAY': 1.5,
|
36 |
'ROBOTSTXT_OBEY': True,
|
37 |
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
38 |
'LOG_LEVEL': 'INFO',
|
39 |
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
|
40 |
+
'DOWNLOAD_TIMEOUT': 30,
|
41 |
+
'RETRY_TIMES': 2,
|
42 |
'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
|
43 |
+
'HTTPCACHE_ENABLED': True,
|
44 |
+
'HTTPCACHE_EXPIRATION_SECS': 86400, # Cache for 1 day
|
45 |
+
'DEPTH_LIMIT': 3,
|
46 |
+
'DEPTH_PRIORITY': 1
|
47 |
}
|
48 |
|
49 |
def __init__(self, *args, **kwargs):
|
50 |
super().__init__(*args, **kwargs)
|
51 |
+
self.setup_logging()
|
52 |
+
self.setup_supabase()
|
|
|
|
|
|
|
53 |
self.department_data = defaultdict(lambda: defaultdict(list))
|
54 |
self.study_programs = defaultdict(list)
|
55 |
self.department_info = defaultdict(dict)
|
56 |
+
self.session_stats = {
|
57 |
+
'total_pages': 0,
|
58 |
+
'successful_pages': 0,
|
59 |
+
'failed_pages': 0
|
60 |
+
}
|
61 |
+
|
62 |
+
def setup_logging(self):
|
63 |
+
"""Configure advanced logging"""
|
64 |
+
logging.basicConfig(
|
65 |
+
level=logging.INFO,
|
66 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
67 |
+
handlers=[
|
68 |
+
logging.FileHandler('pnp_spider.log'),
|
69 |
+
logging.StreamHandler()
|
70 |
+
]
|
71 |
+
)
|
72 |
+
self.logger = logging.getLogger(self.name)
|
73 |
+
|
74 |
+
def setup_supabase(self):
|
75 |
+
"""Initialize Supabase client with error handling"""
|
76 |
+
try:
|
77 |
+
self.supabase = create_client(
|
78 |
+
os.getenv("NEXT_PUBLIC_SUPABASE_URL"),
|
79 |
+
os.getenv("NEXT_PUBLIC_SUPABASE_SERVICE_KEY")
|
80 |
+
)
|
81 |
+
self.storage_bucket = os.getenv("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
|
82 |
+
# Test connection
|
83 |
+
self.supabase.storage.list_buckets()
|
84 |
+
self.logger.info("Successfully connected to Supabase")
|
85 |
+
except Exception as e:
|
86 |
+
self.logger.error(f"Supabase connection failed: {str(e)}")
|
87 |
+
raise
|
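setup_supabase reads three environment variables by name. A small pre-flight check (a sketch, not part of the commit) can fail fast before the spider is constructed; the variable names come from the code above, the values are deployment-specific:

    import os
    from dotenv import load_dotenv

    load_dotenv()
    required = [
        "NEXT_PUBLIC_SUPABASE_URL",
        "NEXT_PUBLIC_SUPABASE_SERVICE_KEY",
        "NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET",
    ]
    missing = [name for name in required if not os.getenv(name)]
    if missing:
        raise SystemExit(f"Missing environment variables: {', '.join(missing)}")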
88 |
|
89 |
def start_requests(self):
|
90 |
+
"""Initialize requests with better error handling"""
|
91 |
for url in self.start_urls:
|
92 |
yield scrapy.Request(
|
93 |
url=url,
|
94 |
callback=self.parse_department_homepage,
|
95 |
errback=self.handle_error,
|
96 |
+
headers={'Accept': 'text/html,application/xhtml+xml'},
|
97 |
+
meta={'retry_count': 0}
|
98 |
)
|
99 |
|
100 |
def parse_department_homepage(self, response):
|
101 |
+
"""Enhanced department homepage parsing"""
|
102 |
+
self.session_stats['total_pages'] += 1
|
103 |
+
|
104 |
domain = urlparse(response.url).netloc
|
105 |
department = self.DEPARTMENTS.get(domain, domain)
|
106 |
self.visited_urls.add(response.url)
|
107 |
|
108 |
+
self.logger.info(f"Processing department: {department} ({domain})")
|
109 |
|
110 |
+
# Extract homepage content
|
111 |
homepage_content = self.extract_content(response)
|
112 |
if homepage_content:
|
113 |
+
page_title = self.extract_page_title(response)
|
114 |
self.save_page_content(
|
115 |
response.url,
|
116 |
page_title,
|
|
|
119 |
'Beranda',
|
120 |
homepage_content
|
121 |
)
|
122 |
+
self.session_stats['successful_pages'] += 1
|
123 |
|
124 |
+
# Process navigation with improved detection
|
125 |
nav_elements = self.extract_navigation(response)
|
126 |
for nav_item in nav_elements:
|
127 |
+
if not self.should_follow_link(nav_item['link'], response.url):
|
128 |
continue
|
129 |
|
130 |
full_url = response.urljoin(nav_item['link'])
|
|
|
139 |
'category': category,
|
140 |
'department': department,
|
141 |
'domain': domain,
|
142 |
+
'menu_path': nav_item['text'],
|
143 |
+
'retry_count': 0,
|
144 |
+
'depth': response.meta.get('depth', 0) + 1
|
145 |
},
|
146 |
+
errback=self.handle_error,
|
147 |
+
priority=2 if 'prodi' in nav_item['text'].lower() else 1
|
148 |
)
|
149 |
|
150 |
+
# Process study programs with better detection
|
151 |
study_program_links = self.extract_study_program_links(response)
|
152 |
for prog in study_program_links:
|
153 |
if prog['link'] not in self.visited_urls:
|
|
|
157 |
meta={
|
158 |
'page_title': prog['title'],
|
159 |
'department': department,
|
160 |
+
'domain': domain,
|
161 |
+
'retry_count': 0,
|
162 |
+
'priority': 3 # Higher priority for program pages
|
163 |
},
|
164 |
errback=self.handle_error
|
165 |
)
|
166 |
|
167 |
+
# Process vision & mission with better detection
|
168 |
vision_mission_links = self.extract_vision_mission_links(response)
|
169 |
for vm_link in vision_mission_links:
|
170 |
if vm_link['link'] not in self.visited_urls:
|
|
|
174 |
meta={
|
175 |
'page_title': vm_link['title'],
|
176 |
'department': department,
|
177 |
+
'domain': domain,
|
178 |
+
'retry_count': 0,
|
179 |
+
'priority': 2
|
180 |
},
|
181 |
errback=self.handle_error
|
182 |
)
|
183 |
|
184 |
+
def should_follow_link(self, link, base_url):
|
185 |
+
"""Determine if a link should be followed"""
|
186 |
+
if not link or link.startswith('#') or link.startswith('javascript:'):
|
187 |
+
return False
|
188 |
+
|
189 |
+
parsed_link = urlparse(link)
|
190 |
+
parsed_base = urlparse(base_url)
|
191 |
+
|
192 |
+
# Skip if different domain
|
193 |
+
if parsed_link.netloc and parsed_link.netloc != parsed_base.netloc:
|
194 |
+
return False
|
195 |
+
|
196 |
+
# Skip unwanted file types
|
197 |
+
if re.search(r'\.(jpg|jpeg|png|gif|pdf|docx?|xlsx?|pptx?|zip|rar)$', link.lower()):
|
198 |
+
return False
|
199 |
+
|
200 |
+
# Skip admin/login pages
|
201 |
+
if any(x in link.lower() for x in ['wp-admin', 'wp-login', 'admin', 'login']):
|
202 |
+
return False
|
203 |
+
|
204 |
+
return True
|
205 |
+
|
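Illustrative expectations for should_follow_link, assuming `spider` is an already-constructed PNPDepartmentSpider instance (a sketch; the URLs are made up):

    base = "https://akt.pnp.ac.id/"
    assert spider.should_follow_link("#section", base) is False                   # in-page anchor
    assert spider.should_follow_link("javascript:void(0)", base) is False
    assert spider.should_follow_link("/files/brosur.pdf", base) is False          # document, not a page
    assert spider.should_follow_link("https://facebook.com/pnp", base) is False   # other domain
    assert spider.should_follow_link("/program-studi", base) is True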
206 |
+
def extract_page_title(self, response):
|
207 |
+
"""Extract page title with multiple fallbacks"""
|
208 |
+
title = response.css('h1::text, h1.page-title::text, h1.entry-title::text').get()
|
209 |
+
if not title:
|
210 |
+
title = response.css('title::text').get()
|
211 |
+
if not title:
|
212 |
+
title = response.url.split('/')[-1].replace('-', ' ').title()
|
213 |
+
return self.clean_text(title or 'Untitled Page')
|
214 |
+
|
215 |
def extract_navigation(self, response):
|
216 |
+
"""Improved navigation extraction with BeautifulSoup"""
|
217 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
218 |
nav_items = []
|
219 |
|
220 |
+
# Common navigation patterns
|
221 |
+
nav_patterns = [
|
222 |
+
{'name': 'nav'},  # 'name' targets the <nav> element itself; a 'tag' keyword would filter on a tag="..." attribute
|
223 |
+
{'class': 'navbar'},
|
224 |
+
{'class': 'navigation'},
|
225 |
+
{'class': 'main-menu'},
|
226 |
+
{'id': 'menu'},
|
227 |
+
{'class': 'primary-menu'},
|
228 |
+
{'role': 'navigation'}
|
229 |
]
|
230 |
|
231 |
+
for pattern in nav_patterns:
|
232 |
+
nav = soup.find(**pattern)
|
233 |
+
if nav:
|
234 |
+
for link in nav.find_all('a', href=True):
|
235 |
+
text = self.clean_text(link.get_text())
|
236 |
+
href = link['href']
|
237 |
+
|
238 |
+
if text and href and len(text) > 1 and not self.is_social_media_link(href):
|
239 |
nav_items.append({
|
240 |
+
'text': text,
|
241 |
+
'link': href
|
242 |
})
|
243 |
|
244 |
+
# Fallback to CSS selectors if BeautifulSoup finds nothing
|
245 |
+
if not nav_items:
|
246 |
+
for link in response.css('a'):
|
247 |
+
text = self.clean_text(' '.join(link.css('::text').getall()))
|
248 |
+
href = link.css('::attr(href)').get()
|
249 |
+
|
250 |
+
if text and href and len(text) > 1 and not self.is_social_media_link(href):
|
251 |
+
nav_items.append({
|
252 |
+
'text': text,
|
253 |
+
'link': href
|
254 |
+
})
|
255 |
+
|
256 |
return nav_items
|
257 |
|
258 |
def extract_study_program_links(self, response):
|
259 |
+
"""Enhanced study program link extraction"""
|
260 |
program_links = []
|
261 |
|
262 |
+
# Try BeautifulSoup first
|
263 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
264 |
+
|
265 |
+
# Look for common patterns in menu items
|
266 |
+
program_texts = ['program studi', 'prodi', 'jurusan', 'program pendidikan']
|
267 |
+
|
268 |
+
for text in program_texts:
|
269 |
+
menu_items = soup.find_all(lambda tag: tag.name == 'a' and text in tag.get_text().lower())
|
270 |
+
for item in menu_items:
|
271 |
+
href = item.get('href')
|
272 |
+
if href:
|
273 |
+
program_links.append({
|
274 |
+
'title': self.clean_text(item.get_text()),
|
275 |
+
'link': response.urljoin(href)
|
276 |
+
})
|
277 |
+
|
278 |
+
# Fallback to XPath if needed
|
279 |
+
if not program_links:
|
280 |
+
xpath = "//a[contains(translate(., 'PROGRAMSTUDI', 'programstudi'), 'program studi') or contains(., 'Prodi')]"
|
281 |
+
for link in response.xpath(xpath):
|
282 |
program_links.append({
|
283 |
+
'title': self.clean_text(''.join(link.xpath('.//text()').getall())),
|
284 |
+
'link': response.urljoin(link.xpath('@href').get())
|
285 |
})
|
|
|
|
|
|
|
|
|
286 |
|
287 |
+
# Deduplicate
|
288 |
+
seen = set()
|
289 |
+
unique_links = []
|
290 |
+
for prog in program_links:
|
291 |
+
if prog['link'] not in seen:
|
292 |
+
seen.add(prog['link'])
|
293 |
+
unique_links.append(prog)
|
294 |
+
|
295 |
+
return unique_links
|
296 |
|
297 |
def extract_vision_mission_links(self, response):
|
298 |
+
"""Improved vision & mission link detection"""
|
299 |
vm_links = []
|
300 |
|
301 |
+
# Terms in multiple languages
|
302 |
+
terms = ['visi', 'misi', 'vision', 'mission', 'tujuan', 'goal']
|
303 |
|
304 |
+
# Check both link text and URLs
|
305 |
+
for term in terms:
|
306 |
+
# Links containing the term in text
|
307 |
for link in response.css(f'a:contains("{term}")'):
|
308 |
text = self.clean_text(' '.join(link.css('::text').getall()))
|
309 |
url = link.css('::attr(href)').get()
|
310 |
|
311 |
if text and url:
|
312 |
vm_links.append({
|
313 |
+
'title': text,
|
314 |
'link': response.urljoin(url)
|
315 |
})
|
316 |
+
|
317 |
+
# Links with term in URL
|
318 |
+
for link in response.css(f'a[href*="{term}"]'):
|
319 |
+
if link not in vm_links:
|
320 |
+
text = self.clean_text(' '.join(link.css('::text').getall()))
|
321 |
+
url = link.css('::attr(href)').get()
|
322 |
+
|
323 |
+
if text and url:
|
324 |
+
vm_links.append({
|
325 |
+
'title': text,
|
326 |
+
'link': response.urljoin(url)
|
327 |
+
})
|
328 |
|
329 |
return vm_links
|
330 |
|
331 |
def parse_content_page(self, response):
|
332 |
+
"""Enhanced content page parsing"""
|
333 |
+
self.session_stats['total_pages'] += 1
|
334 |
meta = response.meta
|
335 |
self.visited_urls.add(response.url)
|
336 |
|
337 |
+
# Extract content with improved methods
|
338 |
+
content = self.extract_structured_content(response)
|
339 |
|
340 |
if content:
|
341 |
self.save_page_content(
|
|
|
347 |
content,
|
348 |
meta.get('menu_path', '')
|
349 |
)
|
350 |
+
self.session_stats['successful_pages'] += 1
|
351 |
+
|
352 |
+
# Extract and follow internal links with better filtering
|
353 |
+
if response.meta.get('depth', 0) < 3: # Limit depth
|
354 |
+
internal_links = self.extract_internal_links(response, meta['domain'])
|
355 |
+
for link in internal_links:
|
356 |
+
if link['url'] not in self.visited_urls:
|
357 |
+
yield scrapy.Request(
|
358 |
+
url=link['url'],
|
359 |
+
callback=self.parse_content_page,
|
360 |
+
meta={
|
361 |
+
'page_title': link['text'],
|
362 |
+
'category': meta['category'], # Inherit parent category
|
363 |
+
'department': meta['department'],
|
364 |
+
'domain': meta['domain'],
|
365 |
+
'menu_path': f"{meta.get('menu_path', '')} > {link['text']}",
|
366 |
+
'retry_count': 0,
|
367 |
+
'depth': response.meta.get('depth', 0) + 1
|
368 |
+
},
|
369 |
+
errback=self.handle_error,
|
370 |
+
priority=1
|
371 |
+
)
|
372 |
+
|
373 |
+
def extract_structured_content(self, response):
|
374 |
+
"""Extract content in a more structured way using both CSS and XPath"""
|
375 |
+
content = {
|
376 |
+
"metadata": {
|
377 |
+
"title": self.extract_page_title(response),
|
378 |
+
"url": response.url,
|
379 |
+
"timestamp": datetime.now().isoformat(),
|
380 |
+
"department": response.meta.get('department', ''),
|
381 |
+
"domain": response.meta.get('domain', '')
|
382 |
+
},
|
383 |
+
"sections": [],
|
384 |
+
"files": [],
|
385 |
+
"tables": []
|
386 |
+
}
|
387 |
+
|
388 |
+
# Use BeautifulSoup for better HTML parsing
|
389 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
390 |
+
|
391 |
+
# Remove unwanted elements
|
392 |
+
for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'form']):
|
393 |
+
element.decompose()
|
394 |
+
|
395 |
+
# Extract main content areas
|
396 |
+
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile('content|main')) or soup
|
397 |
+
|
398 |
+
# Process headings and content hierarchy
|
399 |
+
current_section = {}
|
400 |
+
for element in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'table']):
|
401 |
+
if element.name.startswith('h'):
|
402 |
+
# If we have a current section with content, add it first
|
403 |
+
if current_section and current_section.get('content'):
|
404 |
+
content['sections'].append(current_section)
|
405 |
+
|
406 |
+
# Start new section
|
407 |
+
current_section = {
|
408 |
+
"heading": self.clean_text(element.get_text()),
|
409 |
+
"level": int(element.name[1]),
|
410 |
+
"content": []
|
411 |
+
}
|
412 |
+
else:
|
413 |
+
if not current_section:
|
414 |
+
current_section = {
|
415 |
+
"heading": "Content",
|
416 |
+
"level": 2,
|
417 |
+
"content": []
|
418 |
+
}
|
419 |
+
|
420 |
+
if element.name == 'p':
|
421 |
+
text = self.clean_text(element.get_text())
|
422 |
+
if text and len(text) > 20:
|
423 |
+
current_section['content'].append({
|
424 |
+
"type": "paragraph",
|
425 |
+
"text": text
|
426 |
+
})
|
427 |
+
elif element.name in ['ul', 'ol']:
|
428 |
+
items = [self.clean_text(li.get_text()) for li in element.find_all('li')]
|
429 |
+
if items:
|
430 |
+
current_section['content'].append({
|
431 |
+
"type": "list",
|
432 |
+
"style": "ordered" if element.name == 'ol' else "unordered",
|
433 |
+
"items": items
|
434 |
+
})
|
435 |
+
elif element.name == 'table':
|
436 |
+
table_data = self.extract_table_data(element)
|
437 |
+
if table_data:
|
438 |
+
content['tables'].append(table_data)
|
439 |
+
|
440 |
+
# Add the last section if it exists
|
441 |
+
if current_section and current_section.get('content'):
|
442 |
+
content['sections'].append(current_section)
|
443 |
+
|
444 |
+
# Extract files and downloads
|
445 |
+
for link in main_content.find_all('a', href=True):
|
446 |
+
href = link['href']
|
447 |
+
if re.search(r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar)$', href.lower()):
|
448 |
+
content['files'].append({
|
449 |
+
"title": self.clean_text(link.get_text()) or "Unduhan",
|
450 |
+
"url": response.urljoin(href),
|
451 |
+
"type": href.split('.')[-1].lower()
|
452 |
+
})
|
453 |
+
|
454 |
+
return content if (content['sections'] or content['tables']) else None
|
455 |
+
|
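Rough shape of the document extract_structured_content builds and save_page_content later uploads as JSON; the field names come from the code above, the values here are illustrative only:

    example = {
        "metadata": {
            "title": "Program Studi",
            "url": "https://akt.pnp.ac.id/program-studi",
            "timestamp": "2025-01-01T00:00:00",
            "department": "Akuntansi",
            "domain": "akt.pnp.ac.id",
        },
        "sections": [
            {
                "heading": "Program Studi",
                "level": 2,
                "content": [
                    {"type": "paragraph", "text": "..."},
                    {"type": "list", "style": "unordered", "items": ["D3 Akuntansi"]},
                ],
            },
        ],
        "files": [{"title": "Brosur", "url": "https://akt.pnp.ac.id/brosur.pdf", "type": "pdf"}],
        "tables": [],
    }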
456 |
+
def extract_table_data(self, table_element):
|
457 |
+
"""Extract structured table data"""
|
458 |
+
table_data = {
|
459 |
+
"headers": [],
|
460 |
+
"rows": []
|
461 |
+
}
|
462 |
+
|
463 |
+
# Extract headers from thead if exists
|
464 |
+
thead = table_element.find('thead')
|
465 |
+
if thead:
|
466 |
+
for th in thead.find_all(['th', 'td']):
|
467 |
+
table_data['headers'].append(self.clean_text(th.get_text()))
|
468 |
+
|
469 |
+
# Extract rows from tbody or directly from table
|
470 |
+
tbody = table_element.find('tbody') or table_element
|
471 |
+
for tr in tbody.find_all('tr'):
|
472 |
+
row = []
|
473 |
+
for cell in tr.find_all(['td', 'th']):
|
474 |
+
# Handle cell content with possible links
|
475 |
+
cell_text = self.clean_text(cell.get_text())
|
476 |
+
links = [{'text': self.clean_text(a.get_text()), 'url': a['href']}
|
477 |
+
for a in cell.find_all('a', href=True)]
|
478 |
+
|
479 |
+
row.append({
|
480 |
+
"text": cell_text,
|
481 |
+
"links": links
|
482 |
+
})
|
483 |
+
|
484 |
+
if row:
|
485 |
+
table_data['rows'].append(row)
|
486 |
+
|
487 |
+
return table_data if table_data['rows'] else None
|
488 |
+
|
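A quick check of extract_table_data on a small table, assuming `spider` is a constructed PNPDepartmentSpider instance (a sketch):

    from bs4 import BeautifulSoup

    html = """<table><thead><tr><th>Prodi</th><th>Jenjang</th></tr></thead>
    <tbody><tr><td>Akuntansi</td><td>D3</td></tr></tbody></table>"""
    table = BeautifulSoup(html, "html.parser").find("table")
    print(spider.extract_table_data(table))
    # -> {'headers': ['Prodi', 'Jenjang'],
    #     'rows': [[{'text': 'Akuntansi', 'links': []}, {'text': 'D3', 'links': []}]]}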
489 |
+
def extract_internal_links(self, response, domain):
|
490 |
+
"""Extract internal links with better filtering"""
|
491 |
+
internal_links = []
|
492 |
|
493 |
+
for link in response.css('a[href]'):
|
494 |
+
text = self.clean_text(' '.join(link.css('::text').getall()))
|
495 |
+
href = link.css('::attr(href)').get()
|
496 |
+
|
497 |
+
if not text or not href:
|
498 |
+
continue
|
499 |
+
|
500 |
+
full_url = response.urljoin(href)
|
501 |
+
parsed_url = urlparse(full_url)
|
502 |
|
503 |
+
# Only follow links from the same domain
|
504 |
+
if parsed_url.netloc == domain:
|
505 |
+
if not self.is_unwanted_url(full_url) and full_url not in self.visited_urls:
|
506 |
+
internal_links.append({
|
507 |
+
'text': text,
|
508 |
+
'url': full_url
|
509 |
+
})
|
510 |
+
|
511 |
+
return internal_links
|
512 |
|
513 |
def parse_study_program(self, response):
|
514 |
+
"""Enhanced study program parsing"""
|
515 |
+
self.session_stats['total_pages'] += 1
|
516 |
meta = response.meta
|
517 |
self.visited_urls.add(response.url)
|
518 |
|
519 |
department = meta['department']
|
520 |
+
program_title = self.extract_page_title(response)
|
521 |
|
522 |
+
# Extract program details with improved methods
|
523 |
program_details = self.extract_program_details(response)
|
|
|
524 |
|
525 |
# Add to the study programs collection
|
526 |
self.study_programs[department].append({
|
|
|
530 |
})
|
531 |
|
532 |
# Also save as a regular page
|
533 |
+
content = self.extract_structured_content(response)
|
534 |
if content:
|
535 |
self.save_page_content(
|
536 |
response.url,
|
|
|
540 |
'Program_Studi',
|
541 |
content
|
542 |
)
|
543 |
+
self.session_stats['successful_pages'] += 1
|
544 |
|
545 |
def extract_program_details(self, response):
|
546 |
+
"""Enhanced program details extraction"""
|
547 |
+
details = {}
|
548 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
549 |
|
550 |
+
# Degree detection from multiple sources
|
551 |
degree_sources = [
|
552 |
+
soup.title.string if soup.title else None,
|
553 |
+
soup.h1.get_text() if soup.h1 else None,
|
554 |
+
' '.join(soup.find(class_=re.compile('breadcrumb')).stripped_strings) if soup.find(class_=re.compile('breadcrumb')) else None
|
555 |
]
|
556 |
|
557 |
degree_pattern = re.compile(
|
|
|
564 |
details['degree'] = match.group(1).upper()
|
565 |
break
|
566 |
|
567 |
+
# Extract accreditation status with better pattern matching
|
568 |
+
accreditation_texts = [
|
569 |
+
'akreditasi',
|
570 |
+
'peringkat',
|
571 |
+
'status akreditasi',
|
572 |
+
'sertifikasi'
|
573 |
+
]
|
574 |
+
|
575 |
+
for text in accreditation_texts:
|
576 |
+
element = soup.find(string=re.compile(text, re.IGNORECASE))
|
577 |
+
if element:
|
578 |
+
# Look for the accreditation value in nearby elements
|
579 |
+
parent = element.find_parent()
|
580 |
+
siblings = [sib for sib in parent.next_siblings if isinstance(sib, str) or sib.name]
|
581 |
+
|
582 |
+
for sib in siblings:
|
583 |
+
if isinstance(sib, str):
|
584 |
+
if match := re.search(r'[A-Z]', sib):
|
585 |
+
details['accreditation'] = match.group()
|
586 |
+
break
|
587 |
+
elif sib.name:
|
588 |
+
if match := re.search(r'[A-Z]', sib.get_text()):
|
589 |
+
details['accreditation'] = match.group()
|
590 |
+
break
|
591 |
+
|
592 |
+
if 'accreditation' in details:
|
593 |
+
break
|
594 |
+
|
595 |
+
# Extract description from the first meaningful paragraph
|
596 |
+
for p in soup.find_all('p'):
|
597 |
+
text = self.clean_text(p.get_text())
|
598 |
+
if text and len(text) > 50 and not any(x in text.lower() for x in ['copyright', 'hak cipta']):
|
599 |
+
details['description'] = text
|
600 |
+
break
|
601 |
+
|
602 |
return details
|
603 |
|
604 |
def parse_vision_mission(self, response):
|
605 |
+
"""Enhanced vision & mission parsing"""
|
606 |
+
self.session_stats['total_pages'] += 1
|
607 |
meta = response.meta
|
608 |
self.visited_urls.add(response.url)
|
609 |
department = meta['department']
|
610 |
|
611 |
+
# Use BeautifulSoup for better content extraction
|
612 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
613 |
|
614 |
+
# Find vision and mission sections
|
615 |
+
vision_text = self.find_section_text(soup, ['visi', 'vision'])
|
616 |
+
mission_text = self.find_section_text(soup, ['misi', 'mission'])
|
|
617 |
|
618 |
+
# Find mission items if presented as list
|
619 |
+
mission_items = []
|
620 |
+
mission_list = self.find_mission_list(soup)
|
621 |
+
if mission_list:
|
622 |
+
mission_items = [self.clean_text(li.get_text()) for li in mission_list.find_all('li')]
|
|
623 |
|
624 |
+
# Store in department info
|
625 |
+
if vision_text or mission_text or mission_items:
|
626 |
if vision_text:
|
627 |
self.department_info[department]['vision'] = vision_text
|
628 |
if mission_text:
|
629 |
self.department_info[department]['mission'] = mission_text
|
630 |
+
if mission_items:
|
631 |
+
self.department_info[department]['mission_items'] = mission_items
|
632 |
|
633 |
+
# Save as separate file
|
634 |
self.save_vision_mission(
|
635 |
department,
|
636 |
meta['domain'],
|
637 |
vision_text,
|
638 |
mission_text,
|
639 |
+
mission_items,
|
640 |
response.url
|
641 |
)
|
642 |
|
643 |
# Also save as a regular page
|
644 |
+
content = self.extract_structured_content(response)
|
645 |
if content:
|
646 |
self.save_page_content(
|
647 |
response.url,
|
|
|
651 |
'Profil',
|
652 |
content
|
653 |
)
|
654 |
+
self.session_stats['successful_pages'] += 1
|
655 |
|
656 |
+
def find_section_text(self, soup, keywords):
|
657 |
+
"""Find section text based on keywords"""
|
658 |
+
for keyword in keywords:
|
659 |
+
# Look for headings containing the keyword
|
660 |
+
for heading in soup.find_all(['h1', 'h2', 'h3', 'h4']):
|
661 |
+
if keyword.lower() in heading.get_text().lower():
|
662 |
+
# Get the next paragraph or div
|
663 |
+
next_node = heading.next_sibling
|
664 |
+
while next_node:
|
665 |
+
if next_node.name in ['p', 'div']:
|
666 |
+
text = self.clean_text(next_node.get_text())
|
667 |
+
if text:
|
668 |
+
return text
|
669 |
+
next_node = next_node.next_sibling
|
670 |
+
|
671 |
+
return None
|
672 |
+
|
673 |
+
def find_mission_list(self, soup):
|
674 |
+
"""Find mission items presented as list"""
|
675 |
+
for keyword in ['misi', 'mission']:
|
676 |
+
# Look for headings containing the keyword
|
677 |
+
for heading in soup.find_all(['h1', 'h2', 'h3', 'h4']):
|
678 |
+
if keyword.lower() in heading.get_text().lower():
|
679 |
+
# Find the next ul or ol element
|
680 |
+
next_node = heading.next_sibling
|
681 |
+
while next_node:
|
682 |
+
if next_node.name in ['ul', 'ol']:
|
683 |
+
return next_node
|
684 |
+
next_node = next_node.next_sibling
|
685 |
+
|
686 |
+
return None
|
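How the two helpers above behave on a typical profile-page fragment, assuming `spider` is a constructed PNPDepartmentSpider instance (a sketch):

    from bs4 import BeautifulSoup

    html = "<h2>Visi</h2><p>Menjadi jurusan unggul.</p><h2>Misi</h2><ul><li>Mendidik</li><li>Meneliti</li></ul>"
    soup = BeautifulSoup(html, "html.parser")

    print(spider.find_section_text(soup, ["visi", "vision"]))   # -> "Menjadi jurusan unggul."
    items = spider.find_mission_list(soup)
    print([li.get_text() for li in items.find_all("li")])       # -> ['Mendidik', 'Meneliti']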
687 |
|
688 |
def save_page_content(self, url, title, department, domain, category, content, menu_path=''):
|
689 |
+
"""Save page content with improved formatting"""
|
690 |
if not content or not title:
|
691 |
return
|
692 |
|
693 |
+
# Generate filename with department prefix
|
694 |
safe_title = re.sub(r'[^\w\s-]', '', title).strip().lower()
|
695 |
+
safe_title = re.sub(r'[-\s]+', '-', safe_title)[:100] # Limit length
|
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = f"{department}_{safe_title}_{timestamp}.json"
 
         try:
+            # Convert content to JSON string
+            import json
+            content_str = json.dumps(content, ensure_ascii=False, indent=2)
+
+            # Upload to Supabase
             upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
                 path=filename,
+                file=content_str.encode('utf-8'),
+                file_options={"content-type": "application/json", "x-upsert": "true"}
             )
 
             self.logger.info(f"Successfully uploaded {filename}")
 
+            # Store in our collection
             self.department_data[department][category].append({
                 'title': title,
                 'url': url,
+                'filename': filename,
+                'timestamp': timestamp
             })
 
         except Exception as e:
             self.logger.error(f"Upload failed for {filename}: {str(e)}")
+            self.failed_urls.add(url)
 
     def save_vision_mission(self, department, domain, vision, mission, mission_items, url):
+        """Save vision & mission with improved formatting"""
+        filename = f"{department}_Visi_Misi.json"
+
+        content = {
+            "department": department,
+            "domain": domain,
+            "url": url,
+            "timestamp": datetime.now().isoformat(),
+            "vision": vision,
+            "mission": mission,
+            "mission_items": mission_items
+        }
 
         try:
+            import json
+            content_str = json.dumps(content, ensure_ascii=False, indent=2)
+
             upload_response = self.supabase.storage.from_(self.storage_bucket).upload(
                 path=filename,
+                file=content_str.encode('utf-8'),
+                file_options={"content-type": "application/json", "x-upsert": "true"}
             )
 
+            self.logger.info(f"Successfully uploaded vision & mission for {department}")
         except Exception as e:
+            self.logger.error(f"Failed to upload vision & mission for {department}: {str(e)}")
+            self.failed_urls.add(url)
+
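For context, a minimal sketch (not part of this commit) of the Supabase Storage upsert pattern that both save_page_content() and save_vision_mission() rely on. The environment variable names, bucket name, and payload below are placeholders; the spider reads its own configuration elsewhere in the file.

import json
import os
from supabase import create_client

# Placeholders: real values come from the Space's environment/secrets.
supabase = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])

payload = {"department": "Contoh_Jurusan", "vision": "...", "mission_items": []}
supabase.storage.from_("documents").upload(
    path="Contoh_Jurusan_Visi_Misi.json",
    file=json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8"),
    # "x-upsert": "true" lets a re-crawl overwrite the previously stored object
    file_options={"content-type": "application/json", "x-upsert": "true"},
)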
     def clean_text(self, text):
+        """Improved text cleaning with normalization"""
         if not text:
             return ""
 
         # Normalize unicode characters
         text = unicodedata.normalize('NFKC', text)
 
+        # Remove control characters
+        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
 
+        # Replace multiple spaces/newlines with single space
+        text = re.sub(r'\s+', ' ', text)
 
+        # Remove leading/trailing whitespace
+        text = text.strip()
 
+        return text
 
     def determine_category(self, menu_text):
+        """Enhanced category determination"""
         menu_lower = menu_text.lower()
 
+        category_mapping = [
+            (['beranda', 'home', 'utama'], 'Beranda'),
+            (['profil', 'profile', 'tentang', 'about', 'sejarah', 'history'], 'Profil'),
+            (['program', 'studi', 'prodi', 'jurusan', 'kurikulum'], 'Program_Studi'),
+            (['dosen', 'staff', 'pengajar', 'lecturer'], 'Dosen'),
+            (['penelitian', 'research', 'publikasi', 'jurnal'], 'Penelitian'),
+            (['mahasiswa', 'student', 'alumni'], 'Mahasiswa'),
+            (['fasilitas', 'lab', 'laboratorium'], 'Fasilitas'),
+            (['pengumuman', 'berita', 'news', 'agenda'], 'Informasi'),
+            (['kerjasama', 'partnership', 'mitra'], 'Kerjasama'),
+            (['dokumen', 'download', 'unduhan'], 'Dokumen')
+        ]
 
+        for keywords, category in category_mapping:
+            if any(keyword in menu_lower for keyword in keywords):
                 return category
 
         return 'Lainnya'
 
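A small sketch (not part of this commit) illustrating the first-match semantics of the mapping above: ordering matters, so a menu label like "Profil Jurusan" resolves to 'Profil' even though "jurusan" also appears under 'Program_Studi'. The mapping here is deliberately trimmed and the helper name is hypothetical.

category_mapping = [
    (['profil', 'tentang', 'sejarah'], 'Profil'),
    (['program', 'studi', 'prodi', 'jurusan'], 'Program_Studi'),
]

def categorize(menu_text: str) -> str:
    # First keyword group that matches wins; fall back to 'Lainnya'.
    menu_lower = menu_text.lower()
    for keywords, category in category_mapping:
        if any(keyword in menu_lower for keyword in keywords):
            return category
    return 'Lainnya'

print(categorize("Profil Jurusan"))     # Profil
print(categorize("Program Studi D3"))   # Program_Studi
print(categorize("Kalender Akademik"))  # Lainnya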
     def is_social_media_link(self, url):
+        """Check if URL is social media with better pattern matching"""
+        social_domains = [
+            'facebook.com', 'twitter.com', 'instagram.com',
+            'youtube.com', 'linkedin.com', 'tiktok.com',
+            'whatsapp.com', 'wa.me', 'telegram.me'
         ]
+
+        if not url:
+            return False
+
+        parsed = urlparse(url.lower())
+        if not parsed.netloc:
+            return False
+
+        return any(domain in parsed.netloc for domain in social_domains)
 
     def is_unwanted_url(self, url):
+        """Improved unwanted URL detection"""
+        if not url:
             return True
 
         unwanted_patterns = [
+            r'\.(jpg|jpeg|png|gif|svg|ico|css|js|pdf|docx?|xlsx?|pptx?|zip|rar)$',
+            r'(login|logout|signin|signup|register|admin|wp-|/wp/|wordpress|comment|feed|rss|atom)',
+            r'(javascript:|mailto:|tel:|#)',
+            r'(page/\d+|tag/|author/|archive/|category/|search|kalender|ajax|api)'
         ]
 
+        url_lower = url.lower()
+        return any(re.search(pattern, url_lower) for pattern in unwanted_patterns)
 
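A standalone sketch (not part of this commit) exercising the same unwanted-URL patterns on two illustrative URLs; both URLs are placeholders, not real department pages.

import re

unwanted_patterns = [
    r'\.(jpg|jpeg|png|gif|svg|ico|css|js|pdf|docx?|xlsx?|pptx?|zip|rar)$',
    r'(login|logout|signin|signup|register|admin|wp-|/wp/|wordpress|comment|feed|rss|atom)',
    r'(javascript:|mailto:|tel:|#)',
    r'(page/\d+|tag/|author/|archive/|category/|search|kalender|ajax|api)',
]

def is_unwanted(url: str) -> bool:
    # Same check the spider performs: any pattern hit marks the URL as unwanted.
    url_lower = url.lower()
    return any(re.search(p, url_lower) for p in unwanted_patterns)

print(is_unwanted("https://example.ac.id/wp-content/brosur.pdf"))  # True (file extension and wp- both match)
print(is_unwanted("https://example.ac.id/profil-jurusan"))         # False (no pattern matches)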
     def handle_error(self, failure):
+        """Enhanced error handling with retry logic"""
         url = failure.request.url
+        meta = failure.request.meta
+        retry_count = meta.get('retry_count', 0)
+
+        self.logger.error(f"Request failed ({retry_count}): {url} - {str(failure.value)}")
+        self.session_stats['failed_pages'] += 1
+        self.failed_urls.add(url)
+
+        # Retry logic
+        if retry_count < self.custom_settings.get('RETRY_TIMES', 2):
+            self.logger.info(f"Retrying {url} (attempt {retry_count + 1})")
+            new_request = failure.request.copy()
+            new_request.meta['retry_count'] = retry_count + 1
+            new_request.dont_filter = True
+            return new_request
 
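A sketch (not part of this commit) of how an errback like handle_error() is typically attached to requests in Scrapy; returning a copied Request from the errback schedules it again, which is the retry approach used above. The spider name, URL, and retry cap below are illustrative only.

import scrapy

class RetryDemoSpider(scrapy.Spider):
    name = "retry_demo"

    def start_requests(self):
        # Placeholder URL; the real spider iterates its department domains.
        yield scrapy.Request(
            "https://example.org/",
            callback=self.parse,
            errback=self.handle_error,
        )

    def parse(self, response):
        self.logger.info("fetched %s", response.url)

    def handle_error(self, failure):
        # Returning a copied Request from the errback re-queues it.
        request = failure.request.copy()
        request.meta["retry_count"] = request.meta.get("retry_count", 0) + 1
        request.dont_filter = True
        if request.meta["retry_count"] <= 2:
            return request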
     def closed(self, reason):
+        """Enhanced closing method with comprehensive reporting"""
+        self.logger.info("Spider closed. Generating final reports...")
+
+        # Generate summary statistics
+        summary = {
+            "total_pages": self.session_stats['total_pages'],
+            "successful_pages": self.session_stats['successful_pages'],
+            "failed_pages": self.session_stats['failed_pages'],
+            "success_rate": (self.session_stats['successful_pages'] / self.session_stats['total_pages']) * 100 if self.session_stats['total_pages'] > 0 else 0,
+            "departments_crawled": len(self.department_data),
+            "programs_found": sum(len(progs) for progs in self.study_programs.values()),
+            "failed_urls": list(self.failed_urls)
+        }
+
+        # Upload summary report
+        self.upload_summary_report(summary)
 
+        # Generate department summary
+        self.generate_department_summary()
 
+        self.logger.info(f"Crawling completed. Success rate: {summary['success_rate']:.2f}%")
+
+    def upload_summary_report(self, summary):
+        """Upload comprehensive summary report"""
+        filename = "crawling_summary_report.json"
 
+        try:
+            import json
+            content = json.dumps(summary, indent=2)
 
+            self.supabase.storage.from_(self.storage_bucket).upload(
+                path=filename,
+                file=content.encode('utf-8'),
+                file_options={"content-type": "application/json", "x-upsert": "true"}
+            )
+            self.logger.info("Successfully uploaded summary report")
+        except Exception as e:
+            self.logger.error(f"Failed to upload summary report: {str(e)}")
 
+    def generate_department_summary(self):
+        """Generate detailed department summary"""
+        content = "# Laporan Lengkap Jurusan Politeknik Negeri Padang\n\n"
+        content += f"**Tanggal**: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n"
 
         # Create reverse mapping from department name to domain
         reverse_departments = {v: k for k, v in self.DEPARTMENTS.items()}
 
         for department, programs in self.study_programs.items():
             domain = reverse_departments.get(department, '')
+            website_url = f'https://{domain}' if domain else ''
 
             content += f"## {department.replace('_', ' ')}\n"
+            content += f"**Website**: {website_url}\n"
+
+            # Add vision and mission if available
+            if department in self.department_info:
+                if 'vision' in self.department_info[department]:
+                    content += f"\n### Visi\n{self.department_info[department]['vision']}\n"
+
+                if 'mission' in self.department_info[department]:
+                    content += f"\n### Misi\n{self.department_info[department]['mission']}\n"
+                elif 'mission_items' in self.department_info[department]:
+                    content += "\n### Misi\n"
+                    for i, item in enumerate(self.department_info[department]['mission_items'], 1):
+                        content += f"{i}. {item}\n"
 
+            # Add study programs
             if programs:
+                content += "\n### Program Studi\n"
                 for prog in programs:
+                    content += f"- **{prog['title']}**\n"
+                    content += f" - Jenjang: {prog['details'].get('degree', 'N/A')}\n"
+                    content += f" - Akreditasi: {prog['details'].get('accreditation', 'N/A')}\n"
+                    content += f" - URL: {prog['url']}\n"
 
                     if 'description' in prog['details']:
+                        content += f" - Deskripsi: {prog['details']['description']}\n"
             else:
+                content += "\n### Belum ada informasi program studi\n"
 
             content += "\n---\n\n"
 
+        # Upload department summary
+        filename = "department_summary_report.md"
         try:
             self.supabase.storage.from_(self.storage_bucket).upload(
                 path=filename,
                 file=content.encode('utf-8'),
+                file_options={"content-type": "text/markdown", "x-upsert": "true"}
             )
+            self.logger.info("Successfully uploaded department summary report")
         except Exception as e:
+            self.logger.error(f"Failed to upload department summary: {str(e)}")
 
 
 if __name__ == "__main__":
     process = CrawlerProcess()
     process.crawl(PNPDepartmentSpider)
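The visible hunk ends at process.crawl(...). For reference, a standalone run of this spider would normally also call process.start(), which blocks until the crawl finishes; whether the actual file does this outside the shown diff is not visible here, so treat the block below as a sketch.

from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    # PNPDepartmentSpider is the spider class defined in jurusan_scrap.py.
    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(PNPDepartmentSpider)
    process.start()  # block until crawling finishes and closed() has run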