import re
from urllib.parse import urljoin

import requests
import streamlit as st
from bs4 import BeautifulSoup
from fpdf import FPDF
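
# Third-party dependencies (one possible install command; the fpdf import is provided
# by either the fpdf or fpdf2 package on PyPI):
#   pip install requests beautifulsoup4 fpdf2 streamlit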


def clean_text(text: str) -> str:
    """Collapse whitespace and replace characters that FPDF's core fonts cannot render."""
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse runs of whitespace and newlines into single spaces
    # The core FPDF fonts (e.g. Arial) cover Latin-1 only; replace anything outside that
    # range so generate_pdf() does not hit an encoding error.
    return text.encode('latin-1', 'replace').decode('latin-1')
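
# Illustrative behaviour (example strings only, not data from the target site):
#   clean_text("  ACME \n Accounting\t  Ltd.  ")  ->  "ACME Accounting Ltd."
# Characters outside Latin-1 come back as "?" so every entry stays printable in the PDF.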


def _field_text(entry, field_class: str, default: str) -> str:
    """Return the cleaned text of a wpbdp field's value div, or a fallback if it is missing."""
    field = entry.find('div', class_=field_class)
    value = field.find('div', class_='value') if field else None
    return clean_text(value.get_text()) if value else default


def scrape_accountant_data(base_url: str) -> list:
    """Scrape accountant data from all pages of the given directory URL."""
    data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    current_page = base_url

    while current_page:
        response = requests.get(current_page, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract accountant entries; each listing is a div with the wpbdp-listing class
        entries = soup.find_all('div', class_='wpbdp-listing')
        for entry in entries:
            name_link = entry.find('a')
            name = clean_text(name_link.get_text()) if name_link else "No Name"
            address = _field_text(entry, 'wpbdp-field-address', "No Address")
            business_type = _field_text(entry, 'wpbdp-field-business_type', "No Business Type")
            location = _field_text(entry, 'wpbdp-field-location', "No Location")
            tags_field = entry.find('div', class_='wpbdp-field-tags')
            tags = clean_text(", ".join(tag.get_text() for tag in tags_field.find_all('a'))) if tags_field else "No Tags"

            data.append({
                'Name': name,
                'Address': address,
                'Business Type': business_type,
                'Location': location,
                'Tags': tags
            })

        # Follow the pagination's "next" link; stop when there is none
        next_page_link = soup.find('a', class_='next')
        current_page = urljoin(base_url, next_page_link['href']) if next_page_link else None

    return data
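
# Standalone usage sketch (same default URL as the Streamlit input below):
#   rows = scrape_accountant_data(
#       "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/")
#   print(len(rows), rows[:1])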


def generate_pdf(data: list, output_file: str):
    """Generate a PDF from the scraped data."""
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, 'Accountant Directory', ln=True, align='C')
    pdf.ln(10)

    pdf.set_font('Arial', '', 12)

    for entry in data:
        pdf.cell(0, 10, f"Name: {entry['Name']}", ln=True)
        pdf.cell(0, 10, f"Address: {entry['Address']}", ln=True)
        pdf.cell(0, 10, f"Business Type: {entry['Business Type']}", ln=True)
        pdf.cell(0, 10, f"Location: {entry['Location']}", ln=True)
        pdf.cell(0, 10, f"Tags: {entry['Tags']}", ln=True)
        pdf.ln(10)

    pdf.output(output_file)
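
# Note: set_auto_page_break(auto=True, margin=15) starts a new page automatically once a
# row would cross the 15 mm bottom margin, so long directories span multiple pages.
# Usage sketch (hypothetical file name):
#   generate_pdf(rows, "accountant_directory.pdf")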


def main():
    st.title("Accountant Directory Scraper")

    base_url = st.text_input(
        "Enter the URL to scrape:",
        "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/"
    )

    if st.button("Scrape Data"):
        with st.spinner("Scraping data, please wait..."):
            try:
                data = scrape_accountant_data(base_url)
            except requests.RequestException as exc:
                # Surface network/HTTP failures in the UI instead of crashing the app
                st.error(f"Failed to fetch the directory: {exc}")
                return

        if data:
            output_file = "accountant_directory.pdf"
            generate_pdf(data, output_file)
            st.success("Scraping complete! PDF generated.")
            with open(output_file, "rb") as pdf_file:
                st.download_button(
                    label="Download PDF",
                    data=pdf_file,
                    file_name=output_file,
                    mime="application/pdf"
                )
        else:
            st.error("No listings were found at that URL.")


if __name__ == "__main__":
    main()
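
# To launch the app (assuming this script is saved as app.py):
#   streamlit run app.py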