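"""Accountant Directory Scraper.

Streamlit app that scrapes accountant listings from a WPBDP-style
(WordPress Business Directory Plugin) directory page, follows the
pagination links, and exports the results to a PDF. Assumed dependencies,
based on the imports below: requests, beautifulsoup4, fpdf (or fpdf2),
and streamlit.
"""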

import re
from urllib.parse import urljoin

import requests
import streamlit as st
from bs4 import BeautifulSoup
from fpdf import FPDF

def clean_text(text: str) -> str:
    """Clean extracted text by collapsing runs of whitespace."""
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace and newlines
    return text
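
# Helper not in the original script: the original extraction one-liners
# assumed every wpbdp-field-* div contains a child div.value, which raises
# an AttributeError on listings where that inner div is missing.
def extract_field(entry, field_class: str, default: str) -> str:
    """Return the cleaned text of a WPBDP field's value div, or a default."""
    field = entry.find('div', class_=field_class)
    if field:
        value = field.find('div', class_='value')
        if value:
            return clean_text(value.get_text())
    return default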

def scrape_accountant_data(base_url: str) -> list:
    """Scrape accountant data from all pages of the given URL."""
    data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    current_page = base_url
    while current_page:
        response = requests.get(current_page, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract accountant entries
        entries = soup.find_all('div', class_='wpbdp-listing')
        for entry in entries:
            link = entry.find('a')
            name = clean_text(link.get_text()) if link else "No Name"
            # Field values live in a div.value inside each wpbdp-field-* div;
            # extract_field handles listings where either div is missing.
            address = extract_field(entry, 'wpbdp-field-address', "No Address")
            business_type = extract_field(entry, 'wpbdp-field-business_type', "No Business Type")
            location = extract_field(entry, 'wpbdp-field-location', "No Location")
            tags_field = entry.find('div', class_='wpbdp-field-tags')
            tags = clean_text(", ".join(tag.get_text() for tag in tags_field.find_all('a'))) if tags_field else "No Tags"
            data.append({
                'Name': name,
                'Address': address,
                'Business Type': business_type,
                'Location': location,
                'Tags': tags
            })

        # Find the next page link and resolve it relative to the base URL
        next_page_link = soup.find('a', class_='next')
        current_page = urljoin(base_url, next_page_link['href']) if next_page_link else None
    return data

def generate_pdf(data: list, output_file: str):
    """Generate a PDF from the scraped data."""
    def latin1(text: str) -> str:
        # FPDF's built-in Arial font only supports Latin-1; replace any
        # other characters so scraped text cannot raise an encoding error.
        return text.encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, 'Accountant Directory', ln=True, align='C')
    pdf.ln(10)
    pdf.set_font('Arial', '', 12)
    for entry in data:
        pdf.cell(0, 10, latin1(f"Name: {entry['Name']}"), ln=True)
        pdf.cell(0, 10, latin1(f"Address: {entry['Address']}"), ln=True)
        pdf.cell(0, 10, latin1(f"Business Type: {entry['Business Type']}"), ln=True)
        pdf.cell(0, 10, latin1(f"Location: {entry['Location']}"), ln=True)
        pdf.cell(0, 10, latin1(f"Tags: {entry['Tags']}"), ln=True)
        pdf.ln(10)
    pdf.output(output_file)

def main():
    st.title("Accountant Directory Scraper")
    base_url = st.text_input("Enter the URL to scrape:", "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/")
    if st.button("Scrape Data"):
        with st.spinner("Scraping data, please wait..."):
            try:
                data = scrape_accountant_data(base_url)
            except requests.RequestException as exc:
                # Surface network/HTTP errors instead of crashing the app
                st.error(f"Failed to fetch the directory: {exc}")
                return
            if data:
                output_file = "accountant_directory.pdf"
                generate_pdf(data, output_file)
                st.success("Scraping complete! PDF generated.")
                with open(output_file, "rb") as pdf_file:
                    st.download_button(
                        label="Download PDF",
                        data=pdf_file,
                        file_name=output_file,
                        mime="application/pdf"
                    )
            else:
                st.error("No data found to scrape.")


if __name__ == "__main__":
    main()
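
# To launch the app locally (assuming the script is saved as app.py):
#     streamlit run app.py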