import re
from urllib.parse import urljoin

import requests
import streamlit as st
from bs4 import BeautifulSoup
from fpdf import FPDF
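
# Third-party dependencies (one possible install command; the fpdf import is provided
# by either the fpdf or fpdf2 package on PyPI):
#   pip install requests beautifulsoup4 fpdf2 streamlit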


def clean_text(text: str) -> str:
    """Collapse whitespace and replace characters that FPDF's core fonts cannot render."""
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse runs of whitespace and newlines into single spaces
    # The core FPDF fonts (e.g. Arial) cover Latin-1 only; replace anything outside that
    # range so generate_pdf() does not hit an encoding error.
    return text.encode('latin-1', 'replace').decode('latin-1')
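
# Illustrative behaviour (example strings only, not data from the target site):
#   clean_text("  ACME \n Accounting\t  Ltd.  ")  ->  "ACME Accounting Ltd."
# Characters outside Latin-1 come back as "?" so every entry stays printable in the PDF.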


def _field_text(entry, field_class: str, default: str) -> str:
    """Return the cleaned text of a wpbdp field's value div, or a fallback if it is missing."""
    field = entry.find('div', class_=field_class)
    value = field.find('div', class_='value') if field else None
    return clean_text(value.get_text()) if value else default


def scrape_accountant_data(base_url: str) -> list:
    """Scrape accountant data from all pages of the given directory URL."""
    data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    current_page = base_url

    while current_page:
        response = requests.get(current_page, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract accountant entries; each listing is a div with the wpbdp-listing class
        entries = soup.find_all('div', class_='wpbdp-listing')
        for entry in entries:
            name_link = entry.find('a')
            name = clean_text(name_link.get_text()) if name_link else "No Name"
            address = _field_text(entry, 'wpbdp-field-address', "No Address")
            business_type = _field_text(entry, 'wpbdp-field-business_type', "No Business Type")
            location = _field_text(entry, 'wpbdp-field-location', "No Location")
            tags_field = entry.find('div', class_='wpbdp-field-tags')
            tags = clean_text(", ".join(tag.get_text() for tag in tags_field.find_all('a'))) if tags_field else "No Tags"

            data.append({
                'Name': name,
                'Address': address,
                'Business Type': business_type,
                'Location': location,
                'Tags': tags
            })

        # Follow the pagination's "next" link; stop when there is none
        next_page_link = soup.find('a', class_='next')
        current_page = urljoin(base_url, next_page_link['href']) if next_page_link else None

    return data
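
# Standalone usage sketch (same default URL as the Streamlit input below):
#   rows = scrape_accountant_data(
#       "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/")
#   print(len(rows), rows[:1])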


def generate_pdf(data: list, output_file: str):
    """Generate a PDF from the scraped data."""
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, 'Accountant Directory', ln=True, align='C')
    pdf.ln(10)

    pdf.set_font('Arial', '', 12)

    for entry in data:
        pdf.cell(0, 10, f"Name: {entry['Name']}", ln=True)
        pdf.cell(0, 10, f"Address: {entry['Address']}", ln=True)
        pdf.cell(0, 10, f"Business Type: {entry['Business Type']}", ln=True)
        pdf.cell(0, 10, f"Location: {entry['Location']}", ln=True)
        pdf.cell(0, 10, f"Tags: {entry['Tags']}", ln=True)
        pdf.ln(10)

    pdf.output(output_file)
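
# Note: set_auto_page_break(auto=True, margin=15) starts a new page automatically once a
# row would cross the 15 mm bottom margin, so long directories span multiple pages.
# Usage sketch (hypothetical file name):
#   generate_pdf(rows, "accountant_directory.pdf")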


def main():
    st.title("Accountant Directory Scraper")

    base_url = st.text_input(
        "Enter the URL to scrape:",
        "https://www.bangladeshcircle.com/bangladesh-business-directory/wpbdp_category/accountant/"
    )

    if st.button("Scrape Data"):
        with st.spinner("Scraping data, please wait..."):
            try:
                data = scrape_accountant_data(base_url)
            except requests.RequestException as exc:
                # Surface network/HTTP failures in the UI instead of crashing the app
                st.error(f"Failed to fetch the directory: {exc}")
                return

        if data:
            output_file = "accountant_directory.pdf"
            generate_pdf(data, output_file)
            st.success("Scraping complete! PDF generated.")
            with open(output_file, "rb") as pdf_file:
                st.download_button(
                    label="Download PDF",
                    data=pdf_file,
                    file_name=output_file,
                    mime="application/pdf"
                )
        else:
            st.error("No listings were found at that URL.")


if __name__ == "__main__":
    main()
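
# To launch the app (assuming this script is saved as app.py):
#   streamlit run app.py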