Upload 2 files
- app.py +52 -0
- requirements.txt +56 -0
app.py
ADDED
@@ -0,0 +1,52 @@
+import streamlit as st
+import requests
+from bs4 import BeautifulSoup
+import urllib3
+
+def simple_web_scraper(url, scrape_option):
+    try:
+        # Create a PoolManager with urllib3 to handle SSL
+        http = urllib3.PoolManager()
+
+        # Send an HTTP request
+        response = http.request('GET', url)
+
+        # Check if the request was successful (status code 200)
+        if response.status == 200:
+            # Parse the HTML content of the page
+            soup = BeautifulSoup(response.data, 'html.parser')
+
+            # Extract information from the HTML based on user's choice
+            if scrape_option == 'data':
+                # Extract all text content from the page
+                all_text = soup.get_text()
+
+                # Prepare data for the table (split text by lines)
+                table_data = [{'Data': line.strip()} for line in all_text.split('\n') if line.strip()]
+
+                # Display the data in a table
+                st.table(table_data)
+            elif scrape_option == 'links':
+                # Example: Extract all the links on the page
+                links = soup.find_all('a')
+
+                # Prepare data for the table
+                table_data = [{'Links': link.get('href')} for link in links]
+
+                # Display the data in a table
+                st.table(table_data)
+            else:
+                st.write('Invalid scrape option. Please choose "data" or "links".')
+        else:
+            st.write(f'Error: {response.status}')
+
+    except Exception as e:
+        st.write(f'An error occurred: {e}')
+
+# Streamlit UI
+st.title("Web Scraping Tool")
+website_url = st.text_input("Enter the URL to scrape:")
+scrape_option = st.selectbox("Select what to scrape:", ['data', 'links'])
+
+if st.button("Scrape"):
+    simple_web_scraper(website_url, scrape_option)
requirements.txt
ADDED
@@ -0,0 +1,56 @@
+altair==5.2.0
+attrs==23.1.0
+beautifulsoup4==4.12.2
+blinker==1.7.0
+cachetools==5.3.2
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+filelock==3.13.1
+fsspec==2023.12.2
+gitdb==4.0.11
+GitPython==3.1.40
+huggingface-hub==0.19.4
+idna==3.6
+importlib-metadata==6.11.0
+Jinja2==3.1.2
+jsonschema==4.20.0
+jsonschema-specifications==2023.11.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+mdurl==0.1.2
+numpy==1.26.2
+packaging==23.2
+pandas==2.1.4
+Pillow==10.1.0
+protobuf==4.25.1
+pyarrow==14.0.1
+pydeck==0.8.1b0
+Pygments==2.17.2
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.32.0
+regex==2023.10.3
+requests==2.31.0
+rich==13.7.0
+rpds-py==0.13.2
+safetensors==0.4.1
+six==1.16.0
+smmap==5.0.1
+soupsieve==2.5
+streamlit==1.29.0
+tenacity==8.2.3
+tokenizers==0.15.0
+toml==0.10.2
+toolz==0.12.0
+tornado==6.4
+tqdm==4.66.1
+transformers==4.36.0
+typing_extensions==4.9.0
+tzdata==2023.3
+tzlocal==5.2
+urllib3==2.1.0
+validators==0.22.0
+watchdog==3.0.0
+zipp==3.17.0
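
Note: to try the committed app outside the Space, a minimal sketch (assuming the two files above are saved together in one local directory, which the commit itself does not state) is to install the pinned dependencies and launch the script with Streamlit:

    pip install -r requirements.txt
    streamlit run app.py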