razaAhmed committed on
Commit
72b4c46
·
1 Parent(s): ed28502

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +52 -0
  2. requirements.txt +56 -0
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import urllib3
5
+
6
def simple_web_scraper(url, scrape_option):
    """Fetch *url* and render the scraped content as a Streamlit table.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    scrape_option : str
        'data'  -> one table row per non-blank line of visible page text.
        'links' -> one table row per anchor tag that carries an href.

    Side effects: writes a table or an error message to the Streamlit
    app. Never raises — all failures are reported via ``st.error``.
    """
    try:
        # Use a urllib3 PoolManager directly; it handles SSL and
        # connection pooling for the single GET we need.
        http = urllib3.PoolManager()
        response = http.request('GET', url)

        # Guard clause: a non-200 response has no body worth parsing.
        if response.status != 200:
            st.error(f'Error: {response.status}')
            return

        soup = BeautifulSoup(response.data, 'html.parser')

        if scrape_option == 'data':
            # Split the page's visible text into non-blank lines,
            # one table row each.
            all_text = soup.get_text()
            table_data = [{'Data': line.strip()}
                          for line in all_text.split('\n') if line.strip()]
            st.table(table_data)
        elif scrape_option == 'links':
            # Skip anchors without an href (e.g. named anchors) so the
            # table never shows None rows.
            table_data = [{'Links': href}
                          for link in soup.find_all('a')
                          if (href := link.get('href')) is not None]
            st.table(table_data)
        else:
            st.error('Invalid scrape option. Please choose "data" or "links".')

    except Exception as e:
        # Top-level boundary for the app: surface network/parse failures
        # to the user instead of crashing the Streamlit script rerun.
        st.error(f'An error occurred: {e}')
45
+
46
# Streamlit UI — collect the target URL and scrape mode, then run the
# scraper when the user presses the button.
st.title("Web Scraping Tool")
website_url = st.text_input("Enter the URL to scrape:")
scrape_option = st.selectbox("Select what to scrape:", ['data', 'links'])

if st.button("Scrape"):
    # Guard against an empty/blank URL, which would otherwise surface
    # as an opaque urllib3 exception from the scraper.
    if website_url.strip():
        simple_web_scraper(website_url, scrape_option)
    else:
        st.warning("Please enter a URL before scraping.")
requirements.txt ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.2.0
2
+ attrs==23.1.0
3
+ beautifulsoup4==4.12.2
4
+ blinker==1.7.0
5
+ cachetools==5.3.2
6
+ certifi==2023.11.17
7
+ charset-normalizer==3.3.2
8
+ click==8.1.7
9
+ filelock==3.13.1
10
+ fsspec==2023.12.2
11
+ gitdb==4.0.11
12
+ GitPython==3.1.40
13
+ huggingface-hub==0.19.4
14
+ idna==3.6
15
+ importlib-metadata==6.11.0
16
+ Jinja2==3.1.2
17
+ jsonschema==4.20.0
18
+ jsonschema-specifications==2023.11.2
19
+ markdown-it-py==3.0.0
20
+ MarkupSafe==2.1.3
21
+ mdurl==0.1.2
22
+ numpy==1.26.2
23
+ packaging==23.2
24
+ pandas==2.1.4
25
+ Pillow==10.1.0
26
+ protobuf==4.25.1
27
+ pyarrow==14.0.1
28
+ pydeck==0.8.1b0
29
+ Pygments==2.17.2
30
+ python-dateutil==2.8.2
31
+ pytz==2023.3.post1
32
+ PyYAML==6.0.1
33
+ referencing==0.32.0
34
+ regex==2023.10.3
35
+ requests==2.31.0
36
+ rich==13.7.0
37
+ rpds-py==0.13.2
38
+ safetensors==0.4.1
39
+ six==1.16.0
40
+ smmap==5.0.1
41
+ soupsieve==2.5
42
+ streamlit==1.29.0
43
+ tenacity==8.2.3
44
+ tokenizers==0.15.0
45
+ toml==0.10.2
46
+ toolz==0.12.0
47
+ tornado==6.4
48
+ tqdm==4.66.1
49
+ transformers==4.36.0
50
+ typing_extensions==4.9.0
51
+ tzdata==2023.3
52
+ tzlocal==5.2
53
+ urllib3==2.1.0
54
+ validators==0.22.0
55
+ watchdog==3.0.0
56
+ zipp==3.17.0