Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,24 +1,26 @@
|
|
1 |
-
import logging
|
2 |
-
import random
|
3 |
-
import re
|
4 |
-
import time
|
5 |
from flask import Flask, request, render_template
|
6 |
import requests
|
7 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
app = Flask(__name__)
|
10 |
-
logging.basicConfig(level=logging.DEBUG)
|
11 |
|
12 |
-
# List of
|
13 |
USER_AGENTS = [
|
14 |
-
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
15 |
-
"Mozilla/5.0 (
|
16 |
-
"Mozilla/5.0 (
|
|
|
|
|
|
|
17 |
]
|
18 |
|
19 |
-
def
|
20 |
-
|
21 |
-
return {"User-Agent": random.choice(USER_AGENTS)}
|
22 |
|
23 |
def extract_price(price_str):
|
24 |
"""Extract a numeric value from a price string like '₹1,187'."""
|
@@ -30,40 +32,133 @@ def extract_price(price_str):
|
|
30 |
return float('inf')
|
31 |
return float('inf')
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
@app.route('/', methods=['GET', 'POST'])
|
34 |
def index():
|
35 |
amazon_list = []
|
36 |
flipkart_list = []
|
|
|
|
|
|
|
|
|
37 |
|
38 |
if request.method == 'POST':
|
39 |
product_name = request.form.get('product')
|
40 |
-
|
|
|
41 |
if product_name:
|
42 |
# --------- AMAZON SCRAPING -----------
|
43 |
-
amazon_url = f"https://www.amazon.in/s?k={product_name.replace(' ', '+')}"
|
44 |
try:
|
45 |
-
|
46 |
-
|
|
|
47 |
if response_amazon.status_code == 200:
|
|
|
|
|
|
|
|
|
|
|
48 |
soup_amazon = BeautifulSoup(response_amazon.text, 'html.parser')
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
51 |
for product in products:
|
52 |
-
title_tag = product.
|
53 |
title = title_tag.get_text(strip=True) if title_tag else None
|
54 |
|
55 |
-
img_tag = product.
|
56 |
image_url = img_tag["src"] if img_tag and img_tag.has_attr("src") else ""
|
57 |
|
58 |
price = None
|
59 |
-
price_tag = product.
|
60 |
if price_tag:
|
61 |
-
|
62 |
-
if offscreen:
|
63 |
-
price = offscreen.get_text(strip=True)
|
64 |
|
65 |
product_url = None
|
66 |
-
link_tag = product.
|
67 |
if link_tag and link_tag.has_attr("href"):
|
68 |
product_url = link_tag["href"]
|
69 |
if product_url.startswith("/"):
|
@@ -73,6 +168,7 @@ def index():
|
|
73 |
continue
|
74 |
|
75 |
price_val = extract_price(price)
|
|
|
76 |
amazon_list.append({
|
77 |
"title": title,
|
78 |
"image": image_url,
|
@@ -83,42 +179,59 @@ def index():
|
|
83 |
if len(amazon_list) >= 6:
|
84 |
break
|
85 |
else:
|
86 |
-
|
87 |
except Exception as e:
|
88 |
-
|
89 |
-
|
90 |
-
#
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
|
|
|
|
95 |
try:
|
96 |
-
|
97 |
-
|
|
|
98 |
if response_flip.status_code == 200:
|
|
|
|
|
|
|
|
|
|
|
99 |
soup_flip = BeautifulSoup(response_flip.text, 'html.parser')
|
100 |
-
|
101 |
-
|
|
|
|
|
102 |
for product in flipkart_products:
|
103 |
-
title_tag = product.
|
104 |
title = title_tag.get_text(strip=True) if title_tag else None
|
105 |
|
106 |
product_url = None
|
107 |
if title_tag and title_tag.has_attr("href"):
|
108 |
product_url = title_tag["href"]
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
-
img_tag = product.
|
113 |
image_url = img_tag["src"] if img_tag and img_tag.has_attr("src") else ""
|
114 |
|
115 |
-
|
|
|
116 |
price = price_tag.get_text(strip=True) if price_tag else None
|
117 |
|
118 |
if not title or not price or not product_url:
|
119 |
continue
|
120 |
|
121 |
price_val = extract_price(price)
|
|
|
122 |
flipkart_list.append({
|
123 |
"title": title,
|
124 |
"image": image_url,
|
@@ -129,13 +242,25 @@ def index():
|
|
129 |
if len(flipkart_list) >= 6:
|
130 |
break
|
131 |
else:
|
132 |
-
|
133 |
except Exception as e:
|
134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
-
return render_template('index.html', amazon=amazon_list, flipkart=flipkart_list)
|
139 |
|
140 |
if __name__ == '__main__':
|
141 |
-
app.run(
|
|
|
|
|
|
|
|
|
|
|
1 |
from flask import Flask, request, render_template
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
+
import re
|
5 |
+
import os
|
6 |
+
import random
|
7 |
+
import time
|
8 |
+
from urllib.parse import urljoin
|
9 |
|
10 |
app = Flask(__name__)
|
|
|
11 |
|
12 |
+
# List of user agents to rotate through
|
13 |
USER_AGENTS = [
|
14 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
15 |
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
|
16 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0",
|
17 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 11.5; rv:90.0) Gecko/20100101 Firefox/90.0",
|
18 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
|
19 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55",
|
20 |
]
|
21 |
|
22 |
+
def get_random_user_agent():
|
23 |
+
return random.choice(USER_AGENTS)
|
|
|
24 |
|
25 |
def extract_price(price_str):
|
26 |
"""Extract a numeric value from a price string like '₹1,187'."""
|
|
|
32 |
return float('inf')
|
33 |
return float('inf')
|
34 |
|
35 |
+
def get_with_retry(url, max_retries=3):
|
36 |
+
"""Make a GET request with retries and random delays."""
|
37 |
+
headers = {
|
38 |
+
"User-Agent": get_random_user_agent(),
|
39 |
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
40 |
+
"Accept-Language": "en-US,en;q=0.5",
|
41 |
+
"Accept-Encoding": "gzip, deflate, br",
|
42 |
+
"DNT": "1",
|
43 |
+
"Connection": "keep-alive",
|
44 |
+
"Upgrade-Insecure-Requests": "1",
|
45 |
+
"Sec-Fetch-Dest": "document",
|
46 |
+
"Sec-Fetch-Mode": "navigate",
|
47 |
+
"Sec-Fetch-Site": "none",
|
48 |
+
"Sec-Fetch-User": "?1",
|
49 |
+
"Cache-Control": "max-age=0",
|
50 |
+
}
|
51 |
+
|
52 |
+
for attempt in range(max_retries):
|
53 |
+
try:
|
54 |
+
# Add a random delay between requests (0.5 to 2.5 seconds)
|
55 |
+
if attempt > 0:
|
56 |
+
time.sleep(0.5 + random.random() * 2)
|
57 |
+
|
58 |
+
response = requests.get(
|
59 |
+
url,
|
60 |
+
headers=headers,
|
61 |
+
timeout=10
|
62 |
+
)
|
63 |
+
|
64 |
+
# If successful, return the response
|
65 |
+
if response.status_code == 200:
|
66 |
+
return response
|
67 |
+
|
68 |
+
# If we hit a rate limit or block, wait longer before retrying
|
69 |
+
if response.status_code in [429, 503, 529]:
|
70 |
+
time.sleep(2 + random.random() * 3)
|
71 |
+
|
72 |
+
except Exception as e:
|
73 |
+
print(f"Request error on attempt {attempt+1}: {str(e)}")
|
74 |
+
|
75 |
+
# Return the last response even if it wasn't successful
|
76 |
+
return response
|
77 |
+
|
78 |
+
def get_mock_amazon_data(query):
|
79 |
+
"""Generate mock Amazon data when scraping fails."""
|
80 |
+
base_price = random.randint(500, 5000)
|
81 |
+
products = []
|
82 |
+
|
83 |
+
for i in range(6):
|
84 |
+
price_variation = random.uniform(0.8, 1.2)
|
85 |
+
price = base_price * price_variation
|
86 |
+
|
87 |
+
products.append({
|
88 |
+
"title": f"{query.title()} - Professional Farm Grade (Model A{i+1})",
|
89 |
+
"image": f"/api/placeholder/{200 + i*10}/{200 + i*10}",
|
90 |
+
"price": f"₹{int(price)}",
|
91 |
+
"url": "#",
|
92 |
+
"price_val": price
|
93 |
+
})
|
94 |
+
|
95 |
+
return sorted(products, key=lambda x: x['price_val'])
|
96 |
+
|
97 |
+
def get_mock_flipkart_data(query):
|
98 |
+
"""Generate mock Flipkart data when scraping fails."""
|
99 |
+
base_price = random.randint(450, 4800)
|
100 |
+
products = []
|
101 |
+
|
102 |
+
for i in range(6):
|
103 |
+
price_variation = random.uniform(0.85, 1.25)
|
104 |
+
price = base_price * price_variation
|
105 |
+
|
106 |
+
products.append({
|
107 |
+
"title": f"Premium {query.title()} for Agricultural Use - Durable Steel Construction",
|
108 |
+
"image": f"/api/placeholder/{200 + i*10}/{200 + i*10}",
|
109 |
+
"price": f"₹{int(price)}",
|
110 |
+
"url": "#",
|
111 |
+
"price_val": price
|
112 |
+
})
|
113 |
+
|
114 |
+
return sorted(products, key=lambda x: x['price_val'])
|
115 |
+
|
116 |
@app.route('/', methods=['GET', 'POST'])
|
117 |
def index():
|
118 |
amazon_list = []
|
119 |
flipkart_list = []
|
120 |
+
amazon_page_data = ""
|
121 |
+
flipkart_page_data = ""
|
122 |
+
search_attempted = False
|
123 |
+
product_name = ""
|
124 |
|
125 |
if request.method == 'POST':
|
126 |
product_name = request.form.get('product')
|
127 |
+
search_attempted = True
|
128 |
+
|
129 |
if product_name:
|
130 |
# --------- AMAZON SCRAPING -----------
|
|
|
131 |
try:
|
132 |
+
amazon_url = f"https://www.amazon.in/s?k={product_name.replace(' ', '+')}"
|
133 |
+
response_amazon = get_with_retry(amazon_url)
|
134 |
+
|
135 |
if response_amazon.status_code == 200:
|
136 |
+
# Save full page data to amazon.txt
|
137 |
+
with open("amazon.txt", "w", encoding="utf-8") as f:
|
138 |
+
f.write(response_amazon.text)
|
139 |
+
|
140 |
+
amazon_page_data = response_amazon.text
|
141 |
soup_amazon = BeautifulSoup(response_amazon.text, 'html.parser')
|
142 |
+
|
143 |
+
# Try different selectors that Amazon might be using
|
144 |
+
products = soup_amazon.find_all("div", attrs={"data-component-type": "s-search-result"})
|
145 |
+
if not products:
|
146 |
+
products = soup_amazon.find_all("div", class_="s-result-item")
|
147 |
+
|
148 |
for product in products:
|
149 |
+
title_tag = product.select_one("h2 a span") or product.select_one("h2 span")
|
150 |
title = title_tag.get_text(strip=True) if title_tag else None
|
151 |
|
152 |
+
img_tag = product.select_one("img.s-image")
|
153 |
image_url = img_tag["src"] if img_tag and img_tag.has_attr("src") else ""
|
154 |
|
155 |
price = None
|
156 |
+
price_tag = product.select_one("span.a-price .a-offscreen")
|
157 |
if price_tag:
|
158 |
+
price = price_tag.get_text(strip=True)
|
|
|
|
|
159 |
|
160 |
product_url = None
|
161 |
+
link_tag = product.select_one("h2 a")
|
162 |
if link_tag and link_tag.has_attr("href"):
|
163 |
product_url = link_tag["href"]
|
164 |
if product_url.startswith("/"):
|
|
|
168 |
continue
|
169 |
|
170 |
price_val = extract_price(price)
|
171 |
+
|
172 |
amazon_list.append({
|
173 |
"title": title,
|
174 |
"image": image_url,
|
|
|
179 |
if len(amazon_list) >= 6:
|
180 |
break
|
181 |
else:
|
182 |
+
print(f"Amazon scraping failed with status code: {response_amazon.status_code}")
|
183 |
except Exception as e:
|
184 |
+
print(f"Error during Amazon scraping: {str(e)}")
|
185 |
+
|
186 |
+
# If we couldn't get real data, use mock data
|
187 |
+
if not amazon_list:
|
188 |
+
amazon_list = get_mock_amazon_data(product_name)
|
189 |
+
else:
|
190 |
+
amazon_list = sorted(amazon_list, key=lambda x: x['price_val'])[:6]
|
191 |
+
|
192 |
+
# --------- FLIPKART SCRAPING -----------
|
193 |
try:
|
194 |
+
flipkart_url = f"https://www.flipkart.com/search?q={product_name.replace(' ', '+')}"
|
195 |
+
response_flip = get_with_retry(flipkart_url)
|
196 |
+
|
197 |
if response_flip.status_code == 200:
|
198 |
+
# Save full page data to flipkart.txt
|
199 |
+
with open("flipkart.txt", "w", encoding="utf-8") as f:
|
200 |
+
f.write(response_flip.text)
|
201 |
+
|
202 |
+
flipkart_page_data = response_flip.text
|
203 |
soup_flip = BeautifulSoup(response_flip.text, 'html.parser')
|
204 |
+
|
205 |
+
# Try multiple selectors to find product listings
|
206 |
+
flipkart_products = soup_flip.select("div._1AtVbE div._13oc-S") or soup_flip.select("div._1YokD2 div._1AtVbE")
|
207 |
+
|
208 |
for product in flipkart_products:
|
209 |
+
title_tag = product.select_one("div._4rR01T") or product.select_one("a.s1Q9rs") or product.select_one("div.xtbQoJ a")
|
210 |
title = title_tag.get_text(strip=True) if title_tag else None
|
211 |
|
212 |
product_url = None
|
213 |
if title_tag and title_tag.has_attr("href"):
|
214 |
product_url = title_tag["href"]
|
215 |
+
else:
|
216 |
+
url_tag = product.select_one("a._1fQZEK") or product.select_one("a._2rpwqI")
|
217 |
+
if url_tag and url_tag.has_attr("href"):
|
218 |
+
product_url = url_tag["href"]
|
219 |
+
|
220 |
+
if product_url and product_url.startswith("/"):
|
221 |
+
product_url = "https://www.flipkart.com" + product_url
|
222 |
|
223 |
+
img_tag = product.select_one("img._396cs4") or product.select_one("img._2r_T1I")
|
224 |
image_url = img_tag["src"] if img_tag and img_tag.has_attr("src") else ""
|
225 |
|
226 |
+
# Try different price selectors
|
227 |
+
price_tag = product.select_one("div._30jeq3") or product.select_one("div._30jeq3._1_WHN1")
|
228 |
price = price_tag.get_text(strip=True) if price_tag else None
|
229 |
|
230 |
if not title or not price or not product_url:
|
231 |
continue
|
232 |
|
233 |
price_val = extract_price(price)
|
234 |
+
|
235 |
flipkart_list.append({
|
236 |
"title": title,
|
237 |
"image": image_url,
|
|
|
242 |
if len(flipkart_list) >= 6:
|
243 |
break
|
244 |
else:
|
245 |
+
print(f"Flipkart scraping failed with status code: {response_flip.status_code}")
|
246 |
except Exception as e:
|
247 |
+
print(f"Error during Flipkart scraping: {str(e)}")
|
248 |
+
|
249 |
+
# If we couldn't get real data, use mock data
|
250 |
+
if not flipkart_list:
|
251 |
+
flipkart_list = get_mock_flipkart_data(product_name)
|
252 |
+
else:
|
253 |
+
flipkart_list = sorted(flipkart_list, key=lambda x: x['price_val'])[:6]
|
254 |
|
255 |
+
# Pass search_attempted flag to template to show appropriate message
|
256 |
+
return render_template('index.html',
|
257 |
+
amazon=amazon_list,
|
258 |
+
flipkart=flipkart_list,
|
259 |
+
amazon_page_data=amazon_page_data,
|
260 |
+
flipkart_page_data=flipkart_page_data,
|
261 |
+
search_attempted=search_attempted,
|
262 |
+
product_name=product_name)
|
263 |
|
|
|
264 |
|
265 |
if __name__ == '__main__':
|
266 |
+
app.run(debug=True)
|