Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- data/gita_data_new.json +0 -0
- output/bhagavat_gita.json +0 -0
- pyproject.toml +1 -0
- scrape_summary.py +51 -0
- uv.lock +24 -0
data/gita_data_new.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
output/bhagavat_gita.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -5,6 +5,7 @@ description = "Add your description here"
|
|
| 5 |
readme = "README.md"
|
| 6 |
requires-python = ">=3.12"
|
| 7 |
dependencies = [
|
|
|
|
| 8 |
"chromadb>=1.0.15",
|
| 9 |
"dotenv>=0.9.9",
|
| 10 |
"gradio>=5.38.0",
|
|
|
|
| 5 |
readme = "README.md"
|
| 6 |
requires-python = ">=3.12"
|
| 7 |
dependencies = [
|
| 8 |
+
"beautifulsoup4>=4.13.5",
|
| 9 |
"chromadb>=1.0.15",
|
| 10 |
"dotenv>=0.9.9",
|
| 11 |
"gradio>=5.38.0",
|
scrape_summary.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import json
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
# Scrape the chapter overview page and turn each "Chapter N: Title" <h2>
# section into a record with its overview list, verse range and summary.
url = "https://cdn.vivekavani.com/bg/"

# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() aborts on an HTTP error instead of parsing the error
# page and overwriting the output file with an empty list.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# Loop-invariant patterns, compiled once.
chapter_re = re.compile(r"Chapter\s+(\d+):\s+(.+)")
verse_range_re = re.compile(r"Verse[s]?\s+(\d+)\s+to\s+(\d+)")

data = []

for chapter in soup.find_all("h2"):
    chapter_text = chapter.get_text(strip=True)
    m = chapter_re.match(chapter_text)
    if not m:
        # Not a chapter heading (some other <h2> on the page).
        continue
    chapter_num, chapter_title = m.groups()

    # Reset per chapter: without this, a chapter lacking an overview list
    # would raise NameError (first chapter) or silently inherit the previous
    # chapter's overview.
    overview_items = []

    # First H3 = overview
    # NOTE(review): find_next() searches forward through the rest of the
    # document, not only inside this chapter's section; if a chapter has no
    # <h3> of its own this could match the next chapter's — verify against
    # the page structure.
    h3_overview = chapter.find_next("h3")
    if h3_overview:
        ol = h3_overview.find_next("ol")
        if ol:
            overview_items = [li.get_text(strip=True) for li in ol.find_all("li")]

    # Second H3 = verse range
    h3_range = h3_overview.find_next("h3") if h3_overview else None
    verse_start = verse_end = None
    if h3_range:
        verses_text = h3_range.get_text(strip=True)
        m2 = verse_range_re.search(verses_text)
        if m2:
            verse_start, verse_end = map(int, m2.groups())

    # UL after second H3 = summary
    ul = h3_range.find_next("ul") if h3_range else None
    summary = [li.get_text(strip=True) for li in ul.find_all("li")] if ul else []

    data.append({
        "chapter_number": int(chapter_num),
        "chapter_title": chapter_title,
        "overview": overview_items,
        "verse_start": verse_start,
        "verse_end": verse_end,
        "summary": summary,
    })

# Save JSON (ensure_ascii=False keeps non-ASCII text readable in the file).
with open("output/bhagavat_gita.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

# Echo the same payload to stdout for a quick visual check.
print(json.dumps(data, indent=2, ensure_ascii=False))
|
uv.lock
CHANGED
|
@@ -228,6 +228,19 @@ wheels = [
|
|
| 228 |
{ url = "https://files.pythonhosted.org/packages/a9/cf/45fb5261ece3e6b9817d3d82b2f343a505fd58674a92577923bc500bd1aa/bcrypt-4.3.0-cp39-abi3-win_amd64.whl", hash = "sha256:e53e074b120f2877a35cc6c736b8eb161377caae8925c17688bd46ba56daaa5b", size = 152799, upload-time = "2025-02-28T01:23:53.139Z" },
|
| 229 |
]
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
[[package]]
|
| 232 |
name = "brotli"
|
| 233 |
version = "1.1.0"
|
|
@@ -1204,6 +1217,7 @@ name = "langgraph-demo"
|
|
| 1204 |
version = "0.1.0"
|
| 1205 |
source = { virtual = "." }
|
| 1206 |
dependencies = [
|
|
|
|
| 1207 |
{ name = "chromadb" },
|
| 1208 |
{ name = "dotenv" },
|
| 1209 |
{ name = "gradio" },
|
|
@@ -1218,6 +1232,7 @@ dependencies = [
|
|
| 1218 |
|
| 1219 |
[package.metadata]
|
| 1220 |
requires-dist = [
|
|
|
|
| 1221 |
{ name = "chromadb", specifier = ">=1.0.15" },
|
| 1222 |
{ name = "dotenv", specifier = ">=0.9.9" },
|
| 1223 |
{ name = "gradio", specifier = ">=5.38.0" },
|
|
@@ -2514,6 +2529,15 @@ wheels = [
|
|
| 2514 |
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
|
| 2515 |
]
|
| 2516 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2517 |
[[package]]
|
| 2518 |
name = "sqlalchemy"
|
| 2519 |
version = "2.0.41"
|
|
|
|
| 228 |
{ url = "https://files.pythonhosted.org/packages/a9/cf/45fb5261ece3e6b9817d3d82b2f343a505fd58674a92577923bc500bd1aa/bcrypt-4.3.0-cp39-abi3-win_amd64.whl", hash = "sha256:e53e074b120f2877a35cc6c736b8eb161377caae8925c17688bd46ba56daaa5b", size = 152799, upload-time = "2025-02-28T01:23:53.139Z" },
|
| 229 |
]
|
| 230 |
|
| 231 |
+
[[package]]
|
| 232 |
+
name = "beautifulsoup4"
|
| 233 |
+
version = "4.13.5"
|
| 234 |
+
source = { registry = "https://pypi.org/simple" }
|
| 235 |
+
dependencies = [
|
| 236 |
+
{ name = "soupsieve" },
|
| 237 |
+
{ name = "typing-extensions" },
|
| 238 |
+
]
|
| 239 |
+
sdist = { url = "https://files.pythonhosted.org/packages/85/2e/3e5079847e653b1f6dc647aa24549d68c6addb4c595cc0d902d1b19308ad/beautifulsoup4-4.13.5.tar.gz", hash = "sha256:5e70131382930e7c3de33450a2f54a63d5e4b19386eab43a5b34d594268f3695", size = 622954, upload-time = "2025-08-24T14:06:13.168Z" }
|
| 240 |
+
wheels = [
|
| 241 |
+
{ url = "https://files.pythonhosted.org/packages/04/eb/f4151e0c7377a6e08a38108609ba5cede57986802757848688aeedd1b9e8/beautifulsoup4-4.13.5-py3-none-any.whl", hash = "sha256:642085eaa22233aceadff9c69651bc51e8bf3f874fb6d7104ece2beb24b47c4a", size = 105113, upload-time = "2025-08-24T14:06:14.884Z" },
|
| 242 |
+
]
|
| 243 |
+
|
| 244 |
[[package]]
|
| 245 |
name = "brotli"
|
| 246 |
version = "1.1.0"
|
|
|
|
| 1217 |
version = "0.1.0"
|
| 1218 |
source = { virtual = "." }
|
| 1219 |
dependencies = [
|
| 1220 |
+
{ name = "beautifulsoup4" },
|
| 1221 |
{ name = "chromadb" },
|
| 1222 |
{ name = "dotenv" },
|
| 1223 |
{ name = "gradio" },
|
|
|
|
| 1232 |
|
| 1233 |
[package.metadata]
|
| 1234 |
requires-dist = [
|
| 1235 |
+
{ name = "beautifulsoup4", specifier = ">=4.13.5" },
|
| 1236 |
{ name = "chromadb", specifier = ">=1.0.15" },
|
| 1237 |
{ name = "dotenv", specifier = ">=0.9.9" },
|
| 1238 |
{ name = "gradio", specifier = ">=5.38.0" },
|
|
|
|
| 2529 |
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
|
| 2530 |
]
|
| 2531 |
|
| 2532 |
+
[[package]]
|
| 2533 |
+
name = "soupsieve"
|
| 2534 |
+
version = "2.8"
|
| 2535 |
+
source = { registry = "https://pypi.org/simple" }
|
| 2536 |
+
sdist = { url = "https://files.pythonhosted.org/packages/6d/e6/21ccce3262dd4889aa3332e5a119a3491a95e8f60939870a3a035aabac0d/soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f", size = 103472, upload-time = "2025-08-27T15:39:51.78Z" }
|
| 2537 |
+
wheels = [
|
| 2538 |
+
{ url = "https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c", size = 36679, upload-time = "2025-08-27T15:39:50.179Z" },
|
| 2539 |
+
]
|
| 2540 |
+
|
| 2541 |
[[package]]
|
| 2542 |
name = "sqlalchemy"
|
| 2543 |
version = "2.0.41"
|