vikramvasudevan committed on
Commit
c89e702
·
verified ·
1 Parent(s): f7f40d0

Upload folder using huggingface_hub

Browse files
data/gita_data_new.json ADDED
The diff for this file is too large to render. See raw diff
 
output/bhagavat_gita.json ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -5,6 +5,7 @@ description = "Add your description here"
5
  readme = "README.md"
6
  requires-python = ">=3.12"
7
  dependencies = [
 
8
  "chromadb>=1.0.15",
9
  "dotenv>=0.9.9",
10
  "gradio>=5.38.0",
 
5
  readme = "README.md"
6
  requires-python = ">=3.12"
7
  dependencies = [
8
+ "beautifulsoup4>=4.13.5",
9
  "chromadb>=1.0.15",
10
  "dotenv>=0.9.9",
11
  "gradio>=5.38.0",
scrape_summary.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Scrape chapter overviews and summaries from the page at ``url``.

For every ``<h2>`` heading of the form ``Chapter <n>: <title>`` the script
collects:

* the ``<li>`` items of the first ``<ol>`` after the next ``<h3>`` (overview),
* the verse range parsed from the following ``<h3>`` ("Verses A to B"),
* the ``<li>`` items of the first ``<ul>`` after that second ``<h3>`` (summary).

The result is written to ``output/bhagavat_gita.json`` and echoed to stdout.
"""
import json
import os
import re

import requests
from bs4 import BeautifulSoup

url = "https://cdn.vivekavani.com/bg/"
# Bound the wait and fail loudly on HTTP errors so that an error page is never
# parsed into an empty result and written over existing output.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# Compiled once; both patterns are reused for every heading in the loop.
chapter_re = re.compile(r"Chapter\s+(\d+):\s+(.+)")
verse_range_re = re.compile(r"Verse[s]?\s+(\d+)\s+to\s+(\d+)")

data = []

for chapter in soup.find_all("h2"):
    chapter_text = chapter.get_text(strip=True)
    m = chapter_re.match(chapter_text)
    if not m:
        # Not a chapter heading (e.g. some other <h2> on the page).
        continue
    chapter_num, chapter_title = m.groups()

    # Reset per chapter: without this a chapter lacking an overview list would
    # raise NameError (first iteration) or inherit the previous chapter's items.
    overview_items = []
    verse_start = verse_end = None
    summary = []

    # First H3 = overview
    # NOTE(review): find_next() searches the rest of the document, so if a
    # chapter has no <h3> of its own this picks up the next chapter's heading.
    h3_overview = chapter.find_next("h3")
    if h3_overview:
        ol = h3_overview.find_next("ol")
        if ol:
            overview_items = [li.get_text(strip=True) for li in ol.find_all("li")]

        # Second H3 = verse range
        h3_range = h3_overview.find_next("h3")
        if h3_range:
            verses_text = h3_range.get_text(strip=True)
            m2 = verse_range_re.search(verses_text)
            if m2:
                verse_start, verse_end = map(int, m2.groups())

            # UL after second H3 = summary
            ul = h3_range.find_next("ul")
            if ul:
                summary = [li.get_text(strip=True) for li in ul.find_all("li")]

    data.append({
        "chapter_number": int(chapter_num),
        "chapter_title": chapter_title,
        "overview": overview_items,
        "verse_start": verse_start,
        "verse_end": verse_end,
        "summary": summary,
    })

# Save JSON (create the target directory first so a fresh checkout works).
os.makedirs("output", exist_ok=True)
with open("output/bhagavat_gita.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print(json.dumps(data, indent=2, ensure_ascii=False))
uv.lock CHANGED
@@ -228,6 +228,19 @@ wheels = [
228
  { url = "https://files.pythonhosted.org/packages/a9/cf/45fb5261ece3e6b9817d3d82b2f343a505fd58674a92577923bc500bd1aa/bcrypt-4.3.0-cp39-abi3-win_amd64.whl", hash = "sha256:e53e074b120f2877a35cc6c736b8eb161377caae8925c17688bd46ba56daaa5b", size = 152799, upload-time = "2025-02-28T01:23:53.139Z" },
229
  ]
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  [[package]]
232
  name = "brotli"
233
  version = "1.1.0"
@@ -1204,6 +1217,7 @@ name = "langgraph-demo"
1204
  version = "0.1.0"
1205
  source = { virtual = "." }
1206
  dependencies = [
 
1207
  { name = "chromadb" },
1208
  { name = "dotenv" },
1209
  { name = "gradio" },
@@ -1218,6 +1232,7 @@ dependencies = [
1218
 
1219
  [package.metadata]
1220
  requires-dist = [
 
1221
  { name = "chromadb", specifier = ">=1.0.15" },
1222
  { name = "dotenv", specifier = ">=0.9.9" },
1223
  { name = "gradio", specifier = ">=5.38.0" },
@@ -2514,6 +2529,15 @@ wheels = [
2514
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
2515
  ]
2516
 
 
 
 
 
 
 
 
 
 
2517
  [[package]]
2518
  name = "sqlalchemy"
2519
  version = "2.0.41"
 
228
  { url = "https://files.pythonhosted.org/packages/a9/cf/45fb5261ece3e6b9817d3d82b2f343a505fd58674a92577923bc500bd1aa/bcrypt-4.3.0-cp39-abi3-win_amd64.whl", hash = "sha256:e53e074b120f2877a35cc6c736b8eb161377caae8925c17688bd46ba56daaa5b", size = 152799, upload-time = "2025-02-28T01:23:53.139Z" },
229
  ]
230
 
231
+ [[package]]
232
+ name = "beautifulsoup4"
233
+ version = "4.13.5"
234
+ source = { registry = "https://pypi.org/simple" }
235
+ dependencies = [
236
+ { name = "soupsieve" },
237
+ { name = "typing-extensions" },
238
+ ]
239
+ sdist = { url = "https://files.pythonhosted.org/packages/85/2e/3e5079847e653b1f6dc647aa24549d68c6addb4c595cc0d902d1b19308ad/beautifulsoup4-4.13.5.tar.gz", hash = "sha256:5e70131382930e7c3de33450a2f54a63d5e4b19386eab43a5b34d594268f3695", size = 622954, upload-time = "2025-08-24T14:06:13.168Z" }
240
+ wheels = [
241
+ { url = "https://files.pythonhosted.org/packages/04/eb/f4151e0c7377a6e08a38108609ba5cede57986802757848688aeedd1b9e8/beautifulsoup4-4.13.5-py3-none-any.whl", hash = "sha256:642085eaa22233aceadff9c69651bc51e8bf3f874fb6d7104ece2beb24b47c4a", size = 105113, upload-time = "2025-08-24T14:06:14.884Z" },
242
+ ]
243
+
244
  [[package]]
245
  name = "brotli"
246
  version = "1.1.0"
 
1217
  version = "0.1.0"
1218
  source = { virtual = "." }
1219
  dependencies = [
1220
+ { name = "beautifulsoup4" },
1221
  { name = "chromadb" },
1222
  { name = "dotenv" },
1223
  { name = "gradio" },
 
1232
 
1233
  [package.metadata]
1234
  requires-dist = [
1235
+ { name = "beautifulsoup4", specifier = ">=4.13.5" },
1236
  { name = "chromadb", specifier = ">=1.0.15" },
1237
  { name = "dotenv", specifier = ">=0.9.9" },
1238
  { name = "gradio", specifier = ">=5.38.0" },
 
2529
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
2530
  ]
2531
 
2532
+ [[package]]
2533
+ name = "soupsieve"
2534
+ version = "2.8"
2535
+ source = { registry = "https://pypi.org/simple" }
2536
+ sdist = { url = "https://files.pythonhosted.org/packages/6d/e6/21ccce3262dd4889aa3332e5a119a3491a95e8f60939870a3a035aabac0d/soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f", size = 103472, upload-time = "2025-08-27T15:39:51.78Z" }
2537
+ wheels = [
2538
+ { url = "https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c", size = 36679, upload-time = "2025-08-27T15:39:50.179Z" },
2539
+ ]
2540
+
2541
  [[package]]
2542
  name = "sqlalchemy"
2543
  version = "2.0.41"