Upload folder using huggingface_hub

- scrape_post_processor.py +59 -0
- scrape_verses.py +38 -13
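
The commit message suggests these files were pushed with huggingface_hub's upload_folder API. A minimal sketch of that call, assuming a dataset repo; the repo id, repo type, and folder path below are placeholders, not the actual repository:

from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login`

# Push the local scripts in a single commit.
# "your-username/your-repo" and "./gita-scraper" are placeholders.
api.upload_folder(
    folder_path="./gita-scraper",
    repo_id="your-username/your-repo",
    repo_type="dataset",  # assumption: could equally be "model" or "space"
    commit_message="Upload folder using huggingface_hub",
)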
scrape_post_processor.py
ADDED

import os
import re
import json

input_dir = "output/chapters"
output_dir = "output/chapters_final"
os.makedirs(output_dir, exist_ok=True)


def split_combined_entry(entry):
    """Split a combined entry (e.g., "Verse 4-6") into one entry per verse."""
    results = []

    # Detect a range in the verse title (e.g., "Verse 4-6").
    m = re.search(r"Verse\s+(\d+)(?:\s*-\s*(\d+))?", entry.get("verse_title", ""))
    if not m:
        return [entry]  # no split needed

    start = int(m.group(1))
    end = int(m.group(2)) if m.group(2) else start

    # entry["verse_number"] holds a verse, not a chapter, so recover the
    # chapter from the original title (assumed to contain "Chapter X").
    chap = re.search(r"Chapter\s+(\d+)", entry.get("verse_title", ""))
    chapter = int(chap.group(1)) if chap else None

    # Split into individual verses.
    for v in range(start, end + 1):
        new_entry = entry.copy()
        new_entry["verse_number"] = v
        new_entry["verse_title"] = f"Bhagavad Gita: Chapter {chapter}, Verse {v}"

        # ⚠️ Optionally: split the Sanskrit text on "|| X||" verse markers.
        if entry.get("sanskrit"):
            parts = re.split(r"\|\|\s*\d+\s*\|\|", entry["sanskrit"])
            if len(parts) >= (end - start + 1):
                new_entry["sanskrit"] = parts[v - start].strip()

        # Split the transliteration on bare verse numbers. The pattern must not
        # capture, or re.split() would interleave the numbers into the parts.
        if entry.get("transliteration"):
            parts = re.split(r"\d+\s*", entry["transliteration"])
            # fallback: keep full transliteration if splitting fails
            if len(parts) > (end - start):
                new_entry["transliteration"] = parts[v - start].strip()

        # Keep the same translation/commentary/audio if not splittable.
        results.append(new_entry)

    return results


for fname in os.listdir(input_dir):
    if not fname.endswith(".json"):
        continue

    with open(os.path.join(input_dir, fname), "r", encoding="utf-8") as f:
        verses = json.load(f)

    final_verses = []
    for entry in verses:
        final_verses.extend(split_combined_entry(entry))

    # Save one cleaned file per chapter.
    out_path = os.path.join(output_dir, fname)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(final_verses, f, indent=2, ensure_ascii=False)

    print(f"✅ Processed {fname} → {out_path}")
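
To make the splitting rule concrete, here is a small usage sketch for split_combined_entry; the combined record below is hypothetical sample data, not actual scraped output:

# Hypothetical combined record covering verses 4-6 of chapter 1.
combined = {
    "verse_number": 4,
    "verse_title": "Bhagavad Gita: Chapter 1, Verse 4-6",
    "sanskrit": "अत्र शूराः... || 4|| युधामन्युश्च... || 5|| धृष्टकेतुः... || 6||",
    "transliteration": "atra śhūrāḥ... 4 yudhāmanyuśh cha... 5 dhṛiṣhṭaketuḥ... 6",
}

for e in split_combined_entry(combined):
    print(e["verse_number"], "->", e["verse_title"])
# 4 -> Bhagavad Gita: Chapter 1, Verse 4
# 5 -> Bhagavad Gita: Chapter 1, Verse 5
# 6 -> Bhagavad Gita: Chapter 1, Verse 6

Each of the three output entries also carries its own slice of the Sanskrit and transliteration, while translation, commentary, and audio are copied through unchanged.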
scrape_verses.py
CHANGED

@@ -2,6 +2,7 @@ import requests
 from bs4 import BeautifulSoup
 import json
 import os
+from tqdm import tqdm
 
 # Load the chapters JSON
 with open("output/bhagavat_gita.json", "r", encoding="utf-8") as f:
@@ -14,25 +15,49 @@ base_url = "https://vivekavani.com/b{chapter}v{verse}/"
 
 
 def scrape_verse(chapter_num, verse_num):
-    print("scraping chapter#", chapter_num, ":verse#", verse_num)
+    # print("scraping chapter#", chapter_num, ":verse#", verse_num)
     url = base_url.format(chapter=chapter_num, verse=verse_num)
     resp = requests.get(url)
     if resp.status_code != 200:
-        print(f"⚠️ Skipping {url} (status {resp.status_code})")
+        # print(f"⚠️ Skipping {url} (status {resp.status_code})")
         return None
 
     soup = BeautifulSoup(resp.text, "html.parser")
 
-    #
+    # Title + content container
     header = soup.find("header", class_="entry-header")
-    verse_title = header.find("h1", class_="entry-title")
-    entry_content =
-
-
+    verse_title = header.find("h1", class_="entry-title") if header else None
+    entry_content = soup.find("div", class_="entry-content")
+
+    # Sanskrit + transliteration
+    sanskrit = entry_content.find("p") if entry_content else None
+    transliteration = sanskrit.find_next("p") if sanskrit else None
+
+    # Audio
     audio_tag = soup.find("audio")
-
-
-
+
+    # Word-by-word meaning (just first <p> after audio)
+    word_by_word_meaning = audio_tag.find_next("p") if audio_tag else None
+
+    # --- Translation section ---
+    translation = []
+    h3_translation = soup.find("h3", string=lambda t: t and "Translation" in t)
+    if h3_translation:
+        for sib in h3_translation.find_next_siblings():
+            if sib.name == "h3" and "Commentary" in sib.get_text():
+                break
+            if sib.name == "p":
+                translation.append(sib.get_text(strip=True))
+
+    # --- Commentary section ---
+    commentary = []
+    h3_commentary = soup.find("h3", string=lambda t: t and "Commentary" in t)
+    if h3_commentary:
+        for sib in h3_commentary.find_next_siblings():
+            if sib.name == "div":
+                break
+            if sib.name == "p":
+                commentary.append(sib.get_text(strip=True))
 
     return {
         "verse_number": verse_num,
@@ -44,8 +69,8 @@ def scrape_verse(chapter_num, verse_num):
         "word_by_word_meaning": (
             word_by_word_meaning.get_text(strip=True) if word_by_word_meaning else None
         ),
-        "translation": translation
-        "commentary": commentary
+        "translation": translation,
+        "commentary": commentary,
         "audio": audio_tag["src"] if audio_tag and audio_tag.has_attr("src") else None,
         "source": url,
     }
@@ -59,7 +84,7 @@ for chapter in chapters:
     print(f"📖 Scraping Chapter {chapter_num} ({verse_start}–{verse_end})")
 
     verses = []
-    for v in range(verse_start, verse_end + 1):
+    for v in tqdm(range(verse_start, verse_end + 1)):
         verse_data = scrape_verse(chapter_num, v)
         if verse_data:
             verses.append(verse_data)
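
Since scrape_verses.py runs its scraping loop at module level, a quick way to sanity-check the new section parsers is a one-verse probe placed just before that loop. A minimal sketch; the chapter/verse pair is an arbitrary choice, and the snippet performs a live request against vivekavani.com:

# Optional smoke test: parse a single verse before launching the full run.
sample = scrape_verse(2, 47)  # arbitrary chapter/verse for the probe
if sample is None:
    print("⚠️ Request failed; check the URL pattern or connectivity")
else:
    print(sample["source"])
    print("translation paragraphs:", len(sample["translation"]))
    print("commentary paragraphs:", len(sample["commentary"]))

If both counts come back 0, the <h3> section headings on the page likely no longer match the "Translation"/"Commentary" strings the parser looks for.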