vikramvasudevan committed on
Commit
07a3179
·
verified ·
1 Parent(s): 584da1b

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. scrape_post_processor.py +59 -0
  2. scrape_verses.py +38 -13
scrape_post_processor.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+
5
+ input_dir = "output/chapters"
6
+ output_dir = "output/chapters_final"
7
+ os.makedirs(output_dir, exist_ok=True)
8
+
9
+ def split_combined_entry(entry):
10
+ results = []
11
+
12
+ # detect range in verse title (e.g., "Verse 4-6")
13
+ m = re.search(r"Verse\s+(\d+)(?:\s*-\s*(\d+))?", entry.get("verse_title", ""))
14
+ if not m:
15
+ return [entry] # no split needed
16
+
17
+ start = int(m.group(1))
18
+ end = int(m.group(2)) if m.group(2) else start
19
+
20
+ # split into individual verses
21
+ for v in range(start, end + 1):
22
+ new_entry = entry.copy()
23
+ new_entry["verse_number"] = v
24
+ new_entry["verse_title"] = f"Bhagavad Gita: Chapter {entry['verse_number']}, Verse {v}"
25
+
26
+ # ⚠️ Optionally: split text by "|| X||" markers
27
+ if entry.get("sanskrit"):
28
+ parts = re.split(r"\|\|\s*\d+\s*\|\|", entry["sanskrit"])
29
+ if len(parts) >= (end - start + 1):
30
+ new_entry["sanskrit"] = parts[v - start].strip()
31
+
32
+ if entry.get("transliteration"):
33
+ parts = re.split(r"(\d+\s*)", entry["transliteration"])
34
+ # fallback: keep full transliteration if splitting fails
35
+ if len(parts) > (end - start):
36
+ new_entry["transliteration"] = parts[v - start].strip()
37
+
38
+ # Keep same translation/commentary/audio if not splittable
39
+ results.append(new_entry)
40
+
41
+ return results
42
+
43
+ for fname in os.listdir(input_dir):
44
+ if not fname.endswith(".json"):
45
+ continue
46
+
47
+ with open(os.path.join(input_dir, fname), "r", encoding="utf-8") as f:
48
+ verses = json.load(f)
49
+
50
+ final_verses = []
51
+ for entry in verses:
52
+ final_verses.extend(split_combined_entry(entry))
53
+
54
+ # save per chapter
55
+ out_path = os.path.join(output_dir, fname)
56
+ with open(out_path, "w", encoding="utf-8") as f:
57
+ json.dump(final_verses, f, indent=2, ensure_ascii=False)
58
+
59
+ print(f"✅ Processed {fname} → {out_path}")
scrape_verses.py CHANGED
@@ -2,6 +2,7 @@ import requests
2
  from bs4 import BeautifulSoup
3
  import json
4
  import os
 
5
 
6
  # Load the chapters JSON
7
  with open("output/bhagavat_gita.json", "r", encoding="utf-8") as f:
@@ -14,25 +15,49 @@ base_url = "https://vivekavani.com/b{chapter}v{verse}/"
14
 
15
 
16
  def scrape_verse(chapter_num, verse_num):
17
- print("scraping chapter:", chapter_num, ":verse#", verse_num)
18
  url = base_url.format(chapter=chapter_num, verse=verse_num)
19
  resp = requests.get(url)
20
  if resp.status_code != 200:
21
- print(f"⚠️ Skipping {url} (status {resp.status_code})")
22
  return None
23
 
24
  soup = BeautifulSoup(resp.text, "html.parser")
25
 
26
- # Example structure (you may need to tweak based on actual HTML)
27
  header = soup.find("header", class_="entry-header")
28
- verse_title = header.find("h1", class_="entry-title")
29
- entry_content = header.find_next("div", class_="entry-content")
30
- sanskrit = entry_content.find("p")
31
- transliteration = sanskrit.find_next("p")
 
 
 
 
32
  audio_tag = soup.find("audio")
33
- word_by_word_meaning = audio_tag.find_next("p")
34
- translation = word_by_word_meaning.find_next("p")
35
- commentary = translation.find_next("p")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  return {
38
  "verse_number": verse_num,
@@ -44,8 +69,8 @@ def scrape_verse(chapter_num, verse_num):
44
  "word_by_word_meaning": (
45
  word_by_word_meaning.get_text(strip=True) if word_by_word_meaning else None
46
  ),
47
- "translation": translation.get_text(strip=True) if translation else None,
48
- "commentary": commentary.get_text(strip=True) if commentary else None,
49
  "audio": audio_tag["src"] if audio_tag and audio_tag.has_attr("src") else None,
50
  "source": url,
51
  }
@@ -59,7 +84,7 @@ for chapter in chapters:
59
  print(f"📖 Scraping Chapter {chapter_num} ({verse_start}–{verse_end})")
60
 
61
  verses = []
62
- for v in range(verse_start, verse_end + 1):
63
  verse_data = scrape_verse(chapter_num, v)
64
  if verse_data:
65
  verses.append(verse_data)
 
2
  from bs4 import BeautifulSoup
3
  import json
4
  import os
5
+ from tqdm import tqdm
6
 
7
  # Load the chapters JSON
8
  with open("output/bhagavat_gita.json", "r", encoding="utf-8") as f:
 
15
 
16
 
17
  def scrape_verse(chapter_num, verse_num):
18
+ # print("scraping chapter#", chapter_num, ":verse#", verse_num)
19
  url = base_url.format(chapter=chapter_num, verse=verse_num)
20
  resp = requests.get(url)
21
  if resp.status_code != 200:
22
+ # print(f"⚠️ Skipping {url} (status {resp.status_code})")
23
  return None
24
 
25
  soup = BeautifulSoup(resp.text, "html.parser")
26
 
27
+ # Title + content container
28
  header = soup.find("header", class_="entry-header")
29
+ verse_title = header.find("h1", class_="entry-title") if header else None
30
+ entry_content = soup.find("div", class_="entry-content")
31
+
32
+ # Sanskrit + transliteration
33
+ sanskrit = entry_content.find("p") if entry_content else None
34
+ transliteration = sanskrit.find_next("p") if sanskrit else None
35
+
36
+ # Audio
37
  audio_tag = soup.find("audio")
38
+
39
+ # Word-by-word meaning (just first <p> after audio)
40
+ word_by_word_meaning = audio_tag.find_next("p") if audio_tag else None
41
+
42
+ # --- Translation section ---
43
+ translation = []
44
+ h3_translation = soup.find("h3", string=lambda t: t and "Translation" in t)
45
+ if h3_translation:
46
+ for sib in h3_translation.find_next_siblings():
47
+ if sib.name == "h3" and "Commentary" in sib.get_text():
48
+ break
49
+ if sib.name == "p":
50
+ translation.append(sib.get_text(strip=True))
51
+
52
+ # --- Commentary section ---
53
+ commentary = []
54
+ h3_commentary = soup.find("h3", string=lambda t: t and "Commentary" in t)
55
+ if h3_commentary:
56
+ for sib in h3_commentary.find_next_siblings():
57
+ if sib.name == "div":
58
+ break
59
+ if sib.name == "p":
60
+ commentary.append(sib.get_text(strip=True))
61
 
62
  return {
63
  "verse_number": verse_num,
 
69
  "word_by_word_meaning": (
70
  word_by_word_meaning.get_text(strip=True) if word_by_word_meaning else None
71
  ),
72
+ "translation": translation,
73
+ "commentary": commentary,
74
  "audio": audio_tag["src"] if audio_tag and audio_tag.has_attr("src") else None,
75
  "source": url,
76
  }
 
84
  print(f"📖 Scraping Chapter {chapter_num} ({verse_start}–{verse_end})")
85
 
86
  verses = []
87
+ for v in tqdm(range(verse_start, verse_end + 1)):
88
  verse_data = scrape_verse(chapter_num, v)
89
  if verse_data:
90
  verses.append(verse_data)