SissiFeng committed on
Commit
f0a1172
·
1 Parent(s): cec13fd

feat(youtube): add time-synced subtitles via IFrame API; add proxy fallback to fetch auto-generated captions; annotate with hiragana + POS colors; clickable to seek

Browse files
Files changed (2) hide show
  1. app.py +169 -8
  2. requirements.txt +1 -0
app.py CHANGED
@@ -7,6 +7,12 @@ import pytesseract
7
  import cv2
8
  import numpy as np
9
  from typing import List, Dict, Optional
 
 
 
 
 
 
10
 
11
  # YouTube transcript & translation (optional)
12
  try:
@@ -340,7 +346,13 @@ def fetch_transcript(video_id: str, lang_priority: List[str]) -> List[Dict]:
340
  except TranscriptsDisabled:
341
  raise RuntimeError("该视频字幕被禁用。")
342
  except Exception as e:
343
- raise RuntimeError(f"拉取字幕失败: {e}")
 
 
 
 
 
 
344
 
345
  for lang in lang_priority:
346
  try:
@@ -356,8 +368,72 @@ def fetch_transcript(video_id: str, lang_priority: List[str]) -> List[Dict]:
356
  translated = tr.translate(target)
357
  return translated.fetch()
358
  except Exception:
 
 
 
 
359
  raise RuntimeError("未找到可用字幕。建议换个视频或检查语言。")
360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  def translate_zh(text: str) -> Optional[str]:
362
  if not text or GoogleTranslator is None:
363
  return None
@@ -368,26 +444,32 @@ def translate_zh(text: str) -> Optional[str]:
368
 
369
  def render_transcript_html(items: List[Dict], grammar_mode: bool, with_zh: bool) -> str:
370
  lines = []
 
371
  for it in items:
372
  raw = normalize_text(it.get('text', ''))
373
  if not raw:
 
374
  continue
375
- ts = seconds_to_mmss(float(it.get('start', 0)))
 
 
376
  jp_html = to_furigana_inline(raw, grammar_mode)
377
  zh = translate_zh(raw) if with_zh else None
378
  zh_html = f"<div style='color:#444;margin-top:4px;'>【中】{zh}</div>" if zh else ""
379
  lines.append(
380
- f"<div style='padding:8px 10px;margin:6px 0;border-left:4px solid #ddd;background:#fff;'>"
 
381
  f"<div style='font-size:0.9em;color:#888;'>⏱ {ts}</div>"
382
  f"<div style='line-height:2.0;font-size:1.1em;'>{jp_html}</div>"
383
  f"{zh_html}"
384
  f"</div>"
385
  )
 
386
  if not lines:
387
  return "<div>未获取到字幕内容。</div>"
388
  legend = create_pos_legend() if grammar_mode else ""
389
  return (
390
- "<div style='max-height:70vh;overflow:auto;padding:8px;background:#f6f7f9;border-radius:8px;'>"
391
  + legend
392
  + "".join(lines)
393
  + "</div>"
@@ -400,9 +482,10 @@ def show_youtube(url: str, grammar_mode: bool, with_zh: bool):
400
  vid = parse_video_id(url)
401
  if not vid:
402
  return "", "", "未能解析视频ID,请检查链接。"
 
403
  iframe = (
404
  f"<div style='position:relative;padding-top:56.25%;'>"
405
- f"<iframe src='https://www.youtube.com/embed/{vid}' "
406
  f"style='position:absolute;top:0;left:0;width:100%;height:100%;border:0;' "
407
  f"allow='accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture' allowfullscreen></iframe>"
408
  f"</div>"
@@ -411,9 +494,87 @@ def show_youtube(url: str, grammar_mode: bool, with_zh: bool):
411
  items = fetch_transcript(vid, ["ja", "ja-JP"])
412
  except Exception as e:
413
  return iframe, "", f"加载字幕失败:{e}"
414
- html = render_transcript_html(items, grammar_mode, with_zh)
415
- tip = "已加载字幕。支持平假名注音与词性上色。" + (" 已附中文释义。" if with_zh else "")
416
- return iframe, html, tip
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
 
418
  # ----- OCR from screenshot -----
419
  def ocr_image(img: np.ndarray):
 
7
  import cv2
8
  import numpy as np
9
  from typing import List, Dict, Optional
10
+ import html as _html
11
+ import json as _json
12
+ try:
13
+ import requests
14
+ except Exception:
15
+ requests = None
16
 
17
  # YouTube transcript & translation (optional)
18
  try:
 
346
  except TranscriptsDisabled:
347
  raise RuntimeError("该视频字幕被禁用。")
348
  except Exception as e:
349
+ # 常见于平台无法访问 YouTube 的网络/DNS 限制,尝试代理抓取
350
+ fallback = _fetch_transcript_via_proxy(video_id, lang_priority)
351
+ if fallback:
352
+ return fallback
353
+ raise RuntimeError(
354
+ "拉取字幕失败: {}。若 Space 无法直连 YouTube,已尝试代理抓取。".format(e)
355
+ )
356
 
357
  for lang in lang_priority:
358
  try:
 
368
  translated = tr.translate(target)
369
  return translated.fetch()
370
  except Exception:
371
+ # 尝试代理抓取
372
+ fallback = _fetch_transcript_via_proxy(video_id, lang_priority)
373
+ if fallback:
374
+ return fallback
375
  raise RuntimeError("未找到可用字幕。建议换个视频或检查语言。")
376
 
377
def _fetch_transcript_via_proxy(video_id: str, lang_priority: List[str]) -> Optional[List[Dict]]:
    """Best-effort fallback that fetches captions through the r.jina.ai proxy.

    Requests the YouTube watch page via the proxy, reads the embedded
    ``captionTracks`` array, picks a track and downloads its timedtext
    document through the same proxy.

    Args:
        video_id: YouTube video id, interpolated into the watch URL.
        lang_priority: Language codes to try in order; may be empty or None.

    Returns:
        A list of ``{'text', 'start', 'duration'}`` dicts, or ``None`` when
        ``requests`` is unavailable, any HTTP call does not return 200, no
        usable track is found, or any exception occurs.
    """
    if requests is None:
        return None
    try:
        page = requests.get(
            f"https://r.jina.ai/https://www.youtube.com/watch?v={video_id}", timeout=12
        )
        if page.status_code != 200:
            return None
        chosen = _pick_caption_track(_extract_caption_tracks(page.text), lang_priority)
        if not chosen:
            return None
        base_url = chosen.get('baseUrl')
        if not base_url:
            return None
        # Fetch the timedtext XML through the same proxy.
        timedtext = requests.get(f"https://r.jina.ai/{base_url}", timeout=12)
        if timedtext.status_code != 200:
            return None
        return _parse_timedtext(timedtext.text) or None
    except Exception:
        # Deliberate best-effort: callers treat None as "fallback unavailable"
        # and raise their own user-facing error.
        return None


def _extract_caption_tracks(page: str) -> List[Dict]:
    """Return the ``captionTracks`` array embedded in a watch page, or ``[]``."""
    marker = re.search(r'"captionTracks":\s*\[', page)
    if not marker:
        return []
    try:
        # raw_decode parses exactly one JSON value starting at the opening
        # bracket, so a nested array inside a track cannot truncate the result
        # the way a non-greedy "first closing bracket" regex does.
        tracks, _end = _json.JSONDecoder().raw_decode(page, marker.end() - 1)
    except ValueError:
        return []
    return tracks if isinstance(tracks, list) else []


def _pick_caption_track(tracks: List[Dict], lang_priority: Optional[List[str]]) -> Optional[Dict]:
    """Pick a track: preferred languages first, then any ``ja*``, then the first one."""
    for lang in (lang_priority or []):
        for track in tracks:
            if track.get('languageCode') == lang:
                return track
    for track in tracks:
        if (track.get('languageCode') or '').startswith('ja'):
            return track
    return tracks[0] if tracks else None


def _parse_timedtext(xml: str) -> List[Dict]:
    """Parse ``<text start=".." dur="..">..</text>`` cues into transcript dicts."""
    items: List[Dict] = []
    # DOTALL lets a cue body span several lines; the look-behind skips
    # self-closing ``<text ... />`` tags so they cannot swallow the next cue.
    for cue in re.finditer(r'<text\b([^>]*)(?<!/)>(.*?)</text>', xml, re.DOTALL):
        attrs = cue.group(1)
        start = re.search(r'\bstart="([0-9.]+)"', attrs)
        if not start:
            continue
        dur = re.search(r'\bdur="([0-9.]+)"', attrs)
        # Flatten line breaks (literal newlines and <br> variants), then
        # unescape HTML entities.
        body = re.sub(r'<br\s*/?>', ' ', cue.group(2).replace('\n', ' '))
        items.append({
            'text': _html.unescape(body),
            'start': float(start.group(1)),
            # A cue without a dur attribute is kept with zero duration.
            'duration': float(dur.group(1)) if dur else 0.0,
        })
    return items
436
+
437
  def translate_zh(text: str) -> Optional[str]:
438
  if not text or GoogleTranslator is None:
439
  return None
 
444
 
445
  def render_transcript_html(items: List[Dict], grammar_mode: bool, with_zh: bool) -> str:
446
  lines = []
447
+ idx = 0
448
  for it in items:
449
  raw = normalize_text(it.get('text', ''))
450
  if not raw:
451
+ idx += 1
452
  continue
453
+ start = float(it.get('start', 0.0))
454
+ dur = float(it.get('duration', 0.0))
455
+ ts = seconds_to_mmss(start)
456
  jp_html = to_furigana_inline(raw, grammar_mode)
457
  zh = translate_zh(raw) if with_zh else None
458
  zh_html = f"<div style='color:#444;margin-top:4px;'>【中】{zh}</div>" if zh else ""
459
  lines.append(
460
+ f"<div class='yt-line' data-yt-idx='{idx}' data-yt-start='{start}' data-yt-dur='{dur}' "
461
+ f"style='padding:8px 10px;margin:6px 0;border-left:4px solid #ddd;background:#fff;border-radius:6px;'>"
462
  f"<div style='font-size:0.9em;color:#888;'>⏱ {ts}</div>"
463
  f"<div style='line-height:2.0;font-size:1.1em;'>{jp_html}</div>"
464
  f"{zh_html}"
465
  f"</div>"
466
  )
467
+ idx += 1
468
  if not lines:
469
  return "<div>未获取到字幕内容。</div>"
470
  legend = create_pos_legend() if grammar_mode else ""
471
  return (
472
+ "<div data-yt-scroll style='max-height:70vh;overflow:auto;padding:8px;background:#f6f7f9;border-radius:8px;'>"
473
  + legend
474
  + "".join(lines)
475
  + "</div>"
 
482
  vid = parse_video_id(url)
483
  if not vid:
484
  return "", "", "未能解析视频ID,请检查链接。"
485
+ iframe_id = f"yt-player-{vid}"
486
  iframe = (
487
  f"<div style='position:relative;padding-top:56.25%;'>"
488
+ f"<iframe id='{iframe_id}' src='https://www.youtube.com/embed/{vid}?enablejsapi=1' "
489
  f"style='position:absolute;top:0;left:0;width:100%;height:100%;border:0;' "
490
  f"allow='accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture' allowfullscreen></iframe>"
491
  f"</div>"
 
494
  items = fetch_transcript(vid, ["ja", "ja-JP"])
495
  except Exception as e:
496
  return iframe, "", f"加载字幕失败:{e}"
497
+ # 渲染字幕并注入同步脚本
498
+ html_core = render_transcript_html(items, grammar_mode, with_zh)
499
+ # 将字幕的时间戳打包为 JSON,供前端脚本同步
500
+ safe_items = []
501
+ for it in items:
502
+ safe_items.append({
503
+ 'start': float(it.get('start', 0.0)),
504
+ 'duration': float(it.get('duration', 0.0)),
505
+ })
506
+ import json
507
+ times_json = json.dumps(safe_items)
508
+ sync_script = f"""
509
+ <style>
510
+ .yt-line-active {{
511
+ background: #fff4cc;
512
+ border-left-color: #f0b400 !important;
513
+ }}
514
+ .yt-line {{ cursor: pointer; }}
515
+ </style>
516
+ <script>
517
+ (function(){{
518
+ var vid = {repr(vid)};
519
+ var iframeId = {repr(iframe_id)};
520
+ var items = {times_json};
521
+ // 加载 IFrame API(若尚未加载)
522
+ function ensureAPI(cb){{
523
+ if (window.YT && window.YT.Player) return cb();
524
+ if (!document.getElementById('yt-iframe-api')){{
525
+ var s = document.createElement('script');
526
+ s.id='yt-iframe-api';
527
+ s.src='https://www.youtube.com/iframe_api';
528
+ document.body.appendChild(s);
529
+ }}
530
+ var t = setInterval(function(){{
531
+ if (window.YT && window.YT.Player) {{ clearInterval(t); cb(); }}
532
+ }}, 200);
533
+ }}
534
+ var player;
535
+ function startSync(){{
536
+ try {{
537
+ player = player || new YT.Player(iframeId);
538
+ }} catch(e) {{ return; }}
539
+ // 点击跳转
540
+ document.querySelectorAll('[data-yt-start]').forEach(function(el){{
541
+ el.addEventListener('click', function(){{
542
+ var st = parseFloat(el.getAttribute('data-yt-start')||'0');
543
+ if (player && player.seekTo) player.seekTo(st, true);
544
+ }});
545
+ }});
546
+ // 定时高亮
547
+ var last = -1;
548
+ setInterval(function(){{
549
+ if (!player || !player.getCurrentTime) return;
550
+ var t = player.getCurrentTime();
551
+ var idx = -1;
552
+ for (var i=0;i<items.length;i++){{
553
+ var s = items[i].start, e = s + items[i].duration;
554
+ if (t >= s && t < e) {{ idx = i; break; }}
555
+ }}
556
+ if (idx !== last) {{
557
+ last = idx;
558
+ document.querySelectorAll('[data-yt-idx]').forEach(function(el){{ el.classList.remove('yt-line-active'); }});
559
+ var cur = document.querySelector('[data-yt-idx="'+idx+'"]');
560
+ if (cur) {{
561
+ cur.classList.add('yt-line-active');
562
+ // 滚动到可见区域
563
+ var parent = cur.closest('[data-yt-scroll]');
564
+ if (parent) {{
565
+ var top = cur.offsetTop - 80;
566
+ parent.scrollTo({{ top: top, behavior: 'smooth' }});
567
+ }}
568
+ }}
569
+ }}
570
+ }}, 250);
571
+ }}
572
+ ensureAPI(startSync);
573
+ }})();
574
+ </script>
575
+ """
576
+ tip = "已加载字幕并开启同步。支持点击字幕跳转、自动高亮。" + (" 已附中文释义。" if with_zh else "")
577
+ return iframe, html_core + sync_script, tip
578
 
579
  # ----- OCR from screenshot -----
580
  def ocr_image(img: np.ndarray):
requirements.txt CHANGED
@@ -6,3 +6,4 @@ pytesseract==0.3.13
6
  opencv-python-headless==4.10.0.84
7
  youtube-transcript-api==0.6.2
8
  deep-translator==1.11.4
 
 
6
  opencv-python-headless==4.10.0.84
7
  youtube-transcript-api==0.6.2
8
  deep-translator==1.11.4
9
+ requests==2.32.3