Spaces:

hchcsuim
/

Automatic-Speech-Recognition-Speech-to-Text

Running

App Files Files Community

Automatic-Speech-Recognition-Speech-to-Text / youtube_api.py

hchcsuim

Fix YouTube example URL processing in Spaces environment

d2b4586 7 months ago

raw

history blame

9.46 kB

	"""
	YouTube API 處理模塊
	使用 YouTube Data API 獲取視頻信息，並使用 yt-dlp 下載音頻
	"""

	import os
	import re
	import shutil
	import tempfile
	import time
	from urllib.parse import parse_qs, urlsplit

	import yt_dlp
	from googleapiclient.discovery import build
	from googleapiclient.errors import HttpError

	# YouTube Data API configuration: the service name and version are handed to
	# googleapiclient's build() when a client is created.
	YOUTUBE_API_SERVICE_NAME = "youtube"
	YOUTUBE_API_VERSION = "v3"
	YOUTUBE_API_KEY = None # Set at runtime through set_api_key()

	def set_api_key(api_key):
	    """Store the YouTube Data API key used by this module.

	    Surrounding whitespace is stripped, and an empty or whitespace-only key is
	    stored as ``None`` so that a blank value is never reported as a usable key
	    (the rest of the module treats a falsy key as "not set").

	    Args:
	        api_key: The API key string, or ``None`` to clear the stored key.

	    Returns:
	        bool: ``True`` if a non-empty key is now stored, ``False`` otherwise.
	    """
	    global YOUTUBE_API_KEY
	    if isinstance(api_key, str):
	        api_key = api_key.strip()
	    # Normalise every "no key" spelling ("" / whitespace / None) to None.
	    YOUTUBE_API_KEY = api_key or None
	    return YOUTUBE_API_KEY is not None

	def extract_video_id(youtube_url):
	    """Extract the video ID from a YouTube URL.

	    Supported forms are ``youtube.com/watch?v=<id>`` (any subdomain), the short
	    ``youtu.be/<id>`` form, and the ``youtube.com/shorts/<id>``,
	    ``youtube.com/embed/<id>`` and ``youtube.com/live/<id>`` paths.

	    Args:
	        youtube_url: The URL text to inspect.

	    Returns:
	        The video ID string, or ``None`` when the input is not a string, the
	        URL format is not supported, or the URL carries no video ID.
	    """
	    if not isinstance(youtube_url, str):
	        return None
	    url = youtube_url.strip()

	    if "youtube.com/watch" in url:
	        # Read the actual ``v`` query parameter instead of splitting on the
	        # text "v=": a watch URL without that parameter must yield None rather
	        # than raise, and a trailing "#fragment" must not end up in the ID.
	        try:
	            query = urlsplit(url).query
	        except ValueError:
	            return None
	        values = parse_qs(query).get("v")
	        return values[0] if values else None

	    # Path-style URLs: the ID is the path segment right after the marker.
	    path_markers = (
	        "youtu.be/",
	        "youtube.com/shorts/",
	        "youtube.com/embed/",
	        "youtube.com/live/",
	    )
	    for marker in path_markers:
	        if marker in url:
	            segment = url.split(marker, 1)[1]
	            # Cut off any query string, fragment or trailing path.
	            for separator in "?&#/":
	                segment = segment.split(separator, 1)[0]
	            return segment or None

	    # Unsupported URL format.
	    return None

	def get_video_info(video_id):
	    """Look up a video's metadata through the YouTube Data API.

	    Args:
	        video_id: ID of the video to query.

	    Returns:
	        dict | None: A dict with the keys ``title``, ``description``,
	        ``channel``, ``published_at``, ``duration`` (in seconds) and
	        ``thumbnail`` (a URL), or ``None`` when the API returns no items or
	        any error occurs during the lookup.

	    Raises:
	        ValueError: If no API key has been set.
	    """
	    if not YOUTUBE_API_KEY:
	        raise ValueError("YouTube API 金鑰未設置。請先調用 set_api_key() 函數。")

	    try:
	        client = build(
	            YOUTUBE_API_SERVICE_NAME,
	            YOUTUBE_API_VERSION,
	            developerKey=YOUTUBE_API_KEY,
	        )

	        # Request the details of this single video.
	        request = client.videos().list(
	            part="snippet,contentDetails,statistics",
	            id=video_id,
	        )
	        response = request.execute()

	        # An empty or missing item list means the video was not found.
	        items = response.get("items")
	        if not items:
	            return None

	        item = items[0]
	        snippet = item["snippet"]
	        details = item["contentDetails"]

	        # The duration arrives as text in the PT#H#M#S form.
	        length_seconds = parse_duration(details["duration"])

	        info = {
	            "title": snippet["title"],
	            "description": snippet["description"],
	            "channel": snippet["channelTitle"],
	            "published_at": snippet["publishedAt"],
	            "duration": length_seconds,
	        }

	        # Use the "high" thumbnail when present, otherwise the "default" one.
	        thumbnails = snippet["thumbnails"]
	        size = "high" if "high" in thumbnails else "default"
	        info["thumbnail"] = thumbnails[size]["url"]
	        return info

	    except HttpError as e:
	        print(f"YouTube API 錯誤: {e}")
	        return None
	    except Exception as e:
	        print(f"獲取視頻信息時發生錯誤: {e}")
	        return None

	# ISO 8601 duration: optional weeks/days before the "T" separator and optional
	# hours/minutes/seconds after it, e.g. "PT1H2M3S", "P1DT2H" or "P0D".
	_ISO8601_DURATION_RE = re.compile(
	    r"P(?:(?P<weeks>\d+)W)?(?:(?P<days>\d+)D)?"
	    r"(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?"
	)


	def parse_duration(duration_str):
	    """Convert an ISO 8601 duration string into a number of seconds.

	    Besides the plain ``PT#H#M#S`` form, durations carrying a day or week part
	    (such as ``P1DT2H3M4S``) are handled; slicing off a fixed "PT" prefix would
	    fail on those.

	    Args:
	        duration_str: Duration text such as ``"PT1H2M3S"`` or ``"P1DT2H"``.

	    Returns:
	        int: Total duration in seconds.

	    Raises:
	        ValueError: If ``duration_str`` is not a duration in the form above.
	    """
	    match = _ISO8601_DURATION_RE.fullmatch(duration_str)
	    if match is None:
	        raise ValueError(f"Invalid ISO 8601 duration: {duration_str!r}")

	    # Components absent from the string count as zero.
	    parts = {unit: int(value or 0) for unit, value in match.groupdict().items()}
	    return (
	        parts["weeks"] * 604800
	        + parts["days"] * 86400
	        + parts["hours"] * 3600
	        + parts["minutes"] * 60
	        + parts["seconds"]
	    )

	def download_audio(video_id, api_info=None):
	    """Fetch a YouTube video's audio track with yt-dlp and extract it as MP3.

	    Args:
	        video_id: YouTube video ID.
	        api_info: Optional metadata dict obtained from the API. When given, its
	            ``duration`` and ``title`` entries are used instead of the values
	            reported by yt-dlp.

	    Returns:
	        tuple: ``(audio_path, temp_dir, duration)``. ``temp_dir`` is a newly
	        created temporary directory. If an error occurs during the download or
	        the file lookup, that directory is removed and
	        ``(None, None, None)`` is returned.
	    """
	    # Downloads always land in one fixed folder under the system temp directory.
	    target_dir = os.path.join(tempfile.gettempdir(), "youtube_downloads")
	    os.makedirs(target_dir, exist_ok=True)

	    # File stem: the video ID plus the current Unix timestamp.
	    stem = f"youtube_{video_id}_{int(time.time())}"
	    temp_dir = tempfile.mkdtemp()

	    try:
	        output_template = os.path.join(target_dir, f"{stem}.%(ext)s")
	        extract_audio = {
	            'key': 'FFmpegExtractAudio',
	            'preferredcodec': 'mp3',
	            'preferredquality': '192',
	        }
	        ffmpeg_path = shutil.which("ffmpeg")
	        options = {
	            'format': 'bestaudio/best',
	            'outtmpl': output_template,
	            'noplaylist': True,
	            'quiet': True,
	            'postprocessors': [extract_audio],
	            'ffmpeg_location': ffmpeg_path,
	        }

	        # Only warn when ffmpeg is missing; the download is still attempted.
	        if not ffmpeg_path:
	            print("Warning: ffmpeg not found... / 警告：找不到 ffmpeg...")

	        # Start from the API metadata when the caller supplied it.
	        if api_info:
	            duration = api_info["duration"]
	            title = api_info["title"]
	        else:
	            duration = None
	            title = "Unknown"

	        watch_url = f"https://www.youtube.com/watch?v={video_id}"
	        with yt_dlp.YoutubeDL(options) as ydl:
	            if api_info:
	                # Metadata already known: just download.
	                ydl.download([watch_url])
	            else:
	                # No API metadata: take duration and title from yt-dlp.
	                info = ydl.extract_info(watch_url, download=True)
	                duration = info.get('duration')
	                title = info.get('title', 'unknown')

	        expected_path = os.path.join(target_dir, f"{stem}.mp3")
	        if os.path.exists(expected_path):
	            print(f"YouTube audio downloaded: {expected_path}")
	            print(f"Title: {title}, Duration: {duration}s")
	            return expected_path, temp_dir, duration

	        # Expected name missing: accept any MP3 in the folder sharing the stem.
	        candidates = [
	            os.path.join(target_dir, name)
	            for name in os.listdir(target_dir)
	            if name.startswith(stem) and name.endswith(".mp3")
	        ]
	        if not candidates:
	            raise FileNotFoundError(f"Audio file not found after download in {target_dir}")

	        fallback_path = candidates[0]
	        print(f"Warning: Could not find expected MP3, using fallback: {fallback_path}")
	        return fallback_path, temp_dir, duration

	    except Exception as e:
	        print(f"Error downloading YouTube audio: {e}")
	        if temp_dir and os.path.exists(temp_dir):
	            try:
	                shutil.rmtree(temp_dir)
	            except Exception as cleanup_e:
	                print(f"Error cleaning temp directory {temp_dir}: {cleanup_e}")
	        return None, None, None

	def process_youtube_url(youtube_url, user_api_key=None):
	    """Resolve a YouTube URL, gather its metadata and download its audio.

	    When an API key is available, the metadata comes from the YouTube Data
	    API. Otherwise, or when the API lookup fails, the audio is downloaded
	    directly and only a minimal info dict is returned. Inside Hugging Face
	    Spaces (detected through the ``SPACE_ID`` environment variable) the direct
	    download is not attempted and an error is raised instead.

	    Args:
	        youtube_url: YouTube video URL.
	        user_api_key: Optional API key supplied by the user; when given it
	            replaces the module-wide key before the lookup.

	    Returns:
	        tuple: ``(audio_path, video_info)``. Both are ``None`` for an empty or
	        unsupported URL. ``audio_path`` alone is ``None`` when the download
	        fails.

	    Raises:
	        ValueError: When running in Hugging Face Spaces without an API key, or
	            when the API lookup there returns no video information.
	    """
	    # Reject missing or blank input before doing any work.
	    if not youtube_url or not youtube_url.strip():
	        return None, None

	    is_spaces = os.environ.get("SPACE_ID") is not None

	    if user_api_key:
	        set_api_key(user_api_key)

	    video_id = extract_video_id(youtube_url)
	    if not video_id:
	        print(f"Invalid YouTube URL: {youtube_url}")
	        return None, None

	    if YOUTUBE_API_KEY:
	        video_info = get_video_info(video_id)
	        if video_info:
	            # Download with the metadata already obtained from the API.
	            audio_path, temp_dir, _ = download_audio(video_id, video_info)
	            _remove_unused_temp_dir(temp_dir)
	            return audio_path, video_info

	        print(f"Could not get video info from API for: {video_id}")
	        # In Spaces, do not attempt a download without API information.
	        if is_spaces:
	            raise ValueError("YouTube 下載在 Hugging Face Spaces 中需要有效的 API 金鑰。")
	    else:
	        # In Spaces an API key is mandatory.
	        if is_spaces:
	            raise ValueError("YouTube 下載在 Hugging Face Spaces 中需要 API 金鑰。請在上方的 'YouTube API Key Settings' 中輸入您的 API 金鑰。\n\nYouTube download in Hugging Face Spaces requires an API key. Please enter your API key in the 'YouTube API Key Settings' section above.")
	        print("No YouTube API key set, using yt-dlp directly")

	    # Local environment without usable API data: download directly.
	    audio_path, temp_dir, duration = download_audio(video_id)
	    _remove_unused_temp_dir(temp_dir)
	    return audio_path, {"title": "Unknown", "duration": duration}


	def _remove_unused_temp_dir(temp_dir):
	    """Delete the temporary directory handed back by download_audio, if empty.

	    The directory is not part of this module's public result, so leaving it
	    behind would leak one directory per call. ``os.rmdir`` only removes empty
	    directories, so nothing is deleted if the directory holds files.
	    """
	    if not temp_dir:
	        return
	    try:
	        os.rmdir(temp_dir)
	    except OSError:
	        # Already gone or not empty: deliberately left untouched.
	        pass

	# Manual smoke test
	if __name__ == "__main__":
	    # Read the API key from the YOUTUBE_API_KEY environment variable instead of
	    # hard-coding a placeholder that would only produce a failing API request.
	    # When the variable is unset, no key is configured.
	    api_key = os.environ.get("YOUTUBE_API_KEY")
	    if api_key:
	        set_api_key(api_key)

	    # Test URL
	    test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

	    # Process the URL
	    audio_path, video_info = process_youtube_url(test_url)

	    if audio_path and video_info:
	        print(f"Downloaded: {audio_path}")
	        print(f"Video info: {video_info}")
	    else:
	        print("Failed to process YouTube URL")