fmab777 committed on
Commit
332d5fc
·
verified ·
1 Parent(s): 37917ce

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +68 -87
main.py CHANGED
@@ -1,4 +1,4 @@
1
- # main.py (Correcting SyntaxError at line 192)
2
  import os
3
  import re
4
  import logging
@@ -71,7 +71,7 @@ APIFY_API_TOKEN = get_secret('APIFY_API_TOKEN')
71
  WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
72
 
73
  OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")
74
- APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts")
75
 
76
  if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
77
  if not OPENROUTER_API_KEY: logger.error("❌ ERROR: OPENROUTER_API_KEY not found. Summarization will fail.")
@@ -132,7 +132,8 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
132
  if not video_id: logger.error("[Supadata] No video_id provided"); return None
133
  if not api_key: logger.error("[Supadata] API key missing."); return None
134
  logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
135
- api_endpoint = "https://api.supadata.net/v1/youtube/transcript"
 
136
  params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
137
  try:
138
  async with httpx.AsyncClient(timeout=30.0) as client:
@@ -144,7 +145,7 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
144
  except json.JSONDecodeError: data = None
145
  content = None
146
  if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
147
- if not content and response.text: content = response.text
148
  if content and isinstance(content, str): logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}"); return content.strip()
149
  else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
150
  except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
@@ -153,94 +154,73 @@ async def get_transcript_via_supadata(video_id: str, api_key: str) -> Optional[s
153
  else: logger.error(f"[Supadata] Unexpected status {response.status_code} for {video_id}. Resp: {response.text[:200]}"); return None
154
  except httpx.TimeoutException: logger.error(f"[Supadata] Timeout connecting for {video_id}"); return None
155
  except httpx.RequestError as e:
 
156
  if "CERTIFICATE_VERIFY_FAILED" in str(e): logger.error(f"[Supadata] SSL Cert Verify Failed for {video_id}: {e}")
157
  else: logger.error(f"[Supadata] Request error for {video_id}: {e}")
158
  return None
159
  except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
160
 
161
-
162
  async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
163
- """Fetches YouTube transcript using Apify REST API (async start + poll + dataset fetch)."""
164
- global APIFY_ACTOR_ID
165
- if not video_url: logger.error("[Apify Async] No video_url provided"); return None
166
- if not api_token: logger.error("[Apify Async] API token missing."); return None
167
- logger.info(f"[Apify Async] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
168
-
169
- start_run_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/runs"
170
- params_base = {} # Token goes in header now
171
- payload = { "urls": [video_url], "outputFormat": "singleStringText", "maxRetries": 5, "channelHandleBoolean": False, "channelNameBoolean": False, "datePublishedBoolean": False, "relativeDateTextBoolean": False, }
172
- headers = { "Content-Type": "application/json", "Authorization": f"Bearer {api_token}" }
173
-
174
- run_id: Optional[str] = None; dataset_id: Optional[str] = None
175
- max_wait_seconds = 120; poll_interval = 5
 
 
 
 
176
 
177
  try:
178
- async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
179
- log_headers = {k: v for k, v in headers.items() if k.lower() != 'authorization'}
180
- logger.debug(f"[Apify Async] POST Request Details:\nURL: {start_run_endpoint}\nHeaders: {log_headers}\nPayload: {json.dumps(payload)}")
181
- response_start = await client.post(start_run_endpoint, json=payload)
182
- logger.debug(f"[Apify Async] Start run status: {response_start.status_code}")
183
-
184
- if response_start.status_code == 201:
185
- try:
186
- run_data = response_start.json().get("data", {})
187
- run_id = run_data.get("id"); dataset_id = run_data.get("defaultDatasetId")
188
- if not run_id or not dataset_id: logger.error(f"[Apify Async] Started run but missing runId or datasetId. Data: {run_data}"); return None
189
- logger.info(f"[Apify Async] Run started. Run ID: {run_id}, Dataset ID: {dataset_id}")
190
- except Exception as e: logger.error(f"[Apify Async] Error parsing start run response: {e}. Response: {response_start.text[:200]}", exc_info=True); return None
191
- else:
192
- # *** FIX: Correct Indentation for try/except block ***
193
- error_info = ""
194
- try:
195
- # Attempt to get the error message from the JSON response
196
- error_info = response_start.json().get("error", {}).get("message", "")
197
- except Exception:
198
- # If parsing fails or structure is unexpected, just pass
199
- pass
200
- logger.error(f"[Apify Async] Failed to start run. Status: {response_start.status_code}. Error: '{error_info}' Resp: {response_start.text[:200]}")
201
- return None
202
-
203
- run_status_endpoint = f"https://api.apify.com/v2/actor-runs/{run_id}"; elapsed_time = 0; final_status = None
204
- while elapsed_time < max_wait_seconds:
205
- await asyncio.sleep(poll_interval); elapsed_time += poll_interval
206
- logger.debug(f"[Apify Async] Polling status for run {run_id} ({elapsed_time}s elapsed)")
207
  try:
208
- response_status = await client.get(run_status_endpoint)
209
- if response_status.status_code == 200:
210
- status_data = response_status.json().get("data", {}); final_status = status_data.get("status")
211
- logger.debug(f"[Apify Async] Run status: {final_status}")
212
- if final_status in ["SUCCEEDED", "FAILED", "ABORTED", "TIMED-OUT"]: break
213
- else: logger.warning(f"[Apify Async] Non-200 status ({response_status.status_code}) polling run {run_id}."); await asyncio.sleep(poll_interval * 2)
214
- except Exception as poll_err: logger.error(f"[Apify Async] Error polling run status {run_id}: {poll_err}"); await asyncio.sleep(poll_interval * 2)
215
-
216
- if final_status != "SUCCEEDED": logger.warning(f"[Apify Async] Run {run_id} did not succeed. Final status: {final_status}"); return None
217
-
218
- logger.info(f"[Apify Async] Run {run_id} succeeded. Fetching items from dataset {dataset_id}")
219
- dataset_endpoint = f"https://api.apify.com/v2/datasets/{dataset_id}/items"
220
- params_dataset = {"format": "json", "limit": 5}
221
- response_dataset = await client.get(dataset_endpoint, params=params_dataset)
222
- logger.debug(f"[Apify Async] Dataset fetch status: {response_dataset.status_code}")
223
- response_dataset.raise_for_status()
224
-
225
- results = response_dataset.json()
226
- if isinstance(results, list) and len(results) > 0:
227
- item = results[0]
228
- content = None
229
- if "captions" in item and isinstance(item["captions"], str): logger.info("[Apify Async] Found 'captions' key with string content."); content = item["captions"]
230
- elif "text" in item and isinstance(item["text"], str): logger.info("[Apify Async] Found 'text' key with string content."); content = item["text"]
231
- elif "transcript" in item and isinstance(item["transcript"], str): logger.info("[Apify Async] Found 'transcript' key with string content."); content = item["transcript"]
232
- elif "captions" in item and isinstance(item["captions"], list):
233
- logger.warning("[Apify Async] Received list format for 'captions' unexpectedly. Processing...")
234
- if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
235
- elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])
236
- if content and isinstance(content, str): logger.info(f"[Apify Async] Success via ASYNC REST for {video_url}. Length: {len(content)}"); return content.strip()
237
- else: logger.warning(f"[Apify Async] Dataset item parsed but transcript content empty/invalid format for {video_url}. Item keys: {list(item.keys())}"); return None
238
- else: logger.warning(f"[Apify Async] Dataset {dataset_id} was empty for {video_url}. Response: {results}"); return None
239
-
240
- except httpx.TimeoutException as e: logger.error(f"[Apify Async] Timeout during API interaction for {video_url}: {e}"); return None
241
- except httpx.HTTPStatusError as e: logger.error(f"[Apify Async] HTTP Status Error during API interaction for {video_url}: {e}"); return None
242
- except httpx.RequestError as e: logger.error(f"[Apify Async] Request error during API interaction for {video_url}: {e}"); return None
243
- except Exception as e: logger.error(f"[Apify Async] Unexpected error during Apify Async REST call for {video_url}: {e}", exc_info=True); return None
244
 
245
 
246
  # (get_youtube_transcript, get_website_content, get_website_content_via_api, generate_summary remain the same)
@@ -268,11 +248,11 @@ async def get_youtube_transcript(video_id: str, video_url: str) -> Optional[str]
268
  else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
269
  else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
270
  if transcript_text is None:
271
- logger.info("[Fallback YT 2] Trying Apify REST API (Async)...")
272
  if APIFY_API_TOKEN:
273
- transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN)
274
- if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify ASYNC REST for {video_url}"); return transcript_text
275
- else: logger.warning(f"[Fallback YT 2] Apify ASYNC REST failed or no content for {video_url}.")
276
  else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
277
  if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
278
  return transcript_text
@@ -325,6 +305,7 @@ async def get_website_content_via_api(url: str, api_key: str) -> Optional[str]:
325
  except httpx.RequestError as e: logger.error(f"[Fallback Web API] Request error connecting to urltotext.com API for {url}: {e}"); return None
326
  except Exception as e: logger.error(f"[Fallback Web API] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
327
 
 
328
  async def generate_summary(text: str, summary_type: str) -> str:
329
  global OPENROUTER_API_KEY, OPENROUTER_MODEL
330
  logger.info(f"Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")
 
1
+ # main.py (Correcting Supadata URL and Apify Endpoint/Logic)
2
  import os
3
  import re
4
  import logging
 
71
  WEBHOOK_SECRET = get_secret('WEBHOOK_SECRET')
72
 
73
  OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "deepseek/deepseek-chat-v3-0324:free")
74
+ APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID", "karamelo~youtube-transcripts") # Keep karamelo as per docs
75
 
76
  if not TELEGRAM_TOKEN: logger.critical("❌ FATAL: TELEGRAM_TOKEN not found."); raise RuntimeError("Exiting: Telegram token missing.")
77
  if not OPENROUTER_API_KEY: logger.error("❌ ERROR: OPENROUTER_API_KEY not found. Summarization will fail.")
 
132
  if not video_id: logger.error("[Supadata] No video_id provided"); return None
133
  if not api_key: logger.error("[Supadata] API key missing."); return None
134
  logger.info(f"[Supadata] Attempting fetch for video ID: {video_id}")
135
+ # *** FIX: Use correct base URL ***
136
+ api_endpoint = "https://api.supadata.ai/v1/youtube/transcript"
137
  params = {"videoId": video_id, "format": "text"}; headers = {"X-API-Key": api_key}
138
  try:
139
  async with httpx.AsyncClient(timeout=30.0) as client:
 
145
  except json.JSONDecodeError: data = None
146
  content = None
147
  if data: content = data if isinstance(data, str) else data.get("transcript") or data.get("text") or data.get("data")
148
+ if not content and response.text: content = response.text # Check plain text response
149
  if content and isinstance(content, str): logger.info(f"[Supadata] Success for {video_id}. Length: {len(content)}"); return content.strip()
150
  else: logger.warning(f"[Supadata] Success but content empty/invalid for {video_id}. Response: {response.text[:200]}"); return None
151
  except Exception as e: logger.error(f"[Supadata] Error processing success response for {video_id}: {e}", exc_info=True); return None
 
154
  else: logger.error(f"[Supadata] Unexpected status {response.status_code} for {video_id}. Resp: {response.text[:200]}"); return None
155
  except httpx.TimeoutException: logger.error(f"[Supadata] Timeout connecting for {video_id}"); return None
156
  except httpx.RequestError as e:
157
+ # SSL errors usually fall under RequestError
158
  if "CERTIFICATE_VERIFY_FAILED" in str(e): logger.error(f"[Supadata] SSL Cert Verify Failed for {video_id}: {e}")
159
  else: logger.error(f"[Supadata] Request error for {video_id}: {e}")
160
  return None
161
  except Exception as e: logger.error(f"[Supadata] Unexpected error for {video_id}: {e}", exc_info=True); return None
162
 
 
163
  async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
164
+ """Fetches YouTube transcript using Apify REST API (run-sync-get-dataset-items endpoint)."""
165
+ global APIFY_ACTOR_ID # Should be karamelo~youtube-transcripts
166
+ if not video_url: logger.error("[Apify SyncItems] No video_url provided"); return None
167
+ if not api_token: logger.error("[Apify SyncItems] API token missing."); return None
168
+ logger.info(f"[Apify SyncItems] Attempting fetch for URL: {video_url} (Actor: {APIFY_ACTOR_ID})")
169
+
170
+ # *** FIX: Use the run-sync-get-dataset-items endpoint ***
171
+ sync_items_endpoint = f"https://api.apify.com/v2/acts/{APIFY_ACTOR_ID}/run-sync-get-dataset-items"
172
+ params = {"token": api_token} # Token in param as per OpenAPI spec for this endpoint
173
+ payload = {
174
+ "urls": [video_url],
175
+ "outputFormat": "singleStringText",
176
+ "maxRetries": 5,
177
+ "channelHandleBoolean": False, "channelNameBoolean": False,
178
+ "datePublishedBoolean": False, "relativeDateTextBoolean": False,
179
+ }
180
+ headers = {"Content-Type": "application/json"} # No Auth header needed if token in params
181
 
182
  try:
183
+ # Use a longer timeout for this synchronous endpoint
184
+ async with httpx.AsyncClient(timeout=120.0) as client:
185
+ log_headers = {k: v for k, v in headers.items()} # Log headers
186
+ logger.debug(f"[Apify SyncItems] POST Request Details:\nURL: {sync_items_endpoint}\nParams: {params}\nHeaders: {log_headers}\nPayload: {json.dumps(payload)}")
187
+ # *** FIX: POST to the sync items endpoint ***
188
+ response = await client.post(sync_items_endpoint, headers=headers, params=params, json=payload)
189
+ logger.debug(f"[Apify SyncItems] Received status code {response.status_code} for {video_url}")
190
+
191
+ # *** FIX: Expect 200 OK for this endpoint ***
192
+ if response.status_code == 200:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  try:
194
+ # Response body *is* the dataset items array
195
+ results = response.json()
196
+ if isinstance(results, list) and len(results) > 0:
197
+ item = results[0]
198
+ # Parsing logic (same as before)
199
+ content = None
200
+ if "captions" in item and isinstance(item["captions"], str): content = item["captions"]
201
+ elif "text" in item and isinstance(item["text"], str): content = item["text"]
202
+ elif "transcript" in item and isinstance(item["transcript"], str): content = item["transcript"]
203
+ elif "captions" in item and isinstance(item["captions"], list):
204
+ logger.warning("[Apify SyncItems] Received list format for 'captions' unexpectedly. Processing...")
205
+ if len(item["captions"]) > 0 and isinstance(item["captions"][0], dict) and 'text' in item["captions"][0]: content = " ".join(line.get("text", "") for line in item["captions"] if line.get("text"))
206
+ elif len(item["captions"]) > 0 and isinstance(item["captions"][0], str): content = " ".join(item["captions"])
207
+
208
+ if content and isinstance(content, str): logger.info(f"[Apify SyncItems] Success via REST for {video_url}. Length: {len(content)}"); return content.strip()
209
+ else: logger.warning(f"[Apify SyncItems] Dataset item parsed but transcript content empty/invalid format for {video_url}. Item keys: {list(item.keys())}"); return None
210
+ else: logger.warning(f"[Apify SyncItems] Actor success but dataset was empty for {video_url}. Response: {results}"); return None
211
+ except json.JSONDecodeError: logger.error(f"[Apify SyncItems] Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}"); return None
212
+ except Exception as e: logger.error(f"[Apify SyncItems] Error processing success response for {video_url}: {e}", exc_info=True); return None
213
+ elif response.status_code == 400: logger.error(f"[Apify SyncItems] Bad Request (400) for {video_url}. Check payload. Resp:{response.text[:200]}"); return None
214
+ elif response.status_code == 401: logger.error("[Apify SyncItems] Auth error (401). Check token."); return None
215
+ elif response.status_code == 404: # This was the error before, log details if it happens again
216
+ error_info = ""; try: error_info = response.json().get("error", {}).get("message", "") except Exception: pass
217
+ logger.error(f"[Apify SyncItems] Endpoint/Actor Not Found (404). Error: '{error_info}' Resp:{response.text[:200]}"); return None
218
+ else: logger.error(f"[Apify SyncItems] Unexpected status {response.status_code} for {video_url}. Resp:{response.text[:200]}"); return None
219
+
220
+ except httpx.TimeoutException as e: logger.error(f"[Apify SyncItems] Timeout during API interaction for {video_url}: {e}"); return None
221
+ except httpx.HTTPStatusError as e: logger.error(f"[Apify SyncItems] HTTP Status Error during API interaction for {video_url}: {e}"); return None
222
+ except httpx.RequestError as e: logger.error(f"[Apify SyncItems] Request error during API interaction for {video_url}: {e}"); return None
223
+ except Exception as e: logger.error(f"[Apify SyncItems] Unexpected error during Apify SyncItems REST call for {video_url}: {e}", exc_info=True); return None
 
 
 
 
 
 
224
 
225
 
226
  # (get_youtube_transcript, get_website_content, get_website_content_via_api, generate_summary remain the same)
 
248
  else: logger.warning(f"[Fallback YT 1] Supadata failed or no content for {video_id}.")
249
  else: logger.warning("[Fallback YT 1] Supadata API key unavailable. Skipping.")
250
  if transcript_text is None:
251
+ logger.info("[Fallback YT 2] Trying Apify REST API (SyncItems)...") # Updated log
252
  if APIFY_API_TOKEN:
253
+ transcript_text = await get_transcript_via_apify(video_url, APIFY_API_TOKEN) # Calls updated func
254
+ if transcript_text: logger.info(f"[Fallback YT 2] Success via Apify SyncItems REST for {video_url}"); return transcript_text
255
+ else: logger.warning(f"[Fallback YT 2] Apify SyncItems REST failed or no content for {video_url}.")
256
  else: logger.warning("[Fallback YT 2] Apify API token unavailable. Skipping.")
257
  if transcript_text is None: logger.error(f"All methods failed for YT transcript: {video_id}"); return None
258
  return transcript_text
 
305
  except httpx.RequestError as e: logger.error(f"[Fallback Web API] Request error connecting to urltotext.com API for {url}: {e}"); return None
306
  except Exception as e: logger.error(f"[Fallback Web API] Unexpected error during urltotext.com API call for {url}: {e}", exc_info=True); return None
307
 
308
+
309
  async def generate_summary(text: str, summary_type: str) -> str:
310
  global OPENROUTER_API_KEY, OPENROUTER_MODEL
311
  logger.info(f"Generating {summary_type} summary using {OPENROUTER_MODEL}. Input length: {len(text)}")