Update main.py
main.py
CHANGED
@@ -482,21 +482,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
     logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
     sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
     params = {"token": api_token}
+
+    # --- Define base input, adjust for specific actors ---
     run_input = { "startUrls": [{"url": url}], "maxCrawlPages": 1, "crawlerType": "playwright:firefox" }
     is_yt_actor = actor_id == APIFY_ACTOR_ID
     log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
 
     if is_yt_actor:
+        # Use input specific to the default YT actor
+        # REMOVED outputFormat: "singleStringText" as it seems unreliable based on observed output
+        run_input = { "urls": [url], # Pass the URL correctly
+                      # "outputFormat": "singleStringText", # <<< REMOVED THIS LINE
+                      "maxRetries": 5, # Keep retries
+                      # Keep other flags as they might affect which data is returned overall
                       "channelHandleBoolean": False,
                       "channelNameBoolean": False,
                       "datePublishedBoolean": False,
                       "relativeDateTextBoolean": False }
+        logger.debug(f"{log_prefix} Using YouTube-specific input (default array output expected)")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
         run_input = { "urls": [url] }
         logger.debug(f"{log_prefix} Using simplified input for Text Scraper: {run_input}")
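Note: Apify's run-sync-get-dataset-items endpoint runs the actor synchronously and returns the run's dataset items as a JSON array, which is why the code further down works with results[0]. A minimal sketch of such a call with httpx, assuming a plausible client setup and timeout (the actual request logic sits outside these hunks, so the helper name and values here are illustrative, not main.py's):

    import httpx

    async def run_actor_sync(actor_id: str, api_token: str, run_input: dict) -> list:
        # Same endpoint shape as sync_items_endpoint in the hunk above.
        endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
        async with httpx.AsyncClient(timeout=120.0) as client:  # timeout is an assumption
            # The token travels as a query parameter; the actor input is the JSON body.
            response = await client.post(endpoint, params={"token": api_token}, json=run_input)
            response.raise_for_status()
            return response.json()  # list of dataset items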
@@ -517,7 +520,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
         results = response.json(); content = None
         if isinstance(results, list) and len(results) > 0:
             item = results[0]
-            # Optional:
+            # Optional: Re-enable for deep debugging if needed
             # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
             content = None # Reset content
 
@@ -532,34 +535,45 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                 logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                 content = item["markdown"]
             elif "captions" in item and isinstance(item["captions"], str):
+                # This case might still happen if the actor *sometimes* returns string
                 logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                 content = item["captions"]
+            # --- MODIFIED LIST HANDLING FOR CAPTIONS (Checks Documentation Examples) ---
             elif "captions" in item and isinstance(item["captions"], list):
+                logger.info(f"{log_prefix} Found 'captions' field as a list. Processing based on documentation examples...")
                 transcript_parts = []
+                if not item["captions"]: # Handle empty list case
+                    logger.warning(f"{log_prefix} 'captions' field is an empty list.")
+                else:
+                    # Check the type of the *first* element to decide parsing strategy
+                    first_element = item["captions"][0]
+                    if isinstance(first_element, str):
+                        # Assume list of strings (Example 1 in docs)
+                        logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
+                        transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
+                    elif isinstance(first_element, dict) and "text" in first_element:
+                        # Assume list of dictionaries (Example 2 in docs)
+                        logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
+                        transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
+                    else:
+                        logger.warning(f"{log_prefix} 'captions' list contains unexpected element types (first element type: {type(first_element)}). Cannot parse.")
+
                 if transcript_parts:
                     content = " ".join(transcript_parts).strip()
                     logger.info(f"{log_prefix} Successfully extracted transcript from list in 'captions'. Combined length: {len(content)}")
                 else:
+                    logger.warning(f"{log_prefix} Could not extract usable text from 'captions' list structure.")
-            # --- END LIST HANDLING ---
+            # --- END MODIFIED LIST HANDLING ---
             elif "html" in item and isinstance(item["html"], str):
                 logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")
+                def parse_html_sync(html_str):
                     try:
                         soup = BeautifulSoup(html_str, DEFAULT_PARSER)
                         return " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                     except Exception as e:
                         logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
                         return None
+                content = await asyncio.to_thread(parse_html_sync, item["html"])
 
             # --- FINAL CONTENT CHECK ---
             if content and isinstance(content, str) and len(content) > 30:
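The two list shapes handled above match the captions examples in the actor documentation: a plain list of strings, and a list of segment dictionaries carrying a "text" key. A standalone sketch of the same first-element type sniffing, using hypothetical sample items rather than real actor output:

    # Hypothetical items; field names other than "captions"/"text" and all values are illustrative.
    item_strings = {"captions": ["first line", "second line"]}
    item_dicts = {"captions": [{"start": "0:00", "text": "first line"},
                               {"start": "0:04", "text": "second line"}]}

    def join_captions(captions: list) -> str:
        # Mirrors the branch above: inspect the first element to pick a strategy.
        if not captions:
            return ""
        if isinstance(captions[0], str):
            parts = [seg for seg in captions if isinstance(seg, str)]
        elif isinstance(captions[0], dict):
            parts = [seg.get("text", "") for seg in captions if isinstance(seg, dict)]
        else:
            parts = []
        return " ".join(parts).strip()

    assert join_captions(item_strings["captions"]) == join_captions(item_dicts["captions"]) == "first line second line"

Sniffing only the first element assumes a homogeneous list; mixed lists simply fall through the per-element isinstance filters, which is the same trade-off the diff makes.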
@@ -595,23 +609,21 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
     except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
     except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
 
+# --- Ensure YT Transcript function uses the generic one ---
+# You can simplify the get_transcript_via_apify function now
+
+async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
+    """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
+    global APIFY_ACTOR_ID
+    # The specific run_input logic is now handled within _run_apify_actor_for_web_content
+    # when it detects the actor_id matches APIFY_ACTOR_ID
+    logger.debug(f"[get_transcript_via_apify] Calling generic runner for URL: {video_url}")
+    return await _run_apify_actor_for_web_content(
+        url=video_url, # Pass video_url as the 'url' parameter
+        api_token=api_token,
+        actor_id=APIFY_ACTOR_ID,
+        actor_name="Apify YT" # Keep specific name for logging clarity
+    )
 
 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
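With the old commented-out block replaced by this thin wrapper, callers only supply the video URL and an API token; the YouTube-specific run_input is assembled inside the generic runner. A hypothetical usage sketch, assuming it runs in the same module as the function above (the environment variable name and the URL are placeholders, not taken from main.py):

    import asyncio
    import os

    async def demo() -> None:
        token = os.environ["APIFY_API_TOKEN"]  # assumed variable name
        transcript = await get_transcript_via_apify(
            "https://www.youtube.com/watch?v=VIDEO_ID", token)
        if transcript:
            print(f"Transcript length: {len(transcript)}")
        else:
            print("Apify fallback returned no usable transcript.")

    if __name__ == "__main__":
        asyncio.run(demo())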