Update main.py

main.py CHANGED
@@ -482,93 +482,136 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: str, actor_name: str) -> Optional[str]:
     logger.info(f"[{actor_name}] Attempting fetch for URL: {url} (Actor: {actor_id})")
     sync_items_endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
     params = {"token": api_token}
+    # Define base input, adjust for specific actors
     run_input = { "startUrls": [{"url": url}], "maxCrawlPages": 1, "crawlerType": "playwright:firefox" }
+    is_yt_actor = actor_id == APIFY_ACTOR_ID
+    log_prefix = "[Apify YT]" if is_yt_actor else f"[{actor_name}]"
+
+    if is_yt_actor:
+        # Use input specific to the default YT actor if needed
+        run_input = { "urls": [url],  # 'url' carries the video URL when this is called for YT (see note below)
+                      "outputFormat": "singleStringText",  # keep trying this format
+                      "maxRetries": 5,
+                      "channelHandleBoolean": False,
+                      "channelNameBoolean": False,
+                      "datePublishedBoolean": False,
+                      "relativeDateTextBoolean": False }
+        logger.debug(f"{log_prefix} Using YouTube-specific input: { {k: v for k, v in run_input.items() if k != 'urls'} }")  # Don't log URL twice
+    elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
+        run_input = { "urls": [url] }
+        logger.debug(f"{log_prefix} Using simplified input for Text Scraper: {run_input}")
+    # Add other actor-specific input adjustments here if necessary
+
     headers = {"Content-Type": "application/json"}
     try:
         async with httpx.AsyncClient(timeout=120.0) as client:
+            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url}")
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
+            logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")

             if response.status_code in [200, 201]:
                 if response.status_code == 201:
+                    logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")

                 try:
                     results = response.json(); content = None
                     if isinstance(results, list) and len(results) > 0:
                         item = results[0]
+                        # Optional: Add debug log here again if the next fix doesn't work
+                        # logger.debug(f"{log_prefix} [DEBUG] Full item received: {json.dumps(item, indent=2)}")
                         content = None # Reset content

+                        # --- REFINED PARSING LOGIC ---
                         if "text" in item and isinstance(item["text"], str):
+                            logger.info(f"{log_prefix} Found text content in 'text' field.")
                             content = item["text"]
                         elif "content" in item and isinstance(item["content"], str):
+                            logger.info(f"{log_prefix} Found text content in 'content' field.")
                             content = item["content"]
                         elif "markdown" in item and isinstance(item["markdown"], str):
+                            logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                             content = item["markdown"]
                         elif "captions" in item and isinstance(item["captions"], str):
+                            # This handles if outputFormat=singleStringText actually worked
+                            logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                             content = item["captions"]
+                        # --- ADDED LIST HANDLING FOR CAPTIONS ---
+                        elif "captions" in item and isinstance(item["captions"], list):
+                            logger.info(f"{log_prefix} Found 'captions' field as a list. Attempting to extract text.")
+                            transcript_parts = []
+                            for segment in item["captions"]:
+                                if isinstance(segment, dict) and "text" in segment and isinstance(segment["text"], str):
+                                    transcript_parts.append(segment["text"])
+                                elif isinstance(segment, str): # Handle if it's sometimes just a list of strings
+                                    transcript_parts.append(segment)
+                            if transcript_parts:
+                                content = " ".join(transcript_parts).strip()
+                                logger.info(f"{log_prefix} Successfully extracted transcript from list in 'captions'. Combined length: {len(content)}")
+                            else:
+                                logger.warning(f"{log_prefix} 'captions' field was a list but contained no usable text segments.")
+                        # --- END LIST HANDLING ---
                         elif "html" in item and isinstance(item["html"], str):
                             logger.warning(f"{log_prefix} No direct text/markdown/captions found, attempting to parse 'html' from result.")
+                            def parse_html_sync(html_str): # Define sync function for threading
                                 try:
                                     soup = BeautifulSoup(html_str, DEFAULT_PARSER)
                                     return " ".join(line.strip() for line in soup.get_text(separator='\n', strip=True).splitlines() if line.strip())
                                 except Exception as e:
                                     logger.error(f"{log_prefix} Error parsing HTML in thread: {e}")
                                     return None
+                            content = await asyncio.to_thread(parse_html_sync, item["html"]) # Run in thread

+                        # --- FINAL CONTENT CHECK ---
                         if content and isinstance(content, str) and len(content) > 30:
+                            logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
                             return content.strip()
                         else:
+                            # Log failure after trying all parsing methods
                             content_len = len(content) if content and isinstance(content, str) else 0
                             item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
+                            logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
+                            return None # Return None if no valid content found
                     else:
                         # Handle empty dataset list '[]'
+                        logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty for {url}. Response: {results}")
                         return None
                     # --- End of success processing logic ---
                 except json.JSONDecodeError:
+                    logger.error(f"{log_prefix} Failed JSON decode. Status:{response.status_code}. Resp:{response.text[:200]}")
                     return None
                 except Exception as e:
+                    logger.error(f"{log_prefix} Error processing success response (Status {response.status_code}) for {url}: {e}", exc_info=True)
                     return None
+            # Error handling for API call itself
+            elif response.status_code == 400: logger.error(f"{log_prefix} Bad Request (400) for {url}. Check run_input. Resp:{response.text[:200]}"); return None
+            elif response.status_code == 401: logger.error(f"{log_prefix} Auth error (401). Check token."); return None
+            elif response.status_code == 404: logger.error(f"{log_prefix} Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
+            else:
+                logger.error(f"{log_prefix} Unexpected status {response.status_code} for {url}. Resp:{response.text[:200]}")
                 return None
+    # Error handling for network/client issues
+    except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None # Should be caught by the status checks above, but kept as a safeguard
+    except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
+    except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None
+
+# --- Important Note on Calling This Function ---
+# Make sure that when `get_youtube_transcript` calls `get_transcript_via_apify`,
+# it correctly passes the `video_url`. And if you refactor `get_transcript_via_apify`
+# to use `_run_apify_actor_for_web_content` directly, ensure the correct Apify Actor ID
+# and the `video_url` are passed.
+
+# Example refactor of get_transcript_via_apify (if you choose to do this):
+# async def get_transcript_via_apify(video_url: str, api_token: str) -> Optional[str]:
+#     """Fallback YT 2: Fetches YouTube transcript using default Apify Actor via generic function."""
+#     global APIFY_ACTOR_ID
+#     # Note: The run_input logic specific to the YT actor is now inside _run_apify_actor_for_web_content
+#     return await _run_apify_actor_for_web_content(
+#         url=video_url, # Pass video_url as the 'url' parameter
+#         api_token=api_token,
+#         actor_id=APIFY_ACTOR_ID,
+#         actor_name="Apify YT" # Use specific name for logging
+#     )

 async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
     """Fallback 4: Fetches website content using Apify Website Content Crawler."""
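To sanity-check the refined parsing in isolation, the two `captions` branches can be mirrored as a standalone helper. This is a minimal sketch: `extract_captions` and the sample item shapes below are introduced here for illustration and are not confirmed actor payloads.

from typing import Optional

def extract_captions(item: dict) -> Optional[str]:
    # Mirrors the diff's string branch and the added list-handling branch.
    captions = item.get("captions")
    if isinstance(captions, str):  # outputFormat=singleStringText worked
        return captions.strip() or None
    if isinstance(captions, list):  # segment dicts, or plain strings
        parts = []
        for segment in captions:
            if isinstance(segment, dict) and isinstance(segment.get("text"), str):
                parts.append(segment["text"])
            elif isinstance(segment, str):
                parts.append(segment)
        return " ".join(parts).strip() or None
    return None

# Both hypothetical shapes reduce to the same transcript:
assert extract_captions({"captions": [{"text": "hello"}, {"text": "world"}]}) == "hello world"
assert extract_captions({"captions": ["hello", "world"]}) == "hello world"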
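The delegation pattern sketched in the commented refactor would also fit the crawler fallback that follows. A minimal sketch, assuming the Website Content Crawler's ID is held in a constant named APIFY_CRAWLER_ACTOR_ID (the actual constant name in main.py may differ); the generic helper's default startUrls/maxCrawlPages input already matches this actor:

APIFY_CRAWLER_ACTOR_ID = "apify~website-content-crawler"  # assumed constant and actor ID; verify against main.py

async def get_website_content_via_apify_crawler(url: str, api_token: str) -> Optional[str]:
    """Fallback 4: Fetches website content using Apify Website Content Crawler."""
    return await _run_apify_actor_for_web_content(
        url=url,
        api_token=api_token,
        actor_id=APIFY_CRAWLER_ACTOR_ID,
        actor_name="Apify Crawler",  # appears in log prefixes
    )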