Spaces:

fmab777
/

telegram-summary-bot

Running

App Files Community

fmab777 committed 11 days ago

Commit

97fb38c

verified ·

1 Parent(s): 4afcd87

Update main.py

Browse files

Files changed (1) hide show

main.py +32 -24

main.py CHANGED Viewed

@@ -528,20 +528,32 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
             "channelHandleBoolean": False,
             "channelNameBoolean": False,
             "datePublishedBoolean": False,
-            "relativeDateTextBoolean": False
         }
-        logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
-    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- MODIFIED THIS BLOCK --- >>>
         # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
-        # Based on the error message "Field input.urls must be string"
         run_input = {
-            "urls": url # <<< STRING format needed here, not list
         }
-        logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
         # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
         run_input = {
             "urls": [url] # <<< Assume LIST format standard here
         }
         logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
     elif actor_id == APIFY_CRAWLER_ACTOR_ID:
@@ -549,24 +561,30 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
          run_input = {
             "startUrls": [{"url": url}], # <<< Different structure entirely
             "maxCrawlPages": 1,
-            "crawlerType": "playwright:firefox" # Or adjust as needed
          }
          logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
     else:
         # Fallback default input if actor ID doesn't match known ones
-        # Using the simple {"urls": [url]} format seems safest for generic text/content extractors
         run_input = {"urls": [url]} # <<< Default to LIST
         logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
     headers = {"Content-Type": "application/json"}
     try:
-        async with httpx.AsyncClient(timeout=120.0) as client:
-            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent (using dumps for clarity)
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
             logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
-            # --- Start of response processing (Remains the same as before) ---
             if response.status_code in [200, 201]:
                 if response.status_code == 201:
                     logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
@@ -575,8 +593,6 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                     results = response.json(); content = None
                     if isinstance(results, list) and len(results) > 0:
                         item = results[0]
-                        # Optional: Re-enable for deep debugging if needed
-                        # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
                         content = None # Reset content
                         # --- REFINED PARSING LOGIC (Handles output from various actors) ---
@@ -590,24 +606,20 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                             logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                             content = item["markdown"]
                         elif "captions" in item and isinstance(item["captions"], str):
-                            # This case might still happen if the actor *sometimes* returns string
                             logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                             content = item["captions"]
-                        # --- MODIFIED LIST HANDLING FOR CAPTIONS (Checks Documentation Examples) ---
                         elif "captions" in item and isinstance(item["captions"], list):
-                            logger.info(f"{log_prefix} Found 'captions' field as a list. Processing based on documentation examples...")
                             transcript_parts = []
                             if not item["captions"]: # Handle empty list case
                                 logger.warning(f"{log_prefix} 'captions' field is an empty list.")
                             else:
-                                # Check the type of the *first* element to decide parsing strategy
                                 first_element = item["captions"][0]
                                 if isinstance(first_element, str):
-                                    # Assume list of strings (Example 1 in docs)
                                     logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
                                     transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
                                 elif isinstance(first_element, dict) and "text" in first_element:
-                                    # Assume list of dictionaries (Example 2 in docs)
                                     logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
                                     transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
                                 else:
@@ -635,18 +647,15 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                             logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
                             return content.strip()
                         else:
-                             # Log failure after trying all parsing methods
                              content_len = len(content) if content and isinstance(content, str) else 0
                              item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
                              logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
                              return None # Return None if no valid content found
                     else:
-                        # Handle empty dataset list '[]' or non-list response
                         logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
                         return None
                 # --- End of success processing logic ---
                 except json.JSONDecodeError:
-                    # Check if the raw text looks like a transcript if JSON fails
                     raw_text = response.text
                     if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
                         logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
@@ -659,7 +668,6 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                     return None
             # Error handling for API call itself
             elif response.status_code == 400:
-                 # Log the specific error message from the API response if available
                  error_msg = response.text[:200] # Default
                  try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
                  except Exception: pass
@@ -671,7 +679,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                 return None
     # Error handling for network/client issues
     except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
-    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None # Should be caught by status code checks, but good practice
     except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
     except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None

             "channelHandleBoolean": False,
             "channelNameBoolean": False,
             "datePublishedBoolean": False,
+            "relativeDateTextBoolean": False,
+            # --- ADDED RESIDENTIAL PROXY CONFIG ---
+            "proxyConfiguration": {
+                "useApifyProxy": True,
+                "apifyProxyGroups": ["RESIDENTIAL"]
+            }
+            # --- END ADDED PROXY CONFIG ---
         }
+        logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID}) with Residential Proxy")
+    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID:
         # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
         run_input = {
+            "urls": url, # <<< STRING format needed here, not list
+             # --- ADDED RESIDENTIAL PROXY CONFIG ---
+            "proxyConfiguration": {
+                "useApifyProxy": True,
+                "apifyProxyGroups": ["RESIDENTIAL"]
+            }
+            # --- END ADDED PROXY CONFIG ---
         }
+        logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID}) with Residential Proxy")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
         # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
         run_input = {
             "urls": [url] # <<< Assume LIST format standard here
+            # Note: Proxy config not added here by default, could be added if needed
         }
         logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
     elif actor_id == APIFY_CRAWLER_ACTOR_ID:
          run_input = {
             "startUrls": [{"url": url}], # <<< Different structure entirely
             "maxCrawlPages": 1,
+            "crawlerType": "playwright:firefox", # Or adjust as needed
+            # Note: Proxy config not added here by default, but Website Crawler often needs it.
+            # Example if needed:
+            # "proxyConfiguration": {
+            #     "useApifyProxy": True,
+            #     "apifyProxyGroups": ["RESIDENTIAL"] # Or other groups
+            # }
          }
          logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
     else:
         # Fallback default input if actor ID doesn't match known ones
         run_input = {"urls": [url]} # <<< Default to LIST
         logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
     headers = {"Content-Type": "application/json"}
     try:
+        # Increased timeout for potentially longer residential proxy connections/actor runs
+        async with httpx.AsyncClient(timeout=180.0) as client:
+            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
             logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
+            # --- Start of response processing ---
             if response.status_code in [200, 201]:
                 if response.status_code == 201:
                     logger.info(f"{log_prefix} Received status 201 (Created) from run-sync endpoint, processing results anyway.")
                     results = response.json(); content = None
                     if isinstance(results, list) and len(results) > 0:
                         item = results[0]
                         content = None # Reset content
                         # --- REFINED PARSING LOGIC (Handles output from various actors) ---
                             logger.info(f"{log_prefix} Found text content in 'markdown' field.")
                             content = item["markdown"]
                         elif "captions" in item and isinstance(item["captions"], str):
                             logger.info(f"{log_prefix} Found text content directly in 'captions' field (string).")
                             content = item["captions"]
+                        # --- MODIFIED LIST HANDLING FOR CAPTIONS ---
                         elif "captions" in item and isinstance(item["captions"], list):
+                            logger.info(f"{log_prefix} Found 'captions' field as a list. Processing...")
                             transcript_parts = []
                             if not item["captions"]: # Handle empty list case
                                 logger.warning(f"{log_prefix} 'captions' field is an empty list.")
                             else:
                                 first_element = item["captions"][0]
                                 if isinstance(first_element, str):
                                     logger.debug(f"{log_prefix} Detected list of strings format in 'captions'.")
                                     transcript_parts = [seg for seg in item["captions"] if isinstance(seg, str)]
                                 elif isinstance(first_element, dict) and "text" in first_element:
                                     logger.debug(f"{log_prefix} Detected list of dictionaries format in 'captions'.")
                                     transcript_parts = [seg.get("text", "") for seg in item["captions"] if isinstance(seg, dict) and "text" in seg]
                                 else:
                             logger.info(f"{log_prefix} Success via REST (Status {response.status_code}) for {url}. Final content length: {len(content)}")
                             return content.strip()
                         else:
                              content_len = len(content) if content and isinstance(content, str) else 0
                              item_keys_str = list(item.keys()) if isinstance(item, dict) else "N/A"
                              logger.warning(f"{log_prefix} Dataset item parsed (Status {response.status_code}) but final content empty/short/invalid format after checking all known keys for {url}. Item keys: {item_keys_str}. Length: {content_len}")
                              return None # Return None if no valid content found
                     else:
                         logger.warning(f"{log_prefix} Actor call successful (Status {response.status_code}) but dataset was empty or not a list for {url}. Response type: {type(results)}. Response sample: {str(results)[:200]}")
                         return None
                 # --- End of success processing logic ---
                 except json.JSONDecodeError:
                     raw_text = response.text
                     if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
                         logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
                     return None
             # Error handling for API call itself
             elif response.status_code == 400:
                  error_msg = response.text[:200] # Default
                  try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
                  except Exception: pass
                 return None
     # Error handling for network/client issues
     except httpx.TimeoutException as e: logger.error(f"{log_prefix} Timeout during API interaction for {url}: {e}"); return None
+    except httpx.HTTPStatusError as e: logger.error(f"{log_prefix} HTTP Status Error during API interaction for {url}: {e}"); return None
     except httpx.RequestError as e: logger.error(f"{log_prefix} Request error during API interaction for {url}: {e}"); return None
     except Exception as e: logger.error(f"{log_prefix} Unexpected error during {actor_name} call for {url}: {e}", exc_info=True); return None