Update main.py

main.py CHANGED
@@ -564,9 +564,9 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
     run_input: Dict[str, Any] = {} # Initialize empty dict
 
     if actor_id == APIFY_ACTOR_ID:
-        # Input specific to the default YT actor (1s7eXiaukVuOr4Ueg)
+        # Input specific to the default YT actor (1s7eXiaukVuOr4Ueg) - Requires LIST
         run_input = {
-            "urls": [url],
+            "urls": [url], # <<< LIST format needed here
             "maxRetries": 5,
             "channelHandleBoolean": False,
             "channelNameBoolean": False,
@@ -574,40 +574,38 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
             "relativeDateTextBoolean": False
         }
         logger.debug(f"{log_prefix} Using input format for Default YT Actor ({APIFY_ACTOR_ID})")
-    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< ---
-        # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR)
-        # Based on the error message "Field input.urls
+    elif actor_id == APIFY_STRUCTURED_YT_ACTOR_ID: # <<< --- MODIFIED THIS BLOCK --- >>>
+        # Input specific to the Structured YT Actor (gpjTCWkGZS1lHc9pR) - Requires STRING
+        # Based on the error message "Field input.urls must be string"
         run_input = {
-            "urls":
-            # Add other specific parameters for this actor if needed/known
+            "urls": url # <<< STRING format needed here, not list
         }
         logger.debug(f"{log_prefix} Using input format for Structured YT Actor ({APIFY_STRUCTURED_YT_ACTOR_ID})")
     elif actor_id == APIFY_TEXT_SCRAPER_ACTOR_ID:
-        # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR)
+        # Input specific to Text Scraper Free (2gbQiRSpJIIag2FdR) - Likely expects LIST
         run_input = {
-            "urls": [url]
+            "urls": [url] # <<< Assume LIST format standard here
         }
         logger.debug(f"{log_prefix} Using input format for Text Scraper ({APIFY_TEXT_SCRAPER_ACTOR_ID})")
     elif actor_id == APIFY_CRAWLER_ACTOR_ID:
-        # Input specific to Website Content Crawler (aYG0l9s7dbB7j3gbS)
+        # Input specific to Website Content Crawler (aYG0l9s7dbB7j3gbS) - Uses startUrls
         run_input = {
-            "startUrls": [{"url": url}],
+            "startUrls": [{"url": url}], # <<< Different structure entirely
             "maxCrawlPages": 1,
             "crawlerType": "playwright:firefox" # Or adjust as needed
-            # Add other parameters specific to the crawler if necessary
         }
         logger.debug(f"{log_prefix} Using input format for Website Content Crawler ({APIFY_CRAWLER_ACTOR_ID})")
     else:
         # Fallback default input if actor ID doesn't match known ones
         # Using the simple {"urls": [url]} format seems safest for generic text/content extractors
-        run_input = {"urls": [url]}
+        run_input = {"urls": [url]} # <<< Default to LIST
         logger.warning(f"{log_prefix} Unknown Actor ID '{actor_id}'. Using default input format: {run_input}")
 
 
     headers = {"Content-Type": "application/json"}
     try:
         async with httpx.AsyncClient(timeout=120.0) as client:
-            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {run_input}") # Log the input being sent
+            logger.debug(f"{log_prefix} POST Request to {sync_items_endpoint} for {url} with input: {json.dumps(run_input)}") # Log the input being sent (using dumps for clarity)
             response = await client.post(sync_items_endpoint, headers=headers, params=params, json=run_input)
             logger.debug(f"{log_prefix} Received status code {response.status_code} for {url}")
 
@@ -624,7 +622,7 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                     # logger.debug(f"{log_prefix} DEBUG] Full item received: {json.dumps(item, indent=2)}")
                     content = None # Reset content
 
-                    # --- REFINED PARSING LOGIC ---
+                    # --- REFINED PARSING LOGIC (Handles output from various actors) ---
                     if "text" in item and isinstance(item["text"], str):
                         logger.info(f"{log_prefix} Found text content in 'text' field.")
                         content = item["text"]
@@ -691,13 +689,24 @@ async def _run_apify_actor_for_web_content(url: str, api_token: str, actor_id: s
                     return None
                # --- End of success processing logic ---
                except json.JSONDecodeError:
-
-
+                    # Check if the raw text looks like a transcript if JSON fails
+                    raw_text = response.text
+                    if raw_text and len(raw_text) > 50 and (' ' in raw_text): # Basic check for textual content
+                        logger.warning(f"{log_prefix} Failed JSON decode, but raw text found. Status:{response.status_code}. Using raw text. Length: {len(raw_text)}")
+                        return raw_text.strip()
+                    else:
+                        logger.error(f"{log_prefix} Failed JSON decode and no usable raw text found. Status:{response.status_code}. Resp:{raw_text[:200]}");
+                        return None
                except Exception as e:
                    logger.error(f"{log_prefix} Error processing success response (Status {response.status_code}) for {url}: {e}", exc_info=True);
                    return None
            # Error handling for API call itself
-            elif response.status_code == 400:
+            elif response.status_code == 400:
+                # Log the specific error message from the API response if available
+                error_msg = response.text[:200] # Default
+                try: error_msg = response.json().get("error", {}).get("message", response.text[:200])
+                except Exception: pass
+                logger.error(f"{log_prefix} Bad Request (400) for {url}. Check run_input. API Msg: '{error_msg}'"); return None
            elif response.status_code == 401: logger.error(f"{log_prefix} Auth error (401). Check token."); return None
            elif response.status_code == 404: logger.error(f"{log_prefix} Endpoint/Actor Not Found (404). Actor: {actor_id} Resp:{response.text[:200]}"); return None
            else:
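The substance of this change is that each Apify actor expects a differently shaped `run_input`. As a quick reference, here is a minimal sketch of the four shapes used above, written as a dispatch table rather than the if/elif chain in main.py. The helper name `build_run_input` and the table are illustrative, not code from this repo; the actor IDs and field values are taken from the comments in the diff.

```python
from typing import Any, Callable, Dict

# Hypothetical refactor sketch: map each known actor ID to a builder that
# returns the input shape that actor expects.
INPUT_BUILDERS: Dict[str, Callable[[str], Dict[str, Any]]] = {
    # Default YT actor (1s7eXiaukVuOr4Ueg): list of URLs plus boolean flags.
    "1s7eXiaukVuOr4Ueg": lambda url: {
        "urls": [url],
        "maxRetries": 5,
        "channelHandleBoolean": False,
        "channelNameBoolean": False,
        "relativeDateTextBoolean": False,
    },
    # Structured YT actor (gpjTCWkGZS1lHc9pR): a single URL string, not a list.
    "gpjTCWkGZS1lHc9pR": lambda url: {"urls": url},
    # Text Scraper Free (2gbQiRSpJIIag2FdR): list of URLs.
    "2gbQiRSpJIIag2FdR": lambda url: {"urls": [url]},
    # Website Content Crawler (aYG0l9s7dbB7j3gbS): startUrls objects.
    "aYG0l9s7dbB7j3gbS": lambda url: {
        "startUrls": [{"url": url}],
        "maxCrawlPages": 1,
        "crawlerType": "playwright:firefox",
    },
}


def build_run_input(actor_id: str, url: str) -> Dict[str, Any]:
    """Return the actor-specific input, falling back to {"urls": [url]}."""
    builder = INPUT_BUILDERS.get(actor_id, lambda u: {"urls": [u]})
    return builder(url)


# Example:
# build_run_input("gpjTCWkGZS1lHc9pR", "https://youtu.be/xyz")
#   -> {"urls": "https://youtu.be/xyz"}   (string, not list)
```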
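For context, the request that sends this input (only partially visible in the hunk above) targets what appears to be Apify's synchronous run endpoint and returns the run's dataset items directly. Below is a self-contained sketch of that call, assuming the standard `https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items` URL with the token passed as a query parameter; the function name and the 200/201 check are illustrative, not lifted from main.py.

```python
from typing import Any, Dict, List, Optional

import httpx


async def run_actor_sync(
    actor_id: str, api_token: str, run_input: Dict[str, Any]
) -> Optional[List[Dict[str, Any]]]:
    """Sketch: POST run_input to an Apify actor and return its dataset items, or None."""
    # Assumed endpoint shape; main.py builds this as `sync_items_endpoint`.
    endpoint = f"https://api.apify.com/v2/acts/{actor_id}/run-sync-get-dataset-items"
    params = {"token": api_token}
    headers = {"Content-Type": "application/json"}

    async with httpx.AsyncClient(timeout=120.0) as client:
        response = await client.post(endpoint, headers=headers, params=params, json=run_input)

    if response.status_code in (200, 201):
        try:
            # The endpoint returns the dataset items as a JSON array.
            return response.json()
        except ValueError:  # json.JSONDecodeError is a subclass of ValueError
            return None
    return None


# Usage (needs a real token):
#   items = asyncio.run(run_actor_sync("2gbQiRSpJIIag2FdR", token, {"urls": ["https://example.com"]}))
```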
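The parsing hunk only shows its first branch, which prefers a string `text` field on each returned item. A generic sketch of that pattern follows; only `"text"` is confirmed by the diff, and the other candidate keys are hypothetical examples of fields other actors might use.

```python
from typing import Any, Dict, Optional

# Only "text" is confirmed by the diff above; the remaining keys are
# hypothetical examples, not taken from main.py.
CANDIDATE_TEXT_FIELDS = ("text", "transcript", "captions", "content", "markdown")


def extract_text(item: Dict[str, Any]) -> Optional[str]:
    """Return the first non-empty string field found in a dataset item."""
    for key in CANDIDATE_TEXT_FIELDS:
        value = item.get(key)
        if isinstance(value, str) and value.strip():
            return value
    return None


# Example:
# extract_text({"text": "hello world"})    -> "hello world"
# extract_text({"title": "no text here"})  -> None
```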
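Finally, the new 400 handler digs the human-readable message out of Apify's error envelope (this is how the `"Field input.urls must be string"` message that motivated the change surfaces), and the new `JSONDecodeError` branch falls back to the raw body when it still looks like usable text. A standalone sketch of both behaviours, under the assumption that errors arrive as `{"error": {"message": ...}}`; the helper names are illustrative.

```python
from typing import Optional

import httpx


def apify_error_message(response: httpx.Response, limit: int = 200) -> str:
    """Best-effort extraction of the message from an Apify-style error envelope."""
    try:
        # Mirrors the diff: fall back to the raw body if the envelope is missing.
        return response.json().get("error", {}).get("message", response.text[:limit])
    except Exception:
        return response.text[:limit]


def raw_text_fallback(response: httpx.Response, min_length: int = 50) -> Optional[str]:
    """Mirror of the JSONDecodeError branch: accept the raw body if it looks like prose."""
    raw_text = response.text
    if raw_text and len(raw_text) > min_length and " " in raw_text:
        return raw_text.strip()
    return None


# Example with a synthetic response:
#   resp = httpx.Response(400, json={"error": {"message": "Field input.urls must be string"}})
#   apify_error_message(resp)  -> "Field input.urls must be string"
```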