Spaces:

DrishtiSharma
/

chat-w-google-patents

Running

App Files Files Community

DrishtiSharma committed on Dec 19, 2024

Commit

c7cbec9

verified ·

1 Parent(s): 661cade

Update patent_downloader.py

Browse files

Files changed (1) hide show

patent_downloader.py +98 -81

patent_downloader.py CHANGED Viewed

@@ -1,15 +1,20 @@
 from typing import List, Union, Optional
 import os
 import requests
-import pandas as pd
-import shutil
 import time
-from urllib.parse import urljoin
-from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
-import subprocess
 class PatentDownloader:
@@ -17,126 +22,144 @@ class PatentDownloader:
     def __init__(self, verbose: bool = False):
         """
-        Initialize the Patent Downloader.
         """
         self.verbose = verbose
         self.chrome_path = self.install_chrome()
-        self.chromedriver_path = self.install_chromedriver()
     def install_chrome(self) -> str:
         """
-        Ensure Google Chrome is installed and return the path.
         """
         chrome_path = "/usr/bin/google-chrome"
         if not shutil.which("google-chrome"):
-            print("Installing Google Chrome...")
             subprocess.run(
                 "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb -O chrome.deb",
                 shell=True,
-                check=True
             )
             subprocess.run(
                 "apt-get update && apt-get install -y ./chrome.deb",
                 shell=True,
-                check=True
             )
             os.remove("chrome.deb")
         if not shutil.which("google-chrome"):
-            raise ValueError("Failed to install Google Chrome.")
         return chrome_path
-    def install_chromedriver(self) -> str:
-        """
-        Ensure ChromeDriver is installed and return its path.
-        """
-        chromedriver_path = "/usr/local/bin/chromedriver"
-        if not os.path.isfile(chromedriver_path):
-            print("Installing ChromeDriver...")
-            subprocess.run(
-                "wget https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip -O chromedriver.zip",
-                shell=True,
-                check=True
-            )
-            subprocess.run("unzip chromedriver.zip -d /usr/local/bin", shell=True, check=True)
-            os.remove("chromedriver.zip")
-            subprocess.run("chmod +x /usr/local/bin/chromedriver", shell=True)
-        if not os.path.isfile(chromedriver_path):
-            raise ValueError("Failed to install ChromeDriver.")
-        return chromedriver_path
-    def download(self, patent: Union[str, List[str]], output_path: str = "./") -> None:
         """
-        Download one or multiple patent PDFs.
         """
         if isinstance(patent, list) or os.path.isfile(patent):
-            self.get_pdfs(patent, output_path)
         else:
-            self.get_pdf(patent, output_path)
-    def get_pdf(self, patent: str, output_path: str = "./") -> None:
         """
-        Download a single patent PDF using the citation_pdf_url meta tag.
         """
         chrome_options = Options()
         chrome_options.binary_location = self.chrome_path
         chrome_options.add_argument("--headless")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-dev-shm-usage")
-        service = Service(self.chromedriver_path)
         driver = webdriver.Chrome(service=service, options=chrome_options)
         try:
-            print(f"Accessing Google Patents for {patent}...")
-            driver.get(f"{self.url}/patent/{patent}/en")
-            time.sleep(3)  # Wait for page load
-            soup = BeautifulSoup(driver.page_source, "html.parser")
-            meta_tag = soup.find("meta", {"name": "citation_pdf_url"})
-            if not meta_tag:
-                raise FileNotFoundError(f"No PDF link found for patent: {patent}")
-            pdf_url = meta_tag["content"]
-            print(f"PDF link found: {pdf_url}")
-            file_path = os.path.join(output_path, f"{patent}.pdf")
-            self.download_pdf(pdf_url, file_path)
-            print(f"Patent PDF saved to {file_path}")
         except Exception as e:
-            print(f"Error: {e}")
-            raise FileNotFoundError(f"Failed to process patent: {patent}")
         finally:
             driver.quit()
-    @staticmethod
-    def download_pdf(pdf_url: str, file_path: str):
-        """
-        Download the PDF file from the given URL.
-        """
-        print("Downloading PDF...")
-        response = requests.get(pdf_url, stream=True)
-        response.raise_for_status()
-        with open(file_path, "wb") as file:
-            for chunk in response.iter_content(chunk_size=8192):
-                file.write(chunk)
-        print(f"PDF successfully downloaded: {file_path}")
-    def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./") -> None:
         """
-        Download multiple patents as PDFs.
         """
         if isinstance(patents, str):
-            if patents.endswith('.csv'):
-                patents = pd.read_csv(patents)['patent_number'].tolist()
-            elif patents.endswith('.txt'):
-                with open(patents, 'r') as file:
-                    patents = file.read().splitlines()
             else:
-                raise ValueError("Unsupported file format. Use CSV or TXT.")
-        for patent in patents:
-            self.get_pdf(patent, output_path)
 def validate_directory(directory: str) -> None:
@@ -145,9 +168,3 @@ def validate_directory(directory: str) -> None:
     """
     if not os.path.exists(directory):
         os.makedirs(directory)
-if __name__ == "__main__":
-    validate_directory("./downloads")
-    downloader = PatentDownloader()
-    downloader.download(patent="US8676427B1", output_path="./downloads")

 from typing import List, Union, Optional
 import os
 import requests
+import re
 import time
+import shutil
+import subprocess
+import pandas as pd
 from selenium import webdriver
+from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
+from bs4 import BeautifulSoup
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import chromedriver_autoinstaller
 class PatentDownloader:
     def __init__(self, verbose: bool = False):
         """
+        Parameters
+        ----------
+        verbose : bool
+            Print additional debug information.
         """
         # Stored on the instance; none of the methods visible in this view
         # read it back (their print calls are unconditional).
         self.verbose = verbose
         # install_chrome() may run wget and apt-get through the shell and can
         # raise, so constructing a PatentDownloader is not side-effect free.
         self.chrome_path = self.install_chrome()
     def install_chrome(self) -> str:
         """
+        Download and install Google Chrome dynamically.
+        Returns
+        -------
+        str: Path to the Chrome binary.
         """
         # The return value is always this fixed string. shutil.which() below
         # is only a presence check, so the two can disagree when
         # "google-chrome" is found on PATH at a different location.
         chrome_path = "/usr/bin/google-chrome"
         if not shutil.which("google-chrome"):
+            print("Downloading and installing Google Chrome...")
             # check=True turns a non-zero exit status into
             # subprocess.CalledProcessError; when that happens the
             # os.remove() further down is never reached and chrome.deb is
             # left in the current working directory.
             subprocess.run(
                 "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb -O chrome.deb",
                 shell=True,
+                check=True,
             )
             # NOTE(review): apt-get presumably needs root and a Debian-based
             # image -- confirm for the deployment target.
             subprocess.run(
                 "apt-get update && apt-get install -y ./chrome.deb",
                 shell=True,
+                check=True,
             )
             os.remove("chrome.deb")
         # Second presence check, after the install attempt (or immediately,
         # when Chrome was already found above).
         if not shutil.which("google-chrome"):
+            raise ValueError("Google Chrome installation failed!")
         return chrome_path
+    def download(self, patent: Union[str, List[str]], output_path: str = "./",
+                 waiting_time: int = 10, remove_kind_codes: Optional[List[str]] = None) -> None:
         """
+        Download patent document(s) as PDF.
         """
         # Dispatch only: a list, or a string that names an existing file, is
         # passed to get_pdfs(); any other string is treated as a single
         # patent number and passed to get_pdf(). The remaining arguments are
         # forwarded unchanged in both cases.
         if isinstance(patent, list) or os.path.isfile(patent):
+            self.get_pdfs(patent, output_path, waiting_time, remove_kind_codes)
         else:
+            self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
+    def get_pdf(self, patent: str, output_path: str = "./", waiting_time: int = 10,
+                remove_kind_codes: Optional[List[str]] = None) -> None:
         """
+        Download a single patent PDF.
         """
         # Overall flow: optionally strip kind-code suffixes from the patent
         # number, drive a headless Chrome session that types the number into
         # the search box at self.url, scrape the resulting page for a PDF
         # link, then fetch that link with requests and write it to
         # <output_path>/<patent>.pdf. Nothing is returned; outcomes are
         # reported with print().
+        if remove_kind_codes:
+            for kind_code in remove_kind_codes:
                 # kind_code is inserted into the pattern as-is (not passed
                 # through re.escape) and anchored at the end of the string.
+                patent = re.sub(kind_code + "$", "", patent)
+        # Automatically install ChromeDriver
         # NOTE(review): presumably this fetches a driver matching the
         # installed Chrome and makes it discoverable for the argument-less
         # Service() below -- confirm against chromedriver_autoinstaller docs.
+        chromedriver_autoinstaller.install()
+        # Set up Chrome options
         chrome_options = Options()
         chrome_options.binary_location = self.chrome_path
         chrome_options.add_argument("--headless")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-dev-shm-usage")
+        # Initialize Selenium WebDriver
+        service = Service()
         driver = webdriver.Chrome(service=service, options=chrome_options)
+        pdf_link = None  # Ensure pdf_link is defined
         try:
             # NOTE(review): self.url is not assigned anywhere in this view;
             # presumably it is a class attribute defined in lines the diff
             # does not show -- confirm.
+            driver.get(self.url)
+            # Wait for the search input field and interact with it
+            print("Waiting for the search input field...")
+            search_input_xpath = "//input[@aria-label='Search patents']"
+            WebDriverWait(driver, 20).until(
+                EC.presence_of_element_located((By.XPATH, search_input_xpath))
+            )
+            element = driver.find_element(By.XPATH, search_input_xpath)
+            print("Search input field located.")
+            element.send_keys(patent)
+            element.send_keys(Keys.RETURN)
+            # Wait for search results to load
+            print("Waiting for search results to load...")
             # NOTE(review): a <body> element is presumably already present
             # before the search is submitted, so the fixed sleep of
             # waiting_time seconds below is likely what actually gives the
             # results time to render -- confirm.
+            WebDriverWait(driver, 20).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+            time.sleep(waiting_time)
+            # Parse HTML and get the PDF link
+            soup = BeautifulSoup(driver.page_source, "html.parser")
+            pdf_link = self.get_pdf_link(soup, patent)
         # Any exception raised in the try block is printed and swallowed;
         # pdf_link then keeps its initial None and the "not found" branch at
         # the bottom runs. Nothing is re-raised to the caller.
         except Exception as e:
+            print(f"Error occurred: {e}")
         finally:
             driver.quit()
+        # Download the PDF
+        if pdf_link:
+            validate_directory(output_path)
             # NOTE(review): no timeout and no status check on this request;
             # whatever body the server returns is written to the .pdf file.
+            pdf_content = requests.get(pdf_link).content
             # The file name uses the patent string after any kind-code
             # stripping performed at the top of the method.
+            with open(os.path.join(output_path, f"{patent}.pdf"), "wb") as file:
+                file.write(pdf_content)
+            print(f">>> Patent {patent} successfully downloaded <<<")
+        else:
+            print(f"Error: PDF link for patent {patent} not found!")
+    def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./",
+                 waiting_time: int = 10, remove_kind_codes: Optional[List[str]] = None) -> None:
         """
+        Download multiple patent PDFs from a list or file.
         """
         # A string argument is treated as a path and replaced by the list of
         # patent numbers read from it; a list is used as given.
         if isinstance(patents, str):
             # The suffix tests look at the trailing characters only (no
             # leading dot is required) and ignore case.
+            if patents.lower().endswith('csv'):
                 # The CSV file must contain a 'patent_number' column.
+                df_patents = pd.read_csv(patents)
+                patents = df_patents['patent_number'].to_list()
+            elif patents.lower().endswith('txt'):
                 # Each line of the text file becomes one list entry.
+                with open(patents, 'r') as txt_file:
+                    patents = txt_file.read().splitlines()
             else:
+                raise NotImplementedError(f'Unsupported file type: {patents}')
         # Patents are processed one after another; a countdown of the
         # remaining items (including the current one) is printed before each
         # get_pdf() call.
+        for i, patent in enumerate(patents):
+            print(len(patents) - i, "patent(s) remaining.")
+            self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
+    @staticmethod
+    def get_pdf_link(soup: BeautifulSoup, patent: str) -> Optional[str]:
+        """
+        Extract the PDF link from parsed HTML.
+        """
         # Collect the href of every <a> tag that has one and whose value ends
         # in "pdf", compared case-insensitively.
+        pdf_links = [link['href'] for link in soup.find_all('a', href=True) if link['href'].lower().endswith("pdf")]
         # Return the first collected link that contains the patent string
         # (case-insensitive substring match); None when no link matches.
+        for link in pdf_links:
+            if patent.lower() in link.lower():
+                return link
+        return None
 def validate_directory(directory: str) -> None:
     """
     if not os.path.exists(directory):
         os.makedirs(directory)