Spaces:

DrishtiSharma
/

chat-w-google-patents

Sleeping

App Files Files Community

DrishtiSharma committed on Dec 19, 2024

Commit

661cade

verified ·

1 Parent(s): be9b225

Update patent_downloader.py

Browse files

Files changed (1) hide show

patent_downloader.py +76 -37

patent_downloader.py CHANGED Viewed

@@ -1,14 +1,15 @@
 from typing import List, Union, Optional
 import os
 import requests
-import re
 import pandas as pd
-from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
-import chromedriver_autoinstaller
 class PatentDownloader:
@@ -20,13 +21,48 @@ class PatentDownloader:
         """
         self.verbose = verbose
         self.chrome_path = self.install_chrome()
     def install_chrome(self) -> str:
         """
-        Install ChromeDriver dynamically.
         """
-        chromedriver_autoinstaller.install()
-        return "chromedriver"
     def download(self, patent: Union[str, List[str]], output_path: str = "./") -> None:
         """
@@ -39,46 +75,56 @@ class PatentDownloader:
     def get_pdf(self, patent: str, output_path: str = "./") -> None:
         """
-        Download a single patent PDF by extracting the link from the meta tag.
         """
-        # Setup headless browser
         chrome_options = Options()
         chrome_options.add_argument("--headless")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-dev-shm-usage")
-        driver = webdriver.Chrome(service=Service(), options=chrome_options)
         try:
-            # Open the patent page
-            patent_url = f"{self.url}/patent/{patent}/en"
-            print(f"Accessing: {patent_url}")
-            driver.get(patent_url)
-            # Parse the page source
             soup = BeautifulSoup(driver.page_source, "html.parser")
-            # Extract the PDF URL from meta tag
             meta_tag = soup.find("meta", {"name": "citation_pdf_url"})
             if not meta_tag:
-                raise FileNotFoundError(f"PDF link not found in meta tags for patent {patent}")
             pdf_url = meta_tag["content"]
-            print(f"Found PDF URL: {pdf_url}")
-            # Download the PDF
             file_path = os.path.join(output_path, f"{patent}.pdf")
             self.download_pdf(pdf_url, file_path)
-            print(f"PDF saved successfully: {file_path}")
         except Exception as e:
-            print(f"Error processing patent {patent}: {e}")
             raise FileNotFoundError(f"Failed to process patent: {patent}")
         finally:
             driver.quit()
     def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./") -> None:
         """
-        Download multiple patents from a list or file.
         """
         if isinstance(patents, str):
             if patents.endswith('.csv'):
@@ -87,24 +133,11 @@ class PatentDownloader:
                 with open(patents, 'r') as file:
                     patents = file.read().splitlines()
             else:
-                raise ValueError("Unsupported file type. Use CSV or TXT files.")
-        for i, patent in enumerate(patents):
-            print(f"Downloading {i + 1}/{len(patents)}: {patent}")
             self.get_pdf(patent, output_path)
-    @staticmethod
-    def download_pdf(pdf_url: str, file_path: str):
-        """
-        Download the PDF file from the given URL and save it locally.
-        """
-        response = requests.get(pdf_url, stream=True)
-        response.raise_for_status()
-        with open(file_path, "wb") as file:
-            for chunk in response.iter_content(chunk_size=8192):
-                file.write(chunk)
-        print(f"PDF downloaded: {file_path}")
 def validate_directory(directory: str) -> None:
     """
@@ -112,3 +145,9 @@ def validate_directory(directory: str) -> None:
     """
     if not os.path.exists(directory):
         os.makedirs(directory)

 from typing import List, Union, Optional
 import os
 import requests
 import pandas as pd
+import shutil
+import time
 from urllib.parse import urljoin
+from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
+import subprocess
 class PatentDownloader:
         """
         self.verbose = verbose
         self.chrome_path = self.install_chrome()
+        self.chromedriver_path = self.install_chromedriver()
     def install_chrome(self) -> str:
         """
         Ensure Google Chrome is installed and return the path to its binary.

         If no ``google-chrome`` executable is found on PATH, the stable
         amd64 ``.deb`` package is fetched with ``wget`` and installed with
         ``apt-get``. The downloaded package file is removed afterwards,
         whether or not the installation succeeded.

         Returns:
             The path of the ``google-chrome`` executable as resolved on PATH.

         Raises:
             subprocess.CalledProcessError: If ``wget`` or ``apt-get`` exits
                 with a non-zero status.
             FileNotFoundError: If ``wget`` or ``apt-get`` is not available.
             ValueError: If Chrome is still not on PATH after installation.
         """
         # Return the location that was actually resolved instead of assuming
         # /usr/bin/google-chrome, so an install elsewhere on PATH still works.
         chrome_path = shutil.which("google-chrome")
         if chrome_path is None:
             print("Installing Google Chrome...")
             deb_file = "chrome.deb"
             try:
                 # Argument lists (no shell) avoid quoting/injection pitfalls.
                 subprocess.run(
                     [
                         "wget",
                         "https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb",
                         "-O",
                         deb_file,
                     ],
                     check=True,
                 )
                 subprocess.run(["apt-get", "update"], check=True)
                 subprocess.run(
                     ["apt-get", "install", "-y", f"./{deb_file}"],
                     check=True,
                 )
             finally:
                 # Do not leave a (possibly partial) package file behind.
                 if os.path.exists(deb_file):
                     os.remove(deb_file)
             chrome_path = shutil.which("google-chrome")
         if chrome_path is None:
             raise ValueError("Failed to install Google Chrome.")
         return chrome_path
+    def install_chromedriver(self) -> str:
+        """
+        Ensure ChromeDriver is installed and return its path.
         """
+        chromedriver_path = "/usr/local/bin/chromedriver"
+        if not os.path.isfile(chromedriver_path):
+            print("Installing ChromeDriver...")
+            subprocess.run(
+                "wget https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip -O chromedriver.zip",
+                shell=True,
+                check=True
+            )
+            subprocess.run("unzip chromedriver.zip -d /usr/local/bin", shell=True, check=True)
+            os.remove("chromedriver.zip")
+            subprocess.run("chmod +x /usr/local/bin/chromedriver", shell=True)
+        if not os.path.isfile(chromedriver_path):
+            raise ValueError("Failed to install ChromeDriver.")
+        return chromedriver_path
     def download(self, patent: Union[str, List[str]], output_path: str = "./") -> None:
         """
     def get_pdf(self, patent: str, output_path: str = "./") -> None:
         """
         Download a single patent PDF using the citation_pdf_url meta tag.

         The patent page ``{self.url}/patent/{patent}/en`` is opened in a
         headless Chrome session, the rendered HTML is parsed, and the PDF
         address is read from the ``<meta name="citation_pdf_url">`` tag. The
         PDF is then saved as ``<output_path>/<patent>.pdf``.

         Args:
             patent: Patent identifier used in the page URL and file name.
             output_path: Directory the PDF is written into.

         Raises:
             FileNotFoundError: If any step fails (page load, missing PDF
                 link, download). The underlying error is attached as the
                 exception's ``__cause__``.
         """
         chrome_options = Options()
         chrome_options.binary_location = self.chrome_path
         for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
             chrome_options.add_argument(flag)
         service = Service(self.chromedriver_path)
         driver = webdriver.Chrome(service=service, options=chrome_options)
         try:
             print(f"Accessing Google Patents for {patent}...")
             driver.get(f"{self.url}/patent/{patent}/en")
             time.sleep(3)  # Wait for page load
             soup = BeautifulSoup(driver.page_source, "html.parser")
             meta_tag = soup.find("meta", {"name": "citation_pdf_url"})
             # A tag without a usable "content" attribute is as useless as a
             # missing tag; report both the same way instead of a KeyError.
             pdf_url = meta_tag.get("content") if meta_tag else None
             if not pdf_url:
                 raise FileNotFoundError(f"No PDF link found for patent: {patent}")
             print(f"PDF link found: {pdf_url}")
             file_path = os.path.join(output_path, f"{patent}.pdf")
             self.download_pdf(pdf_url, file_path)
             print(f"Patent PDF saved to {file_path}")
         except Exception as e:
             print(f"Error: {e}")
             # Chain the original error so the root cause stays in the traceback.
             raise FileNotFoundError(f"Failed to process patent: {patent}") from e
         finally:
             # Always release the browser process, even on failure.
             driver.quit()
+    @staticmethod
+    def download_pdf(pdf_url: str, file_path: str):
+        """
+        Download the PDF file from the given URL.
+        """
+        print("Downloading PDF...")
+        response = requests.get(pdf_url, stream=True)
+        response.raise_for_status()
+        with open(file_path, "wb") as file:
+            for chunk in response.iter_content(chunk_size=8192):
+                file.write(chunk)
+        print(f"PDF successfully downloaded: {file_path}")
     def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./") -> None:
         """
+        Download multiple patents as PDFs.
         """
         if isinstance(patents, str):
             if patents.endswith('.csv'):
                 with open(patents, 'r') as file:
                     patents = file.read().splitlines()
             else:
+                raise ValueError("Unsupported file format. Use CSV or TXT.")
+        for patent in patents:
             self.get_pdf(patent, output_path)
 def validate_directory(directory: str) -> None:
     """
     Ensure that ``directory`` exists, creating it (and any missing parent
     directories) when necessary.

     Args:
         directory: Path of the directory to create if absent.

     Raises:
         OSError: If the directory cannot be created, e.g. because the path
             already exists as a regular file.
     """
     # exist_ok=True makes the call safe when the directory already exists
     # or is created by someone else between a check and the creation.
     os.makedirs(directory, exist_ok=True)
+if __name__ == "__main__":
+    validate_directory("./downloads")
+    downloader = PatentDownloader()
+    downloader.download(patent="US8676427B1", output_path="./downloads")