DrishtiSharma committed on
Commit
661cade
·
verified ·
1 Parent(s): be9b225

Update patent_downloader.py

Browse files
Files changed (1) hide show
  1. patent_downloader.py +76 -37
patent_downloader.py CHANGED
@@ -1,14 +1,15 @@
1
  from typing import List, Union, Optional
2
  import os
3
  import requests
4
- import re
5
  import pandas as pd
6
- from bs4 import BeautifulSoup
 
7
  from urllib.parse import urljoin
 
8
  from selenium import webdriver
9
  from selenium.webdriver.chrome.service import Service
10
  from selenium.webdriver.chrome.options import Options
11
- import chromedriver_autoinstaller
12
 
13
 
14
  class PatentDownloader:
@@ -20,13 +21,48 @@ class PatentDownloader:
20
  """
21
  self.verbose = verbose
22
  self.chrome_path = self.install_chrome()
 
23
 
24
  def install_chrome(self) -> str:
25
  """
26
- Install ChromeDriver dynamically.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  """
28
- chromedriver_autoinstaller.install()
29
- return "chromedriver"
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def download(self, patent: Union[str, List[str]], output_path: str = "./") -> None:
32
  """
@@ -39,46 +75,56 @@ class PatentDownloader:
39
 
40
  def get_pdf(self, patent: str, output_path: str = "./") -> None:
41
  """
42
- Download a single patent PDF by extracting the link from the meta tag.
43
  """
44
- # Setup headless browser
45
  chrome_options = Options()
 
46
  chrome_options.add_argument("--headless")
47
  chrome_options.add_argument("--no-sandbox")
48
  chrome_options.add_argument("--disable-dev-shm-usage")
49
- driver = webdriver.Chrome(service=Service(), options=chrome_options)
 
 
50
 
51
  try:
52
- # Open the patent page
53
- patent_url = f"{self.url}/patent/{patent}/en"
54
- print(f"Accessing: {patent_url}")
55
- driver.get(patent_url)
56
 
57
- # Parse the page source
58
  soup = BeautifulSoup(driver.page_source, "html.parser")
59
-
60
- # Extract the PDF URL from meta tag
61
  meta_tag = soup.find("meta", {"name": "citation_pdf_url"})
62
  if not meta_tag:
63
- raise FileNotFoundError(f"PDF link not found in meta tags for patent {patent}")
64
 
65
  pdf_url = meta_tag["content"]
66
- print(f"Found PDF URL: {pdf_url}")
67
 
68
- # Download the PDF
69
  file_path = os.path.join(output_path, f"{patent}.pdf")
70
  self.download_pdf(pdf_url, file_path)
71
- print(f"PDF saved successfully: {file_path}")
72
 
73
  except Exception as e:
74
- print(f"Error processing patent {patent}: {e}")
75
  raise FileNotFoundError(f"Failed to process patent: {patent}")
76
  finally:
77
  driver.quit()
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./") -> None:
80
  """
81
- Download multiple patents from a list or file.
82
  """
83
  if isinstance(patents, str):
84
  if patents.endswith('.csv'):
@@ -87,24 +133,11 @@ class PatentDownloader:
87
  with open(patents, 'r') as file:
88
  patents = file.read().splitlines()
89
  else:
90
- raise ValueError("Unsupported file type. Use CSV or TXT files.")
91
 
92
- for i, patent in enumerate(patents):
93
- print(f"Downloading {i + 1}/{len(patents)}: {patent}")
94
  self.get_pdf(patent, output_path)
95
 
96
- @staticmethod
97
- def download_pdf(pdf_url: str, file_path: str):
98
- """
99
- Download the PDF file from the given URL and save it locally.
100
- """
101
- response = requests.get(pdf_url, stream=True)
102
- response.raise_for_status()
103
- with open(file_path, "wb") as file:
104
- for chunk in response.iter_content(chunk_size=8192):
105
- file.write(chunk)
106
- print(f"PDF downloaded: {file_path}")
107
-
108
 
109
  def validate_directory(directory: str) -> None:
110
  """
@@ -112,3 +145,9 @@ def validate_directory(directory: str) -> None:
112
  """
113
  if not os.path.exists(directory):
114
  os.makedirs(directory)
 
 
 
 
 
 
 
1
  from typing import List, Union, Optional
2
  import os
3
  import requests
 
4
  import pandas as pd
5
+ import shutil
6
+ import time
7
  from urllib.parse import urljoin
8
+ from bs4 import BeautifulSoup
9
  from selenium import webdriver
10
  from selenium.webdriver.chrome.service import Service
11
  from selenium.webdriver.chrome.options import Options
12
+ import subprocess
13
 
14
 
15
  class PatentDownloader:
 
21
  """
22
  self.verbose = verbose
23
  self.chrome_path = self.install_chrome()
24
+ self.chromedriver_path = self.install_chromedriver()
25
 
26
  def install_chrome(self) -> str:
27
  """
28
+ Ensure Google Chrome is installed and return the path.
29
+ """
30
+ chrome_path = "/usr/bin/google-chrome"
31
+ if not shutil.which("google-chrome"):
32
+ print("Installing Google Chrome...")
33
+ subprocess.run(
34
+ "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb -O chrome.deb",
35
+ shell=True,
36
+ check=True
37
+ )
38
+ subprocess.run(
39
+ "apt-get update && apt-get install -y ./chrome.deb",
40
+ shell=True,
41
+ check=True
42
+ )
43
+ os.remove("chrome.deb")
44
+ if not shutil.which("google-chrome"):
45
+ raise ValueError("Failed to install Google Chrome.")
46
+ return chrome_path
47
+
48
def install_chromedriver(self) -> str:
    """
    Ensure ChromeDriver is installed and return its path.

    If ``/usr/local/bin/chromedriver`` already exists it is returned
    unchanged. Otherwise the pinned 114.0.5735.90 Linux archive is
    downloaded, unpacked into ``/usr/local/bin`` and marked executable.

    NOTE(review): the driver version is pinned while ``install_chrome``
    fetches the current stable Chrome; the two may not match -- confirm
    the intended Chrome/ChromeDriver pairing.

    Returns:
        The path ``/usr/local/bin/chromedriver``.

    Raises:
        subprocess.CalledProcessError: If wget, unzip or chmod fails.
        ValueError: If the driver binary is missing after installation.
    """
    chromedriver_path = "/usr/local/bin/chromedriver"
    if os.path.isfile(chromedriver_path):
        return chromedriver_path

    print("Installing ChromeDriver...")
    archive = "chromedriver.zip"
    try:
        # Argument lists (no shell=True) avoid shell parsing of the commands.
        subprocess.run(
            [
                "wget",
                "https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip",
                "-O",
                archive,
            ],
            check=True,
        )
        # -o overwrites without prompting, so unzip cannot block on stdin
        # when a file from the archive is already present.
        subprocess.run(["unzip", "-o", archive, "-d", "/usr/local/bin"], check=True)
    finally:
        # Remove the archive even when the download or extraction failed.
        if os.path.exists(archive):
            os.remove(archive)

    if not os.path.isfile(chromedriver_path):
        raise ValueError("Failed to install ChromeDriver.")
    subprocess.run(["chmod", "+x", chromedriver_path], check=True)
    return chromedriver_path
66
 
67
  def download(self, patent: Union[str, List[str]], output_path: str = "./") -> None:
68
  """
 
75
 
76
  def get_pdf(self, patent: str, output_path: str = "./") -> None:
77
  """
78
+ Download a single patent PDF using the citation_pdf_url meta tag.
79
  """
 
80
  chrome_options = Options()
81
+ chrome_options.binary_location = self.chrome_path
82
  chrome_options.add_argument("--headless")
83
  chrome_options.add_argument("--no-sandbox")
84
  chrome_options.add_argument("--disable-dev-shm-usage")
85
+
86
+ service = Service(self.chromedriver_path)
87
+ driver = webdriver.Chrome(service=service, options=chrome_options)
88
 
89
  try:
90
+ print(f"Accessing Google Patents for {patent}...")
91
+ driver.get(f"{self.url}/patent/{patent}/en")
92
+ time.sleep(3) # Wait for page load
 
93
 
 
94
  soup = BeautifulSoup(driver.page_source, "html.parser")
 
 
95
  meta_tag = soup.find("meta", {"name": "citation_pdf_url"})
96
  if not meta_tag:
97
+ raise FileNotFoundError(f"No PDF link found for patent: {patent}")
98
 
99
  pdf_url = meta_tag["content"]
100
+ print(f"PDF link found: {pdf_url}")
101
 
 
102
  file_path = os.path.join(output_path, f"{patent}.pdf")
103
  self.download_pdf(pdf_url, file_path)
104
+ print(f"Patent PDF saved to {file_path}")
105
 
106
  except Exception as e:
107
+ print(f"Error: {e}")
108
  raise FileNotFoundError(f"Failed to process patent: {patent}")
109
  finally:
110
  driver.quit()
111
 
112
@staticmethod
def download_pdf(pdf_url: str, file_path: str, timeout: float = 30.0) -> None:
    """
    Download the PDF file from the given URL.

    The response is streamed to ``file_path`` in 8 KiB chunks so the whole
    document is never held in memory.

    Args:
        pdf_url: URL of the PDF to fetch.
        file_path: Destination path; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    print("Downloading PDF...")
    # The context manager releases the connection even if writing fails.
    with requests.get(pdf_url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    file.write(chunk)
    print(f"PDF successfully downloaded: {file_path}")
124
+
125
  def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./") -> None:
126
  """
127
+ Download multiple patents as PDFs.
128
  """
129
  if isinstance(patents, str):
130
  if patents.endswith('.csv'):
 
133
  with open(patents, 'r') as file:
134
  patents = file.read().splitlines()
135
  else:
136
+ raise ValueError("Unsupported file format. Use CSV or TXT.")
137
 
138
+ for patent in patents:
 
139
  self.get_pdf(patent, output_path)
140
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  def validate_directory(directory: str) -> None:
143
  """
 
145
  """
146
  if not os.path.exists(directory):
147
  os.makedirs(directory)
148
+
149
+
150
if __name__ == "__main__":
    # Script entry point: make sure the target folder exists, then fetch one
    # example patent into it.
    target_dir = "./downloads"
    validate_directory(target_dir)
    PatentDownloader().download(patent="US8676427B1", output_path=target_dir)