taeyeol committed on
Commit
ed67663
·
verified ·
1 Parent(s): 4d71151

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -12
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import requests
3
  from bs4 import BeautifulSoup
 
4
 
5
  # 디버깅(로그)용 함수
6
  def debug_log(message: str):
@@ -27,26 +28,60 @@ def scrape_naver_blog(url: str) -> str:
27
  }
28
 
29
  try:
 
30
  response = requests.get(url, headers=headers)
31
- debug_log("HTTP GET ์š”์ฒญ ์™„๋ฃŒ")
32
 
33
  # 응답 상태코드 확인
34
  if response.status_code != 200:
35
  debug_log(f"์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {response.status_code}")
36
  return f"์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {response.status_code}"
37
 
38
- # BeautifulSoup 파싱
39
  soup = BeautifulSoup(response.text, "html.parser")
40
- debug_log("HTML ํŒŒ์‹ฑ ์™„๋ฃŒ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  # 제목 추출
43
- title_div = soup.select_one('.se-module.se-module-text.se-title-text')
44
  title = title_div.get_text(strip=True) if title_div else "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
45
  debug_log(f"์ถ”์ถœ๋œ ์ œ๋ชฉ: {title}")
46
 
47
  # 본문 추출
48
- content_div = soup.select_one('.se-main-container')
49
- content = content_div.get_text("\n", strip=True) if content_div else "๋ณธ๋ฌธ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
 
 
 
 
50
  debug_log("๋ณธ๋ฌธ ์ถ”์ถœ ์™„๋ฃŒ")
51
 
52
  # 결과 합치기
@@ -62,16 +97,14 @@ def scrape_naver_blog(url: str) -> str:
62
 
63
  # Gradio 인터페이스
64
  def main_interface():
65
- # 입력: 네이버 블로그 링크
66
- # 출력: 제목 + 본문 내용
67
  interface = gr.Interface(
68
  fn=scrape_naver_blog,
69
- inputs=gr.inputs.Textbox(
70
  lines=1,
71
  label="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ",
72
  placeholder="์˜ˆ: https://blog.naver.com/ssboost/222983068507"
73
  ),
74
- outputs=gr.outputs.Textbox(label="๊ฒฐ๊ณผ"),
75
  title="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ์Šคํฌ๋ž˜ํผ",
76
  description="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด ์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜์—ฌ ํ‘œ์‹œํ•ฉ๋‹ˆ๋‹ค."
77
  )
@@ -81,5 +114,4 @@ if __name__ == "__main__":
81
  debug_log("Gradio ์•ฑ ์‹คํ–‰ ์‹œ์ž‘")
82
  demo = main_interface()
83
  demo.launch()
84
- debug_log("Gradio ์•ฑ ์‹คํ–‰ ์ข…๋ฃŒ")
85
-
 
1
  import gradio as gr
2
  import requests
3
  from bs4 import BeautifulSoup
4
+ import urllib.parse # iframe 경로가 상대경로일 경우 절대경로로 만들기 위해 사용
5
 
6
  # 디버깅(로그)용 함수
7
  def debug_log(message: str):
 
28
  }
29
 
30
  try:
31
+ # 1) 네이버 블로그 '메인' 페이지 요청
32
  response = requests.get(url, headers=headers)
33
+ debug_log("HTTP GET ์š”์ฒญ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")
34
 
35
  # 응답 상태코드 확인
36
  if response.status_code != 200:
37
  debug_log(f"์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {response.status_code}")
38
  return f"์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {response.status_code}"
39
 
40
+ # BeautifulSoup 파싱 (메인 페이지)
41
  soup = BeautifulSoup(response.text, "html.parser")
42
+ debug_log("HTML ํŒŒ์‹ฑ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")
43
+
44
+ # 2) iframe 태그 찾기
45
+ iframe = soup.select_one("iframe#mainFrame")
46
+ if not iframe:
47
+ # iframe 자체를 찾지 못한 경우
48
+ debug_log("iframe#mainFrame ํƒœ๊ทธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
49
+ return "๋ณธ๋ฌธ iframe์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
50
+
51
+ iframe_src = iframe.get("src")
52
+ if not iframe_src:
53
+ debug_log("iframe src๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
54
+ return "๋ณธ๋ฌธ iframe์˜ src๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
55
+
56
+ # 3) iframe src๊ฐ€ ์ƒ๋Œ€๊ฒฝ๋กœ์ธ ๊ฒฝ์šฐ ์ ˆ๋Œ€๊ฒฝ๋กœ๋กœ ๋ณด์ •
57
+ # (예: //blog.naver.com/~~~ 와 같은 경우를 처리)
58
+ parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
59
+
60
+ # iframe 페이지로 재요청
61
+ debug_log(f"iframe ํŽ˜์ด์ง€ ์š”์ฒญ URL: {parsed_iframe_url}")
62
+ iframe_response = requests.get(parsed_iframe_url, headers=headers)
63
+ debug_log("HTTP GET ์š”์ฒญ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")
64
+
65
+ if iframe_response.status_code != 200:
66
+ debug_log(f"iframe ์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}")
67
+ return f"iframe์—์„œ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}"
68
+
69
+ # 4) iframe 페이지 파싱
70
+ iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
71
+ debug_log("HTML ํŒŒ์‹ฑ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")
72
 
73
  # 제목 추출
74
+ title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
75
  title = title_div.get_text(strip=True) if title_div else "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
76
  debug_log(f"์ถ”์ถœ๋œ ์ œ๋ชฉ: {title}")
77
 
78
  # 본문 추출
79
+ content_div = iframe_soup.select_one('.se-main-container')
80
+ if content_div:
81
+ # 본문을 \n 기준으로 구분해서 좀 더 깔끔하게 만들기
82
+ content = content_div.get_text("\n", strip=True)
83
+ else:
84
+ content = "๋ณธ๋ฌธ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
85
  debug_log("๋ณธ๋ฌธ ์ถ”์ถœ ์™„๋ฃŒ")
86
 
87
  # 결과 합치기
 
97
 
98
  # Gradio 인터페이스
99
  def main_interface():
 
 
100
  interface = gr.Interface(
101
  fn=scrape_naver_blog,
102
+ inputs=gr.Textbox(
103
  lines=1,
104
  label="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ",
105
  placeholder="์˜ˆ: https://blog.naver.com/ssboost/222983068507"
106
  ),
107
+ outputs=gr.Textbox(label="๊ฒฐ๊ณผ"),
108
  title="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ์Šคํฌ๋ž˜ํผ",
109
  description="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด ์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜์—ฌ ํ‘œ์‹œํ•ฉ๋‹ˆ๋‹ค."
110
  )
 
114
  debug_log("Gradio ์•ฑ ์‹คํ–‰ ์‹œ์ž‘")
115
  demo = main_interface()
116
  demo.launch()
117
+ debug_log("Gradio ์•ฑ ์‹คํ–‰ ์ข…๋ฃŒ")