Spaces:
Sleeping
Sleeping
File size: 4,647 Bytes
877c973 5bd4136 877c973 5bd4136 877c973 5bd4136 877c973 5bd4136 877c973 5bd4136 877c973 8c89f83 877c973 8c89f83 877c973 5bd4136 877c973 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import datetime
import json
import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from bs4.element import Tag
class App:
    """Gradio front end for the Yuyu Tei crawler.

    Builds a two-column page: on the left a URL box, a submit button and a
    tab of bookmark buttons; on the right the resulting file and a status box.
    """

    def __init__(self) -> None:
        """Load the bookmark and intro files, then build the Gradio page."""
        # Mapping of bookmark name -> URL; each key becomes a button label.
        with open("bookmarks.json", "rt", encoding="UTF-8") as fp:
            self.bookmark_info = json.load(fp)

        with open("Intro.md", "rt", encoding="UTF-8") as fp:
            intro = fp.read()

        theme = gr.themes.Soft()
        with gr.Blocks(title="Yuyu Tei Crawler", theme=theme) as self.demo:
            gr.Markdown(intro)
            with gr.Row(equal_height=False):
                self.__CreateColumns__()
            # Listeners are attached while the Blocks context is still open.
            self.__RegisterEvents__()

    def __CreateColumns__(self):
        """Create the input column (URL, submit, bookmarks) and the output column."""
        with gr.Column():
            ph = "e.g. https://yuyu-tei.jp/game_ws/sell/sell_price.php?ver=uma&menu=newest"
            self.input_url = gr.Textbox(label="URL", placeholder=ph)
            self.submit_btn = gr.Button("Submit")
            self.__CreateBookmarks__()

        with gr.Column():
            self.output_file = gr.File(label="Result", file_count="single")
            self.status = gr.Textbox("Ready", label="Status", interactive=False)

    def __CreateBookmarks__(self):
        """Create one button per bookmark name inside a "Bookmarks" tab."""
        with gr.Tab("Bookmarks"):
            with gr.Row():
                self.bookmarks = {
                    name: gr.Button(value=name) for name in self.bookmark_info
                }

    def __RegisterEvents__(self):
        """Wire the submit actions and the bookmark buttons to their handlers."""
        # Clicking the button and pressing Enter in the URL box do the same thing.
        args_submit = KwargsToDict(
            fn=self.Download,
            inputs=self.input_url,
            outputs=[self.output_file, self.status],
        )
        self.submit_btn.click(**args_submit)
        self.input_url.submit(**args_submit)

        # Each bookmark button passes its own value (the bookmark name) to
        # ClickBookmark, whose result is written into the URL box.
        for button in self.bookmarks.values():
            button.click(
                fn=self.ClickBookmark,
                inputs=button,
                outputs=self.input_url,
                show_progress=False,
            )

    def ClickBookmark(self, name):
        """Return the URL stored for the bookmark called ``name``.

        Raises:
            KeyError: If ``name`` is not a key of ``self.bookmark_info``.
        """
        return self.bookmark_info[name]

    def Download(self, url):
        """Crawl ``url`` into a timestamped ``.xlsx`` file.

        Args:
            url: Address of the page to crawl, as typed by the user.

        Returns:
            ``(output_path, "Success")`` when the crawl finishes, otherwise
            ``(None, "Error: <message>")``. Errors are reported through the
            status text instead of being raised, so the UI stays responsive.
        """
        # The file name is the current time at a fixed UTC+8 offset. An aware
        # ``now()`` replaces ``utcnow()``, which is deprecated since Python 3.12.
        utc_plus_8 = datetime.timezone(datetime.timedelta(hours=8))
        ts = datetime.datetime.now(utc_plus_8)
        output_path = ts.strftime("%Y%m%d_%H%M%S.xlsx")

        # NOTE(review): ``url`` is user input and is fetched server-side as-is;
        # consider restricting it to the expected host if this is public.
        try:
            CrawlPage(url, output_path)
        except Exception as e:  # UI boundary: surface any failure as a status
            return None, f"Error: {e}"
        return output_path, "Success"

    def Launch(self):
        """Start the Gradio server, using ``icon.png`` as the favicon."""
        self.demo.launch(favicon_path="icon.png")
def CrawlPage(url, output_path, timeout=30):
    """Scrape every ``li.card_unit`` element on a page into an Excel file.

    Args:
        url: Address of the listing page to download.
        output_path: Destination handed to ``DataFrame.to_excel``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status (4xx/5xx).
    """
    print(f"Visiting {url}")
    res = requests.get(url, timeout=timeout)
    print(f"Status: {res.status_code}")
    # Fail loudly on an error page instead of writing an empty spreadsheet.
    res.raise_for_status()

    bs = BS(res.text, features="html.parser")
    elems = bs.find_all("li", attrs={"class": "card_unit"})
    data = [IterElem(e) for e in elems]
    pd.DataFrame(data).to_excel(output_path, index=False)
def IterElem(e: Tag):
    """Extract one card's number, name, price and rarity from its list item.

    Args:
        e: The card's ``li`` element; its second CSS class is expected to be
            of the form ``rarity_<code>``.

    Returns:
        A dict with the keys "卡號" (card number), "卡名" (card name),
        "價格" (price) and "稀有度" (rarity), all as strings.
    """
    # NOTE(review): ``find_next`` searches forward through the whole document,
    # not only inside ``e``; this assumes every card contains all the fields.

    # Card number
    card_id = e.find_next("p", attrs={"class": "id"})
    card_id = card_id.text.strip()

    # Card name 1 - taken from the title, which may be truncated
    card_name_elem = e.find_next("p", attrs={"class": "name"})
    card_name = card_name_elem.text.strip()

    # Card name 2 - taken from the image's alt text, but for some images the
    # alt text is "NowPrinting" rather than the actual card name
    image_elem = e.find_next("p", attrs={"class": "image"})
    card_alt_name = image_elem.find_next("img").get("alt")

    # Card name 3 - taken from the card's detail page. This is slower and too
    # many requests may get the crawler banned as a bot, so it is only done
    # when the alt text is "NowPrinting" and the title is truncated.
    if card_alt_name == "NowPrinting":
        if card_name.endswith("..."):
            detail_href = card_name_elem.find_next("a").get("href")
            card_name = GetCardNameFromPage(detail_href)
    else:
        card_name = card_alt_name

    # Price (drop the yen sign)
    price = e.find_next("p", attrs={"class": "price"})
    price = price.text.strip().strip("円")

    # Rarity: remove the literal "rarity_" prefix. ``str.strip("rarity_")``
    # would instead strip any of those characters from both ends and corrupt
    # codes that start or end with one of them.
    rarity = e.get("class")[1]
    prefix = "rarity_"
    if rarity.startswith(prefix):
        rarity = rarity[len(prefix):]

    return {"卡號": card_id, "卡名": card_name, "價格": price, "稀有度": rarity}
def GetCardNameFromPage(url, timeout=30):
    """Download a card's detail page and return the full card name.

    Args:
        url: Link to the detail page, normally a site-relative path such as
            an ``href`` value; it is resolved against ``https://yuyu-tei.jp``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped text of the first ``td`` inside ``div.information_box``.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the page does not contain the expected elements.
    """
    from urllib.parse import urljoin

    # urljoin also copes with absolute links and paths lacking a leading "/".
    url = urljoin("https://yuyu-tei.jp", url)
    print(f"Visiting {url}")
    res = requests.get(url, timeout=timeout)
    print(f"Status: {res.status_code}")
    res.raise_for_status()

    bs = BS(res.text, features="html.parser")
    info_box = bs.find("div", attrs={"class": "information_box"})
    card_name = info_box.find("td") if info_box is not None else None
    if card_name is None:
        raise ValueError(f"Card name not found on {url}")
    return card_name.text.strip()
def KwargsToDict(**kwargs):
    """Collect the given keyword arguments into a dict, keeping their order.

    Lets one set of arguments be written once in call syntax and then
    unpacked with ``**`` into several calls.
    """
    return {key: value for key, value in kwargs.items()}
# Script entry point: build the interface and start serving it.
if __name__ == "__main__":
    app = App()
    app.Launch()
|