Spaces:

SonFox2920
/

vn_news_crawler

Sleeping

App Files Files Community

SonFox2920 committed on Feb 20

Commit

79ff5a8

verified ·

1 Parent(s): 44d6c40

Upload 2 files

Browse files

Files changed (2) hide show

app.py +285 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,285 @@

+import streamlit as st
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import datetime
+import csv
+import os
+st.set_page_config(page_title="VnExpress Crawler", page_icon="📰", layout="wide")  # page title, icon and wide layout for the app
+def get_article_details(url):
+    try:
+        response = requests.get(url)
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Lấy tiêu đề
+        title = soup.find('h1', class_='title-detail').text.strip() if soup.find('h1', class_='title-detail') else "Không có tiêu đề"
+        # Lấy mô tả
+        description = soup.find('p', class_='description').text.strip() if soup.find('p', class_='description') else "Không có mô tả"
+        # Lấy thời gian đăng
+        publish_time = ""
+        time_tag = soup.find('span', class_='date')
+        if time_tag:
+            publish_time = time_tag.text.strip()
+        # Lấy nội dung bài viết
+        content_div = soup.find('article', class_='fck_detail')
+        content = ""
+        if content_div:
+            paragraphs = content_div.find_all('p', class_='Normal')
+            content = " ".join([p.text.strip() for p in paragraphs])
+        # Lấy chuyên mục
+        category = ""
+        breadcrumb = soup.find('ul', class_='breadcrumb')
+        if breadcrumb:
+            category_item = breadcrumb.find_all('li')
+            if len(category_item) > 1:
+                category = category_item[1].text.strip()
+        return {
+            'url': url,
+            'title': title,
+            'description': description,
+            'publish_time': publish_time,
+            'content': content,
+            'category': category
+        }
+    except Exception as e:
+        st.error(f"Lỗi khi crawl bài viết {url}: {str(e)}")
+        return None
+def get_articles_from_section(section_url, limit=10):
+    articles = []
+    try:
+        response = requests.get(section_url)
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Tìm các bài viết trong trang
+        article_items = soup.find_all('article', class_='item-news')
+        count = 0
+        for item in article_items:
+            if count >= limit:
+                break
+            title_tag = item.find('h3', class_='title-news')
+            if title_tag and title_tag.a:
+                article_url = title_tag.a['href']
+                if not article_url.startswith('http'):
+                    article_url = 'https://vnexpress.net' + article_url
+                articles.append(article_url)
+                count += 1
+    except Exception as e:
+        st.error(f"Lỗi khi lấy danh sách bài viết: {str(e)}")
+    return articles
+def save_to_csv(data, filename):
+    try:
+        with open(filename, 'w', newline='', encoding='utf-8-sig') as csvfile:
+            if not data:
+                st.error("Không có dữ liệu để lưu")
+                return False
+            fieldnames = data[0].keys()
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(data)
+        return True
+    except Exception as e:
+        st.error(f"Lỗi khi lưu file CSV: {str(e)}")
+        return False
+# Streamlit UI: page heading and a short description of the tool
+st.title("🗞️ Công cụ Crawl dữ liệu VnExpress")
+st.markdown("""
+Ứng dụng này giúp bạn crawl dữ liệu từ VnExpress và lưu thành file CSV.
+""")
+# Tab bar for choosing the crawl mode: by section (tab1) or by explicit URLs (tab2)
+tab1, tab2 = st.tabs(["Crawl theo chuyên mục", "Crawl theo URL"])
+with tab1:  # first tab: crawl one section, or all of them
+    st.header("Crawl theo chuyên mục")
+    # VnExpress sections: display name -> section listing URL
+    sections = {
+        "Thời sự": "https://vnexpress.net/thoi-su",
+        "Thế giới": "https://vnexpress.net/the-gioi",
+        "Kinh doanh": "https://vnexpress.net/kinh-doanh",
+        "Khoa học": "https://vnexpress.net/khoa-hoc",
+        "Giải trí": "https://vnexpress.net/giai-tri",
+        "Thể thao": "https://vnexpress.net/the-thao",
+        "Pháp luật": "https://vnexpress.net/phap-luat",
+        "Giáo dục": "https://vnexpress.net/giao-duc",
+        "Sức khỏe": "https://vnexpress.net/suc-khoe",
+        "Đời sống": "https://vnexpress.net/doi-song",
+        "Du lịch": "https://vnexpress.net/du-lich",
+        "Số hóa": "https://vnexpress.net/so-hoa"
+    }
+    # Put the "Tất cả" (all sections) choice in front of the section names
+    section_options = ["Tất cả"] + list(sections.keys())
+    selected_section = st.selectbox("Chọn chuyên mục:", section_options)
+    num_articles = st.number_input("Số lượng bài viết cần crawl (mỗi chuyên mục nếu chọn Tất cả):", min_value=1, max_value=50, value=5)
+    if st.button("Bắt đầu crawl theo chuyên mục"):
+        article_data = []
+        if selected_section == "Tất cả":
+            # "Tất cả" selected: crawl every section in turn
+            with st.spinner("Đang crawl dữ liệu từ tất cả các chuyên mục..."):
+                total_sections = len(sections)
+                main_progress = st.progress(0)
+                for i, (section_name, section_url) in enumerate(sections.items()):
+                    st.write(f"Đang crawl chuyên mục {section_name} ({i+1}/{total_sections})...")
+                    # Collect up to num_articles article URLs for this section
+                    article_urls = get_articles_from_section(section_url, num_articles)
+                    if article_urls:
+                        section_progress = st.progress(0)
+                        # Fetch each article's details; failed fetches (None) are skipped
+                        for j, url in enumerate(article_urls):
+                            article = get_article_details(url)
+                            if article:
+                                article_data.append(article)
+                            # Advance this section's progress bar
+                            section_progress.progress((j + 1) / len(article_urls))
+                    # Advance the overall progress bar
+                    main_progress.progress((i + 1) / total_sections)
+        else:
+            # A single section was selected
+            with st.spinner(f"Đang crawl dữ liệu từ chuyên mục {selected_section}..."):
+                # Collect up to num_articles article URLs from the selected section
+                section_url = sections[selected_section]
+                article_urls = get_articles_from_section(section_url, num_articles)
+                if article_urls:
+                    progress_bar = st.progress(0)
+                    st.write(f"Đã tìm thấy {len(article_urls)} bài viết. Đang crawl chi tiết...")
+                    # Fetch each article's details; failed fetches (None) are skipped
+                    for i, url in enumerate(article_urls):
+                        article = get_article_details(url)
+                        if article:
+                            article_data.append(article)
+                        # Advance the progress bar
+                        progress_bar.progress((i + 1) / len(article_urls))
+                else:
+                    st.error("Không tìm thấy bài viết nào trong chuyên mục này.")
+        # Save the results to a timestamped CSV file when anything was crawled
+        if article_data:
+            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            filename = f"vnexpress_{selected_section.replace(' ', '_')}_{timestamp}.csv"
+            if save_to_csv(article_data, filename):
+                st.success(f"Đã lưu dữ liệu vào file {filename}")
+                # Show the crawled rows as a table
+                st.subheader("Dữ liệu đã crawl:")
+                df = pd.DataFrame(article_data)
+                st.dataframe(df)
+                # Read the saved file back so it can be offered as a download
+                with open(filename, 'rb') as f:
+                    csv_data = f.read()
+                st.download_button(
+                    label="Tải xuống file CSV",
+                    data=csv_data,
+                    file_name=filename,
+                    mime="text/csv"
+                )
+                # In "Tất cả" mode, also show the article count per scraped 'category' value
+                if selected_section == "Tất cả":
+                    st.subheader("Thống kê theo chuyên mục:")
+                    category_counts = df['category'].value_counts().reset_index()
+                    category_counts.columns = ['Chuyên mục', 'Số lượng bài viết']
+                    st.dataframe(category_counts)
+        else:
+            st.error("Không thể crawl dữ liệu hoặc không tìm thấy bài viết nào.")
+with tab2:  # second tab: crawl an explicit list of article URLs
+    st.header("Crawl theo URL cụ thể")
+    urls_input = st.text_area("Nhập danh sách URL (mỗi URL một dòng):",
+                           placeholder="https://vnexpress.net/article1\nhttps://vnexpress.net/article2")
+    if st.button("Bắt đầu crawl theo URL"):
+        urls = [url.strip() for url in urls_input.split("\n") if url.strip()]  # one URL per line, blank lines dropped
+        if not urls:
+            st.error("Vui lòng nhập ít nhất một URL")
+        else:
+            with st.spinner(f"Đang crawl {len(urls)} bài viết..."):
+                progress_bar = st.progress(0)
+                article_data = []
+                for i, url in enumerate(urls):
+                    article = get_article_details(url)
+                    if article:
+                        article_data.append(article)
+                    # Advance the progress bar
+                    progress_bar.progress((i + 1) / len(urls))
+                # Save the results to a timestamped CSV file
+                timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+                filename = f"vnexpress_custom_{timestamp}.csv"
+                if article_data and save_to_csv(article_data, filename):
+                    st.success(f"Đã lưu dữ liệu vào file {filename}")
+                    # Show the crawled rows as a table
+                    st.subheader("Dữ liệu đã crawl:")
+                    df = pd.DataFrame(article_data)
+                    st.dataframe(df)
+                    # Read the saved file back so it can be offered as a download
+                    with open(filename, 'rb') as f:
+                        csv_data = f.read()
+                    st.download_button(
+                        label="Tải xuống file CSV",
+                        data=csv_data,
+                        file_name=filename,
+                        mime="text/csv"
+                    )
+                else:
+                    st.error("Không thể crawl dữ liệu từ các URL đã cung cấp.")
+# Collapsible usage guide shown below the tabs
+with st.expander("Hướng dẫn sử dụng"):
+    st.markdown("""
+    ### Cách sử dụng:
+    1. **Crawl theo chuyên mục**:
+       - Chọn chuyên mục từ danh sách (hoặc chọn "Tất cả" để crawl tất cả chuyên mục)
+       - Nhập số lượng bài viết cần crawl (nếu chọn "Tất cả", đây là số bài viết cho mỗi chuyên mục)
+       - Nhấn "Bắt đầu crawl theo chuyên mục"
+    2. **Crawl theo URL cụ thể**:
+       - Nhập danh sách URL (mỗi URL một dòng)
+       - Nhấn "Bắt đầu crawl theo URL"
+    3. **Tải xuống dữ liệu**:
+       - Sau khi crawl hoàn tất, nhấn nút "Tải xuống file CSV"
+    ### Lưu ý:
+    - File CSV được lưu với encoding UTF-8-sig để hỗ trợ tiếng Việt
+    - Tên file bao gồm timestamp để tránh trùng lặp
+    - Khi chọn "Tất cả" chuyên mục, quá trình crawl có thể mất nhiều thời gian hơn
+    """)

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+beautifulsoup4
+requests
+streamlit