#!/usr/bin/env python3
"""
AI News Summarizer

A script to fetch, summarize, and create reports on recent AI news articles based on a specified topic.
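
Example usage (a minimal sketch; the filename 'news_summarizer.py' is illustrative,
and the NEWS_API_KEY and HF_API_KEY environment variables must be set):

    NEWS_API_KEY=... HF_API_KEY=... python news_summarizer.py \
        --topic "Language Models" --num-articles 20 --provider fireworks-ai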
"""

import argparse
from huggingface_hub import HfApi, InferenceClient
from newspaper import Article
import pandas as pd
import requests
from datetime import date, timedelta
import json
import os
from tqdm.auto import tqdm

def parse_arguments():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='AI News Summarizer')
    parser.add_argument('--topic', type=str, default="Language Models",
                        help='Topic to search for news articles (default: "Language Models")')
    parser.add_argument('--num-articles', type=int, default=50,
                        help='Number of articles to fetch (default: 50)')
    parser.add_argument('--provider', type=str, default="fireworks-ai",
                        help='Inference provider for HuggingFace (default: "fireworks-ai")')
    parser.add_argument('--repo-id', type=str, default="lvwerra/news-reports",
                        help='HuggingFace repo ID to upload the report (default: "lvwerra/news-reports")')
    
    args = parser.parse_args()
    return args

def main():
    # Parse arguments
    args = parse_arguments()
    
    # Environment variables
    NEWS_API_KEY = os.getenv("NEWS_API_KEY")
    HF_API_KEY = os.getenv("HF_API_KEY")
    NEWS_ENDPOINT = 'https://newsapi.org/v2/everything'
    MODEL = "Qwen/Qwen3-30B-A3B"
    
    # Initialize clients
    client = InferenceClient(provider=args.provider, api_key=HF_API_KEY)
    
    # Set topic and number of articles
    topic = args.topic
    num = args.num_articles
    
    # Configure tqdm for pandas
    tqdm.pandas(desc="")
    
    print(f"Fetching top {num} articles on '{topic}' of today...")
    articles = fetch_news_articles(topic, num)
    df = pd.DataFrame.from_records(articles)
    
    print(f"Downloading and parsing {len(df)} articles...")
    df["content_full"] = df["url"].progress_apply(fetch_full_article)
    mask = df['content_full'].str.contains("Failed to fetch article.", regex=False)
    df.loc[mask, 'content_full'] = df.loc[mask, 'content']
    
    print(f"Summarizing each article (total={len(df)})...")
    df["summary_raw"] = df["content_full"].progress_apply(lambda x: summarize(x, client, MODEL))
    df["summary_clean"] = df["summary_raw"].apply(lambda x: x.split("</think>")[1].strip() if "</think>" in x else x.strip())
    
    print(f"Create report...")
    df["article_summary"] = df.apply(format_summary, axis=1)
    
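    # Join the per-article summaries into a single overview, separated by horizontal rules.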
    sep = "\n\n" + "-"*80 + "\n\n"
    overview = sep.join([f"Article {i+1}:\n{article}" for i, article in enumerate(df["article_summary"])])
    report = create_report(overview, client, MODEL, topic)
    
    # Extract report content
    final_report = postprocess_report(report, overview, topic, num, MODEL)

    file_path = f"reports/{'-'.join(topic.lower().split())}/{date.today().strftime('%Y-%m-%d')}.md"
    print(f"Uploading to {args.repo_id} under {file_path}...")
    # Upload to HuggingFace
    hf_api = HfApi()
    hf_api.upload_file(
        path_or_fileobj=final_report.encode("utf-8"),
        path_in_repo=file_path,
        repo_id=args.repo_id,
        repo_type="space",
        token=HF_API_KEY,
    )

    print("Job finished!")

def fetch_news_articles(topic, num_articles=10):
    """Fetch news articles on the given topic"""
    NEWS_API_KEY = os.getenv("NEWS_API_KEY")
    NEWS_ENDPOINT = 'https://newsapi.org/v2/everything'
    
    today = date.today().strftime('%Y-%m-%d')
    yesterday = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    
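    # Query NewsAPI for English articles on the topic published between yesterday and today, sorted by popularity.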
    params = {
        'q': topic,
        'from': yesterday,
        'to': today,
        'sortBy': 'popularity',
        'language': 'en',
        'pageSize': num_articles,
        'apiKey': NEWS_API_KEY
    }

    response = requests.get(NEWS_ENDPOINT, params=params)
    
    if response.status_code == 200:
        data = response.json()
        return data['articles']
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return []

def fetch_full_article(url):
    """Fetch and parse the full content of an article"""
    try:
        a = Article(url)
        a.download()
        a.parse()
        return a.text
    except Exception:
        return "Failed to fetch article."
    
def summarize(article, client, model):
    """Summarize an article using the HuggingFace inference API"""
    user_msg = f"""\
Summarize the following news article in a few bullet points. \
Note that the reader is an expert in the field and wants only the most relevant and novel information and likes to know the specific details. \
So keep the summary brief but don't omit technical terms or specific information.

Article:
{article}

/no_think"""

    messages=[
            {
                "role": "user",
                "content": user_msg,
            }
        ]
    
    response = client.chat_completion(
            model=model,
            messages=messages,
            temperature=0.8,
            max_tokens=512,
        )

    return response.choices[0].message.content

def format_summary(row):
    """Format article summary"""
    summary = f"""\
Title: **{row['title']}**

Published: {row['publishedAt']}

Description: {row['description']}

URL: {row['url']}

Summary:\n{row['summary_clean']}"""
    return summary

def create_report(articles_overview, client, model, topic):
    """Create a comprehensive report from all article summaries"""
    user_msg = f"""\
News articles:
{articles_overview}

===========================

Create a summary report of the newspaper articles above. Ignore everything that's not related to the topic '{topic}'.

Separate the report into categories. For example, for AI the categories could be:
- Breaking news: anything that can also appear below but is the most important news of the day
- Model news (e.g. new model releases, or insights about existing models etc.)
- Startups (e.g. new startups, fundraising etc.)
- Big Tech news (e.g. news from Google/Meta/OpenAI etc.)
- Policy (e.g. US administration or EU policy)
- Products (e.g. news of products that are powered by AI in some way)
- Miscellaneous (whatever doesn't fit into the others but still relevant to the topic)

For other topics come up with other categories that make sense.

Style: The reader is an expert in the field and wants only the most relevant and novel information. \
Omit articles that are irrelevant to the topic '{topic}' and feel free to aggregate several articles covering the same story into one point. \

Format: Use markdown formatting and add links at the end of each section linking to the original articles.

Example snippet:

# NEWS_SUMMARY

---

## **Breaking News**  
- **Google and Apple in talks to integrate Gemini AI into Apple Intelligence by mid-2025** _[Apple Insider](https://appleinsider.com/articles/25/04/30/google-wants-gemini-ai-deal-with-apple-by-mid-2025), [The Verge](https://www.theverge.com/news/658770/google-gemini-apple-iphone-deal-ai)_  
  - Google’s Gemini AI could enhance Siri with advanced reasoning and contextual capabilities, though Apple’s strict privacy controls may limit deep system access.  
  - A potential deal could accelerate Apple’s AI development and expand Google’s AI reach.  
- **Apple Vision Pro launch delayed** _[Six Colors](https://sixcolors.com/post/2025/04/apple-in-the-enterprise-the-complete-2025-commentary/)_
  - Apple’s mixed-reality headset, featuring advanced AI integration, is expected to arrive in 2025, though specifics remain unclear.  

---

... followed by the other sections.

"""

    messages=[
        {
            "role": "user",
            "content": user_msg,
        }
    ]

    response = client.chat_completion(
        model=model,
        messages=messages,
        temperature=0.8,
        max_tokens=32000,
    )
    
    return response.choices[0].message.content

def postprocess_report(report, summaries, topic, num_articles, model):
    report_summary = f"""\
# News Summary: {topic}  

**Period:** {(date.today() - timedelta(days=1)).strftime('%Y-%m-%d')} to {date.today().strftime('%Y-%m-%d')} | \
**Processed articles:** {num_articles} | **Model**: {model}
"""

    report_content = report.split("</think>")[1].strip() if "</think>" in report else report.strip()
    report_thoughts = report.split("</think>")[0].strip() if "</think>" in report else "No thoughts."
    report_thoughts = report_thoughts.replace("<think>", "").strip()

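    # The prompt asks the model to title the report '# NEWS_SUMMARY'; replace that placeholder with the real header.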
    final_report = f"""\
{report_content.replace('# NEWS_SUMMARY', report_summary)}

## Details

## Sources

{summaries}

## Model reasoning

{report_thoughts}

"""

    return final_report

    
if __name__ == "__main__":
    main()