# ai-news / run.py
# (Hugging Face Space file; web-UI residue removed so the module parses.)
#!/usr/bin/env python3
"""
AI News Summarizer
A script to fetch, summarize, and create reports on recent AI news articles based on a specified topic.
"""
import argparse
from huggingface_hub import HfApi, InferenceClient
from newspaper import Article
import pandas as pd
import requests
from datetime import date, timedelta
import json
import os
from tqdm.auto import tqdm
def parse_arguments():
"""Parse command line arguments"""
parser = argparse.ArgumentParser(description='AI News Summarizer')
parser.add_argument('--topic', type=str, default="Language Models",
help='Topic to search for news articles (default: "Language Models")')
parser.add_argument('--num-articles', type=int, default=50,
help='Number of articles to fetch (default: 50)')
parser.add_argument('--provider', type=str, default="fireworks-ai",
help='Inference provider for HuggingFace (default: "fireworks-ai")')
parser.add_argument('--repo-id', type=str, default="lvwerra/news-reports",
help='HuggingFace repo ID to upload the report (default: "lvwerra/news-reports")')
args = parser.parse_args()
return args
def main():
    """Run the full pipeline: fetch -> parse -> summarize -> report -> upload."""
    args = parse_arguments()

    # Credentials come from the environment so they never land in the repo.
    # (NEWS_API_KEY is read inside fetch_news_articles, not here.)
    HF_API_KEY = os.getenv("HF_API_KEY")
    MODEL = "Qwen/Qwen3-30B-A3B"

    client = InferenceClient(provider=args.provider, api_key=HF_API_KEY)

    topic = args.topic
    num = args.num_articles

    # Configure tqdm so .progress_apply is available on pandas objects.
    tqdm.pandas(desc="")

    print(f"Fetching top {num} articles on '{topic}' of today...")
    articles = fetch_news_articles(topic, num)
    if not articles:
        # API error or no coverage: bail out early instead of raising a
        # KeyError on the empty DataFrame below.
        print("No articles found, aborting.")
        return

    df = pd.DataFrame.from_records(articles)

    print(f"Downloading and parsing {len(df)} articles...")
    df["content_full"] = df["url"].progress_apply(fetch_full_article)
    # Fall back to the short API-provided content where the full download
    # failed. NOTE: the sentinel (typo included) must match the literal
    # returned by fetch_full_article; regex=False makes the match literal.
    mask = df['content_full'].str.contains("Failed to fetch artcile.", regex=False)
    df.loc[mask, 'content_full'] = df.loc[mask, 'content']

    print(f"Summarizing each article (total={len(df)})...")
    df["summary_raw"] = df["content_full"].progress_apply(lambda x: summarize(x, client, MODEL))
    # Strip any <think>...</think> reasoning prefix emitted by the model.
    df["summary_clean"] = df["summary_raw"].apply(
        lambda x: x.split("</think>")[1].strip() if "</think>" in x else x.strip()
    )

    print("Create report...")
    df["article_summary"] = df.apply(format_summary, axis=1)
    sep = "\n\n" + "-" * 80 + "\n\n"
    overview = sep.join(f"Article: {i+1}\n{article}" for i, article in enumerate(df["article_summary"]))
    report = create_report(overview, client, MODEL, topic)

    # Split reasoning from content and assemble the final markdown document.
    final_report = postprocess_report(report, overview, topic, num, MODEL)

    file_path = f"reports/{'-'.join(topic.lower().split())}/{date.today().strftime('%Y-%m-%d')}.md"
    print(f"Uploading to {args.repo_id} under {file_path}...")
    hf_api = HfApi()
    hf_api.upload_file(
        path_or_fileobj=final_report.encode("utf-8"),
        path_in_repo=file_path,
        repo_id=args.repo_id,
        repo_type="space",
        token=HF_API_KEY,
    )
    print("Job finished!")
def fetch_news_articles(topic, num_articles=10):
    """Fetch up to ``num_articles`` English articles about ``topic``.

    Queries NewsAPI's /v2/everything endpoint for roughly the last 24
    hours, sorted by popularity.

    Args:
        topic: Free-text query string.
        num_articles: Page size requested from the API (default 10).

    Returns:
        The raw list of article dicts from the API, or an empty list on
        any HTTP or network error (errors are printed, never raised).
    """
    news_api_key = os.getenv("NEWS_API_KEY")
    endpoint = 'https://newsapi.org/v2/everything'
    today = date.today().strftime('%Y-%m-%d')
    yesterday = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    params = {
        'q': topic,
        'from': yesterday,
        'to': today,
        'sortBy': 'popularity',
        'language': 'en',
        'pageSize': num_articles,
        'apiKey': news_api_key
    }
    try:
        # Explicit timeout: requests has none by default and would hang forever.
        response = requests.get(endpoint, params=params, timeout=30)
    except requests.RequestException as err:
        # DNS/connection/timeout failures keep the best-effort contract.
        print(f"Error: {err}")
        return []
    if response.status_code == 200:
        return response.json()['articles']
    print(f"Error: {response.status_code}")
    print(response.text)
    return []
def fetch_full_article(url):
    """Download and parse ``url`` with newspaper, returning the article body text.

    Returns the fixed sentinel string on any failure; main() string-matches
    this exact literal to fall back to the short API content, so the string
    (typo included) must not change here without changing it there too.
    """
    try:
        a = Article(url)
        a.download()
        a.parse()
        return a.text
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        return "Failed to fetch artcile."
def summarize(article, client, model):
    """Ask ``model`` (through ``client``) for a short expert-level bullet summary.

    Args:
        article: Full article text to condense.
        client: InferenceClient-like object exposing ``chat_completion``.
        model: Model identifier forwarded to the API.

    Returns:
        The raw assistant message content (may contain <think> tags).
    """
    prompt = f"""\
Summarize the following news article in a few bullet points. \
Note that the reader is an expert in the field and wants only the most relevant and novel information and likes to know the specific details. \
So keep the summary brief but don't omit technical terms or specific information.
Article:
{article}
/no_think"""
    completion = client.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
        max_tokens=512,
    )
    return completion.choices[0].message.content
def format_summary(row):
    """Render one article row (dict-like) as a small markdown block.

    Reads ``title``, ``publishedAt``, ``description``, ``url`` and
    ``summary_clean`` from *row* and joins them into a fixed layout.
    """
    parts = [
        f"Title: **{row['title']}**",
        f"Published: {row['publishedAt']}",
        f"Description: {row['description']}",
        f"URL: {row['url']}",
        f"Summary:\n{row['summary_clean']}",
    ]
    return "\n".join(parts)
def create_report(articles_overview, client, model, topic):
    """Turn the joined per-article summaries into one categorized markdown report.

    Args:
        articles_overview: All formatted article summaries, separator-joined.
        client: InferenceClient-like object exposing ``chat_completion``.
        model: Model identifier forwarded to the API.
        topic: Topic string the report should focus on.

    Returns:
        The raw assistant message content (may contain <think> tags).
    """
    # NOTE: the prompt below is reproduced verbatim (typos and all) since it
    # is runtime text sent to the model.
    prompt = f"""\
News articles:\
{articles_overview}
===========================
Create a summary report of the newspaper articles above. Ignore everything that's not releated to the topic '{topic}'
Separete the report into categories, for example for AI it could be catogories like:
- Breaking news: anything that can also appear below but is the most important news of the day
- Model news (e.g. new model releases, or insights about existing models etc.)
- Startups (e.g. new startups, fundraising etc.)
- Big Tech news (e.g. news from Google/Meta/OpenAI etc.)
- Policy (e.g. US administration or EU policy)
- Products (e.g. news of products that are powered by AI in some way)
- Miscellaneous (whatever doesn't fit into the others but still relevant to the topic)
For other topics come up with other categories that make sense.
Style: The reader is an expert in the field and wants only the most relevant and novel information. \
Omit articles that are irrelevant to the field of AI and feel free to aggregate several articles about the same topic into one point. \
Format: Use markdown formatting and add links at the end of each section linking to the original articles.
Example snippet:
# NEWS_SUMMARY
---
## **Breaking News**
- **Google and Apple in talks to integrate Gemini AI into Apple Intelligence by mid-2025** _[Apple Insider](https://appleinsider.com/articles/25/04/30/google-wants-gemini-ai-deal-with-apple-by-mid-2025), [The Verge](https://www.theverge.com/news/658770/google-gemini-apple-iphone-deal-ai)_
- Google’s Gemini AI could enhance Siri with advanced reasoning and contextual capabilities, though Apple’s strict privacy controls may limit deep system access.
- A potential deal could accelerate Apple’s AI development and expand Google’s AI reach.
- **Apple Vision Pro launch delayed** _[Six Colors](https://sixcolors.com/post/2025/04/apple-in-the-enterprise-the-complete-2025-commentary/)_
- Apple’s mixed-reality headset, featuring advanced AI integration, is expected to arrive in 2025, though specifics remain unclear.
---
... followed by the other sections.
"""
    completion = client.chat_completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.8,
        max_tokens=32000,
    )
    return completion.choices[0].message.content
def postprocess_report(report, summaries, topic, num_articles, model):
    """Assemble the final markdown document from the model's raw report.

    Splits the model output into reasoning (<think>...</think>) and the
    report proper, substitutes the '# NEWS_SUMMARY' placeholder with a
    header block, and appends the per-article sources and the model's
    reasoning at the end.

    Args:
        report: Raw model output, possibly prefixed with <think>...</think>.
        summaries: Separator-joined per-article summaries (Sources section).
        topic: Topic string shown in the header.
        num_articles: Number of processed articles shown in the header.
        model: Model identifier shown in the header.

    Returns:
        The complete markdown report as a string.
    """
    report_summary = f"""\
# News Summary: {topic}
**Period:** {(date.today() - timedelta(days=1)).strftime('%Y-%m-%d')} to {date.today().strftime('%Y-%m-%d')} | \
**Processed articles:** {num_articles} | **Model**: {model}
"""
    if "</think>" in report:
        parts = report.split("</think>")
        # Bug fix: str.replace returns a new string; the original code
        # discarded the result, leaving the literal "<think>" in the output.
        report_thoughts = parts[0].replace("<think>", "").strip()
        report_content = parts[1].strip()
    else:
        report_thoughts = "No thoughts."
        report_content = report.strip()
    final_report = f"""\
{report_content.replace('# NEWS_SUMMARY', report_summary)}
## Details
## Sources
{summaries}
## Model reasoning
{report_thoughts}
"""
    return final_report
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()