Jofthomas HF Staff committed on
Commit
1e9f877
·
verified ·
1 Parent(s): 5af1bed

Create data.py

Browse files
Files changed (1) hide show
  1. app/data.py +72 -0
app/data.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import pandas as pd
3
+ from datasets import load_dataset
4
+ from fastapi import HTTPException
5
+ import logging
6
+
7
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Hugging Face dataset the leaderboard is built from.
DATASET_NAME = "agents-course/unit4-students-scores"
# Lifetime of a cached leaderboard, in seconds.
CACHE_DURATION_SECONDS = 60

# Simple in-memory cache: the most recently built leaderboard (None until the
# first successful load) and the time.time() value recorded when it was stored.
cached_data, last_cache_time = None, 0
15
+
16
def get_sorted_leaderboard_data():
    """Return the leaderboard rows, sorted and cached in memory.

    The dataset named by ``DATASET_NAME`` is loaded (``train`` split), rows
    whose ``timestamp`` cannot be parsed are dropped, and the remainder is
    ordered by ``score`` descending, then by timestamp ascending. The result
    is kept in the module-level cache and reused for
    ``CACHE_DURATION_SECONDS`` seconds.

    Returns:
        A list of dictionaries, one per row, with the keys ``username``,
        ``score``, ``timestamp`` (the original, unparsed value) and ``code``.
        The returned list is the cached object itself, so callers should
        treat it as read-only.

    Raises:
        HTTPException: With status code 500 if the dataset cannot be loaded,
            lacks a required column, or fails during processing. The
            underlying error is attached as ``__cause__``.
    """
    global cached_data, last_cache_time
    current_time = time.time()

    # Serve from the cache while it is still fresh.
    cache_age = current_time - last_cache_time
    if cached_data is not None and cache_age < CACHE_DURATION_SECONDS:
        logger.info("Returning cached leaderboard data.")
        return cached_data

    logger.info(f"Cache expired or empty. Fetching fresh data from {DATASET_NAME}...")
    try:
        dataset = load_dataset(DATASET_NAME, split="train")

        # A DataFrame makes the multi-key sort below straightforward.
        df = pd.DataFrame(dataset)

        required_columns = ['username', 'score', 'timestamp', 'code']
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            raise ValueError(f"Dataset missing required columns: {missing}")

        # Parse into a separate column so the original timestamp value is what
        # gets returned. errors='coerce' turns unparseable values into NaT;
        # utc=True puts offset-aware and naive inputs on one comparable
        # timeline instead of producing a mixed column that cannot be sorted.
        parsed = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)
        df = df.assign(timestamp_dt=parsed).dropna(subset=['timestamp_dt'])

        # Highest score first; among equal scores, the earliest submission wins.
        df_sorted = df.sort_values(
            by=['score', 'timestamp_dt'],
            ascending=[False, True],
        )

        # Only the required columns are exposed; the helper datetime column
        # was needed for ordering only.
        leaderboard = df_sorted[required_columns].to_dict(orient='records')

        cached_data = leaderboard
        last_cache_time = current_time
        logger.info(f"Successfully fetched and cached data. {len(leaderboard)} entries.")

        return cached_data

    except Exception as e:
        # Top-level boundary: log with traceback, then convert to an
        # HTTPException so FastAPI returns a proper error response.
        logger.exception(f"Error loading or processing dataset {DATASET_NAME}: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to load or process leaderboard data: {e}",
        ) from e
70
+
71
+ # Optional: to make `app` an importable package, add an __init__.py file to the app directory
72
+ # (an empty file named app/__init__.py is sufficient).