Commit · 970eef1
Parent(s):
first commit
This view is limited to 50 files because it contains too many changes. See raw diff.
- .env.example +3 -0
- .gitignore +58 -0
- Dockerfile +61 -0
- README.md +11 -0
- backend/Dockerfile.dev +25 -0
- backend/README.md +352 -0
- backend/main.py +38 -0
- backend/old-pyproject.toml +26 -0
- backend/poetry.lock +0 -0
- backend/pyproject.toml +47 -0
- backend/routes/__init__.py +23 -0
- backend/routes/benchmark.py +132 -0
- backend/routes/download.py +74 -0
- backend/routes/evaluation.py +150 -0
- backend/routes/health.py +13 -0
- backend/routes/questions.py +87 -0
- backend/routes/upload.py +52 -0
- backend/tasks/__init__.py +3 -0
- backend/tasks/createBench.py +317 -0
- backend/tasks/createBenchConfigFile.py +313 -0
- backend/tasks/evaluationTask.py +471 -0
- backend/tasks/yourbench_lighteval_task.py +273 -0
- backend/tests/test_evaluation.py +165 -0
- backend/tests/test_hf_upload.py +78 -0
- backend/tests/test_inference.py +84 -0
- backend/tests/test_lighteval.py +151 -0
- backend/tests/test_openai.py +31 -0
- backend/tests/test_parallel_lighteval.py +278 -0
- backend/tests/test_provider_parallel_support.py +227 -0
- backend/tests/test_yourbench_results.py +394 -0
- docker-compose.yml +33 -0
- frontend/Dockerfile.dev +15 -0
- frontend/README.md +80 -0
- frontend/package.json +55 -0
- frontend/public/index.html +96 -0
- frontend/public/logo256.png +0 -0
- frontend/public/logo32.png +0 -0
- frontend/public/og-image.jpg +0 -0
- frontend/public/robots.txt +3 -0
- frontend/server.js +85 -0
- frontend/src/App.js +427 -0
- frontend/src/components/BenchmarkCreateForm.jsx +295 -0
- frontend/src/components/BenchmarkDisplay.jsx +161 -0
- frontend/src/components/BenchmarkEvaluation.jsx +364 -0
- frontend/src/components/BenchmarkGenerator.jsx +398 -0
- frontend/src/components/EvaluationDisplay.jsx +196 -0
- frontend/src/components/Footer/Footer.js +30 -0
- frontend/src/components/LogDisplay.jsx +67 -0
- frontend/src/components/Logo/HFLogo.js +19 -0
- frontend/src/components/Logo/Logo.js +56 -0
.env.example
ADDED
@@ -0,0 +1,3 @@
ENVIRONMENT=development
HF_TOKEN=xxx
HF_HOME=.cache

.gitignore
ADDED
@@ -0,0 +1,58 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

__pycache__
.cache/

# dependencies

frontend/node_modules
/.pnp
.pnp.js

# testing
/coverage

/backend/uploaded_files
/backend/logs

.venv/

__pycache__/
*.pkl
model_info_cache.pkl
model_size_cache.pkl
gif.gif
src/assets/scale-hf-logo.png

.litellm_cache/

# production

/build

# misc

.DS_Store
.env.local
.env.development.local
.env.test.local
.env.production.local

npm-debug.log*
yarn-debug.log*
yarn-error.log*

src/dataframe.json

yarn.lock
package-lock.json

/public

.claudesync/

# Environment variables
.env
.env.*
!.env.example

Dockerfile
ADDED
@@ -0,0 +1,61 @@
# Build frontend
FROM node:18 as frontend-build
WORKDIR /app
COPY frontend/package*.json ./
RUN npm install
COPY frontend/ ./

RUN npm run build

# Build backend
FROM python:3.12-slim
WORKDIR /app

# Create non-root user
RUN useradd -m -u 1000 user

# Install uv instead of poetry
RUN pip install uv

# Create and configure cache directory
RUN mkdir -p /app/.cache && \
    chown -R user:user /app

# Copy and install backend dependencies
COPY backend/pyproject.toml ./
RUN uv pip install -e . --system

# Copy backend code
COPY backend/ .

# Install Node.js and npm
RUN apt-get update && apt-get install -y \
    curl \
    netcat-openbsd \
    && curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
    && apt-get install -y nodejs \
    && rm -rf /var/lib/apt/lists/*

# Copy frontend server and build
COPY --from=frontend-build /app/build ./frontend/build
COPY --from=frontend-build /app/package*.json ./frontend/
COPY --from=frontend-build /app/server.js ./frontend/

# Install frontend production dependencies
WORKDIR /app/frontend
RUN npm install --production
WORKDIR /app

# Environment variables
ENV HF_HOME=/app/.cache \
    HF_DATASETS_CACHE=/app/.cache \
    INTERNAL_API_PORT=7861 \
    PORT=7860 \
    NODE_ENV=production

# Note: HF_TOKEN should be provided at runtime, not build time
USER user
EXPOSE 7860

# Start both servers with wait-for
CMD ["sh", "-c", "uvicorn app.asgi:app --host 0.0.0.0 --port 7861 & while ! nc -z localhost 7861; do sleep 1; done && cd frontend && npm run serve"]

README.md
ADDED
@@ -0,0 +1,11 @@
---
title: Yourbench simple demo
emoji: 🏆
colorFrom: blue
colorTo: red
sdk: docker
hf_oauth: true
pinned: true
license: apache-2.0
short_description: Yourbench demo
---

backend/Dockerfile.dev
ADDED
@@ -0,0 +1,25 @@
FROM python:3.12-slim

WORKDIR /app

# Install required system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install poetry
RUN pip install poetry

# Copy Poetry configuration files
COPY pyproject.toml poetry.lock* ./

# Install dependencies
RUN poetry config virtualenvs.create false && \
    poetry install --no-interaction --no-ansi --no-root

# Environment variables configuration for logs
ENV PYTHONUNBUFFERED=1
ENV LOG_LEVEL=INFO

# In dev, mount volume directly
CMD ["uvicorn", "app.asgi:app", "--host", "0.0.0.0", "--port", "7860", "--reload", "--log-level", "warning", "--no-access-log"]

backend/README.md
ADDED
@@ -0,0 +1,352 @@
# Backend - Open LLM Leaderboard 🏆

FastAPI backend for the Open LLM Leaderboard. This service is part of a larger architecture that includes a React frontend. For complete project installation, see the [main README](../README.md).

## ✨ Features

- 📊 REST API for LLM models leaderboard management
- 🗳️ Voting and ranking system
- 🔄 HuggingFace Hub integration
- 🚀 Caching and performance optimizations

## 🏗 Architecture

```mermaid
flowchart TD
    Client(["**Frontend**<br><br>React Application"]) --> API["**API Server**<br><br>FastAPI REST Endpoints"]

    subgraph Backend
        API --> Core["**Core Layer**<br><br>• Middleware<br>• Cache<br>• Rate Limiting"]
        Core --> Services["**Services Layer**<br><br>• Business Logic<br>• Data Processing"]

        subgraph Services Layer
            Services --> Models["**Model Service**<br><br>• Model Submission<br>• Evaluation Pipeline"]
            Services --> Votes["**Vote Service**<br><br>• Vote Management<br>• Data Synchronization"]
            Services --> Board["**Leaderboard Service**<br><br>• Rankings<br>• Performance Metrics"]
        end

        Models --> Cache["**Cache Layer**<br><br>• In-Memory Store<br>• Auto Invalidation"]
        Votes --> Cache
        Board --> Cache

        Models --> HF["**HuggingFace Hub**<br><br>• Models Repository<br>• Datasets Access"]
        Votes --> HF
        Board --> HF
    end

    style Client fill:#f9f,stroke:#333,stroke-width:2px
    style Models fill:#bbf,stroke:#333,stroke-width:2px
    style Votes fill:#bbf,stroke:#333,stroke-width:2px
    style Board fill:#bbf,stroke:#333,stroke-width:2px
    style HF fill:#bfb,stroke:#333,stroke-width:2px
```

## 🛠️ HuggingFace Datasets

The application uses several datasets on the HuggingFace Hub:

### 1. Requests Dataset (`{HF_ORGANIZATION}/requests`)

- **Operations**:
  - 📤 `POST /api/models/submit`: Adds a JSON file for each new model submission
  - 📥 `GET /api/models/status`: Reads files to get models status
- **Format**: One JSON file per model with submission details
- **Updates**: On each new model submission

### 2. Votes Dataset (`{HF_ORGANIZATION}/votes`)

- **Operations**:
  - 📤 `POST /api/votes/{model_id}`: Adds a new vote
  - 📥 `GET /api/votes/model/{provider}/{model}`: Reads model votes
  - 📥 `GET /api/votes/user/{user_id}`: Reads user votes
- **Format**: JSONL with one vote per line
- **Sync**: Bidirectional between local cache and Hub

### 3. Contents Dataset (`{HF_ORGANIZATION}/contents`)

- **Operations**:
  - 📥 `GET /api/leaderboard`: Reads raw data
  - 📥 `GET /api/leaderboard/formatted`: Reads and formats data
- **Format**: Main dataset containing all scores and metrics
- **Updates**: Automatic after model evaluations

### 4. Official Providers Dataset (`{HF_ORGANIZATION}/official-providers`)

- **Operations**:
  - 📥 Read-only access for highlighted models
- **Format**: List of models selected by maintainers
- **Updates**: Manual by maintainers

## 🛠 Local Development

### Prerequisites

- Python 3.9+
- [Poetry](https://python-poetry.org/docs/#installation)

### Standalone Installation (without Docker)

```bash
# Install dependencies
poetry install

# Setup configuration
cp .env.example .env

# Start development server
poetry run uvicorn app.asgi:app --host 0.0.0.0 --port 7860 --reload
```

Server will be available at http://localhost:7860

## ⚙️ Configuration

| Variable     | Description                           | Default     |
| ------------ | ------------------------------------- | ----------- |
| ENVIRONMENT  | Environment (development/production)  | development |
| HF_TOKEN     | HuggingFace authentication token      | -           |
| PORT         | Server port                           | 7860        |
| LOG_LEVEL    | Logging level (INFO/DEBUG/WARNING)    | INFO        |
| CORS_ORIGINS | Allowed CORS origins                  | ["*"]       |
| CACHE_TTL    | Cache Time To Live in seconds         | 300         |

## 🔧 Middleware

The backend uses several middleware layers for optimal performance and security:

- **CORS Middleware**: Handles Cross-Origin Resource Sharing
- **GZIP Middleware**: Compresses responses > 500 bytes
- **Rate Limiting**: Prevents API abuse
- **Caching**: In-memory caching with automatic invalidation

## 📝 Logging

The application uses a structured logging system with:

- Formatted console output
- Different log levels per component
- Request/Response logging
- Performance metrics
- Error tracking

## 📁 File Structure

```
backend/
├── app/                  # Source code
│   ├── api/              # Routes and endpoints
│   │   └── endpoints/    # Endpoint handlers
│   ├── core/             # Configurations
│   ├── services/         # Business logic
│   └── utils/            # Utilities
└── tests/                # Tests
```

## 📚 API

Swagger documentation available at http://localhost:7860/docs

### Main Endpoints & Data Structures

#### Leaderboard

- `GET /api/leaderboard/formatted` - Formatted data with computed fields and metadata

  ```typescript
  Response {
    models: [{
      id: string,              // eval_name
      model: {
        name: string,          // fullname
        sha: string,           // Model sha
        precision: string,     // e.g. "fp16", "int8"
        type: string,          // e.g. "fined-tuned-on-domain-specific-dataset"
        weight_type: string,
        architecture: string,
        average_score: number,
        has_chat_template: boolean
      },
      evaluations: {
        ifeval: {
          name: "IFEval",
          value: number,       // Raw score
          normalized_score: number
        },
        bbh: {
          name: "BBH",
          value: number,
          normalized_score: number
        },
        math: {
          name: "MATH Level 5",
          value: number,
          normalized_score: number
        },
        gpqa: {
          name: "GPQA",
          value: number,
          normalized_score: number
        },
        musr: {
          name: "MUSR",
          value: number,
          normalized_score: number
        },
        mmlu_pro: {
          name: "MMLU-PRO",
          value: number,
          normalized_score: number
        }
      },
      features: {
        is_not_available_on_hub: boolean,
        is_merged: boolean,
        is_moe: boolean,
        is_flagged: boolean,
        is_official_provider: boolean
      },
      metadata: {
        upload_date: string,
        submission_date: string,
        generation: string,
        base_model: string,
        hub_license: string,
        hub_hearts: number,
        params_billions: number,
        co2_cost: number       // CO₂ cost in kg
      }
    }]
  }
  ```

- `GET /api/leaderboard` - Raw data from the HuggingFace dataset

  ```typescript
  Response {
    models: [{
      eval_name: string,
      Precision: string,
      Type: string,
      "Weight type": string,
      Architecture: string,
      Model: string,
      fullname: string,
      "Model sha": string,
      "Average ⬆️": number,
      "Hub License": string,
      "Hub ❤️": number,
      "#Params (B)": number,
      "Available on the hub": boolean,
      Merged: boolean,
      MoE: boolean,
      Flagged: boolean,
      "Chat Template": boolean,
      "CO₂ cost (kg)": number,
      "IFEval Raw": number,
      IFEval: number,
      "BBH Raw": number,
      BBH: number,
      "MATH Lvl 5 Raw": number,
      "MATH Lvl 5": number,
      "GPQA Raw": number,
      GPQA: number,
      "MUSR Raw": number,
      MUSR: number,
      "MMLU-PRO Raw": number,
      "MMLU-PRO": number,
      "Maintainer's Highlight": boolean,
      "Upload To Hub Date": string,
      "Submission Date": string,
      Generation: string,
      "Base Model": string
    }]
  }
  ```

#### Models

- `GET /api/models/status` - Get all models grouped by status

  ```typescript
  Response {
    pending: [{
      name: string,
      submitter: string,
      revision: string,
      wait_time: string,
      submission_time: string,
      status: "PENDING" | "EVALUATING" | "FINISHED",
      precision: string
    }],
    evaluating: Array<Model>,
    finished: Array<Model>
  }
  ```

- `GET /api/models/pending` - Get pending models only
- `POST /api/models/submit` - Submit model

  ```typescript
  Request {
    user_id: string,
    model_id: string,
    base_model?: string,
    precision?: string,
    model_type: string
  }

  Response {
    status: string,
    message: string
  }
  ```

- `GET /api/models/{model_id}/status` - Get model status

#### Votes

- `POST /api/votes/{model_id}` - Vote

  ```typescript
  Request {
    vote_type: "up" | "down",
    user_id: string          // HuggingFace username
  }

  Response {
    success: boolean,
    message: string
  }
  ```

- `GET /api/votes/model/{provider}/{model}` - Get model votes

  ```typescript
  Response {
    total_votes: number,
    up_votes: number,
    down_votes: number
  }
  ```

- `GET /api/votes/user/{user_id}` - Get user votes

  ```typescript
  Response Array<{
    model_id: string,
    vote_type: string,
    timestamp: string
  }>
  ```

## 🔒 Authentication

The backend uses HuggingFace token-based authentication for secure API access. Make sure to:

1. Set your HF_TOKEN in the .env file
2. Include the token in API requests via Bearer authentication
3. Keep your token secure and never commit it to version control

## 🚀 Performance

The backend implements several optimizations:

- In-memory caching with configurable TTL (Time To Live)
- Batch processing for model evaluations
- Rate limiting for API endpoints
- Efficient database queries with proper indexing
- Automatic cache invalidation for votes

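As an illustration of the Bearer authentication and leaderboard endpoint described in this README (not part of the commit itself), here is a minimal client sketch; the base URL, port, and use of the HF_TOKEN environment variable are assumptions taken from the README's Configuration and Local Development sections.

```python
# Minimal sketch only, not part of this commit: call the documented leaderboard
# endpoint with Bearer authentication, assuming the backend runs locally on port 7860
# and HF_TOKEN is set as described in the README's Configuration section.
import os

import requests

API_BASE = "http://localhost:7860"  # assumed local dev address
headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}

resp = requests.get(f"{API_BASE}/api/leaderboard/formatted", headers=headers, timeout=30)
resp.raise_for_status()
print(len(resp.json()["models"]), "models returned")
```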
backend/main.py
ADDED
@@ -0,0 +1,38 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import os
from dotenv import load_dotenv
from routes import routers, session_files, active_bench_tasks

# Load environment variables from .env file
load_dotenv()

# Verify environment variables are loaded
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    print("Warning: HF_TOKEN environment variable is not set. Make sure it's defined in your .env file.")

hf_organization = os.getenv("HF_ORGANIZATION")
if not hf_organization:
    print("Warning: HF_ORGANIZATION environment variable is not set. Make sure it's defined in your .env file.")

app = FastAPI(title="Yourbench API")

# Enable CORS to allow requests from the frontend
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In a production environment, specify the exact origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Add an event handler to display session_files at startup
@app.on_event("startup")
async def startup_event():
    print("Application startup")
    print(f"Initial session_files: {session_files}")

# Register all routes
for router in routers:
    app.include_router(router)

backend/old-pyproject.toml
ADDED
@@ -0,0 +1,26 @@
[tool.poetry]
name = "llm-leaderboard-backend"
version = "0.1.0"
description = "Backend for the Open LLM Leaderboard"
authors = ["Your Name <[email protected]>"]

[tool.poetry.dependencies]
python = ">=3.12,<3.13"
fastapi = "^0.115.6"
huggingface-hub = "0.29.3"
python-dotenv = "^1.0.1"
python-multipart = "^0.0.9"
uvicorn = {extras = ["standard"], version = "^0.27.0"}
loguru = "^0.7.3"
lighteval = {version = ">=0.8.0", extras = ["math"]}
tqdm = "^4.67.1"
asyncio = "^3.4.3"
datasets = "^3.3.0"
yourbench = {git = "https://github.com/huggingface/yourbench.git"}
tiktoken = "^0.9.0"
requests = {extras = ["socks"], version = "^2.32.3"}
httpx-socks = "^0.10.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

backend/poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
backend/pyproject.toml
ADDED
@@ -0,0 +1,47 @@
[project]
name = "yourbench-simple-demo"
version = "0.1.0"
authors = [
    { name = "Sumuk Shashidhar", email = "[email protected]" },
    { name = "Alina Lozovskaia", email = "[email protected]" },
    { name = "Clémentine Fourrier", email = "[email protected]" },
    { name = "Nathan Habib", email = "[email protected]" },
]
requires-python = ">=3.12, <3.13"

dependencies = [
    "yourbench @ git+https://github.com/huggingface/yourbench.git@main",
    "asyncio>=3.4.3",
    "datasets>=3.3.0",
    "loguru>=0.7.3",
    "python-dotenv>=1.0.1",
    "tqdm>=4.67.1",
    "ruff>=0.11.2",
    "lighteval>=0.8.0",
    "huggingface-hub>=0.22.0",
]

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[tool.ruff]
line-length = 119
exclude = ["**/*.ipynb"]

lint.ignore = ["E501", "C901", "F841"]
lint.select = ["C", "E", "F", "I", "W"]
lint.fixable = ["A", "B", "C", "D", "E", "F", "I", "W"]
preview = true

[tool.ruff.lint.isort]
length-sort = true
lines-after-imports = 2
no-lines-before = ["standard-library", "local-folder"]
split-on-trailing-comma = true

[tool.ruff.format]
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"

backend/routes/__init__.py
ADDED
@@ -0,0 +1,23 @@
# Module routes
from .health import router as health_router
from .upload import router as upload_router, session_files
from .benchmark import router as benchmark_router, active_bench_tasks
from .questions import router as questions_router
from .download import router as download_router
from .evaluation import router as evaluation_router, active_evaluation_tasks

# Expose the routers
routers = [
    health_router,
    upload_router,
    benchmark_router,
    questions_router,
    download_router,
    evaluation_router
]

# Reference data shared between routes
benchmark_router.session_files = session_files

# Expose shared variables for main.py
__all__ = ['routers', 'session_files', 'active_bench_tasks', 'active_evaluation_tasks']

backend/routes/benchmark.py
ADDED
@@ -0,0 +1,132 @@
from fastapi import APIRouter, HTTPException
from typing import Dict, Any
import os
import time
from tasks.createBenchConfigFile import CreateBenchConfigTask
from tasks.createBench import CreateBenchTask

router = APIRouter(tags=["benchmark"])

# Store active tasks by session_id (imported in main.py)
active_bench_tasks = {}
active_config_tasks = {}

# Reference to session_files (provided by main.py)
# This declaration will be overwritten by the assignment in __init__.py
session_files = {}

@router.post("/generate-benchmark")
async def generate_benchmark(data: Dict[str, Any]):
    """
    Generate a benchmark configuration and run the ingestion process

    Args:
        data: Dictionary containing session_id

    Returns:
        Dictionary with logs and config_path
    """
    session_id = data.get("session_id")

    # Debug output to check session_files and the received session_id
    print(f"DEBUG: Received session ID: {session_id}")
    print(f"DEBUG: Available session files: {list(router.session_files.keys())}")

    if not session_id or session_id not in router.session_files:
        return {"error": "Invalid or missing session ID"}

    file_path = router.session_files[session_id]
    all_logs = []

    try:
        # Step 1: Generate configuration file
        config_task = CreateBenchConfigTask(session_uid=session_id)
        # Store the config task for later log retrieval
        active_config_tasks[session_id] = config_task

        # Start configuration generation asynchronously
        config_path = config_task.run(file_path=file_path)

        # Add initial logs
        all_logs.extend(config_task.get_logs())

        # Step 2: Run the createBench task with the generated config
        # Note: This will be started by a separate endpoint once configuration is done

        return {
            "status": "running",
            "config_path": config_path,
            "logs": all_logs
        }
    except Exception as e:
        return {
            "status": "error",
            "error": str(e),
            "logs": all_logs
        }

@router.get("/config-logs/{session_id}")
async def get_config_logs(session_id: str):
    """
    Get the logs for a running configuration task

    Args:
        session_id: Session ID for the task

    Returns:
        Dictionary with logs and completion status
    """
    if session_id not in active_config_tasks:
        raise HTTPException(status_code=404, detail="Configuration task not found")

    config_task = active_config_tasks[session_id]
    logs = config_task.get_logs()
    is_completed = config_task.is_task_completed()

    # If the configuration is complete and the benchmark has not started yet,
    # start the benchmark automatically
    if is_completed and session_id not in active_bench_tasks:
        try:
            # Ensure the config_path is a string
            config_path_str = f"uploaded_files/{session_id}/config.yml"
            bench_task = CreateBenchTask(session_uid=session_id, config_path=config_path_str)

            # Store the bench task for later log retrieval
            active_bench_tasks[session_id] = bench_task

            # Add a transition log
            logs.append("[INFO] Configuration file generated, starting benchmark creation")

            # Run the task
            bench_task.run()
        except Exception as bench_error:
            error_msg = f"Error starting benchmark creation: {str(bench_error)}"
            logs.append(f"[ERROR] {error_msg}")

    return {
        "logs": logs,
        "is_completed": is_completed
    }

@router.get("/benchmark-logs/{session_id}")
async def get_benchmark_logs(session_id: str):
    """
    Get the logs for a running benchmark task

    Args:
        session_id: Session ID for the task

    Returns:
        Dictionary with logs and completion status
    """
    if session_id not in active_bench_tasks:
        raise HTTPException(status_code=404, detail="Benchmark task not found")

    bench_task = active_bench_tasks[session_id]
    logs = bench_task.get_logs()
    is_completed = bench_task.is_task_completed()

    return {
        "logs": logs,
        "is_completed": is_completed
    }

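To make the two-step flow above concrete, here is a minimal, hypothetical client sketch (not part of this commit). It assumes a `session_id` previously returned by `POST /upload` and a backend reachable on localhost:7860; it starts config generation, then polls `/config-logs` (which auto-starts benchmark creation on completion) followed by `/benchmark-logs`.

```python
# Minimal sketch only, not part of this commit. Assumes the API runs on
# http://localhost:7860 and `session_id` was returned earlier by POST /upload.
import time

import requests

API_BASE = "http://localhost:7860"  # assumed dev address
session_id = "..."  # placeholder: value returned by POST /upload

# Step 1: generate the benchmark configuration from the uploaded file
print(requests.post(f"{API_BASE}/generate-benchmark", json={"session_id": session_id}, timeout=30).json())

# Step 2: poll config logs, then benchmark logs, until each task reports completion
for endpoint in ("config-logs", "benchmark-logs"):
    while True:
        state = requests.get(f"{API_BASE}/{endpoint}/{session_id}", timeout=30).json()
        if state.get("is_completed"):
            break
        time.sleep(2)
```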
backend/routes/download.py
ADDED
@@ -0,0 +1,74 @@
from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download, snapshot_download
import os
import tempfile
import shutil
import zipfile
import io
import logging

router = APIRouter(tags=["download"])

@router.get("/download-dataset/{session_id}")
async def download_dataset(session_id: str):
    """
    Download the HuggingFace dataset associated with a session and return it to the client

    Args:
        session_id: Session identifier

    Returns:
        ZIP file containing the dataset
    """
    try:
        # Create a temporary directory to store the dataset files
        with tempfile.TemporaryDirectory() as temp_dir:
            # HuggingFace repo identifier
            repo_id = f"yourbench/yourbench_{session_id}"

            try:
                # Download the dataset snapshot from HuggingFace
                logging.info(f"Downloading dataset {repo_id}")
                snapshot_path = snapshot_download(
                    repo_id=repo_id,
                    repo_type="dataset",
                    local_dir=temp_dir,
                    token=os.environ.get("HF_TOKEN")
                )

                logging.info(f"Dataset downloaded to {snapshot_path}")

                # Create a ZIP file in memory
                zip_io = io.BytesIO()
                with zipfile.ZipFile(zip_io, 'w', zipfile.ZIP_DEFLATED) as zip_file:
                    # Walk through all dataset files and add them to the ZIP
                    for root, _, files in os.walk(snapshot_path):
                        for file in files:
                            file_path = os.path.join(root, file)
                            arc_name = os.path.relpath(file_path, snapshot_path)
                            zip_file.write(file_path, arcname=arc_name)

                # Reset the stream cursor to the beginning
                zip_io.seek(0)

                # Return the ZIP to the client
                filename = f"yourbench_{session_id}_dataset.zip"
                return StreamingResponse(
                    zip_io,
                    media_type="application/zip",
                    headers={"Content-Disposition": f"attachment; filename={filename}"}
                )

            except Exception as e:
                logging.error(f"Error while downloading the dataset: {str(e)}")
                raise HTTPException(
                    status_code=500,
                    detail=f"Error while downloading the dataset: {str(e)}"
                )
    except Exception as e:
        logging.error(f"General error: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"Error during download: {str(e)}"
        )

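For reference (not part of this commit), a minimal sketch of saving the ZIP streamed by `/download-dataset/{session_id}` to disk, assuming the backend runs locally on port 7860 and the session's dataset exists on the Hub:

```python
# Minimal sketch only, not part of this commit. Assumes the API runs on
# http://localhost:7860 and the session's dataset was pushed to the Hub.
import requests

API_BASE = "http://localhost:7860"  # assumed dev address
session_id = "..."  # placeholder session ID

with requests.get(f"{API_BASE}/download-dataset/{session_id}", stream=True, timeout=120) as resp:
    resp.raise_for_status()
    with open(f"yourbench_{session_id}_dataset.zip", "wb") as fh:
        for chunk in resp.iter_content(chunk_size=8192):
            fh.write(chunk)
```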
backend/routes/evaluation.py
ADDED
@@ -0,0 +1,150 @@
from fastapi import APIRouter, HTTPException
from typing import Dict, Any
import os
from tasks.evaluationTask import EvaluationTask

router = APIRouter(tags=["evaluation"])

# Store active evaluation tasks by session_id
active_evaluation_tasks = {}

@router.post("/evaluate-benchmark")
async def evaluate_benchmark(data: Dict[str, Any]):
    """
    Start the evaluation of a benchmark for a given session

    Args:
        data: Dictionary containing session_id

    Returns:
        Dictionary with status and initial logs
    """
    session_id = data.get("session_id")

    if not session_id:
        return {"error": "Missing or invalid session ID"}

    # Check whether an evaluation is already running for this session
    if session_id in active_evaluation_tasks:
        evaluation_task = active_evaluation_tasks[session_id]
        # If the evaluation is already finished, a new one can be started
        if evaluation_task.is_task_completed():
            # Remove the old task
            del active_evaluation_tasks[session_id]
        else:
            # An evaluation is already in progress
            return {
                "status": "already_running",
                "message": "An evaluation is already running for this session",
                "logs": evaluation_task.get_logs()
            }

    try:
        # Dataset name based on the session ID
        dataset_name = f"yourbench_{session_id}"

        # Create and start a new evaluation task
        evaluation_task = EvaluationTask(session_uid=session_id, dataset_name=dataset_name)
        active_evaluation_tasks[session_id] = evaluation_task

        # Start the evaluation asynchronously
        evaluation_task.run()

        # Retrieve the initial logs
        initial_logs = evaluation_task.get_logs()

        return {
            "status": "started",
            "message": f"Evaluation started for benchmark {dataset_name}",
            "logs": initial_logs
        }
    except Exception as e:
        return {
            "status": "error",
            "error": str(e),
            "message": f"Error while starting the evaluation: {str(e)}"
        }

@router.get("/evaluation-logs/{session_id}")
async def get_evaluation_logs(session_id: str):
    """
    Retrieve the logs of a running evaluation

    Args:
        session_id: Session ID for which to retrieve the logs

    Returns:
        Dictionary with logs and completion status
    """
    if session_id not in active_evaluation_tasks:
        raise HTTPException(status_code=404, detail="Evaluation task not found")

    evaluation_task = active_evaluation_tasks[session_id]
    logs = evaluation_task.get_logs()
    is_completed = evaluation_task.is_task_completed()

    # Retrieve the results if available and the evaluation is complete
    results = None
    if is_completed and hasattr(evaluation_task, 'results') and evaluation_task.results:
        results = evaluation_task.results

    return {
        "logs": logs,
        "is_completed": is_completed,
        "results": results
    }

@router.get("/evaluation-results/{session_id}")
async def get_evaluation_results(session_id: str):
    """
    Retrieve results of a completed evaluation

    Args:
        session_id: Session ID to retrieve results for

    Returns:
        Dictionary with evaluation results
    """
    # First, check if the task is in memory
    if session_id in active_evaluation_tasks:
        evaluation_task = active_evaluation_tasks[session_id]

        if not evaluation_task.is_task_completed():
            return {
                "success": False,
                "message": "Evaluation is still in progress"
            }

        if hasattr(evaluation_task, 'results') and evaluation_task.results:
            return {
                "success": True,
                "results": evaluation_task.results
            }

    # If we get here, either the task is not in memory or it doesn't have results
    # Try to load results from file
    try:
        # Construct the path to the results file
        results_path = f"uploaded_files/{session_id}/lighteval_results/models_comparison.json"

        # Check if the file exists
        if not os.path.exists(results_path):
            return {
                "success": False,
                "message": "No evaluation results found for this session"
            }

        # Read the file
        import json
        with open(results_path, 'r') as f:
            results = json.load(f)

        return {
            "success": True,
            "results": results
        }
    except Exception as e:
        return {
            "success": False,
            "message": f"Error retrieving evaluation results: {str(e)}"
        }

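A minimal sketch of the evaluation flow defined above (not part of this commit): start the evaluation, poll the logs until completion, then read the results. The base URL and placeholder `session_id` are assumptions.

```python
# Minimal sketch only, not part of this commit. Assumes the API runs on
# http://localhost:7860 and a benchmark dataset already exists for `session_id`.
import time

import requests

API_BASE = "http://localhost:7860"  # assumed dev address
session_id = "..."  # placeholder session ID

print(requests.post(f"{API_BASE}/evaluate-benchmark", json={"session_id": session_id}, timeout=30).json())

# Poll the logs until the evaluation task reports completion
while True:
    state = requests.get(f"{API_BASE}/evaluation-logs/{session_id}", timeout=30).json()
    if state.get("is_completed"):
        break
    time.sleep(5)

# Fetch the final results (falls back to the results file if the task left memory)
print(requests.get(f"{API_BASE}/evaluation-results/{session_id}", timeout=30).json())
```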
backend/routes/health.py
ADDED
@@ -0,0 +1,13 @@
from fastapi import APIRouter

router = APIRouter(tags=["health"])

@router.get("/health")
async def health_check():
    """
    Check if the API is running properly

    Returns:
        Dictionary with status
    """
    return {"status": "ok"}

backend/routes/questions.py
ADDED
@@ -0,0 +1,87 @@
from fastapi import APIRouter, HTTPException
import random
from datasets import load_dataset
from huggingface_hub import HfApi, dataset_info
import os

router = APIRouter(tags=["benchmark"])

@router.get("/benchmark-questions/{session_id}")
async def get_benchmark_questions(session_id: str):
    """
    Get example questions from the generated benchmark

    Args:
        session_id: Session ID for the benchmark

    Returns:
        Dictionary with sample questions from the dataset
    """
    try:
        # Dataset path on Hugging Face
        dataset_repo_id = f"yourbench/yourbench_{session_id}"

        # Initialize response
        response = {
            "success": False,
            "questions": [],
            "dataset_url": f"https://huggingface.co/datasets/{dataset_repo_id}"
        }

        # Try to load the dataset
        questions = []

        try:
            # Try to load the single-shot questions directly with the config name
            single_dataset = load_dataset(dataset_repo_id, 'single_shot_questions')
            if single_dataset and len(single_dataset['train']) > 0:
                # Get a random sample (up to 2) from single-shot questions
                sample_indices = random.sample(range(len(single_dataset['train'])), min(2, len(single_dataset['train'])))
                for idx in sample_indices:
                    questions.append({
                        "id": str(idx),
                        "question": single_dataset['train'][idx].get("question", ""),
                        "type": "single_shot"
                    })
                print(f"Loaded {len(questions)} single-shot questions")
        except Exception as e:
            print(f"Error loading single-shot questions: {str(e)}")

        try:
            # Load multi-hop questions if more samples are needed
            if len(questions) < 2:
                multi_dataset = load_dataset(dataset_repo_id, 'multi_hop_questions')
                if multi_dataset and len(multi_dataset['train']) > 0:
                    # Get remaining questions from multi-hop questions
                    remaining = 2 - len(questions)
                    sample_indices = random.sample(range(len(multi_dataset['train'])), min(remaining, len(multi_dataset['train'])))
                    for idx in sample_indices:
                        questions.append({
                            "id": str(idx),
                            "question": multi_dataset['train'][idx].get("question", ""),
                            "type": "multi_hop"
                        })
                    print(f"Loaded {len(questions)} multi-hop questions")
        except Exception as e:
            print(f"Error loading multi-hop questions: {str(e)}")

        # If we couldn't load any questions, the dataset might not exist
        if len(questions) == 0:
            # Check if we have a directory for this session locally as fallback
            session_dir = os.path.join("uploaded_files", session_id)
            if not os.path.exists(session_dir):
                raise HTTPException(status_code=404, detail="Dataset not found")

        # Update the response
        response["success"] = len(questions) > 0
        response["questions"] = questions
        return response

    except HTTPException:
        # Re-raise HTTP exceptions
        raise
    except Exception as e:
        return {
            "success": False,
            "error": f"Error retrieving benchmark questions: {str(e)}"
        }

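A minimal sketch (not part of this commit) of fetching the sample questions exposed by `/benchmark-questions/{session_id}`, with the base URL and placeholder session ID as assumptions:

```python
# Minimal sketch only, not part of this commit. Assumes the API runs on
# http://localhost:7860 and the benchmark dataset for `session_id` is on the Hub.
import requests

API_BASE = "http://localhost:7860"  # assumed dev address
session_id = "..."  # placeholder session ID

data = requests.get(f"{API_BASE}/benchmark-questions/{session_id}", timeout=60).json()
for q in data.get("questions", []):
    print(f"[{q['type']}] {q['question']}")
```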
backend/routes/upload.py
ADDED
@@ -0,0 +1,52 @@
from fastapi import APIRouter, UploadFile, File
import os
import shutil
import uuid

router = APIRouter(tags=["files"])

# File storage per session (imported in main.py)
session_files = {}

# Root folder for uploads
UPLOAD_ROOT = "uploaded_files"
os.makedirs(UPLOAD_ROOT, exist_ok=True)

@router.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """
    Upload a file to the server and generate a session ID

    Args:
        file: The file to upload

    Returns:
        Dictionary with filename, status and session_id
    """
    # Check that the file is a PDF, TXT, HTML or MD
    if not file.filename.endswith(('.pdf', '.txt', '.html', '.md')):
        return {"error": "Only PDF, TXT, HTML and MD files are accepted"}

    # Generate a session ID for this file
    session_id = str(uuid.uuid4())

    # Create the session directory structure
    session_dir = os.path.join(UPLOAD_ROOT, session_id)
    uploaded_files_dir = os.path.join(session_dir, "uploaded_files")
    os.makedirs(uploaded_files_dir, exist_ok=True)

    # Create the full path to save the file
    file_path = os.path.join(uploaded_files_dir, file.filename)

    # Save the file
    with open(file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Store file path for later use
    session_files[session_id] = file_path

    # Debug output to check the state of session_files
    print(f"DEBUG UPLOAD: File uploaded with session_id: {session_id}")
    print(f"DEBUG UPLOAD: Current session_files: {session_files}")

    return {"filename": file.filename, "status": "uploaded", "session_id": session_id}
backend/tasks/__init__.py
ADDED
@@ -0,0 +1,3 @@
"""
Tasks module for YourbenchSimpleDemo
"""

backend/tasks/createBench.py
ADDED
@@ -0,0 +1,317 @@
"""
Task to ingest and transform documents to markdown using yourbench
"""
import os
import time
import pathlib
import subprocess
import threading
from typing import Optional, List, Tuple, Dict, Any
import yaml

from loguru import logger


class CreateBenchTask:
    """
    Task to ingest and transform documents to markdown using yourbench
    """

    def __init__(self, session_uid: str, config_path: Optional[str] = None):
        """
        Initialize the ingestion task

        Args:
            session_uid: Session ID for this task
            config_path: Path to the configuration file, will be generated if None
        """
        self.session_uid = session_uid
        self.logs: List[str] = []
        self.is_completed = False
        self.process = None
        self.is_running_flag = threading.Event()

        # Default config path if not provided
        if config_path is None:
            config_path = f"uploaded_files/{session_uid}/config.yml"
        self.config_path = config_path

        # Command to run yourbench - modified to avoid error with uv run
        self.command = ["yourbench", "run", "--config", str(self.config_path)]

        self._add_log("[INFO] Initializing ingestion task")
        self._add_log(f"[INFO] Using configuration file: {self.config_path}")

    def _add_log(self, message: str) -> None:
        """
        Add a log message to the logs list

        Args:
            message: Log message to add
        """
        if message not in self.logs:  # Avoid duplicates
            self.logs.append(message)
            # Force copy of the list to avoid reference problems
            self.logs = self.logs.copy()
            # Log to system logs
            logger.info(f"[{self.session_uid}] {message}")

    def get_logs(self) -> List[str]:
        """
        Get all logs for this task

        Returns:
            List of log messages
        """
        return self.logs.copy()  # Return a copy to avoid reference problems

    def is_task_completed(self) -> bool:
        """
        Check if the task is completed

        Returns:
            True if completed, False otherwise
        """
        return self.is_completed

    def is_running(self) -> bool:
        """
        Check if the process is running

        Returns:
            True if running, False otherwise
        """
        return self.is_running_flag.is_set()

    def stop(self) -> None:
        """
        Stop the process if it's running
        """
        if self.process and self.is_running():
            self._add_log("[INFO] Stopping ingestion process")
            try:
                self.process.terminate()
                # Wait 5 seconds for termination
                self.process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self._add_log("[WARN] Process not responding, forcing termination")
                self.process.kill()
            finally:
                self.is_running_flag.clear()
                self.is_completed = True
                self._add_log("[INFO] Ingestion process stopped")

    def _capture_output(self) -> None:
        """
        Capture and process the output from the yourbench process
        """
        self._add_log("[INFO] Starting output capture")

        try:
            while self.is_running() and self.process:
                line = self.process.stdout.readline()
                if not line:
                    # If no line is read and the process is no longer running
                    if self.process.poll() is not None:
                        self.is_running_flag.clear()
                        break
                    # Otherwise, wait a bit and continue
                    time.sleep(0.1)
                    continue

                # Process the output line
                line = line.strip()
                if line:
                    # Filter and format the line as needed
                    if "ERROR" in line:
                        self._add_log(f"[ERROR] {line}")
                    elif "WARNING" in line:
                        self._add_log(f"[WARN] {line}")
                    else:
                        # Detect completed stages
                        if "Completed stage:" in line:
                            stage = line.split("'")[1] if "'" in line else line
                            self._add_log(f"[SUCCESS] Stage completed: {stage}")
                        else:
                            self._add_log(f"[INFO] {line}")

            # Check exit code once the process is finished
            if self.process:
                exit_code = self.process.poll()
                if exit_code == 0:
                    self._add_log("[SUCCESS] Ingestion process completed successfully")
                else:
                    self._add_log(f"[ERROR] Ingestion process terminated with error code: {exit_code}")
        except Exception as e:
            self._add_log(f"[ERROR] Error during output capture: {str(e)}")
        finally:
            self.is_completed = True
            self.is_running_flag.clear()
            self._add_log("[INFO] Output capture completed")

    def run(self, token: Optional[str] = None) -> None:
        """
        Run the ingestion task

        Args:
            token: Hugging Face token
        """
        try:
            self._add_log("[INFO] Starting ingestion process")

            # Check if the configuration file exists
            if not os.path.exists(self.config_path):
                raise FileNotFoundError(f"Configuration file does not exist: {self.config_path}")

            # Examine the configuration to get information
            try:
                with open(self.config_path, 'r') as f:
                    config_yaml = yaml.safe_load(f)

                # Get source and destination paths
                source_dir = config_yaml.get("pipeline", {}).get("ingestion", {}).get("source_documents_dir", "")
                output_dir = config_yaml.get("pipeline", {}).get("ingestion", {}).get("output_dir", "")

                if source_dir:
                    self._add_log(f"[INFO] Source directory: {source_dir}")
                if output_dir:
                    self._add_log(f"[INFO] Output directory: {output_dir}")

                # List files to process if the directory exists
                if source_dir and os.path.exists(source_dir):
                    files = os.listdir(source_dir)
                    if files:
                        self._add_log(f"[INFO] Files to process: {', '.join(files)}")
                    else:
                        self._add_log("[WARN] No files found in source directory")

            except Exception as e:
                self._add_log(f"[WARN] Unable to read configuration: {str(e)}")

            # Environment preparation
            env = os.environ.copy()

            # Explicitly define environment variables for authentication
            hf_token = os.getenv("HF_TOKEN")
            if hf_token:
                # Explicitly export these variables for yourbench
                env["HF_TOKEN"] = hf_token
                env["HUGGING_FACE_HUB_TOKEN"] = hf_token
                env["HF_ORGANIZATION"] = os.getenv("HF_ORGANIZATION", "yourbench")
                self._add_log("[INFO] Environment variables HF_TOKEN, HUGGING_FACE_HUB_TOKEN and HF_ORGANIZATION exported")

            # In development mode, only simulate ingestion
            if os.environ.get("DEVELOPMENT_MODE", "").lower() == "true":
                self._add_log("[INFO] Development mode enabled, simulating ingestion")
                self._simulate_ingestion_process()
                return

            # Start the process
            self._add_log(f"[INFO] Executing command: {' '.join(self.command)}")

            self.process = subprocess.Popen(
                self.command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
                universal_newlines=True,
                env=env
            )

            # Mark the process as running
            self.is_running_flag.set()

            # Start a thread to capture output
            output_thread = threading.Thread(target=self._capture_output)
            output_thread.daemon = True
            output_thread.start()

            self._add_log(f"[INFO] Process started with PID: {self.process.pid}")

        except Exception as e:
            self._add_log(f"[ERROR] Error starting ingestion process: {str(e)}")
            self.is_completed = True

    def _simulate_ingestion_process(self) -> None:
        """
        Simulate the ingestion process for testing/development
        This will be removed in production
        """
        # This method is just to simulate logs during development
        # It will be removed in production

        threading.Thread(target=self._simulate_logs).start()

    def _simulate_logs(self) -> None:
        """
        Simulate logs for testing/development
        This will be used when yourbench isn't installed or in development mode
        """
        # Log simulation (used when yourbench is not available)
        self._add_log("[INFO] Simulation mode enabled (yourbench is not actually running)")

        # Get filenames from source directory
        source_files = []
        try:
            with open(self.config_path, 'r') as f:
                config_yaml = yaml.safe_load(f)

            source_dir = config_yaml.get("pipeline", {}).get("ingestion", {}).get("source_documents_dir", "")
            if source_dir and os.path.exists(source_dir):
                source_files = [f for f in os.listdir(source_dir)
                                if os.path.isfile(os.path.join(source_dir, f))]
        except Exception:
            source_files = ["document.pdf", "document.txt"]  # Fallback

        # Create output directory if it doesn't exist
        output_dir = ""
        try:
            output_dir = config_yaml.get("pipeline", {}).get("ingestion", {}).get("output_dir", "")
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
        except Exception:
            pass

        # Simulate file processing
        time.sleep(1)
        self._add_log("[INFO] Initializing document ingestion")
        time.sleep(1.5)
        self._add_log("[INFO] Loading configuration parameters")
        time.sleep(1)
        self._add_log("[INFO] Verifying source files")

        # Process each file
        for file in source_files:
            time.sleep(1.5)
            self._add_log(f"[INFO] Processing file: {file}")
            time.sleep(2)
            self._add_log(f"[INFO] Extracting content from {file}")
            time.sleep(1.5)
            self._add_log(f"[INFO] Converting to markdown: {file}")

            # Create a simulated markdown file if an output directory is defined
            if output_dir:
                base_name = os.path.splitext(file)[0]
                output_file = os.path.join(output_dir, f"{base_name}.md")
                try:
                    with open(output_file, 'w') as f:
                        f.write(f"# {base_name}\n\n")
                        f.write("This is a markdown document automatically generated by the simulation.\n\n")
                        f.write("## Section 1\n\n")
                        f.write("Content of section 1...\n\n")
                        f.write("## Section 2\n\n")
                        f.write("Content of section 2...\n\n")
                    self._add_log(f"[INFO] Markdown file created: {output_file}")
                except Exception as e:
                    self._add_log(f"[ERROR] Error creating markdown file: {str(e)}")

        time.sleep(2)
        self._add_log("[INFO] Finalizing processing")
        time.sleep(1)
        self._add_log("[SUCCESS] Stage completed: ingestion")
        time.sleep(0.5)
        self._add_log("[SUCCESS] Ingestion completed successfully")

        # Mark task as completed
        self.is_completed = True
|
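For context, a rough sketch of how this ingestion task could be exercised in development mode. The class name CreateBenchTask and its constructor arguments are assumptions here, since the top of backend/tasks/createBench.py is not shown in this excerpt:

# Hypothetical driver; the class name and constructor signature are assumed, not confirmed by this diff.
import os
import time

os.environ["DEVELOPMENT_MODE"] = "true"  # run() then calls _simulate_ingestion_process()

task = CreateBenchTask(session_uid="demo-session",
                       config_path="uploaded_files/demo-session/config.yml")
task.run()  # starts the (simulated) ingestion in a background thread

while not task.is_completed:
    time.sleep(1)
print("\n".join(task.get_logs()))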
backend/tasks/createBenchConfigFile.py
ADDED
@@ -0,0 +1,313 @@
"""
Task to create and save the configuration file
"""
import os
import pathlib
import uuid
import yaml
import shutil
import time
import threading
from typing import Optional, Dict, Any, List, Tuple

from loguru import logger
from huggingface_hub import HfApi


class CreateBenchConfigTask:
    """
    Task to create and save a configuration file for YourbenchSimpleDemo
    """

    def __init__(self, session_uid: Optional[str] = None):
        """
        Initialize the task with a session ID

        Args:
            session_uid: Optional session ID, will be generated if None
        """
        self.session_uid = session_uid or str(uuid.uuid4())
        self.logs: List[str] = []
        self.is_completed = False
        self.is_running_flag = threading.Event()
        self.thread = None
        self._add_log("[INFO] Initializing configuration creation task")

    def _add_log(self, message: str) -> None:
        """
        Add a log message to the logs list

        Args:
            message: Log message to add
        """
        if message not in self.logs:  # Avoid duplicates
            self.logs.append(message)
            # Force a copy of the list to avoid reference issues
            self.logs = self.logs.copy()
            # Log to system logs
            logger.info(f"[{self.session_uid}] {message}")

    def get_logs(self) -> List[str]:
        """
        Get all logs for this task

        Returns:
            List of log messages
        """
        return self.logs.copy()  # Return a copy to avoid reference issues

    def save_uploaded_file(self, file_path: str) -> str:
        """
        Process the uploaded file that is already in the correct directory

        Args:
            file_path: Path to the uploaded file

        Returns:
            Path to the file (same as input)
        """
        try:
            # The file is already in the correct location: uploaded_files/{session_id}/uploaded_files/
            # Just log that we're processing it and return the path
            self._add_log(f"[INFO] Processing file: {os.path.basename(file_path)}")
            return file_path
        except Exception as e:
            error_msg = f"Error processing file: {str(e)}"
            self._add_log(f"[ERROR] {error_msg}")
            raise RuntimeError(error_msg)

    def generate_base_config(self, hf_org: str, hf_dataset_name: str) -> Dict[str, Any]:
        """
        Create the base configuration dictionary

        Args:
            hf_org: Hugging Face organization name
            hf_dataset_name: Hugging Face dataset name

        Returns:
            Configuration dictionary
        """
        self._add_log(f"[INFO] Generating base configuration for {hf_dataset_name}")

        # Check if HF token is available
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise RuntimeError("HF_TOKEN environment variable is not defined")

        return {
            "hf_configuration": {
                "token": "$HF_TOKEN",  # Use the token from the environment directly
                "hf_organization": "$HF_ORGANIZATION",
                "private": True,
                "hf_dataset_name": hf_dataset_name,
                "concat_if_exist": False,
            },
            "model_list": [
                {
                    "model_name": "Qwen/Qwen2.5-VL-72B-Instruct",
                    "provider": "novita",
                    "api_key": "$HF_TOKEN",
                    "max_concurrent_requests": 32,
                },
                {
                    "model_name": "Qwen/Qwen2.5-72B-Instruct",
                    "provider": "novita",
                    "api_key": "$HF_TOKEN",
                    "max_concurrent_requests": 32,
                },
            ],

            "model_roles": {
                "ingestion": ["Qwen/Qwen2.5-VL-72B-Instruct"],
                "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
                "chunking": ["intfloat/multilingual-e5-large-instruct"],
                "single_shot_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
                "multi_hop_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
            },
            "pipeline": {
                "ingestion": {
                    "source_documents_dir": f"uploaded_files/{self.session_uid}/uploaded_files/",
                    "output_dir": f"uploaded_files/{self.session_uid}/ingested",
                    "run": True,
                },
                "upload_ingest_to_hub": {
                    "source_documents_dir": f"uploaded_files/{self.session_uid}/ingested",
                    "run": True,  # Re-enabled for upload to the Hub
                },
                "summarization": {
                    "run": True,
                },
                "chunking": {
                    "run": True,
                    "chunking_configuration": {
                        "l_min_tokens": 64,
                        "l_max_tokens": 128,
                        "tau_threshold": 0.8,
                        "h_min": 2,
                        "h_max": 5,
                        "num_multihops_factor": 2,
                    },
                },
                "single_shot_question_generation": {
                    "run": True,
                    "additional_instructions": "Generate questions to test a curious adult",
                    "chunk_sampling": {
                        "mode": "count",
                        "value": 5,
                        "random_seed": 123,
                    },
                },
                "multi_hop_question_generation": {
                    "run": True,
                    "additional_instructions": "Generate questions to test a curious adult",
                    "chunk_sampling": {
                        "mode": "percentage",
                        "value": 0.3,
                        "random_seed": 42,
                    },
                },
                "lighteval": {
                    "run": True,
                },
            },
        }

    def save_yaml_file(self, config: Dict[str, Any], path: str) -> str:
        """
        Save the given configuration dictionary to a YAML file

        Args:
            config: Configuration dictionary
            path: Path to save the file

        Returns:
            Path to the saved file
        """
        try:
            # Create directory if it doesn't exist
            os.makedirs(os.path.dirname(path), exist_ok=True)

            with open(path, "w") as file:
                yaml.dump(config, file, default_flow_style=False, sort_keys=False)

            self._add_log(f"[INFO] Configuration saved: {path}")
            return path
        except Exception as e:
            error_msg = f"Error saving configuration: {str(e)}"
            self._add_log(f"[ERROR] {error_msg}")
            raise RuntimeError(error_msg)

    def _run_task(self, file_path: str) -> str:
        """
        Internal method to run the task in a separate thread

        Args:
            file_path: Path to the uploaded file

        Returns:
            Path to the configuration file
        """
        try:
            # Use the default yourbench organization
            org_name = os.getenv("HF_ORGANIZATION")

            # Check if HF token is available
            hf_token = os.getenv("HF_TOKEN")
            if not hf_token:
                raise RuntimeError("HF_TOKEN environment variable is not defined")

            self._add_log(f"[INFO] Organization: {org_name}")

            time.sleep(0.5)  # Simulate delay

            # Save the uploaded file
            saved_file_path = self.save_uploaded_file(file_path)

            time.sleep(1)  # Simulate delay

            # Path for the config file
            config_dir = pathlib.Path(f"uploaded_files/{self.session_uid}")
            config_path = config_dir / "config.yml"

            # Generate dataset name based on session ID
            dataset_name = f"yourbench_{self.session_uid}"
            self._add_log(f"[INFO] Dataset name: {dataset_name}")

            time.sleep(0.8)  # Simulate delay

            # Generate and save the configuration
            config = self.generate_base_config(org_name, dataset_name)

            time.sleep(1.2)  # Simulate delay

            config_file_path = self.save_yaml_file(config, str(config_path))

            self._add_log(f"[INFO] Configuration generated successfully: {config_file_path}")

            # Simulate additional processing
            time.sleep(1.5)  # Simulate delay
            self._add_log("[INFO] Starting ingestion")

            time.sleep(2)  # Simulate delay
            self._add_log(f"[INFO] Processing file: {dataset_name}")

            time.sleep(2)  # Simulate delay
            self._add_log("[SUCCESS] Stage completed: config_generation")

            # Task completed
            self.mark_task_completed()

            return str(config_path)
        except Exception as e:
            error_msg = f"Error generating configuration: {str(e)}"
            self._add_log(f"[ERROR] {error_msg}")
            self.mark_task_completed()
            raise RuntimeError(error_msg)

    def run(self, file_path: str, token: Optional[str] = None) -> str:
        """
        Run the task to create and save the configuration file asynchronously

        Args:
            file_path: Path to the uploaded file
            token: Hugging Face token (not used, using HF_TOKEN from environment)

        Returns:
            Path to the configuration file
        """
        # Mark the task as running
        self.is_running_flag.set()

        # Start the task in a separate thread
        self.thread = threading.Thread(target=self._run_task, args=(file_path,))
        self.thread.daemon = True
        self.thread.start()

        # Return the expected config path
        return f"uploaded_files/{self.session_uid}/config.yml"

    def is_running(self) -> bool:
        """
        Check if the task is running

        Returns:
            True if running, False otherwise
        """
        return self.is_running_flag.is_set() and not self.is_completed

    def is_task_completed(self) -> bool:
        """
        Check if the task is completed

        Returns:
            True if completed, False otherwise
        """
        return self.is_completed

    def mark_task_completed(self) -> None:
        """
        Mark the task as completed
        """
        self.is_completed = True
        self.is_running_flag.clear()
        self._add_log("[INFO] Configuration generation task completed")
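A minimal usage sketch for this task; the session id and file path are illustrative, not taken from the repository:

# Illustrative driver for CreateBenchConfigTask; names below are placeholders.
import time
from tasks.createBenchConfigFile import CreateBenchConfigTask

task = CreateBenchConfigTask(session_uid="demo-session")
config_path = task.run("uploaded_files/demo-session/uploaded_files/document.pdf")

# run() returns immediately; the YAML is written by a daemon thread.
while task.is_running():
    time.sleep(0.5)

print("Configuration written to", config_path)  # uploaded_files/demo-session/config.yml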
backend/tasks/evaluationTask.py
ADDED
@@ -0,0 +1,471 @@
"""
Task to evaluate models on a YourBench dataset using LightEval
"""
import os
import sys
import json
import time
import tempfile
import asyncio
import threading
from pathlib import Path
from typing import Optional, List, Dict, Any, Tuple

from loguru import logger
from huggingface_hub import HfApi, CommitOperationAdd

from tasks.yourbench_lighteval_task import create_yourbench_task


class EvaluationTask:
    """
    Task to evaluate models using LightEval on a YourBench dataset
    """

    def __init__(self, session_uid: str, dataset_name: str):
        """
        Initialize the evaluation task

        Args:
            session_uid: Session ID for this task
            dataset_name: Name of the dataset to evaluate
        """
        self.session_uid = session_uid
        self.dataset_name = dataset_name
        self.logs: List[str] = []
        self.is_completed = False
        self.organization = os.getenv("HF_ORGANIZATION", "yourbench")
        self.results: Dict[str, Any] = {}
        self.output_dir = f"uploaded_files/{session_uid}/lighteval_results"

        # Models to evaluate - can be modified to allow customization
        self.models = [
            ("Qwen/Qwen2.5-72B-Instruct", "novita"),
            ("Qwen/QwQ-32B", "novita"),
        ]

        self._add_log("[INFO] Initializing evaluation task")
        self._add_log(f"[INFO] Dataset to evaluate: {self.organization}/{dataset_name}")
        self._add_log(f"[INFO] Output directory: {self.output_dir}")

    def _add_log(self, message: str) -> None:
        """
        Add a log message to the logs list

        Args:
            message: Log message to add
        """
        if message not in self.logs:  # Avoid duplicates
            self.logs.append(message)
            # Force copy of the list to avoid reference problems
            self.logs = self.logs.copy()
            # Record in system logs
            logger.info(f"[{self.session_uid}] {message}")

    def get_logs(self) -> List[str]:
        """
        Get all logs for this task

        Returns:
            List of log messages
        """
        return self.logs.copy()  # Return a copy to avoid reference issues

    def is_task_completed(self) -> bool:
        """
        Check if the task is completed

        Returns:
            True if completed, False otherwise
        """
        return self.is_completed

    async def _evaluate_model(self, model_info: Tuple[str, str]) -> Dict[str, Any]:
        """
        Evaluate a specific model

        Args:
            model_info: Tuple of (model_name, provider)

        Returns:
            Dictionary with evaluation results
        """
        model_name, provider = model_info
        self._add_log(f"[INFO] Starting evaluation for {model_name} with {provider}")

        # Create output directory
        os.makedirs(self.output_dir, exist_ok=True)

        # Define full dataset path
        dataset_path = f"{self.organization}/{self.dataset_name}"

        # Create temporary file
        temp_file_path = tempfile.mktemp(suffix=".py")
        self._add_log(f"[INFO] Creating temporary file for {model_name}: {temp_file_path}")

        with open(temp_file_path, 'w') as temp_file:
            temp_file.write(f"""
import os
import sys
sys.path.append("{os.getcwd()}")

from tasks.yourbench_lighteval_task import create_yourbench_task

# Create yourbench task
yourbench = create_yourbench_task("{dataset_path}", "lighteval")

# Define TASKS_TABLE needed by lighteval
TASKS_TABLE = [yourbench]
""")

        # Build lighteval command args
        cmd_args = [
            "lighteval",
            "endpoint",
            "inference-providers",
            f"model={model_name},provider={provider}",
            "custom|yourbench|0|0",
            "--custom-tasks",
            temp_file_path,
            "--max-samples", "5",
            "--output-dir", self.output_dir,
            "--save-details",
            "--no-push-to-hub"
        ]

        self._add_log(f"[INFO] Running command for {model_name}: {' '.join(cmd_args)}")

        results = {
            "model_name": model_name,
            "provider": provider,
            "success": False,
            "error": None,
            "results": None,
            "return_code": None
        }

        try:
            # Prepare environment with needed tokens
            env = os.environ.copy()
            hf_token = os.getenv("HF_TOKEN")
            if hf_token:
                env["HF_TOKEN"] = hf_token
                env["HUGGING_FACE_HUB_TOKEN"] = hf_token
                env["HF_ORGANIZATION"] = self.organization

            # Run the process asynchronously
            process = await asyncio.create_subprocess_exec(
                *cmd_args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                env=env
            )

            # Wait for the process to complete
            stdout, stderr = await process.communicate()

            # Store return code
            exit_code = process.returncode
            results["return_code"] = exit_code

            # Log output
            if stdout:
                stdout_lines = stdout.decode().strip().split('\n')
                for line in stdout_lines[:5]:  # Log only first 5 lines
                    self._add_log(f"[INFO] {model_name} - {line}")

            # Log errors if any
            if stderr and exit_code != 0:
                stderr_lines = stderr.decode().strip().split('\n')
                for line in stderr_lines[:5]:  # Log only first 5 lines
                    self._add_log(f"[ERROR] {model_name} - {line}")

            # Find any JSON result files - LightEval organizes by model name in different ways
            result_files = []
            results_dir = Path(self.output_dir) / "results"
            if results_dir.exists():
                # Recursively walk all directories to find JSON files
                for json_file in results_dir.glob("**/*.json"):
                    # Check if the filename or path contains parts of the model name
                    model_parts = [
                        model_name,  # Full name
                        model_name.replace('/', '_'),  # Name with / replaced by _
                        model_name.split('/')[-1]  # Just the model name without the organization
                    ]

                    if any(part in str(json_file) for part in model_parts):
                        result_files.append(json_file)

            # Process the result files that were found
            if result_files:
                # Take the most recent file
                result_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
                latest_result = result_files[0]
                self._add_log(f"[INFO] {model_name} - Found result file: {latest_result}")

                try:
                    with open(latest_result, 'r') as f:
                        test_results = json.load(f)

                    # Check whether the results contain the essential information
                    if (test_results and
                        isinstance(test_results, dict) and
                        "results" in test_results and
                        "all" in test_results["results"]):

                        # Store the results
                        results["results"] = test_results
                        results["success"] = True

                        # Report the accuracy
                        accuracy = test_results["results"]["all"]["accuracy"]
                        accuracy_stderr = test_results["results"]["all"]["accuracy_stderr"]
                        self._add_log(f"[SUCCESS] {model_name} - Accuracy: {accuracy:.4f} ± {accuracy_stderr:.4f}")
                    else:
                        results["error"] = "Incomplete or unexpected result format"
                        self._add_log(f"[WARNING] {model_name} - Unexpected result format")

                except (json.JSONDecodeError, KeyError) as e:
                    results["error"] = f"Error reading results: {str(e)}"
                    self._add_log(f"[ERROR] {model_name} - {results['error']}")

            # If no results were found
            if not results["success"]:
                if exit_code == 0:
                    results["error"] = "Execution completed without error but no results found"
                    self._add_log(f"[WARNING] {model_name} - {results['error']}")
                else:
                    results["error"] = f"Execution error (code: {exit_code})"
                    self._add_log(f"[ERROR] {model_name} - {results['error']}")

        except Exception as e:
            results["error"] = f"Exception: {str(e)}"
            self._add_log(f"[ERROR] Exception during evaluation of {model_name}: {str(e)}")
        finally:
            # Delete temporary file
            try:
                os.unlink(temp_file_path)
            except:
                pass

        return results

    async def _run_evaluations(self) -> List[Dict[str, Any]]:
        """
        Run evaluations for all models

        Returns:
            List of evaluation results
        """
        self._add_log(f"[INFO] Starting evaluations for {len(self.models)} models")

        # Create tasks for each model
        tasks = [self._evaluate_model(model) for model in self.models]

        # Run all tasks concurrently and gather results
        model_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Process results
        results = []
        for i, result in enumerate(model_results):
            if isinstance(result, Exception):
                # Handle exception
                model_name, provider = self.models[i]
                self._add_log(f"[ERROR] Evaluation failed for {model_name}: {str(result)}")
                results.append({
                    "model_name": model_name,
                    "provider": provider,
                    "success": False,
                    "error": str(result),
                    "results": None,
                    "return_code": None
                })
            else:
                # Valid result
                results.append(result)

        return results

    def _format_comparison_results(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Format results for easy comparison between models

        Args:
            results: List of evaluation results

        Returns:
            Dictionary with formatted comparison results
        """
        comparison = {
            "metadata": {
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "dataset": f"{self.organization}/{self.dataset_name}",
                "total_models_tested": len(results),
                "successful_tests": len([r for r in results if r["success"]])
            },
            "models_comparison": []
        }

        # Lists of successful and failed models
        successful_models = [r for r in results if r["success"]]
        failed_models = [r for r in results if not r["success"]]

        # Sort successful models by accuracy (most accurate first)
        if successful_models:
            sorted_successful = sorted(
                successful_models,
                key=lambda x: x["results"]["results"]["all"]["accuracy"],
                reverse=True  # Highest to lowest
            )
        else:
            sorted_successful = []

        # Sort failed models by name
        sorted_failed = sorted(failed_models, key=lambda x: x["model_name"])

        # Concatenate: successes first, then failures
        sorted_results = sorted_successful + sorted_failed

        # Create an entry for each model
        for result in sorted_results:
            model_result = {
                "model_name": result["model_name"],
                "provider": result["provider"],
                "success": result["success"]
            }

            if result["success"]:
                # Add the accuracy metrics and execution time
                model_result.update({
                    "accuracy": result["results"]["results"]["all"]["accuracy"],
                    "accuracy_stderr": result["results"]["results"]["all"]["accuracy_stderr"],
                    "evaluation_time": float(result["results"]["config_general"]["total_evaluation_time_secondes"])
                })
            else:
                # Add the error
                model_result["error"] = result.get("error", "Unknown reason")

            comparison["models_comparison"].append(model_result)

        return comparison

    async def _upload_results_to_dataset(self, comparison_results: Dict[str, Any]) -> bool:
        """
        Upload evaluation results to the HuggingFace dataset

        Args:
            comparison_results: The formatted comparison results

        Returns:
            bool: True if upload succeeded, False otherwise
        """
        try:
            # Create a timestamp for the results file
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            result_filename = "lighteval_results.json"

            # Create temporary file for upload
            temp_file_path = tempfile.mktemp(suffix=".json")
            with open(temp_file_path, 'w') as f:
                json.dump(comparison_results, f, indent=2)

            # Initialize HF API
            hf_token = os.getenv("HF_TOKEN")
            if not hf_token:
                self._add_log("[ERROR] HF_TOKEN not found, cannot upload results to dataset")
                return False

            api = HfApi(token=hf_token)
            dataset_id = f"{self.organization}/{self.dataset_name}"

            # Prepare the file operation
            operation = CommitOperationAdd(
                path_in_repo=f"lighteval_results/{result_filename}",
                path_or_fileobj=temp_file_path
            )

            # Upload the file
            self._add_log(f"[INFO] Uploading results to dataset {dataset_id}")
            api.create_commit(
                repo_id=dataset_id,
                repo_type="dataset",
                operations=[operation],
                commit_message=f"Add evaluation results from {timestamp}"
            )

            # Cleanup temporary file
            os.unlink(temp_file_path)

            self._add_log(f"[SUCCESS] Results uploaded to dataset {dataset_id} at lighteval_results/{result_filename}")
            return True

        except Exception as e:
            self._add_log(f"[ERROR] Failed to upload results to dataset: {str(e)}")
            return False

    async def _process_evaluation_results(self, results: List[Dict[str, Any]]) -> None:
        """
        Process evaluation results, create summaries and save files

        Args:
            results: List of evaluation results
        """
        if results:
            try:
                # Save detailed results
                detailed_output_file = f"{self.output_dir}/detailed_results.json"
                os.makedirs(os.path.dirname(detailed_output_file), exist_ok=True)
                with open(detailed_output_file, 'w') as f:
                    json.dump(results, f, indent=2)
                self._add_log(f"[INFO] Detailed results saved in {detailed_output_file}")

                # Generate and save comparison results
                comparison = self._format_comparison_results(results)
                comparison_file = f"{self.output_dir}/models_comparison.json"
                with open(comparison_file, 'w') as f:
                    json.dump(comparison, f, indent=2)
                self._add_log(f"[INFO] Models comparison saved in {comparison_file}")

                # Upload results to the dataset
                await self._upload_results_to_dataset(comparison)

                # Store results for later access
                self.results = comparison
                self._add_log("[SUCCESS] Evaluation completed")
            except Exception as e:
                self._add_log(f"[ERROR] Error during evaluation execution: {str(e)}")
            finally:
                self.is_completed = True

    def _async_run(self) -> None:
        """
        Run the evaluation asynchronously
        """
        async def run_async():
            try:
                # Run evaluations
                results = await self._run_evaluations()

                # Process evaluation results
                await self._process_evaluation_results(results)
            except Exception as e:
                self._add_log(f"[ERROR] Error during evaluation execution: {str(e)}")
            finally:
                self.is_completed = True

        # Create and run the asyncio event loop
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(run_async())
        loop.close()

    def run(self) -> None:
        """
        Run the evaluation task in a separate thread
        """
        self._add_log("[INFO] Starting evaluation")

        # Run in a separate thread to not block the main thread
        thread = threading.Thread(target=self._async_run)
        thread.daemon = True
        thread.start()
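A minimal sketch of how this task might be driven outside the API routes; the session id and dataset name are illustrative, and the full flow is exercised by backend/tests/test_evaluation.py later in this commit:

# Illustrative driver for EvaluationTask; names below are placeholders.
import time
from tasks.evaluationTask import EvaluationTask

task = EvaluationTask(session_uid="demo-session", dataset_name="yourbench_demo-session")
task.models = [("Qwen/Qwen2.5-72B-Instruct", "novita")]  # optionally restrict the model list
task.run()  # returns immediately; the work happens in a daemon thread

while not task.is_task_completed():
    time.sleep(2)

# Formatted comparison, also uploaded to the dataset under lighteval_results/
print(task.results.get("models_comparison", []))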
backend/tasks/yourbench_lighteval_task.py
ADDED
@@ -0,0 +1,273 @@
#!/usr/bin/env python3
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
import re

import numpy as np
from aenum import extend_enum

from lighteval.metrics.metrics import Metrics
from lighteval.metrics.metrics_sample import JudgeLLM
from lighteval.metrics.utils.metric_utils import (
    CorpusLevelMetricGrouping,
    MetricCategory,
    MetricUseCase,
)
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


logger = logging.getLogger(__name__)

JUDGE_ANSWER_SYSTEM_PROMPT = """You will be provided with the summary of a document, a piece of text, a question generated from that text, and the correct or "gold" answer to the question. Additionally, you will receive a model answer. Your task is to determine whether the model answer is correct using the provided "gold" answer as a reference.
# Steps
1. **Document Understanding**:
   - Analyze the provided document summary to grasp the context and main themes.
2. **Chunk Understanding**:
   - Examine the provided text (chunk) to understand its content.
3. **Question Understanding**:
   - Interpret the given question to fully comprehend what is being asked.
4. **Ground Truth Answer Understanding**:
   - Understand the provided ground truth answer, identifying its key points.
5. **Answer Understanding**:
   - Examine the Model Answer, identifying key points and assessing accuracy and factuality.
6. **Final Answer**:
   - 0 or 1 (0 if the model answer is incorrect, 1 if it is correct).
# Output Format
- Provide your final evaluation of whether the answer is correct within `<final_answer>` XML tags.
- Include a detailed analysis for each part within the designated XML tags: `<document_understanding>`, `<chunk_understanding>`, `<question_understanding>`, `<ground_truth_answer_understanding>`, `<model_answer_understanding>`, and `<final_answer>`.
# Examples
**Input**:
```xml
<document_summary>
[Summary]
</document_summary>
<piece_of_text>
[Text]
</piece_of_text>
<question>
[Question]
</question>
<gold_answer>
[Gold Answer]
</gold_answer>
<model_answer>
[Model Answer]
</model_answer>
```
**Output**:
```xml
<document_understanding>
Understanding of the summary including key themes
</document_understanding>
<chunk_understanding>
Analysis of the piece of text
</chunk_understanding>
<question_understanding>
Comprehension of the question being asked
</question_understanding>
<ground_truth_answer_understanding>
Key points from the gold answer
</ground_truth_answer_understanding>
<model_answer_understanding>
Key points and accuracy of Answer A
</model_answer_understanding>
<final_answer>
1 or 0 (1 if the model answer is correct, 0 if it is incorrect)
</final_answer>
```
# Notes
- Always focus on key points and factual correctness as per the ground truth.
- Avoid any biases and rely solely on the evidence presented.
- Enclose all evaluations and analyses in the specified XML tags for clarity and structure."""


JUDGE_ANSWER_USER_PROMPT = """<document_summary>
{summary}
</document_summary>
<piece_of_text>
{chunk}
</piece_of_text>
<question>
{question}
</question>
<gold_answer>
{oracle_answer}
</gold_answer>
<model_answer>
{model_answer}
</model_answer>"""


def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
    chunk = kwargs.get("chunks", "")
    summary = kwargs.get("documents", "")

    return [
        {"role": "system", "content": JUDGE_ANSWER_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": JUDGE_ANSWER_USER_PROMPT.format(
                summary=summary, chunk=chunk, question=question, oracle_answer=gold, model_answer=answer
            ),
        },
    ]


def process_judge_response_yourbench(response):
    # Extract the final answer from the response XML using a regex
    try:
        # Try the XML format first
        match = re.search(r"<final_answer>(.*?)</final_answer>", response, re.DOTALL)
        if match:
            answer_text = match.group(1).strip()
            # Convert the various possible formats to 0 or 1
            if answer_text in ["1", "correct", "true", "yes", "True", "TRUE"]:
                return 1
            elif answer_text in ["0", "incorrect", "false", "no", "False", "FALSE"]:
                return 0
            # Try converting directly to a number
            try:
                value = int(answer_text)
                return 1 if value > 0 else 0
            except ValueError:
                pass

        # Look for keywords in the response
        if re.search(r"\b(correct|vrai|true|yes)\b", response, re.IGNORECASE):
            return 1
        if re.search(r"\b(incorrect|faux|false|no)\b", response, re.IGNORECASE):
            return 0

        logger.warning(f"Unrecognized judge response, returning 0 by default: {response[:100]}...")
    except Exception as e:
        logger.error(f"Error processing judge response: {e}")
    return 0


class JudgeLLMYourBench(JudgeLLM):
    def __init__(self):
        super().__init__(
            judge_model_name="gpt-4o-2024-08-06",
            template=get_judge_prompt,
            process_judge_response=process_judge_response_yourbench,
            judge_backend="openai",
            short_judge_name="yourbench_judge",
        )

    def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
        # If we are evaluating a multiturn task, we need to have specific field in the formatted doc
        questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]
        golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs]
        predictions = [response[0].result[0] for response in responses]
        options = [None] * len(questions)
        chunks = [formatted_doc.specific["chunks"][0] for formatted_doc in formatted_docs]
        documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]

        score, _, _ = self.judge.evaluate_answer_batch(
            questions, predictions, options, golds, chunks=chunks, documents=documents
        )

        metrics = []
        for i in range(len(sample_ids)):
            metrics.append(
                {
                    "accuracy": score[i],
                }
            )

        return metrics


ZEROSHOT_QA_USER_PROMPT = """Answer the following question:
<question>
{question}
</question>
Enclose your full answer in <answer> XML tags. For example:
<answer>
[your answer here]
</answer>"""


def yourbench_prompt(line, task_name: str = ""):
    return Doc(
        task_name=task_name,
        query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"]),
        choices=[line["ground_truth_answer"]],
        gold_index=0,
        specific={
            "question_category": line["question_category"],
            "kind": line["kind"],
            "estimated_difficulty": line["estimated_difficulty"],
            "document_id": line["document_id"],
            "question_generating_model": line["question_generating_model"],
            "chunks": line["chunks"],
            "question": line["question"],
            "document": line["document"],
        },
    )


def create_yourbench_task(hf_dataset_name, subset="lighteval_single_shot_questions"):
    """
    Create a custom yourbench task for lighteval.

    Args:
        hf_dataset_name: Name of the dataset on the HF Hub (format: "org/name")
        subset: Name of the subset to use

    Returns:
        LightevalTaskConfig: Configuration of the yourbench task
    """
    yourbench_metrics = CorpusLevelMetricGrouping(
        metric_name=["accuracy"],
        higher_is_better={"accuracy": True},
        category=MetricCategory.LLM_AS_JUDGE,
        use_case=MetricUseCase.ACCURACY,
        sample_level_fn=JudgeLLMYourBench().compute,
        corpus_level_fn={"accuracy": np.mean},
    )

    try:
        extend_enum(Metrics, "accuracy", yourbench_metrics)
    except Exception:
        # The enum may already have been extended; ignore the error
        pass

    return LightevalTaskConfig(
        name="yourbench",
        suite=["custom"],
        prompt_function=yourbench_prompt,
        hf_repo=hf_dataset_name,
        hf_subset=subset,
        hf_avail_splits=["train"],
        evaluation_splits=["train"],
        few_shots_split=None,
        few_shots_select=None,
        generation_size=8192,
        metric=[Metrics.accuracy],
        stop_sequence=[],
        trust_dataset=True,
        version=0,
    )
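For reference, a --custom-tasks module for LightEval only needs to expose a module-level TASKS_TABLE. A minimal sketch, mirroring the temporary file that EvaluationTask writes above (the dataset id is illustrative):

# Sketch of a custom-tasks module; the dataset id below is a placeholder.
from tasks.yourbench_lighteval_task import create_yourbench_task

yourbench = create_yourbench_task("yourbench/yourbench_demo-session", "lighteval")

# LightEval discovers custom tasks through this module-level table.
TASKS_TABLE = [yourbench]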
backend/tests/test_evaluation.py
ADDED
@@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
Script to test the evaluation task in standalone mode
"""
import os
import sys
import uuid
import json
import time
import argparse
from dotenv import load_dotenv
from pathlib import Path
import traceback

# Ensure the environment is properly configured
load_dotenv()

# Add the current directory to the path to import modules
sys.path.append(os.getcwd())
from tasks.evaluationTask import EvaluationTask


def setup_environment():
    """
    Configure the environment for testing
    """
    # Check if the HF token is defined
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        print("⚠️ The HF_TOKEN is not defined in the environment or .env file")
        print("   Please define this variable before continuing.")
        sys.exit(1)

    # Set the default organization if not defined
    if not os.getenv("HF_ORGANIZATION"):
        os.environ["HF_ORGANIZATION"] = "yourbench"
        print("ℹ️ The HF_ORGANIZATION variable is not defined, using 'yourbench' as default")


def run_standalone_evaluation(dataset_name, models=None, max_wait_time=3600):
    """
    Run the evaluation task in standalone mode

    Args:
        dataset_name: Name of the dataset to evaluate
        models: List of models to evaluate (optional)
        max_wait_time: Maximum waiting time in seconds
    """
    # Generate a unique session ID
    session_uid = str(uuid.uuid4())
    print(f"🔧 Session ID: {session_uid}")

    # Create the evaluation task instance
    evaluation_task = EvaluationTask(session_uid, dataset_name)

    # If specific models are provided, use them
    if models:
        evaluation_task.models = models
        print(f"🤖 Using custom models: {models}")

    # Display dataset information
    organization = os.getenv("HF_ORGANIZATION", "yourbench")
    print(f"📊 Evaluating dataset: {organization}/{dataset_name}")
    print(f"💾 Results saved in: {evaluation_task.output_dir}")

    # Start the evaluation task
    print("🚀 Starting evaluation...")
    evaluation_task.run()

    # Wait for the task to complete while displaying logs
    start_time = time.time()
    last_log_count = 0

    while not evaluation_task.is_task_completed():
        current_logs = evaluation_task.get_logs()

        # Display only new logs
        if len(current_logs) > last_log_count:
            for log in current_logs[last_log_count:]:
                print(f"  {log}")
            last_log_count = len(current_logs)

        # Check if the maximum time is reached
        elapsed_time = time.time() - start_time
        if elapsed_time > max_wait_time:
            print("⚠️ Maximum waiting time reached, forced stop")
            break

        time.sleep(1)

    # Check if results are available
    results_file = Path(f"{evaluation_task.output_dir}/models_comparison.json")
    if results_file.exists():
        try:
            with open(results_file, 'r') as f:
                results = json.load(f)

            print("\n📈 Evaluation Results:")
            print(f"  Dataset: {results['metadata']['dataset']}")
            print(f"  Models tested: {results['metadata']['total_models_tested']}")
            print(f"  Successful tests: {results['metadata']['successful_tests']}")
            print(f"  Timestamp: {results['metadata']['timestamp']}")

            if results['metadata']['successful_tests'] > 0:
                print("\n📊 Model ranking by accuracy:")
                successful_models = [m for m in results['models_comparison'] if m['success']]
                for i, model in enumerate(successful_models):
                    print(f"  {i+1}. ✅ {model['model_name']} ({model['provider']})")
                    print(f"     Accuracy: {model['accuracy']:.4f} ± {model['accuracy_stderr']:.4f}")
                    print(f"     Evaluation time: {model['evaluation_time']:.2f}s")

            failed_models = [m for m in results['models_comparison'] if not m['success']]
            if failed_models:
                print("\n❌ Unevaluated models:")
                for i, model in enumerate(failed_models):
                    print(f"  {i+1}. {model['model_name']} ({model['provider']})")
                    error_msg = model.get('error', 'Unknown reason')
                    print(f"     Reason: {error_msg}")

            # Check detailed results files
            detailed_file = Path(f"{evaluation_task.output_dir}/detailed_results.json")
            if detailed_file.exists():
                print(f"\n📄 Detailed results available in: {detailed_file}")

            # Check raw files
            raw_results = list(Path(f"{evaluation_task.output_dir}/results").glob("**/*.json"))
            if raw_results:
                print(f"\n📁 {len(raw_results)} raw result files available in: {evaluation_task.output_dir}/results")

            print(f"\n✅ Evaluation completed!")
        except Exception as e:
            print(f"❌ Error reading results: {str(e)}")
            print(f"   Details: {traceback.format_exc()}")
    else:
        print(f"❌ No evaluation results found in {results_file}")


if __name__ == "__main__":
    # Configure the argument parser
    parser = argparse.ArgumentParser(description="Test the evaluation task in standalone mode")
    parser.add_argument("dataset_name", type=str, help="Name of the dataset to evaluate (without the organization)")
    parser.add_argument("--model", action="append", dest="models",
                        help="Model to evaluate in the format 'name/model,provider'. Can be used multiple times.")
    parser.add_argument("--timeout", type=int, default=3600,
                        help="Maximum waiting time in seconds (default: 3600)")

    args = parser.parse_args()

    # Configure the environment
    setup_environment()

    # Transform models into tuples if specified
    models_to_evaluate = None
    if args.models:
        models_to_evaluate = []
        for model_spec in args.models:
            try:
                model_name, provider = model_spec.split(",")
                models_to_evaluate.append((model_name, provider))
            except ValueError:
                print(f"⚠️ Invalid model format: {model_spec}. Use 'name/model,provider'")
                sys.exit(1)

    # Run the evaluation
    run_standalone_evaluation(args.dataset_name, models_to_evaluate, args.timeout)
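Based on the argparse definition above, a typical invocation of this script (assuming it is run from the backend directory with the .env file in place, and with an illustrative dataset name) would look like: python tests/test_evaluation.py yourbench_demo-session --model "Qwen/Qwen2.5-72B-Instruct,novita" --timeout 1800. The dataset name is passed without the organization prefix; the organization is taken from HF_ORGANIZATION.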
backend/tests/test_hf_upload.py
ADDED
@@ -0,0 +1,78 @@
"""
Test script to verify uploads to the Hugging Face Hub.
This script creates a simple dataset and attempts to upload it to the Hub
using the token and organization defined in the environment variables.
"""
import os
import sys
import datasets
from huggingface_hub import HfApi, login
from datasets import Dataset
from dotenv import load_dotenv
from pathlib import Path

def test_hf_upload():
    # Load environment variables from the .env file
    dotenv_path = Path('.env')
    load_dotenv(dotenv_path=dotenv_path)

    # Read the token and organization from the environment
    hf_token = os.getenv("HF_TOKEN")
    org_name = os.getenv("HF_ORGANIZATION", "yourbench")

    if not hf_token:
        print("Error: the HF_TOKEN variable is not defined in the .env file.")
        sys.exit(1)

    dataset_name = "test_dataset_upload"
    repo_id = f"{org_name}/{dataset_name}"

    print(f"Attempting upload to {repo_id} with token {hf_token[:5]}... (token truncated for security)")

    try:
        # Log in to the Hugging Face API
        print("Connecting to the Hugging Face API...")
        login(token=hf_token)
        api = HfApi(token=hf_token)

        # Create a simple dataset
        print("Creating a test dataset...")
        data = {
            "text": ["This is a test", "Another example", "Third example"],
            "label": [1, 0, 1]
        }
        dataset = Dataset.from_dict(data)

        # Check whether the repo already exists and delete it if necessary
        try:
            api.delete_repo(repo_id=repo_id, repo_type="dataset")
            print(f"Existing repo {repo_id} deleted.")
        except Exception:
            print(f"Repo {repo_id} did not exist yet.")

        # Upload the dataset
        print(f"Uploading the dataset to {repo_id}...")
        dataset.push_to_hub(
            repo_id=repo_id,
            token=hf_token,
            private=True,
            commit_message="Dataset upload test"
        )

        print(f"Success! Dataset uploaded to https://huggingface.co/datasets/{repo_id}")
        return True

    except Exception as e:
        print(f"Error during upload: {str(e)}")
        print("\nFull traceback:")
        import traceback
        traceback.print_exc()
        return False

if __name__ == "__main__":
    print("=== Hugging Face Hub upload test ===")
    success = test_hf_upload()
    if success:
        print("\n✅ The test passed! Uploads are working correctly.")
    else:
        print("\n❌ The test failed. Check the errors above.")
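A natural follow-up check, not part of the repository, is to reload the pushed dataset and confirm it round-trips; a minimal sketch, assuming the same `repo_id` and `hf_token` as in `test_hf_upload()`:

```python
# Hypothetical round-trip check for the upload above (illustrative only).
from datasets import load_dataset

def verify_round_trip(repo_id: str, hf_token: str) -> bool:
    # Reload the private dataset that push_to_hub just created.
    reloaded = load_dataset(repo_id, split="train", token=hf_token)
    # The test data has three rows and the columns "text" and "label".
    return len(reloaded) == 3 and set(reloaded.column_names) == {"text", "label"}
```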
backend/tests/test_inference.py
ADDED
@@ -0,0 +1,84 @@
import time
import signal
from huggingface_hub import InferenceClient

# Configuration - models and their providers
MODELS = [
    ("Qwen/Qwen2.5-72B-Instruct", "sambanova"),
    ("meta-llama/Llama-3.3-70B-Instruct", "sambanova"),
    ("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "sambanova"),
    ("Qwen/QwQ-32B", "novita"),
    # ("mistralai/Mistral-Small-24B-Instruct-2501", "novita")
]
QUESTION = "What is the capital of France?"
TIMEOUT = 10  # seconds


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException("Timeout")


def test_model(model, provider):
    client = InferenceClient(provider=provider)

    # Configure the timeout
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(TIMEOUT)

    start_time = time.time()
    try:
        response = client.chat_completion(
            model=model,
            messages=[{"role": "user", "content": QUESTION}]
        )
        result = response.choices[0].message.content
        success = True
    except TimeoutException:
        result = f"TIMEOUT ({TIMEOUT}s)"
        success = False
    except Exception as e:
        result = str(e)
        success = False
    finally:
        # Disable the alarm
        signal.alarm(0)

    execution_time = time.time() - start_time

    status = "✅" if success else "❌"
    print(f"{status} {model} ({provider}) - Time: {execution_time:.2f}s")
    if success:
        print(f"  Response: {result[:80]}..." if len(result) > 80 else f"  Response: {result}")
    else:
        print(f"  Error: {result}")

    return success, execution_time, result


def main():
    print(f"\nTesting {len(MODELS)} models with their specific providers")
    print(f"Question: {QUESTION}")
    print(f"Timeout: {TIMEOUT}s\n")

    results = []
    for model, provider in MODELS:
        success, time_taken, response = test_model(model, provider)
        results.append({
            "model": model,
            "provider": provider,
            "success": success,
            "time": time_taken
        })

    print("\n=== SUMMARY ===")
    for result in results:
        status = "✅" if result["success"] else "❌"
        print(f"{status} {result['model']} ({result['provider']}): {result['time']:.2f}s")


if __name__ == "__main__":
    main()
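One caveat with the script above: `signal.SIGALRM` only exists on Unix and only works in the main thread. A portable alternative, sketched here rather than taken from the repository, is to run the call in a worker thread and bound it with `future.result(timeout=...)`:

```python
# Portable timeout sketch (assumes the same InferenceClient usage as above).
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout

def chat_with_timeout(client, model, question, timeout=10):
    with ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(
            client.chat_completion,
            model=model,
            messages=[{"role": "user", "content": question}],
        )
        try:
            return future.result(timeout=timeout)
        except FuturesTimeout:
            return None  # treat None as a timeout upstream
```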
backend/tests/test_lighteval.py
ADDED
@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Minimal script to test lighteval directly with the yourbench task
"""
import os
import sys
import subprocess
import json
import time
from pathlib import Path
import logging

# Make sure the environment is properly configured
from dotenv import load_dotenv
load_dotenv()

# Import the yourbench task definition module
sys.path.append(os.getcwd())
from tasks.yourbench_lighteval_task import create_yourbench_task

def run_lighteval_test():
    """
    Run a minimal test with lighteval
    """
    # Parameters
    dataset_name = "yourbench_a"
    organization = "yourbench"
    model_name = "Qwen/Qwen2.5-72B-Instruct"
    provider = "novita"
    output_dir = f"uploaded_files/test_{provider}/lighteval_results"

    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # Define the full dataset path
    dataset_path = f"{organization}/{dataset_name}"
    print(f"Dataset to evaluate: {dataset_path}")

    # Create a temporary file
    import tempfile
    temp_file_path = tempfile.mktemp(suffix=".py")
    print(f"Creating temporary file: {temp_file_path}")

    with open(temp_file_path, 'w') as temp_file:
        # Write the file contents
        temp_file.write(f"""
import os
import sys
import logging
sys.path.append("{os.getcwd()}")

from tasks.yourbench_lighteval_task import create_yourbench_task

# Configure logging
logging.basicConfig(level=logging.INFO)

# Create the yourbench task
yourbench = create_yourbench_task("{dataset_path}", "lighteval")

# Define the TASKS_TABLE variable that lighteval needs
TASKS_TABLE = [yourbench]
""")

    # Build the lighteval command
    cmd = [
        "lighteval",
        "endpoint",
        "inference-providers",
        f"model={model_name},provider={provider}",
        "custom|yourbench|0|0",
        "--custom-tasks",
        temp_file_path,
        "--max-samples", "5",  # Only 5 samples
        "--output-dir", output_dir,
        "--save-details",
        "--no-push-to-hub"  # No push, to save time
    ]

    # Display the command
    print(f"Running command: {' '.join(cmd)}")
    print(f"Start time: {time.strftime('%H:%M:%S')}")

    # Run the command
    try:
        # Run with output capture
        result = subprocess.run(cmd, capture_output=True, text=True)

        # Display the results
        print(f"Return code: {result.returncode}")
        print("--- STANDARD OUTPUT ---")
        print(result.stdout)
        print("--- STANDARD ERROR ---")
        print(result.stderr)

        # Check whether any results were generated
        results_dir = Path(output_dir) / "results"
        if results_dir.exists():
            print(f"Results directory created: {results_dir}")
            # List the result files
            result_files = list(results_dir.glob("**/*.json"))
            if result_files:
                print(f"Result files found: {result_files}")
                # Sort the files by modification time to take the most recent one
                result_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
                latest_result = result_files[0]
                print(f"Most recent result file: {latest_result}")

                # Read the result file
                with open(latest_result, 'r') as f:
                    results = json.load(f)
                    print("Result file contents:")
                    print(json.dumps(results, indent=2))

                # Analyze the results
                print("\n==== RESULTS ANALYSIS ====")
                if "results" in results:
                    for task_name, task_results in results["results"].items():
                        print(f"Task: {task_name}")
                        for metric_name, metric_value in task_results.items():
                            print(f"  {metric_name}: {metric_value}")
                else:
                    print("No results found in the JSON file")

                # Check the details
                details_dir = Path(output_dir) / "details"
                if details_dir.exists():
                    print(f"\nDetails directory found: {details_dir}")
                    model_details_dirs = list(details_dir.glob("**/*"))
                    if model_details_dirs:
                        print(f"Per-model details directories: {model_details_dirs}")
            else:
                print("No result files found.")
        else:
            print("No results directory was created.")

    except subprocess.CalledProcessError as e:
        print(f"Error while running the command: {e}")
    except Exception as e:
        print(f"Exception: {e}")
    finally:
        # Delete the temporary file
        try:
            os.unlink(temp_file_path)
            print(f"Temporary file deleted: {temp_file_path}")
        except:
            pass

    print(f"End time: {time.strftime('%H:%M:%S')}")

if __name__ == "__main__":
    run_lighteval_test()
backend/tests/test_openai.py
ADDED
@@ -0,0 +1,31 @@
import os
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

def test_openai_connection():
    try:
        # Initialize OpenAI client
        client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

        # Make a simple request
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": "Say 'Hello World'"}
            ]
        )

        print("✅ OpenAI API connection successful!")
        print(f"Response: {response.choices[0].message.content}")
        return True

    except Exception as e:
        print("❌ OpenAI API connection failed!")
        print(f"Error: {str(e)}")
        return False

if __name__ == "__main__":
    test_openai_connection()
backend/tests/test_parallel_lighteval.py
ADDED
@@ -0,0 +1,278 @@
#!/usr/bin/env python3
"""
Script to run lighteval tests in parallel for multiple models
"""
import os
import sys
import json
import time
import tempfile
import asyncio
from pathlib import Path
from typing import Tuple, List, Dict, Any

# Ensure environment is properly configured
from dotenv import load_dotenv
load_dotenv()

# Import yourbench task module
sys.path.append(os.getcwd())
from tasks.yourbench_lighteval_task import create_yourbench_task

# Define models to test
INIT_MODELS = [
    # 70B
    ("Qwen/Qwen2.5-72B-Instruct", "novita"),
    ("meta-llama/Llama-3.3-70B-Instruct", "novita"),
    ("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "novita"),
    # 20 to 30B
    ("Qwen/QwQ-32B", "novita"),
    # ("mistralai/Mistral-Small-24B-Instruct-2501", "sambanova"),
]

async def run_lighteval_test_for_model(model_info: Tuple[str, str]) -> Dict[str, Any]:
    """
    Run lighteval test for a specific model
    """
    model_name, provider = model_info

    # Parameters
    dataset_name = "yourbench_a"
    organization = "yourbench"
    output_dir = f"uploaded_files/test_parallel_{provider}/lighteval_results"

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Define full dataset path
    dataset_path = f"{organization}/{dataset_name}"
    print(f"Dataset to evaluate for {model_name}: {dataset_path}")

    # Create temporary file
    temp_file_path = tempfile.mktemp(suffix=".py")
    print(f"Creating temporary file for {model_name}: {temp_file_path}")

    with open(temp_file_path, 'w') as temp_file:
        temp_file.write(f"""
import os
import sys
sys.path.append("{os.getcwd()}")

from tasks.yourbench_lighteval_task import create_yourbench_task

# Create yourbench task
yourbench = create_yourbench_task("{dataset_path}", "lighteval")

# Define TASKS_TABLE needed by lighteval
TASKS_TABLE = [yourbench]
""")

    # Build lighteval command args
    cmd_args = [
        "lighteval",
        "endpoint",
        "inference-providers",
        f"model={model_name},provider={provider}",
        "custom|yourbench|0|0",
        "--custom-tasks",
        temp_file_path,
        "--max-samples", "5",
        "--output-dir", output_dir,
        "--save-details",
        "--no-push-to-hub"
    ]

    print(f"Running command for {model_name}: {' '.join(cmd_args)}")
    print(f"Start time for {model_name}: {time.strftime('%H:%M:%S')}")

    results = {
        "model_name": model_name,
        "provider": provider,
        "success": False,
        "error": None,
        "results": None,
        "return_code": None
    }

    try:
        # Prepare environment with needed tokens
        env = os.environ.copy()
        hf_token = os.getenv("HF_TOKEN")
        if hf_token:
            env["HF_TOKEN"] = hf_token
            env["HUGGING_FACE_HUB_TOKEN"] = hf_token
        env["HF_ORGANIZATION"] = organization

        # Run the process asynchronously
        process = await asyncio.create_subprocess_exec(
            *cmd_args,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=env
        )

        # Wait for the process to complete
        stdout, stderr = await process.communicate()

        # Store return code
        exit_code = process.returncode
        results["return_code"] = exit_code

        # Log some output for debugging
        if stdout:
            stdout_lines = stdout.decode().strip().split('\n')
            if stdout_lines and len(stdout_lines) > 0:
                print(f"Output from {model_name}: {stdout_lines[0]}")

        # Check if results were generated
        results_dir = Path(output_dir) / "results"
        if results_dir.exists():
            result_files = list(results_dir.glob("**/*.json"))
            if result_files:
                # Read the first results file
                with open(result_files[0], 'r') as f:
                    test_results = json.load(f)
                    results["results"] = test_results
                    results["success"] = True

    except asyncio.CancelledError:
        results["error"] = "Task cancelled"
        print(f"Task cancelled for {model_name}")
    except Exception as e:
        results["error"] = f"Exception: {str(e)}"
        print(f"Error running test for {model_name}: {str(e)}")
    finally:
        # Delete temporary file
        try:
            os.unlink(temp_file_path)
        except:
            pass

    print(f"End time for {model_name}: {time.strftime('%H:%M:%S')}")
    return results

async def run_parallel_tests(models: List[Tuple[str, str]]) -> List[Dict[str, Any]]:
    """
    Run tests in parallel for multiple models using asyncio
    """
    print(f"Starting parallel tests for {len(models)} models")

    # Create tasks for each model
    tasks = [run_lighteval_test_for_model(model) for model in models]

    # Run all tasks concurrently and gather results
    model_results = await asyncio.gather(*tasks, return_exceptions=True)

    # Process results
    results = []
    for i, result in enumerate(model_results):
        if isinstance(result, Exception):
            # Handle exception
            model_name, provider = models[i]
            print(f"Test failed for {model_name}: {str(result)}")
            results.append({
                "model_name": model_name,
                "provider": provider,
                "success": False,
                "error": str(result),
                "results": None,
                "return_code": None
            })
        else:
            # Valid result
            results.append(result)
            print(f"Test completed for {result['model_name']}")

    return results

def format_comparison_results(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Format results for easy comparison between models
    """
    comparison = {
        "metadata": {
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "total_models_tested": len(results),
            "successful_tests": len([r for r in results if r["success"]])
        },
        "models_comparison": []
    }

    # Sort models by accuracy (if available) or name
    sorted_results = sorted(
        results,
        key=lambda x: (
            x["results"]["results"]["all"]["accuracy"] if x["success"] and x["results"] else -1,
            x["model_name"]
        ),
        reverse=True
    )

    for result in sorted_results:
        model_result = {
            "model_name": result["model_name"],
            "provider": result["provider"],
            "success": result["success"]
        }

        if result["success"] and result["results"]:
            model_result.update({
                "accuracy": result["results"]["results"]["all"]["accuracy"],
                "accuracy_stderr": result["results"]["results"]["all"]["accuracy_stderr"],
                "evaluation_time": float(result["results"]["config_general"]["total_evaluation_time_secondes"])
            })
        else:
            model_result["error"] = result["error"]

        comparison["models_comparison"].append(model_result)

    return comparison

async def main_async():
    """
    Async main function to run parallel tests
    """
    print("Starting parallel lighteval tests")
    start_time = time.time()

    # Run tests in parallel
    results = await run_parallel_tests(INIT_MODELS)

    # Save detailed results
    detailed_output_file = "parallel_test_detailed_results.json"
    with open(detailed_output_file, 'w') as f:
        json.dump(results, f, indent=2)

    # Generate and save comparison results
    comparison = format_comparison_results(results)
    comparison_file = "models_comparison.json"
    with open(comparison_file, 'w') as f:
        json.dump(comparison, f, indent=2)

    # Print summary
    print("\nTest Summary:")
    for model in comparison["models_comparison"]:
        status = "✅" if model["success"] else "❌"
        print(f"{status} {model['model_name']} ({model['provider']})")
        if not model["success"]:
            print(f"  Error: {model['error']}")
        else:
            print(f"  Accuracy: {model['accuracy']:.2%} (±{model['accuracy_stderr']:.2%})")
            print(f"  Evaluation time: {model['evaluation_time']:.2f}s")

    duration = time.time() - start_time
    print(f"\nTotal execution time: {duration:.2f} seconds")
    print(f"Detailed results saved to: {detailed_output_file}")
    print(f"Comparison results saved to: {comparison_file}")

def main():
    """
    Main function to run parallel tests
    """
    # Create event loop and run the async main
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main_async())
    loop.close()

if __name__ == "__main__":
    main()
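Note that `asyncio.get_event_loop()` followed by `run_until_complete()` is deprecated outside a running loop on Python 3.10+; an equivalent entry point, shown here as a sketch rather than a change to the file above, is simply:

```python
# Simpler entry point: let asyncio create and close the loop itself.
def main():
    asyncio.run(main_async())
```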
backend/tests/test_provider_parallel_support.py
ADDED
@@ -0,0 +1,227 @@
#!/usr/bin/env python3
"""
Script to test whether an API provider really supports parallel requests
"""
import os
import sys
import time
import asyncio
import json
from pathlib import Path
from datetime import datetime

# Ensure environment is properly configured
from dotenv import load_dotenv
load_dotenv()

# Define the model and provider to test
MODEL_NAME = "Qwen/QwQ-32B"
PROVIDER = "novita"
REQUEST_COUNT = 5  # Number of requests

# List of questions
PROMPTS = [
    "Explain in detail how parallel computing has transformed modern data processing.",
    "Describe the fundamental differences between CPU and GPU architectures.",
    "Analyze the key challenges in distributed systems design.",
    "Discuss the evolution of natural language processing from rule-based systems to modern transformer architectures.",
    "Explain the concept of quantum computing and how it differs from classical computing paradigms."
]

async def send_request(prompt, request_id=None, show_logs=True):
    """Send a request to the model and measure the execution time"""
    if show_logs and request_id is not None:
        print(f"Starting request {request_id} at {datetime.now().strftime('%H:%M:%S.%f')[:-3]}")

    start_time = time.time()

    cmd_args = [
        "curl", "-s",
        "-X", "POST",
        f"https://api-inference.huggingface.co/models/{MODEL_NAME}",
        "-H", f"Authorization: Bearer {os.environ.get('HF_TOKEN')}",
        "-H", "Content-Type: application/json",
        "-d", json.dumps({
            "inputs": prompt,
            "parameters": {
                "provider": PROVIDER,
                "max_new_tokens": 20
            }
        })
    ]

    process = await asyncio.create_subprocess_exec(
        *cmd_args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )

    stdout, stderr = await process.communicate()

    end_time = time.time()
    duration = end_time - start_time

    response = stdout.decode("utf-8")
    stderr_output = stderr.decode("utf-8")

    # Determine success
    is_success = False
    try:
        response_json = json.loads(response)
        is_success = process.returncode == 0 and isinstance(response_json, list) and "generated_text" in response_json[0]
    except json.JSONDecodeError:
        is_success = process.returncode == 0 and not ("error" in response.lower())
    except Exception:
        is_success = process.returncode == 0

    # Extract an error message on failure
    error_message = None
    if not is_success:
        try:
            if "error" in response.lower():
                try:
                    response_json = json.loads(response)
                    if "error" in response_json:
                        error_message = response_json["error"]
                except:
                    error_message = f"Non-JSON error: {response}"
            elif stderr_output:
                error_message = stderr_output
            else:
                error_message = f"Response: {response}"
        except:
            error_message = f"Unknown error. Code: {process.returncode}"

    if show_logs and request_id is not None:
        print(f"Finished request {request_id} at {datetime.now().strftime('%H:%M:%S.%f')[:-3]} (duration: {duration:.2f}s)")
        if not is_success:
            print(f"ERROR request {request_id}: {error_message[:100]}..." if error_message and len(error_message) > 100 else error_message)

    return {
        "request_id": request_id,
        "prompt": prompt,
        "start_time": start_time,
        "end_time": end_time,
        "duration": duration,
        "success": is_success,
        "response": response,
        "error_message": error_message
    }

async def run_parallel_requests(prompts):
    """Run the requests in parallel"""
    print(f"\n=== Parallel test: {len(prompts)} requests for {MODEL_NAME} ({PROVIDER}) ===")
    print(f"Start time: {datetime.now().strftime('%H:%M:%S')}")

    # Synchronize the start of the requests
    start_event = asyncio.Event()

    async def synchronized_request(prompt, req_id):
        await start_event.wait()
        return await send_request(prompt, req_id)

    # Create all the tasks
    tasks = [asyncio.create_task(synchronized_request(prompts[i], i)) for i in range(len(prompts))]

    # Wait until all the tasks are ready
    await asyncio.sleep(1)

    # Launch all the requests at the same time
    parallel_start_time = time.time()
    print(f"Synchronized start at {datetime.now().strftime('%H:%M:%S.%f')[:-3]}")
    start_event.set()

    # Wait for all the tasks to finish
    results = await asyncio.gather(*tasks)
    parallel_end_time = time.time()
    parallel_duration = parallel_end_time - parallel_start_time

    print(f"Parallel test finished in {parallel_duration:.2f}s\n")
    return results, parallel_duration

async def run_sequential_requests(prompts):
    """Run the same requests sequentially"""
    print(f"\n=== Sequential test: {len(prompts)} requests for {MODEL_NAME} ({PROVIDER}) ===")
    print(f"Start time: {datetime.now().strftime('%H:%M:%S')}")

    sequential_start_time = time.time()
    results = []

    for i, prompt in enumerate(prompts):
        print(f"Sequential request {i}...")
        result = await send_request(prompt, i)
        results.append(result)

    sequential_end_time = time.time()
    sequential_duration = sequential_end_time - sequential_start_time

    print(f"Sequential test finished in {sequential_duration:.2f}s\n")
    return results, sequential_duration

async def run_tests():
    """Run the parallel then sequential tests and compare the results"""
    global_start = time.time()
    prompts = PROMPTS[:REQUEST_COUNT]  # Use the specified number of prompts

    # 1. Parallel test
    parallel_results, parallel_duration = await run_parallel_requests(prompts)

    # 2. Sequential test
    sequential_results, sequential_duration = await run_sequential_requests(prompts)

    # 3. Analyze the results
    global_end = time.time()
    total_duration = global_end - global_start

    # Compute the metrics
    parallel_success = sum(1 for r in parallel_results if r["success"])
    sequential_success = sum(1 for r in sequential_results if r["success"])

    # Compute the effective parallelism factor (sequential time / parallel time)
    if parallel_duration > 0:
        parallelism_factor = sequential_duration / parallel_duration
    else:
        parallelism_factor = 0

    # Improvement percentage
    improvement_percent = (1 - (parallel_duration / sequential_duration)) * 100 if sequential_duration > 0 else 0

    # Print the summary
    print("\n====== TEST SUMMARY ======")
    print(f"Model: {MODEL_NAME}, Provider: {PROVIDER}, Requests: {len(prompts)}")
    print(f"\nParallel test duration: {parallel_duration:.2f}s ({parallel_success}/{len(prompts)} successful)")
    print(f"Sequential test duration: {sequential_duration:.2f}s ({sequential_success}/{len(prompts)} successful)")
    print(f"Parallelism factor: {parallelism_factor:.2f}x")
    print(f"Improvement: {improvement_percent:.1f}%")

    if parallelism_factor >= len(prompts) * 0.8:
        conclusion = "EXCELLENT parallelism (close to the theoretical maximum)"
    elif parallelism_factor >= 2:
        conclusion = "GOOD parallelism (significantly better than sequential)"
    elif parallelism_factor >= 1.3:
        conclusion = "AVERAGE parallelism (slightly better than sequential)"
    else:
        conclusion = "WEAK or NO parallelism (no significant advantage)"

    print(f"\nConclusion: {conclusion}")

    # Save the results
    output_file = f"parallel_test_{PROVIDER}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(output_file, 'w') as f:
        json.dump({
            "model": MODEL_NAME,
            "provider": PROVIDER,
            "request_count": len(prompts),
            "parallel_duration": parallel_duration,
            "sequential_duration": sequential_duration,
            "parallelism_factor": parallelism_factor,
            "improvement_percent": improvement_percent,
            "conclusion": conclusion,
            "parallel_results": parallel_results,
            "sequential_results": sequential_results
        }, f, indent=2)

    print(f"\nDetailed results saved to {output_file}")

if __name__ == "__main__":
    asyncio.run(run_tests())
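To make the reported metric concrete, here is the parallelism-factor arithmetic with purely illustrative numbers (five requests of roughly 2 s each versus a 2.5 s parallel batch); none of these values come from a real run:

```python
# Illustrative numbers only, not measured results.
sequential_duration = 5 * 2.0   # ~10.0 s when the 5 requests run one after another
parallel_duration = 2.5         # wall-clock time for the same 5 requests in parallel

parallelism_factor = sequential_duration / parallel_duration                 # 4.0x
improvement_percent = (1 - parallel_duration / sequential_duration) * 100.0  # 75.0%
print(f"{parallelism_factor:.1f}x faster, {improvement_percent:.0f}% improvement")
```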
backend/tests/test_yourbench_results.py
ADDED
@@ -0,0 +1,394 @@
#!/usr/bin/env python3
"""
Script to test Yourbench results and check the datasets on the Hugging Face Hub.
"""

import os
import sys
import json
import argparse
import requests
import tempfile
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple

# Check whether the required libraries are installed
try:
    from dotenv import load_dotenv
    from huggingface_hub import HfApi, DatasetInfo, ModelInfo
    from loguru import logger
    import pandas as pd
except ImportError:
    print("Installing dependencies...")
    import subprocess
    subprocess.run(["pip", "install", "python-dotenv", "huggingface_hub", "loguru", "pandas", "pyarrow"], check=True)
    from dotenv import load_dotenv
    from huggingface_hub import HfApi, DatasetInfo, ModelInfo
    from loguru import logger
    import pandas as pd

# Load environment variables from .env
load_dotenv()

# Logging configuration
logger.remove()
logger.add(sys.stderr, format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>")
logger.add("yourbench_tests.log", rotation="10 MB", retention="1 week")

def configure_argument_parser() -> argparse.ArgumentParser:
    """Configure the argument parser."""
    parser = argparse.ArgumentParser(description="Test Yourbench results and check the datasets")
    parser.add_argument("--dataset", type=str, help="Name of the dataset to check (without the organization name)")
    parser.add_argument("--org", type=str, default=os.environ.get("HF_ORGANIZATION", "yourbench"),
                        help="Hugging Face organization (default: value of HF_ORGANIZATION in .env, or 'yourbench')")
    parser.add_argument("--verbose", "-v", action="store_true", help="Display detailed information")
    return parser

class YourbenchTester:
    """Class for testing Yourbench results and datasets."""

    def __init__(self, organization: str, verbose: bool = False):
        """Initialize the Yourbench tester.

        Args:
            organization: Name of the organization on Hugging Face
            verbose: Display detailed information
        """
        self.organization = organization
        self.verbose = verbose
        self.hf_token = os.environ.get("HF_TOKEN")

        if not self.hf_token:
            logger.error("HF_TOKEN environment variable not found in the .env file")
            sys.exit(1)

        self.api = HfApi(token=self.hf_token)
        logger.info(f"Initializing the tester for organization: {organization}")

    def test_dataset_exists(self, dataset_name: str) -> Optional[DatasetInfo]:
        """Check whether a dataset exists on the Hub.

        Args:
            dataset_name: Name of the dataset to check

        Returns:
            Information about the dataset if it exists, None otherwise
        """
        full_dataset_name = f"{self.organization}/{dataset_name}"
        logger.info(f"Checking that the dataset exists: {full_dataset_name}")

        try:
            dataset_info = self.api.dataset_info(full_dataset_name)
            logger.success(f"Dataset {full_dataset_name} found!")

            if self.verbose:
                logger.info(f"ID: {dataset_info.id}")
                logger.info(f"Last modified: {dataset_info.lastModified}")
                logger.info(f"SHA: {dataset_info.sha}")

            return dataset_info

        except Exception as e:
            logger.error(f"Could not find the dataset {full_dataset_name}: {str(e)}")
            return None

    def analyze_dataset_content(self, dataset_name: str) -> Tuple[bool, Dict[str, Any]]:
        """Analyze the content of a dataset.

        Args:
            dataset_name: Name of the dataset to analyze

        Returns:
            Tuple containing a boolean indicating whether the analysis succeeded and a dictionary of statistics
        """
        full_dataset_name = f"{self.organization}/{dataset_name}"
        logger.info(f"Analyzing the content of the dataset: {full_dataset_name}")

        stats = {
            "fichiers": 0,
            "taille_totale": 0,
            "fichiers_json": 0,
            "fichiers_parquet": 0,
            "a_questions": False,
            "nb_questions": 0,
            "structure_parquet": {},
            "types_documents": set()
        }

        try:
            # List the files in the dataset
            files = self.api.list_repo_files(full_dataset_name, repo_type="dataset")
            stats["fichiers"] = len(files)

            if self.verbose:
                logger.info(f"Files found in the dataset: {len(files)}")
                for file in files[:10]:  # Limit to 10 files to avoid overly verbose output
                    logger.info(f"  - {file}")
                if len(files) > 10:
                    logger.info(f"  ... and {len(files) - 10} more files")

            # Check for question files
            question_files = [f for f in files if "question" in f.lower() and f.endswith(".json")]
            stats["fichiers_json"] = len([f for f in files if f.endswith(".json")])

            # Check the Parquet files used by Yourbench
            parquet_files = [f for f in files if f.endswith(".parquet")]
            stats["fichiers_parquet"] = len(parquet_files)

            if parquet_files:
                logger.info(f"Parquet files found: {len(parquet_files)}")

                # Analyze a sample of Parquet files
                for parquet_file in parquet_files[:3]:  # Limit the analysis to 3 files
                    category = parquet_file.split('/')[0] if '/' in parquet_file else "unknown"

                    logger.info(f"Analyzing Parquet file: {parquet_file} (category: {category})")

                    try:
                        # Download the Parquet file
                        temp_file = self.api.hf_hub_download(
                            repo_id=full_dataset_name,
                            filename=parquet_file,
                            repo_type="dataset"
                        )

                        # Read the Parquet file with pandas
                        df = pd.read_parquet(temp_file)

                        # Add statistics
                        stats["structure_parquet"][category] = {
                            "colonnes": list(df.columns),
                            "nb_lignes": len(df),
                            "exemple": df.iloc[0].to_dict() if len(df) > 0 else {}
                        }

                        # Check whether this file contains questions
                        if any(col for col in df.columns if "question" in col.lower()):
                            stats["a_questions"] = True
                            question_col = next(col for col in df.columns if "question" in col.lower())
                            stats["nb_questions"] = len(df)

                            # Grab an example question
                            if len(df) > 0 and question_col in df.columns:
                                logger.info(f"Example question: {df[question_col].iloc[0][:100]}...")

                        # Identify the document types if available
                        if "doc_type" in df.columns and len(df) > 0:
                            doc_types = df["doc_type"].unique()
                            stats["types_documents"].update(doc_types)

                    except Exception as e:
                        logger.warning(f"Error while analyzing the file {parquet_file}: {str(e)}")

            # Convert the set to a list for JSON serialization
            stats["types_documents"] = list(stats["types_documents"])

            if question_files:
                stats["a_questions"] = True

                # Analyze a question file to understand its structure
                sample_file = question_files[0]
                content = self.api.hf_hub_download(
                    repo_id=full_dataset_name,
                    filename=sample_file,
                    repo_type="dataset"
                )

                with open(content, 'r') as f:
                    data = json.load(f)

                    if isinstance(data, list):
                        stats["nb_questions"] = len(data)
                    elif isinstance(data, dict) and "questions" in data:
                        stats["nb_questions"] = len(data["questions"])

                logger.success(f"Question files found: {len(question_files)}")
                logger.info(f"Example file analyzed: {sample_file}")
                logger.info(f"Number of questions found: {stats['nb_questions']}")

            return True, stats

        except Exception as e:
            logger.error(f"Error while analyzing the dataset {full_dataset_name}: {str(e)}")
            return False, stats

    def check_evaluation_results(self, dataset_name: str) -> bool:
        """Check whether evaluation results exist for this dataset.

        Args:
            dataset_name: Name of the dataset to check

        Returns:
            True if evaluation results exist, False otherwise
        """
        logger.info(f"Looking for evaluation results for the dataset: {dataset_name}")

        try:
            # List all of the organization's datasets
            datasets = self.api.list_datasets(author=self.organization)

            # Look for evaluation datasets
            eval_datasets = [ds for ds in datasets if ds.id.startswith(f"{self.organization}/evaluation-")]

            if self.verbose:
                logger.info(f"Evaluation datasets found: {len(eval_datasets)}")
                for ds in eval_datasets[:5]:
                    logger.info(f"  - {ds.id}")

            # Check whether the specified dataset is mentioned in the evaluations
            for eval_ds in eval_datasets:
                try:
                    # Download the README to see whether the dataset is mentioned
                    readme_path = self.api.hf_hub_download(
                        repo_id=eval_ds.id,
                        filename="README.md",
                        repo_type="dataset"
                    )

                    with open(readme_path, 'r') as f:
                        readme_content = f.read()

                    if dataset_name in readme_content:
                        logger.success(f"Evaluation results found in: {eval_ds.id}")
                        return True
                except:
                    continue

            logger.warning(f"No evaluation results found for the dataset: {dataset_name}")
            return False

        except Exception as e:
            logger.error(f"Error while looking for evaluation results: {str(e)}")
            return False

    def check_model_performances(self, dataset_name: str) -> Dict[str, float]:
        """Check model performance on the specified dataset.

        Args:
            dataset_name: Name of the dataset to check

        Returns:
            Dictionary of model performances (model_name -> score)
        """
        logger.info(f"Checking model performance on the dataset: {dataset_name}")
        performances = {}

        try:
            # This part is speculative because we do not know the exact structure
            # of the results. One possible approach is to look for JSON files
            # containing metrics in the evaluation datasets.

            # Look for evaluation datasets
            datasets = self.api.list_datasets(author=self.organization)
            eval_datasets = [ds for ds in datasets if ds.id.startswith(f"{self.organization}/evaluation-")]

            for eval_ds in eval_datasets:
                try:
                    files = self.api.list_repo_files(eval_ds.id, repo_type="dataset")
                    result_files = [f for f in files if "result" in f.lower() and f.endswith(".json")]

                    for result_file in result_files:
                        file_path = self.api.hf_hub_download(
                            repo_id=eval_ds.id,
                            filename=result_file,
                            repo_type="dataset"
                        )

                        with open(file_path, 'r') as f:
                            results = json.load(f)

                        # Basic analysis of the results (adapt to the actual structure)
                        if "model_name" in results and "metrics" in results:
                            model_name = results["model_name"]
                            metrics = results["metrics"]

                            # Take the first metric found as the score
                            if metrics and isinstance(metrics, dict):
                                first_metric = list(metrics.keys())[0]
                                performances[model_name] = metrics[first_metric]
                except:
                    continue

            if performances:
                logger.success(f"Performance found for {len(performances)} models")
                for model, score in performances.items():
                    logger.info(f"  - {model}: {score}")
            else:
                logger.warning("No model performance found")

            return performances

        except Exception as e:
            logger.error(f"Error while checking performance: {str(e)}")
            return {}

def main():
    """Main function."""
    parser = configure_argument_parser()
    args = parser.parse_args()

    if not args.dataset:
        logger.error("Please specify a dataset with --dataset")
        parser.print_help()
        return

    # Create the tester
    tester = YourbenchTester(args.org, args.verbose)

    # 1. Check that the dataset exists
    dataset_info = tester.test_dataset_exists(args.dataset)

    if not dataset_info:
        logger.error(f"The dataset {args.org}/{args.dataset} does not exist or is not accessible")
        return

    # 2. Analyze the dataset content
    success, stats = tester.analyze_dataset_content(args.dataset)

    if success:
        logger.info("\n=== Dataset statistics ===")
        logger.info(f"Number of files: {stats['fichiers']}")
        logger.info(f"JSON files: {stats['fichiers_json']}")
        logger.info(f"Parquet files: {stats['fichiers_parquet']}")
        logger.info(f"Contains questions: {'Yes' if stats['a_questions'] else 'No'}")

        if stats['a_questions']:
            logger.info(f"Number of questions: {stats['nb_questions']}")

        if 'types_documents' in stats and stats['types_documents']:
            logger.info(f"Document types: {', '.join(stats['types_documents'])}")

        # Display the structure of the Parquet files
        if 'structure_parquet' in stats and stats['structure_parquet']:
            logger.info("\n=== Parquet file structure ===")
            for category, info in stats['structure_parquet'].items():
                logger.info(f"\nCategory: {category}")
                logger.info(f"Number of rows: {info['nb_lignes']}")
                logger.info(f"Columns: {', '.join(info['colonnes'])}")

                if args.verbose and 'exemple' in info and info['exemple']:
                    logger.info("\nExample row:")
                    for key, value in info['exemple'].items():
                        # Truncate values that are too long
                        if isinstance(value, str) and len(value) > 100:
                            value = value[:100] + "..."
                        logger.info(f"  {key}: {value}")

    # 3. Check whether evaluation results exist
    has_evaluations = tester.check_evaluation_results(args.dataset)

    if has_evaluations:
        # 4. Check model performance
        performances = tester.check_model_performances(args.dataset)

        if performances:
            logger.info("\n=== Model ranking ===")
            # Sort models by score (highest to lowest)
            sorted_models = sorted(performances.items(), key=lambda x: x[1], reverse=True)
            for i, (model, score) in enumerate(sorted_models, 1):
                logger.info(f"{i}. {model}: {score:.4f}")

    logger.success("Test finished!")

if __name__ == "__main__":
    main()
docker-compose.yml
ADDED
@@ -0,0 +1,33 @@
services:
  backend:
    build:
      context: ./backend
      dockerfile: Dockerfile.dev
      args:
        - HF_TOKEN=${HF_TOKEN}
    ports:
      - "${BACKEND_PORT:-8000}:8000"
    volumes:
      - ./backend:/app
    environment:
      - ENVIRONMENT=${ENVIRONMENT:-development}
      - HF_TOKEN=${HF_TOKEN}
      - HF_HOME=${HF_HOME:-/.cache}
    command: uvicorn app.asgi:app --host 0.0.0.0 --port 8000 --reload

  frontend:
    build:
      context: ./frontend
      dockerfile: Dockerfile.dev
    ports:
      - "${FRONTEND_PORT:-7860}:7860"
    volumes:
      - ./frontend:/app
      - /app/node_modules
    environment:
      - NODE_ENV=${ENVIRONMENT:-development}
      - CHOKIDAR_USEPOLLING=true
      - PORT=${FRONTEND_PORT:-7860}
    command: npm start
    stdin_open: true
    tty: true
ADDED
@@ -0,0 +1,15 @@
|
FROM node:18

WORKDIR /app

# Install required global dependencies
RUN npm install -g react-scripts

# Copy package.json and package-lock.json
COPY package*.json ./

# Install project dependencies
RUN npm install

# Volume will be mounted here, no need for COPY
CMD ["npm", "start"]
frontend/README.md
ADDED
@@ -0,0 +1,80 @@
# Frontend - Open LLM Leaderboard 🏆

React interface for exploring and comparing open-source language models.

## 🏗 Architecture

```mermaid
flowchart TD
    Client(["User Browser"]) --> Components["React Components"]

    subgraph Frontend
        Components --> Context["Context Layer<br>• LeaderboardContext<br>• Global State"]

        API["API Layer<br>• /api/leaderboard/formatted<br>• TanStack Query"] --> |Data Feed| Context

        Context --> Hooks["Hooks Layer<br>• Data Processing<br>• Filtering<br>• Caching"]

        Hooks --> Features["Features<br>• Table Management<br>• Search & Filters<br>• Display Options"]
        Features --> Cache["Cache Layer<br>• LocalStorage<br>• URL State"]
    end

    API --> Backend["Backend Server"]

    style Backend fill:#f96,stroke:#333,stroke-width:2px
```

## ✨ Core Features

- 🔍 **Search & Filters**: Real-time filtering, regex search, advanced filters
- 📊 **Data Visualization**: Interactive table, customizable columns, sorting
- 🔄 **State Management**: URL sync, client-side caching (5min TTL)
- 📱 **Responsive Design**: Mobile-friendly, dark/light themes

## 🛠 Tech Stack

- React 18 + Material-UI
- TanStack Query & Table
- React Router v6

## 📁 Project Structure

```
src/
├── pages/
│   └── LeaderboardPage/
│       ├── components/   # UI Components
│       ├── context/      # Global State
│       └── hooks/        # Data Processing
├── components/           # Shared Components
└── utils/                # Helper Functions
```

## 🚀 Development

```bash
# Install dependencies
npm install

# Start development server
npm start

# Production build
npm run build
```

## 🔧 Environment Variables

```env
# API Configuration
REACT_APP_API_URL=http://localhost:8000
REACT_APP_CACHE_DURATION=300000  # 5 minutes
```

## 🔄 Data Flow

1. API fetches leaderboard data from backend
2. Context stores and manages global state
3. Hooks handle data processing and filtering
4. Components render based on processed data
5. Cache maintains user preferences and URL state
frontend/package.json
ADDED
@@ -0,0 +1,55 @@
{
  "name": "open-llm-leaderboard",
  "version": "0.1.0",
  "private": true,
  "dependencies": {
    "@emotion/react": "^11.13.3",
    "@emotion/styled": "^11.13.0",
    "@huggingface/hub": "^0.14.0",
    "@mui/icons-material": "^6.1.7",
    "@mui/lab": "^6.0.0-beta.16",
    "@mui/material": "^6.1.6",
    "@mui/x-data-grid": "^7.22.2",
    "@tanstack/react-query": "^5.62.2",
    "@tanstack/react-table": "^8.20.5",
    "@tanstack/react-virtual": "^3.10.9",
    "@testing-library/jest-dom": "^5.17.0",
    "@testing-library/react": "^13.4.0",
    "@testing-library/user-event": "^13.5.0",
    "compression": "^1.7.4",
    "cors": "^2.8.5",
    "express": "^4.18.2",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
    "react-router-dom": "^6.28.0",
    "react-scripts": "5.0.1",
    "serve-static": "^1.15.0",
    "web-vitals": "^2.1.4"
  },
  "scripts": {
    "start": "react-scripts start",
    "build": "react-scripts build",
    "test": "react-scripts test",
    "eject": "react-scripts eject",
    "serve": "node server.js"
  },
  "eslintConfig": {
    "extends": [
      "react-app",
      "react-app/jest"
    ]
  },
  "browserslist": {
    "production": [
      ">0.2%",
      "not dead",
      "not op_mini all"
    ],
    "development": [
      "last 1 chrome version",
      "last 1 firefox version",
      "last 1 safari version"
    ]
  },
  "proxy": "http://backend:8000"
}
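Note that the `proxy` field above only affects the CRA development server (`npm start`): roughly, API-style requests the dev server cannot serve itself are forwarded to `http://backend:8000`. In production the Express server in `server.js` plays the same role for `/api` routes. A minimal sketch of a call that relies on this, assuming a hypothetical `/api/health` endpoint:

```js
// Sketch only: a relative URL goes through the CRA dev proxy in development
// and through the Express "/api" proxy (server.js) in production.
// "/api/health" is an assumed example endpoint, not necessarily a real route.
export async function checkBackendHealth() {
  const response = await fetch("/api/health");
  if (!response.ok) {
    throw new Error(`Backend unreachable: ${response.status}`);
  }
  return response.json();
}
```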
frontend/public/index.html
ADDED
@@ -0,0 +1,96 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <link rel="icon" href="%PUBLIC_URL%/logo32.png" />
    <meta
      name="viewport"
      content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no, viewport-fit=cover"
    />
    <meta
      name="description"
      content="Interactive leaderboard tracking and comparing open-source Large Language Models across multiple benchmarks: IFEval, BBH, MATH, GPQA, MUSR, and MMLU-PRO."
    />

    <!-- Open Graph / Facebook -->
    <meta property="og:type" content="website" />
    <meta
      property="og:url"
      content="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"
    />
    <meta
      property="og:title"
      content="Open LLM Leaderboard - Compare Open Source Large Language Models"
    />
    <meta
      property="og:description"
      content="Interactive leaderboard for comparing LLM performance across multiple benchmarks. Features real-time filtering, community voting, and comprehensive model analysis with benchmarks like IFEval, BBH, MATH, GPQA, MUSR, and MMLU-PRO."
    />
    <meta property="og:image" content="%PUBLIC_URL%/og-image.png" />

    <!-- Twitter -->
    <meta property="twitter:card" content="summary_large_image" />
    <meta
      property="twitter:url"
      content="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"
    />
    <meta
      property="twitter:title"
      content="Open LLM Leaderboard - Compare Open Source Large Language Models"
    />
    <meta
      property="twitter:description"
      content="Interactive leaderboard for comparing LLM performance across multiple benchmarks. Features real-time filtering, community voting, and comprehensive model analysis with benchmarks like IFEval, BBH, MATH, GPQA, MUSR, and MMLU-PRO."
    />
    <meta property="twitter:image" content="%PUBLIC_URL%/og-image.png" />
    <!--
      Notice the use of %PUBLIC_URL% in the tags above.
      It will be replaced with the URL of the `public` folder during the build.
      Only files inside the `public` folder can be referenced from the HTML.

      Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
      work correctly both with client-side routing and a non-root public URL.
      Learn how to configure a non-root public URL by running `npm run build`.
    -->
    <title>
      Open LLM Leaderboard - Compare Open Source Large Language Models
    </title>
    <link
      href="https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600;700&display=swap"
      rel="stylesheet"
    />
    <style>
      html,
      body {
        position: fixed;
        width: 100%;
        height: 100%;
        overflow: hidden;
        -webkit-overflow-scrolling: touch;
      }
      #root {
        position: absolute;
        top: 0;
        left: 0;
        right: 0;
        bottom: 0;
        overflow-y: auto;
        -webkit-overflow-scrolling: touch;
      }
    </style>
  </head>
  <body>
    <noscript>You need to enable JavaScript to run this app.</noscript>
    <div id="root"></div>
    <!--
      This HTML file is a template.
      If you open it directly in the browser, you will see an empty page.

      You can add webfonts, meta tags, or analytics to this file.
      The build step will place the bundled scripts into the <body> tag.

      To begin the development, run `npm start` or `yarn start`.
      To create a production bundle, use `npm run build` or `yarn build`.
    -->
  </body>
</html>
frontend/public/logo256.png
ADDED
frontend/public/logo32.png
ADDED
frontend/public/og-image.jpg
ADDED
frontend/public/robots.txt
ADDED
@@ -0,0 +1,3 @@
# https://www.robotstxt.org/robotstxt.html
User-agent: *
Disallow:
frontend/server.js
ADDED
@@ -0,0 +1,85 @@
const express = require("express");
const cors = require("cors");
const compression = require("compression");
const path = require("path");
const serveStatic = require("serve-static");
const { createProxyMiddleware } = require("http-proxy-middleware");

const app = express();
const port = process.env.PORT || 7860;
const apiPort = process.env.INTERNAL_API_PORT || 7861;

// Enable CORS for all routes
app.use(cors());

// Enable GZIP compression
app.use(compression());

// Proxy all API requests to the Python backend
app.use(
  "/api",
  createProxyMiddleware({
    target: `http://127.0.0.1:${apiPort}`,
    changeOrigin: true,
    onError: (err, req, res) => {
      console.error("Proxy Error:", err);
      res.status(500).json({ error: "Proxy Error", details: err.message });
    },
  })
);

// Serve static files from the build directory
app.use(
  express.static(path.join(__dirname, "build"), {
    // Don't cache HTML files
    setHeaders: (res, path) => {
      if (path.endsWith(".html")) {
        res.setHeader("Cache-Control", "no-cache, no-store, must-revalidate");
        res.setHeader("Pragma", "no-cache");
        res.setHeader("Expires", "0");
      } else {
        // Cache other static resources for 1 year
        res.setHeader("Cache-Control", "public, max-age=31536000");
      }
    },
  })
);

// Middleware to preserve URL parameters
app.use((req, res, next) => {
  // Don't interfere with API requests
  if (req.url.startsWith("/api")) {
    return next();
  }

  // Preserve original URL parameters
  req.originalUrl = req.url;
  next();
});

// Handle all other routes by serving index.html
app.get("*", (req, res, next) => {
  // Don't interfere with API requests
  if (req.url.startsWith("/api")) {
    return next();
  }

  // Headers for client-side routing
  res.set({
    "Cache-Control": "no-cache, no-store, must-revalidate",
    Pragma: "no-cache",
    Expires: "0",
  });

  // Send index.html for all other routes
  res.sendFile(path.join(__dirname, "build", "index.html"));
});

app.listen(port, "0.0.0.0", () => {
  console.log(
    `Frontend server is running on port ${port} in ${
      process.env.NODE_ENV || "development"
    } mode`
  );
  console.log(`API proxy target: http://127.0.0.1:${apiPort}`);
});
frontend/src/App.js
ADDED
@@ -0,0 +1,427 @@
import React, { useState, useEffect } from "react";
import {
  Box,
  Container,
  CssBaseline,
  Typography,
  CircularProgress,
} from "@mui/material";
import {
  BrowserRouter as Router,
  Routes,
  Route,
  Navigate,
  useNavigate,
  useSearchParams,
} from "react-router-dom";
import getTheme from "./config/theme";
import { useThemeMode } from "./hooks/useThemeMode";
import { ThemeProvider } from "@mui/material/styles";
import BenchmarkGenerator from "./components/BenchmarkGenerator";
import BenchmarkCreateForm from "./components/BenchmarkCreateForm";
import BenchmarkDisplay from "./components/BenchmarkDisplay";
import BenchmarkEvaluation from "./components/BenchmarkEvaluation";
import EvaluationDisplay from "./components/EvaluationDisplay";

// Shared header component
const Header = () => (
  <Box sx={{ textAlign: "center", mb: 8 }}>
    <h1>Yourbench Demo</h1>
    <p>
      Quickly create <b>zero-shot benchmarks</b> from your documents – keeping
      models accurate and adaptable
    </p>
  </Box>
);

// Home page with the creation form
function HomePage() {
  const navigate = useNavigate();

  const handleStartGeneration = (sid) => {
    navigate(`/benchmark-generation?session=${sid}`);
  };

  return (
    <>
      <Header />
      <BenchmarkCreateForm onStartGeneration={handleStartGeneration} />
    </>
  );
}

// Benchmark generation page
function BenchmarkGenerationPage() {
  const navigate = useNavigate();
  const [searchParams] = useSearchParams();
  const sessionId = searchParams.get("session");
  const [isValidSession, setIsValidSession] = useState(true);

  // Check that the session is valid
  useEffect(() => {
    if (!sessionId) {
      setIsValidSession(false);
    }
  }, [sessionId]);

  const handleGenerationComplete = (result) => {
    console.log("Benchmark generation completed:", result);
    if (result && result.success) {
      navigate(`/benchmark-display?session=${sessionId}`);
    }
  };

  if (!isValidSession) {
    return <Navigate to="/" />;
  }

  return (
    <>
      <Header />
      <BenchmarkGenerator
        sessionId={sessionId}
        onComplete={handleGenerationComplete}
      />
    </>
  );
}

// Benchmark display page
function BenchmarkDisplayPage() {
  const navigate = useNavigate();
  const [searchParams] = useSearchParams();
  const sessionId = searchParams.get("session");
  const [benchmarkQuestions, setBenchmarkQuestions] = useState([]);
  const [datasetUrl, setDatasetUrl] = useState(null);
  const [isValidSession, setIsValidSession] = useState(true);
  const [isLoading, setIsLoading] = useState(true);

  // Fetch the benchmark questions from the API
  useEffect(() => {
    console.log("BenchmarkDisplayPage useEffect - sessionId:", sessionId);

    if (!sessionId) {
      console.log("Missing session ID, redirecting to home");
      setIsValidSession(false);
      return;
    }

    setIsLoading(true);

    const fetchBenchmarkQuestions = async () => {
      console.log(
        "Attempting to fetch benchmark questions for session:",
        sessionId
      );
      try {
        const apiUrl = `http://localhost:3001/benchmark-questions/${sessionId}`;
        console.log("API call:", apiUrl);

        const response = await fetch(apiUrl);
        console.log("API response received:", response.status);

        // Check if the response status is 404 (Not Found) or other error
        if (!response.ok) {
          if (response.status === 404) {
            console.error("Session not found");
            setIsValidSession(false);
            return;
          } else {
            console.error(`Server error: ${response.status}`);
            setIsLoading(false);
            return;
          }
        }

        const data = await response.json();
        console.log("API data:", data);

        if (data.success && data.questions && data.questions.length > 0) {
          console.log("Questions loaded successfully:", data.questions);
          setBenchmarkQuestions(data.questions);
        } else {
          console.warn(
            "Failed to load questions, using the default values"
          );
        }

        if (data.dataset_url) {
          setDatasetUrl(data.dataset_url);
        } else {
          const url = `https://huggingface.co/datasets/yourbench/yourbench_${sessionId}`;
          setDatasetUrl(url);
          console.log("Generated dataset URL:", url);
        }
      } catch (error) {
        console.error("Error fetching benchmark questions:", error);
        setIsValidSession(false);
      } finally {
        setIsLoading(false);
      }
    };

    fetchBenchmarkQuestions();
  }, [sessionId]);

  const handleStartEvaluation = () => {
    console.log("Starting evaluation with session ID:", sessionId);
    navigate(`/benchmark-evaluation?session=${sessionId}`);
  };

  // Default questions if the API call fails
  const defaultSampleQuestions = [
    {
      id: 1,
      question: "What are the key features discussed in the document?",
      type: "single_shot",
    },
    {
      id: 2,
      question:
        "How does the proposed solution address the challenges mentioned in section 2 in relation to the overall market trends?",
      type: "multi_hop",
    },
  ];

  if (!isValidSession) {
    return <Navigate to="/" />;
  }

  return (
    <>
      <Header />
      {isLoading ? (
        <Box
          sx={{
            display: "flex",
            justifyContent: "center",
            alignItems: "center",
            mt: 8,
            mb: 8,
          }}
        >
          <CircularProgress size={60} />
        </Box>
      ) : (
        <BenchmarkDisplay
          onStartEvaluation={handleStartEvaluation}
          sessionId={sessionId}
          datasetUrl={datasetUrl}
          sampleQuestions={
            benchmarkQuestions.length > 0
              ? benchmarkQuestions
              : defaultSampleQuestions
          }
        />
      )}
    </>
  );
}

// Benchmark evaluation page
function BenchmarkEvaluationPage() {
  const navigate = useNavigate();
  const [searchParams] = useSearchParams();
  const sessionId = searchParams.get("session");
  const [isValidSession, setIsValidSession] = useState(true);
  const [isLoading, setIsLoading] = useState(true);

  // Check that the session is valid
  useEffect(() => {
    if (!sessionId) {
      console.log(
        "Missing session ID for evaluation, redirecting to home"
      );
      setIsValidSession(false);
      return;
    }

    // Verify session exists by calling the API
    const checkSession = async () => {
      try {
        const response = await fetch(
          `http://localhost:3001/benchmark-questions/${sessionId}`
        );

        if (!response.ok) {
          console.error(
            `Invalid session or server error: ${response.status}`
          );
          setIsValidSession(false);
        }
      } catch (error) {
        console.error("Error verifying the session:", error);
        setIsValidSession(false);
      } finally {
        setIsLoading(false);
      }
    };

    checkSession();
  }, [sessionId]);

  const handleEvaluationComplete = (result) => {
    console.log("Evaluation finished:", result);
    // Stay on the same page: the results are displayed directly
    // inside the BenchmarkEvaluation component
  };

  if (!isValidSession) {
    return <Navigate to="/" />;
  }

  return (
    <>
      <Header />
      {isLoading ? (
        <Box
          sx={{
            display: "flex",
            justifyContent: "center",
            alignItems: "center",
            mt: 8,
            mb: 8,
          }}
        >
          <CircularProgress size={60} />
        </Box>
      ) : (
        <BenchmarkEvaluation
          sessionId={sessionId}
          onComplete={handleEvaluationComplete}
        />
      )}
    </>
  );
}

// Evaluation results display page
function EvaluationDisplayPage() {
  const navigate = useNavigate();
  const [searchParams] = useSearchParams();
  const sessionId = searchParams.get("session");
  const [isValidSession, setIsValidSession] = useState(true);
  const [isLoading, setIsLoading] = useState(true);

  // Check that the session is valid
  useEffect(() => {
    if (!sessionId) {
      console.log(
        "Missing session ID for the results display, redirecting to home"
      );
      setIsValidSession(false);
      return;
    }

    // Verify session exists by calling the API
    const checkSession = async () => {
      try {
        const response = await fetch(
          `http://localhost:3001/benchmark-questions/${sessionId}`
        );

        if (!response.ok) {
          console.error(
            `Invalid session or server error: ${response.status}`
          );
          setIsValidSession(false);
        }
      } catch (error) {
        console.error("Error verifying the session:", error);
        setIsValidSession(false);
      } finally {
        setIsLoading(false);
      }
    };

    checkSession();
  }, [sessionId]);

  if (!isValidSession) {
    return <Navigate to="/" />;
  }

  return (
    <>
      <Header />
      {isLoading ? (
        <Box
          sx={{
            display: "flex",
            justifyContent: "center",
            alignItems: "center",
            mt: 8,
            mb: 8,
          }}
        >
          <CircularProgress size={60} />
        </Box>
      ) : (
        <EvaluationDisplay sessionId={sessionId} />
      )}
    </>
  );
}

// Keyboard shortcuts
function KeyboardShortcuts() {
  useEffect(() => {
    const handleKeyDown = (e) => {
      if (e.key === "p") {
        console.log("Debug key pressed: Clearing auth data and refreshing");
        localStorage.removeItem("hf_oauth");
        localStorage.removeItem("auth_return_to");
        alert("Auth data cleared. Page will reload.");
        window.location.reload();
      }
    };

    window.addEventListener("keydown", handleKeyDown);
    return () => {
      window.removeEventListener("keydown", handleKeyDown);
    };
  }, []);

  return null;
}

// Main component with the routes
function App() {
  const { mode } = useThemeMode();
  const theme = getTheme(mode);

  return (
    <ThemeProvider theme={theme}>
      <CssBaseline />
      <Router>
        <Container maxWidth="md">
          <Box sx={{ pt: 12, pb: 4 }}>
            <KeyboardShortcuts />
            <Routes>
              <Route path="/" element={<HomePage />} />
              <Route
                path="/benchmark-generation"
                element={<BenchmarkGenerationPage />}
              />
              <Route
                path="/benchmark-display"
                element={<BenchmarkDisplayPage />}
              />
              <Route
                path="/benchmark-evaluation"
                element={<BenchmarkEvaluationPage />}
              />
              <Route
                path="/evaluation-display"
                element={<EvaluationDisplayPage />}
              />
              <Route path="*" element={<Navigate to="/" replace />} />
            </Routes>
          </Box>
        </Container>
      </Router>
    </ThemeProvider>
  );
}

export default App;
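App.js depends on a `useThemeMode` hook and a `getTheme` factory that are not among the files shown in this commit view. As a rough sketch of the contract App.js assumes (a `mode` string that `getTheme` can consume), something like the following would satisfy it; this is a hypothetical illustration, not the project's actual hook.

```js
// Hypothetical sketch of the contract App.js expects from useThemeMode.
// The real hook lives in src/hooks/useThemeMode.js and may differ.
import { useState, useEffect } from "react";

export const useThemeMode = () => {
  const [mode, setMode] = useState(() => {
    // Follow the system preference by default.
    const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
    return prefersDark ? "dark" : "light";
  });

  useEffect(() => {
    const media = window.matchMedia("(prefers-color-scheme: dark)");
    const onChange = (event) => setMode(event.matches ? "dark" : "light");
    media.addEventListener("change", onChange);
    return () => media.removeEventListener("change", onChange);
  }, []);

  return { mode, setMode };
};
```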
frontend/src/components/BenchmarkCreateForm.jsx
ADDED
@@ -0,0 +1,295 @@
1 |
+
import React, { useState, useRef, useEffect } from "react";
|
2 |
+
import {
|
3 |
+
Box,
|
4 |
+
Paper,
|
5 |
+
Typography,
|
6 |
+
CircularProgress,
|
7 |
+
Alert,
|
8 |
+
Button,
|
9 |
+
Stepper,
|
10 |
+
Step,
|
11 |
+
StepLabel,
|
12 |
+
} from "@mui/material";
|
13 |
+
import { useLocation } from "react-router-dom";
|
14 |
+
import CloudUploadIcon from "@mui/icons-material/CloudUpload";
|
15 |
+
import PlayArrowIcon from "@mui/icons-material/PlayArrow";
|
16 |
+
import AuthContainer from "./shared/AuthContainer";
|
17 |
+
import { useThemeMode } from "../hooks/useThemeMode";
|
18 |
+
import getTheme from "../config/theme";
|
19 |
+
|
20 |
+
/**
|
21 |
+
* Component to display a stepper with three steps: Login, Upload File, and Generate
|
22 |
+
*
|
23 |
+
* @param {Object} props - Component props
|
24 |
+
* @param {number} props.activeStep - Current active step (0-based index)
|
25 |
+
* @returns {JSX.Element} Stepper component
|
26 |
+
*/
|
27 |
+
const StepsDisplay = ({ activeStep }) => {
|
28 |
+
const steps = ["Login", "Upload File", "Generate"];
|
29 |
+
|
30 |
+
return (
|
31 |
+
<Box sx={{ width: "100%", mb: 4 }}>
|
32 |
+
<Stepper activeStep={activeStep} alternativeLabel>
|
33 |
+
{steps.map((label) => (
|
34 |
+
<Step key={label}>
|
35 |
+
<StepLabel>{label}</StepLabel>
|
36 |
+
</Step>
|
37 |
+
))}
|
38 |
+
</Stepper>
|
39 |
+
</Box>
|
40 |
+
);
|
41 |
+
};
|
42 |
+
|
43 |
+
/**
|
44 |
+
* Component for creating a new benchmark, including authentication, file upload, and generation initiation
|
45 |
+
*
|
46 |
+
* @param {Object} props - Component props
|
47 |
+
* @param {Function} props.onStartGeneration - Callback when generation starts with sessionId
|
48 |
+
* @returns {JSX.Element} BenchmarkCreateForm component
|
49 |
+
*/
|
50 |
+
function BenchmarkCreateForm({ onStartGeneration }) {
|
51 |
+
const { mode } = useThemeMode();
|
52 |
+
const theme = getTheme(mode);
|
53 |
+
const [isDragging, setIsDragging] = useState(false);
|
54 |
+
const [uploadStatus, setUploadStatus] = useState(null);
|
55 |
+
const [isLoading, setIsLoading] = useState(false);
|
56 |
+
const [activeStep, setActiveStep] = useState(0);
|
57 |
+
const [sessionId, setSessionId] = useState(null);
|
58 |
+
const fileInputRef = useRef(null);
|
59 |
+
const location = useLocation();
|
60 |
+
|
61 |
+
// Check if we're coming back from an OAuth redirect
|
62 |
+
useEffect(() => {
|
63 |
+
// If we have code in URL parameters, it's an OAuth callback
|
64 |
+
const params = new URLSearchParams(window.location.search);
|
65 |
+
if (params.has("code")) {
|
66 |
+
console.log("Detected OAuth callback, cleaning URL");
|
67 |
+
|
68 |
+
// Remove the query parameters from the URL without reloading
|
69 |
+
window.history.replaceState({}, document.title, window.location.pathname);
|
70 |
+
|
71 |
+
// Check if we have auth data in localStorage after a brief delay to let OAuth process complete
|
72 |
+
setTimeout(() => {
|
73 |
+
const storedAuth = localStorage.getItem("hf_oauth");
|
74 |
+
if (storedAuth) {
|
75 |
+
console.log("Found auth data after redirect, refreshing UI state");
|
76 |
+
setActiveStep(1); // Move to next step if authenticated
|
77 |
+
}
|
78 |
+
}, 1000);
|
79 |
+
}
|
80 |
+
}, [location]);
|
81 |
+
|
82 |
+
const handleDragOver = (e) => {
|
83 |
+
e.preventDefault();
|
84 |
+
setIsDragging(true);
|
85 |
+
};
|
86 |
+
|
87 |
+
const handleDragLeave = () => {
|
88 |
+
setIsDragging(false);
|
89 |
+
};
|
90 |
+
|
91 |
+
const handleClick = () => {
|
92 |
+
fileInputRef.current.click();
|
93 |
+
};
|
94 |
+
|
95 |
+
const handleFileChange = (e) => {
|
96 |
+
const file = e.target.files[0];
|
97 |
+
if (!file) return;
|
98 |
+
|
99 |
+
// Vérifier si c'est un PDF, TXT, HTML ou MD
|
100 |
+
if (
|
101 |
+
!file.name.endsWith(".pdf") &&
|
102 |
+
!file.name.endsWith(".txt") &&
|
103 |
+
!file.name.endsWith(".html") &&
|
104 |
+
!file.name.endsWith(".md")
|
105 |
+
) {
|
106 |
+
setUploadStatus({
|
107 |
+
success: false,
|
108 |
+
message: "Only PDF, TXT, HTML and MD files are accepted",
|
109 |
+
});
|
110 |
+
return;
|
111 |
+
}
|
112 |
+
|
113 |
+
handleFileUpload(file);
|
114 |
+
};
|
115 |
+
|
116 |
+
const handleFileUpload = async (file) => {
|
117 |
+
setIsLoading(true);
|
118 |
+
setUploadStatus(null);
|
119 |
+
|
120 |
+
try {
|
121 |
+
const formData = new FormData();
|
122 |
+
formData.append("file", file);
|
123 |
+
|
124 |
+
const response = await fetch("http://localhost:3001/upload", {
|
125 |
+
method: "POST",
|
126 |
+
body: formData,
|
127 |
+
});
|
128 |
+
|
129 |
+
const result = await response.json();
|
130 |
+
|
131 |
+
if (response.ok) {
|
132 |
+
setUploadStatus({
|
133 |
+
success: true,
|
134 |
+
message: `File ${result.filename} uploaded successfully`,
|
135 |
+
});
|
136 |
+
// Store the session ID for the benchmark generation
|
137 |
+
setSessionId(result.session_id);
|
138 |
+
setActiveStep(2); // Advance to Generate step after successful upload
|
139 |
+
} else {
|
140 |
+
setUploadStatus({
|
141 |
+
success: false,
|
142 |
+
message: result.error || "Upload failed",
|
143 |
+
});
|
144 |
+
}
|
145 |
+
} catch (error) {
|
146 |
+
setUploadStatus({
|
147 |
+
success: false,
|
148 |
+
message: "Server connection error",
|
149 |
+
});
|
150 |
+
} finally {
|
151 |
+
setIsLoading(false);
|
152 |
+
}
|
153 |
+
};
|
154 |
+
|
155 |
+
const handleDrop = async (e) => {
|
156 |
+
e.preventDefault();
|
157 |
+
setIsDragging(false);
|
158 |
+
|
159 |
+
const file = e.dataTransfer.files[0];
|
160 |
+
if (!file) {
|
161 |
+
setUploadStatus({ success: false, message: "No file detected" });
|
162 |
+
return;
|
163 |
+
}
|
164 |
+
|
165 |
+
// Vérifier si c'est un PDF, TXT, HTML ou MD
|
166 |
+
if (
|
167 |
+
!file.name.endsWith(".pdf") &&
|
168 |
+
!file.name.endsWith(".txt") &&
|
169 |
+
!file.name.endsWith(".html") &&
|
170 |
+
!file.name.endsWith(".md")
|
171 |
+
) {
|
172 |
+
setUploadStatus({
|
173 |
+
success: false,
|
174 |
+
message: "Only PDF, TXT, HTML and MD files are accepted",
|
175 |
+
});
|
176 |
+
return;
|
177 |
+
}
|
178 |
+
|
179 |
+
handleFileUpload(file);
|
180 |
+
};
|
181 |
+
|
182 |
+
const handleGenerateClick = () => {
|
183 |
+
if (onStartGeneration && sessionId) {
|
184 |
+
onStartGeneration(sessionId);
|
185 |
+
}
|
186 |
+
};
|
187 |
+
|
188 |
+
return (
|
189 |
+
<>
|
190 |
+
<StepsDisplay activeStep={activeStep} />
|
191 |
+
|
192 |
+
{/* Authentication step */}
|
193 |
+
{activeStep === 0 && (
|
194 |
+
<AuthContainer
|
195 |
+
actionText="use this demo"
|
196 |
+
onSuccess={() => setActiveStep(1)}
|
197 |
+
/>
|
198 |
+
)}
|
199 |
+
|
200 |
+
{/* File upload step */}
|
201 |
+
{activeStep === 1 && (
|
202 |
+
<Paper
|
203 |
+
elevation={3}
|
204 |
+
sx={{
|
205 |
+
p: 4,
|
206 |
+
mt: 3,
|
207 |
+
mb: 3,
|
208 |
+
border: isDragging
|
209 |
+
? `2px dashed ${theme.palette.primary.main}`
|
210 |
+
: "2px dashed #ccc",
|
211 |
+
backgroundColor: isDragging ? "rgba(0, 0, 0, 0.05)" : "transparent",
|
212 |
+
display: "flex",
|
213 |
+
flexDirection: "column",
|
214 |
+
alignItems: "center",
|
215 |
+
justifyContent: "center",
|
216 |
+
minHeight: 200,
|
217 |
+
cursor: "pointer",
|
218 |
+
transition: "all 0.3s ease",
|
219 |
+
}}
|
220 |
+
onDragOver={handleDragOver}
|
221 |
+
onDragLeave={handleDragLeave}
|
222 |
+
onDrop={handleDrop}
|
223 |
+
onClick={handleClick}
|
224 |
+
>
|
225 |
+
<input
|
226 |
+
type="file"
|
227 |
+
ref={fileInputRef}
|
228 |
+
onChange={handleFileChange}
|
229 |
+
accept=".pdf,.txt,.html,.md"
|
230 |
+
style={{ display: "none" }}
|
231 |
+
/>
|
232 |
+
<CloudUploadIcon
|
233 |
+
sx={{ fontSize: 60, color: "text.secondary", mb: 1 }}
|
234 |
+
/>
|
235 |
+
<Typography variant="h6" component="div" gutterBottom>
|
236 |
+
Drag and drop your file here or click to browse
|
237 |
+
</Typography>
|
238 |
+
<Typography variant="body2" color="text.secondary">
|
239 |
+
Accepted formats: PDF, TXT, HTML, MD
|
240 |
+
</Typography>
|
241 |
+
|
242 |
+
{isLoading && (
|
243 |
+
<Box sx={{ mt: 2 }}>
|
244 |
+
<CircularProgress size={30} />
|
245 |
+
</Box>
|
246 |
+
)}
|
247 |
+
|
248 |
+
{uploadStatus && (
|
249 |
+
<Alert
|
250 |
+
severity={uploadStatus.success ? "success" : "error"}
|
251 |
+
sx={{ mt: 2, width: "100%" }}
|
252 |
+
>
|
253 |
+
{uploadStatus.message}
|
254 |
+
</Alert>
|
255 |
+
)}
|
256 |
+
</Paper>
|
257 |
+
)}
|
258 |
+
|
259 |
+
{/* Generate button step */}
|
260 |
+
{activeStep === 2 && (
|
261 |
+
<Paper
|
262 |
+
elevation={3}
|
263 |
+
sx={{
|
264 |
+
p: 4,
|
265 |
+
mt: 3,
|
266 |
+
mb: 3,
|
267 |
+
display: "flex",
|
268 |
+
flexDirection: "column",
|
269 |
+
alignItems: "center",
|
270 |
+
justifyContent: "center",
|
271 |
+
minHeight: 200,
|
272 |
+
}}
|
273 |
+
>
|
274 |
+
<PlayArrowIcon
|
275 |
+
sx={{ fontSize: 60, color: "text.secondary", mb: 1 }}
|
276 |
+
/>
|
277 |
+
<Typography variant="h6" component="div" gutterBottom>
|
278 |
+
Ready to generate your benchmark
|
279 |
+
</Typography>
|
280 |
+
<Button
|
281 |
+
variant="contained"
|
282 |
+
color="primary"
|
283 |
+
onClick={handleGenerateClick}
|
284 |
+
sx={{ mt: 2 }}
|
285 |
+
startIcon={<PlayArrowIcon />}
|
286 |
+
>
|
287 |
+
Generate Benchmark
|
288 |
+
</Button>
|
289 |
+
</Paper>
|
290 |
+
)}
|
291 |
+
</>
|
292 |
+
);
|
293 |
+
}
|
294 |
+
|
295 |
+
export default BenchmarkCreateForm;
|
frontend/src/components/BenchmarkDisplay.jsx
ADDED
@@ -0,0 +1,161 @@
import React, { useState } from "react";
import {
  Box,
  Typography,
  Paper,
  Button,
  Divider,
  Card,
  CardContent,
  Link,
  CircularProgress,
  Tooltip,
} from "@mui/material";
import PlayArrowIcon from "@mui/icons-material/PlayArrow";
import AssessmentIcon from "@mui/icons-material/Assessment";
import LinkIcon from "@mui/icons-material/Link";
import DownloadIcon from "@mui/icons-material/Download";
import CheckCircleIcon from "@mui/icons-material/CheckCircle";

/**
 * Component to display benchmark information and evaluation button
 *
 * @param {Object} props - Component props
 * @param {Array} props.sampleQuestions - Array of sample questions to display
 * @param {Function} props.onStartEvaluation - Function to call when evaluation button is clicked
 * @param {string} props.sessionId - Session ID used for the benchmark generation
 * @param {string} props.datasetUrl - URL to the Hugging Face dataset
 * @returns {JSX.Element} Benchmark display component
 */
const BenchmarkDisplay = ({
  sampleQuestions = [],
  onStartEvaluation,
  sessionId,
  datasetUrl,
}) => {
  const [isDownloading, setIsDownloading] = useState(false);

  // Default questions if none provided
  const questions =
    sampleQuestions.length > 0
      ? sampleQuestions
      : [
          {
            id: 1,
            question: "What are the key benefits of the described technology?",
            type: "single_shot",
          },
          {
            id: 2,
            question:
              "Based on the context about machine learning frameworks, how does TensorFlow compare to PyTorch in terms of deployment capabilities?",
            type: "multi_hop",
          },
        ];

  const handleEvaluationClick = () => {
    if (onStartEvaluation) {
      onStartEvaluation();
    }
  };

  const handleDownloadClick = async () => {
    if (!sessionId) return;

    setIsDownloading(true);
    try {
      // Request the dataset download
      const downloadUrl = `http://localhost:3001/download-dataset/${sessionId}`;

      // Create a temporary <a> element to trigger the download
      const link = document.createElement("a");
      link.href = downloadUrl;
      link.setAttribute("download", `yourbench_${sessionId}_dataset.zip`);
      document.body.appendChild(link);
      link.click();
      document.body.removeChild(link);
    } catch (error) {
      console.error("Error downloading the dataset:", error);
      alert("Download failed. Please try again.");
    } finally {
      setIsDownloading(false);
    }
  };

  return (
    <Box sx={{ width: "100%", mt: 3 }}>
      {/* Header with title and download button aligned */}
      <Box
        sx={{
          mb: 4,
          display: "flex",
          justifyContent: "space-between",
          alignItems: "center",
        }}
      >
        <Box sx={{ display: "flex", alignItems: "center" }}>
          <CheckCircleIcon color="success" sx={{ mr: 1.5, fontSize: 28 }} />
          <Typography variant="h6">Benchmark Created Successfully</Typography>
        </Box>

        <Tooltip title="Download the full benchmark">
          <Button
            variant="contained"
            color="primary"
            endIcon={
              isDownloading ? <CircularProgress size={16} /> : <DownloadIcon />
            }
            onClick={handleDownloadClick}
            disabled={isDownloading || !sessionId}
          >
            {isDownloading ? "Downloading..." : "Download Benchmark"}
          </Button>
        </Tooltip>
      </Box>

      <Typography variant="body2" color="text.secondary" sx={{ mb: 2 }}>
        Your benchmark has been generated. Here are some example questions:
      </Typography>

      <Box sx={{ mb: 3 }}>
        {questions.map((q, index) => (
          <Card
            key={q.id || index}
            variant="outlined"
            sx={{
              mb: 2,
              backgroundColor: "#fafafa",
            }}
          >
            <CardContent>
              <Typography
                variant="caption"
                color="text.secondary"
                sx={{ display: "block", mb: 1 }}
              >
                {q.type === "multi_hop"
                  ? "Multi-hop Question"
                  : "Single-shot Question"}
              </Typography>
              <Typography variant="body1">{q.question}</Typography>
            </CardContent>
          </Card>
        ))}
      </Box>

      <Box sx={{ display: "flex", justifyContent: "center", mt: 8 }}>
        <Button
          variant="contained"
          color="primary"
          size="large"
          startIcon={<AssessmentIcon />}
          onClick={handleEvaluationClick}
        >
          Start Evaluation
        </Button>
      </Box>
    </Box>
  );
};

export default BenchmarkDisplay;
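For context, here is how this component is consumed, simplified from `BenchmarkDisplayPage` in `App.js` above; the wrapper component name is only for illustration.

```jsx
// Simplified usage example based on BenchmarkDisplayPage in App.js.
import React from "react";
import { useNavigate, useSearchParams } from "react-router-dom";
import BenchmarkDisplay from "./components/BenchmarkDisplay";

function BenchmarkDisplayExample({ questions, datasetUrl }) {
  const navigate = useNavigate();
  const [searchParams] = useSearchParams();
  const sessionId = searchParams.get("session");

  return (
    <BenchmarkDisplay
      sessionId={sessionId}
      datasetUrl={datasetUrl}
      sampleQuestions={questions}
      onStartEvaluation={() =>
        navigate(`/benchmark-evaluation?session=${sessionId}`)
      }
    />
  );
}

export default BenchmarkDisplayExample;
```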
frontend/src/components/BenchmarkEvaluation.jsx
ADDED
@@ -0,0 +1,364 @@
1 |
+
import React, { useState, useEffect, useRef } from "react";
|
2 |
+
import {
|
3 |
+
Box,
|
4 |
+
Typography,
|
5 |
+
CircularProgress,
|
6 |
+
Alert,
|
7 |
+
Paper,
|
8 |
+
Divider,
|
9 |
+
Button,
|
10 |
+
} from "@mui/material";
|
11 |
+
import AccessTimeIcon from "@mui/icons-material/AccessTime";
|
12 |
+
import LogDisplay from "./LogDisplay";
|
13 |
+
import { useNavigate } from "react-router-dom";
|
14 |
+
|
15 |
+
// Evaluation steps
|
16 |
+
const EVALUATION_STEPS = [
|
17 |
+
"preparation",
|
18 |
+
"model_evaluation",
|
19 |
+
"results_compilation",
|
20 |
+
];
|
21 |
+
|
22 |
+
// Friendly step names for display
|
23 |
+
const STEP_LABELS = {
|
24 |
+
preparation: "Preparation",
|
25 |
+
model_evaluation: "Model Evaluation",
|
26 |
+
results_compilation: "Results Compilation",
|
27 |
+
};
|
28 |
+
|
29 |
+
/**
|
30 |
+
* Component to handle benchmark evaluation and display logs
|
31 |
+
*
|
32 |
+
* @param {Object} props - Component props
|
33 |
+
* @param {string} props.sessionId - Session ID of the benchmark to evaluate
|
34 |
+
* @param {Function} props.onComplete - Function to call when evaluation is complete
|
35 |
+
* @returns {JSX.Element} Benchmark evaluation component
|
36 |
+
*/
|
37 |
+
const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
|
38 |
+
const [evaluating, setEvaluating] = useState(false);
|
39 |
+
const [evaluationComplete, setEvaluationComplete] = useState(false);
|
40 |
+
const [evaluationLogs, setEvaluationLogs] = useState([]);
|
41 |
+
const [error, setError] = useState(null);
|
42 |
+
const [currentPhase, setCurrentPhase] = useState("initializing");
|
43 |
+
const [completedSteps, setCompletedSteps] = useState([]);
|
44 |
+
const [activeStep, setActiveStep] = useState(0);
|
45 |
+
const [elapsedTime, setElapsedTime] = useState(0);
|
46 |
+
|
47 |
+
const pollingIntervalRef = useRef(null);
|
48 |
+
const timerIntervalRef = useRef(null);
|
49 |
+
const startTimeRef = useRef(null);
|
50 |
+
|
51 |
+
const navigate = useNavigate();
|
52 |
+
|
53 |
+
// Start evaluation when component mounts
|
54 |
+
useEffect(() => {
|
55 |
+
// Set start time
|
56 |
+
startTimeRef.current = Date.now();
|
57 |
+
|
58 |
+
// Start timer
|
59 |
+
timerIntervalRef.current = setInterval(() => {
|
60 |
+
const timeElapsed = Math.floor(
|
61 |
+
(Date.now() - startTimeRef.current) / 1000
|
62 |
+
);
|
63 |
+
setElapsedTime(timeElapsed);
|
64 |
+
}, 1000);
|
65 |
+
|
66 |
+
startEvaluation();
|
67 |
+
|
68 |
+
// Clean up intervals on unmount
|
69 |
+
return () => {
|
70 |
+
if (pollingIntervalRef.current) {
|
71 |
+
clearInterval(pollingIntervalRef.current);
|
72 |
+
}
|
73 |
+
if (timerIntervalRef.current) {
|
74 |
+
clearInterval(timerIntervalRef.current);
|
75 |
+
}
|
76 |
+
};
|
77 |
+
}, []);
|
78 |
+
|
79 |
+
// Determine current phase and completed steps from logs
|
80 |
+
useEffect(() => {
|
81 |
+
if (evaluationLogs.length === 0) return;
|
82 |
+
|
83 |
+
// Check all logs for completed steps
|
84 |
+
const newCompletedSteps = [...completedSteps];
|
85 |
+
let newActiveStep = activeStep;
|
86 |
+
|
87 |
+
evaluationLogs.forEach((log) => {
|
88 |
+
// Detect completed steps (format: [SUCCESS] Stage completed: step_name)
|
89 |
+
const match = log.match(/\[SUCCESS\] Stage completed: (\w+)/);
|
90 |
+
if (match && match[1]) {
|
91 |
+
const completedStep = match[1].trim();
|
92 |
+
if (
|
93 |
+
EVALUATION_STEPS.includes(completedStep) &&
|
94 |
+
!newCompletedSteps.includes(completedStep)
|
95 |
+
) {
|
96 |
+
newCompletedSteps.push(completedStep);
|
97 |
+
// Set active step to index of next step
|
98 |
+
const stepIndex = EVALUATION_STEPS.indexOf(completedStep);
|
99 |
+
if (stepIndex >= 0 && stepIndex + 1 > newActiveStep) {
|
100 |
+
newActiveStep = stepIndex + 1;
|
101 |
+
if (newActiveStep >= EVALUATION_STEPS.length) {
|
102 |
+
newActiveStep = EVALUATION_STEPS.length;
|
103 |
+
}
|
104 |
+
}
|
105 |
+
}
|
106 |
+
}
|
107 |
+
});
|
108 |
+
|
109 |
+
// Update state if there are new completed steps
|
110 |
+
if (newCompletedSteps.length > completedSteps.length) {
|
111 |
+
setCompletedSteps(newCompletedSteps);
|
112 |
+
setActiveStep(newActiveStep);
|
113 |
+
}
|
114 |
+
|
115 |
+
// Check recent logs to determine current phase
|
116 |
+
const recentLogs = evaluationLogs.slice(-10);
|
117 |
+
|
118 |
+
// Detect completion conditions
|
119 |
+
const isComplete =
|
120 |
+
recentLogs.some((log) =>
|
121 |
+
log.includes("[SUCCESS] Evaluation completed")
|
122 |
+
) ||
|
123 |
+
completedSteps.includes("results_compilation") ||
|
124 |
+
newCompletedSteps.includes("results_compilation");
|
125 |
+
|
126 |
+
if (isComplete) {
|
127 |
+
setCurrentPhase("complete");
|
128 |
+
setEvaluationComplete(true);
|
129 |
+
// Stop polling when evaluation is complete
|
130 |
+
if (pollingIntervalRef.current) {
|
131 |
+
clearInterval(pollingIntervalRef.current);
|
132 |
+
}
|
133 |
+
if (timerIntervalRef.current) {
|
134 |
+
clearInterval(timerIntervalRef.current);
|
135 |
+
}
|
136 |
+
// Notify parent component that evaluation is complete
|
137 |
+
if (onComplete) {
|
138 |
+
onComplete({
|
139 |
+
success: true,
|
140 |
+
sessionId,
|
141 |
+
logs: evaluationLogs,
|
142 |
+
});
|
143 |
+
}
|
144 |
+
} else if (recentLogs.some((log) => log.includes("Comparing models"))) {
|
145 |
+
setCurrentPhase("compiling_results");
|
146 |
+
} else if (recentLogs.some((log) => log.includes("Starting evaluations"))) {
|
147 |
+
setCurrentPhase("evaluating");
|
148 |
+
} else if (recentLogs.some((log) => log.includes("Initialization"))) {
|
149 |
+
setCurrentPhase("preparing");
|
150 |
+
}
|
151 |
+
}, [evaluationLogs, completedSteps, activeStep, sessionId, onComplete]);
|
152 |
+
|
153 |
+
// Format elapsed time as HH:MM:SS
|
154 |
+
const formatElapsedTime = () => {
|
155 |
+
const hours = Math.floor(elapsedTime / 3600);
|
156 |
+
const minutes = Math.floor((elapsedTime % 3600) / 60);
|
157 |
+
const seconds = elapsedTime % 60;
|
158 |
+
|
159 |
+
return [
|
160 |
+
hours.toString().padStart(2, "0"),
|
161 |
+
minutes.toString().padStart(2, "0"),
|
162 |
+
seconds.toString().padStart(2, "0"),
|
163 |
+
].join(":");
|
164 |
+
};
|
165 |
+
|
166 |
+
// Start benchmark evaluation
|
167 |
+
const startEvaluation = async () => {
|
168 |
+
if (!sessionId) {
|
169 |
+
setError("Missing session ID");
|
170 |
+
return;
|
171 |
+
}
|
172 |
+
|
173 |
+
setEvaluating(true);
|
174 |
+
setEvaluationLogs([]);
|
175 |
+
setError(null);
|
176 |
+
setCurrentPhase("initializing");
|
177 |
+
setCompletedSteps([]);
|
178 |
+
setActiveStep(0);
|
179 |
+
|
180 |
+
try {
|
181 |
+
// Call API to start evaluation
|
182 |
+
const response = await fetch("http://localhost:3001/evaluate-benchmark", {
|
183 |
+
method: "POST",
|
184 |
+
headers: {
|
185 |
+
"Content-Type": "application/json",
|
186 |
+
},
|
187 |
+
body: JSON.stringify({
|
188 |
+
session_id: sessionId,
|
189 |
+
}),
|
190 |
+
});
|
191 |
+
|
192 |
+
const result = await response.json();
|
193 |
+
|
194 |
+
if (response.ok) {
|
195 |
+
setEvaluationLogs(result.logs || []);
|
196 |
+
|
197 |
+
// Set up polling to retrieve more logs
|
198 |
+
pollingIntervalRef.current = setInterval(async () => {
|
199 |
+
// Check if we're already done
|
200 |
+
if (evaluationComplete) {
|
201 |
+
clearInterval(pollingIntervalRef.current);
|
202 |
+
return;
|
203 |
+
}
|
204 |
+
|
205 |
+
try {
|
206 |
+
// Call API to get latest logs
|
207 |
+
const logsResponse = await fetch(
|
208 |
+
`http://localhost:3001/evaluation-logs/${sessionId}`
|
209 |
+
);
|
210 |
+
|
211 |
+
if (logsResponse.ok) {
|
212 |
+
const logsResult = await logsResponse.json();
|
213 |
+
|
214 |
+
// Update logs if there are new ones
|
215 |
+
if (
|
216 |
+
logsResult.logs &&
|
217 |
+
logsResult.logs.length > evaluationLogs.length
|
218 |
+
) {
|
219 |
+
setEvaluationLogs(logsResult.logs);
|
220 |
+
}
|
221 |
+
|
222 |
+
// Check if evaluation is complete
|
223 |
+
if (logsResult.is_completed) {
|
224 |
+
setEvaluationComplete(true);
|
225 |
+
clearInterval(pollingIntervalRef.current);
|
226 |
+
}
|
227 |
+
}
|
228 |
+
} catch (error) {
|
229 |
+
console.log("Error polling logs:", error);
|
230 |
+
// Don't stop polling on network errors
|
231 |
+
}
|
232 |
+
}, 2000); // Poll every 2 seconds
|
233 |
+
} else {
|
234 |
+
// Handle error
|
235 |
+
setEvaluationLogs([`Error: ${result.error || "Unknown error"}`]);
|
236 |
+
setError(result.error || "Benchmark evaluation failed");
|
237 |
+
}
|
238 |
+
} catch (error) {
|
239 |
+
console.error("Error starting evaluation:", error);
|
240 |
+
setEvaluationLogs([`Error: ${error.message || "Unknown error"}`]);
|
241 |
+
setError("Error connecting to server");
|
242 |
+
} finally {
|
243 |
+
setEvaluating(false);
|
244 |
+
}
|
245 |
+
};
|
246 |
+
|
247 |
+
// Get title based on current phase
|
248 |
+
const getPhaseTitle = () => {
|
249 |
+
switch (currentPhase) {
|
250 |
+
case "initializing":
|
251 |
+
return "Preparing evaluation...";
|
252 |
+
case "preparing":
|
253 |
+
return "Preparing models...";
|
254 |
+
case "evaluating":
|
255 |
+
return "Evaluating models...";
|
256 |
+
case "compiling_results":
|
257 |
+
return "Compiling results...";
|
258 |
+
case "complete":
|
259 |
+
return "Evaluation completed successfully!";
|
260 |
+
default:
|
261 |
+
return "Processing...";
|
262 |
+
}
|
263 |
+
};
|
264 |
+
|
265 |
+
// Get current step info for display
|
266 |
+
const getCurrentStepInfo = () => {
|
267 |
+
const totalSteps = EVALUATION_STEPS.length;
|
268 |
+
const currentStepIndex = activeStep;
|
269 |
+
|
270 |
+
// If no active step yet
|
271 |
+
if (currentStepIndex === 0 && completedSteps.length === 0) {
|
272 |
+
return `Starting... (0%)`;
|
273 |
+
}
|
274 |
+
|
275 |
+
// If all steps completed
|
276 |
+
if (currentStepIndex >= totalSteps) {
|
277 |
+
return `Completed (100%)`;
|
278 |
+
}
|
279 |
+
|
280 |
+
// Calculate percentage
|
281 |
+
const percentage = Math.round((currentStepIndex / totalSteps) * 100);
|
282 |
+
|
283 |
+
// Get current step name
|
284 |
+
const currentStepName =
|
285 |
+
STEP_LABELS[EVALUATION_STEPS[currentStepIndex]] || "Processing";
|
286 |
+
|
287 |
+
return `${currentStepName} (${percentage}%)`;
|
288 |
+
};
|
289 |
+
|
290 |
+
// Function to navigate to results page
|
291 |
+
const viewResults = () => {
|
292 |
+
navigate(`/evaluation-display?session=${sessionId}`);
|
293 |
+
};
|
294 |
+
|
295 |
+
return (
|
296 |
+
<Paper
|
297 |
+
elevation={3}
|
298 |
+
sx={{
|
299 |
+
p: 4,
|
300 |
+
mt: 3,
|
301 |
+
mb: 3,
|
302 |
+
display: "flex",
|
303 |
+
flexDirection: "column",
|
304 |
+
alignItems: "center",
|
305 |
+
justifyContent: "center",
|
306 |
+
minHeight: 200,
|
307 |
+
}}
|
308 |
+
>
|
309 |
+
{error ? (
|
310 |
+
<Alert severity="error" sx={{ width: "100%" }}>
|
311 |
+
{error}
|
312 |
+
</Alert>
|
313 |
+
) : (
|
314 |
+
<>
|
315 |
+
{evaluationComplete ? (
|
316 |
+
<>
|
317 |
+
<Alert severity="success" sx={{ width: "100%", mb: 3 }}>
|
318 |
+
Evaluation completed successfully!
|
319 |
+
</Alert>
|
320 |
+
<Button
|
321 |
+
variant="contained"
|
322 |
+
color="primary"
|
323 |
+
onClick={viewResults}
|
324 |
+
sx={{ mb: 3 }}
|
325 |
+
>
|
326 |
+
View Results Leaderboard
|
327 |
+
</Button>
|
328 |
+
</>
|
329 |
+
) : (
|
330 |
+
<>
|
331 |
+
<CircularProgress size={60} sx={{ mb: 2 }} />
|
332 |
+
<Typography variant="h6" component="div" gutterBottom>
|
333 |
+
{getPhaseTitle()}
|
334 |
+
</Typography>
|
335 |
+
|
336 |
+
{/* Step progress indicator */}
|
337 |
+
<Typography variant="body1" color="text.secondary">
|
338 |
+
{getCurrentStepInfo()}
|
339 |
+
</Typography>
|
340 |
+
|
341 |
+
{/* Timer display */}
|
342 |
+
<Box
|
343 |
+
sx={{
|
344 |
+
display: "flex",
|
345 |
+
alignItems: "center",
|
346 |
+
mt: 1,
|
347 |
+
color: "text.secondary",
|
348 |
+
opacity: 0.5,
|
349 |
+
}}
|
350 |
+
>
|
351 |
+
<Typography variant="body2">{formatElapsedTime()}</Typography>
|
352 |
+
</Box>
|
353 |
+
</>
|
354 |
+
)}
|
355 |
+
</>
|
356 |
+
)}
|
357 |
+
|
358 |
+
{/* Use the LogDisplay component for logs */}
|
359 |
+
<LogDisplay logs={evaluationLogs} height={300} />
|
360 |
+
</Paper>
|
361 |
+
);
|
362 |
+
};
|
363 |
+
|
364 |
+
export default BenchmarkEvaluation;
|
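For reference, the polling loop in BenchmarkEvaluation above expects the backend's `evaluation-logs` route to return a JSON body with a `logs` array and an `is_completed` flag. A minimal sketch of a client-side helper around that contract; the helper name is illustrative, while the URL and field names come from the component code.

```js
// Illustrative helper only; field names mirror what BenchmarkEvaluation reads.
export async function fetchEvaluationLogs(sessionId) {
  const response = await fetch(
    `http://localhost:3001/evaluation-logs/${sessionId}`
  );
  if (!response.ok) {
    throw new Error(`Log polling failed: ${response.status}`);
  }
  // Expected shape: { logs: string[], is_completed: boolean }
  return response.json();
}
```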
frontend/src/components/BenchmarkGenerator.jsx
ADDED
@@ -0,0 +1,398 @@
import React, { useState, useEffect, useRef } from "react";
import { Box, Typography, CircularProgress, Alert, Paper } from "@mui/material";
import PlayArrowIcon from "@mui/icons-material/PlayArrow";
import AccessTimeIcon from "@mui/icons-material/AccessTime";
import LogDisplay from "./LogDisplay";

// Define all benchmark steps in sequence
const BENCHMARK_STEPS = [
  "ingestion",
  "upload_ingest_to_hub",
  "summarization",
  "chunking",
  "single_shot_question_generation",
  "multi_hop_question_generation",
  "lighteval",
];

// Step labels for display (more user-friendly names)
const STEP_LABELS = {
  ingestion: "Ingestion",
  upload_ingest_to_hub: "Upload to Hub",
  summarization: "Summarization",
  chunking: "Chunking",
  single_shot_question_generation: "Single-shot QG",
  multi_hop_question_generation: "Multi-hop QG",
  lighteval: "LightEval",
};

/**
 * Component to handle benchmark generation and display logs
 *
 * @param {Object} props - Component props
 * @param {string} props.sessionId - The session ID for the uploaded file
 * @param {Function} props.onComplete - Function to call when generation is complete
 * @returns {JSX.Element} Benchmark generator component
 */
const BenchmarkGenerator = ({ sessionId, onComplete }) => {
  const [generating, setGenerating] = useState(false);
  const [generationComplete, setGenerationComplete] = useState(false);
  const [generationLogs, setGenerationLogs] = useState([]);
  const [error, setError] = useState(null);
  const [currentPhase, setCurrentPhase] = useState("initializing");
  const [completedSteps, setCompletedSteps] = useState([]);
  const [activeStep, setActiveStep] = useState(0);
  const [elapsedTime, setElapsedTime] = useState(0);

  // Reference to keep track of the polling interval
  const pollingIntervalRef = useRef(null);

  // Reference to keep track of the timer interval
  const timerIntervalRef = useRef(null);

  // Reference for starting time
  const startTimeRef = useRef(null);

  // Start generation on component mount
  useEffect(() => {
    // Set start time
    startTimeRef.current = Date.now();

    // Start timer
    timerIntervalRef.current = setInterval(() => {
      const timeElapsed = Math.floor(
        (Date.now() - startTimeRef.current) / 1000
      );
      setElapsedTime(timeElapsed);
    }, 1000);

    generateBenchmark();

    // Clean up the polling interval and timer when the component unmounts
    return () => {
      if (pollingIntervalRef.current) {
        clearInterval(pollingIntervalRef.current);
      }
      if (timerIntervalRef.current) {
        clearInterval(timerIntervalRef.current);
      }
    };
  }, []);

  // Determine the current phase and completed steps based on logs
  useEffect(() => {
    if (generationLogs.length === 0) return;

    // Check all logs for completed stages
    const newCompletedSteps = [...completedSteps];
    let newActiveStep = activeStep;

    generationLogs.forEach((log) => {
      const match = log.match(/\[SUCCESS\] Stage completed: (\w+)/);
      if (match && match[1]) {
        const completedStep = match[1].trim();
        if (
          BENCHMARK_STEPS.includes(completedStep) &&
          !newCompletedSteps.includes(completedStep)
        ) {
          newCompletedSteps.push(completedStep);
          // Set active step to the index of the next step
          const stepIndex = BENCHMARK_STEPS.indexOf(completedStep);
          if (stepIndex >= 0 && stepIndex + 1 > newActiveStep) {
            newActiveStep = stepIndex + 1;
            if (newActiveStep >= BENCHMARK_STEPS.length) {
              newActiveStep = BENCHMARK_STEPS.length;
            }
          }
        }
      }
    });

    // Update state if there are new completed steps
    if (newCompletedSteps.length > completedSteps.length) {
      setCompletedSteps(newCompletedSteps);
      setActiveStep(newActiveStep);
    }

    // Check the latest logs to determine the current phase
    const recentLogs = generationLogs.slice(-10); // Check more logs

    // Detect completion conditions
    const isComplete =
      recentLogs.some((log) =>
        log.includes("[SUCCESS] Ingestion process completed successfully")
      ) ||
      recentLogs.some((log) =>
        log.includes(
          "[SUCCESS] Configuration and ingestion completed successfully"
        )
      ) ||
      completedSteps.includes("lighteval") ||
      newCompletedSteps.includes("lighteval");

    if (isComplete) {
      setCurrentPhase("complete");
      setGenerationComplete(true);
      // Stop polling when benchmark is complete
      if (pollingIntervalRef.current) {
        clearInterval(pollingIntervalRef.current);
      }
      // Notify parent component that generation is complete
      if (onComplete) {
        console.log("Notifying parent that generation is complete");
        onComplete({
          success: true,
          sessionId,
          logs: generationLogs,
        });
      }
    } else if (
      recentLogs.some((log) => log.includes("starting benchmark creation"))
    ) {
      setCurrentPhase("benchmarking");
    } else if (
      recentLogs.some((log) => log.includes("Generating base configuration"))
    ) {
      setCurrentPhase("configuring");
    }
  }, [generationLogs, completedSteps, activeStep, sessionId, onComplete]);

  const generateBenchmark = async () => {
    if (!sessionId) {
      setError("Missing session ID");
      return;
    }

    setGenerating(true);
    setGenerationLogs([]);
    setError(null);
    setCurrentPhase("initializing");
    setCompletedSteps([]);
    setActiveStep(0);

    try {
      // Call the API to generate the benchmark
      const response = await fetch("http://localhost:3001/generate-benchmark", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          session_id: sessionId,
        }),
      });

      const result = await response.json();

      if (response.ok) {
        setGenerationLogs(result.logs || []);

        // First, start by polling the configuration logs
        const pollConfigLogs = async () => {
          try {
            // Call the API to get the config logs
            const configLogsResponse = await fetch(
              `http://localhost:3001/config-logs/${sessionId}`
            );

            if (configLogsResponse.ok) {
              const configLogsResult = await configLogsResponse.json();

              // Update logs if there are new ones
              if (
                configLogsResult.logs &&
                configLogsResult.logs.length > generationLogs.length
              ) {
                setGenerationLogs(configLogsResult.logs);
              }

              // If config task is completed, switch to polling benchmark logs
              if (configLogsResult.is_completed) {
                // Wait a short moment so the server can start the benchmark
                setTimeout(() => {
                  console.log(
                    "Configuration completed, switching to benchmark polling"
                  );
                  clearInterval(configPollingIntervalRef.current);
                  pollBenchmarkLogs();
                }, 1000);
              }
            }
          } catch (error) {
            console.log("Error polling for config logs:", error);
            // Don't stop polling on network errors
          }
        };

        // Function to poll the benchmark logs
        const pollBenchmarkLogs = async () => {
          // Set up polling for benchmark logs
          pollingIntervalRef.current = setInterval(async () => {
            // Check if we already completed
            if (generationComplete) {
              clearInterval(pollingIntervalRef.current);
              return;
            }

            try {
              // Call the API to get the latest benchmark logs
              const logsResponse = await fetch(
                `http://localhost:3001/benchmark-logs/${sessionId}`
              );

              if (logsResponse.ok) {
                const logsResult = await logsResponse.json();

                // Update logs if there are new ones
                if (
                  logsResult.logs &&
                  logsResult.logs.length > generationLogs.length
                ) {
                  setGenerationLogs(logsResult.logs);
                }

                // Check if the task is completed
                if (logsResult.is_completed) {
                  setGenerationComplete(true);
                  clearInterval(pollingIntervalRef.current);
                  // Notification is now handled in the useEffect above
                }
              }
            } catch (error) {
              console.log("Error polling for benchmark logs:", error);
              // Don't stop polling on network errors
            }
          }, 3000); // Poll every 3 seconds
        };

        // Start polling the configuration logs
        const configPollingIntervalRef = { current: null };
        configPollingIntervalRef.current = setInterval(pollConfigLogs, 1000); // Poll config logs more frequently (every second)
      } else {
        // Handle error
        setGenerationLogs([`Error: ${result.error || "Unknown error"}`]);
        setError(result.error || "Benchmark generation failed");
      }
    } catch (error) {
      console.error("Error generating benchmark:", error);
      setGenerationLogs([`Error: ${error.message || "Unknown error"}`]);
      setError("Server connection error");
    } finally {
      setGenerating(false);
    }
  };

  // Get title based on current phase
  const getPhaseTitle = () => {
    switch (currentPhase) {
      case "initializing":
        return "Benchmark generation...";
      case "configuring":
        return "Generating configuration file...";
      case "benchmarking":
        return "Creating benchmark...";
      case "complete":
        return "Benchmark generated successfully!";
      default:
        return "Processing...";
    }
  };

  // Get the current step information for display
  const getCurrentStepInfo = () => {
    const totalSteps = BENCHMARK_STEPS.length;
    const currentStepIndex = activeStep;

    // If there's no active step yet
    if (currentStepIndex === 0 && completedSteps.length === 0) {
      return `Starting... (0%)`;
    }

    // If all steps are completed
    if (currentStepIndex >= totalSteps) {
      return `Complete (100%)`;
    }

    // Calculate percentage
    const percentage = Math.round((currentStepIndex / totalSteps) * 100);

    // Get current step name
    const currentStepName =
      STEP_LABELS[BENCHMARK_STEPS[currentStepIndex]] || "Processing";

    return `${currentStepName} (${percentage}%)`;
  };

  // Format elapsed time in HH:MM:SS
  const formatElapsedTime = () => {
    const hours = Math.floor(elapsedTime / 3600);
    const minutes = Math.floor((elapsedTime % 3600) / 60);
    const seconds = elapsedTime % 60;

    return [
      hours.toString().padStart(2, "0"),
      minutes.toString().padStart(2, "0"),
      seconds.toString().padStart(2, "0"),
    ].join(":");
  };

  // If complete, stop the timer
  useEffect(() => {
    if (generationComplete && timerIntervalRef.current) {
      clearInterval(timerIntervalRef.current);
    }
  }, [generationComplete]);

  return (
    <Paper
      elevation={3}
      sx={{
        p: 4,
        mt: 3,
        mb: 3,
        display: "flex",
        flexDirection: "column",
        alignItems: "center",
        justifyContent: "center",
        minHeight: 200,
      }}
    >
      {error ? (
        <Alert severity="error" sx={{ width: "100%" }}>
          {error}
        </Alert>
      ) : (
        <>
          <CircularProgress size={60} sx={{ mb: 2 }} />
          <Typography variant="h6" component="div" gutterBottom>
            {getPhaseTitle()}
          </Typography>

          {/* Step progress indicator */}
          <Typography variant="body1" color="text.secondary">
            {getCurrentStepInfo()}
          </Typography>

          {/* Timer display */}
          <Box
            sx={{
              display: "flex",
              alignItems: "center",
              mt: 1,
              color: "text.secondary",
            }}
          >
            <Typography variant="body2" sx={{ opacity: 0.5 }}>
              {formatElapsedTime()}
            </Typography>
          </Box>
        </>
      )}

      {/* Use the LogDisplay component */}
      <LogDisplay logs={generationLogs} height={300} />
    </Paper>
  );
};

export default BenchmarkGenerator;
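For reference, a minimal sketch of how BenchmarkGenerator could be mounted by a parent view, based only on the props documented above (sessionId, onComplete) and the { success, sessionId, logs } object the component passes to onComplete; the parent component name and its wiring are hypothetical, not part of this commit:

// Hypothetical usage sketch (parent component and wiring assumed, not from this repo)
import React, { useState } from "react";
import BenchmarkGenerator from "./components/BenchmarkGenerator";

const GenerationStep = ({ sessionId }) => {
  const [generationResult, setGenerationResult] = useState(null);

  if (generationResult) {
    // onComplete was called with { success, sessionId, logs }
    return <div>Benchmark ready for session {generationResult.sessionId}</div>;
  }

  return (
    <BenchmarkGenerator sessionId={sessionId} onComplete={setGenerationResult} />
  );
};

export default GenerationStep;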
frontend/src/components/EvaluationDisplay.jsx
ADDED
@@ -0,0 +1,196 @@
import React, { useState, useEffect } from "react";
import {
  Box,
  Paper,
  Typography,
  Table,
  TableBody,
  TableCell,
  TableContainer,
  TableHead,
  TableRow,
  Alert,
  LinearProgress,
  Card,
  CardContent,
  Link,
} from "@mui/material";
import OpenInNewIcon from "@mui/icons-material/OpenInNew";

const EvaluationDisplay = ({ sessionId }) => {
  const [results, setResults] = useState(null);
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState(null);

  useEffect(() => {
    const fetchEvaluationResults = async () => {
      if (!sessionId) {
        setError("No session ID provided");
        setLoading(false);
        return;
      }

      try {
        // Fetch evaluation results from the API
        const response = await fetch(
          `http://localhost:3001/evaluation-results/${sessionId}`
        );

        if (!response.ok) {
          throw new Error(`Failed to fetch results: ${response.status}`);
        }

        const data = await response.json();

        if (!data.success) {
          throw new Error(data.message || "Failed to fetch evaluation results");
        }

        setResults(data.results);
      } catch (err) {
        console.error("Error fetching evaluation results:", err);
        setError(err.message);
      } finally {
        setLoading(false);
      }
    };

    fetchEvaluationResults();
  }, [sessionId]);

  // Format accuracy as percentage
  const formatAccuracy = (value) => {
    return `${(value * 100).toFixed(2)}%`;
  };

  // Format evaluation time
  const formatTime = (seconds) => {
    return `${seconds.toFixed(2)}s`;
  };

  if (loading) {
    return (
      <Box sx={{ width: "100%", mt: 4, mb: 4 }}>
        <Typography variant="h5" gutterBottom>
          Loading Evaluation Results...
        </Typography>
        <LinearProgress />
      </Box>
    );
  }

  if (error) {
    return (
      <Alert severity="error" sx={{ mt: 4, mb: 4 }}>
        {error}
      </Alert>
    );
  }

  if (
    !results ||
    !results.models_comparison ||
    results.models_comparison.length === 0
  ) {
    return (
      <Alert severity="info" sx={{ mt: 4, mb: 4 }}>
        No evaluation results found for this benchmark.
      </Alert>
    );
  }

  return (
    <Box sx={{ mt: 4, mb: 6 }}>
      <Typography variant="h4" gutterBottom>
        Evaluation Results
      </Typography>

      <TableContainer
        component={Paper}
        sx={{
          border: "1px solid rgba(224, 224, 224, 1)",
          boxShadow: "0 2px 4px rgba(0,0,0,0.05)",
        }}
      >
        <Table sx={{ minWidth: 650 }}>
          <TableHead>
            <TableRow>
              <TableCell>Rank</TableCell>
              <TableCell>Model</TableCell>
              <TableCell>Provider</TableCell>
              <TableCell align="center">Accuracy</TableCell>
              <TableCell align="center">Std Error</TableCell>
              <TableCell align="center">Eval Time</TableCell>
              <TableCell align="center">Status</TableCell>
            </TableRow>
          </TableHead>
          <TableBody>
            {results.models_comparison.map((model, index) => (
              <TableRow
                key={`${model.model_name}-${model.provider}`}
                sx={{
                  "&:last-child td, &:last-child th": { border: 0 },
                  backgroundColor: model.success
                    ? "inherit"
                    : "rgba(0, 0, 0, 0.04)",
                }}
              >
                <TableCell>{index + 1}</TableCell>
                <TableCell component="th" scope="row">
                  <Link
                    href={`https://huggingface.co/${model.model_name}`}
                    target="_blank"
                    rel="noopener noreferrer"
                    sx={{
                      textDecoration: "none",
                      "&:hover": {
                        textDecoration: "underline",
                      },
                      display: "flex",
                      alignItems: "center",
                    }}
                  >
                    {model.model_name}
                    <OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
                  </Link>
                </TableCell>
                <TableCell>{model.provider}</TableCell>
                <TableCell align="center">
                  {model.success ? formatAccuracy(model.accuracy) : "-"}
                </TableCell>
                <TableCell align="center">
                  {model.success ? formatAccuracy(model.accuracy_stderr) : "-"}
                </TableCell>
                <TableCell align="center">
                  {model.success ? formatTime(model.evaluation_time) : "-"}
                </TableCell>
                <TableCell align="center">
                  {model.success ? (
                    <span style={{ color: "green" }}>✓ Success</span>
                  ) : (
                    <span style={{ color: "red" }}>✗ Failed</span>
                  )}
                </TableCell>
              </TableRow>
            ))}
          </TableBody>
        </Table>
      </TableContainer>

      <Box sx={{ mt: 4, textAlign: "center" }}>
        <Typography variant="body2" color="textSecondary">
          Need larger evaluation?{" "}
          <Link
            href="https://huggingface.co/spaces/yourbench/yourbench"
            target="_blank"
            rel="noopener noreferrer"
          >
            Go to this page
          </Link>
        </Typography>
      </Box>
    </Box>
  );
};

export default EvaluationDisplay;
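For context, EvaluationDisplay only reads a handful of fields from the /evaluation-results response: a success flag, an optional message, and results.models_comparison entries with model_name, provider, accuracy, accuracy_stderr, evaluation_time, and success. A sketch of a payload it can render follows; the values are illustrative placeholders, not real results:

// Illustrative payload shape for GET /evaluation-results/<session_id> (values are placeholders)
const exampleResponse = {
  success: true,
  results: {
    models_comparison: [
      {
        model_name: "org/model-a", // rendered as a link to https://huggingface.co/org/model-a
        provider: "provider-x",
        accuracy: 0.7512, // shown as 75.12%
        accuracy_stderr: 0.0123, // shown as 1.23%
        evaluation_time: 42.7, // shown as 42.70s
        success: true,
      },
      {
        model_name: "org/model-b",
        provider: "provider-y",
        success: false, // failed rows show "-" in metric columns and "✗ Failed"
      },
    ],
  },
};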
frontend/src/components/Footer/Footer.js
ADDED
@@ -0,0 +1,30 @@
import React from "react";
import { Box, Typography, Link } from "@mui/material";

const Footer = () => {
  return (
    <Box
      component="footer"
      sx={{
        width: "100%",
        py: 4,
        textAlign: "center",
      }}
    >
      <Typography variant="body2" color="text.secondary" sx={{ mx: 4 }}>
        © 2024 Hugging Face - Open LLM Leaderboard - Made with 🤗 by the HF team
        -{" "}
        <Link
          href="https://huggingface.co"
          target="_blank"
          rel="noopener noreferrer"
          color="inherit"
        >
          huggingface.co
        </Link>
      </Typography>
    </Box>
  );
};

export default Footer;
frontend/src/components/LogDisplay.jsx
ADDED
@@ -0,0 +1,67 @@
import React, { useRef, useEffect } from "react";
import { Box, Typography } from "@mui/material";

/**
 * A reusable component for displaying logs with auto-scrolling and styling
 *
 * @param {Object} props - Component props
 * @param {Array<string>} props.logs - Array of log messages to display
 * @param {number} props.height - Height of the log container in pixels (default: 300)
 * @param {Object} props.containerStyle - Additional styles for the container
 * @returns {JSX.Element} Log display component
 */
const LogDisplay = ({ logs = [], height = 300, containerStyle = {} }) => {
  const logsEndRef = useRef(null);

  // Auto-scroll logs to bottom when new logs are added
  useEffect(() => {
    if (logsEndRef.current) {
      logsEndRef.current.scrollIntoView({ behavior: "smooth" });
    }
  }, [logs]);

  return (
    <Box
      sx={{
        mt: 3,
        width: "100%",
        height: `${height}px`,
        overflowY: "auto",
        backgroundColor: "#f9f9f9",
        p: 2,
        borderRadius: 1,
        fontFamily: "monospace",
        fontSize: "0.85rem",
        border: "1px solid #e0e0e0",
        ...containerStyle,
      }}
    >
      {logs.length === 0 ? (
        <Typography color="text.secondary" variant="body2">
          Waiting for logs...
        </Typography>
      ) : (
        logs.map((log, index) => {
          // Style logs based on content
          let style = { opacity: 0.7 };
          if (log.includes("[ERROR]")) {
            style = { ...style, color: "#d32f2f" }; // Red for errors
          } else if (log.includes("[WARN]")) {
            style = { ...style, color: "#ed6c02" }; // Orange for warnings
          } else if (log.includes("[SUCCESS]")) {
            style = { ...style, color: "#2e7d32", opacity: 0.8 }; // Green for success
          }

          return (
            <div key={index} style={{ ...style, marginBottom: "4px" }}>
              {log}
            </div>
          );
        })
      )}
      <div ref={logsEndRef} />
    </Box>
  );
};

export default LogDisplay;
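A short usage sketch for LogDisplay, assuming the log strings carry the [ERROR]/[WARN]/[SUCCESS] markers that drive the colour rules above; the sample messages and the wrapping component are illustrative only, not part of this commit:

// Hypothetical wrapper showing how LogDisplay is fed (sample log lines are made up)
import React from "react";
import LogDisplay from "./LogDisplay";

const sampleLogs = [
  "[INFO] Starting ingestion",
  "[WARN] Chunk size close to the context limit", // rendered in orange
  "[SUCCESS] Stage completed: ingestion", // rendered in green
  "[ERROR] Upload to Hub failed", // rendered in red
];

const LogsPanel = () => (
  <LogDisplay logs={sampleLogs} height={200} containerStyle={{ mt: 1 }} />
);

export default LogsPanel;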
frontend/src/components/Logo/HFLogo.js
ADDED
@@ -0,0 +1,19 @@
import React from 'react';

const HFLogo = () => (
  <svg width="100%" viewBox="0 0 236 220" version="1.1" xmlns="http://www.w3.org/2000/svg" >
    <title>hg-logo</title>
    <g id="hg-logo" stroke="none" strokeWidth="1" fill="none">
      <g id="Group" transform="translate(-1.000000, 0.000000)">
<path d="M236.188357,161.726225 C235.570415,159.393906 234.569281,157.181253 233.22638,155.176863 C233.514062,154.120588 233.732701,153.048205 233.879994,151.965466 C234.832798,145.089325 232.449638,138.794251 227.956041,133.922501 C225.522249,131.262254 222.913547,129.506398 220.150646,128.428262 C221.964195,120.669591 222.882477,112.729122 222.88708,104.761037 C222.88708,101.122758 222.681099,97.5581193 222.335881,94.046409 C222.155216,92.2928551 221.937728,90.5427531 221.683417,88.7984042 C220.891716,83.6516545 219.717972,78.5709507 218.171392,73.5986359 C217.1576,70.3316637 215.985007,67.1160095 214.658216,63.9632945 C212.668606,59.2945148 210.345284,54.7746261 207.706662,50.4388375 C205.974815,47.5549087 204.089921,44.7659066 202.058884,42.0841428 C201.063504,40.7298561 200.026697,39.4075568 198.947313,38.1190859 C195.750588,34.2338824 192.277687,30.5855928 188.552777,27.2030978 C187.316894,26.0660493 186.045339,24.9682371 184.739261,23.9111571 C183.453897,22.8390039 182.139764,21.8011393 180.799165,20.798714 C178.100706,18.7906417 175.311338,16.9066068 172.44142,15.1525926 C156.583223,5.52185376 137.986291,0 118.109749,0 C60.2385495,0 13.3336831,46.9018135 13.3336831,104.76564 C13.3321871,112.833829 14.2670394,120.874403 16.1195981,128.726274 C13.6340233,129.805561 11.2508635,131.486626 9.04261448,133.920199 C4.55016831,138.788498 2.16585774,145.06171 3.11981211,151.9367 C3.26365324,153.029795 3.48229176,154.111383 3.77227548,155.174561 C2.4301802,157.180102 1.42939115,159.393906 0.810298929,161.726225 C-0.570575919,166.97423 -0.116037948,171.706754 1.63882384,175.85246 C-0.267934182,181.273058 0.208467641,187.044598 2.69289164,192.062477 C4.49953623,195.727221 7.08522438,198.561213 10.2715931,201.096041 C14.0609438,204.107229 18.8042489,206.667372 24.5268244,209.121657 C31.3529491,212.032741 39.6842274,214.767779 43.4735781,215.771124 C53.2616793,218.305953 62.6470253,219.912227 72.1599872,219.989319 C85.7109724,220.115888 97.3816663,216.928654 105.738261,208.774168 C109.842911,209.276992 113.974028,209.528979 118.109749,209.527828 C122.479067,209.518623 126.843782,209.242473 131.179729,208.70398 C139.51561,216.910244 151.231182,220.126243 164.8328,219.996223 C174.343575,219.921432 183.728921,218.315158 193.491706,215.776877 C197.306373,214.773532 205.63535,212.038494 212.464927,209.128561 C218.187502,206.668523 222.929657,204.109531 226.745474,201.101795 C229.907678,198.568116 232.491064,195.732974 234.29886,192.068231 C236.8086,187.050351 237.260836,181.278811 235.378244,175.858213 C237.116995,171.712507 237.568081,166.969627 236.188357,161.726225 Z M226.477354,175.501519 C228.400223,179.150153 228.523351,183.272846 226.826025,187.112485 C224.252995,192.932351 217.861846,197.515294 205.448932,202.436521 C197.730992,205.498336 190.662064,207.4544 190.599924,207.47281 C180.390656,210.1204 171.157207,211.464332 163.164243,211.464332 C149.928557,211.464332 140.080618,207.813396 133.834461,200.601272 C123.271919,202.399701 112.486136,202.460684 101.904031,200.781921 C95.6509699,207.874379 85.857115,211.464332 72.7330503,211.464332 C64.7390507,211.464332 55.5067517,210.1204 45.2974836,207.47281 C45.2341935,207.4544 38.1698679,205.498336 30.4484761,202.436521 C18.0355619,197.515294 11.6432621,192.934652 9.07138271,187.112485 C7.37405737,183.272846 7.49718538,179.150153 9.4200536,175.501519 C9.59841661,175.163235 9.7882869,174.831854 9.99196594,174.513131 C8.83939573,172.78259 8.06645104,170.826526 7.72364885,168.77611 C7.38096175,166.725695 7.47635718,164.624652 8.00350616,162.613358 
C8.76759024,159.711479 10.3463905,157.297466 12.489048,155.563473 C11.4573043,153.86745 10.7801003,151.980424 10.4982867,150.015155 C9.88149595,145.74173 11.2991941,141.474059 14.4913165,137.995716 C16.9757405,135.288294 20.4889162,133.798233 24.3795311,133.798233 L24.4830967,133.798233 C21.5502336,124.39877 20.0630314,114.608094 20.0723523,104.762188 C20.0723523,51.0601755 63.612487,7.52279222 117.324951,7.52279222 C171.038681,7.52279222 214.577665,51.0567236 214.577665,104.762188 C214.58457,114.634558 213.087471,124.450548 210.137002,133.871873 C210.606499,133.825848 211.066791,133.801685 211.517877,133.801685 C215.407341,133.801685 218.921668,135.290595 221.406092,137.998018 C224.595912,141.474059 226.017063,145.745182 225.399121,150.017456 C225.117193,151.982725 224.440564,153.870902 223.40836,155.566925 C225.551018,157.299767 227.129818,159.71378 227.893902,162.61681 C228.419785,164.628104 228.515296,166.727996 228.172378,168.779562 C227.829461,170.829977 227.057322,172.784891 225.905442,174.516583 C226.109121,174.831854 226.301293,175.163235 226.477354,175.501519 Z" id="Shape" fill="#FFFFFF" fillRule="nonzero"></path>
<path d="M226.52977,174.037682 C227.682419,172.305523 228.455074,170.350082 228.798221,168.299114 C229.141367,166.246994 229.045793,164.146536 228.519558,162.134699 C227.754964,159.232038 226.175108,156.817373 224.031019,155.082913 C225.062761,153.386432 225.740993,151.498897 226.02311,149.533098 C226.640313,145.258521 225.221668,140.989698 222.027412,137.510418 C219.541328,134.802265 216.024653,133.311802 212.13259,133.311802 C211.680051,133.311802 211.220603,133.334821 210.750792,133.382009 C213.699779,123.958143 215.195575,114.139506 215.186363,104.265624 C215.186363,50.5501622 171.617132,7 117.873265,7 C64.1293973,7 20.5555606,50.5455585 20.5555606,104.265624 C20.5462334,114.114185 22.0344295,123.907502 24.9692525,133.3095 L24.8656177,133.3095 C20.9735543,133.3095 17.4580309,134.799963 14.9719466,137.508116 C11.7799941,140.985094 10.3590456,145.256219 10.9762485,149.530796 C11.2580201,151.496595 11.9356766,153.384131 12.9683401,155.080611 C10.8242508,156.815072 9.24439546,159.229736 8.48095227,162.133548 C7.95379648,164.145385 7.85868274,166.246994 8.20205945,168.299114 C8.54543616,170.350082 9.31935798,172.306674 10.4730439,174.037682 C10.2669257,174.356491 10.07808,174.687961 9.90074934,175.026336 C7.97774764,178.675955 7.85338585,182.79976 9.55184544,186.640434 C12.1254435,192.460719 18.522015,197.0472 30.9432242,201.968603 C38.6663215,205.031245 45.7399738,206.987836 45.8021547,207.006251 C56.0182452,209.654556 65.2567139,211 73.2550191,211 C86.3890056,211 96.1882538,207.409079 102.446646,200.313557 C113.035821,201.992773 123.828812,201.931773 134.398413,200.13286 C140.647592,207.346928 150.503264,211 163.747794,211 C171.7461,211 180.98572,209.654556 191.20181,207.006251 C191.263991,206.987836 198.334189,205.031245 206.060741,201.968603 C218.48195,197.0472 224.878521,192.460719 227.45212,186.640434 C229.150579,182.79976 229.027369,178.675955 227.103216,175.026336 C226.927036,174.684508 226.733585,174.354189 226.52977,174.037682 Z M97.9684697,189.207022 C97.4295686,190.149639 96.8526681,191.069237 96.2377682,191.963514 C94.6199135,194.33099 92.4919451,196.139111 90.0231334,197.484555 C85.30084,200.056898 79.3257167,200.95693 73.2538676,200.95693 C63.6641921,200.95693 53.832702,198.713755 48.3216324,197.284293 C48.0510304,197.214085 14.5435894,187.75454 18.7857081,179.70259 C19.4996369,178.349089 20.6753163,177.808149 22.1538398,177.808149 C28.1266601,177.808149 39.0026741,186.697981 43.677756,186.697981 C44.7210132,186.697981 45.4578568,186.252568 45.7583978,185.169537 C47.7504894,178.027978 15.4820603,175.026336 18.1995956,164.686325 C18.6797703,162.856336 19.9798115,162.113982 21.8095419,162.113982 C29.7053639,162.112831 47.4292214,175.993123 51.1358936,175.993123 C51.4203136,175.993123 51.6241287,175.910255 51.7346726,175.735313 L51.7830355,175.655898 C53.5217975,172.784312 52.5246002,170.696514 40.6042927,163.399578 L39.4597036,162.703262 C26.3441411,154.767556 17.1367629,149.993472 22.3737759,144.296338 C22.9760094,143.638002 23.8292694,143.346815 24.8667692,143.346815 C26.0977205,143.346815 27.5866075,143.758851 29.2263407,144.448261 C36.1537528,147.368187 45.7549433,155.331515 49.7656109,158.80504 C50.9481994,159.833977 51.6448557,160.462389 51.6448557,160.462389 C51.6448557,160.462389 56.722962,165.740582 59.7940072,165.740582 C60.501027,165.740582 61.099806,165.463207 61.5062848,164.773796 C63.684919,161.104611 41.282525,144.137509 40.0193317,137.137514 C39.1637686,132.393355 40.6204136,129.991351 43.3160705,129.991351 C44.5965363,129.991351 46.1602706,130.535744 
47.8863662,131.630284 C53.240832,135.027848 63.5789812,152.784493 67.3639552,159.691261 C68.6329061,162.006945 70.7988738,162.985241 72.750663,162.985241 C76.6231508,162.985241 79.6504392,159.137661 73.1053244,154.248484 C63.2680768,146.890548 66.7202678,134.865566 71.4149253,134.125514 C71.6152859,134.094439 71.8179496,134.078326 72.0194617,134.077175 C76.2892164,134.077175 78.1730672,141.431658 78.1730672,141.431658 C78.1730672,141.431658 83.6921972,155.286628 93.1747834,164.756532 C101.779928,173.352875 102.980941,180.408114 97.9684697,189.207022 Z M128.631711,190.829842 L128.140021,190.88854 L127.300579,190.985218 C126.859555,191.030105 126.418531,191.07384 125.975205,191.115274 L125.542241,191.154406 L125.148429,191.187783 L124.58765,191.23267 C124.381531,191.247632 124.175413,191.263745 123.969295,191.276405 L123.352092,191.317839 L123.216215,191.325896 L122.730283,191.354669 L122.524165,191.365027 L121.948416,191.393801 L121.279396,191.421423 L120.671405,191.44214 L120.266077,191.453649 L120.061111,191.453649 C119.934446,191.453649 119.808933,191.460555 119.682268,191.461706 L119.480756,191.461706 C119.354091,191.461706 119.228578,191.461706 119.101913,191.468612 L118.587193,191.474366 L117.866356,191.474366 C117.30097,191.474366 116.737888,191.468612 116.174805,191.458253 L115.718812,191.447895 C115.589844,191.447895 115.459725,191.440989 115.330757,191.437536 L114.848279,191.422574 L114.247197,191.399555 L113.707145,191.376537 L113.566662,191.370782 L113.051942,191.34316 C112.909157,191.335103 112.766371,191.328197 112.624737,191.31899 L112.291954,191.299424 C111.87396,191.272952 111.455966,191.243028 111.037972,191.210802 L110.600403,191.176274 L110.047684,191.129085 L109.401694,191.069237 C109.041275,191.03586 108.680856,190.99903 108.320437,190.958747 L108.303164,190.958747 C113.56551,179.224952 110.904399,168.266887 100.270314,157.646048 C93.2968422,150.685185 88.6563052,140.403871 87.6948043,138.146885 C85.7464697,131.4657 80.58891,124.038709 72.0252192,124.038709 C71.300927,124.03986 70.5789377,124.097406 69.8638574,124.210198 C66.1111254,124.799478 62.831659,126.958634 60.491815,130.206576 C57.9642769,127.063369 55.5058286,124.564687 53.2834374,123.152488 C49.9325781,121.030161 46.5897794,119.952885 43.33104,119.952885 C39.2662519,119.952885 35.6309727,121.621743 33.097677,124.648705 L33.0343446,124.725818 L32.8915589,124.12618 L32.8858014,124.100859 C32.4040146,122.040683 31.9992631,119.962092 31.6721225,117.871992 C31.6721225,117.85703 31.6721225,117.843219 31.6652135,117.829408 L31.5938206,117.356373 C31.5552454,117.100865 31.5175914,116.844206 31.4809738,116.588698 C31.4510349,116.375775 31.4210959,116.162852 31.39346,115.95108 C31.365824,115.739307 31.3358851,115.526384 31.3105521,115.313461 C31.2829161,115.099387 31.2575832,114.886464 31.2322502,114.67354 C31.2079536,114.470976 31.1845782,114.268411 31.1620089,114.065846 L31.155215,114.014054 C31.0513499,113.079494 30.9623391,112.143782 30.8879523,111.20692 L30.8476499,110.664829 C30.8361349,110.516359 30.8257714,110.366737 30.8165594,110.215964 C30.8165594,110.181436 30.8119535,110.145757 30.8096505,110.113531 L30.7717662,109.512742 C30.7579482,109.295215 30.7474696,109.077688 30.7359546,108.860161 C30.7244396,108.643785 30.7129246,108.426258 30.7037126,108.20758 L30.6806827,107.637867 L30.6737737,107.465226 L30.6565012,106.938098 L30.6439499,106.491534 C30.6439499,106.313139 30.6357743,106.133593 30.6323198,105.955198 L30.6231078,105.39239 C30.6208048,105.204788 30.6231078,105.018336 30.6185018,104.830733 
C30.6127443,104.64198 30.6185018,104.454377 30.6185018,104.265624 C30.6185018,56.0965241 69.6899812,17.0441057 117.887083,17.0441057 C166.084184,17.0441057 205.154512,56.0942222 205.154512,104.265624 L205.154512,105.39239 C205.154512,105.579993 205.149906,105.768746 205.1453,105.955198 C205.1453,106.111725 205.139542,106.2648 205.134936,106.421327 C205.134936,106.560591 205.129179,106.698703 205.123421,106.833362 C205.123421,107.009456 205.113058,107.184398 205.1073,107.360491 L205.1073,107.375453 C205.092331,107.757564 205.07621,108.139675 205.060089,108.521786 L205.05318,108.648389 L205.023241,109.219253 L204.995605,109.671571 C204.931121,110.743093 204.847062,111.814615 204.744579,112.883834 L204.744579,112.898797 C204.726155,113.09906 204.705428,113.300473 204.683549,113.500736 C204.632883,113.966865 204.581066,114.432995 204.529248,114.899124 L204.470522,115.367555 L204.397978,115.917702 C204.372645,116.119116 204.345009,116.319379 204.316221,116.519642 C204.285131,116.744075 204.251737,116.967356 204.219495,117.190638 L204.138891,117.717767 L204.044468,118.316254 C204.012226,118.515366 203.979984,118.713327 203.941984,118.912439 C203.902833,119.1104 203.872894,119.308361 203.83835,119.507473 C203.76926,119.903395 203.697867,120.298166 203.620716,120.692937 C203.467567,121.479026 203.307509,122.262814 203.138239,123.045451 C203.095633,123.239959 203.051876,123.435618 203.006968,123.630126 C200.550823,121.244235 197.298992,119.944828 193.710924,119.944828 C190.45564,119.944828 187.109386,121.020954 183.759679,123.144431 C181.536136,124.555479 179.079991,127.055313 176.550149,130.19852 C174.206851,126.949427 170.927385,124.791421 167.179258,124.200991 C166.464178,124.08935 165.741037,124.031803 165.016745,124.030652 C156.450751,124.030652 151.296646,131.457644 149.34716,138.138829 C148.381053,140.395815 143.740516,150.675978 136.758984,157.648349 C126.13296,168.234661 123.451121,179.144386 128.631711,190.829842 Z M218.724916,167.341535 L218.690371,167.443968 C218.66619,167.509571 218.642008,167.575175 218.615524,167.639627 C218.57407,167.737457 218.530313,167.832984 218.484253,167.928512 C218.372558,168.156398 218.245893,168.377377 218.106562,168.58915 C217.914261,168.875733 217.703537,169.149656 217.474389,169.407465 C217.42142,169.467314 217.370754,169.526012 217.310876,169.584709 C217.230272,169.673332 217.143909,169.759652 217.058698,169.845972 C215.507631,171.382472 213.144757,172.727916 210.473281,173.964022 C210.170437,174.100983 209.864139,174.237945 209.553234,174.374906 L209.244633,174.511868 C209.038515,174.602792 208.833548,174.693716 208.617066,174.782338 C208.406342,174.872111 208.194467,174.960733 207.982591,175.048204 L207.340055,175.31407 C205.83735,175.932123 204.297797,176.520251 202.795092,177.102625 L202.153708,177.351227 L201.518081,177.59983 C201.096633,177.764414 200.67979,177.928997 200.268706,178.093581 L199.65726,178.339882 L199.05733,178.586182 L198.761395,178.709332 C198.56564,178.7922 198.372188,178.872765 198.18104,178.955633 C193.76159,180.850074 190.583456,182.777892 191.251325,185.170688 C191.269749,185.238594 191.290476,185.303046 191.313506,185.365197 C191.373383,185.542441 191.45514,185.710477 191.556472,185.867005 C191.61635,185.961382 191.685439,186.050004 191.76259,186.130569 C192.547911,186.945432 193.97692,186.816527 195.779015,186.169701 C195.925255,186.116758 196.07495,186.060362 196.225796,186.000513 C196.329431,185.961382 196.431914,185.919948 196.535549,185.878514 L196.691001,185.812911 C197.103238,185.637968 197.539655,185.44346 
197.986437,185.230537 C198.098132,185.178745 198.210979,185.128104 198.323826,185.068255 C200.526641,183.99213 203.02424,182.540799 205.502264,181.220675 C205.882259,181.014658 206.263404,180.817847 206.642247,180.622188 C207.261753,180.301077 207.887017,179.991475 208.518038,179.695685 C210.86479,178.593088 213.069909,177.810451 214.844368,177.810451 C215.675749,177.810451 216.411556,177.98079 217.023002,178.372108 L217.125485,178.440013 C217.435238,178.658691 217.704689,178.926859 217.924625,179.23531 C217.975291,179.307819 218.02826,179.38263 218.073168,179.459743 C218.121531,179.539157 218.167591,179.620874 218.211348,179.703741 C219.087638,181.365693 218.354134,183.088645 216.638402,184.777068 C214.990608,186.397586 212.427373,187.987029 209.512932,189.459077 C209.297602,189.568416 209.079969,189.677755 208.861184,189.783641 C200.189252,194.039803 188.835482,197.245161 188.676575,197.285443 C185.650438,198.069231 181.323109,199.099319 176.448818,199.869295 L175.726828,199.980936 L175.609375,199.9982 C174.503937,200.163935 173.395045,200.310104 172.283849,200.436707 L172.181366,200.447065 C170.160487,200.677253 168.08779,200.844138 166.011639,200.914346 L165.980549,200.914346 C165.234378,200.941968 164.489359,200.954628 163.743188,200.954628 L162.884171,200.954628 C161.746491,200.938515 160.609962,200.887874 159.475737,200.800403 C159.449252,200.800403 159.421616,200.800403 159.39398,200.794648 C158.988653,200.763573 158.582174,200.725592 158.177998,200.680705 C157.482494,200.605895 156.78814,200.51382 156.09609,200.405632 C155.826639,200.360745 155.538765,200.313557 155.261254,200.265217 C155.134589,200.242199 155.009076,200.21918 154.883563,200.193859 L154.853624,200.188105 C154.454054,200.112143 154.056787,200.028125 153.660672,199.937201 C153.431524,199.885408 153.201224,199.833616 152.974379,199.772617 L152.838502,199.736938 C152.725655,199.709315 152.615111,199.679391 152.504568,199.649467 L152.443538,199.633353 L152.087725,199.53092 C151.958758,199.49409 151.830941,199.456109 151.701974,199.418128 L151.655914,199.404317 L151.320828,199.301884 C151.194163,199.262752 151.06865,199.221318 150.943136,199.181036 C150.839501,199.146507 150.737018,199.110828 150.633383,199.077451 L150.406538,198.998037 C150.187754,198.922075 149.971272,198.841509 149.75479,198.758642 L149.549824,198.679227 L149.380554,198.611322 C149.05468,198.480115 148.729957,198.343154 148.408689,198.199287 L148.194511,198.096854 L148.158814,198.08074 C148.082815,198.045061 148.007968,198.009382 147.931969,197.977156 C147.783426,197.905798 147.636034,197.832138 147.489794,197.757327 L147.446037,197.73546 L147.234161,197.623819 C146.857621,197.42816 146.48799,197.218689 146.12642,196.995408 L145.928362,196.874559 C145.825879,196.812409 145.732608,196.749107 145.63473,196.685806 L145.376795,196.514316 C145.283524,196.452166 145.190252,196.388864 145.099284,196.323261 L144.933468,196.205865 C144.75844,196.080413 144.586867,195.952659 144.417597,195.820302 C144.32778,195.751245 144.239115,195.679887 144.150449,195.614284 C143.953543,195.456606 143.761243,195.294324 143.571246,195.127438 L143.563185,195.120532 C143.377794,194.960552 143.197009,194.794817 143.02083,194.62563 L143.011618,194.61412 C142.919498,194.526649 142.829681,194.439178 142.739864,194.348254 C142.651199,194.25733 142.561382,194.167557 142.47502,194.076633 C142.387506,193.984558 142.299992,193.891332 142.214781,193.795805 C142.131873,193.706032 142.051269,193.615108 141.970664,193.523033 L141.943028,193.491958 C141.527337,193.009715 
141.138131,192.504454 140.777712,191.978476 L140.634926,191.767855 C140.540503,191.628592 140.447232,191.488178 140.355112,191.347763 L140.083358,190.929973 C139.949785,190.726258 139.819665,190.52024 139.693001,190.313072 C139.602032,190.165752 139.512215,190.018432 139.42355,189.869961 C139.348703,189.743358 139.273855,189.617906 139.202462,189.493605 C139.16216,189.4257 139.124161,189.356644 139.086161,189.287587 C139.048162,189.216229 139.003253,189.14372 138.962951,189.071211 L138.898467,188.957269 C138.876589,188.919288 138.86968,188.907778 138.858165,188.882458 C138.779863,188.745496 138.703864,188.606233 138.630168,188.464668 C138.596775,188.403668 138.563381,188.341518 138.527685,188.280518 L138.42405,188.082557 L138.321566,187.885747 L138.123509,187.486372 C138.091267,187.420769 138.060176,187.355165 138.029086,187.289562 C137.961148,187.147997 137.896664,187.008734 137.83218,186.86947 C137.806847,186.813074 137.780362,186.756679 137.757332,186.700283 C137.668667,186.508076 137.584608,186.313568 137.505154,186.116758 C137.451034,185.986702 137.40152,185.857797 137.348551,185.728892 C137.200007,185.349083 137.060676,184.965822 136.93286,184.577956 C136.89486,184.462863 136.856861,184.34892 136.823468,184.233826 C136.74056,183.972564 136.665712,183.708999 136.598925,183.443133 C136.517169,183.133531 136.444625,182.820477 136.383595,182.506271 C136.344444,182.32097 136.309899,182.134518 136.281112,181.946916 C136.250022,181.761615 136.223537,181.576314 136.201659,181.392165 C136.154447,181.025016 136.121054,180.656716 136.101478,180.287266 C136.101478,180.226266 136.095721,180.164116 136.094569,180.103116 C136.088812,179.981117 136.085357,179.859118 136.084206,179.737118 C136.019722,174.820319 138.510412,170.091121 143.833788,164.772645 C153.315222,155.303892 158.835504,141.447771 158.835504,141.447771 C158.835504,141.447771 158.984047,140.866548 159.2938,140.030968 C159.397435,139.753592 159.505676,139.477368 159.619674,139.204596 C159.749793,138.896145 159.889124,138.591147 160.038819,138.291903 L160.082576,138.204432 C160.274876,137.822321 160.483297,137.450569 160.710143,137.088024 C160.76196,137.004006 160.812626,136.919987 160.872504,136.83712 C161.040622,136.586216 161.219105,136.343368 161.406799,136.107426 C161.551888,135.926729 161.706188,135.751787 161.86855,135.586052 C162.418966,135.019792 163.046532,134.557115 163.757006,134.309664 L163.846823,134.280891 C163.906701,134.261325 163.965427,134.24291 164.026457,134.225646 C164.09785,134.207231 164.169243,134.189967 164.240636,134.175004 L164.272877,134.16925 C164.423724,134.139325 164.57457,134.118608 164.727719,134.10825 L164.741537,134.10825 C164.820991,134.10825 164.900444,134.100194 164.9822,134.100194 C165.186015,134.101344 165.388679,134.117458 165.590191,134.148533 C166.444603,134.283192 167.258712,134.791906 167.958823,135.576845 L168.053245,135.687335 C168.281242,135.954352 168.488512,136.239784 168.672751,136.539027 C168.810931,136.761158 168.94105,136.994798 169.064261,137.239948 C169.113775,137.343532 169.160987,137.437909 169.208198,137.539191 C169.231228,137.590983 169.254258,137.641624 169.276136,137.694567 C169.322196,137.797001 169.364802,137.900585 169.407407,138.008773 C169.64807,138.625675 169.834613,139.262143 169.965884,139.911271 C170.105215,140.612191 170.18582,141.32347 170.20885,142.038201 C170.21691,142.352407 170.21691,142.670065 170.20885,142.992327 C170.203092,143.114326 170.197335,143.237477 170.188123,143.360627 C170.0937,144.72909 169.795462,146.075685 169.302621,147.356677 
C169.253106,147.48328 169.198986,147.612185 169.14832,147.74109 C168.900748,148.332671 168.612873,148.905838 168.288151,149.458287 C168.10276,149.773644 167.902399,150.086698 167.688221,150.397451 L167.55695,150.583903 C167.049139,151.292879 166.482602,151.958121 165.863096,152.572721 C165.252802,153.183868 164.598751,153.748977 163.906701,154.265748 C163.212348,154.777914 162.561751,155.346477 161.960669,155.96453 C160.229968,157.780707 159.826944,159.381659 160.21615,160.595897 C160.26221,160.732859 160.31633,160.867518 160.378511,160.997574 C160.522448,161.279553 160.708991,161.538514 160.932382,161.764098 L160.993411,161.825097 C161.056743,161.886097 161.12353,161.944795 161.191469,162.00119 L161.260559,162.057586 C161.428677,162.186491 161.606008,162.301585 161.791399,162.401716 C161.844368,162.43049 161.893883,162.459263 161.954912,162.486886 C162.209393,162.613489 162.475389,162.714771 162.749446,162.791884 L162.825444,162.81145 L162.927928,162.839072 L163.017745,162.86094 L163.113319,162.881657 L163.208893,162.901223 L163.299862,162.916185 C163.3678,162.928845 163.43689,162.939203 163.504828,162.948411 L163.571615,162.957618 L163.690219,162.970279 L163.763915,162.977184 L163.885974,162.986392 L163.956215,162.986392 L164.08288,162.992147 L164.48245,162.992147 L164.596448,162.986392 L164.727719,162.978335 L164.887777,162.963373 L165.037472,162.944958 C165.070866,162.944958 165.105411,162.935751 165.139955,162.929996 C165.667342,162.852883 166.18091,162.704412 166.666842,162.488036 C166.770477,162.444301 166.87296,162.395962 166.967383,162.34532 C167.170046,162.244038 167.366953,162.130095 167.555798,162.004643 C167.819491,161.832003 168.068215,161.637495 168.301969,161.425722 L168.46433,161.271497 C168.489663,161.246176 168.516148,161.222007 168.540329,161.195535 C168.699236,161.036706 168.848931,160.868669 168.988262,160.692576 C169.234682,160.381823 169.453467,160.051504 169.643464,159.703922 C171.896946,155.60889 174.258668,151.573708 176.726329,147.604129 L177.064869,147.062038 L177.406864,146.518796 C177.578437,146.244873 177.75001,145.973252 177.921584,145.705084 L178.09546,145.434614 L178.442061,144.894825 C178.907266,144.17549 179.373623,143.464211 179.843434,142.759838 L180.194641,142.235011 C180.899357,141.192263 181.602923,140.177137 182.298428,139.211501 L182.64618,138.735014 C183.402714,137.689964 184.19149,136.669083 185.010205,135.672372 L185.336079,135.283356 C185.389048,135.217753 185.438563,135.154451 185.497289,135.09115 L185.816254,134.721699 C185.869223,134.660699 185.919889,134.600851 185.975161,134.542153 L186.283762,134.193419 L186.439214,134.025383 C186.588909,133.863101 186.740907,133.703121 186.895208,133.544291 C186.998842,133.441858 187.094417,133.338274 187.194597,133.242746 C187.778407,132.64541 188.422094,132.110225 189.116447,131.646397 L189.276506,131.543964 L189.438867,131.44038 C189.542502,131.375927 189.647288,131.313777 189.754377,131.255079 C192.476519,129.711674 194.731152,129.597731 196.027738,130.893685 C196.814212,131.679775 197.248326,132.981483 197.225296,134.791906 C197.225296,134.872472 197.225296,134.951886 197.218387,135.034754 L197.218387,135.124527 C197.218387,135.206243 197.211478,135.290262 197.205721,135.37428 C197.205721,135.476713 197.193054,135.580298 197.182691,135.682731 C197.172327,135.786315 197.167722,135.864579 197.157358,135.955503 C197.157358,135.981974 197.151601,136.008446 197.148146,136.034917 C197.140086,136.115483 197.129722,136.197199 197.119359,136.277765 C197.119359,136.303086 197.119359,136.327255 
197.108995,136.352576 C197.09748,136.461915 197.081359,136.571254 197.061784,136.679442 C197.049117,136.779573 197.032996,136.879705 197.013421,136.979836 C197.00536,137.037383 196.993845,137.093779 196.984633,137.151326 C196.954694,137.314759 196.915543,137.477041 196.868332,137.63587 C196.779667,137.932811 196.671426,138.223998 196.54361,138.507128 C196.488338,138.630279 196.429611,138.753429 196.369733,138.874277 C196.245372,139.12403 196.104889,139.379538 195.950588,139.644253 C195.873438,139.774309 195.793984,139.908969 195.708774,140.043628 L195.579806,140.248495 C195.448535,140.454513 195.311507,140.662832 195.168721,140.873453 C195.025935,141.084075 194.872786,141.2993 194.708122,141.516827 C194.495095,141.806863 194.270552,142.102653 194.033344,142.401897 L193.85371,142.628631 C193.3459,143.260495 192.825422,143.882001 192.29343,144.493148 L191.870831,144.970787 L191.6555,145.212483 C191.361868,145.5405 191.067085,145.866215 190.769998,146.189628 L190.542002,146.435928 C190.391156,146.599361 190.2334,146.765096 190.082554,146.92968 C189.928253,147.096566 189.77165,147.262301 189.615046,147.428035 L189.140629,147.927542 L188.660454,148.428199 C188.500396,148.595085 188.339186,148.764272 188.176825,148.931158 L187.199203,149.938227 C182.529879,154.724971 177.609528,159.495602 175.944461,162.469621 C175.833918,162.66413 175.733737,162.863242 175.642769,163.066957 C175.406712,163.607897 175.307683,164.070574 175.37447,164.444628 C175.396348,164.567778 175.440105,164.686325 175.503438,164.793362 C175.597861,164.955644 175.711859,165.105266 175.84313,165.238774 C175.904159,165.298623 175.967491,165.355019 176.036581,165.40566 C176.378576,165.649658 176.793115,165.77511 177.213412,165.76245 L177.344683,165.76245 L177.479408,165.753243 L177.615285,165.73713 L177.72698,165.721016 C177.74195,165.719865 177.756919,165.716413 177.771889,165.71296 L177.874372,165.693394 L177.902008,165.686488 L178.014855,165.662319 L178.055157,165.65196 L178.176065,165.620885 L178.320002,165.5783 C178.414425,165.549527 178.508848,165.517301 178.605573,165.481622 C178.813994,165.407962 179.01781,165.322792 179.217019,165.227265 C179.267685,165.203095 179.319502,165.180076 179.369017,165.153605 L179.522166,165.077643 C179.722526,164.976361 179.921736,164.867022 180.122096,164.748475 C180.282154,164.654099 180.441061,164.55742 180.599968,164.454987 L180.754269,164.352553 C180.804935,164.320327 180.856752,164.28695 180.907418,164.248969 L181.060567,164.146536 L181.141172,164.09014 L181.36226,163.935914 C181.568378,163.793198 181.760678,163.647029 181.956433,163.498559 L181.973705,163.485898 L182.282307,163.246503 C182.704906,162.912732 183.106779,162.576659 183.469501,162.260151 L183.711316,162.047228 L183.734346,162.026511 L183.859859,161.91487 C184.156945,161.649004 184.42179,161.400402 184.642877,161.194384 L184.732694,161.106913 L184.950327,160.900895 L185.080447,160.774292 L185.125355,160.728255 L185.138022,160.715595 C185.248565,160.605105 185.248565,160.605105 185.359109,160.493464 L185.368321,160.484256 L185.409775,160.447426 C185.422442,160.433615 185.439714,160.419804 185.45929,160.40254 L185.475411,160.387578 L185.519167,160.348446 L185.752922,160.142428 L185.884192,160.023882 C185.948676,159.968637 186.018918,159.905335 186.090311,159.83743 L186.248066,159.699318 C186.275702,159.676299 186.303338,159.649828 186.332125,159.624507 L186.496789,159.48064 L186.739755,159.268868 L186.867572,159.157227 C187.368473,158.721022 187.978767,158.195044 188.677727,157.602312 L188.96445,157.360615 
L189.438867,156.963542 L189.922496,156.558413 C190.55582,156.033586 191.237507,155.475382 191.958344,154.89531 L192.43161,154.515501 C192.834634,154.195541 193.249174,153.867524 193.668319,153.538356 C193.836438,153.405998 194.008011,153.273641 194.183039,153.142434 C194.612547,152.810964 195.044359,152.480646 195.477322,152.154931 C196.465308,151.409124 197.484383,150.662166 198.514974,149.933624 L198.958301,149.626324 C199.266902,149.41225 199.575504,149.199326 199.885257,148.991007 L200.165071,148.801102 C200.718941,148.428199 201.277418,148.063352 201.8405,147.706562 L202.120315,147.530469 L202.397826,147.357828 C202.675337,147.186339 202.952848,147.016 203.226904,146.851416 L203.502112,146.687983 L204.05368,146.366872 C204.234465,146.262137 204.414099,146.159704 204.593732,146.058421 L204.701973,145.997422 L205.13033,145.763782 C205.307661,145.667103 205.486143,145.573877 205.662322,145.482953 L205.927167,145.347143 L206.1828,145.217087 C206.767761,144.923598 207.363085,144.651977 207.968773,144.403375 C208.051681,144.368847 208.133437,144.33547 208.215194,144.299791 C208.377555,144.231885 208.538764,144.169735 208.698823,144.11449 C208.858881,144.060396 209.016636,144.004 209.172088,143.948755 C209.313723,143.901567 209.450751,143.857831 209.584325,143.816397 L209.629233,143.802586 C209.700626,143.780718 209.772019,143.760002 209.843412,143.741587 L209.864139,143.735832 C210.009227,143.694398 210.154316,143.656417 210.300556,143.621889 L210.312071,143.621889 C210.527402,143.570097 210.736974,143.526362 210.941941,143.491833 C211.012182,143.480324 211.081272,143.468815 211.148059,143.459607 C211.347268,143.429683 211.547629,143.408966 211.747989,143.395155 C211.874654,143.385947 212.002471,143.382494 212.129135,143.382494 L212.225861,143.382494 C212.354829,143.382494 212.479191,143.390551 212.602401,143.403211 C212.658824,143.403211 212.715248,143.41357 212.77052,143.421626 L212.79355,143.421626 C212.848822,143.427381 212.905245,143.436588 212.959365,143.448098 C213.013486,143.459607 213.068758,143.467664 213.121727,143.480324 L213.140151,143.480324 C213.19312,143.491833 213.242634,143.506796 213.297906,143.521758 C213.737778,143.640304 214.141954,143.863586 214.47704,144.172037 L214.506979,144.201961 L214.565705,144.259508 L214.62328,144.320508 C215.085031,144.806203 215.467328,145.362105 215.755203,145.967497 L215.800111,146.069931 C215.897988,146.300118 215.975139,146.538362 216.029259,146.78236 C216.178954,147.462564 216.138651,148.17154 215.911806,148.829876 C215.813929,149.116459 215.694173,149.394986 215.552539,149.662003 L215.458116,149.832341 C215.098849,150.434281 214.675097,150.994787 214.19262,151.503501 L214.088985,151.61284 C213.86214,151.853385 213.626083,152.087025 213.383117,152.311458 C213.305966,152.383967 213.226513,152.456476 213.145908,152.528985 L213.023849,152.639475 C212.77052,152.863907 212.511433,153.082585 212.248891,153.296659 C212.202831,153.333489 212.15562,153.370319 212.110711,153.409451 C211.927623,153.556771 211.738777,153.706393 211.543023,153.856014 C210.868245,154.371634 210.180801,154.868838 209.48069,155.348779 C209.068454,155.630758 208.642399,155.915039 208.202527,156.203924 C207.38266,156.741412 206.557036,157.26739 205.723352,157.783009 C205.482689,157.93148 205.240874,158.082252 204.994454,158.231874 C202.729457,159.624507 200.21804,161.113819 197.539655,162.733186 L196.846454,163.154429 C196.087616,163.617105 195.371385,164.059064 194.697758,164.479156 L194.358066,164.69323 L193.71553,165.105266 C193.461049,165.266397 193.213477,165.425226 
192.972814,165.581753 C192.790878,165.697998 192.613547,165.813091 192.439671,165.927034 L192.096524,166.152618 C191.929557,166.261956 191.763741,166.371295 191.597926,166.482936 L191.436716,166.58537 C191.266294,166.699312 191.100478,166.810953 190.939269,166.921443 L190.675576,167.100989 L190.365823,167.316214 L190.080251,167.515326 C189.628864,167.831834 189.183234,168.155247 188.741059,168.484414 L188.586758,168.60181 C188.412882,168.733017 188.244763,168.864223 188.082402,168.991977 L187.870526,169.161165 C187.513562,169.447748 187.189991,169.722822 186.897511,169.985235 L186.754725,170.116442 C186.672969,170.191253 186.592364,170.266064 186.516365,170.339724 C186.462244,170.390365 186.41273,170.442157 186.35861,170.492798 L186.283762,170.566458 C186.119098,170.731042 185.95904,170.90023 185.803588,171.074021 L185.727589,171.161492 C185.558318,171.357151 185.410927,171.544754 185.280807,171.727753 L185.223232,171.81062 C185.073538,172.029298 184.941115,172.260636 184.828269,172.501182 L184.789118,172.5852 L184.764936,172.643898 L184.745361,172.694539 L184.718876,172.764746 C184.710816,172.788916 184.694695,172.831501 184.685483,172.868331 C184.638272,173.005292 184.603727,173.146857 184.580697,173.290724 L184.569182,173.367837 L184.55997,173.438044 L184.554212,173.505949 L184.554212,173.867343 C184.554212,173.897268 184.554212,173.928343 184.561121,173.959418 C184.562273,173.977833 184.564576,173.996248 184.566879,174.014663 C184.566879,174.044587 184.574939,174.074512 184.578394,174.105587 L184.600272,174.23219 L184.600272,174.237945 C184.608333,174.278228 184.616393,174.31851 184.626757,174.358793 C184.63712,174.399076 184.647483,174.443962 184.66015,174.486547 C184.696998,174.616603 184.741906,174.745508 184.793724,174.87096 C184.793724,174.880167 184.801784,174.890526 184.805239,174.899733 C184.819057,174.930808 184.831723,174.963035 184.846693,174.99411 C184.903116,175.125317 184.966448,175.253071 185.035538,175.378523 L185.108083,175.508578 L185.184082,175.639785 L185.263535,175.770992 C185.277353,175.79286 185.292322,175.812426 185.309595,175.831992 L185.336079,175.859614 L185.363715,175.884935 L185.394806,175.909104 C185.415533,175.924066 185.437411,175.936727 185.460441,175.949387 C185.483471,175.959745 185.508804,175.971255 185.534137,175.980462 C185.561773,175.98967 185.589409,175.997726 185.617045,176.004632 C186.281459,176.154254 187.647136,175.604106 189.460745,174.646527 L189.786619,174.473887 L190.338187,174.173492 L190.608789,174.023871 C190.799937,173.920286 190.996844,173.805193 191.196053,173.69125 L191.561078,173.485232 C193.959648,172.10526 196.825727,170.331667 199.78738,168.600659 C200.063739,168.438377 200.342402,168.277246 200.621064,168.116115 L201.180692,167.759325 C201.460506,167.598193 201.739169,167.439364 202.018983,167.282837 C203.209632,166.606086 204.416402,165.955807 205.635838,165.333151 L206.1828,165.055775 C206.546673,164.876229 206.907092,164.701287 207.262905,164.530948 C207.952652,164.204082 208.649308,163.894481 209.352874,163.600992 L209.738625,163.444465 L209.785837,163.42605 C211.828594,162.616941 213.681355,162.116284 215.180605,162.116284 C215.505328,162.113982 215.828898,162.140454 216.149015,162.195699 L216.158227,162.195699 C216.261862,162.214114 216.355133,162.23483 216.449556,162.259 L216.466828,162.259 C216.605008,162.292377 216.740885,162.334962 216.872156,162.387905 C216.976942,162.425886 217.078274,162.470772 217.176151,162.523715 C217.60566,162.741242 217.972988,163.062354 218.247044,163.458276 C218.270074,163.491653 
218.291953,163.526181 218.31268,163.56186 C218.414012,163.716085 218.501525,163.878367 218.572918,164.048706 C218.615524,164.146536 218.654675,164.247818 218.691523,164.357157 C218.728371,164.467647 218.760612,164.573533 218.794006,164.687476 C219.038123,165.559885 219.013942,166.482936 218.724916,167.341535 Z" id="Shape" fill="#FF9D00" fillRule="nonzero"></path>
|
10 |
+
<path d="M205.169417,104.827718 L205.169417,104.263487 C205.169417,56.0689579 166.09852,17 117.882512,17 C69.6688088,17 30.5830519,56.0712609 30.5830519,104.263487 L30.5805699,104.452331 C30.5805699,104.577844 30.5805699,104.702205 30.5805699,104.827718 C30.5874292,105.015411 30.5893875,105.203104 30.5876596,105.390797 L30.5945712,105.805334 L30.596875,105.955028 L30.6026346,106.159993 C30.6049385,106.270536 30.6083942,106.379928 30.6083942,106.490471 L30.6199134,106.938402 L30.6383442,107.464633 L30.6441038,107.637357 L30.6671422,108.182012 L30.6671422,108.207345 L30.6993961,108.84412 L30.6993961,108.860241 C30.7109153,109.077873 30.7224345,109.295505 30.7362576,109.511985 C30.7477768,109.713496 30.7604479,109.913856 30.774271,110.114215 L30.7777268,110.149912 C30.7878637,110.321484 30.7993829,110.494208 30.8122844,110.66578 L30.8157402,110.710688 C30.8272594,110.876503 30.8399306,111.043469 30.8537536,111.208132 L30.8560575,111.240374 L30.8790959,111.528247 C30.9480961,112.358472 31.0287306,113.188698 31.1209996,114.01662 L31.1267592,114.068437 L31.1958745,114.676425 L31.2753572,115.316654 C31.3286912,115.742706 31.3855961,116.167607 31.4458417,116.592507 C31.4817816,116.84929 31.5191039,117.104921 31.5575781,117.360552 L31.5633377,117.391642 C31.6493863,117.969691 31.7422312,118.54774 31.8421029,119.123486 C32.1285859,120.7943 32.4650624,122.457054 32.8511867,124.108293 L32.8569463,124.134777 L32.8938079,124.290228 C32.9283655,124.437619 32.9629232,124.58501 32.9997847,124.733552 L33.0631404,124.656403 C35.5973693,121.62798 39.2328357,119.958317 43.300273,119.958317 C46.5602128,119.958317 49.9053949,121.034962 53.2563365,123.159464 C55.4795464,124.571192 57.9377483,127.072232 60.4673695,130.216955 C62.8080754,126.966295 66.0886346,124.807248 69.8415971,124.217684 C70.5580927,124.104838 71.2814999,124.047264 72.004907,124.046112 C80.5717524,124.046112 85.7312119,131.477839 87.6802643,138.161096 C88.6421193,140.420322 93.2843658,150.704294 100.283445,157.661606 C110.921447,168.286419 113.583539,179.250921 108.319254,190.990378 L108.336533,190.990378 C108.695933,191.029529 109.056484,191.066377 109.418188,191.09977 C109.633597,191.120497 109.849007,191.141224 110.064416,191.159647 L110.140443,191.166556 C110.298257,191.181526 110.457222,191.195344 110.616187,191.206859 L111.053918,191.242555 C111.470913,191.274797 111.889061,191.303584 112.309513,191.33122 L112.641266,191.350795 L112.905057,191.365764 L113.069781,191.374976 L113.583539,191.402612 L113.724073,191.40837 L114.265477,191.431399 L114.865628,191.454429 L115.349435,191.468247 L115.43007,191.471702 C115.531439,191.475156 115.635112,191.479762 115.737633,191.479762 L115.847066,191.482065 C116.5267,191.498186 117.205182,191.506246 117.884816,191.506246 L118.605919,191.506246 L119.121981,191.500489 C119.248692,191.49358 119.374252,191.49358 119.500963,191.49358 L119.701397,191.49358 L119.875338,191.490126 C119.943301,191.487823 120.012416,191.48552 120.08038,191.48552 L120.286574,191.48552 L120.690899,191.474005 L121.299114,191.453278 L121.969532,191.424491 C122.161903,191.41643 122.353122,191.407218 122.545493,191.396855 L122.751687,191.38534 L123.056947,191.368067 L123.237799,191.357704 L123.372574,191.349644 L123.991156,191.30819 C124.19735,191.295523 124.403544,191.279403 124.609738,191.263282 L125.169572,191.219525 L125.564681,191.186132 L125.996652,191.146981 C126.440142,191.105527 126.88248,191.061771 127.322515,191.015711 C127.603584,190.984621 127.882349,190.95353 128.162266,190.920137 L128.654137,190.861411 
C123.471639,179.170317 126.154466,168.255328 136.749846,157.671969 C143.732798,150.697385 148.375045,140.41111 149.342659,138.153035 C151.292864,131.468627 156.447716,124.038052 165.016865,124.038052 C165.741424,124.039203 166.464831,124.095626 167.180175,124.209624 C170.929681,124.799188 174.211508,126.959386 176.555669,130.208895 C179.086443,127.064171 181.543493,124.563132 183.766702,123.151403 C187.117644,121.026901 190.46513,119.950256 193.721614,119.950256 C197.311004,119.950256 200.565184,121.251442 203.022234,123.637333 C203.157009,123.036254 203.287176,122.435175 203.411584,121.831794 C203.48761,121.455256 203.562485,121.077567 203.636208,120.698726 C203.726058,120.233523 203.811301,119.766018 203.889631,119.298512 C203.910366,119.171848 203.932252,119.045184 203.956443,118.917369 C203.994456,118.71816 204.027862,118.521255 204.058964,118.320896 L204.071635,118.244897 L204.153421,117.723272 L204.234056,117.194737 L204.235208,117.18898 C204.267462,116.967893 204.300867,116.745655 204.330817,116.523418 C204.355008,116.3553 204.378046,116.187182 204.399933,116.017913 L204.413756,115.921187 L204.485175,115.370774 L204.543923,114.902117 C204.579633,114.593518 204.61419,114.284918 204.647596,113.975167 C204.662571,113.851957 204.675242,113.727596 204.687913,113.603235 L204.699433,113.503055 C204.721319,113.296938 204.740902,113.091973 204.759333,112.885856 L204.805409,112.406835 C204.901019,111.34516 204.978198,110.283485 205.038098,109.219506 L205.038098,109.212597 L205.0692,108.648367 L205.074959,108.521703 L205.097998,107.986259 C205.107213,107.784748 205.115276,107.580934 205.122188,107.374817 L205.122188,107.359847 L205.127948,107.211305 C205.133707,107.084641 205.138315,106.957977 205.138315,106.832464 L205.140619,106.780647 C205.145226,106.663195 205.149834,106.541137 205.149834,106.42023 L205.15329,106.316596 C205.156746,106.19569 205.160201,106.075934 205.160201,105.955028 L205.162505,105.852545 C205.165961,105.698245 205.169417,105.545097 205.169417,105.390797 L205.169417,104.827718 Z M96.2309827,192.003691 C103.143668,181.868262 102.654101,174.261508 93.1680216,164.784733 C83.6819423,155.307957 78.1607792,141.448604 78.1607792,141.448604 C78.1607792,141.448604 76.0999904,133.397375 71.4001479,134.137784 C66.7003053,134.877042 63.2527174,146.906674 73.0923216,154.270463 C82.932041,161.634252 71.1352058,166.627119 67.3476856,159.717018 C63.5591287,152.805766 53.219475,135.041705 47.8572772,131.643653 C42.4962313,128.245602 38.7259899,130.150169 39.9896486,137.153541 C40.6174462,140.629894 46.4553879,146.562378 51.9097395,152.105659 C57.4435737,157.730695 62.5822986,162.95271 61.4845168,164.800854 C59.3039281,168.471809 51.6240628,160.488518 51.6240628,160.488518 C51.6240628,160.488518 27.576535,138.613632 22.3410486,144.314667 C17.5156465,149.568923 24.9570639,154.035559 36.4256013,160.919175 C37.4012794,161.505285 38.4057556,162.108666 39.4344221,162.730472 C52.5548159,170.671158 53.575419,172.765721 51.7139127,175.769962 C51.0262152,176.88 46.6339358,174.244236 41.2314207,171.001636 C32.0218028,165.474476 19.877087,158.186686 18.1630268,164.709886 C16.6781991,170.355649 25.6125077,173.81473 33.7151284,176.952544 C40.4653925,179.56643 46.6396954,181.956927 45.7319807,185.20183 C44.7908603,188.565337 39.6889969,185.760303 34.1113896,182.69273 C27.8506925,179.248619 20.9909958,175.475181 18.7470514,179.733396 C14.50337,187.782323 48.0173944,197.257947 48.2927038,197.327036 C59.1207724,200.134373 86.6193439,206.082978 96.2309827,192.003691 Z M140.768903,192.003691 
C133.855066,181.868262 134.345784,174.261508 143.831864,164.784733 C153.317943,155.307957 158.840258,141.448604 158.840258,141.448604 C158.840258,141.448604 160.899895,133.397375 165.599737,134.137784 C170.29958,134.877042 173.748435,146.906674 163.907564,154.270463 C154.066692,161.634252 165.86468,166.627119 169.653352,159.717018 C173.440872,152.805766 183.775918,135.041705 189.136964,131.643653 C194.49801,128.245602 198.269403,130.150169 197.005744,137.153541 C196.377947,140.629894 190.540005,146.56353 185.084501,152.10681 C179.551819,157.730695 174.413094,162.95271 175.510876,164.800854 C177.690313,168.471809 185.37709,160.483912 185.37709,160.483912 C185.37709,160.483912 209.423465,138.609026 214.658952,144.310061 C219.484354,149.564317 212.041785,154.032104 200.573247,160.916873 C199.597569,161.50183 198.594245,162.10406 197.565578,162.727017 C184.444033,170.666552 183.424581,172.762266 185.286088,175.765356 C185.973785,176.875394 190.366065,174.23963 195.767428,170.998181 C204.978198,165.471022 217.122913,158.18208 218.836974,164.70528 C220.321801,170.351043 211.387493,173.811275 203.28372,176.94909 C196.533456,179.562976 190.360305,181.952321 191.266868,185.198376 C192.207988,188.561882 197.307548,185.755697 202.882851,182.688124 C209.143548,179.244013 216.004397,175.469423 218.249493,179.72879 C222.493175,187.783474 188.974543,197.248735 188.702689,197.318976 C177.87462,200.134373 150.375934,206.082978 140.768903,192.003691 Z" id="Shape" fill="#FFD21E"></path>
|
11 |
+
<path d="M146.614758,80.6109193 C147.976342,81.0911 148.989747,82.5534685 149.953577,83.9434653 C151.256362,85.8239816 152.46922,87.5723908 154.330012,86.5867567 C156.5263,85.424214 158.325988,83.6390445 159.503106,81.456405 C160.680224,79.2743398 161.180586,76.793253 160.941934,74.3272149 C160.764386,72.4833439 160.178709,70.7015059 159.226408,69.1104765 C158.274107,67.5193323 156.979392,66.1585139 155.436803,65.1258957 C153.89306,64.0933923 152.138336,63.414936 150.300602,63.139349 C148.461715,62.8637621 146.584783,62.9980518 144.804694,63.532339 C142.423941,64.2467514 140.30951,65.6481209 138.727721,67.5595388 C137.147086,69.4708418 136.169421,71.806381 135.91924,74.2709257 C135.670212,76.7360448 136.160198,79.2193143 137.326939,81.4070084 C138.189314,83.0233104 140.10314,82.2616841 142.123033,81.456405 C143.707127,80.825737 145.355784,80.1686476 146.614758,80.6109193 Z M87.3830077,80.6120681 C86.0214245,81.0922488 85.0068663,82.5546173 84.043036,83.9446141 C82.7402511,85.8239816 81.5273929,87.5735396 79.666601,86.5879055 C78.0260143,85.7182959 76.5998684,84.4960178 75.4896189,83.0106741 C74.3793695,81.5253304 73.6138392,79.8136815 73.2472148,77.9977254 C72.8805903,76.1821139 72.922095,74.3082604 73.3694229,72.5107992 C73.8179037,70.7133381 74.6595259,69.036956 75.8343381,67.6023875 C77.0114562,66.168623 78.4917888,65.0125133 80.1704214,64.2172284 C81.849054,63.4219434 83.684482,63.0074717 85.5429681,63.0033066 C87.4014542,62.9993155 89.238035,63.4058608 90.9201264,64.1936788 C92.6022177,64.9816117 94.0883149,66.1312884 95.2711974,67.5598834 C96.8529859,69.4711864 97.830651,71.8071852 98.0808318,74.2723042 C98.3298598,76.7374233 97.8398743,79.2208077 96.6719795,81.4081571 C95.8072992,83.0244592 93.8934736,82.2616841 91.8735805,81.456405 C90.2906392,80.825737 88.6419821,80.1686476 87.3830077,80.6120681 Z M137.451453,134.317638 C146.839575,126.947209 150.28792,114.91627 150.28792,107.504485 C150.28792,101.645821 146.331143,103.489577 139.998225,106.613049 L139.641977,106.789958 C133.827867,109.658406 126.089555,113.476876 117.594936,113.476876 C109.099164,113.476876 101.360852,109.657257 95.549048,106.788809 C89.0109124,103.561949 84.9100221,101.537838 84.9100221,107.505634 C84.9100221,115.150616 88.5785722,127.705389 98.649215,135 C99.9773639,132.311907 101.847379,129.925937 104.14397,127.991429 C106.439408,126.056922 109.111846,124.616379 111.992961,123.758257 C112.998296,123.45958 114.033606,125.183866 115.094281,126.948357 C116.116909,128.651965 117.161443,130.393482 118.225576,130.393482 C119.357731,130.393482 120.471439,128.677238 121.55863,127.0012 C122.694244,125.250494 123.801034,123.54344 124.872085,123.884621 C130.263078,125.608906 134.772098,129.348112 137.451453,134.317638 Z" id="Shape" fill="#32343D"></path>
|
12 |
+
<path d="M137,134.589063 C132.085163,138.426668 125.552521,141 117.046296,141 C109.053457,141 102.801266,138.72867 98,135.271151 C99.3361959,132.585286 101.217068,130.200275 103.524411,128.267693 C105.831753,126.333962 108.518052,124.892851 111.41295,124.033925 C113.398122,123.447146 115.500342,130.666483 117.67673,130.666483 C120.00725,130.666483 122.252013,123.493078 124.358868,124.160238 C129.775503,125.884979 134.306748,129.622682 137,134.589063 Z" id="Path" fill="#FF323D" fillRule="nonzero"></path>
|
13 |
+
<path d="M64.7091655,90.6371141 C63.8188025,91.2251504 62.8188988,91.6348571 61.7668713,91.8425321 C60.7147288,92.0502071 59.6311738,92.0524645 58.5782258,91.8493041 C57.5252778,91.6450151 56.5236482,91.2398231 55.6307538,90.6563014 C54.7378594,90.0716511 53.971305,89.3210865 53.3749298,88.4452396 C52.7785546,87.5705215 52.3642101,86.5874511 52.1555996,85.555848 C51.9469891,84.5231163 51.9481398,83.4599105 52.1590515,82.4271787 C52.3699633,81.3955756 52.786609,80.4147626 53.3848253,79.5400444 C53.9831565,78.6662292 54.7514369,77.9166804 55.645597,77.3345131 C57.4440428,76.1637452 59.642335,75.7399301 61.7587018,76.1560702 C63.8750686,76.5722103 65.7370296,77.7943327 66.9366838,79.5547171 C68.1356477,81.3154401 68.5751911,83.4700685 68.1575098,85.5468187 C67.7398285,87.6246976 66.4994417,89.455398 64.7091655,90.6371141 Z M181.39746,90.6371141 C180.506867,91.2251504 179.506963,91.6348571 178.455281,91.8425321 C177.403599,92.0502071 176.319699,92.0524645 175.266866,91.8493041 C174.214033,91.6450151 173.211828,91.2398231 172.318933,90.6563014 C171.426039,90.0716511 170.659715,89.3210865 170.063685,88.4452396 C169.466504,87.5705215 169.052275,86.5874511 168.844009,85.555848 C168.635744,84.5231163 168.636895,83.4599105 168.847461,82.4271787 C169.058028,81.3955756 169.474559,80.4147626 170.07289,79.5400444 C170.671221,78.6662292 171.439847,77.9166804 172.333892,77.3345131 C174.132338,76.1637452 176.331205,75.7399301 178.447227,76.1560702 C180.563248,76.5722103 182.424979,77.7943327 183.625094,79.5547171 C184.824057,81.3154401 185.263601,83.4700685 184.84592,85.5468187 C184.428238,87.6246976 183.187852,89.455398 181.39746,90.6371141 Z" id="Shape" fill="#FFAD03"></path>
|
14 |
+
</g>
|
15 |
+
</g>
|
16 |
+
</svg>
|
17 |
+
);
|
18 |
+
|
19 |
+
export default HFLogo;
|
frontend/src/components/Logo/Logo.js
ADDED
@@ -0,0 +1,56 @@
1 |
+
import React from "react";
|
2 |
+
import { useNavigate, useSearchParams, useLocation } from "react-router-dom";
|
3 |
+
import { Box } from "@mui/material";
|
4 |
+
import HFLogo from "./HFLogo";
|
5 |
+
import { useLeaderboard } from "../../pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext";
|
6 |
+
|
7 |
+
const Logo = ({ height = "40px" }) => {
|
8 |
+
const navigate = useNavigate();
|
9 |
+
const [searchParams, setSearchParams] = useSearchParams();
|
10 |
+
const location = useLocation();
|
11 |
+
const { actions } = useLeaderboard();
|
12 |
+
|
13 |
+
const handleReset = () => {
|
14 |
+
// Reset all leaderboard state first
|
15 |
+
actions.resetAll();
|
16 |
+
|
17 |
+
// Then clean URL in one go
|
18 |
+
if (
|
19 |
+
location.pathname !== "/" ||
|
20 |
+
searchParams.toString() !== "" ||
|
21 |
+
location.hash !== ""
|
22 |
+
) {
|
23 |
+
window.history.replaceState(null, "", "/");
|
24 |
+
navigate("/", { replace: true, state: { skipUrlSync: true } });
|
25 |
+
setSearchParams({}, { replace: true, state: { skipUrlSync: true } });
|
26 |
+
}
|
27 |
+
};
|
28 |
+
|
29 |
+
return (
|
30 |
+
<Box
|
31 |
+
onClick={handleReset}
|
32 |
+
sx={{
|
33 |
+
height,
|
34 |
+
display: "flex",
|
35 |
+
alignItems: "center",
|
36 |
+
justifyContent: "center",
|
37 |
+
cursor: "pointer",
|
38 |
+
transition: "opacity 0.2s ease",
|
39 |
+
"&:hover": {
|
40 |
+
opacity: 0.8,
|
41 |
+
},
|
42 |
+
}}
|
43 |
+
>
|
44 |
+
<Box
|
45 |
+
sx={{
|
46 |
+
height: "100%",
|
47 |
+
aspectRatio: "95/88", // Aspect ratio of the original SVG (width/height)
|
48 |
+
}}
|
49 |
+
>
|
50 |
+
<HFLogo />
|
51 |
+
</Box>
|
52 |
+
</Box>
|
53 |
+
);
|
54 |
+
};
|
55 |
+
|
56 |
+
export default Logo;
|
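A minimal usage sketch (not part of this commit): Logo.js calls useNavigate, useSearchParams and useLeaderboard, so it only works when mounted inside a react-router-dom router and the leaderboard context provider. The AppShell component and the LeaderboardProvider export name below are assumptions for illustration, not code from the repository.

import React from "react";
import { BrowserRouter } from "react-router-dom";
import Logo from "./components/Logo/Logo";
// Provider name is assumed; Logo.js only imports the useLeaderboard hook from this module
import { LeaderboardProvider } from "./pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext";

// Clicking the logo resets all leaderboard state and navigates back to "/"
const AppShell = () => (
  <BrowserRouter>
    <LeaderboardProvider>
      <Logo height="48px" />
    </LeaderboardProvider>
  </BrowserRouter>
);

export default AppShell;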