mset committed · Commit ee732a9 · verified · Parent: 402b318

Update app.py

Files changed (1): app.py (+44, -38)
app.py CHANGED
@@ -11,8 +11,16 @@ from collections import Counter
 from typing import List, Tuple, Dict
 import random
 import math
-from datasets import load_dataset
-from transformers import AutoTokenizer
+try:
+    from datasets import load_dataset
+except ImportError:
+    print("datasets not available, using synthetic data only")
+    load_dataset = None
+try:
+    from transformers import AutoTokenizer
+except ImportError:
+    print("transformers not available, using the custom tokenizer")
+    AutoTokenizer = None
 import gradio as gr

 class SelfOrganizingTokenizer:
@@ -151,56 +159,54 @@ class AITrainer:
         """Load public datasets without an API key"""
         datasets = []

-        try:
-            # Italian Wikipedia
-            wiki = load_dataset("wikipedia", "20220301.it", split="train[:10000]")
-            for item in wiki:
-                if len(item['text']) > 100:
-                    datasets.append(item['text'])
-        except:
-            pass
-
-        try:
-            # Common Crawl
-            cc = load_dataset("cc100", lang="it", split="train[:5000]")
-            for item in cc:
-                if len(item['text']) > 100:
-                    datasets.append(item['text'])
-        except:
-            pass
-
-        try:
-            # OSCAR
-            oscar = load_dataset("oscar-corpus/OSCAR-2201", "it", split="train[:5000]")
-            for item in oscar:
-                if len(item['text']) > 100:
-                    datasets.append(item['text'])
-        except:
-            pass
+        if load_dataset:
+            try:
+                # Italian Wikipedia
+                wiki = load_dataset("wikipedia", "20220301.it", split="train[:1000]", trust_remote_code=True)
+                for item in wiki:
+                    if len(item['text']) > 100:
+                        datasets.append(item['text'])
+                print(f"Loaded {len(datasets)} examples from Wikipedia")
+            except Exception as e:
+                print(f"Wikipedia not available: {e}")
+
+            try:
+                # Common Crawl
+                cc = load_dataset("cc100", lang="it", split="train[:500]", trust_remote_code=True)
+                for item in cc:
+                    if len(item['text']) > 100:
+                        datasets.append(item['text'])
+                print("Loaded examples from Common Crawl")
+            except Exception as e:
+                print(f"Common Crawl not available: {e}")

         # Plain-text datasets from public URLs
         urls = [
             "https://www.gutenberg.org/files/2000/2000-0.txt",  # Divina Commedia
-            "https://www.gutenberg.org/files/1065/1065-0.txt"   # I Promessi Sposi
         ]

         for url in urls:
             try:
-                response = requests.get(url, timeout=30)
+                response = requests.get(url, timeout=10)
                 if response.status_code == 200:
                     text = response.text
-                    chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]
-                    datasets.extend(chunks[:500])
-            except:
+                    # Filter for useful content
+                    lines = text.split('\n')
+                    filtered_lines = [line.strip() for line in lines if len(line.strip()) > 50]
+                    chunks = filtered_lines[:1000]  # First 1000 chunks
+                    datasets.extend(chunks)
+                    print(f"Loaded {len(chunks)} chunks from {url}")
+            except Exception as e:
+                print(f"Error loading {url}: {e}")
                 continue

-        # Generate synthetic data if needed
-        if len(datasets) < 1000:
-            synthetic_texts = self.generate_synthetic_data(5000)
-            datasets.extend(synthetic_texts)
+        # Generate synthetic data
+        print("Generating synthetic data...")
+        synthetic_texts = self.generate_synthetic_data(8000)
+        datasets.extend(synthetic_texts)

         self.datasets = datasets[:10000]  # Cap at 10k examples
-        print(f"Loaded {len(self.datasets)} training examples")
+        print(f"Final dataset: {len(self.datasets)} examples")

     def generate_synthetic_data(self, num_samples):
         """Generate synthetic data for training"""