File size: 1,238 Bytes
4bb9d41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import torch
from transformers import Trainer, TrainingArguments
from app.model.model import NigerianLanguageModel
from app.model.config import ModelConfig

def train_model(model: NigerianLanguageModel, train_dataset, eval_dataset=None):
    """Train the wrapped model with a Hugging Face ``Trainer``.

    The epoch count, per-device train batch size and learning rate are read
    from ``model.config``; the output directory (``"outputs"``) and
    ``save_steps`` (500) are fixed here.

    Args:
        model: Wrapper exposing ``config`` (hyperparameters) and ``model``
            (the object handed to ``Trainer``).
        train_dataset: Dataset forwarded to ``Trainer`` as ``train_dataset``.
        eval_dataset: Optional dataset forwarded to ``Trainer`` as
            ``eval_dataset``; passed through unchanged, ``None`` by default.

    Returns:
        None. The trainer object is not exposed to the caller.
    """
    cfg = model.config
    hyperparams = {
        "output_dir": "outputs",
        "num_train_epochs": cfg.num_train_epochs,
        "per_device_train_batch_size": cfg.batch_size,
        "learning_rate": cfg.learning_rate,
        "save_steps": 500,
    }
    run_args = TrainingArguments(**hyperparams)

    hf_trainer = Trainer(
        model=model.model,
        args=run_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    hf_trainer.train()

# scripts/preprocess.py
from app.utils.data_preprocessing import load_language_data, preprocess_text
import os

def main():
    """Preprocess the raw texts of each language and write them to disk.

    For every language in the hard-coded list, the texts returned by
    ``load_language_data("data/raw", lang)`` are passed one by one through
    ``preprocess_text`` and written, UTF-8 encoded, to
    ``data/processed/<lang>/processed_texts.txt``. The output directory is
    created if it does not exist, and an existing output file is overwritten.

    Each preprocessed text is written as its own newline-terminated record.
    """
    languages = ["yoruba", "igbo", "hausa"]
    for lang in languages:
        data = load_language_data("data/raw", lang)
        processed_data = [preprocess_text(text) for text in data]

        output_dir = f"data/processed/{lang}"
        os.makedirs(output_dir, exist_ok=True)

        with open(f"{output_dir}/processed_texts.txt", 'w', encoding='utf-8') as f:
            # writelines() does not insert separators, so terminate each
            # record ourselves; texts that already end in a newline are
            # left untouched to avoid introducing blank lines.
            f.writelines(
                text if text.endswith("\n") else f"{text}\n"
                for text in processed_data
            )