from llama_cpp import Llama
from huggingface_hub import hf_hub_download


def load_model() -> Llama:
    """Download the model from the Hugging Face Hub and load it."""
    try:
        model = Llama(
            model_path=hf_hub_download(
                repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
                filename="Phi-3-mini-4k-instruct-q4.gguf",
            ),
            n_ctx=4096,      # context window size in tokens
            n_threads=8,     # CPU threads used for inference
            n_gpu_layers=0,  # 0 = run entirely on CPU
            # Note: `stop` is not a Llama.__init__ parameter; stop sequences
            # are passed to the generation call instead (see usage below).
        )
        return model
    except Exception as e:
        raise RuntimeError(f"Failed to load model: {e}") from e
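
# A minimal usage sketch. The original passed stop=["\n", " Q:"] to the
# constructor, but in llama-cpp-python `stop` belongs to the generation call
# (Llama.__call__ / create_completion), so it is applied here instead.
# The prompt string is illustrative, not from the original.
if __name__ == "__main__":
    llm = load_model()
    output = llm(
        "Q: Name the planets in the solar system. A:",
        max_tokens=64,
        stop=["\n", " Q:"],  # stop at a newline or the start of a new question
        echo=False,          # do not repeat the prompt in the output
    )
    print(output["choices"][0]["text"])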