from llama_cpp import Llama
from huggingface_hub import hf_hub_download


def load_model() -> Llama:
    """Download the model from the Hugging Face Hub and load it."""
    try:
        model = Llama(
            model_path=hf_hub_download(
                repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
                filename="Phi-3-mini-4k-instruct-q4.gguf",
            ),
            n_ctx=4096,      # context window size in tokens
            n_threads=8,     # CPU threads used for inference
            n_gpu_layers=0,  # 0 = run entirely on CPU
            # Note: `stop` is not a Llama.__init__ parameter; stop sequences
            # are passed to the generation call instead (see usage below).
        )
        return model
    except Exception as e:
        raise RuntimeError(f"Failed to load model: {e}") from e
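
# A minimal usage sketch. The original passed stop=["\n", " Q:"] to the
# constructor, but in llama-cpp-python `stop` belongs to the generation call
# (Llama.__call__ / create_completion), so it is applied here instead.
# The prompt string is illustrative, not from the original.
if __name__ == "__main__":
    llm = load_model()
    output = llm(
        "Q: Name the planets in the solar system. A:",
        max_tokens=64,
        stop=["\n", " Q:"],  # stop at a newline or the start of a new question
        echo=False,          # do not repeat the prompt in the output
    )
    print(output["choices"][0]["text"])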