import os
import shutil
import time

import gradio as gr
import psutil
import torch
from llama_cpp import Llama
from transformers import AutoTokenizer, AutoModelForCausalLM


def run_test(model_type, repo_id, file_name, test_prompt,
             max_new_tokens, n_ctx, max_tokens,
             temperature, top_p, top_k):
    result = {}

    # Free disk space before any model download (bytes free on the root filesystem)
    disk_before = shutil.disk_usage("/").free

    start_time = time.time()
    process = psutil.Process(os.getpid())
    cpu_start = process.cpu_percent(interval=0.1)
    mem_start = process.memory_info().rss

    try:
        if model_type == "transformers":
            tokenizer = AutoTokenizer.from_pretrained(repo_id)
            model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
            inputs = tokenizer(test_prompt, return_tensors="pt")
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=True,  # enable sampling so temperature/top_p/top_k take effect
                    temperature=temperature,
                    top_p=top_p,
                    top_k=top_k
                )
            output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        else:
            # GGUF path: download the file from the Hub if it is not already present locally
            gguf_path = f"./{file_name}"
            if not os.path.exists(gguf_path):
                from huggingface_hub import hf_hub_download
                hf_hub_download(
                    repo_id=repo_id,
                    filename=file_name,
                    local_dir="./",
                    local_dir_use_symlinks=False
                )
            llm = Llama(model_path=gguf_path, n_ctx=n_ctx)
            output_text = llm(
                test_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k
            )["choices"][0]["text"]
    except Exception as e:
        return f"❌ Error: {str(e)}", "", "", "", ""

    end_time = time.time()
    mem_end = process.memory_info().rss
    cpu_end = process.cpu_percent(interval=0.1)
    disk_after = shutil.disk_usage("/").free

    result["output"] = output_text
    result["inference_time"] = round(end_time - start_time, 2)
    result["memory_used_MB"] = round((mem_end - mem_start) / (1024 * 1024), 2)
    result["cpu_percent"] = round(cpu_end - cpu_start, 2)
    # Free space shrinks when a model is downloaded, so before - after = bytes written to disk
    result["disk_used_MB"] = round((disk_before - disk_after) / (1024 * 1024), 2)

    return (
        result["output"],
        f"{result['inference_time']} sec",
        f"{result['cpu_percent']}%",
        f"{result['memory_used_MB']} MB",
        f"{result['disk_used_MB']} MB"
    )


gr.Interface(
    fn=run_test,
    inputs=[
        gr.Dropdown(["transformers", "gguf"], label="Model Type"),
        gr.Textbox(label="Repo ID (e.g., TheBloke/Mistral-7B-Instruct-v0.1-GGUF)"),
        gr.Textbox(label="Model File Name (only for GGUF)", placeholder="mistral.Q4_0.gguf"),
        gr.Textbox(label="Test Prompt", value="What is the treatment for lumbar disc herniation?"),
        gr.Slider(1, 16384, value=50, step=1, label="Max New Tokens"),
        gr.Slider(256, 32768, value=2048, step=64, label="n_ctx (GGUF only)"),
        gr.Slider(1, 16384, value=128, step=1, label="Max Tokens (GGUF only)"),
        gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperature"),
        gr.Slider(0.0, 1.0, value=0.9, step=0.01, label="Top-p"),
        gr.Slider(0, 100, value=50, step=1, label="Top-k")
    ],
    outputs=[
        gr.Textbox(label="Model Output"),
        gr.Textbox(label="Inference Time"),
        gr.Textbox(label="CPU Usage"),
        gr.Textbox(label="RAM Usage"),
        gr.Textbox(label="Disk Usage (downloaded size)")
    ],
    title="🧪 Model Benchmark Tester - HF CPU Space",
    description="Input repo and model file name to benchmark GGUF or Transformers models. Adjust generation hyperparameters as needed."
).launch()
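
# A minimal sketch of the dependencies this script imports (assuming it is saved
# as app.py in a CPU Space); a requirements.txt along these lines should cover it,
# with version pins chosen to match your environment:
#
#   gradio
#   psutil
#   torch
#   transformers
#   llama-cpp-python
#   huggingface_hub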