import os
import shutil
import time

import gradio as gr
import psutil
import torch
from llama_cpp import Llama
from transformers import AutoTokenizer, AutoModelForCausalLM
def run_test(model_type, repo_id, file_name, test_prompt, max_new_tokens,
             n_ctx, max_tokens, temperature, top_p, top_k):
    result = {}

    # Free disk space before any download (index 2 of disk_usage is "free").
    disk_before = shutil.disk_usage("/")[2]

    start_time = time.time()
    process = psutil.Process(os.getpid())
    cpu_start = process.cpu_percent(interval=0.1)
    mem_start = process.memory_info().rss
    try:
        if model_type == "transformers":
            tokenizer = AutoTokenizer.from_pretrained(repo_id)
            model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
            inputs = tokenizer(test_prompt, return_tensors="pt")
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=True,  # without this, temperature/top_p/top_k are ignored
                    temperature=temperature,
                    top_p=top_p,
                    top_k=top_k,
                )
            output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        else:
            gguf_path = f"./{file_name}"
            if not os.path.exists(gguf_path):
                from huggingface_hub import hf_hub_download
                # local_dir_use_symlinks=False stores a real copy in the
                # working directory, so the disk-usage delta below reflects
                # the download size (the flag is deprecated and ignored in
                # newer huggingface_hub releases).
                hf_hub_download(
                    repo_id=repo_id,
                    filename=file_name,
                    local_dir="./",
                    local_dir_use_symlinks=False,
                )
            llm = Llama(model_path=gguf_path, n_ctx=n_ctx)
            output_text = llm(
                test_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
            )["choices"][0]["text"]
    except Exception as e:
        # Surface the error in the first output slot; leave the metrics blank.
        return f"❌ Error: {e}", "", "", "", ""
    end_time = time.time()
    mem_end = process.memory_info().rss
    cpu_end = process.cpu_percent(interval=0.1)
    disk_after = shutil.disk_usage("/")[2]

    result["output"] = output_text
    result["inference_time"] = round(end_time - start_time, 2)
    # RSS delta; note that llama.cpp mmaps GGUF weights by default, so this
    # can understate the true working set for GGUF models.
    result["memory_used_MB"] = round((mem_end - mem_start) / (1024 * 1024), 2)
    # Difference between two short cpu_percent samples: a rough, noisy signal,
    # not an average over the whole run.
    result["cpu_percent"] = round(cpu_end - cpu_start, 2)
    # Free-space delta, i.e. approximately the bytes downloaded during the run.
    result["disk_used_MB"] = round((disk_before - disk_after) / (1024 * 1024), 2)
    return (
        result["output"],
        f"{result['inference_time']} sec",
        f"{result['cpu_percent']}%",
        f"{result['memory_used_MB']} MB",
        f"{result['disk_used_MB']} MB",
    )
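
# Optional smoke test for run_test outside the Gradio UI. A minimal sketch,
# not part of the original Space: the repo ID and file name below are
# placeholder assumptions; substitute any small GGUF checkpoint you can access.
if os.environ.get("RUN_SMOKE_TEST"):
    print(run_test(
        "gguf",
        "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",  # assumed placeholder repo
        "tinyllama-1.1b-chat-v1.0.Q4_0.gguf",      # assumed placeholder file
        "Hello, world.",
        50,    # max_new_tokens (unused on the GGUF path)
        2048,  # n_ctx
        64,    # max_tokens
        0.7,   # temperature
        0.9,   # top_p
        50,    # top_k
    ))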
gr.Interface(
    fn=run_test,
    inputs=[
        gr.Dropdown(["transformers", "gguf"], label="Model Type"),
        gr.Textbox(label="Repo ID (e.g., TheBloke/Mistral-7B-Instruct-v0.1-GGUF)"),
        gr.Textbox(label="Model File Name (GGUF only)", placeholder="mistral.Q4_0.gguf"),
        gr.Textbox(label="Test Prompt", value="What is the treatment for lumbar disc herniation?"),
        gr.Slider(1, 16384, value=50, step=1, label="Max New Tokens (transformers only)"),
        gr.Slider(256, 32768, value=2048, step=64, label="n_ctx (GGUF only)"),
        gr.Slider(1, 16384, value=128, step=1, label="Max Tokens (GGUF only)"),
        gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperature"),
        gr.Slider(0.0, 1.0, value=0.9, step=0.01, label="Top-p"),
        gr.Slider(0, 100, value=50, step=1, label="Top-k"),
    ],
    outputs=[
        gr.Textbox(label="Model Output"),
        gr.Textbox(label="Inference Time"),
        gr.Textbox(label="CPU Usage"),
        gr.Textbox(label="RAM Usage"),
        gr.Textbox(label="Disk Usage (downloaded size)"),
    ],
    title="🧪 Model Benchmark Tester - HF CPU Space",
    description="Enter a repo ID (and a model file name for GGUF) to benchmark GGUF or Transformers models on CPU. Adjust the generation hyperparameters as needed.",
).launch()
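
# The interface can also be driven programmatically with the gradio_client
# package (a sketch, assuming the app is running locally on the default port;
# gradio_client is not a dependency of the original Space):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   print(client.predict("gguf", "<repo-id>", "<file.gguf>", "Hello",
#                        50, 2048, 64, 0.7, 0.9, 50, api_name="/predict"))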