import time
import gradio as gr
import os
import psutil
import shutil
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_cpp import Llama
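
# run_test loads the requested model either with transformers or, for GGUF files,
# with llama-cpp-python, generates a completion for the prompt, and reports the
# inference time plus the CPU, RAM, and disk deltas measured around the run.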
def run_test(model_type, repo_id, file_name, test_prompt, max_new_tokens, n_ctx, max_tokens, temperature, top_p, top_k):
    result = {}

    # Snapshot free disk space, wall-clock time, CPU, and resident memory before the run.
    disk_before = shutil.disk_usage("/").free
    start_time = time.time()
    process = psutil.Process(os.getpid())
    process.cpu_percent(interval=None)  # prime the counter; the next call reports usage since this point
    mem_start = process.memory_info().rss
    try:
        if model_type == "transformers":
            # Transformers path: load tokenizer and model from the Hub, then generate.
            tokenizer = AutoTokenizer.from_pretrained(repo_id)
            model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
            inputs = tokenizer(test_prompt, return_tensors="pt")
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=True,  # sampling must be enabled for temperature/top_p/top_k to take effect
                    temperature=temperature,
                    top_p=top_p,
                    top_k=top_k
                )
            output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        else:
            # GGUF path: download the file once, then run it with llama-cpp-python.
            gguf_path = f"./{file_name}"
            if not os.path.exists(gguf_path):
                from huggingface_hub import hf_hub_download
                hf_hub_download(repo_id=repo_id, filename=file_name, local_dir="./", local_dir_use_symlinks=False)
            llm = Llama(
                model_path=gguf_path,
                n_ctx=n_ctx
            )
            output_text = llm(
                test_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k
            )["choices"][0]["text"]
    except Exception as e:
        # Surface the error in the output box and leave the metric fields empty.
        return f"❌ Error: {str(e)}", "", "", "", ""

    # Snapshot resources again and report the deltas.
    end_time = time.time()
    mem_end = process.memory_info().rss
    cpu_end = process.cpu_percent(interval=None)  # average CPU% since the priming call above
    disk_after = shutil.disk_usage("/").free

    result["output"] = output_text
    result["inference_time"] = round(end_time - start_time, 2)
    result["memory_used_MB"] = round((mem_end - mem_start) / (1024 * 1024), 2)
    result["cpu_percent"] = round(cpu_end, 2)
    result["disk_used_MB"] = round((disk_before - disk_after) / (1024 * 1024), 2)

    return (
        result["output"],
        f"{result['inference_time']} sec",
        f"{result['cpu_percent']}%",
        f"{result['memory_used_MB']} MB",
        f"{result['disk_used_MB']} MB"
    )
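
# Gradio front end: wires the form controls to run_test and displays the generated
# text alongside the measured inference time, CPU, RAM, and disk usage.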
gr.Interface(
    fn=run_test,
    inputs=[
        gr.Dropdown(["transformers", "gguf"], label="Model Type"),
        gr.Textbox(label="Repo ID (e.g., TheBloke/Mistral-7B-Instruct-v0.1-GGUF)"),
        gr.Textbox(label="Model File Name (only for GGUF)", placeholder="mistral.Q4_0.gguf"),
        gr.Textbox(label="Test Prompt", value="What is the treatment for lumbar disc herniation?"),
        gr.Slider(1, 16384, value=50, step=1, label="Max New Tokens"),
        gr.Slider(256, 32768, value=2048, step=64, label="n_ctx (GGUF only)"),
        gr.Slider(1, 16384, value=128, step=1, label="Max Tokens (GGUF only)"),
        gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperature"),
        gr.Slider(0.0, 1.0, value=0.9, step=0.01, label="Top-p"),
        gr.Slider(0, 100, value=50, step=1, label="Top-k")
    ],
    outputs=[
        gr.Textbox(label="Model Output"),
        gr.Textbox(label="Inference Time"),
        gr.Textbox(label="CPU Usage"),
        gr.Textbox(label="RAM Usage"),
        gr.Textbox(label="Disk Usage (downloaded size)")
    ],
    title="🧪 Model Benchmark Tester - HF CPU Space",
    description="Input repo and model file name to benchmark GGUF or Transformers models. Adjust generation hyperparameters as needed."
).launch()
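
# A minimal sketch (commented out) of calling run_test directly, without the UI.
# The repo ID and file name are illustrative placeholders taken from the form
# labels above, not values shipped with this Space:
#
# output, t, cpu, ram, disk = run_test(
#     "gguf",
#     "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
#     "mistral.Q4_0.gguf",
#     "What is the treatment for lumbar disc herniation?",
#     max_new_tokens=50, n_ctx=2048, max_tokens=128,
#     temperature=0.7, top_p=0.9, top_k=50,
# )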