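"""Gradio demo comparing question-answering latency across three BERT-large
SQuAD variants: the base model, a pruned model, and a pruned model exported
to ONNX and optimized with FP16 weights."""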
import time

import gradio as gr
import torch
from huggingface_hub import hf_hub_download
from onnxruntime import InferenceSession
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
# Display name -> Hugging Face Hub repo id.
models = {
    "Base model": "bert-large-uncased-whole-word-masking-finetuned-squad",
    "Pruned model": "madlag/bert-large-uncased-wwm-squadv2-x2.63-f82.6-d16-hybrid-v1",
    "Pruned ONNX Optimized FP16": "tryolabs/bert-large-uncased-wwm-squadv2-optimized-f16",
}
def run_ort_inference(model_name, inputs):
    # Fetch the exported ONNX graph from the Hub and run it on CPU;
    # only the session run itself counts toward the reported time.
    model_path = hf_hub_download(repo_id=models[model_name], filename="model.onnx")
    sess = InferenceSession(model_path, providers=["CPUExecutionProvider"])
    start_time = time.time()
    output = sess.run(None, input_feed=inputs)
    end_time = time.time()
    return (output[0], output[1]), (end_time - start_time)
def run_normal_hf(model_name, inputs):
    # Load the PyTorch model outside the timed region so that only the
    # forward pass counts toward the reported inference time.
    model = AutoModelForQuestionAnswering.from_pretrained(models[model_name])
    start_time = time.time()
    with torch.no_grad():
        output = model(**inputs)
    end_time = time.time()
    return (output.start_logits, output.end_logits), (end_time - start_time)
def inference(model_name, context, question):
    tokenizer = AutoTokenizer.from_pretrained(models[model_name])
    if model_name == "Pruned ONNX Optimized FP16":
        # onnxruntime expects plain NumPy arrays keyed by input name.
        inputs = dict(tokenizer(question, context, return_tensors="np"))
        output, inference_time = run_ort_inference(model_name, inputs)
        answer_start_scores = torch.tensor(output[0])
        answer_end_scores = torch.tensor(output[1])
    else:
        inputs = tokenizer(question, context, return_tensors="pt")
        output, inference_time = run_normal_hf(model_name, inputs)
        answer_start_scores, answer_end_scores = output
    input_ids = inputs["input_ids"].tolist()[0]
    # The answer span runs from the most likely start token to the most
    # likely end token (inclusive, hence the +1 on the slice bound).
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    )
    return answer, f"{inference_time:.4f}s"
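# A minimal sketch of calling the pipeline directly, without the Gradio UI.
# The context and question are made-up example values; left commented out so
# the Space does not run a model at import time.
#
# answer, elapsed = inference(
#     "Pruned ONNX Optimized FP16",
#     "The Amazon rainforest covers much of the Amazon basin of South America.",
#     "What does the Amazon rainforest cover?",
# )
# print(answer, elapsed)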
model_field = gr.Dropdown(
    choices=["Base model", "Pruned model", "Pruned ONNX Optimized FP16"],
    value="Pruned ONNX Optimized FP16",
    label="Model",
)
input_text_field = gr.Textbox(placeholder="Enter the text here", label="Text")
input_question_field = gr.Text(placeholder="Enter the question here", label="Question")
output_model = gr.Text(label="Model output")
output_inference_time = gr.Text(label="Inference time in seconds")
demo = gr.Interface(
    inference,
    title="Optimizing Transformers - Question Answering Demo",
    inputs=[model_field, input_text_field, input_question_field],
    outputs=[output_model, output_inference_time],
)
demo.launch()