# Streamlit app: extract the word/phrase in a tweet that expresses a chosen sentiment,
# framed as extractive question answering (sentiment = question, tweet = context).
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import streamlit as st
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForQuestionAnswering
# roberta-base checkpoint fine-tuned for tweet sentiment extraction; its tokenizer
# turns the (sentiment, text) pair into input IDs the model can consume.
model_checkpoint = "Modfiededition/roberta-fine-tuned-tweet-sentiment-extractor"


# FIX: cache both loaders — without caching, Streamlit re-downloads / rebuilds the
# tokenizer and the full TF model on every widget interaction (each script rerun).
@st.cache_resource
def load_tokenizer():
    """Load (once per session) the tokenizer matching the fine-tuned checkpoint."""
    return AutoTokenizer.from_pretrained(model_checkpoint)


@st.cache_resource
def load_model():
    """Load (once per session) the TF extractive question-answering model."""
    return TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)


tokenizer = load_tokenizer()
model = load_model()
# ---- UI prompts ----
st.title("Tweet Sentiment Extractor...")

# Tweet-like text input, capped at 500 characters.
textbox = st.text_area('Write your text in this box:', '', height=100, max_chars=500)

# Sentiment whose supporting word/phrase should be extracted from the text.
option = st.selectbox(
    'Pick the sentiment',
    ('positive', 'negative', 'neutral'))

# Wrap the single example as a datasets.Dataset so we can reuse .map() below.
python_dict = {"text": [textbox], "sentiment": [option]}
dataset = Dataset.from_dict(python_dict)

# Maximum tokenized length of the (sentiment, text) pair.
# NOTE(review): presumably sized for 500-char tweets — confirm against the tokenizer.
MAX_LENGTH = 105

button = st.button('Click here to extract the word/phrase from the text with the given sentiment: {0}..'.format(option))

# Prompt shown while the textbox is still empty.
if not textbox:
    st.markdown("## " + "Please write your text above!")
if button:
    if not textbox:
        st.markdown("#### " + "Please write something in the above textbox..")
    else:
        with st.spinner('In progress.......'):

            def process_data(examples):
                """Tokenize (sentiment, text) pairs for extractive QA.

                The sentiment string is the "question" and the tweet text is the
                "context".  Offset mappings of every non-context token are set to
                None so post-processing can restrict answers to context tokens.
                """
                questions = examples["sentiment"]
                context = examples["text"]
                inputs = tokenizer(
                    questions,
                    context,
                    max_length=MAX_LENGTH,
                    # FIX: without truncation a 500-char tweet can tokenize to
                    # more than MAX_LENGTH tokens; truncate the context only.
                    truncation="only_second",
                    padding="max_length",
                    return_offsets_mapping=True,
                )
                # Null out offsets of question + special tokens (sequence id != 1).
                for i in range(len(inputs["input_ids"])):
                    offset = inputs["offset_mapping"][i]
                    sequence_ids = inputs.sequence_ids(i)
                    inputs["offset_mapping"][i] = [
                        o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
                    ]
                return inputs

            processed_raw_data = dataset.map(
                process_data,
                batched=True,
            )

            # The model only needs token ids and the attention mask.
            tf_raw_dataset = processed_raw_data.to_tf_dataset(
                columns=["input_ids", "attention_mask"],
                shuffle=False,
                batch_size=1,
            )

            # Forward pass: per-token start/end logits for the answer span.
            outputs = model.predict(tf_raw_dataset)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            # Post-processing: pick the best valid span from the top candidates.
            n_best = 20  # number of highest-scoring start/end positions to try

            def predict_answers(inputs):
                """Decode the first valid context span from the n-best start/end logits."""
                predicted_answer = []
                for i in range(len(inputs["offset_mapping"])):
                    start_logit = inputs["start_logits"][i]
                    end_logit = inputs["end_logits"][i]
                    context = inputs["text"][i]
                    offset = inputs["offset_mapping"][i]
                    start_indexes = np.argsort(start_logit)[-1:-n_best - 1:-1].tolist()
                    end_indexes = np.argsort(end_logit)[-1:-n_best - 1:-1].tolist()
                    # FIX: `answer` was unbound when no valid span existed, raising
                    # NameError at the fallback append; default to an empty answer.
                    answer = ""
                    found = False
                    for start_index in start_indexes:
                        for end_index in end_indexes:
                            # Skip spans that are not fully inside the context.
                            if offset[start_index] is None or offset[end_index] is None:
                                continue
                            # Skip negative-length spans.
                            if end_index < start_index:
                                continue
                            answer = context[offset[start_index][0]:offset[end_index][1]]
                            found = True
                            break
                        if found:
                            break
                    predicted_answer.append(answer)
                return {"predicted_answer": predicted_answer}

            # Join the logits (and the raw text) back onto the tokenized examples.
            processed_raw_data.set_format("pandas")
            processed_raw_df = processed_raw_data[:]
            processed_raw_df["start_logits"] = start_logits.tolist()
            processed_raw_df["end_logits"] = end_logits.tolist()
            processed_raw_df["text"] = python_dict["text"]

            final_data = Dataset.from_pandas(processed_raw_df)
            final_data = final_data.map(predict_answers, batched=True)

            # Display the extracted word/phrase.
            st.markdown("## " + final_data["predicted_answer"][0])