Dataset and models associated with the TweetTopic datasets (tweet classification)
11 items
This is a RoBERTa-base model trained on ~124M tweets from January 2018 to December 2021 (see here), and fine-tuned for single-label topic classification on a corpus of 6,997 tweets. The original RoBERTa-base model can be found here, and the original reference paper is the TweetEval paper. This model is suitable for English text.
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
# Hugging Face Hub identifier of the fine-tuned single-label topic classifier.
# A plain string literal: there is nothing to interpolate, so no f-prefix.
MODEL = "cardiffnlp/tweet-topic-21-single"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Lookup from class index to topic label, taken from the model config.
class_mapping = model.config.id2label

text = "Tesla stock is on the rise!"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# output[0] is expected to be the logits tensor (see the transformers
# sequence-classification output docs); the second [0] selects the only
# example in the batch. detach() drops autograd tracking so the tensor can
# be converted to a NumPy array.
scores = output[0][0].detach().numpy()
# Turn the raw scores into a probability distribution over topics.
scores = softmax(scores)

# TF
# TensorFlow alternative: uncomment the lines below (and comment out the
# PT section above) to run the same example with the TF model class.
#model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
#class_mapping = model.config.id2label
#text = "Tesla stock is on the rise!"
#encoded_input = tokenizer(text, return_tensors='tf')
#output = model(**encoded_input)
#scores = output[0][0]
#scores = softmax(scores)
# Print every topic with its probability, most likely first.
# np.argsort sorts ascending, so reverse it to get a descending ranking.
ranking = np.argsort(scores)[::-1]
for rank, idx in enumerate(ranking, start=1):
    # Cast the NumPy integer to a plain int before the dict lookup.
    label = class_mapping[int(idx)]
    score = scores[idx]
    print(f"{rank}) {label} {np.round(float(score), 4)}")
1) business_&_entrepreneurs 0.8361
2) science_&_technology 0.0904
3) pop_culture 0.0288
4) daily_life 0.0178
5) arts_&_culture 0.0137
6) sports_&_gaming 0.0133