import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
link = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(link)
model = AutoModelForSequenceClassification.from_pretrained(link)
tokenizer.model_max_length = 571


def main():
    # vader = SentimentIntensityAnalyzer()
    # print("Vader: ", vader.polarity_scores(example))

    # Load the Amazon fine food reviews and keep the first 100 rows for a quick run.
    data = pd.read_csv("assets/Reviews.csv")
    data = data.head(100)

    # Score each review with RoBERTa and keep the star rating alongside
    # the predicted sentiment probabilities.
    result = {}
    for index, row in data.iterrows():
        pol = roberta_polarity_scores(row["Text"])
        result[index] = {
            "score": row["Score"],
            "neg": pol["neg"],
            "neu": pol["neu"],
            "pos": pol["pos"],
        }

    # Transpose so each review becomes a row with columns score/neg/neu/pos.
    result = pd.DataFrame(result).T
    print(result.head())
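

# Sketch of a rule-based VADER baseline to pair with the commented-out lines
# in main(). vader_polarity_scores is a helper name assumed here, and it
# assumes the lexicon has been fetched once via nltk.download("vader_lexicon").
def vader_polarity_scores(text):
    # Returns a dict with neg/neu/pos proportions plus a compound score in [-1, 1].
    vader = SentimentIntensityAnalyzer()
    return vader.polarity_scores(text)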


def roberta_polarity_scores(text):
    # Tokenize, truncating to the 512-token limit so long reviews do not
    # overflow RoBERTa's position embeddings.
    encoded_text = tokenizer(text, return_tensors="pt", truncation=True)
    output = model(**encoded_text)
    # Softmax over the raw logits gives negative/neutral/positive probabilities
    # (label order for cardiffnlp/twitter-roberta-base-sentiment).
    scores = softmax(output.logits[0].detach().numpy())
    return {
        "neg": scores[0],
        "neu": scores[1],
        "pos": scores[2],
        # TODO compound
    }
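

# Run the pipeline when executed as a script; without this call main() never runs.
if __name__ == "__main__":
    main()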