Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

RoBERTa analysis 🚧

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Pretrained RoBERTa model fine-tuned for tweet sentiment
# (class order used below: 0 = negative, 1 = neutral, 2 = positive).
link = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(link)
model = AutoModelForSequenceClassification.from_pretrained(link)
# RoBERTa-base accepts at most 512 tokens; the original value (571)
# allowed encodings longer than the model's position embeddings,
# which crashes the forward pass on long reviews.
tokenizer.model_max_length = 512

def main():
    """Score the first 100 reviews with the RoBERTa sentiment model.

    Reads assets/Reviews.csv, runs each review's text through
    roberta_polarity_scores, and prints the head of a DataFrame with
    one row per review: its star "score" plus neg/neu/pos probabilities.
    """
    data = pd.read_csv("assets/Reviews.csv").head(100)

    result = {}
    for index, row in data.iterrows():
        pol = roberta_polarity_scores(row["Text"])
        result[index] = {"score": row["Score"], **pol}

    # orient="index" keeps one review per ROW; the original
    # pd.DataFrame(result) built the table transposed (one review
    # per COLUMN), so .head() showed the wrong axis.
    result = pd.DataFrame.from_dict(result, orient="index")
    print(result.head())



def roberta_polarity_scores(text):
    """Return sentiment probabilities for *text* as a dict.

    Tokenizes with truncation so inputs longer than the tokenizer's
    model_max_length are clipped instead of crashing the model (the
    original call did not truncate, so over-length reviews raised an
    error inside the forward pass).

    Args:
        text: the review text to score.

    Returns:
        dict with keys "neg", "neu", "pos" mapping to softmax
        probabilities (floats that sum to 1).
    """
    encoded_text = tokenizer(text, return_tensors="pt", truncation=True)
    output = model(**encoded_text)
    # Logits for the single input sequence -> probabilities.
    probs = softmax(output[0][0].detach().numpy())

    return {
        "neg": probs[0],
        "neu": probs[1],
        "pos": probs[2],
        # TODO: compound (VADER-style aggregate) not yet implemented
    }