简体   繁体   中英

Using CSV file for Azure Sentiment Analysis

I'm trying to do a sentiment analysis on newspaper articles using Azure Cognitive Services (explanation here )

It works perfectly fine for individual sentences; however, I'm struggling to get it to work for a CSV file that contains a list of quotes. I believe I'm doing something wrong when assigning the documents variable (the highlighted line below):

def sentiment_analysis_example(client):
    """Analyze one hard-coded document with the given Text Analytics
    client and print document-level, overall, and per-sentence
    sentiment scores.

    Args:
        client -- An Azure TextAnalyticsClient (from
            azure.ai.textanalytics) already authenticated.
    """
    # NOTE(review): the web extraction had flattened this function's
    # indentation and wrapped the next line in markdown `**` markers;
    # restored to valid Python here.
    documents = ["I had the best day of my life. I wish you were there with me."]
    response = client.analyze_sentiment(documents=documents)[0]
    print("Document Sentiment: {}".format(response.sentiment))
    print("Overall scores: positive={0:.2f}; neutral={1:.2f}; negative={2:.2f} \n".format(
        response.confidence_scores.positive,
        response.confidence_scores.neutral,
        response.confidence_scores.negative,
    ))
    for idx, sentence in enumerate(response.sentences):
        print("Sentence: {}".format(sentence.text))
        print("Sentence {} sentiment: {}".format(idx+1, sentence.sentiment))
        print("Sentence score:\nPositive={0:.2f}\nNeutral={1:.2f}\nNegative={2:.2f}\n".format(
            sentence.confidence_scores.positive,
            sentence.confidence_scores.neutral,
            sentence.confidence_scores.negative,
        ))

sentiment_analysis_example(client)

I want to try a more efficient way rather than copy + pasting individual sentences into the "documents" section. I have tried creating a pandas data frame via

import pandas as pd

df = pd.read_csv('/Users/../Desktop/trial-sun.csv', sep=';')

However, when I reference this data frame in `documents = []`, I get an error saying

"TypeError: Mixing string and dictionary/object document input unsupported."

My guess is that what I'm passing on there needs to be of a different format but I'm not sure how to go about it.

The following code worked for me:

import pandas as pd
import requests

# Azure Cognitive Services configuration.
# NOTE(review): "<>" placeholders must be replaced with a real key and
# resource name before running.
subscription_key = "<>"
headers = {"Ocp-Apim-Subscription-Key": subscription_key}
endpoint = "https://<>.cognitiveservices.azure.com/"
# Text Analytics v3.0 sentiment REST endpoint. `endpoint` already ends
# with "/", so the joined URL contains "//" -- harmless to the service,
# but worth knowing when reading logs.
sentiment_url = endpoint + "/text/analytics/v3.0/sentiment"


def comment_sentiment(comment=None, cid=None):
    """
    Analyze the sentiment of a single comment via the Azure Text
    Analytics v3.0 REST endpoint.

    Args:
        comment -- The text content to analyze.
        cid -- The id to tag the document with (echoed back by the API).

    Returns:
        The parsed JSON response as a dict, or None if the request or
        JSON decoding failed.
    """
    language = "en"
    try:
        document = {"id": cid, "language": language, "text": comment}
        body = {"documents": [document]}
        res = requests.post(sentiment_url, headers=headers, json=body)
        return res.json()
    except Exception as e:
        # Bug fix: the original printed e.errno / e.strerror, which only
        # exist on OSError -- every other exception type raised an
        # AttributeError inside this handler. Print the exception itself.
        print("[Error] {0}".format(e))
        return None  # explicit failure sentinel for comment_summary


def comment_summary(sentimentResult):
    """
    Summarize one response dict from comment_sentiment.

    Args:
        sentimentResult -- Parsed JSON dict returned by the sentiment
            API, or None / an error payload if the request failed.

    Returns:
        A dict with keys 'Id', 'Sentiment', 'Positive', 'Neutral',
        'Negative'. The three counters hold how many sentences of each
        polarity were found; 'Sentiment' is the capitalized
        document-level label. On a failed/errored response the default
        zeroed summary is returned instead of raising.
    """
    summary = {"Id": 0, "Sentiment": "",
               "Positive": 0, "Neutral": 0, "Negative": 0}
    # Robustness fix: comment_sentiment returns None on failure, and the
    # API returns an 'error' body (no 'documents' key) on bad requests;
    # the original raised TypeError/KeyError in both cases.
    if not sentimentResult or 'documents' not in sentimentResult:
        return summary
    for document in sentimentResult['documents']:
        summary["Sentiment"] = document['sentiment'].capitalize()
        summary["Id"] = document['id']
        for sentence in document['sentences']:
            polarity = sentence['sentiment']
            if polarity == 'positive':
                summary["Positive"] += 1
            elif polarity == 'negative':
                summary["Negative"] += 1
            else:
                summary["Neutral"] += 1
    return summary


def main(comment_df):
    """
    Analyze the sentiment of every comment in the data frame and
    collect the results.

    Args:
        comment_df -- Data frame with a "comment" column holding the
            text to analyze. Mutated in place (index reset, result
            columns added).
    Returns:
        A data frame with the columns 'id', 'sentiment', 'positive',
        'negative', 'neutral' -- one row per input comment.
    """
    df2 = comment_df
    # Drop any existing index and use a new one so .loc[i, ...] below
    # lines up with the 0..n-1 iteration counter.
    df2.reset_index(drop=True, inplace=True)
    # Bug fix: the original referenced `dfx` after the loop but only
    # assigned it inside the loop, so an empty frame raised NameError.
    if df2.empty:
        return pd.DataFrame(
            columns=['id', 'sentiment', 'positive', 'negative', 'neutral'])
    print(u"Processing records in data frame....")
    # iterrows() was dropped: the row object was never used.
    for i in range(len(df2)):
        # Strip non-ASCII characters the service may choke on.
        text_data = df2.loc[i, "comment"].encode(
            "utf-8").decode("ascii", "ignore")
        sentiment_result = comment_sentiment(text_data, i + 1)
        sentiment_summary = comment_summary(sentiment_result)
        # Record the per-comment result alongside the original row.
        df2.loc[i, "id"] = i + 1
        df2.loc[i, "sentiment"] = sentiment_summary['Sentiment']
        df2.loc[i, "positive"] = sentiment_summary['Positive']
        df2.loc[i, "negative"] = sentiment_summary['Negative']
        df2.loc[i, "neutral"] = sentiment_summary['Neutral']
    print(u"Processing completed....")
    # Select the result columns once, after the loop (the original
    # rebuilt this selection on every iteration).
    dfx = df2[['id', 'sentiment', 'positive', 'negative', 'neutral']]
    # Ensure that numbers are represented as integers and not float.
    convert_dict = {'id': int,
                    'positive': int,
                    'negative': int,
                    'neutral': int,
                    'sentiment': str}
    dfx = dfx.astype(convert_dict)
    return dfx


if __name__ == "__main__":
    # read comment data from csv
    commentData = pd.read_csv(
        "https://raw.githubusercontent.com/JimXu199545/data/main/comment.csv", header=0, names=["comment"])
    commentData['nwords'] = commentData.comment.apply(lambda x: len(x.split()))
    commentData['hashed'] = commentData.comment.apply(
        lambda x: hash("".join(x.split())))

    # Remove duplicated record but keep the first occurence of the record
    commentData.drop_duplicates(keep='first', inplace=True)
    # Reindex the data frame to prevent gaps in the indexes
    commentData.reset_index(drop=True, inplace=True)
    df = main(commentData)
    df.to_csv('d:\\result.csv', index=False, header=True)

Also, please refer these following links for more information:
Reference1 , Reference2

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM