I'm trying to do sentiment analysis on newspaper articles using Azure Cognitive Services (explanation here).
It works perfectly fine for individual sentences; however, I'm struggling to get it to work for a CSV file that contains a list of quotes. I believe I'm doing something wrong when assigning the documents, i.e. the **documents** line below:
def sentiment_analysis_example(client):
    documents = ["I had the best day of my life. I wish you were there with me."]
    response = client.analyze_sentiment(documents=documents)[0]
    print("Document Sentiment: {}".format(response.sentiment))
    print("Overall scores: positive={0:.2f}; neutral={1:.2f}; negative={2:.2f} \n".format(
        response.confidence_scores.positive,
        response.confidence_scores.neutral,
        response.confidence_scores.negative,
    ))
    for idx, sentence in enumerate(response.sentences):
        print("Sentence: {}".format(sentence.text))
        print("Sentence {} sentiment: {}".format(idx+1, sentence.sentiment))
        print("Sentence score:\nPositive={0:.2f}\nNeutral={1:.2f}\nNegative={2:.2f}\n".format(
            sentence.confidence_scores.positive,
            sentence.confidence_scores.neutral,
            sentence.confidence_scores.negative,
        ))

sentiment_analysis_example(client)
I want to try a more efficient approach rather than copy-pasting individual sentences into the **documents** list. I have tried creating a pandas data frame via:
import pandas as pd
df = pd.read_csv('/Users/../Desktop/trial-sun.csv', sep=';')
However, when I then reference this data frame in **documents**, I get an error saying:
"TypeError: Mixing string and dictionary/object document input unsupported."
My guess is that what I'm passing in there needs to be in a different format, but I'm not sure how to go about it.
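From the error message, I gather that **documents** must be either a plain list of strings or a list of dict-style documents (with id, language and text keys), but not a mixture of the two. Something along these lines is what I'm aiming for; this is only a sketch, and the column name quote is a placeholder for whatever the column in my CSV is actually called:

import pandas as pd

df = pd.read_csv('/Users/../Desktop/trial-sun.csv', sep=';')

# Option 1: a plain list of strings
documents = df['quote'].astype(str).tolist()

# Option 2: a list of dict-style documents (id/language/text); never mix the two styles
documents = [
    {"id": str(i), "language": "en", "text": text}
    for i, text in enumerate(df['quote'].astype(str))
]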
The following code worked for me:
import pandas as pd
import requests
subscription_key = "<>"
headers = {"Ocp-Apim-Subscription-Key": subscription_key}
# Base endpoint without a trailing slash, since the path below starts with one
endpoint = "https://<>.cognitiveservices.azure.com"
sentiment_url = endpoint + "/text/analytics/v3.0/sentiment"

def comment_sentiment(comment=None, cid=None):
    """
    Take a single comment as a string and analyze its sentiment.

    Args:
        comment -- The text content to analyze.
        cid -- The numeric id of the comment analyzed.
    """
    language = "en"
    try:
        document = {"id": cid, "language": language, "text": comment}
        body = {"documents": [document]}
        res = requests.post(sentiment_url, headers=headers, json=body)
        data = res.json()
        # Return the parsed sentiment response
        return data
    except Exception as e:
        print("Request failed: {}".format(e))

def comment_summary(sentimentResult):
    """
    Take the response data from the comment_sentiment function and summarize the result.

    Args:
        sentimentResult -- The response data to summarize.
    """
    summary = {"Id": 0, "Sentiment": "",
               "Positive": 0, "Neutral": 0, "Negative": 0}
    for document in sentimentResult['documents']:
        summary["Sentiment"] = document['sentiment'].capitalize()
        summary["Id"] = document['id']
        # Count how many sentences fall into each sentiment class
        for each in document['sentences']:
            sentimentscore = each['sentiment']
            if sentimentscore == 'positive':
                summary["Positive"] += 1
            elif sentimentscore == 'negative':
                summary["Negative"] += 1
            else:
                summary["Neutral"] += 1
    return summary

def main(comment_df):
    """
    Take the data frame, get the sentiments and save the result to a CSV file.

    Args:
        comment_df -- Data frame containing the text to analyze.

    Returns:
        A data frame consisting of the relevant columns
        'id', 'sentiment', 'positive', 'negative', 'neutral'.
    """
    df2 = comment_df
    # Drop any existing index and use a new one
    df2.reset_index(drop=True, inplace=True)
    print(u"Processing records in data frame....")
    for i, row in df2.iterrows():
        # print(u"Processing Record... #{}".format(i+1))
        text_data = df2.loc[i, "comment"].encode(
            "utf-8").decode("ascii", "ignore")
        sentimentResult = comment_sentiment(text_data, i+1)
        sentimentSummary = comment_summary(sentimentResult)
        # Add result to data frame
        df2.loc[i, "id"] = i+1
        df2.loc[i, "sentiment"] = sentimentSummary['Sentiment']
        df2.loc[i, "positive"] = sentimentSummary['Positive']
        df2.loc[i, "negative"] = sentimentSummary['Negative']
        df2.loc[i, "neutral"] = sentimentSummary['Neutral']
    dfx = df2[['id', 'sentiment', 'positive', 'negative', 'neutral']]
    print(u"Processing completed....")
    # Ensure that numbers are represented as integers and not float
    convert_dict = {'id': int,
                    'positive': int,
                    'negative': int,
                    'neutral': int,
                    'sentiment': str
                    }
    dfx = dfx.astype(convert_dict)
    return dfx

if __name__ == "__main__":
    # Read comment data from CSV
    commentData = pd.read_csv(
        "https://raw.githubusercontent.com/JimXu199545/data/main/comment.csv",
        header=0, names=["comment"])
    commentData['nwords'] = commentData.comment.apply(lambda x: len(x.split()))
    commentData['hashed'] = commentData.comment.apply(
        lambda x: hash("".join(x.split())))
    # Remove duplicated records but keep the first occurrence of each record
    commentData.drop_duplicates(keep='first', inplace=True)
    # Reindex the data frame to prevent gaps in the indexes
    commentData.reset_index(drop=True, inplace=True)
    df = main(commentData)
    df.to_csv('d:\\result.csv', index=False, header=True)
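If you prefer to stay with the azure-ai-textanalytics SDK from the question instead of calling the REST endpoint directly, the same idea works by sending the data frame column as small batches of plain strings. This is only a sketch under a few assumptions: the text column is named comment, the key and endpoint placeholders are filled in, and the batch size of 10 reflects the per-request document limit of the sentiment endpoint:

import pandas as pd
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

client = TextAnalyticsClient(
    endpoint="https://<>.cognitiveservices.azure.com/",
    credential=AzureKeyCredential("<>"))

df = pd.read_csv("comment.csv")             # adjust the path and separator to your file
texts = df["comment"].astype(str).tolist()  # adjust the column name to your file

rows = []
batch_size = 10  # the sentiment endpoint accepts at most 10 documents per request
for start in range(0, len(texts), batch_size):
    batch = texts[start:start + batch_size]
    # Results come back in the same order as the input documents
    for text, doc in zip(batch, client.analyze_sentiment(documents=batch)):
        if doc.is_error:
            continue
        rows.append({
            "comment": text,
            "sentiment": doc.sentiment,
            "positive": doc.confidence_scores.positive,
            "neutral": doc.confidence_scores.neutral,
            "negative": doc.confidence_scores.negative,
        })

pd.DataFrame(rows).to_csv("sentiment_results.csv", index=False)

Batching keeps the number of HTTP round trips down compared to one request per quote, which is where most of the time goes on larger files.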