I am trying to write code that detects fake news. Unfortunately, I keep getting the same error message. Could someone please explain where I've gone wrong? I took some lines of code from https://data-flair.training/blogs/advanced-python-project-detecting-fake-news/ and some from https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk . When I tried to combine the two scripts (removing the duplicated lines), I received an error message.
THE CODE
%matplotlib inline
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import itertools
import json
import csv
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
df = pd.read_csv(r"C:\Users\johnrambo\Downloads\fake_news(1).csv", sep=',', header=0, engine='python', escapechar='\\')
X_train, X_test, y_train, y_test = train_test_split(df['headline'], is_sarcastic_1, test_size = 0.2, random_state = 7)
clf = MultinomialNB().fit(X_train, y_train)
predicted = clf.predict(X_test)
print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, predicted))
THE ERROR
ValueError Traceback (most recent call last)
<ipython-input-8-e1f11a702626> in <module>
21 X_train, X_test, y_train, y_test = train_test_split(df['headline'], is_sarcastic_1, test_size = 0.2, random_state = 7)
22
---> 23 clf = MultinomialNB().fit(X_train, y_train)
24
25 predicted = clf.predict(X_test)
~\Anaconda\lib\site-packages\sklearn\naive_bayes.py in fit(self, X, y, sample_weight)
586 self : object
587 """
--> 588 X, y = check_X_y(X, y, 'csr')
589 _, n_features = X.shape
590
~\Anaconda\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
717 ensure_min_features=ensure_min_features,
718 warn_on_dtype=warn_on_dtype,
--> 719 estimator=estimator)
720 if multi_output:
721 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
~\Anaconda\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
494 try:
495 warnings.simplefilter('error', ComplexWarning)
--> 496 array = np.asarray(array, dtype=dtype, order=order)
497 except ComplexWarning:
498 raise ValueError("Complex data not supported\n"
~\Anaconda\lib\site-packages\numpy\core\numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
~\Anaconda\lib\site-packages\pandas\core\series.py in __array__(self, dtype)
946 warnings.warn(msg, FutureWarning, stacklevel=3)
947 dtype = "M8[ns]"
--> 948 return np.asarray(self.array, dtype)
949
950 # ----------------------------------------------------------------------
~\Anaconda\lib\site-packages\numpy\core\numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
~\Anaconda\lib\site-packages\pandas\core\arrays\numpy_.py in __array__(self, dtype)
164
165 def __array__(self, dtype=None):
--> 166 return np.asarray(self._ndarray, dtype=dtype)
167
168 _HANDLED_TYPES = (np.ndarray, numbers.Number)
~\Anaconda\lib\site-packages\numpy\core\numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: could not convert string to float: 'experts caution new car loses 90% of value as soon as you drive it off cliff'
FIRST FEW LINES OF DATA
This is what I get when I input df.head().to_dict() :
{'is_sarcastic': {0: 1, 1: 0, 2: 0, 3: 1, 4: 1}, 'headline': {0: 'thirtysomething scientists unveil doomsday clock of hair loss', 1: 'dem rep. totally nails why congress is falling short on gender, racial equality', 2: 'eat your veggies: 9 deliciously different recipes', 3: 'inclement weather prevents liar from getting to work', 4: "mother comes pretty close to using word 'streaming' correctly"}, 'article_link': {0: ' https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205 ', 1: ' https://www.huffingtonpost.com/entry/donna-edwards-inequality_us_57455f7fe4b055bb1170b207 ', 2: ' https://www.huffingtonpost.com/entry/eat-your-veggies-9-delici_b_8899742.html ', 3: ' https://local.theonion.com/inclement-weather-prevents-liar-from-getting-to-work-1819576031 ', 4: ' https://www.theonion.com/mother-comes-pretty-close-to-using-word-streaming-cor-1819575546 '}}
You have raw text data in the df['headline'] column. Before passing it to a machine-learning model, you need to convert the text into a numeric representation (for example, a TF-IDF or count matrix).
You might want to refer to sklearn's CountVectorizer and TfidfTransformer documentation for the standard way to do this.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.