[英]String Matching Using TF-IDF, NGrams and Cosine Similarity in Python
我正在做我的第一個大型數據科學項目。 我想把來自一個數據源的大量名稱，與另一個數據源中已清理過的名稱字典進行匹配。 我以這篇關於字符串匹配的博客文章作為指南。
我想把這個方法用在兩個不同的數據集上。 不幸的是，我得不到好的結果，我認為是我沒有正確地套用它。
代碼:
import pandas as pd, numpy as np, re, sparse_dot_topn.sparse_dot_topn as ct
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
# Toy input: noisy names to be matched (the same seven entries appear twice)
# and the clean reference names they should be matched against.
df_dirty = {
    "name": [
        "gogle", "bing", "amazn", "facebook", "fcbook", "abbasasdfzz", "zsdfzl",
        "gogle", "bing", "amazn", "facebook", "fcbook", "abbasasdfzz", "zsdfzl",
    ],
}
df_clean = {
    "name": ["google", "bing", "amazon", "facebook"],
}

# Show both lists, one per line.
print(df_dirty["name"], df_clean["name"], sep="\n")
def ngrams(string, n=3):
    """Split a string into overlapping, upper-cased character n-grams.

    Before splitting, the characters ``,``, ``-``, ``.`` and ``/`` are
    removed, as is any whitespace character immediately followed by ``BD``.

    Args:
        string: Text to tokenize.
        n: Length of each n-gram.

    Returns:
        A list of the consecutive ``n``-character substrings of the cleaned
        text; empty when the cleaned text is shorter than ``n``.
    """
    cleaned = re.sub(r'[,-./]|\sBD', r'', string).upper()
    # One copy of the text per offset 0..n-1; zipping them lines up the
    # characters of every window and stops at the shortest copy.
    shifted = (cleaned[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted)))
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Compute a sparse product of ``A`` and ``B`` via ``ct.sparse_dot_topn``.

    Args:
        A: Sparse matrix; its row count is the row count of the result.
        B: Sparse matrix; its column count is the column count of the result.
        ntop: Forwarded to the extension; the output buffers are sized for
            ``ntop`` stored entries per row of ``A``.
        lower_bound: Forwarded to the extension unchanged.
            NOTE(review): presumably the minimum value an entry must exceed
            to be kept -- confirm against the sparse_dot_topn documentation.

    Returns:
        A ``csr_matrix`` of shape ``(A.shape[0], B.shape[1])`` built from
        the buffers the extension filled in.
    """
    # Coerce both operands to CSR; this is a no-op when they already are.
    A = A.tocsr()
    B = B.tocsr()

    n_rows = A.shape[0]
    n_cols = B.shape[1]
    index_type = np.int32

    # Output buffers, written in place by the extension call below.
    capacity = n_rows * ntop
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=A.dtype)

    # The extension receives the index arrays as int32.
    a_indptr = np.asarray(A.indptr, dtype=index_type)
    a_indices = np.asarray(A.indices, dtype=index_type)
    b_indptr = np.asarray(B.indptr, dtype=index_type)
    b_indices = np.asarray(B.indices, dtype=index_type)

    ct.sparse_dot_topn(
        n_rows, n_cols,
        a_indptr, a_indices, A.data,
        b_indptr, b_indices, B.data,
        ntop,
        lower_bound,
        out_indptr, out_indices, out_data)

    return csr_matrix((out_data, out_indices, out_indptr),
                      shape=(n_rows, n_cols))
def get_matches_df(sparse_matrix, name_vector, top=5):
    """List the non-zero cells of a similarity matrix as pairs of names.

    Args:
        sparse_matrix: SciPy sparse matrix of similarity scores. Both its
            row and its column indices are looked up in ``name_vector``.
        name_vector: Indexable collection of names.
        top: Maximum number of matches to return. A falsy value (``0`` or
            ``None``) returns every non-zero cell. Values larger than the
            number of non-zero cells are clamped instead of raising
            ``IndexError``.

    Returns:
        A ``pandas.DataFrame`` with columns ``left_side``, ``right_side``
        and ``similairity`` (spelling kept, since callers select the column
        by this name), one row per reported match.
    """
    # Read rows, columns and values from one COO view so they stay aligned.
    # Indexing ``sparse_matrix.data`` by position in ``nonzero()`` pairs a
    # cell with the wrong value whenever the matrix stores explicit zeros,
    # because ``nonzero()`` skips them while ``data`` does not.
    coo = sparse_matrix.tocoo()
    keep = coo.data != 0
    sparserows = coo.row[keep]
    sparsecols = coo.col[keep]
    values = coo.data[keep]

    # Diagnostic output kept from the original implementation.
    if top:
        print(top)
        # Never report more rows than there are matches.
        nr_matches = min(top, sparsecols.size)
    else:
        print(sparsecols.size)
        nr_matches = sparsecols.size

    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)

    for index in range(nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = values[index]

    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similairity': similairity})
import time

# Build TF-IDF vectors over character n-grams of the clean names.
company_names = df_clean['name']
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

# Compare the clean names against themselves and time the call.
t1 = time.time()
matches = awesome_cossim_top(
    tf_idf_matrix,
    tf_idf_matrix.transpose(),
    ntop=4,
    lower_bound=0.8,
)
t = time.time() - t1
print("SELFTIMED:", t)

matches_df = get_matches_df(matches, company_names, top=4)
# Remove all exact matches
matches_df = matches_df[matches_df['similairity'] < 0.99999]

# Print the whole frame without pandas truncating rows or columns.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None):
    print(matches_df)
預期結果如下:
您可以直接從 sparse_dot_topn 庫中導入 awesome_cossim_top 函數（在該庫中的名稱為 awesome_cossim_topn），而不必自行定義。
將函數 get_matches_df 更改為:
def get_matches_df(sparse_matrix, A, B, top=100):
    """List the non-zero cells of a similarity matrix as pairs of names.

    Args:
        sparse_matrix: SciPy sparse matrix of similarity scores. Row
            indices are looked up in ``A`` and column indices in ``B``.
        A: Indexable collection of names for the rows.
        B: Indexable collection of names for the columns.
        top: Maximum number of matches to return. A falsy value (``0`` or
            ``None``) returns every non-zero cell. Values larger than the
            number of non-zero cells are clamped instead of raising
            ``IndexError``.

    Returns:
        A ``pandas.DataFrame`` with columns ``left_side`` (from ``A``),
        ``right_side`` (from ``B``) and ``similairity`` (spelling kept so
        existing consumers of the column keep working).
    """
    # Read rows, columns and values from one COO view so they stay aligned.
    # Indexing ``sparse_matrix.data`` by position in ``nonzero()`` pairs a
    # cell with the wrong value whenever the matrix stores explicit zeros,
    # because ``nonzero()`` skips them while ``data`` does not.
    coo = sparse_matrix.tocoo()
    keep = coo.data != 0
    sparserows = coo.row[keep]
    sparsecols = coo.col[keep]
    values = coo.data[keep]

    # Never report more rows than there are matches.
    nr_matches = min(top, sparsecols.size) if top else sparsecols.size

    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)

    for index in range(nr_matches):
        left_side[index] = A[sparserows[index]]
        right_side[index] = B[sparsecols[index]]
        similairity[index] = values[index]

    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similairity': similairity})
現在你可以執行你的代碼如下:
# ``time`` is used for the timing below; import it here so this snippet
# does not depend on an earlier script having imported it.
import time

df_dirty = {"name": ["gogle", "bing", "amazn", "facebook", "fcbook", "abbasasdfzz", "zsdfzl"]}
df_clean = {"name": ["google", "bing", "amazon", "facebook"]}
print(df_dirty["name"])
print(df_clean["name"])

# Fit the vocabulary on the clean names only, then project the dirty names
# into that same feature space so the two matrices are comparable.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix_clean = vectorizer.fit_transform(df_clean['name'])
tf_idf_matrix_dirty = vectorizer.transform(df_dirty['name'])

# Rows are dirty names, columns are clean names; ntop=1 and lower_bound=0
# are passed through to awesome_cossim_top.
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix_dirty, tf_idf_matrix_clean.transpose(), 1, 0)
t = time.time() - t1
print("SELFTIMED:", t)

# top=0 is falsy, so every non-zero match is reported.
matches_df = get_matches_df(matches, df_dirty['name'], df_clean['name'], top=0)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(matches_df)
基本上，您找到的示例是在同一個數組內部找出重複項，而您需要的是在兩個數據源之間進行匹配，而不是只使用一個數據源。
希望能幫助到你!
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.