简体   繁体   中英

Cosine similarity between the same dictionary's values

I have this dict called queries :

{'q1': ['similar',
  'law',
  'must',
  'obey',
  'construct',
  'aeroelast',
  'model',
  'heat',
  'high',
  'speed',
  'aircraft'],
 'q2': ['structur',
  'aeroelast',
  'problem',
  'associ',
  'flight',
  'high',
  'speed',
  'aircraft'],
 'q3': ['problem', 'heat', 'conduct', 'composit', 'slab', 'solv', 'far']
...
}

And have used this code to transform it into a dict of vectorized arrays:

class RetrievalSystem:
    """TF-IDF + truncated-SVD index over a collection of tokenized documents.

    Query vectors produced by ``retrieve_n_rank_docs`` are cached in
    ``self.q_vecs``; the reduced document matrix is kept in ``self.doc_vecs``.
    """

    def __init__(self, docs, num_concepts, min_df=1, alpha=1.0, beta=0.75, gamma=0.15):
        """Fit the vectorizer and the SVD projection on ``docs``.

        ``docs`` is indexed by the keys it iterates over, and each value is
        joined with single spaces to form one text per document.
        ``alpha``, ``beta`` and ``gamma`` are accepted but not used here.
        """
        self.vec = TfidfVectorizer(tokenizer=str.split, min_df=min_df)
        # One whitespace-joined string per document, in iteration order of `docs`.
        corpus = [" ".join(docs[key]) for key in docs]
        tfidf = self.vec.fit_transform(corpus)

        self.q_vecs = {}  # query id -> reduced query vector

        self.svd = TruncatedSVD(n_components=num_concepts, random_state=42)
        self.doc_vecs = self.svd.fit_transform(tfidf)

    def retrieve_n_rank_docs(self, queries, max_docs=-1):
        """Project every query into the SVD space and cache the result.

        A query id that is already cached keeps its existing vector.
        ``max_docs`` is accepted but not used, and nothing is returned.
        """
        for query_id in queries:
            text = " ".join(queries[query_id])
            reduced = self.svd.transform(self.vec.transform([text]))
            # setdefault only stores the vector when the id is not cached yet.
            self.q_vecs.setdefault(query_id, reduced)

The max_docs argument governs the maximum number of documents to be returned for each query. And now self.q_vecs looks like this:

{'q217': array([[ 0.16555858,  0.12041974,  0.10034606,  0.03249144,  0.00843294,
         0.16582048, -0.20520625, -0.05597786, -0.12666519, -0.10517737,
         0.14363559, -0.01525909, -0.16574115, -0.04112081, -0.1374631 ,
         0.05047798,  0.05825697, -0.01779095, -0.05663042, -0.14333234,
        -0.09671375, -0.02205753,  0.03309577, -0.04512224, -0.01605542,
         0.00762974,  0.02407301,  0.00426722,  0.00654344,  0.08085963,
         0.08657383, -0.09913353,  0.01492773, -0.06813004, -0.01151318,
        -0.08565942,  0.03826287, -0.00330817,  0.13141591,  0.04920131,
        -0.08375895,  0.09465868, -0.03466024,  0.01838176, -0.00336209,
         0.02372735, -0.03390722,  0.0440413 ,  0.00371048,  0.09835254,
        -0.01099799,  0.0014484 ,  0.06276236,  0.04311937, -0.0867389 ,
         0.00850617,  0.00496759, -0.17198825,  0.07988587,  0.05727097,
         0.13304752,  0.08784825, -0.06141824, -0.01383098, -0.02348199,
        -0.04522944,  0.05257815,  0.08263177, -0.01140021, -0.05829286,
        -0.04885191,  0.09377792,  0.0190092 ,  0.00947696,  0.05598195,
        -0.03815088, -0.02834209,  0.0281708 , -0.02843137, -0.03210851,
         0.04751607, -0.01162277,  0.02034976, -0.02088302,  0.07665635,
         0.0195319 , -0.0157795 ,  0.01210985, -0.03183579,  0.01161029,
         0.02409737, -0.01007874,  0.10754846,  0.01010833, -0.05662593,
        -0.01729383, -0.03097083,  0.03369774,  0.00572065,  0.02632313]]), 'q99': array([[ 0.10287323, -0.01085065, -0.00967409, -0.04218846,  0.09239141,
         0.07992809, -0.00359886, -0.03796564,  0.01250241,  0.01951022,
        -0.03673524, -0.02372439, -0.03240905, -0.03081271,  0.02817431,
         0.12468386, -0.02051108,  0.12191644,  0.00624408, -0.05094331,
         0.09598166, -0.02341246, -0.0020474 , -0.05629724,  0.03516377,
         0.09028871,  0.02806492, -0.02300581, -0.02998558, -0.00270938,
         0.01611941,  0.04106955,  0.05371339, -0.02561045, -0.01916819,
         0.08158927, -0.03353019, -0.01020131, -0.03670832,  0.02845091,
         0.07133292, -0.0944471 , -0.00662414,  0.0920997 , -0.00206586,
         0.07063442, -0.00814919, -0.00374118, -0.01353651,  0.07968094,
         0.00796783, -0.01397921, -0.07712498, -0.00308536,  0.07785687,
        -0.01220938, -0.06646712,  0.04048088,  0.01321445,  0.00041508,
        -0.04644943,  0.09307773,  0.0188646 , -0.03233048, -0.04803833,
        -0.06355723, -0.00560934, -0.05478746,  0.03196071,  0.08420215,
        -0.07706163, -0.12595219, -0.01330823, -0.00079499, -0.02515943,
         0.00087481, -0.00596035,  0.01680558,  0.0138655 , -0.01290259,
        -0.0497661 , -0.04627047, -0.00239779, -0.06377815, -0.01103349,
         0.00205314, -0.0774958 ,  0.00223332, -0.00976858,  0.02365778,
         0.02600081,  0.01212485,  0.03451618,  0.00642054, -0.00025119,
         0.00898667,  0.00749051,  0.02099796, -0.00906813, -0.06770008]])
...
}

I'd like to take the cosine similarity between the vector representations, and then sort the documents these queries came from in descending order of the cosine similarity. So the desired output would look something like this:

{
    'q217': ['d983', 'd554', ..., 'd623'],
    'q99' : ['d716', 'd67', ..., 'd164'],
    ...
}

I have written this code to try to output the cosine similarity, but it only returns one key-value pair:

class RetrievalSystem:
    """Retrieval helper: TF-IDF features reduced with truncated SVD.

    NOTE(review): as written, ``retrieve_n_rank_docs`` only compares query
    vectors with one another; ``self.doc_vecs`` is never read after
    ``__init__`` and the method returns nothing.
    """
    def __init__(self, docs, num_concepts, min_df=1, alpha=1.0, beta=0.75, gamma=0.15):
        """Fit the TF-IDF vectorizer and the SVD projection on ``docs``.

        docs: indexed by the keys it iterates over; each value is joined
            with single spaces to form one text per document.
        num_concepts: number of SVD components.
        min_df: forwarded to ``TfidfVectorizer``.
        alpha, beta, gamma: stored on the instance; not read anywhere else
            in this class.
        """
        self.alpha, self.beta, self.gamma = alpha, beta, gamma
        
        # create a doc-term matrix out of our doc collection
        self.vec = TfidfVectorizer(tokenizer=str.split, min_df=min_df)
        doc_term_mat = self.vec.fit_transform([" ".join(docs[doc_id]) for doc_id in docs])
        self.q_vecs = dict() # query vectors
        
        self.svd = TruncatedSVD(n_components = num_concepts, random_state = 42)
        
        # SVD-reduced representation of the document-term matrix.
        self.doc_vecs = self.svd.fit_transform(doc_term_mat)
        # YOUR CODE HERE
        #raise NotImplementedError()

    def retrieve_n_rank_docs(self, queries, max_docs=-1):
        """Project each query into the SVD space and cache it in ``self.q_vecs``.

        queries: indexed by the keys it iterates over; each value is joined
            with single spaces before vectorizing.
        max_docs: accepted but never read in this method.

        NOTE(review): there is no ``return`` statement, so callers get ``None``.
        """
       
        for query in queries:
            # Vectorize the space-joined tokens, then project with the fitted SVD.
            s = self.vec.transform([" ".join(queries[query])])
            s = self.svd.transform(s)
            # Only the first vector computed for a given query id is kept.
            if query not in self.q_vecs.keys():
                self.q_vecs[query] = s
        
            # NOTE(review): the lines below are still inside the ``for query``
            # loop, so ``new_d`` is recreated empty on every iteration.
            all_keys = list(self.q_vecs.keys())
            new_d = {}
        
            # Visit every unordered pair (i < j) of cached query ids.
            for i in range(len(all_keys)):
                for j in range(i+1,len(all_keys)):
                    # NOTE(review): the key is always the current ``query``, not the
                    # pair being compared, so each assignment overwrites the previous
                    # one and at most one entry survives. The braces also wrap the
                    # similarity in a one-element set.
                    # NOTE(review): the cached vectors appear to be single-row 2-D
                    # arrays; confirm ``spatial.distance.cosine`` accepts that shape.
                    new_d[query] = {1 - spatial.distance.cosine(self.q_vecs[all_keys[i]], self.q_vecs[all_keys[j]])}

Because the code is not a minimal reproducible example, I can't fully help. But to create a dictionary of the cosine similarities for every combination of keys, you can do:

import itertools

import numpy as np
from scipy.spatial import distance


# Toy query vectors keyed by query id.
q_vecs = {
    "q1": np.array([0, 1]),
    "q2": np.array([1, 0]),
    "q3": np.array([0, 2]),
    "q4": np.array([10, 10]),
}

# Cosine similarity for every unordered pair of keys.
# `distance.cosine` returns a distance (1 - similarity), hence the subtraction;
# float() turns the NumPy scalar into a plain Python float so the dict prints cleanly.
new_d = {
    (k1, k2): float(1 - distance.cosine(q_vecs[k1], q_vecs[k2]))
    for k1, k2 in itertools.combinations(q_vecs, 2)
}

Which will give you (values rounded to four decimals):

{
    ('q1', 'q2'): 0.0,
    ('q1', 'q3'): 1.0,
    ('q1', 'q4'): 0.7071,
    ('q2', 'q3'): 0.0,
    ('q2', 'q4'): 0.7071,
    ('q3', 'q4'): 0.7071
}

I hope this is what you are after, as I don't understand how you generate the strings for:

{
    'q217': ['d983', 'd554', ..., 'd623'],
    'q99' : ['d716', 'd67', ..., 'd164'],
    ...
}

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM