查找最接近质心的列 - Pandas

Question

import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Toy data set: one row per state with its adult, senior and children counts.
column_names = ["State", "Adult", "Senior", "Children"]
state_rows = [
    ["California", 111, 2, 6],
    ["Texas", 70, 2, 4],
    ["Florida", 64, 4, 5],
    ["Georgia", 25, 2, 3],
    ["Alaska", 90, 1, 2],
    ["Hawaii", 105, 2, 1],
    ["Washington", 27, 3, 2],
    ["Pennsylvania", 90, 2, 1],
    ["Virginia", 63, 2, 3],
    ["Arizona", 34, 2, 4],
    ["Michigan", 22, 5, 2],
]

# Start from an empty frame and add the rows one at a time under labels 0..10.
df = pd.DataFrame(columns=column_names)
for label, row in enumerate(state_rows):
    df.loc[label] = row

# Split the states into four clusters using the three count columns and
# store each row's cluster label next to its data.
kmeans = KMeans(n_clusters=4)
y = kmeans.fit_predict(df[["Adult", "Senior", "Children"]])
df["Cluster"] = y
centers = kmeans.cluster_centers_

# Plot Adult against Senior coloured by cluster label, then overlay the first
# two coordinates of every centroid (the Adult and Senior axes) as black stars.
plt.scatter(df["Adult"], df["Senior"], c=df["Cluster"])
plt.scatter(
    centers[:, 0],
    centers[:, 1],
    color="black",
    marker="*",
    label="centroid",
)
plt.show()

For the KMeans analysis broken out by state above, I would like to extract/identify the elements of each cluster that are closest to that cluster's centroid.对于上面按州划分的 KMeans 分析，我想从每个聚类中提取/识别最接近该聚类质心的元素。

Answer 1

Basically: the KMeans implementation is based on Euclidean distance.基本上：KMeans 实现基于欧几里得距离。 To get the two closest points to each centroid we can look at the set of points that belong to each cluster, take the 2-norm of the difference between each point and the relevant centroid, and return the two closest points:为了获得离每个质心最近的两个点，我们可以查看属于每个聚类的点集，取每个点与相关质心之差的 2-范数，并返回最近的两个点：

def get_2_closest(cluster_id, df, columns, centers):
    """Return the two rows of one cluster that lie nearest to its centroid.

    Args:
        cluster_id: Value matched against ``df["Cluster"]``; also used to
            index ``centers``.
        df: Frame holding a ``"Cluster"`` column plus the feature columns.
        columns: List of feature column names to measure distance over.
        centers: Indexable collection of centroids, one per cluster id.

    Returns:
        The selected feature columns for (at most) the two member rows with
        the smallest Euclidean distance to the centroid, nearest first, with
        their original index labels.
    """
    members = df.loc[df["Cluster"] == cluster_id, columns]
    # Cast to float so the subtraction works even on object-dtype columns.
    offsets = members.to_numpy(dtype=np.float64) - centers[cluster_id]
    distances = np.linalg.norm(offsets, axis=1)
    ranking = np.argsort(distances)
    return members.iloc[ranking[:2]]

Full example in context:上下文中的完整示例：

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Toy data set: one row per state with its adult, senior and children counts.
column_names = ["State", "Adult", "Senior", "Children"]
state_rows = [
    ["California", 111, 2, 6],
    ["Texas", 70, 2, 4],
    ["Florida", 64, 4, 5],
    ["Georgia", 25, 2, 3],
    ["Alaska", 90, 1, 2],
    ["Hawaii", 105, 2, 1],
    ["Washington", 27, 3, 2],
    ["Pennsylvania", 90, 2, 1],
    ["Virginia", 63, 2, 3],
    ["Arizona", 34, 2, 4],
    ["Michigan", 22, 5, 2],
]

# Start from an empty frame and add the rows one at a time under labels 0..10.
df = pd.DataFrame(columns=column_names)
for label, row in enumerate(state_rows):
    df.loc[label] = row

# Split the states into four clusters using the three count columns and
# store each row's cluster label next to its data.
kmeans = KMeans(n_clusters=4)
y = kmeans.fit_predict(df[["Adult", "Senior", "Children"]])
df["Cluster"] = y
centers = kmeans.cluster_centers_


def get_2_closest(cluster_id, df, columns, centers):
    """Return the two rows of one cluster that lie nearest to its centroid.

    Args:
        cluster_id: Value matched against ``df["Cluster"]``; also used to
            index ``centers``.
        df: Frame holding a ``"Cluster"`` column plus the feature columns.
        columns: List of feature column names to measure distance over.
        centers: Indexable collection of centroids, one per cluster id.

    Returns:
        The selected feature columns for (at most) the two member rows with
        the smallest Euclidean distance to the centroid, nearest first, with
        their original index labels.
    """
    members = df.loc[df["Cluster"] == cluster_id, columns]
    # Cast to float so the subtraction works even on object-dtype columns.
    offsets = members.to_numpy(dtype=np.float64) - centers[cluster_id]
    distances = np.linalg.norm(offsets, axis=1)
    ranking = np.argsort(distances)
    return members.iloc[ranking[:2]]


# Gather the two rows nearest to each centroid into one frame.
# ``DataFrame.append`` was removed in pandas 2.0, so the per-cluster results
# are combined with a single ``pd.concat`` call instead of being appended one
# by one to an empty frame. Each row keeps its original index label, which is
# what later allows the State column to be looked up again.
_closest = pd.concat(
    [
        get_2_closest(cluster_id, df, ["Adult", "Senior", "Children"], centers)
        for cluster_id in range(len(centers))
    ]
)

# Draw all states, highlight the selected rows on top of them, and mark the
# first two coordinates of every centroid (the Adult and Senior axes).
plt.scatter(df.Adult, df.Senior, label="Original")
plt.scatter(_closest.Adult, _closest.Senior, label="2 Closest to Centroid")
plt.scatter(centers[:, 0], centers[:, 1], color="black", marker="*", label="centroid")
plt.legend()
plt.show()

Expected output:预期输出：

Question raised in comment: you can get the State column back by merging the two data frames:评论中提出的问题：您可以通过合并两个数据框来获取State列：

# Join the selected rows back onto the full frame by index label so the
# State column, which was not part of the feature columns, is available again.
with_states = _closest.merge(df, left_index=True, right_index=True)
print(with_states["State"])

Output:输出：

4          Alaska
7    Pennsylvania
6      Washington
3         Georgia
2         Florida
8        Virginia
0      California
5          Hawaii

查找最接近质心的列 - Pandas

问题描述

1 个解决方案

解决方案1
1 已采纳 2021-04-19 19:04:44

查找最接近质心的列 - Pandas

问题描述

1 个解决方案

解决方案1 1 已采纳 2021-04-19 19:04:44

解决方案1
1 已采纳 2021-04-19 19:04:44