简体   繁体   中英

Extract the images of each cluster separately after k-means clustering in Python

I have run K-means clustering over a dataset of images, which gave me 5 clusters. Now I want to extract the images from each cluster and save them separately, but I have no idea how to do that. I have tried the code below, but I am not able to access the images.

here is my code

import matplotlib.pyplot
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.externals import joblib
import numpy as np
import cv2 
import sys
import pickle
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import os 
from skimage.feature import local_binary_pattern 
# To calculate a normalized histogram
from scipy.stats import itemfreq 
from sklearn.preprocessing import normalize 
import cvutils 
import csv 
import numpy 
from matplotlib.pyplot import imshow
from PIL import Image
import time
from sklearn.cluster import KMeans

# Wall-clock start; read again at the very end of the script to report the
# total execution time.
start_time=time.time()
  ############################################################################################
# Only referenced inside the disabled draft further down.
dir_unknown = 'UntitledFolder'
# Directory handed to cvutils.imlist() to enumerate the input images.
# NOTE(review): the literal contains a long run of spaces before
# "/UntitledFolder" - looks like a paste artifact; confirm the real path.
trainingSet='/home/irum/Desktop/Face-Recognition/thakarrecog            /UntitledFolder/UntitledFolder1'
# Label file; the same path is spelled out again where the file is opened.
imageLabels='/home/irum/Desktop/Face-Recognition/thakarrecog/class_train'
# Output directory used by the clustering step at the end of the script.
path='/home/irum/Desktop/Face-Recognition/thakarrecog/Clusters'
#Create CSV File
# images_names and SEPARATOR are only used by the disabled draft below and by
# a commented-out line later in the script.
images_names = []
SEPARATOR=" "
print"start"
# The triple-quoted text below is a bare string literal, so none of it runs.
# It holds a disabled draft of a CSV-writing loop; `label` is never
# initialised inside it and its indentation is inconsistent.
'''
for (dirname, dirnames, filenames) in os.walk(dir_unknown):
    for subdirname in dirnames:
        subject_path = os.path.join(dirname, subdirname)
        for filename in os.listdir(subject_path):
        abs_path = "%s/%s" % (subject_path, filename)

        #csv_path = "%s%s%d" % (abs_path, SEPARATOR, label)
        #print "%s%s%d" % (abs_path, SEPARATOR, label)
        images_names.append("%s%s%d" % (abs_path, SEPARATOR, label))
        #print images_names 
        with open('class_train1', 'w') as myfile:
               wr = csv.writer(myfile,delimiter=' ', doublequote=False , quotechar=None, lineterminator='\r\n', skipinitialspace=True)
           wr.writerow(imageLabels)
        label = label + 1
'''
# Store the path of training images in train_images
# (presumably cvutils.imlist returns a list of image file paths found in the
# given directory - confirm against the cvutils package; len() and iteration
# are the only operations the script performs on the result).
train_images = cvutils.imlist(trainingSet)
print "Total Images",len(train_images)

# Dictionary containing image paths as keys and corresponding label as value.
# Each space-delimited row contributes row[0] -> row[1]; a row with fewer than
# two fields raises IndexError.
# NOTE(review): the path literal repeats the value of `imageLabels`, and
# train_dic is not read by any active line in the visible script.
train_dic = {}
# 'rb' is the Python 2 convention for files passed to csv.reader.
with open('/home/irum/Desktop/Face-Recognition/thakarrecog/class_train', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=' ')
    for row in reader:
        train_dic[row[0]] = row[1]

# List for storing the LBP Histograms, address of images and the corresponding label 
X_test = []
X_name = []
y_test = []

print"Calculating LBP Histograms"
h1 = time.time()
# For each image in the training set calculate the LBP histogram
# and update X_test, X_name and y_test
for train_image in train_images:
    # Read the image
    im = cv2.imread(train_image)

    # Convert to grayscale as LBP works on grayscale image
    im_gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)

    radius = 3
    # Number of points to be considered as neighbourers 
    no_points = 8 * radius

    # Uniform LBP is used
    lbp = local_binary_pattern(im_gray, no_points, radius, method='uniform')

    # Calculate the histogram
    x = itemfreq(lbp.ravel())


    # Normalize the histogram
    hist = x[:, 1]/sum(x[:, 1])


    # Append image path in X_name
    X_name.append(os.path.join(train_image))

    # Append histogram to X_name
    X_test.append(os.path.join(hist))

    # Append class label in y_test
    #y_test.append(train_dic[os.path.split(images_names)[1]])

h2 = time.time()
t = (h2 - h1)
print"Time taken by LBPH",t

# Persist the (paths, histograms) pair to "lbp.pkl" with compression level 3.
joblib.dump((X_name, X_test), "lbp.pkl", compress=3)

p1 = time.time()
print"Applying PCA on LBP Histograms"
# Stack the per-image histograms into one array (one row per image) for PCA.
X_test = np.array(X_test)
# NOTE(review): 26 presumably matches the histogram length of the LBP step
# (8 * radius + 2 with radius = 3) - confirm; PCA cannot return more
# components than there are features or samples.
pca = PCA(n_components=26)
pca.fit(X_test)
# Project the same data that the PCA was fitted on.
pca_activations = pca.transform(X_test)
p2 = time.time()
# `t` is a scratch variable reused for every timing report in the script.
t = (p2 - p1)
print"Time taken by PCA",t

t1 = time.time()
print"Applying t-SNE on PCA"
# then run the PCA-projected activations through t-SNE to get our final embedding
X = np.array(pca_activations)
# n_components=2 makes `tsne` a two-column array: one (x, y) point per image.
# Note that `tsne` holds the fit_transform() result, not the TSNE estimator.
tsne = TSNE(n_components=2, learning_rate=500, perplexity=50, verbose=2, angle=0.2, early_exaggeration=7.0).fit_transform(X)
print "t-SNE Type", type(tsne)
print"tsne",tsne
t2 = time.time()
t = (t2 - t1)
print"Time taken by t-SNE",t

n1 = time.time()
print"normalize t-sne points to {0,1}"
# Split the embedding into its two columns and min-max scale each one, so the
# smallest value maps to 0 and the largest to 1.
# NOTE(review): if all values of a column are equal, max - min is 0 and the
# division is by zero.
tx, ty = tsne[:,0], tsne[:,1]
tx = (tx-np.min(tx)) / (np.max(tx) - np.min(tx))
ty = (ty-np.min(ty)) / (np.max(ty) - np.min(ty))

n2 = time.time()
t = (n2 - n1)
print "Normalization completed in time",t

# Canvas size in pixels and the divisor used to derive each tile's shrink
# factor.
width = 5000
height = 5000
max_dim = 100

print "displaying"
# Paste every image onto one large canvas at the position given by its
# normalised t-SNE coordinates, then save the canvas as "myTSNE.png".
full_image = Image.new('RGB', (width, height))
for img, x, y in zip(X_name, tx, ty):
    #print "for loop"
    tile = Image.open(img)
    # Shrink factor: at least 1, otherwise the larger of width/max_dim and
    # height/max_dim.
    # NOTE(review): this relies on Python 2 integer division (the script uses
    # Python 2 print statements); under Python 3 `/` yields floats and the
    # resize() size would no longer be integral.
    rs = max(1, tile.width/max_dim, tile.height/max_dim)
    tile = tile.resize((tile.width/rs, tile.height/rs), Image.ANTIALIAS)
    # x and y are in [0, 1]; scaling by (canvas size - max_dim) leaves a
    # max_dim-pixel margin at the right and bottom edges for the tile itself.
    full_image.paste(tile, (int((width-max_dim)*x), int((height-max_dim)*y)))
full_image.save("myTSNE.png")

#matplotlib.pyplot.figure(figsize = (12,12))
#plt.imshow(full_image)

print "K-Means clustering"
#Convert Images to Float32 
images = np.asarray(tsne, np.float32)
N = len(images)
images = images.reshape(N,-1)

#using kmeans clustring having 5 clusters
kmeans = KMeans(n_clusters=5)

#passing images to kmeans 
kmeans.fit(images)

centroids = kmeans.cluster_centers_
labels = kmeans.labels_

colors = 10*['r.','g.','b.','c.','k.','y.','m.']

#I want to Move each cluster to seperate folder (5 clusters means 5 folders)

for i in range(len(images)):
    print("coordinate:",images[i], "label:", labels[i])
    plt.plot(images[i][0], images[i][1], colors[labels[i]], markersize = 10)

    img = cv2.convertScaleAbs(images[i])
    print "Images Type", img.dtype

    pin=sorted([int(n[:n.find('.')]) for n in os.listdir(path)
               if n[0]!='.' ]+[0])[-1] + 1
        cv2.imwrite('%s/%s.png' % (path, pin), img)




plt.scatter(centroids[:, 0],centroids[:, 1], marker = "x", s=150, linewidths = 5, zorder = 10)

plt.show()

end_time=time.time()
total_time=t = (end_time - start_time)
print"Total execution time in seconds",total_time

I am trying to extract the clusters here, but failing. I need the images of each cluster separately as output so that I can manipulate them further.

`#I want to Move each cluster to seperate folder (5 clusters means 5 folders)

For i in range(len(images)):
    print("coordinate:",images[i], "label:", labels[i])
    plt.plot(images[i][0], images[i][1], colors[labels[i]], markersize = 10)

    img = cv2.convertScaleAbs(images[i])
    print "Images Type", img.dtype

` 聚类输出

I want the images in the red cluster kept separate, the images in the blue cluster kept separate, and so on; that is, each cluster in its own folder: 5 clusters, 5 folders.

I have accessed images like this:

# Attempt to save the members of cluster 1 as numbered PNG files in `path`.
# NOTE(review): zip() yields the elements themselves, so `i` is one row of
# `images` and `j` is that row's label - neither is an index. `labels[j]`
# therefore looks up the label array by a label value, and `images[i]` indexes
# `images` by a row. Presumably `j == 1` and the matching original file were
# intended - confirm.
for i,j in zip(images, labels):
    if labels[j] == 1:
        #print "Images Type", images.dtype
        img = images[i]
        # Next free file number: parse the text before the first '.' of every
        # non-hidden name in `path` as an int, take the largest (0 when the
        # directory has none) and add 1. A name whose prefix is not numeric
        # raises ValueError.
        pin=sorted([int(n[:n.find('.')]) for n in os.listdir(path)
                if n[0]!='.' ]+[0])[-1] + 1
        cv2.imwrite('%s/%s.png' % (path, pin), img)

But I am getting deformed images at a very small size. I get output like this: 输出图像

for an image like this 输入

From your code, it seems that your images are stored in the variable images, and that the variable labels is an array of the same length containing the class labels.

If you want to get all the images for a class called myclass , then simply do:

# Keep only the items whose label equals the wanted class; a comprehension
# filters with `if` (there is no `where` keyword in Python).
images_in_myclass = [i for i, j in zip(images, labels) if j == 'myclass']

zip allows you to iterate over the two arrays element-wise, and you are only returning the images for which the label condition is satisfied.

In your code, images does not contain the pictures.

It's an array of coordinates:

# `tsne` is the 2-component t-SNE output, so each row here is an (x, y) point
images = np.asarray(tsne, np.float32)

Writing an array of coordinates to an image file of course yields such a small glitch. If you want the original images, copy the original image files.

Note that tSNE is a visualization technique. It is probably not a good idea to use this visualization for clustering; as Van der Maaten and Hinton note: "it is unclear how t-SNE performs on the more general dimensionality reduction tasks". For clustering, it may be sensible to use the original data (and a better algorithm than k-means); and use tSNE only for visualizing and validating the result.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM