본문 바로가기

Web/Python

k-means clustering-check

clustering-check
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sn
import pprint

%matplotlib inline
In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.datasets import make_blobs
from collections import Counter


class ClusteringManager:
    """Load one feature-vector CSV and provide visual aids for choosing k.

    The CSV is read from ``rawdata/pure_vector_<rec_uf_g_idx>_new.csv``.
    Every column except ``member_idx`` is used as clustering input.

    Attributes:
        rec_uf_g_idx: Identifier used to build the CSV file name.
        factor_limit_cnt: Stored as given; not used by the methods of this class.
        k: Stored as given; not used by the methods of this class.
        k_init: Copy of the ``k`` passed to the constructor.
        file_path: Relative path of the CSV that was loaded.
        df_all: The full DataFrame as read from the CSV.
        df: ``df_all`` without the ``member_idx`` column.
        cluster_result: Empty dict created at construction; not filled here.
    """

    def __init__(self, rec_uf_g_idx, factor_limit_cnt=4, k=2):
        """Read the CSV for ``rec_uf_g_idx`` and prepare the clustering frame.

        Args:
            rec_uf_g_idx: Identifier interpolated into the CSV file name.
            factor_limit_cnt: Stored on the instance unchanged.
            k: Stored on the instance as both ``k`` and ``k_init``.

        Raises:
            FileNotFoundError: Propagated from ``pd.read_csv`` when the CSV
                does not exist.
        """
        self.rec_uf_g_idx = rec_uf_g_idx
        self.factor_limit_cnt = factor_limit_cnt
        self.k = k
        self.k_init = k
        self.file_path = f'rawdata/pure_vector_{rec_uf_g_idx}_new.csv'
        self.df_all = pd.read_csv(self.file_path, sep=',')
        # The member identifier is not a feature, so keep it out of the
        # frame that is handed to KMeans.
        self.df = self.df_all.loc[:, self.df_all.columns != 'member_idx']
        self.cluster_result = {}

    def get_df(self):
        """Return the feature frame (all columns except ``member_idx``)."""
        return self.df

    def get_df_all(self):
        """Return the full frame exactly as it was read from the CSV."""
        return self.df_all

    @staticmethod
    def _sse_deltas(sse):
        """Map k -> SSE(k+1) - SSE(k) for a list of SSE values starting at k=1."""
        return {i + 1: sse[i + 1] - sse[i] for i in range(len(sse) - 1)}

    def elbow(self, k_max=10):
        """Plot the elbow curve used to look for a suitable number of clusters.

        Fits KMeans for every k from 1 to ``k_max`` and draws two figures:
        the SSE (``inertia_``) per k, and the change in SSE between
        consecutive values of k.

        Args:
            k_max: Largest number of clusters to try (inclusive). Defaults to
                10, the range the method originally hard-coded.
        """
        ks = list(range(1, k_max + 1))
        # n_init is pinned to 10 so that scikit-learn releases whose default
        # became 'auto' still run the same number of initialisations.
        sse = [
            KMeans(n_clusters=k, init='k-means++', n_init=10,
                   random_state=0).fit(self.df).inertia_
            for k in ks
        ]

        plt.plot(ks, sse, marker='o')
        plt.xlabel('k')
        plt.ylabel('SSE')
        plt.show()

        sse_d = self._sse_deltas(sse)
        plt.plot([f'{k}~{k+1}/' for k in sse_d], list(sse_d.values()), marker='o')
        plt.xlabel('k_delta')
        plt.ylabel('SSE_delta')
        plt.show()

    def plot_silhouette(self, k_min=2, k_max=10):
        """Draw one silhouette plot per k and print the average score.

        For each k from ``k_min`` to ``k_max`` the data is clustered with
        KMeans, the average silhouette score is printed, and a figure with
        the sorted per-sample silhouette values of every cluster is shown.

        Args:
            k_min: Smallest number of clusters to try (inclusive). Must be at
                least 2 because the silhouette needs two or more clusters.
            k_max: Largest number of clusters to try (inclusive).

        Raises:
            ValueError: If ``k_min`` is smaller than 2.
        """
        if k_min < 2:
            raise ValueError('k_min must be at least 2 for silhouette analysis')

        for n_clusters in range(k_min, k_max + 1):
            # A single-axes figure holding the silhouette plot.
            fig, ax1 = plt.subplots(1)
            fig.set_size_inches(14, 8)

            # Silhouette coefficients lie in [-1, 1]; the x axis is limited
            # to [-0.1, 1] here.
            ax1.set_xlim([-0.1, 1])

            # Fixed random_state (0) for reproducibility; n_init pinned to 10
            # so newer scikit-learn defaults do not change the result.
            clusterer = KMeans(n_clusters=n_clusters, init='k-means++',
                               n_init=10, random_state=0)
            cluster_labels = clusterer.fit_predict(self.df)

            # Per-sample silhouette values; their mean is the average
            # silhouette score, so the pairwise distances are computed once.
            sample_silhouette_values = silhouette_samples(self.df, cluster_labels)
            silhouette_avg = np.mean(sample_silhouette_values)
            print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

            y_lower = 10
            for i in range(n_clusters):
                # Sorted silhouette values of the samples assigned to cluster i.
                ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
                ith_cluster_silhouette_values.sort()

                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                color = cm.nipy_spectral(float(i) / n_clusters)
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0, ith_cluster_silhouette_values,
                                  facecolor=color, edgecolor=color, alpha=0.7)

                # Put the cluster number at the vertical middle of its band.
                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                # Leave a gap of 10 rows before the next cluster's band.
                y_lower = y_upper + 10

            ax1.set_title("The silhouette plot for the various clusters. (%f)" % silhouette_avg)
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")

            # Vertical line marking the average silhouette score.
            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

            plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                          "with n_clusters = %d" % n_clusters),
                         fontsize=14, fontweight='bold')

            # Show and release each figure inside the loop so that at most
            # one figure is open at a time.
            plt.show()
            plt.close(fig)
In [7]:
# Run both k-selection diagnostics (elbow curve, then silhouette plots) on
# the dataset with rec_uf_g_idx=27, read from rawdata/pure_vector_27_new.csv.
res = ClusteringManager(27)
res.elbow()
res.plot_silhouette()
For n_clusters = 2 The average silhouette_score is : 0.3581656737263628
For n_clusters = 3 The average silhouette_score is : 0.219458344360034
For n_clusters = 4 The average silhouette_score is : 0.2010447087199042
For n_clusters = 5 The average silhouette_score is : 0.1867630836499229
For n_clusters = 6 The average silhouette_score is : 0.1449763274605214
For n_clusters = 7 The average silhouette_score is : 0.15966931714472116
For n_clusters = 8 The average silhouette_score is : 0.15825056888315572
For n_clusters = 9 The average silhouette_score is : 0.1741002636021062
For n_clusters = 10 The average silhouette_score is : 0.1695097202472913
In [4]:
# Run both k-selection diagnostics (elbow curve, then silhouette plots) on
# the dataset with rec_uf_g_idx=28, read from rawdata/pure_vector_28_new.csv.
cm2 = ClusteringManager(28)
cm2.elbow()
cm2.plot_silhouette()
For n_clusters = 2 The average silhouette_score is : 0.15733173801316172
For n_clusters = 3 The average silhouette_score is : 0.1186702254780341
For n_clusters = 4 The average silhouette_score is : 0.10827643805189824
For n_clusters = 5 The average silhouette_score is : 0.10334779101280687
For n_clusters = 6 The average silhouette_score is : 0.0850718661196539
For n_clusters = 7 The average silhouette_score is : 0.08019888041177912
For n_clusters = 8 The average silhouette_score is : 0.06252800179993218
For n_clusters = 9 The average silhouette_score is : 0.07684464559442432
For n_clusters = 10 The average silhouette_score is : 0.0746713396063883
In [5]:
# Run both k-selection diagnostics (elbow curve, then silhouette plots) on
# the dataset with rec_uf_g_idx=29, read from rawdata/pure_vector_29_new.csv.
cm3 = ClusteringManager(29)
cm3.elbow()
cm3.plot_silhouette()
For n_clusters = 2 The average silhouette_score is : 0.15898373338333446
For n_clusters = 3 The average silhouette_score is : 0.1591523637366883
For n_clusters = 4 The average silhouette_score is : 0.11407208136709593
For n_clusters = 5 The average silhouette_score is : 0.11603332428069567
For n_clusters = 6 The average silhouette_score is : 0.1263449070871331
For n_clusters = 7 The average silhouette_score is : 0.11453758389701854
For n_clusters = 8 The average silhouette_score is : 0.10865858306141145
For n_clusters = 9 The average silhouette_score is : 0.10824713020106996
For n_clusters = 10 The average silhouette_score is : 0.10618402592672672

'Web > Python' 카테고리의 다른 글

Hierarchical Clustering  (0) 2019.01.17
DBSCAN clustering  (0) 2019.01.17
k-means clustering  (0) 2019.01.17