In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sn
import pprint
%matplotlib inline
In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.datasets import make_blobs
from collections import Counter
class ClusteringManager(object):
    """Load one feature-vector CSV and plot diagnostics for choosing K-Means ``k``.

    On construction the file ``rawdata/pure_vector_{rec_uf_g_idx}_new.csv`` is
    read with pandas.  Every column except ``member_idx`` is used as a
    clustering feature.

    Attributes:
        rec_uf_g_idx: Index used to build the CSV file name.
        factor_limit_cnt: Stored as given; not used by the methods in this class.
        k, k_init: Both set to the ``k`` constructor argument; not used by the
            methods in this class.
        file_path: Relative path of the CSV that was loaded.
        df_all: The full DataFrame as read from the CSV.
        df: ``df_all`` without the ``member_idx`` column.
        cluster_result: Empty dict created at construction.
    """

    def __init__(self, rec_uf_g_idx, factor_limit_cnt=4, k=2):
        """Read the CSV for ``rec_uf_g_idx`` and split off the feature columns.

        Args:
            rec_uf_g_idx: Value interpolated into the CSV file name.
            factor_limit_cnt: Stored on the instance unchanged.
            k: Stored on the instance as both ``k`` and ``k_init``.

        Raises:
            FileNotFoundError: If the CSV does not exist (raised by pandas).
        """
        self.rec_uf_g_idx = rec_uf_g_idx
        self.factor_limit_cnt = factor_limit_cnt
        self.k = k
        self.k_init = k
        self.file_path = f'rawdata/pure_vector_{rec_uf_g_idx}_new.csv'
        self.df_all = pd.read_csv(self.file_path, sep=',')
        # Boolean column mask: keep everything except the 'member_idx' column.
        self.df = self.df_all.loc[:, self.df_all.columns != 'member_idx']
        self.cluster_result = {}

    def get_df(self):
        """Return the feature DataFrame (all columns except ``member_idx``)."""
        return self.df

    def get_df_all(self):
        """Return the full DataFrame exactly as read from the CSV."""
        return self.df_all

    @staticmethod
    def _feasible_k_values(k_min, k_max, n_limit):
        """Return the list ``k_min..k_max`` (inclusive), capped at ``n_limit``.

        The result is empty when the cap falls below ``k_min``.
        """
        return list(range(k_min, min(k_max, n_limit) + 1))

    @staticmethod
    def _make_kmeans(n_clusters):
        """Build the K-Means estimator shared by both diagnostics.

        ``n_init`` is pinned to 10 so the number of restarts does not depend on
        which scikit-learn release is installed (the library default changed
        from 10 to ``'auto'`` in newer releases).
        """
        return KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                      random_state=0)

    def elbow(self, max_k=10):
        """Plot the elbow curve used to pick a suitable number of clusters.

        Two figures are shown: the K-Means SSE (``inertia_``) for each ``k``
        from 1 to ``max_k``, and the change in SSE between consecutive ``k``.

        Args:
            max_k: Largest cluster count to try.  It is silently capped at the
                number of rows, because K-Means cannot fit more clusters than
                samples.

        Raises:
            ValueError: If there is no feasible ``k`` (empty data or ``max_k < 1``).
        """
        k_values = self._feasible_k_values(1, max_k, len(self.df))
        if not k_values:
            raise ValueError(
                'elbow() needs at least one sample and max_k >= 1 '
                f'(got {len(self.df)} samples, max_k={max_k})')

        sse = [self._make_kmeans(k).fit(self.df).inertia_ for k in k_values]

        plt.plot(k_values, sse, marker='o')
        plt.xlabel('k')
        plt.ylabel('SSE')
        plt.show()

        if len(sse) < 2:
            # A single k has no successor, so there is no delta to plot.
            return

        # Map k -> SSE(k+1) - SSE(k); zip stops at the last k that has a successor.
        sse_d = {k: nxt - cur for k, cur, nxt in zip(k_values, sse, sse[1:])}
        # Materialise the dict view as a list before handing it to matplotlib.
        plt.plot([f'{k}~{k+1}/' for k in sse_d], list(sse_d.values()), marker='o')
        plt.xlabel('k_delta')
        plt.ylabel('SSE_delta')
        plt.show()

    def plot_silhouette(self, max_k=10):
        """Show one silhouette plot per cluster count (silhouette method).

        For every ``k`` from 2 to ``max_k`` the data is clustered with K-Means,
        the average silhouette score is printed, and a figure is drawn with one
        horizontal band of sorted per-sample scores for each cluster.

        Args:
            max_k: Largest cluster count to try.  It is capped at
                ``n_samples - 1``, the largest label count for which the
                silhouette coefficient is defined.

        Raises:
            ValueError: If no ``k`` in ``2..max_k`` is feasible for the data size.
        """
        n_samples = len(self.df)
        k_values = self._feasible_k_values(2, max_k, n_samples - 1)
        if not k_values:
            raise ValueError(
                'plot_silhouette() needs at least 3 samples and max_k >= 2 '
                f'(got {n_samples} samples, max_k={max_k})')

        band_gap = 10  # blank rows inserted between two cluster bands

        for n_clusters in k_values:
            # One figure with a single axis per cluster count.
            fig, ax1 = plt.subplots(1)
            fig.set_size_inches(14, 8)

            # random_state=0 inside the estimator keeps the labels reproducible.
            cluster_labels = self._make_kmeans(n_clusters).fit_predict(self.df)

            # Per-sample scores are computed once; the overall score is their
            # mean, so the pairwise-distance work is not repeated.
            sample_silhouette_values = silhouette_samples(self.df, cluster_labels)
            silhouette_avg = sample_silhouette_values.mean()
            print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

            # Coefficients lie in [-1, 1]; widen the left limit when needed so
            # strongly negative samples are not clipped.
            ax1.set_xlim([min(-0.1, float(sample_silhouette_values.min())), 1])
            # Room for every sample plus the gaps around each band.
            ax1.set_ylim([0, n_samples + (n_clusters + 1) * band_gap])

            y_lower = band_gap
            for i in range(n_clusters):
                # Scores of the samples assigned to cluster i, in ascending order.
                ith_cluster_silhouette_values = np.sort(
                    sample_silhouette_values[cluster_labels == i])
                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                color = cm.nipy_spectral(float(i) / n_clusters)
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0, ith_cluster_silhouette_values,
                                  facecolor=color, edgecolor=color, alpha=0.7)

                # Write the cluster number at the vertical middle of its band.
                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                # Start the next band above this one, leaving a gap.
                y_lower = y_upper + band_gap

            ax1.set_title("The silhouette plot for the various clusters. (%f)" % silhouette_avg)
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")
            # The y positions are sample offsets, not labels, so hide the ticks.
            ax1.set_yticks([])

            # Vertical line marking the average silhouette score of all samples.
            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

            plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                          "with n_clusters = %d" % n_clusters),
                         fontsize=14, fontweight='bold')
            plt.show()
            # Release the figure so a long loop does not accumulate open figures.
            plt.close(fig)
In [7]:
# Dataset index 27 (rawdata/pure_vector_27_new.csv): elbow curve first,
# then one silhouette plot per candidate k.
res = ClusteringManager(rec_uf_g_idx=27)
res.elbow()
res.plot_silhouette()
In [4]:
# Dataset index 28 (rawdata/pure_vector_28_new.csv): elbow curve first,
# then one silhouette plot per candidate k.
cm2 = ClusteringManager(rec_uf_g_idx=28)
cm2.elbow()
cm2.plot_silhouette()
In [5]:
# Dataset index 29 (rawdata/pure_vector_29_new.csv): elbow curve first,
# then one silhouette plot per candidate k.
cm3 = ClusteringManager(rec_uf_g_idx=29)
cm3.elbow()
cm3.plot_silhouette()
'Web > Python' 카테고리의 다른 글
Hierarchical Clustering (0) | 2019.01.17 |
---|---|
DBSCAN clustering (0) | 2019.01.17 |
k-means clustering (0) | 2019.01.17 |