Identify groups and patterns in data using k-means, hierarchical clustering, and DBSCAN for cluster discovery, customer segmentation, and unsupervised learning
Clustering Analysis
Overview
Clustering partitions data into groups of similar observations without pre-defined labels, enabling discovery of natural patterns and structures in data.
When to Use
Segmenting customers based on purchasing behavior or demographics
Discovering natural groupings in data without prior knowledge of categories
Identifying market segments for targeted marketing campaigns
Organizing large datasets into meaningful categories for further analysis
Finding patterns in gene expression data or medical imaging
Grouping documents, products, or users by similarity for recommendation systems
Clustering Algorithms
K-Means: Partitioning into k clusters
Hierarchical: Dendrograms showing nested clusters
DBSCAN: Density-based arbitrary-shaped clusters
Gaussian Mixture: Probabilistic clustering
Agglomerative: Bottom-up hierarchical approach
Key Concepts
Cluster Validation: Metrics to evaluate cluster quality
Optimal Clusters: Methods to determine best k
Inertia: Within-cluster sum of squares
Silhouette Score: Measure of how well each point fits its own cluster compared with the nearest neighboring cluster (cohesion vs. separation)
Dendrogram: Hierarchical clustering visualization
Implementation with Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
silhouette_score, silhouette_samples, davies_bouldin_score,
calinski_harabasz_score
)
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns
# Generate sample data: one isotropic unit-variance Gaussian blob per center.
np.random.seed(42)  # fixed seed so the figures and metrics are reproducible
n_samples = 300
centers = [[0, 0], [5, 5], [-3, 4]]
# Derive the blob size from n_samples instead of hard-coding 100 three times,
# so changing n_samples (or the number of centers) actually changes the data.
# NOTE: if n_samples is not divisible by len(centers), the remainder is dropped.
points_per_center = n_samples // len(centers)
X = np.vstack([
    np.random.randn(points_per_center, 2) + center
    for center in centers
])
# Standardize: fit the scaler on X, then project X into standardized units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# K-Means with Elbow method: for every candidate k record the inertia and
# the mean silhouette score.
k_range = range(2, 11)
inertias = []
silhouette_scores = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# One panel per diagnostic curve: (values, line format, y-label, title).
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
diagnostic_panels = [
    (inertias, 'bo-', 'Inertia', 'Elbow Method'),
    (silhouette_scores, 'go-', 'Silhouette Score', 'Silhouette Analysis'),
]
for panel_ax, (curve, line_fmt, y_label, panel_title) in zip(axes, diagnostic_panels):
    panel_ax.plot(k_range, curve, line_fmt)
    panel_ax.set_xlabel('Number of Clusters (k)')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
# Optimal k = 3
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)

# K-Means visualization: cluster scatter, silhouette plot, Ward dendrogram
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# K-Means clusters
# The model was fit on X_scaled, so cluster_centers_ is in standardized units.
# Map the centroids back to the raw feature space before overlaying them on
# the raw X scatter; otherwise the markers are drawn in the wrong place.
centers_raw = scaler.inverse_transform(kmeans.cluster_centers_)
axes[0].scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis', alpha=0.6)
axes[0].scatter(
    centers_raw[:, 0], centers_raw[:, 1],
    c='red', marker='X', s=200, edgecolors='black', linewidths=2
)
axes[0].set_title(f'K-Means (k={optimal_k})')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')

# Silhouette plot: one horizontal band per cluster, samples sorted by their
# silhouette coefficient, with a 10-row gap between bands.
ax = axes[1]
y_lower = 10
silhouette_vals = silhouette_samples(X_scaled, kmeans_labels)
band_midpoints = []
for i in range(optimal_k):
    cluster_silhouette_vals = np.sort(silhouette_vals[kmeans_labels == i])
    size_cluster_i = cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i
    ax.fill_betweenx(np.arange(y_lower, y_upper),
                     0, cluster_silhouette_vals,
                     alpha=0.7, label=f'Cluster {i}')
    band_midpoints.append(y_lower + size_cluster_i / 2)
    y_lower = y_upper + 10
# Dashed line marks the mean silhouette score over all samples.
ax.axvline(x=silhouette_score(X_scaled, kmeans_labels), color="red", linestyle="--")
# Tick each band with its cluster id so the 'Cluster Label' axis is meaningful
# (the raw y values are only stacking offsets).
ax.set_yticks(band_midpoints)
ax.set_yticklabels([str(i) for i in range(optimal_k)])
ax.set_xlabel('Silhouette Coefficient')
ax.set_ylabel('Cluster Label')
ax.set_title('Silhouette Plot')

# Hierarchical clustering
linkage_matrix = linkage(X_scaled, method='ward')
dendrogram(linkage_matrix, ax=axes[2], truncate_mode='lastp', p=10)
axes[2].set_title('Dendrogram (Ward)')
# With truncate_mode='lastp' most leaves are contracted clusters labelled
# "(count)", not individual samples, so the axis label says so.
axes[2].set_xlabel('Sample Index or (Cluster Size)')

plt.tight_layout()
plt.show()
# Hierarchical clustering (agglomerative, Ward linkage) with the same k as K-Means
hierarchical = AgglomerativeClustering(n_clusters=optimal_k, linkage='ward')
hier_labels = hierarchical.fit(X_scaled).labels_

# DBSCAN clustering; the label -1 marks noise points
dbscan = DBSCAN(eps=0.4, min_samples=5)
dbscan_labels = dbscan.fit(X_scaled).labels_
is_noise = dbscan_labels == -1
n_noise = int(is_noise.sum())
# Count distinct cluster ids among the non-noise points only.
n_clusters_dbscan = len(set(dbscan_labels[~is_noise]))

# Gaussian Mixture Model: hard labels plus per-component membership probabilities
gmm = GaussianMixture(n_components=optimal_k, random_state=42)
gmm_labels = gmm.fit_predict(X_scaled)
gmm_proba = gmm.predict_proba(X_scaled)
# Clustering algorithm comparison: one panel per algorithm on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

algorithms = [
    (kmeans_labels, 'K-Means'),
    (hier_labels, 'Hierarchical'),
    (dbscan_labels, 'DBSCAN'),
    (gmm_labels, 'Gaussian Mixture'),
]

# axes.ravel() walks the grid row by row, matching the order of `algorithms`.
for ax, (labels, title) in zip(axes.ravel(), algorithms):
    # Only DBSCAN emits -1 (noise); for the other algorithms this keeps every point.
    clustered = labels != -1
    scatter = ax.scatter(
        X[clustered, 0], X[clustered, 1],
        c=labels[clustered], cmap='viridis', alpha=0.6,
    )

    # Overlay DBSCAN's noise points as red crosses.
    if title == 'DBSCAN' and n_noise > 0:
        noise_points = X[~clustered]
        ax.scatter(noise_points[:, 0], noise_points[:, 1],
                   c='red', marker='x', s=100, label='Noise')
        ax.legend()

    found_clusters = len(set(labels[clustered]))
    ax.set_title(f'{title} (n_clusters={found_clusters})')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

plt.tight_layout()
plt.show()
# Cluster validation metrics
# DBSCAN labels noise as -1; score it on the clustered points only.
dbscan_clustered = dbscan_labels != -1
dbscan_points = X_scaled[dbscan_clustered]
dbscan_assigned = dbscan_labels[dbscan_clustered]
# All three metrics raise ValueError unless 2 <= n_labels <= n_samples - 1.
# Checking only "not everything is noise" is not enough: DBSCAN can return a
# single cluster, which would crash the scoring. Report NaN in that case.
dbscan_scorable = 2 <= len(set(dbscan_assigned)) < len(dbscan_assigned)

# (algorithm name, data to score, labels, whether the labelling can be scored)
scored_labelings = [
    ('K-Means', X_scaled, kmeans_labels, True),
    ('Hierarchical', X_scaled, hier_labels, True),
    ('DBSCAN', dbscan_points, dbscan_assigned, dbscan_scorable),
    ('GMM', X_scaled, gmm_labels, True),
]
metric_functions = {
    'Silhouette Score': silhouette_score,
    'Davies-Bouldin Index': davies_bouldin_score,
    'Calinski-Harabasz Index': calinski_harabasz_score,
}

validation_metrics = {
    'Algorithm': [algo_name for algo_name, _, _, _ in scored_labelings],
}
for metric_name, metric_fn in metric_functions.items():
    validation_metrics[metric_name] = [
        metric_fn(points, assigned) if scorable else np.nan
        for _, points, assigned, scorable in scored_labelings
    ]

metrics_df = pd.DataFrame(validation_metrics)
print("Clustering Validation Metrics:")
print(metrics_df)
# Cluster size analysis: points per cluster id, one column per algorithm.
# DBSCAN is left out here, as in the metrics above it is handled separately.
labels_by_algorithm = [
    ('K-Means', kmeans_labels),
    ('Hierarchical', hier_labels),
    ('GMM', gmm_labels),
]
sizes_df = pd.DataFrame({
    algo_name: pd.Series(algo_labels).value_counts().sort_index()
    for algo_name, algo_labels in labels_by_algorithm
})

print("\nCluster Sizes:")
print(sizes_df)
# Membership probability (GMM): colour each point by the probability of its
# most likely component, i.e. how confident the hard assignment is.
membership = np.max(gmm_proba, axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    X[:, 0], X[:, 1],
    c=membership, cmap='RdYlGn', alpha=0.6, s=50,
)
ax.set(
    title='Cluster Membership Confidence (GMM)',
    xlabel='Feature 1',
    ylabel='Feature 2',
)
fig.colorbar(scatter, ax=ax, label='Membership Probability')
plt.show()
# Cluster characteristics
# Centroids mapped back from standardized units to the raw feature scale
# (computed for reference; not used further in this block).
kmeans_centers_original = scaler.inverse_transform(kmeans.cluster_centers_)

# Raw features plus each point's K-Means cluster id.
feature_columns = ['Feature 1', 'Feature 2']
cluster_df = pd.DataFrame(X, columns=feature_columns)
cluster_df['Cluster'] = kmeans_labels

# Print summary statistics of the raw features for every cluster.
for cluster_id in range(optimal_k):
    in_cluster = cluster_df['Cluster'] == cluster_id
    cluster_data = cluster_df[in_cluster]
    print(f"\nCluster {cluster_id} Characteristics:")
    print(cluster_data[feature_columns].describe())
Cluster Quality Metrics
Silhouette Score: -1 to 1 (higher is better)
Davies-Bouldin Index: Lower is better
Calinski-Harabasz Index: Higher is better
Inertia: Lower is better at a fixed k (KMeans only); it always decreases as k grows, so look for the elbow rather than the minimum
Algorithm Selection
K-Means: Fast; assumes roughly spherical clusters; k must be specified in advance
Hierarchical: Produces dendrogram, interpretable
DBSCAN: Arbitrary shapes, handles noise
GMM: Probabilistic, soft assignments
Deliverables
Optimal cluster count analysis
Cluster visualizations
Validation metrics comparison
Cluster characteristics summary
Silhouette plots
Dendrogram for hierarchical clustering
Membership assignments