What Is Clustering?
Learn the fundamental concept of grouping similar data points together without labeled examples.
Grouping
Similarity
Unsupervised
Clustering is the task of grouping data points such that points in the same group (cluster) are more similar to each other than to points in other groups.
Types of Clustering Algorithms
Explore different approaches to clustering and understand where K-means fits.
Partitional: K-means, K-medoids - Divide data into non-overlapping clusters
Hierarchical: Agglomerative, Divisive - Create tree-like cluster structures
Density-based: DBSCAN, OPTICS - Find clusters based on density regions
Model-based: Gaussian Mixture Models - Assume underlying probability distributions
# Overview of clustering algorithm types
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
# Synthetic 2-feature dataset: 300 samples drawn around 4 blob centers.
X, y_true = make_blobs(
    n_samples=300,
    centers=4,
    n_features=2,
    random_state=42,
    cluster_std=1.5,
)

print("=== CLUSTERING ALGORITHM COMPARISON ===")

# Partitional approach: K-means, asked for 4 clusters.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans_labels = kmeans.fit_predict(X)
print(f"K-means clusters: {np.unique(kmeans_labels).size}")

# Hierarchical approach: agglomerative merging, asked for 4 clusters.
hierarchical = AgglomerativeClustering(n_clusters=4)
hierarchical_labels = hierarchical.fit_predict(X)
print(f"Hierarchical clusters: {np.unique(hierarchical_labels).size}")

# Density-based approach: DBSCAN is not given a cluster count.
dbscan = DBSCAN(eps=0.8, min_samples=5)
dbscan_labels = dbscan.fit_predict(X)
# The -1 label is excluded from the count (DBSCAN uses it for noise points).
n_dbscan_clusters = len(set(dbscan_labels) - {-1})
print(f"DBSCAN clusters: {n_dbscan_clusters}")

# Model-based approach: Gaussian mixture with 4 components.
gmm = GaussianMixture(n_components=4, random_state=42)
gmm_labels = gmm.fit_predict(X)
print(f"GMM clusters: {np.unique(gmm_labels).size}")
# Use a real newline escape ("\n"); the doubled backslash printed a literal
# backslash-n instead of a blank line before the heading.
print("\n=== ALGORITHM CHARACTERISTICS ===")

# Summary table: algorithm name -> {property name -> short description}.
characteristics = {
    "K-means": {
        "Type": "Partitional",
        "Clusters": "Spherical, equal size",
        "K": "Must specify",
        "Complexity": "O(n*k*i*d)",
    },
    "Hierarchical": {
        "Type": "Hierarchical",
        "Clusters": "Any shape",
        "K": "Choose from dendrogram",
        "Complexity": "O(n³)",
    },
    "DBSCAN": {
        "Type": "Density-based",
        "Clusters": "Any shape, handles noise",
        "K": "Automatic",
        "Complexity": "O(n log n)",
    },
}

# One section per algorithm, each separated from the previous by a blank line.
for alg, props in characteristics.items():
    print(f"\n{alg}:")
    for prop, value in props.items():
        print(f" {prop}: {value}")
Clustering Applications
Discover real-world applications where clustering provides valuable insights.
Customer Segmentation: Group customers by behavior
Market Research: Identify consumer preferences
Image Segmentation: Separate image regions
Gene Sequencing: Group similar genetic patterns
Recommendation Systems: Find user/item groups
# Real-world clustering applications
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
print("=== CUSTOMER SEGMENTATION EXAMPLE ===")

# Simulate customer data; the fixed seed makes the draws repeatable.
np.random.seed(42)
n_customers = 1000

# Customer features, each drawn independently from a normal distribution
# given as (mean, standard deviation, number of samples).
age = np.random.normal(40, 15, n_customers)
income = np.random.normal(50000, 20000, n_customers)
spending_score = np.random.normal(50, 25, n_customers)

# Create the customer dataset, clipping every feature into a bounded range.
customers = pd.DataFrame({
    'Age': np.clip(age, 18, 80),
    'Income': np.clip(income, 20000, 150000),
    'SpendingScore': np.clip(spending_score, 1, 100),
})

print("Customer data sample:")
print(customers.head())
# Real newline escape: the doubled backslash printed a literal backslash-n.
print(f"\nDataset shape: {customers.shape}")
# Standardize only the feature columns, so that re-running this section after
# 'Segment' has been added does not feed the old labels back into clustering.
feature_columns = ['Age', 'Income', 'SpendingScore']
scaler = StandardScaler()
customers_scaled = scaler.fit_transform(customers[feature_columns])

# Apply K-means clustering on the standardized features.
kmeans = KMeans(n_clusters=4, random_state=42)
customer_segments = kmeans.fit_predict(customers_scaled)

# Add cluster labels to the dataframe (original, unscaled values are kept).
customers['Segment'] = customer_segments

# Real newline escapes below: the doubled backslash printed a literal
# backslash-n instead of a blank line.
print("\n=== CUSTOMER SEGMENTS ===")
# Iterate over the configured cluster count instead of repeating the literal 4.
for segment in range(kmeans.n_clusters):
    segment_data = customers[customers['Segment'] == segment]
    print(f"\nSegment {segment}: {len(segment_data)} customers")
    print(f" Avg Age: {segment_data['Age'].mean():.1f}")
    print(f" Avg Income: ${segment_data['Income'].mean():,.0f}")
    print(f" Avg Spending: {segment_data['SpendingScore'].mean():.1f}")
# Real newline escape: the doubled backslash printed a literal backslash-n.
print("\n=== OTHER APPLICATIONS ===")

# Application domain -> example clustering use cases in that domain.
applications = {
    "Healthcare": "Patient risk stratification, disease subtypes",
    "Marketing": "Campaign targeting, product positioning",
    "Biology": "Gene expression analysis, species classification",
    "Image Processing": "Color quantization, object detection",
    "Social Networks": "Community detection, influence analysis",
    "Finance": "Portfolio optimization, fraud detection",
}
for domain, use_cases