MemoLearning K-Means Clustering

What is Clustering

Learn the fundamental concept of grouping similar data points together without labeled examples.

Key concepts: Grouping, Similarity, Unsupervised Learning

Clustering is the task of grouping data points such that points in the same group (cluster) are more similar to each other than to points in other groups.

Types of Clustering Algorithms

Explore different approaches to clustering and understand where K-means fits.

Partitional: K-means, K-medoids - Divide data into non-overlapping clusters
Hierarchical: Agglomerative, Divisive - Create tree-like cluster structures
Density-based: DBSCAN, OPTICS - Find clusters based on density regions
Model-based: Gaussian Mixture Models - Assume underlying probability distributions

          # Overview of clustering algorithm types.
          #
          # Fits one representative of each clustering family on the same synthetic
          # data set, prints how many clusters each one reports, and then prints a
          # short table of characteristics per algorithm.
          import numpy as np
          from sklearn.datasets import make_blobs
          from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
          from sklearn.mixture import GaussianMixture
          import matplotlib.pyplot as plt

          # Generate sample data: 300 two-dimensional points drawn around 4 centers.
          # y_true (the generating center of each point) is not used below.
          X, y_true = make_blobs(n_samples=300, centers=4, n_features=2,
                                 random_state=42, cluster_std=1.5)

          print("=== CLUSTERING ALGORITHM COMPARISON ===")

          # K-means (Partitional)
          # n_init is set explicitly because its default differs between
          # scikit-learn releases; pinning it keeps the result reproducible.
          kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
          kmeans_labels = kmeans.fit_predict(X)
          print(f"K-means clusters: {len(np.unique(kmeans_labels))}")

          # Hierarchical (Agglomerative)
          hierarchical = AgglomerativeClustering(n_clusters=4)
          hierarchical_labels = hierarchical.fit_predict(X)
          print(f"Hierarchical clusters: {len(np.unique(hierarchical_labels))}")

          # Density-based (DBSCAN)
          dbscan = DBSCAN(eps=0.8, min_samples=5)
          dbscan_labels = dbscan.fit_predict(X)
          # The label -1 is excluded from the count: it marks noise, not a cluster.
          n_dbscan_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
          print(f"DBSCAN clusters: {n_dbscan_clusters}")

          # Model-based (Gaussian Mixture)
          gmm = GaussianMixture(n_components=4, random_state=42)
          gmm_labels = gmm.fit_predict(X)
          print(f"GMM clusters: {len(np.unique(gmm_labels))}")

          # "\n" (not "\\n") so that a blank line is printed before the heading.
          print("\n=== ALGORITHM CHARACTERISTICS ===")

          characteristics = {
              "K-means": {
                  "Type": "Partitional",
                  "Clusters": "Spherical, equal size",
                  "K": "Must specify",
                  "Complexity": "O(n*k*i*d)",
              },
              "Hierarchical": {
                  "Type": "Hierarchical",
                  "Clusters": "Any shape",
                  "K": "Choose from dendrogram",
                  "Complexity": "O(n³)",
              },
              "DBSCAN": {
                  "Type": "Density-based",
                  "Clusters": "Any shape, handles noise",
                  "K": "Automatic",
                  "Complexity": "O(n log n)",
              },
          }

          # One heading per algorithm followed by its indented property list.
          for alg, props in characteristics.items():
              print(f"\n{alg}:")
              for prop, value in props.items():
                  print(f"  {prop}: {value}")

Clustering Applications

Discover real-world applications where clustering provides valuable insights.

Customer Segmentation: Group customers by behavior
Market Research: Identify consumer preferences
Image Segmentation: Separate image regions
Gene Sequencing: Group similar genetic patterns
Recommendation Systems: Find user/item groups

          # Real-world clustering applications.
          #
          # Simulates a customer table (age, income, spending score), standardizes
          # the features, segments the customers with K-means, and prints the size
          # and average profile of every segment.
          import numpy as np
          import pandas as pd
          from sklearn.cluster import KMeans
          from sklearn.preprocessing import StandardScaler

          print("=== CUSTOMER SEGMENTATION EXAMPLE ===")

          # Simulate customer data
          np.random.seed(42)
          n_customers = 1000
          # Single source of truth for the number of segments: used both to fit
          # K-means and to iterate over the resulting labels.
          n_segments = 4

          # Customer features
          age = np.random.normal(40, 15, n_customers)
          income = np.random.normal(50000, 20000, n_customers)
          spending_score = np.random.normal(50, 25, n_customers)

          # Create customer dataset; each feature is clipped to a fixed range.
          customers = pd.DataFrame({
              'Age': np.clip(age, 18, 80),
              'Income': np.clip(income, 20000, 150000),
              'SpendingScore': np.clip(spending_score, 1, 100),
          })

          print("Customer data sample:")
          print(customers.head())
          # "\n" (not "\\n") so that a blank line is printed, here and below.
          print(f"\nDataset shape: {customers.shape}")

          # Standardize features. This runs before the 'Segment' column is added,
          # so only the three numeric features are scaled.
          scaler = StandardScaler()
          customers_scaled = scaler.fit_transform(customers)

          # Apply K-means clustering
          # n_init is set explicitly because its default differs between
          # scikit-learn releases; pinning it keeps the result reproducible.
          kmeans = KMeans(n_clusters=n_segments, n_init=10, random_state=42)
          customer_segments = kmeans.fit_predict(customers_scaled)

          # Add cluster labels to dataframe
          customers['Segment'] = customer_segments

          print("\n=== CUSTOMER SEGMENTS ===")

          # Averages are computed on the unscaled columns of the dataframe.
          for segment in range(n_segments):
              segment_data = customers[customers['Segment'] == segment]
              print(f"\nSegment {segment}: {len(segment_data)} customers")
              print(f"  Avg Age: {segment_data['Age'].mean():.1f}")
              print(f"  Avg Income: ${segment_data['Income'].mean():,.0f}")
              print(f"  Avg Spending: {segment_data['SpendingScore'].mean():.1f}")

          print("\n=== OTHER APPLICATIONS ===")

          applications = {
              "Healthcare": "Patient risk stratification, disease subtypes",
              "Marketing": "Campaign targeting, product positioning",
              "Biology": "Gene expression analysis, species classification",
              "Image Processing": "Color quantization, object detection",
              "Social Networks": "Community detection, influence analysis",
              "Finance": "Portfolio optimization, fraud detection",
          }

          for domain, use_cases

🎯 K-Means Clustering

K-Means Clustering Curriculum

Introduction to Clustering

K-Means Algorithm Fundamentals

Distance Metrics and Similarity

Centroid Initialization

Choosing the Optimal K

K-Means Convergence

K-Means Limitations

K-Means Variants

Cluster Evaluation

Implementation and Applications

Unit 1: Introduction to Clustering

What is Clustering

Types of Clustering Algorithms

Clustering Applications