Curse of Dimensionality
Learn how high-dimensional data creates unique challenges for machine learning algorithms.
Key concepts: High Dimensions, Sparsity, Distance Metrics
The curse of dimensionality refers to various phenomena that arise when analyzing data in high-dimensional spaces, where intuitions from low-dimensional space often don't apply.
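A concrete way to see how low-dimensional intuition breaks down is to ask how much of a hypercube's volume lies inside its inscribed hypersphere. In 2D the circle fills about 79% of the square; by 20 dimensions virtually all of the volume sits in the corners, outside the sphere. A minimal Monte Carlo sketch:

import numpy as np

# Estimate the fraction of the cube [-1, 1]^d that lies inside the unit ball
rng = np.random.default_rng(0)
n_points = 100_000
for d in [2, 3, 5, 10, 20]:
    points = rng.uniform(-1, 1, size=(n_points, d))
    inside = np.linalg.norm(points, axis=1) <= 1.0
    print(f"d={d:2d}: fraction inside inscribed sphere = {inside.mean():.4f}")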
Computational Complexity
Understand how computational requirements scale with the number of dimensions.
Storage: O(n × d) - Linear growth with dimensions
Distance Computation: O(d) per pair
Matrix Operations: O(d²) to O(d³) depending on algorithm
import numpy as np
import time
from scipy.spatial.distance import pdist

def demonstrate_computational_scaling():
    """Show how computation time scales with dimensions."""
    dimensions = [10, 50, 100, 500, 1000, 2000]
    times = []
    n_samples = 1000
    for d in dimensions:
        # Generate random data
        X = np.random.randn(n_samples, d)
        # Time covariance matrix computation (roughly O(n * d^2))
        start_time = time.time()
        cov_matrix = np.cov(X.T)
        end_time = time.time()
        times.append(end_time - start_time)
        print(f"Dimensions: {d:4d}, Time: {end_time - start_time:.4f}s, "
              f"Memory: {X.nbytes / 1024**2:.1f}MB")
    return dimensions, times

# Demonstrate scaling issues
print("=== COMPUTATIONAL SCALING WITH DIMENSIONS ===")
dims, computation_times = demonstrate_computational_scaling()

print("\n=== STORAGE REQUIREMENTS ===")
for d in [100, 1000, 10000]:
    n_samples = 10000
    storage_mb = (n_samples * d * 8) / (1024**2)  # 8 bytes per float64
    print(f"{d:5d} dimensions: {storage_mb:6.1f} MB")

print("\n=== DISTANCE COMPUTATION CHALLENGES ===")
# Demonstrate how pairwise distances concentrate in high dimensions:
# the std/mean ratio shrinks, so all points look roughly equidistant
for d in [2, 10, 100, 1000]:
    np.random.seed(42)
    X = np.random.randn(100, d)
    distances = pdist(X)  # pairwise Euclidean distances
    print(f"Dimensions: {d:4d}, Distance std/mean: {distances.std()/distances.mean():.3f}")
Visualization Challenges
Explore the difficulties of visualizing and understanding high-dimensional data.
Humans can only visualize up to 3 dimensions effectively. Higher dimensions require projection techniques or alternative visualization methods to gain insights.
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA

def visualization_challenge_demo():
    """Demonstrate visualization challenges with high-D data."""
    # Load high-dimensional data (64 dimensions)
    digits = load_digits()
    X, y = digits.data, digits.target
    print(f"Original data shape: {X.shape}")
    print(f"We have {X.shape[1]} dimensions to visualize!")
    # Can't visualize 64D directly - need dimensionality reduction
    pca = PCA(n_components=2)
    X_2d = pca.fit_transform(X)
    print(f"After PCA: {X_2d.shape}")
    print(f"Variance explained: {pca.explained_variance_ratio_.sum():.1%}")
    # Now we can visualize in 2D
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap='tab10')
    plt.colorbar(scatter)
    plt.title('64D Digits Data Projected to 2D using PCA')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.show()
    return X_2d
# Show the challenge
print("=== HIGH-DIMENSIONAL VISUALIZATION CHALLENGE ===")
projected_data = visualization_challenge_demo()

print("\n=== WHAT WE LOSE IN PROJECTION ===")
print("- Can't see all relationships between features")
print("- Some clusters might overlap in 2D projection")
print("- Information is compressed and potentially lost")
print("- Need to choose which dimensions to visualize")
Feature Redundancy
Learn how correlated features create redundancy in high-dimensional datasets.
Key concepts: Correlation, Redundancy, Information
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def demonstrate_feature_redundancy():
    """Show how features can be redundant/correlated."""
    np.random.seed(42)
    n_samples = 1000
    # Create correlated features
    feature1 = np.random.randn(n_samples)
    feature2 = feature1 + 0.1 * np.random.randn(n_samples)  # Highly correlated
    feature3 = 2 * feature1 + 0.05 * np.random.randn(n_samples)  # Linear combination
    feature4 = np.random.randn(n_samples)  # Independent
    feature5 = -feature1 + 0.1 * np.random.randn(n_samples)  # Negatively correlated
    # Create DataFrame and inspect the correlation structure
    df = pd.DataFrame({
        'feature1': feature1, 'feature2': feature2, 'feature3': feature3,
        'feature4': feature4, 'feature5': feature5,
    })
    corr = df.corr()
    print("Correlation matrix:")
    print(corr.round(2))
    # Visualize the redundancy as a heatmap
    plt.figure(figsize=(6, 5))
    sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Feature Correlation Matrix')
    plt.show()
    return df
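A quick way to confirm the redundancy (assuming the function above returns its DataFrame, as written here) is to run PCA on the five features: although the data is nominally 5-dimensional, almost all of its variance should fall along two directions, the shared feature1 axis and the independent feature4.

import numpy as np
from sklearn.decomposition import PCA

# PCA reveals the effective dimensionality of the redundant feature set
df = demonstrate_feature_redundancy()
pca = PCA().fit(df.values)
print("Explained variance per component:",
      np.round(pca.explained_variance_ratio_, 3))
# Expect roughly two dominant components; the remaining three capture
# only the small noise terms added to the copies of feature1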