Why Evaluate Models
Learn the fundamental reasons why proper model evaluation is critical for machine learning success.
Performance
Reliability
Generalization
Model evaluation helps us understand how well our models will perform on unseen data, compare different algorithms, and make informed decisions about model deployment.
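As a quick illustration of comparing algorithms on held-out data, here is a minimal sketch using scikit-learn's cross_val_score; the dataset and the two model choices are placeholders for illustration, not part of this lesson's examples:

# Minimal sketch: comparing two algorithms by cross-validated accuracy
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, n_features=20, random_state=0)

for name, model in [('Logistic Regression', LogisticRegression(max_iter=1000)),
                    ('Decision Tree', DecisionTreeClassifier(random_state=0))]:
    # Mean accuracy over 5 folds estimates performance on unseen data
    scores = cross_val_score(model, X, y, cv=5)
    print(f"{name}: {scores.mean():.3f} ± {scores.std():.3f}")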
Training vs Testing Performance
Understand the critical difference between training and testing performance.
Training Performance: How well the model fits the training data
Testing Performance: How well the model generalizes to new, unseen data
Gap Analysis: A large gap between training and testing performance indicates overfitting
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification

def demonstrate_train_test_performance():
    """Show difference between training and testing performance"""
    # Generate sample data
    X, y = make_classification(n_samples=1000, n_features=20,
                               n_informative=10, random_state=42)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    print("=== MODEL PERFORMANCE COMPARISON ===")

    models = {
        'Logistic Regression': LogisticRegression(random_state=42),
        'Decision Tree (Shallow)': DecisionTreeClassifier(max_depth=3, random_state=42),
        'Decision Tree (Deep)': DecisionTreeClassifier(max_depth=20, random_state=42)
    }

    for name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Get predictions
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        # Calculate accuracies
        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)
        gap = train_acc - test_acc

        print(f"\n{name}:")
        print(f"  Training Accuracy: {train_acc:.3f}")
        print(f"  Testing Accuracy:  {test_acc:.3f}")
        print(f"  Gap (Overfitting): {gap:.3f}")

        if gap > 0.05:
            print("  ⚠️ Potential overfitting detected!")
        elif gap < 0:
            print("  ⚠️ Unusual: test > train (check for data leakage)")
        else:
            print("  ✅ Good generalization")

demonstrate_train_test_performance()

print("\n=== KEY INSIGHTS ===")
print("1. Training accuracy is often higher than test accuracy")
print("2. Large gaps indicate overfitting")
print("3. Test accuracy is more reliable for real-world performance")
print("4. Use validation set for model selection, test set for final evaluation")
Overfitting and Underfitting
Learn to identify and diagnose overfitting and underfitting through evaluation metrics.
Overfitting: Model learns training data too well, poor generalization
Underfitting: Model is too simple to capture underlying patterns
Sweet Spot: Balanced model that generalizes well to new data
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

def demonstrate_fitting_behavior():
    """Demonstrate overfitting/underfitting with polynomial regression"""
    # Generate synthetic data
    np.random.seed(42)
    X = np.linspace(0, 1, 100).reshape(-1, 1)
    y = 1.5 * X.ravel() + 0.5 * np.sin(15 * X.ravel()) + 0.1 * np.random.randn(100)

    # Split data
    train_size = 70
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    print("===