Tutorial
In-Depth Feature Selection Tutorial
This tutorial demonstrates feature selection on a synthetic high-dimensional dataset.
Problem Setup
We have a classification task with:
1000 samples
100 features (5 informative, 10 redundant linear combinations of the informative ones, 85 pure noise)
Binary classification problem
Goal: identify the 5 important features
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from SToG import STGLayer, FeatureSelectionTrainer, create_classification_model
# Create synthetic dataset
np.random.seed(42)
torch.manual_seed(42)

# NOTE: make_classification shuffles samples and feature columns by default
# (shuffle=True), so the informative features are not guaranteed to be the
# first five columns of X; pass shuffle=False to keep them in the leading
# columns.
X, y = make_classification(
    n_samples=1000,
    n_features=100,
    n_informative=5,
    n_redundant=10,
    n_repeated=0,
    random_state=42,
)

# Split data: 60% train, 20% val, 20% test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Standardize features. The scaler is fitted on the training split only and
# then applied unchanged to the validation and test splits, so statistics of
# the held-out data never leak into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Convert to tensors (float32 features, int64 class labels)
X_train = torch.FloatTensor(X_train)
y_train = torch.LongTensor(y_train)
X_val = torch.FloatTensor(X_val)
y_val = torch.LongTensor(y_val)
X_test = torch.FloatTensor(X_test)
y_test = torch.LongTensor(y_test)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
Step 1: Creating Components
# Classifier network: 100 inputs, 2 output classes, hidden width 64
model = create_classification_model(input_dim=100, num_classes=2, hidden_dim=64)

# STG feature selector over the same 100 inputs (sigma=0.5)
selector = STGLayer(input_dim=100, sigma=0.5)

# Trainer tying model and selector together; lambda_reg=0.05 is the
# regularization strength and training runs on the CPU
trainer = FeatureSelectionTrainer(
    model=model,
    selector=selector,
    criterion=nn.CrossEntropyLoss(),
    lambda_reg=0.05,
    device='cpu',
)
Step 2: Training
# Training schedule: at most 300 epochs, early-stopping patience of 50,
# with progress output enabled
fit_options = {'epochs': 300, 'patience': 50, 'verbose': True}

# Fit on the training split while monitoring the validation split
history = trainer.fit(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    **fit_options,
)
Expected output:
Epoch 50: val_acc=92.50%, sel=47, λ=0.0500
Epoch 100: val_acc=94.00%, sel=32, λ=0.0500
Epoch 150: val_acc=95.00%, sel=18, λ=0.0500
Epoch 200: val_acc=95.50%, sel=12, λ=0.0500
Epoch 250: val_acc=95.50%, sel=10, λ=0.0500
Early stopping at epoch 283
Step 3: Analyzing Results
# Score the trained model on the held-out test split
result = trainer.evaluate(X_test, y_test)

# Headline numbers: accuracy, how many of the 100 features were kept,
# and the resulting sparsity
print(f"Test Accuracy: {result['test_acc']:.2f}%")
print(f"Selected Features: {result['selected_count']} / 100")
print(f"Sparsity: {1 - result['selected_count']/100:.1%}")

# Turn the selection mask into the positions of the kept features
# (first entry of the tuple returned by np.where)
selected_mask = result['selected_features']
selected_indices, *_ = np.where(selected_mask)
print(f"\nSelected feature indices: {selected_indices}")
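Expected output (illustrative; `make_classification` shuffles feature columns by default, so the indices you see may differ):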
Test Accuracy: 95.50%
Selected Features: 10 / 100
Sparsity: 90.0%
Selected feature indices: [ 0 1 2 3 4 12 34 56 78 91]
Step 4: Visualizing Training History
import matplotlib.pyplot as plt
# One row of three panels: accuracy, selected-feature count, regularization loss
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
acc_ax, count_ax, reg_ax = axes

# Panel 1: validation accuracy recorded in the training history
acc_ax.plot(history['val_acc'], label='Validation Accuracy')

# Panel 2: number of selected features, with a dashed reference line at the
# 5 informative features the dataset was generated with
count_ax.plot(history['sel_count'], label='Selected Features', color='orange')
count_ax.axhline(y=5, color='r', linestyle='--', label='True Informative (5)')

# Panel 3: regularization loss recorded in the training history
reg_ax.plot(history['reg_loss'], label='Regularization Loss', color='green')

# Decoration shared by all panels: epoch x-axis, per-panel y-label and
# title, a faint grid and a legend
panel_text = (
    (acc_ax, 'Accuracy (%)', 'Validation Accuracy over Time'),
    (count_ax, 'Number of Features', 'Feature Selection Progress'),
    (reg_ax, 'Loss', 'Regularization Loss over Time'),
)
for ax, y_label, title in panel_text:
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    ax.legend()

# Save to disk before showing the figure
plt.tight_layout()
plt.savefig('stg_training_history.png', dpi=300, bbox_inches='tight')
plt.show()
Comparing Methods
from SToG import STELayer, GumbelLayer, L1Layer
# Selector classes to compare, each paired with its constructor kwargs
methods = {
    'STG': (STGLayer, {'sigma': 0.5}),
    'STE': (STELayer, {}),
    'Gumbel': (GumbelLayer, {'temperature': 1.0}),
    'L1': (L1Layer, {}),
}

# Single source of truth for the feature count (100 in this tutorial); it
# sizes the model and selector and is the denominator of the sparsity column.
n_features = X_train.shape[1]

comparison_results = {}
for method_name, (SelectorClass, kwargs) in methods.items():
    # Fresh model and selector for every method
    model = create_classification_model(n_features, 2)
    selector = SelectorClass(input_dim=n_features, **kwargs)
    trainer = FeatureSelectionTrainer(
        model=model,
        selector=selector,
        criterion=nn.CrossEntropyLoss(),
        lambda_reg=0.05,
    )
    trainer.fit(X_train, y_train, X_val, y_val, epochs=300, verbose=False)
    comparison_results[method_name] = trainer.evaluate(X_test, y_test)

# Display comparison table. Header and rows use the same column widths and
# alignment, and the rule is derived from the header so it always matches.
header = f"{'Method':<15} {'Accuracy':>12} {'Selected':>12} {'Sparsity':>12}"
print(f"\n{header}")
print('-' * len(header))
for row_name, method_result in comparison_results.items():
    # Distinct loop names so the `result` from the earlier evaluation step
    # is not overwritten by this loop.
    accuracy_text = f"{method_result['test_acc']:.2f}%"
    sparsity = 1 - method_result['selected_count'] / n_features
    print(
        f"{row_name:<15} {accuracy_text:>12} "
        f"{method_result['selected_count']:>12} {sparsity:>12.1%}"
    )
Advanced: Lambda Search
Automatic search for optimal sparsity-accuracy trade-off:
# Ten regularization strengths, log-spaced between 1e-3 and 10**-0.5
lambdas = np.logspace(-3, -0.5, 10)
results_by_lambda = {}      # test-set metrics per lambda (reporting only)
val_results_by_lambda = {}  # validation metrics per lambda (used for selection)

for lam in lambdas:
    # Fresh model, selector and trainer for every candidate lambda
    model = create_classification_model(100, 2)
    selector = STGLayer(input_dim=100, sigma=0.5)
    trainer = FeatureSelectionTrainer(
        model=model,
        selector=selector,
        criterion=nn.CrossEntropyLoss(),
        lambda_reg=lam,
    )
    trainer.fit(X_train, y_train, X_val, y_val, epochs=300, verbose=False)

    # NOTE(review): assumes evaluate() accepts any split and reports its
    # accuracy under 'test_acc' regardless of the data passed - confirm.
    val_results_by_lambda[lam] = trainer.evaluate(X_val, y_val)
    result = trainer.evaluate(X_test, y_test)
    results_by_lambda[lam] = result

# Find best lambda by accuracy-sparsity balance. The choice is made on the
# validation metrics so the test set stays untouched by model selection;
# 5 is the number of informative features the dataset was generated with.
best_lambda = max(
    val_results_by_lambda,
    key=lambda lam: (
        val_results_by_lambda[lam]['test_acc']
        - 0.5 * abs(val_results_by_lambda[lam]['selected_count'] - 5)
    ),
)

# Report held-out test metrics for the selected lambda
print(f"Best lambda: {best_lambda:.4f}")
print(f"Accuracy: {results_by_lambda[best_lambda]['test_acc']:.2f}%")
print(f"Selected: {results_by_lambda[best_lambda]['selected_count']}")
Key Insights
Convergence speed varies by method:
- STE converges fastest but may over-select.
- STG provides a good balance.
- Gumbel-Softmax requires temperature annealing.
Lambda selection is critical:
- Too small: selects all features.
- Too large: selects too few features.
- Optimal: balances accuracy and sparsity.
Feature correlation matters:
- Methods that gate each feature independently (STG, STE) may select every copy of a correlated feature.
- Use CorrelatedSTG for correlated feature sets.
- Preprocessing (e.g. PCA) can reduce correlation.
Early stopping improves generalization:
- Prevents overfitting to the training data.
- Saves the best model by validation metric.
- Patience parameter: use a larger value for noisier data.