Source code for SToG.datasets

"""Dataset loading utilities."""
import numpy as np
from sklearn.datasets import load_breast_cancer, load_wine, make_classification


class DatasetLoader:
    """Load and prepare datasets for benchmarking.

    Every method is a static method returning a plain dictionary with the
    same five keys:

    * ``'name'`` -- short display name of the dataset.
    * ``'X'`` -- feature matrix.
    * ``'y'`` -- target vector.
    * ``'n_important'`` -- hard-coded count of "important" features
      (presumably the number of features a selector is expected to keep;
      confirm against the benchmarking code that consumes it).
    * ``'description'`` -- one-line human-readable summary.
    """

    @staticmethod
    def load_breast_cancer():
        """Load the scikit-learn breast cancer dataset.

        Returns:
            Dictionary with dataset information (see the class docstring
            for the keys).
        """
        # NOTE: a bare name inside a method body is looked up in the module
        # namespace, not the class namespace, so this calls the function
        # imported from ``sklearn.datasets`` rather than recursing into this
        # static method of the same name.
        data = load_breast_cancer()
        return {
            'name': 'Breast Cancer',
            'X': data.data,
            'y': data.target,
            'n_important': 10,
            'description': 'Binary classification, 30 features'
        }

    @staticmethod
    def load_wine():
        """Load the scikit-learn wine dataset.

        Returns:
            Dictionary with dataset information (see the class docstring
            for the keys).
        """
        # Resolves to the module-level ``sklearn.datasets`` import, not to
        # this method (class scope is not searched from a function body).
        data = load_wine()
        return {
            'name': 'Wine',
            'X': data.data,
            'y': data.target,
            'n_important': 7,
            'description': '3-class classification, 13 features'
        }

    @staticmethod
    def create_synthetic_high_dim():
        """Create a synthetic high-dimensional dataset (MADELON-like).

        Generated with ``make_classification`` using a fixed
        ``random_state`` so repeated calls request the same data:
        600 samples, 100 features of which 5 are informative and 10 are
        redundant, two classes with two clusters each, and 3% label noise.

        Returns:
            Dictionary with dataset information (see the class docstring
            for the keys).
        """
        X, y = make_classification(
            n_samples=600,
            n_features=100,
            n_informative=5,
            n_redundant=10,
            n_repeated=0,
            n_classes=2,
            n_clusters_per_class=2,
            flip_y=0.03,
            class_sep=1.0,
            random_state=42
        )
        return {
            'name': 'Synthetic-HighDim',
            'X': X,
            'y': y,
            'n_important': 5,
            'description': 'Binary classification, 100 features, 5 informative'
        }

    @staticmethod
    def create_synthetic_correlated():
        """Create a synthetic dataset with correlated features.

        Column layout of ``X`` (500 rows, 50 columns):

        * columns 0-4: informative standard-normal features;
        * columns 5-14: two noisy copies of each informative feature
          (the feature plus Gaussian noise scaled by 0.1), grouped by
          source feature;
        * columns 15-49: independent standard-normal noise.

        The label is ``1`` where ``x0 + x1 * x2 > 0`` and ``0`` otherwise,
        with ``x0..x2`` the first three informative features.

        The data is drawn from a private ``RandomState`` seeded with 42, so
        the result is reproducible and the process-wide NumPy random state
        is left untouched.

        Returns:
            Dictionary with dataset information (see the class docstring
            for the keys).
        """
        # A dedicated generator instead of ``np.random.seed(42)``: it is
        # intended to yield the same legacy random stream without re-seeding
        # the global RNG that callers may be relying on.
        rng = np.random.RandomState(42)

        n_samples = 500
        n_informative = 5
        n_total = 50
        n_copies = 2  # noisy duplicates generated per informative feature

        X_inform = rng.randn(n_samples, n_informative)

        # Draw order matters for reproducibility: noise vectors are sampled
        # one at a time, feature by feature, copy by copy.
        redundant_columns = []
        for i in range(n_informative):
            for _ in range(n_copies):
                noise = rng.randn(n_samples) * 0.1
                redundant_columns.append(X_inform[:, i] + noise)
        X_redundant = np.column_stack(redundant_columns)

        # Whatever column budget remains is filled with pure noise features.
        n_noise = n_total - n_informative - X_redundant.shape[1]
        X_noise = rng.randn(n_samples, n_noise)

        X = np.column_stack([X_inform, X_redundant, X_noise])
        y = (X_inform[:, 0] + X_inform[:, 1] * X_inform[:, 2] > 0).astype(int)

        return {
            'name': 'Synthetic-Correlated',
            'X': X,
            'y': y,
            'n_important': n_informative,
            'description': f'Binary classification, {n_total} features, {n_informative} informative with correlated copies'
        }