| """ |
| Meta-learner: trains models to predict algorithm performance from dataset meta-features. |
| Supports multi-output regression (predict SHD per algorithm) and ranking evaluation. |
| """ |
| import os |
| import numpy as np |
| import pandas as pd |
| import json |
| import logging |
| from collections import defaultdict |
|
|
| from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor |
| from sklearn.multioutput import MultiOutputRegressor |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict |
| from sklearn.metrics import mean_squared_error, mean_absolute_error |
| import joblib |
|
|
| from causal_selection.features.extractor import FEATURE_NAMES |
| from causal_selection.discovery.algorithms import ALGORITHM_POOL |
|
|
| logger = logging.getLogger(__name__) |
|
|
| ALGO_NAMES = list(ALGORITHM_POOL.keys()) |
| RESULTS_DIR = '/app/causal_selection/data/results' |
| MODEL_DIR = '/app/causal_selection/models' |


def load_meta_dataset(results_dir=RESULTS_DIR):
    """Load meta-dataset from CSV files."""
    X = pd.read_csv(os.path.join(results_dir, 'meta_features.csv'))
    Y_shd = pd.read_csv(os.path.join(results_dir, 'shd_matrix.csv'))
    Y_nshd = pd.read_csv(os.path.join(results_dir, 'normalized_shd_matrix.csv'))
    configs = pd.read_csv(os.path.join(results_dir, 'configs.csv'))
    return X, Y_shd, Y_nshd, configs
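
# Illustrative file layout (a sketch; the actual files come from the benchmark run):
#   meta_features.csv          -> (n_tasks, n_features), columns matching FEATURE_NAMES
#   shd_matrix.csv             -> (n_tasks, n_algorithms), columns matching ALGO_NAMES
#   normalized_shd_matrix.csv  -> same layout, with normalized SHD values
#   configs.csv                -> one row per task with 'network', 'n_samples', 'seed'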


def train_meta_learner(X, Y, model_type='rf', **model_kwargs):
    """Train a multi-output regression model.

    Args:
        X: feature matrix (n_tasks, n_features)
        Y: target matrix (n_tasks, n_algorithms) of SHD values
        model_type: 'rf' or 'gbm'

    Returns:
        (model, scaler): fitted MultiOutputRegressor and its StandardScaler
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    if model_type == 'rf':
        base = RandomForestRegressor(
            n_estimators=model_kwargs.get('n_estimators', 200),
            max_depth=model_kwargs.get('max_depth', None),
            min_samples_leaf=model_kwargs.get('min_samples_leaf', 2),
            random_state=42,
            n_jobs=-1,
        )
    elif model_type == 'gbm':
        base = GradientBoostingRegressor(
            n_estimators=model_kwargs.get('n_estimators', 200),
            max_depth=model_kwargs.get('max_depth', 5),
            learning_rate=model_kwargs.get('learning_rate', 0.1),
            min_samples_leaf=model_kwargs.get('min_samples_leaf', 3),
            random_state=42,
        )
    else:
        raise ValueError(f"Unknown model type: {model_type}")

    # One independent regressor per algorithm (one output per column of Y).
    model = MultiOutputRegressor(base)
    model.fit(X_scaled, Y)

    return model, scaler


def predict_top_k(model, scaler, X_new, k=3):
    """Predict top-k algorithms for new dataset(s).

    Args:
        model: trained multi-output model
        scaler: fitted StandardScaler
        X_new: feature matrix (n_new, n_features)
        k: number of top algorithms to return

    Returns:
        top_k_indices: (n_new, k) array of algorithm indices, sorted by
            predicted SHD ascending (best first)
        predicted_shd: (n_new, n_algorithms) full predicted SHD matrix
    """
    X_scaled = scaler.transform(X_new)
    predicted = model.predict(X_scaled)

    # Guard for predictors that return a 1-D array for a single output.
    if predicted.ndim == 1:
        predicted = predicted.reshape(1, -1)

    top_k_indices = np.argsort(predicted, axis=1)[:, :k]
    return top_k_indices, predicted
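
# Illustrative end-to-end usage (a minimal sketch; `X_new` is a stand-in for a
# feature matrix with the same columns the model was trained on):
#
#   model, scaler = train_meta_learner(X, Y_nshd, model_type='rf')
#   top_idx, pred_shd = predict_top_k(model, scaler, X_new, k=3)
#   recommended = [ALGO_NAMES[j] for j in top_idx[0]]  # best-first algorithm names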


def evaluate_lono_cv(X, Y, configs, model_type='rf', k=3, **model_kwargs):
    """Leave-One-Network-Out cross-validation.

    For each network, train on all other networks and test on that network.
    This tests generalization to truly unseen graph structures.

    Returns:
        results: dict with metrics per network and overall
    """
    networks = configs['network'].values
    unique_networks = sorted(configs['network'].unique())

    results = {
        'per_network': {},
        'all_predictions': [],
        'all_true': [],
        'all_configs': [],
    }

    scaler = StandardScaler()

    for test_net in unique_networks:
        test_mask = networks == test_net
        train_mask = ~test_mask

        if train_mask.sum() < 3:
            logger.warning(f"Skipping {test_net}: only {train_mask.sum()} training samples")
            continue

        X_train = X.values[train_mask]
        Y_train = Y.values[train_mask]
        X_test = X.values[test_mask]
        Y_test = Y.values[test_mask]

        # Fit the scaler on the training folds only, so no statistics leak
        # from the held-out network.
        scaler.fit(X_train)
        X_train_s = scaler.transform(X_train)
        X_test_s = scaler.transform(X_test)

        if model_type == 'rf':
            base = RandomForestRegressor(
                n_estimators=model_kwargs.get('n_estimators', 200),
                max_depth=model_kwargs.get('max_depth', None),
                min_samples_leaf=model_kwargs.get('min_samples_leaf', 2),
                random_state=42, n_jobs=-1,
            )
        elif model_type == 'gbm':
            base = GradientBoostingRegressor(
                n_estimators=model_kwargs.get('n_estimators', 200),
                max_depth=model_kwargs.get('max_depth', 5),
                learning_rate=model_kwargs.get('learning_rate', 0.1),
                min_samples_leaf=model_kwargs.get('min_samples_leaf', 3),
                random_state=42,
            )
        else:
            raise ValueError(f"Unknown model type: {model_type}")

        model = MultiOutputRegressor(base)
        model.fit(X_train_s, Y_train)

        Y_pred = model.predict(X_test_s)

        # Ranking metrics on the held-out network.
        net_metrics = _compute_ranking_metrics(Y_pred, Y_test, k=k)
        net_metrics['n_test'] = int(test_mask.sum())
        net_metrics['n_train'] = int(train_mask.sum())
        results['per_network'][test_net] = net_metrics

        results['all_predictions'].extend(Y_pred.tolist())
        results['all_true'].extend(Y_test.tolist())
        results['all_configs'].extend(
            configs[test_mask][['network', 'n_samples', 'seed']].to_dict('records')
        )

        logger.info(f" {test_net:15s}: top{k}_hit={net_metrics['top_k_hit_rate']:.3f} "
                    f"regret={net_metrics['mean_regret']:.2f} "
                    f"ndcg={net_metrics['ndcg_at_k']:.3f}")

    # Pool all held-out predictions for the overall metrics.
    all_pred = np.array(results['all_predictions'])
    all_true = np.array(results['all_true'])
    results['overall'] = _compute_ranking_metrics(all_pred, all_true, k=k)

    return results
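
# Example call (illustrative; mirrors the __main__ driver below):
#
#   results = evaluate_lono_cv(X, Y_nshd, configs, model_type='rf', k=3)
#   results['overall']['top_k_hit_rate']  # fraction of tasks where the truly
#                                         # best algorithm lands in the top-3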


def _compute_ranking_metrics(Y_pred, Y_true, k=3):
    """Compute ranking metrics for algorithm selection.

    Args:
        Y_pred: (n, n_algos) predicted SHD values
        Y_true: (n, n_algos) true SHD values
        k: top-k cutoff

    Returns:
        dict with top-k hit rate, top-k overlap rate, regret statistics,
        NDCG@k, and raw regression errors (MSE/MAE).
    """
    n = Y_pred.shape[0]

    top_k_hits = 0
    regrets = []
    ndcgs = []

    for i in range(n):
        true_ranking = np.argsort(Y_true[i])
        pred_ranking = np.argsort(Y_pred[i])

        true_best = true_ranking[0]
        pred_top_k = pred_ranking[:k]

        # Hit: the truly best algorithm appears among the predicted top-k.
        if true_best in pred_top_k:
            top_k_hits += 1

        # Regret: extra SHD incurred by running only the predicted top-k
        # and keeping the best of them, versus the oracle choice.
        oracle_shd = Y_true[i, true_best]
        selected_shds = [Y_true[i, j] for j in pred_top_k]
        best_selected_shd = min(selected_shds)
        regrets.append(best_selected_shd - oracle_shd)

        ndcgs.append(_ndcg_at_k(Y_true[i], Y_pred[i], k))

    # Overlap: average fraction of the true top-k recovered in the predicted top-k.
    top_k_overlap = 0
    for i in range(n):
        true_top_k = set(np.argsort(Y_true[i])[:k])
        pred_top_k = set(np.argsort(Y_pred[i])[:k])
        top_k_overlap += len(true_top_k & pred_top_k) / k

    return {
        'top_k_hit_rate': top_k_hits / n,
        'top_k_overlap_rate': top_k_overlap / n,
        'mean_regret': np.mean(regrets),
        'median_regret': np.median(regrets),
        'max_regret': np.max(regrets),
        'ndcg_at_k': np.mean(ndcgs),
        'mean_pred_mse': mean_squared_error(Y_true, Y_pred),
        'mean_pred_mae': mean_absolute_error(Y_true, Y_pred),
    }
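
# Worked example (hypothetical numbers): with true SHDs [5, 2, 9, 4] and a
# predicted top-3 of algorithms {0, 2, 3}, the oracle SHD is 2 (algorithm 1),
# the best selected SHD is 4 (algorithm 3), so regret = 4 - 2 = 2, and the
# top-3 hit misses because algorithm 1 was not selected.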


def _ndcg_at_k(true_scores, pred_scores, k):
    """Normalized Discounted Cumulative Gain at k.

    For algorithm selection, lower SHD is better, so SHD is converted to a
    relevance score (max_shd - shd) before the standard NDCG computation.
    """
    # Convert SHD to relevance: lower SHD -> higher relevance.
    max_shd = max(true_scores.max(), 1)
    relevance = max_shd - true_scores

    # Ranking induced by the predictions (ascending predicted SHD).
    pred_order = np.argsort(pred_scores)[:k]

    dcg = 0.0
    for rank, idx in enumerate(pred_order):
        dcg += relevance[idx] / np.log2(rank + 2)

    # Ideal ranking: descending true relevance.
    ideal_order = np.argsort(-relevance)[:k]
    idcg = 0.0
    for rank, idx in enumerate(ideal_order):
        idcg += relevance[idx] / np.log2(rank + 2)

    return dcg / idcg if idcg > 0 else 0.0
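
# For reference, the quantity computed above is
#   NDCG@k = DCG@k / IDCG@k,  DCG@k = sum_{r=0}^{k-1} rel[i_r] / log2(r + 2)
# where i_r is the algorithm at rank r of the predicted (resp. ideal) ordering
# and rel = max_shd - shd.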


def get_feature_importance(model, feature_names=FEATURE_NAMES, algo_names=ALGO_NAMES):
    """Extract per-algorithm and averaged feature importances from a trained model."""
    importances = {}
    for algo, estimator in zip(algo_names, model.estimators_):
        if hasattr(estimator, 'feature_importances_'):
            importances[algo] = dict(zip(feature_names, estimator.feature_importances_))

    # Average each feature's importance across the per-algorithm estimators.
    avg_importance = defaultdict(float)
    for algo, imp in importances.items():
        for feat, val in imp.items():
            avg_importance[feat] += val / len(importances)

    return dict(avg_importance), importances
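
# Illustrative use (same pattern as the __main__ driver below):
#
#   avg_imp, per_algo_imp = get_feature_importance(model)
#   top5 = sorted(avg_imp.items(), key=lambda kv: -kv[1])[:5]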


def save_model(model, scaler, model_dir=MODEL_DIR):
    """Save trained model and scaler."""
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, 'meta_learner.pkl'))
    joblib.dump(scaler, os.path.join(model_dir, 'scaler.pkl'))
    logger.info(f"Model saved to {model_dir}")


def load_model(model_dir=MODEL_DIR):
    """Load trained model and scaler."""
    model = joblib.load(os.path.join(model_dir, 'meta_learner.pkl'))
    scaler = joblib.load(os.path.join(model_dir, 'scaler.pkl'))
    return model, scaler
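
# Round-trip sketch (illustrative; `X_new` as in the predict_top_k sketch above):
#
#   save_model(model, scaler)
#   model, scaler = load_model()
#   top_idx, _ = predict_top_k(model, scaler, X_new, k=3)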


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

    X, Y_shd, Y_nshd, configs = load_meta_dataset()

    print(f"Meta-dataset: X={X.shape}, Y_shd={Y_shd.shape}")
    print(f"Networks: {configs['network'].unique()}")
    print("Configs per network:")
    print(configs['network'].value_counts().to_string())

    # Benchmark both model types under leave-one-network-out CV.
    print("\n" + "=" * 80)
    print("LEAVE-ONE-NETWORK-OUT CV (RandomForest)")
    print("=" * 80)

    results_rf = evaluate_lono_cv(X, Y_nshd, configs, model_type='rf', k=3)

    print("\nOverall Results (RF):")
    for name, value in results_rf['overall'].items():
        print(f" {name:25s}: {value:.4f}")

    print("\n" + "=" * 80)
    print("LEAVE-ONE-NETWORK-OUT CV (GradientBoosting)")
    print("=" * 80)

    results_gbm = evaluate_lono_cv(X, Y_nshd, configs, model_type='gbm', k=3)

    print("\nOverall Results (GBM):")
    for name, value in results_gbm['overall'].items():
        print(f" {name:25s}: {value:.4f}")

    # Retrain the better model type on the full meta-dataset and persist it.
    print("\n" + "=" * 80)
    print("TRAINING FINAL MODEL")
    print("=" * 80)

    best_type = ('rf' if results_rf['overall']['top_k_hit_rate']
                 >= results_gbm['overall']['top_k_hit_rate'] else 'gbm')
    print(f"Selected model type: {best_type}")

    model, scaler = train_meta_learner(X, Y_nshd, model_type=best_type)
    save_model(model, scaler)

    avg_imp, per_algo_imp = get_feature_importance(model)
    print("\nTop 10 Most Important Features:")
    for feat, imp in sorted(avg_imp.items(), key=lambda x: -x[1])[:10]:
        print(f" {feat:30s}: {imp:.4f}")

    # Persist the headline numbers alongside the benchmark results.
    with open(os.path.join(RESULTS_DIR, 'evaluation_results.json'), 'w') as f:
        json.dump({
            'rf': dict(results_rf['overall']),
            'gbm': dict(results_gbm['overall']),
            'feature_importance': avg_imp,
            'selected_model': best_type,
        }, f, indent=2)