"""
Resume benchmark from partial results, then run medium and large networks too.
Optimized for CPU: reduced timeouts, skip heavy combos.
"""
import logging
import os
import sys
import warnings

import pandas as pd
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)
# Suppress the verbose BOSS/GRaSP output
logging.getLogger('causallearn').setLevel(logging.WARNING)
sys.path.insert(0, '/app')
from causal_selection.data.generator import (
    load_bn_model, get_true_dag_adjmat, dag_to_cpdag, sample_dataset,
    SMALL_NETWORKS, MEDIUM_NETWORKS, LARGE_NETWORKS, get_network_tier,
)
from causal_selection.discovery.algorithms import run_algorithm, ALGORITHM_POOL
from causal_selection.discovery.evaluator import evaluate_algorithm_result
from causal_selection.features.extractor import extract_all_features, FEATURE_NAMES
ALGO_NAMES = list(ALGORITHM_POOL.keys())
RESULTS_DIR = '/app/causal_selection/data/results'
# More aggressive sample sizes: fewer points, but still covering the range
SAMPLE_SIZES_FAST = {
    'small': [500, 1000, 2000, 5000],
    'medium': [500, 1000, 2000],
    'large': [500, 1000],
}
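# Tiers come from get_network_tier() below; larger networks get fewer and
# smaller sample sizes to keep CPU runtime bounded.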
# Per-algorithm timeout in seconds, by network tier
ALGO_TIMEOUT = {
    'PC_discrete': {'small': 30, 'medium': 120, 'large': 300},
    'FCI': {'small': 30, 'medium': 120, 'large': 300},
    'GES': {'small': 30, 'medium': 120, 'large': 300},
    'BOSS': {'small': 30, 'medium': 120, 'large': 300},
    'GRaSP': {'small': 30, 'medium': 120, 'large': 300},
    'HC': {'small': 30, 'medium': 60, 'large': 120},
    'Tabu': {'small': 30, 'medium': 60, 'large': 120},
    'MMHC': {'small': 30, 'medium': 60, 'large': 120},
    'K2': {'small': 20, 'medium': 30, 'large': 60},
}
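# Each timeout is passed to run_algorithm(..., timeout_sec=...) in the main
# loop; a run that exceeds it is expected to come back with a non-'success'
# status (see the status check there), and the remaining algorithms still run
# for that config.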
SEEDS = 2 # Reduced from 3 to speed up

def load_existing_results():
    """Load existing partial results to avoid re-running."""
    existing = set()
    partial_path = os.path.join(RESULTS_DIR, 'configs_partial.csv')
    final_path = os.path.join(RESULTS_DIR, 'configs.csv')
    for path in [partial_path, final_path]:
        if os.path.exists(path):
            df = pd.read_csv(path)
            for _, row in df.iterrows():
                key = (row['network'], int(row['n_samples']), int(row['seed']))
                existing.add(key)
    return existing

def run_benchmark():
    """Run the full benchmark with resume capability."""
    existing = load_existing_results()
    logger.info(f"Found {len(existing)} existing configs")
    # Load existing partial data. All four tables are written together by
    # _save_results, so their rows stay aligned across a resume.
    all_features = []
    all_shd = []
    all_nshd = []
    all_configs = []
    for prefix in ['meta_features_partial', 'meta_features']:
        path = os.path.join(RESULTS_DIR, f'{prefix}.csv')
        if os.path.exists(path):
            df = pd.read_csv(path)
            all_features = df.to_dict('records')
            break
    for prefix in ['shd_matrix_partial', 'shd_matrix']:
        path = os.path.join(RESULTS_DIR, f'{prefix}.csv')
        if os.path.exists(path):
            df = pd.read_csv(path)
            all_shd = df.to_dict('records')
            break
    # Note: _save_results writes 'normalized_shd_matrix{suffix}.csv', so the
    # partial prefix must match that name.
    for prefix in ['normalized_shd_matrix_partial', 'normalized_shd_matrix']:
        path = os.path.join(RESULTS_DIR, f'{prefix}.csv')
        if os.path.exists(path):
            df = pd.read_csv(path)
            all_nshd = df.to_dict('records')
            break
    for prefix in ['configs_partial', 'configs']:
        path = os.path.join(RESULTS_DIR, f'{prefix}.csv')
        if os.path.exists(path):
            df = pd.read_csv(path)
            all_configs = df.to_dict('records')
            break
logger.info(f"Starting with {len(all_configs)} existing results")
# Generate all configs to run
all_networks = SMALL_NETWORKS + MEDIUM_NETWORKS + LARGE_NETWORKS
configs_to_run = []
for net in all_networks:
tier = get_network_tier(net)
for n_samples in SAMPLE_SIZES_FAST[tier]:
for seed in range(SEEDS):
key = (net, n_samples, seed)
if key not in existing:
configs_to_run.append((net, n_samples, seed, tier))
logger.info(f"Configs to run: {len(configs_to_run)}")
total = len(configs_to_run)
for idx, (network, n_samples, seed, tier) in enumerate(configs_to_run):
logger.info(f"\n[{idx+1}/{total}] {network} N={n_samples} seed={seed}")
try:
# Load network
model = load_bn_model(network)
true_dag, node_names = get_true_dag_adjmat(model)
true_cpdag = dag_to_cpdag(true_dag)
# Sample data
df = sample_dataset(model, n_samples, seed=seed)
# Extract features
features = extract_all_features(df, n_probe_triplets=80)
# Run algorithms with per-algo timeout
algo_metrics = {}
for algo_name in ALGO_NAMES:
timeout = ALGO_TIMEOUT[algo_name][tier]
result = run_algorithm(algo_name, df, timeout_sec=timeout)
metrics = evaluate_algorithm_result(result, true_cpdag)
algo_metrics[algo_name] = metrics
s = metrics['status']
if s == 'success':
logger.info(f" {algo_name:12s}: SHD={metrics['shd']:3d} F1={metrics['skeleton_f1']:.3f} t={metrics['runtime']:.1f}s")
else:
logger.info(f" {algo_name:12s}: {s} t={metrics['runtime']:.1f}s")
# Store results
feat_row = {name: features.get(name, 0.0) for name in FEATURE_NAMES}
all_features.append(feat_row)
shd_row = {algo: algo_metrics[algo]['shd'] for algo in ALGO_NAMES}
nshd_row = {algo: algo_metrics[algo]['normalized_shd'] for algo in ALGO_NAMES}
all_shd.append(shd_row)
all_nshd.append(nshd_row)
config = {
'network': network,
'n_samples': n_samples,
'seed': seed,
'n_variables': len(node_names),
'n_true_edges': int(((true_cpdag + true_cpdag.T) > 0).sum() // 2),
}
all_configs.append(config)
# Save periodically
if (idx + 1) % 3 == 0:
_save_results(all_features, all_shd, all_nshd, all_configs, partial=True)
except Exception as e:
logger.error(f"FAILED {network} N={n_samples} seed={seed}: {e}")
import traceback
traceback.print_exc()
    # Final save
    _save_results(all_features, all_shd, all_nshd, all_configs, partial=False)
    # Print summary
    Y_shd = pd.DataFrame(all_shd)
    configs_df = pd.DataFrame(all_configs)
    print("\n" + "=" * 80)
    print("BENCHMARK COMPLETE")
    print("=" * 80)
    print(f"Total configs: {len(all_configs)}")
    print(f"Networks: {configs_df['network'].unique()}")
    print("\nMean SHD per algorithm:")
    print(Y_shd.mean().sort_values())
    print("\nBest algorithm count:")
    print(Y_shd.idxmin(axis=1).value_counts())

def _save_results(features, shds, nshds, configs, partial=True):
    """Write all four result tables to CSV, suffixed '_partial' while in progress."""
    os.makedirs(RESULTS_DIR, exist_ok=True)
    suffix = '_partial' if partial else ''
    pd.DataFrame(features).to_csv(os.path.join(RESULTS_DIR, f'meta_features{suffix}.csv'), index=False)
    pd.DataFrame(shds).to_csv(os.path.join(RESULTS_DIR, f'shd_matrix{suffix}.csv'), index=False)
    pd.DataFrame(nshds).to_csv(os.path.join(RESULTS_DIR, f'normalized_shd_matrix{suffix}.csv'), index=False)
    pd.DataFrame(configs).to_csv(os.path.join(RESULTS_DIR, f'configs{suffix}.csv'), index=False)

if __name__ == '__main__':
    run_benchmark()
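
# Example (sketch, not executed): loading the final outputs for downstream
# analysis, assuming a completed run has produced the non-partial CSVs.
#
#   import pandas as pd
#   X = pd.read_csv('/app/causal_selection/data/results/meta_features.csv')
#   Y = pd.read_csv('/app/causal_selection/data/results/shd_matrix.csv')
#   best_algo = Y.idxmin(axis=1)  # per-config winner by SHD, as in the summary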