"""
Resume benchmark from partial results, then run medium and large networks too.
Optimized for CPU: reduced timeouts, skip heavy combos.
"""
import logging
import os
import sys
import warnings

import pandas as pd
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)
# Suppress the verbose BOSS/GRaSP output
logging.getLogger('causallearn').setLevel(logging.WARNING)
sys.path.insert(0, '/app')
from causal_selection.data.generator import (
    load_bn_model, get_true_dag_adjmat, dag_to_cpdag, sample_dataset,
    SMALL_NETWORKS, MEDIUM_NETWORKS, LARGE_NETWORKS, get_network_tier,
)
from causal_selection.discovery.algorithms import run_algorithm, ALGORITHM_POOL
from causal_selection.discovery.evaluator import evaluate_algorithm_result
from causal_selection.features.extractor import extract_all_features, FEATURE_NAMES
ALGO_NAMES = list(ALGORITHM_POOL.keys())
RESULTS_DIR = '/app/causal_selection/data/results'
# More aggressive sample sizes: fewer points, but still covering the range
SAMPLE_SIZES_FAST = {
    'small': [500, 1000, 2000, 5000],
    'medium': [500, 1000, 2000],
    'large': [500, 1000],
}
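# Tiers come from get_network_tier() below; larger networks get fewer and
# smaller sample sizes to keep CPU runtime bounded.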
# Per-algorithm timeout in seconds, by network tier
ALGO_TIMEOUT = {
    'PC_discrete': {'small': 30, 'medium': 120, 'large': 300},
    'FCI': {'small': 30, 'medium': 120, 'large': 300},
    'GES': {'small': 30, 'medium': 120, 'large': 300},
    'BOSS': {'small': 30, 'medium': 120, 'large': 300},
    'GRaSP': {'small': 30, 'medium': 120, 'large': 300},
    'HC': {'small': 30, 'medium': 60, 'large': 120},
    'Tabu': {'small': 30, 'medium': 60, 'large': 120},
    'MMHC': {'small': 30, 'medium': 60, 'large': 120},
    'K2': {'small': 20, 'medium': 30, 'large': 60},
}
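# Each timeout is passed to run_algorithm(..., timeout_sec=...) in the main
# loop; a run that exceeds it is expected to come back with a non-'success'
# status (see the status check there), and the remaining algorithms still run
# for that config.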
SEEDS = 2 # Reduced from 3 to speed up

def load_existing_results():
    """Load existing partial results to avoid re-running."""
    existing = set()
    partial_path = os.path.join(RESULTS_DIR, 'configs_partial.csv')
    final_path = os.path.join(RESULTS_DIR, 'configs.csv')
    for path in [partial_path, final_path]:
        if os.path.exists(path):
            df = pd.read_csv(path)
            for _, row in df.iterrows():
                key = (row['network'], int(row['n_samples']), int(row['seed']))
                existing.add(key)
    return existing

def run_benchmark():
    """Run the full benchmark with resume capability."""
    existing = load_existing_results()
    logger.info(f"Found {len(existing)} existing configs")
    # Load existing partial data. All four tables are written together by
    # _save_results, so their rows stay aligned across a resume.
    all_features = []
    all_shd = []
    all_nshd = []
    all_configs = []
    for prefix in ['meta_features_partial', 'meta_features']:
        path = os.path.join(RESULTS_DIR, f'{prefix}.csv')
        if os.path.exists(path):
            df = pd.read_csv(path)
            all_features = df.to_dict('records')
            break
    for prefix in ['shd_matrix_partial', 'shd_matrix']:
        path = os.path.join(RESULTS_DIR, f'{prefix}.csv')
        if os.path.exists(path):
            df = pd.read_csv(path)
            all_shd = df.to_dict('records')
            break
    # Note: _save_results writes 'normalized_shd_matrix{suffix}.csv', so the
    # partial prefix must match that name.
    for prefix in ['normalized_shd_matrix_partial', 'normalized_shd_matrix']:
        path = os.path.join(RESULTS_DIR, f'{prefix}.csv')
        if os.path.exists(path):
            df = pd.read_csv(path)
            all_nshd = df.to_dict('records')
            break
    for prefix in ['configs_partial', 'configs']:
        path = os.path.join(RESULTS_DIR, f'{prefix}.csv')
        if os.path.exists(path):
            df = pd.read_csv(path)
            all_configs = df.to_dict('records')
            break
logger.info(f"Starting with {len(all_configs)} existing results")
# Generate all configs to run
all_networks = SMALL_NETWORKS + MEDIUM_NETWORKS + LARGE_NETWORKS
configs_to_run = []
for net in all_networks:
tier = get_network_tier(net)
for n_samples in SAMPLE_SIZES_FAST[tier]:
for seed in range(SEEDS):
key = (net, n_samples, seed)
if key not in existing:
configs_to_run.append((net, n_samples, seed, tier))
logger.info(f"Configs to run: {len(configs_to_run)}")
total = len(configs_to_run)
for idx, (network, n_samples, seed, tier) in enumerate(configs_to_run):
logger.info(f"\n[{idx+1}/{total}] {network} N={n_samples} seed={seed}")
try:
# Load network
model = load_bn_model(network)
true_dag, node_names = get_true_dag_adjmat(model)
true_cpdag = dag_to_cpdag(true_dag)
# Sample data
df = sample_dataset(model, n_samples, seed=seed)
# Extract features
features = extract_all_features(df, n_probe_triplets=80)
# Run algorithms with per-algo timeout
algo_metrics = {}
for algo_name in ALGO_NAMES:
timeout = ALGO_TIMEOUT[algo_name][tier]
result = run_algorithm(algo_name, df, timeout_sec=timeout)
metrics = evaluate_algorithm_result(result, true_cpdag)
algo_metrics[algo_name] = metrics
s = metrics['status']
if s == 'success':
logger.info(f" {algo_name:12s}: SHD={metrics['shd']:3d} F1={metrics['skeleton_f1']:.3f} t={metrics['runtime']:.1f}s")
else:
logger.info(f" {algo_name:12s}: {s} t={metrics['runtime']:.1f}s")
# Store results
feat_row = {name: features.get(name, 0.0) for name in FEATURE_NAMES}
all_features.append(feat_row)
shd_row = {algo: algo_metrics[algo]['shd'] for algo in ALGO_NAMES}
nshd_row = {algo: algo_metrics[algo]['normalized_shd'] for algo in ALGO_NAMES}
all_shd.append(shd_row)
all_nshd.append(nshd_row)
config = {
'network': network,
'n_samples': n_samples,
'seed': seed,
'n_variables': len(node_names),
'n_true_edges': int(((true_cpdag + true_cpdag.T) > 0).sum() // 2),
}
all_configs.append(config)
# Save periodically
if (idx + 1) % 3 == 0:
_save_results(all_features, all_shd, all_nshd, all_configs, partial=True)
except Exception as e:
logger.error(f"FAILED {network} N={n_samples} seed={seed}: {e}")
import traceback
traceback.print_exc()
    # Final save
    _save_results(all_features, all_shd, all_nshd, all_configs, partial=False)
    # Print summary
    Y_shd = pd.DataFrame(all_shd)
    configs_df = pd.DataFrame(all_configs)
    print("\n" + "=" * 80)
    print("BENCHMARK COMPLETE")
    print("=" * 80)
    print(f"Total configs: {len(all_configs)}")
    print(f"Networks: {configs_df['network'].unique()}")
    print("\nMean SHD per algorithm:")
    print(Y_shd.mean().sort_values())
    print("\nBest algorithm count:")
    print(Y_shd.idxmin(axis=1).value_counts())

def _save_results(features, shds, nshds, configs, partial=True):
    """Write all four result tables to CSV, suffixed '_partial' while in progress."""
    os.makedirs(RESULTS_DIR, exist_ok=True)
    suffix = '_partial' if partial else ''
    pd.DataFrame(features).to_csv(os.path.join(RESULTS_DIR, f'meta_features{suffix}.csv'), index=False)
    pd.DataFrame(shds).to_csv(os.path.join(RESULTS_DIR, f'shd_matrix{suffix}.csv'), index=False)
    pd.DataFrame(nshds).to_csv(os.path.join(RESULTS_DIR, f'normalized_shd_matrix{suffix}.csv'), index=False)
    pd.DataFrame(configs).to_csv(os.path.join(RESULTS_DIR, f'configs{suffix}.csv'), index=False)

if __name__ == '__main__':
    run_benchmark()
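
# Example (sketch, not executed): loading the final outputs for downstream
# analysis, assuming a completed run has produced the non-partial CSVs.
#
#   import pandas as pd
#   X = pd.read_csv('/app/causal_selection/data/results/meta_features.csv')
#   Y = pd.read_csv('/app/causal_selection/data/results/shd_matrix.csv')
#   best_algo = Y.idxmin(axis=1)  # per-config winner by SHD, as in the summary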