| """ |
| Data generation module: load bnlearn networks, sample datasets, extract ground truth. |
| """ |
| import os |
| import numpy as np |
| import pandas as pd |
| from pgmpy.readwrite import BIFReader |
| from pgmpy.sampling import BayesianModelSampling |
| import warnings |
| import logging |
|
|
warnings.filterwarnings('ignore')  # silence pgmpy/pandas chatter during bulk sampling
logger = logging.getLogger(__name__)

# Directory holding the .bif network definition files, next to this module.
BIF_DIR = os.path.join(os.path.dirname(__file__), 'bif_files')

# Benchmark networks grouped by size tier (bnlearn repository names).
SMALL_NETWORKS = ['asia', 'cancer', 'earthquake', 'sachs', 'survey']
MEDIUM_NETWORKS = ['alarm', 'barley', 'child', 'insurance', 'mildew', 'water']
LARGE_NETWORKS = ['hailfinder', 'hepar2', 'win95pts']

ALL_NETWORKS = SMALL_NETWORKS + MEDIUM_NETWORKS + LARGE_NETWORKS

# Sample sizes to draw per tier: larger networks get fewer and smaller
# datasets to keep total generation time bounded.
SAMPLE_SIZES = {
    'small': [250, 500, 1000, 2000, 5000, 10000],
    'medium': [500, 1000, 2000, 5000],
    'large': [500, 1000, 2000],
}

# Number of independent sampling seeds per (network, sample-size) config.
SEEDS_PER_CONFIG = 3


def get_network_tier(name):
    """Return the size tier ('small', 'medium' or 'large') of a network.

    Raises:
        ValueError: if `name` is not a known benchmark network.  (The
            previous implementation silently mapped any unknown name to
            'large', which hid typos until the BIF file lookup failed
            much later in `load_bn_model`.)
    """
    if name in SMALL_NETWORKS:
        return 'small'
    if name in MEDIUM_NETWORKS:
        return 'medium'
    if name in LARGE_NETWORKS:
        return 'large'
    raise ValueError(f"Unknown network: {name!r} (expected one of {ALL_NETWORKS})")
|
|
|
|
def load_bn_model(name):
    """Read the named Bayesian network from its BIF file and return a pgmpy model.

    Args:
        name: network identifier; resolved to ``<name>.bif`` under BIF_DIR.

    Raises:
        FileNotFoundError: if the BIF file does not exist.
    """
    path = os.path.join(BIF_DIR, f'{name}.bif')
    if not os.path.exists(path):
        raise FileNotFoundError(f"BIF file not found: {path}")
    return BIFReader(path).get_model()
|
|
|
|
def get_true_dag_adjmat(model):
    """Extract the ground-truth DAG adjacency matrix from a BayesianNetwork.

    Nodes are indexed in sorted-name order so the matrix layout is
    deterministic across runs.

    Returns:
        adjmat: np.ndarray of shape (n_nodes, n_nodes); adjmat[i, j] = 1
            means there is a directed edge i -> j.
        node_names: list of node names giving the row/column ordering.
    """
    node_names = sorted(model.nodes())
    index_of = {name: pos for pos, name in enumerate(node_names)}
    size = len(node_names)
    adj = np.zeros((size, size), dtype=int)
    for u, v in model.edges():
        adj[index_of[u], index_of[v]] = 1
    return adj, node_names
|
|
|
|
def dag_to_cpdag(dag_adjmat):
    """Convert a DAG adjacency matrix to its CPDAG (completed partially directed
    acyclic graph).

    The CPDAG represents the DAG's Markov equivalence class:
      - compelled edges (oriented identically in every DAG of the class)
        stay directed;
      - reversible edges become undirected, encoded as a symmetric pair
        of 1-entries.

    Algorithm (Chickering 2002 / Meek 1995):
      1. Start from the undirected skeleton.
      2. Orient all v-structures i -> k <- j where i and j are non-adjacent.
      3. Apply Meek's orientation rules 1-3 to a fixpoint.
    (Meek's rule 4 is only required with background knowledge, so it is
    intentionally absent here.)

    Args:
        dag_adjmat: square 0/1 np.ndarray; dag_adjmat[i, j] = 1 means i -> j.
            Assumed to be a valid DAG -- cycles are not detected.

    Returns:
        cpdag: np.ndarray where cpdag[i,j]=1 and cpdag[j,i]=0 means i->j
            (directed), and cpdag[i,j]=1 and cpdag[j,i]=1 means i--j
            (undirected).
    """
    n = dag_adjmat.shape[0]

    # Undirected skeleton: an edge wherever the DAG has an arc either way.
    skeleton = ((dag_adjmat + dag_adjmat.T) > 0).astype(int)
    cpdag = skeleton.copy()

    # Step 2: re-orient every unshielded collider.  For each node k, any
    # pair of parents that are themselves non-adjacent forms a v-structure.
    for k in range(n):
        parents_of_k = np.where(dag_adjmat[:, k] == 1)[0]
        for idx_a in range(len(parents_of_k)):
            for idx_b in range(idx_a + 1, len(parents_of_k)):
                i = parents_of_k[idx_a]
                j = parents_of_k[idx_b]
                # Unshielded: the two parents share no edge in the skeleton.
                if skeleton[i, j] == 0:
                    # Direct both arcs into the collider k by removing the
                    # reverse entries left over from the skeleton.
                    cpdag[i, k] = 1
                    cpdag[k, i] = 0
                    cpdag[j, k] = 1
                    cpdag[k, j] = 0

    # Step 3: propagate orientations with Meek's rules until stable.
    changed = True
    while changed:
        changed = False
        for i in range(n):
            for j in range(n):
                # Only undirected edges i--j are candidates for orientation.
                if cpdag[i, j] == 1 and cpdag[j, i] == 1:
                    # Meek rule 1: if k -> i and k, j are non-adjacent,
                    # orient i -> j (otherwise a new v-structure k -> i <- j
                    # would appear that the DAG does not have).
                    for k in range(n):
                        if k != i and k != j:
                            if cpdag[k, i] == 1 and cpdag[i, k] == 0:
                                if cpdag[k, j] == 0 and cpdag[j, k] == 0:
                                    cpdag[j, i] = 0
                                    changed = True

                # Re-check: rule 1 above may already have oriented i--j.
                if cpdag[i, j] == 1 and cpdag[j, i] == 1:
                    # Meek rule 2: if a directed path i -> k -> j exists,
                    # orient i -> j (the reverse would create a cycle).
                    for k in range(n):
                        if k != i and k != j:
                            if (cpdag[i, k] == 1 and cpdag[k, i] == 0 and
                                cpdag[k, j] == 1 and cpdag[j, k] == 0):
                                cpdag[j, i] = 0
                                changed = True

                if cpdag[i, j] == 1 and cpdag[j, i] == 1:
                    # Meek rule 3: if i--k1 and i--k2 are undirected,
                    # k1 -> j and k2 -> j are directed, and k1, k2 are
                    # non-adjacent, orient i -> j.
                    k_candidates = []
                    for k in range(n):
                        if k != i and k != j:
                            if (cpdag[i, k] == 1 and cpdag[k, i] == 1 and
                                cpdag[k, j] == 1 and cpdag[j, k] == 0):
                                k_candidates.append(k)
                    for idx_a in range(len(k_candidates)):
                        for idx_b in range(idx_a + 1, len(k_candidates)):
                            k1, k2 = k_candidates[idx_a], k_candidates[idx_b]
                            if cpdag[k1, k2] == 0 and cpdag[k2, k1] == 0:
                                cpdag[j, i] = 0
                                changed = True

    return cpdag
|
|
|
|
def sample_dataset(model, n_samples, seed=42):
    """Sample observational data from a Bayesian network.

    Args:
        model: pgmpy BayesianNetwork to sample from.
        n_samples: number of rows to draw.
        seed: RNG seed for reproducibility.

    Returns:
        df: pd.DataFrame with columns in sorted name order and all
            discrete variables integer-encoded.
    """
    # Seed the global numpy RNG as well: pgmpy versions whose
    # forward_sample lacks a `seed` parameter draw from global state.
    np.random.seed(seed)
    sampler = BayesianModelSampling(model)
    try:
        df = sampler.forward_sample(size=n_samples, seed=seed)
    except TypeError:
        # forward_sample signature varies across pgmpy versions; fall back
        # to our own topological-order sampler.
        df = _manual_forward_sample(model, n_samples, seed)

    # Canonical column order.  .copy() fixes the chained-assignment hazard:
    # the column selection may return a view, and mutating it below is
    # undefined under pandas copy-on-write (the warning was being masked by
    # the module-level warnings filter).
    df = df[sorted(df.columns)].copy()

    # Integer-encode string/categorical state labels; cat.codes follow the
    # (alphabetical) category order, so the encoding is deterministic.
    for col in df.columns:
        if df[col].dtype == object or df[col].dtype.name == 'category':
            df[col] = df[col].astype('category').cat.codes

    # Coerce any remaining columns to int; unparseable cells become 0.
    df = df.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

    return df
|
|
|
|
def _manual_forward_sample(model, n_samples, seed=42):
    """Manual forward sampling, used when pgmpy's sampler has API issues.

    Args:
        model: pgmpy BayesianNetwork; must expose get_cpds() and be
            traversable by networkx.topological_sort.
        n_samples: number of rows to draw.
        seed: seed for a local RandomState (global numpy state untouched).

    Returns:
        pd.DataFrame of integer state indices, one column per node.
    """
    import networkx as nx

    rng = np.random.RandomState(seed)
    # Topological order guarantees parents are sampled before children.
    nodes = list(nx.topological_sort(model))

    # Map each variable to its CPD for O(1) lookup inside the hot loop.
    cpd_dict = {}
    for cpd in model.get_cpds():
        cpd_dict[cpd.variable] = cpd

    samples = {node: [] for node in nodes}

    for _ in range(n_samples):
        sample = {}
        for node in nodes:
            cpd = cpd_dict[node]
            parents = cpd.get_evidence()

            if not parents:
                # Root node: the CPT is a plain marginal distribution.
                probs = cpd.get_values().flatten()
                probs = probs / probs.sum()  # guard against rounding drift
                val = rng.choice(len(probs), p=probs)
            else:
                values = cpd.get_values()
                state_names = cpd.state_names

                # Locate the CPT column for this parent configuration.
                # NOTE(review): assumes get_values() columns enumerate parent
                # configurations with the LAST entry of get_evidence() varying
                # fastest -- confirm against the installed pgmpy version.
                col_idx = 0
                stride = 1
                for p in reversed(parents):
                    p_card = len(state_names[p])
                    col_idx += sample[p] * stride
                    stride *= p_card

                probs = values[:, col_idx]
                # Some BIF CPTs carry tiny negative / unnormalized entries;
                # clamp and renormalize so rng.choice accepts them.
                probs = np.abs(probs)
                probs = probs / probs.sum()
                val = rng.choice(len(probs), p=probs)

            sample[node] = val
            samples[node].append(val)

    return pd.DataFrame(samples)
|
|
|
|
def generate_all_datasets(networks=None, output_dir=None):
    """Generate every (network, sample size, seed) dataset configuration.

    Args:
        networks: iterable of network names; defaults to ALL_NETWORKS.
        output_dir: NOTE(review): accepted but never read anywhere in this
            function -- either wire it up or drop it once callers are checked.

    Returns:
        List of dicts, each with keys: network (str), n_samples (int),
        seed (int), df (pd.DataFrame), true_dag (np.ndarray),
        true_cpdag (np.ndarray), node_names (list).
    """
    names = ALL_NETWORKS if networks is None else networks

    configs = []
    for net_name in names:
        sizes = SAMPLE_SIZES[get_network_tier(net_name)]

        logger.info(f"Loading network: {net_name}")
        bn = load_bn_model(net_name)
        dag, node_names = get_true_dag_adjmat(bn)
        cpdag = dag_to_cpdag(dag)

        for n_samples in sizes:
            for seed in range(SEEDS_PER_CONFIG):
                try:
                    data = sample_dataset(bn, n_samples, seed=seed)
                    configs.append({
                        'network': net_name,
                        'n_samples': n_samples,
                        'seed': seed,
                        'df': data,
                        'true_dag': dag,
                        'true_cpdag': cpdag,
                        'node_names': node_names,
                    })
                    logger.info(f"  {net_name} N={n_samples} seed={seed}: {data.shape}")
                except Exception as e:
                    # Best-effort: one failing configuration must not abort
                    # the whole generation run.
                    logger.error(f"  FAILED {net_name} N={n_samples} seed={seed}: {e}")

    return configs
|
|
|
|
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Smoke test on the smallest benchmark network.
    bn = load_bn_model('asia')
    adj, names = get_true_dag_adjmat(bn)
    cp = dag_to_cpdag(adj)

    print(f"ASIA - nodes: {names}")
    print(f"DAG adjacency:\n{adj}")
    print(f"CPDAG adjacency:\n{cp}")

    data = sample_dataset(bn, 1000, seed=0)
    print(f"\nSampled data: {data.shape}")
    print(data.head())
|
|