
Commit 8cf53ee

Initial commit of warm-start interface in Python

1 parent dc7bad9

3 files changed: +218 −1 lines changed


R/bart.R

Lines changed: 1 addition & 1 deletion

@@ -206,7 +206,7 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train
     if (previous_bart_model$model_params$include_mean_forest) {
         previous_forest_samples_mean <- previous_bart_model$mean_forests
     } else previous_forest_samples_mean <- NULL
-    if (previous_bart_model$model_params$include_mean_forest) {
+    if (previous_bart_model$model_params$include_variance_forest) {
         previous_forest_samples_variance <- previous_bart_model$variance_forests
     } else previous_forest_samples_variance <- NULL
     if (previous_bart_model$model_params$sample_sigma_global) {

demo/debug/multi_chain.py

Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@

# Multi Chain Demo Script

# Load necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

from stochtree import BARTModel

# Generate sample data
# RNG
random_seed = 1234
rng = np.random.default_rng(random_seed)

# Generate covariates and basis
n = 500
p_X = 10
p_W = 1
X = rng.uniform(0, 1, (n, p_X))
W = rng.uniform(0, 1, (n, p_W))

# Define the outcome mean function
def outcome_mean(X, W):
    return np.where(
        (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),
        -7.5 * W[:, 0],
        np.where(
            (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),
            -2.5 * W[:, 0],
            np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 2.5 * W[:, 0], 7.5 * W[:, 0]),
        ),
    )

# Generate outcome
f_XW = outcome_mean(X, W)
epsilon = rng.normal(0, 1, n)
y = f_XW + epsilon

# Test-train split
sample_inds = np.arange(n)
train_inds, test_inds = train_test_split(sample_inds, test_size=0.5, random_state=random_seed)
X_train = X[train_inds, :]
X_test = X[test_inds, :]
basis_train = W[train_inds, :]
basis_test = W[test_inds, :]
y_train = y[train_inds]
y_test = y[test_inds]

# Run the GFR algorithm for a small number of iterations
general_model_params = {"random_seed": -1}
mean_forest_model_params = {"num_trees": 20}
num_warmstart = 10
num_mcmc = 10
bart_model = BARTModel()
bart_model.sample(
    X_train=X_train,
    y_train=y_train,
    leaf_basis_train=basis_train,
    X_test=X_test,
    leaf_basis_test=basis_test,
    num_gfr=num_warmstart,
    num_mcmc=0,
    general_params=general_model_params,
    mean_forest_params=mean_forest_model_params,
)
bart_model_json = bart_model.to_json()

# Run several BART MCMC samples from the last GFR forest
bart_model_2 = BARTModel()
bart_model_2.sample(
    X_train=X_train,
    y_train=y_train,
    leaf_basis_train=basis_train,
    X_test=X_test,
    leaf_basis_test=basis_test,
    num_gfr=0,
    num_mcmc=num_mcmc,
    previous_model_json=bart_model_json,
    previous_model_warmstart_sample_num=num_warmstart - 1,
    general_params=general_model_params,
    mean_forest_params=mean_forest_model_params,
)

# Run several BART MCMC samples from the second-to-last GFR forest
bart_model_3 = BARTModel()
bart_model_3.sample(
    X_train=X_train,
    y_train=y_train,
    leaf_basis_train=basis_train,
    X_test=X_test,
    leaf_basis_test=basis_test,
    num_gfr=0,
    num_mcmc=num_mcmc,
    previous_model_json=bart_model_json,
    previous_model_warmstart_sample_num=num_warmstart - 2,
    general_params=general_model_params,
    mean_forest_params=mean_forest_model_params,
)

# Run several BART MCMC samples from root
bart_model_4 = BARTModel()
bart_model_4.sample(
    X_train=X_train,
    y_train=y_train,
    leaf_basis_train=basis_train,
    X_test=X_test,
    leaf_basis_test=basis_test,
    num_gfr=0,
    num_mcmc=num_mcmc,
    general_params=general_model_params,
    mean_forest_params=mean_forest_model_params,
)

# Inspect the model outputs
y_hat_mcmc_2 = bart_model_2.predict(X_test, basis_test)
y_avg_mcmc_2 = np.squeeze(y_hat_mcmc_2).mean(axis=1, keepdims=True)
y_hat_mcmc_3 = bart_model_3.predict(X_test, basis_test)
y_avg_mcmc_3 = np.squeeze(y_hat_mcmc_3).mean(axis=1, keepdims=True)
y_hat_mcmc_4 = bart_model_4.predict(X_test, basis_test)
y_avg_mcmc_4 = np.squeeze(y_hat_mcmc_4).mean(axis=1, keepdims=True)
y_df = pd.DataFrame(
    np.concatenate((y_avg_mcmc_2, y_avg_mcmc_3, y_avg_mcmc_4, np.expand_dims(y_test, axis=1)), axis=1),
    columns=["First Chain", "Second Chain", "Third Chain", "Outcome"],
)

# Compare first warm-start chain to root chain with equal number of MCMC draws
sns.scatterplot(data=y_df, x="First Chain", y="Third Chain")
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()

# Compare first warm-start chain to outcome
sns.scatterplot(data=y_df, x="First Chain", y="Outcome")
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()

# Compare root chain to outcome
sns.scatterplot(data=y_df, x="Third Chain", y="Outcome")
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()

# Compute RMSEs
rmse_1 = np.sqrt(np.mean((np.squeeze(y_avg_mcmc_2) - y_test) ** 2))
rmse_2 = np.sqrt(np.mean((np.squeeze(y_avg_mcmc_3) - y_test) ** 2))
rmse_3 = np.sqrt(np.mean((np.squeeze(y_avg_mcmc_4) - y_test) ** 2))
print("Chain 1 RMSE: {:0.3f}; Chain 2 RMSE: {:0.3f}; Chain 3 RMSE: {:0.3f}".format(rmse_1, rmse_2, rmse_3))
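The docstring for `sample` (in stochtree/bart.py below) frames warm-starting as a way to run parallel chains, while the demo above runs its chains sequentially. As a rough sketch of the parallel workflow, not part of this commit: the two warm-started chains could be dispatched to separate processes. `ProcessPoolExecutor` usage and the `run_chain` helper are illustrative assumptions layered on the demo's variables, not stochtree API.

# Hypothetical parallel-chain sketch (not part of this commit). Assumes the
# demo's variables (X_train, y_train, basis_train, X_test, basis_test,
# num_mcmc, num_warmstart, bart_model_json, and the parameter dicts) are in scope.
from concurrent.futures import ProcessPoolExecutor

def run_chain(warmstart_sample_num):
    # Each process runs an independent MCMC chain warm-started from the
    # serialized GFR forests in bart_model_json.
    chain = BARTModel()
    chain.sample(
        X_train=X_train,
        y_train=y_train,
        leaf_basis_train=basis_train,
        X_test=X_test,
        leaf_basis_test=basis_test,
        num_gfr=0,
        num_mcmc=num_mcmc,
        previous_model_json=bart_model_json,
        previous_model_warmstart_sample_num=warmstart_sample_num,
        general_params=general_model_params,
        mean_forest_params=mean_forest_model_params,
    )
    # Return test-set predictions (a numpy array), which pickle cleanly
    # across process boundaries.
    return chain.predict(X_test, basis_test)

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2) as pool:
        chain_preds = list(pool.map(run_chain, [num_warmstart - 1, num_warmstart - 2]))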

stochtree/bart.py

Lines changed: 70 additions & 0 deletions
@@ -77,6 +77,8 @@ def sample(
         general_params: Optional[Dict[str, Any]] = None,
         mean_forest_params: Optional[Dict[str, Any]] = None,
         variance_forest_params: Optional[Dict[str, Any]] = None,
+        previous_model_json: Optional[str] = None,
+        previous_model_warmstart_sample_num: Optional[int] = None,
     ) -> None:
         """Runs a BART sampler on provided training set. Predictions will be cached for the training set and (if provided) the test set.
         Does not require a leaf regression basis.
@@ -154,6 +156,11 @@ def sample(
             * `var_forest_prior_scale` (`float`): Scale parameter in the [optional] `IG(var_forest_prior_shape, var_forest_prior_scale)` conditional error variance forest (which is only sampled if `num_trees > 0`). Calibrated internally as `num_trees / leaf_prior_calibration_param^2` if not set here.
             * `keep_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be included in the variance forest. Defaults to `None`.
             * `drop_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be excluded from the variance forest. Defaults to `None`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored.
+
+        previous_model_json : str, optional
+            JSON string containing a previous BART model. This can be used to "continue" a sampler interactively after inspecting the samples or to run parallel chains "warm-started" from existing forest samples. Defaults to `None`.
+        previous_model_warmstart_sample_num : int, optional
+            Sample number from `previous_model_json` that will be used to warm-start this BART sampler. Zero-indexed (so that the first sample is used for warm-start by setting `previous_model_warmstart_sample_num = 0`). Defaults to `None`.

         Returns
         -------
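The docstring above names two uses of `previous_model_json`: continuing a sampler and warm-starting parallel chains. The demo script covers the parallel-chain case; a minimal sketch of the continuation case, using only the arguments added in this commit (data variables are placeholders, and `num_samples` is assumed to be exposed on a fitted model as it is on the deserialized model in the parsing code below), might look like:

# Minimal continuation sketch (illustrative; X_train / y_train are placeholders)
from stochtree import BARTModel

first_run = BARTModel()
first_run.sample(X_train=X_train, y_train=y_train, num_gfr=10, num_mcmc=100)

# After inspecting the draws, resume MCMC from the last retained sample
continued = BARTModel()
continued.sample(
    X_train=X_train,
    y_train=y_train,
    num_gfr=0,
    num_mcmc=100,
    previous_model_json=first_run.to_json(),
    previous_model_warmstart_sample_num=first_run.num_samples - 1,
)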
@@ -612,6 +619,51 @@ def sample(
         else:
             variable_subset_variance = [i for i in range(X_train.shape[1])]

+        # Check if previous model JSON is provided and parse it if so
+        has_prev_model = previous_model_json is not None
+        if has_prev_model:
+            if num_gfr > 0:
+                if num_mcmc == 0:
+                    raise ValueError("A previous model is being used to initialize this sampler, so `num_mcmc` must be greater than zero")
+                else:
+                    warnings.warn("A previous model is being used to initialize this sampler, so num_gfr will be ignored and the MCMC sampler will be run from the previous samples")
+            previous_bart_model = BARTModel()
+            previous_bart_model.from_json(previous_model_json)
+            previous_y_bar = previous_bart_model.y_bar
+            previous_y_scale = previous_bart_model.y_std
+            previous_model_num_samples = previous_bart_model.num_samples
+            if previous_bart_model.include_mean_forest:
+                previous_forest_samples_mean = previous_bart_model.forest_container_mean
+            else:
+                previous_forest_samples_mean = None
+            if previous_bart_model.include_variance_forest:
+                previous_forest_samples_variance = previous_bart_model.forest_container_variance
+            else:
+                previous_forest_samples_variance = None
+            if previous_bart_model.sample_sigma_global:
+                previous_global_var_samples = previous_bart_model.global_var_samples / (previous_y_scale * previous_y_scale)
+            else:
+                previous_global_var_samples = None
+            if previous_bart_model.sample_sigma_leaf:
+                previous_leaf_var_samples = previous_bart_model.leaf_scale_samples
+            else:
+                previous_leaf_var_samples = None
+            if previous_bart_model.has_rfx:
+                previous_rfx_samples = previous_bart_model.rfx_container
+            else:
+                previous_rfx_samples = None
+            if previous_model_warmstart_sample_num + 1 > previous_model_num_samples:
+                raise ValueError("`previous_model_warmstart_sample_num` exceeds the number of samples in `previous_model_json`")
+        else:
+            previous_y_bar = None
+            previous_y_scale = None
+            previous_global_var_samples = None
+            previous_leaf_var_samples = None
+            previous_rfx_samples = None
+            previous_forest_samples_mean = None
+            previous_forest_samples_variance = None
+            previous_model_num_samples = 0
+
         # Update variable weights if the covariates have been resized (by e.g. one-hot encoding)
         if X_train_processed.shape[1] != X_train.shape[1]:
             variable_counts = [
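One subtle line in the hunk above is the division of `global_var_samples` by `previous_y_scale` squared. The implied convention, an assumption here rather than something this diff states outright, is that the sampler operates on a standardized outcome while serialized global variance draws live on the original outcome scale, so restoring a draw for a new run means undoing that scaling:

# Illustrative arithmetic only; not stochtree API. Assumes serialized global
# variance draws are stored on the original outcome scale.
import numpy as np

y = np.array([3.1, 0.4, 2.2, 5.0])
y_std = np.std(y)                         # scale used to standardize y
sigma2_original = 1.7                     # a stored draw on the original scale
sigma2_standardized = sigma2_original / (y_std * y_std)
# The sampler resumes with sigma2_standardized, mirroring
# previous_bart_model.global_var_samples / (previous_y_scale * previous_y_scale)
print(sigma2_standardized)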
@@ -992,6 +1044,22 @@ def sample(
                     )
                     if sample_sigma_global:
                         current_sigma2 = self.global_var_samples[forest_ind]
+                elif has_prev_model:
+                    if self.include_mean_forest:
+                        active_forest_mean.reset(previous_bart_model.forest_container_mean, previous_model_warmstart_sample_num)
+                        forest_sampler_mean.reconstitute_from_forest(active_forest_mean, forest_dataset_train, residual_train, True)
+                        if sample_sigma_leaf and previous_leaf_var_samples is not None:
+                            leaf_scale_double = previous_leaf_var_samples[previous_model_warmstart_sample_num]
+                            current_leaf_scale[0, 0] = leaf_scale_double
+                            forest_model_config_mean.update_leaf_model_scale(leaf_scale_double)
+                    if self.include_variance_forest:
+                        active_forest_variance.reset(previous_bart_model.forest_container_variance, previous_model_warmstart_sample_num)
+                        forest_sampler_variance.reconstitute_from_forest(active_forest_variance, forest_dataset_train, residual_train, True)
+                    # if self.has_rfx:
+                    #     pass
+                    if self.sample_sigma_global:
+                        current_sigma2 = previous_global_var_samples[previous_model_warmstart_sample_num]
+                        global_model_config.update_global_error_variance(current_sigma2)
                 else:
                     if self.include_mean_forest:
                         active_forest_mean.reset_root()
@@ -1069,12 +1137,14 @@ def sample(
                 current_sigma2 = global_var_model.sample_one_iteration(
                     residual_train, cpp_rng, a_global, b_global
                 )
+                global_model_config.update_global_error_variance(current_sigma2)
                 if keep_sample:
                     self.global_var_samples[sample_counter] = current_sigma2
             if self.sample_sigma_leaf:
                 current_leaf_scale[0, 0] = leaf_var_model.sample_one_iteration(
                     active_forest_mean, cpp_rng, a_leaf, b_leaf
                 )
+                forest_model_config_mean.update_leaf_model_scale(current_leaf_scale)
                 if keep_sample:
                     self.leaf_scale_samples[sample_counter] = (
                         current_leaf_scale[0, 0]
