diff --git a/Cargo.toml b/Cargo.toml
index 3d5ccaa..2704718 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "forust-ml"
-version = "0.2.26"
+version = "0.3.0"
 edition = "2021"
 authors = ["James Inlow "]
 homepage = "https://github.com/jinlow/forust"
diff --git a/README.md b/README.md
index 6f717a8..8b20a8f 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ pip install forust
 To use in a rust project add the following to your Cargo.toml file.
 
 ```toml
-forust-ml = "0.2.26"
+forust-ml = "0.3.0"
 ```
 
 ## Usage
diff --git a/py-forust/Cargo.toml b/py-forust/Cargo.toml
index da31aac..a67b018 100644
--- a/py-forust/Cargo.toml
+++ b/py-forust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "py-forust"
-version = "0.2.26"
+version = "0.3.0"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,7 +10,7 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.19.0", features = ["extension-module"] }
-forust-ml = { version = "0.2.26", path = "../" }
+forust-ml = { version = "0.3.0", path = "../" }
 numpy = "0.19.0"
 ndarray = "0.15.1"
 serde_plain = { version = "1.0" }
diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py
index 920b639..7d9ab97 100644
--- a/py-forust/forust/__init__.py
+++ b/py-forust/forust/__init__.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import inspect
 import sys
 import warnings
 from typing import Any, Iterable, Protocol, Union, cast
@@ -10,6 +11,11 @@
 from forust.forust import GradientBooster as CrateGradientBooster  # type: ignore
 from forust.serialize import BaseSerializer, ObjectSerializer, ScalerSerializer
 
+
+class UnimplementedWarning(Warning):
+    """Warning to throw when users try to adjust the base score."""
+
+
 ArrayLike = Union[pd.Series, np.ndarray]
 FrameLike = Union[pd.DataFrame, np.ndarray]
 
@@ -176,7 +182,7 @@ def __init__(
         l2: float = 1.0,
         gamma: float = 0.0,
         min_leaf_weight: float = 1.0,
-        base_score: float | None = None,
+        base_score: float = 0.5,
         nbins: int = 256,
         parallel: bool = True,
         allow_missing_splits: bool = True,
@@ -191,7 +197,7 @@
         grow_policy: str = "DepthWise",
         evaluation_metric: str | None = None,
         early_stopping_rounds: int | None = None,
-        initialize_base_score: bool = False,
+        initialize_base_score: bool = True,
         terminate_missing_features: Iterable[Any] | None = None,
         missing_node_treatment: str = "None",
         log_iterations: int = 0,
@@ -217,7 +223,9 @@
                Valid values are 0 to infinity. Defaults to 0.0.
            min_leaf_weight (float, optional): Minimum sum of the hessian values of the
                loss function required to be in a node. Defaults to 1.0.
-            base_score (float, optional): The initial prediction value of the model. If set to None the parameter `initialize_base_score` will automatically be set to True, in which case the base score will be chosen based on the objective function at fit time. Defaults to None.
+            base_score (float, optional): The initial prediction value of the model. If `initialize_base_score`
+                is set to True, the `base_score` will automatically be chosen based on the objective
+                function at fit time. Defaults to 0.5.
            nbins (int, optional): Number of bins to calculate to partition the data. Setting this to
                a smaller number, will result in faster training time, while potentially sacrificing accuracy.
                If there are more bins, than unique values in a column, all unique values
@@ -324,10 +332,17 @@ def __init__(
             else sample_method_
         )
         terminate_missing_features_ = (
-            set()
-            if terminate_missing_features is None
-            else set(terminate_missing_features)
+            set() if terminate_missing_features is None else terminate_missing_features
         )
+
+        if (base_score != 0.5) and initialize_base_score:
+            warnings.warn(
+                "It appears as if you are modifying the `base_score` value, but "
+                + "`initialize_base_score` is set to True. The `base_score` will be"
+                + " calculated at `fit` time. If this is not the desired behavior, set"
+                + " `initialize_base_score` to False.",
+            )
+
         booster = CrateGradientBooster(
             objective_type=objective_type,
             iterations=iterations,
@@ -370,8 +385,9 @@ def __init__(
         self.l2 = l2
         self.gamma = gamma
         self.min_leaf_weight = min_leaf_weight
-        # Use booster getter, as it's more dynamic
-        # self.base_score = base_score
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            self.base_score = base_score
         self.nbins = nbins
         self.parallel = parallel
         self.allow_missing_splits = allow_missing_splits
@@ -381,6 +397,7 @@ def __init__(
         self.missing = missing
         self.create_missing_branch = create_missing_branch
         self.sample_method = sample_method
+        self.grow_policy = grow_policy
         self.top_rate = top_rate
         self.other_rate = other_rate
         self.evaluation_metric = evaluation_metric
@@ -487,6 +504,10 @@ def fit(
             evaluation_data=evaluation_data_,  # type: ignore
         )
 
+        # Once it's been fit, reset the `base_score`;
+        # this accounts for the fact that it's adjusted after fit.
+        self.base_score = self.booster.base_score
+
     def _validate_features(self, features: list[str]):
         if len(features) > 0 and hasattr(self, "feature_names_in_"):
             if features != self.feature_names_in_:
@@ -523,8 +544,10 @@ def feature_importances_(self) -> np.ndarray:
             method=self.feature_importance_method, normalize=True
         )
         if hasattr(self, "feature_names_in_"):
+            vals = cast(dict[str, float], vals)
             return np.array([vals.get(ft, 0.0) for ft in self.feature_names_in_])
         else:
+            vals = cast(dict[int, float], vals)
             return np.array([vals.get(ft, 0.0) for ft in range(self.n_features_)])
 
     def predict_contributions(
@@ -652,9 +675,8 @@ def partial_dependence(
         ```
 
         """
-        is_dataframe = isinstance(X, pd.DataFrame)
         if isinstance(feature, str):
-            if not is_dataframe:
+            if not isinstance(X, pd.DataFrame):
                 raise ValueError(
                     "If `feature` is a string, then the object passed as `X` must be a pandas DataFrame."
                )
@@ -673,7 +695,7 @@ def partial_dependence(
             [feature_idx] = [i for i, v in enumerate(X.columns) if v == feature]
         elif isinstance(feature, int):
             feature_idx = feature
-            if is_dataframe:
+            if isinstance(X, pd.DataFrame):
                 values = X.iloc[:, feature].unique()
             else:
                 values = X[:, feature]
@@ -781,7 +803,9 @@ def load_booster(cls, path: str) -> GradientBooster:
         booster = CrateGradientBooster.load_booster(str(path))
         params = booster.get_params()
-        c = cls(**params)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            c = cls(**params)
         c.booster = booster
         for m in c.meta_data_attributes:
             try:
@@ -824,7 +848,7 @@ def _standardize_terminate_missing_features(
         X: Union[pd.DataFrame, np.ndarray],
     ) -> set[int]:
         if isinstance(X, np.ndarray):
-            return self.terminate_missing_features
+            return set(self.terminate_missing_features)
         else:
             feature_map = {f: i for i, f in enumerate(X.columns)}
             return set(feature_map[f] for f in self.terminate_missing_features)
@@ -891,11 +915,6 @@ def best_iteration(self) -> int | None:
         """
         return self.booster.best_iteration
 
-    @property
-    def base_score(self) -> float:
-        """Base score used as initial prediction value"""
-        return self.booster.base_score
-
     @property
     def prediction_iteration(self) -> int | None:
         """The prediction_iteration that will be used when predicting, up to this many trees will be used.
@@ -912,3 +931,9 @@ def get_best_iteration(self) -> int | None:
            int | None: The best iteration, or None if `early_stopping_rounds` wasn't used.
        """
        return self.booster.best_iteration
+
+    # Functions for scikit-learn compatibility. We'll feel out adding these manually,
+    # and if that becomes too unwieldy, we'll add scikit-learn as a dependency.
+    def get_params(self, deep=True):
+        args = inspect.getfullargspec(GradientBooster).kwonlyargs
+        return {param: getattr(self, param) for param in args}
diff --git a/py-forust/src/lib.rs b/py-forust/src/lib.rs
index 4f0e3a3..9ddfd32 100644
--- a/py-forust/src/lib.rs
+++ b/py-forust/src/lib.rs
@@ -90,7 +90,7 @@ impl GradientBooster {
         l2: f32,
         gamma: f32,
         min_leaf_weight: f32,
-        base_score: Option<f64>,
+        base_score: f64,
         nbins: u16,
         parallel: bool,
         allow_missing_splits: bool,
diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py
index 2d04115..b32da97 100644
--- a/py-forust/tests/test_booster.py
+++ b/py-forust/tests/test_booster.py
@@ -1,9 +1,11 @@
 import json
+import warnings
 from typing import Tuple
 
 import numpy as np
 import pandas as pd
 import pytest
+from sklearn.base import clone
 from sklearn.metrics import roc_auc_score
 from xgboost import XGBClassifier, XGBRegressor
 
@@ -48,12 +50,67 @@ def test_booster_to_xgboosts(X_y):
         min_leaf_weight=1.0,
         gamma=0,
         objective_type="LogLoss",
+        initialize_base_score=False,
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
     assert np.allclose(fmod_preds, xmod_preds, atol=0.00001)
 
 
+def test_sklearn_clone(X_y):
+    X, y = X_y
+    fmod = GradientBooster(
+        base_score=0.5,
+        iterations=100,
+        learning_rate=0.3,
+        max_depth=5,
+        l2=1,
+        min_leaf_weight=1.0,
+        gamma=0,
+        objective_type="LogLoss",
+        initialize_base_score=True,
+    )
+    fmod_cloned = clone(fmod)
+    fmod_cloned.fit(X, y=y)
+
+    fmod.fit(X, y=y)
+
+    # After it's fit it can still be cloned.
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        fmod_cloned_post_fit = clone(fmod)
+    fmod_cloned_post_fit.fit(X, y=y)
+
+    fmod_preds = fmod.predict(X)
+    fmod_cloned_preds = fmod_cloned.predict(X)
+    fmod_cloned_post_fit_preds = fmod_cloned_post_fit.predict(X)
+
+    assert np.allclose(fmod_preds, fmod_cloned_preds)
+    assert np.allclose(fmod_preds, fmod_cloned_post_fit_preds)
+
+
+def test_multiple_fit_calls(X_y):
+    X, y = X_y
+    fmod = GradientBooster(
+        base_score=0.5,
+        iterations=100,
+        learning_rate=0.3,
+        max_depth=5,
+        l2=1,
+        min_leaf_weight=1.0,
+        gamma=0,
+        objective_type="LogLoss",
+        initialize_base_score=True,
+    )
+    fmod.fit(X, y=y)
+    fmod_preds = fmod.predict(X)
+
+    fmod.fit(X, y=y)
+    fmod_fit_again_preds = fmod.predict(X)
+
+    assert np.allclose(fmod_preds, fmod_fit_again_preds)
+
+
 def test_booster_from_numpy(X_y):
     X, y = X_y
     X = X.astype("float32").astype("float64")
@@ -66,6 +123,7 @@ def test_booster_from_numpy(X_y):
         min_leaf_weight=1.0,
         gamma=0,
         objective_type="LogLoss",
+        initialize_base_score=False,
     )
     fmod1.fit(X, y=y)
     fmod1_preds = fmod1.predict(X)
@@ -79,6 +137,7 @@ def test_booster_from_numpy(X_y):
         min_leaf_weight=1.0,
         gamma=0,
         objective_type="LogLoss",
+        initialize_base_score=False,
     )
     fmod2.fit(X, y=y)
     fmod2_preds = fmod2.predict(X.to_numpy())
@@ -92,6 +151,7 @@ def test_booster_from_numpy(X_y):
         min_leaf_weight=1.0,
         gamma=0,
         objective_type="LogLoss",
+        initialize_base_score=False,
     )
     fmod3.fit(X.to_numpy().astype("float32"), y=y)
     fmod3_preds = fmod3.predict(X)
@@ -129,6 +189,7 @@ def test_booster_to_xgboosts_with_missing(X_y):
         objective_type="LogLoss",
         nbins=500,
         parallel=True,
+        initialize_base_score=False,
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
@@ -162,6 +223,7 @@ def test_importance(X_y):
         objective_type="LogLoss",
         nbins=500,
         parallel=True,
+        initialize_base_score=False,
     )
     fmod.fit(X, y)
     x_imp = xmod.get_booster().get_score(importance_type="weight")
@@ -224,6 +286,7 @@ def test_booster_to_xgboosts_with_missing_sl(X_y):
         objective_type="SquaredLoss",
         nbins=500,
         parallel=True,
+        initialize_base_score=False,
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
@@ -361,6 +424,7 @@ def test_booster_to_xgboosts_weighted(X_y):
         min_leaf_weight=1,
         gamma=0,
         objective_type="LogLoss",
+        initialize_base_score=False,
     )
     fmod.fit(X, y=y, sample_weight=w)
     fmod_preds = fmod.predict(X)
@@ -571,6 +635,7 @@ def test_booster_to_xgboosts_with_contributions(X_y):
         nbins=500,
         parallel=True,
         base_score=0.5,
+        initialize_base_score=False,
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
@@ -678,7 +743,8 @@ def test_missing_branch_with_contributions(X_y):
     )
 
 
-def test_booster_metadata(X_y, tmp_path):
+@pytest.mark.parametrize("initialize_base_score", [True, False])
+def test_booster_metadata(X_y, tmp_path, initialize_base_score):
     f64_model_path = tmp_path / "modelf64_sl.json"
     X, y = X_y
     X = X
@@ -693,6 +759,7 @@ def test_booster_metadata(X_y, tmp_path):
         nbins=500,
         parallel=True,
         base_score=0.5,
+        initialize_base_score=initialize_base_score,
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
@@ -707,6 +774,11 @@ def test_booster_metadata(X_y, tmp_path):
     with pytest.raises(KeyError):
         loaded.get_metadata("No-key")
 
+    # Make sure the base score is adjusted
+    assert fmod.base_score == loaded.base_score
+    if initialize_base_score:
+        assert loaded.base_score != 0.5
+
     loaded_dict = loaded.__dict__
     fmod_dict = fmod.__dict__
     assert sorted(loaded_dict.keys()) == sorted(fmod_dict.keys())
@@ -720,7 +792,7 @@ def test_booster_metadata(X_y, tmp_path):
         elif isinstance(v, forust.CrateGradientBooster):
            assert isinstance(c_v, forust.CrateGradientBooster)
         else:
-            assert v == c_v
+            assert v == c_v, k
 
     fmod_loaded_preds = loaded.predict(X)
     assert np.allclose(fmod_preds, fmod_loaded_preds)
diff --git a/rs-example.md b/rs-example.md
index 9216f22..42e237b 100644
--- a/rs-example.md
+++ b/rs-example.md
@@ -3,7 +3,7 @@
 To run this example, add the following code to your `Cargo.toml` file.
 ```toml
 [dependencies]
-forust-ml = "0.2.26"
+forust-ml = "0.3.0"
 polars = "0.28"
 reqwest = { version = "0.11", features = ["blocking"] }
 ```
diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs
index 7ddedfa..cc1b431 100644
--- a/src/gradientbooster.rs
+++ b/src/gradientbooster.rs
@@ -239,7 +239,7 @@ impl Default for GradientBooster {
             1.,
             0.,
             1.,
-            None,
+            0.5,
             256,
             true,
             true,
@@ -254,7 +254,7 @@ impl Default for GradientBooster {
             GrowPolicy::DepthWise,
             None,
             None,
-            false,
+            true,
             HashSet::new(),
             MissingNodeTreatment::AssignToParent,
             0,
@@ -317,7 +317,7 @@ impl GradientBooster {
         l2: f32,
         gamma: f32,
         min_leaf_weight: f32,
-        base_score: Option<f64>,
+        base_score: f64,
         nbins: u16,
         parallel: bool,
         allow_missing_splits: bool,
@@ -338,10 +338,6 @@ impl GradientBooster {
         log_iterations: usize,
         force_children_to_bound_parent: bool,
     ) -> Result<Self, ForustError> {
-        let (base_score_, initialize_base_score_) = match base_score {
-            Some(v) => (v, initialize_base_score),
-            None => (0.5, true),
-        };
         let booster = GradientBooster {
             objective_type,
             iterations,
@@ -351,7 +347,7 @@ impl GradientBooster {
             l2,
             gamma,
             min_leaf_weight,
-            base_score: base_score_,
+            base_score,
             nbins,
             parallel,
             allow_missing_splits,
@@ -366,7 +362,7 @@ impl GradientBooster {
             grow_policy,
             evaluation_metric,
             early_stopping_rounds,
-            initialize_base_score: initialize_base_score_,
+            initialize_base_score,
             terminate_missing_features,
             evaluation_history: None,
             best_iteration: None,
@@ -467,6 +463,13 @@ impl GradientBooster {
         metric_callables(&metric)
     }
 
+    fn reset(&mut self) {
+        self.trees = Vec::new();
+        self.evaluation_history = None;
+        self.best_iteration = None;
+        self.prediction_iteration = None;
+    }
+
     fn fit_trees(
         &mut self,
         y: &[f64],
@@ -475,6 +478,12 @@
         splitter: &T,
         evaluation_data: Option>,
     ) -> Result<(), ForustError> {
+        // Is this a booster that has already been fit? If it is, reset the trees.
+        // In the future we could continue training.
+        if !self.trees.is_empty() {
+            self.reset()
+        }
+
         let mut rng = StdRng::seed_from_u64(self.seed);
 
         if self.initialize_base_score {
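Taken together, the Python-facing behavior after this change: `base_score` now defaults to 0.5, `initialize_base_score` defaults to True (so the base score is recalculated from the objective at fit time and written back onto the estimator), `get_params` exposes the constructor arguments for scikit-learn compatibility, and calling `fit` again resets the trees. A minimal usage sketch of the new defaults, assuming the `forust` package built from this branch; the toy data and variable names below are illustrative only:

```python
import numpy as np
import pandas as pd
from sklearn.base import clone

from forust import GradientBooster

# Illustrative toy data; any numeric DataFrame/Series pair works.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 4)), columns=["a", "b", "c", "d"])
y = pd.Series((X["a"] + rng.normal(size=500) > 0).astype(float))

# New defaults: base_score=0.5, initialize_base_score=True.
fmod = GradientBooster(objective_type="LogLoss")
fmod.fit(X, y)

# After fit, `base_score` holds the value chosen from the objective, so a
# saved-and-loaded model starts from the same initial prediction.
print(fmod.base_score)  # no longer 0.5 once initialize_base_score=True

# `get_params` returns the keyword-only constructor args, which is what
# scikit-learn's clone() needs to rebuild an unfit copy of the estimator.
# Cloning a fitted booster emits the new warning, since its adjusted
# base_score is carried into the clone while initialize_base_score is True.
fmod_cloned = clone(fmod)
fmod_cloned.fit(X, y)

# Calling fit() a second time resets the trees rather than stacking new ones,
# so repeated fits produce the same predictions.
fmod.fit(X, y)

# To pin a specific base score, turn initialization off explicitly.
fmod_fixed = GradientBooster(base_score=0.3, initialize_base_score=False)
fmod_fixed.fit(X, y)
```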