diff --git a/Cargo.toml b/Cargo.toml index 94874ad..17f53de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "forust-ml" -version = "0.4.3" +version = "0.4.4" edition = "2021" authors = ["James Inlow "] homepage = "https://github.com/jinlow/forust" diff --git a/README.md b/README.md index 0fd7129..50aaeb3 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ pip install forust To use in a rust project add the following to your Cargo.toml file. ```toml -forust-ml = "0.4.3" +forust-ml = "0.4.4" ``` ## Usage diff --git a/benches/forust_benchmarks.rs b/benches/forust_benchmarks.rs index de6b637..23672c4 100644 --- a/benches/forust_benchmarks.rs +++ b/benches/forust_benchmarks.rs @@ -2,7 +2,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; use forust_ml::binning::bin_matrix; use forust_ml::constraints::ConstraintMap; use forust_ml::data::Matrix; -use forust_ml::gradientbooster::GradientBooster; +use forust_ml::gradientbooster::{GradientBooster, GrowPolicy}; use forust_ml::objective::{LogLoss, ObjectiveFunction}; use forust_ml::sampler::SampleMethod; use forust_ml::splitter::MissingImputerSplitter; @@ -33,7 +33,9 @@ pub fn tree_benchmarks(c: &mut Criterion) { let data = Matrix::new(&data_vec, y.len(), 5); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -44,9 +46,11 @@ pub fn tree_benchmarks(c: &mut Criterion) { let bindata = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); let bdata = Matrix::new(&bindata.binned_data, data.rows, data.cols); + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, data.index.to_owned(), + &col_index, &bindata.cuts, &g, &h, @@ -55,6 +59,7 @@ pub fn tree_benchmarks(c: &mut Criterion) { 5, true, &SampleMethod::None, + &GrowPolicy::DepthWise, ); println!("{}", tree.nodes.len()); c.bench_function("Train Tree", |b| { @@ -63,6 +68,7 @@ pub fn tree_benchmarks(c: &mut Criterion) { train_tree.fit( black_box(&bdata), black_box(data.index.to_owned()), + black_box(&col_index), black_box(&bindata.cuts), black_box(&g), black_box(&h), @@ -71,6 +77,26 @@ pub fn tree_benchmarks(c: &mut Criterion) { black_box(10), black_box(false), black_box(&SampleMethod::None), + black_box(&GrowPolicy::DepthWise), + ); + }) + }); + c.bench_function("Train Tree - column subset", |b| { + b.iter(|| { + let mut train_tree: Tree = Tree::new(); + train_tree.fit( + black_box(&bdata), + black_box(data.index.to_owned()), + black_box(&[1, 3, 4]), + black_box(&bindata.cuts), + black_box(&g), + black_box(&h), + black_box(&splitter), + black_box(usize::MAX), + black_box(10), + black_box(false), + black_box(&SampleMethod::None), + black_box(&GrowPolicy::DepthWise), ); }) }); @@ -100,6 +126,21 @@ pub fn tree_benchmarks(c: &mut Criterion) { .unwrap(); }) }); + booster_train.bench_function("Train Booster - Column Sampling", |b| { + b.iter(|| { + let mut booster = GradientBooster::default() + .set_parallel(false) + .set_colsample_bytree(0.5); + booster + .fit( + black_box(&data), + black_box(&y), + black_box(&w), + black_box(None), + ) + .unwrap(); + }) + }); let mut booster = GradientBooster::default(); booster.fit(&data, &y, &w, None).unwrap(); booster_train.bench_function("Predict Booster", |b| { diff --git a/py-forust/Cargo.toml b/py-forust/Cargo.toml index 111db83..49951bc 100644 --- a/py-forust/Cargo.toml +++ b/py-forust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-forust" -version = "0.4.3" +version = "0.4.4" edition = "2021" # See more keys and their 
definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -10,7 +10,7 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.20.0", features = ["extension-module"] } -forust-ml = { version = "0.4.3", path = "../" } +forust-ml = { version = "0.4.4", path = "../" } numpy = "0.20.0" ndarray = "0.15.1" serde_plain = { version = "1.0" } diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py index b23a599..f856d42 100644 --- a/py-forust/forust/__init__.py +++ b/py-forust/forust/__init__.py @@ -282,8 +282,10 @@ def __init__( learning_rate: float = 0.3, max_depth: int = 5, max_leaves: int = sys.maxsize, + l1: float = 0.0, l2: float = 1.0, gamma: float = 0.0, + max_delta_step: float = 0.0, min_leaf_weight: float = 1.0, base_score: float = 0.5, nbins: int = 256, @@ -293,6 +295,7 @@ def __init__( subsample: float = 1.0, top_rate: float = 0.1, other_rate: float = 0.2, + colsample_bytree: float = 1.0, seed: int = 0, missing: float = np.nan, create_missing_branch: bool = False, @@ -321,9 +324,12 @@ def __init__( conservative the weights will be. Defaults to 0.3. max_depth (int, optional): Maximum depth of an individual tree. Valid values are 0 to infinity. Defaults to 5. max_leaves (int, optional): Maximum number of leaves allowed on a tree. Valid values are 0 to infinity. This is the total number of final nodes. Defaults to sys.maxsize. + l1 (float, optional): L1 regularization term applied to the weights of the tree. Valid values are 0 to infinity. Defaults to 0.0. l2 (float, optional): L2 regularization term applied to the weights of the tree. Valid values are 0 to infinity. Defaults to 1.0. gamma (float, optional): The minimum amount of loss required to further split a node. Valid values are 0 to infinity. Defaults to 0.0. + max_delta_step (float, optional): Maximum delta step allowed at each leaf. This is the maximum magnitude a + leaf can take. Setting to 0 results in no constraint. Defaults to 0.0. min_leaf_weight (float, optional): Minimum sum of the hessian values of the loss function required to be in a node. Defaults to 1.0. base_score (float, optional): The initial prediction value of the model. If `initialize_base_score` @@ -355,7 +361,8 @@ def __init__( subsample (float, optional): Percent of records to randomly sample at each iteration when training a tree. Defaults to 1.0, meaning all data is used to training. top_rate (float, optional): Used only in goss. The retain ratio of large gradient data. - other_rate (float, optional):Used only in goss. the retain ratio of small gradient data. + other_rate (float, optional): Used only in goss. The retain ratio of small gradient data. + colsample_bytree (float, optional): Specify the fraction of columns that should be sampled at each iteration, valid values are in the range `(0.0,1.0]`. seed (integer, optional): Integer value used to seed any randomness used in the algorithm. Defaults to 0.
missing (float, optional): Value to consider missing, when training and predicting @@ -452,8 +459,10 @@ def __init__( learning_rate=learning_rate, max_depth=max_depth, max_leaves=max_leaves, + l1=l1, l2=l2, gamma=gamma, + max_delta_step=max_delta_step, min_leaf_weight=min_leaf_weight, base_score=base_score, nbins=nbins, @@ -463,6 +472,7 @@ def __init__( subsample=subsample, top_rate=top_rate, other_rate=other_rate, + colsample_bytree=colsample_bytree, seed=seed, missing=missing, create_missing_branch=create_missing_branch, @@ -485,8 +495,10 @@ def __init__( self.learning_rate = learning_rate self.max_depth = max_depth self.max_leaves = max_leaves + self.l1 = l1 self.l2 = l2 self.gamma = gamma + self.max_delta_step = max_delta_step self.min_leaf_weight = min_leaf_weight with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -496,6 +508,9 @@ def __init__( self.allow_missing_splits = allow_missing_splits self.monotone_constraints = monotone_constraints_ self.subsample = subsample + self.top_rate = top_rate + self.other_rate = other_rate + self.colsample_bytree = colsample_bytree self.seed = seed self.missing = missing self.create_missing_branch = create_missing_branch @@ -1062,6 +1077,13 @@ def __setstate__(self, d: dict[Any, Any]) -> None: # Load the booster object the pickled JSon string. booster_object = CrateGradientBooster.from_json(d["__booster_json_file__"]) d["booster"] = booster_object + # Are there any new parameters, that need to be added to the python object, + # that would have been loaded in as defaults on the json object? + # This makes sure that defaults set with a serde default function get + # carried through to the python object. + for p, v in booster_object.get_params().items(): + if p not in d: + d[p] = v del d["__booster_json_file__"] self.__dict__ = d @@ -1119,16 +1141,22 @@ def get_node_lists(self, map_features_names: bool = True) -> list[list[Node]]: """ model = json.loads(self.json_dump())["trees"] feature_map: dict[int, str] | dict[int, int] + leaf_split_feature: str | int if map_features_names and hasattr(self, "feature_names_in_"): feature_map = {i: ft for i, ft in enumerate(self.feature_names_in_)} + leaf_split_feature = "" else: feature_map = {i: i for i in range(self.n_features_)} + leaf_split_feature = -1 trees = [] for t in model: tree = [] for n in t["nodes"]: - n["split_feature"] = feature_map[n["split_feature"]] + if not n["is_leaf"]: + n["split_feature"] = feature_map[n["split_feature"]] + else: + n["split_feature"] = leaf_split_feature tree.append(Node(**n)) trees.append(tree) return trees diff --git a/py-forust/src/lib.rs b/py-forust/src/lib.rs index d13beed..4fa4816 100644 --- a/py-forust/src/lib.rs +++ b/py-forust/src/lib.rs @@ -57,8 +57,10 @@ impl GradientBooster { learning_rate, max_depth, max_leaves, + l1, l2, gamma, + max_delta_step, min_leaf_weight, base_score, nbins, @@ -68,6 +70,7 @@ impl GradientBooster { subsample, top_rate, other_rate, + colsample_bytree, seed, missing, create_missing_branch, @@ -87,8 +90,10 @@ impl GradientBooster { learning_rate: f32, max_depth: usize, max_leaves: usize, + l1: f32, l2: f32, gamma: f32, + max_delta_step: f32, min_leaf_weight: f32, base_score: f64, nbins: u16, @@ -98,6 +103,7 @@ impl GradientBooster { subsample: f32, top_rate: f64, other_rate: f64, + colsample_bytree: f64, seed: u64, missing: f64, create_missing_branch: bool, @@ -130,8 +136,10 @@ impl GradientBooster { learning_rate, max_depth, max_leaves, + l1, l2, gamma, + max_delta_step, min_leaf_weight, base_score, nbins, @@ -141,6 +149,7 @@ 
impl GradientBooster { subsample, top_rate, other_rate, + colsample_bytree, seed, missing, create_missing_branch, @@ -374,8 +383,10 @@ impl GradientBooster { ("learning_rate", self.booster.learning_rate.to_object(py)), ("max_depth", self.booster.max_depth.to_object(py)), ("max_leaves", self.booster.max_leaves.to_object(py)), + ("l1", self.booster.l1.to_object(py)), ("l2", self.booster.l2.to_object(py)), ("gamma", self.booster.gamma.to_object(py)), + ("max_delta_step", self.booster.max_delta_step.to_object(py)), ( "min_leaf_weight", self.booster.min_leaf_weight.to_object(py), ), @@ -391,6 +402,10 @@ impl GradientBooster { ("subsample", self.booster.subsample.to_object(py)), ("top_rate", self.booster.top_rate.to_object(py)), ("other_rate", self.booster.other_rate.to_object(py)), + ( + "colsample_bytree", + self.booster.colsample_bytree.to_object(py), + ), ("seed", self.booster.seed.to_object(py)), ("missing", self.booster.missing.to_object(py)), ( diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py index 7e3679c..03ae916 100644 --- a/py-forust/tests/test_booster.py +++ b/py-forust/tests/test_booster.py @@ -15,7 +15,7 @@ from xgboost import XGBClassifier, XGBRegressor import forust -from forust import GradientBooster +from forust import GradientBooster, Node def loggodds_to_odds(v): @@ -30,6 +30,53 @@ def X_y() -> Tuple[pd.DataFrame, pd.Series]: return X, y +def test_booster_no_variance(X_y): + X, y = X_y + X.iloc[:, 3] = 1 + X.iloc[:, 1] = np.nan + xmod = XGBClassifier( + n_estimators=100, + learning_rate=0.3, + max_depth=5, + reg_lambda=1, + reg_alpha=0.0, + min_child_weight=1, + gamma=1, + objective="binary:logitraw", + eval_metric="auc", + tree_method="hist", + max_bin=10000, + ) + xmod.fit(X, y) + xmod_preds = xmod.predict(X, output_margin=True) + + fmod = GradientBooster( + base_score=0.5, + iterations=100, + learning_rate=0.3, + max_depth=5, + l1=0.0, + l2=1, + min_leaf_weight=1, + gamma=1, + objective_type="LogLoss", + nbins=500, + parallel=True, + initialize_base_score=False, + ) + fmod.fit(X, y=y) + fmod_preds = fmod.predict(X) + assert fmod.feature_importances_[1] == 0.0 + assert fmod.feature_importances_[3] == 0.0 + assert np.allclose(fmod_preds, xmod_preds, atol=0.00001) + + fmod.fit(X.iloc[:, [1]], y) + assert len(np.unique(fmod.predict(X.iloc[:, [1]]))) == 1 + + fmod.fit(X.iloc[:, [3]], y) + assert len(np.unique(fmod.predict(X.iloc[:, [3]]))) == 1 + + def test_booster_to_xgboosts(X_y): X, y = X_y X = X.fillna(0) @@ -62,6 +109,125 @@ def test_booster_to_xgboosts(X_y): assert np.allclose(fmod_preds, xmod_preds, atol=0.00001) + +@pytest.mark.parametrize("l1", [0.0, 0.3, 1.0, 3.0]) +def test_booster_to_xgboosts_l1(X_y, l1): + # Small differences in the splits make a big difference + # when l1 is used. + c = ["pclass"] + X, y = X_y + X = X[c].fillna(0) + xmod = XGBClassifier( + n_estimators=5, + learning_rate=0.3, + max_depth=5, + reg_lambda=1, + min_child_weight=1.0, + gamma=0, + reg_alpha=l1, + objective="binary:logitraw", + tree_method="exact", + ) + xmod.fit(X, y) + xmod_preds = xmod.predict(X, output_margin=True) + + fmod = GradientBooster( + base_score=0.5, + iterations=5, + learning_rate=0.3, + max_depth=5, + l2=1, + l1=l1, + min_leaf_weight=1.0, + gamma=0, + objective_type="LogLoss", + initialize_base_score=False, + ) + fmod.fit(X, y=y) + fmod_preds = fmod.predict(X) + assert np.allclose(fmod_preds, xmod_preds, atol=0.0001) + + # A model trained without it is different.
+ if l1 > 0: + fmod2 = GradientBooster( + base_score=0.5, + iterations=5, + learning_rate=0.3, + max_depth=5, + l2=1, + min_leaf_weight=1.0, + gamma=0, + objective_type="LogLoss", + initialize_base_score=False, + ) + fmod2.fit(X, y=y) + fmod2_preds = fmod2.predict(X) + assert not np.allclose(fmod2_preds, fmod_preds, atol=0.0001) + + +@pytest.mark.parametrize("max_delta_step", [0.0, 1.0, 2.0]) +def test_booster_to_xgboosts_max_delta_step(X_y, max_delta_step): + # Small differences in the splits make a big difference + # when max_delta_step is used. + X, y = X_y + c = X.columns + X = X[c].fillna(0) + xmod = XGBClassifier( + n_estimators=5, + learning_rate=0.3, + max_depth=5, + reg_lambda=1, + min_child_weight=1.0, + gamma=0, + max_delta_step=max_delta_step, + objective="binary:logitraw", + tree_method="exact", + ) + xmod.fit(X, y) + xmod_preds = xmod.predict(X, output_margin=True) + + fmod = GradientBooster( + base_score=0.5, + iterations=5, + learning_rate=0.3, + max_depth=5, + l2=1, + max_delta_step=max_delta_step, + min_leaf_weight=1.0, + gamma=0, + objective_type="LogLoss", + initialize_base_score=False, + ) + fmod.fit(X, y=y) + fmod_preds = fmod.predict(X) + assert np.allclose(fmod_preds, xmod_preds, atol=0.0001) + + # A model trained without it is different. + if max_delta_step > 0: + # The node weights will be capped at max_delta_step*learning_rate + max_w = [] + for tree in fmod.get_node_lists(): + max_w.append(max(abs(n.weight_value) for n in tree)) + assert max(max_w) <= max_delta_step * 0.3 + fmod2 = GradientBooster( + base_score=0.5, + iterations=5, + learning_rate=0.3, + max_depth=5, + l2=1, + min_leaf_weight=1.0, + gamma=0, + objective_type="LogLoss", + initialize_base_score=False, + ) + fmod2.fit(X, y=y) + fmod2_preds = fmod2.predict(X) + assert not np.allclose(fmod2_preds, fmod_preds, atol=0.0001) + max_w = [] + for tree in fmod2.get_node_lists(): + max_w.append(max(abs(n.weight_value) for n in tree)) + assert max(max_w) > max_delta_step * 0.3 + + def test_sklearn_clone(X_y): X, y = X_y fmod = GradientBooster( @@ -116,6 +282,47 @@ def test_multiple_fit_calls(X_y): assert np.allclose(fmod_preds, fmod_fit_again_preds) + +@pytest.mark.parametrize( + "colsample_bytree,create_missing_branch", + list(itertools.product([0.25, 0.5, 0.75], [True, False])), +) +def test_colsample_bytree(X_y, colsample_bytree, create_missing_branch): + X, y = X_y + fmod1 = GradientBooster(create_missing_branch=create_missing_branch) + fmod1.fit(X, y=y) + fmod1_preds = fmod1.predict(X) + + fmod2 = GradientBooster( + colsample_bytree=colsample_bytree, create_missing_branch=create_missing_branch + ) + fmod2.fit(X, y=y) + fmod2_preds = fmod2.predict(X) + + assert not np.allclose(fmod1_preds, fmod2_preds) + + # Assert that every tree uses at most a colsample_bytree fraction of the features.
+ trees = fmod2.get_node_lists() + + def gather_feature_names( + node: Node, tree: list[Node], features: set[str | int] + ) -> None: + if not node.is_leaf: + features.add(node.split_feature) + gather_feature_names(tree[node.right_child], tree, features) + gather_feature_names(tree[node.left_child], tree, features) + gather_feature_names(tree[node.missing_node], tree, features) + + total_features = set() + features = set() + for tree in trees: + features = set() + gather_feature_names(tree[0], tree, features) + assert len(features) > 0 + assert len(features) <= (len(X.columns) * colsample_bytree) + total_features.update(features) + assert len(total_features) > len(features) + + def test_different_data_passed(X_y): X, y = X_y fmod = GradientBooster( @@ -185,7 +392,10 @@ def test_booster_from_numpy(X_y): itertools.product([True, False], [True, False], [-9999, np.nan, 11, 9999]), ) def test_booster_to_xgboosts_with_missing( - X_y, with_mono: bool, reverse: bool, missing: float + X_y, + with_mono: bool, + reverse: bool, + missing: float, ): X, y = X_y if with_mono: @@ -201,6 +411,7 @@ def test_booster_to_xgboosts_with_missing( learning_rate=0.3, max_depth=5, reg_lambda=1, + reg_alpha=0.0, min_child_weight=1, gamma=1, objective="binary:logitraw", @@ -218,6 +429,7 @@ def test_booster_to_xgboosts_with_missing( iterations=100, learning_rate=0.3, max_depth=5, + l1=0.0, l2=1, min_leaf_weight=1, gamma=1, diff --git a/rs-example.md b/rs-example.md index 88a5cc7..bcf2944 100644 --- a/rs-example.md +++ b/rs-example.md @@ -3,7 +3,7 @@ To run this example, add the following code to your `Cargo.toml` file. ```toml [dependencies] -forust-ml = "0.4.3" +forust-ml = "0.4.4" polars = "0.28" reqwest = { version = "0.11", features = ["blocking"] } ``` diff --git a/src/binning.rs b/src/binning.rs index 4197234..6ae1d91 100644 --- a/src/binning.rs +++ b/src/binning.rs @@ -101,9 +101,9 @@ pub fn bin_matrix( let mut col_cuts = percentiles_or_value(&no_miss, &w, &pcts); col_cuts.push(f64::MAX); col_cuts.dedup(); - if col_cuts.len() < 2 { - return Err(ForustError::NoVariance(i)); - } + // if col_cuts.len() < 2 { + // return Err(ForustError::NoVariance(i)); + // } // There will be one less bins, then there are cuts. // The first value will be for missing. nunique.push(col_cuts.len()); diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs index 1d794e1..d887e44 100644 --- a/src/gradientbooster.rs +++ b/src/gradientbooster.rs @@ -14,6 +14,7 @@ use crate::tree::Tree; use crate::utils::{fmt_vec_output, odds, validate_positive_float_field}; use log::info; use rand::rngs::StdRng; +use rand::seq::IteratorRandom; use rand::SeedableRng; use rayon::prelude::*; use serde::{Deserialize, Deserializer, Serialize}; @@ -93,12 +94,19 @@ pub struct GradientBooster { /// Maximum number of leaves allowed on a tree. Valid values /// are 0 to infinity. This is the total number of final nodes. pub max_leaves: usize, + /// L1 regularization term applied to the weights of the tree. Valid values + /// are 0 to infinity. 0 means no regularization is applied. + #[serde(default = "default_l1")] + pub l1: f32, /// L2 regularization term applied to the weights of the tree. Valid values /// are 0 to infinity. pub l2: f32, /// The minimum amount of loss required to further split a node. /// Valid values are 0 to infinity. pub gamma: f32, /// Maximum delta step allowed at each leaf. This is the maximum magnitude a leaf can take. Setting to 0 results in no constraint.
+ #[serde(default = "default_max_delta_step")] + pub max_delta_step: f32, /// Minimum sum of the hessian values of the loss function /// required to be in a node. pub min_leaf_weight: f32, @@ -125,6 +133,9 @@ pub struct GradientBooster { /// Used only in goss. the retain ratio of small gradient data. #[serde(default = "default_other_rate")] pub other_rate: f64, + /// Specify the fraction of columns that should be sampled at each iteration, valid values are in the range (0.0,1.0]. + #[serde(default = "default_colsample_bytree")] + pub colsample_bytree: f64, /// Integer value used to seed any randomness used in the algorithm. pub seed: u64, /// Value to consider missing. @@ -177,6 +188,13 @@ pub struct GradientBooster { metadata: HashMap, } +fn default_l1() -> f32 { + 0.0 +} +fn default_max_delta_step() -> f32 { + 0.0 +} + fn default_initialize_base_score() -> bool { false } @@ -212,7 +230,9 @@ fn default_prediction_iteration() -> Option { fn default_terminate_missing_features() -> HashSet { HashSet::new() } - +fn default_colsample_bytree() -> f64 { + 1.0 +} fn default_missing_node_treatment() -> MissingNodeTreatment { MissingNodeTreatment::AssignToParent } @@ -239,8 +259,10 @@ impl Default for GradientBooster { 0.3, 5, usize::MAX, + 0., 1., 0., + 0., 1., 0.5, 256, @@ -250,6 +272,7 @@ impl Default for GradientBooster { 1., 0.1, 0.2, + 1.0, 0, f64::NAN, false, @@ -301,6 +324,7 @@ impl GradientBooster { /// * `subsample` - Percent of records to randomly sample at each iteration when training a tree. /// * `top_rate` - Used only in goss. The retain ratio of large gradient data. /// * `other_rate` - Used only in goss. the retain ratio of small gradient data. + /// * `colsample_bytree` - Specify the fraction of columns that should be sampled at each iteration, valid values are in the range (0.0,1.0]. /// * `seed` - Integer value used to seed any randomness used in the algorithm. /// * `missing` - Value to consider missing. /// * `create_missing_branch` - Should missing be split out it's own separate branch? 
@@ -317,8 +341,10 @@ impl GradientBooster { learning_rate: f32, max_depth: usize, max_leaves: usize, + l1: f32, l2: f32, gamma: f32, + max_delta_step: f32, min_leaf_weight: f32, base_score: f64, nbins: u16, @@ -328,6 +354,7 @@ impl GradientBooster { subsample: f32, top_rate: f64, other_rate: f64, + colsample_bytree: f64, seed: u64, missing: f64, create_missing_branch: bool, @@ -347,8 +374,10 @@ impl GradientBooster { learning_rate, max_depth, max_leaves, + l1, l2, gamma, + max_delta_step, min_leaf_weight, base_score, nbins, @@ -358,6 +387,7 @@ impl GradientBooster { subsample, top_rate, other_rate, + colsample_bytree, seed, missing, create_missing_branch, @@ -382,12 +412,15 @@ impl GradientBooster { fn validate_parameters(&self) -> Result<(), ForustError> { validate_positive_float_field!(self.learning_rate); + validate_positive_float_field!(self.l1); validate_positive_float_field!(self.l2); validate_positive_float_field!(self.gamma); + validate_positive_float_field!(self.max_delta_step); validate_positive_float_field!(self.min_leaf_weight); validate_positive_float_field!(self.subsample); validate_positive_float_field!(self.top_rate); validate_positive_float_field!(self.other_rate); + validate_positive_float_field!(self.colsample_bytree); Ok(()) } @@ -411,7 +444,9 @@ impl GradientBooster { .to_owned(); if self.create_missing_branch { let splitter = MissingBranchSplitter { + l1: self.l1, l2: self.l2, + max_delta_step: self.max_delta_step, gamma: self.gamma, min_leaf_weight: self.min_leaf_weight, learning_rate: self.learning_rate, @@ -424,7 +459,9 @@ impl GradientBooster { self.fit_trees(y, sample_weight, data, &splitter, evaluation_data)?; } else { let splitter = MissingImputerSplitter { + l1: self.l1, l2: self.l2, + max_delta_step: self.max_delta_step, gamma: self.gamma, min_leaf_weight: self.min_leaf_weight, learning_rate: self.learning_rate, @@ -518,7 +555,7 @@ impl GradientBooster { // This will always be false, unless early stopping rounds are used. let mut stop_early = false; - + let col_index: Vec = (0..data.cols).collect(); for i in 0..self.iterations { let verbose = if self.log_iterations == 0 { false @@ -530,9 +567,31 @@ impl GradientBooster { self.sample_index(&mut rng, &data.index, &mut grad, &mut hess); let mut tree = Tree::new(); + // If we are doing any column sampling... + let colsample_index: Vec = if self.colsample_bytree == 1.0 { + Vec::new() + } else { + let amount = ((col_index.len() as f64) * self.colsample_bytree).floor() as usize; + let mut v: Vec = col_index + .iter() + .choose_multiple(&mut rng, amount) + .iter() + .map(|i| **i) + .collect(); + v.sort(); + v + }; + + let fit_col_index = if self.colsample_bytree == 1.0 { + &col_index + } else { + &colsample_index + }; + tree.fit( &bdata, chosen_index, + fit_col_index, &binned_data.cuts, &grad, &hess, @@ -1020,6 +1079,13 @@ impl GradientBooster { self } + /// Set the l1 on the booster. + /// * `l1` - The l1 regulation term of the booster. + pub fn set_l1(mut self, l1: f32) -> Self { + self.l1 = l1; + self + } + /// Set the l2 on the booster. /// * `l2` - The l2 regulation term of the booster. pub fn set_l2(mut self, l2: f32) -> Self { @@ -1034,6 +1100,13 @@ impl GradientBooster { self } + /// Set the max_delta_step on the booster. + /// * `max_delta_step` - The max_delta_step value of the booster. + pub fn set_max_delta_step(mut self, max_delta_step: f32) -> Self { + self.max_delta_step = max_delta_step; + self + } + /// Set the min_leaf_weight on the booster. 
/// * `min_leaf_weight` - The minimum sum of the hession values allowed in the /// node of a tree of the booster. @@ -1084,6 +1157,13 @@ impl GradientBooster { self } + /// Set the colsample_bytree on the booster. + /// * `colsample_bytree` - Percent of the columns to randomly sample when training each tree. + pub fn set_colsample_bytree(mut self, colsample_bytree: f64) -> Self { + self.colsample_bytree = colsample_bytree; + self + } + /// Set the seed on the booster. /// * `seed` - Integer value used to see any randomness used in the algorithm. pub fn set_seed(mut self, seed: u64) -> Self { diff --git a/src/histogram.rs b/src/histogram.rs index 14e820a..bf1080f 100644 --- a/src/histogram.rs +++ b/src/histogram.rs @@ -116,16 +116,17 @@ impl HistogramMatrix { n_records: 0, }) } + #[allow(clippy::too_many_arguments)] pub fn new( data: &Matrix, cuts: &JaggedMatrix, grad: &[f32], hess: &[f32], index: &[usize], + col_index: &[usize], parallel: bool, sort: bool, ) -> Self { - let col_index: Vec = (0..data.cols).collect(); // Sort gradients and hessians to reduce cache hits. // This made a really sizeable difference on larger datasets // Bringing training time down from nearly 6 minutes, to 2 minutes. @@ -172,11 +173,32 @@ impl HistogramMatrix { }) .collect::>>() }; + + // If we have sampled down the columns, we need to recalculate the ends. + // we can do this by iterating over the cut's, as this will be the size + // of the histograms. + let ends: Vec = if col_index.len() == data.cols { + cuts.ends.to_owned() + } else { + col_index + .iter() + .scan(0_usize, |state, i| { + *state += cuts.get_col(*i).len(); + Some(*state) + }) + .collect() + }; + let n_records = if col_index.len() == data.cols { + cuts.n_records + } else { + ends.iter().sum() + }; + HistogramMatrix(JaggedMatrix { data: histograms, - ends: cuts.ends.to_owned(), - cols: cuts.cols, - n_records: cuts.n_records, + ends, + cols: col_index.len(), + n_records, }) } diff --git a/src/partial_dependence.rs b/src/partial_dependence.rs index e765791..2b2a682 100644 --- a/src/partial_dependence.rs +++ b/src/partial_dependence.rs @@ -97,7 +97,9 @@ mod tests { let data = Matrix::new(&data_vec, 891, 5); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -108,10 +110,11 @@ mod tests { let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); - + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, data.index.to_owned(), + &col_index, &b.cuts, &g, &h, diff --git a/src/splitter.rs b/src/splitter.rs index c5a51ca..c65f4c4 100644 --- a/src/splitter.rs +++ b/src/splitter.rs @@ -50,7 +50,9 @@ pub trait Splitter { fn get_constraint(&self, feature: &usize) -> Option<&Constraint>; // fn get_allow_missing_splits(&self) -> bool; fn get_gamma(&self) -> f32; + fn get_l1(&self) -> f32; fn get_l2(&self) -> f32; + fn get_max_delta_step(&self) -> f32; fn get_learning_rate(&self) -> f32; /// Perform any post processing on the tree that is @@ -62,12 +64,11 @@ pub trait Splitter { /// Find the best possible split, considering all feature histograms. /// If we wanted to add Column sampling, this is probably where /// we would need to do it, otherwise, it would be at the tree level. 
- fn best_split(&self, node: &SplittableNode) -> Option { + fn best_split(&self, node: &SplittableNode, col_index: &[usize]) -> Option { let mut best_split_info = None; let mut best_gain = 0.0; - let HistogramMatrix(histograms) = &node.histograms; - for i in 0..histograms.cols { - let split_info = self.best_feature_split(node, i); + for (idx, feature) in col_index.iter().enumerate() { + let split_info = self.best_feature_split(node, *feature, idx); match split_info { Some(info) => { if info.split_gain > best_gain { @@ -98,12 +99,19 @@ pub trait Splitter { constraint: Option<&Constraint>, ) -> Option<(NodeInfo, NodeInfo, MissingInfo)>; - fn best_feature_split(&self, node: &SplittableNode, feature: usize) -> Option { + /// The idx is the index of the feature in the histogram data, whereas feature + /// is the index of the actual feature in the data. + fn best_feature_split( + &self, + node: &SplittableNode, + feature: usize, + idx: usize, + ) -> Option { let mut split_info: Option = None; let mut max_gain: Option = None; let HistogramMatrix(histograms) = &node.histograms; - let histogram = histograms.get_col(feature); + let histogram = histograms.get_col(idx); // We also know we will have a missing bin. let missing = &histogram[0]; @@ -200,6 +208,7 @@ pub trait Splitter { n_nodes: &usize, node: &mut SplittableNode, index: &mut [usize], + col_index: &[usize], data: &Matrix, cuts: &JaggedMatrix, grad: &[f32], @@ -215,15 +224,16 @@ pub trait Splitter { n_nodes: &usize, node: &mut SplittableNode, index: &mut [usize], + col_index: &[usize], data: &Matrix, cuts: &JaggedMatrix, grad: &[f32], hess: &[f32], parallel: bool, ) -> Vec { - match self.best_split(node) { + match self.best_split(node, col_index) { Some(split_info) => self.handle_split_info( - split_info, n_nodes, node, index, data, cuts, grad, hess, parallel, + split_info, n_nodes, node, index, col_index, data, cuts, grad, hess, parallel, ), None => Vec::new(), } @@ -236,7 +246,9 @@ pub trait Splitter { /// If this node is able, it will be split further, otherwise it will /// a leaf node will be generated. pub struct MissingBranchSplitter { + pub l1: f32, pub l2: f32, + pub max_delta_step: f32, pub gamma: f32, pub min_leaf_weight: f32, pub learning_rate: f32, @@ -317,9 +329,16 @@ impl Splitter for MissingBranchSplitter { self.gamma } + fn get_l1(&self) -> f32 { + self.l1 + } + fn get_l2(&self) -> f32 { self.l2 } + fn get_max_delta_step(&self) -> f32 { + self.max_delta_step + } fn get_learning_rate(&self) -> f32 { self.learning_rate @@ -348,7 +367,9 @@ impl Splitter for MissingBranchSplitter { } let mut left_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, left_gradient, left_hessian, lower_bound, @@ -356,7 +377,9 @@ impl Splitter for MissingBranchSplitter { constraint, ); let mut right_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, right_gradient, right_hessian, lower_bound, @@ -387,7 +410,9 @@ impl Splitter for MissingBranchSplitter { // Set weight based on the missing node treatment. 
let missing_weight = match self.missing_node_treatment { MissingNodeTreatment::AssignToParent => constrained_weight( + &self.get_l1(), &self.get_l2(), + &self.max_delta_step, missing_gradient + left_gradient + right_gradient, missing_hessian + left_hessian + right_hessian, lower_bound, @@ -407,7 +432,9 @@ impl Splitter for MissingBranchSplitter { parent_weight } else { constrained_weight( + &self.get_l1(), &self.get_l2(), + &self.max_delta_step, missing_gradient, missing_hessian, lower_bound, @@ -473,6 +500,7 @@ impl Splitter for MissingBranchSplitter { n_nodes: &usize, node: &mut SplittableNode, index: &mut [usize], + col_index: &[usize], data: &Matrix, cuts: &JaggedMatrix, grad: &[f32], @@ -484,7 +512,7 @@ impl Splitter for MissingBranchSplitter { let right_child = missing_child + 2; node.update_children(missing_child, left_child, right_child, &split_info); - let (missing_is_leaf, mut missing_info) = match split_info.missing_node { + let (mut missing_is_leaf, mut missing_info) = match split_info.missing_node { MissingInfo::Branch(i) => { if self .terminate_missing_features @@ -542,6 +570,9 @@ impl Splitter for MissingBranchSplitter { let right_histograms: HistogramMatrix; let missing_histograms: HistogramMatrix; if n_missing == 0 { + // If there are no missing records, we know the missing value + // will be a leaf, assign this node as a leaf. + missing_is_leaf = true; if max_ == 1 { missing_histograms = HistogramMatrix::empty(); right_histograms = HistogramMatrix::new( @@ -550,6 +581,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[split_idx..node.stop_idx], + col_index, parallel, true, ); @@ -563,6 +595,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[missing_split_idx..split_idx], + col_index, parallel, true, ); @@ -578,6 +611,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[missing_split_idx..split_idx], + col_index, parallel, true, ); @@ -587,6 +621,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[split_idx..node.stop_idx], + col_index, parallel, true, ); @@ -602,6 +637,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[node.start_idx..missing_split_idx], + col_index, parallel, true, ); @@ -611,6 +647,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[split_idx..node.stop_idx], + col_index, parallel, true, ); @@ -627,6 +664,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[node.start_idx..missing_split_idx], + col_index, parallel, true, ); @@ -636,6 +674,7 @@ impl Splitter for MissingBranchSplitter { grad, hess, &index[missing_split_idx..split_idx], + col_index, parallel, true, ); @@ -680,7 +719,9 @@ impl Splitter for MissingBranchSplitter { /// them down either the right or left branch, depending /// on which results in a higher increase in gain. pub struct MissingImputerSplitter { + pub l1: f32, pub l2: f32, + pub max_delta_step: f32, pub gamma: f32, pub min_leaf_weight: f32, pub learning_rate: f32, @@ -690,8 +731,11 @@ pub struct MissingImputerSplitter { impl MissingImputerSplitter { /// Generate a new missing imputer splitter object. 
+ #[allow(clippy::too_many_arguments)] pub fn new( + l1: f32, l2: f32, + max_delta_step: f32, gamma: f32, min_leaf_weight: f32, learning_rate: f32, @@ -699,7 +743,9 @@ impl MissingImputerSplitter { constraints_map: ConstraintMap, ) -> Self { MissingImputerSplitter { + l1, l2, + max_delta_step, gamma, min_leaf_weight, learning_rate, @@ -718,9 +764,16 @@ impl Splitter for MissingImputerSplitter { self.gamma } + fn get_l1(&self) -> f32 { + self.l1 + } + fn get_l2(&self) -> f32 { self.l2 } + fn get_max_delta_step(&self) -> f32 { + self.max_delta_step + } fn get_learning_rate(&self) -> f32 { self.learning_rate @@ -760,7 +813,9 @@ impl Splitter for MissingImputerSplitter { let mut right_hessian = right_hessian; let mut left_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, left_gradient, left_hessian, lower_bound, @@ -768,7 +823,9 @@ impl Splitter for MissingImputerSplitter { constraint, ); let mut right_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, right_gradient, right_hessian, lower_bound, @@ -798,7 +855,9 @@ impl Splitter for MissingImputerSplitter { // back to f32... // The weight if missing went left let missing_left_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, left_gradient + missing_gradient, left_hessian + missing_hessian, lower_bound, @@ -822,7 +881,9 @@ impl Splitter for MissingImputerSplitter { // The gain if missing went right let missing_right_weight = constrained_weight( + &self.l1, &self.l2, + &self.max_delta_step, right_gradient + missing_gradient, right_hessian + missing_hessian, lower_bound, @@ -890,6 +951,7 @@ impl Splitter for MissingImputerSplitter { n_nodes: &usize, node: &mut SplittableNode, index: &mut [usize], + col_index: &[usize], data: &Matrix, cuts: &JaggedMatrix, grad: &[f32], @@ -937,6 +999,7 @@ impl Splitter for MissingImputerSplitter { grad, hess, &index[node.start_idx..split_idx], + col_index, parallel, true, ); @@ -949,6 +1012,7 @@ impl Splitter for MissingImputerSplitter { grad, hess, &index[split_idx..node.stop_idx], + col_index, parallel, true, ); @@ -1003,9 +1067,11 @@ mod tests { let b = bin_matrix(&data, &w, 10, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); let index = data.index.to_owned(); - let hists = HistogramMatrix::new(&bdata, &b.cuts, &grad, &hess, &index, true, true); + let hists = HistogramMatrix::new(&bdata, &b.cuts, &grad, &hess, &index, &[0], true, true); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 0.0, + max_delta_step: 0., gamma: 0.0, min_leaf_weight: 0.0, learning_rate: 1.0, @@ -1026,7 +1092,7 @@ mod tests { f32::NEG_INFINITY, f32::INFINITY, ); - let s = splitter.best_feature_split(&mut n, 0).unwrap(); + let s = splitter.best_feature_split(&mut n, 0, 0).unwrap(); assert_eq!(s.split_value, 4.0); assert_eq!(s.left_node.cover, 0.75); assert_eq!(s.right_node.cover, 1.0); @@ -1047,10 +1113,13 @@ mod tests { let b = bin_matrix(&data, &w, 10, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); let index = data.index.to_owned(); - let hists = HistogramMatrix::new(&bdata, &b.cuts, &grad, &hess, &index, true, true); + let hists = + HistogramMatrix::new(&bdata, &b.cuts, &grad, &hess, &index, &[0, 1], true, true); println!("{:?}", hists); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 0.0, + max_delta_step: 0., gamma: 0.0, min_leaf_weight: 0.0, learning_rate: 1.0, @@ -1071,7 +1140,7 @@ mod tests { f32::NEG_INFINITY, f32::INFINITY, ); - let s = splitter.best_split(&mut n).unwrap(); + 
let s = splitter.best_split(&mut n, &[0, 1]).unwrap(); println!("{:?}", s); assert_eq!(s.split_feature, 1); assert_eq!(s.split_value, 4.); @@ -1095,7 +1164,9 @@ mod tests { let (grad, hess) = LogLoss::calc_grad_hess(&y, &yhat, &w); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -1104,14 +1175,23 @@ mod tests { }; let gradient_sum = grad.iter().copied().sum(); let hessian_sum = hess.iter().copied().sum(); + let root_weight = weight( + &splitter.l1, + &splitter.l2, + &splitter.max_delta_step, + gradient_sum, + hessian_sum, + ); let root_gain = gain(&splitter.l2, gradient_sum, hessian_sum); - let root_weight = weight(&splitter.l2, gradient_sum, hessian_sum); let data = Matrix::new(&data_vec, 891, 5); let b = bin_matrix(&data, &w, 10, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); let index = data.index.to_owned(); - let hists = HistogramMatrix::new(&bdata, &b.cuts, &grad, &hess, &index, true, false); + let col_index: Vec = (0..data.cols).collect(); + let hists = HistogramMatrix::new( + &bdata, &b.cuts, &grad, &hess, &index, &col_index, true, false, + ); let mut n = SplittableNode::new( 0, @@ -1127,7 +1207,7 @@ mod tests { f32::NEG_INFINITY, f32::INFINITY, ); - let s = splitter.best_split(&mut n).unwrap(); + let s = splitter.best_split(&mut n, &col_index).unwrap(); println!("{:?}", s); n.update_children(2, 1, 2, &s); assert_eq!(0, s.split_feature); diff --git a/src/tree.rs b/src/tree.rs index 44caf43..efa4885 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -34,6 +34,7 @@ impl Tree { &mut self, data: &Matrix, mut index: Vec, + col_index: &[usize], cuts: &JaggedMatrix, grad: &[f32], hess: &[f32], @@ -66,9 +67,16 @@ impl Tree { let mut n_nodes = 1; let root_gain = gain(&splitter.get_l2(), gradient_sum, hessian_sum); - let root_weight = weight(&splitter.get_l2(), gradient_sum, hessian_sum); + let root_weight = weight( + &splitter.get_l1(), + &splitter.get_l2(), + &splitter.get_max_delta_step(), + gradient_sum, + hessian_sum, + ); // Calculate the histograms for the root node. 
- let root_hists = HistogramMatrix::new(data, cuts, grad, hess, &index, parallel, sort); + let root_hists = + HistogramMatrix::new(data, cuts, grad, hess, &index, col_index, parallel, sort); let root_node = SplittableNode::new( 0, root_hists, @@ -123,7 +131,7 @@ impl Tree { n_leaves -= 1; let new_nodes = splitter.split_node( - &n_nodes, &mut node, &mut index, data, cuts, grad, hess, parallel, + &n_nodes, &mut node, &mut index, col_index, data, cuts, grad, hess, parallel, ); let n_new_nodes = new_nodes.len(); @@ -576,7 +584,9 @@ mod tests { let data = Matrix::new(&data_vec, 891, 5); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -591,9 +601,11 @@ mod tests { let (index, excluded) = RandomSampler::new(0.5).sample(&mut rng, &data.index, &mut g, &mut h); assert!(excluded.len() > 0); + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, index, + &col_index, &b.cuts, &g, &h, @@ -620,7 +632,9 @@ mod tests { let data = Matrix::new(&data_vec, 891, 5); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -631,9 +645,11 @@ mod tests { let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, data.index.to_owned(), + &col_index, &b.cuts, &g, &h, @@ -687,6 +703,55 @@ mod tests { } } + #[test] + fn test_tree_colsample() { + let file = fs::read_to_string("resources/contiguous_no_missing.csv") + .expect("Something went wrong reading the file"); + let data_vec: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); + let file = fs::read_to_string("resources/performance.csv") + .expect("Something went wrong reading the file"); + let y: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); + let yhat = vec![0.5; y.len()]; + let w = vec![1.; y.len()]; + let (g, h) = LogLoss::calc_grad_hess(&y, &yhat, &w); + + let data = Matrix::new(&data_vec, 891, 5); + let splitter = MissingImputerSplitter { + l1: 0.0, + l2: 1.0, + max_delta_step: 0., + gamma: 3.0, + min_leaf_weight: 1.0, + learning_rate: 0.3, + allow_missing_splits: true, + constraints_map: ConstraintMap::new(), + }; + let mut tree = Tree::new(); + + let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); + let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); + let col_index: Vec = vec![1, 3]; + tree.fit( + &bdata, + data.index.to_owned(), + &col_index, + &b.cuts, + &g, + &h, + &splitter, + usize::MAX, + 5, + false, + &SampleMethod::None, + &GrowPolicy::DepthWise, + ); + for n in tree.nodes { + if !n.is_leaf { + assert!((n.split_feature == 1) || (n.split_feature == 3)) + } + } + } + #[test] fn test_tree_fit_monotone() { let file = fs::read_to_string("resources/contiguous_no_missing.csv") @@ -704,7 +769,9 @@ mod tests { let data = Matrix::new(data_.get_col(1), 891, 1); let map = ConstraintMap::from([(0, Constraint::Negative)]); let splitter = MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 0.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -715,10 +782,11 @@ mod tests { let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); - + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, data.index.to_owned(), + &col_index, &b.cuts, &g, &h, @@ -785,7 +853,9 @@ mod tests { let data = Matrix::new(&data_vec, 891, 5); let splitter = 
MissingImputerSplitter { + l1: 0.0, l2: 1.0, + max_delta_step: 0., gamma: 3.0, min_leaf_weight: 1.0, learning_rate: 0.3, @@ -796,9 +866,11 @@ mod tests { let b = bin_matrix(&data, &w, 300, f64::NAN).unwrap(); let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); + let col_index: Vec = (0..data.cols).collect(); tree.fit( &bdata, data.index.to_owned(), + &col_index, &b.cuts, &g, &h, diff --git a/src/utils.rs b/src/utils.rs index ffe6696..ee48dd0 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -83,16 +83,19 @@ pub fn is_missing(value: &f64, missing: &f64) -> bool { /// Calculate the constraint weight given bounds /// and a constraint. +#[allow(clippy::too_many_arguments)] #[inline] pub fn constrained_weight( + l1: &f32, l2: &f32, + max_delta_step: &f32, gradient_sum: f32, hessian_sum: f32, lower_bound: f32, upper_bound: f32, constraint: Option<&Constraint>, ) -> f32 { - let weight = weight(l2, gradient_sum, hessian_sum); + let weight = weight(l1, l2, max_delta_step, gradient_sum, hessian_sum); match constraint { None | Some(Constraint::Unconstrained) => weight, _ => { @@ -202,11 +205,35 @@ pub fn cull_gain( } } +/// Calculate l1 regularization +#[inline] +pub fn l1_regularization(w: &f32, l1: &f32) -> f32 { + if l1 == &0. { + *w + } else if w > l1 { + w - l1 + } else if w < &-l1 { + w + l1 + } else { + 0.0 + } +} + /// Calculate the weight of a given node, given the sum /// of the gradients, and the hessians in a node. #[inline] -pub fn weight(l2: &f32, gradient_sum: f32, hessian_sum: f32) -> f32 { - -(gradient_sum / (hessian_sum + l2)) +pub fn weight( + l1: &f32, + l2: &f32, + max_delta_step: &f32, + gradient_sum: f32, + hessian_sum: f32, +) -> f32 { + let w = -(l1_regularization(&gradient_sum, l1) / (hessian_sum + l2)); + if (max_delta_step != &0.) && (&w.abs() > max_delta_step) { + return max_delta_step.copysign(w); + } + w } const LANES: usize = 16;
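Taken together, the new `l1` and `max_delta_step` handling ends up in the updated `weight`/`l1_regularization` helpers in `src/utils.rs`: the gradient sum is soft-thresholded by `l1`, divided by the hessian sum plus `l2`, and the magnitude of the result is clipped to `max_delta_step` when that value is non-zero. The sketch below is an illustrative Python re-implementation of that formula (it is not part of the patch itself), with a small worked example using made-up numbers.

```python
def leaf_weight(gradient_sum: float, hessian_sum: float,
                l1: float, l2: float, max_delta_step: float) -> float:
    # Soft-threshold the gradient sum by l1 (a no-op when l1 == 0),
    # mirroring l1_regularization() in src/utils.rs.
    if l1 == 0.0:
        g = gradient_sum
    elif gradient_sum > l1:
        g = gradient_sum - l1
    elif gradient_sum < -l1:
        g = gradient_sum + l1
    else:
        g = 0.0
    w = -(g / (hessian_sum + l2))
    # Cap the magnitude at max_delta_step (0 means unconstrained),
    # mirroring weight() in src/utils.rs.
    if max_delta_step != 0.0 and abs(w) > max_delta_step:
        return max_delta_step if w > 0.0 else -max_delta_step
    return w


# With l1 = 1 a gradient sum of -10 shrinks to -9, giving -(-9 / (4 + 1)) = 1.8;
# setting max_delta_step = 1 then clips that weight down to 1.0.
print(leaf_weight(-10.0, 4.0, l1=1.0, l2=1.0, max_delta_step=0.0))  # 1.8
print(leaf_weight(-10.0, 4.0, l1=1.0, l2=1.0, max_delta_step=1.0))  # 1.0
```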
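For completeness, here is a short end-to-end sketch of the new Python-side knobs (`l1`, `max_delta_step`, `colsample_bytree`) in the style of the updated tests in `py-forust/tests/test_booster.py`. The toy DataFrame below is hypothetical and only meant to show the call pattern; all parameter names and methods used here appear in the diff above.

```python
import numpy as np
import pandas as pd

from forust import GradientBooster

# Hypothetical toy data; the tests in this diff use the Titanic dataset instead.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
y = pd.Series((X["f0"] + rng.normal(size=200) > 0).astype(float))

model = GradientBooster(
    iterations=50,
    learning_rate=0.3,
    l1=0.3,                # new: L1 regularization on the leaf weights
    l2=1.0,
    max_delta_step=1.0,    # new: cap on the magnitude of each leaf weight
    colsample_bytree=0.5,  # new: fraction of columns sampled for each tree
    objective_type="LogLoss",
)
model.fit(X, y=y)
preds = model.predict(X)

# As in the new max_delta_step test, every node weight should be bounded by
# max_delta_step * learning_rate.
max_w = max(abs(n.weight_value) for tree in model.get_node_lists() for n in tree)
assert max_w <= 1.0 * 0.3
```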