diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index b3e57f8..2ac7a4c 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -17,12 +17,13 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.10", "3.11", "3.12"]
+        hlink_extras: ["dev", "dev,lightgbm,xgboost"]
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v4
 
     - name: Build the Docker image
-      run: docker build . --file Dockerfile --tag $HLINK_TAG-${{ matrix.python_version}} --build-arg PYTHON_VERSION=${{ matrix.python_version }}
+      run: docker build . --file Dockerfile --tag $HLINK_TAG-${{ matrix.python_version}} --build-arg PYTHON_VERSION=${{ matrix.python_version }} --build-arg HLINK_EXTRAS=${{ matrix.hlink_extras }}
 
     - name: Check dependency versions
       run: |
@@ -34,7 +35,7 @@ jobs:
       run: docker run $HLINK_TAG-${{ matrix.python_version}} black --check .
 
     - name: Test
-      run: docker run $HLINK_TAG-${{ matrix.python_version}} pytest
+      run: docker run $HLINK_TAG-${{ matrix.python_version}} pytest -ra
 
     - name: Build sdist and wheel
       run: docker run $HLINK_TAG-${{ matrix.python_version}} python -m build
diff --git a/.gitignore b/.gitignore
index c397fe7..2adf10e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ scala_jar/target
 scala_jar/project/target
 *.class
 *.cache
+.metals/
 
 # MacOS
 .DS_Store
diff --git a/Dockerfile b/Dockerfile
index 80d5c6e..0f2e036 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,6 @@
 ARG PYTHON_VERSION=3.10
 FROM python:${PYTHON_VERSION}
+ARG HLINK_EXTRAS=dev
 
 RUN apt-get update && apt-get install default-jre-headless -y
 
@@ -8,4 +9,4 @@ WORKDIR /hlink
 COPY . .
 
 RUN python -m pip install --upgrade pip
-RUN pip install -e .[dev]
+RUN pip install -e .[${HLINK_EXTRAS}]
diff --git a/README.md b/README.md
index c020e7a..3092f3a 100755
--- a/README.md
+++ b/README.md
@@ -26,19 +26,56 @@
 We do our best to make hlink compatible with Python 3.10-3.12. If you have a
 problem using hlink on one of these versions of Python, please open an issue
 through GitHub. Versions of Python older than 3.10 are not supported.
 
-Note that pyspark 3.5 does not yet officially support Python 3.12. If you
-encounter pyspark-related import errors while running hlink on Python 3.12, try
+Note that PySpark 3.5 does not yet officially support Python 3.12. If you
+encounter PySpark-related import errors while running hlink on Python 3.12, try
 
 - Installing the setuptools package. The distutils package was deleted from the
-  standard library in Python 3.12, but some versions of pyspark still import
+  standard library in Python 3.12, but some versions of PySpark still import
   it. The setuptools package provides a hacky stand-in distutils library which
-  should fix some import errors in pyspark. We install setuptools in our
+  should fix some import errors in PySpark. We install setuptools in our
   development and test dependencies so that our tests work on Python 3.12.
 
-- Downgrading Python to 3.10 or 3.11. Pyspark officially supports these
-  versions of Python. So you should have better chances getting pyspark to work
+- Downgrading Python to 3.10 or 3.11. PySpark officially supports these
+  versions of Python. So you should have better chances getting PySpark to work
   well on Python 3.10 or 3.11.
 
+### Additional Machine Learning Algorithms
+
+hlink has optional support for two additional machine learning algorithms,
+[XGBoost](https://xgboost.readthedocs.io/en/stable/index.html) and
+[LightGBM](https://lightgbm.readthedocs.io/en/latest/index.html). Both of these
+algorithms are highly performant gradient boosting libraries, each with its own
+characteristics. These algorithms are not implemented directly in Spark, so
+they require some additional dependencies. To install the required Python
+dependencies, run
+
+```
+pip install hlink[xgboost]
+```
+
+for XGBoost or
+
+```
+pip install hlink[lightgbm]
+```
+
+for LightGBM. If you would like to install both at once, you can run
+
+```
+pip install hlink[xgboost,lightgbm]
+```
+
+to get the Python dependencies for both. Both XGBoost and LightGBM also require
+libomp, which will need to be installed separately if you don't already have it.
+
+After installing the dependencies for one or both of these algorithms, you can
+use them as model types in training and model exploration. You can read more
+about these models in the hlink documentation [here](https://hlink.docs.ipums.org/models.html).
+
+*Note: The XGBoost-PySpark integration provided by the xgboost Python package is
+currently unstable. So the hlink xgboost support is experimental and may change
+in the future.*
+
 ## Docs
 
 The documentation site can be found at [hlink.docs.ipums.org](https://hlink.docs.ipums.org).
diff --git a/docs/_sources/model_exploration.md.txt b/docs/_sources/model_exploration.md.txt
new file mode 100644
index 0000000..1d1266e
--- /dev/null
+++ b/docs/_sources/model_exploration.md.txt
@@ -0,0 +1 @@
+# Configuring Model Exploration
diff --git a/docs/_sources/models.md.txt b/docs/_sources/models.md.txt
index a1c9996..31c9eb6 100644
--- a/docs/_sources/models.md.txt
+++ b/docs/_sources/models.md.txt
@@ -1,53 +1,80 @@
 # Models
 
-These are models available to be used in the model evaluation, training, and household training link tasks.
-
-* Attributes for all models:
-  * `threshold` -- Type: `float`. Alpha threshold (model hyperparameter).
-  * `threshold_ratio` -- Type: `float`. Beta threshold (de-duplication distance ratio).
-  * Any parameters available in the model as defined in the Spark documentation can be passed as params using the label given in the Spark docs. Commonly used parameters are listed below with descriptive explanations from the Spark docs.
+These are the machine learning models available for use in the model evaluation
+and training tasks and in their household counterparts.
+
+There are a few attributes available for all models.
+
+* `type` -- Type: `string`. The name of the model type. The available model
+  types are listed below.
+* `threshold` -- Type: `float`. The "alpha threshold". This is the probability
+  score required for a potential match to be labeled a match. `0 ≤ threshold ≤
+  1`.
+* `threshold_ratio` -- Type: `float`. The threshold ratio or "beta threshold".
+  This applies to records which have multiple potential matches when
+  `training.decision` is set to `"drop_duplicate_with_threshold_ratio"`. For
+  each record, only potential matches which have the highest probability, have
+  a probability of at least `threshold`, *and* whose probabilities are at least
+  `threshold_ratio` times larger than the second-highest probability are
+  matches. This is sometimes called the "de-duplication distance ratio". `1 ≤
+  threshold_ratio < ∞`.
+
+In addition, any model parameters documented in a model type's Spark
+documentation can be passed as parameters to the model through hlink's
+`training.chosen_model` and `training.model_exploration` configuration
+sections.
+
+Here is an example `training.chosen_model` configuration. The `type`,
+`threshold`, and `threshold_ratio` attributes are hlink specific. `maxDepth` is
+a parameter to the random forest model which hlink passes through to the
+underlying Spark classifier.
+
+```toml
+[training.chosen_model]
+type = "random_forest"
+threshold = 0.2
+threshold_ratio = 1.2
+maxDepth = 5
+```
 
 ## random_forest
 
-Uses [pyspark.ml.classification.RandomForestClassifier](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.RandomForestClassifier.html). Returns probability as an array.
+Uses [pyspark.ml.classification.RandomForestClassifier](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.RandomForestClassifier.html).
 * Parameters:
   * `maxDepth` -- Type: `int`. Maximum depth of the tree. Spark default value is 5.
   * `numTrees` -- Type: `int`. The number of trees to train. Spark default value is 20, must be >= 1.
   * `featureSubsetStrategy` -- Type: `string`. Per the Spark docs: "The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n]."
 
-```
-model_parameters = {
-  type = "random_forest",
-  maxDepth = 5,
-  numTrees = 75,
-  featureSubsetStrategy = "sqrt",
-  threshold = 0.15,
-  threshold_ratio = 1.0
-}
+```toml
+[training.chosen_model]
+type = "random_forest"
+threshold = 0.15
+threshold_ratio = 1.0
+maxDepth = 5
+numTrees = 75
+featureSubsetStrategy = "sqrt"
 ```
 
 ## probit
 
 Uses [pyspark.ml.regression.GeneralizedLinearRegression](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.GeneralizedLinearRegression.html) with `family="binomial"` and `link="probit"`.
 
-```
-model_parameters = {
-  type = "probit",
-  threshold = 0.85,
-  threshold_ratio = 1.2
-}
+```toml
+[training.chosen_model]
+type = "probit"
+threshold = 0.85
+threshold_ratio = 1.2
 ```
 
 ## logistic_regression
 
 Uses [pyspark.ml.classification.LogisticRegression](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.LogisticRegression.html)
 
-```
-chosen_model = {
-  type = "logistic_regression",
-  threshold = 0.5,
-  threshold_ratio = 1.0
-}
+```toml
+[training.chosen_model]
+type = "logistic_regression"
+threshold = 0.5
+threshold_ratio = 1.0
 ```
 
 ## decision_tree
@@ -59,13 +86,14 @@ Uses [pyspark.ml.classification.DecisionTreeClassifier](https://spark.apache.org
   * `minInstancesPerNode` -- Type `int`. Per the Spark docs: "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1."
   * `maxBins` -- Type: `int`. Per the Spark docs: "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature."
 
-```
-chosen_model = {
-  type = "decision_tree",
-  maxDepth = 6,
-  minInstancesPerNode = 2,
-  maxBins = 4
-}
+```toml
+[training.chosen_model]
+type = "decision_tree"
+threshold = 0.5
+threshold_ratio = 1.5
+maxDepth = 6
+minInstancesPerNode = 2
+maxBins = 4
 ```
 
 ## gradient_boosted_trees
@@ -77,13 +105,94 @@ Uses [pyspark.ml.classification.GBTClassifier](https://spark.apache.org/docs/lat
   * `minInstancesPerNode` -- Type `int`. Per the Spark docs: "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1."
   * `maxBins` -- Type: `int`. Per the Spark docs: "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature."
 
+```toml
+[training.chosen_model]
+type = "gradient_boosted_trees"
+threshold = 0.7
+threshold_ratio = 1.3
+maxDepth = 4
+minInstancesPerNode = 1
+maxBins = 6
+```
+
+## xgboost
+
+*Added in version 3.8.0.*
+
+XGBoost is an alternate, high-performance implementation of gradient boosting.
+It uses [xgboost.spark.SparkXGBClassifier](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.spark.SparkXGBClassifier).
+Since the XGBoost-PySpark integration which the xgboost Python package provides
+is currently unstable, support for the xgboost model type is disabled in hlink
+by default. hlink will stop with an error if you try to use this model type
+without enabling support for it. To enable support for xgboost, install hlink
+with the `xgboost` extra.
+
 ```
-chosen_model = {
-  type = "gradient_boosted_trees",
-  maxDepth = 4,
-  minInstancesPerNode = 1,
-  maxBins = 6,
-  threshold = 0.7,
-  threshold_ratio = 1.3
-}
+pip install hlink[xgboost]
+```
+
+This installs the xgboost package and its Python dependencies. Depending on
+your machine and operating system, you may also need to install the libomp
+library, which is another dependency of xgboost. xgboost should raise a helpful
+error if it detects that you need to install libomp.
+
+You can view a list of xgboost's parameters
+[here](https://xgboost.readthedocs.io/en/latest/parameter.html).
+
+```toml
+[training.chosen_model]
+type = "xgboost"
+threshold = 0.8
+threshold_ratio = 1.5
+max_depth = 5
+eta = 0.5
+gamma = 0.05
+```
+
+## lightgbm
+
+*Added in version 3.8.0.*
+
+LightGBM is another alternate, high-performance implementation of gradient
+boosting. It uses
+[synapse.ml.lightgbm.LightGBMClassifier](https://mmlspark.blob.core.windows.net/docs/1.0.8/pyspark/synapse.ml.lightgbm.html#module-synapse.ml.lightgbm.LightGBMClassifier).
+`synapse.ml` is a library which provides various integrations with PySpark,
+including integrations between the C++ LightGBM library and PySpark.
+
+LightGBM requires some additional Scala libraries that hlink does not usually
+install, so support for the lightgbm model is disabled in hlink by default.
+hlink will stop with an error if you try to use this model type without
+enabling support for it. To enable support for lightgbm, install hlink with the
+`lightgbm` extra.
+
+```
+pip install hlink[lightgbm]
+```
+
+This installs the lightgbm package and its Python dependencies. Depending on
+your machine and operating system, you may also need to install the libomp
+library, which is another dependency of lightgbm. If you encounter errors when
+training a lightgbm model, please try installing libomp if you do not have it
+installed.
+
+lightgbm has an enormous number of available parameters. Many of these are
+available as normal in hlink, via the [LightGBMClassifier
+class](https://mmlspark.blob.core.windows.net/docs/1.0.8/pyspark/synapse.ml.lightgbm.html#module-synapse.ml.lightgbm.LightGBMClassifier).
+Others are available through the special `passThroughArgs` parameter, which
+passes additional parameters through to the C++ library. You can see a full
+list of the supported parameters
+[here](https://lightgbm.readthedocs.io/en/latest/Parameters.html).
+
+```toml
+[training.chosen_model]
+type = "lightgbm"
+# hlink's threshold and threshold_ratio
+threshold = 0.8
+threshold_ratio = 1.5
+# LightGBMClassifier supports these parameters (and many more).
+maxDepth = 5
+learningRate = 0.5
+# LightGBMClassifier does not directly support this parameter,
+# so we have to send it to the C++ library with passThroughArgs.
+passThroughArgs = "force_row_wise=true"
 ```
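The `threshold` / `threshold_ratio` rule documented above is easy to misread, so here is a minimal sketch of it in plain Python. This is illustrative only, not hlink's implementation; the function name `pick_match` and the bare list-of-probabilities input are hypothetical.

```python
# Sketch of the "drop_duplicate_with_threshold_ratio" decision rule
# documented in models.md. Illustrative only -- not hlink's code.

def pick_match(probabilities, threshold=0.8, threshold_ratio=1.5):
    """Return the index of the accepted match among one record's potential
    matches, or None if no candidate satisfies both thresholds."""
    if not probabilities:
        return None
    # Rank candidate indices by probability, highest first.
    ranked = sorted(range(len(probabilities)),
                    key=lambda i: probabilities[i], reverse=True)
    best = ranked[0]
    best_prob = probabilities[best]
    # Alpha threshold: the best candidate must score at least `threshold`.
    if best_prob < threshold:
        return None
    # Beta threshold: the best candidate's probability must be at least
    # `threshold_ratio` times the second-highest probability.
    if len(ranked) > 1:
        second_prob = probabilities[ranked[1]]
        if best_prob < threshold_ratio * second_prob:
            return None
    return best

print(pick_match([0.9, 0.3]))  # 0: 0.9 >= 0.8 and 0.9 >= 1.5 * 0.3
print(pick_match([0.9, 0.7]))  # None: 0.9 < 1.5 * 0.7, too ambiguous
print(pick_match([0.6, 0.1]))  # None: 0.6 misses the alpha threshold
```

Since `1 ≤ threshold_ratio < ∞`, setting `threshold_ratio = 1.0` makes the ratio check a no-op and only the alpha threshold applies; larger values demand an increasingly clear winner among a record's potential matches.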
diff --git a/docs/column_mappings.html b/docs/column_mappings.html
index c191199..59e847d 100644
--- a/docs/column_mappings.html
+++ b/docs/column_mappings.html
@@ -402,6 +402,7 @@
[Hunk bodies of the regenerated Sphinx HTML pages elided. The
docs/column_mappings.html hunk is a one-line sidebar navigation addition, and
the docs/models.html diff mirrors the docs/_sources/models.md.txt changes
above: the rewritten attribute descriptions, the TOML configuration examples,
the new xgboost and lightgbm sections, and new xgboost and lightgbm entries in
the models page navigation list.]
diff --git a/docs/pipeline_features.html b/docs/pipeline_features.html
index 931dac7..56cd7c3 100644
--- a/docs/pipeline_features.html
+++ b/docs/pipeline_features.html
@@ -130,6 +130,7 @@ hlink
 - Substitutions
 - Models
+- Model Exploration
diff --git a/docs/running_the_program.html b/docs/running_the_program.html
index 085b800..0a2ef98 100644
--- a/docs/running_the_program.html
+++ b/docs/running_the_program.html
@@ -317,6 +317,7 @@ hlink
 - Pipeline Features
 - Substitutions
 - Models
+- Model Exploration
diff --git a/docs/searchindex.js b/docs/searchindex.js
index 01a8b75..4022b53 100644
--- a/docs/searchindex.js
+++ b/docs/searchindex.js
@@ -1 +1 @@
-[minified Sphinx search index elided]
\ No newline at end of file
+[regenerated minified Sphinx search index elided: it now covers the new
+xgboost, lightgbm, and Model Exploration documentation]
"Training and Household Training": [[8, "training-and-household-training"]], "Training and models": [[3, "training-and-models"]], "Transformer types": [[10, "transformer-types"]], "Transforms": [[0, "transforms"]], "Using hlink as a Library": [[11, "using-hlink-as-a-library"]], "Welcome to hlink\u2019s documentation!": [[5, null]], "abs_diff": [[1, "abs-diff"]], "add_to_a": [[0, "add-to-a"]], "alias": [[1, "alias"]], "all_equals": [[1, "all-equals"]], "and": [[1, "and"]], "any_equals": [[1, "any-equals"]], "array": [[4, "array"]], "array_index": [[0, "array-index"]], "b_minus_a": [[1, "b-minus-a"]], "bigrams": [[4, "bigrams"]], "btwn_threshold": [[1, "btwn-threshold"]], "bucketizer": [[10, "bucketizer"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "concat_two_cols": [[0, "concat-two-cols"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "decision_tree": [[9, "decision-tree"]], "divide_by_int": [[0, "divide-by-int"]], "either_are_0": [[1, "either-are-0"]], "either_are_1": [[1, "either-are-1"]], "equals": [[1, "equals"]], "equals_as_int": [[1, "equals-as-int"]], "exact_mult": [[1, "exact-mult"]], "extra_children": [[1, "extra-children"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "geo_distance": [[1, "geo-distance"]], "get_floor": [[0, "get-floor"]], "gradient_boosted_trees": [[9, "gradient-boosted-trees"]], "gt_threshold": [[1, "gt-threshold"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "interaction": [[10, "interaction"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "length_b": [[1, "length-b"]], "lightgbm": [[9, "lightgbm"]], "logistic_regression": [[9, "logistic-regression"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "lower_threshold": [[1, "lower-threshold"]], "lowercase_strip": [[0, "lowercase-strip"]], "mapping": [[0, "mapping"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "neither_are_null": [[1, "neither-are-null"]], "not_equals": [[1, "not-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "power": [[1, "power"], [4, "power"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_both_years": [[1, "present-both-years"]], "probit": [[9, "probit"]], "random_forest": [[9, "random-forest"]], "rationalize_name_words": [[0, "rationalize-name-words"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "remove_prefixes": [[0, "remove-prefixes"]], "remove_punctuation": [[0, "remove-punctuation"]], "remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_suffixes": [[0, "remove-suffixes"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "second_gen_imm": [[1, "second-gen-imm"]], "soundex": [[4, 
"soundex"]], "split": [[0, "split"]], "sql_condition": [[1, "sql-condition"], [4, "sql-condition"]], "substring": [[0, "substring"]], "sum": [[1, "sum"]], "threshold": [[1, "threshold"]], "times": [[1, "times"]], "union": [[4, "union"]], "upper_threshold": [[1, "upper-threshold"]], "when_value": [[0, "when-value"]], "xgboost": [[9, "xgboost"]]}, "docnames": ["column_mappings", "comparison_features", "comparisons", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["column_mappings.md", "comparison_features.md", "comparisons.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [0, 1, 3, 7, 8, 9, 11, 12], "0": [0, 1, 2, 3, 8, 9, 10, 11, 13], "005": 3, "012": 1, "05": [1, 3, 9], "1": [0, 1, 2, 3, 5, 8, 9, 10, 11, 13], "10": [0, 3, 6, 13], "100": [0, 3, 13], "11": [0, 3, 6, 10], "12": [0, 6], "14": 1, "15": 9, "1867": 3, "1868": 3, "1869": 3, "1870": 3, "1871": 3, "1872": 3, "1873": 3, "1900": [3, 13], "1900_1910_potential_fn": 13, "1900_1910_potential_fp": 13, "1900_1910_training_data_20191023": 3, "1900_1910_training_result": 13, "1910": [1, 3, 13], "1920": 13, "1930": [3, 13], "1940": [3, 13], "1999": [0, 3], "2": [0, 1, 2, 3, 4, 8, 9, 12, 13], "20": 9, "25": 1, "3": [0, 1, 2, 3, 6, 8, 9, 10, 13], "300": 0, "301": 0, "302": 0, "303": 0, "4": [0, 1, 9], "5": [0, 1, 2, 3, 9, 10, 11, 13], "50": [3, 13], "50g": 13, "53": 3, "5g": 11, "6": [0, 3, 9, 10, 13], "65": 3, "7": [0, 1, 3, 9, 13], "75": [3, 9, 13], "79": [2, 3], "7th": 12, "8": [1, 3, 6, 9, 11], "80": 3, "84": [2, 3], "85": [9, 11], "9": 1, "95": 1, "99": [1, 3], "9998": 0, "9999": [0, 3, 10], "A": [0, 1, 2, 3, 4, 10, 11], "AND": [1, 2, 3], "As": [0, 2], "At": [7, 8], "But": [3, 6], "By": [0, 3, 11], "For": [0, 1, 2, 3, 8, 9, 11, 13], "If": [0, 1, 3, 4, 8, 9, 11, 12], "In": [0, 1, 2, 7, 9, 11, 13], "It": [0, 1, 2, 3, 7, 9, 11, 13], "NOT": 1, "OR": [1, 2, 3], "THEN": 1, "The": [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 13], "Then": [0, 6], "There": [1, 3, 8, 9], "These": [0, 1, 2, 3, 7, 8, 9, 10], "To": [0, 1, 6, 8, 9, 11], "Will": 3, "With": [0, 10], "_": [0, 1, 2, 3, 4, 5, 9, 10, 11], "_a": 1, "_bpl": 1, "_namefrst": 1, "_sex": 1, "a304bt": 3, "ab": 1, "abbrevi": [8, 12], "abl": 3, "about": [3, 11, 13], "abov": [1, 2, 6], "absolut": 1, "accept": [1, 3, 13], "access": [11, 13], "accord": 1, "across": 1, "ad": [0, 1, 2, 3, 9], "add": [0, 2, 5], "add_to_a": 3, "addit": [0, 1, 3, 6, 7, 9, 11], "addl": 1, "addl_var": 1, "adjust": 11, "adopt": 0, "advanc": 5, "affect": [4, 12], "after": [1, 2, 3, 5, 9, 11], "ag": [0, 1, 3, 4], "against": [1, 3, 12], "age_2": 3, "age_at_dataset_b": 0, "age_threshold": 1, "aggreg": 5, "ah": 0, "ahead": 8, "akin": 1, "algorithm": [1, 2, 7, 8], "alia": [0, 3, 8], "all": [0, 1, 3, 4, 8, 9, 10, 11], "allow": [1, 3, 8, 13], "along": 1, "alpha": 9, "alphabet": 0, "alphanumer": 3, "also": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 13], "altern": 
[0, 3, 9], "although": 3, "alwai": 2, "among": 1, "amount": 3, "an": [0, 1, 2, 3, 7, 9, 11], "analysi": 13, "analyz": [11, 13], "ani": [1, 3, 4, 6, 9], "anoth": [0, 1, 3, 4, 8, 9], "anyon": 8, "anywher": 12, "apach": 6, "apart": 0, "api": [3, 7, 10], "apostroph": 0, "appear": [0, 1], "appli": [0, 2, 3, 4, 8, 9, 13], "apply_model": 3, "appropri": 3, "ar": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13], "arbitrari": 0, "area": 3, "aren": 1, "arg": 13, "argument": [1, 11, 13], "around": 3, "arrai": [0, 3, 5, 10], "array_index": 3, "ask": 13, "aspect": [8, 11], "assert": [1, 11], "assum": 3, "attach_vari": 3, "attempt": 3, "attribut": [0, 1, 2, 3, 4, 8, 9, 10, 11, 12], "auto": 9, "automat": [0, 3, 6, 8], "av": 12, "avail": [0, 1, 3, 4, 6, 8, 9, 10, 13], "avenu": [0, 12], "b": [0, 1, 2, 3, 4, 11], "b200": 3, "back": 1, "backup": 1, "base": [1, 3, 8], "basic": 5, "becaus": 2, "been": 7, "befor": [0, 1, 3, 4, 6, 8], "begin": 11, "behind": 0, "being": [1, 8], "belong": 3, "below": [0, 1, 3, 4, 9, 10, 11], "best": [3, 8], "beta": [3, 9], "better": [0, 7], "between": [0, 1, 2, 3, 7, 8, 9, 11, 13], "beyond": 1, "bigint": 3, "bigram": [3, 5], "bin": 9, "binomi": 9, "birth": 1, "birthplac": [0, 3], "birthyr": [0, 3], "birthyr_3": 3, "birthyr_col": 1, "block": [2, 5, 8], "blvd": 0, "boolean": [1, 3, 4, 12, 13], "boost": [5, 9], "born": 1, "borrow_t": 11, "both": [0, 1, 2, 3, 8, 13], "boundari": 1, "bpl": [0, 1, 3], "bpl1": 3, "bpl2": 3, "bpl2_str": 3, "bpl3": 3, "bpl_clean": 3, "bpl_orig": 3, "bpl_root": 0, "bpl_str": 3, "bplmatch": 3, "broken": 7, "btwn": 1, "bucket": [3, 8], "built": 6, "builtin": 1, "byrdifcat": 3, "byrdiff": [1, 3, 13], "c": [1, 9], "c201": 3, "calcul": [1, 13], "call": [0, 9], "can": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "cannot": 3, "care": 2, "cartesian": 1, "case": [0, 1, 2, 3, 4, 7], "cast": 1, "categor": [1, 3, 9, 10], "categori": [0, 9], "caus": 9, "caution": [1, 10], "censu": [0, 8, 13], "census": 7, "chang": [0, 1, 3, 6, 11, 13], "charact": 0, "characterist": [7, 8], "check": [1, 11], "check_val_expr": 1, "checkpoint": 4, "child": [0, 9], "children": 1, "chines": 0, "choic": 11, "chosen": 8, "chosen_model": [3, 9, 13], "circl": 0, "circumst": 1, "class": [9, 11], "classif": [8, 9], "classifi": 9, "claus": [0, 1], "clean": [0, 7], "clean_birthyr": [0, 3, 4], "clone": 6, "code": [0, 1, 3, 6], "coeffici": [3, 8], "col": [0, 1], "col_to_add": 3, "col_to_join_on": 3, "col_typ": 3, "column": [1, 4, 5, 8, 10, 11, 12, 13], "column_map": [0, 3, 8], "column_nam": [0, 1, 3, 12], "column_to_append": 0, "combin": [1, 2, 3, 4, 8], "come": 1, "command": [3, 7, 11, 13], "comment": 4, "common": [7, 8, 12, 13], "commonli": [], "comp": 1, "comp_a": [1, 2, 3], "comp_b": [1, 2, 3], "comp_c": 1, "comp_d": 1, "compar": [0, 1, 3, 7, 8], "comparis": 3, "comparison": [5, 8], "comparison_featur": [1, 2, 3, 8], "comparison_typ": [1, 2, 3], "complet": 11, "complex": [3, 4], "comput": [1, 4, 8], "concat": 0, "concaten": [0, 1], "condens": 0, "condense_strip_whitespac": 3, "condit": [0, 1, 2, 3, 4, 5, 8], "conf": [11, 13], "config": [1, 4, 5, 8, 11, 13], "configur": [0, 1, 2, 7, 9, 11, 13], "conjuct": 3, "conjunct": 3, "connect": [2, 3], "consid": [1, 2, 9], "consider": 1, "consol": 11, "constraint": [1, 2], "construct": 8, "contain": [0, 1, 3, 12], "context": 10, "continu": [9, 10, 13], "conveni": 11, "convert": [0, 1, 3], "convert_ints_to_long": 3, "copi": [4, 13], "core": [1, 8, 11, 13], "correspond": [7, 8], "could": [0, 2, 3], "count": [1, 11, 13], "counterpart": 9, "counti": [0, 1], 
"county_1900_1910_distances_km": 1, "county_a": 1, "county_b": 1, "county_dist": [1, 3, 13], "county_distance_lookup": 1, "county_distance_squar": [1, 3, 13], "county_state_dist": 1, "court": 0, "cover": 10, "coverag": 3, "cpu": 11, "creat": [0, 3, 7, 8, 10, 11, 12, 13], "creation": 3, "crosswalk": 8, "csv": [1, 3, 8, 11, 12, 13], "current": [1, 2, 3, 9, 11], "d": 1, "d425": 3, "data": [0, 1, 5, 7, 8, 11], "databas": 11, "datafram": [8, 11, 13], "dataset": [0, 1, 3, 4, 7, 8, 11, 13], "datasourc": [1, 3, 11], "datasource_a": [3, 8], "datasource_b": [3, 8], "de": 9, "decis": [3, 5, 9, 13], "decisiontreeclassifi": 9, "default": [0, 1, 2, 3, 8, 9, 11], "defin": [1, 3, 5, 8, 10, 11], "definit": [3, 8], "demograph": 8, "depend": [1, 2, 3, 6, 9, 13], "dependent_var": [3, 13], "depth": 9, "deriv": 13, "derived_from": 3, "desc": 11, "describ": [0, 1, 2, 3, 11], "descript": [3, 11], "detail": [0, 3, 11], "detect": 9, "determin": [1, 8], "determinist": [7, 8], "dev": 6, "develop": [6, 7], "df": [3, 11], "dictionari": 11, "diff": 1, "differ": [0, 1, 3, 5, 7, 8], "digit": 0, "dir": 12, "directli": [2, 9], "directori": [6, 11, 13], "disabl": 9, "discard": 9, "discret": 9, "discuss": 3, "dist": 1, "dist_tabl": 1, "distanc": [1, 9], "distance_col": 1, "distance_km": 1, "distances_fil": 1, "distinct": 1, "divid": 0, "divide_by_int": 3, "do": [0, 1, 2, 4, 9, 11, 13], "doc": [9, 10], "document": [1, 3, 9, 11, 13], "doe": [1, 4, 8, 9, 13], "don": [3, 4], "doubl": 10, "down": [0, 7, 13], "drastic": 8, "drop": [0, 3, 11], "drop_al": 11, "drop_all_prc": 11, "drop_all_temp": 11, "drop_data_from_scored_match": 3, "drop_duplicate_a": 3, "drop_duplicate_with_threshold_ratio": [3, 9, 13], "duplic": [3, 9], "durat": 1, "dure": [1, 2, 3, 8], "durmarr": [1, 3], "e": 6, "each": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11], "easiest": [6, 11], "easili": 11, "edit": 6, "effect": [2, 4], "either": [0, 1, 2, 3, 7, 12], "either_0": 1, "either_1": 1, "element": 0, "elig": 8, "els": [1, 3, 4], "else_v": 1, "else_valu": 0, "empti": 1, "enabl": [3, 8, 9, 11], "encod": [3, 4], "encount": 9, "end": [0, 1, 3, 4, 12], "enorm": 9, "ensur": 3, "enter": 11, "entir": [0, 3], "enum_dist": 1, "enumer": 11, "equal": [2, 3, 12], "equal_and_not_null_templ": 1, "equival": 3, "error": [3, 9, 10], "especi": 3, "eta": 9, "etc": 13, "eval": 3, "evalu": [1, 3, 7, 8, 9], "even": 1, "everi": [1, 4], "ex": 1, "exact": [1, 3], "exact_mult": [3, 13], "exampl": [0, 1, 2, 3, 9], "except": [3, 10], "exclud": 1, "excute_command": 11, "execut": 11, "execute_command": 11, "execute_task": 11, "executor": 11, "executor_memori": [11, 13], "exist": [1, 3, 11], "exit": 11, "expand": 3, "expand_length": 3, "expect": 1, "experi": 7, "experiment": [8, 11], "explan": [], "explicitli": [1, 10], "explod": [2, 3, 8], "exploded_df_a": 11, "exploded_df_b": 11, "explor": [3, 5, 7, 11], "expon": 4, "exponenti": 1, "export": [5, 8, 11], "express": [0, 1, 3], "extend": 1, "extens": 8, "extra": [1, 9], "extract": 3, "f": [1, 11], "f1": 1, "f1_match": 3, "f2": 1, "f2_match": 3, "f_caution": [3, 13], "f_interacted_jw_f": [3, 13], "f_pre": [3, 13], "factori": 11, "fail": 3, "fallback": 1, "fals": [1, 3, 4, 5, 7, 11], "famili": 9, "father_namefrst": 1, "favor": 1, "fbpl": 1, "fbpl_nomatch": 1, "fbplmatch": [3, 13], "featur": [2, 5, 7, 8, 9, 11], "feature_import": [3, 8, 13], "feature_nam": [2, 3], "feature_select": [2, 3, 4, 8], "featuresubsetstrategi": 9, "fed": [3, 8], "femal": [3, 12], "fetch": 1, "fetch_a": 3, "few": [4, 9], "fewer": [1, 9], "fi": 1, "file": [1, 4, 5, 7, 8, 11, 12, 13], 
"filepath": 11, "fill": 1, "filter": [1, 2, 5, 8, 12], "final": [1, 3, 13], "find": [1, 8, 13], "finish": 11, "first": [0, 1, 3, 6, 8, 11, 12], "first_init_col": 1, "first_nam": 0, "five": 11, "fix": 8, "flag": [1, 2, 3, 10, 11, 13], "flexibl": [1, 2], "float": [1, 3, 9], "floor": 0, "focus": [2, 8], "follow": [0, 1, 2, 7, 11, 12, 13], "force_row_wis": 9, "foreign": 1, "forest": [5, 9], "form": [1, 3, 8, 12], "format": 0, "four": 1, "framework": 13, "from": [0, 1, 3, 5, 7, 8, 10, 11, 13], "from_icpsrctyi": 1, "from_statefip": 1, "fsoundex": [3, 13], "full": [3, 8, 9, 13], "full_count_1870_1880": 11, "full_count_1900_1910": 13, "fullcount_1870_1880": 11, "function": [0, 1, 3, 7, 11], "further": [7, 13], "gamma": 9, "gbtclassifi": 9, "gen": 1, "gener": [0, 1, 5, 7, 8, 11], "generalizedlinearregress": 9, "geo": 1, "geograph": 1, "get": [0, 1, 2, 3, 11], "get_floor": 3, "get_set": 11, "get_step": 11, "get_tabl": 11, "get_task": 11, "github": 6, "give": [0, 3], "given": [0, 1, 2, 3, 4, 13], "go": [3, 11], "good": 1, "gradient": [5, 9], "greater": [1, 2, 6], "greatest": 1, "group": [3, 8], "gt": 1, "h": 11, "ha": [0, 1, 2, 3, 4, 7, 9, 11, 13], "handl": 11, "harmon": 0, "have": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "haven": 3, "head": 0, "header": [3, 10, 12], "help": [3, 8, 9, 11], "here": [2, 3, 8, 9, 11, 13], "hh": [1, 2], "hh_blocked_match": 11, "hh_col": 3, "hh_comparison": [2, 3, 8], "hh_match": [3, 11], "hh_model_eval_repeat_fn": 11, "hh_model_eval_repeat_fp": 11, "hh_model_eval_training_data": 11, "hh_model_eval_training_featur": 11, "hh_model_eval_training_result": 11, "hh_model_eval_training_vector": 11, "hh_model_explor": 11, "hh_potential_match": [2, 11], "hh_potential_matchs_prep": 11, "hh_predicted_match": 11, "hh_repeat_fn": 13, "hh_repeat_fp": 13, "hh_scored_potential_match": 11, "hh_train": [1, 3, 8, 11, 13], "hh_training_data": 11, "hh_training_data_1900_1910": 3, "hh_training_featur": [11, 13], "hh_training_result": 13, "hidden": 11, "hierarch": [7, 11], "hierarchi": 11, "high": [9, 11], "highest": [1, 3, 9], "highli": [7, 8], "histid": [1, 3, 13], "histid_col": 1, "hit": [3, 11, 13], "hits2": [3, 13], "hlink": [0, 1, 2, 3, 4, 6, 7, 8, 9, 13], "hold": 10, "hot": 3, "household": [0, 2, 5, 7, 9, 11, 13], "how": [1, 3, 8], "howev": [4, 7, 13], "hundr": 0, "hyper": [3, 7, 13], "hyperparamet": 13, "hyphen": 0, "i": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13], "id": [1, 3], "id_column": [1, 3], "ident": 13, "identifi": [3, 7, 13], "if_valu": 0, "ignor": 7, "ii": [0, 3], "iii": [0, 3], "imm": [1, 3, 13], "imm_interacted_bplmatch": 3, "imm_interacted_immyear_caut": [3, 13], "immigr": 1, "immyear_caut": [3, 10], "immyear_diff": [1, 3, 10, 13], "implement": [9, 13], "implicitli": 2, "import": [3, 8, 11, 13], "improv": 8, "includ": [1, 3, 8, 9, 10, 11], "incompar": 1, "increas": [3, 10], "independ": [0, 3], "independent_var": [1, 3, 13], "index": [0, 6], "indic": [1, 13], "individu": [1, 3, 8, 13], "inf": 10, "inform": [0, 1, 3, 11], "ingest": 8, "initi": [0, 1, 11], "input": [0, 1, 3, 4, 7, 8, 11, 12], "input_col": 4, "input_column": [3, 4, 10], "input_table_nam": 11, "inspect": 8, "instal": [5, 9], "instanc": [0, 9], "instead": [1, 3, 4, 6, 8], "instruct": [6, 11], "int": [0, 1, 3, 4, 9], "integ": [0, 1, 3, 10], "integr": [6, 9], "interact": [3, 5, 8, 13], "interfac": 11, "intermedi": 11, "introduct": 5, "introspect": 8, "invalid": [1, 9], "ipum": [0, 7], "ipython": 11, "isn": 13, "istemporari": 11, "item": 0, "iter": 3, "its": [0, 1, 7, 9, 11], "itself": [1, 2], "iv": 3, "jaro": [1, 10], 
"jaro_winkl": 3, "java": 6, "job": 11, "join": [1, 3, 12], "join_column": [3, 12], "join_valu": [3, 12], "jr": [0, 3], "json": [3, 11], "just": [1, 3, 11, 13], "jw": 1, "jw_col_templ": 1, "jw_f": [1, 3, 13], "jw_m": [3, 10, 13], "jw_max_a": 3, "jw_max_b": 3, "jw_sp": [3, 13], "jw_street": 1, "jw_threshold": 1, "kei": [1, 8, 11], "key_count": 1, "kind": 1, "know": 3, "known": 0, "label": 9, "languag": 7, "larger": 9, "last": [0, 1, 8, 10], "latest": 6, "launch": [11, 13], "law": 0, "lead": 0, "learn": [1, 2, 3, 7, 8, 9, 11, 13], "learningr": 9, "least": [0, 1, 9], "leav": 0, "left": 9, "length": [1, 3, 10], "less": [1, 3], "let": 11, "letter": 0, "level": [1, 5, 11], "leverag": 3, "libomp": 9, "librari": [5, 7, 9], "lightgbm": 5, "lightgbmclassifi": 9, "like": [0, 2, 3, 8, 11, 12], "limit": 3, "line": [7, 11], "link": [0, 1, 3, 5, 7, 9], "link_run": 11, "linkrun": [7, 11], "list": [0, 1, 3, 4, 5, 9, 11, 12], "liter": 3, "ll": 11, "load": 11, "load_conf_fil": 11, "load_config": 11, "loc_a": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b": 1, "loc_b_0": 1, "loc_b_1": 1, "locat": [1, 3, 11], "log2": 9, "logic": [0, 2, 3], "logist": [5, 9], "logistic_regress": [3, 13], "logisticregress": 9, "long": [3, 12], "longest": 8, "look": [1, 2, 11, 12], "lookup": 1, "lower": [0, 1], "lowercas": 0, "lowercase_strip": 3, "lowest": 0, "lr": 11, "lsoundex": [3, 13], "m": [0, 1], "m_caution": [1, 3, 10, 13], "m_interacted_jw_m": [3, 10, 13], "m_namefrst": 1, "m_pre": [3, 10, 13], "machin": [1, 2, 3, 7, 8, 9, 11, 13], "made": 6, "mai": [0, 2, 3, 7, 8, 9, 11, 12], "main": 11, "mainli": 1, "major": [1, 11], "make": [0, 1, 2, 3, 6, 13], "male": [3, 12], "mani": [2, 8, 9, 11], "manual": 13, "map": [5, 8, 10], "mardurmatch": [1, 3], "mark": 2, "marriag": 1, "match": [0, 1, 2, 5, 7, 9, 11, 12, 13], "matches_df": 11, "matrix": 13, "max": [1, 9, 11], "max_depth": 9, "maxbin": 9, "maxdepth": [3, 9, 13], "maximum": [1, 9], "maximum_jw": 1, "mbpl": 1, "mbplmatch": [3, 13], "mean": [0, 2, 3], "meant": 3, "meet": 1, "member": [1, 8], "memori": 11, "men": 3, "messag": 11, "metadata": 8, "method": [11, 13], "mfbplmatch": 1, "mi": [3, 13], "mi0": 1, "mi1": 1, "mid_init_col": 1, "mid_init_match": 1, "middl": [0, 1], "might": 13, "minimum": [1, 9], "mininstancespernod": 9, "minu": [1, 3], "mismatch": 1, "miss": [1, 11], "ml": [3, 5, 9, 10], "mode": [5, 6, 13], "model": [5, 7, 11], "model_eval_repeat_fn": 11, "model_eval_repeat_fp": 11, "model_eval_training_data": 11, "model_eval_training_featur": 11, "model_eval_training_result": 11, "model_eval_training_vector": 11, "model_explor": [3, 9, 11, 13], "model_paramet": [3, 8, 13], "modifi": 3, "modul": 11, "moment": 8, "momloc": 1, "more": [0, 1, 2, 3, 9, 10, 11, 13], "most": [0, 1, 8, 11], "mother": 10, "mother_birthyr": 1, "mpre": 1, "mr": 0, "much": [1, 8], "mult": 1, "multi": 1, "multipl": [0, 1, 5, 9, 11], "multipli": 1, "must": [0, 1, 2, 3, 4, 9, 10, 12], "my": [11, 13], "my_conf": 11, "my_fil": 3, "myriad": 3, "n": [1, 9, 10], "n_training_iter": [3, 8, 13], "name": [0, 1, 3, 4, 9, 11, 12], "name_col": 1, "name_std": [3, 12], "namefrst": [0, 1, 3], "namefrst_clean": [0, 3], "namefrst_init": 1, "namefrst_jw": [1, 2, 3, 13], "namefrst_mid_init": [0, 1], "namefrst_mid_init_2": 1, "namefrst_rel": 1, "namefrst_related_row": 1, "namefrst_split": [0, 3], "namefrst_std": [3, 12], "namefrst_std_jw": [3, 13], "namefrst_unstd": [1, 3], "namefrst_unstd_bigram": 4, "namelast": [1, 3], "namelast1": 1, "namelast2": 1, "namelast3": 1, "namelast_bigram": 3, "namelast_clean": [1, 3, 4], 
"namelast_clean_bigram": [3, 4], "namelast_clean_soundex": 4, "namelast_equal_as_int": 1, "namelast_frst_bigram": 4, "namelast_jw": [2, 3, 13], "namelast_jw_max": 1, "namelast_neighbor": 1, "namelast_popular": 1, "namelast_popularity_sum": 1, "nativ": 1, "nbor": [1, 3, 13], "ncount": [3, 4, 13], "ncount2": [3, 4, 13], "nearest": 0, "necessari": [4, 13], "need": [0, 1, 2, 3, 8, 9, 11, 13], "neg": [3, 5, 7], "neighbor": 1, "neighborhood": 1, "neither": 1, "nest": [2, 3, 7], "new": [0, 3, 13], "new_marr": [1, 3], "newli": 3, "niu": 1, "no_first_pad": 4, "node": 9, "non": 0, "nonzero": 1, "nor": 1, "normal": [8, 9], "note": [2, 3], "now": [2, 11], "null": [1, 3, 4], "null_fil": 3, "num": 11, "num_col": 1, "number": [0, 1, 3, 4, 8, 9, 11], "numer": [0, 1], "numtre": [3, 9, 13], "nvl": 1, "object": [1, 3, 7, 11], "often": [2, 4, 11], "onc": [1, 11], "one": [0, 1, 3, 8], "onethird": 9, "onli": [0, 1, 2, 3, 8, 9, 13], "ons": 5, "oper": [0, 1, 2, 3, 9], "option": [0, 1, 3, 4, 7, 8, 9, 11, 13], "or_group": 3, "order": [0, 3, 8], "org": 6, "organ": 11, "original_valu": 3, "oth": [3, 13], "other": [1, 3, 9, 13], "otherwis": [0, 1, 10, 13], "our": 11, "out": [3, 4, 8, 13], "output": [0, 1, 2, 3, 4, 7, 8, 11, 13], "output_col": 4, "output_column": [3, 4, 10], "output_suspicious_td": [3, 13], "output_table_nam": 11, "output_typ": 3, "outsid": 10, "overrid": [0, 3], "override_column_a": [0, 3, 4], "override_column_b": [0, 3, 4], "override_transform": [0, 3], "overview": 5, "own": [1, 3], "p": 3, "packag": [6, 9], "page": [1, 2, 3, 11], "pair": [1, 2, 3, 13], "param": 13, "param_grid": [3, 13], "paramet": [3, 7, 8, 9, 11, 13], "parent": [0, 1, 12], "parent_step_chang": 1, "parenthes": 3, "parquet": [3, 8], "part": [3, 8], "particular": [1, 2, 3], "particularli": 1, "partit": 13, "pass": [1, 8, 9], "passthrougharg": 9, "path": [1, 3, 11, 12, 13], "pattern": 11, "peopl": [0, 1, 7, 11], "per": [1, 3, 9, 10, 11], "percent": 8, "percentag": 1, "perform": [1, 3, 7, 8, 9, 12], "persist": 11, "person": [0, 1, 7], "pip": [6, 9], "pipelin": 5, "pipeline_featur": [3, 8, 10], "piplin": 3, "placehold": 1, "pleas": [3, 9, 11], "plu": 3, "point": [10, 13], "popul": 8, "posit": [0, 1, 3, 5, 7], "possibl": 3, "post": [3, 8], "potenti": [1, 2, 5, 8, 9], "potential_match": [1, 2, 8, 11], "potential_matches_prep": 11, "potential_matches_univers": [3, 8], "power": 5, "predefin": 1, "predict": [3, 13], "predicted_match": 11, "prediction_col": 3, "preexist": 11, "prefer": 13, "prefix": 0, "prep": 3, "prep_step": 11, "prepar": [7, 8, 11], "prepend": 4, "prepped_df_a": 11, "prepped_df_b": 11, "preprocess": [5, 7, 11, 13], "present": [1, 3, 10], "pretti": 2, "primari": [1, 7], "primarili": [1, 8], "print": 11, "probabilist": [3, 7], "probabl": [0, 3, 9], "probit": [3, 5], "proceed": 12, "process": [2, 3, 7, 11], "produc": [0, 11], "product": [1, 7, 13], "program": [1, 3, 8, 13], "project": 6, "prompt": 11, "provid": [0, 3, 4, 7, 8, 9, 10, 11], "pull": 8, "punctuat": 0, "put": [11, 13], "py": [1, 3], "pypi": 5, "pyspark": [6, 9, 10, 11], "python": [6, 7, 9, 11], "q": [11, 13], "qmark": 0, "qualifi": 3, "queri": [1, 3], "race": [0, 1, 3, 10, 13], "race_interacted_srac": [3, 10, 13], "racematch": 3, "rais": [1, 4, 9], "random": [5, 9], "random_forest": [3, 13], "randomforestclassifi": 9, "rang": [3, 10], "rate": 1, "ratio": [3, 9], "ration": 0, "rationalize_name_word": 3, "raw": [0, 3, 8, 11], "raw_df_a": 11, "raw_df_b": 11, "read": [0, 1, 3, 8, 11], "readm": 3, "real": 7, "reason": 1, "recod": 0, "record": [0, 1, 2, 3, 7, 8, 9], 
"recurs": [2, 3], "reduc": [4, 8], "refer": 3, "regex": 5, "regex_word_replac": 12, "region": [3, 13], "region_dict": 3, "regionf": 3, "regist": [8, 11], "regress": [5, 9], "regular": 13, "rel": [1, 3, 13], "relat": [0, 1, 3], "relate_a": [3, 10], "relate_col": 1, "relate_div_100": [0, 1, 3], "related_individual_max_jw": 1, "related_individual_row": 1, "relatematch": [1, 3], "relatetyp": [3, 10], "relatetype_interacted_relatematch": 3, "relationship": 7, "relev": 13, "reload": 11, "remain": 8, "remov": 0, "remove_alternate_nam": 3, "remove_qmark_hyphen": 3, "remove_suffix": 3, "repeat_fn": 13, "repeat_fp": 13, "repeatedli": 3, "replac": [0, 1, 2, 5], "replace_apostroph": 3, "replaced_birthyr": [1, 3, 4], "report": [1, 5, 7, 11], "repositori": 6, "repres": [0, 1, 3, 10, 11], "represent": [3, 8], "reproduc": 11, "request": 8, "requir": [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12], "research": 7, "reshap": 8, "resourc": 4, "respect": [1, 8], "restrict": 2, "result": [0, 1, 7, 10, 11, 13], "return": [1, 4, 11], "reus": 5, "right": [9, 11], "risk": 1, "road": 0, "robust": 3, "root": 6, "round": [0, 3], "row": [0, 3, 4], "rule": [2, 7, 8], "run": [5, 6, 7, 8, 13], "run_all_step": [3, 11, 13], "run_step": 11, "runtim": 8, "sai": 11, "same": [0, 1, 2, 3, 4, 7, 8, 11], "sampl": 1, "satisfi": [2, 3, 8], "save": [3, 8, 13], "scala": [1, 9], "scale": 3, "scale_data": [3, 13], "scenario": 13, "schema": 3, "score": [1, 3, 8, 9, 10], "score_with_model": [3, 13], "scored_potential_match": 11, "scratch": 3, "script": [3, 7, 11], "search": 1, "second": [0, 1, 3, 9, 12], "secondari": 1, "secondary_distance_col": 1, "secondary_distances_fil": 1, "secondary_key_count": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_source_column": 1, "secondary_table_nam": 1, "section": [0, 1, 2, 3, 9, 13], "see": [1, 3, 6, 9, 11, 13], "seen": 1, "select": [0, 1, 5, 7, 11, 13], "send": 9, "separ": 8, "sequenc": 7, "seri": 11, "serialp": 3, "serv": 1, "set": [0, 1, 2, 3, 4, 7, 8, 9, 11, 13], "set_executor_memori": 11, "set_link_task": 11, "set_loc": 11, "set_num_cor": 11, "set_preexisting_t": 11, "set_print_sql": 11, "set_value_column_a": [3, 4], "set_value_column_b": [3, 4], "sever": [1, 7], "sex": [1, 3, 12], "sex_equ": 3, "sex_region_interact": 3, "sex_regionf_interact": 3, "sexmatch": 3, "sgen": [1, 3, 13], "share": 7, "short": 3, "should": [1, 3, 4, 9, 10, 11], "show": 11, "showf": 11, "shut": 13, "sibl": 0, "sign": 1, "signific": 0, "similar": [1, 3], "simpli": [0, 3], "simplifi": 1, "sinc": [0, 3, 9], "singl": [0, 4, 11, 13], "size": 1, "skip": [3, 4, 8], "sm_bpl": 1, "sm_namefrst": 1, "sm_sex": 1, "small": 3, "smaller": 7, "smallest": 3, "sn_bpl": 1, "sn_namefrst": 1, "sn_sex": 1, "so": [0, 1, 2, 3, 6, 9, 13], "some": [0, 1, 2, 3, 4, 7, 8, 9, 11], "someth": 11, "sometim": [3, 9], "somewhat": 2, "soundex": 5, "sourc": [1, 5, 8, 11, 13], "source_column_a": 1, "source_column_b": 1, "sp": 1, "sp_caution": [1, 3, 13], "sp_interacted_jw_sp": [3, 13], "sp_pre": [3, 13], "space": [0, 3, 4, 12], "span": 13, "spark": [1, 3, 4, 6, 9, 10, 11, 13], "sparkfactori": 11, "sparksess": 11, "sparkxgbclassifi": 9, "special": [1, 9], "specif": [1, 3, 9, 11], "specifi": [0, 1, 3, 7, 8, 10, 11, 12], "split": [3, 4, 8, 9, 10, 13], "split_by_id_a": [3, 13], "sploc": 1, "spous": 0, "spouse_birthyr": 1, "spouse_bpl": 1, "sql": [0, 1, 2, 3, 4, 5, 8, 11], "sql_condit": 3, "sqrt": 9, "squar": 1, "sr": [0, 3], "srace": [3, 10, 13], "stage": 3, "standard": [0, 1, 12], "start": [0, 12], "state": [1, 7], "state_1900_1910_distances_km": 1, 
"state_dist": 1, "state_distance_lookup": 1, "statecode1": 1, "statecode2": 1, "statefip": [0, 1, 3], "statefip_h": 3, "step": [0, 1, 3, 7], "stepmom": 1, "still": 12, "stop": [0, 9], "street": [0, 1], "street_jw": [1, 3, 13], "street_unstd": 12, "strictli": 10, "string": [0, 1, 3, 4, 8, 9, 11, 12], "strip": [0, 8], "structur": [2, 3, 7], "sub": [1, 2, 3], "subhead": 12, "subset": [3, 12], "substitut": [5, 8], "substitution_column": [3, 8, 12], "substitution_fil": [3, 12], "substitutions_street_abbrev": 12, "subtract": 1, "suffix": 0, "suppli": 12, "support": [0, 2, 3, 8, 9], "suppos": [0, 2, 3], "sure": [2, 3, 6, 11], "surnam": 1, "surround": 0, "suspect": [1, 7], "swap": 12, "synaps": 9, "syntax": 2, "system": [6, 9], "t": [0, 1, 3, 4, 13], "tabl": [1, 2, 3, 5, 8, 11, 13], "table_nam": 1, "tablenam": 11, "tailor": 7, "take": [0, 1, 2, 3, 4, 8, 11], "taken": [0, 1], "task": [2, 3, 5, 7, 9, 13], "task_nam": 11, "tell": [1, 3, 4], "templat": 1, "ten": 1, "term": 3, "test": [3, 8, 13], "text": 11, "than": [0, 1, 2, 3, 9], "thei": [0, 1, 2, 3, 8, 11], "them": [0, 1, 3, 8], "thi": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13], "thing": 3, "those": [1, 3], "though": 1, "thought": 8, "three": [2, 3, 8], "threshold": [2, 3, 9, 13], "threshold_expr": [2, 3], "threshold_ratio": [3, 9, 13], "through": [6, 8, 9, 10, 11], "thu": 1, "time": [0, 3, 8, 9, 11], "to_icpsrctyi": 1, "to_statefip": 1, "togeth": [0, 1, 3], "toml": [3, 7, 11], "tool": [6, 7], "top": [1, 5], "topic": 11, "total": 8, "train": [1, 5, 7, 9, 11], "training_data": [3, 11], "training_data_1900_1910": 13, "training_data_1900_1910_hlink_featur": 13, "training_data_subset": 3, "training_featur": [11, 13], "training_result": 13, "transform": [1, 3, 5, 7, 8], "transformer_typ": [3, 10], "treat": [3, 10], "tree": [5, 9], "true": [1, 3, 4, 8, 9, 10, 12, 13], "try": [3, 9], "tune": [7, 13], "tutori": [3, 11], "two": [0, 1, 2, 3, 4, 7, 8, 10, 11, 13], "type": [0, 3, 4, 5, 8, 9, 11, 12, 13], "typic": [3, 8], "u": 0, "ugli": 2, "under": [1, 3], "underli": 9, "understand": 7, "union": 5, "uniqu": 3, "unit": 7, "univers": [1, 5, 8], "unknown": 1, "unlik": 2, "unrel": 1, "unstabl": [9, 11], "up": [1, 3, 11, 12], "updat": 13, "upper": 1, "upper_threshold": 3, "uppercas": 0, "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13], "us1900": 3, "us1900m_usa": 3, "us1910": 3, "us1910m_usa": 3, "usag": [3, 4, 5, 11], "use_potential_matches_featur": 13, "use_training_data_featur": [3, 8, 13], "user": [1, 11], "usual": [0, 8, 9, 13], "util": 4, "v": 3, "valu": [0, 1, 2, 3, 4, 9, 10, 11, 12], "var": [1, 3], "vari": [0, 3, 4], "variabl": [0, 1, 3, 13], "variant": 12, "varieti": 8, "variou": 9, "ve": 13, "vector": [3, 10], "verbos": 2, "veri": [4, 8, 13], "version": [0, 6, 9, 13], "vi": 3, "via": [6, 7, 9], "view": 9, "vii": 3, "viii": 3, "volumn": 1, "wa": [1, 13], "wai": [1, 2, 3, 6, 11], "want": [0, 1, 3, 11, 13], "washington": 3, "we": [0, 1, 9, 11, 13], "well": 3, "were": 3, "weren": 13, "what": [1, 3, 11, 13], "when": [0, 1, 2, 3, 4, 8, 9, 13], "where": [1, 8, 11, 13], "whether": [1, 3, 12], "which": [0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13], "white": 0, "whitespac": [0, 8], "who": 1, "whole": [0, 7], "whose": [1, 9], "width": 8, "winkler": [1, 10], "within": [1, 3, 7, 11, 12], "without": 9, "word": [0, 5], "work": [0, 1, 3, 6, 8, 11, 13], "workflow": [4, 5], "world": 7, "would": [1, 2, 3, 13], "write": [1, 11, 13], "written": [3, 7], "x": [3, 10, 11], "x_crosswalk": 11, "x_hh_tfam": 11, "x_hh_tfam_2a": 11, "x_hh_tfam_2b": 11, "x_load": 11, "x_parquet_from_csv": 11, 
"x_persist": 11, "x_sql": 11, "x_sqlf": 11, "x_summari": 11, "x_tab": 11, "x_tfam": 11, "x_tfam_raw": 11, "x_union": 11, "xgboost": 5, "y": 10, "year": [0, 1, 3, 4, 5], "year_b": 1, "yet": 11, "you": [0, 1, 2, 3, 6, 9, 11, 12, 13], "your": [2, 3, 4, 6, 8, 9, 11, 13], "yrimmig": 1, "zero": 1}, "titles": ["Column Mappings", "Comparison Features", "Comparisons", "Configuration", "Feature Selection Transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "titleterms": {"": 5, "1": 12, "abs_diff": 1, "add": 1, "add_to_a": 0, "advanc": [0, 3, 13], "after": 13, "aggreg": 1, "alia": 1, "all_equ": 1, "any_equ": 1, "api": 5, "arrai": 4, "array_index": 0, "b_minus_a": 1, "basic": [0, 3], "bigram": 4, "block": 3, "btwn_threshold": 1, "bucket": 10, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "column": [0, 3], "comparison": [1, 2, 3], "concat_to_a": 0, "concat_to_b": 0, "concat_two_col": 0, "condense_strip_whitespac": 0, "config": 3, "configur": [3, 5, 8], "data": [3, 12, 13], "decision_tre": 9, "defin": 2, "differ": 13, "divide_by_int": 0, "document": 5, "either_are_0": 1, "either_are_1": 1, "equal": 1, "equals_as_int": 1, "exact_mult": 1, "exampl": [11, 13], "explor": [8, 13], "export": 13, "extra_children": 1, "f1_match": 1, "f2_match": 1, "fals": 13, "featur": [1, 3, 4, 10, 13], "fetch_a": 1, "fetch_b": 1, "file": 3, "filter": 3, "fn": 13, "fp": 13, "from": 6, "gener": [3, 10, 13], "geo_dist": 1, "get_floor": 0, "gradient_boosted_tre": 9, "gt_threshold": 1, "hit": 1, "hits2": 1, "hlink": [5, 11], "household": [1, 3, 8], "instal": 6, "interact": [10, 11], "introduct": 7, "jaro_winkl": 1, "jaro_winkler_r": 1, "jaro_winkler_street": 1, "jw_max_a": 1, "jw_max_b": 1, "length_b": 1, "level": 3, "librari": 11, "lightgbm": 9, "link": [8, 11, 13], "list": 13, "logistic_regress": 9, "look_at_addl_var": 1, "lower_threshold": 1, "lowercase_strip": 0, "map": [0, 3], "match": [3, 8], "max_jaro_winkl": 1, "maximum_jaro_winkl": 1, "ml": 13, "mode": 11, "model": [3, 8, 9, 13], "multi_jaro_winkler_search": 1, "multipl": [2, 3], "neg": 13, "neither_are_nul": 1, "not_equ": 1, "not_zero_and_not_equ": 1, "ons": 1, "overview": [2, 7, 8], "pipelin": [3, 10], "posit": 13, "potenti": [3, 13], "power": [1, 4], "preprocess": 8, "present_and_equal_categorical_in_univers": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_both_year": 1, "probit": 9, "program": 11, "pypi": 6, "random_forest": 9, "rationalize_name_word": 0, "regex": 12, "rel_jaro_winkl": 1, "relat": 8, "remove_alternate_nam": 0, "remove_one_letter_nam": 0, "remove_prefix": 0, "remove_punctu": 0, "remove_qmark_hyphen": 0, "remove_stop_word": 0, "remove_suffix": 0, "replac": 12, "replace_apostroph": 0, "report": 8, "requir": 6, "reus": 13, "run": 11, "second_gen_imm": 1, "section": 8, "select": [3, 4], "singl": 3, "soundex": 4, "sourc": [3, 6], "split": 0, "sql_condit": [1, 4], "start": 11, "step": [8, 11], "substitut": [3, 12], "substr": 0, "sum": 1, "tabl": 12, "task": [8, 11], "threshold": 1, "time": 1, "top": 3, "train": [3, 8, 13], "transform": [0, 4, 10], "type": [1, 2, 10], "union": 4, "univers": 3, "upper_threshold": 1, "us": 11, "usag": 0, "welcom": 5, "when_valu": 0, "word": 12, "workflow": [11, 13], "xgboost": 9, "year": 13}})
\ No newline at end of file
diff --git a/docs/use_examples.html b/docs/use_examples.html
index 94e3c6a..919e602 100644
--- a/docs/use_examples.html
+++ b/docs/use_examples.html
@@ -209,6 +209,7 @@ hlink
- Pipeline Features
- Substitutions
- Models
+- Model Exploration
diff --git a/hlink/linking/core/classifier.py b/hlink/linking/core/classifier.py
index 0efaf38..d9543ed 100644
--- a/hlink/linking/core/classifier.py
+++ b/hlink/linking/core/classifier.py
@@ -13,6 +13,20 @@
)
import hlink.linking.transformers.rename_prob_column
+try:
+ import synapse.ml.lightgbm
+except ModuleNotFoundError:
+ _lightgbm_available = False
+else:
+ _lightgbm_available = True
+
+try:
+ import xgboost.spark
+except ModuleNotFoundError:
+ _xgboost_available = False
+else:
+ _xgboost_available = True
+
def choose_classifier(model_type, params, dep_var):
"""Returns a classifier and a post_classification transformer given model type and params.
@@ -96,7 +110,49 @@ def choose_classifier(model_type, params, dep_var):
post_transformer = (
hlink.linking.transformers.rename_prob_column.RenameProbColumn()
)
-
+ elif model_type == "lightgbm":
+ if not _lightgbm_available:
+ raise ModuleNotFoundError(
+ "To use the 'lightgbm' model type, you need to install the synapseml "
+ "Python package, which provides LightGBM-Spark integration, and "
+ "its dependencies. Try installing hlink with the lightgbm extra: "
+ "\n\n pip install hlink[lightgbm]"
+ )
+ params_without_threshold = {
+ key: val
+ for key, val in params.items()
+ if key not in {"threshold", "threshold_ratio"}
+ }
+ classifier = synapse.ml.lightgbm.LightGBMClassifier(
+ **params_without_threshold,
+ featuresCol=features_vector,
+ labelCol=dep_var,
+ probabilityCol="probability_array",
+ )
+ post_transformer = SQLTransformer(
+ statement="SELECT *, parseProbVector(probability_array, 1) as probability FROM __THIS__"
+ )
+ elif model_type == "xgboost":
+ if not _xgboost_available:
+ raise ModuleNotFoundError(
+ "To use the experimental 'xgboost' model type, you need to install "
+ "the xgboost library and its dependencies. Try installing hlink with "
+ "the xgboost extra:\n\n pip install hlink[xgboost]"
+ )
+ params_without_threshold = {
+ key: val
+ for key, val in params.items()
+ if key not in {"threshold", "threshold_ratio"}
+ }
+ classifier = xgboost.spark.SparkXGBClassifier(
+ **params_without_threshold,
+ features_col=features_vector,
+ label_col=dep_var,
+ probability_col="probability_array",
+ )
+ post_transformer = SQLTransformer(
+ statement="SELECT *, parseProbVector(probability_array, 1) as probability FROM __THIS__"
+ )
else:
raise ValueError(
"Model type not recognized! Please check your config, reload, and try again."
diff --git a/hlink/linking/core/pipeline.py b/hlink/linking/core/pipeline.py
index e06edc8..534eccb 100644
--- a/hlink/linking/core/pipeline.py
+++ b/hlink/linking/core/pipeline.py
@@ -12,6 +12,7 @@
Interaction,
)
import hlink.linking.transformers.float_cast_transformer
+from hlink.linking.transformers.rename_vector_attributes import RenameVectorAttributes
import logging
logger = logging.getLogger(__name__)
@@ -130,7 +131,20 @@ def generate_pipeline_stages(conf, ind_vars, tf, tconf):
inputCol=input_col,
outputCol=pipeline_feature["output_column"],
)
+
+ # Spark's Bucketizer adds commas to its output vector slot
+ # names. This causes issues later in the pipeline if the ML
+ # model chosen is LightGBM. So we rename the slots here to
+ # remove the commas. A similar issue happens with
+ # Interaction below; see the comment there for a more
+ # detailed description.
+ remove_commas_from_bucketizer_vector = RenameVectorAttributes(
+ inputCol=bucketizer.getOutputCol(),
+ strsToReplace=[","],
+ replaceWith="",
+ )
pipeline_stages.append(bucketizer)
+ pipeline_stages.append(remove_commas_from_bucketizer_vector)
elif pipeline_feature["transformer_type"] == "interaction":
input_cols = []
@@ -143,7 +157,25 @@ def generate_pipeline_stages(conf, ind_vars, tf, tconf):
inputCols=input_cols,
outputCol=pipeline_feature["output_column"],
)
+
+ # Spark's Interaction creates its output vector attribute names
+ # by concatenating the input column names with colons :. This
+ # works fine for most of the down-pipeline transformers, but
+ # LightGBM cannot run with attribute names that contain colons.
+ # So this custom hlink transformer replaces colons in the vector
+ # attribute names with underscores.
+ #
+ # Without this step, the colons propagate into the attribute
+ # names for the features vector created by the VectorAssembler
+ # and cause an error when training a LightGBM model.
+ remove_colons_from_interaction_vector = RenameVectorAttributes(
+ inputCol=interaction.getOutputCol(),
+ strsToReplace=[":"],
+ replaceWith="_",
+ )
+
pipeline_stages.append(interaction)
+ pipeline_stages.append(remove_colons_from_interaction_vector)
if len(categorical_pipeline_features) > 0:
encoded_output_cols = [
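
For reference, here is a sketch of how the new renaming stage pairs with Interaction, mirroring the stage ordering appended in generate_pipeline_stages() above; the column names are illustrative, not taken from a real config.

```
# Illustrative sketch; the column names here are hypothetical.
from pyspark.ml.feature import Interaction

from hlink.linking.transformers.rename_vector_attributes import RenameVectorAttributes

interaction = Interaction(
    inputCols=["race", "srace"], outputCol="race_interacted_srace"
)
# Interaction names its output slots "race:srace"-style; LightGBM cannot
# handle the colons, so they are replaced with underscores.
remove_colons = RenameVectorAttributes(
    inputCol=interaction.getOutputCol(),
    strsToReplace=[":"],
    replaceWith="_",
)
stages = [interaction, remove_colons]  # the renamer runs directly after Interaction
```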
diff --git a/hlink/linking/training/link_step_save_model_metadata.py b/hlink/linking/training/link_step_save_model_metadata.py
index 00e3922..95191b7 100644
--- a/hlink/linking/training/link_step_save_model_metadata.py
+++ b/hlink/linking/training/link_step_save_model_metadata.py
@@ -3,8 +3,20 @@
# in this project's top-level directory, and also on-line at:
# https://github.com/ipums/hlink
+import logging
+
+from pyspark.sql.types import (
+ FloatType,
+ IntegerType,
+ StringType,
+ StructField,
+ StructType,
+)
+
from hlink.linking.link_step import LinkStep
+logger = logging.getLogger(__name__)
+
class LinkStepSaveModelMetadata(LinkStep):
"""Save metadata about the trained machine learning model.
@@ -36,6 +48,9 @@ def _run(self):
do_get_feature_importances = config[training_conf].get("feature_importances")
if do_get_feature_importances is None or not do_get_feature_importances:
+ logger.info(
+ "Skipping training step 3 - save model metadata since training.feature_importances is not set"
+ )
print(
"Skipping the save model metadata training step. "
"To run this step and save model metadata like feature importances, "
@@ -58,64 +73,115 @@ def _run(self):
raise new_error from e
- # The pipeline model has three stages: vector assembler, classifier, post
- # transformer.
+ # The pipeline model has three stages: vector assembler, model, and post transformer.
vector_assembler = pipeline_model.stages[0]
- classifier = pipeline_model.stages[1]
-
- print("Retrieving model feature importances or coefficients...")
- try:
- feature_imp = classifier.coefficients
- except:
- try:
- feature_imp = classifier.featureImportances
- except:
- print(
- "This model doesn't contain a coefficient or feature importances parameter -- check chosen model type."
- )
- return
- else:
- label = "Feature importances"
- else:
- label = "Coefficients"
-
- column_names = vector_assembler.getInputCols()
- # We need to convert from numpy float64s to Python floats to avoid type
- # issues when creating the DataFrame below.
- feature_importances = [
- float(importance) for importance in feature_imp.toArray()
- ]
+ model = pipeline_model.stages[1]
+ feature_names = vector_assembler.getInputCols()
+ logger.debug(f"Feature names are {feature_names}")
tf_prepped = self.task.spark.table(f"{table_prefix}training_features_prepped")
tf_prepped_schema = dict(tf_prepped.dtypes)
tf_prepped_row = tf_prepped.head()
- # Expand categorical features into multiple columns for display with their
- # respective coefficients / feature importances.
- true_cols = []
- for col in column_names:
+ # Expand categorical features into multiple rows for display with their
+ # respective coefficients / feature importances per category. Non-categorical
+ # features just get one entry that looks like ("feature_name", None).
+ expanded_features: list[tuple[str, int | None]] = []
+ for feature_name in feature_names:
# Columns with type "vector" are categorical and may have more than one coefficient.
# Many of these columns end with "_onehotencoded", and we remove that
# suffix to clean up the column names. Categorical columns created through
# feature interaction will probably not have the "_onehotencoded" suffix,
# so we can't just check for that to find the categorical features.
- data_type = tf_prepped_schema[col]
+ data_type = tf_prepped_schema[feature_name]
if data_type == "vector":
- base_col = col.removesuffix("_onehotencoded")
- num_categories = len(tf_prepped_row[col])
- true_cols.extend((base_col, i) for i in range(num_categories))
+ base_name = feature_name.removesuffix("_onehotencoded")
+ num_categories = len(tf_prepped_row[feature_name])
+ # Categories are numeric, starting at 0.
+ expanded_features.extend(
+ (base_name, category) for category in range(num_categories)
+ )
else:
- base_col = col.removesuffix("_imp")
- true_cols.append((base_col, None))
+ base_name = feature_name.removesuffix("_imp")
+ expanded_features.append((base_name, None))
- true_column_names = [column_name for (column_name, _) in true_cols]
- true_categories = [category for (_, category) in true_cols]
+ model_type = config[training_conf]["chosen_model"]["type"]
+ logger.debug(f"Expanded features with categories are {expanded_features}")
+ logger.debug(f"The model type is '{model_type}'")
+
+ print("Retrieving model feature importances or coefficients...")
+
+ if model_type == "xgboost":
+ raw_weights = model.get_feature_importances("weight")
+ raw_gains = model.get_feature_importances("total_gain")
+ keys = [f"f{index}" for index in range(len(expanded_features))]
+
+ weights = [raw_weights.get(key, 0.0) for key in keys]
+ gains = [raw_gains.get(key, 0.0) for key in keys]
+ label = "Feature importances (weights and gains)"
+
+ importance_columns = [
+ (StructField("weight", FloatType(), nullable=False), weights),
+ (StructField("gain", FloatType(), nullable=False), gains),
+ ]
+ elif model_type == "lightgbm":
+ # The "weight" of a feature is the number of splits it causes.
+ weights = model.getFeatureImportances("split")
+ gains = model.getFeatureImportances("gain")
+ label = "Feature importances (weights and gains)"
+
+ importance_columns = [
+ (StructField("weight", FloatType(), nullable=False), weights),
+ (StructField("gain", FloatType(), nullable=False), gains),
+ ]
+ else:
+ try:
+ feature_imp = model.coefficients
+ except AttributeError:
+ try:
+ feature_imp = model.featureImportances
+ except AttributeError:
+ logger.warning(
+ f"Cannot compute feature importances for model of type '{model_type}'"
+ )
+ print(
+ "This model doesn't contain a coefficient or feature importances parameter -- check chosen model type."
+ )
+ return
+ else:
+ label = "Feature importances"
+ else:
+ label = "Coefficients"
+
+ # We need to convert from numpy float64s to Python floats to avoid type
+ # issues when creating the DataFrame below.
+ feature_importances = [
+ float(importance) for importance in feature_imp.toArray()
+ ]
+
+ importance_columns = [
+ (
+ StructField(
+ "coefficient_or_importance", FloatType(), nullable=False
+ ),
+ feature_importances,
+ ),
+ ]
+
+ logger.debug("Creating the DataFrame and saving it as a table")
+ feature_names, categories = zip(*expanded_features)
+ importance_schema, importance_data = zip(*importance_columns)
features_df = self.task.spark.createDataFrame(
- zip(true_column_names, true_categories, feature_importances, strict=True),
- "feature_name: string, category: int, coefficient_or_importance: double",
+ zip(feature_names, categories, *importance_data, strict=True),
+ StructType(
+ [
+ StructField("feature_name", StringType(), nullable=False),
+ StructField("category", IntegerType(), nullable=True),
+ *importance_schema,
+ ]
+ ),
).sort("feature_name", "category")
-
feature_importances_table = (
f"{self.task.table_prefix}training_feature_importances"
)
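
One subtlety in the xgboost branch above: get_feature_importances() returns a dict keyed "f0", "f1", ... in vector-slot order, and features that never caused a split are simply absent, hence the .get(key, 0.0) defaulting. A small self-contained sketch of that mapping, with hypothetical importance values:

```
# Hypothetical raw output; real values come from the trained booster.
raw_weights = {"f0": 12.0, "f2": 3.0}

# One "f{index}" key per expanded feature, exactly as in the step above.
keys = [f"f{index}" for index in range(4)]
weights = [raw_weights.get(key, 0.0) for key in keys]
assert weights == [12.0, 0.0, 3.0, 0.0]  # unused features default to 0.0
```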
diff --git a/hlink/linking/transformers/rename_vector_attributes.py b/hlink/linking/transformers/rename_vector_attributes.py
new file mode 100644
index 0000000..e407c4d
--- /dev/null
+++ b/hlink/linking/transformers/rename_vector_attributes.py
@@ -0,0 +1,96 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+# https://github.com/ipums/hlink
+
+import logging
+
+from pyspark import keyword_only
+from pyspark.ml import Transformer
+from pyspark.ml.param.shared import HasInputCol, Param, Params, TypeConverters
+from pyspark.sql import DataFrame
+
+logger = logging.getLogger(__name__)
+
+
+class RenameVectorAttributes(Transformer, HasInputCol):
+ """
+ A custom transformer which renames the attributes or "slot names" of a
+ given input column of type vector. This is helpful when you don't have
+ complete control over the names of the attributes when they are created,
+ but you still need them to look a certain way.
+
+ For example, LightGBM can't handle vector attributes with colons in their
+ names. But the Spark Interaction class creates vector attributes named with
+ colons. So we need to rename the attributes and remove the colons before
+ passing the feature vector to LightGBM for training.
+ """
+
+ strsToReplace: Param[list[str]] = Param(
+ Params._dummy(),
+ "strsToReplace",
+ "Substrings to replace in the vector attribute names.",
+ typeConverter=TypeConverters.toListString,
+ )
+
+ replaceWith: Param[str] = Param(
+ Params._dummy(),
+ "replaceWith",
+ "The string to replace removed substrings.",
+ typeConverter=TypeConverters.toString,
+ )
+
+ @keyword_only
+ def __init__(
+ self,
+ *,
+ inputCol: str | None = None,
+ strsToReplace: list[str] | None = None,
+ replaceWith: str | None = None,
+ ) -> None:
+ super(RenameVectorAttributes, self).__init__()
+ kwargs = self._input_kwargs
+ self.setParams(**kwargs)
+
+ @keyword_only
+ def setParams(
+ self,
+ *,
+ inputCol: str | None = None,
+ strsToReplace: list[str] | None = None,
+ replaceWith: str | None = None,
+ ):
+ kwargs = self._input_kwargs
+ return self._set(**kwargs)
+
+ def _transform(self, dataset: DataFrame) -> DataFrame:
+ input_col = self.getInputCol()
+ to_replace = self.getOrDefault("strsToReplace")
+ replacement_str = self.getOrDefault("replaceWith")
+ metadata = dataset.schema[input_col].metadata
+
+ logger.debug(
+ f"Renaming the attributes of vector column '{input_col}': "
+ f"replacing {to_replace} with '{replacement_str}'"
+ )
+
+ if "attrs" in metadata["ml_attr"]:
+ attributes_by_type = metadata["ml_attr"]["attrs"]
+
+ # The attributes are grouped by type, which may be numeric, binary, or
+ # nominal. We don't care about the type here; we'll just rename all of
+ # the attributes.
+ for _attribute_type, attributes in attributes_by_type.items():
+ for attribute in attributes:
+ for substring in to_replace:
+ attribute["name"] = attribute["name"].replace(
+ substring, replacement_str
+ )
+ elif "vals" in metadata["ml_attr"]:
+ values = metadata["ml_attr"]["vals"]
+
+ for index in range(len(values)):
+ for substring in to_replace:
+ values[index] = values[index].replace(substring, replacement_str)
+
+ return dataset.withMetadata(input_col, metadata)
diff --git a/hlink/spark/session.py b/hlink/spark/session.py
index b9014af..a03db15 100644
--- a/hlink/spark/session.py
+++ b/hlink/spark/session.py
@@ -17,6 +17,17 @@
StructType,
)
+# SynapseML is a package which provides LightGBM-Spark integration for hlink.
+# It's an optional dependency. When it is installed, we need to download an
+# additional Scala library by setting some Spark configurations. When it's not
+# installed, we avoid downloading the extra library since it won't be useful.
+try:
+ import synapse.ml # noqa: F401
+except ModuleNotFoundError:
+ _synapse_ml_available = False
+else:
+ _synapse_ml_available = True
+
class SparkConnection:
"""Handles initialization of spark session and connection to local cluster."""
@@ -59,6 +70,16 @@ def spark_conf(self, executor_cores, executor_memory, driver_memory, cores):
if os.path.isfile(jar_path):
conf = conf.set("spark.jars", jar_path)
+
+ # A bit of a kludge. We set spark.jars.repositories here in the configuration,
+ # but then we actually download the SynapseML Scala jar later in connect().
+ # See the comment on the ADD JAR SQL statement in connect() for some more
+ # context.
+ #
+ # SynapseML used to be named MMLSpark, thus the URL.
+ if _synapse_ml_available:
+ conf.set("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
+
return conf
def local(self, cores=1, executor_memory="10G"):
@@ -96,6 +117,18 @@ def connect(
session.catalog.setCurrentDatabase(self.db_name)
session.sparkContext.setCheckpointDir(str(self.tmp_dir))
self._register_udfs(session)
+
+ # If the SynapseML Python package is available, include the Scala
+ # package as well. Note that we have to pin to a particular version of
+ # the Scala package here.
+ #
+ # Despite what the documentation for the spark.jars.packages config setting
+ # says, this is the only way that I have found to include this jar for both
+ # the driver and the executors. Setting spark.jars.packages caused errors
+ # because the executors could not find the jar.
+ if _synapse_ml_available:
+ session.sql("ADD JAR ivy://com.microsoft.azure:synapseml_2.12:1.0.8")
+
return session
def _register_udfs(self, session):
diff --git a/hlink/tests/core/classifier_test.py b/hlink/tests/core/classifier_test.py
new file mode 100644
index 0000000..e95b878
--- /dev/null
+++ b/hlink/tests/core/classifier_test.py
@@ -0,0 +1,32 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+# https://github.com/ipums/hlink
+
+from hlink.linking.core.classifier import choose_classifier
+from hlink.tests.markers import requires_lightgbm, requires_xgboost
+
+
+@requires_lightgbm
+def test_choose_classifier_supports_lightgbm() -> None:
+ params = {
+ "maxDepth": 7,
+ "numIterations": 5,
+ }
+
+ classifier, _post_transformer = choose_classifier("lightgbm", params, "match")
+ assert classifier.getLabelCol() == "match"
+
+
+@requires_xgboost
+def test_choose_classifier_supports_xgboost():
+ """
+ If the xgboost module is installed, then choose_classifier() supports a model
+ type of "xgboost".
+ """
+ params = {
+ "max_depth": 2,
+ "eta": 0.5,
+ }
+ classifier, _post_transformer = choose_classifier("xgboost", params, "match")
+ assert classifier.getLabelCol() == "match"
diff --git a/hlink/tests/integration_score_with_trained_models_test.py b/hlink/tests/integration_score_with_trained_models_test.py
index 993a497..cbb9c99 100644
--- a/hlink/tests/integration_score_with_trained_models_test.py
+++ b/hlink/tests/integration_score_with_trained_models_test.py
@@ -3,6 +3,8 @@
# in this project's top-level directory, and also on-line at:
# https://github.com/ipums/hlink
+from hlink.tests.markers import requires_lightgbm, requires_xgboost
+
def test_apply_chosen_model_RF(
spark,
@@ -492,6 +494,98 @@ def test_apply_chosen_model_probit(
)
+@requires_lightgbm
+def test_apply_chosen_model_lightgbm(
+ spark,
+ matching,
+ training,
+ training_conf,
+ datasource_training_input,
+ state_dist_path,
+ spark_test_tmp_dir_path,
+ potential_matches_path,
+):
+ training_data_path, prepped_df_a_path, prepped_df_b_path = datasource_training_input
+
+ training_conf["comparison_features"] = [
+ {
+ "alias": "regionf",
+ "column_name": "region",
+ "comparison_type": "fetch_a",
+ "categorical": True,
+ },
+ {
+ "alias": "namelast_jw",
+ "column_name": "namelast",
+ "comparison_type": "jaro_winkler",
+ },
+ {
+ "alias": "state_distance",
+ "key_count": 1,
+ "column_name": "bpl",
+ "comparison_type": "geo_distance",
+ "loc_a": "statecode1",
+ "loc_b": "statecode2",
+ "distance_col": "dist",
+ "table_name": "state_distances_lookup",
+ "distances_file": state_dist_path,
+ },
+ ]
+
+ training_conf["training"]["dataset"] = training_data_path
+ training_conf["training"]["dependent_var"] = "match"
+ training_conf["training"]["independent_vars"] = [
+ "namelast_jw",
+ "regionf",
+ "state_distance",
+ ]
+ training_conf["training"]["chosen_model"] = {
+ "type": "lightgbm",
+ "maxDepth": 10,
+ "numIterations": 5,
+ "minDataInLeaf": 1,
+ "threshold": 0.5,
+ }
+ training_conf["training"]["score_with_model"] = True
+ training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
+ training_conf["drop_data_from_scored_matches"] = True
+
+ prepped_df_a = spark.read.csv(prepped_df_a_path, header=True, inferSchema=True)
+ prepped_df_b = spark.read.csv(prepped_df_b_path, header=True, inferSchema=True)
+ potential_matches = spark.read.csv(
+ potential_matches_path, header=True, inferSchema=True
+ )
+
+ prepped_df_a.write.mode("overwrite").saveAsTable("prepped_df_a")
+ prepped_df_b.write.mode("overwrite").saveAsTable("prepped_df_b")
+ potential_matches.write.mode("overwrite").saveAsTable("potential_matches")
+
+ training.run_all_steps()
+ matching.run_step(2)
+
+ potential_matches_df = spark.table("scored_potential_matches").toPandas()
+
+ # Check one case that we expect to be a match and one case that we expect
+ # not to be a match.
+ should_be_match = potential_matches_df.query(
+ "id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'"
+ )
+ assert (
+ should_be_match.shape[0] == 1
+ ), "expected exactly one potential match for 0202928A"
+ assert should_be_match["probability"].iloc[0] >= 0.5
+ assert should_be_match["prediction"].iloc[0] == 1
+
+ should_not_be_match = potential_matches_df.query(
+ "id_b == '033FD0FA-C523-42B5-976A-751E830F7021'"
+ )
+ assert (
+ should_not_be_match.shape[0] == 1
+ ), "expected exactly one potential match for 033FD0FA"
+ assert should_not_be_match["probability"].iloc[0] <= 0.5
+ assert should_not_be_match["prediction"].iloc[0] == 0
+
+
def test_step_3_apply_chosen_model_logistic_regression(
spark,
training_conf,
@@ -859,6 +953,100 @@ def test_step_3_apply_chosen_model_boosted_trees(
)
+@requires_xgboost
+def test_apply_chosen_model_xgboost(
+ spark,
+ training,
+ matching,
+ training_conf,
+ datasource_training_input,
+ potential_matches_path,
+ state_dist_path,
+ spark_test_tmp_dir_path,
+):
+ training_data_path, prepped_df_a_path, prepped_df_b_path = datasource_training_input
+ training_conf["comparison_features"] = [
+ {
+ "alias": "regionf",
+ "column_name": "region",
+ "comparison_type": "fetch_a",
+ "categorical": True,
+ },
+ {
+ "alias": "namelast_jw",
+ "column_name": "namelast",
+ "comparison_type": "jaro_winkler",
+ },
+ {
+ "alias": "state_distance",
+ "key_count": 1,
+ "column_name": "bpl",
+ "comparison_type": "geo_distance",
+ "loc_a": "statecode1",
+ "loc_b": "statecode2",
+ "distance_col": "dist",
+ "table_name": "state_distances_lookup",
+ "distances_file": state_dist_path,
+ },
+ ]
+
+ training_conf["training"]["dataset"] = training_data_path
+ training_conf["training"]["dependent_var"] = "match"
+ training_conf["training"]["independent_vars"] = [
+ "namelast_jw",
+ "regionf",
+ "state_distance",
+ ]
+ training_conf["training"]["chosen_model"] = {
+ "type": "xgboost",
+ "max_depth": 5,
+ "eta": 0.5,
+ "threshold": 0.5,
+ "threshold_ratio": 1.3,
+ }
+ training_conf["training"]["score_with_model"] = True
+ training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
+ training_conf["drop_data_from_scored_matches"] = True
+
+ prepped_df_a = spark.read.csv(prepped_df_a_path, header=True, inferSchema=True)
+ prepped_df_b = spark.read.csv(prepped_df_b_path, header=True, inferSchema=True)
+ potential_matches = spark.read.csv(
+ potential_matches_path, header=True, inferSchema=True
+ )
+ prepped_df_a.write.mode("overwrite").saveAsTable("prepped_df_a")
+ prepped_df_b.write.mode("overwrite").saveAsTable("prepped_df_b")
+ potential_matches.write.mode("overwrite").saveAsTable("potential_matches")
+
+ training.run_all_steps()
+ matching.run_step(2)
+
+ potential_matches_df = spark.table("scored_potential_matches").toPandas()
+
+ # Check one case that we expect to be a match and one case that we expect not
+ # to be a match.
+ should_be_match = potential_matches_df.query(
+ "id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'"
+ )
+ assert (
+ should_be_match.shape[0] == 1
+ ), "expected exactly one potential match for 0202928A"
+ assert should_be_match["probability"].iloc[0] >= 0.5
+ assert should_be_match["prediction"].iloc[0] == 1
+
+ # In the real world, this would probably be a match, depending on how the
+ # additional features looked. But we've included so few training features
+ # for our test model that small differences in names can really hurt a
+ # potential match's chances of being classified as a match.
+ should_not_be_match = potential_matches_df.query(
+ "id_b == '033FD0FA-C523-42B5-976A-751E830F7021'"
+ )
+ assert (
+ should_not_be_match.shape[0] == 1
+ ), "expected exactly one potential match for 033FD0FA"
+ assert should_not_be_match["probability"].iloc[0] <= 0.5
+ assert should_not_be_match["prediction"].iloc[0] == 0
+
+
def test_step_3_apply_chosen_model_RF_threshold(
spark,
training_conf,
diff --git a/hlink/tests/markers.py b/hlink/tests/markers.py
new file mode 100644
index 0000000..1616cd5
--- /dev/null
+++ b/hlink/tests/markers.py
@@ -0,0 +1,31 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+# https://github.com/ipums/hlink
+
+import pytest
+
+try:
+ import xgboost # noqa: F401
+except ModuleNotFoundError:
+ xgboost_available = False
+else:
+ xgboost_available = True
+
+try:
+ import synapse.ml.lightgbm # noqa: F401
+except ModuleNotFoundError:
+ lightgbm_available = False
+else:
+ lightgbm_available = True
+
+
+requires_xgboost = pytest.mark.skipif(
+ not xgboost_available, reason="requires the xgboost library"
+)
+"""For tests which require the xgboost library. This checks whether xgboost is
+installed and skips the test if it is not."""
+
+requires_lightgbm = pytest.mark.skipif(
+ not lightgbm_available, reason="requires the lightgbm library"
+)
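+"""For tests which require the LightGBM integration. This checks whether the
+synapse.ml.lightgbm module is importable and skips the test if it is not."""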
diff --git a/hlink/tests/training_test.py b/hlink/tests/training_test.py
index 5c07b67..ebc8a77 100644
--- a/hlink/tests/training_test.py
+++ b/hlink/tests/training_test.py
@@ -6,6 +6,7 @@
import pytest
from pyspark.ml import Pipeline
import hlink.linking.core.pipeline as pipeline_core
+from hlink.tests.markers import requires_lightgbm, requires_xgboost
def test_all_steps(
@@ -431,6 +432,239 @@ def test_step_3_with_probit_model(
)
+@requires_lightgbm
+def test_step_3_with_lightgbm_model(
+ spark, training, training_conf, datasource_training_input, state_dist_path
+):
+ training_data_path, prepped_df_a_path, prepped_df_b_path = datasource_training_input
+ training_conf["comparison_features"] = [
+ {
+ "alias": "regionf",
+ "column_name": "region",
+ "comparison_type": "fetch_a",
+ "categorical": True,
+ },
+ {
+ "alias": "namelast_jw",
+ "column_name": "namelast",
+ "comparison_type": "jaro_winkler",
+ },
+ {
+ "alias": "state_distance",
+ "key_count": 1,
+ "column_name": "bpl",
+ "comparison_type": "geo_distance",
+ "loc_a": "statecode1",
+ "loc_b": "statecode2",
+ "distance_col": "dist",
+ "table_name": "state_distances_lookup",
+ "distances_file": state_dist_path,
+ },
+ ]
+ training_conf["training"]["dataset"] = training_data_path
+ training_conf["training"]["dependent_var"] = "match"
+ training_conf["training"]["independent_vars"] = [
+ "namelast_jw",
+ "regionf",
+ "state_distance",
+ ]
+ training_conf["training"]["chosen_model"] = {
+ "type": "lightgbm",
+ "maxDepth": 7,
+ "numIterations": 5,
+ "minDataInLeaf": 1,
+ "threshold": 0.5,
+ }
+ training_conf["training"]["score_with_model"] = True
+ training_conf["training"]["feature_importances"] = True
+
+ prepped_df_a = spark.read.csv(prepped_df_a_path, header=True, inferSchema=True)
+ prepped_df_b = spark.read.csv(prepped_df_b_path, header=True, inferSchema=True)
+
+ prepped_df_a.write.mode("overwrite").saveAsTable("prepped_df_a")
+ prepped_df_b.write.mode("overwrite").saveAsTable("prepped_df_b")
+
+ training.run_all_steps()
+
+ importances_df = spark.table("training_feature_importances")
+ assert importances_df.columns == [
+ "feature_name",
+ "category",
+ "weight",
+ "gain",
+ ]
+
+
+@requires_lightgbm
+def test_lightgbm_with_interacted_features(
+ spark, training, training_conf, datasource_training_input
+):
+ """
+ Interacted features add colons to vector attribute names, which cause
+ problems for LightGBM. Hlink handles this automatically by renaming the
+ vector attributes to remove the colons before invoking LightGBM.
+ """
+ training_data_path, prepped_df_a_path, prepped_df_b_path = datasource_training_input
+ training_conf["comparison_features"] = [
+ {
+ "alias": "regionf",
+ "column_name": "region",
+ "comparison_type": "fetch_a",
+ "categorical": True,
+ },
+ {
+ "alias": "namelast_jw",
+ "column_name": "namelast",
+ "comparison_type": "jaro_winkler",
+ },
+ ]
+ training_conf["pipeline_features"] = [
+ {
+ "input_columns": ["regionf", "namelast_jw"],
+ "output_column": "regionf_interacted_namelast_jw",
+ "transformer_type": "interaction",
+ }
+ ]
+ training_conf["training"]["dataset"] = training_data_path
+ training_conf["training"]["dependent_var"] = "match"
+ training_conf["training"]["independent_vars"] = [
+ "namelast_jw",
+ "regionf",
+ "regionf_interacted_namelast_jw",
+ ]
+ training_conf["training"]["chosen_model"] = {
+ "type": "lightgbm",
+ "maxDepth": 7,
+ "numIterations": 5,
+ "minDataInLeaf": 1,
+ "threshold": 0.5,
+ }
+ training_conf["training"]["score_with_model"] = True
+ training_conf["training"]["feature_importances"] = True
+ prepped_df_a = spark.read.csv(prepped_df_a_path, header=True, inferSchema=True)
+ prepped_df_b = spark.read.csv(prepped_df_b_path, header=True, inferSchema=True)
+
+ prepped_df_a.write.mode("overwrite").saveAsTable("prepped_df_a")
+ prepped_df_b.write.mode("overwrite").saveAsTable("prepped_df_b")
+
+ training.run_all_steps()
+
+ importances_df = spark.table("training_feature_importances")
+ assert importances_df.columns == [
+ "feature_name",
+ "category",
+ "weight",
+ "gain",
+ ]
+
+
+@requires_lightgbm
+def test_lightgbm_with_bucketized_features(
+ spark, training, training_conf, datasource_training_input
+):
+ """
+ Bucketized features add commas to vector attribute names, which cause
+ problems for LightGBM. Hlink handles this automatically by renaming the
+ vector attributes to remove the commas before invoking LightGBM.
+ """
+ training_data_path, prepped_df_a_path, prepped_df_b_path = datasource_training_input
+ training_conf["comparison_features"] = [
+ {
+ "alias": "namelast_jw",
+ "column_name": "namelast",
+ "comparison_type": "jaro_winkler",
+ },
+ ]
+ training_conf["pipeline_features"] = [
+ {
+ "input_column": "namelast_jw",
+ "output_column": "namelast_jw_buckets",
+ "transformer_type": "bucketizer",
+ "categorical": True,
+ "splits": [0.0, 0.33, 0.67, 1.0],
+ }
+ ]
+ training_conf["training"]["dataset"] = training_data_path
+ training_conf["training"]["dependent_var"] = "match"
+ training_conf["training"]["independent_vars"] = [
+ "namelast_jw_buckets",
+ ]
+ training_conf["training"]["chosen_model"] = {
+ "type": "lightgbm",
+ "threshold": 0.5,
+ }
+ training_conf["training"]["score_with_model"] = True
+ training_conf["training"]["feature_importances"] = True
+
+ prepped_df_a = spark.read.csv(prepped_df_a_path, header=True, inferSchema=True)
+ prepped_df_b = spark.read.csv(prepped_df_b_path, header=True, inferSchema=True)
+
+ prepped_df_a.write.mode("overwrite").saveAsTable("prepped_df_a")
+ prepped_df_b.write.mode("overwrite").saveAsTable("prepped_df_b")
+
+ training.run_all_steps()
+
+ importances_df = spark.table("training_feature_importances")
+ assert importances_df.columns == [
+ "feature_name",
+ "category",
+ "weight",
+ "gain",
+ ]
+
+
+@requires_xgboost
+def test_step_3_with_xgboost_model(
+ spark, training, training_conf, datasource_training_input
+):
+ training_data_path, prepped_df_a_path, prepped_df_b_path = datasource_training_input
+ training_conf["comparison_features"] = [
+ {
+ "alias": "regionf",
+ "column_name": "region",
+ "comparison_type": "fetch_a",
+ "categorical": True,
+ },
+ {
+ "alias": "namelast_jw",
+ "column_name": "namelast",
+ "comparison_type": "jaro_winkler",
+ },
+ ]
+ training_conf["training"]["dataset"] = training_data_path
+ training_conf["training"]["dependent_var"] = "match"
+ training_conf["training"]["independent_vars"] = ["namelast_jw", "regionf"]
+ training_conf["training"]["chosen_model"] = {
+ "type": "xgboost",
+ "max_depth": 2,
+ "eta": 0.5,
+ "threshold": 0.7,
+ "threshold_ratio": 1.3,
+ }
+ training_conf["training"]["score_with_model"] = True
+ training_conf["training"]["feature_importances"] = True
+
+ spark.read.csv(prepped_df_a_path, header=True, inferSchema=True).write.mode(
+ "overwrite"
+ ).saveAsTable("prepped_df_a")
+ spark.read.csv(prepped_df_b_path, header=True, inferSchema=True).write.mode(
+ "overwrite"
+ ).saveAsTable("prepped_df_b")
+
+ training.run_step(0)
+ training.run_step(1)
+ training.run_step(2)
+ training.run_step(3)
+
+ importances_df = spark.table("training_feature_importances")
+ assert importances_df.columns == [
+ "feature_name",
+ "category",
+ "weight",
+ "gain",
+ ]
+
+
def test_step_3_requires_table(training_conf, training):
training_conf["training"]["feature_importances"] = True
with pytest.raises(RuntimeError, match="Missing input tables"):
diff --git a/hlink/tests/transformers_test.py b/hlink/tests/transformers_test.py
new file mode 100644
index 0000000..1b3c6aa
--- /dev/null
+++ b/hlink/tests/transformers_test.py
@@ -0,0 +1,77 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+# https://github.com/ipums/hlink
+
+from pyspark.sql import SparkSession
+from pyspark.ml.feature import Bucketizer, VectorAssembler
+
+from hlink.linking.transformers.rename_vector_attributes import RenameVectorAttributes
+
+
+def test_rename_vector_attributes(spark: SparkSession) -> None:
+ df = spark.createDataFrame(
+ [[0.0, 1.0], [1.0, 2.0], [3.0, 4.0]], schema=["A", "regionf_0:namelast_jw"]
+ )
+
+ assembler = VectorAssembler(
+ inputCols=["A", "regionf_0:namelast_jw"], outputCol="vectorized"
+ )
+ remove_colons = RenameVectorAttributes(
+ inputCol="vectorized", strsToReplace=[":"], replaceWith="_"
+ )
+ transformed = remove_colons.transform(assembler.transform(df))
+
+ # Write the table out and reload it to confirm that the metadata changes persist
+ transformed.write.mode("overwrite").saveAsTable("transformed")
+ df = spark.table("transformed")
+
+ attrs = df.schema["vectorized"].metadata["ml_attr"]["attrs"]["numeric"]
+ attr_names = [attr["name"] for attr in attrs]
+ assert attr_names == ["A", "regionf_0_namelast_jw"]
+
+
+def test_rename_vector_attributes_multiple_replacements(spark: SparkSession) -> None:
+ df = spark.createDataFrame(
+ [[1, 2], [3, 4]], schema=["column1*has*stars", "column2*multiple/symbols"]
+ )
+
+ assembler = VectorAssembler(
+ inputCols=["column1*has*stars", "column2*multiple/symbols"], outputCol="vector"
+ )
+ rename_attrs = RenameVectorAttributes(
+ inputCol="vector", strsToReplace=["*", "/"], replaceWith=""
+ )
+ transformed = rename_attrs.transform(assembler.transform(df))
+
+ # Write the table out and reload it to confirm that the metadata changes persist
+ transformed.write.mode("overwrite").saveAsTable("transformed")
+ df = spark.table("transformed")
+
+ attrs = df.schema["vector"].metadata["ml_attr"]["attrs"]["numeric"]
+ attr_names = [attr["name"] for attr in attrs]
+ assert attr_names == ["column1hasstars", "column2multiplesymbols"]
+
+
+def test_rename_vector_attributes_on_bucketized_feature(spark: SparkSession) -> None:
+ df = spark.createDataFrame(
+ [[0.1], [0.7], [0.2], [0.5], [0.8], [0.2], [0.3]], schema=["namelast_jw"]
+ )
+
+ bucketizer = Bucketizer(
+ inputCol="namelast_jw",
+ outputCol="namelast_jw_buckets",
+ splits=[0.0, 0.33, 0.67, 1.0],
+ )
+ rename_attrs = RenameVectorAttributes(
+ inputCol="namelast_jw_buckets", strsToReplace=[","], replaceWith=""
+ )
+ transformed = rename_attrs.transform(bucketizer.transform(df))
+
+ # Write the table out and reload it to confirm that the metadata changes persist
+ transformed.write.mode("overwrite").saveAsTable("transformed")
+ output_df = spark.table("transformed")
+
+ # Bucketized columns store nominal metadata under "vals" instead of "attrs"
+ values = output_df.schema["namelast_jw_buckets"].metadata["ml_attr"]["vals"]
+ assert values == ["0.0 0.33", "0.33 0.67", "0.67 1.0"]
diff --git a/pyproject.toml b/pyproject.toml
index efa43f0..e31d9e1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,13 @@ dev = [
"sphinx==8.1.3",
"recommonmark==0.7.1",
]
+lightgbm = [
+ "synapseml>=1.0"
+]
+xgboost = [
+ "xgboost>=2.0",
+ "pyarrow>=4.0",
+]
[project.scripts]
hlink = "hlink.scripts.main:cli"
diff --git a/sphinx-docs/models.md b/sphinx-docs/models.md
index a1c9996..31c9eb6 100644
--- a/sphinx-docs/models.md
+++ b/sphinx-docs/models.md
@@ -1,53 +1,80 @@
# Models
-These are models available to be used in the model evaluation, training, and household training link tasks.
-
-* Attributes for all models:
- * `threshold` -- Type: `float`. Alpha threshold (model hyperparameter).
- * `threshold_ratio` -- Type: `float`. Beta threshold (de-duplication distance ratio).
- * Any parameters available in the model as defined in the Spark documentation can be passed as params using the label given in the Spark docs. Commonly used parameters are listed below with descriptive explanations from the Spark docs.
+These are the machine learning models available for use in the model evaluation
+and training tasks and in their household counterparts.
+
+There are a few attributes available for all models.
+
+* `type` -- Type: `string`. The name of the model type. The available model
+ types are listed below.
+* `threshold` -- Type: `float`. The "alpha threshold". This is the probability
+ score required for a potential match to be labeled a match. `0 ≤ threshold ≤
+ 1`.
+* `threshold_ratio` -- Type: `float`. The threshold ratio or "beta threshold".
+ This applies to records which have multiple potential matches when
+ `training.decision` is set to `"drop_duplicate_with_threshold_ratio"`. For
+ each record, only potential matches which have the highest probability, have
+ a probability of at least `threshold`, *and* whose probabilities are at least
+ `threshold_ratio` times larger than the second-highest probability are
+ matches. This is sometimes called the "de-duplication distance ratio". `1 ≤
+ threshold_ratio < ∞`.
+
+In addition, any model parameters documented in a model type's Spark
+documentation can be passed as parameters to the model through hlink's
+`training.chosen_model` and `training.model_exploration` configuration
+sections.
+
+Here is an example `training.chosen_model` configuration. The `type`,
+`threshold`, and `threshold_ratio` attributes are hlink-specific. `maxDepth` is
+a parameter to the random forest model which hlink passes through to the
+underlying Spark classifier.
+
+```toml
+[training.chosen_model]
+type = "random_forest"
+threshold = 0.2
+threshold_ratio = 1.2
+maxDepth = 5
+```
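+
+To make the two thresholds concrete, here is a small sketch of the decision
+rule described above, using the values from the example configuration. This is
+illustrative Python only, not hlink's implementation; the function and its
+inputs are invented for the example.
+
+```python
+def is_match(probabilities, threshold=0.2, threshold_ratio=1.2):
+    """Decide whether the highest-scoring potential match for one record
+    should be labeled a match."""
+    # Pad with 0.0 so records with a single potential match still work.
+    best, second_best, *_rest = sorted(probabilities, reverse=True) + [0.0]
+    if best < threshold:
+        return False
+    # The best score must also beat the runner-up by the threshold ratio.
+    return best >= threshold_ratio * second_best
+
+print(is_match([0.6, 0.8]))  # True: 0.8 >= 0.2 and 0.8 >= 1.2 * 0.6
+print(is_match([0.7, 0.8]))  # False: 0.8 < 1.2 * 0.7 = 0.84
+```
+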
## random_forest
-Uses [pyspark.ml.classification.RandomForestClassifier](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.RandomForestClassifier.html). Returns probability as an array.
+Uses [pyspark.ml.classification.RandomForestClassifier](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.RandomForestClassifier.html).
* Parameters:
* `maxDepth` -- Type: `int`. Maximum depth of the tree. Spark default value is 5.
* `numTrees` -- Type: `int`. The number of trees to train. Spark default value is 20, must be >= 1.
* `featureSubsetStrategy` -- Type: `string`. Per the Spark docs: "The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n]."
-```
-model_parameters = {
- type = "random_forest",
- maxDepth = 5,
- numTrees = 75,
- featureSubsetStrategy = "sqrt",
- threshold = 0.15,
- threshold_ratio = 1.0
-}
+```toml
+[training.chosen_model]
+type = "random_forest"
+threshold = 0.15
+threshold_ratio = 1.0
+maxDepth = 5
+numTrees = 75
+featureSubsetStrategy = "sqrt"
```
## probit
Uses [pyspark.ml.regression.GeneralizedLinearRegression](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.GeneralizedLinearRegression.html) with `family="binomial"` and `link="probit"`.
-```
-model_parameters = {
- type = "probit",
- threshold = 0.85,
- threshold_ratio = 1.2
-}
+```toml
+[training.chosen_model]
+type = "probit"
+threshold = 0.85
+threshold_ratio = 1.2
```
## logistic_regression
Uses [pyspark.ml.classification.LogisticRegression](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.LogisticRegression.html)
-```
-chosen_model = {
- type = "logistic_regression",
- threshold = 0.5,
- threshold_ratio = 1.0
-}
+```toml
+[training.chosen_model]
+type = "logistic_regression"
+threshold = 0.5
+threshold_ratio = 1.0
```
## decision_tree
@@ -59,13 +86,14 @@ Uses [pyspark.ml.classification.DecisionTreeClassifier](https://spark.apache.org
* `minInstancesPerNode` -- Type `int`. Per the Spark docs: "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1."
* `maxBins` -- Type: `int`. Per the Spark docs: "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature."
-```
-chosen_model = {
- type = "decision_tree",
- maxDepth = 6,
- minInstancesPerNode = 2,
- maxBins = 4
-}
+```toml
+[training.chosen_model]
+type = "decision_tree"
+threshold = 0.5
+threshold_ratio = 1.5
+maxDepth = 6
+minInstancesPerNode = 2
+maxBins = 4
```
## gradient_boosted_trees
@@ -77,13 +105,94 @@ Uses [pyspark.ml.classification.GBTClassifier](https://spark.apache.org/docs/lat
* `minInstancesPerNode` -- Type `int`. Per the Spark docs: "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1."
* `maxBins` -- Type: `int`. Per the Spark docs: "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature."
+```toml
+[training.chosen_model]
+type = "gradient_boosted_trees"
+threshold = 0.7
+threshold_ratio = 1.3
+maxDepth = 4
+minInstancesPerNode = 1
+maxBins = 6
+```
+
+## xgboost
+
+*Added in version 3.8.0.*
+
+XGBoost is an alternative, high-performance implementation of gradient boosting.
+It uses [xgboost.spark.SparkXGBClassifier](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.spark.SparkXGBClassifier).
+Since the XGBoost-PySpark integration which the xgboost Python package provides
+is currently unstable, support for the xgboost model type is disabled in hlink
+by default. hlink will stop with an error if you try to use this model type
+without enabling support for it. To enable support for xgboost, install hlink
+with the `xgboost` extra.
+
```
-chosen_model = {
- type = "gradient_boosted_trees",
- maxDepth = 4,
- minInstancesPerNode = 1,
- maxBins = 6,
- threshold = 0.7,
- threshold_ratio = 1.3
-}
+pip install hlink[xgboost]
+```
+
+This installs the xgboost package and its Python dependencies. Depending on
+your machine and operating system, you may also need to install the libomp
+library, which is another dependency of xgboost. xgboost should raise a helpful
+error if it detects that you need to install libomp.
+
+You can view a list of xgboost's parameters
+[here](https://xgboost.readthedocs.io/en/latest/parameter.html).
+
+```toml
+[training.chosen_model]
+type = "xgboost"
+threshold = 0.8
+threshold_ratio = 1.5
+max_depth = 5
+eta = 0.5
+gamma = 0.05
+```
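+
+Note that xgboost parameters keep the snake_case names used by the xgboost
+library itself (such as `max_depth` and `eta`), in contrast to the camelCase
+parameter names used by the Spark-based models.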
+
+## lightgbm
+
+*Added in version 3.8.0.*
+
+LightGBM is another alternative, high-performance implementation of gradient
+boosting. It uses
+[synapse.ml.lightgbm.LightGBMClassifier](https://mmlspark.blob.core.windows.net/docs/1.0.8/pyspark/synapse.ml.lightgbm.html#module-synapse.ml.lightgbm.LightGBMClassifier).
+`synapse.ml` is a library of assorted PySpark integrations, including a bridge
+between PySpark and the C++ LightGBM library.
+
+LightGBM requires some additional Scala libraries that hlink does not usually
+install, so support for the lightgbm model type is disabled in hlink by default.
+hlink will stop with an error if you try to use this model type without
+enabling support for it. To enable support for lightgbm, install hlink with the
+`lightgbm` extra.
+
+```
+pip install hlink[lightgbm]
+```
+
+This installs the synapseml package and its Python dependencies. Depending on
+your machine and operating system, you may also need to install the libomp
+library, which LightGBM itself depends on. If you encounter errors when
+training a lightgbm model, try installing libomp if you do not already have
+it.
+
+LightGBM has an enormous number of parameters. Many of these can be set
+as usual in hlink via the [LightGBMClassifier
+class](https://mmlspark.blob.core.windows.net/docs/1.0.8/pyspark/synapse.ml.lightgbm.html#module-synapse.ml.lightgbm.LightGBMClassifier).
+Others are available through the special `passThroughArgs` parameter, which
+passes additional parameters through to the C++ library. You can see a full
+list of the supported parameters
+[here](https://lightgbm.readthedocs.io/en/latest/Parameters.html).
+
+```toml
+[training.chosen_model]
+type = "lightgbm"
+# hlink's threshold and threshold_ratio
+threshold = 0.8
+threshold_ratio = 1.5
+# LightGBMClassifier supports these parameters (and many more).
+maxDepth = 5
+learningRate = 0.5
+# LightGBMClassifier does not directly support this parameter,
+# so we have to send it to the C++ library with passThroughArgs.
+passThroughArgs = "force_row_wise=true"
```