XGBoost integration (#78)
* integration v1 created

* Preparing docs

* Warning fix

* Added docstrings and check if experiment exist

* Modified docstrings

* Links corrections for better readability

* Updated docstrings and logic

* Updated docstrings

* Log model works with cv

* Fixes in log model, log importance work with cv, metrics logger fixes

* Log trees works with cv

* Added cv examples

* minors

* Added sklearn API examples

* Dropped dask ToDo

* Fixing dependencies

* Updated version

* Fixing travis errors

* Added gif with UI tour, docs fix

* dropped gif from here

* Put link to colab on the foreground

* Added option to pass neptune exp
Kamil A. Kaczmarek authored Apr 10, 2020
1 parent 4562263 commit 991f8cf
Showing 6 changed files with 247 additions and 8 deletions.
3 changes: 2 additions & 1 deletion docs/conf.py
@@ -40,7 +40,8 @@
     'skopt',
     'optuna',
     'scikitplot',
-    'scikitplot.metrics']
+    'scikitplot.metrics',
+    'xgboost']
 
 # -- Project information -----------------------------------------------------
 
1 change: 1 addition & 0 deletions docs/index.rst
@@ -55,6 +55,7 @@ And the best thing is you can extend it yourself or... tell us to do it for you
    hpo.utils <user_guide/hpo/utils>
    bots.telegram_bot <user_guide/bots/telegram_bot>
    monitoring.lightgbm <user_guide/monitoring/lightgbm>
+   monitoring.xgboost <user_guide/monitoring/xgboost>
    monitoring.fastai <user_guide/monitoring/fastai>
    monitoring.metrics <user_guide/monitoring/metrics>
    monitoring.fairness <user_guide/monitoring/fairness>
6 changes: 6 additions & 0 deletions docs/user_guide/monitoring/xgboost.rst
@@ -0,0 +1,6 @@
XGBoost
=======

.. automodule:: neptunecontrib.monitoring.xgboost_monitor
    :members:
    :show-inheritance:
8 changes: 3 additions & 5 deletions neptunecontrib/hpo/utils.py
@@ -166,7 +166,7 @@ def bayes2skopt(bayes_opt):
         format.
 
     Args:
-        bayes_opt(`bayes_opt.Bbyesian_optimization.BayesianOptimization`): BayesianOptimization instance.
+        bayes_opt(`bayes_opt.Bayesian_optimization.BayesianOptimization`): BayesianOptimization instance.
 
     Returns:
         `scipy.optimize.optimize.OptimizeResult`: Converted OptimizeResult.
@@ -184,10 +184,8 @@ def bayes2skopt(bayes_opt):
         results = hp_utils.bayes2skopt(bayes_optimization)
 
     Note:
-        Since skopt is always minimizng and BayesianOptimization is maximizing, the objective function values are
-        converted into negatives for consistency::
+        Since skopt is always minimizing and BayesianOptimization is maximizing, the objective function values are
+        converted into negatives for consistency.
     """
     results = bayes_opt.space.res()
     results = [{'target': trial['target'], **trial['params']} for trial in results]
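To make the sign convention concrete, here is a minimal sketch (the trial values are hypothetical, not from this commit):

# BayesianOptimization maximizes while skopt minimizes, so targets are negated.
trials = [{'target': 0.9, 'x': 1.0}, {'target': 0.7, 'x': 2.0}]  # hypothetical trials
func_vals = [-trial['target'] for trial in trials]               # [-0.9, -0.7]
assert min(func_vals) == -max(trial['target'] for trial in trials)  # best stays best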
233 changes: 233 additions & 0 deletions neptunecontrib/monitoring/xgboost_monitor.py
@@ -0,0 +1,233 @@
#
# Copyright (c) 2020, Neptune Labs Sp. z o.o.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import tempfile

import neptune
import xgboost as xgb


def neptune_callback(log_model=True,
                     log_importance=True,
                     max_num_features=None,
                     log_tree=(0,),
                     experiment=None,
                     **kwargs):
"""XGBoost callback for Neptune experiments.
This is XGBoost callback that automatically logs training and evaluation metrics, feature importance chart,
visualized trees and trained Booster to Neptune.
Check Neptune documentation for the `full example <https://docs.neptune.ai/integrations/xgboost.html>`_.
Make sure you created an experiment before you start XGBoost training using ``neptune.create_experiment()``
(`check our docs <https://docs.neptune.ai/neptune-client/docs/project.html
#neptune.projects.Project.create_experiment>`_).
Integration works with ``xgboost>=0.82``.
Tip:
Use this `Google Colab <https://colab.research.google.com/github/neptune-ai/neptune-colab-examples
/blob/master/xgboost-integration.ipynb>`_ to try it without further ado.
Args:
log_model (:obj:`bool`, optional, default is ``True``):
| Log booster to Neptune after last boosting iteration.
| If you run xgb.cv, log booster for all folds.
log_importance (:obj:`bool`, optional, default is ``True``):
| Log feature importance to Neptune as image after last boosting iteration.
| Specify number of features using ``max_num_features`` parameter below.
| If you run xgb.cv, log feature importance for each folds' booster.
max_num_features (:obj:`int`, optional, default is ``None``):
| Plot top ``max_num_features`` features on the importance plot.
| If ``None``, plot all features.
log_tree (:obj:`list` of :obj:`int`, optional, default is ``[1,]``):
| Log specified trees to Neptune as images after last boosting iteration.
| If you run xgb.cv, log specified trees for each folds' booster.
| Default is to log first tree.
| If ``None``, do not log any tree.
experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
| For advanced users only. Pass Neptune
`Experiment <https://docs.neptune.ai/neptune-client/docs/experiment.html#neptune.experiments.Experiment>`_
object if you want to control to which experiment data is logged.
| If ``None``, log to currently active, and most recent experiment.
kwargs:
Parametrize XGBoost functions used in this callback:
`xgboost.plot_importance <https://xgboost.readthedocs.io/en/latest/python/python_api.html
?highlight=plot_tree#xgboost.plot_importance>`_
and `xgboost.to_graphviz <https://xgboost.readthedocs.io/en/latest/python/python_api.html
?highlight=plot_tree#xgboost.to_graphviz>`_.
Returns:
:obj:`callback`, function that you can pass directly to the XGBoost callbacks list, for example to the
``xgboost.cv()``
(`see docs <https://xgboost.readthedocs.io/en/latest/python/python_api.html?highlight=plot_tree#xgboost.cv>`_)
or ``XGBClassifier.fit()``
(`check docs <https://xgboost.readthedocs.io/en/latest/python/python_api.html?highlight=plot_tree
#xgboost.XGBClassifier.fit>`_).
Note:
If you use early stopping, make sure to log model, feature importance and trees on your own.
Neptune logs these artifacts only after last iteration, which you may not reach because of early stop.
Examples:
``xgb.train`` examples
.. code:: python3
# basic usage
xgb.train(param, dtrain, num_round, watchlist,
callbacks=[neptune_callback()])
# do not log model
xgb.train(param, dtrain, num_round, watchlist,
callbacks=[neptune_callback(log_model=False)])
# log top 5 features' importance chart
xgb.train(param, dtrain, num_round, watchlist,
callbacks=[neptune_callback(max_num_features=5)])
``xgb.cv`` examples
.. code:: python3
# log 5 trees per each folds' booster
xgb.cv(param, dtrain, num_boost_round=num_round, nfold=7,
callbacks=neptune_callback(log_tree=[0,1,2,3,4]))
# log only metrics
xgb.cv(param, dtrain, num_boost_round=num_round, nfold=7,
callbacks=[neptune_callback(log_model=False,
log_importance=False,
max_num_features=None,
log_tree=None)])
# log top 5 features per each folds' booster
xgb.cv(param, dtrain, num_boost_round=num_round, nfold=7,
callbacks=[neptune_callback(log_model=False,
max_num_features=3,
log_tree=None)])
``sklearn`` API examples
.. code:: python3
# basic usage with early stopping
xgb.XGBRegressor().fit(X_train, y_train,
early_stopping_rounds=10,
eval_metric=['mae', 'rmse', 'rmsle'],
eval_set=[(X_test, y_test)],
callbacks=[neptune_callback()])
# do not log model
clf = xgb.XGBRegressor()
clf.fit(X_train, y_train,
eval_metric=['mae', 'rmse', 'rmsle'],
eval_set=[(X_test, y_test)],
callbacks=[neptune_callback(log_model=False)])
y_pred = clf.predict(X_test)
# log 8 trees
reg = xgb.XGBRegressor(**params)
reg.fit(X_train, y_train,
eval_metric=['mae', 'rmse', 'rmsle'],
eval_set=[(X_test, y_test)],
callbacks=[neptune_callback(log_tree=[0,1,2,3,4,5,6,7])])
"""
    if experiment:
        _exp = experiment
    else:
        try:
            neptune.get_experiment()
            _exp = neptune
        except neptune.exceptions.NoExperimentContext:
            msg = 'No currently running Neptune experiment. \n'\
                  'To start logging to Neptune, create an experiment by using: `neptune.create_experiment()`. \n'\
                  'More info in the documentation: '\
                  '<https://docs.neptune.ai/neptune-client/docs/project.html' \
                  '#neptune.projects.Project.create_experiment>.'
            raise neptune.exceptions.NeptuneException(msg)

    assert isinstance(log_model, bool),\
        'log_model must be bool, got {} instead. Check log_model parameter.'.format(type(log_model))
    assert isinstance(log_importance, bool),\
        'log_importance must be bool, got {} instead. Check log_importance parameter.'.format(type(log_importance))
    if max_num_features is not None:
        assert isinstance(max_num_features, int),\
            'max_num_features must be int, got {} instead. ' \
            'Check max_num_features parameter.'.format(type(max_num_features))
    if log_tree is not None:
        if isinstance(log_tree, tuple):
            log_tree = list(log_tree)
        assert isinstance(log_tree, list),\
            'log_tree must be list of int, got {} instead. Check log_tree parameter.'.format(type(log_tree))

    def callback(env):
        # Log metrics after iteration
        for item in env.evaluation_result_list:
            if len(item) == 2:  # train case
                _exp.log_metric(item[0], item[1])
            if len(item) == 3:  # cv case
                _exp.log_metric('{}-mean'.format(item[0]), item[1])
                _exp.log_metric('{}-std'.format(item[0]), item[2])
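        # For reference (an assumption based on xgboost's callback protocol, not
        # part of this commit): entries in env.evaluation_result_list look like
        # ('train-rmse', 10.5) pairs in the train case and like
        # ('test-rmse', 10.5, 0.7) mean/std triples in the cv case, which is
        # what the len() checks above distinguish.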

        # Log booster, end of training
        if env.iteration + 1 == env.end_iteration and log_model:
            if env.cvfolds:  # cv case
                for i, cvpack in enumerate(env.cvfolds):
                    _log_model(cvpack.bst, 'cv-fold-{}-bst.model'.format(i), _exp)
            else:  # train case
                _log_model(env.model, 'bst.model', _exp)

        # Log feature importance, end of training
        if env.iteration + 1 == env.end_iteration and log_importance:
            if env.cvfolds:  # cv case
                for i, cvpack in enumerate(env.cvfolds):
                    _log_importance(cvpack.bst, max_num_features, _exp, title='cv-fold-{}'.format(i), **kwargs)
            else:  # train case
                _log_importance(env.model, max_num_features, _exp, **kwargs)

        # Log trees, end of training
        if env.iteration + 1 == env.end_iteration and log_tree:
            if env.cvfolds:  # cv case
                for j, cvpack in enumerate(env.cvfolds):
                    _log_trees(cvpack.bst, log_tree, 'trees-cv-fold-{}'.format(j), _exp, **kwargs)
            else:  # train case
                _log_trees(env.model, log_tree, 'trees', _exp, **kwargs)
    return callback


def _log_model(booster, name, npt):
    with tempfile.TemporaryDirectory(dir='.') as d:
        path = os.path.join(d, name)
        booster.save_model(path)
        npt.log_artifact(path)


def _log_importance(booster, max_num_features, npt, **kwargs):
    importance = xgb.plot_importance(booster, max_num_features=max_num_features, **kwargs)
    npt.log_image('feature_importance', importance.figure)


def _log_trees(booster, tree_list, img_name, npt, **kwargs):
    with tempfile.TemporaryDirectory(dir='.') as d:
        for i in tree_list:
            file_name = 'tree_{}'.format(i)
            tree = xgb.to_graphviz(booster=booster, num_trees=i, **kwargs)
            tree.render(filename=file_name, directory=d, view=False, format='png')
            npt.log_image(img_name,
                          os.path.join(d, '{}.png'.format(file_name)),
                          image_name=file_name)
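For context, a minimal end-to-end sketch of how this callback is meant to be wired into training (the project name, data and parameters below are hypothetical placeholders; the pattern follows the docstring examples above):

import neptune
import numpy as np
import xgboost as xgb

from neptunecontrib.monitoring.xgboost_monitor import neptune_callback

neptune.init('my-workspace/my-project')  # hypothetical project; assumes NEPTUNE_API_TOKEN is set
neptune.create_experiment(name='xgb-demo')

X, y = np.random.rand(100, 5), np.random.rand(100)  # toy data
dtrain = xgb.DMatrix(X[:80], label=y[:80])
dtest = xgb.DMatrix(X[80:], label=y[80:])
params = {'max_depth': 3, 'eta': 0.3, 'objective': 'reg:squarederror'}

# Metrics are logged on every iteration; the booster, the importance chart and
# tree images are logged after the last one (tree rendering relies on graphviz).
xgb.train(params, dtrain, num_boost_round=20,
          evals=[(dtest, 'test')],
          callbacks=[neptune_callback(log_tree=[0, 1])])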
4 changes: 2 additions & 2 deletions setup.py
@@ -9,7 +9,7 @@ def main():
         'bots': ['python-telegram-bot'],
         'hpo': ['scikit-optimize>=0.5.2', 'scipy'],
         'monitoring': ['scikit-optimize>=0.5.2', 'sacred>=0.7.5', 'scikit-learn>=0.21.3',
-                       'scikit-plot>=0.3.7', 'seaborn>=0.8.1', 'aif360>=0.2.1'],
+                       'scikit-plot>=0.3.7', 'seaborn>=0.8.1', 'aif360>=0.2.1', 'xgboost>=0.82'],
         'versioning': ['boto3', 'numpy'],
         'viz': ['altair>=2.3.0', 'hiplot>=0.1.5'],
     }
@@ -23,7 +23,7 @@ def main():
 
     setup(
         name='neptune-contrib',
-        version='0.17.0',
+        version='0.18.0',
         description='Neptune.ai contributions library',
         author='neptune.ai',
         support='[email protected]',
