diff --git a/docs/conf.py b/docs/conf.py
index 3d1ba9e..79aba65 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -40,7 +40,8 @@
     'skopt',
     'optuna',
     'scikitplot',
-    'scikitplot.metrics']
+    'scikitplot.metrics',
+    'xgboost']
 
 # -- Project information -----------------------------------------------------
 
diff --git a/docs/index.rst b/docs/index.rst
index 7a44bbc..3038c1a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -55,6 +55,7 @@ And the best thing is you can extend it yourself or... tell us to do it for you
    hpo.utils
    bots.telegram_bot
    monitoring.lightgbm
+   monitoring.xgboost
    monitoring.fastai
    monitoring.metrics
    monitoring.fairness
diff --git a/docs/user_guide/monitoring/xgboost.rst b/docs/user_guide/monitoring/xgboost.rst
new file mode 100644
index 0000000..64a77a9
--- /dev/null
+++ b/docs/user_guide/monitoring/xgboost.rst
@@ -0,0 +1,6 @@
+XGBoost
+=======
+
+.. automodule:: neptunecontrib.monitoring.xgboost_monitor
+   :members:
+   :show-inheritance:
diff --git a/neptunecontrib/hpo/utils.py b/neptunecontrib/hpo/utils.py
index 1bcce82..706f08e 100644
--- a/neptunecontrib/hpo/utils.py
+++ b/neptunecontrib/hpo/utils.py
@@ -166,7 +166,7 @@ def bayes2skopt(bayes_opt):
     format.
 
     Args:
-        bayes_opt(`bayes_opt.Bbyesian_optimization.BayesianOptimization`): BayesianOptimization instance.
+        bayes_opt(`bayes_opt.bayesian_optimization.BayesianOptimization`): BayesianOptimization instance.
 
     Returns:
         `scipy.optimize.optimize.OptimizeResult`: Converted OptimizeResult.
@@ -184,10 +184,8 @@
         results = hp_utils.bayes2skopt(bayes_optimization)
 
     Note:
-        Since skopt is always minimizng and BayesianOptimization is maximizing, the objective function values are
-        converted into negatives for consistency::
-
-
+        Since skopt is always minimizing and BayesianOptimization is maximizing, the objective function values are
+        converted into negatives for consistency.
     """
     results = bayes_opt.space.res()
    results = [{'target': trial['target'], **trial['params']} for trial in results]
diff --git a/neptunecontrib/monitoring/xgboost_monitor.py b/neptunecontrib/monitoring/xgboost_monitor.py
new file mode 100644
index 0000000..cb058d3
--- /dev/null
+++ b/neptunecontrib/monitoring/xgboost_monitor.py
@@ -0,0 +1,233 @@
+#
+# Copyright (c) 2020, Neptune Labs Sp. z o.o.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import tempfile
+
+import neptune
+import xgboost as xgb
+
+
+def neptune_callback(log_model=True,
+                     log_importance=True,
+                     max_num_features=None,
+                     log_tree=(0,),
+                     experiment=None,
+                     **kwargs):
+    """XGBoost callback for Neptune experiments.
+
+    This is an XGBoost callback that automatically logs training and evaluation metrics, the feature importance
+    chart, visualized trees, and the trained Booster to Neptune.
+
+    Check the Neptune documentation for the `full example `_.
+
+    Make sure you create an experiment before you start XGBoost training, using ``neptune.create_experiment()``
+    (`check our docs `_).
+
+    Integration works with ``xgboost>=0.82``.
+
+    Tip:
+        Use this `Google Colab `_ to try it without further ado.
+
+    Args:
+        log_model (:obj:`bool`, optional, default is ``True``):
+            | Log the booster to Neptune after the last boosting iteration.
+            | If you run xgb.cv, log the booster for all folds.
+        log_importance (:obj:`bool`, optional, default is ``True``):
+            | Log feature importance to Neptune as an image after the last boosting iteration.
+            | Specify the number of features using the ``max_num_features`` parameter below.
+            | If you run xgb.cv, log feature importance for each fold's booster.
+        max_num_features (:obj:`int`, optional, default is ``None``):
+            | Plot the top ``max_num_features`` features on the importance plot.
+            | If ``None``, plot all features.
+        log_tree (:obj:`list` of :obj:`int`, optional, default is ``(0,)``):
+            | Log the specified trees to Neptune as images after the last boosting iteration.
+            | If you run xgb.cv, log the specified trees for each fold's booster.
+            | The default is to log the first tree (index 0).
+            | If ``None``, do not log any tree.
+        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
+            | For advanced users only. Pass a Neptune
+              `Experiment `_
+              object if you want to control which experiment the data is logged to.
+            | If ``None``, log to the currently active (most recently created) experiment.
+        kwargs:
+            Parametrize the XGBoost functions used in this callback:
+            `xgboost.plot_importance `_
+            and `xgboost.to_graphviz `_.
+
+    Returns:
+        :obj:`callback`, a function that you can pass directly to the XGBoost callbacks list, for example to
+        ``xgboost.cv()``
+        (`see docs `_)
+        or ``XGBClassifier.fit()``
+        (`check docs `_).
+
+    Note:
+        If you use early stopping, make sure to log the model, feature importance, and trees on your own.
+        Neptune logs these artifacts only after the last iteration, which you may not reach due to early stopping.
+
+    Examples:
+        ``xgb.train`` examples
+
+        .. code:: python3
+
+            # basic usage
+            xgb.train(param, dtrain, num_round, watchlist,
+                      callbacks=[neptune_callback()])
+
+            # do not log model
+            xgb.train(param, dtrain, num_round, watchlist,
+                      callbacks=[neptune_callback(log_model=False)])
+
+            # log top 5 features' importance chart
+            xgb.train(param, dtrain, num_round, watchlist,
+                      callbacks=[neptune_callback(max_num_features=5)])
+
+        ``xgb.cv`` examples
+
+        .. code:: python3
+
+            # log 5 trees for each fold's booster
+            xgb.cv(param, dtrain, num_boost_round=num_round, nfold=7,
+                   callbacks=[neptune_callback(log_tree=[0,1,2,3,4])])
+
+            # log only metrics
+            xgb.cv(param, dtrain, num_boost_round=num_round, nfold=7,
+                   callbacks=[neptune_callback(log_model=False,
+                                               log_importance=False,
+                                               max_num_features=None,
+                                               log_tree=None)])
+
+            # log top 3 features for each fold's booster
+            xgb.cv(param, dtrain, num_boost_round=num_round, nfold=7,
+                   callbacks=[neptune_callback(log_model=False,
+                                               max_num_features=3,
+                                               log_tree=None)])
+
+        ``sklearn`` API examples
+
+        .. code:: python3
+
+            # basic usage with early stopping
+            xgb.XGBRegressor().fit(X_train, y_train,
+                                   early_stopping_rounds=10,
+                                   eval_metric=['mae', 'rmse', 'rmsle'],
+                                   eval_set=[(X_test, y_test)],
+                                   callbacks=[neptune_callback()])
+
+            # do not log model
+            clf = xgb.XGBRegressor()
+            clf.fit(X_train, y_train,
+                    eval_metric=['mae', 'rmse', 'rmsle'],
+                    eval_set=[(X_test, y_test)],
+                    callbacks=[neptune_callback(log_model=False)])
+            y_pred = clf.predict(X_test)
+
+            # log 8 trees
+            reg = xgb.XGBRegressor(**params)
+            reg.fit(X_train, y_train,
+                    eval_metric=['mae', 'rmse', 'rmsle'],
+                    eval_set=[(X_test, y_test)],
+                    callbacks=[neptune_callback(log_tree=[0,1,2,3,4,5,6,7])])
+    """
+    if experiment:
+        _exp = experiment
+    else:
+        try:
+            neptune.get_experiment()
+            _exp = neptune
+        except neptune.exceptions.NoExperimentContext:
+            msg = 'No currently running Neptune experiment. \n'\
+                  'To start logging to Neptune, create an experiment using `neptune.create_experiment()`. \n'\
+                  'More info in the documentation: '\
+                  '.'
+            raise neptune.exceptions.NeptuneException(msg)
+
+    assert isinstance(log_model, bool),\
+        'log_model must be bool, got {} instead. Check log_model parameter.'.format(type(log_model))
+    assert isinstance(log_importance, bool),\
+        'log_importance must be bool, got {} instead. Check log_importance parameter.'.format(type(log_importance))
+    if max_num_features is not None:
+        assert isinstance(max_num_features, int),\
+            'max_num_features must be int, got {} instead. ' \
+            'Check max_num_features parameter.'.format(type(max_num_features))
+    if log_tree is not None:
+        if isinstance(log_tree, tuple):
+            log_tree = list(log_tree)
+        assert isinstance(log_tree, list),\
+            'log_tree must be list of int, got {} instead. Check log_tree parameter.'.format(type(log_tree))
+
+    def callback(env):
+        # Log metrics after each boosting iteration
+        for item in env.evaluation_result_list:
+            if len(item) == 2:  # train case
+                _exp.log_metric(item[0], item[1])
+            if len(item) == 3:  # cv case
+                _exp.log_metric('{}-mean'.format(item[0]), item[1])
+                _exp.log_metric('{}-std'.format(item[0]), item[2])
+
+        # Log booster, end of training
+        if env.iteration + 1 == env.end_iteration and log_model:
+            if env.cvfolds:  # cv case
+                for i, cvpack in enumerate(env.cvfolds):
+                    _log_model(cvpack.bst, 'cv-fold-{}-bst.model'.format(i), _exp)
+            else:  # train case
+                _log_model(env.model, 'bst.model', _exp)
+
+        # Log feature importance, end of training
+        if env.iteration + 1 == env.end_iteration and log_importance:
+            if env.cvfolds:  # cv case
+                for i, cvpack in enumerate(env.cvfolds):
+                    _log_importance(cvpack.bst, max_num_features, _exp, title='cv-fold-{}'.format(i), **kwargs)
+            else:  # train case
+                _log_importance(env.model, max_num_features, _exp, **kwargs)
+
+        # Log trees, end of training
+        if env.iteration + 1 == env.end_iteration and log_tree:
+            if env.cvfolds:
+                for j, cvpack in enumerate(env.cvfolds):
+                    _log_trees(cvpack.bst, log_tree, 'trees-cv-fold-{}'.format(j), _exp, **kwargs)
+            else:
+                _log_trees(env.model, log_tree, 'trees', _exp, **kwargs)
+    return callback
+
+
+def _log_model(booster, name, npt):
+    with tempfile.TemporaryDirectory(dir='.') as d:
+        path = os.path.join(d, name)
+        booster.save_model(path)
+        npt.log_artifact(path)
+
+
+def _log_importance(booster, max_num_features, npt, **kwargs):
+    importance = xgb.plot_importance(booster, max_num_features=max_num_features, **kwargs)
+    npt.log_image('feature_importance', importance.figure)
+
+
+def _log_trees(booster, tree_list, img_name, npt, **kwargs):
+    with tempfile.TemporaryDirectory(dir='.') as d:
+        for i in tree_list:
+            file_name = 'tree_{}'.format(i)
+            tree = xgb.to_graphviz(booster=booster, num_trees=i, **kwargs)
+            tree.render(filename=file_name, directory=d, view=False, format='png')
+            npt.log_image(img_name,
+                          os.path.join(d, '{}.png'.format(file_name)),
+                          image_name=file_name)
diff --git a/setup.py b/setup.py
index 8e2a7b3..4541228 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@ def main():
         'bots': ['python-telegram-bot'],
         'hpo': ['scikit-optimize>=0.5.2', 'scipy'],
         'monitoring': ['scikit-optimize>=0.5.2', 'sacred>=0.7.5', 'scikit-learn>=0.21.3',
-                       'scikit-plot>=0.3.7', 'seaborn>=0.8.1', 'aif360>=0.2.1'],
+                       'scikit-plot>=0.3.7', 'seaborn>=0.8.1', 'aif360>=0.2.1', 'xgboost>=0.82'],
         'versioning': ['boto3', 'numpy'],
         'viz': ['altair>=2.3.0', 'hiplot>=0.1.5'],
     }
@@ -23,7 +23,7 @@ def main():
 
     setup(
         name='neptune-contrib',
-        version='0.17.0',
+        version='0.18.0',
         description='Neptune.ai contributions library',
         author='neptune.ai',
         support='contact@neptune.ai',
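
For reference, a minimal end-to-end sketch of how the callback added in this patch is used with ``xgb.train``. This is an illustration only: the project name ``my_workspace/sandbox`` and the experiment name are hypothetical placeholders, and it assumes ``neptune-client`` is installed with a valid ``NEPTUNE_API_TOKEN`` in the environment.

    import neptune
    import xgboost as xgb
    from sklearn.datasets import load_diabetes
    from sklearn.model_selection import train_test_split

    from neptunecontrib.monitoring.xgboost_monitor import neptune_callback

    # Prepare a small regression dataset
    data = load_diabetes()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # The callback requires an active experiment, so create one first.
    # 'my_workspace/sandbox' is a placeholder -- use your own project.
    neptune.init('my_workspace/sandbox')
    neptune.create_experiment(name='xgb-monitor-example')

    params = {'max_depth': 5, 'eta': 0.5, 'objective': 'reg:squarederror'}
    watchlist = [(dtest, 'test'), (dtrain, 'train')]

    # Metrics are logged on every boosting iteration; the booster, the
    # feature importance chart, and tree 0 are logged after the last one.
    xgb.train(params, dtrain, num_boost_round=20, evals=watchlist,
              callbacks=[neptune_callback()])

    neptune.stop()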