Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Lambert W x F distributions to XGBoostLSS #65

Open
wants to merge 28 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
5823920
add lambertw gaussian and lambertw weibull
gmgeorg Nov 19, 2023
cf112f8
Add file
gmgeorg Nov 19, 2023
960f356
add weibull file
gmgeorg Nov 19, 2023
9433561
clean up gaussian
gmgeorg Nov 19, 2023
5b315e5
add gamma/exponential/weibull/lognormal distributions
gmgeorg Nov 23, 2023
c431439
update dist utils
gmgeorg Nov 23, 2023
426a35f
set mean variance = true
gmgeorg Nov 23, 2023
62e03d2
fix gamma distr
gmgeorg Nov 23, 2023
9b73c10
update distribution utils
gmgeorg Nov 26, 2023
e575da3
fix pandas issue
gmgeorg Nov 26, 2023
950d147
update notebooks
gmgeorg Nov 27, 2023
d11e642
change order of scale/conc
gmgeorg Nov 29, 2023
9ea4f9f
update notebook
gmgeorg Nov 30, 2023
3ceec0d
add lambertw gaussian and lambertw weibull
gmgeorg Nov 19, 2023
86a68ef
Add file
gmgeorg Nov 19, 2023
81b70dd
add weibull file
gmgeorg Nov 19, 2023
fe2cb54
clean up gaussian
gmgeorg Nov 19, 2023
2c481f1
add gamma/exponential/weibull/lognormal distributions
gmgeorg Nov 23, 2023
8610ffa
update dist utils
gmgeorg Nov 23, 2023
cd8127b
set mean variance = true
gmgeorg Nov 23, 2023
51c359b
fix gamma distr
gmgeorg Nov 23, 2023
334220d
update distribution utils
gmgeorg Nov 26, 2023
a3d1761
fix pandas issue
gmgeorg Nov 26, 2023
ecae1ed
update notebooks
gmgeorg Nov 27, 2023
803008e
change order of scale/conc
gmgeorg Nov 29, 2023
86ff341
update notebook
gmgeorg Nov 30, 2023
2e11176
update setup.py
gmgeorg Nov 30, 2023
8642160
fix unit tests; update torchlambertw; speedup computations for lamber…
gmgeorg Dec 24, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
806 changes: 507 additions & 299 deletions docs/examples/Gamma_Regression_CaliforniaHousing.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions docs/examples/Gaussian_Regression.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1167,7 +1167,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand All @@ -1181,7 +1181,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
"version": "3.8.10"
}
},
"nbformat": 4,
Expand Down
354 changes: 296 additions & 58 deletions docs/examples/How_To_Select_A_Univariate_Distribution.ipynb

Large diffs are not rendered by default.

1,311 changes: 1,311 additions & 0 deletions docs/examples/LambertWGamma_Regression_CaliforniaHousing.ipynb

Large diffs are not rendered by default.

1,756 changes: 1,756 additions & 0 deletions docs/examples/LambertW_Gaussian_Regression.ipynb

Large diffs are not rendered by default.

54 changes: 33 additions & 21 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,51 @@
from setuptools import setup, find_packages


import re

_VERSION_FILE = "xgboostlss/_version.py"
verstrline = open(_VERSION_FILE, "rt").read()
_VERSION = r"^__version__ = ['\"]([^'\"]*)['\"]"
mo = re.search(_VERSION, verstrline, re.M)
if mo:
verstr = mo.group(1)
else:
raise RuntimeError("Unable to find version string in %s." % (_VERSION_FILE,))


setup(
name="xgboostlss",
version="0.4.0",
version=verstr,
description="XGBoostLSS - An extension of XGBoost to probabilistic modelling",
long_description=open("README.md").read(),
long_description_content_type="text/markdown",
author="Alexander März",
author="Alexander März",
author_email="[email protected]",
url="https://github.com/StatMixedML/XGBoostLSS",
license="Apache License 2.0",
packages=find_packages(exclude=["docs", "tests*"]),
include_package_data=True,
package_data={'': ['datasets/*.csv']},
package_data={"": ["datasets/*.csv"]},
zip_safe=True,
python_requires=">=3.9",
python_requires=">=3.8",
install_requires=[
"xgboost~=2.0.2",
"torch~=2.1.1",
"pyro-ppl~=1.8.6",
"optuna~=3.4.0",
"properscoring~=0.1",
"scikit-learn~=1.3.2",
"numpy~=1.26.2",
"pandas~=2.1.3",
"plotnine~=0.12.4",
"scipy~=1.11.4",
"seaborn~=0.13.0",
"tqdm~=4.66.1",
"matplotlib~=3.8.2",
"ipython~=8.18.1",
"xgboost>=1.6.1",
"torch>=2.0.1",
"pyro-ppl>=1.5.0",
"optuna>=3.0.0",
"properscoring>=0.1",
"scikit-learn>=1.0.2",
"numpy>=1.23.0",
"pandas>=2.0.3",
"plotnine>=0.10.0",
"statsmodels>=0.14.0",
"scipy>=1.0.0",
"seaborn>=0.13.0",
"torchlambertw @ git+ssh://[email protected]/gmgeorg/torchlambertw.git#egg=torchlambertw-0.0.3",
"tqdm>=4.0.0",
"matplotlib>=3.6.0",
],
extras_require={
"docs": ["mkdocs", "mkdocstrings[python]", "mkdocs-jupyter"]
},
extras_require={"docs": ["mkdocs", "mkdocstrings[python]", "mkdocs-jupyter"]},
test_suite="tests",
tests_require=["flake8", "pytest"],
)
143 changes: 97 additions & 46 deletions tests/test_distribution_utils/test_dist_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,25 @@
LogNormal,
Weibull,
Gumbel,
Laplace)
Laplace,
)
from xgboostlss.distributions.Mixture import *
from xgboostlss.distributions.SplineFlow import *
from xgboostlss.distributions.MVN import *
from xgboostlss.distributions.MVT import *
from xgboostlss.distributions.MVN_LoRa import *
from xgboostlss.distributions.distribution_utils import DistributionClass as univariate_dist_class
from xgboostlss.distributions.multivariate_distribution_utils import Multivariate_DistributionClass as multivariate_dist_class
from xgboostlss.distributions.distribution_utils import (
DistributionClass as univariate_dist_class,
)
from xgboostlss.distributions.multivariate_distribution_utils import (
Multivariate_DistributionClass as multivariate_dist_class,
)
from xgboostlss.distributions.flow_utils import NormalizingFlowClass as flow_dist_class
from xgboostlss.distributions.mixture_distribution_utils import MixtureDistributionClass as mixture_dist_class
from xgboostlss.distributions.mixture_distribution_utils import (
MixtureDistributionClass as mixture_dist_class,
)

import xgboostlss.distributions.distribution_utils as du


class TestClass(BaseTestClass):
Expand All @@ -28,38 +37,58 @@ class TestClass(BaseTestClass):
def test_univar_dist_select(self):
# Create data for testing
target = np.array([0.2, 0.4, 0.6, 0.8]).reshape(-1, 1)
candidate_distributions = [Beta, Gaussian, StudentT, Gamma, Cauchy, LogNormal, Weibull, Gumbel, Laplace]
candidate_distributions = [
Beta.Beta(),
Gaussian.Gaussian(),
StudentT.StudentT(),
Gamma.Gamma(),
Cauchy.Cauchy(),
LogNormal.LogNormal(),
Weibull.Weibull(),
Gumbel.Gumbel(),
Laplace.Laplace(),
]

# Call the function
dist_df = univariate_dist_class().dist_select(
dist_df = du.dist_select(
target, candidate_distributions, plot=False, max_iter=2
).reset_index(drop=True)

# Assertions
assert isinstance(dist_df, pd.DataFrame)
assert not dist_df.isna().any().any()
assert isinstance(dist_df["distribution"].values[0], str)
assert np.issubdtype(dist_df["nll"].dtype, np.float64)
assert not np.isnan(dist_df["nll"].values).any()
assert not np.isinf(dist_df["nll"].values).any()
assert np.issubdtype(dist_df["loss"].dtype, np.float64)
assert not np.isnan(dist_df["loss"].values).any()
assert not np.isinf(dist_df["loss"].values).any()

def test_univar_dist_select_plot(self):
# Create data for testing
target = np.array([0.2, 0.4, 0.6, 0.8]).reshape(-1, 1)
candidate_distributions = [Beta, Gaussian, StudentT, Gamma, Cauchy, LogNormal, Weibull, Gumbel, Laplace]
candidate_distributions = [
Beta.Beta(),
Gaussian.Gaussian(),
StudentT.StudentT(),
Gamma.Gamma(),
Cauchy.Cauchy(),
LogNormal.LogNormal(),
Weibull.Weibull(),
Gumbel.Gumbel(),
Laplace.Laplace(),
]

# Call the function
dist_df = univariate_dist_class().dist_select(
dist_df = du.dist_select(
target, candidate_distributions, plot=True, max_iter=2
).reset_index(drop=True)

# Assertions
assert isinstance(dist_df, pd.DataFrame)
assert not dist_df.isna().any().any()
assert isinstance(dist_df["distribution"].values[0], str)
assert np.issubdtype(dist_df["nll"].dtype, np.float64)
assert not np.isnan(dist_df["nll"].values).any()
assert not np.isinf(dist_df["nll"].values).any()
assert np.issubdtype(dist_df["loss"].dtype, np.float64)
assert not np.isnan(dist_df["loss"].values).any()
assert not np.isinf(dist_df["loss"].values).any()

####################################################################################################################
# Normalizing Flows
Expand All @@ -71,14 +100,23 @@ def test_flow_select(self):
target_support = "real"

candidate_flows = [
SplineFlow(target_support=target_support, count_bins=2, bound=bound, order="linear"),
SplineFlow(target_support=target_support, count_bins=2, bound=bound, order="quadratic")
SplineFlow(
target_support=target_support, count_bins=2, bound=bound, order="linear"
),
SplineFlow(
target_support=target_support,
count_bins=2,
bound=bound,
order="quadratic",
),
]

# Call the function
dist_df = flow_dist_class().flow_select(
target, candidate_flows, plot=False, max_iter=2
).reset_index(drop=True)
dist_df = (
flow_dist_class()
.flow_select(target, candidate_flows, plot=False, max_iter=2)
.reset_index(drop=True)
)

# Assertions
assert isinstance(dist_df, pd.DataFrame)
Expand All @@ -95,14 +133,23 @@ def test_flow_select_plot(self):
target_support = "real"

candidate_flows = [
SplineFlow(target_support=target_support, count_bins=2, bound=bound, order="linear"),
SplineFlow(target_support=target_support, count_bins=2, bound=bound, order="quadratic")
SplineFlow(
target_support=target_support, count_bins=2, bound=bound, order="linear"
),
SplineFlow(
target_support=target_support,
count_bins=2,
bound=bound,
order="quadratic",
),
]

# Call the function
dist_df = flow_dist_class().flow_select(
target, candidate_flows, plot=True, max_iter=2
).reset_index(drop=True)
dist_df = (
flow_dist_class()
.flow_select(target, candidate_flows, plot=True, max_iter=2)
.reset_index(drop=True)
)

# Assertions
assert isinstance(dist_df, pd.DataFrame)
Expand All @@ -127,13 +174,15 @@ def test_mixture_dist_select(self):
Mixture(LogNormal.LogNormal()),
Mixture(Weibull.Weibull()),
Mixture(Gumbel.Gumbel()),
Mixture(Laplace.Laplace())
Mixture(Laplace.Laplace()),
]

# Call the function
dist_df = mixture_dist_class().dist_select(
target, candidate_distributions, plot=False, max_iter=2
).reset_index(drop=True)
dist_df = (
mixture_dist_class()
.dist_select(target, candidate_distributions, plot=False, max_iter=2)
.reset_index(drop=True)
)

# Assertions
assert isinstance(dist_df, pd.DataFrame)
Expand All @@ -155,13 +204,15 @@ def test_mixture_dist_select_plot(self):
Mixture(LogNormal.LogNormal()),
Mixture(Weibull.Weibull()),
Mixture(Gumbel.Gumbel()),
Mixture(Laplace.Laplace())
Mixture(Laplace.Laplace()),
]

# Call the function
dist_df = mixture_dist_class().dist_select(
target, candidate_distributions, plot=True, max_iter=2
).reset_index(drop=True)
dist_df = (
mixture_dist_class()
.dist_select(target, candidate_distributions, plot=True, max_iter=2)
.reset_index(drop=True)
)

# Assertions
assert isinstance(dist_df, pd.DataFrame)
Expand All @@ -179,16 +230,16 @@ def test_multivar_dist_select(self):
multivar_dist_class = MVN()
target = np.arange(0.1, 0.9, 0.1)
target = multivar_dist_class.target_append(
target,
multivar_dist_class.n_targets,
multivar_dist_class.n_dist_param
)[:, :multivar_dist_class.n_targets]
target, multivar_dist_class.n_targets, multivar_dist_class.n_dist_param
)[:, : multivar_dist_class.n_targets]
candidate_distributions = [MVN(), MVT(), MVN_LoRa()]

# Call the function
dist_df = multivariate_dist_class().dist_select(
target, candidate_distributions, plot=False, max_iter=2
).reset_index(drop=True)
dist_df = (
multivariate_dist_class()
.dist_select(target, candidate_distributions, plot=False, max_iter=2)
.reset_index(drop=True)
)

# Assertions
assert isinstance(dist_df, pd.DataFrame)
Expand All @@ -203,16 +254,16 @@ def test_multivar_dist_select_plot(self):
multivar_dist_class = MVN()
target = np.arange(0.1, 0.9, 0.1)
target = multivar_dist_class.target_append(
target,
multivar_dist_class.n_targets,
multivar_dist_class.n_dist_param
)[:, :multivar_dist_class.n_targets]
target, multivar_dist_class.n_targets, multivar_dist_class.n_dist_param
)[:, : multivar_dist_class.n_targets]
candidate_distributions = [MVN(), MVT(), MVN_LoRa()]

# Call the function
dist_df = multivariate_dist_class().dist_select(
target, candidate_distributions, plot=True, ncol=1, max_iter=2
).reset_index(drop=True)
dist_df = (
multivariate_dist_class()
.dist_select(target, candidate_distributions, plot=True, ncol=1, max_iter=2)
.reset_index(drop=True)
)

# Assertions
assert isinstance(dist_df, pd.DataFrame)
Expand Down
Loading