From e3f6b0aa431d886c92d3324080dc7460950dabb7 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Sun, 24 Oct 2021 17:33:36 +0200 Subject: [PATCH 01/34] docs: specify requirements --- docs/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 docs/requirements.txt diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..0b27c37e --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +sphinx_rtd_theme +myst_parser From 0606c566c4c8a3a711de08ad5c6acf01751a99af Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Sun, 24 Oct 2021 17:34:38 +0200 Subject: [PATCH 02/34] ci: docs requirements --- .readthedocs.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index d2482943..caf5186a 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -7,4 +7,5 @@ build: python: version: 3.8 setup_py_install: true - + install: + - requirements: docs/requirements.txt From 5dd4a70261488febee4ee1f3550c81c5487fa84c Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Mon, 25 Oct 2021 17:39:52 +0200 Subject: [PATCH 03/34] ci: enable codeql --- .github/workflows/codeql-analysis.yml | 60 +++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 .github/workflows/codeql-analysis.yml diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 00000000..f8cbd885 --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,60 @@ +name: "CodeQL" + +on: + push: + branches: [ master, develop ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ master ] + schedule: + - cron: '22 11 * * 2' + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] + # Learn more: + # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v1 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v1 + + # ℹī¸ Command-line programs to run using the OS shell. 
+ # 📚 https://git.io/JvXDl + + # ✏ī¸ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language + + #- run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v1 From b187d360bc303b347826c77eb356d2d4dcc5ad38 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Tue, 26 Oct 2021 13:03:15 +0200 Subject: [PATCH 04/34] docs: changelog md syntax --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93e40737..b13d51ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -66,4 +66,4 @@ ## v0.4.0 and before -The release notes for preceding versions are available `here `_ +The release notes for preceding versions are available [here](https://github.com/ing-bank/popmon/blob/master/CHANGES.rst>). From d5caf4e0d9ac023ee40ae9669886033da4ed96a6 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Tue, 26 Oct 2021 22:13:44 +0200 Subject: [PATCH 05/34] ci: black on docs --- .pre-commit-config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 83925539..45222b48 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,3 +21,7 @@ repos: hooks: - id: pyupgrade args: ['--py36-plus','--exit-zero-even-if-changed'] +- repo: https://github.com/asottile/blacken-docs + rev: v1.0.0 + hooks: + - id: blacken-docs From fb9642c30d0316fe4196141fde416c09a83877a4 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Tue, 26 Oct 2021 20:15:04 +0000 Subject: [PATCH 06/34] ci: dependency update --- .pre-commit-config.yaml | 2 +- README.rst | 38 ++++++++++++++------- docs/source/configuration.rst | 64 +++++++++++++++++++++++------------ popmon/pipeline/metrics.py | 46 +++++++++++++++---------- popmon/pipeline/report.py | 46 +++++++++++++++---------- 5 files changed, 124 insertions(+), 72 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 45222b48..010c763a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,6 +22,6 @@ repos: - id: pyupgrade args: ['--py36-plus','--exit-zero-even-if-changed'] - repo: https://github.com/asottile/blacken-docs - rev: v1.0.0 + rev: v1.11.0 hooks: - id: blacken-docs diff --git a/README.rst b/README.rst index b74375c5..994a531b 100644 --- a/README.rst +++ b/README.rst @@ -29,7 +29,10 @@ With Spark 3.0, based on Scala 2.12, make sure to pick up the correct `histogram .. code-block:: python - spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20").getOrCreate() + spark = SparkSession.builder.config( + "spark.jars.packages", + "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20", + ).getOrCreate() For Spark 2.X compiled against scala 2.11, in the string above simply replace 2.12 with 2.11. 
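For instance, with Scala 2.11 the same call becomes:

.. code-block:: python

    spark = SparkSession.builder.config(
        "spark.jars.packages",
        "io.github.histogrammar:histogrammar_2.11:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.11:1.0.20",
    ).getOrCreate()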
@@ -101,12 +104,12 @@ As a quick example, you can do: from popmon import resources # open synthetic data - df = pd.read_csv(resources.data('test.csv.gz'), parse_dates=['date']) + df = pd.read_csv(resources.data("test.csv.gz"), parse_dates=["date"]) df.head() # generate stability report using automatic binning of all encountered features # (importing popmon automatically adds this functionality to a dataframe) - report = df.pm_stability_report(time_axis='date', features=['date:age', 'date:gender']) + report = df.pm_stability_report(time_axis="date", features=["date:age", "date:gender"]) # to show the output of the report in a Jupyter notebook you can simply run: report @@ -119,23 +122,32 @@ To specify your own binning specifications and features you want to report on, y .. code-block:: python # time-axis specifications alone; all other features are auto-binned. - report = df.pm_stability_report(time_axis='date', time_width='1w', time_offset='2020-1-6') + report = df.pm_stability_report( + time_axis="date", time_width="1w", time_offset="2020-1-6" + ) # histogram selections. Here 'date' is the first axis of each histogram. - features=[ - 'date:isActive', 'date:age', 'date:eyeColor', 'date:gender', - 'date:latitude', 'date:longitude', 'date:isActive:age' + features = [ + "date:isActive", + "date:age", + "date:eyeColor", + "date:gender", + "date:latitude", + "date:longitude", + "date:isActive:age", ] # Specify your own binning specifications for individual features or combinations thereof. # This bin specification uses open-ended ("sparse") histograms; unspecified features get # auto-binned. The time-axis binning, when specified here, needs to be in nanoseconds. - bin_specs={ - 'longitude': {'bin_width': 5.0, 'bin_offset': 0.0}, - 'latitude': {'bin_width': 5.0, 'bin_offset': 0.0}, - 'age': {'bin_width': 10.0, 'bin_offset': 0.0}, - 'date': {'bin_width': pd.Timedelta('4w').value, - 'bin_offset': pd.Timestamp('2015-1-1').value} + bin_specs = { + "longitude": {"bin_width": 5.0, "bin_offset": 0.0}, + "latitude": {"bin_width": 5.0, "bin_offset": 0.0}, + "age": {"bin_width": 10.0, "bin_offset": 0.0}, + "date": { + "bin_width": pd.Timedelta("4w").value, + "bin_offset": pd.Timestamp("2015-1-1").value, + }, } # generate stability report diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 5ec9dc04..4ed8342b 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -55,7 +55,9 @@ To specify the time-axis binning alone, do: .. code-block:: python - report = df.pm_stability_report(time_axis='date', time_width='1w', time_offset='2020-1-6') + report = df.pm_stability_report( + time_axis="date", time_width="1w", time_offset="2020-1-6" + ) The ``time_axis`` argument should be the name of a column that is of type **numeric (e.g. batch id, time in ns) or date(time)**. The default time width is 30 days ('30d'), with time offset 2010-1-4 (a Monday). @@ -72,11 +74,15 @@ An example bin_specs dictionary is: .. 
code-block:: python - bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0}, - 'y': {'num': 10, 'low': 0.0, 'high': 2.0}, - 'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}], - 'date': {'bin_width': pd.Timedelta('4w').value, - 'bin_offset': pd.Timestamp('2015-1-1').value}} + bin_specs = { + "x": {"bin_width": 1, "bin_offset": 0}, + "y": {"num": 10, "low": 0.0, "high": 2.0}, + "x:y": [{}, {"num": 5, "low": 0.0, "high": 1.0}], + "date": { + "bin_width": pd.Timedelta("4w").value, + "bin_offset": pd.Timestamp("2015-1-1").value, + }, + } In the bin specs for 'x:y', 'x' is not provided (here) and reverts to the 1-dim setting. Any time-axis, when specified here ('date'), needs to be specified in nanoseconds. This takes precedence over @@ -112,9 +118,11 @@ When not provided, the default setting is: .. code-block:: python - monitoring_rules = {"*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0]} + monitoring_rules = { + "*_pull": [7, 4, -4, -7], + "*_zscore": [7, 4, -4, -7], + "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], + } Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". Same for ``"*_zscore"``. @@ -132,11 +140,13 @@ feature name in front. This also works for a combinations of two features. E.g. .. code-block:: python - monitoring_rules = {"featureA:*_pull": [5, 3, -3, -5], - "featureA:featureB:*_pull": [6, 3, -3, -6], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0]} + monitoring_rules = { + "featureA:*_pull": [5, 3, -3, -5], + "featureA:featureB:*_pull": [6, 3, -3, -6], + "featureA:nan": [4, 1, 0, 0], + "*_pull": [7, 4, -4, -7], + "nan": [8, 1, 0, 0], + } In the case where multiple rules could apply for a feature's statistic, the most specific one gets applied. So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule @@ -204,13 +214,16 @@ Spark usage from pyspark.sql import SparkSession # downloads histogrammar jar files if not already installed, used for histogramming of spark dataframe - spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20").getOrCreate() + spark = SparkSession.builder.config( + "spark.jars.packages", + "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20", + ).getOrCreate() # load a dataframe - spark_df = spark.read.format('csv').options(header='true').load('file.csv') + spark_df = spark.read.format("csv").options(header="true").load("file.csv") # generate the report - report = spark_df.pm_stability_report(time_axis='timestamp') + report = spark_df.pm_stability_report(time_axis="timestamp") Spark example on Google Colab @@ -231,16 +244,23 @@ Now that spark is installed, restart the runtime. .. 
code-block:: python import os + os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7" import findspark + findspark.init() from pyspark.sql import SparkSession - spark = SparkSession.builder.master("local[*]") \ - .config("spark.jars", "/content/jars/histogrammar_2.12-1.0.20.jar,/content/jars/histogrammar-sparksql_2.12-1.0.20.jar") \ - .config("spark.sql.execution.arrow.enabled", "false") \ - .config("spark.sql.session.timeZone", "GMT") \ - .getOrCreate() + spark = ( + SparkSession.builder.master("local[*]") + .config( + "spark.jars", + "/content/jars/histogrammar_2.12-1.0.20.jar,/content/jars/histogrammar-sparksql_2.12-1.0.20.jar", + ) + .config("spark.sql.execution.arrow.enabled", "false") + .config("spark.sql.session.timeZone", "GMT") + .getOrCreate() + ) diff --git a/popmon/pipeline/metrics.py b/popmon/pipeline/metrics.py index aa79604b..47135dab 100644 --- a/popmon/pipeline/metrics.py +++ b/popmon/pipeline/metrics.py @@ -73,9 +73,11 @@ def stability_metrics( .. code-block:: python - monitoring_rules = {"*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0]} + monitoring_rules = { + "*_pull": [7, 4, -4, -7], + "*_zscore": [7, 4, -4, -7], + "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], + } Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". @@ -84,10 +86,12 @@ def stability_metrics( .. code-block:: python - monitoring_rules = {"featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0]} + monitoring_rules = { + "featureA:*_pull": [5, 3, -3, -5], + "featureA:nan": [4, 1, 0, 0], + "*_pull": [7, 4, -4, -7], + "nan": [8, 1, 0, 0], + } In case of multiple rules could apply for a feature's statistic, the most specific one applies. So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule @@ -182,7 +186,7 @@ def df_stability_metrics( .. code-block:: python - features = ['x', 'date', 'date:x', 'date:y', 'date:x:y'] + features = ["x", "date", "date:x", "date:y", "date:x:y"] :param str binning: default binning to revert to in case bin_specs not supplied. options are: "unit" or "auto", default is "auto". When using "auto", semi-clever binning is automatically done. @@ -191,9 +195,11 @@ def df_stability_metrics( .. code-block:: python - bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0}, - 'y': {'num': 10, 'low': 0.0, 'high': 2.0}, - 'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}]} + bin_specs = { + "x": {"bin_width": 1, "bin_offset": 0}, + "y": {"num": 10, "low": 0.0, "high": 2.0}, + "x:y": [{}, {"num": 5, "low": 0.0, "high": 1.0}], + } In the bin specs for x:y, x is not provided (here) and reverts to the 1-dim setting. The 'bin_width', 'bin_offset' notation makes an open-ended histogram (for that feature) with given bin width @@ -224,9 +230,11 @@ def df_stability_metrics( .. code-block:: python - monitoring_rules = {"*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0]} + monitoring_rules = { + "*_pull": [7, 4, -4, -7], + "*_zscore": [7, 4, -4, -7], + "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], + } Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". 
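These (filename based) wildcards follow Python's glob semantics; popmon matches them with the standard-library ``fnmatch`` module (see ``compute_tl_bounds.py``). A minimal sketch of how the default patterns above behave:

.. code-block:: python

    from fnmatch import fnmatch

    # "*_pull" matches any statistic name ending on "_pull"
    assert fnmatch("mean_pull", "*_pull")
    assert not fnmatch("mean_zscore", "*_pull")

    # in "[!p]*_unknown_labels", "[!p]" requires the first character to not be "p"
    assert fnmatch("fraction_unknown_labels", "[!p]*_unknown_labels")
    assert not fnmatch("phik_unknown_labels", "[!p]*_unknown_labels")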
@@ -235,10 +243,12 @@ def df_stability_metrics( .. code-block:: python - monitoring_rules = {"featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0]} + monitoring_rules = { + "featureA:*_pull": [5, 3, -3, -5], + "featureA:nan": [4, 1, 0, 0], + "*_pull": [7, 4, -4, -7], + "nan": [8, 1, 0, 0], + } In case of multiple rules could apply for a feature's statistic, the most specific one applies. So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule diff --git a/popmon/pipeline/report.py b/popmon/pipeline/report.py index f78aa572..9329924b 100644 --- a/popmon/pipeline/report.py +++ b/popmon/pipeline/report.py @@ -83,9 +83,11 @@ def stability_report( .. code-block:: python - monitoring_rules = {"*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0]} + monitoring_rules = { + "*_pull": [7, 4, -4, -7], + "*_zscore": [7, 4, -4, -7], + "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], + } Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". @@ -94,10 +96,12 @@ def stability_report( .. code-block:: python - monitoring_rules = {"featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0]} + monitoring_rules = { + "featureA:*_pull": [5, 3, -3, -5], + "featureA:nan": [4, 1, 0, 0], + "*_pull": [7, 4, -4, -7], + "nan": [8, 1, 0, 0], + } In case of multiple rules could apply for a feature's statistic, the most specific one applies. So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule @@ -212,7 +216,7 @@ def df_stability_report( .. code-block:: python - features = ['x', 'date', 'date:x', 'date:y', 'date:x:y'] + features = ["x", "date", "date:x", "date:y", "date:x:y"] :param str binning: default binning to revert to in case bin_specs not supplied. options are: "unit" or "auto", default is "auto". When using "auto", semi-clever binning is automatically done. @@ -221,9 +225,11 @@ def df_stability_report( .. code-block:: python - bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0}, - 'y': {'num': 10, 'low': 0.0, 'high': 2.0}, - 'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}]} + bin_specs = { + "x": {"bin_width": 1, "bin_offset": 0}, + "y": {"num": 10, "low": 0.0, "high": 2.0}, + "x:y": [{}, {"num": 5, "low": 0.0, "high": 1.0}], + } In the bin specs for x:y, x is not provided (here) and reverts to the 1-dim setting. The 'bin_width', 'bin_offset' notation makes an open-ended histogram (for that feature) with given bin width @@ -254,9 +260,11 @@ def df_stability_report( .. code-block:: python - monitoring_rules = {"*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0]} + monitoring_rules = { + "*_pull": [7, 4, -4, -7], + "*_zscore": [7, 4, -4, -7], + "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], + } Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". @@ -265,10 +273,12 @@ def df_stability_report( .. 
code-block:: python - monitoring_rules = {"featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0]} + monitoring_rules = { + "featureA:*_pull": [5, 3, -3, -5], + "featureA:nan": [4, 1, 0, 0], + "*_pull": [7, 4, -4, -7], + "nan": [8, 1, 0, 0], + } In case of multiple rules could apply for a feature's statistic, the most specific one applies. So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule From 29dc36466847fd1d7d72e02906e0d1f5586b0cf3 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Mon, 1 Nov 2021 17:51:30 +0000 Subject: [PATCH 07/34] ci: upgrading packages --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 010c763a..d77e65fc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 21.9b0 + rev: 21.10b0 hooks: - id: black - repo: https://github.com/pycqa/isort From 721572e69ca7cf69758f4e52761b782ddbec28fe Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Wed, 3 Nov 2021 17:51:56 +0000 Subject: [PATCH 08/34] ci: update dependencies --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d77e65fc..7245ef66 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: black - repo: https://github.com/pycqa/isort - rev: 5.9.3 + rev: 5.10.0 hooks: - id: isort files: '.*' From 9494f17bda5ee407e0d0e966fe9b8e50c003ecc5 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Tue, 9 Nov 2021 17:51:17 +0000 Subject: [PATCH 09/34] ci: dependency update --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7245ef66..edb82db5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: black - repo: https://github.com/pycqa/isort - rev: 5.10.0 + rev: 5.10.1 hooks: - id: isort files: '.*' From add2936c52b44262b8ec7e3489ee9c7cce7e527c Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Tue, 26 Oct 2021 12:36:54 +0200 Subject: [PATCH 10/34] refactor: move parallel processing to utils centralize configuration less duplicate code --- popmon/analysis/apply_func.py | 43 ++++++++++--------- popmon/config.py | 5 ++- popmon/utils.py | 21 +++++++++ .../visualization/alert_section_generator.py | 14 +++--- popmon/visualization/histogram_section.py | 14 ++---- popmon/visualization/section_generator.py | 15 +++---- .../traffic_light_section_generator.py | 14 +++--- 7 files changed, 68 insertions(+), 58 deletions(-) diff --git a/popmon/analysis/apply_func.py b/popmon/analysis/apply_func.py index 7c9e62da..0ed7ae4e 100644 --- a/popmon/analysis/apply_func.py +++ b/popmon/analysis/apply_func.py @@ -18,15 +18,13 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
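The pattern this commit applies across the modules below: per-module joblib boilerplate is replaced by a single shared helper in ``popmon.utils``. Schematically, with ``square`` as an illustrative stand-in for the real worker functions:

.. code-block:: python

    import multiprocessing

    from joblib import Parallel, delayed

    from popmon.utils import parallel

    def square(x):  # stand-in for e.g. _plot_metric or apply_func_array
        return x * x

    args_list = [(1,), (2,), (3,)]

    # before: every module sized and created its own worker pool
    num_cores = multiprocessing.cpu_count()
    out_old = Parallel(n_jobs=num_cores)(delayed(square)(*args) for args in args_list)

    # after: one shared entry point
    out_new = parallel(square, args_list)

    assert out_old == out_new == [1, 4, 9]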
-import collections -import multiprocessing import warnings import numpy as np import pandas as pd -from joblib import Parallel, delayed from ..base import Module +from ..utils import parallel class ApplyFunc(Module): @@ -157,27 +155,32 @@ def transform(self, datastore): features = self.get_features(apply_to_data.keys()) - num_cores = multiprocessing.cpu_count() same_key = self.assign_to_key == self.apply_to_key - res = Parallel(n_jobs=num_cores)( - delayed(apply_func_array)( - feature=feature, - metrics=self.metrics, - apply_to_df=self.get_datastore_object( + args = [ + { + "feature": feature, + "metrics": self.metrics, + "apply_to_df": self.get_datastore_object( apply_to_data, feature, dtype=pd.DataFrame ), - assign_to_df=None - if same_key - else self.get_datastore_object( - assign_to_data, feature, dtype=pd.DataFrame, default=pd.DataFrame() + "assign_to_df": ( + None + if same_key + else self.get_datastore_object( + assign_to_data, + feature, + dtype=pd.DataFrame, + default=pd.DataFrame(), + ) ), - apply_funcs=self.apply_funcs, - same_key=same_key, - ) + "apply_funcs": self.apply_funcs, + "same_key": same_key, + } for feature in features - ) - new_metrics = {r[0]: r[1] for r in res} + ] + result = parallel(apply_func_array, args, mode="kwargs") + new_metrics = dict(result) # storage datastore[self.store_key] = new_metrics @@ -189,7 +192,7 @@ def apply_func_array( ): """Apply list of functions to dataframe - Split off for parallellization reasons + Split off for parallelization reasons :param str feature: feature currently looping over :param list metrics: list of selected metrics to apply functions to @@ -197,7 +200,7 @@ def apply_func_array( :param assign_to_df: pandas data frame the output of function is assigned to :param apply_funcs: list of functions to apply to :param same_key: if True, merge apply_to_df and assign_to_df before returning assign_to_df - :return: untion of feature and assign_to_df + :return: union of feature and assign_to_df """ if not isinstance(apply_to_df, pd.DataFrame): raise TypeError( diff --git a/popmon/config.py b/popmon/config.py index b25b4200..cc7f6a48 100644 --- a/popmon/config.py +++ b/popmon/config.py @@ -17,7 +17,7 @@ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- +import multiprocessing from fnmatch import fnmatch profiles = { @@ -130,3 +130,6 @@ def get_stat_description(name: str): return f"{int(name[1:])}% percentile" return "" + + +num_jobs = multiprocessing.cpu_count() diff --git a/popmon/utils.py b/popmon/utils.py index 35bbbeab..04fe848c 100644 --- a/popmon/utils.py +++ b/popmon/utils.py @@ -21,6 +21,10 @@ from textwrap import shorten from typing import Iterable, Optional +from joblib import Parallel, delayed + +from popmon.config import num_jobs + def short_date(date: str): return shorten(date, width=22, placeholder="") @@ -37,3 +41,20 @@ def filter_metrics(metrics, ignore_stat_endswith, show_stats: Optional[Iterable] if any(fnmatch.fnmatch(m, pattern) for pattern in show_stats) ] return metrics + + +def parallel(func, args_list, mode="args"): + """ + Routine for parallel processing + """ + + if num_jobs == 1: + results = [ + func(*args) if mode == "args" else func(**args) for args in args_list + ] + else: + results = Parallel(n_jobs=num_jobs)( + delayed(func)(*args) if mode == "args" else delayed(func)(**args) + for args in args_list + ) + return results diff --git a/popmon/visualization/alert_section_generator.py b/popmon/visualization/alert_section_generator.py index 5d2daac6..23cc5a58 100644 --- a/popmon/visualization/alert_section_generator.py +++ b/popmon/visualization/alert_section_generator.py @@ -18,16 +18,13 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import multiprocessing - import numpy as np import pandas as pd -from joblib import Parallel, delayed from tqdm import tqdm from ..base import Module from ..config import get_stat_description -from ..utils import filter_metrics, short_date +from ..utils import filter_metrics, parallel, short_date from ..visualization.utils import _prune, plot_bars_b64 from .traffic_light_section_generator import _plot_metrics @@ -109,8 +106,6 @@ def transform(self, datastore): features = self.get_features(data_obj.keys()) features_w_metrics = [] - num_cores = multiprocessing.cpu_count() - self.logger.info( f'Generating section "{self.section_name}". skip empty plots: {self.skip_empty_plots}' ) @@ -149,8 +144,8 @@ def transform(self, datastore): ) ) if self.plot_metrics: - plots += Parallel(n_jobs=num_cores)( - delayed(_plot_metric)( + args = [ + ( feature, metric, dates, @@ -165,7 +160,8 @@ def transform(self, datastore): self.skip_empty_plots, ) for metric in metrics - ) + ] + plots += parallel(_plot_metric, args) # filter out potential empty plots (from skip empty plots) if self.skip_empty_plots: plots = [e for e in plots if len(e["plot"])] diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py index 2ec5d348..3be3dc38 100644 --- a/popmon/visualization/histogram_section.py +++ b/popmon/visualization/histogram_section.py @@ -18,11 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
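Call sites pass the new helper either positional tuples (the default ``mode="args"``, as in the section generators below) or keyword dicts (``mode="kwargs"``, as in ``ApplyFunc``); it falls back to a serial loop when only one job is available. A minimal sketch, where ``add`` is purely illustrative:

.. code-block:: python

    from popmon.utils import parallel

    def add(a, b):  # illustrative stand-in for the real worker functions
        return a + b

    assert parallel(add, [(1, 2), (3, 4)]) == [3, 7]
    assert parallel(add, [{"a": 5, "b": 6}], mode="kwargs") == [11]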
-import multiprocessing - import pandas as pd from histogrammar.util import get_hist_props -from joblib import Parallel, delayed from tqdm import tqdm from ..analysis.hist_numpy import ( @@ -32,7 +29,7 @@ ) from ..base import Module from ..config import get_stat_description -from ..utils import short_date +from ..utils import parallel, short_date from ..visualization.utils import plot_overlay_1d_histogram_b64 @@ -80,8 +77,6 @@ def transform(self, datastore): features = self.get_features(data_obj.keys()) features_w_metrics = [] - num_cores = multiprocessing.cpu_count() - self.logger.info(f'Generating section "{self.section_name}".') for feature in tqdm(features, ncols=100): @@ -106,10 +101,9 @@ def transform(self, datastore): df[hist_names].iloc[-i].values for i in reversed(range(1, last_n + 1)) ] - plots = Parallel(n_jobs=num_cores)( - delayed(_plot_histograms)(feature, dates[i], hists[i], hist_names) - for i in range(last_n) - ) + args = [(feature, dates[i], hists[i], hist_names) for i in range(last_n)] + plots = parallel(_plot_histograms, args) + # filter out potential empty plots plots = [e for e in plots if len(e["plot"])] features_w_metrics.append( diff --git a/popmon/visualization/section_generator.py b/popmon/visualization/section_generator.py index dd7ba4b2..3f94bca3 100644 --- a/popmon/visualization/section_generator.py +++ b/popmon/visualization/section_generator.py @@ -18,16 +18,13 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import multiprocessing - import numpy as np import pandas as pd -from joblib import Parallel, delayed from tqdm import tqdm from ..base import Module from ..config import get_stat_description -from ..utils import filter_metrics, short_date +from ..utils import filter_metrics, parallel, short_date from ..visualization.utils import _prune, plot_bars_b64 @@ -106,8 +103,6 @@ def transform(self, datastore): features = self.get_features(data_obj.keys()) features_w_metrics = [] - num_cores = multiprocessing.cpu_count() - self.logger.info( f'Generating section "{self.section_name}". skip empty plots: {self.skip_empty_plots}' ) @@ -130,8 +125,8 @@ def transform(self, datastore): df.columns, self.ignore_stat_endswith, self.show_stats ) - plots = Parallel(n_jobs=num_cores)( - delayed(_plot_metric)( + args = [ + ( feature, metric, dates, @@ -146,7 +141,9 @@ def transform(self, datastore): self.skip_empty_plots, ) for metric in metrics - ) + ] + plots = parallel(_plot_metric, args) + # filter out potential empty plots (from skip empty plots) if self.skip_empty_plots: plots = [e for e in plots if len(e["plot"])] diff --git a/popmon/visualization/traffic_light_section_generator.py b/popmon/visualization/traffic_light_section_generator.py index 503fd027..ca5ce1d7 100644 --- a/popmon/visualization/traffic_light_section_generator.py +++ b/popmon/visualization/traffic_light_section_generator.py @@ -18,16 +18,13 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-import multiprocessing - import numpy as np import pandas as pd -from joblib import Parallel, delayed from tqdm import tqdm from ..base import Module from ..config import get_stat_description -from ..utils import filter_metrics, short_date +from ..utils import filter_metrics, parallel, short_date from ..visualization.utils import ( _prune, plot_traffic_lights_alerts_b64, @@ -114,8 +111,6 @@ def transform(self, datastore): features = self.get_features(data_obj.keys()) features_w_metrics = [] - num_cores = multiprocessing.cpu_count() - self.logger.info( f'Generating section "{self.section_name}". skip empty plots: {self.skip_empty_plots}' ) @@ -154,8 +149,8 @@ def transform(self, datastore): ) if self.plot_metrics: - plots += Parallel(n_jobs=num_cores)( - delayed(_plot_metric)( + args = [ + ( metric, dates, df[metric], @@ -165,7 +160,8 @@ def transform(self, datastore): self.skip_empty_plots, ) for metric in metrics - ) + ] + plots += parallel(_plot_metric, args) # filter out potential empty plots (from skip empty plots) if self.skip_empty_plots: From 4076d5eac45127b427111ac31b4d60963dc9a9ba Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Tue, 16 Nov 2021 17:51:27 +0000 Subject: [PATCH 11/34] ci: update dependencies --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index edb82db5..a7540272 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: - flake8-comprehensions args: [ "--select=E9,F63,F7,F82,C4"] - repo: https://github.com/asottile/pyupgrade - rev: v2.29.0 + rev: v2.29.1 hooks: - id: pyupgrade args: ['--py36-plus','--exit-zero-even-if-changed'] From 10a3f0d8221863f2036ba3cc53686b03bc07a983 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Wed, 17 Nov 2021 17:51:43 +0000 Subject: [PATCH 12/34] ci: upgrading packages --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a7540272..634044a9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 21.10b0 + rev: 21.11b0 hooks: - id: black - repo: https://github.com/pycqa/isort From 9cd7744458a462c40539dad8094f700071c237da Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Thu, 18 Nov 2021 17:51:38 +0000 Subject: [PATCH 13/34] ci: upgrading packages --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 634044a9..9fd4be2f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 21.11b0 + rev: 21.11b1 hooks: - id: black - repo: https://github.com/pycqa/isort From 8977b1e4ea7305b1858fb4dbfe28bfb27c78d30e Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Fri, 19 Nov 2021 17:51:35 +0000 Subject: [PATCH 14/34] ci: upgrading packages --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9fd4be2f..2918d023 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,6 +22,6 @@ repos: - id: pyupgrade args: ['--py36-plus','--exit-zero-even-if-changed'] - repo: https://github.com/asottile/blacken-docs - rev: v1.11.0 + rev: v1.12.0 hooks: - id: blacken-docs From 2534cea19f5cc4ec2a99f387be77ab4f25a61ebc Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Thu, 14 Oct 2021 
01:22:50 +0200 Subject: [PATCH 15/34] refactor: pipeline transformation structure --- popmon/alerting/alerts_summary.py | 22 ++--- popmon/alerting/compute_tl_bounds.py | 68 ++++++------- popmon/analysis/apply_func.py | 45 +++++---- popmon/analysis/comparison/hist_comparer.py | 25 +++-- popmon/analysis/functions.py | 2 +- popmon/analysis/merge_statistics.py | 17 ++-- popmon/analysis/profiling/hist_profiler.py | 16 ++- popmon/analysis/profiling/pull_calculator.py | 13 ++- popmon/base/module.py | 98 ++++++++++++++++--- popmon/base/pipeline.py | 79 ++++++++++++++- popmon/hist/hist_splitter.py | 26 ++--- popmon/io/file_reader.py | 15 +-- popmon/io/file_writer.py | 23 ++--- popmon/io/json_reader.py | 4 +- popmon/pipeline/metrics_pipelines.py | 8 +- popmon/pipeline/report.py | 4 +- popmon/pipeline/report_pipelines.py | 2 + popmon/stitching/hist_stitcher.py | 26 ++--- .../visualization/alert_section_generator.py | 60 +++++++----- popmon/visualization/histogram_section.py | 35 ++++--- popmon/visualization/report_generator.py | 10 +- popmon/visualization/section_generator.py | 48 ++++----- .../traffic_light_section_generator.py | 46 +++++---- .../popmon/alerting/test_compute_tl_bounds.py | 5 +- .../analysis/profiling/test_apply_func.py | 2 +- .../popmon/analysis/test_merge_statistics.py | 2 +- tests/popmon/base/test_pipeline.py | 51 +++++----- tests/popmon/io/test_file_reader.py | 2 +- tests/popmon/io/test_file_writer.py | 8 +- tests/popmon/io/test_json_reader.py | 2 +- tools/pipeline_viz.py | 97 ++++++++++++++++++ 31 files changed, 572 insertions(+), 289 deletions(-) create mode 100644 tools/pipeline_viz.py diff --git a/popmon/alerting/alerts_summary.py b/popmon/alerting/alerts_summary.py index 88a35343..870abcce 100644 --- a/popmon/alerting/alerts_summary.py +++ b/popmon/alerting/alerts_summary.py @@ -19,6 +19,7 @@ import fnmatch +from typing import Optional import numpy as np import pandas as pd @@ -31,6 +32,8 @@ class AlertsSummary(Module): It combines the alerts-summaries of all individual features into an artificial feature "_AGGREGATE_". """ + _input_keys = ("read_key", ) + _output_keys = ("store_key", ) def __init__( self, @@ -50,21 +53,16 @@ def __init__( """ super().__init__() self.read_key = read_key - self.store_key = store_key - if not self.store_key: - self.store_key = self.read_key + self.store_key = store_key or self.read_key self.features = features or [] self.ignore_features = ignore_features or [] self.combined_variable = combined_variable - def transform(self, datastore): - # fetch and check input data - data = self.get_datastore_object(datastore, self.read_key, dtype=dict) - + def transform(self, data: dict) -> Optional[dict]: # determine all possible features, used for the comparison below - features = self.get_features(data.keys()) + features = self.get_features(list(data.keys())) if len(features) == 0: - return datastore + return None self.logger.info( f'Combining alerts into artificial variable "{self.combined_variable}"' @@ -88,7 +86,7 @@ def transform(self, datastore): self.logger.warning( "indices of features are different. no alerts summary generated." ) - return datastore + return None # STEP 2: Concatenate the dataframes, there was one for each original feature. 
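        # the per-feature indices were just verified to be identical, so axis=1 aligns rows one-to-one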
tlv = pd.concat(df_list, axis=1) @@ -104,6 +102,4 @@ def transform(self, datastore): # store combination of traffic alerts data[self.combined_variable] = dfc - datastore[self.store_key] = data - - return datastore + return data diff --git a/popmon/alerting/compute_tl_bounds.py b/popmon/alerting/compute_tl_bounds.py index ef7fb5bb..e8c77480 100644 --- a/popmon/alerting/compute_tl_bounds.py +++ b/popmon/alerting/compute_tl_bounds.py @@ -18,11 +18,10 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import collections import copy import fnmatch -import uuid from collections import defaultdict +from typing import Tuple, Any import numpy as np import pandas as pd @@ -117,6 +116,8 @@ class ComputeTLBounds(Module): meant to be generic. Then bounds can be stored as either raw values or as directly calculated values on the statistics of the data. """ + _input_keys = ("read_key", ) + _output_keys = ("store_key", "apply_funcs_key") def __init__( self, @@ -133,7 +134,7 @@ def __init__( entire=False, **kwargs, ): - """Initialize an instance of TafficLightBounds module. + """Initialize an instance of TrafficLightBounds module. :param str read_key: key of input data to read from datastore :param str store_key: key of output data to store in datastore (optional) @@ -152,12 +153,13 @@ def __init__( super().__init__() self.read_key = read_key self.store_key = store_key + self.apply_funcs_key = apply_funcs_key + self.monitoring_rules = monitoring_rules or {} self.features = features or [] self.ignore_features = ignore_features or [] self.traffic_lights = {} self.traffic_light_funcs = [] - self.apply_funcs_key = apply_funcs_key self.traffic_light_func = func if func is not None else traffic_light self.metrics_wide = metrics_wide self.prefix = prefix @@ -165,10 +167,12 @@ def __init__( self.entire = entire self.kwargs = copy.copy(kwargs) - # check inputs - if not isinstance(self.traffic_light_func, collections.Callable): + if not callable(self.traffic_light_func): raise TypeError("supplied function must be callable object") + def get_description(self): + return self.traffic_light_func.__name__ + def _set_traffic_lights(self, feature, cols, pattern, rule_name): process_cols = fnmatch.filter(cols, pattern) @@ -195,12 +199,9 @@ def _set_traffic_lights(self, feature, cols, pattern, rule_name): } ) - def transform(self, datastore): - # fetch and check input data - test_data = self.get_datastore_object(datastore, self.read_key, dtype=dict) - + def transform(self, test_data: dict) -> Tuple[Any, Any]: # determine all possible features, used for the comparison below - features = self.get_features(test_data.keys()) + features = self.get_features(list(test_data.keys())) pkeys, nkeys = collect_traffic_light_bounds(self.monitoring_rules) @@ -212,7 +213,9 @@ def transform(self, datastore): # --- 1. 
tl bounds explicitly defined for a particular feature if feature in pkeys: explicit_cols = [ - pcol for pcol in pkeys[feature] if pcol in test_df.columns + pcol + for pcol in pkeys[feature] + if pcol in test_df.columns ] implicit_cols = set(pkeys[feature]) - set(explicit_cols) @@ -237,13 +240,7 @@ def transform(self, datastore): feature, test_df.columns, pattern, rule_name="pattern" ) - # storage - if self.store_key: - datastore[self.store_key] = self.traffic_lights - if self.apply_funcs_key: - datastore[self.apply_funcs_key] = self.traffic_light_funcs - - return datastore + return self.traffic_lights, self.traffic_light_funcs def pull_bounds( @@ -338,7 +335,12 @@ class DynamicBounds(Pipeline): """Calculate dynamic traffic light bounds based on pull thresholds and dynamic mean and std.deviation.""" def __init__( - self, read_key, rules, store_key="", suffix_mean="_mean", suffix_std="_std" + self, + read_key, + rules, + store_key="", + suffix_mean="_mean", + suffix_std="_std", ): """Initialize an instance of DynamicTrafficLightBounds. @@ -348,10 +350,8 @@ def __init__( :param str suffix_mean: suffix of mean. mean column = metric + suffix_mean :param str suffix_std: suffix of std. std column = metric + suffix_std """ - super().__init__(modules=[]) self.read_key = read_key - - apply_funcs_key = str(uuid.uuid4()) + apply_funcs_key = f"{read_key}__{store_key}" expand_bounds = ComputeTLBounds( read_key=read_key, @@ -368,8 +368,7 @@ def __init__( assign_to_key=store_key, apply_funcs_key=apply_funcs_key, ) - - self.modules = [expand_bounds, calc_bounds] + super().__init__(modules=[expand_bounds, calc_bounds]) def transform(self, datastore): self.logger.info(f'Calculating dynamic bounds for "{self.read_key}"') @@ -380,7 +379,12 @@ class StaticBounds(Pipeline): """Calculate static traffic light bounds based on pull thresholds and static mean and std.deviation.""" def __init__( - self, read_key, rules, store_key="", suffix_mean="_mean", suffix_std="_std" + self, + read_key, + rules, + store_key="", + suffix_mean="_mean", + suffix_std="_std", ): """Initialize an instance of StaticBounds. @@ -390,10 +394,8 @@ def __init__( :param str suffix_mean: suffix of mean. mean column = metric + suffix_mean :param str suffix_std: suffix of std. std column = metric + suffix_std """ - super().__init__(modules=[]) self.read_key = read_key - - apply_funcs_key = str(uuid.uuid4()) + apply_funcs_key = f"{read_key}__{store_key}" expand_bounds = ComputeTLBounds( read_key=read_key, @@ -411,7 +413,7 @@ def __init__( apply_funcs_key=apply_funcs_key, ) - self.modules = [expand_bounds, calc_bounds] + super().__init__(modules=[expand_bounds, calc_bounds]) def transform(self, datastore): self.logger.info(f'Calculating static bounds for "{self.read_key}"') @@ -437,10 +439,8 @@ def __init__(self, read_key, store_key, rules, expanded_rules_key=""): :param str expanded_rules_key: store key of expanded monitoring rules to store in data store, eg. these can be used for plotting. 
(optional) """ - super().__init__(modules=[]) self.read_key = read_key - - apply_funcs_key = str(uuid.uuid4()) + apply_funcs_key = f"{read_key}__{store_key}" # generate static traffic light bounds by expanding the wildcarded monitoring rules expand_bounds = ComputeTLBounds( @@ -457,7 +457,7 @@ def __init__(self, read_key, store_key, rules, expanded_rules_key=""): apply_funcs_key=apply_funcs_key, ) - self.modules = [expand_bounds, apply_bounds] + super().__init__(modules=[expand_bounds, apply_bounds]) def transform(self, datastore): self.logger.info(f'Calculating traffic light alerts for "{self.read_key}"') diff --git a/popmon/analysis/apply_func.py b/popmon/analysis/apply_func.py index 0ed7ae4e..617fc669 100644 --- a/popmon/analysis/apply_func.py +++ b/popmon/analysis/apply_func.py @@ -19,6 +19,7 @@ import warnings +from typing import Optional import numpy as np import pandas as pd @@ -32,6 +33,8 @@ class ApplyFunc(Module): Extra parameters (kwargs) can be passed to the apply function. """ + _input_keys = ("apply_to_key", "assign_to_key", "apply_funcs_key") + _output_keys = ("store_key", ) def __init__( self, @@ -67,9 +70,10 @@ def __init__( """ super().__init__() self.apply_to_key = apply_to_key - self.assign_to_key = self.apply_to_key if not assign_to_key else assign_to_key - self.store_key = self.assign_to_key if not store_key else store_key + self.assign_to_key = assign_to_key or apply_to_key self.apply_funcs_key = apply_funcs_key + self.store_key = store_key or self.assign_to_key + self.features = features or [] self.metrics = metrics or [] self.msg = msg @@ -79,6 +83,14 @@ def __init__( for af in apply_funcs: self.add_apply_func(**af) + def get_description(self): + if len(self.apply_funcs) > 0: + return " and ".join([x['func'].__name__ for x in self.apply_funcs]) + elif self.apply_funcs_key: + return f"functions from arg '{self.apply_funcs_key}'" + else: + raise NotImplementedError + def add_apply_func( self, func, @@ -127,7 +139,7 @@ def add_apply_func( } ) - def transform(self, datastore): + def transform(self, apply_to_data: dict, assign_to_data: Optional[dict] = None, apply_funcs: Optional[list] = None): """ Apply functions to specified feature and metrics @@ -137,23 +149,17 @@ def transform(self, datastore): :return: updated datastore :rtype: dict """ - if self.msg: - self.logger.info(self.msg) + assert isinstance(apply_to_data, dict) + if assign_to_data is None: + assign_to_data = {} - apply_to_data = self.get_datastore_object( - datastore, self.apply_to_key, dtype=dict - ) - assign_to_data = self.get_datastore_object( - datastore, self.assign_to_key, dtype=dict, default={} - ) - - if self.apply_funcs_key: - apply_funcs = self.get_datastore_object( - datastore, self.apply_funcs_key, dtype=list - ) + if apply_funcs is not None: self.apply_funcs += apply_funcs - features = self.get_features(apply_to_data.keys()) + if self.msg: + self.logger.info(self.msg) + + features = self.get_features(list(apply_to_data.keys())) same_key = self.assign_to_key == self.apply_to_key @@ -181,10 +187,7 @@ def transform(self, datastore): ] result = parallel(apply_func_array, args, mode="kwargs") new_metrics = dict(result) - - # storage - datastore[self.store_key] = new_metrics - return datastore + return new_metrics def apply_func_array( diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py index abdbc6ef..9f51fb36 100644 --- a/popmon/analysis/comparison/hist_comparer.py +++ b/popmon/analysis/comparison/hist_comparer.py @@ -162,13 +162,14 @@ def 
__init__( :param args: (tuple, optional): residual args passed on to func_mean and func_std :param kwargs: (dict, optional): residual kwargs passed on to func_mean and func_std """ - super().__init__(modules=[]) - if assign_to_key is None: assign_to_key = read_key # make reference histogram(s) - hist_collector = ApplyFunc(apply_to_key=read_key, assign_to_key=assign_to_key) + hist_collector = ApplyFunc( + apply_to_key=read_key, + assign_to_key=assign_to_key, + ) hist_collector.add_apply_func( func=func_hist_collector, entire=True, suffix=suffix, *args, **kwargs ) @@ -187,7 +188,8 @@ def __init__( } ], ) - self.modules = [hist_collector, hist_comparer] + + super().__init__(modules=[hist_collector, hist_comparer]) class RollingHistComparer(HistComparer): @@ -374,15 +376,20 @@ def __init__( :param args: (tuple, optional): residual args passed on to func_hist_collector :param kwargs: (dict, optional): residual kwargs passed on to func_hist_collector """ - super().__init__(modules=[]) - if assign_to_key is None: assign_to_key = read_key # make reference histogram(s) - hist_collector = ApplyFunc(apply_to_key=read_key, assign_to_key=assign_to_key) + hist_collector = ApplyFunc( + apply_to_key=read_key, + assign_to_key=assign_to_key + ) hist_collector.add_apply_func( - func=func_hist_collector, hist_name=hist_col, suffix="", *args, **kwargs + func=func_hist_collector, + hist_name=hist_col, + suffix="", + *args, + **kwargs ) # do histogram comparison @@ -399,7 +406,7 @@ def __init__( ], ) - self.modules = [hist_collector, hist_comparer] + super().__init__(modules=[hist_collector, hist_comparer]) class RollingNormHistComparer(NormHistComparer): diff --git a/popmon/analysis/functions.py b/popmon/analysis/functions.py index 75a2938b..bc7054e6 100644 --- a/popmon/analysis/functions.py +++ b/popmon/analysis/functions.py @@ -46,7 +46,7 @@ def pull(row, suffix_mean="_mean", suffix_std="_std", cols=None): """ x = pd.Series() if cols is None or len(cols) == 0: - # if no columns are given, find colums for which pulls can be calculated. + # if no columns are given, find columns for which pulls can be calculated. # e.g. to calculate x_pull, need to have [x, x_mean, x_std] present. If so, put x in cols. cols = [] for m in row.index.to_list()[:]: diff --git a/popmon/analysis/merge_statistics.py b/popmon/analysis/merge_statistics.py index 188158b3..3d6eb3be 100644 --- a/popmon/analysis/merge_statistics.py +++ b/popmon/analysis/merge_statistics.py @@ -18,6 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +from typing import List + import pandas as pd from ..base import Module @@ -25,22 +27,20 @@ class MergeStatistics(Module): """Merging dictionaries of features containing dataframes with statistics as its values.""" + _input_keys = ("read_keys", ) + _output_keys = ("store_key", ) - def __init__(self, read_keys, store_key): + def __init__(self, read_keys: List[str], store_key: str): """Initialize an instance of MergeStatistics. 
- :param str read_keys: list of keys of input data to read from the datastore + :param list read_keys: list of keys of input data to read from the datastore :param str store_key: key of output data to store in the datastore """ super().__init__() self.read_keys = read_keys self.store_key = store_key - def transform(self, datastore): - dicts = [ - self.get_datastore_object(datastore, read_key, dtype=dict) - for read_key in self.read_keys - ] + def transform(self, dicts: list): merged_stats = {} for dict_ in dicts: for feature in dict_.keys(): @@ -53,5 +53,4 @@ def transform(self, datastore): ) else: merged_stats[feature] = dict_[feature] - datastore[self.store_key] = merged_stats - return datastore + return merged_stats diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py index afd5862a..45571ac8 100644 --- a/popmon/analysis/profiling/hist_profiler.py +++ b/popmon/analysis/profiling/hist_profiler.py @@ -57,6 +57,8 @@ class HistProfiler(Module): :param str index_col: key for index in split dictionary :param dict stats_functions: function_name, function(bin_labels, bin_counts) dictionary """ + _input_keys = ("read_key", ) + _output_keys = ("store_key", ) def __init__( self, @@ -72,12 +74,12 @@ def __init__( super().__init__() self.read_key = read_key self.store_key = store_key + self.features = features or [] self.ignore_features = ignore_features or [] self.var_timestamp = var_timestamp or [] self.hist_col = hist_col self.index_col = index_col - self.general_stats_1d = [ "count", "filled", @@ -89,7 +91,6 @@ def __init__( ] self.general_stats_2d = ["count", "phik"] self.category_stats_1d = ["fraction_true"] - self.stats_functions = stats_functions if self.stats_functions is None: self.stats_functions = DEFAULT_STATS @@ -222,15 +223,13 @@ def _profile_hist(self, split, hist_name): return profile_list - def transform(self, datastore): + def transform(self, data: dict) -> dict: self.logger.info( f'Profiling histograms "{self.read_key}" as "{self.store_key}"' ) - data = self.get_datastore_object(datastore, self.read_key, dtype=dict) - profiled = {} - - features = self.get_features(data.keys()) + features = self.get_features(list(data.keys())) + profiled = {} for feature in features[:]: df = self.get_datastore_object(data, feature, dtype=pd.DataFrame) hist_split_list = df.reset_index().to_dict("records") @@ -242,5 +241,4 @@ def transform(self, datastore): [self.index_col] ) - datastore[self.store_key] = profiled - return datastore + return profiled diff --git a/popmon/analysis/profiling/pull_calculator.py b/popmon/analysis/profiling/pull_calculator.py index 17936872..3e266545 100644 --- a/popmon/analysis/profiling/pull_calculator.py +++ b/popmon/analysis/profiling/pull_calculator.py @@ -131,8 +131,11 @@ def __init__( :param args: (tuple, optional): residual args passed on to mean and std functions :param kwargs: (dict, optional): residual kwargs passed on to mean and std functions """ - kws = {"window": window, "shift": shift} - kws.update(kwargs) + kws = { + "window": window, + "shift": shift, + **kwargs + } super().__init__( rolling_mean, rolling_std, @@ -183,8 +186,10 @@ def __init__( :param args: (tuple, optional): residual args passed on to mean and std functions :param kwargs: (dict, optional): residual kwargs passed on to mean and std functions """ - kws = {"shift": shift} - kws.update(kwargs) + kws = { + "shift": shift, + **kwargs + } super().__init__( expanding_mean, expanding_std, diff --git a/popmon/base/module.py b/popmon/base/module.py 
index 150db6b6..56b5f33c 100644 --- a/popmon/base/module.py +++ b/popmon/base/module.py @@ -19,10 +19,13 @@ import logging +from abc import ABC, abstractmethod -class Module: +class Module(ABC): """Base class used for modules in a pipeline.""" + _input_keys = None + _output_keys = None def __init__(self): """Module initialization""" @@ -31,6 +34,26 @@ def __init__(self): self.feature_begins_with = [] self.ignore_features = [] + def get_inputs(self): + in_keys = {} + for x in self._input_keys: + in_key = self.__dict__[x] + if in_key != "" and in_key is not None and in_key not in in_keys: + in_keys[x] = in_key + return in_keys + + def get_outputs(self): + out_keys = {} + for x in self._output_keys: + out_key = self.__dict__[x] + if out_key != "" and out_key is not None and out_key not in out_keys: + out_keys[x] = out_key + return out_keys + + # @abstractmethod + def get_description(self): + return "" + def set_logger(self, logger): """Set logger of module @@ -38,7 +61,8 @@ def set_logger(self, logger): """ self.logger = logger - def get_datastore_object(self, datastore, feature, dtype, default=None): + @staticmethod + def get_datastore_object(datastore, feature, dtype, default=None): """Get object from datastore. Bit more advanced than dict.get() @@ -49,17 +73,19 @@ def get_datastore_object(self, datastore, feature, dtype, default=None): :param obj default: object to default to in case key not found. :return: retrieved object """ - obj = datastore.get(feature) - if obj is None: - if default is not None: - obj = default - else: + if default is not None: + obj = datastore.get(feature, default) + else: + try: + obj = datastore[feature] + except KeyError: raise ValueError(f"`{feature}` not found in the datastore!") + if not isinstance(obj, dtype): raise TypeError(f"obj `{feature}` is not an instance of `{dtype}`!") return obj - def get_features(self, all_features): + def get_features(self, all_features: list) -> list: """Get all features that meet feature_begins_with and ignore_features requirements :param list all_features: input features list @@ -67,25 +93,65 @@ def get_features(self, all_features): :rtype: list """ all_features = sorted(all_features) - features = self.features - if not self.features: - features = all_features + features = self.features or all_features + if self.feature_begins_with: features = [k for k in features if k.startswith(self.feature_begins_with)] if self.ignore_features: features = [k for k in features if k not in self.ignore_features] features_not_in_input = [ - feature for feature in features if feature not in all_features + feature + for feature in features + if feature not in all_features ] - features = [feature for feature in features if feature in all_features] - for feature in features_not_in_input: self.logger.warning(f'Feature "{feature}" not in input data; skipping.') + features = [ + feature + for feature in features + if feature in all_features + ] return features - def transform(self, datastore): + def _transform(self, datastore): + """Transformation helper function""" + + inputs = {} + self.logger.debug(f"load from: {type(self)}") + for key in self._input_keys: + key_value = self.__dict__[key] + if key_value and len(key_value) > 0: + if isinstance(key_value, list): + inputs[key] = [datastore.get(k) for k in key_value] + else: + inputs[key] = datastore.get(key_value) + else: + inputs[key] = None + + self.logger.debug(f"load(key={key}, key_value={key_value}, value={str(inputs[key]):.100s})") + + # cache datastore + self._datastore = datastore + + # 
transformation + outputs = self.transform(*list(inputs.values())) + + # transform returns None if no update needs to be made + if outputs is not None: + if len(self._output_keys) == 1: + outputs = (outputs,) + + for k, v in zip(self._output_keys, outputs): + key_value = self.__dict__[k] + self.logger.debug(f"store(key={k}, key_value={key_value}, value={str(v):.100s})") + if key_value and len(key_value) > 0: # and v is not None: + datastore[key_value] = v + + return datastore + + def transform(self, *args): """Central function of the module. Typically transform() takes something from the datastore, does something to it, and puts the results @@ -95,4 +161,4 @@ def transform(self, datastore): :return: updated output datastore :rtype: dict """ - return datastore + raise NotImplementedError diff --git a/popmon/base/pipeline.py b/popmon/base/pipeline.py index 31a83afe..3995235a 100644 --- a/popmon/base/pipeline.py +++ b/popmon/base/pipeline.py @@ -17,13 +17,14 @@ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - +import json import logging +from pathlib import Path from ..base import Module -class Pipeline(Module): +class Pipeline: """Base class used for to run modules in a pipeline.""" def __init__(self, modules, logger=None): @@ -32,7 +33,6 @@ def __init__(self, modules, logger=None): :param list modules: modules of the pipeline. :param logger: logger to be used by each module. """ - super().__init__() self.modules = modules self.set_logger(logger) @@ -68,5 +68,76 @@ def transform(self, datastore): """ for module in self.modules: - datastore = module.transform(datastore) + self.logger.debug(f"transform {module.__class__.__name__}") + if isinstance(module, Pipeline): + datastore = module.transform(datastore) + else: + datastore = module._transform(datastore) return datastore + + def visualize(self, versioned=True, funcs=None, dsets=None): + if dsets is None: + dsets = {} + if funcs is None: + funcs = {} + + modules = [] + for module in self.modules: + name = module.__class__.__name__ + if isinstance(module, Pipeline): + modules.append( + module.visualize(versioned, funcs, dsets) + ) + else: + in_keys = module.get_inputs() + + if versioned: + new_ins = {} + for k, in_key in in_keys.items(): + if in_key not in dsets: + dsets[in_key] = 1 + in_key += f" (v{dsets[in_key]})" + new_ins[k] = in_key + in_keys = new_ins + + out_keys = module.get_outputs() + if versioned: + new_outs = {} + for k, out_key in out_keys.items(): + if out_key in dsets: + dsets[out_key] += 1 + else: + dsets[out_key] = 1 + out_key += f" (v{dsets[out_key]})" + new_outs[k] = out_key + out_keys = new_outs + + self.logger.debug(f"{name}(inputs={in_keys}, outputs={out_keys})") + + # add unique id + if name not in funcs: + funcs[name] = {} + if id(module) not in funcs[name]: + funcs[name][id(module)] = len(funcs[name]) + 1 + + modules.append( + { + 'type': 'module', + 'name': f'{name}', + 'i': f'{funcs[name][id(module)]}', + 'desc': module.get_description(), + 'in': in_keys, + 'out': out_keys + } + ) + data = { + 'type': 'subgraph', + 'name': self.__class__.__name__, + 'modules': modules + } + return data + + def to_json(self, file_name, versioned=True): + d = self.visualize(versioned=versioned) + data = json.dumps(d, indent=4, sort_keys=True) + Path(file_name).write_text(data) diff --git a/popmon/hist/hist_splitter.py b/popmon/hist/hist_splitter.py index 4d11260e..43163414 100644 --- a/popmon/hist/hist_splitter.py +++ 
b/popmon/hist/hist_splitter.py @@ -37,6 +37,9 @@ class HistSplitter(Module): where time is the index and each row is a x:y histogram. """ + _input_keys = ("read_key", ) + _output_keys = ("store_key", ) + def __init__( self, read_key, @@ -70,6 +73,7 @@ def __init__( super().__init__() self.read_key = read_key self.store_key = store_key + self.features = features or [] self.ignore_features = ignore_features or [] self.feature_begins_with = feature_begins_with @@ -86,6 +90,9 @@ def __init__( "flatten_output requires short_keys attribute to be False." ) + def get_description(self): + return "" + def update_divided(self, divided, split, yname): if self.flatten_output: divided.update(split) @@ -95,18 +102,16 @@ def update_divided(self, divided, split, yname): ] return divided - def transform(self, datastore): - divided = {} - + def transform(self, data: dict) -> dict: self.logger.info( f'Splitting histograms "{self.read_key}" as "{self.store_key}"' ) - data = self.get_datastore_object(datastore, self.read_key, dtype=dict) # determine all possible features, used for comparison below - features = self.get_features(data.keys()) + features = self.get_features(list(data.keys())) # if so requested split selected histograms along first axis, and then divide + divided = {} for feature in features[:]: self.logger.debug(f'Now splitting histogram "{feature}"') hist = get_histogram(data[feature]) @@ -147,9 +152,8 @@ def transform(self, datastore): self.update_divided(divided=divided, split=split, yname=yname) # turn divided dicts into dataframes with index - keys = list(divided.keys()) - for k in keys: - divided[k] = pd.DataFrame(divided.pop(k)).set_index(self.index_col) - - datastore[self.store_key] = divided - return datastore + divided = { + k: pd.DataFrame(v).set_index(self.index_col) + for k, v in divided.items() + } + return divided diff --git a/popmon/io/file_reader.py b/popmon/io/file_reader.py index 06c5e0f6..929bec0b 100644 --- a/popmon/io/file_reader.py +++ b/popmon/io/file_reader.py @@ -28,6 +28,9 @@ class FileReader(Module): """Module to read contents from a file, transform the contents with a function and write them to the datastore.""" + _input_keys = tuple() + _output_keys = ("store_key", ) + def __init__( self, store_key: str, @@ -45,9 +48,7 @@ def __init__( super().__init__() if not isinstance(file_path, (str, Path)): raise TypeError("file_path should be of type `str` or `pathlib.Path`") - if apply_func is not None and not isinstance( - apply_func, collections.abc.Callable - ): + if apply_func is not None and not callable(apply_func): raise TypeError("transformation function must be a callable object") self.store_key = store_key @@ -55,7 +56,10 @@ def __init__( self.apply_func = apply_func self.kwargs = kwargs - def transform(self, datastore): + def get_description(self): + return self.file_path + + def transform(self): with open(self.file_path) as file: data = file.read() @@ -68,5 +72,4 @@ def transform(self, datastore): ) # store the transformed/original contents - datastore[self.store_key] = data - return datastore + return data diff --git a/popmon/io/file_writer.py b/popmon/io/file_writer.py index 2408b032..800729c2 100644 --- a/popmon/io/file_writer.py +++ b/popmon/io/file_writer.py @@ -28,6 +28,8 @@ class FileWriter(Module): """Module transforms specific datastore content and writes it to a file.""" + _input_keys = ("read_key", ) + _output_keys = ("store_key", ) def __init__( self, @@ -48,18 +50,20 @@ def __init__( super().__init__() if file_path is not None and not 
isinstance(file_path, (str, Path)): raise TypeError("file_path should be of type `str` or `pathlib.Path`") - if apply_func is not None and not isinstance( - apply_func, collections.abc.Callable - ): + if apply_func is not None and not callable(apply_func): raise TypeError("transformation function must be a callable object") self.read_key = read_key - self.store_key = store_key + self.store_key = store_key or read_key + self.file_path = file_path self.apply_func = apply_func self.kwargs = kwargs - def transform(self, datastore): - data = copy.deepcopy(datastore[self.read_key]) + def get_description(self): + return self.file_path + + def transform(self, data): + data = copy.deepcopy(data) # if a transformation function is provided, transform the data if self.apply_func is not None: @@ -67,14 +71,11 @@ def transform(self, datastore): # if file path is provided, write data to a file. Otherwise, write data into the datastore if self.file_path is None: - datastore[ - self.read_key if self.store_key is None else self.store_key - ] = data + return data else: with open(self.file_path, "w+") as file: file.write(data) self.logger.info( f'Object "{self.read_key}" written to file "{self.file_path}".' ) - - return datastore + return None diff --git a/popmon/io/json_reader.py b/popmon/io/json_reader.py index 6fe4f7f2..aaf0c492 100644 --- a/popmon/io/json_reader.py +++ b/popmon/io/json_reader.py @@ -36,5 +36,5 @@ def __init__(self, file_path: Union[str, Path], store_key: str): """ super().__init__(store_key, file_path, apply_func=json.loads) - def transform(self, datastore): - return super().transform(datastore) + def transform(self, *args): + return super().transform(*args) diff --git a/popmon/pipeline/metrics_pipelines.py b/popmon/pipeline/metrics_pipelines.py index 3de19b23..ba0bff9d 100644 --- a/popmon/pipeline/metrics_pipelines.py +++ b/popmon/pipeline/metrics_pipelines.py @@ -382,7 +382,13 @@ def metrics_rolling_reference( ), ApplyFunc( apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], + apply_funcs=[ + { + "func": traffic_light_summary, + "axis": 1, + "suffix": "" + } + ], assign_to_key="alerts", msg="Generating traffic light alerts summary.", ), diff --git a/popmon/pipeline/report.py b/popmon/pipeline/report.py index 9329924b..7b31a9bf 100644 --- a/popmon/pipeline/report.py +++ b/popmon/pipeline/report.py @@ -27,7 +27,7 @@ make_histograms, ) -from ..base import Module +from ..base import Module, Pipeline from ..config import config from ..pipeline.report_pipelines import ( ReportPipe, @@ -425,6 +425,8 @@ class StabilityReport(Module): after running the pipeline and generating the report. Report can be represented as a HTML string, HTML file or Jupyter notebook's cell output. """ + _input_keys = ("read_key", ) + _output_keys = tuple() def __init__(self, read_key="html_report"): """Initialize an instance of StabilityReport. 
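The StabilityReport change above completes the pattern that this series rolls out to every module: a subclass declares `_input_keys` and `_output_keys` with the names of the instance attributes that hold its datastore keys, implements `transform()` over the already-resolved values, and leaves the datastore plumbing to `Module._transform()`. A minimal sketch of a custom module under this contract (the class and key names are illustrative, not popmon API):

.. code-block:: python

    import numpy as np

    from popmon.base import Module


    class ScaleValues(Module):
        """Hypothetical module: scales an array from the datastore by a constant."""

        _input_keys = ("read_key",)  # attributes holding the input datastore keys
        _output_keys = ("store_key",)  # attributes holding the output datastore keys

        def __init__(self, read_key, store_key, factor=2.0):
            super().__init__()
            self.read_key = read_key
            self.store_key = store_key
            self.factor = factor

        def transform(self, values: np.ndarray) -> np.ndarray:
            # receives datastore[self.read_key]; the return value is written
            # back to datastore[self.store_key] by Module._transform()
            return values * self.factor


    datastore = ScaleValues("raw", "scaled", factor=3.0)._transform({"raw": np.arange(3)})
    # datastore["scaled"] -> array([0., 3., 6.])

The test modules in `tests/popmon/base/test_pipeline.py`, updated later in this series, follow exactly this shape.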
diff --git a/popmon/pipeline/report_pipelines.py b/popmon/pipeline/report_pipelines.py index 2b66c8ac..ad71727f 100644 --- a/popmon/pipeline/report_pipelines.py +++ b/popmon/pipeline/report_pipelines.py @@ -90,6 +90,8 @@ def self_reference( ] pipeline = Pipeline(modules) + # pipeline.to_json("pipeline_self_reference_versioned.json", versioned=True) + # pipeline.to_json("pipeline_self_reference_unversioned.json", versioned=False) return pipeline diff --git a/popmon/stitching/hist_stitcher.py b/popmon/stitching/hist_stitcher.py index 7843842e..8b482682 100644 --- a/popmon/stitching/hist_stitcher.py +++ b/popmon/stitching/hist_stitcher.py @@ -28,6 +28,9 @@ class HistStitcher(Module): """Module stitches histograms by date""" + _input_keys = ("read_key", "delta_key") + _output_keys = ("store_key", ) + def __init__( self, mode="add", @@ -51,28 +54,25 @@ def __init__( (only required when calling transform(datastore) as module) """ super().__init__() - self.mode = mode - self.time_axis = time_axis - self.time_bin_idx = time_bin_idx self.read_key = read_key self.delta_key = delta_key self.store_key = store_key + self.mode = mode + self.time_axis = time_axis + self.time_bin_idx = time_bin_idx self.allowed_modes = ["add", "replace"] - assert self.mode in self.allowed_modes + if self.mode not in self.allowed_modes: + raise ValueError("mode should be either 'add' or 'replace'") + + def get_description(self): + return f"{self.mode}" - def transform(self, datastore): - # --- get input dict lists + def transform(self, hists_basis: dict, hists_delta: dict) -> dict: self.logger.info( f'Stitching histograms "{self.read_key}" and "{self.delta_key}" as "{self.store_key}"' ) - - hists_basis = self.get_datastore_object(datastore, self.read_key, dtype=dict) - hists_delta = self.get_datastore_object(datastore, self.delta_key, dtype=dict) - stitched = self.stitch_histograms(self.mode, hists_basis, hists_delta) - - datastore[self.store_key] = stitched - return datastore + return stitched def stitch_histograms( self, diff --git a/popmon/visualization/alert_section_generator.py b/popmon/visualization/alert_section_generator.py index 23cc5a58..ca5712da 100644 --- a/popmon/visualization/alert_section_generator.py +++ b/popmon/visualization/alert_section_generator.py @@ -18,6 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +from typing import Optional + import numpy as np import pandas as pd from tqdm import tqdm @@ -34,6 +36,8 @@ class AlertSectionGenerator(Module): combines all the plots into a list which is stored together with the section name in a dictionary which later will be used for the report generation. 
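One behavioural detail in the HistStitcher change above: the bare `assert` on `mode` is now an explicit `ValueError`, so a misconfigured pipeline fails with a clear message even under `python -O`, which strips asserts. A small usage sketch (importing the class from its module path):

.. code-block:: python

    from popmon.stitching.hist_stitcher import HistStitcher

    try:
        HistStitcher(mode="subtract")  # only "add" and "replace" are accepted
    except ValueError as err:
        print(err)  # mode should be either 'add' or 'replace'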
""" + _input_keys = ("read_key", "static_bounds", "dynamic_bounds", "store_key") + _output_keys = ("store_key", ) def __init__( self, @@ -76,14 +80,15 @@ def __init__( super().__init__() self.read_key = read_key self.store_key = store_key + self.dynamic_bounds = dynamic_bounds + self.static_bounds = static_bounds + self.features = features or [] self.ignore_features = ignore_features or [] self.section_name = section_name self.last_n = last_n self.skip_first_n = skip_first_n self.skip_last_n = skip_last_n - self.dynamic_bounds = dynamic_bounds - self.static_bounds = static_bounds self.prefix = prefix self.suffices = suffices self.ignore_stat_endswith = ignore_stat_endswith or [] @@ -93,17 +98,28 @@ def __init__( self.plot_overview = True self.plot_metrics = False - def transform(self, datastore): - data_obj = self.get_datastore_object(datastore, self.read_key, dtype=dict) - - static_bounds = self.get_datastore_object( - datastore, self.static_bounds, dtype=dict, default={} - ) - dynamic_bounds = self.get_datastore_object( - datastore, self.dynamic_bounds, dtype=dict, default={} - ) + def get_description(self): + return self.section_name - features = self.get_features(data_obj.keys()) + def transform( + self, + data_obj: dict, + static_bounds: Optional[dict] = None, + dynamic_bounds: Optional[dict] = None, + sections: Optional[list] = None + ): + assert isinstance(data_obj, dict) + if static_bounds is None: + static_bounds = {} + assert isinstance(static_bounds, dict) + if dynamic_bounds is None: + dynamic_bounds = {} + assert isinstance(dynamic_bounds, dict) + if sections is None: + sections = [] + assert isinstance(sections, list) + + features = self.get_features(list(data_obj.keys())) features_w_metrics = [] self.logger.info( @@ -170,18 +186,14 @@ def transform(self, datastore): {"name": feature, "plots": sorted(plots, key=lambda plot: plot["name"])} ) - params = { - "section_title": self.section_name, - "section_description": self.description, - "features": features_w_metrics, - } - - if self.store_key in datastore: - datastore[self.store_key].append(params) - else: - datastore[self.store_key] = [params] - - return datastore + sections.append( + { + "section_title": self.section_name, + "section_description": self.description, + "features": features_w_metrics, + } + ) + return sections def _plot_metric( diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py index 3be3dc38..e5cb75ee 100644 --- a/popmon/visualization/histogram_section.py +++ b/popmon/visualization/histogram_section.py @@ -18,6 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+from typing import Optional + import pandas as pd from histogrammar.util import get_hist_props from tqdm import tqdm @@ -35,6 +37,8 @@ class HistogramSection(Module): """This module plots histograms of all selected features for the last 'n' periods.""" + _input_keys = ("read_key", "store_key") + _output_keys = ("store_key", ) def __init__( self, @@ -63,6 +67,7 @@ def __init__( super().__init__() self.read_key = read_key self.store_key = store_key + self.features = features or [] self.ignore_features = ignore_features or [] self.section_name = section_name @@ -71,10 +76,14 @@ def __init__( self.hist_name_starts_with = hist_name_starts_with self.description = description - def transform(self, datastore): - data_obj = self.get_datastore_object(datastore, self.read_key, dtype=dict) + def get_description(self): + return self.section_name - features = self.get_features(data_obj.keys()) + def transform(self, data_obj: dict, sections: Optional[list] = None): + if sections is None: + sections = [] + + features = self.get_features(list(data_obj.keys())) features_w_metrics = [] self.logger.info(f'Generating section "{self.section_name}".') @@ -110,18 +119,14 @@ def transform(self, datastore): {"name": feature, "plots": sorted(plots, key=lambda plot: plot["name"])} ) - params = { - "section_title": self.section_name, - "section_description": self.description, - "features": features_w_metrics, - } - - if self.store_key in datastore: - datastore[self.store_key].append(params) - else: - datastore[self.store_key] = [params] - - return datastore + sections.append( + { + "section_title": self.section_name, + "section_description": self.description, + "features": features_w_metrics, + } + ) + return sections def _plot_histograms(feature, date, hc_list, hist_names): diff --git a/popmon/visualization/report_generator.py b/popmon/visualization/report_generator.py index eec0f158..b95ac0b3 100644 --- a/popmon/visualization/report_generator.py +++ b/popmon/visualization/report_generator.py @@ -29,6 +29,8 @@ class ReportGenerator(Module): """This module takes already prepared section data, renders HTML section template with the data and glues sections together into one compressed report which is created based on the provided template. """ + _input_keys = ("read_key", ) + _output_keys = ("store_key", ) def __init__(self, read_key, store_key): """Initialize an instance of ReportGenerator. @@ -40,9 +42,10 @@ def __init__(self, read_key, store_key): self.read_key = read_key self.store_key = store_key - def transform(self, datastore): - sections = self.get_datastore_object(datastore, self.read_key, dtype=list) + def get_description(self): + return "HTML Report" + def transform(self, sections: list) -> str: # concatenate HTML sections' code sections_html = "" for i, section_info in enumerate(sections): @@ -51,11 +54,10 @@ def transform(self, datastore): ) # get HTML template for the final report, insert placeholder data and compress the code - datastore[self.store_key] = htmlmin.minify( + return htmlmin.minify( templates_env( filename="core.html", generator=f"popmon {version}", sections=sections_html, ) ) - return datastore diff --git a/popmon/visualization/section_generator.py b/popmon/visualization/section_generator.py index 3f94bca3..f1049858 100644 --- a/popmon/visualization/section_generator.py +++ b/popmon/visualization/section_generator.py @@ -18,6 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
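With the datastore handling lifted into the base class, `ReportGenerator.transform()` above becomes a pure function from a list of section dicts to a minified HTML string, which makes the module easy to exercise on its own. A smoke-test sketch (it assumes the bundled `core.html` template renders an empty section list):

.. code-block:: python

    from popmon.visualization.report_generator import ReportGenerator

    module = ReportGenerator(read_key="report_sections", store_key="html_report")
    datastore = module._transform({"report_sections": []})
    assert isinstance(datastore["html_report"], str)  # minified report skeleton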
+from typing import Optional + import numpy as np import pandas as pd from tqdm import tqdm @@ -33,6 +35,8 @@ class SectionGenerator(Module): combines all the plots into a list which is stored together with the section name in a dictionary which later will be used for the report generation. """ + _input_keys = ("read_key", "static_bounds", "dynamic_bounds", "store_key") + _output_keys = ("store_key", ) def __init__( self, @@ -75,14 +79,15 @@ def __init__( super().__init__() self.read_key = read_key self.store_key = store_key + self.dynamic_bounds = dynamic_bounds + self.static_bounds = static_bounds + self.features = features or [] self.ignore_features = ignore_features or [] self.section_name = section_name self.last_n = last_n self.skip_first_n = skip_first_n self.skip_last_n = skip_last_n - self.dynamic_bounds = dynamic_bounds - self.static_bounds = static_bounds self.prefix = prefix self.suffices = suffices self.ignore_stat_endswith = ignore_stat_endswith or [] @@ -90,17 +95,18 @@ def __init__( self.description = description self.show_stats = show_stats - def transform(self, datastore): - data_obj = self.get_datastore_object(datastore, self.read_key, dtype=dict) + def get_description(self): + return self.section_name - static_bounds = self.get_datastore_object( - datastore, self.static_bounds, dtype=dict, default={} - ) - dynamic_bounds = self.get_datastore_object( - datastore, self.dynamic_bounds, dtype=dict, default={} - ) + def transform(self, data_obj: dict, static_bounds: Optional[dict] = None, dynamic_bounds: Optional[dict] = None, sections: Optional[list] = None): + if static_bounds is None: + static_bounds = {} + if dynamic_bounds is None: + dynamic_bounds = {} + if sections is None: + sections = [] - features = self.get_features(data_obj.keys()) + features = self.get_features(list(data_obj.keys())) features_w_metrics = [] self.logger.info( @@ -151,18 +157,14 @@ def transform(self, datastore): {"name": feature, "plots": sorted(plots, key=lambda plot: plot["name"])} ) - params = { - "section_title": self.section_name, - "section_description": self.description, - "features": features_w_metrics, - } - - if self.store_key not in datastore: - datastore[self.store_key] = [] - - datastore[self.store_key].append(params) - - return datastore + sections.append( + { + "section_title": self.section_name, + "section_description": self.description, + "features": features_w_metrics, + } + ) + return sections def _plot_metric( diff --git a/popmon/visualization/traffic_light_section_generator.py b/popmon/visualization/traffic_light_section_generator.py index ca5ce1d7..56d19d26 100644 --- a/popmon/visualization/traffic_light_section_generator.py +++ b/popmon/visualization/traffic_light_section_generator.py @@ -18,6 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +from typing import Optional + import numpy as np import pandas as pd from tqdm import tqdm @@ -38,6 +40,8 @@ class TrafficLightSectionGenerator(Module): combines all the plots into a list which is stored together with the section name in a dictionary which later will be used for the report generation. 
""" + _input_keys = ("read_key", "dynamic_bounds", "store_key") + _output_keys = ("store_key", ) def __init__( self, @@ -84,14 +88,15 @@ def __init__( super().__init__() self.read_key = read_key self.store_key = store_key + self.dynamic_bounds = dynamic_bounds + self.static_bounds = static_bounds + self.features = features or [] self.ignore_features = ignore_features or [] self.section_name = section_name self.last_n = last_n self.skip_first_n = skip_first_n self.skip_last_n = skip_last_n - self.dynamic_bounds = dynamic_bounds - self.static_bounds = static_bounds self.prefix = prefix self.suffices = suffices self.ignore_stat_endswith = ignore_stat_endswith or [] @@ -101,14 +106,19 @@ def __init__( self.plot_overview = plot_overview self.plot_metrics = plot_metrics - def transform(self, datastore): - data_obj = self.get_datastore_object(datastore, self.read_key, dtype=dict) + def get_description(self): + return self.section_name - dynamic_bounds = self.get_datastore_object( - datastore, self.dynamic_bounds, dtype=dict, default={} - ) + def transform(self, data_obj: dict, dynamic_bounds: Optional[dict] = None, sections: Optional[list] = None): + assert isinstance(data_obj, dict) + if dynamic_bounds is None: + dynamic_bounds = {} + assert isinstance(dynamic_bounds, dict) + if sections is None: + sections = [] + assert isinstance(sections, list) - features = self.get_features(data_obj.keys()) + features = self.get_features(list(data_obj.keys())) features_w_metrics = [] self.logger.info( @@ -170,18 +180,14 @@ def transform(self, datastore): {"name": feature, "plots": sorted(plots, key=lambda plot: plot["name"])} ) - params = { - "section_title": self.section_name, - "section_description": self.description, - "features": features_w_metrics, - } - - if self.store_key in datastore: - datastore[self.store_key].append(params) - else: - datastore[self.store_key] = [params] - - return datastore + sections.append( + { + "section_title": self.section_name, + "section_description": self.description, + "features": features_w_metrics, + } + ) + return sections def _plot_metric(metric, dates, values, last_n, skip_first_n, skip_last_n, skip_empty): diff --git a/tests/popmon/alerting/test_compute_tl_bounds.py b/tests/popmon/alerting/test_compute_tl_bounds.py index c9b392dc..b2211866 100644 --- a/tests/popmon/alerting/test_compute_tl_bounds.py +++ b/tests/popmon/alerting/test_compute_tl_bounds.py @@ -35,14 +35,13 @@ def test_compute_traffic_light_bounds(): monitoring_rules=conf["monitoring_rules"], ) - output = module.transform(datastore)["output_data"] + output = module._transform(datastore)["output_data"] assert "dummy_feature:mae" not in output.keys() assert output["the_feature:mae"] == [8, 4, 2, 2] assert output["the_feature:mse"] == [0.2, 0.11, 0.09, 0] def test_compute_traffic_light_funcs(): - datastore = {"test_data": pytest.test_comparer_df} conf = { @@ -61,7 +60,7 @@ def test_compute_traffic_light_funcs(): monitoring_rules=conf["monitoring_rules"], ) - output = module.transform(datastore)["output_data"] + output = module._transform(datastore)["output_data"] assert len(output) == 3 assert output[0]["features"] == ["dummy_feature"] diff --git a/tests/popmon/analysis/profiling/test_apply_func.py b/tests/popmon/analysis/profiling/test_apply_func.py index 8a53e87e..4adff82e 100644 --- a/tests/popmon/analysis/profiling/test_apply_func.py +++ b/tests/popmon/analysis/profiling/test_apply_func.py @@ -60,7 +60,7 @@ def func(x): module.add_apply_func(np.mean, entire=True) module.add_apply_func(func) - datastore 
= module.transform(datastore) + datastore = module._transform(datastore) p = datastore["profiled"]["asc_numbers"] diff --git a/tests/popmon/analysis/test_merge_statistics.py b/tests/popmon/analysis/test_merge_statistics.py index cc7c1a54..ff474311 100644 --- a/tests/popmon/analysis/test_merge_statistics.py +++ b/tests/popmon/analysis/test_merge_statistics.py @@ -40,7 +40,7 @@ def test_merge_statistics(): } datastore = MergeStatistics( read_keys=["first_df", "second_df"], store_key="output_df" - ).transform(datastore) + )._transform(datastore) pd.testing.assert_frame_equal(df1.combine_first(df2), out) pd.testing.assert_frame_equal(datastore["output_df"]["feature_1"], out) diff --git a/tests/popmon/base/test_pipeline.py b/tests/popmon/base/test_pipeline.py index 613182e1..650a1c71 100644 --- a/tests/popmon/base/test_pipeline.py +++ b/tests/popmon/base/test_pipeline.py @@ -6,66 +6,63 @@ class LogTransformer(Module): + _input_keys = ("input_key", ) + _output_keys = ("output_key", ) + def __init__(self, input_key, output_key): super().__init__() self.input_key = input_key self.output_key = output_key - def transform(self, datastore): - input_array = self.get_datastore_object( - datastore, self.input_key, dtype=np.ndarray - ) - datastore[self.output_key] = np.log(input_array) + def transform(self, input_array: np.ndarray): + output = np.log(input_array) self.logger.info(f"{self.__class__.__name__} is calculated.") - return datastore + return output class PowerTransformer(Module): + _input_keys = ("input_key",) + _output_keys = ("output_key",) + def __init__(self, input_key, output_key, power): super().__init__() self.input_key = input_key self.output_key = output_key self.power = power - def transform(self, datastore): - input_array = self.get_datastore_object( - datastore, self.input_key, dtype=np.ndarray - ) - datastore[self.output_key] = np.power(input_array, self.power) - return datastore + def transform(self, input_array: np.ndarray): + result = np.power(input_array, self.power) + return result class SumNormalizer(Module): + _input_keys = ("input_key",) + _output_keys = ("output_key",) + def __init__(self, input_key, output_key): super().__init__() self.input_key = input_key self.output_key = output_key - def transform(self, datastore): - input_array = self.get_datastore_object( - datastore, self.input_key, dtype=np.ndarray - ) - datastore[self.output_key] = input_array / input_array.sum() - return datastore + def transform(self, input_array: np.ndarray): + result = input_array / input_array.sum() + return result class WeightedSum(Module): + _input_keys = ("input_key", "weight_key") + _output_keys = ("output_key",) + def __init__(self, input_key, weight_key, output_key): super().__init__() self.input_key = input_key self.weight_key = weight_key self.output_key = output_key - def transform(self, datastore): - input_array = self.get_datastore_object( - datastore, self.input_key, dtype=np.ndarray - ) - weights = self.get_datastore_object( - datastore, self.weight_key, dtype=np.ndarray - ) - datastore[self.output_key] = np.sum(input_array * weights) + def transform(self, input_array: np.ndarray, weights: np.ndarray): + result = np.sum(input_array * weights) self.logger.info(f"{self.__class__.__name__} is calculated.") - return datastore + return result def test_popmon_pipeline(): diff --git a/tests/popmon/io/test_file_reader.py b/tests/popmon/io/test_file_reader.py index 9ad91703..d953d3d2 100644 --- a/tests/popmon/io/test_file_reader.py +++ b/tests/popmon/io/test_file_reader.py @@ -10,7 +10,7 
@@ def test_file_reader_json(): store_key="example", apply_func=json.loads, ) - datastore = fr.transform(datastore={}) + datastore = fr._transform(datastore={}) assert datastore["example"]["boolean"] assert len(datastore["example"]["array"]) == 3 diff --git a/tests/popmon/io/test_file_writer.py b/tests/popmon/io/test_file_writer.py index c00fa308..b505b4d0 100644 --- a/tests/popmon/io/test_file_writer.py +++ b/tests/popmon/io/test_file_writer.py @@ -23,25 +23,25 @@ def to_pandas(data): def test_file_writer_json(): datastore = get_ready_ds() - FileWriter("my_data", apply_func=to_json).transform(datastore) + FileWriter("my_data", apply_func=to_json)._transform(datastore) assert datastore["my_data"] == to_json(DATA) def test_file_writer_json_with_kwargument(): datastore = get_ready_ds() - FileWriter("my_data", apply_func=to_json, indent=4).transform(datastore) + FileWriter("my_data", apply_func=to_json, indent=4)._transform(datastore) assert datastore["my_data"] == to_json(DATA, indent=4) def test_file_writer_not_a_func(): datastore = get_ready_ds() with pytest.raises(TypeError): - FileWriter("my_data", apply_func={}).transform(datastore) + FileWriter("my_data", apply_func={})._transform(datastore) def test_file_writer_df(): datastore = get_ready_ds() - FileWriter("my_data", store_key="transformed_data", apply_func=to_pandas).transform( + FileWriter("my_data", store_key="transformed_data", apply_func=to_pandas)._transform( datastore ) assert datastore["my_data"] == DATA diff --git a/tests/popmon/io/test_json_reader.py b/tests/popmon/io/test_json_reader.py index 4a46651b..d47e155b 100644 --- a/tests/popmon/io/test_json_reader.py +++ b/tests/popmon/io/test_json_reader.py @@ -4,7 +4,7 @@ def test_json_reader(): jr = JsonReader(file_path=resources.data("example.json"), store_key="example") - datastore = jr.transform(datastore={}) + datastore = jr._transform(datastore={}) assert datastore["example"]["boolean"] assert len(datastore["example"]["array"]) == 3 diff --git a/tools/pipeline_viz.py b/tools/pipeline_viz.py new file mode 100644 index 00000000..69f2f117 --- /dev/null +++ b/tools/pipeline_viz.py @@ -0,0 +1,97 @@ +import json +from pathlib import Path + +import networkx as nx +import pygraphviz +from networkx.drawing.nx_agraph import to_agraph + + +def generate_pipeline_vizualisation(input_file, output_file, include_subgraphs: bool = False, include_labels: bool = False): + data = Path(input_file).read_text() + data = json.loads(data) + + subgraphs = [] + modules = [] + + def populate(item): + if item['type'] == 'subgraph': + mods = [] + for m in item['modules']: + mods += populate(m) + + subgraphs.append( + { + 'modules': mods, + 'name': item['name'] + } + ) + return mods + elif item['type'] == 'module': + modules.append(item) + name = f"{item['name']}_{item['i']}" + return [name]+list(item["out"].values()) + else: + raise ValueError() + + populate(data) + + G = nx.DiGraph() + for module in modules: + label = f"<{module['name']}" + d = module.get('desc', '') + if len(d) > 0: + label += f"
{d}" + label += ">" + + # unique name + name = f"{module['name']}_{module['i']}" + + G.add_node(name, shape='rectangle', fillcolor='chartreuse', style='filled', label=label) + + + for k, v in module['in'].items(): + kwargs = {} + if include_labels: + kwargs['headlabel'] = k + G.add_edge(v, name, **kwargs) + for k, v in module['out'].items(): + kwargs = {} + if include_labels: + kwargs['taillabel'] = k + G.add_edge(name, v, **kwargs) + + # set defaults + G.graph['graph'] = {'rankdir':'TD'} + G.graph['node'] = {'shape':'oval', 'fillcolor': 'orange', 'style': 'filled'} + G.graph['edge'] = {'fontcolor':"gray50"} + + A = to_agraph(G) + if include_subgraphs: + for idx, subgraph in enumerate(subgraphs): + H = A.subgraph(subgraph["modules"], name=f'cluster_{idx}_{subgraph["name"].lower().replace(" ", "_")}') + H.graph_attr["color"] = "blue" + H.graph_attr["label"] = subgraph["name"] + H.graph_attr["style"] = "dotted" + + A.layout('dot') + A.draw(output_file) + + +if __name__ == "__main__": + data_path = Path("<...>") + + input_file = data_path / "pipeline_self_reference_unversioned.json" + output_file = 'popmon-report-pipeline-subgraphs-unversioned.pdf' + generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=True) + + input_file = data_path / "pipeline_self_reference_unversioned.json" + output_file = 'popmon-report-pipeline-unversioned.pdf' + generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=False) + + input_file = data_path / "pipeline_self_reference_versioned.json" + output_file = 'popmon-report-pipeline-subgraphs-versioned.pdf' + generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=True) + + input_file = data_path / "pipeline_self_reference_versioned.json" + output_file = 'popmon-report-pipeline-versioned.pdf' + generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=False) From aa663b0f8c14e0361bfa2914daef17f45ceb09cf Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Thu, 14 Oct 2021 01:22:59 +0200 Subject: [PATCH 16/34] chore: version bump --- bump.py | 4 ++-- popmon/version.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bump.py b/bump.py index 8a8e803f..50c8df52 100644 --- a/bump.py +++ b/bump.py @@ -2,8 +2,8 @@ from pathlib import Path MAJOR = 0 -REVISION = 4 -PATCH = 4 +REVISION = 5 +PATCH = 0 VERSION = f"{MAJOR}.{REVISION}.{PATCH}" diff --git a/popmon/version.py b/popmon/version.py index e143821c..5aa30b4d 100644 --- a/popmon/version.py +++ b/popmon/version.py @@ -1,3 +1,3 @@ """THIS FILE IS AUTO-GENERATED BY SETUP.PY.""" -version = "0.4.4" +version = "0.5.0" From c9d861ff9e633b598f9daa399d356f4cb3005540 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Thu, 14 Oct 2021 12:04:50 +0200 Subject: [PATCH 17/34] style: lint --- popmon/alerting/alerts_summary.py | 5 +- popmon/alerting/compute_tl_bounds.py | 9 ++- popmon/analysis/apply_func.py | 12 +++- popmon/analysis/comparison/hist_comparer.py | 11 +--- popmon/analysis/merge_statistics.py | 5 +- popmon/analysis/profiling/hist_profiler.py | 5 +- popmon/analysis/profiling/pull_calculator.py | 11 +--- popmon/base/module.py | 21 ++++--- popmon/base/pipeline.py | 22 +++---- popmon/hist/hist_splitter.py | 7 +-- popmon/io/file_reader.py | 4 +- popmon/io/file_writer.py | 5 +- popmon/pipeline/metrics_pipelines.py | 8 +-- popmon/pipeline/report.py | 5 +- popmon/stitching/hist_stitcher.py | 2 +- .../visualization/alert_section_generator.py | 5 +- popmon/visualization/histogram_section.py | 3 +- popmon/visualization/report_generator.py | 5 +- 
popmon/visualization/section_generator.py | 11 +++- .../traffic_light_section_generator.py | 10 +++- tests/popmon/base/test_pipeline.py | 4 +- tests/popmon/io/test_file_writer.py | 6 +- tools/pipeline_viz.py | 58 ++++++++++--------- 23 files changed, 118 insertions(+), 116 deletions(-) diff --git a/popmon/alerting/alerts_summary.py b/popmon/alerting/alerts_summary.py index 870abcce..ff21e568 100644 --- a/popmon/alerting/alerts_summary.py +++ b/popmon/alerting/alerts_summary.py @@ -32,8 +32,9 @@ class AlertsSummary(Module): It combines the alerts-summaries of all individual features into an artificial feature "_AGGREGATE_". """ - _input_keys = ("read_key", ) - _output_keys = ("store_key", ) + + _input_keys = ("read_key",) + _output_keys = ("store_key",) def __init__( self, diff --git a/popmon/alerting/compute_tl_bounds.py b/popmon/alerting/compute_tl_bounds.py index e8c77480..1eb1f3db 100644 --- a/popmon/alerting/compute_tl_bounds.py +++ b/popmon/alerting/compute_tl_bounds.py @@ -21,7 +21,7 @@ import copy import fnmatch from collections import defaultdict -from typing import Tuple, Any +from typing import Any, Tuple import numpy as np import pandas as pd @@ -116,7 +116,8 @@ class ComputeTLBounds(Module): meant to be generic. Then bounds can be stored as either raw values or as directly calculated values on the statistics of the data. """ - _input_keys = ("read_key", ) + + _input_keys = ("read_key",) _output_keys = ("store_key", "apply_funcs_key") def __init__( @@ -213,9 +214,7 @@ def transform(self, test_data: dict) -> Tuple[Any, Any]: # --- 1. tl bounds explicitly defined for a particular feature if feature in pkeys: explicit_cols = [ - pcol - for pcol in pkeys[feature] - if pcol in test_df.columns + pcol for pcol in pkeys[feature] if pcol in test_df.columns ] implicit_cols = set(pkeys[feature]) - set(explicit_cols) diff --git a/popmon/analysis/apply_func.py b/popmon/analysis/apply_func.py index 617fc669..37a4296c 100644 --- a/popmon/analysis/apply_func.py +++ b/popmon/analysis/apply_func.py @@ -33,8 +33,9 @@ class ApplyFunc(Module): Extra parameters (kwargs) can be passed to the apply function. 
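The `get_description()` hook that modules gain across this series supplies the node labels for the pipeline visualisation; ApplyFunc's implementation, just below, joins the names of its apply functions. A quick illustration (assuming the import path `popmon.analysis.apply_func`):

.. code-block:: python

    import numpy as np

    from popmon.analysis.apply_func import ApplyFunc

    module = ApplyFunc(apply_to_key="profiled")
    module.add_apply_func(np.mean, entire=True)
    module.add_apply_func(np.std, entire=True)
    print(module.get_description())  # -> "mean and std"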
""" + _input_keys = ("apply_to_key", "assign_to_key", "apply_funcs_key") - _output_keys = ("store_key", ) + _output_keys = ("store_key",) def __init__( self, @@ -85,7 +86,7 @@ def __init__( def get_description(self): if len(self.apply_funcs) > 0: - return " and ".join([x['func'].__name__ for x in self.apply_funcs]) + return " and ".join([x["func"].__name__ for x in self.apply_funcs]) elif self.apply_funcs_key: return f"functions from arg '{self.apply_funcs_key}'" else: @@ -139,7 +140,12 @@ def add_apply_func( } ) - def transform(self, apply_to_data: dict, assign_to_data: Optional[dict] = None, apply_funcs: Optional[list] = None): + def transform( + self, + apply_to_data: dict, + assign_to_data: Optional[dict] = None, + apply_funcs: Optional[list] = None, + ): """ Apply functions to specified feature and metrics diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py index 9f51fb36..e542c6a5 100644 --- a/popmon/analysis/comparison/hist_comparer.py +++ b/popmon/analysis/comparison/hist_comparer.py @@ -380,16 +380,9 @@ def __init__( assign_to_key = read_key # make reference histogram(s) - hist_collector = ApplyFunc( - apply_to_key=read_key, - assign_to_key=assign_to_key - ) + hist_collector = ApplyFunc(apply_to_key=read_key, assign_to_key=assign_to_key) hist_collector.add_apply_func( - func=func_hist_collector, - hist_name=hist_col, - suffix="", - *args, - **kwargs + func=func_hist_collector, hist_name=hist_col, suffix="", *args, **kwargs ) # do histogram comparison diff --git a/popmon/analysis/merge_statistics.py b/popmon/analysis/merge_statistics.py index 3d6eb3be..232f8b98 100644 --- a/popmon/analysis/merge_statistics.py +++ b/popmon/analysis/merge_statistics.py @@ -27,8 +27,9 @@ class MergeStatistics(Module): """Merging dictionaries of features containing dataframes with statistics as its values.""" - _input_keys = ("read_keys", ) - _output_keys = ("store_key", ) + + _input_keys = ("read_keys",) + _output_keys = ("store_key",) def __init__(self, read_keys: List[str], store_key: str): """Initialize an instance of MergeStatistics. 
diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py index 45571ac8..4e4a7022 100644 --- a/popmon/analysis/profiling/hist_profiler.py +++ b/popmon/analysis/profiling/hist_profiler.py @@ -57,8 +57,9 @@ class HistProfiler(Module): :param str index_col: key for index in split dictionary :param dict stats_functions: function_name, function(bin_labels, bin_counts) dictionary """ - _input_keys = ("read_key", ) - _output_keys = ("store_key", ) + + _input_keys = ("read_key",) + _output_keys = ("store_key",) def __init__( self, diff --git a/popmon/analysis/profiling/pull_calculator.py b/popmon/analysis/profiling/pull_calculator.py index 3e266545..63290d8b 100644 --- a/popmon/analysis/profiling/pull_calculator.py +++ b/popmon/analysis/profiling/pull_calculator.py @@ -131,11 +131,7 @@ def __init__( :param args: (tuple, optional): residual args passed on to mean and std functions :param kwargs: (dict, optional): residual kwargs passed on to mean and std functions """ - kws = { - "window": window, - "shift": shift, - **kwargs - } + kws = {"window": window, "shift": shift, **kwargs} super().__init__( rolling_mean, rolling_std, @@ -186,10 +182,7 @@ def __init__( :param args: (tuple, optional): residual args passed on to mean and std functions :param kwargs: (dict, optional): residual kwargs passed on to mean and std functions """ - kws = { - "shift": shift, - **kwargs - } + kws = {"shift": shift, **kwargs} super().__init__( expanding_mean, expanding_std, diff --git a/popmon/base/module.py b/popmon/base/module.py index 56b5f33c..13208c8d 100644 --- a/popmon/base/module.py +++ b/popmon/base/module.py @@ -24,6 +24,7 @@ class Module(ABC): """Base class used for modules in a pipeline.""" + _input_keys = None _output_keys = None @@ -101,18 +102,12 @@ def get_features(self, all_features: list) -> list: features = [k for k in features if k not in self.ignore_features] features_not_in_input = [ - feature - for feature in features - if feature not in all_features + feature for feature in features if feature not in all_features ] for feature in features_not_in_input: self.logger.warning(f'Feature "{feature}" not in input data; skipping.') - features = [ - feature - for feature in features - if feature in all_features - ] + features = [feature for feature in features if feature in all_features] return features def _transform(self, datastore): @@ -130,7 +125,9 @@ def _transform(self, datastore): else: inputs[key] = None - self.logger.debug(f"load(key={key}, key_value={key_value}, value={str(inputs[key]):.100s})") + self.logger.debug( + f"load(key={key}, key_value={key_value}, value={str(inputs[key]):.100s})" + ) # cache datastore self._datastore = datastore @@ -145,8 +142,10 @@ def _transform(self, datastore): for k, v in zip(self._output_keys, outputs): key_value = self.__dict__[k] - self.logger.debug(f"store(key={k}, key_value={key_value}, value={str(v):.100s})") - if key_value and len(key_value) > 0: # and v is not None: + self.logger.debug( + f"store(key={k}, key_value={key_value}, value={str(v):.100s})" + ) + if key_value and len(key_value) > 0: # and v is not None: datastore[key_value] = v return datastore diff --git a/popmon/base/pipeline.py b/popmon/base/pipeline.py index 3995235a..18b02146 100644 --- a/popmon/base/pipeline.py +++ b/popmon/base/pipeline.py @@ -85,9 +85,7 @@ def visualize(self, versioned=True, funcs=None, dsets=None): for module in self.modules: name = module.__class__.__name__ if isinstance(module, Pipeline): - modules.append( - 
module.visualize(versioned, funcs, dsets) - ) + modules.append(module.visualize(versioned, funcs, dsets)) else: in_keys = module.get_inputs() @@ -122,19 +120,15 @@ def visualize(self, versioned=True, funcs=None, dsets=None): modules.append( { - 'type': 'module', - 'name': f'{name}', - 'i': f'{funcs[name][id(module)]}', - 'desc': module.get_description(), - 'in': in_keys, - 'out': out_keys + "type": "module", + "name": f"{name}", + "i": f"{funcs[name][id(module)]}", + "desc": module.get_description(), + "in": in_keys, + "out": out_keys, } ) - data = { - 'type': 'subgraph', - 'name': self.__class__.__name__, - 'modules': modules - } + data = {"type": "subgraph", "name": self.__class__.__name__, "modules": modules} return data def to_json(self, file_name, versioned=True): diff --git a/popmon/hist/hist_splitter.py b/popmon/hist/hist_splitter.py index 43163414..27a1a883 100644 --- a/popmon/hist/hist_splitter.py +++ b/popmon/hist/hist_splitter.py @@ -37,8 +37,8 @@ class HistSplitter(Module): where time is the index and each row is a x:y histogram. """ - _input_keys = ("read_key", ) - _output_keys = ("store_key", ) + _input_keys = ("read_key",) + _output_keys = ("store_key",) def __init__( self, @@ -153,7 +153,6 @@ def transform(self, data: dict) -> dict: # turn divided dicts into dataframes with index divided = { - k: pd.DataFrame(v).set_index(self.index_col) - for k, v in divided.items() + k: pd.DataFrame(v).set_index(self.index_col) for k, v in divided.items() } return divided diff --git a/popmon/io/file_reader.py b/popmon/io/file_reader.py index 929bec0b..19353cb3 100644 --- a/popmon/io/file_reader.py +++ b/popmon/io/file_reader.py @@ -28,8 +28,8 @@ class FileReader(Module): """Module to read contents from a file, transform the contents with a function and write them to the datastore.""" - _input_keys = tuple() - _output_keys = ("store_key", ) + _input_keys = () + _output_keys = ("store_key",) def __init__( self, diff --git a/popmon/io/file_writer.py b/popmon/io/file_writer.py index 800729c2..6342291f 100644 --- a/popmon/io/file_writer.py +++ b/popmon/io/file_writer.py @@ -28,8 +28,9 @@ class FileWriter(Module): """Module transforms specific datastore content and writes it to a file.""" - _input_keys = ("read_key", ) - _output_keys = ("store_key", ) + + _input_keys = ("read_key",) + _output_keys = ("store_key",) def __init__( self, diff --git a/popmon/pipeline/metrics_pipelines.py b/popmon/pipeline/metrics_pipelines.py index ba0bff9d..3de19b23 100644 --- a/popmon/pipeline/metrics_pipelines.py +++ b/popmon/pipeline/metrics_pipelines.py @@ -382,13 +382,7 @@ def metrics_rolling_reference( ), ApplyFunc( apply_to_key="traffic_lights", - apply_funcs=[ - { - "func": traffic_light_summary, - "axis": 1, - "suffix": "" - } - ], + apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], assign_to_key="alerts", msg="Generating traffic light alerts summary.", ), diff --git a/popmon/pipeline/report.py b/popmon/pipeline/report.py index 7b31a9bf..a25789e1 100644 --- a/popmon/pipeline/report.py +++ b/popmon/pipeline/report.py @@ -425,8 +425,9 @@ class StabilityReport(Module): after running the pipeline and generating the report. Report can be represented as a HTML string, HTML file or Jupyter notebook's cell output. """ - _input_keys = ("read_key", ) - _output_keys = tuple() + + _input_keys = ("read_key",) + _output_keys = () def __init__(self, read_key="html_report"): """Initialize an instance of StabilityReport. 
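The `visualize()`/`to_json()` machinery reformatted above is what the commented-out `to_json` calls in `report_pipelines.py` and the new `tools/pipeline_viz.py` script hook into: each module contributes its name, `get_description()` and resolved input/output keys, and with `versioned=True` every write to a dataset key bumps a `(vN)` suffix, so repeated writes stay distinct nodes in the rendered graph. A self-contained sketch with toy modules (only `Pipeline.to_json` is popmon API; after the later abstract-class patch one would subclass `Pipeline` rather than instantiate it directly):

.. code-block:: python

    from popmon.base import Module, Pipeline


    class Source(Module):
        _input_keys = ()
        _output_keys = ("store_key",)

        def __init__(self, store_key):
            super().__init__()
            self.store_key = store_key

        def transform(self):
            return [1, 2, 3]


    class Doubler(Module):
        _input_keys = ("read_key",)
        _output_keys = ("store_key",)

        def __init__(self, read_key, store_key):
            super().__init__()
            self.read_key = read_key
            self.store_key = store_key

        def transform(self, values):
            return [2 * v for v in values]


    # "raw" is written twice, so the versioned graph shows "raw (v1)" and "raw (v2)"
    pipeline = Pipeline(modules=[Source("raw"), Doubler("raw", "raw")])
    pipeline.to_json("pipeline_graph.json", versioned=True)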
diff --git a/popmon/stitching/hist_stitcher.py b/popmon/stitching/hist_stitcher.py index 8b482682..2b79e91f 100644 --- a/popmon/stitching/hist_stitcher.py +++ b/popmon/stitching/hist_stitcher.py @@ -29,7 +29,7 @@ class HistStitcher(Module): """Module stitches histograms by date""" _input_keys = ("read_key", "delta_key") - _output_keys = ("store_key", ) + _output_keys = ("store_key",) def __init__( self, diff --git a/popmon/visualization/alert_section_generator.py b/popmon/visualization/alert_section_generator.py index ca5712da..89ec31e4 100644 --- a/popmon/visualization/alert_section_generator.py +++ b/popmon/visualization/alert_section_generator.py @@ -36,8 +36,9 @@ class AlertSectionGenerator(Module): combines all the plots into a list which is stored together with the section name in a dictionary which later will be used for the report generation. """ + _input_keys = ("read_key", "static_bounds", "dynamic_bounds", "store_key") - _output_keys = ("store_key", ) + _output_keys = ("store_key",) def __init__( self, @@ -106,7 +107,7 @@ def transform( data_obj: dict, static_bounds: Optional[dict] = None, dynamic_bounds: Optional[dict] = None, - sections: Optional[list] = None + sections: Optional[list] = None, ): assert isinstance(data_obj, dict) if static_bounds is None: diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py index e5cb75ee..2f685ac2 100644 --- a/popmon/visualization/histogram_section.py +++ b/popmon/visualization/histogram_section.py @@ -37,8 +37,9 @@ class HistogramSection(Module): """This module plots histograms of all selected features for the last 'n' periods.""" + _input_keys = ("read_key", "store_key") - _output_keys = ("store_key", ) + _output_keys = ("store_key",) def __init__( self, diff --git a/popmon/visualization/report_generator.py b/popmon/visualization/report_generator.py index b95ac0b3..d5d806fb 100644 --- a/popmon/visualization/report_generator.py +++ b/popmon/visualization/report_generator.py @@ -29,8 +29,9 @@ class ReportGenerator(Module): """This module takes already prepared section data, renders HTML section template with the data and glues sections together into one compressed report which is created based on the provided template. """ - _input_keys = ("read_key", ) - _output_keys = ("store_key", ) + + _input_keys = ("read_key",) + _output_keys = ("store_key",) def __init__(self, read_key, store_key): """Initialize an instance of ReportGenerator. diff --git a/popmon/visualization/section_generator.py b/popmon/visualization/section_generator.py index f1049858..342892ff 100644 --- a/popmon/visualization/section_generator.py +++ b/popmon/visualization/section_generator.py @@ -35,8 +35,9 @@ class SectionGenerator(Module): combines all the plots into a list which is stored together with the section name in a dictionary which later will be used for the report generation. 
""" + _input_keys = ("read_key", "static_bounds", "dynamic_bounds", "store_key") - _output_keys = ("store_key", ) + _output_keys = ("store_key",) def __init__( self, @@ -98,7 +99,13 @@ def __init__( def get_description(self): return self.section_name - def transform(self, data_obj: dict, static_bounds: Optional[dict] = None, dynamic_bounds: Optional[dict] = None, sections: Optional[list] = None): + def transform( + self, + data_obj: dict, + static_bounds: Optional[dict] = None, + dynamic_bounds: Optional[dict] = None, + sections: Optional[list] = None, + ): if static_bounds is None: static_bounds = {} if dynamic_bounds is None: diff --git a/popmon/visualization/traffic_light_section_generator.py b/popmon/visualization/traffic_light_section_generator.py index 56d19d26..662b9a22 100644 --- a/popmon/visualization/traffic_light_section_generator.py +++ b/popmon/visualization/traffic_light_section_generator.py @@ -40,8 +40,9 @@ class TrafficLightSectionGenerator(Module): combines all the plots into a list which is stored together with the section name in a dictionary which later will be used for the report generation. """ + _input_keys = ("read_key", "dynamic_bounds", "store_key") - _output_keys = ("store_key", ) + _output_keys = ("store_key",) def __init__( self, @@ -109,7 +110,12 @@ def __init__( def get_description(self): return self.section_name - def transform(self, data_obj: dict, dynamic_bounds: Optional[dict] = None, sections: Optional[list] = None): + def transform( + self, + data_obj: dict, + dynamic_bounds: Optional[dict] = None, + sections: Optional[list] = None, + ): assert isinstance(data_obj, dict) if dynamic_bounds is None: dynamic_bounds = {} diff --git a/tests/popmon/base/test_pipeline.py b/tests/popmon/base/test_pipeline.py index 650a1c71..79c22908 100644 --- a/tests/popmon/base/test_pipeline.py +++ b/tests/popmon/base/test_pipeline.py @@ -6,8 +6,8 @@ class LogTransformer(Module): - _input_keys = ("input_key", ) - _output_keys = ("output_key", ) + _input_keys = ("input_key",) + _output_keys = ("output_key",) def __init__(self, input_key, output_key): super().__init__() diff --git a/tests/popmon/io/test_file_writer.py b/tests/popmon/io/test_file_writer.py index b505b4d0..7471a067 100644 --- a/tests/popmon/io/test_file_writer.py +++ b/tests/popmon/io/test_file_writer.py @@ -41,8 +41,8 @@ def test_file_writer_not_a_func(): def test_file_writer_df(): datastore = get_ready_ds() - FileWriter("my_data", store_key="transformed_data", apply_func=to_pandas)._transform( - datastore - ) + FileWriter( + "my_data", store_key="transformed_data", apply_func=to_pandas + )._transform(datastore) assert datastore["my_data"] == DATA assert datastore["transformed_data"].to_dict() == to_pandas(DATA).to_dict() diff --git a/tools/pipeline_viz.py b/tools/pipeline_viz.py index 69f2f117..8e50ad43 100644 --- a/tools/pipeline_viz.py +++ b/tools/pipeline_viz.py @@ -6,7 +6,12 @@ from networkx.drawing.nx_agraph import to_agraph -def generate_pipeline_vizualisation(input_file, output_file, include_subgraphs: bool = False, include_labels: bool = False): +def generate_pipeline_vizualisation( + input_file, + output_file, + include_subgraphs: bool = False, + include_labels: bool = False, +): data = Path(input_file).read_text() data = json.loads(data) @@ -14,22 +19,17 @@ def generate_pipeline_vizualisation(input_file, output_file, include_subgraphs: modules = [] def populate(item): - if item['type'] == 'subgraph': + if item["type"] == "subgraph": mods = [] - for m in item['modules']: + for m in item["modules"]: 
                mods += populate(m)

-            subgraphs.append(
-                {
-                    'modules': mods,
-                    'name': item['name']
-                }
-            )
+            subgraphs.append({"modules": mods, "name": item["name"]})
             return mods
-        elif item['type'] == 'module':
+        elif item["type"] == "module":
             modules.append(item)
             name = f"{item['name']}_{item['i']}"
-            return [name]+list(item["out"].values())
+            return [name] + list(item["out"].values())
         else:
             raise ValueError()

@@ -38,7 +38,7 @@ def populate(item):
     G = nx.DiGraph()
     for module in modules:
         label = f"<{module['name']}"
-        d = module.get('desc', '')
+        d = module.get("desc", "")
         if len(d) > 0:
             label += f"<br/>{d}"
         label += ">"

@@ -46,34 +46,38 @@ def populate(item):
         # unique name
         name = f"{module['name']}_{module['i']}"

-        G.add_node(name, shape='rectangle', fillcolor='chartreuse', style='filled', label=label)
-
+        G.add_node(
+            name, shape="rectangle", fillcolor="chartreuse", style="filled", label=label
+        )

-        for k, v in module['in'].items():
+        for k, v in module["in"].items():
             kwargs = {}
             if include_labels:
-                kwargs['headlabel'] = k
+                kwargs["headlabel"] = k
             G.add_edge(v, name, **kwargs)
-        for k, v in module['out'].items():
+        for k, v in module["out"].items():
             kwargs = {}
             if include_labels:
-                kwargs['taillabel'] = k
+                kwargs["taillabel"] = k
             G.add_edge(name, v, **kwargs)

     # set defaults
-    G.graph['graph'] = {'rankdir':'TD'}
-    G.graph['node'] = {'shape':'oval', 'fillcolor': 'orange', 'style': 'filled'}
-    G.graph['edge'] = {'fontcolor':"gray50"}
+    G.graph["graph"] = {"rankdir": "TD"}
+    G.graph["node"] = {"shape": "oval", "fillcolor": "orange", "style": "filled"}
+    G.graph["edge"] = {"fontcolor": "gray50"}

     A = to_agraph(G)
     if include_subgraphs:
         for idx, subgraph in enumerate(subgraphs):
-            H = A.subgraph(subgraph["modules"], name=f'cluster_{idx}_{subgraph["name"].lower().replace(" ", "_")}')
+            H = A.subgraph(
+                subgraph["modules"],
+                name=f'cluster_{idx}_{subgraph["name"].lower().replace(" ", "_")}',
+            )
             H.graph_attr["color"] = "blue"
             H.graph_attr["label"] = subgraph["name"]
             H.graph_attr["style"] = "dotted"

-    A.layout('dot')
+    A.layout("dot")
     A.draw(output_file)


@@ -81,17 +85,17 @@ def populate(item):
     data_path = Path("<...>")

     input_file = data_path / "pipeline_self_reference_unversioned.json"
-    output_file = 'popmon-report-pipeline-subgraphs-unversioned.pdf'
+    output_file = "popmon-report-pipeline-subgraphs-unversioned.pdf"
     generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=True)

     input_file = data_path / "pipeline_self_reference_unversioned.json"
-    output_file = 'popmon-report-pipeline-unversioned.pdf'
+    output_file = "popmon-report-pipeline-unversioned.pdf"
     generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=False)

     input_file = data_path / "pipeline_self_reference_versioned.json"
-    output_file = 'popmon-report-pipeline-subgraphs-versioned.pdf'
+    output_file = "popmon-report-pipeline-subgraphs-versioned.pdf"
     generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=True)

     input_file = data_path / "pipeline_self_reference_versioned.json"
-    output_file = 'popmon-report-pipeline-versioned.pdf'
+    output_file = "popmon-report-pipeline-versioned.pdf"
     generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=False)

From ba98c973c8e27fe69ce1c3a82c4fa14abba3d818 Mon Sep 17 00:00:00 2001
From: Simon Brugman
Date: Thu, 14 Oct 2021 17:38:22 +0200
Subject: [PATCH 18/34] fix: ensure uniqueness of apply_funcs_key

---
 popmon/alerting/compute_tl_bounds.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/popmon/alerting/compute_tl_bounds.py b/popmon/alerting/compute_tl_bounds.py
index 1eb1f3db..5269bb4d 100644
--- a/popmon/alerting/compute_tl_bounds.py
+++ b/popmon/alerting/compute_tl_bounds.py
@@ -440,6 +440,8 @@ def __init__(self, read_key, store_key, rules, expanded_rules_key=""):
         """
         self.read_key = read_key
         apply_funcs_key = f"{read_key}__{store_key}"
+        if len(expanded_rules_key) > 0:
+            apply_funcs_key += f"__{expanded_rules_key}"

         # generate static traffic light bounds by expanding the wildcarded monitoring rules
         expand_bounds = ComputeTLBounds(

From 66141b2b96dbd302d2472455b87a70d01070b375 Mon Sep 17 00:00:00 2001
From: Simon
Brugman Date: Thu, 21 Oct 2021 13:23:33 +0200 Subject: [PATCH 19/34] refactor: remove unused imports --- popmon/base/pipeline.py | 2 -- popmon/io/file_reader.py | 1 - popmon/io/file_writer.py | 1 - 3 files changed, 4 deletions(-) diff --git a/popmon/base/pipeline.py b/popmon/base/pipeline.py index 18b02146..8f042250 100644 --- a/popmon/base/pipeline.py +++ b/popmon/base/pipeline.py @@ -21,8 +21,6 @@ import logging from pathlib import Path -from ..base import Module - class Pipeline: """Base class used for to run modules in a pipeline.""" diff --git a/popmon/io/file_reader.py b/popmon/io/file_reader.py index 19353cb3..09e6a90b 100644 --- a/popmon/io/file_reader.py +++ b/popmon/io/file_reader.py @@ -18,7 +18,6 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import collections.abc from pathlib import Path from typing import Callable, Optional, Union diff --git a/popmon/io/file_writer.py b/popmon/io/file_writer.py index 6342291f..2bbe37c0 100644 --- a/popmon/io/file_writer.py +++ b/popmon/io/file_writer.py @@ -18,7 +18,6 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import collections.abc import copy from pathlib import Path from typing import Callable, Optional, Union From 98acf6875fd791d22e12568495e87e176531ac39 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Thu, 21 Oct 2021 13:25:26 +0200 Subject: [PATCH 20/34] refactor: simplification --- popmon/analysis/comparison/hist_comparer.py | 48 +++++++++++---------- popmon/analysis/profiling/hist_profiler.py | 5 ++- popmon/pipeline/metrics.py | 2 +- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py index e542c6a5..254e191e 100644 --- a/popmon/analysis/comparison/hist_comparer.py +++ b/popmon/analysis/comparison/hist_comparer.py @@ -57,20 +57,20 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): Default is 7.0. :return: pandas Series with popular comparison metrics. 
""" - x = pd.Series() - x["ks"] = np.nan - x["ks_zscore"] = np.nan - x["ks_pvalue"] = np.nan - x["pearson"] = np.nan - x["chi2"] = np.nan - x["chi2_norm"] = np.nan - x["chi2_zscore"] = np.nan - x["chi2_pvalue"] = np.nan - x["chi2_max_residual"] = np.nan - x["chi2_spike_count"] = np.nan - x["max_prob_diff"] = np.nan - unknown_labels = np.nan - x["unknown_labels"] = unknown_labels + x = { + "ks": np.nan, + "ks_zscore": np.nan, + "ks_pvalue": np.nan, + "pearson": np.nan, + "chi2": np.nan, + "chi2_norm": np.nan, + "chi2_zscore": np.nan, + "chi2_pvalue": np.nan, + "chi2_max_residual": np.nan, + "chi2_spike_count": np.nan, + "max_prob_diff": np.nan, + "unknown_labels": np.nan, + } # basic name checks cols = row.index.to_list() @@ -83,15 +83,14 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): # basic histogram checks hist1 = row[hist_name1] hist2 = row[hist_name2] - if not all([isinstance(hist, COMMON_HIST_TYPES) for hist in [hist1, hist2]]): - return x - if not check_similar_hists([hist1, hist2]): - return x + if not all( + [isinstance(hist, COMMON_HIST_TYPES) for hist in [hist1, hist2]] + ) or not check_similar_hists([hist1, hist2]): + return pd.Series(x) # compare - is_num = is_numeric(hist1) if hist1.n_dim == 1: - if is_num: + if is_numeric(hist1): numpy_1dhists = get_consistent_numpy_1dhists([hist1, hist2]) entries_list = [nphist[0] for nphist in numpy_1dhists] # KS-test only properly defined for (ordered) 1D interval variables @@ -106,10 +105,14 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): labels1 = hist1.bin_labels() labels2 = hist2.bin_labels() subset = set(labels1) <= set(labels2) - unknown_labels = int(not subset) + x["unknown_labels"] = int(not subset) elif hist1.n_dim == 2: numpy_2dgrids = get_consistent_numpy_2dgrids([hist1, hist2]) entries_list = [entry.flatten() for entry in numpy_2dgrids] + else: + raise NotImplementedError( + f"histogram with dimension {hist1.n_dim} is not supported" + ) # calculate pearson coefficient pearson, pvalue = (np.nan, np.nan) @@ -130,8 +133,7 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): x["chi2_max_residual"] = max(list(map(abs, res))) x["chi2_spike_count"] = sum(abs(r) > max_res_bound for r in res) x["max_prob_diff"] = googl_test(*entries_list) - x["unknown_labels"] = unknown_labels - return x + return pd.Series(x) class HistComparer(Pipeline): diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py index 4e4a7022..86c63ff3 100644 --- a/popmon/analysis/profiling/hist_profiler.py +++ b/popmon/analysis/profiling/hist_profiler.py @@ -148,7 +148,7 @@ def _profile_1d_histogram(self, name, hist): for f_name, result in zip(name, results) ] - profile.update({k: v for k, v in zip(names, results)}) + profile.update(dict(zip(names, results))) elif not is_num: profile["fraction_true"] = pm_np.fraction_of_true(bin_labels, bin_counts) @@ -190,7 +190,6 @@ def _profile_hist(self, split, hist_name): is_num = is_numeric(hist0) # these are the profiled quantities we will monitor - fields = [] if dimension == 1: fields = list(self.general_stats_1d) fields += ( @@ -200,6 +199,8 @@ def _profile_hist(self, split, hist_name): ) elif dimension == 2: fields = list(self.general_stats_2d) + else: + fields = [] # now loop over split-axis, e.g. 
time index, and profile each sub-hist x:y
        profile_list = []
diff --git a/popmon/pipeline/metrics.py b/popmon/pipeline/metrics.py
index 47135dab..27d4093c 100644
--- a/popmon/pipeline/metrics.py
+++ b/popmon/pipeline/metrics.py
@@ -147,8 +147,8 @@ def stability_metrics(
         "monitoring_rules": monitoring_rules,
         "pull_rules": pull_rules,
         "features": features,
+        **kwargs,
     }
-    cfg.update(kwargs)
 
     datastore = {"hists": hists}
     if reference_type == "external":

From 540d8a54539da00fe74177b6be7f62c989a29822 Mon Sep 17 00:00:00 2001
From: Simon Brugman
Date: Tue, 26 Oct 2021 12:20:01 +0200
Subject: [PATCH 21/34] refactor: pipeline abstract class

Refactor the code in order for the Pipeline class to be abstract.
This ensures that each pipeline is named and improves the code quality
in general.
---
 popmon/base/pipeline.py                      |   5 +-
 popmon/pipeline/amazing_pipeline.py          |  24 +-
 popmon/pipeline/metrics.py                   |  45 +-
 popmon/pipeline/metrics_pipelines.py         | 939 ++++++++++--------
 popmon/pipeline/report.py                    |  24 +-
 popmon/pipeline/report_pipelines.py          | 514 +++++-----
 .../popmon/pipeline/test_report_pipelines.py |  24 +-
 7 files changed, 817 insertions(+), 758 deletions(-)

diff --git a/popmon/base/pipeline.py b/popmon/base/pipeline.py
index 18b02146..ae18013c 100644
--- a/popmon/base/pipeline.py
+++ b/popmon/base/pipeline.py
@@ -19,11 +19,12 @@
 
 import json
 import logging
+from abc import ABC
 from pathlib import Path
 
 
-class Pipeline:
-    """Base class used for to run modules in a pipeline."""
+class Pipeline(ABC):
+    """Abstract base class used to run modules in a pipeline."""
 
     def __init__(self, modules, logger=None):
         """Initialization of the pipeline
diff --git a/popmon/pipeline/amazing_pipeline.py b/popmon/pipeline/amazing_pipeline.py
index a5dbb0ac..3324dd90 100644
--- a/popmon/pipeline/amazing_pipeline.py
+++ b/popmon/pipeline/amazing_pipeline.py
@@ -25,7 +25,19 @@
 from ..base import Pipeline
 from ..config import config
 from ..io import JsonReader
-from ..pipeline.report_pipelines import self_reference
+from ..pipeline.report_pipelines import SelfReference
+
+
+class AmazingPipeline(Pipeline):
+    def __init__(self, **kwargs):
+        modules = [
+            JsonReader(
+                file_path=kwargs["histograms_path"], store_key=kwargs["hists_key"]
+            ),
+            # Or ExternalReference, RollingReference etc.
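+            # For instance, a hypothetical external-reference variant could be
+            # plugged in instead (sketch only; assumes reference histograms are
+            # provided under a "ref_hists_key" entry in kwargs):
+            #   ExternalReference(
+            #       hists_key=kwargs["hists_key"],
+            #       ref_hists_key=kwargs["ref_hists_key"],
+            #   ),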
+ SelfReference(**kwargs), + ] + super().__init__(modules) def run(): @@ -51,15 +63,7 @@ def run(): "show_stats": config["limited_stats"], } - pipeline = Pipeline( - modules=[ - JsonReader(file_path=cfg["histograms_path"], store_key=cfg["hists_key"]), - self_reference(**cfg), - # fixed_reference(**config), - # rolling_reference(**config), - # expanding_reference(**config), - ] - ) + pipeline = AmazingPipeline(**cfg) pipeline.transform(datastore={}) diff --git a/popmon/pipeline/metrics.py b/popmon/pipeline/metrics.py index 27d4093c..84b9dc62 100644 --- a/popmon/pipeline/metrics.py +++ b/popmon/pipeline/metrics.py @@ -27,25 +27,13 @@ make_histograms, ) -from ..pipeline.metrics_pipelines import ( - metrics_expanding_reference, - metrics_external_reference, - metrics_rolling_reference, - metrics_self_reference, -) +from ..pipeline.metrics_pipelines import create_metrics_pipeline logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s [%(module)s]: %(message)s" ) logger = logging.getLogger() -_metrics_pipeline = { - "self": metrics_self_reference, - "external": metrics_external_reference, - "rolling": metrics_rolling_reference, - "expanding": metrics_expanding_reference, -} - def stability_metrics( hists, @@ -111,15 +99,9 @@ def stability_metrics( :param kwargs: residual keyword arguments passed on to report pipeline. :return: dict with results of metrics pipeline """ - # perform basic input checks - reference_types = list(_metrics_pipeline.keys()) - if reference_type not in reference_types: - raise TypeError(f"reference_type should be one of {str(reference_types)}.") if not isinstance(hists, dict): raise TypeError("hists should be a dict of histogrammar histograms.") - if reference_type == "external" and not isinstance(reference, dict): - raise TypeError("reference should be a dict of histogrammar histograms.") if not isinstance(monitoring_rules, dict): monitoring_rules = { @@ -137,25 +119,24 @@ def stability_metrics( first_cols = [k.split(":")[0] for k in list(hists.keys())] time_axis = max(set(first_cols), key=first_cols.count) - # configuration and datastore for report pipeline - cfg = { - "hists_key": "hists", - "ref_hists_key": "ref_hists", - "time_axis": time_axis, - "window": window, - "shift": shift, - "monitoring_rules": monitoring_rules, - "pull_rules": pull_rules, - "features": features, + pipeline = create_metrics_pipeline( + reference_type=reference_type, + reference=reference, + hists_key="hists", + ref_hists_key="ref_hists", + time_axis=time_axis, + window=window, + shift=shift, + monitoring_rules=monitoring_rules, + pull_rules=pull_rules, + features=features, **kwargs, - } + ) datastore = {"hists": hists} if reference_type == "external": datastore["ref_hists"] = reference - # execute reporting pipeline - pipeline = _metrics_pipeline[reference_type](**cfg) return pipeline.transform(datastore) diff --git a/popmon/pipeline/metrics_pipelines.py b/popmon/pipeline/metrics_pipelines.py index 3de19b23..170695af 100644 --- a/popmon/pipeline/metrics_pipelines.py +++ b/popmon/pipeline/metrics_pipelines.py @@ -44,464 +44,521 @@ from ..hist.hist_splitter import HistSplitter -def metrics_self_reference( - hists_key="test_hists", - time_axis="date", - window=10, - monitoring_rules={}, - pull_rules={}, - features=None, - **kwargs, -): - """Example metrics pipeline for comparing test data with itself (full test set) +def get_metrics_pipeline_class(reference_type, reference): + _metrics_pipeline_register = { + "self": SelfReferenceMetricsPipeline, + "external": 
ExternalReferenceMetricsPipeline,
+        "rolling": RollingReferenceMetricsPipeline,
+        "expanding": ExpandingReferenceMetricsPipeline,
+    }
 
+    if reference_type not in _metrics_pipeline_register:
+        raise ValueError(
+            f"reference_type should be in {str(_metrics_pipeline_register.keys())}."
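+            # e.g. an unknown value renders as "reference_type should be in
+            # dict_keys(['self', 'external', 'rolling', 'expanding'])." (illustrative)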
+ ) + if ( + reference_type == "external" + and not isinstance(reference, dict) + and reference is not None + ): + raise TypeError("reference should be a dict of histogrammar histograms.") - pipeline = Pipeline(modules) - return pipeline + return _metrics_pipeline_register[reference_type] -def metrics_external_reference( - hists_key="test_hists", - ref_hists_key="ref_hists", - time_axis="date", +def create_metrics_pipeline( + reference_type="self", + reference=None, + hists_key="hists", + # ref_hists_key="ref_hists", + time_axis="", window=10, monitoring_rules={}, pull_rules={}, features=None, + # shift=1, **kwargs, ): - """Example metrics pipeline for comparing test data with other (full) external reference set + # configuration and datastore for report pipeline + cfg = { + "hists_key": hists_key, + "time_axis": time_axis, + "window": window, + "monitoring_rules": monitoring_rules, + "pull_rules": pull_rules, + "features": features, + # "ref_hists_key": ref_hists_key, + # "shift": shift, + **kwargs, + } - :param str hists_key: key to test histograms in datastore. default is 'test_hists' - :param str ref_hists_key: key to reference histograms in datastore. default is 'ref_hists' - :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. default is 10 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries - :param list features: features of histograms to pick up from input data (optional) - :param kwargs: residual keyword arguments - :return: assembled external reference pipeline - """ - modules = [ - # --- 1. splitting of test histograms - HistSplitter( - read_key=hists_key, - store_key="split_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - # --- 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in - # chi2 comparison of histograms - PreviousHistComparer(read_key="split_hists", store_key="comparisons"), - # --- 3. Profiling of split reference histograms, then chi2 comparison with test histograms - HistSplitter( - read_key=ref_hists_key, - store_key="split_ref_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - ReferenceHistComparer( - reference_key="split_ref_hists", - assign_to_key="split_hists", - store_key="comparisons", - ), - RefMedianMadPullCalculator( - reference_key="comparisons", - assign_to_key="comparisons", - suffix_mean="_mean", - suffix_std="_std", - suffix_pull="_pull", - metrics=["ref_max_prob_diff"], - ), - # --- 4. pull calculation compared with reference mean and std, to obtain normalized residuals of profiles - HistProfiler(read_key="split_hists", store_key="profiles"), - HistProfiler(read_key="split_ref_hists", store_key="ref_profiles"), - ReferencePullCalculator( - reference_key="ref_profiles", - assign_to_key="profiles", - suffix_mean="_mean", - suffix_std="_std", - suffix_pull="_pull", - ), - # --- 5. looking for significant rolling linear trends in selected features/metrics - ApplyFunc( - apply_to_key="profiles", - assign_to_key="comparisons", - apply_funcs=[ - { - "func": rolling_lr_zscore, - "suffix": f"_trend{window}_zscore", - "entire": True, - "window": window, - "metrics": ["mean", "phik", "fraction_true"], - } - ], - msg="Computing significance of (rolling) trend in means of features", - ), - # --- 6. 
generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, - # used for plotting in popmon_profiles report. - StaticBounds( - read_key="profiles", - rules=pull_rules, - store_key="dynamic_bounds", - suffix_mean="_mean", - suffix_std="_std", - ), - StaticBounds( - read_key="comparisons", - rules=pull_rules, - store_key="dynamic_bounds_comparisons", - suffix_mean="_mean", - suffix_std="_std", - ), - # --- 7. expand all (wildcard) static traffic light bounds and apply them. - # Applied to both profiles and comparisons datasets - TrafficLightAlerts( - read_key="profiles", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds", - ), - TrafficLightAlerts( - read_key="comparisons", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds_comparisons", - ), - ApplyFunc( - apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], - assign_to_key="alerts", - msg="Generating traffic light alerts summary.", - ), - AlertsSummary(read_key="alerts"), - ] - - pipeline = Pipeline(modules) + # execute reporting pipeline + cls = get_metrics_pipeline_class(reference_type, reference) + pipeline = cls(**cfg) return pipeline -def metrics_rolling_reference( - hists_key="test_hists", - time_axis="date", - window=10, - shift=1, - monitoring_rules={}, - pull_rules={}, - features=None, - **kwargs, -): - """Example metrics pipeline for comparing test data with itself (rolling test set) +class SelfReferenceMetricsPipeline(Pipeline): + def __init__( + self, + hists_key="test_hists", + time_axis="date", + window=10, + monitoring_rules={}, + pull_rules={}, + features=None, + **kwargs, + ): + """Example metrics pipeline for comparing test data with itself (full test set) - :param str hists_key: key to test histograms in datastore. default is 'test_hists' - :param str time_axis: name of datetime feature. default is 'date' - :param int window: size of rolling window and for trend detection. default is 10 - :param int shift: shift in rolling window. default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries - :param list features: features of histograms to pick up from input data (optional) - :param kwargs: residual keyword arguments - :return: assembled rolling reference pipeline - """ - modules = [ - # --- 1. splitting of test histograms - HistSplitter( - read_key=hists_key, - store_key="split_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - # --- 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in - # chi2 comparison of histograms - PreviousHistComparer(read_key="split_hists", store_key="comparisons"), - # --- 3. profiling of reference histograms, then comparison of with profiled test histograms - # results in chi2 comparison of histograms - RollingHistComparer( - read_key="split_hists", window=window, shift=shift, store_key="comparisons" - ), - RefMedianMadPullCalculator( - reference_key="comparisons", - assign_to_key="comparisons", - suffix_mean="_mean", - suffix_std="_std", - suffix_pull="_pull", - metrics=["roll_max_prob_diff"], - ), - # --- 4. 
profiling of histograms, then pull calculation compared with reference mean and std, - # to obtain normalized residuals of profiles - HistProfiler(read_key="split_hists", store_key="profiles"), - RollingPullCalculator( - read_key="profiles", - window=window, - shift=shift, - suffix_mean="_mean", - suffix_std="_std", - suffix_pull="_pull", - ), - # --- 5. looking for significant rolling linear trends in selected features/metrics - ApplyFunc( - apply_to_key="profiles", - assign_to_key="comparisons", - apply_funcs=[ - { - "func": rolling_lr_zscore, - "suffix": f"_trend{window}_zscore", - "entire": True, - "window": window, - "metrics": ["mean", "phik", "fraction_true"], - } - ], - msg="Computing significance of (rolling) trend in means of features", - ), - # --- 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, - # used for plotting in popmon_profiles report. - DynamicBounds( - read_key="profiles", - rules=pull_rules, - store_key="dynamic_bounds", - suffix_mean="_mean", - suffix_std="_std", - ), - DynamicBounds( - read_key="comparisons", - rules=pull_rules, - store_key="dynamic_bounds_comparisons", - suffix_mean="_mean", - suffix_std="_std", - ), - # --- 7. expand all (wildcard) static traffic light bounds and apply them. - # Applied to both profiles and comparisons datasets - TrafficLightAlerts( - read_key="profiles", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds", - ), - TrafficLightAlerts( - read_key="comparisons", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds_comparisons", - ), - ApplyFunc( - apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], - assign_to_key="alerts", - msg="Generating traffic light alerts summary.", - ), - AlertsSummary(read_key="alerts"), - ] + :param str hists_key: key to test histograms in datastore. default is 'test_hists' + :param str time_axis: name of datetime feature. default is 'date' + :param int window: window size for trend detection. default is 10 + :param dict monitoring_rules: traffic light rules + :param dict pull_rules: pull rules to determine dynamic boundaries + :param list features: features of histograms to pick up from input data (optional) + :param kwargs: residual keyword arguments + :return: assembled self reference pipeline + """ + modules = [ + # 1. splitting of test histograms + HistSplitter( + read_key=hists_key, + store_key="split_hists", + features=features, + feature_begins_with=f"{time_axis}:", + ), + # 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in + # chi2 comparison of histograms + PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + # 3. Comparison of with profiled test histograms, results in chi2 comparison of histograms + ReferenceHistComparer( + reference_key="split_hists", + assign_to_key="split_hists", + store_key="comparisons", + ), + RefMedianMadPullCalculator( + reference_key="comparisons", + assign_to_key="comparisons", + suffix_mean="_mean", + suffix_std="_std", + suffix_pull="_pull", + metrics=["ref_max_prob_diff"], + ), + # 4. 
profiling of histograms, then pull calculation compared with reference mean and std, + # to obtain normalized residuals of profiles + HistProfiler(read_key="split_hists", store_key="profiles"), + RefMedianMadPullCalculator( + reference_key="profiles", + assign_to_key="profiles", + suffix_mean="_mean", + suffix_std="_std", + suffix_pull="_pull", + ), + # 5. looking for significant rolling linear trends in selected features/metrics + ApplyFunc( + apply_to_key="profiles", + assign_to_key="comparisons", + apply_funcs=[ + { + "func": rolling_lr_zscore, + "suffix": f"_trend{window}_zscore", + "entire": True, + "window": window, + "metrics": ["mean", "phik", "fraction_true"], + } + ], + msg="Computing significance of (rolling) trend in means of features", + ), + # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, + # used for plotting in popmon_profiles report. + StaticBounds( + read_key="profiles", + rules=pull_rules, + store_key="dynamic_bounds", + suffix_mean="_mean", + suffix_std="_std", + ), + StaticBounds( + read_key="comparisons", + rules=pull_rules, + store_key="dynamic_bounds_comparisons", + suffix_mean="_mean", + suffix_std="_std", + ), + # 7. expand all (wildcard) static traffic light bounds and apply them. + # Applied to both profiles and comparisons datasets + TrafficLightAlerts( + read_key="profiles", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds", + ), + TrafficLightAlerts( + read_key="comparisons", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds_comparisons", + ), + ApplyFunc( + apply_to_key="traffic_lights", + apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], + assign_to_key="alerts", + msg="Generating traffic light alerts summary.", + ), + AlertsSummary(read_key="alerts"), + ] + super().__init__(modules) - pipeline = Pipeline(modules) - return pipeline +class ExternalReferenceMetricsPipeline(Pipeline): + def __init__( + self, + hists_key="test_hists", + ref_hists_key="ref_hists", + time_axis="date", + window=10, + monitoring_rules={}, + pull_rules={}, + features=None, + **kwargs, + ): + """Example metrics pipeline for comparing test data with other (full) external reference set -def metrics_expanding_reference( - hists_key="test_hists", - time_axis="date", - window=10, - shift=1, - monitoring_rules={}, - pull_rules={}, - features=None, - **kwargs, -): - """Example metrics pipeline for comparing test data with itself (expanding test set) + :param str hists_key: key to test histograms in datastore. default is 'test_hists' + :param str ref_hists_key: key to reference histograms in datastore. default is 'ref_hists' + :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) + :param int window: window size for trend detection. default is 10 + :param dict monitoring_rules: traffic light rules + :param dict pull_rules: pull rules to determine dynamic boundaries + :param list features: features of histograms to pick up from input data (optional) + :param kwargs: residual keyword arguments + :return: assembled external reference pipeline + """ + modules = [ + # 1. splitting of test histograms + HistSplitter( + read_key=hists_key, + store_key="split_hists", + features=features, + feature_begins_with=f"{time_axis}:", + ), + # 2. 
for each histogram with datetime i, comparison of histogram i with histogram i-1, results in + # chi2 comparison of histograms + PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + # 3. Profiling of split reference histograms, then chi2 comparison with test histograms + HistSplitter( + read_key=ref_hists_key, + store_key="split_ref_hists", + features=features, + feature_begins_with=f"{time_axis}:", + ), + ReferenceHistComparer( + reference_key="split_ref_hists", + assign_to_key="split_hists", + store_key="comparisons", + ), + RefMedianMadPullCalculator( + reference_key="comparisons", + assign_to_key="comparisons", + suffix_mean="_mean", + suffix_std="_std", + suffix_pull="_pull", + metrics=["ref_max_prob_diff"], + ), + # 4. pull calculation compared with reference mean and std, to obtain normalized residuals of profiles + HistProfiler(read_key="split_hists", store_key="profiles"), + HistProfiler(read_key="split_ref_hists", store_key="ref_profiles"), + ReferencePullCalculator( + reference_key="ref_profiles", + assign_to_key="profiles", + suffix_mean="_mean", + suffix_std="_std", + suffix_pull="_pull", + ), + # 5. looking for significant rolling linear trends in selected features/metrics + ApplyFunc( + apply_to_key="profiles", + assign_to_key="comparisons", + apply_funcs=[ + { + "func": rolling_lr_zscore, + "suffix": f"_trend{window}_zscore", + "entire": True, + "window": window, + "metrics": ["mean", "phik", "fraction_true"], + } + ], + msg="Computing significance of (rolling) trend in means of features", + ), + # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, + # used for plotting in popmon_profiles report. + StaticBounds( + read_key="profiles", + rules=pull_rules, + store_key="dynamic_bounds", + suffix_mean="_mean", + suffix_std="_std", + ), + StaticBounds( + read_key="comparisons", + rules=pull_rules, + store_key="dynamic_bounds_comparisons", + suffix_mean="_mean", + suffix_std="_std", + ), + # 7. expand all (wildcard) static traffic light bounds and apply them. + # Applied to both profiles and comparisons datasets + TrafficLightAlerts( + read_key="profiles", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds", + ), + TrafficLightAlerts( + read_key="comparisons", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds_comparisons", + ), + ApplyFunc( + apply_to_key="traffic_lights", + apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], + assign_to_key="alerts", + msg="Generating traffic light alerts summary.", + ), + AlertsSummary(read_key="alerts"), + ] + super().__init__(modules) - :param str hists_key: key to test histograms in datastore. default is 'test_hists' - :param str time_axis: name of datetime feature. default is 'date' - :param int window: window size for trend detection. default is 10 - :param int shift: shift in expanding window. default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries - :param list features: features of histograms to pick up from input data (optional) - :param kwargs: residual keyword arguments - :return: assembled expanding reference pipeline - """ - modules = [ - # --- 1. splitting of test histograms - HistSplitter( - read_key=hists_key, - store_key="split_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - # --- 2. 
for each histogram with datetime i, comparison of histogram i with histogram i-1, results in - # chi2 comparison of histograms - PreviousHistComparer(read_key="split_hists", store_key="comparisons"), - # --- 3. profiling of reference histograms, then comparison of with profiled test histograms - # results in chi2 comparison of histograms - ExpandingHistComparer( - read_key="split_hists", shift=shift, store_key="comparisons" - ), - # --- 4. profiling of histograms, then pull calculation compared with reference mean and std, - # to obtain normalized residuals of profiles - RefMedianMadPullCalculator( - reference_key="comparisons", - assign_to_key="comparisons", - suffix_mean="_mean", - suffix_std="_std", - suffix_pull="_pull", - metrics=["expanding_max_prob_diff"], - ), - HistProfiler(read_key="split_hists", store_key="profiles"), - ExpandingPullCalculator( - read_key="profiles", - shift=shift, - suffix_mean="_mean", - suffix_std="_std", - suffix_pull="_pull", - ), - # --- 5. looking for significant rolling linear trends in selected features/metrics - ApplyFunc( - apply_to_key="profiles", - assign_to_key="comparisons", - apply_funcs=[ - { - "func": rolling_lr_zscore, - "suffix": f"_trend{window}_zscore", - "entire": True, - "window": window, - "metrics": ["mean", "phik", "fraction_true"], - } - ], - msg="Computing significance of (rolling) trend in means of features", - ), - # --- 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, - # used for plotting in popmon_profiles report. - DynamicBounds( - read_key="profiles", - rules=pull_rules, - store_key="dynamic_bounds", - suffix_mean="_mean", - suffix_std="_std", - ), - DynamicBounds( - read_key="comparisons", - rules=pull_rules, - store_key="dynamic_bounds_comparisons", - suffix_mean="_mean", - suffix_std="_std", - ), - # --- 7. expand all (wildcard) static traffic light bounds and apply them. - # Applied to both profiles and comparisons datasets - TrafficLightAlerts( - read_key="profiles", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds", - ), - TrafficLightAlerts( - read_key="comparisons", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds_comparisons", - ), - ApplyFunc( - apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], - assign_to_key="alerts", - msg="Generating traffic light alerts summary.", - ), - AlertsSummary(read_key="alerts"), - ] - pipeline = Pipeline(modules) - return pipeline +class RollingReferenceMetricsPipeline(Pipeline): + def __init__( + self, + hists_key="test_hists", + time_axis="date", + window=10, + shift=1, + monitoring_rules={}, + pull_rules={}, + features=None, + **kwargs, + ): + """Example metrics pipeline for comparing test data with itself (rolling test set) + + :param str hists_key: key to test histograms in datastore. default is 'test_hists' + :param str time_axis: name of datetime feature. default is 'date' + :param int window: size of rolling window and for trend detection. default is 10 + :param int shift: shift in rolling window. default is 1 + :param dict monitoring_rules: traffic light rules + :param dict pull_rules: pull rules to determine dynamic boundaries + :param list features: features of histograms to pick up from input data (optional) + :param kwargs: residual keyword arguments + :return: assembled rolling reference pipeline + """ + modules = [ + # 1. 
splitting of test histograms + HistSplitter( + read_key=hists_key, + store_key="split_hists", + features=features, + feature_begins_with=f"{time_axis}:", + ), + # 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in + # chi2 comparison of histograms + PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + # 3. profiling of reference histograms, then comparison of with profiled test histograms + # results in chi2 comparison of histograms + RollingHistComparer( + read_key="split_hists", + window=window, + shift=shift, + store_key="comparisons", + ), + RefMedianMadPullCalculator( + reference_key="comparisons", + assign_to_key="comparisons", + suffix_mean="_mean", + suffix_std="_std", + suffix_pull="_pull", + metrics=["roll_max_prob_diff"], + ), + # 4. profiling of histograms, then pull calculation compared with reference mean and std, + # to obtain normalized residuals of profiles + HistProfiler(read_key="split_hists", store_key="profiles"), + RollingPullCalculator( + read_key="profiles", + window=window, + shift=shift, + suffix_mean="_mean", + suffix_std="_std", + suffix_pull="_pull", + ), + # 5. looking for significant rolling linear trends in selected features/metrics + ApplyFunc( + apply_to_key="profiles", + assign_to_key="comparisons", + apply_funcs=[ + { + "func": rolling_lr_zscore, + "suffix": f"_trend{window}_zscore", + "entire": True, + "window": window, + "metrics": ["mean", "phik", "fraction_true"], + } + ], + msg="Computing significance of (rolling) trend in means of features", + ), + # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, + # used for plotting in popmon_profiles report. + DynamicBounds( + read_key="profiles", + rules=pull_rules, + store_key="dynamic_bounds", + suffix_mean="_mean", + suffix_std="_std", + ), + DynamicBounds( + read_key="comparisons", + rules=pull_rules, + store_key="dynamic_bounds_comparisons", + suffix_mean="_mean", + suffix_std="_std", + ), + # 7. expand all (wildcard) static traffic light bounds and apply them. + # Applied to both profiles and comparisons datasets + TrafficLightAlerts( + read_key="profiles", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds", + ), + TrafficLightAlerts( + read_key="comparisons", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds_comparisons", + ), + ApplyFunc( + apply_to_key="traffic_lights", + apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], + assign_to_key="alerts", + msg="Generating traffic light alerts summary.", + ), + AlertsSummary(read_key="alerts"), + ] + super().__init__(modules) + + +class ExpandingReferenceMetricsPipeline(Pipeline): + def __init__( + self, + hists_key="test_hists", + time_axis="date", + window=10, + shift=1, + monitoring_rules={}, + pull_rules={}, + features=None, + **kwargs, + ): + """Example metrics pipeline for comparing test data with itself (expanding test set) + + :param str hists_key: key to test histograms in datastore. default is 'test_hists' + :param str time_axis: name of datetime feature. default is 'date' + :param int window: window size for trend detection. default is 10 + :param int shift: shift in expanding window. 
default is 1 + :param dict monitoring_rules: traffic light rules + :param dict pull_rules: pull rules to determine dynamic boundaries + :param list features: features of histograms to pick up from input data (optional) + :param kwargs: residual keyword arguments + :return: assembled expanding reference pipeline + """ + modules = [ + # 1. splitting of test histograms + HistSplitter( + read_key=hists_key, + store_key="split_hists", + features=features, + feature_begins_with=f"{time_axis}:", + ), + # 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in + # chi2 comparison of histograms + PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + # 3. profiling of reference histograms, then comparison of with profiled test histograms + # results in chi2 comparison of histograms + ExpandingHistComparer( + read_key="split_hists", shift=shift, store_key="comparisons" + ), + # 4. profiling of histograms, then pull calculation compared with reference mean and std, + # to obtain normalized residuals of profiles + RefMedianMadPullCalculator( + reference_key="comparisons", + assign_to_key="comparisons", + suffix_mean="_mean", + suffix_std="_std", + suffix_pull="_pull", + metrics=["expanding_max_prob_diff"], + ), + HistProfiler(read_key="split_hists", store_key="profiles"), + ExpandingPullCalculator( + read_key="profiles", + shift=shift, + suffix_mean="_mean", + suffix_std="_std", + suffix_pull="_pull", + ), + # 5. looking for significant rolling linear trends in selected features/metrics + ApplyFunc( + apply_to_key="profiles", + assign_to_key="comparisons", + apply_funcs=[ + { + "func": rolling_lr_zscore, + "suffix": f"_trend{window}_zscore", + "entire": True, + "window": window, + "metrics": ["mean", "phik", "fraction_true"], + } + ], + msg="Computing significance of (rolling) trend in means of features", + ), + # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, + # used for plotting in popmon_profiles report. + DynamicBounds( + read_key="profiles", + rules=pull_rules, + store_key="dynamic_bounds", + suffix_mean="_mean", + suffix_std="_std", + ), + DynamicBounds( + read_key="comparisons", + rules=pull_rules, + store_key="dynamic_bounds_comparisons", + suffix_mean="_mean", + suffix_std="_std", + ), + # 7. expand all (wildcard) static traffic light bounds and apply them. 
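+            # (e.g. a wildcard rule "*_pull": [7, 4, -4, -7], the default pull
+            # rule seen in the report.py context further below, expands to one
+            # bound set per matching feature and metric; values illustrative)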
+ # Applied to both profiles and comparisons datasets + TrafficLightAlerts( + read_key="profiles", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds", + ), + TrafficLightAlerts( + read_key="comparisons", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds_comparisons", + ), + ApplyFunc( + apply_to_key="traffic_lights", + apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], + assign_to_key="alerts", + msg="Generating traffic light alerts summary.", + ), + AlertsSummary(read_key="alerts"), + ] + super().__init__(modules) diff --git a/popmon/pipeline/report.py b/popmon/pipeline/report.py index a25789e1..0fbe9377 100644 --- a/popmon/pipeline/report.py +++ b/popmon/pipeline/report.py @@ -29,13 +29,7 @@ from ..base import Module, Pipeline from ..config import config -from ..pipeline.report_pipelines import ( - ReportPipe, - expanding_reference, - external_reference, - rolling_reference, - self_reference, -) +from ..pipeline.report_pipelines import ReportPipe, get_report_pipeline_class from ..resources import templates_env logging.basicConfig( @@ -43,13 +37,6 @@ ) logger = logging.getLogger() -_report_pipeline = { - "self": self_reference, - "external": external_reference, - "rolling": rolling_reference, - "expanding": expanding_reference, -} - def stability_report( hists, @@ -128,13 +115,8 @@ def stability_report( :return: dict with results of reporting pipeline """ # perform basic input checks - reference_types = list(_report_pipeline.keys()) - if reference_type not in reference_types: - raise ValueError(f"reference_type should be one of {str(reference_types)}.") if not isinstance(hists, dict): raise TypeError("hists should be a dict of histogrammar histograms.") - if reference_type == "external" and not isinstance(reference, dict): - raise TypeError("reference should be a dict of histogrammar histograms.") if not isinstance(monitoring_rules, dict): monitoring_rules = { "*_pull": [7, 4, -4, -7], @@ -177,7 +159,7 @@ def stability_report( datastore["ref_hists"] = reference # execute reporting pipeline - pipeline = _report_pipeline[reference_type](**cfg) + pipeline = get_report_pipeline_class(reference_type, reference)(**cfg) stability_report = StabilityReport() stability_report.transform(pipeline.transform(datastore)) return stability_report @@ -522,7 +504,7 @@ def regenerate( """ # basic checks if not self.datastore: - self.logger.warning("Empty datastore, cannot regenerate report.") + self.logger.warning("Empty datastore, could not regenerate report.") return None # start from clean slate diff --git a/popmon/pipeline/report_pipelines.py b/popmon/pipeline/report_pipelines.py index ad71727f..22f7295b 100644 --- a/popmon/pipeline/report_pipelines.py +++ b/popmon/pipeline/report_pipelines.py @@ -24,10 +24,10 @@ from ..config import config from ..io import FileWriter from ..pipeline.metrics_pipelines import ( - metrics_expanding_reference, - metrics_external_reference, - metrics_rolling_reference, - metrics_self_reference, + ExpandingReferenceMetricsPipeline, + ExternalReferenceMetricsPipeline, + RollingReferenceMetricsPipeline, + SelfReferenceMetricsPipeline, ) from ..visualization import ( AlertSectionGenerator, @@ -38,235 +38,253 @@ ) -def self_reference( - hists_key="test_hists", - time_axis="date", - window=10, - monitoring_rules={}, - pull_rules={}, - features=None, - skip_empty_plots=True, - last_n=0, - plot_hist_n=6, - report_filepath=None, - show_stats=None, - **kwargs, -): - """Example 
pipeline for comparing test data with itself (full test set) +def get_report_pipeline_class(reference_type, reference): + _report_pipeline = { + "self": SelfReference, + "external": ExternalReference, + "rolling": RollingReference, + "expanding": ExpandingReference, + } + reference_types = list(_report_pipeline.keys()) + if reference_type not in reference_types: + raise ValueError(f"reference_type should be one of {str(reference_types)}.") + if reference_type == "external" and not isinstance(reference, dict): + raise TypeError("reference should be a dict of histogrammar histograms.") - :param str hists_key: key to test histograms in datastore. default is 'test_hists' - :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. default is 10 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries - :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param kwargs: residual keyword arguments - :return: assembled self reference pipeline - """ - modules = [ - metrics_self_reference( - hists_key, - time_axis, - window, - monitoring_rules, - pull_rules, - features, - **kwargs, - ), - ReportPipe( - sections_key="report_sections", - store_key="html_report", - skip_empty_plots=skip_empty_plots, - last_n=last_n, - plot_hist_n=plot_hist_n, - report_filepath=report_filepath, - show_stats=show_stats, - ), - ] + return _report_pipeline[reference_type] - pipeline = Pipeline(modules) - # pipeline.to_json("pipeline_self_reference_versioned.json", versioned=True) - # pipeline.to_json("pipeline_self_reference_unversioned.json", versioned=False) - return pipeline +class SelfReference(Pipeline): + def __init__( + self, + hists_key="test_hists", + time_axis="date", + window=10, + monitoring_rules={}, + pull_rules={}, + features=None, + skip_empty_plots=True, + last_n=0, + plot_hist_n=6, + report_filepath=None, + show_stats=None, + **kwargs, + ): + """Example pipeline for comparing test data with itself (full test set) -def external_reference( - hists_key="test_hists", - ref_hists_key="ref_hists", - time_axis="date", - window=10, - monitoring_rules={}, - pull_rules={}, - features=None, - skip_empty_plots=True, - last_n=0, - plot_hist_n=2, - report_filepath=None, - show_stats=None, - **kwargs, -): - """Example pipeline for comparing test data with other (full) external reference set + :param str hists_key: key to test histograms in datastore. default is 'test_hists' + :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) + :param int window: window size for trend detection. 
default is 10 + :param dict monitoring_rules: traffic light rules + :param dict pull_rules: pull rules to determine dynamic boundaries + :param list features: features of histograms to pick up from input data (optional) + :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) + :param int last_n: plot statistic data for last 'n' periods (optional) + :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) + :param str report_filepath: the file path where to output the report (optional) + :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) + :param kwargs: residual keyword arguments + :return: assembled self reference pipeline + """ + modules = [ + SelfReferenceMetricsPipeline( + hists_key, + time_axis, + window, + monitoring_rules, + pull_rules, + features, + **kwargs, + ), + ReportPipe( + sections_key="report_sections", + store_key="html_report", + skip_empty_plots=skip_empty_plots, + last_n=last_n, + plot_hist_n=plot_hist_n, + report_filepath=report_filepath, + show_stats=show_stats, + ), + ] - :param str hists_key: key to test histograms in datastore. default is 'test_hists' - :param str ref_hists_key: key to reference histograms in datastore. default is 'ref_hists' - :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. default is 10 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries - :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. 
If None, show all (optional) - :param kwargs: residual keyword arguments - :return: assembled external reference pipeline - """ - modules = [ - metrics_external_reference( - hists_key, - ref_hists_key, - time_axis, - window, - monitoring_rules, - pull_rules, - features, - **kwargs, - ), - ReportPipe( - sections_key="report_sections", - store_key="html_report", - skip_empty_plots=skip_empty_plots, - last_n=last_n, - plot_hist_n=plot_hist_n, - report_filepath=report_filepath, - show_stats=show_stats, - ), - ] + super().__init__(modules) - pipeline = Pipeline(modules) - return pipeline +class ExternalReference(Pipeline): + def __init__( + self, + hists_key="test_hists", + ref_hists_key="ref_hists", + time_axis="date", + window=10, + monitoring_rules={}, + pull_rules={}, + features=None, + skip_empty_plots=True, + last_n=0, + plot_hist_n=2, + report_filepath=None, + show_stats=None, + **kwargs, + ): + """Example pipeline for comparing test data with other (full) external reference set -def rolling_reference( - hists_key="test_hists", - time_axis="date", - window=10, - shift=1, - monitoring_rules={}, - pull_rules={}, - features=None, - skip_empty_plots=True, - last_n=0, - plot_hist_n=6, - report_filepath=None, - show_stats=None, - **kwargs, -): - """Example pipeline for comparing test data with itself (rolling test set) + :param str hists_key: key to test histograms in datastore. default is 'test_hists' + :param str ref_hists_key: key to reference histograms in datastore. default is 'ref_hists' + :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) + :param int window: window size for trend detection. default is 10 + :param dict monitoring_rules: traffic light rules + :param dict pull_rules: pull rules to determine dynamic boundaries + :param list features: features of histograms to pick up from input data (optional) + :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) + :param int last_n: plot statistic data for last 'n' periods (optional) + :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) + :param str report_filepath: the file path where to output the report (optional) + :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) + :param kwargs: residual keyword arguments + :return: assembled external reference pipeline + """ + modules = [ + ExternalReferenceMetricsPipeline( + hists_key, + ref_hists_key, + time_axis, + window, + monitoring_rules, + pull_rules, + features, + **kwargs, + ), + ReportPipe( + sections_key="report_sections", + store_key="html_report", + skip_empty_plots=skip_empty_plots, + last_n=last_n, + plot_hist_n=plot_hist_n, + report_filepath=report_filepath, + show_stats=show_stats, + ), + ] - :param str hists_key: key to test histograms in datastore. default is 'test_hists' - :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: size of rolling window and for trend detection. default is 10 - :param int shift: shift in rolling window. 
default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries - :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param kwargs: residual keyword arguments - :return: assembled rolling reference pipeline - """ - modules = [ - metrics_rolling_reference( - hists_key, - time_axis, - window, - shift, - monitoring_rules, - pull_rules, - features, - **kwargs, - ), - ReportPipe( - sections_key="report_sections", - store_key="html_report", - skip_empty_plots=skip_empty_plots, - last_n=last_n, - plot_hist_n=plot_hist_n, - report_filepath=report_filepath, - show_stats=show_stats, - ), - ] + super().__init__(modules) - pipeline = Pipeline(modules) - return pipeline +class RollingReference(Pipeline): + def __init__( + self, + hists_key="test_hists", + time_axis="date", + window=10, + shift=1, + monitoring_rules={}, + pull_rules={}, + features=None, + skip_empty_plots=True, + last_n=0, + plot_hist_n=6, + report_filepath=None, + show_stats=None, + **kwargs, + ): + """Example pipeline for comparing test data with itself (rolling test set) -def expanding_reference( - hists_key="test_hists", - time_axis="date", - window=10, - shift=1, - monitoring_rules={}, - pull_rules={}, - features=None, - skip_empty_plots=True, - last_n=0, - plot_hist_n=6, - report_filepath=None, - show_stats=None, - **kwargs, -): - """Example pipeline for comparing test data with itself (expanding test set) + :param str hists_key: key to test histograms in datastore. default is 'test_hists' + :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) + :param int window: size of rolling window and for trend detection. default is 10 + :param int shift: shift in rolling window. default is 1 + :param dict monitoring_rules: traffic light rules + :param dict pull_rules: pull rules to determine dynamic boundaries + :param list features: features of histograms to pick up from input data (optional) + :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) + :param int last_n: plot statistic data for last 'n' periods (optional) + :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) + :param str report_filepath: the file path where to output the report (optional) + :param list show_stats: list of statistic name patterns to show in the report. 
If None, show all (optional) + :param kwargs: residual keyword arguments + :return: assembled rolling reference pipeline + """ + modules = [ + RollingReferenceMetricsPipeline( + hists_key, + time_axis, + window, + shift, + monitoring_rules, + pull_rules, + features, + **kwargs, + ), + ReportPipe( + sections_key="report_sections", + store_key="html_report", + skip_empty_plots=skip_empty_plots, + last_n=last_n, + plot_hist_n=plot_hist_n, + report_filepath=report_filepath, + show_stats=show_stats, + ), + ] + + super().__init__(modules) + + +class ExpandingReference(Pipeline): + def __init__( + self, + hists_key="test_hists", + time_axis="date", + window=10, + shift=1, + monitoring_rules={}, + pull_rules={}, + features=None, + skip_empty_plots=True, + last_n=0, + plot_hist_n=6, + report_filepath=None, + show_stats=None, + **kwargs, + ): + """Example pipeline for comparing test data with itself (expanding test set) - :param str hists_key: key to test histograms in datastore. default is 'test_hists' - :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. default is 10 - :param int shift: shift in expanding window. default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries - :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param kwargs: residual keyword arguments - :return: assembled expanding reference pipeline - """ - modules = [ - metrics_expanding_reference( - hists_key, - time_axis, - window, - shift, - monitoring_rules, - pull_rules, - features, - **kwargs, - ), - ReportPipe( - sections_key="report_sections", - store_key="html_report", - skip_empty_plots=skip_empty_plots, - last_n=last_n, - plot_hist_n=plot_hist_n, - report_filepath=report_filepath, - show_stats=show_stats, - ), - ] + :param str hists_key: key to test histograms in datastore. default is 'test_hists' + :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) + :param int window: window size for trend detection. default is 10 + :param int shift: shift in expanding window. default is 1 + :param dict monitoring_rules: traffic light rules + :param dict pull_rules: pull rules to determine dynamic boundaries + :param list features: features of histograms to pick up from input data (optional) + :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) + :param int last_n: plot statistic data for last 'n' periods (optional) + :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) + :param str report_filepath: the file path where to output the report (optional) + :param list show_stats: list of statistic name patterns to show in the report. 
If None, show all (optional) + :param kwargs: residual keyword arguments + :return: assembled expanding reference pipeline + """ + modules = [ + ExpandingReferenceMetricsPipeline( + hists_key, + time_axis, + window, + shift, + monitoring_rules, + pull_rules, + features, + **kwargs, + ), + ReportPipe( + sections_key="report_sections", + store_key="html_report", + skip_empty_plots=skip_empty_plots, + last_n=last_n, + plot_hist_n=plot_hist_n, + report_filepath=report_filepath, + show_stats=show_stats, + ), + ] - pipeline = Pipeline(modules) - return pipeline + super().__init__(modules) class ReportPipe(Pipeline): @@ -306,32 +324,23 @@ def __init__( :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) """ - super().__init__(modules=[]) self.store_key = store_key # dictionary of section descriptions descs = config["section_descriptions"] # default keyword arguments for each section - def sg_kws(read_key): - return { - "read_key": read_key, - "store_key": sections_key, - "skip_empty_plots": skip_empty_plots, - "last_n": last_n, - "skip_first_n": skip_first_n, - "skip_last_n": skip_last_n, - "show_stats": show_stats, - "description": descs.get(read_key, ""), - } + sg_kws = { + "store_key": sections_key, + "skip_empty_plots": skip_empty_plots, + "last_n": last_n, + "skip_first_n": skip_first_n, + "skip_last_n": skip_last_n, + "show_stats": show_stats, + } - self.modules = [ - # --- o generate sections - # - a section of profiled statistics with dynamic or static traffic light bounds - # - a section of histogram and pull comparison statistics - # - a section showing all traffic light alerts of monitored statistics - # - a section with a summary of traffic light alerts - # --- o generate report + modules = [ + # generate section with histogram HistogramSection( read_key="split_hists", store_key=sections_key, @@ -340,28 +349,47 @@ def sg_kws(read_key): last_n=plot_hist_n, description=descs.get("histograms", ""), ), + # section showing all traffic light alerts of monitored statistics TrafficLightSectionGenerator( - section_name=traffic_lights_section, **sg_kws("traffic_lights") + read_key="traffic_lights", + description=descs.get("traffic_lights", ""), + section_name=traffic_lights_section, + **sg_kws, ), - AlertSectionGenerator(section_name=alerts_section, **sg_kws("alerts")), + # section with a summary of traffic light alerts + AlertSectionGenerator( + read_key="alerts", + description=descs.get("alerts", ""), + section_name=alerts_section, + **sg_kws, + ), + # section of histogram and pull comparison statistics SectionGenerator( dynamic_bounds="dynamic_bounds_comparisons", static_bounds="static_bounds_comparisons", section_name=comparisons_section, ignore_stat_endswith=["_mean", "_std", "_pull"], - **sg_kws("comparisons"), + read_key="comparisons", + description=descs.get("comparisons", ""), + **sg_kws, ), + # section of profiled statistics with dynamic or static traffic light bounds SectionGenerator( dynamic_bounds="dynamic_bounds", section_name=profiles_section, static_bounds="static_bounds", ignore_stat_endswith=["_mean", "_std", "_pull"], - **sg_kws("profiles"), + read_key="profiles", + description=descs.get("profiles", ""), + **sg_kws, ), + # generate report ReportGenerator(read_key=sections_key, store_key=store_key), ] if isinstance(report_filepath, (str, Path)) and len(report_filepath) > 0: - self.modules.append(FileWriter(store_key, 
file_path=report_filepath)) + modules.append(FileWriter(store_key, file_path=report_filepath)) + + super().__init__(modules=modules) def transform(self, datastore): self.logger.info(f'Generating report "{self.store_key}".') diff --git a/tests/popmon/pipeline/test_report_pipelines.py b/tests/popmon/pipeline/test_report_pipelines.py index a22f7c9c..983a6212 100644 --- a/tests/popmon/pipeline/test_report_pipelines.py +++ b/tests/popmon/pipeline/test_report_pipelines.py @@ -2,10 +2,10 @@ from popmon.base import Pipeline from popmon.io import JsonReader from popmon.pipeline.report_pipelines import ( - expanding_reference, - external_reference, - rolling_reference, - self_reference, + ExpandingReference, + ExternalReference, + RollingReference, + SelfReference, ) @@ -17,7 +17,7 @@ def test_self_reference(): JsonReader( file_path=resources.data("example_histogram.json"), store_key="hists" ), - self_reference(hists_key="hists", features=hist_list), + SelfReference(hists_key="hists", features=hist_list), ] ) pipeline.transform(datastore={}) @@ -31,8 +31,10 @@ def test_external_reference(): JsonReader( file_path=resources.data("example_histogram.json"), store_key="hists" ), - external_reference( - hists_key="hists", ref_hists_key="hists", features=hist_list + ExternalReference( + hists_key="hists", + ref_hists_key="hists", + features=hist_list, ), ] ) @@ -47,7 +49,11 @@ def test_rolling_reference(): JsonReader( file_path=resources.data("example_histogram.json"), store_key="hists" ), - rolling_reference(hists_key="hists", window=5, features=hist_list), + RollingReference( + hists_key="hists", + window=5, + features=hist_list, + ), ] ) pipeline.transform(datastore={}) @@ -61,7 +67,7 @@ def test_expanding_reference(): JsonReader( file_path=resources.data("example_histogram.json"), store_key="hists" ), - expanding_reference(hists_key="hists", features=hist_list), + ExpandingReference(hists_key="hists", features=hist_list), ] ) pipeline.transform(datastore={}) From 21b1e96f65a7f366e1c465b711b63e74b8bd33eb Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Tue, 26 Oct 2021 12:21:21 +0200 Subject: [PATCH 22/34] refactor: remove unused imports --- examples/flight_delays.py | 2 +- examples/synthetic_data.py | 2 +- popmon/__init__.py | 13 +++++++++++++ popmon/base/module.py | 4 ++-- popmon/pipeline/report.py | 2 +- 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/examples/flight_delays.py b/examples/flight_delays.py index 657cff06..103871bb 100644 --- a/examples/flight_delays.py +++ b/examples/flight_delays.py @@ -1,6 +1,6 @@ import pandas as pd -import popmon +import popmon # noqa from popmon import resources # open synthetic data diff --git a/examples/synthetic_data.py b/examples/synthetic_data.py index b219a40b..62fe981a 100644 --- a/examples/synthetic_data.py +++ b/examples/synthetic_data.py @@ -1,6 +1,6 @@ import pandas as pd -import popmon +import popmon # noqa from popmon import resources # open synthetic data diff --git a/popmon/__init__.py b/popmon/__init__.py index f8ee496d..ee6dbcb8 100644 --- a/popmon/__init__.py +++ b/popmon/__init__.py @@ -32,3 +32,16 @@ from .pipeline.report import df_stability_report, stability_report from .stitching import stitch_histograms from .version import version as __version__ + +__all__ = [ + "get_bin_specs", + "get_time_axes", + "make_histograms", + "decorators", + "df_stability_metrics", + "stability_metrics", + "df_stability_report", + "stability_report", + "stitch_histograms", + "__version__", +] diff --git a/popmon/base/module.py 
b/popmon/base/module.py index 13208c8d..de24b50d 100644 --- a/popmon/base/module.py +++ b/popmon/base/module.py @@ -19,11 +19,11 @@ import logging -from abc import ABC, abstractmethod +from abc import ABC class Module(ABC): - """Base class used for modules in a pipeline.""" + """Abstract base class used for modules in a pipeline.""" _input_keys = None _output_keys = None diff --git a/popmon/pipeline/report.py b/popmon/pipeline/report.py index 0fbe9377..86e5e5e1 100644 --- a/popmon/pipeline/report.py +++ b/popmon/pipeline/report.py @@ -27,7 +27,7 @@ make_histograms, ) -from ..base import Module, Pipeline +from ..base import Module from ..config import config from ..pipeline.report_pipelines import ReportPipe, get_report_pipeline_class from ..resources import templates_env From fd84944468c9c783c32a7ee115328fc0408d98c1 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Tue, 26 Oct 2021 12:21:28 +0200 Subject: [PATCH 23/34] ci: check for unused imports --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2918d023..60d700ff 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: - id: flake8 additional_dependencies: - flake8-comprehensions - args: [ "--select=E9,F63,F7,F82,C4"] + args: [ "--select=E9,F63,F7,F82,C4,F401"] - repo: https://github.com/asottile/pyupgrade rev: v2.29.1 hooks: From bb09d730d275e4a97d0d7174d8a325e8c98bea44 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Tue, 26 Oct 2021 12:22:05 +0200 Subject: [PATCH 24/34] feat: improve pipeline visualization better handling of subgraphs standalone --- tools/pipeline_viz.py | 189 ++++++++++++++++++++++++------------------ 1 file changed, 110 insertions(+), 79 deletions(-) diff --git a/tools/pipeline_viz.py b/tools/pipeline_viz.py index 8e50ad43..d6f53765 100644 --- a/tools/pipeline_viz.py +++ b/tools/pipeline_viz.py @@ -1,12 +1,11 @@ import json +from itertools import cycle from pathlib import Path -import networkx as nx -import pygraphviz -from networkx.drawing.nx_agraph import to_agraph +import pygraphviz as pgv -def generate_pipeline_vizualisation( +def generate_pipeline_visualisation( input_file, output_file, include_subgraphs: bool = False, @@ -15,87 +14,119 @@ def generate_pipeline_vizualisation( data = Path(input_file).read_text() data = json.loads(data) - subgraphs = [] - modules = [] - - def populate(item): - if item["type"] == "subgraph": - mods = [] - for m in item["modules"]: - mods += populate(m) - - subgraphs.append({"modules": mods, "name": item["name"]}) - return mods - elif item["type"] == "module": - modules.append(item) - name = f"{item['name']}_{item['i']}" - return [name] + list(item["out"].values()) - else: - raise ValueError() - - populate(data) - - G = nx.DiGraph() - for module in modules: + tableau20 = [ + (31, 119, 180), + (174, 199, 232), + (255, 127, 14), + (255, 187, 120), + (44, 160, 44), + (152, 223, 138), + (214, 39, 40), + (255, 152, 150), + (148, 103, 189), + (197, 176, 213), + (140, 86, 75), + (196, 156, 148), + (227, 119, 194), + (247, 182, 210), + (127, 127, 127), + (199, 199, 199), + (188, 189, 34), + (219, 219, 141), + (23, 190, 207), + (158, 218, 229), + ] + + colors = [f"#{r:02x}{g:02x}{b:02x}" for r, g, b in tableau20] + subgraph_colors = cycle(colors) + module_style = {"shape": "rectangle", "fillcolor": "chartreuse", "style": "filled"} + dataset_style = {"shape": "oval", "fillcolor": "orange", "style": "filled"} + subgraph_style = {} + edge_style = 
{"fontcolor": "gray50"} + + def get_module_label(module): label = f"<{module['name']}" d = module.get("desc", "") if len(d) > 0: label += f"
{d}" label += ">" + return label + + def process(data, G): + if data["type"] == "subgraph": + if include_subgraphs: + c = G.add_subgraph( + name=f'cluster_{data["name"]}', + label=data["name"], + color=next(subgraph_colors), + **subgraph_style, + ) + else: + c = G + for m in data["modules"]: + process(m, c) + elif data["type"] == "module": + name = f"{data['name']}_{data['i']}" + G.add_node(name, label=get_module_label(data), **module_style) + + for k, v in data["in"].items(): + kwargs = {} + if include_labels: + kwargs["headlabel"] = k + G.add_edge(v, name, **edge_style, **kwargs) + for k, v in data["out"].items(): + kwargs = {} + if include_labels: + kwargs["taillabel"] = k + G.add_edge(name, v, **edge_style, **kwargs) + else: + raise ValueError("type should be 'subgraph' or 'module'") - # unique name - name = f"{module['name']}_{module['i']}" - - G.add_node( - name, shape="rectangle", fillcolor="chartreuse", style="filled", label=label - ) - - for k, v in module["in"].items(): - kwargs = {} - if include_labels: - kwargs["headlabel"] = k - G.add_edge(v, name, **kwargs) - for k, v in module["out"].items(): - kwargs = {} - if include_labels: - kwargs["taillabel"] = k - G.add_edge(name, v, **kwargs) - - # set defaults - G.graph["graph"] = {"rankdir": "TD"} - G.graph["node"] = {"shape": "oval", "fillcolor": "orange", "style": "filled"} - G.graph["edge"] = {"fontcolor": "gray50"} - - A = to_agraph(G) - if include_subgraphs: - for idx, subgraph in enumerate(subgraphs): - H = A.subgraph( - subgraph["modules"], - name=f'cluster_{idx}_{subgraph["name"].lower().replace(" ", "_")}', - ) - H.graph_attr["color"] = "blue" - H.graph_attr["label"] = subgraph["name"] - H.graph_attr["style"] = "dotted" - - A.layout("dot") - A.draw(output_file) - - -if __name__ == "__main__": - data_path = Path("<...>") - - input_file = data_path / "pipeline_self_reference_unversioned.json" - output_file = "popmon-report-pipeline-subgraphs-unversioned.pdf" - generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=True) + g = pgv.AGraph(name="popmon-pipeline", directed=True) + g.node_attr.update(**dataset_style) + process(data, g) - input_file = data_path / "pipeline_self_reference_unversioned.json" - output_file = "popmon-report-pipeline-unversioned.pdf" - generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=False) + g.layout("dot") + g.draw(output_file) - input_file = data_path / "pipeline_self_reference_versioned.json" - output_file = "popmon-report-pipeline-subgraphs-versioned.pdf" - generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=True) - input_file = data_path / "pipeline_self_reference_versioned.json" - output_file = "popmon-report-pipeline-versioned.pdf" - generate_pipeline_vizualisation(input_file, output_file, include_subgraphs=False) +if __name__ == "__main__": + data_path = Path(".") + + # Example pipeline + from popmon import resources + from popmon.config import config + from popmon.pipeline.amazing_pipeline import AmazingPipeline + + cfg = { + **config, + "histograms_path": resources.data("synthetic_histograms.json"), + "hists_key": "hists", + "ref_hists_key": "hists", + "datetime_name": "date", + "window": 20, + "shift": 1, + "monitoring_rules": { + "*_pull": [7, 4, -4, -7], + "*_zscore": [7, 4, -4, -7], + }, + "pull_rules": {"*_pull": [7, 4, -4, -7]}, + "show_stats": config["limited_stats"], + } + + pipeline = AmazingPipeline(**cfg) + name = pipeline.__class__.__name__.lower() + + input_file = data_path / f"pipeline_{name}_unversioned.json" 
+ pipeline.to_json(input_file, versioned=False) + output_file = f"pipeline_{name}_subgraphs_unversioned.pdf" + generate_pipeline_visualisation(input_file, output_file, include_subgraphs=True) + output_file = f"pipeline_{name}_unversioned.pdf" + generate_pipeline_visualisation(input_file, output_file, include_subgraphs=False) + + input_file = data_path / f"pipeline_{name}_versioned.json" + pipeline.to_json(input_file, versioned=True) + output_file = f"pipeline_{name}_subgraphs_versioned.pdf" + generate_pipeline_visualisation(input_file, output_file, include_subgraphs=True) + output_file = f"pipeline_{name}_versioned.pdf" + generate_pipeline_visualisation(input_file, output_file, include_subgraphs=False) From 1f6c6488c64c8efe0c7c7561214340ed613025d6 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Tue, 26 Oct 2021 12:22:27 +0200 Subject: [PATCH 25/34] style: various code style improvements --- popmon/analysis/merge_statistics.py | 12 ++++++------ popmon/io/file_writer.py | 14 +++++++------- popmon/pipeline/report.py | 13 +++++++++---- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/popmon/analysis/merge_statistics.py b/popmon/analysis/merge_statistics.py index 232f8b98..bcffa833 100644 --- a/popmon/analysis/merge_statistics.py +++ b/popmon/analysis/merge_statistics.py @@ -44,14 +44,14 @@ def __init__(self, read_keys: List[str], store_key: str): def transform(self, dicts: list): merged_stats = {} for dict_ in dicts: - for feature in dict_.keys(): - # we add statistics dataframe to the final output for specific feature however - # if the feature already exists - we concatenate its dataframe with the existing one - if isinstance(dict_[feature], pd.DataFrame): + for feature, values in dict_.items(): + if isinstance(values, pd.DataFrame): + # we add statistics dataframe to the final output for specific feature however + # if the feature already exists - we concatenate its dataframe with the existing one if feature in merged_stats: merged_stats[feature] = merged_stats[feature].combine_first( - dict_[feature] + values ) else: - merged_stats[feature] = dict_[feature] + merged_stats[feature] = values return merged_stats diff --git a/popmon/io/file_writer.py b/popmon/io/file_writer.py index 2bbe37c0..c7455bae 100644 --- a/popmon/io/file_writer.py +++ b/popmon/io/file_writer.py @@ -72,10 +72,10 @@ def transform(self, data): # if file path is provided, write data to a file. Otherwise, write data into the datastore if self.file_path is None: return data - else: - with open(self.file_path, "w+") as file: - file.write(data) - self.logger.info( - f'Object "{self.read_key}" written to file "{self.file_path}".' - ) - return None + + with open(self.file_path, "w+") as file: + file.write(data) + self.logger.info( + f'Object "{self.read_key}" written to file "{self.file_path}".' + ) + return None diff --git a/popmon/pipeline/report.py b/popmon/pipeline/report.py index 86e5e5e1..dc5259cf 100644 --- a/popmon/pipeline/report.py +++ b/popmon/pipeline/report.py @@ -418,12 +418,14 @@ def __init__(self, read_key="html_report"): """ super().__init__() self.read_key = read_key - self.html_report = "" self.datastore = {} + @property + def html_report(self): + return self.get_datastore_object(self.datastore, self.read_key, str) + def transform(self, datastore): self.datastore = datastore - self.html_report = self.get_datastore_object(datastore, self.read_key, str) def _repr_html_(self): """HTML representation of the class (report) embedded in an iframe. 
@@ -444,9 +446,12 @@ def to_html(self, escape=False):
         :param bool escape: escape characters which could conflict with other HTML code. default: False
         :return str: HTML code of the report
         """
-        import html
 
-        return html.escape(self.html_report) if escape else self.html_report
+        if escape:
+            import html
+
+            return html.escape(self.html_report)
+        return self.html_report
 
     def to_file(self, filename):
         """Store HTML report in the local file system.

From 913bfb0aec607ea68567ecc71e65cfae7c86ff75 Mon Sep 17 00:00:00 2001
From: Simon Brugman
Date: Tue, 26 Oct 2021 12:22:54 +0200
Subject: [PATCH 26/34] docs: pipeline visualizations in docs and notebooks

---
 README.rst                                    | 14 +++
 docs/source/assets/pipeline.png               | Bin 0 -> 27222 bytes
 .../notebooks/popmon_tutorial_advanced.ipynb  | 82 ++++++++++++------
 3 files changed, 68 insertions(+), 28 deletions(-)
 create mode 100644 docs/source/assets/pipeline.png

diff --git a/README.rst b/README.rst
index 994a531b..620c4158 100644
--- a/README.rst
+++ b/README.rst
@@ -157,6 +157,17 @@ These examples also work with spark dataframes. You can see the output of such
 example notebook code `here `_. For all available examples, please see the `tutorials `_ at read-the-docs.
 
+Pipelines for monitoring dataset shift
+======================================
+Advanced users can leverage popmon's modular data pipeline to customize their workflow.
+Visualization of the pipeline can be useful when debugging, or for didactic purposes.
+There is a `script <https://github.com/ing-bank/popmon/tree/master/tools/>`_ included with the package that you can use.
+The plotting is configurable, and depending on the options you will obtain a result that can be used for understanding the data flow, the high-level components and the (re)use of datasets.
+
+|pipeline|
+
+*Example pipeline visualization (click to enlarge)*
+
 Resources
 =========
 
@@ -214,6 +225,9 @@ Copyright ING WBAA. `popmon` is completely free, open-source and licensed under
 .. |example| image:: https://raw.githubusercontent.com/ing-bank/popmon/master/docs/source/assets/traffic_light_overview.png
     :alt: Traffic Light Overview
+.. |pipeline| image:: https://raw.githubusercontent.com/ing-bank/popmon/master/docs/source/assets/pipeline.png
+    :alt: Pipeline Visualization
+    :target: https://github.com/ing-bank/popmon/files/7417124/pipeline_amazingpipeline_subgraphs_unversioned.pdf
 .. |build| image:: https://github.com/ing-bank/popmon/workflows/build/badge.svg
     :alt: Build status
 .. |docs| image:: https://readthedocs.org/projects/popmon/badge/?version=latest
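
A minimal sketch of how the tools script might be invoked on a serialized
pipeline, assuming tools/ is on the import path; the file names below are
illustrative, while the function and its keyword arguments are the ones
introduced in PATCH 24:

    from pipeline_viz import generate_pipeline_visualisation

    # render a serialized pipeline with subgraph clusters and edge labels
    generate_pipeline_visualisation(
        "pipeline_selfreference_unversioned.json",
        "pipeline.pdf",
        include_subgraphs=True,
        include_labels=True,
    )
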
diff --git a/docs/source/assets/pipeline.png b/docs/source/assets/pipeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..784c2cdf7b0a13d002cc1e522258cb407c380fe9
GIT binary patch
literal 27222
[27222 bytes of base85-encoded PNG data omitted]

diff --git a/popmon/notebooks/popmon_tutorial_advanced.ipynb b/popmon/notebooks/popmon_tutorial_advanced.ipynb
[earlier hunks of this notebook diff were lost in extraction; the recoverable tail follows]
+    "There is a `script <https://github.com/ing-bank/popmon/tree/master/tools/>`_ included with the package that you can use.\n",
+    "The plotting is configurable, and depending on the options you will obtain a result that can be used for understanding the data flow, the high-level components and the (re)use of datasets.\n",
+    "The parameters are: subgraph (yes/no), version datasets (yes/no) and display edge labels (yes/no)."
+   ]
  }
 ],
 "metadata": {
@@ -470,7 +496,7 @@
    "name": "python3"
   },
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -484,7 +510,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.6"
+   "version": "3.8.8"
   },
   "nteract": {
    "version": "0.15.0"

From b141965fcf71e90215ab73c89945acb70bcce738 Mon Sep 17 00:00:00 2001
From: Simon Brugman
Date: Wed, 27 Oct 2021 23:32:15 +0200
Subject: [PATCH 27/34] refactor: the StabilityReport is not a module

It does not transform. Rather it allows rendering to notebook/html.
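
A minimal sketch of the new usage, mirroring the updated unit tests (the
output filename is illustrative):

    from popmon import resources
    from popmon.base import Pipeline
    from popmon.io import JsonReader
    from popmon.pipeline.report import StabilityReport
    from popmon.pipeline.report_pipelines import SelfReference

    # run a report pipeline to fill the datastore, including "html_report"
    pipeline = Pipeline(
        modules=[
            JsonReader(
                file_path=resources.data("example_histogram.json"),
                store_key="hists",
            ),
            SelfReference(hists_key="hists"),
        ]
    )
    datastore = pipeline.transform(datastore={})

    # StabilityReport no longer transforms the datastore; it only renders it
    report = StabilityReport(datastore)
    report.to_file("report.html")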
--- .../notebooks/popmon_tutorial_advanced.ipynb | 30 +++++++++---------- popmon/pipeline/report.py | 27 +++++++---------- 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/popmon/notebooks/popmon_tutorial_advanced.ipynb b/popmon/notebooks/popmon_tutorial_advanced.ipynb index fcfb6922..65c3d6ba 100644 --- a/popmon/notebooks/popmon_tutorial_advanced.ipynb +++ b/popmon/notebooks/popmon_tutorial_advanced.ipynb @@ -4,7 +4,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false }, @@ -390,10 +389,9 @@ " \"*_zscore\": [7, 4, -4, -7],\n", " \"[!p]*_unknown_labels\": [0.5, 0.5, 0, 0],\n", "}\n", - "datastore = {}\n", - "datastore[\"hists\"] = df.pm_make_histograms(\n", - " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\"\n", - ")\n", + "datastore = {\n", + " \"hists\": df.pm_make_histograms(time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\")\n", + "}\n", "\n", "\n", "class CustomPipeline(Pipeline):\n", @@ -410,9 +408,9 @@ "\n", "\n", "pipeline = CustomPipeline()\n", + "datastore = pipeline.transform(datastore)\n", "\n", - "stability_report = StabilityReport()\n", - "stability_report.transform(pipeline.transform(datastore))\n", + "stability_report = StabilityReport(datastore)\n", "stability_report" ] }, @@ -431,10 +429,10 @@ "source": [ "from popmon.analysis.comparison.hist_comparer import ReferenceHistComparer\n", "\n", - "datastore = {}\n", - "datastore[\"hists\"] = df.pm_make_histograms(\n", - " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\"\n", - ")\n", + "\n", + "datastore = {\n", + " \"hists\": df.pm_make_histograms(time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\")\n", + "}\n", "\n", "\n", "class CustomComparisonsPipeline(Pipeline):\n", @@ -459,9 +457,9 @@ "\n", " \n", "pipeline = CustomComparisonsPipeline()\n", + "datastore = pipeline.transform(datastore)\n", "\n", - "stability_report = StabilityReport()\n", - "stability_report.transform(pipeline.transform(datastore))\n", + "stability_report = StabilityReport(datastore)\n", "stability_report" ] }, @@ -485,7 +483,7 @@ "(Click to enlarge)\n", "\n", "Visualization of the pipeline can be useful when debugging, or for didactic purposes.\n", - "There is a `script `_ included with the package that you can use.\n", + "There is a [script](https://github.com/ing-bank/popmon/tree/master/tools/) included with the package that you can use.\n", "The plotting is configurable, and depending on the options you will obtain a result that can be used for understanding the data flow, the high-level components and the (re)use of datasets.\n", "The parameters are: subgraph (yes/no), version datasets (yes/no) and display edge labels (yes/no)." 
] @@ -496,7 +494,7 @@ "name": "python3" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -510,7 +508,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.9.7" }, "nteract": { "version": "0.15.0" diff --git a/popmon/pipeline/report.py b/popmon/pipeline/report.py index dc5259cf..c00746fd 100644 --- a/popmon/pipeline/report.py +++ b/popmon/pipeline/report.py @@ -27,7 +27,6 @@ make_histograms, ) -from ..base import Module from ..config import config from ..pipeline.report_pipelines import ReportPipe, get_report_pipeline_class from ..resources import templates_env @@ -160,8 +159,9 @@ def stability_report( # execute reporting pipeline pipeline = get_report_pipeline_class(reference_type, reference)(**cfg) - stability_report = StabilityReport() - stability_report.transform(pipeline.transform(datastore)) + result = pipeline.transform(datastore) + + stability_report = StabilityReport(datastore=result) return stability_report @@ -400,7 +400,7 @@ def df_stability_report( ) -class StabilityReport(Module): +class StabilityReport: """Representation layer of the report. Stability report module wraps the representation functionality of the report @@ -408,24 +408,18 @@ class StabilityReport(Module): as a HTML string, HTML file or Jupyter notebook's cell output. """ - _input_keys = ("read_key",) - _output_keys = () - - def __init__(self, read_key="html_report"): + def __init__(self, datastore, read_key="html_report"): """Initialize an instance of StabilityReport. :param str read_key: key of HTML report data to read from data store. default is html_report. """ - super().__init__() self.read_key = read_key - self.datastore = {} + self.datastore = datastore + self.logger = logging.getLogger() @property def html_report(self): - return self.get_datastore_object(self.datastore, self.read_key, str) - - def transform(self, datastore): - self.datastore = datastore + return self.datastore[self.read_key] def _repr_html_(self): """HTML representation of the class (report) embedded in an iframe. 
@@ -531,6 +525,7 @@ def regenerate( report_filepath=report_filepath, show_stats=show_stats, ) - stability_report = StabilityReport() - stability_report.transform(pipeline.transform(self.datastore)) + result = pipeline.transform(self.datastore) + + stability_report = StabilityReport(datastore=result) return stability_report From 678145909fef37fdd9bbf72f3cb7ccbe91ae494b Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Wed, 27 Oct 2021 23:34:04 +0200 Subject: [PATCH 28/34] refactor: remove dead code --- popmon/analysis/apply_func.py | 1 - popmon/analysis/profiling/hist_profiler.py | 1 - popmon/stats/numpy.py | 4 ---- popmon/visualization/backend.py | 5 ----- popmon/visualization/histogram_section.py | 1 - tests/popmon/stats/test_numpy.py | 1 - 6 files changed, 13 deletions(-) diff --git a/popmon/analysis/apply_func.py b/popmon/analysis/apply_func.py index 37a4296c..00664b77 100644 --- a/popmon/analysis/apply_func.py +++ b/popmon/analysis/apply_func.py @@ -288,7 +288,6 @@ def apply_func(feature, selected_metrics, df, arr): else selected_metrics ) metrics = [m for m in metrics if m in df.columns] - # assert all(m in df.columns for m in metrics) if len(metrics) == 0: return {} df = df[metrics] if len(metrics) >= 2 else df[metrics[0]] diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py index 86c63ff3..3f1f762e 100644 --- a/popmon/analysis/profiling/hist_profiler.py +++ b/popmon/analysis/profiling/hist_profiler.py @@ -171,7 +171,6 @@ def _profile_2d_histogram(self, name, hist): # calculate phik correlation try: phi_k = phik.phik_from_hist2d(observed=grid) - # p, Z = significance.significance_from_hist2d(values=grid, significance_method='asymptotic') except ValueError: self.logger.debug( f"Not enough values in the 2d `{name}` time-split histogram to apply the phik test." diff --git a/popmon/stats/numpy.py b/popmon/stats/numpy.py index a3718b74..e5a7cbd7 100644 --- a/popmon/stats/numpy.py +++ b/popmon/stats/numpy.py @@ -458,8 +458,4 @@ def mad(a, c=0.6745, axis=0): center = a.median(axis=axis) rel_abs_diff = (a - center).abs() / c mad = rel_abs_diff.median(axis=axis) - - # mad = np.median((np.abs(a-center)) / c, axis=axis) - # if isinstance(a, pd.DataFrame): - # mad = pd.Series(data=mad, index=a.columns) return mad diff --git a/popmon/visualization/backend.py b/popmon/visualization/backend.py index f209da95..48ce5ab5 100644 --- a/popmon/visualization/backend.py +++ b/popmon/visualization/backend.py @@ -107,11 +107,6 @@ def set_matplotlib_backend(backend=None, batch=None, silent=True): raise RuntimeError( "Cannot set Matplotlib backend: pyplot module already loaded." ) - # Warning is too verbose - # else: - # logger.warning( - # "Cannot set Matplotlib backend: pyplot module already loaded." 
- # ) return # set matplotlib backend diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py index 2f685ac2..4611aa4d 100644 --- a/popmon/visualization/histogram_section.py +++ b/popmon/visualization/histogram_section.py @@ -184,7 +184,6 @@ def _plot_histograms(feature, date, hc_list, hist_names): hists, feature, hist_names, y_label, is_num, is_ts ) elif hc_list[0].n_dim == 2: - # grid2d_list, xkeys, ykeys = get_consistent_numpy_2dgrids(hc_list, get_bin_labels=True) plot = "" else: plot = "" diff --git a/tests/popmon/stats/test_numpy.py b/tests/popmon/stats/test_numpy.py index 3cf15ff7..0a382cda 100644 --- a/tests/popmon/stats/test_numpy.py +++ b/tests/popmon/stats/test_numpy.py @@ -255,7 +255,6 @@ def test_probability_distribution_mean_covariance(): n_histos = 5000 max_hist_entries = 10000 rel_error = 0.1 - # basic = np.random.uniform(0, 1, size=n_bins) bin_entries = [] for k in range(n_histos): bin_probs = np.random.normal(1.0, rel_error, size=n_bins) # + basic From 8efc072468be407eb2fa7dab6a94be3da6e8d782 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Wed, 27 Oct 2021 23:35:09 +0200 Subject: [PATCH 29/34] refactor: move visualization code --- popmon/base/pipeline.py | 63 -------------------------------- tools/pipeline_viz.py | 81 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 72 deletions(-) diff --git a/popmon/base/pipeline.py b/popmon/base/pipeline.py index ae18013c..9afef85e 100644 --- a/popmon/base/pipeline.py +++ b/popmon/base/pipeline.py @@ -17,10 +17,8 @@ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import json import logging from abc import ABC -from pathlib import Path class Pipeline(ABC): @@ -73,64 +71,3 @@ def transform(self, datastore): else: datastore = module._transform(datastore) return datastore - - def visualize(self, versioned=True, funcs=None, dsets=None): - if dsets is None: - dsets = {} - if funcs is None: - funcs = {} - - modules = [] - for module in self.modules: - name = module.__class__.__name__ - if isinstance(module, Pipeline): - modules.append(module.visualize(versioned, funcs, dsets)) - else: - in_keys = module.get_inputs() - - if versioned: - new_ins = {} - for k, in_key in in_keys.items(): - if in_key not in dsets: - dsets[in_key] = 1 - in_key += f" (v{dsets[in_key]})" - new_ins[k] = in_key - in_keys = new_ins - - out_keys = module.get_outputs() - if versioned: - new_outs = {} - for k, out_key in out_keys.items(): - if out_key in dsets: - dsets[out_key] += 1 - else: - dsets[out_key] = 1 - out_key += f" (v{dsets[out_key]})" - new_outs[k] = out_key - out_keys = new_outs - - self.logger.debug(f"{name}(inputs={in_keys}, outputs={out_keys})") - - # add unique id - if name not in funcs: - funcs[name] = {} - if id(module) not in funcs[name]: - funcs[name][id(module)] = len(funcs[name]) + 1 - - modules.append( - { - "type": "module", - "name": f"{name}", - "i": f"{funcs[name][id(module)]}", - "desc": module.get_description(), - "in": in_keys, - "out": out_keys, - } - ) - data = {"type": "subgraph", "name": self.__class__.__name__, "modules": modules} - return data - - def to_json(self, file_name, versioned=True): - d = self.visualize(versioned=versioned) - data = json.dumps(d, indent=4, sort_keys=True) - Path(file_name).write_text(data) diff --git a/tools/pipeline_viz.py b/tools/pipeline_viz.py index d6f53765..64919586 100644 --- a/tools/pipeline_viz.py +++ b/tools/pipeline_viz.py 
@@ -4,6 +4,70 @@ import pygraphviz as pgv +from popmon.base import Pipeline + + +def serialize_module(module, versioned, funcs, dsets): + in_keys = module.get_inputs() + name = module.__class__.__name__ + + if versioned: + new_ins = {} + for k, in_key in in_keys.items(): + if in_key not in dsets: + dsets[in_key] = 1 + in_key += f" (v{dsets[in_key]})" + new_ins[k] = in_key + in_keys = new_ins + + out_keys = module.get_outputs() + if versioned: + new_outs = {} + for k, out_key in out_keys.items(): + if out_key in dsets: + dsets[out_key] += 1 + else: + dsets[out_key] = 1 + out_key += f" (v{dsets[out_key]})" + new_outs[k] = out_key + out_keys = new_outs + + # add unique id + if name not in funcs: + funcs[name] = {} + if id(module) not in funcs[name]: + funcs[name][id(module)] = len(funcs[name]) + 1 + + return { + "type": "module", + "name": f"{name}", + "i": f"{funcs[name][id(module)]}", + "desc": module.get_description(), + "in": in_keys, + "out": out_keys, + } + + +def serialize_pipeline(pipeline, versioned=True, funcs=None, dsets=None): + if dsets is None: + dsets = {} + if funcs is None: + funcs = {} + + modules = [] + for module in pipeline.modules: + if isinstance(module, Pipeline): + modules.append(serialize_pipeline(module, versioned, funcs, dsets)) + else: + modules.append(serialize_module(module, versioned, funcs, dsets)) + return {"type": "pipeline", "name": pipeline.__class__.__name__, "modules": modules} + + +def pipeline_to_json(pipeline, file_name, versioned=True): + d = serialize_pipeline(pipeline, versioned=versioned) + data = json.dumps(d, indent=4, sort_keys=True) + Path(file_name).write_text(data) + def generate_pipeline_visualisation( input_file, @@ -38,10 +102,10 @@ def generate_pipeline_visualisation( ] colors = [f"#{r:02x}{g:02x}{b:02x}" for r, g, b in tableau20] - subgraph_colors = cycle(colors) + pipeline_colors = cycle(colors) + pipeline_style = {} module_style = {"shape": "rectangle", "fillcolor": "chartreuse", "style": "filled"} dataset_style = {"shape": "oval", "fillcolor": "orange", "style": "filled"} - subgraph_style = {} edge_style = {"fontcolor": "gray50"} def get_module_label(module): @@ -53,13 +117,13 @@ def get_module_label(module): return label def process(data, G): - if data["type"] == "subgraph": + if data["type"] == "pipeline": if include_subgraphs: c = G.add_subgraph( name=f'cluster_{data["name"]}', label=data["name"], - color=next(subgraph_colors), - **subgraph_style, + color=next(pipeline_colors), + **pipeline_style, ) else: c = G @@ -80,12 +144,11 @@ def process(data, G): kwargs["taillabel"] = k G.add_edge(name, v, **edge_style, **kwargs) else: - raise ValueError("type should be 'subgraph' or 'module'") + raise ValueError("type should be 'pipeline' or 'module'") g = pgv.AGraph(name="popmon-pipeline", directed=True) g.node_attr.update(**dataset_style) process(data, g) - g.layout("dot") g.draw(output_file) @@ -118,14 +181,14 @@ def process(data, G): name = pipeline.__class__.__name__.lower() input_file = data_path / f"pipeline_{name}_unversioned.json" - pipeline.to_json(input_file, versioned=False) + pipeline_to_json(pipeline, input_file, versioned=False) output_file = f"pipeline_{name}_subgraphs_unversioned.pdf" generate_pipeline_visualisation(input_file, output_file, include_subgraphs=True) output_file = f"pipeline_{name}_unversioned.pdf" generate_pipeline_visualisation(input_file, output_file, include_subgraphs=False) input_file = data_path / f"pipeline_{name}_versioned.json" - pipeline.to_json(input_file, versioned=True) + 
pipeline_to_json(pipeline, input_file, versioned=True) output_file = f"pipeline_{name}_subgraphs_versioned.pdf" generate_pipeline_visualisation(input_file, output_file, include_subgraphs=True) output_file = f"pipeline_{name}_versioned.pdf" From 871d2c6bd986532be35fb4180a3215ced660fb59 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Wed, 27 Oct 2021 23:37:38 +0200 Subject: [PATCH 30/34] test: remove boilerplate code --- tests/popmon/base/test_module.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/popmon/base/test_module.py b/tests/popmon/base/test_module.py index 3c25d080..c5322b73 100644 --- a/tests/popmon/base/test_module.py +++ b/tests/popmon/base/test_module.py @@ -5,6 +5,9 @@ def test_popmon_module(): class Scaler(Module): + _input_keys = ("input_key",) + _output_keys = ("output_key",) + def __init__(self, input_key, output_key, mean, std): super().__init__() self.input_key = input_key @@ -12,16 +15,12 @@ def __init__(self, input_key, output_key, mean, std): self.mean = mean self.std = std - def transform(self, datastore): - input_array = self.get_datastore_object( - datastore, self.input_key, dtype=np.ndarray - ) + def transform(self, input_array: np.ndarray): res = input_array - np.mean(input_array) res = res / np.std(res) res = res * self.std res = res + self.mean - datastore[self.output_key] = res - return datastore + return res test_module = Scaler(input_key="x", output_key="scaled_x", mean=2.0, std=0.3) From c3bffb0e97a199d37d00f21d0233a51049389c4d Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Wed, 27 Oct 2021 23:41:58 +0200 Subject: [PATCH 31/34] refactor: module transform rather than _transform Refactor using metaclass --- popmon/base/module.py | 133 ++++++++++-------- popmon/base/pipeline.py | 5 +- popmon/io/json_reader.py | 2 +- .../popmon/alerting/test_compute_tl_bounds.py | 4 +- .../analysis/profiling/test_apply_func.py | 2 +- .../popmon/analysis/test_merge_statistics.py | 2 +- tests/popmon/io/test_file_reader.py | 2 +- tests/popmon/io/test_file_writer.py | 12 +- tests/popmon/io/test_json_reader.py | 2 +- 9 files changed, 91 insertions(+), 73 deletions(-) diff --git a/popmon/base/module.py b/popmon/base/module.py index de24b50d..a56e02dc 100644 --- a/popmon/base/module.py +++ b/popmon/base/module.py @@ -19,10 +19,73 @@ import logging -from abc import ABC +from abc import ABCMeta +from functools import wraps -class Module(ABC): +def datastore_helper(func): + """Decorator for passing and storing only the relevant keys in the datastore to + the transform() method.""" + + @wraps(func) + def _transform(self, datastore): + """Transformation helper function""" + inputs = {} + self.logger.debug(f"load from: {type(self)}") + for key in self._input_keys: + key_value = self.__dict__[key] + if key_value and len(key_value) > 0: + if isinstance(key_value, list): + inputs[key] = [datastore.get(k) for k in key_value] + else: + inputs[key] = datastore.get(key_value) + else: + inputs[key] = None + + self.logger.debug( + f"load(key={key}, key_value={key_value}, value={str(inputs[key]):.100s})" + ) + + # transformation + outputs = func(self, *list(inputs.values())) + + # transform returns None if no update needs to be made + if outputs is not None: + if len(self._output_keys) == 1: + outputs = (outputs,) + + for k, v in zip(self._output_keys, outputs): + key_value = self.__dict__[k] + self.logger.debug( + f"store(key={k}, key_value={key_value}, value={str(v):.100s})" + ) + if key_value and len(key_value) > 0: + datastore[key_value] = v + + return 
From c3bffb0e97a199d37d00f21d0233a51049389c4d Mon Sep 17 00:00:00 2001
From: Simon Brugman
Date: Wed, 27 Oct 2021 23:41:58 +0200
Subject: [PATCH 31/34] refactor: module transform rather than _transform

Refactor using metaclass
---
 popmon/base/module.py                         | 133 ++++++++++--------
 popmon/base/pipeline.py                       |   5 +-
 popmon/io/json_reader.py                      |   2 +-
 .../popmon/alerting/test_compute_tl_bounds.py |   4 +-
 .../analysis/profiling/test_apply_func.py     |   2 +-
 .../popmon/analysis/test_merge_statistics.py  |   2 +-
 tests/popmon/io/test_file_reader.py           |   2 +-
 tests/popmon/io/test_file_writer.py           |  12 +-
 tests/popmon/io/test_json_reader.py           |   2 +-
 9 files changed, 91 insertions(+), 73 deletions(-)

diff --git a/popmon/base/module.py b/popmon/base/module.py
index de24b50d..a56e02dc 100644
--- a/popmon/base/module.py
+++ b/popmon/base/module.py
@@ -19,10 +19,73 @@
 
 
 import logging
-from abc import ABC
+from abc import ABCMeta
+from functools import wraps
 
 
-class Module(ABC):
+def datastore_helper(func):
+    """Decorator for passing and storing only the relevant keys in the datastore to
+    the transform() method."""
+
+    @wraps(func)
+    def _transform(self, datastore):
+        """Transformation helper function"""
+        inputs = {}
+        self.logger.debug(f"load from: {type(self)}")
+        for key in self._input_keys:
+            key_value = self.__dict__[key]
+            if key_value and len(key_value) > 0:
+                if isinstance(key_value, list):
+                    inputs[key] = [datastore.get(k) for k in key_value]
+                else:
+                    inputs[key] = datastore.get(key_value)
+            else:
+                inputs[key] = None
+
+            self.logger.debug(
+                f"load(key={key}, key_value={key_value}, value={str(inputs[key]):.100s})"
+            )
+
+        # transformation
+        outputs = func(self, *list(inputs.values()))
+
+        # transform returns None if no update needs to be made
+        if outputs is not None:
+            if len(self._output_keys) == 1:
+                outputs = (outputs,)
+
+            for k, v in zip(self._output_keys, outputs):
+                key_value = self.__dict__[k]
+                self.logger.debug(
+                    f"store(key={k}, key_value={key_value}, value={str(v):.100s})"
+                )
+                if key_value and len(key_value) > 0:
+                    datastore[key_value] = v
+
+        return datastore
+
+    return _transform
+
+
+class ModuleMetaClass(type):
+    """Metaclass that wraps all transform() methods using the datastore_helper.
+    This obviates the need to decorate all methods in subclasses."""
+
+    def __new__(cls, name, bases, local):
+        if "transform" in local:
+            value = local["transform"]
+            if callable(value):
+                local["transform"] = datastore_helper(value)
+        return type.__new__(cls, name, bases, local)
+
+
+def combine_classes(*args):
+    """Combine multiple metaclasses"""
+    name = "".join(a.__name__ for a in args)
+    return type(name, args, {})
+
+
+class Module(metaclass=combine_classes(ABCMeta, ModuleMetaClass)):
     """Abstract base class used for modules in a pipeline."""
 
     _input_keys = None
@@ -35,23 +98,21 @@ def __init__(self):
         self.feature_begins_with = []
         self.ignore_features = []
 
+    def _get_values(self, keys):
+        """Get the class attribute values for certain keys."""
+        values = {}
+        for x in keys:
+            value = self.__dict__[x]
+            if value != "" and value is not None and value not in values:
+                values[x] = value
+        return values
+
     def get_inputs(self):
-        in_keys = {}
-        for x in self._input_keys:
-            in_key = self.__dict__[x]
-            if in_key != "" and in_key is not None and in_key not in in_keys:
-                in_keys[x] = in_key
-        return in_keys
+        return self._get_values(self._input_keys)
 
     def get_outputs(self):
-        out_keys = {}
-        for x in self._output_keys:
-            out_key = self.__dict__[x]
-            if out_key != "" and out_key is not None and out_key not in out_keys:
-                out_keys[x] = out_key
-        return out_keys
-
-    # @abstractmethod
+        return self._get_values(self._output_keys)
+
     def get_description(self):
         return ""
 
@@ -110,46 +171,6 @@ def get_features(self, all_features: list) -> list:
         features = [feature for feature in features if feature in all_features]
         return features
 
-    def _transform(self, datastore):
-        """Transformation helper function"""
-
-        inputs = {}
-        self.logger.debug(f"load from: {type(self)}")
-        for key in self._input_keys:
-            key_value = self.__dict__[key]
-            if key_value and len(key_value) > 0:
-                if isinstance(key_value, list):
-                    inputs[key] = [datastore.get(k) for k in key_value]
-                else:
-                    inputs[key] = datastore.get(key_value)
-            else:
-                inputs[key] = None
-
-            self.logger.debug(
-                f"load(key={key}, key_value={key_value}, value={str(inputs[key]):.100s})"
-            )
-
-        # cache datastore
-        self._datastore = datastore
-
-        # transformation
-        outputs = self.transform(*list(inputs.values()))
-
-        # transform returns None if no update needs to be made
-        if outputs is not None:
-            if len(self._output_keys) == 1:
-                outputs = (outputs,)
-
-            for k, v in zip(self._output_keys, outputs):
-                key_value = self.__dict__[k]
-                self.logger.debug(
-                    f"store(key={k}, key_value={key_value}, value={str(v):.100s})"
-                )
-                if key_value and len(key_value) > 0:  # and v is not None:
-                    datastore[key_value] = v
-
-        return datastore
-
     def transform(self, *args):
         """Central function of the module.
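The mechanism above, in isolation: the metaclass rewrites the class namespace at class-creation time, so any class that defines `transform()` gets the wrapper applied automatically, without decorating each subclass by hand. A self-contained sketch of the same pattern (independent of popmon):

```python
from functools import wraps


def log_calls(func):
    """Wrap a method so each call is announced."""
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        print(f"calling {type(self).__name__}.{func.__name__}")
        return func(self, *args, **kwargs)
    return wrapper


class WrapTransform(type):
    """Wrap 'transform' in every class created with this metaclass."""
    def __new__(cls, name, bases, local):
        if "transform" in local and callable(local["transform"]):
            local["transform"] = log_calls(local["transform"])
        return type.__new__(cls, name, bases, local)


class Doubler(metaclass=WrapTransform):
    def transform(self, x):
        return 2 * x


print(Doubler().transform(21))  # prints "calling Doubler.transform", then 42
```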
diff --git a/popmon/base/pipeline.py b/popmon/base/pipeline.py
index 9afef85e..bf34f253 100644
--- a/popmon/base/pipeline.py
+++ b/popmon/base/pipeline.py
@@ -66,8 +66,5 @@ def transform(self, datastore):
 
         for module in self.modules:
             self.logger.debug(f"transform {module.__class__.__name__}")
-            if isinstance(module, Pipeline):
-                datastore = module.transform(datastore)
-            else:
-                datastore = module._transform(datastore)
+            datastore = module.transform(datastore)
         return datastore
diff --git a/popmon/io/json_reader.py b/popmon/io/json_reader.py
index aaf0c492..80f5ba16 100644
--- a/popmon/io/json_reader.py
+++ b/popmon/io/json_reader.py
@@ -37,4 +37,4 @@ def __init__(self, file_path: Union[str, Path], store_key: str):
         super().__init__(store_key, file_path, apply_func=json.loads)
 
     def transform(self, *args):
-        return super().transform(*args)
+        return super().transform.__wrapped__(self, *args)
diff --git a/tests/popmon/alerting/test_compute_tl_bounds.py b/tests/popmon/alerting/test_compute_tl_bounds.py
index b2211866..9e97ded3 100644
--- a/tests/popmon/alerting/test_compute_tl_bounds.py
+++ b/tests/popmon/alerting/test_compute_tl_bounds.py
@@ -35,7 +35,7 @@ def test_compute_traffic_light_bounds():
         monitoring_rules=conf["monitoring_rules"],
     )
 
-    output = module._transform(datastore)["output_data"]
+    output = module.transform(datastore)["output_data"]
     assert "dummy_feature:mae" not in output.keys()
     assert output["the_feature:mae"] == [8, 4, 2, 2]
     assert output["the_feature:mse"] == [0.2, 0.11, 0.09, 0]
@@ -60,7 +60,7 @@ def test_compute_traffic_light_funcs():
         monitoring_rules=conf["monitoring_rules"],
    )
 
-    output = module._transform(datastore)["output_data"]
+    output = module.transform(datastore)["output_data"]
 
     assert len(output) == 3
     assert output[0]["features"] == ["dummy_feature"]
diff --git a/tests/popmon/analysis/profiling/test_apply_func.py b/tests/popmon/analysis/profiling/test_apply_func.py
index 4adff82e..8a53e87e 100644
--- a/tests/popmon/analysis/profiling/test_apply_func.py
+++ b/tests/popmon/analysis/profiling/test_apply_func.py
@@ -60,7 +60,7 @@ def func(x):
     module.add_apply_func(np.mean, entire=True)
     module.add_apply_func(func)
 
-    datastore = module._transform(datastore)
+    datastore = module.transform(datastore)
 
     p = datastore["profiled"]["asc_numbers"]
diff --git a/tests/popmon/analysis/test_merge_statistics.py b/tests/popmon/analysis/test_merge_statistics.py
index ff474311..cc7c1a54 100644
--- a/tests/popmon/analysis/test_merge_statistics.py
+++ b/tests/popmon/analysis/test_merge_statistics.py
@@ -40,7 +40,7 @@ def test_merge_statistics():
     }
     datastore = MergeStatistics(
         read_keys=["first_df", "second_df"], store_key="output_df"
-    )._transform(datastore)
+    ).transform(datastore)
 
     pd.testing.assert_frame_equal(df1.combine_first(df2), out)
     pd.testing.assert_frame_equal(datastore["output_df"]["feature_1"], out)
diff --git a/tests/popmon/io/test_file_reader.py b/tests/popmon/io/test_file_reader.py
index d953d3d2..9ad91703 100644
--- a/tests/popmon/io/test_file_reader.py
+++ b/tests/popmon/io/test_file_reader.py
@@ -10,7 +10,7 @@ def test_file_reader_json():
         store_key="example",
         apply_func=json.loads,
     )
-    datastore = fr._transform(datastore={})
+    datastore = fr.transform(datastore={})
 
     assert datastore["example"]["boolean"]
     assert len(datastore["example"]["array"]) == 3
diff --git a/tests/popmon/io/test_file_writer.py b/tests/popmon/io/test_file_writer.py
index 7471a067..c00fa308 100644
--- a/tests/popmon/io/test_file_writer.py
+++ b/tests/popmon/io/test_file_writer.py
@@ -23,26 +23,26 @@ def to_pandas(data):
 
 def test_file_writer_json():
     datastore = get_ready_ds()
-    FileWriter("my_data", apply_func=to_json)._transform(datastore)
+    FileWriter("my_data", apply_func=to_json).transform(datastore)
     assert datastore["my_data"] == to_json(DATA)
 
 
 def test_file_writer_json_with_kwargument():
     datastore = get_ready_ds()
-    FileWriter("my_data", apply_func=to_json, indent=4)._transform(datastore)
+    FileWriter("my_data", apply_func=to_json, indent=4).transform(datastore)
     assert datastore["my_data"] == to_json(DATA, indent=4)
 
 
 def test_file_writer_not_a_func():
     datastore = get_ready_ds()
     with pytest.raises(TypeError):
-        FileWriter("my_data", apply_func={})._transform(datastore)
+        FileWriter("my_data", apply_func={}).transform(datastore)
 
 
 def test_file_writer_df():
     datastore = get_ready_ds()
-    FileWriter(
-        "my_data", store_key="transformed_data", apply_func=to_pandas
-    )._transform(datastore)
+    FileWriter("my_data", store_key="transformed_data", apply_func=to_pandas).transform(
+        datastore
+    )
     assert datastore["my_data"] == DATA
     assert datastore["transformed_data"].to_dict() == to_pandas(DATA).to_dict()
diff --git a/tests/popmon/io/test_json_reader.py b/tests/popmon/io/test_json_reader.py
index d47e155b..4a46651b 100644
--- a/tests/popmon/io/test_json_reader.py
+++ b/tests/popmon/io/test_json_reader.py
@@ -4,7 +4,7 @@
 
 def test_json_reader():
     jr = JsonReader(file_path=resources.data("example.json"), store_key="example")
-    datastore = jr._transform(datastore={})
+    datastore = jr.transform(datastore={})
 
     assert datastore["example"]["boolean"]
     assert len(datastore["example"]["array"]) == 3
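One subtlety used by `JsonReader.transform` above: because `datastore_helper` applies `functools.wraps`, the undecorated function stays reachable through the `__wrapped__` attribute, which is how the parent implementation is called without running the datastore plumbing twice. The mechanism in isolation:

```python
from functools import wraps


def add_plumbing(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        print("plumbing runs")
        return func(*args, **kwargs)
    return wrapper


@add_plumbing
def greet(name):
    return f"hello {name}"


print(greet("popmon"))              # "plumbing runs", then "hello popmon"
print(greet.__wrapped__("popmon"))  # bypasses the wrapper entirely
```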
From bbccba83ccf187c674236255935ac43a71a0b396 Mon Sep 17 00:00:00 2001
From: Simon Brugman
Date: Wed, 27 Oct 2021 23:45:43 +0200
Subject: [PATCH 32/34] chore: clean up

---
 popmon/pipeline/metrics_pipelines.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/popmon/pipeline/metrics_pipelines.py b/popmon/pipeline/metrics_pipelines.py
index 170695af..8c1f26ba 100644
--- a/popmon/pipeline/metrics_pipelines.py
+++ b/popmon/pipeline/metrics_pipelines.py
@@ -70,13 +70,11 @@ def create_metrics_pipeline(
     reference_type="self",
     reference=None,
     hists_key="hists",
-    # ref_hists_key="ref_hists",
     time_axis="",
     window=10,
     monitoring_rules={},
     pull_rules={},
     features=None,
-    # shift=1,
     **kwargs,
 ):
     # configuration and datastore for report pipeline
@@ -87,8 +85,6 @@ def create_metrics_pipeline(
         "monitoring_rules": monitoring_rules,
         "pull_rules": pull_rules,
         "features": features,
-        # "ref_hists_key": ref_hists_key,
-        # "shift": shift,
         **kwargs,
     }
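With the dead parameters gone, the remaining signature is easier to read. A usage sketch under stated assumptions: that `create_metrics_pipeline` returns a regular popmon `Pipeline` (as its name and `Pipeline.transform` above suggest), that `hists` holds precomputed histograms, and that the wildcard rule values are merely illustrative:

```python
from popmon.pipeline.metrics_pipelines import create_metrics_pipeline

# build a self-reference metrics pipeline; the datastore is a plain dict
pipeline = create_metrics_pipeline(
    reference_type="self",
    hists_key="hists",
    time_axis="date",
    window=10,
    monitoring_rules={"*_pull": [7, 4, -4, -7]},  # illustrative traffic-light bounds
    pull_rules={"*_pull": [7, 4, -4, -7]},
)
datastore = pipeline.transform({"hists": hists})  # 'hists' assumed precomputed
```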
From 4ee8c5208b2e8970784b1a9226aaa8be8339829a Mon Sep 17 00:00:00 2001
From: Simon Brugman
Date: Wed, 24 Nov 2021 15:53:21 +0100
Subject: [PATCH 33/34] lint: try-except best practices

---
 .pre-commit-config.yaml                    | 3 ++-
 popmon/analysis/profiling/hist_profiler.py | 4 ++--
 popmon/base/module.py                      | 4 ++--
 popmon/visualization/utils.py              | 8 +++++---
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 60d700ff..e8589e1d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,7 +15,8 @@ repos:
   - id: flake8
     additional_dependencies:
     - flake8-comprehensions
+    - tryceratops
     args: [ "--select=E9,F63,F7,F82,C4,F401,TR004,TC200,TC201,TC202"]
 - repo: https://github.com/asottile/pyupgrade
   rev: v2.29.1
diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py
index 3f1f762e..7bce2396 100644
--- a/popmon/analysis/profiling/hist_profiler.py
+++ b/popmon/analysis/profiling/hist_profiler.py
@@ -162,8 +162,8 @@ def _profile_2d_histogram(self, name, hist):
         return []
     try:
         grid = get_2dgrid(hist)
-    except Exception as e:
-        raise e
+    except Exception:
+        raise
 
     # calc some basic 2d-histogram statistics
     sume = int(sum_entries(hist))
diff --git a/popmon/base/module.py b/popmon/base/module.py
index a56e02dc..5bb49e62 100644
--- a/popmon/base/module.py
+++ b/popmon/base/module.py
@@ -140,8 +140,8 @@ def get_datastore_object(datastore, feature, dtype, default=None):
         else:
             try:
                 obj = datastore[feature]
-            except KeyError:
-                raise ValueError(f"`{feature}` not found in the datastore!")
+            except KeyError as e:
+                raise ValueError(f"`{feature}` not found in the datastore!") from e
 
         if not isinstance(obj, dtype):
             raise TypeError(f"obj `{feature}` is not an instance of `{dtype}`!")
diff --git a/popmon/visualization/utils.py b/popmon/visualization/utils.py
index 9cf3e7e4..924014c7 100644
--- a/popmon/visualization/utils.py
+++ b/popmon/visualization/utils.py
@@ -147,7 +147,7 @@ def plot_bars_b64(data, labels=None, bounds=None, ylim=False, skip_empty=True):
             if y_max > y_min:
                 ax.set_ylim(y_min, y_max)
         except Exception:
-            pass
+            logger.debug("unable to plot boundaries")
 
     ax.grid(True, linestyle=":")
 
@@ -368,8 +368,10 @@ def plot_overlay_1d_histogram_b64(
     try:
         hist_values = hist[0]
         hist_bins = hist[1]
-    except BaseException:
-        raise ValueError("Cannot extract binning and values from input histogram")
+    except BaseException as e:
+        raise ValueError(
+            "Cannot extract binning and values from input histogram"
+        ) from e
 
     assert hist_values is not None and len(
         hist_values
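The tryceratops rules applied above come down to three idioms: re-raise with a bare `raise` rather than `raise e`, chain translated exceptions with `from`, and log instead of silently passing. A minimal sketch (the `lookup` example is illustrative):

```python
import logging

logger = logging.getLogger(__name__)
lookup = {"a": 1}


def fetch(key):
    try:
        return lookup[key]
    except KeyError as e:
        # chain the original exception so the traceback keeps the root cause
        raise ValueError(f"`{key}` not found") from e


def fetch_or_none(key):
    try:
        return lookup[key]
    except KeyError:
        # prefer a log message over a silent `pass`
        logger.debug("lookup failed for %s", key)
        return None
```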
"markdown", "metadata": {}, "source": [ - "# Plotting the individual histograms\n", + "# Accessing the datastore\n", + "When you need programmtic access to popmon's results, then you can access the datastore directly.\n", + "For instanfce, you would like the exact maximum value of a histogram." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotting the individual histograms\n", "Sometimes, when you're diving into alerts from the report, you may want to plot some individual histograms. \n", "Fortunately, you can! Let's first have a look at how these histograms are stored." ] @@ -276,7 +291,24 @@ "source": [ "report = df.pm_stability_report(\n", " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\"\n", - ")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list(report.datastore.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "split_hists = report.datastore[\"split_hists\"][\"DEPARTURE_DELAY\"]\n", "split_hists" ] @@ -295,7 +327,7 @@ "outputs": [], "source": [ "split_hist = split_hists.query(\"date == '2015-07-05 12:00:00'\")\n", - "split_hist.histogram[0].plot.matplotlib()" + "split_hist.histogram[0].plot.matplotlib();" ] }, { @@ -311,7 +343,23 @@ "metadata": {}, "outputs": [], "source": [ - "split_hist.histogram_ref[0].plot.matplotlib()" + "split_hist.histogram_ref[0].plot.matplotlib();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Integrations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Access to the datastore means that its possible to integrate popmon in almost any workflow. To give an example, one could store the histogram data in a [PostgreSQL](https://www.psycopg.org/docs/) database and load that from [Grafana](https://github.com/grafana/grafana) and benefit from their visualisation and alert handling features (e.g. send an email or slack message upon alert) [[#158]](https://github.com/ing-bank/popmon/issues/158). Similar flows are possible when popmon is integrated in a workflow scheduler framework, such as [Airflow](https://airflow.apache.org/). \n", + "\n", + "If you have set up such a workflow, please consider contributing this as a feature. In order to do so, [open an issue](https://github.com/ing-bank/popmon) in the repository." ] }, { @@ -361,7 +409,7 @@ " report_filepath=None,\n", " store_key=\"html_report\",\n", " sections_key=\"report_sections\",\n", - ")\n" + ")" ] }, { @@ -494,7 +542,7 @@ "name": "python3" }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/popmon/notebooks/popmon_tutorial_basic.ipynb b/popmon/notebooks/popmon_tutorial_basic.ipynb index 13ac619c..3c086ea0 100644 --- a/popmon/notebooks/popmon_tutorial_basic.ipynb +++ b/popmon/notebooks/popmon_tutorial_basic.ipynb @@ -1,9 +1,24 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `popmon` introductory notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook contains examples of how to generate `popmon` reports from a pandas DataFrame." 
diff --git a/popmon/notebooks/popmon_tutorial_basic.ipynb b/popmon/notebooks/popmon_tutorial_basic.ipynb
index 13ac619c..3c086ea0 100644
--- a/popmon/notebooks/popmon_tutorial_basic.ipynb
+++ b/popmon/notebooks/popmon_tutorial_basic.ipynb
@@ -1,9 +1,24 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# `popmon` introductory notebook"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook contains examples of how to generate `popmon` reports from a pandas DataFrame."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "collapsed": false,
    "jupyter": {
      "outputs_hidden": false
     },
@@ -26,7 +41,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Reporting given a histograms object (dict)"
+    "## Set up `popmon` and load our dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install popmon (if not installed yet) in the current environment."
    ]
   },
@@ -35,11 +57,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%%capture\n",
-    "# install popmon (if not installed yet)\n",
     "import sys\n",
-    "\n",
-    "!\"{sys.executable}\" -m pip install popmon"
+    "!\"{sys.executable}\" -m pip install -q popmon"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import pandas and popmon, load an example dataset provided by popmon, and show the first few results."
    ]
   },
@@ -71,40 +97,48 @@
     "df.head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Reporting given a pandas.DataFrame"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "# first we generate histograms,\n",
-    "# but we could load pre-generated histograms from a pickle or json file as well.\n",
-    "hists = df.pm_make_histograms(\n",
+    "report = df.pm_stability_report(\n",
+    "    # Use the 'date' column as our time axis\n",
     "    time_axis=\"date\",\n",
+    "    # Create batches for every two weeks of data\n",
     "    time_width=\"2w\",\n",
-    "    features=[\"date:age\", \"date:gender\", \"date:isActive\"],\n",
+    "    # Select a subset of features\n",
+    "    features=[\"date:age\", \"date:isActive\", \"date:eyeColor\"],\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
-    "print(hists.keys())"
+    "report"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# generate report based on histograms\n",
-    "report = popmon.stability_report(hists)"
+    "### Regenerate the report\n",
+    "Using the `regenerate` method, you can change the report parameters without having to rerun the computational part of the pipeline. For example, a short (limited) report is generated when the `extended_report` flag is set to `False`. If a user wants to configure which statistics to show, the `show_stats` argument has to be set accordingly.\n",
+    "\n",
+    "Another option is to change the `plot_hist_n` parameter to control the number of histograms being displayed per feature."
    ]
   },
   {
@@ -115,16 +149,22 @@
    },
    "outputs": [],
    "source": [
-    "report # or report_.to_notebook_iframe()"
+    "report.regenerate(extended_report=False, plot_hist_n=3)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Regenerate the report\n",
-    "A short (limited) report will be generated since `extended_report` flag is set to `False`. \n",
-    "If a user wants to configure which statistics she/he wants to see, `show_stats` argument has to be set accordingly."
+    "## Reporting given histograms"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If the user would like to generate the report directly from histograms, popmon also supports that.\n",
+    "First, we generate histograms (but we could load pre-generated histograms from a pickle or json file as well)."
    ]
   },
@@ -135,14 +175,27 @@
    },
    "outputs": [],
    "source": [
-    "report.regenerate(extended_report=False)"
+    "hists = df.pm_make_histograms(\n",
+    "    time_axis=\"date\",\n",
+    "    time_width=\"2w\",\n",
+    "    features=[\"date:age\", \"date:gender\", \"date:isActive\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list(hists.keys())"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Reporting given a pandas.DataFrame"
+    "And then generate the report based on histograms:"
    ]
   },
   {
@@ -151,11 +204,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "report_ = df.pm_stability_report(\n",
-    "    time_axis=\"date\",\n",
-    "    time_width=\"2w\",\n",
-    "    features=[\"date:age\", \"date:isActive\", \"date:eyeColor\"],\n",
-    ")"
+    "report = popmon.stability_report(hists)"
    ]
   },
   {
@@ -166,13 +215,13 @@
    },
    "outputs": [],
    "source": [
-    "report_"
+    "report"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -186,7 +235,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.7"
+   "version": "3.8.8"
   },
   "pycharm": {
    "stem_cell": {
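The two entry points shown in the basic tutorial produce the same kind of report: `pm_stability_report` builds the histograms and the report in one call, while `pm_make_histograms` plus `popmon.stability_report` separates the expensive histogramming from the cheap, repeatable report generation. Schematically (assuming `df` as loaded in the tutorial):

```python
import popmon  # registers the pm_* DataFrame accessors

# one-step: histogramming + report in a single call
report = df.pm_stability_report(time_axis="date", time_width="2w")

# two-step: reuse the histograms for several differently-configured reports
hists = df.pm_make_histograms(time_axis="date", time_width="2w")
report_full = popmon.stability_report(hists)
report_full.regenerate(extended_report=False, plot_hist_n=3)  # re-render, no recompute
```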
diff --git a/popmon/notebooks/popmon_tutorial_incremental_data.ipynb b/popmon/notebooks/popmon_tutorial_incremental_data.ipynb
index 719571ff..ed30289e 100644
--- a/popmon/notebooks/popmon_tutorial_incremental_data.ipynb
+++ b/popmon/notebooks/popmon_tutorial_incremental_data.ipynb
@@ -4,15 +4,16 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# This notebook shows how to generate reports on incremental datasets\n",
+    "# Working with incremental data\n",
+    "\n",
+    "This notebook shows how to generate reports on incremental datasets.\n",
     "\n",
     "The incremental data will either have a proper time-axis, or will be batches of data without \n",
     "a specific time-axis. \n",
     "\n",
     "The histograms of these datasets will be stitched together, and we generate a (consistent) report on the stitched dataset.\n",
     "\n",
-    "Note that we always generate the report on the full stitched histograms, b/c algorithms like trend detection\n",
-    "and comparison with reference histograms rely on having the historical histograms in place."
+    "Note that we always generate the report on the full stitched histograms, because algorithms like trend detection and comparison with reference histograms rely on having the historical histograms in place."
    ]
   },
   {
@@ -22,17 +23,21 @@
     "## Reporting given a histograms object (dict)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install popmon (if not installed yet)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "%%capture\n",
-    "# install popmon (if not installed yet)\n",
     "import sys\n",
-    "\n",
-    "!\"{sys.executable}\" -m pip install popmon"
+    "!\"{sys.executable}\" -m pip install -q popmon"
    ]
   },
   {
@@ -57,27 +62,26 @@
     "df = pd.read_csv(resources.data(\"test.csv.gz\"), parse_dates=[\"date\"])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Add a month column, so we can make data batches per month."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# add month and week, so we can make data batches per month and week\n",
     "def to_month(x):\n",
     "    date = pd.to_datetime(x)\n",
     "    return str(12 * date.year + date.month)\n",
     "\n",
     "\n",
-    "def to_week(x):\n",
-    "    date = pd.to_datetime(x)\n",
-    "    return 52 * date.year + date.week\n",
-    "\n",
-    "\n",
     "df[\"month\"] = df[\"date\"].apply(to_month)\n",
-    "df[\"week\"] = df[\"date\"].apply(to_week)\n",
-    "months = df.month.unique()\n",
-    "weeks = df.week.unique().tolist()"
+    "months = df.month.unique()"
    ]
   },
   {
@@ -202,7 +206,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### Adding to existing histograms"
+    "### Adding to existing histograms"
    ]
   },
   {
@@ -277,6 +281,21 @@
     "bin_specs = popmon.get_bin_specs(hists, skip_first_axis=True)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def to_week(x):\n",
+    "    date = pd.to_datetime(x)\n",
+    "    return 52 * date.year + date.week\n",
+    "\n",
+    "\n",
+    "df[\"week\"] = df[\"date\"].apply(to_week)\n",
+    "weeks = df.week.unique().tolist()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -340,7 +359,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### Adding to an existing stitched histograms"
+    "### Adding to existing stitched histograms"
    ]
   },
   {
@@ -403,13 +422,6 @@
     ")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -441,7 +453,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -455,7 +467,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.7"
+   "version": "3.8.8"
   },
   "pycharm": {
    "stem_cell": {
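The refreshed incremental tutorial boils down to a histogram-stitch-report loop. A sketch under stated assumptions: `batches` is a hypothetical iterable of pandas DataFrames, and `popmon.stitch_histograms` accepts `hists_basis`/`hists_delta`/`mode` parameters as used in this notebook; check the current popmon API for the exact signature:

```python
import popmon  # registers the pm_* DataFrame accessors

stitched = None
for batch in batches:  # 'batches' assumed: an iterable of pandas DataFrames
    # histogram only the new batch, on the same features and binning
    hists_delta = batch.pm_make_histograms(time_axis="date", time_width="1w")
    stitched = (
        hists_delta
        if stitched is None
        else popmon.stitch_histograms(
            hists_basis=stitched, hists_delta=hists_delta, mode="add"
        )
    )

# one consistent report over the full history, so trend detection and
# reference comparisons see all historical time slots
report = popmon.stability_report(stitched)
```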