Merge pull request #177 from ing-bank/develop
v0.5.0
sbrugman authored Nov 24, 2021
2 parents 729d61a + 92eee2b commit 70917d2
Showing 52 changed files with 2,027 additions and 1,359 deletions.
60 changes: 60 additions & 0 deletions .github/workflows/codeql-analysis.yml
@@ -0,0 +1,60 @@
name: "CodeQL"

on:
  push:
    branches: [ master, develop ]
  pull_request:
    # The branches below must be a subset of the branches above
    branches: [ master ]
  schedule:
    - cron: '22 11 * * 2'
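    # i.e. at 11:22 UTC every Tuesday (cron fields: minute, hour, day-of-month, month, day-of-week)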

jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest
    permissions:
      actions: read
      contents: read
      security-events: write

    strategy:
      fail-fast: false
      matrix:
        language: [ 'python' ]
        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
        # Learn more:
        # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      # Initializes the CodeQL tools for scanning.
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v1
        with:
          languages: ${{ matrix.language }}
          # If you wish to specify custom queries, you can do so here or in a config file.
          # By default, queries listed here will override any specified in a config file.
          # Prefix the list here with "+" to use these queries and those in the config file.
          # queries: ./path/to/local/query, your-org/your-repo/queries@main

      # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
      # If this step fails, then you should remove it and run the build manually (see below)
      - name: Autobuild
        uses: github/codeql-action/autobuild@v1

      # ℹ️ Command-line programs to run using the OS shell.
      # 📚 https://git.io/JvXDl

      # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
      # and modify them (or add more) to build your code if your project
      # uses a compiled language

      #- run: |
      #    make bootstrap
      #    make release

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v1
13 changes: 9 additions & 4 deletions .pre-commit-config.yaml
@@ -1,10 +1,10 @@
 repos:
   - repo: https://github.com/psf/black
-    rev: 21.9b0
+    rev: 21.11b1
     hooks:
       - id: black
   - repo: https://github.com/pycqa/isort
-    rev: 5.9.3
+    rev: 5.10.1
     hooks:
       - id: isort
         files: '.*'
@@ -15,9 +15,14 @@ repos:
       - id: flake8
         additional_dependencies:
           - flake8-comprehensions
-        args: [ "--select=E9,F63,F7,F82,C4"]
+          - tryceratops
+        args: [ "--select=E9,F63,F7,F82,C4,F401,TR004,TC200,TC201,TC202"]
   - repo: https://github.com/asottile/pyupgrade
-    rev: v2.29.0
+    rev: v2.29.1
     hooks:
       - id: pyupgrade
         args: ['--py36-plus','--exit-zero-even-if-changed']
+  - repo: https://github.com/asottile/blacken-docs
+    rev: v1.12.0
+    hooks:
+      - id: blacken-docs
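For reference: in the new `--select` list, E9/F7 catch syntax-level errors, F63 suspicious comparisons, F82 undefined names, C4 rewritable comprehensions (from flake8-comprehensions), and F401 unused imports; the TR004/TC2xx codes come from the newly added tryceratops dependency. The hooks can be exercised locally with `pre-commit run --all-files` after a one-time `pre-commit install`.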
3 changes: 2 additions & 1 deletion .readthedocs.yml
@@ -7,4 +7,5 @@ build:
 python:
   version: 3.8
-  setup_py_install: true

+install:
+  - requirements: docs/requirements.txt
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -66,4 +66,4 @@

## v0.4.0 and before

-The release notes for preceding versions are available `here <https://github.com/ing-bank/popmon/blob/master/CHANGES.rst>`_
+The release notes for preceding versions are available [here](https://github.com/ing-bank/popmon/blob/master/CHANGES.rst).
52 changes: 39 additions & 13 deletions README.rst
@@ -29,7 +29,10 @@ With Spark 3.0, based on Scala 2.12, make sure to pick up the correct `histogrammar` jar files:

.. code-block:: python
-    spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20").getOrCreate()
+    spark = SparkSession.builder.config(
+        "spark.jars.packages",
+        "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20",
+    ).getOrCreate()
For Spark 2.x, compiled against Scala 2.11, simply replace 2.12 with 2.11 in the string above.
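Spelled out, that substitution gives (an illustrative sketch; only the Scala suffix changes, per the sentence above):

.. code-block:: python

    spark = SparkSession.builder.config(
        "spark.jars.packages",
        "io.github.histogrammar:histogrammar_2.11:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.11:1.0.20",
    ).getOrCreate()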

@@ -101,12 +104,12 @@ As a quick example, you can do:
from popmon import resources
# open synthetic data
-df = pd.read_csv(resources.data('test.csv.gz'), parse_dates=['date'])
+df = pd.read_csv(resources.data("test.csv.gz"), parse_dates=["date"])
df.head()
# generate stability report using automatic binning of all encountered features
# (importing popmon automatically adds this functionality to a dataframe)
-report = df.pm_stability_report(time_axis='date', features=['date:age', 'date:gender'])
+report = df.pm_stability_report(time_axis="date", features=["date:age", "date:gender"])
# to show the output of the report in a Jupyter notebook you can simply run:
report
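To write the report to disk instead, a minimal sketch assuming popmon's ``to_file`` helper (not shown in this diff):

.. code-block:: python

    report.to_file("monitoring_report.html")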
@@ -119,23 +122,32 @@ To specify your own binning specifications and features you want to report on, you can do:
.. code-block:: python
# time-axis specifications alone; all other features are auto-binned.
-report = df.pm_stability_report(time_axis='date', time_width='1w', time_offset='2020-1-6')
+report = df.pm_stability_report(
+    time_axis="date", time_width="1w", time_offset="2020-1-6"
+)
# histogram selections. Here 'date' is the first axis of each histogram.
-features=[
-    'date:isActive', 'date:age', 'date:eyeColor', 'date:gender',
-    'date:latitude', 'date:longitude', 'date:isActive:age'
+features = [
+    "date:isActive",
+    "date:age",
+    "date:eyeColor",
+    "date:gender",
+    "date:latitude",
+    "date:longitude",
+    "date:isActive:age",
 ]
# Specify your own binning specifications for individual features or combinations thereof.
# This bin specification uses open-ended ("sparse") histograms; unspecified features get
# auto-binned. The time-axis binning, when specified here, needs to be in nanoseconds.
-bin_specs={
-    'longitude': {'bin_width': 5.0, 'bin_offset': 0.0},
-    'latitude': {'bin_width': 5.0, 'bin_offset': 0.0},
-    'age': {'bin_width': 10.0, 'bin_offset': 0.0},
-    'date': {'bin_width': pd.Timedelta('4w').value,
-             'bin_offset': pd.Timestamp('2015-1-1').value}
+bin_specs = {
+    "longitude": {"bin_width": 5.0, "bin_offset": 0.0},
+    "latitude": {"bin_width": 5.0, "bin_offset": 0.0},
+    "age": {"bin_width": 10.0, "bin_offset": 0.0},
+    "date": {
+        "bin_width": pd.Timedelta("4w").value,
+        "bin_offset": pd.Timestamp("2015-1-1").value,
+    },
 }
# generate stability report
@@ -145,6 +157,17 @@ These examples also work with spark dataframes.
You can see the output of such example notebook code `here <https://crclz.com/popmon/reports/test_data_report.html>`_.
For all available examples, please see the `tutorials <https://popmon.readthedocs.io/en/latest/tutorials.html>`_ at read-the-docs.

Pipelines for monitoring dataset shift
======================================
Advanced users can leverage popmon's modular data pipeline to customize their workflow.
Visualization of the pipeline can be useful when debugging, or for didactic purposes.
There is a `script <https://github.com/ing-bank/popmon/tree/master/tools/>`_ included with the package that you can use.
The plotting is configurable; depending on the options, the resulting diagram can be used to understand the data flow, the high-level components, and the (re)use of datasets.

|pipeline|

*Example pipeline visualization (click to enlarge)*

Resources
=========

@@ -202,6 +225,9 @@ Copyright ING WBAA. `popmon` is completely free, open-source and licensed under the MIT license.
     :target: https://github.com/ing-bank/popmon
 .. |example| image:: https://raw.githubusercontent.com/ing-bank/popmon/master/docs/source/assets/traffic_light_overview.png
     :alt: Traffic Light Overview
+.. |pipeline| image:: https://raw.githubusercontent.com/ing-bank/popmon/master/docs/source/assets/pipeline.png
+    :alt: Pipeline Visualization
+    :target: https://github.com/ing-bank/popmon/files/7417124/pipeline_amazingpipeline_subgraphs_unversioned.pdf
 .. |build| image:: https://github.com/ing-bank/popmon/workflows/build/badge.svg
     :alt: Build status
 .. |docs| image:: https://readthedocs.org/projects/popmon/badge/?version=latest
4 changes: 2 additions & 2 deletions bump.py
@@ -2,8 +2,8 @@
from pathlib import Path

MAJOR = 0
-REVISION = 4
-PATCH = 4
+REVISION = 5
+PATCH = 0
VERSION = f"{MAJOR}.{REVISION}.{PATCH}"


2 changes: 2 additions & 0 deletions docs/requirements.txt
@@ -0,0 +1,2 @@
sphinx_rtd_theme
myst_parser
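(These are the Sphinx theme and Markdown-parser packages that the updated `.readthedocs.yml` above now installs for the documentation build.)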
Binary file added docs/source/assets/pipeline.png
64 changes: 42 additions & 22 deletions docs/source/configuration.rst
@@ -55,7 +55,9 @@ To specify the time-axis binning alone, do:

.. code-block:: python
-report = df.pm_stability_report(time_axis='date', time_width='1w', time_offset='2020-1-6')
+report = df.pm_stability_report(
+    time_axis="date", time_width="1w", time_offset="2020-1-6"
+)
The ``time_axis`` argument should be the name of a column that is of type **numeric (e.g. batch id, time in ns) or date(time)**.
The default time width is 30 days ('30d'), with time offset 2010-1-4 (a Monday).
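Spelled out, these defaults are equivalent to passing the binning arguments explicitly (an illustrative sketch; values taken from the sentence above):

.. code-block:: python

    report = df.pm_stability_report(
        time_axis="date", time_width="30d", time_offset="2010-1-4"
    )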
@@ -72,11 +74,15 @@ An example bin_specs dictionary is:

.. code-block:: python
-bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0},
-             'y': {'num': 10, 'low': 0.0, 'high': 2.0},
-             'x:y': [{}, {'num': 5, 'low': 0.0, 'high': 1.0}],
-             'date': {'bin_width': pd.Timedelta('4w').value,
-                      'bin_offset': pd.Timestamp('2015-1-1').value}}
+bin_specs = {
+    "x": {"bin_width": 1, "bin_offset": 0},
+    "y": {"num": 10, "low": 0.0, "high": 2.0},
+    "x:y": [{}, {"num": 5, "low": 0.0, "high": 1.0}],
+    "date": {
+        "bin_width": pd.Timedelta("4w").value,
+        "bin_offset": pd.Timestamp("2015-1-1").value,
+    },
+}
In the bin specs for 'x:y', 'x' is not provided (here) and reverts to the 1-dim setting.
Any time-axis, when specified here ('date'), needs to be specified in nanoseconds. This takes precedence over the ``time_width`` and ``time_offset`` arguments.
@@ -112,9 +118,11 @@ When not provided, the default setting is:

.. code-block:: python
-monitoring_rules = {"*_pull": [7, 4, -4, -7],
-                    "*_zscore": [7, 4, -4, -7],
-                    "[!p]*_unknown_labels": [0.5, 0.5, 0, 0]}
+monitoring_rules = {
+    "*_pull": [7, 4, -4, -7],
+    "*_zscore": [7, 4, -4, -7],
+    "[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
+}
Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". Same for ``"*_zscore"``.
@@ -132,11 +140,13 @@ feature name in front. This also works for combinations of two features. E.g.

.. code-block:: python
-monitoring_rules = {"featureA:*_pull": [5, 3, -3, -5],
-                    "featureA:featureB:*_pull": [6, 3, -3, -6],
-                    "featureA:nan": [4, 1, 0, 0],
-                    "*_pull": [7, 4, -4, -7],
-                    "nan": [8, 1, 0, 0]}
+monitoring_rules = {
+    "featureA:*_pull": [5, 3, -3, -5],
+    "featureA:featureB:*_pull": [6, 3, -3, -6],
+    "featureA:nan": [4, 1, 0, 0],
+    "*_pull": [7, 4, -4, -7],
+    "nan": [8, 1, 0, 0],
+}
In the case where multiple rules could apply for a feature's statistic, the most specific one gets applied.
So in the case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule is used for all other features.
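Once assembled, the rules dictionary is handed to the report generation, a sketch assuming the standard ``monitoring_rules`` keyword of ``pm_stability_report``:

.. code-block:: python

    report = df.pm_stability_report(monitoring_rules=monitoring_rules)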
@@ -204,13 +214,16 @@ Spark usage
from pyspark.sql import SparkSession
# downloads histogrammar jar files if not already installed, used for histogramming of spark dataframe
-spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20").getOrCreate()
+spark = SparkSession.builder.config(
+    "spark.jars.packages",
+    "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20",
+).getOrCreate()
# load a dataframe
-spark_df = spark.read.format('csv').options(header='true').load('file.csv')
+spark_df = spark.read.format("csv").options(header="true").load("file.csv")
# generate the report
-report = spark_df.pm_stability_report(time_axis='timestamp')
+report = spark_df.pm_stability_report(time_axis="timestamp")
Spark example on Google Colab
@@ -231,16 +244,23 @@ Now that Spark is installed, restart the runtime.
.. code-block:: python
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7"
import findspark
findspark.init()
from pyspark.sql import SparkSession
-spark = SparkSession.builder.master("local[*]") \
-    .config("spark.jars", "/content/jars/histogrammar_2.12-1.0.20.jar,/content/jars/histogrammar-sparksql_2.12-1.0.20.jar") \
-    .config("spark.sql.execution.arrow.enabled", "false") \
-    .config("spark.sql.session.timeZone", "GMT") \
-    .getOrCreate()
+spark = (
+    SparkSession.builder.master("local[*]")
+    .config(
+        "spark.jars",
+        "/content/jars/histogrammar_2.12-1.0.20.jar,/content/jars/histogrammar-sparksql_2.12-1.0.20.jar",
+    )
+    .config("spark.sql.execution.arrow.enabled", "false")
+    .config("spark.sql.session.timeZone", "GMT")
+    .getOrCreate()
+)
2 changes: 1 addition & 1 deletion examples/flight_delays.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd

-import popmon
+import popmon  # noqa
from popmon import resources

# open synthetic data
2 changes: 1 addition & 1 deletion examples/synthetic_data.py
@@ -1,6 +1,6 @@
import pandas as pd

-import popmon
+import popmon  # noqa
from popmon import resources

# open synthetic data
13 changes: 13 additions & 0 deletions popmon/__init__.py
@@ -32,3 +32,16 @@
from .pipeline.report import df_stability_report, stability_report
from .stitching import stitch_histograms
from .version import version as __version__

+__all__ = [
+    "get_bin_specs",
+    "get_time_axes",
+    "make_histograms",
+    "decorators",
+    "df_stability_metrics",
+    "stability_metrics",
+    "df_stability_report",
+    "stability_report",
+    "stitch_histograms",
+    "__version__",
+]
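(Defining `__all__` makes these re-exports the package's explicit public interface; it also keeps the newly enabled flake8 `F401` unused-import check satisfied, the same reason the example scripts above gained `# noqa` markers.)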