diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 744284849..5fa558e9f 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,10 +1,10 @@ [bumpversion] -current_version = 1.2.0a1 +current_version = 1.4.0a1 parse = (?P\d+) \.(?P\d+) \.(?P\d+) ((?Pa|b|rc)(?P\d+))? -serialize = +serialize = {major}.{minor}.{patch}{prerelease}{num} {major}.{minor}.{patch} commit = False @@ -13,7 +13,7 @@ tag = False [bumpversion:part:prerelease] first_value = a optional_value = final -values = +values = a b rc @@ -25,4 +25,3 @@ first_value = 1 [bumpversion:file:setup.py] [bumpversion:file:dbt/adapters/spark/__version__.py] - diff --git a/.changes/0.0.0.md b/.changes/0.0.0.md new file mode 100644 index 000000000..14c2cf9e7 --- /dev/null +++ b/.changes/0.0.0.md @@ -0,0 +1,8 @@ +## Previous Releases +For information on prior major and minor releases, see their changelogs: +- [1.2](https://github.com/dbt-labs/dbt-spark/blob/1.2.latest/CHANGELOG.md) +- [1.1](https://github.com/dbt-labs/dbt-spark/blob/1.1.latest/CHANGELOG.md) +- [1.0](https://github.com/dbt-labs/dbt-spark/blob/1.0.latest/CHANGELOG.md) +- [0.21](https://github.com/dbt-labs/dbt-spark/blob/0.21.latest/CHANGELOG.md) +- [0.20](https://github.com/dbt-labs/dbt-spark/blob/0.20.latest/CHANGELOG.md) +- [0.19 and earlier](https://github.com/dbt-labs/dbt-spark/blob/0.19.latest/CHANGELOG.md) diff --git a/.changes/README.md b/.changes/README.md new file mode 100644 index 000000000..dc6106dfe --- /dev/null +++ b/.changes/README.md @@ -0,0 +1,3 @@ +# CHANGELOG + +To view information about the changelog operation we suggest reading this [README](https://github.com/dbt-labs/dbt-spark/blob/main/.changes/README.md) found in `dbt-spark`. diff --git a/.changes/header.tpl.md b/.changes/header.tpl.md new file mode 100644 index 000000000..251ea5d51 --- /dev/null +++ b/.changes/header.tpl.md @@ -0,0 +1,6 @@ +# dbt-spark Changelog + +- This file provides a full account of all changes to `dbt-spark`. +- Changes are listed under the (pre)release in which they first appear. Subsequent releases include changes from previous releases. +- "Breaking changes" listed under a version may require action from end users or external maintainers when upgrading to that version. +- Do not edit this file directly. This file is auto-generated using [changie](https://github.com/miniscruff/changie). 
For details on how to document a change, see [the contributing guide](https://github.com/dbt-labs/dbt-spark/blob/main/CONTRIBUTING.md#adding-changelog-entry) diff --git a/.changes/unreleased/.gitkeep b/.changes/unreleased/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.changes/unreleased/Breaking Changes-20221031-201109.yaml b/.changes/unreleased/Breaking Changes-20221031-201109.yaml new file mode 100644 index 000000000..cc4567f24 --- /dev/null +++ b/.changes/unreleased/Breaking Changes-20221031-201109.yaml @@ -0,0 +1,7 @@ +kind: Breaking Changes +body: Add schema to the default location root +time: 2022-10-31T20:11:09.291461+01:00 +custom: + Author: dan1elt0m JCZuurmond + Issue: "239" + PR: "339" diff --git a/.changes/unreleased/Features-20220926-123609.yaml b/.changes/unreleased/Features-20220926-123609.yaml new file mode 100644 index 000000000..b80b3730d --- /dev/null +++ b/.changes/unreleased/Features-20220926-123609.yaml @@ -0,0 +1,7 @@ +kind: Features +body: Migrate dbt-utils current_timestamp macros into core + adapters +time: 2022-09-26T12:36:09.319981-07:00 +custom: + Author: colin-rogers-dbt + Issue: "483" + PR: "480" diff --git a/.changes/unreleased/Fixes-20220926-112857.yaml b/.changes/unreleased/Fixes-20220926-112857.yaml new file mode 100644 index 000000000..2a18f13ac --- /dev/null +++ b/.changes/unreleased/Fixes-20220926-112857.yaml @@ -0,0 +1,7 @@ +kind: Fixes +body: Password doesn't pass to server using LDAP connection via thrift (#310) +time: 2022-09-26T11:28:57.306285-04:00 +custom: + Author: VShkaberda + Issue: "310" + PR: "396" diff --git a/.changie.yaml b/.changie.yaml new file mode 100644 index 000000000..f5800f324 --- /dev/null +++ b/.changie.yaml @@ -0,0 +1,62 @@ +changesDir: .changes +unreleasedDir: unreleased +headerPath: header.tpl.md +versionHeaderPath: "" +changelogPath: CHANGELOG.md +versionExt: md +versionFormat: '## dbt-spark {{.Version}} - {{.Time.Format "January 02, 2006"}}' +kindFormat: '### {{.Kind}}' +changeFormat: '- {{.Body}} ([#{{.Custom.Issue}}](https://github.com/dbt-labs/dbt-spark/issues/{{.Custom.Issue}}), [#{{.Custom.PR}}](https://github.com/dbt-labs/dbt-spark/pull/{{.Custom.PR}}))' +kinds: +- label: Breaking Changes +- label: Features +- label: Fixes +- label: Under the Hood +- label: Dependencies + changeFormat: '- {{.Body}} ({{if ne .Custom.Issue ""}}[#{{.Custom.Issue}}](https://github.com/dbt-labs/dbt-spark/issues/{{.Custom.Issue}}), {{end}}[#{{.Custom.PR}}](https://github.com/dbt-labs/dbt-spark/pull/{{.Custom.PR}}))' +- label: Security + changeFormat: '- {{.Body}} ({{if ne .Custom.Issue ""}}[#{{.Custom.Issue}}](https://github.com/dbt-labs/dbt-spark/issues/{{.Custom.Issue}}), {{end}}[#{{.Custom.PR}}](https://github.com/dbt-labs/dbt-spark/pull/{{.Custom.PR}}))' +custom: +- key: Author + label: GitHub Username(s) (separated by a single space if multiple) + type: string + minLength: 3 +- key: Issue + label: GitHub Issue Number + type: int + minLength: 4 +- key: PR + label: GitHub Pull Request Number + type: int + minLength: 4 +footerFormat: | + {{- $contributorDict := dict }} + {{- /* any names added to this list should be all lowercase for later matching purposes */}} + {{- $core_team := list "emmyoop" "nathaniel-may" "gshank" "leahwicz" "chenyulinx" "stu-k" "iknox-fa" "versusfacit" "mcknight-42" "jtcohen6" "dependabot[bot]" "snyk-bot" }} + {{- range $change := .Changes }} + {{- $authorList := splitList " " $change.Custom.Author }} + {{- /* loop through all authors for a PR */}} + {{- range $author := $authorList }} + 
{{- $authorLower := lower $author }} + {{- /* we only want to include non-core team contributors */}} + {{- if not (has $authorLower $core_team)}} + {{- $pr := $change.Custom.PR }} + {{- /* check if this contributor has other PRs associated with them already */}} + {{- if hasKey $contributorDict $author }} + {{- $prList := get $contributorDict $author }} + {{- $prList = append $prList $pr }} + {{- $contributorDict := set $contributorDict $author $prList }} + {{- else }} + {{- $prList := list $change.Custom.PR }} + {{- $contributorDict := set $contributorDict $author $prList }} + {{- end }} + {{- end}} + {{- end}} + {{- end }} + {{- /* no indentation here for formatting so the final markdown doesn't have unneeded indentations */}} + {{- if $contributorDict}} + ### Contributors + {{- range $k,$v := $contributorDict }} + - [@{{$k}}](https://github.com/{{$k}}) ({{ range $index, $element := $v }}{{if $index}}, {{end}}[#{{$element}}](https://github.com/dbt-labs/dbt-spark/pull/{{$element}}){{end}}) + {{- end }} + {{- end }} diff --git a/.circleci/config.yml b/.circleci/config.yml index 34e449acf..8f0afa6ce 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,29 +33,12 @@ jobs: DBT_INVOCATION_ENV: circle docker: - image: fishtownanalytics/test-container:10 - - image: godatadriven/spark:2 + - image: godatadriven/spark:3.1.1 environment: WAIT_FOR: localhost:5432 command: > --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 --name Thrift JDBC/ODBC Server - --conf spark.hadoop.javax.jdo.option.ConnectionURL=jdbc:postgresql://localhost/metastore - --conf spark.hadoop.javax.jdo.option.ConnectionUserName=dbt - --conf spark.hadoop.javax.jdo.option.ConnectionPassword=dbt - --conf spark.hadoop.javax.jdo.option.ConnectionDriverName=org.postgresql.Driver - --conf spark.serializer=org.apache.spark.serializer.KryoSerializer - --conf spark.jars.packages=org.apache.hudi:hudi-spark-bundle_2.11:0.9.0 - --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension - --conf spark.driver.userClassPathFirst=true - --conf spark.hadoop.datanucleus.autoCreateTables=true - --conf spark.hadoop.datanucleus.schema.autoCreateTables=true - --conf spark.hadoop.datanucleus.fixedDatastore=false - --conf spark.sql.hive.convertMetastoreParquet=false - --hiveconf hoodie.datasource.hive_sync.use_jdbc=false - --hiveconf hoodie.datasource.hive_sync.mode=hms - --hiveconf datanucleus.schema.autoCreateAll=true - --hiveconf hive.metastore.schema.verification=false - - image: postgres:9.6.17-alpine environment: POSTGRES_USER: dbt @@ -80,6 +63,9 @@ jobs: environment: DBT_INVOCATION_ENV: circle DBT_DATABRICKS_RETRY_ALL: True + DBT_TEST_USER_1: "buildbot+dbt_test_user_1@dbtlabs.com" + DBT_TEST_USER_2: "buildbot+dbt_test_user_2@dbtlabs.com" + DBT_TEST_USER_3: "buildbot+dbt_test_user_3@dbtlabs.com" docker: - image: fishtownanalytics/test-container:10 steps: @@ -95,6 +81,9 @@ jobs: environment: DBT_INVOCATION_ENV: circle ODBC_DRIVER: Simba # TODO: move env var to Docker image + DBT_TEST_USER_1: "buildbot+dbt_test_user_1@dbtlabs.com" + DBT_TEST_USER_2: "buildbot+dbt_test_user_2@dbtlabs.com" + DBT_TEST_USER_3: "buildbot+dbt_test_user_3@dbtlabs.com" docker: # image based on `fishtownanalytics/test-container` w/ Simba ODBC Spark driver installed - image: 828731156495.dkr.ecr.us-east-1.amazonaws.com/dbt-spark-odbc-test-container:latest diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..f39d154c0 --- /dev/null +++ b/.flake8 @@ -0,0 +1,12 @@ +[flake8] +select = + E + W + F +ignore = + 
W503 # makes Flake8 work like black + W504 + E203 # makes Flake8 work like black + E741 + E501 +exclude = test diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml new file mode 100644 index 000000000..f5494b313 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -0,0 +1,84 @@ +name: 🐞 Bug +description: Report a bug or an issue you've found with dbt-spark +title: "[Bug] " +labels: ["bug", "triage"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + - type: checkboxes + attributes: + label: Is this a new bug in dbt-spark? + description: > + In other words, is this an error, flaw, failure or fault in our software? + + If this is a bug that broke existing functionality that used to work, please open a regression issue. + If this is a bug in the dbt-core logic, please open an issue in the dbt-core repository. + If this is a bug experienced while using dbt Cloud, please report to [support](mailto:support@getdbt.com). + If this is a request for help or troubleshooting code in your own dbt project, please join our [dbt Community Slack](https://www.getdbt.com/community/join-the-community/) or open a [Discussion question](https://github.com/dbt-labs/docs.getdbt.com/discussions). + + Please search to see if an issue already exists for the bug you encountered. + options: + - label: I believe this is a new bug in dbt-spark + required: true + - label: I have searched the existing issues, and I could not find an existing issue for this bug + required: true + - type: textarea + attributes: + label: Current Behavior + description: A concise description of what you're experiencing. + validations: + required: true + - type: textarea + attributes: + label: Expected Behavior + description: A concise description of what you expected to happen. + validations: + required: true + - type: textarea + attributes: + label: Steps To Reproduce + description: Steps to reproduce the behavior. + placeholder: | + 1. In this environment... + 2. With this config... + 3. Run '...' + 4. See error... + validations: + required: true + - type: textarea + id: logs + attributes: + label: Relevant log output + description: | + If applicable, log output to help explain your problem. + render: shell + validations: + required: false + - type: textarea + attributes: + label: Environment + description: | + examples: + - **OS**: Ubuntu 20.04 + - **Python**: 3.9.12 (`python3 --version`) + - **dbt-core**: 1.1.1 (`dbt --version`) + - **dbt-spark**: 1.1.0 (`dbt --version`) + value: | + - OS: + - Python: + - dbt-core: + - dbt-spark: + render: markdown + validations: + required: false + - type: textarea + attributes: + label: Additional Context + description: | + Links? References? Anything that will give us more context about the issue you are encountering! + + Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in. + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 43f19a154..000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -name: Bug report -about: Report a bug or an issue you've found with dbt-spark -title: '' -labels: bug, triage -assignees: '' - ---- - -### Describe the bug -A clear and concise description of what the bug is. What command did you run? What happened? 
- -### Steps To Reproduce -In as much detail as possible, please provide steps to reproduce the issue. Sample data that triggers the issue, example model code, etc is all very helpful here. - -### Expected behavior -A clear and concise description of what you expected to happen. - -### Screenshots and log output -If applicable, add screenshots or log output to help explain your problem. - -### System information -**The output of `dbt --version`:** -``` -<output goes here> -``` - -**The operating system you're using:** - -**The output of `python --version`:** - -### Additional context -Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..129ea7779 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,14 @@ +blank_issues_enabled: false +contact_links: + - name: Ask the community for help + url: https://github.com/dbt-labs/docs.getdbt.com/discussions + about: Need help troubleshooting? Check out our guide on how to ask + - name: Contact dbt Cloud support + url: mailto:support@getdbt.com + about: Are you using dbt Cloud? Contact our support team for help! + - name: Participate in Discussions + url: https://github.com/dbt-labs/dbt-spark/discussions + about: Do you have a Big Idea for dbt-spark? Read open discussions, or start a new one + - name: Create an issue for dbt-core + url: https://github.com/dbt-labs/dbt-core/issues/new/choose + about: Report a bug or request a feature for dbt-core diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml new file mode 100644 index 000000000..8c123ba51 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -0,0 +1,59 @@ +name: ✨ Feature +description: Propose a straightforward extension of dbt-spark functionality +title: "[Feature] <title>" +labels: ["enhancement", "triage"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this feature request! + - type: checkboxes + attributes: + label: Is this your first time submitting a feature request? + description: > + We want to make sure that features are distinct and discoverable, + so that other members of the community can find them and offer their thoughts. + + Issues are the right place to request straightforward extensions of existing dbt-spark functionality. + For "big ideas" about future capabilities of dbt-spark, we ask that you open a + [discussion](https://github.com/dbt-labs/dbt-spark/discussions) in the "Ideas" category instead. + options: + - label: I have read the [expectations for open source contributors](https://docs.getdbt.com/docs/contributing/oss-expectations) + required: true + - label: I have searched the existing issues, and I could not find an existing issue for this feature + required: true + - label: I am requesting a straightforward extension of existing dbt-spark functionality, rather than a Big Idea better suited to a discussion + required: true + - type: textarea + attributes: + label: Describe the feature + description: A clear and concise description of what you want to happen. + validations: + required: true + - type: textarea + attributes: + label: Describe alternatives you've considered + description: | + A clear and concise description of any alternative solutions or features you've considered. + validations: + required: false + - type: textarea + attributes: + label: Who will this benefit? 
+ description: | + What kind of use case will this feature be useful for? Please be specific and provide examples, this will help us prioritize properly. + validations: + required: false + - type: input + attributes: + label: Are you interested in contributing this feature? + description: Let us know if you want to write some code, and how we can help. + validations: + required: false + - type: textarea + attributes: + label: Anything else? + description: | + Links? References? Anything that will give us more context about the feature you are suggesting! + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 5edc9f6ca..000000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -name: Feature request -about: Suggest an idea for dbt-spark -title: '' -labels: enhancement, triage -assignees: '' - ---- - -### Describe the feature -A clear and concise description of what you want to happen. - -### Describe alternatives you've considered -A clear and concise description of any alternative solutions or features you've considered. - -### Additional context -Please include any other relevant context here. - -### Who will this benefit? -What kind of use case will this feature be useful for? Please be specific and provide examples, this will help us prioritize properly. - -### Are you interested in contributing this feature? -Let us know if you want to write some code, and how we can help. diff --git a/.github/ISSUE_TEMPLATE/regression-report.yml b/.github/ISSUE_TEMPLATE/regression-report.yml new file mode 100644 index 000000000..8b65d6a26 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/regression-report.yml @@ -0,0 +1,82 @@ +name: ☣️ Regression +description: Report a regression you've observed in a newer version of dbt-spark +title: "[Regression] <title>" +labels: ["bug", "regression", "triage"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this regression report! + - type: checkboxes + attributes: + label: Is this a regression in a recent version of dbt-spark? + description: > + A regression is when documented functionality works as expected in an older version of dbt-spark, + and no longer works after upgrading to a newer version of dbt-spark + options: + - label: I believe this is a regression in dbt-spark functionality + required: true + - label: I have searched the existing issues, and I could not find an existing issue for this regression + required: true + - type: textarea + attributes: + label: Current Behavior + description: A concise description of what you're experiencing. + validations: + required: true + - type: textarea + attributes: + label: Expected/Previous Behavior + description: A concise description of what you expected to happen. + validations: + required: true + - type: textarea + attributes: + label: Steps To Reproduce + description: Steps to reproduce the behavior. + placeholder: | + 1. In this environment... + 2. With this config... + 3. Run '...' + 4. See error... + validations: + required: true + - type: textarea + id: logs + attributes: + label: Relevant log output + description: | + If applicable, log output to help explain your problem. 
+ render: shell + validations: + required: false + - type: textarea + attributes: + label: Environment + description: | + examples: + - **OS**: Ubuntu 20.04 + - **Python**: 3.9.12 (`python3 --version`) + - **dbt-core (working version)**: 1.1.1 (`dbt --version`) + - **dbt-spark (working version)**: 1.1.0 (`dbt --version`) + - **dbt-core (regression version)**: 1.2.0 (`dbt --version`) + - **dbt-spark (regression version)**: 1.2.0 (`dbt --version`) + value: | + - OS: + - Python: + - dbt-core (working version): + - dbt-spark (working version): + - dbt-core (regression version): + - dbt-spark (regression version): + render: markdown + validations: + required: true + - type: textarea + attributes: + label: Additional Context + description: | + Links? References? Anything that will give us more context about the issue you are encountering! + + Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in. + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/release.md b/.github/ISSUE_TEMPLATE/release.md deleted file mode 100644 index ac28792a3..000000000 --- a/.github/ISSUE_TEMPLATE/release.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -name: Release -about: Release a new version of dbt-spark -title: '' -labels: release -assignees: '' - ---- - -### TBD \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/dependabot.yml b/.github/dependabot.yml similarity index 80% rename from .github/ISSUE_TEMPLATE/dependabot.yml rename to .github/dependabot.yml index 8a8c85b9f..2a6f34492 100644 --- a/.github/ISSUE_TEMPLATE/dependabot.yml +++ b/.github/dependabot.yml @@ -5,4 +5,4 @@ updates: directory: "/" schedule: interval: "daily" - rebase-strategy: "disabled" \ No newline at end of file + rebase-strategy: "disabled" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 60e12779b..11381456a 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -11,11 +11,16 @@ resolves # ### Description -<!--- Describe the Pull Request here --> +<!--- + Describe the Pull Request here. Add any references and info to help reviewers + understand your changes. Include any tradeoffs you considered. +--> ### Checklist +- [ ] I have read [the contributing guide](https://github.com/dbt-labs/dbt-spark/blob/main/CONTRIBUTING.md) and understand what's expected of me - [ ] I have signed the [CLA](https://docs.getdbt.com/docs/contributor-license-agreements) - [ ] I have run this code in development and it appears to resolve the stated issue - [ ] This PR includes tests, or tests are not required/relevant for this PR -- [ ] I have updated the `CHANGELOG.md` and added information about my change to the "dbt-spark next" section. \ No newline at end of file +- [ ] I have [opened an issue to add/update docs](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose), or docs changes are not required/relevant for this PR +- [ ] I have run `changie new` to [create a changelog entry](https://github.com/dbt-labs/dbt-spark/blob/main/CONTRIBUTING.md#Adding-CHANGELOG-Entry) diff --git a/.github/workflows/backport.yml b/.github/workflows/backport.yml new file mode 100644 index 000000000..8c0355bda --- /dev/null +++ b/.github/workflows/backport.yml @@ -0,0 +1,42 @@ + + +# **what?** +# When a PR is merged, if it has the backport label, it will create +# a new PR to backport those changes to the given branch. If it can't +# cleanly do a backport, it will comment on the merged PR of the failure. 
+# +# Label naming convention: "backport <branch name to backport to>" +# Example: backport 1.0.latest +# +# You MUST "Squash and merge" the original PR or this won't work. + +# **why?** +# Changes sometimes need to be backported to release branches. +# This automates the backporting process + +# **when?** +# Once a PR is "Squash and merge"'d, by adding a backport label, this is triggered + +name: Backport +on: + pull_request: + types: + - labeled + +permissions: + contents: write + pull-requests: write + +jobs: + backport: + name: Backport + runs-on: ubuntu-latest + # Only react to merged PRs for security reasons. + # See https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target. + if: > + github.event.pull_request.merged + && contains(github.event.label.name, 'backport') + steps: + - uses: tibdex/backport@v2.0.2 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/bot-changelog.yml b/.github/workflows/bot-changelog.yml new file mode 100644 index 000000000..39eacf9a6 --- /dev/null +++ b/.github/workflows/bot-changelog.yml @@ -0,0 +1,61 @@ +# **what?** +# When bots create a PR, this action will add a corresponding changie yaml file to that +# PR when a specific label is added. +# +# The file is created off a template: +# +# kind: <per action matrix> +# body: <PR title> +# time: <current timestamp> +# custom: +# Author: <PR User Login (generally the bot)> +# Issue: 4904 +# PR: <PR number> +# +# **why?** +# Automate changelog generation for more visability with automated bot PRs. +# +# **when?** +# Once a PR is created, label should be added to PR before or after creation. You can also +# manually trigger this by adding the appropriate label at any time. +# +# **how to add another bot?** +# Add the label and changie kind to the include matrix. That's it! +# + +name: Bot Changelog + +on: + pull_request: + # catch when the PR is opened with the label or when the label is added + types: [labeled] + +permissions: + contents: write + pull-requests: read + +jobs: + generate_changelog: + strategy: + matrix: + include: + - label: "dependencies" + changie_kind: "Dependency" + - label: "snyk" + changie_kind: "Security" + runs-on: ubuntu-latest + + steps: + + - name: Create and commit changelog on bot PR + if: ${{ contains(github.event.pull_request.labels.*.name, matrix.label) }} + id: bot_changelog + uses: emmyoop/changie_bot@v1.0.1 + with: + GITHUB_TOKEN: ${{ secrets.FISHTOWN_BOT_PAT }} + commit_author_name: "Github Build Bot" + commit_author_email: "<buildbot@fishtownanalytics.com>" + commit_message: "Add automated changelog yaml from template for bot PR" + changie_kind: ${{ matrix.changie_kind }} + label: ${{ matrix.label }} + custom_changelog_string: "custom:\n Author: ${{ github.event.pull_request.user.login }}\n Issue: 417\n PR: ${{ github.event.pull_request.number }}" diff --git a/.github/workflows/changelog-existence.yml b/.github/workflows/changelog-existence.yml new file mode 100644 index 000000000..6e51e8afc --- /dev/null +++ b/.github/workflows/changelog-existence.yml @@ -0,0 +1,41 @@ +# **what?** +# Checks that a file has been committed under the /.changes directory +# as a new CHANGELOG entry. Cannot check for a specific filename as +# it is dynamically generated by change type and timestamp. +# This workflow should not require any secrets since it runs for PRs +# from forked repos. +# By default, secrets are not passed to workflows running from +# a forked repo. 
+ +# **why?** +# Ensure code change gets reflected in the CHANGELOG. + +# **when?** +# This will run for all PRs going into main and *.latest. It will +# run when they are opened, reopened, when any label is added or removed +# and when new code is pushed to the branch. The action will then get +# skipped if the 'Skip Changelog' label is present is any of the labels. + +name: Check Changelog Entry + +on: + pull_request: + types: [opened, reopened, labeled, unlabeled, synchronize] + workflow_dispatch: + +defaults: + run: + shell: bash + +permissions: + contents: read + pull-requests: write + + +jobs: + changelog: + uses: dbt-labs/actions/.github/workflows/changelog-existence.yml@main + with: + changelog_comment: 'Thank you for your pull request! We could not find a changelog entry for this change. For details on how to document a change, see the [dbt-spark contributing guide](https://github.com/dbt-labs/dbt-spark/blob/main/CONTRIBUTING.MD).' + skip_label: 'Skip Changelog' + secrets: inherit # this is only acceptable because we own the action we're calling diff --git a/.github/workflows/jira-creation.yml b/.github/workflows/jira-creation.yml index c84e106a7..b4016befc 100644 --- a/.github/workflows/jira-creation.yml +++ b/.github/workflows/jira-creation.yml @@ -13,7 +13,7 @@ name: Jira Issue Creation on: issues: types: [opened, labeled] - + permissions: issues: write diff --git a/.github/workflows/jira-label.yml b/.github/workflows/jira-label.yml index fd533a170..3da2e3a38 100644 --- a/.github/workflows/jira-label.yml +++ b/.github/workflows/jira-label.yml @@ -13,7 +13,7 @@ name: Jira Label Mirroring on: issues: types: [labeled, unlabeled] - + permissions: issues: read @@ -24,4 +24,3 @@ jobs: JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }} JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }} JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }} - diff --git a/.github/workflows/jira-transition.yml b/.github/workflows/jira-transition.yml index 71273c7a9..ed9f9cd4f 100644 --- a/.github/workflows/jira-transition.yml +++ b/.github/workflows/jira-transition.yml @@ -21,4 +21,4 @@ jobs: secrets: JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }} JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }} - JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }} \ No newline at end of file + JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index fbdbbbaae..bf607c379 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -18,7 +18,6 @@ on: push: branches: - "main" - - "develop" - "*.latest" - "releases/*" pull_request: @@ -37,18 +36,10 @@ defaults: jobs: code-quality: - name: ${{ matrix.toxenv }} + name: code-quality runs-on: ubuntu-latest - - strategy: - fail-fast: false - matrix: - toxenv: [flake8] - - env: - TOXENV: ${{ matrix.toxenv }} - PYTEST_ADDOPTS: "-v --color=yes" + timeout-minutes: 10 steps: - name: Check out the repository @@ -58,28 +49,36 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 - with: + with: python-version: '3.8' - name: Install python dependencies run: | sudo apt-get install libsasl2-dev - pip install --user --upgrade pip - pip install tox - pip --version - tox --version - - name: Run tox - run: tox + python -m pip install --user --upgrade pip + python -m pip --version + python -m pip install pre-commit + pre-commit --version + python -m pip install mypy==0.942 + python -m pip install types-requests + mypy --version + python -m pip install -r requirements.txt + python -m pip install -r dev-requirements.txt + dbt --version + 
+ - name: Run pre-commit hooks + run: pre-commit run --all-files --show-diff-on-failure unit: name: unit test / python ${{ matrix.python-version }} runs-on: ubuntu-latest + timeout-minutes: 10 strategy: fail-fast: false matrix: - python-version: [3.7, 3.8] # TODO: support unit testing for python 3.9 (https://github.com/dbt-labs/dbt/issues/3689) + python-version: ["3.7", "3.8", "3.9", "3.10"] env: TOXENV: "unit" @@ -88,8 +87,6 @@ jobs: steps: - name: Check out the repository uses: actions/checkout@v2 - with: - persist-credentials: false - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 @@ -99,9 +96,9 @@ jobs: - name: Install python dependencies run: | sudo apt-get install libsasl2-dev - pip install --user --upgrade pip - pip install tox - pip --version + python -m pip install --user --upgrade pip + python -m pip --version + python -m pip install tox tox --version - name: Run tox run: tox @@ -128,8 +125,6 @@ jobs: steps: - name: Check out the repository uses: actions/checkout@v2 - with: - persist-credentials: false - name: Set up Python uses: actions/setup-python@v2 @@ -138,9 +133,10 @@ jobs: - name: Install python dependencies run: | - pip install --user --upgrade pip - pip install --upgrade setuptools wheel twine check-wheel-contents - pip --version + python -m pip install --user --upgrade pip + python -m pip install --upgrade setuptools wheel twine check-wheel-contents + python -m pip --version + - name: Build distributions run: ./scripts/build-dist.sh @@ -153,7 +149,7 @@ jobs: - name: Check wheel contents run: | check-wheel-contents dist/*.whl --ignore W007,W008 - + - name: Check if this is an alpha version id: check-is-alpha run: | @@ -179,7 +175,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: [3.7, 3.8, 3.9] + python-version: ["3.7", "3.8", "3.9", "3.10"] steps: - name: Set up Python ${{ matrix.python-version }} @@ -189,9 +185,9 @@ jobs: - name: Install python dependencies run: | - pip install --user --upgrade pip - pip install --upgrade wheel - pip --version + python -m pip install --user --upgrade pip + python -m pip install --upgrade wheel + python -m pip --version - uses: actions/download-artifact@v2 with: name: dist @@ -202,13 +198,13 @@ jobs: - name: Install wheel distributions run: | - find ./dist/*.whl -maxdepth 1 -type f | xargs pip install --force-reinstall --find-links=dist/ + find ./dist/*.whl -maxdepth 1 -type f | xargs python -m pip install --force-reinstall --find-links=dist/ - name: Check wheel distributions run: | dbt --version - name: Install source distributions run: | - find ./dist/*.gz -maxdepth 1 -type f | xargs pip install --force-reinstall --find-links=dist/ + find ./dist/*.gz -maxdepth 1 -type f | xargs python -m pip install --force-reinstall --find-links=dist/ - name: Check source distributions run: | dbt --version diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b25ea884e..554e13a8d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,28 +3,28 @@ name: Build and Release on: workflow_dispatch: - + # Release version number that must be updated for each release env: version_number: '0.20.0rc2' -jobs: +jobs: Test: runs-on: ubuntu-latest steps: - name: Setup Python uses: actions/setup-python@v2.2.2 - with: + with: python-version: '3.8' - + - uses: actions/checkout@v2 - - name: Test release + - name: Test release run: | python3 -m venv env source env/bin/activate sudo apt-get install libsasl2-dev - pip install -r 
dev_requirements.txt + pip install -r dev-requirements.txt pip install twine wheel setuptools python setup.py sdist bdist_wheel pip install dist/dbt-spark-*.tar.gz @@ -38,9 +38,9 @@ jobs: steps: - name: Setup Python uses: actions/setup-python@v2.2.2 - with: + with: python-version: '3.8' - + - uses: actions/checkout@v2 - name: Bumping version @@ -48,7 +48,7 @@ jobs: python3 -m venv env source env/bin/activate sudo apt-get install libsasl2-dev - pip install -r dev_requirements.txt + pip install -r dev-requirements.txt bumpversion --config-file .bumpversion-dbt.cfg patch --new-version ${{env.version_number}} bumpversion --config-file .bumpversion.cfg patch --new-version ${{env.version_number}} --allow-dirty git status @@ -60,7 +60,7 @@ jobs: author_email: 'leah.antkiewicz@dbtlabs.com' message: 'Bumping version to ${{env.version_number}}' tag: v${{env.version_number}} - + # Need to set an output variable because env variables can't be taken as input # This is needed for the next step with releasing to GitHub - name: Find release type @@ -69,7 +69,7 @@ jobs: IS_PRERELEASE: ${{ contains(env.version_number, 'rc') || contains(env.version_number, 'b') }} run: | echo ::set-output name=isPrerelease::$IS_PRERELEASE - + - name: Create GitHub release uses: actions/create-release@v1 env: @@ -88,7 +88,7 @@ jobs: # or $ pip install "dbt-spark[PyHive]==${{env.version_number}}" ``` - + PypiRelease: name: Pypi release runs-on: ubuntu-latest @@ -97,13 +97,13 @@ jobs: steps: - name: Setup Python uses: actions/setup-python@v2.2.2 - with: + with: python-version: '3.8' - + - uses: actions/checkout@v2 with: ref: v${{env.version_number}} - + - name: Release to pypi env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} @@ -112,8 +112,7 @@ jobs: python3 -m venv env source env/bin/activate sudo apt-get install libsasl2-dev - pip install -r dev_requirements.txt + pip install -r dev-requirements.txt pip install twine wheel setuptools python setup.py sdist bdist_wheel twine upload --non-interactive dist/dbt_spark-${{env.version_number}}-py3-none-any.whl dist/dbt-spark-${{env.version_number}}.tar.gz - diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 2848ce8f7..a56455d55 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,5 +13,3 @@ jobs: stale-pr-message: "This PR has been marked as Stale because it has been open for 180 days with no activity. If you would like the PR to remain open, please remove the stale label or comment on the PR, or it will be closed in 7 days." # mark issues/PRs stale when they haven't seen activity in 180 days days-before-stale: 180 - # ignore checking issues with the following labels - exempt-issue-labels: "epic, discussion" \ No newline at end of file diff --git a/.github/workflows/version-bump.yml b/.github/workflows/version-bump.yml index 7fb8bb6eb..bde34d683 100644 --- a/.github/workflows/version-bump.yml +++ b/.github/workflows/version-bump.yml @@ -1,18 +1,15 @@ # **what?** -# This workflow will take a version number and a dry run flag. With that -# it will run versionbump to update the version number everywhere in the -# code base and then generate an update Docker requirements file. If this -# is a dry run, a draft PR will open with the changes. If this isn't a dry -# run, the changes will be committed to the branch this is run on. +# This workflow will take the new version number to bump to. 
With that +# it will run versionbump to update the version number everywhere in the +# code base and then run changie to create the corresponding changelog. +# A PR will be created with the changes that can be reviewed before committing. # **why?** -# This is to aid in releasing dbt and making sure we have updated -# the versions and Docker requirements in all places. +# This is to aid in releasing dbt and making sure we have updated +# the version in all places and generated the changelog. # **when?** -# This is triggered either manually OR -# from the repository_dispatch event "version-bump" which is sent from -# the dbt-release repo Action +# This is triggered manually name: Version Bump @@ -20,84 +17,12 @@ on: workflow_dispatch: inputs: version_number: - description: 'The version number to bump to' + description: 'The version number to bump to (ex. 1.2.0, 1.3.0b1)' required: true - is_dry_run: - description: 'Creates a draft PR to allow testing instead of committing to a branch' - required: true - default: 'true' - repository_dispatch: - types: [version-bump] - -jobs: - bump: - runs-on: ubuntu-latest - steps: - - name: Check out the repository - uses: actions/checkout@v2 - - - name: Set version and dry run values - id: variables - env: - VERSION_NUMBER: "${{ github.event.client_payload.version_number == '' && github.event.inputs.version_number || github.event.client_payload.version_number }}" - IS_DRY_RUN: "${{ github.event.client_payload.is_dry_run == '' && github.event.inputs.is_dry_run || github.event.client_payload.is_dry_run }}" - run: | - echo Repository dispatch event version: ${{ github.event.client_payload.version_number }} - echo Repository dispatch event dry run: ${{ github.event.client_payload.is_dry_run }} - echo Workflow dispatch event version: ${{ github.event.inputs.version_number }} - echo Workflow dispatch event dry run: ${{ github.event.inputs.is_dry_run }} - echo ::set-output name=VERSION_NUMBER::$VERSION_NUMBER - echo ::set-output name=IS_DRY_RUN::$IS_DRY_RUN - - - uses: actions/setup-python@v2 - with: - python-version: "3.8" - - - name: Install python dependencies - run: | - sudo apt-get install libsasl2-dev - python3 -m venv env - source env/bin/activate - pip install --upgrade pip - - - name: Create PR branch - if: ${{ steps.variables.outputs.IS_DRY_RUN == 'true' }} - run: | - git checkout -b bumping-version/${{steps.variables.outputs.VERSION_NUMBER}}_$GITHUB_RUN_ID - git push origin bumping-version/${{steps.variables.outputs.VERSION_NUMBER}}_$GITHUB_RUN_ID - git branch --set-upstream-to=origin/bumping-version/${{steps.variables.outputs.VERSION_NUMBER}}_$GITHUB_RUN_ID bumping-version/${{steps.variables.outputs.VERSION_NUMBER}}_$GITHUB_RUN_ID - - - name: Bumping version - run: | - source env/bin/activate - pip install -r dev_requirements.txt - env/bin/bumpversion --allow-dirty --new-version ${{steps.variables.outputs.VERSION_NUMBER}} major - git status - - - name: Commit version bump directly - uses: EndBug/add-and-commit@v7 - if: ${{ steps.variables.outputs.IS_DRY_RUN == 'false' }} - with: - author_name: 'Github Build Bot' - author_email: 'buildbot@fishtownanalytics.com' - message: 'Bumping version to ${{steps.variables.outputs.VERSION_NUMBER}}' - - - name: Commit version bump to branch - uses: EndBug/add-and-commit@v7 - if: ${{ steps.variables.outputs.IS_DRY_RUN == 'true' }} - with: - author_name: 'Github Build Bot' - author_email: 'buildbot@fishtownanalytics.com' - message: 'Bumping version to ${{steps.variables.outputs.VERSION_NUMBER}}' - branch: 
'bumping-version/${{steps.variables.outputs.VERSION_NUMBER}}_${{GITHUB.RUN_ID}}' - push: 'origin origin/bumping-version/${{steps.variables.outputs.VERSION_NUMBER}}_${{GITHUB.RUN_ID}}' - - name: Create Pull Request - uses: peter-evans/create-pull-request@v3 - if: ${{ steps.variables.outputs.IS_DRY_RUN == 'true' }} - with: - author: 'Github Build Bot <buildbot@fishtownanalytics.com>' - draft: true - base: ${{github.ref}} - title: 'Bumping version to ${{steps.variables.outputs.VERSION_NUMBER}}' - branch: 'bumping-version/${{steps.variables.outputs.VERSION_NUMBER}}_${{GITHUB.RUN_ID}}' +jobs: + version_bump_and_changie: + uses: dbt-labs/actions/.github/workflows/version-bump.yml@main + with: + version_number: ${{ inputs.version_number }} + secrets: inherit # ok since what we are calling is internally maintained diff --git a/.gitignore b/.gitignore index cc586f5fe..189589cf4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,18 +1,47 @@ -.hive-metastore/ -.spark-warehouse/ -*.egg-info -env/ -*.pyc +# Byte-compiled / optimized / DLL files __pycache__ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +env*/ +dbt_env/ +dist/ +*.egg-info +logs/ + + +# Unit test .tox/ .env +test.env + + +# Django stuff +*.log + +# Mypy +*.pytest_cache/ + +# Vim +*.sw* + +# Pyenv +.python-version + +# pycharm .idea/ -build/ -dist/ -dbt-integration-tests -test/integration/.user.yml + +# MacOS .DS_Store -test.env + +# vscode .vscode -*.log -logs/ \ No newline at end of file + +# other +.hive-metastore/ +.spark-warehouse/ +dbt-integration-tests +test/integration/.user.yml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..e85b1dc8b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,66 @@ +# For more on configuring pre-commit hooks (see https://pre-commit.com/) + +# TODO: remove global exclusion of tests when testing overhaul is complete +exclude: '^tests/.*' + +# Force all unspecified python hooks to run python 3.8 +default_language_version: + python: python3 + +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.2.0 + hooks: + - id: check-yaml + args: [--unsafe] + - id: check-json + - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-case-conflict +- repo: https://github.com/psf/black + rev: 21.12b0 + hooks: + - id: black + additional_dependencies: ['click==8.0.4'] + args: + - "--line-length=99" + - "--target-version=py38" + - id: black + alias: black-check + stages: [manual] + additional_dependencies: ['click==8.0.4'] + args: + - "--line-length=99" + - "--target-version=py38" + - "--check" + - "--diff" +- repo: https://gitlab.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + - id: flake8 + alias: flake8-check + stages: [manual] +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.950 + hooks: + - id: mypy + # N.B.: Mypy is... a bit fragile. + # + # By using `language: system` we run this hook in the local + # environment instead of a pre-commit isolated one. This is needed + # to ensure mypy correctly parses the project. + + # It may cause trouble in that it adds environmental variables out + # of our control to the mix. Unfortunately, there's nothing we can + # do about per pre-commit's author. + # See https://github.com/pre-commit/pre-commit/issues/730 for details. 
+ args: [--show-error-codes, --ignore-missing-imports] + files: ^dbt/adapters/.* + language: system + - id: mypy + alias: mypy-check + stages: [manual] + args: [--show-error-codes, --pretty, --ignore-missing-imports] + files: ^dbt/adapters + language: system diff --git a/CHANGELOG.md b/CHANGELOG.md index 27f7db545..6dd49494b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,220 +1,16 @@ -## dbt-spark 1.1.0 (TBD) - -### Features -- Add session connection method ([#272](https://github.com/dbt-labs/dbt-spark/issues/272), [#279](https://github.com/dbt-labs/dbt-spark/pull/279)) - -### Under the hood -- Use dbt.tests.adapter.basic in test suite ([#298](https://github.com/dbt-labs/dbt-spark/issues/298), [#299](https://github.com/dbt-labs/dbt-spark/pull/299)) -- Make internal macros use macro dispatch to be overridable in child adapters ([#319](https://github.com/dbt-labs/dbt-spark/issues/319), [#320](https://github.com/dbt-labs/dbt-spark/pull/320)) -- Override adapter method 'run_sql_for_tests' ([#323](https://github.com/dbt-labs/dbt-spark/issues/323), [#324](https://github.com/dbt-labs/dbt-spark/pull/324)) -- when a table or view doesn't exist, 'adapter.get_columns_in_relation' will return empty list instead of fail ([#328]https://github.com/dbt-labs/dbt-spark/pull/328) -- Add schema to the default location root ([#239](https://github.com/dbt-labs/dbt-spark/issues/239), [#339](https://github.com/dbt-labs/dbt-spark/pull/339)) - - -### Contributors -- [@JCZuurmond](https://github.com/dbt-labs/dbt-spark/pull/279) ( [#279](https://github.com/dbt-labs/dbt-spark/pull/279)) -- [@ueshin](https://github.com/ueshin) ([#320](https://github.com/dbt-labs/dbt-spark/pull/320)) -- [@dan1elt0m](https://github.com/dan1elt0m) ([#339](https://github.com/dbt-labs/dbt-spark/pull/339)) - -## dbt-spark 1.1.0b1 (March 23, 2022) - -### Features -- Adds new integration test to check against new ability to allow unique_key to be a list. 
([#282](https://github.com/dbt-labs/dbt-spark/issues/282)), [#291](https://github.com/dbt-labs/dbt-spark/pull/291)) - -### Fixes -- Closes the connection properly ([#280](https://github.com/dbt-labs/dbt-spark/issues/280), [#285](https://github.com/dbt-labs/dbt-spark/pull/285)) - -### Under the hood -- get_response -> AdapterResponse ([#265](https://github.com/dbt-labs/dbt-spark/pull/265)) -- Adding stale Actions workflow ([#275](https://github.com/dbt-labs/dbt-spark/pull/275)) -- Update plugin author name (`fishtown-analytics` → `dbt-labs`) in ODBC user agent ([#288](https://github.com/dbt-labs/dbt-spark/pull/288)) -- Configure insert_overwrite models to use parquet ([#301](https://github.com/dbt-labs/dbt-spark/pull/301)) - -### Contributors -- [@amychen1776](https://github.com/amychen1776) ([#288](https://github.com/dbt-labs/dbt-spark/pull/288)) -- [@ueshin](https://github.com/ueshin) ([#285](https://github.com/dbt-labs/dbt-spark/pull/285)) - -## dbt-spark 1.0.1rc0 (Release TBD) - -### Fixes -- Closes the connection properly ([#280](https://github.com/dbt-labs/dbt-spark/issues/280), [#285](https://github.com/dbt-labs/dbt-spark/pull/285)) - -### Contributors -- [@ueshin](https://github.com/ueshin) ([#285](https://github.com/dbt-labs/dbt-spark/pull/285)) - -## dbt-spark 1.0.0 (December 3, 2021) - -### Fixes -- Incremental materialization corrected to respect `full_refresh` config, by using `should_full_refresh()` macro ([#260](https://github.com/dbt-labs/dbt-spark/issues/260), [#262](https://github.com/dbt-labs/dbt-spark/pull/262/)) - -### Contributors -- [@grindheim](https://github.com/grindheim) ([#262](https://github.com/dbt-labs/dbt-spark/pull/262/)) - -## dbt-spark 1.0.0rc2 (November 24, 2021) - -### Features -- Add support for Apache Hudi (hudi file format) which supports incremental merge strategies ([#187](https://github.com/dbt-labs/dbt-spark/issues/187), [#210](https://github.com/dbt-labs/dbt-spark/pull/210)) - -### Under the hood -- Refactor seed macros: remove duplicated code from dbt-core, and provide clearer logging of SQL parameters that differ by connection method ([#249](https://github.com/dbt-labs/dbt-spark/issues/249), [#250](https://github.com/dbt-labs/dbt-snowflake/pull/250)) -- Replace `sample_profiles.yml` with `profile_template.yml`, for use with new `dbt init` ([#247](https://github.com/dbt-labs/dbt-spark/pull/247)) - -### Contributors -- [@vingov](https://github.com/vingov) ([#210](https://github.com/dbt-labs/dbt-spark/pull/210)) - -## dbt-spark 1.0.0rc1 (November 10, 2021) - -### Under the hood -- Remove official support for python 3.6, which is reaching end of life on December 23, 2021 ([dbt-core#4134](https://github.com/dbt-labs/dbt-core/issues/4134), [#253](https://github.com/dbt-labs/dbt-snowflake/pull/253)) -- Add support for structured logging ([#251](https://github.com/dbt-labs/dbt-spark/pull/251)) - -## dbt-spark 0.21.1 (Release TBD) - -## dbt-spark 0.21.1rc1 (November 3, 2021) - -### Fixes -- Fix `--store-failures` for tests, by suppressing irrelevant error in `comment_clause()` macro ([#232](https://github.com/dbt-labs/dbt-spark/issues/232), [#233](https://github.com/dbt-labs/dbt-spark/pull/233)) -- Add support for `on_schema_change` config in incremental models: `ignore`, `fail`, `append_new_columns`. 
For `sync_all_columns`, removing columns is not supported by Apache Spark or Delta Lake ([#198](https://github.com/dbt-labs/dbt-spark/issues/198), [#226](https://github.com/dbt-labs/dbt-spark/issues/226), [#229](https://github.com/dbt-labs/dbt-spark/pull/229)) -- Add `persist_docs` call to incremental model ([#224](https://github.com/dbt-labs/dbt-spark/issues/224), [#234](https://github.com/dbt-labs/dbt-spark/pull/234)) - -### Contributors -- [@binhnefits](https://github.com/binhnefits) ([#234](https://github.com/dbt-labs/dbt-spark/pull/234)) - -## dbt-spark 0.21.0 (October 4, 2021) - -### Fixes -- Enhanced get_columns_in_relation method to handle a bug in open source deltalake which doesnt return schema details in `show table extended in databasename like '*'` query output. This impacts dbt snapshots if file format is open source deltalake ([#207](https://github.com/dbt-labs/dbt-spark/pull/207)) -- Parse properly columns when there are struct fields to avoid considering inner fields: Issue ([#202](https://github.com/dbt-labs/dbt-spark/issues/202)) - -### Under the hood -- Add `unique_field` to better understand adapter adoption in anonymous usage tracking ([#211](https://github.com/dbt-labs/dbt-spark/pull/211)) - -### Contributors -- [@harryharanb](https://github.com/harryharanb) ([#207](https://github.com/dbt-labs/dbt-spark/pull/207)) -- [@SCouto](https://github.com/Scouto) ([#204](https://github.com/dbt-labs/dbt-spark/pull/204)) - -## dbt-spark 0.21.0b2 (August 20, 2021) - -### Fixes -- Add pyodbc import error message to dbt.exceptions.RuntimeException to get more detailed information when running `dbt debug` ([#192](https://github.com/dbt-labs/dbt-spark/pull/192)) -- Add support for ODBC Server Side Parameters, allowing options that need to be set with the `SET` statement to be used ([#201](https://github.com/dbt-labs/dbt-spark/pull/201)) -- Add `retry_all` configuration setting to retry all connection issues, not just when the `_is_retryable_error` function determines ([#194](https://github.com/dbt-labs/dbt-spark/pull/194)) - -### Contributors -- [@JCZuurmond](https://github.com/JCZuurmond) ([#192](https://github.com/fishtown-analytics/dbt-spark/pull/192)) -- [@jethron](https://github.com/jethron) ([#201](https://github.com/fishtown-analytics/dbt-spark/pull/201)) -- [@gregingenii](https://github.com/gregingenii) ([#194](https://github.com/dbt-labs/dbt-spark/pull/194)) - -## dbt-spark 0.21.0b1 (August 3, 2021) - -## dbt-spark 0.20.1 (August 2, 2021) - -## dbt-spark 0.20.1rc1 (August 2, 2021) - -### Fixes -- Fix `get_columns_in_relation` when called on models created in the same run ([#196](https://github.com/dbt-labs/dbt-spark/pull/196), [#197](https://github.com/dbt-labs/dbt-spark/pull/197)) - -### Contributors -- [@ali-tny](https://github.com/ali-tny) ([#197](https://github.com/fishtown-analytics/dbt-spark/pull/197)) - - -## dbt-spark 0.20.0 (July 12, 2021) - -## dbt-spark 0.20.0rc2 (July 7, 2021) - -### Features - -- Add support for `merge_update_columns` config in `merge`-strategy incremental models ([#183](https://github.com/fishtown-analytics/dbt-spark/pull/183), [#184](https://github.com/fishtown-analytics/dbt-spark/pull/184)) - -### Fixes - -- Fix column-level `persist_docs` on Delta tables, add tests ([#180](https://github.com/fishtown-analytics/dbt-spark/pull/180)) - -## dbt-spark 0.20.0rc1 (June 8, 2021) - -### Features - -- Allow user to specify `use_ssl` ([#169](https://github.com/fishtown-analytics/dbt-spark/pull/169)) -- Allow setting table `OPTIONS` using `config` 
([#171](https://github.com/fishtown-analytics/dbt-spark/pull/171)) -- Add support for column-level `persist_docs` on Delta tables ([#84](https://github.com/fishtown-analytics/dbt-spark/pull/84), [#170](https://github.com/fishtown-analytics/dbt-spark/pull/170)) - -### Fixes -- Cast `table_owner` to string to avoid errors generating docs ([#158](https://github.com/fishtown-analytics/dbt-spark/pull/158), [#159](https://github.com/fishtown-analytics/dbt-spark/pull/159)) -- Explicitly cast column types when inserting seeds ([#139](https://github.com/fishtown-analytics/dbt-spark/pull/139), [#166](https://github.com/fishtown-analytics/dbt-spark/pull/166)) - -### Under the hood -- Parse information returned by `list_relations_without_caching` macro to speed up catalog generation ([#93](https://github.com/fishtown-analytics/dbt-spark/issues/93), [#160](https://github.com/fishtown-analytics/dbt-spark/pull/160)) -- More flexible host passing, https:// can be omitted ([#153](https://github.com/fishtown-analytics/dbt-spark/issues/153)) - -### Contributors -- [@friendofasquid](https://github.com/friendofasquid) ([#159](https://github.com/fishtown-analytics/dbt-spark/pull/159)) -- [@franloza](https://github.com/franloza) ([#160](https://github.com/fishtown-analytics/dbt-spark/pull/160)) -- [@Fokko](https://github.com/Fokko) ([#165](https://github.com/fishtown-analytics/dbt-spark/pull/165)) -- [@rahulgoyal2987](https://github.com/rahulgoyal2987) ([#169](https://github.com/fishtown-analytics/dbt-spark/pull/169)) -- [@JCZuurmond](https://github.com/JCZuurmond) ([#171](https://github.com/fishtown-analytics/dbt-spark/pull/171)) -- [@cristianoperez](https://github.com/cristianoperez) ([#170](https://github.com/fishtown-analytics/dbt-spark/pull/170)) - - -## dbt-spark 0.19.1 (April 2, 2021) - -## dbt-spark 0.19.1b2 (February 26, 2021) - -### Under the hood -- Update serialization calls to use new API in dbt-core `0.19.1b2` ([#150](https://github.com/fishtown-analytics/dbt-spark/pull/150)) - -## dbt-spark 0.19.0.1 (February 26, 2021) - -### Fixes -- Fix package distribution to include incremental model materializations ([#151](https://github.com/fishtown-analytics/dbt-spark/pull/151), [#152](https://github.com/fishtown-analytics/dbt-spark/issues/152)) - -## dbt-spark 0.19.0 (February 21, 2021) - -### Breaking changes -- Incremental models have `incremental_strategy: append` by default. This strategy adds new records without updating or overwriting existing records. For that, use `merge` or `insert_overwrite` instead, depending on the file format, connection method, and attributes of your underlying data. dbt will try to raise a helpful error if you configure a strategy that is not supported for a given file format or connection. ([#140](https://github.com/fishtown-analytics/dbt-spark/pull/140), [#141](https://github.com/fishtown-analytics/dbt-spark/pull/141)) - -### Fixes -- Capture hard-deleted records in snapshot merge, when `invalidate_hard_deletes` config is set ([#109](https://github.com/fishtown-analytics/dbt-spark/pull/143), [#126](https://github.com/fishtown-analytics/dbt-spark/pull/144)) - -## dbt-spark 0.19.0rc1 (January 8, 2021) - -### Breaking changes -- Users of the `http` and `thrift` connection methods need to install extra requirements: `pip install dbt-spark[PyHive]` ([#109](https://github.com/fishtown-analytics/dbt-spark/pull/109), [#126](https://github.com/fishtown-analytics/dbt-spark/pull/126)) - -### Under the hood -- Enable `CREATE OR REPLACE` support when using Delta. 
Instead of dropping and recreating the table, it will keep the existing table, and add a new version as supported by Delta. This will ensure that the table stays available when running the pipeline, and you can track the history. -- Add changelog, issue templates ([#119](https://github.com/fishtown-analytics/dbt-spark/pull/119), [#120](https://github.com/fishtown-analytics/dbt-spark/pull/120)) - -### Fixes -- Handle case of 0 retries better for HTTP Spark Connections ([#132](https://github.com/fishtown-analytics/dbt-spark/pull/132)) - -### Contributors -- [@danielvdende](https://github.com/danielvdende) ([#132](https://github.com/fishtown-analytics/dbt-spark/pull/132)) -- [@Fokko](https://github.com/Fokko) ([#125](https://github.com/fishtown-analytics/dbt-spark/pull/125)) - -## dbt-spark 0.18.1.1 (November 13, 2020) - -### Fixes -- Fix `extras_require` typo to enable `pip install dbt-spark[ODBC]` (([#121](https://github.com/fishtown-analytics/dbt-spark/pull/121)), ([#122](https://github.com/fishtown-analytics/dbt-spark/pull/122))) - -## dbt-spark 0.18.1 (November 6, 2020) - -### Features -- Allows users to specify `auth` and `kerberos_service_name` ([#107](https://github.com/fishtown-analytics/dbt-spark/pull/107)) -- Add support for ODBC driver connections to Databricks clusters and endpoints ([#116](https://github.com/fishtown-analytics/dbt-spark/pull/116)) - -### Under the hood -- Updated README links ([#115](https://github.com/fishtown-analytics/dbt-spark/pull/115)) -- Support complete atomic overwrite of non-partitioned incremental models ([#117](https://github.com/fishtown-analytics/dbt-spark/pull/117)) -- Update to support dbt-core 0.18.1 ([#110](https://github.com/fishtown-analytics/dbt-spark/pull/110), [#118](https://github.com/fishtown-analytics/dbt-spark/pull/118)) - -### Contributors -- [@danielhstahl](https://github.com/danielhstahl) ([#107](https://github.com/fishtown-analytics/dbt-spark/pull/107)) -- [@collinprather](https://github.com/collinprather) ([#115](https://github.com/fishtown-analytics/dbt-spark/pull/115)) -- [@charlottevdscheun](https://github.com/charlottevdscheun) ([#117](https://github.com/fishtown-analytics/dbt-spark/pull/117)) -- [@Fokko](https://github.com/Fokko) ([#117](https://github.com/fishtown-analytics/dbt-spark/pull/117)) - -## dbt-spark 0.18.0 (September 18, 2020) - -### Under the hood -- Make a number of changes to support dbt-adapter-tests ([#103](https://github.com/fishtown-analytics/dbt-spark/pull/103)) -- Update to support dbt-core 0.18.0. Run CI tests against local Spark, Databricks ([#105](https://github.com/fishtown-analytics/dbt-spark/pull/105)) +# dbt-spark Changelog + +- This file provides a full account of all changes to `dbt-spark`. +- Changes are listed under the (pre)release in which they first appear. Subsequent releases include changes from previous releases. +- "Breaking changes" listed under a version may require action from end users or external maintainers when upgrading to that version. +- Do not edit this file directly. This file is auto-generated using [changie](https://github.com/miniscruff/changie). 
For details on how to document a change, see [the contributing guide](https://github.com/dbt-labs/dbt-spark/blob/main/CONTRIBUTING.md#adding-changelog-entry) + +## Previous Releases +For information on prior major and minor releases, see their changelogs: +- [1.3](https://github.com/dbt-labs/dbt-spark/blob/1.3.latest/CHANGELOG.md) +- [1.2](https://github.com/dbt-labs/dbt-spark/blob/1.2.latest/CHANGELOG.md) +- [1.1](https://github.com/dbt-labs/dbt-spark/blob/1.1.latest/CHANGELOG.md) +- [1.0](https://github.com/dbt-labs/dbt-spark/blob/1.0.latest/CHANGELOG.md) +- [0.21](https://github.com/dbt-labs/dbt-spark/blob/0.21.latest/CHANGELOG.md) +- [0.20](https://github.com/dbt-labs/dbt-spark/blob/0.20.latest/CHANGELOG.md) +- [0.19 and earlier](https://github.com/dbt-labs/dbt-spark/blob/0.19.latest/CHANGELOG.md) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..1d6e76d31 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,111 @@ +# Contributing to `dbt-spark` + +1. [About this document](#about-this-document) +2. [Getting the code](#getting-the-code) +3. [Running `dbt-spark` in development](#running-dbt-spark-in-development) +4. [Testing](#testing) +5. [Updating Docs](#updating-docs) +6. [Submitting a Pull Request](#submitting-a-pull-request) + +## About this document +This document is a guide intended for folks interested in contributing to `dbt-spark`. Below, we document the process by which members of the community should create issues and submit pull requests (PRs) in this repository. It is not intended as a guide for using `dbt-spark`, and it assumes a certain level of familiarity with Python concepts such as virtualenvs, `pip`, Python modules, and so on. This guide assumes you are using macOS or Linux and are comfortable with the command line. + +For those wishing to contribute, we highly suggest reading dbt-core's [contribution guide](https://github.com/dbt-labs/dbt-core/blob/HEAD/CONTRIBUTING.md) if you haven't already. Almost all of the information there is applicable to contributing here, too! + +### Signing the CLA + +Please note that all contributors to `dbt-spark` must sign the [Contributor License Agreement](https://docs.getdbt.com/docs/contributor-license-agreements) to have their Pull Request merged into the `dbt-spark` codebase. If you are unable to sign the CLA, then the `dbt-spark` maintainers will unfortunately be unable to merge your Pull Request. You are, however, welcome to open issues and comment on existing ones. + + +## Getting the code + +You will need `git` in order to download and modify the `dbt-spark` source code. You can find directions [here](https://github.com/git-guides/install-git) on how to install `git`. + +### External contributors + +If you are not a member of the `dbt-labs` GitHub organization, you can contribute to `dbt-spark` by forking the `dbt-spark` repository. For a detailed overview of forking, check out the [GitHub docs on forking](https://help.github.com/en/articles/fork-a-repo). In short, you will need to (example commands are sketched below): + +1. fork the `dbt-spark` repository +2. clone your fork locally +3. check out a new branch for your proposed changes +4. push changes to your fork +5. open a pull request against `dbt-labs/dbt-spark` from your forked repository + +### dbt Labs contributors + +If you are a member of the `dbt Labs` GitHub organization, you will have push access to the `dbt-spark` repo. Rather than forking `dbt-spark` to make your changes, just clone the repository, check out a new branch, and push directly to that branch.
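+ +For external contributors, the numbered steps above might look roughly like the following sketch. This is illustrative only: the fork URL and the branch name are placeholders, so adapt them to your own fork and change. + +```sh +# 1-2. fork dbt-labs/dbt-spark on GitHub, then clone your fork locally +git clone https://github.com/<your-github-username>/dbt-spark.git +cd dbt-spark + +# 3. check out a new branch for your proposed changes +git checkout -b my-proposed-change + +# 4. commit your work and push the branch to your fork +git add . +git commit -m "Describe your change" +git push --set-upstream origin my-proposed-change + +# 5. open a pull request against dbt-labs/dbt-spark from the pushed branch (via the GitHub UI) +```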
+ + +## Running `dbt-spark` in development + +### Installation + +First make sure that you set up your `virtualenv` as described in [Setting up an environment](https://github.com/dbt-labs/dbt-core/blob/HEAD/CONTRIBUTING.md#setting-up-an-environment). Ensure you have the latest version of pip installed with `pip install --upgrade pip`. Next, install `dbt-spark`'s latest dependencies: + +```sh +pip install -e . -r dev-requirements.txt +``` + +When `dbt-spark` is installed this way, any changes you make to the `dbt-spark` source code will be reflected immediately in your next `dbt-spark` run. + +To confirm you have the correct version of `dbt-core` installed, please run `dbt --version` and `which dbt`. + + +## Testing + +### Initial Setup + +`dbt-spark` uses test credentials specified in a `test.env` file in the root of the repository. This `test.env` file is git-ignored, but please be _extra_ careful to never check in credentials or other sensitive information when developing. To create your `test.env` file, copy the provided example file, then supply your relevant credentials. + +``` +cp test.env.example test.env +$EDITOR test.env +``` + +### Test commands +There are a few methods for running tests locally. + +#### `tox` +`tox` takes care of managing Python virtualenvs and installing dependencies in order to run tests. You can also run tests in parallel, for example, you can run unit tests for Python 3.7, Python 3.8, Python 3.9, and `flake8` checks in parallel with `tox -p`. Also, you can run unit tests for specific Python versions with `tox -e py37`. The configuration of these tests is located in `tox.ini`. + +#### `pytest` +Finally, you can also run a specific test or group of tests using `pytest` directly. With a Python virtualenv active and dev dependencies installed you can do things like: + +```sh +# run specific spark integration tests +python -m pytest -m profile_spark tests/integration/get_columns_in_relation +# run specific functional tests +python -m pytest --profile databricks_sql_endpoint tests/functional/adapter/test_basic.py +# run all unit tests in a file +python -m pytest tests/unit/test_adapter.py +# run a specific unit test +python -m pytest tests/unit/test_adapter.py::TestSparkAdapter::test_profile_with_database +``` +## Updating Docs + +Many changes will require an update to the `dbt-spark` docs. Here are some useful resources: + +- Docs are [here](https://docs.getdbt.com/). +- The docs repo for making changes is located [here](https://github.com/dbt-labs/docs.getdbt.com). +- The changes made are likely to impact one or both of [Spark Profile](https://docs.getdbt.com/reference/warehouse-profiles/spark-profile) or [Spark Configs](https://docs.getdbt.com/reference/resource-configs/spark-configs). +- We ask every community member who makes a user-facing change to open an issue or PR regarding doc changes. + +## Adding CHANGELOG Entry + +We use [changie](https://changie.dev) to generate `CHANGELOG` entries. **Note:** Do not edit the `CHANGELOG.md` directly. Your modifications will be lost. + +Follow the steps to [install `changie`](https://changie.dev/guide/installation/) for your system. + +Once changie is installed and your PR is created, simply run `changie new` and changie will walk you through the process of creating a changelog entry. Commit the file that's created and your changelog entry is complete! + +You don't need to worry about which `dbt-spark` version your change will go into.
Just create the changelog entry with `changie`, and open your PR against the `main` branch. All merged changes will be included in the next minor version of `dbt-spark`. The Core maintainers _may_ choose to "backport" specific changes in order to patch older minor versions. In that case, a maintainer will take care of that backport after merging your PR, before releasing the new version of `dbt-spark`. + +## Submitting a Pull Request + +dbt Labs provides a CI environment to test changes to the `dbt-spark` adapter, and periodic checks against the development version of `dbt-core`, through GitHub Actions. + +A `dbt-spark` maintainer will review your PR. They may suggest code revisions for style or clarity, or request that you add unit or integration test(s). These are good things! We believe that, with a little bit of help, anyone can contribute high-quality code. + +Once all requests and questions have been addressed, the `dbt-spark` maintainer can trigger CI testing. + +Once all tests are passing and your PR has been approved, a `dbt-spark` maintainer will merge your changes into the active development branch. And that's it! Happy developing :tada: diff --git a/MANIFEST.in b/MANIFEST.in index 78412d5b8..cfbc714ed 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -recursive-include dbt/include *.sql *.yml *.md \ No newline at end of file +recursive-include dbt/include *.sql *.yml *.md diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..a520c425f --- /dev/null +++ b/Makefile @@ -0,0 +1,56 @@ +.DEFAULT_GOAL:=help + +.PHONY: dev +dev: ## Installs adapter in develop mode along with development dependencies + @\ + pip install -r dev-requirements.txt && pre-commit install + +.PHONY: mypy +mypy: ## Runs mypy against staged changes for static type checking. + @\ + pre-commit run --hook-stage manual mypy-check | grep -v "INFO" + +.PHONY: flake8 +flake8: ## Runs flake8 against staged changes to enforce style guide. + @\ + pre-commit run --hook-stage manual flake8-check | grep -v "INFO" + +.PHONY: black +black: ## Runs black against staged changes to enforce style guide. + @\ + pre-commit run --hook-stage manual black-check -v | grep -v "INFO" + +.PHONY: lint +lint: ## Runs flake8 and mypy code checks against staged changes. + @\ + pre-commit run flake8-check --hook-stage manual | grep -v "INFO"; \ + pre-commit run mypy-check --hook-stage manual | grep -v "INFO" + +.PHONY: linecheck +linecheck: ## Checks for all Python lines 100 characters or more + @\ + find dbt -type f -name "*.py" -exec grep -I -r -n '.\{100\}' {} \; + +.PHONY: unit +unit: ## Runs unit tests with py38. + @\ + tox -e py38 + +.PHONY: test +test: ## Runs unit tests with py38 and code checks against staged changes. + @\ + tox -p -e py38; \ + pre-commit run black-check --hook-stage manual | grep -v "INFO"; \ + pre-commit run flake8-check --hook-stage manual | grep -v "INFO"; \ + pre-commit run mypy-check --hook-stage manual | grep -v "INFO" + +.PHONY: clean +clean: ## Removes files ignored by git from the repo. + @echo "cleaning repo" + @git clean -f -X + +.PHONY: help +help: ## Show this help message.
+ @echo 'usage: make [target]' + @echo + @echo 'targets:' + @grep -E '^[7+a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' diff --git a/README.md b/README.md index 037a49895..241d869d7 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ more information, consult [the docs](https://docs.getdbt.com/docs/profile-spark) ## Running locally A `docker-compose` environment starts a Spark Thrift server and a Postgres database as a Hive Metastore backend. -Note that this is spark 2 not spark 3 so some functionalities might not be available. +Note: dbt-spark now supports Spark 3.1.1 (formerly on Spark 2.x). The following command would start two docker containers ``` diff --git a/dbt/adapters/spark/__init__.py b/dbt/adapters/spark/__init__.py index 469e202b9..91ad54768 100644 --- a/dbt/adapters/spark/__init__.py +++ b/dbt/adapters/spark/__init__.py @@ -5,9 +5,8 @@ from dbt.adapters.spark.impl import SparkAdapter from dbt.adapters.base import AdapterPlugin -from dbt.include import spark +from dbt.include import spark # type: ignore Plugin = AdapterPlugin( - adapter=SparkAdapter, - credentials=SparkCredentials, - include_path=spark.PACKAGE_PATH) + adapter=SparkAdapter, credentials=SparkCredentials, include_path=spark.PACKAGE_PATH +) diff --git a/dbt/adapters/spark/__version__.py b/dbt/adapters/spark/__version__.py index a6b977228..70ba273f5 100644 --- a/dbt/adapters/spark/__version__.py +++ b/dbt/adapters/spark/__version__.py @@ -1 +1 @@ -version = "1.2.0a1" +version = "1.4.0a1" diff --git a/dbt/adapters/spark/column.py b/dbt/adapters/spark/column.py index fd377ad15..8100fa450 100644 --- a/dbt/adapters/spark/column.py +++ b/dbt/adapters/spark/column.py @@ -1,15 +1,15 @@ from dataclasses import dataclass -from typing import TypeVar, Optional, Dict, Any +from typing import Any, Dict, Optional, TypeVar, Union from dbt.adapters.base.column import Column from dbt.dataclass_schema import dbtClassMixin from hologram import JsonDict -Self = TypeVar('Self', bound='SparkColumn') +Self = TypeVar("Self", bound="SparkColumn") @dataclass -class SparkColumn(dbtClassMixin, Column): +class SparkColumn(dbtClassMixin, Column): # type: ignore table_database: Optional[str] = None table_schema: Optional[str] = None table_name: Optional[str] = None @@ -22,7 +22,7 @@ class SparkColumn(dbtClassMixin, Column): def translate_type(cls, dtype: str) -> str: return dtype - def can_expand_to(self: Self, other_column: Self) -> bool: + def can_expand_to(self: Self, other_column: Self) -> bool: # type: ignore """returns True if both columns are strings""" return self.is_string() and other_column.is_string() @@ -31,37 +31,42 @@ def literal(self, value): @property def quoted(self) -> str: - return '`{}`'.format(self.column) + return "`{}`".format(self.column) @property def data_type(self) -> str: return self.dtype + @classmethod + def numeric_type(cls, dtype: str, precision: Any, scale: Any) -> str: + # SparkSQL does not support 'numeric' or 'number', only 'decimal' + if precision is None or scale is None: + return "decimal" + else: + return "{}({},{})".format("decimal", precision, scale) + def __repr__(self) -> str: return "<SparkColumn {} ({})>".format(self.name, self.data_type) @staticmethod def convert_table_stats(raw_stats: Optional[str]) -> Dict[str, Any]: - table_stats = {} + table_stats: Dict[str, Union[int, str, bool]] = {} if raw_stats: # format: 1109049927 bytes, 14093476 rows stats = { - stats.split(" ")[1]: int(stats.split(" ")[0]) - for stats in 
raw_stats.split(', ') + stats.split(" ")[1]: int(stats.split(" ")[0]) for stats in raw_stats.split(", ") } for key, val in stats.items(): - table_stats[f'stats:{key}:label'] = key - table_stats[f'stats:{key}:value'] = val - table_stats[f'stats:{key}:description'] = '' - table_stats[f'stats:{key}:include'] = True + table_stats[f"stats:{key}:label"] = key + table_stats[f"stats:{key}:value"] = val + table_stats[f"stats:{key}:description"] = "" + table_stats[f"stats:{key}:include"] = True return table_stats - def to_column_dict( - self, omit_none: bool = True, validate: bool = False - ) -> JsonDict: + def to_column_dict(self, omit_none: bool = True, validate: bool = False) -> JsonDict: original_dict = self.to_dict(omit_none=omit_none) # If there are stats, merge them into the root of the dict - original_stats = original_dict.pop('table_stats', None) + original_stats = original_dict.pop("table_stats", None) if original_stats: original_dict.update(original_stats) return original_dict diff --git a/dbt/adapters/spark/connections.py b/dbt/adapters/spark/connections.py index 11163ccf0..66ca93d30 100644 --- a/dbt/adapters/spark/connections.py +++ b/dbt/adapters/spark/connections.py @@ -26,6 +26,7 @@ from hologram.helpers import StrEnum from dataclasses import dataclass, field from typing import Any, Dict, Optional + try: from thrift.transport.TSSLSocket import TSSLSocket import thrift @@ -33,11 +34,7 @@ import sasl import thrift_sasl except ImportError: - TSSLSocket = None - thrift = None - ssl = None - sasl = None - thrift_sasl = None + pass # done deliberately: setting modules to None explicitly violates MyPy contracts by degrading type semantics import base64 import time @@ -52,26 +49,27 @@ def _build_odbc_connnection_string(**kwargs) -> str: class SparkConnectionMethod(StrEnum): - THRIFT = 'thrift' - HTTP = 'http' - ODBC = 'odbc' - SESSION = 'session' + THRIFT = "thrift" + HTTP = "http" + ODBC = "odbc" + SESSION = "session" @dataclass class SparkCredentials(Credentials): host: str method: SparkConnectionMethod - database: Optional[str] + database: Optional[str] # type: ignore driver: Optional[str] = None cluster: Optional[str] = None endpoint: Optional[str] = None token: Optional[str] = None user: Optional[str] = None + password: Optional[str] = None port: int = 443 auth: Optional[str] = None kerberos_service_name: Optional[str] = None - organization: str = '0' + organization: str = "0" connect_retries: int = 0 connect_timeout: int = 10 use_ssl: bool = False @@ -81,27 +79,28 @@ class SparkCredentials(Credentials): @classmethod def __pre_deserialize__(cls, data): data = super().__pre_deserialize__(data) - if 'database' not in data: - data['database'] = None + if "database" not in data: + data["database"] = None return data + @property + def cluster_id(self): + return self.cluster + def __post_init__(self): # spark classifies database and schema as the same thing - if ( - self.database is not None and - self.database != self.schema - ): + if self.database is not None and self.database != self.schema: raise dbt.exceptions.RuntimeException( - f' schema: {self.schema} \n' - f' database: {self.database} \n' - f'On Spark, database must be omitted or have the same value as' - f' schema.' + f" schema: {self.schema} \n" + f" database: {self.database} \n" + f"On Spark, database must be omitted or have the same value as" + f" schema." 
) self.database = None if self.method == SparkConnectionMethod.ODBC: try: - import pyodbc # noqa: F401 + import pyodbc # noqa: F401 except ImportError as e: raise dbt.exceptions.RuntimeException( f"{self.method} connection method requires " @@ -111,22 +110,16 @@ def __post_init__(self): f"ImportError({e.msg})" ) from e - if ( - self.method == SparkConnectionMethod.ODBC and - self.cluster and - self.endpoint - ): + if self.method == SparkConnectionMethod.ODBC and self.cluster and self.endpoint: raise dbt.exceptions.RuntimeException( "`cluster` and `endpoint` cannot both be set when" f" using {self.method} method to connect to Spark" ) if ( - self.method == SparkConnectionMethod.HTTP or - self.method == SparkConnectionMethod.THRIFT - ) and not ( - ThriftState and THttpClient and hive - ): + self.method == SparkConnectionMethod.HTTP + or self.method == SparkConnectionMethod.THRIFT + ) and not (ThriftState and THttpClient and hive): raise dbt.exceptions.RuntimeException( f"{self.method} connection method requires " "additional dependencies. \n" @@ -148,19 +141,19 @@ def __post_init__(self): @property def type(self): - return 'spark' + return "spark" @property def unique_field(self): return self.host def _connection_keys(self): - return ('host', 'port', 'cluster', - 'endpoint', 'schema', 'organization') + return ("host", "port", "cluster", "endpoint", "schema", "organization") class PyhiveConnectionWrapper(object): """Wrap a Spark connection in a way that no-ops transactions""" + # https://forums.databricks.com/questions/2157/in-apache-spark-sql-can-we-roll-back-the-transacti.html # noqa def __init__(self, handle): @@ -178,9 +171,7 @@ def cancel(self): try: self._cursor.cancel() except EnvironmentError as exc: - logger.debug( - "Exception while cancelling query: {}".format(exc) - ) + logger.debug("Exception while cancelling query: {}".format(exc)) def close(self): if self._cursor: @@ -189,9 +180,7 @@ def close(self): try: self._cursor.close() except EnvironmentError as exc: - logger.debug( - "Exception while closing cursor: {}".format(exc) - ) + logger.debug("Exception while closing cursor: {}".format(exc)) self.handle.close() def rollback(self, *args, **kwargs): @@ -247,23 +236,20 @@ def execute(self, sql, bindings=None): dbt.exceptions.raise_database_error(poll_state.errorMessage) elif state not in STATE_SUCCESS: - status_type = ThriftState._VALUES_TO_NAMES.get( - state, - 'Unknown<{!r}>'.format(state)) + status_type = ThriftState._VALUES_TO_NAMES.get(state, "Unknown<{!r}>".format(state)) - dbt.exceptions.raise_database_error( - "Query failed with status: {}".format(status_type)) + dbt.exceptions.raise_database_error("Query failed with status: {}".format(status_type)) logger.debug("Poll status: {}, query complete".format(state)) @classmethod def _fix_binding(cls, value): """Convert complex datatypes to primitives that can be loaded by - the Spark driver""" + the Spark driver""" if isinstance(value, NUMBERS): return float(value) elif isinstance(value, datetime): - return value.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + return value.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] else: return value @@ -273,7 +259,6 @@ def description(self): class PyodbcConnectionWrapper(PyhiveConnectionWrapper): - def execute(self, sql, bindings=None): if sql.strip().endswith(";"): sql = sql.strip()[:-1] @@ -282,19 +267,17 @@ def execute(self, sql, bindings=None): self._cursor.execute(sql) else: # pyodbc only supports `qmark` sql params! 
- query = sqlparams.SQLParams('format', 'qmark') + query = sqlparams.SQLParams("format", "qmark") sql, bindings = query.format(sql, bindings) self._cursor.execute(sql, *bindings) class SparkConnectionManager(SQLConnectionManager): - TYPE = 'spark' + TYPE = "spark" SPARK_CLUSTER_HTTP_PATH = "/sql/protocolv1/o/{organization}/{cluster}" SPARK_SQL_ENDPOINT_HTTP_PATH = "/sql/1.0/endpoints/{endpoint}" - SPARK_CONNECTION_URL = ( - "{host}:{port}" + SPARK_CLUSTER_HTTP_PATH - ) + SPARK_CONNECTION_URL = "{host}:{port}" + SPARK_CLUSTER_HTTP_PATH @contextmanager def exception_handler(self, sql): @@ -308,7 +291,7 @@ def exception_handler(self, sql): raise thrift_resp = exc.args[0] - if hasattr(thrift_resp, 'status'): + if hasattr(thrift_resp, "status"): msg = thrift_resp.status.errorMessage raise dbt.exceptions.RuntimeException(msg) else: @@ -320,10 +303,8 @@ def cancel(self, connection): @classmethod def get_response(cls, cursor) -> AdapterResponse: # https://github.com/dbt-labs/dbt-spark/issues/142 - message = 'OK' - return AdapterResponse( - _message=message - ) + message = "OK" + return AdapterResponse(_message=message) # No transactions on Spark.... def add_begin_query(self, *args, **kwargs): @@ -346,12 +327,13 @@ def validate_creds(cls, creds, required): if not hasattr(creds, key): raise dbt.exceptions.DbtProfileError( "The config '{}' is required when using the {} method" - " to connect to Spark".format(key, method)) + " to connect to Spark".format(key, method) + ) @classmethod def open(cls, connection): if connection.state == ConnectionState.OPEN: - logger.debug('Connection is already open, skipping open.') + logger.debug("Connection is already open, skipping open.") return connection creds = connection.credentials @@ -360,19 +342,18 @@ def open(cls, connection): for i in range(1 + creds.connect_retries): try: if creds.method == SparkConnectionMethod.HTTP: - cls.validate_creds(creds, ['token', 'host', 'port', - 'cluster', 'organization']) + cls.validate_creds(creds, ["token", "host", "port", "cluster", "organization"]) # Prepend https:// if it is missing host = creds.host - if not host.startswith('https://'): - host = 'https://' + creds.host + if not host.startswith("https://"): + host = "https://" + creds.host conn_url = cls.SPARK_CONNECTION_URL.format( host=host, port=creds.port, organization=creds.organization, - cluster=creds.cluster + cluster=creds.cluster, ) logger.debug("connection url: {}".format(conn_url)) @@ -381,15 +362,12 @@ def open(cls, connection): raw_token = "token:{}".format(creds.token).encode() token = base64.standard_b64encode(raw_token).decode() - transport.setCustomHeaders({ - 'Authorization': 'Basic {}'.format(token) - }) + transport.setCustomHeaders({"Authorization": "Basic {}".format(token)}) conn = hive.connect(thrift_transport=transport) handle = PyhiveConnectionWrapper(conn) elif creds.method == SparkConnectionMethod.THRIFT: - cls.validate_creds(creds, - ['host', 'port', 'user', 'schema']) + cls.validate_creds(creds, ["host", "port", "user", "schema"]) if creds.use_ssl: transport = build_ssl_transport( @@ -397,26 +375,35 @@ def open(cls, connection): port=creds.port, username=creds.user, auth=creds.auth, - kerberos_service_name=creds.kerberos_service_name) + kerberos_service_name=creds.kerberos_service_name, + password=creds.password, + ) conn = hive.connect(thrift_transport=transport) else: - conn = hive.connect(host=creds.host, - port=creds.port, - username=creds.user, - auth=creds.auth, - kerberos_service_name=creds.kerberos_service_name) # noqa + conn = 
hive.connect( + host=creds.host, + port=creds.port, + username=creds.user, + auth=creds.auth, + kerberos_service_name=creds.kerberos_service_name, + password=creds.password, + ) # noqa handle = PyhiveConnectionWrapper(conn) elif creds.method == SparkConnectionMethod.ODBC: if creds.cluster is not None: - required_fields = ['driver', 'host', 'port', 'token', - 'organization', 'cluster'] + required_fields = [ + "driver", + "host", + "port", + "token", + "organization", + "cluster", + ] http_path = cls.SPARK_CLUSTER_HTTP_PATH.format( - organization=creds.organization, - cluster=creds.cluster + organization=creds.organization, cluster=creds.cluster ) elif creds.endpoint is not None: - required_fields = ['driver', 'host', 'port', 'token', - 'endpoint'] + required_fields = ["driver", "host", "port", "token", "endpoint"] http_path = cls.SPARK_SQL_ENDPOINT_HTTP_PATH.format( endpoint=creds.endpoint ) @@ -429,13 +416,12 @@ def open(cls, connection): cls.validate_creds(creds, required_fields) dbt_spark_version = __version__.version - user_agent_entry = f"dbt-labs-dbt-spark/{dbt_spark_version} (Databricks)" # noqa + user_agent_entry = ( + f"dbt-labs-dbt-spark/{dbt_spark_version} (Databricks)" # noqa + ) # http://simba.wpengine.com/products/Spark/doc/ODBC_InstallGuide/unix/content/odbc/hi/configuring/serverside.htm - ssp = { - f"SSP_{k}": f"{{{v}}}" - for k, v in creds.server_side_parameters.items() - } + ssp = {f"SSP_{k}": f"{{{v}}}" for k, v in creds.server_side_parameters.items()} # https://www.simba.com/products/Spark/doc/v2/ODBC_InstallGuide/unix/content/odbc/options/driver.htm connection_str = _build_odbc_connnection_string( @@ -461,6 +447,7 @@ def open(cls, connection): Connection, SessionConnectionWrapper, ) + handle = SessionConnectionWrapper(Connection()) else: raise dbt.exceptions.DbtProfileError( @@ -472,9 +459,9 @@ def open(cls, connection): if isinstance(e, EOFError): # The user almost certainly has invalid credentials. # Perhaps a token expired, or something - msg = 'Failed to connect' + msg = "Failed to connect" if creds.token is not None: - msg += ', is your token valid?' + msg += ", is your token valid?" 
raise dbt.exceptions.FailedToConnectException(msg) from e retryable_message = _is_retryable_error(e) if retryable_message and creds.connect_retries > 0: @@ -496,9 +483,7 @@ def open(cls, connection): logger.warning(msg) time.sleep(creds.connect_timeout) else: - raise dbt.exceptions.FailedToConnectException( - 'failed to connect' - ) from e + raise dbt.exceptions.FailedToConnectException("failed to connect") from e else: raise exc @@ -507,56 +492,50 @@ def open(cls, connection): return connection -def build_ssl_transport(host, port, username, auth, - kerberos_service_name, password=None): +def build_ssl_transport(host, port, username, auth, kerberos_service_name, password=None): transport = None if port is None: port = 10000 if auth is None: - auth = 'NONE' + auth = "NONE" socket = TSSLSocket(host, port, cert_reqs=ssl.CERT_NONE) - if auth == 'NOSASL': + if auth == "NOSASL": # NOSASL corresponds to hive.server2.authentication=NOSASL # in hive-site.xml transport = thrift.transport.TTransport.TBufferedTransport(socket) - elif auth in ('LDAP', 'KERBEROS', 'NONE', 'CUSTOM'): + elif auth in ("LDAP", "KERBEROS", "NONE", "CUSTOM"): # Defer import so package dependency is optional - if auth == 'KERBEROS': + if auth == "KERBEROS": # KERBEROS mode in hive.server2.authentication is GSSAPI # in sasl library - sasl_auth = 'GSSAPI' + sasl_auth = "GSSAPI" else: - sasl_auth = 'PLAIN' + sasl_auth = "PLAIN" if password is None: # Password doesn't matter in NONE mode, just needs # to be nonempty. - password = 'x' + password = "x" def sasl_factory(): sasl_client = sasl.Client() - sasl_client.setAttr('host', host) - if sasl_auth == 'GSSAPI': - sasl_client.setAttr('service', kerberos_service_name) - elif sasl_auth == 'PLAIN': - sasl_client.setAttr('username', username) - sasl_client.setAttr('password', password) + sasl_client.setAttr("host", host) + if sasl_auth == "GSSAPI": + sasl_client.setAttr("service", kerberos_service_name) + elif sasl_auth == "PLAIN": + sasl_client.setAttr("username", username) + sasl_client.setAttr("password", password) else: raise AssertionError sasl_client.init() return sasl_client - transport = thrift_sasl.TSaslClientTransport(sasl_factory, - sasl_auth, socket) + transport = thrift_sasl.TSaslClientTransport(sasl_factory, sasl_auth, socket) return transport -def _is_retryable_error(exc: Exception) -> Optional[str]: - message = getattr(exc, 'message', None) - if message is None: - return None - message = message.lower() - if 'pending' in message: - return exc.message - if 'temporarily_unavailable' in message: - return exc.message - return None +def _is_retryable_error(exc: Exception) -> str: + message = str(exc).lower() + if "pending" in message or "temporarily_unavailable" in message: + return str(exc) + else: + return "" diff --git a/dbt/adapters/spark/impl.py b/dbt/adapters/spark/impl.py index eb001fbc9..6eff652eb 100644 --- a/dbt/adapters/spark/impl.py +++ b/dbt/adapters/spark/impl.py @@ -1,19 +1,26 @@ import re from concurrent.futures import Future from dataclasses import dataclass -from typing import Optional, List, Dict, Any, Union, Iterable +from typing import Any, Dict, Iterable, List, Optional, Union, Type +from typing_extensions import TypeAlias + import agate from dbt.contracts.relation import RelationType import dbt import dbt.exceptions -from dbt.adapters.base import AdapterConfig +from dbt.adapters.base import AdapterConfig, PythonJobHelper from dbt.adapters.base.impl import catch_as_completed +from dbt.contracts.connection import AdapterResponse from dbt.adapters.sql 
import SQLAdapter from dbt.adapters.spark import SparkConnectionManager from dbt.adapters.spark import SparkRelation from dbt.adapters.spark import SparkColumn +from dbt.adapters.spark.python_submissions import ( + JobClusterPythonJobHelper, + AllPurposeClusterPythonJobHelper, +) from dbt.adapters.base import BaseRelation from dbt.clients.agate_helper import DEFAULT_TYPE_TESTER from dbt.events import AdapterLogger @@ -21,19 +28,19 @@ logger = AdapterLogger("Spark") -GET_COLUMNS_IN_RELATION_MACRO_NAME = 'get_columns_in_relation' -LIST_SCHEMAS_MACRO_NAME = 'list_schemas' -LIST_RELATIONS_MACRO_NAME = 'list_relations_without_caching' -DROP_RELATION_MACRO_NAME = 'drop_relation' -FETCH_TBL_PROPERTIES_MACRO_NAME = 'fetch_tbl_properties' +GET_COLUMNS_IN_RELATION_RAW_MACRO_NAME = "get_columns_in_relation_raw" +LIST_SCHEMAS_MACRO_NAME = "list_schemas" +LIST_RELATIONS_MACRO_NAME = "list_relations_without_caching" +DROP_RELATION_MACRO_NAME = "drop_relation" +FETCH_TBL_PROPERTIES_MACRO_NAME = "fetch_tbl_properties" -KEY_TABLE_OWNER = 'Owner' -KEY_TABLE_STATISTICS = 'Statistics' +KEY_TABLE_OWNER = "Owner" +KEY_TABLE_STATISTICS = "Statistics" @dataclass class SparkConfig(AdapterConfig): - file_format: str = 'parquet' + file_format: str = "parquet" location_root: Optional[str] = None partition_by: Optional[Union[List[str], str]] = None clustered_by: Optional[Union[List[str], str]] = None @@ -44,48 +51,44 @@ class SparkConfig(AdapterConfig): class SparkAdapter(SQLAdapter): COLUMN_NAMES = ( - 'table_database', - 'table_schema', - 'table_name', - 'table_type', - 'table_comment', - 'table_owner', - 'column_name', - 'column_index', - 'column_type', - 'column_comment', - - 'stats:bytes:label', - 'stats:bytes:value', - 'stats:bytes:description', - 'stats:bytes:include', - - 'stats:rows:label', - 'stats:rows:value', - 'stats:rows:description', - 'stats:rows:include', + "table_database", + "table_schema", + "table_name", + "table_type", + "table_comment", + "table_owner", + "column_name", + "column_index", + "column_type", + "column_comment", + "stats:bytes:label", + "stats:bytes:value", + "stats:bytes:description", + "stats:bytes:include", + "stats:rows:label", + "stats:rows:value", + "stats:rows:description", + "stats:rows:include", ) - INFORMATION_COLUMNS_REGEX = re.compile( - r"^ \|-- (.*): (.*) \(nullable = (.*)\b", re.MULTILINE) + INFORMATION_COLUMNS_REGEX = re.compile(r"^ \|-- (.*): (.*) \(nullable = (.*)\b", re.MULTILINE) INFORMATION_OWNER_REGEX = re.compile(r"^Owner: (.*)$", re.MULTILINE) - INFORMATION_STATISTICS_REGEX = re.compile( - r"^Statistics: (.*)$", re.MULTILINE) + INFORMATION_STATISTICS_REGEX = re.compile(r"^Statistics: (.*)$", re.MULTILINE) HUDI_METADATA_COLUMNS = [ - '_hoodie_commit_time', - '_hoodie_commit_seqno', - '_hoodie_record_key', - '_hoodie_partition_path', - '_hoodie_file_name' + "_hoodie_commit_time", + "_hoodie_commit_seqno", + "_hoodie_record_key", + "_hoodie_partition_path", + "_hoodie_file_name", ] - Relation = SparkRelation - Column = SparkColumn - ConnectionManager = SparkConnectionManager - AdapterSpecificConfigs = SparkConfig + Relation: TypeAlias = SparkRelation + Column: TypeAlias = SparkColumn + ConnectionManager: TypeAlias = SparkConnectionManager + AdapterSpecificConfigs: TypeAlias = SparkConfig @classmethod def date_function(cls) -> str: - return 'current_timestamp()' + return "current_timestamp()" @classmethod def convert_text_type(cls, agate_table, col_idx): @@ -109,31 +112,28 @@ def convert_datetime_type(cls, agate_table, col_idx): return "timestamp" def quote(self, 
identifier): - return '`{}`'.format(identifier) + return "`{}`".format(identifier) def add_schema_to_cache(self, schema) -> str: """Cache a new schema in dbt. It will show up in `list relations`.""" if schema is None: name = self.nice_connection_name() dbt.exceptions.raise_compiler_error( - 'Attempted to cache a null schema for {}'.format(name) + "Attempted to cache a null schema for {}".format(name) ) - if dbt.flags.USE_CACHE: + if dbt.flags.USE_CACHE: # type: ignore self.cache.add_schema(None, schema) # so jinja doesn't render things - return '' + return "" def list_relations_without_caching( self, schema_relation: SparkRelation ) -> List[SparkRelation]: - kwargs = {'schema_relation': schema_relation} + kwargs = {"schema_relation": schema_relation} try: - results = self.execute_macro( - LIST_RELATIONS_MACRO_NAME, - kwargs=kwargs - ) + results = self.execute_macro(LIST_RELATIONS_MACRO_NAME, kwargs=kwargs) except dbt.exceptions.RuntimeException as e: - errmsg = getattr(e, 'msg', '') + errmsg = getattr(e, "msg", "") if f"Database '{schema_relation}' not found" in errmsg: return [] else: @@ -146,13 +146,12 @@ def list_relations_without_caching( if len(row) != 4: raise dbt.exceptions.RuntimeException( f'Invalid value from "show table extended ...", ' - f'got {len(row)} values, expected 4' + f"got {len(row)} values, expected 4" ) _schema, name, _, information = row - rel_type = RelationType.View \ - if 'Type: VIEW' in information else RelationType.Table - is_delta = 'Provider: delta' in information - is_hudi = 'Provider: hudi' in information + rel_type = RelationType.View if "Type: VIEW" in information else RelationType.Table + is_delta = "Provider: delta" in information + is_hudi = "Provider: hudi" in information relation = self.Relation.create( schema=_schema, identifier=name, @@ -165,18 +164,14 @@ def list_relations_without_caching( return relations - def get_relation( - self, database: str, schema: str, identifier: str - ) -> Optional[BaseRelation]: + def get_relation(self, database: str, schema: str, identifier: str) -> Optional[BaseRelation]: if not self.Relation.include_policy.database: - database = None + database = None # type: ignore return super().get_relation(database, schema, identifier) def parse_describe_extended( - self, - relation: Relation, - raw_rows: List[agate.Row] + self, relation: Relation, raw_rows: List[agate.Row] ) -> List[SparkColumn]: # Convert the Row to a dict dict_rows = [dict(zip(row._keys, row._values)) for row in raw_rows] @@ -185,83 +180,61 @@ def parse_describe_extended( pos = self.find_table_information_separator(dict_rows) # Remove rows that start with a hash, they are comments - rows = [ - row for row in raw_rows[0:pos] - if not row['col_name'].startswith('#') - ] - metadata = { - col['col_name']: col['data_type'] for col in raw_rows[pos + 1:] - } + rows = [row for row in raw_rows[0:pos] if not row["col_name"].startswith("#")] + metadata = {col["col_name"]: col["data_type"] for col in raw_rows[pos + 1 :]} raw_table_stats = metadata.get(KEY_TABLE_STATISTICS) table_stats = SparkColumn.convert_table_stats(raw_table_stats) - return [SparkColumn( - table_database=None, - table_schema=relation.schema, - table_name=relation.name, - table_type=relation.type, - table_owner=str(metadata.get(KEY_TABLE_OWNER)), - table_stats=table_stats, - column=column['col_name'], - column_index=idx, - dtype=column['data_type'], - ) for idx, column in enumerate(rows)] + return [ + SparkColumn( + table_database=None, + table_schema=relation.schema, + table_name=relation.name, + 
table_type=relation.type, + table_owner=str(metadata.get(KEY_TABLE_OWNER)), + table_stats=table_stats, + column=column["col_name"], + column_index=idx, + dtype=column["data_type"], + ) + for idx, column in enumerate(rows) + ] @staticmethod def find_table_information_separator(rows: List[dict]) -> int: pos = 0 for row in rows: - if not row['col_name'] or row['col_name'].startswith('#'): + if not row["col_name"] or row["col_name"].startswith("#"): break pos += 1 return pos def get_columns_in_relation(self, relation: Relation) -> List[SparkColumn]: - cached_relations = self.cache.get_relations( - relation.database, relation.schema) - cached_relation = next((cached_relation - for cached_relation in cached_relations - if str(cached_relation) == str(relation)), - None) columns = [] - if cached_relation and cached_relation.information: - columns = self.parse_columns_from_information(cached_relation) - if not columns: - # in open source delta 'show table extended' query output doesnt - # return relation's schema. if columns are empty from cache, - # use get_columns_in_relation spark macro - # which would execute 'describe extended tablename' query - try: - rows: List[agate.Row] = super().get_columns_in_relation(relation) - columns = self.parse_describe_extended(relation, rows) - except dbt.exceptions.RuntimeException as e: - # spark would throw error when table doesn't exist, where other - # CDW would just return and empty list, normalizing the behavior here - errmsg = getattr(e, "msg", "") - if ( - "Table or view not found" in errmsg or - "NoSuchTableException" in errmsg - ): - pass - else: - raise e + try: + rows: List[agate.Row] = self.execute_macro( + GET_COLUMNS_IN_RELATION_RAW_MACRO_NAME, kwargs={"relation": relation} + ) + columns = self.parse_describe_extended(relation, rows) + except dbt.exceptions.RuntimeException as e: + # spark would throw error when table doesn't exist, where other + # CDW would just return and empty list, normalizing the behavior here + errmsg = getattr(e, "msg", "") + if "Table or view not found" in errmsg or "NoSuchTableException" in errmsg: + pass + else: + raise e # strip hudi metadata columns. 
- columns = [x for x in columns - if x.name not in self.HUDI_METADATA_COLUMNS] + columns = [x for x in columns if x.name not in self.HUDI_METADATA_COLUMNS] return columns - def parse_columns_from_information( - self, relation: SparkRelation - ) -> List[SparkColumn]: - owner_match = re.findall( - self.INFORMATION_OWNER_REGEX, relation.information) + def parse_columns_from_information(self, relation: SparkRelation) -> List[SparkColumn]: + owner_match = re.findall(self.INFORMATION_OWNER_REGEX, relation.information) owner = owner_match[0] if owner_match else None - matches = re.finditer( - self.INFORMATION_COLUMNS_REGEX, relation.information) + matches = re.finditer(self.INFORMATION_COLUMNS_REGEX, relation.information) columns = [] - stats_match = re.findall( - self.INFORMATION_STATISTICS_REGEX, relation.information) + stats_match = re.findall(self.INFORMATION_STATISTICS_REGEX, relation.information) raw_table_stats = stats_match[0] if stats_match else None table_stats = SparkColumn.convert_table_stats(raw_table_stats) for match_num, match in enumerate(matches): @@ -275,28 +248,25 @@ def parse_columns_from_information( table_owner=owner, column=column_name, dtype=column_type, - table_stats=table_stats + table_stats=table_stats, ) columns.append(column) return columns - def _get_columns_for_catalog( - self, relation: SparkRelation - ) -> Iterable[Dict[str, Any]]: + def _get_columns_for_catalog(self, relation: SparkRelation) -> Iterable[Dict[str, Any]]: columns = self.parse_columns_from_information(relation) for column in columns: # convert SparkColumns into catalog dicts as_dict = column.to_column_dict() - as_dict['column_name'] = as_dict.pop('column', None) - as_dict['column_type'] = as_dict.pop('dtype') - as_dict['table_database'] = None + as_dict["column_name"] = as_dict.pop("column", None) + as_dict["column_type"] = as_dict.pop("dtype") + as_dict["table_database"] = None yield as_dict def get_properties(self, relation: Relation) -> Dict[str, str]: properties = self.execute_macro( - FETCH_TBL_PROPERTIES_MACRO_NAME, - kwargs={'relation': relation} + FETCH_TBL_PROPERTIES_MACRO_NAME, kwargs={"relation": relation} ) return dict(properties) @@ -304,28 +274,35 @@ def get_catalog(self, manifest): schema_map = self._get_catalog_schemas(manifest) if len(schema_map) > 1: dbt.exceptions.raise_compiler_error( - f'Expected only one database in get_catalog, found ' - f'{list(schema_map)}' + f"Expected only one database in get_catalog, found " f"{list(schema_map)}" ) with executor(self.config) as tpe: futures: List[Future[agate.Table]] = [] for info, schemas in schema_map.items(): for schema in schemas: - futures.append(tpe.submit_connected( - self, schema, - self._get_one_catalog, info, [schema], manifest - )) + futures.append( + tpe.submit_connected( + self, + schema, + self._get_one_catalog, + info, + [schema], + manifest, + ) + ) catalogs, exceptions = catch_as_completed(futures) return catalogs, exceptions def _get_one_catalog( - self, information_schema, schemas, manifest, + self, + information_schema, + schemas, + manifest, ) -> agate.Table: if len(schemas) != 1: dbt.exceptions.raise_compiler_error( - f'Expected only one schema in spark _get_one_catalog, found ' - f'{schemas}' + f"Expected only one schema in spark _get_one_catalog, found " f"{schemas}" ) database = information_schema.database @@ -335,15 +312,10 @@ def _get_one_catalog( for relation in self.list_relations(database, schema): logger.debug("Getting table schema for relation {}", relation) 
columns.extend(self._get_columns_for_catalog(relation)) - return agate.Table.from_object( - columns, column_types=DEFAULT_TYPE_TESTER - ) + return agate.Table.from_object(columns, column_types=DEFAULT_TYPE_TESTER) def check_schema_exists(self, database, schema): - results = self.execute_macro( - LIST_SCHEMAS_MACRO_NAME, - kwargs={'database': database} - ) + results = self.execute_macro(LIST_SCHEMAS_MACRO_NAME, kwargs={"database": database}) exists = True if schema in [row[0] for row in results] else False return exists @@ -353,7 +325,7 @@ def get_rows_different_sql( relation_a: BaseRelation, relation_b: BaseRelation, column_names: Optional[List[str]] = None, - except_operator: str = 'EXCEPT', + except_operator: str = "EXCEPT", ) -> str: """Generate SQL for a query that returns a single row with a two columns: the number of rows that are different between the two @@ -366,7 +338,7 @@ def get_rows_different_sql( names = sorted((self.quote(c.name) for c in columns)) else: names = sorted((self.quote(n) for n in column_names)) - columns_csv = ', '.join(names) + columns_csv = ", ".join(names) sql = COLUMNS_EQUAL_SQL.format( columns=columns_csv, @@ -384,7 +356,7 @@ def run_sql_for_tests(self, sql, fetch, conn): try: cursor.execute(sql) if fetch == "one": - if hasattr(cursor, 'fetchone'): + if hasattr(cursor, "fetchone"): return cursor.fetchone() else: # AttributeError: 'PyhiveConnectionWrapper' object has no attribute 'fetchone' @@ -400,13 +372,44 @@ def run_sql_for_tests(self, sql, fetch, conn): finally: conn.transaction_open = False + def generate_python_submission_response(self, submission_result: Any) -> AdapterResponse: + return self.connections.get_response(None) + + @property + def default_python_submission_method(self) -> str: + return "all_purpose_cluster" + + @property + def python_submission_helpers(self) -> Dict[str, Type[PythonJobHelper]]: + return { + "job_cluster": JobClusterPythonJobHelper, + "all_purpose_cluster": AllPurposeClusterPythonJobHelper, + } + + def standardize_grants_dict(self, grants_table: agate.Table) -> dict: + grants_dict: Dict[str, List[str]] = {} + for row in grants_table: + grantee = row["Principal"] + privilege = row["ActionType"] + object_type = row["ObjectType"] + + # we only want to consider grants on this object + # (view or table both appear as 'TABLE') + # and we don't want to consider the OWN privilege + if object_type == "TABLE" and privilege != "OWN": + if privilege in grants_dict.keys(): + grants_dict[privilege].append(grantee) + else: + grants_dict.update({privilege: [grantee]}) + return grants_dict + # spark does something interesting with joins when both tables have the same # static values for the join condition and complains that the join condition is # "trivial". Which is true, though it seems like an unreasonable cause for # failure! It also doesn't like the `from foo, bar` syntax as opposed to # `from foo cross join bar`. 
-COLUMNS_EQUAL_SQL = ''' +COLUMNS_EQUAL_SQL = """ with diff_count as ( SELECT 1 as id, @@ -433,4 +436,4 @@ def run_sql_for_tests(self, sql, fetch, conn): diff_count.num_missing as num_mismatched from row_count_diff cross join diff_count -'''.strip() +""".strip() diff --git a/dbt/adapters/spark/python_submissions.py b/dbt/adapters/spark/python_submissions.py new file mode 100644 index 000000000..1e81c572a --- /dev/null +++ b/dbt/adapters/spark/python_submissions.py @@ -0,0 +1,306 @@ +import base64 +import time +import requests +from typing import Any, Dict +import uuid + +import dbt.exceptions +from dbt.adapters.base import PythonJobHelper +from dbt.adapters.spark import SparkCredentials +from dbt.adapters.spark import __version__ + +DEFAULT_POLLING_INTERVAL = 10 +SUBMISSION_LANGUAGE = "python" +DEFAULT_TIMEOUT = 60 * 60 * 24 +DBT_SPARK_VERSION = __version__.version + + +class BaseDatabricksHelper(PythonJobHelper): + def __init__(self, parsed_model: Dict, credentials: SparkCredentials) -> None: + self.credentials = credentials + self.identifier = parsed_model["alias"] + self.schema = parsed_model["schema"] + self.parsed_model = parsed_model + self.timeout = self.get_timeout() + self.polling_interval = DEFAULT_POLLING_INTERVAL + self.check_credentials() + self.auth_header = { + "Authorization": f"Bearer {self.credentials.token}", + "User-Agent": f"dbt-labs-dbt-spark/{DBT_SPARK_VERSION} (Databricks)", + } + + @property + def cluster_id(self) -> str: + return self.parsed_model["config"].get("cluster_id", self.credentials.cluster_id) + + def get_timeout(self) -> int: + timeout = self.parsed_model["config"].get("timeout", DEFAULT_TIMEOUT) + if timeout <= 0: + raise ValueError("Timeout must be a positive integer") + return timeout + + def check_credentials(self) -> None: + raise NotImplementedError( + "Overwrite this method to check specific requirement for current submission method" + ) + + def _create_work_dir(self, path: str) -> None: + response = requests.post( + f"https://{self.credentials.host}/api/2.0/workspace/mkdirs", + headers=self.auth_header, + json={ + "path": path, + }, + ) + if response.status_code != 200: + raise dbt.exceptions.RuntimeException( + f"Error creating work_dir for python notebooks\n {response.content!r}" + ) + + def _upload_notebook(self, path: str, compiled_code: str) -> None: + b64_encoded_content = base64.b64encode(compiled_code.encode()).decode() + response = requests.post( + f"https://{self.credentials.host}/api/2.0/workspace/import", + headers=self.auth_header, + json={ + "path": path, + "content": b64_encoded_content, + "language": "PYTHON", + "overwrite": True, + "format": "SOURCE", + }, + ) + if response.status_code != 200: + raise dbt.exceptions.RuntimeException( + f"Error creating python notebook.\n {response.content!r}" + ) + + def _submit_job(self, path: str, cluster_spec: dict) -> str: + job_spec = { + "run_name": f"{self.schema}-{self.identifier}-{uuid.uuid4()}", + "notebook_task": { + "notebook_path": path, + }, + } + job_spec.update(cluster_spec) # updates 'new_cluster' config + # PYPI packages + packages = self.parsed_model["config"].get("packages", []) + # additional format of packages + additional_libs = self.parsed_model["config"].get("additional_libs", []) + libraries = [] + for package in packages: + libraries.append({"pypi": {"package": package}}) + for lib in additional_libs: + libraries.append(lib) + job_spec.update({"libraries": libraries}) # type: ignore + submit_response = requests.post( + 
f"https://{self.credentials.host}/api/2.1/jobs/runs/submit", + headers=self.auth_header, + json=job_spec, + ) + if submit_response.status_code != 200: + raise dbt.exceptions.RuntimeException( + f"Error creating python run.\n {submit_response.content!r}" + ) + return submit_response.json()["run_id"] + + def _submit_through_notebook(self, compiled_code: str, cluster_spec: dict) -> None: + # it is safe to call mkdirs even if dir already exists and have content inside + work_dir = f"/Shared/dbt_python_model/{self.schema}/" + self._create_work_dir(work_dir) + # add notebook + whole_file_path = f"{work_dir}{self.identifier}" + self._upload_notebook(whole_file_path, compiled_code) + + # submit job + run_id = self._submit_job(whole_file_path, cluster_spec) + + self.polling( + status_func=requests.get, + status_func_kwargs={ + "url": f"https://{self.credentials.host}/api/2.1/jobs/runs/get?run_id={run_id}", + "headers": self.auth_header, + }, + get_state_func=lambda response: response.json()["state"]["life_cycle_state"], + terminal_states=("TERMINATED", "SKIPPED", "INTERNAL_ERROR"), + expected_end_state="TERMINATED", + get_state_msg_func=lambda response: response.json()["state"]["state_message"], + ) + + # get end state to return to user + run_output = requests.get( + f"https://{self.credentials.host}" f"/api/2.1/jobs/runs/get-output?run_id={run_id}", + headers=self.auth_header, + ) + json_run_output = run_output.json() + result_state = json_run_output["metadata"]["state"]["result_state"] + if result_state != "SUCCESS": + raise dbt.exceptions.RuntimeException( + "Python model failed with traceback as:\n" + "(Note that the line number here does not " + "match the line number in your code due to dbt templating)\n" + f"{json_run_output['error_trace']}" + ) + + def submit(self, compiled_code: str) -> None: + raise NotImplementedError( + "BasePythonJobHelper is an abstract class and you should implement submit method." + ) + + def polling( + self, + status_func, + status_func_kwargs, + get_state_func, + terminal_states, + expected_end_state, + get_state_msg_func, + ) -> Dict: + state = None + start = time.time() + exceeded_timeout = False + response = {} + while state not in terminal_states: + if time.time() - start > self.timeout: + exceeded_timeout = True + break + # should we do exponential backoff? 
+ time.sleep(self.polling_interval) + response = status_func(**status_func_kwargs) + state = get_state_func(response) + if exceeded_timeout: + raise dbt.exceptions.RuntimeException("python model run timed out") + if state != expected_end_state: + raise dbt.exceptions.RuntimeException( + "python model run ended in state" + f"{state} with state_message\n{get_state_msg_func(response)}" + ) + return response + + +class JobClusterPythonJobHelper(BaseDatabricksHelper): + def check_credentials(self) -> None: + if not self.parsed_model["config"].get("job_cluster_config", None): + raise ValueError("job_cluster_config is required for commands submission method.") + + def submit(self, compiled_code: str) -> None: + cluster_spec = {"new_cluster": self.parsed_model["config"]["job_cluster_config"]} + self._submit_through_notebook(compiled_code, cluster_spec) + + +class DBContext: + def __init__(self, credentials: SparkCredentials, cluster_id: str, auth_header: dict) -> None: + self.auth_header = auth_header + self.cluster_id = cluster_id + self.host = credentials.host + + def create(self) -> str: + # https://docs.databricks.com/dev-tools/api/1.2/index.html#create-an-execution-context + response = requests.post( + f"https://{self.host}/api/1.2/contexts/create", + headers=self.auth_header, + json={ + "clusterId": self.cluster_id, + "language": SUBMISSION_LANGUAGE, + }, + ) + if response.status_code != 200: + raise dbt.exceptions.RuntimeException( + f"Error creating an execution context.\n {response.content!r}" + ) + return response.json()["id"] + + def destroy(self, context_id: str) -> str: + # https://docs.databricks.com/dev-tools/api/1.2/index.html#delete-an-execution-context + response = requests.post( + f"https://{self.host}/api/1.2/contexts/destroy", + headers=self.auth_header, + json={ + "clusterId": self.cluster_id, + "contextId": context_id, + }, + ) + if response.status_code != 200: + raise dbt.exceptions.RuntimeException( + f"Error deleting an execution context.\n {response.content!r}" + ) + return response.json()["id"] + + +class DBCommand: + def __init__(self, credentials: SparkCredentials, cluster_id: str, auth_header: dict) -> None: + self.auth_header = auth_header + self.cluster_id = cluster_id + self.host = credentials.host + + def execute(self, context_id: str, command: str) -> str: + # https://docs.databricks.com/dev-tools/api/1.2/index.html#run-a-command + response = requests.post( + f"https://{self.host}/api/1.2/commands/execute", + headers=self.auth_header, + json={ + "clusterId": self.cluster_id, + "contextId": context_id, + "language": SUBMISSION_LANGUAGE, + "command": command, + }, + ) + if response.status_code != 200: + raise dbt.exceptions.RuntimeException( + f"Error creating a command.\n {response.content!r}" + ) + return response.json()["id"] + + def status(self, context_id: str, command_id: str) -> Dict[str, Any]: + # https://docs.databricks.com/dev-tools/api/1.2/index.html#get-information-about-a-command + response = requests.get( + f"https://{self.host}/api/1.2/commands/status", + headers=self.auth_header, + params={ + "clusterId": self.cluster_id, + "contextId": context_id, + "commandId": command_id, + }, + ) + if response.status_code != 200: + raise dbt.exceptions.RuntimeException( + f"Error getting status of command.\n {response.content!r}" + ) + return response.json() + + +class AllPurposeClusterPythonJobHelper(BaseDatabricksHelper): + def check_credentials(self) -> None: + if not self.cluster_id: + raise ValueError( + "Databricks cluster_id is required for 
all_purpose_cluster submission method with running with notebook." + ) + + def submit(self, compiled_code: str) -> None: + if self.parsed_model["config"].get("create_notebook", False): + self._submit_through_notebook(compiled_code, {"existing_cluster_id": self.cluster_id}) + else: + context = DBContext(self.credentials, self.cluster_id, self.auth_header) + command = DBCommand(self.credentials, self.cluster_id, self.auth_header) + context_id = context.create() + try: + command_id = command.execute(context_id, compiled_code) + # poll until job finish + response = self.polling( + status_func=command.status, + status_func_kwargs={ + "context_id": context_id, + "command_id": command_id, + }, + get_state_func=lambda response: response["status"], + terminal_states=("Cancelled", "Error", "Finished"), + expected_end_state="Finished", + get_state_msg_func=lambda response: response.json()["results"]["data"], + ) + if response["results"]["resultType"] == "error": + raise dbt.exceptions.RuntimeException( + f"Python model failed with traceback as:\n" + f"{response['results']['cause']}" + ) + finally: + context.destroy(context_id) diff --git a/dbt/adapters/spark/relation.py b/dbt/adapters/spark/relation.py index 043cabfa0..249caf0d7 100644 --- a/dbt/adapters/spark/relation.py +++ b/dbt/adapters/spark/relation.py @@ -24,19 +24,19 @@ class SparkIncludePolicy(Policy): class SparkRelation(BaseRelation): quote_policy: SparkQuotePolicy = SparkQuotePolicy() include_policy: SparkIncludePolicy = SparkIncludePolicy() - quote_character: str = '`' + quote_character: str = "`" is_delta: Optional[bool] = None is_hudi: Optional[bool] = None - information: str = None + information: Optional[str] = None def __post_init__(self): if self.database != self.schema and self.database: - raise RuntimeException('Cannot set database in spark!') + raise RuntimeException("Cannot set database in spark!") def render(self): if self.include_policy.database and self.include_policy.schema: raise RuntimeException( - 'Got a spark relation with schema and database set to ' - 'include, but only one can be set' + "Got a spark relation with schema and database set to " + "include, but only one can be set" ) return super().render() diff --git a/dbt/adapters/spark/session.py b/dbt/adapters/spark/session.py index 6010df920..beb77d548 100644 --- a/dbt/adapters/spark/session.py +++ b/dbt/adapters/spark/session.py @@ -4,7 +4,7 @@ import datetime as dt from types import TracebackType -from typing import Any +from typing import Any, List, Optional, Tuple from dbt.events import AdapterLogger from dbt.utils import DECIMALS @@ -25,17 +25,17 @@ class Cursor: """ def __init__(self) -> None: - self._df: DataFrame | None = None - self._rows: list[Row] | None = None + self._df: Optional[DataFrame] = None + self._rows: Optional[List[Row]] = None def __enter__(self) -> Cursor: return self def __exit__( self, - exc_type: type[BaseException] | None, - exc_val: Exception | None, - exc_tb: TracebackType | None, + exc_type: Optional[BaseException], + exc_val: Optional[Exception], + exc_tb: Optional[TracebackType], ) -> bool: self.close() return True @@ -43,13 +43,13 @@ def __exit__( @property def description( self, - ) -> list[tuple[str, str, None, None, None, None, bool]]: + ) -> List[Tuple[str, str, None, None, None, None, bool]]: """ Get the description. Returns ------- - out : list[tuple[str, str, None, None, None, None, bool]] + out : List[Tuple[str, str, None, None, None, None, bool]] The description. 
Source @@ -109,13 +109,13 @@ def execute(self, sql: str, *parameters: Any) -> None: spark_session = SparkSession.builder.enableHiveSupport().getOrCreate() self._df = spark_session.sql(sql) - def fetchall(self) -> list[Row] | None: + def fetchall(self) -> Optional[List[Row]]: """ Fetch all data. Returns ------- - out : list[Row] | None + out : Optional[List[Row]] The rows. Source @@ -126,7 +126,7 @@ def fetchall(self) -> list[Row] | None: self._rows = self._df.collect() return self._rows - def fetchone(self) -> Row | None: + def fetchone(self) -> Optional[Row]: """ Fetch the first output. diff --git a/dbt/include/spark/__init__.py b/dbt/include/spark/__init__.py index 564a3d1e8..b177e5d49 100644 --- a/dbt/include/spark/__init__.py +++ b/dbt/include/spark/__init__.py @@ -1,2 +1,3 @@ import os + PACKAGE_PATH = os.path.dirname(__file__) diff --git a/dbt/include/spark/macros/adapters.sql b/dbt/include/spark/macros/adapters.sql index b7333c5e4..80407dd7b 100644 --- a/dbt/include/spark/macros/adapters.sql +++ b/dbt/include/spark/macros/adapters.sql @@ -118,35 +118,46 @@ {%- endmacro %} -{% macro create_temporary_view(relation, sql) -%} - {{ return(adapter.dispatch('create_temporary_view', 'dbt')(relation, sql)) }} +{% macro create_temporary_view(relation, compiled_code) -%} + {{ return(adapter.dispatch('create_temporary_view', 'dbt')(relation, compiled_code)) }} {%- endmacro -%} -{#-- We can't use temporary tables with `create ... as ()` syntax #} -{% macro spark__create_temporary_view(relation, sql) -%} - create temporary view {{ relation.include(schema=false) }} as - {{ sql }} -{% endmacro %} +{#-- We can't use temporary tables with `create ... as ()` syntax --#} +{% macro spark__create_temporary_view(relation, compiled_code) -%} + create temporary view {{ relation }} as + {{ compiled_code }} +{%- endmacro -%} -{% macro spark__create_table_as(temporary, relation, sql) -%} - {% if temporary -%} - {{ create_temporary_view(relation, sql) }} - {%- else -%} - {% if config.get('file_format', validator=validation.any[basestring]) == 'delta' %} - create or replace table {{ relation }} - {% else %} - create table {{ relation }} - {% endif %} - {{ file_format_clause() }} - {{ options_clause() }} - {{ partition_cols(label="partitioned by") }} - {{ clustered_cols(label="clustered by") }} - {{ location_clause() }} - {{ comment_clause() }} - as - {{ sql }} - {%- endif %} +{%- macro spark__create_table_as(temporary, relation, compiled_code, language='sql') -%} + {%- if language == 'sql' -%} + {%- if temporary -%} + {{ create_temporary_view(relation, compiled_code) }} + {%- else -%} + {% if config.get('file_format', validator=validation.any[basestring]) == 'delta' %} + create or replace table {{ relation }} + {% else %} + create table {{ relation }} + {% endif %} + {{ file_format_clause() }} + {{ options_clause() }} + {{ partition_cols(label="partitioned by") }} + {{ clustered_cols(label="clustered by") }} + {{ location_clause() }} + {{ comment_clause() }} + as + {{ compiled_code }} + {%- endif -%} + {%- elif language == 'python' -%} + {#-- + N.B. Python models _can_ write to temp views HOWEVER they use a different session + and have already expired by the time they need to be used (I.E. in merges for incremental models) + + TODO: Deep dive into spark sessions to see if we can reuse a single session for an entire + dbt invocation. 
+ --#} + {{ py_write_table(compiled_code=compiled_code, target_relation=relation) }} + {%- endif -%} {%- endmacro -%} @@ -169,11 +180,19 @@ {%- endcall -%} {% endmacro %} -{% macro spark__get_columns_in_relation(relation) -%} - {% call statement('get_columns_in_relation', fetch_result=True) %} - describe extended {{ relation.include(schema=(schema is not none)) }} +{% macro get_columns_in_relation_raw(relation) -%} + {{ return(adapter.dispatch('get_columns_in_relation_raw', 'dbt')(relation)) }} +{%- endmacro -%} + +{% macro spark__get_columns_in_relation_raw(relation) -%} + {% call statement('get_columns_in_relation_raw', fetch_result=True) %} + describe extended {{ relation }} {% endcall %} - {% do return(load_result('get_columns_in_relation').table) %} + {% do return(load_result('get_columns_in_relation_raw').table) %} +{% endmacro %} + +{% macro spark__get_columns_in_relation(relation) -%} + {{ return(adapter.get_columns_in_relation(relation)) }} {% endmacro %} {% macro spark__list_relations_without_caching(relation) %} @@ -191,10 +210,6 @@ {{ return(load_result('list_schemas').table) }} {% endmacro %} -{% macro spark__current_timestamp() -%} - current_timestamp() -{%- endmacro %} - {% macro spark__rename_relation(from_relation, to_relation) -%} {% call statement('rename_relation') -%} {% if not from_relation.type %} @@ -232,7 +247,7 @@ {% set comment = column_dict[column_name]['description'] %} {% set escaped_comment = comment | replace('\'', '\\\'') %} {% set comment_query %} - alter table {{ relation }} change column + alter table {{ relation }} change column {{ adapter.quote(column_name) if column_dict[column_name]['quote'] else column_name }} comment '{{ escaped_comment }}'; {% endset %} @@ -245,8 +260,7 @@ {% macro spark__make_temp_relation(base_relation, suffix) %} {% set tmp_identifier = base_relation.identifier ~ suffix %} {% set tmp_relation = base_relation.incorporate(path = { - "identifier": tmp_identifier, - "schema": None + "identifier": tmp_identifier }) -%} {% do return(tmp_relation) %} @@ -261,25 +275,25 @@ {% macro spark__alter_relation_add_remove_columns(relation, add_columns, remove_columns) %} - + {% if remove_columns %} {% set platform_name = 'Delta Lake' if relation.is_delta else 'Apache Spark' %} {{ exceptions.raise_compiler_error(platform_name + ' does not support dropping columns from tables') }} {% endif %} - + {% if add_columns is none %} {% set add_columns = [] %} {% endif %} - + {% set sql -%} - + alter {{ relation.type }} {{ relation }} - + {% if add_columns %} add columns {% endif %} {% for column in add_columns %} {{ column.name }} {{ column.data_type }}{{ ',' if not loop.last }} {% endfor %} - + {%- endset -%} {% do run_query(sql) %} diff --git a/dbt/include/spark/macros/apply_grants.sql b/dbt/include/spark/macros/apply_grants.sql new file mode 100644 index 000000000..49dae95dc --- /dev/null +++ b/dbt/include/spark/macros/apply_grants.sql @@ -0,0 +1,39 @@ +{% macro spark__copy_grants() %} + + {% if config.materialized == 'view' %} + {#-- Spark views don't copy grants when they're replaced --#} + {{ return(False) }} + + {% else %} + {#-- This depends on how we're replacing the table, which depends on its file format + -- Just play it safe by assuming that grants have been copied over, and need to be checked / possibly revoked + -- We can make this more efficient in the future + #} + {{ return(True) }} + + {% endif %} +{% endmacro %} + + +{%- macro spark__get_grant_sql(relation, privilege, grantees) -%} + grant {{ privilege }} on {{ relation }} to {{ 
adapter.quote(grantees[0]) }} +{%- endmacro %} + + +{%- macro spark__get_revoke_sql(relation, privilege, grantees) -%} + revoke {{ privilege }} on {{ relation }} from {{ adapter.quote(grantees[0]) }} +{%- endmacro %} + + +{%- macro spark__support_multiple_grantees_per_dcl_statement() -%} + {{ return(False) }} +{%- endmacro -%} + + +{% macro spark__call_dcl_statements(dcl_statement_list) %} + {% for dcl_statement in dcl_statement_list %} + {% call statement('grant_or_revoke') %} + {{ dcl_statement }} + {% endcall %} + {% endfor %} +{% endmacro %} diff --git a/dbt/include/spark/macros/materializations/incremental/column_helpers.sql b/dbt/include/spark/macros/materializations/incremental/column_helpers.sql new file mode 100644 index 000000000..3eec968d5 --- /dev/null +++ b/dbt/include/spark/macros/materializations/incremental/column_helpers.sql @@ -0,0 +1,23 @@ +{% macro spark__get_merge_update_columns(merge_update_columns, merge_exclude_columns, dest_columns) %} + {%- set default_cols = None -%} + + {%- if merge_update_columns and merge_exclude_columns -%} + {{ exceptions.raise_compiler_error( + 'Model cannot specify merge_update_columns and merge_exclude_columns. Please update model to use only one config' + )}} + {%- elif merge_update_columns -%} + {%- set update_columns = merge_update_columns -%} + {%- elif merge_exclude_columns -%} + {%- set update_columns = [] -%} + {%- for column in dest_columns -%} + {% if column.column | lower not in merge_exclude_columns | map("lower") | list %} + {%- do update_columns.append(column.quoted) -%} + {% endif %} + {%- endfor -%} + {%- else -%} + {%- set update_columns = default_cols -%} + {%- endif -%} + + {{ return(update_columns) }} + +{% endmacro %} diff --git a/dbt/include/spark/macros/materializations/incremental/incremental.sql b/dbt/include/spark/macros/materializations/incremental/incremental.sql index d0b6e89ba..e293441b8 100644 --- a/dbt/include/spark/macros/materializations/incremental/incremental.sql +++ b/dbt/include/spark/macros/materializations/incremental/incremental.sql @@ -1,48 +1,80 @@ -{% materialization incremental, adapter='spark' -%} - +{% materialization incremental, adapter='spark', supported_languages=['sql', 'python'] -%} {#-- Validate early so we don't run SQL if the file_format + strategy combo is invalid --#} {%- set raw_file_format = config.get('file_format', default='parquet') -%} - {%- set raw_strategy = config.get('incremental_strategy', default='append') -%} - + {%- set raw_strategy = config.get('incremental_strategy') or 'append' -%} + {%- set grant_config = config.get('grants') -%} + {%- set file_format = dbt_spark_validate_get_file_format(raw_file_format) -%} {%- set strategy = dbt_spark_validate_get_incremental_strategy(raw_strategy, file_format) -%} - + + {#-- Set vars --#} + {%- set unique_key = config.get('unique_key', none) -%} {%- set partition_by = config.get('partition_by', none) -%} + {%- set language = model['language'] -%} + {%- set on_schema_change = incremental_validate_on_schema_change(config.get('on_schema_change'), default='ignore') -%} + {%- set target_relation = this -%} + {%- set existing_relation = load_relation(this) -%} + {%- set tmp_relation = make_temp_relation(this) -%} - {%- set full_refresh_mode = (should_full_refresh()) -%} - - {% set on_schema_change = incremental_validate_on_schema_change(config.get('on_schema_change'), default='ignore') %} - - {% set target_relation = this %} - {% set existing_relation = load_relation(this) %} - {% set tmp_relation = make_temp_relation(this) %} + {#-- 
for SQL model we will create temp view that doesn't have database and schema --#} + {%- if language == 'sql'-%} + {%- set tmp_relation = tmp_relation.include(database=false, schema=false) -%} + {%- endif -%} - {% if strategy == 'insert_overwrite' and partition_by %} - {% call statement() %} + {#-- Set Overwrite Mode --#} + {%- if strategy == 'insert_overwrite' and partition_by -%} + {%- call statement() -%} set spark.sql.sources.partitionOverwriteMode = DYNAMIC - {% endcall %} - {% endif %} + {%- endcall -%} + {%- endif -%} + {#-- Run pre-hooks --#} {{ run_hooks(pre_hooks) }} - {% if existing_relation is none %} - {% set build_sql = create_table_as(False, target_relation, sql) %} - {% elif existing_relation.is_view or full_refresh_mode %} - {% do adapter.drop_relation(existing_relation) %} - {% set build_sql = create_table_as(False, target_relation, sql) %} - {% else %} - {% do run_query(create_table_as(True, tmp_relation, sql)) %} - {% do process_schema_changes(on_schema_change, tmp_relation, existing_relation) %} - {% set build_sql = dbt_spark_get_incremental_sql(strategy, tmp_relation, target_relation, unique_key) %} - {% endif %} - - {%- call statement('main') -%} - {{ build_sql }} - {%- endcall -%} + {#-- Incremental run logic --#} + {%- if existing_relation is none -%} + {#-- Relation must be created --#} + {%- call statement('main', language=language) -%} + {{ create_table_as(False, target_relation, compiled_code, language) }} + {%- endcall -%} + {%- elif existing_relation.is_view or should_full_refresh() -%} + {#-- Relation must be dropped & recreated --#} + {% set is_delta = (file_format == 'delta' and existing_relation.is_delta) %} + {% if not is_delta %} {#-- If Delta, we will `create or replace` below, so no need to drop --#} + {% do adapter.drop_relation(existing_relation) %} + {% endif %} + {%- call statement('main', language=language) -%} + {{ create_table_as(False, target_relation, compiled_code, language) }} + {%- endcall -%} + {%- else -%} + {#-- Relation must be merged --#} + {%- call statement('create_tmp_relation', language=language) -%} + {{ create_table_as(True, tmp_relation, compiled_code, language) }} + {%- endcall -%} + {%- do process_schema_changes(on_schema_change, tmp_relation, existing_relation) -%} + {%- call statement('main') -%} + {{ dbt_spark_get_incremental_sql(strategy, tmp_relation, target_relation, unique_key) }} + {%- endcall -%} + {%- if language == 'python' -%} + {#-- + This is yucky. + See note in dbt-spark/dbt/include/spark/macros/adapters.sql + re: python models and temporary views. + + Also, why doesn't either drop_relation or adapter.drop_relation work here?! 
+ --#} + {% call statement('drop_relation') -%} + drop table if exists {{ tmp_relation }} + {%- endcall %} + {%- endif -%} + {%- endif -%} + + {% set should_revoke = should_revoke(existing_relation, full_refresh_mode) %} + {% do apply_grants(target_relation, grant_config, should_revoke) %} {% do persist_docs(target_relation, model) %} - + {{ run_hooks(post_hooks) }} {{ return({'relations': [target_relation]}) }} diff --git a/dbt/include/spark/macros/materializations/incremental/strategies.sql b/dbt/include/spark/macros/materializations/incremental/strategies.sql index 215b5f3f9..17196e85d 100644 --- a/dbt/include/spark/macros/materializations/incremental/strategies.sql +++ b/dbt/include/spark/macros/materializations/incremental/strategies.sql @@ -1,10 +1,10 @@ {% macro get_insert_overwrite_sql(source_relation, target_relation) %} - + {%- set dest_columns = adapter.get_columns_in_relation(target_relation) -%} {%- set dest_cols_csv = dest_columns | map(attribute='quoted') | join(', ') -%} insert overwrite table {{ target_relation }} {{ partition_cols(label="partition") }} - select {{dest_cols_csv}} from {{ source_relation.include(database=false, schema=false) }} + select {{dest_cols_csv}} from {{ source_relation }} {% endmacro %} @@ -14,15 +14,18 @@ {%- set dest_columns = adapter.get_columns_in_relation(target_relation) -%} {%- set dest_cols_csv = dest_columns | map(attribute='quoted') | join(', ') -%} insert into table {{ target_relation }} - select {{dest_cols_csv}} from {{ source_relation.include(database=false, schema=false) }} + select {{dest_cols_csv}} from {{ source_relation }} {% endmacro %} {% macro spark__get_merge_sql(target, source, unique_key, dest_columns, predicates=none) %} - {# skip dest_columns, use merge_update_columns config if provided, otherwise use "*" #} + {# need dest_columns for merge_exclude_columns, default to use "*" #} {%- set predicates = [] if predicates is none else [] + predicates -%} - {%- set update_columns = config.get("merge_update_columns") -%} + {%- set dest_columns = adapter.get_columns_in_relation(target) -%} + {%- set merge_update_columns = config.get('merge_update_columns') -%} + {%- set merge_exclude_columns = config.get('merge_exclude_columns') -%} + {%- set update_columns = get_merge_update_columns(merge_update_columns, merge_exclude_columns, dest_columns) -%} {% if unique_key %} {% if unique_key is sequence and unique_key is not mapping and unique_key is not string %} @@ -41,20 +44,20 @@ {% else %} {% do predicates.append('FALSE') %} {% endif %} - + {{ sql_header if sql_header is not none }} - + merge into {{ target }} as DBT_INTERNAL_DEST - using {{ source.include(schema=false) }} as DBT_INTERNAL_SOURCE + using {{ source }} as DBT_INTERNAL_SOURCE on {{ predicates | join(' and ') }} - + when matched then update set {% if update_columns -%}{%- for column_name in update_columns %} {{ column_name }} = DBT_INTERNAL_SOURCE.{{ column_name }} {%- if not loop.last %}, {%- endif %} {%- endfor %} {%- else %} * {% endif %} - + when not matched then insert * {% endmacro %} diff --git a/dbt/include/spark/macros/materializations/incremental/validate.sql b/dbt/include/spark/macros/materializations/incremental/validate.sql index 3e9de359b..ffd56f106 100644 --- a/dbt/include/spark/macros/materializations/incremental/validate.sql +++ b/dbt/include/spark/macros/materializations/incremental/validate.sql @@ -28,13 +28,13 @@ Invalid incremental strategy provided: {{ raw_strategy }} You can only choose this strategy when file_format is set to 'delta' or 'hudi' {%- 
endset %} - + {% set invalid_insert_overwrite_delta_msg -%} Invalid incremental strategy provided: {{ raw_strategy }} You cannot use this strategy when file_format is set to 'delta' Use the 'append' or 'merge' strategy instead {%- endset %} - + {% set invalid_insert_overwrite_endpoint_msg -%} Invalid incremental strategy provided: {{ raw_strategy }} You cannot use this strategy when connecting via endpoint diff --git a/dbt/include/spark/macros/materializations/snapshot.sql b/dbt/include/spark/macros/materializations/snapshot.sql index 82d186ce2..6cf2358fe 100644 --- a/dbt/include/spark/macros/materializations/snapshot.sql +++ b/dbt/include/spark/macros/materializations/snapshot.sql @@ -32,7 +32,7 @@ {% macro spark_build_snapshot_staging_table(strategy, sql, target_relation) %} {% set tmp_identifier = target_relation.identifier ~ '__dbt_tmp' %} - + {%- set tmp_relation = api.Relation.create(identifier=tmp_identifier, schema=target_relation.schema, database=none, @@ -75,6 +75,7 @@ {%- set strategy_name = config.get('strategy') -%} {%- set unique_key = config.get('unique_key') %} {%- set file_format = config.get('file_format', 'parquet') -%} + {%- set grant_config = config.get('grants') -%} {% set target_relation_exists, target_relation = get_or_create_relation( database=none, @@ -116,7 +117,7 @@ {% if not target_relation_exists %} - {% set build_sql = build_snapshot_table(strategy, model['compiled_sql']) %} + {% set build_sql = build_snapshot_table(strategy, model['compiled_code']) %} {% set final_sql = create_table_as(False, target_relation, build_sql) %} {% else %} @@ -163,6 +164,9 @@ {{ final_sql }} {% endcall %} + {% set should_revoke = should_revoke(target_relation_exists, full_refresh_mode) %} + {% do apply_grants(target_relation, grant_config, should_revoke) %} + {% do persist_docs(target_relation, model) %} {{ run_hooks(post_hooks, inside_transaction=True) }} diff --git a/dbt/include/spark/macros/materializations/table.sql b/dbt/include/spark/macros/materializations/table.sql index 3ae2df973..c82e27e9c 100644 --- a/dbt/include/spark/macros/materializations/table.sql +++ b/dbt/include/spark/macros/materializations/table.sql @@ -1,6 +1,7 @@ -{% materialization table, adapter = 'spark' %} - +{% materialization table, adapter = 'spark', supported_languages=['sql', 'python'] %} + {%- set language = model['language'] -%} {%- set identifier = model['alias'] -%} + {%- set grant_config = config.get('grants') -%} {%- set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) -%} {%- set target_relation = api.Relation.create(identifier=identifier, @@ -18,10 +19,14 @@ {%- endif %} -- build model - {% call statement('main') -%} - {{ create_table_as(False, target_relation, sql) }} - {%- endcall %} - + + {%- call statement('main', language=language) -%} + {{ create_table_as(False, target_relation, compiled_code, language) }} + {%- endcall -%} + + {% set should_revoke = should_revoke(old_relation, full_refresh_mode=True) %} + {% do apply_grants(target_relation, grant_config, should_revoke) %} + {% do persist_docs(target_relation, model) %} {{ run_hooks(post_hooks) }} @@ -29,3 +34,63 @@ {{ return({'relations': [target_relation]})}} {% endmaterialization %} + + +{% macro py_write_table(compiled_code, target_relation) %} +{{ compiled_code }} +# --- Autogenerated dbt materialization code. 
--- # +dbt = dbtObj(spark.table) +df = model(dbt, spark) + +# make sure pyspark exists in the namepace, for 7.3.x-scala2.12 it does not exist +import pyspark +# make sure pandas exists before using it +try: + import pandas + pandas_available = True +except ImportError: + pandas_available = False + +# make sure pyspark.pandas exists before using it +try: + import pyspark.pandas + pyspark_pandas_api_available = True +except ImportError: + pyspark_pandas_api_available = False + +# make sure databricks.koalas exists before using it +try: + import databricks.koalas + koalas_available = True +except ImportError: + koalas_available = False + +# preferentially convert pandas DataFrames to pandas-on-Spark or Koalas DataFrames first +# since they know how to convert pandas DataFrames better than `spark.createDataFrame(df)` +# and converting from pandas-on-Spark to Spark DataFrame has no overhead +if pyspark_pandas_api_available and pandas_available and isinstance(df, pandas.core.frame.DataFrame): + df = pyspark.pandas.frame.DataFrame(df) +elif koalas_available and pandas_available and isinstance(df, pandas.core.frame.DataFrame): + df = databricks.koalas.frame.DataFrame(df) + +# convert to pyspark.sql.dataframe.DataFrame +if isinstance(df, pyspark.sql.dataframe.DataFrame): + pass # since it is already a Spark DataFrame +elif pyspark_pandas_api_available and isinstance(df, pyspark.pandas.frame.DataFrame): + df = df.to_spark() +elif koalas_available and isinstance(df, databricks.koalas.frame.DataFrame): + df = df.to_spark() +elif pandas_available and isinstance(df, pandas.core.frame.DataFrame): + df = spark.createDataFrame(df) +else: + msg = f"{type(df)} is not a supported type for dbt Python materialization" + raise Exception(msg) + +df.write.mode("overwrite").format("delta").option("overwriteSchema", "true").saveAsTable("{{ target_relation }}") +{%- endmacro -%} + +{%macro py_script_comment()%} +# how to execute python model in notebook +# dbt = dbtObj(spark.table) +# df = model(dbt, spark) +{%endmacro%} diff --git a/dbt/include/spark/macros/utils/any_value.sql b/dbt/include/spark/macros/utils/any_value.sql new file mode 100644 index 000000000..eb0a019b3 --- /dev/null +++ b/dbt/include/spark/macros/utils/any_value.sql @@ -0,0 +1,5 @@ +{% macro spark__any_value(expression) -%} + {#-- return any value (non-deterministic) --#} + first({{ expression }}) + +{%- endmacro %} diff --git a/dbt/include/spark/macros/utils/array_append.sql b/dbt/include/spark/macros/utils/array_append.sql new file mode 100644 index 000000000..efe39e7ab --- /dev/null +++ b/dbt/include/spark/macros/utils/array_append.sql @@ -0,0 +1,3 @@ +{% macro spark__array_append(array, new_element) -%} + {{ array_concat(array, array_construct([new_element])) }} +{%- endmacro %} diff --git a/dbt/include/spark/macros/utils/array_concat.sql b/dbt/include/spark/macros/utils/array_concat.sql new file mode 100644 index 000000000..1441618c8 --- /dev/null +++ b/dbt/include/spark/macros/utils/array_concat.sql @@ -0,0 +1,3 @@ +{% macro spark__array_concat(array_1, array_2) -%} + concat({{ array_1 }}, {{ array_2 }}) +{%- endmacro %} diff --git a/dbt/include/spark/macros/utils/array_construct.sql b/dbt/include/spark/macros/utils/array_construct.sql new file mode 100644 index 000000000..a4e5e0c7c --- /dev/null +++ b/dbt/include/spark/macros/utils/array_construct.sql @@ -0,0 +1,3 @@ +{% macro spark__array_construct(inputs, data_type) -%} + array( {{ inputs|join(' , ') }} ) +{%- endmacro %} diff --git a/dbt/include/spark/macros/utils/assert_not_null.sql 
b/dbt/include/spark/macros/utils/assert_not_null.sql new file mode 100644 index 000000000..e5454bce9 --- /dev/null +++ b/dbt/include/spark/macros/utils/assert_not_null.sql @@ -0,0 +1,9 @@ +{% macro assert_not_null(function, arg) -%} + {{ return(adapter.dispatch('assert_not_null', 'dbt')(function, arg)) }} +{%- endmacro %} + +{% macro spark__assert_not_null(function, arg) %} + + coalesce({{function}}({{arg}}), nvl2({{function}}({{arg}}), assert_true({{function}}({{arg}}) is not null), null)) + +{% endmacro %} diff --git a/dbt/include/spark/macros/utils/bool_or.sql b/dbt/include/spark/macros/utils/bool_or.sql new file mode 100644 index 000000000..60d705eb3 --- /dev/null +++ b/dbt/include/spark/macros/utils/bool_or.sql @@ -0,0 +1,11 @@ +{#-- Spark v3 supports 'bool_or' and 'any', but Spark v2 needs to use 'max' for this + -- https://spark.apache.org/docs/latest/api/sql/index.html#any + -- https://spark.apache.org/docs/latest/api/sql/index.html#bool_or + -- https://spark.apache.org/docs/latest/api/sql/index.html#max +#} + +{% macro spark__bool_or(expression) -%} + + max({{ expression }}) + +{%- endmacro %} diff --git a/dbt/include/spark/macros/utils/concat.sql b/dbt/include/spark/macros/utils/concat.sql new file mode 100644 index 000000000..30f1a420e --- /dev/null +++ b/dbt/include/spark/macros/utils/concat.sql @@ -0,0 +1,3 @@ +{% macro spark__concat(fields) -%} + concat({{ fields|join(', ') }}) +{%- endmacro %} diff --git a/dbt/include/spark/macros/utils/dateadd.sql b/dbt/include/spark/macros/utils/dateadd.sql new file mode 100644 index 000000000..e2a20d0f2 --- /dev/null +++ b/dbt/include/spark/macros/utils/dateadd.sql @@ -0,0 +1,62 @@ +{% macro spark__dateadd(datepart, interval, from_date_or_timestamp) %} + + {%- set clock_component -%} + {# make sure the dates + timestamps are real, otherwise raise an error asap #} + to_unix_timestamp({{ assert_not_null('to_timestamp', from_date_or_timestamp) }}) + - to_unix_timestamp({{ assert_not_null('date', from_date_or_timestamp) }}) + {%- endset -%} + + {%- if datepart in ['day', 'week'] -%} + + {%- set multiplier = 7 if datepart == 'week' else 1 -%} + + to_timestamp( + to_unix_timestamp( + date_add( + {{ assert_not_null('date', from_date_or_timestamp) }}, + cast({{interval}} * {{multiplier}} as int) + ) + ) + {{clock_component}} + ) + + {%- elif datepart in ['month', 'quarter', 'year'] -%} + + {%- set multiplier -%} + {%- if datepart == 'month' -%} 1 + {%- elif datepart == 'quarter' -%} 3 + {%- elif datepart == 'year' -%} 12 + {%- endif -%} + {%- endset -%} + + to_timestamp( + to_unix_timestamp( + add_months( + {{ assert_not_null('date', from_date_or_timestamp) }}, + cast({{interval}} * {{multiplier}} as int) + ) + ) + {{clock_component}} + ) + + {%- elif datepart in ('hour', 'minute', 'second', 'millisecond', 'microsecond') -%} + + {%- set multiplier -%} + {%- if datepart == 'hour' -%} 3600 + {%- elif datepart == 'minute' -%} 60 + {%- elif datepart == 'second' -%} 1 + {%- elif datepart == 'millisecond' -%} (1/1000000) + {%- elif datepart == 'microsecond' -%} (1/1000000) + {%- endif -%} + {%- endset -%} + + to_timestamp( + {{ assert_not_null('to_unix_timestamp', from_date_or_timestamp) }} + + cast({{interval}} * {{multiplier}} as int) + ) + + {%- else -%} + + {{ exceptions.raise_compiler_error("macro dateadd not implemented for datepart ~ '" ~ datepart ~ "' ~ on Spark") }} + + {%- endif -%} + +{% endmacro %} diff --git a/dbt/include/spark/macros/utils/datediff.sql b/dbt/include/spark/macros/utils/datediff.sql new file mode 100644 index 
000000000..d0e684c47 --- /dev/null +++ b/dbt/include/spark/macros/utils/datediff.sql @@ -0,0 +1,107 @@ +{% macro spark__datediff(first_date, second_date, datepart) %} + + {%- if datepart in ['day', 'week', 'month', 'quarter', 'year'] -%} + + {# make sure the dates are real, otherwise raise an error asap #} + {% set first_date = assert_not_null('date', first_date) %} + {% set second_date = assert_not_null('date', second_date) %} + + {%- endif -%} + + {%- if datepart == 'day' -%} + + datediff({{second_date}}, {{first_date}}) + + {%- elif datepart == 'week' -%} + + case when {{first_date}} < {{second_date}} + then floor(datediff({{second_date}}, {{first_date}})/7) + else ceil(datediff({{second_date}}, {{first_date}})/7) + end + + -- did we cross a week boundary (Sunday)? + + case + when {{first_date}} < {{second_date}} and dayofweek({{second_date}}) < dayofweek({{first_date}}) then 1 + when {{first_date}} > {{second_date}} and dayofweek({{second_date}}) > dayofweek({{first_date}}) then -1 + else 0 end + + {%- elif datepart == 'month' -%} + + case when {{first_date}} < {{second_date}} + then floor(months_between(date({{second_date}}), date({{first_date}}))) + else ceil(months_between(date({{second_date}}), date({{first_date}}))) + end + + -- did we cross a month boundary? + + case + when {{first_date}} < {{second_date}} and dayofmonth({{second_date}}) < dayofmonth({{first_date}}) then 1 + when {{first_date}} > {{second_date}} and dayofmonth({{second_date}}) > dayofmonth({{first_date}}) then -1 + else 0 end + + {%- elif datepart == 'quarter' -%} + + case when {{first_date}} < {{second_date}} + then floor(months_between(date({{second_date}}), date({{first_date}}))/3) + else ceil(months_between(date({{second_date}}), date({{first_date}}))/3) + end + + -- did we cross a quarter boundary? 
+ + case + when {{first_date}} < {{second_date}} and ( + (dayofyear({{second_date}}) - (quarter({{second_date}}) * 365/4)) + < (dayofyear({{first_date}}) - (quarter({{first_date}}) * 365/4)) + ) then 1 + when {{first_date}} > {{second_date}} and ( + (dayofyear({{second_date}}) - (quarter({{second_date}}) * 365/4)) + > (dayofyear({{first_date}}) - (quarter({{first_date}}) * 365/4)) + ) then -1 + else 0 end + + {%- elif datepart == 'year' -%} + + year({{second_date}}) - year({{first_date}}) + + {%- elif datepart in ('hour', 'minute', 'second', 'millisecond', 'microsecond') -%} + + {%- set divisor -%} + {%- if datepart == 'hour' -%} 3600 + {%- elif datepart == 'minute' -%} 60 + {%- elif datepart == 'second' -%} 1 + {%- elif datepart == 'millisecond' -%} (1/1000) + {%- elif datepart == 'microsecond' -%} (1/1000000) + {%- endif -%} + {%- endset -%} + + case when {{first_date}} < {{second_date}} + then ceil(( + {# make sure the timestamps are real, otherwise raise an error asap #} + {{ assert_not_null('to_unix_timestamp', assert_not_null('to_timestamp', second_date)) }} + - {{ assert_not_null('to_unix_timestamp', assert_not_null('to_timestamp', first_date)) }} + ) / {{divisor}}) + else floor(( + {{ assert_not_null('to_unix_timestamp', assert_not_null('to_timestamp', second_date)) }} + - {{ assert_not_null('to_unix_timestamp', assert_not_null('to_timestamp', first_date)) }} + ) / {{divisor}}) + end + + {% if datepart == 'millisecond' %} + + cast(date_format({{second_date}}, 'SSS') as int) + - cast(date_format({{first_date}}, 'SSS') as int) + {% endif %} + + {% if datepart == 'microsecond' %} + {% set capture_str = '[0-9]{4}-[0-9]{2}-[0-9]{2}.[0-9]{2}:[0-9]{2}:[0-9]{2}.([0-9]{6})' %} + -- Spark doesn't really support microseconds, so this is a massive hack! 
+ -- It will only work if the timestamp-string is of the format + -- 'yyyy-MM-dd-HH mm.ss.SSSSSS' + + cast(regexp_extract({{second_date}}, '{{capture_str}}', 1) as int) + - cast(regexp_extract({{first_date}}, '{{capture_str}}', 1) as int) + {% endif %} + + {%- else -%} + + {{ exceptions.raise_compiler_error("macro datediff not implemented for datepart ~ '" ~ datepart ~ "' ~ on Spark") }} + + {%- endif -%} + +{% endmacro %} diff --git a/dbt/include/spark/macros/utils/listagg.sql b/dbt/include/spark/macros/utils/listagg.sql new file mode 100644 index 000000000..3577edb71 --- /dev/null +++ b/dbt/include/spark/macros/utils/listagg.sql @@ -0,0 +1,17 @@ +{% macro spark__listagg(measure, delimiter_text, order_by_clause, limit_num) -%} + + {% if order_by_clause %} + {{ exceptions.warn("order_by_clause is not supported for listagg on Spark/Databricks") }} + {% endif %} + + {% set collect_list %} collect_list({{ measure }}) {% endset %} + + {% set limited %} slice({{ collect_list }}, 1, {{ limit_num }}) {% endset %} + + {% set collected = limited if limit_num else collect_list %} + + {% set final %} array_join({{ collected }}, {{ delimiter_text }}) {% endset %} + + {% do return(final) %} + +{%- endmacro %} diff --git a/dbt/include/spark/macros/utils/split_part.sql b/dbt/include/spark/macros/utils/split_part.sql new file mode 100644 index 000000000..d5ae30924 --- /dev/null +++ b/dbt/include/spark/macros/utils/split_part.sql @@ -0,0 +1,23 @@ +{% macro spark__split_part(string_text, delimiter_text, part_number) %} + + {% set delimiter_expr %} + + -- escape if starts with a special character + case when regexp_extract({{ delimiter_text }}, '([^A-Za-z0-9])(.*)', 1) != '_' + then concat('\\', {{ delimiter_text }}) + else {{ delimiter_text }} end + + {% endset %} + + {% set split_part_expr %} + + split( + {{ string_text }}, + {{ delimiter_expr }} + )[({{ part_number - 1 }})] + + {% endset %} + + {{ return(split_part_expr) }} + +{% endmacro %} diff --git a/dbt/include/spark/macros/utils/timestamps.sql b/dbt/include/spark/macros/utils/timestamps.sql new file mode 100644 index 000000000..68d6f6884 --- /dev/null +++ b/dbt/include/spark/macros/utils/timestamps.sql @@ -0,0 +1,3 @@ +{% macro spark__current_timestamp() -%} + current_timestamp() +{%- endmacro %} diff --git a/dev_requirements.txt b/dev-requirements.txt similarity index 85% rename from dev_requirements.txt rename to dev-requirements.txt index 0f84cbd5d..e93c1b41a 100644 --- a/dev_requirements.txt +++ b/dev-requirements.txt @@ -3,19 +3,25 @@ git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-core&subdirectory=core git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-tests-adapter&subdirectory=tests/adapter + + +black==22.8.0 +bumpversion +click~=8.1.3 +flake8 +flaky freezegun==0.3.9 -pytest>=6.0.2 +ipdb mock>=1.3.0 -flake8 +mypy==0.971 +pre-commit +pytest-csv +pytest-dotenv +pytest-xdist +pytest>=6.0.2 pytz -bumpversion tox>=3.2.0 -ipdb -pytest-xdist -pytest-dotenv -pytest-csv -flaky # Test requirements sasl>=0.2.1 -thrift_sasl==0.4.1 +thrift_sasl==0.4.3 diff --git a/docker-compose.yml b/docker-compose.yml index 8054dfd75..9bc9e509c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,8 +1,8 @@ version: "3.7" services: - dbt-spark2-thrift: - image: godatadriven/spark:3.0 + dbt-spark3-thrift: + image: godatadriven/spark:3.1.1 ports: - "10000:10000" - "4040:4040" diff --git a/docker/spark-defaults.conf b/docker/spark-defaults.conf index 48a0501c2..30ec59591 100644 --- a/docker/spark-defaults.conf +++ b/docker/spark-defaults.conf @@ -1,7 +1,9 
@@ +spark.driver.memory 2g +spark.executor.memory 2g spark.hadoop.datanucleus.autoCreateTables true spark.hadoop.datanucleus.schema.autoCreateTables true spark.hadoop.datanucleus.fixedDatastore false spark.serializer org.apache.spark.serializer.KryoSerializer -spark.jars.packages org.apache.hudi:hudi-spark3-bundle_2.12:0.9.0 +spark.jars.packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0 spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension spark.driver.userClassPathFirst true diff --git a/requirements.txt b/requirements.txt index e03320a41..14b36b723 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ PyHive[hive]>=0.6.0,<0.7.0 -pyodbc>=4.0.30 +requests[python]>=2.28.1 + +pyodbc==4.0.34 sqlparams>=3.0.0 thrift>=0.13.0 sqlparse>=0.4.2 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/scripts/build-dist.sh b/scripts/build-dist.sh index 65e6dbc97..3c3808399 100755 --- a/scripts/build-dist.sh +++ b/scripts/build-dist.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash set -eo pipefail diff --git a/setup.py b/setup.py index 12ecbacde..9d6c1367e 100644 --- a/setup.py +++ b/setup.py @@ -5,41 +5,39 @@ # require python 3.7 or newer if sys.version_info < (3, 7): - print('Error: dbt does not support this version of Python.') - print('Please upgrade to Python 3.7 or higher.') + print("Error: dbt does not support this version of Python.") + print("Please upgrade to Python 3.7 or higher.") sys.exit(1) # require version of setuptools that supports find_namespace_packages from setuptools import setup + try: from setuptools import find_namespace_packages except ImportError: # the user has a downlevel version of setuptools. - print('Error: dbt requires setuptools v40.1.0 or higher.') - print('Please upgrade setuptools with "pip install --upgrade setuptools" ' - 'and try again') + print("Error: dbt requires setuptools v40.1.0 or higher.") + print('Please upgrade setuptools with "pip install --upgrade setuptools" ' "and try again") sys.exit(1) # pull long description from README this_directory = os.path.abspath(os.path.dirname(__file__)) -with open(os.path.join(this_directory, 'README.md'), 'r', encoding='utf8') as f: +with open(os.path.join(this_directory, "README.md"), "r", encoding="utf8") as f: long_description = f.read() # get this package's version from dbt/adapters/<name>/__version__.py def _get_plugin_version_dict(): - _version_path = os.path.join( - this_directory, 'dbt', 'adapters', 'spark', '__version__.py' - ) - _semver = r'''(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)''' - _pre = r'''((?P<prekind>a|b|rc)(?P<pre>\d+))?''' - _version_pattern = fr'''version\s*=\s*["']{_semver}{_pre}["']''' + _version_path = os.path.join(this_directory, "dbt", "adapters", "spark", "__version__.py") + _semver = r"""(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)""" + _pre = r"""((?P<prekind>a|b|rc)(?P<pre>\d+))?""" + _version_pattern = fr"""version\s*=\s*["']{_semver}{_pre}["']""" with open(_version_path) as f: match = re.search(_version_pattern, f.read().strip()) if match is None: - raise ValueError(f'invalid version at {_version_path}') + raise ValueError(f"invalid version at {_version_path}") return match.groupdict() @@ -47,42 +45,37 @@ def _get_plugin_version_dict(): def _get_dbt_core_version(): parts = _get_plugin_version_dict() minor = "{major}.{minor}.0".format(**parts) - pre = (parts["prekind"]+"1" if parts["prekind"] else "") + pre = parts["prekind"] + "1" if parts["prekind"] else "" return f"{minor}{pre}" package_name = "dbt-spark" 
-package_version = "1.2.0a1" +package_version = "1.4.0a1" dbt_core_version = _get_dbt_core_version() description = """The Apache Spark adapter plugin for dbt""" -odbc_extras = ['pyodbc>=4.0.30'] +odbc_extras = ["pyodbc>=4.0.30"] pyhive_extras = [ - 'PyHive[hive]>=0.6.0,<0.7.0', - 'thrift>=0.11.0,<0.16.0', -] -session_extras = [ - "pyspark>=3.0.0,<4.0.0" + "PyHive[hive]>=0.6.0,<0.7.0", + "thrift>=0.11.0,<0.16.0", ] +session_extras = ["pyspark>=3.0.0,<4.0.0"] all_extras = odbc_extras + pyhive_extras + session_extras setup( name=package_name, version=package_version, - description=description, long_description=long_description, - long_description_content_type='text/markdown', - - author='dbt Labs', - author_email='info@dbtlabs.com', - url='https://github.com/dbt-labs/dbt-spark', - - packages=find_namespace_packages(include=['dbt', 'dbt.*']), + long_description_content_type="text/markdown", + author="dbt Labs", + author_email="info@dbtlabs.com", + url="https://github.com/dbt-labs/dbt-spark", + packages=find_namespace_packages(include=["dbt", "dbt.*"]), include_package_data=True, install_requires=[ - 'dbt-core~={}'.format(dbt_core_version), - 'sqlparams>=3.0.0', + "dbt-core~={}".format(dbt_core_version), + "sqlparams>=3.0.0", ], extras_require={ "ODBC": odbc_extras, @@ -92,17 +85,14 @@ def _get_dbt_core_version(): }, zip_safe=False, classifiers=[ - 'Development Status :: 5 - Production/Stable', - - 'License :: OSI Approved :: Apache Software License', - - 'Operating System :: Microsoft :: Windows', - 'Operating System :: MacOS :: MacOS X', - 'Operating System :: POSIX :: Linux', - - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: Apache Software License", + "Operating System :: Microsoft :: Windows", + "Operating System :: MacOS :: MacOS X", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", ], python_requires=">=3.7", ) diff --git a/test.env.example b/test.env.example new file mode 100644 index 000000000..e69f700b7 --- /dev/null +++ b/test.env.example @@ -0,0 +1,15 @@ +# Cluster ID +DBT_DATABRICKS_CLUSTER_NAME= +# SQL Endpoint +DBT_DATABRICKS_ENDPOINT= +# Server Hostname value +DBT_DATABRICKS_HOST_NAME= +# personal token +DBT_DATABRICKS_TOKEN= +# file path to local ODBC driver +ODBC_DRIVER= + +# users for testing 'grants' functionality +DBT_TEST_USER_1= +DBT_TEST_USER_2= +DBT_TEST_USER_3= diff --git a/tests/conftest.py b/tests/conftest.py index 7ba95d47b..2fa50d6c7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,7 @@ def pytest_addoption(parser): parser.addoption("--profile", action="store", default="apache_spark", type=str) -# Using @pytest.mark.skip_adapter('apache_spark') uses the 'skip_by_adapter_type' +# Using @pytest.mark.skip_profile('apache_spark') uses the 'skip_by_profile_type' # autouse fixture below def pytest_configure(config): config.addinivalue_line( @@ -60,6 +60,7 @@ def databricks_cluster_target(): "connect_retries": 3, "connect_timeout": 5, "retry_all": True, + "user": os.getenv('DBT_DATABRICKS_USER'), } @@ -91,6 +92,7 @@ def databricks_http_cluster_target(): "connect_retries": 5, "connect_timeout": 60, "retry_all": bool(os.getenv('DBT_DATABRICKS_RETRY_ALL', False)), + "user": os.getenv('DBT_DATABRICKS_USER'), } @@ -108,4 +110,4 @@ def skip_by_profile_type(request): if 
request.node.get_closest_marker("skip_profile"): for skip_profile_type in request.node.get_closest_marker("skip_profile").args: if skip_profile_type == profile_type: - pytest.skip("skipped on '{profile_type}' profile") + pytest.skip(f"skipped on '{profile_type}' profile") diff --git a/tests/functional/adapter/test_basic.py b/tests/functional/adapter/test_basic.py index 70f3267a4..bdccf169d 100644 --- a/tests/functional/adapter/test_basic.py +++ b/tests/functional/adapter/test_basic.py @@ -64,7 +64,7 @@ def project_config_update(self): } -#hese tests were not enabled in the dbtspec files, so skipping here. +# These tests were not enabled in the dbtspec files, so skipping here. # Error encountered was: Error running query: java.lang.ClassNotFoundException: delta.DefaultSource @pytest.mark.skip_profile('apache_spark', 'spark_session') class TestSnapshotTimestampSpark(BaseSnapshotTimestamp): @@ -79,5 +79,6 @@ def project_config_update(self): } } +@pytest.mark.skip_profile('spark_session') class TestBaseAdapterMethod(BaseAdapterMethod): - pass \ No newline at end of file + pass diff --git a/tests/functional/adapter/test_grants.py b/tests/functional/adapter/test_grants.py new file mode 100644 index 000000000..8e0341df6 --- /dev/null +++ b/tests/functional/adapter/test_grants.py @@ -0,0 +1,60 @@ +import pytest +from dbt.tests.adapter.grants.test_model_grants import BaseModelGrants +from dbt.tests.adapter.grants.test_incremental_grants import BaseIncrementalGrants +from dbt.tests.adapter.grants.test_invalid_grants import BaseInvalidGrants +from dbt.tests.adapter.grants.test_seed_grants import BaseSeedGrants +from dbt.tests.adapter.grants.test_snapshot_grants import BaseSnapshotGrants + + +@pytest.mark.skip_profile("apache_spark", "spark_session") +class TestModelGrantsSpark(BaseModelGrants): + def privilege_grantee_name_overrides(self): + # insert --> modify + return { + "select": "select", + "insert": "modify", + "fake_privilege": "fake_privilege", + "invalid_user": "invalid_user", + } + + +@pytest.mark.skip_profile("apache_spark", "spark_session") +class TestIncrementalGrantsSpark(BaseIncrementalGrants): + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "models": { + "+file_format": "delta", + "+incremental_strategy": "merge", + } + } + + +@pytest.mark.skip_profile("apache_spark", "spark_session") +class TestSeedGrantsSpark(BaseSeedGrants): + # seeds in dbt-spark are currently "full refreshed," in such a way that + # the grants are not carried over + # see https://github.com/dbt-labs/dbt-spark/issues/388 + def seeds_support_partial_refresh(self): + return False + + +@pytest.mark.skip_profile("apache_spark", "spark_session") +class TestSnapshotGrantsSpark(BaseSnapshotGrants): + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "snapshots": { + "+file_format": "delta", + "+incremental_strategy": "merge", + } + } + + +@pytest.mark.skip_profile("apache_spark", "spark_session") +class TestInvalidGrantsSpark(BaseInvalidGrants): + def grantee_does_not_exist_error(self): + return "RESOURCE_DOES_NOT_EXIST" + + def privilege_does_not_exist_error(self): + return "Action Unknown" diff --git a/tests/functional/adapter/test_python_model.py b/tests/functional/adapter/test_python_model.py new file mode 100644 index 000000000..140f41621 --- /dev/null +++ b/tests/functional/adapter/test_python_model.py @@ -0,0 +1,79 @@ +import os +import pytest +from dbt.tests.util import run_dbt, write_file, run_dbt_and_capture +from 
dbt.tests.adapter.python_model.test_python_model import BasePythonModelTests, BasePythonIncrementalTests +from dbt.tests.adapter.python_model.test_spark import BasePySparkTests +@pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint") +class TestPythonModelSpark(BasePythonModelTests): + pass + +@pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint") +class TestPySpark(BasePySparkTests): + pass + +@pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint") +class TestPythonIncrementalModelSpark(BasePythonIncrementalTests): + @pytest.fixture(scope="class") + def project_config_update(self): + return {} + + +models__simple_python_model = """ +import pandas +import torch +import spacy + +def model(dbt, spark): + dbt.config( + materialized='table', + submission_method='job_cluster', + job_cluster_config={ + "spark_version": "7.3.x-scala2.12", + "node_type_id": "i3.xlarge", + "num_workers": 0, + "spark_conf": { + "spark.databricks.cluster.profile": "singleNode", + "spark.master": "local[*, 4]" + }, + "custom_tags": { + "ResourceClass": "SingleNode" + } + }, + packages=['spacy', 'torch'] + ) + data = [[1,2]] * 10 + return spark.createDataFrame(data, schema=['test', 'test2']) +""" +models__simple_python_model_v2 = """ +import pandas + +def model(dbt, spark): + dbt.config( + materialized='table', + ) + data = [[1,2]] * 10 + return spark.createDataFrame(data, schema=['test1', 'test3']) +""" + + +@pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint") +class TestChangingSchemaSpark: + @pytest.fixture(scope="class") + def models(self): + return {"simple_python_model.py": models__simple_python_model} + + def test_changing_schema_with_log_validation(self, project, logs_dir): + run_dbt(["run"]) + write_file( + models__simple_python_model_v2, + project.project_root + "/models", + "simple_python_model.py", + ) + run_dbt(["run"]) + log_file = os.path.join(logs_dir, "dbt.log") + with open(log_file, "r") as f: + log = f.read() + # validate #5510 log_code_execution works + assert "On model.test.simple_python_model:" in log + assert "spark.createDataFrame(data, schema=['test1', 'test3'])" in log + assert "Execution status: OK in" in log diff --git a/tests/functional/adapter/utils/fixture_listagg.py b/tests/functional/adapter/utils/fixture_listagg.py new file mode 100644 index 000000000..0262ca234 --- /dev/null +++ b/tests/functional/adapter/utils/fixture_listagg.py @@ -0,0 +1,61 @@ +# SparkSQL does not support 'order by' for its 'listagg' equivalent +# the argument is ignored, so let's ignore those fields when checking equivalency + +models__test_listagg_no_order_by_sql = """ +with data as ( + select * from {{ ref('data_listagg') }} +), +data_output as ( + select * from {{ ref('data_listagg_output') }} +), +calculate as ( +/* + + select + group_col, + {{ listagg('string_text', "'_|_'", "order by order_col") }} as actual, + 'bottom_ordered' as version + from data + group by group_col + union all + select + group_col, + {{ listagg('string_text', "'_|_'", "order by order_col", 2) }} as actual, + 'bottom_ordered_limited' as version + from data + group by group_col + union all + +*/ + select + group_col, + {{ listagg('string_text', "', '") }} as actual, + 'comma_whitespace_unordered' as version + from data + where group_col = 3 + group by group_col + union all + select + group_col, + {{ listagg('DISTINCT string_text', "','") }} as actual, + 'distinct_comma' as version + from data + where group_col = 3 + 
group by group_col + union all + select + group_col, + {{ listagg('string_text') }} as actual, + 'no_params' as version + from data + where group_col = 3 + group by group_col +) +select + calculate.actual, + data_output.expected +from calculate +left join data_output +on calculate.group_col = data_output.group_col +and calculate.version = data_output.version +""" diff --git a/tests/functional/adapter/utils/test_data_types.py b/tests/functional/adapter/utils/test_data_types.py new file mode 100644 index 000000000..ce6085803 --- /dev/null +++ b/tests/functional/adapter/utils/test_data_types.py @@ -0,0 +1,72 @@ +import pytest +from dbt.tests.adapter.utils.data_types.test_type_bigint import BaseTypeBigInt +from dbt.tests.adapter.utils.data_types.test_type_float import ( + BaseTypeFloat, seeds__expected_csv as seeds__float_expected_csv +) +from dbt.tests.adapter.utils.data_types.test_type_int import ( + BaseTypeInt, seeds__expected_csv as seeds__int_expected_csv +) +from dbt.tests.adapter.utils.data_types.test_type_numeric import BaseTypeNumeric +from dbt.tests.adapter.utils.data_types.test_type_string import BaseTypeString +from dbt.tests.adapter.utils.data_types.test_type_timestamp import BaseTypeTimestamp +from dbt.tests.adapter.utils.data_types.test_type_boolean import BaseTypeBoolean + + +class TestTypeBigInt(BaseTypeBigInt): + pass + + +# need to explicitly cast this to avoid it being inferred/loaded as a DOUBLE on Spark +# in SparkSQL, the two are equivalent for `=` comparison, but distinct for EXCEPT comparison +seeds__float_expected_yml = """ +version: 2 +seeds: + - name: expected + config: + column_types: + float_col: float +""" + +class TestTypeFloat(BaseTypeFloat): + @pytest.fixture(scope="class") + def seeds(self): + return { + "expected.csv": seeds__float_expected_csv, + "expected.yml": seeds__float_expected_yml, + } + + +# need to explicitly cast this to avoid it being inferred/loaded as a BIGINT on Spark +seeds__int_expected_yml = """ +version: 2 +seeds: + - name: expected + config: + column_types: + int_col: int +""" + +class TestTypeInt(BaseTypeInt): + @pytest.fixture(scope="class") + def seeds(self): + return { + "expected.csv": seeds__int_expected_csv, + "expected.yml": seeds__int_expected_yml, + } + + +class TestTypeNumeric(BaseTypeNumeric): + def numeric_fixture_type(self): + return "decimal(28,6)" + + +class TestTypeString(BaseTypeString): + pass + + +class TestTypeTimestamp(BaseTypeTimestamp): + pass + + +class TestTypeBoolean(BaseTypeBoolean): + pass diff --git a/tests/functional/adapter/utils/test_timestamps.py b/tests/functional/adapter/utils/test_timestamps.py new file mode 100644 index 000000000..8507c0a6b --- /dev/null +++ b/tests/functional/adapter/utils/test_timestamps.py @@ -0,0 +1,18 @@ +import pytest +from dbt.tests.adapter.utils.test_timestamps import BaseCurrentTimestamps + + +class TestCurrentTimestampSpark(BaseCurrentTimestamps): + @pytest.fixture(scope="class") + def models(self): + return {"get_current_timestamp.sql": "select {{ current_timestamp() }} as current_timestamp"} + + @pytest.fixture(scope="class") + def expected_schema(self): + return { + "current_timestamp": "timestamp" + } + + @pytest.fixture(scope="class") + def expected_sql(self): + return """select current_timestamp() as current_timestamp""" diff --git a/tests/functional/adapter/utils/test_utils.py b/tests/functional/adapter/utils/test_utils.py new file mode 100644 index 000000000..102df731a --- /dev/null +++ b/tests/functional/adapter/utils/test_utils.py @@ -0,0 +1,143 @@ +import pytest 
+ +from dbt.tests.adapter.utils.test_array_append import BaseArrayAppend +from dbt.tests.adapter.utils.test_array_concat import BaseArrayConcat +from dbt.tests.adapter.utils.test_array_construct import BaseArrayConstruct +from dbt.tests.adapter.utils.test_any_value import BaseAnyValue +from dbt.tests.adapter.utils.test_bool_or import BaseBoolOr +from dbt.tests.adapter.utils.test_cast_bool_to_text import BaseCastBoolToText +from dbt.tests.adapter.utils.test_concat import BaseConcat +from dbt.tests.adapter.utils.test_current_timestamp import BaseCurrentTimestampNaive +from dbt.tests.adapter.utils.test_dateadd import BaseDateAdd +from dbt.tests.adapter.utils.test_datediff import BaseDateDiff +from dbt.tests.adapter.utils.test_date_trunc import BaseDateTrunc +from dbt.tests.adapter.utils.test_escape_single_quotes import BaseEscapeSingleQuotesQuote +from dbt.tests.adapter.utils.test_escape_single_quotes import BaseEscapeSingleQuotesBackslash +from dbt.tests.adapter.utils.test_except import BaseExcept +from dbt.tests.adapter.utils.test_hash import BaseHash +from dbt.tests.adapter.utils.test_intersect import BaseIntersect +from dbt.tests.adapter.utils.test_last_day import BaseLastDay +from dbt.tests.adapter.utils.test_length import BaseLength +from dbt.tests.adapter.utils.test_position import BasePosition +from dbt.tests.adapter.utils.test_replace import BaseReplace +from dbt.tests.adapter.utils.test_right import BaseRight +from dbt.tests.adapter.utils.test_safe_cast import BaseSafeCast +from dbt.tests.adapter.utils.test_split_part import BaseSplitPart +from dbt.tests.adapter.utils.test_string_literal import BaseStringLiteral + +# requires modification +from dbt.tests.adapter.utils.test_listagg import BaseListagg +from dbt.tests.adapter.utils.fixture_listagg import models__test_listagg_yml +from tests.functional.adapter.utils.fixture_listagg import models__test_listagg_no_order_by_sql + + +class TestAnyValue(BaseAnyValue): + pass + + +class TestArrayAppend(BaseArrayAppend): + pass + + +class TestArrayConcat(BaseArrayConcat): + pass + + +class TestArrayConstruct(BaseArrayConstruct): + pass + + +class TestBoolOr(BaseBoolOr): + pass + + +class TestCastBoolToText(BaseCastBoolToText): + pass + + +@pytest.mark.skip_profile('spark_session') +class TestConcat(BaseConcat): + pass + + +# Use either BaseCurrentTimestampAware or BaseCurrentTimestampNaive but not both +class TestCurrentTimestamp(BaseCurrentTimestampNaive): + pass + + +class TestDateAdd(BaseDateAdd): + pass + + +# this generates too much SQL to run successfully in our testing environments :( +@pytest.mark.skip_profile('apache_spark', 'spark_session') +class TestDateDiff(BaseDateDiff): + pass + + +class TestDateTrunc(BaseDateTrunc): + pass + + +class TestEscapeSingleQuotes(BaseEscapeSingleQuotesQuote): + pass + + +class TestExcept(BaseExcept): + pass + + +@pytest.mark.skip_profile('spark_session') +class TestHash(BaseHash): + pass + + +class TestIntersect(BaseIntersect): + pass + + +class TestLastDay(BaseLastDay): + pass + + +class TestLength(BaseLength): + pass + + +# SparkSQL does not support 'order by' for its 'listagg' equivalent +# the argument is ignored, so let's ignore those fields when checking equivalency +class TestListagg(BaseListagg): + @pytest.fixture(scope="class") + def models(self): + return { + "test_listagg.yml": models__test_listagg_yml, + "test_listagg.sql": self.interpolate_macro_namespace( + models__test_listagg_no_order_by_sql, "listagg" + ), + } + + +class TestPosition(BasePosition): + pass + + 
+@pytest.mark.skip_profile('spark_session') +class TestReplace(BaseReplace): + pass + + +@pytest.mark.skip_profile('spark_session') +class TestRight(BaseRight): + pass + + +class TestSafeCast(BaseSafeCast): + pass + + +class TestSplitPart(BaseSplitPart): + pass + + +class TestStringLiteral(BaseStringLiteral): + pass diff --git a/tests/integration/incremental_strategies/models_delta/merge_exclude_columns.sql b/tests/integration/incremental_strategies/models_delta/merge_exclude_columns.sql new file mode 100644 index 000000000..815f46b1d --- /dev/null +++ b/tests/integration/incremental_strategies/models_delta/merge_exclude_columns.sql @@ -0,0 +1,22 @@ +{{ config( + materialized = 'incremental', + incremental_strategy = 'merge', + file_format = 'delta', + unique_key = 'id', + merge_exclude_columns = ['msg'], +) }} + +{% if not is_incremental() %} + +select cast(1 as bigint) as id, 'hello' as msg, 'blue' as color +union all +select cast(2 as bigint) as id, 'goodbye' as msg, 'red' as color + +{% else %} + +-- msg will be ignored, color will be updated +select cast(2 as bigint) as id, 'yo' as msg, 'green' as color +union all +select cast(3 as bigint) as id, 'anyway' as msg, 'purple' as color + +{% endif %} \ No newline at end of file diff --git a/tests/integration/incremental_strategies/seeds/expected_exclude_upsert.csv b/tests/integration/incremental_strategies/seeds/expected_exclude_upsert.csv new file mode 100644 index 000000000..a0f1a6526 --- /dev/null +++ b/tests/integration/incremental_strategies/seeds/expected_exclude_upsert.csv @@ -0,0 +1,4 @@ +id,msg,color +1,hello,blue +2,goodbye,green +3,anyway,purple \ No newline at end of file diff --git a/tests/integration/incremental_strategies/test_incremental_strategies.py b/tests/integration/incremental_strategies/test_incremental_strategies.py index 839f167e6..73bb6ba2b 100644 --- a/tests/integration/incremental_strategies/test_incremental_strategies.py +++ b/tests/integration/incremental_strategies/test_incremental_strategies.py @@ -60,6 +60,8 @@ def run_and_test(self): def test_insert_overwrite_apache_spark(self): self.run_and_test() + # This test requires settings on the test cluster + # more info at https://docs.getdbt.com/reference/resource-configs/spark-configs#the-insert_overwrite-strategy @use_profile("databricks_cluster") def test_insert_overwrite_databricks_cluster(self): self.run_and_test() @@ -76,6 +78,7 @@ def run_and_test(self): self.assertTablesEqual("merge_no_key", "expected_append") self.assertTablesEqual("merge_unique_key", "expected_upsert") self.assertTablesEqual("merge_update_columns", "expected_partial_upsert") + self.assertTablesEqual("merge_exclude_columns", "expected_exclude_upsert") @use_profile("databricks_cluster") def test_delta_strategies_databricks_cluster(self): diff --git a/tests/unit/test_adapter.py b/tests/unit/test_adapter.py index f87a89b2b..53b95f731 100644 --- a/tests/unit/test_adapter.py +++ b/tests/unit/test_adapter.py @@ -154,12 +154,13 @@ def test_thrift_connection(self): config = self._get_target_thrift(self.project_cfg) adapter = SparkAdapter(config) - def hive_thrift_connect(host, port, username, auth, kerberos_service_name): + def hive_thrift_connect(host, port, username, auth, kerberos_service_name, password): self.assertEqual(host, 'myorg.sparkhost.com') self.assertEqual(port, 10001) self.assertEqual(username, 'dbt') self.assertIsNone(auth) self.assertIsNone(kerberos_service_name) + self.assertIsNone(password) with mock.patch.object(hive, 'connect', new=hive_thrift_connect): connection = 
adapter.acquire_connection('dummy') @@ -193,12 +194,13 @@ def test_thrift_connection_kerberos(self): config = self._get_target_thrift_kerberos(self.project_cfg) adapter = SparkAdapter(config) - def hive_thrift_connect(host, port, username, auth, kerberos_service_name): + def hive_thrift_connect(host, port, username, auth, kerberos_service_name, password): self.assertEqual(host, 'myorg.sparkhost.com') self.assertEqual(port, 10001) self.assertEqual(username, 'dbt') self.assertEqual(auth, 'KERBEROS') self.assertEqual(kerberos_service_name, 'hive') + self.assertIsNone(password) with mock.patch.object(hive, 'connect', new=hive_thrift_connect): connection = adapter.acquire_connection('dummy') diff --git a/tox.ini b/tox.ini index 1e0e2b8b6..a75e2a26a 100644 --- a/tox.ini +++ b/tox.ini @@ -2,21 +2,13 @@ skipsdist = True envlist = unit, flake8, integration-spark-thrift - -[testenv:flake8] -basepython = python3.8 -commands = /bin/bash -c '$(which flake8) --max-line-length 99 --select=E,W,F --ignore=W504 dbt/' -passenv = DBT_* PYTEST_ADDOPTS -deps = - -r{toxinidir}/dev_requirements.txt - [testenv:unit] basepython = python3.8 commands = /bin/bash -c '{envpython} -m pytest -v {posargs} tests/unit' passenv = DBT_* PYTEST_ADDOPTS deps = -r{toxinidir}/requirements.txt - -r{toxinidir}/dev_requirements.txt + -r{toxinidir}/dev-requirements.txt [testenv:integration-spark-databricks-http] basepython = python3.8 @@ -24,7 +16,7 @@ commands = /bin/bash -c '{envpython} -m pytest -v --profile databricks_http_clus passenv = DBT_* PYTEST_ADDOPTS deps = -r{toxinidir}/requirements.txt - -r{toxinidir}/dev_requirements.txt + -r{toxinidir}/dev-requirements.txt -e. [testenv:integration-spark-databricks-odbc-cluster] @@ -34,7 +26,7 @@ commands = /bin/bash -c '{envpython} -m pytest -v --profile databricks_cluster { passenv = DBT_* PYTEST_ADDOPTS ODBC_DRIVER deps = -r{toxinidir}/requirements.txt - -r{toxinidir}/dev_requirements.txt + -r{toxinidir}/dev-requirements.txt -e. [testenv:integration-spark-databricks-odbc-sql-endpoint] @@ -44,7 +36,7 @@ commands = /bin/bash -c '{envpython} -m pytest -v --profile databricks_sql_endpo passenv = DBT_* PYTEST_ADDOPTS ODBC_DRIVER deps = -r{toxinidir}/requirements.txt - -r{toxinidir}/dev_requirements.txt + -r{toxinidir}/dev-requirements.txt -e. @@ -55,7 +47,7 @@ commands = /bin/bash -c '{envpython} -m pytest -v --profile apache_spark {posarg passenv = DBT_* PYTEST_ADDOPTS deps = -r{toxinidir}/requirements.txt - -r{toxinidir}/dev_requirements.txt + -r{toxinidir}/dev-requirements.txt -e. [testenv:integration-spark-session] @@ -67,5 +59,5 @@ passenv = PIP_CACHE_DIR deps = -r{toxinidir}/requirements.txt - -r{toxinidir}/dev_requirements.txt + -r{toxinidir}/dev-requirements.txt -e.[session]
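
For reference, the updated unit tests in tests/unit/test_adapter.py now assert that the thrift connection path forwards a password keyword to pyhive.hive.connect alongside host, port, username, auth, and kerberos_service_name (the fixtures above assert it is None when no password is configured). A minimal sketch of that call, assuming an LDAP-authenticated Thrift endpoint, is shown below; the host, credentials, and the "LDAP" auth value are illustrative placeholders rather than values taken from this patch.

from pyhive import hive

# Sketch only: mirrors the keyword arguments asserted by the updated
# hive_thrift_connect mocks; every concrete value here is a placeholder.
conn = hive.connect(
    host="myorg.sparkhost.com",     # same dummy host used in the unit tests
    port=10001,
    username="dbt",
    auth="LDAP",                    # assumption: LDAP-authenticated thrift endpoint
    kerberos_service_name=None,
    password="<ldap-password>",     # now forwarded to the server call
)
cursor = conn.cursor()
cursor.execute("select 1")
print(cursor.fetchall())
cursor.close()
conn.close()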