From 0df1df07b64c57daab7e12f1d951194d5251fe41 Mon Sep 17 00:00:00 2001 From: Bill Date: Mon, 10 Jun 2024 17:45:41 +0100 Subject: [PATCH 1/2] =?UTF-8?q?=E2=9C=A8=20docs:=20refine=20the=20SQL=20ch?= =?UTF-8?q?allenges?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 8 ++--- .../challenging-sql-problems.md | 6 ++++ .../problems/silver/funnel-analytics.md | 22 ++++++------- .../bronze/personalised-customer-emails.md | 2 ++ .../temperature-anomaly-detection--duckdb.sql | 8 ++--- .../temperature-anomaly-detection--sqlite.sql | 32 +++++++++++++++++++ .../bronze/temperature-anomaly-detection.md | 7 +++- 7 files changed, 64 insertions(+), 21 deletions(-) create mode 100644 docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection--sqlite.sql diff --git a/README.md b/README.md index 1089829..06de52b 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ [![GitHub last commit](https://img.shields.io/github/last-commit/Bilbottom/sql-learning-materials)](https://shields.io/badges/git-hub-last-commit-by-committer) [![SQL Server](https://img.shields.io/badge/SQL%20Server-2022-teal.svg)](https://www.microsoft.com/en-gb/sql-server/sql-server-downloads) -[![PostgreSQL](https://img.shields.io/badge/PostgreSQL-15.4-teal.svg)](https://www.postgresql.org/download/) -[![SQLite](https://img.shields.io/badge/SQLite-3.43-teal.svg)](https://www.sqlite.org/index.html) -[![DuckDB](https://img.shields.io/badge/DuckDB-0.9-teal.svg)](https://duckdb.org/) +[![PostgreSQL](https://img.shields.io/badge/PostgreSQL-16.2-teal.svg)](https://www.postgresql.org/download/) +[![SQLite](https://img.shields.io/badge/SQLite-3.45-teal.svg)](https://www.sqlite.org/index.html) +[![DuckDB](https://img.shields.io/badge/DuckDB-1.0-teal.svg)](https://duckdb.org/) [![Metabase](https://img.shields.io/badge/Metabase-0.47-teal.svg)](https://www.metabase.com/) @@ -21,7 +21,7 @@ SQL scripts that demonstrate various features and concepts. This project contains a bunch of SQL learning materials aimed at different levels of experience and covering a variety of topics. It focuses on just writing `SELECT` statements so there will be very few resources for anything else. -Jump into [`docs/index.md`](docs/index.md) to see the summary of what's covered in this project, and continue below for instructions on how to set up the databases. +Jump into [https://bilbottom.github.io/sql-learning-materials/](https://bilbottom.github.io/sql-learning-materials/) to see the summary of what's covered in this project, and continue below for instructions on how to set up the databases. ## Acknowledgements diff --git a/docs/challenging-sql-problems/challenging-sql-problems.md b/docs/challenging-sql-problems/challenging-sql-problems.md index f742cc6..d1546b8 100644 --- a/docs/challenging-sql-problems/challenging-sql-problems.md +++ b/docs/challenging-sql-problems/challenging-sql-problems.md @@ -4,6 +4,12 @@ > > These questions are not for people new to SQL! These expect you to use advanced SQL techniques that most people don't know. +> [!NOTE] +> +> The database versions used for the solutions can be found at the top of the following page: +> +> - https://github.com/Bilbottom/sql-learning-materials/blob/main/README.md + ## Problems ### 🟤 Bronze Tier diff --git a/docs/challenging-sql-problems/problems/silver/funnel-analytics.md b/docs/challenging-sql-problems/problems/silver/funnel-analytics.md index 4948976..29d7a64 100644 --- a/docs/challenging-sql-problems/problems/silver/funnel-analytics.md +++ b/docs/challenging-sql-problems/problems/silver/funnel-analytics.md @@ -47,17 +47,17 @@ The solution can be found at: >? INFO: **Sample output** > -| cohort | stage | mortgages | step_rate | total_rate | -|:--------|:---------------------|----------:|----------:|-----------:| -| 2024-01 | full application | 4 | 100.00 | 100.00 | -| 2024-01 | decision | 4 | 100.00 | 100.00 | -| 2024-01 | documentation | 3 | 75.00 | 75.00 | -| 2024-01 | valuation inspection | 3 | 100.00 | 75.00 | -| 2024-01 | valuation made | 3 | 100.00 | 75.00 | -| 2024-01 | valuation submitted | 3 | 100.00 | 75.00 | -| 2024-01 | solicitation | 1 | 33.33 | 25.00 | -| 2024-01 | funds released | 1 | 100.00 | 25.00 | -| ... | ... | ... | ... | ... | +> | cohort | stage | mortgages | step_rate | total_rate | +> |:--------|:---------------------|----------:|----------:|-----------:| +> | 2024-01 | full application | 4 | 100.00 | 100.00 | +> | 2024-01 | decision | 4 | 100.00 | 100.00 | +> | 2024-01 | documentation | 3 | 75.00 | 75.00 | +> | 2024-01 | valuation inspection | 3 | 100.00 | 75.00 | +> | 2024-01 | valuation made | 3 | 100.00 | 75.00 | +> | 2024-01 | valuation submitted | 3 | 100.00 | 75.00 | +> | 2024-01 | solicitation | 1 | 33.33 | 25.00 | +> | 2024-01 | funds released | 1 | 100.00 | 25.00 | +> | ... | ... | ... | ... | ... | >? TIP: **Hint 1** diff --git a/docs/challenging-sql-problems/solutions/bronze/personalised-customer-emails.md b/docs/challenging-sql-problems/solutions/bronze/personalised-customer-emails.md index 817011e..8589bd0 100644 --- a/docs/challenging-sql-problems/solutions/bronze/personalised-customer-emails.md +++ b/docs/challenging-sql-problems/solutions/bronze/personalised-customer-emails.md @@ -35,4 +35,6 @@ Some SQL solutions per database are provided below. > SUCCESS: **SQL Server** > +> This SQL Server solution uses the Soundex differences with a 3 (out of 4) match threshold, but this isn't the only way to solve this problem. +> --8<-- "docs/challenging-sql-problems/solutions/bronze/personalised-customer-emails--sql-server.sql" diff --git a/docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection--duckdb.sql b/docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection--duckdb.sql index 0ac61b0..31189a7 100644 --- a/docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection--duckdb.sql +++ b/docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection--duckdb.sql @@ -4,8 +4,7 @@ with temperatures as ( site_id, reading_datetime, temperature, - avg(temperature) over rows_around_site_reading as average_temperature, - count(temperature) over rows_around_site_reading as count_of_rows + avg(temperature) over rows_around_site_reading as average_temperature from readings window rows_around_site_reading as ( partition by site_id @@ -14,6 +13,7 @@ with temperatures as ( and 2 following exclude current row ) + qualify 4 = count(*) over rows_around_site_reading ) select @@ -23,9 +23,7 @@ select round(average_temperature, 4) as average_temperature, round(100.0 * (temperature - average_temperature) / average_temperature, 4) as percentage_increase from temperatures -where 1=1 - and count_of_rows = 4 - and (temperature - average_temperature) / average_temperature > 0.1 +where percentage_increase > 10 order by site_id, reading_datetime diff --git a/docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection--sqlite.sql b/docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection--sqlite.sql new file mode 100644 index 0000000..6b69a15 --- /dev/null +++ b/docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection--sqlite.sql @@ -0,0 +1,32 @@ +```sql +with temperatures as ( + select + site_id, + reading_datetime, + temperature, + avg(temperature) over rows_around_site_reading as average_temperature, + count(*) over rows_around_site_reading as count_of_rows + from readings + window rows_around_site_reading as ( + partition by site_id + order by reading_datetime + rows between 2 preceding + and 2 following + exclude current row + ) +) + +select + site_id, + reading_datetime, + temperature, + round(average_temperature, 4) as average_temperature, + round(100.0 * (temperature - average_temperature) / average_temperature, 4) as percentage_increase +from temperatures +where 1=1 + and count_of_rows = 4 + and (temperature - average_temperature) / average_temperature > 0.1 +order by + site_id, + reading_datetime +``` diff --git a/docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection.md b/docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection.md index ba6e7be..45178e3 100644 --- a/docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection.md +++ b/docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection.md @@ -26,10 +26,15 @@ Regardless of the database, the result set should look like: Some SQL solutions per database are provided below. -> SUCCESS: **DuckDB, SQLite, PostgreSQL** +> SUCCESS: **DuckDB** > --8<-- "docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection--duckdb.sql" + +> SUCCESS: **SQLite, PostgreSQL** +> +--8<-- "docs/challenging-sql-problems/solutions/bronze/temperature-anomaly-detection--sqlite.sql" + > SUCCESS: **Snowflake** > From c9c5790515e69dddb3590c504bfe895b2391996b Mon Sep 17 00:00:00 2001 From: Bill Date: Mon, 10 Jun 2024 21:53:01 +0100 Subject: [PATCH 2/2] =?UTF-8?q?=E2=9C=A8=20docs:=20add=20linear=20regressi?= =?UTF-8?q?on=20SQL=20problem?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../challenging-sql-problems.md | 3 +- .../problems/silver/predicting-values.md | 52 +++++++++++++++++++ .../problems/silver/predicting-values.sql | 26 ++++++++++ .../predicting-values--duckdb--manual.sql | 42 +++++++++++++++ .../predicting-values--duckdb--regr.sql | 38 ++++++++++++++ .../solutions/silver/predicting-values.md | 39 ++++++++++++++ .../solutions/silver/predicting-values.sql | 8 +++ mkdocs.yml | 2 + 8 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 docs/challenging-sql-problems/problems/silver/predicting-values.md create mode 100644 docs/challenging-sql-problems/problems/silver/predicting-values.sql create mode 100644 docs/challenging-sql-problems/solutions/silver/predicting-values--duckdb--manual.sql create mode 100644 docs/challenging-sql-problems/solutions/silver/predicting-values--duckdb--regr.sql create mode 100644 docs/challenging-sql-problems/solutions/silver/predicting-values.md create mode 100644 docs/challenging-sql-problems/solutions/silver/predicting-values.sql diff --git a/docs/challenging-sql-problems/challenging-sql-problems.md b/docs/challenging-sql-problems/challenging-sql-problems.md index d1546b8..89daf58 100644 --- a/docs/challenging-sql-problems/challenging-sql-problems.md +++ b/docs/challenging-sql-problems/challenging-sql-problems.md @@ -30,7 +30,8 @@ These require a bit more thinking. 2. [Bannable login activity](problems/silver/bannable-login-activity.md) 3. [Bus routes](problems/silver/bus-routes.md) 4. [Region precipitation](problems/silver/region-precipitation.md) -5. [Customer sales running totals](problems/silver/customer-sales-running-totals.md) +5. [Predicting values](problems/silver/predicting-values.md) +6. [Customer sales running totals](problems/silver/customer-sales-running-totals.md) ### 🟡 Gold Tier diff --git a/docs/challenging-sql-problems/problems/silver/predicting-values.md b/docs/challenging-sql-problems/problems/silver/predicting-values.md new file mode 100644 index 0000000..f662fdf --- /dev/null +++ b/docs/challenging-sql-problems/problems/silver/predicting-values.md @@ -0,0 +1,52 @@ +# Predicting values 🎱 + +> [!SUCCESS] Scenario +> +> Some students are studying [Anscombe's quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet) and have been asked to predict the `y` values for a given set of `x` values for each of the four datasets using [linear regression](https://en.wikipedia.org/wiki/Linear_regression). + +> [!QUESTION] +> +> For each of the four datasets in Anscombe's quartet, use linear regression to predict the `y` values for `x` values `16`, `17`, and `18`. +> +> The output should have a row for each `x` value (`16`, `17`, `18`), with the columns: +> +> - `x` +> - `dataset_1` as the predicted value for dataset 1, rounded to 1 decimal place +> - `dataset_2` as the predicted value for dataset 2, rounded to 1 decimal place +> - `dataset_3` as the predicted value for dataset 3, rounded to 1 decimal place +> - `dataset_4` as the predicted value for dataset 4, rounded to 1 decimal place +> +> Order the output by `x`. + +
+Expand for the DDL +--8<-- "docs/challenging-sql-problems/problems/silver/predicting-values.sql" +
+ +There are plenty of resources online that walk through the maths behind linear regression, such as: + +- [https://www.youtube.com/watch?v=GAmzwIkGFgE](https://www.youtube.com/watch?v=GAmzwIkGFgE) + +The solution can be found at: + +- [predicting-values.md](../../solutions/silver/predicting-values.md) + +--- + + +>? INFO: **Sample output** +> +> | x | dataset_1 | dataset_2 | dataset_3 | dataset_4 | +> |----:|----------:|----------:|----------:|----------:| +> | 16 | 11.0 | 11.0 | 11.0 | 11.0 | +> | ... | ... | ... | ... | ... | + + +>? TIP: **Hint 1** +> +> Unpivot the datasets so that you have a table with headers `dataset`, `x`, and `y`, then apply the linear regression, and finally pivot the results back. + + +>? TIP: **Hint 2** +> +> For databases that support them, use the `regr_slope` and `regr_intercept` functions (or equivalent) to calculate the slope and intercept of the regression line. Otherwise, you'll need to calculate these manually 😄 diff --git a/docs/challenging-sql-problems/problems/silver/predicting-values.sql b/docs/challenging-sql-problems/problems/silver/predicting-values.sql new file mode 100644 index 0000000..f23361b --- /dev/null +++ b/docs/challenging-sql-problems/problems/silver/predicting-values.sql @@ -0,0 +1,26 @@ +```sql +create table anscombes_quartet ( + dataset_1__x int, + dataset_1__y decimal(5, 2), + dataset_2__x int, + dataset_2__y decimal(5, 2), + dataset_3__x int, + dataset_3__y decimal(5, 2), + dataset_4__x int, + dataset_4__y decimal(5, 2), +); +insert into anscombes_quartet +values + (10, 8.04, 10, 9.14, 10, 7.46, 8, 6.58), + ( 8, 6.95, 8, 8.14, 8, 6.77, 8, 5.76), + (13, 7.58, 13, 8.74, 13, 12.74, 8, 7.71), + ( 9, 8.81, 9, 8.77, 9, 7.11, 8, 8.84), + (11, 8.33, 11, 9.26, 11, 7.81, 8, 8.47), + (14, 9.96, 14, 8.10, 14, 8.84, 8, 7.04), + ( 6, 7.24, 6, 6.13, 6, 6.08, 8, 5.25), + ( 4, 4.26, 4, 3.10, 4, 5.39, 19, 12.50), + (12, 10.84, 12, 9.13, 12, 8.15, 8, 5.56), + ( 7, 4.82, 7, 7.26, 7, 6.42, 8, 7.91), + ( 5, 5.68, 5, 4.74, 5, 5.73, 8, 6.89) +; +``` diff --git a/docs/challenging-sql-problems/solutions/silver/predicting-values--duckdb--manual.sql b/docs/challenging-sql-problems/solutions/silver/predicting-values--duckdb--manual.sql new file mode 100644 index 0000000..702eb0d --- /dev/null +++ b/docs/challenging-sql-problems/solutions/silver/predicting-values--duckdb--manual.sql @@ -0,0 +1,42 @@ +```sql +with + +unpivoted as ( + unpivot anscombes_quartet + on + (dataset_1__x, dataset_1__y) as dataset_1, + (dataset_2__x, dataset_2__y) as dataset_2, + (dataset_3__x, dataset_3__y) as dataset_3, + (dataset_4__x, dataset_4__y) as dataset_4, + into + name dataset + value x, y +), + +coefficients as ( + select + dataset, + avg(x) as avg_x, + avg(y) as avg_y, + avg(x * x) as avg_xx, + avg(x * y) as avg_xy, + (avg_x * avg_y - avg_xy) / (avg_x * avg_x - avg_xx) as m, + avg_y - m * avg_x as c, + from unpivoted + group by dataset +), + +predictions as ( + select + dataset, + x, + round(m * x + c, 1) as y + from coefficients + cross join (values (16), (17), (18)) as v(x) +) + +pivot predictions +on dataset +using any_value(y) +order by x +``` diff --git a/docs/challenging-sql-problems/solutions/silver/predicting-values--duckdb--regr.sql b/docs/challenging-sql-problems/solutions/silver/predicting-values--duckdb--regr.sql new file mode 100644 index 0000000..2d31662 --- /dev/null +++ b/docs/challenging-sql-problems/solutions/silver/predicting-values--duckdb--regr.sql @@ -0,0 +1,38 @@ +```sql +with + +unpivoted as ( + unpivot anscombes_quartet + on + (dataset_1__x, dataset_1__y) as dataset_1, + (dataset_2__x, dataset_2__y) as dataset_2, + (dataset_3__x, dataset_3__y) as dataset_3, + (dataset_4__x, dataset_4__y) as dataset_4, + into + name dataset + value x, y +), + +coefficients as ( + select + dataset, + regr_slope(y, x) as m, + regr_intercept(y, x) as c, + from unpivoted + group by dataset +), + +predictions as ( + select + dataset, + x, + round(m * x + c, 1) as y + from coefficients + cross join (values (16), (17), (18)) as v(x) +) + +pivot predictions +on dataset +using any_value(y) +order by x +``` diff --git a/docs/challenging-sql-problems/solutions/silver/predicting-values.md b/docs/challenging-sql-problems/solutions/silver/predicting-values.md new file mode 100644 index 0000000..d045b6c --- /dev/null +++ b/docs/challenging-sql-problems/solutions/silver/predicting-values.md @@ -0,0 +1,39 @@ +# Predicting values 🎱 + +> [!TIP] +> +> Solution to the following problem: +> +> - [predicting-values.md](../../problems/silver/predicting-values.md) + +## Result Set + +Regardless of the database, the result set should look like: + +| x | dataset_1 | dataset_2 | dataset_3 | dataset_4 | +| --: | --------: | --------: | --------: | --------: | +| 16 | 11.0 | 11.0 | 11.0 | 11.0 | +| 17 | 11.5 | 11.5 | 11.5 | 11.5 | +| 18 | 12.0 | 12.0 | 12.0 | 12.0 | + +This is one of the interesting things about Anscombe's quartet (and is the reason Anscombe created it): the four datasets have the same summary statistics, but look very different when plotted! + +
+Expand for the DDL +--8<-- "docs/challenging-sql-problems/solutions/silver/predicting-values.sql" +
+ +## Solution + +Some SQL solutions per database are provided below. + + +> SUCCESS: **DuckDB** +> +> Here's a solution using the `regr_slope` and `regr_intercept` functions: +> +--8<-- "docs/challenging-sql-problems/solutions/silver/predicting-values--duckdb--regr.sql" +> +> ...and one doing this manually: +> +--8<-- "docs/challenging-sql-problems/solutions/silver/predicting-values--duckdb--manual.sql" diff --git a/docs/challenging-sql-problems/solutions/silver/predicting-values.sql b/docs/challenging-sql-problems/solutions/silver/predicting-values.sql new file mode 100644 index 0000000..7daecca --- /dev/null +++ b/docs/challenging-sql-problems/solutions/silver/predicting-values.sql @@ -0,0 +1,8 @@ +```sql +select * +from values + (16, 11.0, 11.0, 11.0, 11.0), + (17, 11.5, 11.5, 11.5, 11.5), + (18, 12.0, 12.0, 12.0, 12.0) +as solution(x, dataset_1, dataset_2, dataset_3, dataset_4) +``` diff --git a/mkdocs.yml b/mkdocs.yml index 0d68574..37ee7c1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -85,6 +85,7 @@ nav: - challenging-sql-problems/problems/silver/bannable-login-activity.md # window functions (gaps and islands, range between) - challenging-sql-problems/problems/silver/bus-routes.md # recursive CTE - challenging-sql-problems/problems/silver/region-precipitation.md # unpivot and rollup + - challenging-sql-problems/problems/silver/predicting-values.md # unpivot, regr, and pivot - challenging-sql-problems/problems/silver/customer-sales-running-totals.md # window functions - 🟡 Gold Tier: - challenging-sql-problems/problems/gold/loan-repayment-schedule.md # recursive CTE @@ -101,6 +102,7 @@ nav: - challenging-sql-problems/solutions/silver/bannable-login-activity.md - challenging-sql-problems/solutions/silver/bus-routes.md - challenging-sql-problems/solutions/silver/region-precipitation.md + - challenging-sql-problems/solutions/silver/predicting-values.md - challenging-sql-problems/solutions/silver/customer-sales-running-totals.md - 🟡 Gold Tier: - challenging-sql-problems/solutions/gold/loan-repayment-schedule.md