Skip to content

Commit

Permalink
Added Missing Notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
LearningJournal committed Dec 29, 2023
1 parent c42be84 commit 33d16ad
Show file tree
Hide file tree
Showing 6 changed files with 226 additions and 0 deletions.
1 change: 1 addition & 0 deletions 01-getting-started.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"cells":[{"cell_type":"code","source":["diamonds_df = spark.read.format(\"csv\") \\\n .option(\"header\", \"true\") \\\n .option(\"inferSchema\", \"true\") \\\n .load(\"/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv\")\n\ndiamonds_df.show(10)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b1b0d245-342b-472b-9c14-9e7883cf73f4"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["from pyspark.sql.functions import avg\n\nresults_df = diamonds_df.select(\"color\", \"price\") \\\n .groupBy(\"color\") \\\n .agg(avg(\"price\")) \\\n .sort(\"color\")\n\nresults_df.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"015c9e24-1834-4494-a61e-f842bf41371d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n 
}\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["display(results_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c6d78c3a-7eb6-4376-af28-4bde47c83b4c"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"32c660e0-a73f-43d4-9695-affcc19a0a2d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"01-getting-started","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":2879982568079096}},"nbformat":4,"nbformat_minor":0}
1 change: 1 addition & 0 deletions 02-spark-dataframe-demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"cells":[{"cell_type":"code","source":["raw_fire_df = spark.read \\\n .format(\"csv\") \\\n .option(\"header\", \"true\") \\\n .option(\"inferSchema\", \"true\") \\\n .load(\"/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1493b35d-a05e-4322-949c-2c6a7db9e146"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["raw_fire_df.show(10)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"33c9f1f1-299b-45d9-b7cf-7940ac9e1d80"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n 
}\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["display(raw_fire_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b333860d-334b-42a4-b073-a98bc58b1c43"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["raw_fire_df.createGlobalTempView(\"fire_service_calls_view\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"56d2f0d6-90a7-4399-8f09-9dead0bbf526"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["%sql\nselect * from 
global_temp.fire_service_calls_view"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"75c57597-0f81-40a7-885f-64829f3db180"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"19f40c42-622a-43a2-bd56-d182e528fe6b"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"02-spark-dataframe-demo","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":2787702214819532}},"nbformat":4,"nbformat_minor":0}
71 changes: 71 additions & 0 deletions 03-spark-table-demo.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
-- Databricks notebook source
-- Reset the demo environment: drop the demo table first, then the database.
drop table if exists demo_db.fire_service_calls_tbl;
-- demo_db is a database (it is created with CREATE DATABASE further down),
-- not a view, so DROP VIEW never removed it. CASCADE also drops any objects
-- that are still registered inside the database.
drop database if exists demo_db cascade;

-- COMMAND ----------

-- MAGIC %fs rm -r /user/hive/warehouse/demo_db.db

-- COMMAND ----------

-- Recreate the (now empty) database that the demo table lives in.
create database if not exists demo_db

-- COMMAND ----------

-- Define the Parquet-backed table that the following cells load and query.
-- IF NOT EXISTS makes this cell safe to re-run.
CREATE TABLE IF NOT EXISTS demo_db.fire_service_calls_tbl (
    CallNumber                 INTEGER,
    UnitID                     STRING,
    IncidentNumber             INTEGER,
    CallType                   STRING,
    CallDate                   STRING,
    WatchDate                  STRING,
    CallFinalDisposition       STRING,
    AvailableDtTm              STRING,
    Address                    STRING,
    City                       STRING,
    Zipcode                    INTEGER,
    Battalion                  STRING,
    StationArea                STRING,
    Box                        STRING,
    OriginalPriority           STRING,
    Priority                   STRING,
    FinalPriority              INTEGER,
    ALSUnit                    BOOLEAN,
    CallTypeGroup              STRING,
    NumAlarms                  INTEGER,
    UnitType                   STRING,
    UnitSequenceInCallDispatch INTEGER,
    FirePreventionDistrict     STRING,
    SupervisorDistrict         STRING,
    Neighborhood               STRING,
    Location                   STRING,
    RowID                      STRING,
    Delay                      FLOAT
) USING PARQUET

-- COMMAND ----------

-- Insert a single test row. Values are matched to columns by position, so
-- only the first column (CallNumber) is set to 1234; every other column is null.
insert into demo_db.fire_service_calls_tbl
values(1234, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null,
null, null, null, null, null, null, null, null, null)

-- COMMAND ----------

-- Read the table back to check the test row.
select * from demo_db.fire_service_calls_tbl

-- COMMAND ----------

-- Remove the test row so the table is empty before the real load.
truncate table demo_db.fire_service_calls_tbl

-- COMMAND ----------

-- Load the table from the global temp view. `select *` is matched to the
-- table columns by position.
-- NOTE(review): global_temp.fire_service_calls_view is not created in this
-- notebook; presumably it is registered by 02-spark-dataframe-demo via
-- createGlobalTempView and must still exist when this cell runs -- confirm.
insert into demo_db.fire_service_calls_tbl
select * from global_temp.fire_service_calls_view

-- COMMAND ----------

-- Inspect the loaded data.
select * from demo_db.fire_service_calls_tbl

-- COMMAND ----------


135 changes: 135 additions & 0 deletions 04-spark-sql-demo.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
-- Databricks notebook source
-- Preview at most 100 rows of the table.
select * from demo_db.fire_service_calls_tbl limit 100

-- COMMAND ----------

-- Drop any earlier view of this name so the CACHE TABLE cell below can be re-run.
drop view if exists fire_service_calls_tbl_cache;

-- COMMAND ----------

-- Register the table contents under the name fire_service_calls_tbl_cache and
-- mark it for caching; LAZY defers the caching until the data is first used.
-- NOTE(review): the queries in this notebook read
-- demo_db.fire_service_calls_tbl directly rather than this cached name --
-- confirm that they actually benefit from the cache.
cache lazy table fire_service_calls_tbl_cache as
select * from demo_db.fire_service_calls_tbl

-- COMMAND ----------

-- Total number of rows in the table.
select count(*) from demo_db.fire_service_calls_tbl

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ##### Q1. How many distinct types of calls were made to the Fire Department?

-- COMMAND ----------

-- Count the different call types, ignoring rows without a call type.
SELECT count(DISTINCT callType) AS distinct_call_type_count
  FROM demo_db.fire_service_calls_tbl
 WHERE callType IS NOT NULL

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ##### Q2. What were distinct types of calls made to the Fire Department?

-- COMMAND ----------

-- List every call type once, skipping rows without a call type.
SELECT DISTINCT callType AS distinct_call_types
  FROM demo_db.fire_service_calls_tbl
 WHERE callType IS NOT NULL

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ##### Q3. Find all calls whose response delay was greater than 5 minutes.

-- COMMAND ----------

-- Calls whose Delay value exceeds 5.
SELECT callNumber,
       Delay
  FROM demo_db.fire_service_calls_tbl
 WHERE Delay > 5

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ##### Q4. What were the most common call types?

-- COMMAND ----------

-- Number of calls per call type, most frequent first.
SELECT callType,
       count(*) AS count
  FROM demo_db.fire_service_calls_tbl
 WHERE callType IS NOT NULL
 GROUP BY callType
 ORDER BY count DESC

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ##### Q5. What zip codes accounted for most common calls?

-- COMMAND ----------

-- Number of calls per (call type, zip code) pair, most frequent first.
SELECT callType,
       zipCode,
       count(*) AS count
  FROM demo_db.fire_service_calls_tbl
 WHERE callType IS NOT NULL
 GROUP BY callType, zipCode
 ORDER BY count DESC

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ##### Q6. What San Francisco neighborhoods are in the zip codes 94102 and 94103?

-- COMMAND ----------

-- Return each (zip code, neighborhood) pair once. Without DISTINCT the query
-- yields one row per call, so the same neighborhood is repeated for every
-- call recorded in it instead of answering "which neighborhoods".
select distinct zipCode, neighborhood
from demo_db.fire_service_calls_tbl
where zipCode in (94102, 94103)

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ##### Q7. What was the sum of all call alarms, and the average, min, and max of the call response times?

-- COMMAND ----------

-- Total of NumAlarms plus the average, smallest and largest Delay over all rows.
SELECT sum(NumAlarms),
       avg(Delay),
       min(Delay),
       max(Delay)
  FROM demo_db.fire_service_calls_tbl

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ##### Q8. How many distinct years of data are in the data set?

-- COMMAND ----------

-- Parse callDate as MM/dd/yyyy and list each calendar year once, in ascending order.
SELECT DISTINCT year(to_date(callDate, "MM/dd/yyyy")) AS year_num
  FROM demo_db.fire_service_calls_tbl
 ORDER BY year_num

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ##### Q9. What week of the year in 2018 had the most fire calls?

-- COMMAND ----------

-- Calls per week of the year for 2018, busiest week first.
SELECT weekofyear(to_date(callDate, "MM/dd/yyyy")) AS week_year,
       count(*) AS count
  FROM demo_db.fire_service_calls_tbl
 WHERE year(to_date(callDate, "MM/dd/yyyy")) == 2018
 GROUP BY week_year
 ORDER BY count DESC

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ##### Q10. What neighborhoods in San Francisco had the worst response time in 2018?

-- COMMAND ----------

-- Neighborhood and delay of every 2018 call, largest delay first.
SELECT neighborhood,
       delay
  FROM demo_db.fire_service_calls_tbl
 WHERE year(to_date(callDate, "MM/dd/yyyy")) == 2018
 ORDER BY delay DESC

-- COMMAND ----------


1 change: 1 addition & 0 deletions 05-working-with-dataframe.ipynb

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions HelloSpark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from pyspark.sql import *

if __name__ == "__main__":
    # Build (or reuse) a SparkSession that runs locally with two threads.
    spark = SparkSession.builder \
        .appName("Hello Spark") \
        .master("local[2]") \
        .getOrCreate()

    try:
        # Sample rows as (name, age) tuples.
        data_list = [("Ravi", 28),
                     ("David", 45),
                     ("Abdul", 27)]

        # createDataFrame assigns default column names for plain tuples;
        # toDF replaces them with readable ones.
        df = spark.createDataFrame(data_list).toDF("Name", "Age")
        df.show()
    finally:
        # Always release the session, even if the demo fails part-way.
        spark.stop()

0 comments on commit 33d16ad

Please sign in to comment.