diff --git a/05-working-with-dataframe.ipynb b/05-working-with-dataframe.ipynb index 53247e4..9b82cd7 100644 --- a/05-working-with-dataframe.ipynb +++ b/05-working-with-dataframe.ipynb @@ -1 +1,779 @@ -{"cells":[{"cell_type":"code","source":["from pyspark.sql.functions import *"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"714857fa-35e9-45e3-a6d3-1d16a0cc7f91"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["raw_fire_df = spark.read \\\n .format(\"csv\") \\\n .option(\"header\", \"true\") \\\n .option(\"inferSchema\",\"true\") \\\n .load(\"/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6fac9a4c-8d5f-4eda-aca1-180eb49086c1"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["display(raw_fire_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"28a6868e-f4be-4406-ae8d-f67613d9f493"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["renamed_fire_df = raw_fire_df \\\n .withColumnRenamed(\"Call Number\", \"CallNumber\") \\\n .withColumnRenamed(\"Unit ID\", \"UnitID\") \\\n 
.withColumnRenamed(\"Incident Number\", \"IncidentNumber\") \\\n .withColumnRenamed(\"Call Date\", \"CallDate\") \\\n .withColumnRenamed(\"Watch Date\", \"WatchDate\") \\\n .withColumnRenamed(\"Call Final Disposition\", \"CallFinalDisposition\") \\\n .withColumnRenamed(\"Available DtTm\", \"AvailableDtTm\") \\\n .withColumnRenamed(\"Zipcode of Incident\", \"Zipcode\") \\\n .withColumnRenamed(\"Station Area\", \"StationArea\") \\\n .withColumnRenamed(\"Final Priority\", \"FinalPriority\") \\\n .withColumnRenamed(\"ALS Unit\", \"ALSUnit\") \\\n .withColumnRenamed(\"Call Type Group\", \"CallTypeGroup\") \\\n .withColumnRenamed(\"Unit sequence in call dispatch\", \"UnitSequenceInCallDispatch\") \\\n .withColumnRenamed(\"Fire Prevention District\", \"FirePreventionDistrict\") \\\n .withColumnRenamed(\"Supervisor District\", \"SupervisorDistrict\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3a45840e-00d7-403c-bcb5-f4c2b0e1dbba"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["display(renamed_fire_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"34acbca6-5403-4389-812b-47d79aae4d6d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["renamed_fire_df.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"12343768-27f0-4b8c-bd7d-e43ea856ea6a"}},"outputs":[{"output_type":"display_data","m
etadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["fire_df = renamed_fire_df \\\n .withColumn(\"CallDate\", to_date(\"CallDate\", \"MM/dd/yyyy\")) \\\n .withColumn(\"WatchDate\", to_date(\"WatchDate\", \"MM/dd/yyyy\")) \\\n .withColumn(\"AvailableDtTm\", to_timestamp(\"AvailableDtTm\", \"MM/dd/yyyy hh:mm:ss a\")) \\\n .withColumn(\"Delay\", round(\"Delay\", 2))"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1bfbd70e-ee18-49d6-affd-bc4033116772"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["display(fire_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b2605e0c-3bdb-4e27-ad86-939c80bd3a9d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["fire_df.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"4ed3cca5-aac7-4529-bbec-8c8c85e11b5d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["fire_df.cache()"],"metadata":{"applica
tion/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"2cf3909b-63a5-4140-a744-b3ff9213544e"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q1. How many distinct types of calls were made to the Fire Department?\n```SQL\nselect count(distinct CallType) as distinct_call_type_count\nfrom fire_service_calls_tbl\nwhere CallType is not null\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"7596246e-e79c-4e85-81eb-d05047216212"}}},{"cell_type":"code","source":["fire_df.createOrReplaceTempView(\"fire_service_calls_view\")\nq1_sql_df = spark.sql(\"\"\"\n select count(distinct CallType) as distinct_call_type_count\n from fire_service_calls_view\n where CallType is not null\n \"\"\")\ndisplay(q1_sql_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"92360fa0-13d4-4834-bfc6-bf462125e615"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["q1_df = fire_df.where(\"CallType is not null\") \\\n .select(\"CallType\") \\\n 
.distinct()\nprint(q1_df.count())"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6efce8ff-052a-4bdf-9c1c-0602a4837653"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["q1_df1 = fire_df.where(\"CallType is not null\")\nq1_df2 = q1_df1.select(\"CallType\")\nq1_df3 = q1_df2.distinct()\nprint(q1_df3.count())"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"cbdf0db8-e0a3-41b7-b672-b134b6c811ba"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q2. 
What were distinct types of calls made to the Fire Department?\n```sql\nselect distinct CallType as distinct_call_types\nfrom fire_service_calls_tbl\nwhere CallType is not null\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"7d4f11c9-374c-41df-b4bb-9a0d812a7975"}}},{"cell_type":"code","source":["q2_df = fire_df.where(\"CallType is not null\") \\\n .select(expr(\"CallType as distinct_call_type\")) \\\n .distinct()\nq2_df.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"7af8c2e5-864b-4196-8315-2ef9f9d82c64"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["display(q2_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"822d62b3-8c09-4237-92fb-80a855c202c1"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q3. 
Find out all response for delayed times greater than 5 mins?\n``` sql\nselect CallNumber, Delay\nfrom fire_service_calls_tbl\nwhere Delay > 5\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"85b11023-f976-465a-9aed-0d7de4383722"}}},{"cell_type":"code","source":["fire_df.where(\"Delay > 5\") \\\n .select(\"CallNumber\", \"Delay\") \\\n .show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"06543f39-21ea-4bda-a228-03257788c5f9"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q4. What were the most common call types?\n```sql\nselect CallType, count(*) as count\nfrom fire_service_calls_tbl\nwhere CallType is not null\ngroup by CallType\norder by count desc\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"d682e1e3-28f2-4cb1-87af-3b2a41fa1f9f"}}},{"cell_type":"code","source":["fire_df.select(\"CallType\") \\\n .where(\"CallType is not null\") \\\n .groupBy(\"CallType\") \\\n .count() \\\n .orderBy(\"count\", ascending=False) \\\n .show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b8c3327f-104b-4e9e-b941-9b0a57571d0b"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q5. 
What zip codes accounted for most common calls?\n```sql\nselect CallType, ZipCode, count(*) as count\nfrom fire_service_calls_tbl\nwhere CallType is not null\ngroup by CallType, Zipcode\norder by count desc\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"05668ab2-6c49-46f7-bf75-05bcaa7ca666"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c88d0eb0-fd17-424e-bc49-d0a2231d7d06"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q6. What San Francisco neighborhoods are in the zip codes 94102 and 94103\n```sql\nselect distinct Neighborhood, Zipcode\nfrom fire_service_calls_tbl\nwhere Zipcode== 94102 or Zipcode == 94103\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"48e6bc0a-bcec-4d20-a165-9079aa74adb2"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"21ff24e5-845d-42ad-bba1-157f7cb1dab0"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q7. 
What was the sum of all calls, average, min and max of the response times for calls?\n```sql\nselect sum(NumAlarms), avg(Delay), min(Delay), max(Delay)\nfrom fire_service_calls_tbl\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"bb33f91e-0d67-414b-b056-88fc38b6a6bd"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"aa54b23f-6b28-4b07-80c8-38c319216d11"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q8. How many distinct years of data is in the CSV file?\n```sql\nselect distinct year(to_timestamp(CallDate, \"MM/dd/yyyy\")) as year_num\nfrom fire_service_calls_tbl\norder by year_num\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"8894d83e-905c-4b33-85ba-a85c0b41b5cb"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"cc09d106-3298-459a-905a-7efaea20437a"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q9. 
What week of the year in 2018 had the most fire calls?\n```sql\nselect weekofyear(to_timestamp(CallDate, \"MM/dd/yyyy\")) week_year, count(*) as count\nfrom fire_service_calls_tbl \nwhere year(to_timestamp(CallDate, \"MM/dd/yyyy\")) == 2018\ngroup by week_year\norder by count desc\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b52c7b9e-6493-4572-85e0-4bd5e8534372"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"f8aa82ee-5140-4e78-9d78-35b0c8d40385"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q10. What neighborhoods in San Francisco had the worst response time in 2018?\n```sql\nselect Neighborhood, Delay\nfrom fire_service_calls_tbl \nwhere year(to_timestamp(CallDate, \"MM/dd/yyyy\")) == 2018\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"63144f63-f0e7-4361-97ca-78173498fc2f"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c03714de-f4af-407f-9227-dae8df634a4a"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"05-working-with-dataframe","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":384359114294295}},"nbformat":4,"nbformat_minor":0} +{ + 
"cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "714857fa-35e9-45e3-a6d3-1d16a0cc7f91", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import *" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6fac9a4c-8d5f-4eda-aca1-180eb49086c1", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "raw_fire_df = spark.read \\\n", + " .format(\"csv\") \\\n", + " .option(\"header\", \"true\") \\\n", + " .option(\"inferSchema\",\"true\") \\\n", + " .load(\"/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "28a6868e-f4be-4406-ae8d-f67613d9f493", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "display(raw_fire_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3a45840e-00d7-403c-bcb5-f4c2b0e1dbba", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "renamed_fire_df = raw_fire_df \\\n", + " .withColumnRenamed(\"Call Number\", \"CallNumber\") \\\n", + " .withColumnRenamed(\"Unit ID\", \"UnitID\") \\\n", + " .withColumnRenamed(\"Incident Number\", \"IncidentNumber\") \\\n", + " .withColumnRenamed(\"Call Date\", \"CallDate\") \\\n", + " .withColumnRenamed(\"Watch Date\", \"WatchDate\") \\\n", + " 
.withColumnRenamed(\"Call Final Disposition\", \"CallFinalDisposition\") \\\n", + " .withColumnRenamed(\"Available DtTm\", \"AvailableDtTm\") \\\n", + " .withColumnRenamed(\"Zipcode of Incident\", \"Zipcode\") \\\n", + " .withColumnRenamed(\"Station Area\", \"StationArea\") \\\n", + " .withColumnRenamed(\"Final Priority\", \"FinalPriority\") \\\n", + " .withColumnRenamed(\"ALS Unit\", \"ALSUnit\") \\\n", + " .withColumnRenamed(\"Call Type Group\", \"CallTypeGroup\") \\\n", + " .withColumnRenamed(\"Unit sequence in call dispatch\", \"UnitSequenceInCallDispatch\") \\\n", + " .withColumnRenamed(\"Fire Prevention District\", \"FirePreventionDistrict\") \\\n", + " .withColumnRenamed(\"Supervisor District\", \"SupervisorDistrict\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "34acbca6-5403-4389-812b-47d79aae4d6d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "display(renamed_fire_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "12343768-27f0-4b8c-bd7d-e43ea856ea6a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "renamed_fire_df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1bfbd70e-ee18-49d6-affd-bc4033116772", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df = renamed_fire_df \\\n", + " .withColumn(\"CallDate\", to_date(\"CallDate\", \"MM/dd/yyyy\")) \\\n", + " .withColumn(\"WatchDate\", to_date(\"WatchDate\", \"MM/dd/yyyy\")) \\\n", + " 
.withColumn(\"AvailableDtTm\", to_timestamp(\"AvailableDtTm\", \"MM/dd/yyyy hh:mm:ss a\")) \\\n", + " .withColumn(\"Delay\", round(\"Delay\", 2))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b2605e0c-3bdb-4e27-ad86-939c80bd3a9d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "display(fire_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4ed3cca5-aac7-4529-bbec-8c8c85e11b5d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2cf3909b-63a5-4140-a744-b3ff9213544e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df.cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7596246e-e79c-4e85-81eb-d05047216212", + "showTitle": false, + "title": "" + } + }, + "source": [ + "##### Q1. 
How many distinct types of calls were made to the Fire Department?\n", + "```SQL\n", + "select count(distinct CallType) as distinct_call_type_count\n", + "from fire_service_calls_tbl\n", + "where CallType is not null\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "92360fa0-13d4-4834-bfc6-bf462125e615", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df.createOrReplaceTempView(\"fire_service_calls_view\")\n", + "q1_sql_df = spark.sql(\"\"\"\n", + " select count(distinct CallType) as distinct_call_type_count\n", + " from fire_service_calls_view\n", + " where CallType is not null\n", + " \"\"\")\n", + "display(q1_sql_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6efce8ff-052a-4bdf-9c1c-0602a4837653", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "q1_df = fire_df.where(\"CallType is not null\") \\\n", + " .select(\"CallType\") \\\n", + " .distinct()\n", + "print(q1_df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cbdf0db8-e0a3-41b7-b672-b134b6c811ba", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "q1_df1 = fire_df.where(\"CallType is not null\")\n", + "q1_df2 = q1_df1.select(\"CallType\")\n", + "q1_df3 = q1_df2.distinct()\n", + "print(q1_df3.count())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": 
{}, + "nuid": "7d4f11c9-374c-41df-b4bb-9a0d812a7975", + "showTitle": false, + "title": "" + } + }, + "source": [ + "##### Q2. What were distinct types of calls made to the Fire Department?\n", + "```sql\n", + "select distinct CallType as distinct_call_types\n", + "from fire_service_calls_tbl\n", + "where CallType is not null\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7af8c2e5-864b-4196-8315-2ef9f9d82c64", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "q2_df = fire_df.where(\"CallType is not null\") \\\n", + " .select(expr(\"CallType as distinct_call_type\")) \\\n", + " .distinct()\n", + "q2_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "822d62b3-8c09-4237-92fb-80a855c202c1", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "display(q2_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "85b11023-f976-465a-9aed-0d7de4383722", + "showTitle": false, + "title": "" + } + }, + "source": [ + "##### Q3. 
Find all responses with a delay greater than 5 mins.\n", + "```sql\n", + "select CallNumber, Delay\n", + "from fire_service_calls_tbl\n", + "where Delay > 5\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "06543f39-21ea-4bda-a228-03257788c5f9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df.where(\"Delay > 5\") \\\n", + " .select(\"CallNumber\", \"Delay\") \\\n", + " .show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d682e1e3-28f2-4cb1-87af-3b2a41fa1f9f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "##### Q4. What were the most common call types?\n", + "```sql\n", + "select CallType, count(*) as count\n", + "from fire_service_calls_tbl\n", + "where CallType is not null\n", + "group by CallType\n", + "order by count desc\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b8c3327f-104b-4e9e-b941-9b0a57571d0b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df.select(\"CallType\") \\\n", + " .where(\"CallType is not null\") \\\n", + " .groupBy(\"CallType\") \\\n", + " .count() \\\n", + " .orderBy(\"count\", ascending=False) \\\n", + " .show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "05668ab2-6c49-46f7-bf75-05bcaa7ca666", + "showTitle": false, + "title": "" + } + }, + "source": [ + "##### Q5. 
What zip codes accounted for most common calls?\n", + "```sql\n", + "select CallType, ZipCode, count(*) as count\n", + "from fire_service_calls_tbl\n", + "where CallType is not null\n", + "group by CallType, Zipcode\n", + "order by count desc\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c88d0eb0-fd17-424e-bc49-d0a2231d7d06", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df.where(\"CallType is not null\") \\\n", + " .select(\"CallType\", \"ZipCode\") \\\n", + " .groupBy(\"CallType\", \"ZipCode\") \\\n", + " .count() \\\n", + " .orderBy(\"count\", ascending=False) \\\n", + " .show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "48e6bc0a-bcec-4d20-a165-9079aa74adb2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "##### Q6. 
What San Francisco neighborhoods are in the zip codes 94102 and 94103?\n", + "```sql\n", + "select distinct Neighborhood, Zipcode\n", + "from fire_service_calls_tbl\n", + "where Zipcode == 94102 or Zipcode == 94103\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "21ff24e5-845d-42ad-bba1-157f7cb1dab0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df.where(\"ZipCode==94102 or ZipCode==94103\") \\\n", + " .select(\"Neighborhood\", \"Zipcode\") \\\n", + " .distinct() \\\n", + " .show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bb33f91e-0d67-414b-b056-88fc38b6a6bd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "##### Q7. What were the sum of all call alarms and the average, min, and max of the response times for calls?\n", + "```sql\n", + "select sum(NumAlarms), avg(Delay), min(Delay), max(Delay)\n", + "from fire_service_calls_tbl\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "aa54b23f-6b28-4b07-80c8-38c319216d11", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df.select(sum(\"NumAlarms\"), avg(\"Delay\"), min(\"Delay\"), max(\"Delay\")) \\\n", + " .show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8894d83e-905c-4b33-85ba-a85c0b41b5cb", + "showTitle": false, + "title": "" + } + }, + "source": [ + "##### Q8. 
How many distinct years of data are in the CSV file?\n", + "```sql\n", + "select distinct year(to_timestamp(CallDate, \"MM/dd/yyyy\")) as year_num\n", + "from fire_service_calls_tbl\n", + "order by year_num\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cc09d106-3298-459a-905a-7efaea20437a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df.select(year(\"CallDate\").alias(\"year_num\")) \\\n", + " .distinct() \\\n", + " .orderBy(\"year_num\") \\\n", + " .show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b52c7b9e-6493-4572-85e0-4bd5e8534372", + "showTitle": false, + "title": "" + } + }, + "source": [ + "##### Q9. What week of the year in 2018 had the most fire calls?\n", + "```sql\n", + "select weekofyear(to_timestamp(CallDate, \"MM/dd/yyyy\")) week_year, count(*) as count\n", + "from fire_service_calls_tbl \n", + "where year(to_timestamp(CallDate, \"MM/dd/yyyy\")) == 2018\n", + "group by week_year\n", + "order by count desc\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f8aa82ee-5140-4e78-9d78-35b0c8d40385", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df.select(weekofyear(\"CallDate\").alias(\"week_year\")) \\\n", + " .where(year(\"CallDate\") == 2018) \\\n", + " .groupBy(\"week_year\") \\\n", + " .count() \\\n", + " .orderBy(\"count\", ascending=False) \\\n", + " .show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + 
"cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "63144f63-f0e7-4361-97ca-78173498fc2f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "##### Q10. What neighborhoods in San Francisco had the worst response time in 2018?\n", + "```sql\n", + "select Neighborhood, Delay\n", + "from fire_service_calls_tbl \n", + "where year(to_timestamp(CallDate, \"MM/dd/yyyy\")) == 2018\n", + "order by Delay desc\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c03714de-f4af-407f-9227-dae8df634a4a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "fire_df.select(\"Neighborhood\", \"Delay\") \\\n", + " .where(year(\"CallDate\") == 2018) \\\n", + " .orderBy(\"Delay\", ascending=False) \\\n", + " .show()" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "environmentMetadata": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "05-working-with-dataframe (1)", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}