{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Getting spark JARs\n",
      "Creating SparkSession\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<a href=\"http://192.168.1.5:4040\">Spark UI</a>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36m$ivy.$                                  \n",
       "\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.log4j.{Level, Logger}\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql._\n",
       "\n",
       "\u001b[39m\n",
       "\u001b[36mspark\u001b[39m: \u001b[32mSparkSession\u001b[39m = org.apache.spark.sql.SparkSession@4f21ba60\n",
       "defined \u001b[32mfunction\u001b[39m \u001b[36msc\u001b[39m"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import $ivy.`org.apache.spark::spark-sql:2.4.0`\n",
    "\n",
    "import org.apache.log4j.{Level, Logger}\n",
    "Logger.getLogger(\"org\").setLevel(Level.OFF)\n",
    "\n",
    "import org.apache.spark.sql._\n",
    "\n",
    "val spark = {\n",
    "  NotebookSparkSession.builder()\n",
    "    .config(\"spark.sql.autoBroadcastJoinThreshold\", 0)\n",
    "    .master(\"local[*]\")\n",
    "    .getOrCreate()\n",
    "}\n",
    "\n",
    "def sc = spark.sparkContext // Разница между def, val, lazy val"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">foreach at cmd34.sc:3</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    8 / 8\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001b[36maccum\u001b[39m: \u001b[32morg\u001b[39m.\u001b[32mapache\u001b[39m.\u001b[32mspark\u001b[39m.\u001b[32mutil\u001b[39m.\u001b[32mLongAccumulator\u001b[39m = LongAccumulator(id: 1355, name: Some(My Accumulator), value: 10)\n",
       "\u001b[36mres34_2\u001b[39m: \u001b[32mjava\u001b[39m.\u001b[32mlang\u001b[39m.\u001b[32mLong\u001b[39m = \u001b[32m10L\u001b[39m"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val accum = sc.longAccumulator(\"My Accumulator\")\n",
    "\n",
    "sc.parallelize(Array(1, 2, 3, 4)).foreach(x => accum.add(x))\n",
    "\n",
    "accum.value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">csv at cmd17.sc:5</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    1 / 1\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">csv at cmd17.sc:5</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    8 / 8\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">show at cmd17.sc:7</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    1 / 1\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------+------------+--------------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-----------------+-----------+------------+--------------------+\n",
      "|INCIDENT_NUMBER|OFFENSE_CODE|  OFFENSE_CODE_GROUP| OFFENSE_DESCRIPTION|DISTRICT|REPORTING_AREA|SHOOTING|   OCCURRED_ON_DATE|YEAR|MONTH|DAY_OF_WEEK|HOUR|  UCR_PART|           STREET|        Lat|        Long|            Location|\n",
      "+---------------+------------+--------------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-----------------+-----------+------------+--------------------+\n",
      "|     I182070945|         619|             Larceny|  LARCENY ALL OTHERS|     D14|           808|    null|2018-09-02 13:00:00|2018|    9|     Sunday|  13|  Part One|       LINCOLN ST|42.35779134|-71.13937053|(42.35779134, -71...|\n",
      "|     I182070943|        1402|           Vandalism|           VANDALISM|     C11|           347|    null|2018-08-21 00:00:00|2018|    8|    Tuesday|   0|  Part Two|         HECLA ST|42.30682138|-71.06030035|(42.30682138, -71...|\n",
      "|     I182070941|        3410|               Towed| TOWED MOTOR VEHICLE|      D4|           151|    null|2018-09-03 19:27:00|2018|    9|     Monday|  19|Part Three|      CAZENOVE ST|42.34658879|-71.07242943|(42.34658879, -71...|\n",
      "|     I182070940|        3114|Investigate Property|INVESTIGATE PROPERTY|      D4|           272|    null|2018-09-03 21:16:00|2018|    9|     Monday|  21|Part Three|       NEWCOMB ST|42.33418175|-71.07866441|(42.33418175, -71...|\n",
      "|     I182070938|        3114|Investigate Property|INVESTIGATE PROPERTY|      B3|           421|    null|2018-09-03 21:05:00|2018|    9|     Monday|  21|Part Three|         DELHI ST|42.27536542|-71.09036101|(42.27536542, -71...|\n",
      "|     I182070936|        3820|Motor Vehicle Acc...|M/V ACCIDENT INVO...|     C11|           398|    null|2018-09-03 21:09:00|2018|    9|     Monday|  21|Part Three|       TALBOT AVE|42.29019621|-71.07159012|(42.29019621, -71...|\n",
      "|     I182070933|         724|          Auto Theft|          AUTO THEFT|      B2|           330|    null|2018-09-03 21:25:00|2018|    9|     Monday|  21|  Part One|      NORMANDY ST|42.30607218| -71.0827326|(42.30607218, -71...|\n",
      "|     I182070932|        3301|     Verbal Disputes|      VERBAL DISPUTE|      B2|           584|    null|2018-09-03 20:39:37|2018|    9|     Monday|  20|Part Three|          LAWN ST|42.32701648|-71.10555088|(42.32701648, -71...|\n",
      "|     I182070931|         301|             Robbery|    ROBBERY - STREET|      C6|           177|    null|2018-09-03 20:48:00|2018|    9|     Monday|  20|  Part One|MASSACHUSETTS AVE|42.33152148|-71.07085307|(42.33152148, -71...|\n",
      "|     I182070929|        3301|     Verbal Disputes|      VERBAL DISPUTE|     C11|           364|    null|2018-09-03 20:38:00|2018|    9|     Monday|  20|Part Three|        LESLIE ST|42.29514664|-71.05860832|(42.29514664, -71...|\n",
      "|     I182070928|        3301|     Verbal Disputes|      VERBAL DISPUTE|      C6|           913|    null|2018-09-03 19:55:00|2018|    9|     Monday|  19|Part Three|    OCEAN VIEW DR|42.31957856|-71.04032766|(42.31957856, -71...|\n",
      "|     I182070927|        3114|Investigate Property|INVESTIGATE PROPERTY|      C6|           936|    null|2018-09-03 20:19:00|2018|    9|     Monday|  20|Part Three|      DALESSIO CT|42.34011469|-71.05339029|(42.34011469, -71...|\n",
      "|     I182070923|        3108|Fire Related Reports|FIRE REPORT - HOU...|      D4|           139|    null|2018-09-03 19:58:00|2018|    9|     Monday|  19|Part Three|   MARLBOROUGH ST| 42.3503876| -71.0878529|(42.35038760, -71...|\n",
      "|     I182070922|        2647|               Other|THREATS TO DO BOD...|      B3|           429|    null|2018-09-03 20:39:00|2018|    9|     Monday|  20|  Part Two|      WOODROW AVE|42.28647012|-71.08714661|(42.28647012, -71...|\n",
      "|     I182070921|        3201|       Property Lost|     PROPERTY - LOST|      B3|           469|    null|2018-09-02 14:00:00|2018|    9|     Sunday|  14|Part Three|        MULVEY ST|42.27924052|-71.09667382|(42.27924052, -71...|\n",
      "|     I182070920|        3006|  Medical Assistance|SICK/INJURED/MEDI...|    null|              |    null|2018-09-03 19:43:00|2018|    9|     Monday|  19|Part Three|             null|42.35287456| -71.0738297|(42.35287456, -71...|\n",
      "|     I182070919|        3301|     Verbal Disputes|      VERBAL DISPUTE|     C11|           341|    null|2018-09-03 18:52:00|2018|    9|     Monday|  18|Part Three|    STONEHURST ST|42.30526428|-71.06683755|(42.30526428, -71...|\n",
      "|     I182070918|        3305|Assembly or Gathe...| DEMONSTRATIONS/RIOT|      D4|           130|    null|2018-09-03 17:00:00|2018|    9|     Monday|  17|Part Three|   HUNTINGTON AVE|42.34857652|-71.07772012|(42.34857652, -71...|\n",
      "|     I182070917|        2647|               Other|THREATS TO DO BOD...|      B2|           901|    null|2018-09-03 19:52:00|2018|    9|     Monday|  19|  Part Two|      HORADAN WAY|42.33371742|-71.09665806|(42.33371742, -71...|\n",
      "|     I182070915|         614|Larceny From Moto...|LARCENY THEFT FRO...|      B2|           181|    null|2018-09-02 18:00:00|2018|    9|     Sunday|  18|  Part One|       SHIRLEY ST| 42.3256949|-71.06816778|(42.32569490, -71...|\n",
      "+---------------+------------+--------------------+--------------------+--------+--------------+--------+-------------------+----+-----+-----------+----+----------+-----------------+-----------+------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "\u001b[36mcrimeFacts\u001b[39m: \u001b[32mDataFrame\u001b[39m = [INCIDENT_NUMBER: string, OFFENSE_CODE: int ... 15 more fields]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val crimeFacts = spark\n",
    "    .read\n",
    "    .option(\"header\", \"true\")\n",
    "    .option(\"inferSchema\", \"true\")\n",
    "    .csv(\"crime.csv\")\n",
    "\n",
    "crimeFacts.show"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import spark.implicits._\n",
    "\n",
    "case class Crime (\n",
    "    INCIDENT_NUMBER: String,\n",
    "    OFFENSE_CODE: Int,\n",
    "    OFFENSE_CODE_GROUP: String,\n",
    "    OFFENSE_DESCRIPTION: String,\n",
    "    DISTRICT: String,\n",
    "    REPORTING_AREA: String,\n",
    "    SHOOTING: String,\n",
    "    OCCURRED_ON_DATE: String,\n",
    "    YEAR: Int,\n",
    "    MONTH: Int,\n",
    "    DAY_OF_WEEK: String,\n",
    "    HOUR: Int,\n",
    "    UCR_PART: String,\n",
    "    STREET: String,\n",
    "    Lat: Double,\n",
    "    Long: Double,\n",
    "    Location: String\n",
    ")\n",
    "\n",
    "crimeFacts.as[Crime].filter(_.OFFENSE_DESCRIPTION == \"VANDALISM\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<script>\n",
       "var comm = Jupyter.notebook.kernel.comm_manager.new_comm('cancel-stage-358f0ff4-2649-454e-9135-f17fc5804d45', {});\n",
       "\n",
       "function cancelStage(stageId) {\n",
       "  console.log('Cancelling stage ' + stageId);\n",
       "  comm.send({ 'stageId': stageId });\n",
       "}\n",
       "</script>\n",
       "          "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">csv at cmd16.sc:1</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    1 / 1\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">csv at cmd16.sc:1</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    1 / 1\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">show at cmd16.sc:2</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    1 / 1\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+--------------------+\n",
      "|CODE|                NAME|\n",
      "+----+--------------------+\n",
      "| 612|LARCENY PURSE SNA...|\n",
      "| 613| LARCENY SHOPLIFTING|\n",
      "| 615|LARCENY THEFT OF ...|\n",
      "|1731|              INCEST|\n",
      "|3111|LICENSE PREMISE V...|\n",
      "|2646|LIQUOR - DRINKING...|\n",
      "|2204|LIQUOR LAW VIOLATION|\n",
      "|3810|M/V ACCIDENT - IN...|\n",
      "|3801|M/V ACCIDENT - OTHER|\n",
      "|3807|M/V ACCIDENT - OT...|\n",
      "|3803|M/V ACCIDENT - PE...|\n",
      "|3805|M/V ACCIDENT - PO...|\n",
      "|3802|M/V ACCIDENT - PR...|\n",
      "|3205|   M/V PLATES - LOST|\n",
      "| 123|MANSLAUGHTER - NO...|\n",
      "| 121|MANSLAUGHTER - VE...|\n",
      "|3501|      MISSING PERSON|\n",
      "|3502|MISSING PERSON - ...|\n",
      "|3503|MISSING PERSON - ...|\n",
      "| 111|MURDER, NON-NEGLI...|\n",
      "+----+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "\u001b[36moffenseCodes\u001b[39m: \u001b[32mDataFrame\u001b[39m = [CODE: int, NAME: string]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val offenseCodes = spark.read.option(\"header\", \"true\").option(\"inferSchema\", \"true\").csv(\"offense_codes.csv\")\n",
    "offenseCodes.show"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">show at cmd30.sc:10</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    1 / 1\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">show at cmd30.sc:10</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    8 / 8\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">show at cmd30.sc:10</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    200 / 200\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">show at cmd30.sc:10</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    200 / 200\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------------------------------------+-----+\n",
      "|NAME                                          |count|\n",
      "+----------------------------------------------+-----+\n",
      "|ROBBERY - STREET                              |3056 |\n",
      "|ROBBERY - FIREARM - BANK                      |3056 |\n",
      "|ROBBERY ATTEMPT - OTHER WEAPON - MISCELLANEOUS|635  |\n",
      "|ROBBERY - OTHER                               |635  |\n",
      "|ROBBERY - COMMERCIAL                          |554  |\n",
      "|ROBBERY - KNIFE - CHAIN STORE                 |554  |\n",
      "|ROBBERY ATTEMPT - KNIFE - CHAIN STORE         |181  |\n",
      "|ROBBERY - BANK                                |181  |\n",
      "|ROBBERY - HOME INVASION                       |104  |\n",
      "|ROBBERY - CAR JACKING                         |86   |\n",
      "|ROBBERY - UNARMED - STREET                    |3    |\n",
      "|ROBBERY ATTEMPT - KNIFE - BANK                |1    |\n",
      "|ROBBERY - UNARMED - RESIDENCE                 |1    |\n",
      "|ROBBERY - KNIFE - STREET                      |1    |\n",
      "|ROBBERY - UNARMED - CHAIN STORE               |1    |\n",
      "|ROBBERY - UNARMED - BUSINESS                  |1    |\n",
      "+----------------------------------------------+-----+\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36mspark.implicits._\n",
       "\n",
       "\u001b[39m\n",
       "\u001b[36mrobberyStats\u001b[39m: \u001b[32mDataset\u001b[39m[\u001b[32mRow\u001b[39m] = [NAME: string, count: bigint]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import spark.implicits._\n",
    "\n",
    "val robberyStats = crimeFacts\n",
    "    .join(offenseCodes, $\"CODE\" === $\"OFFENSE_CODE\")\n",
    "    .filter($\"NAME\".startsWith(\"ROBBERY\"))\n",
    "    .groupBy($\"NAME\")\n",
    "    .count()\n",
    "    .orderBy($\"count\".desc)\n",
    "    \n",
    "robberyStats.show(false)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">run at ThreadPoolExecutor.java:1149</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    1 / 1\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">show at cmd31.sc:12</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    8 / 8\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">show at cmd31.sc:12</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    200 / 200\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------------------------------------+-----+\n",
      "|NAME                                          |count|\n",
      "+----------------------------------------------+-----+\n",
      "|ROBBERY - FIREARM - BANK                      |3056 |\n",
      "|ROBBERY - STREET                              |3056 |\n",
      "|ROBBERY ATTEMPT - OTHER WEAPON - MISCELLANEOUS|635  |\n",
      "|ROBBERY - OTHER                               |635  |\n",
      "|ROBBERY - COMMERCIAL                          |554  |\n",
      "|ROBBERY - KNIFE - CHAIN STORE                 |554  |\n",
      "|ROBBERY ATTEMPT - KNIFE - CHAIN STORE         |181  |\n",
      "|ROBBERY - BANK                                |181  |\n",
      "|ROBBERY - HOME INVASION                       |104  |\n",
      "|ROBBERY - CAR JACKING                         |86   |\n",
      "|ROBBERY - UNARMED - STREET                    |3    |\n",
      "|ROBBERY - KNIFE - STREET                      |1    |\n",
      "|ROBBERY - UNARMED - BUSINESS                  |1    |\n",
      "|ROBBERY - UNARMED - CHAIN STORE               |1    |\n",
      "|ROBBERY ATTEMPT - KNIFE - BANK                |1    |\n",
      "|ROBBERY - UNARMED - RESIDENCE                 |1    |\n",
      "+----------------------------------------------+-----+\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.functions.broadcast\n",
       "\n",
       "\u001b[39m\n",
       "\u001b[36moffenseCodesBroadcast\u001b[39m: \u001b[32mDataset\u001b[39m[\u001b[32mRow\u001b[39m] = [CODE: int, NAME: string]\n",
       "\u001b[36mrobberyStatsWithBroadcast\u001b[39m: \u001b[32mDataset\u001b[39m[\u001b[32mRow\u001b[39m] = [NAME: string, count: bigint]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import org.apache.spark.sql.functions.broadcast\n",
    "\n",
    "val offenseCodesBroadcast = broadcast(offenseCodes)\n",
    "\n",
    "val robberyStatsWithBroadcast = crimeFacts\n",
    "    .join(offenseCodesBroadcast, $\"CODE\" === $\"OFFENSE_CODE\")\n",
    "    .filter($\"NAME\".startsWith(\"ROBBERY\"))\n",
    "    .groupBy($\"NAME\")\n",
    "    .count()\n",
    "    .orderBy($\"count\".desc)\n",
    "    \n",
    "robberyStatsWithBroadcast.show(false)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "  <span style=\"float: left;\">collect at cmd33.sc:6</span>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"progress\">\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: blue; width: 100%; word-wrap: normal; white-space: nowrap; text-align: center; color: white\" aria-valuenow=\"100\" aria-valuemin=\"0\" aria-valuemax=\"100\">\n",
       "    8 / 8\n",
       "  </div>\n",
       "  <div class=\"progress-bar\" role=\"progressbar\" style=\"background-color: red; width: 0%\" aria-valuenow=\"0\" aria-valuemin=\"0\" aria-valuemax=\"100\"></div>\n",
       "</div>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "defined \u001b[32mclass\u001b[39m \u001b[36mNotSerializableClass\u001b[39m\n",
       "\u001b[36mres33_1\u001b[39m: \u001b[32mArray\u001b[39m[\u001b[32mInt\u001b[39m] = \u001b[33mArray\u001b[39m(\u001b[32m2\u001b[39m, \u001b[32m3\u001b[39m, \u001b[32m4\u001b[39m)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "class NotSerializableClass(value: Int) {\n",
    "    def getValue = value\n",
    "}\n",
    "\n",
    "sc.parallelize(Seq(1, 2, 3))\n",
    "    .map { x => \n",
    "        val breakIt = new NotSerializableClass(1)\n",
    "        x + breakIt.getValue \n",
    "    }\n",
    "    .collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "== Parsed Logical Plan ==\n",
      "'Sort ['count DESC NULLS LAST], true\n",
      "+- Aggregate [NAME#379], [NAME#379, count(1) AS count#583L]\n",
      "   +- Filter StartsWith(NAME#379, ROBBERY)\n",
      "      +- Join Inner, (CODE#378 = OFFENSE_CODE#402)\n",
      "         :- Relation[INCIDENT_NUMBER#401,OFFENSE_CODE#402,OFFENSE_CODE_GROUP#403,OFFENSE_DESCRIPTION#404,DISTRICT#405,REPORTING_AREA#406,SHOOTING#407,OCCURRED_ON_DATE#408,YEAR#409,MONTH#410,DAY_OF_WEEK#411,HOUR#412,UCR_PART#413,STREET#414,Lat#415,Long#416,Location#417] csv\n",
      "         +- Relation[CODE#378,NAME#379] csv\n",
      "\n",
      "== Analyzed Logical Plan ==\n",
      "NAME: string, count: bigint\n",
      "Sort [count#583L DESC NULLS LAST], true\n",
      "+- Aggregate [NAME#379], [NAME#379, count(1) AS count#583L]\n",
      "   +- Filter StartsWith(NAME#379, ROBBERY)\n",
      "      +- Join Inner, (CODE#378 = OFFENSE_CODE#402)\n",
      "         :- Relation[INCIDENT_NUMBER#401,OFFENSE_CODE#402,OFFENSE_CODE_GROUP#403,OFFENSE_DESCRIPTION#404,DISTRICT#405,REPORTING_AREA#406,SHOOTING#407,OCCURRED_ON_DATE#408,YEAR#409,MONTH#410,DAY_OF_WEEK#411,HOUR#412,UCR_PART#413,STREET#414,Lat#415,Long#416,Location#417] csv\n",
      "         +- Relation[CODE#378,NAME#379] csv\n",
      "\n",
      "== Optimized Logical Plan ==\n",
      "Sort [count#583L DESC NULLS LAST], true\n",
      "+- Aggregate [NAME#379], [NAME#379, count(1) AS count#583L]\n",
      "   +- Project [NAME#379]\n",
      "      +- Join Inner, (CODE#378 = OFFENSE_CODE#402)\n",
      "         :- Project [OFFENSE_CODE#402]\n",
      "         :  +- Filter isnotnull(OFFENSE_CODE#402)\n",
      "         :     +- Relation[INCIDENT_NUMBER#401,OFFENSE_CODE#402,OFFENSE_CODE_GROUP#403,OFFENSE_DESCRIPTION#404,DISTRICT#405,REPORTING_AREA#406,SHOOTING#407,OCCURRED_ON_DATE#408,YEAR#409,MONTH#410,DAY_OF_WEEK#411,HOUR#412,UCR_PART#413,STREET#414,Lat#415,Long#416,Location#417] csv\n",
      "         +- Filter ((isnotnull(NAME#379) && StartsWith(NAME#379, ROBBERY)) && isnotnull(CODE#378))\n",
      "            +- Relation[CODE#378,NAME#379] csv\n",
      "\n",
      "== Physical Plan ==\n",
      "*(7) Sort [count#583L DESC NULLS LAST], true, 0\n",
      "+- Exchange rangepartitioning(count#583L DESC NULLS LAST, 200)\n",
      "   +- *(6) HashAggregate(keys=[NAME#379], functions=[count(1)], output=[NAME#379, count#583L])\n",
      "      +- Exchange hashpartitioning(NAME#379, 200)\n",
      "         +- *(5) HashAggregate(keys=[NAME#379], functions=[partial_count(1)], output=[NAME#379, count#593L])\n",
      "            +- *(5) Project [NAME#379]\n",
      "               +- *(5) SortMergeJoin [OFFENSE_CODE#402], [CODE#378], Inner\n",
      "                  :- *(2) Sort [OFFENSE_CODE#402 ASC NULLS FIRST], false, 0\n",
      "                  :  +- Exchange hashpartitioning(OFFENSE_CODE#402, 200)\n",
      "                  :     +- *(1) Project [OFFENSE_CODE#402]\n",
      "                  :        +- *(1) Filter isnotnull(OFFENSE_CODE#402)\n",
      "                  :           +- *(1) FileScan csv [OFFENSE_CODE#402] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/e.mateshuk/Documents/projects/playground/spark_showcase_notebook/cr..., PartitionFilters: [], PushedFilters: [IsNotNull(OFFENSE_CODE)], ReadSchema: struct<OFFENSE_CODE:int>\n",
      "                  +- *(4) Sort [CODE#378 ASC NULLS FIRST], false, 0\n",
      "                     +- Exchange hashpartitioning(CODE#378, 200)\n",
      "                        +- *(3) Project [CODE#378, NAME#379]\n",
      "                           +- *(3) Filter ((isnotnull(NAME#379) && StartsWith(NAME#379, ROBBERY)) && isnotnull(CODE#378))\n",
      "                              +- *(3) FileScan csv [CODE#378,NAME#379] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/e.mateshuk/Documents/projects/playground/spark_showcase_notebook/of..., PartitionFilters: [], PushedFilters: [IsNotNull(NAME), StringStartsWith(NAME,ROBBERY), IsNotNull(CODE)], ReadSchema: struct<CODE:int,NAME:string>\n"
     ]
    }
   ],
   "source": [
    "robberyStats.explain(true)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Scala",
   "language": "scala",
   "name": "scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".scala",
   "mimetype": "text/x-scala",
   "name": "scala",
   "nbconvert_exporter": "script",
   "version": "2.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
