From 7395e1cf15f28ae685f4b81ddecf8d60a196283b Mon Sep 17 00:00:00 2001 From: Rahul Parajuli <86224264+RahulParajuli@users.noreply.github.com> Date: Tue, 30 Aug 2022 10:45:50 +0545 Subject: [PATCH] Created using Colaboratory --- PlayingWithPySpark.ipynb | 1603 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 1603 insertions(+) create mode 100644 PlayingWithPySpark.ipynb diff --git a/PlayingWithPySpark.ipynb b/PlayingWithPySpark.ipynb new file mode 100644 index 0000000..f8e2650 --- /dev/null +++ b/PlayingWithPySpark.ipynb @@ -0,0 +1,1603 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "PlayingWithPySpark.ipynb", + "provenance": [], + "authorship_tag": "ABX9TyOF7+/AN17kQLuEoGi/PlvX", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#Basic File handling in pyspark" + ], + "metadata": { + "id": "WvhoMDm3qYXY" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YTmURVunjz-b", + "outputId": "d97d3537-12ba-4b7d-c7d4-792f836328cc" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting pyspark\n", + " Downloading pyspark-3.3.0.tar.gz (281.3 MB)\n", + "\u001b[K |████████████████████████████████| 281.3 MB 44 kB/s \n", + "\u001b[?25hCollecting py4j==0.10.9.5\n", + " Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)\n", + "\u001b[K |████████████████████████████████| 199 kB 23.2 MB/s \n", + "\u001b[?25hBuilding wheels for collected packages: pyspark\n", + " Building wheel for pyspark (setup.py) 
... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=989eb92b401d94eed999e05a79755e4016b986bcba113aedc51dd7a64fe1f890\n", + " Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885\n", + "Successfully built pyspark\n", + "Installing collected packages: py4j, pyspark\n", + "Successfully installed py4j-0.10.9.5 pyspark-3.3.0\n" + ] + } + ], + "source": [ + "!pip install pyspark" + ] + }, + { + "cell_type": "code", + "source": [ + "import pyspark" + ], + "metadata": { + "id": "wu2_80ZOj2EN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from pyspark.sql import SparkSession" + ], + "metadata": { + "id": "8r0aUQpWj4vu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "spark = SparkSession.builder.appName(\"Practice\").getOrCreate()" + ], + "metadata": { + "id": "6Jr8Y72vkpel" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "spark" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 219 + }, + "id": "PgeetEYmkwc_", + "outputId": "2b6e6a53-4638-46ae-da39-8e0e4c88aa9a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "
\n", + "

SparkSession - in-memory

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v3.3.0
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
Practice
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark = spark.read.csv(\"/content/5000_sales_records.csv\")" + ], + "metadata": { + "id": "IGc2mnbDkxWy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "42CsxNRUk9xU", + "outputId": "f91eae4a-b3aa-4798-ab54-6b26df21ffc9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string]" + ] + }, + "metadata": {}, + "execution_count": 38 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bL3O-W4NlA99", + "outputId": "5269efb9-0dbf-42b2-b391-9ad7394d146e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| _c0| _c1| _c2| _c3| _c4| _c5| _c6| _c7| _c8| _c9| _c10| _c11| _c12| _c13|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 
52914.72|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.10| 151892.62| 74823.48|\n", + "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.70| 117.11| 1854591.20|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.00|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.20| 263.33| 3227410.40|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.70| 117.11| 2008043.40|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.20| 263.33| 2966839.20|1786957.38| 1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 
6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.60|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.70| 117.11| 1155211.20| 657689.76| 497521.44|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.50|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.90| 55378.18| 27279.72|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark = spark.read.option(\"header\", \"true\").csv(\"/content/5000_sales_records.csv\")" + ], + "metadata": { + "id": "qOjlOq1hlwnD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KiTTLvy9mRJ_", + "outputId": "64096b44-98fb-4074-a458-4223312a75ae" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", 
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.10| 151892.62| 74823.48|\n", + "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.70| 117.11| 1854591.20|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.00|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.20| 263.33| 3227410.40|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.70| 117.11| 2008043.40|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.20| 263.33| 
2966839.20|1786957.38| 1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.60|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.70| 117.11| 1155211.20| 657689.76| 497521.44|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.50|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.90| 55378.18| 27279.72|\n", + "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "type(df_pyspark)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6t1J1sUvmD1j", + "outputId": "044d8f28-4183-40f6-f6d6-0d9edb15c5a4" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "pyspark.sql.dataframe.DataFrame" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.printSchema()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "h1T9Fy0nmL6O", + "outputId": "d764fafc-4c73-4058-daf5-2fd6081940c6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "root\n", + " |-- Region: string (nullable = true)\n", + 
" |-- Country: string (nullable = true)\n", + " |-- Item Type: string (nullable = true)\n", + " |-- Sales Channel: string (nullable = true)\n", + " |-- Order Priority: string (nullable = true)\n", + " |-- Order Date: string (nullable = true)\n", + " |-- Order ID: string (nullable = true)\n", + " |-- Ship Date: string (nullable = true)\n", + " |-- Units Sold: string (nullable = true)\n", + " |-- Unit Price: string (nullable = true)\n", + " |-- Unit Cost: string (nullable = true)\n", + " |-- Total Revenue: string (nullable = true)\n", + " |-- Total Cost: string (nullable = true)\n", + " |-- Total Profit: string (nullable = true)\n", + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Picking up data" + ], + "metadata": { + "id": "LeIvG6cpoUjA" + } + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.select([\"Region\",\"Country\"]).show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1QhCW-WYmhpz", + "outputId": "d1956f24-83a5-46d3-ef5b-ae6315f00b72" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+\n", + "| Region| Country|\n", + "+--------------------+--------------------+\n", + "|Central America a...|Antigua and Barbuda |\n", + "|Central America a...| Panama|\n", + "| Europe| Czech Republic|\n", + "| Asia| North Korea|\n", + "| Asia| Sri Lanka|\n", + "|Middle East and N...| Morocco|\n", + "|Australia and Oce...|Federated States ...|\n", + "| Europe|Bosnia and Herzeg...|\n", + "|Middle East and N...| Afghanistan|\n", + "| Sub-Saharan Africa| Ethiopia|\n", + "|Middle East and N...| Turkey|\n", + "|Middle East and N...| Oman|\n", + "| Asia| Malaysia|\n", + "|Central America a...| Saint Lucia|\n", + "|Central America a...|Saint Vincent and...|\n", + "|Middle East and N...| Lebanon|\n", + "| Europe| Austria|\n", + "| Europe| Bulgaria|\n", + "| North America| Mexico|\n", + "|Central America a...| 
Trinidad and Tobago|\n", + "+--------------------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.dtypes" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lXt_9Pt9niic", + "outputId": "6f8bd081-a67c-43bf-8396-11beac0b91f6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('Region', 'string'),\n", + " ('Country', 'string'),\n", + " ('Item Type', 'string'),\n", + " ('Sales Channel', 'string'),\n", + " ('Order Priority', 'string'),\n", + " ('Order Date', 'string'),\n", + " ('Order ID', 'string'),\n", + " ('Ship Date', 'string'),\n", + " ('Units Sold', 'string'),\n", + " ('Unit Price', 'string'),\n", + " ('Unit Cost', 'string'),\n", + " ('Total Revenue', 'string'),\n", + " ('Total Cost', 'string'),\n", + " ('Total Profit', 'string')]" + ] + }, + "metadata": {}, + "execution_count": 30 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.describe().show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wKk4pwxFoN91", + "outputId": "c6bcb8f9-d8bf-4c2c-c1e5-206d79f6f381" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+-------+------------------+-----------+----------+-------------+--------------+----------+------------------+---------+------------------+-----------------+------------------+------------------+------------------+------------------+\n", + "|summary| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID|Ship Date| Units Sold| Unit Price| Unit Cost| Total Revenue| Total Cost| Total Profit|\n", + 
"+-------+------------------+-----------+----------+-------------+--------------+----------+------------------+---------+------------------+-----------------+------------------+------------------+------------------+------------------+\n", + "| count| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000|\n", + "| mean| null| null| null| null| null| null| 5.486447372216E8| null| 5030.6982|265.7455639999977|187.49414399999978|1325737.8417080024| 933093.1958900003|392644.64581800025|\n", + "| stddev| null| null| null| null| null| null|2.59467108849919E8| null|2914.5154267478674|218.7166954289816| 176.4162798197045| 1475374.673360283|1150873.2176060663| 382935.1484319066|\n", + "| min| Asia|Afghanistan| Baby Food| Offline| C| 1/1/2010| 100090873| 1/1/2011| 10| 109.28| 117.11| 100024.60| 1000002.29| 1000278.75|\n", + "| max|Sub-Saharan Africa| Zimbabwe|Vegetables| Online| M| 9/9/2015| 999879729| 9/9/2016| 9999| 9.33| 97.44| 998462.86| 99979.55| 99990.00|\n", + "+-------+------------------+-----------+----------+-------------+--------------+----------+------------------+---------+------------------+-----------------+------------------+------------------+------------------+------------------+\n", + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Adding Columns" + ], + "metadata": { + "id": "uJKqyQs6osQJ" + } + }, + { + "cell_type": "code", + "source": [ + "df_pyspark = df_pyspark.withColumn(\"adding 200 rupee in total cost\", df_pyspark['Total Cost']+200)" + ], + "metadata": { + "id": "tSLhXvHbofg6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "L7bSCWmPpIeo", + "outputId": "b97e91da-14f4-4dc4-f7ad-3e5db70d90fa" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + 
"+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|adding 200 rupee in total cost|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72| 88199.84|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38| 211352.48|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.10| 151892.62| 74823.48| 152092.62|\n", + "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.70| 117.11| 1854591.20|1055863.76| 798727.44| 1056063.76|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88| 735092.48|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88| 2920.16|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52| 296166.72|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88| 33423.68|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 
35.84| 966144.48| 316861.44| 649283.04| 317061.44|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62| 1565226.14|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.00| 1944651.84|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.20| 263.33| 3227410.40|1943902.06| 1283508.34| 1944102.06|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.70| 117.11| 2008043.40|1143227.82| 864815.58| 1143427.82|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.20| 263.33| 2966839.20|1786957.38| 1179881.82| 1787157.38|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08| 1024951.76|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.60| 1347000.17|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.70| 117.11| 1155211.20| 657689.76| 497521.44| 657889.76|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.50| 3289599.36|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.90| 55378.18| 27279.72| 55578.18|\n", + "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92| 824720.24|\n", + 
"+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "dropping column" + ], + "metadata": { + "id": "TXYiKuvnp3FM" + } + }, + { + "cell_type": "code", + "source": [ + "df_pyspark = df_pyspark.drop(\"Region\")" + ], + "metadata": { + "id": "Q6awTLOwpruu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VM9VRUW3pz_T", + "outputId": "0e0382c8-a288-426a-b433-efc224993b91" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n", + "| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|adding 200 rupee in total cost|\n", + "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n", + "|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72| 88199.84|\n", + "| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38| 211352.48|\n", + "| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.10| 151892.62| 74823.48| 152092.62|\n", + "| North Korea| Cereal| Offline| L| 
5/13/2010|892599952| 6/15/2010| 9016| 205.70| 117.11| 1854591.20|1055863.76| 798727.44| 1056063.76|\n", + "| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88| 735092.48|\n", + "| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88| 2920.16|\n", + "|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52| 296166.72|\n", + "|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88| 33423.68|\n", + "| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04| 317061.44|\n", + "| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62| 1565226.14|\n", + "| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.00| 1944651.84|\n", + "| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.20| 263.33| 3227410.40|1943902.06| 1283508.34| 1944102.06|\n", + "| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.70| 117.11| 2008043.40|1143227.82| 864815.58| 1143427.82|\n", + "| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.20| 263.33| 2966839.20|1786957.38| 1179881.82| 1787157.38|\n", + "|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08| 1024951.76|\n", + "| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.60| 1347000.17|\n", + "| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.70| 117.11| 1155211.20| 657689.76| 497521.44| 657889.76|\n", + "| Bulgaria|Office Supplies| Online| 
L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.50| 3289599.36|\n", + "| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.90| 55378.18| 27279.72| 55578.18|\n", + "| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92| 824720.24|\n", + "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Rename columns" + ], + "metadata": { + "id": "52oD6KkXp5Ve" + } + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.withColumnRenamed(\"Country\", \"Region\").show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VoMKuwm9p1cB", + "outputId": "d7b5b708-399d-4b3d-a7ef-48019fd544ab" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n", + "| Region| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|adding 200 rupee in total cost|\n", + "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n", + "|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72| 88199.84|\n", + "| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 
211152.48| 119488.38| 211352.48|\n", + "| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.10| 151892.62| 74823.48| 152092.62|\n", + "| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.70| 117.11| 1854591.20|1055863.76| 798727.44| 1056063.76|\n", + "| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88| 735092.48|\n", + "| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88| 2920.16|\n", + "|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52| 296166.72|\n", + "|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88| 33423.68|\n", + "| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04| 317061.44|\n", + "| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62| 1565226.14|\n", + "| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.00| 1944651.84|\n", + "| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.20| 263.33| 3227410.40|1943902.06| 1283508.34| 1944102.06|\n", + "| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.70| 117.11| 2008043.40|1143227.82| 864815.58| 1143427.82|\n", + "| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.20| 263.33| 2966839.20|1786957.38| 1179881.82| 1787157.38|\n", + "|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08| 1024951.76|\n", + "| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 
211239.60| 1347000.17|\n", + "| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.70| 117.11| 1155211.20| 657689.76| 497521.44| 657889.76|\n", + "| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.50| 3289599.36|\n", + "| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.90| 55378.18| 27279.72| 55578.18|\n", + "| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92| 824720.24|\n", + "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "#**Handling Missing Value in pyspark**" + ], + "metadata": { + "id": "D7LEZTm5qNJO" + } + }, + { + "cell_type": "code", + "source": [ + "#create session which is already made in top\n", + "spark" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 219 + }, + "id": "cpQXhhruqB6T", + "outputId": "889f4a6a-a22e-4099-d2d2-0bb7292eb595" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "
\n", + "

SparkSession - in-memory

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v3.3.0
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
Practice
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 48 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark = spark.read.csv(\"/content/5000_sales_records.csv\", header = True, inferSchema = True)" + ], + "metadata": { + "id": "HBwAaOjWqppc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dlxEWEefq-oR", + "outputId": "92b9775c-6ab0-43d6-b023-79b430074d89" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n", + "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 
11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n", + "|Central America a...| 
Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "##drop columns\n", + "df_pyspark.drop(\"Order Priority\").show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FzVgDq9MrJRI", + "outputId": "93d65472-32c4-4f03-b5ca-44c4ab2080a5" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+--------------------+--------------------+---------------+-------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n", + "|Central America a...| Panama| Snacks| Offline| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| Czech Republic| Beverages| Offline| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n", + "| Asia| North Korea| Cereal| Offline| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n", + "|Middle East and 
N...| Morocco| Personal Care| Offline| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n", + "|Middle East and N...| Oman| Cosmetics| Online| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| Offline|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n", + "| Europe| Austria| Cereal| Offline| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n", + "| Europe| Bulgaria|Office Supplies| Online|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n", + "| North America| Mexico| Beverages| Online| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n", + "|Central America a...| 
Trinidad and Tobago| Baby Food| Offline| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n", + "+--------------------+--------------------+---------------+-------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#deletes rows containing null value\n", + "df_pyspark.na.drop().show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lyEdvPTqrR_B", + "outputId": "749d09e4-94e8-4999-e85a-0dbe78995f58" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n", + "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 
734892.48| 415865.88|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 
3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n", + "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "### how='any' : drop a row if it contains any null value (default)\n", + "### how='all' : drop a row only if all of its values are null\n", + "df_pyspark.na.drop(how=\"any\").show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "T61WAtwrrcQD", + "outputId": "4e4a2fba-dbb8-4e00-dfb0-5963f791d176" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n", + "| Asia| North Korea| Cereal| Offline| L| 
5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n", + "| Europe| 
Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n", + "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#thresh = threshold: a row is kept only if it has at least thresh (here 2) non-null values; when thresh is given it overrides how\n", + "df_pyspark.na.drop(how=\"any\", thresh= 2).show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ra7IH1CSr3Dn", + "outputId": "75c79a09-1cb9-43d2-e156-f0bdf5d3c630" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| 
Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n", + "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 
1558039.77|1346800.17| 211239.6|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n", + "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#subset = check for null values only in the listed columns\n", + "df_pyspark.na.drop(how=\"any\", subset=[\"Item Type\"]).show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ljaYa0WwsWdk", + "outputId": "ce3c7911-6f74-4e70-9dfc-bfe2bcbd643c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n", + 
"|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n", + "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 
1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n", + "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "## filling the missing value\n", + "df_pyspark.na.fill(\"Missing Values\", \"Region\").show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nK_H8g46sw-I", + "outputId": "201897b5-98af-4218-dd81-2ddbc280196e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Central America 
a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n", + "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 
1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n", + "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kjKsOHhptDFA", + "outputId": "c57a35c5-3b9c-495f-c2d7-98256fcf88b0" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + 
"+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n", + "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| L| 
7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n", + "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#imputer function for filling null values\n", + "\n", + "from pyspark.ml.feature import Imputer\n", + "imputer = Imputer(\n", + " inputCols = ['Unit Price','Total Revenue','Total Cost'],\n", + " outputCols = ['{}_imputed'.format(c) for c in ['Unit Price','Total Revenue','Total Cost']]\n", + ").setStrategy(\"mean\")" + ], + "metadata": { + "id": "zaxEflzdtNY0" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "imputer.fit(df_pyspark).transform(df_pyspark).show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" 
+ }, + "id": "vB_lvoxOt4uZ", + "outputId": "994f88c5-8fd8-4665-b045-da37f6d9647e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------+---------------------+------------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|Unit Price_imputed|Total Revenue_imputed|Total Cost_imputed|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------+---------------------+------------------+\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72| 255.28| 140914.56| 87999.84|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38| 152.58| 330640.86| 211152.48|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48| 47.45| 226716.1| 151892.62|\n", + "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44| 205.7| 1854591.2| 1055863.76|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88| 152.58| 1150758.36| 734892.48|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88| 81.73| 3923.04| 2720.16|\n", + "|Australia and Oce...|Federated States 
...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52| 109.28| 902434.24| 295966.72|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88| 109.28| 101302.56| 33223.68|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04| 109.28| 966144.48| 316861.44|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62| 255.28| 2506083.76| 1565026.14|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0| 651.21| 2412081.84| 1944451.84|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34| 437.2| 3227410.4| 1943902.06|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58| 205.7| 2008043.4| 1143227.82|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82| 437.2| 2966839.2| 1786957.38|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08| 255.28| 1640939.84| 1024751.76|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6| 421.89| 1558039.77| 1346800.17|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44| 205.7| 1155211.2| 657689.76|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 
651.21| 524.96| 4080481.86|3289399.36| 791082.5| 651.21| 4080481.86| 3289399.36|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72| 47.45| 82657.9| 55378.18|\n", + "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92| 255.28| 1320308.16| 824520.24|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------+---------------------+------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Filter operation" + ], + "metadata": { + "id": "oYxVvo2Kuf77" + } + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rdeT55X4t98W", + "outputId": "17690dfa-57ae-4bcd-d725-a2f75f6fed6d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 
7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n", + "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n", + "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| 
Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n", + "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "### Total Cost less than or equal to 80000\n", + "df_pyspark.filter(df_pyspark['Total Cost']<=80000).show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qKrv8-fsvBRy", + "outputId": "c65cd792-c130-4b66-f468-cf20df6c0dd1" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+--------------------+--------------------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Middle East and N...| Morocco|Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 
81.73| 56.67| 3923.04| 2720.16| 1202.88|\n", + "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n", + "|Middle East and N...| Libya| Beverages| Offline| L| 1/18/2010|993345010| 3/3/2010| 1718| 47.45| 31.79| 81519.1| 54615.22| 26903.88|\n", + "| Europe| Estonia| Fruits| Online| L| 9/28/2016|579463422| 11/1/2016| 4958| 9.33| 6.92| 46258.14| 34309.36| 11948.78|\n", + "| Europe| Montenegro| Fruits| Offline| L| 5/29/2016|313705861| 7/10/2016| 1390| 9.33| 6.92| 12968.7| 9618.8| 3349.9|\n", + "|Middle East and N...| Lebanon| Fruits| Online| H| 4/5/2013|441150701| 5/12/2013| 5150| 9.33| 6.92| 48049.5| 35638.0| 12411.5|\n", + "| Asia| Kyrgyzstan| Snacks| Online| C| 8/6/2013|727492606| 9/22/2013| 84| 152.58| 97.44| 12816.72| 8184.96| 4631.76|\n", + "| Asia| Taiwan| Baby Food| Online| M| 5/27/2014|369560611| 6/13/2014| 52| 255.28| 159.42| 13274.56| 8289.84| 4984.72|\n", + "|Australia and Oce...| Vanuatu| Fruits| Online| M| 7/13/2017|135336816| 8/17/2017| 8026| 9.33| 6.92| 74882.58| 55539.92| 19342.66|\n", + "| Europe| Russia| Household| Online| M| 7/10/2017|194176757| 8/20/2017| 72| 668.27| 502.54| 48115.44| 36182.88| 11932.56|\n", + "| Europe| Serbia| Beverages| Online| L| 9/3/2012|599624192| 9/21/2012| 978| 47.45| 31.79| 46406.1| 31090.62| 15315.48|\n", + "| Asia| Sri Lanka| Cereal| Online| C| 2/24/2015|743410336| 3/20/2015| 494| 205.7| 117.11| 101615.8| 57852.34| 43763.46|\n", + "| Sub-Saharan Africa| Senegal| Beverages| Offline| L| 6/30/2014|530853211| 8/2/2014| 117| 47.45| 31.79| 5551.65| 3719.43| 1832.22|\n", + "|Australia and Oce...| Kiribati| Fruits| Online| L| 12/7/2011|905054843| 1/12/2012| 4695| 9.33| 6.92| 43804.35| 32489.4| 11314.95|\n", + "| Europe| Liechtenstein| Cosmetics| Online| L| 7/15/2012|229693067| 7/15/2012| 138| 437.2| 263.33| 60333.6| 
36339.54| 23994.06|\n", + "|Australia and Oce...| Vanuatu| Fruits| Offline| L| 11/6/2015|202262866|12/14/2015| 2932| 9.33| 6.92| 27355.56| 20289.44| 7066.12|\n", + "|Central America a...| Haiti| Clothes| Offline| L| 8/12/2010|422752892| 8/12/2010| 830| 109.28| 35.84| 90702.4| 29747.2| 60955.2|\n", + "| Europe| Slovenia| Fruits| Online| H|10/22/2012|169799983|11/20/2012| 6443| 9.33| 6.92| 60113.19| 44585.56| 15527.63|\n", + "| Europe| Cyprus| Fruits| Offline| M| 7/12/2015|600515115| 8/30/2015| 4622| 9.33| 6.92| 43123.26| 31984.24| 11139.02|\n", + "+--------------------+--------------------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#multiple condition\n", + "\n", + "df_pyspark.filter((df_pyspark['Total Cost']<=80000) & (df_pyspark['Region']==\"Europe\")).show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FBQLb1Mmwd9N", + "outputId": "330e69b8-d4fa-42e7-e2ac-a4a1441b9d9a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n", + "|Europe| Estonia| Fruits| Online| L| 9/28/2016|579463422| 11/1/2016| 4958| 
9.33| 6.92| 46258.14| 34309.36| 11948.78|\n", + "|Europe| Montenegro| Fruits| Offline| L| 5/29/2016|313705861| 7/10/2016| 1390| 9.33| 6.92| 12968.7| 9618.8| 3349.9|\n", + "|Europe| Russia| Household| Online| M| 7/10/2017|194176757| 8/20/2017| 72| 668.27| 502.54| 48115.44| 36182.88| 11932.56|\n", + "|Europe| Serbia| Beverages| Online| L| 9/3/2012|599624192| 9/21/2012| 978| 47.45| 31.79| 46406.1| 31090.62| 15315.48|\n", + "|Europe| Liechtenstein| Cosmetics| Online| L| 7/15/2012|229693067| 7/15/2012| 138| 437.2| 263.33| 60333.6| 36339.54| 23994.06|\n", + "|Europe| Slovenia| Fruits| Online| H|10/22/2012|169799983|11/20/2012| 6443| 9.33| 6.92| 60113.19| 44585.56| 15527.63|\n", + "|Europe| Cyprus| Fruits| Offline| M| 7/12/2015|600515115| 8/30/2015| 4622| 9.33| 6.92| 43123.26| 31984.24| 11139.02|\n", + "|Europe| Norway| Cereal| Online| M| 10/8/2014|100640618|10/18/2014| 650| 205.7| 117.11| 133705.0| 76121.5| 57583.5|\n", + "|Europe| Armenia| Fruits| Online| M| 3/23/2011|120977771| 5/2/2011| 8866| 9.33| 6.92| 82719.78| 61352.72| 21367.06|\n", + "|Europe| Denmark| Beverages| Online| H| 6/5/2016|973268353| 6/26/2016| 589| 47.45| 31.79| 27948.05| 18724.31| 9223.74|\n", + "|Europe| Kosovo| Fruits| Online| L| 5/2/2010|291995418| 6/6/2010| 6788| 9.33| 6.92| 63332.04| 46972.96| 16359.08|\n", + "|Europe|Bosnia and Herzeg...| Baby Food| Online| C| 4/30/2014|871923768| 5/30/2014| 474| 255.28| 159.42| 121002.72| 75565.08| 45437.64|\n", + "|Europe| Lithuania|Office Supplies| Offline| C|12/15/2016|112330758| 1/11/2017| 22| 651.21| 524.96| 14326.62| 11549.12| 2777.5|\n", + "|Europe| Spain| Fruits| Online| H|10/22/2011|817006289|11/14/2011| 9172| 9.33| 6.92| 85574.76| 63470.24| 22104.52|\n", + "|Europe| Malta| Personal Care| Offline| M| 8/16/2011|466988742| 8/16/2011| 1201| 81.73| 56.67| 98157.73| 68060.67| 30097.06|\n", + "|Europe| Italy| Personal Care| Online| C| 2/25/2011|309342658| 3/28/2011| 222| 81.73| 56.67| 18144.06| 12580.74| 5563.32|\n", + "|Europe| Kosovo| Fruits| Online| C| 
3/2/2012|899868094| 3/26/2012| 9821| 9.33| 6.92| 91629.93| 67961.32| 23668.61|\n", + "|Europe|Bosnia and Herzeg...| Fruits| Online| H| 1/5/2013|291305768| 1/6/2013| 2705| 9.33| 6.92| 25237.65| 18718.6| 6519.05|\n", + "|Europe| Slovenia| Fruits| Offline| M|12/15/2013|812637078| 1/5/2014| 6225| 9.33| 6.92| 58079.25| 43077.0| 15002.25|\n", + "+------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#inverse operation\n", + "df_pyspark.filter(~(df_pyspark['Total Cost']<=80000)).show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VgXHvC318iTF", + "outputId": "21c9e5b4-d0ca-4095-fd25-da06edb4f7e2" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n", + "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n", + "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n", + "| Asia| North 
Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n", + "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n", + "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n", + "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n", + "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n", + "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n", + "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n", + "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n", + "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n", + "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n", + "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n", + "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n", + "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n", + "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 
824520.24| 495787.92|\n", + "|Middle East and N...| Algeria| Baby Food| Offline| M| 9/5/2015|977806651|10/14/2015| 3572| 255.28| 159.42| 911860.16| 569448.24| 342411.92|\n", + "|Australia and Oce...| Tuvalu| Beverages| Offline| L| 3/22/2012|610864150| 4/7/2012| 7132| 47.45| 31.79| 338413.4| 226726.28| 111687.12|\n", + "|Middle East and N...| Saudi Arabia| Snacks| Offline| L| 4/25/2017|604870164| 6/5/2017| 3378| 152.58| 97.44| 515415.24| 329152.32| 186262.92|\n", + "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Groupby and aggregation" + ], + "metadata": { + "id": "llhOyJua9Jqg" + } + }, + { + "cell_type": "code", + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"Aggregate\").getOrCreate()" + ], + "metadata": { + "id": "8tbfOvxv9E-H" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IGFm7O2z9c5s", + "outputId": "5d4d9486-420c-467d-f0f0-d6d1b2f87437" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Row(Region='Central America and the Caribbean', Country='Antigua and Barbuda ', Item Type='Baby Food', Sales Channel='Online', Order Priority='M', Order Date='12/20/2013', Order ID=957081544, Ship Date='1/11/2014', Units Sold=552, Unit Price=255.28, Unit Cost=159.42, Total Revenue=140914.56, Total Cost=87999.84, Total Profit=52914.72)" + ] + }, + "metadata": {}, + "execution_count": 90 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_pyspark.printSchema()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"Okb_Wwmw9d7m", + "outputId": "f565628f-83c6-4d58-d197-97cfda3e5c62" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "root\n", + " |-- Region: string (nullable = true)\n", + " |-- Country: string (nullable = true)\n", + " |-- Item Type: string (nullable = true)\n", + " |-- Sales Channel: string (nullable = true)\n", + " |-- Order Priority: string (nullable = true)\n", + " |-- Order Date: string (nullable = true)\n", + " |-- Order ID: integer (nullable = true)\n", + " |-- Ship Date: string (nullable = true)\n", + " |-- Units Sold: integer (nullable = true)\n", + " |-- Unit Price: double (nullable = true)\n", + " |-- Unit Cost: double (nullable = true)\n", + " |-- Total Revenue: double (nullable = true)\n", + " |-- Total Cost: double (nullable = true)\n", + " |-- Total Profit: double (nullable = true)\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#Groupby\n", + "#group to find maximum selling region\n", + "df_pyspark.groupby(\"Region\").sum().show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "R5v7YHME9vTz", + "outputId": "c3188a30-dcdc-46cb-d181-28020bd65d5b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+-------------+---------------+------------------+------------------+--------------------+--------------------+--------------------+\n", + "| Region|sum(Order ID)|sum(Units Sold)| sum(Unit Price)| sum(Unit Cost)| sum(Total Revenue)| sum(Total Cost)| sum(Total Profit)|\n", + "+--------------------+-------------+---------------+------------------+------------------+--------------------+--------------------+--------------------+\n", + "|Middle East and N...| 329477662194| 3013431|157454.56999999983|110155.62999999966| 7.668677777200001E8| 5.354874771899998E8| 2.313803005300001E8|\n", + "|Australia and Oce...| 233075865831| 2111786|116833.26999999983| 
82580.71999999994| 5.873640885299999E8|4.1242951558999985E8|1.7493457293999985E8|\n", + "| Europe| 719802331123| 6582322| 352308.410000005|249038.25999999998|1.7036223981099987E9|1.2019463497899995E9|5.0167604832000005E8|\n", + "| Sub-Saharan Africa| 714129889652| 6642380| 349025.5400000039|246882.89000000057|1.8145672823999999E9| 1.28342501448E9|5.3114226791999996E8|\n", + "|Central America a...| 295473045677| 2698776|138441.96999999983| 97945.32999999967| 6.849763237699999E8|4.8036434404999995E8| 2.046119797200002E8|\n", + "| North America| 53808179824| 484760|31107.340000000026|22288.680000000004| 1.5101425209E8|1.0977385412000002E8|4.1240397970000006E7|\n", + "| Asia| 397456711807| 3620036|183556.71999999988|128579.20999999954| 9.202770859200008E8| 6.420394242299997E8| 2.782376616899999E8|\n", + "+--------------------+-------------+---------------+------------------+------------------+--------------------+--------------------+--------------------+\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# from pyspark.mllib.classification import LogisticRegressionWithSGD\n", + "# from pyspark.mllib.regression import LabeledPoint\n", + "# from numpy import array\n", + "\n", + "# # Load and parse the data\n", + "# def parsePoint(line):\n", + "# values = [float(x) for x in line.split(' ')]\n", + "# return LabeledPoint(values[0], values[1:])\n", + "\n", + "# data = sc.textFile(\"/content/5000_sales_records.csv\")\n", + "# parsedData = data.map(parsePoint)\n", + "\n", + "# # Build the model\n", + "# model = LogisticRegressionWithSGD.train(parsedData)\n", + "\n", + "# # Evaluating the model on training data\n", + "# labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))\n", + "# trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())\n", + "# print(\"Training Error = \" + str(trainErr))" + ], + "metadata": { + "id": "UpS_aHY194m7" + }, + "execution_count": null, + "outputs": [] + }, + { + 
"cell_type": "code", + "source": [], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8gjAVr7DBvr7", + "outputId": "a0f9a97f-1ad4-408b-885e-29fe32019db7" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "DataFrame[Region: string, Country: string, Item Type: string, Sales Channel: string, Order Priority: string, Order Date: string, Order ID: int, Ship Date: string, Units Sold: int, Unit Price: double, Unit Cost: double, Total Revenue: double, Total Cost: double, Total Profit: double]" + ] + }, + "metadata": {}, + "execution_count": 95 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "CvVdj8QrBxNA" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file