diff --git a/PlayingWithPySpark.ipynb b/PlayingWithPySpark.ipynb
new file mode 100644
index 0000000..f8e2650
--- /dev/null
+++ b/PlayingWithPySpark.ipynb
@@ -0,0 +1,1603 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "PlayingWithPySpark.ipynb",
+ "provenance": [],
+ "authorship_tag": "ABX9TyOF7+/AN17kQLuEoGi/PlvX",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Basic File Handling in PySpark"
+ ],
+ "metadata": {
+ "id": "WvhoMDm3qYXY"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "YTmURVunjz-b",
+ "outputId": "d97d3537-12ba-4b7d-c7d4-792f836328cc"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Collecting pyspark\n",
+ " Downloading pyspark-3.3.0.tar.gz (281.3 MB)\n",
+ "\u001b[K |████████████████████████████████| 281.3 MB 44 kB/s \n",
+ "\u001b[?25hCollecting py4j==0.10.9.5\n",
+ " Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)\n",
+ "\u001b[K |████████████████████████████████| 199 kB 23.2 MB/s \n",
+ "\u001b[?25hBuilding wheels for collected packages: pyspark\n",
+ " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=989eb92b401d94eed999e05a79755e4016b986bcba113aedc51dd7a64fe1f890\n",
+ " Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885\n",
+ "Successfully built pyspark\n",
+ "Installing collected packages: py4j, pyspark\n",
+ "Successfully installed py4j-0.10.9.5 pyspark-3.3.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install pyspark==3.3.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pyspark"
+ ],
+ "metadata": {
+ "id": "wu2_80ZOj2EN"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from pyspark.sql import SparkSession"
+ ],
+ "metadata": {
+ "id": "8r0aUQpWj4vu"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spark = SparkSession.builder.appName(\"Practice\").getOrCreate()"
+ ],
+ "metadata": {
+ "id": "6Jr8Y72vkpel"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spark"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 219
+ },
+ "id": "PgeetEYmkwc_",
+ "outputId": "2b6e6a53-4638-46ae-da39-8e0e4c88aa9a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
SparkSession - in-memory
\n",
+ " \n",
+ "
\n",
+ "
SparkContext
\n",
+ "\n",
+ "
Spark UI
\n",
+ "\n",
+ "
\n",
+ " - Version
\n",
+ " v3.3.0 \n",
+ " - Master
\n",
+ " local[*] \n",
+ " - AppName
\n",
+ " Practice \n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 5
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark = spark.read.csv(\"/content/5000_sales_records.csv\")"
+ ],
+ "metadata": {
+ "id": "IGc2mnbDkxWy"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "42CsxNRUk9xU",
+ "outputId": "f91eae4a-b3aa-4798-ab54-6b26df21ffc9"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 38
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "bL3O-W4NlA99",
+ "outputId": "5269efb9-0dbf-42b2-b391-9ad7394d146e"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| _c0| _c1| _c2| _c3| _c4| _c5| _c6| _c7| _c8| _c9| _c10| _c11| _c12| _c13|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.10| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.70| 117.11| 1854591.20|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.00|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.20| 263.33| 3227410.40|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.70| 117.11| 2008043.40|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.20| 263.33| 2966839.20|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.60|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.70| 117.11| 1155211.20| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.50|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.90| 55378.18| 27279.72|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark = spark.read.option(\"header\", \"true\").csv(\"/content/5000_sales_records.csv\")"
+ ],
+ "metadata": {
+ "id": "qOjlOq1hlwnD"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "KiTTLvy9mRJ_",
+ "outputId": "64096b44-98fb-4074-a458-4223312a75ae"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.10| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.70| 117.11| 1854591.20|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.00|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.20| 263.33| 3227410.40|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.70| 117.11| 2008043.40|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.20| 263.33| 2966839.20|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.60|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.70| 117.11| 1155211.20| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.50|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.90| 55378.18| 27279.72|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "type(df_pyspark)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "6t1J1sUvmD1j",
+ "outputId": "044d8f28-4183-40f6-f6d6-0d9edb15c5a4"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "pyspark.sql.dataframe.DataFrame"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 19
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.printSchema()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "h1T9Fy0nmL6O",
+ "outputId": "d764fafc-4c73-4058-daf5-2fd6081940c6"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "root\n",
+ " |-- Region: string (nullable = true)\n",
+ " |-- Country: string (nullable = true)\n",
+ " |-- Item Type: string (nullable = true)\n",
+ " |-- Sales Channel: string (nullable = true)\n",
+ " |-- Order Priority: string (nullable = true)\n",
+ " |-- Order Date: string (nullable = true)\n",
+ " |-- Order ID: string (nullable = true)\n",
+ " |-- Ship Date: string (nullable = true)\n",
+ " |-- Units Sold: string (nullable = true)\n",
+ " |-- Unit Price: string (nullable = true)\n",
+ " |-- Unit Cost: string (nullable = true)\n",
+ " |-- Total Revenue: string (nullable = true)\n",
+ " |-- Total Cost: string (nullable = true)\n",
+ " |-- Total Profit: string (nullable = true)\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Selecting columns and inspecting the data"
+ ],
+ "metadata": {
+ "id": "LeIvG6cpoUjA"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.select(\"Region\", \"Country\").show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1QhCW-WYmhpz",
+ "outputId": "d1956f24-83a5-46d3-ef5b-ae6315f00b72"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+\n",
+ "| Region| Country|\n",
+ "+--------------------+--------------------+\n",
+ "|Central America a...|Antigua and Barbuda |\n",
+ "|Central America a...| Panama|\n",
+ "| Europe| Czech Republic|\n",
+ "| Asia| North Korea|\n",
+ "| Asia| Sri Lanka|\n",
+ "|Middle East and N...| Morocco|\n",
+ "|Australia and Oce...|Federated States ...|\n",
+ "| Europe|Bosnia and Herzeg...|\n",
+ "|Middle East and N...| Afghanistan|\n",
+ "| Sub-Saharan Africa| Ethiopia|\n",
+ "|Middle East and N...| Turkey|\n",
+ "|Middle East and N...| Oman|\n",
+ "| Asia| Malaysia|\n",
+ "|Central America a...| Saint Lucia|\n",
+ "|Central America a...|Saint Vincent and...|\n",
+ "|Middle East and N...| Lebanon|\n",
+ "| Europe| Austria|\n",
+ "| Europe| Bulgaria|\n",
+ "| North America| Mexico|\n",
+ "|Central America a...| Trinidad and Tobago|\n",
+ "+--------------------+--------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.dtypes"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "lXt_9Pt9niic",
+ "outputId": "6f8bd081-a67c-43bf-8396-11beac0b91f6"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[('Region', 'string'),\n",
+ " ('Country', 'string'),\n",
+ " ('Item Type', 'string'),\n",
+ " ('Sales Channel', 'string'),\n",
+ " ('Order Priority', 'string'),\n",
+ " ('Order Date', 'string'),\n",
+ " ('Order ID', 'string'),\n",
+ " ('Ship Date', 'string'),\n",
+ " ('Units Sold', 'string'),\n",
+ " ('Unit Price', 'string'),\n",
+ " ('Unit Cost', 'string'),\n",
+ " ('Total Revenue', 'string'),\n",
+ " ('Total Cost', 'string'),\n",
+ " ('Total Profit', 'string')]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 30
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.describe().show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "wKk4pwxFoN91",
+ "outputId": "c6bcb8f9-d8bf-4c2c-c1e5-206d79f6f381"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+-------+------------------+-----------+----------+-------------+--------------+----------+------------------+---------+------------------+-----------------+------------------+------------------+------------------+------------------+\n",
+ "|summary| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID|Ship Date| Units Sold| Unit Price| Unit Cost| Total Revenue| Total Cost| Total Profit|\n",
+ "+-------+------------------+-----------+----------+-------------+--------------+----------+------------------+---------+------------------+-----------------+------------------+------------------+------------------+------------------+\n",
+ "| count| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000| 5000|\n",
+ "| mean| null| null| null| null| null| null| 5.486447372216E8| null| 5030.6982|265.7455639999977|187.49414399999978|1325737.8417080024| 933093.1958900003|392644.64581800025|\n",
+ "| stddev| null| null| null| null| null| null|2.59467108849919E8| null|2914.5154267478674|218.7166954289816| 176.4162798197045| 1475374.673360283|1150873.2176060663| 382935.1484319066|\n",
+ "| min| Asia|Afghanistan| Baby Food| Offline| C| 1/1/2010| 100090873| 1/1/2011| 10| 109.28| 117.11| 100024.60| 1000002.29| 1000278.75|\n",
+ "| max|Sub-Saharan Africa| Zimbabwe|Vegetables| Online| M| 9/9/2015| 999879729| 9/9/2016| 9999| 9.33| 97.44| 998462.86| 99979.55| 99990.00|\n",
+ "+-------+------------------+-----------+----------+-------------+--------------+----------+------------------+---------+------------------+-----------------+------------------+------------------+------------------+------------------+\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Adding columns"
+ ],
+ "metadata": {
+ "id": "uJKqyQs6osQJ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark = df_pyspark.withColumn(\"adding 200 rupee in total cost\", df_pyspark['Total Cost'].cast('double') + 200)  # explicit cast: 'Total Cost' was read as a string column"
+ ],
+ "metadata": {
+ "id": "tSLhXvHbofg6"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "L7bSCWmPpIeo",
+ "outputId": "b97e91da-14f4-4dc4-f7ad-3e5db70d90fa"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|adding 200 rupee in total cost|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72| 88199.84|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38| 211352.48|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.10| 151892.62| 74823.48| 152092.62|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.70| 117.11| 1854591.20|1055863.76| 798727.44| 1056063.76|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88| 735092.48|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88| 2920.16|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52| 296166.72|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88| 33423.68|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04| 317061.44|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62| 1565226.14|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.00| 1944651.84|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.20| 263.33| 3227410.40|1943902.06| 1283508.34| 1944102.06|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.70| 117.11| 2008043.40|1143227.82| 864815.58| 1143427.82|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.20| 263.33| 2966839.20|1786957.38| 1179881.82| 1787157.38|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08| 1024951.76|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.60| 1347000.17|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.70| 117.11| 1155211.20| 657689.76| 497521.44| 657889.76|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.50| 3289599.36|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.90| 55378.18| 27279.72| 55578.18|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92| 824720.24|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Dropping a column"
+ ],
+ "metadata": {
+ "id": "TXYiKuvnp3FM"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark = df_pyspark.drop(\"Region\")"
+ ],
+ "metadata": {
+ "id": "Q6awTLOwpruu"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VM9VRUW3pz_T",
+ "outputId": "0e0382c8-a288-426a-b433-efc224993b91"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n",
+ "| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|adding 200 rupee in total cost|\n",
+ "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n",
+ "|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72| 88199.84|\n",
+ "| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38| 211352.48|\n",
+ "| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.10| 151892.62| 74823.48| 152092.62|\n",
+ "| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.70| 117.11| 1854591.20|1055863.76| 798727.44| 1056063.76|\n",
+ "| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88| 735092.48|\n",
+ "| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88| 2920.16|\n",
+ "|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52| 296166.72|\n",
+ "|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88| 33423.68|\n",
+ "| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04| 317061.44|\n",
+ "| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62| 1565226.14|\n",
+ "| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.00| 1944651.84|\n",
+ "| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.20| 263.33| 3227410.40|1943902.06| 1283508.34| 1944102.06|\n",
+ "| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.70| 117.11| 2008043.40|1143227.82| 864815.58| 1143427.82|\n",
+ "| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.20| 263.33| 2966839.20|1786957.38| 1179881.82| 1787157.38|\n",
+ "|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08| 1024951.76|\n",
+ "| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.60| 1347000.17|\n",
+ "| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.70| 117.11| 1155211.20| 657689.76| 497521.44| 657889.76|\n",
+ "| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.50| 3289599.36|\n",
+ "| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.90| 55378.18| 27279.72| 55578.18|\n",
+ "| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92| 824720.24|\n",
+ "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Renaming a column"
+ ],
+ "metadata": {
+ "id": "52oD6KkXp5Ve"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.withColumnRenamed(\"Country\", \"Region\").show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VoMKuwm9p1cB",
+ "outputId": "d7b5b708-399d-4b3d-a7ef-48019fd544ab"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n",
+ "| Region| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|adding 200 rupee in total cost|\n",
+ "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n",
+ "|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72| 88199.84|\n",
+ "| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38| 211352.48|\n",
+ "| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.10| 151892.62| 74823.48| 152092.62|\n",
+ "| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.70| 117.11| 1854591.20|1055863.76| 798727.44| 1056063.76|\n",
+ "| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88| 735092.48|\n",
+ "| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88| 2920.16|\n",
+ "|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52| 296166.72|\n",
+ "|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88| 33423.68|\n",
+ "| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04| 317061.44|\n",
+ "| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62| 1565226.14|\n",
+ "| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.00| 1944651.84|\n",
+ "| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.20| 263.33| 3227410.40|1943902.06| 1283508.34| 1944102.06|\n",
+ "| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.70| 117.11| 2008043.40|1143227.82| 864815.58| 1143427.82|\n",
+ "| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.20| 263.33| 2966839.20|1786957.38| 1179881.82| 1787157.38|\n",
+ "|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08| 1024951.76|\n",
+ "| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.60| 1347000.17|\n",
+ "| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.70| 117.11| 1155211.20| 657689.76| 497521.44| 657889.76|\n",
+ "| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.50| 3289599.36|\n",
+ "| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.90| 55378.18| 27279.72| 55578.18|\n",
+ "| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92| 824720.24|\n",
+ "+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# **Handling Missing Values in PySpark**"
+ ],
+ "metadata": {
+ "id": "D7LEZTm5qNJO"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Reuse the SparkSession that was already created at the top of the notebook\n",
+ "spark"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 219
+ },
+ "id": "cpQXhhruqB6T",
+ "outputId": "889f4a6a-a22e-4099-d2d2-0bb7292eb595"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
SparkSession - in-memory
\n",
+ " \n",
+ "
\n",
+ "
SparkContext
\n",
+ "\n",
+ "
Spark UI
\n",
+ "\n",
+ "
\n",
+ " - Version
\n",
+ " v3.3.0 \n",
+ " - Master
\n",
+ " local[*] \n",
+ " - AppName
\n",
+ " Practice \n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 48
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark = spark.read.csv(\"/content/5000_sales_records.csv\", header = True, inferSchema = True)"
+ ],
+ "metadata": {
+ "id": "HBwAaOjWqppc"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "dlxEWEefq-oR",
+ "outputId": "92b9775c-6ab0-43d6-b023-79b430074d89"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Drop a column (returns a new DataFrame; df_pyspark itself is not modified)\n",
+ "df_pyspark.drop(\"Order Priority\").show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "FzVgDq9MrJRI",
+ "outputId": "93d65472-32c4-4f03-b5ca-44c4ab2080a5"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+---------------+-------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n",
+ "| Europe| Austria| Cereal| Offline| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n",
+ "| North America| Mexico| Beverages| Online| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n",
+ "+--------------------+--------------------+---------------+-------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Drop every row that contains at least one null value\n",
+ "df_pyspark.na.drop().show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "lyEdvPTqrR_B",
+ "outputId": "749d09e4-94e8-4999-e85a-0dbe78995f58"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# how='any': drop a row if it contains at least one null value\n",
+ "# how='all': drop a row only if all of its values are null\n",
+ "df_pyspark.na.drop(how=\"any\").show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "T61WAtwrrcQD",
+ "outputId": "4e4a2fba-dbb8-4e00-dfb0-5963f791d176"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# thresh=2: keep only rows that have at least 2 non-null values (when thresh is given, it overrides how)\n",
+ "df_pyspark.na.drop(how=\"any\", thresh= 2).show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ra7IH1CSr3Dn",
+ "outputId": "75c79a09-1cb9-43d2-e156-f0bdf5d3c630"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# subset: only the listed columns are checked for null values\n",
+ "df_pyspark.na.drop(how=\"any\", subset=[\"Item Type\"]).show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ljaYa0WwsWdk",
+ "outputId": "ce3c7911-6f74-4e70-9dfc-bfe2bcbd643c"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Fill null values in the 'Region' column with a placeholder string\n",
+ "df_pyspark.na.fill(\"Missing Values\", \"Region\").show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "nK_H8g46sw-I",
+ "outputId": "201897b5-98af-4218-dd81-2ddbc280196e"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "kjKsOHhptDFA",
+ "outputId": "c57a35c5-3b9c-495f-c2d7-98256fcf88b0"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Imputer: fill null values in the selected numeric columns with the column mean (results go to new *_imputed columns)\n",
+ "\n",
+ "from pyspark.ml.feature import Imputer\n",
+ "imputer = Imputer(\n",
+ " inputCols = ['Unit Price','Total Revenue','Total Cost'],\n",
+ " outputCols = ['{}_imputed'.format(c) for c in ['Unit Price','Total Revenue','Total Cost']]\n",
+ ").setStrategy(\"mean\")"
+ ],
+ "metadata": {
+ "id": "zaxEflzdtNY0"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "imputer.fit(df_pyspark).transform(df_pyspark).show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "vB_lvoxOt4uZ",
+ "outputId": "994f88c5-8fd8-4665-b045-da37f6d9647e"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------+---------------------+------------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|Unit Price_imputed|Total Revenue_imputed|Total Cost_imputed|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------+---------------------+------------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72| 255.28| 140914.56| 87999.84|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38| 152.58| 330640.86| 211152.48|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48| 47.45| 226716.1| 151892.62|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44| 205.7| 1854591.2| 1055863.76|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88| 152.58| 1150758.36| 734892.48|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88| 81.73| 3923.04| 2720.16|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52| 109.28| 902434.24| 295966.72|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88| 109.28| 101302.56| 33223.68|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04| 109.28| 966144.48| 316861.44|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62| 255.28| 2506083.76| 1565026.14|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0| 651.21| 2412081.84| 1944451.84|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34| 437.2| 3227410.4| 1943902.06|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58| 205.7| 2008043.4| 1143227.82|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82| 437.2| 2966839.2| 1786957.38|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08| 255.28| 1640939.84| 1024751.76|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6| 421.89| 1558039.77| 1346800.17|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44| 205.7| 1155211.2| 657689.76|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5| 651.21| 4080481.86| 3289399.36|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72| 47.45| 82657.9| 55378.18|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92| 255.28| 1320308.16| 824520.24|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------------+---------------------+------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Filter operation"
+ ],
+ "metadata": {
+ "id": "oYxVvo2Kuf77"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "rdeT55X4t98W",
+ "outputId": "17690dfa-57ae-4bcd-d725-a2f75f6fed6d"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Middle East and N...| Morocco| Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "### Total Cost less than or equal to 80000\n",
+ "df_pyspark.filter(df_pyspark['Total Cost']<=80000).show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "qKrv8-fsvBRy",
+ "outputId": "c65cd792-c130-4b66-f468-cf20df6c0dd1"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Middle East and N...| Morocco|Personal Care| Offline| L| 11/8/2010|412882792|11/22/2010| 48| 81.73| 56.67| 3923.04| 2720.16| 1202.88|\n",
+ "| Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "| North America| Mexico| Beverages| Online| C| 3/13/2017|127374303| 3/20/2017| 1742| 47.45| 31.79| 82657.9| 55378.18| 27279.72|\n",
+ "|Middle East and N...| Libya| Beverages| Offline| L| 1/18/2010|993345010| 3/3/2010| 1718| 47.45| 31.79| 81519.1| 54615.22| 26903.88|\n",
+ "| Europe| Estonia| Fruits| Online| L| 9/28/2016|579463422| 11/1/2016| 4958| 9.33| 6.92| 46258.14| 34309.36| 11948.78|\n",
+ "| Europe| Montenegro| Fruits| Offline| L| 5/29/2016|313705861| 7/10/2016| 1390| 9.33| 6.92| 12968.7| 9618.8| 3349.9|\n",
+ "|Middle East and N...| Lebanon| Fruits| Online| H| 4/5/2013|441150701| 5/12/2013| 5150| 9.33| 6.92| 48049.5| 35638.0| 12411.5|\n",
+ "| Asia| Kyrgyzstan| Snacks| Online| C| 8/6/2013|727492606| 9/22/2013| 84| 152.58| 97.44| 12816.72| 8184.96| 4631.76|\n",
+ "| Asia| Taiwan| Baby Food| Online| M| 5/27/2014|369560611| 6/13/2014| 52| 255.28| 159.42| 13274.56| 8289.84| 4984.72|\n",
+ "|Australia and Oce...| Vanuatu| Fruits| Online| M| 7/13/2017|135336816| 8/17/2017| 8026| 9.33| 6.92| 74882.58| 55539.92| 19342.66|\n",
+ "| Europe| Russia| Household| Online| M| 7/10/2017|194176757| 8/20/2017| 72| 668.27| 502.54| 48115.44| 36182.88| 11932.56|\n",
+ "| Europe| Serbia| Beverages| Online| L| 9/3/2012|599624192| 9/21/2012| 978| 47.45| 31.79| 46406.1| 31090.62| 15315.48|\n",
+ "| Asia| Sri Lanka| Cereal| Online| C| 2/24/2015|743410336| 3/20/2015| 494| 205.7| 117.11| 101615.8| 57852.34| 43763.46|\n",
+ "| Sub-Saharan Africa| Senegal| Beverages| Offline| L| 6/30/2014|530853211| 8/2/2014| 117| 47.45| 31.79| 5551.65| 3719.43| 1832.22|\n",
+ "|Australia and Oce...| Kiribati| Fruits| Online| L| 12/7/2011|905054843| 1/12/2012| 4695| 9.33| 6.92| 43804.35| 32489.4| 11314.95|\n",
+ "| Europe| Liechtenstein| Cosmetics| Online| L| 7/15/2012|229693067| 7/15/2012| 138| 437.2| 263.33| 60333.6| 36339.54| 23994.06|\n",
+ "|Australia and Oce...| Vanuatu| Fruits| Offline| L| 11/6/2015|202262866|12/14/2015| 2932| 9.33| 6.92| 27355.56| 20289.44| 7066.12|\n",
+ "|Central America a...| Haiti| Clothes| Offline| L| 8/12/2010|422752892| 8/12/2010| 830| 109.28| 35.84| 90702.4| 29747.2| 60955.2|\n",
+ "| Europe| Slovenia| Fruits| Online| H|10/22/2012|169799983|11/20/2012| 6443| 9.33| 6.92| 60113.19| 44585.56| 15527.63|\n",
+ "| Europe| Cyprus| Fruits| Offline| M| 7/12/2015|600515115| 8/30/2015| 4622| 9.33| 6.92| 43123.26| 31984.24| 11139.02|\n",
+ "+--------------------+--------------------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Multiple conditions: Total Cost at most 80000 AND Region equal to Europe\n",
+ "\n",
+ "df_pyspark.filter((df_pyspark['Total Cost']<=80000) & (df_pyspark['Region']==\"Europe\")).show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "FBQLb1Mmwd9N",
+ "outputId": "330e69b8-d4fa-42e7-e2ac-a4a1441b9d9a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Europe|Bosnia and Herzeg...| Clothes| Online| M|10/14/2013|919133651| 11/4/2013| 927| 109.28| 35.84| 101302.56| 33223.68| 68078.88|\n",
+ "|Europe| Estonia| Fruits| Online| L| 9/28/2016|579463422| 11/1/2016| 4958| 9.33| 6.92| 46258.14| 34309.36| 11948.78|\n",
+ "|Europe| Montenegro| Fruits| Offline| L| 5/29/2016|313705861| 7/10/2016| 1390| 9.33| 6.92| 12968.7| 9618.8| 3349.9|\n",
+ "|Europe| Russia| Household| Online| M| 7/10/2017|194176757| 8/20/2017| 72| 668.27| 502.54| 48115.44| 36182.88| 11932.56|\n",
+ "|Europe| Serbia| Beverages| Online| L| 9/3/2012|599624192| 9/21/2012| 978| 47.45| 31.79| 46406.1| 31090.62| 15315.48|\n",
+ "|Europe| Liechtenstein| Cosmetics| Online| L| 7/15/2012|229693067| 7/15/2012| 138| 437.2| 263.33| 60333.6| 36339.54| 23994.06|\n",
+ "|Europe| Slovenia| Fruits| Online| H|10/22/2012|169799983|11/20/2012| 6443| 9.33| 6.92| 60113.19| 44585.56| 15527.63|\n",
+ "|Europe| Cyprus| Fruits| Offline| M| 7/12/2015|600515115| 8/30/2015| 4622| 9.33| 6.92| 43123.26| 31984.24| 11139.02|\n",
+ "|Europe| Norway| Cereal| Online| M| 10/8/2014|100640618|10/18/2014| 650| 205.7| 117.11| 133705.0| 76121.5| 57583.5|\n",
+ "|Europe| Armenia| Fruits| Online| M| 3/23/2011|120977771| 5/2/2011| 8866| 9.33| 6.92| 82719.78| 61352.72| 21367.06|\n",
+ "|Europe| Denmark| Beverages| Online| H| 6/5/2016|973268353| 6/26/2016| 589| 47.45| 31.79| 27948.05| 18724.31| 9223.74|\n",
+ "|Europe| Kosovo| Fruits| Online| L| 5/2/2010|291995418| 6/6/2010| 6788| 9.33| 6.92| 63332.04| 46972.96| 16359.08|\n",
+ "|Europe|Bosnia and Herzeg...| Baby Food| Online| C| 4/30/2014|871923768| 5/30/2014| 474| 255.28| 159.42| 121002.72| 75565.08| 45437.64|\n",
+ "|Europe| Lithuania|Office Supplies| Offline| C|12/15/2016|112330758| 1/11/2017| 22| 651.21| 524.96| 14326.62| 11549.12| 2777.5|\n",
+ "|Europe| Spain| Fruits| Online| H|10/22/2011|817006289|11/14/2011| 9172| 9.33| 6.92| 85574.76| 63470.24| 22104.52|\n",
+ "|Europe| Malta| Personal Care| Offline| M| 8/16/2011|466988742| 8/16/2011| 1201| 81.73| 56.67| 98157.73| 68060.67| 30097.06|\n",
+ "|Europe| Italy| Personal Care| Online| C| 2/25/2011|309342658| 3/28/2011| 222| 81.73| 56.67| 18144.06| 12580.74| 5563.32|\n",
+ "|Europe| Kosovo| Fruits| Online| C| 3/2/2012|899868094| 3/26/2012| 9821| 9.33| 6.92| 91629.93| 67961.32| 23668.61|\n",
+ "|Europe|Bosnia and Herzeg...| Fruits| Online| H| 1/5/2013|291305768| 1/6/2013| 2705| 9.33| 6.92| 25237.65| 18718.6| 6519.05|\n",
+ "|Europe| Slovenia| Fruits| Offline| M|12/15/2013|812637078| 1/5/2014| 6225| 9.33| 6.92| 58079.25| 43077.0| 15002.25|\n",
+ "+------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Inverse operation: negate the condition with ~ to keep rows where Total Cost is above 80000\n",
+ "df_pyspark.filter(~(df_pyspark['Total Cost']<=80000)).show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VgXHvC318iTF",
+ "outputId": "21c9e5b4-d0ca-4095-fd25-da06edb4f7e2"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "| Region| Country| Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "|Central America a...|Antigua and Barbuda | Baby Food| Online| M|12/20/2013|957081544| 1/11/2014| 552| 255.28| 159.42| 140914.56| 87999.84| 52914.72|\n",
+ "|Central America a...| Panama| Snacks| Offline| C| 7/5/2010|301644504| 7/26/2010| 2167| 152.58| 97.44| 330640.86| 211152.48| 119488.38|\n",
+ "| Europe| Czech Republic| Beverages| Offline| C| 9/12/2011|478051030| 9/29/2011| 4778| 47.45| 31.79| 226716.1| 151892.62| 74823.48|\n",
+ "| Asia| North Korea| Cereal| Offline| L| 5/13/2010|892599952| 6/15/2010| 9016| 205.7| 117.11| 1854591.2|1055863.76| 798727.44|\n",
+ "| Asia| Sri Lanka| Snacks| Offline| C| 7/20/2015|571902596| 7/27/2015| 7542| 152.58| 97.44| 1150758.36| 734892.48| 415865.88|\n",
+ "|Australia and Oce...|Federated States ...| Clothes| Offline| H| 3/28/2011|932776868| 5/10/2011| 8258| 109.28| 35.84| 902434.24| 295966.72| 606467.52|\n",
+ "|Middle East and N...| Afghanistan| Clothes| Offline| M| 8/27/2016|579814469| 10/5/2016| 8841| 109.28| 35.84| 966144.48| 316861.44| 649283.04|\n",
+ "| Sub-Saharan Africa| Ethiopia| Baby Food| Online| M| 4/13/2015|192993152| 5/7/2015| 9817| 255.28| 159.42| 2506083.76|1565026.14| 941057.62|\n",
+ "|Middle East and N...| Turkey|Office Supplies| Offline| C| 9/25/2013|557156026|10/15/2013| 3704| 651.21| 524.96| 2412081.84|1944451.84| 467630.0|\n",
+ "|Middle East and N...| Oman| Cosmetics| Online| M| 5/12/2013|741101920| 5/17/2013| 7382| 437.2| 263.33| 3227410.4|1943902.06| 1283508.34|\n",
+ "| Asia| Malaysia| Cereal| Offline| L| 7/31/2016|333942162| 8/25/2016| 9762| 205.7| 117.11| 2008043.4|1143227.82| 864815.58|\n",
+ "|Central America a...| Saint Lucia| Cosmetics| Offline| H| 7/6/2015|795100581| 7/16/2015| 6786| 437.2| 263.33| 2966839.2|1786957.38| 1179881.82|\n",
+ "|Central America a...|Saint Vincent and...| Baby Food| Online| L|11/28/2010|504313504| 12/3/2010| 6428| 255.28| 159.42| 1640939.84|1024751.76| 616188.08|\n",
+ "|Middle East and N...| Lebanon| Meat| Offline| H|12/17/2015|611629760| 1/31/2016| 3693| 421.89| 364.69| 1558039.77|1346800.17| 211239.6|\n",
+ "| Europe| Austria| Cereal| Offline| C| 8/13/2014|987410676| 9/6/2014| 5616| 205.7| 117.11| 1155211.2| 657689.76| 497521.44|\n",
+ "| Europe| Bulgaria|Office Supplies| Online| L|10/31/2010|672330081|11/29/2010| 6266| 651.21| 524.96| 4080481.86|3289399.36| 791082.5|\n",
+ "|Central America a...| Trinidad and Tobago| Baby Food| Offline| C| 4/16/2013|783842170| 6/1/2013| 5172| 255.28| 159.42| 1320308.16| 824520.24| 495787.92|\n",
+ "|Middle East and N...| Algeria| Baby Food| Offline| M| 9/5/2015|977806651|10/14/2015| 3572| 255.28| 159.42| 911860.16| 569448.24| 342411.92|\n",
+ "|Australia and Oce...| Tuvalu| Beverages| Offline| L| 3/22/2012|610864150| 4/7/2012| 7132| 47.45| 31.79| 338413.4| 226726.28| 111687.12|\n",
+ "|Middle East and N...| Saudi Arabia| Snacks| Offline| L| 4/25/2017|604870164| 6/5/2017| 3378| 152.58| 97.44| 515415.24| 329152.32| 186262.92|\n",
+ "+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Groupby and aggregation"
+ ],
+ "metadata": {
+ "id": "llhOyJua9Jqg"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from pyspark.sql import SparkSession\n",
+ "spark = SparkSession.builder.appName(\"Aggregate\").getOrCreate()"
+ ],
+ "metadata": {
+ "id": "8tbfOvxv9E-H"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "IGFm7O2z9c5s",
+ "outputId": "5d4d9486-420c-467d-f0f0-d6d1b2f87437"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Row(Region='Central America and the Caribbean', Country='Antigua and Barbuda ', Item Type='Baby Food', Sales Channel='Online', Order Priority='M', Order Date='12/20/2013', Order ID=957081544, Ship Date='1/11/2014', Units Sold=552, Unit Price=255.28, Unit Cost=159.42, Total Revenue=140914.56, Total Cost=87999.84, Total Profit=52914.72)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 90
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_pyspark.printSchema()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Okb_Wwmw9d7m",
+ "outputId": "f565628f-83c6-4d58-d197-97cfda3e5c62"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "root\n",
+ " |-- Region: string (nullable = true)\n",
+ " |-- Country: string (nullable = true)\n",
+ " |-- Item Type: string (nullable = true)\n",
+ " |-- Sales Channel: string (nullable = true)\n",
+ " |-- Order Priority: string (nullable = true)\n",
+ " |-- Order Date: string (nullable = true)\n",
+ " |-- Order ID: integer (nullable = true)\n",
+ " |-- Ship Date: string (nullable = true)\n",
+ " |-- Units Sold: integer (nullable = true)\n",
+ " |-- Unit Price: double (nullable = true)\n",
+ " |-- Unit Cost: double (nullable = true)\n",
+ " |-- Total Revenue: double (nullable = true)\n",
+ " |-- Total Cost: double (nullable = true)\n",
+ " |-- Total Profit: double (nullable = true)\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Groupby\n",
+ "# Group by Region and sum every numeric column (note: this also sums Order ID, which is not a meaningful total)\n",
+ "df_pyspark.groupby(\"Region\").sum().show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "R5v7YHME9vTz",
+ "outputId": "c3188a30-dcdc-46cb-d181-28020bd65d5b"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+-------------+---------------+------------------+------------------+--------------------+--------------------+--------------------+\n",
+ "| Region|sum(Order ID)|sum(Units Sold)| sum(Unit Price)| sum(Unit Cost)| sum(Total Revenue)| sum(Total Cost)| sum(Total Profit)|\n",
+ "+--------------------+-------------+---------------+------------------+------------------+--------------------+--------------------+--------------------+\n",
+ "|Middle East and N...| 329477662194| 3013431|157454.56999999983|110155.62999999966| 7.668677777200001E8| 5.354874771899998E8| 2.313803005300001E8|\n",
+ "|Australia and Oce...| 233075865831| 2111786|116833.26999999983| 82580.71999999994| 5.873640885299999E8|4.1242951558999985E8|1.7493457293999985E8|\n",
+ "| Europe| 719802331123| 6582322| 352308.410000005|249038.25999999998|1.7036223981099987E9|1.2019463497899995E9|5.0167604832000005E8|\n",
+ "| Sub-Saharan Africa| 714129889652| 6642380| 349025.5400000039|246882.89000000057|1.8145672823999999E9| 1.28342501448E9|5.3114226791999996E8|\n",
+ "|Central America a...| 295473045677| 2698776|138441.96999999983| 97945.32999999967| 6.849763237699999E8|4.8036434404999995E8| 2.046119797200002E8|\n",
+ "| North America| 53808179824| 484760|31107.340000000026|22288.680000000004| 1.5101425209E8|1.0977385412000002E8|4.1240397970000006E7|\n",
+ "| Asia| 397456711807| 3620036|183556.71999999988|128579.20999999954| 9.202770859200008E8| 6.420394242299997E8| 2.782376616899999E8|\n",
+ "+--------------------+-------------+---------------+------------------+------------------+--------------------+--------------------+--------------------+\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# from pyspark.mllib.classification import LogisticRegressionWithSGD\n",
+ "# from pyspark.mllib.regression import LabeledPoint\n",
+ "# from numpy import array\n",
+ "\n",
+ "# # Load and parse the data\n",
+ "# def parsePoint(line):\n",
+ "# values = [float(x) for x in line.split(' ')]\n",
+ "# return LabeledPoint(values[0], values[1:])\n",
+ "\n",
+ "# data = sc.textFile(\"/content/5000_sales_records.csv\")\n",
+ "# parsedData = data.map(parsePoint)\n",
+ "\n",
+ "# # Build the model\n",
+ "# model = LogisticRegressionWithSGD.train(parsedData)\n",
+ "\n",
+ "# # Evaluating the model on training data\n",
+ "# labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))\n",
+ "# trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())\n",
+ "# print(\"Training Error = \" + str(trainErr))"
+ ],
+ "metadata": {
+ "id": "UpS_aHY194m7"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "8gjAVr7DBvr7",
+ "outputId": "a0f9a97f-1ad4-408b-885e-29fe32019db7"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "DataFrame[Region: string, Country: string, Item Type: string, Sales Channel: string, Order Priority: string, Order Date: string, Order ID: int, Ship Date: string, Units Sold: int, Unit Price: double, Unit Cost: double, Total Revenue: double, Total Cost: double, Total Profit: double]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 95
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "CvVdj8QrBxNA"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file