diff --git a/tutorials/missing_data_imputation.ipynb b/tutorials/missing_data_imputation.ipynb new file mode 100644 index 0000000..39fc9b4 --- /dev/null +++ b/tutorials/missing_data_imputation.ipynb @@ -0,0 +1,549 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "86ee5293", + "metadata": {}, + "source": [ + "# Missing Data Imputation using Machine Learning Methods" + ] + }, + { + "cell_type": "markdown", + "id": "11340512", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "Missing data is a common problem in real-world datasets. Many machine learning models cannot handle missing values directly, so we usually need to fill (impute) them before training.\n", + "\n", + "In this tutorial, we will compare three techniques for handling missing data: mean, median, and KNN imputation." + ] + }, + { + "cell_type": "markdown", + "id": "a7c2e405", + "metadata": {}, + "source": [ + "## Why Handle Missing Data?\n", + "\n", + "- Many ML algorithms cannot work with missing values\n", + "- Missing data can reduce model accuracy\n", + "- Appropriate imputation can improve model performance" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f3541284", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.impute import SimpleImputer, KNNImputer\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a28d3e7f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeSalary
025.050000.0
130.060000.0
2NaN55000.0
335.0NaN
440.065000.0
5NaN70000.0
\n", + "
" + ], + "text/plain": [ + " Age Salary\n", + "0 25.0 50000.0\n", + "1 30.0 60000.0\n", + "2 NaN 55000.0\n", + "3 35.0 NaN\n", + "4 40.0 65000.0\n", + "5 NaN 70000.0" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = {\n", + " \"Age\": [25, 30, np.nan, 35, 40, np.nan],\n", + " \"Salary\": [50000, 60000, 55000, np.nan, 65000, 70000]\n", + "}\n", + "\n", + "df = pd.DataFrame(data)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "c6d32952", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "The cell above creates a small dataset with two numeric columns, `Age` and `Salary`, that we will use to demonstrate the imputation techniques. `Age` has two missing values and `Salary` has one, as the checks below confirm." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b68dc8b9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Age 2\n", + "Salary 1\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check missing values\n", + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "fa85cf3f", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png":
"iVBORw0KGgoAAAANSUhEUgAAAiwAAAHRCAYAAABJk+KzAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjYsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvq6yFwwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAMNtJREFUeJzt3Ql8VNX9//9PCCQENGGTLBoJmyBLQgWBIBSogYB8KbTfIlArSwGrFQuiIEEgILZRqhQUhCqy2a+yVMUFBDEIFAkiIKUoImCQNWySBIIEJPN7fM7/P9NMMoFMgMzJ5PV8PK7JvXPmzrkT4rxzthvgcDgcAgAAYLEKvq4AAADA1RBYAACA9QgsAADAegQWAABgPQILAACwHoEFAABYj8ACAACsR2ABAADWI7AAAADrEVgACwQEBMikSZOu+3ljYmJk0KBB4g86depkNgDlE4EFuE4WLFhggoduGzduLPS43gUjOjraPP4///M/4q/eeecdc41z584tssyaNWtMmZdeekn8yf79++UPf/iD1KtXTypXriyhoaFyzz33yIwZM+THH38UG7zyyivm3ypQ1lT0dQUAf6MfVG+++aa0b9/e7fj69evl8OHDEhwcXOg5+mFWseL1/3Xcs2ePVKhQun+X9OjRQ8LCwsx7MHToUI9l9LHAwEDp16+f+IsVK1ZInz59zM93wIAB0qxZM7l48aIJr6NHj5avvvpKXn31VSsCS61atfym5Q3lB4EFuM7uu+8+WbZsmWk9yB9C9EO6ZcuWcurUKY8h50bwFI5uNH3N3/zmNzJ//nw5evSoREVFuT1+4cIFeffdd6VLly5Su3Zt8Qfp6ekmfNWpU0fWrl0rkZGRrsceffRR2bdvnwk0AEqOLiHgOuvfv7+cPn3adHs46V/a//znP+W3v/1tscawnD17VkaOHGnGoGgA0A92/YDfvn27q8zevXvlf//3fyUiIsIEnttuu818aGZlZRU5hsXZbfXZZ5/JqFGj5JZbbpGqVavKr371Kzl58qRbnfLy8kydNHBUqVJFOnfuLF9//XWxxsX87ne/M89fvHhxocf0g1vr+MADD5h9DTa/+MUvzDXqtTZp0kRmz559lXf5v9dy4MABt+Pr1q0zx/Vrfp9//rl069bNtP7o9XTs2NG8D/kV5333ZOrUqXLu3Dl5/fXX3cKKU4MGDWTEiBGu/Z9++kmmTJki9evXN6+jrzdu3DjJzc0t1timkv5c9Xna0qOtfc7uS8YFoayghQW4zvRDIT4+Xt566y3p3r27OfbRRx+ZD2kNFMUZt/Hwww+bgDN8+HDzAa4BSLsWdu/eLXfddZcJQImJieYD7rHHHjOh5ciRI/Lhhx9KZmam+VC+En1O9erVJTk52XzgT58+3bzWkiVLXGWSkpLMB3HPnj3Na/373/82X7WF5Gp+/vOfmwClrUr6AZqfHtPA0Lt3b7Ov4aRp06byy1/+0rRIffDBB/LHP/7RBB5tnbgetNVDfxbawqXXrN1kzqD0r3/9S1q3bl2s970oWmcdt9KuXbti1Ue7yhYuXGhaop544gkTplJSUszraOtTSV3t56r7Wuamm26Sp59+2hwLDw8v8esBpcoB4LqYP3++Q3+lvvjiC8fMmTMdN998s+P8+fPmsT59+jg6d+5svq9Tp46jR48ebs/V5yUnJ7v2w8LCHI8++miRr/Xll1+a5yxbtuyKddLXGjhwYKE6JiQkOPLy8lzHH3/8cUdgYKAjMzPT7GdkZDgqVqzo6N27t9v5Jk2aZJ6f/5xFGT16tCm7Z88e17GsrCxH5cqVHf3793cdc75H+SUmJjrq1avndqxjx45mK3gt6enpbuU+/fRTc1y/Kr3Ohg0bmnPmv2Z93bp16zq6dOlS7PfdE70mfb1evXoVq/yOHTtM+aFDh7odf/LJJ83xtWvXFvnv4lp/rqpp06Zu7yNQVtAlBNwA999/vxlIqy0e2s2gX4vqDvKkWrVq5q9uHQPiibM
FZfXq1XL+/Hmv6/fQQw+Z7gCnDh06yOXLl+X77783+6mpqabbQls68tO/zotLu4WcLSpOb7/9tmmhcXYHqZCQENf32gqlY3y0u+a7775z694qqR07dpjuM33/tcVEz69bTk6O3HvvvbJhwwbTmlOc992T7Oxs8/Xmm28uVvmVK1earwVbnrSlRV3LWJer/VyBsozAAtwAOoYgISHBfFjrNF/90NDm/+LSrphdu3aZadDaXaHjGPQD3Klu3brmA0+nDuuMD+2qmTVrVrE/4G+//Xa3fe1GUGfOnDFfnR9wOvYivxo1arjKXk1sbKyZKaNdY076fjjr66TjLvS90jEXGhj0vdPxHOp6BBYNK2rgwIHm3Pk3ff+0W835Old73z3RqctKg2lx6HurXVIF31vt1tPrv5ZwcbWfK1CWEViAG0T/otexK3PmzDHjJ/TDyJsWGv2gfPnll82g17/+9a9mnIeez+nFF1+UnTt3mg93bc3505/+ZMro1Omr0SnFnvx/vRDXj7ayfPvtt7J161bJyMiQTz/91Fybc/aUrluirRza4jFt2jTTuqCDlR9//HHzuLPlw5P8LQn5aTjMz3kOfQ/13J42HdNR3PfdU2DRshp0vFFU/Yuj4DWW9s8V8AUCC3CD6AwN/Ut68+bNXnUHOelsE+2SWb58uZk2W7NmTfnzn//sVqZ58+Yyfvx4062hg0d14K0GpGul03OVTsfNT7tUvPlrXWdM6QeztqzowE/9oM3fHaSDVbWF4/333zcLrumUcG1tyd9NVBRn64EOMs6vYAuFzsRxBgs9t6etUqVKXr3vBelCgBq+0tLSivXeaohytvw4HT9+3FyL8713XmPB69MB18eOHZOSupagBPgSgQW4QfSvdp0Bo90KOtOmuPRDvWBXiE6v1b/indNeddyEjjEpGF40IBWcGlsS2uqhrSAFpxfPnDnT6y4KHUehYeUf//iH6crKP5PG2SKQvwVAr11n8FyNM4hoWMv/3hVcnE1nBmnZF154wUw9Lsg57bc473tRxowZY7q0dPaPBo+CNMzoardKQ5lzxk5+2sLkXHgv/zXmvz6l11dUC0txaD0LhiCgLGBaM3AD6bgJb+lYCJ0SrGNe4uLiTPD55JNP5IsvvjDdQM5pujpdVVdWveOOO0x4eeONN0wA0LVZrpVOddV1Q/T1dLqxrl+i05q1a0THoHjzV7p2C+lgUB3I6pxK69S1a1cJCgoygU5bWDRQvPbaayYoXK0VQbtq2rZta6Zf//DDD2Z8ja77UjDIaYjTsSraLafPGTx4sNx6662mNUq7qLTlRVt6ivO+F0WDhbYi9e3bV+688063lW43bdpkFhJ0rpui59Z/Fxo8NDjoAOMtW7aYac461VvXu3HSAKRTrfVnquvB6M9AB1rrz6CkNMBpEH322WfNOBp9r3V6N2A9X09TAvxxWvOVXG1ac25urpkSHBcXZ6ZGV61a1Xz/yiuvuMp/9913jt///veO+vXrm2nCNWrUMNOmP/nkk2JNfy1Yx4JTgdVPP/3kmDBhgiMiIsIREhLi+MUvfuHYvXu3o2bNmo6HH3642O/LDz/84AgODjbn//rrrws9/v777ztiY2PNdcTExDief/55x7x58wpNWS44rVnt37/fTOXV84eHhzvGjRvnWLNmTaFrcU4F//Wvf23qr+X1vbn//vsdqampxX7fr+bbb791DBs2zFxHUFCQOc8999zjePnllx0XLlxwlbt06ZJj8uTJZlp1pUqVHNHR0Y6kpCS3Mury5cuOp556ylGrVi1HlSpVzNTsffv2XdPPVaes678/rZs+xhRnlBUB+h9fhyYAZYO2COi4Cv3rvGBrCQDcSIxhAeCRp7sLO8ddsJw7gNLGGBYAHulAWb1HjQ4S1fEcukS9rqmi407uueceX1cPQDlDYAFQ5MJvOlNIF1PTWUnOgbjaHQQApY0xLAAAwHqMYQEAANYjsAAAAOv5xRgWXeZaF6XSu6Wy7DQ
AAGWDjkrRRRt1RWld5NHvA4uGFb27KgAAKHsOHTpkVpr2+8CiLSvOC3be6h0AANhNZyBqg4Pzc9zvA4uzG0jDCoEFAICypTjDORh0CwAArEdgAQAA1iOwAAAA6xFYAACA9QgsAADAegQWAABgPQILAACwHoEFAABYj8ACAACsR2ABAAD+FVhSUlLk7rvvNmv+165dW3r37i179uy56vOWLVsmjRs3lsqVK0vz5s1l5cqVhe7WOHHiRImMjJSQkBBJSEiQvXv3en81AADAL3kVWNavXy+PPvqobN68WdasWSOXLl2Srl27Sk5OTpHP2bRpk/Tv31+GDBkiX375pQk5uu3atctVZurUqfLSSy/JnDlz5PPPP5eqVatKYmKiXLhw4dquDgAA+IUAhzZvlNDJkydNS4sGmZ///Ocey/Tt29cEmg8//NB1rG3bttKiRQsTUPTlo6Ki5IknnpAnn3zSPJ6VlSXh4eGyYMEC6devX7Hu9hgWFmaex80PAQAoG7z5/L6mMSz6AqpGjRpFlklLSzNdPPlp64keV+np6ZKRkeFWRivfpk0bV5mCcnNzzUXm3wAAgP+qWNIn5uXlyciRI+Wee+6RZs2aFVlOw4i2luSn+3rc+bjzWFFlPI2lmTx5ckmr7ldixq7wdRVQig4818PXVQAAnyhxC4uOZdFxKIsXL5bSlpSUZFp3nNuhQ4dKvQ4AAMDyFpbhw4ebMSkbNmyQ22677YplIyIi5Pjx427HdF+POx93HtNZQvnL6DgXT4KDg80GAADKB69aWHSArIaVd999V9auXSt169a96nPi4+MlNTXV7ZjOMNLjSs+hoSV/GR2TorOFnGUAAED5VtHbbqA333xT3nvvPbMWi3OMiQ6S1fVT1IABA+TWW28140zUiBEjpGPHjvLiiy9Kjx49TBfS1q1b5dVXXzWPBwQEmLEwzz77rDRs2NAEmAkTJpiZQzr9GQAAwKvAMnv2bPO1U6dObsfnz58vgwYNMt8fPHhQKlT4b8NNu3btTMgZP368jBs3zoSS5cuXuw3UHTNmjJn6/NBDD0lmZqa0b99eVq1aZRaaAwAAuKZ1WGxRntdhYZZQ+cIsIQD+pNTWYQEAACgNBBYAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAOsRWAAAgPUILAAAwHoEFgAAYD0CCwAAsB6BBQAAWI/AAgAArEdgAQAA1iOwAAAA6xFYAACA9QgsAADAegQWAABgPQILAACwHoEFAABYj8ACAACsR2ABAADWI7AAAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAOsRWAAAgPUILAAAwP8Cy4YNG6Rnz54SFRUlAQEBsnz58iuWHzRokClXcGvatKmrzKRJkwo93rhx45JdEQAA8DteB5acnByJi4uTWbNmFav8jBkz5NixY67t0KFDUqNGDenTp49bOQ0w+ctt3LjR26oBAAA/VdHbJ3Tv3t1sxRUWFmY2J22ROXPmjAwePNi9IhUrSkREhLfVAQAA5UCpj2F5/fXXJSEhQerUqeN2fO/evaabqV69evLAAw/IwYMHizxHbm6uZGdnu20AAMB/lWpgOXr0qHz00UcydOhQt+Nt2rSRBQsWyKpVq2T27NmSnp4uHTp0kLNnz3o8T0pKiqvlRrfo6OhSugIAAOD3gWXhwoVSrVo16d27t9tx7WLSMS2xsbGSmJgoK1eulMzMTFm6dKnH8yQlJUlWVpZr03ExAADAf3k9hqWkHA6HzJs3Tx588EEJCgq6YlkNNXfccYfs27fP4+PBwcFmAwAA5UOptbCsX7/eBJAhQ4Zctey5c+dk//79EhkZWSp1AwAAfhZYNEzs2LHDbErHm+j3zkGy2l0zYMAAj4NtdaxKs2bNCj325JNPmkBz4MAB2bRpk/zqV7+SwMBA6d+/f8muCgAAlO8uoa1bt0rnzp1d+6NGjTJfBw4caAb
O6hoqBWf46DiTt99+26zJ4snhw4dNODl9+rTccsst0r59e9m8ebP5HgAAIMChg0vKOJ3WrLOFNBiFhoZKeRIzdoWvq4BSdOC5Hr6uAgD45PObewkBAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAOsRWAAAgPUILAAAwHoEFgAAYD0CCwAAsB6BBQAAWI/AAgAArEdgAQAA1iOwAAAA6xFYAACA9QgsAADAegQWAABgPQILAACwHoEFAABYj8ACAACsR2ABAADWI7AAAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAP8LLBs2bJCePXtKVFSUBAQEyPLly69Yft26daZcwS0jI8Ot3KxZsyQmJkYqV64sbdq0kS1btnh/NQAAwC95HVhycnIkLi7OBAxv7NmzR44dO+baateu7XpsyZIlMmrUKElOTpbt27eb8ycmJsqJEye8rR4AAPBDFb19Qvfu3c3mLQ0o1apV8/jYtGnTZNiwYTJ48GCzP2fOHFmxYoXMmzdPxo4d6/VrAQAA/1JqY1hatGghkZGR0qVLF/nss89cxy9evCjbtm2ThISE/1aqQgWzn5aW5vFcubm5kp2d7bYBAAD/dcMDi4YUbTF5++23zRYdHS2dOnUyXT/q1KlTcvnyZQkPD3d7nu4XHOfilJKSImFhYa5NzwkAAPyX111C3mrUqJHZnNq1ayf79++Xv/3tb/LGG2+U6JxJSUlmzIuTtrAQWgAA8F83PLB40rp1a9m4caP5vlatWhIYGCjHjx93K6P7ERERHp8fHBxsNgAAUD74ZB2WHTt2mK4iFRQUJC1btpTU1FTX43l5eWY/Pj7eF9UDAABlvYXl3Llzsm/fPtd+enq6CSA1atSQ22+/3XTXHDlyRBYtWmQenz59utStW1eaNm0qFy5ckLlz58ratWvl448/dp1Du3cGDhworVq1Mq0v+hydPu2cNQQAAMo3rwPL1q1bpXPnzq5951gSDRwLFiwwa6wcPHjQbRbQE088YUJMlSpVJDY2Vj755BO3c/Tt21dOnjwpEydONANtdUbRqlWrCg3EBQAA5VOAw+FwSBmng251tlBWVpaEhoZKeRIzdoWvq4BSdOC5Hr6uAgD45PObewkBAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAOsRWAAAgPUILAAAwHoEFgAAYD0CCwAAsB6BBQAAWI/AAgAArEdgAQAA1iOwAAAA6xFYAACA9QgsAADAegQWAABgPQILAACwHoEFAABYj8ACAACsR2ABAADWI7AAAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAP8LLBs2bJCePXtKVFSUBAQEyPLly69Y/p133pEuXbrILbfcIqGhoRIfHy+rV692KzNp0iRzrvxb48aNvb8aAADgl7wOLDk5ORIXFyezZs0qdsDRwLJy5UrZtm2bdO7c2QSeL7/80q1c06ZN5dixY65t48aN3lYNAAD4qYrePqF79+5mK67p06e77f/lL3+R9957Tz744AP52c9+9t+KVKwoERER3lYHAACUA6U+hiUvL0/Onj0rNWrUcDu+d+9e081Ur149eeCBB+TgwYNFniM3N1eys7PdNgAA4L9KPbC88MILcu7cObn//vtdx9q0aSMLFiyQVatWyezZsyU9PV06dOhggo0nKSkpEhYW5tqio6NL8QoAAIBfB5Y333xTJk+eLEuXLpXatWu7jmsXU58+fSQ2NlYSExPNeJfMzExTzpOkpCTJyspybYcOHSrFqwAAANaPYSmpxYsXy9ChQ2XZsmWSkJBwxbLVqlWTO+64Q/bt2+fx8eDgYLMBAIDyoVRaWN566y0ZPHiw+dqjR4+rltcuo/3790tkZGRpVA8AAPh
bC4uGifwtHzreZMeOHWYQ7e233266a44cOSKLFi1ydQMNHDhQZsyYYcaqZGRkmOMhISFm/Il68sknzVTnOnXqyNGjRyU5OVkCAwOlf//+1+9KAQBA+Wlh2bp1q5mO7JySPGrUKPP9xIkTzb6uoZJ/hs+rr74qP/30kzz66KOmxcS5jRgxwlXm8OHDJpw0atTIDMatWbOmbN682Sw2BwAAEOBwOBxSxum0Zm2t0QG4uppueRIzdoWvq4BSdOC5q3epAoA/fn5zLyEAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAOsRWAAAgPUILAAAwHoEFgAAYD0CCwAAsB6BBQAAWI/AAgAArEdgAQAA1iOwAAAA6xFYAACA9QgsAADAegQWAABgPQILAACwHoEFAABYj8ACAACsR2ABAADWI7AAAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAOsRWAAAgPUILAAAwHoEFgAA4H+BZcOGDdKzZ0+JioqSgIAAWb58+VWfs27dOrnrrrskODhYGjRoIAsWLChUZtasWRITEyOVK1eWNm3ayJYtW7ytGgAA8FNeB5acnByJi4szAaM40tPTpUePHtK5c2fZsWOHjBw5UoYOHSqrV692lVmyZImMGjVKkpOTZfv27eb8iYmJcuLECW+rBwAA/FCAw+FwlPjJAQHy7rvvSu/evYss89RTT8mKFStk165drmP9+vWTzMxMWbVqldnXFpW7775bZs6cafbz8vIkOjpaHnvsMRk7duxV65GdnS1hYWGSlZUloaGhUp7EjF3h6yqgFB14roevqwAA1403n983fAxLWlqaJCQkuB3T1hM9ri5evCjbtm1zK1OhQgWz7yxTUG5urrnI/BsAAPBfFW/0C2RkZEh4eLjbMd3XkPHjjz/KmTNn5PLlyx7LfPPNNx7PmZKSIpMnT76h9QYAX6MFtXyhBdUPZwklJSWZ5iPndujQIV9XCQAAlOUWloiICDl+/LjbMd3XvqqQkBAJDAw0m6cy+lxPdLaRbgAAoHy44S0s8fHxkpqa6nZszZo15rgKCgqSli1bupXRQbe67ywDAADKN68Dy7lz58z0ZN2c05b1+4MHD7q6awYMGOAq//DDD8t3330nY8aMMWNSXnnlFVm6dKk8/vjjrjI6pfm1116ThQsXyu7du+WRRx4x06cHDx58fa4SAACUry6hrVu3mjVV8ocNNXDgQLMg3LFjx1zhRdWtW9dMa9aAMmPGDLnttttk7ty5ZqaQU9++feXkyZMyceJEM0i3RYsWZspzwYG4AACgfLqmdVhswTosKC+YRVC+8PtdvpTH3+9sm9ZhAQAAuFYEFgAAYD0CCwAAsB6BBQAAWI/AAgAArEdgAQAA1iOwAAAA6xFYAACA9QgsAADAegQWAABgPQILAACwHoEFAABYj8ACAACsR2ABAADWI7AAAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAOsRWAAAgPUILAAAwHoEFgAAYD0CCwAAsB6BBQAAWI/AAgAArEdgAQAA1iOwAAAA6xFYAACA9QgsAADAeiUKLLNmzZKYmBipXLmytGnTRrZs2VJk2U6dOklAQEChrUePHq4ygwYNKvR4t27dSnZFAADA71T09glLliyRUaNGyZw5c0xYmT59uiQmJsqePXukdu3ahcq/8847cvHiRdf+6dOnJS4uTvr06eNWTgPK/PnzXfvBwcHeXw0AAPBLXrewTJs2TYYNGyaDBw+WJk2amOBSpUoVmTdvnsfyNWrUkIiICNe2Zs0aU75gYNGAkr9c9erVS35VAACg/AYWbSnZtm2bJCQk/PcEFSqY/bS0tGKd4/XXX5d+/fpJ1apV3Y6vW7fOtNA0atRIHnnkEdMSU5Tc3FzJzs522wAAgP/yKrCcOnVKLl++LOHh4W7HdT8jI+O
qz9exLrt27ZKhQ4cW6g5atGiRpKamyvPPPy/r16+X7t27m9fyJCUlRcLCwlxbdHS0N5cBAAD8fQzLtdDWlebNm0vr1q3djmuLi5M+HhsbK/Xr1zetLvfee2+h8yQlJZlxNE7awkJoAQDAf3nVwlKrVi0JDAyU48ePux3XfR13ciU5OTmyePFiGTJkyFVfp169eua19u3b5/FxHe8SGhrqtgEAAP/lVWAJCgqSli1bmq4bp7y8PLMfHx9/xecuW7bMjD353e9+d9XXOXz4sBnDEhkZ6U31AACAn/J6lpB2xbz22muycOFC2b17txkgq60nOmtIDRgwwHTZeOoO6t27t9SsWdPt+Llz52T06NGyefNmOXDggAk/vXr1kgYNGpjp0gAAAF6PYenbt6+cPHlSJk6caAbatmjRQlatWuUaiHvw4EEzcyg/XaNl48aN8vHHHxc6n3Yx7dy50wSgzMxMiYqKkq5du8qUKVNYiwUAAJR80O3w4cPN5okOlC1Ipyo7HA6P5UNCQmT16tUlqQYAACgnuJcQAACwHoEFAABYj8ACAACsR2ABAADWI7AAAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAOsRWAAAgPUILAAAwHoEFgAAYD0CCwAAsB6BBQAAWI/AAgAArEdgAQAA1iOwAAAA6xFYAACA9QgsAADAegQWAABgPQILAACwHoEFAABYj8ACAACsR2ABAADWI7AAAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAAPDPwDJr1iyJiYmRypUrS5s2bWTLli1Fll2wYIEEBAS4bfq8/BwOh0ycOFEiIyMlJCREEhISZO/evSWpGgAA8ENeB5YlS5bIqFGjJDk5WbZv3y5xcXGSmJgoJ06cKPI5oaGhcuzYMdf2/fffuz0+depUeemll2TOnDny+eefS9WqVc05L1y4ULKrAgAA5TuwTJs2TYYNGyaDBw+WJk2amJBRpUoVmTdvXpHP0VaViIgI1xYeHu7WujJ9+nQZP3689OrVS2JjY2XRokVy9OhRWb58ecmvDAAAlM/AcvHiRdm2bZvpsnGdoEIFs5+Wllbk886dOyd16tSR6OhoE0q++uor12Pp6emSkZHhds6wsDDT1VTUOXNzcyU7O9ttAwAA/surwHLq1Cm5fPmyWwuJ0n0NHZ40atTItL6899578o9//EPy8vKkXbt2cvjwYfO483nenDMlJcWEGuemQQgAAPivGz5LKD4+XgYMGCAtWrSQjh07yjvvvCO33HKL/P3vfy/xOZOSkiQrK8u1HTp06LrWGQAAlOHAUqtWLQkMDJTjx4+7Hdd9HZtSHJUqVZKf/exnsm/fPrPvfJ435wwODjYDefNvAADAf3kVWIKCgqRly5aSmprqOqZdPLqvLSnFoV1K//nPf8wUZlW3bl0TTPKfU8ek6Gyh4p4TAAD4t4rePkGnNA8cOFBatWolrVu3NjN8cnJyzKwhpd0/t956qxlnop555hlp27atNGjQQDIzM+Wvf/2rmdY8dOhQ1wyikSNHyrPPPisNGzY0AWbChAkSFRUlvXv3vt7XCwAAykNg6du3r5w8edIs9KaDYnVsyqpVq1yDZg8ePGhmDjmdOXPGTIPWstWrVzctNJs2bTJTop3GjBljQs9DDz1kQk379u3NOQsuMAcAAMqnAIcuhFLGaReSzhbSAbjlbTxLzNgVvq4CStGB53r4ugooRfx+ly/l8fc724vPb+4lBAAArEdgAQAA1iOwAAAA6xFYAACA9QgsAADAegQWAABgPQILAACwHoEFAABYj8ACAACsR2ABAADWI7AAAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAOsRWAAAgPUILAAAwHoEFgAAYD0CCwAAsB6BBQAAWI/AAgAArEdgAQAA1iOwAAAA6xFYAACA9QgsAADAegQWAAB
gPQILAACwHoEFAABYj8ACAAD8M7DMmjVLYmJipHLlytKmTRvZsmVLkWVfe+016dChg1SvXt1sCQkJhcoPGjRIAgIC3LZu3bqVpGoAAMAPeR1YlixZIqNGjZLk5GTZvn27xMXFSWJiopw4ccJj+XXr1kn//v3l008/lbS0NImOjpauXbvKkSNH3MppQDl27Jhre+utt0p+VQAAoHwHlmnTpsmwYcNk8ODB0qRJE5kzZ45UqVJF5s2b57H8//3f/8kf//hHadGihTRu3Fjmzp0reXl5kpqa6lYuODhYIiIiXJu2xgAAAHgdWC5evCjbtm0z3TpOFSpUMPvaelIc58+fl0uXLkmNGjUKtcTUrl1bGjVqJI888oicPn26yHPk5uZKdna22wYAAPyXV4Hl1KlTcvnyZQkPD3c7rvsZGRnFOsdTTz0lUVFRbqFHu4MWLVpkWl2ef/55Wb9+vXTv3t28licpKSkSFhbm2rSbCQAA+K+Kpflizz33nCxevNi0puiAXad+/fq5vm/evLnExsZK/fr1Tbl777230HmSkpLMOBonbWEhtAAA4L+8amGpVauWBAYGyvHjx92O676OO7mSF154wQSWjz/+2ASSK6lXr555rX379nl8XMe7hIaGum0AAMB/eRVYgoKCpGXLlm4DZp0DaOPj44t83tSpU2XKlCmyatUqadWq1VVf5/Dhw2YMS2RkpDfVAwAAfsrrWULaFaNrqyxcuFB2795tBsjm5OSYWUNqwIABpsvGScekTJgwwcwi0rVbdKyLbufOnTOP69fRo0fL5s2b5cCBAyb89OrVSxo0aGCmSwMAAHg9hqVv375y8uRJmThxogkeOl1ZW06cA3EPHjxoZg45zZ4928wu+s1vfuN2Hl3HZdKkSaaLaefOnSYAZWZmmgG5uk6Ltsho1w8AAECJBt0OHz7cbJ7oQNn8tNXkSkJCQmT16tUlqQYAACgnuJcQAACwHoEFAABYj8ACAACsR2ABAADWI7AAAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAALAegQUAAFiPwAIAAKxHYAEAANYjsAAAAOsRWAAAgPUILAAAwHoEFgAAYD0CCwAAsB6BBQAAWI/AAgAArEdgAQAA1iOwAAAA6xFYAACA9QgsAADAegQWAABgPQILAACwHoEFAABYj8ACAACsR2ABAADWI7AAAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAAPDPwDJr1iyJiYmRypUrS5s2bWTLli1XLL9s2TJp3LixKd+8eXNZuXKl2+MOh0MmTpwokZGREhISIgkJCbJ3796SVA0AAPghrwPLkiVLZNSoUZKcnCzbt2+XuLg4SUxMlBMnTngsv2nTJunfv78MGTJEvvzyS+ndu7fZdu3a5SozdepUeemll2TOnDny+eefS9WqVc05L1y4cG1XBwAAymdgmTZtmgwbNkwGDx4sTZo0MSGjSpUqMm/ePI/lZ8yYId26dZPRo0fLnXfeKVOmTJG77rpLZs6c6WpdmT59uowfP1569eolsbGxsmjRIjl69KgsX7782q8QAACUeRW9KXzx4kXZtm2bJCUluY5VqFDBdOGkpaV5fI4e1xaZ/LT1xBlG0tPTJSMjw5zDKSwszHQ16XP79etX6Jy5ublmc8rKyjJfs7OzpbzJyz3v6yqgFJXHf+PlGb/f5Ut5/P3O/v+vWRsvrmtgOXXqlFy+fFnCw8Pdjuv+N9984/E5GkY8ldfjzsedx4oqU1BKSopMnjy50PHo6GhvLgcoc8Km+7oGAG6U8vz7ffbsWdNYcd0Ciy20hSd/q01eXp788MMPUrNmTQkICPBp3VA6iVzD6aFDhyQ0NNTX1QFwHfH7Xb44HA4TVqKioq5a1qvAUqtWLQkMDJTjx4+7Hdf9iIgIj8/R41cq7/yqx3SWUP4yLVq08HjO4OBgs+VXrVo1by4FfkD/Z8b/0AD/xO93+RF2lZaVEg26DQoKkpYtW0pqaqpb64bux8f
He3yOHs9fXq1Zs8ZVvm7duia05C+jCVtnCxV1TgAAUL543SWkXTEDBw6UVq1aSevWrc0Mn5ycHDNrSA0YMEBuvfVWM85EjRgxQjp27Cgvvvii9OjRQxYvXixbt26VV1991TyuXTgjR46UZ599Vho2bGgCzIQJE0zzkE5/BgAA8Dqw9O3bV06ePGkWetNBsdpts2rVKteg2YMHD5qZQ07t2rWTN99800xbHjdunAklOkOoWbNmrjJjxowxoeehhx6SzMxMad++vTmnLjQHFKTdgboOUMFuQQBlH7/fKEqAozhziQAAAHyIewkBAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAALAegQUAAFiPwIIyRe8YvmfPHvnpp598XRUA18mnn37q6yqgDCCwoEw4f/68DBkyRKpUqSJNmzY1CxSqxx57TJ577jlfVw/ANejWrZvUr1/frHiuNz0EPCGwoMzcofvf//63rFu3zm0F5ISEBFmyZIlP6wbg2hw5ckSGDx8u//znP6VevXqSmJgoS5cuNS2qgBMr3aJMqFOnjgkmbdu2lZtvvtmEF/0f2759++Suu+4yN8wEUPZt375d5s+fL2+99ZbZ/+1vf2taV+Pi4nxdNfgYLSwoE/T+VbVr1y50XO9BpTfQBOAf9A8QbVHVFpdz587JvHnzpGXLltKhQwf56quvfF09+BCBBWWC3h18xYoVrn1nSJk7d67Ex8f7sGYArodLly6ZLqH77rvPtKiuXr1aZs6cKcePHzctqXqsT58+vq4mytLdmgFf+Mtf/iLdu3eXr7/+2swQmjFjhvl+06ZNsn79el9XD8A10MHz2gWkIxQefPBBmTp1qjRr1sz1eNWqVeWFF16QqKgon9YTvkULC8qE9u3by44dO0xYad68uXz88cemiygtLc00FwMou/SPj5dfflmOHj0q06dPdwsrTrVq1WL6cznHoFsAgE+7gv7whz/IhAkTpG7dur6uDixGYEGZUNQsIB3LEhwcLEFBQaVeJwDXR1hYmGlBJbDgSugSQplQrVo1qV69eqFNj4eEhJgBecnJyZKXl+frqgLwUu/evWX58uW+rgYsx6BblAkLFiyQp59+WgYNGiStW7c2x7Zs2SILFy6U8ePHm2nPOihPW1vGjRvn6+oC8ELDhg3lmWeekc8++8yMSdNBtvn96U9/8lndYA+6hFAm3Hvvvaaf+/7773c7rqth/v3vf5fU1FR544035M9//rN88803PqsnAO9dqStIu32/++67Uq0P7ERgQZmg3T47d+40f4nlt3fvXrMCpt5rKD093dxnSL8HAPgXxrCgTIiOjpbXX3+90HE9po+p06dPm3EtAAD/wxgWlAk6PkVXufzoo4/k7rvvNse2bt0qu3fvlrffftvsf/HFF9K3b18f1xRASRw+fFjef/99cyf2gjc9nDZtms/qBXvQJYQy48CBAzJnzhz59ttvzX6jRo3MuBa934inhaYAlA06Bu2Xv/yluaGpjkHT32f9fdePJ7230Nq1a31dRViAwIIyuy6LLuWtN0bTlpbLly/7ukoASkhn/umtNyZPnuy6G7uuZP3AAw9It27d5JFHHvF1FWEBxrCgTNmwYYMMHDjQ3FPkxRdflM6dO8vmzZt9XS0A10C7dgcMGGC+r1ixovz4449y0003manOzz//vK+rB0swhgXWy8jIMOuw6ABbbVnRqc25ublmoakmTZr4unoArpGuu+IctxIZGSn79+83M/7UqVOnfFw72IIWFlitZ8+eZqyKTmnWm6LpzdH0JmkA/Efbtm1l48aN5vv77rtPnnjiCbOm0u9//3vzGKAYwwKrafOwrnKpfdj512CpVKmS6eemhQUo+3RhOB08HxsbKzk5OSawbNq0yfzO6wwhvfUGQGCB1XR8inYFLVmyRO6880558MEHpV+/fqbZmMACAOUHgQVlgv7VpaFFZwXpPYR0VpD+5aVNxjqrAADg3wgsKHP27NljWl303kGZmZn
SpUsXs+AUgLJDV6XW+wQVxw8//HDD6wP7EVhQZmkrywcffGBaXQgsQNmid1ovLl3KACCwAAAA67EOCwDAGhcuXCh0L6HQ0FCf1Qf2YB0WAIDPB9UPHz7cLMevi8jp+Jb8G6AILAAAnxozZoy5weHs2bMlODhY5s6da+4rpLfgWLRoka+rB0swhgUA4FO33367CSadOnUy3T/bt2+XBg0amJmAepPTlStX+rqKsAAtLAAAn9Jpy/Xq1TPfa2BxTmNu3769ueEpoAgsAACf0rCSnp5uvm/cuLEsXbrUfK/LFlSrVs3HtYMt6BICAPjU3/72NwkMDDT3Dfvkk0/MTU/1o+nSpUtmResRI0b4uoqwAIEFAGCV77//XrZt22bGsegNEQFFlxAAwCfS0tLkww8/dDvmHHz78MMPy8yZMyU3N9dn9YNdCCwAAJ945pln5KuvvnLt/+c//5EhQ4ZIQkKCJCUlmTEsKSkpPq0j7EGXEADAJyIjI00oadWqldl/+umnZf369bJx40azv2zZMklOTpavv/7axzWFDWhhAQD4xJkzZyQ8PNy1r2Gle/furv27775bDh065KPawTYEFgCAT2hYcU5n1vsH6YJxbdu2dT1+9uxZqVSpkg9rCJsQWAAAPnHffffJ2LFj5V//+pcZs1KlShXp0KGD6/GdO3dK/fr1fVpH2IO7NQMAfGLKlCny61//Wjp27Cg33XSTLFy4UIKCglyPz5s3T7p27erTOsIeDLoFAPhUVlaWCSy6eFx+ukS/Hs8fYlB+EVgAAID1GMMCAACsR2ABAADWI7AAAADrEVgAAID1CCwAAMB6BBYAAGA9AgsAABDb/T/y7u+e0AuRdgAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.isnull().sum().plot(kind='bar')\n", + "plt.title(\"Missing Values Count\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "04b33922", + "metadata": {}, + "source": [ + "## Methods" + ] + }, + { + "cell_type": "markdown", + "id": "c9fb73ca", + "metadata": {}, + "source": [ + "## 🔹 Mean Imputation\n", + "\n", + "Each missing value is replaced with the mean of the observed (non-missing) values in the same column." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0e2077d4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeSalary
025.050000.0
130.060000.0
232.555000.0
335.060000.0
440.065000.0
532.570000.0
\n", + "
" + ], + "text/plain": [ + " Age Salary\n", + "0 25.0 50000.0\n", + "1 30.0 60000.0\n", + "2 32.5 55000.0\n", + "3 35.0 60000.0\n", + "4 40.0 65000.0\n", + "5 32.5 70000.0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_imputer = SimpleImputer(strategy=\"mean\")\n", + "df_mean = pd.DataFrame(mean_imputer.fit_transform(df), columns=df.columns)\n", + "df_mean" + ] + }, + { + "cell_type": "markdown", + "id": "413b5891", + "metadata": {}, + "source": [ + "## 🔹 Median Imputation\n", + "\n", + "Each missing value is replaced with the median of the observed (non-missing) values in the same column. In this small dataset the observed values are evenly spaced, so each column's median equals its mean (32.5 for `Age`, 60000 for `Salary`) and the result below is identical to mean imputation; the two methods differ when a column is skewed or contains outliers." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a7f0147e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeSalary
025.050000.0
130.060000.0
232.555000.0
335.060000.0
440.065000.0
532.570000.0
\n", + "
" + ], + "text/plain": [ + " Age Salary\n", + "0 25.0 50000.0\n", + "1 30.0 60000.0\n", + "2 32.5 55000.0\n", + "3 35.0 60000.0\n", + "4 40.0 65000.0\n", + "5 32.5 70000.0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "median_imputer = SimpleImputer(strategy=\"median\")\n", + "df_median = pd.DataFrame(median_imputer.fit_transform(df), columns=df.columns)\n", + "df_median" + ] + }, + { + "cell_type": "markdown", + "id": "1d486e5f", + "metadata": {}, + "source": [ + "## 🔹 KNN Imputation\n", + "\n", + "KNN imputation fills each missing value with the average of that feature over the k most similar rows (here `n_neighbors=2`), where similarity is measured on the features that both rows have observed. Because the distance is scale-sensitive, features on very different scales (such as `Age` and `Salary`) are usually standardized before applying KNN imputation to real data." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cdea2819", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeSalary
025.050000.0
130.060000.0
227.555000.0
335.062500.0
440.065000.0
535.070000.0
\n", + "
" + ], + "text/plain": [ + " Age Salary\n", + "0 25.0 50000.0\n", + "1 30.0 60000.0\n", + "2 27.5 55000.0\n", + "3 35.0 62500.0\n", + "4 40.0 65000.0\n", + "5 35.0 70000.0" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_imputer = KNNImputer(n_neighbors=2)\n", + "df_knn = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns)\n", + "df_knn" + ] + }, + { + "cell_type": "markdown", + "id": "35ab2260", + "metadata": {}, + "source": [ + "## Comparison of Methods\n", + "\n", + "| Method | Pros | Cons |\n", + "|--------|------|------|\n", + "| Mean | Simple, fast | Sensitive to outliers; shrinks the column's variance |\n", + "| Median | Robust to outliers | Ignores relationships between features; shrinks the column's variance |\n", + "| KNN | Uses relationships between features; often more accurate | Slower on large datasets; sensitive to feature scaling and the choice of k |" + ] + }, + { + "cell_type": "markdown", + "id": "403b21fa", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "- Mean and median imputation are simple and quick methods\n", + "- KNN imputation can produce more realistic values when features are related, but it is slower and sensitive to feature scaling\n", + "- The choice of method depends on the dataset and problem" + ] + }, + { + "cell_type": "markdown", + "id": "1384e70a", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}