SQLShark · vmisirlis010 · Mar 11, 2023
diff --git a/MLFLow/Sklearn-Regression-MLFlow.ipynb b/MLFLow/Sklearn-Regression-MLFlow.ipynb
@@ -1 +1 @@
-{"cells":[{"cell_type":"markdown","source":["# Data Science with Python \n## Regression \n\nIn this demo we are going to look at a regression algorthm. \nRegression is typicaly a supervised Machine Learning technique (refer back to the slides for a definition). \n\nIn this demo, we will explore Regression with linear regression. We will use a series of modules:\n\n**matplotlib** - This module will allow us to visualise the output of our model. We will want to examine the data in 2 dimensions, we could do more but that will do for now. Interested in more dimensions? Ask me about PCA.   \n**numpy** - Statistical package for working with numbers.  \n**sklearn** -sklearn is one of the most used modules for general machine learning. Shallow learning. We can talk more about deep learning another time.      \n\nOk. Lets begin by looking at importing those modules.\n\n[Original example](http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html)"],"metadata":{"deletable":true,"editable":true}},{"cell_type":"code","source":["import matplotlib.pyplot as plt\nimport numpy as np\nfrom sklearn import datasets, linear_model\nfrom sklearn.metrics import mean_squared_error, r2_score\nimport pandas as pd\nimport os\nimport mlflow\nfrom math import sqrt\n\n# Set the experiment name to an experiment in the shared experiments folder\n\nmlflow.set_experiment(\"/mlflow/regressionDiabetes\")\n"],"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\">INFO: &apos;/mlflow/regressionDiabetes&apos; does not exist. Creating a new experiment\n</div>"]}}],"execution_count":2},{"cell_type":"markdown","source":["Load a sample dataset. This will use the diabete dataset."],"metadata":{"deletable":true,"editable":true}},{"cell_type":"code","source":["# Load the diabetes dataset\ndiabetes = datasets.load_diabetes()\n"],"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":4},{"cell_type":"markdown","source":["For ease, lets load this in to a Padnas DataFrame and look at the top few rows."],"metadata":{"deletable":true,"editable":true}},{"cell_type":"code","source":["diabetespd = pd.DataFrame(data=diabetes.data)\ndiabetespd.to_csv('diabetes.txt', encoding='utf-8', index=False)\n"],"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":6},{"cell_type":"code","source":["diabetespd.head()"],"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"><span class=\"ansired\">Out[</span><span class=\"ansired\">4</span><span class=\"ansired\">]: </span>\n          0         1         2         3         4         5         6  \\\n0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   \n1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   \n2  0.085299  0.050680  0.044451 -0.005671 -0.045599 -0.034194 -0.032356   \n3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   \n4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   \n\n          7         8         9  \n0 -0.002592  0.019908 -0.017646  \n1 -0.039493 -0.068330 -0.092204  \n2 -0.002592  0.002864 -0.025930  \n3  0.034309  0.022692 -0.009362  \n4 -0.002592 -0.031991 -0.046641  \n</div>"]}}],"execution_count":7},{"cell_type":"code","source":["# Use only one feature\ndiabetes_X = diabetes.data[:, np.newaxis, 2]"],"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":8},{"cell_type":"code","source":["diabetes_X[0:5]"],"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"><span class=\"ansired\">Out[</span><span class=\"ansired\">6</span><span class=\"ansired\">]: </span>\narray([[ 0.06169621],\n       [-0.05147406],\n       [ 0.04445121],\n       [-0.01159501],\n       [-0.03638469]])\n</div>"]}}],"execution_count":9},{"cell_type":"markdown","source":["Lets split the data into training/testing sets.\nWe will do an 80/20 split."],"metadata":{"deletable":true,"editable":true}},{"cell_type":"code","source":["with mlflow.start_run():\n  # 1st idea\n  diabetes_X = diabetes.data[:, np.newaxis, 2]\n  \n  # 2nd idea\n  #diabetes_X = diabetes.data\n  \n  diabetes_X_train = diabetes_X[:-20]\n  diabetes_X_test = diabetes_X[-20:]\n\n  diabetes_y_train = diabetes.target[:-20]\n  diabetes_y_test = diabetes.target[-20:]\n\n  regr = linear_model\n\n  #regr = linear_model.Lasso(alpha=0.1)\n  #mlflow.log_param(\"alpha\", 0.1)\n  \n  #regr = linear_model.LassoLars(alpha=0.1)\n  #mlflow.log_param(\"alpha\", 0.1)\n\n  #regr = linear_model.BayesianRidge()   \n\n  regr.fit(diabetes_X_train, diabetes_y_train)\n\n  diabetes_y_pred = regr.predict(diabetes_X_test)\n\n  mlflow.log_metric(\"mse\", mean_squared_error(diabetes_y_test, diabetes_y_pred))\n  mlflow.log_metric(\"rmse\", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred)))\n  mlflow.log_metric(\"r2\", r2_score(diabetes_y_test, diabetes_y_pred))\n  \n  mlflow.log_artifact(\"diabetes.txt\")\n"],"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n  .ansiout {\n    display: block;\n    unicode-bidi: embed;\n    white-space: pre-wrap;\n    word-wrap: break-word;\n    word-break: break-all;\n    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n    font-size: 13px;\n    color: #555;\n    margin-left: 4px;\n    line-height: 19px;\n  }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":11}],"metadata":{"language_info":{"mimetype":"text/x-python","name":"python","pygments_lexer":"ipython3","codemirror_mode":{"name":"ipython","version":3},"version":"3.5.2","nbconvert_exporter":"python","file_extension":".py"},"name":"Sklearn-Regression-MLFlow","notebookId":1962496471401116,"kernelspec":{"display_name":"Python [conda root]","language":"python","name":"conda-root-py"},"anaconda-cloud":{}},"nbformat":4,"nbformat_minor":0}
+{"cells":[{"cell_type":"markdown","metadata":{"deletable":true,"editable":true},"source":["# Data Science with Python \n","## Regression \n","\n","In this demo we are going to look at a regression algorthm. \n","Regression is typicaly a supervised Machine Learning technique (refer back to the slides for a definition). \n","\n","In this demo, we will explore Regression with linear regression. We will use a series of modules:\n","\n","**matplotlib** - This module will allow us to visualise the output of our model. We will want to examine the data in 2 dimensions, we could do more but that will do for now. Interested in more dimensions? Ask me about PCA.   \n","**numpy** - Statistical package for working with numbers.  \n","**sklearn** -sklearn is one of the most used modules for general machine learning. Shallow learning. We can talk more about deep learning another time.      \n","\n","Ok. Lets begin by looking at importing those modules.\n","\n","[Original example](http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html)"]},{"cell_type":"code","execution_count":2,"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n","  .ansiout {\n","    display: block;\n","    unicode-bidi: embed;\n","    white-space: pre-wrap;\n","    word-wrap: break-word;\n","    word-break: break-all;\n","    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n","    font-size: 13px;\n","    color: #555;\n","    margin-left: 4px;\n","    line-height: 19px;\n","  }\n","</style>\n","<div class=\"ansiout\">INFO: &apos;/mlflow/regressionDiabetes&apos; does not exist. Creating a new experiment\n","</div>"]},"metadata":{},"output_type":"display_data"}],"source":["import matplotlib.pyplot as plt\n","from sklearn.linear_model import LinearRegression #added this lib\n","import numpy as np\n","from sklearn import datasets, linear_model\n","from sklearn.metrics import mean_squared_error, r2_score\n","import pandas as pd\n","import os\n","import mlflow\n","from math import sqrt\n","\n","# Set the experiment name to an experiment in the shared experiments folder\n","\n","mlflow.set_experiment(\"/mlflow/regressionDiabetes\")\n"]},{"cell_type":"markdown","metadata":{"deletable":true,"editable":true},"source":["Load a sample dataset. This will use the diabete dataset."]},{"cell_type":"code","execution_count":4,"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n","  .ansiout {\n","    display: block;\n","    unicode-bidi: embed;\n","    white-space: pre-wrap;\n","    word-wrap: break-word;\n","    word-break: break-all;\n","    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n","    font-size: 13px;\n","    color: #555;\n","    margin-left: 4px;\n","    line-height: 19px;\n","  }\n","</style>\n","<div class=\"ansiout\"></div>"]},"metadata":{},"output_type":"display_data"}],"source":["# Load the diabetes dataset\n","diabetes = datasets.load_diabetes()\n"]},{"cell_type":"markdown","metadata":{"deletable":true,"editable":true},"source":["For ease, lets load this in to a Padnas DataFrame and look at the top few rows."]},{"cell_type":"code","execution_count":6,"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n","  .ansiout {\n","    display: block;\n","    unicode-bidi: embed;\n","    white-space: pre-wrap;\n","    word-wrap: break-word;\n","    word-break: break-all;\n","    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n","    font-size: 13px;\n","    color: #555;\n","    margin-left: 4px;\n","    line-height: 19px;\n","  }\n","</style>\n","<div class=\"ansiout\"></div>"]},"metadata":{},"output_type":"display_data"}],"source":["diabetespd = pd.DataFrame(data=diabetes.data)\n","diabetespd.to_csv('diabetes.txt', encoding='utf-8', index=False)\n"]},{"cell_type":"code","execution_count":7,"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n","  .ansiout {\n","    display: block;\n","    unicode-bidi: embed;\n","    white-space: pre-wrap;\n","    word-wrap: break-word;\n","    word-break: break-all;\n","    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n","    font-size: 13px;\n","    color: #555;\n","    margin-left: 4px;\n","    line-height: 19px;\n","  }\n","</style>\n","<div class=\"ansiout\"><span class=\"ansired\">Out[</span><span class=\"ansired\">4</span><span class=\"ansired\">]: </span>\n","          0         1         2         3         4         5         6  \\\n","0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   \n","1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   \n","2  0.085299  0.050680  0.044451 -0.005671 -0.045599 -0.034194 -0.032356   \n","3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   \n","4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   \n","\n","          7         8         9  \n","0 -0.002592  0.019908 -0.017646  \n","1 -0.039493 -0.068330 -0.092204  \n","2 -0.002592  0.002864 -0.025930  \n","3  0.034309  0.022692 -0.009362  \n","4 -0.002592 -0.031991 -0.046641  \n","</div>"]},"metadata":{},"output_type":"display_data"}],"source":["diabetespd.head()"]},{"cell_type":"code","execution_count":8,"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n","  .ansiout {\n","    display: block;\n","    unicode-bidi: embed;\n","    white-space: pre-wrap;\n","    word-wrap: break-word;\n","    word-break: break-all;\n","    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n","    font-size: 13px;\n","    color: #555;\n","    margin-left: 4px;\n","    line-height: 19px;\n","  }\n","</style>\n","<div class=\"ansiout\"></div>"]},"metadata":{},"output_type":"display_data"}],"source":["# Use only one feature\n","diabetes_X = diabetes.data[:, np.newaxis, 2]"]},{"cell_type":"code","execution_count":9,"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n","  .ansiout {\n","    display: block;\n","    unicode-bidi: embed;\n","    white-space: pre-wrap;\n","    word-wrap: break-word;\n","    word-break: break-all;\n","    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n","    font-size: 13px;\n","    color: #555;\n","    margin-left: 4px;\n","    line-height: 19px;\n","  }\n","</style>\n","<div class=\"ansiout\"><span class=\"ansired\">Out[</span><span class=\"ansired\">6</span><span class=\"ansired\">]: </span>\n","array([[ 0.06169621],\n","       [-0.05147406],\n","       [ 0.04445121],\n","       [-0.01159501],\n","       [-0.03638469]])\n","</div>"]},"metadata":{},"output_type":"display_data"}],"source":["diabetes_X[0:5]"]},{"cell_type":"markdown","metadata":{"deletable":true,"editable":true},"source":["Lets split the data into training/testing sets.\n","We will do an 80/20 split."]},{"cell_type":"code","execution_count":11,"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n","  .ansiout {\n","    display: block;\n","    unicode-bidi: embed;\n","    white-space: pre-wrap;\n","    word-wrap: break-word;\n","    word-break: break-all;\n","    font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n","    font-size: 13px;\n","    color: #555;\n","    margin-left: 4px;\n","    line-height: 19px;\n","  }\n","</style>\n","<div class=\"ansiout\"></div>"]},"metadata":{},"output_type":"display_data"}],"source":["with mlflow.start_run():\n","  # 1st idea\n","  diabetes_X = diabetes.data[:, np.newaxis, 2]\n","  \n","  # 2nd idea\n","  #diabetes_X = diabetes.data\n","  \n","  diabetes_X_train = diabetes_X[:-20]\n","  diabetes_X_test = diabetes_X[-20:]\n","\n","  diabetes_y_train = diabetes.target[:-20]\n","  diabetes_y_test = diabetes.target[-20:]\n","\n","  regr = LinearRegression()\n","  #regr = linear_model\n","\n","  #regr = linear_model.Lasso(alpha=0.1)\n","  #mlflow.log_param(\"alpha\", 0.1)\n","  \n","  #regr = linear_model.LassoLars(alpha=0.1)\n","  #mlflow.log_param(\"alpha\", 0.1)\n","\n","  #regr = linear_model.BayesianRidge()   \n","\n","  regr.fit(diabetes_X_train, diabetes_y_train)\n","\n","  diabetes_y_pred = regr.predict(diabetes_X_test)\n","\n","  mlflow.log_metric(\"mse\", mean_squared_error(diabetes_y_test, diabetes_y_pred))\n","  mlflow.log_metric(\"rmse\", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred)))\n","  mlflow.log_metric(\"r2\", r2_score(diabetes_y_test, diabetes_y_pred))\n","  \n","  mlflow.log_artifact(\"diabetes.txt\")\n"]}],"metadata":{"anaconda-cloud":{},"kernelspec":{"display_name":"Python [conda root]","language":"python","name":"conda-root-py"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.5.2"},"name":"Sklearn-Regression-MLFlow","notebookId":1962496471401116},"nbformat":4,"nbformat_minor":0}
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"cells":[{"cell_type":"markdown","source":["# Data Science with Python \n## Regression \n\nIn this demo we are going to look at a regression algorthm. \nRegression is typicaly a supervised Machine Learning technique (refer back to the slides for a definition). \n\nIn this demo, we will explore Regression with linear regression. We will use a series of modules:\n\nmatplotlib - This module will allow us to visualise the output of our model. We will want to examine the data in 2 dimensions, we could do more but that will do for now. Interested in more dimensions? Ask me about PCA. \nnumpy - Statistical package for working with numbers. \nsklearn -sklearn is one of the most used modules for general machine learning. Shallow learning. We can talk more about deep learning another time. \n\nOk. Lets begin by looking at importing those modules.\n\n[Original example](http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html)"],"metadata":{"deletable":true,"editable":true}},{"cell_type":"code","source":["import matplotlib.pyplot as plt\nimport numpy as np\nfrom sklearn import datasets, linear_model\nfrom sklearn.metrics import mean_squared_error, r2_score\nimport pandas as pd\nimport os\nimport mlflow\nfrom math import sqrt\n\n# Set the experiment name to an experiment in the shared experiments folder\n\nmlflow.set_experiment(\"/mlflow/regressionDiabetes\")\n"],"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>\n<div class=\"ansiout\">INFO: '/mlflow/regressionDiabetes' does not exist. Creating a new experiment\n</div>"]}}],"execution_count":2},{"cell_type":"markdown","source":["Load a sample dataset. This will use the diabete dataset."],"metadata":{"deletable":true,"editable":true}},{"cell_type":"code","source":["# Load the diabetes dataset\ndiabetes = datasets.load_diabetes()\n"],"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":4},{"cell_type":"markdown","source":["For ease, lets load this in to a Padnas DataFrame and look at the top few rows."],"metadata":{"deletable":true,"editable":true}},{"cell_type":"code","source":["diabetespd = pd.DataFrame(data=diabetes.data)\ndiabetespd.to_csv('diabetes.txt', encoding='utf-8', index=False)\n"],"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":6},{"cell_type":"code","source":["diabetespd.head()"],"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>\n<div class=\"ansiout\"><span class=\"ansired\">Out[</span><span class=\"ansired\">4</span><span class=\"ansired\">]: </span>\n 0 1 2 3 4 5 6 \\\n0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 \n1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 \n2 0.085299 0.050680 0.044451 -0.005671 -0.045599 -0.034194 -0.032356 \n3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 \n4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 \n\n 7 8 9 \n0 -0.002592 0.019908 -0.017646 \n1 -0.039493 -0.068330 -0.092204 \n2 -0.002592 0.002864 -0.025930 \n3 0.034309 0.022692 -0.009362 \n4 -0.002592 -0.031991 -0.046641 \n</div>"]}}],"execution_count":7},{"cell_type":"code","source":["# Use only one feature\ndiabetes_X = diabetes.data[:, np.newaxis, 2]"],"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":8},{"cell_type":"code","source":["diabetes_X[0:5]"],"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>\n<div class=\"ansiout\"><span class=\"ansired\">Out[</span><span class=\"ansired\">6</span><span class=\"ansired\">]: </span>\narray([[ 0.06169621],\n [-0.05147406],\n [ 0.04445121],\n [-0.01159501],\n [-0.03638469]])\n</div>"]}}],"execution_count":9},{"cell_type":"markdown","source":["Lets split the data into training/testing sets.\nWe will do an 80/20 split."],"metadata":{"deletable":true,"editable":true}},{"cell_type":"code","source":["with mlflow.start_run():\n # 1st idea\n diabetes_X = diabetes.data[:, np.newaxis, 2]\n \n # 2nd idea\n #diabetes_X = diabetes.data\n \n diabetes_X_train = diabetes_X[:-20]\n diabetes_X_test = diabetes_X[-20:]\n\n diabetes_y_train = diabetes.target[:-20]\n diabetes_y_test = diabetes.target[-20:]\n\n regr = linear_model\n\n #regr = linear_model.Lasso(alpha=0.1)\n #mlflow.log_param(\"alpha\", 0.1)\n \n #regr = linear_model.LassoLars(alpha=0.1)\n #mlflow.log_param(\"alpha\", 0.1)\n\n #regr = linear_model.BayesianRidge() \n\n regr.fit(diabetes_X_train, diabetes_y_train)\n\n diabetes_y_pred = regr.predict(diabetes_X_test)\n\n mlflow.log_metric(\"mse\", mean_squared_error(diabetes_y_test, diabetes_y_pred))\n mlflow.log_metric(\"rmse\", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred)))\n mlflow.log_metric(\"r2\", r2_score(diabetes_y_test, diabetes_y_pred))\n \n mlflow.log_artifact(\"diabetes.txt\")\n"],"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>\n<div class=\"ansiout\"></div>"]}}],"execution_count":11}],"metadata":{"language_info":{"mimetype":"text/x-python","name":"python","pygments_lexer":"ipython3","codemirror_mode":{"name":"ipython","version":3},"version":"3.5.2","nbconvert_exporter":"python","file_extension":".py"},"name":"Sklearn-Regression-MLFlow","notebookId":1962496471401116,"kernelspec":{"display_name":"Python [conda root]","language":"python","name":"conda-root-py"},"anaconda-cloud":{}},"nbformat":4,"nbformat_minor":0}
		{"cells":[{"cell_type":"markdown","metadata":{"deletable":true,"editable":true},"source":["# Data Science with Python \n","## Regression \n","\n","In this demo we are going to look at a regression algorthm. \n","Regression is typicaly a supervised Machine Learning technique (refer back to the slides for a definition). \n","\n","In this demo, we will explore Regression with linear regression. We will use a series of modules:\n","\n","matplotlib - This module will allow us to visualise the output of our model. We will want to examine the data in 2 dimensions, we could do more but that will do for now. Interested in more dimensions? Ask me about PCA. \n","numpy - Statistical package for working with numbers. \n","sklearn -sklearn is one of the most used modules for general machine learning. Shallow learning. We can talk more about deep learning another time. \n","\n","Ok. Lets begin by looking at importing those modules.\n","\n","[Original example](http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html)"]},{"cell_type":"code","execution_count":2,"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n"," .ansiout {\n"," display: block;\n"," unicode-bidi: embed;\n"," white-space: pre-wrap;\n"," word-wrap: break-word;\n"," word-break: break-all;\n"," font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n"," font-size: 13px;\n"," color: #555;\n"," margin-left: 4px;\n"," line-height: 19px;\n"," }\n","</style>\n","<div class=\"ansiout\">INFO: '/mlflow/regressionDiabetes' does not exist. Creating a new experiment\n","</div>"]},"metadata":{},"output_type":"display_data"}],"source":["import matplotlib.pyplot as plt\n","from sklearn.linear_model import LinearRegression #added this lib\n","import numpy as np\n","from sklearn import datasets, linear_model\n","from sklearn.metrics import mean_squared_error, r2_score\n","import pandas as pd\n","import os\n","import mlflow\n","from math import sqrt\n","\n","# Set the experiment name to an experiment in the shared experiments folder\n","\n","mlflow.set_experiment(\"/mlflow/regressionDiabetes\")\n"]},{"cell_type":"markdown","metadata":{"deletable":true,"editable":true},"source":["Load a sample dataset. This will use the diabete dataset."]},{"cell_type":"code","execution_count":4,"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n"," .ansiout {\n"," display: block;\n"," unicode-bidi: embed;\n"," white-space: pre-wrap;\n"," word-wrap: break-word;\n"," word-break: break-all;\n"," font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n"," font-size: 13px;\n"," color: #555;\n"," margin-left: 4px;\n"," line-height: 19px;\n"," }\n","</style>\n","<div class=\"ansiout\"></div>"]},"metadata":{},"output_type":"display_data"}],"source":["# Load the diabetes dataset\n","diabetes = datasets.load_diabetes()\n"]},{"cell_type":"markdown","metadata":{"deletable":true,"editable":true},"source":["For ease, lets load this in to a Padnas DataFrame and look at the top few rows."]},{"cell_type":"code","execution_count":6,"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n"," .ansiout {\n"," display: block;\n"," unicode-bidi: embed;\n"," white-space: pre-wrap;\n"," word-wrap: break-word;\n"," word-break: break-all;\n"," font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n"," font-size: 13px;\n"," color: #555;\n"," margin-left: 4px;\n"," line-height: 19px;\n"," }\n","</style>\n","<div class=\"ansiout\"></div>"]},"metadata":{},"output_type":"display_data"}],"source":["diabetespd = pd.DataFrame(data=diabetes.data)\n","diabetespd.to_csv('diabetes.txt', encoding='utf-8', index=False)\n"]},{"cell_type":"code","execution_count":7,"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n"," .ansiout {\n"," display: block;\n"," unicode-bidi: embed;\n"," white-space: pre-wrap;\n"," word-wrap: break-word;\n"," word-break: break-all;\n"," font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n"," font-size: 13px;\n"," color: #555;\n"," margin-left: 4px;\n"," line-height: 19px;\n"," }\n","</style>\n","<div class=\"ansiout\"><span class=\"ansired\">Out[</span><span class=\"ansired\">4</span><span class=\"ansired\">]: </span>\n"," 0 1 2 3 4 5 6 \\\n","0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 \n","1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 \n","2 0.085299 0.050680 0.044451 -0.005671 -0.045599 -0.034194 -0.032356 \n","3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 \n","4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 \n","\n"," 7 8 9 \n","0 -0.002592 0.019908 -0.017646 \n","1 -0.039493 -0.068330 -0.092204 \n","2 -0.002592 0.002864 -0.025930 \n","3 0.034309 0.022692 -0.009362 \n","4 -0.002592 -0.031991 -0.046641 \n","</div>"]},"metadata":{},"output_type":"display_data"}],"source":["diabetespd.head()"]},{"cell_type":"code","execution_count":8,"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n"," .ansiout {\n"," display: block;\n"," unicode-bidi: embed;\n"," white-space: pre-wrap;\n"," word-wrap: break-word;\n"," word-break: break-all;\n"," font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n"," font-size: 13px;\n"," color: #555;\n"," margin-left: 4px;\n"," line-height: 19px;\n"," }\n","</style>\n","<div class=\"ansiout\"></div>"]},"metadata":{},"output_type":"display_data"}],"source":["# Use only one feature\n","diabetes_X = diabetes.data[:, np.newaxis, 2]"]},{"cell_type":"code","execution_count":9,"metadata":{"collapsed":false,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n"," .ansiout {\n"," display: block;\n"," unicode-bidi: embed;\n"," white-space: pre-wrap;\n"," word-wrap: break-word;\n"," word-break: break-all;\n"," font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n"," font-size: 13px;\n"," color: #555;\n"," margin-left: 4px;\n"," line-height: 19px;\n"," }\n","</style>\n","<div class=\"ansiout\"><span class=\"ansired\">Out[</span><span class=\"ansired\">6</span><span class=\"ansired\">]: </span>\n","array([[ 0.06169621],\n"," [-0.05147406],\n"," [ 0.04445121],\n"," [-0.01159501],\n"," [-0.03638469]])\n","</div>"]},"metadata":{},"output_type":"display_data"}],"source":["diabetes_X[0:5]"]},{"cell_type":"markdown","metadata":{"deletable":true,"editable":true},"source":["Lets split the data into training/testing sets.\n","We will do an 80/20 split."]},{"cell_type":"code","execution_count":11,"metadata":{"collapsed":true,"deletable":true,"editable":true},"outputs":[{"data":{"text/html":["<style scoped>\n"," .ansiout {\n"," display: block;\n"," unicode-bidi: embed;\n"," white-space: pre-wrap;\n"," word-wrap: break-word;\n"," word-break: break-all;\n"," font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n"," font-size: 13px;\n"," color: #555;\n"," margin-left: 4px;\n"," line-height: 19px;\n"," }\n","</style>\n","<div class=\"ansiout\"></div>"]},"metadata":{},"output_type":"display_data"}],"source":["with mlflow.start_run():\n"," # 1st idea\n"," diabetes_X = diabetes.data[:, np.newaxis, 2]\n"," \n"," # 2nd idea\n"," #diabetes_X = diabetes.data\n"," \n"," diabetes_X_train = diabetes_X[:-20]\n"," diabetes_X_test = diabetes_X[-20:]\n","\n"," diabetes_y_train = diabetes.target[:-20]\n"," diabetes_y_test = diabetes.target[-20:]\n","\n"," regr = LinearRegression()\n"," #regr = linear_model\n","\n"," #regr = linear_model.Lasso(alpha=0.1)\n"," #mlflow.log_param(\"alpha\", 0.1)\n"," \n"," #regr = linear_model.LassoLars(alpha=0.1)\n"," #mlflow.log_param(\"alpha\", 0.1)\n","\n"," #regr = linear_model.BayesianRidge() \n","\n"," regr.fit(diabetes_X_train, diabetes_y_train)\n","\n"," diabetes_y_pred = regr.predict(diabetes_X_test)\n","\n"," mlflow.log_metric(\"mse\", mean_squared_error(diabetes_y_test, diabetes_y_pred))\n"," mlflow.log_metric(\"rmse\", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred)))\n"," mlflow.log_metric(\"r2\", r2_score(diabetes_y_test, diabetes_y_pred))\n"," \n"," mlflow.log_artifact(\"diabetes.txt\")\n"]}],"metadata":{"anaconda-cloud":{},"kernelspec":{"display_name":"Python [conda root]","language":"python","name":"conda-root-py"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.5.2"},"name":"Sklearn-Regression-MLFlow","notebookId":1962496471401116},"nbformat":4,"nbformat_minor":0}