{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "name": "spend_health.ipynb",
   "provenance": [],
   "authorship_tag": "ABX9TyNH4+1exhSCGhK18PO1txk5",
   "include_colab_link": true
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github",
    "colab_type": "text"
   },
   "source": [
    "\"Open"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1kpnj75N98Jy"
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Wide-form life expectancy data: an `iso` column plus one column per year\n",
    "life_expec_wide = pd.read_csv('life_expec.csv')"
   ],
   "metadata": {
    "id": "gpTfu13C9_lx"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Turn the wide-form data into long-form: one row per (iso, year) pair.\n",
    "# A new name is used so re-running this cell never melts an already-melted frame.\n",
    "life_expec_long = life_expec_wide.melt(id_vars=[\"iso\"],\n",
    "                                       var_name=\"year\", value_name=\"value\")"
   ],
   "metadata": {
    "id": "opbeWbpf-L6u"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# The melted column headers are year strings; parse them so they can be filtered as dates\n",
    "life_expec_long['year'] = pd.to_datetime(life_expec_long['year'], format='%Y')"
   ],
   "metadata": {
    "id": "H0XhTpAPT33c"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Filter the data to one year\n",
    "life_expec_2019 = life_expec_long[life_expec_long['year'] == '2019-01-01']"
   ],
   "metadata": {
    "id": "sy6ZsMnKUjz-"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "gov_spend = pd.read_csv('gov_spend.csv')"
   ],
   "metadata": {
    "id": "nFzug5Qb_Z7F"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Merge the two dataframes. With no `on=`, pandas joins on every column name the\n",
    "# frames share - presumably just `iso`; TODO confirm against gov_spend.csv.\n",
    "df = pd.merge(life_expec_2019, gov_spend)"
   ],
   "metadata": {
    "id": "hIcuiRHxAvPu"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "import requests\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns; sns.set()\n",
    "import numpy as np\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "from sklearn.pipeline import 
make_pipeline\n",
    "from sklearn.metrics import r2_score"
   ],
   "metadata": {
    "id": "ocXQtRgiUydS"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Remove n/a values so regression is possible. Rows are dropped once so that\n",
    "# x and y are taken from the same filtered frame.\n",
    "complete = df.dropna()\n",
    "x = complete['average'].to_numpy()\n",
    "y = complete['value'].to_numpy()\n",
    "\n",
    "# scikit-learn wants a 2-D feature matrix. Add the axis on the NumPy array:\n",
    "# `Series[:, np.newaxis]` raised a FutureWarning in pandas 1.x and is no longer\n",
    "# supported by current pandas.\n",
    "x_2d = x[:, np.newaxis]\n",
    "\n",
    "# Degree-2 polynomial regression of life expectancy on government spending\n",
    "poly_model = make_pipeline(PolynomialFeatures(2),\n",
    "                           LinearRegression())\n",
    "poly_model.fit(x_2d, y)\n",
    "\n",
    "# Evaluate the fitted curve on an even grid spanning the observed spending range\n",
    "xfit = np.linspace(x.min(), x.max(), 1000)\n",
    "yfit = poly_model.predict(xfit[:, np.newaxis])\n",
    "\n",
    "plt.rcParams['axes.facecolor'] = '#DFDEDE'\n",
    "plt.scatter(x, y, color='#0000FF')\n",
    "plt.plot(xfit, yfit, color='crimson')\n",
    "plt.xlabel('Government spending as % of GDP', x=0.5, y=0.1)\n",
    "plt.ylabel('Life expectancy at birth')\n",
    "plt.title('How does public spending affect health outcomes', x=0.5, y=1, fontweight=\"bold\")\n",
    "plt.suptitle('Sources: World Bank and ICTD', x=0.5, y=0.89, fontsize=9)\n",
    "plt.savefig('spend_health.png', dpi=300, bbox_inches=\"tight\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 436
    },
    "id": "mK7p9OmPVBdo",
    "outputId": "a7f7b550-ab9e-46c2-d359-1e3a55019367"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}