{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "spend_health.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyNH4+1exhSCGhK18PO1txk5",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1kpnj75N98Jy"
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"source": [
"df1 = pd.read_csv('life_expec.csv')"
],
"metadata": {
"id": "gpTfu13C9_lx"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#Turn the wide-form data into long-form\n",
"df1 = df1.melt(id_vars=[\"iso\"], \n",
" var_name=\"year\", value_name=\"value\")"
],
"metadata": {
"id": "opbeWbpf-L6u"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df1['year'] = pd.to_datetime(df1['year'], format='%Y')"
],
"metadata": {
"id": "H0XhTpAPT33c"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#Filter the data to one year\n",
"df1 =df1[(df1['year'] == '2019-01-01')]"
],
"metadata": {
"id": "sy6ZsMnKUjz-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df2 = pd.read_csv('gov_spend.csv')"
],
"metadata": {
"id": "nFzug5Qb_Z7F"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#merge the two dataframes\n",
"df = pd.merge(df1,df2)"
],
"metadata": {
"id": "hIcuiRHxAvPu"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import requests\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns; sns.set()\n",
"import numpy as np\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.metrics import r2_score"
],
"metadata": {
"id": "ocXQtRgiUydS"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# remove n/a values so regression is possible\n",
"x = df.dropna()['average']\n",
"y = df.dropna()['value']\n",
"\n",
"poly_model = make_pipeline(PolynomialFeatures(2),\n",
" LinearRegression())\n",
"poly_model.fit(x[:, np.newaxis], y)\n",
"xfit = np.linspace(min(x), max(x), 1000)\n",
"yfit = poly_model.predict(xfit[:, np.newaxis])\n",
"\n",
"\n",
"plt.rcParams['axes.facecolor'] = '#DFDEDE'\n",
"plt.scatter(x, y, color ='#0000FF')\n",
"plt.plot(xfit, yfit,color='crimson');\n",
"plt.xlabel('Government spending as % of GDP', x=0.5, y=0.1)\n",
"plt.ylabel('Life expectancy at birth')\n",
"plt.title('How does public spending affect health outcomes', x=0.5, y=1, fontweight = \"bold\")\n",
"plt.suptitle('Sources: World Bank and ICTD', x=0.5, y=0.89, fontsize=9)\n",
"plt.savefig('spend_health.png', dpi=300, bbox_inches = \"tight\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 436
},
"id": "mK7p9OmPVBdo",
"outputId": "a7f7b550-ab9e-46c2-d359-1e3a55019367"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead.\n",
" \n"
]
},
{
"output_type": "error",
"ename": "ValueError",
"evalue": "ignored",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mxfit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinspace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0myfit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpoly_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxfit\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mr2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr2_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0myfit\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_regression.py\u001b[0m in \u001b[0;36mr2_score\u001b[0;34m(y_true, y_pred, sample_weight, multioutput)\u001b[0m\n\u001b[1;32m 773\u001b[0m \"\"\"\n\u001b[1;32m 774\u001b[0m y_type, y_true, y_pred, multioutput = _check_reg_targets(\n\u001b[0;32m--> 775\u001b[0;31m \u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmultioutput\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 776\u001b[0m )\n\u001b[1;32m 777\u001b[0m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_regression.py\u001b[0m in \u001b[0;36m_check_reg_targets\u001b[0;34m(y_true, y_pred, multioutput, dtype)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdtype\u001b[0m \u001b[0margument\u001b[0m \u001b[0mpassed\u001b[0m \u001b[0mto\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \"\"\"\n\u001b[0;32m---> 89\u001b[0;31m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 90\u001b[0m \u001b[0my_true\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_consistent_length\u001b[0;34m(*arrays)\u001b[0m\n\u001b[1;32m 331\u001b[0m raise ValueError(\n\u001b[1;32m 332\u001b[0m \u001b[0;34m\"Found input variables with inconsistent numbers of samples: %r\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 333\u001b[0;31m \u001b[0;34m%\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ml\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ml\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mlengths\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 334\u001b[0m )\n\u001b[1;32m 335\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [167, 1000]"
]
}
]
}
]
}