|
@@ -16,7 +16,18 @@
|
|
"outputs_hidden": false
|
|
"outputs_hidden": false
|
|
}
|
|
}
|
|
},
|
|
},
|
|
- "outputs": [],
|
|
|
|
|
|
+ "outputs": [
|
|
|
|
+ {
|
|
|
|
+ "data": {
|
|
|
|
+ "text/plain": [
|
|
|
|
+ "'Connected: None@factbook.db'"
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ "execution_count": 1,
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "output_type": "execute_result"
|
|
|
|
+ }
|
|
|
|
+ ],
|
|
"source": [
|
|
"source": [
|
|
"%%capture\n",
|
|
"%%capture\n",
|
|
"%load_ext sql\n",
|
|
"%load_ext sql\n",
|
|
@@ -30,6 +41,13 @@
|
|
"## Overview of the Data"
|
|
"## Overview of the Data"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "source": [
|
|
|
|
+ "We'll begin by getting a sense of what the data looks like."
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"execution_count": 2,
|
|
@@ -44,7 +62,6 @@
|
|
"name": "stdout",
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"text": [
|
|
- " * sqlite:///factbook.db\n",
|
|
|
|
"Done.\n"
|
|
"Done.\n"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
@@ -147,7 +164,28 @@
|
|
],
|
|
],
|
|
"source": [
|
|
"source": [
|
|
"%%sql\n",
|
|
"%%sql\n",
|
|
- "SELECT * FROM facts limit 5;"
|
|
|
|
|
|
+ "SELECT *\n",
|
|
|
|
+ " FROM facts\n",
|
|
|
|
+ " LIMIT 5;"
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "source": [
|
|
|
|
+ "Here are the descriptions for some of the columns:\n",
|
|
|
|
+ "\n",
|
|
|
|
+ "* `name` - The name of the country.\n",
|
|
|
|
+ "* `area` - The total land and sea area of the country.\n",
|
|
|
|
+ "* `population` - The country's population.\n",
|
|
|
|
+ "* `population_growth` - The country's population growth as a percentage.\n",
|
|
|
|
+ "* `birth_rate` - The country's birth rate, or the number of births a year per 1,000 people.\n",
|
|
|
|
+ "* `death_rate` - The country's death rate, or the number of deaths a year per 1,000 people.\n",
|
|
|
|
+ "* `area` - The country's total area (both land and water).\n",
|
|
|
|
+ "* `area_land` - The country's land area in [square kilometers](https://www.cia.gov/library/publications/the-world-factbook/rankorder/2147rank.html).\n",
|
|
|
|
+ "* `area_water` - The country's water area in square kilometers.\n",
|
|
|
|
+ "\n",
|
|
|
|
+ "Let's start by calculating some summary statistics and see what they tell us."
|
|
]
|
|
]
|
|
},
|
|
},
|
|
{
|
|
{
|
|
@@ -171,7 +209,6 @@
|
|
"name": "stdout",
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"text": [
|
|
- " * sqlite:///factbook.db\n",
|
|
|
|
"Done.\n"
|
|
"Done.\n"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
@@ -182,8 +219,8 @@
|
|
" <tr>\n",
|
|
" <tr>\n",
|
|
" <th>min_pop</th>\n",
|
|
" <th>min_pop</th>\n",
|
|
" <th>max_pop</th>\n",
|
|
" <th>max_pop</th>\n",
|
|
- " <th>min_pop_grwth</th>\n",
|
|
|
|
- " <th>max_pop_grwth</th>\n",
|
|
|
|
|
|
+ " <th>min_pop_growth</th>\n",
|
|
|
|
+ " <th>max_pop_growth</th>\n",
|
|
" </tr>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <tr>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
@@ -204,12 +241,23 @@
|
|
],
|
|
],
|
|
"source": [
|
|
"source": [
|
|
"%%sql\n",
|
|
"%%sql\n",
|
|
- "SELECT\n",
|
|
|
|
- " MIN(population) min_pop,\n",
|
|
|
|
- " MAX(population) max_pop, \n",
|
|
|
|
- " MIN(population_growth) min_pop_grwth,\n",
|
|
|
|
- " MAX(population_growth) max_pop_grwth \n",
|
|
|
|
- "FROM facts;"
|
|
|
|
|
|
+ "SELECT MIN(population) AS min_pop,\n",
|
|
|
|
+ " MAX(population) AS max_pop,\n",
|
|
|
|
+ " MIN(population_growth) AS min_pop_growth,\n",
|
|
|
|
+ " MAX(population_growth) max_pop_growth \n",
|
|
|
|
+ " FROM facts;"
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "source": [
|
|
|
|
+ "A few things stick out from the summary statistics above:\n",
|
|
|
|
+ "\n",
|
|
|
|
+ "- There's a country with a population of `0`\n",
|
|
|
|
+ "- There's a country with a population of `7256490011` (or more than 7.2 billion people) \n",
|
|
|
|
+ "\n",
|
|
|
|
+ "Let's use subqueries to zoom in on just these countries _without_ using the specific values."
|
|
]
|
|
]
|
|
},
|
|
},
|
|
{
|
|
{
|
|
@@ -233,7 +281,6 @@
|
|
"name": "stdout",
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"text": [
|
|
- " * sqlite:///factbook.db\n",
|
|
|
|
"Done.\n"
|
|
"Done.\n"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
@@ -255,22 +302,22 @@
|
|
" <th>migration_rate</th>\n",
|
|
" <th>migration_rate</th>\n",
|
|
" </tr>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <tr>\n",
|
|
- " <td>261</td>\n",
|
|
|
|
- " <td>xx</td>\n",
|
|
|
|
- " <td>World</td>\n",
|
|
|
|
|
|
+ " <td>250</td>\n",
|
|
|
|
+ " <td>ay</td>\n",
|
|
|
|
+ " <td>Antarctica</td>\n",
|
|
|
|
+ " <td>None</td>\n",
|
|
|
|
+ " <td>280000</td>\n",
|
|
|
|
+ " <td>None</td>\n",
|
|
|
|
+ " <td>0</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
- " <td>7256490011</td>\n",
|
|
|
|
- " <td>1.08</td>\n",
|
|
|
|
- " <td>18.6</td>\n",
|
|
|
|
- " <td>7.8</td>\n",
|
|
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
" </tr>\n",
|
|
" </tr>\n",
|
|
"</table>"
|
|
"</table>"
|
|
],
|
|
],
|
|
"text/plain": [
|
|
"text/plain": [
|
|
- "[(261, 'xx', 'World', None, None, None, 7256490011, 1.08, 18.6, 7.8, None)]"
|
|
|
|
|
|
+ "[(250, 'ay', 'Antarctica', None, 280000, None, 0, None, None, None, None)]"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
"execution_count": 4,
|
|
"execution_count": 4,
|
|
@@ -280,10 +327,20 @@
|
|
],
|
|
],
|
|
"source": [
|
|
"source": [
|
|
"%%sql\n",
|
|
"%%sql\n",
|
|
- "SELECT * FROM facts\n",
|
|
|
|
- "WHERE population == (\n",
|
|
|
|
- " SELECT MAX(population) FROM facts\n",
|
|
|
|
- ");"
|
|
|
|
|
|
+ "SELECT *\n",
|
|
|
|
+ " FROM facts\n",
|
|
|
|
+ " WHERE population == (SELECT MIN(population)\n",
|
|
|
|
+ " FROM facts\n",
|
|
|
|
+ " );"
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "source": [
|
|
|
|
+ "It seems like the table contains a row for Antarctica, which explains the population of 0. This seems to match the CIA Factbook [page for Antarctica](https://www.cia.gov/library/publications/the-world-factbook/geos/ay.html):\n",
|
|
|
|
+ "\n",
|
|
|
|
+ "<img src = \"https://s3.amazonaws.com/dq-content/257/fb_antarctica.png\">"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
{
|
|
{
|
|
@@ -300,7 +357,6 @@
|
|
"name": "stdout",
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"text": [
|
|
- " * sqlite:///factbook.db\n",
|
|
|
|
"Done.\n"
|
|
"Done.\n"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
@@ -322,22 +378,22 @@
|
|
" <th>migration_rate</th>\n",
|
|
" <th>migration_rate</th>\n",
|
|
" </tr>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <tr>\n",
|
|
- " <td>250</td>\n",
|
|
|
|
- " <td>ay</td>\n",
|
|
|
|
- " <td>Antarctica</td>\n",
|
|
|
|
- " <td>None</td>\n",
|
|
|
|
- " <td>280000</td>\n",
|
|
|
|
- " <td>None</td>\n",
|
|
|
|
- " <td>0</td>\n",
|
|
|
|
|
|
+ " <td>261</td>\n",
|
|
|
|
+ " <td>xx</td>\n",
|
|
|
|
+ " <td>World</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
|
|
+ " <td>7256490011</td>\n",
|
|
|
|
+ " <td>1.08</td>\n",
|
|
|
|
+ " <td>18.6</td>\n",
|
|
|
|
+ " <td>7.8</td>\n",
|
|
" <td>None</td>\n",
|
|
" <td>None</td>\n",
|
|
" </tr>\n",
|
|
" </tr>\n",
|
|
"</table>"
|
|
"</table>"
|
|
],
|
|
],
|
|
"text/plain": [
|
|
"text/plain": [
|
|
- "[(250, 'ay', 'Antarctica', None, 280000, None, 0, None, None, None, None)]"
|
|
|
|
|
|
+ "[(261, 'xx', 'World', None, None, None, 7256490011, 1.08, 18.6, 7.8, None)]"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
"execution_count": 5,
|
|
"execution_count": 5,
|
|
@@ -347,22 +403,106 @@
|
|
],
|
|
],
|
|
"source": [
|
|
"source": [
|
|
"%%sql\n",
|
|
"%%sql\n",
|
|
- "SELECT * FROM facts\n",
|
|
|
|
- "WHERE population == (\n",
|
|
|
|
- " SELECT MIN(population) FROM facts\n",
|
|
|
|
- ");"
|
|
|
|
|
|
+ "SELECT *\n",
|
|
|
|
+ " FROM facts\n",
|
|
|
|
+ " WHERE population == (SELECT MAX(population)\n",
|
|
|
|
+ " FROM facts\n",
|
|
|
|
+ " );"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"source": [
|
|
"source": [
|
|
- "## Exploring Average Population and Area"
|
|
|
|
|
|
+ "We also see that the table contains a row for the whole world, which explains the maximum population of over 7.2 billion we found earlier.\n",
|
|
|
|
+ "\n",
|
|
|
|
+ "Now that we know this, we should recalculate the summary statistics we calculated earlier, while excluding the row for the whole world."
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "source": [
|
|
|
|
+ "## Summary Statistics Revisited"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"execution_count": 6,
|
|
|
|
+ "metadata": {
|
|
|
|
+ "collapsed": false
|
|
|
|
+ },
|
|
|
|
+ "outputs": [
|
|
|
|
+ {
|
|
|
|
+ "name": "stdout",
|
|
|
|
+ "output_type": "stream",
|
|
|
|
+ "text": [
|
|
|
|
+ "Done.\n"
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ "data": {
|
|
|
|
+ "text/html": [
|
|
|
|
+ "<table>\n",
|
|
|
|
+ " <tr>\n",
|
|
|
|
+ " <th>min_pop</th>\n",
|
|
|
|
+ " <th>max_pop</th>\n",
|
|
|
|
+ " <th>min_pop_growth</th>\n",
|
|
|
|
+ " <th>max_pop_growth</th>\n",
|
|
|
|
+ " </tr>\n",
|
|
|
|
+ " <tr>\n",
|
|
|
|
+ " <td>0</td>\n",
|
|
|
|
+ " <td>1367485388</td>\n",
|
|
|
|
+ " <td>0.0</td>\n",
|
|
|
|
+ " <td>4.02</td>\n",
|
|
|
|
+ " </tr>\n",
|
|
|
|
+ "</table>"
|
|
|
|
+ ],
|
|
|
|
+ "text/plain": [
|
|
|
|
+ "[(0, 1367485388, 0.0, 4.02)]"
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ "execution_count": 6,
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "output_type": "execute_result"
|
|
|
|
+ }
|
|
|
|
+ ],
|
|
|
|
+ "source": [
|
|
|
|
+ "%%sql\n",
|
|
|
|
+ "SELECT MIN(population) AS min_pop,\n",
|
|
|
|
+ " MAX(population) AS max_pop,\n",
|
|
|
|
+ " MIN(population_growth) AS min_pop_growth,\n",
|
|
|
|
+ " MAX(population_growth) AS max_pop_growth \n",
|
|
|
|
+ " FROM facts\n",
|
|
|
|
+ " WHERE name <> 'World';"
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "source": [
|
|
|
|
+ "There's a country whose population closes in on 1.4 billion!"
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "source": [
|
|
|
|
+ "## Exploring Average Population and Area"
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "source": [
|
|
|
|
+ "Let's explore density. Density depends on the population and the country's area. Let's look at the average values for these two columns.\n",
|
|
|
|
+ "\n",
|
|
|
|
+ "We should take care to exclude the row for the whole world."
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "code",
|
|
|
|
+ "execution_count": 7,
|
|
"metadata": {
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"collapsed": false,
|
|
"jupyter": {
|
|
"jupyter": {
|
|
@@ -374,7 +514,6 @@
|
|
"name": "stdout",
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"text": [
|
|
- " * sqlite:///factbook.db\n",
|
|
|
|
"Done.\n"
|
|
"Done.\n"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
@@ -396,18 +535,25 @@
|
|
"[(32242666.56846473, 555093.546184739)]"
|
|
"[(32242666.56846473, 555093.546184739)]"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
- "execution_count": 6,
|
|
|
|
|
|
+ "execution_count": 7,
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
"output_type": "execute_result"
|
|
}
|
|
}
|
|
],
|
|
],
|
|
"source": [
|
|
"source": [
|
|
"%%sql\n",
|
|
"%%sql\n",
|
|
- "SELECT AVG(population) avg_population, AVG(area) avg_area\n",
|
|
|
|
|
|
+ "SELECT AVG(population) AS avg_population, AVG(area) AS avg_area\n",
|
|
" FROM facts\n",
|
|
" FROM facts\n",
|
|
" WHERE name <> 'World';"
|
|
" WHERE name <> 'World';"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "source": [
|
|
|
|
+ "We see that the average population is around 32 million and the average area is 555 thousand square kilometers."
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
{
|
|
{
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"metadata": {},
|
|
@@ -415,9 +561,19 @@
|
|
"## Finding Densely Populated Countries"
|
|
"## Finding Densely Populated Countries"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "source": [
|
|
|
|
+ "To finish, we'll build on the query above to find countries that are densely populated. We'll identify countries that have:\n",
|
|
|
|
+ "\n",
|
|
|
|
+ "- Above average values for population.\n",
|
|
|
|
+ "- Below average values for area."
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
{
|
|
{
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
- "execution_count": 7,
|
|
|
|
|
|
+ "execution_count": 8,
|
|
"metadata": {
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"collapsed": false,
|
|
"jupyter": {
|
|
"jupyter": {
|
|
@@ -429,7 +585,6 @@
|
|
"name": "stdout",
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"text": [
|
|
- " * sqlite:///factbook.db\n",
|
|
|
|
"Done.\n"
|
|
"Done.\n"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
@@ -553,22 +708,28 @@
|
|
" (192, 'vm', 'Vietnam', 331210, 310070, 21140, 94348835, 0.97, 15.96, 5.93, 0.3)]"
|
|
" (192, 'vm', 'Vietnam', 331210, 310070, 21140, 94348835, 0.97, 15.96, 5.93, 0.3)]"
|
|
]
|
|
]
|
|
},
|
|
},
|
|
- "execution_count": 7,
|
|
|
|
|
|
+ "execution_count": 8,
|
|
"metadata": {},
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
"output_type": "execute_result"
|
|
}
|
|
}
|
|
],
|
|
],
|
|
"source": [
|
|
"source": [
|
|
"%%sql\n",
|
|
"%%sql\n",
|
|
- "SELECT * FROM facts\n",
|
|
|
|
- "WHERE population > (\n",
|
|
|
|
- " SELECT AVG(population)\n",
|
|
|
|
- " FROM facts\n",
|
|
|
|
- ")\n",
|
|
|
|
- " AND area < (\n",
|
|
|
|
- " SELECT AVG(area)\n",
|
|
|
|
- " FROM facts\n",
|
|
|
|
- ")"
|
|
|
|
|
|
+ "SELECT *\n",
|
|
|
|
+ " FROM facts\n",
|
|
|
|
+ " WHERE population > (SELECT AVG(population)\n",
|
|
|
|
+ " FROM facts\n",
|
|
|
|
+ " )\n",
|
|
|
|
+ " AND area < (SELECT AVG(area)\n",
|
|
|
|
+ " FROM facts\n",
|
|
|
|
+ ");"
|
|
|
|
+ ]
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
+ "metadata": {},
|
|
|
|
+ "source": [
|
|
|
|
+ "Some of these countries are generally known to be densely populated, so we have confidence in our results!"
|
|
]
|
|
]
|
|
}
|
|
}
|
|
],
|
|
],
|
|
@@ -588,7 +749,7 @@
|
|
"name": "python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"pygments_lexer": "ipython3",
|
|
- "version": "3.7.6"
|
|
|
|
|
|
+ "version": "3.4.3"
|
|
}
|
|
}
|
|
},
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat": 4,
|