In [5]:

import sqlite3
import pandas as pd
conn = sqlite3.connect("Data/factbook.db")
q = "SELECT * FROM sqlite_master WHERE type = 'table';"
pd.read_sql_query(q, conn)

Out[5]:

	type	name	tbl_name	rootpage	sql
0	table	facts	facts	2	CREATE TABLE "facts" ("id" INTEGER PRIMARY KEY...
1	table	sqlite_sequence	sqlite_sequence	3	CREATE TABLE sqlite_sequence(name,seq)

In [6]:

q1 = "SELECT * FROM facts LIMIT 5;"
pd.read_sql_query(q1, conn)

Out[6]:

	id	code	name	area	area_land	area_water	population	population_growth	birth_rate	death_rate	migration_rate	created_at	updated_at
0	1	af	Afghanistan	652230	652230	0	32564342	2.32	38.57	13.89	1.51	2015-11-01 13:19:49.461734	2015-11-01 13:19:49.461734
1	2	al	Albania	28748	27398	1350	3029278	0.30	12.92	6.58	3.30	2015-11-01 13:19:54.431082	2015-11-01 13:19:54.431082
2	3	ag	Algeria	2381741	2381741	0	39542166	1.84	23.67	4.31	0.92	2015-11-01 13:19:59.961286	2015-11-01 13:19:59.961286
3	4	an	Andorra	468	468	0	85580	0.12	8.13	6.96	0.00	2015-11-01 13:20:03.659945	2015-11-01 13:20:03.659945
4	5	ao	Angola	1246700	1246700	0	19625353	2.78	38.78	11.49	0.46	2015-11-01 13:20:08.625072	2015-11-01 13:20:08.625072

In [8]:

q2 = "SELECT MIN(population), MAX(population), MIN(population_growth), MAX(population_growth) FROM facts;"
pd.read_sql_query(q2, conn)

Out[8]:

	name	MIN(population)	MAX(population)	MIN(population_growth)	MAX(population_growth)
0	South Sudan	0	7256490011	0.0	4.02

In [9]:

q3 = "SELECT name, MIN(population) FROM facts;"
pd.read_sql_query(q3, conn)

Out[9]:

	name	MIN(population)
0	Antarctica	0

In [10]:

q4 = "SELECT name, MAX(population) FROM facts;"
pd.read_sql_query(q4, conn)

Out[10]:

	name	MAX(population)
0	World	7256490011

Findings:¶

instead of listing the country names, the table also include continent name like Antarctica or group name "World" of all the people in the world which are not comparable to other data in the table

In [37]:

q5 = "SELECT population, population_growth, birth_rate, death_rate FROM facts WHERE (name != 'World' AND name !='Antarctica')"

import matplotlib.pyplot as plt

import seaborn as sns
%matplotlib inline

#Comment portion is my original way, below is an alternative way
#df = pd.read_sql_query(q5, conn)
# cols = df.columns.tolist()

# fig = plt.figure(figsize =(12, 12))
# for i in range(4):
#     ax = fig.add_subplot(2, 2, i+1)
#     data = df[cols[i]][df[cols[i]].isnull() == False]
#     ax.set_title(cols[i])
#     ax.hist(data)

fig = plt.figure(figsize = (12,12))
ax = fig.add_subplot(111) #same as add_subplot(1, 1, 1)
pd.read_sql_query(q5, conn).hist(ax = ax)

/Applications/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2862: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  exec(code_obj, self.user_global_ns, self.user_ns)

Out[37]:

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a1e104e10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11c0f7358>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11c22add8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11be39e80>]], dtype=object)

In [66]:

q6 = '''
SELECT name, population/area_land pop_density FROM facts 
WHERE  (name != 'World' AND name !='Antarctica')
ORDER BY pop_density DESC
LIMIT 30'''
fig, ax = plt.subplots(figsize = (12, 12))
df = pd.read_sql_query(q6, conn)
df = df.set_index('name')
print(df.head())
df.plot.barh(ax = ax)

            pop_density
name                   
Macau             21168
Monaco            15267
Singapore          8259
Hong Kong          6655
Gaza Strip         5191

Out[66]:

<matplotlib.axes._subplots.AxesSubplot at 0x1a2133c9b0>

Phoebe's blog

Study CIA World Factbook with sqlite3

Findings:¶

Findings:¶

social