Meta comment: I have tried to make all of my analysis as reproducible as possible. If you have trouble getting the same results anywhere, or feel that I did something wrong, please let me know @arjunbazinga.

Context

I couldn't find a good source, in a data-file format, for everything I wanted in this analysis, so I ended up extracting the data from a summary report that is published every year (here). The downloaded file is also available in the repo.

In [3]:
from tabula import read_pdf
import pandas as pd
In [4]:
# Extract the tables found on pages 3, 6 and 7 of the report PDF. The result is
# unpacked into three names, so exactly three tables must come back.
# NOTE(review): assumes the tables are returned in page order
# (population, birth rate, infant mortality) -- confirm against the PDF.
population, birth_rate, infant_mortality = read_pdf(
    input_path="Demographic Indicators.pdf",
    pages=[3, 6, 7],
    multiple_tables=True,
)

Cleaning Infant_mortality

In [5]:
# Show the first 10 rows of the raw extracted table (header rows plus data).
infant_mortality.head(10)
Out[5]:
0 1 2 3
0 NaN NaN INFANT MORTALITY RATES - 2016 NaN
1 NaN NaN Infant Mortality Rate [IMR] NaN
2 S. No. State/UT NaN NaN
3 NaN NaN Total Rural Urban
4 1 Andhra Pradesh 34 38 24
5 2 Arunachal Pradesh 36 38 23
6 3 Assam 44 46 22
7 4 Bihar 38 39 29
8 5 Chhattisgarh 39 41 31
9 6 Goa 8 10 7
In [6]:
# Work on a copy so the raw extracted table stays untouched.
infant_mortality_clean = infant_mortality.copy()
In [7]:
# Drop the four header rows at the top of the extracted table (positions 0-3).
# The slice is taken from the raw `infant_mortality` frame, not from
# `infant_mortality_clean` itself, so re-running this cell always gives the
# same result instead of discarding four more (real) data rows each time.
infant_mortality_clean = infant_mortality.iloc[4:].copy()
print(infant_mortality_clean.head())
   0                  1      2   3
4  1     Andhra Pradesh  34 38  24
5  2  Arunachal Pradesh  36 38  23
6  3              Assam  44 46  22
7  4              Bihar  38 39  29
8  5       Chhattisgarh  39 41  31
In [8]:
# Keep columns 1, 2 and 3 only; column 0 (the serial number) is discarded.
infant_mortality_clean = infant_mortality_clean.loc[:, [1, 2, 3]]
print(infant_mortality_clean.head())
                   1      2   3
4     Andhra Pradesh  34 38  24
5  Arunachal Pradesh  36 38  23
6              Assam  44 46  22
7              Bihar  38 39  29
8       Chhattisgarh  39 41  31
In [9]:
# Renumber the rows from 0; the old row labels are dropped, not kept as a column.
infant_mortality_clean = infant_mortality_clean.reset_index(drop=True)
infant_mortality_clean.head()
Out[9]:
1 2 3
0 Andhra Pradesh 34 38 24
1 Arunachal Pradesh 36 38 23
2 Assam 44 46 22
3 Bihar 38 39 29
4 Chhattisgarh 39 41 31
In [10]:
# Column 2 holds two whitespace-separated values per row (e.g. "34 38").
# Split each cell and lay the pieces out as separate "total" / "rural" columns.
total_rural_pairs = infant_mortality_clean[2].str.split().tolist()
temp = pd.DataFrame(total_rural_pairs, columns=["total", "rural"])
In [11]:
# Attach the split-out columns side by side with the existing ones.
infant_mortality_clean = pd.concat(
    [infant_mortality_clean, temp],
    axis="columns",
)
In [12]:
# Preview the frame with the new "total" and "rural" columns appended.
infant_mortality_clean.head()
Out[12]:
1 2 3 total rural
0 Andhra Pradesh 34 38 24 34 38
1 Arunachal Pradesh 36 38 23 36 38
2 Assam 44 46 22 44 46
3 Bihar 38 39 29 38 39
4 Chhattisgarh 39 41 31 39 41
In [13]:
# Column 2 (the combined text) is redundant now that it has been split.
infant_mortality_clean = infant_mortality_clean.drop(columns=[2])
In [14]:
# Preview the frame after removing the combined column.
infant_mortality_clean.head()
Out[14]:
1 3 total rural
0 Andhra Pradesh 24 34 38
1 Arunachal Pradesh 23 36 38
2 Assam 22 44 46
3 Bihar 29 38 39
4 Chhattisgarh 31 39 41
In [15]:
# Give the two remaining positional columns descriptive names.
infant_mortality_clean = infant_mortality_clean.rename(
    columns={1: "states", 3: "urban"}
)
In [16]:
# Preview the frame with its final column names.
infant_mortality_clean.head()
Out[16]:
states urban total rural
0 Andhra Pradesh 24 34 38
1 Arunachal Pradesh 23 36 38
2 Assam 22 44 46
3 Bihar 29 38 39
4 Chhattisgarh 31 39 41
In [17]:
# Display the whole cleaned table for a visual check of every row.
infant_mortality_clean
Out[17]:
states urban total rural
0 Andhra Pradesh 24 34 38
1 Arunachal Pradesh 23 36 38
2 Assam 22 44 46
3 Bihar 29 38 39
4 Chhattisgarh 31 39 41
5 Goa 7 8 10
6 Gujarat 19 30 38
7 Haryana 27 33 35
8 Himachal Pradesh 19 25 25
9 Jammu & Kashmir 23 24 25
10 Jharkhand 21 29 31
11 Karnataka 19 24 27
12 Kerala 10 10 10
13 Madhya Pradesh 33 47 50
14 Maharashtra 13 19 24
15 Manipur 10 11 12
16 Meghalaya 26 39 40
17 Mizoram 14 27 35
18 Nagaland 14 12 11
19 Odisha 34 44 46
20 Punjab 18 21 23
21 Rajasthan 30 41 45
22 Sikkim 13 16 18
23 Tamil Nadu 14 17 20
24 Telangana 24 31 35
25 Tripura 32 24 21
26 Uttarakhand 29 38 41
27 Uttar Pradesh 34 43 46
28 West Bengal 22 25 25
29 A& N Islands 22 16 12
30 Chandigarh 14 14 6
31 D & N Haveli 12 17 24
32 Daman & Diu 19 19 18
33 Delhi 17 18 24
34 Lakshadweep 20 19 16
35 Puducherry 8 10 16
36 All India 23 34 38
In [18]:
# True if any cell in the cleaned table is missing.
infant_mortality_clean.isna().to_numpy().any()
Out[18]:
False
In [19]:
# Write the cleaned table to disk; the row index is not written.
infant_mortality_clean.to_csv(path_or_buf="infant_mortality.csv", index=False)

Cleaning birthrate

In [20]:
# Show the first 10 rows of the raw extracted table (header rows plus data).
birth_rate.head(10)
Out[20]:
0 1
0 Table 4. NaN
1 NaN ESTIMATES OF BIRTH RATES AND DEATH RATES - 2016
2 NaN Crude Birth Rate Crude Death Rate
3 S. No. State/UT
4 NaN Total Rural Urban Total Rural Urban
5 1 Andhra Pradesh 16.4 16.7 15.8 6.8 7.7 4.9
6 2 Arunachal Pradesh 18.9 19.5 16.1 6.2 6.5 4.5
7 3 Assam 21.7 22.8 15.0 6.7 7.1 4.9
8 4 Bihar 26.8 27.7 21.1 6.0 6.1 5.5
9 5 Chhattisgarh 22.8 24.1 18.3 7.4 7.8 6.2
In [21]:
# Work on a copy so the raw extracted table stays untouched.
birth_rate_clean = birth_rate.copy()
In [22]:
# Drop the five header rows at the top of the extracted table (positions 0-4).
# The slice is taken from the raw `birth_rate` frame, not from
# `birth_rate_clean` itself, so re-running this cell always gives the same
# result instead of discarding five more (real) data rows each time.
birth_rate_clean = birth_rate.iloc[5:].copy()
birth_rate_clean.head()
Out[22]:
0 1
5 1 Andhra Pradesh 16.4 16.7 15.8 6.8 7.7 4.9
6 2 Arunachal Pradesh 18.9 19.5 16.1 6.2 6.5 4.5
7 3 Assam 21.7 22.8 15.0 6.7 7.1 4.9
8 4 Bihar 26.8 27.7 21.1 6.0 6.1 5.5
9 5 Chhattisgarh 22.8 24.1 18.3 7.4 7.8 6.2
In [23]:
# Column 1 carries the state name and all the rates as one string per row;
# break each string into whitespace-separated tokens.
x = birth_rate_clean.loc[:, 1].str.split()
In [24]:
# Each entry of `x` is the token list for one row: the state/UT name (one or
# more words) followed by six numbers -- birth rate total/rural/urban, then
# death rate total/rural/urban. Only the name and the three birth-rate figures
# are kept; the three trailing death-rate tokens are ignored.
states = []
total = []
rural = []
urban = []
for tokens in x:
    # Everything before the last six tokens is the (possibly multi-word) name.
    name_tokens, rate_tokens = tokens[:-6], tokens[-6:]
    # Convert before appending so a bad row cannot leave the lists ragged.
    birth_total, birth_rural, birth_urban = (float(v) for v in rate_tokens[:3])
    states.append(" ".join(name_tokens))
    total.append(birth_total)
    rural.append(birth_rural)
    urban.append(birth_urban)
In [25]:
# Assemble the parsed birth-rate columns into a frame. The mapping is spelled
# out explicitly rather than resolving variable names through eval().
df = pd.DataFrame(
    data={"states": states, "total": total, "rural": rural, "urban": urban}
)
In [26]:
# Write the birth-rate table to disk. The row index IS written (pandas default,
# made explicit here), so the file gets an extra unnamed leading column.
# NOTE(review): infant_mortality.csv is written with index=False; confirm
# whether anything reading this file relies on the index column before aligning.
df.to_csv("birth_rate.csv", index=True)

Cleaning population

In [27]:
# Show the first 10 rows of the raw extracted table (header rows plus data).
population.head(10)
Out[27]:
0 1 2 3 4 5 6 7
0 NaN NaN NaN Population 2001 Census NaN NaN Population 2011 Census NaN
1 S. No State/UT NaN NaN Rural NaN NaN Rural
2 NaN NaN Rural Urban Total NaN Rural Urban Total NaN
3 NaN NaN NaN NaN % NaN NaN %
4 1 Andhra Pradesh** 55401067 20808940 76210007 72.7 34776389 14610410 49386799 70.4
5 2 Arunachal Pradesh 870087 227881 1097968 79.2 1066358 317369 1383727 77.1
6 3 Assam 23216288 3439240 26655528 87.1 26807034 4398542 31205576 85.9
7 4 Bihar 74316709 8681800 82998509 89.5 92341436 11758016 104099452 88.7
8 5 Chhattisgarh 16648056 4185747 20833803 79.9 19607961 5937237 25545198 76.8
9 6 Goa 677091 670577 1347668 50.2 551731 906814 1458545 37.8
In [28]:
# Work on a copy so the raw extracted table stays untouched.
population_clean = population.copy()
In [29]:
# Drop the four header rows at the top of the extracted table (positions 0-3).
# The slice is taken from the raw `population` frame, not from
# `population_clean` itself, so re-running this cell always gives the same
# result instead of discarding four more (real) data rows each time.
population_clean = population.iloc[4:].copy()
In [30]:
# Preview the frame after the header rows have been removed.
population_clean.head()
Out[30]:
0 1 2 3 4 5 6 7
4 1 Andhra Pradesh** 55401067 20808940 76210007 72.7 34776389 14610410 49386799 70.4
5 2 Arunachal Pradesh 870087 227881 1097968 79.2 1066358 317369 1383727 77.1
6 3 Assam 23216288 3439240 26655528 87.1 26807034 4398542 31205576 85.9
7 4 Bihar 74316709 8681800 82998509 89.5 92341436 11758016 104099452 88.7
8 5 Chhattisgarh 16648056 4185747 20833803 79.9 19607961 5937237 25545198 76.8

Next step: keep only the census data from 2011.

In [31]:
# Keep the state name (column 1) plus columns 5 and 6, then renumber the rows
# from 0, discarding the old row labels.
population_clean = population_clean.loc[:, [1, 5, 6]].reset_index(drop=True)
In [32]:
# Preview the three retained columns.
population_clean.head()
Out[32]:
1 5 6
0 Andhra Pradesh** 34776389 14610410 49386799
1 Arunachal Pradesh 1066358 317369 1383727
2 Assam 26807034 4398542 31205576
3 Bihar 92341436 11758016 104099452
4 Chhattisgarh 19607961 5937237 25545198
In [33]:
# Column 6 holds two whitespace-separated values per row
# (e.g. "14610410 49386799"). Split each cell and lay the pieces out as
# separate "urban" / "total" columns.
urban_total_pairs = population_clean[6].str.split().tolist()
temp = pd.DataFrame(urban_total_pairs, columns=["urban", "total"])
In [34]:
# Attach the split-out columns side by side with the existing ones.
population_clean = pd.concat(
    [population_clean, temp],
    axis="columns",
)
In [35]:
# Preview the frame with the new "urban" and "total" columns appended.
population_clean.head()
Out[35]:
1 5 6 urban total
0 Andhra Pradesh** 34776389 14610410 49386799 14610410 49386799
1 Arunachal Pradesh 1066358 317369 1383727 317369 1383727
2 Assam 26807034 4398542 31205576 4398542 31205576
3 Bihar 92341436 11758016 104099452 11758016 104099452
4 Chhattisgarh 19607961 5937237 25545198 5937237 25545198
In [36]:
# Column 6 (the combined text) is redundant now that it has been split.
population_clean = population_clean.drop(columns=[6])
In [37]:
# Preview the frame after removing the combined column.
population_clean.head()
Out[37]:
1 5 urban total
0 Andhra Pradesh** 34776389 14610410 49386799
1 Arunachal Pradesh 1066358 317369 1383727
2 Assam 26807034 4398542 31205576
3 Bihar 92341436 11758016 104099452
4 Chhattisgarh 19607961 5937237 25545198
In [38]:
# Column 5 is given the name "rural".
population_clean = population_clean.rename(columns={5: "rural"})
In [39]:
# Build a clean "states" column by removing leading/trailing asterisks from the
# names in column 1 (e.g. "Andhra Pradesh**" -> "Andhra Pradesh"). The
# vectorised string accessor replaces the Python-level loop over `.values`.
population_clean["states"] = population_clean[1].str.strip("*")
In [40]:
# The raw name column (1) has been superseded by "states".
population_clean = population_clean.drop(columns=[1])
In [41]:
# Display the whole cleaned table for a visual check of every row.
population_clean
Out[41]:
rural urban total states
0 34776389 14610410 49386799 Andhra Pradesh
1 1066358 317369 1383727 Arunachal Pradesh
2 26807034 4398542 31205576 Assam
3 92341436 11758016 104099452 Bihar
4 19607961 5937237 25545198 Chhattisgarh
5 551731 906814 1458545 Goa
6 34694609 25745083 60439692 Gujarat
7 16509359 8842103 25351462 Haryana
8 6176050 688552 6864602 Himachal Pradesh
9 9108060 3433242 12541302 Jammu & Kashmir
10 25055073 7933061 32988134 Jharkhand
11 37469335 23625962 61095297 Karnataka
12 17471135 15934926 33406061 Kerala
13 52557404 20069405 72626809 Madhya Pradesh
14 61556074 50818259 112374333 Maharashtra
15 2021640 834154 2855794 Manipur
16 2371439 595450 2966889 Meghalaya
17 525435 571771 1097206 Mizoram
18 1407536 570966 1978502 Nagaland
19 34970562 7003656 41974218 Odisha
20 17344192 10399146 27743338 Punjab
21 51500352 17048085 68548437 Rajasthan
22 456999 153578 610577 Sikkim
23 37229590 34917440 72147030 Tamil Nadu
24 21585313 13608665 35193978 Telangana
25 2712464 961453 3673917 Tripura
26 7036954 3049338 10086292 Uttarakhand
27 155317278 44495063 199812341 Uttar Pradesh
28 62183113 29093002 91276115 West Bengal
29 237093 143488 380581 A& N Islands
30 28991 1026459 1055450 Chandigarh
31 183114 160595 343709 D & N Haveli
32 60396 182851 243247 Daman & Diu
33 419042 16368899 16787941 Delhi
34 14141 50332 64473 Lakshadweep
35 395200 852753 1247953 Puducherry
36 833748852 377106125 1210854977 All India
In [42]:
# Write the population table to disk. The row index IS written (pandas default,
# made explicit here), so the file gets an extra unnamed leading column.
# NOTE(review): infant_mortality.csv is written with index=False; confirm
# whether anything reading this file relies on the index column before aligning.
population_clean.to_csv("population.csv", index=True)