Assignment 3

Header - Imports, join, and unpivot

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
%matplotlib inline
import plotly.graph_objs as go 
from plotly.offline import init_notebook_mode,iplot,plot
init_notebook_mode(connected=True)
In [2]:
# Creating 'States Data Frame' for Plotly
States = ['USA','AL','AK','AZ','AR','CA','CO','CT','DE','DC','FL','GA','HI','ID','IL','IN','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY']
df_states = pd.DataFrame(data=States,columns = ['States'])
In [3]:
#Creating 'Region Data Frame' to seperate data
Region = ['usa','South','West','West','South','West','West','Northeast','South','South','South','South','West','West','Midwest','Midwest',
          'Midwest','Midwest','South','South','Northeast','South','Northeast','Midwest','Midwest','South','Midwest','West','Midwest',
          'West','Northeast','Northeast','West','Northeast','South','Midwest','Midwest','South','West','Northeast','Northeast','South',
          'Midwest','South','South','West','Northeast','South','West','South','Midwest','West']

df_region = pd.DataFrame(data=Region,columns = ['Region'])
df_region.head(3)
Out[3]:
Region
0 usa
1 South
2 West
In [4]:
# Importing data from xls
df_current = pd.read_csv('df_current_1.csv')
df_current.columns = ['Location','1970','1980','1990','2000','2010','2014','2015']
df_current.head(3)
Out[4]:
Location 1970 1980 1990 2000 2010 2014 2015
0 United States 8626.0 15970.0 31367.0 41807.0 55202.0 56610.0 57379.0
1 Alabama 6818.0 13060.0 24828.0 36689.0 47571.0 48720.0 49497.0
2 Alaska 10560.0 27210.0 43153.0 46462.0 59672.0 65891.0 66755.0
In [5]:
# Joining Data Frames
df_current['Region'] = df_region['Region']
df_current['State'] = df_states['States']
df_current.head(3)
Out[5]:
Location 1970 1980 1990 2000 2010 2014 2015 Region State
0 United States 8626.0 15970.0 31367.0 41807.0 55202.0 56610.0 57379.0 usa USA
1 Alabama 6818.0 13060.0 24828.0 36689.0 47571.0 48720.0 49497.0 South AL
2 Alaska 10560.0 27210.0 43153.0 46462.0 59672.0 65891.0 66755.0 West AK
In [6]:
# Importing data from xls
df_constant = pd.read_csv('df_constant.csv')
df_constant.columns = ['Location','1970','1980','1990','2000','2010','2014','2015']
df_constant.head(3)
Out[6]:
Location 1970 1980 1990 2000 2010 2014 2015
0 United States ............. 54046.0 48687.0 58467.0 58448.0 60281.0 57022.0 57379.0
1 Alabama .................... 42718.0 39815.0 46278.0 51293.0 51948.0 49075.0 49497.0
2 Alaska .................. 66163.0 82954.0 80436.0 64956.0 65162.0 66371.0 66755.0
In [7]:
# Unpivot Data
df = df_current.melt(id_vars = ('Location','State','Region'),value_vars = ('1970','1980','1990','2000','2010','2014','2015'))
df.columns = ['Location','State','Region','Year','Current_Avg']
df['Year']=df['Year'].astype(int)
df['Current_Avg'].astype(float)
df.head(3)
Out[7]:
Location State Region Year Current_Avg
0 United States USA usa 1970 8626.0
1 Alabama AL South 1970 6818.0
2 Alaska AK West 1970 10560.0
In [8]:
#Unpivot Data
df_append = df_constant.melt(id_vars = 'Location',value_vars = ('1970','1980','1990','2000','2010','2014','2015'))
df_append.columns = ['Location','Year','Constant_Avg']
df_append['Year']=df['Year'].astype(int)
df_append['Constant_Avg'].astype(float)
df_append.head(3)
Out[8]:
Location Year Constant_Avg
0 United States ............. 1970 54046.0
1 Alabama .................... 1970 42718.0
2 Alaska .................. 1970 66163.0
In [9]:
#Joining Data Frames
df['Constant_Avg'] = df_append['Constant_Avg']
In [10]:
# Build a Data Frame of locations as blank template
ls_locations = df['Location'].unique()
df_locations = pd.DataFrame(data = ls_locations,columns = ['Location'])
In [11]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 6 columns):
Location        364 non-null object
State           364 non-null object
Region          364 non-null object
Year            364 non-null int64
Current_Avg     364 non-null float64
Constant_Avg    364 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 17.1+ KB
In [12]:
# % difference function that will be used throughout
def differ(v1,v2):
    y1 = (v1-v2)
    y2 = (v1+v2)/2
    y3 = (y1*100)/(y2*100)*100
    return y3

Question 1

Determine the average increase for each region.

In [13]:
# Unique Regions
df['Region'].unique()
Out[13]:
array(['usa', 'South', 'West', 'Northeast', 'Midwest'], dtype=object)
In [14]:
#Grouping and aggerating data
df_groups = df.groupby(['Region','Year'])['Current_Avg','Constant_Avg'].mean()
In [15]:
df_groups.head()
Out[15]:
Current_Avg Constant_Avg
Region Year
Midwest 1970 8199.083333 51370.833333
1980 15146.000000 46175.000000
1990 29043.333333 54135.666667
2000 38188.083333 53388.500000
2010 49941.833333 54537.000000
In [16]:
#Splitting and Grouping Data
df_south = df_groups.transpose()['South']
df_west = df_groups.transpose()['West']
df_northeast = df_groups.transpose()['Northeast']
df_midwest = df_groups.transpose()['Midwest']
In [17]:
df_south
Out[17]:
Year 1970 1980 1990 2000 2010 2014 2015
Current_Avg 7564.882353 14444.000000 27851.117647 37597.411765 50373.764706 50934.294118 51613.352941
Constant_Avg 47397.352941 44034.529412 51913.352941 52562.764706 55008.647059 51305.235294 51613.352941
In [18]:
#Building Current Data Dictionary
dic_ans_current = {
        'South': differ(df_south[2015][0],df_south[1970][0]),
        'West': differ(df_west[2015][0],df_west[1970][0]),                      
        'Northeast': differ(df_northeast[2015][0],df_northeast[1970][0]),
        'Midwest': differ(df_midwest[2015][0],df_midwest[1970][0])
        }
In [19]:
#Building Constant Data Dictionary
dic_ans_constant = {
        'South': differ(df_south[2015][1],df_south[1970][1]),
        'West': differ(df_west[2015][1],df_west[1970][1]),                      
        'Northeast': differ(df_northeast[2015][1],df_northeast[1970][1]),
        'Midwest': differ(df_midwest[2015][1],df_midwest[1970][1])
        }
In [20]:
# Ans to Question 1
dic_ans_current
Out[20]:
{'Midwest': 146.116194468899,
 'Northeast': 153.06440875737698,
 'South': 148.86713119887082,
 'West': 145.25227147996833}
In [21]:
# Ans to Question 1
dic_ans_constant
Out[21]:
{'Midwest': 2.4895069046169613,
 'Northeast': 18.232254676051447,
 'South': 8.516250767890817,
 'West': 0.6491034957887728}
In [22]:
#Building a blank region database for plot
df_region['Current'] = df_region['Region']
df_region['Constant'] = df_region['Region']

#Apply Dictionary
df_ans1 = df_region.replace({'Current': dic_ans_current,'Constant': dic_ans_constant})
In [103]:
#Ans 1 Plot for 'Current'
data = dict(type='choropleth',
            colorscale = 'RdBu',
            locations = df_states['States'],
            #Adding Values
            z = df_ans1['Current'],
            locationmode = 'USA-states',
            #Add Text
            text = df_ans1['Current'],
            marker = dict(line = dict(color = 'rgb(255,255,255)',width = 1)),
            colorbar = {'title':"Currnet % Increase"}
            )
layout = dict(geo = {'scope':'usa'})
choromap = go.Figure(data = [data],layout = layout)
iplot(choromap)
In [102]:
#Ans 1 Plot for 'Constant'
data = dict(type='choropleth',
            colorscale = 'RdBu',
            locations = df_states['States'],
            #Adding Values
            z = df_ans1['Constant'],
            locationmode = 'USA-states',
            #Add Text
            text = df_ans1['Constant'],
            marker = dict(line = dict(color = 'rgb(255,255,255)',width = 1)),
            colorbar = {'title':"Constant% Increase"}
            )
layout = dict(geo = {'scope':'usa'})
choromap = go.Figure(data = [data],layout = layout)
iplot(choromap)

Question 2

Compare the average salary in 1999-2000 by region to 2014-15 by region Which groups had statistically significant increases? Decreases?

In [25]:
#Example Data
df_south
Out[25]:
Year 1970 1980 1990 2000 2010 2014 2015
Current_Avg 7564.882353 14444.000000 27851.117647 37597.411765 50373.764706 50934.294118 51613.352941
Constant_Avg 47397.352941 44034.529412 51913.352941 52562.764706 55008.647059 51305.235294 51613.352941
In [26]:
# Building Current Data Dictionary
dic_ans_current = {
        'South': differ(df_south[2014][0],df_south[2000][0]),
        'West': differ(df_west[2014][0],df_west[2000][0]),                      
        'Northeast': differ(df_northeast[2014][0],df_northeast[2000][0]),
        'Midwest': differ(df_midwest[2014][0],df_midwest[2000][0])
        }
In [27]:
#Building Constant Data Dictionary
dic_ans_constant = {
        'South': differ(df_south[2014][1],df_south[2000][1]),
        'West': differ(df_west[2014][1],df_west[2000][1]),                      
        'Northeast': differ(df_northeast[2014][1],df_northeast[2000][1]),
        'Midwest': differ(df_midwest[2014][1],df_midwest[2000][1])
        }
In [28]:
#Building a blank region database for plot
df_region['Current'] = df_region['Region']
df_region['Constant'] = df_region['Region']

#Apply Dictionary
df_ans2 = df_region.replace({'Current': dic_ans_current,'Constant': dic_ans_constant})
In [101]:
#Ans 2 Plot for 'Current'
data = dict(type='choropleth',
            colorscale = 'RdBu',
            locations = df_states['States'],
            #Adding Values
            z = df_ans2['Current'],
            locationmode = 'USA-states',
            #Add Text
            text =df_ans2['Current'],
            marker = dict(line = dict(color = 'rgb(255,255,255)',width = 1)),
            colorbar = {'title':"Current % Increase"}
            )
layout = dict(geo = {'scope':'usa'})
choromap = go.Figure(data = [data],layout = layout)
iplot(choromap)
In [100]:
#Ans 2 Plot for 'Constant'
data = dict(type='choropleth',
            colorscale = 'RdBu',
            locations = df_states['States'],
            #Adding Values
            z = df_ans2['Constant'],
            locationmode = 'USA-states',
            #Add Text
            text = df_ans2['Constant'],
            marker = dict(line = dict(color = 'rgb(255,255,255)',width = 1)),
            colorbar = {'title':"Constnat % Increase"}
            )
layout = dict(geo = {'scope':'usa'})
choromap = go.Figure(data = [data],layout = layout)
iplot(choromap)

Ans, the North East

In [31]:
#sns.kdeplot(x)
#sns.kdeplot(x, bw=.2, label="bw: 0.2")
#sns.kdeplot(x, bw=2, label="bw: 2")
#plt.legend();

Question 3

Compare the average salary by region for 1999-2000. Are there statistically significant differences? Conduct the same analysis using 2014-15 data.

In [32]:
#Filtering and Aggeragaing Data
df_q3 = df[df['Year'] == 2000].groupby('Region').mean()
In [33]:
df_q3
Out[33]:
Year Current_Avg Constant_Avg
Region
Midwest 2000 38188.083333 53388.500000
Northeast 2000 45312.222222 63348.444444
South 2000 37597.411765 52562.764706
West 2000 38603.769231 53969.615385
usa 2000 41807.000000 58448.000000
In [34]:
#Building Current Data Dictionary
dic_ans_current = {
        'South': df_q3['Current_Avg']['South'],
        'West': df_q3['Current_Avg']['West'],                      
        'Northeast': df_q3['Current_Avg']['Northeast'],
        'Midwest': df_q3['Current_Avg']['Midwest']
        }
In [35]:
#Building Constant Data Dictionary
dic_ans_constant = {
        'South': df_q3['Constant_Avg']['South'],
        'West': df_q3['Constant_Avg']['West'],                      
        'Northeast': df_q3['Constant_Avg']['Northeast'],
        'Midwest': df_q3['Constant_Avg']['Midwest']
        }
In [36]:
dic_ans_current
Out[36]:
{'Midwest': 38188.083333333336,
 'Northeast': 45312.22222222222,
 'South': 37597.41176470588,
 'West': 38603.769230769234}
In [37]:
dic_ans_constant
Out[37]:
{'Midwest': 53388.5,
 'Northeast': 63348.444444444445,
 'South': 52562.76470588235,
 'West': 53969.61538461538}
In [38]:
#Building a blank region database for plot
df_region['Current'] = df_region['Region']
df_region['Constant'] = df_region['Region']

#Apply Dictionary
df_ans3 = df_region.replace({'Current': dic_ans_current,'Constant': dic_ans_constant})
In [99]:
#Ans 3 Plot for 'Current'
data = dict(type='choropleth',
            colorscale = 'RdBu',
            locations = df_states['States'],
            #Adding Values
            z = df_ans3['Current'],
            locationmode = 'USA-states',
            #Add Text
            text = df_ans3['Current'],
            marker = dict(line = dict(color = 'rgb(255,255,255)',width = 1)),
            colorbar = {'title':"Current % Increase"}
            )
layout = dict(geo = {'scope':'usa'})
choromap = go.Figure(data = [data],layout = layout)
iplot(choromap)
In [98]:
#Ans 3 Plot for 'Constant'
data = dict(type='choropleth',
            colorscale = 'RdBu',
            locations = df_states['States'],
            #Adding Values
            z = df_ans3['Constant'],
            locationmode = 'USA-states',
            #Add Text
            text = df_ans3['Constant'],
            marker = dict(line = dict(color = 'rgb(255,255,255)',width = 1)),
            colorbar = {'title':"Constnat % Increase"}
            )
layout = dict(geo = {'scope':'usa'})
choromap = go.Figure(data = [data],layout = layout)
iplot(choromap)
In [93]:
df_q3.describe()
Out[93]:
Year Current_Avg Constant_Avg
count 5.0 5.000000 5.000000
mean 2000.0 40301.697310 56343.464907
std 0.0 3241.711747 4532.093320
min 2000.0 37597.411765 52562.764706
25% 2000.0 38188.083333 53388.500000
50% 2000.0 38603.769231 53969.615385
75% 2000.0 41807.000000 58448.000000
max 2000.0 45312.222222 63348.444444
In [95]:
 [40301.697310+3241.711747 ,56343.464907+ 4532.093320]
Out[95]:
[43543.409057000004, 60875.558227]
In [ ]:
 

Question 4

For each region compare the 2014-15 average to the U.S. average. Which regions are above/below the US. Average?

In [41]:
df_q4 = df[df['Year'] == 2014].groupby('Region').mean()
In [42]:
df_q4
Out[42]:
Year Current_Avg Constant_Avg
Region
Midwest 2014 51846.166667 52223.666667
Northeast 2014 64341.000000 64809.333333
South 2014 50934.294118 51305.235294
West 2014 53716.230769 54107.384615
usa 2014 56610.000000 57022.000000
In [43]:
#Building Current Data Dictionary
dic_ans_current = {
        'South': df_q4['Current_Avg']['South'],
        'West': df_q4['Current_Avg']['West'],                      
        'Northeast': df_q4['Current_Avg']['Northeast'],
        'Midwest': df_q4['Current_Avg']['Midwest']
        }
In [44]:
#Building Constant Data Dictionary
dic_ans_constant = {
        'South': df_q4['Constant_Avg']['South'],
        'West': df_q4['Constant_Avg']['West'],                      
        'Northeast': df_q4['Constant_Avg']['Northeast'],
        'Midwest': df_q4['Constant_Avg']['Midwest']
        }
In [45]:
#Building a blank region database for plot
df_region['Current'] = df_region['Region']
df_region['Constant'] = df_region['Region']

#Apply Dictionary
df_ans4 = df_region.replace({'Current': dic_ans_current,'Constant': dic_ans_constant})
In [46]:
#Defining function for yes (2) and no (1) on mean being greater than or less than USA Average
def usa_current(v1):
    if v1> 56610:
        return 2
    else:
        return 1
def usa_constant(v1):
    if v1> 57022:
        return 2
    else:
        return 1
In [47]:
#Applying Function 2 for yes and 1 for no
df_ans4['Current'] = df_ans4['Current'].apply(lambda x : usa_current(x))
In [48]:
#Applying Function 2 for yes and 1 for no
df_ans4['Constant'] = df_ans4['Constant'].apply(lambda x : usa_constant(x))
In [49]:
df_ans4.head(3)
Out[49]:
Region Current Constant
0 usa 2 2
1 South 1 1
2 West 1 1
In [97]:
#Ans 4 Plot for 'Current'
data = dict(type='choropleth',
            colorscale = 'RdBu',
            locations = df_states['States'],
            #Adding Values
            z = df_ans4['Current'],
            locationmode = 'USA-states',
            #Add Text
            text = df_ans4['Current'],
            marker = dict(line = dict(color = 'rgb(255,255,255)',width = 1)),
            colorbar = {'title':"Current % Increase"}
            )
layout = dict(geo = {'scope':'usa'})
choromap = go.Figure(data = [data],layout = layout)
iplot(choromap)
In [96]:
#Ans 4 Plot for 'Current'
data = dict(type='choropleth',
            colorscale = 'RdBu',
            locations = df_states['States'],
            #Adding Values
            z = df_ans4['Current'],
            locationmode = 'USA-states',
            #Add Text
            text = df_ans4['Current'],
            marker = dict(line = dict(color = 'rgb(255,255,255)',width = 1)),
            colorbar = {'title':"Current % Increase"}
            )
layout = dict(geo = {'scope':'usa'})
choromap = go.Figure(data = [data],layout = layout)
iplot(choromap)

Question 5

Conduct one analysis of your choice based on the results of the above. Feel free to add data to the set if that is relevant to your analysis.

The question, does higher Teacher Income translate into higher GDP?

In [104]:
#This is economic data selected from various years
df_1970 = pd.read_csv('1970.csv')
df_1970['Year'] = 1970

df_1980 = pd.read_csv('1980.csv')
df_1980['Year'] = 1980

df_1990 = pd.read_csv('1990.csv')
df_1990['Year'] = 1990

df_2000 = pd.read_csv('2000.csv')
df_2000['Year'] = 2000

df_2010 = pd.read_csv('2010.csv')
df_2010['Year'] = 2010

df_2014 = pd.read_csv('2014.csv')
df_2014['Year'] = 2014

df_2015 = pd.read_csv('2015.csv')
df_2015['Year'] = 2015
In [53]:
#Consolidating Data Frame
df_econ = df_1970.append([df_1980,df_1990,df_2000,df_2010,df_2014,df_2015], ignore_index=True)
df_econ['Gross State Product'] = df_econ['Gross State Product'].apply(lambda x : float(x.replace(',','')))
df_econ.columns = [['State','Spending','GDP','Growth','Population','Year_prime']]
#Dropping 'USA' from data set
df_econ = df_econ.drop(df_econ.index[[0,52,104,156,208,260,312]])
df_econ.head(3)
Out[53]:
State Spending GDP Growth Population Year_prime
1 Alabama 2 12.5 -0.1 3.4 1970
2 Alaska 0.4 2.4 8.1 0.3 1970
3 Arizona 1.3 8.6 7.6 1.8 1970
In [54]:
#Combining with current Data Frame
df['Spending'] = df_econ['Spending']
df['GDP'] = df_econ['GDP']
df['Growth'] = df_econ['Growth']
df['Population'] = df_econ['Population'].astype(float)
In [55]:
#Heat map of Corr() anything blue should be have a high corr
sns.heatmap(df.corr(), cmap='RdBu')
Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09898fa510>
In [57]:
#Passing Corr table into a Data Frame
df_corr = df.corr()
In [88]:
# Function to translate answer
def corx(x):
    if x < .3:
        return 'None'
    if .3 < x < .50:
        return 'Weak'
    if .5 <= x <=.7:
        return 'Moderate'
    if x > .7:
        return 'Strong'
    else:
        return 'Unsure'
In [114]:
# Ans
df_corr['Current_Avg'].apply(lambda x : corx(x))[3:]
Out[114]:
GDP           Weak
Growth        None
Population    None
Name: Current_Avg, dtype: object
In [111]:
# Ans
df_corr['Constant_Avg'].apply(lambda x : corx(x))[3:]
Out[111]:
GDP           Weak
Growth        None
Population    Weak
Name: Constant_Avg, dtype: object

Ans: Yes there is a corr but it is weak. GDP is more tied to Population than Teacher Saleries

In [ ]: