Margin Mart Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from collections import Counter
In [2]:
%matplotlib inline
In [3]:
#sns.set_palette("GnBu_d")
#sns.set_style('whitegrid')
In [4]:
df = pd.read_csv('MarginMart.csv')
In [5]:
df_date = pd.read_csv('MarginMart_Date.csv')

Initial Inspection

In [6]:
df.head()
Out[6]:
brand_name product_name product_subcategory product_category product_department product_family store_sales store_cost unit_sales date city state_province country
0 Ebony Ebony Lettuce Fresh Vegetables Vegetables Produce Food 10.53 4.2120 3 42092 Issaquah WA USA
1 Gorilla Gorilla String Cheese Cheese Dairy Dairy Food 5.37 1.7184 3 42111 Issaquah WA USA
2 Gorilla Gorilla Chocolate Milk Milk Dairy Dairy Drink 2.94 1.2348 2 42111 Issaquah WA USA
3 Musial Musial Spicy Mints Hard Candy Candy Snacks Food 8.76 2.8908 3 42111 Issaquah WA USA
4 Monarch Monarch Thai Rice Rice Starchy Foods Starchy Foods Food 6.22 1.8660 2 42111 Issaquah WA USA
In [7]:
df.describe()
Out[7]:
store_sales store_cost unit_sales date
count 9997.000000 9997.000000 9997.000000 9997.000000
mean 6.553135 2.641228 3.085426 42168.349105
std 3.459607 1.470528 0.824000 97.147702
min 0.520000 0.169600 1.000000 42005.000000
25% 3.840000 1.516800 3.000000 42084.000000
50% 6.000000 2.409200 3.000000 42171.000000
75% 8.640000 3.505500 4.000000 42253.000000
max 19.800000 9.850000 6.000000 42338.000000
In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9997 entries, 0 to 9996
Data columns (total 13 columns):
brand_name             9997 non-null object
product_name           9997 non-null object
product_subcategory    9997 non-null object
product_category       9997 non-null object
product_department     9997 non-null object
product_family         9997 non-null object
store_sales            9997 non-null float64
store_cost             9997 non-null float64
unit_sales             9997 non-null int64
date                   9997 non-null int64
city                   9997 non-null object
state_province         9997 non-null object
country                9997 non-null object
dtypes: float64(2), int64(2), object(9)
memory usage: 1015.4+ KB
In [9]:
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1cf0487650>
In [10]:
sns.heatmap(df.corr())
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1cee409790>
In [11]:
sns.pairplot(df)
Out[11]:
<seaborn.axisgrid.PairGrid at 0x7f1cedd13a50>

Part 1.0 - Store_Sales vs Store_Cost

In [12]:
sns.distplot(df['store_cost'],bins= 30)
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1ced768050>
In [13]:
sns.distplot(df['store_sales'],bins= 30)
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1ced4cd490>
In [14]:
sns.jointplot(x='store_sales' , y='store_cost' ,data=df, kind='kde')
Out[14]:
<seaborn.axisgrid.JointGrid at 0x7f1cf061c210>
In [15]:
sns.jointplot(x='store_sales' , y='store_cost' , data=df, kind='hex')
Out[15]:
<seaborn.axisgrid.JointGrid at 0x7f1cec1900d0>

Correlation between store_sales and store_cost

In [16]:
df['store_cost'].corr(df['store_sales'])
Out[16]:
0.95534495039761558

Part 1.1 There is a strong correlation between Store_Cost and Store_Sales lets look at how it relates to other viarables such as Unit_Sales. "Blues" for Store Cost and "Reds" for Store_Sales

In [17]:
df['unit_sales'].unique()
Out[17]:
array([3, 2, 4, 5, 1, 6])
In [18]:
sns.boxplot(x='unit_sales', y='store_sales',data=df, palette="Reds")
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1cec07a0d0>
In [19]:
sns.violinplot(x='unit_sales', y='store_sales',data=df, palette="Reds")
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1ce7750490>
In [20]:
sns.boxplot(x='unit_sales', y='store_cost',data=df, palette="Blues")
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1ce767d550>
In [21]:
sns.violinplot(x='unit_sales', y='store_cost',data=df, palette="Blues")
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1ced991f10>

Part 1.2 There must be a correlation here, lets look at impact when it is plotted seperatly Store_Cost and Store_Sales

In [22]:
sns.lmplot(x='store_sales',y='store_cost',data=df,row='unit_sales',hue='unit_sales',palette='rainbow',aspect=2,size=5)
Out[22]:
<seaborn.axisgrid.FacetGrid at 0x7f1ce74830d0>
In [23]:
ls_result = []
ls_unique = df['unit_sales'].unique()
for i in range(len(ls_unique)):
    v1 = ls_unique[i]
    v2 = df[df['unit_sales'] == v1]['store_cost'].corr(df[df['unit_sales'] == v1]['store_sales']) 
    ls_result.append([v1,v2])
### adding initial corr as 0
v1 = 0
v2 = df['store_cost'].corr(df['store_sales'])
ls_result.append([v1,v2])
### into dataframe
df_result = pd.DataFrame(data = ls_result, columns = ['Unit_Sales','Correlation'])
df_result.sort_values('Unit_Sales', ascending=True)
Out[23]:
Unit_Sales Correlation
6 0 0.955345
4 1 0.950055
1 2 0.944478
0 3 0.939431
2 4 0.940386
3 5 0.941754
5 6 0.930578
In [24]:
sns.barplot(x='Unit_Sales',y='Correlation',data = df_result)
plt.ylim(.92,.97)
Out[24]:
(0.92, 0.97)

Result, adding "Unit_Sales" does not give us stronger correlations between Store_Sales and Store_Cost

Part 2 - Inspecting Cities

Citties.csv is a database of cities throuhgout the USA. We will use it to check to see if cities in the Margin_Mart Database is accurate.

In [25]:
df_cities = pd.read_csv('citties.csv')
In [26]:
df_cities.head()
Out[26]:
city city_ascii lat lng pop country iso2 iso3 province
0 Qal eh-ye Now Qal eh-ye 34.983000 63.133300 2997.0 Afghanistan AF AFG Badghis
1 Chaghcharan Chaghcharan 34.516701 65.250001 15000.0 Afghanistan AF AFG Ghor
2 Lashkar Gah Lashkar Gah 31.582998 64.360000 201546.0 Afghanistan AF AFG Hilmand
3 Zaranj Zaranj 31.112001 61.886998 49851.0 Afghanistan AF AFG Nimroz
4 Tarin Kowt Tarin Kowt 32.633298 65.866699 10000.0 Afghanistan AF AFG Uruzgan
In [27]:
df_cities.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7322 entries, 0 to 7321
Data columns (total 9 columns):
city          7322 non-null object
city_ascii    7322 non-null object
lat           7322 non-null float64
lng           7322 non-null float64
pop           7322 non-null float64
country       7322 non-null object
iso2          7291 non-null object
iso3          7322 non-null object
province      7203 non-null object
dtypes: float64(3), object(6)
memory usage: 514.9+ KB
In [28]:
list_TF = []
for i in range(len(df['city'])):
    loc = df['city'][i]  
    TF = loc in df_cities['city'].values                       
    list_TF.append([loc,TF])
           
df_TF = pd.DataFrame(data = list_TF, columns = ['CITY','T/F'])
In [29]:
df_TF.head()
Out[29]:
CITY T/F
0 Issaquah False
1 Issaquah False
2 Issaquah False
3 Issaquah False
4 Issaquah False

Cities that passed the test

In [30]:
df_TF[df_TF['T/F'] == True]['CITY'].unique()
Out[30]:
array(['San Andres', 'Richmond', 'Santa Fe', 'Everett', 'Concord',
       'La Cruz', 'Tacoma', 'Long Beach', 'National City', 'Shawnee',
       'Albany', 'Bremerton', 'Oakland', 'Salem', 'Glendale', 'Victoria',
       'Seattle', 'San Jose', 'Los Angeles', 'Walla Walla', 'Corvallis',
       'Spokane', 'Mexico City', 'Olympia', 'Bellingham', 'Merida',
       'Vancouver', 'San Gabriel', 'San Diego', 'Acapulco', 'Guadalajara',
       'Yakima', 'San Francisco', 'Orizaba', 'Portland', 'Berkeley'], dtype=object)

Cities that did not pass the test

In [31]:
df_TF[df_TF['T/F'] == False]['CITY'].unique()
Out[31]:
array(['Issaquah', 'Lynnwood', 'Lake Oswego', 'La Mesa', 'Lincoln Acres',
       'Altadena', 'Spring Valley', 'Renton', 'La Jolla', 'Haney',
       'Burbank', 'Anacortes', 'Milwaukie', 'Ladner', 'Oregon City',
       'Arcadia', 'Lebanon', 'W. Linn', 'Bellflower', 'Oak Bay', 'Camacho',
       'Tixapan', 'Grossmont', 'Burnaby', 'Langley', 'Santa Monica',
       'Beaverton', 'Metchosin', 'Westminster', 'Coronado', 'Ballard',
       'Port Orchard', 'Downey', 'Lemon Grove', 'N. Vancouver',
       'Woodland Hills', 'Sooke', 'Woodburn', 'Langford', 'Hidalgo',
       'Newton', 'West Covina', 'Palo Alto', 'Newport Beach',
       'Santa Anita', 'Kirkland', 'Chula Vista', 'Port Hammond',
       'Lakewood', 'Edmonds', 'Imperial Beach', 'El Cajon', 'Royal Oak',
       'Sedro Woolley', 'Cliffside', 'Torrance', 'Novato', 'Marysville',
       'Fremont', 'Pomona', 'Redmond', 'Burien', 'Mill Valley',
       'Burlingame', 'Beverly Hills', 'Puyallup', 'Daly City'], dtype=object)

Part 3 Country Inspection

In [32]:
df['country'].unique()
Out[32]:
array(['USA', 'Mexico', 'Canada'], dtype=object)

Do these countries exist, yes I think so....

Part 3.1 Does dividing Store_Cost and Store_Sales give a higher correlation?

In [33]:
sns.lmplot(x='store_sales',y='store_cost',data=df,row='country',hue='country',palette='Set1',aspect=2,size=5)
Out[33]:
<seaborn.axisgrid.FacetGrid at 0x7f1ce578d350>
In [34]:
ls_result = []
ls_unique = df['country'].unique()
for i in range(len(ls_unique)):
    v1 = ls_unique[i]
    v2 = df[df['country'] == v1]['store_cost'].corr(df[df['country'] == v1]['store_sales']) 
    ls_result.append([v1,v2])
### adding initial corr as 'Initial'
v1 = 'Initial'
v2 = df['store_cost'].corr(df['store_sales'])
ls_result.append([v1,v2])
### into dataframe
df_result = pd.DataFrame(data = ls_result, columns = ['Unit_Sales','Correlation'])
df_result.sort_values('Correlation', ascending=True)
Out[34]:
Unit_Sales Correlation
2 Canada 0.952998
1 Mexico 0.955060
3 Initial 0.955345
0 USA 0.956050
In [35]:
sns.barplot(x='Unit_Sales',y='Correlation',data = df_result)
plt.ylim(.95,.96)
Out[35]:
(0.95, 0.96)
In [36]:
(.956050-.955345)*100/(.955345)
Out[36]:
0.07379533048270057

Result, the countries do exist. More inspection is needed to match countries with cities. By looking at the correlation in the USA yields a 7% increase over our initial correlation.

Part 4 Product inspection

In [37]:
df.head()
Out[37]:
brand_name product_name product_subcategory product_category product_department product_family store_sales store_cost unit_sales date city state_province country
0 Ebony Ebony Lettuce Fresh Vegetables Vegetables Produce Food 10.53 4.2120 3 42092 Issaquah WA USA
1 Gorilla Gorilla String Cheese Cheese Dairy Dairy Food 5.37 1.7184 3 42111 Issaquah WA USA
2 Gorilla Gorilla Chocolate Milk Milk Dairy Dairy Drink 2.94 1.2348 2 42111 Issaquah WA USA
3 Musial Musial Spicy Mints Hard Candy Candy Snacks Food 8.76 2.8908 3 42111 Issaquah WA USA
4 Monarch Monarch Thai Rice Rice Starchy Foods Starchy Foods Food 6.22 1.8660 2 42111 Issaquah WA USA

Part 4.1 How many products does each vendor provide

In [38]:
list_brand = []
for i in range(110):
    df_brand = df['brand_name'].unique()
    
    v1 = df_brand[i]
    v2 = df[df['brand_name'] == df_brand[i]]['brand_name'].count()
    
    list_brand.append([v1,v2])
    
df_brand_count = pd.DataFrame(data=list_brand, columns = ('brand_name','num_products'))
In [39]:
df_plot_top = df_brand_count.sort_values(by=['num_products'],ascending=False).head(15)
In [40]:
sns.barplot(x='brand_name',y='num_products',data = df_plot_top, palette = 'Set1')
plt.xticks(rotation=90)
Out[40]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 <a list of 15 Text xticklabel objects>)

Part 4.2 'Product_Department' Inspection

In [41]:
ls_products = df['product_department'].unique()
ls_prod = []
for i in range(len(ls_products)):
    prod = ls_products[i]
    v1 = df[df['product_department'] == prod]['store_sales'].sum()
    v2 = df[df['product_department'] == prod]['store_cost'].sum()
    
    ls_prod.append([prod,v1,v2])

df_prod = pd.DataFrame(ls_prod)
df_prod.columns = ['Product_Department','Store_Sales','Store_Cost']
df_sorted = df_prod.sort_values('Store_Sales', ascending=False)
In [42]:
sns.barplot(x='Product_Department',y='Store_Sales',data = df_sorted, palette = 'Set2')
plt.xticks(rotation=90)
Out[42]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21]), <a list of 22 Text xticklabel objects>)

Part 4.3 'Product_Catagory' Inspection

In [43]:
ls_product_catagory = df['product_category'].unique()
ls_product_export =[]
for i in range(len(ls_product_catagory)):
    v1 = ls_product_catagory[i]
    v2 = df[df['product_category'] == v1]['store_sales'].sum()
    
    ls_product_export.append([v1,v2])
    
df_product_catagory = pd.DataFrame(ls_product_export)
df_product_catagory.columns = ['Product_Catagory','Store_Sales']
df_sorted = df_product_catagory.sort_values('Store_Sales', ascending=False).head(15)
In [44]:
sns.barplot(x='Product_Catagory',y='Store_Sales',data = df_sorted, palette = 'Set3')
plt.xticks(rotation=90)
Out[44]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 <a list of 15 Text xticklabel objects>)

Part 4.4 Looking at Product Catagory vs Product Department in different countries

In [45]:
ls_result = []
In [46]:
ls_product_catagory = df['product_category'].unique()
ls_products = df['product_department'].unique()
ls_countries = df['country'].unique()

for ii in range(len(ls_countries)):

    v0 = ls_countries[ii]
    
    for i in range(len(ls_product_catagory)):
        v1 = ls_product_catagory[i]
        v2 = df[df['product_category'] == v1]['store_sales'].sum()
    
        ls_result.append([v0,v1,v2,'product_catagory'])
    
    for i in range(len(ls_products)):
        v1 = ls_products[i]
        v3 = df[df['product_department'] == prod]['store_sales'].sum()
    
        ls_result.append([v0,v1,v3,'product_department'])


        
df_result = pd.DataFrame(ls_result)
In [47]:
ls_product_catagory = df['product_category'].unique()
ls_products = df['product_department'].unique()
ls_countries = df['country'].unique()

for ii in range(len(ls_countries)):

    v0 = ls_countries[ii]
       
    for i in range(len(ls_products)):
        v1 = ls_products[i]
        v3 = df[df['product_department'] == prod]['store_sales'].sum()
    
        ls_result.append([v0,v1,v3,'product_department'])
In [48]:
df[df['product_department'] == 'Beverages']['store_sales'].head()
Out[48]:
12     4.48
62     4.94
63     5.55
64    14.56
74     2.20
Name: store_sales, dtype: float64
In [49]:
df_result.head()
Out[49]:
0 1 2 3
0 USA Vegetables 8660.26 product_catagory
1 USA Dairy 4461.98 product_catagory
2 USA Candy 1910.89 product_catagory
3 USA Starchy Foods 1301.06 product_catagory
4 USA Kitchen Products 1399.61 product_catagory

Part 4.4.1 USA by Product Catagory vs Product Department

In [50]:
df_usa_catagory = df_result[(df_result[3]== 'product_catagory') & (df_result[0] == 'USA')].sort_values(2, ascending=False).head(15)

sns.barplot(x=1,y=2, data=df_usa_catagory, palette = 'coolwarm')
plt.xticks(rotation=90)
Out[50]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 <a list of 15 Text xticklabel objects>)
In [51]:
df_usa_department = df_result[(df_result[3]== 'product_department') & (df_result[0] == 'USA')].sort_values(2, ascending=False).head(15)

sns.barplot(x=1,y=2, data=df_usa_department, palette = 'coolwarm')
plt.xticks(rotation=90)
plt.ylim(600,800)
Out[51]:
(600, 800)

Part 4.4.2 Canada by Product Catagory vs Product Department

In [52]:
df_canada_catagory = df_result[(df_result[3]== 'product_catagory') & (df_result[0] == 'Canada')].sort_values(2, ascending=False).head(15)

sns.barplot(x=1,y=2, data=df_canada_catagory, palette = 'Reds')
plt.xticks(rotation=90)
Out[52]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 <a list of 15 Text xticklabel objects>)
In [53]:
df_canada_department = df_result[(df_result[3]== 'product_department') & (df_result[0] == 'Canada')].sort_values(2, ascending=False).head(15)

sns.barplot(x=1,y=2, data=df_canada_department, palette = 'Reds')
plt.xticks(rotation=90)
plt.ylim(700,800)
Out[53]:
(700, 800)