Imports and Headers

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
%matplotlib inline  
In [34]:
# Load the job postings from dice.csv (path is relative to the notebook's working directory)
df = pd.read_csv('dice.csv')
In [35]:
# Summarize column dtypes and non-null counts to see which columns have missing values
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 12 columns):
advertiserurl               22000 non-null object
company                     21950 non-null object
employmenttype_jobstatus    21770 non-null object
jobdescription              22000 non-null object
jobid                       22000 non-null object
joblocation_address         21997 non-null object
jobtitle                    22000 non-null object
postdate                    22000 non-null object
shift                       21643 non-null object
site_name                   3490 non-null object
skills                      21957 non-null object
uniq_id                     22000 non-null object
dtypes: object(12)
memory usage: 2.0+ MB
In [36]:
# Lowercase every text column so later keyword matching is case-insensitive.
# All 12 columns are object dtype (see df.info()), so the .str accessor applies to each.
# Unlike astype(str), .str.lower() leaves missing values as NaN instead of turning
# them into the literal string 'nan', so null checks on df_lower stay meaningful.
df_lower = df.apply(lambda column: column.str.lower())
In [37]:
# Preview the first rows of the lowercased frame
df_lower.head()
Out[37]:
advertiserurl company employmenttype_jobstatus jobdescription jobid joblocation_address jobtitle postdate shift site_name skills uniq_id
0 https://www.dice.com/jobs/detail/automation-te... digital intelligence systems, llc c2h corp-to-corp, c2h independent, c2h w2, 3 m... looking for selenium engineers...must have sol... dice id : 10110693 atlanta, ga automation test engineer 1 hour ago telecommuting not available|travel not required nan see below 418ff92580b270ef4e7c14f0ddfc36b4
1 https://www.dice.com/jobs/detail/information-s... university of chicago/it services full time the university of chicago has a rapidly growin... dice id : 10114469 chicago, il information security engineer 1 week ago telecommuting not available|travel not required nan linux/unix, network monitoring, incident respo... 8aec88cba08d53da65ab99cf20f6f9d9
2 https://www.dice.com/jobs/detail/business-solu... galaxy systems, inc. full time galaxe.solutionsevery day, our solutions affec... dice id : cxgalxys schaumburg, il business solutions architect 2 weeks ago telecommuting not available|travel not required nan enterprise solutions architecture, business in... 46baa1f69ac07779274bcd90b85d9a72
3 https://www.dice.com/jobs/detail/java-develope... transtech llc full time java developerfull-time/direct-hirebolingbrook... dice id : 10113627 bolingbrook, il java developer (mid level)- ft- great culture,... 2 weeks ago telecommuting not available|travel not required nan please see job description 3941b2f206ae0f900c4fba4ac0b18719
4 https://www.dice.com/jobs/detail/devops-engine... matrix resources full time midtown based high tech firm has an immediate ... dice id : matrixga atlanta, ga devops engineer 48 minutes ago telecommuting not available|travel not required nan configuration management, developer, linux, ma... 45efa1f6bc65acc32bbbb953a1ed13b7
In [38]:
# Keep only the postings whose lowercased job title contains the substring 'engineer'
df_engineers = df_lower.loc[df_lower['jobtitle'].str.contains('engineer')]
In [39]:
# Count whitespace-separated tokens across all engineer job titles and keep the
# 20 most common as (token, count) pairs.
# NOTE(review): the titles are joined with "" (no separator), so the last word of
# one title fuses with the first word of the next; the recorded output contains
# tokens such as 'engineersenior' and 'engineersoftware'. Joining with " " would
# avoid this, but the hard-coded positions and counts in the cells that follow
# depend on the current output and would have to be revised together with it.
# The empty-list assignment is immediately overwritten by the next line.
ls_engineer = []
ls_engineer = Counter("".join(df_engineers['jobtitle']).split()).most_common(20)
In [40]:
# Display the 20 most common title tokens as (token, count) pairs
ls_engineer
Out[40]:
[('engineer', 1497),
 ('-', 821),
 ('software', 652),
 ('engineersenior', 346),
 ('engineersoftware', 255),
 ('network', 219),
 ('/', 202),
 ('security', 194),
 ('systems', 192),
 ('engineersr.', 183),
 ('development', 169),
 ('engineernetwork', 164),
 ('test', 137),
 ('data', 135),
 ('and', 127),
 ('engineer,', 120),
 ('support', 116),
 ('in', 88),
 ('engineerdevops', 85),
 ('automation', 85)]
In [41]:
# Drop the tokens that do not name an engineering field: the generic word
# 'engineer', punctuation, connecting words, and fused seniority tokens such as
# 'engineersenior'. These are the same eight entries the positional deletes removed.
# Filtering by token instead of by list position keeps this cell idempotent:
# re-running it no longer deletes further, valid entries.
NOISE_TOKENS = {'engineer', '-', '/', 'engineersenior', 'engineersr.', 'and', 'engineer,', 'in'}
ls_engineer = [(token, count) for token, count in ls_engineer if token not in NOISE_TOKENS]
In [42]:
# Display the (token, count) pairs that remain after removing the noise entries
ls_engineer
Out[42]:
[('software', 652),
 ('engineersoftware', 255),
 ('network', 219),
 ('security', 194),
 ('systems', 192),
 ('development', 169),
 ('engineernetwork', 164),
 ('test', 137),
 ('data', 135),
 ('support', 116),
 ('engineerdevops', 85),
 ('automation', 85)]
In [43]:
# Build a DataFrame from the (token, count) pairs; with no column names given,
# the columns are labeled 0 (token) and 1 (count)
df_engr_count = pd.DataFrame(ls_engineer)
In [44]:
# Display the token counts as a DataFrame
df_engr_count
Out[44]:
0 1
0 software 652
1 engineersoftware 255
2 network 219
3 security 194
4 systems 192
5 development 169
6 engineernetwork 164
7 test 137
8 data 135
9 support 116
10 engineerdevops 85
11 automation 85
In [45]:
# Fold the fused-token counts into the fields they belong to: 'engineersoftware'
# into 'software' (row 0) and 'engineernetwork' into 'network' (row 2).
# The sums are derived from ls_engineer rather than typed in by hand, so they
# cannot drift from the counted data (here 652 + 255 = 907 and 219 + 164 = 383).
token_counts = dict(ls_engineer)
df_engr_count.at[0, 1] = token_counts['software'] + token_counts['engineersoftware']
df_engr_count.at[2, 1] = token_counts['network'] + token_counts['engineernetwork']
In [46]:
# Remove the fused-token rows now that their counts live in 'software' and
# 'network': label 1 ('engineersoftware') and label 6 ('engineernetwork').
# Dropping by label with errors='ignore' makes re-running this cell a no-op;
# the positional drops it replaces removed two more (valid) rows on every re-run.
df_engr_count = df_engr_count.drop(index=[1, 6], errors='ignore')
In [47]:
# Display the frame after merging counts and dropping the fused-token rows
df_engr_count
Out[47]:
0 1
0 software 907
2 network 383
3 security 194
4 systems 192
5 development 169
7 test 137
8 data 135
9 support 116
10 engineerdevops 85
11 automation 85
In [48]:
# Count the rows that remain
len(df_engr_count)
Out[48]:
10
In [49]:
# Replace the default column labels (0 and 1) with descriptive names
df_engr_count.columns = ['Job_Title','#_of_Listings']
In [50]:
# Capitalize each job-title keyword for display, using the vectorized string accessor
df_engr_count['Job_Title'] = df_engr_count['Job_Title'].str.capitalize()
In [51]:
# Bar chart of the number of listings per job-title keyword, with the x tick
# labels rotated 90 degrees
sns.barplot(data=df_engr_count, x='Job_Title', y='#_of_Listings')
plt.xticks(rotation=90)
Out[51]:
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), <a list of 10 Text xticklabel objects>)
In [52]:
# Display the final table of job-title keywords and their listing counts
df_engr_count
Out[52]:
Job_Title #_of_Listings
0 Software 907
2 Network 383
3 Security 194
4 Systems 192
5 Development 169
7 Test 137
8 Data 135
9 Support 116
10 Engineerdevops 85
11 Automation 85

Answer: Software, network, and security engineering appear to be the highest-demand engineering fields, based on how often these keywords occur in job titles containing "engineer".