{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Read Data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"ibm = pd.read_csv('/WA_Fn-UseC_-HR-Employee-Attrition.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dateset Information"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1470, 35)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibm.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>DailyRate</th>\n",
" <th>DistanceFromHome</th>\n",
" <th>Education</th>\n",
" <th>EmployeeCount</th>\n",
" <th>EmployeeNumber</th>\n",
" <th>EnvironmentSatisfaction</th>\n",
" <th>HourlyRate</th>\n",
" <th>JobInvolvement</th>\n",
" <th>JobLevel</th>\n",
" <th>JobSatisfaction</th>\n",
" <th>MonthlyIncome</th>\n",
" <th>MonthlyRate</th>\n",
" <th>NumCompaniesWorked</th>\n",
" <th>PercentSalaryHike</th>\n",
" <th>PerformanceRating</th>\n",
" <th>RelationshipSatisfaction</th>\n",
" <th>StandardHours</th>\n",
" <th>StockOptionLevel</th>\n",
" <th>TotalWorkingYears</th>\n",
" <th>TrainingTimesLastYear</th>\n",
" <th>WorkLifeBalance</th>\n",
" <th>YearsAtCompany</th>\n",
" <th>YearsInCurrentRole</th>\n",
" <th>YearsSinceLastPromotion</th>\n",
" <th>YearsWithCurrManager</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.0</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.0</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>36.923810</td>\n",
" <td>802.485714</td>\n",
" <td>9.192517</td>\n",
" <td>2.912925</td>\n",
" <td>1.0</td>\n",
" <td>1024.865306</td>\n",
" <td>2.721769</td>\n",
" <td>65.891156</td>\n",
" <td>2.729932</td>\n",
" <td>2.063946</td>\n",
" <td>2.728571</td>\n",
" <td>6502.931293</td>\n",
" <td>14313.103401</td>\n",
" <td>2.693197</td>\n",
" <td>15.209524</td>\n",
" <td>3.153741</td>\n",
" <td>2.712245</td>\n",
" <td>80.0</td>\n",
" <td>0.793878</td>\n",
" <td>11.279592</td>\n",
" <td>2.799320</td>\n",
" <td>2.761224</td>\n",
" <td>7.008163</td>\n",
" <td>4.229252</td>\n",
" <td>2.187755</td>\n",
" <td>4.123129</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>9.135373</td>\n",
" <td>403.509100</td>\n",
" <td>8.106864</td>\n",
" <td>1.024165</td>\n",
" <td>0.0</td>\n",
" <td>602.024335</td>\n",
" <td>1.093082</td>\n",
" <td>20.329428</td>\n",
" <td>0.711561</td>\n",
" <td>1.106940</td>\n",
" <td>1.102846</td>\n",
" <td>4707.956783</td>\n",
" <td>7117.786044</td>\n",
" <td>2.498009</td>\n",
" <td>3.659938</td>\n",
" <td>0.360824</td>\n",
" <td>1.081209</td>\n",
" <td>0.0</td>\n",
" <td>0.852077</td>\n",
" <td>7.780782</td>\n",
" <td>1.289271</td>\n",
" <td>0.706476</td>\n",
" <td>6.126525</td>\n",
" <td>3.623137</td>\n",
" <td>3.222430</td>\n",
" <td>3.568136</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>18.000000</td>\n",
" <td>102.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>30.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1009.000000</td>\n",
" <td>2094.000000</td>\n",
" <td>0.000000</td>\n",
" <td>11.000000</td>\n",
" <td>3.000000</td>\n",
" <td>1.000000</td>\n",
" <td>80.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>30.000000</td>\n",
" <td>465.000000</td>\n",
" <td>2.000000</td>\n",
" <td>2.000000</td>\n",
" <td>1.0</td>\n",
" <td>491.250000</td>\n",
" <td>2.000000</td>\n",
" <td>48.000000</td>\n",
" <td>2.000000</td>\n",
" <td>1.000000</td>\n",
" <td>2.000000</td>\n",
" <td>2911.000000</td>\n",
" <td>8047.000000</td>\n",
" <td>1.000000</td>\n",
" <td>12.000000</td>\n",
" <td>3.000000</td>\n",
" <td>2.000000</td>\n",
" <td>80.0</td>\n",
" <td>0.000000</td>\n",
" <td>6.000000</td>\n",
" <td>2.000000</td>\n",
" <td>2.000000</td>\n",
" <td>3.000000</td>\n",
" <td>2.000000</td>\n",
" <td>0.000000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>36.000000</td>\n",
" <td>802.000000</td>\n",
" <td>7.000000</td>\n",
" <td>3.000000</td>\n",
" <td>1.0</td>\n",
" <td>1020.500000</td>\n",
" <td>3.000000</td>\n",
" <td>66.000000</td>\n",
" <td>3.000000</td>\n",
" <td>2.000000</td>\n",
" <td>3.000000</td>\n",
" <td>4919.000000</td>\n",
" <td>14235.500000</td>\n",
" <td>2.000000</td>\n",
" <td>14.000000</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>80.0</td>\n",
" <td>1.000000</td>\n",
" <td>10.000000</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>5.000000</td>\n",
" <td>3.000000</td>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>43.000000</td>\n",
" <td>1157.000000</td>\n",
" <td>14.000000</td>\n",
" <td>4.000000</td>\n",
" <td>1.0</td>\n",
" <td>1555.750000</td>\n",
" <td>4.000000</td>\n",
" <td>83.750000</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>4.000000</td>\n",
" <td>8379.000000</td>\n",
" <td>20461.500000</td>\n",
" <td>4.000000</td>\n",
" <td>18.000000</td>\n",
" <td>3.000000</td>\n",
" <td>4.000000</td>\n",
" <td>80.0</td>\n",
" <td>1.000000</td>\n",
" <td>15.000000</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>9.000000</td>\n",
" <td>7.000000</td>\n",
" <td>3.000000</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>60.000000</td>\n",
" <td>1499.000000</td>\n",
" <td>29.000000</td>\n",
" <td>5.000000</td>\n",
" <td>1.0</td>\n",
" <td>2068.000000</td>\n",
" <td>4.000000</td>\n",
" <td>100.000000</td>\n",
" <td>4.000000</td>\n",
" <td>5.000000</td>\n",
" <td>4.000000</td>\n",
" <td>19999.000000</td>\n",
" <td>26999.000000</td>\n",
" <td>9.000000</td>\n",
" <td>25.000000</td>\n",
" <td>4.000000</td>\n",
" <td>4.000000</td>\n",
" <td>80.0</td>\n",
" <td>3.000000</td>\n",
" <td>40.000000</td>\n",
" <td>6.000000</td>\n",
" <td>4.000000</td>\n",
" <td>40.000000</td>\n",
" <td>18.000000</td>\n",
" <td>15.000000</td>\n",
" <td>17.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age DailyRate DistanceFromHome Education EmployeeCount \\\n",
"count 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 \n",
"mean 36.923810 802.485714 9.192517 2.912925 1.0 \n",
"std 9.135373 403.509100 8.106864 1.024165 0.0 \n",
"min 18.000000 102.000000 1.000000 1.000000 1.0 \n",
"25% 30.000000 465.000000 2.000000 2.000000 1.0 \n",
"50% 36.000000 802.000000 7.000000 3.000000 1.0 \n",
"75% 43.000000 1157.000000 14.000000 4.000000 1.0 \n",
"max 60.000000 1499.000000 29.000000 5.000000 1.0 \n",
"\n",
" EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement \\\n",
"count 1470.000000 1470.000000 1470.000000 1470.000000 \n",
"mean 1024.865306 2.721769 65.891156 2.729932 \n",
"std 602.024335 1.093082 20.329428 0.711561 \n",
"min 1.000000 1.000000 30.000000 1.000000 \n",
"25% 491.250000 2.000000 48.000000 2.000000 \n",
"50% 1020.500000 3.000000 66.000000 3.000000 \n",
"75% 1555.750000 4.000000 83.750000 3.000000 \n",
"max 2068.000000 4.000000 100.000000 4.000000 \n",
"\n",
" JobLevel JobSatisfaction MonthlyIncome MonthlyRate \\\n",
"count 1470.000000 1470.000000 1470.000000 1470.000000 \n",
"mean 2.063946 2.728571 6502.931293 14313.103401 \n",
"std 1.106940 1.102846 4707.956783 7117.786044 \n",
"min 1.000000 1.000000 1009.000000 2094.000000 \n",
"25% 1.000000 2.000000 2911.000000 8047.000000 \n",
"50% 2.000000 3.000000 4919.000000 14235.500000 \n",
"75% 3.000000 4.000000 8379.000000 20461.500000 \n",
"max 5.000000 4.000000 19999.000000 26999.000000 \n",
"\n",
" NumCompaniesWorked PercentSalaryHike PerformanceRating \\\n",
"count 1470.000000 1470.000000 1470.000000 \n",
"mean 2.693197 15.209524 3.153741 \n",
"std 2.498009 3.659938 0.360824 \n",
"min 0.000000 11.000000 3.000000 \n",
"25% 1.000000 12.000000 3.000000 \n",
"50% 2.000000 14.000000 3.000000 \n",
"75% 4.000000 18.000000 3.000000 \n",
"max 9.000000 25.000000 4.000000 \n",
"\n",
" RelationshipSatisfaction StandardHours StockOptionLevel \\\n",
"count 1470.000000 1470.0 1470.000000 \n",
"mean 2.712245 80.0 0.793878 \n",
"std 1.081209 0.0 0.852077 \n",
"min 1.000000 80.0 0.000000 \n",
"25% 2.000000 80.0 0.000000 \n",
"50% 3.000000 80.0 1.000000 \n",
"75% 4.000000 80.0 1.000000 \n",
"max 4.000000 80.0 3.000000 \n",
"\n",
" TotalWorkingYears TrainingTimesLastYear WorkLifeBalance \\\n",
"count 1470.000000 1470.000000 1470.000000 \n",
"mean 11.279592 2.799320 2.761224 \n",
"std 7.780782 1.289271 0.706476 \n",
"min 0.000000 0.000000 1.000000 \n",
"25% 6.000000 2.000000 2.000000 \n",
"50% 10.000000 3.000000 3.000000 \n",
"75% 15.000000 3.000000 3.000000 \n",
"max 40.000000 6.000000 4.000000 \n",
"\n",
" YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion \\\n",
"count 1470.000000 1470.000000 1470.000000 \n",
"mean 7.008163 4.229252 2.187755 \n",
"std 6.126525 3.623137 3.222430 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 3.000000 2.000000 0.000000 \n",
"50% 5.000000 3.000000 1.000000 \n",
"75% 9.000000 7.000000 3.000000 \n",
"max 40.000000 18.000000 15.000000 \n",
"\n",
" YearsWithCurrManager \n",
"count 1470.000000 \n",
"mean 4.123129 \n",
"std 3.568136 \n",
"min 0.000000 \n",
"25% 2.000000 \n",
"50% 3.000000 \n",
"75% 7.000000 \n",
"max 17.000000 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibm.describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Age mode: 35\n",
"Attrition mode: No\n",
"BusinessTravel mode: Travel_Rarely\n",
"DailyRate mode: 691\n",
"Department mode: Research & Development\n",
"DistanceFromHome mode: 2\n",
"Education mode: 3\n",
"EducationField mode: Life Sciences\n",
"EmployeeCount mode: 1\n",
"EmployeeNumber mode: 1\n",
"EnvironmentSatisfaction mode: 3\n",
"Gender mode: Male\n",
"HourlyRate mode: 66\n",
"JobInvolvement mode: 3\n",
"JobLevel mode: 1\n",
"JobRole mode: Sales Executive\n",
"JobSatisfaction mode: 4\n",
"MaritalStatus mode: Married\n",
"MonthlyIncome mode: 2342\n",
"MonthlyRate mode: 9150\n",
"NumCompaniesWorked mode: 1\n",
"Over18 mode: Y\n",
"OverTime mode: No\n",
"PercentSalaryHike mode: 11\n",
"PerformanceRating mode: 3\n",
"RelationshipSatisfaction mode: 3\n",
"StandardHours mode: 80\n",
"StockOptionLevel mode: 0\n",
"TotalWorkingYears mode: 10\n",
"TrainingTimesLastYear mode: 2\n",
"WorkLifeBalance mode: 3\n",
"YearsAtCompany mode: 5\n",
"YearsInCurrentRole mode: 2\n",
"YearsSinceLastPromotion mode: 0\n",
"YearsWithCurrManager mode: 2\n"
]
}
],
"source": [
"import statistics\n",
"for i in ibm.columns:\n",
" print(i, \" mode: \", statistics.mode(ibm[i]));"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1470 entries, 0 to 1469\n",
"Data columns (total 35 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Age 1470 non-null int64 \n",
" 1 Attrition 1470 non-null object\n",
" 2 BusinessTravel 1470 non-null object\n",
" 3 DailyRate 1470 non-null int64 \n",
" 4 Department 1470 non-null object\n",
" 5 DistanceFromHome 1470 non-null int64 \n",
" 6 Education 1470 non-null int64 \n",
" 7 EducationField 1470 non-null object\n",
" 8 EmployeeCount 1470 non-null int64 \n",
" 9 EmployeeNumber 1470 non-null int64 \n",
" 10 EnvironmentSatisfaction 1470 non-null int64 \n",
" 11 Gender 1470 non-null object\n",
" 12 HourlyRate 1470 non-null int64 \n",
" 13 JobInvolvement 1470 non-null int64 \n",
" 14 JobLevel 1470 non-null int64 \n",
" 15 JobRole 1470 non-null object\n",
" 16 JobSatisfaction 1470 non-null int64 \n",
" 17 MaritalStatus 1470 non-null object\n",
" 18 MonthlyIncome 1470 non-null int64 \n",
" 19 MonthlyRate 1470 non-null int64 \n",
" 20 NumCompaniesWorked 1470 non-null int64 \n",
" 21 Over18 1470 non-null object\n",
" 22 OverTime 1470 non-null object\n",
" 23 PercentSalaryHike 1470 non-null int64 \n",
" 24 PerformanceRating 1470 non-null int64 \n",
" 25 RelationshipSatisfaction 1470 non-null int64 \n",
" 26 StandardHours 1470 non-null int64 \n",
" 27 StockOptionLevel 1470 non-null int64 \n",
" 28 TotalWorkingYears 1470 non-null int64 \n",
" 29 TrainingTimesLastYear 1470 non-null int64 \n",
" 30 WorkLifeBalance 1470 non-null int64 \n",
" 31 YearsAtCompany 1470 non-null int64 \n",
" 32 YearsInCurrentRole 1470 non-null int64 \n",
" 33 YearsSinceLastPromotion 1470 non-null int64 \n",
" 34 YearsWithCurrManager 1470 non-null int64 \n",
"dtypes: int64(26), object(9)\n",
"memory usage: 402.1+ KB\n"
]
}
],
"source": [
"ibm.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"ibm.drop(columns = 'EmployeeCount', inplace = True)\n",
"ibm.drop(columns = 'EmployeeNumber', inplace = True)\n",
"ibm.drop(columns = 'Over18', inplace = True)\n",
"ibm.drop(columns = 'StandardHours', inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Attrition</th>\n",
" <th>BusinessTravel</th>\n",
" <th>DailyRate</th>\n",
" <th>Department</th>\n",
" <th>DistanceFromHome</th>\n",
" <th>Education</th>\n",
" <th>EducationField</th>\n",
" <th>EnvironmentSatisfaction</th>\n",
" <th>Gender</th>\n",
" <th>HourlyRate</th>\n",
" <th>JobInvolvement</th>\n",
" <th>JobLevel</th>\n",
" <th>JobRole</th>\n",
" <th>JobSatisfaction</th>\n",
" <th>MaritalStatus</th>\n",
" <th>MonthlyIncome</th>\n",
" <th>MonthlyRate</th>\n",
" <th>NumCompaniesWorked</th>\n",
" <th>OverTime</th>\n",
" <th>PercentSalaryHike</th>\n",
" <th>PerformanceRating</th>\n",
" <th>RelationshipSatisfaction</th>\n",
" <th>StockOptionLevel</th>\n",
" <th>TotalWorkingYears</th>\n",
" <th>TrainingTimesLastYear</th>\n",
" <th>WorkLifeBalance</th>\n",
" <th>YearsAtCompany</th>\n",
" <th>YearsInCurrentRole</th>\n",
" <th>YearsSinceLastPromotion</th>\n",
" <th>YearsWithCurrManager</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>41</td>\n",
" <td>Yes</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>1102</td>\n",
" <td>Sales</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>Life Sciences</td>\n",
" <td>2</td>\n",
" <td>Female</td>\n",
" <td>94</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>Sales Executive</td>\n",
" <td>4</td>\n",
" <td>Single</td>\n",
" <td>5993</td>\n",
" <td>19479</td>\n",
" <td>8</td>\n",
" <td>Yes</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49</td>\n",
" <td>No</td>\n",
" <td>Travel_Frequently</td>\n",
" <td>279</td>\n",
" <td>Research & Development</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>Life Sciences</td>\n",
" <td>3</td>\n",
" <td>Male</td>\n",
" <td>61</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>Research Scientist</td>\n",
" <td>2</td>\n",
" <td>Married</td>\n",
" <td>5130</td>\n",
" <td>24907</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" <td>23</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>10</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>37</td>\n",
" <td>Yes</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>1373</td>\n",
" <td>Research & Development</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>Other</td>\n",
" <td>4</td>\n",
" <td>Male</td>\n",
" <td>92</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>Laboratory Technician</td>\n",
" <td>3</td>\n",
" <td>Single</td>\n",
" <td>2090</td>\n",
" <td>2396</td>\n",
" <td>6</td>\n",
" <td>Yes</td>\n",
" <td>15</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>33</td>\n",
" <td>No</td>\n",
" <td>Travel_Frequently</td>\n",
" <td>1392</td>\n",
" <td>Research & Development</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>Life Sciences</td>\n",
" <td>4</td>\n",
" <td>Female</td>\n",
" <td>56</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Research Scientist</td>\n",
" <td>3</td>\n",
" <td>Married</td>\n",
" <td>2909</td>\n",
" <td>23159</td>\n",
" <td>1</td>\n",
" <td>Yes</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>27</td>\n",
" <td>No</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>591</td>\n",
" <td>Research & Development</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>Medical</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" <td>40</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Laboratory Technician</td>\n",
" <td>2</td>\n",
" <td>Married</td>\n",
" <td>3468</td>\n",
" <td>16632</td>\n",
" <td>9</td>\n",
" <td>No</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1465</th>\n",
" <td>36</td>\n",
" <td>No</td>\n",
" <td>Travel_Frequently</td>\n",
" <td>884</td>\n",
" <td>Research & Development</td>\n",
" <td>23</td>\n",
" <td>2</td>\n",
" <td>Medical</td>\n",
" <td>3</td>\n",
" <td>Male</td>\n",
" <td>41</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>Laboratory Technician</td>\n",
" <td>4</td>\n",
" <td>Married</td>\n",
" <td>2571</td>\n",
" <td>12290</td>\n",
" <td>4</td>\n",
" <td>No</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1466</th>\n",
" <td>39</td>\n",
" <td>No</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>613</td>\n",
" <td>Research & Development</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>Medical</td>\n",
" <td>4</td>\n",
" <td>Male</td>\n",
" <td>42</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>Healthcare Representative</td>\n",
" <td>1</td>\n",
" <td>Married</td>\n",
" <td>9991</td>\n",
" <td>21457</td>\n",
" <td>4</td>\n",
" <td>No</td>\n",
" <td>15</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1467</th>\n",
" <td>27</td>\n",
" <td>No</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>155</td>\n",
" <td>Research & Development</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>Life Sciences</td>\n",
" <td>2</td>\n",
" <td>Male</td>\n",
" <td>87</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>Manufacturing Director</td>\n",
" <td>2</td>\n",
" <td>Married</td>\n",
" <td>6142</td>\n",
" <td>5174</td>\n",
" <td>1</td>\n",
" <td>Yes</td>\n",
" <td>20</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1468</th>\n",
" <td>49</td>\n",
" <td>No</td>\n",
" <td>Travel_Frequently</td>\n",
" <td>1023</td>\n",
" <td>Sales</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>Medical</td>\n",
" <td>4</td>\n",
" <td>Male</td>\n",
" <td>63</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>Sales Executive</td>\n",
" <td>2</td>\n",
" <td>Married</td>\n",
" <td>5390</td>\n",
" <td>13243</td>\n",
" <td>2</td>\n",
" <td>No</td>\n",
" <td>14</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1469</th>\n",
" <td>34</td>\n",
" <td>No</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>628</td>\n",
" <td>Research & Development</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" <td>Medical</td>\n",
" <td>2</td>\n",
" <td>Male</td>\n",
" <td>82</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>Laboratory Technician</td>\n",
" <td>3</td>\n",
" <td>Married</td>\n",
" <td>4404</td>\n",
" <td>10228</td>\n",
" <td>2</td>\n",
" <td>No</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1470 rows Γ 31 columns</p>\n",
"</div>"
],
"text/plain": [
" Age Attrition BusinessTravel DailyRate Department \\\n",
"0 41 Yes Travel_Rarely 1102 Sales \n",
"1 49 No Travel_Frequently 279 Research & Development \n",
"2 37 Yes Travel_Rarely 1373 Research & Development \n",
"3 33 No Travel_Frequently 1392 Research & Development \n",
"4 27 No Travel_Rarely 591 Research & Development \n",
"... ... ... ... ... ... \n",
"1465 36 No Travel_Frequently 884 Research & Development \n",
"1466 39 No Travel_Rarely 613 Research & Development \n",
"1467 27 No Travel_Rarely 155 Research & Development \n",
"1468 49 No Travel_Frequently 1023 Sales \n",
"1469 34 No Travel_Rarely 628 Research & Development \n",
"\n",
" DistanceFromHome Education EducationField EnvironmentSatisfaction \\\n",
"0 1 2 Life Sciences 2 \n",
"1 8 1 Life Sciences 3 \n",
"2 2 2 Other 4 \n",
"3 3 4 Life Sciences 4 \n",
"4 2 1 Medical 1 \n",
"... ... ... ... ... \n",
"1465 23 2 Medical 3 \n",
"1466 6 1 Medical 4 \n",
"1467 4 3 Life Sciences 2 \n",
"1468 2 3 Medical 4 \n",
"1469 8 3 Medical 2 \n",
"\n",
" Gender HourlyRate JobInvolvement JobLevel JobRole \\\n",
"0 Female 94 3 2 Sales Executive \n",
"1 Male 61 2 2 Research Scientist \n",
"2 Male 92 2 1 Laboratory Technician \n",
"3 Female 56 3 1 Research Scientist \n",
"4 Male 40 3 1 Laboratory Technician \n",
"... ... ... ... ... ... \n",
"1465 Male 41 4 2 Laboratory Technician \n",
"1466 Male 42 2 3 Healthcare Representative \n",
"1467 Male 87 4 2 Manufacturing Director \n",
"1468 Male 63 2 2 Sales Executive \n",
"1469 Male 82 4 2 Laboratory Technician \n",
"\n",
" JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate \\\n",
"0 4 Single 5993 19479 \n",
"1 2 Married 5130 24907 \n",
"2 3 Single 2090 2396 \n",
"3 3 Married 2909 23159 \n",
"4 2 Married 3468 16632 \n",
"... ... ... ... ... \n",
"1465 4 Married 2571 12290 \n",
"1466 1 Married 9991 21457 \n",
"1467 2 Married 6142 5174 \n",
"1468 2 Married 5390 13243 \n",
"1469 3 Married 4404 10228 \n",
"\n",
" NumCompaniesWorked OverTime PercentSalaryHike PerformanceRating \\\n",
"0 8 Yes 11 3 \n",
"1 1 No 23 4 \n",
"2 6 Yes 15 3 \n",
"3 1 Yes 11 3 \n",
"4 9 No 12 3 \n",
"... ... ... ... ... \n",
"1465 4 No 17 3 \n",
"1466 4 No 15 3 \n",
"1467 1 Yes 20 4 \n",
"1468 2 No 14 3 \n",
"1469 2 No 12 3 \n",
"\n",
" RelationshipSatisfaction StockOptionLevel TotalWorkingYears \\\n",
"0 1 0 8 \n",
"1 4 1 10 \n",
"2 2 0 7 \n",
"3 3 0 8 \n",
"4 4 1 6 \n",
"... ... ... ... \n",
"1465 3 1 17 \n",
"1466 1 1 9 \n",
"1467 2 1 6 \n",
"1468 4 0 17 \n",
"1469 1 0 6 \n",
"\n",
" TrainingTimesLastYear WorkLifeBalance YearsAtCompany \\\n",
"0 0 1 6 \n",
"1 3 3 10 \n",
"2 3 3 0 \n",
"3 3 3 8 \n",
"4 3 3 2 \n",
"... ... ... ... \n",
"1465 3 3 5 \n",
"1466 5 3 7 \n",
"1467 0 3 6 \n",
"1468 3 2 9 \n",
"1469 3 4 4 \n",
"\n",
" YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager \n",
"0 4 0 5 \n",
"1 7 1 7 \n",
"2 0 0 0 \n",
"3 7 3 0 \n",
"4 2 2 2 \n",
"... ... ... ... \n",
"1465 2 0 3 \n",
"1466 7 1 7 \n",
"1467 2 0 3 \n",
"1468 6 0 8 \n",
"1469 3 1 2 \n",
"\n",
"[1470 rows x 31 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibm.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Age 0\n",
"Attrition 0\n",
"BusinessTravel 0\n",
"DailyRate 0\n",
"Department 0\n",
"DistanceFromHome 0\n",
"Education 0\n",
"EducationField 0\n",
"EnvironmentSatisfaction 0\n",
"Gender 0\n",
"HourlyRate 0\n",
"JobInvolvement 0\n",
"JobLevel 0\n",
"JobRole 0\n",
"JobSatisfaction 0\n",
"MaritalStatus 0\n",
"MonthlyIncome 0\n",
"MonthlyRate 0\n",
"NumCompaniesWorked 0\n",
"OverTime 0\n",
"PercentSalaryHike 0\n",
"PerformanceRating 0\n",
"RelationshipSatisfaction 0\n",
"StockOptionLevel 0\n",
"TotalWorkingYears 0\n",
"TrainingTimesLastYear 0\n",
"WorkLifeBalance 0\n",
"YearsAtCompany 0\n",
"YearsInCurrentRole 0\n",
"YearsSinceLastPromotion 0\n",
"YearsWithCurrManager 0\n",
"dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibm.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# replace Attrition (0 - No, 1 - Yes)\n",
"ibm.replace({'Attrition' : {'Yes': 1, 'No': 0}}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# replace BusinessTravel (0 - Non-Travel, 1 - Travel_Rarely, 2 - Travel_Frequently)\n",
"ibm.replace({'BusinessTravel' : {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2}}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"#Department\n",
"dummy = pd.get_dummies(ibm['Department'])\n",
"ibm.insert(5,'Dp_Sales&Development', dummy['Research & Development'])\n",
"ibm.insert(6,'Dp_Sales', dummy['Sales'])\n",
"ibm.insert(7,'Dp_HumanResources', dummy['Human Resources'])\n",
"\n",
"ibm.drop(columns = 'Department', inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"#EducationField\n",
"dummy = pd.get_dummies(ibm['EducationField'])\n",
"ibm.insert(11,'EF_Life Sciences',dummy['Life Sciences'])\n",
"ibm.insert(12,'EF_Medical',dummy['Medical'])\n",
"ibm.insert(13,'EF_Marketing',dummy['Marketing'])\n",
"ibm.insert(14,'EF_TechnicalDegree',dummy['Technical Degree'])\n",
"ibm.insert(15,'EF_HumanResources',dummy['Human Resources'])\n",
"ibm.insert(16,'EF_Other',dummy['Other'])\n",
"\n",
"ibm.drop(columns = 'EducationField', inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# replace Gender (0 - Male; 1 - Female)\n",
"ibm.replace({'Gender': {'Male': 0, 'Female': 1}}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Job role dummy variables\n",
"dummy=pd.get_dummies(ibm['JobRole'])\n",
"ibm.insert(23, 'JR_HealthcareRepresentive', dummy['Healthcare Representative'])\n",
"ibm.insert(24, 'JR_HumanResource', dummy['Human Resources'])\n",
"ibm.insert(25, 'JR_LaboratoryTechnician', dummy['Laboratory Technician'])\n",
"ibm.insert(26, 'JR_Manager', dummy['Manager'])\n",
"ibm.insert(27, 'JR_ManufacturingDirector', dummy['Manufacturing Director'])\n",
"ibm.insert(28, 'JR_ResearchDirector', dummy['Research Director'])\n",
"ibm.insert(29, 'JR_ResearchScientist', dummy['Research Scientist'])\n",
"ibm.insert(30, 'JR_SalesExecutive', dummy['Sales Executive'])\n",
"ibm.insert(31, 'JR_SalesRepresentative', dummy['Sales Representative'])\n",
"\n",
"ibm.drop(columns = 'JobRole', inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# MaritalStatus role dummy variables\n",
"dummy=pd.get_dummies(ibm['MaritalStatus'])\n",
"ibm.insert(34, 'MS_Married', dummy['Married'])\n",
"ibm.insert(35, 'MS_Single', dummy['Single'])\n",
"ibm.insert(36, 'MS_Divorced', dummy['Divorced'])\n",
"\n",
"ibm.drop(columns = 'MaritalStatus', inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# replace Overtime (0 - No; 1 - Yes)\n",
"ibm.replace({'OverTime': {'No': 0, 'Yes': 1}}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# replace Over18 (0 - N; 1 - Y)\n",
"ibm.replace({'Over18': {'N': 0, 'Y': 1}}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def iqr_outliers(data):\n",
" out=[]\n",
" \n",
" firstQuartile = data.quantile(0.25)\n",
" thirdQuartile = data.quantile(0.75)\n",
" \n",
" iqr = thirdQuartile-firstQuartile\n",
" \n",
" Lower_bound = firstQuartile - 1.5 * iqr\n",
" Upper_bound = thirdQuartile + 1.5 * iqr\n",
" \n",
" for i in data:\n",
" if i > Upper_bound or i < Lower_bound:\n",
" out.append(i)\n",
" \n",
" print(\"Outliers:\",out , \"\\nCount: \", len(out), \"\\n\")\n",
" return out"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Age\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"Attrition\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 237 \n",
"\n",
"BusinessTravel\n",
"Outliers: [2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2] \n",
"Count: 427 \n",
"\n",
"DailyRate\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"Dp_Sales&Development\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"Dp_Sales\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"Dp_HumanResources\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 63 \n",
"\n",
"DistanceFromHome\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"Education\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"EnvironmentSatisfaction\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"EF_Life Sciences\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"EF_Medical\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"EF_Marketing\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 159 \n",
"\n",
"EF_TechnicalDegree\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 132 \n",
"\n",
"EF_HumanResources\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 27 \n",
"\n",
"EF_Other\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 82 \n",
"\n",
"Gender\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"HourlyRate\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"JobInvolvement\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"JobLevel\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"JobSatisfaction\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"JR_HealthcareRepresentive\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 131 \n",
"\n",
"JR_HumanResource\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 52 \n",
"\n",
"JR_LaboratoryTechnician\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 259 \n",
"\n",
"JR_Manager\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 102 \n",
"\n",
"JR_ManufacturingDirector\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 145 \n",
"\n",
"JR_ResearchDirector\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 80 \n",
"\n",
"JR_ResearchScientist\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 292 \n",
"\n",
"JR_SalesExecutive\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 326 \n",
"\n",
"JR_SalesRepresentative\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 83 \n",
"\n",
"MonthlyIncome\n",
"Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880] \n",
"Count: 114 \n",
"\n",
"MonthlyRate\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"NumCompaniesWorked\n",
"Outliers: [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9] \n",
"Count: 52 \n",
"\n",
"MS_Married\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"MS_Single\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"MS_Divorced\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 327 \n",
"\n",
"OverTime\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"PercentSalaryHike\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"PerformanceRating\n",
"Outliers: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4] \n",
"Count: 226 \n",
"\n",
"RelationshipSatisfaction\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"StockOptionLevel\n",
"Outliers: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] \n",
"Count: 85 \n",
"\n",
"TotalWorkingYears\n",
"Outliers: [31, 29, 37, 38, 30, 40, 36, 34, 32, 33, 37, 30, 36, 31, 33, 32, 37, 31, 32, 32, 30, 34, 30, 40, 29, 35, 31, 33, 31, 29, 32, 30, 33, 30, 29, 31, 32, 33, 36, 34, 31, 36, 33, 31, 29, 33, 29, 32, 31, 35, 29, 32, 34, 36, 32, 30, 36, 29, 34, 37, 29, 29, 35] \n",
"Count: 63 \n",
"\n",
"TrainingTimesLastYear\n",
"Outliers: [0, 5, 5, 5, 6, 5, 5, 5, 6, 6, 0, 0, 0, 5, 0, 5, 5, 5, 6, 6, 5, 0, 6, 5, 5, 0, 5, 5, 6, 5, 5, 5, 0, 5, 5, 5, 5, 6, 6, 5, 5, 5, 5, 0, 0, 5, 5, 5, 6, 6, 5, 0, 5, 0, 5, 5, 0, 6, 0, 5, 5, 6, 6, 5, 6, 5, 0, 5, 5, 5, 5, 0, 6, 5, 5, 5, 5, 6, 5, 5, 6, 5, 5, 5, 0, 5, 0, 5, 5, 6, 5, 6, 5, 0, 5, 5, 0, 6, 6, 5, 6, 0, 5, 0, 6, 6, 6, 6, 5, 5, 0, 5, 0, 0, 6, 0, 6, 5, 6, 5, 5, 0, 5, 6, 6, 5, 5, 0, 0, 6, 0, 0, 5, 0, 5, 6, 5, 5, 6, 6, 5, 5, 5, 5, 5, 6, 5, 6, 6, 0, 6, 6, 5, 5, 0, 0, 6, 6, 0, 5, 0, 0, 0, 0, 0, 5, 5, 6, 5, 5, 0, 5, 5, 0, 5, 5, 6, 5, 5, 5, 6, 5, 5, 5, 0, 0, 5, 5, 5, 5, 6, 0, 0, 6, 6, 6, 6, 5, 5, 5, 6, 5, 0, 5, 5, 6, 5, 6, 6, 5, 6, 6, 5, 0, 5, 5, 5, 5, 5, 0, 0, 0, 6, 5, 6, 6, 5, 6, 0, 6, 6, 5, 6, 6, 5, 5, 5, 0] \n",
"Count: 238 \n",
"\n",
"WorkLifeBalance\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"YearsAtCompany\n",
"Outliers: [25, 22, 22, 27, 21, 22, 37, 25, 20, 40, 20, 24, 20, 24, 33, 20, 19, 22, 33, 24, 19, 21, 20, 36, 20, 20, 22, 24, 21, 21, 25, 21, 29, 20, 27, 20, 31, 32, 20, 20, 21, 22, 22, 34, 24, 26, 31, 20, 31, 26, 19, 21, 21, 32, 21, 19, 20, 22, 20, 21, 26, 20, 22, 24, 33, 29, 25, 21, 19, 19, 20, 19, 33, 19, 19, 20, 20, 20, 20, 20, 32, 20, 21, 33, 36, 26, 30, 22, 23, 23, 21, 21, 22, 22, 19, 22, 19, 22, 20, 20, 20, 22, 20, 20] \n",
"Count: 104 \n",
"\n",
"YearsInCurrentRole\n",
"Outliers: [15, 16, 18, 15, 18, 17, 16, 15, 16, 15, 16, 16, 15, 16, 17, 15, 15, 15, 17, 17, 16] \n",
"Count: 21 \n",
"\n",
"YearsSinceLastPromotion\n",
"Outliers: [8, 15, 8, 8, 9, 13, 12, 10, 11, 9, 12, 15, 15, 15, 9, 11, 11, 9, 12, 11, 15, 11, 10, 9, 11, 9, 8, 11, 11, 8, 13, 9, 9, 12, 10, 11, 15, 13, 9, 11, 10, 8, 8, 11, 9, 11, 12, 11, 14, 13, 14, 8, 11, 15, 10, 11, 11, 15, 11, 13, 11, 13, 15, 8, 13, 15, 11, 14, 15, 15, 9, 11, 9, 8, 9, 15, 11, 12, 9, 8, 10, 14, 8, 13, 13, 12, 14, 8, 8, 8, 14, 14, 8, 12, 13, 14, 14, 12, 11, 8, 11, 9, 12, 8, 9, 11, 9] \n",
"Count: 107 \n",
"\n",
"YearsWithCurrManager\n",
"Outliers: [17, 15, 15, 15, 15, 17, 16, 17, 15, 17, 17, 17, 17, 16] \n",
"Count: 14 \n",
"\n"
]
}
],
"source": [
"# Report IQR outliers for every column, one column at a time.\n",
"# NOTE(review): 0/1 indicator columns are scanned too, so their rarer class is listed as outliers.\n",
"for column_name in list(ibm):\n",
"    print(column_name)\n",
"    column_values = ibm[column_name]\n",
"    iqr_outliers(column_values)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def remove_outliers(c_name, df=None):\n",
"    \"\"\"Drop rows, in place, until column `c_name` has no IQR outliers left.\n",
"\n",
"    Dropping rows changes the quartiles, so the column is re-scanned with\n",
"    `iqr_outliers` after every pass and the loop ends once it returns an\n",
"    empty list.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    c_name : str\n",
"        Name of the column whose outliers are removed.\n",
"    df : pandas.DataFrame, optional\n",
"        Frame to trim in place. Defaults to the notebook-level `ibm` frame.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"        The frame is modified in place.\n",
"    \"\"\"\n",
"    if df is None:\n",
"        df = ibm\n",
"\n",
"    outliers = iqr_outliers(df[c_name])\n",
"    while outliers:\n",
"        # One label-based drop per pass instead of one drop per outlier value.\n",
"        df.drop(index=df.index[df[c_name].isin(outliers)], inplace=True)\n",
"        outliers = iqr_outliers(df[c_name])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880] \n",
"Count: 114 \n",
"\n",
"Outliers: [15427, 13458, 14756, 13245, 13664, 13503, 13549, 13872, 13734, 13591, 16064, 13675, 13496, 13603, 13525, 16015, 13964, 15992, 14336, 13212, 16555, 14118, 13610, 13237, 16184, 15402, 14814, 13770, 16307, 13826, 14275, 13582, 14852, 13194, 13973, 13726, 13320, 13120, 13499, 13758, 13191, 16124, 13577, 14026, 13142, 13695, 13402, 13247, 14732, 16422, 13757, 16032, 16328, 14411, 16437, 15202, 16413, 13269, 13966, 15972, 15379, 12936, 12965, 13116, 13464, 16291, 15787, 13225, 13348, 13341, 13206, 13744, 13570] \n",
"Count: 73 \n",
"\n",
"Outliers: [11994, 12490, 12185, 11849, 11996, 12061, 11878, 12504, 11935, 12808, 11836, 12742, 11904, 12169, 11916, 11957, 12031] \n",
"Count: 17 \n",
"\n",
"Outliers: [11713, 11691] \n",
"Count: 2 \n",
"\n",
"Outliers: [11631] \n",
"Count: 1 \n",
"\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n"
]
}
],
"source": [
"# Trim MonthlyIncome rows outside the IQR fences, repeating until none remain (modifies ibm in place)\n",
"remove_outliers(c_name='MonthlyIncome')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Attrition</th>\n",
" <th>BusinessTravel</th>\n",
" <th>DailyRate</th>\n",
" <th>Dp_Sales&Development</th>\n",
" <th>Dp_Sales</th>\n",
" <th>Dp_HumanResources</th>\n",
" <th>DistanceFromHome</th>\n",
" <th>Education</th>\n",
" <th>EnvironmentSatisfaction</th>\n",
" <th>EF_Life Sciences</th>\n",
" <th>EF_Medical</th>\n",
" <th>EF_Marketing</th>\n",
" <th>EF_TechnicalDegree</th>\n",
" <th>EF_HumanResources</th>\n",
" <th>EF_Other</th>\n",
" <th>Gender</th>\n",
" <th>HourlyRate</th>\n",
" <th>JobInvolvement</th>\n",
" <th>JobLevel</th>\n",
" <th>JobSatisfaction</th>\n",
" <th>JR_HealthcareRepresentive</th>\n",
" <th>JR_HumanResource</th>\n",
" <th>JR_LaboratoryTechnician</th>\n",
" <th>JR_Manager</th>\n",
" <th>JR_ManufacturingDirector</th>\n",
" <th>JR_ResearchDirector</th>\n",
" <th>JR_ResearchScientist</th>\n",
" <th>JR_SalesExecutive</th>\n",
" <th>JR_SalesRepresentative</th>\n",
" <th>MonthlyIncome</th>\n",
" <th>MonthlyRate</th>\n",
" <th>NumCompaniesWorked</th>\n",
" <th>MS_Married</th>\n",
" <th>MS_Single</th>\n",
" <th>MS_Divorced</th>\n",
" <th>OverTime</th>\n",
" <th>PercentSalaryHike</th>\n",
" <th>PerformanceRating</th>\n",
" <th>RelationshipSatisfaction</th>\n",
" <th>StockOptionLevel</th>\n",
" <th>TotalWorkingYears</th>\n",
" <th>TrainingTimesLastYear</th>\n",
" <th>WorkLifeBalance</th>\n",
" <th>YearsAtCompany</th>\n",
" <th>YearsInCurrentRole</th>\n",
" <th>YearsSinceLastPromotion</th>\n",
" <th>YearsWithCurrManager</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>41</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1102</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>94</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>5993</td>\n",
" <td>19479</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>279</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>61</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5130</td>\n",
" <td>24907</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>10</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1373</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>92</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2090</td>\n",
" <td>2396</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>15</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>33</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1392</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>56</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2909</td>\n",
" <td>23159</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>27</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>591</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3468</td>\n",
" <td>16632</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1465</th>\n",
" <td>36</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>884</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>41</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2571</td>\n",
" <td>12290</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1466</th>\n",
" <td>39</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>613</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>42</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>9991</td>\n",
" <td>21457</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>15</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1467</th>\n",
" <td>27</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>155</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>87</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>6142</td>\n",
" <td>5174</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>20</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1468</th>\n",
" <td>49</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1023</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>63</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>5390</td>\n",
" <td>13243</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>14</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1469</th>\n",
" <td>34</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>628</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>82</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4404</td>\n",
" <td>10228</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1263 rows × 48 columns</p>\n",
"</div>"
],
"text/plain": [
" Age Attrition BusinessTravel DailyRate Dp_Sales&Development \\\n",
"0 41 1 1 1102 0 \n",
"1 49 0 2 279 1 \n",
"2 37 1 1 1373 1 \n",
"3 33 0 2 1392 1 \n",
"4 27 0 1 591 1 \n",
"... ... ... ... ... ... \n",
"1465 36 0 2 884 1 \n",
"1466 39 0 1 613 1 \n",
"1467 27 0 1 155 1 \n",
"1468 49 0 2 1023 0 \n",
"1469 34 0 1 628 1 \n",
"\n",
" Dp_Sales Dp_HumanResources DistanceFromHome Education \\\n",
"0 1 0 1 2 \n",
"1 0 0 8 1 \n",
"2 0 0 2 2 \n",
"3 0 0 3 4 \n",
"4 0 0 2 1 \n",
"... ... ... ... ... \n",
"1465 0 0 23 2 \n",
"1466 0 0 6 1 \n",
"1467 0 0 4 3 \n",
"1468 1 0 2 3 \n",
"1469 0 0 8 3 \n",
"\n",
" EnvironmentSatisfaction EF_Life Sciences EF_Medical EF_Marketing \\\n",
"0 2 1 0 0 \n",
"1 3 1 0 0 \n",
"2 4 0 0 0 \n",
"3 4 1 0 0 \n",
"4 1 0 1 0 \n",
"... ... ... ... ... \n",
"1465 3 0 1 0 \n",
"1466 4 0 1 0 \n",
"1467 2 1 0 0 \n",
"1468 4 0 1 0 \n",
"1469 2 0 1 0 \n",
"\n",
" EF_TechnicalDegree EF_HumanResources EF_Other Gender HourlyRate \\\n",
"0 0 0 0 1 94 \n",
"1 0 0 0 0 61 \n",
"2 0 0 1 0 92 \n",
"3 0 0 0 1 56 \n",
"4 0 0 0 0 40 \n",
"... ... ... ... ... ... \n",
"1465 0 0 0 0 41 \n",
"1466 0 0 0 0 42 \n",
"1467 0 0 0 0 87 \n",
"1468 0 0 0 0 63 \n",
"1469 0 0 0 0 82 \n",
"\n",
" JobInvolvement JobLevel JobSatisfaction JR_HealthcareRepresentive \\\n",
"0 3 2 4 0 \n",
"1 2 2 2 0 \n",
"2 2 1 3 0 \n",
"3 3 1 3 0 \n",
"4 3 1 2 0 \n",
"... ... ... ... ... \n",
"1465 4 2 4 0 \n",
"1466 2 3 1 1 \n",
"1467 4 2 2 0 \n",
"1468 2 2 2 0 \n",
"1469 4 2 3 0 \n",
"\n",
" JR_HumanResource JR_LaboratoryTechnician JR_Manager \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 1 0 \n",
"3 0 0 0 \n",
"4 0 1 0 \n",
"... ... ... ... \n",
"1465 0 1 0 \n",
"1466 0 0 0 \n",
"1467 0 0 0 \n",
"1468 0 0 0 \n",
"1469 0 1 0 \n",
"\n",
" JR_ManufacturingDirector JR_ResearchDirector JR_ResearchScientist \\\n",
"0 0 0 0 \n",
"1 0 0 1 \n",
"2 0 0 0 \n",
"3 0 0 1 \n",
"4 0 0 0 \n",
"... ... ... ... \n",
"1465 0 0 0 \n",
"1466 0 0 0 \n",
"1467 1 0 0 \n",
"1468 0 0 0 \n",
"1469 0 0 0 \n",
"\n",
" JR_SalesExecutive JR_SalesRepresentative MonthlyIncome MonthlyRate \\\n",
"0 1 0 5993 19479 \n",
"1 0 0 5130 24907 \n",
"2 0 0 2090 2396 \n",
"3 0 0 2909 23159 \n",
"4 0 0 3468 16632 \n",
"... ... ... ... ... \n",
"1465 0 0 2571 12290 \n",
"1466 0 0 9991 21457 \n",
"1467 0 0 6142 5174 \n",
"1468 1 0 5390 13243 \n",
"1469 0 0 4404 10228 \n",
"\n",
" NumCompaniesWorked MS_Married MS_Single MS_Divorced OverTime \\\n",
"0 8 0 1 0 1 \n",
"1 1 1 0 0 0 \n",
"2 6 0 1 0 1 \n",
"3 1 1 0 0 1 \n",
"4 9 1 0 0 0 \n",
"... ... ... ... ... ... \n",
"1465 4 1 0 0 0 \n",
"1466 4 1 0 0 0 \n",
"1467 1 1 0 0 1 \n",
"1468 2 1 0 0 0 \n",
"1469 2 1 0 0 0 \n",
"\n",
" PercentSalaryHike PerformanceRating RelationshipSatisfaction \\\n",
"0 11 3 1 \n",
"1 23 4 4 \n",
"2 15 3 2 \n",
"3 11 3 3 \n",
"4 12 3 4 \n",
"... ... ... ... \n",
"1465 17 3 3 \n",
"1466 15 3 1 \n",
"1467 20 4 2 \n",
"1468 14 3 4 \n",
"1469 12 3 1 \n",
"\n",
" StockOptionLevel TotalWorkingYears TrainingTimesLastYear \\\n",
"0 0 8 0 \n",
"1 1 10 3 \n",
"2 0 7 3 \n",
"3 0 8 3 \n",
"4 1 6 3 \n",
"... ... ... ... \n",
"1465 1 17 3 \n",
"1466 1 9 5 \n",
"1467 1 6 0 \n",
"1468 0 17 3 \n",
"1469 0 6 3 \n",
"\n",
" WorkLifeBalance YearsAtCompany YearsInCurrentRole \\\n",
"0 1 6 4 \n",
"1 3 10 7 \n",
"2 3 0 0 \n",
"3 3 8 7 \n",
"4 3 2 2 \n",
"... ... ... ... \n",
"1465 3 5 2 \n",
"1466 3 7 7 \n",
"1467 3 6 2 \n",
"1468 2 9 6 \n",
"1469 4 4 3 \n",
"\n",
" YearsSinceLastPromotion YearsWithCurrManager \n",
"0 0 5 \n",
"1 1 7 \n",
"2 0 0 \n",
"3 3 0 \n",
"4 2 2 \n",
"... ... ... \n",
"1465 0 3 \n",
"1466 1 7 \n",
"1467 0 3 \n",
"1468 0 8 \n",
"1469 1 2 \n",
"\n",
"[1263 rows x 48 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Classification"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Support Vector Machine (prepared by Teh Liang Sean) "
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# import important library to do SVM\n",
"from sklearn import svm\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import metrics"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# SVM target: the Attrition label, i.e. whether an employee stays at or leaves IBM\n",
"y_svm = ibm['Attrition']\n",
"# Candidate predictors: every column except the target\n",
"x_svm_find = ibm.drop('Attrition', axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Features Score\n",
"29 MonthlyIncome 26471.159476\n",
"30 MonthlyRate 1308.443569\n",
"2 DailyRate 1111.594737\n",
"44 YearsInCurrentRole 109.263859\n",
"43 YearsAtCompany 103.805057\n",
"46 YearsWithCurrManager 100.636711\n",
"40 TotalWorkingYears 95.843571\n",
"35 OverTime 60.367656\n",
"6 DistanceFromHome 57.197704\n",
"0 Age 46.705340\n",
"28 JR_SalesRepresentative 27.299127\n",
"33 MS_Single 26.251695\n",
"39 StockOptionLevel 24.376114\n",
"20 JR_HealthcareRepresentive 10.935616\n",
"24 JR_ManufacturingDirector 9.987076\n"
]
}
],
"source": [
"# Rank the candidate features with SelectKBest and the chi-squared (chi2) test,\n",
"# which requires non-negative feature values\n",
"from sklearn.feature_selection import SelectKBest, chi2\n",
"\n",
"# Fit the selector; only its per-feature scores are used below\n",
"best_15_features = SelectKBest(score_func=chi2, k=15)\n",
"fit = best_15_features.fit(x_svm_find, y_svm)\n",
"\n",
"# Pair each feature name with its chi2 score in one table\n",
"top_15_feature_scores = pd.DataFrame({\n",
"    'Features': x_svm_find.columns,\n",
"    'Score': fit.scores_,\n",
"})\n",
"\n",
"# Show the 15 highest-scoring features\n",
"print(top_15_feature_scores.nlargest(15, 'Score'))"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"# Build the SVM feature matrix from the 15 top-scoring chi2 features,\n",
"# listed in descending score order.\n",
"svm_feature_names = [\n",
"    'MonthlyIncome',\n",
"    'MonthlyRate',\n",
"    'DailyRate',\n",
"    'YearsInCurrentRole',\n",
"    'YearsAtCompany',\n",
"    'YearsWithCurrManager',\n",
"    'TotalWorkingYears',\n",
"    'OverTime',\n",
"    'DistanceFromHome',\n",
"    'Age',\n",
"    'JR_SalesRepresentative',\n",
"    'MS_Single',\n",
"    'StockOptionLevel',\n",
"    'JR_HealthcareRepresentive',\n",
"    'JR_ManufacturingDirector',\n",
"]\n",
"# Select all columns in one step so every name is taken verbatim from ibm;\n",
"# .copy() keeps the result independent of ibm.\n",
"ibm_svm_features_df = ibm[svm_feature_names].copy()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MonthlyIncome</th>\n",
" <th>MonthlyRate</th>\n",
" <th>DailyRate</th>\n",
" <th>YearsInCurrentRole</th>\n",
" <th>YearsAtCompany</th>\n",
" <th>YearsWithCurrManager</th>\n",
" <th>TotalWorkingYears</th>\n",
" <th>OverTime</th>\n",
" <th>DistanceFromHome</th>\n",
" <th>Age</th>\n",
" <th>JR_SalesRepresentative</th>\n",
" <th>MS_Single</th>\n",
" <th>StockOptionLevel</th>\n",
" <th>JR_HealthcareRepresentive</th>\n",
" <th>JR_ManufacturingDirector</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5993</td>\n",
" <td>19479</td>\n",
" <td>1102</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>41</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5130</td>\n",
" <td>24907</td>\n",
" <td>279</td>\n",
" <td>7</td>\n",
" <td>10</td>\n",
" <td>7</td>\n",
" <td>10</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>49</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2090</td>\n",
" <td>2396</td>\n",
" <td>1373</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>37</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2909</td>\n",
" <td>23159</td>\n",
" <td>1392</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>33</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3468</td>\n",
" <td>16632</td>\n",
" <td>591</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>27</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1465</th>\n",
" <td>2571</td>\n",
" <td>12290</td>\n",
" <td>884</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>17</td>\n",
" <td>0</td>\n",
" <td>23</td>\n",
" <td>36</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1466</th>\n",
" <td>9991</td>\n",
" <td>21457</td>\n",
" <td>613</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>39</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1467</th>\n",
" <td>6142</td>\n",
" <td>5174</td>\n",
" <td>155</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>27</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1468</th>\n",
" <td>5390</td>\n",
" <td>13243</td>\n",
" <td>1023</td>\n",
" <td>6</td>\n",
" <td>9</td>\n",
" <td>8</td>\n",
" <td>17</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>49</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1469</th>\n",
" <td>4404</td>\n",
" <td>10228</td>\n",
" <td>628</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>34</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1263 rows Γ 15 columns</p>\n",
"</div>"
],
"text/plain": [
" MonthlyIncome MonthlyRate DailyRate YearsInCurrentRole \\\n",
"0 5993 19479 1102 4 \n",
"1 5130 24907 279 7 \n",
"2 2090 2396 1373 0 \n",
"3 2909 23159 1392 7 \n",
"4 3468 16632 591 2 \n",
"... ... ... ... ... \n",
"1465 2571 12290 884 2 \n",
"1466 9991 21457 613 7 \n",
"1467 6142 5174 155 2 \n",
"1468 5390 13243 1023 6 \n",
"1469 4404 10228 628 3 \n",
"\n",
" YearsAtCompany YearsWithCurrManager TotalWorkingYears OverTime \\\n",
"0 6 5 8 1 \n",
"1 10 7 10 0 \n",
"2 0 0 7 1 \n",
"3 8 0 8 1 \n",
"4 2 2 6 0 \n",
"... ... ... ... ... \n",
"1465 5 3 17 0 \n",
"1466 7 7 9 0 \n",
"1467 6 3 6 1 \n",
"1468 9 8 17 0 \n",
"1469 4 2 6 0 \n",
"\n",
" DistanceFromHome Age JR_SalesRepresentative MS_Single \\\n",
"0 1 41 0 1 \n",
"1 8 49 0 0 \n",
"2 2 37 0 1 \n",
"3 3 33 0 0 \n",
"4 2 27 0 0 \n",
"... ... ... ... ... \n",
"1465 23 36 0 0 \n",
"1466 6 39 0 0 \n",
"1467 4 27 0 0 \n",
"1468 2 49 0 0 \n",
"1469 8 34 0 0 \n",
"\n",
" StockOptionLevel JR_HealthcareRepresentive JR_ManufacturingDirector \n",
"0 0 0 0 \n",
"1 1 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 1 0 0 \n",
"... ... ... ... \n",
"1465 1 0 0 \n",
"1466 1 1 0 \n",
"1467 1 0 1 \n",
"1468 0 0 0 \n",
"1469 0 0 0 \n",
"\n",
"[1263 rows x 15 columns]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibm_svm_features_df"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# Assign the selected-feature frame to x_svm\n",
"x_svm = ibm_svm_features_df"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Standardize every feature so that the SVM model trains more effectively\n",
"from sklearn.preprocessing import StandardScaler\n",
"s_scaler = StandardScaler()\n",
"x_scaled_svm = s_scaler.fit_transform(x_svm)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# Under-sample with Tomek links to reduce the class imbalance\n",
"# (Attrition has too few 'yes' values)\n",
"from imblearn.under_sampling import TomekLinks\n",
"\n",
"tl_svm = TomekLinks(sampling_strategy='not minority')\n",
"# Resample the standardized features (x_scaled_svm) so that the train/test split\n",
"# and the SVM models below work on scaled inputs rather than the raw x_svm values\n",
"x_tl_svm, y_tl_svm = tl_svm.fit_resample(x_scaled_svm, y_svm)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# Split the resampled data: 80% to train the models, 20% held out for testing\n",
"x_train_svm, x_test_svm, y_train_svm, y_test_svm = train_test_split(x_tl_svm,y_tl_svm, test_size=0.2,random_state=40, stratify=y_tl_svm)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"# Model 1: SVM whose hyperparameters were chosen by manual tuning\n",
"model_1_svm = svm.SVC(\n",
"    kernel='sigmoid',\n",
"    C=2,\n",
"    gamma='scale',\n",
"    coef0=0.6,\n",
"    probability=True,\n",
"    random_state=40,\n",
")\n",
"model_1_svm.fit(x_train_svm, y_train_svm)\n",
"# Predicted class labels for the held-out test set\n",
"y_predict_1_svm = model_1_svm.predict(x_test_svm)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 4 folds for each of 5400 candidates, totalling 21600 fits\n"
]
}
],
"source": [
"# Model 2: use GridSearchCV (4-fold cross-validation) to find the best SVM hyperparameters.\n",
"# Only some hyperparameters are tuned.\n",
"\n",
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"# Candidate values:\n",
"#   C (regularization parameter): from 1.0 up to, but not including, 3.0 in steps of 0.1\n",
"#   coef0 (independent kernel term): from 0.0 up to, but not including, 1.5 in steps of 0.1\n",
"#   gamma (kernel coefficient): 'auto' or 'scale'\n",
"c_values = list(np.arange(1.0, 3.0, 0.1))\n",
"coef0_values = list(np.arange(0.0, 1.5, 0.1))\n",
"gamma_values = ['auto', 'scale']\n",
"\n",
"# One sub-grid per kernel, so each kernel is only combined with the parameters it reads:\n",
"# gamma applies to 'rbf' and 'sigmoid', coef0 to 'sigmoid'. 'degree' is left out because\n",
"# SVC only uses it for the 'poly' kernel, which is not searched here.\n",
"param_grid = [\n",
"    {'kernel': ['linear'], 'C': c_values},\n",
"    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},\n",
"    {'kernel': ['sigmoid'], 'C': c_values, 'gamma': gamma_values, 'coef0': coef0_values},\n",
"]\n",
"# Fix random_state at 40 for reproducibility\n",
"find_best_para_model = svm.SVC(random_state=40)\n",
"Grid_search_svm = GridSearchCV(find_best_para_model, param_grid, n_jobs=-1, verbose=2, cv=4)\n",
"# This may take some time to run\n",
"Grid_search_svm.fit(x_train_svm, y_train_svm)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'C': 2.8000000000000016,\n",
" 'coef0': 0.0,\n",
" 'degree': 3,\n",
" 'gamma': 'scale',\n",
" 'kernel': 'rbf'}"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the best hyperparameter found by grid search\n",
"Grid_search_svm.best_params_"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Model 2: SVM built with the hyperparameters reported by the grid search\n",
"model_2_svm = svm.SVC(\n",
"    kernel='rbf',\n",
"    C=2.8000000000000016,\n",
"    gamma='scale',\n",
"    degree=3,\n",
"    coef0=0.0,\n",
"    probability=True,\n",
"    random_state=40,\n",
")\n",
"model_2_svm.fit(x_train_svm, y_train_svm)\n",
"# Predicted class labels for the held-out test set\n",
"y_predict_2_svm = model_2_svm.predict(x_test_svm)"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy of prediction classification result for 2 model\n",
"Hyperparameters that try to tune manually (model 1): 0.7416666666666667\n",
"Best hyperparameters found using GridSearchCV (model 2): 0.8166666666666667\n"
]
}
],
"source": [
"# Compare the test-set accuracy of the two SVM models\n",
"print('Accuracy of prediction classification result for 2 model')\n",
"for label, predictions in (\n",
"    ('Hyperparameters that try to tune manually (model 1): ', y_predict_1_svm),\n",
"    ('Best hyperparameters found using GridSearchCV (model 2): ', y_predict_2_svm),\n",
"):\n",
"    print(label, metrics.accuracy_score(y_test_svm, predictions))"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[183 12]\n",
" [ 32 13]]\n",
" precision recall f1-score support\n",
"\n",
" 0 0.85 0.94 0.89 195\n",
" 1 0.52 0.29 0.37 45\n",
"\n",
" accuracy 0.82 240\n",
" macro avg 0.69 0.61 0.63 240\n",
"weighted avg 0.79 0.82 0.79 240\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\USER\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:70: FutureWarning: Pass labels=[0, 1] as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error\n",
" warnings.warn(f\"Pass {args_msg} as keyword args. From version \"\n"
]
}
],
"source": [
"# Evaluate model 2 with a confusion matrix\n",
"# (rows = true class, columns = predicted class, both ordered 0 then 1)\n",
"from sklearn.metrics import confusion_matrix\n",
"# labels is passed by keyword: scikit-learn 1.0+ rejects it as a positional argument\n",
"print(confusion_matrix(y_test_svm, y_predict_2_svm, labels=[0, 1]))\n",
"\n",
"# Evaluate model 2 with per-class precision, recall and F1-score\n",
"from sklearn.metrics import classification_report\n",
"print(classification_report(y_test_svm, y_predict_2_svm))"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAz+UlEQVR4nO3de5xN9frA8c/jfhvkmtyT2wwhg3QKxYlSEerERIkzlK5O/VIicolURIgiOXGcSFIJUS5RSpLLSMepk5RObrlPGM/vj7XG2e3msjFrr71nP+/Xa71mr71uz9ozs579/X7X+n5FVTHGGBO78vgdgDHGGH9ZIjDGmBhnicAYY2KcJQJjjIlxlgiMMSbGWSIwxpgYZ4nAGGNinCUCc4aIrBCRAyJSMIP3ewe910pEdgXMi4jcLyJbROSoiOwSkbkiUj/EYxcUkekickhEfhaR/lms+7iIHAmYjovIaREp4y5/RkR+cPf1vYgMDNr+RjfOIyKyVkTig85juIj8KCIH3XNPCIhxmrvPwyLypYhcF7Tv3iKyw933YhG5KGjfo0Vknzs9IyISsPw/7rmkn9fSoG0HishO97zmiEjxgOWlROSfIrLXnWalLxeRWiLytojsEZH9IrJERGoHffZjReQn9/c/SUTyB+37Lff3+r2IdAtYVk1ENOj3MShgeUkReU1EfnGnIUGfV0MRWe1+1rtEZHBmv3fjIVW1ySaAakAasB+4JWjZCqB30HutgF0B8+OBfwPXAAWBIkASMCDE4z8NrAYuAOoCPwPtQtx2CPBhwHxtoKj7uiKwFejkztcEDgFXAvmAx4AdQD53+a3AT8DFQF43rg3usqLusarhfIm6ATgMVHOXtwR+ARKAAsBkYGVAXH2A7UAlN64UoG/A8v8AbTI5xzuAr4HKQDHgbeC1gOWTgKVAcaAEsAx43l3WFOgFlALyA8OArwO2fdL97EsBZYFPgaEBy/8B/NM97pXAQSAh4O9G0z+/DOJ+FZjr/j1Uc/9GegYsTwFGuJ91DWA3cJPf/w+xNvkegE2RMQGDgTXA88C7QctWkEUicC+uaUDT8zj+j8C1AfPDgDkhbCfuxeWOTJZXBDYD/+fO3wu8F7A8D3AcaO3OPwq8EbA8AUjN4vibgM7u62eBiQHLLnIvkjXc+bVAcsDyXsCnAfNZJYJ5wCMB81cAqUARd/594J6A5f2AJZnsq5QbV2l3fj0ByR/oBvzgvi4KnABqBSz/OzDKfZ1dItgLNAmYfxxYHTB/DIgPmJ8LPOb3/0OsTVY1ZNL1AGa5U1sRKX8W27bGSQqfZbaCiHQTkU2ZLLsA56L5VcDbX+FchLNzFVAeeDNonwNE5AiwC+diNjt9kTsRNF/PnZ8DXOJWp+TH+Sa+OJO4ywO1cEocme2bgH0nkP05znKrcJaKSIMM4gycL4iThAEmAjeIyAXu59kZJzlkpAXws6ruy2LflUSkhHt+aar6TTZxf+9W7byaXkUXtL/A1/UC5scBPUQkv1td1RynNGPCyBKBQUSuBKrifBP+Aucbdrest/qd0jhF+kyp6mxVvTSTxcXcnwcD3jsIxIVw7DuAeap6JOh4o9ztL8P5Bpu+7w+AluK0cRTA+YZaAKfqAvc8VuNU4RwHbgEeCj6omyRm4VTPfO2+vQi4VUQuFZHCOKUsDdh3sQzOsVhAO0ESzjfsqsBHwBIRKekuex/o7dbJl8ApuRCw7w3ueexzpzSc6qLguCvhJI3ANpj3gQdEpKyIXAjcH7Dv4JjT407/3ewFmrgxN3bfnxWw7mJggIjEicglwF0BMQO8C3TB+ay/Bqap6ufBcRtvWSIw4FxMl6rqXnd+tvteulM4dcuB8gMn3df7gArncfz0i3jxgPeK49S/Z8q92N4CvJbRcnV8iXORGeq+9zXOub2Ic9Evg1NPnd7w/STOha0yUMjd7kMROXPxEpE8OMnlBE5VU/rxlrvbvwl8j1PVczhg30cyOM
cj6taJqOoaVT2uqsdU9WngV5wSD8B0nLr6FTglkI/c99P3PRf4BudCXBwnmb8e9HmVxWlHmKSq/whYNAL4EtiIU321AOd3+0sGMafHfdiN+YiqrlfVU6r6X/fzuDagIft+nM//XzjtGv9Ij1lESuEkiqdwPuvKOKXRezBhZYkgxrkX01txviX/LCI/43wDbhBQNbET55tqoOo4FzuA5ThVCYnnEoOqHsC5KAdWhTTgf1UumemE07i9Ipv18uE0RKYfb56q1lPV0jgX7qpA+rfQBsA/VXWXe3GbgdOAHQ/O3TvANJzqqM6qepIAqjpRVWuqajmchJAP2OIu3nqW56i41SqqelpVn1TVaqpayd3uR3dK39cUVT3qlo5eAq5P35FbXbQUWKiqI4JiPq6q96pqRVW9GCexf6GqaTjJJZ+I1AzYJKu407szTo97v6omqeqFqpqAc81Jr0K8GKfaaab7We/CqZq7/g97Nd7yu5HCJn8noCvOxbQKcGHAtAp4zl2nLc63w6Y4/+C1gG38/o6XCTjf+lrhVFEUAm4j9LuGRgErcS66dXASQ5Z3DeFc2J4Kei8Pzt05F7ixNnX3dX/AOo1x7lIpi3M3zOyAZU8CH+Nc6PMA3YGjQEl3+Us4d9UUyyCeQjj13+J+niuAkQHL+7qfW0WcNpGt6Z+hu/6fAj67R4A9/K9BtxROMhOcpLSF3zc8f+T+Dgq70yRgjbusOM7F98VMPsf0eAS4HPiB3zfcz8H5Jl/UjTHwrqFmOHdp5cGpIvwn8FHAtjXc9/MC1+FUJSUExPUrTjVkHpy/u0+AEX7/X8Ta5HsANvn8B+AUzZ/L4P1bcW7hTL+t8i73wnUI53bLAUCegPUFeMBd5xjON9V/BvzTJwFbs4ijIE71xyHgv0D/oOVHgKsC5iviVFldErReHvec9rvbfIPTDiAB63yMU7WxH5iCe6upu6wQTh36bjeWDbgJCafkoDh36xwJmJLc5SVx7iI66n52TwN5gz6jZ9zj7ndfi7ssIWDbfTilrMSAbWvhtFscwymJBX8+1YF33G33u59BTXfZHW7cR4PiruIub4FTjXXMPUZS0L5L4VQXHcUpHXYLWNYV+M5dthuYCVwY9Hf0k7vvjUDboH1fg1MaO+h+Zi/j3gllU/im9D9CY4wxMcraCIwxJsZZIjDGmBhnicAYY2KcJQJjjIlx+fwO4GyVKVNGq1Wr5ncYxhgTVb744ou9qlo2o2VRlwiqVavG+vXr/Q7DGGOiioh8n9kyqxoyxpgYZ4nAGGNinCUCY4yJcZYIjDEmxlkiMMaYGOdZIhBnIPJfRGRLJstFRMaLM9D3JhG5zKtYjDHGZM7LEsEMoF0Wy6/DGWavJpCMM9C3McaYMPPsOQJVXSUi1bJYpQMwU53uTz8VkZIiUkFVsxzy0Bhjwm32up28vfHH7Ff0iKqSmppK4xrlefLGUIbyPjt+thFUxBkAI90u970/EJFkEVkvIuv37NkTluCMMSbd2xt/JGX3IV+OfeTIETZs2MDGjRs5efJk9hucAz+fLJYM3stwcARVnQpMBUhMTLQBFIwxYRdfoTj/7NM8bMdLTU1l6NChjBkzhjJlyjBp0iQ6dWroybH8TAS7cAarTlcJZyQjY4yJeR07dmTJkiX07NmT5557jgsuuMCzY/lZNbQQ6OHePXQ5cNDaB4wxsezw4cOkpqYCMGDAAJYuXcr06dM9TQLgYYlARP6BM5B5GRHZhTMoeH4AVX0JWARcjzP+7TGgp1exGGMik9+NsKFK2X2I+ArFPT3GkiVLSE5O5vbbb2fEiBG0atXK0+MF8vKuoa7ZLFegn1fHN8ZEvvRGWK8vsucrvkJxOjTM8F6W87Z//3769+/Pa6+9Rp06dWjfvr0nx8lK1HVDbYzJXcLdCBtJli9fTlJSEvv27WPgwIE88cQTFCpUKOxxWCIwxhiflCtXjurVq7N48WIaNmzoWxyWCIwxnsusLSAaqoVykqry2m
uvsWHDBsaPH0/9+vVZu3YtIhndTR8+1umcMcZzmT2Q5WXde6T57rvvaNu2LT179mTjxo0cP34cwPckAFYiMMaESay2BaSlpTFx4kQee+wx8uTJw6RJk+jTpw958kTO93BLBMYY46G9e/cyePBgWrZsyUsvvUSVKlX8DukPLBEYY87Kudz7H2ttASdPnmTWrFn06NGD8uXLs2HDBqpXrx4R1UAZiZyyiTEmKpxLB2yx1BbwxRdfkJiYSM+ePfnggw8AuPjiiyM2CYCVCIwx5yBW6/uzcvz4cYYOHcqzzz5LuXLleOutt2jbtq3fYYXEEoExxuSAjh07snTpUnr37s2YMWMoWbKk3yGFzKqGjDHmHB06dOhMJ3GPP/44y5Yt4+WXX46qJABWIjDGcHYNwLHW8JuZRYsW0bdvX26//XZGjhxJy5Yt/Q7pnFmJwBhzVg3AsdTwm5G9e/fSvXt32rdvT1xcHDfddJPfIZ03KxEYYwBrAA7FBx98QFJSEgcOHGDw4ME8/vjjFCxY0O+wzpslAmOMCVGFChWoVasWkydPpn79+n6Hk2MsERgTA7JrA7B6/4ypKtOmTePLL79k4sSJ1KtXj9WrV0f0MwHnwtoIjIkB2bUBxHq9f0a+/fZb2rRpw1//+ldSUlIiqpO4nGYlAmNihLUBhCYtLY3x48czcOBA8uXLx5QpU+jdu3dEdRKX0ywRGGNMgL179zJ06FBat27N5MmTqVSpkt8heS73pjhjjAnRiRMnmD59OqdPn6Z8+fJs3LiRhQsXxkQSACsRGBP1QnkYzBqDM/f5559z1113sWXLFipVqsS1115LtWrV/A4rrKxEYEyUC+VhMGsM/qNjx47x8MMPc/nll3PgwAEWLlzItdde63dYvrASgTG5gDUEn70OHTqwbNkykpOTeeaZZyhRooTfIfnGSgTGmJhx8ODBM53EDRo0iA8//JApU6bEdBIAKxEYEzHOZeQvsPr/UL377rv07duX7t278/TTT9OiRQu/Q4oYViIwJkKcy8hfYPX/2dmzZw/dunXjxhtvpFSpUnTq1MnvkCKOlQiMiSBW15+zli5dSlJSEgcPHmTo0KEMGDCAAgUK+B1WxLFEYIzJtSpWrEjdunWZPHkyCQkJfocTsSwRGOOjwHYBq+s/f6dPn+aVV17hyy+/PHPxX7Vqld9hRTxrIzDGR4HtAlbXf3527NhB69at6dOnD9u3bz/TSZzJnpUIjPGZtQucn7S0NMaNG8egQYPInz8/L7/8Mr169cqVvYR6xdMSgYi0E5HtIrJDRAZksLyEiLwjIl+JyFYR6ellPMaY3Gfv3r0MHz6cP//5z6SkpNC7d29LAmfJs0QgInmBicB1QDzQVUTig1brB6SoagOgFfCciFiTvjEmS7/99hsvv/zy7zqJW7BgARUrWtXaufCyRNAU2KGq36rqCWAO0CFoHQXixEnfxYD9wCkPYzImIsxet5O/TPnknJ4biHXr1q2jcePGJCcns2zZMgCqVq1qpYDz4GUiqAj8EDC/y30v0ItAXeAnYDPwgKqeDt6RiCSLyHoRWb9nzx6v4jUmbNIbia2BOHRHjx6lf//+NG/enIMHD/Lee+/FbCdxOc3LxuKM0rMGzbcFNgLXADWAD0Rktar+7muSqk4FpgIkJiYG78OYqGSNxGenY8eOLFu2jLvvvptRo0ZRvLjdaptTvCwR7AIqB8xXwvnmH6gnMF8dO4DvgDoexmSMiSK//vrrmdtABw8ezMqVK5k0aZIlgRzmZYngc6CmiFQHfgRuA7oFrbMTaA2sFpHyQG3gWw9jMsYXwR3K2cNj2Vu4cCF333033bt3Z9SoUVx11VV+h5RreVYiUNVTwL3AEmAb8IaqbhWRviLS111tGHCFiGwGlgOPquper2Iyxi/BHcpZ20DmfvnlF2677TY6dOhAmTJl6NKli98h5XqePlCmqouARUHvvRTw+ifAWntMTLA2gewtXryYpKQkjhw5wrBhw3
j00UfJnz+/32HlevZksTEmYlSuXJn69eszadIk4uODHzsyXrG+howxvjl9+jSTJ0+mT58+ACQkJLBixQpLAmFmJQJjOPfRwUJljcN/9M0339C7d29Wr17Nn//8Z1JTUylUqJDfYcUkKxEYw7mPDhYqaxz+n1OnTjF69GguvfRSNm/ezKuvvsqSJUssCfjISgTGuKwxNzz27dvH6NGjuf7665k4cSIVKlTwO6SYZyUCY4znfvvtN6ZMmXKmk7ivvvqK+fPnWxKIEFYiMLleKPX/VofvnU8++YRevXqxbds2atSoQZs2bahcuXL2G5qwsRKByfVCqf+3Ovycd+TIER588EH+9Kc/cfToURYvXkybNm38DstkwEoEJiZY/X/4dezYkeXLl3PvvfcycuRI4uLi/A7JZMJKBMaYHHPgwIEzncQNGTKE1atXM2HCBEsCES7kEoGIFFXVo14GY0xGzvcef6v/D4/58+fTr18/evTowejRo7nyyiv9DsmEKNsSgYhcISIpOB3HISINRGSS55EZ4zrfe/yt/t9bP//8M126dKFz585ceOGF3HbbbX6HZM5SKCWCsTgDyCwEUNWvRKSFp1EZE8Tq+CPT+++/T1JSEseOHWPkyJE8/PDD1klcFAqpakhVfwgaDzTNm3CMMdGkatWqNGrUiIkTJ1Knjo0pFa1CaSz+QUSuAFRECojIw7jVRMaY2HL69GlefPFF/vrXvwIQHx/P8uXLLQlEuVBKBH2BF3AGnt8FLAXu8TIok3vkRGdu1tgbGbZv306vXr1Ys2YNbdu2tU7icpFQSgS1VTVJVcurajlVvR2o63VgJnfIic7crLHXXydPnuTpp5+mQYMGpKSkMGPGDN5//31LArlIKCWCCcBlIbxnTIasoTe6HThwgDFjxnDjjTcyYcIELrzwQr9DMjks00QgIs2BK4CyItI/YFFxIK/XgRlj/JOamsr06dPp27cv5cqVY9OmTVSqVMnvsIxHsioRFACKuesEPhZ4CLDRpGPY2dT7W/1+9Pn444/p1asX33zzDbVq1aJNmzaWBHK5TBOBqq4EVorIDFX9PowxmQiXXu8fygXe6vejx+HDh3nssceYOHEi1apVY+nSpdZJXIwIpY3gmIiMARKAM61DqnqNZ1GZiGf1/rlPx44d+eijj3jggQcYPnw4xYoV8zskEyahJIJZwD+BG3BuJb0D2ONlUMaY8Ni/fz+FChWiSJEiDBs2DBGheXNL8LEmlNtHS6vqNOCkqq5U1buAyz2OyxjjsXnz5lG3bl2GDBkCwBVXXGFJIEaFkghOuj93i0h7EWkEWMtRjJq9bifrvtvvdxjmPOzevZtOnTpxyy23ULlyZZKSkvwOyfgslKqh4SJSAvgbzvMDxYEHvQzKRK70u4WsATg6vffee9x+++2kpqYyevRo+vfvT758Nj5VrMv2L0BV33VfHgSuBhCRP3kZlIlszaqXoluzKn6HYc7BxRdfTJMmTXjxxRepVauW3+GYCJFp1ZCI5BWRriLysIjUc9+7QUTWAi+GLUJjzDlLS0vjhRdeoFevXgDUrVuXpUuXWhIwv5NViWAaUBn4DBgvIt8DzYEBqrogDLGZCJL+EJk9IBY9UlJS6N27N5988gnXX3+9dRJnMpVVIkgELlXV0yJSCNgLXKKqP4cnNBNJApOAtQ9EthMnTvDMM88wbNgw4uLieP311+nWrRtBY4oYc0ZWieCEqp4GUNVUEfnmbJOAiLTD6cI6L/CKqo7KYJ1WwDggP7BXVVuezTFM+NhDZNHh119/ZezYsdx8882MHz+ecuXK+R2SiXBZJYI6IrLJfS1ADXdeAFXVS7PasYjkBSYCf8YZx+BzEVmoqikB65QEJgHtVHWniNhfrDHn4Pjx40ybNo177rmHcuXKsXnzZi666CK/wzJRIqtEcL5jDjQFdqjqtwAiMgfoAKQErNMNmK+qOwFU9ZfzPKY5C9Z5XO6watUqevfuzb/+9S/q1q1L69
atLQmYs5LpXUOq+n1WUwj7rgj8EDC/y30vUC3gAhFZISJfiEiPjHYkIskisl5E1u/ZY71b5JSzGTTG2gYiz6FDh7jnnnto2bIlp06dYtmyZbRu3drvsEwU8vJJkoxapjSD4zcGWgOFgU9E5FNV/eZ3G6lOBaYCJCYmBu/DnAer949eHTt2ZMWKFTz00EMMGzaMokWL+h2SiVJeJoJdOLefpqsE/JTBOntV9ShwVERWAQ2AbzDG/MHevXspUqQIRYoUYcSIEYgIl19uXX+Z8xNKX0OISGERqX2W+/4cqCki1UWkAHAbsDBonbeBq0Qkn4gUAZoB287yOMbkeqrKnDlzqFu3Lk8++SQAzZs3tyRgckS2iUBEbgQ2Aovd+YYiEnxB/wNVPQXcCyzBubi/oapbRaSviPR119nm7ncTzoNrr6jqlnM8F2NypR9//JGOHTvStWtXqlevTo8eGTalGXPOQqkaGoJzB9AKAFXdKCLVQtm5qi4CFgW991LQ/BhgTCj7MybWvPvuuyQlJXHy5EmeffZZHnzwQfLmtSHDTc4KJRGcUtWD9lSiMeF3ySWXcMUVVzBhwgQuueQSv8MxuVQobQRbRKQbkFdEaorIBGCtx3EZE5PS0tIYO3Ysd955JwB16tTh/ffftyRgPBVKieA+YCDwGzAbp85/uJdBmXNnD4lFr61bt9KrVy/WrVtH+/btrZM4EzahlAhqq+pAVW3iTk+oaqrnkZlzYg+JRZ8TJ07w1FNP0ahRI/79738ze/Zs3nnnHUsCJmxCKRE8LyIVgLnAHFXd6nFM5jzZQ2LR5ddff2X8+PHccsstjBs3jrJly/odkokx2ZYIVPVqoBWwB5gqIptF5AmvAzMmNzt27BgvvPACaWlpZzqJmzVrliUB44uQnix2u58eLyIfAf8HDMbaCcIulPp/q/ePfB999BG9e/fm22+/pV69erRu3ZoKFSr4HZaJYaE8UFZXRIaIyBacISrX4nQXYcIslPp/q/ePXAcPHqRPnz5cc801iAgfffSRdRJnIkIoJYJXgX8A16pqcF9BJsys/j96dezYkVWrVvHII48wZMgQihQp4ndIxgAhJAJVtc5MjDlHe/bsoWjRohQpUoSnn36avHnz0qRJE7/DMuZ3Mq0aEpE33J+bRWRTwLQ5YOQyY0wGVJXZs2f/rpO4yy+/3JKAiUhZlQgecH/eEI5AjMktdu3axd133827775Ls2bNzjwlbEykymqEst3uy3syGJ3snvCEZ0x0WbhwIfHx8Xz44YeMHTuWNWvWkJCQ4HdYxmQplCeL/5zBe9fldCDG5Aa1atXiyiuvZPPmzdZTqIkamVYNicjdON/8Lw5qE4gD1ngdmDHR4NSpU4wbN45NmzYxc+ZM6tSpw6JFi7Lf0JgIklUbwWzgfeBpYEDA+4dVdb+nUcWo7B4Ys4fFIsumTZvo1asX69evp0OHDtZJnIlaWVUNqar+B+gHHA6YEJFS3ocWe7J7YMweFosMv/32G08++SSNGzdm586dvPHGG7z11luWBEzUyq5EcAPwBaBA4Mg0ClzsYVwxyx4Yi3yHDh1i0qRJdO3albFjx1K6dGm/QzLmvGSaCFT1Bvdn9fCFY0xkOnr0KFOnTuX++++nbNmybNmyhfLly/sdljE5IpS+hv4kIkXd17eLyPMiUsX70IyJDMuXL6d+/fr079+flStXAlgSMLlKKLePTgaOiUgDnJ5Hvwf+7mlUMWb2up38ZconIQ8oY8Lj119/pXfv3rRp04Z8+fKxcuVKrrnmGr/DMibHhZIITqmqAh2AF1T1BZxbSE0OSW8ktsbgyHLzzTczY8YMHn30Ub766itatGjhd0jGeCKU3kcPi8hjQHfgKhHJC+T3NqzYY43EkeG///0vxYoVo2jRoowaNYp8+fLRuHFjv8MyxlOhlAj+gjNw/V3uADUVgTGeRmVMmKkqf//734mPjz/TSVyzZs0sCZiYEMpQlT8Ds4ASInIDkKqqMz
2PzJgw2blzJ+3bt6dHjx7Url2bXr16+R2SMWEVyl1DtwKfAbcAtwLrRKSL14EZEw5vv/02CQkJrFq1ivHjx7N69Wrq1q3rd1jGhFUobQQDgSaq+guAiJQFlgHzvAzMGC+pKiJCnTp1aNWqFRMmTKBatWp+h2WML0JpI8iTngRc+0LczpiIc+rUKUaPHk337t0BqF27Nu+8844lARPTQrmgLxaRJSJyp4jcCbwHWPeKJup89dVXNGvWjAEDBnDs2DFSU1P9DsmYiBBKY/EjwBTgUqABMFVVH/U6MGNySmpqKk888QSJiYn8+OOPzJs3j/nz51sncca4shqPoCbwLFAD2Aw8rKqZ95FsTIQ6fPgwU6ZMISkpieeff55SpazzXGMCZVUimA68C3TG6YF0wtnuXETaich2EdkhIgOyWK+JiKTZ3Ugmpxw5coRnn32WtLQ0ypYtS0pKCjNmzLAkYEwGsrprKE5VX3ZfbxeRDWezY/cJ5Ik4Q13uAj4XkYWqmpLBeqOBJWezf2Mys3TpUpKTk9m5cyeNGzfm6quvpmzZsn6HZUzEyqpEUEhEGonIZSJyGVA4aD47TYEdqvqtqp4A5uD0VxTsPuBN4JcMluV6s9ftZN13NuBbTti/fz89e/akbdu2FCpUiNWrV3P11Vf7HZYxES+rEsFu4PmA+Z8D5hXIrhvGisAPAfO7gGaBK4hIReBmd19NMtuRiCQDyQBVquSuHrDTh6a0zubO380338yaNWt4/PHHGTRokDUGGxOirAamOd+vUpLBexo0Pw54VFXTRDJa/UwsU4GpAImJicH7iHrNqpeiW7PcleDC5eeffyYuLo6iRYsyZswYChQoQMOGDf0Oy5io4uWDYbuAygHzlYCfgtZJBOaIyH+ALsAkEenoYUwml1BVZsyYQXx8PIMHDwagadOmlgSMOQehdDFxrj4HaopIdeBH4DagW+AKgcNgisgM4F1VXeBhTGExe93OM1U+2Ukfh8CE7j//+Q99+vRh6dKlXHnllSQnJ/sdkjFRzbMSgaqeAu7FuRtoG/CGqm4Vkb4i0ter40aC9IFmQmGD0Zydt956i3r16rF27VpefPFFVq5cSe3atf0Oy5iolm2JQJzK+yTgYlV9yh2v+EJV/Sy7bVV1EUHdUajqS5mse2dIEUcJG2gmZ6V3EpeQkECbNm144YUXqFq1qt9hGZMrhFIimAQ0B7q684dxng8wxnMnT55k5MiRJCUlAVCrVi0WLFhgScCYHBRKImimqv2AVABVPQAU8DQqY4ANGzbQtGlTBg4cSFpaGr/99pvfIRmTK4XSWHzSffpX4cx4BKc9jSoKBTYQWwPw+Tl+/DhPPfUUY8aMoWzZsrz11lt07NjR77CMybVCKRGMB94CyonICOBjYKSnUUWhwAZiawA+P0ePHmXatGnccccdpKSkWBIwxmPZlghUdZaIfAG0xnlIrKOqbvM8sihkDcTn7vDhw0yePJm//e1vlClThpSUFMqUKeN3WMbEhFDGLK4CHAPeARYCR933jMkRixcvpl69egwYMIDVq1cDWBIwJoxCaSN4D6d9QIBCQHVgO5DgYVwmBuzbt4/+/fszc+ZM6taty5o1a2je3EpUxoRbKFVD9QPn3Z5H+3gWkYkZnTp1Yu3atQwaNIiBAwdSsGBBv0MyJiaddRcTqrpBRDLtKdSYrOzevZu4uDiKFSvGs88+S4ECBWjQoIHfYRkT00J5srh/wGwe4DJgj2cRmVxJVXn11Vfp378/d911F88//zxNmtj3CWMiQSglgriA16dw2gze9Cac6GLPDoTm22+/pU+fPixbtowWLVrQt2+u7mrKmKiTZSJwHyQrpqqPhCmeqJL+7EB8heL27EAm5s+fT/fu3cmbNy+TJ08mOTmZPHm87P3cGHO2Mk0EIpJPVU+FOCxlzLJnBzKW3klc/fr1adeuHePGjaNy5crZb2iMCbusSgSf4bQHbBSRhcBc4Gj6QlWd73FsJgqdOH
GCZ555hq1btzJ79mxq1qzJm29aTaIxkSyUMnopYB/OuMI3ADe6P435nfXr19OkSRMGDRoEOEnBGBP5sioRlHPvGNrC/x4oS5frxg0OlTUQ/9Hx48d58sknee6557jwwgt5++23uemmm/wOyxgToqxKBHmBYu4UF/A6fYpJ1rncHx09epQZM2bQq1cvtm7daknAmCiTVYlgt6o+FbZIoog1EMOhQ4eYNGkSjzzyCGXKlGHbtm2ULl3a77CMMecgqxKBZLHMxLD33nuPhIQEBg4ceKaTOEsCxkSvrEoErcMWRQQLbBOA2G4X2LNnDw8++CCzZ88mISGBefPm0axZM7/DMsacp0xLBKq6P5yBRKrANgGI7XaBzp07M3fuXIYMGcKGDRssCRiTS5x1p3OxKJbbBH788UdKlChBsWLFGDt2LAULFqRevXp+h2WMyUH2rL/JkKry8ssvEx8fz+DBgwFo3LixJQFjciFLBOYP/v3vf9O6dWuSk5Np3Lgx/fr18zskY4yHLBFkYfa6naz7LraaSubNm0f9+vX54osvmDp1KsuXL6dGjRp+h2WM8ZC1EWQh/W6hWGgcTu8krkGDBrRv356xY8dSqVIlv8MyxoSBlQiy0ax6Kbo1q+J3GJ45ceIEQ4cO5bbbbkNVqVmzJnPnzrUkYEwMsUQQwz777DMaN27MkCFDyJcvn3USZ0yMskQQg44dO8bDDz9M8+bNOXDgAO+88w6zZs2yweONiVGWCGLQ8ePHef3110lOTiYlJYUbbrBexY2JZZ4mAhFpJyLbRWSHiAzIYHmSiGxyp7Ui0sDLeGLZwYMHGTFiBKdOnaJ06dJs27aNyZMnU7x4bHaXYYz5H88SgTve8UTgOiAe6Coi8UGrfQe0VNVLgWHAVK/iiWXvvPPOmQfDPv74YwAuuOACn6MyxkQKL0sETYEdqvqtqp4A5gAdAldQ1bWqesCd/RSIiFtVZq/byV+mfPK7Poai0Z49e+jatSs33XQTpUuXZt26dbRq1crvsIwxEcbLRFAR+CFgfpf7XmZ6Ae9ntEBEkkVkvYis37NnTw6GmLH0juaivYO5zp078+abb/LUU0+xfv16EhMT/Q7JGBOBvHygLKPxDDIc4lJErsZJBFdmtFxVp+JWGyUmJoZlmMxo7Whu165dlCxZkmLFijFu3DgKFixIQkKC32EZYyKYlyWCXUDlgPlKwE/BK4nIpcArQAdV3edhPLna6dOnmTJlCvHx8WcGj7/sssssCRhjsuVlIvgcqCki1UWkAHAbsDBwBRGpAswHuqvqNx7Gkqv961//4pprrqFv3740bdqU++67z++QjDFRxLOqIVU9JSL3AkuAvMB0Vd0qIn3d5S8Bg4HSwCQRATilqr5UZAeORBZNo5DNnTuXHj16ULBgQaZNm0bPnj1xP0tjjAmJp53OqeoiYFHQey8FvO4N9PYyhlAFNhBHQyNxeidxjRo1okOHDjz//PNcdNFFfodljIlC1vtogGhoIP7tt98YMWIE27Zt44033uCSSy5hzpw5fodljIli1sVEFPn000+57LLLGDZsGIULF7ZO4owxOcISQRQ4evQoDz30EFdccQWHDx9m0aJFzJw50zqJM8bkCEsEUSA1NZU5c+Zwzz33sHXrVq677jq/QzLG5CLWRhChfv31VyZMmMBjjz12ppO4kiVL+h2WMSYXshJBBFqwYAHx8fEMHTqUtWvXAlgSMMZ4xhJBBPnvf//Lrbfeys0330y5cuVYt24dLVq08DssY0wuZ1VDEaRLly589tlnDB8+nP/7v/8jf/78fodkjIkBlgh8tnPnTi644ALi4uIYP348BQsWJD4+eNgGY4zxjlUN+eT06dNMnDiRhIQEBg8eDECjRo0sCRhjws4SgQ+2b99Oy5Ytuffee2nevDkPPPCA3yEZY2JYTFcN+dHR3BtvvEGPHj0oXLgwr776KnfccYd1EmeM8VVMlwjSO5oDPO9oTtUZT6dx48Z06tSJbd
u2ceedd1oSMMb4LqZLBOB9R3OpqakMGzaMr7/+mnnz5lGjRg1mz57t2fGMMeZsxXSJwGtr166lUaNGjBw5kri4OOskzhgTkSwReODIkSPcf//9XHnllRw7dozFixczY8YM6yTOGBORLBF44MSJE8ybN49+/fqxZcsW2rZt63dIxhiTqZhvI8gp+/fvZ/z48TzxxBOUKlWKbdu2UaJECb/DMsaYbFmJIAe8+eabxMfHM3z48DOdxFkSMMZEC0sE52H37t107tyZLl26cNFFF7F+/XrrJM4YE3ViNhHMXreTdd/tP6993Hrrrbz33nuMGjWKzz77jIYNG+ZMcMYYE0Yx20aQ/kTx2T5E9v3331OqVCni4uKYMGEChQsXpnbt2l6EaIwxYRGzJQKAZtVL0a1ZlZDWPX36NBMmTCAhIYFBgwYB0LBhQ0sCxpioF7MlgrPx9ddf07t3b9asWUO7du146KGH/A7JGGNyTEyXCEIxZ84cGjRowLZt25g5cyaLFi2iatWqfodljDE5xhJBJk6fPg1AkyZNuOWWW0hJSaF79+7WSZwxJtexRBDk+PHjDBgwgM6dO6Oq1KhRg9dff53y5cv7HZoxxnjCEkGA1atX07BhQ0aPHk3p0qU5efKk3yEZY4znLBEAhw8fpl+/frRo0YKTJ0/ywQcf8Morr1CgQAG/QzPGGM/FzF1DgaORwe9HJDt58iQLFizgwQcfZPjw4RQtWtSvMI0xJuxiJhGkj0aWfvGvVbYwsnM9p041oVSpUnz99dfExcX5HKUxxoSfp1VDItJORLaLyA4RGZDBchGR8e7yTSJymZfxxFcozpzky+lSahcfP3ULbz7Tn08++QTAkoAxJmZ5lghEJC8wEbgOiAe6ikh80GrXATXdKRmY7FU84IwT0KlTJ2699VYqV67M+vXrueqqq7w8pDHGRDwvSwRNgR2q+q2qngDmAB2C1ukAzFTHp0BJEangVUBbU7ayePFinnnmGT799FMaNGjg1aGMMSZqeNlGUBH4IWB+F9AshHUqArsDVxKRZJwSA1WqhNY3ULD4i4pTLn8C9z30FbVq1TqnfRhjTG7kZSLI6BFcPYd1UNWpwFSAxMTEPywPxZM3JpzLZsYYk+t5WTW0C6gcMF8J+Okc1jHGGOMhLxPB50BNEakuIgWA24CFQessBHq4dw9dDhxU1d3BOzLGGOMdz6qGVPWUiNwLLAHyAtNVdauI9HWXvwQsAq4HdgDHgJ5exWOMMSZjnj5QpqqLcC72ge+9FPBagX5exmCMMSZr1teQMcbEOEsExhgT4ywRGGNMjLNEYIwxMU6c9troISJ7gO/PcfMywN4cDCca2DnHBjvn2HA+51xVVctmtCDqEsH5EJH1qprodxzhZOccG+ycY4NX52xVQ8YYE+MsERhjTIyLtUQw1e8AfGDnHBvsnGODJ+ccU20Exhhj/ijWSgTGGGOCWCIwxpgYlysTgYi0E5HtIrJDRAZksFxEZLy7fJOIXOZHnDkphHNOcs91k4isFZGoH6czu3MOWK+JiKSJSJdwxueFUM5ZRFqJyEYR2SoiK8MdY04L4W+7hIi8IyJfuecc1b0Yi8h0EflFRLZksjznr1+qmqsmnC6v/w1cDBQAvgLig9a5HngfZ4S0y4F1fscdhnO+ArjAfX1dLJxzwHof4vSC28XvuMPwey4JpABV3PlyfscdhnN+HBjtvi4L7AcK+B37eZxzC+AyYEsmy3P8+pUbSwRNgR2q+q2qngDmAB2C1ukAzFTHp0BJEakQ7kBzULbnrKprVfWAO/spzmhw0SyU3zPAfcCbwC/hDM4joZxzN2C+qu4EUNVoP+9QzlmBOBERoBhOIjgV3jBzjqquwjmHzOT49Ss3JoKKwA8B87vc9852nWhytufTC+cbRTTL9pxFpCJwM/ASuUMov+dawAUiskJEvhCRHmGLzhuhnPOLQF2cYW43Aw+o6unwhOeLHL9+eTowjU8kg/eC75ENZZ1oEv
L5iMjVOIngSk8j8l4o5zwOeFRV05wvi1EvlHPOBzQGWgOFgU9E5FNV/cbr4DwSyjm3BTYC1wA1gA9EZLWqHvI4Nr/k+PUrNyaCXUDlgPlKON8UznadaBLS+YjIpcArwHWqui9MsXkllHNOBOa4SaAMcL2InFLVBWGJMOeF+re9V1WPAkdFZBXQAIjWRBDKOfcERqlTgb5DRL4D6gCfhSfEsMvx61durBr6HKgpItVFpABwG7AwaJ2FQA+39f1y4KCq7g53oDko23MWkSrAfKB7FH87DJTtOatqdVWtpqrVgHnAPVGcBCC0v+23gatEJJ+IFAGaAdvCHGdOCuWcd+KUgBCR8kBt4NuwRhleOX79ynUlAlU9JSL3Aktw7jiYrqpbRaSvu/wlnDtIrgd2AMdwvlFErRDPeTBQGpjkfkM+pVHcc2OI55yrhHLOqrpNRBYDm4DTwCuqmuFtiNEgxN/zMGCGiGzGqTZ5VFWjtntqEfkH0AooIyK7gCeB/ODd9cu6mDDGmBiXG6uGjDHGnAVLBMYYE+MsERhjTIyzRGCMMTHOEoExxsQ4SwQmIrm9hW4MmKplse6RHDjeDBH5zj3WBhFpfg77eEVE4t3XjwctW3u+Mbr7Sf9ctrg9bpbMZv2GInJ9Thzb5F52+6iJSCJyRFWL5fS6WexjBvCuqs4TkWuBZ1X10vPY33nHlN1+ReQ14BtVHZHF+ncCiap6b07HYnIPKxGYqCAixURkufttfbOI/KGnURGpICKrAr4xX+W+f62IfOJuO1dEsrtArwIucbft7+5ri4g86L5XVETec/u/3yIif3HfXyEiiSIyCijsxjHLXXbE/fnPwG/obkmks4jkFZExIvK5OH3M9wnhY/kEt7MxEWkqzjgTX7o/a7tP4j4F/MWN5S9u7NPd43yZ0edoYpDffW/bZFNGE5CG05HYRuAtnKfgi7vLyuA8VZleoj3i/vwbMNB9nReIc9ddBRR1338UGJzB8WbgjlcA3AKsw+m8bTNQFKd7461AI6Az8HLAtiXcnytwvn2fiSlgnfQYbwZec18XwOlFsjCQDDzhvl8QWA9UzyDOIwHnNxdo584XB/K5r9sAb7qv7wReDNh+JHC7+7okTh9ERf3+fdvk75TrupgwucZxVW2YPiMi+YGRItICp+uEikB54OeAbT4HprvrLlDVjSLSEogH1rhdaxTA+SadkTEi8gSwB6eH1tbAW+p04IaIzAeuAhYDz4rIaJzqpNVncV7vA+NFpCDQDlilqsfd6qhL5X+jqJUAagLfBW1fWEQ2AtWAL4APAtZ/TURq4vREmT+T418L3CQiD7vzhYAqRHd/ROY8WSIw0SIJZ/Spxqp6UkT+g3MRO0NVV7mJoj3wdxEZAxwAPlDVriEc4xFVnZc+IyJtMlpJVb8RkcY4/b08LSJLVfWpUE5CVVNFZAVO18l/Af6RfjjgPlVdks0ujqtqQxEpAbwL9APG4/S385Gq3uw2rK/IZHsBOqvq9lDiNbHB2ghMtCgB/OImgauBqsEriEhVd52XgWk4w/19CvxJRNLr/IuISK0Qj7kK6OhuUxSnWme1iFwEHFPV14Fn3eMEO+mWTDIyB6ejsKtwOlPD/Xl3+jYiUss9ZoZU9SBwP/Cwu00J4Ed38Z0Bqx7GqSJLtwS4T9zikYg0yuwYJnZYIjDRYhaQKCLrcUoHX2ewTitgo4h8iVOP/4Kq7sG5MP5DRDbhJIY6oRxQVTfgtB18htNm8IqqfgnUBz5zq2gGAsMz2HwqsCm9sTjIUpxxaZepM/wiOONEpAAbxBm0fArZlNjdWL7C6Zr5GZzSyRqc9oN0HwHx6Y3FOCWH/G5sW9x5E+Ps9lFjjIlxViIwxpgYZ4nAGGNinCUCY4yJcZYIjDEmxlkiMMaYGGeJwBhjYpwlAmOMiXH/D95IOHfpsjmgAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"from matplotlib import pyplot as plt\n",
"# Evaluate model 2 with its ROC curve and the area under it (AUC).\n",
"# roc_auc_score is imported here so the cell also runs on a fresh kernel.\n",
"from sklearn.metrics import roc_auc_score, roc_curve\n",
"\n",
"# Column 1 of predict_proba: probability of the second class in model_2_svm.classes_\n",
"y_pred_prob_svm = model_2_svm.predict_proba(x_test_svm)[:,1]\n",
"fpr, tpr, threshold = roc_curve(y_test_svm, y_pred_prob_svm)\n",
"auc = roc_auc_score(y_test_svm, y_pred_prob_svm)\n",
"\n",
"# Dashed diagonal: reference line from (0, 0) to (1, 1)\n",
"plt.plot([0, 1], [0, 1], 'k--')\n",
"plt.plot(fpr,tpr)\n",
"plt.title(f'AUC: {auc}')\n",
"plt.xlabel('False Positive Rate')\n",
"plt.ylabel('True Positive Rate')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
ibm = pd.read_csv('/WA_Fn-UseC_-HR-Employee-Attrition.csv')
pd.set_option('display.max_columns', None)
ibm.shape
(1470, 35)
ibm.describe()
Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
mean | 36.923810 | 802.485714 | 9.192517 | 2.912925 | 1.0 | 1024.865306 | 2.721769 | 65.891156 | 2.729932 | 2.063946 | 2.728571 | 6502.931293 | 14313.103401 | 2.693197 | 15.209524 | 3.153741 | 2.712245 | 80.0 | 0.793878 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
std | 9.135373 | 403.509100 | 8.106864 | 1.024165 | 0.0 | 602.024335 | 1.093082 | 20.329428 | 0.711561 | 1.106940 | 1.102846 | 4707.956783 | 7117.786044 | 2.498009 | 3.659938 | 0.360824 | 1.081209 | 0.0 | 0.852077 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
min | 18.000000 | 102.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | 30.000000 | 1.000000 | 1.000000 | 1.000000 | 1009.000000 | 2094.000000 | 0.000000 | 11.000000 | 3.000000 | 1.000000 | 80.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 30.000000 | 465.000000 | 2.000000 | 2.000000 | 1.0 | 491.250000 | 2.000000 | 48.000000 | 2.000000 | 1.000000 | 2.000000 | 2911.000000 | 8047.000000 | 1.000000 | 12.000000 | 3.000000 | 2.000000 | 80.0 | 0.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
50% | 36.000000 | 802.000000 | 7.000000 | 3.000000 | 1.0 | 1020.500000 | 3.000000 | 66.000000 | 3.000000 | 2.000000 | 3.000000 | 4919.000000 | 14235.500000 | 2.000000 | 14.000000 | 3.000000 | 3.000000 | 80.0 | 1.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
75% | 43.000000 | 1157.000000 | 14.000000 | 4.000000 | 1.0 | 1555.750000 | 4.000000 | 83.750000 | 3.000000 | 3.000000 | 4.000000 | 8379.000000 | 20461.500000 | 4.000000 | 18.000000 | 3.000000 | 4.000000 | 80.0 | 1.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
max | 60.000000 | 1499.000000 | 29.000000 | 5.000000 | 1.0 | 2068.000000 | 4.000000 | 100.000000 | 4.000000 | 5.000000 | 4.000000 | 19999.000000 | 26999.000000 | 9.000000 | 25.000000 | 4.000000 | 4.000000 | 80.0 | 3.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
import statistics
# Print the most frequent value of every column (numeric and categorical alike).
for col in ibm.columns:
    print(col, " mode: ", statistics.mode(ibm[col]))
Age mode: 35
Attrition mode: No
BusinessTravel mode: Travel_Rarely
DailyRate mode: 691
Department mode: Research & Development
DistanceFromHome mode: 2
Education mode: 3
EducationField mode: Life Sciences
EmployeeCount mode: 1
EmployeeNumber mode: 1
EnvironmentSatisfaction mode: 3
Gender mode: Male
HourlyRate mode: 66
JobInvolvement mode: 3
JobLevel mode: 1
JobRole mode: Sales Executive
JobSatisfaction mode: 4
MaritalStatus mode: Married
MonthlyIncome mode: 2342
MonthlyRate mode: 9150
NumCompaniesWorked mode: 1
Over18 mode: Y
OverTime mode: No
PercentSalaryHike mode: 11
PerformanceRating mode: 3
RelationshipSatisfaction mode: 3
StandardHours mode: 80
StockOptionLevel mode: 0
TotalWorkingYears mode: 10
TrainingTimesLastYear mode: 2
WorkLifeBalance mode: 3
YearsAtCompany mode: 5
YearsInCurrentRole mode: 2
YearsSinceLastPromotion mode: 0
YearsWithCurrManager mode: 2
ibm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 1470 non-null int64
1 Attrition 1470 non-null object
2 BusinessTravel 1470 non-null object
3 DailyRate 1470 non-null int64
4 Department 1470 non-null object
5 DistanceFromHome 1470 non-null int64
6 Education 1470 non-null int64
7 EducationField 1470 non-null object
8 EmployeeCount 1470 non-null int64
9 EmployeeNumber 1470 non-null int64
10 EnvironmentSatisfaction 1470 non-null int64
11 Gender 1470 non-null object
12 HourlyRate 1470 non-null int64
13 JobInvolvement 1470 non-null int64
14 JobLevel 1470 non-null int64
15 JobRole 1470 non-null object
16 JobSatisfaction 1470 non-null int64
17 MaritalStatus 1470 non-null object
18 MonthlyIncome 1470 non-null int64
19 MonthlyRate 1470 non-null int64
20 NumCompaniesWorked 1470 non-null int64
21 Over18 1470 non-null object
22 OverTime 1470 non-null object
23 PercentSalaryHike 1470 non-null int64
24 PerformanceRating 1470 non-null int64
25 RelationshipSatisfaction 1470 non-null int64
26 StandardHours 1470 non-null int64
27 StockOptionLevel 1470 non-null int64
28 TotalWorkingYears 1470 non-null int64
29 TrainingTimesLastYear 1470 non-null int64
30 WorkLifeBalance 1470 non-null int64
31 YearsAtCompany 1470 non-null int64
32 YearsInCurrentRole 1470 non-null int64
33 YearsSinceLastPromotion 1470 non-null int64
34 YearsWithCurrManager 1470 non-null int64
dtypes: int64(26), object(9)
memory usage: 402.1+ KB
# Drop columns that are not used as features. EmployeeCount and StandardHours
# are constant (std == 0 in the ibm.describe() output); EmployeeNumber and
# Over18 are presumably a row identifier and a single-valued flag -- TODO confirm.
# errors='ignore' keeps this step re-runnable after the columns are already gone.
ibm.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'],
    inplace=True,
    errors='ignore',
)
# drop_duplicates() returns a new frame; without inplace=True the de-duplicated
# result was only displayed and never stored back into ibm.
ibm.drop_duplicates(inplace=True)
ibm
Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Yes | 11 | 3 | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | No | 23 | 4 | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Yes | 15 | 3 | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Yes | 11 | 3 | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | No | 12 | 3 | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1465 | 36 | No | Travel_Frequently | 884 | Research & Development | 23 | 2 | Medical | 3 | Male | 41 | 4 | 2 | Laboratory Technician | 4 | Married | 2571 | 12290 | 4 | No | 17 | 3 | 3 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
1466 | 39 | No | Travel_Rarely | 613 | Research & Development | 6 | 1 | Medical | 4 | Male | 42 | 2 | 3 | Healthcare Representative | 1 | Married | 9991 | 21457 | 4 | No | 15 | 3 | 1 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
1467 | 27 | No | Travel_Rarely | 155 | Research & Development | 4 | 3 | Life Sciences | 2 | Male | 87 | 4 | 2 | Manufacturing Director | 2 | Married | 6142 | 5174 | 1 | Yes | 20 | 4 | 2 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
1468 | 49 | No | Travel_Frequently | 1023 | Sales | 2 | 3 | Medical | 4 | Male | 63 | 2 | 2 | Sales Executive | 2 | Married | 5390 | 13243 | 2 | No | 14 | 3 | 4 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
1469 | 34 | No | Travel_Rarely | 628 | Research & Development | 8 | 3 | Medical | 2 | Male | 82 | 4 | 2 | Laboratory Technician | 3 | Married | 4404 | 10228 | 2 | No | 12 | 3 | 1 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 31 columns
ibm.isnull().sum()
Age 0 Attrition 0 BusinessTravel 0 DailyRate 0 Department 0 DistanceFromHome 0 Education 0 EducationField 0 EnvironmentSatisfaction 0 Gender 0 HourlyRate 0 JobInvolvement 0 JobLevel 0 JobRole 0 JobSatisfaction 0 MaritalStatus 0 MonthlyIncome 0 MonthlyRate 0 NumCompaniesWorked 0 OverTime 0 PercentSalaryHike 0 PerformanceRating 0 RelationshipSatisfaction 0 StockOptionLevel 0 TotalWorkingYears 0 TrainingTimesLastYear 0 WorkLifeBalance 0 YearsAtCompany 0 YearsInCurrentRole 0 YearsSinceLastPromotion 0 YearsWithCurrManager 0 dtype: int64
# Integer-encode two string columns with a single replace() call:
#   Attrition:      No -> 0, Yes -> 1
#   BusinessTravel: Non-Travel -> 0, Travel_Rarely -> 1, Travel_Frequently -> 2
ibm.replace(
    {
        'Attrition': {'Yes': 1, 'No': 0},
        'BusinessTravel': {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2},
    },
    inplace=True,
)
# Department: one-hot encode into three indicator columns inserted at
# positions 5-7, then drop the original string column.
# dtype=int pins the indicators to 0/1 integers (as shown in this notebook's
# outputs); pandas >= 2.0 would otherwise return booleans.
dummy = pd.get_dummies(ibm['Department'], dtype=int)
# NOTE(review): 'Dp_Sales&Development' holds the 'Research & Development'
# indicator. The name looks like a slip for 'Dp_Research&Development'; it is
# kept unchanged here because other cells may reference it.
ibm.insert(5,'Dp_Sales&Development', dummy['Research & Development'])
ibm.insert(6,'Dp_Sales', dummy['Sales'])
ibm.insert(7,'Dp_HumanResources', dummy['Human Resources'])
ibm.drop(columns = 'Department', inplace = True)
# EducationField: one-hot encode into six indicator columns inserted at
# positions 11-16, then drop the original string column.
# dtype=int pins the indicators to 0/1 integers (as shown in this notebook's
# outputs); pandas >= 2.0 would otherwise return booleans.
dummy = pd.get_dummies(ibm['EducationField'], dtype=int)
ibm.insert(11,'EF_Life Sciences',dummy['Life Sciences'])
ibm.insert(12,'EF_Medical',dummy['Medical'])
ibm.insert(13,'EF_Marketing',dummy['Marketing'])
ibm.insert(14,'EF_TechnicalDegree',dummy['Technical Degree'])
ibm.insert(15,'EF_HumanResources',dummy['Human Resources'])
ibm.insert(16,'EF_Other',dummy['Other'])
ibm.drop(columns = 'EducationField', inplace = True)
# Gender: binary encode (Male -> 0, Female -> 1).
ibm.replace({'Gender': {'Male': 0, 'Female': 1}}, inplace = True)
# JobRole: one-hot encode into nine indicator columns inserted at positions
# 23-31, then drop the original string column.
# dtype=int pins the indicators to 0/1 integers (as shown in this notebook's
# outputs); pandas >= 2.0 would otherwise return booleans.
dummy = pd.get_dummies(ibm['JobRole'], dtype=int)
# NOTE(review): 'JR_HealthcareRepresentive' is misspelled ("Representative");
# the name is kept unchanged because other cells may reference it.
ibm.insert(23, 'JR_HealthcareRepresentive', dummy['Healthcare Representative'])
ibm.insert(24, 'JR_HumanResource', dummy['Human Resources'])
ibm.insert(25, 'JR_LaboratoryTechnician', dummy['Laboratory Technician'])
ibm.insert(26, 'JR_Manager', dummy['Manager'])
ibm.insert(27, 'JR_ManufacturingDirector', dummy['Manufacturing Director'])
ibm.insert(28, 'JR_ResearchDirector', dummy['Research Director'])
ibm.insert(29, 'JR_ResearchScientist', dummy['Research Scientist'])
ibm.insert(30, 'JR_SalesExecutive', dummy['Sales Executive'])
ibm.insert(31, 'JR_SalesRepresentative', dummy['Sales Representative'])
ibm.drop(columns = 'JobRole', inplace = True)
# MaritalStatus: one-hot encode into three indicator columns inserted at
# positions 34-36, then drop the original string column.
# dtype=int pins the indicators to 0/1 integers (as shown in this notebook's
# outputs); pandas >= 2.0 would otherwise return booleans.
dummy = pd.get_dummies(ibm['MaritalStatus'], dtype=int)
ibm.insert(34, 'MS_Married', dummy['Married'])
ibm.insert(35, 'MS_Single', dummy['Single'])
ibm.insert(36, 'MS_Divorced', dummy['Divorced'])
ibm.drop(columns = 'MaritalStatus', inplace = True)
# OverTime: binary encode (No -> 0, Yes -> 1).
ibm.replace({'OverTime': {'No': 0, 'Yes': 1}}, inplace = True)
# The former Over18 encoding step was removed: the Over18 column is dropped
# before this point, so that replace() matched no column and did nothing.
def iqr_outliers(data, factor=1.5, verbose=True):
    """Return the values of ``data`` that lie outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``data``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to screen.
    factor : float, default 1.5
        Multiple of the inter-quartile range used to place the fences.
    verbose : bool, default True
        When true, print the outlier list and its length (the original
        behaviour of this function).

    Returns
    -------
    list
        The outlying values in their original order, duplicates included.
    """
    first_quartile = data.quantile(0.25)
    third_quartile = data.quantile(0.75)
    iqr = third_quartile - first_quartile
    lower_bound = first_quartile - factor * iqr
    upper_bound = third_quartile + factor * iqr
    # Strict inequalities: a value sitting exactly on a fence is not flagged.
    out = data[(data > upper_bound) | (data < lower_bound)].tolist()
    if verbose:
        print("Outliers:",out , "\nCount: ", len(out), "\n")
    return out
# Run the IQR screen on every column and print what it flags.
# NOTE: 0/1 indicator and low-cardinality columns are screened too, so a rare
# category is reported as "outliers" (e.g. every Attrition == 1 row); only the
# continuous columns give a meaningful result.
for c_name in ibm.columns:
    print(c_name)
    iqr_outliers(ibm[c_name])
Age
Outliers: []
Count: 0
Attrition
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 237
BusinessTravel
Outliers: [2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2]
Count: 427
DailyRate
Outliers: []
Count: 0
Dp_Sales&Development
Outliers: []
Count: 0
Dp_Sales
Outliers: []
Count: 0
Dp_HumanResources
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 63
DistanceFromHome
Outliers: []
Count: 0
Education
Outliers: []
Count: 0
EnvironmentSatisfaction
Outliers: []
Count: 0
EF_Life Sciences
Outliers: []
Count: 0
EF_Medical
Outliers: []
Count: 0
EF_Marketing
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 159
EF_TechnicalDegree
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 132
EF_HumanResources
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 27
EF_Other
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 82
Gender
Outliers: []
Count: 0
HourlyRate
Outliers: []
Count: 0
JobInvolvement
Outliers: []
Count: 0
JobLevel
Outliers: []
Count: 0
JobSatisfaction
Outliers: []
Count: 0
JR_HealthcareRepresentive
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 131
JR_HumanResource
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 52
JR_LaboratoryTechnician
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 259
JR_Manager
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 102
JR_ManufacturingDirector
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 145
JR_ResearchDirector
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 80
JR_ResearchScientist
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 292
JR_SalesExecutive
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 326
JR_SalesRepresentative
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 83
MonthlyIncome
Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880]
Count: 114
MonthlyRate
Outliers: []
Count: 0
NumCompaniesWorked
Outliers: [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
Count: 52
MS_Married
Outliers: []
Count: 0
MS_Single
Outliers: []
Count: 0
MS_Divorced
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Count: 327
OverTime
Outliers: []
Count: 0
PercentSalaryHike
Outliers: []
Count: 0
PerformanceRating
Outliers: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
Count: 226
RelationshipSatisfaction
Outliers: []
Count: 0
StockOptionLevel
Outliers: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Count: 85
TotalWorkingYears
Outliers: [31, 29, 37, 38, 30, 40, 36, 34, 32, 33, 37, 30, 36, 31, 33, 32, 37, 31, 32, 32, 30, 34, 30, 40, 29, 35, 31, 33, 31, 29, 32, 30, 33, 30, 29, 31, 32, 33, 36, 34, 31, 36, 33, 31, 29, 33, 29, 32, 31, 35, 29, 32, 34, 36, 32, 30, 36, 29, 34, 37, 29, 29, 35]
Count: 63
TrainingTimesLastYear
Outliers: [0, 5, 5, 5, 6, 5, 5, 5, 6, 6, 0, 0, 0, 5, 0, 5, 5, 5, 6, 6, 5, 0, 6, 5, 5, 0, 5, 5, 6, 5, 5, 5, 0, 5, 5, 5, 5, 6, 6, 5, 5, 5, 5, 0, 0, 5, 5, 5, 6, 6, 5, 0, 5, 0, 5, 5, 0, 6, 0, 5, 5, 6, 6, 5, 6, 5, 0, 5, 5, 5, 5, 0, 6, 5, 5, 5, 5, 6, 5, 5, 6, 5, 5, 5, 0, 5, 0, 5, 5, 6, 5, 6, 5, 0, 5, 5, 0, 6, 6, 5, 6, 0, 5, 0, 6, 6, 6, 6, 5, 5, 0, 5, 0, 0, 6, 0, 6, 5, 6, 5, 5, 0, 5, 6, 6, 5, 5, 0, 0, 6, 0, 0, 5, 0, 5, 6, 5, 5, 6, 6, 5, 5, 5, 5, 5, 6, 5, 6, 6, 0, 6, 6, 5, 5, 0, 0, 6, 6, 0, 5, 0, 0, 0, 0, 0, 5, 5, 6, 5, 5, 0, 5, 5, 0, 5, 5, 6, 5, 5, 5, 6, 5, 5, 5, 0, 0, 5, 5, 5, 5, 6, 0, 0, 6, 6, 6, 6, 5, 5, 5, 6, 5, 0, 5, 5, 6, 5, 6, 6, 5, 6, 6, 5, 0, 5, 5, 5, 5, 5, 0, 0, 0, 6, 5, 6, 6, 5, 6, 0, 6, 6, 5, 6, 6, 5, 5, 5, 0]
Count: 238
WorkLifeBalance
Outliers: []
Count: 0
YearsAtCompany
Outliers: [25, 22, 22, 27, 21, 22, 37, 25, 20, 40, 20, 24, 20, 24, 33, 20, 19, 22, 33, 24, 19, 21, 20, 36, 20, 20, 22, 24, 21, 21, 25, 21, 29, 20, 27, 20, 31, 32, 20, 20, 21, 22, 22, 34, 24, 26, 31, 20, 31, 26, 19, 21, 21, 32, 21, 19, 20, 22, 20, 21, 26, 20, 22, 24, 33, 29, 25, 21, 19, 19, 20, 19, 33, 19, 19, 20, 20, 20, 20, 20, 32, 20, 21, 33, 36, 26, 30, 22, 23, 23, 21, 21, 22, 22, 19, 22, 19, 22, 20, 20, 20, 22, 20, 20]
Count: 104
YearsInCurrentRole
Outliers: [15, 16, 18, 15, 18, 17, 16, 15, 16, 15, 16, 16, 15, 16, 17, 15, 15, 15, 17, 17, 16]
Count: 21
YearsSinceLastPromotion
Outliers: [8, 15, 8, 8, 9, 13, 12, 10, 11, 9, 12, 15, 15, 15, 9, 11, 11, 9, 12, 11, 15, 11, 10, 9, 11, 9, 8, 11, 11, 8, 13, 9, 9, 12, 10, 11, 15, 13, 9, 11, 10, 8, 8, 11, 9, 11, 12, 11, 14, 13, 14, 8, 11, 15, 10, 11, 11, 15, 11, 13, 11, 13, 15, 8, 13, 15, 11, 14, 15, 15, 9, 11, 9, 8, 9, 15, 11, 12, 9, 8, 10, 14, 8, 13, 13, 12, 14, 8, 8, 8, 14, 14, 8, 12, 13, 14, 14, 12, 11, 8, 11, 9, 12, 8, 9, 11, 9]
Count: 107
YearsWithCurrManager
Outliers: [17, 15, 15, 15, 15, 17, 16, 17, 15, 17, 17, 17, 17, 16]
Count: 14
def remove_outliers(c_name):
    """Drop IQR outliers of column ``c_name`` from the global ``ibm`` frame.

    The rows are removed in place. After each pass the quartiles are
    recomputed on the remaining rows and the screen is repeated until
    ``iqr_outliers`` reports nothing, so more rows are removed than a single
    IQR pass would remove (for 'MonthlyIncome' the recorded passes flag
    114, 73, 17, 2 and 1 values).

    Parameters
    ----------
    c_name : str
        Name of the column in ``ibm`` to screen.
    """
    outliers = iqr_outliers(ibm[c_name])
    while outliers:
        # Remove every row holding a flagged value in one drop() call rather
        # than issuing one drop per outlier value.
        ibm.drop(ibm.loc[ibm[c_name].isin(outliers)].index, inplace=True)
        outliers = iqr_outliers(ibm[c_name])
remove_outliers('MonthlyIncome')
Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880]
Count: 114
Outliers: [15427, 13458, 14756, 13245, 13664, 13503, 13549, 13872, 13734, 13591, 16064, 13675, 13496, 13603, 13525, 16015, 13964, 15992, 14336, 13212, 16555, 14118, 13610, 13237, 16184, 15402, 14814, 13770, 16307, 13826, 14275, 13582, 14852, 13194, 13973, 13726, 13320, 13120, 13499, 13758, 13191, 16124, 13577, 14026, 13142, 13695, 13402, 13247, 14732, 16422, 13757, 16032, 16328, 14411, 16437, 15202, 16413, 13269, 13966, 15972, 15379, 12936, 12965, 13116, 13464, 16291, 15787, 13225, 13348, 13341, 13206, 13744, 13570]
Count: 73
Outliers: [11994, 12490, 12185, 11849, 11996, 12061, 11878, 12504, 11935, 12808, 11836, 12742, 11904, 12169, 11916, 11957, 12031]
Count: 17
Outliers: [11713, 11691]
Count: 2
Outliers: [11631]
Count: 1
Outliers: []
Count: 0
ibm
Age | Attrition | BusinessTravel | DailyRate | Dp_Sales&Development | Dp_Sales | Dp_HumanResources | DistanceFromHome | Education | EnvironmentSatisfaction | EF_Life Sciences | EF_Medical | EF_Marketing | EF_TechnicalDegree | EF_HumanResources | EF_Other | Gender | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | JR_HealthcareRepresentive | JR_HumanResource | JR_LaboratoryTechnician | JR_Manager | JR_ManufacturingDirector | JR_ResearchDirector | JR_ResearchScientist | JR_SalesExecutive | JR_SalesRepresentative | MonthlyIncome | MonthlyRate | NumCompaniesWorked | MS_Married | MS_Single | MS_Divorced | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 41 | 1 | 1 | 1102 | 0 | 1 | 0 | 1 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 94 | 3 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 5993 | 19479 | 8 | 0 | 1 | 0 | 1 | 11 | 3 | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 49 | 0 | 2 | 279 | 1 | 0 | 0 | 8 | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 61 | 2 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 5130 | 24907 | 1 | 1 | 0 | 0 | 0 | 23 | 4 | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 37 | 1 | 1 | 1373 | 1 | 0 | 0 | 2 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 92 | 2 | 1 | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2090 | 2396 | 6 | 0 | 1 | 0 | 1 | 15 | 3 | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 33 | 0 | 2 | 1392 | 1 | 0 | 0 | 3 | 4 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 56 | 3 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2909 | 23159 | 1 | 1 | 0 | 0 | 1 | 11 | 3 | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 27 | 0 | 1 | 591 | 1 | 0 | 0 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 40 | 3 | 1 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3468 | 16632 | 9 | 1 | 0 | 0 | 0 | 12 | 3 | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1465 | 36 | 0 | 2 | 884 | 1 | 0 | 0 | 23 | 2 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 41 | 4 | 2 | 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2571 | 12290 | 4 | 1 | 0 | 0 | 0 | 17 | 3 | 3 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
1466 | 39 | 0 | 1 | 613 | 1 | 0 | 0 | 6 | 1 | 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 42 | 2 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9991 | 21457 | 4 | 1 | 0 | 0 | 0 | 15 | 3 | 1 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
1467 | 27 | 0 | 1 | 155 | 1 | 0 | 0 | 4 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 87 | 4 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6142 | 5174 | 1 | 1 | 0 | 0 | 1 | 20 | 4 | 2 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
1468 | 49 | 0 | 2 | 1023 | 0 | 1 | 0 | 2 | 3 | 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 63 | 2 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 5390 | 13243 | 2 | 1 | 0 | 0 | 0 | 14 | 3 | 4 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
1469 | 34 | 0 | 1 | 628 | 1 | 0 | 0 | 8 | 3 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 82 | 4 | 2 | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4404 | 10228 | 2 | 1 | 0 | 0 | 0 | 12 | 3 | 1 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1263 rows × 48 columns
# import important library to do SVM
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Target for the SVM: employee attrition, i.e. whether an employee stays at or leaves IBM
y_svm = ibm['Attrition']
# Candidate features: every remaining column except the target
x_svm_find = ibm.drop('Attrition', axis=1)
# Rank the candidate features with SelectKBest and the chi-squared (chi²) test
# (chi² is a statistical test for non-negative features) to find the top 15
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Fit the selector on all candidate features against the attrition target
best_15_features = SelectKBest(score_func=chi2, k=15)
fit = best_15_features.fit(x_svm_find, y_svm)
# One frame with the chi² scores, one with the matching feature names
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x_svm_find.columns)
# Put names and scores side by side and label the two columns
top_15_feature_scores = pd.concat([dfcolumns, dfscores], axis='columns')
top_15_feature_scores.columns = ['Features', 'Score']
# Show the 15 highest-scoring features
print(top_15_feature_scores.nlargest(15, 'Score'))
Features Score
29 MonthlyIncome 26471.159476
30 MonthlyRate 1308.443569
2 DailyRate 1111.594737
44 YearsInCurrentRole 109.263859
43 YearsAtCompany 103.805057
46 YearsWithCurrManager 100.636711
40 TotalWorkingYears 95.843571
35 OverTime 60.367656
6 DistanceFromHome 57.197704
0 Age 46.705340
28 JR_SalesRepresentative 27.299127
33 MS_Single 26.251695
39 StockOptionLevel 24.376114
20 JR_HealthcareRepresentive 10.935616
24 JR_ManufacturingDirector 9.987076
# Set up the data for the SVM using the top 15 features identified by SelectKBest,
# listed in descending order of their chi² score.
# The original code inserted 'JR_HealthcareRepresentive ' with a trailing space in
# the column label; selecting by the real column names keeps the labels identical
# to those in `ibm`.
svm_feature_names = [
    'MonthlyIncome',
    'MonthlyRate',
    'DailyRate',
    'YearsInCurrentRole',
    'YearsAtCompany',
    'YearsWithCurrManager',
    'TotalWorkingYears',
    'OverTime',
    'DistanceFromHome',
    'Age',
    'JR_SalesRepresentative',
    'MS_Single',
    'StockOptionLevel',
    'JR_HealthcareRepresentive',
    'JR_ManufacturingDirector',
]
# .copy() so the feature frame is independent of `ibm`
ibm_svm_features_df = ibm[svm_feature_names].copy()
ibm_svm_features_df
MonthlyIncome | MonthlyRate | DailyRate | YearsInCurrentRole | YearsAtCompany | YearsWithCurrManager | TotalWorkingYears | OverTime | DistanceFromHome | Age | JR_SalesRepresentative | MS_Single | StockOptionLevel | JR_HealthcareRepresentive | JR_ManufacturingDirector | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5993 | 19479 | 1102 | 4 | 6 | 5 | 8 | 1 | 1 | 41 | 0 | 1 | 0 | 0 | 0 |
1 | 5130 | 24907 | 279 | 7 | 10 | 7 | 10 | 0 | 8 | 49 | 0 | 0 | 1 | 0 | 0 |
2 | 2090 | 2396 | 1373 | 0 | 0 | 0 | 7 | 1 | 2 | 37 | 0 | 1 | 0 | 0 | 0 |
3 | 2909 | 23159 | 1392 | 7 | 8 | 0 | 8 | 1 | 3 | 33 | 0 | 0 | 0 | 0 | 0 |
4 | 3468 | 16632 | 591 | 2 | 2 | 2 | 6 | 0 | 2 | 27 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1465 | 2571 | 12290 | 884 | 2 | 5 | 3 | 17 | 0 | 23 | 36 | 0 | 0 | 1 | 0 | 0 |
1466 | 9991 | 21457 | 613 | 7 | 7 | 7 | 9 | 0 | 6 | 39 | 0 | 0 | 1 | 1 | 0 |
1467 | 6142 | 5174 | 155 | 2 | 6 | 3 | 6 | 1 | 4 | 27 | 0 | 0 | 1 | 0 | 1 |
1468 | 5390 | 13243 | 1023 | 6 | 9 | 8 | 17 | 0 | 2 | 49 | 0 | 0 | 0 | 0 | 0 |
1469 | 4404 | 10228 | 628 | 3 | 4 | 2 | 6 | 0 | 8 | 34 | 0 | 0 | 0 | 0 | 0 |
1263 rows × 15 columns
# Use the 15 selected features as the SVM input matrix
x_svm = ibm_svm_features_df
# Standardize every feature so that large-valued columns (e.g. MonthlyIncome)
# do not dominate the SVM kernel computations
from sklearn.preprocessing import StandardScaler
s_scaler = StandardScaler()
x_scaled_svm = s_scaler.fit_transform(x_svm)
# Attrition has far fewer 'yes' (1) rows than 'no' (0) rows, so under-sample the
# majority class by removing its Tomek-link members.
# The scaled matrix is resampled here; the original code passed the unscaled
# x_svm, which left x_scaled_svm unused and trained the SVMs on raw values.
from imblearn.under_sampling import TomekLinks
tl_svm = TomekLinks(sampling_strategy='not minority')
x_tl_svm, y_tl_svm = tl_svm.fit_resample(x_scaled_svm, y_svm)
# NOTE(review): the scaler and the resampler are both fitted before the split, so
# the test rows influence the preprocessing. Consider splitting first and fitting
# them on the training part only.
# Train the models on 80% of the data and test on the remaining 20% (stratified)
x_train_svm, x_test_svm, y_train_svm, y_test_svm = train_test_split(
    x_tl_svm, y_tl_svm, test_size=0.2, random_state=40, stratify=y_tl_svm)
# Model 1: SVM whose hyperparameters were chosen by manual tuning
model_1_svm = svm.SVC(
    C=2,
    kernel='sigmoid',
    gamma='scale',
    coef0=0.6,
    random_state=40,
    probability=True,
)
# Train on the training split, then predict the held-out test split
model_1_svm.fit(x_train_svm, y_train_svm)
y_predict_1_svm = model_1_svm.predict(x_test_svm)
# Model 2 uses GridSearchCV to find the best SVM hyperparameters by 4-fold cross validation
# Only some hyperparameters are tuned
# import GridSearchCV library
from sklearn.model_selection import GridSearchCV
# Hyperparameters searched:
#  - kernel type: linear / rbf / sigmoid
#  - C, the regularization parameter: 1.0 to 2.9 in steps of 0.1
#  - gamma, the kernel coefficient (not used by the linear kernel): auto / scale
#  - coef0, the independent term of the kernel (of the kernels searched here it
#    only matters for sigmoid): 0.0 to 1.4 in steps of 0.1
# 'degree' is not searched: it only affects the 'poly' kernel, which is not in the
# grid. One sub-grid per kernel avoids crossing a kernel with parameters it
# ignores (the full cross product gave 5400 candidates, most of them duplicates).
# np.round removes the floating-point drift of np.arange (e.g. 2.8000000000000016).
c_values = np.round(np.arange(1.0, 3.0, 0.1), 1).tolist()
coef0_values = np.round(np.arange(0.0, 1.5, 0.1), 1).tolist()
gamma_values = ['auto', 'scale']
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
    {'kernel': ['sigmoid'], 'C': c_values, 'gamma': gamma_values, 'coef0': coef0_values},
]
# set random state to 40
find_best_para_model = svm.SVC(random_state=40)
Grid_search_svm = GridSearchCV(find_best_para_model, param_grid, n_jobs=-1, verbose=2, cv=4)
# this may take some time to run
Grid_search_svm.fit(x_train_svm, y_train_svm)
Fitting 4 folds for each of 5400 candidates, totalling 21600 fits
# Show the best hyperparameter combination found by the grid search
Grid_search_svm.best_params_
{'C': 2.8000000000000016, 'coef0': 0.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
# Build model 2 from the hyperparameters found by the grid search.
# Reading them from Grid_search_svm.best_params_ (instead of copying the values by
# hand from a previous run's output) keeps the model in sync when the notebook is
# re-run. probability=True is needed later for predict_proba (ROC curve).
model_2_svm = svm.SVC(**Grid_search_svm.best_params_, probability=True, random_state=40)
model_2_svm.fit(x_train_svm, y_train_svm)
y_predict_2_svm = model_2_svm.predict(x_test_svm)
# Evaluate the accuracy of both models on the held-out test split
accuracy_model_1 = metrics.accuracy_score(y_test_svm, y_predict_1_svm)
accuracy_model_2 = metrics.accuracy_score(y_test_svm, y_predict_2_svm)
print('Accuracy of prediction classification result for 2 model')
print('Hyperparameters that try to tune manually (model 1): ', accuracy_model_1)
print('Best hyperparameters found using GridSearchCV (model 2): ', accuracy_model_2)
Accuracy of prediction classification result for 2 model
Hyperparameters that try to tune manually (model 1): 0.7416666666666667
Best hyperparameters found using GridSearchCV (model 2): 0.8166666666666667
# Evaluate the classification result of model 2 with a confusion matrix
# (rows = true class, columns = predicted class, in the order given by `labels`)
from sklearn.metrics import confusion_matrix
# `labels` must be passed by keyword: passing it positionally triggers a
# FutureWarning on scikit-learn 0.24 and an error from version 1.0 onwards
print(confusion_matrix(y_test_svm, y_predict_2_svm, labels=[0, 1]))
# Evaluate the classification result by precision, recall and F1-measure
from sklearn.metrics import classification_report
print(classification_report(y_test_svm, y_predict_2_svm))
[[183 12]
[ 32 13]]
precision recall f1-score support
0 0.85 0.94 0.89 195
1 0.52 0.29 0.37 45
accuracy 0.82 240
macro avg 0.69 0.61 0.63 240
weighted avg 0.79 0.82 0.79 240
C:\Users\USER\anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass labels=[0, 1] as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
warnings.warn(f"Pass {args_msg} as keyword args. From version "
from matplotlib import pyplot as plt
# Evaluate the classification result of model 2 with a ROC curve
# roc_auc_score is imported here as well, so this cell does not rely on an import
# made by an earlier cell of the notebook
from sklearn.metrics import roc_curve, roc_auc_score
# Predicted probability of the positive class (second column of predict_proba)
y_pred_prob_svm = model_2_svm.predict_proba(x_test_svm)[:,1]
fpr, tpr, threshold = roc_curve(y_test_svm, y_pred_prob_svm)
# Dashed diagonal as the reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr,tpr)
auc = roc_auc_score(y_test_svm, y_pred_prob_svm)
plt.title(f'AUC: {auc}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()
© Alger 2022