In [1]:
# Data manipulation
import numpy as np
import pandas as pd
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Displaying all of the columns in dataframes
pd.set_option('display.max_columns', None)
# Data modeling
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree
# Saving models
import pickle
In [2]:
df0 = pd.read_csv("Data/HR_comma_sep.csv")
df0.head()
Out[2]:
satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years | Department | salary | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
In [3]:
df0.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64
 3   average_montly_hours   14999 non-null  int64
 4   time_spend_company     14999 non-null  int64
 5   Work_accident          14999 non-null  int64
 6   left                   14999 non-null  int64
 7   promotion_last_5years  14999 non-null  int64
 8   Department             14999 non-null  object
 9   salary                 14999 non-null  object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
In [4]:
df0.describe()
Out[4]:
satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years | |
---|---|---|---|---|---|---|---|---|
count | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 |
mean | 0.612834 | 0.716102 | 3.803054 | 201.050337 | 3.498233 | 0.144610 | 0.238083 | 0.021268 |
std | 0.248631 | 0.171169 | 1.232592 | 49.943099 | 1.460136 | 0.351719 | 0.425924 | 0.144281 |
min | 0.090000 | 0.360000 | 2.000000 | 96.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.440000 | 0.560000 | 3.000000 | 156.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 0.640000 | 0.720000 | 4.000000 | 200.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 0.820000 | 0.870000 | 5.000000 | 245.000000 | 4.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 1.000000 | 1.000000 | 7.000000 | 310.000000 | 10.000000 | 1.000000 | 1.000000 | 1.000000 |
In [5]:
df0.columns
Out[5]:
Index(['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'left', 'promotion_last_5years', 'Department', 'salary'], dtype='object')
In [6]:
df0 = df0.rename(columns={'Work_accident': 'work_accident',
'average_montly_hours': 'average_monthly_hours',
'time_spend_company': 'tenure',
'Department': 'department'})
df0.columns
Out[6]:
Index(['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours', 'tenure', 'work_accident', 'left', 'promotion_last_5years', 'department', 'salary'], dtype='object')
In [7]:
df0.isna().sum()
Out[7]:
satisfaction_level       0
last_evaluation          0
number_project           0
average_monthly_hours    0
tenure                   0
work_accident            0
left                     0
promotion_last_5years    0
department               0
salary                   0
dtype: int64
In [8]:
df0.duplicated().sum()
Out[8]:
3008
In [9]:
# Inspect the first few duplicated rows
df0[df0.duplicated()].head()
Out[9]:
satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | left | promotion_last_5years | department | salary | |
---|---|---|---|---|---|---|---|---|---|---|
396 | 0.46 | 0.57 | 2 | 139 | 3 | 0 | 1 | 0 | sales | low |
866 | 0.41 | 0.46 | 2 | 128 | 3 | 0 | 1 | 0 | accounting | low |
1317 | 0.37 | 0.51 | 2 | 127 | 3 | 0 | 1 | 0 | sales | medium |
1368 | 0.41 | 0.52 | 2 | 132 | 3 | 0 | 1 | 0 | RandD | low |
1461 | 0.42 | 0.53 | 2 | 142 | 3 | 0 | 1 | 0 | sales | low |
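With no employee ID in the data, some of these 3,008 duplicates could in principle be distinct people with identical records, though 10 matching columns makes that unlikely. One way to eyeball the duplicate groups is to keep every member of each group with keep=False — a quick sketch, not an executed cell:

# View all rows belonging to a duplicate group, sorted so identical rows sit together
dupe_groups = df0[df0.duplicated(keep=False)].sort_values(by=list(df0.columns))
dupe_groups.head(10)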
In [10]:
df1 = df0.drop_duplicates(keep='first')
df1.head()
Out[10]:
satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | left | promotion_last_5years | department | salary | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
In [11]:
# Verify that no duplicates remain
df1[df1.duplicated()].head()
Out[11]:
satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | left | promotion_last_5years | department | salary |
---|
In [12]:
# Creating a boxplot to visualize distribution of `tenure` and detect any outliers
plt.figure(figsize=(6,6))
plt.title('Boxplot to detect outliers for tenure', fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sns.boxplot(x=df1['tenure'])
plt.show()
In [13]:
# Compute the 25th percentile value in `tenure`
percentile25 = df1['tenure'].quantile(0.25)
# Compute the 75th percentile
percentile75 = df1['tenure'].quantile(0.75)
# Compute the interquartile range (IQR)
iqr = percentile75 - percentile25
# Define the upper limit and lower limit for non-outlier values in `tenure`
upper_limit = percentile75 + 1.5 * iqr
lower_limit = percentile25 - 1.5 * iqr
print("Lower limit:", lower_limit)
print("Upper limit:", upper_limit)
# Identify subset of data containing outliers in `tenure`
outliers = df1[(df1['tenure'] > upper_limit) | (df1['tenure'] < lower_limit)]
# Count how many rows in the data contain outliers in `tenure`
print("Number of rows in the data containing outliers in `tenure`:", len(outliers))
Lower limit: 1.5
Upper limit: 5.5
Number of rows in the data containing outliers in `tenure`: 824
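The same 1.5 × IQR rule applies to any numeric column, so if other features needed the same check, the logic above could be wrapped in a small helper — a sketch, with iqr_outliers as an illustrative name:

def iqr_outliers(series, k=1.5):
    # Return values falling outside [Q1 - k*IQR, Q3 + k*IQR]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return series[(series < q1 - k * iqr) | (series > q3 + k * iqr)]

# Example: reproduces the count above
print(len(iqr_outliers(df1['tenure'])))  # 824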
In [14]:
#Analyze Stage
In [15]:
print(df1['left'].value_counts())
print()
print(df1['left'].value_counts(normalize=True))
left
0    10000
1     1991
Name: count, dtype: int64

left
0    0.833959
1    0.166041
Name: proportion, dtype: float64
In [16]:
# Visualizations
fig, ax = plt.subplots(1, 2, figsize=(22, 8))
# Create boxplot showing `average_monthly_hours` distributions for `number_project`, comparing employees who stayed versus those who left
sns.boxplot(data=df1, x='average_monthly_hours', y='number_project', hue='left', orient="h", ax=ax[0])
ax[0].invert_yaxis()
ax[0].set_title('Monthly hours by number of projects', fontsize='14')
# Create histogram showing distribution of `number_project`, comparing employees who stayed versus those who left
sns.histplot(data=df1, x='number_project', hue='left', multiple='dodge',shrink=2, ax=ax[1])
ax[1].set_title('Number of projects histogram', fontsize='14')
# Display the plots
plt.show()
In [17]:
# Get value counts of stayed/left for employees with 7 projects
df1[df1['number_project']==7]['left'].value_counts()
Out[17]:
left
1    145
Name: count, dtype: int64
In [18]:
plt.figure(figsize=(16, 9))
sns.scatterplot(data=df1, x='average_monthly_hours', y='satisfaction_level', hue='left', alpha=0.4)
plt.axvline(x=166.67, color='#ff6361', label='166.67 hrs./mo.', ls='--')
plt.legend(labels=['166.67 hrs./mo.', 'left', 'stayed'])
plt.title('Monthly hours by satisfaction level', fontsize='14');
In [19]:
fig, ax = plt.subplots(1, 2, figsize = (22,8))
# Create boxplot showing distributions of `satisfaction_level` by tenure, comparing employees who stayed versus those who left
sns.boxplot(data=df1, x='satisfaction_level', y='tenure', hue='left',orient="h", ax=ax[0])
ax[0].invert_yaxis()
ax[0].set_title('Satisfaction by tenure', fontsize='14')
# Create histogram showing distribution of tenure comparing employees who stayed versus those who left
sns.histplot(data=df1, x='tenure', hue='left', multiple='dodge', shrink=5,ax=ax[1])
ax[1].set_title('Tenure histogram', fontsize='14')
plt.show();
In [20]:
# Calculate mean and median satisfaction of those who stayed versus those who left
df1.groupby(['left'])['satisfaction_level'].agg(['mean', 'median'])
Out[20]:
mean | median | |
---|---|---|
left | ||
0 | 0.667365 | 0.69 |
1 | 0.440271 | 0.41 |
In [21]:
# Salary analysis
fig, ax = plt.subplots(1, 2, figsize=(22, 8))
# Define short-tenured employees (fewer than 7 years at the company)
tenure_short = df1[df1['tenure'] < 7]
# Define long-tenured employees (7 or more years at the company)
tenure_long = df1[df1['tenure'] > 6]
# Plot short-tenured histogram
sns.histplot(data=tenure_short, x='tenure', hue='salary', discrete=1,
hue_order=['low', 'medium', 'high'], multiple='dodge', shrink=.5,ax=ax[0])
ax[0].set_title('Salary histogram by tenure: short-tenured people', fontsize='14')
# Plot long-tenured histogram
sns.histplot(data=tenure_long, x='tenure', hue='salary', discrete=1,
hue_order=['low', 'medium', 'high'], multiple='dodge', shrink=.4,ax=ax[1])
ax[1].set_title('Salary histogram by tenure: long-tenured people',fontsize='14');
In [22]:
plt.figure(figsize=(16, 9))
sns.scatterplot(data=df1, x='average_monthly_hours', y='last_evaluation', hue='left', alpha=0.4)
plt.axvline(x=166.67, color='#ff6361', label='166.67 hrs./mo.', ls='--')
plt.legend(labels=['166.67 hrs./mo.', 'left', 'stayed'])
plt.title('Monthly hours by last evaluation score', fontsize='14');
In [23]:
plt.figure(figsize=(16, 3))
sns.scatterplot(data=df1, x='average_monthly_hours', y='promotion_last_5years', hue='left', alpha=0.4)
plt.axvline(x=166.67, color='#ff6361', ls='--')
plt.legend(labels=['166.67 hrs./mo.', 'left', 'stayed'])
plt.title('Monthly hours by promotion last 5 years', fontsize='14');
In [24]:
# Display counts for each department
df1["department"].value_counts()
Out[24]:
department
sales          3239
technical      2244
support        1821
IT              976
RandD           694
product_mng     686
marketing       673
accounting      621
hr              601
management      436
Name: count, dtype: int64
In [25]:
# Create grouped histogram comparing stayed/left counts by department
plt.figure(figsize=(11, 8))
sns.histplot(data=df1, x='department', hue='left', discrete=1, hue_order=[0, 1], multiple='dodge', shrink=.5)
plt.title('Counts of stayed/left by department', fontsize=14);
In [26]:
df_enc = df1.copy()
# Encode the `salary` column as an ordinal numeric category
df_enc['salary'] = (
df_enc['salary'].astype('category')
.cat.set_categories(['low', 'medium', 'high'])
.cat.codes
)
# Dummy encode the `department` column
df_enc = pd.get_dummies(df_enc, drop_first=False)
# Display the new dataframe
df_enc.head()
Out[26]:
satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | left | promotion_last_5years | salary | department_IT | department_RandD | department_accounting | department_hr | department_management | department_marketing | department_product_mng | department_sales | department_support | department_technical | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | 1 | False | False | False | False | False | False | False | True | False | False |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | 1 | False | False | False | False | False | False | False | True | False | False |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
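An equivalent way to express the salary encoding is pd.Categorical with ordered=True, which documents the intended low < medium < high ordering explicitly — a sketch of the same transformation, not a cell from the notebook:

# Ordinal encoding with an explicit, ordered category list: low -> 0, medium -> 1, high -> 2
salary_codes = pd.Categorical(df1['salary'], categories=['low', 'medium', 'high'], ordered=True).codes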
In [27]:
plt.figure(figsize=(8, 6))
sns.heatmap(df_enc[['satisfaction_level', 'last_evaluation', 'number_project','average_monthly_hours', 'tenure']].corr(), annot=True, cmap="crest")
plt.title('Heatmap of the dataset')
plt.show()
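The heatmap covers the numeric predictors only; correlating each of them directly with the outcome is a quick complement — a sketch:

# Pearson correlation of each numeric feature with `left`
print(df_enc[['satisfaction_level', 'last_evaluation', 'number_project',
              'average_monthly_hours', 'tenure', 'left']].corr()['left'].sort_values())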
In [28]:
# Visualize stayed/left counts across departments
pd.crosstab(df1['department'], df1['left']).plot(kind='bar', color=['m', 'r'])
plt.title('Counts of employees who left versus stayed across department')
plt.ylabel('Employee count')
plt.xlabel('Department')
plt.show()
In [29]:
# Omit outliers in `tenure`, since logistic regression is sensitive to them
df_logreg = df_enc[(df_enc['tenure'] >= lower_limit) & (df_enc['tenure'] <= upper_limit)]
# Display first few rows of new dataframe
df_logreg.head()
Out[29]:
satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | left | promotion_last_5years | salary | department_IT | department_RandD | department_accounting | department_hr | department_management | department_marketing | department_product_mng | department_sales | department_support | department_technical | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | 1 | False | False | False | False | False | False | False | True | False | False |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
5 | 0.41 | 0.50 | 2 | 153 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
In [30]:
# Isolate the outcome variable
y = df_logreg['left']
# Display first few rows of the outcome variable
y.head()
Out[30]:
0    1
2    1
3    1
4    1
5    1
Name: left, dtype: int64
In [31]:
# Select the features
X = df_logreg.drop('left', axis=1)
# Display the first few rows of the selected features
X.head()
Out[31]:
satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | promotion_last_5years | salary | department_IT | department_RandD | department_accounting | department_hr | department_management | department_marketing | department_product_mng | department_sales | department_support | department_technical | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 0 | 1 | False | False | False | False | False | False | False | True | False | False |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
5 | 0.41 | 0.50 | 2 | 153 | 3 | 0 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
In [32]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
In [33]:
# Construct a logistic regression model and fit it to the training dataset
log_clf = LogisticRegression(random_state=42, max_iter=500).fit(X_train, y_train)
ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
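The warning suggests either raising max_iter or scaling the features, and the features here do sit on very different scales (hours in the hundreds, satisfaction in [0, 1]). Standardizing inside a pipeline is the cleaner fix — a sketch of that alternative, not the fit used for the results below:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then fit; lbfgs converges much more easily on scaled data
log_pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=42, max_iter=500))
log_pipe.fit(X_train, y_train)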
In [34]:
# Use the fitted model to get predictions on the test set
y_pred = log_clf.predict(X_test)
In [35]:
log_cm = confusion_matrix(y_test, y_pred, labels=log_clf.classes_)
In [36]:
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm,display_labels=log_clf.classes_)
# Plot confusion matrix
log_disp.plot(values_format='')
# Display plot
plt.show()
In [37]:
df_logreg['left'].value_counts(normalize=True)
Out[37]:
left
0    0.831468
1    0.168532
Name: proportion, dtype: float64
In [38]:
# Create classification report for logistic regression model
target_names = ['Predicted would not leave', 'Predicted would leave']
print(classification_report(y_test, y_pred, target_names=target_names))
                           precision    recall  f1-score   support

Predicted would not leave       0.86      0.93      0.90      2321
    Predicted would leave       0.44      0.26      0.33       471

                 accuracy                           0.82      2792
                macro avg       0.65      0.60      0.61      2792
             weighted avg       0.79      0.82      0.80      2792
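roc_curve and roc_auc_score are imported above but not yet used; plotting the ROC curve from predicted probabilities gives a threshold-independent view of the same model — a sketch:

# Predicted probability of the positive class ("left")
y_prob = log_clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_prob):.3f}')
plt.plot([0, 1], [0, 1], ls='--', color='grey')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve: logistic regression')
plt.legend()
plt.show()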
In [39]:
# Tree-based models
# Isolate the outcome variable
y = df_enc['left']
# Display the first few rows of `y`
y.head()
Out[39]:
0    1
1    1
2    1
3    1
4    1
Name: left, dtype: int64
In [40]:
# Select the features
X = df_enc.drop('left', axis=1)
# Display the first few rows of `X`
X.head()
Out[40]:
satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | promotion_last_5years | salary | department_IT | department_RandD | department_accounting | department_hr | department_management | department_marketing | department_product_mng | department_sales | department_support | department_technical | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 0 | 1 | False | False | False | False | False | False | False | True | False | False |
2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 0 | 1 | False | False | False | False | False | False | False | True | False | False |
3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
In [41]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y, random_state=0)
In [42]:
# Instantiate model
tree = DecisionTreeClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth':[4, 6, 8, None],
'min_samples_leaf': [2, 5, 1],
'min_samples_split': [2, 4, 6]
}
# Assign a list of scoring metrics to capture
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
# Instantiate GridSearch
tree1 = GridSearchCV(tree, cv_params, scoring=scoring, cv=4, refit='roc_auc')
In [43]:
%%time
tree1.fit(X_train, y_train)
CPU times: total: 2.75 s
Wall time: 2.85 s
Out[43]:
GridSearchCV(cv=4, estimator=DecisionTreeClassifier(random_state=0),
             param_grid={'max_depth': [4, 6, 8, None], 'min_samples_leaf': [2, 5, 1],
                         'min_samples_split': [2, 4, 6]},
             refit='roc_auc',
             scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
In [44]:
# Check best parameters
tree1.best_params_
Out[44]:
{'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 2}
In [45]:
# Check best AUC score on CV
tree1.best_score_
Out[45]:
0.969819392792457
In [78]:
def make_results(model_name: str, model_object, metric: str):
    '''
    Arguments:
        model_name (string): the model's name as it should appear in the results table
        model_object: a fit GridSearchCV object
        metric (string): 'auc', 'precision', 'recall', 'f1', or 'accuracy'

    Returns a one-row dataframe of the mean CV scores for the run that maximized `metric`.
    '''
    metric_dict = {'auc': 'mean_test_roc_auc',
                   'precision': 'mean_test_precision',
                   'recall': 'mean_test_recall',
                   'f1': 'mean_test_f1',
                   'accuracy': 'mean_test_accuracy'
                   }
    # Get all the results from the CV and put them in a df
    cv_results = pd.DataFrame(model_object.cv_results_)
    # Isolate the row of the df with the max(metric) score
    best_estimator_results = cv_results.iloc[cv_results[metric_dict[metric]].idxmax(), :]
    # Extract accuracy, precision, recall, F1, and AUC from that row
    auc = best_estimator_results.mean_test_roc_auc
    f1 = best_estimator_results.mean_test_f1
    recall = best_estimator_results.mean_test_recall
    precision = best_estimator_results.mean_test_precision
    accuracy = best_estimator_results.mean_test_accuracy
    # Create a table of results
    table = pd.DataFrame({'model': [model_name],
                          'precision': [precision],
                          'recall': [recall],
                          'F1': [f1],
                          'accuracy': [accuracy],
                          'auc': [auc]
                          })
    return table
In [79]:
# Get all CV scores
tree1_cv_results = make_results('decision tree cv', tree1, 'auc')
tree1_cv_results
Out[79]:
model | precision | recall | F1 | accuracy | auc | |
---|---|---|---|---|---|---|
0 | decision tree cv | 0.914552 | 0.916949 | 0.915707 | 0.971978 | 0.969819 |
In [80]:
type(tree1_cv_results)
Out[80]:
pandas.core.frame.DataFrame
In [81]:
rf = RandomForestClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth': [3,5, None],
'max_features': [1.0],
'max_samples': [0.7, 1.0],
'min_samples_leaf': [1,2,3],
'min_samples_split': [2,3,4],
'n_estimators': [300, 500],
}
# Assign a list of scoring metrics to capture
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
# Instantiate GridSearch
rf1 = GridSearchCV(rf, cv_params, scoring=scoring, cv=4, refit='roc_auc')
#Fit the random forest model to the training data.
rf1.fit(X_train, y_train)
Out[81]:
GridSearchCV(cv=4, estimator=RandomForestClassifier(random_state=0),
             param_grid={'max_depth': [3, 5, None], 'max_features': [1.0],
                         'max_samples': [0.7, 1.0], 'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 3, 4], 'n_estimators': [300, 500]},
             refit='roc_auc',
             scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
In [69]:
# Define a path to the folder where models will be saved
path = 'Capston_model/'
In [74]:
def write_pickle(path, model_object, save_as: str):
    # Serialize the model object to `path`/`save_as`.pkl
    with open(path + save_as + '.pkl', 'wb') as to_write:
        pickle.dump(model_object, to_write)
In [62]:
# Write pickle
write_pickle(path, rf1, 'hr_rf1')
In [67]:
def read_pickle(path, saved_model_name: str):
    # Load and return a previously saved model from `path`/`saved_model_name`.pkl
    with open(path + saved_model_name + '.pkl', 'rb') as to_read:
        model = pickle.load(to_read)
    return model
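pickle works fine here; for large sklearn models such as the 500-tree forests in this notebook, joblib (which ships with scikit-learn) is a common alternative that serializes numpy arrays more efficiently — an equivalent sketch:

import joblib

def write_joblib(path, model_object, save_as: str):
    # Serialize the model to `path`/`save_as`.joblib
    joblib.dump(model_object, path + save_as + '.joblib')

def read_joblib(path, saved_model_name: str):
    # Load and return a previously saved model
    return joblib.load(path + saved_model_name + '.joblib')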
In [70]:
# Read pickle
rf1 = read_pickle(path, 'hr_rf1')
In [71]:
# Check best AUC score on CV
rf1.best_score_
Out[71]:
0.9804250949807172
In [75]:
# Check best params
rf1.best_params_
Out[75]:
{'max_depth': 5, 'max_features': 1.0, 'max_samples': 0.7, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 500}
In [76]:
# Get all CV scores
rf1_cv_results = make_results('random forest cv', rf1, 'auc')
# Note: this cell ran before the version of make_results above that returns a table,
# so both prints show None; rerunning it after In [78] would print the score tables.
print(tree1_cv_results)
print(rf1_cv_results)
None None
In [82]:
def get_scores(model_name: str, model, X_test_data, y_test_data):
    # Predict on the test data with the best estimator and compute
    # precision, recall, F1, accuracy, and AUC
    preds = model.best_estimator_.predict(X_test_data)
    auc = roc_auc_score(y_test_data, preds)
    accuracy = accuracy_score(y_test_data, preds)
    precision = precision_score(y_test_data, preds)
    recall = recall_score(y_test_data, preds)
    f1 = f1_score(y_test_data, preds)
    table = pd.DataFrame({'model': [model_name],
                          'precision': [precision],
                          'recall': [recall],
                          'f1': [f1],
                          'accuracy': [accuracy],
                          'AUC': [auc]
                          })
    return table
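One caveat: get_scores computes AUC from hard 0/1 predictions, which typically understates the probability-based AUC that GridSearchCV optimized. If the standard score is wanted instead, the change is one line — a sketch, not the function used in the tables below:

# Probability-based AUC for a fitted GridSearchCV object
probs = rf1.best_estimator_.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, probs))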
In [83]:
# Get predictions on test data
rf1_test_scores = get_scores('random forest1 test', rf1, X_test, y_test)
rf1_test_scores
Out[83]:
model | precision | recall | f1 | accuracy | AUC | |
---|---|---|---|---|---|---|
0 | random forest1 test | 0.964211 | 0.919679 | 0.941418 | 0.980987 | 0.956439 |
In [84]:
# Drop `satisfaction_level` and save resulting dataframe in new variable
df2 = df_enc.drop('satisfaction_level', axis=1)
# Display first few rows of new dataframe
df2.head()
Out[84]:
last_evaluation | number_project | average_monthly_hours | tenure | work_accident | left | promotion_last_5years | salary | department_IT | department_RandD | department_accounting | department_hr | department_management | department_marketing | department_product_mng | department_sales | department_support | department_technical | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
1 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | 1 | False | False | False | False | False | False | False | True | False | False |
2 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | 1 | False | False | False | False | False | False | False | True | False | False |
3 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
4 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False |
In [85]:
# Create `overworked` column. For now, it's identical to average monthly hours.
df2['overworked'] = df2['average_monthly_hours']
# Inspect max and min average monthly hours values
print('Max hours:', df2['overworked'].max())
print('Min hours:', df2['overworked'].min())
Max hours: 310
Min hours: 96
In [86]:
# Define being `overworked` as working more than 175 hrs./month
df2['overworked'] = (df2['overworked'] > 175).astype(int)
# Display first few rows of new column
df2['overworked'].head()
Out[86]:
0    0
1    1
2    1
3    1
4    0
Name: overworked, dtype: int32
In [87]:
# Drop the `average_monthly_hours` column
df2 = df2.drop('average_monthly_hours', axis=1)
# Display first few rows of resulting dataframe
df2.head()
Out[87]:
last_evaluation | number_project | tenure | work_accident | left | promotion_last_5years | salary | department_IT | department_RandD | department_accounting | department_hr | department_management | department_marketing | department_product_mng | department_sales | department_support | department_technical | overworked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.53 | 2 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
1 | 0.86 | 5 | 6 | 0 | 1 | 0 | 1 | False | False | False | False | False | False | False | True | False | False | 1 |
2 | 0.88 | 7 | 4 | 0 | 1 | 0 | 1 | False | False | False | False | False | False | False | True | False | False | 1 |
3 | 0.87 | 5 | 5 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False | 1 |
4 | 0.52 | 2 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | False | True | False | False | 0 |
In [89]:
# Isolate the outcome variable
y = df2['left']
# Select the features
X = df2.drop('left', axis=1)
In [90]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0)
In [95]:
# Instantiate model - decision tree round 2
tree = DecisionTreeClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth':[4, 6, 8, None],'min_samples_leaf': [2, 5, 1],'min_samples_split': [2, 4, 6]}
# Assign a list of scoring metrics to capture
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
# Instantiate GridSearch
tree2 = GridSearchCV(tree, cv_params, scoring=scoring, cv=4, refit='roc_auc')
In [96]:
tree2.fit(X_train, y_train)
Out[96]:
GridSearchCV(cv=4, estimator=DecisionTreeClassifier(random_state=0),
             param_grid={'max_depth': [4, 6, 8, None], 'min_samples_leaf': [2, 5, 1],
                         'min_samples_split': [2, 4, 6]},
             refit='roc_auc',
             scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
In [97]:
# Check best params
tree2.best_params_
Out[97]:
{'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 6}
In [98]:
# Check best AUC score on CV
tree2.best_score_
Out[98]:
0.9586752505340426
In [99]:
# Get all CV scores
tree2_cv_results = make_results('decision tree2 cv', tree2, 'auc')
print(tree1_cv_results)
print(tree2_cv_results)
              model  precision    recall        F1  accuracy       auc
0  decision tree cv   0.914552  0.916949  0.915707  0.971978  0.969819

               model  precision    recall        F1  accuracy       auc
0  decision tree2 cv   0.856693  0.903553  0.878882  0.958523  0.958675
In [102]:
# Instantiate model - Random Forest round 2
rf = RandomForestClassifier(random_state=0)
# Assign a dictionary of hyperparameters to search over
cv_params = {'max_depth': [3,5, None],
'max_features': [1.0],
'max_samples': [0.7, 1.0],
'min_samples_leaf': [1,2,3],
'min_samples_split': [2,3,4],
'n_estimators': [300, 500],
}
# Assign a list of scoring metrics to capture
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
# Instantiate GridSearch
rf2 = GridSearchCV(rf, cv_params, scoring=scoring, cv=4, refit='roc_auc')
In [103]:
rf2.fit(X_train, y_train) #--> Wall time: 7min 5s
Out[103]:
GridSearchCV(cv=4, estimator=RandomForestClassifier(random_state=0),
             param_grid={'max_depth': [3, 5, None], 'max_features': [1.0],
                         'max_samples': [0.7, 1.0], 'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 3, 4], 'n_estimators': [300, 500]},
             refit='roc_auc',
             scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
In [104]:
# Write pickle
write_pickle(path, rf2, 'hr_rf2')
In [105]:
# Read in pickle
rf2 = read_pickle(path, 'hr_rf2')
In [106]:
# Check best params
rf2.best_params_
Out[106]:
{'max_depth': 5, 'max_features': 1.0, 'max_samples': 0.7, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
In [107]:
# Check best AUC score on CV
rf2.best_score_
Out[107]:
0.9648100662833985
In [108]:
# Get all CV scores
rf2_cv_results = make_results('random forest2 cv', rf2, 'auc')
print(tree2_cv_results)
print(rf2_cv_results)
               model  precision    recall        F1  accuracy       auc
0  decision tree2 cv   0.856693  0.903553  0.878882  0.958523  0.958675

               model  precision    recall        F1  accuracy       auc
0  random forest2 cv   0.866758  0.878754  0.872407  0.957411  0.964810
In [109]:
# Get predictions on test data
rf2_test_scores = get_scores('random forest2 test', rf2, X_test, y_test)
rf2_test_scores
Out[109]:
model | precision | recall | f1 | accuracy | AUC | |
---|---|---|---|---|---|---|
0 | random forest2 test | 0.870406 | 0.903614 | 0.8867 | 0.961641 | 0.938407 |
In [110]:
# Generate array of values for confusion matrix
preds = rf2.best_estimator_.predict(X_test)
cm = confusion_matrix(y_test, preds, labels=rf2.classes_)
# Plot confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=rf2.classes_)
disp.plot(values_format='');
In [114]:
# Plot the tree
plt.figure(figsize=(85,20))
plot_tree(tree2.best_estimator_, max_depth=6, fontsize=14, feature_names=list(X.columns), class_names=['stayed', 'left'], filled=True);
plt.show()
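At figsize=(85, 20) the rendered tree is hard to read; sklearn.tree.export_text prints the same decision rules as plain text, which can be easier to scan — a sketch, truncated to the top levels:

from sklearn.tree import export_text

# Text rendering of the fitted tree's top three levels of splits
print(export_text(tree2.best_estimator_, feature_names=list(X.columns), max_depth=3))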
In [115]:
tree2_importances = pd.DataFrame(tree2.best_estimator_.feature_importances_,
                                 columns=['gini_importance'],
                                 index=X.columns
                                 )
tree2_importances = tree2_importances.sort_values(by='gini_importance', ascending=False)
# Only extract the features with importances > 0
tree2_importances = tree2_importances[tree2_importances['gini_importance'] != 0]
tree2_importances
Out[115]:
gini_importance | |
---|---|
last_evaluation | 0.343958 |
number_project | 0.343385 |
tenure | 0.215681 |
overworked | 0.093498 |
department_support | 0.001142 |
salary | 0.000910 |
department_sales | 0.000607 |
department_technical | 0.000418 |
work_accident | 0.000183 |
department_IT | 0.000139 |
department_marketing | 0.000078 |
In [116]:
sns.barplot(data=tree2_importances, x="gini_importance", y=tree2_importances.index, orient='h')
plt.title("Decision Tree: Feature Importances for Employee Leaving",fontsize=12)
plt.ylabel("Feature")
plt.xlabel("Importance")
plt.show()
In [118]:
# Retrieve feature importances
feat_impt = rf2.best_estimator_.feature_importances_
# Indices of top 10 features
ind = np.argpartition(rf2.best_estimator_.feature_importances_, -10)[-10:]
# Column labels of top 10 features
feat = X.columns[ind]
# Filter feat_impt to consist of top 10 feature importances
feat_impt = feat_impt[ind]
y_df = pd.DataFrame({"Feature":feat,"Importance":feat_impt})
y_sort_df = y_df.sort_values("Importance")
fig = plt.figure()
ax1 = fig.add_subplot(111)
y_sort_df.plot(kind='barh',ax=ax1,x="Feature",y="Importance")
ax1.set_title("Random Forest: Feature Importances for Employee Leaving",fontsize=12)
ax1.set_ylabel("Feature")
ax1.set_xlabel("Importance")
plt.show()
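Impurity-based importances like these can be biased toward high-cardinality features; permutation importance on the held-out test set is a useful cross-check — a sketch using sklearn's permutation_importance:

from sklearn.inspection import permutation_importance

# Shuffle each feature on the test set and measure the resulting drop in score
perm = permutation_importance(rf2.best_estimator_, X_test, y_test, n_repeats=10, random_state=0)
perm_importances = pd.Series(perm.importances_mean, index=X.columns).sort_values(ascending=False)
print(perm_importances.head(10))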