### Complete flow to generate an ML model for the HR Attrition dataset

#### Prepare the data:

In [1]:
# Import a custom transformer for preprocessing data based on feature definitions
from preprocessor import Preprocessor

In [2]:
import numpy as np
import pandas as pd

# Read only the two worksheets needed for modelling; passing a list of sheet
# names makes pandas return a dict of DataFrames keyed by sheet name
sheets = pd.read_excel(
    'Data/HR-Employee-Attrition.xlsx',
    sheet_name=["Feature Definitions", "Train-Test"],
)

# Build the feature definitions frame: lower-case the column headers, then
# index the rows by feature name
features = sheets["Feature Definitions"]
features.columns = list(map(str.lower, features.columns))
features.set_index("name", inplace=True)
features.head()

Unnamed: 0_level_0,sample,variable_type,data_type,feature_strategy,hash_features
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Age,41,feature,int,scaling,
Attrition,Yes,target,str,none,
BusinessTravel,Travel_Rarely,feature,str,one hot encoding,
DailyRate,1102,feature,int,scaling,
Department,Sales,feature,str,one hot encoding,


In [3]:
# The rows used for training and testing live in the "Train-Test" worksheet
data = sheets["Train-Test"]

# The target is the feature whose definition has variable_type == "target";
# the name of the first such row is used
target = features[features["variable_type"].eq("target")]
target_name = target.index[0]

# Keep the target values as a one-column DataFrame
d_target = data[[target_name]]

d_target.head()

Unnamed: 0,Attrition
0,Yes
1,No
2,No
3,No
4,No


In [4]:
# Flag the feature definitions that must not be fed to the model
exclusions = features['variable_type'].isin(["excluded", "target", "identifier"])

# Split the definitions into the dropped rows and the rows kept as model inputs.
# NOTE: `features` is rebound to the reduced table here, so it no longer
# contains the target row once this cell has run.
excluded = features[exclusions]
features = features[~exclusions]

# Keep only the columns of the remaining features in the data
data = data.loc[:, list(features.index)]

In [5]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same from run to run
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data,
    d_target,
    test_size=0.30,
    random_state=42,
)

In [6]:
# Preview the first rows of the test feature frame produced by the split
X_test.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
893,41,Travel_Frequently,1200,Research & Development,22,3,Life Sciences,4,Female,75,...,3,1,2,12,4,2,6,2,3,3
115,50,Travel_Frequently,809,Sales,12,3,Marketing,3,Female,77,...,3,4,0,16,3,3,2,2,2,1
526,27,Travel_Frequently,829,Sales,8,1,Marketing,3,Male,84,...,3,2,1,5,3,3,4,2,1,1
175,35,Travel_Frequently,138,Research & Development,2,3,Medical,2,Female,37,...,3,4,0,10,5,3,6,2,1,2
63,45,Travel_Rarely,193,Research & Development,6,4,Other,4,Male,52,...,3,2,0,17,3,4,0,0,0,0


In [7]:
# Test the preprocessor on its own, outside any pipeline: fit it on the
# training features only, then apply it to the test features.
# NOTE(review): return_type='df' presumably makes transform return a DataFrame
# (the output below is tabular) — confirm against preprocessor.py
prep = Preprocessor(features, return_type='df').fit(X_train)
prep.transform(X_test)

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,Education_1,Education_2,Education_3,Education_4,...,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,JobRole0,JobRole1,JobRole2,JobRole3
893,0,1,0,0,1,0,0,0,1,0,...,0.083789,0.927695,-0.158869,-0.613794,0.232242,-0.328203,-0.994429,-0.451462,-0.816773,-1.706519
115,0,1,0,0,0,1,0,0,1,0,...,0.591154,0.153090,-0.807584,-0.613794,-0.071434,-0.882752,1.265490,-0.034487,-0.816773,0.899445
526,0,1,0,0,0,1,1,0,0,0,...,-0.804100,0.153090,-0.483227,-0.613794,-0.375111,-0.882752,1.265490,-0.034487,-0.816773,0.899445
175,0,1,0,0,1,0,0,0,1,0,...,-0.169894,1.702300,-0.158869,-0.613794,-0.375111,-0.605477,-0.994429,-1.702387,1.808367,0.899445
63,0,0,1,0,1,0,0,0,0,1,...,0.717995,0.153090,-1.131942,-1.175408,-0.678787,-1.160026,0.512184,0.799463,0.058274,0.899445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,0,0,1,1,0,0,0,1,0,0,...,-1.311464,0.153090,-0.969763,-1.175408,-0.678787,-1.160026,2.018796,1.633413,-0.816773,0.030790
292,0,0,1,0,0,1,0,0,1,0,...,0.210630,0.153090,0.003310,0.790241,-0.375111,0.780896,1.265490,-0.034487,-0.816773,0.899445
1083,0,0,1,0,0,1,1,0,0,0,...,-1.184623,0.153090,-0.807584,-0.613794,-0.678787,-0.605477,-0.241123,1.216438,-0.816773,-0.837864
551,0,0,1,0,1,0,0,0,1,0,...,-1.311464,0.153090,-0.969763,-1.175408,-0.678787,-1.160026,-0.994429,-0.451462,-0.816773,-1.706519


#### Set up machine learning pipelines, fit and score the models:

In [8]:
# Set up a pipeline to run the ML flow
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pickle

# Construct the pipelines: each one pairs its own Preprocessor instance with a
# single classifier, so preprocessing is fitted together with the model
pipe_lr = Pipeline([
    ('prep', Preprocessor(features, return_type='df')),
    ('clf', LogisticRegression(solver='lbfgs', random_state=42)),
])
pipe_rf = Pipeline([
    ('prep', Preprocessor(features, return_type='df')),
    ('clf', RandomForestClassifier(n_estimators=10, random_state=42)),
])

# List of pipelines for ease of iteration
pipelines = [pipe_lr, pipe_rf]

# Dictionary of pipelines and classifier types for ease of reference
# (keys are positions in `pipelines`)
pipe_dict = {0: 'Logistic Regression', 1: 'Random Forest'}

# Flatten the one-column target frames once, so fitting and scoring always
# receive the labels in the same 1-D form
y_train_labels = y_train.values.ravel()
y_test_labels = y_test.values.ravel()

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train_labels)

# Score every pipeline exactly once on the test data; the scores are reused
# for both the report and the model selection below
test_scores = [pipe.score(X_test, y_test_labels) for pipe in pipelines]

# Compare accuracies
for idx, acc in enumerate(test_scores):
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], acc))

# Identify the most accurate model on test data. max() returns the first
# index holding the highest score, so ties go to the earlier pipeline and
# best_pipe is always a fitted pipeline (never a placeholder value).
best_clf = max(range(len(pipelines)), key=test_scores.__getitem__)
best_acc = test_scores[best_clf]
best_pipe = pipelines[best_clf]
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])

# Save pipeline to file; report success only after the file has been closed
with open('HR-Attrition-v1.pkl', 'wb') as file:
    pickle.dump(best_pipe, file)
print('Saved %s pipeline to file' % pipe_dict[best_clf])

# Also save the preprocessor and model as separate files
with open('HR-Attrition-v1-prep.pkl', 'wb') as file:
    pickle.dump(best_pipe.named_steps['prep'], file)
with open('HR-Attrition-v1-clf.pkl', 'wb') as file:
    pickle.dump(best_pipe.named_steps['clf'], file)

Logistic Regression pipeline test accuracy: 0.905
Random Forest pipeline test accuracy: 0.862
Classifier with best accuracy: Logistic Regression
Saved Logistic Regression pipeline to file


#### Validate the saved model:

In [9]:
# Read the pickled pipeline back from disk.
# Unpickling executes code from the file, so only load pickles you trust;
# this file is the one written by the training cell.
with open('HR-Attrition-v1.pkl', 'rb') as pipeline_file:
    model = pickle.load(pipeline_file)

model

Pipeline(memory=None,
         steps=[('prep',
                 <preprocessor.Preprocessor object at 0x0000024689421940>),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=42,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [10]:
# Read the "Validate" worksheet: additional rows used to test the saved model
validation = pd.read_excel(
    'Data/HR-Employee-Attrition.xlsx',
    sheet_name="Validate",
)
validation

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
1,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
2,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,...,3,80,0,8,2,2,7,7,3,6
3,53,No,Travel_Rarely,1282,Research & Development,5,3,Other,1,32,...,4,80,1,26,3,2,14,13,4,8
4,43,No,Travel_Rarely,1273,Research & Development,2,2,Medical,1,46,...,4,80,2,6,3,2,5,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,34,No,Travel_Rarely,704,Sales,28,3,Marketing,1,2035,...,4,80,2,8,2,3,8,7,1,7
136,36,No,Travel_Rarely,1120,Sales,11,4,Marketing,1,2045,...,1,80,1,8,2,2,6,3,0,0
137,29,No,Travel_Rarely,468,Research & Development,28,4,Medical,1,2054,...,2,80,0,5,3,1,5,4,0,4
138,39,No,Travel_Rarely,722,Sales,24,1,Marketing,1,2056,...,1,80,1,21,2,2,20,9,9,6


In [11]:
# Pull the target column out of the validation data as a one-column DataFrame
v_target = validation[[target_name]]
v_target.head()

Unnamed: 0,Attrition
0,Yes
1,No
2,No
3,No
4,No


In [12]:
# Keep only the model's input features in the validation dataset.
# NOTE: `validation` is rebound here and no longer contains the target column.
validation = validation.loc[:, list(features.index)]
validation

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,37,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,...,3,2,0,7,3,3,0,0,0,0
1,27,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,40,...,3,4,1,6,3,3,2,2,2,2
2,32,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,4,Male,79,...,3,3,0,8,2,2,7,7,3,6
3,53,Travel_Rarely,1282,Research & Development,5,3,Other,3,Female,58,...,3,4,1,26,3,2,14,13,4,8
4,43,Travel_Rarely,1273,Research & Development,2,2,Medical,4,Female,72,...,3,4,2,6,3,2,5,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,34,Travel_Rarely,704,Sales,28,3,Marketing,4,Female,95,...,4,4,2,8,2,3,8,7,1,7
136,36,Travel_Rarely,1120,Sales,11,4,Marketing,2,Female,100,...,3,1,1,8,2,2,6,3,0,0
137,29,Travel_Rarely,468,Research & Development,28,4,Medical,4,Female,73,...,3,2,0,5,3,1,5,4,0,4
138,39,Travel_Rarely,722,Sales,24,1,Marketing,2,Female,60,...,3,1,1,21,2,2,20,9,9,6


In [13]:
# Get a score for the validation dataset from the saved pipeline.
# The one-column target frame is flattened to a 1-D array of labels first.
model.score(validation, v_target.values.ravel())

0.8785714285714286

#### Train a Keras model:

In [14]:
# Apply the already-fitted preprocessor step of the best pipeline to the
# training features
X_train_transformed = (
    best_pipe
    .named_steps['prep']
    .transform(X_train)
)
X_train_transformed.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,Education_1,Education_2,Education_3,Education_4,...,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,JobRole0,JobRole1,JobRole2,JobRole3
1298,0,0,1,0,1,0,0,0,1,0,...,0.591154,-0.621514,1.462919,1.913468,1.143272,1.05817,0.512184,0.799463,0.058274,0.899445
620,0,0,1,0,1,0,0,0,0,1,...,0.21063,-0.621514,-1.131942,-1.175408,-0.678787,-1.160026,-0.994429,-0.451462,-0.816773,-1.706519
1193,0,0,1,0,1,0,0,0,0,1,...,0.464312,-0.621514,-0.969763,-1.175408,-0.678787,-1.160026,-0.994429,-0.451462,-0.816773,-1.706519
139,0,0,1,0,1,0,0,0,1,0,...,-0.296735,0.15309,-0.969763,-1.175408,-0.678787,-1.160026,-0.994429,-0.451462,-0.816773,-1.706519
1165,0,0,1,0,1,0,1,0,0,0,...,-0.296735,0.15309,-0.483227,-0.332987,-0.071434,-0.605477,0.512184,0.799463,0.058274,0.899445


In [55]:
from sklearn.preprocessing import LabelEncoder

# Encode target values as integers: fit_transform learns the label classes
# from the training targets and encodes them in the same call, leaving `le`
# fitted for later use on other label sets
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train.values.ravel())
y_train_encoded[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [71]:
from keras.models import Sequential
from keras.layers import Dense

# Define the Keras model: two hidden ReLU layers and a single sigmoid output
# unit for the binary target.
# The input width is taken from the preprocessed training frame instead of
# being hard-coded, so the network keeps matching the preprocessor output if
# the feature definitions (and therefore the column count) change.
# NOTE(review): this rebinds `model`, which earlier held the sklearn pipeline
# loaded from 'HR-Attrition-v1.pkl'; re-running the sklearn validation cell
# after this one would call .score on the Keras model instead.
n_features = X_train_transformed.shape[1]

model = Sequential()
model.add(Dense(100, input_dim=n_features, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the Keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the Keras model on the dataset.
# class_weight makes samples with encoded label 1 count far more in the loss
# than samples with label 0 (2.0 vs 0.1) — presumably to offset the class
# imbalance visible in the encoded targets; the exact weights are hand-picked.
model.fit(X_train_transformed, y_train_encoded, epochs=50, batch_size=8, class_weight={0:0.1, 1:2.0})

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x2469e051cc0>

In [60]:
# Push the test features through the fitted preprocessor of the best pipeline
X_test_transformed = best_pipe.named_steps['prep'].transform(X_test)
# Encode the test labels with the encoder fitted on the training labels
y_test_encoded = le.transform(y_test.values.ravel())

# Check model accuracy on test data: evaluate returns the loss followed by
# the compiled metrics, so position 1 is the accuracy
keras_test_metrics = model.evaluate(X_test_transformed, y_test_encoded)
print('Keras test accuracy: %.3f' % keras_test_metrics[1])

Keras test accuracy: 0.877


In [61]:
# Save the keras model architecture and weights to disk in a single .h5 file
model.save('HR-Attrition-Keras-v1.h5')

In [65]:
import keras
from keras import backend as kerasbackend

# Reset the Keras backend state before loading the model from disk
kerasbackend.clear_session()

# Load the keras model architecture and weights from disk
keras_model = keras.models.load_model('HR-Attrition-Keras-v1.h5')
# NOTE(review): _make_predict_function is a private Keras method; it is kept
# as in the original flow, but verify it still exists in the installed
# Keras version before relying on it.
keras_model._make_predict_function()

# predict returns a 2-D array (one row per input sample, one column for the
# single output unit). Pull out the scalar explicitly rather than relying on
# implicit array-to-float conversion in the format operator, which NumPy has
# deprecated for arrays with ndim > 0.
prediction = keras_model.predict(X_test_transformed.iloc[[0]])
print('Keras prediction: %.0f' % prediction[0, 0])

Keras prediction: 0
