This blog post shows some ways to get generally good performance on tabular data. Most of the work in getting high-performance models from tabular data comes from cleaning the dataset, clever feature engineering, and other tasks specific to the data at hand. We won't be doing that here. However, you still need good baseline hyperparameters to know you're getting the best out of your model. This post shows how to use Bayesian optimization to find good hyperparameters and get good performance.
import os
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')
df = pd.read_csv(Path(os.getenv('DATA')) / 'stroke/healthcare-dataset-stroke-data.csv')
df.head()
| | id | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9046 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
| 1 | 51676 | Female | 61.0 | 0 | 0 | Yes | Self-employed | Rural | 202.21 | NaN | never smoked | 1 |
| 2 | 31112 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
| 3 | 60182 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
| 4 | 1665 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 5110 non-null int64
1 gender 5110 non-null object
2 age 5110 non-null float64
3 hypertension 5110 non-null int64
4 heart_disease 5110 non-null int64
5 ever_married 5110 non-null object
6 work_type 5110 non-null object
7 Residence_type 5110 non-null object
8 avg_glucose_level 5110 non-null float64
9 bmi 4909 non-null float64
10 smoking_status 5110 non-null object
11 stroke 5110 non-null int64
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
df = df.drop('id', axis=1)
df.isnull().sum()
gender 0
age 0
hypertension 0
heart_disease 0
ever_married 0
work_type 0
Residence_type 0
avg_glucose_level 0
bmi 201
smoking_status 0
stroke 0
dtype: int64
df['stroke'].value_counts()
0 4861
1 249
Name: stroke, dtype: int64
We have significantly unbalanced data. We’ll have to use oversampling to adjust for this in the training data.
le = LabelEncoder()
en_df = df.apply(le.fit_transform)
en_df.head()
| | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 88 | 0 | 1 | 1 | 2 | 1 | 3850 | 239 | 1 | 1 |
| 1 | 0 | 82 | 0 | 0 | 1 | 3 | 0 | 3588 | 418 | 2 | 1 |
| 2 | 1 | 101 | 0 | 1 | 1 | 2 | 0 | 2483 | 198 | 2 | 1 |
| 3 | 0 | 70 | 0 | 0 | 1 | 2 | 1 | 3385 | 217 | 3 | 1 |
| 4 | 0 | 100 | 1 | 0 | 1 | 3 | 0 | 3394 | 113 | 2 | 1 |
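One caveat with df.apply(le.fit_transform): the single shared encoder is refit on every column inside apply, so le only retains the mapping for the last column it saw. If you need to invert an encoding later, keeping one fitted encoder per column is safer. Here's a minimal sketch (the encoders dict is my addition, not from the original post), restricted to the object-dtype columns:

# Sketch: one encoder per categorical column, kept so encodings
# can be inverted later. LabelEncoder fits deterministically on
# sorted unique values, so these match the encodings in en_df.
cat_cols = df.select_dtypes('object').columns
encoders = {col: LabelEncoder().fit(df[col]) for col in cat_cols}
encoders['work_type'].inverse_transform(en_df['work_type'][:5])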
Clean Dataset
imputer = KNNImputer(n_neighbors=4, weights="uniform")
# fit_transform returns a NumPy array rather than modifying in place,
# so capture the result and wrap it back into a DataFrame
imputed = imputer.fit_transform(en_df)
en_df_imputed = pd.DataFrame(imputed, columns=en_df.columns)
imputed
array([[ 1., 88., 0., ..., 239., 1., 1.],
[ 0., 82., 0., ..., 418., 2., 1.],
[ 1., 101., 0., ..., 198., 2., 1.],
...,
[ 0., 56., 0., ..., 179., 2., 0.],
[ 1., 72., 0., ..., 129., 1., 0.],
[ 0., 65., 0., ..., 135., 0., 0.]])
en_df_imputed.isnull().sum()
gender 0
age 0
hypertension 0
heart_disease 0
ever_married 0
work_type 0
Residence_type 0
avg_glucose_level 0
bmi 0
smoking_status 0
stroke 0
dtype: int64
features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
            'work_type', 'Residence_type', 'smoking_status']
from imblearn.over_sampling import SMOTE
X, y = en_df_imputed[features], en_df_imputed["stroke"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
sm = SMOTE()
X_train, y_train = sm.fit_resample(X_train, y_train)
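Note that SMOTE is applied only to the training split, so no synthetic samples leak into the test set. As a quick sanity check (output not shown in the original run), the training classes should now be balanced:

# Both classes should appear in equal numbers after resampling.
y_train.value_counts()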
Modeling
from functools import partial
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from sklearn.metrics import accuracy_score, f1_score
num_trials = 500
svm_trials = 100 # svm takes much longer, so you may want to limit this
XGBoost
from xgboost import XGBClassifier
xgb_space = {'max_depth': scope.int(hp.quniform('max_depth', 3, 18, 1)),
             'gamma': hp.uniform('gamma', 1, 9),
             'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
             'reg_lambda': hp.uniform('reg_lambda', 0, 1),
             'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
             'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
             'n_estimators': 180,
             'seed': 0}
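The sample import from earlier is handy for sanity-checking a search space before you burn hundreds of trials on it; it draws one random configuration from the space:

# Draw a single random configuration (output will vary run to run).
sample(xgb_space)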
def train_clf(clf, params):
    """Generic objective: fit a classifier with the sampled params and score it."""
    clf = clf(**params)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)  # predict already returns hard labels
    accuracy = accuracy_score(y_test, preds)
    return {'loss': -accuracy, 'status': STATUS_OK}  # hyperopt minimizes, so negate
def train_xgb(params):
    """
    Like train_clf, but XGBoost needs an explicit eval_metric
    or it produces lots of warnings.
    """
    clf = XGBClassifier(**params)
    clf.fit(X_train, y_train, eval_metric='logloss')
    preds = clf.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return {'loss': -accuracy, 'status': STATUS_OK}
trials = Trials()
fmin(fn = train_xgb,
space = xgb_space,
algo = tpe.suggest,
max_evals = num_trials,
trials = trials)
100%|██████████████████████████████████████████████| 500/500 [02:25<00:00, 3.43trial/s, best loss: -0.764187866927593]
{'colsample_bytree': 0.5671189561452116,
'gamma': 1.0071481663300468,
'max_depth': 14.0,
'min_child_weight': 2.0,
'reg_alpha': 50.0,
'reg_lambda': 0.6971844311013579}
best_hyperparams = space_eval(xgb_space, trials.argmin)
best_hyperparams
{'colsample_bytree': 0.5671189561452116,
'gamma': 1.0071481663300468,
'max_depth': 14,
'min_child_weight': 2.0,
'n_estimators': 180,
'reg_alpha': 50.0,
'reg_lambda': 0.6971844311013579,
'seed': 0}
xgb_clf = XGBClassifier(**best_hyperparams)
xgb_clf.fit(X_train, y_train)
[00:09:42] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.5671189561452116,
enable_categorical=False, gamma=1.0071481663300468, gpu_id=-1,
importance_type=None, interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=14,
min_child_weight=2.0, missing=nan, monotone_constraints='()',
n_estimators=180, n_jobs=12, num_parallel_tree=1,
predictor='auto', random_state=0, reg_alpha=50.0,
reg_lambda=0.6971844311013579, scale_pos_weight=1, seed=0,
subsample=1, tree_method='exact', validate_parameters=1,
verbosity=None)
xgb_preds = xgb_clf.predict(X_test)
f1_score(y_test, xgb_preds)
0.25846153846153846
accuracy_score(y_test, xgb_preds)
0.764187866927593
y_test
4688 0
4478 0
3849 0
4355 0
3826 0
..
3605 0
4934 0
4835 0
4105 0
2902 0
Name: stroke, Length: 1022, dtype: int64
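That accuracy needs context: roughly 94% of the test labels are 0, so a degenerate model that always predicts "no stroke" would beat this one on accuracy alone. A quick check (not part of the original run):

# Accuracy of always predicting the majority class (no stroke): ~0.94
1 - y_test.mean()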
Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_space = {
"n_estimators": hp.randint("n_estimators", 10, 700),
"criterion": hp.choice("criterion", ["gini", "entropy"]),
"max_depth": hp.randint('max_depth', 1, 100),
"min_samples_split": hp.randint('min_samples_split', 2, 20),
"min_samples_leaf": hp.randint('min_samples_leaf', 1, 10),
"max_features": hp.choice('max_features', ['sqrt', 'log2']),
"random_state": 42
}
trials = Trials()
fmin(fn = partial(train_clf, RandomForestClassifier),
space = rf_space,
algo = tpe.suggest,
max_evals = num_trials,
trials = trials)
100%|█████████████████████████████████████████████| 500/500 [09:01<00:00, 1.08s/trial, best loss: -0.8679060665362035]
{'criterion': 0,
'max_depth': 98,
'max_features': 1,
'min_samples_leaf': 1,
'min_samples_split': 9,
'n_estimators': 535}
rf_best_hyperparams = space_eval(rf_space, trials.argmin)
rf_best_hyperparams
{'criterion': 'gini',
'max_depth': 98,
'max_features': 'log2',
'min_samples_leaf': 1,
'min_samples_split': 9,
'n_estimators': 535,
'random_state': 42}
rf_clf = RandomForestClassifier(**rf_best_hyperparams)
rf_clf.fit(X_train, y_train)
RandomForestClassifier(max_depth=98, max_features='log2', min_samples_split=9,
n_estimators=535, random_state=42)
rf_preds = rf_clf.predict(X_test)
f1_score(y_test, rf_preds)
0.17177914110429446
accuracy_score(y_test, rf_preds)
0.8679060665362035
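The random forest posts the highest accuracy so far but the lowest F1, which is what you'd expect from a model leaning toward the majority class. A confusion matrix (my addition, not in the original run) makes this explicit:

from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes; a model that
# rarely predicts "stroke" piles its counts into the first column.
confusion_matrix(y_test, rf_preds)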
SVM
You can also try support vector machines, but I usually skip these for very large datasets. They don't generally get the best performance, and training takes much longer than the other models. Fundamentally, nonlinear SVM kernels solve a problem that is O(n_samples² × n_features), so things quickly get out of hand with a lot of samples.
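If you want to see that scaling for yourself before setting up the actual search, here's a quick timing sketch (my addition; timings are machine-dependent and illustrative only):

import time
from sklearn.svm import SVC

# Fit an RBF SVC on progressively larger subsets and time each fit.
# The absolute numbers don't matter; the growth rate does.
for n in (500, 1000, 2000, 4000):
    start = time.perf_counter()
    SVC(kernel='rbf').fit(X_train[:n], y_train[:n])
    print(f"n={n}: {time.perf_counter() - start:.2f}s")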
from sklearn.svm import SVC
svm_space = {
    'C': hp.lognormal('svm_C', 0, 1),
    'kernel': hp.choice('kernel', ['linear', 'rbf', 'poly']),
    'degree': hp.choice('degree', [2, 3, 4]),
    'probability': hp.choice('probability', [True])
}
trials = Trials()
fmin(fn = partial(train_clf, SVC),
space = svm_space,
algo = tpe.suggest,
max_evals = svm_trials,
trials = trials)
100%|█████████████████████████████████████████████| 100/100 [22:05<00:00, 13.25s/trial, best loss: -0.8111545988258317]
{'degree': 2, 'kernel': 2, 'probability': 0, 'svm_C': 2.1465036697130855}
svm_best_hyperparams = space_eval(svm_space, trials.argmin)
svm_best_hyperparams
{'C': 2.1465036697130855, 'degree': 4, 'kernel': 'poly', 'probability': True}
svm_clf = SVC(**svm_best_hyperparams)
svm_clf.fit(X_train, y_train)
SVC(C=2.1465036697130855, degree=4, kernel='poly', probability=True)
svm_preds = svm_clf.predict(X_test)
f1_score(y_test, svm_preds)
0.2771535580524344
accuracy_score(y_test, svm_preds)
0.8111545988258317
Neural Network with FastAI
I've had poor-to-mixed results pairing neural networks with hyperopt, but I've included this section in case it's helpful.
from fastai.tabular.all import *
df.head()
| | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
| 1 | Female | 61.0 | 0 | 0 | Yes | Self-employed | Rural | 202.21 | NaN | never smoked | 1 |
| 2 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
| 3 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
| 4 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
dep_var = 'stroke'
For FastAI, we’ll combine the training and validation data into one DataFrame and then split them out later. It’s just easier this way.
full_X_df = pd.concat([X_train, X_test])
full_y_df = pd.concat([y_train, y_test])
df = pd.merge(full_X_df, full_y_df, left_index=True, right_index=True)
np.sum(y_test)
62
continuous_vars, categorical_vars = cont_cat_split(df, dep_var=dep_var)
val_indices = list(range(len(X_train), len(X_train) + len(X_test)))
ind_splitter = IndexSplitter(val_indices)
splits = ind_splitter(df)
preprocessing = [Categorify, Normalize]
to_nn = TabularPandas(df, preprocessing, categorical_vars, continuous_vars, splits=splits, y_names=dep_var)
dls = to_nn.dataloaders(64)
def my_acc(preds, gt):
    """
    FastAI passes (predictions, targets) while sklearn expects
    (y_true, y_pred), so the argument order is flipped here.
    """
    return accuracy_score(gt.cpu(), np.rint(preds.cpu()))
nn_space = [
{'layer1': scope.int(hp.quniform('layer1', 2, 200, 1))},
{'layer2': scope.int(hp.quniform('layer2', 2, 500, 2))},
{'epochs': scope.int(hp.quniform('epochs', 1, 20, 1))},
{'lr': hp.uniform('lr', 1e-7, 1e-1)},
]
def objective(params):
    learn = tabular_learner(dls, y_range=(y.min(), y.max()),
                            layers=[params[0]['layer1'], params[1]['layer2']],
                            metrics=accuracy)
    with learn.no_bar(), learn.no_logging():
        learn.fit(params[2]['epochs'], params[3]['lr'])
    return {'loss': learn.recorder.losses[-1], 'status': STATUS_OK}
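One thing to note: learn.recorder.losses[-1] is the loss on the last training batch, so the search above is effectively tuning on training loss. A variant that scores on validation loss instead might look like this (a sketch, assuming fastai's learn.validate() returns [valid_loss, *metrics] as in fastai v2):

def objective_val(params):
    learn = tabular_learner(dls, y_range=(y.min(), y.max()),
                            layers=[params[0]['layer1'], params[1]['layer2']],
                            metrics=accuracy)
    with learn.no_bar(), learn.no_logging():
        learn.fit(params[2]['epochs'], params[3]['lr'])
        val_loss = learn.validate()[0]  # validation loss, not training loss
    return {'loss': val_loss, 'status': STATUS_OK}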
trials = Trials()
best = fmin(objective,
space=nn_space,
algo=tpe.suggest,
max_evals=num_trials,
trials=trials)
print(best)
100%|███████████████████████████████████████████| 500/500 [2:36:05<00:00, 18.73s/trial, best loss: 0.12682415544986725]
{'epochs': 20.0, 'layer1': 181.0, 'layer2': 158.0, 'lr': 0.004167824772417915}
nn_best_hyperparams = space_eval(nn_space, trials.argmin)
nn_best_hyperparams
({'layer1': 181},
{'layer2': 158},
{'epochs': 20},
{'lr': 0.004167824772417915})
learn = tabular_learner(dls, y_range=(y.min(), y.max()),
                        layers=[nn_best_hyperparams[0]['layer1'], nn_best_hyperparams[1]['layer2']],
                        metrics=my_acc)
learn.fit(nn_best_hyperparams[2]['epochs'], nn_best_hyperparams[3]['lr'])
| epoch | train_loss | valid_loss | my_acc | time |
|---|---|---|---|---|
| 0 | 0.152055 | 0.210501 | 0.703523 | 00:01 |
| 1 | 0.149081 | 0.241731 | 0.587084 | 00:01 |
| 2 | 0.147788 | 0.208247 | 0.710372 | 00:01 |
| 3 | 0.144673 | 0.194463 | 0.737769 | 00:01 |
| 4 | 0.142503 | 0.195599 | 0.729941 | 00:01 |
| 5 | 0.143986 | 0.247592 | 0.614481 | 00:01 |
| 6 | 0.140043 | 0.233124 | 0.599804 | 00:01 |
| 7 | 0.140295 | 0.193624 | 0.726027 | 00:01 |
| 8 | 0.135865 | 0.200070 | 0.707436 | 00:01 |
| 9 | 0.134626 | 0.198199 | 0.737769 | 00:01 |
| 10 | 0.138697 | 0.211755 | 0.710372 | 00:01 |
| 11 | 0.133714 | 0.199920 | 0.726027 | 00:01 |
| 12 | 0.139617 | 0.222729 | 0.687867 | 00:01 |
| 13 | 0.131833 | 0.192564 | 0.747554 | 00:01 |
| 14 | 0.133367 | 0.180420 | 0.767123 | 00:01 |
| 15 | 0.133702 | 0.213077 | 0.709393 | 00:01 |
| 16 | 0.137790 | 0.210081 | 0.688845 | 00:01 |
| 17 | 0.134852 | 0.158134 | 0.800391 | 00:01 |
| 18 | 0.137128 | 0.197334 | 0.726027 | 00:01 |
| 19 | 0.130718 | 0.185704 | 0.740705 | 00:01 |
nn_preds, gt = learn.get_preds()
my_acc(nn_preds, gt)
0.7407045009784736
Note that gt here differs from the y_test the other classifiers were scored against (compare the class counts below); concatenating the SMOTE-resampled training data with the test data and merging on the index appears to shuffle which rows land in the validation split.
gt.sum(), len(gt)
(tensor(922), 1022)
y_test.value_counts()
0 960
1 62
Name: stroke, dtype: int64
CatBoost
from catboost import CatBoostClassifier
cb_params = {'loss_function': 'Logloss',
             'eval_metric': 'AUC',
             'cat_features': categorical_vars,
             'verbose': 200,
             'random_seed': 42}
cb_clf = CatBoostClassifier(**cb_params)
cb_clf.fit(X_train, y_train,
eval_set=(X_test, y_test),
use_best_model=True,
plot=True
);
Learning rate set to 0.052636
0: test: 0.7928763 best: 0.7928763 (0) total: 173ms remaining: 2m 52s
200: test: 0.7582745 best: 0.8060484 (11) total: 5.18s remaining: 20.6s
400: test: 0.7407258 best: 0.8060484 (11) total: 10.5s remaining: 15.6s
600: test: 0.7307124 best: 0.8060484 (11) total: 15.9s remaining: 10.6s
800: test: 0.7226983 best: 0.8060484 (11) total: 21.3s remaining: 5.29s
999: test: 0.7188004 best: 0.8060484 (11) total: 26.9s remaining: 0us
bestTest = 0.8060483871
bestIteration = 11
Shrink model to first 12 iterations.
Note: The above command provides an interactive graph that is not displayed on the blog.
cb_preds = cb_clf.predict(X_test)
f1_score(y_test, cb_preds)
0.2580645161290323
accuracy_score(y_test, cb_preds)
0.7299412915851272
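CatBoost was run above with mostly default settings. If you want to push it through the same hyperopt loop as the other models, a search space might look like this (a sketch; the ranges are illustrative, not tuned):

# Hypothetical CatBoost search space for the generic train_clf objective.
cb_space = {
    'depth': scope.int(hp.quniform('depth', 4, 10, 1)),
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-3), np.log(0.3)),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
    'iterations': 500,
    'verbose': False,
    'random_seed': 42,
}
# Reuses the generic objective from earlier:
# fmin(fn=partial(train_clf, CatBoostClassifier), space=cb_space,
#      algo=tpe.suggest, max_evals=num_trials, trials=Trials())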
Ensembling
Some of these models are already ensemble models. But who says you can’t ensemble ensemble models? No one that I listen to!
Averaging
Each of these classifiers can also return class probabilities rather than just hard labels. If you're going to average models, you'll want to work with those probabilities. Let's get them from each classifier.
xgb_probs = xgb_clf.predict_proba(X_test)
xgb_probs[:5]
array([[0.9548289 , 0.04517111],
[0.9358052 , 0.06419478],
[0.96357316, 0.03642686],
[0.3344649 , 0.6655351 ],
[0.23816943, 0.76183057]], dtype=float32)
You can see that the predictions are just the argmax of the probabilities.
xgb_probs_labels = np.argmax(xgb_probs, axis=1)
(xgb_probs_labels == xgb_preds).all()
True
Let’s get them for the other classifiers.
rf_probs = rf_clf.predict_proba(X_test)
svm_probs = svm_clf.predict_proba(X_test)
cb_probs = cb_clf.predict_proba(X_test)
ensemble_ave = np.argmax(xgb_probs + rf_probs + svm_probs + cb_probs, axis=1)  # summing gives the same argmax as averaging
f1_score(y_test, ensemble_ave)
0.26562499999999994
accuracy_score(y_test, ensemble_ave)
0.8160469667318982
This is not always going to give the best result, but it can be something to keep in your back pocket.
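A simple extension is a weighted average, where better models get more say. The weights below are made up for illustration; in practice you'd tune them on a held-out validation set:

# Hypothetical weights, one per model; these are not tuned.
weights = [0.3, 0.3, 0.2, 0.2]
probs = [xgb_probs, rf_probs, svm_probs, cb_probs]
weighted = sum(w * p for w, p in zip(weights, probs))
weighted_preds = np.argmax(weighted, axis=1)
f1_score(y_test, weighted_preds), accuracy_score(y_test, weighted_preds)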
Voting
scikit-learn
also provides a voting mechanism for ensembling, which you can see here.
from sklearn.ensemble import VotingClassifier
clfs = [('xgb', xgb_clf), ('rf', rf_clf), ('svm', svm_clf), ('cb', cb_clf)]
ensemble = VotingClassifier(clfs, voting='hard')
ensemble.fit(X_train, y_train)
[15:54:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Learning rate set to 0.024769
0: total: 47.2ms remaining: 47.2s
200: total: 6.61s remaining: 26.3s
400: total: 13.4s remaining: 20s
600: total: 20.1s remaining: 13.3s
800: total: 26.7s remaining: 6.63s
999: total: 33.3s remaining: 0us
VotingClassifier(estimators=[('xgb',
XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1,
colsample_bynode=1,
colsample_bytree=0.5671189561452116,
enable_categorical=False,
gamma=1.0071481663300468, gpu_id=-1,
importance_type=None,
interaction_constraints='',
learning_rate=0.300000012,
max_delta_step=0, max_depth=14,
min_child_weight=2.0, missing=...
scale_pos_weight=1, seed=0,
subsample=1, tree_method='exact',
validate_parameters=1,
verbosity=None)),
('rf',
RandomForestClassifier(max_depth=98,
max_features='log2',
min_samples_split=9,
n_estimators=535,
random_state=42)),
('svm',
SVC(C=2.1465036697130855, degree=4, kernel='poly',
probability=True)),
('cb',
<catboost.core.CatBoostClassifier object at 0x000001F10B3250A0>)])
ensemble_preds = ensemble.predict(X_test)
f1_score(y_test, ensemble_preds)
0.25773195876288657
accuracy_score(y_test, ensemble_preds)
0.8590998043052838
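For completeness, voting='soft' averages the predicted probabilities instead of counting hard votes, which is essentially the manual averaging from the previous section. A sketch (it refits every model, so expect similar training time):

# Soft voting averages predict_proba across the estimators; all four
# models support predict_proba (SVC because probability=True was set).
soft_ensemble = VotingClassifier(clfs, voting='soft')
soft_ensemble.fit(X_train, y_train)
soft_preds = soft_ensemble.predict(X_test)
f1_score(y_test, soft_preds), accuracy_score(y_test, soft_preds)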