This blog post shows some ways to get generally good performance on tabular data. Most of the work in getting high-performance models from tabular data comes from cleaning the dataset, clever feature engineering, and other tasks specific to the data at hand; we won’t be doing that here. Even so, you still need good baseline hyperparameters to know you’re getting the best out of your model. This post provides a way to use Bayesian optimization to find good hyperparameters and get good performance.

Table of Contents

  Clean Dataset
  Modeling
    XGBoost
    Random Forest
    SVM
    Neural Network with FastAI
    CatBoost
  Ensembling
    Averaging
    Voting

import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')
df = pd.read_csv(Path(os.getenv('DATA')) / 'stroke/healthcare-dataset-stroke-data.csv')
df.head()
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 51676 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 31112 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 60182 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 1665 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
df = df.drop('id', axis=1)
df.isnull().sum()
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64
df['stroke'].value_counts()
0    4861
1     249
Name: stroke, dtype: int64

The classes are highly imbalanced. We’ll use oversampling to adjust for this in the training data.
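
For reference, the positive class is only about 5% of the dataset:

counts = df['stroke'].value_counts()
counts[1] / counts.sum()  # ~0.049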

le = LabelEncoder()
en_df = df.apply(le.fit_transform)
en_df.head()
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 1 88 0 1 1 2 1 3850 239 1 1
1 0 82 0 0 1 3 0 3588 418 2 1
2 1 101 0 1 1 2 0 2483 198 2 1
3 0 70 0 0 1 2 1 3385 217 3 1
4 0 100 1 0 1 3 0 3394 113 2 1
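
Note that df.apply(le.fit_transform) fits a fresh encoder on every column, including the continuous ones (age, avg_glucose_level, bmi), which end up rank-encoded as you can see above. Rank encoding is mostly harmless for the tree models, though it does distort distances for the SVM and neural network. If you’d rather leave the numeric columns untouched, a minimal alternative sketch (using it instead would change the downstream numbers):

# Alternative sketch: encode only the object (string) columns,
# leaving numeric columns as-is.
en_df_alt = df.copy()
for col in en_df_alt.select_dtypes(include='object').columns:
    en_df_alt[col] = LabelEncoder().fit_transform(en_df_alt[col])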

Clean Dataset

imputer = KNNImputer(n_neighbors=4, weights="uniform")
en_df_imputed = pd.DataFrame(imputer.fit_transform(en_df), columns=en_df.columns)
en_df_imputed.values
array([[  1.,  88.,   0., ..., 239.,   1.,   1.],
       [  0.,  82.,   0., ..., 418.,   2.,   1.],
       [  1., 101.,   0., ..., 198.,   2.,   1.],
       ...,
       [  0.,  56.,   0., ..., 179.,   2.,   0.],
       [  1.,  72.,   0., ..., 129.,   1.,   0.],
       [  0.,  65.,   0., ..., 135.,   0.,   0.]])
en_df_imputed.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64
# Note: avg_glucose_level and bmi are not included in the feature list.
features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
            'work_type', 'Residence_type', 'smoking_status']
from imblearn.over_sampling import SMOTE
X, y = en_df_imputed[features], en_df_imputed["stroke"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
sm = SMOTE()
X_train, y_train = sm.fit_resample(X_train, y_train)
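
Note that SMOTE is applied only to the training split; resampling before the split would leak synthetic points into the test set. By default SMOTE oversamples the minority class until the classes are balanced, which you can verify:

# Sanity check: both classes should now have equal counts in the training data.
y_train.value_counts()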

Modeling

from functools import partial

from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from hyperopt.pyll.base import scope
from sklearn.metrics import accuracy_score, f1_score
num_trials = 500
svm_trials = 100 # svm takes much longer, so you may want to limit this
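
If you haven’t seen hyperopt before: fmin minimizes an objective function over a search space, here using the Tree-structured Parzen Estimator (tpe.suggest) to propose promising hyperparameters based on previous trials. A toy sketch of the mechanics, with a made-up one-dimensional objective:

# Toy example (hypothetical objective): minimize (x - 3)^2 over [-10, 10].
toy_trials = Trials()
toy_best = fmin(fn=lambda x: {'loss': (x - 3) ** 2, 'status': STATUS_OK},
                space=hp.uniform('x', -10, 10),
                algo=tpe.suggest,
                max_evals=50,
                trials=toy_trials)
toy_best['x']  # should land near 3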

XGBoost

from xgboost import XGBClassifier
xgb_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 3, 18, 1)),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': 180,
    'seed': 0
}
def train_clf(clf, params):
    """
    Generic objective: train the given classifier class and return the
    negative test accuracy (hyperopt minimizes the loss).
    """
    clf = clf(**params)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    accuracy = accuracy_score(y_test, preds)

    return {'loss': -accuracy, 'status': STATUS_OK}
def train_xgb(params):
    """
    XGBoost needs eval_metric set explicitly or it produces lots of warnings.
    """
    clf = XGBClassifier(**params)
    clf.fit(X_train, y_train, eval_metric='logloss')
    preds = clf.predict(X_test)
    accuracy = accuracy_score(y_test, preds)

    return {'loss': -accuracy, 'status': STATUS_OK}
trials = Trials()

fmin(fn = train_xgb,
    space = xgb_space,
    algo = tpe.suggest,
    max_evals = num_trials,
    trials = trials)
100%|██████████████████████████████████████████████| 500/500 [02:25<00:00,  3.43trial/s, best loss: -0.764187866927593]

{'colsample_bytree': 0.5671189561452116,
 'gamma': 1.0071481663300468,
 'max_depth': 14.0,
 'min_child_weight': 2.0,
 'reg_alpha': 50.0,
 'reg_lambda': 0.6971844311013579}
Note that fmin returns the raw sampled values (floats for quniform, indices for hp.choice). Use space_eval with trials.argmin to map them back to actual parameter settings:

best_hyperparams = space_eval(xgb_space, trials.argmin)
best_hyperparams
{'colsample_bytree': 0.5671189561452116,
 'gamma': 1.0071481663300468,
 'max_depth': 14,
 'min_child_weight': 2.0,
 'n_estimators': 180,
 'reg_alpha': 50.0,
 'reg_lambda': 0.6971844311013579,
 'seed': 0}
xgb_clf = XGBClassifier(**best_hyperparams)
xgb_clf.fit(X_train, y_train)
[00:09:42] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5671189561452116,
              enable_categorical=False, gamma=1.0071481663300468, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=14,
              min_child_weight=2.0, missing=nan, monotone_constraints='()',
              n_estimators=180, n_jobs=12, num_parallel_tree=1,
              predictor='auto', random_state=0, reg_alpha=50.0,
              reg_lambda=0.6971844311013579, scale_pos_weight=1, seed=0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)
xgb_preds = xgb_clf.predict(X_test)
f1_score(y_test, xgb_preds)
0.25846153846153846
accuracy_score(y_test, xgb_preds)
0.764187866927593
The test set is still heavily imbalanced (1,022 rows, the vast majority labeled 0), which is why the accuracy looks so much healthier than the F1 score. For this problem, F1 is the better indicator of how well we catch the minority (stroke) class.

Random Forest

from sklearn.ensemble import RandomForestClassifier
rf_space = {
    "n_estimators": hp.randint("n_estimators", 10, 700),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    "max_depth": hp.randint('max_depth', 1, 100),
    "min_samples_split": hp.randint('min_samples_split', 2, 20),
    "min_samples_leaf": hp.randint('min_samples_leaf', 1, 10),
    "max_features": hp.choice('max_features', ['sqrt', 'log2']),
    "random_state": 42
}
trials = Trials()

fmin(fn = partial(train_clf, RandomForestClassifier),
    space = rf_space,
    algo = tpe.suggest,
    max_evals = num_trials,
    trials = trials)
100%|█████████████████████████████████████████████| 500/500 [09:01<00:00,  1.08s/trial, best loss: -0.8679060665362035]

{'criterion': 0,
 'max_depth': 98,
 'max_features': 1,
 'min_samples_leaf': 1,
 'min_samples_split': 9,
 'n_estimators': 535}
rf_best_hyperparams = space_eval(rf_space, trials.argmin)
rf_best_hyperparams
{'criterion': 'gini',
 'max_depth': 98,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 9,
 'n_estimators': 535,
 'random_state': 42}
rf_clf = RandomForestClassifier(**rf_best_hyperparams)
rf_clf.fit(X_train, y_train)
RandomForestClassifier(max_depth=98, max_features='log2', min_samples_split=9,
                       n_estimators=535, random_state=42)
rf_preds = rf_clf.predict(X_test)
f1_score(y_test, rf_preds)
0.17177914110429446
accuracy_score(y_test, rf_preds)
0.8679060665362035

SVM

You can also try support vector machines, but I usually skip these for very large datasets. They don’t generally get the best performance on tabular data, and training takes much longer than the other models here. Fundamentally, fitting a nonlinear SVM kernel scales roughly as O(n_samples^2 * n_features), so things quickly get out of hand with a lot of samples.
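
A quick back-of-the-envelope illustration of that quadratic term: even just materializing the full n x n kernel matrix (which libsvm avoids by caching pieces of it, at the cost of recomputation) gets expensive fast:

# Rough illustration: an n x n float64 kernel matrix.
for n in [5_110, 50_000, 500_000]:
    print(f"n = {n:>7,}: {n**2 * 8 / 1e9:,.2f} GB")
# n =   5,110: 0.21 GB
# n =  50,000: 20.00 GB
# n = 500,000: 2,000.00 GB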

from sklearn.svm import SVC
svm_space = {
    'C': hp.lognormal('svm_C', 0, 1),
    'kernel': hp.choice('kernel', ['linear', 'rbf', 'poly']),
    'degree': hp.choice('degree', [2, 3, 4]),
    'probability': hp.choice('probability', [True])
}
trials = Trials()

fmin(fn = partial(train_clf, SVC),
    space = svm_space,
    algo = tpe.suggest,
    max_evals = svm_trials,
    trials = trials)
100%|█████████████████████████████████████████████| 100/100 [22:05<00:00, 13.25s/trial, best loss: -0.8111545988258317]

{'degree': 2, 'kernel': 2, 'probability': 0, 'svm_C': 2.1465036697130855}
svm_best_hyperparams = space_eval(svm_space, trials.argmin)
svm_best_hyperparams
{'C': 2.1465036697130855, 'degree': 4, 'kernel': 'poly', 'probability': True}
svm_clf = SVC(**svm_best_hyperparams)
svm_clf.fit(X_train, y_train)
SVC(C=2.1465036697130855, degree=4, kernel='poly', probability=True)
svm_preds = svm_clf.predict(X_test)
f1_score(y_test, svm_preds)
0.2771535580524344
accuracy_score(y_test, svm_preds)
0.8111545988258317

Neural Network with FastAI

I’ve had poor-to-mixed results combining neural networks with hyperopt, but I’ve included this section in case it’s helpful.

from fastai.tabular.all import *
df.head()
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1
dep_var = 'stroke'

For FastAI, we’ll combine the training and validation data into one DataFrame and then split them out later. It’s just easier this way.

full_X_df = pd.concat([X_train, X_test])
full_y_df = pd.concat([y_train, y_test])
df = pd.merge(full_X_df, full_y_df, left_index=True, right_index=True)
np.sum(y_test)
62
continuous_vars, categorical_vars = cont_cat_split(df, dep_var=dep_var)

val_indices = list(range(len(X_train), len(X_train) + len(X_test)))
ind_splitter = IndexSplitter(val_indices)
splits = ind_splitter(df) 

preprocessing = [Categorify, Normalize]
to_nn = TabularPandas(df, preprocessing, categorical_vars, continuous_vars, splits=splits, y_names=dep_var)
dls = to_nn.dataloaders(64)
def my_acc(preds, gt):
    """
    FastAI metrics take (preds, targets) while sklearn takes (targets, preds),
    so be careful with the argument order.
    """
    return accuracy_score(gt.cpu(), np.rint(preds.cpu()))
nn_space = [
    {'layer1': scope.int(hp.quniform('layer1', 2, 200, 1))},
    {'layer2': scope.int(hp.quniform('layer2', 2, 500, 2))},
    {'epochs': scope.int(hp.quniform('epochs', 1, 20, 1))},
    {'lr': hp.uniform('lr', 1e-7, 1e-1)},
]
def objective(params):
    learn = tabular_learner(dls, y_range=(y.min(), y.max()),
                            layers=[params[0]['layer1'], params[1]['layer2']],
                            metrics=accuracy)
    with learn.no_bar(), learn.no_logging():
        learn.fit(params[2]['epochs'], params[3]['lr'])
    # Note: this minimizes the final (smoothed) training loss, not validation loss.
    return {'loss': learn.recorder.losses[-1], 'status': STATUS_OK}
trials = Trials()

best = fmin(objective,
    space=nn_space,
    algo=tpe.suggest,
    max_evals=num_trials,
           trials=trials)
print(best)
100%|███████████████████████████████████████████| 500/500 [2:36:05<00:00, 18.73s/trial, best loss: 0.12682415544986725]
{'epochs': 20.0, 'layer1': 181.0, 'layer2': 158.0, 'lr': 0.004167824772417915}
nn_best_hyperparams = space_eval(nn_space, trials.argmin)
nn_best_hyperparams
({'layer1': 181},
 {'layer2': 158},
 {'epochs': 20},
 {'lr': 0.004167824772417915})
learn = tabular_learner(dls, y_range=(y.min(), y.max()),
                        layers=[nn_best_hyperparams[0]['layer1'], nn_best_hyperparams[1]['layer2']],
                        metrics=my_acc)
learn.fit(nn_best_hyperparams[2]['epochs'], nn_best_hyperparams[3]['lr'])
epoch train_loss valid_loss my_acc time
0 0.152055 0.210501 0.703523 00:01
1 0.149081 0.241731 0.587084 00:01
2 0.147788 0.208247 0.710372 00:01
3 0.144673 0.194463 0.737769 00:01
4 0.142503 0.195599 0.729941 00:01
5 0.143986 0.247592 0.614481 00:01
6 0.140043 0.233124 0.599804 00:01
7 0.140295 0.193624 0.726027 00:01
8 0.135865 0.200070 0.707436 00:01
9 0.134626 0.198199 0.737769 00:01
10 0.138697 0.211755 0.710372 00:01
11 0.133714 0.199920 0.726027 00:01
12 0.139617 0.222729 0.687867 00:01
13 0.131833 0.192564 0.747554 00:01
14 0.133367 0.180420 0.767123 00:01
15 0.133702 0.213077 0.709393 00:01
16 0.137790 0.210081 0.688845 00:01
17 0.134852 0.158134 0.800391 00:01
18 0.137128 0.197334 0.726027 00:01
19 0.130718 0.185704 0.740705 00:01
nn_preds, gt = learn.get_preds()
my_acc(nn_preds, gt)
0.7407045009784736

Note that the gt here is different than the y_test used for the other classifiers. The index-based merge above is the culprit: SMOTE’s fit_resample resets X_train’s index to 0..n, and those labels collide with X_test’s original index labels, so merging on index duplicates and scrambles rows, and the validation split no longer lines up with the original test set. You can see the mismatch in the counts below; a sketch of a fix follows them.

gt.sum(), len(gt)
(tensor(922), 1022)
y_test.value_counts()
0    960
1     62
Name: stroke, dtype: int64
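
If you want the FastAI validation split to line up exactly with the earlier test set, one fix (a sketch; rerunning with it would change the numbers above) is to reset the indices before stacking, so positional order and index labels agree:

# Sketch of an index-safe construction: stack rows with fresh indices,
# then attach the target as a column instead of merging on index labels.
full_X_df = pd.concat([X_train, X_test], ignore_index=True)
full_y_df = pd.concat([y_train, y_test], ignore_index=True)
df_fixed = pd.concat([full_X_df, full_y_df], axis=1)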

CatBoost

from catboost import CatBoostClassifier
cb_params = {'loss_function':'Logloss',
             'eval_metric':'AUC',
             'cat_features': categorical_vars,
             'verbose': 200,
             'random_seed': 42
            }
cb_clf = CatBoostClassifier(**cb_params)
cb_clf.fit(X_train, y_train,
          eval_set=(X_test, y_test),
          use_best_model=True,
          plot=True
         );

Learning rate set to 0.052636
0:	test: 0.7928763	best: 0.7928763 (0)	total: 173ms	remaining: 2m 52s
200:	test: 0.7582745	best: 0.8060484 (11)	total: 5.18s	remaining: 20.6s
400:	test: 0.7407258	best: 0.8060484 (11)	total: 10.5s	remaining: 15.6s
600:	test: 0.7307124	best: 0.8060484 (11)	total: 15.9s	remaining: 10.6s
800:	test: 0.7226983	best: 0.8060484 (11)	total: 21.3s	remaining: 5.29s
999:	test: 0.7188004	best: 0.8060484 (11)	total: 26.9s	remaining: 0us

bestTest = 0.8060483871
bestIteration = 11

Shrink model to first 12 iterations.

Note: The above command provides an interactive graph that is not displayed on the blog.

cb_preds = cb_clf.predict(X_test)
f1_score(y_test, cb_preds)
0.2580645161290323
accuracy_score(y_test, cb_preds)
0.7299412915851272

Ensembling

Some of these models are already ensemble models. But who says you can’t ensemble ensemble models? No one that I listen to!

Averaging

Each of these classifiers can also return class probabilities. If you’re going to average models, you’ll want to work with those rather than the hard labels. Let’s get the probabilities from each classifier.

xgb_probs = xgb_clf.predict_proba(X_test)
xgb_probs[:5]
array([[0.9548289 , 0.04517111],
       [0.9358052 , 0.06419478],
       [0.96357316, 0.03642686],
       [0.3344649 , 0.6655351 ],
       [0.23816943, 0.76183057]], dtype=float32)

You can see that the predictions are just the argmax of the probabilities.

xgb_probs_labels = np.argmax(xgb_probs, axis=1)
(xgb_probs_labels == xgb_preds).all()
True

Let’s get them for the other classifiers.

rf_probs = rf_clf.predict_proba(X_test)
svm_probs = svm_clf.predict_proba(X_test)
cb_probs = cb_clf.predict_proba(X_test)
ensemble_ave = np.argmax(xgb_probs + rf_probs + svm_probs + cb_probs, axis=1)
f1_score(y_test, ensemble_ave)
0.26562499999999994
accuracy_score(y_test, ensemble_ave)
0.8160469667318982

This is not always going to give the best result, but it can be something to keep in your back pocket.
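
A simple variation is a weighted average, if you trust some models more than others. A quick sketch with made-up weights (in practice you’d tune them on a separate validation split):

# Hypothetical weights -- purely illustrative, not tuned.
weights = [1.0, 2.0, 1.0, 1.0]  # xgb, rf, svm, cb
stacked = (weights[0] * xgb_probs + weights[1] * rf_probs
           + weights[2] * svm_probs + weights[3] * cb_probs)
weighted_ave = np.argmax(stacked, axis=1)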

Voting

scikit-learn also provides a voting mechanism for ensembling through its VotingClassifier.

from sklearn.ensemble import VotingClassifier
clfs = [('xgb', xgb_clf), ('rf', rf_clf), ('svm', svm_clf), ('cb', cb_clf)]
ensemble = VotingClassifier(clfs, voting='hard')
ensemble.fit(X_train, y_train)
[15:54:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Learning rate set to 0.024769
0:	total: 47.2ms	remaining: 47.2s
200:	total: 6.61s	remaining: 26.3s
400:	total: 13.4s	remaining: 20s
600:	total: 20.1s	remaining: 13.3s
800:	total: 26.7s	remaining: 6.63s
999:	total: 33.3s	remaining: 0us

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=0.5671189561452116,
                                            enable_categorical=False,
                                            gamma=1.0071481663300468, gpu_id=-1,
                                            importance_type=None,
                                            interaction_constraints='',
                                            learning_rate=0.300000012,
                                            max_delta_step=0, max_depth=14,
                                            min_child_weight=2.0, missing=...
                                            scale_pos_weight=1, seed=0,
                                            subsample=1, tree_method='exact',
                                            validate_parameters=1,
                                            verbosity=None)),
                             ('rf',
                              RandomForestClassifier(max_depth=98,
                                                     max_features='log2',
                                                     min_samples_split=9,
                                                     n_estimators=535,
                                                     random_state=42)),
                             ('svm',
                              SVC(C=2.1465036697130855, degree=4, kernel='poly',
                                  probability=True)),
                             ('cb',
                              <catboost.core.CatBoostClassifier object at 0x000001F10B3250A0>)])
ensemble_preds = ensemble.predict(X_test)
f1_score(y_test, ensemble_preds)
0.25773195876288657
accuracy_score(y_test, ensemble_preds)
0.8590998043052838
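
With voting='hard', each model gets one vote from its predicted label. Passing voting='soft' instead averages the models’ predict_proba outputs, which is essentially the manual averaging from the previous section; a sketch:

# Soft voting averages predicted probabilities across the estimators. Every
# estimator must implement predict_proba (hence probability=True on the SVC).
soft_ensemble = VotingClassifier(clfs, voting='soft')
soft_ensemble.fit(X_train, y_train)
accuracy_score(y_test, soft_ensemble.predict(X_test))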