This blog post shows some ways to get generally good performance on tabular data. Most of the work in getting high-performance models from tabular data comes from cleaning the dataset, clever feature engineering, and other tasks specific to the data at hand; we won’t be doing that here. Even so, you still need good baseline hyperparameters to know you’re getting the best out of your model. This post provides a way to use Bayesian optimization to find good hyperparameters and get good performance.

Table of Contents

  Clean Dataset
  Modeling
    XGBoost
    Random Forest
    SVM
    Neural Network with FastAI
    CatBoost
  Ensembling
    Averaging
    Voting

import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')
df = pd.read_csv(Path(os.getenv('DATA')) / 'stroke/healthcare-dataset-stroke-data.csv')
df.head()
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 51676 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 31112 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 60182 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 1665 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
df = df.drop('id', axis=1)
df.isnull().sum()
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64
df['stroke'].value_counts()
0    4861
1     249
Name: stroke, dtype: int64

The classes are highly imbalanced. We’ll use oversampling to adjust for this in the training data.
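
For reference, the positive class is only about 5% of the dataset:

counts = df['stroke'].value_counts()
counts[1] / counts.sum()  # ~0.049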

le = LabelEncoder()
en_df = df.apply(le.fit_transform)
en_df.head()
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 1 88 0 1 1 2 1 3850 239 1 1
1 0 82 0 0 1 3 0 3588 418 2 1
2 1 101 0 1 1 2 0 2483 198 2 1
3 0 70 0 0 1 2 1 3385 217 3 1
4 0 100 1 0 1 3 0 3394 113 2 1
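
Note that df.apply(le.fit_transform) fits a fresh encoder on every column, including the continuous ones (age, avg_glucose_level, bmi), which end up rank-encoded as you can see above. Rank encoding is mostly harmless for the tree models, though it does distort distances for the SVM and neural network. If you’d rather leave the numeric columns untouched, a minimal alternative sketch (using it instead would change the downstream numbers):

# Alternative sketch: encode only the object (string) columns,
# leaving numeric columns as-is.
en_df_alt = df.copy()
for col in en_df_alt.select_dtypes(include='object').columns:
    en_df_alt[col] = LabelEncoder().fit_transform(en_df_alt[col])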

Clean Dataset

imputer = KNNImputer(n_neighbors=4, weights="uniform")
en_df_imputed = pd.DataFrame(imputer.fit_transform(en_df), columns=en_df.columns)
en_df_imputed.values
array([[  1.,  88.,   0., ..., 239.,   1.,   1.],
       [  0.,  82.,   0., ..., 418.,   2.,   1.],
       [  1., 101.,   0., ..., 198.,   2.,   1.],
       ...,
       [  0.,  56.,   0., ..., 179.,   2.,   0.],
       [  1.,  72.,   0., ..., 129.,   1.,   0.],
       [  0.,  65.,   0., ..., 135.,   0.,   0.]])
en_df_imputed.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64
# Note: avg_glucose_level and bmi are not included in the feature list.
features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
            'work_type', 'Residence_type', 'smoking_status']
from imblearn.over_sampling import SMOTE
X, y = en_df_imputed[features], en_df_imputed["stroke"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
sm = SMOTE()
X_train, y_train = sm.fit_resample(X_train, y_train)
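
Note that SMOTE is applied only to the training split; resampling before the split would leak synthetic points into the test set. By default SMOTE oversamples the minority class until the classes are balanced, which you can verify:

# Sanity check: both classes should now have equal counts in the training data.
y_train.value_counts()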

Modeling

from functools import partial

from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from hyperopt.pyll.base import scope
from sklearn.metrics import accuracy_score, f1_score
num_trials = 500
svm_trials = 100 # svm takes much longer, so you may want to limit this
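
If you haven’t seen hyperopt before: fmin minimizes an objective function over a search space, here using the Tree-structured Parzen Estimator (tpe.suggest) to propose promising hyperparameters based on previous trials. A toy sketch of the mechanics, with a made-up one-dimensional objective:

# Toy example (hypothetical objective): minimize (x - 3)^2 over [-10, 10].
toy_trials = Trials()
toy_best = fmin(fn=lambda x: {'loss': (x - 3) ** 2, 'status': STATUS_OK},
                space=hp.uniform('x', -10, 10),
                algo=tpe.suggest,
                max_evals=50,
                trials=toy_trials)
toy_best['x']  # should land near 3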

XGBoost

from xgboost import XGBClassifier
xgb_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 3, 18, 1)),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': 180,
    'seed': 0
}
def train_clf(clf, params):
    """
    Generic objective: train the given classifier class and return the
    negative test accuracy (hyperopt minimizes the loss).
    """
    clf = clf(**params)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    accuracy = accuracy_score(y_test, preds)

    return {'loss': -accuracy, 'status': STATUS_OK}
def train_xgb(params):
    """
    XGBoost needs eval_metric set explicitly or it produces lots of warnings.
    """
    clf = XGBClassifier(**params)
    clf.fit(X_train, y_train, eval_metric='logloss')
    preds = clf.predict(X_test)
    accuracy = accuracy_score(y_test, preds)

    return {'loss': -accuracy, 'status': STATUS_OK}
trials = Trials()

fmin(fn = train_xgb,
    space = xgb_space,
    algo = tpe.suggest,
    max_evals = num_trials,
    trials = trials)
100%|██████████████████████████████████████████████| 500/500 [02:25<00:00,  3.43trial/s, best loss: -0.764187866927593]

{'colsample_bytree': 0.5671189561452116,
 'gamma': 1.0071481663300468,
 'max_depth': 14.0,
 'min_child_weight': 2.0,
 'reg_alpha': 50.0,
 'reg_lambda': 0.6971844311013579}
Note that fmin returns the raw sampled values (floats for quniform, indices for hp.choice). Use space_eval with trials.argmin to map them back to actual parameter settings:

best_hyperparams = space_eval(xgb_space, trials.argmin)
best_hyperparams
{'colsample_bytree': 0.5671189561452116,
 'gamma': 1.0071481663300468,
 'max_depth': 14,
 'min_child_weight': 2.0,
 'n_estimators': 180,
 'reg_alpha': 50.0,
 'reg_lambda': 0.6971844311013579,
 'seed': 0}
xgb_clf = XGBClassifier(**best_hyperparams)
xgb_clf.fit(X_train, y_train)
[00:09:42] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5671189561452116,
              enable_categorical=False, gamma=1.0071481663300468, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=14,
              min_child_weight=2.0, missing=nan, monotone_constraints='()',
              n_estimators=180, n_jobs=12, num_parallel_tree=1,
              predictor='auto', random_state=0, reg_alpha=50.0,
              reg_lambda=0.6971844311013579, scale_pos_weight=1, seed=0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)
xgb_preds = xgb_clf.predict(X_test)
f1_score(y_test, xgb_preds)
0.25846153846153846
accuracy_score(y_test, xgb_preds)
0.764187866927593
The test set is still heavily imbalanced (1,022 rows, the vast majority labeled 0), which is why the accuracy looks so much healthier than the F1 score. For this problem, F1 is the better indicator of how well we catch the minority (stroke) class.

Random Forest

from sklearn.ensemble import RandomForestClassifier
rf_space = {
    "n_estimators": hp.randint("n_estimators", 10, 700),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    "max_depth": hp.randint('max_depth', 1, 100),
    "min_samples_split": hp.randint('min_samples_split', 2, 20),
    "min_samples_leaf": hp.randint('min_samples_leaf', 1, 10),
    "max_features": hp.choice('max_features', ['sqrt', 'log2']),
    "random_state": 42
}
trials = Trials()

fmin(fn = partial(train_clf, RandomForestClassifier),
    space = rf_space,
    algo = tpe.suggest,
    max_evals = num_trials,
    trials = trials)
100%|█████████████████████████████████████████████| 500/500 [09:01<00:00,  1.08s/trial, best loss: -0.8679060665362035]

{'criterion': 0,
 'max_depth': 98,
 'max_features': 1,
 'min_samples_leaf': 1,
 'min_samples_split': 9,
 'n_estimators': 535}
rf_best_hyperparams = space_eval(rf_space, trials.argmin)
rf_best_hyperparams
{'criterion': 'gini',
 'max_depth': 98,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 9,
 'n_estimators': 535,
 'random_state': 42}
rf_clf = RandomForestClassifier(**rf_best_hyperparams)
rf_clf.fit(X_train, y_train)
RandomForestClassifier(max_depth=98, max_features='log2', min_samples_split=9,
                       n_estimators=535, random_state=42)
rf_preds = rf_clf.predict(X_test)
f1_score(y_test, rf_preds)
0.17177914110429446
accuracy_score(y_test, rf_preds)
0.8679060665362035

SVM

You can also try support vector machines, but I usually skip these for very large datasets. They don’t generally get the best performance on tabular data, and training takes much longer than the other models here. Fundamentally, fitting a nonlinear SVM kernel scales roughly as O(n_samples^2 * n_features), so things quickly get out of hand with a lot of samples.
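
A quick back-of-the-envelope illustration of that quadratic term: even just materializing the full n x n kernel matrix (which libsvm avoids by caching pieces of it, at the cost of recomputation) gets expensive fast:

# Rough illustration: an n x n float64 kernel matrix.
for n in [5_110, 50_000, 500_000]:
    print(f"n = {n:>7,}: {n**2 * 8 / 1e9:,.2f} GB")
# n =   5,110: 0.21 GB
# n =  50,000: 20.00 GB
# n = 500,000: 2,000.00 GB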

from sklearn.svm import SVC
svm_space = {
    'C': hp.lognormal('svm_C', 0, 1),
    'kernel': hp.choice('kernel', ['linear', 'rbf', 'poly']),
    'degree': hp.choice('degree', [2, 3, 4]),
    'probability': hp.choice('probability', [True])
}
trials = Trials()

fmin(fn = partial(train_clf, SVC),
    space = svm_space,
    algo = tpe.suggest,
    max_evals = svm_trials,
    trials = trials)
100%|█████████████████████████████████████████████| 100/100 [22:05<00:00, 13.25s/trial, best loss: -0.8111545988258317]

{'degree': 2, 'kernel': 2, 'probability': 0, 'svm_C': 2.1465036697130855}
svm_best_hyperparams = space_eval(svm_space, trials.argmin)
svm_best_hyperparams
{'C': 2.1465036697130855, 'degree': 4, 'kernel': 'poly', 'probability': True}
svm_clf = SVC(**svm_best_hyperparams)
svm_clf.fit(X_train, y_train)
SVC(C=2.1465036697130855, degree=4, kernel='poly', probability=True)
svm_preds = svm_clf.predict(X_test)
f1_score(y_test, svm_preds)
0.2771535580524344
accuracy_score(y_test, svm_preds)
0.8111545988258317

Neural Network with FastAI

I’ve had poor-to-mixed results combining neural networks with hyperopt, but I’ve included this section in case it’s helpful.

from fastai.tabular.all import *
df.head()
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1
dep_var = 'stroke'

For FastAI, we’ll combine the training and validation data into one DataFrame and then split them out later. It’s just easier this way.

full_X_df = pd.concat([X_train, X_test])
full_y_df = pd.concat([y_train, y_test])
df = pd.merge(full_X_df, full_y_df, left_index=True, right_index=True)
np.sum(y_test)
62
continuous_vars, categorical_vars = cont_cat_split(df, dep_var=dep_var)

val_indices = list(range(len(X_train), len(X_train) + len(X_test)))
ind_splitter = IndexSplitter(val_indices)
splits = ind_splitter(df) 

preprocessing = [Categorify, Normalize]
to_nn = TabularPandas(df, preprocessing, categorical_vars, continuous_vars, splits=splits, y_names=dep_var)
dls = to_nn.dataloaders(64)
def my_acc(preds, gt):
    """
    FastAI metrics take (preds, targets) while sklearn takes (targets, preds),
    so be careful with the argument order.
    """
    return accuracy_score(gt.cpu(), np.rint(preds.cpu()))
nn_space = [
    {'layer1': scope.int(hp.quniform('layer1', 2, 200, 1))},
    {'layer2': scope.int(hp.quniform('layer2', 2, 500, 2))},
    {'epochs': scope.int(hp.quniform('epochs', 1, 20, 1))},
    {'lr': hp.uniform('lr', 1e-7, 1e-1)},
]
def objective(params):
    learn = tabular_learner(dls, y_range=(y.min(), y.max()),
                            layers=[params[0]['layer1'], params[1]['layer2']],
                            metrics=accuracy)
    with learn.no_bar(), learn.no_logging():
        learn.fit(params[2]['epochs'], params[3]['lr'])
    # Note: this minimizes the final (smoothed) training loss, not validation loss.
    return {'loss': learn.recorder.losses[-1], 'status': STATUS_OK}
trials = Trials()

best = fmin(objective,
    space=nn_space,
    algo=tpe.suggest,
    max_evals=num_trials,
           trials=trials)
print(best)
100%|███████████████████████████████████████████| 500/500 [2:36:05<00:00, 18.73s/trial, best loss: 0.12682415544986725]
{'epochs': 20.0, 'layer1': 181.0, 'layer2': 158.0, 'lr': 0.004167824772417915}
nn_best_hyperparams = space_eval(nn_space, trials.argmin)
nn_best_hyperparams
({'layer1': 181},
 {'layer2': 158},
 {'epochs': 20},
 {'lr': 0.004167824772417915})
learn = tabular_learner(dls, y_range=(y.min(), y.max()),
                        layers=[nn_best_hyperparams[0]['layer1'], nn_best_hyperparams[1]['layer2']],
                        metrics=my_acc)
learn.fit(nn_best_hyperparams[2]['epochs'], nn_best_hyperparams[3]['lr'])
epoch train_loss valid_loss my_acc time
0 0.152055 0.210501 0.703523 00:01
1 0.149081 0.241731 0.587084 00:01
2 0.147788 0.208247 0.710372 00:01
3 0.144673 0.194463 0.737769 00:01
4 0.142503 0.195599 0.729941 00:01
5 0.143986 0.247592 0.614481 00:01
6 0.140043 0.233124 0.599804 00:01
7 0.140295 0.193624 0.726027 00:01
8 0.135865 0.200070 0.707436 00:01
9 0.134626 0.198199 0.737769 00:01
10 0.138697 0.211755 0.710372 00:01
11 0.133714 0.199920 0.726027 00:01
12 0.139617 0.222729 0.687867 00:01
13 0.131833 0.192564 0.747554 00:01
14 0.133367 0.180420 0.767123 00:01
15 0.133702 0.213077 0.709393 00:01
16 0.137790 0.210081 0.688845 00:01
17 0.134852 0.158134 0.800391 00:01
18 0.137128 0.197334 0.726027 00:01
19 0.130718 0.185704 0.740705 00:01
nn_preds, gt = learn.get_preds()
my_acc(nn_preds, gt)
0.7407045009784736

Note that the gt here is different than the y_test used for the other classifiers. The index-based merge above is the culprit: SMOTE’s fit_resample resets X_train’s index to 0..n, and those labels collide with X_test’s original index labels, so merging on index duplicates and scrambles rows, and the validation split no longer lines up with the original test set. You can see the mismatch in the counts below; a sketch of a fix follows them.

gt.sum(), len(gt)
(tensor(922), 1022)
y_test.value_counts()
0    960
1     62
Name: stroke, dtype: int64
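
If you want the FastAI validation split to line up exactly with the earlier test set, one fix (a sketch; rerunning with it would change the numbers above) is to reset the indices before stacking, so positional order and index labels agree:

# Sketch of an index-safe construction: stack rows with fresh indices,
# then attach the target as a column instead of merging on index labels.
full_X_df = pd.concat([X_train, X_test], ignore_index=True)
full_y_df = pd.concat([y_train, y_test], ignore_index=True)
df_fixed = pd.concat([full_X_df, full_y_df], axis=1)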

CatBoost

from catboost import CatBoostClassifier
cb_params = {'loss_function':'Logloss',
             'eval_metric':'AUC',
             'cat_features': categorical_vars,
             'verbose': 200,
             'random_seed': 42
            }
cb_clf = CatBoostClassifier(**cb_params)
cb_clf.fit(X_train, y_train,
          eval_set=(X_test, y_test),
          use_best_model=True,
          plot=True
         );

Learning rate set to 0.052636
0:	test: 0.7928763	best: 0.7928763 (0)	total: 173ms	remaining: 2m 52s
200:	test: 0.7582745	best: 0.8060484 (11)	total: 5.18s	remaining: 20.6s
400:	test: 0.7407258	best: 0.8060484 (11)	total: 10.5s	remaining: 15.6s
600:	test: 0.7307124	best: 0.8060484 (11)	total: 15.9s	remaining: 10.6s
800:	test: 0.7226983	best: 0.8060484 (11)	total: 21.3s	remaining: 5.29s
999:	test: 0.7188004	best: 0.8060484 (11)	total: 26.9s	remaining: 0us

bestTest = 0.8060483871
bestIteration = 11

Shrink model to first 12 iterations.

Note: The above command provides an interactive graph that is not displayed on the blog.

cb_preds = cb_clf.predict(X_test)
f1_score(y_test, cb_preds)
0.2580645161290323
accuracy_score(y_test, cb_preds)
0.7299412915851272

Ensembling

Some of these models are already ensemble models. But who says you can’t ensemble ensemble models? No one that I listen to!

Averaging

Each of these classifiers can also return class probabilities. If you’re going to average models, you’ll want to work with those rather than the hard labels. Let’s get the probabilities from each classifier.

xgb_probs = xgb_clf.predict_proba(X_test)
xgb_probs[:5]
array([[0.9548289 , 0.04517111],
       [0.9358052 , 0.06419478],
       [0.96357316, 0.03642686],
       [0.3344649 , 0.6655351 ],
       [0.23816943, 0.76183057]], dtype=float32)

You can see that the predictions are just the argmax of the probabilities.

xgb_probs_labels = np.argmax(xgb_probs, axis=1)
(xgb_probs_labels == xgb_preds).all()
True

Let’s get them for the other classifiers.

rf_probs = rf_clf.predict_proba(X_test)
svm_probs = svm_clf.predict_proba(X_test)
cb_probs = cb_clf.predict_proba(X_test)
ensemble_ave = np.argmax(xgb_probs + rf_probs + svm_probs + cb_probs, axis=1)
f1_score(y_test, ensemble_ave)
0.26562499999999994
accuracy_score(y_test, ensemble_ave)
0.8160469667318982

This is not always going to give the best result, but it can be something to keep in your back pocket.
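
A simple variation is a weighted average, if you trust some models more than others. A quick sketch with made-up weights (in practice you’d tune them on a separate validation split):

# Hypothetical weights -- purely illustrative, not tuned.
weights = [1.0, 2.0, 1.0, 1.0]  # xgb, rf, svm, cb
stacked = (weights[0] * xgb_probs + weights[1] * rf_probs
           + weights[2] * svm_probs + weights[3] * cb_probs)
weighted_ave = np.argmax(stacked, axis=1)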

Voting

scikit-learn also provides a voting mechanism for ensembling through its VotingClassifier.

from sklearn.ensemble import VotingClassifier
clfs = [('xgb', xgb_clf), ('rf', rf_clf), ('svm', svm_clf), ('cb', cb_clf)]
ensemble = VotingClassifier(clfs, voting='hard')
ensemble.fit(X_train, y_train)
[15:54:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Learning rate set to 0.024769
0:	total: 47.2ms	remaining: 47.2s
200:	total: 6.61s	remaining: 26.3s
400:	total: 13.4s	remaining: 20s
600:	total: 20.1s	remaining: 13.3s
800:	total: 26.7s	remaining: 6.63s
999:	total: 33.3s	remaining: 0us

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=0.5671189561452116,
                                            enable_categorical=False,
                                            gamma=1.0071481663300468, gpu_id=-1,
                                            importance_type=None,
                                            interaction_constraints='',
                                            learning_rate=0.300000012,
                                            max_delta_step=0, max_depth=14,
                                            min_child_weight=2.0, missing=...
                                            scale_pos_weight=1, seed=0,
                                            subsample=1, tree_method='exact',
                                            validate_parameters=1,
                                            verbosity=None)),
                             ('rf',
                              RandomForestClassifier(max_depth=98,
                                                     max_features='log2',
                                                     min_samples_split=9,
                                                     n_estimators=535,
                                                     random_state=42)),
                             ('svm',
                              SVC(C=2.1465036697130855, degree=4, kernel='poly',
                                  probability=True)),
                             ('cb',
                              <catboost.core.CatBoostClassifier object at 0x000001F10B3250A0>)])
ensemble_preds = ensemble.predict(X_test)
f1_score(y_test, ensemble_preds)
0.25773195876288657
accuracy_score(y_test, ensemble_preds)
0.8590998043052838
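
With voting='hard', each model gets one vote from its predicted label. Passing voting='soft' instead averages the models’ predict_proba outputs, which is essentially the manual averaging from the previous section; a sketch:

# Soft voting averages predicted probabilities across the estimators. Every
# estimator must implement predict_proba (hence probability=True on the SVC).
soft_ensemble = VotingClassifier(clfs, voting='soft')
soft_ensemble.fit(X_train, y_train)
accuracy_score(y_test, soft_ensemble.predict(X_test))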