from pycaret.datasets import get_data
from pycaret.classification import *

from sklearn.metrics import balanced_accuracy_score
from imblearn.over_sampling import RandomOverSampler


# Log the installed pycaret version so this run can be reproduced later
from importlib.metadata import version

pycaret_version = version('pycaret')
print(f'pycaret=={pycaret_version}')

pycaret==2.3.1


# Get a list of all pre-packaged data
# (uncomment the call below to display the index of available datasets)
# get_data('index')

# Select a pre-packaged data for testing
# Loads the 'diabetes' example dataset; `data` is what setup() receives below
# and its target column is 'Class variable'.
data = get_data('diabetes')


# Oversampler handed to pycaret for imbalance correction; 'not majority'
# resamples every class except the largest one, seeded for reproducibility
ros = RandomOverSampler(random_state=6, sampling_strategy='not majority')

# Columns whose type is stated explicitly. Columns not listed here are left
# for pycaret to infer (the recorded setup summary reports 8 numeric features).
numeric_cols = [
    'Number of times pregnant',
    'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
    'Diastolic blood pressure (mm Hg)',
    'Triceps skin fold thickness (mm)',
    '2-Hour serum insulin (mu U/ml)',
    'Body mass index (weight in kg/(height in m)^2)',
]
categorical_cols = []
ignore_cols = []

# Configure the pycaret experiment; arguments are grouped by concern
clf_setup = setup(
    # --- data and column roles ---
    data=data,
    target='Class variable',
    numeric_features=numeric_cols,
    categorical_features=categorical_cols,
    ignore_features=ignore_cols,
    # --- preprocessing ---
    preprocess=True,
    imputation_type='simple',
    numeric_imputation='median',
    normalize=True,
    normalize_method='minmax',
    # --- class imbalance ---
    fix_imbalance=True,
    fix_imbalance_method=ros,
    # --- cross-validation ---
    fold_strategy='stratifiedkfold',
    fold=10,
    # --- runtime ---
    n_jobs=-1,
    session_id=6,
    silent=True,
    # --- experiment logging ---
    log_experiment=True,
    experiment_name='diabetes1',
    log_data=True,
    log_profile=True,
)

# Add additional metrics for model evaluation
# Registers sklearn's balanced_accuracy_score under the display name
# 'Balanced Accuracy'; that name is the sort/optimize key passed to
# compare_models, tune_model and automl further down.
# NOTE(review): target='pred' presumably scores predicted labels rather than
# probabilities — confirm against the pycaret add_metric documentation.
add_metric('balacc', 'Balanced Accuracy', balanced_accuracy_score, greater_is_better=True, target='pred')

# Retrieve and save processed data
X_data = get_config('X')
# Rename through Series.rename(), which returns a renamed copy, instead of
# assigning to `.name`. get_config() is assumed to return pycaret's own target
# Series rather than a copy (TODO confirm), so an in-place rename would
# silently change the experiment's internal state for every later step.
y_data = get_config('y').rename('Target variable')
# Align features and target on their row index (presumably identical for X
# and y, since both come from the same experiment)
comb_data = X_data.merge(y_data, left_index=True, right_index=True)
# index=False: only the feature columns and the target column are written
comb_data.to_csv('./processed_data.csv', index=False)

# data_processing_pipeline = get_config('prep_pipe')


# Candidate models: logistic regression plus every model pycaret lists under
# type='ensemble' (uncomment the next line to see all available models)
# models()
ensemble_model_ids = models(type='ensemble').index.tolist()
chosen_model_types_list = ['lr', *ensemble_model_ids]

# Compare the candidates and keep the three best by balanced accuracy;
# errors='raise' makes a failing model stop the run instead of being skipped
top3_models = compare_models(
    errors='raise',
    sort='Balanced Accuracy',
    n_select=3,
    include=chosen_model_types_list,
)

print(top3_models)

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=6, verbose=0,
                       warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=6, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False), AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=6)]


# Tune top models
# The same search settings are applied to each of the selected models.
# NOTE(review): choose_better=True is expected to fall back to the untuned
# model when tuning does not improve the metric — confirm in pycaret docs.
tuning_options = dict(
    optimize='Balanced Accuracy',
    search_library='tune-sklearn',
    search_algorithm='hyperopt',
    choose_better=True,
)

tuned_top3_models = [
    tune_model(candidate, **tuning_options) for candidate in top3_models
]

print(tuned_top3_models)

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={},
                       criterion='entropy', max_depth=5,
                       max_features=0.6311732042950683, max_leaf_nodes=None,
                       max_samples=None,
                       min_impurity_decrease=9.124812416452801e-08,
                       min_impurity_split=None, min_samples_leaf=5,
                       min_samples_split=6, min_weight_fraction_leaf=0.0,
                       n_estimators=120, n_jobs=-1, oob_score=False,
                       random_state=6, verbose=0, warm_start=False), LogisticRegression(C=6.68643136832486, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=6, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False), AdaBoostClassifier(algorithm='SAMME', base_estimator=None,
                   learning_rate=0.051736972654183026, n_estimators=166,
                   random_state=6)]


# Combine the tuned models into one voting classifier using soft voting
blended_model = blend_models(estimator_list=tuned_top3_models, method='soft')

print(blended_model)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight={},
                                                     criterion='entropy',
                                                     max_depth=5,
                                                     max_features=0.6311732042950683,
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=9.124812416452801e-08,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=5,
                                                     min_samples_split=6,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimato...
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=6, solver='lbfgs',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('ada',
                              AdaBoostClassifier(algorithm='SAMME',
                                                 base_estimator=None,
                                                 learning_rate=0.051736972654183026,
                                                 n_estimators=166,
                                                 random_state=6))],
                 flatten_transform=True, n_jobs=-1, verbose=False,
                 voting='soft', weights=None)


# Make interactive standard diagnostic plots
# for the blended voting classifier
evaluate_model(blended_model)

# Individual diagnostic plots, kept for reference; uncomment to render one
# at a time for the blended model.
# plot_model(blended_model, plot='parameter')
# plot_model(blended_model, plot='manifold')
# plot_model(blended_model, plot='learning')
# plot_model(blended_model, plot='confusion_matrix')
# plot_model(blended_model, plot='class_report')
# # plot_model(blended_model, plot='feature')


# Interpret model - only for tree-based models
# Index 0 is the tuned random forest in the recorded run. The order follows
# the compare_models ranking, so re-check this index if the ranking changes.
tree_model = tuned_top3_models[0]
glucose_feature = 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test'

interpret_model(tree_model, plot='summary')
interpret_model(tree_model, plot='correlation', feature=glucose_feature)
interpret_model(tree_model, plot='reason', observation=100)

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


# Select best model out of all models trained in current session
# ranked by the custom 'Balanced Accuracy' metric; in the recorded run this
# returned the blended VotingClassifier.

best_model = automl(optimize = 'Balanced Accuracy')

print(best_model)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight={},
                                                     criterion='entropy',
                                                     max_depth=5,
                                                     max_features=0.6311732042950683,
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=9.124812416452801e-08,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=5,
                                                     min_samples_split=6,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimato...
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=6, solver='lbfgs',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('ada',
                              AdaBoostClassifier(algorithm='SAMME',
                                                 base_estimator=None,
                                                 learning_rate=0.051736972654183026,
                                                 n_estimators=166,
                                                 random_state=6))],
                 flatten_transform=True, n_jobs=-1, verbose=False,
                 voting='soft', weights=None)


# Final check on performance of best model
# Rank the automl pick against the three tuned models on balanced accuracy.
# The call stays the last bare expression so the notebook displays its result.

final_candidates = [best_model, *tuned_top3_models]
compare_models(
    include=final_candidates,
    sort='Balanced Accuracy',
    errors='raise',
)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight={},
                                                     criterion='entropy',
                                                     max_depth=5,
                                                     max_features=0.6311732042950683,
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=9.124812416452801e-08,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=5,
                                                     min_samples_split=6,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimato...
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=6, solver='lbfgs',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('ada',
                              AdaBoostClassifier(algorithm='SAMME',
                                                 base_estimator=None,
                                                 learning_rate=0.051736972654183026,
                                                 n_estimators=166,
                                                 random_state=6))],
                 flatten_transform=True, n_jobs=-1, verbose=False,
                 voting='soft', weights=None)


# Save model
# The recorded output shows the transformation pipeline is saved together with
# the model, and the call returns a (pipeline, 'best_model.pkl') tuple.

save_model(best_model, model_name='best_model')
# Uncomment to reload the saved pipeline + model in a later session:
# best_model = load_model('best_model')

Transformation Pipeline and Model Succesfully Saved

(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=['Number of times '
                                                           'pregnant',
                                                           'Plasma glucose '
                                                           'concentration a 2 '
                                                           'hours in an oral '
                                                           'glucose tolerance '
                                                           'test',
                                                           'Diastolic blood '
                                                           'pressure (mm Hg)',
                                                           'Triceps skin fold '
                                                           'thickness (mm)',
                                                           '2...
                                                                   max_iter=1000,
                                                                   multi_class='auto',
                                                                   n_jobs=None,
                                                                   penalty='l2',
                                                                   random_state=6,
                                                                   solver='lbfgs',
                                                                   tol=0.0001,
                                                                   verbose=0,
                                                                   warm_start=False)),
                                               ('ada',
                                                AdaBoostClassifier(algorithm='SAMME',
                                                                   base_estimator=None,
                                                                   learning_rate=0.051736972654183026,
                                                                   n_estimators=166,
                                                                   random_state=6))],
                                   flatten_transform=True, n_jobs=-1,
                                   verbose=False, voting='soft',
                                   weights=None)]],
          verbose=False),
 'best_model.pkl')

	Number of times pregnant	Plasma glucose concentration a 2 hours in an oral glucose tolerance test	Diastolic blood pressure (mm Hg)	Triceps skin fold thickness (mm)	2-Hour serum insulin (mu U/ml)	Body mass index (weight in kg/(height in m)^2)	Diabetes pedigree function	Age (years)	Class variable
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

	Description	Value
0	session_id	6
1	Target	Class variable
2	Target Type	Binary
3	Label Encoded	0: 0, 1: 1
4	Original Data	(768, 9)
5	Missing Values	False
6	Numeric Features	8
7	Categorical Features	0
8	Ordinal Features	False
9	High Cardinality Features	False
10	High Cardinality Method	None
11	Transformed Train Set	(537, 8)
12	Transformed Test Set	(231, 8)
13	Shuffle Train-Test	True
14	Stratify Train-Test	False
15	Fold Generator	StratifiedKFold
16	Fold Number	10
17	CPU Jobs	-1
18	Use GPU	False
19	Log Experiment	True
20	Experiment Name	diabetes1
21	USI	bbd4
22	Imputation Type	simple
23	Iterative Imputation Iteration	None
24	Numeric Imputer	median
25	Iterative Imputation Numeric Model	None
26	Categorical Imputer	constant
27	Iterative Imputation Categorical Model	None
28	Unknown Categoricals Handling	least_frequent
29	Normalize	True
30	Normalize Method	minmax
31	Transformation	False
32	Transformation Method	None
33	PCA	False
34	PCA Method	None
35	PCA Components	None
36	Ignore Low Variance	False
37	Combine Rare Levels	False
38	Rare Level Threshold	None
39	Numeric Binning	False
40	Remove Outliers	False
41	Outliers Threshold	None
42	Remove Multicollinearity	False
43	Multicollinearity Threshold	None
44	Clustering	False
45	Clustering Iteration	None
46	Polynomial Features	False
47	Polynomial Degree	None
48	Trignometry Features	False
49	Polynomial Threshold	None
50	Group Features	False
51	Feature Selection	False
52	Feature Selection Method	classic
53	Features Selection Threshold	None
54	Feature Interaction	False
55	Feature Ratio	False
56	Interaction Threshold	None
57	Fix Imbalance	True
58	Fix Imbalance Method	RandomOverSampler

	Model	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC	Balanced Accuracy	TT (Sec)
rf	Random Forest Classifier	0.7617	0.8137	0.6895	0.6663	0.6738	0.4872	0.4909	0.7455	0.0930
lr	Logistic Regression	0.7374	0.8297	0.7211	0.6135	0.6606	0.4497	0.4558	0.7338	0.1260
ada	Ada Boost Classifier	0.7448	0.8083	0.6632	0.6369	0.6472	0.4480	0.4504	0.7264	0.0260
gbc	Gradient Boosting Classifier	0.7355	0.8250	0.6947	0.6213	0.6517	0.4412	0.4470	0.7264	0.0350
lightgbm	Light Gradient Boosting Machine	0.7374	0.7952	0.6526	0.6306	0.6379	0.4329	0.4361	0.7183	0.0900
et	Extra Trees Classifier	0.7542	0.8148	0.5684	0.7029	0.6176	0.4409	0.4538	0.7122	0.0870

	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC	Balanced Accuracy
0	0.7407	0.7759	0.7895	0.6000	0.6818	0.4698	0.4825	0.7519
1	0.7407	0.8850	0.9474	0.5806	0.7200	0.5033	0.5562	0.7880
2	0.6481	0.7654	0.8421	0.5000	0.6275	0.3329	0.3741	0.6925
3	0.8333	0.9030	0.8421	0.7273	0.7805	0.6473	0.6518	0.8353
4	0.8333	0.9128	0.8421	0.7273	0.7805	0.6473	0.6518	0.8353
5	0.7407	0.8323	0.8421	0.5926	0.6957	0.4815	0.5041	0.7639
6	0.6667	0.8226	0.8421	0.5161	0.6400	0.3614	0.3994	0.7068
7	0.6792	0.7384	0.6316	0.5455	0.5854	0.3261	0.3284	0.6687
8	0.7925	0.8630	0.7368	0.7000	0.7179	0.5539	0.5544	0.7802
9	0.7925	0.7423	0.7368	0.7000	0.7179	0.5539	0.5544	0.7802
Mean	0.7468	0.8241	0.8053	0.6189	0.6947	0.4877	0.5057	0.7603
SD	0.0632	0.0626	0.0817	0.0832	0.0601	0.1124	0.1052	0.0536

	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC	Balanced Accuracy
0	0.7222	0.8030	0.6842	0.5909	0.6341	0.4122	0.4151	0.7135
1	0.7222	0.8647	0.8421	0.5714	0.6809	0.4505	0.4772	0.7496
2	0.6852	0.8015	0.7895	0.5357	0.6383	0.3772	0.3995	0.7090
3	0.8704	0.9173	0.8421	0.8000	0.8205	0.7192	0.7197	0.8639
4	0.8704	0.9188	0.8947	0.7727	0.8293	0.7257	0.7308	0.8759
5	0.7593	0.8045	0.8421	0.6154	0.7111	0.5132	0.5318	0.7782
6	0.6667	0.8481	0.6842	0.5200	0.5909	0.3184	0.3269	0.6707
7	0.6981	0.7833	0.6842	0.5652	0.6190	0.3728	0.3775	0.6950
8	0.8113	0.8591	0.7368	0.7368	0.7368	0.5898	0.5898	0.7949
9	0.7736	0.7663	0.7368	0.6667	0.7000	0.5189	0.5206	0.7655
Mean	0.7579	0.8367	0.7737	0.6375	0.6961	0.4998	0.5089	0.7616
SD	0.0694	0.0509	0.0746	0.0958	0.0772	0.1349	0.1315	0.0655

(1) Import libraries

(2) Get data

(3) Setup pipeline

(4) Compare models

(5) Tune models

(6) Combine multiple models

(7) Analyze model performance

(8) Interpret models

(9) Select best model automatically

(10) Save best model

	Model	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC	Balanced Accuracy	TT (Sec)
0	Voting Classifier	0.7579	0.8367	0.7737	0.6375	0.6961	0.4998	0.5089	0.7616	0.1910
3	Ada Boost Classifier	0.7468	0.8241	0.8053	0.6189	0.6947	0.4877	0.5057	0.7603	0.0960
1	Random Forest Classifier	0.7542	0.8285	0.7737	0.6388	0.6944	0.4949	0.5062	0.7587	0.1450
2	Logistic Regression	0.7449	0.8329	0.7211	0.6255	0.6671	0.4632	0.4688	0.7397	0.0070