from pycaret.datasets import get_data
from pycaret.classification import *
from sklearn.metrics import balanced_accuracy_score
from imblearn.over_sampling import RandomOverSampler
# Record the installed version of the key libraries used in this run
from importlib.metadata import version
print(f"pycaret=={version('pycaret')}")
pycaret==2.3.1
# To list every pre-packaged dataset instead, run:
# get_data('index')
# Load one of the pre-packaged datasets to test the workflow on
data = get_data(dataset='diabetes')
Number of times pregnant | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Diastolic blood pressure (mm Hg) | Triceps skin fold thickness (mm) | 2-Hour serum insulin (mu U/ml) | Body mass index (weight in kg/(height in m)^2) | Diabetes pedigree function | Age (years) | Class variable | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
This step also performs data pre-processing, such as normalisation and missing-value imputation.
The logged data and profile are stored under the `mlruns` folder.
# Over-sampler handed to setup() as the class-imbalance fix
ros = RandomOverSampler(random_state=6, sampling_strategy='not majority')

# Columns declared explicitly as numeric.
# NOTE(review): 'Diabetes pedigree function' and 'Age (years)' are not listed
# here, so their type is left to PyCaret's inference - confirm this is intended.
numeric_cols = [
    'Number of times pregnant',
    'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
    'Diastolic blood pressure (mm Hg)',
    'Triceps skin fold thickness (mm)',
    '2-Hour serum insulin (mu U/ml)',
    'Body mass index (weight in kg/(height in m)^2)',
]
# No categorical columns are declared and no columns are ignored
categorical_cols = []
ignore_cols = []

# Core PyCaret experiment configuration
clf_setup = setup(
    # Dataset and target column
    data=data,
    target='Class variable',
    # Column typing
    numeric_features=numeric_cols,
    categorical_features=categorical_cols,
    ignore_features=ignore_cols,
    # Pre-processing: median imputation and min-max scaling
    preprocess=True,
    imputation_type='simple',
    numeric_imputation='median',
    normalize=True,
    normalize_method='minmax',
    # Class imbalance handled with the over-sampler defined above
    fix_imbalance=True,
    fix_imbalance_method=ros,
    # Cross-validation scheme
    fold_strategy='stratifiedkfold',
    fold=10,
    # Runtime behaviour and seed
    silent=True,
    n_jobs=-1,
    session_id=6,
    # Experiment logging
    log_experiment=True,
    experiment_name='diabetes1',
    log_data=True,
    log_profile=True,
)
# Register balanced accuracy as an extra metric for model evaluation
add_metric(
    id='balacc',
    name='Balanced Accuracy',
    score_func=balanced_accuracy_score,
    target='pred',
    greater_is_better=True,
)
# Retrieve the processed features and target, then save them together as one CSV
X_data = get_config('X')
# Series.rename() returns a new Series, so the target is relabelled without
# mutating the object handed back by get_config (in case that object is
# PyCaret's own internal target rather than a copy).
y_data = get_config('y').rename('Target variable')
# Align features and target on their shared row index
comb_data = X_data.merge(y_data, left_index=True, right_index=True)
# The row index is dropped from the saved file
comb_data.to_csv('./processed_data.csv', index=False)
# data_processing_pipeline = get_config('prep_pipe')
Description | Value | |
---|---|---|
0 | session_id | 6 |
1 | Target | Class variable |
2 | Target Type | Binary |
3 | Label Encoded | 0: 0, 1: 1 |
4 | Original Data | (768, 9) |
5 | Missing Values | False |
6 | Numeric Features | 8 |
7 | Categorical Features | 0 |
8 | Ordinal Features | False |
9 | High Cardinality Features | False |
10 | High Cardinality Method | None |
11 | Transformed Train Set | (537, 8) |
12 | Transformed Test Set | (231, 8) |
13 | Shuffle Train-Test | True |
14 | Stratify Train-Test | False |
15 | Fold Generator | StratifiedKFold |
16 | Fold Number | 10 |
17 | CPU Jobs | -1 |
18 | Use GPU | False |
19 | Log Experiment | True |
20 | Experiment Name | diabetes1 |
21 | USI | bbd4 |
22 | Imputation Type | simple |
23 | Iterative Imputation Iteration | None |
24 | Numeric Imputer | median |
25 | Iterative Imputation Numeric Model | None |
26 | Categorical Imputer | constant |
27 | Iterative Imputation Categorical Model | None |
28 | Unknown Categoricals Handling | least_frequent |
29 | Normalize | True |
30 | Normalize Method | minmax |
31 | Transformation | False |
32 | Transformation Method | None |
33 | PCA | False |
34 | PCA Method | None |
35 | PCA Components | None |
36 | Ignore Low Variance | False |
37 | Combine Rare Levels | False |
38 | Rare Level Threshold | None |
39 | Numeric Binning | False |
40 | Remove Outliers | False |
41 | Outliers Threshold | None |
42 | Remove Multicollinearity | False |
43 | Multicollinearity Threshold | None |
44 | Clustering | False |
45 | Clustering Iteration | None |
46 | Polynomial Features | False |
47 | Polynomial Degree | None |
48 | Trignometry Features | False |
49 | Polynomial Threshold | None |
50 | Group Features | False |
51 | Feature Selection | False |
52 | Feature Selection Method | classic |
53 | Features Selection Threshold | None |
54 | Feature Interaction | False |
55 | Feature Ratio | False |
56 | Interaction Threshold | None |
57 | Fix Imbalance | True |
58 | Fix Imbalance Method | RandomOverSampler |
# Choose the candidate models to compare
# (to see every available model, run: models())
# Candidates: logistic regression plus all ensemble models
chosen_model_types_list = ['lr', *models(type='ensemble').index.tolist()]
# Keep the three best candidates, ranked by balanced accuracy
top3_models = compare_models(
    include=chosen_model_types_list,
    sort='Balanced Accuracy',
    n_select=3,
    errors='raise',
)
print(top3_models)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | Balanced Accuracy | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|---|
rf | Random Forest Classifier | 0.7617 | 0.8137 | 0.6895 | 0.6663 | 0.6738 | 0.4872 | 0.4909 | 0.7455 | 0.0930 |
lr | Logistic Regression | 0.7374 | 0.8297 | 0.7211 | 0.6135 | 0.6606 | 0.4497 | 0.4558 | 0.7338 | 0.1260 |
ada | Ada Boost Classifier | 0.7448 | 0.8083 | 0.6632 | 0.6369 | 0.6472 | 0.4480 | 0.4504 | 0.7264 | 0.0260 |
gbc | Gradient Boosting Classifier | 0.7355 | 0.8250 | 0.6947 | 0.6213 | 0.6517 | 0.4412 | 0.4470 | 0.7264 | 0.0350 |
lightgbm | Light Gradient Boosting Machine | 0.7374 | 0.7952 | 0.6526 | 0.6306 | 0.6379 | 0.4329 | 0.4361 | 0.7183 | 0.0900 |
et | Extra Trees Classifier | 0.7542 | 0.8148 | 0.5684 | 0.7029 | 0.6176 | 0.4409 | 0.4538 | 0.7122 | 0.0870 |
[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=6, verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=6, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False), AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0, n_estimators=50, random_state=6)]
# Tune each of the top models, optimising for balanced accuracy
tuned_top3_models = [
    tune_model(
        candidate,
        search_library='tune-sklearn',
        search_algorithm='hyperopt',
        optimize='Balanced Accuracy',
        choose_better=True,
    )
    for candidate in top3_models
]
print(tuned_top3_models)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | Balanced Accuracy | |
---|---|---|---|---|---|---|---|---|
0 | 0.7407 | 0.7759 | 0.7895 | 0.6000 | 0.6818 | 0.4698 | 0.4825 | 0.7519 |
1 | 0.7407 | 0.8850 | 0.9474 | 0.5806 | 0.7200 | 0.5033 | 0.5562 | 0.7880 |
2 | 0.6481 | 0.7654 | 0.8421 | 0.5000 | 0.6275 | 0.3329 | 0.3741 | 0.6925 |
3 | 0.8333 | 0.9030 | 0.8421 | 0.7273 | 0.7805 | 0.6473 | 0.6518 | 0.8353 |
4 | 0.8333 | 0.9128 | 0.8421 | 0.7273 | 0.7805 | 0.6473 | 0.6518 | 0.8353 |
5 | 0.7407 | 0.8323 | 0.8421 | 0.5926 | 0.6957 | 0.4815 | 0.5041 | 0.7639 |
6 | 0.6667 | 0.8226 | 0.8421 | 0.5161 | 0.6400 | 0.3614 | 0.3994 | 0.7068 |
7 | 0.6792 | 0.7384 | 0.6316 | 0.5455 | 0.5854 | 0.3261 | 0.3284 | 0.6687 |
8 | 0.7925 | 0.8630 | 0.7368 | 0.7000 | 0.7179 | 0.5539 | 0.5544 | 0.7802 |
9 | 0.7925 | 0.7423 | 0.7368 | 0.7000 | 0.7179 | 0.5539 | 0.5544 | 0.7802 |
Mean | 0.7468 | 0.8241 | 0.8053 | 0.6189 | 0.6947 | 0.4877 | 0.5057 | 0.7603 |
SD | 0.0632 | 0.0626 | 0.0817 | 0.0832 | 0.0601 | 0.1124 | 0.1052 | 0.0536 |
[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={}, criterion='entropy', max_depth=5, max_features=0.6311732042950683, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=9.124812416452801e-08, min_impurity_split=None, min_samples_leaf=5, min_samples_split=6, min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=-1, oob_score=False, random_state=6, verbose=0, warm_start=False), LogisticRegression(C=6.68643136832486, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=6, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False), AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=0.051736972654183026, n_estimators=166, random_state=6)]
# Combine the tuned top models into one soft-voting classifier
blended_model = blend_models(
    method='soft',
    estimator_list=tuned_top3_models,
)
print(blended_model)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | Balanced Accuracy | |
---|---|---|---|---|---|---|---|---|
0 | 0.7222 | 0.8030 | 0.6842 | 0.5909 | 0.6341 | 0.4122 | 0.4151 | 0.7135 |
1 | 0.7222 | 0.8647 | 0.8421 | 0.5714 | 0.6809 | 0.4505 | 0.4772 | 0.7496 |
2 | 0.6852 | 0.8015 | 0.7895 | 0.5357 | 0.6383 | 0.3772 | 0.3995 | 0.7090 |
3 | 0.8704 | 0.9173 | 0.8421 | 0.8000 | 0.8205 | 0.7192 | 0.7197 | 0.8639 |
4 | 0.8704 | 0.9188 | 0.8947 | 0.7727 | 0.8293 | 0.7257 | 0.7308 | 0.8759 |
5 | 0.7593 | 0.8045 | 0.8421 | 0.6154 | 0.7111 | 0.5132 | 0.5318 | 0.7782 |
6 | 0.6667 | 0.8481 | 0.6842 | 0.5200 | 0.5909 | 0.3184 | 0.3269 | 0.6707 |
7 | 0.6981 | 0.7833 | 0.6842 | 0.5652 | 0.6190 | 0.3728 | 0.3775 | 0.6950 |
8 | 0.8113 | 0.8591 | 0.7368 | 0.7368 | 0.7368 | 0.5898 | 0.5898 | 0.7949 |
9 | 0.7736 | 0.7663 | 0.7368 | 0.6667 | 0.7000 | 0.5189 | 0.5206 | 0.7655 |
Mean | 0.7579 | 0.8367 | 0.7737 | 0.6375 | 0.6961 | 0.4998 | 0.5089 | 0.7616 |
SD | 0.0694 | 0.0509 | 0.0746 | 0.0958 | 0.0772 | 0.1349 | 0.1315 | 0.0655 |
VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={}, criterion='entropy', max_depth=5, max_features=0.6311732042950683, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=9.124812416452801e-08, min_impurity_split=None, min_samples_leaf=5, min_samples_split=6, min_weight_fraction_leaf=0.0, n_estimato... l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=6, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)), ('ada', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=0.051736972654183026, n_estimators=166, random_state=6))], flatten_transform=True, n_jobs=-1, verbose=False, voting='soft', weights=None)
# Interactive standard diagnostic plots for the blended model
evaluate_model(estimator=blended_model)
# Individual plots can also be drawn one at a time:
# plot_model(blended_model, plot='parameter')
# plot_model(blended_model, plot='manifold')
# plot_model(blended_model, plot='learning')
# plot_model(blended_model, plot='confusion_matrix')
# plot_model(blended_model, plot='class_report')
# # plot_model(blended_model, plot='feature')
# Interpret model - only for tree-based models
# NOTE(review): assumes the first tuned model is tree-based - confirm on re-runs
tree_model = tuned_top3_models[0]
interpret_model(tree_model, plot='summary')
interpret_model(
    tree_model,
    plot='correlation',
    feature='Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
)
interpret_model(tree_model, plot='reason', observation=100)
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans. findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
# Pick the best model, by balanced accuracy, among all models trained in the current session
best_model = automl(optimize='Balanced Accuracy')
print(best_model)
VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={}, criterion='entropy', max_depth=5, max_features=0.6311732042950683, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=9.124812416452801e-08, min_impurity_split=None, min_samples_leaf=5, min_samples_split=6, min_weight_fraction_leaf=0.0, n_estimato... l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=6, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)), ('ada', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=0.051736972654183026, n_estimators=166, random_state=6))], flatten_transform=True, n_jobs=-1, verbose=False, voting='soft', weights=None)
# Final check: rank the best model against the tuned top models
compare_models(
    include=[best_model, *tuned_top3_models],
    errors='raise',
    sort='Balanced Accuracy',
)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | Balanced Accuracy | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Voting Classifier | 0.7579 | 0.8367 | 0.7737 | 0.6375 | 0.6961 | 0.4998 | 0.5089 | 0.7616 | 0.1910 |
3 | Ada Boost Classifier | 0.7468 | 0.8241 | 0.8053 | 0.6189 | 0.6947 | 0.4877 | 0.5057 | 0.7603 | 0.0960 |
1 | Random Forest Classifier | 0.7542 | 0.8285 | 0.7737 | 0.6388 | 0.6944 | 0.4949 | 0.5062 | 0.7587 | 0.1450 |
2 | Logistic Regression | 0.7449 | 0.8329 | 0.7211 | 0.6255 | 0.6671 | 0.4632 | 0.4688 | 0.7397 | 0.0070 |
VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={}, criterion='entropy', max_depth=5, max_features=0.6311732042950683, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=9.124812416452801e-08, min_impurity_split=None, min_samples_leaf=5, min_samples_split=6, min_weight_fraction_leaf=0.0, n_estimato... l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=6, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)), ('ada', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=0.051736972654183026, n_estimators=166, random_state=6))], flatten_transform=True, n_jobs=-1, verbose=False, voting='soft', weights=None)
# Save the best model to disk under the name 'best_model'
save_model(model=best_model, model_name='best_model')
# To load it back later:
# best_model = load_model('best_model')
Transformation Pipeline and Model Succesfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=False, features_todrop=[], id_columns=[], ml_usecase='classification', numerical_features=['Number of times ' 'pregnant', 'Plasma glucose ' 'concentration a 2 ' 'hours in an oral ' 'glucose tolerance ' 'test', 'Diastolic blood ' 'pressure (mm Hg)', 'Triceps skin fold ' 'thickness (mm)', '2... max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=6, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)), ('ada', AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=0.051736972654183026, n_estimators=166, random_state=6))], flatten_transform=True, n_jobs=-1, verbose=False, voting='soft', weights=None)]], verbose=False), 'best_model.pkl')