from pycaret.datasets import get_data
from pycaret.regression import *
# Record version of key libraries
from importlib.metadata import version
print('pycaret==%s' % version('pycaret'))
pycaret==2.3.1
# Get a list of all pre-packaged data
# get_data('index')
# Select a pre-packaged data for testing
data = get_data('diamond')
| | Carat Weight | Cut | Color | Clarity | Polish | Symmetry | Report | Price |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.10 | Ideal | H | SI1 | VG | EX | GIA | 5169 |
| 1 | 0.83 | Ideal | H | VS1 | ID | ID | AGSL | 3470 |
| 2 | 0.85 | Ideal | H | SI1 | EX | EX | GIA | 3183 |
| 3 | 0.91 | Ideal | E | SI1 | VG | VG | GIA | 4370 |
| 4 | 0.83 | Ideal | G | SI1 | EX | EX | GIA | 3171 |
The setup step below also applies data pre-processing such as normalisation and missing-value imputation. Because experiment logging is enabled, the logged data and profile are stored under the mlruns folder.
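As a hedged sketch (assuming mlflow is installed, which PyCaret's experiment logging relies on), the recorded runs can also be inspected programmatically once setup has completed:

# Browse the MLflow runs written to the local ./mlruns folder (run after setup)
import mlflow
experiment = mlflow.get_experiment_by_name('diamond1')
if experiment is not None:
    runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
    print(runs_df[['run_id', 'status', 'start_time']])
# Or launch the tracking UI from a terminal: mlflow ui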
# Get relevant column names
numeric_cols = [
'Carat Weight'
]
categorical_cols = [
'Cut',
'Color',
'Clarity',
'Polish',
'Symmetry',
'Report'
]
ignore_cols = []
# Set up the core PyCaret regression configuration
reg_setup = setup(
data=data,
target='Price',
silent=True,
numeric_features=numeric_cols,
categorical_features=categorical_cols,
ignore_features=ignore_cols,
imputation_type='simple',
numeric_imputation='median',
normalize=True,
normalize_method='minmax',
n_jobs=-1,
preprocess=True,
session_id=6,
log_data=True,
log_profile=True,
log_experiment=True,
experiment_name='diamond1')
# Retrieve and save processed data
X_data = get_config('X')
y_data = get_config('y')
y_data.name = 'Target variable'
comb_data = X_data.merge(y_data, left_index=True, right_index=True)
comb_data.to_csv('./processed_data.csv', index=False)
# data_processing_pipeline = get_config('prep_pipe')
| | Description | Value |
|---|---|---|
| 0 | session_id | 6 |
| 1 | Target | Price |
| 2 | Original Data | (6000, 8) |
| 3 | Missing Values | False |
| 4 | Numeric Features | 1 |
| 5 | Categorical Features | 6 |
| 6 | Ordinal Features | False |
| 7 | High Cardinality Features | False |
| 8 | High Cardinality Method | None |
| 9 | Transformed Train Set | (4199, 28) |
| 10 | Transformed Test Set | (1801, 28) |
| 11 | Shuffle Train-Test | True |
| 12 | Stratify Train-Test | False |
| 13 | Fold Generator | KFold |
| 14 | Fold Number | 10 |
| 15 | CPU Jobs | -1 |
| 16 | Use GPU | False |
| 17 | Log Experiment | True |
| 18 | Experiment Name | diamond1 |
| 19 | USI | 5916 |
| 20 | Imputation Type | simple |
| 21 | Iterative Imputation Iteration | None |
| 22 | Numeric Imputer | median |
| 23 | Iterative Imputation Numeric Model | None |
| 24 | Categorical Imputer | constant |
| 25 | Iterative Imputation Categorical Model | None |
| 26 | Unknown Categoricals Handling | least_frequent |
| 27 | Normalize | True |
| 28 | Normalize Method | minmax |
| 29 | Transformation | False |
| 30 | Transformation Method | None |
| 31 | PCA | False |
| 32 | PCA Method | None |
| 33 | PCA Components | None |
| 34 | Ignore Low Variance | False |
| 35 | Combine Rare Levels | False |
| 36 | Rare Level Threshold | None |
| 37 | Numeric Binning | False |
| 38 | Remove Outliers | False |
| 39 | Outliers Threshold | None |
| 40 | Remove Multicollinearity | False |
| 41 | Multicollinearity Threshold | None |
| 42 | Clustering | False |
| 43 | Clustering Iteration | None |
| 44 | Polynomial Features | False |
| 45 | Polynomial Degree | None |
| 46 | Trignometry Features | False |
| 47 | Polynomial Threshold | None |
| 48 | Group Features | False |
| 49 | Feature Selection | False |
| 50 | Feature Selection Method | classic |
| 51 | Features Selection Threshold | None |
| 52 | Feature Interaction | False |
| 53 | Feature Ratio | False |
| 54 | Interaction Threshold | None |
| 55 | Transform Target | False |
| 56 | Transform Target Method | box-cox |
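The commented-out get_config('prep_pipe') line above hints at another use of the configuration store: the fitted pre-processing pipeline and the train/hold-out splits can be pulled out directly. A minimal sketch, assuming the fitted pipeline's transform accepts raw feature rows without the target column (as predict_model does internally):

# Retrieve the fitted pre-processing pipeline and the train/hold-out splits
prep_pipe = get_config('prep_pipe')      # sklearn Pipeline fitted during setup
X_train, X_test = get_config('X_train'), get_config('X_test')
y_train, y_test = get_config('y_train'), get_config('y_test')
print(X_train.shape, X_test.shape)
# Re-apply the learned pre-processing to new raw rows (illustrative stand-in data)
new_raw_rows = data.drop(columns=['Price']).head()
print(prep_pipe.transform(new_raw_rows).shape)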
# Select a list of model types of interest to be tested
# To see all available models: models()
# Pick linear regression & ensemble models
chosen_model_types_list = ['lr'] + models(type='ensemble').index.tolist()
top3_models = compare_models(
include = chosen_model_types_list,
n_select = 3,
sort = 'MAE',
errors='raise'
)
print(top3_models)
| | Model | MAE | MSE | RMSE | R2 | RMSLE | MAPE | TT (Sec) |
|---|---|---|---|---|---|---|---|---|
| lightgbm | Light Gradient Boosting Machine | 729.4033 | 3145188.2592 | 1702.5267 | 0.9714 | 0.0738 | 0.0541 | 0.0180 |
| et | Extra Trees Regressor | 744.4396 | 2382834.1109 | 1515.1797 | 0.9775 | 0.0773 | 0.0581 | 0.2880 |
| rf | Random Forest Regressor | 754.4898 | 2918969.8604 | 1642.0265 | 0.9730 | 0.0774 | 0.0573 | 0.2760 |
| gbr | Gradient Boosting Regressor | 894.9630 | 3211173.9904 | 1765.7414 | 0.9697 | 0.0996 | 0.0757 | 0.0770 |
| lr | Linear Regression | 2479.0399 | 14794353.7000 | 3808.8955 | 0.8608 | 0.6401 | 0.2918 | 0.0070 |
| ada | AdaBoost Regressor | 4181.5019 | 24880437.9189 | 4973.6338 | 0.7588 | 0.4834 | 0.5581 | 0.0630 |
[LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=6, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=6, verbose=0, warm_start=False), RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=6, verbose=0, warm_start=False)]
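compare_models returns the top fitted estimators but not the scoring grid itself; PyCaret's pull() helper retrieves the last displayed grid as a pandas DataFrame, which is handy for persisting the comparison above. A minimal sketch:

# Capture the comparison grid shown above as a DataFrame and save it
comparison_grid = pull()
comparison_grid.to_csv('./model_comparison.csv')
print(comparison_grid)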
# Tune top models
tuned_top3_models = [
tune_model(
i,
optimize='MAE',
search_library='tune-sklearn',
search_algorithm='hyperopt',
choose_better=True
)
for i in top3_models]
print(tuned_top3_models)
| Fold | MAE | MSE | RMSE | R2 | RMSLE | MAPE |
|---|---|---|---|---|---|---|
| 0 | 817.1251 | 2227793.2072 | 1492.5794 | 0.9805 | 0.0953 | 0.0717 |
| 1 | 934.6753 | 2217710.8545 | 1489.1981 | 0.9716 | 0.1095 | 0.0830 |
| 2 | 926.5242 | 3250053.5419 | 1802.7905 | 0.9685 | 0.1000 | 0.0745 |
| 3 | 950.1546 | 2732921.7784 | 1653.1551 | 0.9738 | 0.1029 | 0.0792 |
| 4 | 1039.4660 | 3657692.4620 | 1912.5095 | 0.9563 | 0.1129 | 0.0882 |
| 5 | 883.9758 | 2687757.5822 | 1639.4382 | 0.9720 | 0.1022 | 0.0797 |
| 6 | 1108.4840 | 6972538.1040 | 2640.5564 | 0.9425 | 0.1156 | 0.0880 |
| 7 | 882.5780 | 3210533.9555 | 1791.7963 | 0.9679 | 0.1039 | 0.0784 |
| 8 | 1045.5730 | 6108631.8443 | 2471.5647 | 0.9503 | 0.1085 | 0.0798 |
| 9 | 1045.2795 | 7017244.5308 | 2649.0082 | 0.9445 | 0.1083 | 0.0829 |
| Mean | 963.3836 | 4008287.7861 | 1954.2596 | 0.9628 | 0.1059 | 0.0805 |
| SD | 87.6865 | 1825732.1379 | 434.9220 | 0.0126 | 0.0059 | 0.0050 |
[LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=6, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=6, verbose=0, warm_start=False), RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=6, verbose=0, warm_start=False)]
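tune_model samples hyper-parameters from PyCaret's built-in distributions by default; passing custom_grid together with n_iter narrows the search to hand-picked values. A hedged sketch for the LightGBM model, with illustrative (not original) grid values:

# Illustrative only: re-tune LightGBM over a hand-picked grid
lgbm_custom_grid = {
    'n_estimators': [100, 200, 400],
    'num_leaves': [15, 31, 63],
    'learning_rate': [0.05, 0.1, 0.2]
}
tuned_lgbm_custom = tune_model(
    top3_models[0],
    optimize='MAE',
    custom_grid=lgbm_custom_grid,
    n_iter=20,
    choose_better=True
)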
# Blend top tuned models into a single voting regressor model
blended_model = blend_models(
estimator_list = tuned_top3_models
)
print(blended_model)
| Fold | MAE | MSE | RMSE | R2 | RMSLE | MAPE |
|---|---|---|---|---|---|---|
| 0 | 606.9172 | 1156593.0260 | 1075.4502 | 0.9899 | 0.0641 | 0.0488 |
| 1 | 629.4794 | 1157347.3011 | 1075.8008 | 0.9852 | 0.0680 | 0.0513 |
| 2 | 714.4820 | 2095819.3271 | 1447.6945 | 0.9797 | 0.0729 | 0.0531 |
| 3 | 649.8647 | 1465235.7956 | 1210.4692 | 0.9860 | 0.0691 | 0.0514 |
| 4 | 729.9967 | 2478535.1756 | 1574.3364 | 0.9704 | 0.0759 | 0.0552 |
| 5 | 570.5030 | 1348017.4795 | 1161.0415 | 0.9860 | 0.0621 | 0.0466 |
| 6 | 798.1941 | 5277761.8211 | 2297.3380 | 0.9565 | 0.0757 | 0.0547 |
| 7 | 591.3541 | 1286299.6225 | 1134.1515 | 0.9871 | 0.0666 | 0.0498 |
| 8 | 824.0156 | 4534494.1043 | 2129.4352 | 0.9631 | 0.0737 | 0.0550 |
| 9 | 726.5214 | 3711664.3862 | 1926.5680 | 0.9706 | 0.0692 | 0.0530 |
| Mean | 684.1328 | 2451176.8039 | 1503.2285 | 0.9774 | 0.0697 | 0.0519 |
| SD | 83.0913 | 1447120.2835 | 437.5852 | 0.0109 | 0.0045 | 0.0027 |
VotingRegressor(estimators=[('lightgbm', LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=6, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsam... RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=6, verbose=0, warm_start=False))], n_jobs=-1, verbose=False, weights=None)
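The fold scores above are cross-validation results on the 4199-row training split. As a minimal sketch, predict_model scores the blended model on the 1801-row hold-out set, and finalize_model refits it on the full dataset before it is saved or deployed:

# Score the blended model on the hold-out set and keep the metrics
holdout_predictions = predict_model(blended_model)
holdout_metrics = pull()
print(holdout_metrics)
# Refit on the complete dataset (train + hold-out) ahead of deployment
final_blended_model = finalize_model(blended_model)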
# Make interactive standard diagnostic plots
evaluate_model(blended_model)
# plot_model(blended_model, plot='residuals_interactive')
# plot_model(blended_model, plot='error')
# plot_model(blended_model, plot='learning')
# plot_model(blended_model, plot='manifold')
# plot_model(blended_model, plot='feature')
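evaluate_model is interactive and notebook-only; as a minimal sketch, plot_model with save=True writes static copies of individual diagnostic plots to the working directory instead:

# Persist selected diagnostic plots as image files in the working directory
for plot_type in ['residuals', 'error']:
    plot_model(blended_model, plot=plot_type, save=True)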
# Interpret model with SHAP values - only applicable to tree-based models
interpret_model(tuned_top3_models[0], plot='summary')
interpret_model(tuned_top3_models[0], plot='correlation', feature='Cut_Ideal')
interpret_model(tuned_top3_models[0], plot='reason', observation=100)
# Select best model out of all models trained in current session
best_model = automl(optimize = 'MAE')
print(best_model)
VotingRegressor(estimators=[('lightgbm', LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=6, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsam... RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=6, verbose=0, warm_start=False))], n_jobs=-1, verbose=False, weights=None)
# Final check on performance of best model
compare_models(
include = [best_model] + tuned_top3_models,
sort='MAE',
errors='raise'
)
| | Model | MAE | MSE | RMSE | R2 | RMSLE | MAPE | TT (Sec) |
|---|---|---|---|---|---|---|---|---|
| 0 | Voting Regressor | 684.1328 | 2451176.8039 | 1503.2285 | 0.9774 | 0.0697 | 0.0519 | 0.5180 |
| 1 | Light Gradient Boosting Machine | 729.4033 | 3145188.2592 | 1702.5267 | 0.9714 | 0.0738 | 0.0541 | 0.0250 |
| 2 | Extra Trees Regressor | 744.4396 | 2382834.1109 | 1515.1797 | 0.9775 | 0.0773 | 0.0581 | 0.3030 |
| 3 | Random Forest Regressor | 754.4898 | 2918969.8604 | 1642.0265 | 0.9730 | 0.0774 | 0.0573 | 0.2650 |
VotingRegressor(estimators=[('lightgbm', LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=6, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsam... RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=6, verbose=0, warm_start=False))], n_jobs=-1, verbose=False, weights=None)
# Save model
save_model(best_model, model_name='best_model')
# best_model = load_model('best_model')
Transformation Pipeline and Model Succesfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=['Cut', 'Color', 'Clarity', 'Polish', 'Symmetry', 'Report'], display_types=False, features_todrop=[], id_columns=[], ml_usecase='regression', numerical_features=['Carat Weight'], target='Price', time_features=[])), ('imputer', Simple_Imputer(categorical_strategy='not_available', fill_v... ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=6, verbose=0, warm_start=False))], n_jobs=-1, verbose=False, weights=None)]], verbose=False), 'best_model.pkl')
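To reuse the saved artefact elsewhere, load_model restores the whole pre-processing-plus-model pipeline and predict_model applies it to raw, untransformed rows. A minimal sketch, using the first few training rows as a stand-in for unseen data (in PyCaret 2.x the predictions appear in a Label column):

# Reload the saved pipeline + model and score raw, unseen rows
loaded_model = load_model('best_model')
new_rows = data.drop(columns=['Price']).head()   # stand-in for unseen data
predictions = predict_model(loaded_model, data=new_rows)
print(predictions['Label'])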