from pycaret.datasets import get_data
from pycaret.internal.pycaret_experiment import TimeSeriesExperiment
from sktime.utils.plotting import plot_series
# Record version of key libraries
from importlib.metadata import version
print('pycaret==%s' % version('pycaret'))
print('pycaret-ts-alpha==%s' % version('pycaret-ts-alpha'))
pycaret==2.3.1
pycaret-ts-alpha==3.0.0.dev1624743408
# Get a list of all pre-packaged data
# get_data('index') # NOTE - currently will not show time series sample data
# Select a pre-packaged dataset for testing
# NOTE - only univariate series are supported currently
y_raw_data = get_data('airline')
# Plot the time series
_ = plot_series(y_raw_data)  # NOTE - will be bundled into the pycaret interface
Period
1949-01    112.0
1949-02    118.0
1949-03    132.0
1949-04    129.0
1949-05    121.0
Freq: M, Name: Number of airline passengers, dtype: float64
This step also includes data pre-processing, such as normalisation and missing-value imputation. Logged data and profiles are stored under the mlruns folder.
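Since log_experiment=True in the setup below, those runs can be browsed with plain MLflow once the cell has executed. A minimal sketch, assuming the default local ./mlruns tracking store (mlflow ships as a PyCaret dependency):

import mlflow

# Look up the experiment by the name passed to setup() and list its runs
experiment = mlflow.get_experiment_by_name('airline1')
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
print(runs[['run_id', 'status']].head())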
# Setup pycaret experiment object
exp = TimeSeriesExperiment()
# Setup core pycaret config
# NOTE - only univariate series are supported currently
exp.setup(
    data=y_raw_data,
    fh=12,
    fold=3,
    fold_strategy='expanding',
    seasonal_period=12,
    imputation_type='simple',
    n_jobs=-1,
    preprocess=True,
    session_id=6,
    log_data=True,
    log_profile=True,
    log_experiment=True,
    experiment_name='airline1',
)
# Retrieve and save processed data
y_data = exp.get_config('y')
y_data.to_csv('./processed_data.csv', index=True)  # keep the Period index; dropping it would lose the timestamps
# data_processing_pipeline = exp.get_config('prep_pipe')
# Visualise train and test data split
y_train = exp.get_config("y_train")
y_test = exp.get_config("y_test")
_ = plot_series(y_train, y_test, labels=['Train', 'Test'])
| | Description | Value |
|---|---|---|
| 0 | session_id | 6 |
| 1 | Original Data | (144, 1) |
| 2 | Missing Values | False |
| 3 | Transformed Train Set | (132,) |
| 4 | Transformed Test Set | (12,) |
| 5 | Fold Generator | ExpandingWindowSplitter |
| 6 | Fold Number | 3 |
| 7 | CPU Jobs | -1 |
| 8 | Use GPU | False |
| 9 | Log Experiment | True |
| 10 | Experiment Name | airline1 |
| 11 | USI | 8a5f |
| 12 | Imputation Type | simple |
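The Fold Generator row above names sktime's ExpandingWindowSplitter, which is what fold_strategy='expanding' resolves to. A standalone sketch of the same idea, with illustrative window sizes (initial_window and step_length here are my choices, not necessarily what setup() uses internally):

import numpy as np
from sktime.forecasting.model_selection import ExpandingWindowSplitter

cv = ExpandingWindowSplitter(fh=np.arange(1, 13), initial_window=96, step_length=12)
for i, (train_idx, test_idx) in enumerate(cv.split(y_train)):
    # Each fold keeps all earlier observations and pushes the 12-step test window forward
    print(f'fold {i}: train size = {len(train_idx)}, test = {test_idx[0]}..{test_idx[-1]}')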
# Select a list of models of interest to be tested
# To see all available models:
# exp.models()
# Or pick specific model types, e.g.:
# chosen_model_types_list = ['lr'] + exp.models(type='ensemble').index.tolist()
top3_models = exp.compare_models(
    n_select=3,
    sort='MAE',
    errors='raise',
)
print(top3_models)
| | Model | MAE | RMSE | MAPE | SMAPE | R2 | TT (Sec) |
|---|---|---|---|---|---|---|---|
| ets | ETS | 19.5891 | 529.3567 | 0.0498 | 0.0498 | 0.8624 | 0.0600 |
| exp_smooth | Exponential Smoothing | 19.7275 | 532.6698 | 0.0503 | 0.0502 | 0.8608 | 0.0367 |
| arima | ARIMA | 20.0069 | 528.2344 | 0.0501 | 0.0507 | 0.8677 | 0.0700 |
| auto_arima | Auto ARIMA | 21.0297 | 602.4566 | 0.0525 | 0.0531 | 0.8509 | 1.5967 |
| et_cds_dt | Extra Trees w/ Cond. Deseasonalize & Detrending | 24.6502 | 1054.7894 | 0.0585 | 0.0603 | 0.7088 | 0.1500 |
| knn_cds_dt | K Neighbors w/ Cond. Deseasonalize & Detrending | 25.8293 | 1290.4445 | 0.0617 | 0.0641 | 0.6260 | 0.0400 |
| ada_cds_dt | AdaBoost w/ Cond. Deseasonalize & Detrending | 26.9655 | 1323.1592 | 0.0646 | 0.0668 | 0.6279 | 0.0433 |
| gbr_cds_dt | Gradient Boosting w/ Cond. Deseasonalize & Detrending | 26.9739 | 1272.0735 | 0.0641 | 0.0665 | 0.6537 | 0.0300 |
| rf_cds_dt | Random Forest w/ Cond. Deseasonalize & Detrending | 27.1100 | 1511.1234 | 0.0639 | 0.0667 | 0.5677 | 0.1767 |
| lightgbm_cds_dt | Light Gradient Boosting w/ Cond. Deseasonalize & Detrending | 28.7738 | 1339.5259 | 0.0698 | 0.0722 | 0.6255 | 0.0200 |
| dt_cds_dt | Decision Tree w/ Cond. Deseasonalize & Detrending | 30.2749 | 1575.0098 | 0.0728 | 0.0759 | 0.5625 | 0.0167 |
| br_cds_dt | Bayesian Ridge w/ Cond. Deseasonalize & Detrending | 32.0341 | 1558.5530 | 0.0799 | 0.0818 | 0.5658 | 0.0167 |
| lasso_cds_dt | Lasso w/ Cond. Deseasonalize & Detrending | 32.8026 | 1556.2088 | 0.0823 | 0.0841 | 0.5678 | 0.0167 |
| en_cds_dt | Elastic Net w/ Cond. Deseasonalize & Detrending | 32.8556 | 1559.8088 | 0.0825 | 0.0843 | 0.5669 | 0.0133 |
| ridge_cds_dt | Ridge w/ Cond. Deseasonalize & Detrending | 32.9702 | 1566.6345 | 0.0828 | 0.0846 | 0.5652 | 0.0167 |
| lr_cds_dt | Linear w/ Cond. Deseasonalize & Detrending | 32.9708 | 1566.6682 | 0.0828 | 0.0846 | 0.5652 | 0.0200 |
| snaive | Seasonal Naive Forecaster | 33.3611 | 1478.5278 | 0.0832 | 0.0879 | 0.6072 | 0.6267 |
| huber_cds_dt | Huber w/ Cond. Deseasonalize & Detrending | 35.4709 | 1715.0979 | 0.0910 | 0.0936 | 0.5226 | 0.0233 |
| lar_cds_dt | Least Angular Regressor w/ Cond. Deseasonalize & Detrending | 36.5285 | 1803.6208 | 0.0936 | 0.0945 | 0.5058 | 0.0167 |
| llar_cds_dt | Lasso Least Angular Regressor w/ Cond. Deseasonalize & Detrending | 46.7239 | 3996.1200 | 0.1109 | 0.1165 | -0.0733 | 0.0167 |
| omp_cds_dt | Orthogonal Matching Pursuit w/ Cond. Deseasonalize & Detrending | 47.2799 | 4184.3512 | 0.1110 | 0.1177 | -0.1201 | 0.0167 |
| polytrend | Polynomial Trend Forecaster | 48.6301 | 4031.3261 | 0.1170 | 0.1216 | -0.0784 | 0.0067 |
| par_cds_dt | Passive Aggressive w/ Cond. Deseasonalize & Detrending | 57.6336 | 6605.0816 | 0.1586 | 0.1625 | -0.9387 | 0.0200 |
| theta | Theta Forecaster | 62.7733 | 7332.4303 | 0.1425 | 0.1607 | -0.9043 | 0.0067 |
| naive | Naive Forecaster | 69.0278 | 8540.5833 | 0.1569 | 0.1792 | -1.2216 | 0.7833 |
[AutoETS(additive_only=False, allow_multiplicative_trend=False, auto=False, bounds=None, callback=None, damped_trend=False, dates=None, disp=False, error='add', freq=None, full_output=True, information_criterion='aic', initial_level=None, initial_seasonal=None, initial_trend=None, initialization_method='estimated', maxiter=1000, missing='none', n_jobs=None, restrict=True, return_params=False, seasonal='add', sp=12, start_params=None, trend='add'), ExponentialSmoothing(damped_trend=False, initial_level=None, initial_seasonal=None, initial_trend=None, initialization_method='estimated', seasonal='add', sp=12, trend='add', use_boxcox=None), ARIMA(maxiter=50, method='lbfgs', order=(1, 0, 0), out_of_sample_size=0, scoring='mse', scoring_args=None, seasonal_order=(0, 1, 0, 12), start_params=None, suppress_warnings=False, trend=None, with_intercept=True)]
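To keep the leaderboard above as a DataFrame rather than a display, pull() should work here too; it retrieves the last scoring grid in PyCaret's other modules, and I am assuming the time-series alpha behaves the same:

# Grab the last displayed scoring grid as a pandas DataFrame
leaderboard = exp.pull()
print(leaderboard.head())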
# Tune top models
# NOTE - only supports sklearn random and grid search for now
tuned_top3_models = [
    exp.tune_model(
        model,
        optimize='MAE',
        search_algorithm='random',
        choose_better=True,
    )
    for model in top3_models
]
print(tuned_top3_models)
| | cutoff | MAE | RMSE | MAPE | SMAPE | R2 |
|---|---|---|---|---|---|---|
| 0 | 1956-12 | 14.6866 | 326.6373 | 0.0385 | 0.0386 | 0.8937 |
| 1 | 1957-12 | 16.9869 | 402.5116 | 0.0451 | 0.0444 | 0.8946 |
| 2 | 1958-12 | 29.0535 | 1061.8231 | 0.0654 | 0.0679 | 0.7624 |
| Mean | nan | 20.2424 | 596.9907 | 0.0497 | 0.0503 | 0.8502 |
| SD | nan | 6.3008 | 330.1425 | 0.0114 | 0.0127 | 0.0621 |
[AutoETS(additive_only=False, allow_multiplicative_trend=False, auto=False, bounds=None, callback=None, damped_trend=False, dates=None, disp=False, error='add', freq=None, full_output=True, information_criterion='aic', initial_level=None, initial_seasonal=None, initial_trend=None, initialization_method='estimated', maxiter=1000, missing='none', n_jobs=None, restrict=True, return_params=False, seasonal='add', sp=12, start_params=None, trend='add'), ExponentialSmoothing(damped_trend=False, initial_level=None, initial_seasonal=None, initial_trend=None, initialization_method='estimated', seasonal='add', sp=12, trend='add', use_boxcox=None), ARIMA(maxiter=50, method='lbfgs', order=(1, 0, 0), out_of_sample_size=0, scoring='mse', scoring_args=None, seasonal_order=(0, 1, 0, 12), start_params=None, suppress_warnings=False, trend=None, with_intercept=True)]
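tune_model in PyCaret's regression module also accepts custom_grid and n_iter; assuming the alpha forwards them the same way, a more targeted random search over the ETS candidate might look like this (the grid values are illustrative):

# Hypothetical search space for the ETS model (top3_models[0] above);
# custom_grid / n_iter support is an assumption carried over from other modules
ets_grid = {'trend': ['add', 'mul'], 'seasonal': ['add', 'mul']}
tuned_ets = exp.tune_model(
    top3_models[0],
    optimize='MAE',
    search_algorithm='random',
    custom_grid=ets_grid,
    n_iter=10,
)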
# Blend the top tuned models into a single voting forecaster ensemble
blended_model = exp.blend_models(
    estimator_list=tuned_top3_models,
)
print(blended_model)
| | cutoff | MAE | RMSE | MAPE | SMAPE | R2 |
|---|---|---|---|---|---|---|
| 0 | 1956-12 | 13.0074 | 291.2886 | 0.0325 | 0.0333 | 0.9052 |
| 1 | 1957-12 | 22.2042 | 582.3384 | 0.0615 | 0.0593 | 0.8474 |
| 2 | 1958-12 | 23.7032 | 666.7215 | 0.0550 | 0.0568 | 0.8508 |
| Mean | nan | 19.6383 | 513.4495 | 0.0497 | 0.0498 | 0.8678 |
| SD | nan | 4.7285 | 160.8244 | 0.0124 | 0.0117 | 0.0265 |
_EnsembleForecasterWithVoting(forecasters=[('ets', AutoETS(additive_only=False, allow_multiplicative_trend=False, auto=False, bounds=None, callback=None, damped_trend=False, dates=None, disp=False, error='add', freq=None, full_output=True, information_criterion='aic', initial_level=None, initial_seasonal=None, initial_trend=None, initialization_method='estimated',... initial_trend=None, initialization_method='estimated', seasonal='add', sp=12, trend='add', use_boxcox=None)), ('arima', ARIMA(maxiter=50, method='lbfgs', order=(1, 0, 0), out_of_sample_size=0, scoring='mse', scoring_args=None, seasonal_order=(0, 1, 0, 12), start_params=None, suppress_warnings=False, trend=None, with_intercept=True))], method='mean', n_jobs=-1, weights=None)
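The repr above shows method='mean' and weights=None on the underlying voting forecaster; assuming blend_models forwards these arguments, a weighted mean would look like the sketch below (the weights are illustrative, not tuned):

weighted_blend = exp.blend_models(
    estimator_list=tuned_top3_models,
    method='mean',            # average the member forecasts
    weights=[0.5, 0.3, 0.2],  # illustrative weights, one per tuned model
)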
# Make interactive standard diagnostic plots
# NOTE - not implemented yet for the time-series module
# exp.evaluate_model(blended_model)
# exp.plot_model(blended_model, plot='residuals_interactive')
# exp.plot_model(blended_model, plot='error')
# exp.plot_model(blended_model, plot='learning')
# exp.plot_model(blended_model, plot='manifold')
# exp.plot_model(blended_model, plot='feature')
# Check model prediction
y_pred = exp.predict_model(blended_model)
_ = plot_series(y_raw_data, y_train, y_pred[0], labels=['All', 'Train', 'Test Predictions'])
| | Model | MAE | RMSE | MAPE | SMAPE | R2 |
|---|---|---|---|---|---|---|
| 0 | EnsembleForecaster | 13.2377 | 17.3508 | 0.0274 | 0.0274 | 0.9457 |
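As a quick sanity check, the hold-out MAE in the table above can be recomputed directly from the stored test split (the [0] column indexing mirrors the predict_model call above):

from sklearn.metrics import mean_absolute_error

# Should match the MAE reported by predict_model up to rounding (~13.2377)
print(mean_absolute_error(y_test, y_pred[0]))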
# Train on the full dataset so that the model is ready to forecast beyond it
final_model = exp.finalize_model(blended_model)
y_pred_future = exp.predict_model(final_model, verbose=False)
_ = plot_series(y_raw_data, y_pred_future[0], labels=['All', 'Future Pred (Unknown)'])
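Because the finalized model was refit on all 144 observations (1949-01 to 1960-12), the forecast index should now extend 12 periods past the observed data; a quick check:

# Expect 1961-01 .. 1961-12, i.e. fh=12 steps beyond the last observed period
print(y_pred_future.index.min(), y_pred_future.index.max())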
# Save model
exp.save_model(final_model, model_name='best_model')
# final_model = exp.load_model('best_model')
Transformation Pipeline and Model Successfully Saved
(_EnsembleForecasterWithVoting(forecasters=[('ets', AutoETS(additive_only=False, allow_multiplicative_trend=False, auto=False, bounds=None, callback=None, damped_trend=False, dates=None, disp=False, error='add', freq=None, full_output=True, information_criterion='aic', initial_level=None, initial_seasonal=None, initial_trend=None, initialization_method='estimated',... initial_trend=None, initialization_method='estimated', seasonal='add', sp=12, trend='add', use_boxcox=None)), ('arima', ARIMA(maxiter=50, method='lbfgs', order=(1, 0, 0), out_of_sample_size=0, scoring='mse', scoring_args=None, seasonal_order=(0, 1, 0, 12), start_params=None, suppress_warnings=False, trend=None, with_intercept=True))], method='mean', n_jobs=-1, weights=None), 'best_model.pkl')
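For completeness, a hedged sketch of reloading the saved pipeline in a fresh session; whether a bare experiment object can serve predictions without another setup() call is an assumption I have not verified against the alpha:

from pycaret.internal.pycaret_experiment import TimeSeriesExperiment

# Reload the pickled pipeline saved as best_model.pkl above
new_exp = TimeSeriesExperiment()
loaded_model = new_exp.load_model('best_model')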