from pycaret.datasets import get_data
from pycaret.anomaly import *
# Record the versions of key libraries for reproducibility.
from importlib.metadata import version
print(f"pycaret=={version('pycaret')}")
pycaret==2.3.1
# To list every pre-packaged dataset: get_data('index')
# Load the bundled 'anomaly' demo dataset (10 numeric feature columns,
# no target column — unsupervised use case).
data = get_data('anomaly')
Col1 | Col2 | Col3 | Col4 | Col5 | Col6 | Col7 | Col8 | Col9 | Col10 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.263995 | 0.764929 | 0.138424 | 0.935242 | 0.605867 | 0.518790 | 0.912225 | 0.608234 | 0.723782 | 0.733591 |
1 | 0.546092 | 0.653975 | 0.065575 | 0.227772 | 0.845269 | 0.837066 | 0.272379 | 0.331679 | 0.429297 | 0.367422 |
2 | 0.336714 | 0.538842 | 0.192801 | 0.553563 | 0.074515 | 0.332993 | 0.365792 | 0.861309 | 0.899017 | 0.088600 |
3 | 0.092108 | 0.995017 | 0.014465 | 0.176371 | 0.241530 | 0.514724 | 0.562208 | 0.158963 | 0.073715 | 0.208463 |
4 | 0.325261 | 0.805968 | 0.957033 | 0.331665 | 0.307923 | 0.355315 | 0.501899 | 0.558449 | 0.885169 | 0.182754 |
This step also includes data pre-processing such as normalisation and missing-value imputation. Logged data and the profile are stored under the mlruns folder.
# Column roles passed to the pycaret setup: all ten features are numeric.
numeric_cols = [f'Col{i}' for i in range(1, 11)]  # 'Col1' .. 'Col10'
# This dataset has no categorical columns and nothing to ignore.
categorical_cols = []
ignore_cols = []
# Configure the pycaret anomaly-detection experiment (preprocessing +
# experiment logging). NOTE(review): `silent=True` skips the interactive
# dtype-confirmation prompt; this keyword was removed in pycaret 3.x —
# confirm the environment stays pinned to 2.3.x.
clf_setup = setup(
data=data,
silent=True,
numeric_features=numeric_cols,
categorical_features=categorical_cols,
ignore_features=ignore_cols,
imputation_type='simple',
numeric_imputation='median',
normalize=True,
normalize_method='minmax',
n_jobs=-1,
preprocess=True,
session_id=6,
log_data=True,
log_profile=True,
log_experiment=True,
experiment_name='anomaly1')
# Persist the transformed feature matrix for later inspection.
# Logged data/profile artifacts are written under the local mlruns/ folder.
X_data = get_config('X')
X_data.to_csv('./processed_data.csv', index=False)
# To grab the fitted preprocessing pipeline:
# data_processing_pipeline = get_config('prep_pipe')
# data_processing_pipeline = get_config('prep_pipe')
Description | Value | |
---|---|---|
0 | session_id | 6 |
1 | Original Data | (1000, 10) |
2 | Missing Values | False |
3 | Numeric Features | 10 |
4 | Categorical Features | 0 |
5 | Ordinal Features | False |
6 | High Cardinality Features | False |
7 | High Cardinality Method | None |
8 | Transformed Data | (1000, 10) |
9 | CPU Jobs | -1 |
10 | Use GPU | False |
11 | Log Experiment | True |
12 | Experiment Name | anomaly1 |
13 | USI | e2a4 |
14 | Imputation Type | simple |
15 | Iterative Imputation Iteration | None |
16 | Numeric Imputer | median |
17 | Iterative Imputation Numeric Model | None |
18 | Categorical Imputer | mode |
19 | Iterative Imputation Categorical Model | None |
20 | Unknown Categoricals Handling | least_frequent |
21 | Normalize | True |
22 | Normalize Method | minmax |
23 | Transformation | False |
24 | Transformation Method | None |
25 | PCA | False |
26 | PCA Method | None |
27 | PCA Components | None |
28 | Ignore Low Variance | False |
29 | Combine Rare Levels | False |
30 | Rare Level Threshold | None |
31 | Numeric Binning | False |
32 | Remove Outliers | False |
33 | Outliers Threshold | None |
34 | Remove Multicollinearity | False |
35 | Multicollinearity Threshold | None |
36 | Clustering | False |
37 | Clustering Iteration | None |
38 | Polynomial Features | False |
39 | Polynomial Degree | None |
40 | Trignometry Features | False |
41 | Polynomial Threshold | None |
42 | Group Features | False |
43 | Feature Selection | False |
44 | Feature Selection Method | classic |
45 | Features Selection Threshold | None |
46 | Feature Interaction | False |
47 | Feature Ratio | False |
48 | Interaction Threshold | None |
# Train the model of interest.
# To list all available anomaly models: models()
# Isolation Forest; `fraction` is the expected share of outliers and maps
# to the underlying IForest's `contamination=0.05` (see the printed model).
iforest_model = create_model('iforest', fraction=0.05)
print(iforest_model)
IForest(behaviour='new', bootstrap=False, contamination=0.05, max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=-1, random_state=6, verbose=0)
# Label the training data: appends `Anomaly` (1 flags an outlier) and
# `Anomaly_Score` columns to the original features.
iforest_results = assign_model(iforest_model)
iforest_results.head()
Col1 | Col2 | Col3 | Col4 | Col5 | Col6 | Col7 | Col8 | Col9 | Col10 | Anomaly | Anomaly_Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.263995 | 0.764929 | 0.138424 | 0.935242 | 0.605867 | 0.518790 | 0.912225 | 0.608234 | 0.723782 | 0.733591 | 0 | -0.017557 |
1 | 0.546092 | 0.653975 | 0.065575 | 0.227772 | 0.845269 | 0.837066 | 0.272379 | 0.331679 | 0.429297 | 0.367422 | 0 | -0.066793 |
2 | 0.336714 | 0.538842 | 0.192801 | 0.553563 | 0.074515 | 0.332993 | 0.365792 | 0.861309 | 0.899017 | 0.088600 | 1 | 0.004493 |
3 | 0.092108 | 0.995017 | 0.014465 | 0.176371 | 0.241530 | 0.514724 | 0.562208 | 0.158963 | 0.073715 | 0.208463 | 1 | 0.046974 |
4 | 0.325261 | 0.805968 | 0.957033 | 0.331665 | 0.307923 | 0.355315 | 0.501899 | 0.558449 | 0.885169 | 0.182754 | 0 | -0.036004 |
# Score data through the saved pipeline with predict_model — usable on
# new/unseen data too; here it reproduces assign_model's labels on `data`.
iforest_results = predict_model(iforest_model, data=data)
iforest_results.head()
Col1 | Col2 | Col3 | Col4 | Col5 | Col6 | Col7 | Col8 | Col9 | Col10 | Anomaly | Anomaly_Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.263995 | 0.764929 | 0.138424 | 0.935242 | 0.605867 | 0.518790 | 0.912225 | 0.608234 | 0.723782 | 0.733591 | 0 | -0.017557 |
1 | 0.546092 | 0.653975 | 0.065575 | 0.227772 | 0.845269 | 0.837066 | 0.272379 | 0.331679 | 0.429297 | 0.367422 | 0 | -0.066793 |
2 | 0.336714 | 0.538842 | 0.192801 | 0.553563 | 0.074515 | 0.332993 | 0.365792 | 0.861309 | 0.899017 | 0.088600 | 1 | 0.004493 |
3 | 0.092108 | 0.995017 | 0.014465 | 0.176371 | 0.241530 | 0.514724 | 0.562208 | 0.158963 | 0.073715 | 0.208463 | 1 | 0.046974 |
4 | 0.325261 | 0.805968 | 0.957033 | 0.331665 | 0.307923 | 0.355315 | 0.501899 | 0.558449 | 0.885169 | 0.182754 | 0 | -0.036004 |
# Interactive standard diagnostic plots (widget-based explorer).
evaluate_model(iforest_model)
# plot_model(iforest_model, plot='tsne')
# plot_model(iforest_model, plot='umap')
# Persist the preprocessing pipeline + trained model to best_model.pkl.
save_model(iforest_model, model_name='best_model')
# To reload later (name must match the one saved above):
# iforest_model = load_model('best_model')
Transformation Pipeline and Model Succesfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=False, features_todrop=[], id_columns=[], ml_usecase='regression', numerical_features=['Col1', 'Col2', 'Col3', 'Col4', 'Col5', 'Col6', 'Col7', 'Col8', 'Col9', 'Col10'], target='UNSUPERVISED_DUMMY_TARGET', time_features=[])), ('imputer', Simple_Imputer(categorical_strate... ('fix_perfect', 'passthrough'), ('clean_names', Clean_Colum_Names()), ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'), ('dfs', 'passthrough'), ('pca', 'passthrough'), ['trained_model', IForest(behaviour='new', bootstrap=False, contamination=0.05, max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=-1, random_state=6, verbose=0)]], verbose=False), 'best_model.pkl')