from pycaret.datasets import get_data
from pycaret.anomaly import *
# Record the versions of key libraries for reproducibility.
from importlib.metadata import version
print(f"pycaret=={version('pycaret')}")
pycaret==2.3.1
# To list every pre-packaged dataset: get_data('index')
# Load the bundled 'anomaly' demo dataset (10 numeric feature columns,
# no target column — unsupervised use case).
data = get_data('anomaly')
Col1 | Col2 | Col3 | Col4 | Col5 | Col6 | Col7 | Col8 | Col9 | Col10 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.263995 | 0.764929 | 0.138424 | 0.935242 | 0.605867 | 0.518790 | 0.912225 | 0.608234 | 0.723782 | 0.733591 |
1 | 0.546092 | 0.653975 | 0.065575 | 0.227772 | 0.845269 | 0.837066 | 0.272379 | 0.331679 | 0.429297 | 0.367422 |
2 | 0.336714 | 0.538842 | 0.192801 | 0.553563 | 0.074515 | 0.332993 | 0.365792 | 0.861309 | 0.899017 | 0.088600 |
3 | 0.092108 | 0.995017 | 0.014465 | 0.176371 | 0.241530 | 0.514724 | 0.562208 | 0.158963 | 0.073715 | 0.208463 |
4 | 0.325261 | 0.805968 | 0.957033 | 0.331665 | 0.307923 | 0.355315 | 0.501899 | 0.558449 | 0.885169 | 0.182754 |
This step also includes data pre-processing such as normalisation and missing-value imputation. Logged data and the profile are stored under the mlruns folder.
# Column roles passed to the pycaret setup: all ten features are numeric.
numeric_cols = [f'Col{i}' for i in range(1, 11)]  # 'Col1' .. 'Col10'
# This dataset has no categorical columns and nothing to ignore.
categorical_cols = []
ignore_cols = []
# Configure the pycaret anomaly-detection experiment (preprocessing +
# experiment logging). NOTE(review): `silent=True` skips the interactive
# dtype-confirmation prompt; this keyword was removed in pycaret 3.x —
# confirm the environment stays pinned to 2.3.x.
clf_setup = setup(
data=data,
silent=True,
numeric_features=numeric_cols,
categorical_features=categorical_cols,
ignore_features=ignore_cols,
imputation_type='simple',
numeric_imputation='median',
normalize=True,
normalize_method='minmax',
n_jobs=-1,
preprocess=True,
session_id=6,
log_data=True,
log_profile=True,
log_experiment=True,
experiment_name='anomaly1')
# Persist the transformed feature matrix for later inspection.
# Logged data/profile artifacts are written under the local mlruns/ folder.
X_data = get_config('X')
X_data.to_csv('./processed_data.csv', index=False)
# To grab the fitted preprocessing pipeline:
# data_processing_pipeline = get_config('prep_pipe')
# data_processing_pipeline = get_config('prep_pipe')
Description | Value | |
---|---|---|
0 | session_id | 6 |
1 | Original Data | (1000, 10) |
2 | Missing Values | False |
3 | Numeric Features | 10 |
4 | Categorical Features | 0 |
5 | Ordinal Features | False |
6 | High Cardinality Features | False |
7 | High Cardinality Method | None |
8 | Transformed Data | (1000, 10) |
9 | CPU Jobs | -1 |
10 | Use GPU | False |
11 | Log Experiment | True |
12 | Experiment Name | anomaly1 |
13 | USI | e2a4 |
14 | Imputation Type | simple |
15 | Iterative Imputation Iteration | None |
16 | Numeric Imputer | median |
17 | Iterative Imputation Numeric Model | None |
18 | Categorical Imputer | mode |
19 | Iterative Imputation Categorical Model | None |
20 | Unknown Categoricals Handling | least_frequent |
21 | Normalize | True |
22 | Normalize Method | minmax |
23 | Transformation | False |
24 | Transformation Method | None |
25 | PCA | False |
26 | PCA Method | None |
27 | PCA Components | None |
28 | Ignore Low Variance | False |
29 | Combine Rare Levels | False |
30 | Rare Level Threshold | None |
31 | Numeric Binning | False |
32 | Remove Outliers | False |
33 | Outliers Threshold | None |
34 | Remove Multicollinearity | False |
35 | Multicollinearity Threshold | None |
36 | Clustering | False |
37 | Clustering Iteration | None |
38 | Polynomial Features | False |
39 | Polynomial Degree | None |
40 | Trignometry Features | False |
41 | Polynomial Threshold | None |
42 | Group Features | False |
43 | Feature Selection | False |
44 | Feature Selection Method | classic |
45 | Features Selection Threshold | None |
46 | Feature Interaction | False |
47 | Feature Ratio | False |
48 | Interaction Threshold | None |
# Train the model of interest.
# To list all available anomaly models: models()
# Isolation Forest; `fraction` is the expected share of outliers and maps
# to the underlying IForest's `contamination=0.05` (see the printed model).
iforest_model = create_model('iforest', fraction=0.05)
print(iforest_model)
IForest(behaviour='new', bootstrap=False, contamination=0.05, max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=-1, random_state=6, verbose=0)
# Label the training data: appends `Anomaly` (1 flags an outlier) and
# `Anomaly_Score` columns to the original features.
iforest_results = assign_model(iforest_model)
iforest_results.head()
Col1 | Col2 | Col3 | Col4 | Col5 | Col6 | Col7 | Col8 | Col9 | Col10 | Anomaly | Anomaly_Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.263995 | 0.764929 | 0.138424 | 0.935242 | 0.605867 | 0.518790 | 0.912225 | 0.608234 | 0.723782 | 0.733591 | 0 | -0.017557 |
1 | 0.546092 | 0.653975 | 0.065575 | 0.227772 | 0.845269 | 0.837066 | 0.272379 | 0.331679 | 0.429297 | 0.367422 | 0 | -0.066793 |
2 | 0.336714 | 0.538842 | 0.192801 | 0.553563 | 0.074515 | 0.332993 | 0.365792 | 0.861309 | 0.899017 | 0.088600 | 1 | 0.004493 |
3 | 0.092108 | 0.995017 | 0.014465 | 0.176371 | 0.241530 | 0.514724 | 0.562208 | 0.158963 | 0.073715 | 0.208463 | 1 | 0.046974 |
4 | 0.325261 | 0.805968 | 0.957033 | 0.331665 | 0.307923 | 0.355315 | 0.501899 | 0.558449 | 0.885169 | 0.182754 | 0 | -0.036004 |
# Score data through the saved pipeline with predict_model — usable on
# new/unseen data too; here it reproduces assign_model's labels on `data`.
iforest_results = predict_model(iforest_model, data=data)
iforest_results.head()
Col1 | Col2 | Col3 | Col4 | Col5 | Col6 | Col7 | Col8 | Col9 | Col10 | Anomaly | Anomaly_Score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.263995 | 0.764929 | 0.138424 | 0.935242 | 0.605867 | 0.518790 | 0.912225 | 0.608234 | 0.723782 | 0.733591 | 0 | -0.017557 |
1 | 0.546092 | 0.653975 | 0.065575 | 0.227772 | 0.845269 | 0.837066 | 0.272379 | 0.331679 | 0.429297 | 0.367422 | 0 | -0.066793 |
2 | 0.336714 | 0.538842 | 0.192801 | 0.553563 | 0.074515 | 0.332993 | 0.365792 | 0.861309 | 0.899017 | 0.088600 | 1 | 0.004493 |
3 | 0.092108 | 0.995017 | 0.014465 | 0.176371 | 0.241530 | 0.514724 | 0.562208 | 0.158963 | 0.073715 | 0.208463 | 1 | 0.046974 |
4 | 0.325261 | 0.805968 | 0.957033 | 0.331665 | 0.307923 | 0.355315 | 0.501899 | 0.558449 | 0.885169 | 0.182754 | 0 | -0.036004 |
# Interactive standard diagnostic plots (widget-based explorer).
evaluate_model(iforest_model)
# plot_model(iforest_model, plot='tsne')
# plot_model(iforest_model, plot='umap')
# Persist the preprocessing pipeline + trained model to best_model.pkl.
save_model(iforest_model, model_name='best_model')
# To reload later (name must match the one saved above):
# iforest_model = load_model('best_model')
Transformation Pipeline and Model Succesfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=[], display_types=False, features_todrop=[], id_columns=[], ml_usecase='regression', numerical_features=['Col1', 'Col2', 'Col3', 'Col4', 'Col5', 'Col6', 'Col7', 'Col8', 'Col9', 'Col10'], target='UNSUPERVISED_DUMMY_TARGET', time_features=[])), ('imputer', Simple_Imputer(categorical_strate... ('fix_perfect', 'passthrough'), ('clean_names', Clean_Colum_Names()), ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'), ('dfs', 'passthrough'), ('pca', 'passthrough'), ['trained_model', IForest(behaviour='new', bootstrap=False, contamination=0.05, max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=-1, random_state=6, verbose=0)]], verbose=False), 'best_model.pkl')