from pycaret.datasets import get_data
from pycaret.clustering import *


# Log the installed pycaret version so this run can be reproduced later
from importlib.metadata import version

print(f"pycaret=={version('pycaret')}")

pycaret==2.3.1


# Load the bundled 'pokemon' sample dataset used throughout this notebook.
# (Calling get_data('index') instead lists every pre-packaged dataset.)

data = get_data(dataset='pokemon')


# Column roles handed to setup(): which features are numeric, which are
# categorical, and which are dropped before modelling.
numeric_cols = [
    'Total',
    'HP',
    'Attack',
    'Defense',
    'Sp. Atk',
    'Sp. Def',
    'Speed',
]
categorical_cols = [
    'Type 1',
    'Type 2',
    'Generation',
    'Legendary',
]
# Identifier columns are excluded from clustering. 'Name' has to be listed
# explicitly: when it was left out, setup() inferred it as a fifth
# categorical feature and one-hot encoded it, inflating the 13-column input
# to 851 transformed columns.
# NOTE: outputs recorded before this change still show the old feature
# counts; re-run the notebook to refresh them.
ignore_cols = [
    '#',
    'Name',
]

# Configure the pycaret clustering experiment
clf_setup = setup(
    data=data,
    silent=True,
    # Column roles
    numeric_features=numeric_cols,
    categorical_features=categorical_cols,
    ignore_features=ignore_cols,
    # Preprocessing: median-impute numeric gaps, min-max scale the features
    imputation_type='simple',
    numeric_imputation='median',
    normalize=True,
    normalize_method='minmax',
    n_jobs=-1,
    preprocess=True,
    # Fixed seed so repeated runs are reproducible
    session_id=6,
    # Experiment logging
    log_data=True,
    log_profile=True,
    log_experiment=True,
    experiment_name='pokemon1',
)

# Pull the processed data out of the experiment config and
# write it to disk (row index omitted) for inspection
X_data = get_config(variable='X')
X_data.to_csv('./processed_data.csv', index=False)

# data_processing_pipeline = get_config('prep_pipe')


# Choose the clustering algorithm to fit.
# (Calling models() lists every model available.)

# Fit k-means with three clusters
kmeans_model = create_model(model='kmeans', num_clusters=3)

print(kmeans_model)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=-1, precompute_distances='deprecated',
       random_state=6, tol=0.0001, verbose=0)


# Attach a cluster label to each row of the data the experiment was set up with

kmeans_results = assign_model(model=kmeans_model)

kmeans_results.head()


# Attach a cluster label to an explicitly supplied dataset (here the original
# `data` again); this overwrites the previous kmeans_results

kmeans_results = predict_model(model=kmeans_model, data=data)

kmeans_results.head()


# Open the interactive set of standard diagnostic plots for the fitted model
evaluate_model(model=kmeans_model)

# Individual plots can also be drawn one at a time:
# plot_model(kmeans_model, plot='cluster')
# plot_model(kmeans_model, feature = 'Total', label=True)
# plot_model(kmeans_model, plot='tsne')
# plot_model(kmeans_model, plot='distribution')
# plot_model(kmeans_model, plot='elbow')
# plot_model(kmeans_model, plot='silhouette')


# Persist the fitted model to disk under the name 'best_model'

save_model(model=kmeans_model, model_name='best_model')
# To restore it later: kmeans_model = load_model('best_model')

Transformation Pipeline and Model Succesfully Saved

(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['Type 1', 'Type 2',
                                                             'Generation',
                                                             'Legendary'],
                                       display_types=False,
                                       features_todrop=['#'], id_columns=[],
                                       ml_usecase='regression',
                                       numerical_features=['Total', 'HP',
                                                           'Attack', 'Defense',
                                                           'Sp. Atk', 'Sp. Def',
                                                           'Speed'],
                                       target='UNSUPERVISED_DUMMY_TARGET',
                                       time_features=[])),
                 ('imputer',
                  Sim...
                 ('fix_perfect', 'passthrough'),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  KMeans(algorithm='auto', copy_x=True, init='k-means++',
                         max_iter=300, n_clusters=3, n_init=10, n_jobs=-1,
                         precompute_distances='deprecated', random_state=6,
                         tol=0.0001, verbose=0)]],
          verbose=False),
 'best_model.pkl')

	#	Name	Type 1	Type 2	Total	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed	Generation	Legendary
0	1	Bulbasaur	Grass	Poison	318	45	49	49	65	65	45	1	False
1	2	Ivysaur	Grass	Poison	405	60	62	63	80	80	60	1	False
2	3	Venusaur	Grass	Poison	525	80	82	83	100	100	80	1	False
3	3	VenusaurMega Venusaur	Grass	Poison	625	80	100	123	122	120	80	1	False
4	4	Charmander	Fire	NaN	309	39	52	43	60	50	65	1	False

	Description	Value
0	session_id	6
1	Original Data	(800, 13)
2	Missing Values	True
3	Numeric Features	7
4	Categorical Features	5
5	Ordinal Features	False
6	High Cardinality Features	False
7	High Cardinality Method	None
8	Transformed Data	(800, 851)
9	CPU Jobs	-1
10	Use GPU	False
11	Log Experiment	True
12	Experiment Name	pokemon1
13	USI	2169
14	Imputation Type	simple
15	Iterative Imputation Iteration	None
16	Numeric Imputer	median
17	Iterative Imputation Numeric Model	None
18	Categorical Imputer	mode
19	Iterative Imputation Categorical Model	None
20	Unknown Categoricals Handling	least_frequent
21	Normalize	True
22	Normalize Method	minmax
23	Transformation	False
24	Transformation Method	None
25	PCA	False
26	PCA Method	None
27	PCA Components	None
28	Ignore Low Variance	False
29	Combine Rare Levels	False
30	Rare Level Threshold	None
31	Numeric Binning	False
32	Remove Outliers	False
33	Outliers Threshold	None
34	Remove Multicollinearity	False
35	Multicollinearity Threshold	None
36	Clustering	False
37	Clustering Iteration	None
38	Polynomial Features	False
39	Polynomial Degree	None
40	Trignometry Features	False
41	Polynomial Threshold	None
42	Group Features	False
43	Feature Selection	False
44	Feature Selection Method	classic
45	Features Selection Threshold	None
46	Feature Interaction	False
47	Feature Ratio	False
48	Interaction Threshold	None

	#	Name	Type 1	Type 2	Total	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed	Generation	Legendary	Cluster
0	1	Bulbasaur	Grass	Poison	318	45	49	49	65	65	45	1	False	Cluster 0
1	2	Ivysaur	Grass	Poison	405	60	62	63	80	80	60	1	False	Cluster 0
2	3	Venusaur	Grass	Poison	525	80	82	83	100	100	80	1	False	Cluster 0
3	3	VenusaurMega Venusaur	Grass	Poison	625	80	100	123	122	120	80	1	False	Cluster 0
4	4	Charmander	Fire	NaN	309	39	52	43	60	50	65	1	False	Cluster 2

	#	Name	Type 1	Type 2	Total	HP	Attack	Defense	Sp. Atk	Sp. Def	Speed	Generation	Legendary	Cluster
0	1	Bulbasaur	Grass	Poison	318	45	49	49	65	65	45	1	False	Cluster 0
1	2	Ivysaur	Grass	Poison	405	60	62	63	80	80	60	1	False	Cluster 0
2	3	Venusaur	Grass	Poison	525	80	82	83	100	100	80	1	False	Cluster 0
3	3	VenusaurMega Venusaur	Grass	Poison	625	80	100	123	122	120	80	1	False	Cluster 0
4	4	Charmander	Fire	NaN	309	39	52	43	60	50	65	1	False	Cluster 2

(1) Import libraries¶

(2) Get data¶

(3) Setup pipeline¶

(4) Create models¶

(5) Assign cluster labels to current data¶

(6) Assign cluster labels to new data¶

(7) Analyze model performance¶

(8) Save model¶