from pycaret.datasets import get_data
from pycaret.clustering import *
# Record the installed version of the key library used in this notebook
from importlib.metadata import version
print(f"pycaret=={version('pycaret')}")
pycaret==2.3.1
# To browse the index of all pre-packaged datasets, uncomment:
# get_data('index')
# Load the pre-packaged 'pokemon' dataset used for testing below
data = get_data('pokemon')
# | Name | Type 1 | Type 2 | Total | HP | Attack | Defense | Sp. Atk | Sp. Def | Speed | Generation | Legendary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Bulbasaur | Grass | Poison | 318 | 45 | 49 | 49 | 65 | 65 | 45 | 1 | False |
1 | 2 | Ivysaur | Grass | Poison | 405 | 60 | 62 | 63 | 80 | 80 | 60 | 1 | False |
2 | 3 | Venusaur | Grass | Poison | 525 | 80 | 82 | 83 | 100 | 100 | 80 | 1 | False |
3 | 3 | VenusaurMega Venusaur | Grass | Poison | 625 | 80 | 100 | 123 | 122 | 120 | 80 | 1 | False |
4 | 4 | Charmander | Fire | NaN | 309 | 39 | 52 | 43 | 60 | 50 | 65 | 1 | False |
The setup step below also performs data pre-processing, such as normalisation and missing-value imputation.
Logged data and profile are stored under the mlruns folder.
# Group the dataset columns by the role they are given in setup()
# Columns passed to setup() as numeric_features
numeric_cols = [
'Total',
'HP',
'Attack',
'Defense',
'Sp. Atk',
'Sp. Def',
'Speed'
]
# Columns passed to setup() as categorical_features
categorical_cols = [
'Type 1',
'Type 2',
'Generation',
'Legendary'
]
# Columns passed to setup() as ignore_features
ignore_cols = [
'#'
]
# NOTE(review): 'Name' is not in any of these lists, yet the setup summary
# further down reports 5 categorical features and a transformed shape of
# (800, 851). Presumably 'Name' is inferred as categorical and one-hot
# encoded; consider adding it to ignore_cols. TODO confirm.
# Configure the pycaret clustering experiment
clf_setup = setup(
    data=data,
    # Column roles
    numeric_features=numeric_cols,
    categorical_features=categorical_cols,
    ignore_features=ignore_cols,
    # Pre-processing: simple median imputation and min-max normalisation
    preprocess=True,
    imputation_type='simple',
    numeric_imputation='median',
    normalize=True,
    normalize_method='minmax',
    # Execution settings
    session_id=6,
    n_jobs=-1,
    silent=True,
    # Experiment logging
    experiment_name='pokemon1',
    log_experiment=True,
    log_data=True,
    log_profile=True,
)
# Fetch the processed data from the experiment config and write it to CSV
X_data = get_config('X')
X_data.to_csv('./processed_data.csv', index=False)
# data_processing_pipeline = get_config('prep_pipe')
Description | Value | |
---|---|---|
0 | session_id | 6 |
1 | Original Data | (800, 13) |
2 | Missing Values | True |
3 | Numeric Features | 7 |
4 | Categorical Features | 5 |
5 | Ordinal Features | False |
6 | High Cardinality Features | False |
7 | High Cardinality Method | None |
8 | Transformed Data | (800, 851) |
9 | CPU Jobs | -1 |
10 | Use GPU | False |
11 | Log Experiment | True |
12 | Experiment Name | pokemon1 |
13 | USI | 2169 |
14 | Imputation Type | simple |
15 | Iterative Imputation Iteration | None |
16 | Numeric Imputer | median |
17 | Iterative Imputation Numeric Model | None |
18 | Categorical Imputer | mode |
19 | Iterative Imputation Categorical Model | None |
20 | Unknown Categoricals Handling | least_frequent |
21 | Normalize | True |
22 | Normalize Method | minmax |
23 | Transformation | False |
24 | Transformation Method | None |
25 | PCA | False |
26 | PCA Method | None |
27 | PCA Components | None |
28 | Ignore Low Variance | False |
29 | Combine Rare Levels | False |
30 | Rare Level Threshold | None |
31 | Numeric Binning | False |
32 | Remove Outliers | False |
33 | Outliers Threshold | None |
34 | Remove Multicollinearity | False |
35 | Multicollinearity Threshold | None |
36 | Clustering | False |
37 | Clustering Iteration | None |
38 | Polynomial Features | False |
39 | Polynomial Degree | None |
40 | Trignometry Features | False |
41 | Polynomial Threshold | None |
42 | Group Features | False |
43 | Feature Selection | False |
44 | Feature Selection Method | classic |
45 | Features Selection Threshold | None |
46 | Feature Interaction | False |
47 | Feature Ratio | False |
48 | Interaction Threshold | None |
# Choose the clustering model to test
# To see all available models, uncomment:
# models()
# Create a k-means model with three clusters and print its parameters
kmeans_model = create_model('kmeans', num_clusters=3)
print(kmeans_model)
Silhouette | Calinski-Harabasz | Davies-Bouldin | Homogeneity | Rand Index | Completeness | |
---|---|---|---|---|---|---|
0 | 0.095 | 53.6185 | 3.0264 | 0 | 0 | 0 |
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10, n_jobs=-1, precompute_distances='deprecated', random_state=6, tol=0.0001, verbose=0)
# Label each row with its cluster from the model (adds a 'Cluster' column)
kmeans_results = assign_model(kmeans_model)
kmeans_results.head()
# | Name | Type 1 | Type 2 | Total | HP | Attack | Defense | Sp. Atk | Sp. Def | Speed | Generation | Legendary | Cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Bulbasaur | Grass | Poison | 318 | 45 | 49 | 49 | 65 | 65 | 45 | 1 | False | Cluster 0 |
1 | 2 | Ivysaur | Grass | Poison | 405 | 60 | 62 | 63 | 80 | 80 | 60 | 1 | False | Cluster 0 |
2 | 3 | Venusaur | Grass | Poison | 525 | 80 | 82 | 83 | 100 | 100 | 80 | 1 | False | Cluster 0 |
3 | 3 | VenusaurMega Venusaur | Grass | Poison | 625 | 80 | 100 | 123 | 122 | 120 | 80 | 1 | False | Cluster 0 |
4 | 4 | Charmander | Fire | NaN | 309 | 39 | 52 | 43 | 60 | 50 | 65 | 1 | False | Cluster 2 |
# Predict clusters for an explicitly supplied dataset with the same model.
# The original data is reused here, and kmeans_results from the
# assign_model call above is overwritten.
kmeans_results = predict_model(kmeans_model, data=data)
kmeans_results.head()
# | Name | Type 1 | Type 2 | Total | HP | Attack | Defense | Sp. Atk | Sp. Def | Speed | Generation | Legendary | Cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Bulbasaur | Grass | Poison | 318 | 45 | 49 | 49 | 65 | 65 | 45 | 1 | False | Cluster 0 |
1 | 2 | Ivysaur | Grass | Poison | 405 | 60 | 62 | 63 | 80 | 80 | 60 | 1 | False | Cluster 0 |
2 | 3 | Venusaur | Grass | Poison | 525 | 80 | 82 | 83 | 100 | 100 | 80 | 1 | False | Cluster 0 |
3 | 3 | VenusaurMega Venusaur | Grass | Poison | 625 | 80 | 100 | 123 | 122 | 120 | 80 | 1 | False | Cluster 0 |
4 | 4 | Charmander | Fire | NaN | 309 | 39 | 52 | 43 | 60 | 50 | 65 | 1 | False | Cluster 2 |
# Make interactive standard diagnostic plots
evaluate_model(kmeans_model)
# Alternatively, render individual plots by uncommenting any of these:
# plot_model(kmeans_model, plot='cluster')
# plot_model(kmeans_model, feature='Total', label=True)
# plot_model(kmeans_model, plot='tsne')
# plot_model(kmeans_model, plot='distribution')
# plot_model(kmeans_model, plot='elbow')
# plot_model(kmeans_model, plot='silhouette')
# Save the transformation pipeline and model under the name 'best_model'
save_model(kmeans_model, model_name='best_model')
# To reload it later, use the same name it was saved under:
# kmeans_model = load_model('best_model')
Transformation Pipeline and Model Succesfully Saved
(Pipeline(memory=None, steps=[('dtypes', DataTypes_Auto_infer(categorical_features=['Type 1', 'Type 2', 'Generation', 'Legendary'], display_types=False, features_todrop=['#'], id_columns=[], ml_usecase='regression', numerical_features=['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed'], target='UNSUPERVISED_DUMMY_TARGET', time_features=[])), ('imputer', Sim... ('fix_perfect', 'passthrough'), ('clean_names', Clean_Colum_Names()), ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'), ('dfs', 'passthrough'), ('pca', 'passthrough'), ['trained_model', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10, n_jobs=-1, precompute_distances='deprecated', random_state=6, tol=0.0001, verbose=0)]], verbose=False), 'best_model.pkl')