import numpy as np
import pandas as pd
import tensorflow as tf
from pycaret.datasets import get_data
import sklearn.model_selection
import autokeras as ak
# Print the installed autokeras version in pip "name==version" form
from importlib.metadata import version
print(f"autokeras=={version('autokeras')}")
autokeras==1.0.15
# Load the pre-packaged 'diabetes' dataset to use as test data.
# To list every pre-packaged dataset instead, call get_data('index').
data = get_data(dataset='diabetes')
| | Number of times pregnant | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Diastolic blood pressure (mm Hg) | Triceps skin fold thickness (mm) | 2-Hour serum insulin (mu U/ml) | Body mass index (weight in kg/(height in m)^2) | Diabetes pedigree function | Age (years) | Class variable |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
# Features (X) are every column except the last; the target (y) is the last column
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Declare the type of every feature column in a dictionary.
# Each type can only be 'categorical' or 'numerical'; all eight features
# here are numerical.
column_types = dict.fromkeys(
    [
        'Number of times pregnant',
        'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
        'Diastolic blood pressure (mm Hg)',
        'Triceps skin fold thickness (mm)',
        '2-Hour serum insulin (mu U/ml)',
        'Body mass index (weight in kg/(height in m)^2)',
        'Diabetes pedigree function',
        'Age (years)',
    ],
    'numerical',
)

# Divide the data into training and testing portions. No test_size is given,
# so scikit-learn's default proportion applies; random_state pins the shuffle.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    random_state=6,
)

# Show the full feature table and the target series
display(X)
display(y)
| | Number of times pregnant | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Diastolic blood pressure (mm Hg) | Triceps skin fold thickness (mm) | 2-Hour serum insulin (mu U/ml) | Body mass index (weight in kg/(height in m)^2) | Diabetes pedigree function | Age (years) |
---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 |
764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 |
765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 |
766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 |
767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 |
768 rows × 8 columns
0 1 1 0 2 1 3 0 4 1 .. 763 0 764 0 765 0 766 1 767 0 Name: Class variable, Length: 768, dtype: int64
# Build the AutoKeras structured-data classifier search.
# No loss is passed, so it defaults to 'binary_crossentropy' or
# 'categorical_crossentropy' based on the number of classes.
automl = ak.StructuredDataClassifier(
    project_name='diabetes',
    column_names=list(X.columns),
    column_types=column_types,
    multi_label=False,
    max_trials=100,
    overwrite=True,
    seed=6,
)

# Run the search on the training split to find the best model
automl.fit(x=X_train, y=y_train)
Trial 100 Complete [00h 00m 03s] val_accuracy: 0.8229166865348816 Best val_accuracy So Far: 0.8333333134651184 Total elapsed time: 00h 18m 24s INFO:tensorflow:Oracle triggered exit Epoch 1/13 18/18 [==============================] - 0s 2ms/step - loss: 0.6109 - accuracy: 0.6684 Epoch 2/13 18/18 [==============================] - 0s 2ms/step - loss: 0.5016 - accuracy: 0.7656 Epoch 3/13 18/18 [==============================] - 0s 2ms/step - loss: 0.4682 - accuracy: 0.7830 Epoch 4/13 18/18 [==============================] - 0s 2ms/step - loss: 0.4635 - accuracy: 0.7674 Epoch 5/13 18/18 [==============================] - 0s 2ms/step - loss: 0.4557 - accuracy: 0.7812 Epoch 6/13 18/18 [==============================] - 0s 2ms/step - loss: 0.4479 - accuracy: 0.7951 Epoch 7/13 18/18 [==============================] - 0s 2ms/step - loss: 0.4464 - accuracy: 0.7899 Epoch 8/13 18/18 [==============================] - 0s 2ms/step - loss: 0.4506 - accuracy: 0.7847 Epoch 9/13 18/18 [==============================] - 0s 2ms/step - loss: 0.4334 - accuracy: 0.7934 Epoch 10/13 18/18 [==============================] - 0s 2ms/step - loss: 0.4405 - accuracy: 0.7934 Epoch 11/13 18/18 [==============================] - 0s 2ms/step - loss: 0.4260 - accuracy: 0.7882 Epoch 12/13 18/18 [==============================] - 0s 2ms/step - loss: 0.4274 - accuracy: 0.7865 Epoch 13/13 18/18 [==============================] - 0s 2ms/step - loss: 0.4313 - accuracy: 0.7847 INFO:tensorflow:Assets written to: ./diabetes/best_model/assets <autokeras.tasks.structured_data.StructuredDataClassifier object at 0x7f92ffdca640>
# Use the best model found by the search to predict the held-out test features
predicted_y = automl.predict(X_test)

# Score the model on both splits. Each evaluation is printed right after it
# runs so the progress output and the result line stay together.
train_metrics = automl.evaluate(x=X_train, y=y_train)
print("Train loss and accuracy:", train_metrics)
test_metrics = automl.evaluate(x=X_test, y=y_test)
print("Test loss and accuracy:", test_metrics)
6/6 [==============================] - 0s 1ms/step 18/18 [==============================] - 0s 1ms/step - loss: 0.4096 - accuracy: 0.8003 Train loss and accuracy: [0.40956196188926697, 0.8003472089767456] 6/6 [==============================] - 0s 2ms/step - loss: 0.4727 - accuracy: 0.8021 Test loss and accuracy: [0.4726644456386566, 0.8020833134651184]
# Pull the best model out of the search; it comes back as a standard
# tensorflow object
best_model = automl.export_model()

# Print the layer-by-layer structure of the model
best_model.summary()

# Write the model to disk under 'best_model'
best_model.save(filepath='best_model')
Model: "model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_1 (InputLayer) [(None, 8)] 0 _________________________________________________________________ multi_category_encoding (Mul (None, 8) 0 _________________________________________________________________ normalization (Normalization (None, 8) 17 _________________________________________________________________ dense (Dense) (None, 512) 4608 _________________________________________________________________ re_lu (ReLU) (None, 512) 0 _________________________________________________________________ dropout (Dropout) (None, 512) 0 _________________________________________________________________ dense_1 (Dense) (None, 32) 16416 _________________________________________________________________ re_lu_1 (ReLU) (None, 32) 0 _________________________________________________________________ dropout_1 (Dropout) (None, 32) 0 _________________________________________________________________ dense_2 (Dense) (None, 1) 33 _________________________________________________________________ classification_head_1 (Activ (None, 1) 0 ================================================================= Total params: 21,074 Trainable params: 21,057 Non-trainable params: 17 _________________________________________________________________