import numpy as np
import pandas as pd
import tensorflow as tf
from pycaret.datasets import get_data
import sklearn.model_selection

import autokeras as ak


# Report the installed version of the key library so the run can be reproduced
from importlib.metadata import version

# Printed in pip's "name==version" pin format
print('autokeras==%s' % (version('autokeras'),))

autokeras==1.0.15


# Get a list of all pre-packaged datasets
# (uncomment the call below to see the names that can be requested)
# get_data('index')

# Select a pre-packaged dataset for testing.
# `data` is treated as a table further down: its last column is taken as the
# target and every preceding column as a feature.
data = get_data('diabetes')


# Separate the features (every column except the last) from the target (last column)
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Feature types are given per column as a {column name: type} dictionary;
# a type can only be either 'categorical' or 'numerical'.
# Every feature listed here is numerical, so the same value is mapped onto each name.
column_types = dict.fromkeys(
    (
        'Number of times pregnant',
        'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
        'Diastolic blood pressure (mm Hg)',
        'Triceps skin fold thickness (mm)',
        '2-Hour serum insulin (mu U/ml)',
        'Body mass index (weight in kg/(height in m)^2)',
        'Diabetes pedigree function',
        'Age (years)',
    ),
    'numerical',
)

# Divide the rows into training and testing sets. No test_size is passed, so
# scikit-learn's default proportion applies; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    random_state=6,
)

# Show the full feature table and the target column
display(X)
display(y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Class variable, Length: 768, dtype: int64


# Configure the AutoKeras structured-data classifier search.
# No loss is passed: it defaults to 'binary_crossentropy' or
# 'categorical_crossentropy' based on the number of classes.
automl = ak.StructuredDataClassifier(
    project_name='diabetes',
    overwrite=True,
    max_trials=100,
    seed=6,
    multi_label=False,
    column_names=list(X.columns),
    column_types=column_types,
)


# Run the search on the training split and keep the best model found
automl.fit(x=X_train, y=y_train)

Trial 100 Complete [00h 00m 03s]
val_accuracy: 0.8229166865348816

Best val_accuracy So Far: 0.8333333134651184
Total elapsed time: 00h 18m 24s
INFO:tensorflow:Oracle triggered exit
Epoch 1/13
18/18 [==============================] - 0s 2ms/step - loss: 0.6109 - accuracy: 0.6684
Epoch 2/13
18/18 [==============================] - 0s 2ms/step - loss: 0.5016 - accuracy: 0.7656
Epoch 3/13
18/18 [==============================] - 0s 2ms/step - loss: 0.4682 - accuracy: 0.7830
Epoch 4/13
18/18 [==============================] - 0s 2ms/step - loss: 0.4635 - accuracy: 0.7674
Epoch 5/13
18/18 [==============================] - 0s 2ms/step - loss: 0.4557 - accuracy: 0.7812
Epoch 6/13
18/18 [==============================] - 0s 2ms/step - loss: 0.4479 - accuracy: 0.7951
Epoch 7/13
18/18 [==============================] - 0s 2ms/step - loss: 0.4464 - accuracy: 0.7899
Epoch 8/13
18/18 [==============================] - 0s 2ms/step - loss: 0.4506 - accuracy: 0.7847
Epoch 9/13
18/18 [==============================] - 0s 2ms/step - loss: 0.4334 - accuracy: 0.7934
Epoch 10/13
18/18 [==============================] - 0s 2ms/step - loss: 0.4405 - accuracy: 0.7934
Epoch 11/13
18/18 [==============================] - 0s 2ms/step - loss: 0.4260 - accuracy: 0.7882
Epoch 12/13
18/18 [==============================] - 0s 2ms/step - loss: 0.4274 - accuracy: 0.7865
Epoch 13/13
18/18 [==============================] - 0s 2ms/step - loss: 0.4313 - accuracy: 0.7847
INFO:tensorflow:Assets written to: ./diabetes/best_model/assets
<autokeras.tasks.structured_data.StructuredDataClassifier object at 0x7f92ffdca640>


# Generate predictions for the held-out rows with the best model
predicted_y = automl.predict(X_test)

# Evaluate the best model on each split in turn, printing every result
# right after it is computed so the output order stays train first, test second
train_scores = automl.evaluate(x=X_train, y=y_train)
print("Train loss and accuracy:", train_scores)
test_scores = automl.evaluate(x=X_test, y=y_test)
print("Test loss and accuracy:", test_scores)

6/6 [==============================] - 0s 1ms/step
18/18 [==============================] - 0s 1ms/step - loss: 0.4096 - accuracy: 0.8003
Train loss and accuracy: [0.40956196188926697, 0.8003472089767456]
6/6 [==============================] - 0s 2ms/step - loss: 0.4727 - accuracy: 0.8021
Test loss and accuracy: [0.4726644456386566, 0.8020833134651184]


# Pull the best model out of the search; it comes back as a standard tensorflow object
best_model = automl.export_model()

# Print the layer-by-layer structure of the exported model
best_model.summary()


# Write the exported model to disk under the path 'best_model'
best_model.save(filepath='best_model')

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
multi_category_encoding (Mul (None, 8)                 0         
_________________________________________________________________
normalization (Normalization (None, 8)                 17        
_________________________________________________________________
dense (Dense)                (None, 512)               4608      
_________________________________________________________________
re_lu (ReLU)                 (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                16416     
_________________________________________________________________
re_lu_1 (ReLU)               (None, 32)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
_________________________________________________________________
classification_head_1 (Activ (None, 1)                 0         
=================================================================
Total params: 21,074
Trainable params: 21,057
Non-trainable params: 17
_________________________________________________________________

	Number of times pregnant	Plasma glucose concentration a 2 hours in an oral glucose tolerance test	Diastolic blood pressure (mm Hg)	Triceps skin fold thickness (mm)	2-Hour serum insulin (mu U/ml)	Body mass index (weight in kg/(height in m)^2)	Diabetes pedigree function	Age (years)
0	6	148	72	35	0	33.6	0.627	50
1	1	85	66	29	0	26.6	0.351	31
2	8	183	64	0	0	23.3	0.672	32
3	1	89	66	23	94	28.1	0.167	21
4	0	137	40	35	168	43.1	2.288	33
...	...	...	...	...	...	...	...	...
763	10	101	76	48	180	32.9	0.171	63
764	2	122	70	27	0	36.8	0.340	27
765	5	121	72	23	112	26.2	0.245	30
766	1	126	60	0	0	30.1	0.349	47
767	1	93	70	31	0	30.4	0.315	23

(1) Import libraries

(2) Get data

(3) Setup model object

(4) Fit model

(5) Generate predictions & test results

(6) Get best model