## (1) Import libraries

In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from pycaret.datasets import get_data
import sklearn.model_selection

import autokeras as ak

In [7]:
# Log the installed version of the key library so this run can be reproduced
from importlib.metadata import version

autokeras_version = version('autokeras')
print(f'autokeras=={autokeras_version}')

autokeras==1.0.15


## (2) Get data

In [8]:
# Tip: get_data('index') lists every pre-packaged dataset that is available.

# Load the pre-packaged 'diabetes' dataset used for this test
data = get_data(dataset='diabetes')

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [15]:
# Split data into features (X) and target (y).
# The target is taken positionally: it is the last column of the frame,
# everything before it is a feature.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Specify types of features - can only be either categorical or numerical.
# One entry is given per feature column. Every feature in this dataset is
# declared numerical, so the mapping is derived from X itself instead of being
# typed out by hand. That keeps the keys in step with the column names that
# are later passed to the classifier via X.columns.
column_types = {column: 'numerical' for column in X.columns}

# Split data into training and testing data.
# random_state pins the shuffle so the same split is produced on every run;
# the train/test proportions are left at the library default.
# NOTE(review): the split is not stratified on y - consider stratify=y if the
# class balance matters for the evaluation.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=6
)

# Sanity checks: no rows were lost or duplicated by the split, and features
# and labels stayed aligned.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

# Show the full feature frame and target series for a visual check
display(X, y)

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years)
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Class variable, Length: 768, dtype: int64

## (3) Setup model object

In [17]:
# Configure the AutoKeras search for a structured-data (tabular) classifier.
# No loss is passed: it defaults to 'binary_crossentropy' or
# 'categorical_crossentropy' based on the number of classes.
automl = ak.StructuredDataClassifier(
    # Description of the input features
    column_names=list(X.columns),
    column_types=column_types,
    # Task definition: a single label per row
    multi_label=False,
    # Search settings: number of trials to run and the seed for the search
    max_trials=100,
    seed=6,
    # Where the search is stored, and whether an existing project is overwritten
    project_name='diabetes',
    overwrite=True,
)

## (4) Fit model

In [18]:
# Run the architecture search on the training split and keep the best model
automl.fit(X_train, y_train)

Trial 100 Complete [00h 00m 03s]
val_accuracy: 0.8229166865348816

Best val_accuracy So Far: 0.8333333134651184
Total elapsed time: 00h 18m 24s
INFO:tensorflow:Oracle triggered exit
Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13
INFO:tensorflow:Assets written to: ./diabetes/best_model/assets
<autokeras.tasks.structured_data.StructuredDataClassifier object at 0x7f92ffdca640>


## (5) Generate predictions & test results

In [23]:
# Score the held-out rows with the best model found by the search
predicted_y = automl.predict(X_test)

# Report loss and accuracy on both splits; comparing the two gives a quick
# check for over-fitting
train_metrics = automl.evaluate(x=X_train, y=y_train)
print("Train loss and accuracy:", train_metrics)
test_metrics = automl.evaluate(x=X_test, y=y_test)
print("Test loss and accuracy:", test_metrics)

Train loss and accuracy: [0.40956196188926697, 0.8003472089767456]
Test loss and accuracy: [0.4726644456386566, 0.8020833134651184]


## (6) Get best model

In [31]:
# Extract the best model found by the search. This returns the model as a
# standard tensorflow object, so it can be used without the AutoKeras wrapper.
best_model = automl.export_model()

# Check model structure: prints each layer with its output shape and
# parameter count
best_model.summary()

In [32]:
# Save the exported model to the path 'best_model', relative to the current
# working directory, so it can be reloaded later without repeating the search.
# NOTE(review): no save format is specified, so the library default is used -
# confirm which format that is for the installed TensorFlow version.
best_model.save('best_model')

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
multi_category_encoding (Mul (None, 8)                 0         
_________________________________________________________________
normalization (Normalization (None, 8)                 17        
_________________________________________________________________
dense (Dense)                (None, 512)               4608      
_________________________________________________________________
re_lu (ReLU)                 (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                16416 