import pandas as pd
from pycaret.datasets import get_data

import sweetviz as sv


# Record version of key libraries
from importlib.metadata import version

# Emit the installed sweetviz version as a pip-style "name==version" pin
print(f"sweetviz=={version('sweetviz')}")

# Output: sweetviz==2.1.3


# Load one of the pre-packaged sample datasets to experiment with
dataset_name = 'diabetes'
df = get_data(dataset_name, verbose=True)


# Build the EDA report for the data: univariate distributions, the relationship
# between the target and each single variable, correlations and missing values.
# NOTE - what sets sweetviz apart from pandas-profiling is that it can visualize the target variable against the other variables
# NOTE - currently only numerical and boolean features can be used as targets.
report = sv.analyze(df, target_feat='Class variable')


# Render the report inline in the Jupyter notebook
report.show_notebook()


# Compare 2 sub-populations of the same dataframe, split by a boolean condition
# NOTE - use compare() to compare 2 dataframes (e.g. train/test df)
# NOTE - compare_intra() assigns names[0] to the rows where the condition is
#        True and names[1] to the remaining rows, so the "> 7" group has to be
#        labeled first; the previous label order had the two groups swapped.
compare_report = sv.compare_intra(
        source_df=df,
        condition_series=(df["Number of times pregnant"] > 7),
        names=["More pregnancies", "Less pregnancies"],
        target_feat='Class variable'
    )


# Display report in Jupyter notebook
compare_report.show_notebook()


# Write the single-dataframe EDA report out as a standalone HTML file
# NOTE(review): show_html() may also open the generated file in a browser by
# default - confirm against the installed sweetviz version if only saving is wanted.
report_path = "sweetviz_report.html"
report.show_html(report_path)

	Number of times pregnant	Plasma glucose concentration a 2 hours in an oral glucose tolerance test	Diastolic blood pressure (mm Hg)	Triceps skin fold thickness (mm)	2-Hour serum insulin (mu U/ml)	Body mass index (weight in kg/(height in m)^2)	Diabetes pedigree function	Age (years)	Class variable
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

(1) Import libraries

(2) Get data

(3) Perform EDA

(4) Comparing two (sub-)dataframes

(5) Save report