import pandas as pd
from pycaret.datasets import get_data
import sweetviz as sv
# Print the installed sweetviz version so it is captured alongside the results
from importlib.metadata import version
print(f"sweetviz=={version('sweetviz')}")
sweetviz==2.1.3
# Load one of the datasets bundled with pycaret to use as test data
df = get_data(dataset="diabetes", verbose=True)
 | Number of times pregnant | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Diastolic blood pressure (mm Hg) | Triceps skin fold thickness (mm) | 2-Hour serum insulin (mu U/ml) | Body mass index (weight in kg/(height in m)^2) | Diabetes pedigree function | Age (years) | Class variable |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
# Build the EDA report for the dataframe: univariate distributions,
# target-vs-feature relationships, correlations and missing values
# NOTE - what sets sweetviz apart from pandas-profiling is that it can
#        visualize the target variable against every other variable
# NOTE - currently only numerical and boolean features can be used as targets
report = sv.analyze(df, target_feat="Class variable")
# Render the report inline in the Jupyter notebook
report.show_notebook()
# Compare 2 sub-populations of the same dataframe, split by a boolean condition
# NOTE - use compare() to compare 2 dataframes (e.g. train/test df)
# NOTE - names[0] labels the rows where the condition is True and names[1] the
#        rows where it is False; the condition selects more than 7 pregnancies,
#        so "More pregnancies" has to come first
compare_report = sv.compare_intra(
    source_df=df,
    condition_series=(df["Number of times pregnant"] > 7),
    names=["More pregnancies", "Less pregnancies"],
    target_feat='Class variable'
)
# Display report in Jupyter notebook
compare_report.show_notebook()
# Write the single-dataframe analysis report out as an HTML file
report.show_html(filepath="sweetviz_report.html")