import pandas as pd
from pycaret.datasets import get_data
from sklearn.model_selection import train_test_split
from dataprep.eda import create_report, plot, plot_correlation, plot_missing, plot_diff
# Log the installed version of the key library so the run can be reproduced
from importlib.metadata import version
print(f"dataprep=={version('dataprep')}")
dataprep==0.3.0
# Load PyCaret's pre-packaged 'diabetes' sample dataset to use as test data
# (verbose=True also displays a preview of the first rows)
df = get_data('diabetes', verbose=True)
| | Number of times pregnant | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Diastolic blood pressure (mm Hg) | Triceps skin fold thickness (mm) | 2-Hour serum insulin (mu U/ml) | Body mass index (weight in kg/(height in m)^2) | Diabetes pedigree function | Age (years) | Class variable |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
# Randomly split the dataframe into two parts (70% / 30% of the rows);
# random_state is fixed so the split is reproducible.
# NOTE: despite their names, df_x and df_y are two row subsets of df
# (all columns kept), not a features/labels pair.
df_x, df_y = train_test_split(df, test_size=0.3, random_state=6)
# Generate an EDA report for the full dataframe and display it inline in the Jupyter notebook
# NOTE - report.show_browser() does not work in my setup
report = create_report(df, title='Diabetes Data')
report.show()
# Optional: save the report to an HTML file
# report.save("dataprep_report")
Number of Variables | 9 |
---|---|
Number of Rows | 768 |
Missing Cells | 0 |
Missing Cells (%) | 0.0% |
Duplicate Rows | 0 |
Duplicate Rows (%) | 0.0% |
Total Size in Memory | 54.1 KB |
Average Row Size in Memory | 72.2 B |
Variable Types |
|
Number of times pregnant is skewed | Skewed |
---|---|
Triceps skin fold thickness (mm) is skewed | Skewed |
2-Hour serum insulin (mu U/ml) is skewed | Skewed |
Age (years) is skewed | Skewed |
Class variable has constant length 1 | Constant Length |
Number of times pregnant has 111 (14.45%) zeros | Zeros |
Triceps skin fold thickness (mm) has 227 (29.56%) zeros | Zeros |
2-Hour serum insulin (mu U/ml) has 374 (48.7%) zeros | Zeros |
numerical
Approximate Distinct Count | 17 |
---|---|
Approximate Unique (%) | 2.2% |
Missing | 0 |
Missing (%) | 0.0% |
Infinite | 0 |
Infinite (%) | 0.0% |
Memory Size | 12.0 KB |
Mean | 3.8451 |
Minimum | 0 |
Maximum | 17 |
Zeros | 111 |
Zeros (%) | 14.4% |
Negatives | 0 |
Negatives (%) | 0.0% |
Minimum | 0 |
---|---|
5-th Percentile | 0 |
Q1 | 1 |
Median | 3 |
Q3 | 6 |
95-th Percentile | 10 |
Maximum | 17 |
Range | 17 |
IQR | 5 |
Mean | 3.8451 |
---|---|
Standard Deviation | 3.3696 |
Variance | 11.3541 |
Sum | 2953 |
Skewness | 0.8999 |
Kurtosis | 0.1504 |
Coefficient of Variation | 0.8763 |
numerical
Approximate Distinct Count | 136 |
---|---|
Approximate Unique (%) | 17.7% |
Missing | 0 |
Missing (%) | 0.0% |
Infinite | 0 |
Infinite (%) | 0.0% |
Memory Size | 12.0 KB |
Mean | 120.8945 |
Minimum | 0 |
Maximum | 199 |
Zeros | 5 |
Zeros (%) | 0.7% |
Negatives | 0 |
Negatives (%) | 0.0% |
Minimum | 0 |
---|---|
5-th Percentile | 79 |
Q1 | 99 |
Median | 117 |
Q3 | 140.25 |
95-th Percentile | 181 |
Maximum | 199 |
Range | 199 |
IQR | 41.25 |
Mean | 120.8945 |
---|---|
Standard Deviation | 31.9726 |
Variance | 1022.2483 |
Sum | 92847 |
Skewness | 0.1734 |
Kurtosis | 0.6288 |
Coefficient of Variation | 0.2645 |
numerical
Approximate Distinct Count | 47 |
---|---|
Approximate Unique (%) | 6.1% |
Missing | 0 |
Missing (%) | 0.0% |
Infinite | 0 |
Infinite (%) | 0.0% |
Memory Size | 12.0 KB |
Mean | 69.1055 |
Minimum | 0 |
Maximum | 122 |
Zeros | 35 |
Zeros (%) | 4.6% |
Negatives | 0 |
Negatives (%) | 0.0% |
Minimum | 0 |
---|---|
5-th Percentile | 38.7 |
Q1 | 62 |
Median | 72 |
Q3 | 80 |
95-th Percentile | 90 |
Maximum | 122 |
Range | 122 |
IQR | 18 |
Mean | 69.1055 |
---|---|
Standard Deviation | 19.3558 |
Variance | 374.6473 |
Sum | 53073 |
Skewness | -1.84 |
Kurtosis | 5.1387 |
Coefficient of Variation | 0.2801 |
numerical
Approximate Distinct Count | 51 |
---|---|
Approximate Unique (%) | 6.6% |
Missing | 0 |
Missing (%) | 0.0% |
Infinite | 0 |
Infinite (%) | 0.0% |
Memory Size | 12.0 KB |
Mean | 20.5365 |
Minimum | 0 |
Maximum | 99 |
Zeros | 227 |
Zeros (%) | 29.6% |
Negatives | 0 |
Negatives (%) | 0.0% |
Minimum | 0 |
---|---|
5-th Percentile | 0 |
Q1 | 0 |
Median | 23 |
Q3 | 32 |
95-th Percentile | 44 |
Maximum | 99 |
Range | 99 |
IQR | 32 |
Mean | 20.5365 |
---|---|
Standard Deviation | 15.9522 |
Variance | 254.4732 |
Sum | 15772 |
Skewness | 0.1092 |
Kurtosis | -0.5245 |
Coefficient of Variation | 0.7768 |
numerical
Approximate Distinct Count | 186 |
---|---|
Approximate Unique (%) | 24.2% |
Missing | 0 |
Missing (%) | 0.0% |
Infinite | 0 |
Infinite (%) | 0.0% |
Memory Size | 12.0 KB |
Mean | 79.7995 |
Minimum | 0 |
Maximum | 846 |
Zeros | 374 |
Zeros (%) | 48.7% |
Negatives | 0 |
Negatives (%) | 0.0% |
Minimum | 0 |
---|---|
5-th Percentile | 0 |
Q1 | 0 |
Median | 30.5 |
Q3 | 127.25 |
95-th Percentile | 293 |
Maximum | 846 |
Range | 846 |
IQR | 127.25 |
Mean | 79.7995 |
---|---|
Standard Deviation | 115.244 |
Variance | 13281.1801 |
Sum | 61286 |
Skewness | 2.2678 |
Kurtosis | 7.1596 |
Coefficient of Variation | 1.4442 |
numerical
Approximate Distinct Count | 248 |
---|---|
Approximate Unique (%) | 32.3% |
Missing | 0 |
Missing (%) | 0.0% |
Infinite | 0 |
Infinite (%) | 0.0% |
Memory Size | 12.0 KB |
Mean | 31.9926 |
Minimum | 0 |
Maximum | 67.1 |
Zeros | 11 |
Zeros (%) | 1.4% |
Negatives | 0 |
Negatives (%) | 0.0% |
Minimum | 0 |
---|---|
5-th Percentile | 21.8 |
Q1 | 27.3 |
Median | 32 |
Q3 | 36.6 |
95-th Percentile | 44.395 |
Maximum | 67.1 |
Range | 67.1 |
IQR | 9.3 |
Mean | 31.9926 |
---|---|
Standard Deviation | 7.8842 |
Variance | 62.16 |
Sum | 24570.3 |
Skewness | -0.4281 |
Kurtosis | 3.2613 |
Coefficient of Variation | 0.2464 |
numerical
Approximate Distinct Count | 517 |
---|---|
Approximate Unique (%) | 67.3% |
Missing | 0 |
Missing (%) | 0.0% |
Infinite | 0 |
Infinite (%) | 0.0% |
Memory Size | 12.0 KB |
Mean | 0.4719 |
Minimum | 0.078 |
Maximum | 2.42 |
Zeros | 0 |
Zeros (%) | 0.0% |
Negatives | 0 |
Negatives (%) | 0.0% |
Minimum | 0.078 |
---|---|
5-th Percentile | 0.1404 |
Q1 | 0.2437 |
Median | 0.3725 |
Q3 | 0.6262 |
95-th Percentile | 1.1328 |
Maximum | 2.42 |
Range | 2.342 |
IQR | 0.3825 |
Mean | 0.4719 |
---|---|
Standard Deviation | 0.3313 |
Variance | 0.1098 |
Sum | 362.401 |
Skewness | 1.9162 |
Kurtosis | 5.5508 |
Coefficient of Variation | 0.7022 |
numerical
Approximate Distinct Count | 52 |
---|---|
Approximate Unique (%) | 6.8% |
Missing | 0 |
Missing (%) | 0.0% |
Infinite | 0 |
Infinite (%) | 0.0% |
Memory Size | 12.0 KB |
Mean | 33.2409 |
Minimum | 21 |
Maximum | 81 |
Zeros | 0 |
Zeros (%) | 0.0% |
Negatives | 0 |
Negatives (%) | 0.0% |
Minimum | 21 |
---|---|
5-th Percentile | 21 |
Q1 | 24 |
Median | 29 |
Q3 | 41 |
95-th Percentile | 58 |
Maximum | 81 |
Range | 60 |
IQR | 17 |
Mean | 33.2409 |
---|---|
Standard Deviation | 11.7602 |
Variance | 138.303 |
Sum | 25529 |
Skewness | 1.1274 |
Kurtosis | 0.6312 |
Coefficient of Variation | 0.3538 |
categorical
Approximate Distinct Count | 2 |
---|---|
Approximate Unique (%) | 0.3% |
Missing | 0 |
Missing (%) | 0.0% |
Memory Size | 49.5 KB |
Mean | 1 |
---|---|
Standard Deviation | 0 |
Median | 1 |
Minimum | 1 |
Maximum | 1 |
1st row | 1 |
---|---|
2nd row | 0 |
3rd row | 1 |
4th row | 0 |
5th row | 1 |
Count | 0 |
---|---|
Lowercase Letter | 0 |
Space Separator | 0 |
Uppercase Letter | 0 |
Dash Punctuation | 0 |
Decimal Number | 768 |
# Investigate a single column of interest
# Also supports visualising text and maps
plot(df, "Class variable")
Approximate Distinct Count | 2 |
---|---|
Approximate Unique (%) | 0.3% |
Missing | 0 |
Missing (%) | 0.0% |
Memory Size | 49.5 KB |
Mean | 1 |
---|---|
Standard Deviation | 0 |
Median | 1 |
Minimum | 1 |
Maximum | 1 |
1st row | 1 |
---|---|
2nd row | 0 |
3rd row | 1 |
4th row | 0 |
5th row | 1 |
Count | 0 |
---|---|
Lowercase Letter | 0 |
Space Separator | 0 |
Uppercase Letter | 0 |
Dash Punctuation | 0 |
Decimal Number | 768 |
Value | Count | Frequency (%) |
0 | 500 | |
1 | 268 |
# Investigate the relationship between two columns of interest
# Also supports visualising maps
plot(df, "Class variable", "Age (years)")
# Investigate correlations across the whole dataframe, with a summary table
# (Pearson, Spearman and KendallTau)
plot_correlation(df)
| | Pearson | Spearman | KendallTau |
---|---|---|---|
Highest Positive Correlation | 0.544 | 0.607 | 0.458 |
Highest Negative Correlation | -0.114 | -0.127 | -0.096 |
Lowest Correlation | 0.018 | 0.0 | 0.004 |
Mean Correlation | 0.143 | 0.15 | 0.113 |
# Investigate the correlation between two columns of interest, with a scatter plot & regression line
plot_correlation(df, "Class variable", "Age (years)")
# Investigate the impact that removing rows with missing values in one column has on the other variables
# NOTE: the EDA report above lists 0 missing cells for this dataset, so little effect is expected here
plot_missing(df, "Age (years)")
# Compare two dataframes that share the same set of columns/variables
# (here the two random splits of df, passed as a list)
plot_diff([df_x, df_y])
| | df1 | df2 |
---|---|---|
Number of Variables | 9 | 9 |
Number of Rows | 537 | 231 |
Missing Cells | 0 | 0 |
Missing Cells (%) | 0.0% | 0.0% |
Duplicate Rows | 0 | 0 |
Duplicate Rows (%) | 0.0% | 0.0% |
Total Size in Memory | 58.1 KB | 26.1 KB |
Average Row Size in Memory | 57.6 KB | 25.9 KB |
Variable Types |
|
|