import sklearn.datasets
Explore Data
show_correlation
show_correlation (df:pandas.core.frame.DataFrame, method='pearson', cmap:str|Colormap='PuBu', low:float=0, high:float=0, axis:Axis|None=0, subset:Subset|None=None, text_color_threshold:float=0.408, vmin:float|None=None, vmax:float|None=None, gmap:Sequence|None=None)
Show correlation heatmap
If output is not rendering properly when you reopen a notebook, make sure the notebook is trusted.
Parameters:
df
: DataFrame
method
: Method of correlation to pass to df.corr()
Remaining parameters are passed to pandas.io.formats.style.background_gradient.
X_diabetes, y_diabetes = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True)
show_correlation(pd.concat((X_diabetes, y_diabetes), axis="columns"))
| | age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target |
---|---|---|---|---|---|---|---|---|---|---|---|
age | 1.00 | 0.17 | 0.19 | 0.34 | 0.26 | 0.22 | -0.08 | 0.20 | 0.27 | 0.30 | 0.19 |
sex | 0.17 | 1.00 | 0.09 | 0.24 | 0.04 | 0.14 | -0.38 | 0.33 | 0.15 | 0.21 | 0.04 |
bmi | 0.19 | 0.09 | 1.00 | 0.40 | 0.25 | 0.26 | -0.37 | 0.41 | 0.45 | 0.39 | 0.59 |
bp | 0.34 | 0.24 | 0.40 | 1.00 | 0.24 | 0.19 | -0.18 | 0.26 | 0.39 | 0.39 | 0.44 |
s1 | 0.26 | 0.04 | 0.25 | 0.24 | 1.00 | 0.90 | 0.05 | 0.54 | 0.52 | 0.33 | 0.21 |
s2 | 0.22 | 0.14 | 0.26 | 0.19 | 0.90 | 1.00 | -0.20 | 0.66 | 0.32 | 0.29 | 0.17 |
s3 | -0.08 | -0.38 | -0.37 | -0.18 | 0.05 | -0.20 | 1.00 | -0.74 | -0.40 | -0.27 | -0.39 |
s4 | 0.20 | 0.33 | 0.41 | 0.26 | 0.54 | 0.66 | -0.74 | 1.00 | 0.62 | 0.42 | 0.43 |
s5 | 0.27 | 0.15 | 0.45 | 0.39 | 0.52 | 0.32 | -0.40 | 0.62 | 1.00 | 0.46 | 0.57 |
s6 | 0.30 | 0.21 | 0.39 | 0.39 | 0.33 | 0.29 | -0.27 | 0.42 | 0.46 | 1.00 | 0.38 |
target | 0.19 | 0.04 | 0.59 | 0.44 | 0.21 | 0.17 | -0.39 | 0.43 | 0.57 | 0.38 | 1.00 |
iris = sklearn.datasets.load_iris()
X_iris, y_iris = iris["data"], iris["target"]
y_iris = pd.Series(y_iris, name="iris type").map(
    {num: name for num, name in zip([0, 1, 2], iris["target_names"])}
)
X_iris = pd.DataFrame(X_iris, columns=iris["feature_names"])
show_correlation(
    pd.concat((X_iris, pd.Series(y_iris == "setosa", name="setosa"))), method="spearman"
)
/var/folders/wv/pmfhhk1d4h1fkd_z5l83m0dw0000gq/T/ipykernel_96080/529445965.py:15: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
return df.corr(method=method).style.background_gradient(**kwargs).format("{0:,.2f}")
| | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
---|---|---|---|---|
sepal length (cm) | 1.00 | -0.17 | 0.88 | 0.83 |
sepal width (cm) | -0.17 | 1.00 | -0.31 | -0.29 |
petal length (cm) | 0.88 | -0.31 | 1.00 | 0.94 |
petal width (cm) | 0.83 | -0.29 | 0.94 | 1.00 |
plot_column_clusters
plot_column_clusters (df, corr_method:str='spearman', ax:matplotlib.axes._axes.Axes=None, p=30, truncate_mode=None, color_threshold=None, get_leaves=True, orientation='top', labels=None, count_sort=False, distance_sort=False, show_leaf_counts=True, no_plot=False, no_labels=False, leaf_font_size=None, leaf_rotation=None, leaf_label_func=None, show_contracted=False, link_color_func=None, above_threshold_color='C0')
Plot a dendrogram based on column correlations
If output is not rendering properly when you reopen a notebook, make sure the notebook is trusted.
Adapted from https://github.com/fastai/book_nbs/blob/master/utils.py#L58-L64
Parameters:
df
: DataFrame
corr_method
: Method of correlation to pass to df.corr()
ax
: Matplotlib Axes object. Plot will be added to this object if provided; otherwise a new Axes object will be generated.
Remaining parameters are passed to scipy.cluster.hierarchy.dendrogram.
ax = plot_column_clusters(X_iris)