# Source code for kdiagram.utils.hist

from typing import Optional, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from .handlers import columns_manager
from .plot import (
    add_histogram_to_plot,
    add_kde_to_plot,
    normalize_pdf,
    prepare_data_for_kde,
    set_axis_grid,
    setup_plot_axes,
)


def plot_hist_kde(
    data: Union[np.ndarray, pd.Series, pd.DataFrame],
    column: Optional[str] = None,
    *,
    bins: int = 50,
    x_label: Optional[str] = None,
    title: str = "Distribution (Histogram + KDE)",
    bandwidth: Optional[float] = None,
    show_kde: bool = True,
    savefig: Optional[str] = None,
    dpi: int = 300,
    figsize: tuple[float, float] = (8, 6),
    kde_color: str = "orange",
    hist_color: str = "skyblue",
    hist_edge_color: str = "white",
    kde_line_width: float = 2,
    hist_alpha: float = 0.7,
    normalize_kde: bool = False,
    show_grid: bool = True,
    grid_props: Optional[dict] = None,
    return_ax: bool = False,
    **hist_kws,  # Extra keywords forwarded to the histogram helper
) -> Union["plt.Axes", tuple[np.ndarray, np.ndarray]]:
    # NOTE: the user-facing docstring is attached right after this
    # definition through ``plot_hist_kde.__doc__``; the comments below only
    # describe the internal steps.
    #
    # Returns ``ax`` when ``return_ax`` is True, otherwise ``(grid, pdf)``.
    # Raises ValueError when a DataFrame is given without a usable ``column``.

    # Reduce pandas inputs to a plain array, remembering the series name so
    # it can serve as the default x-axis label.
    if isinstance(data, (pd.DataFrame, pd.Series)):
        if isinstance(data, pd.DataFrame):
            columns = columns_manager(column, empty_as_none=True)
            if columns is None:
                raise ValueError(
                    "If a DataFrame is provided, the 'column' "
                    "parameter must specify the series to plot."
                )
            # Only the first resolved column is plotted.
            column = columns[0]
            series_data = data[column]
        else:
            series_data = data

        # Auto-set x_label from the series name if the user gave none.
        # Compare against None (not truthiness) so that valid-but-falsy
        # names such as the integer column ``0`` are still used; an empty
        # string name still falls through to the generic default below.
        series_name = series_data.name
        if x_label is None and series_name is not None:
            name_text = str(series_name)
            if name_text:
                x_label = name_text

        data = series_data.values

    # Generic label when neither the caller nor the series supplied one.
    if x_label is None:
        x_label = "Value"

    data = np.asarray(data)

    # The KDE is always evaluated because (grid, pdf) is part of the return
    # value even when the curve itself is not drawn.
    grid, pdf = prepare_data_for_kde(data, bandwidth=bandwidth)
    if normalize_kde:
        pdf = normalize_pdf(pdf)

    # Create the axes and configure the grid before drawing anything.
    ax = setup_plot_axes(figsize=figsize, title=title, x_label=x_label)
    set_axis_grid(ax, show_grid=show_grid, grid_props=grid_props)

    add_histogram_to_plot(
        data,
        ax,
        bins=bins,
        hist_color=hist_color,
        hist_edge_color=hist_edge_color,
        hist_alpha=hist_alpha,
        **hist_kws,
    )

    # Overlay the KDE curve only when requested.
    if show_kde:
        add_kde_to_plot(
            grid,
            pdf,
            ax,
            color=kde_color,
            line_width=kde_line_width,
        )

    # Axis labels; the legend is added last so it sees every plotted artist.
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel("Density", fontsize=12)
    ax.legend()

    # Either write the figure to disk (and close it) or show it.
    if savefig:
        plt.savefig(savefig, dpi=dpi, bbox_inches="tight")
        plt.close()
    else:
        plt.show()

    if return_ax:
        return ax

    return grid, pdf
plot_hist_kde.__doc__ = r"""
Plot histogram and Kernel Density Estimate (KDE) for uncertainty evaluation.

This function combines a histogram and a Kernel Density Estimate (KDE)
to visualize the distribution of the provided data. It allows users to
evaluate the uncertainty in predictions by plotting the histogram of the
data along with an optional KDE to estimate the probability density
function.

Parameters
----------
data : Union[np.ndarray, pd.Series, pd.DataFrame]
    The data to be plotted. This can be a numpy array, a pandas Series,
    or a pandas DataFrame. If a DataFrame is provided, the 'column'
    parameter must be specified to select the column to plot.

column : Optional[str], default=None
    The name of the column to plot if the input data is a DataFrame.
    If data is a Series or an array, this parameter is ignored.

bins : int, default=50
    The number of bins to use in the histogram.

x_label : Optional[str], default=None
    The label for the x-axis. If None, the name of the plotted Series
    (or DataFrame column) is used when available; otherwise the label
    falls back to 'Value'.

title : str, default='Distribution (Histogram + KDE)'
    The title of the plot.

bandwidth : Optional[float], default=None
    The bandwidth for the Kernel Density Estimate. If None, the bandwidth
    will be estimated using Silverman's rule of thumb.

show_kde : bool, default=True
    Whether or not to display the KDE on the plot. If False, only the
    histogram will be plotted.

savefig : Optional[str], default=None
    The file path where the plot will be saved. When given, the figure
    is saved and then closed. If None, the plot will be displayed on the
    screen.

dpi : int, default=300
    The resolution of the saved plot (dots per inch) when savefig is
    specified.

figsize : Tuple[float, float], default=(8, 6)
    The size of the plot in inches.

kde_color : str, default='orange'
    The color of the KDE line.

hist_color : str, default='skyblue'
    The color of the histogram bars.

hist_edge_color : str, default='white'
    The color of the edges of the histogram bars.

kde_line_width : float, default=2
    The line width of the KDE line.

hist_alpha : float, default=0.7
    The transparency level of the histogram bars. A value between 0
    and 1.

normalize_kde : bool, default=False
    If True, the KDE will be normalized so that the maximum value is 1.

show_grid : bool, default=True
    Whether or not to display a grid on the plot.

grid_props : Optional[dict], default=None
    A dictionary of grid properties. If provided, these will be applied
    to customize the grid appearance. By default, a dotted grid with 0.7
    alpha is used.

return_ax : bool, default=False
    If True, return the axes the plot was drawn on instead of the
    ``(grid, pdf)`` tuple.

**hist_kws : additional keyword arguments
    Additional keyword arguments forwarded to the histogram drawing
    helper to customize the histogram.

Returns
-------
grid : np.ndarray
    The x-values grid for the KDE evaluation. Returned together with
    ``pdf`` when ``return_ax`` is False (the default).

pdf : np.ndarray
    The estimated probability density function (PDF) values computed
    from the KDE.

ax : matplotlib.axes.Axes
    The axes containing the plot. Returned *instead of* ``(grid, pdf)``
    when ``return_ax`` is True.

Raises
------
ValueError
    If ``data`` is a DataFrame and ``column`` does not specify the
    series to plot.

Notes
-----
- The function estimates the KDE using a Gaussian kernel with a specified
  or automatically calculated bandwidth.
- The KDE can be normalized to fit the range [0, 1], which is useful for
  comparison purposes, especially when overlaid with histograms.
- The function automatically handles different input data types, such as
  pandas DataFrames, Series, or numpy arrays.
- The y-axis is always labelled 'Density'.

Examples
--------
>>> import numpy as np
>>> from kdiagram.utils import plot_hist_kde
>>> data = np.random.normal(0, 1, 1000)
>>> plot_hist_kde(data, bins=30, kde_color='blue')

>>> import pandas as pd
>>> df = pd.DataFrame({'values': np.random.normal(0, 1, 1000)})
>>> plot_hist_kde(df, column='values', bins=30, show_kde=True)

>>> plot_hist_kde(data, bins=30, title="Histogram with KDE",
...               savefig="output.png")

See Also
--------
scipy.stats.gaussian_kde : For the Kernel Density Estimate implementation.
matplotlib.pyplot.hist : For plotting histograms in matplotlib.
pandas.Series.hist : For creating histograms from pandas Series.

References
----------
.. [1] Silverman, B. W. (1986). *Density Estimation for Statistics and
       Data Analysis*. CRC Press.
.. [2] Scott, D. W. (2015). *Multivariate Density Estimation: Theory,
       Practice, and Visualization*. Wiley-Interscience.
"""