Source code for s2spy.preprocess

"""Preprocessor for s2spy workflow."""
import warnings
from typing import Literal
from typing import Union
import numpy as np
import scipy.stats
import xarray as xr


[docs] def _linregress(x: np.ndarray, y: np.ndarray) -> tuple[float, float]: """Calculate the slope and intercept between two arrays using scipy's linregress. Used to make linregress more ufunc-friendly. Args: x: First array. y: Second array. Returns: slope, intercept """ slope, intercept, _, _, _ = scipy.stats.linregress(x, y) return slope, intercept
def _trend_linear(data: Union[xr.DataArray, xr.Dataset]) -> dict:
    """Fit a linear trend along the ``time`` dimension of the data.

    The ``time`` coordinate is cast to float and used as the independent
    variable; the regression is applied along ``time`` and vectorized over
    all remaining dimensions.

    Args:
        data: The input data for which the trend is computed. Must have a
            ``time`` coordinate.

    Returns:
        Dictionary with the keys ``"slope"`` and ``"intercept"`` holding the
        fitted coefficients (the ``time`` dimension is reduced away).
    """
    time_as_float = data["time"].astype(float)
    slope, intercept = xr.apply_ufunc(
        _linregress,
        time_as_float,
        data,
        input_core_dims=[["time"], ["time"]],
        output_core_dims=[[], []],
        vectorize=True,
    )
    return dict(slope=slope, intercept=intercept)
def _subtract_linear_trend(data: Union[xr.DataArray, xr.Dataset], trend: dict):
    """Remove a previously calculated linear trend from (new) data.

    Args:
        data: Data to detrend. Its ``time`` coordinate, cast to float, is the
            independent variable at which the trend line is evaluated.
        trend: Dictionary with the keys ``"slope"`` and ``"intercept"``.

    Returns:
        ``data`` minus the intercept and minus ``slope * time``.
    """
    time_as_float = data["time"].astype(float)
    # Subtract in two steps (intercept first, then the slope term) so the
    # arithmetic order stays exactly: data - intercept - slope * time.
    without_offset = data - trend["intercept"]
    return without_offset - trend["slope"] * time_as_float
def _get_trend(data: Union[xr.DataArray, xr.Dataset], method: str):
    """Calculate the trend of the data with the requested method.

    Only the ``"linear"`` method is implemented.

    Args:
        data: The input data for which the trend is computed.
        method: Name of the detrending method.

    Returns:
        The trend information produced by the selected method (for
        ``"linear"``: the result of ``_trend_linear``).

    Raises:
        ValueError: If ``method`` is not a known detrending method.
    """
    if method == "linear":
        return _trend_linear(data)
    raise ValueError(f"Unknown detrending method '{method}'")
def _subtract_trend(data: Union[xr.DataArray, xr.Dataset], method: str, trend: dict):
    """Subtract a previously calculated trend from (new) data.

    Only the ``"linear"`` method is implemented.

    Args:
        data: Data from which the trend is removed.
        method: Name of the detrending method the trend was computed with.
        trend: Trend information belonging to ``method``.

    Returns:
        The detrended data.

    Raises:
        NotImplementedError: If ``method`` is anything other than ``"linear"``.
    """
    if method == "linear":
        return _subtract_linear_trend(data, trend)
    # Name the offending method so the failure is diagnosable.
    raise NotImplementedError(
        f"Subtracting a trend is not implemented for method '{method}'"
    )
def _get_climatology(
    data: Union[xr.Dataset, xr.DataArray],
    timescale: Literal["monthly", "weekly", "daily"],
):
    """Compute the climatology (per-period mean over time) of timeseries data.

    The data is grouped by calendar month, ISO week number or day of year,
    depending on ``timescale``, and averaged over ``time`` within each group.
    Before grouping, the temporal resolution of the data is compared with
    ``timescale``.

    Args:
        data: Timeseries data with a ``time`` coordinate.
        timescale: Temporal resolution that selects the grouping.

    Returns:
        The mean of ``data`` for every group.

    Raises:
        ValueError: If ``timescale`` is not one of the supported values.
    """
    _check_data_resolution_match(data, timescale)

    if timescale == "monthly":
        grouped = data.groupby("time.month")
    elif timescale == "weekly":
        grouped = data.groupby(data["time"].dt.isocalendar().week)
    elif timescale == "daily":
        grouped = data.groupby("time.dayofyear")
    else:
        raise ValueError("Given timescale is not supported.")

    return grouped.mean("time")
def _subtract_climatology(
    data: Union[xr.Dataset, xr.DataArray],
    timescale: Literal["monthly", "weekly", "daily"],
    climatology: Union[xr.Dataset, xr.DataArray],
):
    """Remove a climatology from timeseries data.

    The data is grouped by calendar month, ISO week number or day of year,
    depending on ``timescale``, and ``climatology`` is subtracted from the
    grouped data.

    Args:
        data: Timeseries data with a ``time`` coordinate.
        timescale: Temporal resolution that selects the grouping.
        climatology: Climatology to subtract from the grouped data.

    Returns:
        The deseasonalized data.

    Raises:
        ValueError: If ``timescale`` is not one of the supported values.
    """
    if timescale == "monthly":
        grouped = data.groupby("time.month")
    elif timescale == "weekly":
        grouped = data.groupby(data["time"].dt.isocalendar().week)
    elif timescale == "daily":
        grouped = data.groupby("time.dayofyear")
    else:
        raise ValueError("Given timescale is not supported.")

    return grouped - climatology
def _check_input_data(data: Union[xr.DataArray, xr.Dataset]):
    """Validate that the input can be handled by the preprocessor.

    Args:
        data: Data to validate.

    Raises:
        ValueError: If the input is neither an xarray DataArray nor an xarray
            Dataset.
        ValueError: If the input has no 'time' dimension.
    """
    accepted_types = (xr.DataArray, xr.Dataset)
    if not isinstance(data, accepted_types):
        raise ValueError(
            "Input data has to be an xarray-DataArray or xarray-Dataset, "
            f"not {type(data)}"
        )

    has_time_dim = "time" in data.dims
    if not has_time_dim:
        raise ValueError(
            "Analysis is done of the 'time' dimension, but the input data"
            f" only has dims: {data.dims}"
        )
[docs] def _check_temporal_resolution( timescale: Literal["monthly", "weekly", "daily"] ) -> Literal["monthly", "weekly", "daily"]: support_temporal_resolution = ["monthly", "weekly", "daily"] if timescale not in support_temporal_resolution: raise ValueError( "Given temporal resoltuion is not supported." "Please choose from 'monthly', 'weekly', 'daily'." ) return timescale
def _check_data_resolution_match(
    data: Union[xr.DataArray, xr.Dataset],
    timescale: Literal["monthly", "weekly", "daily"],
):
    """Warn if the temporal resolution of the data differs from ``timescale``.

    The resolution of the data is taken to be the median difference between
    consecutive ``time`` values, expressed in whole days. For "monthly" data a
    median step of 28 to 31 days (inclusive) is accepted; for "weekly" and
    "daily" data the median step has to equal one week or one day. Any other
    ``timescale`` value is silently ignored here.

    Args:
        data: Data with a ``time`` coordinate.
        timescale: Temporal resolution the data is expected to have.
    """
    mismatch_message = (
        "The temporal resolution of data does not completely match "
        "the target timescale. Please check your input data."
    )
    expected_step = {
        "monthly": np.timedelta64(1, "M"),
        "weekly": np.timedelta64(1, "W"),
        "daily": np.timedelta64(1, "D"),
    }

    steps = np.diff(data["time"].to_numpy())
    median_step = np.median(steps).astype("timedelta64[D]")

    if timescale == "monthly":
        # Months have no fixed length, so accept any plausible month length.
        median_days = median_step.astype(int)
        shortest_month, longest_month = (28, 31)
        if median_days < shortest_month or median_days > longest_month:
            warnings.warn(mismatch_message, stacklevel=1)
        return

    if timescale in expected_step:
        expected_days = expected_step[timescale].astype("timedelta64[D]")
        if expected_days != median_step:
            warnings.warn(mismatch_message, stacklevel=1)
class Preprocessor:
    """Preprocessor for s2s data."""

    def __init__(  # noqa: PLR0913
        self,
        rolling_window_size: Union[int, None],
        timescale: Literal["monthly", "weekly", "daily"],
        rolling_min_periods: int = 1,
        subtract_climatology: bool = True,
        detrend: Union[str, None] = "linear",
    ):
        """Preprocessor for s2s data.

        Can detrend as well as deseasonalize.

        On calling `.fit(data)`, the preprocessor will:
         - Calculate the (centered) rolling mean of the input data.
         - Calculate and store the climatology of the rolling mean.
         - Calculate and store the trend of the rolling mean, after the
           climatology has been removed from it (if climatology subtraction
           is enabled).

        When calling `.transform(data)`, the preprocessor will:
         - Remove the (stored) climatology from the data.
         - Remove the (stored) trend from this deseasonalized data.
         - Return the detrended and deseasonalized data.

        Args:
            rolling_window_size: The size of the rolling window that will be
                applied before calculating the trend and climatology. Setting
                this to None (or 1) will skip this step.
            timescale: Temporal resolution of input data. Only validated and
                used when `subtract_climatology` is True.
            rolling_min_periods: The minimum number of periods within a rolling
                window. If higher than 1 (the default), NaN values will be
                present at the start and end of the preprocessed data.
            subtract_climatology (optional): If you want to calculate and
                remove the climatology of the data. Defaults to True.
            detrend (optional): Which method to use for detrending. Currently
                the only method supported is "linear". If you want to skip
                detrending, set this to None.
        """
        # Rolling-mean configuration.
        self._window_size = rolling_window_size
        self._min_periods = rolling_min_periods

        # Which preprocessing steps are enabled.
        self._subtract_climatology = subtract_climatology
        self._detrend = detrend
        if subtract_climatology:
            self._timescale = _check_temporal_resolution(timescale)

        # State that is filled in by `fit`.
        self._climatology: Union[xr.DataArray, xr.Dataset]
        self._trend: dict
        self._is_fit = False

    def fit(self, data: Union[xr.DataArray, xr.Dataset]) -> None:
        """Fit this Preprocessor to input data.

        Stores the climatology and/or the trend of the (rolling mean of the)
        input data, depending on which steps are enabled.

        Args:
            data: Input data for fitting.
        """
        _check_input_data(data)

        if self._window_size in [None, 1]:
            # No smoothing requested.
            data_rolling = data
        else:
            rolling = data.rolling(
                dim={"time": self._window_size},  # type: ignore
                min_periods=self._min_periods,
                center=True,
            )
            # TODO: give option to be a gaussian-like window, instead of a block.
            data_rolling = rolling.mean()

        detrending = self._detrend is not None

        # The trend is fitted on the smoothed data, with the climatology
        # removed first whenever climatology subtraction is enabled.
        trend_input = data_rolling
        if self._subtract_climatology:
            self._climatology = _get_climatology(data_rolling, self._timescale)
            if detrending:
                trend_input = _subtract_climatology(
                    data_rolling, self._timescale, self._climatology
                )

        if detrending:
            self._trend = _get_trend(trend_input, self._detrend)

        self._is_fit = True

    def transform(
        self, data: Union[xr.DataArray, xr.Dataset]
    ) -> Union[xr.DataArray, xr.Dataset]:
        """Apply the preprocessing steps to the input data.

        Args:
            data: Input data to perform preprocessing.

        Returns:
            Preprocessed data.

        Raises:
            ValueError: If the preprocessor has not been fit yet.
        """
        if not self._is_fit:
            raise ValueError(
                "The preprocessor has to be fit to data before a transform"
                " can be applied"
            )

        processed = data
        if self._subtract_climatology:
            processed = _subtract_climatology(
                data, self._timescale, self._climatology
            )

        if self._detrend is None:
            return processed
        return _subtract_trend(processed, self._detrend, self.trend)

    def fit_transform(
        self, data: Union[xr.DataArray, xr.Dataset]
    ) -> Union[xr.DataArray, xr.Dataset]:
        """Fit this Preprocessor to input data, and then apply the steps to the data.

        Args:
            data: Input data for fit and transform.

        Returns:
            Preprocessed data.
        """
        self.fit(data)
        preprocessed = self.transform(data)
        return preprocessed

    @property
    def trend(self) -> dict:
        """Return the stored trend (dictionary).

        Raises:
            ValueError: If detrending is disabled, or if the preprocessor has
                not been fit yet.
        """
        if not self._detrend:
            raise ValueError("Detrending is set to `None`, so no trend is available")
        if self._is_fit:
            return self._trend
        raise ValueError(
            "The preprocessor has to be fit to data before the trend"
            " can be requested."
        )

    @property
    def climatology(self) -> Union[xr.DataArray, xr.Dataset]:
        """Return the stored climatology data.

        Raises:
            ValueError: If climatology subtraction is disabled, or if the
                preprocessor has not been fit yet.
        """
        if not self._subtract_climatology:
            raise ValueError(
                "`subtract_climatology is set to `False`, so no climatology "
                "data is available"
            )
        if self._is_fit:
            return self._climatology
        raise ValueError(
            "The preprocessor has to be fit to data before the"
            " climatology can be requested."
        )