This class computes rolling features. To avoid data leakage, the last point
in the window is excluded from calculations (`closed='left'` and
`center=False`).
Currently, the following statistics are supported: 'mean', 'std', 'min', 'max',
'sum', 'median', 'ratio_min_max', 'coef_variation', 'ewm'. For 'ewm', the
alpha parameter can be set in the kwargs_stats dictionary, default is
{'ewm': {'alpha': 0.3}}.
Statistics to compute over the rolling window. Can be a string or a list,
and can have repeats. Available statistics are: 'mean', 'std', 'min', 'max',
'sum', 'median', 'ratio_min_max', 'coef_variation', 'ewm'. For 'ewm', the
alpha parameter can be set in the kwargs_stats dictionary, default is
{'ewm': {'alpha': 0.3}}.
Minimum number of observations in window required to have a value.
Same as the min_periods argument of pandas rolling. If None,
defaults to window_sizes.
Dictionary with additional arguments for the statistics. The keys are the
statistic names and the values are dictionaries with the arguments for the
corresponding statistic. For example, {'ewm': {'alpha': 0.3}}.
Statistics to compute over the rolling window. Can be a string or a list,
and can have repeats. Available statistics are: 'mean', 'std', 'min', 'max',
'sum', 'median', 'ratio_min_max', 'coef_variation', 'ewm'.
Minimum number of observations in window required to have a value.
Same as the min_periods argument of pandas rolling. If None,
defaults to window_sizes.
Dictionary with additional arguments for the statistics. The keys are the
statistic names and the values are dictionaries with the arguments for the
corresponding statistic. For example, {'ewm': {'alpha': 0.3}}.
None
Returns:
Type
Description
None
Source code in skforecast/preprocessing/preprocessing.py
def_validate_params(self,stats:str|list[str],window_sizes:int|list[int],min_periods:int|list[int]|None=None,features_names:list[str]|None=None,fillna:str|float|None=None,kwargs_stats:dict[str,dict[str,object]]|None=None)->None:""" Validate the parameters of the RollingFeatures class. Parameters ---------- stats : str, list Statistics to compute over the rolling window. Can be a `string` or a `list`, and can have repeats. Available statistics are: 'mean', 'std', 'min', 'max', 'sum', 'median', 'ratio_min_max', 'coef_variation', 'ewm'. window_sizes : int, list Size of the rolling window for each statistic. If an `int`, all stats share the same window size. If a `list`, it should have the same length as stats. min_periods : int, list, default None Minimum number of observations in window required to have a value. Same as the `min_periods` argument of pandas rolling. If `None`, defaults to `window_sizes`. features_names : list, default None Names of the output features. If `None`, default names will be used in the format 'roll_stat_window_size', for example 'roll_mean_7'. fillna : str, float, default None Fill missing values in `transform_batch` method. Available methods are: 'mean', 'median', 'ffill', 'bfill', or a float value. kwargs_stats : dict, default None Dictionary with additional arguments for the statistics. The keys are the statistic names and the values are dictionaries with the arguments for the corresponding statistic. For example, {'ewm': {'alpha': 0.3}}. Returns ------- None """# statsallowed_stats=['mean','std','min','max','sum','median','ratio_min_max','coef_variation','ewm']ifnotisinstance(stats,(str,list)):raiseTypeError(f"`stats` must be a string or a list of strings. Got {type(stats)}.")ifisinstance(stats,str):stats=[stats]forstatinset(stats):ifstatnotinallowed_stats:raiseValueError(f"Statistic '{stat}' is not allowed. 
Allowed stats are: {allowed_stats}.")n_stats=len(stats)# window_sizesifnotisinstance(window_sizes,(int,list)):raiseTypeError(f"`window_sizes` must be an int or a list of ints. Got {type(window_sizes)}.")ifisinstance(window_sizes,list):n_window_sizes=len(window_sizes)ifn_window_sizes!=n_stats:raiseValueError(f"Length of `window_sizes` list ({n_window_sizes}) "f"must match length of `stats` list ({n_stats}).")# Check duplicates (stats, window_sizes)ifisinstance(window_sizes,int):window_sizes=[window_sizes]*n_statsiflen(set(zip(stats,window_sizes)))!=n_stats:raiseValueError(f"Duplicate (stat, window_size) pairs are not allowed.\n"f" `stats` : {stats}\n"f" `window_sizes` : {window_sizes}")# min_periodsifnotisinstance(min_periods,(int,list,type(None))):raiseTypeError(f"`min_periods` must be an int, list of ints, or None. Got {type(min_periods)}.")ifmin_periodsisnotNone:ifisinstance(min_periods,int):min_periods=[min_periods]*n_statselifisinstance(min_periods,list):n_min_periods=len(min_periods)ifn_min_periods!=n_stats:raiseValueError(f"Length of `min_periods` list ({n_min_periods}) "f"must match length of `stats` list ({n_stats}).")fori,min_periodinenumerate(min_periods):ifmin_period>window_sizes[i]:raiseValueError("Each `min_period` must be less than or equal to its ""corresponding `window_size`.")# features_namesifnotisinstance(features_names,(list,type(None))):raiseTypeError(f"`features_names` must be a list of strings or None. Got {type(features_names)}.")ifisinstance(features_names,list):n_features_names=len(features_names)ifn_features_names!=n_stats:raiseValueError(f"Length of `features_names` list ({n_features_names}) "f"must match length of `stats` list ({n_stats}).")# fillnaiffillnaisnotNone:ifnotisinstance(fillna,(int,float,str)):raiseTypeError(f"`fillna` must be a float, string, or None. 
Got {type(fillna)}.")ifisinstance(fillna,str):allowed_fill_strategy=['mean','median','ffill','bfill']iffillnanotinallowed_fill_strategy:raiseValueError(f"'{fillna}' is not allowed. Allowed `fillna` "f"values are: {allowed_fill_strategy} or a float value.")# kwargs_statsallowed_kwargs_stats=['ewm']ifkwargs_statsisnotNone:ifnotisinstance(kwargs_stats,dict):raiseTypeError(f"`kwargs_stats` must be a dictionary or None. Got {type(kwargs_stats)}.")forstatinkwargs_stats.keys():ifstatnotinallowed_kwargs_stats:raiseValueError(f"Invalid statistic '{stat}' found in `kwargs_stats`. "f"Allowed statistics with additional arguments are: "f"{allowed_kwargs_stats}. Please ensure all keys in "f"`kwargs_stats` are among the allowed statistics.")
def_apply_stat_pandas(self,rolling_obj:pd.core.window.rolling.Rolling,stat:str)->pd.Series:""" Apply the specified statistic to a pandas rolling object. Parameters ---------- rolling_obj : pandas Rolling Rolling object to apply the statistic. stat : str Statistic to compute. Returns ------- stat_series : pandas Series Series with the computed statistic. """ifstat=='mean':returnrolling_obj.mean()elifstat=='std':returnrolling_obj.std()elifstat=='min':returnrolling_obj.min()elifstat=='max':returnrolling_obj.max()elifstat=='sum':returnrolling_obj.sum()elifstat=='median':returnrolling_obj.median()elifstat=='ratio_min_max':returnrolling_obj.min()/rolling_obj.max()elifstat=='coef_variation':returnrolling_obj.std()/rolling_obj.mean()elifstat=='ewm':kwargs=self.kwargs_stats.get(stat,{})returnrolling_obj.apply(lambdax:_ewm_jit(x,**kwargs),raw=True)else:raiseValueError(f"Statistic '{stat}' is not implemented.")
def transform_batch(self, X: pd.Series) -> pd.DataFrame:
    """
    Transform an entire pandas Series using rolling windows and compute
    the specified statistics.

    Parameters
    ----------
    X : pandas Series
        The input data series to transform.

    Returns
    -------
    rolling_features : pandas DataFrame
        A DataFrame containing the rolling features.

    """
    # Build one pandas Rolling object per unique (window_size, min_periods)
    # configuration and cache it for reuse by every stat sharing it.
    for key in self.unique_rolling_windows.keys():
        rolling_obj = X.rolling(**self.unique_rolling_windows[key]['params'])
        self.unique_rolling_windows[key]['rolling_obj'] = rolling_obj

    computed = []
    for stat, window_size, min_periods in zip(
        self.stats, self.window_sizes, self.min_periods
    ):
        key = f"{window_size}_{min_periods}"
        rolling_obj = self.unique_rolling_windows[key]['rolling_obj']
        computed.append(self._apply_stat_pandas(rolling_obj=rolling_obj, stat=stat))

    rolling_features = pd.concat(computed, axis=1)
    rolling_features.columns = self.features_names
    # Drop the warm-up rows that cannot have a complete window.
    rolling_features = rolling_features.iloc[self.max_window_size:]

    if self.fillna is not None:
        if self.fillna == 'mean':
            rolling_features = rolling_features.fillna(rolling_features.mean())
        elif self.fillna == 'median':
            rolling_features = rolling_features.fillna(rolling_features.median())
        elif self.fillna == 'ffill':
            rolling_features = rolling_features.ffill()
        elif self.fillna == 'bfill':
            rolling_features = rolling_features.bfill()
        else:
            # Any other value is used as a constant fill value.
            rolling_features = rolling_features.fillna(self.fillna)

    return rolling_features
def _apply_stat_numpy_jit(self, X_window: np.ndarray, stat: str) -> float:
    """
    Apply the specified statistic to a numpy array using Numba JIT.

    Parameters
    ----------
    X_window : numpy array
        Array with the rolling window.
    stat : str
        Statistic to compute.

    Returns
    -------
    stat_value : float
        Value of the computed statistic.

    """
    # 'ewm' takes extra keyword arguments from self.kwargs_stats.
    if stat == 'ewm':
        return _ewm_jit(X_window, **self.kwargs_stats.get(stat, {}))

    # Remaining statistics map one-to-one onto jitted helpers.
    jit_funcs = {
        'mean': _np_mean_jit,
        'std': _np_std_jit,
        'min': _np_min_jit,
        'max': _np_max_jit,
        'sum': _np_sum_jit,
        'median': _np_median_jit,
        'ratio_min_max': _np_min_max_ratio_jit,
        'coef_variation': _np_cv_jit,
    }
    if stat not in jit_funcs:
        raise ValueError(f"Statistic '{stat}' is not implemented.")

    return jit_funcs[stat](X_window)
Transform a numpy array using rolling windows and compute the
specified statistics. The returned array will have the shape
(X.shape[1] if exists, n_stats). For example, if X is a flat
array, the output will have shape (n_stats,). If X is a 2D array,
the output will have shape (X.shape[1], n_stats).
Parameters:
Name
Type
Description
Default
X
numpy ndarray
The input data array to transform.
required
Returns:
Name
Type
Description
rolling_features
numpy ndarray
An array containing the computed statistics.
Source code in skforecast/preprocessing/preprocessing.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Transform a numpy array using rolling windows and compute the
    specified statistics. If X is a flat array, the output has shape
    (n_stats,); if X is a 2D array, the output has shape
    (X.shape[1], n_stats).

    Parameters
    ----------
    X : numpy ndarray
        The input data array to transform.

    Returns
    -------
    rolling_features : numpy ndarray
        An array containing the computed statistics.

    """
    is_1d = X.ndim == 1
    if is_1d:
        # Promote to a single-column 2D array so one code path handles both.
        X = X[:, np.newaxis]

    rolling_features = np.full(
        shape=(X.shape[1], self.n_stats), fill_value=np.nan, dtype=float
    )
    for col in range(X.shape[1]):
        for j, stat in enumerate(self.stats):
            # Take the trailing window for this stat and ignore NaNs.
            window = X[-self.window_sizes[j]:, col]
            window = window[~np.isnan(window)]
            if window.size > 0:
                rolling_features[col, j] = self._apply_stat_numpy_jit(window, stat)
            # Otherwise the pre-filled NaN is kept.

    if is_1d:
        rolling_features = rolling_features.ravel()

    return rolling_features
This class computes rolling features for classification problems. To avoid data
leakage, the last point in the window is excluded from calculations
(`closed='left'` and `center=False`).
Currently, the following statistics are supported: 'proportion', 'mode',
'entropy', 'n_changes', 'n_unique'.
Statistics to compute over the rolling window. Can be a string or a list,
and can have repeats. Available statistics are: 'proportion', 'mode',
'entropy', 'n_changes', 'n_unique'.
Minimum number of observations in window required to have a value.
Same as the min_periods argument of pandas rolling. If None,
defaults to window_sizes.
Names of the output features. If None, default names will be used in the
format 'roll_stat_window_size', for example 'roll_mode_7'. For 'proportion',
class-specific names are appended, e.g., 'roll_proportion_7_class_0'.
Statistics to compute over the rolling window. Can be a string or a list,
and can have repeats. Available statistics are: 'proportion', 'mode',
'entropy', 'n_changes', 'n_unique'.
Minimum number of observations in window required to have a value.
Same as the min_periods argument of pandas rolling. If None,
defaults to window_sizes.
Names of the output features. If None, default names will be used in the
format 'roll_stat_window_size', for example 'roll_mode_7'. For 'proportion',
class-specific names are appended, e.g., 'roll_proportion_7_class_0'.
def_validate_params(self,stats:str|list[str],window_sizes:int|list[int],min_periods:int|list[int]|None=None,features_names:list[str]|None=None,fillna:str|float|None=None)->None:""" Validate the parameters of the RollingFeaturesClassification class. Parameters ---------- stats : str, list Statistics to compute over the rolling window. Can be a `string` or a `list`, and can have repeats. Available statistics are: 'proportion', 'mode', 'entropy', 'n_changes', 'n_unique'. window_sizes : int, list Size of the rolling window for each statistic. If an `int`, all stats share the same window size. If a `list`, it should have the same length as `stats`. min_periods : int, list, default None Minimum number of observations in window required to have a value. Same as the `min_periods` argument of pandas rolling. If `None`, defaults to `window_sizes`. features_names : list, default None Names of the output features. If `None`, default names will be used in the format 'roll_stat_window_size', for example 'roll_mode_7'. For 'proportion', class-specific names are appended, e.g., 'roll_proportion_7_class_0'. fillna : str, float, default None Fill missing values in `transform_batch` method. Available methods are: 'mean', 'median', 'ffill', 'bfill', or a float value. Returns ------- None """# statsallowed_stats=['proportion','mode','entropy','n_changes','n_unique']ifnotisinstance(stats,(str,list)):raiseTypeError(f"`stats` must be a string or a list of strings. Got {type(stats)}.")ifisinstance(stats,str):stats=[stats]forstatinset(stats):ifstatnotinallowed_stats:raiseValueError(f"Statistic '{stat}' is not allowed. Allowed stats are: {allowed_stats}.")n_stats=len(stats)# window_sizesifnotisinstance(window_sizes,(int,list)):raiseTypeError(f"`window_sizes` must be an int or a list of ints. 
Got {type(window_sizes)}.")ifisinstance(window_sizes,list):n_window_sizes=len(window_sizes)ifn_window_sizes!=n_stats:raiseValueError(f"Length of `window_sizes` list ({n_window_sizes}) "f"must match length of `stats` list ({n_stats}).")# Check duplicates (stats, window_sizes)ifisinstance(window_sizes,int):window_sizes=[window_sizes]*n_statsiflen(set(zip(stats,window_sizes)))!=n_stats:raiseValueError(f"Duplicate (stat, window_size) pairs are not allowed.\n"f" `stats` : {stats}\n"f" `window_sizes` : {window_sizes}")# min_periodsifnotisinstance(min_periods,(int,list,type(None))):raiseTypeError(f"`min_periods` must be an int, list of ints, or None. Got {type(min_periods)}.")ifmin_periodsisnotNone:ifisinstance(min_periods,int):min_periods=[min_periods]*n_statselifisinstance(min_periods,list):n_min_periods=len(min_periods)ifn_min_periods!=n_stats:raiseValueError(f"Length of `min_periods` list ({n_min_periods}) "f"must match length of `stats` list ({n_stats}).")fori,min_periodinenumerate(min_periods):ifmin_period>window_sizes[i]:raiseValueError("Each `min_period` must be less than or equal to its ""corresponding `window_size`.")# features_namesifnotisinstance(features_names,(list,type(None))):raiseTypeError(f"`features_names` must be a list of strings or None. Got {type(features_names)}.")ifisinstance(features_names,list):n_features_names=len(features_names)ifn_features_names!=n_stats:raiseValueError(f"Length of `features_names` list ({n_features_names}) "f"must match length of `stats` list ({n_stats}).")# TODO: Not used as ForecasterRecursiveClassifier doesn't allow NaNs. Check# when creating ForecasterRecursiveMultiSeriesClassifier# fillnaiffillnaisnotNone:ifnotisinstance(fillna,(int,float,str)):raiseTypeError(f"`fillna` must be a float, string, or None. Got {type(fillna)}.")ifisinstance(fillna,str):allowed_fill_strategy=['mean','median','ffill','bfill']iffillnanotinallowed_fill_strategy:raiseValueError(f"'{fillna}' is not allowed. 
Allowed `fillna` "f"values are: {allowed_fill_strategy} or a float value.")
def_apply_stat_pandas(self,X:pd.Series,rolling_obj:pd.core.window.rolling.Rolling,stat:str)->pd.Series:""" Apply the specified statistic to a pandas rolling object. Parameters ---------- rolling_obj : pandas Rolling Rolling object to apply the statistic. stat : str Statistic to compute. Returns ------- stat_series : pandas Series Series with the computed statistic. """ifstat=='proportion':rolling_params={'window':rolling_obj.window,'min_periods':rolling_obj.min_periods,'center':rolling_obj.center,'closed':rolling_obj.closed}dummies=pd.get_dummies(X,prefix='class')proportions=dummies.rolling(**rolling_params).sum()/rolling_obj.windowreturnproportionselifstat=='mode':returnrolling_obj.apply(lambdax:scipy_mode(x)[0],raw=True)elifstat=='entropy':returnrolling_obj.apply(_entropy,raw=True)elifstat=='n_changes':returnrolling_obj.apply(_n_changes_jit,raw=True)elifstat=='n_unique':returnrolling_obj.apply(_n_unique_jit,raw=True)else:raiseValueError(f"Statistic '{stat}' is not implemented.")
def transform_batch(self, X: pd.Series) -> pd.DataFrame:
    """
    Transform an entire pandas Series using rolling windows and compute
    the specified statistics.

    Parameters
    ----------
    X : pandas Series
        The input data series to transform.

    Returns
    -------
    rolling_features : pandas DataFrame
        A DataFrame containing the rolling features.

    """
    if self.classes is None:
        self.classes = list(np.sort(X.unique()))

    # Expand 'proportion' feature names into one name per class. The base
    # (unexpanded) names are cached the first time so repeated calls are
    # idempotent: the previous implementation re-expanded the already
    # expanded `self.features_names` on every call, which broke column
    # alignment on a second `transform_batch` call.
    if not hasattr(self, '_base_features_names'):
        self._base_features_names = list(self.features_names)
    features_names = []
    for stat, feature_name in zip(self.stats, self._base_features_names):
        if stat != 'proportion':
            features_names.append(feature_name)
        else:
            for cls in self.classes:
                features_names.append(f"{feature_name}_class_{cls}")
    self.features_names = features_names

    # One pandas Rolling object per unique (window_size, min_periods) pair.
    for k in self.unique_rolling_windows.keys():
        rolling_obj = X.rolling(**self.unique_rolling_windows[k]['params'])
        self.unique_rolling_windows[k]['rolling_obj'] = rolling_obj

    rolling_features = []
    for i, stat in enumerate(self.stats):
        window_size = self.window_sizes[i]
        min_periods = self.min_periods[i]
        key = f"{window_size}_{min_periods}"
        rolling_obj = self.unique_rolling_windows[key]['rolling_obj']
        stat_series = self._apply_stat_pandas(X=X, rolling_obj=rolling_obj, stat=stat)
        rolling_features.append(stat_series)

    rolling_features = pd.concat(rolling_features, axis=1)
    rolling_features.columns = self.features_names
    # Drop the warm-up rows that cannot have a complete window.
    rolling_features = rolling_features.iloc[self.max_window_size:]

    if self.fillna is not None:
        if self.fillna == 'mean':
            rolling_features = rolling_features.fillna(rolling_features.mean())
        elif self.fillna == 'median':
            rolling_features = rolling_features.fillna(rolling_features.median())
        elif self.fillna == 'ffill':
            rolling_features = rolling_features.ffill()
        elif self.fillna == 'bfill':
            rolling_features = rolling_features.bfill()
        else:
            # Any other value is used as a constant fill value.
            rolling_features = rolling_features.fillna(self.fillna)

    return rolling_features
def_apply_stat_numpy_jit(self,X_window:np.ndarray,stat:str)->float:""" Apply the specified statistic to a numpy array using Numba JIT. Parameters ---------- X_window : numpy array Array with the rolling window. stat : str Statistic to compute. Returns ------- stat_value : float Value of the computed statistic. """ifstat=='proportion':# Calculate proportions for each classproportions=np.zeros(len(self.classes))len_window=len(X_window)fori,clsinenumerate(self.classes):proportions[i]=np.sum(X_window==cls)/len_windowreturnproportionselifstat=='mode':returnscipy_mode(X_window)[0]elifstat=='entropy':return_entropy(X_window)elifstat=='n_changes':return_n_changes_jit(X_window)elifstat=='n_unique':return_n_unique_jit(X_window)else:raiseValueError(f"Statistic '{stat}' is not implemented.")
Transform a numpy array using rolling windows and compute the
specified statistics. The returned array will have the shape
(X.shape[1] if exists, n_stats). For example, if X is a flat
array, the output will have shape (n_stats,). If X is a 2D array,
the output will have shape (X.shape[1], n_stats).
Parameters:
Name
Type
Description
Default
X
numpy ndarray
The input data array to transform.
required
Returns:
Name
Type
Description
rolling_features
numpy ndarray
An array containing the computed statistics.
Source code in skforecast/preprocessing/preprocessing.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Transform a numpy array using rolling windows and compute the
    specified statistics. If X is a flat array, the output has shape
    (n_output_features,); if X is a 2D array, the output has shape
    (X.shape[1], n_output_features), where 'proportion' contributes one
    feature per class and every other statistic contributes one feature.

    Parameters
    ----------
    X : numpy ndarray
        The input data array to transform.

    Returns
    -------
    rolling_features : numpy ndarray
        An array containing the computed statistics.

    """
    if self.classes is None:
        raise ValueError(
            "Classes must be specified before calling transform. "
            "Call `transform_batch` first to infer classes from data."
        )

    array_ndim = X.ndim
    if array_ndim == 1:
        X = X[:, np.newaxis]

    # TODO: If more than one columns 2d Array, maybe the classes doesn't come
    # from the same column. Col 1 has classes [0, 1], col 2 has classes [3, 4].
    n_classes = len(self.classes)
    n_output_features = sum(
        n_classes if stat == 'proportion' else 1 for stat in self.stats
    )

    rolling_features = np.full(
        shape=(X.shape[1], n_output_features), fill_value=np.nan, dtype=float
    )
    for i in range(X.shape[1]):
        feature_idx = 0
        for j, stat in enumerate(self.stats):
            width = n_classes if stat == 'proportion' else 1
            X_window = X[-self.window_sizes[j]:, i]
            X_window = X_window[~np.isnan(X_window)]
            # Bug fix: the original tested `len(X_window) >= 0`, which is
            # always true, so an all-NaN (empty after filtering) window was
            # passed to the statistic functions instead of yielding NaN.
            if len(X_window) > 0:
                result = self._apply_stat_numpy_jit(X_window, stat)
                rolling_features[i, feature_idx:feature_idx + width] = result
            # Otherwise the pre-filled NaN(s) are kept.
            feature_idx += width

    if array_ndim == 1:
        rolling_features = rolling_features.ravel()

    return rolling_features
Convert a pandas DataFrame where each column represents a different time series
into a long format DataFrame with a MultiIndex. The index of the input DataFrame
must be a pandas DatetimeIndex with a defined frequency. The function reshapes the
DataFrame from wide format to long format, where each row corresponds to a
specific time point and series ID. The resulting DataFrame will have a MultiIndex
with the series IDs as the first level and a pandas DatetimeIndex as the second
level. If return_multi_index is set to False, the returned DataFrame has three
columns: 'series_id', 'datetime' and 'value', with a regular index.
Parameters:
Name
Type
Description
Default
data
DataFrame
Wide format series. The index must be a pandas DatetimeIndex with a
defined frequency and each column must represent a different time series.
If True, the returned DataFrame will have a MultiIndex with the series IDs
as the first level and a pandas DatetimeIndex as the second level. If False,
the returned DataFrame will have a regular index.
True
Returns:
Name
Type
Description
data
pandas DataFrame
Long format series with a MultiIndex. The first level contains the series IDs,
and the second level contains a pandas DatetimeIndex with the same frequency
for each series.
Source code in skforecast/preprocessing/preprocessing.py
defreshape_series_wide_to_long(data:pd.DataFrame,return_multi_index:bool=True)->pd.DataFrame:""" Convert a pandas DataFrame where each column represents a different time series into a long format DataFrame with a MultiIndex. The index of the input DataFrame must be a pandas DatetimeIndex with a defined frequency. The function reshapes the DataFrame from wide format to long format, where each row corresponds to a specific time point and series ID. The resulting DataFrame will have a MultiIndex with the series IDs as the first level and a pandas DatetimeIndex as the second level. If `return_multi_index` is set to False, the returned DataFrame have three columns: 'series_id', 'datetime' and 'value', with a regular index. Parameters ---------- data: pandas DataFrame Wide format series. The index must be a pandas DatetimeIndex with a defined frequency and each column must represent a different time series. return_multi_index: bool, default True If True, the returned DataFrame will have a MultiIndex with the series IDs as the first level and a pandas DatetimeIndex as the second level. If False, the returned DataFrame will have a regular index. Returns ------- data: pandas DataFrame Long format series with a MultiIndex. The first level contains the series IDs, and the second level contains a pandas DatetimeIndex with the same frequency for each series. """ifnotisinstance(data,pd.DataFrame):raiseTypeError("`data` must be a pandas DataFrame.")ifnotisinstance(data.index,pd.DatetimeIndex):raiseTypeError("`data` index must be a pandas DatetimeIndex.")freq=data.index.freqdata.index.name="datetime"data=data.reset_index()data=pd.melt(data,id_vars="datetime",var_name="series_id",value_name="value")data=data.groupby("series_id",sort=False).apply(lambdax:x.set_index("datetime").asfreq(freq),include_groups=False)ifnotreturn_multi_index:data=data.reset_index()returndata
defreshape_series_long_to_dict(data:pd.DataFrame,freq:str,series_id:str|None=None,index:str|None=None,values:str|None=None,suppress_warnings:bool=False)->dict[str,pd.Series]:""" Convert a long-format DataFrame into a dictionary of pandas Series with the specified frequency. Supports two input formats: - A pandas DataFrame with explicit columns for the series identifier, time index, and values. - A pandas DataFrame with a MultiIndex, where the first level contains the series IDs, and the second level contains a pandas DatetimeIndex. Parameters ---------- data: pandas DataFrame Long-format series. freq: str Frequency of the series. series_id: str, default None Column name with the series identifier. Not needed if the input data is a pandas DataFrame with MultiIndex. index: str, default None Column name with the time index. Not needed if the input data is a pandas DataFrame with MultiIndex. values: str, default None Column name with the values. Not needed if the input data is a pandas DataFrame with MultiIndex. suppress_warnings: bool, default False If True, suppress warnings when a series is incomplete after setting the frequency. Returns ------- series_dict: dict Dictionary with the series. """ifnotisinstance(data,pd.DataFrame):raiseTypeError("`data` must be a pandas DataFrame.")ifisinstance(data.index,pd.MultiIndex):first_col=data.columns[0]data.index=data.index.set_names([data.index.names[0],None])series_dict={id:data.loc[id][first_col].rename(id).asfreq(freq)foridindata.index.levels[0]}else:forcolin[series_id,index,values]:ifcolisNone:raiseValueError("Arguments `series_id`, `index`, and `values` must be ""specified when the input DataFrame does not have a MultiIndex. 
""Please provide a value for each of these arguments.")ifcolnotindata.columns:raiseValueError(f"Column '{col}' not found in `data`.")data_grouped=data.groupby(series_id,observed=True)original_sizes=data_grouped.size()series_dict={}fork,vindata_grouped:series_dict[k]=v.set_index(index)[values].asfreq(freq,fill_value=np.nan).rename(k)series_dict[k].index.name=Noneifnotsuppress_warningsandlen(series_dict[k])!=original_sizes[k]:warnings.warn(f"Series '{k}' is incomplete. NaNs have been introduced after "f"setting the frequency.",MissingValuesWarning)returnseries_dict
Convert a long-format DataFrame of exogenous variables into a dictionary
of pandas DataFrames with the specified frequency. Supports two input formats:
A pandas DataFrame with explicit columns for the series identifier, time
index, and exogenous variables.
A pandas DataFrame with a MultiIndex, where the first level contains the
series IDs, and the second level contains a pandas DatetimeIndex.
Consolidate the data types of the exogenous variables if, after setting
the frequency, NaNs have been introduced and the data types have changed
to float.
defreshape_exog_long_to_dict(data:pd.DataFrame,freq:str,series_id:str|None=None,index:str|None=None,drop_all_nan_cols:bool=False,consolidate_dtypes:bool=True,suppress_warnings:bool=False)->dict[str,pd.DataFrame]:""" Convert a long-format DataFrame of exogenous variables into a dictionary of pandas DataFrames with the specified frequency. Supports two input formats: - A pandas DataFrame with explicit columns for the series identifier, time index, and exogenous variables. - A pandas DataFrame with a MultiIndex, where the first level contains the series IDs, and the second level contains a pandas DatetimeIndex. Parameters ---------- data: pandas DataFrame Long format exogenous variables. freq: str Frequency of the series. series_id: str, default None Column name with the series identifier. Not needed if the input data is a pandas DataFrame with MultiIndex. index: str, default None Column name with the time index. Not needed if the input data is a pandas DataFrame with MultiIndex. drop_all_nan_cols: bool, default False If True, drop columns with all values as NaN. This is useful when there are series without some exogenous variables. consolidate_dtypes: bool, default True Consolidate the data types of the exogenous variables if, after setting the frequency, NaNs have been introduced and the data types have changed to float. suppress_warnings: bool, default False If True, suppress warnings when exog is incomplete after setting the frequency. Returns ------- exog_dict: dict Dictionary with the exogenous variables. """ifnotisinstance(data,pd.DataFrame):raiseTypeError("`data` must be a pandas DataFrame.")ifisinstance(data.index,pd.MultiIndex):data.index=data.index.set_names([data.index.names[0],None])exog_dict={id:data.loc[id].asfreq(freq)foridindata.index.levels[0]}else:forcolin[series_id,index]:ifcolisNone:raiseValueError("Arguments `series_id`, and `index` must be ""specified when the input DataFrame does not have a MultiIndex. 
""Please provide a value for each of these arguments.")ifcolnotindata.columns:raiseValueError(f"Column '{col}' not found in `data`.")cols_float_dtype={colforcolindata.columnsifpd.api.types.is_float_dtype(data[col])}data_grouped=data.groupby(series_id,observed=True)original_sizes=data_grouped.size()exog_dict=dict(tuple(data_grouped))exog_dict={k:v.set_index(index).asfreq(freq,fill_value=np.nan).drop(columns=series_id)fork,vinexog_dict.items()}forkinexog_dict.keys():exog_dict[k].index.name=Nonenans_introduced=Falseifnotsuppress_warningsorconsolidate_dtypes:fork,vinexog_dict.items():iflen(v)!=original_sizes[k]:nans_introduced=Trueifnotsuppress_warnings:warnings.warn(f"Exogenous variables for series '{k}' are incomplete. "f"NaNs have been introduced after setting the frequency.",MissingValuesWarning)ifconsolidate_dtypes:cols_float_dtype.update({colforcolinv.columnsifpd.api.types.is_float_dtype(v[col])})ifconsolidate_dtypesandnans_introduced:new_dtypes={k:floatforkincols_float_dtype}exog_dict={k:v.astype(new_dtypes)fork,vinexog_dict.items()}ifdrop_all_nan_cols:exog_dict={k:v.dropna(how="all",axis=1)fork,vinexog_dict.items()}returnexog_dict
Convert dictionaries of series and exogenous variables to a long-format
pandas DataFrame with MultiIndex. The first level of the MultiIndex contains the
series identifiers, and the second level contains the temporal index. If both
series and exog are provided, they are merged into a single DataFrame.
Names for the levels of the MultiIndex in the resulting DataFrame. The first
name corresponds to the series identifier, and the second name corresponds
to the temporal index.
Type of merge to perform when combining series and exog. Options are:
'left': Keep only indices from series (default)
'right': Keep only indices from exog
'outer': Keep all indices from both series and exog
'inner': Keep only indices present in both
'left'
Returns:
Name
Type
Description
long_df
DataFrame
Long-format DataFrame with a MultiIndex of two levels:
- First level: series identifier (named by index_names[0], default 'series_id')
- Second level: temporal index (named by index_names[1], default 'datetime')
Columns include:
- Series values (named by series_col_name, default 'series_value') if series is provided.
- Exogenous variable columns (from exog) if exog is provided.
If both series and exog are provided, columns from both are present.
If only one is provided, only its columns are present.
Source code in skforecast/preprocessing/preprocessing.py
defreshape_series_exog_dict_to_long(series:dict[str,pd.Series]|None,exog:dict[str,pd.Series|pd.DataFrame]|None,series_col_name:str='series_value',index_names:list[str]=['series_id','datetime'],merge_how:str='left')->pd.DataFrame:""" Convert dictionaries of series and exogenous variables to a long-format pandas DataFrame with MultiIndex. The first level of the MultiIndex contains the series identifiers, and the second level contains the temporal index. If both series and exog are provided, they are merged into a single DataFrame. Parameters ---------- series: dict, None Dictionary with multiple time series (expected: dict[str, pd.Series]). exog: dict, None Dictionary with exogenous variables (expected: dict[str, pd.Series or pd.DataFrame]). series_col_name: str, default 'series_value' Column name for the series values in the resulting DataFrame. index_names: list[str], default ['series_id', 'datetime'] Names for the levels of the MultiIndex in the resulting DataFrame. The first name corresponds to the series identifier, and the second name corresponds to the temporal index. merge_how: str, default 'left' Type of merge to perform when combining `series` and `exog`. Options are: - 'left': Keep only indices from `series` (default) - 'right': Keep only indices from `exog` - 'outer': Keep all indices from both `series` and `exog` - 'inner': Keep only indices present in both Returns ------- long_df : pandas.DataFrame Long-format DataFrame with a MultiIndex of two levels: - First level: series identifier (named by `index_names[0]`, default 'series_id') - Second level: temporal index (named by `index_names[1]`, default 'datetime') Columns include: - Series values (named by `series_col_name`, default 'series_value') if `series` is provided. - Exogenous variable columns (from `exog`) if `exog` is provided. If both `series` and `exog` are provided, columns from both are present. If only one is provided, only its columns are present. 
"""ifseriesisNoneandexogisNone:raiseValueError("Both `series` and `exog` cannot be None.")ifseriesisnotNone:ifnotisinstance(series,dict):raiseTypeError(f"`series` must be a dictionary. Got {type(series)}.")fork,vinseries.items():ifnotisinstance(v,pd.Series):raiseTypeError(f"`series['{k}']` must be a pandas Series.")series=pd.concat(series,names=index_names).to_frame(series_col_name)ifexogisnotNone:ifnotisinstance(exog,dict):raiseTypeError(f"`exog` must be a dictionary. Got {type(exog)}.")fork,vinexog.items():ifnotisinstance(v,(pd.Series,pd.DataFrame)):raiseTypeError(f"`exog['{k}']` must be a pandas Series or a pandas DataFrame.")exog=pd.concat(exog,names=index_names)ifisinstance(exog,pd.Series):exog=exog.to_frame(name='exog_value')ifseriesisnotNoneandexogisnotNone:series_idx_type=type(series.index.get_level_values(1))exog_idx_type=type(exog.index.get_level_values(1))ifseries_idx_type!=exog_idx_type:raiseTypeError(f"Index type mismatch: series has index of type "f"{series_idx_type}, but `exog` has {exog_idx_type}. "f"Ensure all indices are compatible.")ifseries_col_nameinexog.columns:raiseValueError(f"Column name conflict: '{series_col_name}' already exists in `exog`. "f"Please choose a different `series_col_name` value.")ifseriesisNone:long_df=exogelifexogisNone:long_df=serieselse:long_df=pd.merge(series,exog,left_index=True,right_index=True,how=merge_how)returnlong_df
Transforms a time series into a differentiated time series of a specified order
and provides functionality to revert the differentiation.
When using a direct module Forecaster, the model in step 1 must be
used if you want to reverse the differentiation of the training time
series with the inverse_transform_training method.
List with the first value of the time series before each differentiation.
If order = 2, the first value corresponds to the first value of the original
time series and the second value corresponds to the first value of the
differentiated time series of order 1. These values are necessary to
revert the differentiation and reconstruct the original time series.
List with the first training value of the time series before each differentiation.
For order = 1, the value corresponds to the last value of the window used to
create the predictors. For order > 1, the value corresponds to the first
value of the differentiated time series prior to the next differentiation.
These values are necessary to revert the differentiation and reconstruct the
training time series.
List with the last value of the time series before each differentiation,
used to revert differentiation on subsequent data windows. If order = 2,
first value corresponds to the last value of the original time series
and the second value corresponds to the last value of the differentiated
time series of order 1. This is essential for correctly transforming a
time series that follows immediately after the series used to fit the
transformer.
def__init__(self,order:int=1,window_size:int|None=None)->None:ifnotisinstance(order,(int,np.integer)):raiseTypeError(f"Parameter `order` must be an integer greater than 0. Found {type(order)}.")iforder<1:raiseValueError(f"Parameter `order` must be an integer greater than 0. Found {order}.")ifwindow_sizeisnotNone:ifnotisinstance(window_size,(int,np.integer)):raiseTypeError(f"Parameter `window_size` must be an integer greater than 0. "f"Found {type(window_size)}.")ifwindow_size<1:raiseValueError(f"Parameter `window_size` must be an integer greater than 0. "f"Found {window_size}.")self.order=orderself.window_size=window_sizeself.initial_values=[]self.pre_train_values=[]self.last_values=[]
Fits the transformer. Stores the values needed to revert the
differentiation of different window of the time series, original
time series, training time series, and a time series that follows
immediately after the series used to fit the transformer.
Parameters:
Name
Type
Description
Default
X
numpy ndarray
Time series to be differentiated.
required
y
Ignored
Not used, present here for API consistency by convention.
@_check_X_numpy_ndarray_1d()deffit(self,X:np.ndarray,y:Any=None)->Self:""" Fits the transformer. Stores the values needed to revert the differentiation of different window of the time series, original time series, training time series, and a time series that follows immediately after the series used to fit the transformer. Parameters ---------- X : numpy ndarray Time series to be differentiated. y : Ignored Not used, present here for API consistency by convention. Returns ------- self : TimeSeriesDifferentiator """self.initial_values=[]self.pre_train_values=[]self.last_values=[]foriinrange(self.order):ifi==0:self.initial_values.append(X[0])ifself.window_sizeisnotNone:self.pre_train_values.append(X[self.window_size-self.order])self.last_values.append(X[-1])X_diff=np.diff(X,n=1)else:self.initial_values.append(X_diff[0])ifself.window_sizeisnotNone:self.pre_train_values.append(X_diff[self.window_size-self.order])self.last_values.append(X_diff[-1])X_diff=np.diff(X_diff,n=1)returnself
@_check_X_numpy_ndarray_1d()deftransform(self,X:np.ndarray,y:Any=None)->np.ndarray:""" Transforms a time series into a differentiated time series of order n. Parameters ---------- X : numpy ndarray Time series to be differentiated. y : Ignored Not used, present here for API consistency by convention. Returns ------- X_diff : numpy ndarray Differentiated time series. The length of the array is the same as the original time series but the first n `order` values are nan. """X_diff=np.diff(X,n=self.order)X_diff=np.append((np.full(shape=self.order,fill_value=np.nan)),X_diff)returnX_diff
@_check_X_numpy_ndarray_1d()definverse_transform(self,X:np.ndarray,y:Any=None)->np.ndarray:""" Reverts the differentiation. To do so, the input array is assumed to be the same time series used to fit the transformer but differentiated. Parameters ---------- X : numpy ndarray Differentiated time series. y : Ignored Not used, present here for API consistency by convention. Returns ------- X_diff : numpy ndarray Reverted differentiated time series. """# Remove initial nan values if presentX=X[np.argmax(~np.isnan(X)):]foriinrange(self.order):ifi==0:X_undiff=np.insert(X,0,self.initial_values[-1])X_undiff=np.cumsum(X_undiff,dtype=float)else:X_undiff=np.insert(X_undiff,0,self.initial_values[-(i+1)])X_undiff=np.cumsum(X_undiff,dtype=float)returnX_undiff
Reverts the differentiation. To do so, the input array is assumed to be
the differentiated training time series generated with the original
time series used to fit the transformer.
When using a direct module Forecaster, the model in step 1 must be
used if you want to reverse the differentiation of the training time
series with the inverse_transform_training method.
Parameters:
Name
Type
Description
Default
X
numpy ndarray
Differentiated time series.
required
y
Ignored
Not used, present here for API consistency by convention.
None
Returns:
Name
Type
Description
X_diff
numpy ndarray
Reverted differentiated time series.
Source code in skforecast/preprocessing/preprocessing.py
@_check_X_numpy_ndarray_1d()definverse_transform_training(self,X:np.ndarray,y:Any=None)->np.ndarray:""" Reverts the differentiation. To do so, the input array is assumed to be the differentiated training time series generated with the original time series used to fit the transformer. When using a `direct` module Forecaster, the model in step 1 must be used if you want to reverse the differentiation of the training time series with the `inverse_transform_training` method. Parameters ---------- X : numpy ndarray Differentiated time series. y : Ignored Not used, present here for API consistency by convention. Returns ------- X_diff : numpy ndarray Reverted differentiated time series. """ifnotself.pre_train_values:raiseValueError("The `window_size` parameter must be set before fitting the ""transformer to revert the differentiation of the training ""time series.")# Remove initial nan values if presentX=X[np.argmax(~np.isnan(X)):]foriinrange(self.order):ifi==0:X_undiff=np.insert(X,0,self.pre_train_values[-1])X_undiff=np.cumsum(X_undiff,dtype=float)else:X_undiff=np.insert(X_undiff,0,self.pre_train_values[-(i+1)])X_undiff=np.cumsum(X_undiff,dtype=float)# Remove initial values as they are not part of the training time seriesX_undiff=X_undiff[self.order:]returnX_undiff
Reverts the differentiation. The input array X is assumed to be a
differentiated time series of order n that starts right after the
time series used to fit the transformer.
Parameters:
Name
Type
Description
Default
X
numpy ndarray
Differentiated time series. It is assumed to start right after
the time series used to fit the transformer.
required
y
Ignored
Not used, present here for API consistency by convention.
None
Returns:
Name
Type
Description
X_undiff
numpy ndarray
Reverted differentiated time series.
Source code in skforecast/preprocessing/preprocessing.py
@_check_X_numpy_ndarray_1d(ensure_1d=False)definverse_transform_next_window(self,X:np.ndarray,y:Any=None)->np.ndarray:""" Reverts the differentiation. The input array `X` is assumed to be a differentiated time series of order n that starts right after the the time series used to fit the transformer. Parameters ---------- X : numpy ndarray Differentiated time series. It is assumed o start right after the time series used to fit the transformer. y : Ignored Not used, present here for API consistency by convention. Returns ------- X_undiff : numpy ndarray Reverted differentiated time series. """array_ndim=X.ndimifarray_ndim==1:X=X[:,np.newaxis]# Remove initial rows with nan values if presentX=X[~np.isnan(X).any(axis=1)]foriinrange(self.order):ifi==0:X_undiff=np.cumsum(X,axis=0,dtype=float)+self.last_values[-1]else:X_undiff=np.cumsum(X_undiff,axis=0,dtype=float)+self.last_values[-(i+1)]ifarray_ndim==1:X_undiff=X_undiff.ravel()returnX_undiff
defset_params(self,**params):""" Set the parameters of the TimeSeriesDifferentiator. Parameters ---------- params : dict A dictionary of the parameters to set. Returns ------- None """forparam,valueinparams.items():setattr(self,param,value)
QuantileBinner class to bin data into quantile-based bins using numpy.percentile.
This class is similar to KBinsDiscretizer but faster for binning data into
quantile-based bins. Bin intervals are defined following the convention:
bins[i-1] <= x < bins[i]. See more information in numpy.percentile and
numpy.digitize.
The method used to compute the quantiles. This parameter is passed to
numpy.percentile. Default is 'linear'. Valid values are "inverse_cdf",
"averaged_inverse_cdf", "closest_observation", "interpolated_inverse_cdf",
"hazen", "weibull", "linear", "median_unbiased", "normal_unbiased".
The method used to compute the quantiles. This parameter is passed to
numpy.percentile. Default is 'linear'. Valid values are 'linear',
'lower', 'higher', 'midpoint', 'nearest'.
def_validate_params(self,n_bins:int,method:str,subsample:int,dtype:type,random_state:int):""" Validate the parameters passed to the class initializer. """ifnotisinstance(n_bins,int)orn_bins<2:raiseValueError(f"`n_bins` must be an int greater than 1. Got {n_bins}.")valid_methods=["inverse_cdf","averaged_inverse_cdf","closest_observation","interpolated_inverse_cdf","hazen","weibull","linear","median_unbiased","normal_unbiased",]ifmethodnotinvalid_methods:raiseValueError(f"`method` must be one of {valid_methods}. Got {method}.")ifnotisinstance(subsample,int)orsubsample<1:raiseValueError(f"`subsample` must be an integer greater than or equal to 1. "f"Got {subsample}.")ifnotisinstance(random_state,int)orrandom_state<0:raiseValueError(f"`random_state` must be an integer greater than or equal to 0. "f"Got {random_state}.")ifnotisinstance(dtype,type):raiseValueError(f"`dtype` must be a valid numpy dtype. Got {dtype}.")
deffit(self,X:np.ndarray):""" Learn the bin edges based on quantiles from the training data. Parameters ---------- X : numpy ndarray The training data used to compute the quantiles. Returns ------- None """ifX.size==0:raiseValueError("Input data `X` cannot be empty.")iflen(X)>self.subsample:rng=np.random.default_rng(self.random_state)X=X[rng.integers(0,len(X),self.subsample)]self.bin_edges_=np.percentile(a=X,q=np.linspace(0,100,self.n_bins+1),method=self.method)self.n_bins_=len(self.bin_edges_)-1self.intervals_={int(i):(float(self.bin_edges_[i]),float(self.bin_edges_[i+1]))foriinrange(self.n_bins_)}
The indices of the bins each value belongs to.
Values less than the smallest bin edge are assigned to the first bin,
and values greater than the largest bin edge are assigned to the last bin.
Source code in skforecast/preprocessing/preprocessing.py
deftransform(self,X:np.ndarray):""" Assign new data to the learned bins. Parameters ---------- X : numpy ndarray The data to assign to the bins. Returns ------- bin_indices : numpy ndarray The indices of the bins each value belongs to. Values less than the smallest bin edge are assigned to the first bin, and values greater than the largest bin edge are assigned to the last bin. """ifself.bin_edges_isNone:raiseNotFittedError("The model has not been fitted yet. Call 'fit' with training data first.")bin_indices=np.digitize(X,bins=self.bin_edges_,right=False)bin_indices=np.clip(bin_indices,1,self.n_bins_).astype(self.dtype)-1returnbin_indices
Fit the model to the data and return the bin indices for the same data.
Parameters:
Name
Type
Description
Default
X
ndarray
The data to fit and transform.
required
Returns:
Name
Type
Description
bin_indices
ndarray
The indices of the bins each value belongs to.
Values less than the smallest bin edge are assigned to the first bin,
and values greater than the largest bin edge are assigned to the last bin.
Source code in skforecast/preprocessing/preprocessing.py
deffit_transform(self,X):""" Fit the model to the data and return the bin indices for the same data. Parameters ---------- X : numpy.ndarray The data to fit and transform. Returns ------- bin_indices : numpy.ndarray The indices of the bins each value belongs to. Values less than the smallest bin edge are assigned to the first bin, and values greater than the largest bin edge are assigned to the last bin. """self.fit(X)returnself.transform(X)
defget_params(self):""" Get the parameters of the quantile binner. Parameters ---------- self Returns ------- params : dict A dictionary of the parameters of the quantile binner. """return{"n_bins":self.n_bins,"method":self.method,"subsample":self.subsample,"dtype":self.dtype,"random_state":self.random_state,}
defset_params(self,**params):""" Set the parameters of the QuantileBinner. Parameters ---------- params : dict A dictionary of the parameters to set. Returns ------- None """forparam,valueinparams.items():setattr(self,param,value)
If True, the calibration factor is the same for the lower and upper bounds.
If False, the calibration factor is different for the lower and upper bounds.
If True, the calibration factor is the same for the lower and upper bounds.
If False, the calibration factor is different for the lower and upper bounds.
def__init__(self,nominal_coverage:float=0.8,symmetric_calibration:bool=True)->None:ifnominal_coverage<0ornominal_coverage>1:raiseValueError(f"`nominal_coverage` must be a float between 0 and 1. Got {nominal_coverage}")self.nominal_coverage=nominal_coverageself.symmetric_calibration=symmetric_calibrationself.correction_factor_={}self.correction_factor_lower_={}self.correction_factor_upper_={}self.fit_coverage_={}self.fit_input_type_=Noneself.fit_series_names_=Noneself.is_fitted=False
Learn the correction factor needed to achieve the desired coverage.
Parameters:
Name
Type
Description
Default
y_true
pandas Series, pandas DataFrame, dict
True values of the time series.
If pandas Series, it is assumed that only one series is available.
If pandas DataFrame, it is assumed that each column is a different
series which will be calibrated separately. The column names are
used as series names.
If dict, it is assumed that each key is a series name and the
corresponding value is a pandas Series with the true values.
required
y_pred_interval
pandas DataFrame
Prediction interval estimated for the time series.
If y_true contains only one series, y_pred_interval must have
two columns, 'lower_bound' and 'upper_bound'.
If y_true contains multiple series, y_pred_interval must be
a long-format DataFrame with three columns: 'level', 'lower_bound',
and 'upper_bound'. The 'level' column identifies the series to which
each interval belongs.
required
Returns:
Type
Description
None
Source code in skforecast/preprocessing/preprocessing.py
def fit(
    self,
    y_true: pd.Series | pd.DataFrame | dict[str, pd.Series],
    y_pred_interval: pd.DataFrame,
) -> None:
    """
    Learn the correction factor needed to achieve the desired coverage.

    Parameters
    ----------
    y_true : pandas Series, pandas DataFrame, dict
        True values of the time series.

        - If pandas Series, it is assumed that only one series is available.
        - If pandas DataFrame, it is assumed that each column is a different
        series which will be calibrated separately. The column names are
        used as series names.
        - If dict, it is assumed that each key is a series name and the
        corresponding value is a pandas Series with the true values.
    y_pred_interval : pandas DataFrame
        Prediction interval estimated for the time series.

        - If `y_true` contains only one series, `y_pred_interval` must have
        two columns, 'lower_bound' and 'upper_bound'.
        - If `y_true` contains multiple series, `y_pred_interval` must be
        a long-format DataFrame with three columns: 'level', 'lower_bound',
        and 'upper_bound'. The 'level' column identifies the series to which
        each interval belongs.

    Returns
    -------
    None

    """

    # Reset all fitted state so refitting starts from scratch.
    self.correction_factor_ = {}
    self.correction_factor_lower_ = {}
    self.correction_factor_upper_ = {}
    self.fit_coverage_ = {}
    self.fit_input_type_ = None
    self.fit_series_names_ = None
    self.is_fitted = False

    if not isinstance(y_true, (pd.Series, pd.DataFrame, dict)):
        raise TypeError(
            "`y_true` must be a pandas Series, pandas DataFrame, or a dictionary."
        )
    if not isinstance(y_pred_interval, (pd.DataFrame)):
        raise TypeError("`y_pred_interval` must be a pandas DataFrame.")
    if not set(["lower_bound", "upper_bound"]).issubset(y_pred_interval.columns):
        raise ValueError(
            "`y_pred_interval` must have columns 'lower_bound' and 'upper_bound'."
        )
    if isinstance(y_true, (pd.DataFrame, dict)) and 'level' not in y_pred_interval.columns:
        raise ValueError(
            "If `y_true` is a pandas DataFrame or a dictionary, `y_pred_interval` "
            "must have an additional column 'level' to identify each series."
        )

    # Normalize every input form to dict[str, pd.Series] so the calibration
    # loop below is identical for single and multiple series.
    if isinstance(y_true, pd.Series):
        name = y_true.name if y_true.name is not None else 'y'
        self.fit_input_type_ = "single_series"
        y_true = {name: y_true}
        if "level" not in y_pred_interval.columns:
            y_pred_interval = y_pred_interval.copy()
            y_pred_interval["level"] = name
        else:
            if y_pred_interval["level"].nunique() > 1:
                raise ValueError(
                    "If `y_true` is a pandas Series, `y_pred_interval` must have "
                    "only one series. Found multiple values in column 'level'."
                )
            if y_pred_interval["level"].iat[0] != name:
                raise ValueError(
                    f"Series name in `y_true`, '{name}', does not match the level "
                    f"name in `y_pred_interval`, '{y_pred_interval['level'].iat[0]}'."
                )
    elif isinstance(y_true, pd.DataFrame):
        self.fit_input_type_ = "multiple_series"
        y_true = y_true.to_dict(orient='series')
    else:
        self.fit_input_type_ = "multiple_series"
        for k, v in y_true.items():
            if not isinstance(v, pd.Series):
                raise ValueError(
                    f"When `y_true` is a dict, all its values must be pandas "
                    f"Series. Got {type(v)} for series '{k}'."
                )

    # Split the long-format intervals into one frame per series.
    y_pred_interval = {
        k: v[['lower_bound', 'upper_bound']]
        for k, v in y_pred_interval.groupby('level')
    }

    if not y_pred_interval.keys() == y_true.keys():
        raise ValueError(
            f"Series names in `y_true` and `y_pred_interval` do not match.\n"
            f"    `y_true` series names        : {list(y_true.keys())}\n"
            f"    `y_pred_interval` series names : {list(y_pred_interval.keys())}"
        )

    for k in y_true.keys():
        if not y_true[k].index.equals(y_pred_interval[k].index):
            raise IndexError(
                f"Index of `y_true` and `y_pred_interval` must match. Different "
                f"indices found for series '{k}'."
            )

        y_true_ = np.asarray(y_true[k])
        y_pred_interval_ = np.asarray(y_pred_interval[k])
        lower_bound = y_pred_interval_[:, 0]
        upper_bound = y_pred_interval_[:, 1]

        # Conformity scores are positive when the true value falls outside
        # the interval (below the lower bound / above the upper bound).
        conformity_scores_lower = lower_bound - y_true_
        conformity_scores_upper = y_true_ - upper_bound
        conformity_scores = np.max(
            [
                conformity_scores_lower,
                conformity_scores_upper,
            ],
            axis=0,
        )

        # Symmetric factor: one quantile of the joint scores. Asymmetric
        # factors: separate tail quantiles for the lower and upper scores.
        self.correction_factor_[k] = float(
            np.quantile(conformity_scores, self.nominal_coverage)
        )
        self.correction_factor_lower_[k] = float(
            -1 * np.quantile(
                -1 * conformity_scores_lower, (1 - self.nominal_coverage) / 2
            )
        )
        self.correction_factor_upper_[k] = float(
            np.quantile(
                conformity_scores_upper, 1 - (1 - self.nominal_coverage) / 2
            )
        )

        # Empirical coverage of the uncalibrated interval (project helper).
        coverage_fit_ = calculate_coverage(
            y_true=y_true_,
            lower_bound=lower_bound,
            upper_bound=upper_bound,
        )
        self.fit_coverage_[k] = float(coverage_fit_)

    self.is_fitted = True
    self.fit_series_names_ = list(y_true.keys())
Apply the correction factor to the prediction interval to achieve the desired
coverage.
Parameters:
Name
Type
Description
Default
y_pred_interval
pandas DataFrame
Prediction interval to be calibrated using conformal method.
If only intervals for one series are available, y_pred_interval
must have two columns, 'lower_bound' and 'upper_bound'.
If multiple series are available, y_pred_interval must be
a long-format DataFrame with three columns: 'level', 'lower_bound',
and 'upper_bound'. The 'level' column identifies the series to which
each interval belongs.
required
Returns:
Name
Type
Description
y_pred_interval_conformal
pandas DataFrame
Prediction interval with the correction factor applied.
Source code in skforecast/preprocessing/preprocessing.py
def transform(self, y_pred_interval: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the correction factor to the prediction interval to achieve the
    desired coverage.

    Parameters
    ----------
    y_pred_interval : pandas DataFrame
        Prediction interval to be calibrated using conformal method.

        - If only intervals for one series are available, `y_pred_interval`
        must have two columns, 'lower_bound' and 'upper_bound'.
        - If multiple series are available, `y_pred_interval` must be
        a long-format DataFrame with three columns: 'level', 'lower_bound',
        and 'upper_bound'. The 'level' column identifies the series to which
        each interval belongs.

    Returns
    -------
    y_pred_interval_conformal : pandas DataFrame
        Prediction interval with the correction factor applied.

    """

    if not self.is_fitted:
        raise NotFittedError(
            "ConformalIntervalCalibrator not fitted yet. Call 'fit' with "
            "training data first."
        )
    if not isinstance(y_pred_interval, pd.DataFrame):
        raise TypeError("`y_pred_interval` must be a pandas DataFrame.")
    if not set(["lower_bound", "upper_bound"]).issubset(y_pred_interval.columns):
        raise ValueError(
            "`y_pred_interval` must have columns 'lower_bound' and 'upper_bound'."
        )

    # Single-series fit: a missing 'level' column can be filled in with the
    # (only) series name seen during fit.
    if self.fit_input_type_ == "single_series" and 'level' not in y_pred_interval.columns:
        y_pred_interval = y_pred_interval.copy()
        y_pred_interval["level"] = self.fit_series_names_[0]

    if self.fit_input_type_ == "multiple_series" and 'level' not in y_pred_interval.columns:
        raise ValueError(
            "The transformer was fitted with multiple series. `y_pred_interval` "
            "must be a long-format DataFrame with three columns: 'level', "
            "'lower_bound', and 'upper_bound'. The 'level' column identifies "
            "the series to which each interval belongs."
        )

    conformalized_intervals = []
    for k, y_pred_interval_ in y_pred_interval.groupby('level')[['lower_bound', 'upper_bound']]:
        if k not in self.fit_series_names_:
            raise ValueError(
                f"Series '{k}' was not seen during fit. Available series are: "
                f"{self.fit_series_names_}."
            )

        correction_factor = self.correction_factor_[k]
        correction_factor_lower = self.correction_factor_lower_[k]
        correction_factor_upper = self.correction_factor_upper_[k]

        index = y_pred_interval_.index
        y_pred_interval_ = y_pred_interval_.to_numpy()
        y_pred_interval_conformal = y_pred_interval_.copy()

        # Widen (or shrink, if the factor is negative) the interval with the
        # correction learned during fit.
        if self.symmetric_calibration:
            y_pred_interval_conformal[:, 0] = (
                y_pred_interval_conformal[:, 0] - correction_factor
            )
            y_pred_interval_conformal[:, 1] = (
                y_pred_interval_conformal[:, 1] + correction_factor
            )
        else:
            y_pred_interval_conformal[:, 0] = (
                y_pred_interval_conformal[:, 0] - correction_factor_lower
            )
            y_pred_interval_conformal[:, 1] = (
                y_pred_interval_conformal[:, 1] + correction_factor_upper
            )

        # If upper bound is less than lower bound, swap them
        mask = (
            y_pred_interval_conformal[:, 1] < y_pred_interval_conformal[:, 0]
        )
        (
            y_pred_interval_conformal[mask, 0],
            y_pred_interval_conformal[mask, 1],
        ) = (
            y_pred_interval_conformal[mask, 1],
            y_pred_interval_conformal[mask, 0],
        )

        y_pred_interval_conformal = pd.DataFrame(
            data=y_pred_interval_conformal,
            columns=['lower_bound', 'upper_bound'],
            index=index
        )
        y_pred_interval_conformal.insert(0, 'level', k)
        conformalized_intervals.append(y_pred_interval_conformal)

    conformalized_intervals = pd.concat(conformalized_intervals)

    return conformalized_intervals