Skip to content

preprocessing

skforecast.preprocessing.preprocessing.RollingFeatures

RollingFeatures(
    stats,
    window_sizes,
    min_periods=None,
    features_names=None,
    fillna=None,
)

This class computes rolling features. To avoid data leakage, the last point in the window is excluded from calculations ('closed': 'left' and 'center': False).

Parameters:

Name Type Description Default
stats (str, list)

Statistics to compute over the rolling window. Can be a string or a list, and can have repeats. Available statistics are: 'mean', 'std', 'min', 'max', 'sum', 'median', 'ratio_min_max', 'coef_variation'.

required
window_sizes (int, list)

Size of the rolling window for each statistic. If an int, all stats share the same window size. If a list, it should have the same length as stats.

required
min_periods (int, list)

Minimum number of observations in window required to have a value. Same as the min_periods argument of pandas rolling. If None, defaults to window_sizes.

`None`
features_names list

Names of the output features. If None, default names will be used in the format 'roll_stat_window_size', for example 'roll_mean_7'.

`None`
fillna (str, float)

Fill missing values in transform_batch method. Available methods are: 'mean', 'median', 'ffill', 'bfill', or a float value.

`None`

Attributes:

Name Type Description
stats list

Statistics to compute over the rolling window.

n_stats int

Number of statistics to compute.

window_sizes list

Size of the rolling window for each statistic.

max_window_size int

Maximum window size.

min_periods list

Minimum number of observations in window required to have a value.

features_names list

Names of the output features.

fillna (str, float)

Method to fill missing values in transform_batch method.

unique_rolling_windows dict

Dictionary containing unique rolling window parameters and the corresponding statistics.

Source code in skforecast/preprocessing/preprocessing.py
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
def __init__(
    self, 
    stats: Union[str, list],
    window_sizes: Union[int, list],
    min_periods: Optional[Union[int, list]] = None,
    features_names: Optional[list] = None, 
    fillna: Optional[Union[str, float]] = None
) -> None:
    """
    Validate the arguments, expand scalar arguments into one entry per
    statistic and group the statistics by rolling-window configuration.

    Parameters
    ----------
    stats : str, list
        Statistic or list of statistics to compute. A single string is
        stored as a one-element list.
    window_sizes : int, list
        Window size per statistic. An `int` is repeated for every statistic.
    min_periods : int, list, default `None`
        Minimum observations per window. If `None`, `window_sizes` is used;
        an `int` is repeated for every statistic.
    features_names : list, default `None`
        Output names. If `None`, names follow 'roll_<stat>_<window_size>'.
    fillna : str, float, default `None`
        Stored as received; it is read by `transform_batch`.

    Returns
    -------
    None

    """

    self._validate_params(
        stats,
        window_sizes,
        min_periods,
        features_names,
        fillna
    )

    self.stats = [stats] if isinstance(stats, str) else stats
    self.n_stats = len(self.stats)

    self.window_sizes = (
        [window_sizes] * self.n_stats
        if isinstance(window_sizes, int)
        else window_sizes
    )
    self.max_window_size = max(self.window_sizes)

    if min_periods is None:
        # Same list object as `window_sizes`: a full window is required.
        self.min_periods = self.window_sizes
    elif isinstance(min_periods, int):
        self.min_periods = [min_periods] * self.n_stats
    else:
        self.min_periods = min_periods

    if features_names is None:
        self.features_names = [
            f"roll_{stat_name}_{size}"
            for stat_name, size in zip(self.stats, self.window_sizes)
        ]
    else:
        self.features_names = features_names

    self.fillna = fillna

    # Statistics sharing the same (window, min_periods) pair are grouped under
    # one key so that a single rolling object can serve all of them.
    grouped_windows = {}
    window_specs = zip(self.window_sizes, self.min_periods)
    for position, (size, periods) in enumerate(window_specs):
        group = grouped_windows.setdefault(
            f"{size}_{periods}",
            {
                'params': {
                    'window': size, 
                    'min_periods': periods, 
                    'center': False,
                    'closed': 'left'
                },
                'stats_idx': [], 
                'stats_names': [], 
                'rolling_obj': None
            }
        )
        group['stats_idx'].append(position)
        group['stats_names'].append(self.features_names[position])

    self.unique_rolling_windows = grouped_windows

_validate_params

_validate_params(
    stats,
    window_sizes,
    min_periods=None,
    features_names=None,
    fillna=None,
)

Validate the parameters of the RollingFeatures class.

Parameters:

Name Type Description Default
stats (str, list)

Statistics to compute over the rolling window. Can be a string or a list, and can have repeats. Available statistics are: 'mean', 'std', 'min', 'max', 'sum', 'median', 'ratio_min_max', 'coef_variation'.

required
window_sizes (int, list)

Size of the rolling window for each statistic. If an int, all stats share the same window size. If a list, it should have the same length as stats.

required
min_periods (int, list)

Minimum number of observations in window required to have a value. Same as the min_periods argument of pandas rolling. If None, defaults to window_sizes.

`None`
features_names list

Names of the output features. If None, default names will be used in the format 'roll_stat_window_size', for example 'roll_mean_7'.

`None`
fillna (str, float)

Fill missing values in transform_batch method. Available methods are: 'mean', 'median', 'ffill', 'bfill', or a float value.

`None`

Returns:

Type Description
None
Source code in skforecast/preprocessing/preprocessing.py
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
def _validate_params(
    self, 
    stats: Union[str, list], 
    window_sizes: Union[int, list], 
    min_periods: Optional[Union[int, list]] = None,
    features_names: Optional[list] = None, 
    fillna: Optional[Union[str, float]] = None
) -> None:
    """
    Validate the parameters of the RollingFeatures class.

    Parameters
    ----------
    stats : str, list
        Statistics to compute over the rolling window. Can be a `string` or a `list`,
        and can have repeats. Available statistics are: 'mean', 'std', 'min', 'max',
        'sum', 'median', 'ratio_min_max', 'coef_variation'.
    window_sizes : int, list
        Size of the rolling window for each statistic. If an `int`, all stats share 
        the same window size. If a `list`, it should have the same length as stats.
    min_periods : int, list, default `None`
        Minimum number of observations in window required to have a value. 
        Same as the `min_periods` argument of pandas rolling. If `None`, 
        defaults to `window_sizes`.
    features_names : list, default `None`
        Names of the output features. If `None`, default names will be used in the 
        format 'roll_stat_window_size', for example 'roll_mean_7'.
    fillna : str, float, default `None`
        Fill missing values in `transform_batch` method. Available 
        methods are: 'mean', 'median', 'ffill', 'bfill', or a float value.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If `stats`, `window_sizes`, `min_periods`, `features_names` or
        `fillna` is not of one of the accepted types.
    ValueError
        If `stats` is empty or contains a statistic that is not allowed, if a
        list argument does not have the same length as `stats`, if a
        (stat, window_size) pair is repeated, if a `min_periods` value exceeds
        its window size, or if `fillna` is a string that is not an allowed
        strategy.

    """

    # stats
    if not isinstance(stats, (str, list)):
        raise TypeError(
            f"`stats` must be a string or a list of strings. Got {type(stats)}."
        )        

    if isinstance(stats, str):
        stats = [stats]

    n_stats = len(stats)
    if n_stats == 0:
        # Without this guard an empty list passes validation and only fails
        # later, when the maximum window size is computed.
        raise ValueError("`stats` must contain at least one statistic.")

    allowed_stats = ['mean', 'std', 'min', 'max', 'sum', 'median', 
                     'ratio_min_max', 'coef_variation']
    for stat in set(stats):
        if stat not in allowed_stats:
            raise ValueError(
                f"Statistic '{stat}' is not allowed. Allowed stats are: {allowed_stats}."
            )

    # window_sizes
    if not isinstance(window_sizes, (int, list)):
        raise TypeError(
            f"`window_sizes` must be an int or a list of ints. Got {type(window_sizes)}."
        )

    if isinstance(window_sizes, list):
        n_window_sizes = len(window_sizes)
        if n_window_sizes != n_stats:
            raise ValueError(
                f"Length of `window_sizes` list ({n_window_sizes}) "
                f"must match length of `stats` list ({n_stats})."
            )

    # Check duplicates (stats, window_sizes). A statistic may be repeated only
    # with a different window size.
    if isinstance(window_sizes, int):
        window_sizes = [window_sizes] * n_stats
    if len(set(zip(stats, window_sizes))) != n_stats:
        raise ValueError(
            f"Duplicate (stat, window_size) pairs are not allowed.\n"
            f"    `stats`       : {stats}\n"
            f"    `window_sizes`: {window_sizes}"
        )

    # min_periods
    if not isinstance(min_periods, (int, list, type(None))):
        raise TypeError(
            f"`min_periods` must be an int, list of ints, or None. Got {type(min_periods)}."
        )

    if min_periods is not None:
        if isinstance(min_periods, int):
            min_periods = [min_periods] * n_stats
        elif isinstance(min_periods, list):
            n_min_periods = len(min_periods)
            if n_min_periods != n_stats:
                raise ValueError(
                    f"Length of `min_periods` list ({n_min_periods}) "
                    f"must match length of `stats` list ({n_stats})."
                )

        for i, min_period in enumerate(min_periods):
            if min_period > window_sizes[i]:
                raise ValueError(
                    "Each `min_period` must be less than or equal to its "
                    "corresponding `window_size`."
                )

    # features_names
    if not isinstance(features_names, (list, type(None))):
        raise TypeError(
            f"`features_names` must be a list of strings or None. Got {type(features_names)}."
        )

    if isinstance(features_names, list):
        n_features_names = len(features_names)
        if n_features_names != n_stats:
            raise ValueError(
                f"Length of `features_names` list ({n_features_names}) "
                f"must match length of `stats` list ({n_stats})."
            )

    # fillna
    if fillna is not None:
        if not isinstance(fillna, (int, float, str)):
            raise TypeError(
                f"`fillna` must be a float, string, or None. Got {type(fillna)}."
            )

        if isinstance(fillna, str):
            allowed_fill_strategy = ['mean', 'median', 'ffill', 'bfill']
            if fillna not in allowed_fill_strategy:
                raise ValueError(
                    f"'{fillna}' is not allowed. Allowed `fillna` "
                    f"values are: {allowed_fill_strategy} or a float value."
                )

_apply_stat_pandas

_apply_stat_pandas(rolling_obj, stat)

Apply the specified statistic to a pandas rolling object.

Parameters:

Name Type Description Default
rolling_obj pandas Rolling

Rolling object to apply the statistic.

required
stat str

Statistic to compute.

required

Returns:

Name Type Description
stat_series pandas Series

Series with the computed statistic.

Source code in skforecast/preprocessing/preprocessing.py
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
def _apply_stat_pandas(
    self, 
    rolling_obj: pd.core.window.rolling.Rolling, 
    stat: str
) -> pd.Series:
    """
    Compute one statistic from an already-built pandas rolling object.

    Parameters
    ----------
    rolling_obj : pandas Rolling
        Rolling object on which the statistic is evaluated.
    stat : str
        Name of the statistic: 'mean', 'std', 'min', 'max', 'sum', 'median',
        'ratio_min_max' or 'coef_variation'.

    Returns
    -------
    stat_series : pandas Series
        Result of applying the statistic to every window.

    Raises
    ------
    ValueError
        If `stat` is not one of the names listed above.

    """

    # Derived statistics: quotient of two native rolling aggregations.
    if stat == 'ratio_min_max':
        return rolling_obj.min() / rolling_obj.max()
    if stat == 'coef_variation':
        return rolling_obj.std() / rolling_obj.mean()

    # The remaining names match the rolling method of the same name.
    native_aggregations = ('mean', 'std', 'min', 'max', 'sum', 'median')
    if stat in native_aggregations:
        return getattr(rolling_obj, stat)()

    raise ValueError(f"Statistic '{stat}' is not implemented.")

transform_batch

transform_batch(X)

Transform an entire pandas Series using rolling windows and compute the specified statistics.

Parameters:

Name Type Description Default
X pandas Series

The input data series to transform.

required

Returns:

Name Type Description
rolling_features pandas DataFrame

A DataFrame containing the rolling features.

Source code in skforecast/preprocessing/preprocessing.py
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
def transform_batch(
    self, 
    X: pd.Series
) -> pd.DataFrame:
    """
    Compute every configured rolling statistic over a whole pandas Series.

    Parameters
    ----------
    X : pandas Series
        Series over which the rolling windows are evaluated.

    Returns
    -------
    rolling_features : pandas DataFrame
        One column per statistic, named after `features_names`. The first
        `max_window_size` rows are removed, and remaining missing values are
        filled according to `fillna` when it is not `None`.

    """

    # Build one rolling object per distinct (window, min_periods) pair and
    # keep it in `unique_rolling_windows` so statistics sharing it reuse it.
    for window_cfg in self.unique_rolling_windows.values():
        window_cfg['rolling_obj'] = X.rolling(**window_cfg['params'])

    stat_columns = [
        self._apply_stat_pandas(
            rolling_obj=self.unique_rolling_windows[f"{size}_{periods}"]['rolling_obj'],
            stat=stat_name
        )
        for stat_name, size, periods in zip(
            self.stats, self.window_sizes, self.min_periods
        )
    ]

    rolling_features = pd.concat(stat_columns, axis=1)
    rolling_features.columns = self.features_names
    rolling_features = rolling_features.iloc[self.max_window_size:]

    fill_strategy = self.fillna
    if fill_strategy is None:
        return rolling_features

    if fill_strategy == 'mean':
        return rolling_features.fillna(rolling_features.mean())
    if fill_strategy == 'median':
        return rolling_features.fillna(rolling_features.median())
    if fill_strategy == 'ffill':
        return rolling_features.ffill()
    if fill_strategy == 'bfill':
        return rolling_features.bfill()

    # Anything else is used directly as the fill value.
    return rolling_features.fillna(fill_strategy)

_apply_stat_numpy_jit

_apply_stat_numpy_jit(X_window, stat)

Apply the specified statistic to a numpy array using Numba JIT.

Parameters:

Name Type Description Default
X_window numpy array

Array with the rolling window.

required
stat str

Statistic to compute.

required

Returns:

Name Type Description
stat_value float

Value of the computed statistic.

Source code in skforecast/preprocessing/preprocessing.py
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
def _apply_stat_numpy_jit(
    self, 
    X_window: np.ndarray, 
    stat: str
) -> float:
    """
    Route a window of values to the `_np_*_jit` helper that matches `stat`.

    Parameters
    ----------
    X_window : numpy array
        Values of the rolling window.
    stat : str
        Name of the statistic: 'mean', 'std', 'min', 'max', 'sum', 'median',
        'ratio_min_max' or 'coef_variation'.

    Returns
    -------
    stat_value : float
        Value returned by the selected helper.

    Raises
    ------
    ValueError
        If `stat` is not one of the names listed above.

    """

    # NOTE(review): the `_np_*_jit` helpers are defined outside this block;
    # judging by their names they are Numba-compiled -- confirm at definition.
    if stat == 'mean':
        return _np_mean_jit(X_window)
    if stat == 'std':
        return _np_std_jit(X_window)
    if stat == 'min':
        return _np_min_jit(X_window)
    if stat == 'max':
        return _np_max_jit(X_window)
    if stat == 'sum':
        return _np_sum_jit(X_window)
    if stat == 'median':
        return _np_median_jit(X_window)
    if stat == 'ratio_min_max':
        return _np_min_max_ratio_jit(X_window)
    if stat == 'coef_variation':
        return _np_cv_jit(X_window)

    raise ValueError(f"Statistic '{stat}' is not implemented.")

transform

transform(X)

Transform a numpy array using rolling windows and compute the specified statistics. The returned array will have the shape (X.shape[1] if exists, n_stats). For example, if X is a flat array, the output will have shape (n_stats,). If X is a 2D array, the output will have shape (X.shape[1], n_stats).

Parameters:

Name Type Description Default
X numpy ndarray

The input data array to transform.

required

Returns:

Name Type Description
rolling_features numpy ndarray

An array containing the computed statistics.

Source code in skforecast/preprocessing/preprocessing.py
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
def transform(
    self, 
    X: np.ndarray
) -> np.ndarray:
    """
    Compute each configured statistic over the trailing window of a numpy
    array. Only the last `window_sizes[j]` rows are used for statistic `j`,
    and NaN values inside the window are discarded before computing it.

    Parameters
    ----------
    X : numpy ndarray
        Input data. A 1D array is treated as a single column; for a 2D array
        every column is processed independently.

    Returns
    -------
    rolling_features : numpy ndarray
        Array of shape (n_stats,) for 1D input or (X.shape[1], n_stats) for
        2D input. An entry is NaN when its window holds no non-NaN value.

    """

    is_flat = X.ndim == 1
    columns = X[:, np.newaxis] if is_flat else X
    n_columns = columns.shape[1]

    # Pre-filled with NaN, so windows without valid data need no assignment.
    rolling_features = np.full(
        shape=(n_columns, self.n_stats), fill_value=np.nan, dtype=float
    )

    for col in range(n_columns):
        column_values = columns[:, col]
        for j, stat_name in enumerate(self.stats):
            tail = column_values[-self.window_sizes[j]:]
            valid = tail[~np.isnan(tail)]
            if len(valid) > 0: 
                rolling_features[col, j] = self._apply_stat_numpy_jit(valid, stat_name)

    return rolling_features.ravel() if is_flat else rolling_features

skforecast.preprocessing.preprocessing.series_long_to_dict

series_long_to_dict(
    data,
    series_id,
    index,
    values,
    freq,
    suppress_warnings=False,
)

Convert long format series to dictionary of pandas Series with frequency. Input data must be a pandas DataFrame with columns for the series identifier, time index, and values. The function will group the data by the series identifier and convert the time index to a datetime index with the given frequency.

Parameters:

Name Type Description Default
data DataFrame

Long format series.

required
series_id str

Column name with the series identifier.

required
index str

Column name with the time index.

required
values str

Column name with the values.

required
freq str

Frequency of the series.

required
suppress_warnings bool

If True, suppress warnings when a series is incomplete after setting the frequency.

False

Returns:

Name Type Description
series_dict dict

Dictionary with the series.

Source code in skforecast/preprocessing/preprocessing.py
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
def series_long_to_dict(
    data: pd.DataFrame,
    series_id: str,
    index: str,
    values: str,
    freq: str,
    suppress_warnings: bool = False
) -> dict:
    """
    Split a long format DataFrame into one pandas Series per identifier.
    Rows are grouped by `series_id`; within each group the `index` column
    becomes the index, the `values` column is selected and the result is
    conformed to `freq` with `asfreq`.

    Parameters
    ----------
    data: pandas DataFrame
        Long format series.
    series_id: str
        Column name with the series identifier.
    index: str
        Column name with the time index.
    values: str
        Column name with the values.
    freq: str
        Frequency of the series.
    suppress_warnings: bool, default `False`
        If True, no warning is issued when a series has a different number of
        rows after the frequency is set.

    Returns
    -------
    series_dict: dict
        Dictionary mapping each identifier to its Series. Each Series is named
        after its identifier and its index name is removed.

    Raises
    ------
    TypeError
        If `data` is not a pandas DataFrame.
    ValueError
        If `series_id`, `index` or `values` is not a column of `data`.

    """

    if not isinstance(data, pd.DataFrame):
        raise TypeError("`data` must be a pandas DataFrame.")

    for required_col in (series_id, index, values):
        if required_col not in data.columns:
            raise ValueError(f"Column '{required_col}' not found in `data`.")

    series_dict = {}
    for series_name, group in data.groupby(series_id):
        n_rows_before = len(group)
        series = group.set_index(index)[values].asfreq(freq).rename(series_name)
        series.index.name = None
        series_dict[series_name] = series

        # A change in length means `asfreq` inserted rows for missing steps.
        if len(series) != n_rows_before and not suppress_warnings:
            warnings.warn(
                f"Series '{series_name}' is incomplete. NaNs have been introduced after "
                f"setting the frequency.",
                MissingValuesWarning
            )

    return series_dict

skforecast.preprocessing.preprocessing.exog_long_to_dict

exog_long_to_dict(
    data,
    series_id,
    index,
    freq,
    dropna=False,
    suppress_warnings=False,
)

Convert long format exogenous variables to dictionary. Input data must be a pandas DataFrame with columns for the series identifier, time index, and exogenous variables. The function will group the data by the series identifier and convert the time index to a datetime index with the given frequency.

Parameters:

Name Type Description Default
data DataFrame

Long format exogenous variables.

required
series_id str

Column name with the series identifier.

required
index str

Column name with the time index.

required
freq str

Frequency of the series.

required
dropna bool

If True, drop columns with all values as NaN. This is useful when there are series without some exogenous variables.

False
suppress_warnings bool

If True, suppress warnings when exog is incomplete after setting the frequency.

False

Returns:

Name Type Description
exog_dict dict

Dictionary with the exogenous variables.

Source code in skforecast/preprocessing/preprocessing.py
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
def exog_long_to_dict(
    data: pd.DataFrame,
    series_id: str,
    index: str,
    freq: str,
    dropna: bool = False,
    suppress_warnings: bool = False
) -> dict:
    """
    Split long format exogenous variables into one DataFrame per identifier.
    Rows are grouped by `series_id`; within each group the `index` column
    becomes the index, the result is conformed to `freq` with `asfreq` and the
    `series_id` column is dropped.

    Parameters
    ----------
    data: pandas DataFrame
        Long format exogenous variables.
    series_id: str
        Column name with the series identifier.
    index: str
        Column name with the time index.
    freq: str
        Frequency of the series.
    dropna: bool, default False
        If True, columns whose values are all NaN are dropped from each
        DataFrame. In this case no incompleteness warning is issued.
    suppress_warnings: bool, default False
        If True, no warning is issued when a DataFrame has a different number
        of rows after the frequency is set. Only relevant when `dropna` is
        False.

    Returns
    -------
    exog_dict: dict
        Dictionary mapping each identifier to its DataFrame of exogenous
        variables, with the index name removed.

    Raises
    ------
    TypeError
        If `data` is not a pandas DataFrame.
    ValueError
        If `series_id` or `index` is not a column of `data`.

    """

    if not isinstance(data, pd.DataFrame):
        raise TypeError("`data` must be a pandas DataFrame.")

    for required_col in (series_id, index):
        if required_col not in data.columns:
            raise ValueError(f"Column '{required_col}' not found in `data`.")

    exog_dict = {}
    n_rows_before = {}
    for series_name, group in data.groupby(series_id):
        n_rows_before[series_name] = len(group)
        frame = group.set_index(index).asfreq(freq).drop(columns=series_id)
        frame.index.name = None
        exog_dict[series_name] = frame

    if dropna:
        return {
            series_name: frame.dropna(how="all", axis=1)
            for series_name, frame in exog_dict.items()
        }

    if not suppress_warnings:
        for series_name, frame in exog_dict.items():
            # A change in length means `asfreq` inserted rows for missing steps.
            if len(frame) != n_rows_before[series_name]:
                warnings.warn(
                    f"Exogenous variables for series '{series_name}' are incomplete. "
                    f"NaNs have been introduced after setting the frequency.",
                    MissingValuesWarning
                )

    return exog_dict

skforecast.preprocessing.preprocessing.TimeSeriesDifferentiator

TimeSeriesDifferentiator(order=1, window_size=None)

Bases: BaseEstimator, TransformerMixin

Transforms a time series into a differentiated time series of a specified order and provides functionality to revert the differentiation.

When using a direct module Forecaster, the model in step 1 must be used if you want to reverse the differentiation of the training time series with the inverse_transform_training method.

Parameters:

Name Type Description Default
order int

The order of differentiation to be applied.

1
window_size int

The window size used by the forecaster. This is required to revert the differentiation for the target variable y or its predicted values.

None

Attributes:

Name Type Description
order int

The order of differentiation.

initial_values list

List with the first value of the time series before each differentiation. If order = 2, the first value corresponds to the first value of the original time series and the second value corresponds to the first value of the differentiated time series of order 1. These values are necessary to revert the differentiation and reconstruct the original time series.

pre_train_values list

List with the first training value of the time series before each differentiation. For order = 1, the value corresponds to the last value of the window used to create the predictors. For order > 1, the value corresponds to the first value of the differentiated time series prior to the next differentiation. These values are necessary to revert the differentiation and reconstruct the training time series.

last_values list

List with the last value of the time series before each differentiation, used to revert differentiation on subsequent data windows. If order = 2, the first value corresponds to the last value of the original time series and the second value corresponds to the last value of the differentiated time series of order 1. This is essential for correctly transforming a time series that follows immediately after the series used to fit the transformer.

Source code in skforecast/preprocessing/preprocessing.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def __init__(
    self, 
    order: int = 1,
    window_size: int = None
) -> None:
    """
    Validate and store the differentiation settings and initialise the lists
    that hold the values needed to revert the differentiation.

    Parameters
    ----------
    order : int, default `1`
        Order of differentiation. Must be an integer greater than 0.
    window_size : int, default `None`
        Window size. When given, must be an integer greater than 0.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If `order`, or a non-None `window_size`, is not an integer.
    ValueError
        If `order`, or a non-None `window_size`, is lower than 1.

    """

    # Both Python and numpy integers are accepted.
    integer_types = (int, np.integer)

    if not isinstance(order, integer_types):
        raise TypeError(
            f"Parameter `order` must be an integer greater than 0. Found {type(order)}."
        )
    if order < 1:
        raise ValueError(
            f"Parameter `order` must be an integer greater than 0. Found {order}."
        )

    has_window = window_size is not None
    if has_window and not isinstance(window_size, integer_types):
        raise TypeError(
            f"Parameter `window_size` must be an integer greater than 0. "
            f"Found {type(window_size)}."
        )
    if has_window and window_size < 1:
        raise ValueError(
            f"Parameter `window_size` must be an integer greater than 0. "
            f"Found {window_size}."
        )

    self.order = order
    self.window_size = window_size

    # Populated by `fit`.
    self.initial_values = []
    self.pre_train_values = []
    self.last_values = []

fit

fit(X, y=None)

Fits the transformer. Stores the values needed to revert the differentiation of different windows of the time series: the original time series, the training time series, and a time series that follows immediately after the series used to fit the transformer.

Parameters:

Name Type Description Default
X numpy ndarray

Time series to be differentiated.

required
y Ignored

Not used, present here for API consistency by convention.

None

Returns:

Name Type Description
self TimeSeriesDifferentiator
Source code in skforecast/preprocessing/preprocessing.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
@_check_X_numpy_ndarray_1d()
def fit(
    self, 
    X: np.ndarray, 
    y: Any = None
) -> Self:
    """
    Fits the transformer. For each differentiation step, from the original
    series up to the series differentiated `order - 1` times, stores the
    values needed to revert the differentiation: the first value, the last
    value and, when `window_size` is set, the value at position
    `window_size - order`.

    Parameters
    ----------
    X : numpy ndarray
        Time series to be differentiated.
    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    self : TimeSeriesDifferentiator

    """

    # Discard anything stored by a previous fit.
    self.initial_values = []
    self.pre_train_values = []
    self.last_values = []

    pre_train_position = (
        None if self.window_size is None else self.window_size - self.order
    )

    # `current` starts as the original series and is differentiated once at
    # the end of every step.
    current = X
    for _ in range(self.order):
        self.initial_values.append(current[0])
        if pre_train_position is not None:
            self.pre_train_values.append(current[pre_train_position])
        self.last_values.append(current[-1])
        current = np.diff(current, n=1)

    return self

transform

transform(X, y=None)

Transforms a time series into a differentiated time series of order n.

Parameters:

Name Type Description Default
X numpy ndarray

Time series to be differentiated.

required
y Ignored

Not used, present here for API consistency by convention.

None

Returns:

Name Type Description
X_diff numpy ndarray

Differentiated time series. The length of the array is the same as the original time series, but the first `order` values are nan.

Source code in skforecast/preprocessing/preprocessing.py
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
@_check_X_numpy_ndarray_1d()
def transform(
    self, 
    X: np.ndarray, 
    y: Any = None
) -> np.ndarray:
    """
    Differentiates a time series `order` times and pads the result with
    leading nan values.

    Parameters
    ----------
    X : numpy ndarray
        Time series to be differentiated.
    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    X_diff : numpy ndarray
        Differentiated time series. The length of the array is the same as
        the original time series but the first `order` values are nan.

    """

    differences = np.diff(X, n=self.order)

    # Each differentiation shortens the series by one value, so `order` nan
    # values are prepended to keep the original length.
    nan_padding = np.full(shape=self.order, fill_value=np.nan)
    X_diff = np.append(nan_padding, differences)

    return X_diff

inverse_transform

inverse_transform(X, y=None)

Reverts the differentiation. To do so, the input array is assumed to be the same time series used to fit the transformer but differentiated.

Parameters:

Name Type Description Default
X numpy ndarray

Differentiated time series.

required
y Ignored

Not used, present here for API consistency by convention.

None

Returns:

Name Type Description
X_diff numpy ndarray

Reverted differentiated time series.

Source code in skforecast/preprocessing/preprocessing.py
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
@_check_X_numpy_ndarray_1d()
def inverse_transform(
    self,
    X: np.ndarray,
    y: Any = None
) -> np.ndarray:
    """
    Undo the differentiation of the series used to fit the transformer.

    The input is assumed to be the differentiated version of the same time
    series that was passed to `fit`. The stored `initial_values` are used,
    from the deepest differencing level back to the original one, as the
    starting point of each cumulative sum.

    Parameters
    ----------
    X : numpy ndarray
        Differentiated time series.
    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    X_diff : numpy ndarray
        Reverted differentiated time series.

    """

    # Drop everything located before the first non-nan value.
    first_valid = np.argmax(~np.isnan(X))
    X = X[first_valid:]

    for level in range(self.order):
        partial = X if level == 0 else restored
        # Walk `initial_values` backwards: last stored value first.
        seeded = np.insert(partial, 0, self.initial_values[-(level + 1)])
        restored = np.cumsum(seeded, dtype=float)

    return restored

inverse_transform_training

inverse_transform_training(X, y=None)

Reverts the differentiation. To do so, the input array is assumed to be the differentiated training time series generated with the original time series used to fit the transformer.

When using a direct module Forecaster, the model in step 1 must be used if you want to reverse the differentiation of the training time series with the inverse_transform_training method.

Parameters:

Name Type Description Default
X numpy ndarray

Differentiated time series.

required
y Ignored

Not used, present here for API consistency by convention.

None

Returns:

Name Type Description
X_diff numpy ndarray

Reverted differentiated time series.

Source code in skforecast/preprocessing/preprocessing.py
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
@_check_X_numpy_ndarray_1d()
def inverse_transform_training(
    self,
    X: np.ndarray,
    y: Any = None
) -> np.ndarray:
    """
    Undo the differentiation of the training time series.

    The input is assumed to be the differentiated training time series
    generated from the original time series used to fit the transformer.
    The stored `pre_train_values` seed each cumulative sum and are removed
    from the result afterwards.

    When using a `direct` module Forecaster, the model in step 1 must be
    used if you want to reverse the differentiation of the training time
    series with the `inverse_transform_training` method.

    Parameters
    ----------
    X : numpy ndarray
        Differentiated time series.
    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    X_diff : numpy ndarray
        Reverted differentiated time series.

    Raises
    ------
    ValueError
        If no `pre_train_values` are stored, i.e. the transformer was
        fitted without a `window_size`.

    """

    if not self.pre_train_values:
        raise ValueError(
            "The `window_size` parameter must be set before fitting the "
            "transformer to revert the differentiation of the training "
            "time series."
        )

    # Drop everything located before the first non-nan value.
    first_valid = np.argmax(~np.isnan(X))
    X = X[first_valid:]

    for level in range(self.order):
        partial = X if level == 0 else restored
        # Walk `pre_train_values` backwards: last stored value first.
        seeded = np.insert(partial, 0, self.pre_train_values[-(level + 1)])
        restored = np.cumsum(seeded, dtype=float)

    # The `order` seed values are not part of the training time series.
    return restored[self.order:]

inverse_transform_next_window

inverse_transform_next_window(X, y=None)

Reverts the differentiation. The input array X is assumed to be a differentiated time series of order n that starts right after the time series used to fit the transformer.

Parameters:

Name Type Description Default
X numpy ndarray

Differentiated time series. It is assumed to start right after the time series used to fit the transformer.

required
y Ignored

Not used, present here for API consistency by convention.

None

Returns:

Name Type Description
X_undiff numpy ndarray

Reverted differentiated time series.

Source code in skforecast/preprocessing/preprocessing.py
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
@_check_X_numpy_ndarray_1d(ensure_1d=False)
def inverse_transform_next_window(
    self,
    X: np.ndarray,
    y: Any = None
) -> np.ndarray:
    """
    Undo the differentiation of a window that follows the fitted series.

    The input array `X` is assumed to be a differentiated time series of
    order n that starts right after the time series used to fit the
    transformer. The stored `last_values` are added back, from the deepest
    differencing level to the original one, after each cumulative sum.

    Parameters
    ----------
    X : numpy ndarray
        Differentiated time series. It is assumed to start right after
        the time series used to fit the transformer.
    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    X_undiff : numpy ndarray
        Reverted differentiated time series.

    """

    # Work on a 2D view so 1D and 2D inputs share the same code path.
    was_1d = X.ndim == 1
    if was_1d:
        X = X[:, np.newaxis]

    # Keep only the rows that contain no nan value.
    complete_rows = ~np.isnan(X).any(axis=1)
    X = X[complete_rows]

    for level in range(self.order):
        partial = X if level == 0 else restored
        # Walk `last_values` backwards: last stored value first.
        restored = (
            np.cumsum(partial, axis=0, dtype=float) + self.last_values[-(level + 1)]
        )

    if was_1d:
        restored = restored.ravel()

    return restored

set_params

set_params(**params)

Set the parameters of the TimeSeriesDifferentiator.

Parameters:

Name Type Description Default
params dict

A dictionary of the parameters to set.

{}

Returns:

Type Description
None
Source code in skforecast/preprocessing/preprocessing.py
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
def set_params(self, **params):
    """
    Update attributes of the TimeSeriesDifferentiator.

    Every keyword argument is assigned to the instance attribute of the
    same name.

    Parameters
    ----------
    params : dict
        A dictionary of the parameters to set.

    Returns
    -------
    None

    """

    for attr_name, new_value in params.items():
        setattr(self, attr_name, new_value)

skforecast.preprocessing.preprocessing.QuantileBinner

QuantileBinner(
    n_bins,
    method="linear",
    subsample=200000,
    dtype=np.float64,
    random_state=789654,
)

QuantileBinner class to bin data into quantile-based bins using numpy.percentile. This class is similar to KBinsDiscretizer but faster for binning data into quantile-based bins. Bin intervals are defined following the convention: bins[i-1] <= x < bins[i]. See more information in numpy.percentile and numpy.digitize.

Parameters:

Name Type Description Default
n_bins int

The number of quantile-based bins to create.

required
method str

The method used to compute the quantiles. This parameter is passed to numpy.percentile. Default is 'linear'. Valid values are "inverse_cdf", "averaged_inverse_cdf", "closest_observation", "interpolated_inverse_cdf", "hazen", "weibull", "linear", "median_unbiased", "normal_unbiased".

'linear'
subsample int

The number of samples to use for computing quantiles. If the dataset has more samples than subsample, a random subset will be used.

200000
random_state int

The random seed to use for generating a random subset of the data.

789654
dtype data type

The data type to use for the bin indices. Default is numpy.float64.

numpy.float64

Attributes:

Name Type Description
n_bins int

The number of quantile-based bins to create.

method str, default='linear'

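The method used to compute the quantiles. This parameter is passed to numpy.percentile. Default is 'linear'. Valid values are "inverse_cdf", "averaged_inverse_cdf", "closest_observation", "interpolated_inverse_cdf", "hazen", "weibull", "linear", "median_unbiased", "normal_unbiased".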

subsample int, default=200000

The number of samples to use for computing quantiles. If the dataset has more samples than subsample, a random subset will be used.

random_state int, default=789654

The random seed to use for generating a random subset of the data.

dtype data type, default=numpy.float64

The data type to use for the bin indices. Default is numpy.float64.

n_bins_ int

The number of bins learned during fitting.

bin_edges_ numpy ndarray

The edges of the bins learned during fitting.

Source code in skforecast/preprocessing/preprocessing.py
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
def __init__(
    self,
    n_bins: int,
    method: Optional[str] = "linear",
    subsample: int = 200000,
    dtype: Optional[type] = np.float64,
    random_state: Optional[int] = 789654
):
    """
    Validate the configuration and store it on the instance.

    The arguments are checked with `_validate_params` before anything is
    stored. Note that, despite the `Optional` annotations, that check
    rejects `None` for `method`, `dtype` and `random_state`.

    Parameters
    ----------
    n_bins : int
        The number of quantile-based bins to create.
    method : str, default 'linear'
        The method used to compute the quantiles.
    subsample : int, default 200000
        The number of samples to use for computing quantiles.
    dtype : data type, default numpy.float64
        The data type to use for the bin indices.
    random_state : int, default 789654
        The random seed to use for generating a random subset of the data.

    """

    self._validate_params(n_bins, method, subsample, dtype, random_state)

    # Configuration, stored exactly as received.
    self.n_bins = n_bins
    self.method = method
    self.subsample = subsample
    self.random_state = random_state
    self.dtype = dtype

    # Learned state, filled in by `fit`.
    self.n_bins_ = self.bin_edges_ = self.intervals_ = None

_validate_params

_validate_params(
    n_bins, method, subsample, dtype, random_state
)

Validate the parameters passed to the class initializer.

Source code in skforecast/preprocessing/preprocessing.py
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
def _validate_params(
    self,
    n_bins: int,
    method: str,
    subsample: int,
    dtype: type,
    random_state: int
):
    """
    Check the arguments given to the class initializer.

    The checks run in this order: `n_bins`, `method`, `subsample`,
    `random_state`, `dtype`. The first failing check raises.

    Parameters
    ----------
    n_bins : int
        Must be an int greater than 1.
    method : str
        Must be one of the accepted quantile method names.
    subsample : int
        Must be an int greater than or equal to 1.
    dtype : type
        Must be an instance of `type`.
    random_state : int
        Must be an int greater than or equal to 0.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If any argument does not satisfy its constraint.

    """

    n_bins_ok = isinstance(n_bins, int) and n_bins >= 2
    if not n_bins_ok:
        raise ValueError(
            f"`n_bins` must be an int greater than 1. Got {n_bins}."
        )

    # Kept as a list because it is rendered verbatim in the error message.
    valid_methods = [
        "inverse_cdf",
        "averaged_inverse_cdf",
        "closest_observation",
        "interpolated_inverse_cdf",
        "hazen",
        "weibull",
        "linear",
        "median_unbiased",
        "normal_unbiased",
    ]
    method_ok = method in valid_methods
    if not method_ok:
        raise ValueError(
            f"`method` must be one of {valid_methods}. Got {method}."
        )

    subsample_ok = isinstance(subsample, int) and subsample >= 1
    if not subsample_ok:
        raise ValueError(
            f"`subsample` must be an integer greater than or equal to 1. "
            f"Got {subsample}."
        )

    random_state_ok = isinstance(random_state, int) and random_state >= 0
    if not random_state_ok:
        raise ValueError(
            f"`random_state` must be an integer greater than or equal to 0. "
            f"Got {random_state}."
        )

    dtype_ok = isinstance(dtype, type)
    if not dtype_ok:
        raise ValueError(
            f"`dtype` must be a valid numpy dtype. Got {dtype}."
        )

fit

fit(X)

Learn the bin edges based on quantiles from the training data.

Parameters:

Name Type Description Default
X numpy ndarray

The training data used to compute the quantiles.

required

Returns:

Type Description
None
Source code in skforecast/preprocessing/preprocessing.py
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
def fit(self, X: np.ndarray):
    """
    Compute the quantile-based bin edges from the training data.

    When `X` has more than `subsample` elements along its first axis,
    `subsample` positions are drawn with a generator seeded by
    `random_state` and only those observations are used.

    Parameters
    ----------
    X : numpy ndarray
        The training data used to compute the quantiles.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `X` is empty.

    """

    if X.size == 0:
        raise ValueError("Input data `X` cannot be empty.")

    n_obs = len(X)
    if n_obs > self.subsample:
        sampler = np.random.default_rng(self.random_state)
        picked = sampler.integers(0, n_obs, self.subsample)
        X = X[picked]

    # n_bins + 1 evenly spaced percentiles delimit n_bins bins.
    quantile_levels = np.linspace(0, 100, self.n_bins + 1)
    edges = np.percentile(a=X, q=quantile_levels, method=self.method)

    self.bin_edges_ = edges
    self.n_bins_ = len(edges) - 1
    # Map each bin index (as float) to its (lower, upper) edges.
    self.intervals_ = {
        float(bin_id): (float(lower), float(upper))
        for bin_id, (lower, upper) in enumerate(zip(edges[:-1], edges[1:]))
    }

transform

transform(X)

Assign new data to the learned bins.

Parameters:

Name Type Description Default
X numpy ndarray

The data to assign to the bins.

required

Returns:

Name Type Description
bin_indices numpy ndarray

The indices of the bins each value belongs to. Values less than the smallest bin edge are assigned to the first bin, and values greater than the largest bin edge are assigned to the last bin.

Source code in skforecast/preprocessing/preprocessing.py
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
def transform(self, X: np.ndarray):
    """
    Map each value of `X` to the index of the learned bin it falls in.

    Parameters
    ----------
    X : numpy ndarray
        The data to assign to the bins.

    Returns
    -------
    bin_indices : numpy ndarray
        Zero-based bin index of every value, cast to `dtype`. Values less
        than the smallest bin edge are assigned to the first bin, and
        values greater than the largest bin edge are assigned to the last
        bin.

    Raises
    ------
    NotFittedError
        If `fit` has not been called yet (`bin_edges_` is None).

    """

    if self.bin_edges_ is None:
        raise NotFittedError(
            "The model has not been fitted yet. Call 'fit' with training data first."
        )

    # digitize is one-based here: 0 means "below the first edge" and
    # len(edges) means "at or above the last edge".
    positions = np.digitize(X, bins=self.bin_edges_, right=False)
    # Fold both out-of-range codes into the first and last bins.
    positions = np.clip(positions, 1, self.n_bins_)

    return positions.astype(self.dtype) - 1

fit_transform

fit_transform(X)

Fit the model to the data and return the bin indices for the same data.

Parameters:

Name Type Description Default
X ndarray

The data to fit and transform.

required

Returns:

Name Type Description
bin_indices ndarray

The indices of the bins each value belongs to. Values less than the smallest bin edge are assigned to the first bin, and values greater than the largest bin edge are assigned to the last bin.

Source code in skforecast/preprocessing/preprocessing.py
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
def fit_transform(self, X):
    """
    Learn the bin edges from `X` and then bin that same data.

    Parameters
    ----------
    X : numpy.ndarray
        The data to fit and transform.

    Returns
    -------
    bin_indices : numpy.ndarray
        The result of calling `transform` on `X` right after `fit`.
        Values less than the smallest bin edge are assigned to the first
        bin, and values greater than the largest bin edge are assigned to
        the last bin.

    """

    self.fit(X)
    bin_indices = self.transform(X)

    return bin_indices

get_params

get_params()

Get the parameters of the quantile binner.

Parameters:

Name Type Description Default
self
required

Returns:

Name Type Description
params dict

A dictionary of the parameters of the quantile binner.

Source code in skforecast/preprocessing/preprocessing.py
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
def get_params(self):
    """
    Return the configuration of the quantile binner.

    Parameters
    ----------
    self

    Returns
    -------
    params : dict
        A dictionary with the keys 'n_bins', 'method', 'subsample',
        'dtype' and 'random_state', each mapped to the current value of
        the attribute of the same name.

    """

    param_names = ("n_bins", "method", "subsample", "dtype", "random_state")

    return {name: getattr(self, name) for name in param_names}

set_params

set_params(**params)

Set the parameters of the QuantileBinner.

Parameters:

Name Type Description Default
params dict

A dictionary of the parameters to set.

{}

Returns:

Type Description
None
Source code in skforecast/preprocessing/preprocessing.py
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
def set_params(self, **params):
    """
    Update attributes of the QuantileBinner.

    Every keyword argument is assigned to the instance attribute of the
    same name. The values are stored as given.

    Parameters
    ----------
    params : dict
        A dictionary of the parameters to set.

    Returns
    -------
    None

    """

    for attr_name, new_value in params.items():
        setattr(self, attr_name, new_value)