A utility class for splitting time series data into training, validation,
and testing sets for machine learning algorithms.
This class provides flexible splitting strategies supporting multiple input
formats (wide DataFrame, long DataFrame with MultiIndex, or dictionary of Series),
both DatetimeIndex and RangeIndex, and flexible output formats.
New in this version: Support for multiple series arguments with independent
splitting behavior. Each series can have different lengths and date ranges.
Parameters:
Name
Type
Description
Default
*series
DataFrame | dict[str, Series | DataFrame]
One or more time series data to split. Each can be:
- Wide format pandas DataFrame with DatetimeIndex or RangeIndex
- Long format pandas DataFrame with MultiIndex (series_id, datetime)
- Dictionary of pandas Series or DataFrames with identical indexes
When multiple series are provided, they are treated independently and
splits are returned as a list of tuples (one tuple per series group).
def__init__(self,*series:pd.DataFrame|dict[str,pd.Series|pd.DataFrame])->None:""" Initialize TimeSeriesSplitter with one or more series. Parameters ---------- *series : pd.DataFrame | dict[str, pd.Series | pd.DataFrame] One or more time series data in supported formats. Raises ------ ValueError If no series provided or series have invalid format. TypeError If series are not in a supported format. """iflen(series)==0:raiseValueError('At least one series must be provided.')# -- Process each series argument independentlyself.series_groups_=[]self.series_indexes_=[]self.index_types_=[]self.index_freqs_=[]self._min_indexes_=[]self._max_indexes_=[]fori,series_inputinenumerate(series):# Use inner check_preprocess_series() preprocessing for each groupseries_dict,series_indexes=check_preprocess_series(series_input)# -- Store the preprocess series data dict & index dictself.series_groups_.append(series_dict)self.series_indexes_.append(series_indexes)# -- Store index type and frequency information for this groupfirst_index=next(iter(series_indexes.values()))index_type=type(first_index)self.index_types_.append(index_type)ifisinstance(first_index,pd.DatetimeIndex):self.index_freqs_.append(first_index.freq)self._min_indexes_.append(min([idx.min()foridxinseries_indexes.values()]))self._max_indexes_.append(max([idx.max()foridxinseries_indexes.values()]))ifisinstance(first_index,pd.RangeIndex):self.index_freqs_.append(first_index.step)self._min_indexes_.append(0)self._max_indexes_.append(len(first_index)-1)# -- Store the number groups/series inputself.n_groups_=len(series)self.n_timeseries=sum(map(len,self.series_indexes_))# -- Store version informationself.skforecast_version=__version__self.python_version=sys.version.split(' ')[0]
def_convert_date_to_position(self,date:str|pd.Timestamp,index:pd.Index,date_name:str='date',)->int:""" Convert a date to its position in the given index. Parameters ---------- date : str | pd.Timestamp Date to convert. index : pd.Index Index to search in. date_name : str, default 'date' Name of the date parameter (for error messages). Returns ------- int Position of the date in the index. Raises ------ ValueError If date is outside the valid range. """# -- Convert any string date input to Timestamp objectifisinstance(date,str):date=pd.Timestamp(date)# -- Raise error if data is not in required time rangeifdatenotinindex:raiseValueError(f'{date_name}{date} is not present in the series index. 'f'Available range: {index[0]} to {index[-1]}.')# -- Extract the numeric positionreturnindex.get_loc(date)
def _validate_date_split_args(
    self,
    group_idx: int,
    start_train: str | pd.Timestamp | None,
    end_train: str | pd.Timestamp,
    end_validation: str | pd.Timestamp | None,
    end_test: str | pd.Timestamp | None,
) -> tuple[int, int, int | None, int | None]:
    """
    Validate and convert date split arguments to positions for a specific
    group.

    Parameters
    ----------
    group_idx : int
        Index of the series group to validate.
    start_train : str | pd.Timestamp | None
        Training start date; defaults to the first position when None.
    end_train : str | pd.Timestamp
        Training end date (required).
    end_validation : str | pd.Timestamp | None
        Validation end date; defaults to `end_train` when None.
    end_test : str | pd.Timestamp | None
        Test end date; defaults to the last position when None.

    Returns
    -------
    tuple[int, int, int | None, int | None]
        Positions for (start_train, end_train, end_validation, end_test).

    Raises
    ------
    TypeError
        If the group's index is not a DatetimeIndex.
    ValueError
        If the resolved boundaries are out of order.
    """
    # -- A date-based split only makes sense on a DatetimeIndex
    if self.index_types_[group_idx] != pd.DatetimeIndex:
        raise TypeError(
            f'Group {group_idx}: `split_by_date` requires `DatetimeIndex` object. '
            f'Current index type: {self.index_types_[group_idx].__name__}. '
            'Consider using `split_by_size` instead.'
        )

    idx = next(iter(self.series_indexes_[group_idx].values()))

    # -- Resolve each boundary to an integer position, applying defaults
    pos_start = (
        0 if start_train is None
        else self._convert_date_to_position(start_train, idx, 'start_train')
    )
    pos_train = self._convert_date_to_position(end_train, idx, 'end_train')
    pos_val = (
        pos_train if end_validation is None
        else self._convert_date_to_position(end_validation, idx, 'end_validation')
    )
    pos_test = (
        len(idx) - 1 if end_test is None
        else self._convert_date_to_position(end_test, idx, 'end_test')
    )

    # -- Boundaries must satisfy: start < train <= validation <= test
    if pos_start >= pos_train:
        raise ValueError(
            f'Group {group_idx}: start_train must be earlier than end_train. '
            f'Got start_train={idx[pos_start]}, '
            f'end_train={idx[pos_train]}.'
        )
    if pos_train > pos_val:
        raise ValueError(
            f'Group {group_idx}: end_train must be earlier than or equal to end_validation. '
            f'Got end_train={idx[pos_train]}, '
            f'end_validation={idx[pos_val]}.'
        )
    if pos_val > pos_test:
        raise ValueError(
            f'Group {group_idx}: end_validation must be earlier than or equal to end_test. '
            f'Got end_validation={idx[pos_val]}, '
            f'end_test={idx[pos_test]}.'
        )

    return pos_start, pos_train, pos_val, pos_test
Convert a size specification to an absolute integer count.
This method handles both absolute (integer) and proportional (float)
size specifications, validating the input and converting proportions
to actual counts based on the total length.
Parameters:
Name
Type
Description
Default
size
int | float | None
Size specification to convert:
- If int: Absolute count (returned as-is after validation)
- If float: Proportion of total_len (must be between 0 and 1)
- If None: Returns None (indicates no size specified)
required
size_name
str
Name of the size parameter (e.g., 'train_size', 'validation_size').
Used in error messages for clarity.
required
total_len
int
Total length of the series against which proportions are calculated.
required
group_idx
int | None
Index of the series group being processed. If provided, it's included
in error messages for multi-group scenarios.
None
Returns:
Type
Description
int | None
Absolute count as integer, or None if size was None.
For float inputs, uses ceiling to ensure at least the requested
proportion is included.
Source code in skforecast\experimental\_splitter.py
def_convert_size(self,size:int|float|None,size_name:str,total_len:int,group_idx:int|None=None,)->int|None:""" Convert a size specification to an absolute integer count. This method handles both absolute (integer) and proportional (float) size specifications, validating the input and converting proportions to actual counts based on the total length. Parameters ---------- size : int | float | None Size specification to convert: - If int: Absolute count (returned as-is after validation) - If float: Proportion of total_len (must be between 0 and 1) - If None: Returns None (indicates no size specified) size_name : str Name of the size parameter (e.g., 'train_size', 'validation_size'). Used in error messages for clarity. total_len : int Total length of the series against which proportions are calculated. group_idx : int | None, default None Index of the series group being processed. If provided, it's included in error messages for multi-group scenarios. Returns ------- int | None Absolute count as integer, or None if size was None. For float inputs, uses ceiling to ensure at least the requested proportion is included. """ifsizeisNone:returnNone# -- Build error message prefix with optional group indexerror_prefix=f'Group {group_idx}: 'ifgroup_idxisnotNoneelse''ifisinstance(size,float):# -- Validate proportion is in valid rangeifnot0<size<1:raiseValueError(f'{error_prefix}{size_name} proportion must be between 0 and 1. 'f'Got {size}.')# -- Convert proportion to count using ceiling to ensure minimum coveragereturnint(np.ceil(size*total_len))# -- Return integer size as-isreturnint(size)
def _validate_size_split_args(
    self,
    group_idx: int,
    train_size: int | float,
    validation_size: int | float | None,
    test_size: int | float | None,
) -> tuple[int, int | None, int | None]:
    """
    Validate and convert size split arguments for a specific group.

    Parameters
    ----------
    group_idx : int
        Index of the series group to validate.
    train_size : int | float
        Training set size (absolute or proportional).
    validation_size : int | float | None
        Validation set size, or None for no validation set.
    test_size : int | float | None
        Test set size, or None to use the remainder.

    Returns
    -------
    tuple[int, int | None, int | None]
        Absolute sizes for (train, validation, test).

    Raises
    ------
    ValueError
        If the combined requested sizes exceed the series length.
    """
    # -- The first index of the group defines the available length
    sample_index = next(iter(self.series_indexes_[group_idx].values()))
    total_len = len(sample_index)

    # -- Convert each size spec (int, float proportion, or None) to a count
    train_count = self._convert_size(train_size, 'train_size', total_len, group_idx)
    validation_count = self._convert_size(validation_size, 'validation_size', total_len, group_idx)
    test_count = self._convert_size(test_size, 'test_size', total_len, group_idx)

    # -- The combined request must fit within the series (None counts as 0)
    total_requested = train_count + (validation_count or 0) + (test_count or 0)
    if total_requested > total_len:
        raise ValueError(
            f'Group {group_idx}: Sum of requested sizes ({total_requested}) '
            f'exceeds series length ({total_len}). '
            f'Got train_size={train_count}, validation_size={validation_count}, '
            f'test_size={test_count}.'
        )

    return train_count, validation_count, test_count
def_split_series_dict(self,series_dict:dict[str,pd.Series],positions:dict[str,tuple[int,int]],)->list[dict[str,pd.Series]]:""" Split a single series dictionary according to positions. Parameters ---------- series_dict : dict[str, pd.Series] Dictionary of series to split. positions : dict[str, tuple[int, int]] Start and end positions for each split. Returns ------- list[dict[str, pd.Series]] List of dictionaries, one for each split. """# -- Collect series idssplit_names=list(positions.keys())split_data={name:{}fornameinsplit_names}forseries_name,seriesinseries_dict.items():forsplit_nameinsplit_names:start,end=positions[split_name]split_data[split_name][series_name]=series.iloc[start:end+1].copy()return[split_data[name]fornameinsplit_names]
def_convert_output(self,split_dicts:list[dict[str,pd.Series]],output_format:Literal['wide','long','long_multi_index','dict']='wide',)->tuple:""" Convert split data to requested output format. Parameters ---------- split_dicts : list[dict[str, pd.Series]] List of split data as dictionaries. output_format : {'wide', 'long', 'long_multi_index', 'dict'}, default 'wide' Output format. Returns ------- tuple Splits in requested format. """matchoutput_format:case'dict':returntuple(split_dicts)case'wide':returntuple(pd.DataFrame.from_dict(split_dict)forsplit_dictinsplit_dicts)case'long'|'long_multi_index':returntuple(reshape_series_wide_to_long(pd.DataFrame.from_dict(split_dict),return_multi_index=(output_format=='long_multi_index'),)forsplit_dictinsplit_dicts)case_:raiseValueError(f'Output format `{output_format}` is not supported. 'f'Choose one of ["wide", "long", "long_multi_index", "dict"].')
Creates training, validation (optional), and test sets by splitting
series at specified date boundaries. Dates are inclusive.
When multiple series groups were provided to the constructor, this method
returns a list of tuples (one per group). Each group is split independently
based on its own date range.
Parameters:
Name
Type
Description
Default
end_train
str | Timestamp
Training set end date (inclusive). Required parameter.
required
start_train
str | Timestamp | None
Training set start date (inclusive). Defaults to first date in each group.
None
end_validation
str | Timestamp | None
Validation set end date (inclusive).
Defaults to end_train if not provided (no validation set created).
None
end_test
str | Timestamp | None
Test set end date (inclusive).
Defaults to last date in each group.
None
output_format
('wide', 'long', 'long_multi_index', 'dict')
Output format for the splits.
'wide'
verbose
bool
If True, print detailed split information for each group.
False
Returns:
Type
Description
list[tuple] | tuple
If single series group: tuple of splits (train, test) or (train, val, test)
If multiple series groups: list of tuples, one per group
Raises:
Type
Description
TypeError
If series don't have DatetimeIndex.
ValueError
If dates are invalid or outside available range.
Examples:
>>> # Single group
>>> splitter = TimeSeriesSplitter(df1)
>>> train, test = splitter.split_by_date(end_train='2023-03-11')
def split_by_date(
    self,
    end_train: str | pd.Timestamp,
    start_train: str | pd.Timestamp | None = None,
    end_validation: str | pd.Timestamp | None = None,
    end_test: str | pd.Timestamp | None = None,
    output_format: Literal['wide', 'long', 'long_multi_index', 'dict'] = 'wide',
    verbose: bool = False,
) -> list[tuple] | tuple:
    """
    Split time series based on date ranges.

    Creates training, validation (optional), and test sets by splitting
    series at specified date boundaries. Dates are inclusive. When multiple
    series groups were provided to the constructor, each group is split
    independently based on its own date range.

    Parameters
    ----------
    end_train : str | pd.Timestamp
        Training set end date (inclusive). Required parameter.
    start_train : str | pd.Timestamp | None, default None
        Training set start date (inclusive). Defaults to first date in each group.
    end_validation : str | pd.Timestamp | None, default None
        Validation set end date (inclusive). If None, no validation set is created.
    end_test : str | pd.Timestamp | None, default None
        Test set end date (inclusive). Defaults to last date in each group.
    output_format : {'wide', 'long', 'long_multi_index', 'dict'}, default 'wide'
        Output format for the splits.
    verbose : bool, default False
        If True, print detailed split information for each group.

    Returns
    -------
    list[tuple] | tuple
        If single series group: tuple of splits (train, test) or (train, val, test).
        If multiple series groups: list of tuples, one per group.

    Raises
    ------
    TypeError
        If series don't have DatetimeIndex.
    ValueError
        If dates are invalid or outside available range.

    Examples
    --------
    >>> splitter = TimeSeriesSplitter(df1)
    >>> train, test = splitter.split_by_date(end_train='2023-03-11')
    """
    results = []
    for g in range(self.n_groups_):
        # -- Resolve the date boundaries to integer positions for this group
        p_start, p_train, p_val, p_test = self._validate_date_split_args(
            g, start_train, end_train, end_validation, end_test
        )

        # -- Build the inclusive (start, end) position map for each split
        if end_validation is None:
            positions = {
                'train': (p_start, p_train),
                'test': (p_train + 1, p_test),
            }
        else:
            positions = {
                'train': (p_start, p_train),
                'validation': (p_train + 1, p_val),
                'test': (p_val + 1, p_test),
            }

        # -- Slice every series, dropping any that end up empty (series in a
        # group may cover different date ranges)
        split_dicts = [
            {name: s for name, s in d.items() if len(s) > 0}
            for d in self._split_series_dict(self.series_groups_[g], positions)
        ]

        # -- Convert to the requested output format
        results.append(self._convert_output(split_dicts, output_format))

        if verbose:
            self._print_split_info(g, positions, output_format)

    # -- Single tuple for one group, otherwise a list of tuples
    return results if self.n_groups_ > 1 else results[0]
Split time series based on size (absolute or proportional).
Creates training, validation (optional), and test sets by splitting
series at specified size boundaries. Sizes can be absolute (int) or
proportional (float between 0 and 1).
When multiple series groups were provided to the constructor, this method
returns a list of tuples (one per group). Each group is split independently
based on its own length.
Parameters:
Name
Type
Description
Default
train_size
int | float
Training set size. If int, absolute count. If float, proportion of total.
required
validation_size
int | float | None
Validation set size. Same as train_size.
If None, no validation set is created.
None
test_size
int | float | None
Test set size. Same as train_size.
If None, remainder is used as test set.
None
output_format
('wide', 'long', 'long_multi_index', 'dict')
Output format for the splits.
'wide'
verbose
bool
If True, print detailed split information for each group.
False
Returns:
Type
Description
list[tuple] | tuple
If single series group: tuple of splits (train, test) or (train, val, test)
If multiple series groups: list of tuples, one per group
Raises:
Type
Description
ValueError
If sizes are invalid or exceed series length.
Examples:
>>> # Single group with proportions
>>> splitter = TimeSeriesSplitter(df1)
>>> train, test = splitter.split_by_size(train_size=0.8)
>>> # Multiple groups with absolute sizes
>>> splitter = TimeSeriesSplitter(df1, df2, df3)
>>> splits = splitter.split_by_size(train_size=70, test_size=30)
>>> # Each group split with 70 training samples and 30 test samples
Source code in skforecast\experimental\_splitter.py
def split_by_size(
    self,
    train_size: int | float,
    validation_size: int | float | None = None,
    test_size: int | float | None = None,
    output_format: Literal['wide', 'long', 'long_multi_index', 'dict'] = 'wide',
    verbose: bool = False,
) -> list[tuple] | tuple:
    """
    Split time series based on size (absolute or proportional).

    Creates training, validation (optional), and test sets by splitting
    series at specified size boundaries. Sizes can be absolute (int) or
    proportional (float between 0 and 1). When multiple series groups were
    provided to the constructor, each group is split independently based on
    its own length.

    Parameters
    ----------
    train_size : int | float
        Training set size. If int, absolute count. If float, proportion of total.
    validation_size : int | float | None, default None
        Validation set size. Same as train_size.
        If None, no validation set is created.
    test_size : int | float | None, default None
        Test set size. Same as train_size.
        If None, remainder is used as test set.
    output_format : {'wide', 'long', 'long_multi_index', 'dict'}, default 'wide'
        Output format for the splits.
    verbose : bool, default False
        If True, print detailed split information for each group.

    Returns
    -------
    list[tuple] | tuple
        If single series group: tuple of splits (train, test) or (train, val, test).
        If multiple series groups: list of tuples, one per group.

    Raises
    ------
    ValueError
        If sizes are invalid or exceed series length.

    Examples
    --------
    >>> splitter = TimeSeriesSplitter(df1)
    >>> train, test = splitter.split_by_size(train_size=0.8)
    """
    results = []
    for group_idx in range(self.n_groups_):
        # -- Validate and get absolute counts for current group
        train_count, val_count, test_count = self._validate_size_split_args(
            group_idx, train_size, validation_size, test_size
        )

        # -- Total length for current group (taken from its first index)
        first_index = next(iter(self.series_indexes_[group_idx].values()))
        total_len = len(first_index)

        # -- Compute inclusive end positions
        train_end = train_count - 1
        val_end = train_end + (val_count if val_count is not None else 0)
        # FIX: honor an explicit test_size. Previously test_count was
        # validated but ignored and the test set always consumed the
        # remainder of the series, contradicting the documented behavior
        # ("If None, remainder is used as test set").
        if test_count is not None:
            test_end = val_end + test_count
        else:
            test_end = total_len - 1

        positions = {'train': (0, train_end)}
        if val_count is not None:
            positions['validation'] = (train_end + 1, val_end)
        # When no validation set exists, val_end == train_end, so the test
        # split starts right after training in both cases.
        positions['test'] = (val_end + 1, test_end)

        # -- Perform split on current group and convert to requested output
        split_dicts = self._split_series_dict(self.series_groups_[group_idx], positions)
        result = self._convert_output(split_dicts, output_format)
        if verbose:
            self._print_split_info(group_idx, positions, output_format)
        results.append(result)

    # Return single tuple if only one group, otherwise list of tuples
    return results[0] if self.n_groups_ == 1 else results
def _print_split_info(
    self,
    group_idx: int,
    positions: dict[str, tuple[int, int]],
    output_format: str,
) -> None:
    """
    Print detailed split information for a specific group.

    Parameters
    ----------
    group_idx : int
        Index of the series group.
    positions : dict[str, tuple[int, int]]
        Position ranges for each split.
    output_format : str
        Output format being used.
    """
    # -- Take the group's first index as the reference for dates/length
    sample_index = next(iter(self.series_indexes_[group_idx].values()))
    n = len(sample_index)
    has_dates = isinstance(sample_index, pd.DatetimeIndex)

    # -- Header
    print(f'Split Information (Group id: {group_idx})')
    print('=' * 32)

    for name, (start, end) in positions.items():
        length = max(0, end - start + 1)
        pct = (length / n * 100) if n > 0 else 0
        if has_dates:
            lo = sample_index[start] if start < n else 'N/A'
            hi = sample_index[end] if end < n else 'N/A'
            print(
                f'{name.capitalize():12} | '
                f'Range: {lo} to {hi} | '
                f'Length: {length} ({pct:.1f}%)'
            )
        else:
            print(
                f'{name.capitalize():12} | '
                f'Positions: {start} to {end} | '
                f'Length: {length} ({pct:.1f}%)'
            )

    # -- Trailing output-format line
    print(f'Output format: {output_format}', end='')
    # NOTE(review): this guard is always true for group_idx produced by
    # range(self.n_groups_); kept as-is to preserve the original output
    # byte-for-byte (possibly meant to be `group_idx < self.n_groups_ - 1`).
    if group_idx < self.n_groups_:
        print('\n')
Calculate the number of days to the next holiday and the number of days since
the last holiday.
Parameters:
Name
Type
Description
Default
df
pandas DataFrame
DataFrame containing the holiday data.
required
holiday_column
str
The name of the column indicating holidays (True/False), by default 'is_holiday'.
'is_holiday'
date_column
str
The name of the column containing the dates, by default 'date'.
'date'
fill_na
int | float
Value to fill for NaN values in the output columns, by default 0.
0.
Returns:
Name
Type
Description
df
DataFrame
DataFrame with additional columns for days to the next holiday ('days_to_holiday')
and days since the last holiday ('days_since_holiday').
Notes
The function assumes that the input df contains a boolean column indicating holidays
and a date column. It calculates the number of days to the next holiday and the number of
days since the last holiday for each date in the date column.
Source code in skforecast\experimental\_experimental.py
defcalculate_distance_from_holiday(df:pd.DataFrame,holiday_column:str='is_holiday',date_column:str='date',fill_na:int|float=0.)->pd.DataFrame:# pragma: no cover""" Calculate the number of days to the next holiday and the number of days since the last holiday. Parameters ---------- df : pandas DataFrame DataFrame containing the holiday data. holiday_column : str, default 'is_holiday' The name of the column indicating holidays (True/False), by default 'is_holiday'. date_column : str, default 'date' The name of the column containing the dates, by default 'date'. fill_na : int, float, default 0. Value to fill for NaN values in the output columns, by default 0. Returns ------- df : pd.DataFrame DataFrame with additional columns for days to the next holiday ('days_to_holiday') and days since the last holiday ('days_since_holiday'). Notes ----- The function assumes that the input `df` contains a boolean column indicating holidays and a date column. It calculates the number of days to the next holiday and the number of days since the last holiday for each date in the date column. 
"""df=df.reset_index(drop=True)df[date_column]=pd.to_datetime(df[date_column])dates=df[date_column].to_numpy()holiday_dates=df.loc[df[holiday_column],date_column].to_numpy()holiday_dates_sorted=np.sort(holiday_dates)# For next holiday (right side)next_idx=np.searchsorted(holiday_dates_sorted,dates,side='left')has_next=next_idx<len(holiday_dates_sorted)days_to_holiday=np.full(len(dates),np.nan)days_to_holiday[has_next]=(holiday_dates_sorted[next_idx[has_next]]-dates[has_next]).astype('timedelta64[D]').astype(int)# For previous holiday (left side)prev_idx=np.searchsorted(holiday_dates_sorted,dates,side='right')-1has_prev=prev_idx>=0days_since_holiday=np.full(len(dates),np.nan)days_since_holiday[has_prev]=(dates[has_prev]-holiday_dates_sorted[prev_idx[has_prev]]).astype('timedelta64[D]').astype(int)df["days_to_holiday"]=pd.Series(days_to_holiday,dtype="Int64").fillna(fill_na)df["days_since_holiday"]=pd.Series(days_since_holiday,dtype="Int64").fillna(fill_na)returndf