`Datasets`¶

skforecast.datasets.fetch_dataset ¶


fetch_dataset(
    name,
    version="latest",
    raw=False,
    kwargs_read=None,
    verbose=True,
)

Fetch a dataset from the skforecast-datasets repository.

Parameters:

Name	Type	Description	Default
`name`	`str`	Name of the dataset to fetch.	required
`version`	`str \| int`	Version of the dataset to fetch. If 'latest', the latest version will be fetched (the one in the main branch). For a list of available versions, see the repository branches.	`'latest'`
`raw`	`bool`	If True, the raw dataset is fetched. If False, the preprocessed dataset is fetched. The preprocessing consists of setting the column with the date/time as index and converting the index to datetime. A frequency is also set to the index.	`False`
`kwargs_read`	`dict \| None`	Kwargs to pass to pandas `pd.read_csv` or `pd.read_parquet` function depending on the dataset file type.	`None`
`verbose`	`bool`	If True, print information about the dataset.	`True`

Returns:

Name	Type	Description
`df`	`pandas DataFrame`	Dataset.

Source code in skforecast/datasets/datasets.py

def fetch_dataset(
    name: str,
    version: str | int = 'latest',
    raw: bool = False,
    kwargs_read: dict | None = None,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Fetch a dataset from the skforecast-datasets repository.

    Parameters
    ----------
    name: str
        Name of the dataset to fetch.
    version: str, int, default 'latest'
        Version of the dataset to fetch. If 'latest', the latest version will be
        fetched (the one in the main branch). For a list of available versions,
        see the repository branches.
    raw: bool, default False
        If True, the raw dataset is fetched. If False, the preprocessed dataset
        is fetched. The preprocessing consists of setting the column with the
        date/time as index and converting the index to datetime. A frequency is
        also set to the index.
    kwargs_read: dict, default None
        Kwargs to pass to pandas `pd.read_csv` or `pd.read_parquet` function
        depending on the dataset file type.
    verbose: bool, default True
        If True, print information about the dataset.

    Returns
    -------
    df: pandas DataFrame
        Dataset.

    Raises
    ------
    ValueError
        If `name` is not an available dataset, if the file type registered
        for the dataset is not 'csv' or 'parquet', or if the file cannot be
        read (the original exception is chained as `__cause__`).

    Warns
    -----
    UserWarning
        If `raw` is False and the preprocessing step fails. In that case the
        dataset is returned in whatever state it was in when the failure
        happened.

    """

    kwargs_read = kwargs_read or {}
    version = 'main' if version == 'latest' else f'{version}'

    if name not in datasets:
        raise ValueError(
            f"Dataset '{name}' not found. "
            f"Available datasets are: {sorted(datasets)}"
        )

    dataset_info = datasets[name]
    url = dataset_info['url']
    if '{version}' in url:
        url = url.format(version=version)
    file_type = dataset_info['file_type']

    # Fail early with an explicit message: without this guard an unknown file
    # type would leave `df` undefined and surface later as an unrelated error.
    if file_type not in ('csv', 'parquet'):
        raise ValueError(
            f"Unsupported file type '{file_type}' for dataset '{name}'. "
            f"Supported file types are: ['csv', 'parquet']."
        )

    try:
        if file_type == 'csv':
            sep = dataset_info['sep']
            df = pd.read_csv(url, sep=sep, **kwargs_read)
        else:
            df = pd.read_parquet(url, **kwargs_read)
    except Exception as e:
        # Chain the original exception so the full traceback stays available.
        raise ValueError(
            f"Error reading dataset '{name}' from {url}: {str(e)}."
        ) from e

    if not raw:
        # Preprocessing is best-effort: any failure is reported as a warning
        # and the data read so far is still returned.
        try:
            index_col = dataset_info['index_col']
            freq = dataset_info['freq']
            # Older pandas (< 2.2) only understands the uppercase hourly alias.
            if freq == 'h' and tuple(int(x) for x in pd.__version__.split('.')[:2]) < (2, 2):
                freq = 'H'
            date_format = dataset_info['date_format']
            if df.index.name != index_col:
                df = df.set_index(index_col)
            df.index = pd.to_datetime(df.index, format=date_format)
            # A frequency is only set when the index has no duplicated entries.
            if df.index.is_unique:
                df = df.asfreq(freq)
            df = df.sort_index()
        except Exception as e:
            warnings.warn(
                f"Could not preprocess dataset '{name}': {e}",
                stacklevel=2
            )

    if verbose:
        _print_dataset_info(name, version=version, shape=df.shape)

    return df

skforecast.datasets.load_demo_dataset ¶


load_demo_dataset(version='latest', verbose=True)

Load the demo dataset: monthly expenditure ($AUD) on corticosteroid drugs by the Australian health system between 1991 and 2008. Obtained from the book Forecasting: Principles and Practice by Rob J Hyndman and George Athanasopoulos. The index is set to datetime with monthly frequency and sorted.

Parameters:

Name	Type	Description	Default
`version`	`str`	Version of the dataset to fetch. If 'latest', the latest version will be fetched (the one in the main branch). For a list of available versions, see the repository branches.	`'latest'`
`verbose`	`bool`	If True, print information about the dataset.	`True`

Returns:

Name	Type	Description
`df`	`pandas Series`	Dataset.

Source code in skforecast/datasets/datasets.py

def load_demo_dataset(
    version: str = 'latest', 
    verbose: bool = True
) -> pd.Series:
    """
    Load the demo dataset: monthly expenditure ($AUD) on corticosteroid drugs
    by the Australian health system between 1991 and 2008. Obtained from the
    book: Forecasting: Principles and Practice by Rob J Hyndman and George
    Athanasopoulos. The returned series has a datetime index with month-start
    ('MS') frequency, sorted in ascending order.

    Parameters
    ----------
    version: str, default 'latest'
        Version of the dataset to fetch. If 'latest', the latest version will be
        fetched (the one in the main branch). For a list of available versions,
        see the repository branches.
    verbose: bool, default True
        If True, print information about the dataset.

    Returns
    -------
    df: pandas Series
        Dataset.

    """

    # 'latest' is an alias of the repository main branch.
    if version == 'latest':
        branch = 'main'
    else:
        branch = f'{version}'

    csv_url = (
        f'https://raw.githubusercontent.com/skforecast/skforecast-datasets/{branch}/'
        'data/h2o.csv'
    )

    # The header row of the file is replaced by the names 'y' and 'datetime'.
    frame = pd.read_csv(csv_url, sep=',', header=0, names=['y', 'datetime'])
    frame['datetime'] = pd.to_datetime(frame['datetime'], format='%Y-%m-%d')
    series = frame.set_index('datetime').asfreq('MS')['y'].sort_index()

    if verbose:
        _print_dataset_info('h2o', version=branch, shape=series.shape)

    return series

skforecast.datasets.show_datasets_info ¶


show_datasets_info(datasets_names=None, version='latest')

Print information about available datasets. If datasets_names is provided, only information about those datasets will be printed.

Parameters:

Name	Type	Description	Default
`datasets_names`	`list[str] \| None`	List of dataset names to display information about. If None, information about all datasets will be displayed.	`None`
`version`	`str`	Version of the datasets to display information about. If 'latest', the latest version will be used (the one in the main branch). For a list of available versions, see the repository branches.	`'latest'`

Returns:

Type	Description
`None`

Source code in skforecast/datasets/datasets.py

def show_datasets_info(
    datasets_names: list[str] | None = None,
    version: str = 'latest'
) -> None:
    """
    Print information about available datasets. When `datasets_names` is
    given, only those datasets are reported; names that are not available
    produce a short notice instead.

    Parameters
    ----------
    datasets_names: list, default None
        List of dataset names to display information about. If None, information 
        about all datasets will be displayed.
    version: str, default 'latest'
        Version of the datasets to display information about. If 'latest', the
        latest version will be used (the one in the main branch). For a list of
        available versions, see the repository branches.

    Returns
    -------
    None

    """

    # None (or an empty list) falls back to every dataset, in sorted order.
    names_to_show = datasets_names if datasets_names else sorted(datasets)

    # 'latest' is an alias of the repository main branch.
    if version == 'latest':
        branch = 'main'
    else:
        branch = f'{version}'

    for current_name in names_to_show:
        if current_name not in datasets:
            print(
                f"Dataset '{current_name}' not available. Set argument "
                f"`datasets_names` to None to see all available datasets."
            )
            continue
        _print_dataset_info(current_name, version=branch)

Datasets¶

skforecast.datasets.fetch_dataset ¶

skforecast.datasets.load_demo_dataset ¶

skforecast.datasets.show_datasets_info ¶

`Datasets`¶