`Datasets`¶

skforecast.datasets.fetch_dataset ¶

fetch_dataset(
    name,
    version="latest",
    raw=False,
    kwargs_read_csv={},
    verbose=True,
)

Fetch a dataset from the skforecast-datasets repository.

Parameters:

Name	Type	Description	Default
`name`	`str`	Name of the dataset to fetch.	required
`version`	`str \| int`	Version of the dataset to fetch. If 'latest', the latest version will be fetched (the one in the main branch). For a list of available versions, see the repository branches.	`'latest'`
`raw`	`bool`	If True, the raw dataset is fetched. If False, the preprocessed dataset is fetched. The preprocessing consists of setting the column with the date/time as index and converting the index to datetime. A frequency is also set to the index.	`False`
`kwargs_read_csv`	`dict`	Kwargs to pass to pandas `read_csv` function.	`{}`
`verbose`	`bool`	If True, print information about the dataset.	`True`

Returns:

Name	Type	Description
`df`	`pandas DataFrame`	Dataset.

Source code in skforecast/datasets/datasets.py

def fetch_dataset(
    name: str,
    version: str | int = 'latest',
    raw: bool = False,
    kwargs_read_csv: dict = {},
    verbose: bool = True
) -> pd.DataFrame:
    """
    Fetch a dataset from the skforecast-datasets repository.

    Parameters
    ----------
    name: str
        Name of the dataset to fetch.
    version: str, int, default `'latest'`
        Version of the dataset to fetch. If 'latest', the latest version will be 
        fetched (the one in the main branch). For a list of available versions, 
        see the repository branches.
    raw: bool, default `False`
        If True, the raw dataset is fetched. If False, the preprocessed dataset 
        is fetched. The preprocessing consists of setting the column with the 
        date/time as index and converting the index to datetime. A frequency is 
        also set to the index. Preprocessing is best-effort: if any step fails, 
        the dataset is returned as it was when the failure occurred.
    kwargs_read_csv: dict, default `{}`
        Kwargs to pass to pandas `read_csv` function. Only used when the 
        dataset is stored as a csv file. The dict is never modified.
    verbose: bool, default `True`
        If True, print information about the dataset.

    Returns
    -------
    df: pandas DataFrame
        Dataset.

    Raises
    ------
    ValueError
        If `name` is not an available dataset, if the file type registered 
        for the dataset is not supported, or if the dataset cannot be read.

    """

    version = 'main' if version == 'latest' else f'{version}'

    if name not in datasets:
        raise ValueError(
            f"Dataset '{name}' not found. "
            f"Available datasets are: {sorted(datasets.keys())}"
        )

    url = datasets[name]['url']
    if '{version}' in url:
        url = url.format(version=version)
    file_type = datasets[name]['file_type']

    if not isinstance(url, list):
        parsed = urlparse(url)
        if parsed.scheme == "https" and parsed.netloc == "drive.google.com":
            # Rewrite the Google Drive sharing link as a direct-download link.
            file_id = url.split('/')[-2]
            url = 'https://drive.google.com/uc?id=' + file_id

        if file_type == 'csv':
            try:
                sep = datasets[name]['sep']
                df = pd.read_csv(url, sep=sep, **kwargs_read_csv)
            except Exception as e:
                raise ValueError(
                    f"Error reading dataset '{name}' from {url}: {str(e)}."
                ) from e
        elif file_type == 'parquet':
            try:
                df = pd.read_parquet(url)
            except Exception as e:
                raise ValueError(
                    f"Error reading dataset '{name}' from {url}: {str(e)}."
                ) from e
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # `df` further down.
            raise ValueError(
                f"Unsupported file type '{file_type}' for dataset '{name}'. "
                f"Supported file types are: ['csv', 'parquet']"
            )
    else:
        # A list of URLs means the dataset is split into several parquet
        # partitions hosted on Google Drive; they are stacked row-wise.
        try:
            partitions = [
                pd.read_parquet(
                    'https://drive.google.com/uc?export=download&id='
                    + url_partition.split('/')[-2]
                )
                for url_partition in url
            ]
        except Exception as e:
            raise ValueError(
                f"Error reading dataset '{name}' from {url}: {str(e)}."
            ) from e
        df = pd.concat(partitions, axis=0).reset_index(drop=True)

    if not raw:
        try:
            index_col = datasets[name]['index_col']
            freq = datasets[name]['freq']
            if freq == 'H':
                # Compare the version numerically: as strings, '2.10.0' sorts
                # before '2.2.0' and the deprecated 'H' alias would be kept.
                major, minor = (
                    int(part) for part in pd.__version__.split('.')[:2]
                )
                if (major, minor) >= (2, 2):
                    freq = "h"
            date_format = datasets[name]['date_format']
            df = df.set_index(index_col)
            df.index = pd.to_datetime(df.index, format=date_format)
            df = df.asfreq(freq)
            df = df.sort_index()
        except Exception:
            # Deliberate best-effort: if preprocessing fails, keep the data as
            # it is. Unlike a bare `except`, this lets KeyboardInterrupt and
            # SystemExit propagate.
            pass

    if verbose:
        _print_dataset_info(name, version=version, shape=df.shape)

    return df

skforecast.datasets.load_demo_dataset ¶

load_demo_dataset(version='latest')

Load the demo dataset: monthly expenditure ($AUD) on corticosteroid drugs by the Australian health system between 1991 and 2008. Obtained from the book: Forecasting: Principles and Practice by Rob J Hyndman and George Athanasopoulos. The index is set to datetime with monthly frequency and sorted.

Parameters:

Name	Type	Description	Default
`version`	`str`	Version of the dataset to fetch. If 'latest', the latest version will be fetched (the one in the main branch). For a list of available versions, see the repository branches.	`'latest'`

Returns:

Name	Type	Description
`df`	`pandas Series`	Dataset.

Source code in skforecast/datasets/datasets.py

def load_demo_dataset(version: str = 'latest') -> pd.Series:
    """
    Load the demo dataset: monthly expenditure ($AUD) on corticosteroid drugs
    by the Australian health system between 1991 and 2008. Obtained from the
    book: Forecasting: Principles and Practice by Rob J Hyndman and George
    Athanasopoulos. The returned series has a datetime index with monthly
    (month start) frequency, sorted in ascending order.

    Parameters
    ----------
    version: str, default `'latest'`
        Version of the dataset to fetch. If 'latest', the latest version will be
        fetched (the one in the main branch). For a list of available versions,
        see the repository branches.

    Returns
    -------
    df: pandas Series
        Dataset.

    """

    # 'latest' maps to the main branch; anything else is used as the branch name.
    if version == 'latest':
        version = 'main'
    else:
        version = f'{version}'

    url = (
        f'https://raw.githubusercontent.com/skforecast/skforecast-datasets/{version}/'
        'data/h2o.csv'
    )

    # The file header is discarded and the two columns are renamed.
    data = pd.read_csv(url, sep=',', header=0, names=['y', 'datetime'])
    data['datetime'] = pd.to_datetime(data['datetime'], format='%Y-%m-%d')

    # Index by date, set month-start frequency, keep the target column only.
    series = data.set_index('datetime').asfreq('MS')['y'].sort_index()

    return series

skforecast.datasets.show_datasets_info ¶

show_datasets_info(datasets_names=None, version='latest')

Print information about available datasets. If `datasets_names` is provided, only information about those datasets will be printed.

Parameters:

Name	Type	Description	Default
`datasets_names`	`list[str] \| None`	List of dataset names to display information about. If None, information about all datasets will be displayed.	`None`
`version`	`str`	Version of the datasets to display information about.	`'latest'`

Returns:

Type	Description
`None`

Source code in skforecast/datasets/datasets.py

def show_datasets_info(
    datasets_names: list[str] | None = None,
    version: str = 'latest'
) -> None:
    """
    Print information about available datasets. If `datasets_names` is provided,
    only information about those datasets will be printed.

    Parameters
    ----------
    datasets_names: list, default None
        List of dataset names to display information about. If None (or an 
        empty list), information about all datasets will be displayed, in 
        alphabetical order.
    version: str, default `'latest'`
        Version of the datasets to display information about. 'latest' refers 
        to the main branch.

    Returns
    -------
    None

    """

    if not datasets_names:
        datasets_names = sorted(datasets.keys())

    if version == 'latest':
        version = 'main'
    else:
        version = f'{version}'

    for dataset_name in datasets_names:
        if dataset_name not in datasets:
            # Unknown names are reported and skipped; they do not raise.
            print(
                f"Dataset '{dataset_name}' not available. Set argument "
                f"`datasets_names` to None to see all available datasets."
            )
            continue
        _print_dataset_info(dataset_name, version=version)

Datasets¶

skforecast.datasets.fetch_dataset ¶

skforecast.datasets.load_demo_dataset ¶

skforecast.datasets.show_datasets_info ¶

`Datasets`¶