Skip to content

Datasets

skforecast.datasets.fetch_dataset

fetch_dataset(
    name,
    version="latest",
    raw=False,
    kwargs_read_csv={},
    verbose=True,
)

Fetch a dataset from the skforecast-datasets repository.

Parameters:

Name Type Description Default
name str

Name of the dataset to fetch.

required
version str | int

Version of the dataset to fetch. If 'latest', the latest version will be fetched (the one in the main branch). For a list of available versions, see the repository branches.

'latest'
raw bool

If True, the raw dataset is fetched. If False, the preprocessed dataset is fetched. Preprocessing consists of setting the date/time column as the index, converting the index to datetime, and setting a frequency on the index.

False
kwargs_read_csv dict

Kwargs to pass to pandas read_csv function.

{}
verbose bool

If True, print information about the dataset.

True

Returns:

Name Type Description
df pandas DataFrame

Dataset.

Source code in skforecast/datasets/datasets.py
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
def fetch_dataset(
    name: str,
    version: str | int = 'latest',
    raw: bool = False,
    kwargs_read_csv: dict = {},
    verbose: bool = True
) -> pd.DataFrame:
    """
    Fetch a dataset from the skforecast-datasets repository.

    Parameters
    ----------
    name: str
        Name of the dataset to fetch.
    version: str, int, default `'latest'`
        Version of the dataset to fetch. If 'latest', the latest version will be 
        fetched (the one in the main branch). For a list of available versions, 
        see the repository branches.
    raw: bool, default `False`
        If True, the raw dataset is fetched. If False, the preprocessed dataset 
        is fetched. The preprocessing consists of setting the column with the 
        date/time as index and converting the index to datetime. A frequency is 
        also set to the index. Preprocessing is best-effort: if any step fails, 
        the remaining steps are skipped and the data is returned as it was at 
        the point of failure.
    kwargs_read_csv: dict, default `{}`
        Kwargs to pass to pandas `read_csv` function. Only used when the 
        dataset is stored as a single csv file. The dict is never modified.
    verbose: bool, default `True`
        If True, print information about the dataset.

    Returns
    -------
    df: pandas DataFrame
        Dataset.

    Raises
    ------
    ValueError
        If `name` is not a known dataset, if the dataset declares a file type 
        other than 'csv' or 'parquet', or if the data cannot be read from its 
        url (the original exception is attached as `__cause__`).

    """

    # Local import: only needed to parse the pandas version string below.
    import re

    version = 'main' if version == 'latest' else f'{version}'

    if name not in datasets:
        raise ValueError(
            f"Dataset '{name}' not found. "
            f"Available datasets are: {sorted(datasets.keys())}"
        )

    dataset_info = datasets[name]
    url = dataset_info['url']
    # When `url` is a list of partitions this is a list-membership test, so
    # `format` is only ever called on a single templated string url.
    if '{version}' in url:
        url = url.format(version=version)
    file_type = dataset_info['file_type']

    if not isinstance(url, list):
        parsed = urlparse(url)
        if parsed.scheme == "https" and parsed.netloc == "drive.google.com":
            # Rewrite the Google Drive link into a direct-download url using
            # the second-to-last path segment as the file id.
            file_id = url.split('/')[-2]
            url = 'https://drive.google.com/uc?id=' + file_id

        # Fail early with a clear message instead of leaving `df` unbound.
        if file_type not in ('csv', 'parquet'):
            raise ValueError(
                f"Unsupported file type '{file_type}' for dataset '{name}'. "
                f"Supported file types are: ['csv', 'parquet']"
            )

        try:
            if file_type == 'csv':
                sep = dataset_info['sep']
                df = pd.read_csv(url, sep=sep, **kwargs_read_csv)
            else:
                df = pd.read_parquet(url)
        except Exception as e:
            raise ValueError(
                f"Error reading dataset '{name}' from {url}: {str(e)}."
            ) from e
    else:
        # Partitioned dataset: every entry is read as parquet from Google
        # Drive and the partitions are stacked row-wise.
        try:
            partitions = []
            for url_partition in url:
                path = (
                    'https://drive.google.com/uc?export=download&id='
                    + url_partition.split('/')[-2]
                )
                partitions.append(pd.read_parquet(path))
        except Exception as e:
            raise ValueError(
                f"Error reading dataset '{name}' from {url}: {str(e)}."
            ) from e
        df = pd.concat(partitions, axis=0).reset_index(drop=True)

    if not raw:
        try:
            index_col = dataset_info['index_col']
            freq = dataset_info['freq']
            date_format = dataset_info['date_format']
            if freq == 'H':
                # Compare (major, minor) numerically: a plain string comparison
                # would rank e.g. '2.10.0' or '10.0.0' below '2.2.0'.
                version_match = re.match(r'(\d+)\.(\d+)', pd.__version__)
                if version_match is not None:
                    major_minor = (
                        int(version_match.group(1)),
                        int(version_match.group(2)),
                    )
                    if major_minor >= (2, 2):
                        freq = "h"
            df = df.set_index(index_col)
            df.index = pd.to_datetime(df.index, format=date_format)
            df = df.asfreq(freq)
            df = df.sort_index()
        except Exception:
            # Deliberate best-effort: a dataset whose metadata or contents do
            # not allow this preprocessing is returned in whatever state it
            # reached. Only `Exception` is caught so that KeyboardInterrupt
            # and SystemExit still propagate.
            pass

    if verbose:
        _print_dataset_info(name, version=version, shape=df.shape)

    return df

skforecast.datasets.load_demo_dataset

load_demo_dataset(version='latest')

Load the demo dataset of monthly expenditure ($AUD) on corticosteroid drugs by the Australian health system between 1991 and 2008. Obtained from the book Forecasting: Principles and Practice by Rob J Hyndman and George Athanasopoulos. The index is set to datetime with monthly frequency and sorted.

Parameters:

Name Type Description Default
version str

Version of the dataset to fetch. If 'latest', the latest version will be fetched (the one in the main branch). For a list of available versions, see the repository branches.

'latest'

Returns:

Name Type Description
df pandas Series

Dataset.

Source code in skforecast/datasets/datasets.py
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
def load_demo_dataset(version: str = 'latest') -> pd.Series:
    """
    Load the demo dataset: monthly expenditure ($AUD) on corticosteroid drugs
    by the Australian health system between 1991 and 2008. Obtained from the
    book: Forecasting: Principles and Practice by Rob J Hyndman and George
    Athanasopoulos. The returned series has a datetime index with monthly
    frequency, sorted.

    Parameters
    ----------
    version: str, default `'latest'`
        Version of the dataset to fetch. If 'latest', the latest version will be
        fetched (the one in the main branch). For a list of available versions,
        see the repository branches.

    Returns
    -------
    df: pandas Series
        Dataset.

    """

    # 'latest' is an alias for the main branch; anything else is used verbatim.
    if version == 'latest':
        branch = 'main'
    else:
        branch = f'{version}'

    csv_url = (
        f'https://raw.githubusercontent.com/skforecast/skforecast-datasets/{branch}/'
        'data/h2o.csv'
    )

    # The file's own header row is discarded and replaced by these names.
    frame = pd.read_csv(csv_url, sep=',', header=0, names=['y', 'datetime'])
    frame['datetime'] = pd.to_datetime(frame['datetime'], format='%Y-%m-%d')

    # Index by date, set month-start frequency, keep the value column, sort.
    series = frame.set_index('datetime').asfreq('MS')['y']

    return series.sort_index()

skforecast.datasets.show_datasets_info

show_datasets_info(datasets_names=None, version='latest')

Print information about available datasets. If datasets_names is provided, only information about those datasets will be printed.

Parameters:

Name Type Description Default
datasets_names list[str] | None

List of dataset names to display information about. If None, information about all datasets will be displayed.

None
version str

Version of the datasets to display information about.

'latest'

Returns:

Type Description
None
Source code in skforecast/datasets/datasets.py
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
def show_datasets_info(
    datasets_names: list[str] | None = None,
    version: str = 'latest'
) -> None:
    """
    Print information about available datasets. When `datasets_names` is given,
    only those datasets are reported; names that are not available produce a
    short notice instead.

    Parameters
    ----------
    datasets_names: list, default None
        List of dataset names to display information about. If None (or 
        empty), information about all datasets will be displayed, in sorted 
        name order.
    version: str
        Version of the datasets to display information about. 'latest' is 
        translated to 'main'.

    Returns
    -------
    None

    """

    # Fall back to every known dataset when no (or an empty) selection is given.
    if datasets_names:
        selected_names = datasets_names
    else:
        selected_names = sorted(datasets.keys())

    if version == 'latest':
        branch = 'main'
    else:
        branch = f'{version}'

    for selected in selected_names:
        if selected not in datasets:
            print(
                f"Dataset '{selected}' not available. Set argument "
                f"`datasets_names` to None to see all available datasets."
            )
            continue
        _print_dataset_info(selected, version=branch)