This document shows the profiling of the main classes, methods and functions available in skforecast. Understanding the bottlenecks will help to:
- Use it more efficiently
- Improve the code for future releases
Libraries and data
# Libraries
# ==============================================================================
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import platform
import psutil
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
import skforecast
from skforecast.recursive import ForecasterRecursive
from skforecast.direct import ForecasterDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster
%load_ext pyinstrument
# Versions
# ==============================================================================
# The trailing empty string reproduces the blank line that separates the
# version report from the system report.
print(
    "\n".join(
        [
            f"Python version : {platform.python_version()}",
            f"scikit-learn version: {sklearn.__version__}",
            f"skforecast version : {skforecast.__version__}",
            f"pandas version : {pd.__version__}",
            f"numpy version : {np.__version__}",
            f"psutil version : {psutil.__version__}",
            "",
        ]
    )
)
# System information
# ==============================================================================
print(
    "\n".join(
        [
            f"Machine type: {platform.machine()}",
            f"Processor type: {platform.processor()}",
            f"Platform type: {platform.platform()}",
            f"Operating system: {platform.system()}",
            f"Operating system release: {platform.release()}",
            f"Operating system version: {platform.version()}",
            f"Number of physical cores: {psutil.cpu_count(logical=False)}",
            f"Number of logical cores: {psutil.cpu_count(logical=True)}",
        ]
    )
)
A time series of length 1000 with random values is created.
# Data
# ==============================================================================
# Seed NumPy's global generator so the simulated series is reproducible.
np.random.seed(123)
n = 1_000
# Standard normal draws (mean 0, standard deviation 1), one per observation.
data = pd.Series(np.random.normal(loc=0.0, scale=1.0, size=n))
Dummy regressor
To isolate the training process of the regressor from the other parts of the code, a dummy regressor class is created. This dummy regressor has a fit method that does nothing, and a predict method that returns a constant value.
class DummyRegressor(LinearRegression):
    """
    Dummy regressor with a no-op fit and a constant predict.

    It replaces a real model when profiling, so that the time reported by the
    profiler is not dominated by the training or inference of the regressor.
    """

    def fit(self, X, y):
        """
        Do nothing and return the estimator itself.

        Parameters
        ----------
        X : ignored
            Training predictors. Not used.
        y : ignored
            Training target. Not used.

        Returns
        -------
        self : DummyRegressor
            The same, untouched, instance. Returning `self` follows the
            scikit-learn estimator convention, so callers that use the value
            returned by `fit` do not receive `None`.
        """
        return self

    def predict(self, y):
        """
        Return a constant prediction of 1.0 for each element of the input.

        Parameters
        ----------
        y : sized object
            Only its length is used.
            NOTE(review): callers presumably pass the predictors matrix here;
            the name `y` is kept so that existing keyword calls keep working.

        Returns
        -------
        predictions : numpy ndarray
            One-dimensional array of ones with `len(y)` elements.
        """
        return np.ones(shape=len(y))
Profiling fit
%%pyinstrument
# Forecaster with the no-op regressor: the profile reflects only the work done
# around the regressor, not the training of a model.
forecaster = ForecasterRecursive(regressor=DummyRegressor(), lags=24)
forecaster.fit(y=data)
c:\Users\jaesc2\Miniconda3\envs\skforecast_py11\Lib\site-packages\sklearn\preprocessing\_discretization.py:278: UserWarning: Feature 0 is constant and will be replaced with 0. warnings.warn(
Almost all of the time spent by fit
is required by the create_train_X_y
method.
%%pyinstrument
# Same forecaster, this time with a real gradient boosting regressor.
forecaster = ForecasterRecursive(
    lags=24,
    regressor=HistGradientBoostingRegressor(random_state=123, max_iter=10),
)
forecaster.fit(y=data)
When training a forecaster with a real machine learning regressor, the time spent by create_train_X_y
is negligible compared to the time needed by the fit
method of the regressor. Therefore, improving the speed of create_train_X_y
will not have much impact.
Profiling create_train_X_y
Understand how the create_train_X_y
method is influenced by the length of the series and the number of lags.
# Profiling `create_train_X_y` for different length of series and number of lags
# ======================================================================================
series_length = np.linspace(1000, 1000000, num=5, dtype=int)
n_lags = [5, 10, 50, 100, 200]
# Elapsed seconds keyed by number of lags; each list is aligned with
# `series_length`.
timings = {}

for lags in n_lags:
    execution_time = []
    forecaster = ForecasterRecursive(
        regressor=DummyRegressor(),
        lags=lags,
    )

    # The loop variable is deliberately not called `n`: that name holds the
    # length of the example series created in the data cell and must not be
    # overwritten when cells are re-run.
    for length in series_length:
        y = pd.Series(data=np.random.normal(size=length))

        # Only the call to `create_train_X_y` is timed; the generation of the
        # random series stays outside the measured interval.
        tic = time.perf_counter()
        _ = forecaster.create_train_X_y(y=y)
        toc = time.perf_counter()
        execution_time.append(toc - tic)

    timings[lags] = execution_time

# Rows: length of the series. Columns: number of lags. Values: seconds.
results = pd.DataFrame(data=timings, index=series_length)
results
series length | 5 | 10 | 50 | 100 | 200 |
---|---|---|---|---|---|
1000 | 0.001045 | 0.004704 | 0.007673 | 0.038916 | 0.069427 |
250750 | 0.007231 | 0.014726 | 0.213554 | 0.364044 | 0.561126 |
500500 | 0.023345 | 0.052342 | 0.427991 | 0.775216 | 1.522044 |
750250 | 0.037870 | 0.068540 | 0.666305 | 1.062016 | 2.148227 |
1000000 | 0.042683 | 0.092421 | 0.832870 | 1.425935 | 2.702640 |
# Execution time against the length of the series, one line per column of
# `results` (number of lags).
fig, ax = plt.subplots(figsize=(7, 4))
results.plot(marker=".", ax=ax)
ax.set(
    xlabel="length of series",
    ylabel="time (seconds)",
    title="Profiling create_train_X_y()",
)
# The trailing semicolon keeps the notebook from echoing the returned object.
ax.legend(title="number of lags");
Profiling predict
# Fit outside the profiled cell so that only `predict` is measured afterwards.
forecaster = ForecasterRecursive(regressor=DummyRegressor(), lags=24)
forecaster.fit(y=data)
c:\Users\jaesc2\Miniconda3\envs\skforecast_py11\Lib\site-packages\sklearn\preprocessing\_discretization.py:278: UserWarning: Feature 0 is constant and will be replaced with 0. warnings.warn(
%%pyinstrument
# Predict 1000 steps with the forecaster fitted above; the predictions are
# assigned to `_` because only the profile of the call is of interest.
_ = forecaster.predict(steps=1000)
# Refit with a real gradient boosting regressor before profiling `predict`.
forecaster = ForecasterRecursive(
    lags=24,
    regressor=HistGradientBoostingRegressor(random_state=123, max_iter=10),
)
forecaster.fit(y=data)
%%pyinstrument
# Predict 1000 steps with the forecaster fitted above; the predictions are
# assigned to `_` because only the profile of the call is of interest.
_ = forecaster.predict(steps=1000)
Inside the `predict` method, the `append` action is the most expensive step but, similar to what happens with `fit`, it is negligible compared to the time needed by the `predict` method of the regressor.