functime,一个神奇的 python 库
1
Mar. 2024
functime 是一个用于时间序列预测的机器学习库,可在大型数据集上进行预测和时间序列特征提取。functime 的特别之处在于其独有的预处理选项和创新的交叉验证拆分器。
functime 具有无与伦比的速度和效率,能够在几秒钟内处理数量惊人的 100,000 个时间序列。这种非凡的速度是通过巧妙地利用 Polars 进行并行特征工程实现的。
特点
-
功能齐全:强大且易于使用的 API,用于预测和特征工程
-
快速:在笔记本电脑上几秒钟内预测 100,000 个时间序列
-
高效:使用 Polars 对时间序列进行并行特征工程。
-
久经考验:采用能够产生真正业务影响并在竞赛中获胜的机器学习算法
-
回测:支持使用扩展窗口和滑动窗口拆分器进行回测
-
可通过 FLAML 的强大功能无缝处理复杂的任务,例如管理滞后和超参数调整。
安装
pip install functime
functime 还提供了额外的可选依赖。例如,要安装带有大型语言模型(LLM)和 lightgbm 功能的 functime,可运行以下命令(可用的可选项列在命令之后):
pip install "functime[llm,lgb]"
-
cat: 使用 “catboost” 预测器
-
xgb: 使用 “xgboost” 预测器
-
lgb: 使用 “lightgbm” 预测器
-
llm: 使用由大型语言模型(LLM)支持的预测功能
使用 functime 进行时间序列预测
import polars as pl
from functime.cross_validation import train_test_split
from functime.seasonality import add_fourier_terms
from functime.forecasting import linear_model
from functime.preprocessing import scale
from functime.metrics import mase
# Load commodities price data. The first two columns are taken as the
# entity (series id) and time columns.
y = pl.read_parquet("https://github.com/TracecatHQ/functime/raw/main/data/commodities.parquet")
entity_col, time_col = y.columns[:2]

# Time series split
y_train, y_test = y.pipe(train_test_split(test_size=3))

# Fit-predict
forecaster = linear_model(freq="1mo", lags=24)
forecaster.fit(y=y_train)
y_pred = forecaster.predict(fh=3)

# fit-predict in a single line
y_pred = linear_model(freq="1mo", lags=24)(y=y_train, fh=3)

# Score forecasts in parallel
scores = mase(y_true=y_test, y_pred=y_pred, y_train=y_train)

# Forecast with target transforms and feature transforms
forecaster = linear_model(
    freq="1mo",
    lags=24,
    target_transform=scale(),
    feature_transform=add_fourier_terms(sp=12, K=6),
)

# Forecast with exogenous regressors!
# Just pass them into X
X = (
    y.select([entity_col, time_col])
    .pipe(add_fourier_terms(sp=12, K=6))
    .collect()
)
# Split the regressor frame X (not y): splitting y here would discard the
# Fourier features built above and pass the target itself as a regressor.
X_train, X_future = X.pipe(train_test_split(test_size=3))
forecaster = linear_model(freq="1mo", lags=24)
forecaster.fit(y=y_train, X=X_train)
y_pred = forecaster.predict(fh=3, X=X_future)
特征提取
import polars as pl
import numpy as np
from functime.feature_extractors import FeatureExtractor, binned_entropy
# Load commodities price data
y = pl.read_parquet("https://github.com/TracecatHQ/functime/raw/main/data/commodities.parquet")

# Get column names ("commodity_type", "time", "price")
entity_col, time_col, value_col = y.columns

# Extract a single feature from a single time-series.
# The result gets its own name: assigning it to `binned_entropy` would
# replace the imported function with its return value.
entropy_value = binned_entropy(
    pl.Series(np.random.normal(0, 1, size=10)),
    bin_count=10,
)

# Also works on LazyFrames with query optimization
features = (
    pl.LazyFrame({
        "index": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        "value": np.random.normal(0, 1, size=10),
    })
    .select(
        binned_entropy=pl.col("value").ts.binned_entropy(bin_count=10),
        lempel_ziv_complexity=pl.col("value").ts.lempel_ziv_complexity(threshold=3),
        longest_streak_above_mean=pl.col("value").ts.longest_streak_above_mean(),
    )
    .collect()
)

# Extract features blazingly fast on many
# stacked time-series using `group_by`
features = (
    y.group_by(entity_col)
    .agg(
        binned_entropy=pl.col(value_col).ts.binned_entropy(bin_count=10),
        lempel_ziv_complexity=pl.col(value_col).ts.lempel_ziv_complexity(threshold=3),
        longest_streak_above_mean=pl.col(value_col).ts.longest_streak_above_mean(),
    )
)

# Extract features blazingly fast on windows
# of many time-series using `group_by_dynamic`
features = (
    # Compute rolling features at yearly intervals
    # NOTE(review): newer Polars releases appear to rename the `by` keyword
    # to `group_by` -- confirm against the installed Polars version.
    y.group_by_dynamic(
        time_col,
        every="12mo",
        by=entity_col,
    )
    .agg(
        binned_entropy=pl.col(value_col).ts.binned_entropy(bin_count=10),
        lempel_ziv_complexity=pl.col(value_col).ts.lempel_ziv_complexity(threshold=3),
        longest_streak_above_mean=pl.col(value_col).ts.longest_streak_above_mean(),
    )
)