Credits
import datetime
import numpy as np
import os
import sys
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
print(sys.version)
Requires a runtime restart after installing the pinned package versions below.
!pip install pandas==1.1.5 # requires Runtime restart
!pip install scikit-learn==1.0 # requires Runtime restart
import pandas as pd
assert pd.__version__ == "1.1.5"
pd.set_option("display.max_columns", 100)
import sklearn
assert sklearn.__version__ == "1.0"
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import make_friedman1, make_regression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
np.random.seed(1)
forecast_steps = 7  # number of steps (days) ahead to forecast
def generate_data():
    df = pd.DataFrame()
    start_date = datetime.datetime(year=2001, month=1, day=1)
    df["date"] = [start_date + datetime.timedelta(days=x+1) for x in range(365*15)]
    num_features = 10  # only the first 5 features determine the label (make_friedman1), the rest are noise
    n_informative = 5  # used only by the commented-out make_regression alternative below
    X, y = make_friedman1(n_samples=df.shape[0], n_features=num_features, noise=1.0)
    # X, y = make_regression(n_samples=df.shape[0], n_features=num_features, n_informative=n_informative, n_targets=1)
    df.loc[:, [f"x_{i}" for i in range(num_features)]] = X
    df.loc[:, "x_2"] *= 20  # scale feature
    df.loc[:, "x_3"] *= 30  # scale feature
    df.loc[:, "x_7"] *= 70  # scale feature
    df.loc[:, "x_8"] *= 80  # scale feature
    df.loc[:, "y"] = y
    # smooth the label by blending the raw value with a 3-step rolling mean
    df.loc[:, "y"] = (
        0.5 * df.loc[:, "y"] +
        0.5 * df.loc[:, "y"].rolling(window=3).mean()
    )
    # shift y so that the value stored at date t was generated from the features forecast_steps days earlier
    df.loc[:, "y"] = df.loc[:, "y"].shift(forecast_steps)
    # df.rolling(2, min_periods=1).sum()  # no effect: the result was never assigned
    return df
df = generate_data()
df.describe(include="all", datetime_is_numeric=True)
df.head(10)
df.plot(kind="line",
x="date",
y="y",
colormap="coolwarm",
figsize=(12, 3),
fontsize=12,
title="Sythentic data",
xlabel="date",
ylabel="label")
df_corr = df.copy(deep=True)
df_corr.loc[:, "future_label"] = df_corr.loc[:, "y"].shift(-forecast_steps)
df_corr.corr().style.background_gradient(cmap="coolwarm")
Lookback & forecast step example:
lookback_steps = 14 # length of history provided to learn label at t
end_train_idx = int(0.7 * df.shape[0])
end_validation_idx = int(0.85 * df.shape[0])
print(f"""train idx: [{df.index[0]}, {end_train_idx})
validation idx: [{end_train_idx}, {end_validation_idx})
test idx: [{end_validation_idx}, {df.shape[0]})""")
Using a pipeline to ensure no train/test data leakage occurs. This is accomplished by fitting the imputer and scaler on the training split only (fit_transform), then applying the already-fitted pipeline to the validation and test splits (transform).
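As a minimal, standalone sketch of that contract (the toy arrays below are illustrative only and rely on the numpy and scikit-learn imports above): statistics are learned from the training rows via fit_transform and merely reused on held-out rows via transform.
# Toy illustration of the no-leakage contract; the data here is assumed for this sketch only.
demo_scaler = StandardScaler()
demo_train = np.random.normal(loc=0.0, scale=1.0, size=(100, 3))    # "training" rows
demo_holdout = np.random.normal(loc=5.0, scale=2.0, size=(20, 3))   # differently distributed "held-out" rows
_ = demo_scaler.fit_transform(demo_train)           # fit: mean/std computed from the training rows only
demo_scaled = demo_scaler.transform(demo_holdout)   # transform: reuses the training statistics unchanged
print(demo_scaler.mean_)           # close to 0 -- learned from the training rows alone
print(demo_scaled.mean(axis=0))    # far from 0, because the held-out distribution was never fitted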
# label 'y' is included as a feature because historical labels are provided as features
numerical_features = ["y"] + [f"x_{i}" for i in range(10)]
def get_feature_pipeline():
    class FeatureSelector(BaseEstimator, TransformerMixin):
        """Selects a subset of columns from a DataFrame."""
        def __init__(self, feature_names):
            self.feature_names = feature_names
        def fit(self, X, y=None):
            return self
        def transform(self, X, y=None):
            return X.loc[:, self.feature_names].copy(deep=True)

    # impute missing values, then standardize the selected numerical features
    numerical_pipeline = Pipeline(steps=[
        ("num_selector", FeatureSelector(numerical_features)),
        ("imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler())
    ])
    feature_pipeline = FeatureUnion(
        n_jobs=1,
        transformer_list=[
            ("numerical_pipeline", numerical_pipeline),
            # ("categorical_pipeline", categorical_pipeline),
        ]
    )
    return feature_pipeline
feature_pipeline = get_feature_pipeline()
train_df = pd.DataFrame(
feature_pipeline.fit_transform(
df.loc[:end_train_idx, :]
),
columns=numerical_features
)
# Train label is offset by the number of steps into the future we're forecasting
train_df.loc[:, "label"] = train_df.loc[:, "y"].shift(-forecast_steps)
train_df = train_df.loc[train_df["label"].notna(), :]
assert 0 == train_df.isna().sum().sum()
print(train_df.shape)
train_df.head(5)
dataset_train = keras.preprocessing.timeseries_dataset_from_array(
data=train_df.loc[:train_df.shape[0]-lookback_steps, numerical_features],
targets=train_df.loc[lookback_steps-1:, "label"],
sequence_length=lookback_steps,
sequence_stride=1,
sampling_rate=1,
batch_size=256,
shuffle=False,
)
for x, y in dataset_train.take(1):
    print(x.shape, y.shape)
def df_transform_to_keras_dataset(df_idx_start, df_idx_end):
    # Apply the already-fitted feature pipeline (no re-fitting, so no leakage),
    # rebuild the shifted label, and wrap the result in a keras timeseries dataset.
    validation_df = pd.DataFrame(
        feature_pipeline.transform(
            df.loc[df_idx_start:df_idx_end, :]
        ),
        columns=numerical_features
    )
    validation_df.loc[:, "label"] = validation_df.loc[:, "y"].shift(-forecast_steps)
    validation_df = validation_df.loc[validation_df["label"].notna(), :]
    assert 0 == validation_df.isna().sum().sum()
    print(validation_df.shape)
    dataset_validation = keras.preprocessing.timeseries_dataset_from_array(
        data=validation_df.loc[:validation_df.shape[0]-lookback_steps, numerical_features],
        targets=validation_df.loc[lookback_steps-1:, "label"],
        sequence_length=lookback_steps,
        batch_size=256,
        shuffle=False,
    )
    return dataset_validation
dataset_validation = df_transform_to_keras_dataset(end_train_idx, end_validation_idx)
for x, y in dataset_validation.take(1):
    print(x.shape, y.shape)
    print(f"target label: {y[0]}")
    for i in range(len(x[0])):
        print(f"y{i}: {x[0][i][0]}, x_0: {x[0][i][1]}, x_1: {x[0][i][2]}, x_-1: {x[0][i][-1]}")
print(f"Row idx of first label (see 'Name: x') in column 0:")
(
pd.DataFrame(feature_pipeline.transform(
df.loc[end_train_idx:end_validation_idx, :])
).loc[lookback_steps + forecast_steps - 1, :]
)
print(f"Rows idx of dataset to learn first label: 0:{lookback_steps - 1}")
(
pd.DataFrame(feature_pipeline.transform(
df.loc[end_train_idx:end_validation_idx, :])
).loc[lookback_steps-5:lookback_steps-1,:]
)
inputs = keras.layers.Input(shape=(x.shape[1], x.shape[2]))
lstm_out = keras.layers.LSTM(32)(inputs)
outputs = keras.layers.Dense(1)(lstm_out)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="mse"
)
model.summary()
model_name = "lstm_synthetic_data"
path_checkpoint = f"/content/drive/My Drive/Colab Notebooks/forecasting/{model_name}.h5"
es_callback = keras.callbacks.EarlyStopping(monitor="val_loss", min_delta=0, patience=5)
modelckpt_callback = keras.callbacks.ModelCheckpoint(
monitor="val_loss",
filepath=path_checkpoint,
verbose=1,
save_weights_only=True,
save_best_only=True,
)
history = model.fit(
dataset_train,
epochs=50,
validation_data=dataset_validation,
callbacks=[es_callback, modelckpt_callback],
verbose=0,
)
loss_df = pd.DataFrame({
"train_loss": history.history["loss"],
"validation_loss": history.history["val_loss"]
})
loss_df.plot(kind="line",
y=["train_loss", "validation_loss"],
colormap="coolwarm",
figsize=(9, 3),
fontsize=12,
title="LSTM Training Loss",
xlabel="epoch",
ylabel="loss")
for x_validation, y_validation in dataset_validation.take(1):
    print(x_validation.shape, y_validation.shape)
validation_diff = pd.DataFrame({
"y_pred": model.predict(x_validation).flatten(),
"y_true": y_validation.numpy()
})
val_mse = round(metrics.mean_squared_error(
y_true=validation_diff["y_true"],
y_pred=validation_diff["y_pred"],
), 3)
validation_diff.plot(kind="line",
y=["y_true", "y_pred"],
colormap="coolwarm",
figsize=(12, 3),
fontsize=12,
title=f"Validation predictions. neg_mean_squared_error = {val_mse}",
xlabel="index",
ylabel="value")
dataset_test = df_transform_to_keras_dataset(end_validation_idx, df.shape[0]-1)
for x_test, y_test in dataset_test.take(1):
    print(x_test.shape, y_test.shape)
test_diff = pd.DataFrame({
"y_pred": model.predict(x_test).flatten(),
"y_true": y_test.numpy()
})
test_mse = round(metrics.mean_squared_error(
y_true=test_diff["y_true"],
y_pred=test_diff["y_pred"],
), 3)
test_diff.plot(kind="line",
y=["y_true", "y_pred"],
colormap="coolwarm",
figsize=(12, 3),
fontsize=12,
title=f"Test predictions. neg_mean_squared_error = {test_mse}",
xlabel="index",
ylabel="value")
%%shell
jupyter nbconvert --to html "/content/drive/My Drive/Colab Notebooks/forecasting/lstm_synthetic_data.ipynb"