Scoring Time Series Estimators

This examples demonstrates some of the caveats / issues when trying to calculate performance scores for time series estimators.

This pipeline has been designed to evaluate performance using segments (not series’) as instances of the data.

# Author: David Burns
# License: BSD


import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from seglearn.datasets import load_watch
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, Segment

CONFUSION PLOT

def plot_confusion_matrix(cm, classes,
                          normalize=True,
                          cmap=plt.cm.Blues):
    """ plots confusion matrix """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

SETUP

# load the data
data = load_watch()
X = data['X']
y = data['y']

# create a feature representation pipeline
steps = [('seg', Segment()),
         ('features', FeatureRep()),
         ('scaler', StandardScaler()),
         ('rf', RandomForestClassifier(n_estimators=20))]

pipe = Pype(steps)

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

OPTION 1: Use the score SegPipe score method

pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)
print("Accuracy score: ", score)

Out:

Accuracy score:  0.7556131260794473

OPTION 2: generate true and predicted target values for the segments

y_true, y_pred = pipe.transform_predict(X_test, y_test)
# use any of the sklearn scorers
f1_macro = f1_score(y_true, y_pred, average='macro')
print("F1 score: ", f1_macro)

cm = confusion_matrix(y_true, y_pred)
plot_confusion_matrix(cm, data['y_labels'])
../_images/sphx_glr_plot_scoring_001.png

Out:

F1 score:  0.7683103625934831

OPTION 3: scoring during model selection

# model selection using the built-in score method for the final estimator
cv_scores = cross_validate(pipe, X, y, cv=4, return_train_score=True)
print("CV Scores: ", pd.DataFrame(cv_scores))

# model selection with scoring functions / dictionaries
#
# unfortunately, this is not possible withing the current framework due to how
# scoring is implemented within the model_selection functions / classes of sklearn
# running the code below will cause an error, because the model_selection
# functions / classes do not have access to y_true for the segments
#
# >>> scoring = ['accuracy','precision_macro','recall_macro','f1_macro']
# >>> cv_scores = cross_validate(pipe, X, y, cv = 4, return_train_score=True, scoring=scoring)
#
# workarounds for this issue are outlined below

Out:

CV Scores:     fit_time  score_time  test_score  train_score
0  0.446275    0.076087    0.784972     0.999708
1  0.502541    0.079094    0.842857     1.000000
2  0.466788    0.069131    0.774956     1.000000
3  0.431198    0.071541    0.770619     1.000000

SCORING WORKAROUND 1: USE ANOTHER SCORER FUNCTION

# ``SegPipe`` can be initialized with a scorer callable made with sklearn.metrics.make_scorer
# this can be used to cross_validate or grid search with any 1 score

scorer = make_scorer(f1_score, average='macro')
pipe = Pype(steps, scorer=scorer)
cv_scores = cross_validate(pipe, X, y, cv=4, return_train_score=True)
print("CV F1 Scores: ", pd.DataFrame(cv_scores))

Out:

CV F1 Scores:     fit_time  score_time  test_score  train_score
0  0.431416    0.073237    0.824988          1.0
1  0.467504    0.068295    0.841483          1.0
2  0.446201    0.069201    0.791326          1.0
3  0.435928    0.070309    0.797336          1.0

SCORING WORKAROUND 2: WORK OUTSIDE THE PIPELINE

# If you want to have multiple score computed, the only way is as follows
#
# First transform the time series data into segments and then use an sklearn Pipeline
#
# The disadvantage of this is that the parameters of the segmentation cannot be
# optimized with this approach

segmenter = Segment()
X_seg, y_seg, _ = segmenter.fit_transform(X, y)

clf = Pipeline([('features', FeatureRep()),
                ('scaler', StandardScaler()),
                ('rf', RandomForestClassifier())])

scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_scores = cross_validate(clf, X_seg, y_seg,
                           cv=4, return_train_score=False, scoring=scoring)
print("CV Scores (workaround): ", pd.DataFrame(cv_scores))

Out:

CV Scores (workaround):     fit_time  score_time  test_accuracy  test_precision_macro  test_recall_macro  test_f1_macro
0  1.312680    0.081219       0.813675              0.829951           0.827524       0.823218
1  1.308434    0.079772       0.816082              0.840500           0.834377       0.831517
2  1.313067    0.081143       0.810950              0.838272           0.820251       0.822046
3  1.310060    0.090736       0.799829              0.822963           0.818255       0.816566

Total running time of the script: ( 0 minutes 12.474 seconds)

Gallery generated by Sphinx-Gallery