"""
This module is an sklearn compatible pipeline for machine learning
time series data and sequences using a sliding window segmentation
"""
# Author: David Burns
# License: BSD
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from .transform import XyTransformerMixin, Segment
from .util import segmented_prediction_to_series
class Pype(Pipeline):
    """
    Sklearn-compatible pipeline that extends ``sklearn.pipeline.Pipeline`` to
    support transformers that change X, y, sample_weight, and the number of
    samples (e.g. sliding-window segmenters).

    It also adds options for setting hyper-parameters with callables and in
    reference to other parameters (see :meth:`set_params`).

    Parameters
    ----------
    steps : list
        List of (name, transform) tuples (implementing fit/transform) that are
        chained, in the order in which they are chained, with the last object
        an estimator.
    scorer : sklearn scorer object, optional
        Used by :meth:`score` in place of the final estimator's own ``score``.
    memory : currently not implemented

    Attributes
    ----------
    N_train : number of training samples - available after calling fit method
    N_test : number of testing samples - available after calling predict, or score methods

    Examples
    --------
    >>> from seglearn.transform import FeatureRep, SegmentX
    >>> from seglearn.pipe import Pype
    >>> from seglearn.datasets import load_watch
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.preprocessing import StandardScaler
    >>> data = load_watch()
    >>> X = data['X']
    >>> y = data['y']
    >>> pipe = Pype([('segment', SegmentX()),
    ...              ('features', FeatureRep()),
    ...              ('scaler', StandardScaler()),
    ...              ('rf', RandomForestClassifier())])
    >>> pipe.fit(X, y)
    >>> print(pipe.score(X, y))
    """
    # todo: handle steps with None

    def __init__(self, steps, scorer=None, memory=None):
        self.scorer = scorer
        self.N_train = None  # set by fit()
        self.N_test = None   # set by score()
        self.N_fit = None
        self.history = None  # holds a keras-style History object if the final estimator returns one
        super(Pype, self).__init__(steps, memory=memory)
[docs] def fit(self, X, y=None, **fit_params):
"""
Fit the model
Fit all the transforms one after the other and transform the
data, then fit the transformed data using the final estimator.
Parameters
----------
X : iterable
Training data. Must fulfill input requirements of first step of the
pipeline.
y : iterable, default=None
Training targets. Must fulfill label requirements for all steps of
the pipeline.
**fit_params : dict of string -> object
Parameters passed to the ``fit`` method of each step, where
each parameter name is prefixed such that parameter ``p`` for step
``s`` has key ``s__p``.
Returns
-------
self : Pipeline
This estimator
"""
Xt, yt, fit_params = self._fit(X, y, **fit_params)
self.N_train = len(yt)
if self._final_estimator is not None:
fitres = self._final_estimator.fit(Xt, yt, **fit_params)
if hasattr(fitres, 'history'):
self.history = fitres
return self
def _fit(self, X, y=None, **fit_params):
self.steps = list(self.steps)
self._validate_steps()
fit_params_steps = dict((name, {}) for name, step in self.steps
if step is not None)
for pname, pval in fit_params.items():
step, param = pname.split('__', 1)
fit_params_steps[step][param] = pval
Xt = X
yt = y
# iterate through all but last
for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
if transformer is None:
pass
else:
# not doing cloning for now...
if isinstance(transformer, XyTransformerMixin):
Xt, yt, _ = transformer.fit_transform(Xt, yt, sample_weight=None,
**fit_params_steps[name])
else:
Xt = transformer.fit_transform(Xt, yt, **fit_params_steps[name])
if self._final_estimator is None:
return Xt, yt, {}
return Xt, yt, fit_params_steps[self.steps[-1][0]]
def _transform(self, X, y=None, sample_weight=None):
Xt = X
yt = y
swt = sample_weight
for name, transformer in self.steps[:-1]: # iterate through all but last
if isinstance(transformer, XyTransformerMixin):
Xt, yt, swt = transformer.transform(Xt, yt, swt)
else:
Xt = transformer.transform(Xt)
return Xt, yt, swt
[docs] def predict(self, X):
"""
Apply transforms to the data, and predict with the final estimator
Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.
Returns
-------
yp : array-like
Predicted transformed target
"""
Xt, _, _ = self._transform(X)
return self._final_estimator.predict(Xt)
[docs] def score(self, X, y=None, sample_weight=None):
"""
Apply transforms, and score with the final estimator
Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.
y : iterable, default=None
Targets used for scoring. Must fulfill label requirements for all
steps of the pipeline.
sample_weight : array-like, default=None
If not None, this argument is passed as ``sample_weight`` keyword
argument to the ``score`` method of the final estimator.
Returns
-------
score : float
"""
Xt, yt, swt = self._transform(X, y, sample_weight)
self.N_test = len(yt)
score_params = {}
if swt is not None:
score_params['sample_weight'] = swt
if self.scorer is None:
return self._final_estimator.score(Xt, yt, **score_params)
return self.scorer(self._final_estimator, Xt, yt, **score_params)
[docs] def predict_proba(self, X):
"""
Apply transforms, and predict_proba of the final estimator
Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.
Returns
-------
y_proba : array-like, shape = [n_samples, n_classes]
Predicted probability of each class
"""
Xt, _, _ = self._transform(X)
return self._final_estimator.predict_proba(Xt)
[docs] def decision_function(self, X):
"""
Apply transforms, and decision_function of the final estimator
Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.
Returns
-------
y_score : array-like, shape = [n_samples, n_classes]
"""
Xt, _, _ = self._transform(X)
return self._final_estimator.decision_function(Xt)
[docs] def predict_log_proba(self, X):
"""
Apply transforms, and predict_log_proba of the final estimator
Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.
Returns
-------
y_score : array-like, shape = [n_samples, n_classes]
"""
Xt, _, _ = self._transform(X)
return self._final_estimator.predict_log_proba(Xt)
[docs] def predict_as_series(self, X):
"""
Returns predictions in a list, grouping predictions based on the series they were derived from
Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.
Returns
-------
yp : list
Predictions
"""
ix = np.arange(len(X)) # series index
ixp, yp = self.transform_predict(X, ix)
yp = [yp[ixp == i] for i in ix]
return np.array(yp, dtype=object)
[docs] def predict_unsegmented(self, X, categorical_target=False):
"""
Generates predictions for each time series on the same sampling as the original series, by resampling
a prediction performed with sliding window segmentation
Requires that one of the Segment transforms be part of the pipeline
See plot_feature_rep.py example
Parameters
----------
X : iterable
Data to predict on. Must fulfill input requirements of first step
of the pipeline.
categorical_target : boolean
Set to True for classification problems, and false for regression problems
Returns
-------
yp : iterable
Time series predictions on the same sampling as X
"""
segmenter = self._get_segmenter()
if not segmenter:
raise Exception("Pype does not contain valid segmenter transform")
width = segmenter.width
step = segmenter._step
yp = self.predict_as_series(X)
yu = []
for i, yi in enumerate(yp):
yui = segmented_prediction_to_series(yi, step, width, categorical_target)
d = len(X[i]) - len(yui)
if d > 0:
yui = np.concatenate([yui, np.repeat(yui[-1:], d)], axis=0)
yu.append(yui)
return np.array(yu, dtype=object)
def _get_segmenter(self):
for name, transformer in self.steps[:-1]: # iterate through all but last
if isinstance(transformer, Segment):
return transformer
[docs] def set_params(self, **params):
"""
Set the parameters of this estimator.
Valid parameter keys can be listed with ``get_params()``.
Returns
-------
self
"""
items = self.steps
names, _ = zip(*items)
keys = list(params.keys())
for name in keys:
if '__' not in name and name in names:
# replace an estimator
self._replace_estimator('steps', name, params.pop(name))
if callable(params[name]):
# use a callable or function to set parameters
params[name] = params[name](params)
elif params[name] in keys:
# set one arg from another
params[name] = params[params[name]]
BaseEstimator.set_params(self, **params)
return self