Source code for seglearn.pipe

"""
This module is an sklearn compatible pipeline for machine learning
time series data and sequences using a sliding window segmentation
"""
# Author: David Burns
# License: BSD

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline

from .transform import XyTransformerMixin, Segment
from .util import segmented_prediction_to_series


[docs]class Pype(Pipeline): """ This pipeline extends the sklearn Pipeline to support transformers that change X, y, sample_weight, and the number of samples. It also adds some new options for setting hyper-parameters with callables and in reference to other parameters (see examples). Parameters ---------- steps : list List of (name, transform) tuples (implementing fit/transform) that are chained, in the order in which they are chained, with the last object an estimator. scorer : sklearn scorer object memory : currently not implemented Attributes ---------- N_train : number of training samples - available after calling fit method N_test : number of testing samples - available after calling predict, or score methods Examples -------- >>> from seglearn.transform import FeatureRep, SegmentX >>> from seglearn.pipe import Pype >>> from seglearn.datasets import load_watch >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.preprocessing import StandardScaler >>> data = load_watch() >>> X = data['X'] >>> y = data['y'] >>> pipe = Pype([('segment', SegmentX()), >>> ('features', FeatureRep()), >>> ('scaler', StandardScaler()), >>> ('rf', RandomForestClassifier())]) >>> pipe.fit(X, y) >>> print(pipe.score(X, y)) """ # todo: handle steps with None def __init__(self, steps, scorer=None, memory=None): self.scorer = scorer self.N_train = None self.N_test = None self.N_fit = None self.history = None super(Pype, self).__init__(steps, memory=memory)
[docs] def fit(self, X, y=None, **fit_params): """ Fit the model Fit all the transforms one after the other and transform the data, then fit the transformed data using the final estimator. Parameters ---------- X : iterable Training data. Must fulfill input requirements of first step of the pipeline. y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. **fit_params : dict of string -> object Parameters passed to the ``fit`` method of each step, where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``. Returns ------- self : Pipeline This estimator """ Xt, yt, fit_params = self._fit(X, y, **fit_params) self.N_train = len(yt) if self._final_estimator is not None: fitres = self._final_estimator.fit(Xt, yt, **fit_params) if hasattr(fitres, 'history'): self.history = fitres return self
def _fit(self, X, y=None, **fit_params): self.steps = list(self.steps) self._validate_steps() fit_params_steps = dict((name, {}) for name, step in self.steps if step is not None) for pname, pval in fit_params.items(): step, param = pname.split('__', 1) fit_params_steps[step][param] = pval Xt = X yt = y # iterate through all but last for step_idx, (name, transformer) in enumerate(self.steps[:-1]): if transformer is None: pass else: # not doing cloning for now... if isinstance(transformer, XyTransformerMixin): Xt, yt, _ = transformer.fit_transform(Xt, yt, sample_weight=None, **fit_params_steps[name]) else: Xt = transformer.fit_transform(Xt, yt, **fit_params_steps[name]) if self._final_estimator is None: return Xt, yt, {} return Xt, yt, fit_params_steps[self.steps[-1][0]] def _transform(self, X, y=None, sample_weight=None): Xt = X yt = y swt = sample_weight for name, transformer in self.steps[:-1]: # iterate through all but last if isinstance(transformer, XyTransformerMixin): Xt, yt, swt = transformer.transform(Xt, yt, swt) else: Xt = transformer.transform(Xt) return Xt, yt, swt
[docs] def transform(self, X, y=None): """ Apply transforms, and transform with the final estimator This also works where final estimator is ``None``: all prior transformations are applied. Parameters ---------- X : iterable Data to transform. Must fulfill input requirements of first step of the pipeline. y : array-like Target Returns ------- Xt : array-like, shape = [n_samples, n_transformed_features] Transformed data yt : array-like, shape = [n_samples] Transformed target """ Xt, yt, _ = self._transform(X, y) if isinstance(self._final_estimator, XyTransformerMixin): Xt, yt, _ = self._final_estimator.transform(Xt, yt) else: Xt = self._final_estimator.transform(Xt) return Xt, yt
[docs] def fit_transform(self, X, y=None, **fit_params): """ Fit the model and transform with the final estimator Fits all the transforms one after the other and transforms the data, then uses fit_transform on transformed data with the final estimator. Parameters ---------- X : iterable Training data. Must fulfill input requirements of first step of the pipeline. y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. **fit_params : dict of string -> object Parameters passed to the ``fit`` method of each step, where each parameter name is prefixed such that parameter ``p`` for step ``s`` has key ``s__p``. Returns ------- Xt : array-like, shape = [n_samples, n_transformed_features] Transformed samples yt : array-like, shape = [n_samples] Transformed target """ Xt, yt, fit_params = self._fit(X, y, **fit_params) if isinstance(self._final_estimator, XyTransformerMixin): Xt, yt, _ = self._final_estimator.fit_transform(Xt, yt) else: if hasattr(self._final_estimator, 'fit_transform'): Xt = self._final_estimator.fit_transform(Xt, yt) else: self._final_estimator.fit(Xt, yt) Xt = self._final_estimator.transform(Xt) self.N_fit = len(yt) return Xt, yt
[docs] def predict(self, X): """ Apply transforms to the data, and predict with the final estimator Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. Returns ------- yp : array-like Predicted transformed target """ Xt, _, _ = self._transform(X) return self._final_estimator.predict(Xt)
[docs] def transform_predict(self, X, y): """ Apply transforms to the data, and predict with the final estimator. Unlike predict, this also returns the transformed target Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. y : array-like target Returns ------- yt : array-like Transformed target yp : array-like Predicted transformed target """ Xt, yt, _ = self._transform(X, y) yp = self._final_estimator.predict(Xt) return yt, yp
[docs] def score(self, X, y=None, sample_weight=None): """ Apply transforms, and score with the final estimator Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. y : iterable, default=None Targets used for scoring. Must fulfill label requirements for all steps of the pipeline. sample_weight : array-like, default=None If not None, this argument is passed as ``sample_weight`` keyword argument to the ``score`` method of the final estimator. Returns ------- score : float """ Xt, yt, swt = self._transform(X, y, sample_weight) self.N_test = len(yt) score_params = {} if swt is not None: score_params['sample_weight'] = swt if self.scorer is None: return self._final_estimator.score(Xt, yt, **score_params) return self.scorer(self._final_estimator, Xt, yt, **score_params)
[docs] def predict_proba(self, X): """ Apply transforms, and predict_proba of the final estimator Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. Returns ------- y_proba : array-like, shape = [n_samples, n_classes] Predicted probability of each class """ Xt, _, _ = self._transform(X) return self._final_estimator.predict_proba(Xt)
[docs] def decision_function(self, X): """ Apply transforms, and decision_function of the final estimator Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. Returns ------- y_score : array-like, shape = [n_samples, n_classes] """ Xt, _, _ = self._transform(X) return self._final_estimator.decision_function(Xt)
[docs] def predict_log_proba(self, X): """ Apply transforms, and predict_log_proba of the final estimator Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. Returns ------- y_score : array-like, shape = [n_samples, n_classes] """ Xt, _, _ = self._transform(X) return self._final_estimator.predict_log_proba(Xt)
[docs] def predict_as_series(self, X): """ Returns predictions in a list, grouping predictions based on the series they were derived from Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. Returns ------- yp : list Predictions """ ix = np.arange(len(X)) # series index ixp, yp = self.transform_predict(X, ix) yp = [yp[ixp == i] for i in ix] return np.array(yp, dtype=object)
[docs] def predict_unsegmented(self, X, categorical_target=False): """ Generates predictions for each time series on the same sampling as the original series, by resampling a prediction performed with sliding window segmentation Requires that one of the Segment transforms be part of the pipeline See plot_feature_rep.py example Parameters ---------- X : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. categorical_target : boolean Set to True for classification problems, and false for regression problems Returns ------- yp : iterable Time series predictions on the same sampling as X """ segmenter = self._get_segmenter() if not segmenter: raise Exception("Pype does not contain valid segmenter transform") width = segmenter.width step = segmenter._step yp = self.predict_as_series(X) yu = [] for i, yi in enumerate(yp): yui = segmented_prediction_to_series(yi, step, width, categorical_target) d = len(X[i]) - len(yui) if d > 0: yui = np.concatenate([yui, np.repeat(yui[-1:], d)], axis=0) yu.append(yui) return np.array(yu, dtype=object)
def _get_segmenter(self): for name, transformer in self.steps[:-1]: # iterate through all but last if isinstance(transformer, Segment): return transformer
[docs] def set_params(self, **params): """ Set the parameters of this estimator. Valid parameter keys can be listed with ``get_params()``. Returns ------- self """ items = self.steps names, _ = zip(*items) keys = list(params.keys()) for name in keys: if '__' not in name and name in names: # replace an estimator self._replace_estimator('steps', name, params.pop(name)) if callable(params[name]): # use a callable or function to set parameters params[name] = params[name](params) elif params[name] in keys: # set one arg from another params[name] = params[params[name]] BaseEstimator.set_params(self, **params) return self