Source code for pliers.extractors.base

''' Base Extractor class and associated functionality. '''

from abc import ABCMeta, abstractmethod
import json

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype

from pliers.transformers import Transformer
from pliers.utils import isgenerator, flatten


class Extractor(Transformer, metaclass=ABCMeta):
    ''' Abstract parent of every pliers Extractor.

    Subclasses supply the actual feature extraction logic by implementing
    _extract(); the generic transform machinery is inherited from
    Transformer.
    '''

    def transform(self, stim, *args, **kwargs):
        ''' Run the inherited transform and materialize generator output.

        Args:
            stim: The input passed through to Transformer.transform().
            *args, **kwargs: Forwarded unchanged to Transformer.transform().

        Returns:
            Whatever the parent transform() returns, except that a generator
            is converted into a list.
        '''
        outcome = super().transform(stim, *args, **kwargs)
        if isgenerator(outcome):
            return list(outcome)
        return outcome

    @abstractmethod
    def _extract(self, stim):
        ''' Extract features from stim; must be overridden by subclasses. '''
        pass

    def _transform(self, stim, *args, **kwargs):
        ''' Delegate the transformation step to _extract(). '''
        return self._extract(stim, *args, **kwargs)

    def plot(self, result, stim=None):
        ''' Plot an extraction result.

        The base class defines no plotting behavior and always raises
        NotImplementedError.
        '''
        cls_name = self.__class__.__name__
        raise NotImplementedError("No plotting method is defined for class "
                                  "%s." % cls_name)
class ExtractorResult:
    ''' Stores feature data produced by an Extractor.

    Args:
        data (ndarray, iterable): Extracted feature data. Either an ndarray
            (1-d or 2-d), an iterable, or a raw result. If a raw result is
            passed, the source Extractor must implement _to_df().
        stim (Stim): The input Stim object from which features were
            extracted.
        extractor (Extractor): The Extractor object used in extraction.
        features (list, ndarray): Optional names of extracted features. If
            passed, must have as many elements as there are columns in data.
        onsets (list, ndarray): Optional iterable giving the onsets of the
            rows in data. Length must match the input data.
        durations (list, ndarray): Optional iterable giving the durations
            associated with the rows in data.
        orders (list, ndarray): Optional iterable giving the integer orders
            associated with the rows in data.
    '''

    def __init__(self, data, stim, extractor, features=None, onsets=None,
                 durations=None, orders=None):
        self._data = data
        self.stim = stim
        self.extractor = extractor
        self.features = features
        self._history = None
        self.onset = onsets
        self.duration = durations
        self.order = orders

    @property
    def raw(self):
        ''' Stores raw result of extraction, prior to postprocessing done in
        to_df(). Is None when the Extractor does not define _to_df(). '''
        return self._data if hasattr(self.extractor, '_to_df') else None

    @property
    def data(self):
        ''' Creates a DataFrame with default arguments. '''
        return self.to_df()

    def to_df(self, timing=True, metadata=False, format='wide',
              extractor_name=False, object_id=True, extractor_params=False,
              **to_df_kwargs):
        ''' Convert current instance to a pandas DataFrame.

        Args:
            timing (bool): If True, adds columns for event onset and
                duration. Note that these columns will be added even if
                there are no valid values in the current object (NaNs will
                be inserted). If 'auto', timing columns are only inserted if
                there's at least one valid (i.e., non-NaN)
                onset/order/duration.
            metadata (bool): If True, adds columns for key metadata
                (including the name, filename, class, history, and source
                file of the Stim).
            format (str): Format to return the data in. Can be either 'wide'
                or 'long'. In the wide case, every extracted feature is a
                column, and every result object is in a row. In the long
                case, every row contains a single record/feature
                combination.
            extractor_name (bool): If True, includes the Extractor name as a
                column (in 'long' format) or index level (in 'wide' format).
            object_id (bool): If True, attempts to intelligently add an
                'object_id' column that differentiates between multiple
                objects in the results that may share onsets and durations
                (and would otherwise be impossible to distinguish). This
                frequently occurs for ImageExtractors that identify multiple
                target objects (e.g., faces) within a single ImageStim. In
                addition to boolean values, the special value 'auto' can be
                passed, in which case the object_id column will only be
                inserted if the resulting column would be non-constant.
            extractor_params (bool): If True, returns the Extractor's
                log_attributes at extraction time, as stored in the
                transformer_params attribute of ExtractorResult.history.
                These are returned as a serialized dictionary in an
                'extractor_params' column.
            to_df_kwargs: Extra keyword arguments forwarded to the
                Extractor's own _to_df() method, when it defines one.

        Returns:
            A pandas DataFrame.
        '''
        # Ideally, Extractors should implement their own _to_df() class
        # method that produces a DataFrame in standardized format. Failing
        # that, we assume self._data is already array-like and can be
        # wrapped in a DF.
        if hasattr(self.extractor, '_to_df'):
            df = self.extractor._to_df(self, **to_df_kwargs)
            features = self.features
        else:
            features = self.features
            data = np.array(self._data)
            if features is None:
                features = ['feature_%d' % (i + 1)
                            for i in range(data.shape[1])]
            df = pd.DataFrame(data, columns=features)

        # Any column that is not a named feature is treated as an index
        # (identifier) column when melting to long format.
        if features is not None:
            index_cols = list(set(df.columns) - set(features))
        else:
            index_cols = []

        if hasattr(self, '_onsets'):
            # Add out-of-place: an in-place `+= 0.0` on an integer array
            # raises a numpy casting error.
            offset = 0.0 if self.onset is None else self.onset
            onsets = np.array(self._onsets) + offset
        else:
            onsets = np.nan if self.onset is None else self.onset

        durations = getattr(self, '_durations', self.duration)
        durations = np.nan if durations is None else durations

        orders = getattr(self, '_orders', self.order)
        orders = np.nan if orders is None else orders

        # If any features clash with protected keys, append underscore
        protected = ['onset', 'order', 'duration', 'extractor', 'stim_name',
                     'class', 'filename', 'history', 'source_file']
        df = df.rename(columns={k: k + '_' for k in protected})

        # Generally we leave it to Extractors to properly track the number
        # of objects returned in the result DF, using the 'object_id'
        # column. But in cases where the Extractor punts on this and
        # object_id=True, we take our best guess. The logic is that we
        # increment the object counter for any row in the DF that cannot be
        # uniquely distinguished from other rows by onset and duration.
        if object_id and 'object_id' not in df.columns:
            index = pd.Series(onsets).astype(str) + '_' + \
                pd.Series(durations).astype(str)
            if object_id is True or (object_id == 'auto' and
                                     len(set(index)) < len(df)):
                ids = np.arange(len(df)) if len(index) == 1 \
                    else df.groupby(index).cumcount()
                df.insert(0, 'object_id', ids)
                # Register the column as an identifier only once it really
                # exists; otherwise melt() below fails on a missing id_var.
                index_cols.append('object_id')

        # NOTE(review): the 'auto' test consults durations and orders only,
        # although the docstring also mentions onsets -- confirm intent.
        if timing is True or (timing == 'auto' and
                              (np.isfinite(durations).any() or
                               np.isfinite(orders).any())):
            df.insert(0, 'onset', onsets)
            df.insert(0, 'duration', durations)
            df.insert(0, 'order', orders)
            df = df.sort_values('onset').reset_index(drop=True)
            index_cols.extend(['onset', 'order', 'duration'])

        if format == 'long':
            df = df.melt(index_cols, var_name='feature')
            df = df.dropna(subset=['value'])

        if extractor_name:
            name = self.extractor.name
            if format == 'long':
                df['extractor'] = name
            else:
                df.columns = pd.MultiIndex.from_product([[name], df.columns])

        if metadata:
            df['stim_name'] = self.stim.name
            df['class'] = self.stim.__class__.__name__
            df['filename'] = self.stim.filename
            hist = '' if self.stim.history is None else str(self.stim.history)
            df['history'] = hist
            df['source_file'] = self.history.to_df().iloc[0].source_file

        if extractor_params:
            # SECURITY NOTE(review): eval() executes whatever string is
            # stored in history.transformer_params. Confirm that this value
            # can only ever be produced internally; if it is always a plain
            # literal, ast.literal_eval would be the safer parser.
            dict_params = eval(self.history.transformer_params)
            df['extractor_params'] = json.dumps(dict_params)

        return df

    @property
    def history(self):
        ''' Returns the transformation history for the input Stim. '''
        return self._history

    @history.setter
    def history(self, history):
        self._history = history
def merge_results(results, format='wide', timing=True, metadata=True,
                  extractor_names=True, object_id=True,
                  extractor_params=False, aggfunc=None,
                  invalid_results='ignore', **to_df_kwargs):
    ''' Merges a list of ExtractorResults instances and returns a pandas DF.

    Args:
        results (list, tuple): A list of ExtractorResult instances to merge.
        format (str): Format to return the data in. Can be either 'wide' or
            'long'. In the wide case, every extracted feature is a column,
            and every Stim is a row. In the long case, every row contains a
            single Stim/Extractor/feature combination.
        timing (bool, str): Whether or not to include columns for onset,
            order, and duration. If 'auto', the columns are dropped from the
            merged result when every onset is null.
        metadata (bool): if True, includes Stim metadata columns in the
            returned DataFrame. These columns include 'stim_name', 'class',
            'filename', 'history', and 'source_file'. Note that these values
            are often long strings, so the returned DF will be considerably
            larger.
        extractor_names (str, bool): How to handle extractor names when
            returning results. The specific behavior depends on whether
            format is 'long' or 'wide'. Valid values include:

            - 'prepend' or True: In both 'long' and 'wide' formats, feature
              names will be prepended with the Extractor name (e.g.,
              "FaceExtractor#face_likelihood").
            - 'drop' or False: In both 'long' and 'wide' formats, extractor
              names will be omitted entirely from the result. Note that this
              can create feature name conflicts when merging results from
              multiple Extractors, so is generally discouraged.
            - 'column': In 'long' format, extractor name will be included as
              a separate column. Not intended for 'wide' format.
            - 'multi': In 'wide' format, a MultiIndex will be used for the
              columns, with the first level of the index containing the
              Extractor name and the second level containing the feature
              name. This value is invalid if format='long' (and will raise
              an error).
        object_id (bool): If True (the default), attempts to intelligently
            add an 'object_id' column that differentiates between multiple
            objects in the results that may share onsets/orders/durations
            (and would otherwise be impossible to distinguish). This
            frequently occurs for ImageExtractors that identify multiple
            target objects (e.g., faces) within a single ImageStim. If
            'auto', the 'object_id' column is kept only if it has a
            non-constant value.
        extractor_params (bool): If True, returns serialized
            extractor_params of the extractor, i.e. log_attributes at time
            of extraction. If format='wide', merge_results returns one
            column per extractor, each named
            ExtractorName#FeatureName#extractor_params. If format='long',
            returns only one column named extractor_params.
        aggfunc (str, Callable): If format='wide' and
            extractor_names='drop', it's possible for name clashes between
            features to occur. In such cases, the aggfunc argument is passed
            onto pandas' pivot_table function, and specifies how to
            aggregate multiple values for the same index. Can be a callable
            or any string value recognized by pandas. By default (None),
            'mean' will be used for numeric columns and 'first' will be used
            for object/categorical columns.
        invalid_results (str): Specifies desired action for treating
            elements of the passed in results argument that are not
            ExtractorResult objects. Valid values include:

            - 'ignore' will ignore them and merge the valid
              ExtractorResults.
            - 'fail' will raise an exception on any invalid input
        to_df_kwargs: Extra keyword arguments forwarded to each
            ExtractorResult.to_df() call.

    Returns: a pandas DataFrame. For format details, see 'format' argument.

    Raises:
        ValueError: If invalid_results='fail' and a non-ExtractorResult
            element is encountered, or if extractor_names='multi' is
            combined with format='long'.
    '''
    results = flatten(results)

    # With 'auto', always request the columns from each result and decide
    # whether to drop them only after everything has been concatenated.
    _timing = True if timing == 'auto' else timing
    _object_id = True if object_id == 'auto' else object_id

    if extractor_names is True:
        extractor_names = 'prepend'
    elif extractor_names is False:
        extractor_names = 'drop'

    dfs = []
    for r in results:
        if isinstance(r, ExtractorResult):
            dfs.append(r.to_df(timing=_timing, metadata=metadata,
                               format='long', extractor_name=True,
                               object_id=_object_id,
                               extractor_params=extractor_params,
                               **to_df_kwargs))
        elif invalid_results == 'fail':
            raise ValueError("At least one of the provided results was not "
                             "an ExtractorResult. Set the invalid_results "
                             "parameter to 'ignore' if you wish to ignore "
                             "this.")

    if len(dfs) == 0:
        return pd.DataFrame()

    # Reject the unsupported combination before doing any merge work.
    if extractor_names == 'multi' and format == 'long':
        raise ValueError("Invalid extractor_names value 'multi'. When "
                         "format is 'long', extractor_names must be "
                         "one of 'drop', 'prepend', or 'column'.")

    data = pd.concat(dfs, axis=0).reset_index(drop=True)

    if object_id == 'auto' and data['object_id'].nunique() == 1:
        data = data.drop('object_id', axis=1)

    unique_ext = data['extractor'] + '#' + data['feature'].astype(str)
    if extractor_names in ['prepend', 'multi']:
        data['feature'] = unique_ext

    if format == 'wide':
        ind_cols = {'stim_name', 'onset', 'order', 'duration', 'object_id',
                    'class', 'filename', 'history', 'source_file'}
        ind_cols = list(ind_cols & set(data.columns))

        # pandas groupby/index operations can't handle NaNs in index, (see
        # issue at https://github.com/pandas-dev/pandas/issues/3729), so we
        # replace NaNs with a placeholder and then re-substitute after
        # pivoting.
        dtypes = data[ind_cols].dtypes
        data[ind_cols] = data[ind_cols].fillna('PlAcEholdER')

        # Set default aggfunc based on column type, otherwise bad things
        # happen
        if aggfunc is None:
            aggfunc = 'mean' if is_numeric_dtype(data['value']) else 'first'

        # add conditional on value of extractor_names
        if extractor_params:
            data['unique_extractor'] = \
                unique_ext.astype(str) + '#extractor_params'
            attrs = data.pivot_table(index=ind_cols,
                                     columns='unique_extractor',
                                     values='extractor_params',
                                     aggfunc='first')

        data = data.pivot_table(index=ind_cols, columns='feature',
                                values='value', aggfunc=aggfunc)

        if extractor_params:
            data = pd.concat([data, attrs], axis=1)

        data = data.reset_index()
        data.columns.name = None  # vestigial--is set to 'feature'
        # Restore the NaNs and the original dtypes of the index columns.
        data[ind_cols] = data[ind_cols].replace('PlAcEholdER', np.nan)
        data[ind_cols] = data[ind_cols].astype(dict(zip(ind_cols, dtypes)))

    if extractor_names != 'column' and 'extractor' in data.columns:
        data = data.drop('extractor', axis=1)

    if timing == 'auto' and 'onset' in data.columns:
        if data['onset'].isnull().all():
            data = data.drop(['onset', 'order', 'duration'], axis=1)

    if 'onset' in data.columns:
        key = [('onset', ''), ('order', ''), ('duration', '')] \
            if isinstance(data.columns, pd.MultiIndex) \
            else ['onset', 'order', 'duration']
        data = data.sort_values(key).reset_index(drop=True)

    if extractor_names == 'multi':
        # Split "Extractor#feature" names into MultiIndex levels.
        data.columns = pd.MultiIndex.from_tuples(
            [c.split('#') for c in data.columns])

    return data