# Source code for pliers.extractors.base
''' Base Extractor class and associated functionality. '''
from abc import ABCMeta, abstractmethod
import json
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
from pliers.transformers import Transformer
from pliers.utils import isgenerator, flatten
class Extractor(Transformer, metaclass=ABCMeta):
    ''' Abstract parent class shared by every pliers Extractor.

    Subclasses must implement _extract(); _transform() simply delegates
    to it.
    '''

    def transform(self, stim, *args, **kwargs):
        ''' Run the inherited transform() and materialize generator output.

        Args:
            stim: Input passed through unchanged to the parent transform().
            args, kwargs: Extra arguments passed through unchanged.

        Returns:
            The parent's result, converted to a list whenever
            isgenerator() reports it to be a generator.
        '''
        output = super().transform(stim, *args, **kwargs)
        if isgenerator(output):
            return list(output)
        return output

    @abstractmethod
    def _extract(self, stim):
        ''' Extract features from stim. Must be overridden by subclasses. '''

    def _transform(self, stim, *args, **kwargs):
        ''' Forward the call, with all arguments, to _extract(). '''
        return self._extract(stim, *args, **kwargs)

    def plot(self, result, stim=None):
        ''' Placeholder plotting hook; always raises NotImplementedError. '''
        message = ("No plotting method is defined for class "
                   "%s." % self.__class__.__name__)
        raise NotImplementedError(message)
class ExtractorResult:
    ''' Stores feature data produced by an Extractor.

    Args:
        data (ndarray, iterable): Extracted feature data. Either an ndarray
            (1-d or 2-d), an iterable, or a raw result. If a raw result is
            passed, the source Extractor must implement _to_df().
        stim (Stim): The input Stim object from which features were extracted.
        extractor (Extractor): The Extractor object used in extraction.
        features (list, ndarray): Optional names of extracted features. If
            passed, must have as many elements as there are columns in data.
        onsets (list, ndarray): Optional iterable giving the onsets of the
            rows in data. Length must match the input data.
        durations (list, ndarray): Optional iterable giving the durations
            associated with the rows in data.
        orders (list, ndarray): Optional iterable giving the integer orders
            associated with the rows in data.
    '''

    def __init__(self, data, stim, extractor, features=None, onsets=None,
                 durations=None, orders=None):
        self._data = data
        self.stim = stim
        self.extractor = extractor
        self.features = features
        self._history = None
        self.onset = onsets
        self.duration = durations
        self.order = orders

    @property
    def raw(self):
        ''' Stores raw result of extraction, prior to postprocessing done
        in to_df(). Is None when the extractor defines no _to_df(). '''
        return self._data if hasattr(self.extractor, '_to_df') else None

    @property
    def data(self):
        ''' Creates a DataFrame with default arguments '''
        return self.to_df()

    @staticmethod
    def _row_labels(values, index):
        ''' Return one string label per row of index for values, which may
        be a scalar, a length-1 iterable, or a per-row iterable. '''
        per_row = np.broadcast_to(np.atleast_1d(np.asarray(values)),
                                  (len(index),))
        return pd.Series(per_row, index=index).astype(str)

    def to_df(self, timing=True, metadata=False, format='wide',
              extractor_name=False, object_id=True, extractor_params=False,
              **to_df_kwargs):
        ''' Convert current instance to a pandas DataFrame.

        Args:
            timing (bool, str): If True, adds columns for event onset, order
                and duration. Note that these columns will be added even if
                there are no valid values in the current object (NaNs will
                be inserted). If 'auto', timing columns are only inserted if
                there's at least one valid (i.e., non-NaN)
                onset/order/duration.
            metadata (bool): If True, adds columns for key metadata (including
                the name, filename, class, history, and source file of the
                Stim).
            format (str): Format to return the data in. Can be either 'wide' or
                'long'. In the wide case, every extracted feature is a column,
                and every result object is in a row. In the long case, every
                row contains a single record/feature combination.
            extractor_name (bool): If True, includes the Extractor name as
                a column (in 'long' format) or index level (in 'wide' format).
            object_id (bool, str): If True, attempts to intelligently add an
                'object_id' column that differentiates between multiple objects
                in the results that may share onsets and durations (and would
                otherwise be impossible to distinguish). This frequently occurs
                for ImageExtractors that identify multiple target objects
                (e.g., faces) within a single ImageStim. In addition to boolean
                values, the special value 'auto' can be passed, in which case
                the object_id column will only be inserted if the resulting
                column would be non-constant.
            extractor_params (bool): If True, returns the Extractor's
                log_attributes at extraction time, as stored in the
                transformer_params attribute of ExtractorResult.history.
                These are returned as a serialized dictionary in the
                extractor_params column.
            to_df_kwargs: Passed through to the Extractor's _to_df(), when it
                defines one.

        Returns:
            A pandas DataFrame.
        '''
        # Ideally, Extractors should implement their own _to_df() class method
        # that produces a DataFrame in standardized format. Failing that, we
        # assume self._data is already array-like and can be wrapped in a DF.
        features = self.features
        if hasattr(self.extractor, '_to_df'):
            df = self.extractor._to_df(self, **to_df_kwargs)
        else:
            data = np.array(self._data)
            if features is None:
                # 1-d data is wrapped by pandas as a single column, so it
                # gets exactly one default feature name.
                n_cols = data.shape[1] if data.ndim > 1 else 1
                features = ['feature_%d' % (i + 1) for i in range(n_cols)]
            df = pd.DataFrame(data, columns=features)

        # Any column that is not a named feature is treated as an id column.
        if features is not None:
            index_cols = list(set(df.columns) - set(features))
        else:
            index_cols = []

        if hasattr(self, '_onsets'):
            # Add out of place: an in-place += fails with a casting error
            # when the per-row onsets are integers.
            offset = 0.0 if self.onset is None else self.onset
            onsets = np.array(self._onsets) + offset
        else:
            onsets = np.nan if self.onset is None else self.onset

        durations = getattr(self, '_durations', self.duration)
        durations = np.nan if durations is None else durations
        orders = getattr(self, '_orders', self.order)
        orders = np.nan if orders is None else orders

        # If any features clash with protected keys, append underscore
        protected = ['onset', 'order', 'duration', 'extractor', 'stim_name',
                     'class', 'filename', 'history', 'source_file']
        df = df.rename(columns={k: k + '_' for k in protected})

        # Generally we leave it to Extractors to properly track the number of
        # objects returned in the result DF, using the 'object_id' column.
        # But in cases where the Extractor punts on this and object_id=True,
        # we take our best guess. The logic is that we increment the object
        # counter for any row in the DF that cannot be uniquely distinguished
        # from other rows by onset and duration.
        if object_id and 'object_id' not in df.columns:
            # Broadcast scalars so that every row gets a grouping key even
            # when only one of onsets/durations is given per row.
            row_key = (self._row_labels(onsets, df.index) + '_' +
                       self._row_labels(durations, df.index))
            if object_id is True or (object_id == 'auto' and
                                     len(set(row_key)) < len(df)):
                df.insert(0, 'object_id', df.groupby(row_key).cumcount())
                # Register the id column only once it really exists;
                # otherwise the long-format melt fails on a missing column.
                index_cols.append('object_id')

        if timing is True or (timing == 'auto' and
                              (np.isfinite(onsets).any() or
                               np.isfinite(durations).any() or
                               np.isfinite(orders).any())):
            df.insert(0, 'onset', onsets)
            df.insert(0, 'duration', durations)
            df.insert(0, 'order', orders)
            df = df.sort_values('onset').reset_index(drop=True)
            index_cols.extend(['onset', 'order', 'duration'])

        if format == 'long':
            df = df.melt(index_cols, var_name='feature')
            df = df.dropna(subset=['value'])

        if extractor_name:
            name = self.extractor.name
            if format == 'long':
                df['extractor'] = name
            else:
                df.columns = pd.MultiIndex.from_product([[name], df.columns])

        if metadata:
            df['stim_name'] = self.stim.name
            df['class'] = self.stim.__class__.__name__
            df['filename'] = self.stim.filename
            hist = '' if self.stim.history is None else str(self.stim.history)
            df['history'] = hist
            df['source_file'] = self.history.to_df().iloc[0].source_file

        if extractor_params:
            # NOTE(review): eval() executes arbitrary code. This is only safe
            # while transformer_params is a string produced by pliers itself;
            # confirm it can never carry externally supplied text.
            dict_params = eval(self.history.transformer_params)
            df['extractor_params'] = json.dumps(dict_params)

        return df

    @property
    def history(self):
        ''' Returns the transformation history for the input Stim. '''
        return self._history

    @history.setter
    def history(self, history):
        self._history = history
def merge_results(results, format='wide', timing=True, metadata=True,
                  extractor_names=True, object_id=True, extractor_params=False,
                  aggfunc=None, invalid_results='ignore', **to_df_kwargs):
    ''' Merges a list of ExtractorResults instances and returns a pandas DF.

    Args:
        results (list, tuple): A list of ExtractorResult instances to merge.
        format (str): Format to return the data in. Can be either 'wide' or
            'long'. In the wide case, every extracted feature is a column,
            and every Stim is a row. In the long case, every row contains a
            single Stim/Extractor/feature combination.
        timing (bool, str): Whether or not to include columns for onset,
            order, and duration. If 'auto', the columns are dropped from the
            merged result when every onset is null.
        metadata (bool): if True, includes Stim metadata columns in the
            returned DataFrame. These columns include 'stim_name', 'class',
            'filename', 'history', and 'source_file'. Note that these values
            are often long strings, so the returned DF will be considerably
            larger.
        extractor_names (str, bool): How to handle extractor names when
            returning results. The specific behavior depends on whether format
            is 'long' or 'wide'. Valid values include:

                - 'prepend' or True: In both 'long' and 'wide' formats,
                  feature names will be prepended with the Extractor name
                  (e.g., "FaceExtractor#face_likelihood").
                - 'drop' or False: In both 'long' and 'wide' formats, extractor
                  names will be omitted entirely from the result. Note that
                  this can create feature name conflicts when merging results
                  from multiple Extractors, so is generally discouraged.
                - 'column': In 'long' format, extractor name will be included
                  as a separate column. Intended for 'long' format only; in
                  'wide' format the extractor column does not survive the
                  pivot.
                - 'multi': In 'wide' format, a MultiIndex will be used for the
                  columns, with the first level of the index containing the
                  Extractor name and the second level containing the feature
                  name. This value is invalid if format='long' (and will raise
                  an error).

        object_id (bool, str): If True (the default), attempts to
            intelligently add an 'object_id' column that differentiates
            between multiple objects in the results that may share
            onsets/orders/durations (and would otherwise be impossible to
            distinguish). This frequently occurs for ImageExtractors that
            identify multiple target objects (e.g., faces) within a single
            ImageStim. If 'auto', the 'object_id' column is kept if and only
            if it has a non-constant value.
        extractor_params (bool): If True, returns serialized extractor_params
            of the extractor, i.e. log_attributes at time of extraction.
            If format='wide', merge_results returns one column per extractor,
            each named ExtractorName#FeatureName#extractor_params.
            If format='long', returns only one column named extractor_params.
        aggfunc (str, Callable): If format='wide' and extractor_names='drop',
            it's possible for name clashes between features to occur. In such
            cases, the aggfunc argument is passed onto pandas' pivot_table
            function, and specifies how to aggregate multiple values for the
            same index. Can be a callable or any string value recognized by
            pandas. By default (None), 'mean' will be used for numeric columns
            and 'first' will be used for object/categorical columns.
        invalid_results (str): Specifies desired action for treating elements
            of the passed in results argument that are not ExtractorResult
            objects. Valid values include:

                - 'ignore' will ignore them and merge the valid
                  ExtractorResults.
                - 'fail' will raise an exception on any invalid input

        to_df_kwargs: Passed through to each ExtractorResult.to_df() call.

    Returns: a pandas DataFrame. For format details, see 'format' argument.

    Raises:
        ValueError: If invalid_results='fail' and a non-ExtractorResult
            element is found, or if extractor_names='multi' is combined with
            format='long'.
    '''
    results = flatten(results)

    # 'auto' is resolved on the merged frame below, so every individual
    # result is asked for the full set of columns.
    _timing = True if timing == 'auto' else timing
    _object_id = True if object_id == 'auto' else object_id

    if extractor_names is True:
        extractor_names = 'prepend'
    elif extractor_names is False:
        extractor_names = 'drop'

    dfs = []
    for r in results:
        if isinstance(r, ExtractorResult):
            dfs.append(r.to_df(timing=_timing, metadata=metadata,
                               format='long', extractor_name=True,
                               object_id=_object_id,
                               extractor_params=extractor_params,
                               **to_df_kwargs))
        elif invalid_results == 'fail':
            raise ValueError("At least one of the provided results was not an "
                             "ExtractorResult. Set the invalid_results "
                             "parameter to 'ignore' if you wish to ignore "
                             "this.")

    if not dfs:
        return pd.DataFrame()

    # Fail fast on an invalid combination instead of after all the merging.
    if extractor_names == 'multi' and format == 'long':
        raise ValueError("Invalid extractor_names value 'multi'. When "
                         "format is 'long', extractor_names must be "
                         "one of 'drop', 'prepend', or 'column'.")

    data = pd.concat(dfs, axis=0).reset_index(drop=True)

    if object_id == 'auto' and data['object_id'].nunique() == 1:
        data = data.drop('object_id', axis=1)

    unique_ext = data['extractor'] + '#' + data['feature'].astype(str)
    if extractor_names in ['prepend', 'multi']:
        data['feature'] = unique_ext

    if format == 'wide':
        ind_cols = {'stim_name', 'onset', 'order', 'duration', 'object_id',
                    'class', 'filename', 'history', 'source_file'}
        ind_cols = list(ind_cols & set(data.columns))
        # pandas groupby/index operations can't handle NaNs in index, (see
        # issue at https://github.com/pandas-dev/pandas/issues/3729), so we
        # replace NaNs with a placeholder and then re-substitute after
        # pivoting.
        dtypes = data[ind_cols].dtypes
        data[ind_cols] = data[ind_cols].fillna('PlAcEholdER')

        # Set default aggfunc based on column type, otherwise bad things happen
        if aggfunc is None:
            aggfunc = 'mean' if is_numeric_dtype(data['value']) else 'first'

        # add conditional on value of extractor_names
        if extractor_params:
            data['unique_extractor'] = unique_ext.astype(str) + '#extractor_params'
            attrs = data.pivot_table(index=ind_cols, columns='unique_extractor',
                                     values='extractor_params', aggfunc='first')

        data = data.pivot_table(index=ind_cols, columns='feature',
                                values='value', aggfunc=aggfunc)

        if extractor_params:
            data = pd.concat([data, attrs], axis=1)

        data = data.reset_index()
        data.columns.name = None  # vestigial--is set to 'feature'
        # Restore the NaNs and the original dtypes of the index columns.
        data[ind_cols] = data[ind_cols].replace('PlAcEholdER', np.nan)
        data[ind_cols] = data[ind_cols].astype(dict(zip(ind_cols, dtypes)))

    if extractor_names != 'column' and 'extractor' in data.columns:
        data = data.drop('extractor', axis=1)

    if timing == 'auto' and 'onset' in data.columns:
        if data['onset'].isnull().all():
            data = data.drop(['onset', 'order', 'duration'], axis=1)

    if 'onset' in data.columns:
        key = [('onset', ''), ('order', ''), ('duration', '')] \
            if isinstance(data.columns, pd.MultiIndex) \
            else ['onset', 'order', 'duration']
        data = data.sort_values(key).reset_index(drop=True)

    if extractor_names == 'multi':
        # Column names have the form "Extractor#feature"; split them into
        # the levels of the MultiIndex.
        data.columns = pd.MultiIndex.from_tuples(
            [c.split('#') for c in data.columns])

    return data