Source code for pliers.stimuli.text

''' Classes that represent text or sequences of text. '''

import re
from urllib.request import urlopen

import pandas as pd

from pliers.support.decorators import requires_nltk_corpus
from pliers.utils import attempt_to_import, verify_dependencies
from .base import Stim

pysrt = attempt_to_import('pysrt')


[docs]class TextStim(Stim):

    ''' Any simple text stimulus--most commonly a single word.

    Args:
        filename (str): Path to input file, if one exists.
        text (str): Text value to store. If none is provided, value is read
            from filename.
        onset (float): Optional onset of the text presentation (in secs) with
            respect to some more general context or timeline the user wishes
            to keep track of.
        duration (float): Optional duration of the TextStim, in seconds.
        order (int): Optional sequential index of the TextStim within some
            broader context.
        url (str): Optional url to read contents from.
    '''

    _default_file_extension = '.txt'

[docs]    def __init__(self, filename=None, text=None, onset=None, duration=None,
                 order=None, url=None):
        if filename is not None and text is None:
            text = open(filename).read()
        if url is not None:
            text = urlopen(url).read()
        self.text = text
        name = 'text[%s]' % text[:40]  # Truncate at 40 chars
        super().__init__(filename, onset, duration, order,
                                       name=name, url=url)

    @property
    def data(self):
        return self.text

    def save(self, path):
        with open(path, 'w') as f:
            f.write(self.text)


[docs]class ComplexTextStim(Stim):

    ''' A collection of text stims (e.g., a story), typically ordered and with
    onsets and/or durations associated with each element.

    Args:
        filename (str): The filename to read from. Must be tab-delimited text.
            Files must always contain a column containing the text of each
            stimulus in the collection. Optionally, additional columns can be
            included that contain duration and onset information. If a header
            row is present in the file, valid columns must be labeled as
            'text', 'onset', and 'duration' where available (though only text
            is mandatory). If no header is present in the file, the columns
            argument will be used to infer the indices of the key columns.
        onset (float): Optional onset of the ComplexTextStim relative to some
            more general context.
        duration (float): Optional duration of the ComplexTextStim withing some
            more general context.
        columns (str): Optional specification of column order. An abbreviated
            string denoting the column position of text, onset, and duration
            in the file. Use t for text, o for onset, d for duration. For
            example, passing 'ot' indicates that the first column contains
            the onsets and the second contains the text. E.g., passing 'tod'
            indicates that the first three columns contain text, onset, and
            duration information, respectively. Note that if the input file
            contains a header row, the columns argument will be ignored.
        default_duration (float): the duration to assign to any text elements
            in the collection that do not have an explicit value provided
            in the input file.
        elements (list): An optional list of TextStims that comprise the
            ComplexTextStim. If both the filename and elements arguments are
            passed, the TextStims in elements will be appended to the ones
            extracted from the file.
        text (str): Optional multi-token string to convert to a
            ComplexTextStim.
        unit (str): The unit of segmentation. Either 'word' or 'sentence'.
            Ignored if text is None.
        tokenizer: Optional tokenizer to use if initializing from text. If
            passed, will override the default nltk tokenizers. If a string is
            passed, it is interpreted as a capturing regex and passed to
            re.findall(). Otherwise, must be an object that implements a
            tokenize() method and returns a list of tokens. Ignored if text is
            None.
        language (str): The language to use; passed to nltk. Only used if
            tokenizer is None. Defaults to English. Ignored if text is None.
    '''

    _default_file_extension = '.txt'

[docs]    def __init__(self, filename=None, onset=None, duration=None, columns=None,
                 default_duration=None, elements=None, text=None, unit='word',
                 tokenizer=None, language='english'):

        if filename is None and elements is None and text is None:
            raise ValueError("At least one of the 'filename', 'elements', or "
                             "text arguments must be specified.")

        super().__init__(filename, onset, duration)

        self._elements = []

        if filename is not None:
            if filename.endswith('srt'):
                self._from_srt(filename)
            else:
                self._from_file(filename, columns, default_duration)

        if elements is not None:
            self._elements.extend(elements)

        if text is not None:
            self._from_text(text, unit, tokenizer, language)

    @property
    def elements(self):
        return [f for f in self]

    def _from_file(self, filename, columns, default_duration):
        tod_names = {'t': 'text', 'o': 'onset', 'd': 'duration'}

        first_row = open(filename).readline().strip().split('\t')
        if len(set(first_row) & set(tod_names.values())):
            col_names = None
        else:
            col_names = [tod_names[x] for x in columns]

        data = pd.read_csv(filename, sep='\t', names=col_names)

        for i, r in data.iterrows():
            if 'onset' not in r:
                elem = TextStim(r['text'], order=i)
            else:
                duration = r.get('duration', None)
                if duration is None:
                    duration = default_duration
                elem = TextStim(filename, r['text'], r['onset'], duration)
            self._elements.append(elem)

    def save(self, path):
        if path.endswith('srt'):
            verify_dependencies(['pysrt'])
            from pysrt import SubRipFile, SubRipItem
            from datetime import time

            out = SubRipFile()
            for elem in self._elements:
                start = time(*self._to_tup(elem.onset))
                end = time(*self._to_tup(elem.onset + elem.duration))
                out.append(SubRipItem(0, start, end, elem.text))
            out.save(path)
        else:
            with open(path, 'w') as f:
                f.write('onset\ttext\tduration\n')
                for elem in self._elements:
                    f.write('{}\t{}\t{}\n'.format(elem.onset,
                                                  elem.text,
                                                  elem.duration))

    def _from_srt(self, filename):
        verify_dependencies(['pysrt'])

        data = pysrt.open(filename)
        list_ = [[] for _ in data]
        for i, row in enumerate(data):
            start = tuple(row.start)
            start_time = self._to_sec(start)

            end_ = tuple(row.end)
            duration = self._to_sec(end_) - start_time

            line = re.sub(r'\s+', ' ', row.text)
            list_[i] = [line, start_time, duration]

        # Convert to pandas DataFrame
        df = pd.DataFrame(columns=["text", "onset", "duration"], data=list_)

        for i, r in df.iterrows():
            elem = TextStim(filename, text=r['text'], onset=r['onset'],
                            duration=r['duration'], order=i)
            self._elements.append(elem)

    def __iter__(self):
        """ Iterate text elements. """
        for elem in self._elements:
            offset = 0.0 if self.onset is None else self.onset
            # Calculate element onset relative to this stimulus
            rel_onset = offset if elem.onset is None else offset + elem.onset
            yield TextStim(elem.filename, elem.text, rel_onset, elem.duration,
                           elem.order)

    def _to_tup(self, sec):
        hours = int(sec / 3600)
        sec = sec % 3600
        mins = int(sec / 60)
        sec = sec % 60
        secs = int(sec)
        microsecs = int((sec - secs) * 1000000)
        return hours, mins, secs, microsecs

    def _to_sec(self, tup):
        hours, mins, secs, msecs = tup
        total_msecs = (hours * 60 * 60 * 1000) + (mins * 60 * 1000) + \
            (secs * 1000) + msecs
        total_secs = total_msecs / 1000.
        return total_secs

    def _from_text(self, text, unit, tokenizer, language):

        if tokenizer is not None:
            if isinstance(tokenizer, str):
                tokens = re.findall(tokenizer, text)
            else:
                tokens = tokenizer.tokenize(text)
        else:
            import nltk

            @requires_nltk_corpus
            def tokenize_text(text):
                if unit == 'word':
                    try:
                        return nltk.word_tokenize(text, language)
                    except LookupError:
                        nltk.download('punkt')
                        return nltk.word_tokenize(text, language)
                elif unit.startswith('sent'):
                    return nltk.sent_tokenize(text, language)
                else:
                    raise ValueError(
                        "unit must be either 'word' or 'sentence'")

            tokens = tokenize_text(text)

        for i, t in enumerate(tokens):
            self._elements.append(TextStim(text=t, onset=None, duration=None,
                                  order=i))

    @property
    def data(self):
        return ' '.join([e.text for e in self._elements])

    def __hash__(self):
        return hash(
            (self.data, self.onset, self.duration, self.order, self.history))