# Source code for pliers.datasets.text

''' Functionality for loading and manipulating text datasets. '''

import zipfile
import json
import os
import tempfile
import io
import requests
import pandas as pd


def _load_datasets():
    ''' Load the registry of predefined dictionaries.

    Reads the 'dictionaries.json' file located in the same directory as this
    module.

    Returns: The parsed JSON content, exactly as produced by json.load().

    '''
    module_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(module_dir, 'dictionaries.json')
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

# Registry of predefined dictionaries, read once from dictionaries.json when
# this module is imported.
datasets = _load_datasets()


def _get_dictionary_path():
    ''' Return the local directory used to store dictionaries.

    The directory is '~/pliers_data/dictionaries' (with '~' expanded to the
    user's home directory); it is created if it does not exist yet.

    Returns: The path of the storage directory, as a string.

    '''
    # For now, stash everything under home directory.
    # TODO: Generalize this to support default system paths, env vars, etc.
    dir_path = os.path.expanduser(
        os.path.join('~', 'pliers_data', 'dictionaries'))
    # exist_ok avoids the check-then-create race when several processes ask
    # for the directory at the same time.
    os.makedirs(dir_path, exist_ok=True)
    return dir_path


def _download_dictionary(url, format, rename):
    ''' Download a dictionary file and load it into a pandas DataFrame.

    Args:
        url (str): Location of the file to download.
        format (str): 'csv', 'tsv', a string starting with 'xls', or None.
            A recognized explicit format takes precedence; otherwise the
            format is inferred from the extension of the URL and, failing
            that, from the extension of the file extracted from a zip
            archive.
        rename (dict): Optional column mapping passed to DataFrame.rename().

    Returns: A pandas DataFrame holding the (optionally renamed) contents.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        ValueError: If the archive holds no file, or if the data format can
            be determined neither from `format` nor from the file names.

    '''
    explicit = (format or '').lower()
    url_ext = os.path.splitext(url)[1].lstrip('.').lower()

    # The temporary directory (and the downloaded file in it) is removed as
    # soon as the data has been parsed into memory.
    with tempfile.TemporaryDirectory() as tmpdir:
        local_path = os.path.join(
            tmpdir, os.path.basename(url) or 'dictionary')
        r = requests.get(url)
        # Fail loudly rather than trying to parse an HTML error page.
        r.raise_for_status()
        with open(local_path, 'wb') as f:
            f.write(r.content)

        # .xlsx workbooks are zip containers themselves and must be handed to
        # the Excel reader as-is, not unpacked.
        is_workbook = explicit == 'xlsx' or url_ext in ('xlsx', 'xlsm')
        if not is_workbook and zipfile.is_zipfile(local_path):
            with zipfile.ZipFile(local_path) as zf:
                members = [m for m in zf.namelist() if not m.endswith('/')]
                if not members:
                    raise ValueError(
                        "Archive downloaded from '%s' contains no files."
                        % url)
                # extract() returns the sanitized path it actually wrote to.
                local_path = zf.extract(members[0], tmpdir)

        local_ext = os.path.splitext(local_path)[1].lstrip('.').lower()
        kind = None
        for candidate in (explicit, url_ext, local_ext):
            if candidate in ('csv', 'tsv'):
                kind = candidate
            elif candidate.startswith('xls'):
                kind = 'xls'
            if kind is not None:
                break
        if kind is None:
            raise ValueError(
                "Unable to determine the format of '%s'; please pass "
                "format='csv', 'tsv' or 'xls'." % url)

        if kind == 'csv':
            data = pd.read_csv(local_path)
        elif kind == 'tsv':
            data = pd.read_csv(local_path, sep='\t')
        else:
            data = pd.read_excel(local_path, engine='openpyxl')

    if rename is not None:
        data = data.rename(columns=rename)
    return data


def fetch_dictionary(name, url=None, format=None, index=0, rename=None,
                     save=True, force_retrieve=False):
    ''' Retrieve a dictionary of text norms from the web or local storage.

    Args:
        name (str): The name of the dictionary. If no url is passed, this must
            match either one of the keys in the predefined dictionary file (see
            dictionaries.json), or the name assigned to a previous dictionary
            retrieved from a specific URL.
        url (str): The URL of dictionary file to retrieve. Optional if name
            matches an existing dictionary.
        format (str): One of 'csv', 'tsv', 'xls', or None. Used to read data
            appropriately. Note that most forms of compression will be detected
            and handled automatically, so the format string refers only to the
            format of the decompressed file. When format is None, the format
            will be inferred from the filename.
        index (str, int): The name or numeric position of the column to use
            as the dictionary index when the dictionary is downloaded. A
            locally stored copy already has its index saved as the first
            column, so this argument is not needed to reload it.
        rename (dict): An optional dictionary passed to pd.rename(); can be
            used to rename columns in the loaded dictionary. Note that the
            locally-saved dictionary will retain the renamed columns.
        save (bool): Whether or not to save the dictionary locally the first
            time it is retrieved.
        force_retrieve (bool): If True, remote dictionary will always be
            downloaded, even if a local copy exists (and the local copy will
            be overwritten).

    Returns: A pandas DataFrame indexed by strings (typically words).

    Raises:
        ValueError: If there is no usable local copy, the name is not a
            predefined dictionary, and no url was given.

    Note: when name matches a predefined dictionary, the url, format, index
    and rename settings stored for it take precedence over the arguments.

    '''
    file_path = os.path.join(_get_dictionary_path(), name + '.csv')
    # `name` does not have to be a preset: it may refer to a dictionary that
    # was fetched from an arbitrary URL earlier and cached locally.
    preset = datasets.get(name, {})

    if not force_retrieve and os.path.exists(file_path):
        df = pd.read_csv(file_path)
        cached_index = preset.get('index')
        # The cache is written with the index as its first column, so fall
        # back to that column unless the preset names an existing column.
        if not isinstance(cached_index, str) or \
                cached_index not in df.columns:
            cached_index = df.columns[0]
        return df.set_index(cached_index)

    if name in datasets:
        url = preset['url']
        format = preset.get('format', format)
        index = preset.get('index', index)
        rename = preset.get('rename', rename)

    if url is None:
        raise ValueError("Dataset '%s' not found in local storage or presets, "
                         "and no download URL provided." % name)
    data = _download_dictionary(url, format=format, rename=rename)

    # Translate a positional index into the corresponding column name.
    if isinstance(index, int):
        index = data.columns[index]
    data = data.set_index(index)

    if save:
        data.to_csv(file_path, encoding='utf-8')
    return data