# Source code for ticclat.tokenize

"""Generators that produce term-frequency vectors of documents in a corpus.

A document in ticclat is a term-frequency vector (collections.Counter). This
module contains generators that return term-frequency vectors for certain types
of input data.
"""
import bz2

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer


def ticcl_frequency(in_files, max_word_length=255):
    """
    Generate word-frequency dictionaries from TICCL frequency files.

    Every non-blank line of an input file must consist of exactly two
    tab-separated fields: the word and its integer frequency. Files whose
    name ends in ``bz2`` are read through ``bz2``; all other files are opened
    as plain text. Both are decoded with the platform default encoding.

    Args:
        in_files: iterable of file names (``str`` or ``pathlib.Path``), one
            per document.
        max_word_length: words longer than this number of characters are
            left out of the result.

    Yields:
        dict: one dictionary per input file, mapping each word (key) to its
        frequency (value). If a word occurs on several lines, the last
        frequency wins.

    Raises:
        ValueError: if a non-blank line does not have exactly two
            tab-separated fields, or if its frequency is not an integer.
    """
    for freq_file in in_files:
        # Check the suffix on the string form so pathlib.Path objects are
        # accepted as well as plain strings.
        if str(freq_file).endswith('bz2'):
            opener = bz2.open
        else:
            opener = open

        word_freqs = {}
        with opener(freq_file, 'rt') as file_handle:
            for line in file_handle:
                # Tolerate empty lines (e.g. a trailing newline at the end of
                # the file) instead of failing on the unpacking below.
                if not line.strip():
                    continue

                # Sometimes a word contains a space, so we split only on tab.
                word, freq = line.split('\t')

                # The corpus may contain wordforms that are too long
                if len(word) <= max_word_length:
                    # int() ignores the trailing newline left on freq.
                    word_freqs[word] = int(freq)
        yield word_freqs


def do_nothing(list_of_words):
    """Return the argument unchanged.

    This identity function is passed as the ``tokenizer`` of the
    ``CountVectorizer`` in ``terms_documents_matrix_word_lists``, so that
    documents which are already lists of words are not tokenized again.

    Args:
        list_of_words: any object; in this module, a list of words.

    Returns:
        The very same object that was passed in (no copy is made).
    """
    return list_of_words


def terms_documents_matrix_word_lists(word_lists):
    """Return a terms document matrix and its vectorizer for a corpus.

    A terms document matrix contains frequencies of wordforms, with wordforms
    along one matrix axis and documents along the other.

    The documents are expected to be tokenized already: ``do_nothing`` is
    used as tokenizer, and lowercasing is switched off, so every word is
    counted exactly as given.

    Args:
        word_lists: iterator over lists of words (one list per document).

    Returns:
        tuple: ``(corpus, vocabulary)`` where

        * ``corpus`` is the sparse terms documents matrix returned by
          ``CountVectorizer.fit_transform``;
        * ``vocabulary`` is the fitted ``CountVectorizer`` object containing
          the vocabulary (i.e., all word forms in the corpus).
    """
    vocabulary = CountVectorizer(tokenizer=do_nothing, lowercase=False)
    corpus = vocabulary.fit_transform(word_lists)

    return corpus, vocabulary


def terms_documents_matrix_ticcl_frequency(in_files):
    """Return a terms document matrix and its vectorizer for a corpus.

    A terms document matrix contains frequencies of wordforms, with wordforms
    along one matrix axis (columns) and documents along the other (rows).

    The word frequencies of each document are read with ``ticcl_frequency``
    and vectorized with a ``DictVectorizer``.

    Args:
        in_files: list of ticcl frequency files (one per document in the
            corpus).

    Returns:
        tuple: ``(corpus, vocabulary)`` where

        * ``corpus`` is the sparse terms documents matrix returned by
          ``DictVectorizer.fit_transform``;
        * ``vocabulary`` is the fitted ``DictVectorizer`` object containing
          the vocabulary (i.e., all word forms in the corpus).
    """
    vocabulary = DictVectorizer()
    corpus = vocabulary.fit_transform(ticcl_frequency(in_files))

    return corpus, vocabulary