Source code for ticclat.tokenize

"""Generators that produce term-frequency vectors of documents in a corpus.

A document in ticclat is a term-frequency vector (collections.Counter). This
module contains generators that return term-frequency vectors for certain types
of input data.
"""
import bz2

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer


def ticcl_frequency(in_files, max_word_length=255):
    """Generate word-frequency dictionaries from TICCL frequency files.

    Each input file is expected to contain one ``word<TAB>frequency`` pair
    per line. Files whose name ends in ``bz2`` are read through ``bz2``;
    all other files are opened as plain text. Both are decoded as UTF-8.

    Args:
        in_files: iterable of paths (``str`` or path-like) to frequency
            files, one per document in the corpus.
        max_word_length: wordforms longer than this many characters are
            left out of the result.

    Yields:
        One ``dict`` per input file, mapping each word to its frequency
        (``int``).

    Raises:
        ValueError: if a non-blank line does not consist of exactly two
            tab-separated fields, or if the frequency is not an integer.
    """
    for freq_file in in_files:
        word_freqs = {}

        # str() lets path-like objects (e.g. pathlib.Path) be used as well.
        opener = bz2.open if str(freq_file).endswith('bz2') else open

        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with opener(freq_file, 'rt', encoding='utf-8') as file_handle:
            for line in file_handle:
                # Tolerate empty lines (e.g. a trailing blank line) instead
                # of failing on the tuple unpacking below.
                if not line.strip():
                    continue

                # Sometimes a word contains a space, so we split only on tab.
                word, freq = line.split('\t')

                # The corpus may contain wordforms that are too long.
                if len(word) <= max_word_length:
                    # int() ignores the trailing newline left on freq.
                    word_freqs[word] = int(freq)

        yield word_freqs
def do_nothing(list_of_words):
    """Identity function: hand back `list_of_words` exactly as received.

    The very same object is returned; no copy is made.
    """
    return list_of_words
def terms_documents_matrix_word_lists(word_lists):
    """Build a terms-documents matrix from pre-tokenized documents.

    A terms-documents matrix holds the frequencies of wordforms, with
    wordforms along one matrix axis and documents along the other.

    Inputs:
        word_lists: iterator over lists of words (one list per document)

    Returns:
        corpus: a sparse terms documents matrix
        vocabulary: the fitted vectorizer object containing the vocabulary
                    (i.e., all word forms in the corpus)
    """
    # The documents are already lists of words, so the tokenizer passes them
    # through untouched and lowercasing is switched off.
    vectorizer = CountVectorizer(lowercase=False, tokenizer=do_nothing)
    matrix = vectorizer.fit_transform(word_lists)

    return matrix, vectorizer
def terms_documents_matrix_ticcl_frequency(in_files):
    """Build a terms-documents matrix from TICCL frequency files.

    A terms-documents matrix holds the frequencies of wordforms, with
    wordforms along one matrix axis (columns) and documents along the
    other (rows).

    Inputs:
        in_files: list of ticcl frequency files (one per document in the
                  corpus)

    Returns:
        corpus: a sparse terms documents matrix
        vocabulary: the fitted vectorizer object containing the vocabulary
                    (i.e., all word forms in the corpus)
    """
    # One word -> frequency dictionary is produced per input file.
    frequency_dicts = ticcl_frequency(in_files)

    vectorizer = DictVectorizer()
    matrix = vectorizer.fit_transform(frequency_dicts)

    return matrix, vectorizer