Source code for abydos.corpus

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.corpus.

The corpus class is a container for linguistic corpora and includes various
functions for corpus statistics, language modeling, etc.
"""

from __future__ import unicode_literals

from math import log10

__all__ = ['Corpus']


class Corpus(object):
    """Corpus class.

    Internally, this is a list of lists of lists. The corpus itself is a
    list of documents. Each document is an ordered list of the sentences in
    that document. And each sentence is an ordered list of the words that
    make up that sentence.

    Instance attributes:

    - ``corpus``: the nested ``[[[str]]]`` structure described above
    - ``doc_split``: the separator used to split the text into documents
      (and to re-join them in :meth:`raw`)
    - ``sent_split``: the separator used to split documents into sentences
      (and to re-join them in :meth:`raw`)
    """

    def __init__(self, corpus_text='', doc_split='\n\n', sent_split='\n',
                 filter_chars='', stop_words=None):
        r"""Initialize Corpus.

        By default, when importing a corpus:
            - two consecutive newlines divide documents
            - single newlines divide sentences
            - other whitespace divides words

        Stop words are removed first (by exact match against the
        whitespace-delimited tokens); the filter characters are stripped
        from the remaining words afterwards. Words that become empty once
        the filter characters are stripped are discarded, as are sentences
        and documents that end up with no words at all.

        :param str corpus_text: the corpus text as a single string
        :param str doc_split: a character or string used to split corpus_text
            into documents
        :param str sent_split: a character or string used to split documents
            into sentences
        :param list filter_chars: A list of characters (as a string, tuple,
            set, or list) to filter out of the corpus text
        :param list stop_words: A list of words (as a tuple, set, or list) to
            filter out of the corpus text

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        """
        self.corpus = []
        self.doc_split = doc_split
        self.sent_split = sent_split

        # Build the lookup sets once instead of once per sentence.
        stop_set = set(stop_words) if stop_words else set()
        char_set = set(filter_chars)

        for document in corpus_text.split(doc_split):
            doc = []
            for raw_sentence in document.split(sent_split):
                # str.split() with no argument splits on any whitespace run
                # and never yields empty tokens.
                sentence = [word for word in raw_sentence.split()
                            if word not in stop_set]
                for char in char_set:
                    sentence = [word.replace(char, '') for word in sentence]
                # Stripping filter characters can reduce a word to '';
                # drop such tokens so they do not show up as words or keep
                # an otherwise empty sentence alive.
                sentence = [word for word in sentence if word]
                if sentence:
                    doc.append(sentence)
            if doc:
                self.corpus.append(doc)

    def docs(self):
        r"""Return the docs in the corpus.

        Each list within a doc represents the sentences in that doc, each of
        which is in turn a list of words within that sentence.

        The internal list is returned directly, not a copy.

        :returns: the docs in the corpus as a list of lists of lists of strs
        :rtype: [[[str]]]

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.docs()
        [[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog',
        'ran', 'off.']]]
        >>> len(corp.docs())
        1
        """
        return self.corpus

    def paras(self):
        r"""Return the paragraphs in the corpus.

        Each list within a paragraph represents the sentences in that doc,
        each of which is in turn a list of words within that sentence.
        This is identical to the docs() member function and exists only to
        mirror part of NLTK's API for corpora.

        :returns: the paragraphs in the corpus as a list of lists of lists
            of strs
        :rtype: [[[str]]]

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.paras()
        [[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog',
        'ran', 'off.']]]
        >>> len(corp.paras())
        1
        """
        return self.docs()

    def sents(self):
        r"""Return the sentences in the corpus.

        Each list within a sentence represents the words within that
        sentence. Document boundaries are flattened away.

        :returns: the sentences in the corpus as a list of lists of strs
        :rtype: [[str]]

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.sents()
        [['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog',
        'ran', 'off.']]
        >>> len(corp.sents())
        3
        """
        return [sent for doc in self.corpus for sent in doc]

    def words(self):
        r"""Return the words in the corpus as a single list.

        :returns: the words in the corpus as a list of strs
        :rtype: [str]

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.words()
        ['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.', 'And', 'then', 'it', 'slept.', 'And', 'the', 'dog', 'ran',
        'off.']
        >>> len(corp.words())
        18
        """
        return [word for sent in self.sents() for word in sent]

    def docs_of_words(self):
        r"""Return the docs in the corpus, with sentences flattened.

        Each list within the corpus represents all the words of that
        document. Thus the sentence level of lists has been flattened.

        :returns: the docs in the corpus as a list of list of strs
        :rtype: [[str]]

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.docs_of_words()
        [['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.', 'And', 'then', 'it', 'slept.', 'And', 'the', 'dog', 'ran',
        'off.']]
        >>> len(corp.docs_of_words())
        1
        """
        return [[word for sent in doc for word in sent]
                for doc in self.corpus]

    def raw(self):
        r"""Return the raw corpus.

        This is reconstructed by joining sub-components with the corpus'
        split characters: words are joined with a single space, sentences
        with ``sent_split`` and documents with ``doc_split``. Anything
        removed at construction time (stop words, filter characters, extra
        whitespace) is therefore not restored.

        :returns: the raw corpus
        :rtype: str

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> print(corp.raw())
        The quick brown fox jumped over the lazy dog.
        And then it slept.
        And the dog ran off.
        >>> len(corp.raw())
        85
        """
        return self.doc_split.join(
            self.sent_split.join(' '.join(sent) for sent in doc)
            for doc in self.corpus)

    def idf(self, term, transform=None):
        r"""Calculate the Inverse Document Frequency of a term in the corpus.

        The value is ``log10(N / n)``, where ``N`` is the number of
        documents in the corpus and ``n`` the number of documents that
        contain the term.

        :param str term: the term to calculate the IDF of
        :param function transform: a function to apply to each document term
            before checking for the presence of term (it is not applied to
            ``term`` itself)
        :returns: the IDF, or positive infinity if no document contains the
            term
        :rtype: float

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n\n'
        >>> tqbf += 'And then it slept.\n\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> print(corp.docs())
        [[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.']],
        [['And', 'then', 'it', 'slept.']],
        [['And', 'the', 'dog', 'ran', 'off.']]]
        >>> round(corp.idf('dog'), 10)
        0.4771212547
        >>> round(corp.idf('the'), 10)
        0.1760912591
        """
        docs = self.docs_of_words()
        docs_with_term = 0
        for doc in docs:
            doc_terms = set(doc)
            if transform:
                doc_terms = set(transform(word) for word in doc_terms)
            if term in doc_terms:
                docs_with_term += 1

        if docs_with_term == 0:
            return float('inf')
        # Force true division: this module does not import ``division``
        # from __future__, so int / int would truncate on Python 2.
        return log10(float(len(docs)) / docs_with_term)
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    # NORMALIZE_WHITESPACE lets the long expected list outputs in the
    # docstrings be wrapped across several lines.
    import doctest

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)