# Source code for abydos.corpus

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.corpus.

The corpus class is a container for linguistic corpora and includes various
functions for corpus statistics, language modeling, etc.
"""

from __future__ import unicode_literals

from math import log10

__all__ = ['Corpus']


class Corpus(object):
    """Corpus class.

    Internally, this is a list of lists of lists. The corpus itself is a list
    of documents. Each document is an ordered list of the sentences in that
    document. And each sentence is an ordered list of the words that make up
    that sentence.
    """

    def __init__(self, corpus_text='', doc_split='\n\n', sent_split='\n',
                 filter_chars='', stop_words=None):
        r"""Initialize Corpus.

        By default, when importing a corpus:
            - two consecutive newlines divide documents
            - single newlines divide sentences
            - other whitespace divides words

        Stop words are matched against the whitespace-delimited words *before*
        filter_chars are stripped from them. Words that become empty once
        filter_chars have been stripped are discarded, as are sentences and
        documents that end up with no words at all.

        :param str corpus_text: the corpus text as a single string
        :param str doc_split: a character or string used to split corpus_text
            into documents
        :param str sent_split: a character or string used to split documents
            into sentences
        :param list filter_chars: A list of characters (as a string, tuple,
            set, or list) to filter out of the corpus text
        :param list stop_words: A list of words (as a tuple, set, or list) to
            filter out of the corpus text

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        """
        self.corpus = []
        self.doc_split = doc_split
        self.sent_split = sent_split

        # Build the lookup sets once instead of once per sentence.
        stop_set = set(stop_words) if stop_words else set()
        filter_set = set(filter_chars) if filter_chars else set()

        for document in corpus_text.split(doc_split):
            doc = []
            for raw_sentence in document.split(sent_split):
                sentence = [word for word in raw_sentence.split()
                            if word not in stop_set]
                for char in filter_set:
                    sentence = [word.replace(char, '') for word in sentence]
                # Stripping filter_chars may leave '' behind (e.g. a word
                # that was only punctuation); those are not words.
                sentence = [word for word in sentence if word]
                if sentence:
                    doc.append(sentence)
            if doc:
                self.corpus.append(doc)

    def docs(self):
        r"""Return the docs in the corpus.

        Each list within a doc represents the sentences in that doc, each of
        which is in turn a list of words within that sentence.

        The returned object is the corpus' internal list, not a copy.

        :returns: the docs in the corpus as a list of lists of lists of strs
        :rtype: [[[str]]]

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.docs()
        [[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog', 'ran',
        'off.']]]
        >>> len(corp.docs())
        1
        """
        return self.corpus

    def paras(self):
        r"""Return the paragraphs in the corpus.

        Each list within a paragraph represents the sentences in that
        paragraph, each of which is in turn a list of words within that
        sentence.
        This is identical to the docs() member function and exists only to
        mirror part of NLTK's API for corpora.

        :returns: the paragraphs in the corpus as a list of lists of lists
            of strs
        :rtype: [[[str]]]

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.paras()
        [[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog', 'ran',
        'off.']]]
        >>> len(corp.paras())
        1
        """
        return self.docs()

    def sents(self):
        r"""Return the sentences in the corpus.

        Each list within a sentence represents the words within that sentence.
        Document boundaries are flattened away.

        :returns: the sentences in the corpus as a list of lists of strs
        :rtype: [[str]]

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.sents()
        [['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog', 'ran',
        'off.']]
        >>> len(corp.sents())
        3
        """
        return [sent for doc in self.corpus for sent in doc]

    def words(self):
        r"""Return the words in the corpus as a single list.

        :returns: the words in the corpus as a list of strs
        :rtype: [str]

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.words()
        ['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.', 'And', 'then', 'it', 'slept.', 'And', 'the', 'dog', 'ran',
        'off.']
        >>> len(corp.words())
        18
        """
        return [word for sent in self.sents() for word in sent]

    def docs_of_words(self):
        r"""Return the docs in the corpus, with sentences flattened.

        Each list within the corpus represents all the words of that document.
        Thus the sentence level of lists has been flattened.

        :returns: the docs in the corpus as a list of list of strs
        :rtype: [[str]]

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.docs_of_words()
        [['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.', 'And', 'then', 'it', 'slept.', 'And', 'the', 'dog', 'ran',
        'off.']]
        >>> len(corp.docs_of_words())
        1
        """
        return [[word for sent in doc for word in sent]
                for doc in self.corpus]

    def raw(self):
        r"""Return the raw corpus.

        This is reconstructed by joining sub-components with the corpus' split
        characters: words are joined with a single space, sentences with
        sent_split, and documents with doc_split. Text removed on import
        (stop words, filtered characters, extra whitespace) is not restored.

        :returns: the raw corpus
        :rtype: str

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> print(corp.raw())
        The quick brown fox jumped over the lazy dog.
        And then it slept.
        And the dog ran off.
        >>> len(corp.raw())
        85
        """
        return self.doc_split.join(
            self.sent_split.join(' '.join(sent) for sent in doc)
            for doc in self.corpus
        )

    def idf(self, term, transform=None):
        r"""Calculate the Inverse Document Frequency of a term in the corpus.

        The value is log10(number of documents / number of documents that
        contain term). Matching is exact, so 'dog' does not match 'dog.'
        unless transform normalizes the document words.

        :param str term: the term to calculate the IDF of
        :param function transform: a function to apply to each document term
            before checking for the presence of term
        :returns: the IDF, or infinity if no document contains term
        :rtype: float

        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n\n'
        >>> tqbf += 'And then it slept.\n\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> print(corp.docs())
        [[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.']],
        [['And', 'then', 'it', 'slept.']],
        [['And', 'the', 'dog', 'ran', 'off.']]]
        >>> round(corp.idf('dog'), 10)
        0.4771212547
        >>> round(corp.idf('the'), 10)
        0.1760912591
        """
        docs = self.docs_of_words()
        docs_with_term = 0
        for doc in docs:
            doc_set = set(doc)
            if transform:
                doc_set = {transform(word) for word in doc_set}
            if term in doc_set:
                docs_with_term += 1

        if docs_with_term == 0:
            return float('inf')

        # float() forces true division; without it Python 2 would floor the
        # ratio (e.g. 3/2 -> 1) and return a wrong IDF.
        return log10(float(len(docs)) / docs_with_term)


if __name__ == '__main__':
    # Run this module's doctests when it is executed as a script. The
    # expected outputs in the docstrings are wrapped across lines, so
    # whitespace differences must be ignored when comparing.
    import doctest
    _doctest_flags = doctest.NORMALIZE_WHITESPACE
    doctest.testmod(optionflags=_doctest_flags)