# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.corpus._corpus.
The Corpus class is a container for linguistic corpora and includes various
functions for corpus statistics, language modeling, etc.
"""
from math import log
__all__ = ['Corpus']
class Corpus(object):
    """Corpus class.

    Internally, this is a list of lists of lists. The corpus itself is a
    list of documents. Each document is an ordered list of sentences in that
    document. And each sentence is an ordered list of words that make up
    that sentence.

    .. versionadded:: 0.1.0
    """

    def __init__(
        self,
        corpus_text='',
        doc_split='\n\n',
        sent_split='\n',
        filter_chars='',
        stop_words=None,
        word_tokenizer=None,
    ):
        r"""Initialize Corpus.

        By default, when importing a corpus:

        - two consecutive newlines divide documents
        - single newlines divide sentences
        - other whitespace divides words

        Parameters
        ----------
        corpus_text : str
            The corpus text as a single string
        doc_split : str
            A character or string used to split corpus_text into documents
        sent_split : str
            A character or string used to split documents into sentences
        filter_chars : list
            A list of characters (as a string, tuple, set, or list) to filter
            out of the corpus text
        stop_words : list
            A list of words (as a tuple, set, or list) to filter out of the
            corpus text
        word_tokenizer : _Tokenizer
            A tokenizer to apply to each sentence in order to retrieve the
            individual "word" tokens. If set to none, str.split() will be
            used.

        Example
        -------
        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)

        .. versionadded:: 0.1.0
        """
        self.corpus = []
        self.doc_split = doc_split
        self.sent_split = sent_split

        for document in corpus_text.split(doc_split):
            doc = []
            for sentence in document.split(sent_split):
                # Tokenize the sentence into words.
                if word_tokenizer:
                    sentence = word_tokenizer.tokenize(sentence).get_list()
                else:
                    sentence = sentence.split()

                # Remove every occurrence of each stop word.
                if stop_words:
                    for word in set(stop_words):
                        while word in sentence:
                            sentence.remove(word)

                # Strip each filtered character from every word.
                for char in set(filter_chars):
                    sentence = [word.replace(char, '') for word in sentence]

                # Skip sentences (and docs) left empty after filtering.
                if sentence:
                    doc.append(sentence)
            if doc:
                self.corpus.append(doc)

    def docs(self):
        r"""Return the docs in the corpus.

        Each list within a doc represents the sentences in that doc, each of
        which is in turn a list of words within that sentence.

        Returns
        -------
        [[[str]]]
            The docs in the corpus as a list of lists of lists of strs

        Example
        -------
        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.docs()
        [[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog',
        'ran', 'off.']]]
        >>> len(corp.docs())
        1

        .. versionadded:: 0.1.0
        """
        return self.corpus

    def paras(self):
        r"""Return the paragraphs in the corpus.

        Each list within a paragraph represents the sentences in that doc,
        each of which is in turn a list of words within that sentence.
        This is identical to the docs() member function and exists only to
        mirror part of NLTK's API for corpora.

        Returns
        -------
        [[[str]]]
            The paragraphs in the corpus as a list of lists of lists of strs

        Example
        -------
        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.paras()
        [[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog',
        'ran', 'off.']]]
        >>> len(corp.paras())
        1

        .. versionadded:: 0.1.0
        """
        return self.docs()

    def sents(self):
        r"""Return the sentences in the corpus.

        Each list within a sentence represents the words within that
        sentence.

        Returns
        -------
        [[str]]
            The sentences in the corpus as a list of lists of strs

        Example
        -------
        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.sents()
        [['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog',
        'ran', 'off.']]
        >>> len(corp.sents())
        3

        .. versionadded:: 0.1.0
        """
        return [words for sents in self.corpus for words in sents]

    def words(self):
        r"""Return the words in the corpus as a single list.

        Returns
        -------
        [str]
            The words in the corpus as a list of strs

        Example
        -------
        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.words()
        ['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.', 'And', 'then', 'it', 'slept.', 'And', 'the', 'dog', 'ran',
        'off.']
        >>> len(corp.words())
        18

        .. versionadded:: 0.1.0
        """
        return [words for sents in self.sents() for words in sents]

    def docs_of_words(self):
        r"""Return the docs in the corpus, with sentences flattened.

        Each list within the corpus represents all the words of that
        document. Thus the sentence level of lists has been flattened.

        Returns
        -------
        [[str]]
            The docs in the corpus as a list of list of strs

        Example
        -------
        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> corp.docs_of_words()
        [['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.', 'And', 'then', 'it', 'slept.', 'And', 'the', 'dog', 'ran',
        'off.']]
        >>> len(corp.docs_of_words())
        1

        .. versionadded:: 0.1.0
        """
        return [
            [words for sents in doc for words in sents] for doc in self.corpus
        ]

    def raw(self):
        r"""Return the raw corpus.

        This is reconstructed by joining sub-components with the corpus'
        split characters.

        Returns
        -------
        str
            The raw corpus

        Example
        -------
        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> print(corp.raw())
        The quick brown fox jumped over the lazy dog.
        And then it slept.
        And the dog ran off.
        >>> len(corp.raw())
        85

        .. versionadded:: 0.1.0
        """
        # Rebuild each doc from its sentences, then join the docs.
        return self.doc_split.join(
            self.sent_split.join(' '.join(sent) for sent in doc)
            for doc in self.corpus
        )

    def idf(self, term, transform=None):
        r"""Calculate the Inverse Document Frequency of a term in the corpus.

        Parameters
        ----------
        term : str
            The term to calculate the IDF of
        transform : function
            A function to apply to each document term before checking for
            the presence of term

        Returns
        -------
        float
            The IDF (float('inf') if no document contains the term)

        Examples
        --------
        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n\n'
        >>> tqbf += 'And then it slept.\n\n And the dog ran off.'
        >>> corp = Corpus(tqbf)
        >>> print(corp.docs())
        [[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
        'dog.']],
        [['And', 'then', 'it', 'slept.']],
        [['And', 'the', 'dog', 'ran', 'off.']]]
        >>> round(corp.idf('dog'), 10)
        1.0986122887
        >>> round(corp.idf('the'), 10)
        0.4054651081

        .. versionadded:: 0.1.0
        """
        docs_with_term = 0
        docs = self.docs_of_words()
        for doc in docs:
            # Deduplicate before transforming so each word is mapped once.
            doc_set = set(doc)
            if transform:
                doc_set = {transform(word) for word in doc_set}
            if term in doc_set:
                docs_with_term += 1

        if docs_with_term == 0:
            # Term appears in no document; IDF is undefined (infinite).
            return float('inf')
        return log(len(docs) / docs_with_term)
if __name__ == '__main__':
    # When run as a script, execute the embedded doctests; whitespace is
    # normalized so the wrapped example outputs above compare equal.
    import doctest

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)