# -*- coding: utf-8 -*-
# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
r"""abydos.stats.
The stats module defines functions for calculating various statistical data
about linguistic objects.
This includes the ConfusionTable object, which includes members capable of
calculating the following data based on a confusion table:
- population counts
- precision, recall, specificity, negative predictive value, fall-out,
false discovery rate, accuracy, balanced accuracy, informedness,
and markedness
- various means of the precision & recall, including: arithmetic,
geometric, harmonic, quadratic, logarithmic, contraharmonic,
identric (exponential), & Hölder (power/generalized) means
- :math:`F_{\beta}`-scores, :math:`E`-scores, :math:`G`-measures, along
with special functions for :math:`F_{1}`, :math:`F_{0.5}`, &
:math:`F_{2}` scores
- significance & Matthews correlation coefficient calculation
Functions are provided for calculating the following means:
- arithmetic
- geometric
- harmonic
- quadratic
- contraharmonic
- logarithmic
- identric (exponential)
- Seiffert's
- Lehmer
- Heronian
- Hölder (power/generalized)
- Stolarsky
- arithmetic-geometric
- geometric-harmonic
- arithmetic-geometric-harmonic
And for calculating:
- midrange
- median
- mode
- variance
- standard deviation
"""
from __future__ import division, unicode_literals
import math
from collections import Counter
from six.moves import range
from .util import prod
# Names exported by ``from <this module> import *``: the ConfusionTable class
# followed by the module's stand-alone statistics functions.
__all__ = ['ConfusionTable', 'aghmean', 'agmean', 'amean', 'cmean', 'ghmean',
'gmean', 'heronian_mean', 'hmean', 'hoelder_mean', 'imean',
'lehmer_mean', 'lmean', 'median', 'midrange', 'mode', 'qmean',
'seiffert_mean', 'std', 'var']
class ConfusionTable(object):
    """ConfusionTable object.

    This object is initialized by passing either four integers (or a tuple,
    list, or dict of four integers) representing the squares of a confusion
    table: true positives, true negatives, false positives, and false
    negatives.

    The object possesses methods for the calculation of various statistics
    based on the confusion table. Statistics whose denominator would be zero
    return NaN rather than raising ZeroDivisionError.
    """

    # Class-level defaults; a dict passed to __init__ that lacks a key leaves
    # the corresponding count at 0.
    _tp, _tn, _fp, _fn = 0, 0, 0, 0

    def __init__(self, tp=0, tn=0, fp=0, fn=0):
        """Initialize ConfusionTable.

        :param int tp: true positives (or a tuple, list, or dict); If a tuple
            or list is supplied, it must include 4 values in the order [tp,
            tn, fp, fn]. If a dict is supplied, it must have 4 keys, namely
            'tp', 'tn', 'fp', & 'fn'.
        :param int tn: true negatives
        :param int fp: false positives
        :param int fn: false negatives
        :raises AttributeError: if a tuple or list of length other than 4 is
            supplied

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct == ConfusionTable((120, 60, 20, 30))
        True
        >>> ct == ConfusionTable([120, 60, 20, 30])
        True
        >>> ct == ConfusionTable({'tp': 120, 'tn': 60, 'fp': 20, 'fn': 30})
        True
        """
        if isinstance(tp, (tuple, list)):
            if len(tp) != 4:
                raise AttributeError('ConfusionTable requires a 4-tuple ' +
                                     'when being created from a tuple.')
            self._tp, self._tn, self._fp, self._fn = tp
        elif isinstance(tp, dict):
            # Keys that are absent keep the class-level default of 0.
            if 'tp' in tp:
                self._tp = tp['tp']
            if 'tn' in tp:
                self._tn = tp['tn']
            if 'fp' in tp:
                self._fp = tp['fp']
            if 'fn' in tp:
                self._fn = tp['fn']
        else:
            self._tp = tp
            self._tn = tn
            self._fp = fp
            self._fn = fn

    def __eq__(self, other):
        """Perform equality (==) comparison.

        Compares a ConfusionTable to another ConfusionTable or its equivalent
        in the form of a tuple, list, or dict. A tuple or list that does not
        hold exactly four values, or a dict lacking any of the four keys,
        compares unequal instead of raising.

        :param other: the object to compare against
        :returns: True if two ConfusionTables are the same object or all four
            of their attributes are equal
        :rtype: bool

        >>> ct1 = ConfusionTable(120, 60, 20, 30)
        >>> ct2 = ConfusionTable(120, 60, 20, 30)
        >>> ct3 = ConfusionTable(60, 30, 10, 15)
        >>> ct1 == ct2
        True
        >>> ct1 == ct3
        False
        >>> ct1 != ct2
        False
        >>> ct1 != ct3
        True
        >>> ct1 == (120, 60, 20)
        False
        """
        if isinstance(other, ConfusionTable):
            return self is other or self.to_tuple() == other.to_tuple()
        if isinstance(other, (tuple, list)):
            return len(other) == 4 and self.to_tuple() == tuple(other)
        if isinstance(other, dict):
            try:
                return (self._tp == other['tp'] and
                        self._tn == other['tn'] and
                        self._fp == other['fp'] and
                        self._fn == other['fn'])
            except KeyError:
                return False
        return False

    def __ne__(self, other):
        """Perform inequality (!=) comparison.

        Defined explicitly because Python 2 does not derive ``!=`` from
        ``__eq__``.

        :param other: the object to compare against
        :returns: the negation of ``self == other``
        :rtype: bool
        """
        return not self == other

    def __str__(self):
        """Cast to str.

        :returns: a human-readable version of the confusion table
        :rtype: str

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> str(ct)
        'tp:120, tn:60, fp:20, fn:30'
        """
        return ('tp:' + str(self._tp) + ', tn:' + str(self._tn) + ', fp:' +
                str(self._fp) + ', fn:' + str(self._fn))

    def to_tuple(self):
        """Cast to tuple.

        :returns: the confusion table as a 4-tuple (tp, tn, fp, fn)
        :rtype: tuple

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.to_tuple()
        (120, 60, 20, 30)
        """
        return self._tp, self._tn, self._fp, self._fn

    def to_dict(self):
        """Cast to dict.

        :returns: the confusion table as a dict
        :rtype: dict

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> import pprint
        >>> pprint.pprint(ct.to_dict())
        {'fn': 30, 'fp': 20, 'tn': 60, 'tp': 120}
        """
        return {'tp': self._tp, 'tn': self._tn,
                'fp': self._fp, 'fn': self._fn}

    def true_pos(self):
        """Return true positives.

        :returns: the true positives of the confusion table
        :rtype: int

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.true_pos()
        120
        """
        return self._tp

    def true_neg(self):
        """Return true negatives.

        :returns: the true negatives of the confusion table
        :rtype: int

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.true_neg()
        60
        """
        return self._tn

    def false_pos(self):
        """Return false positives.

        :returns: the false positives of the confusion table
        :rtype: int

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.false_pos()
        20
        """
        return self._fp

    def false_neg(self):
        """Return false negatives.

        :returns: the false negatives of the confusion table
        :rtype: int

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.false_neg()
        30
        """
        return self._fn

    def correct_pop(self):
        """Return correct population.

        :returns: the correct population of the confusion table (tp + tn)
        :rtype: int

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.correct_pop()
        180
        """
        return self._tp + self._tn

    def error_pop(self):
        """Return error population.

        :returns: The error population of the confusion table (fp + fn)
        :rtype: int

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.error_pop()
        50
        """
        return self._fp + self._fn

    def test_pos_pop(self):
        """Return test positive population.

        :returns: The test positive population of the confusion table
            (tp + fp)
        :rtype: int

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.test_pos_pop()
        140
        """
        return self._tp + self._fp

    def test_neg_pop(self):
        """Return test negative population.

        :returns: The test negative population of the confusion table
            (tn + fn)
        :rtype: int

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.test_neg_pop()
        90
        """
        return self._tn + self._fn

    def cond_pos_pop(self):
        """Return condition positive population.

        :returns: The condition positive population of the confusion table
            (tp + fn)
        :rtype: int

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.cond_pos_pop()
        150
        """
        return self._tp + self._fn

    def cond_neg_pop(self):
        """Return condition negative population.

        :returns: The condition negative population of the confusion table
            (fp + tn)
        :rtype: int

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.cond_neg_pop()
        80
        """
        return self._fp + self._tn

    def population(self):
        """Return population, N.

        :returns: The population (N) of the confusion table
        :rtype: int

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.population()
        230
        """
        return self._tp + self._tn + self._fp + self._fn

    def precision(self):
        r"""Return precision.

        Precision is defined as :math:`\frac{tp}{tp + fp}`

        AKA positive predictive value (PPV)

        Cf. https://en.wikipedia.org/wiki/Precision_and_recall

        Cf. https://en.wikipedia.org/wiki/Information_retrieval#Precision

        :returns: The precision of the confusion table (NaN if tp + fp is 0)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.precision()
        0.8571428571428571
        """
        if self._tp + self._fp == 0:
            return float('NaN')
        return self._tp / (self._tp + self._fp)

    def precision_gain(self):
        r"""Return gain in precision.

        The gain in precision is defined as:
        :math:`G(precision) = \frac{precision}{random~ precision}`

        Cf. https://en.wikipedia.org/wiki/Gain_(information_retrieval)

        :returns: The gain in precision of the confusion table (NaN if the
            population or the condition positive population is 0)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.precision_gain()
        1.3142857142857143
        """
        if self.population() == 0:
            return float('NaN')
        random_precision = self.cond_pos_pop()/self.population()
        # With no condition positives the ratio is undefined (0/0 at best);
        # report NaN instead of raising ZeroDivisionError.
        if random_precision == 0:
            return float('NaN')
        return self.precision()/random_precision

    def recall(self):
        r"""Return recall.

        Recall is defined as :math:`\frac{tp}{tp + fn}`

        AKA sensitivity

        AKA true positive rate (TPR)

        Cf. https://en.wikipedia.org/wiki/Precision_and_recall

        Cf. https://en.wikipedia.org/wiki/Sensitivity_(test)

        Cf. https://en.wikipedia.org/wiki/Information_retrieval#Recall

        :returns: The recall of the confusion table (NaN if tp + fn is 0)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.recall()
        0.8
        """
        if self._tp + self._fn == 0:
            return float('NaN')
        return self._tp / (self._tp + self._fn)

    def specificity(self):
        r"""Return specificity.

        Specificity is defined as :math:`\frac{tn}{tn + fp}`

        AKA true negative rate (TNR)

        Cf. https://en.wikipedia.org/wiki/Specificity_(tests)

        :returns: The specificity of the confusion table (NaN if tn + fp
            is 0)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.specificity()
        0.75
        """
        if self._tn + self._fp == 0:
            return float('NaN')
        return self._tn / (self._tn + self._fp)

    def npv(self):
        r"""Return negative predictive value (NPV).

        NPV is defined as :math:`\frac{tn}{tn + fn}`

        Cf. https://en.wikipedia.org/wiki/Negative_predictive_value

        :returns: The negative predictive value of the confusion table (NaN
            if tn + fn is 0)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.npv()
        0.6666666666666666
        """
        if self._tn + self._fn == 0:
            return float('NaN')
        return self._tn / (self._tn + self._fn)

    def fallout(self):
        r"""Return fall-out.

        Fall-out is defined as :math:`\frac{fp}{fp + tn}`

        AKA false positive rate (FPR)

        Cf. https://en.wikipedia.org/wiki/Information_retrieval#Fall-out

        :returns: The fall-out of the confusion table (NaN if fp + tn is 0)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.fallout()
        0.25
        """
        if self._fp + self._tn == 0:
            return float('NaN')
        return self._fp / (self._fp + self._tn)

    def fdr(self):
        r"""Return false discovery rate (FDR).

        False discovery rate is defined as :math:`\frac{fp}{fp + tp}`

        Cf. https://en.wikipedia.org/wiki/False_discovery_rate

        :returns: The false discovery rate of the confusion table (NaN if
            fp + tp is 0)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.fdr()
        0.14285714285714285
        """
        if self._fp + self._tp == 0:
            return float('NaN')
        return self._fp / (self._fp + self._tp)

    def accuracy(self):
        r"""Return accuracy.

        Accuracy is defined as :math:`\frac{tp + tn}{population}`

        Cf. https://en.wikipedia.org/wiki/Accuracy

        :returns: The accuracy of the confusion table (NaN if the population
            is 0)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.accuracy()
        0.782608695652174
        """
        if self.population() == 0:
            return float('NaN')
        return (self._tp + self._tn) / self.population()

    def accuracy_gain(self):
        r"""Return gain in accuracy.

        The gain in accuracy is defined as:
        :math:`G(accuracy) = \frac{accuracy}{random~ accuracy}`

        Cf. https://en.wikipedia.org/wiki/Gain_(information_retrieval)

        :returns: The gain in accuracy of the confusion table (NaN if the
            population is 0)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.accuracy_gain()
        1.4325259515570934
        """
        if self.population() == 0:
            return float('NaN')
        random_accuracy = ((self.cond_pos_pop()/self.population())**2 +
                           (self.cond_neg_pop()/self.population())**2)
        return self.accuracy()/random_accuracy

    def balanced_accuracy(self):
        r"""Return balanced accuracy.

        Balanced accuracy is defined as
        :math:`\frac{sensitivity + specificity}{2}`

        Cf. https://en.wikipedia.org/wiki/Accuracy

        :returns: The balanced accuracy of the confusion table
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.balanced_accuracy()
        0.775
        """
        return 0.5 * (self.recall() + self.specificity())

    def informedness(self):
        """Return informedness.

        Informedness is defined as :math:`sensitivity + specificity - 1`

        AKA Youden's J statistic

        Cf. https://en.wikipedia.org/wiki/Youden%27s_J_statistic

        :returns: The informedness of the confusion table
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> round(ct.informedness(), 12)
        0.55
        """
        return self.recall() + self.specificity() - 1

    def markedness(self):
        """Return markedness.

        Markedness is defined as :math:`precision + npv - 1`

        AKA DeltaP

        Cf. https://en.wikipedia.org/wiki/Youden%27s_J_statistic

        Cf.
        http://dspace.flinders.edu.au/xmlui/bitstream/handle/2328/27165/Powers%20Evaluation.pdf

        :returns: The markedness of the confusion table
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.markedness()
        0.5238095238095237
        """
        return self.precision() + self.npv() - 1

    def pr_amean(self):
        r"""Return arithmetic mean of precision & recall.

        The arithmetic mean of precision and recall is defined as:
        :math:`\frac{precision + recall}{2}`

        Cf. https://en.wikipedia.org/wiki/Arithmetic_mean

        :returns: The arithmetic mean of the confusion table's precision &
            recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_amean()
        0.8285714285714285
        """
        return amean((self.precision(), self.recall()))

    def pr_gmean(self):
        r"""Return geometric mean of precision & recall.

        The geometric mean of precision and recall is defined as:
        :math:`\sqrt{precision \cdot recall}`

        Cf. https://en.wikipedia.org/wiki/Geometric_mean

        :returns: The geometric mean of the confusion table's precision &
            recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_gmean()
        0.828078671210825
        """
        return gmean((self.precision(), self.recall()))

    def pr_hmean(self):
        r"""Return harmonic mean of precision & recall.

        The harmonic mean of precision and recall is defined as:
        :math:`\frac{2 \cdot precision \cdot recall}{precision + recall}`

        Cf. https://en.wikipedia.org/wiki/Harmonic_mean

        :returns: The harmonic mean of the confusion table's precision &
            recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_hmean()
        0.8275862068965516
        """
        return hmean((self.precision(), self.recall()))

    def pr_qmean(self):
        r"""Return quadratic mean of precision & recall.

        The quadratic mean of precision and recall is defined as:
        :math:`\sqrt{\frac{precision^{2} + recall^{2}}{2}}`

        Cf. https://en.wikipedia.org/wiki/Quadratic_mean

        :returns: The quadratic mean of the confusion table's precision &
            recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_qmean()
        0.8290638930598233
        """
        return qmean((self.precision(), self.recall()))

    def pr_cmean(self):
        r"""Return contraharmonic mean of precision & recall.

        The contraharmonic mean is:
        :math:`\frac{precision^{2} + recall^{2}}{precision + recall}`

        Cf. https://en.wikipedia.org/wiki/Contraharmonic_mean

        :returns: The contraharmonic mean of the confusion table's precision
            & recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_cmean()
        0.8295566502463055
        """
        return cmean((self.precision(), self.recall()))

    def pr_lmean(self):
        r"""Return logarithmic mean of precision & recall.

        The logarithmic mean is:
        0 if either precision or recall is 0,
        the precision if they are equal,
        otherwise :math:`\frac{precision - recall}
        {ln(precision) - ln(recall)}`

        Cf. https://en.wikipedia.org/wiki/Logarithmic_mean

        :returns: The logarithmic mean of the confusion table's precision &
            recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_lmean()
        0.8282429171492667
        """
        precision = self.precision()
        recall = self.recall()
        if not precision or not recall:
            return 0.0
        if precision == recall:
            return precision
        return ((precision - recall) /
                (math.log(precision) - math.log(recall)))

    def pr_imean(self):
        r"""Return identric (exponential) mean of precision & recall.

        The identric mean is:
        precision if precision = recall,
        otherwise :math:`\frac{1}{e} \cdot
        \sqrt[precision - recall]{\frac{precision^{precision}}
        {recall^{recall}}}`

        Cf. https://en.wikipedia.org/wiki/Identric_mean

        :returns: The identric mean of the confusion table's precision &
            recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_imean()
        0.8284071826325543
        """
        return imean((self.precision(), self.recall()))

    def pr_seiffert_mean(self):
        r"""Return Seiffert's mean of precision & recall.

        Seiffert's mean of precision and recall is:
        :math:`\frac{precision - recall}{4 \cdot arctan
        \sqrt{\frac{precision}{recall}} - \pi}`

        Cf. http://www.helsinki.fi/~hasto/pp/miaPreprint.pdf

        :returns: Seiffert's mean of the confusion table's precision & recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_seiffert_mean()
        0.8284071696048312
        """
        return seiffert_mean((self.precision(), self.recall()))

    def pr_lehmer_mean(self, exp=2):
        r"""Return Lehmer mean of precision & recall.

        The Lehmer mean is:
        :math:`\frac{precision^{exp} + recall^{exp}}
        {precision^{exp-1} + recall^{exp-1}}`

        Cf. https://en.wikipedia.org/wiki/Lehmer_mean

        :param numeric exp: The exponent of the Lehmer mean
        :returns: The Lehmer mean for the given exponent of the confusion
            table's precision & recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_lehmer_mean()
        0.8295566502463055
        """
        return lehmer_mean((self.precision(), self.recall()), exp)

    def pr_heronian_mean(self):
        r"""Return Heronian mean of precision & recall.

        The Heronian mean of precision and recall is defined as:
        :math:`\frac{precision + \sqrt{precision \cdot recall} + recall}{3}`

        Cf. https://en.wikipedia.org/wiki/Heronian_mean

        :returns: The Heronian mean of the confusion table's precision &
            recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_heronian_mean()
        0.8284071761178939
        """
        return heronian_mean((self.precision(), self.recall()))

    def pr_hoelder_mean(self, exp=2):
        r"""Return Hölder (power/generalized) mean of precision & recall.

        The power mean of precision and recall is defined as:
        :math:`\sqrt[exp]{\frac{precision^{exp} + recall^{exp}}{2}}`
        for :math:`exp \ne 0`, and the geometric mean for :math:`exp = 0`

        Cf. https://en.wikipedia.org/wiki/Generalized_mean

        :param numeric exp: The exponent of the Hölder mean
        :returns: The Hölder mean for the given exponent of the confusion
            table's precision & recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_hoelder_mean()
        0.8290638930598233
        """
        return hoelder_mean((self.precision(), self.recall()), exp)

    def pr_agmean(self):
        """Return arithmetic-geometric mean of precision & recall.

        Iterates between arithmetic & geometric means until they converge to
        a single value (rounded to 12 digits)

        Cf. https://en.wikipedia.org/wiki/Arithmetic-geometric_mean

        :returns: The arithmetic-geometric mean of the confusion table's
            precision & recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_agmean()
        0.8283250315702829
        """
        return agmean((self.precision(), self.recall()))

    def pr_ghmean(self):
        """Return geometric-harmonic mean of precision & recall.

        Iterates between geometric & harmonic means until they converge to
        a single value (rounded to 12 digits)

        Cf. https://en.wikipedia.org/wiki/Geometric-harmonic_mean

        :returns: The geometric-harmonic mean of the confusion table's
            precision & recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_ghmean()
        0.8278323841238441
        """
        return ghmean((self.precision(), self.recall()))

    def pr_aghmean(self):
        """Return arithmetic-geometric-harmonic mean of precision & recall.

        Iterates over arithmetic, geometric, & harmonic means until they
        converge to a single value (rounded to 12 digits), following the
        method described by Raïssouli, Leazizi, & Chergui:
        http://www.emis.de/journals/JIPAM/images/014_08_JIPAM/014_08.pdf

        :returns: The arithmetic-geometric-harmonic mean of the confusion
            table's precision & recall
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.pr_aghmean()
        0.8280786712108288
        """
        return aghmean((self.precision(), self.recall()))

    def fbeta_score(self, beta=1):
        r"""Return :math:`F_{\beta}` score.

        :math:`F_{\beta}` for a positive real value :math:`\beta` "measures
        the effectiveness of retrieval with respect to a user who
        attaches :math:`\beta` times as much importance to recall as
        precision" (van Rijsbergen 1979)

        :math:`F_{\beta}` score is defined as:
        :math:`(1 + \beta^2) \cdot \frac{precision \cdot recall}
        {((\beta^2 \cdot precision) + recall)}`

        Cf. https://en.wikipedia.org/wiki/F1_score

        :param numeric beta: The :math:`\beta` parameter in the above formula
        :returns: The :math:`F_{\beta}` of the confusion table; 0.0 when
            precision and recall are both 0
        :rtype: float
        :raises AttributeError: if beta is not positive

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.fbeta_score()
        0.8275862068965518
        >>> ct.fbeta_score(beta=0.1)
        0.8565371024734982
        """
        if beta <= 0:
            raise AttributeError('Beta must be a positive real value.')
        precision = self.precision()
        recall = self.recall()
        denominator = (beta**2 * precision) + recall
        # precision == recall == 0: the score is conventionally 0, which is
        # also what f1_score() yields for the same table.
        if denominator == 0:
            return 0.0
        return (1 + beta**2) * precision * recall / denominator

    def f2_score(self):
        """Return :math:`F_{2}`.

        The :math:`F_{2}` score emphasizes recall over precision in comparison
        to the :math:`F_{1}` score

        Cf. https://en.wikipedia.org/wiki/F1_score

        :returns: The :math:`F_{2}` of the confusion table
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.f2_score()
        0.8108108108108109
        """
        return self.fbeta_score(2)

    def fhalf_score(self):
        """Return :math:`F_{0.5}` score.

        The :math:`F_{0.5}` score emphasizes precision over recall in
        comparison to the :math:`F_{1}` score

        Cf. https://en.wikipedia.org/wiki/F1_score

        :returns: The :math:`F_{0.5}` score of the confusion table
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.fhalf_score()
        0.8450704225352114
        """
        return self.fbeta_score(0.5)

    def e_score(self, beta=1):
        r"""Return :math:`E`-score.

        This is Van Rijsbergen's effectiveness measure, computed as
        :math:`1 - F_{\beta}`

        Cf. https://en.wikipedia.org/wiki/Information_retrieval#F-measure

        :param numeric beta: The :math:`\beta` parameter passed on to
            fbeta_score
        :returns: The :math:`E`-score of the confusion table
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.e_score()
        0.17241379310344818
        """
        return 1-self.fbeta_score(beta)

    def f1_score(self):
        r"""Return :math:`F_{1}` score.

        :math:`F_{1}` score is the harmonic mean of precision and recall:
        :math:`2 \cdot \frac{precision \cdot recall}{precision + recall}`

        Cf. https://en.wikipedia.org/wiki/F1_score

        :returns: The :math:`F_{1}` of the confusion table
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.f1_score()
        0.8275862068965516
        """
        return self.pr_hmean()

    def f_measure(self):
        r"""Return :math:`F`-measure.

        :math:`F`-measure is the harmonic mean of precision and recall:
        :math:`2 \cdot \frac{precision \cdot recall}{precision + recall}`

        Cf. https://en.wikipedia.org/wiki/F1_score

        :returns: The :math:`F`-measure of the confusion table
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.f_measure()
        0.8275862068965516
        """
        return self.pr_hmean()

    def g_measure(self):
        r"""Return G-measure.

        :math:`G`-measure is the geometric mean of precision and recall:
        :math:`\sqrt{precision \cdot recall}`

        This is identical to the Fowlkes–Mallows (FM) index for two
        clusters.

        Cf. https://en.wikipedia.org/wiki/F1_score#G-measure

        Cf. https://en.wikipedia.org/wiki/Fowlkes%E2%80%93Mallows_index

        :returns: The :math:`G`-measure of the confusion table
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.g_measure()
        0.828078671210825
        """
        return self.pr_gmean()

    def mcc(self):
        r"""Return Matthews correlation coefficient (MCC).

        The Matthews correlation coefficient is defined as:
        :math:`\frac{(tp \cdot tn) - (fp \cdot fn)}
        {\sqrt{(tp + fp)(tp + fn)(tn + fp)(tn + fn)}}`

        This is equivalent to the geometric mean of informedness and
        markedness.

        Cf. https://en.wikipedia.org/wiki/Matthews_correlation_coefficient

        :returns: The Matthews correlation coefficient of the confusion table
            (NaN if any marginal total is 0)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.mcc()
        0.5367450401216932
        """
        marginal_product = ((self._tp + self._fp) * (self._tp + self._fn) *
                            (self._tn + self._fp) * (self._tn + self._fn))
        if marginal_product == 0:
            return float('NaN')
        return (((self._tp * self._tn) - (self._fp * self._fn)) /
                math.sqrt(marginal_product))

    def significance(self):
        r"""Return the significance, :math:`\chi^{2}`.

        Significance is defined as:
        :math:`\chi^{2} =
        \frac{(tp \cdot tn - fp \cdot fn)^{2} (tp + tn + fp + fn)}
        {((tp + fp)(tp + fn)(tn + fp)(tn + fn)}`

        Also: :math:`\chi^{2} = MCC^{2} \cdot n`

        Cf. https://en.wikipedia.org/wiki/Pearson%27s_chi-square_test

        :returns: The significance of the confusion table (NaN if any
            marginal total is 0)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.significance()
        66.26190476190476
        """
        marginal_product = ((self._tp + self._fp) * (self._tp + self._fn) *
                            (self._tn + self._fp) * (self._tn + self._fn))
        if marginal_product == 0:
            return float('NaN')
        return (((self._tp * self._tn - self._fp * self._fn)**2 *
                 (self._tp + self._tn + self._fp + self._fn)) /
                marginal_product)

    def kappa_statistic(self):
        r"""Return κ statistic.

        The κ statistic is defined as:
        :math:`\kappa = \frac{accuracy - random~ accuracy}
        {1 - random~ accuracy}`

        The κ statistic compares the performance of the classifier relative to
        the performance of a random classifier. κ = 0 indicates performance
        identical to random. κ = 1 indicates perfect predictive success.
        κ = -1 indicates perfect predictive failure.

        :returns: The κ statistic of the confusion table (NaN if the
            population is 0 or the random accuracy is 1)
        :rtype: float

        >>> ct = ConfusionTable(120, 60, 20, 30)
        >>> ct.kappa_statistic()
        0.5344129554655871
        """
        if self.population() == 0:
            return float('NaN')
        random_accuracy = (((self._tn + self._fp) *
                            (self._tn + self._fn) +
                            (self._fn + self._tp) *
                            (self._fp + self._tp)) /
                           self.population()**2)
        # A table confined to a single cell class (e.g. only true positives)
        # gives random accuracy 1, leaving kappa undefined.
        if random_accuracy == 1:
            return float('NaN')
        return (self.accuracy()-random_accuracy) / (1-random_accuracy)
def amean(nums):
    r"""Return arithmetic mean.

    The arithmetic mean is the sum of the values divided by their count:
    :math:`\frac{\sum{nums}}{|nums|}`

    Cf. https://en.wikipedia.org/wiki/Arithmetic_mean

    :param list nums: A series of numbers
    :returns: The arithmetic mean of nums
    :rtype: float

    >>> amean([1, 2, 3, 4])
    2.5
    >>> amean([1, 2])
    1.5
    >>> amean([0, 5, 1000])
    335.0
    """
    total = sum(nums)
    count = len(nums)
    return total / count
def gmean(nums):
    r"""Return geometric mean.

    The geometric mean is the |nums|-th root of the product of the values:
    :math:`\sqrt[|nums|]{\prod\limits_{i} nums_{i}}`

    Cf. https://en.wikipedia.org/wiki/Geometric_mean

    :param list nums: A series of numbers
    :returns: The geometric mean of nums
    :rtype: float

    >>> gmean([1, 2, 3, 4])
    2.213363839400643
    >>> gmean([1, 2])
    1.4142135623730951
    >>> gmean([0, 5, 1000])
    0.0
    """
    product = prod(nums)
    root_exponent = 1 / len(nums)
    return product ** root_exponent
def hmean(nums):
    r"""Return harmonic mean.

    The harmonic mean is defined as:
    :math:`\frac{|nums|}{\sum\limits_{i}\frac{1}{nums_i}}`

    Special cases, checked in this order:

    - A single value, or a series whose values are all equal, is returned
      as-is (so a series of all zeros yields 0).
    - If exactly one of the values in nums is 0, return 0.
    - If more than one (but not every) value in nums is 0, return NaN.

    Cf. https://en.wikipedia.org/wiki/Harmonic_mean

    :param list nums: A series of numbers
    :returns: The harmonic mean of nums
    :rtype: float
    :raises AttributeError: if nums is empty

    >>> hmean([1, 2, 3, 4])
    1.9200000000000004
    >>> hmean([1, 2])
    1.3333333333333333
    >>> hmean([0, 5, 1000])
    0
    """
    count = len(nums)
    if count < 1:
        raise AttributeError('hmean requires at least one value')

    # A constant series (including a lone value) is its own mean.
    first = nums[0]
    if all(first == nums[idx] for idx in range(1, count)):
        return first

    zeros = nums.count(0)
    if zeros > 1:
        return float('nan')
    if zeros == 1:
        return 0

    reciprocal_sum = sum(1/value for value in nums)
    return count/reciprocal_sum
def qmean(nums):
    r"""Return quadratic mean.

    The quadratic mean (root mean square) of a series is defined as:
    :math:`\sqrt{\sum\limits_{i} \frac{num_i^2}{|nums|}}`

    Cf. https://en.wikipedia.org/wiki/Quadratic_mean

    :param list nums: A series of numbers
    :returns: The quadratic mean of nums
    :rtype: float

    >>> qmean([1, 2, 3, 4])
    2.7386127875258306
    >>> qmean([1, 2])
    1.5811388300841898
    >>> qmean([0, 5, 1000])
    577.3574860228857
    """
    square_sum = sum(value**2 for value in nums)
    mean_square = square_sum / len(nums)
    return mean_square ** 0.5
def cmean(nums):
    r"""Return contraharmonic mean.

    The contraharmonic mean is the sum of squares divided by the sum:
    :math:`\frac{\sum\limits_i x_i^2}{\sum\limits_i x_i}`

    Cf. https://en.wikipedia.org/wiki/Contraharmonic_mean

    :param list nums: A series of numbers
    :returns: The contraharmonic mean of nums
    :rtype: float

    >>> cmean([1, 2, 3, 4])
    3.0
    >>> cmean([1, 2])
    1.6666666666666667
    >>> cmean([0, 5, 1000])
    995.0497512437811
    """
    square_sum = sum(value**2 for value in nums)
    plain_sum = sum(nums)
    return square_sum / plain_sum
def lmean(nums):
    r"""Return logarithmic mean.

    The logarithmic mean of an arbitrarily long series is defined by
    http://www.survo.fi/papers/logmean.pdf
    as:
    :math:`L(x_1, x_2, ..., x_n) =
    (n-1)! \sum\limits_{i=1}^n \frac{x_i}
    {\prod\limits_{\substack{j = 1\\j \ne i}}^n
    ln \frac{x_i}{x_j}}`

    Cf. https://en.wikipedia.org/wiki/Logarithmic_mean

    :param list nums: A series of numbers, no two of which may be equal
    :returns: The logarithmic mean of nums
    :rtype: float
    :raises AttributeError: if nums contains duplicate values

    >>> lmean([1, 2, 3, 4])
    2.2724242417489258
    >>> lmean([1, 2])
    1.4426950408889634
    """
    count = len(nums)
    if count != len(set(nums)):
        raise AttributeError('No two values in the nums list may be equal.')

    total = 0
    for outer_idx, numerator in enumerate(nums):
        # Product of ln(x_i / x_j) over every j other than i.
        log_product = 1
        for inner_idx, denominator in enumerate(nums):
            if inner_idx == outer_idx:
                continue
            log_product *= math.log(numerator/denominator)
        total += numerator/log_product
    return math.factorial(count-1) * total
def imean(nums):
    r"""Return identric (exponential) mean.

    The identric mean of two numbers x and y is:
    x if x = y
    otherwise :math:`\frac{1}{e} \sqrt[x-y]{\frac{x^x}{y^y}}`

    A single value is returned unchanged; if either of two values is not
    positive, the result is NaN.

    Cf. https://en.wikipedia.org/wiki/Identric_mean

    :param list nums: A series of numbers (at most two)
    :returns: The identric mean of nums
    :rtype: float
    :raises AttributeError: if more than two values are supplied

    >>> imean([1, 2])
    1.4715177646857693
    >>> imean([1, 0])
    nan
    >>> imean([2, 4])
    2.9430355293715387
    """
    count = len(nums)
    if count == 1:
        return nums[0]
    if count > 2:
        raise AttributeError('imean supports no more than two values')

    first, second = nums[0], nums[1]
    if first <= 0 or second <= 0:
        return float('NaN')
    if first == second:
        return first

    power_ratio = first**first/second**second
    return (1/math.e) * power_ratio**(1/(first-second))
def seiffert_mean(nums):
    r"""Return Seiffert's mean.

    Seiffert's mean of two numbers x and y is:
    :math:`\frac{x - y}{4 \cdot arctan \sqrt{\frac{x}{y}} - \pi}`

    It is computed here in the equivalent form
    :math:`\frac{x - y}{2 \cdot arcsin \frac{x - y}{x + y}}`.
    A single value is returned unchanged; NaN is returned when the two
    values are equal or sum to zero.

    Cf. http://www.helsinki.fi/~hasto/pp/miaPreprint.pdf

    :param list nums: A series of numbers (at most two)
    :returns: Seiffert's mean of nums
    :rtype: float
    :raises AttributeError: if more than two values are supplied

    >>> seiffert_mean([1, 2])
    1.4712939827611637
    >>> seiffert_mean([1, 0])
    0.3183098861837907
    >>> seiffert_mean([2, 4])
    2.9425879655223275
    >>> seiffert_mean([2, 1000])
    336.84053300118825
    """
    count = len(nums)
    if count == 1:
        return nums[0]
    if count > 2:
        raise AttributeError('seiffert_mean supports no more than two values')

    total = nums[0]+nums[1]
    difference = nums[0]-nums[1]
    if total == 0 or difference == 0:
        return float('NaN')
    return difference/(2*math.asin(difference/total))
def lehmer_mean(nums, exp=2):
    r"""Return Lehmer mean.

    The Lehmer mean is:
    :math:`\frac{\sum\limits_i{x_i^p}}{\sum\limits_i{x_i^{p-1}}}`

    Cf. https://en.wikipedia.org/wiki/Lehmer_mean

    :param list nums: A series of numbers
    :param numeric exp: The exponent of the Lehmer mean
    :returns: The Lehmer mean of nums for the given exponent
    :rtype: float

    >>> lehmer_mean([1, 2, 3, 4])
    3.0
    >>> lehmer_mean([1, 2])
    1.6666666666666667
    >>> lehmer_mean([0, 5, 1000])
    995.0497512437811
    """
    numerator = sum(value**exp for value in nums)
    denominator = sum(value**(exp-1) for value in nums)
    return numerator/denominator
def heronian_mean(nums):
    r"""Return Heronian mean.

    The Heronian mean is:
    :math:`\frac{\sum\limits_{i, j}\sqrt{{x_i \cdot x_j}}}
    {|nums| \cdot \frac{|nums| + 1}{2}}`
    for :math:`j \ge i`

    Cf. https://en.wikipedia.org/wiki/Heronian_mean

    :param list nums: A series of numbers
    :returns: The Heronian mean of nums
    :rtype: float

    >>> heronian_mean([1, 2, 3, 4])
    2.3888282852609093
    >>> heronian_mean([1, 2])
    1.4714045207910316
    >>> heronian_mean([0, 5, 1000])
    179.28511301977582
    """
    count = len(nums)
    total = 0
    for start, left in enumerate(nums):
        for right in nums[start:]:
            # sqrt(x * x) is x itself; adding it directly avoids a needless
            # root for equal pairs.
            total += left if left == right else (left*right)**0.5
    return total * 2 / (count*(count+1))
def hoelder_mean(nums, exp=2):
    r"""Return Hölder (power/generalized) mean.

    The Hölder mean is defined as:
    :math:`\sqrt[p]{\frac{1}{|nums|} \cdot \sum\limits_i{x_i^p}}`
    for :math:`p \ne 0`, and the geometric mean for :math:`p = 0`

    Cf. https://en.wikipedia.org/wiki/Generalized_mean

    :param list nums: A series of numbers
    :param numeric exp: The exponent p of the Hölder mean (2 by default)
    :returns: The Hölder mean of nums for the given exponent
    :rtype: float

    >>> hoelder_mean([1, 2, 3, 4])
    2.7386127875258306
    >>> hoelder_mean([1, 2])
    1.5811388300841898
    >>> hoelder_mean([0, 5, 1000])
    577.3574860228857
    """
    if exp != 0:
        weight = 1/len(nums)
        power_sum = sum(value**exp for value in nums)
        return (weight * power_sum)**(1/exp)
    # A zero exponent is delegated to the geometric mean.
    return gmean(nums)


def agmean(nums):
    """Return arithmetic-geometric mean.

    Starting from the arithmetic and geometric means of the series, both
    are repeatedly replaced by their own arithmetic and geometric means
    until the two values agree when rounded to 12 digits.

    Cf. https://en.wikipedia.org/wiki/Arithmetic-geometric_mean

    :param list nums: A series of numbers
    :returns: The arithmetic-geometric mean of nums (NaN if either starting
        mean is NaN)
    :rtype: float

    >>> agmean([1, 2, 3, 4])
    2.3545004777751077
    >>> agmean([1, 2])
    1.4567910310469068
    >>> agmean([0, 5, 1000])
    2.9753977059954195e-13
    """
    arith = amean(nums)
    geom = gmean(nums)
    if math.isnan(arith) or math.isnan(geom):
        return float('nan')

    while True:
        # Converged once both means match to 12 decimal places.
        if round(arith, 12) == round(geom, 12):
            return arith
        arith, geom = (arith+geom)/2, (arith*geom)**(1/2)


def ghmean(nums):
    """Return geometric-harmonic mean.

    Starting from the geometric and harmonic means of the series, both are
    repeatedly replaced by their own geometric and harmonic means until the
    two values agree when rounded to 12 digits.

    Cf. https://en.wikipedia.org/wiki/Geometric-harmonic_mean

    :param list nums: A series of numbers
    :returns: The geometric-harmonic mean of nums (NaN if either starting
        mean is NaN)
    :rtype: float

    >>> ghmean([1, 2, 3, 4])
    2.058868154613003
    >>> ghmean([1, 2])
    1.3728805006183502
    >>> ghmean([0, 5, 1000])
    0.0

    >>> ghmean([0, 0])
    0.0
    >>> ghmean([0, 0, 5])
    nan
    """
    geom = gmean(nums)
    harm = hmean(nums)
    if math.isnan(geom) or math.isnan(harm):
        return float('nan')

    while True:
        # Converged once both means match to 12 decimal places.
        if round(harm, 12) == round(geom, 12):
            return geom
        geom, harm = (geom*harm)**(1/2), (2*geom*harm)/(geom+harm)


def aghmean(nums):
    """Return arithmetic-geometric-harmonic mean.

    Iterates over arithmetic, geometric, & harmonic means, following the
    method described by Raïssouli, Leazizi, & Chergui:
    http://www.emis.de/journals/JIPAM/images/014_08_JIPAM/014_08.pdf

    Iteration stops as soon as the arithmetic and geometric means, or the
    geometric and harmonic means, agree when rounded to 12 digits.

    :param list nums: A series of numbers
    :returns: The arithmetic-geometric-harmonic mean of nums (NaN if any
        starting mean is NaN)
    :rtype: float

    >>> aghmean([1, 2, 3, 4])
    2.198327159900212
    >>> aghmean([1, 2])
    1.4142135623731884
    >>> aghmean([0, 5, 1000])
    335.0
    """
    arith = amean(nums)
    geom = gmean(nums)
    harm = hmean(nums)
    if math.isnan(arith) or math.isnan(geom) or math.isnan(harm):
        return float('nan')

    while True:
        geom_rounded = round(geom, 12)
        # Either adjacent pair agreeing to 12 digits ends the iteration.
        if (round(arith, 12) == geom_rounded or
                geom_rounded == round(harm, 12)):
            return arith
        arith, geom, harm = ((arith+geom+harm)/3,
                             (arith*geom*harm)**(1/3),
                             3/(1/arith+1/geom+1/harm))


def midrange(nums):
    """Return midrange.

    The midrange is the arithmetic mean of the maximum & minimum of a series.

    Cf. https://en.wikipedia.org/wiki/Midrange

    :param list nums: A series of numbers
    :returns: The midrange of nums
    :rtype: float

    >>> midrange([1, 2, 3])
    2.0
    >>> midrange([1, 2, 2, 3])
    2.0
    >>> midrange([1, 2, 1000, 3])
    500.5
    """
    largest = max(nums)
    smallest = min(nums)
    return 0.5*(largest+smallest)


def mode(nums):
    """Return the mode.

    The mode of a series is the most common element of that series.

    Cf. https://en.wikipedia.org/wiki/Mode_(statistics)

    :param list nums: A series of numbers
    :returns: The mode of nums (the most frequent element, as stored in nums)

    >>> mode([1, 2, 2, 3])
    2
    """
    tally = Counter(nums)
    # most_common(1) gives a one-item list holding an (element, count) pair.
    leaders = tally.most_common(1)
    return leaders[0][0]


def var(nums, mean_func=amean, ddof=0):
    """Calculate the variance.

    The sum of squared deviations from the mean is divided by
    ``len(nums) - ddof``.

    :param list nums: A series of numbers
    :param function mean_func: A mean function (amean by default)
    :param int ddof: The delta degrees of freedom, subtracted from the number
        of values to form the divisor (0 by default)
    :returns: The variance of the values in the series
    :rtype: float

    >>> var([1, 1, 1, 1])
    0.0
    >>> var([1, 2, 3, 4])
    1.25
    >>> round(var([1, 2, 3, 4], ddof=1), 12)
    1.666666666667
    """
    center = mean_func(nums)
    squared_deviations = ((value - center) ** 2 for value in nums)
    return sum(squared_deviations) / (len(nums) - ddof)


def std(nums, mean_func=amean, ddof=0):
    """Return the standard deviation.

    The standard deviation is the square root of the variance computed by
    ``var`` with the same arguments.

    :param list nums: A series of numbers
    :param function mean_func: A mean function (amean by default)
    :param int ddof: The delta degrees of freedom, passed through to ``var``
        (0 by default)
    :returns: The standard deviation of the values in the series
    :rtype: float

    >>> std([1, 1, 1, 1])
    0.0
    >>> round(std([1, 2, 3, 4]), 12)
    1.11803398875
    >>> round(std([1, 2, 3, 4], ddof=1), 12)
    1.290994448736
    """
    variance = var(nums, mean_func, ddof)
    return variance**0.5


if __name__ == '__main__':
    # Executed as a script: run the examples embedded in the docstrings.
    import doctest

    doctest.testmod()