# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._generalized_fleiss.
Generalized Fleiss correlation
"""
from ._token_distance import _TokenDistance
from ..stats._mean import (
aghmean,
agmean,
amean,
cmean,
ghmean,
gmean,
heronian_mean,
hmean,
hoelder_mean,
imean,
lehmer_mean,
lmean,
qmean,
seiffert_mean,
)
# Public API of this module: only the measure class is exported by
# ``from ... import *``; the helper functions and ``means`` table are not.
__all__ = ['GeneralizedFleiss']
def _agmean_prec6(l):
    """Call :py:func:`agmean` on ``l`` with ``prec`` fixed at 6.

    This wrapper gives the arithmetic-geometric mean the same one-argument
    call shape as the other entries of the ``means`` table.

    Parameters
    ----------
    l : list
        The values to pass to :py:func:`agmean`

    Returns
    -------
    The value returned by ``agmean(l, prec=6)``

    """
    precision = 6
    return agmean(l, prec=precision)
def _ghmean_prec6(l):
    """Call :py:func:`ghmean` on ``l`` with ``prec`` fixed at 6.

    This wrapper gives the geometric-harmonic mean the same one-argument
    call shape as the other entries of the ``means`` table.

    Parameters
    ----------
    l : list
        The values to pass to :py:func:`ghmean`

    Returns
    -------
    The value returned by ``ghmean(l, prec=6)``

    """
    precision = 6
    return ghmean(l, prec=precision)
def _aghmean_prec6(l):
    """Call :py:func:`aghmean` on ``l`` with ``prec`` fixed at 6.

    This wrapper gives the arithmetic-geometric-harmonic mean the same
    one-argument call shape as the other entries of the ``means`` table.

    Parameters
    ----------
    l : list
        The values to pass to :py:func:`aghmean`

    Returns
    -------
    The value returned by ``aghmean(l, prec=6)``

    """
    precision = 6
    return aghmean(l, prec=precision)
# Lookup table from the string names accepted as ``mean_func`` to the
# callable that is applied to the list of marginal products.
means = {
    # Pythagorean means
    'arithmetic': amean,
    'geometric': gmean,
    'harmonic': hmean,
    # Compound means, wrapped so that ``prec`` is fixed at 6
    'ag': _agmean_prec6,
    'gh': _ghmean_prec6,
    'agh': _aghmean_prec6,
    # Remaining means, called with only the list of values
    'contraharmonic': cmean,
    'identric': imean,
    'logarithmic': lmean,
    'quadratic': qmean,
    'heronian': heronian_mean,
    'hoelder': hoelder_mean,
    'lehmer': lehmer_mean,
    'seiffert': seiffert_mean,
}
class GeneralizedFleiss(_TokenDistance):
    r"""Generalized Fleiss correlation.

    For two sets X and Y and a population N, Generalized Fleiss correlation
    is based on observations from :cite:`Fleiss:1975`.

        .. math::

            corr_{GeneralizedFleiss}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {\mu_{products~of~marginals}}

    The mean function :math:`\mu` may be any of the mean functions in
    :py:mod:`abydos.stats`. The products of marginals may be one of the
    following:

        - ``a`` : :math:`|X| \cdot |N \setminus X|` &
          :math:`|Y| \cdot |N \setminus Y|`
        - ``b`` : :math:`|X| \cdot |Y|` &
          :math:`|N \setminus X| \cdot |N \setminus Y|`
        - ``c`` : :math:`|X| \cdot |N \setminus Y|` &
          :math:`|Y| \cdot |N \setminus X|`

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{GeneralizedFleiss} =
            \frac{ad-bc}{\mu_{products~of~marginals}}

    And the products of marginals are:

        - ``a`` : :math:`p_1q_1 = (a+b)(c+d)` & :math:`p_2q_2 = (a+c)(b+d)`
        - ``b`` : :math:`p_1p_2 = (a+b)(a+c)` & :math:`q_1q_2 = (c+d)(b+d)`
        - ``c`` : :math:`p_1q_2 = (a+b)(b+d)` & :math:`p_2q_1 = (a+c)(c+d)`

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet=None,
        tokenizer=None,
        intersection_type='crisp',
        mean_func='arithmetic',
        marginals='a',
        proportional=False,
        **kwargs
    ):
        """Initialize GeneralizedFleiss instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        mean_func : str or function
            Specifies the mean function to use. A function taking a list of
            numbers as its only required argument may be supplied, or one of
            the following strings will select the specified mean function from
            :py:mod:`abydos.stats`:

                - ``arithmetic`` employs :py:func:`amean`, and this measure
                  will be identical to :py:class:`MaxwellPilliner` with
                  otherwise default parameters
                - ``geometric`` employs :py:func:`gmean`, and this measure
                  will be identical to :py:class:`PearsonPhi` with otherwise
                  default parameters
                - ``harmonic`` employs :py:func:`hmean`, and this measure
                  will be identical to :py:class:`Fleiss` with otherwise
                  default parameters
                - ``ag`` employs the arithmetic-geometric mean
                  :py:func:`agmean`
                - ``gh`` employs the geometric-harmonic mean
                  :py:func:`ghmean`
                - ``agh`` employs the arithmetic-geometric-harmonic mean
                  :py:func:`aghmean`
                - ``contraharmonic`` employs the contraharmonic mean
                  :py:func:`cmean`
                - ``identric`` employs the identric mean :py:func:`imean`
                - ``logarithmic`` employs the logarithmic mean
                  :py:func:`lmean`
                - ``quadratic`` employs the quadratic mean :py:func:`qmean`
                - ``heronian`` employs the Heronian mean
                  :py:func:`heronian_mean`
                - ``hoelder`` employs the Hölder mean :py:func:`hoelder_mean`
                - ``lehmer`` employs the Lehmer mean :py:func:`lehmer_mean`
                - ``seiffert`` employs Seiffert's mean
                  :py:func:`seiffert_mean`
        marginals : str
            Specifies the pairs of marginals to multiply and calculate the
            resulting mean of. Any value other than ``b`` or ``c`` selects
            the ``a`` pairing. Can be:

                - ``a`` : :math:`p_1q_1 = (a+b)(c+d)` &
                  :math:`p_2q_2 = (a+c)(b+d)`
                - ``b`` : :math:`p_1p_2 = (a+b)(a+c)` &
                  :math:`q_1q_2 = (c+d)(b+d)`
                - ``c`` : :math:`p_1q_2 = (a+b)(b+d)` &
                  :math:`p_2q_1 = (a+c)(c+d)`
        proportional : bool
            If true, each of the values, :math:`a, b, c, d` and the marginals
            will be divided by the total :math:`a+b+c+d=n`.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        # The measure-specific options are stored before the base class is
        # initialized; the remaining arguments are handed on unchanged.
        self.mean_func = mean_func
        self.marginals = marginals
        self.proportional = proportional
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src, tar):
        """Return the Generalized Fleiss correlation of two strings.

        The result is ``(a*d - b*c)`` divided by the selected mean of the
        selected pair of marginal products. When ``a*d - b*c`` is zero,
        0.0 is returned without evaluating the mean.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Generalized Fleiss correlation

        Examples
        --------
        >>> cmp = GeneralizedFleiss()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.35921989956790845
        >>> cmp.corr('aluminum', 'Catalan')
        0.10803030303030303
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table, plus the population total.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()

        if self.proportional:
            # Rescale every cell by the population total.
            a, b, c, d = (cell / n for cell in (a, b, c, d))

        num = a * d - b * c
        if not num:
            return 0.0

        # Marginal totals, named as in the class documentation.
        p1, q1 = a + b, c + d
        p2, q2 = a + c, b + d

        if self.marginals == 'b':
            mps = [p1 * p2, q1 * q2]
        elif self.marginals == 'c':
            mps = [p1 * q2, p2 * q1]
        else:
            mps = [p1 * q1, p2 * q2]

        # A callable is used as given; anything else is treated as a key
        # into the module-level ``means`` table.
        mean = self.mean_func
        if not callable(mean):
            mean = means[mean]

        return num / mean(mps)

    def sim(self, src, tar):
        """Return the Generalized Fleiss similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` shifted and
        scaled as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Generalized Fleiss similarity

        Examples
        --------
        >>> cmp = GeneralizedFleiss()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6796099497839543
        >>> cmp.sim('aluminum', 'Catalan')
        0.5540151515151515
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
if __name__ == '__main__':
    # When executed as a script, run the doctest examples in this module.
    from doctest import testmod

    testmod()