# Source code for abydos.phonetic._eudex

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._eudex.

The phonetic._eudex module implements the Eudex phonetic hash.
"""

from __future__ import unicode_literals

from six.moves import range

__all__ = ['eudex']


# Code assigned to each supported character when it appears anywhere after
# the first position of the (filtered) word. Built once at import time rather
# than on every call to eudex().
_EUDEX_TRAILING_PHONES = {
    'a': 0,  # a
    'b': 0b01001000,  # b
    'c': 0b00001100,  # c
    'd': 0b00011000,  # d
    'e': 0,  # e
    'f': 0b01000100,  # f
    'g': 0b00001000,  # g
    'h': 0b00000100,  # h
    'i': 1,  # i
    'j': 0b00000101,  # j
    'k': 0b00001001,  # k
    'l': 0b10100000,  # l
    'm': 0b00000010,  # m
    'n': 0b00010010,  # n
    'o': 0,  # o
    'p': 0b01001001,  # p
    'q': 0b10101000,  # q
    'r': 0b10100001,  # r
    's': 0b00010100,  # s
    't': 0b00011101,  # t
    'u': 1,  # u
    'v': 0b01000101,  # v
    'w': 0b00000000,  # w
    'x': 0b10000100,  # x
    'y': 1,  # y
    'z': 0b10010100,  # z
    'ß': 0b00010101,  # ß
    'à': 0,  # à
    'á': 0,  # á
    'â': 0,  # â
    'ã': 0,  # ã
    'ä': 0,  # ä[æ]
    'å': 1,  # å[oː]
    'æ': 0,  # æ[æ]
    'ç': 0b10010101,  # ç[t͡ʃ]
    'è': 1,  # è
    'é': 1,  # é
    'ê': 1,  # ê
    'ë': 1,  # ë
    'ì': 1,  # ì
    'í': 1,  # í
    'î': 1,  # î
    'ï': 1,  # ï
    'ð': 0b00010101,  # ð[ð̠](represented as a non-plosive T)
    'ñ': 0b00010111,  # ñ[nj](represented as a combination of n and j)
    'ò': 0,  # ò
    'ó': 0,  # ó
    'ô': 0,  # ô
    'õ': 0,  # õ
    'ö': 1,  # ö[ø]
    '÷': 0b11111111,  # ÷
    'ø': 1,  # ø[ø]
    'ù': 1,  # ù
    'ú': 1,  # ú
    'û': 1,  # û
    'ü': 1,  # ü
    'ý': 1,  # ý
    'þ': 0b00010101,  # þ[ð̠](represented as a non-plosive T)
    'ÿ': 1,  # ÿ
}

# Code assigned to each supported character when it is the first character of
# the (filtered) word. Its key set also defines which characters are kept
# when the input is filtered.
_EUDEX_INITIAL_PHONES = {
    'a': 0b10000100,  # a*
    'b': 0b00100100,  # b
    'c': 0b00000110,  # c
    'd': 0b00001100,  # d
    'e': 0b11011000,  # e*
    'f': 0b00100010,  # f
    'g': 0b00000100,  # g
    'h': 0b00000010,  # h
    'i': 0b11111000,  # i*
    'j': 0b00000011,  # j
    'k': 0b00000101,  # k
    'l': 0b01010000,  # l
    'm': 0b00000001,  # m
    'n': 0b00001001,  # n
    'o': 0b10010100,  # o*
    'p': 0b00100101,  # p
    'q': 0b01010100,  # q
    'r': 0b01010001,  # r
    's': 0b00001010,  # s
    't': 0b00001110,  # t
    'u': 0b11100000,  # u*
    'v': 0b00100011,  # v
    'w': 0b00000000,  # w
    'x': 0b01000010,  # x
    'y': 0b11100100,  # y*
    'z': 0b01001010,  # z
    'ß': 0b00001011,  # ß
    'à': 0b10000101,  # à
    'á': 0b10000101,  # á
    'â': 0b10000000,  # â
    'ã': 0b10000110,  # ã
    'ä': 0b10100110,  # ä [æ]
    'å': 0b11000010,  # å [oː]
    'æ': 0b10100111,  # æ [æ]
    'ç': 0b01010100,  # ç [t͡ʃ]
    'è': 0b11011001,  # è
    'é': 0b11011001,  # é
    'ê': 0b11011001,  # ê
    'ë': 0b11000110,  # ë [ə] or [œ]
    'ì': 0b11111001,  # ì
    'í': 0b11111001,  # í
    'î': 0b11111001,  # î
    'ï': 0b11111001,  # ï
    'ð': 0b00001011,  # ð [ð̠] (represented as a non-plosive T)
    'ñ': 0b00001011,  # ñ [nj] (represented as a combination of n and j)
    'ò': 0b10010101,  # ò
    'ó': 0b10010101,  # ó
    'ô': 0b10010101,  # ô
    'õ': 0b10010101,  # õ
    'ö': 0b11011100,  # ö [œ] or [ø]
    '÷': 0b11111111,  # ÷
    'ø': 0b11011101,  # ø [œ] or [ø]
    'ù': 0b11100001,  # ù
    'ú': 0b11100001,  # ú
    'û': 0b11100001,  # û
    'ü': 0b11100101,  # ü
    'ý': 0b11100101,  # ý
    'þ': 0b00001011,  # þ [ð̠] (represented as a non-plosive T)
    'ÿ': 0b11100101,  # ÿ
}


def eudex(word, max_length=8):
    """Return the eudex phonetic hash of a word.

    This implementation of eudex phonetic hashing is based on the specification
    (not the reference implementation) at :cite:`Ticki:2016`.

    Further details can be found at :cite:`Ticki:2016b`.

    The word is lowercased and characters without an entry in the phone tables
    are dropped; if nothing remains, the word is treated as the single
    character ``÷``. The first remaining character is coded with the initial
    table and every following character with the trailing table. A code is
    skipped when it equals the code immediately before it once the lowest bit
    of both is ignored. The surviving codes are packed into an integer, 8 bits
    per code, with the first code in the most significant position and
    zero-valued codes inserted directly after it when fewer than
    ``max_length`` codes are available.

    :param str word: the word to transform
    :param int max_length: the maximum number of 8-bit character codes packed
        into the hash, i.e. its length in bytes rather than bits (default 8);
        values below 1 are treated as 1, because the code of the first
        character is always included
    :returns: the eudex hash
    :rtype: int

    >>> eudex('Colin')
    432345564238053650
    >>> eudex('Christopher')
    433648490138894409
    >>> eudex('Niall')
    648518346341351840
    >>> eudex('Smith')
    720575940412906756
    >>> eudex('Schmidt')
    720589151732307997
    """
    # The first code is always emitted, so 0 and 1 already behave alike;
    # clamping keeps a negative value from acting as a from-the-end slice
    # bound further down.
    max_length = max(1, max_length)

    # Lowercase input & filter unknown characters
    word = ''.join(
        char for char in word.lower() if char in _EUDEX_INITIAL_PHONES
    )

    # Nothing usable left: hash the placeholder character instead
    if not word:
        word = '÷'

    # Perform initial eudex coding of each character
    values = [_EUDEX_INITIAL_PHONES[word[0]]]
    values += [_EUDEX_TRAILING_PHONES[char] for char in word[1:]]

    # Skip a code when it matches its immediate predecessor with the lowest
    # bit of both discarded (the comparison is against the preceding code
    # whether or not that one was itself kept).
    condensed_values = [values[0]]
    condensed_values += [
        cur for prev, cur in zip(values, values[1:]) if prev >> 1 != cur >> 1
    ]

    # Add padding after first character & trim beyond max_length
    values = (
        [condensed_values[0]]
        + [0] * max(0, max_length - len(condensed_values))
        + condensed_values[1:max_length]
    )

    # Combine individual character values into eudex hash
    hash_value = 0
    for val in values:
        hash_value = (hash_value << 8) | val

    return hash_value


if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod

    testmod()