# Source code for abydos.phonetic._fr

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._fr.

The phonetic._fr module implements phonetic algorithms intended for French,
including:

    - FONEM
    - an early version of Henry Code
"""

from __future__ import unicode_literals

from re import compile as re_compile
from unicodedata import normalize as unicode_normalize

from six import text_type

__all__ = ['fonem', 'henry_early']


# Letters retained by FONEM after normalization (A-Z plus the hyphen).
_FONEM_ALPHABET = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')

# FONEM rewrite rules, keyed by rule label. Each value is a
# (pattern, replacement) pair; the pattern is either a compiled regular
# expression (applied with ``sub``) or a plain string (applied with
# ``str.replace``). Built once at import time instead of on every call.
_FONEM_RULE_TABLE = {
    # Vowels & groups of vowels
    'V-1': (re_compile('E?AU'), 'O'),
    'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
    'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
    'V-6': (re_compile('E?AUL?D$'), 'O'),
    'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
    'V-8': (re_compile('EUX$'), 'EU'),
    'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
    'V-10': ('Y', 'I'),
    'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
    'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
    'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
    'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
    # Nasal vowels
    'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
    'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
    'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
    'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'IN'),
    'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
    'V-20': (
        re_compile(
            '(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
            + 'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'
        ),
        'IN',
    ),
    # Consonants and groups of consonants
    'C-1': ('BV', 'V'),
    'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
    'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
    'C-4': (re_compile('^C(?=[EIY])'), 'S'),
    'C-5': (re_compile('^C(?=[OUA])'), 'K'),
    'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
    'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
    'C-8': (re_compile('CC(?=[AOU])'), 'K'),
    'C-9': (re_compile('CC(?=[EIY])'), 'X'),
    'C-10': (re_compile('G(?=[EIY])'), 'J'),
    'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
    'C-12': (re_compile('GE(O|AU)'), 'JO'),
    'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
    'C-14': (re_compile('(?<![PCS])H'), ''),
    'C-15': ('JEA', 'JA'),
    'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
    'C-17': (re_compile('^MC'), 'MA#'),
    'C-18': ('PH', 'F'),
    'C-19': ('QU', 'K'),
    'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
    'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
    'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
    'C-23': ('SH', 'CH'),
    'C-24': (re_compile('TIA$'), 'SSIA'),
    'C-25': (re_compile('(?<=[AIOUY])W'), ''),
    'C-26': (re_compile('X[CSZ]'), 'X'),
    'C-27': (
        re_compile(
            '(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
            + 'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'
        ),
        'S',
    ),
    'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
    'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
    'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
    'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
    'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
    'C-28d': (re_compile('ILE$'), 'ILLE'),
    'C-29': (
        re_compile(
            '(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL'
            + 'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'
        ),
        # Only one of the two groups participates in a match; keep it.
        lambda m: (m.group(1) or '') + (m.group(2) or ''),
    ),
    'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
    'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
    # Rules to undo rule bleeding prevention in C-11, C-16, C-17
    'C-34': ('G#', 'GA'),
    'C-35': ('MA#', 'MAC'),
}

# Order in which the rules are applied. The doubled-letter rules (V-14 and
# the C-28 family) appear both at the start and near the end of the list.
_FONEM_RULE_ORDER = (
    'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
    'C-12',
    'C-8', 'C-9', 'C-10',
    'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
    'V-2,5', 'V-3,4', 'V-6',
    'V-1', 'C-14',
    'C-31,33', 'C-30,32',
    'C-11', 'V-15', 'V-17', 'V-18',
    'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
    'V-19', 'V-20',
    'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
    'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
    'C-25', 'C-26', 'C-27',
    'C-29',
    'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
    'C-34', 'C-35',
)

# The (pattern, replacement) pairs resolved in application order.
_FONEM_RULES = tuple(_FONEM_RULE_TABLE[label] for label in _FONEM_RULE_ORDER)


def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames
    in Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The word is upper-cased and NFKD-normalized, the ligatures Æ/Œ are
    expanded to AE/OE, everything other than A-Z and '-' is discarded, and
    the rewrite rules are then applied one after another in a fixed order.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str

    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'
    """
    # normalize, upper-case, and filter non-French letters
    word = unicode_normalize('NFKD', text_type(word.upper()))
    # NFKD leaves the ligatures intact (code points 198 and 338), so they
    # are expanded explicitly.
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(c for c in word if c in _FONEM_ALPHABET)

    for pattern, repl in _FONEM_RULES:
        if isinstance(pattern, text_type):
            # Plain-string rule: literal substring replacement.
            word = word.replace(pattern, repl)
        else:
            word = pattern.sub(repl, word)

    return word
def henry_early(word, max_length=3):
    """Calculate the early version of the Henry code for a word.

    The early version of Henry coding is given in :cite:`Legare:1972`. This is
    different from the later version defined in :cite:`Henry:1976`.

    The word is upper-cased, NFKD-normalized and reduced to the letters A-Z.
    An initial vowel may then be rewritten (rule Ib), the letters are
    translated one by one (rule II), the ending of the code is trimmed
    (rule IIe), and every vowel after the first character is dropped.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 3);
        -1 disables truncation
    :returns: the early Henry code
    :rtype: str

    >>> henry_early('Marchand')
    'MRC'
    >>> henry_early('Beaulieu')
    'BL'
    >>> henry_early('Beaumont')
    'BM'
    >>> henry_early('Legrand')
    'LGR'
    >>> henry_early('Pelletier')
    'PLT'
    """
    letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    vowels = frozenset('AEIOUY')
    consonants = frozenset('BCDFGHJKLMNPQRSTVWXZ')
    nasals = frozenset('MN')
    non_nasal_consonants = consonants - nasals
    non_liquid_consonants = consonants - frozenset('LR')
    back_context = frozenset('AOULR')
    front_context = frozenset('EIY')
    diphthongs = {
        'AI': 'E',
        'AY': 'E',
        'EI': 'E',
        'AU': 'O',
        'OI': 'O',
        'OU': 'O',
        'EU': 'U',
    }
    direct_map = {'W': 'V', 'X': 'S', 'Z': 'S'}
    # (prefix, number of following letters to skip), longest prefix first
    saint_prefixes = (('SAINTE', 5), ('SAINT', 4), ('STE', 2), ('ST', 1))

    normalized = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in normalized if c in letters)

    if not word:
        return ''

    # Rule Ia seems to be covered entirely in II
    # Rule Ib: rewrite of an initial vowel
    head = word[0]
    if head in vowels:
        second = word[1:2]
        third = word[2:3]
        tail = word[1:]
        # Ib1
        if (second in non_nasal_consonants and third in consonants) or (
            second in consonants and third not in consonants
        ):
            if head == 'Y':
                word = 'I' + tail
        # Ib2
        elif second in nasals and third in consonants:
            if head == 'E':
                word = 'A' + tail
            elif head in {'I', 'U', 'Y'}:
                word = 'E' + tail
        # Ib3
        elif word[:2] in diphthongs:
            word = diphthongs[word[:2]] + word[2:]
        # Ib4
        elif second in vowels and head == 'Y':
            word = 'I' + tail

    pieces = []
    pending = 0  # letters still to be skipped

    # Rule II: letter-by-letter translation
    for idx, letter in enumerate(word):
        if pending:
            pending -= 1
            continue

        following = word[idx + 1 : idx + 2]
        # At idx 0 this slice is word[-1:0], i.e. the empty string.
        preceding = word[idx - 1 : idx]

        if letter in vowels:
            pieces.append(letter)
            continue

        # IIc: doubled letter, keep one and skip the other
        if letter == following:
            pending = 1
            pieces.append(letter)
            continue

        if word[idx : idx + 2] in {'CQ', 'DT', 'SC'}:
            continue

        # IIb
        if letter in direct_map:
            pieces.append(direct_map[letter])
            continue

        if letter == 'C':
            if following in back_context:
                pieces.append('K')
            elif following in front_context:
                pieces.append('S')
            elif following == 'H':
                # CH + vowel stays C; CHR, CHL, etc. become K
                if word[idx + 2 : idx + 3] in vowels:
                    pieces.append('C')
                else:
                    pieces.append('K')
            else:
                pieces.append('C')
            continue

        if letter == 'G':
            if following in back_context:
                pieces.append('G')
            elif following in front_context:
                pieces.append('J')
            elif following == 'N':
                pieces.append('N')
            continue

        if letter == 'P':
            pieces.append('F' if following == 'H' else 'P')
            continue

        if letter == 'Q':
            if word[idx + 1 : idx + 3] in {'UE', 'UI', 'UY'}:
                pieces.append('G')
            else:  # QUA, QUO, etc.
                pieces.append('K')
            continue

        if letter == 'S':
            for prefix, to_skip in saint_prefixes:
                if word.startswith(prefix, idx):
                    pieces.append('X')
                    pending = to_skip
                    break
            else:
                if following not in consonants:
                    pieces.append('S')
            continue

        # IId: consonants that are dropped
        if (
            (letter == 'H' and preceding in consonants)
            or (
                letter in non_liquid_consonants
                and following in non_liquid_consonants
            )
            or (letter == 'L' and following in nasals)
            or (
                letter in nasals
                and preceding in vowels
                and following in consonants
            )
        ):
            continue

        # IIa
        pieces.append(letter)

    code = ''.join(pieces)

    # IIe1
    last = code[-1:]
    penultimate = code[-2:-1]
    if code[-4:] in {'AULT', 'EULT', 'OULT'}:
        code = code[:-2]
    elif penultimate == 'R' and last in consonants:
        code = code[:-1]
    # IIe2
    elif penultimate in vowels and last in {'D', 'M', 'N', 'S', 'T'}:
        code = code[:-1]
    elif code[-2:] == 'ER':
        code = code[:-1]

    # Drop non-initial vowels
    code = code[:1] + ''.join(ch for ch in code[1:] if ch not in vowels)

    if max_length != -1:
        code = code[:max_length]

    return code
if __name__ == '__main__':
    # Run the doctest examples embedded in this module's docstrings.
    from doctest import testmod

    testmod()