# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.phonetic._nrl.

NRL English-to-phoneme algorithm
"""
from re import match as re_match
from deprecation import deprecated
from ._phonetic import _Phonetic
from .. import __version__
__all__ = ['NRL', 'nrl']
class NRL(_Phonetic):
    """Naval Research Laboratory English-to-phoneme encoder.

    This is defined by :cite:`Elovitz:1976`.

    .. versionadded:: 0.3.6
    """

    # Letter-to-sound rules, keyed by the first character of the text to be
    # matched. Each rule is a (left context, match, right context, output)
    # tuple; the rules for a key are tried in order and the first one that
    # applies wins. The ' ' key holds the rules used for any character that
    # is not itself a key of this table.
    _rules = {
        ' ': (
            ('', ' ', '', ' '),
            ('', '-', '', ''),
            ('.', "'S", '', 'z'),
            ('#:.E', "'S", '', 'z'),
            ('#', "'S", '', 'z'),
            ('', "'", '', ''),
            ('', ',', '', ' '),
            ('', '.', '', ' '),
            ('', '?', '', ' '),
            ('', '!', '', ' '),
        ),
        'A': (
            ('', 'A', ' ', 'AX'),
            (' ', 'ARE', ' ', 'AAr'),
            (' ', 'AR', 'O', 'AXr'),
            ('', 'AR', '#', 'EHr'),
            ('^', 'AS', '#', 'EYs'),
            ('', 'A', 'WA', 'AX'),
            ('', 'AW', '', 'AO'),
            (' :', 'ANY', '', 'EHnIY'),
            ('', 'A', '^+#', 'EY'),
            ('#:', 'ALLY', '', 'AXlIY'),
            (' ', 'AL', '#', 'AXl'),
            ('', 'AGAIN', '', 'AXgEHn'),
            ('#:', 'AG', 'E', 'IHj'),
            ('', 'A', '^+:#', 'AE'),
            (' :', 'A', '^+ ', 'EY'),
            ('', 'A', '^%', 'EY'),
            (' ', 'ARR', '', 'AXr'),
            ('', 'ARR', '', 'AEr'),
            (' :', 'AR', ' ', 'AAr'),
            ('', 'AR', ' ', 'ER'),
            ('', 'AR', '', 'AAr'),
            ('', 'AIR', '', 'EHr'),
            ('', 'AI', '', 'EY'),
            ('', 'AY', '', 'EY'),
            ('', 'AU', '', 'AO'),
            ('#:', 'AL', ' ', 'AXl'),
            ('#:', 'ALS', ' ', 'AXlz'),
            ('', 'ALK', '', 'AOk'),
            ('', 'AL', '^', 'AOl'),
            (' :', 'ABLE', '', 'EYbAXl'),
            ('', 'ABLE', '', 'AXbAXl'),
            ('', 'ANG', '+', 'EYnj'),
            ('', 'A', '', 'AE'),
        ),
        'B': (
            (' ', 'BE', '^#', 'bIH'),
            ('', 'BEING', '', 'bIYIHNG'),
            (' ', 'BOTH', ' ', 'bOWTH'),
            (' ', 'BUS', '#', 'bIHz'),
            ('', 'BUIL', '', 'bIHl'),
            ('', 'B', '', 'b'),
        ),
        'C': (
            (' ', 'CH', '^', 'k'),
            ('^E', 'CH', '', 'k'),
            ('', 'CH', '', 'CH'),
            (' S', 'CI', '#', 'sAY'),
            ('', 'CI', 'A', 'SH'),
            ('', 'CI', 'O', 'SH'),
            ('', 'CI', 'EN', 'SH'),
            ('', 'C', '+', 's'),
            ('', 'CK', '', 'k'),
            ('', 'COM', '%', 'kAHm'),
            ('', 'C', '', 'k'),
        ),
        'D': (
            ('#:', 'DED', ' ', 'dIHd'),
            ('.E', 'D', ' ', 'd'),
            ('#:^E', 'D', ' ', 't'),
            (' ', 'DE', '^#', 'dIH'),
            (' ', 'DO', ' ', 'dUW'),
            (' ', 'DOES', '', 'dAHz'),
            (' ', 'DOING', '', 'dUWIHNG'),
            (' ', 'DOW', '', 'dAW'),
            ('', 'DU', 'A', 'jUW'),
            ('', 'D', '', 'd'),
        ),
        'E': (
            ('#:', 'E', ' ', ''),
            ("':^", 'E', ' ', ''),
            (' :', 'E', ' ', 'IY'),
            ('#', 'ED', ' ', 'd'),
            ('#:', 'E', 'D ', ''),
            ('', 'EV', 'ER', 'EHv'),
            ('', 'E', '^%', 'IY'),
            ('', 'ERI', '#', 'IYrIY'),
            ('', 'ERI', '', 'EHrIH'),
            ('#:', 'ER', '#', 'ER'),
            ('', 'ER', '#', 'EHr'),
            ('', 'ER', '', 'ER'),
            (' ', 'EVEN', '', 'IYvEHn'),
            ('#:', 'E', 'W', ''),
            ('T', 'EW', '', 'UW'),
            ('S', 'EW', '', 'UW'),
            ('R', 'EW', '', 'UW'),
            ('D', 'EW', '', 'UW'),
            ('L', 'EW', '', 'UW'),
            ('Z', 'EW', '', 'UW'),
            ('N', 'EW', '', 'UW'),
            ('J', 'EW', '', 'UW'),
            ('TH', 'EW', '', 'UW'),
            ('CH', 'EW', '', 'UW'),
            ('SH', 'EW', '', 'UW'),
            ('', 'EW', '', 'yUW'),
            ('', 'E', 'O', 'IY'),
            ('#:S', 'ES', ' ', 'IHz'),
            ('#:C', 'ES', ' ', 'IHz'),
            ('#:G', 'ES', ' ', 'IHz'),
            ('#:Z', 'ES', ' ', 'IHz'),
            ('#:X', 'ES', ' ', 'IHz'),
            ('#:J', 'ES', ' ', 'IHz'),
            ('#:CH', 'ES', ' ', 'IHz'),
            ('#:SH', 'ES', ' ', 'IHz'),
            ('#:', 'E', 'S ', ''),
            ('#:', 'ELY', ' ', 'lIY'),
            ('#:', 'EMENT', '', 'mEHnt'),
            ('', 'EFUL', '', 'fUHl'),
            ('', 'EE', '', 'IY'),
            ('', 'EARN', '', 'ERn'),
            (' ', 'EAR', '^', 'ER'),
            ('', 'EAD', '', 'EHd'),
            ('#:', 'EA', ' ', 'IYAX'),
            ('', 'EA', 'SU', 'EH'),
            ('', 'EA', '', 'IY'),
            ('', 'EIGH', '', 'EY'),
            ('', 'EI', '', 'IY'),
            (' ', 'EYE', '', 'AY'),
            ('', 'EY', '', 'IY'),
            ('', 'EU', '', 'yUW'),
            ('', 'E', '', 'EH'),
        ),
        'F': (
            ('', 'FUL', '', 'fUHl'),
            ('', 'F', '', 'f'),
        ),
        'G': (
            ('', 'GIV', '', 'gIHv'),
            (' ', 'G', 'I^', 'g'),
            ('', 'GE', 'T', 'gEH'),
            ('SU', 'GGES', '', 'gjEHs'),
            ('', 'GG', '', 'g'),
            (' B#', 'G', '', 'g'),
            ('', 'G', '+', 'j'),
            ('', 'GREAT', '', 'grEYt'),
            ('#', 'GH', '', ''),
            ('', 'G', '', 'g'),
        ),
        'H': (
            (' ', 'HAV', '', 'hAEv'),
            (' ', 'HERE', '', 'hIYr'),
            (' ', 'HOUR', '', 'AWER'),
            ('', 'HOW', '', 'hAW'),
            ('', 'H', '#', 'h'),
            ('', 'H', '', ''),
        ),
        'I': (
            (' ', 'IN', '', 'IHn'),
            (' ', 'I', ' ', 'AY'),
            ('', 'IN', 'D', 'AYn'),
            ('', 'IER', '', 'IYER'),
            ('#:R', 'IED', '', 'IYd'),
            ('', 'IED', ' ', 'AYd'),
            ('', 'IEN', '', 'IYEHn'),
            ('', 'IE', 'T', 'AYEH'),
            (' :', 'I', '%', 'AY'),
            ('', 'I', '%', 'IY'),
            ('', 'IE', '', 'IY'),
            ('', 'I', '^+:#', 'IH'),
            ('', 'IR', '#', 'AYr'),
            ('', 'IZ', '%', 'AYz'),
            ('', 'IS', '%', 'AYz'),
            ('', 'I', 'D%', 'AY'),
            ('+^', 'I', '^+', 'IH'),
            ('', 'I', 'T%', 'AY'),
            ('#:^', 'I', '^+', 'IH'),
            ('', 'I', '^+', 'AY'),
            ('', 'IR', '', 'ER'),
            ('', 'IGH', '', 'AY'),
            ('', 'ILD', '', 'AYld'),
            ('', 'IGN', ' ', 'AYn'),
            ('', 'IGN', '^', 'AYn'),
            ('', 'IGN', '%', 'AYn'),
            ('', 'IQUE', '', 'IYk'),
            ('', 'I', '', 'IH'),
        ),
        'J': (('', 'J', '', 'j'),),
        'K': (
            (' ', 'K', 'N', ''),
            ('', 'K', '', 'k'),
        ),
        'L': (
            ('', 'LO', 'C#', 'lOW'),
            ('L', 'L', '', ''),
            ('#:^', 'L', '%', 'AXl'),
            ('', 'LEAD', '', 'lIYd'),
            ('', 'L', '', 'l'),
        ),
        'M': (
            ('', 'MOV', '', 'mUWv'),
            ('', 'M', '', 'm'),
        ),
        'N': (
            ('E', 'NG', '+', 'nj'),
            ('', 'NG', 'R', 'NGg'),
            ('', 'NG', '#', 'NGg'),
            ('', 'NGL', '%', 'NGgAXl'),
            ('', 'NG', '', 'NG'),
            ('', 'NK', '', 'NGk'),
            (' ', 'NOW', ' ', 'nAW'),
            ('', 'N', '', 'n'),
        ),
        'O': (
            ('', 'OF', ' ', 'AXv'),
            ('', 'OROUGH', '', 'EROW'),
            ('#:', 'OR', ' ', 'ER'),
            ('#:', 'ORS', ' ', 'ERz'),
            ('', 'OR', '', 'AOr'),
            (' ', 'ONE', '', 'wAHn'),
            ('', 'OW', '', 'OW'),
            (' ', 'OVER', '', 'OWvER'),
            ('', 'OV', '', 'AHv'),
            ('', 'O', '^%', 'OW'),
            ('', 'O', '^EN', 'OW'),
            ('', 'O', '^I#', 'OW'),
            ('', 'OL', 'D', 'OWl'),
            ('', 'OUGHT', '', 'AOt'),
            ('', 'OUGH', '', 'AHf'),
            (' ', 'OU', '', 'AW'),
            ('H', 'OU', 'S#', 'AW'),
            ('', 'OUS', '', 'AXs'),
            ('', 'OUR', '', 'AOr'),
            ('', 'OULD', '', 'UHd'),
            ('^', 'OU', '^L', 'AH'),
            ('', 'OUP', '', 'UWp'),
            ('', 'OU', '', 'AW'),
            ('', 'OY', '', 'OY'),
            ('', 'OING', '', 'OWIHNG'),
            ('', 'OI', '', 'OY'),
            ('', 'OOR', '', 'AOr'),
            ('', 'OOK', '', 'UHk'),
            ('', 'OOD', '', 'UHd'),
            ('', 'OO', '', 'UW'),
            ('', 'O', 'E', 'OW'),
            ('', 'O', ' ', 'OW'),
            ('', 'OA', '', 'OW'),
            (' ', 'ONLY', '', 'OWnlIY'),
            (' ', 'ONCE', '', 'wAHns'),
            ('', "ON'T", '', 'OWnt'),
            ('C', 'O', 'N', 'AA'),
            ('', 'O', 'NG', 'AO'),
            (' :^', 'O', 'N', 'AH'),
            ('I', 'ON', '', 'AXn'),
            ('#:', 'ON', ' ', 'AXn'),
            ('#^', 'ON', '', 'AXn'),
            ('', 'O', 'ST ', 'OW'),
            ('', 'OF', '^', 'AOf'),
            ('', 'OTHER', '', 'AHDHER'),
            ('', 'OSS', ' ', 'AOs'),
            ('#:^', 'OM', '', 'AHm'),
            ('', 'O', '', 'AA'),
        ),
        'P': (
            ('', 'PH', '', 'f'),
            ('', 'PEOP', '', 'pIYp'),
            ('', 'POW', '', 'pAW'),
            ('', 'PUT', ' ', 'pUHt'),
            ('', 'P', '', 'p'),
        ),
        'Q': (
            ('', 'QUAR', '', 'kwAOr'),
            ('', 'QU', '', 'kw'),
            ('', 'Q', '', 'k'),
        ),
        'R': (
            (' ', 'RE', '^#', 'rIY'),
            ('', 'R', '', 'r'),
        ),
        'S': (
            ('', 'SH', '', 'SH'),
            ('#', 'SION', '', 'ZHAXn'),
            ('', 'SOME', '', 'sAHm'),
            ('#', 'SUR', '#', 'ZHER'),
            ('', 'SUR', '#', 'SHER'),
            ('#', 'SU', '#', 'ZHUW'),
            ('#', 'SSU', '#', 'SHUW'),
            ('#', 'SED', ' ', 'zd'),
            ('#', 'S', '#', 'z'),
            ('', 'SAID', '', 'sEHd'),
            ('^', 'SION', '', 'SHAXn'),
            ('', 'S', 'S', ''),
            ('.', 'S', ' ', 'z'),
            ('#:.E', 'S', ' ', 'z'),
            ('#:^##', 'S', ' ', 'z'),
            ('#:^#', 'S', ' ', 's'),
            ('U', 'S', ' ', 's'),
            (' :#', 'S', ' ', 'z'),
            (' ', 'SCH', '', 'sk'),
            ('', 'S', 'C+', ''),
            ('#', 'SM', '', 'zm'),
            ('#', 'SN', "'", 'zAXn'),
            ('', 'S', '', 's'),
        ),
        'T': (
            (' ', 'THE', ' ', 'DHAX'),
            ('', 'TO', ' ', 'tUW'),
            ('', 'THAT', ' ', 'DHAEt'),
            (' ', 'THIS', ' ', 'DHIHs'),
            (' ', 'THEY', '', 'DHEY'),
            (' ', 'THERE', '', 'DHEHr'),
            ('', 'THER', '', 'DHER'),
            ('', 'THEIR', '', 'DHEHr'),
            (' ', 'THAN', ' ', 'DHAEn'),
            (' ', 'THEM', ' ', 'DHEHm'),
            ('', 'THESE', ' ', 'DHIYz'),
            (' ', 'THEN', '', 'DHEHn'),
            ('', 'THROUGH', '', 'THrUW'),
            ('', 'THOSE', '', 'DHOWz'),
            ('', 'THOUGH', ' ', 'DHOW'),
            (' ', 'THUS', '', 'DHAHs'),
            ('', 'TH', '', 'TH'),
            ('#:', 'TED', ' ', 'tIHd'),
            ('S', 'TI', '#N', 'CH'),
            ('', 'TI', 'O', 'SH'),
            ('', 'TI', 'A', 'SH'),
            ('', 'TIEN', '', 'SHAXn'),
            ('', 'TUR', '#', 'CHER'),
            ('', 'TU', 'A', 'CHUW'),
            (' ', 'TWO', '', 'tUW'),
            ('', 'T', '', 't'),
        ),
        'U': (
            (' ', 'UN', 'I', 'yUWn'),
            (' ', 'UN', '', 'AHn'),
            (' ', 'UPON', '', 'AXpAOn'),
            ('T', 'UR', '#', 'UHr'),
            ('S', 'UR', '#', 'UHr'),
            ('R', 'UR', '#', 'UHr'),
            ('D', 'UR', '#', 'UHr'),
            ('L', 'UR', '#', 'UHr'),
            ('Z', 'UR', '#', 'UHr'),
            ('N', 'UR', '#', 'UHr'),
            ('J', 'UR', '#', 'UHr'),
            ('TH', 'UR', '#', 'UHr'),
            ('CH', 'UR', '#', 'UHr'),
            ('SH', 'UR', '#', 'UHr'),
            ('', 'UR', '#', 'yUHr'),
            ('', 'UR', '', 'ER'),
            ('', 'U', '^ ', 'AH'),
            ('', 'U', '^^', 'AH'),
            ('', 'UY', '', 'AY'),
            (' G', 'U', '#', ''),
            ('G', 'U', '%', ''),
            ('G', 'U', '#', 'w'),
            ('#N', 'U', '', 'yUW'),
            ('T', 'U', '', 'UW'),
            ('S', 'U', '', 'UW'),
            ('R', 'U', '', 'UW'),
            ('D', 'U', '', 'UW'),
            ('L', 'U', '', 'UW'),
            ('Z', 'U', '', 'UW'),
            ('N', 'U', '', 'UW'),
            ('J', 'U', '', 'UW'),
            ('TH', 'U', '', 'UW'),
            ('CH', 'U', '', 'UW'),
            ('SH', 'U', '', 'UW'),
            ('', 'U', '', 'yUW'),
        ),
        'V': (
            ('', 'VIEW', '', 'vyUW'),
            ('', 'V', '', 'v'),
        ),
        'W': (
            (' ', 'WERE', '', 'wER'),
            ('', 'WA', 'S', 'wAA'),
            ('', 'WA', 'T', 'wAA'),
            ('', 'WHERE', '', 'WHEHr'),
            ('', 'WHAT', '', 'WHAAt'),
            ('', 'WHOL', '', 'hOWl'),
            ('', 'WHO', '', 'hUW'),
            ('', 'WH', '', 'WH'),
            ('', 'WAR', '', 'wAOr'),
            ('', 'WOR', '^', 'wER'),
            ('', 'WR', '', 'r'),
            ('', 'W', '', 'w'),
        ),
        'X': (('', 'X', '', 'ks'),),
        'Y': (
            ('', 'YOUNG', '', 'yAHNG'),
            (' ', 'YOU', '', 'yUW'),
            (' ', 'YES', '', 'yEHs'),
            (' ', 'Y', '', 'y'),
            ('#:^', 'Y', ' ', 'IY'),
            ('#:^', 'Y', 'I', 'IY'),
            (' :', 'Y', ' ', 'AY'),
            (' :', 'Y', '#', 'AY'),
            (' :', 'Y', '^+:#', 'IH'),
            (' :', 'Y', '^#', 'AY'),
            ('', 'Y', '', 'IH'),
        ),
        'Z': (('', 'Z', '', 'z'),),
    }

    # Regex fragment for each metacharacter that may appear in a rule's
    # left or right context; any other character stands for itself.
    _context_tokens = {
        '#': '[AEIOU]+',  # one or more vowels
        ':': '[BCDFGHJKLMNPQRSTVWXYZ]*',  # zero or more consonants
        '^': '[BCDFGHJKLMNPQRSTVWXYZ]',  # exactly one consonant
        '.': '[BDVGJLMNTWZ]',  # one of B, D, V, G, J, L, M, N, T, W, Z
        '%': '(ER|E|ES|ED|ING|ELY)',  # one of these suffixes
        '+': '[EIY]',  # one of E, I, Y
        ' ': '^',  # word boundary (rewritten to '$' for right contexts)
    }

    @classmethod
    def _to_regex(cls, pattern: str, left_match: bool = True) -> str:
        """Translate a rule context into a regular expression.

        Parameters
        ----------
        pattern : str
            A left or right context taken from a rule in ``_rules``
        left_match : bool
            True if the context is a left context (matched against the text
            preceding the candidate match), False if it is a right context
            (matched against the text following it)

        Returns
        -------
        str
            The regular expression source for the context

        """
        new_pattern = ''.join(
            cls._context_tokens.get(char, char) for char in pattern
        )

        if left_match:
            # A left context must end exactly where the match begins.
            new_pattern += '$'
            # NOTE(review): this tests the *rule* pattern, in which '^' is
            # the single-consonant marker, not the translated regex. Left
            # contexts containing '^' (e.g. '#:^E') therefore get no leading
            # '.*' and, since re.match anchors at the start of the string,
            # have to account for all of the text preceding the match. This
            # may be unintended, but it is preserved because changing it
            # would alter existing encodings.
            if '^' not in pattern:
                new_pattern = '^.*' + new_pattern
        else:
            # In a right context the only literal '^' comes from the ' '
            # (word boundary) marker, which here means end of word.
            new_pattern = '^' + new_pattern.replace('^', '$')
            if '$' not in new_pattern:
                new_pattern += '.*$'

        return new_pattern

    def encode(self, word: str) -> str:
        """Return the Naval Research Laboratory phonetic encoding of a word.

        The word is upper-cased and scanned left to right. At each position
        the rules for the current character are tried in order; the first
        rule whose match text and contexts apply contributes its output and
        consumes its match text. A character for which no rule applies is
        copied to the output unchanged.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The NRL phonetic encoding

        Examples
        --------
        >>> pe = NRL()
        >>> pe.encode('the')
        'DHAX'
        >>> pe.encode('round')
        'rAWnd'
        >>> pe.encode('quick')
        'kwIHk'
        >>> pe.encode('eaten')
        'IYtEHn'
        >>> pe.encode('Smith')
        'smIHTH'
        >>> pe.encode('Larsen')
        'lAArsEHn'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        word = word.upper()

        pron = []
        pos = 0
        while pos < len(word):
            left_orig = word[:pos]
            right_orig = word[pos:]
            # Characters without their own rule set share the ' ' rules.
            first = word[pos] if word[pos] in self._rules else ' '
            for left, match, right, out in self._rules[first]:
                if not right_orig.startswith(match):
                    continue
                if left and not re_match(
                    self._to_regex(left, left_match=True), left_orig
                ):
                    continue
                # The right-context regex is only built once the cheaper
                # checks above have passed.
                if right and not re_match(
                    self._to_regex(right, left_match=False),
                    right_orig[len(match) :],
                ):
                    continue
                pron.append(out)
                pos += len(match)
                break
            else:
                # No rule applied: pass the character through unchanged.
                pron.append(word[pos])
                pos += 1

        return ''.join(pron)
@deprecated(
    deprecated_in='0.4.0',
    removed_in='0.6.0',
    current_version=__version__,
    details='Use the NRL.encode method instead.',
)
def nrl(word: str) -> str:
    """Return the Naval Research Laboratory phonetic encoding of a word.

    This is a wrapper for :py:meth:`NRL.encode`.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The NRL phonetic encoding

    Examples
    --------
    >>> nrl('the')
    'DHAX'
    >>> nrl('round')
    'rAWnd'
    >>> nrl('quick')
    'kwIHk'
    >>> nrl('eaten')
    'IYtEHn'
    >>> nrl('Smith')
    'smIHTH'
    >>> nrl('Larsen')
    'lAArsEHn'

    .. versionadded:: 0.3.0

    """
    # A fresh encoder is created per call; NRL() is constructed without
    # arguments here, so no configuration is carried between calls.
    return NRL().encode(word)
# Run this module's docstring examples when it is executed as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()