# Source code for abydos.fingerprint

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.fingerprint.

The fingerprint module implements string fingerprints such as:
    - string fingerprint
    - q-gram fingerprint
    - phonetic fingerprint
    - Pollock & Zamora's skeleton key
    - Pollock & Zamora's omission key
    - Cisłak & Grabowski's occurrence fingerprint
    - Cisłak & Grabowski's occurrence halved fingerprint
    - Cisłak & Grabowski's count fingerprint
    - Cisłak & Grabowski's position fingerprint
    - Synoname Toolcode
"""

from __future__ import division, unicode_literals

from collections import Counter
from unicodedata import normalize as unicode_normalize

from six import text_type

from .phonetic import double_metaphone
from .qgram import QGrams

# Public API of this module: the names bound by a star-import of the module.
# The private Synoname lookup table (_synoname_special_table) is deliberately
# not listed.
__all__ = ['MOST_COMMON_LETTERS', 'MOST_COMMON_LETTERS_CG',
           'MOST_COMMON_LETTERS_DE', 'MOST_COMMON_LETTERS_DE_LC',
           'MOST_COMMON_LETTERS_EN_LC', 'count_fingerprint',
           'occurrence_fingerprint', 'occurrence_halved_fingerprint',
           'omission_key', 'phonetic_fingerprint', 'position_fingerprint',
           'qgram_fingerprint', 'skeleton_key', 'str_fingerprint',
           'synoname_toolcode']


def str_fingerprint(phrase, joiner=' '):
    """Return string fingerprint.

    The fingerprint of a string is a string consisting of all of the unique
    words in a string, alphabetized & concatenated with intervening joiners.
    This fingerprint is described at :cite:`OpenRefine:2012`.

    Before the words are collected, the phrase is stripped, lowercased and
    NFKD-normalized, and every character that is neither alphanumeric nor
    whitespace is discarded.

    :param str phrase: the string from which to calculate the fingerprint
    :param str joiner: the string that will be placed between each word
    :returns: the fingerprint of the phrase
    :rtype: str

    >>> str_fingerprint('The quick brown fox jumped over the lazy dog.')
    'brown dog fox jumped lazy over quick the'
    """
    phrase = unicode_normalize('NFKD', text_type(phrase.strip().lower()))
    phrase = ''.join(c for c in phrase if c.isalnum() or c.isspace())
    # sorted() accepts any iterable, so the set needs no intermediate list
    return joiner.join(sorted(set(phrase.split())))


def qgram_fingerprint(phrase, qval=2, start_stop='', joiner=''):
    """Return Q-Gram fingerprint.

    A q-gram fingerprint is a string consisting of all of the unique q-grams
    in a string, alphabetized & concatenated. This fingerprint is described at
    :cite:`OpenRefine:2012`.

    Before the q-grams are extracted, the phrase is stripped, lowercased and
    NFKD-normalized, and every non-alphanumeric character (whitespace
    included) is discarded.

    :param str phrase: the string from which to calculate the q-gram
        fingerprint
    :param int qval: the length of each q-gram (by default 2)
    :param str start_stop: the start & stop symbol(s) to concatenate on either
        end of the phrase; passed unchanged to the QGrams class imported from
        the sibling qgram module
    :param str joiner: the string that will be placed between each q-gram
    :returns: the q-gram fingerprint of the phrase
    :rtype: str

    >>> qgram_fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qgram_fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qgram_fingerprint('Niall')
    'aliallni'
    """
    phrase = unicode_normalize('NFKD', text_type(phrase.strip().lower()))
    phrase = ''.join(c for c in phrase if c.isalnum())
    qgrams = QGrams(phrase, qval, start_stop)
    # iterating the QGrams object yields each distinct q-gram once
    return joiner.join(sorted(qgrams))


def phonetic_fingerprint(phrase, phonetic_algorithm=double_metaphone,
                         joiner=' ', *args):
    """Return the phonetic fingerprint of a phrase.

    A phonetic fingerprint is identical to a standard string fingerprint, as
    implemented in str_fingerprint(), but performs the fingerprinting function
    after converting each word of the phrase to its phonetic form, as
    determined by some phonetic algorithm. This fingerprint is described at
    :cite:`OpenRefine:2012`.

    :param str phrase: the string from which to calculate the phonetic
        fingerprint
    :param function phonetic_algorithm: a phonetic algorithm that takes a
        string and returns a string (presumably a phonetic representation of
        the original string) By default, this function uses
        abydos.phonetic.double_metaphone()
    :param str joiner: the string that will be placed between each word of
        the resulting fingerprint
    :param args: additional arguments to pass to the phonetic algorithm,
        along with the phrase itself
    :returns: the phonetic fingerprint of the phrase
    :rtype: str

    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.')
    '0 afr fks jmpt kk ls prn tk'
    >>> from abydos.phonetic import soundex
    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.',
    ... phonetic_algorithm=soundex)
    'b650 d200 f200 j513 l200 o160 q200 t000'
    """
    encoded_words = []
    for word in phrase.split():
        word = phonetic_algorithm(word, *args)
        # Some algorithms return a sequence of encodings rather than a single
        # string; only the first encoding is used.
        if not isinstance(word, text_type) and hasattr(word, '__iter__'):
            word = word[0]
        encoded_words.append(word)

    # The encoded words are always handed to str_fingerprint() separated by
    # whitespace, because that is what it splits on; the caller's joiner is
    # applied only to the final output. Building the intermediate string with
    # the joiner (and trimming it with [:-len(joiner)]) returned '' for an
    # empty joiner and fused all words together for any non-space joiner.
    return str_fingerprint(' '.join(encoded_words), joiner)


def skeleton_key(word):
    """Return the skeleton key.

    The skeleton key of a word is defined in :cite:`Pollock:1984`.

    The word is uppercased and NFKD-normalized and everything except the
    letters A-Z is dropped. The key is then the first remaining letter,
    followed by the other consonants in order of first appearance, followed
    by the other vowels in order of first appearance, with no letter
    repeated.

    :param str word: the word to transform into its skeleton key
    :returns: the skeleton key (an empty string if the word has no letters)
    :rtype: str

    >>> skeleton_key('The quick brown fox jumped over the lazy dog.')
    'THQCKBRWNFXJMPDVLZYGEUIOA'
    >>> skeleton_key('Christopher')
    'CHRSTPIOE'
    >>> skeleton_key('Niall')
    'NLIA'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U'}
    _letters = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                'Y', 'Z'}

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in _letters)

    # slicing (rather than indexing) keeps an empty word from raising
    start = word[0:1]
    consonant_part = ''
    vowel_part = ''

    # add consonants & vowels to separate strings
    # (omitting the first char & duplicates)
    for char in word[1:]:
        if char == start:
            continue
        if char in _vowels:
            if char not in vowel_part:
                vowel_part += char
        elif char not in consonant_part:
            consonant_part += char

    # the first char, followed by consonants, followed by vowels
    return start + consonant_part + vowel_part


def omission_key(word):
    """Return the omission key.

    The omission key of a word is defined in :cite:`Pollock:1984`.

    The word is uppercased and NFKD-normalized and everything except the
    letters A-Z is dropped. The key is then the consonants present in the
    word, in the fixed order given by ``_consonants`` below, followed by the
    vowels in order of first appearance, with no letter repeated.

    :param str word: the word to transform into its omission key
    :returns: the omission key (an empty string if the word has no letters)
    :rtype: str

    >>> omission_key('The quick brown fox jumped over the lazy dog.')
    'JKQXZVWYBFMGPDHCLNTREUIOA'
    >>> omission_key('Christopher')
    'PHCTSRIOE'
    >>> omission_key('Niall')
    'LNIA'
    """
    # a tuple, not a set: the order of the consonants defines the key
    _consonants = ('J', 'K', 'Q', 'X', 'Z', 'V', 'W', 'Y', 'B', 'F', 'M', 'G',
                   'P', 'D', 'H', 'C', 'L', 'N', 'T', 'S', 'R')
    _letters = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                'Y', 'Z'}

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in _letters)

    # add consonants in order supplied by _consonants (no duplicates)
    key = ''.join(char for char in _consonants if char in word)

    # add vowels in order they appeared in the word (no duplicates)
    for char in word:
        if char not in _consonants and char not in key:
            key += char

    return key


# The tuples below are frequency-ordered alphabets (most frequent first).
# They are meant to be passed as the ``most_common`` argument of the
# Cisłak & Grabowski fingerprint functions in this module; the position of a
# letter in the tuple decides which fingerprint bits it occupies, so the
# order of the entries must not be changed.

# most common letters, as defined in Cisłak & Grabowski
MOST_COMMON_LETTERS_CG = ('e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd',
                          'l', 'c', 'u', 'm', 'w', 'f')

# most common letters (case-folded to lowercase), as shown in Google Books
# English n-grams, among letters a-z & digits 0-9
MOST_COMMON_LETTERS_EN_LC = ('e', 't', 'a', 'i', 'o', 'n', 's', 'r', 'h', 'l',
                             'd', 'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b',
                             'v', 'k', 'x', 'j', 'q', 'z', '1', '2', '0', '9',
                             '3', '4', '8', '5', '6', '7')

# most common letters, as shown in Google Books English n-grams, among letters
# A-Z, a-z & digits 0-9 (case-sensitive: upper- and lowercase are distinct)
MOST_COMMON_LETTERS = ('e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l', 'd',
                       'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b', 'v', 'k',
                       'T', 'I', 'A', 'S', 'C', 'x', 'M', 'P', 'E', 'B', 'H',
                       'R', 'N', 'D', 'L', 'F', 'W', 'O', 'q', 'G', 'z', 'j',
                       'J', 'U', 'V', 'K', 'Y', '1', '2', '0', 'X', '9', 'Q',
                       '3', 'Z', '4', '8', '5', '6', '7',)

# most common letters (case-folded to lowercase), as shown in Google Books
# German n-grams, among letters (a-z and umlauted vowels & eszett) & digits 0-9
# NOTE(review): this lowercase-only tuple carries the un-suffixed name, whereas
# for English the lowercase-only tuple is the one suffixed _LC. The two German
# names look swapped relative to the English pair -- confirm before relying on
# the suffix.
MOST_COMMON_LETTERS_DE = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                          'l', 'g', 'c', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                          'v', 'p', 'ü', 'ä', 'ß', 'ö', 'j', 'y', 'x', 'q',
                          '1', '2', '3', '4', '0', '5', '6', '9', '8', '7')

# most common letters, as shown in Google Books German n-grams, among letters
# (A-Z, a-z, umlauted vowels & eszett) & digits 0-9
# NOTE(review): despite the _LC suffix this tuple is case-sensitive (it
# contains both upper- and lowercase letters); it is NOT case-folded.
MOST_COMMON_LETTERS_DE_LC = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                             'l', 'c', 'g', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                             'v', 'p', 'ü', 'ä', 'S', 'A', 'D', 'B', 'E', 'G',
                             'M', 'ß', 'V', 'K', 'ö', 'W', 'F', 'P', 'R', 'I',
                             'H', 'L', 'T', 'N', 'Z', 'y', 'U', 'j', 'J', 'O',
                             'C', 'x', 'q', 'Ü', 'Q', 'X', 'Ä', 'Ö', '1', '2',
                             'Y', '3', '4', '0', '5', '6', '9', '8', '7')


def occurrence_fingerprint(word, n_bits=16,
                           most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence fingerprint.

    Based on the occurrence fingerprint from :cite:`Cislak:2017`.

    One bit is emitted per letter of ``most_common``, most significant bit
    first: 1 if the letter occurs anywhere in the word, 0 otherwise. If
    ``most_common`` has fewer than ``n_bits`` letters, the result is padded
    with zero bits on the right so that it is always ``n_bits`` wide.

    :param str word: the word to fingerprint
    :param int n_bits: number of bits in the fingerprint returned; values
        of 0 or less yield 0
    :param list most_common: the most common tokens in the target language,
        ordered by frequency
    :returns: the occurrence fingerprint
    :rtype: int

    >>> bin(occurrence_fingerprint('hat'))
    '0b110000100000000'
    >>> bin(occurrence_fingerprint('niall'))
    '0b10110000100000'
    >>> bin(occurrence_fingerprint('colin'))
    '0b1110000110000'
    >>> bin(occurrence_fingerprint('atcg'))
    '0b110000000010000'
    >>> bin(occurrence_fingerprint('entreatment'))
    '0b1110010010000100'
    """
    # A non-positive width has no bits to fill. Without this guard the
    # remaining-bit counter would never reach zero and every letter of
    # most_common would be consumed regardless of n_bits.
    if n_bits <= 0:
        return 0

    letters = set(word)
    fingerprint = 0

    for letter in most_common:
        if not n_bits:
            break
        fingerprint = (fingerprint << 1) | int(letter in letters)
        n_bits -= 1

    # pad on the right with zeros for any bits most_common could not fill
    return fingerprint << n_bits


def occurrence_halved_fingerprint(word, n_bits=16,
                                  most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence halved fingerprint.

    Based on the occurrence halved fingerprint from :cite:`Cislak:2017`.

    The word is split into a first half (``len(word)//2`` characters) and a
    second half (the rest). Two bits are emitted per letter of
    ``most_common``, most significant first: whether the letter occurs in the
    first half, then whether it occurs in the second half. If ``most_common``
    runs out first, the result is padded with zero bits on the right.

    :param str word: the word to fingerprint
    :param int n_bits: number of bits in the fingerprint returned; odd values
        are rounded up to the next even number and values of 0 or less
        yield 0
    :param list most_common: the most common tokens in the target language,
        ordered by frequency
    :returns: the occurrence halved fingerprint
    :rtype: int

    >>> bin(occurrence_halved_fingerprint('hat'))
    '0b1010000000010'
    >>> bin(occurrence_halved_fingerprint('niall'))
    '0b10010100000'
    >>> bin(occurrence_halved_fingerprint('colin'))
    '0b1001010000'
    >>> bin(occurrence_halved_fingerprint('atcg'))
    '0b10100000000000'
    >>> bin(occurrence_halved_fingerprint('entreatment'))
    '0b1111010000110000'
    """
    # A non-positive width has no bits to fill; a negative even value would
    # otherwise never hit the zero test below and consume all of most_common.
    if n_bits <= 0:
        return 0
    # each letter needs two bits, so work with an even width
    if n_bits % 2:
        n_bits += 1

    half = len(word) // 2
    first_half = set(word[:half])
    second_half = set(word[half:])
    fingerprint = 0

    for letter in most_common:
        if not n_bits:
            break
        fingerprint = ((fingerprint << 2) |
                       (int(letter in first_half) << 1) |
                       int(letter in second_half))
        n_bits -= 2

    # pad on the right with zeros for any bits most_common could not fill
    return fingerprint << n_bits


def count_fingerprint(word, n_bits=16,
                      most_common=MOST_COMMON_LETTERS_CG):
    """Return the count fingerprint.

    Based on the count fingerprint from :cite:`Cislak:2017`.

    Two bits are emitted per letter of ``most_common``, most significant
    first, holding the low two bits of the number of times the letter occurs
    in the word (so counts wrap around modulo 4). If ``most_common`` runs out
    first, the result is padded with zero bits on the right.

    :param str word: the word to fingerprint
    :param int n_bits: number of bits in the fingerprint returned; odd values
        are rounded up to the next even number and values of 0 or less
        yield 0
    :param list most_common: the most common tokens in the target language,
        ordered by frequency
    :returns: the count fingerprint
    :rtype: int

    >>> bin(count_fingerprint('hat'))
    '0b1010000000001'
    >>> bin(count_fingerprint('niall'))
    '0b10001010000'
    >>> bin(count_fingerprint('colin'))
    '0b101010000'
    >>> bin(count_fingerprint('atcg'))
    '0b1010000000000'
    >>> bin(count_fingerprint('entreatment'))
    '0b1111010000100000'
    """
    # A non-positive width has no bits to fill; a negative value would
    # otherwise reach the final shift and raise ValueError (negative shift).
    if n_bits <= 0:
        return 0
    # each letter needs two bits, so work with an even width
    if n_bits % 2:
        n_bits += 1

    counts = Counter(word)
    fingerprint = 0

    for letter in most_common:
        if not n_bits:
            break
        # keep only the low two bits of the count
        fingerprint = (fingerprint << 2) | (counts[letter] & 3)
        n_bits -= 2

    # pad on the right with zeros for any bits most_common could not fill
    return fingerprint << n_bits


def position_fingerprint(word, n_bits=16,
                         most_common=MOST_COMMON_LETTERS_CG,
                         bits_per_letter=3):
    """Return the position fingerprint.

    Based on the position fingerprint from :cite:`Cislak:2017`.

    For each letter of ``most_common``, most significant first, a field of
    ``bits_per_letter`` bits holds the index of the letter's first occurrence
    in the word, capped at ``2**bits_per_letter - 1``. A letter that does not
    occur gets that same maximum value (all ones). The last field is
    truncated if fewer than ``bits_per_letter`` bits remain, and if
    ``most_common`` runs out first the result is padded with one bits on the
    right.

    :param str word: the word to fingerprint
    :param int n_bits: number of bits in the fingerprint returned; values of
        0 or less yield 0
    :param list most_common: the most common tokens in the target language,
        ordered by frequency
    :param int bits_per_letter: the bits to assign for letter position
    :returns: the position fingerprint
    :rtype: int

    >>> bin(position_fingerprint('hat'))
    '0b1110100011111111'
    >>> bin(position_fingerprint('niall'))
    '0b1111110101110010'
    >>> bin(position_fingerprint('colin'))
    '0b1111111110010111'
    >>> bin(position_fingerprint('atcg'))
    '0b1110010001111111'
    >>> bin(position_fingerprint('entreatment'))
    '0b101011111111'
    """
    # A non-positive width has no bits to fill; a negative value would
    # otherwise be used as a shift count and raise ValueError.
    if n_bits <= 0:
        return 0

    max_position = 2**bits_per_letter - 1

    # First-occurrence index of every character, capped at the field maximum.
    # Characters outside most_common are recorded too but never looked up, so
    # no membership scan of most_common is needed here.
    position = {}
    for pos, letter in enumerate(word):
        position.setdefault(letter, min(pos, max_position))

    fingerprint = 0

    for letter in most_common:
        if not n_bits:
            break
        # the final field may have fewer than bits_per_letter bits left
        width = min(bits_per_letter, n_bits)
        fingerprint <<= width
        # absent letters are encoded as the maximum position (all ones)
        fingerprint += min(position.get(letter, max_position), 2**n_bits - 1)
        n_bits -= width

    # pad on the right with ones for any bits most_common could not fill
    return (fingerprint << n_bits) | ((1 << n_bits) - 1)


# Lookup table of "special" name elements used by synoname_toolcode().
#
# Each row is a 4-tuple (roman, match, extra, method):
#   roman  -- True if the entry is a Roman numeral; synoname_toolcode() then
#             also records the row in the Roman-numeral field of the toolcode
#   match  -- the text searched for in the full name
#   extra  -- if non-empty, an additional string that is searched for
#             anywhere in the full name
#   method -- bitmask of the places where ``match`` is looked for:
#             1 = end, 2 = middle, 4 = beginning, 8 = beginning (no space)
#
# The index of a row is written into the toolcode as a three-digit number and
# rows are tried in table order, so rows must not be reordered, inserted or
# removed without changing the codes produced.
_synoname_special_table = (
    # Roman, match, extra, method
    (False, 'NONE', '', 0),
    (False, 'aine', '', 3),
    (False, 'also erroneously', '', 4),
    (False, 'also identified with the', '', 2),
    (False, 'also identified with', '', 2),
    (False, 'archbishop', '', 7),
    (False, 'atelier', '', 7),
    (False, 'baron', '', 7),
    (False, 'cadet', '', 3),
    (False, 'cardinal', '', 7),
    (False, 'circle of', '', 5),
    (False, 'circle', '', 5),
    (False, 'class of', '', 5),
    (False, 'conde de', '', 7),
    (False, 'countess', '', 7),
    (False, 'count', '', 7),
    (False, "d'", " d'", 15),
    (False, 'dai', '', 15),
    (False, "dall'", " dall'", 15),
    (False, 'dalla', '', 15),
    (False, 'dalle', '', 15),
    (False, 'dal', '', 15),
    (False, 'da', '', 15),
    (False, 'degli', '', 15),
    (False, 'della', '', 15),
    (False, 'del', '', 15),
    (False, 'den', '', 15),
    (False, 'der altere', '', 3),
    (False, 'der jungere', '', 3),
    (False, 'der', '', 15),
    (False, 'de la', '', 15),
    (False, 'des', '', 15),
    (False, "de'", " de'", 15),
    (False, 'de', '', 15),
    (False, 'di ser', '', 7),
    (False, 'di', '', 15),
    (False, 'dos', '', 15),
    (False, 'du', '', 15),
    (False, 'duke of', '', 7),
    (False, 'earl of', '', 7),
    (False, 'el', '', 15),
    (False, 'fils', '', 3),
    (False, 'florentine follower of', '', 5),
    (False, 'follower of', '', 5),
    (False, 'fra', '', 7),
    (False, 'freiherr von', '', 7),
    (False, 'giovane', '', 7),
    (False, 'group', '', 5),
    (True, 'iii', '', 3),
    (True, 'ii', '', 3),
    (False, 'il giovane', '', 7),
    (False, 'il vecchio', '', 7),
    (False, 'il', '', 15),
    (False, "in't", '', 7),
    (False, 'in het', '', 7),
    (True, 'iv', '', 3),
    (True, 'ix', '', 3),
    (True, 'i', '', 3),
    (False, 'jr.', '', 3),
    (False, 'jr', '', 3),
    (False, 'juniore', '', 3),
    (False, 'junior', '', 3),
    (False, 'king of', '', 7),
    (False, "l'", " l'", 15),
    (False, "l'aine", '', 3),
    (False, 'la', '', 15),
    (False, 'le jeune', '', 3),
    (False, 'le', '', 15),
    (False, 'lo', '', 15),
    (False, 'maestro', '', 7),
    (False, 'maitre', '', 7),
    (False, 'marchioness', '', 7),
    (False, 'markgrafin von', '', 7),
    (False, 'marquess', '', 7),
    (False, 'marquis', '', 7),
    (False, 'master of the', '', 7),
    (False, 'master of', '', 7),
    (False, 'master known as the', '', 7),
    (False, 'master with the', '', 7),
    (False, 'master with', '', 7),
    (False, 'masters', '', 7),
    (False, 'master', '', 7),
    (False, 'meister', '', 7),
    (False, 'met de', '', 7),
    (False, 'met', '', 7),
    (False, 'mlle.', '', 7),
    (False, 'mlle', '', 7),
    (False, 'monogrammist', '', 7),
    (False, 'monsu', '', 7),
    (False, 'nee', '', 2),
    (False, 'of', '', 3),
    (False, 'oncle', '', 3),
    (False, 'op den', '', 15),
    (False, 'op de', '', 15),
    (False, 'or', '', 2),
    (False, 'over den', '', 15),
    (False, 'over de', '', 15),
    (False, 'over', '', 7),
    (False, 'p.re', '', 7),
    (False, 'p.r.a.', '', 1),
    (False, 'padre', '', 7),
    (False, 'painter', '', 7),
    (False, 'pere', '', 3),
    (False, 'possibly identified with', '', 6),
    (False, 'possibly', '', 6),
    (False, 'pseudo', '', 15),
    (False, 'r.a.', '', 1),
    (False, 'reichsgraf von', '', 7),
    (False, 'ritter von', '', 7),
    (False, 'sainte-', ' sainte-', 8),
    (False, 'sainte', '', 7),
    (False, 'saint-', ' saint-', 8),
    (False, 'saint', '', 7),
    (False, 'santa', '', 15),
    (False, "sant'", " sant'", 15),
    (False, 'san', '', 15),
    (False, 'ser', '', 7),
    (False, 'seniore', '', 3),
    (False, 'senior', '', 3),
    (False, 'sir', '', 5),
    (False, 'sr.', '', 3),
    (False, 'sr', '', 3),
    (False, 'ss.', ' ss.', 14),
    (False, 'ss', '', 6),
    (False, 'st-', ' st-', 8),
    (False, 'st.', ' st.', 15),
    (False, 'ste-', ' ste-', 8),
    (False, 'ste.', ' ste.', 15),
    (False, 'studio', '', 7),
    (False, 'sub-group', '', 5),
    (False, 'sultan of', '', 7),
    (False, 'ten', '', 15),
    (False, 'ter', '', 15),
    (False, 'the elder', '', 3),
    (False, 'the younger', '', 3),
    (False, 'the', '', 7),
    (False, 'tot', '', 15),
    (False, 'unidentified', '', 1),
    (False, 'van den', '', 15),
    (False, 'van der', '', 15),
    (False, 'van de', '', 15),
    (False, 'vanden', '', 15),
    (False, 'vander', '', 15),
    (False, 'van', '', 15),
    (False, 'vecchia', '', 7),
    (False, 'vecchio', '', 7),
    (True, 'viii', '', 3),
    (True, 'vii', '', 3),
    (True, 'vi', '', 3),
    (True, 'v', '', 3),
    (False, 'vom', '', 7),
    (False, 'von', '', 15),
    (False, 'workshop', '', 7),
    (True, 'xiii', '', 3),
    (True, 'xii', '', 3),
    (True, 'xiv', '', 3),
    (True, 'xix', '', 3),
    (True, 'xi', '', 3),
    (True, 'xviii', '', 3),
    (True, 'xvii', '', 3),
    (True, 'xvi', '', 3),
    (True, 'xv', '', 3),
    (True, 'xx', '', 3),
    (True, 'x', '', 3),
    (False, 'y', '', 7)
)


def synoname_toolcode(lname, fname='', qual='', normalize=0):
    """Build the Synoname toolcode.

    Cf. :cite:`Getty:1991,Gross:1991`.

    All three inputs are lowercased. The toolcode is the concatenation of
    ten fields:

        0. qualifier class (1 digit)
        1. punctuation (1 digit: 2 if the name contains '.', 1 if it
           contains other punctuation, else 0)
        2. generation (1 digit: 1 for elder/senior terms, 2 for
           younger/junior terms, else 0)
        3. index of the first Roman numeral found in the specials table
           (3 digits)
        4. length of the first name (2 digits)
        5. length of the last name (2 digits)
        6. the separator '$'
        7. specials found: table index (3 digits) plus a location letter
           for each
        8. the separator '$'
        9. search range: initials of the name's words plus matched prefixes

    Fields 4 and 5 are measured after the comma flip and elder/younger move
    but before any Roman numeral is moved.

    :param str lname: last name
    :param str fname: first name (can be blank)
    :param str qual: qualifier
    :param int normalize: normalization mode (0, 1, or 2). Any non-zero value
        moves whatever follows the first comma of the last name to the front
        of the first name. Mode 2 additionally moves elder/younger terms and
        Roman numerals from the first name to the end of the last name.
    :returns: the transformed last and first names and the synoname toolcode
    :rtype: tuple

    >>> synoname_toolcode('hat')
    ('hat', '', '0000000003$$h')
    >>> synoname_toolcode('niall')
    ('niall', '', '0000000005$$n')
    >>> synoname_toolcode('colin')
    ('colin', '', '0000000005$$c')
    >>> synoname_toolcode('atcg')
    ('atcg', '', '0000000004$$a')
    >>> synoname_toolcode('entreatment')
    ('entreatment', '', '0000000011$$e')

    >>> synoname_toolcode('Ste.-Marie', 'Count John II', normalize=2)
    ('ste.-marie ii', 'count john', '0200491310$015b049a127c$smcji')
    >>> synoname_toolcode('Michelangelo IV', '', 'Workshop of')
    ('michelangelo iv', '', '3000550015$055b$mi')
    """
    # bit flags used in the ``method`` column of _synoname_special_table
    method_dict = {'end': 1, 'middle': 2, 'beginning': 4,
                   'beginning_no_space': 8}

    lname = lname.lower()
    fname = fname.lower()
    qual = qual.lower()

    # Start with the basic code
    toolcode = ['0', '0', '0', '000', '00', '00', '$', '', '$', '']

    full_name = ' '.join((lname, fname))

    # Fill field 0 (qualifier)
    qual_3 = {'adaptation after', 'after', 'assistant of', 'assistants of',
              'circle of', 'follower of', 'imitator of', 'in the style of',
              'manner of', 'pupil of', 'school of', 'studio of',
              'style of', 'workshop of'}
    qual_2 = {'copy after', 'copy after?', 'copy of'}
    qual_1 = {'ascribed to', 'attributed to or copy after',
              'attributed to', 'possibly'}

    if qual in qual_3:
        toolcode[0] = '3'
    elif qual in qual_2:
        toolcode[0] = '2'
    elif qual in qual_1:
        toolcode[0] = '1'

    # Fill field 1 (punctuation)
    if '.' in full_name:
        toolcode[1] = '2'
    else:
        for punct in ',-/:;"&\'()!{|}?$%*+<=>[\\]^_`~':
            if punct in full_name:
                toolcode[1] = '1'
                break

    # Fill field 2 (generation)
    gen_1 = ('the elder', ' sr.', ' sr', 'senior', 'der altere', 'il vecchio',
             "l'aine", 'p.re', 'padre', 'seniore', 'vecchia', 'vecchio')
    gen_2 = (' jr.', ' jr', 'der jungere', 'il giovane', 'giovane', 'juniore',
             'junior', 'le jeune', 'the younger')

    elderyounger = ''  # save elder/younger for possible movement later
    for gen in gen_1:
        if gen in full_name:
            toolcode[2] = '1'
            elderyounger = gen
            break
    else:
        # only look for a younger/junior term if no elder/senior term matched
        for gen in gen_2:
            if gen in full_name:
                toolcode[2] = '2'
                elderyounger = gen
                break

    # do comma flip
    if normalize:
        comma = lname.find(',')
        if comma != -1:
            # lstrip() rather than peeling characters off by index: the text
            # after the comma may be empty (e.g. 'smith,'), and indexing its
            # first character would raise IndexError
            lname_end = lname[comma + 1:].lstrip(' ,')
            fname = lname_end + ' ' + fname
            lname = lname[:comma].strip()

    # do elder/younger move
    if normalize == 2 and elderyounger:
        elderyounger_loc = fname.find(elderyounger)
        if elderyounger_loc != -1:
            lname = ' '.join((lname, elderyounger.strip()))
            fname = ' '.join((fname[:elderyounger_loc].strip(),
                              fname[elderyounger_loc +
                                    len(elderyounger):])).strip()

    toolcode[4] = '{:02d}'.format(len(fname))
    toolcode[5] = '{:02d}'.format(len(lname))

    # strip punctuation
    for char in ',/:;"&()!{|}?$%*+<=>[\\]^_`~':
        full_name = full_name.replace(char, '')
    # Hyphens become spaces, except in the sequence 'b-g'. enumerate() keeps
    # iterating the string object it was given; each replacement is the same
    # length, so the indexes stay valid for the rebuilt full_name.
    for pos, char in enumerate(full_name):
        if char == '-' and full_name[pos - 1:pos + 2] != 'b-g':
            full_name = full_name[:pos] + ' ' + full_name[pos + 1:]

    # Fill field 9 (search range)
    for letter in [_[0] for _ in full_name.split()]:
        if letter not in toolcode[9]:
            toolcode[9] += letter
        if len(toolcode[9]) == 15:
            break

    def roman_check(numeral, fname, lname):
        """Move Roman numerals from first name to last.

        Only the first occurrence of the numeral in the first name is
        considered, and it is moved only if it is followed by a space, a
        comma or the end of the first name.
        """
        loc = fname.find(numeral)
        if loc != -1:
            # Slice (never index) the character after the numeral: the slice
            # is '' when the numeral ends the first name. The "found" test
            # guards both alternatives; previously `A and B or C` let the
            # indexing branch run even when the numeral was absent.
            end = loc + len(numeral)
            if fname[end:end + 1] in {'', ' ', ','}:
                lname = ' '.join((lname, numeral))
                fname = ' '.join((fname[:loc].strip(),
                                  fname[end:].lstrip(' ,')))
        return fname.strip(), lname.strip()

    # Fill fields 7 (specials) and 3 (roman numerals)
    for num, special in enumerate(_synoname_special_table):
        roman, match, extra, method = special
        if method & method_dict['end']:
            match_context = ' ' + match
            loc = full_name.find(match_context)
            if ((len(full_name) > len(match_context)) and
                    (loc == len(full_name) - len(match_context))):
                if roman:
                    # 'i.', 'v.' or 'x.' in the first name suggests an
                    # abbreviated initial rather than a numeral
                    if not any(abbr in fname for abbr in ('i.', 'v.', 'x.')):
                        full_name = full_name[:loc]
                        toolcode[7] += '{:03d}'.format(num) + 'a'
                        if toolcode[3] == '000':
                            toolcode[3] = '{:03d}'.format(num)
                        if normalize == 2:
                            fname, lname = roman_check(match, fname, lname)
                else:
                    full_name = full_name[:loc]
                    toolcode[7] += '{:03d}'.format(num) + 'a'
        if method & method_dict['middle']:
            match_context = ' ' + match + ' '
            loc = 0
            while loc != -1:
                loc = full_name.find(match_context, loc+1)
                if loc > 0:
                    if roman:
                        if not any(abbr in fname for abbr in
                                   ('i.', 'v.', 'x.')):
                            full_name = (full_name[:loc] +
                                         full_name[loc + len(match) + 1:])
                            toolcode[7] += '{:03d}'.format(num) + 'b'
                            if toolcode[3] == '000':
                                toolcode[3] = '{:03d}'.format(num)
                            if normalize == 2:
                                fname, lname = roman_check(match, fname, lname)
                    else:
                        full_name = (full_name[:loc] +
                                     full_name[loc + len(match) + 1:])
                        toolcode[7] += '{:03d}'.format(num) + 'b'
        if method & method_dict['beginning']:
            match_context = match + ' '
            loc = full_name.find(match_context)
            if loc == 0:
                full_name = full_name[len(match) + 1:]
                toolcode[7] += '{:03d}'.format(num) + 'c'
        if method & method_dict['beginning_no_space']:
            loc = full_name.find(match)
            if loc == 0:
                toolcode[7] += '{:03d}'.format(num) + 'd'
                if full_name[:len(match)] not in toolcode[9]:
                    toolcode[9] += full_name[:len(match)]

        if extra:
            loc = full_name.find(extra)
            if loc != -1:
                toolcode[7] += '{:03d}'.format(num) + 'X'
                # Since extras are unique, we only look for each of them
                # once, and they include otherwise impossible characters for
                # this field, it's not possible for the following line to have
                # ever been false.
                # if full_name[loc:loc+len(extra)] not in toolcode[9]:
                toolcode[9] += full_name[loc:loc+len(match)]

    return lname, fname, ''.join(toolcode)


if __name__ == '__main__':
    # Executed as a script: run the docstring examples of this module.
    from doctest import testmod
    testmod()