gtav-src/tools_ng/bin/audio/FaceFx 2012/Analysis Languages/Czech.py

# -*- coding: utf-8 -*-
#
# A Czech language pronouncer.
#
# Jamie Redmond
# Copyright (c) 2002-2011 OC3 Entertainment, Inc.
# -----------------------------------------------

from FxStudio import *
from FxAnalysis import *
import re

# Lookup table from one character of a preprocessed, lowercased Czech word to
# its IPA symbol, or to a list of IPA symbols when a single character expands
# to several sounds. Uppercase letters and punctuation keys are placeholder
# tokens that stand in for multi-letter sequences during processing.
czech_ipa_map = {
    # Vowels. Accented and unaccented forms share one symbol.
    u'a': u'a',
    u'á': u'a',
    u'e': u'ɛ',
    u'é': u'ɛ',
    u'ě': [u'j', u'ɛ'],
    u'i': u'ɪ',
    u'í': u'ɪ',
    u'o': u'o',
    u'ó': u'o',
    u'u': u'u',
    u'ú': u'u',
    u'ů': u'u',
    u'y': u'ɪ',
    u'ý': u'ɪ',

    # Consonants.
    u'b': u'b',
    u'c': u'ʦ',
    u'č': u'ʧ',
    u'd': u'd',
    u'ď': u'ɟ',
    u'f': u'f',
    u'g': u'ɡ',
    u'h': u'ɦ',
    u'j': u'j',
    u'k': u'k',
    u'l': u'l',
    u'm': u'm',
    u'n': u'n',
    u'ň': u'ɲ',
    u'p': u'p',
    u'r': u'r',
    u'ř': u'r',  # we may want to change this...
    u's': u's',
    u'š': u'ʃ',
    u't': u't',
    u'ť': u'c',
    u'v': u'v',
    u'w': u'v',
    u'z': u'z',
    u'ž': u'ʒ',

    # Present for completeness only: these are rewritten during processing
    # (d' -> ď, t' -> ť, q -> kv, x -> ks) before the lookup happens.
    u'd\'': u'ɟ',
    u't\'': u'c',
    u'q': [u'k', u'v'],
    u'x': [u'k', u's'],

    # Placeholder tokens substituted during processing.
    u'H': u'x',                # ch
    u'N': u'ŋ',                # n when it precedes k or g
    u'Z': u'ʣ',                # dz
    u'Ž': u'ʤ',                # dž
    u'A': [u'ɪ', u'j', u'a'],  # ia
    u'E': [u'ɪ', u'j', u'ɛ'],  # ie
    u'I': [u'ɪ', u'j', u'ɪ'],  # ii
    u'O': [u'ɪ', u'j', u'o'],  # io
    u'U': [u'ɪ', u'j', u'u'],  # iu
    u'Y': [u'ɔ', u'ɪ'],        # oj
    u'#': [u'e', u'ɪ'],        # ej
    u'$': [u'j', u'e', u'ɪ'],  # ěj
    u'%': [u'u', u'ɪ'],        # uj, új, and ůj
    u'&': [u'o', u'ʊ'],        # ou
}


# Note that ř is intentionally absent from the following string and maps.

# The following string contains both the voiced and the unvoiced consonants that
# are relevant to the morphing (voicing assimilation) rules. It is spliced into
# a regular expression character class, so it must contain only characters that
# are literal inside [...].
#
# The non-ASCII members are written as \u escapes so the value does not depend
# on how the source file is decoded. A plain u'' literal is used instead of
# ur'': both yield the same string on Python 2, but ur'' is a syntax error on
# Python 3.

#                           u'bpdtgkzsZcvfďťhHžšŽč'
morphing_czech_consonants = u'bpdtgkzsZcvf\u010f\u0165hH\u017e\u0161\u017d\u010d'


# Each voiced consonant paired with its unvoiced counterpart. Z and Ž are the
# placeholder tokens for dz and dž; H is the placeholder token for ch.
czech_voiced_to_unvoiced_map = {
    u'b': u'p',
    u'd': u't',
    u'g': u'k',
    u'z': u's',
    u'Z': u'c',
    u'v': u'f',
    u'ď': u'ť',
    u'h': u'H',
    u'ž': u'š',
    u'Ž': u'č',
}


# Each unvoiced consonant paired with its voiced counterpart. H is the
# placeholder token for ch; Z and Ž are the placeholder tokens for dz and dž.
czech_unvoiced_to_voiced_map = {
    u'p': u'b',
    u't': u'd',
    u'k': u'g',
    u's': u'z',
    u'c': u'Z',
    u'f': u'v',
    u'ť': u'ď',
    u'H': u'h',
    u'š': u'ž',
    u'č': u'Ž',
}


def debug_trace(str):
    """Pass a prefixed trace message to msgW when 'a_devtrace' is switched on.

    str -- the message text appended to the module prefix. Nothing is emitted
           when the 'a_devtrace' console variable reads as off.
    """
    tracing_enabled = getConsoleVariableAsSwitch('a_devtrace')
    if not tracing_enabled:
        return

    msgW('FxAnalysis: [Czech Language Module]: ' + str)


def pronounce_czech_consonant_cluster(consonants):
    """Return a consonant cluster with its voicing matched to its last consonant.

    Used as the replacement callback for re.sub.

    consonants -- a regular expression match object; group(0) is the cluster.

    Returns the cluster with every consonant that has a counterpart in the
    opposite voicing class replaced by that counterpart, so the whole cluster
    agrees with its final consonant. The cluster is returned unchanged when it
    ends in v, or when its final consonant appears in neither voicing map.
    """
    cluster = consonants.group(0)
    last_consonant = cluster[-1]

    # Voiced v has no effect on the preceding consonants.
    if last_consonant == u'v':
        return cluster

    if last_consonant in czech_voiced_to_unvoiced_map:
        # The cluster ends voiced: morph the unvoiced consonants to voiced.
        assimilation_map = czech_unvoiced_to_voiced_map
    elif last_consonant in czech_unvoiced_to_voiced_map:
        # The cluster ends unvoiced: morph the voiced consonants to unvoiced.
        assimilation_map = czech_voiced_to_unvoiced_map
    else:
        # The final consonant has no voicing classification. Leave the cluster
        # alone rather than replacing it with an empty string.
        return cluster

    # Consonants already in the target class are not keys of assimilation_map
    # and therefore pass through untouched.
    morphed = u''.join([assimilation_map.get(c, c) for c in cluster])

    debug_trace('    morphing consonant cluster ' + cluster + ' -> ' + morphed)

    return morphed


def _replace_each(word, replacements):
    # Apply every (old, new) substring replacement to word, in the given order.
    for old, new in replacements:
        word = word.replace(old, new)
    return word


def _assimilate_czech_clusters(word):
    # Find runs of two or more voicing-relevant consonants and let
    # pronounce_czech_consonant_cluster morph each run to match the voicing
    # of its last consonant.
    pattern = u'([{0}]{{2,}})'.format(morphing_czech_consonants)
    return re.sub(pattern, pronounce_czech_consonant_cluster, word)


def pronounce_czech_word(previous_word, current_word, next_word):
    """Return the pronunciation of a Czech word as a list of IPA symbols.

    previous_word -- the word before current_word (currently unused).
    current_word  -- the word to pronounce; it is lowercased before processing.
    next_word     -- the word after current_word. When current_word is one of
                     the one-letter prepositions v, s, or z, the first consonant
                     cluster of next_word decides how the preposition is voiced.
                     May be empty.

    Returns a list of unicode IPA symbols taken from czech_ipa_map.

    Raises KeyError (after reporting through errorW) when the processed word
    contains a character that is not a key of czech_ipa_map.
    """
    pronunciation = []

    current_word = current_word.lower()
    original_word = current_word

    debug_trace('pronouncing ' + original_word)

    # Check for one letter prepositions that should voice or devoice based on the
    # first letter of the following word (prepositions are pronounced with the
    # following word and not standalone). The truthiness test on next_word also
    # tolerates a missing (None) next word.
    if next_word and current_word in (u'v', u's', u'z'):
        # Run the digraph replacements on the next word first so that each
        # digraph counts as a single consonant during assimilation.
        next_word_lowercase = _replace_each(next_word.lower(), (
            (u'ch', u'H'),
            (u'dz', u'Z'),
            (u'dž', u'Ž'),
        ))
        # Now check for voicing or devoicing of the one letter preposition
        # based on the first letter of the next word.
        joined_word = current_word + next_word_lowercase
        debug_trace('    joined: ' + joined_word)
        joined_word = _assimilate_czech_clusters(joined_word)
        # Only the (possibly morphed) preposition itself is kept.
        current_word = joined_word[0]
        debug_trace('    post: ' + current_word)

    current_word = _replace_each(current_word, (
        # Replace ch with H for ease of lookup.
        (u'ch', u'H'),
        # Replace certain vowel groups found in foreign words with uppercase
        # letters corresponding to the correct Czech pronunciation for ease
        # of lookup.
        (u'ia', u'A'),
        (u'ie', u'E'),
        (u'ii', u'I'),
        (u'io', u'O'),
        (u'iu', u'U'),
        # Replace qu with kv, then expand any remaining q, and x.
        (u'qu', u'kv'),
        (u'q', u'kv'),
        (u'x', u'ks'),
        # Replace d' and t' with ď and ť.
        (u'd\'', u'ď'),
        (u't\'', u'ť'),
        # Exceptions.
        (u'oj', u'Y'),
        (u'ej', u'#'),
        (u'ěj', u'$'),
        (u'uj', u'%'),
        (u'új', u'%'),
        (u'ůj', u'%'),
    ))

    # Replace n with N when it precedes k or g. N maps to ŋ in czech_ipa_map.
    current_word = re.sub(u'n([kg])', u'N\\1', current_word)

    # Replace d, t, and n when followed by i, í, or ě with ď, ť, and ň, respectively.
    # The non-ASCII characters are written as \u escapes so the patterns do not
    # depend on the source file encoding.
    current_word = re.sub(u'd([i\u00ed\u011b])', u'\u010f\\1', current_word)
    current_word = re.sub(u't([i\u00ed\u011b])', u'\u0165\\1', current_word)
    current_word = re.sub(u'n([i\u00ed\u011b])', u'\u0148\\1', current_word)

    current_word = _replace_each(current_word, (
        # If the above replacement happened before ě, replace the ě with e so
        # that there is no double j pronunciation. E.g. dělám should be
        # pronounced as ɟɛlaːm and not ɟjɛlaːm (ignore that we use a instead of
        # aː in our implementation).
        (u'ďě', u'ďe'),
        (u'ťě', u'ťe'),
        (u'ňě', u'ňe'),
        # b, p, v, and f followed by ě are handled implicitly due to ě's
        # pronunciation definition in czech_ipa_map; but mě is pronounced mňe.
        (u'mě', u'mňe'),
        # Temporarily replace dz and dž with Z and Ž, respectively, so that
        # the following assimilation rules are easier to implement.
        (u'dz', u'Z'),
        (u'dž', u'Ž'),
    ))

    # Convert the end of the word to unvoiced if applicable.
    if len(current_word) > 1:
        final_consonant = current_word[-1]
        current_word = current_word[:-1] + czech_voiced_to_unvoiced_map.get(final_consonant, final_consonant)

    # Check for groups of two or more consonants. When such a group is found check
    # the last consonant in the group to see if it is voiced or unvoiced and
    # morph the preceding consonants to match (e.g. they either all become voiced or
    # unvoiced depending on how the last consonant is classified).
    current_word = _assimilate_czech_clusters(current_word)

    current_word = _replace_each(current_word, (
        # Undo the dz and dž replacement if any are still present.
        (u'Z', u'dz'),
        (u'Ž', u'dž'),
        # Replace sh with sch (sH). The result has to be assigned back because
        # strings are immutable; the original call discarded it. By this point
        # most sh clusters have already been voiced to zh by the assimilation
        # above, so this only catches clusters that were left untouched (those
        # ending in v).
        (u'sh', u'sH'),
        # Replace dz and dž with Z and Ž, respectively, for ease of lookup. This
        # happens at the end because rules above could prevent this from
        # happening.
        (u'dz', u'Z'),
        (u'dž', u'Ž'),
        # Handle the ou diphthong which occurs in native Czech words.
        (u'ou', u'&'),
    ))

    # Handle the diphthongs au and eu? I don't think it's necessary for our facial animation
    # purposes since we'd split them out into their components again during coarticulation
    # anyway... plus it's a difficult problem because sometimes they are diphthongs and sometimes
    # they are not (e.g. at morpheme boundaries where there is a glottal stop). These two are only
    # found in foreign words or words of foreign origin. If it's a big deal on certain words their
    # pronunciation can always be overridden in the language dictionary.

    for c in current_word:
        try:
            ipa = czech_ipa_map[c]
        except KeyError:
            # The template must be unicode: formatting a byte string with a
            # non-ASCII unicode character raises UnicodeEncodeError on Python 2,
            # which would hide the KeyError being reported here.
            errorW(u'Czech Language Pronouncer: {0} was not found in czech_ipa_map!'.format(c))
            raise

        # Some characters expand to several IPA symbols.
        if isinstance(ipa, list):
            pronunciation.extend(ipa)
        else:
            pronunciation.append(ipa)

    pronunciation_string = u''.join(pronunciation)

    debug_trace('result = ' + original_word + ' -> ' + pronunciation_string)

    return pronunciation

# Register the pronouncer only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == '__main__':
    # NOTE(review): 'Czech' and 'ipa' are presumably the language name and the
    # phonetic alphabet of the symbols returned by pronounce_czech_word --
    # confirm against the FxAnalysis registerLanguagePronouncer documentation.
    registerLanguagePronouncer('Czech', 'ipa', pronounce_czech_word)
    msg('Czech Language Module registered.')