# -*- coding: utf-8 -*-
#
# A Czech language pronouncer.
#
# Jamie Redmond
# Copyright (c) 2002-2011 OC3 Entertainment, Inc.
# -----------------------------------------------
|
||
|
||
from FxStudio import *
|
||
from FxAnalysis import *
|
||
import re
|
||
|
||
# Maps each character of a pre-processed Czech word to its IPA symbol, or to a
# list of IPA symbols when one character expands to several sounds.
# Upper-case letters and the symbols # $ % & are single-character placeholders
# that are substituted for letter groups during processing (see the per-entry
# comments below).
czech_ipa_map = {
    u'a': u'a',
    u'á': u'a',
    u'b': u'b',
    u'c': u'ʦ',
    u'č': u'ʧ',
    u'd': u'd',
    u'ď': u'ɟ',
    u'd\'': u'ɟ',  # d' is here for completeness but it is replaced with ď during processing.
    u'e': u'ɛ',
    u'é': u'ɛ',
    u'ě': [u'j', u'ɛ'],
    u'f': u'f',
    u'g': u'ɡ',
    u'h': u'ɦ',
    u'H': u'x',  # ch is replaced by H during processing.
    u'i': u'ɪ',
    u'í': u'ɪ',
    u'j': u'j',
    u'k': u'k',
    u'l': u'l',
    u'm': u'm',
    u'n': u'n',
    u'ň': u'ɲ',
    u'N': u'ŋ',  # n is replaced with N when it precedes k or g during processing.
    u'o': u'o',
    u'ó': u'o',
    u'p': u'p',
    u'q': [u'k', u'v'],  # q is here for completeness but this expansion is made explicitly during processing.
    u'r': u'r',
    u'ř': u'r',  # we may want to change this...
    u's': u's',
    u'š': u'ʃ',
    u't': u't',
    u'ť': u'c',
    u't\'': u'c',  # t' is here for completeness but it is replaced with ť during processing.
    u'u': u'u',
    u'ú': u'u',
    u'ů': u'u',
    u'v': u'v',
    u'w': u'v',
    u'x': [u'k', u's'],  # x is here for completeness but this expansion is made explicitly during processing.
    u'y': u'ɪ',
    u'ý': u'ɪ',
    u'z': u'z',
    u'ž': u'ʒ',
    u'Z': u'ʣ',  # dz is replaced by Z during processing.
    u'Ž': u'ʤ',  # dž is replaced by Ž during processing.
    u'A': [u'ɪ', u'j', u'a'],  # ia is replaced by A during processing.
    u'E': [u'ɪ', u'j', u'ɛ'],  # ie is replaced by E during processing.
    u'I': [u'ɪ', u'j', u'ɪ'],  # ii is replaced by I during processing.
    u'O': [u'ɪ', u'j', u'o'],  # io is replaced by O during processing.
    u'U': [u'ɪ', u'j', u'u'],  # iu is replaced by U during processing.
    u'Y': [u'ɔ', u'ɪ'],  # oj is replaced by Y during processing.
    u'#': [u'e', u'ɪ'],  # ej is replaced by # during processing.
    u'$': [u'j', u'e', u'ɪ'],  # ěj is replaced by $ during processing.
    u'%': [u'u', u'ɪ'],  # uj, új, and ůj are replaced by % during processing.
    u'&': [u'o', u'ʊ'],  # ou is replaced by & during processing.
}
|
||
|
||
|
||
# Note: the fact that ř does not appear in the following string and maps is intentional.
#
# The following string contains both the voiced and the unvoiced consonants that are
# relevant to the morphing (voicing assimilation) rules. It is used as the body of a
# regular-expression character class, and the non-ASCII characters are written as
# \u escapes.
#
# Spelled out, the value is: bpdtgkzsZcvfďťhHžšŽč
#
# A plain u'' literal is used rather than ur'': the string holds no backslashes other
# than the \u escapes (which Python 2 also decodes inside ur'' literals), so the value
# is the same, and the ur prefix is a syntax error on Python 3.
morphing_czech_consonants = u'bpdtgkzsZcvf\u010f\u0165hH\u017e\u0161\u017d\u010d'
|
||
|
||
|
||
# Voiced consonant -> its unvoiced counterpart.
# H, Z and Ž are the single-character stand-ins used during processing for
# ch, dz and dž respectively.
czech_voiced_to_unvoiced_map = {
    u'b': u'p',
    u'd': u't',
    u'g': u'k',
    u'z': u's',
    u'Z': u'c',
    u'v': u'f',
    u'ď': u'ť',
    u'h': u'H',
    u'ž': u'š',
    u'Ž': u'č',
}
|
||
|
||
|
||
# Unvoiced consonant -> its voiced counterpart.
# H, Z and Ž are the single-character stand-ins used during processing for
# ch, dz and dž respectively.
czech_unvoiced_to_voiced_map = {
    u'p': u'b',
    u't': u'd',
    u'k': u'g',
    u's': u'z',
    u'c': u'Z',
    u'f': u'v',
    u'ť': u'ď',
    u'H': u'h',
    u'š': u'ž',
    u'č': u'Ž',
}
|
||
|
||
|
||
def debug_trace(str):
    """Emit a trace message when the 'a_devtrace' console switch is on.

    str -- the message text; it is appended to the fixed prefix
           'FxAnalysis: [Czech Language Module]: ' and passed to msgW.

    Returns None. Nothing is emitted when the switch is off.
    """
    # NOTE: the parameter name shadows the built-in str; it is kept as-is so
    # that any caller passing it by keyword keeps working.
    # getConsoleVariableAsSwitch and msgW are not defined in this file;
    # presumably they come from the FxStudio / FxAnalysis star imports.
    if not getConsoleVariableAsSwitch('a_devtrace'):
        return
    msgW('FxAnalysis: [Czech Language Module]: ' + str)
|
||
|
||
|
||
def pronounce_czech_consonant_cluster(consonants):
    """Make a consonant cluster agree in voicing with its last consonant.

    Written to be used as the replacement callback of re.sub.

    consonants -- a regular-expression match object; group(0) is the cluster.

    Returns the cluster text with each consonant replaced as follows:
      * last consonant is v: the cluster is returned unchanged;
      * last consonant is voiced (a key of czech_voiced_to_unvoiced_map):
        every unvoiced consonant is replaced by its voiced counterpart;
      * last consonant is unvoiced (a key of czech_unvoiced_to_voiced_map):
        every voiced consonant is replaced by its unvoiced counterpart;
      * last consonant is in neither map: the cluster is returned unchanged.
    Characters without an entry in the applicable map are copied through.
    """
    cluster = consonants.group(0)
    last_consonant = cluster[-1]

    # Voiced v has no effect on the preceding consonants.
    if last_consonant == u'v':
        return cluster

    if last_consonant in czech_voiced_to_unvoiced_map:
        # The cluster ends voiced, so unvoiced members become voiced.
        conversion = czech_unvoiced_to_voiced_map
    elif last_consonant in czech_unvoiced_to_voiced_map:
        # The cluster ends unvoiced, so voiced members become unvoiced.
        conversion = czech_voiced_to_unvoiced_map
    else:
        # The last consonant takes part in no voicing pair. Leave the text
        # alone instead of returning an empty string, which would make re.sub
        # delete the cluster from the word.
        return cluster

    morphed = u''.join(conversion.get(c, c) for c in cluster)

    debug_trace(' morphing consonant cluster ' + cluster + ' -> ' + morphed)

    return morphed
|
||
|
||
|
||
def _replace_each(word, replacements):
    """Return word after applying each (old, new) str.replace pair in order."""
    for old, new in replacements:
        word = word.replace(old, new)
    return word


def pronounce_czech_word(previous_word, current_word, next_word):
    """Convert one Czech word into a list of IPA symbols.

    previous_word -- not used by this implementation.
    current_word  -- the word to pronounce; it is lower-cased before processing.
    next_word     -- the word that follows; only consulted when current_word is
                     one of the one-letter prepositions v, s or z. May be empty.

    Returns a list of unicode strings, one per IPA symbol, obtained by looking
    up each character of the rewritten word in czech_ipa_map.

    Raises KeyError (after reporting it through errorW) when a character of the
    rewritten word has no entry in czech_ipa_map.

    The regular-expression patterns are written as u'' literals with explicit
    escapes; they have the same values as the equivalent ur'' literals on
    Python 2 and also parse on Python 3.
    """
    # Matches runs of two or more consonants that take part in voicing assimilation.
    cluster_pattern = u'([{0}]{{2,}})'.format(morphing_czech_consonants)

    current_word = current_word.lower()
    original_word = current_word

    debug_trace('pronouncing ' + original_word)

    # Check for one letter prepositions that should voice or devoice based on the
    # first letter of the following word (prepositions are pronounced with the
    # following word and not standalone). The truthiness test on next_word is
    # equivalent to len(next_word) > 0 for strings and also tolerates None.
    if len(current_word) == 1 and next_word:
        if current_word in (u'v', u's', u'z'):
            # Run the digraph replacements on the next word first so that the
            # cluster rule sees ch, dz and dž as single consonants.
            next_word_lowercase = _replace_each(next_word.lower(), (
                (u'ch', u'H'),
                (u'dz', u'Z'),
                (u'dž', u'Ž'),
            ))
            # Join the preposition to the next word, run the same cluster
            # assimilation that is applied to whole words further down, and
            # keep only the (possibly voiced or devoiced) preposition.
            joined_word = current_word + next_word_lowercase
            debug_trace(' joined: ' + joined_word)
            joined_word = re.sub(cluster_pattern, pronounce_czech_consonant_cluster, joined_word)
            current_word = joined_word[0]
            debug_trace(' post: ' + current_word)

    # Replace ch with H for ease of lookup.
    current_word = current_word.replace(u'ch', u'H')

    # Replace certain vowel groups found in foreign words with uppercase letters
    # corresponding to the correct Czech pronunciation for ease of lookup.
    current_word = _replace_each(current_word, (
        (u'ia', u'A'),
        (u'ie', u'E'),
        (u'ii', u'I'),
        (u'io', u'O'),
        (u'iu', u'U'),
    ))

    # Replace qu with kv, then expand any remaining q, and x.
    current_word = _replace_each(current_word, (
        (u'qu', u'kv'),
        (u'q', u'kv'),
        (u'x', u'ks'),
    ))

    # Replace d' and t' with ď and ť.
    current_word = _replace_each(current_word, (
        (u'd\'', u'ď'),
        (u't\'', u'ť'),
    ))

    # Exceptions.
    current_word = _replace_each(current_word, (
        (u'oj', u'Y'),
        (u'ej', u'#'),
        (u'ěj', u'$'),
        (u'uj', u'%'),
        (u'új', u'%'),
        (u'ůj', u'%'),
    ))

    # Replace n with N when it precedes k or g. N maps to ŋ in czech_ipa_map.
    current_word = re.sub(u'n([kg])', u'N\\1', current_word)

    # Replace d, t, and n when followed by i, í, or ě with ď, ť, and ň, respectively.
    # The non-ASCII characters are written as \u escapes inside the patterns.
    current_word = re.sub(u'd([i\u00ed\u011b])', u'\u010f\\1', current_word)
    current_word = re.sub(u't([i\u00ed\u011b])', u'\u0165\\1', current_word)
    current_word = re.sub(u'n([i\u00ed\u011b])', u'\u0148\\1', current_word)

    # If the above replacement happened before ě, replace the ě with e so that
    # there is no double j pronunciation. E.g. dělám should be pronounced
    # as ɟɛlaːm and not ɟjɛlaːm (ignore that we use a instead of aː in our implementation).
    current_word = _replace_each(current_word, (
        (u'ďě', u'ďe'),
        (u'ťě', u'ťe'),
        (u'ňě', u'ňe'),
    ))

    # b, p, v, and f followed by ě are handled implicitly due to ě's pronunciation
    # definition in czech_ipa_map; but mě is pronounced mňe.
    current_word = current_word.replace(u'mě', u'mňe')

    # Temporarily replace dz and dž with Z and Ž, respectively, so that the following
    # assimilation rules are easier to implement.
    current_word = _replace_each(current_word, (
        (u'dz', u'Z'),
        (u'dž', u'Ž'),
    ))

    # Convert the end of the word to unvoiced if applicable. One-letter words
    # are left alone.
    if len(current_word) > 1:
        final = current_word[-1]
        current_word = current_word[:-1] + czech_voiced_to_unvoiced_map.get(final, final)

    # Check for groups of two or more consonants. When such a group is found check
    # the last consonant in the group to see if it is voiced or unvoiced and
    # morph the preceding consonants to match (e.g. they either all become voiced or
    # unvoiced depending on how the last consonant is classified).
    current_word = re.sub(cluster_pattern, pronounce_czech_consonant_cluster, current_word)

    # Undo the dz and dž replacement if any are still present.
    current_word = _replace_each(current_word, (
        (u'Z', u'dz'),
        (u'Ž', u'dž'),
    ))

    # Replace sh with sch (sH). str.replace returns a new string, so the result
    # has to be assigned for the rule to take effect.
    # NOTE(review): s and h are both morphing consonants, so most sh sequences
    # have already been rewritten by the cluster assimilation above; confirm
    # whether this rule is meant to run before that step.
    current_word = current_word.replace(u'sh', u'sH')

    # Replace dz and dž with Z and Ž, respectively, for ease of lookup. This happens
    # at the end because rules above could prevent this from happening.
    current_word = _replace_each(current_word, (
        (u'dz', u'Z'),
        (u'dž', u'Ž'),
    ))

    # Handle the ou diphthong which occurs in native Czech words.
    current_word = current_word.replace(u'ou', u'&')

    # Handle the diphthongs au and eu? I don't think it's necessary for our facial animation
    # purposes since we'd split them out into their components again during coarticulation
    # anyway... plus it's a difficult problem because sometimes they are diphthongs and sometimes
    # they are not (e.g. at morpheme boundaries where there is a glottal stop). These two are only
    # found in foreign words or words of foreign origin. If it's a big deal on certain words their
    # pronunciation can always be overridden in the language dictionary.

    pronunciation = []
    for c in current_word:
        try:
            ipa = czech_ipa_map[c]
        except KeyError:
            # The template is a unicode literal: on Python 2, formatting a byte
            # string with a non-ASCII unicode character raises UnicodeEncodeError,
            # which would hide the KeyError being reported here.
            errorW(u'Czech Language Pronouncer: {0} was not found in czech_ipa_map!'.format(c))
            raise
        # An entry is either a single IPA symbol or a list of symbols.
        if isinstance(ipa, list):
            pronunciation.extend(ipa)
        else:
            pronunciation.append(ipa)

    debug_trace('result = ' + original_word + ' -> ' + u''.join(pronunciation))

    return pronunciation
|
||
|
||
if __name__ == '__main__':
    # Register pronounce_czech_word as the pronouncer for 'Czech' with the
    # 'ipa' tag, then report that the module has been registered.
    # registerLanguagePronouncer and msg are not defined in this file;
    # presumably they come from the FxStudio / FxAnalysis star imports.
    registerLanguagePronouncer('Czech', 'ipa', pronounce_czech_word)
    msg('Czech Language Module registered.')
|
||
|