#!/usr/bin/python
# coding=utf8
# Copyright (C) 2007 Mashrab Kuvatov
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

"""Convert a UTF-8 encoded Cyrillic Uzbek text file into Latin Uzbek.

Usage: script.py cyr.txt lat.txt
"""

import io
import sys

# Both cases of the letter after which 'ҳ' gets an apostrophe prefix.
_PREFIX_TRIGGERS = (u'с', u'С')

# Characters treated as vowels when deciding how to render 'ц'.
_VOCAL_CHARS = (u'а', u'и', u'е', u'о', u'ў', u'у',
                u'А', u'И', u'Е', u'О', u'Ў', u'У')

# Uzbek-specific letters that lie outside the contiguous а..я / А..Я ranges.
_EXTRA_CYR_CHARS = (u'ў', u'ҳ', u'қ', u'ғ', u'ё',
                    u'Ў', u'Ҳ', u'Қ', u'Ғ', u'Ё')

# Latin equivalents of the 32 consecutive code points 'а'..'я',
# indexed by ord(char) - ord(u'а').
_LOOKUP_TBL = (u'a', u'b', u'v', u'g', u'd', u'e', u'j', u'z',
               u'i', u'y', u'k', u'l', u'm', u'n', u'o', u'p',
               u'r', u's', u't', u'u', u'f', u'x', u'ts', u'ch',
               u'sh', u'sh', u'ʼ', u'i', u'', u'e', u'yu', u'ya')

# Two-letter outputs that are fully upper-cased (rather than capitalized)
# when the surrounding text is upper case.
_DIGRAPHS = (u'ch', u'sh', u'yo', u'yu', u'ya', u'ye', u'ts')


def needsPrefix(inStr, curIndx):
    """Return True if the character before ``curIndx`` is 'с' or 'С'.

    Used to decide whether 'h' must be written as 'ʼh' after 's'.
    Returns False when ``curIndx`` is the first position.
    """
    return curIndx > 0 and inStr[curIndx - 1] in _PREFIX_TRIGGERS


def isPrevVocal(inStr, curIndx):
    """Return True if the character before ``curIndx`` is a vowel.

    Returns False when ``curIndx`` is the first position.
    """
    return curIndx > 0 and inStr[curIndx - 1] in _VOCAL_CHARS


def needsUpperCasing(inStr, curIndx):
    """Return True if a neighbour of ``inStr[curIndx]`` is upper case.

    The previous character is checked first; the next character is only
    consulted when the previous one is missing or not upper case.
    """
    if curIndx > 0 and inStr[curIndx - 1].isupper():
        return True
    if curIndx < len(inStr) - 1:
        return inStr[curIndx + 1].isupper()
    return False


def convertLine(cyr_str):
    """Convert the given Cyrillic Uzbek string into its Latin form.

    Characters that are not Cyrillic letters are copied unchanged.
    An empty input is returned as is.
    """
    if not cyr_str:
        return cyr_str

    latParts = []
    isWordBegin = True
    for curCyrIndx, curCyrChar in enumerate(cyr_str):
        isCyrillic = (u'а' <= curCyrChar <= u'я' or
                      u'А' <= curCyrChar <= u'Я' or
                      curCyrChar in _EXTRA_CYR_CHARS)
        if isCyrillic:
            wasCurCharUpper = curCyrChar.isupper()
            # All rules below are expressed on the lower-case letter;
            # the case is re-applied afterwards.
            curCyrChar = curCyrChar.lower()
            if curCyrChar == u'ў':
                curLatChar = u'oʻ'
            elif curCyrChar == u'ҳ':
                if needsPrefix(cyr_str, curCyrIndx):
                    curLatChar = u'ʼh'
                else:
                    curLatChar = u'h'
            elif curCyrChar == u'қ':
                curLatChar = u'q'
            elif curCyrChar == u'ғ':
                curLatChar = u'gʻ'
            # TODO: Convert 'ё' properly
            elif curCyrChar == u'ё':
                curLatChar = u'yo'
            elif curCyrChar == u'е' and isWordBegin:
                curLatChar = u'ye'
            elif curCyrChar == u'ц':
                # 'ts' only after a vowel inside a word, otherwise 's'.
                if isWordBegin or not isPrevVocal(cyr_str, curCyrIndx):
                    curLatChar = u's'
                else:
                    curLatChar = u'ts'
            # TODO: Take care of 'ю' 'я'
            else:
                curLatChar = _LOOKUP_TBL[ord(curCyrChar) - ord(u'а')]

            if wasCurCharUpper:
                if (curLatChar in _DIGRAPHS and
                        needsUpperCasing(cyr_str, curCyrIndx)):
                    curLatChar = curLatChar.upper()
                elif curLatChar == u'ʼh':
                    # capitalize() would leave the 'h' lower case because
                    # the first character is the apostrophe.
                    curLatChar = u'ʼH'
                else:
                    curLatChar = curLatChar.capitalize()
        else:
            curLatChar = curCyrChar

        latParts.append(curLatChar)
        isWordBegin = not curCyrChar.isalnum()

    return u''.join(latParts)


def main(argv=None):
    """Run the converter: read ``argv[1]`` and write the result to ``argv[2]``.

    Prints a usage message and exits with status 1 unless exactly two
    file names are given. ``argv`` defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) != 3:
        print('Converts UTF-8 encoded Cyrillic Uzbek text file into '
              'Latin Uzbek text file.')
        print('Usage: %s cyr.txt lat.txt' % argv[0])
        sys.exit(1)

    cyr_filename = argv[1]
    lat_filename = argv[2]

    # newline='' keeps the original line endings untouched in both directions.
    with io.open(cyr_filename, 'r', encoding='utf-8', newline='') as cyr_file:
        latLines = [convertLine(eachCyrLine) for eachCyrLine in cyr_file]

    # The input is read completely and closed before the output is opened,
    # so passing the same path for both arguments still works.
    with io.open(lat_filename, 'w', encoding='utf-8', newline='') as lat_file:
        lat_file.write(u''.join(latLines))


# beginning of the program
if __name__ == '__main__':
    main()