#!/usr/bin/env python
# Preprocessor for serto font for the use with LaTeX.
# Copyright 2001-2003 by Johannes Heinecke
# you can use this and change it as you wish, under the condition
# that the original copyright line is not deleted
# Path of the font description file that Serto.__init__ opens and parses
# into its glyph table and transliteration table.
FONTFILE="syriac.font" # specify absolute path
# TODO: : mit transkription
# sErtO: capital (large) vowels are placed below the line
import re, sys, string
class Serto:
    """Convert transcribed Syriac text into Serto font codes and LaTeX markup.

    The glyph table and the transliteration table are read from a font
    description file when the object is created.

    Attributes set by __init__:
      elatex       -- when false, texify() reverses its output
      fontname     -- second field of the "#FONT:" line ("" if there is none)
      tabelle      -- token -> (isol, init, med, fin, link) tuple of integers
      transtabelle -- token -> transliteration string
      inlineS, inlineT, inlineST -- compiled patterns used for inline markup
    """

    # Longest transcription key the tokenizers try to match.
    _MAX_KEYLEN = 5
    _DIGITS = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
    # Tokens that never produce a shadda when they are repeated.
    _NO_SHADDA = ("~",) + _DIGITS
    # Tokens after which a letter is not considered joined.
    _BREAKS = ("~", "!", ",", ".", ";", "?")

    def __init__(self, elatex=0, fontfile=None):
        """Load the font description file.

        elatex   -- stored as self.elatex (see texify)
        fontfile -- path of the font description file; the module-level
                    FONTFILE is used when this is None

        Lines before a "#TRANS:" line fill self.tabelle and need at least
        seven whitespace-separated fields (fields 2..6 are parsed as
        integers; a non-numeric field raises ValueError).  Lines after it
        fill self.transtabelle.  Other lines starting with "#" are skipped.
        """
        self.elatex = elatex # eLaTeX needs \TeXXeTstate=1
        if fontfile is None:
            fontfile = FONTFILE
        fp = open(fontfile, "r")
        try:
            lines = fp.readlines()
        finally:
            # close the file even if reading fails
            fp.close()
        # NOTE(review): the delimiter groups of these three patterns are
        # empty; presumably the original delimiter text was lost somewhere.
        # Confirm against the upstream file before relying on them.
        self.inlineS = re.compile("()(.*?)()")
        self.inlineT = re.compile("()(.*?)()")
        self.inlineST = re.compile("()(.*?)()")
        self.tabelle = {} # "_d": (isol, init, med, fin, link)
        self.transtabelle = {} # "_d": \d{d}
        self.fontname = ""
        status = "serto"
        for z in lines:
            if len(z) < 2:
                continue
            if z[0] == "#":
                if z[:6] == "#FONT:":
                    a = z.split()
                    if len(a) > 1:
                        self.fontname = a[1].strip()
                elif z[:7] == "#TRANS:":
                    status = "transliterate"
                continue
            felder = z.split()
            if not felder:
                # whitespace-only line: treat it like an empty line
                continue
            if status == "serto":
                if len(felder) < 7:
                    # z still carries its line end, as in the old message
                    sys.stdout.write("ERROR: %s\n" % z)
                else:
                    self.tabelle[felder[0]] = (int(felder[2]),
                                               int(felder[3]),
                                               int(felder[4]),
                                               int(felder[5]),
                                               int(felder[6]))
            elif len(felder) < 2:
                # no replacement given: the key stands for itself
                self.transtabelle[felder[0]] = felder[0]
            else:
                self.transtabelle[felder[0]] = felder[1]

    def tokenize(self, str, xlen):
        """Split the transcription str into self.tokens for syriacise().

        str  -- transcription with blanks already replaced by "~"
        xlen -- number of characters of str to process

        A backslash followed by ASCII letters becomes one command token,
        "{" and "}" become tokens of their own, and everything else is
        matched against the keys of self.tabelle, longest key first.
        Characters that match no key are dropped.  self.digits is reset
        to an empty list.
        """
        ix = 0
        self.tokens = []
        self.digits = []
        while ix < xlen:
            ch = str[ix]
            if ch == "\\":
                start = ix
                ix = ix + 1
                while ix < xlen and str[ix] in string.ascii_letters:
                    ix = ix + 1
                self.tokens.append(str[start:ix])
            elif ch in "{}":
                self.tokens.append(ch)
                ix = ix + 1
            else:
                for ll in range(self._MAX_KEYLEN, 0, -1):
                    piece = str[ix:ix + ll]
                    if len(piece) < ll:
                        # The candidate would run past the end of the text.
                        # Skipping it keeps ll equal to the real key length,
                        # so the single-vowel test below also works for the
                        # last character.
                        continue
                    if piece not in self.tabelle:
                        continue
                    if ll == 1 and piece in "aeiou" \
                       and (not self.tokens or self.tokens[-1] == "~"):
                        # vowel at the start of a word: emit "'" before it
                        self.tokens.extend(["'", piece])
                    elif self.tokens \
                         and piece == self.tokens[-1] \
                         and self.tabelle[piece][4] != 3 \
                         and piece not in self._NO_SHADDA:
                        # repeated token: insert shadda
                        self.tokens.append("Q")
                    else:
                        self.tokens.append(piece)
                    ix = ix + ll
                    break
                else:
                    # no key matches here: drop the character
                    ix = ix + 1

    def transtokenize(self, str, xlen):
        """Split str into self.tokens using the keys of self.transtabelle.

        Works like tokenize() but without command/brace handling and
        without shadda insertion; a vowel at the start of a word becomes a
        single token with "'" prepended.  self.digits is reset to an empty
        list.
        """
        ix = 0
        self.tokens = []
        self.digits = []
        while ix < xlen:
            for ll in range(self._MAX_KEYLEN, 0, -1):
                piece = str[ix:ix + ll]
                if len(piece) < ll:
                    # see tokenize(): keep ll equal to the real key length
                    continue
                if piece not in self.transtabelle:
                    continue
                if ll == 1 and piece in "aeiou" \
                   and (not self.tokens or self.tokens[-1] == "~"):
                    self.tokens.append("'" + piece)
                else:
                    self.tokens.append(piece)
                ix = ix + ll
                break
            else:
                ix = ix + 1

    def transliterate(self, syrisch):
        """Return the transliteration of the transcription syrisch.

        Blanks are turned into "~" for tokenizing and come back as blanks;
        tokens without an entry in self.transtabelle are copied unchanged.
        """
        syrisch = syrisch.replace(" ", "~")
        self.transtokenize(syrisch, len(syrisch))
        ret = []
        for tok in self.tokens:
            if tok == "~": # blank
                ret.append(" ")
            else:
                ret.append(self.transtabelle.get(tok, tok))
        return "".join(ret)

    def syriacise(self):
        """Replace self.tokens by font characters, depending on context.

        Command and brace tokens are copied unchanged, tokens whose medial
        code is -1 are skipped, and each run of digits is emitted in
        reversed order.  Sets self.maxlen and returns the resulting string.
        """
        out = []
        digits = []
        self.maxlen = len(self.tokens)
        for i in range(self.maxlen):
            tok = self.tokens[i]
            if tok[0] in "\\{}":
                # NOTE(review): a pending digit run is not flushed here, so
                # its digits end up after this token - confirm this is
                # intended.
                out.append(tok)
                continue
            forms = self.tabelle[tok]
            if forms[2] == -1:
                continue
            glyph = chr(forms[self.context(i)])
            if tok in self._DIGITS:
                digits.append(glyph)
            else:
                if digits:
                    digits.reverse()
                    out.extend(digits)
                    digits = []
                out.append(glyph)
        if digits:
            digits.reverse()
            out.extend(digits)
        return "".join(out)

    def context(self, ix):
        """returns 0 if letter is isolated
        1 if letter is initial
        2 if letter is medial
        3 if letter is final"""
        joined_before = self.before(ix)
        joined_after = self.next(ix)
        if joined_before and joined_after:
            return 2
        if joined_before:
            return 3
        if joined_after:
            return 1
        return 0

    def next(self, ix):
        """returns 1 if next token is a letter

        Tokens whose link value is 2 or 3 are skipped while searching."""
        for i in range(ix + 1, self.maxlen):
            tok = self.tokens[i]
            if tok[0] in "\\{}":
                return 0
            if self.tabelle[tok][4] in (2, 3):
                continue
            if tok in self._BREAKS:
                return 0
            return 1
        return 0

    def before(self, ix):
        """returns 1 if preceding token is a letter with a non-zero link value

        Tokens whose link value is 2 are skipped while searching."""
        for i in range(ix - 1, -1, -1):
            tok = self.tokens[i]
            if tok[0] in "\\{}":
                return 0
            link = self.tabelle[tok][4]
            if link == 2:
                continue
            if tok == "~" or link == 0:
                return 0
            return 1
        return 0

    def convert(self, transcript):
        """Interface function: return the font string for transcript."""
        # dummy blank
        transcript = transcript.replace(" ", "~")
        self.tokenize(transcript, len(transcript))
        return self.syriacise()

    def texify(self, word):
        """Return LaTeX source for the transcription word.

        Character codes below 16 wrap the previous item in the upperserto
        macro and codes 16..31 in the lowerserto macro (with "A" as the
        argument when there is no previous item).  Codes below 127 other
        than 34-38 and 95 are copied; all remaining codes are written as
        char commands.  The item list is reversed unless self.elatex is
        true.
        """
        res = []
        # Call convert() on this instance; this used to go through a
        # module-level variable named "serto".
        for ll in self.convert(word):
            code = ord(ll)
            if code < 16:
                if res:
                    res[-1] = "\\upperserto{%d}{%s}" % (code, res[-1])
                else:
                    res.append("\\upperserto{%d}{A}" % code) # A: Olaf
            elif code < 32:
                if res:
                    res[-1] = "\\lowerserto{%d}{%s}" % (code, res[-1])
                else:
                    res.append("\\lowerserto{%d}{A}" % code)
            elif code < 127 and code not in (34, 35, 36, 37, 38, 95): # 95: underscore
                res.append(ll)
            else:
                res.append("\\char%d{}" % code)
        if not self.elatex:
            res.reverse()
        return "".join(res)

    def inlineserto(self, matchobject):
        """re.sub callback: typeset group 2 of the match in Serto."""
        return "{\\serto\\beginR %s\\endR}" % self.texify(matchobject.group(2))

    def inlinetrans(self, matchobject):
        """re.sub callback: emphasised transliteration of group 2."""
        return "\\emph{%s}" % self.transliterate(matchobject.group(2))

    def inlinesertotrans(self, matchobject):
        """re.sub callback: Serto text followed by its transliteration."""
        return "{\\serto\\beginR %s\\endR} \\emph{%s}" \
               % (self.texify(matchobject.group(2)),
                  self.transliterate(matchobject.group(2)))
#-------------------------------------------------------
if __name__ == "__main__":
    # Command-line driver: reads the input file named on the command line
    # line by line and writes the processed text to standard output.
    # Marker lines switch between the modes "latin", "serto" and "trans";
    # in "latin" mode the three inline patterns of Serto are applied.
    sys.stderr.write("serto - TeX - preprocessor\n(c) Johannes Heinecke\n")
    import getopt
    elatex = 0
    optlist, comargs = getopt.getopt(sys.argv[1:], "e")
    for (o, a) in optlist:
        if o == "-e":
            elatex = 1
    if not comargs:
        # also covers "serto.py -e" without an input file, which used to
        # fail with an IndexError
        sys.stderr.write("usage:\n serto.py [-e] inputfile\n")
        sys.stderr.write(" -e: for usage with elatex\n\n")
    else:
        sys.stderr.write("\n")
        # Keep this name at module level: Serto.texify may look it up as a
        # global.
        serto = Serto(elatex=elatex)
        fp = open(comargs[0])
        try:
            mode = "latin"
            z = fp.readline()
            while z:
                # NOTE(review): the marker strings compared below are empty
                # literals; presumably the original marker text was lost.
                # As written the third and fourth branch cannot be reached -
                # confirm against the upstream file.
                if z[:-1] == "":
                    # must be on a single line (will be deleted)
                    if not elatex:
                        sys.stderr.write("using without the -e option (and elatex) may not work!\n")
                    mode = "serto"
                    sys.stdout.write('{\\serto\\beginR %\n')
                elif z[:-1].strip() == "":
                    mode = "latin"
                    # writing '\\endR}%' here causes problems in the last line
                    sys.stdout.write('}%\n')
                elif z[:-1] == "":
                    mode = "trans"
                    sys.stdout.write('{\\it %\n')
                elif z[:-1].strip() == "":
                    mode = "latin"
                    sys.stdout.write('}%\n')
                elif mode == "latin":
                    a = serto.inlineS.sub(serto.inlineserto, z)
                    b = serto.inlineT.sub(serto.inlinetrans, a)
                    c = serto.inlineST.sub(serto.inlinesertotrans, b)
                    sys.stdout.write(c)
                elif mode == "trans":
                    sys.stdout.write(serto.transliterate(z) + "\n")
                elif z[:-1] == "":
                    # The old "print ..., " left a pending blank that the
                    # next print emitted; write that blank directly.
                    sys.stdout.write("\n\\beginR ")
                else:
                    sys.stdout.write(serto.texify(z) + "\n")
                z = fp.readline()
        finally:
            # close the input file even if processing fails
            fp.close()