#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ballbach@rten.net

import sys;
import re;
import tempfile;
from optparse import OptionParser, OptionGroup;

class RTF_FileEvents:
	"""Callback interface used by the RTF lexer in this file.

	Every method is a no-op that returns None; a subclass overrides the
	callbacks it is interested in.
	"""

	def group_start(self, group_nr):
		"""Called when an opening brace begins group number `group_nr`."""

	def group_end(self, group_nr):
		"""Called when a closing brace finishes group number `group_nr`."""

	def control_word(self, word, argument = -1, has_space = False):
		"""Called for a backslash-introduced control word.

		`argument` defaults to -1 and `has_space` to False when the caller
		does not supply them.
		"""

	def control_symbol(self, symbol):
		"""Called for a control symbol (backslash plus one symbol character)."""

	def hex_char(self, hex):
		"""Called for a literal hexadecimal character escape."""

	def text(self, symbol):
		"""Called for text that matched none of the other categories."""

	def end_of_doc(self):
		"""Called once when the end of the document has been reached."""

def RTF_FileParse(file, events):
	"""Lex an RTF stream and report every token to an event handler.

	Parameters:
		file:   any iterable that yields the lines of the document as
		        strings (an open file object, a list of lines, ...).
		events: an object providing the RTF_FileEvents callbacks
		        (group_start, group_end, control_word, control_symbol,
		        hex_char, text, end_of_doc).

	Reported tokens:
		"{" / "}"     group_start(n) / group_end(n); n is the nesting depth
		              outside the brace (the outermost group is 0).
		\\word        control_word(word, argument, has_space).  Digits that
		              directly follow the letters stay part of `word`
		              (e.g. "f12"); `argument` is only set, as a positive
		              number, from the digits after a "-" sign, and is -1
		              otherwise.  `has_space` tells whether a single
		              delimiter space was consumed after the word.
		\\| \\~ \\- \\_ \\: \\* \\{ \\} \\\\
		              control_symbol(symbol).
		\\'hh         hex_char(value) with the two hex digits as an int.
		anything else text(char), one character at a time.

	end_of_doc() is called once after the last line.

	Raises:
		ValueError: a backslash is followed by something that is neither a
		            control word, a known control symbol nor a hex escape.
	"""

	control_word_re = re.compile(r"\\(?P<control>\w+)(?P<terminator>(?P<eol>$)|-(?P<num>\d+)|(?P<empty> ?))")
	# "{", "}" and "\" are included so that the standard RTF escapes for
	# literal braces and backslashes are accepted instead of rejected.
	control_symbol_re = re.compile(r"\\((?P<symbol>[|~\-_:*{}\\])|'(?P<hex>[0-9a-fA-F]{2}))")
	group_nr = 0

	for line in file:
		pos = 0
		line_len = len(line)
		while pos < line_len:
			char = line[pos]

			# NOTE(review): line-break characters are reported here AND by
			# the final `else` branch below, i.e. twice.  This is kept
			# as-is because Cyrillsci_Filter.acc_text collapses doubled
			# line feeds again; change both places together.
			if char == "\r" or char == "\n":
				events.text(char)

			# now do simple lexing of what's a control word/statement, and
			# what's just text, and when we start and terminate groups.
			if char == "{":
				events.group_start(group_nr)
				group_nr = group_nr + 1
			elif char == "}":
				group_nr = group_nr - 1
				events.group_end(group_nr)
			elif char == "\\":
				# match in place (no slicing) so long lines stay linear
				word_match = control_word_re.match(line, pos)
				if word_match is not None:
					arg = -1
					if word_match.group("num") is not None:
						arg = int(word_match.group("num"))
					has_space = word_match.group("empty") == " "
					events.control_word(word_match.group("control"), arg, has_space)
					pos = word_match.end()
					continue

				symbol_match = control_symbol_re.match(line, pos)
				if symbol_match is None:
					raise ValueError("Bad RTF trying to match %s" % line[pos:])
				if symbol_match.group("symbol") is not None:
					events.control_symbol(symbol_match.group("symbol"))
				else:
					events.hex_char(int(symbol_match.group("hex"), 16))
				pos = symbol_match.end()
				continue
			else:
				events.text(char)

			pos = pos + 1

	events.end_of_doc()

class Cyrillsci_Filter(RTF_FileEvents):
	"""RTF event handler that re-encodes "Strogij" font text as Cyrillic.

	The filter copies the incoming RTF to `output` while:
	  * scanning the font table for a font whose name contains "Strogij"
	    and reserving the next free font number for a new Cyrillic font,
	    which is appended to the font table as "Arial CYR" (fcharset204);
	  * replacing every switch to the Strogij font with a switch to the
	    new font; and
	  * translating hex-escaped characters that appear while the Strogij
	    font is active through `strogij_table`, writing them as cp1251
	    hex escapes (or as a \\u escape when cp1251 has no such character).

	After a run, `used_default_character` is True when at least one
	character had no table entry and `default_character` was written
	in its place.
	"""

	def __init__(self, output = sys.stdout, debug = False, debug_stream = sys.stderr):
		"""Set up the filter.

		output:       stream that receives the rewritten RTF.
		debug:        when true, a trace of every event is written to
		              `debug_stream`.
		debug_stream: stream that receives the debug trace.
		"""
		self.text_so_far = ""
		self.debug = debug
		self.debug_stream = debug_stream
		self.output_stream = output

		# number of the most recently opened group / most recently seen
		# \fN font; -1 until the first one is encountered
		self.current_group = -1
		self.current_font = -1

		# group number of the font table while we are inside it, else -1
		self.in_fonttable_group = -1
		# font number reserved for the Cyrillic font we add
		self.cyrillic_font_nr = 0
		# font number of the Strogij font, -1 while unknown
		self.strogij_font_nr = -1
		self.font_re = re.compile(r"f(?P<font_nr>\d+)")
		# True while the Strogij font is the active font
		self.process_text = False
		# True while converted characters are being written back to back
		self.in_strogij_run = False

		self.default_character = '%'
		self.used_default_character = False

		# Strogij character code -> unicode character
		self.strogij_table = \
		{
			0x80: u'А', 0x81: u'а', 0x82: u'Б', 0x83: u'б', 0x84: u'В', 0x85: u'в',
			0x86: u'Г', 0x87: u'г', 0x88: u'Ґ', 0x89: u'ґ', 0x8a: u'Д', 0x8b: u'д',
			0x8c: u'Е', 0x8d: u'е', 0x8e: u'Ё', 0x8f: u'ё', 0x90: u'Є', 0x91: u'є',
			0x92: u'Ж', 0x93: u'ж', 0x94: u'З', 0x95: u'з', 0x96: u'И', 0x97: u'и',
			0x98: u'І', 0x99: u'і', 0x9a: u'Ї', 0x9b: u'ї', 0x9c: u'Й', 0x9d: u'й',
			0x9e: u'К', 0x9f: u'к', 0xa0: u'Л', 0xa1: u'л', 0xa2: u'М', 0xa3: u'м',
			0xa4: u'Н', 0xa5: u'н', 0xa6: u'О', 0xa7: u'о', 0xa8: u'П', 0xa9: u'п',
			0xaa: u'Р', 0xab: u'р', 0xac: u'С', 0xad: u'с', 0xae: u'«', 0xaf: u'»',
			0xb0: u'-', 0xb1: u'±', 0xb2: u'≤', 0xb3: u'Б', 0xb4: u'¥', 0xb5: u'µ',
			0xb6: u'∂', 0xb7: u'∑', 0xb8: u'∏', 0xb9: u'π', 0xba: u'∫', 0xbb: u'ª',
			0xbc: u'º', 0xbd: u'Ω', 0xbe: u'æ', 0xbf: u'ø', 0xc0: u' ', 0xc1: u'¡',
			0xc2: u'¬', 0xc3: u'√', 0xc4: u'ƒ', 0xc5: u'≈', 0xc6: u'Δ', 0xc7: u'«',
			0xc8: u'»', 0xc9: u'…', 0xca: u' ', 0xcb: u'À', 0xcc: u'Ã', 0xcd: u'Õ',
			0xce: u'Œ', 0xcf: u' ', 0xd0: u'–', 0xd1: u'—', 0xd2: u'“', 0xd3: u'”',
			0xd4: u'‘', 0xd5: u'’', 0xd6: u'÷',	0xd7: u'◊', 0xd8: u'ÿ', 0xd9: u'Ÿ',
			0xda: u' ', 0xdb: u'€', 0xdc: u'‹', 0xdd: u'›', 0xde: u'ﬁ', 0xdf: u'ﬂ',
			0xe0: u'Т', 0xe1: u'т', 0xe2: u'У', 0xe3: u'у', 0xe4: u'Ў', 0xe5: u'ў',
			0xe6: u'Ф', 0xe7: u'ф', 0xe8: u'Х', 0xe9: u'х', 0xea: u'Ц', 0xeb: u'ц',
			0xec: u'Ч', 0xed: u'ч', 0xee: u'Ш', 0xef: u'ш', 0xf0: u'Щ', 0xf1: u'щ',
			0xf2: u'Ъ', 0xf3: u'ъ', 0xf4: u'Ы', 0xf5: u'ы', 0xf6: u'Ь', 0xf7: u'ь',
			0xf8: u'Э', 0xf9: u'э', 0xfa: u'Ю', 0xfb: u'ю', 0xfc: u'Я', 0xfd: u'я',
			0xfe: u'„', 0xff: u'\''
		}

	def group_start(self, group_nr):
		"""Flush pending text, copy the "{" and remember the group number."""
		self.acc_text()
		self.debug_print("Group start: %d" % group_nr)
		self.output_print("{")
		self.current_group = group_nr

	def group_end(self, group_nr):
		"""Flush pending text and copy the "}".

		When the group being closed is the font table, the definition of
		the new Cyrillic font is written just before the closing brace.
		"""
		self.acc_text()
		if self.in_fonttable_group == group_nr:
			# okay, the fonttable is about to end - add our new cyrillic font
			self.in_fonttable_group = -1
			self.output_print("\n{\\f%d\\fswiss\\fcharset204{\\*\\fname Arial;}Arial CYR;}\n" % self.cyrillic_font_nr)

		self.debug_print("Group end: %d" % group_nr)
		self.output_print("}")

	def control_word(self, word, argument = -1, has_space = False):
		"""Copy a control word, tracking fonts and rewriting Strogij switches.

		word:      control word without the backslash; digits directly
		           following the letters are part of it (e.g. "f12").
		argument:  digits that followed a "-" sign, or -1 for none.
		has_space: whether a delimiter space followed the control word.
		"""
		self.acc_text()
		self.debug_print("Word: %s / %d / %s" % (word, argument, has_space))

		do_output = True

		# if we're starting the font table, we're almost ready to write our new
		# cyrillic font information
		if word == "fonttbl":
			self.in_fonttable_group = self.current_group
			self.debug_print("Font table at group %d" % self.current_group)
		elif self.process_text and word == "bullet":
			# believe it or not, '\bullet' appears to be н
			self.strogij(chr(0xa5))
			do_output = False
		else:
			# track the current font, reserve a font # for the new cyrillic
			# font while reading the font table
			fm = self.font_re.match(word)
			if fm is not None:
				f_nr = int(fm.group("font_nr"))
				self.debug_print("Font: %d" % f_nr)
				self.current_font = f_nr
				if self.in_fonttable_group > -1 and f_nr >= self.cyrillic_font_nr:
					self.cyrillic_font_nr = f_nr + 1
				elif self.strogij_font_nr == f_nr:
					self.process_text = True
					word = "f%d" % self.cyrillic_font_nr
				else:
					self.process_text = False

		if do_output:
			pieces = ["\\", word]
			if argument > -1:
				pieces.append("-%d" % argument)
			# the delimiter space always goes last, after any argument
			if has_space:
				pieces.append(" ")
			self.output_print("".join(pieces))

	def control_symbol(self, symbol):
		"""Flush pending text and copy the control symbol unchanged."""
		self.acc_text()
		self.debug_print("Symbol: %s" % symbol)
		self.output_print("\\%s" % symbol)

	def text(self, string):
		"""Accumulate plain text until the next non-text event flushes it.

		Spaces are stored as the hex escape \\'20 so that they can never be
		mistaken for the delimiter of a control word written before them.
		"""
		if self.in_strogij_run:
			# if we're in the middle of processing strogij characters, then we
			# have to cancel that, and add a new language tag.  The trailing
			# space is the control word delimiter: without it, text starting
			# with a digit would be read as part of the language number.
			self.in_strogij_run = False
			self.text_so_far += "\\lang1033 "

		if string == " ":
			self.text_so_far += "\\'20"
		else:
			self.text_so_far += string

	def hex_char(self, hex):
		"""Convert a hex-escaped character, or copy it when not in Strogij text."""
		self.acc_text()
		self.debug_print("Hex: 0x%02x" % hex)

		if self.process_text:
			self.strogij(chr(hex))
		else:
			self.output_print("\\'%02x" % hex)

	def strogij(self, chars):
		"""Write the Cyrillic equivalent of each Strogij character in `chars`.

		The first character of a run is preceded by the language tag and
		the switch to the Cyrillic font.  Characters missing from
		`strogij_table` are replaced by `default_character` and recorded
		in `used_default_character`.
		"""
		if not self.in_strogij_run:
			self.output_print("\\lang1049\\f%d " % self.cyrillic_font_nr)
			self.in_strogij_run = True

		for char in chars:
			code = ord(char)
			if code in self.strogij_table:
				converted = self.strogij_table[code]
			else:
				self.used_default_character = True
				converted = self.default_character

			try:
				encoded = converted.encode("cp1251")
			except UnicodeEncodeError:
				# if it isn't valid CP1251, add the raw unicode character. we
				# could do this for everything, I suppose.
				# RTF stores the \u parameter as a signed 16-bit number, so
				# code points above 32767 are written as negative values.
				code_point = ord(converted)
				if code_point > 0x7fff:
					code_point = code_point - 0x10000
				self.output_print("\\uc1\\u%d?" % code_point)
			else:
				# bytearray yields integers on both Python 2 and Python 3
				for byte in bytearray(encoded):
					self.output_print("\\'%02x" % byte)

	def end_of_doc(self):
		"""Write any text still pending and flush the output stream."""
		# always flush, so trailing text is not lost when debug is off
		self.acc_text()
		self.output_stream.flush()

	def acc_text(self):
		"""Write out the accumulated text and reset the accumulator.

		While inside the font table the text is also searched for
		"Strogij"; on a hit the current font number is remembered as the
		Strogij font.
		"""
		if not self.text_so_far:
			return

		# get rid of so much line break junk for the output
		self.debug_print("Text: %s" % self.text_so_far.replace("\n", "\\n").replace("\r", "\\r"))
		self.text_so_far = self.text_so_far.replace("\r", "").replace("\n\n", "\n")
		self.output_print(self.text_so_far)

		if self.in_fonttable_group > -1 and "Strogij" in self.text_so_far:
			self.debug_print("Found strogij font: %d" % self.current_font)
			self.strogij_font_nr = self.current_font

		self.text_so_far = ""

	def debug_print(self, str):
		"""Write `str` plus a newline to the debug stream when debugging is on."""
		if self.debug:
			self.debug_stream.write(str + "\n")

	def output_print(self, str):
		"""Write `str` to the output stream exactly as given."""
		self.output_stream.write(str)

# some defaults
default_font_str = "\\fswiss\\fcharset204{\\*\\fname Arial;}Arial CYR;"

# parse options
parser = OptionParser()
group = OptionGroup(parser, "Output Options")
group.add_option("-o", "--output_file",		action="store",			dest="output_file",	type="string",		default="-")
group.add_option("-p", "--in_place",		action="store_true",	dest="in_place",	default=False,		help="Modify files in place")
group.add_option("-d", "--debug",			action="store_true",	dest="debug",		default=False)
parser.add_option_group(group)

(options, args) = parser.parse_args()
if len(args) < 1:
	sys.stderr.write("error: no filename specified\n")
	sys.exit(1)

# Convert every file named on the command line.  The result goes to a
# temporary file that later replaces the input (--in_place), to standard
# output (the default, "-"), or to the file given with --output_file.
for filename in args:

	if options.in_place:
		output_file = tempfile.TemporaryFile("w+b")
	elif options.output_file == "-":
		output_file = sys.stdout
	else:
		output_file = open(options.output_file, "w")

	try:
		sys.stderr.write("Starting to process '%s'\n" % filename)
		input_file = open(filename, "r")
		try:
			events = Cyrillsci_Filter(output = output_file, debug = options.debug)
			RTF_FileParse(input_file, events)
		finally:
			# the input must be closed before it can be overwritten below
			input_file.close()

		if events.used_default_character:
			sys.stderr.write("Warning: had to use a default character on '%s'\n" % filename)

		if options.in_place:
			# overwrite input now
			output_file.seek(0, 0)
			overwrite = open(filename, "wb")
			try:
				while True:
					line = output_file.readline()
					if not line:
						break
					overwrite.write(line)
			finally:
				overwrite.close()
	finally:
		# standard output stays open for any following file
		if output_file is not sys.stdout:
			output_file.close()


