#!/usr/bin/env python

#TODO: lexical rules
#TODO: unaligned words
#TODO: composed rules

import sys
import fileinput
import copy

import myutils
import ptb2xrs
import align_tool
import tree

logs = sys.stderr

lexlist = ['.', ',', '``', "''", "-LRB-", "-RRB-"]

class Span(set):
	''' a wrapper of the built-in set type.
	note: the result of operations (say, union "|") is still a span.
	you do not need to override them.'''

	def __init__(self, contents=[]):
		set.__init__(self, contents)		

	def split(self):
		a = sorted(self)
		a.append(-1)	## dummy end
		last = low = a[0]
		res = []
		for i in a[1:]:
			if i != last + 1:
				res.append((low, last))
				low = i
			last = i
		return res

	@staticmethod
	def split2str(slist):
		def item2str(item):
			if item[0] == item[1]:
				return str(item[0])
			else:
				return str(item[0]) + "-" + str(item[1])
			
		return [item2str(item) for item in slist]

	def range_and_missing(self, unaligned_words, top_level=False, cmax=None):
		''' caution: return a range (as a Span) and missing elements (as a set)'''
		if not self:
			return Span(), Span(), Span()
		else:
			if top_level:
				mini, maxi = 0, cmax - 1
			else:
				mini, maxi = min(self), max(self)
			r = Span(range(mini, maxi+1))
			missing = (r - self) & unaligned_words			
			return r, self | missing, missing		
		
	def __str__(self):
		return "[" + ",\\,".join(self.split2str(self.split())) + "]"

	def getwords(self, words):
		# lhuang: omit boundary words
		return ""
		if len(self) == 1:
			return words[min(self)]
		else:
			return "%s..%s" % (words[min(self)], words[max(self)])

class Tree2Str(tree.Tree):

	def __init__(self, etree, alignment, unaligned_words, dep=0, cwords=""):
		'''must be the case the etree has already been set espan.'''
		tree.Tree.__init__(self, etree.node, etree.type, etree.children)
		self.espan = etree.span

		if self.children:			
			self.children = [ Tree2Str(sub, alignment, unaligned_words, dep + 1) for sub in self.children ]
			span = Span()
			for sub in self.children:
				span |= sub.cspan
		else:
			pos = self.espan
			span = Span([c for (e, c) in alignment if e == pos])

		## tricky here: only the top-level needs the sentence span. also, cwords is "static" for all nodes.
		if dep == 0:
			cmax = len(cwords)
			self.__class__.cwords = cwords
		else:
			cmax = None
			
		self.reaches, self.cspan, self.unaligned = span.range_and_missing(unaligned_words, dep==0, cmax)
		self.outspan = Span()

	def pp(self, dep=0):
		print " |	 " * dep,
		if hasattr(self, "variable"):
			print "x%d:" % self.variable,
		print self.node, "sp=%s, un=%s, out=%s" % (self.cspan, self.unaligned, self.outspan)
			
		if self.children:
			for sub in self.children:
				sub.pp(dep + 1)

	def str_tex(self, dep=0):
		if self.children:
			s = "( "
			if hasattr(self, "variable"):
				s += "\\circlenode{A}{#x_{%d}#}" % self.variable
			s += self.node + \
				 "#^{%s}_{%s}#" % (str(self.outspan)[1:-1],	 str(self.cspan)[1:-1])	 + " " # no []
				##	"#^{%s}#" % str(self.reaches)[1:-1] + \		 
				 
			for sub in self.children:
				s += sub.str_tex(dep+1) + " "
			s += ")"
		else:
			s = self.node
		return s

	def get_perm_rhs(self, a):
		'''get the permutation and rhs from a list of spans by sorting'''

		## e.g. input a = [4, 0, 2-3]		
		target_words = self.reaches
		for span in a:
			target_words -= span			
		
		a = [(min(span), ind, span) for (ind, span) in enumerate(a)] \
				+ [(i, self.cwords[i], None) for i in target_words]
				
		## now a = [(4, 0, 4), (0, 1, 0), (2, 2, 2-3)]
		a.sort()
		## a = [(0,1, 0), (2,2, 2-3), (4,0, 4)]; returns the permutation (1,2,0)
		perm = []# [ind for (_, ind, _) in a]
			
##		rhs = " ".join([("x%d:(\'%s\')" % (ind, span.getwords(self.cwords))) for (_, ind, span) in a])
		rhs = " ".join([(("x%d" % x) if type(x) is int else x) for (_, x, _) in a])
		return perm, rhs
		
	def get_rules(self, rules, dep=0, perm=None):

		if perm is None:
			perm = []

		if self.children and hasattr(self, "variable"):

			## i'm a frontier node. start a new rule. pass an empty list to hold the new get_rules
			a = []
			lhs = self.node + "(" + " ".join([sub.get_rules(rules, dep+1, a) for sub in self.children]) + ")"
			p, rhs = self.get_perm_rhs(a)
			
			rules.append("%s -> %s" % (lhs, rhs))

			## now, do something for your parent
			perm.append(self.cspan)
			
##			return "x%d:%s(\'%s\')" %	 (var, self.node, self.ellipse())
			return "x%d:%s" %	 (self.variable, self.node)
			
		else:
			## i'm not a frontier node
			if self.children:
				## i am not a variable node, but recursion goes on
				return self.node + "(" + " ".join([sub.get_rules(rules, dep+1, perm) for sub in self.children]) + ")"
			else:
				# leaf
				return '"%s"' % self.node
			
	def ghkm(self, var=0, dep=0):
		
		if dep == 0:
			self.outspan = Span()
			self.variable = 0 # root is also a frontier node
			
		if self.children:
			for sub in self.children:
				sub.outspan = copy.copy(self.outspan)
				for sibling in self.children:
					if sibling is not sub:
						sub.outspan |= sibling.cspan
				if sub.children and	 \
						 sub.reaches and sub.node not in lexlist and not (len(self.children) == 1 and var == 0) \
						 and not(sub.reaches & sub.outspan):

					## found another variable
					
					sub.variable = var
					sub.ghkm(0, dep+1)
					var += 1
				else:
					var = sub.ghkm(var, dep+1)
					
		return var

def do_ghkm(etree, cwords, align):
	if type(etree) is str:
		_, etree = tree.xrsparse(ptb2xrs.ptb2xrs(etree))
		
	unaligned_words = set(range(0, len(cwords))) - set([c for (e, c) in align])

	etree.setSpans()
	t2s = Tree2Str(etree, align, unaligned_words, cwords=cwords)

	t2s.ghkm()
	if latex:
		return t2s.str_tex()
	else:
		rules = []
		t2s.get_rules(rules)
		return "\n".join(rules)	
	
if __name__ == '__main__':

	def usage():
		print >> logs, "A variant of GHKM extraction by Liang\nUsage: "
		print >> logs, "paste <e-tree-PTB> <f-corpus> <e2f-alignment> | ./ghkm.rules [options]"
		print >> logs, "\t-i			 : inverted alignment (f->e, for david)"
		print >> logs, "\t-p			 : include permutations"
		sys.exit(1)
		
	import getopt
	try:
		opts, args = getopt.getopt(sys.argv[1:], "if:p", ["first="])
	except:
		usage()

	inverse = False
	latex = False
	for o, a in opts:
		if o in ["--paren"]:
			pass
		elif o in ["-i"]:
			inverse = True
		elif o in ["--latex"]:
			latex = True
		else:
			usage()

	for sentid, line in enumerate(fileinput.input(args)):
		parseline, chineseline, alignline = line.split("\t")	 # pasted input
		_, etree = tree.xrsparse(ptb2xrs.ptb2xrs(parseline))
		cwords = chineseline.split()
		align = align_tool.get_alignment(alignline, inv=inverse)

		print >> logs, "sentence #%d\tlen %d" % (sentid+1, etree.numLeaves())
		print do_ghkm(etree, cwords, align)