#!/usr/bin/env python #TODO: lexical rules #TODO: unaligned words #TODO: composed rules import sys import fileinput import copy import myutils import ptb2xrs import align_tool import tree logs = sys.stderr lexlist = ['.', ',', '``', "''", "-LRB-", "-RRB-"] class Span(set): ''' a wrapper of the built-in set type. note: the result of operations (say, union "|") is still a span. you do not need to override them.''' def __init__(self, contents=[]): set.__init__(self, contents) def split(self): a = sorted(self) a.append(-1) ## dummy end last = low = a[0] res = [] for i in a[1:]: if i != last + 1: res.append((low, last)) low = i last = i return res @staticmethod def split2str(slist): def item2str(item): if item[0] == item[1]: return str(item[0]) else: return str(item[0]) + "-" + str(item[1]) return [item2str(item) for item in slist] def range_and_missing(self, unaligned_words, top_level=False, cmax=None): ''' caution: return a range (as a Span) and missing elements (as a set)''' if not self: return Span(), Span(), Span() else: if top_level: mini, maxi = 0, cmax - 1 else: mini, maxi = min(self), max(self) r = Span(range(mini, maxi+1)) missing = (r - self) & unaligned_words return r, self | missing, missing def __str__(self): return "[" + ",\\,".join(self.split2str(self.split())) + "]" def getwords(self, words): # lhuang: omit boundary words return "" if len(self) == 1: return words[min(self)] else: return "%s..%s" % (words[min(self)], words[max(self)]) class Tree2Str(tree.Tree): def __init__(self, etree, alignment, unaligned_words, dep=0, cwords=""): '''must be the case the etree has already been set espan.''' tree.Tree.__init__(self, etree.node, etree.type, etree.children) self.espan = etree.span if self.children: self.children = [ Tree2Str(sub, alignment, unaligned_words, dep + 1) for sub in self.children ] span = Span() for sub in self.children: span |= sub.cspan else: pos = self.espan span = Span([c for (e, c) in alignment if e == pos]) ## tricky here: only the top-level needs the sentence span. also, cwords is "static" for all nodes. if dep == 0: cmax = len(cwords) self.__class__.cwords = cwords else: cmax = None self.reaches, self.cspan, self.unaligned = span.range_and_missing(unaligned_words, dep==0, cmax) self.outspan = Span() def pp(self, dep=0): print " | " * dep, if hasattr(self, "variable"): print "x%d:" % self.variable, print self.node, "sp=%s, un=%s, out=%s" % (self.cspan, self.unaligned, self.outspan) if self.children: for sub in self.children: sub.pp(dep + 1) def str_tex(self, dep=0): if self.children: s = "( " if hasattr(self, "variable"): s += "\\circlenode{A}{#x_{%d}#}" % self.variable s += self.node + \ "#^{%s}_{%s}#" % (str(self.outspan)[1:-1], str(self.cspan)[1:-1]) + " " # no [] ## "#^{%s}#" % str(self.reaches)[1:-1] + \ for sub in self.children: s += sub.str_tex(dep+1) + " " s += ")" else: s = self.node return s def get_perm_rhs(self, a): '''get the permutation and rhs from a list of spans by sorting''' ## e.g. input a = [4, 0, 2-3] target_words = self.reaches for span in a: target_words -= span a = [(min(span), ind, span) for (ind, span) in enumerate(a)] \ + [(i, self.cwords[i], None) for i in target_words] ## now a = [(4, 0, 4), (0, 1, 0), (2, 2, 2-3)] a.sort() ## a = [(0,1, 0), (2,2, 2-3), (4,0, 4)]; returns the permutation (1,2,0) perm = []# [ind for (_, ind, _) in a] ## rhs = " ".join([("x%d:(\'%s\')" % (ind, span.getwords(self.cwords))) for (_, ind, span) in a]) rhs = " ".join([(("x%d" % x) if type(x) is int else x) for (_, x, _) in a]) return perm, rhs def get_rules(self, rules, dep=0, perm=None): if perm is None: perm = [] if self.children and hasattr(self, "variable"): ## i'm a frontier node. start a new rule. pass an empty list to hold the new get_rules a = [] lhs = self.node + "(" + " ".join([sub.get_rules(rules, dep+1, a) for sub in self.children]) + ")" p, rhs = self.get_perm_rhs(a) rules.append("%s -> %s" % (lhs, rhs)) ## now, do something for your parent perm.append(self.cspan) ## return "x%d:%s(\'%s\')" % (var, self.node, self.ellipse()) return "x%d:%s" % (self.variable, self.node) else: ## i'm not a frontier node if self.children: ## i am not a variable node, but recursion goes on return self.node + "(" + " ".join([sub.get_rules(rules, dep+1, perm) for sub in self.children]) + ")" else: # leaf return '"%s"' % self.node def ghkm(self, var=0, dep=0): if dep == 0: self.outspan = Span() self.variable = 0 # root is also a frontier node if self.children: for sub in self.children: sub.outspan = copy.copy(self.outspan) for sibling in self.children: if sibling is not sub: sub.outspan |= sibling.cspan if sub.children and \ sub.reaches and sub.node not in lexlist and not (len(self.children) == 1 and var == 0) \ and not(sub.reaches & sub.outspan): ## found another variable sub.variable = var sub.ghkm(0, dep+1) var += 1 else: var = sub.ghkm(var, dep+1) return var def do_ghkm(etree, cwords, align): if type(etree) is str: _, etree = tree.xrsparse(ptb2xrs.ptb2xrs(etree)) unaligned_words = set(range(0, len(cwords))) - set([c for (e, c) in align]) etree.setSpans() t2s = Tree2Str(etree, align, unaligned_words, cwords=cwords) t2s.ghkm() if latex: return t2s.str_tex() else: rules = [] t2s.get_rules(rules) return "\n".join(rules) if __name__ == '__main__': def usage(): print >> logs, "A variant of GHKM extraction by Liang\nUsage: " print >> logs, "paste | ./ghkm.rules [options]" print >> logs, "\t-i : inverted alignment (f->e, for david)" print >> logs, "\t-p : include permutations" sys.exit(1) import getopt try: opts, args = getopt.getopt(sys.argv[1:], "if:p", ["first="]) except: usage() inverse = False latex = False for o, a in opts: if o in ["--paren"]: pass elif o in ["-i"]: inverse = True elif o in ["--latex"]: latex = True else: usage() for sentid, line in enumerate(fileinput.input(args)): parseline, chineseline, alignline = line.split("\t") # pasted input _, etree = tree.xrsparse(ptb2xrs.ptb2xrs(parseline)) cwords = chineseline.split() align = align_tool.get_alignment(alignline, inv=inverse) print >> logs, "sentence #%d\tlen %d" % (sentid+1, etree.numLeaves()) print do_ghkm(etree, cwords, align)