ghkm is an open source project powered by Assembla

Assembla offers free public and private SVN/Git repositories and project hosting with bug/issue tracking and collaboration tools.

/
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#!/usr/bin/env python

#TODO: lexical rules
#TODO: unaligned words
#TODO: composed rules

import sys
import fileinput
import copy

import myutils
import ptb2xrs
import align_tool
import tree

logs = sys.stderr

lexlist = ['.', ',', '``', "''", "-LRB-", "-RRB-"]

class Span(set):
    ''' a wrapper of the built-in set type.
    note: the result of operations (say, union "|") is still a span.
    you do not need to override them.'''

    def __init__(self, contents=[]):
        set.__init__(self, contents)

    def split(self):
        a = sorted(self)
        a.append(-1)    ## dummy end
        last = low = a[0]
        res = []
        for i in a[1:]:
            if i != last + 1:
                res.append((low, last))
                low = i
            last = i
        return res

    @staticmethod
    def split2str(slist):
        def item2str(item):
            if item[0] == item[1]:
                return str(item[0])
            else:
                return str(item[0]) + "-" + str(item[1])

        return [item2str(item) for item in slist]

    def range_and_missing(self, unaligned_words, top_level=False, cmax=None):
        ''' caution: return a range (as a Span) and missing elements (as a set)'''
        if not self:
            return Span(), Span(), Span()
        else:
            if top_level:
                mini, maxi = 0, cmax - 1
            else:
                mini, maxi = min(self), max(self)
            r = Span(range(mini, maxi+1))
            missing = (r - self) & unaligned_words
            return r, self | missing, missing

    def __str__(self):
        return "[" + ",\\,".join(self.split2str(self.split())) + "]"

    def getwords(self, words):
        # lhuang: omit boundary words
        return ""
        if len(self) == 1:
            return words[min(self)]
        else:
            return "%s..%s" % (words[min(self)], words[max(self)])

class Tree2Str(tree.Tree):

    def __init__(self, etree, alignment, unaligned_words, dep=0, cwords=""):
        '''must be the case the etree has already been set espan.'''
        tree.Tree.__init__(self, etree.node, etree.type, etree.children)
        self.espan = etree.span

        if self.children:
            self.children = [ Tree2Str(sub, alignment, unaligned_words, dep + 1) for sub in self.children ]
            span = Span()
            for sub in self.children:
                span |= sub.cspan
        else:
            pos = self.espan
            span = Span([c for (e, c) in alignment if e == pos])

        ## tricky here: only the top-level needs the sentence span. also, cwords is "static" for all nodes.
        if dep == 0:
            cmax = len(cwords)
            self.__class__.cwords = cwords
        else:
            cmax = None

        self.reaches, self.cspan, self.unaligned = span.range_and_missing(unaligned_words, dep==0, cmax)
        self.outspan = Span()

    def pp(self, dep=0):
        print " |     " * dep,
        if hasattr(self, "variable"):
            print "x%d:" % self.variable,
        print self.node, "sp=%s, un=%s, out=%s" % (self.cspan, self.unaligned, self.outspan)

        if self.children:
            for sub in self.children:
                sub.pp(dep + 1)

    def str_tex(self, dep=0):
        if self.children:
            s = "( "
            if hasattr(self, "variable"):
                s += "\\circlenode{A}{#x_{%d}#}" % self.variable
            s += self.node + \
                 "#^{%s}_{%s}#" % (str(self.outspan)[1:-1],     str(self.cspan)[1:-1])     + " " # no []
                ##    "#^{%s}#" % str(self.reaches)[1:-1] + \

            for sub in self.children:
                s += sub.str_tex(dep+1) + " "
            s += ")"
        else:
            s = self.node
        return s

    def get_perm_rhs(self, a):
        '''get the permutation and rhs from a list of spans by sorting'''

        ## e.g. input a = [4, 0, 2-3]
        target_words = self.reaches
        for span in a:
            target_words -= span

        a = [(min(span), ind, span) for (ind, span) in enumerate(a)] \
                + [(i, self.cwords[i], None) for i in target_words]

        ## now a = [(4, 0, 4), (0, 1, 0), (2, 2, 2-3)]
        a.sort()
        ## a = [(0,1, 0), (2,2, 2-3), (4,0, 4)]; returns the permutation (1,2,0)
        perm = []# [ind for (_, ind, _) in a]

##        rhs = " ".join([("x%d:(\'%s\')" % (ind, span.getwords(self.cwords))) for (_, ind, span) in a])
        rhs = " ".join([(("x%d" % x) if type(x) is int else x) for (_, x, _) in a])
        return perm, rhs

    def get_rules(self, rules, dep=0, perm=None):

        if perm is None:
            perm = []

        if self.children and hasattr(self, "variable"):

            ## i'm a frontier node. start a new rule. pass an empty list to hold the new get_rules
            a = []
            lhs = self.node + "(" + " ".join([sub.get_rules(rules, dep+1, a) for sub in self.children]) + ")"
            p, rhs = self.get_perm_rhs(a)

            rules.append("%s -> %s" % (lhs, rhs))

            ## now, do something for your parent
            perm.append(self.cspan)

##            return "x%d:%s(\'%s\')" %     (var, self.node, self.ellipse())
            return "x%d:%s" %     (self.variable, self.node)

        else:
            ## i'm not a frontier node
            if self.children:
                ## i am not a variable node, but recursion goes on
                return self.node + "(" + " ".join([sub.get_rules(rules, dep+1, perm) for sub in self.children]) + ")"
            else:
                # leaf
                return '"%s"' % self.node

    def ghkm(self, var=0, dep=0):

        if dep == 0:
            self.outspan = Span()
            self.variable = 0 # root is also a frontier node

        if self.children:
            for sub in self.children:
                sub.outspan = copy.copy(self.outspan)
                for sibling in self.children:
                    if sibling is not sub:
                        sub.outspan |= sibling.cspan
                if sub.children and     \
                         sub.reaches and sub.node not in lexlist and not (len(self.children) == 1 and var == 0) \
                         and not(sub.reaches & sub.outspan):

                    ## found another variable

                    sub.variable = var
                    sub.ghkm(0, dep+1)
                    var += 1
                else:
                    var = sub.ghkm(var, dep+1)

        return var

def do_ghkm(etree, cwords, align):
    if type(etree) is str:
        _, etree = tree.xrsparse(ptb2xrs.ptb2xrs(etree))

    unaligned_words = set(range(0, len(cwords))) - set([c for (e, c) in align])

    etree.setSpans()
    t2s = Tree2Str(etree, align, unaligned_words, cwords=cwords)

    t2s.ghkm()
    if latex:
        return t2s.str_tex()
    else:
        rules = []
        t2s.get_rules(rules)
        return "\n".join(rules)

if __name__ == '__main__':

    def usage():
        print >> logs, "A variant of GHKM extraction by Liang\nUsage: "
        print >> logs, "paste <e-tree-PTB> <f-corpus> <e2f-alignment> | ./ghkm.rules [options]"
        print >> logs, "\t-i             : inverted alignment (f->e, for david)"
        print >> logs, "\t-p             : include permutations"
        sys.exit(1)

    import getopt
    try:
        opts, args = getopt.getopt(sys.argv[1:], "if:p", ["first="])
    except:
        usage()

    inverse = False
    latex = False
    for o, a in opts:
        if o in ["--paren"]:
            pass
        elif o in ["-i"]:
            inverse = True
        elif o in ["--latex"]:
            latex = True
        else:
            usage()

    for sentid, line in enumerate(fileinput.input(args)):
        parseline, chineseline, alignline = line.split("\t")     # pasted input
        _, etree = tree.xrsparse(ptb2xrs.ptb2xrs(parseline))
        cwords = chineseline.split()
        align = align_tool.get_alignment(alignline, inv=inverse)

        print >> logs, "sentence #%d\tlen %d" % (sentid+1, etree.numLeaves())
        print do_ghkm(etree, cwords, align)
Ajax-loader Loading, please wait...