cfad47cfa3/t3compiler/tads3/lib/tok.t

4b825dc642cb6eb9a060e54bf8d69288fbee4904cfad47cfa334b206c65f22086bcc5d63e6f70944
1
#charset "us-ascii"
2
3
/*
4
 *   Tokenizer - customizable tokenizer class for use with the intrinsic
5
 *   class 'grammar-production' parser.
6
 *   
7
 *   This tokenizer implementation is parameterized with a set of rules
8
 *   (see below); a basic set of rules is provided, but users can
9
 *   customize the tokenizer quite extensively simply by subclassing the
10
 *   Tokenizer class and overriding the 'rules_' property with a new set
11
 *   of rules declarations.  
12
 */
13
14
#include "tads.h"
15
#include "t3.h"
16
#include "dict.h"
17
#include "tok.h"
18
#include "vector.h"
19
20
/* ------------------------------------------------------------------------ */
21
/*
22
 *   Tokenizer exceptions 
23
 */
24
25
/*
26
 *   base class for all tokenizer errors (to allow blanket 'catch') 
27
 */
28
class TokenizerError: Exception;
29
30
/*
31
 *   no match for token 
32
 */
33
class TokErrorNoMatch: TokenizerError
34
    construct(str)
35
    {
36
        /* remember the full remaining text */
37
        remainingStr_ = str;
38
39
        /* 
40
         *   for convenience, separately remember the single character
41
         *   that we don't recognize - this is simply the first character
42
         *   of the rest of the line 
43
         */
44
        curChar_ = str.substr(1, 1);
45
    }
46
47
    /* 
48
     *   The remainder of the string.  This is the part that couldn't be
49
     *   matched; we were successful in matching up to this point. 
50
     */
51
    remainingStr_ = nil
52
53
    /* current character (first character of remainingStr_) */
54
    curChar_ = nil
55
;
56
57
/* ------------------------------------------------------------------------ */
58
/*
59
 *   Basic token types
60
 */
61
62
/* word */
63
enum token tokWord;
64
65
/* quoted string */
66
enum token tokString;
67
68
/* punctuation */
69
enum token tokPunct;
70
71
/* integer number */
72
enum token tokInt;
73
74
75
/* ------------------------------------------------------------------------ */
76
/*
77
 *   Tokenizer base class
78
 */
79
class Tokenizer: object
80
    /*
81
     *   Tokenizing rules.  The subclass can override this to specify a
82
     *   list that defines different tokenization rules.  Each entry in the
83
     *   master rules_ list is one rule.  Each rule is a list consisting of
84
     *   the name of the rule; the pattern to match for the rule; the token
85
     *   type (an 'enum token') to use when the rule is matched; the value
86
     *   computation rule; and the value test rule.
87
     *   
88
     *   The name of a rule is just an arbitrary string to identify the
89
     *   rule.  This can be used to insert new rules in order relative to
90
     *   known existing rules, or to delete known existing rules.
91
     *   
92
     *   If the value computation rule is nil, we'll just use the matching
93
     *   text as the token value.  If the value rule is a string, we'll use
94
     *   the string as a replacement pattern (with rexReplace).  If it's a
95
     *   property ID, we'll invoke the property of self with the following
96
     *   arguments:
97
     *   
98
     *   txt, typ, toks
99
     *   
100
     *   'txt' is the matched text; 'typ' is the token type from the rule;
101
     *   and 'toks' is a vector to which the new token or tokens are to be
102
     *   added.  The routine is responsible for adding the appropriate
103
     *   values to the result list.  Note that the routine can add more
104
     *   than one token to the results if desired.
105
     *   
106
     *   If the value test rule is non-nil, it must be either a method or a
107
     *   function; we'll call the method or function to test to see if the
108
     *   matched value is valid.  We'll call the method (on self) with the
109
     *   matching text as the argument; if the method returns true, the
110
     *   rule matches, otherwise the rule fails, and we'll continue looking
111
     *   for another rule as though we hadn't matched the rule's regular
112
     *   expression in the first place.  This can be used for rules that
113
     *   require more than a simple regular expression match; for example,
114
     *   the value test can be used to look up the match in a dictionary,
115
     *   so that the rule only matches tokens that are defined in the
116
     *   dictionary.  
117
     */
118
    rules_ = static
119
    [
120
        /* skip whitespace */
121
        ['whitespace', new RexPattern('<Space>+'), nil, &tokCvtSkip, nil],
122
123
        /* certain punctuation marks */
124
        ['punctuation', new RexPattern('[.,;:?!]'), tokPunct, nil, nil],
125
126
        /* 
127
         *   Words - note that we convert everything to lower-case.  A
128
         *   word must start with an alphabetic character, but can contain
129
         *   alphabetics, digits, hyphens, and apostrophes after that. 
130
         */
131
        ['word', new RexPattern('<Alpha>(<AlphaNum>|[-\'])*'),
132
         tokWord, &tokCvtLower, nil],
133
134
        /* strings */
135
        ['string single-quote',
136
         new RexPattern('\'(.*)\''), tokString, nil, nil],
137
        ['string double-quote',
138
         new RexPattern('"(.*)"'), tokString, nil, nil],
139
140
        /* integer numbers */
141
        ['integer', new RexPattern('[0-9]+'), tokInt, nil, nil]
142
    ]
143
144
    /*
145
     *   Insert a new rule before or after the existing rule with the name
146
     *   'curName'.  If 'curName' is nil, or rule is found with the given
147
     *   name, we'll insert the new rule at the end of the list.  'rule'
148
     *   must be a list with the standard elements for a tokenizer rule.
149
     *   'after' is nil to insert the new rule before the given existing
150
     *   rule, true to insert after it.  
151
     */
152
    insertRule(rule, curName, after)
153
    {
154
        local idx;
155
156
        /* 
157
         *   if the name of an existing rule was supplied, find the
158
         *   existing rule with the given name 
159
         */
160
        idx = nil;
161
        if (curName != nil)
162
            idx = rules_.indexWhich({x: tokRuleName(x) == curName});
163
164
        /* if we didn't find curName, insert at the end of the list */
165
        if (idx == nil)
166
            idx = rules_.length();
167
168
        /* if we're inserting after the given element, adjust the index */
169
        if (after)
170
            ++idx;
171
172
        /* insert the new rule */
173
        insertRuleAt(rule, idx);
174
    }
175
176
    /* 
177
     *   Insert a rule at the given index in our rules list.  'rule' must
178
     *   be a list with the standard elements for a tokenizer rule.  'idx'
179
     *   is the index of the new rule; we'll insert before the existing
180
     *   element at this index; so if 'idx' is 1, we'll insert before the
181
     *   first existing rule.  
182
     */
183
    insertRuleAt(rule, idx)
184
    {
185
        /* insert the rule */
186
        rules_ = rules_.insertAt(idx, rule);
187
    }
188
189
    /*
190
     *   Delete a rule by name.  This finds the rule with the given name
191
     *   and removes it from the list. 
192
     */
193
    deleteRule(name)
194
    {
195
        local idx;
196
        
197
        /* find the rule with the given name */
198
        idx = rules_.indexWhich({x: tokRuleName(x) == name});
199
200
        /* if we found the named element, remove it from the list */
201
        if (idx != nil)
202
            deleteRuleAt(idx);
203
    }
204
205
    /* delete the rule at the given index */
206
    deleteRuleAt(idx)
207
    {
208
        /* delete the rule */
209
        rules_ = rules_.removeElementAt(idx);
210
    }
211
212
    /* convert a string to lower-case (for value computation rules) */
213
    tokCvtLower(txt, typ, toks)
214
    {
215
        /* add the lower-cased version of the string to the result list */
216
        toks.append([txt.toLower(), typ, txt]);
217
    }
218
219
    /* 
220
     *   processing routine to skip a match - this is used for whitespace
221
     *   and other text that does not result in any tokens in the result
222
     *   list 
223
     */
224
    tokCvtSkip(txt, typ, toks)
225
    {
226
        /* simply skip the text without generating any new tokens */
227
    }
228
229
    /*
230
     *   Tokenize a string.  If we find text that we can't match to any of
231
     *   the rules, we'll throw an exception (TokErrorNoMatch).  If we
232
     *   succeed in tokenizing the entire string, we'll return a list with
233
     *   one element per token.  Each element of the main list is a
234
     *   sublist with the following elements describing a token:
235
     *   
236
     *   - The first element gives the token's value.
237
     *   
238
     *   - The second element the token type (given as a token type enum
239
     *   value).
240
     *   
241
     *   - The third element the original token strings, before any
242
     *   conversions or evaluations were performed.  For example, this
243
     *   maintains the original case of strings that are lower-cased for
244
     *   the corresponding token values.
245
     */
246
    tokenize(str)
247
    {
248
        local toks = new Vector(32);
249
        local startIdx = 1;
250
        local len = str.length();
251
        
252
        /* keep going until we run out of string */
253
    mainLoop:
254
        while (startIdx <= len)
255
        {
256
            /* run through the rules in sequence until we match one */
257
        ruleLoop:
258
            for (local i = 1, local cnt = rules_.length() ; i <= cnt ; ++i)
259
            {
260
                local cur;
261
                local match;
262
                local val;
263
                        
264
                /* get the current rule */
265
                cur = rules_[i];
266
267
                /* check for a match to the rule's pattern */
268
                match = rexMatch(tokRulePat(cur), str, startIdx);
269
                if (match != nil && match > 0)
270
                {
271
                    local test;
272
                    local txt;
273
                    local typ;
274
275
                    /* get the matching text */
276
                    txt = str.substr(startIdx, match);
277
278
                    /* 
279
                     *   if there's a value test, invoke it to determine
280
                     *   if the token really matches 
281
                     */
282
                    if ((test = tokRuleTest(cur)) != nil)
283
                    {
284
                        local accept;
285
286
                        /* check what kind of test function we have */
287
                        switch (dataType(test))
288
                        {
289
                        case TypeFuncPtr:
290
                        case TypeObject:
291
                            /* it's a function or anonymous function */
292
                            accept = (test)(txt);
293
                            break;
294
295
                        case TypeProp:
296
                            /* it's a method */
297
                            accept = self.(test)(txt);
298
                            break;
299
300
                        default:
301
                            /* consider anything else to be accepted */
302
                            accept = true;
303
                            break;
304
                        }
305
306
                        /* 
307
                         *   if the value test failed, it means that the
308
                         *   token doesn't match this rule after all -
309
                         *   ignore the regex match and keep searching for
310
                         *   another rule 
311
                         */
312
                        if (!accept)
313
                            continue ruleLoop;
314
                    }
315
316
                    /* get the type of the token from the rule */
317
                    typ = tokRuleType(cur);
318
                    
319
                    /* get this value processing rule */
320
                    val = tokRuleVal(cur);
321
322
                    /* determine what value to use */
323
                    switch(dataTypeXlat(val))
324
                    {
325
                    case TypeNil:
326
                        /* use the matching text verbatim */
327
                        toks.append([txt, typ, txt]);
328
                        break;
329
                        
330
                    case TypeProp:
331
                        /* 
332
                         *   invoke the property - it's responsible for
333
                         *   adding the token or tokens to the results
334
                         *   lists 
335
                         */
336
                        self.(val)(txt, typ, toks);
337
                        break;
338
                        
339
                    case TypeSString:
340
                        /* it's a regular expression replacement */
341
                        toks.append(
342
                            [rexReplace(tokRulePat(cur),
343
                                        txt, val, ReplaceOnce),
344
                             typ, txt]);
345
                        break;
346
347
                    case TypeFuncPtr:
348
                        /* invoke the function */
349
                        (val)(txt, typ, toks);
350
                        break;
351
352
                    default:
353
                        /* 
354
                         *   use any other value exactly as given in
355
                         *   the rule 
356
                         */
357
                        toks.append([val, typ, txt]);
358
                        break;
359
                    }
360
361
                    /* 
362
                     *   continue the search at the next character after
363
                     *   the end of this token 
364
                     */
365
                    startIdx += match;
366
367
                    /* start over with the rest of the string */
368
                    continue mainLoop;
369
                }
370
            }
371
372
            /*
373
             *   We failed to find a match for this part of the string.
374
             *   Throw an exception and let the caller figure out what to
375
             *   do.  The exception parameter gives the rest of the
376
             *   string, so the caller can display a suitable error
377
             *   message if desired.  
378
             */
379
            throw new TokErrorNoMatch(str.substr(startIdx));
380
        }
381
382
        /* we're done with the string - return out value and type lists */
383
        return toks.toList();
384
    }
385
;
386
387
/* ------------------------------------------------------------------------ */
388
/*
389
 *   Test Section 
390
 */
391
392
#ifdef TOK_TEST
393
394
main(args)
395
{
396
    "Enter text to tokenize.  Type Q or QUIT when done. ";
397
    for (;;)
398
    {
399
        local str, toks;
400
401
        /* read a string */
402
        "\b>";
403
        str = inputLine();
404
405
        /* catch tokenization errors */
406
        try
407
        {
408
            /* tokenize the string */
409
            toks = Tokenizer.tokenize(str);
410
411
            /* if the first token is 'quit', we're done */
412
            if (toks.length() > 0
413
                && getTokType(toks[1]) == tokWord
414
                && (getTokVal(toks[1])== 'quit' || getTokVal(toks[1]) == 'q'))
415
            {
416
                /* they want to stop - exit the command loop */
417
                break;
418
            }
419
420
            /* display the tokens */
421
            for (local i = 1, local cnt = toks.length() ; i <= cnt ; ++i)
422
                "(<<getTokVal(toks[i])>>) ";
423
        }
424
        catch (TokErrorNoMatch err)
425
        {
426
            "Unrecognized punctuation: <<err.remainingStr_.substr(1, 1)>>";
427
        }
428
    }
429
}
430
431
#endif /* TOK_TEST */
432