| | 1 | #charset "us-ascii" |
| | 2 | |
| | 3 | /* |
| | 4 | * Tokenizer - customizable tokenizer class for use with the intrinsic |
| | 5 | * class 'grammar-production' parser. |
| | 6 | * |
| | 7 | * This tokenizer implementation is parameterized with a set of rules |
| | 8 | * (see below); a basic set of rules is provided, but users can |
| | 9 | * customize the tokenizer quite extensively simply by subclassing the |
| | 10 | * Tokenizer class and overriding the 'rules_' property with a new set |
| | 11 | * of rules declarations. |
| | 12 | */ |
| | 13 | |
| | 14 | #include "tads.h" |
| | 15 | #include "t3.h" |
| | 16 | #include "dict.h" |
| | 17 | #include "tok.h" |
| | 18 | #include "vector.h" |
| | 19 | |
| | 20 | /* ------------------------------------------------------------------------ */ |
| | 21 | /* |
| | 22 | * Tokenizer exceptions |
| | 23 | */ |
| | 24 | |
| | 25 | /* |
| | 26 | * base class for all tokenizer errors (to allow blanket 'catch') |
| | 27 | */ |
| | 28 | class TokenizerError: Exception; |
| | 29 | |
| | 30 | /* |
| | 31 | * no match for token |
| | 32 | */ |
| | 33 | class TokErrorNoMatch: TokenizerError |
| | 34 | construct(str) |
| | 35 | { |
| | 36 | /* remember the full remaining text */ |
| | 37 | remainingStr_ = str; |
| | 38 | |
| | 39 | /* |
| | 40 | * for convenience, separately remember the single character |
| | 41 | * that we don't recognize - this is simply the first character |
| | 42 | * of the rest of the line |
| | 43 | */ |
| | 44 | curChar_ = str.substr(1, 1); |
| | 45 | } |
| | 46 | |
| | 47 | /* |
| | 48 | * The remainder of the string. This is the part that couldn't be |
| | 49 | * matched; we were successful in matching up to this point. |
| | 50 | */ |
| | 51 | remainingStr_ = nil |
| | 52 | |
| | 53 | /* current character (first character of remainingStr_) */ |
| | 54 | curChar_ = nil |
| | 55 | ; |
| | 56 | |
| | 57 | /* ------------------------------------------------------------------------ */ |
| | 58 | /* |
| | 59 | * Basic token types |
| | 60 | */ |
| | 61 | |
| | 62 | /* word */ |
| | 63 | enum token tokWord; |
| | 64 | |
| | 65 | /* quoted string */ |
| | 66 | enum token tokString; |
| | 67 | |
| | 68 | /* punctuation */ |
| | 69 | enum token tokPunct; |
| | 70 | |
| | 71 | /* integer number */ |
| | 72 | enum token tokInt; |
| | 73 | |
| | 74 | |
| | 75 | /* ------------------------------------------------------------------------ */ |
| | 76 | /* |
| | 77 | * Tokenizer base class |
| | 78 | */ |
| | 79 | class Tokenizer: object |
| | 80 | /* |
| | 81 | * Tokenizing rules. The subclass can override this to specify a |
| | 82 | * list that defines different tokenization rules. Each entry in the |
| | 83 | * master rules_ list is one rule. Each rule is a list consisting of |
| | 84 | * the name of the rule; the pattern to match for the rule; the token |
| | 85 | * type (an 'enum token') to use when the rule is matched; the value |
| | 86 | * computation rule; and the value test rule. |
| | 87 | * |
| | 88 | * The name of a rule is just an arbitrary string to identify the |
| | 89 | * rule. This can be used to insert new rules in order relative to |
| | 90 | * known existing rules, or to delete known existing rules. |
| | 91 | * |
| | 92 | * If the value computation rule is nil, we'll just use the matching |
| | 93 | * text as the token value. If the value rule is a string, we'll use |
| | 94 | * the string as a replacement pattern (with rexReplace). If it's a |
| | 95 | * property ID, we'll invoke the property of self with the following |
| | 96 | * arguments: |
| | 97 | * |
| | 98 | * txt, typ, toks |
| | 99 | * |
| | 100 | * 'txt' is the matched text; 'typ' is the token type from the rule; |
| | 101 | * and 'toks' is a vector to which the new token or tokens are to be |
| | 102 | * added. The routine is responsible for adding the appropriate |
| | 103 | * values to the result list. Note that the routine can add more |
| | 104 | * than one token to the results if desired. |
| | 105 | * |
| | 106 | * If the value test rule is non-nil, it must be either a method or a |
| | 107 | * function; we'll call the method or function to test to see if the |
| | 108 | * matched value is valid. We'll call the method (on self) with the |
| | 109 | * matching text as the argument; if the method returns true, the |
| | 110 | * rule matches, otherwise the rule fails, and we'll continue looking |
| | 111 | * for another rule as though we hadn't matched the rule's regular |
| | 112 | * expression in the first place. This can be used for rules that |
| | 113 | * require more than a simple regular expression match; for example, |
| | 114 | * the value test can be used to look up the match in a dictionary, |
| | 115 | * so that the rule only matches tokens that are defined in the |
| | 116 | * dictionary. |
| | 117 | */ |
| | 118 | rules_ = static |
| | 119 | [ |
| | 120 | /* skip whitespace */ |
| | 121 | ['whitespace', new RexPattern('<Space>+'), nil, &tokCvtSkip, nil], |
| | 122 | |
| | 123 | /* certain punctuation marks */ |
| | 124 | ['punctuation', new RexPattern('[.,;:?!]'), tokPunct, nil, nil], |
| | 125 | |
| | 126 | /* |
| | 127 | * Words - note that we convert everything to lower-case. A |
| | 128 | * word must start with an alphabetic character, but can contain |
| | 129 | * alphabetics, digits, hyphens, and apostrophes after that. |
| | 130 | */ |
| | 131 | ['word', new RexPattern('<Alpha>(<AlphaNum>|[-\'])*'), |
| | 132 | tokWord, &tokCvtLower, nil], |
| | 133 | |
| | 134 | /* strings */ |
| | 135 | ['string single-quote', |
| | 136 | new RexPattern('\'(.*)\''), tokString, nil, nil], |
| | 137 | ['string double-quote', |
| | 138 | new RexPattern('"(.*)"'), tokString, nil, nil], |
| | 139 | |
| | 140 | /* integer numbers */ |
| | 141 | ['integer', new RexPattern('[0-9]+'), tokInt, nil, nil] |
| | 142 | ] |
| | 143 | |
| | 144 | /* |
| | 145 | * Insert a new rule before or after the existing rule with the name |
| | 146 | * 'curName'. If 'curName' is nil, or rule is found with the given |
| | 147 | * name, we'll insert the new rule at the end of the list. 'rule' |
| | 148 | * must be a list with the standard elements for a tokenizer rule. |
| | 149 | * 'after' is nil to insert the new rule before the given existing |
| | 150 | * rule, true to insert after it. |
| | 151 | */ |
| | 152 | insertRule(rule, curName, after) |
| | 153 | { |
| | 154 | local idx; |
| | 155 | |
| | 156 | /* |
| | 157 | * if the name of an existing rule was supplied, find the |
| | 158 | * existing rule with the given name |
| | 159 | */ |
| | 160 | idx = nil; |
| | 161 | if (curName != nil) |
| | 162 | idx = rules_.indexWhich({x: tokRuleName(x) == curName}); |
| | 163 | |
| | 164 | /* if we didn't find curName, insert at the end of the list */ |
| | 165 | if (idx == nil) |
| | 166 | idx = rules_.length(); |
| | 167 | |
| | 168 | /* if we're inserting after the given element, adjust the index */ |
| | 169 | if (after) |
| | 170 | ++idx; |
| | 171 | |
| | 172 | /* insert the new rule */ |
| | 173 | insertRuleAt(rule, idx); |
| | 174 | } |
| | 175 | |
| | 176 | /* |
| | 177 | * Insert a rule at the given index in our rules list. 'rule' must |
| | 178 | * be a list with the standard elements for a tokenizer rule. 'idx' |
| | 179 | * is the index of the new rule; we'll insert before the existing |
| | 180 | * element at this index; so if 'idx' is 1, we'll insert before the |
| | 181 | * first existing rule. |
| | 182 | */ |
| | 183 | insertRuleAt(rule, idx) |
| | 184 | { |
| | 185 | /* insert the rule */ |
| | 186 | rules_ = rules_.insertAt(idx, rule); |
| | 187 | } |
| | 188 | |
| | 189 | /* |
| | 190 | * Delete a rule by name. This finds the rule with the given name |
| | 191 | * and removes it from the list. |
| | 192 | */ |
| | 193 | deleteRule(name) |
| | 194 | { |
| | 195 | local idx; |
| | 196 | |
| | 197 | /* find the rule with the given name */ |
| | 198 | idx = rules_.indexWhich({x: tokRuleName(x) == name}); |
| | 199 | |
| | 200 | /* if we found the named element, remove it from the list */ |
| | 201 | if (idx != nil) |
| | 202 | deleteRuleAt(idx); |
| | 203 | } |
| | 204 | |
| | 205 | /* delete the rule at the given index */ |
| | 206 | deleteRuleAt(idx) |
| | 207 | { |
| | 208 | /* delete the rule */ |
| | 209 | rules_ = rules_.removeElementAt(idx); |
| | 210 | } |
| | 211 | |
| | 212 | /* convert a string to lower-case (for value computation rules) */ |
| | 213 | tokCvtLower(txt, typ, toks) |
| | 214 | { |
| | 215 | /* add the lower-cased version of the string to the result list */ |
| | 216 | toks.append([txt.toLower(), typ, txt]); |
| | 217 | } |
| | 218 | |
| | 219 | /* |
| | 220 | * processing routine to skip a match - this is used for whitespace |
| | 221 | * and other text that does not result in any tokens in the result |
| | 222 | * list |
| | 223 | */ |
| | 224 | tokCvtSkip(txt, typ, toks) |
| | 225 | { |
| | 226 | /* simply skip the text without generating any new tokens */ |
| | 227 | } |
| | 228 | |
| | 229 | /* |
| | 230 | * Tokenize a string. If we find text that we can't match to any of |
| | 231 | * the rules, we'll throw an exception (TokErrorNoMatch). If we |
| | 232 | * succeed in tokenizing the entire string, we'll return a list with |
| | 233 | * one element per token. Each element of the main list is a |
| | 234 | * sublist with the following elements describing a token: |
| | 235 | * |
| | 236 | * - The first element gives the token's value. |
| | 237 | * |
| | 238 | * - The second element the token type (given as a token type enum |
| | 239 | * value). |
| | 240 | * |
| | 241 | * - The third element the original token strings, before any |
| | 242 | * conversions or evaluations were performed. For example, this |
| | 243 | * maintains the original case of strings that are lower-cased for |
| | 244 | * the corresponding token values. |
| | 245 | */ |
| | 246 | tokenize(str) |
| | 247 | { |
| | 248 | local toks = new Vector(32); |
| | 249 | local startIdx = 1; |
| | 250 | local len = str.length(); |
| | 251 | |
| | 252 | /* keep going until we run out of string */ |
| | 253 | mainLoop: |
| | 254 | while (startIdx <= len) |
| | 255 | { |
| | 256 | /* run through the rules in sequence until we match one */ |
| | 257 | ruleLoop: |
| | 258 | for (local i = 1, local cnt = rules_.length() ; i <= cnt ; ++i) |
| | 259 | { |
| | 260 | local cur; |
| | 261 | local match; |
| | 262 | local val; |
| | 263 | |
| | 264 | /* get the current rule */ |
| | 265 | cur = rules_[i]; |
| | 266 | |
| | 267 | /* check for a match to the rule's pattern */ |
| | 268 | match = rexMatch(tokRulePat(cur), str, startIdx); |
| | 269 | if (match != nil && match > 0) |
| | 270 | { |
| | 271 | local test; |
| | 272 | local txt; |
| | 273 | local typ; |
| | 274 | |
| | 275 | /* get the matching text */ |
| | 276 | txt = str.substr(startIdx, match); |
| | 277 | |
| | 278 | /* |
| | 279 | * if there's a value test, invoke it to determine |
| | 280 | * if the token really matches |
| | 281 | */ |
| | 282 | if ((test = tokRuleTest(cur)) != nil) |
| | 283 | { |
| | 284 | local accept; |
| | 285 | |
| | 286 | /* check what kind of test function we have */ |
| | 287 | switch (dataType(test)) |
| | 288 | { |
| | 289 | case TypeFuncPtr: |
| | 290 | case TypeObject: |
| | 291 | /* it's a function or anonymous function */ |
| | 292 | accept = (test)(txt); |
| | 293 | break; |
| | 294 | |
| | 295 | case TypeProp: |
| | 296 | /* it's a method */ |
| | 297 | accept = self.(test)(txt); |
| | 298 | break; |
| | 299 | |
| | 300 | default: |
| | 301 | /* consider anything else to be accepted */ |
| | 302 | accept = true; |
| | 303 | break; |
| | 304 | } |
| | 305 | |
| | 306 | /* |
| | 307 | * if the value test failed, it means that the |
| | 308 | * token doesn't match this rule after all - |
| | 309 | * ignore the regex match and keep searching for |
| | 310 | * another rule |
| | 311 | */ |
| | 312 | if (!accept) |
| | 313 | continue ruleLoop; |
| | 314 | } |
| | 315 | |
| | 316 | /* get the type of the token from the rule */ |
| | 317 | typ = tokRuleType(cur); |
| | 318 | |
| | 319 | /* get this value processing rule */ |
| | 320 | val = tokRuleVal(cur); |
| | 321 | |
| | 322 | /* determine what value to use */ |
| | 323 | switch(dataTypeXlat(val)) |
| | 324 | { |
| | 325 | case TypeNil: |
| | 326 | /* use the matching text verbatim */ |
| | 327 | toks.append([txt, typ, txt]); |
| | 328 | break; |
| | 329 | |
| | 330 | case TypeProp: |
| | 331 | /* |
| | 332 | * invoke the property - it's responsible for |
| | 333 | * adding the token or tokens to the results |
| | 334 | * lists |
| | 335 | */ |
| | 336 | self.(val)(txt, typ, toks); |
| | 337 | break; |
| | 338 | |
| | 339 | case TypeSString: |
| | 340 | /* it's a regular expression replacement */ |
| | 341 | toks.append( |
| | 342 | [rexReplace(tokRulePat(cur), |
| | 343 | txt, val, ReplaceOnce), |
| | 344 | typ, txt]); |
| | 345 | break; |
| | 346 | |
| | 347 | case TypeFuncPtr: |
| | 348 | /* invoke the function */ |
| | 349 | (val)(txt, typ, toks); |
| | 350 | break; |
| | 351 | |
| | 352 | default: |
| | 353 | /* |
| | 354 | * use any other value exactly as given in |
| | 355 | * the rule |
| | 356 | */ |
| | 357 | toks.append([val, typ, txt]); |
| | 358 | break; |
| | 359 | } |
| | 360 | |
| | 361 | /* |
| | 362 | * continue the search at the next character after |
| | 363 | * the end of this token |
| | 364 | */ |
| | 365 | startIdx += match; |
| | 366 | |
| | 367 | /* start over with the rest of the string */ |
| | 368 | continue mainLoop; |
| | 369 | } |
| | 370 | } |
| | 371 | |
| | 372 | /* |
| | 373 | * We failed to find a match for this part of the string. |
| | 374 | * Throw an exception and let the caller figure out what to |
| | 375 | * do. The exception parameter gives the rest of the |
| | 376 | * string, so the caller can display a suitable error |
| | 377 | * message if desired. |
| | 378 | */ |
| | 379 | throw new TokErrorNoMatch(str.substr(startIdx)); |
| | 380 | } |
| | 381 | |
| | 382 | /* we're done with the string - return out value and type lists */ |
| | 383 | return toks.toList(); |
| | 384 | } |
| | 385 | ; |
| | 386 | |
| | 387 | /* ------------------------------------------------------------------------ */ |
| | 388 | /* |
| | 389 | * Test Section |
| | 390 | */ |
| | 391 | |
| | 392 | #ifdef TOK_TEST |
| | 393 | |
| | 394 | main(args) |
| | 395 | { |
| | 396 | "Enter text to tokenize. Type Q or QUIT when done. "; |
| | 397 | for (;;) |
| | 398 | { |
| | 399 | local str, toks; |
| | 400 | |
| | 401 | /* read a string */ |
| | 402 | "\b>"; |
| | 403 | str = inputLine(); |
| | 404 | |
| | 405 | /* catch tokenization errors */ |
| | 406 | try |
| | 407 | { |
| | 408 | /* tokenize the string */ |
| | 409 | toks = Tokenizer.tokenize(str); |
| | 410 | |
| | 411 | /* if the first token is 'quit', we're done */ |
| | 412 | if (toks.length() > 0 |
| | 413 | && getTokType(toks[1]) == tokWord |
| | 414 | && (getTokVal(toks[1])== 'quit' || getTokVal(toks[1]) == 'q')) |
| | 415 | { |
| | 416 | /* they want to stop - exit the command loop */ |
| | 417 | break; |
| | 418 | } |
| | 419 | |
| | 420 | /* display the tokens */ |
| | 421 | for (local i = 1, local cnt = toks.length() ; i <= cnt ; ++i) |
| | 422 | "(<<getTokVal(toks[i])>>) "; |
| | 423 | } |
| | 424 | catch (TokErrorNoMatch err) |
| | 425 | { |
| | 426 | "Unrecognized punctuation: <<err.remainingStr_.substr(1, 1)>>"; |
| | 427 | } |
| | 428 | } |
| | 429 | } |
| | 430 | |
| | 431 | #endif /* TOK_TEST */ |
| | 432 | |