| | 1 | /* |
| | 2 | * Copyright (c) 1998, 2002 Michael J. Roberts. All Rights Reserved. |
| | 3 | * |
| | 4 | * Please see the accompanying license file, LICENSE.TXT, for information |
| | 5 | * on using and copying this software. |
| | 6 | */ |
| | 7 | /* |
| | 8 | Name |
| | 9 | regex.c - Regular Expression Parser and Recognizer for T3 |
| | 10 | Function |
| | 11 | Parses and recognizes regular expressions. Adapted to C++ and UTF-8 |
| | 12 | from the TADS 2 implementation. |
| | 13 | Notes |
| | 14 | Regular expression syntax: |
| | 15 | |
| | 16 | abc|def either abc or def |
| | 17 | (abc) abc |
| | 18 | abc+ abc, abcc, abccc, ... |
| | 19 | abc* ab, abc, abcc, ... |
| | 20 | abc? ab or abc |
| | 21 | x{n} x exactly 'n' times |
| | 22 | x{n,} x at least 'n' times |
| | 23 | x{,n} x 0 to 'n' times |
| | 24 | x{n,m} x from 'n' to 'm' times |
| | 25 | . any single character |
| | 26 | abc$ abc at the end of the line |
| | 27 | ^abc abc at the beginning of the line |
| | 28 | %^abc literally ^abc |
| | 29 | [abcx-z] matches a, b, c, x, y, or z |
| | 30 | [^abcx-z] matches any character except a, b, c, x, y, or z |
| | 31 | [^]-q] matches any character except ], -, or q |
| | 32 | x+? use *shortest* working match to x+ |
| | 33 | x*? use shortest match to x* |
| | 34 | x{}? use shortest match to x{} |
| | 35 | |
| | 36 | Note that using ']' or '-' in a character range expression requires |
| | 37 | special ordering. If ']' is to be used, it must be the first character |
| | 38 | after the '^', if present, within the brackets. If '-' is to be used, |
| | 39 | it must be the first character after the '^' and/or ']', if present. |
| | 40 | |
| | 41 | The following special sequences match literal characters by name (the |
| | 42 | names insensitive to case): |
| | 43 | |
| | 44 | <lparen> ( |
| | 45 | <rparen> ) |
| | 46 | <lsquare> [ |
| | 47 | <rsquare> ] |
| | 48 | <lbrace> { |
| | 49 | <rbrace> } |
| | 50 | <langle> < |
| | 51 | <rangle> > |
| | 52 | <vbar> | |
| | 53 | <caret> ^ |
| | 54 | <period> . |
| | 55 | <dot> . |
| | 56 | <squote> ' |
| | 57 | <dquote> " |
| | 58 | <star> * |
| | 59 | <plus> + |
| | 60 | <percent> % |
| | 61 | <question> ? |
| | 62 | <dollar> $ |
| | 63 | <backslash> \ |
| | 64 | <return> carriage return (character code 0x000D) |
| | 65 | <linefeed> line feed (character code 0x000A) |
| | 66 | <tab> tab (character code 0x0009) |
| | 67 | <nul> null character (character code 0x0000) |
| | 68 | <null> null character (character code 0x0000) |
| | 69 | |
| | 70 | The following special sequences match character classes (these class |
| | 71 | names, like the character literal names above, are insensitive to case): |
| | 72 | |
| | 73 | <Alpha> any alphabetic character |
| | 74 | <Upper> any upper-case alphabetic character |
| | 75 | <Lower> any lower-case alphabetic character |
| | 76 | <Digit> any digit |
| | 77 | <AlphaNum> any alphabetic character or digit |
| | 78 | <Space> space character |
| | 79 | <Punct> punctuation character |
| | 80 | <Newline> any newline character: \n\r, \r\n, \r, \n, 0x2028 |
| | 81 | |
| | 82 | Character classes can be combined by separating class names with the '|' |
| | 83 | delimiter. In addition, a class expression can be complemented to make |
| | 84 | it an exclusion expression by putting a '^' after the opening bracket. |
| | 85 | |
| | 86 | <Upper|Digit> any uppercase letter or any digit |
| | 87 | <^Alpha> anything except an alphabetic character |
| | 88 | <^Upper|Digit> anything except an uppercase letter or a digit |
| | 89 | (note that the exclusion applies to the ENTIRE |
| | 90 | expression, so in this case both uppercase letters |
| | 91 | and digits are excluded) |
| | 92 | |
| | 93 | In addition, character classes and named literals can be combined: |
| | 94 | |
| | 95 | <Upper|Star|Plus> Any uppercase letter, or '*' or '+' |
| | 96 | |
| | 97 | Finally, literal characters and literal ranges resembling '[]' ranges |
| | 98 | can be used in angle-bracket expressions, with the limitation that each |
| | 99 | character or range must be separated with the '|' delimiter: |
| | 100 | |
| | 101 | <a|b|c> 'a' or 'b' or 'c' |
| | 102 | <Upper|a|b|c> any uppercase letter, or 'a', 'b', or 'c' |
| | 103 | <Upper|a-m> any uppercase letter, or lowercase letters 'a' to 'm' |
| | 104 | |
| | 105 | '%' is used to escape the special characters: | . ( ) * ? + ^ $ % [ |
| | 106 | (We use '%' rather than a backslash because it's less trouble to |
| | 107 | enter in a TADS string -- a backslash needs to be quoted with another |
| | 108 | backslash, which is error-prone and hard to read. '%' doesn't need |
| | 109 | any special quoting in a TADS string, which makes it a lot more |
| | 110 | readable.) |
| | 111 | |
| | 112 | In addition, '%' is used to introduce additional special sequences: |
| | 113 | |
| | 114 | %1 text matching first parenthesized expression |
| | 115 | %9 text matching ninth parenthesized experssion |
| | 116 | %< matches at the beginning of a word only |
| | 117 | %> matches at the end of a word only |
| | 118 | %w matches any word character |
| | 119 | %W matches any non-word character |
| | 120 | %b matches at any word boundary (beginning or end of word) |
| | 121 | %B matches except at a word boundary |
| | 122 | |
| | 123 | For the word matching sequences, a word is any sequence of letters and |
| | 124 | numbers. |
| | 125 | |
| | 126 | The following special sequences modify the search rules: |
| | 127 | |
| | 128 | <Case> - makes the search case-sensitive. This is the default. |
| | 129 | <NoCase> - makes the search case-insensitive. <Case> and <NoCase> |
| | 130 | are both global - the entire expression is either |
| | 131 | case-sensitive or case-insensitive. |
| | 132 | |
| | 133 | <FE> or <FirstEnd> - select the substring that ends earliest, in |
| | 134 | case of an ambiguous match. <FirstBegin> is the default. |
| | 135 | <FB> or <FirstBegin> - select the substring that starts earliest, in |
| | 136 | case of an ambiguous match. This is the default. |
| | 137 | |
| | 138 | <Min> - select the shortest matching substring, in case of an |
| | 139 | ambiguous match. <Max> is the default. |
| | 140 | <Max> - select the longest matching substring, in case of an |
| | 141 | ambiguous match. |
| | 142 | |
| | 143 | Modified |
| | 144 | 09/06/98 MJRoberts - Creation |
| | 145 | */ |
| | 146 | |
| | 147 | |
| | 148 | #include <ctype.h> |
| | 149 | #include <stdio.h> |
| | 150 | #include <stdlib.h> |
| | 151 | #include <string.h> |
| | 152 | #include <assert.h> |
| | 153 | |
| | 154 | #include "t3std.h" |
| | 155 | #include "vmregex.h" |
| | 156 | #include "utf8.h" |
| | 157 | #include "vmerr.h" |
| | 158 | #include "vmerrnum.h" |
| | 159 | #include "vmuni.h" |
| | 160 | #include "vmfile.h" |
| | 161 | |
| | 162 | /* ------------------------------------------------------------------------ */ |
| | 163 | /* |
| | 164 | * Initialize. |
| | 165 | */ |
| | 166 | CRegexParser::CRegexParser() |
| | 167 | { |
| | 168 | /* no tuple array yet */ |
| | 169 | tuple_arr_ = 0; |
| | 170 | tuples_alloc_ = 0; |
| | 171 | |
| | 172 | /* clear states */ |
| | 173 | next_state_ = RE_STATE_FIRST_VALID; |
| | 174 | |
| | 175 | /* no range buffer yet */ |
| | 176 | range_buf_ = 0; |
| | 177 | range_buf_cnt_ = 0; |
| | 178 | range_buf_max_ = 0; |
| | 179 | |
| | 180 | /* by default, our expressions are case sensitive by default */ |
| | 181 | default_case_sensitive_ = TRUE; |
| | 182 | } |
| | 183 | |
| | 184 | /* ------------------------------------------------------------------------ */ |
| | 185 | /* |
| | 186 | * Reset compiler - clears states and tuples |
| | 187 | */ |
| | 188 | void CRegexParser::reset() |
| | 189 | { |
| | 190 | int i; |
| | 191 | re_tuple *t; |
| | 192 | |
| | 193 | /* delete any range tables we've allocated */ |
| | 194 | for (i = 0, t = tuple_arr_ ; i < next_state_ ; ++i, ++t) |
| | 195 | { |
| | 196 | /* if it's a range, delete the allocated range */ |
| | 197 | if ((t->typ == RE_RANGE || t->typ == RE_RANGE_EXCL) |
| | 198 | && t->info.range.char_range != 0) |
| | 199 | { |
| | 200 | t3free(t->info.range.char_range); |
| | 201 | t->info.range.char_range = 0; |
| | 202 | } |
| | 203 | |
| | 204 | /* clear the tuple type */ |
| | 205 | t->typ = RE_INVALID; |
| | 206 | } |
| | 207 | |
| | 208 | /* clear states */ |
| | 209 | next_state_ = RE_STATE_FIRST_VALID; |
| | 210 | } |
| | 211 | |
| | 212 | /* ------------------------------------------------------------------------ */ |
| | 213 | /* |
| | 214 | * Delete |
| | 215 | */ |
| | 216 | CRegexParser::~CRegexParser() |
| | 217 | { |
| | 218 | /* reset state (to delete most of our memory structures) */ |
| | 219 | reset(); |
| | 220 | |
| | 221 | /* if we've allocated an array, delete it */ |
| | 222 | if (tuple_arr_ != 0) |
| | 223 | { |
| | 224 | t3free(tuple_arr_); |
| | 225 | tuple_arr_ = 0; |
| | 226 | } |
| | 227 | |
| | 228 | /* delete our range buffer if we have one */ |
| | 229 | if (range_buf_ != 0) |
| | 230 | { |
| | 231 | t3free(range_buf_); |
| | 232 | range_buf_ = 0; |
| | 233 | } |
| | 234 | } |
| | 235 | |
| | 236 | /* ------------------------------------------------------------------------ */ |
| | 237 | /* |
| | 238 | * Add an entry to our range buffer |
| | 239 | */ |
| | 240 | void CRegexParser::add_range_char(wchar_t ch1, wchar_t ch2) |
| | 241 | { |
| | 242 | /* |
| | 243 | * if the first character in the range is null, it requires special |
| | 244 | * representation |
| | 245 | */ |
| | 246 | if (ch1 == 0) |
| | 247 | { |
| | 248 | /* |
| | 249 | * A null character must be represented using the character class |
| | 250 | * notation for RE_NULLCHAR, because a null lead character in a |
| | 251 | * pair has the special meaning of introducing a class specifier |
| | 252 | * in the second character of the pair. So, first add the null |
| | 253 | * character as a class character. |
| | 254 | */ |
| | 255 | add_range_class(RE_NULLCHAR); |
| | 256 | |
| | 257 | /* |
| | 258 | * if the second character of the range is null, we're done, since |
| | 259 | * all we needed was the null itself |
| | 260 | */ |
| | 261 | if (ch2 == 0) |
| | 262 | return; |
| | 263 | |
| | 264 | /* |
| | 265 | * we've added the null character, so simply add the rest of the |
| | 266 | * range starting at character code 1 |
| | 267 | */ |
| | 268 | ch1 = 1; |
| | 269 | } |
| | 270 | |
| | 271 | /* make sure we have space in the range buffer for the added entry */ |
| | 272 | ensure_range_buf_space(); |
| | 273 | |
| | 274 | /* add the new entry */ |
| | 275 | range_buf_[range_buf_cnt_++] = ch1; |
| | 276 | range_buf_[range_buf_cnt_++] = ch2; |
| | 277 | } |
| | 278 | |
| | 279 | /* |
| | 280 | * Add a class to a character range. The class identifier is one of the |
| | 281 | * RE_xxx recognizer types representing a class: RE_ALPHA, RE_DIGIT, and |
| | 282 | * so on. |
| | 283 | */ |
| | 284 | void CRegexParser::add_range_class(re_recog_type cl) |
| | 285 | { |
| | 286 | /* make sure we have space in the range buffer */ |
| | 287 | ensure_range_buf_space(); |
| | 288 | |
| | 289 | /* |
| | 290 | * add the new entry - the first character of the pair is a null |
| | 291 | * character to indicate that the entry represents a class, and the |
| | 292 | * second of the pair is the type code |
| | 293 | */ |
| | 294 | range_buf_[range_buf_cnt_++] = 0; |
| | 295 | range_buf_[range_buf_cnt_++] = (wchar_t)cl; |
| | 296 | } |
| | 297 | |
| | 298 | /* |
| | 299 | * Check for space in the range buffer for a new range, expanding the |
| | 300 | * buffer if necessary. |
| | 301 | */ |
| | 302 | void CRegexParser::ensure_range_buf_space() |
| | 303 | { |
| | 304 | /* make sure we have room for another entry */ |
| | 305 | if (range_buf_ == 0) |
| | 306 | { |
| | 307 | /* |
| | 308 | * allocate an initial size - it must be even, since we always |
| | 309 | * consume elements in pairs |
| | 310 | */ |
| | 311 | range_buf_max_ = 128; |
| | 312 | range_buf_ = (wchar_t *)t3malloc(range_buf_max_ * sizeof(wchar_t)); |
| | 313 | |
| | 314 | /* no entries yet */ |
| | 315 | range_buf_cnt_ = 0; |
| | 316 | } |
| | 317 | else if (range_buf_cnt_ == range_buf_max_) |
| | 318 | { |
| | 319 | /* |
| | 320 | * reallocate the buffer at a larger size (the size must always be |
| | 321 | * even, since we always add to the buffer in pairs) |
| | 322 | */ |
| | 323 | range_buf_max_ += 128; |
| | 324 | range_buf_ = (wchar_t *)t3realloc(range_buf_, |
| | 325 | range_buf_max_ * sizeof(wchar_t)); |
| | 326 | } |
| | 327 | |
| | 328 | /* if the range buffer is null, throw an error */ |
| | 329 | if (range_buf_ == 0) |
| | 330 | err_throw(VMERR_OUT_OF_MEMORY); |
| | 331 | } |
| | 332 | |
| | 333 | |
| | 334 | /* ------------------------------------------------------------------------ */ |
| | 335 | /* |
| | 336 | * Allocate a new state ID |
| | 337 | */ |
| | 338 | re_state_id CRegexParser::alloc_state() |
| | 339 | { |
| | 340 | /* |
| | 341 | * If we don't have enough room for another state, expand the array |
| | 342 | */ |
| | 343 | if (next_state_ >= tuples_alloc_) |
| | 344 | { |
| | 345 | int new_alloc; |
| | 346 | |
| | 347 | /* bump the size by a bit */ |
| | 348 | new_alloc = tuples_alloc_ + 100; |
| | 349 | |
| | 350 | /* allocate or expand the array */ |
| | 351 | if (tuples_alloc_ == 0) |
| | 352 | { |
| | 353 | /* allocate the initial memory block */ |
| | 354 | tuple_arr_ = (re_tuple *)t3malloc(new_alloc * sizeof(re_tuple)); |
| | 355 | } |
| | 356 | else |
| | 357 | { |
| | 358 | /* re-allocate the existing memory block */ |
| | 359 | tuple_arr_ = (re_tuple *)t3realloc(tuple_arr_, |
| | 360 | new_alloc * sizeof(re_tuple)); |
| | 361 | } |
| | 362 | |
| | 363 | /* remember the new allocation size */ |
| | 364 | tuples_alloc_ = new_alloc; |
| | 365 | } |
| | 366 | |
| | 367 | /* initialize the next state */ |
| | 368 | tuple_arr_[next_state_].next_state_1 = RE_STATE_INVALID; |
| | 369 | tuple_arr_[next_state_].next_state_2 = RE_STATE_INVALID; |
| | 370 | tuple_arr_[next_state_].info.ch = RE_EPSILON; |
| | 371 | tuple_arr_[next_state_].flags = 0; |
| | 372 | tuple_arr_[next_state_].typ = RE_INVALID; |
| | 373 | |
| | 374 | /* return the new state's ID */ |
| | 375 | return next_state_++; |
| | 376 | } |
| | 377 | |
| | 378 | |
| | 379 | /* ------------------------------------------------------------------------ */ |
| | 380 | /* |
| | 381 | * Set a transition from a state to a given destination state. |
| | 382 | */ |
| | 383 | void CRegexParser::set_trans(re_state_id id, re_state_id dest_id, |
| | 384 | re_recog_type typ, wchar_t ch) |
| | 385 | { |
| | 386 | re_tuple *tuple; |
| | 387 | |
| | 388 | /* ignore invalid states */ |
| | 389 | if (id == RE_STATE_INVALID) |
| | 390 | return; |
| | 391 | |
| | 392 | /* |
| | 393 | * get the tuple containing the transitions for this state ID - the |
| | 394 | * state ID is the index of the state's transition tuple in the |
| | 395 | * array |
| | 396 | */ |
| | 397 | tuple = &tuple_arr_[id]; |
| | 398 | |
| | 399 | /* |
| | 400 | * If the first state pointer hasn't been set yet, set it to the new |
| | 401 | * destination. Otherwise, set the second state pointer. |
| | 402 | * |
| | 403 | * Only set the character on setting the first state. When setting |
| | 404 | * the second state, we must assume that the character for the state |
| | 405 | * has already been set, since any given state can have only one |
| | 406 | * character setting. |
| | 407 | */ |
| | 408 | if (tuple->next_state_1 == RE_STATE_INVALID) |
| | 409 | { |
| | 410 | /* set the transition type and character ID */ |
| | 411 | tuple->typ = typ; |
| | 412 | tuple->info.ch = ch; |
| | 413 | |
| | 414 | /* set the first transition */ |
| | 415 | tuple->next_state_1 = dest_id; |
| | 416 | } |
| | 417 | else |
| | 418 | { |
| | 419 | /* |
| | 420 | * set only the second transition state - don't set the character |
| | 421 | * ID or transition type |
| | 422 | */ |
| | 423 | tuple->next_state_2 = dest_id; |
| | 424 | } |
| | 425 | } |
| | 426 | |
| | 427 | /* ------------------------------------------------------------------------ */ |
| | 428 | /* |
| | 429 | * Initialize a new machine, giving it an initial and final state |
| | 430 | */ |
| | 431 | void CRegexParser::init_machine(re_machine *machine) |
| | 432 | { |
| | 433 | machine->init = alloc_state(); |
| | 434 | machine->final = alloc_state(); |
| | 435 | } |
| | 436 | |
| | 437 | /* |
| | 438 | * Build a character recognizer |
| | 439 | */ |
| | 440 | void CRegexParser::build_char(re_machine *machine, wchar_t ch) |
| | 441 | { |
| | 442 | /* initialize our new machine */ |
| | 443 | init_machine(machine); |
| | 444 | |
| | 445 | /* allocate a transition tuple for the new state */ |
| | 446 | set_trans(machine->init, machine->final, RE_LITERAL, ch); |
| | 447 | } |
| | 448 | |
| | 449 | /* |
| | 450 | * Build a special type recognizer |
| | 451 | */ |
| | 452 | void CRegexParser::build_special(re_machine *machine, re_recog_type typ, |
| | 453 | wchar_t ch) |
| | 454 | { |
| | 455 | /* initialize our new machine */ |
| | 456 | init_machine(machine); |
| | 457 | |
| | 458 | /* allocate a transition tuple for the new state */ |
| | 459 | set_trans(machine->init, machine->final, typ, ch); |
| | 460 | } |
| | 461 | |
| | 462 | /* |
| | 463 | * Build a character range recognizer. |
| | 464 | */ |
| | 465 | void CRegexParser::build_char_range(re_machine *machine, |
| | 466 | int exclusion) |
| | 467 | { |
| | 468 | wchar_t *range_copy; |
| | 469 | |
| | 470 | /* initialize our new machine */ |
| | 471 | init_machine(machine); |
| | 472 | |
| | 473 | /* allocate a transition table for the new state */ |
| | 474 | set_trans(machine->init, machine->final, |
| | 475 | (exclusion ? RE_RANGE_EXCL : RE_RANGE), 0); |
| | 476 | |
| | 477 | /* allocate a copy of the range vector */ |
| | 478 | range_copy = (wchar_t *)t3malloc(range_buf_cnt_ * sizeof(wchar_t)); |
| | 479 | |
| | 480 | /* copy the caller's range */ |
| | 481 | memcpy(range_copy, range_buf_, range_buf_cnt_ * sizeof(wchar_t)); |
| | 482 | |
| | 483 | /* store it in the tuple */ |
| | 484 | tuple_arr_[machine->init].info.range.char_range = range_copy; |
| | 485 | tuple_arr_[machine->init].info.range.char_range_cnt = range_buf_cnt_; |
| | 486 | } |
| | 487 | |
| | 488 | |
| | 489 | /* |
| | 490 | * Build a group recognizer. This is almost the same as a character |
| | 491 | * recognizer, but matches a previous group rather than a literal |
| | 492 | * character. |
| | 493 | */ |
| | 494 | void CRegexParser::build_group_matcher(re_machine *machine, int group_num) |
| | 495 | { |
| | 496 | /* initialize our new machine */ |
| | 497 | init_machine(machine); |
| | 498 | |
| | 499 | /* |
| | 500 | * Allocate a transition tuple for the new state for the group ID. |
| | 501 | * Rather than storing a literal character code, store the group ID |
| | 502 | * in the character code slot. |
| | 503 | */ |
| | 504 | set_trans(machine->init, machine->final, |
| | 505 | RE_GROUP_MATCH, (wchar_t)group_num); |
| | 506 | } |
| | 507 | |
| | 508 | |
| | 509 | /* |
| | 510 | * Build a concatenation recognizer |
| | 511 | */ |
| | 512 | void CRegexParser::build_concat(re_machine *new_machine, |
| | 513 | re_machine *lhs, re_machine *rhs) |
| | 514 | { |
| | 515 | /* initialize the new machine */ |
| | 516 | init_machine(new_machine); |
| | 517 | |
| | 518 | /* |
| | 519 | * set up an epsilon transition from the new machine's initial state |
| | 520 | * to the first submachine's initial state |
| | 521 | */ |
| | 522 | set_trans(new_machine->init, lhs->init, RE_EPSILON, 0); |
| | 523 | |
| | 524 | /* |
| | 525 | * Set up an epsilon transition from the first submachine's final |
| | 526 | * state to the second submachine's initial state |
| | 527 | */ |
| | 528 | set_trans(lhs->final, rhs->init, RE_EPSILON, 0); |
| | 529 | |
| | 530 | /* |
| | 531 | * Set up an epsilon transition from the second submachine's final |
| | 532 | * state to our new machine's final state |
| | 533 | */ |
| | 534 | set_trans(rhs->final, new_machine->final, RE_EPSILON, 0); |
| | 535 | } |
| | 536 | |
| | 537 | /* |
| | 538 | * Build a group machine. sub_machine contains the machine that |
| | 539 | * expresses the group's contents; we'll fill in new_machine with a |
| | 540 | * newly-created machine that encloses and marks the group. |
| | 541 | */ |
| | 542 | void CRegexParser::build_group(re_machine *new_machine, |
| | 543 | re_machine *sub_machine, int group_id) |
| | 544 | { |
| | 545 | /* initialize the container machine */ |
| | 546 | init_machine(new_machine); |
| | 547 | |
| | 548 | /* |
| | 549 | * Set up a group-entry transition from the new machine's initial |
| | 550 | * state into the initial state of the group, and a group-exit |
| | 551 | * transition from the group's final state into the container's final |
| | 552 | * state. For both transitions, store the group ID in the character |
| | 553 | * field of the transition, to identify which group is affected. |
| | 554 | */ |
| | 555 | set_trans(new_machine->init, sub_machine->init, |
| | 556 | RE_GROUP_ENTER, (wchar_t)group_id); |
| | 557 | set_trans(sub_machine->final, new_machine->final, |
| | 558 | RE_GROUP_EXIT, (wchar_t)group_id); |
| | 559 | } |
| | 560 | |
| | 561 | /* |
| | 562 | * Build an assertion machine |
| | 563 | */ |
| | 564 | void CRegexParser::build_assert(re_machine *new_machine, |
| | 565 | re_machine *sub_machine, int is_negative) |
| | 566 | { |
| | 567 | /* initialize the container machine */ |
| | 568 | init_machine(new_machine); |
| | 569 | |
| | 570 | /* allocate a transition tuple for the new state */ |
| | 571 | set_trans(new_machine->init, new_machine->final, |
| | 572 | is_negative ? RE_ASSERT_NEG : RE_ASSERT_POS, 0); |
| | 573 | |
| | 574 | /* set the sub-state */ |
| | 575 | tuple_arr_[new_machine->init].info.sub.init = sub_machine->init; |
| | 576 | tuple_arr_[new_machine->init].info.sub.final = sub_machine->final; |
| | 577 | } |
| | 578 | |
| | 579 | |
| | 580 | /* |
| | 581 | * Build an alternation recognizer |
| | 582 | */ |
| | 583 | void CRegexParser::build_alter(re_machine *new_machine, |
| | 584 | re_machine *lhs, re_machine *rhs) |
| | 585 | { |
| | 586 | /* initialize the new machine */ |
| | 587 | init_machine(new_machine); |
| | 588 | |
| | 589 | /* |
| | 590 | * Set up an epsilon transition from our new machine's initial state |
| | 591 | * to the initial state of each submachine |
| | 592 | */ |
| | 593 | set_trans(new_machine->init, lhs->init, RE_EPSILON, 0); |
| | 594 | set_trans(new_machine->init, rhs->init, RE_EPSILON, 0); |
| | 595 | |
| | 596 | /* |
| | 597 | * Set up an epsilon transition from the final state of each |
| | 598 | * submachine to our final state |
| | 599 | */ |
| | 600 | set_trans(lhs->final, new_machine->final, RE_EPSILON, 0); |
| | 601 | set_trans(rhs->final, new_machine->final, RE_EPSILON, 0); |
| | 602 | } |
| | 603 | |
| | 604 | /* |
| | 605 | * Build a closure recognizer |
| | 606 | */ |
| | 607 | void CRegexParser::build_closure(re_machine *new_machine, re_machine *sub, |
| | 608 | wchar_t specifier, int shortest) |
| | 609 | { |
| | 610 | /* initialize the new machine */ |
| | 611 | init_machine(new_machine); |
| | 612 | |
| | 613 | /* |
| | 614 | * Set up an epsilon transition from our initial state to the |
| | 615 | * submachine's initial state. However, if we're in shortest mode, |
| | 616 | * wait to do this until after we've generated the rest of the |
| | 617 | * machine, and instead set up the transition from the submachine's |
| | 618 | * final state to our final state. The order is important because |
| | 619 | * we will favor the first branch taken when we find two matches of |
| | 620 | * equal total length; hence we want to make the branch that will |
| | 621 | * give us either the longest match or shortest match for this |
| | 622 | * closure first, depending on which way we want to go. |
| | 623 | */ |
| | 624 | if (!shortest) |
| | 625 | set_trans(new_machine->init, sub->init, RE_EPSILON, 0); |
| | 626 | else |
| | 627 | set_trans(sub->final, new_machine->final, RE_EPSILON, 0); |
| | 628 | |
| | 629 | /* |
| | 630 | * If this is an unbounded closure ('*' or '+', but not '?'), set up |
| | 631 | * the loop transition that takes us from the new machine's final |
| | 632 | * state back to its initial state. We don't do this on the |
| | 633 | * zero-or-one closure, because we can only match the expression |
| | 634 | * once. |
| | 635 | */ |
| | 636 | if (specifier != '?') |
| | 637 | { |
| | 638 | /* set the transition */ |
| | 639 | set_trans(sub->final, sub->init, RE_EPSILON, 0); |
| | 640 | |
| | 641 | /* if we have a 'shortest' modifier, flag it in this branch */ |
| | 642 | if (shortest) |
| | 643 | tuple_arr_[sub->final].flags |= RE_STATE_SHORTEST; |
| | 644 | } |
| | 645 | |
| | 646 | /* |
| | 647 | * If this is a zero-or-one closure or a zero-or-more closure, set |
| | 648 | * up an epsilon transition from our initial state to our final |
| | 649 | * state, since we can skip the entire subexpression. We don't do |
| | 650 | * this on the one-or-more closure, because we can't skip the |
| | 651 | * subexpression in this case. |
| | 652 | */ |
| | 653 | if (specifier != '+') |
| | 654 | { |
| | 655 | /* set the transition */ |
| | 656 | set_trans(new_machine->init, new_machine->final, RE_EPSILON, 0); |
| | 657 | |
| | 658 | /* if we have a 'shortest' modifier, flag it in this branch */ |
| | 659 | if (shortest) |
| | 660 | tuple_arr_[new_machine->init].flags |= RE_STATE_SHORTEST; |
| | 661 | } |
| | 662 | |
| | 663 | /* |
| | 664 | * Set up a transition from the submachine's final state to our |
| | 665 | * final state. We waited until here to ensure proper ordering for |
| | 666 | * longest-preferred. If we're in shortest-preferred mode, though, |
| | 667 | * set up the initial transition to the submachine instead. |
| | 668 | */ |
| | 669 | if (!shortest) |
| | 670 | set_trans(sub->final, new_machine->final, RE_EPSILON, 0); |
| | 671 | else |
| | 672 | set_trans(new_machine->init, sub->init, RE_EPSILON, 0); |
| | 673 | } |
| | 674 | |
| | 675 | void CRegexParser::build_interval(re_machine *new_machine, |
| | 676 | re_machine *sub, int min_val, int max_val, |
| | 677 | int var_id, int shortest) |
| | 678 | { |
| | 679 | re_machine inner_machine; |
| | 680 | |
| | 681 | /* initialize the outer (new) machine */ |
| | 682 | init_machine(new_machine); |
| | 683 | |
| | 684 | /* initialize the inner machine */ |
| | 685 | init_machine(&inner_machine); |
| | 686 | |
| | 687 | /* |
| | 688 | * Set the loop transition into the submachine, and set the other to |
| | 689 | * bypass the submachine. If we have a 'shortest' modifier, take |
| | 690 | * the bypass branch first, otherwise take the enter branch first. |
| | 691 | */ |
| | 692 | if (shortest) |
| | 693 | { |
| | 694 | set_trans(inner_machine.init, new_machine->final, RE_LOOP_BRANCH, 0); |
| | 695 | set_trans(inner_machine.init, sub->init, RE_LOOP_BRANCH, 0); |
| | 696 | } |
| | 697 | else |
| | 698 | { |
| | 699 | set_trans(inner_machine.init, sub->init, RE_LOOP_BRANCH, 0); |
| | 700 | set_trans(inner_machine.init, new_machine->final, RE_LOOP_BRANCH, 0); |
| | 701 | } |
| | 702 | |
| | 703 | /* |
| | 704 | * set the final transition of the submachine to come back to the |
| | 705 | * loop branch point |
| | 706 | */ |
| | 707 | set_trans(sub->final, inner_machine.init, RE_EPSILON, 0); |
| | 708 | |
| | 709 | /* |
| | 710 | * set the outer machine to transition into the inner machine and |
| | 711 | * zero the loop variable |
| | 712 | */ |
| | 713 | set_trans(new_machine->init, inner_machine.init, RE_ZERO_VAR, 0); |
| | 714 | |
| | 715 | /* set the variable ID in the ZERO_VAR node */ |
| | 716 | tuple_arr_[new_machine->init].info.loop.loop_var = var_id; |
| | 717 | |
| | 718 | /* set up the loop parameters in the loop node */ |
| | 719 | tuple_arr_[inner_machine.init].info.loop.loop_var = var_id; |
| | 720 | tuple_arr_[inner_machine.init].info.loop.loop_min = min_val; |
| | 721 | tuple_arr_[inner_machine.init].info.loop.loop_max = max_val; |
| | 722 | |
| | 723 | /* |
| | 724 | * if there's a 'shortest' modifier, note it in the loop node, so |
| | 725 | * that we can take the bypass branch first whenever possible |
| | 726 | */ |
| | 727 | if (shortest) |
| | 728 | tuple_arr_[inner_machine.init].flags |= RE_STATE_SHORTEST; |
| | 729 | } |
| | 730 | |
| | 731 | /* |
| | 732 | * Build a null machine |
| | 733 | */ |
| | 734 | void CRegexParser::build_null_machine(re_machine *machine) |
| | 735 | { |
| | 736 | machine->init = machine->final = RE_STATE_INVALID; |
| | 737 | } |
| | 738 | |
| | 739 | /* ------------------------------------------------------------------------ */ |
| | 740 | /* |
| | 741 | * Determine if a machine is null |
| | 742 | */ |
| | 743 | int CRegexParser::is_machine_null(re_machine *machine) |
| | 744 | { |
| | 745 | return (machine->init == RE_STATE_INVALID); |
| | 746 | } |
| | 747 | |
| | 748 | |
| | 749 | /* ------------------------------------------------------------------------ */ |
| | 750 | /* |
| | 751 | * Concatenate the second machine onto the first machine, replacing the |
| | 752 | * first machine with the resulting machine. If the first machine is a |
| | 753 | * null machine (created with re_build_null_machine), we'll simply copy |
| | 754 | * the second machine into the first. |
| | 755 | */ |
| | 756 | void CRegexParser::concat_onto(re_machine *dest, re_machine *rhs) |
| | 757 | { |
| | 758 | /* check for a null destination machine */ |
| | 759 | if (is_machine_null(dest)) |
| | 760 | { |
| | 761 | /* |
| | 762 | * the first machine is null - simply copy the second machine |
| | 763 | * onto the first unchanged |
| | 764 | */ |
| | 765 | *dest = *rhs; |
| | 766 | } |
| | 767 | else |
| | 768 | { |
| | 769 | re_machine new_machine; |
| | 770 | |
| | 771 | /* build the concatenated machine */ |
| | 772 | build_concat(&new_machine, dest, rhs); |
| | 773 | |
| | 774 | /* copy the concatenated machine onto the first machine */ |
| | 775 | *dest = new_machine; |
| | 776 | } |
| | 777 | } |
| | 778 | |
| | 779 | /* |
| | 780 | * Alternate the second machine onto the first machine, replacing the |
| | 781 | * first machine with the resulting machine. If the first machine is a |
| | 782 | * null machine, this simply replaces the first machine with the second |
| | 783 | * machine. If the second machine is null, this simply leaves the first |
| | 784 | * machine unchanged. |
| | 785 | */ |
| | 786 | void CRegexParser::alternate_onto(re_machine *dest, re_machine *rhs) |
| | 787 | { |
| | 788 | /* check to see if the first machine is null */ |
| | 789 | if (is_machine_null(dest)) |
| | 790 | { |
| | 791 | /* |
| | 792 | * the first machine is null - simply copy the second machine |
| | 793 | * onto the first |
| | 794 | */ |
| | 795 | *dest = *rhs; |
| | 796 | } |
| | 797 | else |
| | 798 | { |
| | 799 | /* |
| | 800 | * if the second machine is null, don't do anything; otherwise, |
| | 801 | * build the alternation |
| | 802 | */ |
| | 803 | if (!is_machine_null(rhs)) |
| | 804 | { |
| | 805 | re_machine new_machine; |
| | 806 | |
| | 807 | /* build the alternation */ |
| | 808 | build_alter(&new_machine, dest, rhs); |
| | 809 | |
| | 810 | /* replace the first machine with the alternation */ |
| | 811 | *dest = new_machine; |
| | 812 | } |
| | 813 | } |
| | 814 | } |
| | 815 | |
| | 816 | /* ------------------------------------------------------------------------ */ |
| | 817 | /* |
| | 818 | * Compile an expression |
| | 819 | */ |
| | 820 | re_status_t CRegexParser::compile(const char *expr_str, size_t exprlen, |
| | 821 | re_compiled_pattern_base *pat) |
| | 822 | { |
| | 823 | re_machine cur_machine; |
| | 824 | re_machine alter_machine; |
| | 825 | re_machine new_machine; |
| | 826 | int group_stack_level; |
| | 827 | struct |
| | 828 | { |
| | 829 | re_machine old_cur; |
| | 830 | re_machine old_alter; |
| | 831 | int group_id; |
| | 832 | int capturing; |
| | 833 | int neg_assertion; |
| | 834 | int pos_assertion; |
| | 835 | } group_stack[RE_GROUP_NESTING_MAX]; |
| | 836 | utf8_ptr expr; |
| | 837 | |
| | 838 | /* reset everything */ |
| | 839 | reset(); |
| | 840 | |
| | 841 | /* we have no groups yet */ |
| | 842 | pat->group_cnt = 0; |
| | 843 | |
| | 844 | /* we have no looping variables yet */ |
| | 845 | pat->loop_var_cnt = 0; |
| | 846 | |
| | 847 | /* |
| | 848 | * set the default match modes - maximum, first-beginning, |
| | 849 | * case-sensitive |
| | 850 | */ |
| | 851 | pat->longest_match = TRUE; |
| | 852 | pat->first_begin = TRUE; |
| | 853 | pat->case_sensitive = default_case_sensitive_; |
| | 854 | |
| | 855 | /* start out with no current machine and no alternate machine */ |
| | 856 | build_null_machine(&cur_machine); |
| | 857 | build_null_machine(&alter_machine); |
| | 858 | |
| | 859 | /* nothing on the stack yet */ |
| | 860 | group_stack_level = 0; |
| | 861 | |
| | 862 | /* loop until we run out of expression to parse */ |
| | 863 | for (expr.set((char *)expr_str) ; exprlen != 0 ; expr.inc(&exprlen)) |
| | 864 | { |
| | 865 | switch(expr.getch()) |
| | 866 | { |
| | 867 | case '^': |
| | 868 | /* |
| | 869 | * beginning of line - if we're not at the beginning of the |
| | 870 | * current expression (i.e., we already have some |
| | 871 | * concatentations accumulated), treat it as an ordinary |
| | 872 | * character |
| | 873 | */ |
| | 874 | if (!is_machine_null(&cur_machine)) |
| | 875 | goto normal_char; |
| | 876 | |
| | 877 | /* build a new start-of-text recognizer */ |
| | 878 | build_special(&new_machine, RE_TEXT_BEGIN, 0); |
| | 879 | |
| | 880 | /* |
| | 881 | * concatenate it onto the string - note that this can't |
| | 882 | * have any postfix operators |
| | 883 | */ |
| | 884 | concat_onto(&cur_machine, &new_machine); |
| | 885 | break; |
| | 886 | |
| | 887 | case '$': |
| | 888 | /* |
| | 889 | * End of line specifier - if there's anything left after |
| | 890 | * the '$' other than a close parens or alternation |
| | 891 | * specifier, treat it as a normal character |
| | 892 | */ |
| | 893 | if (exprlen > 1 |
| | 894 | && (expr.getch_at(1) != ')' && expr.getch_at(1) != '|')) |
| | 895 | goto normal_char; |
| | 896 | |
| | 897 | /* build a new end-of-text recognizer */ |
| | 898 | build_special(&new_machine, RE_TEXT_END, 0); |
| | 899 | |
| | 900 | /* |
| | 901 | * concatenate it onto the string - note that this can't |
| | 902 | * have any postfix operators |
| | 903 | */ |
| | 904 | concat_onto(&cur_machine, &new_machine); |
| | 905 | break; |
| | 906 | |
| | 907 | case '(': |
| | 908 | { |
| | 909 | int capturing; |
| | 910 | int pos_assertion; |
| | 911 | int neg_assertion; |
| | 912 | |
| | 913 | /* presume it's a capturing group */ |
| | 914 | capturing = TRUE; |
| | 915 | |
| | 916 | /* presume it's not an assertion */ |
| | 917 | pos_assertion = FALSE; |
| | 918 | neg_assertion = FALSE; |
| | 919 | |
| | 920 | /* |
| | 921 | * Add a nesting level. Push the current machine and |
| | 922 | * alternate machines onto the group stack, and clear |
| | 923 | * everything out for the new group. |
| | 924 | */ |
| | 925 | if (group_stack_level > RE_GROUP_NESTING_MAX) |
| | 926 | { |
| | 927 | /* we cannot proceed - return an error */ |
| | 928 | return RE_STATUS_GROUP_NESTING_TOO_DEEP; |
| | 929 | } |
| | 930 | |
| | 931 | /* save the current state on the stack */ |
| | 932 | group_stack[group_stack_level].old_cur = cur_machine; |
| | 933 | group_stack[group_stack_level].old_alter = alter_machine; |
| | 934 | |
| | 935 | /* |
| | 936 | * Assign the group a group ID - groups are numbered in |
| | 937 | * order of their opening (left) parentheses, so we want |
| | 938 | * to assign a group number now. We won't actually need |
| | 939 | * to know the group number until we get to the matching |
| | 940 | * close paren, but we need to assign it now, so store |
| | 941 | * it in the group stack. |
| | 942 | */ |
| | 943 | group_stack[group_stack_level].group_id = pat->group_cnt; |
| | 944 | |
| | 945 | /* check for special group flags */ |
| | 946 | if (exprlen > 2 && expr.getch_at(1) == '?') |
| | 947 | { |
| | 948 | switch(expr.getch_at(2)) |
| | 949 | { |
| | 950 | case ':': |
| | 951 | /* it's a non-capturing group */ |
| | 952 | capturing = FALSE; |
| | 953 | |
| | 954 | /* skip two extra characters for the '?:' part */ |
| | 955 | expr.inc(&exprlen); |
| | 956 | expr.inc(&exprlen); |
| | 957 | break; |
| | 958 | |
| | 959 | case '=': |
| | 960 | /* it's a positive assertion group */ |
| | 961 | pos_assertion = TRUE; |
| | 962 | |
| | 963 | /* assertions don't capture */ |
| | 964 | capturing = FALSE; |
| | 965 | |
| | 966 | /* skip two extra characters for the '?=' part */ |
| | 967 | expr.inc(&exprlen); |
| | 968 | expr.inc(&exprlen); |
| | 969 | break; |
| | 970 | |
| | 971 | case '!': |
| | 972 | /* it's a negative assertion group */ |
| | 973 | neg_assertion = TRUE; |
| | 974 | |
| | 975 | /* assertions don't capture */ |
| | 976 | capturing = FALSE; |
| | 977 | |
| | 978 | /* skip two extra characters for the '?!' part */ |
| | 979 | expr.inc(&exprlen); |
| | 980 | expr.inc(&exprlen); |
| | 981 | break; |
| | 982 | |
| | 983 | default: |
| | 984 | /* it's not a recognized sequence - ignore it */ |
| | 985 | break; |
| | 986 | } |
| | 987 | } |
| | 988 | |
| | 989 | /* remember if the group is capturing */ |
| | 990 | group_stack[group_stack_level].capturing = capturing; |
| | 991 | |
| | 992 | /* remember if it's an assertion of some kind */ |
| | 993 | group_stack[group_stack_level].pos_assertion = pos_assertion; |
| | 994 | group_stack[group_stack_level].neg_assertion = neg_assertion; |
| | 995 | |
| | 996 | /* consume the group number if it's a capturing group */ |
| | 997 | if (capturing) |
| | 998 | ++(pat->group_cnt); |
| | 999 | |
| | 1000 | /* push the level */ |
| | 1001 | ++group_stack_level; |
| | 1002 | |
| | 1003 | /* start the new group with empty machines */ |
| | 1004 | build_null_machine(&cur_machine); |
| | 1005 | build_null_machine(&alter_machine); |
| | 1006 | } |
| | 1007 | break; |
| | 1008 | |
| | 1009 | case ')': |
| | 1010 | do_close_paren: |
| | 1011 | /* if there's nothing on the stack, ignore this */ |
| | 1012 | if (group_stack_level == 0) |
| | 1013 | break; |
| | 1014 | |
| | 1015 | /* take a level off the stack */ |
| | 1016 | --group_stack_level; |
| | 1017 | |
| | 1018 | /* |
| | 1019 | * Remove a nesting level. If we have a pending alternate |
| | 1020 | * expression, build the alternation expression. This will |
| | 1021 | * leave the entire group expression in alter_machine, |
| | 1022 | * regardless of whether an alternation was in progress or |
| | 1023 | * not. |
| | 1024 | */ |
| | 1025 | alternate_onto(&alter_machine, &cur_machine); |
| | 1026 | |
| | 1027 | /* |
| | 1028 | * Create a group machine that encloses the group and marks |
| | 1029 | * it with a group number. We assigned the group number |
| | 1030 | * when we parsed the open paren, so read that group number |
| | 1031 | * from the stack. |
| | 1032 | * |
| | 1033 | * Note that this will leave 'new_machine' with the entire |
| | 1034 | * group machine. |
| | 1035 | * |
| | 1036 | * If this is a non-capturing group, don't bother building |
| | 1037 | * the new machine - just copy the current alternation |
| | 1038 | * machine onto the new machine. |
| | 1039 | */ |
| | 1040 | if (group_stack[group_stack_level].capturing) |
| | 1041 | { |
| | 1042 | /* it's a regular capturing group - add the group machine */ |
| | 1043 | build_group(&new_machine, &alter_machine, |
| | 1044 | group_stack[group_stack_level].group_id); |
| | 1045 | } |
| | 1046 | else if (group_stack[group_stack_level].pos_assertion |
| | 1047 | || group_stack[group_stack_level].neg_assertion) |
| | 1048 | { |
| | 1049 | /* it's an assertion - build the assertion group */ |
| | 1050 | build_assert(&new_machine, &alter_machine, |
| | 1051 | group_stack[group_stack_level].neg_assertion); |
| | 1052 | } |
| | 1053 | else |
| | 1054 | { |
| | 1055 | /* it's a non-capturing group - just add the group tree */ |
| | 1056 | new_machine = alter_machine; |
| | 1057 | } |
| | 1058 | |
| | 1059 | /* |
| | 1060 | * Pop the stack - restore the alternation and current |
| | 1061 | * machines that were in progress before the group started. |
| | 1062 | */ |
| | 1063 | cur_machine = group_stack[group_stack_level].old_cur; |
| | 1064 | alter_machine = group_stack[group_stack_level].old_alter; |
| | 1065 | |
| | 1066 | /* |
| | 1067 | * If we just did an assertion, ignore any closure postfix |
| | 1068 | * operators. Since an assertion doesn't consume any input |
| | 1069 | * text, applying a closure would create an infinite loop. |
| | 1070 | */ |
| | 1071 | if (group_stack[group_stack_level].pos_assertion |
| | 1072 | || group_stack[group_stack_level].neg_assertion) |
| | 1073 | goto skip_closures; |
| | 1074 | |
| | 1075 | /* |
| | 1076 | * Check the group expression (in new_machine) for postfix |
| | 1077 | * expressions |
| | 1078 | */ |
| | 1079 | goto apply_postfix; |
| | 1080 | |
| | 1081 | case '|': |
| | 1082 | /* |
| | 1083 | * Start a new alternation. This ends the current |
| | 1084 | * alternation; if we have a previous pending alternate, |
| | 1085 | * build an alternation machine out of the previous |
| | 1086 | * alternate and the current machine and move that to the |
| | 1087 | * alternate; otherwise, simply move the current machine to |
| | 1088 | * the pending alternate. |
| | 1089 | */ |
| | 1090 | alternate_onto(&alter_machine, &cur_machine); |
| | 1091 | |
| | 1092 | /* |
| | 1093 | * the alternation starts out with a blank slate, so null |
| | 1094 | * out the current machine |
| | 1095 | */ |
| | 1096 | build_null_machine(&cur_machine); |
| | 1097 | break; |
| | 1098 | |
| | 1099 | case '<': |
| | 1100 | /* check for our various special directives */ |
| | 1101 | if ((exprlen >= 4 && memicmp(expr.getptr(), "<FE>", 4) == 0) |
| | 1102 | || (exprlen >= 10 |
| | 1103 | && memicmp(expr.getptr(), "<FirstEnd>", 10) == 0)) |
| | 1104 | { |
| | 1105 | /* turn off first-begin mode */ |
| | 1106 | pat->first_begin = FALSE; |
| | 1107 | } |
| | 1108 | else if ((exprlen >= 4 && memicmp(expr.getptr(), "<FB>", 4) == 0) |
| | 1109 | || (exprlen >= 12 |
| | 1110 | && memicmp(expr.getptr(), "<FirstBegin>", 12) == 0)) |
| | 1111 | { |
| | 1112 | /* turn on first-begin mode */ |
| | 1113 | pat->first_begin = TRUE; |
| | 1114 | } |
| | 1115 | else if (exprlen >= 5 && memicmp(expr.getptr(), "<Max>", 5) == 0) |
| | 1116 | { |
| | 1117 | /* turn on longest-match mode */ |
| | 1118 | pat->longest_match = TRUE; |
| | 1119 | } |
| | 1120 | else if (exprlen >= 5 && memicmp(expr.getptr(), "<Min>", 5) == 0) |
| | 1121 | { |
| | 1122 | /* turn off longest-match mode */ |
| | 1123 | pat->longest_match = FALSE; |
| | 1124 | } |
| | 1125 | else if (exprlen >= 6 && memicmp(expr.getptr(), "<Case>", 6) == 0) |
| | 1126 | { |
| | 1127 | /* turn on case sensitivity */ |
| | 1128 | pat->case_sensitive = TRUE; |
| | 1129 | } |
| | 1130 | else if (exprlen >= 8 |
| | 1131 | && memicmp(expr.getptr(), "<NoCase>", 8) == 0) |
| | 1132 | { |
| | 1133 | /* turn off case sensitivity */ |
| | 1134 | pat->case_sensitive = FALSE; |
| | 1135 | } |
| | 1136 | else |
| | 1137 | { |
| | 1138 | /* |
| | 1139 | * It's nothing else we recognize, so it must be a |
| | 1140 | * character class or class range expression, which |
| | 1141 | * consists of one or more classes, single characters, or |
| | 1142 | * character ranges separated by '|' delimiters. |
| | 1143 | */ |
| | 1144 | if (compile_char_class_expr(&expr, &exprlen, &new_machine)) |
| | 1145 | { |
| | 1146 | /* success - look for postfix operators */ |
| | 1147 | goto apply_postfix; |
| | 1148 | } |
| | 1149 | else |
| | 1150 | { |
| | 1151 | /* |
| | 1152 | * failure - treat the whole thing as ordinary |
| | 1153 | * characters |
| | 1154 | */ |
| | 1155 | goto normal_char; |
| | 1156 | } |
| | 1157 | } |
| | 1158 | |
| | 1159 | /* skip everything up to the closing ">" */ |
| | 1160 | for ( ; exprlen > 0 && expr.getch() != '>' ; expr.inc(&exprlen)) ; |
| | 1161 | break; |
| | 1162 | |
| | 1163 | case '%': |
| | 1164 | /* |
| | 1165 | * quoted character - skip the quote mark and see what we |
| | 1166 | * have |
| | 1167 | */ |
| | 1168 | expr.inc(&exprlen); |
| | 1169 | |
| | 1170 | /* check to see if we're at the end of the expression */ |
| | 1171 | if (exprlen == 0) |
| | 1172 | { |
| | 1173 | /* |
| | 1174 | * end of the string - ignore it, but undo the extra |
| | 1175 | * increment of the expression index so that we exit the |
| | 1176 | * enclosing loop properly |
| | 1177 | */ |
| | 1178 | expr.dec(&exprlen); |
| | 1179 | break; |
| | 1180 | } |
| | 1181 | |
| | 1182 | /* see what we have */ |
| | 1183 | switch(expr.getch()) |
| | 1184 | { |
| | 1185 | case '1': |
| | 1186 | case '2': |
| | 1187 | case '3': |
| | 1188 | case '4': |
| | 1189 | case '5': |
| | 1190 | case '6': |
| | 1191 | case '7': |
| | 1192 | case '8': |
| | 1193 | case '9': |
| | 1194 | /* group match - build a new literal group recognizer */ |
| | 1195 | build_group_matcher(&new_machine, |
| | 1196 | value_of_digit(expr.getch()) - 1); |
| | 1197 | |
| | 1198 | /* apply any postfix expression to the group recognizer */ |
| | 1199 | goto apply_postfix; |
| | 1200 | |
| | 1201 | case '<': |
| | 1202 | /* build a beginning-of-word recognizer */ |
| | 1203 | build_special(&new_machine, RE_WORD_BEGIN, 0); |
| | 1204 | |
| | 1205 | /* it can't be postfixed - just concatenate it */ |
| | 1206 | concat_onto(&cur_machine, &new_machine); |
| | 1207 | break; |
| | 1208 | |
| | 1209 | case '>': |
| | 1210 | /* build an end-of-word recognizer */ |
| | 1211 | build_special(&new_machine, RE_WORD_END, 0); |
| | 1212 | |
| | 1213 | /* it can't be postfixed - just concatenate it */ |
| | 1214 | concat_onto(&cur_machine, &new_machine); |
| | 1215 | break; |
| | 1216 | |
| | 1217 | case 'w': |
| | 1218 | /* word character */ |
| | 1219 | build_special(&new_machine, RE_WORD_CHAR, 0); |
| | 1220 | goto apply_postfix; |
| | 1221 | |
| | 1222 | case 'W': |
| | 1223 | /* non-word character */ |
| | 1224 | build_special(&new_machine, RE_NON_WORD_CHAR, 0); |
| | 1225 | goto apply_postfix; |
| | 1226 | |
| | 1227 | case 'b': |
| | 1228 | /* word boundary */ |
| | 1229 | build_special(&new_machine, RE_WORD_BOUNDARY, 0); |
| | 1230 | |
| | 1231 | /* it can't be postfixed */ |
| | 1232 | concat_onto(&cur_machine, &new_machine); |
| | 1233 | break; |
| | 1234 | |
| | 1235 | case 'B': |
| | 1236 | /* not a word boundary */ |
| | 1237 | build_special(&new_machine, RE_NON_WORD_BOUNDARY, 0); |
| | 1238 | |
| | 1239 | /* it can't be postfixed */ |
| | 1240 | concat_onto(&cur_machine, &new_machine); |
| | 1241 | break; |
| | 1242 | |
| | 1243 | default: |
| | 1244 | /* build a new literal character recognizer */ |
| | 1245 | build_char(&new_machine, expr.getch()); |
| | 1246 | |
| | 1247 | /* apply any postfix expression to the character */ |
| | 1248 | goto apply_postfix; |
| | 1249 | } |
| | 1250 | break; |
| | 1251 | |
| | 1252 | case '.': |
| | 1253 | /* |
| | 1254 | * wildcard character - build a single character recognizer |
| | 1255 | * for the special wildcard symbol, then go check it for a |
| | 1256 | * postfix operator |
| | 1257 | */ |
| | 1258 | build_special(&new_machine, RE_WILDCARD, 0); |
| | 1259 | goto apply_postfix; |
| | 1260 | break; |
| | 1261 | |
| | 1262 | case '[': |
| | 1263 | /* range expression */ |
| | 1264 | { |
| | 1265 | int is_exclusive = FALSE; |
| | 1266 | |
| | 1267 | /* we have no entries yet */ |
| | 1268 | range_buf_cnt_ = 0; |
| | 1269 | |
| | 1270 | /* first, skip the open bracket */ |
| | 1271 | expr.inc(&exprlen); |
| | 1272 | |
| | 1273 | /* check to see if starts with the exclusion character */ |
| | 1274 | if (exprlen != 0 && expr.getch() == '^') |
| | 1275 | { |
| | 1276 | /* skip the exclusion specifier */ |
| | 1277 | expr.inc(&exprlen); |
| | 1278 | |
| | 1279 | /* note it */ |
| | 1280 | is_exclusive = TRUE; |
| | 1281 | } |
| | 1282 | |
| | 1283 | /* |
| | 1284 | * if the first character is a ']', include it in the |
| | 1285 | * range |
| | 1286 | */ |
| | 1287 | if (exprlen != 0 && expr.getch() == ']') |
| | 1288 | { |
| | 1289 | add_range_char(']'); |
| | 1290 | expr.inc(&exprlen); |
| | 1291 | } |
| | 1292 | |
| | 1293 | /* |
| | 1294 | * if the next character is a '-', include it in the |
| | 1295 | * range |
| | 1296 | */ |
| | 1297 | if (exprlen != 0 && expr.getch() == '-') |
| | 1298 | { |
| | 1299 | add_range_char('-'); |
| | 1300 | expr.inc(&exprlen); |
| | 1301 | } |
| | 1302 | |
| | 1303 | /* scan the character set */ |
| | 1304 | while (exprlen != 0 && expr.getch() != ']') |
| | 1305 | { |
| | 1306 | wchar_t ch; |
| | 1307 | |
| | 1308 | /* note this character */ |
| | 1309 | ch = expr.getch(); |
| | 1310 | |
| | 1311 | /* skip this character of the expression */ |
| | 1312 | expr.inc(&exprlen); |
| | 1313 | |
| | 1314 | /* check for a range */ |
| | 1315 | if (exprlen != 0 && expr.getch() == '-') |
| | 1316 | { |
| | 1317 | wchar_t ch2; |
| | 1318 | |
| | 1319 | /* skip the '-' */ |
| | 1320 | expr.inc(&exprlen); |
| | 1321 | if (exprlen != 0) |
| | 1322 | { |
| | 1323 | /* get the other end of the range */ |
| | 1324 | ch2 = expr.getch(); |
| | 1325 | |
| | 1326 | /* skip the second character */ |
| | 1327 | expr.inc(&exprlen); |
| | 1328 | |
| | 1329 | /* if the range is reversed, swap it */ |
| | 1330 | if (ch > ch2) |
| | 1331 | { |
| | 1332 | wchar_t tmp = ch; |
| | 1333 | ch = ch2; |
| | 1334 | ch2 = tmp; |
| | 1335 | } |
| | 1336 | |
| | 1337 | /* add the range */ |
| | 1338 | add_range_char(ch, ch2); |
| | 1339 | } |
| | 1340 | } |
| | 1341 | else |
| | 1342 | { |
| | 1343 | /* no range - add the one-character range */ |
| | 1344 | add_range_char(ch); |
| | 1345 | } |
| | 1346 | } |
| | 1347 | |
| | 1348 | /* create a character range machine */ |
| | 1349 | build_char_range(&new_machine, is_exclusive); |
| | 1350 | |
| | 1351 | /* apply any postfix operator */ |
| | 1352 | goto apply_postfix; |
| | 1353 | } |
| | 1354 | break; |
| | 1355 | |
| | 1356 | default: |
| | 1357 | normal_char: |
| | 1358 | /* |
| | 1359 | * it's an ordinary character - build a single character |
| | 1360 | * recognizer machine, and then concatenate it onto any |
| | 1361 | * existing machine |
| | 1362 | */ |
| | 1363 | build_char(&new_machine, expr.getch()); |
| | 1364 | |
| | 1365 | apply_postfix: |
| | 1366 | /* |
| | 1367 | * Check for a postfix operator, and apply it to the machine |
| | 1368 | * in 'new_machine' if present. In any case, concatenate |
| | 1369 | * the 'new_machine' (modified by a postix operator or not) |
| | 1370 | * to the current machien. |
| | 1371 | */ |
| | 1372 | if (exprlen > 1) |
| | 1373 | { |
| | 1374 | switch(expr.getch_at(1)) |
| | 1375 | { |
| | 1376 | case '*': |
| | 1377 | case '+': |
| | 1378 | case '?': |
| | 1379 | /* |
| | 1380 | * We have a postfix closure operator. Build a new |
| | 1381 | * closure machine out of 'new_machine'. |
| | 1382 | */ |
| | 1383 | { |
| | 1384 | re_machine closure_machine; |
| | 1385 | int shortest; |
| | 1386 | |
| | 1387 | /* move onto the closure operator */ |
| | 1388 | expr.inc(&exprlen); |
| | 1389 | |
| | 1390 | /* |
| | 1391 | * if the next character is '?', it's a modifier |
| | 1392 | * that indicates that we are to use the |
| | 1393 | * shortest match - note it if so |
| | 1394 | */ |
| | 1395 | shortest = (exprlen > 1 && expr.getch_at(1) == '?'); |
| | 1396 | |
| | 1397 | /* build the closure machine */ |
| | 1398 | build_closure(&closure_machine, |
| | 1399 | &new_machine, expr.getch(), shortest); |
| | 1400 | |
| | 1401 | /* replace the original machine with the closure */ |
| | 1402 | new_machine = closure_machine; |
| | 1403 | |
| | 1404 | /* |
| | 1405 | * skip any redundant closure symbols, keeping |
| | 1406 | * only the first one we saw |
| | 1407 | */ |
| | 1408 | skip_closures: |
| | 1409 | while (exprlen > 1) |
| | 1410 | { |
| | 1411 | /* check for a simple closure suffix */ |
| | 1412 | if (expr.getch_at(1) == '?' |
| | 1413 | || expr.getch_at(1) == '+' |
| | 1414 | || expr.getch_at(1) == '*') |
| | 1415 | { |
| | 1416 | /* skip it and keep looping */ |
| | 1417 | expr.inc(&exprlen); |
| | 1418 | continue; |
| | 1419 | } |
| | 1420 | |
| | 1421 | /* check for an interval */ |
| | 1422 | if (expr.getch_at(1) == '{') |
| | 1423 | { |
| | 1424 | /* skip until we find the matching '}' */ |
| | 1425 | while (exprlen > 1 && expr.getch_at(1) != '}') |
| | 1426 | expr.inc(&exprlen); |
| | 1427 | |
| | 1428 | /* go back for anything that follows */ |
| | 1429 | continue; |
| | 1430 | } |
| | 1431 | |
| | 1432 | /* if it's anything else, we're done discarding */ |
| | 1433 | break; |
| | 1434 | } |
| | 1435 | } |
| | 1436 | break; |
| | 1437 | |
| | 1438 | case '{': |
| | 1439 | /* interval specifier */ |
| | 1440 | { |
| | 1441 | int min_val; |
| | 1442 | int max_val; |
| | 1443 | re_machine interval_machine; |
| | 1444 | int shortest; |
| | 1445 | int var_id; |
| | 1446 | |
| | 1447 | /* |
| | 1448 | * loops can never overlap, but can be nested; |
| | 1449 | * so the only thing we have to worry about in |
| | 1450 | * assigning a loop variable is the group |
| | 1451 | * nesting depth |
| | 1452 | */ |
| | 1453 | var_id = group_stack_level; |
| | 1454 | |
| | 1455 | /* note the highest variable ID we've seen */ |
| | 1456 | if (var_id >= pat->loop_var_cnt) |
| | 1457 | pat->loop_var_cnt = var_id + 1; |
| | 1458 | |
| | 1459 | /* presume neither min nor max will be specified */ |
| | 1460 | min_val = -1; |
| | 1461 | max_val = -1; |
| | 1462 | |
| | 1463 | /* skip the current character and the '{' */ |
| | 1464 | expr.inc(&exprlen); |
| | 1465 | expr.inc(&exprlen); |
| | 1466 | |
| | 1467 | /* parse the minimum count, if provided */ |
| | 1468 | min_val = parse_int(&expr, &exprlen); |
| | 1469 | |
| | 1470 | /* if there's a comma, parse the maximum value */ |
| | 1471 | if (exprlen >= 1 && expr.getch() == ',') |
| | 1472 | { |
| | 1473 | /* skip the ',' and parse the number */ |
| | 1474 | expr.inc(&exprlen); |
| | 1475 | max_val = parse_int(&expr, &exprlen); |
| | 1476 | } |
| | 1477 | else |
| | 1478 | { |
| | 1479 | /* |
| | 1480 | * there's no other value, so this is a |
| | 1481 | * simple loop with the same value for min |
| | 1482 | * and max |
| | 1483 | */ |
| | 1484 | max_val = min_val; |
| | 1485 | } |
| | 1486 | |
| | 1487 | /* |
| | 1488 | * if we're not looking at a '}', skip |
| | 1489 | * characters until we are |
| | 1490 | */ |
| | 1491 | while (exprlen != 0 && expr.getch() != '}') |
| | 1492 | expr.inc(&exprlen); |
| | 1493 | |
| | 1494 | /* |
| | 1495 | * if there's a '?' following, it's a 'shortest' |
| | 1496 | * modifier - note it |
| | 1497 | */ |
| | 1498 | shortest = FALSE; |
| | 1499 | if (exprlen > 1 && expr.getch_at(1) == '?') |
| | 1500 | { |
| | 1501 | /* note the modifier */ |
| | 1502 | shortest = TRUE; |
| | 1503 | |
| | 1504 | /* skip another character for the modifier */ |
| | 1505 | expr.inc(&exprlen); |
| | 1506 | } |
| | 1507 | |
| | 1508 | /* set up an interval node */ |
| | 1509 | build_interval(&interval_machine, &new_machine, |
| | 1510 | min_val, max_val, var_id, shortest); |
| | 1511 | |
| | 1512 | /* replace the original machine with the interval */ |
| | 1513 | new_machine = interval_machine; |
| | 1514 | |
| | 1515 | /* skip any closure modifiers that follow */ |
| | 1516 | goto skip_closures; |
| | 1517 | } |
| | 1518 | break; |
| | 1519 | |
| | 1520 | default: |
| | 1521 | /* no postfix operator */ |
| | 1522 | break; |
| | 1523 | } |
| | 1524 | } |
| | 1525 | |
| | 1526 | /* |
| | 1527 | * Concatenate the new machine onto the current machine |
| | 1528 | * under construction. |
| | 1529 | */ |
| | 1530 | concat_onto(&cur_machine, &new_machine); |
| | 1531 | break; |
| | 1532 | } |
| | 1533 | |
| | 1534 | /* if we've run down the expression string, go no further */ |
| | 1535 | if (exprlen == 0) |
| | 1536 | break; |
| | 1537 | } |
| | 1538 | |
| | 1539 | /* if there are any open parens outstanding, close them */ |
| | 1540 | if (group_stack_level != 0) |
| | 1541 | goto do_close_paren; |
| | 1542 | |
| | 1543 | /* complete any pending alternation */ |
| | 1544 | alternate_onto(&alter_machine, &cur_machine); |
| | 1545 | |
| | 1546 | /* check for and break any infinite loops */ |
| | 1547 | break_loops(&alter_machine); |
| | 1548 | |
| | 1549 | /* remove meaningless branch-to-branch transitions */ |
| | 1550 | remove_branch_to_branch(&alter_machine); |
| | 1551 | |
| | 1552 | /* store the results in the caller's base pattern description */ |
| | 1553 | pat->machine = alter_machine; |
| | 1554 | pat->tuple_cnt = next_state_; |
| | 1555 | |
| | 1556 | /* limit the group count to the maximum */ |
| | 1557 | if (pat->group_cnt > RE_GROUP_REG_CNT) |
| | 1558 | pat->group_cnt = RE_GROUP_REG_CNT; |
| | 1559 | |
| | 1560 | /* limit the variable count to the maximum */ |
| | 1561 | if (pat->loop_var_cnt > RE_LOOP_VARS_MAX) |
| | 1562 | pat->loop_var_cnt = RE_LOOP_VARS_MAX; |
| | 1563 | |
| | 1564 | /* no errors encountered */ |
| | 1565 | return RE_STATUS_SUCCESS; |
| | 1566 | } |
| | 1567 | |
| | 1568 | |
| | 1569 | /* |
| | 1570 | * Parse a character class or class range expresion. Returns true on |
| | 1571 | * success, in which case we will have built a class range expression in |
| | 1572 | * new_machine and updated expr and exprlen to skip the expression. |
| | 1573 | * Returns false on syntax error or other failure, in which case expr and |
| | 1574 | * exprlen will be unchanged. |
| | 1575 | */ |
| | 1576 | int CRegexParser::compile_char_class_expr(utf8_ptr *expr, size_t *exprlen, |
| | 1577 | re_machine *result_machine) |
| | 1578 | { |
| | 1579 | utf8_ptr p; |
| | 1580 | size_t rem; |
| | 1581 | int is_exclusive; |
| | 1582 | |
| | 1583 | /* start at the character after the '<' */ |
| | 1584 | p = *expr; |
| | 1585 | rem = *exprlen; |
| | 1586 | p.inc(&rem); |
| | 1587 | |
| | 1588 | /* presume it won't be exclusive */ |
| | 1589 | is_exclusive = FALSE; |
| | 1590 | |
| | 1591 | /* check for an exclusion flag */ |
| | 1592 | if (rem != 0 && p.getch() == '^') |
| | 1593 | { |
| | 1594 | /* note the exclusion */ |
| | 1595 | is_exclusive = TRUE; |
| | 1596 | |
| | 1597 | /* skip the exclusion prefix */ |
| | 1598 | p.inc(&rem); |
| | 1599 | } |
| | 1600 | |
| | 1601 | /* clear out the range builder buffer */ |
| | 1602 | range_buf_cnt_ = 0; |
| | 1603 | |
| | 1604 | /* |
| | 1605 | * keep going until we find the closing delimiter (or run out of |
| | 1606 | * expression string) |
| | 1607 | */ |
| | 1608 | for (;;) |
| | 1609 | { |
| | 1610 | utf8_ptr start; |
| | 1611 | wchar_t delim; |
| | 1612 | size_t curcharlen; |
| | 1613 | size_t curbytelen; |
| | 1614 | |
| | 1615 | /* remember where the current part starts */ |
| | 1616 | start = p; |
| | 1617 | |
| | 1618 | /* scan for the '>' or '|' */ |
| | 1619 | for (curcharlen = 0 ; rem != 0 ; p.inc(&rem), ++curcharlen) |
| | 1620 | { |
| | 1621 | /* get the next character */ |
| | 1622 | delim = p.getch(); |
| | 1623 | |
| | 1624 | /* if it's '>' or '|', we're done */ |
| | 1625 | if (delim == '>' || delim == '|') |
| | 1626 | break; |
| | 1627 | } |
| | 1628 | |
| | 1629 | /* |
| | 1630 | * If we reached the end of the expression without finding a |
| | 1631 | * closing delimiter, the expression is invalid - treat the whole |
| | 1632 | * thing (starting with the '<') as ordinary characters. |
| | 1633 | */ |
| | 1634 | if (rem == 0) |
| | 1635 | return FALSE; |
| | 1636 | |
| | 1637 | /* get the length of this part */ |
| | 1638 | curbytelen = (size_t)(p.getptr() - start.getptr()); |
| | 1639 | |
| | 1640 | /* |
| | 1641 | * See what we have. If we have a single character, it's a |
| | 1642 | * literal. If we have a character, a hyphen, and another |
| | 1643 | * character, it's a literal range. Otherwise, it must be one of |
| | 1644 | * our named classes. |
| | 1645 | */ |
| | 1646 | if (curcharlen == 1) |
| | 1647 | { |
| | 1648 | /* it's a literal - add the character */ |
| | 1649 | add_range_char(start.getch()); |
| | 1650 | } |
| | 1651 | else if (curcharlen == 3 && start.getch_at(1) == '-') |
| | 1652 | { |
| | 1653 | /* it's a literal range - add it */ |
| | 1654 | add_range_char(start.getch(), start.getch_at(2)); |
| | 1655 | } |
| | 1656 | else |
| | 1657 | { |
| | 1658 | struct char_class_t |
| | 1659 | { |
| | 1660 | /* expression name for the class */ |
| | 1661 | const char *name; |
| | 1662 | |
| | 1663 | /* length of the class name */ |
| | 1664 | size_t name_len; |
| | 1665 | |
| | 1666 | /* |
| | 1667 | * literal character, if the name represents a character |
| | 1668 | * rather than a class - this is used only if code == |
| | 1669 | * RE_LITERAL |
| | 1670 | */ |
| | 1671 | wchar_t ch; |
| | 1672 | |
| | 1673 | /* RE_xxx code for the class */ |
| | 1674 | re_recog_type code; |
| | 1675 | }; |
| | 1676 | static const char_class_t classes[] = |
| | 1677 | { |
| | 1678 | { "alpha", 5, 0, RE_ALPHA }, |
| | 1679 | { "digit", 5, 0, RE_DIGIT }, |
| | 1680 | { "upper", 5, 0, RE_UPPER }, |
| | 1681 | { "lower", 5, 0, RE_LOWER }, |
| | 1682 | { "alphanum", 8, 0, RE_ALPHANUM }, |
| | 1683 | { "space", 5, 0, RE_SPACE }, |
| | 1684 | { "punct", 5, 0, RE_PUNCT }, |
| | 1685 | { "newline", 7, 0, RE_NEWLINE }, |
| | 1686 | { "langle", 6, '<', RE_LITERAL }, |
| | 1687 | { "rangle", 6, '>', RE_LITERAL }, |
| | 1688 | { "vbar", 4, '|', RE_LITERAL }, |
| | 1689 | { "caret", 5, '^', RE_LITERAL }, |
| | 1690 | { "squote", 6, '\'', RE_LITERAL }, |
| | 1691 | { "dquote", 6, '"', RE_LITERAL }, |
| | 1692 | { "star", 4, '*', RE_LITERAL }, |
| | 1693 | { "question", 8, '?', RE_LITERAL }, |
| | 1694 | { "percent", 7, '%', RE_LITERAL }, |
| | 1695 | { "dot", 3, '.', RE_LITERAL }, |
| | 1696 | { "period", 6, '.', RE_LITERAL }, |
| | 1697 | { "plus", 4, '+', RE_LITERAL }, |
| | 1698 | { "lsquare", 7, '[', RE_LITERAL }, |
| | 1699 | { "rsquare", 7, ']', RE_LITERAL }, |
| | 1700 | { "lparen", 6, '(', RE_LITERAL }, |
| | 1701 | { "rparen", 6, ')', RE_LITERAL}, |
| | 1702 | { "lbrace", 6, '{', RE_LITERAL }, |
| | 1703 | { "rbrace", 6, '}', RE_LITERAL }, |
| | 1704 | { "dollar", 6, '$', RE_LITERAL }, |
| | 1705 | { "backslash",9, '\\', RE_LITERAL }, |
| | 1706 | { "return", 6, 0x000D, RE_LITERAL }, |
| | 1707 | { "linefeed", 8, 0x000A, RE_LITERAL }, |
| | 1708 | { "tab", 3, 0x0009, RE_LITERAL }, |
| | 1709 | { "nul", 3, 0x0000, RE_LITERAL }, |
| | 1710 | { "null", 4, 0x0000, RE_LITERAL } |
| | 1711 | }; |
| | 1712 | const char_class_t *cp; |
| | 1713 | size_t i; |
| | 1714 | int found; |
| | 1715 | |
| | 1716 | /* scan our name list for a match */ |
| | 1717 | for (cp = classes, i = 0, found = FALSE ; |
| | 1718 | i < sizeof(classes)/sizeof(classes[0]) ; |
| | 1719 | ++i, ++cp) |
| | 1720 | { |
| | 1721 | /* |
| | 1722 | * if the length matches and the name matches (ignoring |
| | 1723 | * case), this is the one we want |
| | 1724 | */ |
| | 1725 | if (curbytelen == cp->name_len |
| | 1726 | && memicmp(start.getptr(), cp->name, curbytelen) == 0) |
| | 1727 | { |
| | 1728 | /* |
| | 1729 | * this is it - add either a class range or literal |
| | 1730 | * range, depending on the meaning of the name |
| | 1731 | */ |
| | 1732 | if (cp->code == RE_LITERAL) |
| | 1733 | { |
| | 1734 | /* it's a name for a literal */ |
| | 1735 | add_range_char(cp->ch); |
| | 1736 | } |
| | 1737 | else |
| | 1738 | { |
| | 1739 | /* |
| | 1740 | * It's a name for a character class. As a |
| | 1741 | * special case for efficiency, if this is the one |
| | 1742 | * and only thing in this class expression, don't |
| | 1743 | * create a range expression but instead create a |
| | 1744 | * special for the class. |
| | 1745 | * |
| | 1746 | * Note that we can't do this for an exclusive |
| | 1747 | * class, since we don't have any special matcher |
| | 1748 | * for those - implement those with a character |
| | 1749 | * range as usual. |
| | 1750 | */ |
| | 1751 | if (range_buf_cnt_ == 0 |
| | 1752 | && delim == '>' |
| | 1753 | && !is_exclusive) |
| | 1754 | { |
| | 1755 | /* |
| | 1756 | * This is the only thing, so build a special |
| | 1757 | * to match this class - this is more |
| | 1758 | * efficient to store and to match than a |
| | 1759 | * range expression. |
| | 1760 | */ |
| | 1761 | build_special(result_machine, cp->code, 0); |
| | 1762 | |
| | 1763 | /* skip to the '>' */ |
| | 1764 | *expr = p; |
| | 1765 | *exprlen = rem; |
| | 1766 | |
| | 1767 | /* |
| | 1768 | * we're done with the expresion - tell the |
| | 1769 | * caller we were successful |
| | 1770 | */ |
| | 1771 | return TRUE; |
| | 1772 | } |
| | 1773 | else |
| | 1774 | { |
| | 1775 | /* |
| | 1776 | * it's not the only thing, so add the class |
| | 1777 | * to the range list |
| | 1778 | */ |
| | 1779 | add_range_class(cp->code); |
| | 1780 | } |
| | 1781 | } |
| | 1782 | |
| | 1783 | /* note that we found a match */ |
| | 1784 | found = TRUE; |
| | 1785 | |
| | 1786 | /* no need to scan further in our table */ |
| | 1787 | break; |
| | 1788 | } |
| | 1789 | } |
| | 1790 | |
| | 1791 | /* if we didn't find a match, the whole expression is invalid */ |
| | 1792 | if (!found) |
| | 1793 | return FALSE; |
| | 1794 | } |
| | 1795 | |
| | 1796 | /* |
| | 1797 | * if we found the '>', we're done; if we found a '|', skip it and |
| | 1798 | * keep going |
| | 1799 | */ |
| | 1800 | if (delim == '|') |
| | 1801 | { |
| | 1802 | /* skip the delimiter, and back for another round */ |
| | 1803 | p.inc(&rem); |
| | 1804 | } |
| | 1805 | else |
| | 1806 | { |
| | 1807 | /* we found the '>', so we're done - add the range recognizer */ |
| | 1808 | build_char_range(result_machine, is_exclusive); |
| | 1809 | |
| | 1810 | /* skip up to the '>' */ |
| | 1811 | *expr = p; |
| | 1812 | *exprlen = rem; |
| | 1813 | |
| | 1814 | /* tell the caller we were successful */ |
| | 1815 | return TRUE; |
| | 1816 | } |
| | 1817 | } |
| | 1818 | } |
| | 1819 | |
| | 1820 | |
| | 1821 | /* |
| | 1822 | * Parse an integer value. Returns -1 if there's no number. |
| | 1823 | */ |
| | 1824 | int CRegexParser::parse_int(utf8_ptr *p, size_t *rem) |
| | 1825 | { |
| | 1826 | int acc; |
| | 1827 | |
| | 1828 | /* if it's not a number to start with, simply return -1 */ |
| | 1829 | if (*rem == 0 || !is_digit(p->getch())) |
| | 1830 | return -1; |
| | 1831 | |
| | 1832 | /* keep going as long as we find digits */ |
| | 1833 | for (acc = 0 ; *rem > 0 && is_digit(p->getch()) ; p->inc(rem)) |
| | 1834 | { |
| | 1835 | /* add this digit into the accumulator */ |
| | 1836 | acc *= 10; |
| | 1837 | acc += value_of_digit(p->getch()); |
| | 1838 | } |
| | 1839 | |
| | 1840 | /* return the accumulated result */ |
| | 1841 | return acc; |
| | 1842 | } |
| | 1843 | |
| | 1844 | /* |
| | 1845 | * Break any infinite loops in the machine. Check for cycles that |
| | 1846 | * consist solely of epsilon transitions, and break each cycle at the |
| | 1847 | * last alternating transition. |
| | 1848 | */ |
| | 1849 | void CRegexParser::break_loops(re_machine *machine) |
| | 1850 | { |
| | 1851 | re_state_id cur; |
| | 1852 | |
| | 1853 | /* |
| | 1854 | * for each state, search the machine for cycles from the state back |
| | 1855 | * to itself |
| | 1856 | */ |
| | 1857 | for (cur = 0 ; cur < next_state_ ; ++cur) |
| | 1858 | { |
| | 1859 | /* test for loops from this state back to itself */ |
| | 1860 | find_loop(machine, cur); |
| | 1861 | } |
| | 1862 | } |
| | 1863 | |
| | 1864 | /* |
| | 1865 | * Find and break loops from the current state back to the given initial |
| | 1866 | * state completely via epsilon transitions. Returns true if we found a |
| | 1867 | * loop back to the initial state, false if not. |
| | 1868 | */ |
| | 1869 | int CRegexParser::find_loop(re_machine *machine, re_state_id cur_state) |
| | 1870 | { |
| | 1871 | /* |
| | 1872 | * keep going until we get to a final state or a non-epsilon |
| | 1873 | * transition |
| | 1874 | */ |
| | 1875 | for (;;) |
| | 1876 | { |
| | 1877 | re_tuple *tuple; |
| | 1878 | |
| | 1879 | /* |
| | 1880 | * if this is the final state, there's nowhere else to go, so we |
| | 1881 | * evidently can't find the loop we were seeking |
| | 1882 | */ |
| | 1883 | if (cur_state == machine->final) |
| | 1884 | return FALSE; |
| | 1885 | |
| | 1886 | /* get the tuple for this state */ |
| | 1887 | tuple = &tuple_arr_[cur_state]; |
| | 1888 | |
| | 1889 | /* |
| | 1890 | * if this state has already been visited by a recursive caller, |
| | 1891 | * we're in a loop |
| | 1892 | */ |
| | 1893 | if ((tuple->flags & RE_STATE_CYCLE_TEST) != 0) |
| | 1894 | return TRUE; |
| | 1895 | |
| | 1896 | /* if it's a two-transition epsilon state, handle it separately */ |
| | 1897 | if (tuple->typ == RE_EPSILON |
| | 1898 | && tuple->next_state_2 != RE_STATE_INVALID) |
| | 1899 | { |
| | 1900 | unsigned char old_flags; |
| | 1901 | |
| | 1902 | /* |
| | 1903 | * This is a branch. Recursively try to find the loop in |
| | 1904 | * each branch. If we do find the loop in either branch, it |
| | 1905 | * means that there are no branches closer to the final |
| | 1906 | * transition back to the original state, so we must break |
| | 1907 | * this branch. |
| | 1908 | */ |
| | 1909 | |
| | 1910 | /* |
| | 1911 | * mark the current state as being inspected, so if we |
| | 1912 | * encounter it again we'll know we've found a cycle |
| | 1913 | */ |
| | 1914 | old_flags = tuple->flags; |
| | 1915 | tuple->flags |= RE_STATE_CYCLE_TEST; |
| | 1916 | |
| | 1917 | /* |
| | 1918 | * try the second state first, since we won't have to |
| | 1919 | * shuffle around the slots to clear the second state |
| | 1920 | */ |
| | 1921 | if (find_loop(machine, tuple->next_state_2)) |
| | 1922 | { |
| | 1923 | /* the second branch loops - break it */ |
| | 1924 | tuple->next_state_2 = RE_STATE_INVALID; |
| | 1925 | } |
| | 1926 | |
| | 1927 | /* check the other branch */ |
| | 1928 | if (find_loop(machine, tuple->next_state_1)) |
| | 1929 | { |
| | 1930 | /* |
| | 1931 | * the first branch loops - move the second branch to |
| | 1932 | * the first slot, and clear the second slot (if we only |
| | 1933 | * have one branch, it must always be in the first slot) |
| | 1934 | */ |
| | 1935 | tuple->next_state_1 = tuple->next_state_2; |
| | 1936 | tuple->next_state_2 = RE_STATE_INVALID; |
| | 1937 | } |
| | 1938 | |
| | 1939 | /* we're done testing this state - restore its original marking */ |
| | 1940 | tuple->flags = old_flags; |
| | 1941 | |
| | 1942 | /* |
| | 1943 | * if both branches are invalid, this entire state is |
| | 1944 | * unusable, so tell the caller to break its branch to here |
| | 1945 | */ |
| | 1946 | if (tuple->next_state_1 == RE_STATE_INVALID) |
| | 1947 | return TRUE; |
| | 1948 | |
| | 1949 | /* |
| | 1950 | * there's no need to continue either branch, since we've |
| | 1951 | * already followed them all the way to the end via the |
| | 1952 | * recursive call - and if we had a loop, we've broken it, |
| | 1953 | * so simply tell the caller there are no more loops along |
| | 1954 | * this path |
| | 1955 | */ |
| | 1956 | return FALSE; |
| | 1957 | } |
| | 1958 | |
| | 1959 | /* see what kind of transition this is */ |
| | 1960 | switch(tuple->typ) |
| | 1961 | { |
| | 1962 | case RE_EPSILON: |
| | 1963 | case RE_GROUP_ENTER: |
| | 1964 | case RE_GROUP_EXIT: |
| | 1965 | /* |
| | 1966 | * epsilon or group transition - this could definitely be part |
| | 1967 | * of a loop, so move on to the next state and keep looking |
| | 1968 | */ |
| | 1969 | cur_state = tuple->next_state_1; |
| | 1970 | |
| | 1971 | /* |
| | 1972 | * if this took us to an invalid state, we must have reached |
| | 1973 | * the final state of a sub-machine - we can go no further |
| | 1974 | * from here, so there are no loops in this branch |
| | 1975 | */ |
| | 1976 | if (cur_state == RE_STATE_INVALID) |
| | 1977 | return FALSE; |
| | 1978 | break; |
| | 1979 | |
| | 1980 | case RE_TEXT_BEGIN: |
| | 1981 | case RE_TEXT_END: |
| | 1982 | case RE_WORD_BEGIN: |
| | 1983 | case RE_WORD_END: |
| | 1984 | case RE_WORD_BOUNDARY: |
| | 1985 | case RE_NON_WORD_BOUNDARY: |
| | 1986 | case RE_ASSERT_POS: |
| | 1987 | case RE_ASSERT_NEG: |
| | 1988 | case RE_ZERO_VAR: |
| | 1989 | /* |
| | 1990 | * none of these transitions consumes input, so any of these |
| | 1991 | * could result in an infinite loop - continue down the |
| | 1992 | * current path |
| | 1993 | */ |
| | 1994 | cur_state = tuple->next_state_1; |
| | 1995 | break; |
| | 1996 | |
| | 1997 | default: |
| | 1998 | /* |
| | 1999 | * all other states consume input, so this branch definitely |
| | 2000 | * can't loop back to the original state without consuming |
| | 2001 | * input - we do not need to proceed any further down the |
| | 2002 | * current branch, since it's not an infinite epsilon loop |
| | 2003 | * even if it does ultimately find its way back to the |
| | 2004 | * initial state |
| | 2005 | */ |
| | 2006 | return FALSE; |
| | 2007 | } |
| | 2008 | } |
| | 2009 | } |
| | 2010 | |
| | 2011 | /* |
| | 2012 | * Optimization: remove meaningless branch-to-branch transitions. An |
| | 2013 | * "epsilon" state that has only one outgoing transition does nothing |
| | 2014 | * except move to the next state, so any transition that points to such a |
| | 2015 | * state could just as well point to the destination of the epsilon's one |
| | 2016 | * outgoing transition. |
| | 2017 | */ |
| | 2018 | void CRegexParser::remove_branch_to_branch(re_machine *machine) |
| | 2019 | { |
| | 2020 | re_state_id cur; |
| | 2021 | re_tuple *tuple; |
| | 2022 | |
| | 2023 | /* make sure the initial state points to the first meaningful state */ |
| | 2024 | optimize_transition(machine, &machine->init); |
| | 2025 | |
| | 2026 | /* scan all active states */ |
| | 2027 | for (cur = 0, tuple = tuple_arr_ ; cur < next_state_ ; ++cur, ++tuple) |
| | 2028 | { |
| | 2029 | /* if this isn't the final state, check it */ |
| | 2030 | if (cur != machine->final) |
| | 2031 | { |
| | 2032 | /* check both of our outgoing transitions */ |
| | 2033 | optimize_transition(machine, &tuple->next_state_1); |
| | 2034 | optimize_transition(machine, &tuple->next_state_2); |
| | 2035 | } |
| | 2036 | } |
| | 2037 | } |
| | 2038 | |
| | 2039 | /* |
| | 2040 | * Optimize a single transition to remove meaningless branch-to-branch |
| | 2041 | * transitions. |
| | 2042 | */ |
| | 2043 | void CRegexParser::optimize_transition(const re_machine *machine, |
| | 2044 | re_state_id *trans) |
| | 2045 | { |
| | 2046 | /* keep going as long as we find meaningless forwarding branches */ |
| | 2047 | for (;;) |
| | 2048 | { |
| | 2049 | re_tuple *tuple_nxt; |
| | 2050 | |
| | 2051 | /* |
| | 2052 | * if this transition points to the machine's final state, there's |
| | 2053 | * nowhere else to go from here |
| | 2054 | */ |
| | 2055 | if (*trans == RE_STATE_INVALID || *trans == machine->final) |
| | 2056 | return; |
| | 2057 | |
| | 2058 | /* |
| | 2059 | * if the transition points to anything other than a single-branch |
| | 2060 | * epsilon, we point to a meaningful next state, so there's no |
| | 2061 | * further branch-to-branch elimination we can perform |
| | 2062 | */ |
| | 2063 | tuple_nxt = &tuple_arr_[*trans]; |
| | 2064 | if (tuple_nxt->typ != RE_EPSILON |
| | 2065 | || tuple_nxt->next_state_2 != RE_STATE_INVALID) |
| | 2066 | return; |
| | 2067 | |
| | 2068 | /* |
| | 2069 | * This transition points to a meaningless intermediate state, so |
| | 2070 | * we can simply skip the intermediate state and go directly from |
| | 2071 | * the current state to the target state's single target. Once |
| | 2072 | * we've done this, continue scanning, because we might find that |
| | 2073 | * the new target state itself is a meaningless intermediate state |
| | 2074 | * that we can skip past as well (and so on, and so on - keep |
| | 2075 | * going until we find a real target state). |
| | 2076 | */ |
| | 2077 | *trans = tuple_nxt->next_state_1; |
| | 2078 | } |
| | 2079 | } |
| | 2080 | |
| | 2081 | /* ------------------------------------------------------------------------ */ |
| | 2082 | /* |
| | 2083 | * Compile an expression and return a newly-allocated pattern object. |
| | 2084 | */ |
| | 2085 | re_status_t CRegexParser::compile_pattern(const char *expr_str, |
| | 2086 | size_t exprlen, |
| | 2087 | re_compiled_pattern **pattern) |
| | 2088 | { |
| | 2089 | re_status_t stat; |
| | 2090 | re_state_id i; |
| | 2091 | re_tuple *t; |
| | 2092 | re_compiled_pattern *pat; |
| | 2093 | size_t siz; |
| | 2094 | wchar_t *rp; |
| | 2095 | re_compiled_pattern_base base_pat; |
| | 2096 | |
| | 2097 | /* presume we won't allocate a new pattern object */ |
| | 2098 | *pattern = 0; |
| | 2099 | |
| | 2100 | /* compile the pattern */ |
| | 2101 | if ((stat = compile(expr_str, exprlen, &base_pat)) != RE_STATUS_SUCCESS) |
| | 2102 | return stat; |
| | 2103 | |
| | 2104 | /* |
| | 2105 | * Start off with our basic space needs: we need space for the base |
| | 2106 | * structure, plus space for re_tuple array (actually, space for the |
| | 2107 | * number of allocated tuples minus one, since there's one built into |
| | 2108 | * the base structure). |
| | 2109 | */ |
| | 2110 | siz = sizeof(re_compiled_pattern) |
| | 2111 | + (base_pat.tuple_cnt - 1)*sizeof(pat->tuples[0]); |
| | 2112 | |
| | 2113 | /* |
| | 2114 | * Run through the tuples in the result machine and add up the amount |
| | 2115 | * of extra space we'll need for extra allocations (specifically, |
| | 2116 | * character ranges). |
| | 2117 | */ |
| | 2118 | for (i = 0, t = tuple_arr_ ; i < base_pat.tuple_cnt ; ++i, ++t) |
| | 2119 | { |
| | 2120 | /* check what kind of tuple this is */ |
| | 2121 | switch(t->typ) |
| | 2122 | { |
| | 2123 | case RE_RANGE: |
| | 2124 | case RE_RANGE_EXCL: |
| | 2125 | /* range - add in space for the character range data */ |
| | 2126 | siz += sizeof(t->info.range.char_range[0]) |
| | 2127 | * t->info.range.char_range_cnt; |
| | 2128 | break; |
| | 2129 | |
| | 2130 | default: |
| | 2131 | /* other types require no extra storage */ |
| | 2132 | break; |
| | 2133 | } |
| | 2134 | } |
| | 2135 | |
| | 2136 | /* allocate space for the structure */ |
| | 2137 | *pattern = pat = (re_compiled_pattern *)t3malloc(siz); |
| | 2138 | |
| | 2139 | /* copy the base pattern to the result */ |
| | 2140 | memcpy(pat, &base_pat, sizeof(base_pat)); |
| | 2141 | |
| | 2142 | /* copy the tuple array */ |
| | 2143 | memcpy(pat->tuples, tuple_arr_, pat->tuple_cnt * sizeof(pat->tuples[0])); |
| | 2144 | |
| | 2145 | /* |
| | 2146 | * Pack the range data onto the end of the tuple array in the data |
| | 2147 | * structure. We already calculated how much space we'll need for this |
| | 2148 | * and included the space in the structure's allocation, so we simply |
| | 2149 | * need to find the location of the range data at the end of our tuple |
| | 2150 | * array. |
| | 2151 | */ |
| | 2152 | rp = (wchar_t *)&pat->tuples[pat->tuple_cnt]; |
| | 2153 | |
| | 2154 | /* |
| | 2155 | * run through the new tuple array and pack the range data into the new |
| | 2156 | * allocation unit |
| | 2157 | */ |
| | 2158 | for (i = 0, t = pat->tuples ; i < next_state_ ; ++i, ++t) |
| | 2159 | { |
| | 2160 | /* check what kind of tuple this is */ |
| | 2161 | switch(t->typ) |
| | 2162 | { |
| | 2163 | case RE_RANGE: |
| | 2164 | case RE_RANGE_EXCL: |
| | 2165 | /* |
| | 2166 | * Character range. Copy the original character range data |
| | 2167 | * into the new allocation unit, at the next available location |
| | 2168 | * in the allocation unit. |
| | 2169 | */ |
| | 2170 | memcpy(rp, t->info.range.char_range, |
| | 2171 | t->info.range.char_range_cnt |
| | 2172 | * sizeof(t->info.range.char_range[0])); |
| | 2173 | |
| | 2174 | /* fix up the tuple to point to the packed copy */ |
| | 2175 | t->info.range.char_range = rp; |
| | 2176 | |
| | 2177 | /* move past the space in our allocation unit */ |
| | 2178 | rp += t->info.range.char_range_cnt; |
| | 2179 | break; |
| | 2180 | |
| | 2181 | default: |
| | 2182 | /* other types contain no range data */ |
| | 2183 | break; |
| | 2184 | } |
| | 2185 | } |
| | 2186 | |
| | 2187 | /* success */ |
| | 2188 | return stat; |
| | 2189 | } |
| | 2190 | |
| | 2191 | /* |
| | 2192 | * free a pattern previously created with compile_pattern() |
| | 2193 | */ |
| | 2194 | void CRegexParser::free_pattern(re_compiled_pattern *pattern) |
| | 2195 | { |
| | 2196 | /* we allocate each pattern as a single unit, so it's easy to free */ |
| | 2197 | t3free(pattern); |
| | 2198 | } |
| | 2199 | |
| | 2200 | /* ------------------------------------------------------------------------ */ |
| | 2201 | /* |
| | 2202 | * Pattern recognizer |
| | 2203 | */ |
| | 2204 | |
| | 2205 | /* |
| | 2206 | * construct |
| | 2207 | */ |
| | 2208 | CRegexSearcher::CRegexSearcher() |
| | 2209 | { |
| | 2210 | } |
| | 2211 | |
| | 2212 | /* |
| | 2213 | * delete |
| | 2214 | */ |
| | 2215 | CRegexSearcher::~CRegexSearcher() |
| | 2216 | { |
| | 2217 | } |
| | 2218 | |
| | 2219 | /* |
| | 2220 | * Match a string to a compiled expression. Returns the length of the |
| | 2221 | * match if successful, or -1 if no match was found. |
| | 2222 | */ |
| | 2223 | int CRegexSearcher::match(const char *entire_str, |
| | 2224 | const char *str, const size_t origlen, |
| | 2225 | const re_compiled_pattern_base *pattern, |
| | 2226 | const re_tuple *tuple_arr, |
| | 2227 | const re_machine *machine, |
| | 2228 | re_group_register *regs, |
| | 2229 | short *loop_vars) |
| | 2230 | { |
| | 2231 | size_t entire_str_len; |
| | 2232 | re_state_id cur_state, final_state; |
| | 2233 | utf8_ptr p; |
| | 2234 | size_t curlen; |
| | 2235 | int start_ofs; |
| | 2236 | int _retval_; |
| | 2237 | |
| | 2238 | const int ST_EPS1 = 1; /* doing first branch of two-branch epsilon */ |
| | 2239 | const int ST_EPS2 = 2; /* doing second branch of two-branch epsilon */ |
| | 2240 | const int ST_ASSERT = 3; /* doing an assertion */ |
| | 2241 | |
| | 2242 | /* macro to perform a"local return" */ |
| | 2243 | #define local_return(retval) \ |
| | 2244 | _retval_ = (retval); \ |
| | 2245 | goto do_local_return; |
| | 2246 | |
| | 2247 | /* reset the stack */ |
| | 2248 | stack_.reset(); |
| | 2249 | |
| | 2250 | /* start at the machine's initial state */ |
| | 2251 | cur_state = machine->init; |
| | 2252 | |
| | 2253 | /* note the final state */ |
| | 2254 | final_state = machine->final; |
| | 2255 | |
| | 2256 | /* |
| | 2257 | * if we're starting in the final state, this is a zero-length |
| | 2258 | * pattern, which matches anything - immediately return success with a |
| | 2259 | * zero-length match |
| | 2260 | */ |
| | 2261 | if (cur_state == final_state) |
| | 2262 | return 0; |
| | 2263 | |
| | 2264 | /* start at the beginning of the string */ |
| | 2265 | p.set((char *)str); |
| | 2266 | curlen = origlen; |
| | 2267 | |
| | 2268 | /* note the offset from the start of the entire string */ |
| | 2269 | start_ofs = str - entire_str; |
| | 2270 | entire_str_len = start_ofs + origlen; |
| | 2271 | |
| | 2272 | /* run the machine */ |
| | 2273 | for (;;) |
| | 2274 | { |
| | 2275 | const re_tuple *tuple; |
| | 2276 | |
| | 2277 | /* get the tuple for this state */ |
| | 2278 | tuple = &tuple_arr[cur_state]; |
| | 2279 | |
| | 2280 | /* check the type of state we're processing */ |
| | 2281 | switch(tuple->typ) |
| | 2282 | { |
| | 2283 | case RE_ZERO_VAR: |
| | 2284 | /* save the variable in the stack frame if necessary */ |
| | 2285 | stack_.save_loop_var(tuple->info.loop.loop_var, loop_vars); |
| | 2286 | |
| | 2287 | /* zero the loop variable and proceed */ |
| | 2288 | loop_vars[tuple->info.loop.loop_var] = 0; |
| | 2289 | break; |
| | 2290 | |
| | 2291 | case RE_LOOP_BRANCH: |
| | 2292 | /* |
| | 2293 | * This is a loop branch. Check the variable to see if we've |
| | 2294 | * satisfied the loop condition yet: |
| | 2295 | * |
| | 2296 | * - If we haven't yet reached the minimum loop count, simply |
| | 2297 | * transition into the loop (i.e., take the 'enter' branch |
| | 2298 | * unconditionally). |
| | 2299 | * |
| | 2300 | * - If we have reached the maximum loop count (if there is |
| | 2301 | * one - if max is -1 then there's no upper bound), simply |
| | 2302 | * transition past the loop (i.e., take the 'bypass' branch |
| | 2303 | * unconditionally). |
| | 2304 | * |
| | 2305 | * - Otherwise, we must treat this just like an ordinary |
| | 2306 | * two-way epsilon branch. |
| | 2307 | * |
| | 2308 | * Note that, if we have a 'shortest' modifier, the bypass |
| | 2309 | * branch will be first, to encourage the indeterminate check |
| | 2310 | * to choose the short branch whenever possible; otherwise the |
| | 2311 | * enter branch will be first, so we take the long branch |
| | 2312 | * whenever possible. |
| | 2313 | */ |
| | 2314 | { |
| | 2315 | short *varptr; |
| | 2316 | |
| | 2317 | /* get the variable value */ |
| | 2318 | varptr = &loop_vars[tuple->info.loop.loop_var]; |
| | 2319 | |
| | 2320 | /* save the variable in the stack frame if necessary */ |
| | 2321 | stack_.save_loop_var(tuple->info.loop.loop_var, loop_vars); |
| | 2322 | |
| | 2323 | /* |
| | 2324 | * if we're not at the loop minimum yet, transition into |
| | 2325 | * the loop body |
| | 2326 | */ |
| | 2327 | if (*varptr < tuple->info.loop.loop_min) |
| | 2328 | { |
| | 2329 | /* increment the loop counter */ |
| | 2330 | ++(*varptr); |
| | 2331 | |
| | 2332 | /* unconditionally transfer into the loop */ |
| | 2333 | if ((tuple->flags & RE_STATE_SHORTEST) == 0) |
| | 2334 | cur_state = tuple->next_state_1; |
| | 2335 | else |
| | 2336 | cur_state = tuple->next_state_2; |
| | 2337 | |
| | 2338 | /* we're done processing this state */ |
| | 2339 | goto got_next_state; |
| | 2340 | } |
| | 2341 | |
| | 2342 | /* |
| | 2343 | * if we've reached the loop maximum (if there is one), |
| | 2344 | * transition past the loop |
| | 2345 | */ |
| | 2346 | if (tuple->info.loop.loop_max >= 0 |
| | 2347 | && *varptr >= tuple->info.loop.loop_max) |
| | 2348 | { |
| | 2349 | /* unconditionally skip the loop */ |
| | 2350 | if ((tuple->flags & RE_STATE_SHORTEST) == 0) |
| | 2351 | cur_state = tuple->next_state_2; |
| | 2352 | else |
| | 2353 | cur_state = tuple->next_state_1; |
| | 2354 | |
| | 2355 | /* we're done with this state */ |
| | 2356 | goto got_next_state; |
| | 2357 | } |
| | 2358 | |
| | 2359 | /* |
| | 2360 | * We don't know which way to go, so we must treat this as |
| | 2361 | * a two-way epsilon branch. Count this as another loop |
| | 2362 | * iteration, since the branch that enters the loop will |
| | 2363 | * come back here for another try. The branch that skips |
| | 2364 | * the loop doesn't care about the loop counter any more, |
| | 2365 | * so we can just increment it and ignore the skip branch. |
| | 2366 | */ |
| | 2367 | ++(*varptr); |
| | 2368 | goto two_way_epsilon; |
| | 2369 | } |
| | 2370 | /* not reached */ |
| | 2371 | |
| | 2372 | case RE_GROUP_MATCH: |
| | 2373 | { |
| | 2374 | int group_num; |
| | 2375 | re_group_register *group_reg; |
| | 2376 | size_t reg_len; |
| | 2377 | |
| | 2378 | /* it's a group - get the group number */ |
| | 2379 | group_num = (int)tuple->info.ch; |
| | 2380 | group_reg = ®s[group_num]; |
| | 2381 | |
| | 2382 | /* |
| | 2383 | * if this register isn't defined, there's nothing to |
| | 2384 | * match, so fail |
| | 2385 | */ |
| | 2386 | if (group_reg->start_ofs == -1 |
| | 2387 | || group_reg->end_ofs == -1) |
| | 2388 | { |
| | 2389 | local_return(-1); |
| | 2390 | } |
| | 2391 | |
| | 2392 | /* calculate the length of the register value */ |
| | 2393 | reg_len = group_reg->end_ofs - group_reg->start_ofs; |
| | 2394 | |
| | 2395 | /* if we don't have enough left to match, it fails */ |
| | 2396 | if (curlen < reg_len) |
| | 2397 | { |
| | 2398 | local_return(-1); |
| | 2399 | } |
| | 2400 | |
| | 2401 | /* if the string doesn't match exactly, we fail */ |
| | 2402 | if (memcmp(p.getptr(), entire_str + group_reg->start_ofs, |
| | 2403 | reg_len) != 0) |
| | 2404 | { |
| | 2405 | local_return(-1); |
| | 2406 | } |
| | 2407 | |
| | 2408 | /* |
| | 2409 | * It matches exactly - skip the entire byte length of the |
| | 2410 | * register in the source string |
| | 2411 | */ |
| | 2412 | p.set(p.getptr() + reg_len); |
| | 2413 | curlen -= reg_len; |
| | 2414 | } |
| | 2415 | break; |
| | 2416 | |
| | 2417 | case RE_TEXT_BEGIN: |
| | 2418 | /* |
| | 2419 | * Match only the exact beginning of the string - if we're |
| | 2420 | * anywhere else, this isn't a match. If this succeeds, we |
| | 2421 | * don't skip any characters. |
| | 2422 | */ |
| | 2423 | if (p.getptr() != entire_str) |
| | 2424 | { |
| | 2425 | local_return(-1); |
| | 2426 | } |
| | 2427 | break; |
| | 2428 | |
| | 2429 | case RE_TEXT_END: |
| | 2430 | /* |
| | 2431 | * Match only the exact end of the string - if we're anywhere |
| | 2432 | * else, this isn't a match. Don't skip any characters on |
| | 2433 | * success. |
| | 2434 | */ |
| | 2435 | if (curlen != 0) |
| | 2436 | { |
| | 2437 | local_return(-1); |
| | 2438 | } |
| | 2439 | break; |
| | 2440 | |
| | 2441 | case RE_WORD_BEGIN: |
| | 2442 | /* |
| | 2443 | * If the previous character is a word character, we're not at |
| | 2444 | * the beginning of a word. If we're at the beginning of the |
| | 2445 | * entire string, we need not check anything previous - |
| | 2446 | * there's no previous character, so we can't have a preceding |
| | 2447 | * word character. |
| | 2448 | */ |
| | 2449 | if (p.getptr() != entire_str |
| | 2450 | && is_word_char(p.getch_before(1))) |
| | 2451 | { |
| | 2452 | local_return(-1); |
| | 2453 | } |
| | 2454 | |
| | 2455 | /* |
| | 2456 | * if we're at the end of the string, or the current character |
| | 2457 | * isn't a word character, we're not at the beginning of a |
| | 2458 | * word |
| | 2459 | */ |
| | 2460 | if (curlen == 0 || !is_word_char(p.getch())) |
| | 2461 | { |
| | 2462 | local_return(-1); |
| | 2463 | } |
| | 2464 | break; |
| | 2465 | |
| | 2466 | case RE_WORD_END: |
| | 2467 | /* |
| | 2468 | * if the current character is a word character, we're not at |
| | 2469 | * the end of a word |
| | 2470 | */ |
| | 2471 | if (curlen != 0 && is_word_char(p.getch())) |
| | 2472 | { |
| | 2473 | local_return(-1); |
| | 2474 | } |
| | 2475 | |
| | 2476 | /* |
| | 2477 | * if we're at the beginning of the string, or the previous |
| | 2478 | * character is not a word character, we're not at the end of |
| | 2479 | * a word |
| | 2480 | */ |
| | 2481 | if (p.getptr() == entire_str |
| | 2482 | || !is_word_char(p.getch_before(1))) |
| | 2483 | { |
| | 2484 | local_return(-1); |
| | 2485 | } |
| | 2486 | break; |
| | 2487 | |
| | 2488 | case RE_WORD_CHAR: |
| | 2489 | /* if it's not a word character, it's a failure */ |
| | 2490 | if (curlen == 0 || !is_word_char(p.getch())) |
| | 2491 | { |
| | 2492 | local_return(-1); |
| | 2493 | } |
| | 2494 | |
| | 2495 | /* skip this character of input */ |
| | 2496 | p.inc(&curlen); |
| | 2497 | break; |
| | 2498 | |
| | 2499 | case RE_NON_WORD_CHAR: |
| | 2500 | /* if it's a word character, it's a failure */ |
| | 2501 | if (curlen == 0 || is_word_char(p.getch())) |
| | 2502 | { |
| | 2503 | local_return(-1); |
| | 2504 | } |
| | 2505 | |
| | 2506 | /* skip the input */ |
| | 2507 | p.inc(&curlen); |
| | 2508 | break; |
| | 2509 | |
| | 2510 | case RE_WORD_BOUNDARY: |
| | 2511 | case RE_NON_WORD_BOUNDARY: |
| | 2512 | { |
| | 2513 | int prev_is_word; |
| | 2514 | int next_is_word; |
| | 2515 | int boundary; |
| | 2516 | |
| | 2517 | /* |
| | 2518 | * Determine if the previous character is a word character |
| | 2519 | * -- if we're at the beginning of the string, it's |
| | 2520 | * obviously not, otherwise check its classification |
| | 2521 | */ |
| | 2522 | if (p.getptr() == entire_str) |
| | 2523 | { |
| | 2524 | /* |
| | 2525 | * at the start of the string, so there definitely |
| | 2526 | * isn't a preceding word character |
| | 2527 | */ |
| | 2528 | prev_is_word = FALSE; |
| | 2529 | } |
| | 2530 | else |
| | 2531 | { |
| | 2532 | /* look at the previous character */ |
| | 2533 | prev_is_word = is_word_char(p.getch_before(1)); |
| | 2534 | } |
| | 2535 | |
| | 2536 | /* make the same check for the current character */ |
| | 2537 | next_is_word = (curlen != 0 |
| | 2538 | && is_word_char(p.getch())); |
| | 2539 | |
| | 2540 | /* |
| | 2541 | * Determine if this is a boundary - it is if the two |
| | 2542 | * states are different |
| | 2543 | */ |
| | 2544 | boundary = ((prev_is_word != 0) ^ (next_is_word != 0)); |
| | 2545 | |
| | 2546 | /* |
| | 2547 | * make sure it matches what was desired, and return |
| | 2548 | * failure if not |
| | 2549 | */ |
| | 2550 | if ((tuple->typ == RE_WORD_BOUNDARY && !boundary) |
| | 2551 | || (tuple->typ == RE_NON_WORD_BOUNDARY && boundary)) |
| | 2552 | { |
| | 2553 | local_return(-1); |
| | 2554 | } |
| | 2555 | } |
| | 2556 | break; |
| | 2557 | |
| | 2558 | case RE_WILDCARD: |
| | 2559 | /* make sure we have a character to match */ |
| | 2560 | if (curlen == 0) |
| | 2561 | { |
| | 2562 | local_return(-1); |
| | 2563 | } |
| | 2564 | |
| | 2565 | /* skip this character */ |
| | 2566 | p.inc(&curlen); |
| | 2567 | break; |
| | 2568 | |
| | 2569 | case RE_RANGE: |
| | 2570 | case RE_RANGE_EXCL: |
| | 2571 | { |
| | 2572 | int match; |
| | 2573 | wchar_t ch; |
| | 2574 | wchar_t *rp; |
| | 2575 | size_t i; |
| | 2576 | |
| | 2577 | /* make sure we have a character to match */ |
| | 2578 | if (curlen == 0) |
| | 2579 | { |
| | 2580 | local_return(-1); |
| | 2581 | } |
| | 2582 | |
| | 2583 | /* get this character */ |
| | 2584 | ch = p.getch(); |
| | 2585 | |
| | 2586 | /* search for the character in the range */ |
| | 2587 | for (i = tuple->info.range.char_range_cnt, |
| | 2588 | rp = tuple->info.range.char_range ; |
| | 2589 | i != 0 ; i -= 2, rp += 2) |
| | 2590 | { |
| | 2591 | /* |
| | 2592 | * check for a class specifier; if it's not a class |
| | 2593 | * specifier, treat it as a literal range, and check |
| | 2594 | * case sensitivity |
| | 2595 | */ |
| | 2596 | if (rp[0] == '\0') |
| | 2597 | { |
| | 2598 | int match; |
| | 2599 | |
| | 2600 | /* |
| | 2601 | * The first character of the range pair is null, |
| | 2602 | * which means that this isn't a literal range but |
| | 2603 | * rather a class. Check for a match to the |
| | 2604 | * class. |
| | 2605 | */ |
| | 2606 | switch(rp[1]) |
| | 2607 | { |
| | 2608 | case RE_ALPHA: |
| | 2609 | match = t3_is_alpha(ch); |
| | 2610 | break; |
| | 2611 | |
| | 2612 | case RE_DIGIT: |
| | 2613 | match = t3_is_digit(ch); |
| | 2614 | break; |
| | 2615 | |
| | 2616 | case RE_UPPER: |
| | 2617 | match = t3_is_upper(ch); |
| | 2618 | break; |
| | 2619 | |
| | 2620 | case RE_LOWER: |
| | 2621 | match = t3_is_lower(ch); |
| | 2622 | break; |
| | 2623 | |
| | 2624 | case RE_ALPHANUM: |
| | 2625 | match = t3_is_alpha(ch) || t3_is_digit(ch); |
| | 2626 | break; |
| | 2627 | |
| | 2628 | case RE_SPACE: |
| | 2629 | match = t3_is_space(ch); |
| | 2630 | break; |
| | 2631 | |
| | 2632 | case RE_PUNCT: |
| | 2633 | match = t3_is_punct(ch); |
| | 2634 | break; |
| | 2635 | |
| | 2636 | case RE_NEWLINE: |
| | 2637 | match = (ch == 0x000A |
| | 2638 | || ch == 0x000D |
| | 2639 | || ch == 0x2028); |
| | 2640 | break; |
| | 2641 | |
| | 2642 | case RE_NULLCHAR: |
| | 2643 | match = (ch == 0); |
| | 2644 | break; |
| | 2645 | |
| | 2646 | default: |
| | 2647 | /* this shouldn't happen */ |
| | 2648 | match = FALSE; |
| | 2649 | break; |
| | 2650 | } |
| | 2651 | |
| | 2652 | /* |
| | 2653 | * if we matched, we can stop looking; otherwise, |
| | 2654 | * simply keep going, since there might be another |
| | 2655 | * entry that does match |
| | 2656 | */ |
| | 2657 | if (match) |
| | 2658 | break; |
| | 2659 | } |
| | 2660 | else if (pattern->case_sensitive) |
| | 2661 | { |
| | 2662 | /* |
| | 2663 | * the search is case-sensitive - compare the |
| | 2664 | * character to the range without case conversion |
| | 2665 | */ |
| | 2666 | if (ch >= rp[0] && ch <= rp[1]) |
| | 2667 | break; |
| | 2668 | } |
| | 2669 | else if (t3_is_upper(rp[0]) && t3_is_upper(rp[1])) |
| | 2670 | { |
| | 2671 | wchar_t uch; |
| | 2672 | |
| | 2673 | /* |
| | 2674 | * the range is all upper-case letters - convert |
| | 2675 | * the source character to upper-case for the |
| | 2676 | * comparison |
| | 2677 | */ |
| | 2678 | uch = t3_to_upper(ch); |
| | 2679 | if (uch >= rp[0] && uch <= rp[1]) |
| | 2680 | break; |
| | 2681 | } |
| | 2682 | else if (t3_is_lower(rp[0]) && t3_is_lower(rp[1])) |
| | 2683 | { |
| | 2684 | wchar_t lch; |
| | 2685 | |
| | 2686 | /* |
| | 2687 | * the range is all lower-case letters - convert |
| | 2688 | * the source character to upper-case for the |
| | 2689 | * comparison |
| | 2690 | */ |
| | 2691 | lch = t3_to_lower(ch); |
| | 2692 | if (lch >= rp[0] && lch <= rp[1]) |
| | 2693 | break; |
| | 2694 | } |
| | 2695 | else |
| | 2696 | { |
| | 2697 | /* |
| | 2698 | * The cases of the two ends of the range don't |
| | 2699 | * agree, so there's nothing we can do for case |
| | 2700 | * conversions. Simply compare the range exactly. |
| | 2701 | */ |
| | 2702 | if (ch >= rp[0] && ch <= rp[1]) |
| | 2703 | break; |
| | 2704 | } |
| | 2705 | } |
| | 2706 | |
| | 2707 | /* we matched if we stopped before exhausting the list */ |
| | 2708 | match = (i != 0); |
| | 2709 | |
| | 2710 | /* make sure we got what we wanted */ |
| | 2711 | if ((tuple->typ == RE_RANGE && !match) |
| | 2712 | || (tuple->typ == RE_RANGE_EXCL && match)) |
| | 2713 | { |
| | 2714 | local_return(-1); |
| | 2715 | } |
| | 2716 | |
| | 2717 | /* skip this character of the input */ |
| | 2718 | p.inc(&curlen); |
| | 2719 | } |
| | 2720 | break; |
| | 2721 | |
| | 2722 | case RE_ALPHA: |
| | 2723 | /* check for an alphabetic character */ |
| | 2724 | if (curlen == 0 || !t3_is_alpha(p.getch())) |
| | 2725 | { |
| | 2726 | local_return(-1); |
| | 2727 | } |
| | 2728 | |
| | 2729 | /* skip this character of the input, and continue */ |
| | 2730 | p.inc(&curlen); |
| | 2731 | break; |
| | 2732 | |
| | 2733 | case RE_DIGIT: |
| | 2734 | /* check for a digit character */ |
| | 2735 | if (curlen == 0 || !t3_is_digit(p.getch())) |
| | 2736 | { |
| | 2737 | local_return(-1); |
| | 2738 | } |
| | 2739 | |
| | 2740 | /* skip this character of the input, and continue */ |
| | 2741 | p.inc(&curlen); |
| | 2742 | break; |
| | 2743 | |
| | 2744 | case RE_UPPER: |
| | 2745 | /* check for an upper-case alphabetic character */ |
| | 2746 | if (curlen == 0 || !t3_is_upper(p.getch())) |
| | 2747 | { |
| | 2748 | local_return(-1); |
| | 2749 | } |
| | 2750 | |
| | 2751 | /* skip this character of the input, and continue */ |
| | 2752 | p.inc(&curlen); |
| | 2753 | break; |
| | 2754 | |
| | 2755 | case RE_LOWER: |
| | 2756 | /* check for a lower-case alphabetic character */ |
| | 2757 | if (curlen == 0 || !t3_is_lower(p.getch())) |
| | 2758 | { |
| | 2759 | local_return(-1); |
| | 2760 | } |
| | 2761 | |
| | 2762 | /* skip this character of the input, and continue */ |
| | 2763 | p.inc(&curlen); |
| | 2764 | break; |
| | 2765 | |
| | 2766 | case RE_ALPHANUM: |
| | 2767 | /* check for an alphabetic or digit character */ |
| | 2768 | if (curlen == 0 |
| | 2769 | || (!t3_is_alpha(p.getch()) && !t3_is_digit(p.getch()))) |
| | 2770 | { |
| | 2771 | local_return(-1); |
| | 2772 | } |
| | 2773 | |
| | 2774 | /* skip this character of the input, and continue */ |
| | 2775 | p.inc(&curlen); |
| | 2776 | break; |
| | 2777 | |
| | 2778 | case RE_SPACE: |
| | 2779 | /* check for a space of some kind */ |
| | 2780 | if (curlen == 0 || !t3_is_space(p.getch())) |
| | 2781 | { |
| | 2782 | local_return(-1); |
| | 2783 | } |
| | 2784 | |
| | 2785 | /* skip this character of the input, and continue */ |
| | 2786 | p.inc(&curlen); |
| | 2787 | break; |
| | 2788 | |
| | 2789 | case RE_PUNCT: |
| | 2790 | /* check for a punctuation character of some kind */ |
| | 2791 | if (curlen == 0 || !t3_is_punct(p.getch())) |
| | 2792 | { |
| | 2793 | local_return(-1); |
| | 2794 | } |
| | 2795 | |
| | 2796 | /* skip this character of the input, and continue */ |
| | 2797 | p.inc(&curlen); |
| | 2798 | break; |
| | 2799 | |
| | 2800 | case RE_NEWLINE: |
| | 2801 | /* check for a newline character of some kind */ |
| | 2802 | { |
| | 2803 | wchar_t ch; |
| | 2804 | |
| | 2805 | /* if we're out of characters, we don't have a match */ |
| | 2806 | if (curlen == 0) |
| | 2807 | { |
| | 2808 | local_return(-1); |
| | 2809 | } |
| | 2810 | |
| | 2811 | /* get the character */ |
| | 2812 | ch = p.getch(); |
| | 2813 | |
| | 2814 | /* if it's not a newline, we fail this match */ |
| | 2815 | if (ch != 0x000A && ch != 0x000d && ch != 0x2028) |
| | 2816 | { |
| | 2817 | local_return(-1); |
| | 2818 | } |
| | 2819 | } |
| | 2820 | |
| | 2821 | /* skip this character of input and continue */ |
| | 2822 | p.inc(&curlen); |
| | 2823 | break; |
| | 2824 | |
| | 2825 | case RE_ASSERT_POS: |
| | 2826 | case RE_ASSERT_NEG: |
| | 2827 | /* |
| | 2828 | * It's an assertion. Run this as a sub-state: push the |
| | 2829 | * current state so that we can come back to it later. |
| | 2830 | */ |
| | 2831 | stack_.push(ST_ASSERT, start_ofs, p.getptr() - entire_str, |
| | 2832 | cur_state, final_state); |
| | 2833 | |
| | 2834 | /* |
| | 2835 | * In the sub-state, start with the sub-machine's initial |
| | 2836 | * state and finish with the sub-machine's final state. |
| | 2837 | */ |
| | 2838 | cur_state = tuple->info.sub.init; |
| | 2839 | final_state = tuple->info.sub.final; |
| | 2840 | |
| | 2841 | /* in the sub-state, the sub-string to match starts here */ |
| | 2842 | start_ofs = p.getptr() - entire_str; |
| | 2843 | |
| | 2844 | /* |
| | 2845 | * just proceed from here; we'll finish up with the assertion |
| | 2846 | * test when we reach the final sub-machine state and pop the |
| | 2847 | * stack state we pushed |
| | 2848 | */ |
| | 2849 | goto got_next_state; |
| | 2850 | |
| | 2851 | case RE_LITERAL: |
| | 2852 | /* |
| | 2853 | * ordinary character match - if there's not another |
| | 2854 | * character, we obviously fail |
| | 2855 | */ |
| | 2856 | if (curlen == 0) |
| | 2857 | { |
| | 2858 | local_return(-1); |
| | 2859 | } |
| | 2860 | |
| | 2861 | /* |
| | 2862 | * check case sensitivity - if we're not in case-sensitive |
| | 2863 | * mode, and both characters are alphabetic, perform a |
| | 2864 | * case-insensitive comparison; otherwise, perform an exact |
| | 2865 | * comparison |
| | 2866 | */ |
| | 2867 | if (tuple->info.ch == p.getch()) |
| | 2868 | { |
| | 2869 | /* |
| | 2870 | * we have an exact match; there's no need to check for |
| | 2871 | * any case conversions |
| | 2872 | */ |
| | 2873 | } |
| | 2874 | else if (!pattern->case_sensitive |
| | 2875 | && t3_is_alpha(tuple->info.ch) && t3_is_alpha(p.getch())) |
| | 2876 | { |
| | 2877 | /* |
| | 2878 | * We're performing a case-insensitive search, and both |
| | 2879 | * characters are alphabetic. Convert the string |
| | 2880 | * character to the same case as the pattern character, |
| | 2881 | * then compare them. Note that we always use the case of |
| | 2882 | * the pattern character, because this gives the pattern |
| | 2883 | * control over the handling for languages where |
| | 2884 | * conversions are ambiguous. |
| | 2885 | */ |
| | 2886 | if (t3_is_upper(tuple->info.ch)) |
| | 2887 | { |
| | 2888 | /* |
| | 2889 | * the pattern character is upper-case - convert the |
| | 2890 | * string character to upper-case and compare |
| | 2891 | */ |
| | 2892 | if (tuple->info.ch != t3_to_upper(p.getch())) |
| | 2893 | { |
| | 2894 | local_return(-1); |
| | 2895 | } |
| | 2896 | } |
| | 2897 | else if (t3_is_lower(tuple->info.ch)) |
| | 2898 | { |
| | 2899 | /* |
| | 2900 | * the pattern character is lower-case - compare to |
| | 2901 | * the lower-case string character |
| | 2902 | */ |
| | 2903 | if (tuple->info.ch != t3_to_lower(p.getch())) |
| | 2904 | { |
| | 2905 | local_return(-1); |
| | 2906 | } |
| | 2907 | } |
| | 2908 | else |
| | 2909 | { |
| | 2910 | /* |
| | 2911 | * the pattern character is neither upper nor lower |
| | 2912 | * case, so we cannot perform a case conversion; we |
| | 2913 | * know the match isn't exact, so we don't have a |
| | 2914 | * match at all |
| | 2915 | */ |
| | 2916 | local_return(-1); |
| | 2917 | } |
| | 2918 | } |
| | 2919 | else |
| | 2920 | { |
| | 2921 | /* |
| | 2922 | * the search is case-sensitive, or this pattern character |
| | 2923 | * is non-alphabetic; since we didn't find an exact match, |
| | 2924 | * the string does not match the pattern |
| | 2925 | */ |
| | 2926 | local_return(-1); |
| | 2927 | } |
| | 2928 | |
| | 2929 | /* skip this character of the input */ |
| | 2930 | p.inc(&curlen); |
| | 2931 | break; |
| | 2932 | |
| | 2933 | case RE_GROUP_ENTER: |
| | 2934 | /* |
| | 2935 | * if the group index (given by the state's character ID) is |
| | 2936 | * in range, note the location in the string where the group |
| | 2937 | * starts |
| | 2938 | */ |
| | 2939 | if (tuple->info.ch < RE_GROUP_REG_CNT) |
| | 2940 | { |
| | 2941 | /* save the register in the stack frame if necessary */ |
| | 2942 | stack_.save_group_reg(tuple->info.ch, regs); |
| | 2943 | |
| | 2944 | /* store the new value */ |
| | 2945 | regs[tuple->info.ch].start_ofs = p.getptr() - entire_str; |
| | 2946 | } |
| | 2947 | break; |
| | 2948 | |
| | 2949 | case RE_GROUP_EXIT: |
| | 2950 | /* |
| | 2951 | * note the string location of the end of the group, if the |
| | 2952 | * group ID is in range |
| | 2953 | */ |
| | 2954 | if (tuple->info.ch < RE_GROUP_REG_CNT) |
| | 2955 | { |
| | 2956 | /* save the register in the stack frame if necessary */ |
| | 2957 | stack_.save_group_reg(tuple->info.ch, regs); |
| | 2958 | |
| | 2959 | /* store the new value */ |
| | 2960 | regs[tuple->info.ch].end_ofs = p.getptr() - entire_str; |
| | 2961 | } |
| | 2962 | break; |
| | 2963 | |
| | 2964 | case RE_EPSILON: |
| | 2965 | /* it's an epsilon transition */ |
| | 2966 | if (tuple->next_state_2 == RE_STATE_INVALID) |
| | 2967 | { |
| | 2968 | /* |
| | 2969 | * We have only one transition, so this state is entirely |
| | 2970 | * deterministic. Simply move on to the next state. (We |
| | 2971 | * try to optimize these types of transitions out of the |
| | 2972 | * machine when compiling, but we handle them anyway in |
| | 2973 | * case any survive optimization.) |
| | 2974 | */ |
| | 2975 | cur_state = tuple->next_state_1; |
| | 2976 | } |
| | 2977 | else |
| | 2978 | { |
| | 2979 | two_way_epsilon: |
| | 2980 | /* |
| | 2981 | * This state has two possible transitions, and we don't |
| | 2982 | * know which one to take. So, try both, see which one |
| | 2983 | * works better, and return the result. Try the first |
| | 2984 | * transition first. |
| | 2985 | * |
| | 2986 | * To test both branches, we push the machine state onto |
| | 2987 | * the stack, test the first branch, then restore the |
| | 2988 | * machine state and test the second branch. We return |
| | 2989 | * the better of the two branches. |
| | 2990 | * |
| | 2991 | * Set up to test the first branch by pushing the current |
| | 2992 | * machine state, noting that we're in the first branch of |
| | 2993 | * a two-branch epsilon so that we'll know to proceed to |
| | 2994 | * the second branch when we finish with the first one. |
| | 2995 | */ |
| | 2996 | stack_.push(ST_EPS1, start_ofs, p.getptr() - entire_str, |
| | 2997 | cur_state, final_state); |
| | 2998 | |
| | 2999 | /* the next state is the first branch of the epsilon */ |
| | 3000 | cur_state = tuple->next_state_1; |
| | 3001 | |
| | 3002 | /* match the sub-string from here */ |
| | 3003 | start_ofs = p.getptr() - entire_str; |
| | 3004 | |
| | 3005 | /* we have the next state set to go */ |
| | 3006 | goto got_next_state; |
| | 3007 | } |
| | 3008 | break; |
| | 3009 | |
| | 3010 | default: |
| | 3011 | /* invalid node type */ |
| | 3012 | assert(FALSE); |
| | 3013 | local_return(-1); |
| | 3014 | } |
| | 3015 | |
| | 3016 | /* |
| | 3017 | * if we got this far, we were successful - move on to the next |
| | 3018 | * state |
| | 3019 | */ |
| | 3020 | cur_state = tuple->next_state_1; |
| | 3021 | |
| | 3022 | got_next_state: |
| | 3023 | /* we come here when we've already figured out the next state */ |
| | 3024 | ; |
| | 3025 | |
| | 3026 | /* |
| | 3027 | * If we're in the final state, it means we've matched the |
| | 3028 | * pattern. Return success by indicating the length of the string |
| | 3029 | * we matched. |
| | 3030 | */ |
| | 3031 | if (cur_state == final_state) |
| | 3032 | { |
| | 3033 | local_return(p.getptr() - (entire_str + start_ofs)); |
| | 3034 | } |
| | 3035 | |
| | 3036 | /* |
| | 3037 | * if we're in an invalid state, the expression must have been |
| | 3038 | * ill-formed; return failure |
| | 3039 | */ |
| | 3040 | if (cur_state == RE_STATE_INVALID) |
| | 3041 | { |
| | 3042 | local_return(-1); |
| | 3043 | } |
| | 3044 | |
| | 3045 | /* resume the loop */ |
| | 3046 | continue; |
| | 3047 | |
| | 3048 | do_local_return: |
| | 3049 | /* check what kind of state we're returning to */ |
| | 3050 | switch(stack_.get_top_type()) |
| | 3051 | { |
| | 3052 | int str_ofs; |
| | 3053 | int ret1, ret2; |
| | 3054 | int ret1_is_winner; |
| | 3055 | |
| | 3056 | case -1: |
| | 3057 | /* |
| | 3058 | * There's nothing left on the state stack, so we're actually |
| | 3059 | * returning to our caller. Simply return the given return |
| | 3060 | * value. |
| | 3061 | */ |
| | 3062 | return _retval_; |
| | 3063 | |
| | 3064 | case ST_EPS1: |
| | 3065 | /* |
| | 3066 | * We're finishing the first branch of a two-branch epsilon. |
| | 3067 | * Swap the current state with the saved state on the stack, |
| | 3068 | * and push a new state for the second branch. |
| | 3069 | */ |
| | 3070 | str_ofs = p.getptr() - entire_str; |
| | 3071 | stack_.save_and_push(_retval_, ST_EPS2, &start_ofs, &str_ofs, |
| | 3072 | &cur_state, &final_state, |
| | 3073 | regs, loop_vars); |
| | 3074 | |
| | 3075 | /* the next state is the second branch */ |
| | 3076 | cur_state = tuple_arr[cur_state].next_state_2; |
| | 3077 | |
| | 3078 | /* set up at the original string position */ |
| | 3079 | p.set((char *)entire_str + str_ofs); |
| | 3080 | curlen = entire_str_len - str_ofs; |
| | 3081 | |
| | 3082 | /* the second branch substring starts at the current character */ |
| | 3083 | start_ofs = p.getptr() - entire_str; |
| | 3084 | |
| | 3085 | /* continue from here */ |
| | 3086 | break; |
| | 3087 | |
| | 3088 | case ST_EPS2: |
| | 3089 | /* |
| | 3090 | * We're finishing the second branch of a two-branch epsilon. |
| | 3091 | * First, get the two return values - the first branch is the |
| | 3092 | * return value saved in the second-from-the-top stack frame, |
| | 3093 | * and the second branch is the current return value. |
| | 3094 | */ |
| | 3095 | ret1 = stack_.get_frame(1)->retval; |
| | 3096 | ret2 = _retval_; |
| | 3097 | |
| | 3098 | /* |
| | 3099 | * If they both failed, the whole thing failed. Otherwise, |
| | 3100 | * return the longer or shorter of the two, depending on the |
| | 3101 | * current match mode, plus the length we ourselves matched |
| | 3102 | * previously. Note that we return the register set from the |
| | 3103 | * winning match. |
| | 3104 | */ |
| | 3105 | if (ret1 < 0 && ret2 < 0) |
| | 3106 | { |
| | 3107 | /* |
| | 3108 | * They both failed - backtrack to the initial state by |
| | 3109 | * popping the second branch's frame (which stores the |
| | 3110 | * branch initial state) and discarding the first branch's |
| | 3111 | * frame (which stores the first branch's *final* state). |
| | 3112 | * Note that the only thing we care about from the stack |
| | 3113 | * is the group registers and loop variables; everything |
| | 3114 | * else will be restored from the enclosing frame that |
| | 3115 | * we're returning to. |
| | 3116 | */ |
| | 3117 | stack_.pop(&start_ofs, &str_ofs, &cur_state, &final_state, |
| | 3118 | regs, loop_vars); |
| | 3119 | stack_.discard(); |
| | 3120 | |
| | 3121 | /* set the string pointer */ |
| | 3122 | p.set((char *)entire_str + str_ofs); |
| | 3123 | curlen = entire_str_len - str_ofs; |
| | 3124 | |
| | 3125 | /* return failure */ |
| | 3126 | local_return(-1); |
| | 3127 | } |
| | 3128 | |
| | 3129 | /* |
| | 3130 | * Choose the winner based on the match mode. The match mode |
| | 3131 | * of interest is the one in the original two-way epsilon, |
| | 3132 | * which is the state in the stack frame at the top of the |
| | 3133 | * stack. |
| | 3134 | */ |
| | 3135 | if (pattern->longest_match |
| | 3136 | && !(tuple_arr[stack_.get_frame(0)->state].flags |
| | 3137 | & RE_STATE_SHORTEST)) |
| | 3138 | { |
| | 3139 | /* |
| | 3140 | * longest match - choose ret1 if ret2 isn't a good match, |
| | 3141 | * or ret1 is longer than ret2 |
| | 3142 | */ |
| | 3143 | ret1_is_winner = (ret2 < 0 || ret1 >= ret2); |
| | 3144 | } |
| | 3145 | else |
| | 3146 | { |
| | 3147 | /* |
| | 3148 | * shortest match - choose ret1 if it's the only good |
| | 3149 | * match, or if they're both good and it's the shorter one |
| | 3150 | */ |
| | 3151 | ret1_is_winner = ((ret1 >= 0 && ret2 < 0) |
| | 3152 | || (ret1 >= 0 && ret2 >= 0 |
| | 3153 | && ret1 <= ret2)); |
| | 3154 | } |
| | 3155 | |
| | 3156 | /* choose the winner */ |
| | 3157 | if (ret1_is_winner) |
| | 3158 | { |
| | 3159 | /* |
| | 3160 | * The winner is the first branch. First, pop the second |
| | 3161 | * branch's initial state, since this is the baseline for |
| | 3162 | * the first branch's final state. |
| | 3163 | */ |
| | 3164 | stack_.pop(&start_ofs, &str_ofs, &cur_state, &final_state, |
| | 3165 | regs, loop_vars); |
| | 3166 | |
| | 3167 | /* add in the length up to the initial state at the epsilon */ |
| | 3168 | ret1 += str_ofs - start_ofs; |
| | 3169 | |
| | 3170 | /* |
| | 3171 | * Second, pop the first branch's final state. This is |
| | 3172 | * stored relative to the initial state of the branch, so |
| | 3173 | * we're ready to restore it now. |
| | 3174 | */ |
| | 3175 | stack_.pop(&start_ofs, &str_ofs, &cur_state, &final_state, |
| | 3176 | regs, loop_vars); |
| | 3177 | |
| | 3178 | /* set the string pointer */ |
| | 3179 | p.set((char *)entire_str + str_ofs); |
| | 3180 | curlen = entire_str_len - str_ofs; |
| | 3181 | |
| | 3182 | /* return the result */ |
| | 3183 | local_return(ret1); |
| | 3184 | } |
| | 3185 | else |
| | 3186 | { |
| | 3187 | regex_stack_entry *fp; |
| | 3188 | |
| | 3189 | /* |
| | 3190 | * The winner is the second branch. The current machine |
| | 3191 | * state is the final state for the second branch, so we |
| | 3192 | * simply need to discard the saved state for the two |
| | 3193 | * branches. |
| | 3194 | */ |
| | 3195 | |
| | 3196 | /* add in the length up to the initial state at the epsilon */ |
| | 3197 | fp = stack_.get_frame(0); |
| | 3198 | ret2 += fp->str_ofs - fp->start_ofs; |
| | 3199 | |
| | 3200 | /* discard the two branch states */ |
| | 3201 | stack_.discard(); |
| | 3202 | stack_.discard(); |
| | 3203 | |
| | 3204 | /* return the result */ |
| | 3205 | local_return(ret2); |
| | 3206 | } |
| | 3207 | |
| | 3208 | case ST_ASSERT: |
| | 3209 | /* |
| | 3210 | * We're finishing an assertion sub-machine. First, pop the |
| | 3211 | * machine state to get back to where we were. |
| | 3212 | */ |
| | 3213 | stack_.pop(&start_ofs, &str_ofs, &cur_state, &final_state, |
| | 3214 | regs, loop_vars); |
| | 3215 | |
| | 3216 | /* |
| | 3217 | * set the string pointer and remaining length for the string |
| | 3218 | * offset we popped from the state |
| | 3219 | */ |
| | 3220 | p.set((char *)entire_str + str_ofs); |
| | 3221 | curlen = entire_str_len - str_ofs; |
| | 3222 | |
| | 3223 | /* |
| | 3224 | * If this is a positive assertion and it didn't match, return |
| | 3225 | * failure; if this is a negative assertion and it did match, |
| | 3226 | * return failure; otherwise, proceed without having consumed |
| | 3227 | * any input |
| | 3228 | */ |
| | 3229 | if (tuple_arr[cur_state].typ == RE_ASSERT_POS |
| | 3230 | ? _retval_ < 0 : _retval_ >= 0) |
| | 3231 | { |
| | 3232 | local_return(-1); |
| | 3233 | } |
| | 3234 | |
| | 3235 | /* resume where we left off */ |
| | 3236 | cur_state = tuple_arr[cur_state].next_state_1; |
| | 3237 | break; |
| | 3238 | } |
| | 3239 | |
| | 3240 | /* resume from here */ |
| | 3241 | goto got_next_state; |
| | 3242 | } |
| | 3243 | } |
| | 3244 | |
| | 3245 | /* ------------------------------------------------------------------------ */ |
| | 3246 | /* |
| | 3247 | * Search for a regular expression within a string. Returns -1 if the |
| | 3248 | * string cannot be found, otherwise returns the offset from the start |
| | 3249 | * of the string to be searched of the start of the first match for the |
| | 3250 | * pattern. |
| | 3251 | */ |
| | 3252 | int CRegexSearcher::search(const char *entirestr, const char *str, size_t len, |
| | 3253 | const re_compiled_pattern_base *pattern, |
| | 3254 | const re_tuple *tuple_arr, |
| | 3255 | const re_machine *machine, re_group_register *regs, |
| | 3256 | int *result_len) |
| | 3257 | { |
| | 3258 | utf8_ptr p; |
| | 3259 | re_group_register best_match_regs[RE_GROUP_REG_CNT]; |
| | 3260 | short loop_vars[RE_LOOP_VARS_MAX]; |
| | 3261 | int best_match_len; |
| | 3262 | int best_match_start; |
| | 3263 | const char *max_start_pos; |
| | 3264 | |
| | 3265 | /* we don't have a match yet */ |
| | 3266 | best_match_len = -1; |
| | 3267 | best_match_start = -1; |
| | 3268 | |
| | 3269 | /* search the entire string */ |
| | 3270 | max_start_pos = str + len; |
| | 3271 | |
| | 3272 | /* |
| | 3273 | * Starting at the first character in the string, search for the |
| | 3274 | * pattern at each subsequent character until we either find the |
| | 3275 | * pattern or run out of string to test. |
| | 3276 | */ |
| | 3277 | for (p.set((char *)str) ; p.getptr() < max_start_pos ; p.inc(&len)) |
| | 3278 | { |
| | 3279 | int matchlen; |
| | 3280 | |
| | 3281 | /* check for a match */ |
| | 3282 | matchlen = match(entirestr, p.getptr(), len, |
| | 3283 | pattern, tuple_arr, machine, regs, loop_vars); |
| | 3284 | if (matchlen >= 0) |
| | 3285 | { |
| | 3286 | /* check our first-begin/first-end mode */ |
| | 3287 | if (pattern->first_begin) |
| | 3288 | { |
| | 3289 | /* |
| | 3290 | * We're in first-begin mode: return immediately, |
| | 3291 | * because we want to find the match that starts at the |
| | 3292 | * earliest point in the string; having found a match |
| | 3293 | * here, there's no point in looking for a later match, |
| | 3294 | * since it obviously won't start any earlier than this |
| | 3295 | * one does. |
| | 3296 | */ |
| | 3297 | *result_len = matchlen; |
| | 3298 | return p.getptr() - str; |
| | 3299 | } |
| | 3300 | else |
| | 3301 | { |
| | 3302 | int keep; |
| | 3303 | |
| | 3304 | /* |
| | 3305 | * We're in first-end mode. We can't return yet, |
| | 3306 | * because we might find something that ends before this |
| | 3307 | * string does. |
| | 3308 | * |
| | 3309 | * If this is the first or best match so far, note it; |
| | 3310 | * otherwise, ignore it. In any case, continue looking. |
| | 3311 | */ |
| | 3312 | if (best_match_len == -1) |
| | 3313 | { |
| | 3314 | /* |
| | 3315 | * this is our first match - it's definitely the |
| | 3316 | * best so far |
| | 3317 | */ |
| | 3318 | keep = TRUE; |
| | 3319 | } |
| | 3320 | else |
| | 3321 | { |
| | 3322 | int end_idx; |
| | 3323 | |
| | 3324 | /* calculate the ending index of this match */ |
| | 3325 | end_idx = (p.getptr() - str) + matchlen; |
| | 3326 | |
| | 3327 | /* see what we have */ |
| | 3328 | if (end_idx < best_match_start + best_match_len) |
| | 3329 | { |
| | 3330 | /* |
| | 3331 | * this one ends earlier than the previous best |
| | 3332 | * match so far -- we definitely want to keep |
| | 3333 | * this one |
| | 3334 | */ |
| | 3335 | keep = TRUE; |
| | 3336 | } |
| | 3337 | else if (end_idx == best_match_start + best_match_len) |
| | 3338 | { |
| | 3339 | /* |
| | 3340 | * This one ends at exactly the same point as |
| | 3341 | * our best match so far. Decide which one to |
| | 3342 | * keep based on the lengths. If we're in |
| | 3343 | * longest-match mode, keep the longer one, |
| | 3344 | * otherwise keep the shorter one. |
| | 3345 | */ |
| | 3346 | if (pattern->longest_match) |
| | 3347 | { |
| | 3348 | /* |
| | 3349 | * longest-match mode - keep the old one, |
| | 3350 | * since it's longer (it starts earlier and |
| | 3351 | * ends at the same point) |
| | 3352 | */ |
| | 3353 | keep = FALSE; |
| | 3354 | } |
| | 3355 | else |
| | 3356 | { |
| | 3357 | /* |
| | 3358 | * shortest-match mode - keep the new one, |
| | 3359 | * since it's shorter (it starts later and |
| | 3360 | * ends at the same point) |
| | 3361 | */ |
| | 3362 | keep = TRUE; |
| | 3363 | } |
| | 3364 | } |
| | 3365 | else |
| | 3366 | { |
| | 3367 | /* |
| | 3368 | * this one isn't as good as the previous best |
| | 3369 | * -- ignore this one and keep our previous |
| | 3370 | * winner |
| | 3371 | */ |
| | 3372 | keep = FALSE; |
| | 3373 | } |
| | 3374 | } |
| | 3375 | |
| | 3376 | /* if we're keeping this match, save it */ |
| | 3377 | if (keep) |
| | 3378 | { |
| | 3379 | /* save the start index, length, and register contents */ |
| | 3380 | best_match_start = p.getptr() - str; |
| | 3381 | best_match_len = matchlen; |
| | 3382 | memcpy(best_match_regs, regs, sizeof(best_match_regs)); |
| | 3383 | |
| | 3384 | /* |
| | 3385 | * There's no point in looking for strings that |
| | 3386 | * start beyond the end of the current match, |
| | 3387 | * because they certainly won't end before this |
| | 3388 | * match ends. So, adjust our end-of-loop marker to |
| | 3389 | * point to the next character past the end of our |
| | 3390 | * match. |
| | 3391 | */ |
| | 3392 | max_start_pos = p.getptr() + matchlen; |
| | 3393 | } |
| | 3394 | } |
| | 3395 | } |
| | 3396 | } |
| | 3397 | |
| | 3398 | /* if we found a previous match, return it */ |
| | 3399 | if (best_match_len != -1) |
| | 3400 | { |
| | 3401 | /* set the caller's match length */ |
| | 3402 | *result_len = best_match_len; |
| | 3403 | |
| | 3404 | /* copy the saved match registers into the caller's registers */ |
| | 3405 | memcpy(regs, best_match_regs, sizeof(best_match_regs)); |
| | 3406 | |
| | 3407 | /* return the starting index of the match */ |
| | 3408 | return best_match_start; |
| | 3409 | } |
| | 3410 | |
| | 3411 | /* we didn't find a match */ |
| | 3412 | return -1; |
| | 3413 | } |
| | 3414 | |
| | 3415 | /* ------------------------------------------------------------------------ */ |
| | 3416 | /* |
| | 3417 | * Search for a previously-compiled pattern within the given string. |
| | 3418 | * Returns the offset of the match, or -1 if no match was found. |
| | 3419 | */ |
| | 3420 | int CRegexSearcher::search_for_pattern( |
| | 3421 | const re_compiled_pattern *pattern, |
| | 3422 | const char *entirestr, const char *searchstr, size_t searchlen, |
| | 3423 | int *result_len, re_group_register *regs) |
| | 3424 | { |
| | 3425 | /* |
| | 3426 | * search for the pattern in our copy of the string - use the copy so |
| | 3427 | * that the group registers stay valid even if the caller deallocates |
| | 3428 | * the original string after we return |
| | 3429 | */ |
| | 3430 | return search(entirestr, searchstr, searchlen, pattern, pattern->tuples, |
| | 3431 | &pattern->machine, regs, result_len); |
| | 3432 | } |
| | 3433 | |
| | 3434 | /* ------------------------------------------------------------------------ */ |
| | 3435 | /* |
| | 3436 | * Check for a match to a previously compiled expression. Returns the |
| | 3437 | * length of the match if we found a match, -1 if we found no match. This |
| | 3438 | * is not a search function; we merely match the leading substring of the |
| | 3439 | * given string to the given pattern. |
| | 3440 | */ |
| | 3441 | int CRegexSearcher::match_pattern( |
| | 3442 | const re_compiled_pattern *pattern, |
| | 3443 | const char *entirestr, const char *searchstr, size_t searchlen, |
| | 3444 | re_group_register *regs) |
| | 3445 | { |
| | 3446 | short loop_vars[RE_LOOP_VARS_MAX]; |
| | 3447 | |
| | 3448 | /* match the string */ |
| | 3449 | return match(entirestr, searchstr, searchlen, |
| | 3450 | pattern, pattern->tuples, &pattern->machine, |
| | 3451 | regs, loop_vars); |
| | 3452 | } |
| | 3453 | |
| | 3454 | /* ------------------------------------------------------------------------ */ |
| | 3455 | /* |
| | 3456 | * Compile an expression and check for a match. Returns the length of the |
| | 3457 | * match if we found a match, -1 if we found no match. This is not a |
| | 3458 | * search function; we merely match the leading substring of the given |
| | 3459 | * string to the given pattern. |
| | 3460 | * |
| | 3461 | * If a pattern pattern will be used repeatedly in multiple searches or |
| | 3462 | * matches, it is more efficient to compile the pattern once and then |
| | 3463 | * re-use the compiled pattern for each search or match, since compiling a |
| | 3464 | * pattern takes time. |
| | 3465 | */ |
| | 3466 | int CRegexSearcherSimple::compile_and_match( |
| | 3467 | const char *patstr, size_t patlen, |
| | 3468 | const char *entirestr, const char *searchstr, size_t searchlen) |
| | 3469 | { |
| | 3470 | re_compiled_pattern_base pat; |
| | 3471 | short loop_vars[RE_LOOP_VARS_MAX]; |
| | 3472 | |
| | 3473 | /* no groups yet */ |
| | 3474 | group_cnt_ = 0; |
| | 3475 | |
| | 3476 | /* clear the group registers */ |
| | 3477 | clear_group_regs(regs_); |
| | 3478 | |
| | 3479 | /* compile the expression - return failure if we get an error */ |
| | 3480 | if (parser_->compile(patstr, patlen, &pat) != RE_STATUS_SUCCESS) |
| | 3481 | return FALSE; |
| | 3482 | |
| | 3483 | /* remember the group count from the compiled pattern */ |
| | 3484 | group_cnt_ = pat.group_cnt; |
| | 3485 | |
| | 3486 | /* match the string */ |
| | 3487 | return match(entirestr, searchstr, searchlen, |
| | 3488 | &pat, parser_->tuple_arr_, &pat.machine, regs_, loop_vars); |
| | 3489 | } |
| | 3490 | |
| | 3491 | /* ------------------------------------------------------------------------ */ |
| | 3492 | /* |
| | 3493 | * Compile an expression and search for a match within the given string. |
| | 3494 | * Returns the offset of the match, or -1 if no match was found. |
| | 3495 | * |
| | 3496 | * If a pattern will be used repeatedly in multiple searches or matches, it |
| | 3497 | * is more efficient to compile the pattern once, using compile_pattern(), |
| | 3498 | * and then re-use the compiled pattern for each search, since compiling a |
| | 3499 | * pattern takes time. |
| | 3500 | */ |
| | 3501 | int CRegexSearcherSimple::compile_and_search( |
| | 3502 | const char *patstr, size_t patlen, |
| | 3503 | const char *entirestr, const char *searchstr, size_t searchlen, |
| | 3504 | int *result_len) |
| | 3505 | { |
| | 3506 | re_compiled_pattern_base pat; |
| | 3507 | |
| | 3508 | /* no groups yet */ |
| | 3509 | group_cnt_ = 0; |
| | 3510 | |
| | 3511 | /* clear the group registers */ |
| | 3512 | clear_group_regs(regs_); |
| | 3513 | |
| | 3514 | /* compile the expression - return failure if we get an error */ |
| | 3515 | if (parser_->compile(patstr, patlen, &pat) != RE_STATUS_SUCCESS) |
| | 3516 | return -1; |
| | 3517 | |
| | 3518 | /* remember the group count from the compiled pattern */ |
| | 3519 | group_cnt_ = pat.group_cnt; |
| | 3520 | |
| | 3521 | /* |
| | 3522 | * search for the pattern in our copy of the string - use the copy so |
| | 3523 | * that the group registers stay valid even if the caller deallocates |
| | 3524 | * the original string after we return |
| | 3525 | */ |
| | 3526 | return search(entirestr, searchstr, searchlen, |
| | 3527 | &pat, parser_->tuple_arr_, &pat.machine, |
| | 3528 | regs_, result_len); |
| | 3529 | } |