cfad47cfa3/tads3/vmregex.cpp

4b825dc642cb6eb9a060e54bf8d69288fbee4904cfad47cfa334b206c65f22086bcc5d63e6f70944
1
/* 
2
 *   Copyright (c) 1998, 2002 Michael J. Roberts.  All Rights Reserved.
3
 *   
4
 *   Please see the accompanying license file, LICENSE.TXT, for information
5
 *   on using and copying this software.  
6
 */
7
/*
8
Name
9
  regex.c - Regular Expression Parser and Recognizer for T3
10
Function
11
  Parses and recognizes regular expressions.  Adapted to C++ and UTF-8
12
  from the TADS 2 implementation.
13
Notes
14
  Regular expression syntax:
15
16
     abc|def    either abc or def
17
     (abc)      abc
18
     abc+       abc, abcc, abccc, ...
19
     abc*       ab, abc, abcc, ...
20
     abc?       ab or abc
21
     x{n}       x exactly 'n' times
22
     x{n,}      x at least 'n' times
23
     x{,n}      x 0 to 'n' times
24
     x{n,m}     x from 'n' to 'm' times
25
     .          any single character
26
     abc$       abc at the end of the line
27
     ^abc       abc at the beginning of the line
28
     %^abc      literally ^abc
29
     [abcx-z]   matches a, b, c, x, y, or z
30
     [^abcx-z]  matches any character except a, b, c, x, y, or z
31
     [^]-q]     matches any character except ], -, or q
32
     x+?        use *shortest* working match to x+
33
     x*?        use shortest match to x*
34
     x{}?       use shortest match to x{}
35
36
  Note that using ']' or '-' in a character range expression requires
37
  special ordering.  If ']' is to be used, it must be the first character
38
  after the '^', if present, within the brackets.  If '-' is to be used,
39
  it must be the first character after the '^' and/or ']', if present.
40
41
  The following special sequences match literal characters by name (the
42
  names insensitive to case):
43
44
     <lparen>   (
45
     <rparen>   )
46
     <lsquare>  [
47
     <rsquare>  ]
48
     <lbrace>   {
49
     <rbrace>   }
50
     <langle>   <
51
     <rangle>   >
52
     <vbar>     |
53
     <caret>    ^
54
     <period>   .
55
     <dot>      .
56
     <squote>   '
57
     <dquote>   "
58
     <star>     *
59
     <plus>     +
60
     <percent>  %
61
     <question> ?
62
     <dollar>   $
63
     <backslash> \
64
     <return>   carriage return (character code 0x000D)
65
     <linefeed> line feed (character code 0x000A)
66
     <tab>      tab (character code 0x0009)
67
     <nul>      null character (character code 0x0000)
68
     <null>     null character (character code 0x0000)
69
70
  The following special sequences match character classes (these class
71
  names, like the character literal names above, are insensitive to case):
72
73
     <Alpha>    any alphabetic character
74
     <Upper>    any upper-case alphabetic character
75
     <Lower>    any lower-case alphabetic character
76
     <Digit>    any digit
77
     <AlphaNum> any alphabetic character or digit
78
     <Space>    space character
79
     <Punct>    punctuation character
80
     <Newline>  any newline character: \n\r, \r\n, \r, \n, 0x2028
81
82
  Character classes can be combined by separating class names with the '|'
83
  delimiter.  In addition, a class expression can be complemented to make
84
  it an exclusion expression by putting a '^' after the opening bracket.
85
86
     <Upper|Digit>  any uppercase letter or any digit
87
     <^Alpha>       anything except an alphabetic character
88
     <^Upper|Digit> anything except an uppercase letter or a digit
89
                    (note that the exclusion applies to the ENTIRE
90
                    expression, so in this case both uppercase letters
91
                    and digits are excluded)
92
93
  In addition, character classes and named literals can be combined:
94
95
     <Upper|Star|Plus>  Any uppercase letter, or '*' or '+'
96
97
  Finally, literal characters and literal ranges resembling '[]' ranges
98
  can be used in angle-bracket expressions, with the limitation that each
99
  character or range must be separated with the '|' delimiter:
100
101
     <a|b|c>        'a' or 'b' or 'c'
102
     <Upper|a|b|c>  any uppercase letter, or 'a', 'b', or 'c'
103
     <Upper|a-m>    any uppercase letter, or lowercase letters 'a' to 'm'
104
105
  '%' is used to escape the special characters: | . ( ) * ? + ^ $ % [
106
  (We use '%' rather than a backslash because it's less trouble to
107
  enter in a TADS string -- a backslash needs to be quoted with another
108
  backslash, which is error-prone and hard to read.  '%' doesn't need
109
  any special quoting in a TADS string, which makes it a lot more
110
  readable.)
111
112
  In addition, '%' is used to introduce additional special sequences:
113
114
     %1         text matching first parenthesized expression
115
     %9         text matching ninth parenthesized experssion
116
     %<         matches at the beginning of a word only
117
     %>         matches at the end of a word only
118
     %w         matches any word character
119
     %W         matches any non-word character
120
     %b         matches at any word boundary (beginning or end of word)
121
     %B         matches except at a word boundary
122
123
  For the word matching sequences, a word is any sequence of letters and
124
  numbers.
125
126
  The following special sequences modify the search rules:
127
128
     <Case> - makes the search case-sensitive.  This is the default.
129
     <NoCase> - makes the search case-insensitive.  <Case> and <NoCase>
130
              are both global - the entire expression is either
131
              case-sensitive or case-insensitive.
132
133
     <FE> or <FirstEnd> - select the substring that ends earliest, in
134
                case of an ambiguous match.  <FirstBegin> is the default.
135
     <FB> or <FirstBegin> - select the substring that starts earliest, in
136
                case of an ambiguous match.  This is the default.
137
138
     <Min> - select the shortest matching substring, in case of an
139
                ambiguous match.  <Max> is the default.
140
     <Max> - select the longest matching substring, in case of an
141
                ambiguous match.
142
143
Modified
144
  09/06/98 MJRoberts  - Creation
145
*/
146
147
148
#include <ctype.h>
149
#include <stdio.h>
150
#include <stdlib.h>
151
#include <string.h>
152
#include <assert.h>
153
154
#include "t3std.h"
155
#include "vmregex.h"
156
#include "utf8.h"
157
#include "vmerr.h"
158
#include "vmerrnum.h"
159
#include "vmuni.h"
160
#include "vmfile.h"
161
162
/* ------------------------------------------------------------------------ */
163
/*
164
 *   Initialize.
165
 */
166
CRegexParser::CRegexParser()
167
{
168
    /* no tuple array yet */
169
    tuple_arr_ = 0;
170
    tuples_alloc_ = 0;
171
172
    /* clear states */
173
    next_state_ = RE_STATE_FIRST_VALID;
174
175
    /* no range buffer yet */
176
    range_buf_ = 0;
177
    range_buf_cnt_ = 0;
178
    range_buf_max_ = 0;
179
180
    /* by default, our expressions are case sensitive by default */
181
    default_case_sensitive_ = TRUE;
182
}
183
184
/* ------------------------------------------------------------------------ */
185
/*
186
 *   Reset compiler - clears states and tuples 
187
 */
188
void CRegexParser::reset()
189
{
190
    int i;
191
    re_tuple *t;
192
    
193
    /* delete any range tables we've allocated */
194
    for (i = 0, t = tuple_arr_ ; i < next_state_ ; ++i, ++t)
195
    {
196
        /* if it's a range, delete the allocated range */
197
        if ((t->typ == RE_RANGE || t->typ == RE_RANGE_EXCL)
198
            && t->info.range.char_range != 0)
199
        {
200
            t3free(t->info.range.char_range);
201
            t->info.range.char_range = 0;
202
        }
203
204
        /* clear the tuple type */
205
        t->typ = RE_INVALID;
206
    }
207
208
    /* clear states */
209
    next_state_ = RE_STATE_FIRST_VALID;
210
}
211
212
/* ------------------------------------------------------------------------ */
213
/*
214
 *   Delete
215
 */
216
CRegexParser::~CRegexParser()
217
{
218
    /* reset state (to delete most of our memory structures) */
219
    reset();
220
    
221
    /* if we've allocated an array, delete it */
222
    if (tuple_arr_ != 0)
223
    {
224
        t3free(tuple_arr_);
225
        tuple_arr_ = 0;
226
    }
227
228
    /* delete our range buffer if we have one */
229
    if (range_buf_ != 0)
230
    {
231
        t3free(range_buf_);
232
        range_buf_ = 0;
233
    }
234
}
235
236
/* ------------------------------------------------------------------------ */
237
/*
238
 *   Add an entry to our range buffer 
239
 */
240
void CRegexParser::add_range_char(wchar_t ch1, wchar_t ch2)
241
{
242
    /* 
243
     *   if the first character in the range is null, it requires special
244
     *   representation 
245
     */
246
    if (ch1 == 0)
247
    {
248
        /* 
249
         *   A null character must be represented using the character class
250
         *   notation for RE_NULLCHAR, because a null lead character in a
251
         *   pair has the special meaning of introducing a class specifier
252
         *   in the second character of the pair.  So, first add the null
253
         *   character as a class character.
254
         */
255
        add_range_class(RE_NULLCHAR);
256
257
        /* 
258
         *   if the second character of the range is null, we're done, since
259
         *   all we needed was the null itself 
260
         */
261
        if (ch2 == 0)
262
            return;
263
264
        /* 
265
         *   we've added the null character, so simply add the rest of the
266
         *   range starting at character code 1 
267
         */
268
        ch1 = 1;
269
    }
270
271
    /* make sure we have space in the range buffer for the added entry */
272
    ensure_range_buf_space();
273
274
    /* add the new entry */
275
    range_buf_[range_buf_cnt_++] = ch1;
276
    range_buf_[range_buf_cnt_++] = ch2;
277
}
278
279
/*
280
 *   Add a class to a character range.  The class identifier is one of the
281
 *   RE_xxx recognizer types representing a class: RE_ALPHA, RE_DIGIT, and
282
 *   so on.  
283
 */
284
void CRegexParser::add_range_class(re_recog_type cl)
285
{
286
    /* make sure we have space in the range buffer */
287
    ensure_range_buf_space();
288
289
    /* 
290
     *   add the new entry - the first character of the pair is a null
291
     *   character to indicate that the entry represents a class, and the
292
     *   second of the pair is the type code 
293
     */
294
    range_buf_[range_buf_cnt_++] = 0;
295
    range_buf_[range_buf_cnt_++] = (wchar_t)cl;
296
}
297
298
/*
299
 *   Check for space in the range buffer for a new range, expanding the
300
 *   buffer if necessary.
301
 */
302
void CRegexParser::ensure_range_buf_space()
303
{
304
    /* make sure we have room for another entry */
305
    if (range_buf_ == 0)
306
    {
307
        /* 
308
         *   allocate an initial size - it must be even, since we always
309
         *   consume elements in pairs 
310
         */
311
        range_buf_max_ = 128;
312
        range_buf_ = (wchar_t *)t3malloc(range_buf_max_ * sizeof(wchar_t));
313
314
        /* no entries yet */
315
        range_buf_cnt_ = 0;
316
    }
317
    else if (range_buf_cnt_ == range_buf_max_)
318
    {
319
        /* 
320
         *   reallocate the buffer at a larger size (the size must always be
321
         *   even, since we always add to the buffer in pairs) 
322
         */
323
        range_buf_max_ += 128;
324
        range_buf_ = (wchar_t *)t3realloc(range_buf_,
325
                                          range_buf_max_ * sizeof(wchar_t));
326
    }
327
328
    /* if the range buffer is null, throw an error */
329
    if (range_buf_ == 0)
330
        err_throw(VMERR_OUT_OF_MEMORY);
331
}
332
333
334
/* ------------------------------------------------------------------------ */
335
/*
336
 *   Allocate a new state ID 
337
 */
338
re_state_id CRegexParser::alloc_state()
339
{
340
    /*
341
     *   If we don't have enough room for another state, expand the array 
342
     */
343
    if (next_state_ >= tuples_alloc_)
344
    {
345
        int new_alloc;
346
        
347
        /* bump the size by a bit */
348
        new_alloc = tuples_alloc_ + 100;
349
        
350
        /* allocate or expand the array */
351
        if (tuples_alloc_ == 0)
352
        {
353
            /* allocate the initial memory block */
354
            tuple_arr_ = (re_tuple *)t3malloc(new_alloc * sizeof(re_tuple));
355
        }
356
        else
357
        {
358
            /* re-allocate the existing memory block */
359
            tuple_arr_ = (re_tuple *)t3realloc(tuple_arr_,
360
                                               new_alloc * sizeof(re_tuple));
361
        }
362
363
        /* remember the new allocation size */
364
        tuples_alloc_ = new_alloc;
365
    }
366
367
    /* initialize the next state */
368
    tuple_arr_[next_state_].next_state_1 = RE_STATE_INVALID;
369
    tuple_arr_[next_state_].next_state_2 = RE_STATE_INVALID;
370
    tuple_arr_[next_state_].info.ch = RE_EPSILON;
371
    tuple_arr_[next_state_].flags = 0;
372
    tuple_arr_[next_state_].typ = RE_INVALID;
373
374
    /* return the new state's ID */
375
    return next_state_++;
376
}
377
378
379
/* ------------------------------------------------------------------------ */
380
/*
381
 *   Set a transition from a state to a given destination state.  
382
 */
383
void CRegexParser::set_trans(re_state_id id, re_state_id dest_id,
384
                             re_recog_type typ, wchar_t ch)
385
{
386
    re_tuple *tuple;
387
388
    /* ignore invalid states */
389
    if (id == RE_STATE_INVALID)
390
        return;
391
    
392
    /* 
393
     *   get the tuple containing the transitions for this state ID - the
394
     *   state ID is the index of the state's transition tuple in the
395
     *   array 
396
     */
397
    tuple = &tuple_arr_[id];
398
399
    /*
400
     *   If the first state pointer hasn't been set yet, set it to the new
401
     *   destination.  Otherwise, set the second state pointer.
402
     *   
403
     *   Only set the character on setting the first state.  When setting
404
     *   the second state, we must assume that the character for the state
405
     *   has already been set, since any given state can have only one
406
     *   character setting.  
407
     */
408
    if (tuple->next_state_1 == RE_STATE_INVALID)
409
    {
410
        /* set the transition type and character ID */
411
        tuple->typ = typ;
412
        tuple->info.ch = ch;
413
414
        /* set the first transition */
415
        tuple->next_state_1 = dest_id;
416
    }
417
    else
418
    {
419
        /* 
420
         *   set only the second transition state - don't set the character
421
         *   ID or transition type 
422
         */
423
        tuple->next_state_2 = dest_id;
424
    }
425
}
426
427
/* ------------------------------------------------------------------------ */
428
/*
429
 *   Initialize a new machine, giving it an initial and final state 
430
 */
431
void CRegexParser::init_machine(re_machine *machine)
432
{
433
    machine->init = alloc_state();
434
    machine->final = alloc_state();
435
}
436
437
/*
438
 *   Build a character recognizer
439
 */
440
void CRegexParser::build_char(re_machine *machine, wchar_t ch)
441
{
442
    /* initialize our new machine */
443
    init_machine(machine);
444
445
    /* allocate a transition tuple for the new state */
446
    set_trans(machine->init, machine->final, RE_LITERAL, ch);
447
}
448
449
/*
450
 *   Build a special type recognizer 
451
 */
452
void CRegexParser::build_special(re_machine *machine, re_recog_type typ,
453
                                 wchar_t ch)
454
{
455
    /* initialize our new machine */
456
    init_machine(machine);
457
458
    /* allocate a transition tuple for the new state */
459
    set_trans(machine->init, machine->final, typ, ch);
460
}
461
462
/*
463
 *   Build a character range recognizer.
464
 */
465
void CRegexParser::build_char_range(re_machine *machine,
466
                                    int exclusion)
467
{
468
    wchar_t *range_copy;
469
    
470
    /* initialize our new machine */
471
    init_machine(machine);
472
473
    /* allocate a transition table for the new state */
474
    set_trans(machine->init, machine->final,
475
              (exclusion ? RE_RANGE_EXCL : RE_RANGE), 0);
476
477
    /* allocate a copy of the range vector */
478
    range_copy = (wchar_t *)t3malloc(range_buf_cnt_ * sizeof(wchar_t));
479
480
    /* copy the caller's range */
481
    memcpy(range_copy, range_buf_, range_buf_cnt_ * sizeof(wchar_t));
482
483
    /* store it in the tuple */
484
    tuple_arr_[machine->init].info.range.char_range = range_copy;
485
    tuple_arr_[machine->init].info.range.char_range_cnt = range_buf_cnt_;
486
}
487
488
489
/*
490
 *   Build a group recognizer.  This is almost the same as a character
491
 *   recognizer, but matches a previous group rather than a literal
492
 *   character. 
493
 */
494
void CRegexParser::build_group_matcher(re_machine *machine, int group_num)
495
{
496
    /* initialize our new machine */
497
    init_machine(machine);
498
499
    /* 
500
     *   Allocate a transition tuple for the new state for the group ID.
501
     *   Rather than storing a literal character code, store the group ID
502
     *   in the character code slot.  
503
     */
504
    set_trans(machine->init, machine->final,
505
              RE_GROUP_MATCH, (wchar_t)group_num);
506
}
507
508
509
/*
510
 *   Build a concatenation recognizer 
511
 */
512
void CRegexParser::build_concat(re_machine *new_machine,
513
                                   re_machine *lhs, re_machine *rhs)
514
{
515
    /* initialize the new machine */
516
    init_machine(new_machine);
517
518
    /* 
519
     *   set up an epsilon transition from the new machine's initial state
520
     *   to the first submachine's initial state 
521
     */
522
    set_trans(new_machine->init, lhs->init, RE_EPSILON, 0);
523
524
    /*
525
     *   Set up an epsilon transition from the first submachine's final
526
     *   state to the second submachine's initial state 
527
     */
528
    set_trans(lhs->final, rhs->init, RE_EPSILON, 0);
529
530
    /*
531
     *   Set up an epsilon transition from the second submachine's final
532
     *   state to our new machine's final state 
533
     */
534
    set_trans(rhs->final, new_machine->final, RE_EPSILON, 0);
535
}
536
537
/*
538
 *   Build a group machine.  sub_machine contains the machine that
539
 *   expresses the group's contents; we'll fill in new_machine with a
540
 *   newly-created machine that encloses and marks the group.  
541
 */
542
void CRegexParser::build_group(re_machine *new_machine,
543
                               re_machine *sub_machine, int group_id)
544
{
545
    /* initialize the container machine */
546
    init_machine(new_machine);
547
548
    /* 
549
     *   Set up a group-entry transition from the new machine's initial
550
     *   state into the initial state of the group, and a group-exit
551
     *   transition from the group's final state into the container's final
552
     *   state.  For both transitions, store the group ID in the character
553
     *   field of the transition, to identify which group is affected.  
554
     */
555
    set_trans(new_machine->init, sub_machine->init,
556
              RE_GROUP_ENTER, (wchar_t)group_id);
557
    set_trans(sub_machine->final, new_machine->final,
558
              RE_GROUP_EXIT, (wchar_t)group_id);
559
}
560
561
/*
562
 *   Build an assertion machine 
563
 */
564
void CRegexParser::build_assert(re_machine *new_machine,
565
                                re_machine *sub_machine, int is_negative)
566
{
567
    /* initialize the container machine */
568
    init_machine(new_machine);
569
570
    /* allocate a transition tuple for the new state */
571
    set_trans(new_machine->init, new_machine->final,
572
              is_negative ? RE_ASSERT_NEG : RE_ASSERT_POS, 0);
573
574
    /* set the sub-state */
575
    tuple_arr_[new_machine->init].info.sub.init = sub_machine->init;
576
    tuple_arr_[new_machine->init].info.sub.final = sub_machine->final;
577
}
578
579
580
/*
581
 *   Build an alternation recognizer 
582
 */
583
void CRegexParser::build_alter(re_machine *new_machine,
584
                               re_machine *lhs, re_machine *rhs)
585
{
586
    /* initialize the new machine */
587
    init_machine(new_machine);
588
589
    /*
590
     *   Set up an epsilon transition from our new machine's initial state
591
     *   to the initial state of each submachine 
592
     */
593
    set_trans(new_machine->init, lhs->init, RE_EPSILON, 0);
594
    set_trans(new_machine->init, rhs->init, RE_EPSILON, 0);
595
596
    /*
597
     *   Set up an epsilon transition from the final state of each
598
     *   submachine to our final state 
599
     */
600
    set_trans(lhs->final, new_machine->final, RE_EPSILON, 0);
601
    set_trans(rhs->final, new_machine->final, RE_EPSILON, 0);
602
}
603
604
/*
605
 *   Build a closure recognizer
606
 */
607
void CRegexParser::build_closure(re_machine *new_machine, re_machine *sub,
608
                                 wchar_t specifier, int shortest)
609
{
610
    /* initialize the new machine */
611
    init_machine(new_machine);
612
613
    /* 
614
     *   Set up an epsilon transition from our initial state to the
615
     *   submachine's initial state.  However, if we're in shortest mode,
616
     *   wait to do this until after we've generated the rest of the
617
     *   machine, and instead set up the transition from the submachine's
618
     *   final state to our final state.  The order is important because
619
     *   we will favor the first branch taken when we find two matches of
620
     *   equal total length; hence we want to make the branch that will
621
     *   give us either the longest match or shortest match for this
622
     *   closure first, depending on which way we want to go.  
623
     */
624
    if (!shortest)
625
        set_trans(new_machine->init, sub->init, RE_EPSILON, 0);
626
    else
627
        set_trans(sub->final, new_machine->final, RE_EPSILON, 0);
628
629
    /*
630
     *   If this is an unbounded closure ('*' or '+', but not '?'), set up
631
     *   the loop transition that takes us from the new machine's final
632
     *   state back to its initial state.  We don't do this on the
633
     *   zero-or-one closure, because we can only match the expression
634
     *   once.  
635
     */
636
    if (specifier != '?')
637
    {
638
        /* set the transition */
639
        set_trans(sub->final, sub->init, RE_EPSILON, 0);
640
641
        /* if we have a 'shortest' modifier, flag it in this branch */
642
        if (shortest)
643
            tuple_arr_[sub->final].flags |= RE_STATE_SHORTEST;
644
    }
645
646
    /*
647
     *   If this is a zero-or-one closure or a zero-or-more closure, set
648
     *   up an epsilon transition from our initial state to our final
649
     *   state, since we can skip the entire subexpression.  We don't do
650
     *   this on the one-or-more closure, because we can't skip the
651
     *   subexpression in this case.  
652
     */
653
    if (specifier != '+')
654
    {
655
        /* set the transition */
656
        set_trans(new_machine->init, new_machine->final, RE_EPSILON, 0);
657
658
        /* if we have a 'shortest' modifier, flag it in this branch */
659
        if (shortest)
660
            tuple_arr_[new_machine->init].flags |= RE_STATE_SHORTEST;
661
    }
662
663
    /* 
664
     *   Set up a transition from the submachine's final state to our
665
     *   final state.  We waited until here to ensure proper ordering for
666
     *   longest-preferred.  If we're in shortest-preferred mode, though,
667
     *   set up the initial transition to the submachine instead.  
668
     */
669
    if (!shortest)
670
        set_trans(sub->final, new_machine->final, RE_EPSILON, 0);
671
    else
672
        set_trans(new_machine->init, sub->init, RE_EPSILON, 0);
673
}
674
675
void CRegexParser::build_interval(re_machine *new_machine,
676
                                  re_machine *sub, int min_val, int max_val,
677
                                  int var_id, int shortest)
678
{
679
    re_machine inner_machine;
680
    
681
    /* initialize the outer (new) machine */
682
    init_machine(new_machine);
683
684
    /* initialize the inner machine */
685
    init_machine(&inner_machine);
686
687
    /* 
688
     *   Set the loop transition into the submachine, and set the other to
689
     *   bypass the submachine.  If we have a 'shortest' modifier, take
690
     *   the bypass branch first, otherwise take the enter branch first. 
691
     */
692
    if (shortest)
693
    {
694
        set_trans(inner_machine.init, new_machine->final, RE_LOOP_BRANCH, 0);
695
        set_trans(inner_machine.init, sub->init, RE_LOOP_BRANCH, 0);
696
    }
697
    else
698
    {
699
        set_trans(inner_machine.init, sub->init, RE_LOOP_BRANCH, 0);
700
        set_trans(inner_machine.init, new_machine->final, RE_LOOP_BRANCH, 0);
701
    }
702
703
    /* 
704
     *   set the final transition of the submachine to come back to the
705
     *   loop branch point 
706
     */
707
    set_trans(sub->final, inner_machine.init, RE_EPSILON, 0);
708
709
    /* 
710
     *   set the outer machine to transition into the inner machine and
711
     *   zero the loop variable 
712
     */
713
    set_trans(new_machine->init, inner_machine.init, RE_ZERO_VAR, 0);
714
715
    /* set the variable ID in the ZERO_VAR node */
716
    tuple_arr_[new_machine->init].info.loop.loop_var = var_id;
717
718
    /* set up the loop parameters in the loop node */
719
    tuple_arr_[inner_machine.init].info.loop.loop_var = var_id;
720
    tuple_arr_[inner_machine.init].info.loop.loop_min = min_val;
721
    tuple_arr_[inner_machine.init].info.loop.loop_max = max_val;
722
723
    /* 
724
     *   if there's a 'shortest' modifier, note it in the loop node, so
725
     *   that we can take the bypass branch first whenever possible 
726
     */
727
    if (shortest)
728
        tuple_arr_[inner_machine.init].flags |= RE_STATE_SHORTEST;
729
}
730
731
/*
732
 *   Build a null machine 
733
 */
734
void CRegexParser::build_null_machine(re_machine *machine)
735
{
736
    machine->init = machine->final = RE_STATE_INVALID;
737
}
738
739
/* ------------------------------------------------------------------------ */
740
/*
741
 *   Determine if a machine is null 
742
 */
743
int CRegexParser::is_machine_null(re_machine *machine)
744
{
745
    return (machine->init == RE_STATE_INVALID);
746
}
747
748
749
/* ------------------------------------------------------------------------ */
750
/*
751
 *   Concatenate the second machine onto the first machine, replacing the
752
 *   first machine with the resulting machine.  If the first machine is a
753
 *   null machine (created with re_build_null_machine), we'll simply copy
754
 *   the second machine into the first. 
755
 */
756
void CRegexParser::concat_onto(re_machine *dest, re_machine *rhs)
757
{
758
    /* check for a null destination machine */
759
    if (is_machine_null(dest))
760
    {
761
        /* 
762
         *   the first machine is null - simply copy the second machine
763
         *   onto the first unchanged 
764
         */
765
        *dest = *rhs;
766
    }
767
    else
768
    {
769
        re_machine new_machine;
770
        
771
        /* build the concatenated machine */
772
        build_concat(&new_machine, dest, rhs);
773
774
        /* copy the concatenated machine onto the first machine */
775
        *dest = new_machine;
776
    }
777
}
778
779
/*
780
 *   Alternate the second machine onto the first machine, replacing the
781
 *   first machine with the resulting machine.  If the first machine is a
782
 *   null machine, this simply replaces the first machine with the second
783
 *   machine.  If the second machine is null, this simply leaves the first
784
 *   machine unchanged. 
785
 */
786
void CRegexParser::alternate_onto(re_machine *dest, re_machine *rhs)
787
{
788
    /* check to see if the first machine is null */
789
    if (is_machine_null(dest))
790
    {
791
        /* 
792
         *   the first machine is null - simply copy the second machine
793
         *   onto the first 
794
         */
795
        *dest = *rhs;
796
    }
797
    else
798
    {
799
        /* 
800
         *   if the second machine is null, don't do anything; otherwise,
801
         *   build the alternation 
802
         */
803
        if (!is_machine_null(rhs))
804
        {
805
            re_machine new_machine;
806
            
807
            /* build the alternation */
808
            build_alter(&new_machine, dest, rhs);
809
810
            /* replace the first machine with the alternation */
811
            *dest = new_machine;
812
        }
813
    }
814
}
815
816
/* ------------------------------------------------------------------------ */
817
/*
818
 *   Compile an expression 
819
 */
820
re_status_t CRegexParser::compile(const char *expr_str, size_t exprlen,
821
                                  re_compiled_pattern_base *pat)
822
{
823
    re_machine cur_machine;
824
    re_machine alter_machine;
825
    re_machine new_machine;
826
    int group_stack_level;
827
    struct
828
    {
829
        re_machine old_cur;
830
        re_machine old_alter;
831
        int group_id;
832
        int capturing;
833
        int neg_assertion;
834
        int pos_assertion;
835
    } group_stack[RE_GROUP_NESTING_MAX];
836
    utf8_ptr expr;
837
838
    /* reset everything */
839
    reset();
840
841
    /* we have no groups yet */
842
    pat->group_cnt = 0;
843
844
    /* we have no looping variables yet */
845
    pat->loop_var_cnt = 0;
846
847
    /* 
848
     *   set the default match modes - maximum, first-beginning,
849
     *   case-sensitive 
850
     */
851
    pat->longest_match = TRUE;
852
    pat->first_begin = TRUE;
853
    pat->case_sensitive = default_case_sensitive_;
854
855
    /* start out with no current machine and no alternate machine */
856
    build_null_machine(&cur_machine);
857
    build_null_machine(&alter_machine);
858
859
    /* nothing on the stack yet */
860
    group_stack_level = 0;
861
862
    /* loop until we run out of expression to parse */
863
    for (expr.set((char *)expr_str) ; exprlen != 0 ; expr.inc(&exprlen))
864
    {
865
        switch(expr.getch())
866
        {
867
        case '^':
868
            /*
869
             *   beginning of line - if we're not at the beginning of the
870
             *   current expression (i.e., we already have some
871
             *   concatentations accumulated), treat it as an ordinary
872
             *   character 
873
             */
874
            if (!is_machine_null(&cur_machine))
875
                goto normal_char;
876
877
            /* build a new start-of-text recognizer */
878
            build_special(&new_machine, RE_TEXT_BEGIN, 0);
879
880
            /* 
881
             *   concatenate it onto the string - note that this can't
882
             *   have any postfix operators 
883
             */
884
            concat_onto(&cur_machine, &new_machine);
885
            break;
886
887
        case '$':
888
            /*
889
             *   End of line specifier - if there's anything left after
890
             *   the '$' other than a close parens or alternation
891
             *   specifier, treat it as a normal character 
892
             */
893
            if (exprlen > 1
894
                && (expr.getch_at(1) != ')' && expr.getch_at(1) != '|'))
895
                goto normal_char;
896
897
            /* build a new end-of-text recognizer */
898
            build_special(&new_machine, RE_TEXT_END, 0);
899
900
            /* 
901
             *   concatenate it onto the string - note that this can't
902
             *   have any postfix operators 
903
             */
904
            concat_onto(&cur_machine, &new_machine);
905
            break;
906
            
907
        case '(':
908
            {
909
                int capturing;
910
                int pos_assertion;
911
                int neg_assertion;
912
913
                /* presume it's a capturing group */
914
                capturing = TRUE;
915
916
                /* presume it's not an assertion */
917
                pos_assertion = FALSE;
918
                neg_assertion = FALSE;
919
                
920
                /* 
921
                 *   Add a nesting level.  Push the current machine and
922
                 *   alternate machines onto the group stack, and clear
923
                 *   everything out for the new group.  
924
                 */
925
                if (group_stack_level > RE_GROUP_NESTING_MAX)
926
                {
927
                    /* we cannot proceed - return an error */
928
                    return RE_STATUS_GROUP_NESTING_TOO_DEEP;
929
                }
930
                
931
                /* save the current state on the stack */
932
                group_stack[group_stack_level].old_cur = cur_machine;
933
                group_stack[group_stack_level].old_alter = alter_machine;
934
                
935
                /* 
936
                 *   Assign the group a group ID - groups are numbered in
937
                 *   order of their opening (left) parentheses, so we want
938
                 *   to assign a group number now.  We won't actually need
939
                 *   to know the group number until we get to the matching
940
                 *   close paren, but we need to assign it now, so store
941
                 *   it in the group stack.  
942
                 */
943
                group_stack[group_stack_level].group_id = pat->group_cnt;
944
                
945
                /* check for special group flags */
946
                if (exprlen > 2 && expr.getch_at(1) == '?')
947
                {
948
                    switch(expr.getch_at(2))
949
                    {
950
                    case ':':
951
                        /* it's a non-capturing group */
952
                        capturing = FALSE;
953
954
                        /* skip two extra characters for the '?:' part */
955
                        expr.inc(&exprlen);
956
                        expr.inc(&exprlen);
957
                        break;
958
                        
959
                    case '=':
960
                        /* it's a positive assertion group */
961
                        pos_assertion = TRUE;
962
963
                        /* assertions don't capture */
964
                        capturing = FALSE;
965
966
                        /* skip two extra characters for the '?=' part */
967
                        expr.inc(&exprlen);
968
                        expr.inc(&exprlen);
969
                        break;
970
                        
971
                    case '!':
972
                        /* it's a negative assertion group */
973
                        neg_assertion = TRUE;
974
975
                        /* assertions don't capture */
976
                        capturing = FALSE;
977
978
                        /* skip two extra characters for the '?!' part */
979
                        expr.inc(&exprlen);
980
                        expr.inc(&exprlen);
981
                        break;
982
                        
983
                    default:
984
                        /* it's not a recognized sequence - ignore it */
985
                        break;
986
                    }
987
                }
988
989
                /* remember if the group is capturing */
990
                group_stack[group_stack_level].capturing = capturing;
991
992
                /* remember if it's an assertion of some kind */
993
                group_stack[group_stack_level].pos_assertion = pos_assertion;
994
                group_stack[group_stack_level].neg_assertion = neg_assertion;
995
                
996
                /* consume the group number if it's a capturing group */
997
                if (capturing)
998
                    ++(pat->group_cnt);
999
                
1000
                /* push the level */
1001
                ++group_stack_level;
1002
                
1003
                /* start the new group with empty machines */
1004
                build_null_machine(&cur_machine);
1005
                build_null_machine(&alter_machine);
1006
            }
1007
            break;
1008
1009
        case ')':
1010
        do_close_paren:
1011
            /* if there's nothing on the stack, ignore this */
1012
            if (group_stack_level == 0)
1013
                break;
1014
1015
            /* take a level off the stack */
1016
            --group_stack_level;
1017
1018
            /* 
1019
             *   Remove a nesting level.  If we have a pending alternate
1020
             *   expression, build the alternation expression.  This will
1021
             *   leave the entire group expression in alter_machine,
1022
             *   regardless of whether an alternation was in progress or
1023
             *   not.  
1024
             */
1025
            alternate_onto(&alter_machine, &cur_machine);
1026
1027
            /*
1028
             *   Create a group machine that encloses the group and marks
1029
             *   it with a group number.  We assigned the group number
1030
             *   when we parsed the open paren, so read that group number
1031
             *   from the stack.
1032
             *   
1033
             *   Note that this will leave 'new_machine' with the entire
1034
             *   group machine.
1035
             *   
1036
             *   If this is a non-capturing group, don't bother building
1037
             *   the new machine - just copy the current alternation
1038
             *   machine onto the new machine.  
1039
             */
1040
            if (group_stack[group_stack_level].capturing)
1041
            {
1042
                /* it's a regular capturing group - add the group machine */
1043
                build_group(&new_machine, &alter_machine,
1044
                            group_stack[group_stack_level].group_id);
1045
            }
1046
            else if (group_stack[group_stack_level].pos_assertion
1047
                     || group_stack[group_stack_level].neg_assertion)
1048
            {
1049
                /* it's an assertion - build the assertion group */
1050
                build_assert(&new_machine, &alter_machine,
1051
                             group_stack[group_stack_level].neg_assertion);
1052
            }
1053
            else
1054
            {
1055
                /* it's a non-capturing group - just add the group tree */
1056
                new_machine = alter_machine;
1057
            }
1058
1059
            /*
1060
             *   Pop the stack - restore the alternation and current
1061
             *   machines that were in progress before the group started. 
1062
             */
1063
            cur_machine = group_stack[group_stack_level].old_cur;
1064
            alter_machine = group_stack[group_stack_level].old_alter;
1065
1066
            /* 
1067
             *   If we just did an assertion, ignore any closure postfix
1068
             *   operators.  Since an assertion doesn't consume any input
1069
             *   text, applying a closure would create an infinite loop.  
1070
             */
1071
            if (group_stack[group_stack_level].pos_assertion
1072
                || group_stack[group_stack_level].neg_assertion)
1073
                goto skip_closures;
1074
1075
            /*
1076
             *   Check the group expression (in new_machine) for postfix
1077
             *   expressions 
1078
             */
1079
            goto apply_postfix;
1080
1081
        case '|':
1082
            /* 
1083
             *   Start a new alternation.  This ends the current
1084
             *   alternation; if we have a previous pending alternate,
1085
             *   build an alternation machine out of the previous
1086
             *   alternate and the current machine and move that to the
1087
             *   alternate; otherwise, simply move the current machine to
1088
             *   the pending alternate. 
1089
             */
1090
            alternate_onto(&alter_machine, &cur_machine);
1091
1092
            /* 
1093
             *   the alternation starts out with a blank slate, so null
1094
             *   out the current machine 
1095
             */
1096
            build_null_machine(&cur_machine);
1097
            break;
1098
1099
        case '<':
1100
            /* check for our various special directives */
1101
            if ((exprlen >= 4 && memicmp(expr.getptr(), "<FE>", 4) == 0)
1102
                || (exprlen >= 10
1103
                    && memicmp(expr.getptr(), "<FirstEnd>", 10) == 0))
1104
            {
1105
                /* turn off first-begin mode */
1106
                pat->first_begin = FALSE;
1107
            }
1108
            else if ((exprlen >= 4 && memicmp(expr.getptr(), "<FB>", 4) == 0)
1109
                     || (exprlen >= 12
1110
                         && memicmp(expr.getptr(), "<FirstBegin>", 12) == 0))
1111
            {
1112
                /* turn on first-begin mode */
1113
                pat->first_begin = TRUE;
1114
            }
1115
            else if (exprlen >= 5 && memicmp(expr.getptr(), "<Max>", 5) == 0)
1116
            {
1117
                /* turn on longest-match mode */
1118
                pat->longest_match = TRUE;
1119
            }
1120
            else if (exprlen >= 5 && memicmp(expr.getptr(), "<Min>", 5) == 0)
1121
            {
1122
                /* turn off longest-match mode */
1123
                pat->longest_match = FALSE;
1124
            }
1125
            else if (exprlen >= 6 && memicmp(expr.getptr(), "<Case>", 6) == 0)
1126
            {
1127
                /* turn on case sensitivity */
1128
                pat->case_sensitive = TRUE;
1129
            }
1130
            else if (exprlen >= 8
1131
                     && memicmp(expr.getptr(), "<NoCase>", 8) == 0)
1132
            {
1133
                /* turn off case sensitivity */
1134
                pat->case_sensitive = FALSE;
1135
            }
1136
            else
1137
            {
1138
                /*
1139
                 *   It's nothing else we recognize, so it must be a
1140
                 *   character class or class range expression, which
1141
                 *   consists of one or more classes, single characters, or
1142
                 *   character ranges separated by '|' delimiters.  
1143
                 */
1144
                if (compile_char_class_expr(&expr, &exprlen, &new_machine))
1145
                {
1146
                    /* success - look for postfix operators */
1147
                    goto apply_postfix;
1148
                }
1149
                else
1150
                {
1151
                    /* 
1152
                     *   failure - treat the whole thing as ordinary
1153
                     *   characters 
1154
                     */
1155
                    goto normal_char;
1156
                }
1157
            }
1158
1159
            /* skip everything up to the closing ">" */
1160
            for ( ; exprlen > 0 && expr.getch() != '>' ; expr.inc(&exprlen)) ;
1161
            break;
1162
1163
        case '%':
1164
            /* 
1165
             *   quoted character - skip the quote mark and see what we
1166
             *   have 
1167
             */
1168
            expr.inc(&exprlen);
1169
1170
            /* check to see if we're at the end of the expression */
1171
            if (exprlen == 0)
1172
            {
1173
                /* 
1174
                 *   end of the string - ignore it, but undo the extra
1175
                 *   increment of the expression index so that we exit the
1176
                 *   enclosing loop properly 
1177
                 */
1178
                expr.dec(&exprlen);
1179
                break;
1180
            }
1181
1182
            /* see what we have */
1183
            switch(expr.getch())
1184
            {
1185
            case '1':
1186
            case '2':
1187
            case '3':
1188
            case '4':
1189
            case '5':
1190
            case '6':
1191
            case '7':
1192
            case '8':
1193
            case '9':
1194
                /* group match - build a new literal group recognizer */
1195
                build_group_matcher(&new_machine,
1196
                                    value_of_digit(expr.getch()) - 1);
1197
1198
                /* apply any postfix expression to the group recognizer */
1199
                goto apply_postfix;
1200
1201
            case '<':
1202
                /* build a beginning-of-word recognizer */
1203
                build_special(&new_machine, RE_WORD_BEGIN, 0);
1204
1205
                /* it can't be postfixed - just concatenate it */
1206
                concat_onto(&cur_machine, &new_machine);
1207
                break;
1208
1209
            case '>':
1210
                /* build an end-of-word recognizer */
1211
                build_special(&new_machine, RE_WORD_END, 0);
1212
1213
                /* it can't be postfixed - just concatenate it */
1214
                concat_onto(&cur_machine, &new_machine);
1215
                break;
1216
1217
            case 'w':
1218
                /* word character */
1219
                build_special(&new_machine, RE_WORD_CHAR, 0);
1220
                goto apply_postfix;
1221
1222
            case 'W':
1223
                /* non-word character */
1224
                build_special(&new_machine, RE_NON_WORD_CHAR, 0);
1225
                goto apply_postfix;
1226
1227
            case 'b':
1228
                /* word boundary */
1229
                build_special(&new_machine, RE_WORD_BOUNDARY, 0);
1230
1231
                /* it can't be postfixed */
1232
                concat_onto(&cur_machine, &new_machine);
1233
                break;
1234
1235
            case 'B':
1236
                /* not a word boundary */
1237
                build_special(&new_machine, RE_NON_WORD_BOUNDARY, 0);
1238
1239
                /* it can't be postfixed */
1240
                concat_onto(&cur_machine, &new_machine);
1241
                break;
1242
1243
            default:
1244
                /* build a new literal character recognizer */
1245
                build_char(&new_machine, expr.getch());
1246
1247
                /* apply any postfix expression to the character */
1248
                goto apply_postfix;
1249
            }
1250
            break;
1251
1252
        case '.':
1253
            /* 
1254
             *   wildcard character - build a single character recognizer
1255
             *   for the special wildcard symbol, then go check it for a
1256
             *   postfix operator 
1257
             */
1258
            build_special(&new_machine, RE_WILDCARD, 0);
1259
            goto apply_postfix;
1260
            break;
1261
1262
        case '[':
1263
            /* range expression */
1264
            {
1265
                int is_exclusive = FALSE;
1266
1267
                /* we have no entries yet */
1268
                range_buf_cnt_ = 0;
1269
1270
                /* first, skip the open bracket */
1271
                expr.inc(&exprlen);
1272
1273
                /* check to see if starts with the exclusion character */
1274
                if (exprlen != 0 && expr.getch() == '^')
1275
                {
1276
                    /* skip the exclusion specifier */
1277
                    expr.inc(&exprlen);
1278
1279
                    /* note it */
1280
                    is_exclusive = TRUE;
1281
                }
1282
1283
                /* 
1284
                 *   if the first character is a ']', include it in the
1285
                 *   range 
1286
                 */
1287
                if (exprlen != 0 && expr.getch() == ']')
1288
                {
1289
                    add_range_char(']');
1290
                    expr.inc(&exprlen);
1291
                }
1292
1293
                /*
1294
                 *   if the next character is a '-', include it in the
1295
                 *   range 
1296
                 */
1297
                if (exprlen != 0 && expr.getch() == '-')
1298
                {
1299
                    add_range_char('-');
1300
                    expr.inc(&exprlen);
1301
                }
1302
1303
                /* scan the character set */
1304
                while (exprlen != 0 && expr.getch() != ']')
1305
                {
1306
                    wchar_t ch;
1307
                    
1308
                    /* note this character */
1309
                    ch = expr.getch();
1310
1311
                    /* skip this character of the expression */
1312
                    expr.inc(&exprlen);
1313
1314
                    /* check for a range */
1315
                    if (exprlen != 0 && expr.getch() == '-')
1316
                    {
1317
                        wchar_t ch2;
1318
                        
1319
                        /* skip the '-' */
1320
                        expr.inc(&exprlen);
1321
                        if (exprlen != 0)
1322
                        {
1323
                            /* get the other end of the range */
1324
                            ch2 = expr.getch();
1325
1326
                            /* skip the second character */
1327
                            expr.inc(&exprlen);
1328
1329
                            /* if the range is reversed, swap it */
1330
                            if (ch > ch2)
1331
                            {
1332
                                wchar_t tmp = ch;
1333
                                ch = ch2;
1334
                                ch2 = tmp;
1335
                            }
1336
1337
                            /* add the range */
1338
                            add_range_char(ch, ch2);
1339
                        }
1340
                    }
1341
                    else
1342
                    {
1343
                        /* no range - add the one-character range */
1344
                        add_range_char(ch);
1345
                    }
1346
                }
1347
1348
                /* create a character range machine */
1349
                build_char_range(&new_machine, is_exclusive);
1350
1351
                /* apply any postfix operator */
1352
                goto apply_postfix;
1353
            }            
1354
            break;
1355
1356
        default:
1357
        normal_char:
1358
            /* 
1359
             *   it's an ordinary character - build a single character
1360
             *   recognizer machine, and then concatenate it onto any
1361
             *   existing machine 
1362
             */
1363
            build_char(&new_machine, expr.getch());
1364
1365
        apply_postfix:
1366
            /*
1367
             *   Check for a postfix operator, and apply it to the machine
1368
             *   in 'new_machine' if present.  In any case, concatenate
1369
             *   the 'new_machine' (modified by a postix operator or not)
1370
             *   to the current machien.  
1371
             */
1372
            if (exprlen > 1)
1373
            {
1374
                switch(expr.getch_at(1))
1375
                {
1376
                case '*':
1377
                case '+':
1378
                case '?':
1379
                    /*
1380
                     *   We have a postfix closure operator.  Build a new
1381
                     *   closure machine out of 'new_machine'.  
1382
                     */
1383
                    {
1384
                        re_machine closure_machine;
1385
                        int shortest;
1386
                        
1387
                        /* move onto the closure operator */
1388
                        expr.inc(&exprlen);
1389
1390
                        /* 
1391
                         *   if the next character is '?', it's a modifier
1392
                         *   that indicates that we are to use the
1393
                         *   shortest match - note it if so 
1394
                         */
1395
                        shortest = (exprlen > 1 && expr.getch_at(1) == '?');
1396
1397
                        /* build the closure machine */
1398
                        build_closure(&closure_machine,
1399
                                      &new_machine, expr.getch(), shortest);
1400
                        
1401
                        /* replace the original machine with the closure */
1402
                        new_machine = closure_machine;
1403
                        
1404
                        /* 
1405
                         *   skip any redundant closure symbols, keeping
1406
                         *   only the first one we saw 
1407
                         */
1408
                    skip_closures:
1409
                        while (exprlen > 1)
1410
                        {
1411
                            /* check for a simple closure suffix */
1412
                            if (expr.getch_at(1) == '?'
1413
                                || expr.getch_at(1) == '+'
1414
                                || expr.getch_at(1) == '*')
1415
                            {
1416
                                /* skip it and keep looping */
1417
                                expr.inc(&exprlen);
1418
                                continue;
1419
                            }
1420
1421
                            /* check for an interval */
1422
                            if (expr.getch_at(1) == '{')
1423
                            {
1424
                                /* skip until we find the matching '}' */
1425
                                while (exprlen > 1 && expr.getch_at(1) != '}')
1426
                                    expr.inc(&exprlen);
1427
1428
                                /* go back for anything that follows */
1429
                                continue;
1430
                            }
1431
1432
                            /* if it's anything else, we're done discarding */
1433
                            break;
1434
                        }
1435
                    }
1436
                    break;
1437
1438
                case '{':
1439
                    /* interval specifier */
1440
                    {
1441
                        int min_val;
1442
                        int max_val;
1443
                        re_machine interval_machine;
1444
                        int shortest;
1445
                        int var_id;
1446
1447
                        /* 
1448
                         *   loops can never overlap, but can be nested;
1449
                         *   so the only thing we have to worry about in
1450
                         *   assigning a loop variable is the group
1451
                         *   nesting depth 
1452
                         */
1453
                        var_id = group_stack_level;
1454
1455
                        /* note the highest variable ID we've seen */
1456
                        if (var_id >= pat->loop_var_cnt)
1457
                            pat->loop_var_cnt = var_id + 1;
1458
1459
                        /* presume neither min nor max will be specified */
1460
                        min_val = -1;
1461
                        max_val = -1;
1462
                        
1463
                        /* skip the current character and the '{' */
1464
                        expr.inc(&exprlen);
1465
                        expr.inc(&exprlen);
1466
                        
1467
                        /* parse the minimum count, if provided */
1468
                        min_val = parse_int(&expr, &exprlen);
1469
1470
                        /* if there's a comma, parse the maximum value */
1471
                        if (exprlen >= 1 && expr.getch() == ',')
1472
                        {
1473
                            /* skip the ',' and parse the number */
1474
                            expr.inc(&exprlen);
1475
                            max_val = parse_int(&expr, &exprlen);
1476
                        }
1477
                        else
1478
                        {
1479
                            /* 
1480
                             *   there's no other value, so this is a
1481
                             *   simple loop with the same value for min
1482
                             *   and max 
1483
                             */
1484
                            max_val = min_val;
1485
                        }
1486
1487
                        /* 
1488
                         *   if we're not looking at a '}', skip
1489
                         *   characters until we are 
1490
                         */
1491
                        while (exprlen != 0 && expr.getch() != '}')
1492
                            expr.inc(&exprlen);
1493
1494
                        /* 
1495
                         *   if there's a '?' following, it's a 'shortest'
1496
                         *   modifier - note it 
1497
                         */
1498
                        shortest = FALSE;
1499
                        if (exprlen > 1 && expr.getch_at(1) == '?')
1500
                        {
1501
                            /* note the modifier */
1502
                            shortest = TRUE;
1503
1504
                            /* skip another character for the modifier */
1505
                            expr.inc(&exprlen);
1506
                        }
1507
1508
                        /* set up an interval node */
1509
                        build_interval(&interval_machine, &new_machine,
1510
                                       min_val, max_val, var_id, shortest);
1511
1512
                        /* replace the original machine with the interval */
1513
                        new_machine = interval_machine;
1514
1515
                        /* skip any closure modifiers that follow */
1516
                        goto skip_closures;
1517
                    }
1518
                    break;
1519
                    
1520
                default:
1521
                    /* no postfix operator */
1522
                    break;
1523
                }
1524
            }
1525
1526
            /*
1527
             *   Concatenate the new machine onto the current machine
1528
             *   under construction.  
1529
             */
1530
            concat_onto(&cur_machine, &new_machine);
1531
            break;
1532
        }
1533
1534
        /* if we've run down the expression string, go no further */
1535
        if (exprlen == 0)
1536
            break;
1537
    }
1538
1539
    /* if there are any open parens outstanding, close them */
1540
    if (group_stack_level != 0)
1541
        goto do_close_paren;
1542
1543
    /* complete any pending alternation */
1544
    alternate_onto(&alter_machine, &cur_machine);
1545
1546
    /* check for and break any infinite loops */
1547
    break_loops(&alter_machine);
1548
1549
    /* remove meaningless branch-to-branch transitions */
1550
    remove_branch_to_branch(&alter_machine);
1551
1552
    /* store the results in the caller's base pattern description */
1553
    pat->machine = alter_machine;
1554
    pat->tuple_cnt = next_state_;
1555
1556
    /* limit the group count to the maximum */
1557
    if (pat->group_cnt > RE_GROUP_REG_CNT)
1558
        pat->group_cnt = RE_GROUP_REG_CNT;
1559
1560
    /* limit the variable count to the maximum */
1561
    if (pat->loop_var_cnt > RE_LOOP_VARS_MAX)
1562
        pat->loop_var_cnt = RE_LOOP_VARS_MAX;
1563
1564
    /* no errors encountered */
1565
    return RE_STATUS_SUCCESS;
1566
}
1567
1568
1569
/* 
1570
 *   Parse a character class or class range expresion.  Returns true on
1571
 *   success, in which case we will have built a class range expression in
1572
 *   new_machine and updated expr and exprlen to skip the expression.
1573
 *   Returns false on syntax error or other failure, in which case expr and
1574
 *   exprlen will be unchanged.  
1575
 */
1576
int CRegexParser::compile_char_class_expr(utf8_ptr *expr, size_t *exprlen,
1577
                                          re_machine *result_machine)
1578
{
1579
    utf8_ptr p;
1580
    size_t rem;
1581
    int is_exclusive;
1582
    
1583
    /* start at the character after the '<' */
1584
    p = *expr;
1585
    rem = *exprlen;
1586
    p.inc(&rem);
1587
1588
    /* presume it won't be exclusive */
1589
    is_exclusive = FALSE;
1590
1591
    /* check for an exclusion flag */
1592
    if (rem != 0 && p.getch() == '^')
1593
    {
1594
        /* note the exclusion */
1595
        is_exclusive = TRUE;
1596
        
1597
        /* skip the exclusion prefix */
1598
        p.inc(&rem);
1599
    }
1600
    
1601
    /* clear out the range builder buffer */
1602
    range_buf_cnt_ = 0;
1603
    
1604
    /* 
1605
     *   keep going until we find the closing delimiter (or run out of
1606
     *   expression string) 
1607
     */
1608
    for (;;)
1609
    {
1610
        utf8_ptr start;
1611
        wchar_t delim;
1612
        size_t curcharlen;
1613
        size_t curbytelen;
1614
        
1615
        /* remember where the current part starts */
1616
        start = p;
1617
        
1618
        /* scan for the '>' or '|' */
1619
        for (curcharlen = 0 ; rem != 0 ; p.inc(&rem), ++curcharlen)
1620
        {
1621
            /* get the next character */
1622
            delim = p.getch();
1623
            
1624
            /* if it's '>' or '|', we're done */
1625
            if (delim == '>' || delim == '|')
1626
                break;
1627
        }
1628
        
1629
        /* 
1630
         *   If we reached the end of the expression without finding a
1631
         *   closing delimiter, the expression is invalid - treat the whole
1632
         *   thing (starting with the '<') as ordinary characters.  
1633
         */
1634
        if (rem == 0)
1635
            return FALSE;
1636
        
1637
        /* get the length of this part */
1638
        curbytelen = (size_t)(p.getptr() - start.getptr());
1639
        
1640
        /* 
1641
         *   See what we have.  If we have a single character, it's a
1642
         *   literal.  If we have a character, a hyphen, and another
1643
         *   character, it's a literal range.  Otherwise, it must be one of
1644
         *   our named classes.  
1645
         */
1646
        if (curcharlen == 1)
1647
        {
1648
            /* it's a literal - add the character */
1649
            add_range_char(start.getch());
1650
        }
1651
        else if (curcharlen == 3 && start.getch_at(1) == '-')
1652
        {
1653
            /* it's a literal range - add it */
1654
            add_range_char(start.getch(), start.getch_at(2));
1655
        }
1656
        else
1657
        {
1658
            struct char_class_t
1659
            {
1660
                /* expression name for the class */
1661
                const char *name;
1662
                
1663
                /* length of the class name */
1664
                size_t name_len;
1665
                
1666
                /* 
1667
                 *   literal character, if the name represents a character
1668
                 *   rather than a class - this is used only if code ==
1669
                 *   RE_LITERAL 
1670
                 */
1671
                wchar_t ch;
1672
                
1673
                /* RE_xxx code for the class */
1674
                re_recog_type code;
1675
            };
1676
            static const char_class_t classes[] =
1677
            {
1678
                { "alpha",    5, 0, RE_ALPHA },
1679
                { "digit",    5, 0, RE_DIGIT },
1680
                { "upper",    5, 0, RE_UPPER },
1681
                { "lower",    5, 0, RE_LOWER },
1682
                { "alphanum", 8, 0, RE_ALPHANUM },
1683
                { "space",    5, 0, RE_SPACE },
1684
                { "punct",    5, 0, RE_PUNCT },
1685
                { "newline",  7, 0, RE_NEWLINE },
1686
                { "langle",   6, '<', RE_LITERAL },
1687
                { "rangle",   6, '>', RE_LITERAL },
1688
                { "vbar",     4, '|', RE_LITERAL },
1689
                { "caret",    5, '^', RE_LITERAL },
1690
                { "squote",   6, '\'', RE_LITERAL },
1691
                { "dquote",   6, '"', RE_LITERAL },
1692
                { "star",     4, '*', RE_LITERAL },
1693
                { "question", 8, '?', RE_LITERAL },
1694
                { "percent",  7, '%', RE_LITERAL },
1695
                { "dot",      3, '.', RE_LITERAL },
1696
                { "period",   6, '.', RE_LITERAL },
1697
                { "plus",     4, '+', RE_LITERAL },
1698
                { "lsquare",  7, '[', RE_LITERAL },
1699
                { "rsquare",  7, ']', RE_LITERAL },
1700
                { "lparen",   6, '(', RE_LITERAL },
1701
                { "rparen",   6, ')', RE_LITERAL},
1702
                { "lbrace",   6, '{', RE_LITERAL },
1703
                { "rbrace",   6, '}', RE_LITERAL },
1704
                { "dollar",   6, '$', RE_LITERAL },
1705
                { "backslash",9, '\\', RE_LITERAL },
1706
                { "return",   6, 0x000D, RE_LITERAL },
1707
                { "linefeed", 8, 0x000A, RE_LITERAL },
1708
                { "tab",      3, 0x0009, RE_LITERAL },
1709
                { "nul",      3, 0x0000, RE_LITERAL },
1710
                { "null",     4, 0x0000, RE_LITERAL }
1711
            };
1712
            const char_class_t *cp;
1713
            size_t i;
1714
            int found;
1715
1716
            /* scan our name list for a match */
1717
            for (cp = classes, i = 0, found = FALSE ;
1718
                 i < sizeof(classes)/sizeof(classes[0]) ;
1719
                 ++i, ++cp)
1720
            {
1721
                /* 
1722
                 *   if the length matches and the name matches (ignoring
1723
                 *   case), this is the one we want 
1724
                 */
1725
                if (curbytelen == cp->name_len
1726
                    && memicmp(start.getptr(), cp->name, curbytelen) == 0)
1727
                {
1728
                    /* 
1729
                     *   this is it - add either a class range or literal
1730
                     *   range, depending on the meaning of the name 
1731
                     */
1732
                    if (cp->code == RE_LITERAL)
1733
                    {
1734
                        /* it's a name for a literal */
1735
                        add_range_char(cp->ch);
1736
                    }
1737
                    else
1738
                    {
1739
                        /* 
1740
                         *   It's a name for a character class.  As a
1741
                         *   special case for efficiency, if this is the one
1742
                         *   and only thing in this class expression, don't
1743
                         *   create a range expression but instead create a
1744
                         *   special for the class.
1745
                         *   
1746
                         *   Note that we can't do this for an exclusive
1747
                         *   class, since we don't have any special matcher
1748
                         *   for those - implement those with a character
1749
                         *   range as usual.  
1750
                         */
1751
                        if (range_buf_cnt_ == 0
1752
                            && delim == '>'
1753
                            && !is_exclusive)
1754
                        {
1755
                            /*
1756
                             *   This is the only thing, so build a special
1757
                             *   to match this class - this is more
1758
                             *   efficient to store and to match than a
1759
                             *   range expression.  
1760
                             */
1761
                            build_special(result_machine, cp->code, 0);
1762
                            
1763
                            /* skip to the '>' */
1764
                            *expr = p;
1765
                            *exprlen = rem;
1766
                            
1767
                            /* 
1768
                             *   we're done with the expresion - tell the
1769
                             *   caller we were successful 
1770
                             */
1771
                            return TRUE;
1772
                        }
1773
                        else
1774
                        {
1775
                            /* 
1776
                             *   it's not the only thing, so add the class
1777
                             *   to the range list 
1778
                             */
1779
                            add_range_class(cp->code);
1780
                        }
1781
                    }
1782
                    
1783
                    /* note that we found a match */
1784
                    found = TRUE;
1785
                    
1786
                    /* no need to scan further in our table */
1787
                    break;
1788
                }
1789
            }
1790
            
1791
            /* if we didn't find a match, the whole expression is invalid */
1792
            if (!found)
1793
                return FALSE;
1794
        }
1795
        
1796
        /* 
1797
         *   if we found the '>', we're done; if we found a '|', skip it and
1798
         *   keep going 
1799
         */
1800
        if (delim == '|')
1801
        {
1802
            /* skip the delimiter, and back for another round */
1803
            p.inc(&rem);
1804
        }
1805
        else
1806
        {
1807
            /* we found the '>', so we're done - add the range recognizer */
1808
            build_char_range(result_machine, is_exclusive);
1809
            
1810
            /* skip up to the '>' */
1811
            *expr = p;
1812
            *exprlen = rem;
1813
            
1814
            /* tell the caller we were successful */
1815
            return TRUE;
1816
        }
1817
    }
1818
}
1819
1820
1821
/*
1822
 *   Parse an integer value.  Returns -1 if there's no number.  
1823
 */
1824
int CRegexParser::parse_int(utf8_ptr *p, size_t *rem)
1825
{
1826
    int acc;
1827
    
1828
    /* if it's not a number to start with, simply return -1 */
1829
    if (*rem == 0 || !is_digit(p->getch()))
1830
        return -1;
1831
    
1832
    /* keep going as long as we find digits */
1833
    for (acc = 0 ; *rem > 0 && is_digit(p->getch()) ; p->inc(rem))
1834
    {
1835
        /* add this digit into the accumulator */
1836
        acc *= 10;
1837
        acc += value_of_digit(p->getch());
1838
    }
1839
1840
    /* return the accumulated result */
1841
    return acc;
1842
}
1843
1844
/*
1845
 *   Break any infinite loops in the machine.  Check for cycles that
1846
 *   consist solely of epsilon transitions, and break each cycle at the
1847
 *   last alternating transition. 
1848
 */
1849
void CRegexParser::break_loops(re_machine *machine)
1850
{
1851
    re_state_id cur;
1852
    
1853
    /* 
1854
     *   for each state, search the machine for cycles from the state back
1855
     *   to itself 
1856
     */
1857
    for (cur = 0 ; cur < next_state_ ; ++cur)
1858
    {
1859
        /* test for loops from this state back to itself */
1860
        find_loop(machine, cur);
1861
    }
1862
}
1863
1864
/*
1865
 *   Find and break loops from the current state back to the given initial
1866
 *   state completely via epsilon transitions.  Returns true if we found a
1867
 *   loop back to the initial state, false if not.  
1868
 */
1869
int CRegexParser::find_loop(re_machine *machine, re_state_id cur_state)
1870
{
1871
    /* 
1872
     *   keep going until we get to a final state or a non-epsilon
1873
     *   transition 
1874
     */
1875
    for (;;)
1876
    {
1877
        re_tuple *tuple;
1878
1879
        /* 
1880
         *   if this is the final state, there's nowhere else to go, so we
1881
         *   evidently can't find the loop we were seeking
1882
         */
1883
        if (cur_state == machine->final)
1884
            return FALSE;
1885
1886
        /* get the tuple for this state */
1887
        tuple = &tuple_arr_[cur_state];
1888
1889
        /* 
1890
         *   if this state has already been visited by a recursive caller,
1891
         *   we're in a loop 
1892
         */
1893
        if ((tuple->flags & RE_STATE_CYCLE_TEST) != 0)
1894
            return TRUE;
1895
        
1896
        /* if it's a two-transition epsilon state, handle it separately */
1897
        if (tuple->typ == RE_EPSILON
1898
            && tuple->next_state_2 != RE_STATE_INVALID)
1899
        {
1900
            unsigned char old_flags;
1901
            
1902
            /* 
1903
             *   This is a branch.  Recursively try to find the loop in
1904
             *   each branch.  If we do find the loop in either branch, it
1905
             *   means that there are no branches closer to the final
1906
             *   transition back to the original state, so we must break
1907
             *   this branch.  
1908
             */
1909
1910
            /* 
1911
             *   mark the current state as being inspected, so if we
1912
             *   encounter it again we'll know we've found a cycle 
1913
             */
1914
            old_flags = tuple->flags;
1915
            tuple->flags |= RE_STATE_CYCLE_TEST;
1916
1917
            /* 
1918
             *   try the second state first, since we won't have to
1919
             *   shuffle around the slots to clear the second state 
1920
             */
1921
            if (find_loop(machine, tuple->next_state_2))
1922
            {
1923
                /* the second branch loops - break it */
1924
                tuple->next_state_2 = RE_STATE_INVALID;
1925
            }
1926
            
1927
            /* check the other branch */
1928
            if (find_loop(machine, tuple->next_state_1))
1929
            {
1930
                /* 
1931
                 *   the first branch loops - move the second branch to
1932
                 *   the first slot, and clear the second slot (if we only
1933
                 *   have one branch, it must always be in the first slot) 
1934
                 */
1935
                tuple->next_state_1 = tuple->next_state_2;
1936
                tuple->next_state_2 = RE_STATE_INVALID;
1937
            }
1938
1939
            /* we're done testing this state - restore its original marking */
1940
            tuple->flags = old_flags;
1941
1942
            /* 
1943
             *   if both branches are invalid, this entire state is
1944
             *   unusable, so tell the caller to break its branch to here 
1945
             */
1946
            if (tuple->next_state_1 == RE_STATE_INVALID)
1947
                return TRUE;
1948
            
1949
            /* 
1950
             *   there's no need to continue either branch, since we've
1951
             *   already followed them all the way to the end via the
1952
             *   recursive call - and if we had a loop, we've broken it,
1953
             *   so simply tell the caller there are no more loops along
1954
             *   this path 
1955
             */
1956
            return FALSE;
1957
        }
1958
1959
        /* see what kind of transition this is */
1960
        switch(tuple->typ)
1961
        {
1962
        case RE_EPSILON:
1963
        case RE_GROUP_ENTER:
1964
        case RE_GROUP_EXIT:
1965
            /* 
1966
             *   epsilon or group transition - this could definitely be part
1967
             *   of a loop, so move on to the next state and keep looking 
1968
             */
1969
            cur_state = tuple->next_state_1;
1970
1971
            /* 
1972
             *   if this took us to an invalid state, we must have reached
1973
             *   the final state of a sub-machine - we can go no further
1974
             *   from here, so there are no loops in this branch 
1975
             */
1976
            if (cur_state == RE_STATE_INVALID)
1977
                return FALSE;
1978
            break;
1979
1980
        case RE_TEXT_BEGIN:
1981
        case RE_TEXT_END:
1982
        case RE_WORD_BEGIN:
1983
        case RE_WORD_END:
1984
        case RE_WORD_BOUNDARY:
1985
        case RE_NON_WORD_BOUNDARY:
1986
        case RE_ASSERT_POS:
1987
        case RE_ASSERT_NEG:
1988
        case RE_ZERO_VAR:
1989
            /* 
1990
             *   none of these transitions consumes input, so any of these
1991
             *   could result in an infinite loop - continue down the
1992
             *   current path 
1993
             */
1994
            cur_state = tuple->next_state_1;
1995
            break;
1996
1997
        default:
1998
            /* 
1999
             *   all other states consume input, so this branch definitely
2000
             *   can't loop back to the original state without consuming
2001
             *   input - we do not need to proceed any further down the
2002
             *   current branch, since it's not an infinite epsilon loop
2003
             *   even if it does ultimately find its way back to the
2004
             *   initial state 
2005
             */
2006
            return FALSE;
2007
        }
2008
    }
2009
}
2010
2011
/*
2012
 *   Optimization: remove meaningless branch-to-branch transitions.  An
2013
 *   "epsilon" state that has only one outgoing transition does nothing
2014
 *   except move to the next state, so any transition that points to such a
2015
 *   state could just as well point to the destination of the epsilon's one
2016
 *   outgoing transition.  
2017
 */
2018
void CRegexParser::remove_branch_to_branch(re_machine *machine)
2019
{
2020
    re_state_id cur;
2021
    re_tuple *tuple;
2022
    
2023
    /* make sure the initial state points to the first meaningful state */
2024
    optimize_transition(machine, &machine->init);
2025
2026
    /* scan all active states */
2027
    for (cur = 0, tuple = tuple_arr_ ; cur < next_state_ ; ++cur, ++tuple)
2028
    {
2029
        /* if this isn't the final state, check it */
2030
        if (cur != machine->final)
2031
        {
2032
            /* check both of our outgoing transitions */
2033
            optimize_transition(machine, &tuple->next_state_1);
2034
            optimize_transition(machine, &tuple->next_state_2);
2035
        }
2036
    }
2037
}
2038
2039
/*
2040
 *   Optimize a single transition to remove meaningless branch-to-branch
2041
 *   transitions. 
2042
 */
2043
void CRegexParser::optimize_transition(const re_machine *machine,
2044
                                       re_state_id *trans)
2045
{
2046
    /* keep going as long as we find meaningless forwarding branches */
2047
    for (;;)
2048
    {
2049
        re_tuple *tuple_nxt;
2050
        
2051
        /* 
2052
         *   if this transition points to the machine's final state, there's
2053
         *   nowhere else to go from here 
2054
         */
2055
        if (*trans == RE_STATE_INVALID || *trans == machine->final)
2056
            return;
2057
2058
        /* 
2059
         *   if the transition points to anything other than a single-branch
2060
         *   epsilon, we point to a meaningful next state, so there's no
2061
         *   further branch-to-branch elimination we can perform 
2062
         */
2063
        tuple_nxt = &tuple_arr_[*trans];
2064
        if (tuple_nxt->typ != RE_EPSILON
2065
            || tuple_nxt->next_state_2 != RE_STATE_INVALID)
2066
            return;
2067
2068
        /* 
2069
         *   This transition points to a meaningless intermediate state, so
2070
         *   we can simply skip the intermediate state and go directly from
2071
         *   the current state to the target state's single target.  Once
2072
         *   we've done this, continue scanning, because we might find that
2073
         *   the new target state itself is a meaningless intermediate state
2074
         *   that we can skip past as well (and so on, and so on - keep
2075
         *   going until we find a real target state).  
2076
         */
2077
        *trans = tuple_nxt->next_state_1;
2078
    }
2079
}
2080
2081
/* ------------------------------------------------------------------------ */
2082
/*
2083
 *   Compile an expression and return a newly-allocated pattern object.  
2084
 */
2085
re_status_t CRegexParser::compile_pattern(const char *expr_str,
2086
                                          size_t exprlen,
2087
                                          re_compiled_pattern **pattern)
2088
{
2089
    re_status_t stat;
2090
    re_state_id i;
2091
    re_tuple *t;
2092
    re_compiled_pattern *pat;
2093
    size_t siz;
2094
    wchar_t *rp;
2095
    re_compiled_pattern_base base_pat;
2096
2097
    /* presume we won't allocate a new pattern object */
2098
    *pattern = 0;
2099
2100
    /* compile the pattern */
2101
    if ((stat = compile(expr_str, exprlen, &base_pat)) != RE_STATUS_SUCCESS)
2102
        return stat;
2103
2104
    /* 
2105
     *   Start off with our basic space needs: we need space for the base
2106
     *   structure, plus space for re_tuple array (actually, space for the
2107
     *   number of allocated tuples minus one, since there's one built into
2108
     *   the base structure).  
2109
     */
2110
    siz = sizeof(re_compiled_pattern)
2111
          + (base_pat.tuple_cnt - 1)*sizeof(pat->tuples[0]);
2112
2113
    /* 
2114
     *   Run through the tuples in the result machine and add up the amount
2115
     *   of extra space we'll need for extra allocations (specifically,
2116
     *   character ranges).  
2117
     */
2118
    for (i = 0, t = tuple_arr_ ; i < base_pat.tuple_cnt ; ++i, ++t)
2119
    {
2120
        /* check what kind of tuple this is */
2121
        switch(t->typ)
2122
        {
2123
        case RE_RANGE:
2124
        case RE_RANGE_EXCL:
2125
            /* range - add in space for the character range data */
2126
            siz += sizeof(t->info.range.char_range[0])
2127
                   * t->info.range.char_range_cnt;
2128
            break;
2129
2130
        default:
2131
            /* other types require no extra storage */
2132
            break;
2133
        }
2134
    }
2135
2136
    /* allocate space for the structure */
2137
    *pattern = pat = (re_compiled_pattern *)t3malloc(siz);
2138
2139
    /* copy the base pattern to the result */
2140
    memcpy(pat, &base_pat, sizeof(base_pat));
2141
2142
    /* copy the tuple array */
2143
    memcpy(pat->tuples, tuple_arr_, pat->tuple_cnt * sizeof(pat->tuples[0]));
2144
2145
    /*
2146
     *   Pack the range data onto the end of the tuple array in the data
2147
     *   structure.  We already calculated how much space we'll need for this
2148
     *   and included the space in the structure's allocation, so we simply
2149
     *   need to find the location of the range data at the end of our tuple
2150
     *   array.  
2151
     */
2152
    rp = (wchar_t *)&pat->tuples[pat->tuple_cnt];
2153
2154
    /* 
2155
     *   run through the new tuple array and pack the range data into the new
2156
     *   allocation unit 
2157
     */
2158
    for (i = 0, t = pat->tuples ; i < next_state_ ; ++i, ++t)
2159
    {
2160
        /* check what kind of tuple this is */
2161
        switch(t->typ)
2162
        {
2163
        case RE_RANGE:
2164
        case RE_RANGE_EXCL:
2165
            /* 
2166
             *   Character range.  Copy the original character range data
2167
             *   into the new allocation unit, at the next available location
2168
             *   in the allocation unit.  
2169
             */
2170
            memcpy(rp, t->info.range.char_range,
2171
                   t->info.range.char_range_cnt
2172
                   * sizeof(t->info.range.char_range[0]));
2173
2174
            /* fix up the tuple to point to the packed copy */
2175
            t->info.range.char_range = rp;
2176
2177
            /* move past the space in our allocation unit */
2178
            rp += t->info.range.char_range_cnt;
2179
            break;
2180
2181
        default:
2182
            /* other types contain no range data */
2183
            break;
2184
        }
2185
    }
2186
2187
    /* success */
2188
    return stat;
2189
}
2190
2191
/* 
2192
 *   free a pattern previously created with compile_pattern() 
2193
 */
2194
void CRegexParser::free_pattern(re_compiled_pattern *pattern)
2195
{
2196
    /* we allocate each pattern as a single unit, so it's easy to free */
2197
    t3free(pattern);
2198
}
2199
2200
/* ------------------------------------------------------------------------ */
2201
/*
2202
 *   Pattern recognizer 
2203
 */
2204
2205
/*
2206
 *   construct 
2207
 */
2208
CRegexSearcher::CRegexSearcher()
2209
{
2210
}
2211
2212
/*
2213
 *   delete 
2214
 */
2215
CRegexSearcher::~CRegexSearcher()
2216
{
2217
}
2218
2219
/*
2220
 *   Match a string to a compiled expression.  Returns the length of the
2221
 *   match if successful, or -1 if no match was found.  
2222
 */
2223
int CRegexSearcher::match(const char *entire_str,
2224
                          const char *str, const size_t origlen,
2225
                          const re_compiled_pattern_base *pattern,
2226
                          const re_tuple *tuple_arr,
2227
                          const re_machine *machine,
2228
                          re_group_register *regs,
2229
                          short *loop_vars)
2230
{
2231
    size_t entire_str_len;
2232
    re_state_id cur_state, final_state;
2233
    utf8_ptr p;
2234
    size_t curlen;
2235
    int start_ofs;
2236
    int _retval_;
2237
2238
    const int ST_EPS1 = 1;      /* doing first branch of two-branch epsilon */
2239
    const int ST_EPS2 = 2;     /* doing second branch of two-branch epsilon */
2240
    const int ST_ASSERT = 3;                          /* doing an assertion */
2241
2242
    /* macro to perform a"local return" */
2243
#define local_return(retval) \
2244
    _retval_ = (retval); \
2245
    goto do_local_return;
2246
2247
    /* reset the stack */
2248
    stack_.reset();
2249
2250
    /* start at the machine's initial state */
2251
    cur_state = machine->init;
2252
2253
    /* note the final state */
2254
    final_state = machine->final;
2255
2256
    /* 
2257
     *   if we're starting in the final state, this is a zero-length
2258
     *   pattern, which matches anything - immediately return success with a
2259
     *   zero-length match 
2260
     */
2261
    if (cur_state == final_state)
2262
        return 0;
2263
2264
    /* start at the beginning of the string */
2265
    p.set((char *)str);
2266
    curlen = origlen;
2267
2268
    /* note the offset from the start of the entire string */
2269
    start_ofs = str - entire_str;
2270
    entire_str_len = start_ofs + origlen;
2271
2272
    /* run the machine */
2273
    for (;;)
2274
    {
2275
        const re_tuple *tuple;
2276
2277
        /* get the tuple for this state */
2278
        tuple = &tuple_arr[cur_state];
2279
2280
        /* check the type of state we're processing */
2281
        switch(tuple->typ)
2282
        {
2283
        case RE_ZERO_VAR:
2284
            /* save the variable in the stack frame if necessary */
2285
            stack_.save_loop_var(tuple->info.loop.loop_var, loop_vars);
2286
2287
            /* zero the loop variable and proceed */
2288
            loop_vars[tuple->info.loop.loop_var] = 0;
2289
            break;
2290
            
2291
        case RE_LOOP_BRANCH:
2292
            /*
2293
             *   This is a loop branch.  Check the variable to see if we've
2294
             *   satisfied the loop condition yet:
2295
             *   
2296
             *   - If we haven't yet reached the minimum loop count, simply
2297
             *   transition into the loop (i.e., take the 'enter' branch
2298
             *   unconditionally).
2299
             *   
2300
             *   - If we have reached the maximum loop count (if there is
2301
             *   one - if max is -1 then there's no upper bound), simply
2302
             *   transition past the loop (i.e., take the 'bypass' branch
2303
             *   unconditionally).
2304
             *   
2305
             *   - Otherwise, we must treat this just like an ordinary
2306
             *   two-way epsilon branch.
2307
             *   
2308
             *   Note that, if we have a 'shortest' modifier, the bypass
2309
             *   branch will be first, to encourage the indeterminate check
2310
             *   to choose the short branch whenever possible; otherwise the
2311
             *   enter branch will be first, so we take the long branch
2312
             *   whenever possible.  
2313
             */
2314
            {
2315
                short *varptr;
2316
                
2317
                /* get the variable value */
2318
                varptr = &loop_vars[tuple->info.loop.loop_var];
2319
                
2320
                /* save the variable in the stack frame if necessary */
2321
                stack_.save_loop_var(tuple->info.loop.loop_var, loop_vars);
2322
2323
                /* 
2324
                 *   if we're not at the loop minimum yet, transition into
2325
                 *   the loop body 
2326
                 */
2327
                if (*varptr < tuple->info.loop.loop_min)
2328
                {
2329
                    /* increment the loop counter */
2330
                    ++(*varptr);
2331
                    
2332
                    /* unconditionally transfer into the loop */
2333
                    if ((tuple->flags & RE_STATE_SHORTEST) == 0)
2334
                        cur_state = tuple->next_state_1;
2335
                    else
2336
                        cur_state = tuple->next_state_2;
2337
                    
2338
                    /* we're done processing this state */
2339
                    goto got_next_state;
2340
                }
2341
                
2342
                /* 
2343
                 *   if we've reached the loop maximum (if there is one),
2344
                 *   transition past the loop 
2345
                 */
2346
                if (tuple->info.loop.loop_max >= 0
2347
                    && *varptr >= tuple->info.loop.loop_max)
2348
                {
2349
                    /* unconditionally skip the loop */
2350
                    if ((tuple->flags & RE_STATE_SHORTEST) == 0)
2351
                        cur_state = tuple->next_state_2;
2352
                    else
2353
                        cur_state = tuple->next_state_1;
2354
                    
2355
                    /* we're done with this state */
2356
                    goto got_next_state;
2357
                }
2358
2359
                /* 
2360
                 *   We don't know which way to go, so we must treat this as
2361
                 *   a two-way epsilon branch.  Count this as another loop
2362
                 *   iteration, since the branch that enters the loop will
2363
                 *   come back here for another try.  The branch that skips
2364
                 *   the loop doesn't care about the loop counter any more,
2365
                 *   so we can just increment it and ignore the skip branch.
2366
                 */
2367
                ++(*varptr);
2368
                goto two_way_epsilon;
2369
            }
2370
            /* not reached */
2371
            
2372
        case RE_GROUP_MATCH:
2373
            {
2374
                int group_num;
2375
                re_group_register *group_reg;
2376
                size_t reg_len;
2377
                
2378
                /* it's a group - get the group number */
2379
                group_num = (int)tuple->info.ch;
2380
                group_reg = &regs[group_num];
2381
2382
                /* 
2383
                 *   if this register isn't defined, there's nothing to
2384
                 *   match, so fail 
2385
                 */
2386
                if (group_reg->start_ofs == -1
2387
                    || group_reg->end_ofs == -1)
2388
                {
2389
                    local_return(-1);
2390
                }
2391
2392
                /* calculate the length of the register value */
2393
                reg_len = group_reg->end_ofs - group_reg->start_ofs;
2394
2395
                /* if we don't have enough left to match, it fails */
2396
                if (curlen < reg_len)
2397
                {
2398
                    local_return(-1);
2399
                }
2400
2401
                /* if the string doesn't match exactly, we fail */
2402
                if (memcmp(p.getptr(), entire_str + group_reg->start_ofs,
2403
                           reg_len) != 0)
2404
                {
2405
                    local_return(-1);
2406
                }
2407
2408
                /*
2409
                 *   It matches exactly - skip the entire byte length of the
2410
                 *   register in the source string 
2411
                 */
2412
                p.set(p.getptr() + reg_len);
2413
                curlen -= reg_len;
2414
            }
2415
            break;
2416
2417
        case RE_TEXT_BEGIN:
2418
            /* 
2419
             *   Match only the exact beginning of the string - if we're
2420
             *   anywhere else, this isn't a match.  If this succeeds, we
2421
             *   don't skip any characters.  
2422
             */
2423
            if (p.getptr() != entire_str)
2424
            {
2425
                local_return(-1);
2426
            }
2427
            break;
2428
2429
        case RE_TEXT_END:
2430
            /*
2431
             *   Match only the exact end of the string - if we're anywhere
2432
             *   else, this isn't a match.  Don't skip any characters on
2433
             *   success.  
2434
             */
2435
            if (curlen != 0)
2436
            {
2437
                local_return(-1);
2438
            }
2439
            break;
2440
2441
        case RE_WORD_BEGIN:
2442
            /* 
2443
             *   If the previous character is a word character, we're not at
2444
             *   the beginning of a word.  If we're at the beginning of the
2445
             *   entire string, we need not check anything previous -
2446
             *   there's no previous character, so we can't have a preceding
2447
             *   word character.  
2448
             */
2449
            if (p.getptr() != entire_str
2450
                && is_word_char(p.getch_before(1)))
2451
            {
2452
                local_return(-1);
2453
            }
2454
2455
            /* 
2456
             *   if we're at the end of the string, or the current character
2457
             *   isn't a word character, we're not at the beginning of a
2458
             *   word 
2459
             */
2460
            if (curlen == 0 || !is_word_char(p.getch()))
2461
            {
2462
                local_return(-1);
2463
            }
2464
            break;
2465
2466
        case RE_WORD_END:
2467
            /*
2468
             *   if the current character is a word character, we're not at
2469
             *   the end of a word 
2470
             */
2471
            if (curlen != 0 && is_word_char(p.getch()))
2472
            {
2473
                local_return(-1);
2474
            }
2475
            
2476
            /*
2477
             *   if we're at the beginning of the string, or the previous
2478
             *   character is not a word character, we're not at the end of
2479
             *   a word 
2480
             */
2481
            if (p.getptr() == entire_str
2482
                || !is_word_char(p.getch_before(1)))
2483
            {
2484
                local_return(-1);
2485
            }
2486
            break;
2487
2488
        case RE_WORD_CHAR:
2489
            /* if it's not a word character, it's a failure */
2490
            if (curlen == 0 || !is_word_char(p.getch()))
2491
            {
2492
                local_return(-1);
2493
            }
2494
2495
            /* skip this character of input */
2496
            p.inc(&curlen);
2497
            break;
2498
2499
        case RE_NON_WORD_CHAR:
2500
            /* if it's a word character, it's a failure */
2501
            if (curlen == 0 || is_word_char(p.getch()))
2502
            {
2503
                local_return(-1);
2504
            }
2505
2506
            /* skip the input */
2507
            p.inc(&curlen);
2508
            break;
2509
2510
        case RE_WORD_BOUNDARY:
2511
        case RE_NON_WORD_BOUNDARY:
2512
            {
2513
                int prev_is_word;
2514
                int next_is_word;
2515
                int boundary;
2516
2517
                /*
2518
                 *   Determine if the previous character is a word character
2519
                 *   -- if we're at the beginning of the string, it's
2520
                 *   obviously not, otherwise check its classification 
2521
                 */
2522
                if (p.getptr() == entire_str)
2523
                {
2524
                    /* 
2525
                     *   at the start of the string, so there definitely
2526
                     *   isn't a preceding word character 
2527
                     */
2528
                    prev_is_word = FALSE;
2529
                }
2530
                else
2531
                {
2532
                    /* look at the previous character */
2533
                    prev_is_word = is_word_char(p.getch_before(1));
2534
                }
2535
2536
                /* make the same check for the current character */
2537
                next_is_word = (curlen != 0
2538
                                && is_word_char(p.getch()));
2539
2540
                /*
2541
                 *   Determine if this is a boundary - it is if the two
2542
                 *   states are different 
2543
                 */
2544
                boundary = ((prev_is_word != 0) ^ (next_is_word != 0));
2545
2546
                /* 
2547
                 *   make sure it matches what was desired, and return
2548
                 *   failure if not 
2549
                 */
2550
                if ((tuple->typ == RE_WORD_BOUNDARY && !boundary)
2551
                    || (tuple->typ == RE_NON_WORD_BOUNDARY && boundary))
2552
                {
2553
                    local_return(-1);
2554
                }
2555
            }
2556
            break;
2557
            
2558
        case RE_WILDCARD:
2559
            /* make sure we have a character to match */
2560
            if (curlen == 0)
2561
            {
2562
                local_return(-1);
2563
            }
2564
            
2565
            /* skip this character */
2566
            p.inc(&curlen);
2567
            break;
2568
2569
        case RE_RANGE:
2570
        case RE_RANGE_EXCL:
2571
            {
2572
                int match;
2573
                wchar_t ch;
2574
                wchar_t *rp;
2575
                size_t i;
2576
2577
                /* make sure we have a character to match */
2578
                if (curlen == 0)
2579
                {
2580
                    local_return(-1);
2581
                }
2582
2583
                /* get this character */
2584
                ch = p.getch();
2585
2586
                /* search for the character in the range */
2587
                for (i = tuple->info.range.char_range_cnt,
2588
                     rp = tuple->info.range.char_range ;
2589
                     i != 0 ; i -= 2, rp += 2)
2590
                {
2591
                    /* 
2592
                     *   check for a class specifier; if it's not a class
2593
                     *   specifier, treat it as a literal range, and check
2594
                     *   case sensitivity 
2595
                     */
2596
                    if (rp[0] == '\0')
2597
                    {
2598
                        int match;
2599
2600
                        /*
2601
                         *   The first character of the range pair is null,
2602
                         *   which means that this isn't a literal range but
2603
                         *   rather a class.  Check for a match to the
2604
                         *   class.  
2605
                         */
2606
                        switch(rp[1])
2607
                        {
2608
                        case RE_ALPHA:
2609
                            match = t3_is_alpha(ch);
2610
                            break;
2611
2612
                        case RE_DIGIT:
2613
                            match = t3_is_digit(ch);
2614
                            break;
2615
2616
                        case RE_UPPER:
2617
                            match = t3_is_upper(ch);
2618
                            break;
2619
2620
                        case RE_LOWER:
2621
                            match = t3_is_lower(ch);
2622
                            break;
2623
2624
                        case RE_ALPHANUM:
2625
                            match = t3_is_alpha(ch) || t3_is_digit(ch);
2626
                            break;
2627
2628
                        case RE_SPACE:
2629
                            match = t3_is_space(ch);
2630
                            break;
2631
2632
                        case RE_PUNCT:
2633
                            match = t3_is_punct(ch);
2634
                            break;
2635
2636
                        case RE_NEWLINE:
2637
                            match = (ch == 0x000A
2638
                                     || ch == 0x000D
2639
                                     || ch == 0x2028);
2640
                            break;
2641
                            
2642
                        case RE_NULLCHAR:
2643
                            match = (ch == 0);
2644
                            break;
2645
                            
2646
                        default:
2647
                            /* this shouldn't happen */
2648
                            match = FALSE;
2649
                            break;
2650
                        }
2651
                        
2652
                        /* 
2653
                         *   if we matched, we can stop looking; otherwise,
2654
                         *   simply keep going, since there might be another
2655
                         *   entry that does match 
2656
                         */
2657
                        if (match)
2658
                            break;
2659
                    }
2660
                    else if (pattern->case_sensitive)
2661
                    {
2662
                        /* 
2663
                         *   the search is case-sensitive - compare the
2664
                         *   character to the range without case conversion 
2665
                         */
2666
                        if (ch >= rp[0] && ch <= rp[1])
2667
                            break;
2668
                    }
2669
                    else if (t3_is_upper(rp[0]) && t3_is_upper(rp[1]))
2670
                    {
2671
                        wchar_t uch;
2672
                        
2673
                        /* 
2674
                         *   the range is all upper-case letters - convert
2675
                         *   the source character to upper-case for the
2676
                         *   comparison 
2677
                         */
2678
                        uch = t3_to_upper(ch);
2679
                        if (uch >= rp[0] && uch <= rp[1])
2680
                            break;
2681
                    }
2682
                    else if (t3_is_lower(rp[0]) && t3_is_lower(rp[1]))
2683
                    {
2684
                        wchar_t lch;
2685
2686
                        /* 
2687
                         *   the range is all lower-case letters - convert
2688
                         *   the source character to upper-case for the
2689
                         *   comparison 
2690
                         */
2691
                        lch = t3_to_lower(ch);
2692
                        if (lch >= rp[0] && lch <= rp[1])
2693
                            break;
2694
                    }
2695
                    else
2696
                    {
2697
                        /* 
2698
                         *   The cases of the two ends of the range don't
2699
                         *   agree, so there's nothing we can do for case
2700
                         *   conversions.  Simply compare the range exactly.
2701
                         */
2702
                        if (ch >= rp[0] && ch <= rp[1])
2703
                            break;
2704
                    }
2705
                }
2706
                
2707
                /* we matched if we stopped before exhausting the list */
2708
                match = (i != 0);
2709
                
2710
                /* make sure we got what we wanted */
2711
                if ((tuple->typ == RE_RANGE && !match)
2712
                    || (tuple->typ == RE_RANGE_EXCL && match))
2713
                {
2714
                    local_return(-1);
2715
                }
2716
2717
                /* skip this character of the input */
2718
                p.inc(&curlen);
2719
            }
2720
            break;
2721
            
2722
        case RE_ALPHA:
2723
            /* check for an alphabetic character */
2724
            if (curlen == 0 || !t3_is_alpha(p.getch()))
2725
            {
2726
                local_return(-1);
2727
            }
2728
            
2729
            /* skip this character of the input, and continue */
2730
            p.inc(&curlen);
2731
            break;
2732
            
2733
        case RE_DIGIT:
2734
            /* check for a digit character */
2735
            if (curlen == 0 || !t3_is_digit(p.getch()))
2736
            {
2737
                local_return(-1);
2738
            }
2739
2740
            /* skip this character of the input, and continue */
2741
            p.inc(&curlen);
2742
            break;
2743
2744
        case RE_UPPER:
2745
            /* check for an upper-case alphabetic character */
2746
            if (curlen == 0 || !t3_is_upper(p.getch()))
2747
            {
2748
                local_return(-1);
2749
            }
2750
2751
            /* skip this character of the input, and continue */
2752
            p.inc(&curlen);
2753
            break;
2754
2755
        case RE_LOWER:
2756
            /* check for a lower-case alphabetic character */
2757
            if (curlen == 0 || !t3_is_lower(p.getch()))
2758
            {
2759
                local_return(-1);
2760
            }
2761
2762
            /* skip this character of the input, and continue */
2763
            p.inc(&curlen);
2764
            break;
2765
2766
        case RE_ALPHANUM:
2767
            /* check for an alphabetic or digit character */
2768
            if (curlen == 0
2769
                || (!t3_is_alpha(p.getch()) && !t3_is_digit(p.getch())))
2770
            {
2771
                local_return(-1);
2772
            }
2773
2774
            /* skip this character of the input, and continue */
2775
            p.inc(&curlen);
2776
            break;
2777
2778
        case RE_SPACE:
2779
            /* check for a space of some kind */
2780
            if (curlen == 0 || !t3_is_space(p.getch()))
2781
            {
2782
                local_return(-1);
2783
            }
2784
2785
            /* skip this character of the input, and continue */
2786
            p.inc(&curlen);
2787
            break;
2788
2789
        case RE_PUNCT:
2790
            /* check for a punctuation character of some kind */
2791
            if (curlen == 0 || !t3_is_punct(p.getch()))
2792
            {
2793
                local_return(-1);
2794
            }
2795
2796
            /* skip this character of the input, and continue */
2797
            p.inc(&curlen);
2798
            break;
2799
2800
        case RE_NEWLINE:
2801
            /* check for a newline character of some kind */
2802
            {
2803
                wchar_t ch;
2804
2805
                /* if we're out of characters, we don't have a match */
2806
                if (curlen == 0)
2807
                {
2808
                    local_return(-1);
2809
                }
2810
2811
                /* get the character */
2812
                ch = p.getch();
2813
2814
                /* if it's not a newline, we fail this match */
2815
                if (ch != 0x000A && ch != 0x000d && ch != 0x2028)
2816
                {
2817
                    local_return(-1);
2818
                }
2819
            }
2820
2821
            /* skip this character of input and continue */
2822
            p.inc(&curlen);
2823
            break;
2824
2825
        case RE_ASSERT_POS:
2826
        case RE_ASSERT_NEG:
2827
            /*
2828
             *   It's an assertion.  Run this as a sub-state: push the
2829
             *   current state so that we can come back to it later. 
2830
             */
2831
            stack_.push(ST_ASSERT, start_ofs, p.getptr() - entire_str,
2832
                        cur_state, final_state);
2833
2834
            /*
2835
             *   In the sub-state, start with the sub-machine's initial
2836
             *   state and finish with the sub-machine's final state. 
2837
             */
2838
            cur_state = tuple->info.sub.init;
2839
            final_state = tuple->info.sub.final;
2840
2841
            /* in the sub-state, the sub-string to match starts here */
2842
            start_ofs = p.getptr() - entire_str;
2843
2844
            /* 
2845
             *   just proceed from here; we'll finish up with the assertion
2846
             *   test when we reach the final sub-machine state and pop the
2847
             *   stack state we pushed 
2848
             */
2849
            goto got_next_state;
2850
2851
        case RE_LITERAL:
2852
            /* 
2853
             *   ordinary character match - if there's not another
2854
             *   character, we obviously fail 
2855
             */
2856
            if (curlen == 0)
2857
            {
2858
                local_return(-1);
2859
            }
2860
2861
            /* 
2862
             *   check case sensitivity - if we're not in case-sensitive
2863
             *   mode, and both characters are alphabetic, perform a
2864
             *   case-insensitive comparison; otherwise, perform an exact
2865
             *   comparison 
2866
             */
2867
            if (tuple->info.ch == p.getch())
2868
            {
2869
                /* 
2870
                 *   we have an exact match; there's no need to check for
2871
                 *   any case conversions 
2872
                 */
2873
            }
2874
            else if (!pattern->case_sensitive
2875
                     && t3_is_alpha(tuple->info.ch) && t3_is_alpha(p.getch()))
2876
            {
2877
                /* 
2878
                 *   We're performing a case-insensitive search, and both
2879
                 *   characters are alphabetic.  Convert the string
2880
                 *   character to the same case as the pattern character,
2881
                 *   then compare them.  Note that we always use the case of
2882
                 *   the pattern character, because this gives the pattern
2883
                 *   control over the handling for languages where
2884
                 *   conversions are ambiguous.  
2885
                 */
2886
                if (t3_is_upper(tuple->info.ch))
2887
                {
2888
                    /* 
2889
                     *   the pattern character is upper-case - convert the
2890
                     *   string character to upper-case and compare 
2891
                     */
2892
                    if (tuple->info.ch != t3_to_upper(p.getch()))
2893
                    {
2894
                        local_return(-1);
2895
                    }
2896
                }
2897
                else if (t3_is_lower(tuple->info.ch))
2898
                {
2899
                    /* 
2900
                     *   the pattern character is lower-case - compare to
2901
                     *   the lower-case string character 
2902
                     */
2903
                    if (tuple->info.ch != t3_to_lower(p.getch()))
2904
                    {
2905
                        local_return(-1);
2906
                    }
2907
                }
2908
                else
2909
                {
2910
                    /* 
2911
                     *   the pattern character is neither upper nor lower
2912
                     *   case, so we cannot perform a case conversion; we
2913
                     *   know the match isn't exact, so we don't have a
2914
                     *   match at all 
2915
                     */
2916
                    local_return(-1);
2917
                }
2918
            }
2919
            else
2920
            {
2921
                /* 
2922
                 *   the search is case-sensitive, or this pattern character
2923
                 *   is non-alphabetic; since we didn't find an exact match,
2924
                 *   the string does not match the pattern 
2925
                 */
2926
                local_return(-1);
2927
            }
2928
            
2929
            /* skip this character of the input */
2930
            p.inc(&curlen);
2931
            break;
2932
2933
        case RE_GROUP_ENTER:
2934
            /* 
2935
             *   if the group index (given by the state's character ID) is
2936
             *   in range, note the location in the string where the group
2937
             *   starts 
2938
             */
2939
            if (tuple->info.ch < RE_GROUP_REG_CNT)
2940
            {
2941
                /* save the register in the stack frame if necessary */
2942
                stack_.save_group_reg(tuple->info.ch, regs);
2943
                
2944
                /* store the new value */
2945
                regs[tuple->info.ch].start_ofs = p.getptr() - entire_str;
2946
            }
2947
            break;
2948
2949
        case RE_GROUP_EXIT:
2950
            /* 
2951
             *   note the string location of the end of the group, if the
2952
             *   group ID is in range 
2953
             */
2954
            if (tuple->info.ch < RE_GROUP_REG_CNT)
2955
            {
2956
                /* save the register in the stack frame if necessary */
2957
                stack_.save_group_reg(tuple->info.ch, regs);
2958
2959
                /* store the new value */
2960
                regs[tuple->info.ch].end_ofs = p.getptr() - entire_str;
2961
            }
2962
            break;
2963
2964
        case RE_EPSILON:
2965
            /* it's an epsilon transition */
2966
            if (tuple->next_state_2 == RE_STATE_INVALID)
2967
            {
2968
                /*
2969
                 *   We have only one transition, so this state is entirely
2970
                 *   deterministic.  Simply move on to the next state.  (We
2971
                 *   try to optimize these types of transitions out of the
2972
                 *   machine when compiling, but we handle them anyway in
2973
                 *   case any survive optimization.)  
2974
                 */
2975
                cur_state = tuple->next_state_1;
2976
            }
2977
            else
2978
            {
2979
            two_way_epsilon:
2980
                /*
2981
                 *   This state has two possible transitions, and we don't
2982
                 *   know which one to take.  So, try both, see which one
2983
                 *   works better, and return the result.  Try the first
2984
                 *   transition first.
2985
                 *   
2986
                 *   To test both branches, we push the machine state onto
2987
                 *   the stack, test the first branch, then restore the
2988
                 *   machine state and test the second branch.  We return
2989
                 *   the better of the two branches.
2990
                 *   
2991
                 *   Set up to test the first branch by pushing the current
2992
                 *   machine state, noting that we're in the first branch of
2993
                 *   a two-branch epsilon so that we'll know to proceed to
2994
                 *   the second branch when we finish with the first one.  
2995
                 */
2996
                stack_.push(ST_EPS1, start_ofs, p.getptr() - entire_str,
2997
                            cur_state, final_state);
2998
2999
                /* the next state is the first branch of the epsilon */
3000
                cur_state = tuple->next_state_1;
3001
3002
                /* match the sub-string from here */
3003
                start_ofs = p.getptr() - entire_str;
3004
3005
                /* we have the next state set to go */
3006
                goto got_next_state;
3007
            }
3008
            break;
3009
            
3010
        default:
3011
            /* invalid node type */
3012
            assert(FALSE);
3013
            local_return(-1);
3014
        }
3015
        
3016
        /* 
3017
         *   if we got this far, we were successful - move on to the next
3018
         *   state 
3019
         */
3020
        cur_state = tuple->next_state_1;
3021
        
3022
    got_next_state:
3023
        /* we come here when we've already figured out the next state */
3024
        ;
3025
3026
        /* 
3027
         *   If we're in the final state, it means we've matched the
3028
         *   pattern.  Return success by indicating the length of the string
3029
         *   we matched.  
3030
         */
3031
        if (cur_state == final_state)
3032
        {
3033
            local_return(p.getptr() - (entire_str + start_ofs));
3034
        }
3035
3036
        /* 
3037
         *   if we're in an invalid state, the expression must have been
3038
         *   ill-formed; return failure 
3039
         */
3040
        if (cur_state == RE_STATE_INVALID)
3041
        {
3042
            local_return(-1);
3043
        }
3044
3045
        /* resume the loop */
3046
        continue;
3047
3048
    do_local_return:
3049
        /* check what kind of state we're returning to */
3050
        switch(stack_.get_top_type())
3051
        {
3052
            int str_ofs;
3053
            int ret1, ret2;
3054
            int ret1_is_winner;
3055
               
3056
        case -1:
3057
            /*
3058
             *   There's nothing left on the state stack, so we're actually
3059
             *   returning to our caller.  Simply return the given return
3060
             *   value.  
3061
             */
3062
            return _retval_;
3063
3064
        case ST_EPS1:
3065
            /*
3066
             *   We're finishing the first branch of a two-branch epsilon.
3067
             *   Swap the current state with the saved state on the stack,
3068
             *   and push a new state for the second branch.  
3069
             */
3070
            str_ofs = p.getptr() - entire_str;
3071
            stack_.save_and_push(_retval_, ST_EPS2, &start_ofs, &str_ofs,
3072
                                 &cur_state, &final_state,
3073
                                 regs, loop_vars);
3074
3075
            /* the next state is the second branch */
3076
            cur_state = tuple_arr[cur_state].next_state_2;
3077
3078
            /* set up at the original string position */
3079
            p.set((char *)entire_str + str_ofs);
3080
            curlen = entire_str_len - str_ofs;
3081
3082
            /* the second branch substring starts at the current character */
3083
            start_ofs = p.getptr() - entire_str;
3084
3085
            /* continue from here */
3086
            break;
3087
3088
        case ST_EPS2:
3089
            /* 
3090
             *   We're finishing the second branch of a two-branch epsilon.
3091
             *   First, get the two return values - the first branch is the
3092
             *   return value saved in the second-from-the-top stack frame,
3093
             *   and the second branch is the current return value.  
3094
             */
3095
            ret1 = stack_.get_frame(1)->retval;
3096
            ret2 = _retval_;
3097
            
3098
            /*
3099
             *   If they both failed, the whole thing failed.  Otherwise,
3100
             *   return the longer or shorter of the two, depending on the
3101
             *   current match mode, plus the length we ourselves matched
3102
             *   previously.  Note that we return the register set from the
3103
             *   winning match.  
3104
             */
3105
            if (ret1 < 0 && ret2 < 0)
3106
            {
3107
                /* 
3108
                 *   They both failed - backtrack to the initial state by
3109
                 *   popping the second branch's frame (which stores the
3110
                 *   branch initial state) and discarding the first branch's
3111
                 *   frame (which stores the first branch's *final* state).
3112
                 *   Note that the only thing we care about from the stack
3113
                 *   is the group registers and loop variables; everything
3114
                 *   else will be restored from the enclosing frame that
3115
                 *   we're returning to.  
3116
                 */
3117
                stack_.pop(&start_ofs, &str_ofs, &cur_state, &final_state,
3118
                           regs, loop_vars);
3119
                stack_.discard();
3120
3121
                /* set the string pointer */
3122
                p.set((char *)entire_str + str_ofs);
3123
                curlen = entire_str_len - str_ofs;
3124
3125
                /* return failure */
3126
                local_return(-1);
3127
            }
3128
3129
            /* 
3130
             *   Choose the winner based on the match mode.  The match mode
3131
             *   of interest is the one in the original two-way epsilon,
3132
             *   which is the state in the stack frame at the top of the
3133
             *   stack.  
3134
             */
3135
            if (pattern->longest_match
3136
                && !(tuple_arr[stack_.get_frame(0)->state].flags
3137
                     & RE_STATE_SHORTEST))
3138
            {
3139
                /* 
3140
                 *   longest match - choose ret1 if ret2 isn't a good match,
3141
                 *   or ret1 is longer than ret2 
3142
                 */
3143
                ret1_is_winner = (ret2 < 0 || ret1 >= ret2);
3144
            }
3145
            else
3146
            {
3147
                /* 
3148
                 *   shortest match - choose ret1 if it's the only good
3149
                 *   match, or if they're both good and it's the shorter one 
3150
                 */
3151
                ret1_is_winner = ((ret1 >= 0 && ret2 < 0)
3152
                                  || (ret1 >= 0 && ret2 >= 0
3153
                                      && ret1 <= ret2));
3154
            }
3155
            
3156
            /* choose the winner */
3157
            if (ret1_is_winner)
3158
            {
3159
                /*
3160
                 *   The winner is the first branch.  First, pop the second
3161
                 *   branch's initial state, since this is the baseline for
3162
                 *   the first branch's final state. 
3163
                 */
3164
                stack_.pop(&start_ofs, &str_ofs, &cur_state, &final_state,
3165
                           regs, loop_vars);
3166
3167
                /* add in the length up to the initial state at the epsilon */
3168
                ret1 += str_ofs - start_ofs;
3169
3170
                /*
3171
                 *   Second, pop the first branch's final state.  This is
3172
                 *   stored relative to the initial state of the branch, so
3173
                 *   we're ready to restore it now. 
3174
                 */
3175
                stack_.pop(&start_ofs, &str_ofs, &cur_state, &final_state,
3176
                           regs, loop_vars);
3177
3178
                /* set the string pointer */
3179
                p.set((char *)entire_str + str_ofs);
3180
                curlen = entire_str_len - str_ofs;
3181
3182
                /* return the result */
3183
                local_return(ret1);
3184
            }
3185
            else
3186
            {
3187
                regex_stack_entry *fp;
3188
                
3189
                /*
3190
                 *   The winner is the second branch.  The current machine
3191
                 *   state is the final state for the second branch, so we
3192
                 *   simply need to discard the saved state for the two
3193
                 *   branches.  
3194
                 */
3195
3196
                /* add in the length up to the initial state at the epsilon */
3197
                fp = stack_.get_frame(0);
3198
                ret2 += fp->str_ofs - fp->start_ofs;
3199
3200
                /* discard the two branch states */
3201
                stack_.discard();
3202
                stack_.discard();
3203
                
3204
                /* return the result */
3205
                local_return(ret2);
3206
            }
3207
3208
        case ST_ASSERT:
3209
            /*
3210
             *   We're finishing an assertion sub-machine.  First, pop the
3211
             *   machine state to get back to where we were.  
3212
             */
3213
            stack_.pop(&start_ofs, &str_ofs, &cur_state, &final_state,
3214
                       regs, loop_vars);
3215
3216
            /* 
3217
             *   set the string pointer and remaining length for the string
3218
             *   offset we popped from the state 
3219
             */
3220
            p.set((char *)entire_str + str_ofs);
3221
            curlen = entire_str_len - str_ofs;
3222
3223
            /* 
3224
             *   If this is a positive assertion and it didn't match, return
3225
             *   failure; if this is a negative assertion and it did match,
3226
             *   return failure; otherwise, proceed without having consumed
3227
             *   any input 
3228
             */
3229
            if (tuple_arr[cur_state].typ == RE_ASSERT_POS
3230
                ? _retval_ < 0 : _retval_ >= 0)
3231
            {
3232
                local_return(-1);
3233
            }
3234
3235
            /* resume where we left off */
3236
            cur_state = tuple_arr[cur_state].next_state_1;
3237
            break;
3238
        }
3239
3240
        /* resume from here */
3241
        goto got_next_state;
3242
    }
3243
}
3244
3245
/* ------------------------------------------------------------------------ */
3246
/*
3247
 *   Search for a regular expression within a string.  Returns -1 if the
3248
 *   string cannot be found, otherwise returns the offset from the start
3249
 *   of the string to be searched of the start of the first match for the
3250
 *   pattern.  
3251
 */
3252
int CRegexSearcher::search(const char *entirestr, const char *str, size_t len,
3253
                           const re_compiled_pattern_base *pattern,
3254
                           const re_tuple *tuple_arr,
3255
                           const re_machine *machine, re_group_register *regs,
3256
                           int *result_len)
3257
{
3258
    utf8_ptr p;
3259
    re_group_register best_match_regs[RE_GROUP_REG_CNT];
3260
    short loop_vars[RE_LOOP_VARS_MAX];
3261
    int best_match_len;
3262
    int best_match_start;
3263
    const char *max_start_pos;
3264
3265
    /* we don't have a match yet */
3266
    best_match_len = -1;
3267
    best_match_start = -1;
3268
3269
    /* search the entire string */
3270
    max_start_pos = str + len;
3271
    
3272
    /*
3273
     *   Starting at the first character in the string, search for the
3274
     *   pattern at each subsequent character until we either find the
3275
     *   pattern or run out of string to test. 
3276
     */
3277
    for (p.set((char *)str) ; p.getptr() < max_start_pos ; p.inc(&len))
3278
    {
3279
        int matchlen;
3280
        
3281
        /* check for a match */
3282
        matchlen = match(entirestr, p.getptr(), len,
3283
                         pattern, tuple_arr, machine, regs, loop_vars);
3284
        if (matchlen >= 0)
3285
        {
3286
            /* check our first-begin/first-end mode */
3287
            if (pattern->first_begin)
3288
            {
3289
                /*   
3290
                 *   We're in first-begin mode: return immediately,
3291
                 *   because we want to find the match that starts at the
3292
                 *   earliest point in the string; having found a match
3293
                 *   here, there's no point in looking for a later match,
3294
                 *   since it obviously won't start any earlier than this
3295
                 *   one does.  
3296
                 */
3297
                *result_len = matchlen;
3298
                return p.getptr() - str;
3299
            }
3300
            else
3301
            {
3302
                int keep;
3303
                
3304
                /*
3305
                 *   We're in first-end mode.  We can't return yet,
3306
                 *   because we might find something that ends before this
3307
                 *   string does.
3308
                 *   
3309
                 *   If this is the first or best match so far, note it;
3310
                 *   otherwise, ignore it.  In any case, continue looking.
3311
                 */
3312
                if (best_match_len == -1)
3313
                {
3314
                    /* 
3315
                     *   this is our first match - it's definitely the
3316
                     *   best so far 
3317
                     */
3318
                    keep = TRUE;
3319
                }
3320
                else
3321
                {
3322
                    int end_idx;
3323
                    
3324
                    /* calculate the ending index of this match */
3325
                    end_idx = (p.getptr() - str) + matchlen;
3326
3327
                    /* see what we have */
3328
                    if (end_idx < best_match_start + best_match_len)
3329
                    {
3330
                        /* 
3331
                         *   this one ends earlier than the previous best
3332
                         *   match so far -- we definitely want to keep
3333
                         *   this one 
3334
                         */
3335
                        keep = TRUE;
3336
                    }
3337
                    else if (end_idx == best_match_start + best_match_len)
3338
                    {
3339
                        /* 
3340
                         *   This one ends at exactly the same point as
3341
                         *   our best match so far.  Decide which one to
3342
                         *   keep based on the lengths.  If we're in
3343
                         *   longest-match mode, keep the longer one,
3344
                         *   otherwise keep the shorter one. 
3345
                         */
3346
                        if (pattern->longest_match)
3347
                        {
3348
                            /* 
3349
                             *   longest-match mode - keep the old one,
3350
                             *   since it's longer (it starts earlier and
3351
                             *   ends at the same point) 
3352
                             */
3353
                            keep = FALSE;
3354
                        }
3355
                        else
3356
                        {
3357
                            /* 
3358
                             *   shortest-match mode - keep the new one,
3359
                             *   since it's shorter (it starts later and
3360
                             *   ends at the same point) 
3361
                             */
3362
                            keep = TRUE;
3363
                        }
3364
                    }
3365
                    else
3366
                    {
3367
                        /* 
3368
                         *   this one isn't as good as the previous best
3369
                         *   -- ignore this one and keep our previous
3370
                         *   winner 
3371
                         */
3372
                        keep = FALSE;
3373
                    }
3374
                }
3375
                
3376
                /* if we're keeping this match, save it */
3377
                if (keep)
3378
                {
3379
                    /* save the start index, length, and register contents */
3380
                    best_match_start = p.getptr() - str;
3381
                    best_match_len = matchlen;
3382
                    memcpy(best_match_regs, regs, sizeof(best_match_regs));
3383
3384
                    /*
3385
                     *   There's no point in looking for strings that
3386
                     *   start beyond the end of the current match,
3387
                     *   because they certainly won't end before this
3388
                     *   match ends.  So, adjust our end-of-loop marker to
3389
                     *   point to the next character past the end of our
3390
                     *   match. 
3391
                     */
3392
                    max_start_pos = p.getptr() + matchlen;
3393
                }
3394
            }
3395
        }
3396
    }
3397
3398
    /* if we found a previous match, return it */
3399
    if (best_match_len != -1)
3400
    {
3401
        /* set the caller's match length */
3402
        *result_len = best_match_len;
3403
3404
        /* copy the saved match registers into the caller's registers */
3405
        memcpy(regs, best_match_regs, sizeof(best_match_regs));
3406
3407
        /* return the starting index of the match */
3408
        return best_match_start;
3409
    }
3410
3411
    /* we didn't find a match */
3412
    return -1;
3413
}
3414
3415
/* ------------------------------------------------------------------------ */
3416
/*
3417
 *   Search for a previously-compiled pattern within the given string.
3418
 *   Returns the offset of the match, or -1 if no match was found.  
3419
 */
3420
int CRegexSearcher::search_for_pattern(
3421
    const re_compiled_pattern *pattern,
3422
    const char *entirestr, const char *searchstr, size_t searchlen,
3423
    int *result_len, re_group_register *regs)
3424
{
3425
    /* 
3426
     *   search for the pattern in our copy of the string - use the copy so
3427
     *   that the group registers stay valid even if the caller deallocates
3428
     *   the original string after we return 
3429
     */
3430
    return search(entirestr, searchstr, searchlen, pattern, pattern->tuples,
3431
                  &pattern->machine, regs, result_len);
3432
}
3433
3434
/* ------------------------------------------------------------------------ */
3435
/*
3436
 *   Check for a match to a previously compiled expression.  Returns the
3437
 *   length of the match if we found a match, -1 if we found no match.  This
3438
 *   is not a search function; we merely match the leading substring of the
3439
 *   given string to the given pattern.  
3440
 */
3441
int CRegexSearcher::match_pattern(
3442
    const re_compiled_pattern *pattern,
3443
    const char *entirestr, const char *searchstr, size_t searchlen, 
3444
    re_group_register *regs)
3445
{
3446
    short loop_vars[RE_LOOP_VARS_MAX];
3447
3448
    /* match the string */
3449
    return match(entirestr, searchstr, searchlen,
3450
                 pattern, pattern->tuples, &pattern->machine,
3451
                 regs, loop_vars);
3452
}
3453
3454
/* ------------------------------------------------------------------------ */
3455
/*
3456
 *   Compile an expression and check for a match.  Returns the length of the
3457
 *   match if we found a match, -1 if we found no match.  This is not a
3458
 *   search function; we merely match the leading substring of the given
3459
 *   string to the given pattern.  
3460
 *   
3461
 *   If a pattern pattern will be used repeatedly in multiple searches or
3462
 *   matches, it is more efficient to compile the pattern once and then
3463
 *   re-use the compiled pattern for each search or match, since compiling a
3464
 *   pattern takes time.  
3465
 */
3466
int CRegexSearcherSimple::compile_and_match(
3467
    const char *patstr, size_t patlen,
3468
    const char *entirestr, const char *searchstr, size_t searchlen)
3469
{
3470
    re_compiled_pattern_base pat;
3471
    short loop_vars[RE_LOOP_VARS_MAX];
3472
3473
    /* no groups yet */
3474
    group_cnt_ = 0;
3475
3476
    /* clear the group registers */
3477
    clear_group_regs(regs_);
3478
3479
    /* compile the expression - return failure if we get an error */
3480
    if (parser_->compile(patstr, patlen, &pat) != RE_STATUS_SUCCESS)
3481
        return FALSE;
3482
3483
    /* remember the group count from the compiled pattern */
3484
    group_cnt_ = pat.group_cnt;
3485
3486
    /* match the string */
3487
    return match(entirestr, searchstr, searchlen,
3488
                 &pat, parser_->tuple_arr_, &pat.machine, regs_, loop_vars);
3489
}
3490
3491
/* ------------------------------------------------------------------------ */
3492
/*
3493
 *   Compile an expression and search for a match within the given string.
3494
 *   Returns the offset of the match, or -1 if no match was found.
3495
 *   
3496
 *   If a pattern will be used repeatedly in multiple searches or matches, it
3497
 *   is more efficient to compile the pattern once, using compile_pattern(),
3498
 *   and then re-use the compiled pattern for each search, since compiling a
3499
 *   pattern takes time.  
3500
 */
3501
int CRegexSearcherSimple::compile_and_search(
3502
    const char *patstr, size_t patlen,
3503
    const char *entirestr, const char *searchstr, size_t searchlen,
3504
    int *result_len)
3505
{
3506
    re_compiled_pattern_base pat;
3507
3508
    /* no groups yet */
3509
    group_cnt_ = 0;
3510
3511
    /* clear the group registers */
3512
    clear_group_regs(regs_);
3513
3514
    /* compile the expression - return failure if we get an error */
3515
    if (parser_->compile(patstr, patlen, &pat) != RE_STATUS_SUCCESS)
3516
        return -1;
3517
3518
    /* remember the group count from the compiled pattern */
3519
    group_cnt_ = pat.group_cnt;
3520
3521
    /* 
3522
     *   search for the pattern in our copy of the string - use the copy so
3523
     *   that the group registers stay valid even if the caller deallocates
3524
     *   the original string after we return 
3525
     */
3526
    return search(entirestr, searchstr, searchlen,
3527
                  &pat, parser_->tuple_arr_, &pat.machine,
3528
                  regs_, result_len);
3529
}