cfad47cfa3/tads3/vmregex.h

4b825dc642cb6eb9a060e54bf8d69288fbee4904cfad47cfa334b206c65f22086bcc5d63e6f70944
1
/* 
2
 *   Copyright (c) 1998, 2002 Michael J. Roberts.  All Rights Reserved.
3
 *   
4
 *   Please see the accompanying license file, LICENSE.TXT, for information
5
 *   on using and copying this software.  
6
 */
7
/*
8
Name
9
  vmregex.h - regular expression parser for T3
10
Function
11
  
12
Notes
13
  Adapted from the TADS 2 regular expression parser.  This version
14
  uses UTF-8 strings rather than simple single-byte character strings,
15
  and is organized into a C++ class.
16
Modified
17
  04/11/99 CNebel     - Fix warnings.
18
  10/07/98 MJRoberts  - Creation
19
*/
20
21
#ifndef VMREGEX_H
22
#define VMREGEX_H
23
24
#include <stdlib.h>
25
#include "t3std.h"
26
#include "vmuni.h"
27
#include "utf8.h"
28
#include "vmerr.h"
29
#include "vmerrnum.h"
30
31
32
/* state ID */
33
typedef int re_state_id;
34
35
/* invalid state ID - used to mark null machines */
36
#define RE_STATE_INVALID   ((re_state_id)-1)
37
38
/* first valid state ID */
39
#define RE_STATE_FIRST_VALID  ((re_state_id)0)
40
41
42
/* ------------------------------------------------------------------------ */
43
/*
44
 *   Group register structure.  Each register keeps track of the starting
45
 *   and ending offset of the group's text within the original search
46
 *   string.  
47
 */
48
struct re_group_register
49
{
50
    int start_ofs;
51
    int end_ofs;
52
};
53
54
/* maximum number of group registers we keep */
55
#define RE_GROUP_REG_CNT  10
56
57
/* maximum group nesting depth */
58
#define RE_GROUP_NESTING_MAX  20
59
60
/* 
61
 *   the maximum number of separate loop variables we need is the same as
62
 *   the group nesting level, since we only need one loop variable per
63
 *   nested group 
64
 */
65
#define RE_LOOP_VARS_MAX  RE_GROUP_NESTING_MAX
66
67
68
69
/* ------------------------------------------------------------------------ */
70
/*
71
 *   Recognizer types. 
72
 */
73
74
enum re_recog_type
75
{
76
    /* invalid/uninitialized */
77
    RE_INVALID,
78
79
    /* literal character recognizer */
80
    RE_LITERAL,
81
82
    /* "epsilon" recognizer - match without consuming anything */
83
    RE_EPSILON,
84
85
    /* wildcard character */
86
    RE_WILDCARD,
87
88
    /* beginning and end of text */
89
    RE_TEXT_BEGIN,
90
    RE_TEXT_END,
91
92
    /* start and end of a word */
93
    RE_WORD_BEGIN,
94
    RE_WORD_END,
95
96
    /* word-char and non-word-char */
97
    RE_WORD_CHAR,
98
    RE_NON_WORD_CHAR,
99
100
    /* word-boundary and non-word-boundary */
101
    RE_WORD_BOUNDARY,
102
    RE_NON_WORD_BOUNDARY,
103
104
    /* a character range/exclusion range */
105
    RE_RANGE,
106
    RE_RANGE_EXCL,
107
108
    /* group entry/exit transition */
109
    RE_GROUP_ENTER,
110
    RE_GROUP_EXIT,
111
112
    /* 
113
     *   group matcher - the character code has the group number (0 for
114
     *   group 0, etc) rather than a literal to match 
115
     */
116
    RE_GROUP_MATCH,
117
118
    /* any alphabetic character */
119
    RE_ALPHA,
120
121
    /* any digit */
122
    RE_DIGIT,
123
124
    /* any upper-case alphabetic */
125
    RE_UPPER,
126
127
    /* any lower-case alphabetic */
128
    RE_LOWER,
129
130
    /* any alphanumeric */
131
    RE_ALPHANUM,
132
133
    /* space character */
134
    RE_SPACE,
135
136
    /* punctuation character */
137
    RE_PUNCT,
138
139
    /* newline character */
140
    RE_NEWLINE,
141
142
    /* null character (used in range recognizers) */
143
    RE_NULLCHAR,
144
145
    /* positive assertion */
146
    RE_ASSERT_POS,
147
148
    /* negative assertion */
149
    RE_ASSERT_NEG,
150
151
    /* loop entry: zero the associated loop variable */
152
    RE_ZERO_VAR,
153
154
    /* loop branch: inspect loop criteria and branch accordingly */
155
    RE_LOOP_BRANCH
156
};
157
158
159
/* ------------------------------------------------------------------------ */
160
/* 
161
 *   Denormalized state transition tuple.  Each tuple represents the
162
 *   complete set of transitions out of a particular state.  A particular
163
 *   state can have one character transition, or two epsilon transitions.
164
 *   Note that we don't need to store the state ID of the tuple itself in
165
 *   the tuple, because the state ID is the index of the tuple in an array
166
 *   of state tuples.  
167
 */
168
struct re_tuple
169
{
170
    /* recognizer type */
171
    re_recog_type typ;
172
173
    /* the character we must match to transition to the target state */
174
    union
175
    {
176
        /* 
177
         *   if this is a character transition, this is the character (used
178
         *   as the character literal in RE_LITERAL, and as the group ID in
179
         *   RE_GROUP_MATCH and in RE_EPSILON nodes with the group flag set) 
180
         */
181
        wchar_t ch;
182
183
        /* 
184
         *   if this has a sub-machine, this is the start and end info (used
185
         *   for RE_ASSERT_POS, RE_ASSERT_NEG) 
186
         */
187
        struct
188
        {
189
            re_state_id init;
190
            re_state_id final;
191
        } sub;
192
193
        /* 
194
         *   if this is a loop, the loop parameters (used for RE_ZERO_VAR,
195
         *   RE_LOOP_BRANCH) 
196
         */
197
        struct
198
        {
199
            int loop_min;
200
            int loop_max;
201
            int loop_var;
202
        } loop;
203
204
        /* 
205
         *   Character range match table - this is used if the recognizer
206
         *   type is RE_RANGE or RE_RANGE_EXCL; for other recognizer types,
207
         *   this is not used.
208
         *   
209
         *   If used, this is an array of pairs of characters.  In each pair,
210
         *   the first is the low end of the range, and the second is the
211
         *   high end of the range, both ends inclusive.  A single character
212
         *   takes up two entries, both identical, to specify a range of only
213
         *   one character.
214
         *   
215
         *   If the first character is '\0', then neither wchar_t is a
216
         *   character in the ordinary sense described above.  Instead, the
217
         *   second wchar_t is actually one of the recognizer type codes
218
         *   (re_recog_type) for a character class (RE_ALPHA, RE_DIGIT, etc).
219
         *   The pair in this case is to be taken to match (or exclude) the
220
         *   entire class.
221
         *   
222
         *   To represent a match for '\0', use '\0' for the first wchar_t
223
         *   and RE_NULLCHAR for the second wchar_t.  Note that the special
224
         *   meaning of '\0' in the first character of a pair makes it
225
         *   impossible to represent a range including a null byte with a
226
         *   single pair; instead, representing a range like [\000-\017]
227
         *   requires two pairs: the first pair is ('\0', RE_NULLCHAR), and
228
         *   the second pair is ('\001', '\017').  
229
         */
230
        struct
231
        {
232
            wchar_t *char_range;
233
            size_t char_range_cnt;
234
        } range;
235
236
    } info;
237
238
    /* the target states */
239
    re_state_id next_state_1;
240
    re_state_id next_state_2;
241
242
    /* flags */
243
    unsigned char flags;
244
};
245
246
247
/*
248
 *   Tuple flags 
249
 */
250
251
/* this state is being tested for a cycle */
252
#define RE_STATE_CYCLE_TEST   0x08
253
254
/* 
255
 *   for branching states: take the shortest, rather than longest, branch
256
 *   when both branches are successful 
257
 */
258
#define RE_STATE_SHORTEST     0x10
259
260
261
/* ------------------------------------------------------------------------ */
262
/*
263
 *   A "machine" description.  A machines is fully described by its initial
264
 *   and final state ID's.  
265
 */
266
struct re_machine
267
{
268
    /* the machine's initial state */
269
    re_state_id init;
270
271
    /* the machine's final state */
272
    re_state_id final;
273
};
274
275
276
/* ------------------------------------------------------------------------ */
277
/*
278
 *   Compiled pattern description.  This is not a complete compiled pattern,
279
 *   since the tuple array is separate; this is just a description of the
280
 *   compiled pattern that can be combined with the tuple array to form a
281
 *   full compiled pattern.
282
 */
283
struct re_compiled_pattern_base
284
{
285
    /* the pattern's machine description */
286
    re_machine machine;
287
288
    /* the number of tuples in the tuple array */
289
    re_state_id tuple_cnt;
290
291
    /* number of capturing groups in the expression */
292
    int group_cnt;
293
294
    /* maximum number of looping variables in the expression */
295
    int loop_var_cnt;
296
297
    /*
298
     *   <Case> or <NoCase> mode.  If this flag is clear, the search is not
299
     *   case-sensitive, so alphabetic characters in the pattern are matched
300
     *   without regard to case.  
301
     */
302
    int case_sensitive : 1;
303
304
    /* 
305
     *   <MIN> or <MAX> match mode -- if this flag is set, we match the
306
     *   longest string in case of ambiguity; otherwise we match the
307
     *   shortest.  
308
     */
309
    int longest_match : 1;
310
311
    /* 
312
     *   <FirstEnd> or <FirstBeg> match mode -- if this flag is set, we
313
     *   match (in a search) the string that starts first in case of
314
     *   ambiguity; otherwise, we match the string that ends first 
315
     */
316
    int first_begin : 1;
317
};
318
319
/*
320
 *   Compiled pattern object.  This is a pattern compiled and saved for use
321
 *   in searches and matches.  This is a compiled pattern description
322
 *   coupled with its tuple array, which in combination provide a complete
323
 *   compiled pattern.  
324
 */
325
struct re_compiled_pattern: re_compiled_pattern_base
326
{
327
    /* 
328
     *   the tuple array (the structure is overallocated to make room for
329
     *   tuple_cnt entries in this array) 
330
     */
331
    re_tuple tuples[1];
332
};
333
334
/* ------------------------------------------------------------------------ */
335
/*
336
 *   Status codes 
337
 */
338
typedef enum
339
{
340
    /* success */
341
    RE_STATUS_SUCCESS = 0,
342
343
    /* compilation error - group nesting too deep */
344
    RE_STATUS_GROUP_NESTING_TOO_DEEP
345
346
} re_status_t;
347
348
349
/* ------------------------------------------------------------------------ */
350
/*
351
 *   Regular expression compilation context structure.  This tracks the
352
 *   state of the compilation and stores the resources associated with the
353
 *   compiled expression.  
354
 */
355
class CRegexParser
356
{
357
    friend class CRegexSearcher;
358
    friend class CRegexSearcherSimple;
359
        
360
public:
361
    /* initialize */
362
    CRegexParser();
363
364
    /* delete */
365
    ~CRegexParser();
366
367
    /* 
368
     *   Compile an expression and create a compiled pattern object, filling
369
     *   in *pattern with a pointer to the newly-allocated pattern object.
370
     *   The caller is responsible for freeing the pattern by calling
371
     *   free_pattern(pattern).  
372
     */
373
    re_status_t compile_pattern(const char *expr_str, size_t exprlen,
374
                                re_compiled_pattern **pattern);
375
376
    /* free a pattern previously created with compile_pattern() */
377
    static void free_pattern(re_compiled_pattern *pattern);
378
379
    /* set the default case sensitivity */
380
    void set_default_case_sensitive(int f) { default_case_sensitive_ = f; }
381
382
protected:
383
    /* reset the parser */
384
    void reset();
385
386
    /* allocate a new state ID */
387
    re_state_id alloc_state();
388
389
    /* set a transition from a state to a given destination state */
390
    void set_trans(re_state_id id, re_state_id dest_id,
391
                   re_recog_type typ, wchar_t ch);
392
393
    /* initialize a new machine, setting up the initial and final state */
394
    void init_machine(struct re_machine *machine);
395
396
    /* build a character recognizer */
397
    void build_char(struct re_machine *machine, wchar_t ch);
398
399
    /* build a special recognizer */
400
    void build_special(struct re_machine *machine,
401
                       re_recog_type typ, wchar_t ch);
402
403
    /* build a character range recognizer */
404
    void build_char_range(struct re_machine *machine, int exclusion);
405
406
    /* build a group recognizer */
407
    void build_group_matcher(struct re_machine *machine, int group_num);
408
409
    /* build a concatenation recognizer */
410
    void build_concat(struct re_machine *new_machine,
411
                      struct re_machine *lhs, struct re_machine *rhs);
412
413
    /* build a group machine */
414
    void build_group(struct re_machine *new_machine,
415
                     struct re_machine *sub_machine, int group_id);
416
417
    /* build a positive or negative assertion machine */
418
    void build_assert(struct re_machine *new_machine,
419
                      struct re_machine *sub_machine, int is_negative);
420
421
    /* build an alternation recognizer */
422
    void build_alter(struct re_machine *new_machine,
423
                     struct re_machine *lhs, struct re_machine *rhs);
424
425
    /* build a closure recognizer */
426
    void build_closure(struct re_machine *new_machine,
427
                       struct re_machine *sub, wchar_t specifier,
428
                       int shortest);
429
430
    /* build an interval matcher */
431
    void build_interval(struct re_machine *new_machine,
432
                        struct re_machine *sub, int min_val, int max_val,
433
                        int var_id, int shortest);
434
435
    /* build a null machine */
436
    void build_null_machine(struct re_machine *machine);
437
438
    /* determine if a machine is null */
439
    int is_machine_null(struct re_machine *machine);
440
441
    /* concate the second machine onto the first machine */
442
    void concat_onto(struct re_machine *dest, struct re_machine *rhs);
443
444
    /* alternate the second machine onto the first */
445
    void alternate_onto(struct re_machine *dest, struct re_machine *rhs);
446
447
    /* compile an expression */
448
    re_status_t compile(const char *expr_str, size_t exprlen,
449
                        re_compiled_pattern_base *pat);
450
451
    /* compile a character class or class range expression */
452
    int compile_char_class_expr(utf8_ptr *expr, size_t *exprlen,
453
                                re_machine *result_machine);
454
455
    /* parse an integer value */
456
    int parse_int(utf8_ptr *p, size_t *rem);
457
458
    /* add a character to our range buffer */
459
    void add_range_char(wchar_t ch) { add_range_char(ch, ch); }
460
    void add_range_char(wchar_t ch_lo, wchar_t ch_hi);
461
462
    /* add a character class to our range buffer */
463
    void add_range_class(re_recog_type cl);
464
465
    /* ensure space in the range buffer for another entry */
466
    void ensure_range_buf_space();
467
468
    /* break any infinite loops in the machine */
469
    void break_loops(re_machine *machine);
470
471
    /* find an infinite loop back to the given state */
472
    int find_loop(re_machine *machine, re_state_id cur_state);
473
474
    /* optimize away meaningless branch-to-branch transitions */
475
    void remove_branch_to_branch(re_machine *machine);
476
    void optimize_transition(const re_machine *machine, re_state_id *trans);
477
478
    /* next available state ID */
479
    re_state_id next_state_;
480
481
    /*
482
     *   The array of transition tuples.  We'll allocate this array and
483
     *   expand it as necessary.  
484
     */
485
    re_tuple *tuple_arr_;
486
487
    /* number of transition tuples allocated in the array */
488
    int tuples_alloc_;
489
490
    /* buffer for building range exprssions */
491
    wchar_t *range_buf_;
492
493
    /* current number of entries in range buffer */
494
    size_t range_buf_cnt_;
495
496
    /* maximum number of entries in range buffer */
497
    size_t range_buf_max_;
498
499
    /* are our expressions case-sensitive by default? */
500
    int default_case_sensitive_;
501
};
502
503
/* ------------------------------------------------------------------------ */
504
/*
505
 *   Pattern recognizer state stack.  Each time we need to process a
506
 *   sub-state (a two-way epsilon, or a nested assertion), we stack the
507
 *   current state so that we can backtrack when we're done with the
508
 *   sub-expression.  The state we store consists of:
509
 *   
510
 *   - backtrack type - this is an arbitrary uchar identifier that the
511
 *   pattern matcher uses to identify where to go when we pop the state
512
 *   
513
 *   - the current state ID
514
 *   
515
 *   - the current offset in the string being matched
516
 *   
517
 *   - the state ID of the terminating state of the machine
518
 *   
519
 *   - saved group registers; we only store the ones we've actually
520
 *   modified, to avoid unnecessary copying
521
 *   
522
 *   - saved loop variables; we only store the ones we've actually modified,
523
 *   to avoid unnecessary copying 
524
 */
525
526
/* the base structure for a stacked state */
527
struct regex_stack_entry
528
{
529
    /* the backtrack type identifier */
530
    short typ;
531
532
    /* the starting offset in the string */
533
    int start_ofs;
534
535
    /* the current offset in the string */
536
    int str_ofs;
537
538
    /* the pattern state */
539
    re_state_id state;
540
541
    /* the final state of the machine */
542
    re_state_id final;
543
544
    /* the return value for this state */
545
    int retval;
546
547
    /* stack offset of previous frame */
548
    int prv_sp;
549
};
550
551
/* saved group/loop entry */
552
struct regex_stack_var
553
{
554
    /* 
555
     *   The ID - this is in the range 0..RE_GROUP_REG_CNT-1 for group
556
     *   registers, RE_GROUP_REG_CNT..RE_GROUP_REG_CNT+RE_LOOP_VARS_MAX-1
557
     *   for loop variables.  In other words, a loop variable is identified
558
     *   by its loop variable number plus RE_GROUP_REG_CNT.  The special ID
559
     *   value -1 indicates the integer 'retval' value (a saved return value
560
     *   for the stack state).  
561
     */
562
    int id;
563
564
    /* the value */
565
    union
566
    {
567
        re_group_register group;
568
        short loopvar;
569
        int retval;
570
    } val;
571
};
572
573
/* state stack class */
574
class CRegexStack
575
{
576
public:
577
    CRegexStack()
578
    {
579
        /* allocate the initial state buffer */
580
        bufsiz_ = 8192;
581
        buf_ = (char *)t3malloc(bufsiz_);
582
583
        /* we don't have anything on the stack yet */
584
        sp_ = -1;
585
        used_ = 0;
586
    }
587
588
    ~CRegexStack()
589
    {
590
        /* delete the stack buffer */
591
        t3free(buf_);
592
    }
593
594
    /* reset the stack */
595
    void reset()
596
    {
597
        /* empty the stack */
598
        sp_ = -1;
599
        used_ = 0;
600
    }
601
602
    /* push a new state */
603
    void push(ushort typ, int start_ofs, int str_ofs,
604
              re_state_id state, re_state_id final)
605
    {
606
        regex_stack_entry *fp;
607
608
        /* 
609
         *   Ensure we have enough space for the base state structure plus a
610
         *   full complement of group registers, loop variables, and return
611
         *   value.  We might not actually need all of the registers and
612
         *   loop variables, so we won't commit all of this space yet, but
613
         *   check in advance to make sure we have it so that we don't have
614
         *   to check again when and if we get around to consuming
615
         *   group/loop slots.  
616
         */
617
        ensure_space(sizeof(regex_stack_entry)
618
                     + ((RE_GROUP_REG_CNT + RE_LOOP_VARS_MAX + 1)
619
                        *sizeof(regex_stack_var)));
620
621
        /* allocate the base stack frame */
622
        fp = (regex_stack_entry *)alloc_space(sizeof(regex_stack_entry));
623
624
        /* set it up */
625
        fp->typ = typ;
626
        fp->start_ofs = start_ofs;
627
        fp->str_ofs = str_ofs;
628
        fp->state = state;
629
        fp->final = final;
630
631
        /* push it onto the stack */
632
        fp->prv_sp = sp_;
633
        sp_ = (char *)fp - buf_;
634
    }
635
636
    /* save a group register */
637
    void save_group_reg(int id, const re_group_register *regs)
638
    {
639
        regex_stack_var *var;
640
641
        /* allocate a new slot if needed and save the value */
642
        if (sp_ != -1 && (var = new_reg_or_var(id)) != 0)
643
            var->val.group = regs[id];
644
    }
645
646
    /* save a loop variable */
647
    void save_loop_var(int id, const short *loop_vars)
648
    {
649
        regex_stack_var *var;
650
651
        /* 
652
         *   allocate a new slot if needed and save the value; note that
653
         *   loop variables are identified by the loop variable ID plus the
654
         *   base index RE_GROUP_REG_CNT 
655
         */
656
        if (sp_ != -1 && (var = new_reg_or_var(id + RE_GROUP_REG_CNT)) != 0)
657
            var->val.loopvar = loop_vars[id];
658
    }
659
660
    /* 
661
     *   get the type of the state at top of stack; if there is no state,
662
     *   returns -1 
663
     */
664
    int get_top_type()
665
    {
666
        /* 
667
         *   if there's nothing on the stack, so indicate, otherwise get the
668
         *   type from the top stack element 
669
         */
670
        if (sp_ == -1)
671
            return -1;
672
        else
673
            return ((regex_stack_entry *)(buf_ + sp_))->typ;
674
    }
675
676
    /* get the stack frame at the given depth (0 is top of stack) */
677
    regex_stack_entry *get_frame(int depth)
678
    {
679
        regex_stack_entry *fp;
680
681
        /* traverse the given number of frames from the top of the stack */
682
        for (fp = (regex_stack_entry *)(buf_ + sp_) ; depth != 0 ; --depth)
683
            fp = (regex_stack_entry *)(buf_ + fp->prv_sp);
684
685
        /* return the frame pointer */
686
        return fp;
687
    }
688
689
    /* pop a state */
690
    void pop(int *start_ofs, int *str_ofs,
691
             re_state_id *state, re_state_id *final,
692
             re_group_register *regs, short *loop_vars)
693
    {
694
        regex_stack_entry *fp;
695
        regex_stack_var *var;
696
697
        /* get the stack pointer */
698
        fp = (regex_stack_entry *)(buf_ + sp_);
699
700
        /* restore the string offset and state ID */
701
        *start_ofs = fp->start_ofs;
702
        *str_ofs = fp->str_ofs;
703
        *state = fp->state;
704
        *final = fp->final;
705
        
706
        /* run through the saved registers/variables in the state */
707
        for (var = (regex_stack_var *)(fp + 1) ;
708
             var < (regex_stack_var *)(buf_ + used_) ; ++var)
709
        {
710
            /* sense the type */
711
            if (var->id < RE_GROUP_REG_CNT)
712
            {
713
                /* it's a group register */
714
                regs[var->id] = var->val.group;
715
            }
716
            else
717
            {
718
                /* it's a loop variable */
719
                loop_vars[var->id - RE_GROUP_REG_CNT] = var->val.loopvar;
720
            }
721
        }
722
723
        /* we're done with the stop stack frame, so discard it */
724
        discard();
725
    }
726
727
    /* discard the top stack state */
728
    void discard()
729
    {
730
        regex_stack_entry *fp;
731
732
        /* get the stack pointer */
733
        fp = (regex_stack_entry *)(buf_ + sp_);
734
735
        /* unwind the stack */
736
        used_ = (size_t)sp_;
737
        sp_ = fp->prv_sp;
738
    }
739
740
    /* 
741
     *   Save the current state at the top of the stack and push a new
742
     *   state.  This is used to traverse the second branch of a two-branch
743
     *   epsilon: we first have to save the results of the first branch,
744
     *   including the return value and its registers, and we then have to
745
     *   restore the initial register/loop state as it was before the first
746
     *   branch.
747
     *   
748
     *   We save the final state and restore the initial state by swapping
749
     *   the group registers in the saved state with those in the current
750
     *   state.  This brings back the initial conditions to the current
751
     *   machine state, while saving everything that's changed in the
752
     *   current machine state in the stack frame.  We'll likewise swap the
753
     *   machine state and string offset.  Later, this same final machine
754
     *   state can be restored by first restoring the machine state to the
755
     *   initial state, then popping this frame.
756
     *   
757
     *   On return, the stack frame that was active on entry will be set to
758
     *   contain the current machine state, and the current machine state
759
     *   will be replaced with what was in that stack frame.  In addition,
760
     *   we'll have pushed a new stack frame for the new current machine
761
     *   state.  
762
     */
763
    void save_and_push(int retval,
764
                       ushort typ, int *start_ofs, int *str_ofs,
765
                       re_state_id *state, re_state_id *final,
766
                       re_group_register *regs, short *loop_vars)
767
    {
768
        regex_stack_entry *fp;
769
        regex_stack_var *var;
770
        int tmp_ofs;
771
        re_state_id tmp_id;
772
773
        /* get the stack pointer */
774
        fp = (regex_stack_entry *)(buf_ + sp_);
775
776
        /* swap the string offset */
777
        tmp_ofs = *str_ofs;
778
        *str_ofs = fp->str_ofs;
779
        fp->str_ofs = tmp_ofs;
780
781
        /* swap the starting offset */
782
        tmp_ofs = *start_ofs;
783
        *start_ofs = fp->start_ofs;
784
        fp->start_ofs = tmp_ofs;
785
786
        /* swap the current machine state */
787
        tmp_id = *state;
788
        *state = fp->state;
789
        fp->state = tmp_id;
790
791
        /* swap the final machine state */
792
        tmp_id = *final;
793
        *final = fp->final;
794
        fp->final = tmp_id;
795
796
        /* swap all group and loop registers with the current state */
797
        for (var = (regex_stack_var *)(fp + 1) ;
798
             var < (regex_stack_var *)(buf_ + used_) ; ++var)
799
        {
800
            /* sense the type */
801
            if (var->id < RE_GROUP_REG_CNT)
802
            {
803
                re_group_register tmp;
804
                
805
                /* it's a group register */
806
                tmp = regs[var->id];
807
                regs[var->id] = var->val.group;
808
                var->val.group = tmp;
809
            }
810
            else
811
            {
812
                short tmp;
813
                
814
                /* it's a loop variable */
815
                tmp = loop_vars[var->id - RE_GROUP_REG_CNT];
816
                loop_vars[var->id - RE_GROUP_REG_CNT] = var->val.loopvar;
817
                var->val.loopvar = tmp;
818
            }
819
        }
820
821
        /* save the return value from the outgoing state */
822
        fp->retval = retval;
823
824
        /* push a copy of the restored state */
825
        push(typ, *start_ofs, *str_ofs, *state, *final);
826
    }
827
828
protected:
829
    /* 
830
     *   allocate a new register or group variable in the stack frame; if we
831
     *   find an existing copy of the same variable, we'll return null to
832
     *   indicate that we don't have to save it again 
833
     */
834
    regex_stack_var *new_reg_or_var(int id)
835
    {
836
        regex_stack_entry *fp;
837
        regex_stack_var *var;
838
839
        /* get the stack pointer */
840
        fp = (regex_stack_entry *)(buf_ + sp_);
841
842
        /* scan the frame for a register/variable with this ID */
843
        for (var = (regex_stack_var *)(fp + 1) ;
844
             var < (regex_stack_var *)(buf_ + used_) ; ++var)
845
        {
846
            /* if this is the one, we don't need to save it again */
847
            if (var->id == id)
848
                return 0;
849
        }
850
851
        /* we didn't find it, so return a new entry with the given ID */
852
        var = (regex_stack_var *)alloc_space(sizeof(regex_stack_var));
853
        var->id = id;
854
        return var;
855
    }
856
    
857
    /* ensure space in our stack buffer */
858
    void ensure_space(size_t siz)
859
    {
860
        /* if it's within range, we're fine */
861
        if (used_ + siz <= bufsiz_)
862
            return;
863
864
        /* expand */
865
        bufsiz_ += 8192;
866
867
        /* if it's too large, throw an error */
868
        if (bufsiz_ > OSMALMAX)
869
            err_throw(VMERR_OUT_OF_MEMORY);
870
871
        /* reallocate at the new size */
872
        buf_ = (char *)t3realloc(buf_, bufsiz_);
873
874
        /* make sure we're not out of memory */
875
        if (buf_ == 0)
876
            err_throw(VMERR_OUT_OF_MEMORY);
877
    }
878
879
    /* 
880
     *   allocate space - the caller must have already checked that space is
881
     *   available 
882
     */
883
    char *alloc_space(size_t siz)
884
    {
885
        char *ret;
886
887
        /* figure out where the new object goes */
888
        ret = buf_ + used_;
889
890
        /* consume the space */
891
        used_ += siz;
892
893
        /* return the allocated space */
894
        return ret;
895
    }
896
897
    /* the stack buffer */
898
    char *buf_;
899
    size_t bufsiz_;
900
901
    /* offset of current stack frame */
902
    int sp_;
903
904
    /* number of bytes used so far */
905
    size_t used_;
906
};
907
908
/* ------------------------------------------------------------------------ */
909
/*
910
 *   Regular Expression Searcher/Matcher.  This object encapsulates the
911
 *   group registers associated with a search.  
912
 */
913
class CRegexSearcher
914
{
915
public:
916
    CRegexSearcher();
917
    ~CRegexSearcher();
918
919
    /*
920
     *   Search for a compiled pattern.  Returns the byte offset of the
921
     *   match, or -1 if no match was found.  *result_len is filled in with
922
     *   the byte length of the match if we found one.  Note that the
923
     *   returned index and result_len values are byte lengths, not
924
     *   character lengths.
925
     *   
926
     *   The caller is responsible for providing a set of group registers,
927
     *   which must be an array of registers of size RE_GROUP_REG_CNT.  The
928
     *   caller also must save the original search string if it will be
929
     *   necessary to extract substrings based on the group registers.  
930
     */
931
    int search_for_pattern(const re_compiled_pattern *pattern,
932
                           const char *entirestr,
933
                           const char *searchstr, size_t searchlen,
934
                           int *result_len, re_group_register *regs);
935
936
    /*
937
     *   Check for a match to a previously compiled expression.  Returns the
938
     *   length of the match if we found a match, -1 if we found no match.
939
     *   This is not a search function; we merely match the leading
940
     *   substring of the given string to the given pattern.  Note that the
941
     *   returned length is a byte length, not a character length.  
942
     *   
943
     *   The caller is responsible for providing a set of group registers,
944
     *   which must be an array of registers of size RE_GROUP_REG_CNT.  The
945
     *   caller also must save the original search string if it will be
946
     *   necessary to extract substrings based on the group registers.  
947
     */
948
    int match_pattern(const re_compiled_pattern *pattern,
949
                      const char *entirestr,
950
                      const char *searchstr, size_t searchlen,
951
                      re_group_register *regs);
952
953
protected:
954
    /* match a string to a compiled expression */
955
    int match(const char *entire_str,
956
              const char *str, size_t origlen,
957
              const re_compiled_pattern_base *pattern,
958
              const re_tuple *tuple_arr,
959
              const struct re_machine *machine,
960
              re_group_register *regs, short *loop_vars);
961
962
    /* search for a regular expression within a string */
963
    int search(const char *entire_str,
964
               const char *str, size_t len,
965
               const re_compiled_pattern_base *pattern,
966
               const re_tuple *tuple_arr,
967
               const struct re_machine *machine,
968
               re_group_register *regs, int *result_len);
969
970
    /* clear a set of group registers */
971
    void clear_group_regs(re_group_register *regs)
972
    {
973
        int i;
974
        re_group_register *r;
975
976
        /* set the start and end offsets for all registers to -1 */
977
        for (r = regs, i = 0 ; i < RE_GROUP_REG_CNT ; ++i, ++r)
978
            r->start_ofs = r->end_ofs = -1;
979
    }
980
981
    /*
982
     *   Determine if a character is part of a word.  We consider letters
983
     *   and numbers to be word characters.  
984
     */
985
    static int is_word_char(wchar_t c)
986
    {
987
        return (t3_is_alpha(c) || t3_is_digit(c));
988
    }
989
990
    /* match state stack */
991
    CRegexStack stack_;
992
};
993
994
/*
995
 *   Simplified Searcher - this class provides some high-level methods that
996
 *   simplify one-off searches that combine compilation and searching into
997
 *   one step. 
998
 */
999
class CRegexSearcherSimple: public CRegexSearcher
1000
{
1001
public:
1002
    CRegexSearcherSimple(class CRegexParser *parser)
1003
    {
1004
        /* remember my parser */
1005
        parser_ = parser;
1006
    }
1007
1008
    ~CRegexSearcherSimple()
1009
    {
1010
    }
1011
1012
    /* search for a pattern, using our internal group registers */
1013
    int search_for_pattern(const re_compiled_pattern *pattern,
1014
                           const char *entirestr,
1015
                           const char *searchstr, size_t searchlen,
1016
                           int *result_len)
1017
    {
1018
        /* remember the group count from the compiled pattern */
1019
        group_cnt_ = pattern->group_cnt;
1020
1021
        /* clear the group registers */
1022
        clear_group_regs(regs_);
1023
1024
        /* search for the compiled pattern using our group register */
1025
        return CRegexSearcher::search_for_pattern(
1026
            pattern, entirestr, searchstr, searchlen, result_len, regs_);
1027
    }
1028
1029
    /* match a pattern, using our internal group registers */
1030
    int match_pattern(const re_compiled_pattern *pattern,
1031
                      const char *entirestr,
1032
                      const char *searchstr, size_t searchlen)
1033
    {
1034
        /* remember the group count from the compiled pattern */
1035
        group_cnt_ = pattern->group_cnt;
1036
1037
        /* clear the group registers */
1038
        clear_group_regs(regs_);
1039
1040
        /* search for the compiled pattern using our group register */
1041
        return CRegexSearcher::match_pattern(
1042
            pattern, entirestr, searchstr, searchlen, regs_);
1043
    }
1044
1045
    /*
1046
     *   Compile an expression and search for a match within the given
1047
     *   string.  Returns the byte offset of the match, or -1 if no match
1048
     *   was found.  *result_len is filled in with the byte length of the
1049
     *   match if we found one.  Note that the returned index and result_len
1050
     *   values are byte lengths, not character lengths.  
1051
     */
1052
    int compile_and_search(const char *pattern, size_t patlen,
1053
                           const char *entirestr,
1054
                           const char *searchstr, size_t searchlen,
1055
                           int *result_len);
1056
1057
    /*
1058
     *   Compile an expression and check for a match.  Returns the byte
1059
     *   length of the match if we found a match, -1 if we found no match.
1060
     *   This is not a search function; we merely match the leading
1061
     *   substring of the given string to the given pattern.  Note that the
1062
     *   returned length is a byte length, not a character length.  
1063
     */
1064
    int compile_and_match(const char *pattern, size_t patlen,
1065
                          const char *entirestr,
1066
                          const char *searchstr, size_t searchlen);
1067
1068
    /*
1069
     *   Get a group register.  0 refers to the first group; groups are
1070
     *   numbered in left-to-right order by their opening parenthesis.  
1071
     */
1072
    const re_group_register *get_group_reg(int i) const { return &regs_[i]; }
1073
1074
    /* get the number of groups in the last pattern we searched */
1075
    int get_group_cnt() const { return group_cnt_; }
1076
1077
protected:
1078
    /* group registers */
1079
    re_group_register regs_[RE_GROUP_REG_CNT];
1080
1081
    /* number of groups in last pattern we searched */
1082
    int group_cnt_;
1083
1084
    /* my regular expression parser */
1085
    class CRegexParser *parser_;
1086
};
1087
1088
#endif /* VMREGEX_H */
1089