cfad47cfa3/tads3/vmgram.cpp

4b825dc642cb6eb9a060e54bf8d69288fbee4904cfad47cfa334b206c65f22086bcc5d63e6f70944
1
#ifdef RCSID
2
static char RCSid[] =
3
"$Header$";
4
#endif
5
6
/* 
7
 *   Copyright (c) 2000, 2002 Michael J. Roberts.  All Rights Reserved.
8
 *   
9
 *   Please see the accompanying license file, LICENSE.TXT, for information
10
 *   on using and copying this software.  
11
 */
12
/*
13
Name
14
  vmgram.cpp - T3 Grammar Production metaclass
15
Function
16
  
17
Notes
18
  
19
Modified
20
  02/15/00 MJRoberts  - Creation
21
*/
22
23
#include <stdlib.h>
24
#include <string.h>
25
#include <assert.h>
26
27
#include "t3std.h"
28
#include "vmtype.h"
29
#include "vmstack.h"
30
#include "vmlst.h"
31
#include "vmstr.h"
32
#include "vmgram.h"
33
#include "vmerr.h"
34
#include "vmerrnum.h"
35
#include "vmmeta.h"
36
#include "vmdict.h"
37
#include "vmtobj.h"
38
#include "vmpredef.h"
39
40
41
/* ------------------------------------------------------------------------ */
42
/*
43
 *   Grammar production object - memory pool.  This is a simple
44
 *   suballocator that obtains large blocks from the system allocator,
45
 *   then hands out pieces of those blocks.  Allocating is very cheap
46
 *   (just a pointer increment in most cases), and we don't track
47
 *   individual allocations and deletions but just throw away all
48
 *   suballocated memory at once.  
49
 */
50
51
/* size of each memory block */
52
const size_t VMGRAMPROD_MEM_BLOCK_SIZE = 16*1024;
53
54
/*
55
 *   memory pool block 
56
 */
57
struct CVmGramProdMemBlock
58
{
59
    /* next block in chain */
60
    CVmGramProdMemBlock *nxt_;
61
62
    /* bytes of the block */
63
    char buf_[VMGRAMPROD_MEM_BLOCK_SIZE];
64
};
65
66
/*
67
 *   memory pool object 
68
 */
69
class CVmGramProdMem
70
{
71
public:
72
    CVmGramProdMem()
73
    {
74
        /* no memory blocks yet */
75
        block_head_ = 0;
76
        block_cur_ = 0;
77
78
        /* we don't yet have a block */
79
        cur_rem_ = 0;
80
        cur_free_ = 0;
81
    }
82
83
    ~CVmGramProdMem()
84
    {
85
        /* delete each of our blocks */
86
        while (block_head_ != 0)
87
        {
88
            CVmGramProdMemBlock *nxt;
89
            
90
            /* remember the next block */
91
            nxt = block_head_->nxt_;
92
            
93
            /* delete this block */
94
            t3free(block_head_);
95
96
            /* move on to the next */
97
            block_head_ = nxt;
98
        }
99
    }
100
101
    /* reset - delete all previously sub-allocated memory */
102
    void reset()
103
    {
104
        /* start over with the first block */
105
        block_cur_ = block_head_;
106
107
        /* initialize the free pointer for the new block, if we have one */
108
        if (block_cur_ != 0)
109
        {
110
            /* start at the beginning of the new block */
111
            cur_free_ = block_cur_->buf_;
112
113
            /* this entire block is available */
114
            cur_rem_ = VMGRAMPROD_MEM_BLOCK_SIZE;
115
        }
116
        else
117
        {
118
            /* there are no blocks, so there's no memory available yet */
119
            cur_rem_ = 0;
120
            cur_free_ = 0;
121
        }
122
    }
123
124
    /* allocate memory */
125
    void *alloc(size_t siz)
126
    {
127
        void *ret;
128
        
129
        /* round the size to the local hardware boundary */
130
        siz = osrndsz(siz);
131
132
        /* if it exceeds the block size, fail */
133
        if (siz > VMGRAMPROD_MEM_BLOCK_SIZE)
134
            return 0;
135
136
        /* if we don't have enough space in this block, go to the next */
137
        if (siz > cur_rem_)
138
        {
139
            /* allocate a new block if necessary */
140
            if (block_cur_ == 0)
141
            {
142
                /* there's nothing in the list yet - set up the first block */
143
                block_head_ = (CVmGramProdMemBlock *)
144
                              t3malloc(sizeof(CVmGramProdMemBlock));
145
146
                /* activate the new block */
147
                block_cur_ = block_head_;
148
149
                /* there's nothing after this block */
150
                block_cur_->nxt_ = 0;
151
            }
152
            else if (block_cur_->nxt_ == 0)
153
            {
154
                /* we're at the end of the list - add a block */
155
                block_cur_->nxt_ = (CVmGramProdMemBlock *)
156
                                   t3malloc(sizeof(CVmGramProdMemBlock));
157
158
                /* advance to the new block */
159
                block_cur_ = block_cur_->nxt_;
160
161
                /* the new block is the last in the chain */
162
                block_cur_->nxt_ = 0;
163
            }
164
            else
165
            {
166
                /* another block follows - advance to it */
167
                block_cur_ = block_cur_->nxt_;
168
            }
169
170
            /* start at the beginning of the new block */
171
            cur_free_ = block_cur_->buf_;
172
173
            /* this entire block is available */
174
            cur_rem_ = VMGRAMPROD_MEM_BLOCK_SIZE;
175
        }
176
177
        /* the return value will be the current free pointer */
178
        ret = cur_free_;
179
180
        /* consume the space */
181
        cur_free_ += siz;
182
        cur_rem_ -= siz;
183
184
        /* return the block location */
185
        return ret;
186
    }
187
188
private:
189
    /* head of block list */
190
    CVmGramProdMemBlock *block_head_;
191
192
    /* block we're currently suballocating out of */
193
    CVmGramProdMemBlock *block_cur_;
194
195
    /* current free pointer in current block */
196
    char *cur_free_;
197
198
    /* amount of space remaining in current block */
199
    size_t cur_rem_;
200
};
201
202
/* ------------------------------------------------------------------------ */
203
/*
204
 *   Input token array entry.  We make a private copy of the input token list
205
 *   (i.e., the token list we're trying to match to our grammar rules) during
206
 *   parsing using one of these structures for each token.  
207
 */
208
struct vmgramprod_tok
209
{
210
    /* original token value */
211
    vm_val_t val_;
212
213
    /* token type */
214
    ulong typ_;
215
216
    /* token text */
217
    const char *txt_;
218
219
    /* length of the token text */
220
    size_t len_;
221
222
    /* hash value of the token */
223
    unsigned int hash_;
224
    
225
    /* number of vocabulary matches associated with the word */
226
    size_t match_cnt_;
227
228
    /* pointer to list of matches for the word */
229
    vmgram_match_info *matches_;
230
};
231
232
233
/* ------------------------------------------------------------------------ */
234
/*
235
 *   Parsing state match object.  A match can be terminal, in which case
236
 *   it refers to a single token of input, or it can be a production, in
237
 *   which case it refers to a subtree of match objects.  
238
 */
239
struct CVmGramProdMatch
240
{
241
    /* allocate */
242
    static CVmGramProdMatch *alloc(CVmGramProdMem *mem, size_t tok_pos,
243
                                   const vm_val_t *tok_match_result,
244
                                   int matched_star,
245
                                   vm_prop_id_t target_prop,
246
                                   vm_obj_id_t proc_obj,
247
                                   const CVmGramProdMatch **sub_match_list,
248
                                   size_t sub_match_cnt)
249
    {
250
        CVmGramProdMatch *match;
251
        
252
        /* allocate space */
253
        match = (CVmGramProdMatch *)mem->alloc(sizeof(CVmGramProdMatch));
254
255
        /* initialize */
256
        match->tok_pos_ = tok_pos;
257
        match->matched_star_ = matched_star;
258
        match->target_prop_ = target_prop;
259
        match->proc_obj_ = proc_obj;
260
        match->sub_match_list_ = sub_match_list;
261
        match->sub_match_cnt_ = sub_match_cnt;
262
263
        /* save the token match value, if one was given */
264
        if (tok_match_result != 0)
265
            match->tok_match_result_ = *tok_match_result;
266
        else
267
            match->tok_match_result_.set_nil();
268
269
        /* return the new match */
270
        return match;
271
    }
272
273
    /* property ID with which this match is associated */
274
    vm_prop_id_t target_prop_;
275
    
276
    /* 
277
     *   token position of the matching token; ignored if the production
278
     *   subtree information is valid 
279
     */
280
    size_t tok_pos_;
281
282
    /*
283
     *   The token match value.  This is the result code from the Dictionary
284
     *   comparator's matchValues() function, which typically encodes
285
     *   information about how the value matched (truncation, case folding,
286
     *   accent approximation, etc) in bitwise flags.  
287
     */
288
    vm_val_t tok_match_result_;
289
290
    /* 
291
     *   flag: this is a '*' token in the rule; these don't really match any
292
     *   tokens at all, since they merely serve to stop parsing regardless
293
     *   of whether or not more input tokens are present 
294
     */
295
    int matched_star_;
296
297
    /* 
298
     *   object ID of match processor - if this is valid, this match
299
     *   refers to a production; otherwise, the match refers to a single
300
     *   token 
301
     */
302
    vm_obj_id_t proc_obj_;
303
304
    /* pointer to array of sub-matches */
305
    const CVmGramProdMatch **sub_match_list_;
306
307
    /* number of sub-match entries */
308
    size_t sub_match_cnt_;
309
};
310
311
/*
312
 *   Top-level match list holder 
313
 */
314
struct CVmGramProdMatchEntry
315
{
316
    /* the match tree */
317
    CVmGramProdMatch *match_;
318
319
    /* token position at the end of the match */
320
    size_t tok_pos_;
321
322
    /* next entry in the list */
323
    CVmGramProdMatchEntry *nxt_;
324
};
325
326
/*
327
 *   Parsing state object.  At any given time, we will have one or more of
328
 *   these state objects in our processing queue.  A state object tracks a
329
 *   parsing position - the token position, the alternative position, the
330
 *   match list so far for the alternative (i.e., the match for each item
331
 *   in the alternative up to but not including the current position), and
332
 *   the enclosing parsing state object (i.e., the parsing state that
333
 *   "recursed" into this alternative).  
334
 */
335
struct CVmGramProdState
336
{
337
    /* allocate */
338
    static CVmGramProdState *alloc(CVmGramProdMem *mem,
339
                                   int tok_pos, int alt_pos,
340
                                   CVmGramProdState *enclosing,
341
                                   const vmgram_alt_info *altp,
342
                                   vm_obj_id_t prod_obj,
343
                                   int circular_alt)
344
    {
345
        size_t match_cnt;
346
        CVmGramProdState *state;
347
348
        /* get the number of match list slots we'll need */
349
        match_cnt = altp->tok_cnt;
350
351
        /* allocate space, including space for the match list */
352
        state = (CVmGramProdState *)
353
                mem->alloc(sizeof(CVmGramProdState)
354
                           + (match_cnt - 1)*sizeof(CVmGramProdMatch *));
355
356
        /* initialize */
357
        state->init(tok_pos, alt_pos, enclosing, altp, prod_obj,
358
                    circular_alt);
359
360
        /* return the new item */
361
        return state;
362
    }
363
    
364
    /* initialize */
365
    void init(int tok_pos, int alt_pos, CVmGramProdState *enclosing,
366
              const vmgram_alt_info *altp, vm_obj_id_t prod_obj,
367
              int circular_alt)
368
    {
369
        /* remember the position parameters */
370
        tok_pos_ = tok_pos;
371
        alt_pos_ = alt_pos;
372
373
        /* we haven't matched a '*' token yet */
374
        matched_star_ = FALSE;
375
376
        /* remember our stack position */
377
        enclosing_ = enclosing;
378
379
        /* remember our alternative image data pointer */
380
        altp_ = altp;
381
382
        /* remember the production object */
383
        prod_obj_ = prod_obj;
384
385
        /* note if we had circular alternatives in the same production */
386
        circular_alt_ = circular_alt;
387
388
        /* no target property for sub-production yet */
389
        sub_target_prop_ = VM_INVALID_PROP;
390
391
        /* we're not in the queue yet */
392
        nxt_ = 0;
393
    }
394
395
    /* clone the state */
396
    CVmGramProdState *clone(CVmGramProdMem *mem)
397
    {
398
        CVmGramProdState *new_state;
399
        CVmGramProdState *new_enclosing;
400
401
        /* clone our enclosing state */
402
        new_enclosing = (enclosing_ != 0 ? enclosing_->clone(mem) : 0);
403
        
404
        /* create a new state object */
405
        new_state = alloc(mem, tok_pos_, alt_pos_, new_enclosing,
406
                          altp_, prod_obj_, circular_alt_);
407
408
        /* copy the target sub-production property */
409
        new_state->sub_target_prop_ = sub_target_prop_;
410
411
        /* copy the valid portion of my match list */
412
        if (alt_pos_ != 0)
413
            memcpy(new_state->match_list_, match_list_,
414
                   alt_pos_ * sizeof(match_list_[0]));
415
416
        /* we're not yet in any queue */
417
        new_state->nxt_ = 0;
418
419
        /* set the '*' flag in the cloned state */
420
        new_state->matched_star_ = matched_star_;
421
422
        /* return the clone */
423
        return new_state;
424
    }
425
426
    /* current token position, as an index into the token list */
427
    size_t tok_pos_;
428
429
    /* 
430
     *   flag: we've matched a '*' token in a production or subproduction,
431
     *   so we should consider all remaining tokens to have been consumed
432
     *   (we don't advance tok_pos_ past all of the tokens, because we
433
     *   separately want to keep track of the number of tokens we matched
434
     *   for everything up to the '*') 
435
     */
436
    int matched_star_;
437
438
    /* 
439
     *   current alternative position, as an index into the alternative's
440
     *   list of items 
441
     */
442
    size_t alt_pos_;
443
444
    /* 
445
     *   the enclosing state object - this is the state object that
446
     *   "recursed" to our position 
447
     */
448
    CVmGramProdState *enclosing_;
449
450
    /* pointer to alternative definition */
451
    const vmgram_alt_info *altp_;
452
453
    /* object ID of the production object */
454
    vm_obj_id_t prod_obj_;
455
456
    /* the target property for the pending sub-production */
457
    vm_prop_id_t sub_target_prop_;
458
459
    /* next production state in the processing queue */
460
    CVmGramProdState *nxt_;
461
462
    /* the production contains circular alternatives */
463
    int circular_alt_;
464
465
    /* 
466
     *   Match list.  This list is allocated with enough space for one
467
     *   match for each of our items (the number of items is given by
468
     *   vmgram_alt_tokcnt(altp_)).  At any given time, these will be
469
     *   valid up to but not including the current alt_pos_ index.  
470
     */
471
    const struct CVmGramProdMatch *match_list_[1];
472
};
473
474
/*
475
 *   Queues.  We maintain three queues: the primary work queue, which
476
 *   contains pending parsing states that we're still attempting to match;
477
 *   the success queue, which contains a list of match trees for
478
 *   successfully completed matches; and the badness queue, which contains
479
 *   a list of parsing states that we can try as fallbacks if we fail to
480
 *   find a match in any of the work queue items.  
481
 */
482
struct CVmGramProdQueue
483
{
484
    /* clear the queues */
485
    void clear()
486
    {
487
        work_queue_ = 0;
488
        badness_queue_ = 0;
489
        success_list_ = 0;
490
    }
491
492
    /* head of work queue */
493
    CVmGramProdState *work_queue_;
494
495
    /* head of badness queue */
496
    CVmGramProdState *badness_queue_;
497
498
    /* head of success list */
499
    CVmGramProdMatchEntry *success_list_;
500
};
501
502
/* ------------------------------------------------------------------------ */
503
/*
504
 *   Grammar-Production object statics 
505
 */
506
507
/* metaclass registration object */
508
static CVmMetaclassGramProd metaclass_reg_obj;
509
CVmMetaclass *CVmObjGramProd::metaclass_reg_ = &metaclass_reg_obj;
510
511
/* function table */
512
int (CVmObjGramProd::
513
     *CVmObjGramProd::func_table_[])(VMG_ vm_obj_id_t self,
514
                                     vm_val_t *retval, uint *argc) =
515
{
516
    &CVmObjGramProd::getp_undef,
517
    &CVmObjGramProd::getp_parse,
518
    &CVmObjGramProd::getp_get_gram_info
519
};
520
521
/* ------------------------------------------------------------------------ */
522
/*
523
 *   Grammar-Production metaclass implementation 
524
 */
525
526
/* 
527
 *   create dynamically using stack arguments 
528
 */
529
vm_obj_id_t CVmObjGramProd::create_from_stack(VMG_ const uchar **pc_ptr,
530
                                              uint argc)
531
{
532
    /* this type of object cannot be dynamically created */
533
    err_throw(VMERR_BAD_DYNAMIC_NEW);
534
535
    /* we won't reach this point, but the compiler might not know */
536
    AFTER_ERR_THROW(return VM_INVALID_OBJ;)
537
}
538
539
/*
540
 *   constructor 
541
 */
542
CVmObjGramProd::CVmObjGramProd(VMG0_)
543
{
544
    /* allocate our extension structure from the variable heap */
545
    ext_ = (char *)G_mem->get_var_heap()
546
           ->alloc_mem(sizeof(vm_gram_ext), this);
547
548
    /* we have no image data yet */
549
    get_ext()->image_data_ = 0;
550
    get_ext()->image_data_size_ = 0;
551
552
    /* we haven't set up our alternatives yet */
553
    get_ext()->alts_ = 0;
554
    get_ext()->alt_cnt_ = 0;
555
556
    /* we haven't cached any hash values yet */
557
    get_ext()->hashes_cached_ = FALSE;
558
    get_ext()->comparator_ = VM_INVALID_OBJ;
559
560
    /* presume we have no circular rules */
561
    get_ext()->has_circular_alt = FALSE;
562
563
    /* create our memory pool */
564
    get_ext()->mem_ = new CVmGramProdMem();
565
566
    /* 
567
     *   allocate initial property enumeration space (we'll expand this as
568
     *   needed, so the initial size is just a guess) 
569
     */
570
    get_ext()->prop_enum_max_ = 64;
571
    get_ext()->prop_enum_arr_ = (vmgram_match_info *)
572
                                t3malloc(get_ext()->prop_enum_max_
573
                                         * sizeof(vmgram_match_info));
574
}
575
576
/* 
577
 *   notify of deletion 
578
 */
579
void CVmObjGramProd::notify_delete(VMG_ int in_root_set)
580
{
581
    /* free our additional data */
582
    if (ext_ != 0)
583
    {
584
        /* 
585
         *   If we've allocated our alternatives, delete the memory.  We
586
         *   allocate the entire complex structure in one block, which we use
587
         *   for the alts_ array, so we merely need to delete that one block.
588
         */
589
        if (get_ext()->alts_ != 0)
590
            t3free(get_ext()->alts_);
591
592
        /* delete our memory pool */
593
        delete get_ext()->mem_;
594
595
        /* delete our property enumeration space */
596
        t3free(get_ext()->prop_enum_arr_);
597
        
598
        /* free the extension */
599
        G_mem->get_var_heap()->free_mem(ext_);
600
    }
601
}
602
603
/* 
604
 *   get a property 
605
 */
606
int CVmObjGramProd::get_prop(VMG_ vm_prop_id_t prop, vm_val_t *retval,
607
                             vm_obj_id_t self, vm_obj_id_t *source_obj,
608
                             uint *argc)
609
{
610
    uint func_idx;
611
    
612
    /* translate the property index to an index into our function table */
613
    func_idx = G_meta_table
614
               ->prop_to_vector_idx(metaclass_reg_->get_reg_idx(), prop);
615
616
    /* call the appropriate function */
617
    if ((this->*func_table_[func_idx])(vmg_ self, retval, argc))
618
    {
619
        *source_obj = metaclass_reg_->get_class_obj(vmg0_);
620
        return TRUE;
621
    }
622
623
    /* inherit default handling */
624
    return CVmObject::get_prop(vmg_ prop, retval, self, source_obj, argc);
625
}
626
627
/* 
628
 *   set a property 
629
 */
630
void CVmObjGramProd::set_prop(VMG_ class CVmUndo *undo,
631
                              vm_obj_id_t self, vm_prop_id_t prop,
632
                              const vm_val_t *val)
633
{
634
    /* no settable properties - throw an error */
635
    err_throw(VMERR_INVALID_SETPROP);
636
}
637
638
/* 
639
 *   load from an image file 
640
 */
641
void CVmObjGramProd::load_from_image(VMG_ vm_obj_id_t self,
642
                                     const char *ptr, size_t siz)
643
{
644
    const char *p;
645
    size_t alt_cnt;
646
    size_t tok_cnt;
647
    size_t i;
648
    size_t alo_siz;
649
    vmgram_alt_info *next_alt;
650
    vmgram_tok_info *next_tok;
651
    char *next_byte;
652
653
    /* remember where our image data comes from */
654
    get_ext()->image_data_ = ptr;
655
    get_ext()->image_data_size_ = siz;
656
657
    /* 
658
     *   For our alternatives structure, start with the base amount of
659
     *   memory, which is the amount we'll need for the array of alternative
660
     *   entries themselves.  The first UINT2 gives the number of
661
     *   alternatives in our grammar rule.  
662
     */
663
    alt_cnt = osrp2(ptr);
664
    alo_siz = alt_cnt * sizeof(vmgram_alt_info);
665
666
    /* 
667
     *   Scan the alternatives.  Check for circular (left-recursive)
668
     *   alternatives, and add up how much memory we'll need for our decoded
669
     *   alternatives structure. 
670
     */
671
    for (i = alt_cnt, p = ptr + 2, tok_cnt = 0 ; i != 0 ; --i)
672
    {
673
        const char *tokp;
674
        size_t alt_tok_cnt;
675
        size_t j;
676
677
        /* get the token count and first token for this alternative */
678
        alt_tok_cnt = vmgram_alt_tokcnt(p);
679
        tokp = vmgram_alt_tokptr(p);
680
681
        /* 
682
         *   For our allocation size, add the amount of memory we need for
683
         *   this alternative entry's array of tokens. 
684
         */
685
        alo_siz += alt_tok_cnt * sizeof(vmgram_tok_info);
686
687
        /* include these tokens in the cumulative token count */
688
        tok_cnt += alt_tok_cnt;
689
690
        /* if the alternative is circular, note it */
691
        if (alt_tok_cnt != 0
692
            && vmgram_tok_type(tokp) == VMGRAM_MATCH_PROD
693
            && vmgram_tok_prod_obj(tokp) == self)
694
        {
695
            /* note the existence of a circular reference */
696
            get_ext()->has_circular_alt = TRUE;
697
        }
698
699
        /* scan the tokens to see how much memory we'll need to store them */
700
        for (j = 0 ; j < alt_tok_cnt ; ++j, tokp = get_next_alt_tok(tokp))
701
        {
702
            /* check what sort of extra memory we'll need for this token */
703
            switch(vmgram_tok_type(tokp))
704
            {
705
            case VMGRAM_MATCH_NSPEECH:
706
                /* we need the array of vm_prop_id_t's */
707
                alo_siz += osrndsz(vmgram_tok_vocn_cnt(tokp)
708
                                   * sizeof(vm_prop_id_t));
709
                break;
710
                
711
            case VMGRAM_MATCH_LITERAL:
712
                /* we need space for the string */
713
                alo_siz += osrndsz(vmgram_tok_lit_len(tokp));
714
                break;
715
            }
716
        }
717
718
        /* the next alternative starts after the last token */
719
        p = tokp;
720
    }
721
722
    /* 
723
     *   Allocate our block of memory to store the decoded alternatives.  Put
724
     *   the whole thing in one block, and put the alternative array at the
725
     *   start of the block. 
726
     */
727
    get_ext()->alt_cnt_ = alt_cnt;
728
    get_ext()->alts_ = next_alt = (vmgram_alt_info *)t3malloc(alo_siz);
729
730
    /* allocate the tokens right after the alternative array */
731
    next_tok = (vmgram_tok_info *)&get_ext()->alts_[alt_cnt];
732
733
    /* allocate the other miscellaneous data after the token array */
734
    next_byte = (char *)&next_tok[tok_cnt];
735
736
    /* run through the image data again and decode it */
737
    for (i = alt_cnt, p = ptr + 2, tok_cnt = 0 ; i != 0 ; --i)
738
    {
739
        const char *tokp;
740
        size_t alt_tok_cnt;
741
        size_t j;
742
743
        /* get the token count and first token for this alternative */
744
        alt_tok_cnt = vmgram_alt_tokcnt(p);
745
        tokp = vmgram_alt_tokptr(p);
746
747
        /* set up this alternative structure */
748
        next_alt->score = vmgram_alt_score(p);
749
        next_alt->badness = vmgram_alt_badness(p);
750
        next_alt->proc_obj = vmgram_alt_procobj(p);
751
        next_alt->tok_cnt = vmgram_alt_tokcnt(p);
752
        next_alt->toks = next_tok;
753
754
        /* consume this alternative entry */
755
        ++next_alt;
756
757
        /* scan and decode the tokens */
758
        for (j = 0 ; j < alt_tok_cnt ; ++j, tokp = get_next_alt_tok(tokp))
759
        {
760
            size_t k;
761
            
762
            /* set up this token's base information */
763
            next_tok->prop = vmgram_tok_prop(tokp);
764
            next_tok->typ = vmgram_tok_type(tokp);
765
766
            /* decode the type-specific data */
767
            switch(vmgram_tok_type(tokp))
768
            {
769
            case VMGRAM_MATCH_PROD:
770
                next_tok->typinfo.prod_obj = vmgram_tok_prod_obj(tokp);
771
                break;
772
773
            case VMGRAM_MATCH_SPEECH:
774
                next_tok->typinfo.speech_prop = vmgram_tok_voc_prop(tokp);
775
                break;
776
                
777
            case VMGRAM_MATCH_NSPEECH:
778
                /* set up our array, using the miscellaneous space pool */
779
                next_tok->typinfo.nspeech.cnt = vmgram_tok_vocn_cnt(tokp);
780
                next_tok->typinfo.nspeech.props = (vm_prop_id_t *)next_byte;
781
782
                /* copy the properties */
783
                for (k = 0 ; k < next_tok->typinfo.nspeech.cnt ; ++k)
784
                    next_tok->typinfo.nspeech.props[k] =
785
                        vmgram_tok_vocn_prop(tokp, k);
786
787
                /* consume the space from the pool */
788
                next_byte += osrndsz(next_tok->typinfo.nspeech.cnt
789
                                     * sizeof(vm_prop_id_t));
790
                break;
791
792
            case VMGRAM_MATCH_LITERAL:
793
                /* take the space out of the miscellaneous space pool */
794
                next_tok->typinfo.lit.len = vmgram_tok_lit_len(tokp);
795
                next_tok->typinfo.lit.str = next_byte;
796
797
                /* copy the data */
798
                memcpy(next_tok->typinfo.lit.str, vmgram_tok_lit_txt(tokp),
799
                       next_tok->typinfo.lit.len);
800
801
                /* consume the space from the pool */
802
                next_byte += osrndsz(next_tok->typinfo.lit.len);
803
                break;
804
805
            case VMGRAM_MATCH_TOKTYPE:
806
                next_tok->typinfo.toktyp_enum = vmgram_tok_tok_enum(tokp);
807
                break;
808
            }
809
810
            /* consume the token slot */
811
            ++next_tok;
812
        }
813
814
        /* the next alternative starts after the last token */
815
        p = tokp;
816
    }
817
}
818
819
/*
820
 *   Remove stale weak references. 
821
 */
822
void CVmObjGramProd::remove_stale_weak_refs(VMG0_)
823
{
824
    /* 
825
     *   Our reference to the dictionary comparator object is weak (we're
826
     *   only caching it - we don't want to prevent the object from being
827
     *   collected if no one else wants it).  So, forget it if the comparator
828
     *   is being deleted.  
829
     */
830
    if (get_ext()->comparator_ != VM_INVALID_OBJ
831
        && G_obj_table->is_obj_deletable(get_ext()->comparator_))
832
    {
833
        /* forget the comparator */
834
        get_ext()->comparator_ = VM_INVALID_OBJ;
835
836
        /* 
837
         *   our cached hash values depend on the comparator, so they're now
838
         *   invalid - forget about them 
839
         */
840
        get_ext()->hashes_cached_ = FALSE;
841
    }
842
}
843
844
845
/*
846
 *   Context for enum_props_cb 
847
 */
848
struct enum_props_ctx
849
{
850
    /* object extension */
851
    vm_gram_ext *ext;
852
    
853
    /* number of properties in our list so far */
854
    size_t cnt;
855
};
856
857
/*
858
 *   Callback for enumerating word properties 
859
 */
860
void CVmObjGramProd::enum_props_cb(VMG_ void *ctx0, vm_prop_id_t prop,
861
                                   const vm_val_t * /*match_val*/)
862
{
863
    enum_props_ctx *ctx = (enum_props_ctx *)ctx0;
864
    vmgram_match_info *ep;
865
    size_t i;
866
867
    /* if this property is already in our list, don't add it again */
868
    for (i = 0, ep = ctx->ext->prop_enum_arr_ ; i < ctx->cnt ; ++i, ++ep)
869
    {
870
        /* if we already have this one, we can ignore it */
871
        if (ep->prop == prop)
872
            return;
873
    }
874
    
875
    /* we need to add it - if the array is full, expand it */
876
    if (ctx->cnt == ctx->ext->prop_enum_max_)
877
    {
878
        /* reallocate the array with more space */
879
        ctx->ext->prop_enum_max_ += 64;
880
        ctx->ext->prop_enum_arr_ = (vmgram_match_info *)
881
                                   t3realloc(ctx->ext->prop_enum_arr_,
882
                                             ctx->ext->prop_enum_max_
883
                                             * sizeof(vmgram_match_info));
884
    }
885
886
    /* add the property to the array */
887
    ep = &ctx->ext->prop_enum_arr_[ctx->cnt];
888
    ep->prop = prop;
889
890
    /* count the new entry */
891
    ctx->cnt++;
892
}
893
894
/*
895
 *   Execute the parseToken method 
896
 */
897
int CVmObjGramProd::getp_parse(VMG_ vm_obj_id_t self,
898
                               vm_val_t *retval, uint *argc)
899
{
900
    vm_val_t *tokval;
901
    vm_val_t dictval;
902
    CVmObjDict *dict;
903
    size_t tok_cnt;
904
    size_t i;
905
    vmgramprod_tok *tok;
906
    const char *toklistp;
907
    int orig_argc = (argc != 0 ? *argc : 0);
908
    CVmGramProdQueue queues;
909
    CVmObjList *lst;
910
    size_t succ_cnt;
911
    CVmGramProdMatchEntry *match;
912
    static CVmNativeCodeDesc desc(2);
913
    
914
    /* check arguments */
915
    if (get_prop_check_argc(retval, argc, &desc))
916
        return TRUE;
917
918
    /* reset our memory pool */
919
    get_ext()->mem_->reset();
920
921
    /* clear the work queues */
922
    queues.clear();
923
924
    /* 
925
     *   get the tokenList argument and make sure it's a list; leave it on
926
     *   the stack for now so it remains reachable for the garbage
927
     *   collector 
928
     */
929
    tokval = G_stk->get(0);
930
    if ((toklistp = tokval->get_as_list(vmg0_)) == 0)
931
        err_throw(VMERR_LIST_VAL_REQD);
932
933
    /* get the length of the token list */
934
    tok_cnt = vmb_get_len(tokval->get_as_list(vmg0_));
935
936
    /* check for a dictionary argument */
937
    if (orig_argc >= 2)
938
    {
939
        /* get the dictionary argument */
940
        dictval = *G_stk->get(1);
941
942
        /* make sure it's an object or nil */
943
        if (dictval.typ != VM_OBJ && dictval.typ != VM_NIL)
944
            err_throw(VMERR_OBJ_VAL_REQD);
945
    }
946
    else
947
    {
948
        /* use a nil dictionary by default */
949
        dictval.set_nil();
950
    }
951
952
    /* get the dictionary object and check that it's really a dictionary */
953
    if (dictval.typ != VM_NIL)
954
    {
955
        /* if it's not a CVmObjDict object, it's the wrong type */
956
        if (!CVmObjDict::is_dictionary_obj(vmg_ dictval.val.obj))
957
        {
958
            /* wrong type - throw an error */
959
            err_throw(VMERR_INVAL_OBJ_TYPE);
960
        }
961
962
        /* get the VM object pointer */
963
        dict = (CVmObjDict *)vm_objp(vmg_ dictval.val.obj);
964
    }
965
    else
966
    {
967
        /* there's no dictionary object */
968
        dict = 0;
969
    }
970
971
    /* 
972
     *   For quick and easy access, make our own private copy of the token
973
     *   and type lists.  First, allocate an array of token structures.  
974
     */
975
    tok = (vmgramprod_tok *)get_ext()->mem_
976
          ->alloc(tok_cnt * sizeof(vmgramprod_tok));
977
978
    /* copy the token string references and types into our list */
979
    for (i = 0 ; i < tok_cnt ; ++i)
980
    {
981
        vm_val_t ele_val;
982
        vm_val_t tok_typ_val;
983
        vm_val_t tok_str_val;
984
        const char *subp;
985
        const char *tokstrp;
986
987
        /* presume we won't have any vocabulary properties for the word */
988
        tok[i].match_cnt_ = 0;
989
        tok[i].matches_ = 0;
990
991
        /* get this element from the list, and treat it as a list */
992
        CVmObjList::index_list(vmg_ &ele_val, toklistp, i+1);
993
        subp = ele_val.get_as_list(vmg0_);
994
995
        /* if this element isn't a list, it's an error */
996
        if (subp == 0)
997
            err_throw(VMERR_BAD_TYPE_BIF);
998
999
        /* 
1000
         *   parse the token sublist: the first element is the token value,
1001
         *   and the second element is the token type 
1002
         */
1003
        CVmObjList::index_list(vmg_ &tok_str_val, subp, 1);
1004
        CVmObjList::index_list(vmg_ &tok_typ_val, subp, 2);
1005
1006
        /* use the value if it's an enumerator; if not, use zero */
1007
        if (tok_typ_val.typ == VM_ENUM)
1008
            tok[i].typ_ = tok_typ_val.val.enumval;
1009
        else
1010
            tok[i].typ_ = 0;
1011
1012
        /* get the token value as a string */
1013
        tokstrp = tok_str_val.get_as_string(vmg0_);
1014
1015
        /* if it's a string, copy it; otherwise, ignore the value */
1016
        if (tokstrp != 0)
1017
        {
1018
            /* remember the string value */
1019
            tok[i].val_ = tok_str_val;
1020
1021
            /* point the token to the string */
1022
            tok[i].txt_ = tokstrp + VMB_LEN;
1023
            tok[i].len_ = vmb_get_len(tokstrp);
1024
1025
            /* calculate and store the token's hash value */
1026
            tok[i].hash_ = calc_str_hash(
1027
                vmg_ dict, &tok_str_val,
1028
                tokstrp + VMB_LEN, vmb_get_len(tokstrp));
1029
            
1030
            /* 
1031
             *   if we have a dictionary, enumerate the properties
1032
             *   associated with the word 
1033
             */
1034
            if (dict != 0)
1035
            {
1036
                enum_props_ctx ctx;
1037
                
1038
                /* set up our callback context */
1039
                ctx.ext = get_ext();
1040
                ctx.cnt = 0;
1041
                
1042
                /* enumerate the properties */
1043
                dict->enum_word_props(vmg_ &enum_props_cb, &ctx,
1044
                                      &tok_str_val, tok[i].txt_, tok[i].len_);
1045
                
1046
                /* 
1047
                 *   if we found any properties for the word, make a copy
1048
                 *   for the token list 
1049
                 */
1050
                if (ctx.cnt != 0)
1051
                {
1052
                    /* allocate space for the list */
1053
                    tok[i].match_cnt_ = ctx.cnt;
1054
                    tok[i].matches_ =
1055
                        (vmgram_match_info *)get_ext()->mem_
1056
                        ->alloc(ctx.cnt * sizeof(vmgram_match_info));
1057
                    
1058
                    /* copy the list */
1059
                    memcpy(tok[i].matches_, get_ext()->prop_enum_arr_,
1060
                           ctx.cnt * sizeof(tok[i].matches_[0]));
1061
                }
1062
            }
1063
        }
1064
        else
1065
        {
1066
            /* it's not a string - use a null string value for this token */
1067
            tok[i].txt_ = 0;
1068
            tok[i].len_ = 0;
1069
        }
1070
    }
1071
1072
    /* enqueue a match state item for each of our alternatives */
1073
    enqueue_alts(vmg_ get_ext()->mem_, tok, tok_cnt, 0, 0,
1074
                 &queues, self, FALSE, 0, dict);
1075
1076
    /* process the work queue */
1077
    process_work_queue(vmg_ get_ext()->mem_, tok, tok_cnt,
1078
                       &queues, dict);
1079
1080
    /* count the entries in the success list */
1081
    for (succ_cnt = 0, match = queues.success_list_ ; match != 0 ;
1082
         ++succ_cnt, match = match->nxt_) ;
1083
1084
    /* allocate the return list */
1085
    retval->set_obj(CVmObjList::create(vmg_ FALSE, succ_cnt));
1086
    lst = (CVmObjList *)vm_objp(vmg_ retval->val.obj);
1087
1088
    /* initially clear the elements of the return list to nil */
1089
    for (i = 0 ; i < succ_cnt ; ++i)
1090
    {
1091
        vm_val_t nil_val;
1092
1093
        /* set this element to nil */
1094
        nil_val.set_nil();
1095
        lst->cons_set_element(i, &nil_val);
1096
    }
1097
1098
    /* push the list momentarily to protect it from garbage collection */
1099
    G_stk->push(retval);
1100
1101
    /* set the elements */
1102
    for (succ_cnt = 0, match = queues.success_list_ ; match != 0 ;
1103
         ++succ_cnt, match = match->nxt_)
1104
    {
1105
        vm_val_t tree_val;
1106
        vm_val_t tok_match_val;
1107
        size_t first_tok;
1108
        size_t last_tok;
1109
        
1110
        /* 
1111
         *   Create a list to hold the token match list - this is a list of
1112
         *   the dictionary's comparator's matchValue() results for the
1113
         *   tokens that matched literals in the grammar rule.  We need a
1114
         *   list with one element per token that we matched.
1115
         *   
1116
         *   We can't actually populate this list yet, but all we need for
1117
         *   the moment are references to it.  So, create the list; we'll
1118
         *   fill it in as we traverse the match to build the result tree.  
1119
         */
1120
        tok_match_val.set_obj(
1121
            CVmObjList::create(vmg_ FALSE, match->tok_pos_));
1122
1123
        /* push the token match value list for gc protection */
1124
        G_stk->push(&tok_match_val);
1125
1126
        /* build the object tree for this success object */
1127
        build_match_tree(vmg_ match->match_, tokval, &tok_match_val,
1128
                         &tree_val, &first_tok, &last_tok);
1129
1130
        /* discard our token match list's gc protection stack entry */
1131
        G_stk->discard();
1132
1133
        /* 
1134
         *   add the match tree's root object to the main return list (note
1135
         *   that this protects it from garbage collection by virtue of the
1136
         *   main return list being protected from garbage collection) 
1137
         */
1138
        lst->cons_set_element(succ_cnt, &tree_val);
1139
    }
1140
1141
    /* discard the gc protection */
1142
    G_stk->discard();
1143
1144
    /* discard arguments */
1145
    G_stk->discard(orig_argc);
1146
1147
    /* evaluation successful */
1148
    return TRUE;
1149
}
1150
1151
/*
1152
 *   Build an object tree for a match 
1153
 */
1154
void CVmObjGramProd::build_match_tree(VMG_ const CVmGramProdMatch *match,
1155
                                      const vm_val_t *toklist,
1156
                                      const vm_val_t *tokmatchlist,
1157
                                      vm_val_t *retval,
1158
                                      size_t *first_tok, size_t *last_tok)
1159
{
1160
    /* check to see what kind of match we have */
1161
    if (match->proc_obj_ != VM_INVALID_OBJ)
1162
    {
1163
        vm_obj_id_t obj_id;
1164
        CVmObjTads *objp;
1165
        size_t i;
1166
1167
        /* 
1168
         *   Create the object to hold the current tree level.  The only
1169
         *   constructor argument is the superclass, which is the match's
1170
         *   processor object. 
1171
         */
1172
        G_stk->push()->set_obj(match->proc_obj_);
1173
        obj_id = CVmObjTads::create_from_stack(vmg_ 0, 1);
1174
1175
        /* get a pointer to the new object */
1176
        objp = (CVmObjTads *)vm_objp(vmg_ obj_id);
1177
1178
        /* push this object to protect it from gc for a moment */
1179
        G_stk->push()->set_obj(obj_id);
1180
1181
        /* 
1182
         *   Initialize the caller's first/last token indices to an invalid
1183
         *   range, in case we have no subproductions.  A production with no
1184
         *   tokens and no subproductions matches no tokens at all; we
1185
         *   indicate this by using an invalid range, with the first token
1186
         *   index higher than the last token index.  Note that it's still
1187
         *   important that we accurately reflect the end of our range for a
1188
         *   zero-token match, since our enclosing nodes will need our
1189
         *   position to figure out where they go.  
1190
         */
1191
        *first_tok = match->tok_pos_ + 1;
1192
        *last_tok = match->tok_pos_;
1193
1194
        /* build the sub-match entries */
1195
        for (i = 0 ; i < match->sub_match_cnt_ ; ++i)
1196
        {
1197
            size_t first_sub_tok;
1198
            size_t last_sub_tok;
1199
            vm_val_t val;
1200
1201
            /* build this subtree */
1202
            build_match_tree(vmg_ match->sub_match_list_[i],
1203
                             toklist, tokmatchlist,
1204
                             &val, &first_sub_tok, &last_sub_tok);
1205
1206
            /* 
1207
             *   If this is the first subtree, use it as the tentative
1208
             *   limits for our overall match so far; otherwise, expand our
1209
             *   limits if they are outside our range so far.
1210
             *   
1211
             *   If the submatch doesn't include any tokens, it obviously
1212
             *   has no effect on our range.  The submatch range will
1213
             *   indicate that the first token index is greater than the
1214
             *   last token index if the submatch includes no tokens.  
1215
             */
1216
            if (i == 0)
1217
            {
1218
                /* it's the first subtree - it's all we know as yet */
1219
                *first_tok = first_sub_tok;
1220
                *last_tok = last_sub_tok;
1221
            }
1222
            else if (first_sub_tok <= last_sub_tok)
1223
            {
1224
                /* check to see if it expands our current range */
1225
                if (first_sub_tok < *first_tok)
1226
                    *first_tok = first_sub_tok;
1227
                if (last_sub_tok > *last_tok)
1228
                    *last_tok = last_sub_tok;
1229
            }
1230
1231
            /* 
1232
             *   save the subtree with the current match object if there's a
1233
             *   property in which to save it 
1234
             */
1235
            if (match->sub_match_list_[i]->target_prop_ != VM_INVALID_PROP)
1236
            {
1237
                /* 
1238
                 *   Set the processor object property for the value.  Note
1239
                 *   that we don't have to keep undo for this change, since
1240
                 *   we just created the tree object ourselves, and we don't
1241
                 *   create any undo savepoints - an object created after
1242
                 *   the most recent undo savepoint doesn't need to keep
1243
                 *   undo information, since the entire object will be
1244
                 *   deleted if we undo to the savepoint. 
1245
                 */
1246
                objp->set_prop(vmg_ 0, obj_id,
1247
                               match->sub_match_list_[i]->target_prop_,
1248
                               &val);
1249
            }
1250
        }
1251
1252
        /* 
1253
         *   if we have exported properties for recording the token index
1254
         *   range, save them with the object 
1255
         */
1256
        if (G_predef->gramprod_first_tok != VM_INVALID_PROP
1257
            && G_predef->gramprod_last_tok != VM_INVALID_PROP)
1258
        {
1259
            vm_val_t val;
1260
1261
            /* save the first token index */
1262
            val.set_int((int)*first_tok + 1);
1263
            objp->set_prop(vmg_ 0, obj_id,
1264
                           G_predef->gramprod_first_tok, &val);
1265
1266
            /* save the last token index */
1267
            val.set_int((int)*last_tok + 1);
1268
            objp->set_prop(vmg_ 0, obj_id,
1269
                           G_predef->gramprod_last_tok, &val);
1270
        }
1271
1272
        /* 
1273
         *   if we have the token list property exported, set it to the
1274
         *   original token list reference 
1275
         */
1276
        if (G_predef->gramprod_token_list != VM_INVALID_PROP)
1277
        {
1278
            /* save the token list reference */
1279
            objp->set_prop(vmg_ 0, obj_id,
1280
                           G_predef->gramprod_token_list, toklist);
1281
        }
1282
1283
        /* if we have the token match list property exported, set it */
1284
        if (G_predef->gramprod_token_match_list != VM_INVALID_PROP)
1285
        {
1286
            /* save the token match list reference */
1287
            objp->set_prop(vmg_ 0, obj_id,
1288
                           G_predef->gramprod_token_match_list, tokmatchlist);
1289
        }
1290
1291
        /* discard our gc protection */
1292
        G_stk->discard();
1293
1294
        /* the return value is the object */
1295
        retval->set_obj(obj_id);
1296
    }
1297
    else
1298
    {
1299
        const char *lstp;
1300
1301
        /* get the token list */
1302
        lstp = toklist->get_as_list(vmg0_);
1303
1304
        /* make sure the index is in range */
1305
        if (match->tok_pos_ < vmb_get_len(lstp))
1306
        {
1307
            vm_val_t ele_val;
1308
            CVmObjList *match_lst;
1309
            
1310
            /* get the token from the list */
1311
            CVmObjList::index_list(vmg_ &ele_val, lstp, match->tok_pos_ + 1);
1312
1313
            /* 
1314
             *   the token is itself a list, whose first element is the
1315
             *   token's value - retrieve the value 
1316
             */
1317
            lstp = ele_val.get_as_list(vmg0_);
1318
            CVmObjList::index_list(vmg_ retval, lstp, 1);
1319
1320
            /* 
1321
             *   Store the token match result in the result list.  If this is
1322
             *   a '*' match, it doesn't have a result list contribution,
1323
             *   because '*' doesn't actually match any tokens (it merely
1324
             *   stops parsing). 
1325
             */
1326
            if (!match->matched_star_)
1327
            {
1328
                /* get the match list */
1329
                match_lst = (CVmObjList *)vm_objp(vmg_ tokmatchlist->val.obj);
1330
1331
                /* 
1332
                 *   set the element at this token position in the match list
1333
                 *   to the token match result 
1334
                 */
1335
                match_lst->cons_set_element(
1336
                    match->tok_pos_, &match->tok_match_result_);
1337
            }
1338
        }
1339
        else
1340
        {
1341
            /* 
1342
             *   the index is past the end of the list - this must be a
1343
             *   '*' token that matched nothing (i.e., the token list was
1344
             *   fully consumed before we reached the '*'), so just return
1345
             *   nil for the match value 
1346
             */
1347
            retval->set_nil();
1348
        }
1349
1350
        /* 
1351
         *   We match one token, so it's the first and last index.  Note
1352
         *   that if this is a '*' match, we don't match any tokens.  
1353
         */
1354
        if (!match->matched_star_)
1355
            *first_tok = *last_tok = match->tok_pos_;
1356
    }
1357
}
1358
1359
/*
1360
 *   Process the work queue 
1361
 */
1362
void CVmObjGramProd::process_work_queue(VMG_ CVmGramProdMem *mem,
1363
                                        const vmgramprod_tok *tok,
1364
                                        size_t tok_cnt,
1365
                                        CVmGramProdQueue *queues,
1366
                                        CVmObjDict *dict)
1367
{
1368
    /* keep going until the work queue and badness queue are empty */
1369
    for (;;)
1370
    {
1371
        /* 
1372
         *   If the work queue is empty, fall back on the badness queue.
1373
         *   Ignore the badness queue if we have any successful matches,
1374
         *   since non-badness matches always take precedence over badness
1375
         *   items. 
1376
         */
1377
        if (queues->work_queue_ == 0 && queues->success_list_ == 0)
1378
        {
1379
            int first;
1380
            int min_badness;
1381
            CVmGramProdState *cur;
1382
            CVmGramProdState *prv;
1383
            CVmGramProdState *nxt;
1384
            
1385
            /* find the lowest badness rating in the queue */
1386
            for (first = TRUE, min_badness = 0, cur = queues->badness_queue_ ;
1387
                 cur != 0 ; cur = cur->nxt_, first = FALSE)
1388
            {
1389
                /* 
1390
                 *   if we're on the first item, note its badness as the
1391
                 *   tentative minimum; otherwise, note the current item's
1392
                 *   badness if it's lower than the lowest we've seen so
1393
                 *   far 
1394
                 */
1395
                if (first || cur->altp_->badness < min_badness)
1396
                    min_badness = cur->altp_->badness;
1397
            }
1398
1399
            /* 
1400
             *   move each of the items whose badness matches the minimum
1401
             *   badness out of the badness queue and into the work queue 
1402
             */
1403
            for (cur = queues->badness_queue_, prv = 0 ; cur != 0 ;
1404
                 cur = nxt)
1405
            {
1406
                /* remember the next item, in case we remove this one */
1407
                nxt = cur->nxt_;
1408
                
1409
                /* if this item has minimum badness, move it */
1410
                if (cur->altp_->badness == min_badness)
1411
                {
1412
                    /* unlink it from the badness queue */
1413
                    if (prv == 0)
1414
                        queues->badness_queue_ = cur->nxt_;
1415
                    else
1416
                        prv->nxt_ = cur->nxt_;
1417
1418
                    /* link it into the work queue */
1419
                    cur->nxt_ = queues->work_queue_;
1420
                    queues->work_queue_ = cur;
1421
                }
1422
                else
1423
                {
1424
                    /* 
1425
                     *   this item is staying in the list - note it as the
1426
                     *   item preceding the next item 
1427
                     */
1428
                    prv = cur;
1429
                }
1430
            }
1431
        }
1432
1433
        /* if the work queue is still empty, we're out of work to do */
1434
        if (queues->work_queue_ == 0)
1435
            break;
1436
        
1437
        /* process the head of the work queue */
1438
        process_work_queue_head(vmg_ mem, tok, tok_cnt, queues, dict);
1439
    }
1440
}
1441
1442
/*
1443
 *   Process a work queue entry 
1444
 */
1445
void CVmObjGramProd::process_work_queue_head(VMG_ CVmGramProdMem *mem,
1446
                                             const vmgramprod_tok *tok,
1447
                                             size_t tok_cnt,
1448
                                             CVmGramProdQueue *queues,
1449
                                             CVmObjDict *dict)
1450
{
1451
    CVmGramProdState *state;
1452
    CVmGramProdMatch *match;
1453
    const vmgram_tok_info *tokp;
1454
    int tok_matched_star;
1455
    vm_prop_id_t enclosing_target_prop;
1456
    
1457
    /* get the first entry from the queue */
1458
    state = queues->work_queue_;
1459
1460
    /* unlink this entry from the queue */
1461
    queues->work_queue_ = state->nxt_;
1462
    state->nxt_ = 0;
1463
1464
    /* get the token pointer for the next active entry in the alternative */
1465
    tokp = state->altp_->toks + state->alt_pos_;
1466
1467
    /* presume we won't match a '*' token */
1468
    tok_matched_star = FALSE;
1469
1470
    /* process the remaining items in the entry */
1471
    while (state->alt_pos_ < state->altp_->tok_cnt)
1472
    {
1473
        int match;
1474
        vm_val_t match_result;
1475
1476
        /* presume we won't find a match */
1477
        match_result.set_nil();
1478
1479
        /* see what we have for this token */
1480
        switch(tokp->typ)
1481
        {
1482
        case VMGRAM_MATCH_PROD:
1483
            {
1484
                vm_obj_id_t sub_obj_id;
1485
                CVmObjGramProd *sub_objp;
1486
1487
                /*
1488
                 *   This is a sub-production node.  Get the
1489
                 *   sub-production object.  
1490
                 */
1491
                sub_obj_id = tokp->typinfo.prod_obj;
1492
1493
                /* make sure it's of the correct metaclass */
1494
                if (!CVmObjGramProd::is_gramprod_obj(vmg_ sub_obj_id))
1495
                {
1496
                    /* wrong type - throw an error */
1497
                    err_throw(VMERR_INVAL_OBJ_TYPE);
1498
                }
1499
1500
                /* get the sub-production object */
1501
                sub_objp = (CVmObjGramProd *)vm_objp(vmg_ sub_obj_id);
1502
1503
                /* 
1504
                 *   set my subproduction target property, so that the
1505
                 *   sub-production can set the target property correctly 
1506
                 */
1507
                state->sub_target_prop_ = tokp->prop;
1508
                
1509
                /* enqueue the alternatives for the sub-production */
1510
                sub_objp->enqueue_alts(vmg_ mem, tok, tok_cnt,
1511
                                       state->tok_pos_, state, queues,
1512
                                       sub_obj_id, FALSE, 0, dict);
1513
            }
1514
1515
            /* 
1516
             *   Do not process the current state any further for now -
1517
             *   we'll get back to it when (and if) we finish processing
1518
             *   the sub-production.  Note that we don't even put the
1519
             *   current state back in the queue - it's stacked behind the
1520
             *   sub-production, and will be re-enqueued when we
1521
             *   successfully finish with the sub-production.  
1522
             */
1523
            return;
1524
            
1525
        case VMGRAM_MATCH_SPEECH:
1526
            /* part of speech - check to see if the current token matches */
1527
            match = (state->tok_pos_ < tok_cnt
1528
                     && find_prop_in_tok(&tok[state->tok_pos_],
1529
                                         tokp->typinfo.speech_prop));
1530
            break;
1531
1532
        case VMGRAM_MATCH_NSPEECH:
1533
            /* 
1534
             *   multiple parts of speech - check to see if the current token
1535
             *   matches any of the parts of the speech 
1536
             */
1537
            if (state->tok_pos_ < tok_cnt)
1538
            {
1539
                size_t rem;
1540
                const vm_prop_id_t *prop;
1541
1542
                /* presume we won't find a match */
1543
                match = FALSE;
1544
1545
                /* check each item in the list */
1546
                for (prop = tokp->typinfo.nspeech.props,
1547
                     rem = tokp->typinfo.nspeech.cnt ;
1548
                     rem != 0 ; --rem, ++prop)
1549
                {
1550
                    /* if this one matches, we have a match */
1551
                    if (find_prop_in_tok(&tok[state->tok_pos_], *prop))
1552
                    {
1553
                        /* note the match */
1554
                        match = TRUE;
1555
1556
                        /* no need to look any further at this token */
1557
                        break;
1558
                    }
1559
                }
1560
            }
1561
            else
1562
            {
1563
                /* 
1564
                 *   we're out of tokens, so we definitely don't have a
1565
                 *   match, since we must match at least one of the possible
1566
                 *   parts of speech of this item 
1567
                 */
1568
                match = FALSE;
1569
            }
1570
            break;
1571
            
1572
        case VMGRAM_MATCH_LITERAL:
1573
            /* 
1574
             *   Literal - check for a match to the string.  Test the hash
1575
             *   values first, as this is much faster than comparing the
1576
             *   strings, and at least tells us if the strings fail to match
1577
             *   (and failing to match being by far the most common case,
1578
             *   this saves us doing a full comparison most of the time).  
1579
             */
1580
            match = (state->tok_pos_ < tok_cnt
1581
                     && tokp->typinfo.lit.hash == tok[state->tok_pos_].hash_
1582
                     && tok_equals_lit(vmg_ &tok[state->tok_pos_],
1583
                                       tokp->typinfo.lit.str,
1584
                                       tokp->typinfo.lit.len,
1585
                                       dict, &match_result));
1586
            break;
1587
            
1588
        case VMGRAM_MATCH_TOKTYPE:
1589
            /* token type */
1590
            match = (state->tok_pos_ < tok_cnt
1591
                     && (tok[state->tok_pos_].typ_
1592
                         == tokp->typinfo.toktyp_enum));
1593
            break;
1594
1595
        case VMGRAM_MATCH_STAR:
1596
            /* 
1597
             *   this matches anything remaining (it also matches if
1598
             *   there's nothing remaining) 
1599
             */
1600
            match = TRUE;
1601
1602
            /* note that we matched a star in the state */
1603
            state->matched_star_ = TRUE;
1604
1605
            /* note that we matched a star for this particular token */
1606
            tok_matched_star = TRUE;
1607
            break;
1608
        }
1609
1610
        /* check for a match */
1611
        if (match)
1612
        {
1613
            /* 
1614
             *   we matched this token - add a token match to our state's
1615
             *   match list for the current position 
1616
             */
1617
            state->match_list_[state->alt_pos_] =
1618
                CVmGramProdMatch::alloc(mem, state->tok_pos_, &match_result,
1619
                                        tok_matched_star, tokp->prop,
1620
                                        VM_INVALID_OBJ, 0, 0);
1621
1622
            /* we're done with this alternative item - move on */
1623
            state->alt_pos_++;
1624
1625
            /* 
1626
             *   If we matched a real token, consume the matched input
1627
             *   token.  For a '*', we don't want to consume anything, since
1628
             *   a '*' merely stops parsing and doesn't actually match
1629
             *   anything.  
1630
             */
1631
            if (!state->matched_star_)
1632
                state->tok_pos_++;
1633
            
1634
            /* move on to the next alternative token item */
1635
            ++tokp;
1636
        }
1637
        else
1638
        {
1639
            /* 
1640
             *   This is not a match - reject this alternative.  We do not
1641
             *   need to process this item further, so we can stop now,
1642
             *   without returning this state to the work queue.  Simply
1643
             *   abandon this work queue item.  
1644
             */
1645
            return;
1646
        }
1647
    }
1648
1649
    /* 
1650
     *   If we make it here, we've reached the end of this alternative and
1651
     *   have matched everything in it.  This means that the alternative
1652
     *   was successful, so we can perform a 'reduce' operation to replace
1653
     *   the matched tokens with this production.  
1654
     */
1655
1656
    /* if we have an enclosing state, get its target property */
1657
    enclosing_target_prop = (state->enclosing_ != 0
1658
                             ? state->enclosing_->sub_target_prop_
1659
                             : VM_INVALID_PROP);
1660
1661
    /* create a match for the entire alternative (i.e., the subproduction) */
1662
    match = CVmGramProdMatch::alloc(
1663
        mem, state->tok_pos_, 0, FALSE, enclosing_target_prop,
1664
        state->altp_->proc_obj, &state->match_list_[0],
1665
        state->altp_->tok_cnt);
1666
1667
    /*
1668
     *   If the state we just matched was marked on creation as coming
1669
     *   from an alternative with circular alternatives, we've just come
1670
     *   up with a match for each circular alternative.  To avoid infinite
1671
     *   recursion, we never enqueue the circular alternatives; however,
1672
     *   now that we know we have a match for the first element of each
1673
     *   circular alternative, we can check each of them to see if we can
1674
     *   enqueue them.
1675
     */
1676
    if (state->circular_alt_)
1677
    {
1678
        vm_obj_id_t prod_obj_id;
1679
        CVmObjGramProd *prod_objp;
1680
1681
        /* 
1682
         *   Get the production from which this alternative came (there
1683
         *   should be no need to check the metaclass, since we could only
1684
         *   have created the state object from a valid production in the
1685
         *   first place).
1686
         *   
1687
         *   First, make sure it's of the correct class.  (It almost
1688
         *   certainly is, since the compiler should have generated this
1689
         *   reference automatically, so there should be no possibility of a
1690
         *   user error.  However, if it's not a production object, we'll
1691
         *   probably crash by blindly casting it to one; so we'll check
1692
         *   here, just to be sure that the compiler didn't do something
1693
         *   wrong and the image file didn't become corrupted.)  
1694
         */
1695
        if (!CVmObjGramProd::is_gramprod_obj(vmg_ state->prod_obj_))
1696
        {
1697
            /* wrong type - throw an error */
1698
            err_throw(VMERR_INVAL_OBJ_TYPE);
1699
        }
1700
1701
        /* get the object, properly cast */
1702
        prod_obj_id = state->prod_obj_;
1703
        prod_objp = (CVmObjGramProd *)vm_objp(vmg_ prod_obj_id);
1704
1705
        /* 
1706
         *   Enqueue the matching circular alternatives from the original
1707
         *   production.  Our match becomes the first token match in the
1708
         *   circular alternative (i.e., it matches the leading circular
1709
         *   reference element).  
1710
         */
1711
        prod_objp->enqueue_alts(vmg_ mem, tok, tok_cnt, state->tok_pos_,
1712
                                state->enclosing_, queues, prod_obj_id,
1713
                                TRUE, match, dict);
1714
    }
1715
1716
    /* check for an enclosing state to pop */
1717
    if (state->enclosing_ != 0)
1718
    {
1719
        /* add the match to the enclosing state's match list */
1720
        state->enclosing_->match_list_[state->enclosing_->alt_pos_] = match;
1721
        state->enclosing_->alt_pos_++;
1722
1723
        /* 
1724
         *   Move the enclosing state's token position to our token position
1725
         *   - since it now encompasses our match, it has consumed all of
1726
         *   the tokens therein.  Likewise, set the '*' flag in the parent
1727
         *   to our own setting.  
1728
         */
1729
        state->enclosing_->tok_pos_ = state->tok_pos_;
1730
        state->enclosing_->matched_star_ = state->matched_star_;
1731
1732
        /* enqueue the enclosing state so we can continue parsing it */
1733
        enqueue_state(state->enclosing_, queues);
1734
    }
1735
    else
1736
    {
1737
        CVmGramProdMatchEntry *entry;
1738
        
1739
        /* 
1740
         *   This is a top-level state, so we've completed parsing.  If
1741
         *   we've consumed all input, or we matched a '*' token, we were
1742
         *   successful.  If there's more input remaining, this is not a
1743
         *   match.  
1744
         */
1745
        if (state->tok_pos_ < tok_cnt && !state->matched_star_)
1746
        {
1747
            /*
1748
             *   There's more input remaining, so this isn't a match.
1749
             *   Reject this alternative.  We do not need to process this
1750
             *   item further, so stop now without returning this state
1751
             *   object to the work queue. 
1752
             */
1753
1754
            /* abandon this work queue item */
1755
            return;
1756
        }
1757
1758
        /* create a new entry for the match list */
1759
        entry = (CVmGramProdMatchEntry *)mem->alloc(sizeof(*entry));
1760
        entry->match_ = match;
1761
        entry->tok_pos_ = state->tok_pos_;
1762
1763
        /* link the entry into the list */
1764
        entry->nxt_ = queues->success_list_;
1765
        queues->success_list_ = entry;
1766
    }
1767
}
1768
1769
/*
1770
 *   Visit my alternatives and enqueue each one that is a possible match
1771
 *   for a given token list.  
1772
 */
1773
void CVmObjGramProd::enqueue_alts(VMG_ CVmGramProdMem *mem,
1774
                                  const vmgramprod_tok *tok,
1775
                                  size_t tok_cnt, size_t start_tok_pos,
1776
                                  CVmGramProdState *enclosing_state,
1777
                                  CVmGramProdQueue *queues, vm_obj_id_t self,
1778
                                  int circ_only,
1779
                                  CVmGramProdMatch *circ_match,
1780
                                  CVmObjDict *dict)
1781
{
1782
    const vmgram_alt_info *altp;
1783
    size_t i;
1784
    int need_to_clone;
1785
    int has_circ;
1786
    vm_obj_id_t comparator;
1787
1788
    /* 
1789
     *   We don't need to clone the enclosing state until we've set up one
1790
     *   new state referring back to it.  However, if we're doing circular
1791
     *   alternatives, the enclosing state could also be enqueued
1792
     *   separately as its own state, so do clone all required copies in
1793
     *   this case.  
1794
     */
1795
    need_to_clone = circ_only;
1796
1797
    /* note whether or not we have any circular alternatives */
1798
    has_circ = get_ext()->has_circular_alt;
1799
1800
    /* 
1801
     *   Cache hash values for all of our literals.  We need to do this if we
1802
     *   haven't done it before, or if the comparator associated with the
1803
     *   dictionary is different than it was last time we cached the values. 
1804
     */
1805
    comparator = (dict != 0 ? dict->get_comparator() : VM_INVALID_OBJ);
1806
    if (!get_ext()->hashes_cached_ || get_ext()->comparator_ != comparator)
1807
        cache_hashes(vmg_ dict);
1808
1809
    /* 
1810
     *   run through our alternatives and enqueue each one that looks
1811
     *   plausible 
1812
     */
1813
    for (altp = get_ext()->alts_, i = get_ext()->alt_cnt_ ; i != 0 ;
1814
         --i, ++altp)
1815
    {
1816
        vmgram_tok_info *first_tokp;
1817
        vmgram_tok_info *tokp;
1818
        size_t cur_alt_tok;
1819
        size_t alt_tok_cnt;
1820
        size_t tok_idx;
1821
        int found_mismatch;
1822
        int prod_before;
1823
        int is_circ;
1824
        
1825
        /* start at the first token remaining in the string */
1826
        tok_idx = start_tok_pos;
1827
1828
        /* get the first token for this alternative */
1829
        tokp = first_tokp = altp->toks;
1830
1831
        /* get the number of tokens in the alternative */
1832
        alt_tok_cnt = altp->tok_cnt;
1833
        
1834
        /* presume we will find no mismatches in this check */
1835
        found_mismatch = FALSE;
1836
1837
        /* 
1838
         *   note whether this alternative is a direct circular reference
1839
         *   or not (it is directly circular if the first token points
1840
         *   back to this same production) 
1841
         */
1842
        is_circ = (alt_tok_cnt != 0
1843
                   && tokp->typ == VMGRAM_MATCH_PROD
1844
                   && tokp->typinfo.prod_obj == self);
1845
        
1846
        /* 
1847
         *   we don't have a production before the current item (this is
1848
         *   important because it determines whether we must consider the
1849
         *   current token or any following token when trying to match a
1850
         *   literal, part of speech, or token type) 
1851
         */
1852
        prod_before = FALSE;
1853
1854
        /* scan the tokens in the alternative */
1855
        for (cur_alt_tok = 0 ; cur_alt_tok < alt_tok_cnt ;
1856
             ++cur_alt_tok, ++tokp)
1857
        {
1858
            int match;
1859
            int match_star;
1860
            int ignore_item;
1861
            vm_val_t match_result;
1862
1863
            /* presume we won't find a match */
1864
            match = FALSE;
1865
            match_star = FALSE;
1866
1867
            /* presume we won't be able to ignore the item */
1868
            ignore_item = FALSE;
1869
1870
            /* 
1871
             *   Try to find a match to the current alternative item.  If
1872
             *   we've already found a mismatch, don't bother, since we need
1873
             *   no further evidence to skip this alternative; in this case
1874
             *   we're still looping over the alternative items simply to
1875
             *   find the beginning of the next alternative.  
1876
             */
1877
            for ( ; !found_mismatch ; ++tok_idx)
1878
            {
1879
                /* see what kind of item we have */
1880
                switch(tokp->typ)
1881
                {
1882
                case VMGRAM_MATCH_PROD:
1883
                    /* 
1884
                     *   We can't rule out a production without checking
1885
                     *   it fully, which we're not doing on this pass;
1886
                     *   however, note that we found a production, because
1887
                     *   it means that we must from now on try skipping
1888
                     *   any number of tokens before ruling out a match on
1889
                     *   other grounds, since the production could
1890
                     *   potentially match any number of tokens.
1891
                     *   
1892
                     *   If we're doing circular matches only, and this is
1893
                     *   a circular production, and we're on the first
1894
                     *   token in our list, we already know it matches, so
1895
                     *   don't even count it as a production before the
1896
                     *   item.  
1897
                     */
1898
                    if (is_circ && circ_only && cur_alt_tok == 0)
1899
                    {
1900
                        /* 
1901
                         *   ignore the fact that it's a production, since
1902
                         *   we already know it matches - we don't want to
1903
                         *   be flexible about tokens following this item
1904
                         *   since we know the exact number that match it 
1905
                         */
1906
                    }
1907
                    else
1908
                    {
1909
                        /* note the production item */
1910
                        prod_before = TRUE;
1911
                    }
1912
1913
                    /* 
1914
                     *   we can ignore this item for the purposes of
1915
                     *   detecting a mismatch, because we can't tell
1916
                     *   during this superficial scan whether it matches 
1917
                     */
1918
                    ignore_item = TRUE;
1919
                    break;
1920
                
1921
                case VMGRAM_MATCH_SPEECH:
1922
                    /* we must match a part of speech */
1923
                    if (tok_idx < tok_cnt
1924
                        && find_prop_in_tok(&tok[tok_idx],
1925
                                            tokp->typinfo.speech_prop))
1926
                    {
1927
                        /* it's a match */
1928
                        match = TRUE;
1929
                    }
1930
                    break;
1931
1932
                case VMGRAM_MATCH_NSPEECH:
1933
                    /* multiple parts of speech */
1934
                    if (tok_idx < tok_cnt)
1935
                    {
1936
                        vm_prop_id_t *prop;
1937
                        size_t rem;
1938
1939
                        /* presume we won't find a match */
1940
                        match = FALSE;
1941
1942
                        /* check each item in the list */
1943
                        for (prop = tokp->typinfo.nspeech.props,
1944
                             rem = tokp->typinfo.nspeech.cnt ; rem != 0 ;
1945
                             --rem, ++prop)
1946
                        {
1947
                            /* if this one matches, we have a match */
1948
                            if (find_prop_in_tok(&tok[tok_idx], *prop))
1949
                            {
1950
                                /* note the match */
1951
                                match = TRUE;
1952
1953
                                /* no need to look any further */
1954
                                break;
1955
                            }
1956
                        }
1957
                    }
1958
                    else
1959
                    {
1960
                        /* we're out of tokens, so we have no match */
1961
                        match = FALSE;
1962
                    }
1963
                    break;
1964
                    
1965
                case VMGRAM_MATCH_LITERAL:
1966
                    /* 
1967
                     *   Match a literal.  Compare the hash values first,
1968
                     *   since a negative match on the hash values will tell
1969
                     *   us for sure that the text won't match; since not
1970
                     *   matching is by far the most common case, this saves
1971
                     *   us the work of doing full string comparisons most of
1972
                     *   the time. 
1973
                     */
1974
                    if (tok_idx < tok_cnt
1975
                        && tok[tok_idx].txt_ != 0
1976
                        && tokp->typinfo.lit.hash == tok[tok_idx].hash_
1977
                        && tok_equals_lit(vmg_ &tok[tok_idx],
1978
                                          tokp->typinfo.lit.str,
1979
                                          tokp->typinfo.lit.len,
1980
                                          dict, &match_result))
1981
                    {
1982
                        /* it's a match */
1983
                        match = TRUE;
1984
                    }
1985
                    break;
1986
                
1987
                case VMGRAM_MATCH_TOKTYPE:
1988
                    /* we must match a token type */
1989
                    if (tok_idx < tok_cnt
1990
                        && tok[tok_idx].typ_ == tokp->typinfo.toktyp_enum)
1991
                        match = TRUE;
1992
                    break;
1993
1994
                case VMGRAM_MATCH_STAR:
1995
                    /* consume everything remaining on the line */
1996
                    tok_idx = tok_cnt;
1997
1998
                    /* this is a match */
1999
                    match = TRUE;
2000
                    match_star = TRUE;
2001
                    break;
2002
                }
2003
2004
                /* if we found a match, we can stop scanning now */
2005
                if (match)
2006
                    break;
2007
2008
                /* 
2009
                 *   we didn't match this token - if we have a production
2010
                 *   before this point, we must keep scanning token, since
2011
                 *   the production could end up matching any number of
2012
                 *   tokens; if we don't have a production item, though,
2013
                 *   we don't need to look any further, and we can rule
2014
                 *   out this alternative immediately 
2015
                 */
2016
                if (!prod_before)
2017
                    break;
2018
2019
                /* 
2020
                 *   if we're ignoring this item (because we can't decide
2021
                 *   now whether it's a match or not, and hence can't rule
2022
                 *   it out), don't scan any tokens for the item 
2023
                 */
2024
                if (ignore_item)
2025
                    break;
2026
2027
                /* 
2028
                 *   if we didn't find a match, and we're out of tokens,
2029
                 *   stop looking - we have no more tokens to look at to
2030
                 *   find a match 
2031
                 */
2032
                if (!match && tok_idx >= tok_cnt)
2033
                    break;
2034
            }
2035
2036
            /* check to see if we found a match */
2037
            if (match)
2038
            {
2039
                /* 
2040
                 *   we found a match - skip the current token (unless we
2041
                 *   matched a '*' item, in which case we've already
2042
                 *   consumed all remaining tokens) 
2043
                 */
2044
                if (!match_star && tok_idx < tok_cnt)
2045
                    ++tok_idx;
2046
            }
2047
            else if (!ignore_item)
2048
            {
2049
                /* 
2050
                 *   we didn't match, and we can't ignore this item - note
2051
                 *   that the alternative does not match 
2052
                 */
2053
                found_mismatch = TRUE;
2054
2055
                /* finish up with this alternative and proceed to the next */
2056
                break;
2057
            }
2058
        }
2059
2060
        /* 
2061
         *   If we didn't find a reason to rule out this alternative,
2062
         *   enqueue the alternative.  Never enqueue circular references,
2063
         *   unless we're ONLY enqueuing circular references.
2064
         */
2065
        if (!found_mismatch
2066
            && ((!circ_only && !is_circ)
2067
                || (circ_only && is_circ)))
2068
        {
2069
            CVmGramProdState *state;
2070
            
2071
            /* create and enqueue the new state */
2072
            state = enqueue_new_state(mem, start_tok_pos, enclosing_state,
2073
                                      altp, self, &need_to_clone, queues,
2074
                                      has_circ);
2075
2076
            /* 
2077
             *   if we are enqueuing circular references, we already have
2078
             *   a match for the first (circular) token, so fill it in 
2079
             */
2080
            if (circ_only)
2081
            {
2082
                CVmGramProdMatch *match;
2083
                
2084
                /* create a match for the alternative */
2085
                match = CVmGramProdMatch::
2086
                        alloc(mem, 0, 0, FALSE, first_tokp->prop,
2087
                              circ_match->proc_obj_,
2088
                              circ_match->sub_match_list_,
2089
                              circ_match->sub_match_cnt_);
2090
2091
                /* set the first match in the token */
2092
                state->match_list_[0] = match;
2093
2094
                /* set up at the second token */
2095
                state->alt_pos_ = 1;
2096
            }
2097
        }
2098
    }
2099
}
2100
2101
/*
2102
 *   Cache hash values for all of our literal tokens
2103
 */
2104
void CVmObjGramProd::cache_hashes(VMG_ CVmObjDict *dict)
2105
{
2106
    vm_obj_id_t comparator;
2107
    vmgram_alt_info *alt;
2108
    size_t i;
2109
2110
    /* get the comparator */
2111
    comparator = (dict != 0 ? dict->get_comparator() : VM_INVALID_OBJ);
2112
2113
    /* run through our alternatives */
2114
    for (i = get_ext()->alt_cnt_, alt = get_ext()->alts_ ;
2115
         i != 0 ; --i, ++alt)
2116
    {
2117
        vmgram_tok_info *tok;
2118
        size_t j;
2119
2120
        /* run through our tokens */
2121
        for (j = alt->tok_cnt, tok = alt->toks ; j != 0 ; --j, ++tok)
2122
        {
2123
            /* if this is a literal token, calculate its hash value */
2124
            if (tok->typ == VMGRAM_MATCH_LITERAL)
2125
            {
2126
                /* calculate and store our hash value */
2127
                tok->typinfo.lit.hash = calc_str_hash(
2128
                    vmg_ dict, 0, tok->typinfo.lit.str, tok->typinfo.lit.len);
2129
            }
2130
        }
2131
    }
2132
2133
    /* note that we've cached hash values, and note the comparator we used */
2134
    get_ext()->hashes_cached_ = TRUE;
2135
    get_ext()->comparator_ = comparator;
2136
}
2137
2138
/*
2139
 *   Create and enqueue a new state 
2140
 */
2141
CVmGramProdState *CVmObjGramProd::
2142
   enqueue_new_state(CVmGramProdMem *mem,
2143
                     size_t start_tok_pos,
2144
                     CVmGramProdState *enclosing_state,
2145
                     const vmgram_alt_info *altp, vm_obj_id_t self,
2146
                     int *need_to_clone, CVmGramProdQueue *queues,
2147
                     int circular_alt)
2148
{
2149
    CVmGramProdState *state;
2150
2151
    /* create the new state */
2152
    state = create_new_state(mem, start_tok_pos, enclosing_state,
2153
                             altp, self, need_to_clone, circular_alt);
2154
    
2155
    /* 
2156
     *   Add the item to the appropriate queue.  If the item has an
2157
     *   associated badness, add it to the badness queue.  Otherwise, add
2158
     *   it to the work queue.  
2159
     */
2160
    if (altp->badness != 0)
2161
    {
2162
        /* 
2163
         *   we have a badness rating - add it to the badness queue, since
2164
         *   we don't want to process it until we entirely exhaust better
2165
         *   possibilities 
2166
         */
2167
        state->nxt_ = queues->badness_queue_;
2168
        queues->badness_queue_ = state;
2169
    }
2170
    else
2171
    {
2172
        /* enqueue the state */
2173
        enqueue_state(state, queues);
2174
    }
2175
2176
    /* return the new state */
2177
    return state;
2178
}
2179
2180
/*
2181
 *   Create a new state 
2182
 */
2183
CVmGramProdState *CVmObjGramProd::
2184
   create_new_state(CVmGramProdMem *mem, size_t start_tok_pos,
2185
                    CVmGramProdState *enclosing_state,
2186
                    const vmgram_alt_info *altp, vm_obj_id_t self,
2187
                    int *need_to_clone, int circular_alt)
2188
{
2189
    /* 
2190
     *   if necessary, clone the enclosing state; we need to do this if we
2191
     *   enqueue more than one nested alternative, since each nested
2192
     *   alternative could parse the rest of the token list differently
2193
     *   and thus provide a different result to the enclosing state 
2194
     */
2195
    if (*need_to_clone && enclosing_state != 0)
2196
        enclosing_state = enclosing_state->clone(mem);
2197
2198
    /* 
2199
     *   we will need to clone the enclosing state if we create another
2200
     *   alternative after this one 
2201
     */
2202
    *need_to_clone = TRUE;
2203
2204
    /* create a new state object for the alternative */
2205
    return CVmGramProdState::alloc(mem, start_tok_pos, 0,
2206
                                   enclosing_state, altp, self,
2207
                                   circular_alt);
2208
}
2209
2210
/*
2211
 *   Enqueue a state in the work queue
2212
 */
2213
void CVmObjGramProd::enqueue_state(CVmGramProdState *state,
2214
                                   CVmGramProdQueue *queues)
2215
{
2216
    /* add it to the work queue */
2217
    state->nxt_ = queues->work_queue_;
2218
    queues->work_queue_ = state;
2219
}
2220
2221
/*
2222
 *   Determine if a token from the input matches a literal token, allowing
2223
 *   the input token to be a truncated version of the literal token if the
2224
 *   given truncation length is non-zero. 
2225
 */
2226
int CVmObjGramProd::tok_equals_lit(VMG_ const struct vmgramprod_tok *tok,
2227
                                   const char *lit, size_t lit_len,
2228
                                   CVmObjDict *dict, vm_val_t *result_val)
2229
{
2230
    /* 
2231
     *   if there's a dictionary, ask it to do the comparison; otherwise,
2232
     *   use an exact literal match 
2233
     */
2234
    if (dict != 0)
2235
    {
2236
        /* ask the dictionary for the comparison */
2237
        return dict->match_strings(vmg_ &tok->val_, tok->txt_, tok->len_,
2238
                                   lit, lit_len, result_val);
2239
    }
2240
    else
2241
    {
2242
        /* perform an exact comparison */
2243
        result_val->set_int(tok->len_ == lit_len
2244
                            && memcmp(tok->txt_, lit, lit_len) == 0);
2245
        return result_val->val.intval;
2246
    }
2247
}
2248
2249
/*
2250
 *   Calculate the hash value for a literal token, using our dictionary's
2251
 *   comparator.  
2252
 */
2253
unsigned int CVmObjGramProd::calc_str_hash(VMG_ CVmObjDict *dict,
2254
                                           const vm_val_t *strval,
2255
                                           const char *str, size_t len)
2256
{
2257
    /* 
2258
     *   if we have a dictionary, ask it to ask it for the hash value, as the
2259
     *   dictionary will ask its comparator to do the work; otherwise,
2260
     *   calculate our own hash value 
2261
     */
2262
    if (dict != 0)
2263
    {
2264
        /* 
2265
         *   We're doing comparisons through the dictionary's comparator, so
2266
         *   we must calculate hash values using the comparator as well to
2267
         *   keep the hash and match operations consistent.  
2268
         */
2269
        return dict->calc_str_hash(vmg_ strval, str, len);
2270
    }
2271
    else
2272
    {
2273
        uint hash;
2274
2275
        /*
2276
         *   We don't have a dictionary, so we're doing our own string
2277
         *   comparisons, which we do as exact byte-for-byte matches.  We can
2278
         *   thus calculate an arbitrary hash value of our own devising.  
2279
         */
2280
        for (hash = 0 ; len != 0 ; ++str, --len)
2281
            hash += (unsigned char)*str;
2282
2283
        /* return the hash value we calculated */
2284
        return hash;
2285
    }
2286
}
2287
2288
2289
/*
2290
 *   Find a property in a token's list.  Returns true if the property is
2291
 *   there, false if not. 
2292
 */
2293
int CVmObjGramProd::find_prop_in_tok(const vmgramprod_tok *tok,
2294
                                     vm_prop_id_t prop)
2295
{
2296
    size_t i;
2297
    const vmgram_match_info *p;
2298
2299
    /* search for the property */
2300
    for (i = tok->match_cnt_, p = tok->matches_ ; i != 0 ; --i, ++p)
2301
    {
2302
        /* if this is the one we're looking for, return success */
2303
        if (p->prop == prop)
2304
            return TRUE;
2305
    }
2306
2307
    /* we didn't find it */
2308
    return FALSE;
2309
}
2310
2311
/*
2312
 *   Get the next token item after the given item in an alternative
2313
 */
2314
const char *CVmObjGramProd::get_next_alt_tok(const char *tokp)
2315
{
2316
    switch(vmgram_tok_type(tokp))
2317
    {
2318
    case VMGRAM_MATCH_PROD:
2319
        return tokp + VMGRAM_TOK_PROD_SIZE;
2320
        
2321
    case VMGRAM_MATCH_SPEECH:
2322
        return tokp + VMGRAM_TOK_SPEECH_SIZE;
2323
2324
    case VMGRAM_MATCH_NSPEECH:
2325
        return tokp + VMGRAM_TOK_NSPEECH_SIZE(tokp);
2326
2327
    case VMGRAM_MATCH_LITERAL:
2328
        return tokp + VMGRAM_TOK_LIT_SIZE(tokp);
2329
2330
    case VMGRAM_MATCH_TOKTYPE:
2331
        return tokp + VMGRAM_TOK_TYPE_SIZE;
2332
2333
    case VMGRAM_MATCH_STAR:
2334
        return tokp + VMGRAM_TOK_STAR_SIZE;
2335
2336
    default:
2337
        err_throw(VMERR_INVAL_METACLASS_DATA);
2338
        AFTER_ERR_THROW(return 0;)
2339
    }
2340
}
2341
2342
/* ------------------------------------------------------------------------ */
2343
/*
2344
 *   Execute the getGrammarInfo() method - builds and returns an information
2345
 *   structure describing this grammar production object.  
2346
 */
2347
int CVmObjGramProd::getp_get_gram_info(VMG_ vm_obj_id_t self,
2348
                                       vm_val_t *retval, uint *argc)
2349
{
2350
    vm_obj_id_t alt_lst_id;
2351
    CVmObjList *alt_lst;
2352
    size_t i;
2353
    vmgram_alt_info *altp;
2354
    static CVmNativeCodeDesc desc(0);
2355
2356
    /* check arguments */
2357
    if (get_prop_check_argc(retval, argc, &desc))
2358
        return TRUE;
2359
2360
    /*
2361
     *   This routine uses considerable extra stack in the course of its
2362
     *   work, so check in advance that we have enough space to run.  
2363
     */
2364
    if (!G_stk->check_space(8))
2365
        err_throw(VMERR_STACK_OVERFLOW);
2366
2367
    /* create a list to hold the rule alternatives */
2368
    alt_lst_id = CVmObjList::create(vmg_ FALSE, get_ext()->alt_cnt_);
2369
    alt_lst = (CVmObjList *)vm_objp(vmg_ alt_lst_id);
2370
2371
    /* push it onto the stack for protection from garbage collection */
2372
    G_stk->push()->set_obj(alt_lst_id);
2373
2374
    /* create the rule-alternative descriptions */
2375
    for (i = 0, altp = get_ext()->alts_ ; i < get_ext()->alt_cnt_ ;
2376
         ++i, ++altp)
2377
    {
2378
        size_t j;
2379
        vmgram_tok_info *tokp;
2380
        vm_val_t alt_obj_val;
2381
        vm_obj_id_t tok_lst_id;
2382
        CVmObjList *tok_lst;
2383
2384
        /* 
2385
         *   if our imported GrammarAltInfo class isn't defined, we can't
2386
         *   provide rule-alternative information, so just fill in the list
2387
         *   with a nil 
2388
         */
2389
        if (G_predef->gramprod_gram_alt_info == VM_INVALID_OBJ)
2390
        {
2391
            /* add a nil to the list */
2392
            alt_obj_val.set_nil();
2393
            alt_lst->cons_set_element(i, &alt_obj_val);
2394
2395
            /* done with this slot */
2396
            continue;
2397
        }
2398
2399
        /* create a list to hold the token descriptors */
2400
        tok_lst_id = CVmObjList::create(vmg_ FALSE, altp->tok_cnt);
2401
        tok_lst = (CVmObjList *)vm_objp(vmg_ tok_lst_id);
2402
2403
        /* push this momentarily for gc protection */
2404
        G_stk->push()->set_obj(tok_lst_id);
2405
2406
        /*
2407
         *   Run through the tokens in the rule alternative, and build a list
2408
         *   of token descriptor structures. 
2409
         */
2410
        for (j = 0, tokp = altp->toks ; j < altp->tok_cnt ; ++j, ++tokp)
2411
        {
2412
            vm_val_t tok_obj_val;
2413
2414
            /* 
2415
             *   if our imported GrammarAltTokInfo class isn't defined, we
2416
             *   can't provide token information, so just fill in the list
2417
             *   with a nil 
2418
             */
2419
            if (G_predef->gramprod_gram_alt_tok_info == VM_INVALID_OBJ)
2420
            {
2421
                /* add a nil to the list */
2422
                tok_obj_val.set_nil();
2423
                tok_lst->cons_set_element(j, &tok_obj_val);
2424
2425
                /* done with this slot */
2426
                continue;
2427
            }
2428
            
2429
            /* 
2430
             *   The constructor arguments are the target property ID, the
2431
             *   token type code, and extra information that depends on the
2432
             *   type code.  We have to push the arguments backwards per the
2433
             *   normal protocol, so start with the variant part.  
2434
             */
2435
            switch (tokp->typ)
2436
            {
2437
            case VMGRAM_MATCH_PROD:
2438
                /* the extra information is the sub-production object */
2439
                G_stk->push()->set_obj(tokp->typinfo.prod_obj);
2440
                break;
2441
2442
            case VMGRAM_MATCH_SPEECH:
2443
                /* the extra info is the part-of-speech property */
2444
                G_stk->push()->set_propid(tokp->typinfo.speech_prop);
2445
                break;
2446
2447
            case VMGRAM_MATCH_NSPEECH:
2448
                /* the extra info is a list of part-of-speech properties */
2449
                {
2450
                    vm_obj_id_t pos_lst_id;
2451
                    CVmObjList *pos_lst;
2452
                    vm_prop_id_t *pp;
2453
                    size_t k;
2454
2455
                    /* create the property list */
2456
                    pos_lst_id = CVmObjList::create(
2457
                        vmg_ FALSE, tokp->typinfo.nspeech.cnt);
2458
                    pos_lst = (CVmObjList *)vm_objp(vmg_ pos_lst_id);
2459
2460
                    /* push it */
2461
                    G_stk->push()->set_obj(pos_lst_id);
2462
2463
                    /* populate it */
2464
                    for (k = 0, pp = tokp->typinfo.nspeech.props ;
2465
                         k < tokp->typinfo.nspeech.cnt ;
2466
                         ++k, ++pp)
2467
                    {
2468
                        vm_val_t val;
2469
                        
2470
                        /* fill in this element */
2471
                        val.set_propid(*pp);
2472
                        pos_lst->cons_set_element(k, &val);
2473
                    }
2474
                }
2475
                break;
2476
2477
            case VMGRAM_MATCH_LITERAL:
2478
                /* the extra info is the literal string value */
2479
                G_stk->push()->set_obj(CVmObjString::create(
2480
                    vmg_ FALSE, tokp->typinfo.lit.str,
2481
                    tokp->typinfo.lit.len));
2482
                break;
2483
2484
            case VMGRAM_MATCH_TOKTYPE:
2485
                /* the extra information is the token class enum */
2486
                G_stk->push()->set_enum(tokp->typinfo.toktyp_enum);
2487
                break;
2488
2489
            case VMGRAM_MATCH_STAR:
2490
                /* 
2491
                 *   for a 'star' type, there is no extra information, but
2492
                 *   push nil to keep the argument list uniform 
2493
                 */
2494
                G_stk->push()->set_nil();
2495
                break;
2496
2497
            default:
2498
                assert(FALSE);
2499
                break;
2500
            }
2501
2502
            /* now push the target property and type code */
2503
            G_stk->push()->set_int(tokp->typ);
2504
            if (tokp->prop != VM_INVALID_PROP)
2505
                G_stk->push()->set_propid(tokp->prop);
2506
            else
2507
                G_stk->push()->set_nil();
2508
2509
            /* 
2510
             *   the first argument to the constructor is the base class,
2511
             *   which is the imported GrammarAltTokInfo class object 
2512
             */
2513
            G_stk->push()->set_obj(G_predef->gramprod_gram_alt_tok_info);
2514
2515
            /* create the object, at long last */
2516
            tok_obj_val.set_obj(CVmObjTads::create_from_stack(vmg_ 0, 4));
2517
2518
            /* add it to our token list */
2519
            tok_lst->cons_set_element(j, &tok_obj_val);
2520
        }
2521
2522
        /* 
2523
         *   Create the GrammarAltInfo object to desribe the alternative.
2524
         *   Pass in the score, badness, match object, and token list as
2525
         *   arguments to the constructor, which we'll count on to stash the
2526
         *   information away in the new object.
2527
         *   
2528
         *   Note that the first constructor argument is the superclass,
2529
         *   which is the imported GrammarAltInfo class object.  
2530
         */
2531
        G_stk->push()->set_obj(tok_lst_id);
2532
        G_stk->push()->set_obj(altp->proc_obj);
2533
        G_stk->push()->set_int(altp->badness);
2534
        G_stk->push()->set_int(altp->score);
2535
        G_stk->push()->set_obj(G_predef->gramprod_gram_alt_info);
2536
        alt_obj_val.set_obj(CVmObjTads::create_from_stack(vmg_ 0, 5));
2537
2538
        /* add the new GrammarAltInfo to the main alternative list */
2539
        alt_lst->cons_set_element(i, &alt_obj_val);
2540
2541
        /* 
2542
         *   It's safe to discard the gc protection for the token list now,
2543
         *   since a reference to is now stashed away in the GrammarAltInfo
2544
         *   object, which in turn is referenced from our main alternative
2545
         *   list, which we stashed on the stack earlier for gc protection.  
2546
         */
2547
        G_stk->discard();
2548
    }
2549
2550
    /* it's safe to discard our gc protection for the main list now */
2551
    G_stk->discard();
2552
2553
    /* return the main list */
2554
    retval->set_obj(alt_lst_id);
2555
2556
    /* successful evaluation */
2557
    return TRUE;
2558
}
2559