| | 1 | #ifdef RCSID |
| | 2 | static char RCSid[] = |
| | 3 | "$Header$"; |
| | 4 | #endif |
| | 5 | |
| | 6 | /* |
| | 7 | * Copyright (c) 2000, 2002 Michael J. Roberts. All Rights Reserved. |
| | 8 | * |
| | 9 | * Please see the accompanying license file, LICENSE.TXT, for information |
| | 10 | * on using and copying this software. |
| | 11 | */ |
| | 12 | /* |
| | 13 | Name |
| | 14 | vmgram.cpp - T3 Grammar Production metaclass |
| | 15 | Function |
| | 16 | |
| | 17 | Notes |
| | 18 | |
| | 19 | Modified |
| | 20 | 02/15/00 MJRoberts - Creation |
| | 21 | */ |
| | 22 | |
| | 23 | #include <stdlib.h> |
| | 24 | #include <string.h> |
| | 25 | #include <assert.h> |
| | 26 | |
| | 27 | #include "t3std.h" |
| | 28 | #include "vmtype.h" |
| | 29 | #include "vmstack.h" |
| | 30 | #include "vmlst.h" |
| | 31 | #include "vmstr.h" |
| | 32 | #include "vmgram.h" |
| | 33 | #include "vmerr.h" |
| | 34 | #include "vmerrnum.h" |
| | 35 | #include "vmmeta.h" |
| | 36 | #include "vmdict.h" |
| | 37 | #include "vmtobj.h" |
| | 38 | #include "vmpredef.h" |
| | 39 | |
| | 40 | |
| | 41 | /* ------------------------------------------------------------------------ */ |
| | 42 | /* |
| | 43 | * Grammar production object - memory pool. This is a simple |
| | 44 | * suballocator that obtains large blocks from the system allocator, |
| | 45 | * then hands out pieces of those blocks. Allocating is very cheap |
| | 46 | * (just a pointer increment in most cases), and we don't track |
| | 47 | * individual allocations and deletions but just throw away all |
| | 48 | * suballocated memory at once. |
| | 49 | */ |
| | 50 | |
| | 51 | /* size of each memory block */ |
| | 52 | const size_t VMGRAMPROD_MEM_BLOCK_SIZE = 16*1024; |
| | 53 | |
| | 54 | /* |
| | 55 | * memory pool block |
| | 56 | */ |
| | 57 | struct CVmGramProdMemBlock |
| | 58 | { |
| | 59 | /* next block in chain */ |
| | 60 | CVmGramProdMemBlock *nxt_; |
| | 61 | |
| | 62 | /* bytes of the block */ |
| | 63 | char buf_[VMGRAMPROD_MEM_BLOCK_SIZE]; |
| | 64 | }; |
| | 65 | |
| | 66 | /* |
| | 67 | * memory pool object |
| | 68 | */ |
| | 69 | class CVmGramProdMem |
| | 70 | { |
| | 71 | public: |
| | 72 | CVmGramProdMem() |
| | 73 | { |
| | 74 | /* no memory blocks yet */ |
| | 75 | block_head_ = 0; |
| | 76 | block_cur_ = 0; |
| | 77 | |
| | 78 | /* we don't yet have a block */ |
| | 79 | cur_rem_ = 0; |
| | 80 | cur_free_ = 0; |
| | 81 | } |
| | 82 | |
| | 83 | ~CVmGramProdMem() |
| | 84 | { |
| | 85 | /* delete each of our blocks */ |
| | 86 | while (block_head_ != 0) |
| | 87 | { |
| | 88 | CVmGramProdMemBlock *nxt; |
| | 89 | |
| | 90 | /* remember the next block */ |
| | 91 | nxt = block_head_->nxt_; |
| | 92 | |
| | 93 | /* delete this block */ |
| | 94 | t3free(block_head_); |
| | 95 | |
| | 96 | /* move on to the next */ |
| | 97 | block_head_ = nxt; |
| | 98 | } |
| | 99 | } |
| | 100 | |
| | 101 | /* reset - delete all previously sub-allocated memory */ |
| | 102 | void reset() |
| | 103 | { |
| | 104 | /* start over with the first block */ |
| | 105 | block_cur_ = block_head_; |
| | 106 | |
| | 107 | /* initialize the free pointer for the new block, if we have one */ |
| | 108 | if (block_cur_ != 0) |
| | 109 | { |
| | 110 | /* start at the beginning of the new block */ |
| | 111 | cur_free_ = block_cur_->buf_; |
| | 112 | |
| | 113 | /* this entire block is available */ |
| | 114 | cur_rem_ = VMGRAMPROD_MEM_BLOCK_SIZE; |
| | 115 | } |
| | 116 | else |
| | 117 | { |
| | 118 | /* there are no blocks, so there's no memory available yet */ |
| | 119 | cur_rem_ = 0; |
| | 120 | cur_free_ = 0; |
| | 121 | } |
| | 122 | } |
| | 123 | |
| | 124 | /* allocate memory */ |
| | 125 | void *alloc(size_t siz) |
| | 126 | { |
| | 127 | void *ret; |
| | 128 | |
| | 129 | /* round the size to the local hardware boundary */ |
| | 130 | siz = osrndsz(siz); |
| | 131 | |
| | 132 | /* if it exceeds the block size, fail */ |
| | 133 | if (siz > VMGRAMPROD_MEM_BLOCK_SIZE) |
| | 134 | return 0; |
| | 135 | |
| | 136 | /* if we don't have enough space in this block, go to the next */ |
| | 137 | if (siz > cur_rem_) |
| | 138 | { |
| | 139 | /* allocate a new block if necessary */ |
| | 140 | if (block_cur_ == 0) |
| | 141 | { |
| | 142 | /* there's nothing in the list yet - set up the first block */ |
| | 143 | block_head_ = (CVmGramProdMemBlock *) |
| | 144 | t3malloc(sizeof(CVmGramProdMemBlock)); |
| | 145 | |
| | 146 | /* activate the new block */ |
| | 147 | block_cur_ = block_head_; |
| | 148 | |
| | 149 | /* there's nothing after this block */ |
| | 150 | block_cur_->nxt_ = 0; |
| | 151 | } |
| | 152 | else if (block_cur_->nxt_ == 0) |
| | 153 | { |
| | 154 | /* we're at the end of the list - add a block */ |
| | 155 | block_cur_->nxt_ = (CVmGramProdMemBlock *) |
| | 156 | t3malloc(sizeof(CVmGramProdMemBlock)); |
| | 157 | |
| | 158 | /* advance to the new block */ |
| | 159 | block_cur_ = block_cur_->nxt_; |
| | 160 | |
| | 161 | /* the new block is the last in the chain */ |
| | 162 | block_cur_->nxt_ = 0; |
| | 163 | } |
| | 164 | else |
| | 165 | { |
| | 166 | /* another block follows - advance to it */ |
| | 167 | block_cur_ = block_cur_->nxt_; |
| | 168 | } |
| | 169 | |
| | 170 | /* start at the beginning of the new block */ |
| | 171 | cur_free_ = block_cur_->buf_; |
| | 172 | |
| | 173 | /* this entire block is available */ |
| | 174 | cur_rem_ = VMGRAMPROD_MEM_BLOCK_SIZE; |
| | 175 | } |
| | 176 | |
| | 177 | /* the return value will be the current free pointer */ |
| | 178 | ret = cur_free_; |
| | 179 | |
| | 180 | /* consume the space */ |
| | 181 | cur_free_ += siz; |
| | 182 | cur_rem_ -= siz; |
| | 183 | |
| | 184 | /* return the block location */ |
| | 185 | return ret; |
| | 186 | } |
| | 187 | |
| | 188 | private: |
| | 189 | /* head of block list */ |
| | 190 | CVmGramProdMemBlock *block_head_; |
| | 191 | |
| | 192 | /* block we're currently suballocating out of */ |
| | 193 | CVmGramProdMemBlock *block_cur_; |
| | 194 | |
| | 195 | /* current free pointer in current block */ |
| | 196 | char *cur_free_; |
| | 197 | |
| | 198 | /* amount of space remaining in current block */ |
| | 199 | size_t cur_rem_; |
| | 200 | }; |
| | 201 | |
| | 202 | /* ------------------------------------------------------------------------ */ |
| | 203 | /* |
| | 204 | * Input token array entry. We make a private copy of the input token list |
| | 205 | * (i.e., the token list we're trying to match to our grammar rules) during |
| | 206 | * parsing using one of these structures for each token. |
| | 207 | */ |
| | 208 | struct vmgramprod_tok |
| | 209 | { |
| | 210 | /* original token value */ |
| | 211 | vm_val_t val_; |
| | 212 | |
| | 213 | /* token type */ |
| | 214 | ulong typ_; |
| | 215 | |
| | 216 | /* token text */ |
| | 217 | const char *txt_; |
| | 218 | |
| | 219 | /* length of the token text */ |
| | 220 | size_t len_; |
| | 221 | |
| | 222 | /* hash value of the token */ |
| | 223 | unsigned int hash_; |
| | 224 | |
| | 225 | /* number of vocabulary matches associated with the word */ |
| | 226 | size_t match_cnt_; |
| | 227 | |
| | 228 | /* pointer to list of matches for the word */ |
| | 229 | vmgram_match_info *matches_; |
| | 230 | }; |
| | 231 | |
| | 232 | |
| | 233 | /* ------------------------------------------------------------------------ */ |
| | 234 | /* |
| | 235 | * Parsing state match object. A match can be terminal, in which case |
| | 236 | * it refers to a single token of input, or it can be a production, in |
| | 237 | * which case it refers to a subtree of match objects. |
| | 238 | */ |
| | 239 | struct CVmGramProdMatch |
| | 240 | { |
| | 241 | /* allocate */ |
| | 242 | static CVmGramProdMatch *alloc(CVmGramProdMem *mem, size_t tok_pos, |
| | 243 | const vm_val_t *tok_match_result, |
| | 244 | int matched_star, |
| | 245 | vm_prop_id_t target_prop, |
| | 246 | vm_obj_id_t proc_obj, |
| | 247 | const CVmGramProdMatch **sub_match_list, |
| | 248 | size_t sub_match_cnt) |
| | 249 | { |
| | 250 | CVmGramProdMatch *match; |
| | 251 | |
| | 252 | /* allocate space */ |
| | 253 | match = (CVmGramProdMatch *)mem->alloc(sizeof(CVmGramProdMatch)); |
| | 254 | |
| | 255 | /* initialize */ |
| | 256 | match->tok_pos_ = tok_pos; |
| | 257 | match->matched_star_ = matched_star; |
| | 258 | match->target_prop_ = target_prop; |
| | 259 | match->proc_obj_ = proc_obj; |
| | 260 | match->sub_match_list_ = sub_match_list; |
| | 261 | match->sub_match_cnt_ = sub_match_cnt; |
| | 262 | |
| | 263 | /* save the token match value, if one was given */ |
| | 264 | if (tok_match_result != 0) |
| | 265 | match->tok_match_result_ = *tok_match_result; |
| | 266 | else |
| | 267 | match->tok_match_result_.set_nil(); |
| | 268 | |
| | 269 | /* return the new match */ |
| | 270 | return match; |
| | 271 | } |
| | 272 | |
| | 273 | /* property ID with which this match is associated */ |
| | 274 | vm_prop_id_t target_prop_; |
| | 275 | |
| | 276 | /* |
| | 277 | * token position of the matching token; ignored if the production |
| | 278 | * subtree information is valid |
| | 279 | */ |
| | 280 | size_t tok_pos_; |
| | 281 | |
| | 282 | /* |
| | 283 | * The token match value. This is the result code from the Dictionary |
| | 284 | * comparator's matchValues() function, which typically encodes |
| | 285 | * information about how the value matched (truncation, case folding, |
| | 286 | * accent approximation, etc) in bitwise flags. |
| | 287 | */ |
| | 288 | vm_val_t tok_match_result_; |
| | 289 | |
| | 290 | /* |
| | 291 | * flag: this is a '*' token in the rule; these don't really match any |
| | 292 | * tokens at all, since they merely serve to stop parsing regardless |
| | 293 | * of whether or not more input tokens are present |
| | 294 | */ |
| | 295 | int matched_star_; |
| | 296 | |
| | 297 | /* |
| | 298 | * object ID of match processor - if this is valid, this match |
| | 299 | * refers to a production; otherwise, the match refers to a single |
| | 300 | * token |
| | 301 | */ |
| | 302 | vm_obj_id_t proc_obj_; |
| | 303 | |
| | 304 | /* pointer to array of sub-matches */ |
| | 305 | const CVmGramProdMatch **sub_match_list_; |
| | 306 | |
| | 307 | /* number of sub-match entries */ |
| | 308 | size_t sub_match_cnt_; |
| | 309 | }; |
| | 310 | |
| | 311 | /* |
| | 312 | * Top-level match list holder |
| | 313 | */ |
| | 314 | struct CVmGramProdMatchEntry |
| | 315 | { |
| | 316 | /* the match tree */ |
| | 317 | CVmGramProdMatch *match_; |
| | 318 | |
| | 319 | /* token position at the end of the match */ |
| | 320 | size_t tok_pos_; |
| | 321 | |
| | 322 | /* next entry in the list */ |
| | 323 | CVmGramProdMatchEntry *nxt_; |
| | 324 | }; |
| | 325 | |
| | 326 | /* |
| | 327 | * Parsing state object. At any given time, we will have one or more of |
| | 328 | * these state objects in our processing queue. A state object tracks a |
| | 329 | * parsing position - the token position, the alternative position, the |
| | 330 | * match list so far for the alternative (i.e., the match for each item |
| | 331 | * in the alternative up to but not including the current position), and |
| | 332 | * the enclosing parsing state object (i.e., the parsing state that |
| | 333 | * "recursed" into this alternative). |
| | 334 | */ |
| | 335 | struct CVmGramProdState |
| | 336 | { |
| | 337 | /* allocate */ |
| | 338 | static CVmGramProdState *alloc(CVmGramProdMem *mem, |
| | 339 | int tok_pos, int alt_pos, |
| | 340 | CVmGramProdState *enclosing, |
| | 341 | const vmgram_alt_info *altp, |
| | 342 | vm_obj_id_t prod_obj, |
| | 343 | int circular_alt) |
| | 344 | { |
| | 345 | size_t match_cnt; |
| | 346 | CVmGramProdState *state; |
| | 347 | |
| | 348 | /* get the number of match list slots we'll need */ |
| | 349 | match_cnt = altp->tok_cnt; |
| | 350 | |
| | 351 | /* allocate space, including space for the match list */ |
| | 352 | state = (CVmGramProdState *) |
| | 353 | mem->alloc(sizeof(CVmGramProdState) |
| | 354 | + (match_cnt - 1)*sizeof(CVmGramProdMatch *)); |
| | 355 | |
| | 356 | /* initialize */ |
| | 357 | state->init(tok_pos, alt_pos, enclosing, altp, prod_obj, |
| | 358 | circular_alt); |
| | 359 | |
| | 360 | /* return the new item */ |
| | 361 | return state; |
| | 362 | } |
| | 363 | |
| | 364 | /* initialize */ |
| | 365 | void init(int tok_pos, int alt_pos, CVmGramProdState *enclosing, |
| | 366 | const vmgram_alt_info *altp, vm_obj_id_t prod_obj, |
| | 367 | int circular_alt) |
| | 368 | { |
| | 369 | /* remember the position parameters */ |
| | 370 | tok_pos_ = tok_pos; |
| | 371 | alt_pos_ = alt_pos; |
| | 372 | |
| | 373 | /* we haven't matched a '*' token yet */ |
| | 374 | matched_star_ = FALSE; |
| | 375 | |
| | 376 | /* remember our stack position */ |
| | 377 | enclosing_ = enclosing; |
| | 378 | |
| | 379 | /* remember our alternative image data pointer */ |
| | 380 | altp_ = altp; |
| | 381 | |
| | 382 | /* remember the production object */ |
| | 383 | prod_obj_ = prod_obj; |
| | 384 | |
| | 385 | /* note if we had circular alternatives in the same production */ |
| | 386 | circular_alt_ = circular_alt; |
| | 387 | |
| | 388 | /* no target property for sub-production yet */ |
| | 389 | sub_target_prop_ = VM_INVALID_PROP; |
| | 390 | |
| | 391 | /* we're not in the queue yet */ |
| | 392 | nxt_ = 0; |
| | 393 | } |
| | 394 | |
| | 395 | /* clone the state */ |
| | 396 | CVmGramProdState *clone(CVmGramProdMem *mem) |
| | 397 | { |
| | 398 | CVmGramProdState *new_state; |
| | 399 | CVmGramProdState *new_enclosing; |
| | 400 | |
| | 401 | /* clone our enclosing state */ |
| | 402 | new_enclosing = (enclosing_ != 0 ? enclosing_->clone(mem) : 0); |
| | 403 | |
| | 404 | /* create a new state object */ |
| | 405 | new_state = alloc(mem, tok_pos_, alt_pos_, new_enclosing, |
| | 406 | altp_, prod_obj_, circular_alt_); |
| | 407 | |
| | 408 | /* copy the target sub-production property */ |
| | 409 | new_state->sub_target_prop_ = sub_target_prop_; |
| | 410 | |
| | 411 | /* copy the valid portion of my match list */ |
| | 412 | if (alt_pos_ != 0) |
| | 413 | memcpy(new_state->match_list_, match_list_, |
| | 414 | alt_pos_ * sizeof(match_list_[0])); |
| | 415 | |
| | 416 | /* we're not yet in any queue */ |
| | 417 | new_state->nxt_ = 0; |
| | 418 | |
| | 419 | /* set the '*' flag in the cloned state */ |
| | 420 | new_state->matched_star_ = matched_star_; |
| | 421 | |
| | 422 | /* return the clone */ |
| | 423 | return new_state; |
| | 424 | } |
| | 425 | |
| | 426 | /* current token position, as an index into the token list */ |
| | 427 | size_t tok_pos_; |
| | 428 | |
| | 429 | /* |
| | 430 | * flag: we've matched a '*' token in a production or subproduction, |
| | 431 | * so we should consider all remaining tokens to have been consumed |
| | 432 | * (we don't advance tok_pos_ past all of the tokens, because we |
| | 433 | * separately want to keep track of the number of tokens we matched |
| | 434 | * for everything up to the '*') |
| | 435 | */ |
| | 436 | int matched_star_; |
| | 437 | |
| | 438 | /* |
| | 439 | * current alternative position, as an index into the alternative's |
| | 440 | * list of items |
| | 441 | */ |
| | 442 | size_t alt_pos_; |
| | 443 | |
| | 444 | /* |
| | 445 | * the enclosing state object - this is the state object that |
| | 446 | * "recursed" to our position |
| | 447 | */ |
| | 448 | CVmGramProdState *enclosing_; |
| | 449 | |
| | 450 | /* pointer to alternative definition */ |
| | 451 | const vmgram_alt_info *altp_; |
| | 452 | |
| | 453 | /* object ID of the production object */ |
| | 454 | vm_obj_id_t prod_obj_; |
| | 455 | |
| | 456 | /* the target property for the pending sub-production */ |
| | 457 | vm_prop_id_t sub_target_prop_; |
| | 458 | |
| | 459 | /* next production state in the processing queue */ |
| | 460 | CVmGramProdState *nxt_; |
| | 461 | |
| | 462 | /* the production contains circular alternatives */ |
| | 463 | int circular_alt_; |
| | 464 | |
| | 465 | /* |
| | 466 | * Match list. This list is allocated with enough space for one |
| | 467 | * match for each of our items (the number of items is given by |
| | 468 | * vmgram_alt_tokcnt(altp_)). At any given time, these will be |
| | 469 | * valid up to but not including the current alt_pos_ index. |
| | 470 | */ |
| | 471 | const struct CVmGramProdMatch *match_list_[1]; |
| | 472 | }; |
| | 473 | |
| | 474 | /* |
| | 475 | * Queues. We maintain three queues: the primary work queue, which |
| | 476 | * contains pending parsing states that we're still attempting to match; |
| | 477 | * the success queue, which contains a list of match trees for |
| | 478 | * successfully completed matches; and the badness queue, which contains |
| | 479 | * a list of parsing states that we can try as fallbacks if we fail to |
| | 480 | * find a match in any of the work queue items. |
| | 481 | */ |
| | 482 | struct CVmGramProdQueue |
| | 483 | { |
| | 484 | /* clear the queues */ |
| | 485 | void clear() |
| | 486 | { |
| | 487 | work_queue_ = 0; |
| | 488 | badness_queue_ = 0; |
| | 489 | success_list_ = 0; |
| | 490 | } |
| | 491 | |
| | 492 | /* head of work queue */ |
| | 493 | CVmGramProdState *work_queue_; |
| | 494 | |
| | 495 | /* head of badness queue */ |
| | 496 | CVmGramProdState *badness_queue_; |
| | 497 | |
| | 498 | /* head of success list */ |
| | 499 | CVmGramProdMatchEntry *success_list_; |
| | 500 | }; |
| | 501 | |
| | 502 | /* ------------------------------------------------------------------------ */ |
| | 503 | /* |
| | 504 | * Grammar-Production object statics |
| | 505 | */ |
| | 506 | |
| | 507 | /* metaclass registration object */ |
| | 508 | static CVmMetaclassGramProd metaclass_reg_obj; |
| | 509 | CVmMetaclass *CVmObjGramProd::metaclass_reg_ = &metaclass_reg_obj; |
| | 510 | |
| | 511 | /* function table */ |
| | 512 | int (CVmObjGramProd:: |
| | 513 | *CVmObjGramProd::func_table_[])(VMG_ vm_obj_id_t self, |
| | 514 | vm_val_t *retval, uint *argc) = |
| | 515 | { |
| | 516 | &CVmObjGramProd::getp_undef, |
| | 517 | &CVmObjGramProd::getp_parse, |
| | 518 | &CVmObjGramProd::getp_get_gram_info |
| | 519 | }; |
| | 520 | |
| | 521 | /* ------------------------------------------------------------------------ */ |
| | 522 | /* |
| | 523 | * Grammar-Production metaclass implementation |
| | 524 | */ |
| | 525 | |
| | 526 | /* |
| | 527 | * create dynamically using stack arguments |
| | 528 | */ |
| | 529 | vm_obj_id_t CVmObjGramProd::create_from_stack(VMG_ const uchar **pc_ptr, |
| | 530 | uint argc) |
| | 531 | { |
| | 532 | /* this type of object cannot be dynamically created */ |
| | 533 | err_throw(VMERR_BAD_DYNAMIC_NEW); |
| | 534 | |
| | 535 | /* we won't reach this point, but the compiler might not know */ |
| | 536 | AFTER_ERR_THROW(return VM_INVALID_OBJ;) |
| | 537 | } |
| | 538 | |
| | 539 | /* |
| | 540 | * constructor |
| | 541 | */ |
| | 542 | CVmObjGramProd::CVmObjGramProd(VMG0_) |
| | 543 | { |
| | 544 | /* allocate our extension structure from the variable heap */ |
| | 545 | ext_ = (char *)G_mem->get_var_heap() |
| | 546 | ->alloc_mem(sizeof(vm_gram_ext), this); |
| | 547 | |
| | 548 | /* we have no image data yet */ |
| | 549 | get_ext()->image_data_ = 0; |
| | 550 | get_ext()->image_data_size_ = 0; |
| | 551 | |
| | 552 | /* we haven't set up our alternatives yet */ |
| | 553 | get_ext()->alts_ = 0; |
| | 554 | get_ext()->alt_cnt_ = 0; |
| | 555 | |
| | 556 | /* we haven't cached any hash values yet */ |
| | 557 | get_ext()->hashes_cached_ = FALSE; |
| | 558 | get_ext()->comparator_ = VM_INVALID_OBJ; |
| | 559 | |
| | 560 | /* presume we have no circular rules */ |
| | 561 | get_ext()->has_circular_alt = FALSE; |
| | 562 | |
| | 563 | /* create our memory pool */ |
| | 564 | get_ext()->mem_ = new CVmGramProdMem(); |
| | 565 | |
| | 566 | /* |
| | 567 | * allocate initial property enumeration space (we'll expand this as |
| | 568 | * needed, so the initial size is just a guess) |
| | 569 | */ |
| | 570 | get_ext()->prop_enum_max_ = 64; |
| | 571 | get_ext()->prop_enum_arr_ = (vmgram_match_info *) |
| | 572 | t3malloc(get_ext()->prop_enum_max_ |
| | 573 | * sizeof(vmgram_match_info)); |
| | 574 | } |
| | 575 | |
| | 576 | /* |
| | 577 | * notify of deletion |
| | 578 | */ |
| | 579 | void CVmObjGramProd::notify_delete(VMG_ int in_root_set) |
| | 580 | { |
| | 581 | /* free our additional data */ |
| | 582 | if (ext_ != 0) |
| | 583 | { |
| | 584 | /* |
| | 585 | * If we've allocated our alternatives, delete the memory. We |
| | 586 | * allocate the entire complex structure in one block, which we use |
| | 587 | * for the alts_ array, so we merely need to delete that one block. |
| | 588 | */ |
| | 589 | if (get_ext()->alts_ != 0) |
| | 590 | t3free(get_ext()->alts_); |
| | 591 | |
| | 592 | /* delete our memory pool */ |
| | 593 | delete get_ext()->mem_; |
| | 594 | |
| | 595 | /* delete our property enumeration space */ |
| | 596 | t3free(get_ext()->prop_enum_arr_); |
| | 597 | |
| | 598 | /* free the extension */ |
| | 599 | G_mem->get_var_heap()->free_mem(ext_); |
| | 600 | } |
| | 601 | } |
| | 602 | |
| | 603 | /* |
| | 604 | * get a property |
| | 605 | */ |
| | 606 | int CVmObjGramProd::get_prop(VMG_ vm_prop_id_t prop, vm_val_t *retval, |
| | 607 | vm_obj_id_t self, vm_obj_id_t *source_obj, |
| | 608 | uint *argc) |
| | 609 | { |
| | 610 | uint func_idx; |
| | 611 | |
| | 612 | /* translate the property index to an index into our function table */ |
| | 613 | func_idx = G_meta_table |
| | 614 | ->prop_to_vector_idx(metaclass_reg_->get_reg_idx(), prop); |
| | 615 | |
| | 616 | /* call the appropriate function */ |
| | 617 | if ((this->*func_table_[func_idx])(vmg_ self, retval, argc)) |
| | 618 | { |
| | 619 | *source_obj = metaclass_reg_->get_class_obj(vmg0_); |
| | 620 | return TRUE; |
| | 621 | } |
| | 622 | |
| | 623 | /* inherit default handling */ |
| | 624 | return CVmObject::get_prop(vmg_ prop, retval, self, source_obj, argc); |
| | 625 | } |
| | 626 | |
| | 627 | /* |
| | 628 | * set a property |
| | 629 | */ |
| | 630 | void CVmObjGramProd::set_prop(VMG_ class CVmUndo *undo, |
| | 631 | vm_obj_id_t self, vm_prop_id_t prop, |
| | 632 | const vm_val_t *val) |
| | 633 | { |
| | 634 | /* no settable properties - throw an error */ |
| | 635 | err_throw(VMERR_INVALID_SETPROP); |
| | 636 | } |
| | 637 | |
| | 638 | /* |
| | 639 | * load from an image file |
| | 640 | */ |
| | 641 | void CVmObjGramProd::load_from_image(VMG_ vm_obj_id_t self, |
| | 642 | const char *ptr, size_t siz) |
| | 643 | { |
| | 644 | const char *p; |
| | 645 | size_t alt_cnt; |
| | 646 | size_t tok_cnt; |
| | 647 | size_t i; |
| | 648 | size_t alo_siz; |
| | 649 | vmgram_alt_info *next_alt; |
| | 650 | vmgram_tok_info *next_tok; |
| | 651 | char *next_byte; |
| | 652 | |
| | 653 | /* remember where our image data comes from */ |
| | 654 | get_ext()->image_data_ = ptr; |
| | 655 | get_ext()->image_data_size_ = siz; |
| | 656 | |
| | 657 | /* |
| | 658 | * For our alternatives structure, start with the base amount of |
| | 659 | * memory, which is the amount we'll need for the array of alternative |
| | 660 | * entries themselves. The first UINT2 gives the number of |
| | 661 | * alternatives in our grammar rule. |
| | 662 | */ |
| | 663 | alt_cnt = osrp2(ptr); |
| | 664 | alo_siz = alt_cnt * sizeof(vmgram_alt_info); |
| | 665 | |
| | 666 | /* |
| | 667 | * Scan the alternatives. Check for circular (left-recursive) |
| | 668 | * alternatives, and add up how much memory we'll need for our decoded |
| | 669 | * alternatives structure. |
| | 670 | */ |
| | 671 | for (i = alt_cnt, p = ptr + 2, tok_cnt = 0 ; i != 0 ; --i) |
| | 672 | { |
| | 673 | const char *tokp; |
| | 674 | size_t alt_tok_cnt; |
| | 675 | size_t j; |
| | 676 | |
| | 677 | /* get the token count and first token for this alternative */ |
| | 678 | alt_tok_cnt = vmgram_alt_tokcnt(p); |
| | 679 | tokp = vmgram_alt_tokptr(p); |
| | 680 | |
| | 681 | /* |
| | 682 | * For our allocation size, add the amount of memory we need for |
| | 683 | * this alternative entry's array of tokens. |
| | 684 | */ |
| | 685 | alo_siz += alt_tok_cnt * sizeof(vmgram_tok_info); |
| | 686 | |
| | 687 | /* include these tokens in the cumulative token count */ |
| | 688 | tok_cnt += alt_tok_cnt; |
| | 689 | |
| | 690 | /* if the alternative is circular, note it */ |
| | 691 | if (alt_tok_cnt != 0 |
| | 692 | && vmgram_tok_type(tokp) == VMGRAM_MATCH_PROD |
| | 693 | && vmgram_tok_prod_obj(tokp) == self) |
| | 694 | { |
| | 695 | /* note the existence of a circular reference */ |
| | 696 | get_ext()->has_circular_alt = TRUE; |
| | 697 | } |
| | 698 | |
| | 699 | /* scan the tokens to see how much memory we'll need to store them */ |
| | 700 | for (j = 0 ; j < alt_tok_cnt ; ++j, tokp = get_next_alt_tok(tokp)) |
| | 701 | { |
| | 702 | /* check what sort of extra memory we'll need for this token */ |
| | 703 | switch(vmgram_tok_type(tokp)) |
| | 704 | { |
| | 705 | case VMGRAM_MATCH_NSPEECH: |
| | 706 | /* we need the array of vm_prop_id_t's */ |
| | 707 | alo_siz += osrndsz(vmgram_tok_vocn_cnt(tokp) |
| | 708 | * sizeof(vm_prop_id_t)); |
| | 709 | break; |
| | 710 | |
| | 711 | case VMGRAM_MATCH_LITERAL: |
| | 712 | /* we need space for the string */ |
| | 713 | alo_siz += osrndsz(vmgram_tok_lit_len(tokp)); |
| | 714 | break; |
| | 715 | } |
| | 716 | } |
| | 717 | |
| | 718 | /* the next alternative starts after the last token */ |
| | 719 | p = tokp; |
| | 720 | } |
| | 721 | |
| | 722 | /* |
| | 723 | * Allocate our block of memory to store the decoded alternatives. Put |
| | 724 | * the whole thing in one block, and put the alternative array at the |
| | 725 | * start of the block. |
| | 726 | */ |
| | 727 | get_ext()->alt_cnt_ = alt_cnt; |
| | 728 | get_ext()->alts_ = next_alt = (vmgram_alt_info *)t3malloc(alo_siz); |
| | 729 | |
| | 730 | /* allocate the tokens right after the alternative array */ |
| | 731 | next_tok = (vmgram_tok_info *)&get_ext()->alts_[alt_cnt]; |
| | 732 | |
| | 733 | /* allocate the other miscellaneous data after the token array */ |
| | 734 | next_byte = (char *)&next_tok[tok_cnt]; |
| | 735 | |
| | 736 | /* run through the image data again and decode it */ |
| | 737 | for (i = alt_cnt, p = ptr + 2, tok_cnt = 0 ; i != 0 ; --i) |
| | 738 | { |
| | 739 | const char *tokp; |
| | 740 | size_t alt_tok_cnt; |
| | 741 | size_t j; |
| | 742 | |
| | 743 | /* get the token count and first token for this alternative */ |
| | 744 | alt_tok_cnt = vmgram_alt_tokcnt(p); |
| | 745 | tokp = vmgram_alt_tokptr(p); |
| | 746 | |
| | 747 | /* set up this alternative structure */ |
| | 748 | next_alt->score = vmgram_alt_score(p); |
| | 749 | next_alt->badness = vmgram_alt_badness(p); |
| | 750 | next_alt->proc_obj = vmgram_alt_procobj(p); |
| | 751 | next_alt->tok_cnt = vmgram_alt_tokcnt(p); |
| | 752 | next_alt->toks = next_tok; |
| | 753 | |
| | 754 | /* consume this alternative entry */ |
| | 755 | ++next_alt; |
| | 756 | |
| | 757 | /* scan and decode the tokens */ |
| | 758 | for (j = 0 ; j < alt_tok_cnt ; ++j, tokp = get_next_alt_tok(tokp)) |
| | 759 | { |
| | 760 | size_t k; |
| | 761 | |
| | 762 | /* set up this token's base information */ |
| | 763 | next_tok->prop = vmgram_tok_prop(tokp); |
| | 764 | next_tok->typ = vmgram_tok_type(tokp); |
| | 765 | |
| | 766 | /* decode the type-specific data */ |
| | 767 | switch(vmgram_tok_type(tokp)) |
| | 768 | { |
| | 769 | case VMGRAM_MATCH_PROD: |
| | 770 | next_tok->typinfo.prod_obj = vmgram_tok_prod_obj(tokp); |
| | 771 | break; |
| | 772 | |
| | 773 | case VMGRAM_MATCH_SPEECH: |
| | 774 | next_tok->typinfo.speech_prop = vmgram_tok_voc_prop(tokp); |
| | 775 | break; |
| | 776 | |
| | 777 | case VMGRAM_MATCH_NSPEECH: |
| | 778 | /* set up our array, using the miscellaneous space pool */ |
| | 779 | next_tok->typinfo.nspeech.cnt = vmgram_tok_vocn_cnt(tokp); |
| | 780 | next_tok->typinfo.nspeech.props = (vm_prop_id_t *)next_byte; |
| | 781 | |
| | 782 | /* copy the properties */ |
| | 783 | for (k = 0 ; k < next_tok->typinfo.nspeech.cnt ; ++k) |
| | 784 | next_tok->typinfo.nspeech.props[k] = |
| | 785 | vmgram_tok_vocn_prop(tokp, k); |
| | 786 | |
| | 787 | /* consume the space from the pool */ |
| | 788 | next_byte += osrndsz(next_tok->typinfo.nspeech.cnt |
| | 789 | * sizeof(vm_prop_id_t)); |
| | 790 | break; |
| | 791 | |
| | 792 | case VMGRAM_MATCH_LITERAL: |
| | 793 | /* take the space out of the miscellaneous space pool */ |
| | 794 | next_tok->typinfo.lit.len = vmgram_tok_lit_len(tokp); |
| | 795 | next_tok->typinfo.lit.str = next_byte; |
| | 796 | |
| | 797 | /* copy the data */ |
| | 798 | memcpy(next_tok->typinfo.lit.str, vmgram_tok_lit_txt(tokp), |
| | 799 | next_tok->typinfo.lit.len); |
| | 800 | |
| | 801 | /* consume the space from the pool */ |
| | 802 | next_byte += osrndsz(next_tok->typinfo.lit.len); |
| | 803 | break; |
| | 804 | |
| | 805 | case VMGRAM_MATCH_TOKTYPE: |
| | 806 | next_tok->typinfo.toktyp_enum = vmgram_tok_tok_enum(tokp); |
| | 807 | break; |
| | 808 | } |
| | 809 | |
| | 810 | /* consume the token slot */ |
| | 811 | ++next_tok; |
| | 812 | } |
| | 813 | |
| | 814 | /* the next alternative starts after the last token */ |
| | 815 | p = tokp; |
| | 816 | } |
| | 817 | } |
| | 818 | |
| | 819 | /* |
| | 820 | * Remove stale weak references. |
| | 821 | */ |
| | 822 | void CVmObjGramProd::remove_stale_weak_refs(VMG0_) |
| | 823 | { |
| | 824 | /* |
| | 825 | * Our reference to the dictionary comparator object is weak (we're |
| | 826 | * only caching it - we don't want to prevent the object from being |
| | 827 | * collected if no one else wants it). So, forget it if the comparator |
| | 828 | * is being deleted. |
| | 829 | */ |
| | 830 | if (get_ext()->comparator_ != VM_INVALID_OBJ |
| | 831 | && G_obj_table->is_obj_deletable(get_ext()->comparator_)) |
| | 832 | { |
| | 833 | /* forget the comparator */ |
| | 834 | get_ext()->comparator_ = VM_INVALID_OBJ; |
| | 835 | |
| | 836 | /* |
| | 837 | * our cached hash values depend on the comparator, so they're now |
| | 838 | * invalid - forget about them |
| | 839 | */ |
| | 840 | get_ext()->hashes_cached_ = FALSE; |
| | 841 | } |
| | 842 | } |
| | 843 | |
| | 844 | |
| | 845 | /* |
| | 846 | * Context for enum_props_cb |
| | 847 | */ |
| | 848 | struct enum_props_ctx |
| | 849 | { |
| | 850 | /* object extension */ |
| | 851 | vm_gram_ext *ext; |
| | 852 | |
| | 853 | /* number of properties in our list so far */ |
| | 854 | size_t cnt; |
| | 855 | }; |
| | 856 | |
| | 857 | /* |
| | 858 | * Callback for enumerating word properties |
| | 859 | */ |
| | 860 | void CVmObjGramProd::enum_props_cb(VMG_ void *ctx0, vm_prop_id_t prop, |
| | 861 | const vm_val_t * /*match_val*/) |
| | 862 | { |
| | 863 | enum_props_ctx *ctx = (enum_props_ctx *)ctx0; |
| | 864 | vmgram_match_info *ep; |
| | 865 | size_t i; |
| | 866 | |
| | 867 | /* if this property is already in our list, don't add it again */ |
| | 868 | for (i = 0, ep = ctx->ext->prop_enum_arr_ ; i < ctx->cnt ; ++i, ++ep) |
| | 869 | { |
| | 870 | /* if we already have this one, we can ignore it */ |
| | 871 | if (ep->prop == prop) |
| | 872 | return; |
| | 873 | } |
| | 874 | |
| | 875 | /* we need to add it - if the array is full, expand it */ |
| | 876 | if (ctx->cnt == ctx->ext->prop_enum_max_) |
| | 877 | { |
| | 878 | /* reallocate the array with more space */ |
| | 879 | ctx->ext->prop_enum_max_ += 64; |
| | 880 | ctx->ext->prop_enum_arr_ = (vmgram_match_info *) |
| | 881 | t3realloc(ctx->ext->prop_enum_arr_, |
| | 882 | ctx->ext->prop_enum_max_ |
| | 883 | * sizeof(vmgram_match_info)); |
| | 884 | } |
| | 885 | |
| | 886 | /* add the property to the array */ |
| | 887 | ep = &ctx->ext->prop_enum_arr_[ctx->cnt]; |
| | 888 | ep->prop = prop; |
| | 889 | |
| | 890 | /* count the new entry */ |
| | 891 | ctx->cnt++; |
| | 892 | } |
| | 893 | |
| | 894 | /* |
| | 895 | * Execute the parseToken method |
| | 896 | */ |
| | 897 | int CVmObjGramProd::getp_parse(VMG_ vm_obj_id_t self, |
| | 898 | vm_val_t *retval, uint *argc) |
| | 899 | { |
| | 900 | vm_val_t *tokval; |
| | 901 | vm_val_t dictval; |
| | 902 | CVmObjDict *dict; |
| | 903 | size_t tok_cnt; |
| | 904 | size_t i; |
| | 905 | vmgramprod_tok *tok; |
| | 906 | const char *toklistp; |
| | 907 | int orig_argc = (argc != 0 ? *argc : 0); |
| | 908 | CVmGramProdQueue queues; |
| | 909 | CVmObjList *lst; |
| | 910 | size_t succ_cnt; |
| | 911 | CVmGramProdMatchEntry *match; |
| | 912 | static CVmNativeCodeDesc desc(2); |
| | 913 | |
| | 914 | /* check arguments */ |
| | 915 | if (get_prop_check_argc(retval, argc, &desc)) |
| | 916 | return TRUE; |
| | 917 | |
| | 918 | /* reset our memory pool */ |
| | 919 | get_ext()->mem_->reset(); |
| | 920 | |
| | 921 | /* clear the work queues */ |
| | 922 | queues.clear(); |
| | 923 | |
| | 924 | /* |
| | 925 | * get the tokenList argument and make sure it's a list; leave it on |
| | 926 | * the stack for now so it remains reachable for the garbage |
| | 927 | * collector |
| | 928 | */ |
| | 929 | tokval = G_stk->get(0); |
| | 930 | if ((toklistp = tokval->get_as_list(vmg0_)) == 0) |
| | 931 | err_throw(VMERR_LIST_VAL_REQD); |
| | 932 | |
| | 933 | /* get the length of the token list */ |
| | 934 | tok_cnt = vmb_get_len(tokval->get_as_list(vmg0_)); |
| | 935 | |
| | 936 | /* check for a dictionary argument */ |
| | 937 | if (orig_argc >= 2) |
| | 938 | { |
| | 939 | /* get the dictionary argument */ |
| | 940 | dictval = *G_stk->get(1); |
| | 941 | |
| | 942 | /* make sure it's an object or nil */ |
| | 943 | if (dictval.typ != VM_OBJ && dictval.typ != VM_NIL) |
| | 944 | err_throw(VMERR_OBJ_VAL_REQD); |
| | 945 | } |
| | 946 | else |
| | 947 | { |
| | 948 | /* use a nil dictionary by default */ |
| | 949 | dictval.set_nil(); |
| | 950 | } |
| | 951 | |
| | 952 | /* get the dictionary object and check that it's really a dictionary */ |
| | 953 | if (dictval.typ != VM_NIL) |
| | 954 | { |
| | 955 | /* if it's not a CVmObjDict object, it's the wrong type */ |
| | 956 | if (!CVmObjDict::is_dictionary_obj(vmg_ dictval.val.obj)) |
| | 957 | { |
| | 958 | /* wrong type - throw an error */ |
| | 959 | err_throw(VMERR_INVAL_OBJ_TYPE); |
| | 960 | } |
| | 961 | |
| | 962 | /* get the VM object pointer */ |
| | 963 | dict = (CVmObjDict *)vm_objp(vmg_ dictval.val.obj); |
| | 964 | } |
| | 965 | else |
| | 966 | { |
| | 967 | /* there's no dictionary object */ |
| | 968 | dict = 0; |
| | 969 | } |
| | 970 | |
| | 971 | /* |
| | 972 | * For quick and easy access, make our own private copy of the token |
| | 973 | * and type lists. First, allocate an array of token structures. |
| | 974 | */ |
| | 975 | tok = (vmgramprod_tok *)get_ext()->mem_ |
| | 976 | ->alloc(tok_cnt * sizeof(vmgramprod_tok)); |
| | 977 | |
| | 978 | /* copy the token string references and types into our list */ |
| | 979 | for (i = 0 ; i < tok_cnt ; ++i) |
| | 980 | { |
| | 981 | vm_val_t ele_val; |
| | 982 | vm_val_t tok_typ_val; |
| | 983 | vm_val_t tok_str_val; |
| | 984 | const char *subp; |
| | 985 | const char *tokstrp; |
| | 986 | |
| | 987 | /* presume we won't have any vocabulary properties for the word */ |
| | 988 | tok[i].match_cnt_ = 0; |
| | 989 | tok[i].matches_ = 0; |
| | 990 | |
| | 991 | /* get this element from the list, and treat it as a list */ |
| | 992 | CVmObjList::index_list(vmg_ &ele_val, toklistp, i+1); |
| | 993 | subp = ele_val.get_as_list(vmg0_); |
| | 994 | |
| | 995 | /* if this element isn't a list, it's an error */ |
| | 996 | if (subp == 0) |
| | 997 | err_throw(VMERR_BAD_TYPE_BIF); |
| | 998 | |
| | 999 | /* |
| | 1000 | * parse the token sublist: the first element is the token value, |
| | 1001 | * and the second element is the token type |
| | 1002 | */ |
| | 1003 | CVmObjList::index_list(vmg_ &tok_str_val, subp, 1); |
| | 1004 | CVmObjList::index_list(vmg_ &tok_typ_val, subp, 2); |
| | 1005 | |
| | 1006 | /* use the value if it's an enumerator; if not, use zero */ |
| | 1007 | if (tok_typ_val.typ == VM_ENUM) |
| | 1008 | tok[i].typ_ = tok_typ_val.val.enumval; |
| | 1009 | else |
| | 1010 | tok[i].typ_ = 0; |
| | 1011 | |
| | 1012 | /* get the token value as a string */ |
| | 1013 | tokstrp = tok_str_val.get_as_string(vmg0_); |
| | 1014 | |
| | 1015 | /* if it's a string, copy it; otherwise, ignore the value */ |
| | 1016 | if (tokstrp != 0) |
| | 1017 | { |
| | 1018 | /* remember the string value */ |
| | 1019 | tok[i].val_ = tok_str_val; |
| | 1020 | |
| | 1021 | /* point the token to the string */ |
| | 1022 | tok[i].txt_ = tokstrp + VMB_LEN; |
| | 1023 | tok[i].len_ = vmb_get_len(tokstrp); |
| | 1024 | |
| | 1025 | /* calculate and store the token's hash value */ |
| | 1026 | tok[i].hash_ = calc_str_hash( |
| | 1027 | vmg_ dict, &tok_str_val, |
| | 1028 | tokstrp + VMB_LEN, vmb_get_len(tokstrp)); |
| | 1029 | |
| | 1030 | /* |
| | 1031 | * if we have a dictionary, enumerate the properties |
| | 1032 | * associated with the word |
| | 1033 | */ |
| | 1034 | if (dict != 0) |
| | 1035 | { |
| | 1036 | enum_props_ctx ctx; |
| | 1037 | |
| | 1038 | /* set up our callback context */ |
| | 1039 | ctx.ext = get_ext(); |
| | 1040 | ctx.cnt = 0; |
| | 1041 | |
| | 1042 | /* enumerate the properties */ |
| | 1043 | dict->enum_word_props(vmg_ &enum_props_cb, &ctx, |
| | 1044 | &tok_str_val, tok[i].txt_, tok[i].len_); |
| | 1045 | |
| | 1046 | /* |
| | 1047 | * if we found any properties for the word, make a copy |
| | 1048 | * for the token list |
| | 1049 | */ |
| | 1050 | if (ctx.cnt != 0) |
| | 1051 | { |
| | 1052 | /* allocate space for the list */ |
| | 1053 | tok[i].match_cnt_ = ctx.cnt; |
| | 1054 | tok[i].matches_ = |
| | 1055 | (vmgram_match_info *)get_ext()->mem_ |
| | 1056 | ->alloc(ctx.cnt * sizeof(vmgram_match_info)); |
| | 1057 | |
| | 1058 | /* copy the list */ |
| | 1059 | memcpy(tok[i].matches_, get_ext()->prop_enum_arr_, |
| | 1060 | ctx.cnt * sizeof(tok[i].matches_[0])); |
| | 1061 | } |
| | 1062 | } |
| | 1063 | } |
| | 1064 | else |
| | 1065 | { |
| | 1066 | /* it's not a string - use a null string value for this token */ |
| | 1067 | tok[i].txt_ = 0; |
| | 1068 | tok[i].len_ = 0; |
| | 1069 | } |
| | 1070 | } |
| | 1071 | |
| | 1072 | /* enqueue a match state item for each of our alternatives */ |
| | 1073 | enqueue_alts(vmg_ get_ext()->mem_, tok, tok_cnt, 0, 0, |
| | 1074 | &queues, self, FALSE, 0, dict); |
| | 1075 | |
| | 1076 | /* process the work queue */ |
| | 1077 | process_work_queue(vmg_ get_ext()->mem_, tok, tok_cnt, |
| | 1078 | &queues, dict); |
| | 1079 | |
| | 1080 | /* count the entries in the success list */ |
| | 1081 | for (succ_cnt = 0, match = queues.success_list_ ; match != 0 ; |
| | 1082 | ++succ_cnt, match = match->nxt_) ; |
| | 1083 | |
| | 1084 | /* allocate the return list */ |
| | 1085 | retval->set_obj(CVmObjList::create(vmg_ FALSE, succ_cnt)); |
| | 1086 | lst = (CVmObjList *)vm_objp(vmg_ retval->val.obj); |
| | 1087 | |
| | 1088 | /* initially clear the elements of the return list to nil */ |
| | 1089 | for (i = 0 ; i < succ_cnt ; ++i) |
| | 1090 | { |
| | 1091 | vm_val_t nil_val; |
| | 1092 | |
| | 1093 | /* set this element to nil */ |
| | 1094 | nil_val.set_nil(); |
| | 1095 | lst->cons_set_element(i, &nil_val); |
| | 1096 | } |
| | 1097 | |
| | 1098 | /* push the list momentarily to protect it from garbage collection */ |
| | 1099 | G_stk->push(retval); |
| | 1100 | |
| | 1101 | /* set the elements */ |
| | 1102 | for (succ_cnt = 0, match = queues.success_list_ ; match != 0 ; |
| | 1103 | ++succ_cnt, match = match->nxt_) |
| | 1104 | { |
| | 1105 | vm_val_t tree_val; |
| | 1106 | vm_val_t tok_match_val; |
| | 1107 | size_t first_tok; |
| | 1108 | size_t last_tok; |
| | 1109 | |
| | 1110 | /* |
| | 1111 | * Create a list to hold the token match list - this is a list of |
| | 1112 | * the dictionary's comparator's matchValue() results for the |
| | 1113 | * tokens that matched literals in the grammar rule. We need a |
| | 1114 | * list with one element per token that we matched. |
| | 1115 | * |
| | 1116 | * We can't actually populate this list yet, but all we need for |
| | 1117 | * the moment are references to it. So, create the list; we'll |
| | 1118 | * fill it in as we traverse the match to build the result tree. |
| | 1119 | */ |
| | 1120 | tok_match_val.set_obj( |
| | 1121 | CVmObjList::create(vmg_ FALSE, match->tok_pos_)); |
| | 1122 | |
| | 1123 | /* push the token match value list for gc protection */ |
| | 1124 | G_stk->push(&tok_match_val); |
| | 1125 | |
| | 1126 | /* build the object tree for this success object */ |
| | 1127 | build_match_tree(vmg_ match->match_, tokval, &tok_match_val, |
| | 1128 | &tree_val, &first_tok, &last_tok); |
| | 1129 | |
| | 1130 | /* discard our token match list's gc protection stack entry */ |
| | 1131 | G_stk->discard(); |
| | 1132 | |
| | 1133 | /* |
| | 1134 | * add the match tree's root object to the main return list (note |
| | 1135 | * that this protects it from garbage collection by virtue of the |
| | 1136 | * main return list being protected from garbage collection) |
| | 1137 | */ |
| | 1138 | lst->cons_set_element(succ_cnt, &tree_val); |
| | 1139 | } |
| | 1140 | |
| | 1141 | /* discard the gc protection */ |
| | 1142 | G_stk->discard(); |
| | 1143 | |
| | 1144 | /* discard arguments */ |
| | 1145 | G_stk->discard(orig_argc); |
| | 1146 | |
| | 1147 | /* evaluation successful */ |
| | 1148 | return TRUE; |
| | 1149 | } |
| | 1150 | |
| | 1151 | /* |
| | 1152 | * Build an object tree for a match |
| | 1153 | */ |
| | 1154 | void CVmObjGramProd::build_match_tree(VMG_ const CVmGramProdMatch *match, |
| | 1155 | const vm_val_t *toklist, |
| | 1156 | const vm_val_t *tokmatchlist, |
| | 1157 | vm_val_t *retval, |
| | 1158 | size_t *first_tok, size_t *last_tok) |
| | 1159 | { |
| | 1160 | /* check to see what kind of match we have */ |
| | 1161 | if (match->proc_obj_ != VM_INVALID_OBJ) |
| | 1162 | { |
| | 1163 | vm_obj_id_t obj_id; |
| | 1164 | CVmObjTads *objp; |
| | 1165 | size_t i; |
| | 1166 | |
| | 1167 | /* |
| | 1168 | * Create the object to hold the current tree level. The only |
| | 1169 | * constructor argument is the superclass, which is the match's |
| | 1170 | * processor object. |
| | 1171 | */ |
| | 1172 | G_stk->push()->set_obj(match->proc_obj_); |
| | 1173 | obj_id = CVmObjTads::create_from_stack(vmg_ 0, 1); |
| | 1174 | |
| | 1175 | /* get a pointer to the new object */ |
| | 1176 | objp = (CVmObjTads *)vm_objp(vmg_ obj_id); |
| | 1177 | |
| | 1178 | /* push this object to protect it from gc for a moment */ |
| | 1179 | G_stk->push()->set_obj(obj_id); |
| | 1180 | |
| | 1181 | /* |
| | 1182 | * Initialize the caller's first/last token indices to an invalid |
| | 1183 | * range, in case we have no subproductions. A production with no |
| | 1184 | * tokens and no subproductions matches no tokens at all; we |
| | 1185 | * indicate this by using an invalid range, with the first token |
| | 1186 | * index higher than the last token index. Note that it's still |
| | 1187 | * important that we accurately reflect the end of our range for a |
| | 1188 | * zero-token match, since our enclosing nodes will need our |
| | 1189 | * position to figure out where they go. |
| | 1190 | */ |
| | 1191 | *first_tok = match->tok_pos_ + 1; |
| | 1192 | *last_tok = match->tok_pos_; |
| | 1193 | |
| | 1194 | /* build the sub-match entries */ |
| | 1195 | for (i = 0 ; i < match->sub_match_cnt_ ; ++i) |
| | 1196 | { |
| | 1197 | size_t first_sub_tok; |
| | 1198 | size_t last_sub_tok; |
| | 1199 | vm_val_t val; |
| | 1200 | |
| | 1201 | /* build this subtree */ |
| | 1202 | build_match_tree(vmg_ match->sub_match_list_[i], |
| | 1203 | toklist, tokmatchlist, |
| | 1204 | &val, &first_sub_tok, &last_sub_tok); |
| | 1205 | |
| | 1206 | /* |
| | 1207 | * If this is the first subtree, use it as the tentative |
| | 1208 | * limits for our overall match so far; otherwise, expand our |
| | 1209 | * limits if they are outside our range so far. |
| | 1210 | * |
| | 1211 | * If the submatch doesn't include any tokens, it obviously |
| | 1212 | * has no effect on our range. The submatch range will |
| | 1213 | * indicate that the first token index is greater than the |
| | 1214 | * last token index if the submatch includes no tokens. |
| | 1215 | */ |
| | 1216 | if (i == 0) |
| | 1217 | { |
| | 1218 | /* it's the first subtree - it's all we know as yet */ |
| | 1219 | *first_tok = first_sub_tok; |
| | 1220 | *last_tok = last_sub_tok; |
| | 1221 | } |
| | 1222 | else if (first_sub_tok <= last_sub_tok) |
| | 1223 | { |
| | 1224 | /* check to see if it expands our current range */ |
| | 1225 | if (first_sub_tok < *first_tok) |
| | 1226 | *first_tok = first_sub_tok; |
| | 1227 | if (last_sub_tok > *last_tok) |
| | 1228 | *last_tok = last_sub_tok; |
| | 1229 | } |
| | 1230 | |
| | 1231 | /* |
| | 1232 | * save the subtree with the current match object if there's a |
| | 1233 | * property in which to save it |
| | 1234 | */ |
| | 1235 | if (match->sub_match_list_[i]->target_prop_ != VM_INVALID_PROP) |
| | 1236 | { |
| | 1237 | /* |
| | 1238 | * Set the processor object property for the value. Note |
| | 1239 | * that we don't have to keep undo for this change, since |
| | 1240 | * we just created the tree object ourselves, and we don't |
| | 1241 | * create any undo savepoints - an object created after |
| | 1242 | * the most recent undo savepoint doesn't need to keep |
| | 1243 | * undo information, since the entire object will be |
| | 1244 | * deleted if we undo to the savepoint. |
| | 1245 | */ |
| | 1246 | objp->set_prop(vmg_ 0, obj_id, |
| | 1247 | match->sub_match_list_[i]->target_prop_, |
| | 1248 | &val); |
| | 1249 | } |
| | 1250 | } |
| | 1251 | |
| | 1252 | /* |
| | 1253 | * if we have exported properties for recording the token index |
| | 1254 | * range, save them with the object |
| | 1255 | */ |
| | 1256 | if (G_predef->gramprod_first_tok != VM_INVALID_PROP |
| | 1257 | && G_predef->gramprod_last_tok != VM_INVALID_PROP) |
| | 1258 | { |
| | 1259 | vm_val_t val; |
| | 1260 | |
| | 1261 | /* save the first token index */ |
| | 1262 | val.set_int((int)*first_tok + 1); |
| | 1263 | objp->set_prop(vmg_ 0, obj_id, |
| | 1264 | G_predef->gramprod_first_tok, &val); |
| | 1265 | |
| | 1266 | /* save the last token index */ |
| | 1267 | val.set_int((int)*last_tok + 1); |
| | 1268 | objp->set_prop(vmg_ 0, obj_id, |
| | 1269 | G_predef->gramprod_last_tok, &val); |
| | 1270 | } |
| | 1271 | |
| | 1272 | /* |
| | 1273 | * if we have the token list property exported, set it to the |
| | 1274 | * original token list reference |
| | 1275 | */ |
| | 1276 | if (G_predef->gramprod_token_list != VM_INVALID_PROP) |
| | 1277 | { |
| | 1278 | /* save the token list reference */ |
| | 1279 | objp->set_prop(vmg_ 0, obj_id, |
| | 1280 | G_predef->gramprod_token_list, toklist); |
| | 1281 | } |
| | 1282 | |
| | 1283 | /* if we have the token match list property exported, set it */ |
| | 1284 | if (G_predef->gramprod_token_match_list != VM_INVALID_PROP) |
| | 1285 | { |
| | 1286 | /* save the token match list reference */ |
| | 1287 | objp->set_prop(vmg_ 0, obj_id, |
| | 1288 | G_predef->gramprod_token_match_list, tokmatchlist); |
| | 1289 | } |
| | 1290 | |
| | 1291 | /* discard our gc protection */ |
| | 1292 | G_stk->discard(); |
| | 1293 | |
| | 1294 | /* the return value is the object */ |
| | 1295 | retval->set_obj(obj_id); |
| | 1296 | } |
| | 1297 | else |
| | 1298 | { |
| | 1299 | const char *lstp; |
| | 1300 | |
| | 1301 | /* get the token list */ |
| | 1302 | lstp = toklist->get_as_list(vmg0_); |
| | 1303 | |
| | 1304 | /* make sure the index is in range */ |
| | 1305 | if (match->tok_pos_ < vmb_get_len(lstp)) |
| | 1306 | { |
| | 1307 | vm_val_t ele_val; |
| | 1308 | CVmObjList *match_lst; |
| | 1309 | |
| | 1310 | /* get the token from the list */ |
| | 1311 | CVmObjList::index_list(vmg_ &ele_val, lstp, match->tok_pos_ + 1); |
| | 1312 | |
| | 1313 | /* |
| | 1314 | * the token is itself a list, whose first element is the |
| | 1315 | * token's value - retrieve the value |
| | 1316 | */ |
| | 1317 | lstp = ele_val.get_as_list(vmg0_); |
| | 1318 | CVmObjList::index_list(vmg_ retval, lstp, 1); |
| | 1319 | |
| | 1320 | /* |
| | 1321 | * Store the token match result in the result list. If this is |
| | 1322 | * a '*' match, it doesn't have a result list contribution, |
| | 1323 | * because '*' doesn't actually match any tokens (it merely |
| | 1324 | * stops parsing). |
| | 1325 | */ |
| | 1326 | if (!match->matched_star_) |
| | 1327 | { |
| | 1328 | /* get the match list */ |
| | 1329 | match_lst = (CVmObjList *)vm_objp(vmg_ tokmatchlist->val.obj); |
| | 1330 | |
| | 1331 | /* |
| | 1332 | * set the element at this token position in the match list |
| | 1333 | * to the token match result |
| | 1334 | */ |
| | 1335 | match_lst->cons_set_element( |
| | 1336 | match->tok_pos_, &match->tok_match_result_); |
| | 1337 | } |
| | 1338 | } |
| | 1339 | else |
| | 1340 | { |
| | 1341 | /* |
| | 1342 | * the index is past the end of the list - this must be a |
| | 1343 | * '*' token that matched nothing (i.e., the token list was |
| | 1344 | * fully consumed before we reached the '*'), so just return |
| | 1345 | * nil for the match value |
| | 1346 | */ |
| | 1347 | retval->set_nil(); |
| | 1348 | } |
| | 1349 | |
| | 1350 | /* |
| | 1351 | * We match one token, so it's the first and last index. Note |
| | 1352 | * that if this is a '*' match, we don't match any tokens. |
| | 1353 | */ |
| | 1354 | if (!match->matched_star_) |
| | 1355 | *first_tok = *last_tok = match->tok_pos_; |
| | 1356 | } |
| | 1357 | } |
| | 1358 | |
| | 1359 | /* |
| | 1360 | * Process the work queue |
| | 1361 | */ |
| | 1362 | void CVmObjGramProd::process_work_queue(VMG_ CVmGramProdMem *mem, |
| | 1363 | const vmgramprod_tok *tok, |
| | 1364 | size_t tok_cnt, |
| | 1365 | CVmGramProdQueue *queues, |
| | 1366 | CVmObjDict *dict) |
| | 1367 | { |
| | 1368 | /* keep going until the work queue and badness queue are empty */ |
| | 1369 | for (;;) |
| | 1370 | { |
| | 1371 | /* |
| | 1372 | * If the work queue is empty, fall back on the badness queue. |
| | 1373 | * Ignore the badness queue if we have any successful matches, |
| | 1374 | * since non-badness matches always take precedence over badness |
| | 1375 | * items. |
| | 1376 | */ |
| | 1377 | if (queues->work_queue_ == 0 && queues->success_list_ == 0) |
| | 1378 | { |
| | 1379 | int first; |
| | 1380 | int min_badness; |
| | 1381 | CVmGramProdState *cur; |
| | 1382 | CVmGramProdState *prv; |
| | 1383 | CVmGramProdState *nxt; |
| | 1384 | |
| | 1385 | /* find the lowest badness rating in the queue */ |
| | 1386 | for (first = TRUE, min_badness = 0, cur = queues->badness_queue_ ; |
| | 1387 | cur != 0 ; cur = cur->nxt_, first = FALSE) |
| | 1388 | { |
| | 1389 | /* |
| | 1390 | * if we're on the first item, note its badness as the |
| | 1391 | * tentative minimum; otherwise, note the current item's |
| | 1392 | * badness if it's lower than the lowest we've seen so |
| | 1393 | * far |
| | 1394 | */ |
| | 1395 | if (first || cur->altp_->badness < min_badness) |
| | 1396 | min_badness = cur->altp_->badness; |
| | 1397 | } |
| | 1398 | |
| | 1399 | /* |
| | 1400 | * move each of the items whose badness matches the minimum |
| | 1401 | * badness out of the badness queue and into the work queue |
| | 1402 | */ |
| | 1403 | for (cur = queues->badness_queue_, prv = 0 ; cur != 0 ; |
| | 1404 | cur = nxt) |
| | 1405 | { |
| | 1406 | /* remember the next item, in case we remove this one */ |
| | 1407 | nxt = cur->nxt_; |
| | 1408 | |
| | 1409 | /* if this item has minimum badness, move it */ |
| | 1410 | if (cur->altp_->badness == min_badness) |
| | 1411 | { |
| | 1412 | /* unlink it from the badness queue */ |
| | 1413 | if (prv == 0) |
| | 1414 | queues->badness_queue_ = cur->nxt_; |
| | 1415 | else |
| | 1416 | prv->nxt_ = cur->nxt_; |
| | 1417 | |
| | 1418 | /* link it into the work queue */ |
| | 1419 | cur->nxt_ = queues->work_queue_; |
| | 1420 | queues->work_queue_ = cur; |
| | 1421 | } |
| | 1422 | else |
| | 1423 | { |
| | 1424 | /* |
| | 1425 | * this item is staying in the list - note it as the |
| | 1426 | * item preceding the next item |
| | 1427 | */ |
| | 1428 | prv = cur; |
| | 1429 | } |
| | 1430 | } |
| | 1431 | } |
| | 1432 | |
| | 1433 | /* if the work queue is still empty, we're out of work to do */ |
| | 1434 | if (queues->work_queue_ == 0) |
| | 1435 | break; |
| | 1436 | |
| | 1437 | /* process the head of the work queue */ |
| | 1438 | process_work_queue_head(vmg_ mem, tok, tok_cnt, queues, dict); |
| | 1439 | } |
| | 1440 | } |
| | 1441 | |
| | 1442 | /* |
| | 1443 | * Process a work queue entry |
| | 1444 | */ |
| | 1445 | void CVmObjGramProd::process_work_queue_head(VMG_ CVmGramProdMem *mem, |
| | 1446 | const vmgramprod_tok *tok, |
| | 1447 | size_t tok_cnt, |
| | 1448 | CVmGramProdQueue *queues, |
| | 1449 | CVmObjDict *dict) |
| | 1450 | { |
| | 1451 | CVmGramProdState *state; |
| | 1452 | CVmGramProdMatch *match; |
| | 1453 | const vmgram_tok_info *tokp; |
| | 1454 | int tok_matched_star; |
| | 1455 | vm_prop_id_t enclosing_target_prop; |
| | 1456 | |
| | 1457 | /* get the first entry from the queue */ |
| | 1458 | state = queues->work_queue_; |
| | 1459 | |
| | 1460 | /* unlink this entry from the queue */ |
| | 1461 | queues->work_queue_ = state->nxt_; |
| | 1462 | state->nxt_ = 0; |
| | 1463 | |
| | 1464 | /* get the token pointer for the next active entry in the alternative */ |
| | 1465 | tokp = state->altp_->toks + state->alt_pos_; |
| | 1466 | |
| | 1467 | /* presume we won't match a '*' token */ |
| | 1468 | tok_matched_star = FALSE; |
| | 1469 | |
| | 1470 | /* process the remaining items in the entry */ |
| | 1471 | while (state->alt_pos_ < state->altp_->tok_cnt) |
| | 1472 | { |
| | 1473 | int match; |
| | 1474 | vm_val_t match_result; |
| | 1475 | |
| | 1476 | /* presume we won't find a match */ |
| | 1477 | match_result.set_nil(); |
| | 1478 | |
| | 1479 | /* see what we have for this token */ |
| | 1480 | switch(tokp->typ) |
| | 1481 | { |
| | 1482 | case VMGRAM_MATCH_PROD: |
| | 1483 | { |
| | 1484 | vm_obj_id_t sub_obj_id; |
| | 1485 | CVmObjGramProd *sub_objp; |
| | 1486 | |
| | 1487 | /* |
| | 1488 | * This is a sub-production node. Get the |
| | 1489 | * sub-production object. |
| | 1490 | */ |
| | 1491 | sub_obj_id = tokp->typinfo.prod_obj; |
| | 1492 | |
| | 1493 | /* make sure it's of the correct metaclass */ |
| | 1494 | if (!CVmObjGramProd::is_gramprod_obj(vmg_ sub_obj_id)) |
| | 1495 | { |
| | 1496 | /* wrong type - throw an error */ |
| | 1497 | err_throw(VMERR_INVAL_OBJ_TYPE); |
| | 1498 | } |
| | 1499 | |
| | 1500 | /* get the sub-production object */ |
| | 1501 | sub_objp = (CVmObjGramProd *)vm_objp(vmg_ sub_obj_id); |
| | 1502 | |
| | 1503 | /* |
| | 1504 | * set my subproduction target property, so that the |
| | 1505 | * sub-production can set the target property correctly |
| | 1506 | */ |
| | 1507 | state->sub_target_prop_ = tokp->prop; |
| | 1508 | |
| | 1509 | /* enqueue the alternatives for the sub-production */ |
| | 1510 | sub_objp->enqueue_alts(vmg_ mem, tok, tok_cnt, |
| | 1511 | state->tok_pos_, state, queues, |
| | 1512 | sub_obj_id, FALSE, 0, dict); |
| | 1513 | } |
| | 1514 | |
| | 1515 | /* |
| | 1516 | * Do not process the current state any further for now - |
| | 1517 | * we'll get back to it when (and if) we finish processing |
| | 1518 | * the sub-production. Note that we don't even put the |
| | 1519 | * current state back in the queue - it's stacked behind the |
| | 1520 | * sub-production, and will be re-enqueued when we |
| | 1521 | * successfully finish with the sub-production. |
| | 1522 | */ |
| | 1523 | return; |
| | 1524 | |
| | 1525 | case VMGRAM_MATCH_SPEECH: |
| | 1526 | /* part of speech - check to see if the current token matches */ |
| | 1527 | match = (state->tok_pos_ < tok_cnt |
| | 1528 | && find_prop_in_tok(&tok[state->tok_pos_], |
| | 1529 | tokp->typinfo.speech_prop)); |
| | 1530 | break; |
| | 1531 | |
| | 1532 | case VMGRAM_MATCH_NSPEECH: |
| | 1533 | /* |
| | 1534 | * multiple parts of speech - check to see if the current token |
| | 1535 | * matches any of the parts of the speech |
| | 1536 | */ |
| | 1537 | if (state->tok_pos_ < tok_cnt) |
| | 1538 | { |
| | 1539 | size_t rem; |
| | 1540 | const vm_prop_id_t *prop; |
| | 1541 | |
| | 1542 | /* presume we won't find a match */ |
| | 1543 | match = FALSE; |
| | 1544 | |
| | 1545 | /* check each item in the list */ |
| | 1546 | for (prop = tokp->typinfo.nspeech.props, |
| | 1547 | rem = tokp->typinfo.nspeech.cnt ; |
| | 1548 | rem != 0 ; --rem, ++prop) |
| | 1549 | { |
| | 1550 | /* if this one matches, we have a match */ |
| | 1551 | if (find_prop_in_tok(&tok[state->tok_pos_], *prop)) |
| | 1552 | { |
| | 1553 | /* note the match */ |
| | 1554 | match = TRUE; |
| | 1555 | |
| | 1556 | /* no need to look any further at this token */ |
| | 1557 | break; |
| | 1558 | } |
| | 1559 | } |
| | 1560 | } |
| | 1561 | else |
| | 1562 | { |
| | 1563 | /* |
| | 1564 | * we're out of tokens, so we definitely don't have a |
| | 1565 | * match, since we must match at least one of the possible |
| | 1566 | * parts of speech of this item |
| | 1567 | */ |
| | 1568 | match = FALSE; |
| | 1569 | } |
| | 1570 | break; |
| | 1571 | |
| | 1572 | case VMGRAM_MATCH_LITERAL: |
| | 1573 | /* |
| | 1574 | * Literal - check for a match to the string. Test the hash |
| | 1575 | * values first, as this is much faster than comparing the |
| | 1576 | * strings, and at least tells us if the strings fail to match |
| | 1577 | * (and failing to match being by far the most common case, |
| | 1578 | * this saves us doing a full comparison most of the time). |
| | 1579 | */ |
| | 1580 | match = (state->tok_pos_ < tok_cnt |
| | 1581 | && tokp->typinfo.lit.hash == tok[state->tok_pos_].hash_ |
| | 1582 | && tok_equals_lit(vmg_ &tok[state->tok_pos_], |
| | 1583 | tokp->typinfo.lit.str, |
| | 1584 | tokp->typinfo.lit.len, |
| | 1585 | dict, &match_result)); |
| | 1586 | break; |
| | 1587 | |
| | 1588 | case VMGRAM_MATCH_TOKTYPE: |
| | 1589 | /* token type */ |
| | 1590 | match = (state->tok_pos_ < tok_cnt |
| | 1591 | && (tok[state->tok_pos_].typ_ |
| | 1592 | == tokp->typinfo.toktyp_enum)); |
| | 1593 | break; |
| | 1594 | |
| | 1595 | case VMGRAM_MATCH_STAR: |
| | 1596 | /* |
| | 1597 | * this matches anything remaining (it also matches if |
| | 1598 | * there's nothing remaining) |
| | 1599 | */ |
| | 1600 | match = TRUE; |
| | 1601 | |
| | 1602 | /* note that we matched a star in the state */ |
| | 1603 | state->matched_star_ = TRUE; |
| | 1604 | |
| | 1605 | /* note that we matched a star for this particular token */ |
| | 1606 | tok_matched_star = TRUE; |
| | 1607 | break; |
| | 1608 | } |
| | 1609 | |
| | 1610 | /* check for a match */ |
| | 1611 | if (match) |
| | 1612 | { |
| | 1613 | /* |
| | 1614 | * we matched this token - add a token match to our state's |
| | 1615 | * match list for the current position |
| | 1616 | */ |
| | 1617 | state->match_list_[state->alt_pos_] = |
| | 1618 | CVmGramProdMatch::alloc(mem, state->tok_pos_, &match_result, |
| | 1619 | tok_matched_star, tokp->prop, |
| | 1620 | VM_INVALID_OBJ, 0, 0); |
| | 1621 | |
| | 1622 | /* we're done with this alternative item - move on */ |
| | 1623 | state->alt_pos_++; |
| | 1624 | |
| | 1625 | /* |
| | 1626 | * If we matched a real token, consume the matched input |
| | 1627 | * token. For a '*', we don't want to consume anything, since |
| | 1628 | * a '*' merely stops parsing and doesn't actually match |
| | 1629 | * anything. |
| | 1630 | */ |
| | 1631 | if (!state->matched_star_) |
| | 1632 | state->tok_pos_++; |
| | 1633 | |
| | 1634 | /* move on to the next alternative token item */ |
| | 1635 | ++tokp; |
| | 1636 | } |
| | 1637 | else |
| | 1638 | { |
| | 1639 | /* |
| | 1640 | * This is not a match - reject this alternative. We do not |
| | 1641 | * need to process this item further, so we can stop now, |
| | 1642 | * without returning this state to the work queue. Simply |
| | 1643 | * abandon this work queue item. |
| | 1644 | */ |
| | 1645 | return; |
| | 1646 | } |
| | 1647 | } |
| | 1648 | |
| | 1649 | /* |
| | 1650 | * If we make it here, we've reached the end of this alternative and |
| | 1651 | * have matched everything in it. This means that the alternative |
| | 1652 | * was successful, so we can perform a 'reduce' operation to replace |
| | 1653 | * the matched tokens with this production. |
| | 1654 | */ |
| | 1655 | |
| | 1656 | /* if we have an enclosing state, get its target property */ |
| | 1657 | enclosing_target_prop = (state->enclosing_ != 0 |
| | 1658 | ? state->enclosing_->sub_target_prop_ |
| | 1659 | : VM_INVALID_PROP); |
| | 1660 | |
| | 1661 | /* create a match for the entire alternative (i.e., the subproduction) */ |
| | 1662 | match = CVmGramProdMatch::alloc( |
| | 1663 | mem, state->tok_pos_, 0, FALSE, enclosing_target_prop, |
| | 1664 | state->altp_->proc_obj, &state->match_list_[0], |
| | 1665 | state->altp_->tok_cnt); |
| | 1666 | |
| | 1667 | /* |
| | 1668 | * If the state we just matched was marked on creation as coming |
| | 1669 | * from an alternative with circular alternatives, we've just come |
| | 1670 | * up with a match for each circular alternative. To avoid infinite |
| | 1671 | * recursion, we never enqueue the circular alternatives; however, |
| | 1672 | * now that we know we have a match for the first element of each |
| | 1673 | * circular alternative, we can check each of them to see if we can |
| | 1674 | * enqueue them. |
| | 1675 | */ |
| | 1676 | if (state->circular_alt_) |
| | 1677 | { |
| | 1678 | vm_obj_id_t prod_obj_id; |
| | 1679 | CVmObjGramProd *prod_objp; |
| | 1680 | |
| | 1681 | /* |
| | 1682 | * Get the production from which this alternative came (there |
| | 1683 | * should be no need to check the metaclass, since we could only |
| | 1684 | * have created the state object from a valid production in the |
| | 1685 | * first place). |
| | 1686 | * |
| | 1687 | * First, make sure it's of the correct class. (It almost |
| | 1688 | * certainly is, since the compiler should have generated this |
| | 1689 | * reference automatically, so there should be no possibility of a |
| | 1690 | * user error. However, if it's not a production object, we'll |
| | 1691 | * probably crash by blindly casting it to one; so we'll check |
| | 1692 | * here, just to be sure that the compiler didn't do something |
| | 1693 | * wrong and the image file didn't become corrupted.) |
| | 1694 | */ |
| | 1695 | if (!CVmObjGramProd::is_gramprod_obj(vmg_ state->prod_obj_)) |
| | 1696 | { |
| | 1697 | /* wrong type - throw an error */ |
| | 1698 | err_throw(VMERR_INVAL_OBJ_TYPE); |
| | 1699 | } |
| | 1700 | |
| | 1701 | /* get the object, properly cast */ |
| | 1702 | prod_obj_id = state->prod_obj_; |
| | 1703 | prod_objp = (CVmObjGramProd *)vm_objp(vmg_ prod_obj_id); |
| | 1704 | |
| | 1705 | /* |
| | 1706 | * Enqueue the matching circular alternatives from the original |
| | 1707 | * production. Our match becomes the first token match in the |
| | 1708 | * circular alternative (i.e., it matches the leading circular |
| | 1709 | * reference element). |
| | 1710 | */ |
| | 1711 | prod_objp->enqueue_alts(vmg_ mem, tok, tok_cnt, state->tok_pos_, |
| | 1712 | state->enclosing_, queues, prod_obj_id, |
| | 1713 | TRUE, match, dict); |
| | 1714 | } |
| | 1715 | |
| | 1716 | /* check for an enclosing state to pop */ |
| | 1717 | if (state->enclosing_ != 0) |
| | 1718 | { |
| | 1719 | /* add the match to the enclosing state's match list */ |
| | 1720 | state->enclosing_->match_list_[state->enclosing_->alt_pos_] = match; |
| | 1721 | state->enclosing_->alt_pos_++; |
| | 1722 | |
| | 1723 | /* |
| | 1724 | * Move the enclosing state's token position to our token position |
| | 1725 | * - since it now encompasses our match, it has consumed all of |
| | 1726 | * the tokens therein. Likewise, set the '*' flag in the parent |
| | 1727 | * to our own setting. |
| | 1728 | */ |
| | 1729 | state->enclosing_->tok_pos_ = state->tok_pos_; |
| | 1730 | state->enclosing_->matched_star_ = state->matched_star_; |
| | 1731 | |
| | 1732 | /* enqueue the enclosing state so we can continue parsing it */ |
| | 1733 | enqueue_state(state->enclosing_, queues); |
| | 1734 | } |
| | 1735 | else |
| | 1736 | { |
| | 1737 | CVmGramProdMatchEntry *entry; |
| | 1738 | |
| | 1739 | /* |
| | 1740 | * This is a top-level state, so we've completed parsing. If |
| | 1741 | * we've consumed all input, or we matched a '*' token, we were |
| | 1742 | * successful. If there's more input remaining, this is not a |
| | 1743 | * match. |
| | 1744 | */ |
| | 1745 | if (state->tok_pos_ < tok_cnt && !state->matched_star_) |
| | 1746 | { |
| | 1747 | /* |
| | 1748 | * There's more input remaining, so this isn't a match. |
| | 1749 | * Reject this alternative. We do not need to process this |
| | 1750 | * item further, so stop now without returning this state |
| | 1751 | * object to the work queue. |
| | 1752 | */ |
| | 1753 | |
| | 1754 | /* abandon this work queue item */ |
| | 1755 | return; |
| | 1756 | } |
| | 1757 | |
| | 1758 | /* create a new entry for the match list */ |
| | 1759 | entry = (CVmGramProdMatchEntry *)mem->alloc(sizeof(*entry)); |
| | 1760 | entry->match_ = match; |
| | 1761 | entry->tok_pos_ = state->tok_pos_; |
| | 1762 | |
| | 1763 | /* link the entry into the list */ |
| | 1764 | entry->nxt_ = queues->success_list_; |
| | 1765 | queues->success_list_ = entry; |
| | 1766 | } |
| | 1767 | } |
| | 1768 | |
| | 1769 | /* |
| | 1770 | * Visit my alternatives and enqueue each one that is a possible match |
| | 1771 | * for a given token list. |
| | 1772 | */ |
| | 1773 | void CVmObjGramProd::enqueue_alts(VMG_ CVmGramProdMem *mem, |
| | 1774 | const vmgramprod_tok *tok, |
| | 1775 | size_t tok_cnt, size_t start_tok_pos, |
| | 1776 | CVmGramProdState *enclosing_state, |
| | 1777 | CVmGramProdQueue *queues, vm_obj_id_t self, |
| | 1778 | int circ_only, |
| | 1779 | CVmGramProdMatch *circ_match, |
| | 1780 | CVmObjDict *dict) |
| | 1781 | { |
| | 1782 | const vmgram_alt_info *altp; |
| | 1783 | size_t i; |
| | 1784 | int need_to_clone; |
| | 1785 | int has_circ; |
| | 1786 | vm_obj_id_t comparator; |
| | 1787 | |
| | 1788 | /* |
| | 1789 | * We don't need to clone the enclosing state until we've set up one |
| | 1790 | * new state referring back to it. However, if we're doing circular |
| | 1791 | * alternatives, the enclosing state could also be enqueued |
| | 1792 | * separately as its own state, so do clone all required copies in |
| | 1793 | * this case. |
| | 1794 | */ |
| | 1795 | need_to_clone = circ_only; |
| | 1796 | |
| | 1797 | /* note whether or not we have any circular alternatives */ |
| | 1798 | has_circ = get_ext()->has_circular_alt; |
| | 1799 | |
| | 1800 | /* |
| | 1801 | * Cache hash values for all of our literals. We need to do this if we |
| | 1802 | * haven't done it before, or if the comparator associated with the |
| | 1803 | * dictionary is different than it was last time we cached the values. |
| | 1804 | */ |
| | 1805 | comparator = (dict != 0 ? dict->get_comparator() : VM_INVALID_OBJ); |
| | 1806 | if (!get_ext()->hashes_cached_ || get_ext()->comparator_ != comparator) |
| | 1807 | cache_hashes(vmg_ dict); |
| | 1808 | |
| | 1809 | /* |
| | 1810 | * run through our alternatives and enqueue each one that looks |
| | 1811 | * plausible |
| | 1812 | */ |
| | 1813 | for (altp = get_ext()->alts_, i = get_ext()->alt_cnt_ ; i != 0 ; |
| | 1814 | --i, ++altp) |
| | 1815 | { |
| | 1816 | vmgram_tok_info *first_tokp; |
| | 1817 | vmgram_tok_info *tokp; |
| | 1818 | size_t cur_alt_tok; |
| | 1819 | size_t alt_tok_cnt; |
| | 1820 | size_t tok_idx; |
| | 1821 | int found_mismatch; |
| | 1822 | int prod_before; |
| | 1823 | int is_circ; |
| | 1824 | |
| | 1825 | /* start at the first token remaining in the string */ |
| | 1826 | tok_idx = start_tok_pos; |
| | 1827 | |
| | 1828 | /* get the first token for this alternative */ |
| | 1829 | tokp = first_tokp = altp->toks; |
| | 1830 | |
| | 1831 | /* get the number of tokens in the alternative */ |
| | 1832 | alt_tok_cnt = altp->tok_cnt; |
| | 1833 | |
| | 1834 | /* presume we will find no mismatches in this check */ |
| | 1835 | found_mismatch = FALSE; |
| | 1836 | |
| | 1837 | /* |
| | 1838 | * note whether this alternative is a direct circular reference |
| | 1839 | * or not (it is directly circular if the first token points |
| | 1840 | * back to this same production) |
| | 1841 | */ |
| | 1842 | is_circ = (alt_tok_cnt != 0 |
| | 1843 | && tokp->typ == VMGRAM_MATCH_PROD |
| | 1844 | && tokp->typinfo.prod_obj == self); |
| | 1845 | |
| | 1846 | /* |
| | 1847 | * we don't have a production before the current item (this is |
| | 1848 | * important because it determines whether we must consider the |
| | 1849 | * current token or any following token when trying to match a |
| | 1850 | * literal, part of speech, or token type) |
| | 1851 | */ |
| | 1852 | prod_before = FALSE; |
| | 1853 | |
| | 1854 | /* scan the tokens in the alternative */ |
| | 1855 | for (cur_alt_tok = 0 ; cur_alt_tok < alt_tok_cnt ; |
| | 1856 | ++cur_alt_tok, ++tokp) |
| | 1857 | { |
| | 1858 | int match; |
| | 1859 | int match_star; |
| | 1860 | int ignore_item; |
| | 1861 | vm_val_t match_result; |
| | 1862 | |
| | 1863 | /* presume we won't find a match */ |
| | 1864 | match = FALSE; |
| | 1865 | match_star = FALSE; |
| | 1866 | |
| | 1867 | /* presume we won't be able to ignore the item */ |
| | 1868 | ignore_item = FALSE; |
| | 1869 | |
| | 1870 | /* |
| | 1871 | * Try to find a match to the current alternative item. If |
| | 1872 | * we've already found a mismatch, don't bother, since we need |
| | 1873 | * no further evidence to skip this alternative; in this case |
| | 1874 | * we're still looping over the alternative items simply to |
| | 1875 | * find the beginning of the next alternative. |
| | 1876 | */ |
| | 1877 | for ( ; !found_mismatch ; ++tok_idx) |
| | 1878 | { |
| | 1879 | /* see what kind of item we have */ |
| | 1880 | switch(tokp->typ) |
| | 1881 | { |
| | 1882 | case VMGRAM_MATCH_PROD: |
| | 1883 | /* |
| | 1884 | * We can't rule out a production without checking |
| | 1885 | * it fully, which we're not doing on this pass; |
| | 1886 | * however, note that we found a production, because |
| | 1887 | * it means that we must from now on try skipping |
| | 1888 | * any number of tokens before ruling out a match on |
| | 1889 | * other grounds, since the production could |
| | 1890 | * potentially match any number of tokens. |
| | 1891 | * |
| | 1892 | * If we're doing circular matches only, and this is |
| | 1893 | * a circular production, and we're on the first |
| | 1894 | * token in our list, we already know it matches, so |
| | 1895 | * don't even count it as a production before the |
| | 1896 | * item. |
| | 1897 | */ |
| | 1898 | if (is_circ && circ_only && cur_alt_tok == 0) |
| | 1899 | { |
| | 1900 | /* |
| | 1901 | * ignore the fact that it's a production, since |
| | 1902 | * we already know it matches - we don't want to |
| | 1903 | * be flexible about tokens following this item |
| | 1904 | * since we know the exact number that match it |
| | 1905 | */ |
| | 1906 | } |
| | 1907 | else |
| | 1908 | { |
| | 1909 | /* note the production item */ |
| | 1910 | prod_before = TRUE; |
| | 1911 | } |
| | 1912 | |
| | 1913 | /* |
| | 1914 | * we can ignore this item for the purposes of |
| | 1915 | * detecting a mismatch, because we can't tell |
| | 1916 | * during this superficial scan whether it matches |
| | 1917 | */ |
| | 1918 | ignore_item = TRUE; |
| | 1919 | break; |
| | 1920 | |
| | 1921 | case VMGRAM_MATCH_SPEECH: |
| | 1922 | /* we must match a part of speech */ |
| | 1923 | if (tok_idx < tok_cnt |
| | 1924 | && find_prop_in_tok(&tok[tok_idx], |
| | 1925 | tokp->typinfo.speech_prop)) |
| | 1926 | { |
| | 1927 | /* it's a match */ |
| | 1928 | match = TRUE; |
| | 1929 | } |
| | 1930 | break; |
| | 1931 | |
| | 1932 | case VMGRAM_MATCH_NSPEECH: |
| | 1933 | /* multiple parts of speech */ |
| | 1934 | if (tok_idx < tok_cnt) |
| | 1935 | { |
| | 1936 | vm_prop_id_t *prop; |
| | 1937 | size_t rem; |
| | 1938 | |
| | 1939 | /* presume we won't find a match */ |
| | 1940 | match = FALSE; |
| | 1941 | |
| | 1942 | /* check each item in the list */ |
| | 1943 | for (prop = tokp->typinfo.nspeech.props, |
| | 1944 | rem = tokp->typinfo.nspeech.cnt ; rem != 0 ; |
| | 1945 | --rem, ++prop) |
| | 1946 | { |
| | 1947 | /* if this one matches, we have a match */ |
| | 1948 | if (find_prop_in_tok(&tok[tok_idx], *prop)) |
| | 1949 | { |
| | 1950 | /* note the match */ |
| | 1951 | match = TRUE; |
| | 1952 | |
| | 1953 | /* no need to look any further */ |
| | 1954 | break; |
| | 1955 | } |
| | 1956 | } |
| | 1957 | } |
| | 1958 | else |
| | 1959 | { |
| | 1960 | /* we're out of tokens, so we have no match */ |
| | 1961 | match = FALSE; |
| | 1962 | } |
| | 1963 | break; |
| | 1964 | |
| | 1965 | case VMGRAM_MATCH_LITERAL: |
| | 1966 | /* |
| | 1967 | * Match a literal. Compare the hash values first, |
| | 1968 | * since a negative match on the hash values will tell |
| | 1969 | * us for sure that the text won't match; since not |
| | 1970 | * matching is by far the most common case, this saves |
| | 1971 | * us the work of doing full string comparisons most of |
| | 1972 | * the time. |
| | 1973 | */ |
| | 1974 | if (tok_idx < tok_cnt |
| | 1975 | && tok[tok_idx].txt_ != 0 |
| | 1976 | && tokp->typinfo.lit.hash == tok[tok_idx].hash_ |
| | 1977 | && tok_equals_lit(vmg_ &tok[tok_idx], |
| | 1978 | tokp->typinfo.lit.str, |
| | 1979 | tokp->typinfo.lit.len, |
| | 1980 | dict, &match_result)) |
| | 1981 | { |
| | 1982 | /* it's a match */ |
| | 1983 | match = TRUE; |
| | 1984 | } |
| | 1985 | break; |
| | 1986 | |
| | 1987 | case VMGRAM_MATCH_TOKTYPE: |
| | 1988 | /* we must match a token type */ |
| | 1989 | if (tok_idx < tok_cnt |
| | 1990 | && tok[tok_idx].typ_ == tokp->typinfo.toktyp_enum) |
| | 1991 | match = TRUE; |
| | 1992 | break; |
| | 1993 | |
| | 1994 | case VMGRAM_MATCH_STAR: |
| | 1995 | /* consume everything remaining on the line */ |
| | 1996 | tok_idx = tok_cnt; |
| | 1997 | |
| | 1998 | /* this is a match */ |
| | 1999 | match = TRUE; |
| | 2000 | match_star = TRUE; |
| | 2001 | break; |
| | 2002 | } |
| | 2003 | |
| | 2004 | /* if we found a match, we can stop scanning now */ |
| | 2005 | if (match) |
| | 2006 | break; |
| | 2007 | |
| | 2008 | /* |
| | 2009 | * we didn't match this token - if we have a production |
| | 2010 | * before this point, we must keep scanning token, since |
| | 2011 | * the production could end up matching any number of |
| | 2012 | * tokens; if we don't have a production item, though, |
| | 2013 | * we don't need to look any further, and we can rule |
| | 2014 | * out this alternative immediately |
| | 2015 | */ |
| | 2016 | if (!prod_before) |
| | 2017 | break; |
| | 2018 | |
| | 2019 | /* |
| | 2020 | * if we're ignoring this item (because we can't decide |
| | 2021 | * now whether it's a match or not, and hence can't rule |
| | 2022 | * it out), don't scan any tokens for the item |
| | 2023 | */ |
| | 2024 | if (ignore_item) |
| | 2025 | break; |
| | 2026 | |
| | 2027 | /* |
| | 2028 | * if we didn't find a match, and we're out of tokens, |
| | 2029 | * stop looking - we have no more tokens to look at to |
| | 2030 | * find a match |
| | 2031 | */ |
| | 2032 | if (!match && tok_idx >= tok_cnt) |
| | 2033 | break; |
| | 2034 | } |
| | 2035 | |
| | 2036 | /* check to see if we found a match */ |
| | 2037 | if (match) |
| | 2038 | { |
| | 2039 | /* |
| | 2040 | * we found a match - skip the current token (unless we |
| | 2041 | * matched a '*' item, in which case we've already |
| | 2042 | * consumed all remaining tokens) |
| | 2043 | */ |
| | 2044 | if (!match_star && tok_idx < tok_cnt) |
| | 2045 | ++tok_idx; |
| | 2046 | } |
| | 2047 | else if (!ignore_item) |
| | 2048 | { |
| | 2049 | /* |
| | 2050 | * we didn't match, and we can't ignore this item - note |
| | 2051 | * that the alternative does not match |
| | 2052 | */ |
| | 2053 | found_mismatch = TRUE; |
| | 2054 | |
| | 2055 | /* finish up with this alternative and proceed to the next */ |
| | 2056 | break; |
| | 2057 | } |
| | 2058 | } |
| | 2059 | |
| | 2060 | /* |
| | 2061 | * If we didn't find a reason to rule out this alternative, |
| | 2062 | * enqueue the alternative. Never enqueue circular references, |
| | 2063 | * unless we're ONLY enqueuing circular references. |
| | 2064 | */ |
| | 2065 | if (!found_mismatch |
| | 2066 | && ((!circ_only && !is_circ) |
| | 2067 | || (circ_only && is_circ))) |
| | 2068 | { |
| | 2069 | CVmGramProdState *state; |
| | 2070 | |
| | 2071 | /* create and enqueue the new state */ |
| | 2072 | state = enqueue_new_state(mem, start_tok_pos, enclosing_state, |
| | 2073 | altp, self, &need_to_clone, queues, |
| | 2074 | has_circ); |
| | 2075 | |
| | 2076 | /* |
| | 2077 | * if we are enqueuing circular references, we already have |
| | 2078 | * a match for the first (circular) token, so fill it in |
| | 2079 | */ |
| | 2080 | if (circ_only) |
| | 2081 | { |
| | 2082 | CVmGramProdMatch *match; |
| | 2083 | |
| | 2084 | /* create a match for the alternative */ |
| | 2085 | match = CVmGramProdMatch:: |
| | 2086 | alloc(mem, 0, 0, FALSE, first_tokp->prop, |
| | 2087 | circ_match->proc_obj_, |
| | 2088 | circ_match->sub_match_list_, |
| | 2089 | circ_match->sub_match_cnt_); |
| | 2090 | |
| | 2091 | /* set the first match in the token */ |
| | 2092 | state->match_list_[0] = match; |
| | 2093 | |
| | 2094 | /* set up at the second token */ |
| | 2095 | state->alt_pos_ = 1; |
| | 2096 | } |
| | 2097 | } |
| | 2098 | } |
| | 2099 | } |
| | 2100 | |
| | 2101 | /* |
| | 2102 | * Cache hash values for all of our literal tokens |
| | 2103 | */ |
| | 2104 | void CVmObjGramProd::cache_hashes(VMG_ CVmObjDict *dict) |
| | 2105 | { |
| | 2106 | vm_obj_id_t comparator; |
| | 2107 | vmgram_alt_info *alt; |
| | 2108 | size_t i; |
| | 2109 | |
| | 2110 | /* get the comparator */ |
| | 2111 | comparator = (dict != 0 ? dict->get_comparator() : VM_INVALID_OBJ); |
| | 2112 | |
| | 2113 | /* run through our alternatives */ |
| | 2114 | for (i = get_ext()->alt_cnt_, alt = get_ext()->alts_ ; |
| | 2115 | i != 0 ; --i, ++alt) |
| | 2116 | { |
| | 2117 | vmgram_tok_info *tok; |
| | 2118 | size_t j; |
| | 2119 | |
| | 2120 | /* run through our tokens */ |
| | 2121 | for (j = alt->tok_cnt, tok = alt->toks ; j != 0 ; --j, ++tok) |
| | 2122 | { |
| | 2123 | /* if this is a literal token, calculate its hash value */ |
| | 2124 | if (tok->typ == VMGRAM_MATCH_LITERAL) |
| | 2125 | { |
| | 2126 | /* calculate and store our hash value */ |
| | 2127 | tok->typinfo.lit.hash = calc_str_hash( |
| | 2128 | vmg_ dict, 0, tok->typinfo.lit.str, tok->typinfo.lit.len); |
| | 2129 | } |
| | 2130 | } |
| | 2131 | } |
| | 2132 | |
| | 2133 | /* note that we've cached hash values, and note the comparator we used */ |
| | 2134 | get_ext()->hashes_cached_ = TRUE; |
| | 2135 | get_ext()->comparator_ = comparator; |
| | 2136 | } |
| | 2137 | |
| | 2138 | /* |
| | 2139 | * Create and enqueue a new state |
| | 2140 | */ |
| | 2141 | CVmGramProdState *CVmObjGramProd:: |
| | 2142 | enqueue_new_state(CVmGramProdMem *mem, |
| | 2143 | size_t start_tok_pos, |
| | 2144 | CVmGramProdState *enclosing_state, |
| | 2145 | const vmgram_alt_info *altp, vm_obj_id_t self, |
| | 2146 | int *need_to_clone, CVmGramProdQueue *queues, |
| | 2147 | int circular_alt) |
| | 2148 | { |
| | 2149 | CVmGramProdState *state; |
| | 2150 | |
| | 2151 | /* create the new state */ |
| | 2152 | state = create_new_state(mem, start_tok_pos, enclosing_state, |
| | 2153 | altp, self, need_to_clone, circular_alt); |
| | 2154 | |
| | 2155 | /* |
| | 2156 | * Add the item to the appropriate queue. If the item has an |
| | 2157 | * associated badness, add it to the badness queue. Otherwise, add |
| | 2158 | * it to the work queue. |
| | 2159 | */ |
| | 2160 | if (altp->badness != 0) |
| | 2161 | { |
| | 2162 | /* |
| | 2163 | * we have a badness rating - add it to the badness queue, since |
| | 2164 | * we don't want to process it until we entirely exhaust better |
| | 2165 | * possibilities |
| | 2166 | */ |
| | 2167 | state->nxt_ = queues->badness_queue_; |
| | 2168 | queues->badness_queue_ = state; |
| | 2169 | } |
| | 2170 | else |
| | 2171 | { |
| | 2172 | /* enqueue the state */ |
| | 2173 | enqueue_state(state, queues); |
| | 2174 | } |
| | 2175 | |
| | 2176 | /* return the new state */ |
| | 2177 | return state; |
| | 2178 | } |
| | 2179 | |
| | 2180 | /* |
| | 2181 | * Create a new state |
| | 2182 | */ |
| | 2183 | CVmGramProdState *CVmObjGramProd:: |
| | 2184 | create_new_state(CVmGramProdMem *mem, size_t start_tok_pos, |
| | 2185 | CVmGramProdState *enclosing_state, |
| | 2186 | const vmgram_alt_info *altp, vm_obj_id_t self, |
| | 2187 | int *need_to_clone, int circular_alt) |
| | 2188 | { |
| | 2189 | /* |
| | 2190 | * if necessary, clone the enclosing state; we need to do this if we |
| | 2191 | * enqueue more than one nested alternative, since each nested |
| | 2192 | * alternative could parse the rest of the token list differently |
| | 2193 | * and thus provide a different result to the enclosing state |
| | 2194 | */ |
| | 2195 | if (*need_to_clone && enclosing_state != 0) |
| | 2196 | enclosing_state = enclosing_state->clone(mem); |
| | 2197 | |
| | 2198 | /* |
| | 2199 | * we will need to clone the enclosing state if we create another |
| | 2200 | * alternative after this one |
| | 2201 | */ |
| | 2202 | *need_to_clone = TRUE; |
| | 2203 | |
| | 2204 | /* create a new state object for the alternative */ |
| | 2205 | return CVmGramProdState::alloc(mem, start_tok_pos, 0, |
| | 2206 | enclosing_state, altp, self, |
| | 2207 | circular_alt); |
| | 2208 | } |
| | 2209 | |
| | 2210 | /* |
| | 2211 | * Enqueue a state in the work queue |
| | 2212 | */ |
| | 2213 | void CVmObjGramProd::enqueue_state(CVmGramProdState *state, |
| | 2214 | CVmGramProdQueue *queues) |
| | 2215 | { |
| | 2216 | /* add it to the work queue */ |
| | 2217 | state->nxt_ = queues->work_queue_; |
| | 2218 | queues->work_queue_ = state; |
| | 2219 | } |
| | 2220 | |
| | 2221 | /* |
| | 2222 | * Determine if a token from the input matches a literal token, allowing |
| | 2223 | * the input token to be a truncated version of the literal token if the |
| | 2224 | * given truncation length is non-zero. |
| | 2225 | */ |
| | 2226 | int CVmObjGramProd::tok_equals_lit(VMG_ const struct vmgramprod_tok *tok, |
| | 2227 | const char *lit, size_t lit_len, |
| | 2228 | CVmObjDict *dict, vm_val_t *result_val) |
| | 2229 | { |
| | 2230 | /* |
| | 2231 | * if there's a dictionary, ask it to do the comparison; otherwise, |
| | 2232 | * use an exact literal match |
| | 2233 | */ |
| | 2234 | if (dict != 0) |
| | 2235 | { |
| | 2236 | /* ask the dictionary for the comparison */ |
| | 2237 | return dict->match_strings(vmg_ &tok->val_, tok->txt_, tok->len_, |
| | 2238 | lit, lit_len, result_val); |
| | 2239 | } |
| | 2240 | else |
| | 2241 | { |
| | 2242 | /* perform an exact comparison */ |
| | 2243 | result_val->set_int(tok->len_ == lit_len |
| | 2244 | && memcmp(tok->txt_, lit, lit_len) == 0); |
| | 2245 | return result_val->val.intval; |
| | 2246 | } |
| | 2247 | } |
| | 2248 | |
| | 2249 | /* |
| | 2250 | * Calculate the hash value for a literal token, using our dictionary's |
| | 2251 | * comparator. |
| | 2252 | */ |
| | 2253 | unsigned int CVmObjGramProd::calc_str_hash(VMG_ CVmObjDict *dict, |
| | 2254 | const vm_val_t *strval, |
| | 2255 | const char *str, size_t len) |
| | 2256 | { |
| | 2257 | /* |
| | 2258 | * if we have a dictionary, ask it to ask it for the hash value, as the |
| | 2259 | * dictionary will ask its comparator to do the work; otherwise, |
| | 2260 | * calculate our own hash value |
| | 2261 | */ |
| | 2262 | if (dict != 0) |
| | 2263 | { |
| | 2264 | /* |
| | 2265 | * We're doing comparisons through the dictionary's comparator, so |
| | 2266 | * we must calculate hash values using the comparator as well to |
| | 2267 | * keep the hash and match operations consistent. |
| | 2268 | */ |
| | 2269 | return dict->calc_str_hash(vmg_ strval, str, len); |
| | 2270 | } |
| | 2271 | else |
| | 2272 | { |
| | 2273 | uint hash; |
| | 2274 | |
| | 2275 | /* |
| | 2276 | * We don't have a dictionary, so we're doing our own string |
| | 2277 | * comparisons, which we do as exact byte-for-byte matches. We can |
| | 2278 | * thus calculate an arbitrary hash value of our own devising. |
| | 2279 | */ |
| | 2280 | for (hash = 0 ; len != 0 ; ++str, --len) |
| | 2281 | hash += (unsigned char)*str; |
| | 2282 | |
| | 2283 | /* return the hash value we calculated */ |
| | 2284 | return hash; |
| | 2285 | } |
| | 2286 | } |
| | 2287 | |
| | 2288 | |
| | 2289 | /* |
| | 2290 | * Find a property in a token's list. Returns true if the property is |
| | 2291 | * there, false if not. |
| | 2292 | */ |
| | 2293 | int CVmObjGramProd::find_prop_in_tok(const vmgramprod_tok *tok, |
| | 2294 | vm_prop_id_t prop) |
| | 2295 | { |
| | 2296 | size_t i; |
| | 2297 | const vmgram_match_info *p; |
| | 2298 | |
| | 2299 | /* search for the property */ |
| | 2300 | for (i = tok->match_cnt_, p = tok->matches_ ; i != 0 ; --i, ++p) |
| | 2301 | { |
| | 2302 | /* if this is the one we're looking for, return success */ |
| | 2303 | if (p->prop == prop) |
| | 2304 | return TRUE; |
| | 2305 | } |
| | 2306 | |
| | 2307 | /* we didn't find it */ |
| | 2308 | return FALSE; |
| | 2309 | } |
| | 2310 | |
| | 2311 | /* |
| | 2312 | * Get the next token item after the given item in an alternative |
| | 2313 | */ |
| | 2314 | const char *CVmObjGramProd::get_next_alt_tok(const char *tokp) |
| | 2315 | { |
| | 2316 | switch(vmgram_tok_type(tokp)) |
| | 2317 | { |
| | 2318 | case VMGRAM_MATCH_PROD: |
| | 2319 | return tokp + VMGRAM_TOK_PROD_SIZE; |
| | 2320 | |
| | 2321 | case VMGRAM_MATCH_SPEECH: |
| | 2322 | return tokp + VMGRAM_TOK_SPEECH_SIZE; |
| | 2323 | |
| | 2324 | case VMGRAM_MATCH_NSPEECH: |
| | 2325 | return tokp + VMGRAM_TOK_NSPEECH_SIZE(tokp); |
| | 2326 | |
| | 2327 | case VMGRAM_MATCH_LITERAL: |
| | 2328 | return tokp + VMGRAM_TOK_LIT_SIZE(tokp); |
| | 2329 | |
| | 2330 | case VMGRAM_MATCH_TOKTYPE: |
| | 2331 | return tokp + VMGRAM_TOK_TYPE_SIZE; |
| | 2332 | |
| | 2333 | case VMGRAM_MATCH_STAR: |
| | 2334 | return tokp + VMGRAM_TOK_STAR_SIZE; |
| | 2335 | |
| | 2336 | default: |
| | 2337 | err_throw(VMERR_INVAL_METACLASS_DATA); |
| | 2338 | AFTER_ERR_THROW(return 0;) |
| | 2339 | } |
| | 2340 | } |
| | 2341 | |
| | 2342 | /* ------------------------------------------------------------------------ */ |
| | 2343 | /* |
| | 2344 | * Execute the getGrammarInfo() method - builds and returns an information |
| | 2345 | * structure describing this grammar production object. |
| | 2346 | */ |
| | 2347 | int CVmObjGramProd::getp_get_gram_info(VMG_ vm_obj_id_t self, |
| | 2348 | vm_val_t *retval, uint *argc) |
| | 2349 | { |
| | 2350 | vm_obj_id_t alt_lst_id; |
| | 2351 | CVmObjList *alt_lst; |
| | 2352 | size_t i; |
| | 2353 | vmgram_alt_info *altp; |
| | 2354 | static CVmNativeCodeDesc desc(0); |
| | 2355 | |
| | 2356 | /* check arguments */ |
| | 2357 | if (get_prop_check_argc(retval, argc, &desc)) |
| | 2358 | return TRUE; |
| | 2359 | |
| | 2360 | /* |
| | 2361 | * This routine uses considerable extra stack in the course of its |
| | 2362 | * work, so check in advance that we have enough space to run. |
| | 2363 | */ |
| | 2364 | if (!G_stk->check_space(8)) |
| | 2365 | err_throw(VMERR_STACK_OVERFLOW); |
| | 2366 | |
| | 2367 | /* create a list to hold the rule alternatives */ |
| | 2368 | alt_lst_id = CVmObjList::create(vmg_ FALSE, get_ext()->alt_cnt_); |
| | 2369 | alt_lst = (CVmObjList *)vm_objp(vmg_ alt_lst_id); |
| | 2370 | |
| | 2371 | /* push it onto the stack for protection from garbage collection */ |
| | 2372 | G_stk->push()->set_obj(alt_lst_id); |
| | 2373 | |
| | 2374 | /* create the rule-alternative descriptions */ |
| | 2375 | for (i = 0, altp = get_ext()->alts_ ; i < get_ext()->alt_cnt_ ; |
| | 2376 | ++i, ++altp) |
| | 2377 | { |
| | 2378 | size_t j; |
| | 2379 | vmgram_tok_info *tokp; |
| | 2380 | vm_val_t alt_obj_val; |
| | 2381 | vm_obj_id_t tok_lst_id; |
| | 2382 | CVmObjList *tok_lst; |
| | 2383 | |
| | 2384 | /* |
| | 2385 | * if our imported GrammarAltInfo class isn't defined, we can't |
| | 2386 | * provide rule-alternative information, so just fill in the list |
| | 2387 | * with a nil |
| | 2388 | */ |
| | 2389 | if (G_predef->gramprod_gram_alt_info == VM_INVALID_OBJ) |
| | 2390 | { |
| | 2391 | /* add a nil to the list */ |
| | 2392 | alt_obj_val.set_nil(); |
| | 2393 | alt_lst->cons_set_element(i, &alt_obj_val); |
| | 2394 | |
| | 2395 | /* done with this slot */ |
| | 2396 | continue; |
| | 2397 | } |
| | 2398 | |
| | 2399 | /* create a list to hold the token descriptors */ |
| | 2400 | tok_lst_id = CVmObjList::create(vmg_ FALSE, altp->tok_cnt); |
| | 2401 | tok_lst = (CVmObjList *)vm_objp(vmg_ tok_lst_id); |
| | 2402 | |
| | 2403 | /* push this momentarily for gc protection */ |
| | 2404 | G_stk->push()->set_obj(tok_lst_id); |
| | 2405 | |
| | 2406 | /* |
| | 2407 | * Run through the tokens in the rule alternative, and build a list |
| | 2408 | * of token descriptor structures. |
| | 2409 | */ |
| | 2410 | for (j = 0, tokp = altp->toks ; j < altp->tok_cnt ; ++j, ++tokp) |
| | 2411 | { |
| | 2412 | vm_val_t tok_obj_val; |
| | 2413 | |
| | 2414 | /* |
| | 2415 | * if our imported GrammarAltTokInfo class isn't defined, we |
| | 2416 | * can't provide token information, so just fill in the list |
| | 2417 | * with a nil |
| | 2418 | */ |
| | 2419 | if (G_predef->gramprod_gram_alt_tok_info == VM_INVALID_OBJ) |
| | 2420 | { |
| | 2421 | /* add a nil to the list */ |
| | 2422 | tok_obj_val.set_nil(); |
| | 2423 | tok_lst->cons_set_element(j, &tok_obj_val); |
| | 2424 | |
| | 2425 | /* done with this slot */ |
| | 2426 | continue; |
| | 2427 | } |
| | 2428 | |
| | 2429 | /* |
| | 2430 | * The constructor arguments are the target property ID, the |
| | 2431 | * token type code, and extra information that depends on the |
| | 2432 | * type code. We have to push the arguments backwards per the |
| | 2433 | * normal protocol, so start with the variant part. |
| | 2434 | */ |
| | 2435 | switch (tokp->typ) |
| | 2436 | { |
| | 2437 | case VMGRAM_MATCH_PROD: |
| | 2438 | /* the extra information is the sub-production object */ |
| | 2439 | G_stk->push()->set_obj(tokp->typinfo.prod_obj); |
| | 2440 | break; |
| | 2441 | |
| | 2442 | case VMGRAM_MATCH_SPEECH: |
| | 2443 | /* the extra info is the part-of-speech property */ |
| | 2444 | G_stk->push()->set_propid(tokp->typinfo.speech_prop); |
| | 2445 | break; |
| | 2446 | |
| | 2447 | case VMGRAM_MATCH_NSPEECH: |
| | 2448 | /* the extra info is a list of part-of-speech properties */ |
| | 2449 | { |
| | 2450 | vm_obj_id_t pos_lst_id; |
| | 2451 | CVmObjList *pos_lst; |
| | 2452 | vm_prop_id_t *pp; |
| | 2453 | size_t k; |
| | 2454 | |
| | 2455 | /* create the property list */ |
| | 2456 | pos_lst_id = CVmObjList::create( |
| | 2457 | vmg_ FALSE, tokp->typinfo.nspeech.cnt); |
| | 2458 | pos_lst = (CVmObjList *)vm_objp(vmg_ pos_lst_id); |
| | 2459 | |
| | 2460 | /* push it */ |
| | 2461 | G_stk->push()->set_obj(pos_lst_id); |
| | 2462 | |
| | 2463 | /* populate it */ |
| | 2464 | for (k = 0, pp = tokp->typinfo.nspeech.props ; |
| | 2465 | k < tokp->typinfo.nspeech.cnt ; |
| | 2466 | ++k, ++pp) |
| | 2467 | { |
| | 2468 | vm_val_t val; |
| | 2469 | |
| | 2470 | /* fill in this element */ |
| | 2471 | val.set_propid(*pp); |
| | 2472 | pos_lst->cons_set_element(k, &val); |
| | 2473 | } |
| | 2474 | } |
| | 2475 | break; |
| | 2476 | |
| | 2477 | case VMGRAM_MATCH_LITERAL: |
| | 2478 | /* the extra info is the literal string value */ |
| | 2479 | G_stk->push()->set_obj(CVmObjString::create( |
| | 2480 | vmg_ FALSE, tokp->typinfo.lit.str, |
| | 2481 | tokp->typinfo.lit.len)); |
| | 2482 | break; |
| | 2483 | |
| | 2484 | case VMGRAM_MATCH_TOKTYPE: |
| | 2485 | /* the extra information is the token class enum */ |
| | 2486 | G_stk->push()->set_enum(tokp->typinfo.toktyp_enum); |
| | 2487 | break; |
| | 2488 | |
| | 2489 | case VMGRAM_MATCH_STAR: |
| | 2490 | /* |
| | 2491 | * for a 'star' type, there is no extra information, but |
| | 2492 | * push nil to keep the argument list uniform |
| | 2493 | */ |
| | 2494 | G_stk->push()->set_nil(); |
| | 2495 | break; |
| | 2496 | |
| | 2497 | default: |
| | 2498 | assert(FALSE); |
| | 2499 | break; |
| | 2500 | } |
| | 2501 | |
| | 2502 | /* now push the target property and type code */ |
| | 2503 | G_stk->push()->set_int(tokp->typ); |
| | 2504 | if (tokp->prop != VM_INVALID_PROP) |
| | 2505 | G_stk->push()->set_propid(tokp->prop); |
| | 2506 | else |
| | 2507 | G_stk->push()->set_nil(); |
| | 2508 | |
| | 2509 | /* |
| | 2510 | * the first argument to the constructor is the base class, |
| | 2511 | * which is the imported GrammarAltTokInfo class object |
| | 2512 | */ |
| | 2513 | G_stk->push()->set_obj(G_predef->gramprod_gram_alt_tok_info); |
| | 2514 | |
| | 2515 | /* create the object, at long last */ |
| | 2516 | tok_obj_val.set_obj(CVmObjTads::create_from_stack(vmg_ 0, 4)); |
| | 2517 | |
| | 2518 | /* add it to our token list */ |
| | 2519 | tok_lst->cons_set_element(j, &tok_obj_val); |
| | 2520 | } |
| | 2521 | |
| | 2522 | /* |
| | 2523 | * Create the GrammarAltInfo object to desribe the alternative. |
| | 2524 | * Pass in the score, badness, match object, and token list as |
| | 2525 | * arguments to the constructor, which we'll count on to stash the |
| | 2526 | * information away in the new object. |
| | 2527 | * |
| | 2528 | * Note that the first constructor argument is the superclass, |
| | 2529 | * which is the imported GrammarAltInfo class object. |
| | 2530 | */ |
| | 2531 | G_stk->push()->set_obj(tok_lst_id); |
| | 2532 | G_stk->push()->set_obj(altp->proc_obj); |
| | 2533 | G_stk->push()->set_int(altp->badness); |
| | 2534 | G_stk->push()->set_int(altp->score); |
| | 2535 | G_stk->push()->set_obj(G_predef->gramprod_gram_alt_info); |
| | 2536 | alt_obj_val.set_obj(CVmObjTads::create_from_stack(vmg_ 0, 5)); |
| | 2537 | |
| | 2538 | /* add the new GrammarAltInfo to the main alternative list */ |
| | 2539 | alt_lst->cons_set_element(i, &alt_obj_val); |
| | 2540 | |
| | 2541 | /* |
| | 2542 | * It's safe to discard the gc protection for the token list now, |
| | 2543 | * since a reference to is now stashed away in the GrammarAltInfo |
| | 2544 | * object, which in turn is referenced from our main alternative |
| | 2545 | * list, which we stashed on the stack earlier for gc protection. |
| | 2546 | */ |
| | 2547 | G_stk->discard(); |
| | 2548 | } |
| | 2549 | |
| | 2550 | /* it's safe to discard our gc protection for the main list now */ |
| | 2551 | G_stk->discard(); |
| | 2552 | |
| | 2553 | /* return the main list */ |
| | 2554 | retval->set_obj(alt_lst_id); |
| | 2555 | |
| | 2556 | /* successful evaluation */ |
| | 2557 | return TRUE; |
| | 2558 | } |
| | 2559 | |