| | 1 | /* $Header: d:/cvsroot/tads/TADS2/regex.h,v 1.3 1999/07/11 00:46:34 MJRoberts Exp $ */ |
| | 2 | |
| | 3 | /* |
| | 4 | * Copyright (c) 1998, 2002 Michael J. Roberts. All Rights Reserved. |
| | 5 | * |
| | 6 | * Please see the accompanying license file, LICENSE.TXT, for information |
| | 7 | * on using and copying this software. |
| | 8 | */ |
/*
Name
  regex.h - regular expression parser for TADS
Function
  Declares the state ID, group register, transition tuple and compilation
  context types used by the regular expression parser, together with the
  functions that compile a pattern and search or match a string against it.
Notes
  None.
Modified
  04/11/99 CNebel    - Fix warnings.
  10/07/98 MJRoberts - Creation
*/
| | 20 | |
| | 21 | #ifndef REGEX_H |
| | 22 | #define REGEX_H |
| | 23 | |
| | 24 | #include <stdlib.h> |
| | 25 | |
| | 26 | |
/* identifier of a state in the machine */
typedef int re_state_id;

/* sentinel meaning "no state" - marks a null machine */
#define RE_STATE_INVALID      ((re_state_id)-1)

/* lowest ID that denotes a valid state */
#define RE_STATE_FIRST_VALID  ((re_state_id)0)
| | 35 | |
| | 36 | |
| | 37 | /* ------------------------------------------------------------------------ */ |
/*
 *   Group register.  A register records where the text belonging to one
 *   group starts and where it ends.
 */
typedef struct
{
    const char *start_ofs;              /* start of the group's text */
    const char *end_ofs;                  /* end of the group's text */
} re_group_register;
| | 47 | |
/* how many group registers are kept */
#define RE_GROUP_REG_CNT  10
| | 50 | |
| | 51 | |
| | 52 | /* ------------------------------------------------------------------------ */ |
| | 53 | /* |
| | 54 | * Denormalized state transition tuple. Each tuple represents the |
| | 55 | * complete set of transitions out of a particular state. A particular |
| | 56 | * state can have one character transition, or two epsilon transitions. |
| | 57 | * Note that we don't need to store the state ID in the tuple, because |
| | 58 | * the state ID is the index of the tuple in an array of state tuples. |
| | 59 | */ |
| | 60 | typedef struct |
| | 61 | { |
| | 62 | /* the character we must match to transition to the target state */ |
| | 63 | char ch; |
| | 64 | |
| | 65 | /* the target states */ |
| | 66 | re_state_id next_state_1; |
| | 67 | re_state_id next_state_2; |
| | 68 | |
| | 69 | /* character range match table, if used */ |
| | 70 | unsigned char *char_range; |
| | 71 | |
| | 72 | /* flags */ |
| | 73 | unsigned char flags; |
| | 74 | } re_tuple; |
| | 75 | |
| | 76 | |
/*
 *   Flag values for a tuple
 */

/* state opens a group; the tuple's 'ch' holds the group ID */
#define RE_STATE_GROUP_BEGIN  0x02

/* state closes a group; the tuple's 'ch' holds the group ID */
#define RE_STATE_GROUP_END    0x04
| | 86 | |
| | 87 | |
| | 88 | /* ------------------------------------------------------------------------ */ |
| | 89 | /* |
| | 90 | * Regular expression compilation context structure. This tracks the |
| | 91 | * state of the compilation and stores the resources associated with the |
| | 92 | * compiled expression. |
| | 93 | */ |
| | 94 | typedef struct |
| | 95 | { |
| | 96 | /* error context */ |
| | 97 | errcxdef *errctx; |
| | 98 | |
| | 99 | /* next available state ID */ |
| | 100 | re_state_id next_state; |
| | 101 | |
| | 102 | /* |
| | 103 | * The array of transition tuples. We'll allocate this array and |
| | 104 | * expand it as necessary. |
| | 105 | */ |
| | 106 | re_tuple *tuple_arr; |
| | 107 | |
| | 108 | /* number of transition tuples allocated in the array */ |
| | 109 | int tuples_alloc; |
| | 110 | |
| | 111 | /* current group ID */ |
| | 112 | int cur_group; |
| | 113 | |
| | 114 | /* group registers */ |
| | 115 | re_group_register regs[RE_GROUP_REG_CNT]; |
| | 116 | |
| | 117 | /* |
| | 118 | * Buffer for retaining a copy of the last string we scanned. We |
| | 119 | * retain our own copy of each string, and point the group registers |
| | 120 | * into this copy rather than the caller's original string -- this |
| | 121 | * ensures that the group registers remain valid even after the |
| | 122 | * caller has deallocated the original string. |
| | 123 | */ |
| | 124 | char *strbuf; |
| | 125 | |
| | 126 | /* length of the string currently in the buffer */ |
| | 127 | size_t curlen; |
| | 128 | |
| | 129 | /* size of the buffer allocated to strbuf */ |
| | 130 | size_t strbufsiz; |
| | 131 | } re_context; |
| | 132 | |
| | 133 | |
| | 134 | /* ------------------------------------------------------------------------ */ |
/*
 *   Status codes
 */
typedef enum
{
    RE_STATUS_SUCCESS = 0,                               /* success */

    /* compilation failed: groups are nested too deeply */
    RE_STATUS_GROUP_NESTING_TOO_DEEP = 1
} re_status_t;
| | 146 | |
| | 147 | |
| | 148 | /* ------------------------------------------------------------------------ */ |
| | 149 | /* |
| | 150 | * Initialize the context. The memory for the context structure itself |
| | 151 | * must be allocated and maintained by the caller. |
| | 152 | */ |
| | 153 | void re_init(re_context *ctx, errcxdef *errctx); |
| | 154 | |
| | 155 | /* |
| | 156 | * Delete the context - frees structures associated with the context. |
| | 157 | * Does NOT free the memory used by the context structure itself. |
| | 158 | */ |
| | 159 | void re_delete(re_context *ctx); |
| | 160 | |
| | 161 | /* |
| | 162 | * Compile an expression and search for a match within the given string. |
| | 163 | * Returns the offset of the match, or -1 if no match was found. |
| | 164 | */ |
| | 165 | int re_compile_and_search(re_context *ctx, |
| | 166 | const char *pattern, size_t patlen, |
| | 167 | const char *searchstr, size_t searchlen, |
| | 168 | int *result_len); |
| | 169 | |
| | 170 | /* |
| | 171 | * Compile an expression and check for a match. Returns the length of |
| | 172 | * the match if we found a match, -1 if we found no match. This is not |
| | 173 | * a search function; we merely match the leading substring of the given |
| | 174 | * string to the given pattern. |
| | 175 | */ |
| | 176 | int re_compile_and_match(re_context *ctx, |
| | 177 | const char *pattern, size_t patlen, |
| | 178 | const char *searchstr, size_t searchlen); |
| | 179 | |
| | 180 | #endif /* REGEX_H */ |
| | 181 | |