/* ScummVM - Graphic Adventure Engine * * ScummVM is the legal property of its developers, whose names * are too numerous to list here. Please refer to the COPYRIGHT * file distributed with this source distribution. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * */ /* Regular Expression Parser and Recognizer for TADS Function Parses and recognizes regular expressions Notes Regular expression syntax: abc|def either abc or def (abc) abc abc+ abc, abcc, abccc, ... abc* ab, abc, abcc, ... abc? ab or abc . any single character abc$ abc at the end of the line ^abc abc at the beginning of the line %^abc literally ^abc [abcx-z] matches a, b, c, x, y, or z [^abcx-z] matches any character except a, b, c, x, y, or z [^]-q] matches any character except ], -, or q Note that using ']' or '-' in a character range expression requires special ordering. If ']' is to be used, it must be the first character after the '^', if present, within the brackets. If '-' is to be used, it must be the first character after the '^' and/or ']', if present. '%' is used to escape the special characters: | . ( ) * ? + ^ $ % [ (We use '%' rather than a backslash because it's less trouble to enter in a TADS string -- a backslash needs to be quoted with another backslash, which is error-prone and hard to read. '%' doesn't need any special quoting in a TADS string, which makes it a lot more readable.) In addition, '%' is used to introduce additional special sequences: %1 text matching first parenthesized expression %9 text matching ninth parenthesized experssion %< matches at the beginning of a word only %> matches at the end of a word only %w matches any word character %W matches any non-word character %b matches at any word boundary (beginning or end of word) %B matches except at a word boundary For the word matching sequences, a word is any sequence of letters and numbers. */ #include "glk/tads/tads2/regex.h" #include "glk/tads/tads2/ler.h" #include "glk/tads/tads2/os.h" //#include "glk/tads/tads2/std.h" #//include "engines/glk/tads/tads2/ler.h" namespace Glk { namespace TADS { namespace TADS2 { /** * A "machine" (i.e., a finite state automaton) is a set of state * transition tuples. A tuple has three elements: the state ID, the ID * of the state that we transition to, and the condition for the * transition. The condition is simply the character that we must match * to make the transition, or a special distinguished symbol "epsilon," * which refers to a transition with no input character consumed. * * The primitive elements of our machines guarantee that we never have * more than two transitions out of a particular state, so we can * denormalize the representation of a state by storing the two possible * tuples for that state in a single combined tuple. This has the * performance advantage that we can use the state ID as an index into * an array of state tuples. * * A particular machine always has a single initial and single final * (successful) state, so we can define a machine by its initial and * final state ID's. */ enum { // the special symbol value for "epsilon" RE_EPSILON = '\001', // the special symbol value for a wildcard character RE_WILDCARD = '\002', // special symbol values for beginning and end of text RE_TEXT_BEGIN = '\003', RE_TEXT_END = '\004', // special symbol values for start and end of a word RE_WORD_BEGIN = '\005', RE_WORD_END = '\006', // special symbols for word-char and non-word-char RE_WORD_CHAR = '\007', RE_NON_WORD_CHAR = '\010', // special symbols for word-boundary and non-word-boundary RE_WORD_BOUNDARY = '\011', RE_NON_WORD_BOUNDARY = '\012', // special symbol for a character range/exclusion range RE_RANGE = '\013', RE_RANGE_EXCL = '\014', // a range of special symbol values for group matchers RE_GROUP_MATCH_0 = '\015', RE_GROUP_MATCH_9 = (RE_GROUP_MATCH_0 + 9) }; re_context::re_context(errcxdef *errctx) { // save the error context _errctx = errctx; // clear states _next_state = RE_STATE_FIRST_VALID; // clear groups _cur_group = 0; // no string buffer yet _strbuf = 0; } re_context::~re_context() { // reset state reset(); // if we allocated a string buffer, delete it if (_strbuf != 0) { mchfre(_strbuf); _strbuf = nullptr; } } void re_context::reset() { int i; // delete any range tables we've allocated for (i = 0 ; i < _next_state ; ++i) { if (_tuple_arr[i].char_range != 0) { mchfre(_tuple_arr[i].char_range); _tuple_arr[i].char_range = 0; } } // clear states _next_state = RE_STATE_FIRST_VALID; // clear groups _cur_group = 0; } re_state_id re_context::alloc_state() { // If we don't have enough room for another state, expand the array if (_next_state >= (int)_tuple_arr.size()) { // bump the size by a bit _tuple_arr.resize(_tuple_arr.size() + 100); } // initialize the next state _tuple_arr[_next_state].next_state_1 = RE_STATE_INVALID; _tuple_arr[_next_state].next_state_2 = RE_STATE_INVALID; _tuple_arr[_next_state].ch = RE_EPSILON; _tuple_arr[_next_state].flags = 0; _tuple_arr[_next_state].char_range = 0; // return the new state's ID return _next_state++; } void re_context::set_trans(re_state_id id, re_state_id dest_id, char ch) { re_tuple *tuple; /* * get the tuple containing the transitions for this state ID - the * state ID is the index of the state's transition tuple in the array */ tuple = &_tuple_arr[id]; /* * If the first state pointer hasn't been set yet, set it to the new * destination. Otherwise, set the second state pointer. * * Only set the character on setting the first state. When setting * the second state, we must assume that the character for the state * has already been set, since any given state can have only one * character setting. */ if (tuple->next_state_1 == RE_STATE_INVALID) { /* * set the character ID, unless the state has been marked with a * special flag which indicates that the character value has * another meaning (in particular, a group marker) */ if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))) tuple->ch = ch; // set the first transition tuple->next_state_1 = dest_id; } else { // set only the second transition state - don't set the character tuple->next_state_2 = dest_id; } } void re_context::init_machine(re_machine *machine) { machine->init = alloc_state(); machine->final = alloc_state(); } void re_context::build_char(re_machine *machine, char ch) { // initialize our new machine init_machine(machine); // allocate a transition tuple for the new state set_trans(machine->init, machine->final, ch); } void re_context::build_char_range(re_machine *machine, unsigned char *range, int exclusion) { unsigned char *range_copy; // initialize our new machine init_machine(machine); // allocate a transition table for the new state set_trans(machine->init, machine->final, (char)(exclusion ? RE_RANGE_EXCL : RE_RANGE)); // allocate a copy of the range bit vector range_copy = (unsigned char *)mchalo(_errctx, 32, "regex range"); // copy the caller's range memcpy(range_copy, range, 32); // store it in the tuple _tuple_arr[machine->init].char_range = range_copy; } void re_context::build_group_matcher(re_machine *machine, int group_num) { // initialize our new machine init_machine(machine); /* * Allocate a transition tuple for the new state, using the group ID * as the character code. Store the special code for a group * recognizer rather than the normal literal character code. */ set_trans(machine->init, machine->final, (char)(group_num + RE_GROUP_MATCH_0)); } void re_context::build_concat(re_machine *new_machine, re_machine *lhs, re_machine *rhs) { // initialize the new machine init_machine(new_machine); /* * Set up an epsilon transition from the new machine's initial state * to the first submachine's initial state */ set_trans(new_machine->init, lhs->init, RE_EPSILON); /* * Set up an epsilon transition from the first submachine's final * state to the second submachine's initial state */ set_trans(lhs->final, rhs->init, RE_EPSILON); /* * Set up an epsilon transition from the second submachine's final * state to our new machine's final state */ set_trans(rhs->final, new_machine->final, RE_EPSILON); } void re_context::build_group(re_machine *new_machine, re_machine *sub_machine, int group_id) { // initialize the container machine init_machine(new_machine); /* * set up an epsilon transition from the new machine's initial state * into the initial state of the group, and another transition from * the group's final state into the container's final state */ set_trans(new_machine->init, sub_machine->init, RE_EPSILON); set_trans(sub_machine->final, new_machine->final, RE_EPSILON); // Mark the initial and final states of the group machine as being group markers _tuple_arr[new_machine->init].flags |= RE_STATE_GROUP_BEGIN; _tuple_arr[new_machine->final].flags |= RE_STATE_GROUP_END; // store the group ID in the 'ch' member of the start and end states _tuple_arr[new_machine->init].ch = group_id; _tuple_arr[new_machine->final].ch = group_id; } void re_context::build_alter(re_machine *new_machine, re_machine *lhs, re_machine *rhs) { // initialize the new machine init_machine(new_machine); /* * Set up an epsilon transition from our new machine's initial state * to the initial state of each submachine */ set_trans(new_machine->init, lhs->init, RE_EPSILON); set_trans(new_machine->init, rhs->init, RE_EPSILON); /* * Set up an epsilon transition from the final state of each * submachine to our final state */ set_trans(lhs->final, new_machine->final, RE_EPSILON); set_trans(rhs->final, new_machine->final, RE_EPSILON); } void re_context::build_closure(re_machine *new_machine, re_machine *sub, char specifier) { // initialize the new machine init_machine(new_machine); /* * Set up an epsilon transition from our initial state to the submachine's initial * state, and from the submachine's final state to our final state */ set_trans(new_machine->init, sub->init, RE_EPSILON); set_trans(sub->final, new_machine->final, RE_EPSILON); /* * If this is an unbounded closure ('*' or '+', but not '?'), set up * the loop transition that takes us from the new machine's final * state back to its initial state. We don't do this on the * zero-or-one closure, because we can only match the expression * once. */ if (specifier != '?') set_trans(sub->final, sub->init, RE_EPSILON); /* * If this is a zero-or-one closure or a zero-or-more closure, set * up an epsilon transition from our initial state to our final * state, since we can skip the entire subexpression. We don't do * this on the one-or-more closure, because we can't skip the * subexpression in this case. */ if (specifier != '+') set_trans(new_machine->init, new_machine->final, RE_EPSILON); } void re_context::concat_onto(re_machine *dest, re_machine *rhs) { // check for a null destination machine if (dest->isNull()) { /* * the first machine is null - simply copy the second machine * onto the first unchanged */ *dest = *rhs; } else { re_machine new_machine; // build the concatenated machine build_concat(&new_machine, dest, rhs); // copy the concatenated machine onto the first machine *dest = new_machine; } } void re_context::alternate_onto(re_machine *dest, re_machine *rhs) { // check to see if the first machine is null if (dest->isNull()) { /* * the first machine is null - simply copy the second machine * onto the first */ *dest = *rhs; } else { /* * if the second machine is null, don't do anything; otherwise, * build the alternation */ if (!rhs->isNull()) { re_machine new_machine; // build the alternation build_alter(&new_machine, dest, rhs); // replace the first machine with the alternation *dest = new_machine; } } } /** * Set a bit in a bit vector. */ #define re_set_bit(set, bit) \ (((unsigned char *)(set))[(bit) >> 3] |= (1 << ((bit) & 7))) /** * Test a bit in a bit vector */ #define re_is_bit_set(set, bit) \ ((((unsigned char *)(set))[(bit) >> 3] & (1 << ((bit) & 7))) != 0) re_status_t re_context::compile(const char *expr, size_t exprlen, re_machine *result_machine) { re_machine cur_machine; re_machine alter_machine; re_machine new_machine; size_t group_stack_level; struct { re_machine old_cur; re_machine old_alter; int group_id; } group_stack[50]; // reset everything reset(); // start out with no current machine and no alternate machine cur_machine.build_null_machine(); alter_machine.build_null_machine(); // nothing on the stack yet group_stack_level = 0; // loop until we run out of expression to parse for (; exprlen != 0 ; ++expr, --exprlen) { switch(*expr) { case '^': /* * beginning of line - if we're not at the beginning of the * current expression (i.e., we already have some * concatentations accumulated), treat it as an ordinary * character */ if (!cur_machine.isNull()) goto normal_char; // build a new start-of-text recognizer build_char(&new_machine, RE_TEXT_BEGIN); /* * concatenate it onto the string - note that this can't * have any postfix operators */ concat_onto(&cur_machine, &new_machine); break; case '$': /* * End of line specifier - if there's anything left after * the '$' other than a close parens or alternation * specifier, great it as a normal character */ if (exprlen > 1 && (*(expr+1) != ')' && *(expr+1) != '|')) goto normal_char; // build a new end-of-text recognizer build_char(&new_machine, RE_TEXT_END); /* * concatenate it onto the string - note that this can't * have any postfix operators */ concat_onto(&cur_machine, &new_machine); break; case '(': /* * Add a nesting level. Push the current machine and * alternate machines onto the group stack, and clear * everything out for the new group. */ if (group_stack_level > sizeof(group_stack)/sizeof(group_stack[0])) { /* we cannot proceed - return an error */ return RE_STATUS_GROUP_NESTING_TOO_DEEP; } // save the current state on the stack group_stack[group_stack_level].old_cur = cur_machine; group_stack[group_stack_level].old_alter = alter_machine; /* * Assign the group a group ID - groups are numbered in * order of their opening (left) parentheses, so we want to * assign a group number now. We won't actually need to * know the group number until we get to the matching close * paren, but we need to assign it now, so store it in the * group stack. */ group_stack[group_stack_level].group_id = _cur_group; // consume the group number _cur_group++; // push the level ++group_stack_level; // start the new group with empty machines cur_machine.build_null_machine(); alter_machine.build_null_machine(); break; case ')': // if there's nothing on the stack, ignore this if (group_stack_level == 0) break; // take a level off the stack --group_stack_level; /* * Remove a nesting level. If we have a pending alternate * expression, build the alternation expression. This will * leave the entire group expression in alter_machine, * regardless of whether an alternation was in progress or * not. */ alternate_onto(&alter_machine, &cur_machine); /* * Create a group machine that encloses the group and marks * it with a group number. We assigned the group number * when we parsed the open paren, so read that group number * from the stack. * * Note that this will leave 'new_machine' with the entire * group machine. */ build_group(&new_machine, &alter_machine, group_stack[group_stack_level].group_id); /* * Pop the stack - restore the alternation and current * machines that were in progress before the group started. */ cur_machine = group_stack[group_stack_level].old_cur; alter_machine = group_stack[group_stack_level].old_alter; /* * Check the group expression (in new_machine) for postfix * expressions */ goto apply_postfix; case '|': /* * Start a new alternation. This ends the current * alternation; if we have a previous pending alternate, * build an alternation machine out of the previous * alternate and the current machine and move that to the * alternate; otherwise, simply move the current machine to * the pending alternate. */ alternate_onto(&alter_machine, &cur_machine); /* * the alternation starts out with a blank slate, so null * out the current machine */ cur_machine.build_null_machine(); break; case '%': // quoted character - skip the quote mark and see what we have ++expr; --exprlen; // check to see if we're at the end of the expression if (exprlen == 0) { /* * end of the string - ignore it, but undo the extra * increment of the expression index so that we exit the * enclosing loop properly */ --expr; ++exprlen; break; } // see what we have switch(*expr) { case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': // group match - build a new literal group recognizer build_group_matcher(&new_machine, (int)(*expr - '1')); // apply any postfix expression to the group recognizer goto apply_postfix; case '<': // build a beginning-of-word recognizer build_char(&new_machine, RE_WORD_BEGIN); // it can't be postfixed - just concatenate it concat_onto(&cur_machine, &new_machine); break; case '>': // build an end-of-word recognizer */ build_char(&new_machine, RE_WORD_END); // it can't be postfixed - just concatenate it concat_onto(&cur_machine, &new_machine); break; case 'w': // word character build_char(&new_machine, RE_WORD_CHAR); goto apply_postfix; case 'W': // non-word character build_char(&new_machine, RE_NON_WORD_CHAR); goto apply_postfix; case 'b': // word boundary build_char(&new_machine, RE_WORD_BOUNDARY); // it can't be postfixed concat_onto(&cur_machine, &new_machine); break; case 'B': // not a word boundary build_char(&new_machine, RE_NON_WORD_BOUNDARY); // it can't be postfixed concat_onto(&cur_machine, &new_machine); break; default: // build a new literal character recognizer build_char(&new_machine, *expr); // apply any postfix expression to the character goto apply_postfix; } break; case '.': /* * wildcard character - build a single character recognizer * for the special wildcard symbol, then go check it for a * postfix operator */ build_char(&new_machine, RE_WILDCARD); goto apply_postfix; break; case '[': { // range expression int is_exclusive = false; unsigned char set[32]; // clear out the set of characters in the range memset(set, 0, sizeof(set)); // first, skip the open bracket ++expr; --exprlen; // check to see if starts with the exclusion character if (exprlen != 0 && *expr == '^') { // skip the exclusion specifier ++expr; --exprlen; // note it is_exclusive = true; } // if the first character is a ']', include it in the range if (exprlen != 0 && *expr == ']') { re_set_bit(set, (int)']'); ++expr; --exprlen; } // if the next character is a '-', include it in the range if (exprlen != 0 && *expr == '-') { re_set_bit(set, (int)'-'); ++expr; --exprlen; } // scan the character set while (exprlen != 0 && *expr != ']') { int ch; // note this character ch = (int)(unsigned char)*expr; // set it re_set_bit(set, ch); // skip this character of the expression ++expr; --exprlen; // check for a range if (exprlen != 0 && *expr == '-') { int ch2; // skip the '-' ++expr; --exprlen; if (exprlen != 0) { // get the other end of the range ch2 = (int)(unsigned char)*expr; // skip the second character ++expr; --exprlen; // if the range is reversed, swap it if (ch > ch2) SWAP(ch, ch2); // fill in the range for ( ; ch <= ch2 ; ++ch) re_set_bit(set, ch); } } } // create a character range machine build_char_range(&new_machine, set, is_exclusive); // apply any postfix operator goto apply_postfix; break; } default: normal_char: /* * it's an ordinary character - build a single character * recognizer machine, and then concatenate it onto any * existing machine */ build_char(&new_machine, *expr); apply_postfix: /* * Check for a postfix operator, and apply it to the machine * in 'new_machine' if present. In any case, concatenate * the 'new_machine' (modified by a postix operator or not) * to the current machien. */ if (exprlen > 1) { switch(*(expr+1)) { case '*': case '+': case '?': /* * We have a postfix closure operator. Build a new * closure machine out of 'new_machine'. */ { re_machine closure_machine; // move onto the closure operator ++expr; --exprlen; // build the closure machine build_closure(&closure_machine, &new_machine, *expr); // replace the original machine with the closure new_machine = closure_machine; /* * skip any redundant closure symbols, keeping * only the first one we saw */ while (exprlen > 1 && (*(expr+1) == '?' || *(expr+1) == '+' || *(expr+1) == '*')) { ++expr; --exprlen; } } break; default: /* no postfix operator */ break; } } /* * Concatenate the new machine onto the current machine * under construction. */ concat_onto(&cur_machine, &new_machine); break; } } // complete any pending alternation alternate_onto(&alter_machine, &cur_machine); // store the resulting machine in the caller's machine descriptor *result_machine = alter_machine; // no errors encountered return RE_STATUS_SUCCESS; } void re_context::note_group(re_group_register *regs, re_state_id id, const char *p) { int group_index; /* * Check to see if this is a valid state and it's a group marker - * if not, there's nothing to do */ if (id == RE_STATE_INVALID || !(_tuple_arr[id].flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END)) || (group_index = (int)_tuple_arr[id].ch) >= RE_GROUP_REG_CNT) return; // It's a valid group marker - note the appropriate register value if ((_tuple_arr[id].flags & RE_STATE_GROUP_BEGIN) != 0) regs[group_index].start_ofs = p; else regs[group_index].end_ofs = p; } bool re_context::is_word_char(char c) const { return Common::isAlnum(c); } int re_context::match(const char *entire_str, const char *str, size_t origlen, const re_machine *machine, re_group_register *regs) { re_state_id cur_state; const char *p; size_t curlen; // start at the machine's initial state cur_state = machine->init; // start at the beginning of the string p = str; curlen = origlen; // note any group involved in the initial state note_group(regs, cur_state, p); /* * if we're starting in the final state, immediately return success * with a zero-length match */ if (cur_state == machine->final) { // return success with a zero-length match return 0; } // run the machine for (;;) { re_tuple *tuple; // get the tuple for this state tuple = &_tuple_arr[cur_state]; // if this is a group state, adjust the group registers note_group(regs, cur_state, p); // see what kind of state we're in if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END)) && tuple->ch != RE_EPSILON) { /* * This is a character or group recognizer state. If we * match the character or group, continue on to the next * state; otherwise, return failure. */ switch(tuple->ch) { case RE_GROUP_MATCH_0: case RE_GROUP_MATCH_0 + 1: case RE_GROUP_MATCH_0 + 2: case RE_GROUP_MATCH_0 + 3: case RE_GROUP_MATCH_0 + 4: case RE_GROUP_MATCH_0 + 5: case RE_GROUP_MATCH_0 + 6: case RE_GROUP_MATCH_0 + 7: case RE_GROUP_MATCH_0 + 8: case RE_GROUP_MATCH_0 + 9: { int group_num; re_group_register *group_reg; size_t reg_len; // it's a group - get the group number group_num = tuple->ch - RE_GROUP_MATCH_0; group_reg = ®s[group_num]; /* * if this register isn't defined, there's nothing * to match, so fail */ if (group_reg->start_ofs == 0 || group_reg->end_ofs == 0) return -1; // calculate the length of the register value reg_len = group_reg->end_ofs - group_reg->start_ofs; // if we don't have enough left to match, it fails if (curlen < reg_len) return -1; // if the string doesn't match exactly, we fail if (memcmp(p, group_reg->start_ofs, reg_len) != 0) return -1; /* * It matches exactly - skip the entire length of * the register in the source string */ p += reg_len; curlen -= reg_len; break; } case RE_TEXT_BEGIN: /* * Match only the exact beginning of the string - if * we're anywhere else, this isn't a match. If this * succeeds, we don't skip any characters. */ if (p != entire_str) return -1; break; case RE_TEXT_END: /* * Match only the exact end of the string - if we're * anywhere else, this isn't a match. Don't skip any * characters on success. */ if (curlen != 0) return -1; break; case RE_WORD_BEGIN: /* * if the previous character is a word character, we're * not at the beginning of a word */ if (p != entire_str && is_word_char(*(p - 1))) return -1; /* * if we're at the end of the string, or the current * character isn't the start of a word, we're not at the * beginning of a word */ if (curlen == 0 || !is_word_char(*p)) return -1; break; case RE_WORD_END: /* * if the current character is a word character, we're not * at the end of a word */ if (curlen != 0 && is_word_char(*p)) return -1; /* * if we're at the beginning of the string, or the * previous character is not a word character, we're not * at the end of a word */ if (p == entire_str || !is_word_char(*(p - 1))) return -1; break; case RE_WORD_CHAR: /* if it's not a word character, it's a failure */ if (curlen == 0 || !is_word_char(*p)) return -1; /* skip this character of input */ ++p; --curlen; break; case RE_NON_WORD_CHAR: /* if it's a word character, it's a failure */ if (curlen == 0 || is_word_char(*p)) return -1; /* skip the input */ ++p; --curlen; break; case RE_WORD_BOUNDARY: case RE_NON_WORD_BOUNDARY: { int prev_is_word; int next_is_word; int boundary; /* * Determine if the previous character is a word * character -- if we're at the beginning of the * string, it's obviously not, otherwise check its * classification */ prev_is_word = (p != entire_str && is_word_char(*(p - 1))); /* make the same check for the current character */ next_is_word = (curlen != 0 && is_word_char(*p)); /* * Determine if this is a boundary - it is if the * two states are different */ boundary = ((prev_is_word != 0) ^ (next_is_word != 0)); /* * make sure it matches what was desired, and return * failure if not */ if ((tuple->ch == RE_WORD_BOUNDARY && !boundary) || (tuple->ch == RE_NON_WORD_BOUNDARY && boundary)) return -1; } break; case RE_WILDCARD: // make sure we have a character to match if (curlen == 0) return -1; // skip this character ++p; --curlen; break; case RE_RANGE: case RE_RANGE_EXCL: { int match_val; // make sure we have a character to match if (curlen == 0) return -1; // see if we match match_val = re_is_bit_set(tuple->char_range, (int)(unsigned char)*p); // make sure we got what we wanted if ((tuple->ch == RE_RANGE && !match_val) || (tuple->ch == RE_RANGE_EXCL && match_val)) return -1; // skip this character of the input ++p; --curlen; break; } default: // make sure we have an exact match if (curlen == 0 || tuple->ch != *p) return -1; // skip this character of the input ++p; --curlen; break; } /* * if we got this far, we were successful - move on to the * next state */ cur_state = tuple->next_state_1; } else if (tuple->next_state_2 == RE_STATE_INVALID) { /* * We have only one transition, so this state is entirely * deterministic. Simply move on to the next state. */ cur_state = tuple->next_state_1; } else { re_machine sub_machine; re_group_register regs1[RE_GROUP_REG_CNT]; re_group_register regs2[RE_GROUP_REG_CNT]; int ret1; int ret2; /* * This state has two possible transitions, and we don't * know which one to take. So, try both, see which one * works better, and return the result. Try the first * transition first. Note that each separate attempt must * use a separate copy of the registers. */ memcpy(regs1, regs, sizeof(regs1)); sub_machine.init = tuple->next_state_1; sub_machine.final = machine->final; ret1 = match(entire_str, p, curlen, &sub_machine, regs1); /* * Now try the second transition */ memcpy(regs2, regs, sizeof(regs2)); sub_machine.init = tuple->next_state_2; sub_machine.final = machine->final; ret2 = match(entire_str, p, curlen, &sub_machine, regs2); /* * If they both failed, the whole thing failed. Otherwise, * return the longer of the two, plus the length we * ourselves matched previously. Note that we return the * register set from the winning match. */ if (ret1 < 0 && ret2 < 0) { // they both failed return -1; } else if (ret1 > ret2) { // use the first register set and result length memcpy(regs, regs1, sizeof(regs1)); return ret1 + (p - str); } else { // use the second register set and result length memcpy(regs, regs2, sizeof(regs2)); return ret2 + (p - str); } } // If we're in the final state, return success if (cur_state == machine->final) { // finish off any group involved in the final state note_group(regs, cur_state, p); // return the length we matched return p - str; } } } int re_context::search(const char *str, size_t len, const re_machine *machine, re_group_register *regs, int *result_len) { int ofs; /* * Starting at the first character in the string, search for the * pattern at each subsequent character until we either find the * pattern or run out of string to test. */ for (ofs = 0 ; ofs < (int)len ; ++ofs) { int matchlen; // check for a match matchlen = match(str, str + ofs, len - ofs, machine, regs); if (matchlen >= 0) { // we found a match here - return the length and offset *result_len = matchlen; return ofs; } } // we didn't find a match return -1; } void re_context::save_search_str(const char *str, size_t len) { // if the string is empty, this is easy if (len == 0) { // nothing to store - just save the length and return _curlen = 0; return; } // if the current buffer isn't big enough, allocate a new one if (_strbuf == 0 || _strbufsiz < len) { /* * free any previous buffer - its contents are no longer * important, since we're about to overwrite it with a new * string */ if (_strbuf != 0) mchfre(_strbuf); /* * allocate a new buffer; round up to the next 256-byte * increment to make sure we're not constantly reallocating to * random sizes */ _strbufsiz = ((len + 255) & ~255); // allocate it _strbuf = (char *)mchalo(_errctx, _strbufsiz, "regex str"); } // copy the string memcpy(_strbuf, str, len); // save the length _curlen = len; } int re_context::compile_and_search(const char *pattern, size_t patlen, const char *searchstr, size_t searchlen, int *result_len) { re_machine machine; // compile the expression - return failure if we get an error if (compile(pattern, patlen, &machine) != RE_STATUS_SUCCESS) return -1; // save the search string in our internal buffer save_search_str(searchstr, searchlen); // clear the group registers memset(_regs, 0, sizeof(_regs)); /* * search for the pattern in our copy of the string - use the copy * so that the group registers stay valid even if the caller * deallocates the original string after we return */ return search(_strbuf, _curlen, &machine, _regs, result_len); } int re_context::compile_and_match(const char *pattern, size_t patlen, const char *searchstr, size_t searchlen) { re_machine machine; // compile the expression - return failure if we get an error if (compile(pattern, patlen, &machine) != RE_STATUS_SUCCESS) return 0; // save the search string in our internal buffer save_search_str(searchstr, searchlen); // clear the group registers memset(_regs, 0, sizeof(_regs)); // match the string return match(_strbuf, _strbuf, _curlen, &machine, _regs); } } // End of namespace TADS2 } // End of namespace TADS } // End of namespace Glk