diff options
Diffstat (limited to 'engines/glk/tads/tads2/regex.cpp')
-rw-r--r-- | engines/glk/tads/tads2/regex.cpp | 1276 |
1 files changed, 1276 insertions, 0 deletions
diff --git a/engines/glk/tads/tads2/regex.cpp b/engines/glk/tads/tads2/regex.cpp new file mode 100644 index 0000000000..f6a009a0a9 --- /dev/null +++ b/engines/glk/tads/tads2/regex.cpp @@ -0,0 +1,1276 @@ +/* ScummVM - Graphic Adventure Engine + * + * ScummVM is the legal property of its developers, whose names + * are too numerous to list here. Please refer to the COPYRIGHT + * file distributed with this source distribution. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +/* +Regular Expression Parser and Recognizer for TADS +Function + Parses and recognizes regular expressions +Notes + Regular expression syntax: + + abc|def either abc or def + (abc) abc + abc+ abc, abcc, abccc, ... + abc* ab, abc, abcc, ... + abc? ab or abc + . any single character + abc$ abc at the end of the line + ^abc abc at the beginning of the line + %^abc literally ^abc + [abcx-z] matches a, b, c, x, y, or z + [^abcx-z] matches any character except a, b, c, x, y, or z + [^]-q] matches any character except ], -, or q + + Note that using ']' or '-' in a character range expression requires + special ordering. If ']' is to be used, it must be the first character + after the '^', if present, within the brackets. If '-' is to be used, + it must be the first character after the '^' and/or ']', if present. + + '%' is used to escape the special characters: | . ( ) * ? + ^ $ % [ + (We use '%' rather than a backslash because it's less trouble to + enter in a TADS string -- a backslash needs to be quoted with another + backslash, which is error-prone and hard to read. '%' doesn't need + any special quoting in a TADS string, which makes it a lot more + readable.) + + In addition, '%' is used to introduce additional special sequences: + + %1 text matching first parenthesized expression + %9 text matching ninth parenthesized experssion + %< matches at the beginning of a word only + %> matches at the end of a word only + %w matches any word character + %W matches any non-word character + %b matches at any word boundary (beginning or end of word) + %B matches except at a word boundary + + For the word matching sequences, a word is any sequence of letters and + numbers. +*/ + +#include "engines/glk/tads/tads2/regex.h" +#include "engines/glk/tads/tads2/ler.h" +#include "engines/glk/tads/tads2/os.h" +//#include "engines/glk/tads/tads2/std.h" +#//include "engines/glk/tads/tads2/ler.h" + +namespace Glk { +namespace TADS { +namespace TADS2 { + +/** + * A "machine" (i.e., a finite state automaton) is a set of state + * transition tuples. A tuple has three elements: the state ID, the ID + * of the state that we transition to, and the condition for the + * transition. The condition is simply the character that we must match + * to make the transition, or a special distinguished symbol "epsilon," + * which refers to a transition with no input character consumed. + * + * The primitive elements of our machines guarantee that we never have + * more than two transitions out of a particular state, so we can + * denormalize the representation of a state by storing the two possible + * tuples for that state in a single combined tuple. This has the + * performance advantage that we can use the state ID as an index into + * an array of state tuples. + * + * A particular machine always has a single initial and single final + * (successful) state, so we can define a machine by its initial and + * final state ID's. + */ +enum { + // the special symbol value for "epsilon" + RE_EPSILON = '\001', + + // the special symbol value for a wildcard character + RE_WILDCARD = '\002', + + // special symbol values for beginning and end of text + RE_TEXT_BEGIN = '\003', + RE_TEXT_END = '\004', + + // special symbol values for start and end of a word + RE_WORD_BEGIN = '\005', + RE_WORD_END = '\006', + + // special symbols for word-char and non-word-char + RE_WORD_CHAR = '\007', + RE_NON_WORD_CHAR = '\010', + + // special symbols for word-boundary and non-word-boundary + RE_WORD_BOUNDARY = '\011', + RE_NON_WORD_BOUNDARY = '\012', + + // special symbol for a character range/exclusion range + RE_RANGE = '\013', + RE_RANGE_EXCL = '\014', + + // a range of special symbol values for group matchers + RE_GROUP_MATCH_0 = '\015', + RE_GROUP_MATCH_9 = (RE_GROUP_MATCH_0 + 9) +}; + +re_context::re_context(errcxdef *errctx) { + // save the error context + _errctx = errctx; + + // clear states + _next_state = RE_STATE_FIRST_VALID; + + // clear groups + _cur_group = 0; + + // no string buffer yet + _strbuf = 0; +} + +re_context::~re_context() { + // reset state + reset(); + + // if we allocated a string buffer, delete it + if (_strbuf != 0) { + mchfre(_strbuf); + _strbuf = nullptr; + } +} + +void re_context::reset() { + int i; + + // delete any range tables we've allocated + for (i = 0 ; i < _next_state ; ++i) { + if (_tuple_arr[i].char_range != 0) { + mchfre(_tuple_arr[i].char_range); + _tuple_arr[i].char_range = 0; + } + } + + // clear states + _next_state = RE_STATE_FIRST_VALID; + + // clear groups + _cur_group = 0; +} + +re_state_id re_context::alloc_state() { + // If we don't have enough room for another state, expand the array + if (_next_state >= (int)_tuple_arr.size()) { + // bump the size by a bit + _tuple_arr.resize(_tuple_arr.size() + 100); + } + + // initialize the next state + _tuple_arr[_next_state].next_state_1 = RE_STATE_INVALID; + _tuple_arr[_next_state].next_state_2 = RE_STATE_INVALID; + _tuple_arr[_next_state].ch = RE_EPSILON; + _tuple_arr[_next_state].flags = 0; + _tuple_arr[_next_state].char_range = 0; + + // return the new state's ID + return _next_state++; +} + +void re_context::set_trans(re_state_id id, re_state_id dest_id, char ch) { + re_tuple *tuple; + + /* + * get the tuple containing the transitions for this state ID - the + * state ID is the index of the state's transition tuple in the array + */ + tuple = &_tuple_arr[id]; + + /* + * If the first state pointer hasn't been set yet, set it to the new + * destination. Otherwise, set the second state pointer. + * + * Only set the character on setting the first state. When setting + * the second state, we must assume that the character for the state + * has already been set, since any given state can have only one + * character setting. + */ + if (tuple->next_state_1 == RE_STATE_INVALID) { + /* + * set the character ID, unless the state has been marked with a + * special flag which indicates that the character value has + * another meaning (in particular, a group marker) + */ + if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))) + tuple->ch = ch; + + // set the first transition + tuple->next_state_1 = dest_id; + } else { + // set only the second transition state - don't set the character + tuple->next_state_2 = dest_id; + } +} + +void re_context::init_machine(re_machine *machine) { + machine->init = alloc_state(); + machine->final = alloc_state(); +} + +void re_context::build_char(re_machine *machine, char ch) { + // initialize our new machine + init_machine(machine); + + // allocate a transition tuple for the new state + set_trans(machine->init, machine->final, ch); +} + +void re_context::build_char_range(re_machine *machine, unsigned char *range, int exclusion) { + unsigned char *range_copy; + + // initialize our new machine + init_machine(machine); + + // allocate a transition table for the new state + set_trans(machine->init, machine->final, (char)(exclusion ? RE_RANGE_EXCL : RE_RANGE)); + + // allocate a copy of the range bit vector + range_copy = (unsigned char *)mchalo(_errctx, 32, "regex range"); + + // copy the caller's range + memcpy(range_copy, range, 32); + + // store it in the tuple + _tuple_arr[machine->init].char_range = range_copy; +} + +void re_context::build_group_matcher(re_machine *machine, int group_num) { + // initialize our new machine + init_machine(machine); + + /* + * Allocate a transition tuple for the new state, using the group ID + * as the character code. Store the special code for a group + * recognizer rather than the normal literal character code. + */ + set_trans(machine->init, machine->final, (char)(group_num + RE_GROUP_MATCH_0)); +} + +void re_context::build_concat(re_machine *new_machine, re_machine *lhs, re_machine *rhs) { + // initialize the new machine + init_machine(new_machine); + + /* + * Set up an epsilon transition from the new machine's initial state + * to the first submachine's initial state + */ + set_trans(new_machine->init, lhs->init, RE_EPSILON); + + /* + * Set up an epsilon transition from the first submachine's final + * state to the second submachine's initial state + */ + set_trans(lhs->final, rhs->init, RE_EPSILON); + + /* + * Set up an epsilon transition from the second submachine's final + * state to our new machine's final state + */ + set_trans(rhs->final, new_machine->final, RE_EPSILON); +} + +void re_context::build_group(re_machine *new_machine, re_machine *sub_machine, int group_id) { + // initialize the container machine + init_machine(new_machine); + + /* + * set up an epsilon transition from the new machine's initial state + * into the initial state of the group, and another transition from + * the group's final state into the container's final state + */ + set_trans(new_machine->init, sub_machine->init, RE_EPSILON); + set_trans(sub_machine->final, new_machine->final, RE_EPSILON); + + // Mark the initial and final states of the group machine as being group markers + _tuple_arr[new_machine->init].flags |= RE_STATE_GROUP_BEGIN; + _tuple_arr[new_machine->final].flags |= RE_STATE_GROUP_END; + + // store the group ID in the 'ch' member of the start and end states + _tuple_arr[new_machine->init].ch = group_id; + _tuple_arr[new_machine->final].ch = group_id; +} + +void re_context::build_alter(re_machine *new_machine, re_machine *lhs, re_machine *rhs) { + // initialize the new machine + init_machine(new_machine); + + /* + * Set up an epsilon transition from our new machine's initial state + * to the initial state of each submachine + */ + set_trans(new_machine->init, lhs->init, RE_EPSILON); + set_trans(new_machine->init, rhs->init, RE_EPSILON); + + /* + * Set up an epsilon transition from the final state of each + * submachine to our final state + */ + set_trans(lhs->final, new_machine->final, RE_EPSILON); + set_trans(rhs->final, new_machine->final, RE_EPSILON); +} + +void re_context::build_closure(re_machine *new_machine, re_machine *sub, char specifier) { + // initialize the new machine + init_machine(new_machine); + + /* + * Set up an epsilon transition from our initial state to the submachine's initial + * state, and from the submachine's final state to our final state + */ + set_trans(new_machine->init, sub->init, RE_EPSILON); + set_trans(sub->final, new_machine->final, RE_EPSILON); + + /* + * If this is an unbounded closure ('*' or '+', but not '?'), set up + * the loop transition that takes us from the new machine's final + * state back to its initial state. We don't do this on the + * zero-or-one closure, because we can only match the expression + * once. + */ + if (specifier != '?') + set_trans(sub->final, sub->init, RE_EPSILON); + + /* + * If this is a zero-or-one closure or a zero-or-more closure, set + * up an epsilon transition from our initial state to our final + * state, since we can skip the entire subexpression. We don't do + * this on the one-or-more closure, because we can't skip the + * subexpression in this case. + */ + if (specifier != '+') + set_trans(new_machine->init, new_machine->final, RE_EPSILON); +} + +void re_context::concat_onto(re_machine *dest, re_machine *rhs) { + // check for a null destination machine + if (dest->isNull()) { + /* + * the first machine is null - simply copy the second machine + * onto the first unchanged + */ + *dest = *rhs; + } else { + re_machine new_machine; + + // build the concatenated machine + build_concat(&new_machine, dest, rhs); + + // copy the concatenated machine onto the first machine + *dest = new_machine; + } +} + +void re_context::alternate_onto(re_machine *dest, re_machine *rhs) { + // check to see if the first machine is null + if (dest->isNull()) { + /* + * the first machine is null - simply copy the second machine + * onto the first + */ + *dest = *rhs; + } else { + /* + * if the second machine is null, don't do anything; otherwise, + * build the alternation + */ + if (!rhs->isNull()) { + re_machine new_machine; + + // build the alternation + build_alter(&new_machine, dest, rhs); + + // replace the first machine with the alternation + *dest = new_machine; + } + } +} + +/** + * Set a bit in a bit vector. + */ +#define re_set_bit(set, bit) \ + (((unsigned char *)(set))[(bit) >> 3] |= (1 << ((bit) & 7))) + +/** + * Test a bit in a bit vector + */ +#define re_is_bit_set(set, bit) \ + ((((unsigned char *)(set))[(bit) >> 3] & (1 << ((bit) & 7))) != 0) + +re_status_t re_context::compile(const char *expr, size_t exprlen, re_machine *result_machine) { + re_machine cur_machine; + re_machine alter_machine; + re_machine new_machine; + size_t group_stack_level; + struct { + re_machine old_cur; + re_machine old_alter; + int group_id; + } group_stack[50]; + + // reset everything + reset(); + + // start out with no current machine and no alternate machine + cur_machine.build_null_machine(); + alter_machine.build_null_machine(); + + // nothing on the stack yet + group_stack_level = 0; + + // loop until we run out of expression to parse + for (; exprlen != 0 ; ++expr, --exprlen) { + switch(*expr) { + case '^': + /* + * beginning of line - if we're not at the beginning of the + * current expression (i.e., we already have some + * concatentations accumulated), treat it as an ordinary + * character + */ + if (!cur_machine.isNull()) + goto normal_char; + + // build a new start-of-text recognizer + build_char(&new_machine, RE_TEXT_BEGIN); + + /* + * concatenate it onto the string - note that this can't + * have any postfix operators + */ + concat_onto(&cur_machine, &new_machine); + break; + + case '$': + /* + * End of line specifier - if there's anything left after + * the '$' other than a close parens or alternation + * specifier, great it as a normal character + */ + if (exprlen > 1 + && (*(expr+1) != ')' && *(expr+1) != '|')) + goto normal_char; + + // build a new end-of-text recognizer + build_char(&new_machine, RE_TEXT_END); + + /* + * concatenate it onto the string - note that this can't + * have any postfix operators + */ + concat_onto(&cur_machine, &new_machine); + break; + + case '(': + /* + * Add a nesting level. Push the current machine and + * alternate machines onto the group stack, and clear + * everything out for the new group. + */ + if (group_stack_level > sizeof(group_stack)/sizeof(group_stack[0])) { + /* we cannot proceed - return an error */ + return RE_STATUS_GROUP_NESTING_TOO_DEEP; + } + + // save the current state on the stack + group_stack[group_stack_level].old_cur = cur_machine; + group_stack[group_stack_level].old_alter = alter_machine; + + /* + * Assign the group a group ID - groups are numbered in + * order of their opening (left) parentheses, so we want to + * assign a group number now. We won't actually need to + * know the group number until we get to the matching close + * paren, but we need to assign it now, so store it in the + * group stack. + */ + group_stack[group_stack_level].group_id = _cur_group; + + // consume the group number + _cur_group++; + + // push the level + ++group_stack_level; + + // start the new group with empty machines + cur_machine.build_null_machine(); + alter_machine.build_null_machine(); + break; + + case ')': + // if there's nothing on the stack, ignore this + if (group_stack_level == 0) + break; + + // take a level off the stack + --group_stack_level; + + /* + * Remove a nesting level. If we have a pending alternate + * expression, build the alternation expression. This will + * leave the entire group expression in alter_machine, + * regardless of whether an alternation was in progress or + * not. + */ + alternate_onto(&alter_machine, &cur_machine); + + /* + * Create a group machine that encloses the group and marks + * it with a group number. We assigned the group number + * when we parsed the open paren, so read that group number + * from the stack. + * + * Note that this will leave 'new_machine' with the entire + * group machine. + */ + build_group(&new_machine, &alter_machine, + group_stack[group_stack_level].group_id); + + /* + * Pop the stack - restore the alternation and current + * machines that were in progress before the group started. + */ + cur_machine = group_stack[group_stack_level].old_cur; + alter_machine = group_stack[group_stack_level].old_alter; + + /* + * Check the group expression (in new_machine) for postfix + * expressions + */ + goto apply_postfix; + + case '|': + /* + * Start a new alternation. This ends the current + * alternation; if we have a previous pending alternate, + * build an alternation machine out of the previous + * alternate and the current machine and move that to the + * alternate; otherwise, simply move the current machine to + * the pending alternate. + */ + alternate_onto(&alter_machine, &cur_machine); + + /* + * the alternation starts out with a blank slate, so null + * out the current machine + */ + cur_machine.build_null_machine(); + break; + + case '%': + // quoted character - skip the quote mark and see what we have + ++expr; + --exprlen; + + // check to see if we're at the end of the expression + if (exprlen == 0) { + /* + * end of the string - ignore it, but undo the extra + * increment of the expression index so that we exit the + * enclosing loop properly + */ + --expr; + ++exprlen; + break; + } + + // see what we have + switch(*expr) { + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + // group match - build a new literal group recognizer + build_group_matcher(&new_machine, (int)(*expr - '1')); + + // apply any postfix expression to the group recognizer + goto apply_postfix; + + case '<': + // build a beginning-of-word recognizer + build_char(&new_machine, RE_WORD_BEGIN); + + // it can't be postfixed - just concatenate it + concat_onto(&cur_machine, &new_machine); + break; + + case '>': + // build an end-of-word recognizer */ + build_char(&new_machine, RE_WORD_END); + + // it can't be postfixed - just concatenate it + concat_onto(&cur_machine, &new_machine); + break; + + case 'w': + // word character + build_char(&new_machine, RE_WORD_CHAR); + goto apply_postfix; + + case 'W': + // non-word character + build_char(&new_machine, RE_NON_WORD_CHAR); + goto apply_postfix; + + case 'b': + // word boundary + build_char(&new_machine, RE_WORD_BOUNDARY); + + // it can't be postfixed + concat_onto(&cur_machine, &new_machine); + break; + + case 'B': + // not a word boundary + build_char(&new_machine, RE_NON_WORD_BOUNDARY); + + // it can't be postfixed + concat_onto(&cur_machine, &new_machine); + break; + + default: + // build a new literal character recognizer + build_char(&new_machine, *expr); + + // apply any postfix expression to the character + goto apply_postfix; + } + break; + + case '.': + /* + * wildcard character - build a single character recognizer + * for the special wildcard symbol, then go check it for a + * postfix operator + */ + build_char(&new_machine, RE_WILDCARD); + goto apply_postfix; + break; + + case '[': { + // range expression + int is_exclusive = false; + unsigned char set[32]; + + // clear out the set of characters in the range + memset(set, 0, sizeof(set)); + + // first, skip the open bracket + ++expr; + --exprlen; + + // check to see if starts with the exclusion character + if (exprlen != 0 && *expr == '^') { + // skip the exclusion specifier + ++expr; + --exprlen; + + // note it + is_exclusive = true; + } + + // if the first character is a ']', include it in the range + if (exprlen != 0 && *expr == ']') { + re_set_bit(set, (int)']'); + ++expr; + --exprlen; + } + + // if the next character is a '-', include it in the range + if (exprlen != 0 && *expr == '-') { + re_set_bit(set, (int)'-'); + ++expr; + --exprlen; + } + + // scan the character set + while (exprlen != 0 && *expr != ']') { + int ch; + + // note this character + ch = (int)(unsigned char)*expr; + + // set it + re_set_bit(set, ch); + + // skip this character of the expression + ++expr; + --exprlen; + + // check for a range + if (exprlen != 0 && *expr == '-') { + int ch2; + + // skip the '-' + ++expr; + --exprlen; + if (exprlen != 0) { + // get the other end of the range + ch2 = (int)(unsigned char)*expr; + + // skip the second character + ++expr; + --exprlen; + + // if the range is reversed, swap it + if (ch > ch2) + SWAP(ch, ch2); + + // fill in the range + for ( ; ch <= ch2 ; ++ch) + re_set_bit(set, ch); + } + } + } + + // create a character range machine + build_char_range(&new_machine, set, is_exclusive); + + // apply any postfix operator + goto apply_postfix; + break; + } + + default: + normal_char: + /* + * it's an ordinary character - build a single character + * recognizer machine, and then concatenate it onto any + * existing machine + */ + build_char(&new_machine, *expr); + + apply_postfix: + /* + * Check for a postfix operator, and apply it to the machine + * in 'new_machine' if present. In any case, concatenate + * the 'new_machine' (modified by a postix operator or not) + * to the current machien. + */ + if (exprlen > 1) { + switch(*(expr+1)) { + case '*': + case '+': + case '?': + /* + * We have a postfix closure operator. Build a new + * closure machine out of 'new_machine'. + */ + { + re_machine closure_machine; + + // move onto the closure operator + ++expr; + --exprlen; + + // build the closure machine + build_closure(&closure_machine, &new_machine, *expr); + + // replace the original machine with the closure + new_machine = closure_machine; + + /* + * skip any redundant closure symbols, keeping + * only the first one we saw + */ + while (exprlen > 1 && (*(expr+1) == '?' + || *(expr+1) == '+' + || *(expr+1) == '*')) { + ++expr; + --exprlen; + } + } + break; + + default: + /* no postfix operator */ + break; + } + } + + /* + * Concatenate the new machine onto the current machine + * under construction. + */ + concat_onto(&cur_machine, &new_machine); + break; + } + } + + // complete any pending alternation + alternate_onto(&alter_machine, &cur_machine); + + // store the resulting machine in the caller's machine descriptor + *result_machine = alter_machine; + + // no errors encountered + return RE_STATUS_SUCCESS; +} + +void re_context::note_group(re_group_register *regs, re_state_id id, const char *p) { + int group_index; + + /* + * Check to see if this is a valid state and it's a group marker - + * if not, there's nothing to do + */ + if (id == RE_STATE_INVALID + || !(_tuple_arr[id].flags + & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END)) + || (group_index = (int)_tuple_arr[id].ch) >= RE_GROUP_REG_CNT) + return; + + // It's a valid group marker - note the appropriate register value + if ((_tuple_arr[id].flags & RE_STATE_GROUP_BEGIN) != 0) + regs[group_index].start_ofs = p; + else + regs[group_index].end_ofs = p; +} + +bool re_context::is_word_char(char c) const { + return Common::isAlnum(c); +} + +int re_context::match(const char *entire_str, const char *str, size_t origlen, + const re_machine *machine, re_group_register *regs) { + re_state_id cur_state; + const char *p; + size_t curlen; + + // start at the machine's initial state + cur_state = machine->init; + + // start at the beginning of the string + p = str; + curlen = origlen; + + // note any group involved in the initial state + note_group(regs, cur_state, p); + + /* + * if we're starting in the final state, immediately return success + * with a zero-length match + */ + if (cur_state == machine->final) { + // return success with a zero-length match + return 0; + } + + // run the machine + for (;;) { + re_tuple *tuple; + + // get the tuple for this state + tuple = &_tuple_arr[cur_state]; + + // if this is a group state, adjust the group registers + note_group(regs, cur_state, p); + + // see what kind of state we're in + if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END)) + && tuple->ch != RE_EPSILON) { + /* + * This is a character or group recognizer state. If we + * match the character or group, continue on to the next + * state; otherwise, return failure. + */ + switch(tuple->ch) { + case RE_GROUP_MATCH_0: + case RE_GROUP_MATCH_0 + 1: + case RE_GROUP_MATCH_0 + 2: + case RE_GROUP_MATCH_0 + 3: + case RE_GROUP_MATCH_0 + 4: + case RE_GROUP_MATCH_0 + 5: + case RE_GROUP_MATCH_0 + 6: + case RE_GROUP_MATCH_0 + 7: + case RE_GROUP_MATCH_0 + 8: + case RE_GROUP_MATCH_0 + 9: { + int group_num; + re_group_register *group_reg; + size_t reg_len; + + // it's a group - get the group number + group_num = tuple->ch - RE_GROUP_MATCH_0; + group_reg = ®s[group_num]; + + /* + * if this register isn't defined, there's nothing + * to match, so fail + */ + if (group_reg->start_ofs == 0 || group_reg->end_ofs == 0) + return -1; + + // calculate the length of the register value + reg_len = group_reg->end_ofs - group_reg->start_ofs; + + // if we don't have enough left to match, it fails + if (curlen < reg_len) + return -1; + + // if the string doesn't match exactly, we fail + if (memcmp(p, group_reg->start_ofs, reg_len) != 0) + return -1; + + /* + * It matches exactly - skip the entire length of + * the register in the source string + */ + p += reg_len; + curlen -= reg_len; + break; + } + + case RE_TEXT_BEGIN: + /* + * Match only the exact beginning of the string - if + * we're anywhere else, this isn't a match. If this + * succeeds, we don't skip any characters. + */ + if (p != entire_str) + return -1; + break; + + case RE_TEXT_END: + /* + * Match only the exact end of the string - if we're + * anywhere else, this isn't a match. Don't skip any + * characters on success. + */ + if (curlen != 0) + return -1; + break; + + case RE_WORD_BEGIN: + /* + * if the previous character is a word character, we're + * not at the beginning of a word + */ + if (p != entire_str && is_word_char(*(p - 1))) + return -1; + + /* + * if we're at the end of the string, or the current + * character isn't the start of a word, we're not at the + * beginning of a word + */ + if (curlen == 0 || !is_word_char(*p)) + return -1; + break; + + case RE_WORD_END: + /* + * if the current character is a word character, we're not + * at the end of a word + */ + if (curlen != 0 && is_word_char(*p)) + return -1; + + /* + * if we're at the beginning of the string, or the + * previous character is not a word character, we're not + * at the end of a word + */ + if (p == entire_str || !is_word_char(*(p - 1))) + return -1; + break; + + case RE_WORD_CHAR: + /* if it's not a word character, it's a failure */ + if (curlen == 0 || !is_word_char(*p)) + return -1; + + /* skip this character of input */ + ++p; + --curlen; + break; + + case RE_NON_WORD_CHAR: + /* if it's a word character, it's a failure */ + if (curlen == 0 || is_word_char(*p)) + return -1; + + /* skip the input */ + ++p; + --curlen; + break; + + case RE_WORD_BOUNDARY: + case RE_NON_WORD_BOUNDARY: + { + int prev_is_word; + int next_is_word; + int boundary; + + /* + * Determine if the previous character is a word + * character -- if we're at the beginning of the + * string, it's obviously not, otherwise check its + * classification + */ + prev_is_word = (p != entire_str + && is_word_char(*(p - 1))); + + /* make the same check for the current character */ + next_is_word = (curlen != 0 + && is_word_char(*p)); + + /* + * Determine if this is a boundary - it is if the + * two states are different + */ + boundary = ((prev_is_word != 0) ^ (next_is_word != 0)); + + /* + * make sure it matches what was desired, and return + * failure if not + */ + if ((tuple->ch == RE_WORD_BOUNDARY && !boundary) + || (tuple->ch == RE_NON_WORD_BOUNDARY && boundary)) + return -1; + } + break; + + case RE_WILDCARD: + // make sure we have a character to match + if (curlen == 0) + return -1; + + // skip this character + ++p; + --curlen; + break; + + case RE_RANGE: + case RE_RANGE_EXCL: { + int match; + + // make sure we have a character to match + if (curlen == 0) + return -1; + + // see if we match + match = re_is_bit_set(tuple->char_range, + (int)(unsigned char)*p); + + // make sure we got what we wanted + if ((tuple->ch == RE_RANGE && !match) + || (tuple->ch == RE_RANGE_EXCL && match)) + return -1; + + // skip this character of the input + ++p; + --curlen; + break; + } + + default: + // make sure we have an exact match + if (curlen == 0 || tuple->ch != *p) + return -1; + + // skip this character of the input + ++p; + --curlen; + break; + } + + /* + * if we got this far, we were successful - move on to the + * next state + */ + cur_state = tuple->next_state_1; + } else if (tuple->next_state_2 == RE_STATE_INVALID) { + /* + * We have only one transition, so this state is entirely + * deterministic. Simply move on to the next state. + */ + cur_state = tuple->next_state_1; + } else { + re_machine sub_machine; + re_group_register regs1[RE_GROUP_REG_CNT]; + re_group_register regs2[RE_GROUP_REG_CNT]; + int ret1; + int ret2; + + /* + * This state has two possible transitions, and we don't + * know which one to take. So, try both, see which one + * works better, and return the result. Try the first + * transition first. Note that each separate attempt must + * use a separate copy of the registers. + */ + memcpy(regs1, regs, sizeof(regs1)); + sub_machine.init = tuple->next_state_1; + sub_machine.final = machine->final; + ret1 = match(entire_str, p, curlen, &sub_machine, regs1); + + /* + * Now try the second transition + */ + memcpy(regs2, regs, sizeof(regs2)); + sub_machine.init = tuple->next_state_2; + sub_machine.final = machine->final; + ret2 = match(entire_str, p, curlen, &sub_machine, regs2); + + /* + * If they both failed, the whole thing failed. Otherwise, + * return the longer of the two, plus the length we + * ourselves matched previously. Note that we return the + * register set from the winning match. + */ + if (ret1 < 0 && ret2 < 0) { + // they both failed + return -1; + } else if (ret1 > ret2) { + // use the first register set and result length + memcpy(regs, regs1, sizeof(regs1)); + return ret1 + (p - str); + } else { + // use the second register set and result length + memcpy(regs, regs2, sizeof(regs2)); + return ret2 + (p - str); + } + } + + // If we're in the final state, return success + if (cur_state == machine->final) { + // finish off any group involved in the final state + note_group(regs, cur_state, p); + + // return the length we matched + return p - str; + } + } +} + +int re_context::search(const char *str, size_t len, const re_machine *machine, + re_group_register *regs, int *result_len) { + int ofs; + + /* + * Starting at the first character in the string, search for the + * pattern at each subsequent character until we either find the + * pattern or run out of string to test. + */ + for (ofs = 0 ; ofs < (int)len ; ++ofs) { + int matchlen; + + // check for a match + matchlen = match(str, str + ofs, len - ofs, machine, regs); + if (matchlen >= 0) { + // we found a match here - return the length and offset + *result_len = matchlen; + return ofs; + } + } + + // we didn't find a match + return -1; +} + +void re_context::save_search_str(const char *str, size_t len) { + // if the string is empty, this is easy + if (len == 0) { + // nothing to store - just save the length and return + _curlen = 0; + return; + } + + // if the current buffer isn't big enough, allocate a new one + if (_strbuf == 0 || _strbufsiz < len) { + /* + * free any previous buffer - its contents are no longer + * important, since we're about to overwrite it with a new + * string + */ + if (_strbuf != 0) + mchfre(_strbuf); + + /* + * allocate a new buffer; round up to the next 256-byte + * increment to make sure we're not constantly reallocating to + * random sizes + */ + _strbufsiz = ((len + 255) & ~255); + + // allocate it + _strbuf = (char *)mchalo(_errctx, _strbufsiz, "regex str"); + } + + // copy the string + memcpy(_strbuf, str, len); + + // save the length + _curlen = len; +} + +int re_context::compile_and_search(const char *pattern, size_t patlen, + const char *searchstr, size_t searchlen, int *result_len) { + re_machine machine; + + // compile the expression - return failure if we get an error + if (compile(pattern, patlen, &machine) != RE_STATUS_SUCCESS) + return -1; + + // save the search string in our internal buffer + save_search_str(searchstr, searchlen); + + // clear the group registers + memset(_regs, 0, sizeof(_regs)); + + /* + * search for the pattern in our copy of the string - use the copy + * so that the group registers stay valid even if the caller + * deallocates the original string after we return + */ + return search(_strbuf, _curlen, &machine, _regs, result_len); +} + +int re_context::compile_and_match(const char *pattern, size_t patlen, + const char *searchstr, size_t searchlen) { + re_machine machine; + + // compile the expression - return failure if we get an error + if (compile(pattern, patlen, &machine) != RE_STATUS_SUCCESS) + return 0; + + // save the search string in our internal buffer + save_search_str(searchstr, searchlen); + + // clear the group registers + memset(_regs, 0, sizeof(_regs)); + + // match the string + return match(_strbuf, _strbuf, _curlen, &machine, _regs); +} + +} // End of namespace TADS2 +} // End of namespace TADS +} // Engine of namespace GLK |