aboutsummaryrefslogtreecommitdiff
path: root/engines/glk/tads/tads2/regex.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'engines/glk/tads/tads2/regex.cpp')
-rw-r--r--engines/glk/tads/tads2/regex.cpp2481
1 files changed, 1396 insertions, 1085 deletions
diff --git a/engines/glk/tads/tads2/regex.cpp b/engines/glk/tads/tads2/regex.cpp
index a7f7d99369..5d608dfa4a 100644
--- a/engines/glk/tads/tads2/regex.cpp
+++ b/engines/glk/tads/tads2/regex.cpp
@@ -68,10 +68,8 @@ Notes
*/
#include "glk/tads/tads2/regex.h"
-#include "glk/tads/tads2/ler.h"
-#include "glk/tads/tads2/os.h"
-//#include "glk/tads/tads2/std.h"
-#//include "engines/glk/tads/tads2/ler.h"
+#include "glk/tads/tads2/memory_cache_heap.h"
+#include "common/util.h"
namespace Glk {
namespace TADS {
@@ -128,1148 +126,1461 @@ enum {
RE_GROUP_MATCH_9 = (RE_GROUP_MATCH_0 + 9)
};
-re_context::re_context(errcxdef *errctx) {
- // save the error context
- _errctx = errctx;
-
- // clear states
- _next_state = RE_STATE_FIRST_VALID;
+/* ------------------------------------------------------------------------ */
+/*
+ * A machine description. Machines are fully described by their initial
+ * and final state ID's.
+ */
+struct re_machine {
+ /* the machine's initial state */
+ re_state_id init;
- // clear groups
- _cur_group = 0;
+ /* the machine's final state */
+ re_state_id final;
+};
- // no string buffer yet
- _strbuf = 0;
+/* ------------------------------------------------------------------------ */
+/*
+ * Initialize the context. The memory for the context structure itself
+ * is allocated and maintained by the caller.
+ */
+void re_init(re_context *ctx, errcxdef *errctx)
+{
+ /* save the error context */
+ ctx->errctx = errctx;
+
+ /* no tuple array yet */
+ ctx->tuple_arr = 0;
+ ctx->tuples_alloc = 0;
+
+ /* clear states */
+ ctx->next_state = RE_STATE_FIRST_VALID;
+
+ /* clear groups */
+ ctx->cur_group = 0;
+
+ /* no string buffer yet */
+ ctx->strbuf = 0;
}
-re_context::~re_context() {
- // reset state
- reset();
-
- // if we allocated a string buffer, delete it
- if (_strbuf != 0) {
- mchfre(_strbuf);
- _strbuf = nullptr;
- }
+/* ------------------------------------------------------------------------ */
+/*
+ * Reset compiler - clears states and tuples
+ */
+static void re_reset(re_context *ctx)
+{
+ int i;
+
+ /* delete any range tables we've allocated */
+ for (i = 0 ; i < ctx->next_state ; ++i)
+ {
+ if (ctx->tuple_arr[i].char_range != 0)
+ {
+ mchfre(ctx->tuple_arr[i].char_range);
+ ctx->tuple_arr[i].char_range = 0;
+ }
+ }
+
+ /* clear states */
+ ctx->next_state = RE_STATE_FIRST_VALID;
+
+ /* clear groups */
+ ctx->cur_group = 0;
}
-void re_context::reset() {
- int i;
-
- // delete any range tables we've allocated
- for (i = 0 ; i < _next_state ; ++i) {
- if (_tuple_arr[i].char_range != 0) {
- mchfre(_tuple_arr[i].char_range);
- _tuple_arr[i].char_range = 0;
- }
- }
-
- // clear states
- _next_state = RE_STATE_FIRST_VALID;
-
- // clear groups
- _cur_group = 0;
+/* ------------------------------------------------------------------------ */
+/*
+ * Delete the context - frees structures associated with the context.
+ * Does NOT free the memory used by the context structure itself.
+ */
+void re_delete(re_context *ctx)
+{
+ /* reset state */
+ re_reset(ctx);
+
+ /* if we've allocated an array, delete it */
+ if (ctx->tuple_arr != 0)
+ {
+ mchfre(ctx->tuple_arr);
+ ctx->tuple_arr = 0;
+ }
+
+ /* if we allocated a string buffer, delete it */
+ if (ctx->strbuf != 0)
+ {
+ mchfre(ctx->strbuf);
+ ctx->strbuf = 0;
+ }
}
-re_state_id re_context::alloc_state() {
- // If we don't have enough room for another state, expand the array
- if (_next_state >= (int)_tuple_arr.size()) {
- // bump the size by a bit
- _tuple_arr.resize(_tuple_arr.size() + 100);
- }
-
- // initialize the next state
- _tuple_arr[_next_state].next_state_1 = RE_STATE_INVALID;
- _tuple_arr[_next_state].next_state_2 = RE_STATE_INVALID;
- _tuple_arr[_next_state].ch = RE_EPSILON;
- _tuple_arr[_next_state].flags = 0;
- _tuple_arr[_next_state].char_range = 0;
-
- // return the new state's ID
- return _next_state++;
+/* ------------------------------------------------------------------------ */
+/*
+ * Allocate a new state ID
+ */
+static re_state_id re_alloc_state(re_context *ctx)
+{
+ /*
+ * If we don't have enough room for another state, expand the array
+ */
+ if (ctx->next_state >= ctx->tuples_alloc)
+ {
+ uint new_alloc;
+
+ /* bump the size by a bit */
+ new_alloc = ctx->tuples_alloc + 100;
+
+ /* allocate or expand the array */
+ if (ctx->tuples_alloc == 0)
+ {
+ /* allocate the initial memory block */
+ ctx->tuple_arr =
+ (re_tuple *)mchalo(ctx->errctx,
+ (new_alloc * sizeof(re_tuple)),
+ "regex");
+ }
+ else
+ {
+ re_tuple *ptr;
+
+ /* allocate a new memory block */
+ ptr = (re_tuple *)mchalo(ctx->errctx,
+ (new_alloc * sizeof(re_tuple)),
+ "regex");
+
+ /* copy the old memory to the new memory */
+ memcpy(ptr, ctx->tuple_arr, ctx->tuples_alloc * sizeof(re_tuple));
+
+ /* free the old block */
+ mchfre(ctx->tuple_arr);
+
+ /* use the new block */
+ ctx->tuple_arr = ptr;
+ }
+
+ /* remember the new allocation size */
+ ctx->tuples_alloc = new_alloc;
+ }
+
+ /* initialize the next state */
+ ctx->tuple_arr[ctx->next_state].next_state_1 = RE_STATE_INVALID;
+ ctx->tuple_arr[ctx->next_state].next_state_2 = RE_STATE_INVALID;
+ ctx->tuple_arr[ctx->next_state].ch = RE_EPSILON;
+ ctx->tuple_arr[ctx->next_state].flags = 0;
+ ctx->tuple_arr[ctx->next_state].char_range = 0;
+
+ /* return the new state's ID */
+ return ctx->next_state++;
}
-void re_context::set_trans(re_state_id id, re_state_id dest_id, char ch) {
- re_tuple *tuple;
-
- /*
- * get the tuple containing the transitions for this state ID - the
- * state ID is the index of the state's transition tuple in the array
- */
- tuple = &_tuple_arr[id];
-
- /*
- * If the first state pointer hasn't been set yet, set it to the new
- * destination. Otherwise, set the second state pointer.
- *
- * Only set the character on setting the first state. When setting
- * the second state, we must assume that the character for the state
- * has already been set, since any given state can have only one
- * character setting.
- */
- if (tuple->next_state_1 == RE_STATE_INVALID) {
- /*
- * set the character ID, unless the state has been marked with a
- * special flag which indicates that the character value has
- * another meaning (in particular, a group marker)
- */
- if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END)))
- tuple->ch = ch;
-
- // set the first transition
- tuple->next_state_1 = dest_id;
- } else {
- // set only the second transition state - don't set the character
- tuple->next_state_2 = dest_id;
- }
+
+/* ------------------------------------------------------------------------ */
+/*
+ * Set a transition from a state to a given destination state.
+ */
+static void re_set_trans(re_context *ctx,
+ re_state_id id, re_state_id dest_id, char ch)
+{
+ re_tuple *tuple;
+
+ /*
+ * get the tuple containing the transitions for this state ID - the
+ * state ID is the index of the state's transition tuple in the
+ * array
+ */
+ tuple = &ctx->tuple_arr[id];
+
+ /*
+ * If the first state pointer hasn't been set yet, set it to the new
+ * destination. Otherwise, set the second state pointer.
+ *
+ * Only set the character on setting the first state. When setting
+ * the second state, we must assume that the character for the state
+ * has already been set, since any given state can have only one
+ * character setting.
+ */
+ if (tuple->next_state_1 == RE_STATE_INVALID)
+ {
+ /*
+ * set the character ID, unless the state has been marked with a
+ * special flag which indicates that the character value has
+ * another meaning (in particular, a group marker)
+ */
+ if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END)))
+ tuple->ch = ch;
+
+ /* set the first transition */
+ tuple->next_state_1 = dest_id;
+ }
+ else
+ {
+ /* set only the second transition state - don't set the character */
+ tuple->next_state_2 = dest_id;
+ }
}
-void re_context::init_machine(re_machine *machine) {
- machine->init = alloc_state();
- machine->final = alloc_state();
+/* ------------------------------------------------------------------------ */
+/*
+ * Initialize a new machine, giving it an initial and final state
+ */
+static void re_init_machine(re_context *ctx, re_machine *machine)
+{
+ machine->init = re_alloc_state(ctx);
+ machine->final = re_alloc_state(ctx);
}
-void re_context::build_char(re_machine *machine, char ch) {
- // initialize our new machine
- init_machine(machine);
+/*
+ * Build a character recognizer
+ */
+static void re_build_char(re_context *ctx, re_machine *machine, char ch)
+{
+ /* initialize our new machine */
+ re_init_machine(ctx, machine);
- // allocate a transition tuple for the new state
- set_trans(machine->init, machine->final, ch);
+ /* allocate a transition tuple for the new state */
+ re_set_trans(ctx, machine->init, machine->final, ch);
}
-void re_context::build_char_range(re_machine *machine, unsigned char *range, int exclusion) {
- unsigned char *range_copy;
-
- // initialize our new machine
- init_machine(machine);
-
- // allocate a transition table for the new state
- set_trans(machine->init, machine->final, (char)(exclusion ? RE_RANGE_EXCL : RE_RANGE));
+/*
+ * Build a character range recognizer. 'range' is a 256-bit (32-byte)
+ * bit vector.
+ */
+static void re_build_char_range(re_context *ctx, re_machine *machine,
+ unsigned char *range, int exclusion)
+{
+ unsigned char *range_copy;
+
+ /* initialize our new machine */
+ re_init_machine(ctx, machine);
+
+ /* allocate a transition table for the new state */
+ re_set_trans(ctx, machine->init, machine->final,
+ (char)(exclusion ? RE_RANGE_EXCL : RE_RANGE));
+
+ /* allocate a copy of the range bit vector */
+ range_copy = (unsigned char *)mchalo(ctx->errctx, 32, "regex range");
+
+ /* copy the caller's range */
+ memcpy(range_copy, range, 32);
+
+ /* store it in the tuple */
+ ctx->tuple_arr[machine->init].char_range = range_copy;
+}
+
- // allocate a copy of the range bit vector
- range_copy = (unsigned char *)mchalo(_errctx, 32, "regex range");
+/*
+ * Build a group recognizer. This is almost the same as a character
+ * recognizer, but matches a previous group rather than a literal
+ * character.
+ */
+static void re_build_group_matcher(re_context *ctx,
+ re_machine *machine, int group_num)
+{
+ /* initialize our new machine */
+ re_init_machine(ctx, machine);
+
+ /*
+ * Allocate a transition tuple for the new state, using the group ID
+ * as the character code. Store the special code for a group
+ * recognizer rather than the normal literal character code.
+ */
+ re_set_trans(ctx, machine->init, machine->final,
+ (char)(group_num + RE_GROUP_MATCH_0));
+}
- // copy the caller's range
- memcpy(range_copy, range, 32);
- // store it in the tuple
- _tuple_arr[machine->init].char_range = range_copy;
+/*
+ * Build a concatenation recognizer
+ */
+static void re_build_concat(re_context *ctx, re_machine *new_machine,
+ re_machine *lhs, re_machine *rhs)
+{
+ /* initialize the new machine */
+ re_init_machine(ctx, new_machine);
+
+ /*
+ * set up an epsilon transition from the new machine's initial state
+ * to the first submachine's initial state
+ */
+ re_set_trans(ctx, new_machine->init, lhs->init, RE_EPSILON);
+
+ /*
+ * Set up an epsilon transition from the first submachine's final
+ * state to the second submachine's initial state
+ */
+ re_set_trans(ctx, lhs->final, rhs->init, RE_EPSILON);
+
+ /*
+ * Set up an epsilon transition from the second submachine's final
+ * state to our new machine's final state
+ */
+ re_set_trans(ctx, rhs->final, new_machine->final, RE_EPSILON);
}
-void re_context::build_group_matcher(re_machine *machine, int group_num) {
- // initialize our new machine
- init_machine(machine);
-
- /*
- * Allocate a transition tuple for the new state, using the group ID
- * as the character code. Store the special code for a group
- * recognizer rather than the normal literal character code.
- */
- set_trans(machine->init, machine->final, (char)(group_num + RE_GROUP_MATCH_0));
+/*
+ * Build a group machine. sub_machine contains the machine that
+ * expresses the group's contents; we'll fill in new_machine with a
+ * newly-created machine that encloses and marks the group.
+ */
+static void re_build_group(re_context *ctx, re_machine *new_machine,
+ re_machine *sub_machine, int group_id)
+{
+ /* initialize the container machine */
+ re_init_machine(ctx, new_machine);
+
+ /*
+ * set up an epsilon transition from the new machine's initial state
+ * into the initial state of the group, and another transition from
+ * the group's final state into the container's final state
+ */
+ re_set_trans(ctx, new_machine->init, sub_machine->init, RE_EPSILON);
+ re_set_trans(ctx, sub_machine->final, new_machine->final, RE_EPSILON);
+
+ /*
+ * Mark the initial and final states of the group machine as being
+ * group markers.
+ */
+ ctx->tuple_arr[new_machine->init].flags |= RE_STATE_GROUP_BEGIN;
+ ctx->tuple_arr[new_machine->final].flags |= RE_STATE_GROUP_END;
+
+ /* store the group ID in the 'ch' member of the start and end states */
+ ctx->tuple_arr[new_machine->init].ch = group_id;
+ ctx->tuple_arr[new_machine->final].ch = group_id;
}
-void re_context::build_concat(re_machine *new_machine, re_machine *lhs, re_machine *rhs) {
- // initialize the new machine
- init_machine(new_machine);
-
- /*
- * Set up an epsilon transition from the new machine's initial state
- * to the first submachine's initial state
- */
- set_trans(new_machine->init, lhs->init, RE_EPSILON);
-
- /*
- * Set up an epsilon transition from the first submachine's final
- * state to the second submachine's initial state
- */
- set_trans(lhs->final, rhs->init, RE_EPSILON);
-
- /*
- * Set up an epsilon transition from the second submachine's final
- * state to our new machine's final state
- */
- set_trans(rhs->final, new_machine->final, RE_EPSILON);
+/*
+ * Build an alternation recognizer
+ */
+static void re_build_alter(re_context *ctx, re_machine *new_machine,
+ re_machine *lhs, re_machine *rhs)
+{
+ /* initialize the new machine */
+ re_init_machine(ctx, new_machine);
+
+ /*
+ * Set up an epsilon transition from our new machine's initial state
+ * to the initial state of each submachine
+ */
+ re_set_trans(ctx, new_machine->init, lhs->init, RE_EPSILON);
+ re_set_trans(ctx, new_machine->init, rhs->init, RE_EPSILON);
+
+ /*
+ * Set up an epsilon transition from the final state of each
+ * submachine to our final state
+ */
+ re_set_trans(ctx, lhs->final, new_machine->final, RE_EPSILON);
+ re_set_trans(ctx, rhs->final, new_machine->final, RE_EPSILON);
}
-void re_context::build_group(re_machine *new_machine, re_machine *sub_machine, int group_id) {
- // initialize the container machine
- init_machine(new_machine);
-
- /*
- * set up an epsilon transition from the new machine's initial state
- * into the initial state of the group, and another transition from
- * the group's final state into the container's final state
- */
- set_trans(new_machine->init, sub_machine->init, RE_EPSILON);
- set_trans(sub_machine->final, new_machine->final, RE_EPSILON);
-
- // Mark the initial and final states of the group machine as being group markers
- _tuple_arr[new_machine->init].flags |= RE_STATE_GROUP_BEGIN;
- _tuple_arr[new_machine->final].flags |= RE_STATE_GROUP_END;
-
- // store the group ID in the 'ch' member of the start and end states
- _tuple_arr[new_machine->init].ch = group_id;
- _tuple_arr[new_machine->final].ch = group_id;
+/*
+ * Build a closure recognizer
+ */
+static void re_build_closure(re_context *ctx,
+ re_machine *new_machine, re_machine *sub,
+ char specifier)
+{
+ /* initialize the new machine */
+ re_init_machine(ctx, new_machine);
+
+ /*
+ * set up an epsilon transition from our initial state to the
+ * submachine's initial state, and from the submachine's final state
+ * to our final state
+ */
+ re_set_trans(ctx, new_machine->init, sub->init, RE_EPSILON);
+ re_set_trans(ctx, sub->final, new_machine->final, RE_EPSILON);
+
+ /*
+ * If this is an unbounded closure ('*' or '+', but not '?'), set up
+ * the loop transition that takes us from the new machine's final
+ * state back to its initial state. We don't do this on the
+ * zero-or-one closure, because we can only match the expression
+ * once.
+ */
+ if (specifier != '?')
+ re_set_trans(ctx, sub->final, sub->init, RE_EPSILON);
+
+ /*
+ * If this is a zero-or-one closure or a zero-or-more closure, set
+ * up an epsilon transition from our initial state to our final
+ * state, since we can skip the entire subexpression. We don't do
+ * this on the one-or-more closure, because we can't skip the
+ * subexpression in this case.
+ */
+ if (specifier != '+')
+ re_set_trans(ctx, new_machine->init, new_machine->final, RE_EPSILON);
}
-void re_context::build_alter(re_machine *new_machine, re_machine *lhs, re_machine *rhs) {
- // initialize the new machine
- init_machine(new_machine);
-
- /*
- * Set up an epsilon transition from our new machine's initial state
- * to the initial state of each submachine
- */
- set_trans(new_machine->init, lhs->init, RE_EPSILON);
- set_trans(new_machine->init, rhs->init, RE_EPSILON);
-
- /*
- * Set up an epsilon transition from the final state of each
- * submachine to our final state
- */
- set_trans(lhs->final, new_machine->final, RE_EPSILON);
- set_trans(rhs->final, new_machine->final, RE_EPSILON);
+/*
+ * Build a null machine
+ */
+static void re_build_null_machine(re_context *ctx, re_machine *machine)
+{
+ machine->init = machine->final = RE_STATE_INVALID;
}
-void re_context::build_closure(re_machine *new_machine, re_machine *sub, char specifier) {
- // initialize the new machine
- init_machine(new_machine);
-
- /*
- * Set up an epsilon transition from our initial state to the submachine's initial
- * state, and from the submachine's final state to our final state
- */
- set_trans(new_machine->init, sub->init, RE_EPSILON);
- set_trans(sub->final, new_machine->final, RE_EPSILON);
-
- /*
- * If this is an unbounded closure ('*' or '+', but not '?'), set up
- * the loop transition that takes us from the new machine's final
- * state back to its initial state. We don't do this on the
- * zero-or-one closure, because we can only match the expression
- * once.
- */
- if (specifier != '?')
- set_trans(sub->final, sub->init, RE_EPSILON);
-
- /*
- * If this is a zero-or-one closure or a zero-or-more closure, set
- * up an epsilon transition from our initial state to our final
- * state, since we can skip the entire subexpression. We don't do
- * this on the one-or-more closure, because we can't skip the
- * subexpression in this case.
- */
- if (specifier != '+')
- set_trans(new_machine->init, new_machine->final, RE_EPSILON);
+/* ------------------------------------------------------------------------ */
+/*
+ * Determine if a machine is null
+ */
+static int re_is_machine_null(re_context *ctx, re_machine *machine)
+{
+ return (machine->init == RE_STATE_INVALID);
}
-void re_context::concat_onto(re_machine *dest, re_machine *rhs) {
- // check for a null destination machine
- if (dest->isNull()) {
- /*
- * the first machine is null - simply copy the second machine
- * onto the first unchanged
- */
- *dest = *rhs;
- } else {
- re_machine new_machine;
-
- // build the concatenated machine
- build_concat(&new_machine, dest, rhs);
-
- // copy the concatenated machine onto the first machine
- *dest = new_machine;
- }
+
+/* ------------------------------------------------------------------------ */
+/*
+ * Concatenate the second machine onto the first machine, replacing the
+ * first machine with the resulting machine. If the first machine is a
+ * null machine (created with re_build_null_machine), we'll simply copy
+ * the second machine into the first.
+ */
+static void re_concat_onto(re_context *ctx,
+ re_machine *dest, re_machine *rhs)
+{
+ /* check for a null destination machine */
+ if (re_is_machine_null(ctx, dest))
+ {
+ /*
+ * the first machine is null - simply copy the second machine
+ * onto the first unchanged
+ */
+ *dest = *rhs;
+ }
+ else
+ {
+ re_machine new_machine;
+
+ /* build the concatenated machine */
+ re_build_concat(ctx, &new_machine, dest, rhs);
+
+ /* copy the concatenated machine onto the first machine */
+ *dest = new_machine;
+ }
}
-void re_context::alternate_onto(re_machine *dest, re_machine *rhs) {
- // check to see if the first machine is null
- if (dest->isNull()) {
- /*
- * the first machine is null - simply copy the second machine
- * onto the first
- */
- *dest = *rhs;
- } else {
- /*
- * if the second machine is null, don't do anything; otherwise,
- * build the alternation
- */
- if (!rhs->isNull()) {
- re_machine new_machine;
-
- // build the alternation
- build_alter(&new_machine, dest, rhs);
-
- // replace the first machine with the alternation
- *dest = new_machine;
- }
- }
+/*
+ * Alternate the second machine onto the first machine, replacing the
+ * first machine with the resulting machine. If the first machine is a
+ * null machine, this simply replaces the first machine with the second
+ * machine. If the second machine is null, this simply leaves the first
+ * machine unchanged.
+ */
+static void re_alternate_onto(re_context *ctx,
+ re_machine *dest, re_machine *rhs)
+{
+ /* check to see if the first machine is null */
+ if (re_is_machine_null(ctx, dest))
+ {
+ /*
+ * the first machine is null - simply copy the second machine
+ * onto the first
+ */
+ *dest = *rhs;
+ }
+ else
+ {
+ /*
+ * if the second machine is null, don't do anything; otherwise,
+ * build the alternation
+ */
+ if (!re_is_machine_null(ctx, rhs))
+ {
+ re_machine new_machine;
+
+ /* build the alternation */
+ re_build_alter(ctx, &new_machine, dest, rhs);
+
+ /* replace the first machine with the alternation */
+ *dest = new_machine;
+ }
+ }
}
-/**
- * Set a bit in a bit vector.
+/* ------------------------------------------------------------------------ */
+/*
+ * Set a bit in a bit vector.
*/
#define re_set_bit(set, bit) \
- (((unsigned char *)(set))[(bit) >> 3] |= (1 << ((bit) & 7)))
+ (((unsigned char *)(set))[(bit) >> 3] |= (1 << ((bit) & 7)))
-/**
- * Test a bit in a bit vector
+/*
+ * Test a bit in a bit vector
*/
#define re_is_bit_set(set, bit) \
- ((((unsigned char *)(set))[(bit) >> 3] & (1 << ((bit) & 7))) != 0)
-
-re_status_t re_context::compile(const char *expr, size_t exprlen, re_machine *result_machine) {
- re_machine cur_machine;
- re_machine alter_machine;
- re_machine new_machine;
- size_t group_stack_level;
- struct {
- re_machine old_cur;
- re_machine old_alter;
- int group_id;
- } group_stack[50];
-
- // reset everything
- reset();
-
- // start out with no current machine and no alternate machine
- cur_machine.build_null_machine();
- alter_machine.build_null_machine();
-
- // nothing on the stack yet
- group_stack_level = 0;
-
- // loop until we run out of expression to parse
- for (; exprlen != 0 ; ++expr, --exprlen) {
- switch(*expr) {
- case '^':
- /*
- * beginning of line - if we're not at the beginning of the
- * current expression (i.e., we already have some
- * concatentations accumulated), treat it as an ordinary
- * character
- */
- if (!cur_machine.isNull())
- goto normal_char;
-
- // build a new start-of-text recognizer
- build_char(&new_machine, RE_TEXT_BEGIN);
-
- /*
- * concatenate it onto the string - note that this can't
- * have any postfix operators
- */
- concat_onto(&cur_machine, &new_machine);
- break;
-
- case '$':
- /*
- * End of line specifier - if there's anything left after
- * the '$' other than a close parens or alternation
- * specifier, great it as a normal character
- */
- if (exprlen > 1
- && (*(expr+1) != ')' && *(expr+1) != '|'))
- goto normal_char;
-
- // build a new end-of-text recognizer
- build_char(&new_machine, RE_TEXT_END);
-
- /*
- * concatenate it onto the string - note that this can't
- * have any postfix operators
- */
- concat_onto(&cur_machine, &new_machine);
- break;
-
- case '(':
- /*
- * Add a nesting level. Push the current machine and
- * alternate machines onto the group stack, and clear
- * everything out for the new group.
- */
- if (group_stack_level > sizeof(group_stack)/sizeof(group_stack[0])) {
- /* we cannot proceed - return an error */
- return RE_STATUS_GROUP_NESTING_TOO_DEEP;
- }
-
- // save the current state on the stack
- group_stack[group_stack_level].old_cur = cur_machine;
- group_stack[group_stack_level].old_alter = alter_machine;
-
- /*
- * Assign the group a group ID - groups are numbered in
- * order of their opening (left) parentheses, so we want to
- * assign a group number now. We won't actually need to
- * know the group number until we get to the matching close
- * paren, but we need to assign it now, so store it in the
- * group stack.
- */
- group_stack[group_stack_level].group_id = _cur_group;
-
- // consume the group number
- _cur_group++;
-
- // push the level
- ++group_stack_level;
-
- // start the new group with empty machines
- cur_machine.build_null_machine();
- alter_machine.build_null_machine();
- break;
-
- case ')':
- // if there's nothing on the stack, ignore this
- if (group_stack_level == 0)
- break;
-
- // take a level off the stack
- --group_stack_level;
-
- /*
- * Remove a nesting level. If we have a pending alternate
- * expression, build the alternation expression. This will
- * leave the entire group expression in alter_machine,
- * regardless of whether an alternation was in progress or
- * not.
- */
- alternate_onto(&alter_machine, &cur_machine);
-
- /*
- * Create a group machine that encloses the group and marks
- * it with a group number. We assigned the group number
- * when we parsed the open paren, so read that group number
- * from the stack.
- *
- * Note that this will leave 'new_machine' with the entire
- * group machine.
- */
- build_group(&new_machine, &alter_machine,
- group_stack[group_stack_level].group_id);
-
- /*
- * Pop the stack - restore the alternation and current
- * machines that were in progress before the group started.
- */
- cur_machine = group_stack[group_stack_level].old_cur;
- alter_machine = group_stack[group_stack_level].old_alter;
-
- /*
- * Check the group expression (in new_machine) for postfix
- * expressions
- */
- goto apply_postfix;
-
- case '|':
- /*
- * Start a new alternation. This ends the current
- * alternation; if we have a previous pending alternate,
- * build an alternation machine out of the previous
- * alternate and the current machine and move that to the
- * alternate; otherwise, simply move the current machine to
- * the pending alternate.
- */
- alternate_onto(&alter_machine, &cur_machine);
-
- /*
- * the alternation starts out with a blank slate, so null
- * out the current machine
- */
- cur_machine.build_null_machine();
- break;
-
- case '%':
- // quoted character - skip the quote mark and see what we have
- ++expr;
- --exprlen;
-
- // check to see if we're at the end of the expression
- if (exprlen == 0) {
- /*
- * end of the string - ignore it, but undo the extra
- * increment of the expression index so that we exit the
- * enclosing loop properly
- */
- --expr;
- ++exprlen;
- break;
- }
-
- // see what we have
- switch(*expr) {
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- // group match - build a new literal group recognizer
- build_group_matcher(&new_machine, (int)(*expr - '1'));
-
- // apply any postfix expression to the group recognizer
- goto apply_postfix;
-
- case '<':
- // build a beginning-of-word recognizer
- build_char(&new_machine, RE_WORD_BEGIN);
-
- // it can't be postfixed - just concatenate it
- concat_onto(&cur_machine, &new_machine);
- break;
-
- case '>':
- // build an end-of-word recognizer */
- build_char(&new_machine, RE_WORD_END);
-
- // it can't be postfixed - just concatenate it
- concat_onto(&cur_machine, &new_machine);
- break;
-
- case 'w':
- // word character
- build_char(&new_machine, RE_WORD_CHAR);
- goto apply_postfix;
-
- case 'W':
- // non-word character
- build_char(&new_machine, RE_NON_WORD_CHAR);
- goto apply_postfix;
-
- case 'b':
- // word boundary
- build_char(&new_machine, RE_WORD_BOUNDARY);
-
- // it can't be postfixed
- concat_onto(&cur_machine, &new_machine);
- break;
-
- case 'B':
- // not a word boundary
- build_char(&new_machine, RE_NON_WORD_BOUNDARY);
-
- // it can't be postfixed
- concat_onto(&cur_machine, &new_machine);
- break;
-
- default:
- // build a new literal character recognizer
- build_char(&new_machine, *expr);
-
- // apply any postfix expression to the character
- goto apply_postfix;
- }
- break;
-
- case '.':
- /*
- * wildcard character - build a single character recognizer
- * for the special wildcard symbol, then go check it for a
- * postfix operator
- */
- build_char(&new_machine, RE_WILDCARD);
- goto apply_postfix;
- break;
-
- case '[': {
- // range expression
- int is_exclusive = false;
- unsigned char set[32];
-
- // clear out the set of characters in the range
- memset(set, 0, sizeof(set));
-
- // first, skip the open bracket
- ++expr;
- --exprlen;
-
- // check to see if starts with the exclusion character
- if (exprlen != 0 && *expr == '^') {
- // skip the exclusion specifier
- ++expr;
- --exprlen;
-
- // note it
- is_exclusive = true;
- }
-
- // if the first character is a ']', include it in the range
- if (exprlen != 0 && *expr == ']') {
- re_set_bit(set, (int)']');
- ++expr;
- --exprlen;
- }
-
- // if the next character is a '-', include it in the range
- if (exprlen != 0 && *expr == '-') {
- re_set_bit(set, (int)'-');
- ++expr;
- --exprlen;
- }
-
- // scan the character set
- while (exprlen != 0 && *expr != ']') {
- int ch;
-
- // note this character
- ch = (int)(unsigned char)*expr;
-
- // set it
- re_set_bit(set, ch);
-
- // skip this character of the expression
- ++expr;
- --exprlen;
-
- // check for a range
- if (exprlen != 0 && *expr == '-') {
- int ch2;
-
- // skip the '-'
- ++expr;
- --exprlen;
- if (exprlen != 0) {
- // get the other end of the range
- ch2 = (int)(unsigned char)*expr;
-
- // skip the second character
- ++expr;
- --exprlen;
-
- // if the range is reversed, swap it
- if (ch > ch2)
- SWAP(ch, ch2);
-
- // fill in the range
- for ( ; ch <= ch2 ; ++ch)
- re_set_bit(set, ch);
- }
- }
- }
-
- // create a character range machine
- build_char_range(&new_machine, set, is_exclusive);
-
- // apply any postfix operator
- goto apply_postfix;
- break;
- }
-
- default:
- normal_char:
- /*
- * it's an ordinary character - build a single character
- * recognizer machine, and then concatenate it onto any
- * existing machine
- */
- build_char(&new_machine, *expr);
-
- apply_postfix:
- /*
- * Check for a postfix operator, and apply it to the machine
- * in 'new_machine' if present. In any case, concatenate
- * the 'new_machine' (modified by a postix operator or not)
- * to the current machien.
- */
- if (exprlen > 1) {
- switch(*(expr+1)) {
- case '*':
- case '+':
- case '?':
- /*
- * We have a postfix closure operator. Build a new
- * closure machine out of 'new_machine'.
- */
- {
- re_machine closure_machine;
-
- // move onto the closure operator
- ++expr;
- --exprlen;
-
- // build the closure machine
- build_closure(&closure_machine, &new_machine, *expr);
-
- // replace the original machine with the closure
- new_machine = closure_machine;
-
- /*
- * skip any redundant closure symbols, keeping
- * only the first one we saw
- */
- while (exprlen > 1 && (*(expr+1) == '?'
- || *(expr+1) == '+'
- || *(expr+1) == '*')) {
- ++expr;
- --exprlen;
- }
- }
- break;
-
- default:
- /* no postfix operator */
- break;
- }
- }
-
- /*
- * Concatenate the new machine onto the current machine
- * under construction.
- */
- concat_onto(&cur_machine, &new_machine);
- break;
- }
- }
-
- // complete any pending alternation
- alternate_onto(&alter_machine, &cur_machine);
-
- // store the resulting machine in the caller's machine descriptor
- *result_machine = alter_machine;
-
- // no errors encountered
- return RE_STATUS_SUCCESS;
+ ((((unsigned char *)(set))[(bit) >> 3] & (1 << ((bit) & 7))) != 0)
+
+/* ------------------------------------------------------------------------ */
+/*
+ * Compile an expression
+ */
+static re_status_t re_compile(re_context *ctx,
+ const char *expr, size_t exprlen,
+ re_machine *result_machine)
+{
+ re_machine cur_machine;
+ re_machine alter_machine;
+ re_machine new_machine;
+ size_t group_stack_level;
+ struct
+ {
+ re_machine old_cur;
+ re_machine old_alter;
+ int group_id;
+ } group_stack[50];
+
+ /* reset everything */
+ re_reset(ctx);
+
+ /* start out with no current machine and no alternate machine */
+ re_build_null_machine(ctx, &cur_machine);
+ re_build_null_machine(ctx, &alter_machine);
+
+ /* nothing on the stack yet */
+ group_stack_level = 0;
+
+ /* loop until we run out of expression to parse */
+ for ( ; exprlen != 0 ; ++expr, --exprlen)
+ {
+ switch(*expr)
+ {
+ case '^':
+ /*
+ * beginning of line - if we're not at the beginning of the
+ * current expression (i.e., we already have some
+ * concatentations accumulated), treat it as an ordinary
+ * character
+ */
+ if (!re_is_machine_null(ctx, &cur_machine))
+ goto normal_char;
+
+ /* build a new start-of-text recognizer */
+ re_build_char(ctx, &new_machine, RE_TEXT_BEGIN);
+
+ /*
+ * concatenate it onto the string - note that this can't
+ * have any postfix operators
+ */
+ re_concat_onto(ctx, &cur_machine, &new_machine);
+ break;
+
+ case '$':
+ /*
+ * End of line specifier - if there's anything left after
+ * the '$' other than a close parens or alternation
+ * specifier, great it as a normal character
+ */
+ if (exprlen > 1
+ && (*(expr+1) != ')' && *(expr+1) != '|'))
+ goto normal_char;
+
+ /* build a new end-of-text recognizer */
+ re_build_char(ctx, &new_machine, RE_TEXT_END);
+
+ /*
+ * concatenate it onto the string - note that this can't
+ * have any postfix operators
+ */
+ re_concat_onto(ctx, &cur_machine, &new_machine);
+ break;
+
+ case '(':
+ /*
+ * Add a nesting level. Push the current machine and
+ * alternate machines onto the group stack, and clear
+ * everything out for the new group.
+ */
+ if (group_stack_level
+ > sizeof(group_stack)/sizeof(group_stack[0]))
+ {
+ /* we cannot proceed - return an error */
+ return RE_STATUS_GROUP_NESTING_TOO_DEEP;
+ }
+
+ /* save the current state on the stack */
+ group_stack[group_stack_level].old_cur = cur_machine;
+ group_stack[group_stack_level].old_alter = alter_machine;
+
+ /*
+ * Assign the group a group ID - groups are numbered in
+ * order of their opening (left) parentheses, so we want to
+ * assign a group number now. We won't actually need to
+ * know the group number until we get to the matching close
+ * paren, but we need to assign it now, so store it in the
+ * group stack.
+ */
+ group_stack[group_stack_level].group_id = ctx->cur_group;
+
+ /* consume the group number */
+ ctx->cur_group++;
+
+ /* push the level */
+ ++group_stack_level;
+
+ /* start the new group with empty machines */
+ re_build_null_machine(ctx, &cur_machine);
+ re_build_null_machine(ctx, &alter_machine);
+ break;
+
+ case ')':
+ /* if there's nothing on the stack, ignore this */
+ if (group_stack_level == 0)
+ break;
+
+ /* take a level off the stack */
+ --group_stack_level;
+
+ /*
+ * Remove a nesting level. If we have a pending alternate
+ * expression, build the alternation expression. This will
+ * leave the entire group expression in alter_machine,
+ * regardless of whether an alternation was in progress or
+ * not.
+ */
+ re_alternate_onto(ctx, &alter_machine, &cur_machine);
+
+ /*
+ * Create a group machine that encloses the group and marks
+ * it with a group number. We assigned the group number
+ * when we parsed the open paren, so read that group number
+ * from the stack.
+ *
+ * Note that this will leave 'new_machine' with the entire
+ * group machine.
+ */
+ re_build_group(ctx, &new_machine, &alter_machine,
+ group_stack[group_stack_level].group_id);
+
+ /*
+ * Pop the stack - restore the alternation and current
+ * machines that were in progress before the group started.
+ */
+ cur_machine = group_stack[group_stack_level].old_cur;
+ alter_machine = group_stack[group_stack_level].old_alter;
+
+ /*
+ * Check the group expression (in new_machine) for postfix
+ * expressions
+ */
+ goto apply_postfix;
+
+ case '|':
+ /*
+ * Start a new alternation. This ends the current
+ * alternation; if we have a previous pending alternate,
+ * build an alternation machine out of the previous
+ * alternate and the current machine and move that to the
+ * alternate; otherwise, simply move the current machine to
+ * the pending alternate.
+ */
+ re_alternate_onto(ctx, &alter_machine, &cur_machine);
+
+ /*
+ * the alternation starts out with a blank slate, so null
+ * out the current machine
+ */
+ re_build_null_machine(ctx, &cur_machine);
+ break;
+
+ case '%':
+ /*
+ * quoted character - skip the quote mark and see what we
+ * have
+ */
+ ++expr;
+ --exprlen;
+
+ /* check to see if we're at the end of the expression */
+ if (exprlen == 0)
+ {
+ /*
+ * end of the string - ignore it, but undo the extra
+ * increment of the expression index so that we exit the
+ * enclosing loop properly
+ */
+ --expr;
+ ++exprlen;
+ break;
+ }
+
+ /* see what we have */
+ switch(*expr)
+ {
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ /* group match - build a new literal group recognizer */
+ re_build_group_matcher(ctx, &new_machine, (int)(*expr - '1'));
+
+ /* apply any postfix expression to the group recognizer */
+ goto apply_postfix;
+
+ case '<':
+ /* build a beginning-of-word recognizer */
+ re_build_char(ctx, &new_machine, RE_WORD_BEGIN);
+
+ /* it can't be postfixed - just concatenate it */
+ re_concat_onto(ctx, &cur_machine, &new_machine);
+ break;
+
+ case '>':
+ /* build an end-of-word recognizer */
+ re_build_char(ctx, &new_machine, RE_WORD_END);
+
+ /* it can't be postfixed - just concatenate it */
+ re_concat_onto(ctx, &cur_machine, &new_machine);
+ break;
+
+ case 'w':
+ /* word character */
+ re_build_char(ctx, &new_machine, RE_WORD_CHAR);
+ goto apply_postfix;
+
+ case 'W':
+ /* non-word character */
+ re_build_char(ctx, &new_machine, RE_NON_WORD_CHAR);
+ goto apply_postfix;
+
+ case 'b':
+ /* word boundary */
+ re_build_char(ctx, &new_machine, RE_WORD_BOUNDARY);
+
+ /* it can't be postfixed */
+ re_concat_onto(ctx, &cur_machine, &new_machine);
+ break;
+
+ case 'B':
+ /* not a word boundary */
+ re_build_char(ctx, &new_machine, RE_NON_WORD_BOUNDARY);
+
+ /* it can't be postfixed */
+ re_concat_onto(ctx, &cur_machine, &new_machine);
+ break;
+
+ default:
+ /* build a new literal character recognizer */
+ re_build_char(ctx, &new_machine, *expr);
+
+ /* apply any postfix expression to the character */
+ goto apply_postfix;
+ }
+ break;
+
+ case '.':
+ /*
+ * wildcard character - build a single character recognizer
+ * for the special wildcard symbol, then go check it for a
+ * postfix operator
+ */
+ re_build_char(ctx, &new_machine, RE_WILDCARD);
+ goto apply_postfix;
+ break;
+
+ case '[':
+ /* range expression */
+ {
+ int is_exclusive = FALSE;
+ unsigned char set[32];
+
+ /* clear out the set of characters in the range */
+ memset(set, 0, sizeof(set));
+
+ /* first, skip the open bracket */
+ ++expr;
+ --exprlen;
+
+ /* check to see if starts with the exclusion character */
+ if (exprlen != 0 && *expr == '^')
+ {
+ /* skip the exclusion specifier */
+ ++expr;
+ --exprlen;
+
+ /* note it */
+ is_exclusive = TRUE;
+ }
+
+ /*
+ * if the first character is a ']', include it in the
+ * range
+ */
+ if (exprlen != 0 && *expr == ']')
+ {
+ re_set_bit(set, (int)']');
+ ++expr;
+ --exprlen;
+ }
+
+ /*
+ * if the next character is a '-', include it in the
+ * range
+ */
+ if (exprlen != 0 && *expr == '-')
+ {
+ re_set_bit(set, (int)'-');
+ ++expr;
+ --exprlen;
+ }
+
+ /* scan the character set */
+ while (exprlen != 0 && *expr != ']')
+ {
+ int ch;
+
+ /* note this character */
+ ch = (int)(unsigned char)*expr;
+
+ /* set it */
+ re_set_bit(set, ch);
+
+ /* skip this character of the expression */
+ ++expr;
+ --exprlen;
+
+ /* check for a range */
+ if (exprlen != 0 && *expr == '-')
+ {
+ int ch2;
+
+ /* skip the '-' */
+ ++expr;
+ --exprlen;
+ if (exprlen != 0)
+ {
+ /* get the other end of the range */
+ ch2 = (int)(unsigned char)*expr;
+
+ /* skip the second character */
+ ++expr;
+ --exprlen;
+
+ /* if the range is reversed, swap it */
+ if (ch > ch2)
+ {
+ int tmp = ch;
+ ch = ch2;
+ ch2 = tmp;
+ }
+
+ /* fill in the range */
+ for ( ; ch <= ch2 ; ++ch)
+ re_set_bit(set, ch);
+ }
+ }
+ }
+
+ /* create a character range machine */
+ re_build_char_range(ctx, &new_machine, set, is_exclusive);
+
+ /* apply any postfix operator */
+ goto apply_postfix;
+ }
+ break;
+
+ default:
+ normal_char:
+ /*
+ * it's an ordinary character - build a single character
+ * recognizer machine, and then concatenate it onto any
+ * existing machine
+ */
+ re_build_char(ctx, &new_machine, *expr);
+
+ apply_postfix:
+ /*
+ * Check for a postfix operator, and apply it to the machine
+ * in 'new_machine' if present. In any case, concatenate
+ * the 'new_machine' (modified by a postix operator or not)
+ * to the current machien.
+ */
+ if (exprlen > 1)
+ {
+ switch(*(expr+1))
+ {
+ case '*':
+ case '+':
+ case '?':
+ /*
+ * We have a postfix closure operator. Build a new
+ * closure machine out of 'new_machine'.
+ */
+ {
+ re_machine closure_machine;
+
+ /* move onto the closure operator */
+ ++expr;
+ --exprlen;
+
+ /* build the closure machine */
+ re_build_closure(ctx, &closure_machine,
+ &new_machine, *expr);
+
+ /* replace the original machine with the closure */
+ new_machine = closure_machine;
+
+ /*
+ * skip any redundant closure symbols, keeping
+ * only the first one we saw
+ */
+ while (exprlen > 1 && (*(expr+1) == '?'
+ || *(expr+1) == '+'
+ || *(expr+1) == '*'))
+ {
+ ++expr;
+ --exprlen;
+ }
+ }
+ break;
+
+ default:
+ /* no postfix operator */
+ break;
+ }
+ }
+
+ /*
+ * Concatenate the new machine onto the current machine
+ * under construction.
+ */
+ re_concat_onto(ctx, &cur_machine, &new_machine);
+ break;
+ }
+ }
+
+ /* complete any pending alternation */
+ re_alternate_onto(ctx, &alter_machine, &cur_machine);
+
+ /* store the resulting machine in the caller's machine descriptor */
+ *result_machine = alter_machine;
+
+ /* no errors encountered */
+ return RE_STATUS_SUCCESS;
}
-void re_context::note_group(re_group_register *regs, re_state_id id, const char *p) {
- int group_index;
-
- /*
- * Check to see if this is a valid state and it's a group marker -
- * if not, there's nothing to do
- */
- if (id == RE_STATE_INVALID
- || !(_tuple_arr[id].flags
- & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))
- || (group_index = (int)_tuple_arr[id].ch) >= RE_GROUP_REG_CNT)
- return;
-
- // It's a valid group marker - note the appropriate register value
- if ((_tuple_arr[id].flags & RE_STATE_GROUP_BEGIN) != 0)
- regs[group_index].start_ofs = p;
- else
- regs[group_index].end_ofs = p;
+
+/* ------------------------------------------------------------------------ */
+/*
+ * Pattern recognizer
+ */
+
+/*
+ * Note a group position if appropriate
+ */
+static void re_note_group(re_context *ctx, re_group_register *regs,
+ re_state_id id, const char *p)
+{
+ int group_index;
+
+ /*
+ * Check to see if this is a valid state and it's a group marker -
+ * if not, there's nothing to do
+ */
+ if (id == RE_STATE_INVALID
+ || !(ctx->tuple_arr[id].flags
+ & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))
+ || (group_index = (int)ctx->tuple_arr[id].ch) >= RE_GROUP_REG_CNT)
+ return;
+
+ /*
+ * It's a valid group marker - note the appropriate register value
+ */
+ if ((ctx->tuple_arr[id].flags & RE_STATE_GROUP_BEGIN) != 0)
+ regs[group_index].start_ofs = p;
+ else
+ regs[group_index].end_ofs = p;
}
-bool re_context::is_word_char(char c) const {
- return Common::isAlnum(c);
+/*
+ * Determine if a character is part of a word. We consider letters and
+ * numbers to be word characters.
+ */
+static int re_is_word_char(char c) {
+ return Common::isAlnum((unsigned char)c);
}
-int re_context::match(const char *entire_str, const char *str, size_t origlen,
- const re_machine *machine, re_group_register *regs) {
- re_state_id cur_state;
- const char *p;
- size_t curlen;
-
- // start at the machine's initial state
- cur_state = machine->init;
-
- // start at the beginning of the string
- p = str;
- curlen = origlen;
-
- // note any group involved in the initial state
- note_group(regs, cur_state, p);
-
- /*
- * if we're starting in the final state, immediately return success
- * with a zero-length match
- */
- if (cur_state == machine->final) {
- // return success with a zero-length match
- return 0;
- }
-
- // run the machine
- for (;;) {
- re_tuple *tuple;
-
- // get the tuple for this state
- tuple = &_tuple_arr[cur_state];
-
- // if this is a group state, adjust the group registers
- note_group(regs, cur_state, p);
-
- // see what kind of state we're in
- if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))
- && tuple->ch != RE_EPSILON) {
- /*
- * This is a character or group recognizer state. If we
- * match the character or group, continue on to the next
- * state; otherwise, return failure.
- */
- switch(tuple->ch) {
- case RE_GROUP_MATCH_0:
- case RE_GROUP_MATCH_0 + 1:
- case RE_GROUP_MATCH_0 + 2:
- case RE_GROUP_MATCH_0 + 3:
- case RE_GROUP_MATCH_0 + 4:
- case RE_GROUP_MATCH_0 + 5:
- case RE_GROUP_MATCH_0 + 6:
- case RE_GROUP_MATCH_0 + 7:
- case RE_GROUP_MATCH_0 + 8:
- case RE_GROUP_MATCH_0 + 9: {
- int group_num;
- re_group_register *group_reg;
- size_t reg_len;
-
- // it's a group - get the group number
- group_num = tuple->ch - RE_GROUP_MATCH_0;
- group_reg = &regs[group_num];
-
- /*
- * if this register isn't defined, there's nothing
- * to match, so fail
- */
- if (group_reg->start_ofs == 0 || group_reg->end_ofs == 0)
- return -1;
-
- // calculate the length of the register value
- reg_len = group_reg->end_ofs - group_reg->start_ofs;
-
- // if we don't have enough left to match, it fails
- if (curlen < reg_len)
- return -1;
-
- // if the string doesn't match exactly, we fail
- if (memcmp(p, group_reg->start_ofs, reg_len) != 0)
- return -1;
-
- /*
- * It matches exactly - skip the entire length of
- * the register in the source string
- */
- p += reg_len;
- curlen -= reg_len;
- break;
- }
-
- case RE_TEXT_BEGIN:
- /*
- * Match only the exact beginning of the string - if
- * we're anywhere else, this isn't a match. If this
- * succeeds, we don't skip any characters.
- */
- if (p != entire_str)
- return -1;
- break;
-
- case RE_TEXT_END:
- /*
- * Match only the exact end of the string - if we're
- * anywhere else, this isn't a match. Don't skip any
- * characters on success.
- */
- if (curlen != 0)
- return -1;
- break;
-
- case RE_WORD_BEGIN:
- /*
- * if the previous character is a word character, we're
- * not at the beginning of a word
- */
- if (p != entire_str && is_word_char(*(p - 1)))
- return -1;
-
- /*
- * if we're at the end of the string, or the current
- * character isn't the start of a word, we're not at the
- * beginning of a word
- */
- if (curlen == 0 || !is_word_char(*p))
- return -1;
- break;
-
- case RE_WORD_END:
- /*
- * if the current character is a word character, we're not
- * at the end of a word
- */
- if (curlen != 0 && is_word_char(*p))
- return -1;
-
- /*
- * if we're at the beginning of the string, or the
- * previous character is not a word character, we're not
- * at the end of a word
- */
- if (p == entire_str || !is_word_char(*(p - 1)))
- return -1;
- break;
-
- case RE_WORD_CHAR:
- /* if it's not a word character, it's a failure */
- if (curlen == 0 || !is_word_char(*p))
- return -1;
-
- /* skip this character of input */
- ++p;
- --curlen;
- break;
-
- case RE_NON_WORD_CHAR:
- /* if it's a word character, it's a failure */
- if (curlen == 0 || is_word_char(*p))
- return -1;
-
- /* skip the input */
- ++p;
- --curlen;
- break;
-
- case RE_WORD_BOUNDARY:
- case RE_NON_WORD_BOUNDARY:
- {
- int prev_is_word;
- int next_is_word;
- int boundary;
-
- /*
- * Determine if the previous character is a word
- * character -- if we're at the beginning of the
- * string, it's obviously not, otherwise check its
- * classification
- */
- prev_is_word = (p != entire_str
- && is_word_char(*(p - 1)));
-
- /* make the same check for the current character */
- next_is_word = (curlen != 0
- && is_word_char(*p));
-
- /*
- * Determine if this is a boundary - it is if the
- * two states are different
- */
- boundary = ((prev_is_word != 0) ^ (next_is_word != 0));
-
- /*
- * make sure it matches what was desired, and return
- * failure if not
- */
- if ((tuple->ch == RE_WORD_BOUNDARY && !boundary)
- || (tuple->ch == RE_NON_WORD_BOUNDARY && boundary))
- return -1;
- }
- break;
-
- case RE_WILDCARD:
- // make sure we have a character to match
- if (curlen == 0)
- return -1;
-
- // skip this character
- ++p;
- --curlen;
- break;
-
- case RE_RANGE:
- case RE_RANGE_EXCL: {
- int match_val;
-
- // make sure we have a character to match
- if (curlen == 0)
- return -1;
-
- // see if we match
- match_val = re_is_bit_set(tuple->char_range, (int)(unsigned char)*p);
-
- // make sure we got what we wanted
- if ((tuple->ch == RE_RANGE && !match_val)
- || (tuple->ch == RE_RANGE_EXCL && match_val))
- return -1;
-
- // skip this character of the input
- ++p;
- --curlen;
- break;
- }
-
- default:
- // make sure we have an exact match
- if (curlen == 0 || tuple->ch != *p)
- return -1;
-
- // skip this character of the input
- ++p;
- --curlen;
- break;
- }
-
- /*
- * if we got this far, we were successful - move on to the
- * next state
- */
- cur_state = tuple->next_state_1;
- } else if (tuple->next_state_2 == RE_STATE_INVALID) {
- /*
- * We have only one transition, so this state is entirely
- * deterministic. Simply move on to the next state.
- */
- cur_state = tuple->next_state_1;
- } else {
- re_machine sub_machine;
- re_group_register regs1[RE_GROUP_REG_CNT];
- re_group_register regs2[RE_GROUP_REG_CNT];
- int ret1;
- int ret2;
-
- /*
- * This state has two possible transitions, and we don't
- * know which one to take. So, try both, see which one
- * works better, and return the result. Try the first
- * transition first. Note that each separate attempt must
- * use a separate copy of the registers.
- */
- memcpy(regs1, regs, sizeof(regs1));
- sub_machine.init = tuple->next_state_1;
- sub_machine.final = machine->final;
- ret1 = match(entire_str, p, curlen, &sub_machine, regs1);
-
- /*
- * Now try the second transition
- */
- memcpy(regs2, regs, sizeof(regs2));
- sub_machine.init = tuple->next_state_2;
- sub_machine.final = machine->final;
- ret2 = match(entire_str, p, curlen, &sub_machine, regs2);
-
- /*
- * If they both failed, the whole thing failed. Otherwise,
- * return the longer of the two, plus the length we
- * ourselves matched previously. Note that we return the
- * register set from the winning match.
- */
- if (ret1 < 0 && ret2 < 0) {
- // they both failed
- return -1;
- } else if (ret1 > ret2) {
- // use the first register set and result length
- memcpy(regs, regs1, sizeof(regs1));
- return ret1 + (p - str);
- } else {
- // use the second register set and result length
- memcpy(regs, regs2, sizeof(regs2));
- return ret2 + (p - str);
- }
- }
-
- // If we're in the final state, return success
- if (cur_state == machine->final) {
- // finish off any group involved in the final state
- note_group(regs, cur_state, p);
-
- // return the length we matched
- return p - str;
- }
- }
+/*
+ * Match a string to a compiled expression. Returns the length of the
+ * match if successful, or -1 if no match was found.
+ */
+static int re_match(re_context *ctx, const char *entire_str,
+ const char *str, size_t origlen,
+ const re_machine *machine, re_group_register *regs)
+{
+ re_state_id cur_state;
+ const char *p;
+ size_t curlen;
+
+ /* start at the machine's initial state */
+ cur_state = machine->init;
+
+ /* start at the beginning of the string */
+ p = str;
+ curlen = origlen;
+
+ /* note any group involved in the initial state */
+ re_note_group(ctx, regs, cur_state, p);
+
+ /*
+ * if we're starting in the final state, immediately return success
+ * with a zero-length match
+ */
+ if (cur_state == machine->final)
+ {
+ /* return success with a zero-length match */
+ return 0;
+ }
+
+ /* run the machine */
+ for (;;)
+ {
+ re_tuple *tuple;
+
+ /* get the tuple for this state */
+ tuple = &ctx->tuple_arr[cur_state];
+
+ /* if this is a group state, adjust the group registers */
+ re_note_group(ctx, regs, cur_state, p);
+
+ /* see what kind of state we're in */
+ if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))
+ && tuple->ch != RE_EPSILON)
+ {
+ /*
+ * This is a character or group recognizer state. If we
+ * match the character or group, continue on to the next
+ * state; otherwise, return failure.
+ */
+ switch(tuple->ch)
+ {
+ case RE_GROUP_MATCH_0:
+ case RE_GROUP_MATCH_0 + 1:
+ case RE_GROUP_MATCH_0 + 2:
+ case RE_GROUP_MATCH_0 + 3:
+ case RE_GROUP_MATCH_0 + 4:
+ case RE_GROUP_MATCH_0 + 5:
+ case RE_GROUP_MATCH_0 + 6:
+ case RE_GROUP_MATCH_0 + 7:
+ case RE_GROUP_MATCH_0 + 8:
+ case RE_GROUP_MATCH_0 + 9:
+ {
+ int group_num;
+ re_group_register *group_reg;
+ size_t reg_len;
+
+ /* it's a group - get the group number */
+ group_num = tuple->ch - RE_GROUP_MATCH_0;
+ group_reg = &regs[group_num];
+
+ /*
+ * if this register isn't defined, there's nothing
+ * to match, so fail
+ */
+ if (group_reg->start_ofs == 0 || group_reg->end_ofs == 0)
+ return -1;
+
+ /* calculate the length of the register value */
+ reg_len = group_reg->end_ofs - group_reg->start_ofs;
+
+ /* if we don't have enough left to match, it fails */
+ if (curlen < reg_len)
+ return -1;
+
+ /* if the string doesn't match exactly, we fail */
+ if (memcmp(p, group_reg->start_ofs, reg_len) != 0)
+ return -1;
+
+ /*
+ * It matches exactly - skip the entire length of
+ * the register in the source string
+ */
+ p += reg_len;
+ curlen -= reg_len;
+ }
+ break;
+
+ case RE_TEXT_BEGIN:
+ /*
+ * Match only the exact beginning of the string - if
+ * we're anywhere else, this isn't a match. If this
+ * succeeds, we don't skip any characters.
+ */
+ if (p != entire_str)
+ return -1;
+ break;
+
+ case RE_TEXT_END:
+ /*
+ * Match only the exact end of the string - if we're
+ * anywhere else, this isn't a match. Don't skip any
+ * characters on success.
+ */
+ if (curlen != 0)
+ return -1;
+ break;
+
+ case RE_WORD_BEGIN:
+ /*
+ * if the previous character is a word character, we're
+ * not at the beginning of a word
+ */
+ if (p != entire_str && re_is_word_char(*(p-1)))
+ return -1;
+
+ /*
+ * if we're at the end of the string, or the current
+ * character isn't the start of a word, we're not at the
+ * beginning of a word
+ */
+ if (curlen == 0 || !re_is_word_char(*p))
+ return -1;
+ break;
+
+ case RE_WORD_END:
+ /*
+ * if the current character is a word character, we're not
+ * at the end of a word
+ */
+ if (curlen != 0 && re_is_word_char(*p))
+ return -1;
+
+ /*
+ * if we're at the beginning of the string, or the
+ * previous character is not a word character, we're not
+ * at the end of a word
+ */
+ if (p == entire_str || !re_is_word_char(*(p-1)))
+ return -1;
+ break;
+
+ case RE_WORD_CHAR:
+ /* if it's not a word character, it's a failure */
+ if (curlen == 0 || !re_is_word_char(*p))
+ return -1;
+
+ /* skip this character of input */
+ ++p;
+ --curlen;
+ break;
+
+ case RE_NON_WORD_CHAR:
+ /* if it's a word character, it's a failure */
+ if (curlen == 0 || re_is_word_char(*p))
+ return -1;
+
+ /* skip the input */
+ ++p;
+ --curlen;
+ break;
+
+ case RE_WORD_BOUNDARY:
+ case RE_NON_WORD_BOUNDARY:
+ {
+ int prev_is_word;
+ int next_is_word;
+ int boundary;
+
+ /*
+ * Determine if the previous character is a word
+ * character -- if we're at the beginning of the
+ * string, it's obviously not, otherwise check its
+ * classification
+ */
+ prev_is_word = (p != entire_str
+ && re_is_word_char(*(p-1)));
+
+ /* make the same check for the current character */
+ next_is_word = (curlen != 0
+ && re_is_word_char(*p));
+
+ /*
+ * Determine if this is a boundary - it is if the
+ * two states are different
+ */
+ boundary = ((prev_is_word != 0) ^ (next_is_word != 0));
+
+ /*
+ * make sure it matches what was desired, and return
+ * failure if not
+ */
+ if ((tuple->ch == RE_WORD_BOUNDARY && !boundary)
+ || (tuple->ch == RE_NON_WORD_BOUNDARY && boundary))
+ return -1;
+ }
+ break;
+
+ case RE_WILDCARD:
+ /* make sure we have a character to match */
+ if (curlen == 0)
+ return -1;
+
+ /* skip this character */
+ ++p;
+ --curlen;
+ break;
+
+ case RE_RANGE:
+ case RE_RANGE_EXCL:
+ {
+ int match;
+
+ /* make sure we have a character to match */
+ if (curlen == 0)
+ return -1;
+
+ /* see if we match */
+ match = re_is_bit_set(tuple->char_range,
+ (int)(unsigned char)*p);
+
+ /* make sure we got what we wanted */
+ if ((tuple->ch == RE_RANGE && !match)
+ || (tuple->ch == RE_RANGE_EXCL && match))
+ return -1;
+
+ /* skip this character of the input */
+ ++p;
+ --curlen;
+ }
+ break;
+
+ default:
+ /* make sure we have an exact match */
+ if (curlen == 0 || tuple->ch != *p)
+ return -1;
+
+ /* skip this character of the input */
+ ++p;
+ --curlen;
+ break;
+ }
+
+ /*
+ * if we got this far, we were successful - move on to the
+ * next state
+ */
+ cur_state = tuple->next_state_1;
+ }
+ else if (tuple->next_state_2 == RE_STATE_INVALID)
+ {
+ /*
+ * We have only one transition, so this state is entirely
+ * deterministic. Simply move on to the next state.
+ */
+ cur_state = tuple->next_state_1;
+ }
+ else
+ {
+ re_machine sub_machine;
+ re_group_register regs1[RE_GROUP_REG_CNT];
+ re_group_register regs2[RE_GROUP_REG_CNT];
+ int ret1;
+ int ret2;
+
+ /*
+ * This state has two possible transitions, and we don't
+ * know which one to take. So, try both, see which one
+ * works better, and return the result. Try the first
+ * transition first. Note that each separate attempt must
+ * use a separate copy of the registers.
+ */
+ memcpy(regs1, regs, sizeof(regs1));
+ sub_machine.init = tuple->next_state_1;
+ sub_machine.final = machine->final;
+ ret1 = re_match(ctx, entire_str, p, curlen, &sub_machine, regs1);
+
+ /*
+ * Now try the second transition
+ */
+ memcpy(regs2, regs, sizeof(regs2));
+ sub_machine.init = tuple->next_state_2;
+ sub_machine.final = machine->final;
+ ret2 = re_match(ctx, entire_str, p, curlen, &sub_machine, regs2);
+
+ /*
+ * If they both failed, the whole thing failed. Otherwise,
+ * return the longer of the two, plus the length we
+ * ourselves matched previously. Note that we return the
+ * register set from the winning match.
+ */
+ if (ret1 < 0 && ret2 < 0)
+ {
+ /* they both failed */
+ return -1;
+ }
+ else if (ret1 > ret2)
+ {
+ /* use the first register set and result length */
+ memcpy(regs, regs1, sizeof(regs1));
+ return ret1 + (p - str);
+ }
+ else
+ {
+ /* use the second register set and result length */
+ memcpy(regs, regs2, sizeof(regs2));
+ return ret2 + (p - str);
+ }
+ }
+
+ /*
+ * If we're in the final state, return success
+ */
+ if (cur_state == machine->final)
+ {
+ /* finish off any group involved in the final state */
+ re_note_group(ctx, regs, cur_state, p);
+
+ /* return the length we matched */
+ return p - str;
+ }
+ }
}
-int re_context::search(const char *str, size_t len, const re_machine *machine,
- re_group_register *regs, int *result_len) {
- int ofs;
-
- /*
- * Starting at the first character in the string, search for the
- * pattern at each subsequent character until we either find the
- * pattern or run out of string to test.
- */
- for (ofs = 0 ; ofs < (int)len ; ++ofs) {
- int matchlen;
-
- // check for a match
- matchlen = match(str, str + ofs, len - ofs, machine, regs);
- if (matchlen >= 0) {
- // we found a match here - return the length and offset
- *result_len = matchlen;
- return ofs;
- }
- }
-
- // we didn't find a match
- return -1;
+/* ------------------------------------------------------------------------ */
+/*
+ * Search for a regular expression within a string. Returns -1 if the
+ * string cannot be found, otherwise returns the offset from the start
+ * of the string to be searched of the start of the first match for the
+ * pattern.
+ */
+static int re_search(re_context *ctx, const char *str, size_t len,
+ const re_machine *machine, re_group_register *regs,
+ int *result_len)
+{
+ int ofs;
+
+ /*
+ * Starting at the first character in the string, search for the
+ * pattern at each subsequent character until we either find the
+ * pattern or run out of string to test.
+ */
+ for (ofs = 0 ; ofs < (int)len ; ++ofs)
+ {
+ int matchlen;
+
+ /* check for a match */
+ matchlen = re_match(ctx, str, str + ofs, len - ofs,
+ machine, regs);
+ if (matchlen >= 0)
+ {
+ /* we found a match here - return the length and offset */
+ *result_len = matchlen;
+ return ofs;
+ }
+ }
+
+ /* we didn't find a match */
+ return -1;
}
-void re_context::save_search_str(const char *str, size_t len) {
- // if the string is empty, this is easy
- if (len == 0) {
- // nothing to store - just save the length and return
- _curlen = 0;
- return;
- }
-
- // if the current buffer isn't big enough, allocate a new one
- if (_strbuf == 0 || _strbufsiz < len) {
- /*
- * free any previous buffer - its contents are no longer
- * important, since we're about to overwrite it with a new
- * string
- */
- if (_strbuf != 0)
- mchfre(_strbuf);
-
- /*
- * allocate a new buffer; round up to the next 256-byte
- * increment to make sure we're not constantly reallocating to
- * random sizes
- */
- _strbufsiz = ((len + 255) & ~255);
-
- // allocate it
- _strbuf = (char *)mchalo(_errctx, _strbufsiz, "regex str");
- }
-
- // copy the string
- memcpy(_strbuf, str, len);
-
- // save the length
- _curlen = len;
+/* ------------------------------------------------------------------------ */
+/*
+ * Make a copy of a search string in our private buffer.
+ */
+static void re_save_search_str(re_context *ctx, const char *str, size_t len)
+{
+ /* if the string is empty, this is easy */
+ if (len == 0)
+ {
+ /* nothing to store - just save the length and return */
+ ctx->curlen = 0;
+ return;
+ }
+
+ /* if the current buffer isn't big enough, allocate a new one */
+ if (ctx->strbuf == 0 || ctx->strbufsiz < len)
+ {
+ /*
+ * free any previous buffer - its contents are no longer
+ * important, since we're about to overwrite it with a new
+ * string
+ */
+ if (ctx->strbuf != 0)
+ mchfre(ctx->strbuf);
+
+ /*
+ * allocate a new buffer; round up to the next 256-byte
+ * increment to make sure we're not constantly reallocating to
+ * random sizes
+ */
+ ctx->strbufsiz = ((len + 255) & ~255);
+
+ /* allocate it */
+ ctx->strbuf = (char *)mchalo(ctx->errctx, ctx->strbufsiz,
+ "regex str");
+ }
+
+ /* copy the string */
+ memcpy(ctx->strbuf, str, len);
+
+ /* save the length */
+ ctx->curlen = len;
}
-int re_context::compile_and_search(const char *pattern, size_t patlen,
- const char *searchstr, size_t searchlen, int *result_len) {
- re_machine machine;
-
- // compile the expression - return failure if we get an error
- if (compile(pattern, patlen, &machine) != RE_STATUS_SUCCESS)
- return -1;
-
- // save the search string in our internal buffer
- save_search_str(searchstr, searchlen);
-
- // clear the group registers
- memset(_regs, 0, sizeof(_regs));
-
- /*
- * search for the pattern in our copy of the string - use the copy
- * so that the group registers stay valid even if the caller
- * deallocates the original string after we return
- */
- return search(_strbuf, _curlen, &machine, _regs, result_len);
+/* ------------------------------------------------------------------------ */
+/*
+ * Compile an expression and search for a match within the given string.
+ * Returns the offset of the match, or -1 if no match was found.
+ */
+int re_compile_and_search(re_context *ctx,
+ const char *pattern, size_t patlen,
+ const char *searchstr, size_t searchlen,
+ int *result_len)
+{
+ re_machine machine;
+
+ /* compile the expression - return failure if we get an error */
+ if (re_compile(ctx, pattern, patlen, &machine) != RE_STATUS_SUCCESS)
+ return -1;
+
+ /* save the search string in our internal buffer */
+ re_save_search_str(ctx, searchstr, searchlen);
+
+ /* clear the group registers */
+ memset(ctx->regs, 0, sizeof(ctx->regs));
+
+ /*
+ * search for the pattern in our copy of the string - use the copy
+ * so that the group registers stay valid even if the caller
+ * deallocates the original string after we return
+ */
+ return re_search(ctx, ctx->strbuf, ctx->curlen, &machine,
+ ctx->regs, result_len);
}
-int re_context::compile_and_match(const char *pattern, size_t patlen,
- const char *searchstr, size_t searchlen) {
- re_machine machine;
+/* ------------------------------------------------------------------------ */
+/*
+ * Compile an expression and check for a match. Returns the length of
+ * the match if we found a match, -1 if we found no match. This is not
+ * a search function; we merely match the leading substring of the given
+ * string to the given pattern.
+ */
+int re_compile_and_match(re_context *ctx,
+ const char *pattern, size_t patlen,
+ const char *searchstr, size_t searchlen)
+{
+ re_machine machine;
- // compile the expression - return failure if we get an error
- if (compile(pattern, patlen, &machine) != RE_STATUS_SUCCESS)
- return 0;
+ /* compile the expression - return failure if we get an error */
+ if (re_compile(ctx, pattern, patlen, &machine) != RE_STATUS_SUCCESS)
+ return FALSE;
- // save the search string in our internal buffer
- save_search_str(searchstr, searchlen);
+ /* save the search string in our internal buffer */
+ re_save_search_str(ctx, searchstr, searchlen);
- // clear the group registers
- memset(_regs, 0, sizeof(_regs));
+ /* clear the group registers */
+ memset(ctx->regs, 0, sizeof(ctx->regs));
- // match the string
- return match(_strbuf, _strbuf, _curlen, &machine, _regs);
+ /* match the string */
+ return re_match(ctx, ctx->strbuf, ctx->strbuf, ctx->curlen,
+ &machine, ctx->regs);
}
+
} // End of namespace TADS2
} // End of namespace TADS
} // End of namespace Glk