diff options
Diffstat (limited to 'engines/glk/tads/tads2/regex.h')
-rw-r--r-- | engines/glk/tads/tads2/regex.h | 378 |
1 files changed, 127 insertions, 251 deletions
diff --git a/engines/glk/tads/tads2/regex.h b/engines/glk/tads/tads2/regex.h index cd975e066b..48c2dd0ad4 100644 --- a/engines/glk/tads/tads2/regex.h +++ b/engines/glk/tads/tads2/regex.h @@ -24,289 +24,165 @@ #define GLK_TADS_TADS2_REGEX #include "common/array.h" -#include "glk/tads/tads2/ler.h" +#include "glk/tads/tads2/error_handling.h" namespace Glk { namespace TADS { namespace TADS2 { -/** - * state ID - */ + +/* state ID */ typedef int re_state_id; -/** - * invalid state ID - used to mark null machines - */ +/* invalid state ID - used to mark null machines */ #define RE_STATE_INVALID ((re_state_id)-1) -/** - * first valid state ID - */ +/* first valid state ID */ #define RE_STATE_FIRST_VALID ((re_state_id)0) -/** +/* ------------------------------------------------------------------------ */ +/* * Group register structure. Each register keeps track of the starting - * and ending offset of the group's text. + * and ending offset of the group's text. */ -struct re_group_register { - const char *start_ofs; - const char *end_ofs; -}; +typedef struct +{ + const char *start_ofs; + const char *end_ofs; +} re_group_register; -/** - * number of group registers we keep - */ +/* number of group registers we keep */ #define RE_GROUP_REG_CNT 10 -/** - * Denormalized state transition tuple. Each tuple represents the - * complete set of transitions out of a particular state. A particular - * state can have one character transition, or two epsilon transitions. - * Note that we don't need to store the state ID in the tuple, because - * the state ID is the index of the tuple in an array of state tuples. + +/* ------------------------------------------------------------------------ */ +/* + * Denormalized state transition tuple. Each tuple represents the + * complete set of transitions out of a particular state. A particular + * state can have one character transition, or two epsilon transitions. + * Note that we don't need to store the state ID in the tuple, because + * the state ID is the index of the tuple in an array of state tuples. */ -struct re_tuple { - // the character we must match to transition to the target state - char ch; +typedef struct +{ + /* the character we must match to transition to the target state */ + char ch; - // the target states - re_state_id next_state_1; - re_state_id next_state_2; + /* the target states */ + re_state_id next_state_1; + re_state_id next_state_2; - // character range match table, if used - unsigned char *char_range; + /* character range match table, if used */ + unsigned char *char_range; - // flags - byte flags; -}; + /* flags */ + unsigned char flags; +} re_tuple; -/** - * Tuple flags +/* + * Tuple flags */ -enum { - // this state is the start of a group - the 'ch' value is the group ID - RE_STATE_GROUP_BEGIN = 0x02, - // this state is the end of a group - 'ch' is the group ID */ - RE_STATE_GROUP_END = 0x04 -}; +/* this state is the start of a group - the 'ch' value is the group ID */ +#define RE_STATE_GROUP_BEGIN 0x02 + +/* this state is the end of a group - 'ch' is the group ID */ +#define RE_STATE_GROUP_END 0x04 + + +/* ------------------------------------------------------------------------ */ +/* + * Regular expression compilation context structure. This tracks the + * state of the compilation and stores the resources associated with the + * compiled expression. + */ +typedef struct +{ + /* error context */ + errcxdef *errctx; + + /* next available state ID */ + re_state_id next_state; + + /* + * The array of transition tuples. We'll allocate this array and + * expand it as necessary. + */ + re_tuple *tuple_arr; -/** - * Status codes + /* number of transition tuples allocated in the array */ + int tuples_alloc; + + /* current group ID */ + int cur_group; + + /* group registers */ + re_group_register regs[RE_GROUP_REG_CNT]; + + /* + * Buffer for retaining a copy of the last string we scanned. We + * retain our own copy of each string, and point the group registers + * into this copy rather than the caller's original string -- this + * ensures that the group registers remain valid even after the + * caller has deallocated the original string. + */ + char *strbuf; + + /* length of the string currently in the buffer */ + size_t curlen; + + /* size of the buffer allocated to strbuf */ + size_t strbufsiz; +} re_context; + + +/* ------------------------------------------------------------------------ */ +/* + * Status codes */ -typedef enum { - // success - RE_STATUS_SUCCESS = 0, +typedef enum +{ + /* success */ + RE_STATUS_SUCCESS = 0, - // compilation error - group nesting too deep - RE_STATUS_GROUP_NESTING_TOO_DEEP + /* compilation error - group nesting too deep */ + RE_STATUS_GROUP_NESTING_TOO_DEEP } re_status_t; -/** - * Regular expression compilation. This tracks the state of the compilation and - * stores the resources associated with the compiled expression. +/* ------------------------------------------------------------------------ */ +/* + * Initialize the context. The memory for the context structure itself + * must be allocated and maintained by the caller. + */ +void re_init(re_context *ctx, errcxdef *errctx); + +/* + * Delete the context - frees structures associated with the context. + * Does NOT free the memory used by the context structure itself. + */ +void re_delete(re_context *ctx); + +/* + * Compile an expression and search for a match within the given string. + * Returns the offset of the match, or -1 if no match was found. + */ +int re_compile_and_search(re_context *ctx, + const char *pattern, size_t patlen, + const char *searchstr, size_t searchlen, + int *result_len); + +/* + * Compile an expression and check for a match. Returns the length of + * the match if we found a match, -1 if we found no match. This is not + * a search function; we merely match the leading substring of the given + * string to the given pattern. */ -class re_context { - /** - * A machine description. Machines are fully described by their initial - * and final state ID's. - */ - struct re_machine { - re_state_id init; ///< the machine's initial state - re_state_id final; ///< the machine's final state - - re_machine() : init(0), final(0) {} - - /** - * Build a null machine - */ - void build_null_machine() { - init = final = RE_STATE_INVALID; - } - - /** - * Determine if a machine is null - */ - bool isNull() const { - return (init == RE_STATE_INVALID); - } - }; -private: - /** - * Reset compiler - clears states and tuples - */ - void reset(); - - /** - * Set a transition from a state to a given destination state - */ - void set_trans(re_state_id id, re_state_id dest_id, char ch); - - /** - * Initialize a new machine, giving it an initial and final state - */ - void init_machine(re_machine *machine); - - /** - * Build a character recognizer - */ - void build_char(re_machine *machine, char ch); - - /** - * Build a character range recognizer. 'range' is a 256-bit (32-byte) bit vector. - */ - void build_char_range(re_machine *machine, unsigned char *range, int exclusion); - - /** - * Build a group recognizer. This is almost the same as a character - * recognizer, but matches a previous group rather than a literal character. - */ - void build_group_matcher(re_machine *machine, int group_num); - - /** - * Build a concatenation recognizer - */ - void build_concat(re_machine *new_machine, re_machine *lhs, re_machine *rhs); - - /** - * Build a group machine. sub_machine contains the machine that - * expresses the group's contents; we'll fill in new_machine with a - * newly-created machine that encloses and marks the group. - */ - void build_group(re_machine *new_machine, re_machine *sub_machine, int group_id); - - /** - * Build an alternation recognizer - */ - void build_alter(re_machine *new_machine, re_machine *lhs, re_machine *rhs); - - /** - * Build a closure recognizer - */ - void build_closure(re_machine *new_machine, re_machine *sub, char specifier); - - /** - * Concatenate the second machine onto the first machine, replacing the - * first machine with the resulting machine. If the first machine is a - * null machine (created with re_build_null_machine), we'll simply copy - * the second machine into the first. - */ - void concat_onto(re_machine *dest, re_machine *rhs); - - /** - * Alternate the second machine onto the first machine, replacing the - * first machine with the resulting machine. If the first machine is a - * null machine, this simply replaces the first machine with the second - * machine. If the second machine is null, this simply leaves the first - * machine unchanged. - */ - void alternate_onto(re_machine *dest, re_machine *rhs); - - /** - * Compile an expression - */ - re_status_t compile(const char *expr, size_t exprlen, re_machine *result_machine); - - /** - * Note a group position if appropriate - */ - void note_group(re_group_register *regs, re_state_id id, const char *p); - - /** - * Determine if a character is part of a word. We consider letters and - * numbers to be word characters. - */ - bool is_word_char(char c) const; - - /** - * Match a string to a compiled expression. Returns the length of the - * match if successful, or -1 if no match was found. - */ - int match(const char *entire_str, const char *str, size_t origlen, - const re_machine *machine, re_group_register *regs); - - /** - * Search for a regular expression within a string. Returns -1 if the string - * cannot be found, otherwise returns the offset from the start of the string - * to be searched of the start of the first match for the pattern. - */ - int search(const char *str, size_t len, const re_machine *machine, - re_group_register *regs, int *result_len); - - /** - * Make a copy of a search string in our private buffer. - */ - void save_search_str(const char *str, size_t len); -public: - errcxdef *_errctx; ///< error context - re_state_id _next_state; ///< next available state ID - - /** - * The array of transition tuples. We'll allocate this array and - * expand it as necessary. - */ - Common::Array<re_tuple> _tuple_arr; - - // current group ID - int _cur_group; - - // group registers - re_group_register _regs[RE_GROUP_REG_CNT]; - - /** - * Buffer for retaining a copy of the last string we scanned. We - * retain our own copy of each string, and point the group registers - * into this copy rather than the caller's original string -- this - * ensures that the group registers remain valid even after the - * caller has deallocated the original string. - */ - char *_strbuf; - - /** - * length of the string currently in the buffer - */ - size_t _curlen; - - /** - * size of the buffer allocated to strbuf - */ - size_t _strbufsiz; -public: - /** - * Constructor. The memory for the context structure itself - * must be allocated and maintained by the caller. - */ - re_context(errcxdef *errctx); - - /** - * Destructor - */ - ~re_context(); - - /** - * Allocate a new state ID - */ - re_state_id alloc_state(); - - /** - * Compile an expression and search for a match within the given string. - * Returns the offset of the match, or -1 if no match was found. - */ - int compile_and_search(const char *pattern, size_t patlen, - const char *searchstr, size_t searchlen, int *result_len); - - /** - * Compile an expression and check for a match. Returns the length of the match - * if we found a match, -1 if we found no match. This is not a search function; - * we merely match the leading substring of the given string to the given pattern. - */ - int compile_and_match(const char *pattern, size_t patlen, - const char *searchstr, size_t searchlen); -}; +int re_compile_and_match(re_context *ctx, + const char *pattern, size_t patlen, + const char *searchstr, size_t searchlen); } // End of namespace TADS2 } // End of namespace TADS |