aboutsummaryrefslogtreecommitdiff
path: root/engines/glk/tads/tads2/regex.h
blob: 4c48a8985d0dc4051e8513a4c63a3df0922f97ef (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
/* ScummVM - Graphic Adventure Engine
 *
 * ScummVM is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 */

#ifndef GLK_TADS_TADS2_REGEX
#define GLK_TADS_TADS2_REGEX

#include "common/array.h"
#include "engines/glk/tads/tads2/ler.h"

namespace Glk {
namespace TADS {
namespace TADS2 {

/**
 * state ID
 */
typedef int re_state_id;

/**
 * invalid state ID - used to mark null machines
 */
#define RE_STATE_INVALID   ((re_state_id)-1)

/**
 * first valid state ID
 */
#define RE_STATE_FIRST_VALID  ((re_state_id)0)


/**
 *   Group register structure.  Each register keeps track of the starting
 *   and ending offset of the group's text.
 */
struct re_group_register {
	const char *start_ofs;
	const char *end_ofs;
};

/**
 * number of group registers we keep
 */
#define RE_GROUP_REG_CNT  10

/**
 * Denormalized state transition tuple.  Each tuple represents the
 * complete set of transitions out of a particular state.  A particular
 * state can have one character transition, or two epsilon transitions.
 * Note that we don't need to store the state ID in the tuple, because
 * the state ID is the index of the tuple in an array of state tuples.
 */
struct re_tuple {
	// the character we must match to transition to the target state
	char ch;

	// the target states
	re_state_id next_state_1;
	re_state_id next_state_2;

	// character range match table, if used
	unsigned char *char_range;

	// flags
	byte flags;
};


/**
 * Tuple flags
 */
enum {
	// this state is the start of a group - the 'ch' value is the group ID
	RE_STATE_GROUP_BEGIN = 0x02,

	// this state is the end of a group - 'ch' is the group ID */
	RE_STATE_GROUP_END = 0x04
};

/**
 * Status codes
 */
typedef enum {
	// success
	RE_STATUS_SUCCESS = 0,

	// compilation error - group nesting too deep
	RE_STATUS_GROUP_NESTING_TOO_DEEP
} re_status_t;


/**
 * Regular expression compilation.  This tracks the state of the compilation and
 * stores the resources associated with the compiled expression.
 */
class re_context {
	/**
	 * A machine description.  Machines are fully described by their initial
	 * and final state ID's.
	 */
	struct re_machine {
		re_state_id init;		///< the machine's initial state
		re_state_id final;		///< the machine's final state

		re_machine() : init(0), final(0) {}

		/**
		 * Build a null machine
		 */
		void build_null_machine() {
			init = final = RE_STATE_INVALID;
		}

		/**
		 * Determine if a machine is null
		 */
		bool isNull() const {
			return (init == RE_STATE_INVALID);
		}
	};
private:
	/**
	 * Reset compiler - clears states and tuples
	 */
	void reset();

	/**
	 * Set a transition from a state to a given destination state
	 */
	void set_trans(re_state_id id, re_state_id dest_id, char ch);

	/**
	 * Initialize a new machine, giving it an initial and final state
	 */
	void init_machine(re_machine *machine);

	/**
	 * Build a character recognizer
	 */
	void build_char(re_machine *machine, char ch);

	/**
	 * Build a character range recognizer.  'range' is a 256-bit (32-byte) bit vector.
	 */
	void build_char_range(re_machine *machine, unsigned char *range, int exclusion);

	/**
	 * Build a group recognizer.  This is almost the same as a character
	 * recognizer, but matches a previous group rather than a literal character.
	 */
	void build_group_matcher(re_machine *machine, int group_num);

	/**
	 *   Build a concatenation recognizer
	 */
	void build_concat(re_machine *new_machine, re_machine *lhs, re_machine *rhs);

	/**
	 * Build a group machine.  sub_machine contains the machine that
	 * expresses the group's contents; we'll fill in new_machine with a
	 * newly-created machine that encloses and marks the group.
	 */
	void build_group(re_machine *new_machine, re_machine *sub_machine, int group_id);

	/**
	 * Build an alternation recognizer
	 */
	void build_alter(re_machine *new_machine, re_machine *lhs, re_machine *rhs);

	/**
	 * Build a closure recognizer
	 */
	void build_closure(re_machine *new_machine, re_machine *sub, char specifier);

	/**
	 * Concatenate the second machine onto the first machine, replacing the
	 * first machine with the resulting machine.  If the first machine is a
	 * null machine (created with re_build_null_machine), we'll simply copy
	 * the second machine into the first.
	 */
	void concat_onto(re_machine *dest, re_machine *rhs);

	/**
	 * Alternate the second machine onto the first machine, replacing the
	 * first machine with the resulting machine.  If the first machine is a
	 * null machine, this simply replaces the first machine with the second
	 * machine.  If the second machine is null, this simply leaves the first
	 * machine unchanged.
	 */
	void alternate_onto(re_machine *dest, re_machine *rhs);

	/**
	 * Compile an expression
	 */
	re_status_t compile(const char *expr, size_t exprlen, re_machine *result_machine);

	/**
	 * Note a group position if appropriate
	 */
	void note_group(re_group_register *regs, re_state_id id, const char *p);

	/**
	 * Determine if a character is part of a word.  We consider letters and
	 * numbers to be word characters.
	 */
	bool is_word_char(char c) const;

	/**
	 * Match a string to a compiled expression.  Returns the length of the
	 * match if successful, or -1 if no match was found.
	 */
	int match(const char *entire_str, const char *str, size_t origlen,
		const re_machine *machine, re_group_register *regs);

	/**
	 * Search for a regular expression within a string.  Returns -1 if the string
	 * cannot be found, otherwise returns the offset from the start of the string
	 * to be searched of the start of the first match for the pattern.
	 */
	int search(const char *str, size_t len, const re_machine *machine,
		re_group_register *regs, int *result_len);

	/**
	 * Make a copy of a search string in our private buffer.
	 */
	void save_search_str(const char *str, size_t len);
public:
	errcxdef *_errctx;			///< error context
	re_state_id _next_state;	///< next available state ID

	/**
	 * The array of transition tuples.  We'll allocate this array and
	 * expand it as necessary.
	 */
	Common::Array<re_tuple> _tuple_arr;

	// current group ID
	int _cur_group;

	// group registers
	re_group_register _regs[RE_GROUP_REG_CNT];

	/**
	 * Buffer for retaining a copy of the last string we scanned.  We
	 * retain our own copy of each string, and point the group registers
	 * into this copy rather than the caller's original string -- this
	 * ensures that the group registers remain valid even after the
	 * caller has deallocated the original string.
	 */
	char *_strbuf;

	/**
	 * length of the string currently in the buffer
	 */
	size_t _curlen;

	/**
	 * size of the buffer allocated to strbuf
	 */
	size_t _strbufsiz;
public:
	/**
	 * Constructor.  The memory for the context structure itself
	 * must be allocated and maintained by the caller.
	 */
	re_context(errcxdef *errctx);

	/**
	 * Destructor
	 */
	~re_context();

	/**
	 * Allocate a new state ID
	 */
	re_state_id alloc_state();

	/**
	 * Compile an expression and search for a match within the given string.
	 * Returns the offset of the match, or -1 if no match was found.
	 */
	int compile_and_search(const char *pattern, size_t patlen,
		const char *searchstr, size_t searchlen, int *result_len);

	/**
	 * Compile an expression and check for a match.  Returns the length of the match
	 * if we found a match, -1 if we found no match.  This is not a search function;
	 * we merely match the leading substring of the given string to the given pattern.
	 */
	int compile_and_match(const char *pattern, size_t patlen,
		const char *searchstr, size_t searchlen);
};

} // End of namespace TADS2
} // End of namespace TADS
} // End of namespace Glk

#endif