Mstdlib-1.24.0
m_re.h
1/* The MIT License (MIT)
2 *
3 * Copyright (c) 2019 Monetra Technologies, LLC.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a copy
6 * of this software and associated documentation files (the "Software"), to deal
7 * in the Software without restriction, including without limitation the rights
8 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 * copies of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 * THE SOFTWARE.
22 */
23
24#ifndef __M_RE_H__
25#define __M_RE_H__
26
27/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
28
29#include <mstdlib/mstdlib.h>
30
31/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
32
33__BEGIN_DECLS
34
35/*! \addtogroup m_re Regular Expression
36 * \ingroup m_text
37 *
38 * The engine targets Perl/Python/PCRE expression syntax. However, this is
39 * not a full implementation of the syntax.
40 *
41 * The re engine uses DFA processing to ensure evaluation happens in a
42 * reasonable amount of time. It does not use backtracking, which prevents
43 * pathological expressions from causing very slow run times. Because of this,
44 * back references in patterns are not supported.
45 *
46 * Patterns are thread safe and re-entrant.
47 *
48 * ## Supported:
49 *
50 * ### Syntax
51 *
52 * Expression | Description
53 * ---------- | -----------
54 * `.` | any character (except newline, see DOTALL)
55 * `^` | Start of string. Or start of line in MULTILINE
56 * `$` | End of string. Or end of line in MULTILINE
57 * `*` | 0 or more repetitions
58 * `+` | 1 or more repetitions
59 * `?` | 0 or 1 repetitions
60 * `*? +? ??` | Ungreedy version of repetition
61 * `{#}` | Exactly # of repetitions
62 * `{#,}` | # or more repetitions
63 * `{#,#}` | Inclusive of # and # repetitions
64 * `\` | Escape character. E.g. `\\ => \`
65 * `[]` | Character range. Can be specific characters or '-' specified range. Multiple ranges can be specified. E.g. `[a-z-8XYZ]`
66 * `[^]` | Negative character range. Can be specific characters or '-' specified range. Multiple ranges can be specified. E.g. `[^a-z-8XYZ]`
67 * \| | Composite A or B. E.g. A\|B
68 * `()` | Pattern and capture group. Groups expressions together for evaluation when used with \|. Also, defines a capture group.
69 * `(?imsU-imsU)` | Allows specifying compile flags in the expression. Supports `i` (ignore case), `m` (multiline), `s` (dot all), `U` (ungreedy). - can be used to disable a flag. E.g. (?im-s). Only allowed to be used once at the start of the pattern.
70 *
71 * \note \ as part of \| (pipe) shown in table is for escaping and not part of syntax.
72 *
73 * ### Escapes
74 *
75 * Expression | Description
76 * ---------- | -----------
77 * C escape sequences | Any standard escape sequence that is part of C. Such as, `\n` (newline) and `\t` (tab)
78 * `\xHH \x{HHHH}` | Hex values
79 * `\<` | Beginning of word
80 * `\>` | End of word
81 *
82 * ### Short hand character classes
83 *
84 * Cannot be used within brackets.
85 *
86 * ASCII only.
87 *
88 * Expression | Description
89 * ---------- | -----------
90 * `\s` | White space. Equivalent to `[ \t\n\r\f\v]`
91 * `\S` | Not white space. Equivalent to `[^ \t\n\r\f\v]`
92 * `\d` | Digit (number). Equivalent to `[0-9]`
93 * `\D` | Not digit. Equivalent to `[^0-9]`
94 * `\w` | Word. Equivalent to `[a-zA-Z0-9_]`
95 * `\W` | Not word. Equivalent to `[^a-zA-Z0-9_]`
96 *
97 * ### POSIX character classes for bracket expressions
98 *
99 * POSIX character classes _must_ be used inside `[]` expressions, e.g. `[[:alpha:]]`. `^` negation is supported with classes, e.g. `[^[:alpha:]]`.
100 *
101 * ASCII only.
102 *
103 * Range | Description
104 * ----- | -----------
105 * `[:alpha:]` | Alpha characters. Contains `[a-zA-Z]`
106 * `[:alnum:]` | Alpha numeric characters. Contains `[a-zA-Z0-9]`
107 * `[:word:]` | Alpha numeric characters. Contains `[a-zA-Z0-9_]`. Equivalent to `\w`
108 * `[:space:]` | White space characters. Contains `[ \t\r\n\v\f]`. Equivalent to `\s`
109 * `[:digit:]` | Digit (number) characters. Contains `[0-9]`. Equivalent to `\d`
110 * `[:cntrl:]` | Control characters. Contains `[\x00-\x1F\x7F]`. Note: `\x00` is the NULL string terminator so this is really `[\x01-\x1F\x7F]` because `\x00` can never be encountered in a string.
111 * `[:print:]` | Printable characters range. Contains `[\x20-\x7E]`
112 * `[:xdigit:]` | Hexadecimal digit range. Contains `[0-9a-fA-F]`
113 * `[:lower:]` | Lower case character range. Contains `[a-z]`
114 * `[:upper:]` | Upper case character range. Contains `[A-Z]`
115 * `[:blank:]` | Blank character range. Contains `[ \t]`
116 * `[:graph:]` | Graph character range. Contains `[\x21-\x7E]`
117 * `[:punct:]` | Punctuation character range. Contains `[!"\#$%&'()*+,\-./:;<=>?@\[\\\]^_\`{\|}~]`
118 *
119 * \note \ as part of \| (pipe) and \` shown in `[:punct:]` is for escaping and not part of character set.
120 *
121 * ### Features
122 * - Numbered captures (up to 99) are supported in M_re_sub's replacement string.
123 *
124 * ## Not supported:
125 *
126 * - Back references in patterns
127 * - Collating symbols (in brackets)
128 * - Equivalence classes (in brackets)
129 * - 100% POSIX conformance
130 * - BRE (Basic Regular Expression) syntax
131 * - \ escape short hands (\\d, \\w, ...) inside of a bracket ([]) expression.
132 *
133 * ## Match object
134 *
135 * Patterns can have capture groups which can be filled in a match object
136 * during string evaluation. Only numbered capture indexes are supported.
137 * Up to 99 captures can be recorded.
138 *
139 * Index 0 is the full match for the regular expression. If the pattern matches
140 * the string, this will always be populated. Groups (when present) are numbered
141 * 1-99.
142 *
143 * If a capture is present the index will be available. Composite (|) patterns
144 * can cause gaps in captures. For example, captures 1 and 5 could be present while
145 * captures 3 and 4 are not. Also, captures can be present but have zero length.
146 *
147 * Finally, captures are reported with offset from the start of the string and
148 * the length of the captured data. This is different than some other libraries
149 * which return start and end offsets. Utilizing length instead of end offsets
150 * was decided based on captures being passed to other functions, the majority
151 * of which take a start and length; not an end offset.
152 *
153 * ## Unicode
154 *
155 * Patterns and strings are expected to be UTF-8 encoded and will be interpreted
156 * as such.
157 *
158 * While Unicode is supported, normalization is not. Every Unicode code point is
159 * treated as a unique character. Many characters can be represented by more than
160 * one sequence of code points. Equivalence is not applied, so each code point is
161 * treated as its own character.
162 *
163 * @{
164 */
165
/*! Opaque regular expression object. Only ever handled through a pointer;
 * the structure itself is not defined in this header. */
typedef struct M_re M_re_t;

/*! Opaque match object. Only ever handled through a pointer; the structure
 * itself is not defined in this header.
 *
 * NOTE(review): the struct tag is spelled M_ret_match rather than M_re_match.
 * It is kept unchanged because the definition lives outside this header.
 */
typedef struct M_ret_match M_re_match_t;
171
172
/*! Pattern modifier options.
 *
 * Every modifier other than M_RE_NONE occupies its own bit.
 */
typedef enum {
	M_RE_NONE      = 0,      /*!< No modifiers applied. */
	M_RE_CASECMP   = 1 << 0, /*!< Matching should be case insensitive. */
	M_RE_MULTILINE = 1 << 1, /*!< ^ and $ match start and end of lines instead of start and end of string. */
	M_RE_DOTALL    = 1 << 2, /*!< Dot matches all characters including new line. */
	M_RE_UNGREEDY  = 1 << 3  /*!< Invert behavior of greedy quantifiers. E.g. ? acts like ?? and ?? acts like ?. */
} M_re_flags_t;
181
182
183/*! Compile a regular expression pattern.
184 *
185 * \param[in] pattern The pattern to compile.
186 * \param[in] flags M_re_flags_t flags controlling pattern behavior.
187 *
188 * \return Re object on success. NULL on compilation error.
189 */
190M_API M_re_t *M_re_compile(const char *pattern, M_uint32 flags);
191
192
193/*! Destroy a re object.
194 *
195 * \param[in] re Re object.
196 */
197M_API void M_re_destroy(M_re_t *re);
198
199
200/*! Search for the first match of patten in string.
201 *
202 * \param[in] re Re object.
203 * \param[in] str String to evaluate.
204 * \param[out] match Optional match object.
205 *
206 * \return M_TRUE if match was found. Otherwise, M_FALSE.
207 */
208M_API M_bool M_re_search(const M_re_t *re, const char *str, M_re_match_t **match);
209
210
211/*! Check if the pattern matches from the beginning of the string.
212 *
213 * Equivalent to the pattern starting with ^ and not multi line.
214 *
215 * \param[in] re Re object.
216 * \param[in] str String to evaluate.
217 *
218 * \return M_TRUE if match was found. Otherwise, M_FALSE.
219 */
220M_API M_bool M_re_eq_start(const M_re_t *re, const char *str);
221
222
223/*! Check if the pattern matches the entire string
224 *
225 * Equivalent to the pattern starting with ^, ending with $ and not multi line.
226 *
227 * \param[in] re Re object.
228 * \param[in] str String to evaluate.
229 *
230 * \return M_TRUE if match was found. Otherwise, M_FALSE.
231 */
232M_API M_bool M_re_eq(const M_re_t *re, const char *str);
233
234
235/*! Get all pattern matches within a string.
236 *
237 * \param[in] re Re object.
238 * \param[in] str String to evaluate.
239 *
240 * \return List of M_re_match_t objects for every match found in the string.
241 * NULL if no matches found.
242 */
243M_API M_list_t *M_re_matches(const M_re_t *re, const char *str);
244
245
246/*! Get all matching text within a string.
247 *
248 * If locations of the text or captures are needed use
249 * M_re_matches.
250 *
251 * \param[in] re Re object.
252 * \param[in] str String to evaluate.
253 *
254 * \return List of matching strings for every match found in the string.
255 * NULL if no matches found.
256 */
257M_API M_list_str_t *M_re_find_all(const M_re_t *re, const char *str);
258
259
260/*! Substitute matching pattern in string.
261 *
262 * The replacement string can reference capture groups using
263 * `\#`, `\##`, `\g<#>`, `\g<##>`. The capture data applies to the match
264 * being evaluated. For example:
265 *
266 * \code
267 * pattern: ' (c-e)'
268 * string: 'a b c d e f g'
269 * repl: '\1'
270 *
271 * result: 'a bcde f g'
272 * \endcode
273 *
274 * \param[in] re Re object.
275 * \param[in] repl Replacement string.
276 * \param[in] str String to evaluate.
277 *
278 * \return String with substitutions or original string if no sub situations were made.
279 */
280M_API char *M_re_sub(const M_re_t *re, const char *repl, const char *str);
281
282
283/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
284
285/*! Destroy a match object.
286 *
287 * \param[in] match Match object.
288 */
290
291
292/*! Get a list of all the captured indexes.
293 *
294 * \param[in] match Match object.
295 *
296 * \return List of indexes. Otherwise NULL if no indexes captured.
297 */
299
300
301/*! Get the offset and length of a match at a given index.
302 *
303 * \param[in] match Match object.
304 * \param[in] idx Index.
305 * \param[out] offset Start of match from the beginning of evaluated string.
306 * \param[out] len Length of matched data.
307 *
308 * \return M_TRUE if match found for index. Otherwise, M_FALSE.
309 */
310M_API M_bool M_re_match_idx(const M_re_match_t *match, size_t idx, size_t *offset, size_t *len);
311
312/*! @} */
313
314__END_DECLS
315
316#endif /* __M_RE_H__ */
struct M_list M_list_t
Definition: m_list.h:92
struct M_list_str M_list_str_t
Definition: m_list_str.h:80
struct M_list_u64 M_list_u64_t
Definition: m_list_u64.h:78
M_list_u64_t * M_re_match_idxs(const M_re_match_t *match)
M_re_t * M_re_compile(const char *pattern, M_uint32 flags)
void M_re_match_destroy(M_re_match_t *match)
void M_re_destroy(M_re_t *re)
M_bool M_re_eq_start(const M_re_t *re, const char *str)
M_bool M_re_eq(const M_re_t *re, const char *str)
struct M_ret_match M_re_match_t
Definition: m_re.h:170
M_re_flags_t
Definition: m_re.h:174
M_list_str_t * M_re_find_all(const M_re_t *re, const char *str)
M_list_t * M_re_matches(const M_re_t *re, const char *str)
char * M_re_sub(const M_re_t *re, const char *repl, const char *str)
struct M_re M_re_t
Definition: m_re.h:167
M_bool M_re_search(const M_re_t *re, const char *str, M_re_match_t **match)
M_bool M_re_match_idx(const M_re_match_t *match, size_t idx, size_t *offset, size_t *len)
@ M_RE_UNGREEDY
Definition: m_re.h:179
@ M_RE_CASECMP
Definition: m_re.h:176
@ M_RE_MULTILINE
Definition: m_re.h:177
@ M_RE_DOTALL
Definition: m_re.h:178
@ M_RE_NONE
Definition: m_re.h:175