Line data Source code
1 : /* -*- buffer-read-only: t -*- vi: set ro: */
2 : /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 : #line 1
4 : /* Extended regular expression matching and search library.
5 : Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
6 : This file is part of the GNU C Library.
7 : Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8 :
9 : This program is free software; you can redistribute it and/or modify
10 : it under the terms of the GNU General Public License as published by
11 : the Free Software Foundation; either version 3, or (at your option)
12 : any later version.
13 :
14 : This program is distributed in the hope that it will be useful,
15 : but WITHOUT ANY WARRANTY; without even the implied warranty of
16 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 : GNU General Public License for more details.
18 :
19 : You should have received a copy of the GNU General Public License along
20 : with this program; if not, write to the Free Software Foundation,
21 : Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 :
23 : static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
24 : size_t length, reg_syntax_t syntax);
25 : static void re_compile_fastmap_iter (regex_t *bufp,
26 : const re_dfastate_t *init_state,
27 : char *fastmap);
28 : static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
29 : #ifdef RE_ENABLE_I18N
30 : static void free_charset (re_charset_t *cset);
31 : #endif /* RE_ENABLE_I18N */
32 : static void free_workarea_compile (regex_t *preg);
33 : static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34 : #ifdef RE_ENABLE_I18N
35 : static void optimize_utf8 (re_dfa_t *dfa);
36 : #endif
37 : static reg_errcode_t analyze (regex_t *preg);
38 : static reg_errcode_t preorder (bin_tree_t *root,
39 : reg_errcode_t (fn (void *, bin_tree_t *)),
40 : void *extra);
41 : static reg_errcode_t postorder (bin_tree_t *root,
42 : reg_errcode_t (fn (void *, bin_tree_t *)),
43 : void *extra);
44 : static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 : static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 : static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 : bin_tree_t *node);
48 : static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 : static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 : static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 : static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
52 : static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
53 : unsigned int constraint);
54 : static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
55 : static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
56 : Idx node, bool root);
57 : static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
58 : static Idx fetch_number (re_string_t *input, re_token_t *token,
59 : reg_syntax_t syntax);
60 : static int peek_token (re_token_t *token, re_string_t *input,
61 : reg_syntax_t syntax) internal_function;
62 : static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
63 : reg_syntax_t syntax, reg_errcode_t *err);
64 : static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
65 : re_token_t *token, reg_syntax_t syntax,
66 : Idx nest, reg_errcode_t *err);
67 : static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
68 : re_token_t *token, reg_syntax_t syntax,
69 : Idx nest, reg_errcode_t *err);
70 : static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
71 : re_token_t *token, reg_syntax_t syntax,
72 : Idx nest, reg_errcode_t *err);
73 : static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
74 : re_token_t *token, reg_syntax_t syntax,
75 : Idx nest, reg_errcode_t *err);
76 : static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
77 : re_dfa_t *dfa, re_token_t *token,
78 : reg_syntax_t syntax, reg_errcode_t *err);
79 : static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
80 : re_token_t *token, reg_syntax_t syntax,
81 : reg_errcode_t *err);
82 : static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
83 : re_string_t *regexp,
84 : re_token_t *token, int token_len,
85 : re_dfa_t *dfa,
86 : reg_syntax_t syntax,
87 : bool accept_hyphen);
88 : static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 : re_string_t *regexp,
90 : re_token_t *token);
91 : #ifdef RE_ENABLE_I18N
92 : static reg_errcode_t build_equiv_class (bitset_t sbcset,
93 : re_charset_t *mbcset,
94 : Idx *equiv_class_alloc,
95 : const unsigned char *name);
96 : static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 : bitset_t sbcset,
98 : re_charset_t *mbcset,
99 : Idx *char_class_alloc,
100 : const unsigned char *class_name,
101 : reg_syntax_t syntax);
102 : #else /* not RE_ENABLE_I18N */
103 : static reg_errcode_t build_equiv_class (bitset_t sbcset,
104 : const unsigned char *name);
105 : static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
106 : bitset_t sbcset,
107 : const unsigned char *class_name,
108 : reg_syntax_t syntax);
109 : #endif /* not RE_ENABLE_I18N */
110 : static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
111 : RE_TRANSLATE_TYPE trans,
112 : const unsigned char *class_name,
113 : const unsigned char *extra,
114 : bool non_match, reg_errcode_t *err);
115 : static bin_tree_t *create_tree (re_dfa_t *dfa,
116 : bin_tree_t *left, bin_tree_t *right,
117 : re_token_type_t type);
118 : static bin_tree_t *create_token_tree (re_dfa_t *dfa,
119 : bin_tree_t *left, bin_tree_t *right,
120 : const re_token_t *token);
121 : static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
122 : static void free_token (re_token_t *node);
123 : static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
124 : static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125 :
126 : /* This table gives an error message for each of the error codes listed
127 : in regex.h. Obviously the order here has to be same as there.
128 : POSIX doesn't require that we do anything for REG_NOERROR,
129 : but why not be nice? */
130 :
131 : static const char __re_error_msgid[] =
132 : {
133 : #define REG_NOERROR_IDX 0
134 : gettext_noop ("Success") /* REG_NOERROR */
135 : "\0"
136 : #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
137 : gettext_noop ("No match") /* REG_NOMATCH */
138 : "\0"
139 : #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
140 : gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141 : "\0"
142 : #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
143 : gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144 : "\0"
145 : #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
146 : gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147 : "\0"
148 : #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
149 : gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150 : "\0"
151 : #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
152 : gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153 : "\0"
154 : #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
155 : gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156 : "\0"
157 : #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
158 : gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159 : "\0"
160 : #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
161 : gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162 : "\0"
163 : #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
164 : gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165 : "\0"
166 : #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
167 : gettext_noop ("Invalid range end") /* REG_ERANGE */
168 : "\0"
169 : #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
170 : gettext_noop ("Memory exhausted") /* REG_ESPACE */
171 : "\0"
172 : #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
173 : gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174 : "\0"
175 : #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
176 : gettext_noop ("Premature end of regular expression") /* REG_EEND */
177 : "\0"
178 : #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
179 : gettext_noop ("Regular expression too big") /* REG_ESIZE */
180 : "\0"
181 : #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
182 : gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 : };
184 :
185 : static const size_t __re_error_msgid_idx[] =
186 : {
187 : REG_NOERROR_IDX,
188 : REG_NOMATCH_IDX,
189 : REG_BADPAT_IDX,
190 : REG_ECOLLATE_IDX,
191 : REG_ECTYPE_IDX,
192 : REG_EESCAPE_IDX,
193 : REG_ESUBREG_IDX,
194 : REG_EBRACK_IDX,
195 : REG_EPAREN_IDX,
196 : REG_EBRACE_IDX,
197 : REG_BADBR_IDX,
198 : REG_ERANGE_IDX,
199 : REG_ESPACE_IDX,
200 : REG_BADRPT_IDX,
201 : REG_EEND_IDX,
202 : REG_ESIZE_IDX,
203 : REG_ERPAREN_IDX
204 : };
205 :
206 : /* Entry points for GNU code. */
207 :
208 : /* re_compile_pattern is the GNU regular expression compiler: it
209 : compiles PATTERN (of length LENGTH) and puts the result in BUFP.
210 : Returns 0 if the pattern was valid, otherwise an error string.
211 :
212 : Assumes the `allocated' (and perhaps `buffer') and `translate' fields
213 : are set in BUFP on entry. */
214 :
215 : #ifdef _LIBC
216 : const char *
217 : re_compile_pattern (pattern, length, bufp)
218 : const char *pattern;
219 : size_t length;
220 321 : struct re_pattern_buffer *bufp;
221 : #else /* size_t might promote */
222 : const char *
223 : re_compile_pattern (const char *pattern, size_t length,
224 : struct re_pattern_buffer *bufp)
225 : #endif
226 : {
227 : reg_errcode_t ret;
228 :
229 321 : /* And GNU code determines whether or not to get register information
230 : by passing null for the REGS argument to re_match, etc., not by
231 : setting no_sub, unless RE_NO_SUB is set. */
232 321 : bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
233 :
234 321 : /* Match anchors at newline. */
235 : bufp->newline_anchor = 1;
236 321 :
237 244 : ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
238 77 :
239 : if (!ret)
240 : return NULL;
241 : return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
242 : }
243 : #ifdef _LIBC
244 : weak_alias (__re_compile_pattern, re_compile_pattern)
245 : #endif
246 :
247 : /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
248 : also be assigned to arbitrarily: each pattern buffer stores its own
249 : syntax, so it can be changed between regex compilations. */
250 : /* This has no initializer because initialized variables in Emacs
251 : become read-only after dumping. */
252 : reg_syntax_t re_syntax_options;
253 :
254 :
255 : /* Specify the precise syntax of regexps for compilation. This provides
256 : for compatibility for various utilities which historically have
257 : different, incompatible syntaxes.
258 :
259 : The argument SYNTAX is a bit mask comprised of the various bits
260 0 : defined in regex.h. We return the old syntax. */
261 :
262 : reg_syntax_t
263 0 : re_set_syntax (syntax)
264 : reg_syntax_t syntax;
265 0 : {
266 0 : reg_syntax_t ret = re_syntax_options;
267 :
268 : re_syntax_options = syntax;
269 : return ret;
270 : }
271 : #ifdef _LIBC
272 : weak_alias (__re_set_syntax, re_set_syntax)
273 103 : #endif
274 :
275 : int
276 103 : re_compile_fastmap (bufp)
277 103 : struct re_pattern_buffer *bufp;
278 : {
279 103 : re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
280 103 : char *fastmap = bufp->fastmap;
281 103 :
282 9 : memset (fastmap, '\0', sizeof (char) * SBC_MAX);
283 103 : re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
284 9 : if (dfa->init_state != dfa->init_state_word)
285 103 : re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
286 9 : if (dfa->init_state != dfa->init_state_nl)
287 103 : re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
288 103 : if (dfa->init_state != dfa->init_state_begbuf)
289 : re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
290 : bufp->fastmap_accurate = 1;
291 : return 0;
292 : }
293 : #ifdef _LIBC
294 : weak_alias (__re_compile_fastmap, re_compile_fastmap)
295 : #endif
296 :
297 : static inline void
298 481 : __attribute ((always_inline))
299 481 : re_set_fastmap (char *fastmap, bool icase, int ch)
300 0 : {
301 : fastmap[ch] = 1;
302 : if (icase)
303 : fastmap[tolower (ch)] = 1;
304 : }
305 :
306 : /* Helper function for re_compile_fastmap.
307 130 : Compile fastmap for the initial_state INIT_STATE. */
308 :
309 : static void
310 130 : re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
311 : char *fastmap)
312 130 : {
313 248 : re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
314 : Idx node_cnt;
315 154 : bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
316 154 : for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
317 : {
318 154 : Idx node = init_state->nodes.elems[node_cnt];
319 : re_token_type_t type = dfa->nodes[node].type;
320 9 :
321 : if (type == CHARACTER)
322 9 : {
323 : re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
324 : #ifdef RE_ENABLE_I18N
325 : if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
326 : {
327 : unsigned char buf[MB_LEN_MAX];
328 : unsigned char *p;
329 0 : wchar_t wc;
330 0 : mbstate_t state;
331 0 :
332 0 : p = buf;
333 0 : *p++ = dfa->nodes[node].opr.c;
334 0 : while (++node < dfa->nodes_len
335 0 : && dfa->nodes[node].type == CHARACTER
336 0 : && dfa->nodes[node].mb_partial)
337 0 : *p++ = dfa->nodes[node].opr.c;
338 0 : memset (&state, '\0', sizeof (state));
339 : if (mbrtowc (&wc, (const char *) buf, p - buf,
340 0 : &state) == p - buf
341 : && (__wcrtomb ((char *) buf, towlower (wc), &state)
342 : != (size_t) -1))
343 : re_set_fastmap (fastmap, false, buf[0]);
344 145 : }
345 : #endif
346 : }
347 365 : else if (type == SIMPLE_BRACKET)
348 : {
349 : int i, ch;
350 292 : for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
351 18980 : {
352 18688 : int j;
353 472 : bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
354 : for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
355 : if (w & ((bitset_word_t) 1 << j))
356 : re_set_fastmap (fastmap, icase, ch);
357 72 : }
358 : }
359 : #ifdef RE_ENABLE_I18N
360 0 : else if (type == COMPLEX_BRACKET)
361 0 : {
362 0 : Idx i;
363 : re_charset_t *cset = dfa->nodes[node].opr.mbcset;
364 : if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
365 : || cset->nranges || cset->nchar_classes)
366 : {
367 : # ifdef _LIBC
368 : if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
369 : {
370 : /* In this case we want to catch the bytes which are
371 : the first byte of any collation elements.
372 : e.g. In da_DK, we want to catch 'a' since "aa"
373 : is a valid collation element, and don't catch
374 : 'b' since 'b' is the only collation element
375 : which starts from 'b'. */
376 : const int32_t *table = (const int32_t *)
377 : _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
378 : for (i = 0; i < SBC_MAX; ++i)
379 : if (table[i] < 0)
380 0 : re_set_fastmap (fastmap, icase, i);
381 0 : }
382 0 : # else
383 0 : if (dfa->mb_cur_max > 1)
384 : for (i = 0; i < SBC_MAX; ++i)
385 : if (__btowc (i) == WEOF)
386 0 : re_set_fastmap (fastmap, icase, i);
387 : # endif /* not _LIBC */
388 : }
389 : for (i = 0; i < cset->nmbchars; ++i)
390 0 : {
391 0 : char buf[256];
392 0 : mbstate_t state;
393 0 : memset (&state, '\0', sizeof (state));
394 : if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
395 0 : re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
396 : if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
397 0 : {
398 : if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
399 : != (size_t) -1)
400 : re_set_fastmap (fastmap, false, *(unsigned char *) buf);
401 : }
402 72 : }
403 : }
404 71 : #endif /* RE_ENABLE_I18N */
405 : else if (type == OP_PERIOD
406 71 : #ifdef RE_ENABLE_I18N
407 : || type == OP_UTF8_PERIOD
408 36 : #endif /* RE_ENABLE_I18N */
409 36 : || type == END_OF_RE)
410 35 : {
411 36 : memset (fastmap, '\1', sizeof (char) * SBC_MAX);
412 : if (type == END_OF_RE)
413 : bufp->can_be_null = 1;
414 : return;
415 : }
416 : }
417 : }
418 :
419 : /* Entry point for POSIX code. */
420 : /* regcomp takes a regular expression as a string and compiles it.
421 :
422 : PREG is a regex_t *. We do not expect any fields to be initialized,
423 : since POSIX says we shouldn't. Thus, we set
424 :
425 : `buffer' to the compiled pattern;
426 : `used' to the length of the compiled pattern;
427 : `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
428 : REG_EXTENDED bit in CFLAGS is set; otherwise, to
429 : RE_SYNTAX_POSIX_BASIC;
430 : `newline_anchor' to REG_NEWLINE being set in CFLAGS;
431 : `fastmap' to an allocated space for the fastmap;
432 : `fastmap_accurate' to zero;
433 : `re_nsub' to the number of subexpressions in PATTERN.
434 :
435 : PATTERN is the address of the pattern string.
436 :
437 : CFLAGS is a series of bits which affect compilation.
438 :
439 : If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
440 : use POSIX basic syntax.
441 :
442 : If REG_NEWLINE is set, then . and [^...] don't match newline.
443 : Also, regexec will try a match beginning after every newline.
444 :
445 : If REG_ICASE is set, then we considers upper- and lowercase
446 : versions of letters to be equivalent when matching.
447 :
448 : If REG_NOSUB is set, then when PREG is passed to regexec, that
449 : routine will report only success or failure, and nothing about the
450 : registers.
451 :
452 : It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
453 0 : the return codes and their meanings.) */
454 :
455 : int
456 : regcomp (preg, pattern, cflags)
457 : regex_t *_Restrict_ preg;
458 : const char *_Restrict_ pattern;
459 0 : int cflags;
460 0 : {
461 : reg_errcode_t ret;
462 0 : reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
463 0 : : RE_SYNTAX_POSIX_BASIC);
464 0 :
465 : preg->buffer = NULL;
466 : preg->allocated = 0;
467 0 : preg->used = 0;
468 0 :
469 0 : /* Try to allocate space for the fastmap. */
470 : preg->fastmap = re_malloc (char, SBC_MAX);
471 0 : if (BE (preg->fastmap == NULL, 0))
472 : return REG_ESPACE;
473 :
474 0 : syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
475 :
476 0 : /* If REG_NEWLINE is set, newlines are treated differently. */
477 0 : if (cflags & REG_NEWLINE)
478 : { /* REG_NEWLINE implies neither . nor [^...] match newline. */
479 0 : syntax &= ~RE_DOT_NEWLINE;
480 : syntax |= RE_HAT_LISTS_NOT_NEWLINE;
481 : /* It also changes the matching behavior. */
482 0 : preg->newline_anchor = 1;
483 0 : }
484 0 : else
485 : preg->newline_anchor = 0;
486 0 : preg->no_sub = !!(cflags & REG_NOSUB);
487 : preg->translate = NULL;
488 :
489 : ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
490 0 :
491 0 : /* POSIX doesn't distinguish between an unmatched open-group and an
492 : unmatched close-group: both are REG_EPAREN. */
493 : if (ret == REG_ERPAREN)
494 0 : ret = REG_EPAREN;
495 :
496 : /* We have already checked preg->fastmap != NULL. */
497 0 : if (BE (ret == REG_NOERROR, 1))
498 : /* Compute the fastmap now, since regexec cannot modify the pattern
499 : buffer. This function never fails in this implementation. */
500 : (void) re_compile_fastmap (preg);
501 0 : else
502 0 : {
503 : /* Some error occurred while compiling the expression. */
504 : re_free (preg->fastmap);
505 0 : preg->fastmap = NULL;
506 : }
507 :
508 : return (int) ret;
509 : }
510 : #ifdef _LIBC
511 : weak_alias (__regcomp, regcomp)
512 : #endif
513 :
514 : /* Returns a message corresponding to an error code, ERRCODE, returned
515 : from either regcomp or regexec. We don't use PREG here. */
516 :
517 : #ifdef _LIBC
518 : size_t
519 : regerror (errcode, preg, errbuf, errbuf_size)
520 : int errcode;
521 : const regex_t *_Restrict_ preg;
522 : char *_Restrict_ errbuf;
523 0 : size_t errbuf_size;
524 : #else /* size_t might promote */
525 : size_t
526 : regerror (int errcode, const regex_t *_Restrict_ preg,
527 : char *_Restrict_ errbuf, size_t errbuf_size)
528 : #endif
529 : {
530 0 : const char *msg;
531 : size_t msg_size;
532 :
533 : if (BE (errcode < 0
534 : || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 : / sizeof (__re_error_msgid_idx[0])), 0))
536 : /* Only error codes returned by the rest of the code should be passed
537 0 : to this routine. If we are given anything else, or if other regex
538 : code generates an invalid error code, then the program has a bug.
539 0 : Dump core so we can fix it. */
540 : abort ();
541 0 :
542 : msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
543 0 :
544 : msg_size = strlen (msg) + 1; /* Includes the null. */
545 0 :
546 0 : if (BE (errbuf_size != 0, 1))
547 : {
548 0 : size_t cpy_size = msg_size;
549 0 : if (BE (msg_size > errbuf_size, 0))
550 : {
551 0 : cpy_size = errbuf_size - 1;
552 : errbuf[cpy_size] = '\0';
553 : }
554 0 : memcpy (errbuf, msg, cpy_size);
555 : }
556 :
557 : return msg_size;
558 : }
559 : #ifdef _LIBC
560 : weak_alias (__regerror, regerror)
561 : #endif
562 :
563 :
564 : #ifdef RE_ENABLE_I18N
565 : /* This static array is used for the map to single-byte characters when
566 : UTF-8 is used. Otherwise we would allocate memory just to initialize
567 : it the same all the time. UTF-8 is the preferred encoding so this is
568 : a worthwhile optimization. */
569 : static const bitset_t utf8_sb_map =
570 : {
571 : /* Set the first 128 bits. */
572 : # if 4 * BITSET_WORD_BITS < ASCII_CHARS
573 : # error "bitset_word_t is narrower than 32 bits"
574 : # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
575 : BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
576 : # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
577 : BITSET_WORD_MAX, BITSET_WORD_MAX,
578 : # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
579 : BITSET_WORD_MAX,
580 : # endif
581 : (BITSET_WORD_MAX
582 : >> (SBC_MAX % BITSET_WORD_BITS == 0
583 : ? 0
584 : : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
585 : };
586 : #endif
587 116 :
588 :
589 : static void
590 : free_dfa_content (re_dfa_t *dfa)
591 116 : {
592 233 : Idx i, j;
593 117 :
594 116 : if (dfa->nodes)
595 233 : for (i = 0; i < dfa->nodes_len; ++i)
596 : free_token (dfa->nodes + i);
597 117 : re_free (dfa->nexts);
598 117 : for (i = 0; i < dfa->nodes_len; ++i)
599 117 : {
600 0 : if (dfa->eclosures != NULL)
601 117 : re_node_set_free (dfa->eclosures + i);
602 117 : if (dfa->inveclosures != NULL)
603 : re_node_set_free (dfa->inveclosures + i);
604 116 : if (dfa->edests != NULL)
605 116 : re_node_set_free (dfa->edests + i);
606 116 : }
607 116 : re_free (dfa->edests);
608 : re_free (dfa->eclosures);
609 116 : re_free (dfa->inveclosures);
610 699 : re_free (dfa->nodes);
611 :
612 583 : if (dfa->state_table)
613 696 : for (i = 0; i <= dfa->state_hash_mask; ++i)
614 : {
615 113 : struct re_state_table_entry *entry = dfa->state_table + i;
616 113 : for (j = 0; j < entry->num; ++j)
617 : {
618 583 : re_dfastate_t *state = entry->array[j];
619 : free_state (state);
620 116 : }
621 : re_free (entry->array);
622 116 : }
623 116 : re_free (dfa->state_table);
624 : #ifdef RE_ENABLE_I18N
625 116 : if (dfa->sb_char != utf8_sb_map)
626 : re_free (dfa->sb_char);
627 : #endif
628 : re_free (dfa->subexp_map);
629 : #ifdef DEBUG
630 116 : re_free (dfa->re_str);
631 116 : #endif
632 :
633 : re_free (dfa);
634 : }
635 :
636 :
637 39 : /* Free dynamically allocated space used by PREG. */
638 :
639 : void
640 39 : regfree (preg)
641 39 : regex_t *preg;
642 39 : {
643 39 : re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
644 39 : if (BE (dfa != NULL, 1))
645 : free_dfa_content (dfa);
646 39 : preg->buffer = NULL;
647 39 : preg->allocated = 0;
648 :
649 39 : re_free (preg->fastmap);
650 39 : preg->fastmap = NULL;
651 39 :
652 : re_free (preg->translate);
653 : preg->translate = NULL;
654 : }
655 : #ifdef _LIBC
656 : weak_alias (__regfree, regfree)
657 : #endif
658 :
659 : /* Entry points compatible with 4.2 BSD regex library. We don't define
660 : them unless specifically requested. */
661 :
662 : #if defined _REGEX_RE_COMP || defined _LIBC
663 :
664 : /* BSD has one and only one pattern buffer. */
665 : static struct re_pattern_buffer re_comp_buf;
666 :
667 : char *
668 : # ifdef _LIBC
669 : /* Make these definitions weak in libc, so POSIX programs can redefine
670 : these names if they don't use our functions, and still use
671 : regcomp/regexec above without link errors. */
672 : weak_function
673 : # endif
674 : re_comp (s)
675 : const char *s;
676 : {
677 : reg_errcode_t ret;
678 : char *fastmap;
679 :
680 : if (!s)
681 : {
682 : if (!re_comp_buf.buffer)
683 : return gettext ("No previous regular expression");
684 : return 0;
685 : }
686 :
687 : if (re_comp_buf.buffer)
688 : {
689 : fastmap = re_comp_buf.fastmap;
690 : re_comp_buf.fastmap = NULL;
691 : __regfree (&re_comp_buf);
692 : memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
693 : re_comp_buf.fastmap = fastmap;
694 : }
695 :
696 : if (re_comp_buf.fastmap == NULL)
697 : {
698 : re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
699 : if (re_comp_buf.fastmap == NULL)
700 : return (char *) gettext (__re_error_msgid
701 : + __re_error_msgid_idx[(int) REG_ESPACE]);
702 : }
703 :
704 : /* Since `re_exec' always passes NULL for the `regs' argument, we
705 : don't need to initialize the pattern buffer fields which affect it. */
706 :
707 : /* Match anchors at newlines. */
708 : re_comp_buf.newline_anchor = 1;
709 :
710 : ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
711 :
712 : if (!ret)
713 : return NULL;
714 :
715 : /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
716 : return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
717 : }
718 :
719 : #ifdef _LIBC
720 : libc_freeres_fn (free_mem)
721 : {
722 : __regfree (&re_comp_buf);
723 : }
724 : #endif
725 :
726 : #endif /* _REGEX_RE_COMP */
727 :
728 : /* Internal entry point.
729 : Compile the regular expression PATTERN, whose length is LENGTH.
730 321 : SYNTAX indicate regular expression's syntax. */
731 :
732 : static reg_errcode_t
733 321 : re_compile_internal (regex_t *preg, const char * pattern, size_t length,
734 : reg_syntax_t syntax)
735 : {
736 : reg_errcode_t err = REG_NOERROR;
737 : re_dfa_t *dfa;
738 321 : re_string_t regexp;
739 321 :
740 321 : /* Initialize the pattern buffer. */
741 321 : preg->fastmap_accurate = 0;
742 321 : preg->syntax = syntax;
743 321 : preg->not_bol = preg->not_eol = 0;
744 321 : preg->used = 0;
745 : preg->re_nsub = 0;
746 : preg->can_be_null = 0;
747 321 : preg->regs_allocated = REGS_UNALLOCATED;
748 321 :
749 : /* Initialize the dfa. */
750 : dfa = (re_dfa_t *) preg->buffer;
751 : if (BE (preg->allocated < sizeof (re_dfa_t), 0))
752 : {
753 : /* If zero allocated, but buffer is non-null, try to realloc
754 321 : enough space. This loses if buffer's address is bogus, but
755 321 : that is the user's responsibility. If ->buffer is NULL this
756 0 : is a simple allocation. */
757 321 : dfa = re_realloc (preg->buffer, re_dfa_t, 1);
758 321 : if (dfa == NULL)
759 : return REG_ESPACE;
760 321 : preg->allocated = sizeof (re_dfa_t);
761 : preg->buffer = (unsigned char *) dfa;
762 321 : }
763 321 : preg->used = sizeof (re_dfa_t);
764 :
765 0 : err = init_dfa (dfa, length);
766 0 : if (BE (err != REG_NOERROR, 0))
767 0 : {
768 0 : free_dfa_content (dfa);
769 : preg->buffer = NULL;
770 : preg->allocated = 0;
771 : return err;
772 : }
773 : #ifdef DEBUG
774 : /* Note: length+1 will not overflow since it is checked in init_dfa. */
775 : dfa->re_str = re_malloc (char, length + 1);
776 : strncpy (dfa->re_str, pattern, length + 1);
777 : #endif
778 321 :
779 321 : __libc_lock_init (dfa->lock);
780 321 :
781 : err = re_string_construct (®exp, pattern, length, preg->translate,
782 0 : syntax & RE_ICASE, dfa);
783 77 : if (BE (err != REG_NOERROR, 0))
784 77 : {
785 77 : re_compile_internal_free_return:
786 77 : free_workarea_compile (preg);
787 77 : re_string_destruct (®exp);
788 77 : free_dfa_content (dfa);
789 : preg->buffer = NULL;
790 : preg->allocated = 0;
791 : return err;
792 321 : }
793 321 :
794 321 : /* Parse the regular expression, and build a structure tree. */
795 77 : preg->re_nsub = 0;
796 : dfa->str_tree = parse (®exp, preg, syntax, &err);
797 : if (BE (dfa->str_tree == NULL, 0))
798 244 : goto re_compile_internal_free_return;
799 244 :
800 0 : /* Analyze the tree and create the nfa. */
801 : err = analyze (preg);
802 : if (BE (err != REG_NOERROR, 0))
803 : goto re_compile_internal_free_return;
804 244 :
805 0 : #ifdef RE_ENABLE_I18N
806 : /* If possible, do searching in single byte encoding to speed things up. */
807 : if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
808 : optimize_utf8 (dfa);
809 244 : #endif
810 :
811 : /* Then create the initial state of the dfa. */
812 244 : err = create_initial_state (dfa);
813 244 :
814 : /* Release work areas. */
815 244 : free_workarea_compile (preg);
816 : re_string_destruct (®exp);
817 0 :
818 0 : if (BE (err != REG_NOERROR, 0))
819 0 : {
820 : free_dfa_content (dfa);
821 : preg->buffer = NULL;
822 244 : preg->allocated = 0;
823 : }
824 :
825 : return err;
826 : }
827 :
828 : /* Initialize DFA. We use the length of the regular expression PAT_LEN
829 321 : as the initial length of some arrays. */
830 :
831 : static reg_errcode_t
832 : init_dfa (re_dfa_t *dfa, size_t pat_len)
833 321 : {
834 : __re_size_t table_size;
835 : #ifdef RE_ENABLE_I18N
836 : size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
837 321 : #else
838 321 : size_t max_i18n_object_size = 0;
839 : #endif
840 : size_t max_object_size =
841 : MAX (sizeof (struct re_state_table_entry),
842 : MAX (sizeof (re_token_t),
843 : MAX (sizeof (re_node_set),
844 321 : MAX (sizeof (regmatch_t),
845 : max_i18n_object_size))));
846 :
847 321 : memset (dfa, '\0', sizeof (re_dfa_t));
848 :
849 : /* Force allocation of str_tree_storage the first time. */
850 : dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
851 :
852 : /* Avoid overflows. The extra "/ 2" is for the table_size doubling
853 321 : calculation below, and for similar doubling calculations
854 0 : elsewhere. And it's <= rather than <, because some of the
855 : doubling calculations add 1 afterwards. */
856 321 : if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
857 321 : return REG_ESPACE;
858 :
859 : dfa->nodes_alloc = pat_len + 1;
860 1098 : dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
861 1875 :
862 321 : /* table_size = 2 ^ ceil(log pat_len) */
863 : for (table_size = 1; ; table_size <<= 1)
864 321 : if (table_size > pat_len)
865 321 : break;
866 :
867 321 : dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
868 : dfa->state_hash_mask = table_size - 1;
869 :
870 : dfa->mb_cur_max = MB_CUR_MAX;
871 : #ifdef _LIBC
872 : if (dfa->mb_cur_max == 6
873 : && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
874 : dfa->is_utf8 = 1;
875 321 : dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
876 0 : != 0);
877 : #else
878 : if (strcmp (locale_charset (), "UTF-8") == 0)
879 : dfa->is_utf8 = 1;
880 321 :
881 : /* We check exhaustively in the loop below if this charset is a
882 : superset of ASCII. */
883 : dfa->map_notascii = 0;
884 321 : #endif
885 :
886 0 : #ifdef RE_ENABLE_I18N
887 0 : if (dfa->mb_cur_max > 1)
888 : {
889 : if (dfa->is_utf8)
890 : dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
891 : else
892 0 : {
893 0 : int i, j, ch;
894 0 :
895 : dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
896 : if (BE (dfa->sb_char == NULL, 0))
897 0 : return REG_ESPACE;
898 0 :
899 : /* Set the bits corresponding to single byte chars. */
900 0 : for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
901 0 : for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
902 0 : {
903 : wint_t wch = __btowc (ch);
904 0 : if (wch != WEOF)
905 0 : dfa->sb_char[i] |= (bitset_word_t) 1 << j;
906 : # ifndef _LIBC
907 : if (isascii (ch) && wch != ch)
908 : dfa->map_notascii = 1;
909 : # endif
910 : }
911 : }
912 321 : }
913 0 : #endif
914 321 :
915 : if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
916 : return REG_ESPACE;
917 : return REG_NOERROR;
918 : }
919 :
920 : /* Initialize WORD_CHAR table, which indicate which character is
921 : "word". In this case "word" means that it is the word construction
922 : character used by some operators like "\<", "\>", etc. */
923 27 :
924 : static void
925 : internal_function
926 27 : init_word_char (re_dfa_t *dfa)
927 135 : {
928 7020 : int i, j, ch;
929 6912 : dfa->word_ops_used = 1;
930 1701 : for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
931 27 : for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
932 : if (isalnum (ch) || ch == '_')
933 : dfa->word_char[i] |= (bitset_word_t) 1 << j;
934 : }
935 :
936 321 : /* Free the work area which are only used while compiling. */
937 :
938 321 : static void
939 : free_workarea_compile (regex_t *preg)
940 676 : {
941 : re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
942 355 : bin_tree_storage_t *storage, *next;
943 355 : for (storage = dfa->str_tree_storage; storage; storage = next)
944 : {
945 321 : next = storage->next;
946 321 : re_free (storage);
947 321 : }
948 321 : dfa->str_tree_storage = NULL;
949 321 : dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
950 321 : dfa->str_tree = NULL;
951 : re_free (dfa->org_indices);
952 : dfa->org_indices = NULL;
953 : }
954 :
955 244 : /* Create initial states for all contexts. */
956 :
957 : static reg_errcode_t
958 : create_initial_state (re_dfa_t *dfa)
959 : {
960 : Idx first, i;
961 : reg_errcode_t err;
962 : re_node_set init_nodes;
963 244 :
964 244 : /* Initial states have the epsilon closure of the node which is
965 244 : the first node of the regular expression. */
966 244 : first = dfa->str_tree->first->node_idx;
967 0 : dfa->init_node = first;
968 : err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
969 : if (BE (err != REG_NOERROR, 0))
970 : return err;
971 :
972 : /* The back-references which are in initial states can epsilon transit,
973 244 : since in this case all of the subexpressions can be null.
974 0 : Then we add epsilon closures of the nodes which are the next nodes of
975 : the back-references. */
976 0 : if (dfa->nbackref > 0)
977 0 : for (i = 0; i < init_nodes.nelem; ++i)
978 : {
979 : Idx node_idx = init_nodes.elems[i];
980 0 : re_token_type_t type = dfa->nodes[node_idx].type;
981 0 :
982 0 : Idx clexp_idx;
983 : if (type != OP_BACK_REF)
984 : continue;
985 0 : for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
986 0 : {
987 0 : re_token_t *clexp_node;
988 0 : clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
989 : if (clexp_node->type == OP_CLOSE_SUBEXP
990 0 : && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
991 0 : break;
992 : }
993 0 : if (clexp_idx == init_nodes.nelem)
994 : continue;
995 0 :
996 0 : if (type == OP_BACK_REF)
997 : {
998 0 : Idx dest_idx = dfa->edests[node_idx].elems[0];
999 0 : if (!re_node_set_contains (&init_nodes, dest_idx))
1000 : {
1001 : re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1002 : i = 0;
1003 : }
1004 : }
1005 244 : }
1006 :
1007 244 : /* It must be the first time to invoke acquire_state. */
1008 0 : dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1009 244 : /* We don't check ERR here, since the initial state must not be NULL. */
1010 : if (BE (dfa->init_state == NULL, 0))
1011 43 : return err;
1012 : if (dfa->init_state->has_constraint)
1013 43 : {
1014 : dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1015 43 : CONTEXT_WORD);
1016 : dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1017 : CONTEXT_NEWLINE);
1018 : dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1019 43 : &init_nodes,
1020 : CONTEXT_NEWLINE
1021 0 : | CONTEXT_BEGBUF);
1022 : if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1023 : || dfa->init_state_begbuf == NULL, 0))
1024 201 : return err;
1025 201 : }
1026 : else
1027 244 : dfa->init_state_word = dfa->init_state_nl
1028 244 : = dfa->init_state_begbuf = dfa->init_state;
1029 :
1030 : re_node_set_free (&init_nodes);
1031 : return REG_NOERROR;
1032 : }
1033 :
1034 : #ifdef RE_ENABLE_I18N
1035 : /* If it is possible to do searching in single byte encoding instead of UTF-8
1036 : to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1037 0 : DFA nodes where needed. */
1038 :
1039 : static void
1040 : optimize_utf8 (re_dfa_t *dfa)
1041 0 : {
1042 0 : Idx node;
1043 : int i;
1044 0 : bool mb_chars = false;
1045 0 : bool has_period = false;
1046 :
1047 0 : for (node = 0; node < dfa->nodes_len; ++node)
1048 0 : switch (dfa->nodes[node].type)
1049 0 : {
1050 0 : case CHARACTER:
1051 0 : if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1052 0 : mb_chars = true;
1053 : break;
1054 0 : case ANCHOR:
1055 : switch (dfa->nodes[node].opr.ctx_type)
1056 : {
1057 : case LINE_FIRST:
1058 0 : case LINE_LAST:
1059 0 : case BUF_FIRST:
1060 : case BUF_LAST:
1061 0 : break;
1062 : default:
1063 0 : /* Word anchors etc. cannot be handled. */
1064 0 : return;
1065 0 : }
1066 0 : break;
1067 0 : case OP_PERIOD:
1068 : has_period = true;
1069 : break;
1070 : case OP_BACK_REF:
1071 : case OP_ALT:
1072 : case END_OF_RE:
1073 0 : case OP_DUP_ASTERISK:
1074 0 : case OP_OPEN_SUBEXP:
1075 0 : case OP_CLOSE_SUBEXP:
1076 0 : break;
1077 : case COMPLEX_BRACKET:
1078 : return;
1079 0 : case SIMPLE_BRACKET:
1080 : /* Just double check. */
1081 : {
1082 0 : int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1083 : ? 0
1084 0 : : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1085 0 : for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1086 0 : {
1087 : if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1088 : return;
1089 0 : rshift = 0;
1090 0 : }
1091 0 : }
1092 : break;
1093 : default:
1094 0 : abort ();
1095 0 : }
1096 :
1097 0 : if (mb_chars || has_period)
1098 0 : for (node = 0; node < dfa->nodes_len; ++node)
1099 0 : {
1100 0 : if (dfa->nodes[node].type == CHARACTER
1101 0 : && dfa->nodes[node].opr.c >= ASCII_CHARS)
1102 : dfa->nodes[node].mb_partial = 0;
1103 : else if (dfa->nodes[node].type == OP_PERIOD)
1104 : dfa->nodes[node].type = OP_UTF8_PERIOD;
1105 0 : }
1106 0 :
1107 0 : /* The search can be in single byte locale. */
1108 : dfa->mb_cur_max = 1;
1109 : dfa->is_utf8 = 0;
1110 : dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1111 : }
1112 : #endif
1113 :
1114 : /* Analyze the structure tree, and calculate "first", "next", "edest",
1115 244 : "eclosure", and "inveclosure". */
1116 :
1117 244 : static reg_errcode_t
1118 : analyze (regex_t *preg)
1119 : {
1120 : re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1121 244 : reg_errcode_t ret;
1122 244 :
1123 244 : /* Allocate arrays. */
1124 244 : dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1125 244 : dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1126 : dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1127 0 : dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1128 : if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1129 244 : || dfa->eclosures == NULL, 0))
1130 244 : return REG_ESPACE;
1131 :
1132 : dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1133 314 : if (dfa->subexp_map != NULL)
1134 70 : {
1135 244 : Idx i;
1136 314 : for (i = 0; i < preg->re_nsub; i++)
1137 70 : dfa->subexp_map[i] = i;
1138 0 : preorder (dfa->str_tree, optimize_subexps, dfa);
1139 244 : for (i = 0; i < preg->re_nsub; i++)
1140 : if (dfa->subexp_map[i] != i)
1141 244 : break;
1142 244 : if (i == preg->re_nsub)
1143 : {
1144 : free (dfa->subexp_map);
1145 : dfa->subexp_map = NULL;
1146 244 : }
1147 244 : }
1148 0 :
1149 244 : ret = postorder (dfa->str_tree, lower_subexps, preg);
1150 244 : if (BE (ret != REG_NOERROR, 0))
1151 0 : return ret;
1152 244 : ret = postorder (dfa->str_tree, calc_first, dfa);
1153 244 : if (BE (ret != REG_NOERROR, 0))
1154 244 : return ret;
1155 0 : preorder (dfa->str_tree, calc_next, dfa);
1156 244 : ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1157 244 : if (BE (ret != REG_NOERROR, 0))
1158 0 : return ret;
1159 : ret = calc_eclosure (dfa);
1160 : if (BE (ret != REG_NOERROR, 0))
1161 : return ret;
1162 244 :
1163 174 : /* We only need this during the prune_impossible_nodes pass in regexec.c;
1164 : skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1165 70 : if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1166 70 : || dfa->nbackref)
1167 0 : {
1168 70 : dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1169 : if (BE (dfa->inveclosures == NULL, 0))
1170 : return REG_ESPACE;
1171 244 : ret = calc_inveclosure (dfa);
1172 : }
1173 :
1174 : return ret;
1175 : }
1176 :
1177 : /* Our parse trees are very unbalanced, so we cannot use a stack to
1178 488 : implement parse tree visits. Instead, we use parent pointers and
1179 : some hairy code in these two functions. */
1180 : static reg_errcode_t
1181 : postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1182 : void *extra)
1183 488 : {
1184 : bin_tree_t *node, *prev;
1185 :
1186 : for (node = root; ; )
1187 7330 : {
1188 1878 : /* Descend down the tree, preferably to the left (or to the right
1189 1876 : if that's the only child). */
1190 : while (node->left || node->right)
1191 2 : if (node->left)
1192 : node = node->left;
1193 : else
1194 : node = node->right;
1195 3858 :
1196 3858 : do
1197 0 : {
1198 3858 : reg_errcode_t err = fn (extra, node);
1199 488 : if (BE (err != REG_NOERROR, 0))
1200 3370 : return err;
1201 3370 : if (node->parent == NULL)
1202 : return REG_NOERROR;
1203 : prev = node;
1204 3370 : node = node->parent;
1205 1492 : }
1206 : /* Go up while we have a node that is reached from the right. */
1207 : while (node->right == prev || node->right == NULL);
1208 : node = node->right;
1209 : }
1210 732 : }
1211 :
1212 : static reg_errcode_t
1213 : preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1214 : void *extra)
1215 732 : {
1216 5160 : bin_tree_t *node;
1217 5892 :
1218 5892 : for (node = root; ; )
1219 0 : {
1220 : reg_errcode_t err = fn (extra, node);
1221 : if (BE (err != REG_NOERROR, 0))
1222 5892 : return err;
1223 2849 :
1224 : /* Go to the left node, or up and to the right. */
1225 : if (node->left)
1226 3043 : node = node->left;
1227 11246 : else
1228 : {
1229 5892 : bin_tree_t *prev = NULL;
1230 5892 : while (node->right == prev || node->right == NULL)
1231 5892 : {
1232 732 : prev = node;
1233 : node = node->parent;
1234 2311 : if (!node)
1235 : return REG_NOERROR;
1236 : }
1237 : node = node->right;
1238 : }
1239 : }
1240 : }
1241 :
1242 : /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1243 1824 : re_search_internal to map the inner one's opr.idx to this one's. Adjust
1244 : backreferences as well. Requires a preorder visit. */
1245 1824 : static reg_errcode_t
1246 : optimize_subexps (void *extra, bin_tree_t *node)
1247 1824 : {
1248 0 : re_dfa_t *dfa = (re_dfa_t *) extra;
1249 0 :
1250 0 : if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1251 0 : {
1252 : int idx = node->token.opr.idx;
1253 : node->token.opr.idx = dfa->subexp_map[idx];
1254 1824 : dfa->used_bkref_map |= 1 << node->token.opr.idx;
1255 70 : }
1256 :
1257 0 : else if (node->token.type == SUBEXP
1258 : && node->left && node->left->token.type == SUBEXP)
1259 0 : {
1260 0 : Idx other_idx = node->left->token.opr.idx;
1261 0 :
1262 : node->left = node->left->left;
1263 0 : if (node->left)
1264 0 : node->left->parent = node;
1265 0 :
1266 : dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1267 : if (other_idx < BITSET_WORD_BITS)
1268 1824 : dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1269 : }
1270 :
1271 : return REG_NOERROR;
1272 : }
1273 :
1274 1824 : /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1275 : of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1276 1824 : static reg_errcode_t
1277 1824 : lower_subexps (void *extra, bin_tree_t *node)
1278 : {
1279 1824 : regex_t *preg = (regex_t *) extra;
1280 : reg_errcode_t err = REG_NOERROR;
1281 0 :
1282 0 : if (node->left && node->left->token.type == SUBEXP)
1283 0 : {
1284 : node->left = lower_subexp (&err, preg, node->left);
1285 1824 : if (node->left)
1286 : node->left->parent = node;
1287 70 : }
1288 70 : if (node->right && node->right->token.type == SUBEXP)
1289 70 : {
1290 : node->right = lower_subexp (&err, preg, node->right);
1291 : if (node->right)
1292 1824 : node->right->parent = node;
1293 : }
1294 :
1295 : return err;
1296 70 : }
1297 :
1298 70 : static bin_tree_t *
1299 70 : lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1300 : {
1301 : re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1302 70 : bin_tree_t *body = node->left;
1303 : bin_tree_t *op, *cls, *tree1, *tree;
1304 :
1305 : if (preg->no_sub
1306 : /* We do not optimize empty subexpressions, because otherwise we may
1307 0 : have bad CONCAT nodes with NULL children. This is obviously not
1308 0 : very common, so we do not lose much. An example that triggers
1309 0 : this case is the sed "script" /\(\)/x. */
1310 0 : && node->left != NULL
1311 0 : && (node->token.opr.idx >= BITSET_WORD_BITS
1312 : || !(dfa->used_bkref_map
1313 : & ((bitset_word_t) 1 << node->token.opr.idx))))
1314 : return node->left;
1315 70 :
1316 70 : /* Convert the SUBEXP node to the concatenation of an
1317 70 : OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1318 70 : op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1319 70 : cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1320 : tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1321 0 : tree = create_tree (dfa, op, tree1, CONCAT);
1322 0 : if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1323 : {
1324 : *err = REG_ESPACE;
1325 70 : return NULL;
1326 70 : }
1327 70 :
1328 : op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1329 : op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1330 : return tree;
1331 : }
1332 :
1333 2034 : /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1334 : nodes. Requires a postorder visit. */
1335 2034 : static reg_errcode_t
1336 2034 : calc_first (void *extra, bin_tree_t *node)
1337 : {
1338 659 : re_dfa_t *dfa = (re_dfa_t *) extra;
1339 659 : if (node->token.type == CONCAT)
1340 : {
1341 : node->first = node->left->first;
1342 : node->node_idx = node->left->node_idx;
1343 1375 : }
1344 1375 : else
1345 1375 : {
1346 0 : node->first = node;
1347 : node->node_idx = re_dfa_add_node (dfa, node->token);
1348 2034 : if (BE (node->node_idx == REG_MISSING, 0))
1349 : return REG_ESPACE;
1350 : }
1351 : return REG_NOERROR;
1352 : }
1353 2034 :
1354 : /* Pass 2: compute NEXT on the tree. Preorder visit. */
1355 2034 : static reg_errcode_t
1356 : calc_next (void *extra, bin_tree_t *node)
1357 151 : {
1358 151 : switch (node->token.type)
1359 151 : {
1360 659 : case OP_DUP_ASTERISK:
1361 659 : node->left->next = node;
1362 659 : break;
1363 659 : case CONCAT:
1364 1224 : node->left->next = node->right->first;
1365 1224 : node->right->next = node->next;
1366 163 : break;
1367 1224 : default:
1368 158 : if (node->left)
1369 1224 : node->left->next = node->next;
1370 : if (node->right)
1371 2034 : node->right->next = node->next;
1372 : break;
1373 : }
1374 : return REG_NOERROR;
1375 : }
1376 2034 :
1377 : /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1378 2034 : static reg_errcode_t
1379 2034 : link_nfa_nodes (void *extra, bin_tree_t *node)
1380 2034 : {
1381 : re_dfa_t *dfa = (re_dfa_t *) extra;
1382 2034 : Idx idx = node->node_idx;
1383 : reg_errcode_t err = REG_NOERROR;
1384 659 :
1385 659 : switch (node->token.type)
1386 : {
1387 244 : case CONCAT:
1388 244 : break;
1389 244 :
1390 : case END_OF_RE:
1391 319 : assert (node->next == NULL);
1392 : break;
1393 :
1394 : case OP_DUP_ASTERISK:
1395 319 : case OP_ALT:
1396 319 : {
1397 314 : Idx left, right;
1398 : dfa->has_plural_match = 1;
1399 5 : if (node->left != NULL)
1400 319 : left = node->left->first->node_idx;
1401 158 : else
1402 : left = node->next->node_idx;
1403 161 : if (node->right != NULL)
1404 319 : right = node->right->first->node_idx;
1405 319 : else
1406 319 : right = node->next->node_idx;
1407 : assert (REG_VALID_INDEX (left));
1408 319 : assert (REG_VALID_INDEX (right));
1409 : err = re_node_set_init_2 (dfa->edests + idx, left, right);
1410 274 : }
1411 : break;
1412 :
1413 274 : case ANCHOR:
1414 274 : case OP_OPEN_SUBEXP:
1415 : case OP_CLOSE_SUBEXP:
1416 0 : err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1417 0 : break;
1418 0 :
1419 0 : case OP_BACK_REF:
1420 0 : dfa->nexts[idx] = node->next->node_idx;
1421 : if (node->token.type == OP_BACK_REF)
1422 538 : re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1423 538 : break;
1424 538 :
1425 538 : default:
1426 : assert (!IS_EPSILON_NODE (node->token.type));
1427 : dfa->nexts[idx] = node->next->node_idx;
1428 2034 : break;
1429 : }
1430 :
1431 : return err;
1432 : }
1433 :
1434 : /* Duplicate the epsilon closure of the node ROOT_NODE.
1435 : Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1436 : to their own constraint. */
1437 206 :
1438 : static reg_errcode_t
1439 : internal_function
1440 : duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1441 : Idx root_node, unsigned int init_constraint)
1442 206 : {
1443 206 : Idx org_node, clone_node;
1444 278 : bool ok;
1445 : unsigned int constraint = init_constraint;
1446 484 : for (org_node = top_org_node, clone_node = top_clone_node;;)
1447 : {
1448 : Idx org_dest, clone_dest;
1449 : if (dfa->nodes[org_node].type == OP_BACK_REF)
1450 : {
1451 : /* If the back reference epsilon-transit, its destination must
1452 0 : also have the constraint. Then duplicate the epsilon closure
1453 0 : of the destination of the back reference, and store it in
1454 0 : edests of the back reference. */
1455 0 : org_dest = dfa->nexts[org_node];
1456 0 : re_node_set_empty (dfa->edests + clone_node);
1457 0 : clone_dest = duplicate_node (dfa, org_dest, constraint);
1458 0 : if (BE (clone_dest == REG_MISSING, 0))
1459 0 : return REG_ESPACE;
1460 0 : dfa->nexts[clone_node] = dfa->nexts[org_node];
1461 : ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1462 484 : if (BE (! ok, 0))
1463 : return REG_ESPACE;
1464 : }
1465 : else if (dfa->edests[org_node].nelem == 0)
1466 : {
1467 206 : /* In case of the node can't epsilon-transit, don't duplicate the
1468 206 : destination and store the original destination as the
1469 : destination of the node. */
1470 278 : dfa->nexts[clone_node] = dfa->nexts[org_node];
1471 : break;
1472 : }
1473 : else if (dfa->edests[org_node].nelem == 1)
1474 206 : {
1475 206 : /* In case of the node can epsilon-transit, and it has only one
1476 206 : destination. */
1477 : org_dest = dfa->edests[org_node].elems[0];
1478 : re_node_set_empty (dfa->edests + clone_node);
1479 136 : if (dfa->nodes[org_node].type == ANCHOR)
1480 : {
1481 : /* In case of the node has another constraint, append it. */
1482 : if (org_node == root_node && clone_node != org_node)
1483 : {
1484 0 : /* ...but if the node is root_node itself, it means the
1485 0 : epsilon closure have a loop, then tie it to the
1486 0 : destination of the root_node. */
1487 0 : ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1488 : if (BE (! ok, 0))
1489 136 : return REG_ESPACE;
1490 : break;
1491 206 : }
1492 206 : constraint |= dfa->nodes[org_node].opr.ctx_type;
1493 0 : }
1494 206 : clone_dest = duplicate_node (dfa, org_dest, constraint);
1495 206 : if (BE (clone_dest == REG_MISSING, 0))
1496 0 : return REG_ESPACE;
1497 : ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498 : if (BE (! ok, 0))
1499 : return REG_ESPACE;
1500 : }
1501 : else /* dfa->edests[org_node].nelem == 2 */
1502 72 : {
1503 72 : /* In case of the node can epsilon-transit, and it has two
1504 : destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1505 72 : org_dest = dfa->edests[org_node].elems[0];
1506 72 : re_node_set_empty (dfa->edests + clone_node);
1507 : /* Search for a duplicated node which satisfies the constraint. */
1508 : clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1509 : if (clone_dest == REG_MISSING)
1510 72 : {
1511 72 : /* There are no such a duplicated node, create a new one. */
1512 0 : reg_errcode_t err;
1513 72 : clone_dest = duplicate_node (dfa, org_dest, constraint);
1514 72 : if (BE (clone_dest == REG_MISSING, 0))
1515 0 : return REG_ESPACE;
1516 72 : ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517 : if (BE (! ok, 0))
1518 72 : return REG_ESPACE;
1519 0 : err = duplicate_node_closure (dfa, org_dest, clone_dest,
1520 : root_node, constraint);
1521 : if (BE (err != REG_NOERROR, 0))
1522 : return err;
1523 : }
1524 : else
1525 0 : {
1526 0 : /* There are a duplicated node which satisfy the constraint,
1527 0 : use it to avoid infinite loop. */
1528 : ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529 : if (BE (! ok, 0))
1530 72 : return REG_ESPACE;
1531 72 : }
1532 72 :
1533 0 : org_dest = dfa->edests[org_node].elems[1];
1534 72 : clone_dest = duplicate_node (dfa, org_dest, constraint);
1535 72 : if (BE (clone_dest == REG_MISSING, 0))
1536 0 : return REG_ESPACE;
1537 : ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1538 278 : if (BE (! ok, 0))
1539 278 : return REG_ESPACE;
1540 : }
1541 206 : org_node = org_dest;
1542 : clone_node = clone_dest;
1543 : }
1544 : return REG_NOERROR;
1545 : }
1546 :
1547 : /* Search for a node which is duplicated from the node ORG_NODE, and
1548 72 : satisfies the constraint CONSTRAINT. */
1549 :
1550 : static Idx
1551 : search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1552 217 : unsigned int constraint)
1553 : {
1554 145 : Idx idx;
1555 1 : for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1556 0 : {
1557 : if (org_node == dfa->org_indices[idx]
1558 72 : && constraint == dfa->nodes[idx].constraint)
1559 : return idx; /* Found. */
1560 : }
1561 : return REG_MISSING; /* Not found. */
1562 : }
1563 :
1564 : /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1565 : Return the index of the new node, or REG_MISSING if insufficient storage is
1566 350 : available. */
1567 :
1568 350 : static Idx
1569 350 : duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1570 : {
1571 350 : Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1572 350 : if (BE (dup_idx != REG_MISSING, 1))
1573 2 : {
1574 350 : dfa->nodes[dup_idx].constraint = constraint;
1575 : if (dfa->nodes[org_idx].type == ANCHOR)
1576 : dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1577 350 : dfa->nodes[dup_idx].duplicated = 1;
1578 :
1579 350 : /* Store the index of the original node. */
1580 : dfa->org_indices[dup_idx] = org_idx;
1581 : }
1582 : return dup_idx;
1583 70 : }
1584 :
1585 : static reg_errcode_t
1586 : calc_inveclosure (re_dfa_t *dfa)
1587 1330 : {
1588 1260 : Idx src, idx;
1589 : bool ok;
1590 1330 : for (idx = 0; idx < dfa->nodes_len; ++idx)
1591 : re_node_set_init_empty (dfa->inveclosures + idx);
1592 1260 :
1593 5880 : for (src = 0; src < dfa->nodes_len; ++src)
1594 : {
1595 4620 : Idx *elems = dfa->eclosures[src].elems;
1596 4620 : for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1597 0 : {
1598 : ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1599 : if (BE (! ok, 0))
1600 : return REG_ESPACE;
1601 70 : }
1602 : }
1603 :
1604 : return REG_NOERROR;
1605 : }
1606 :
1607 244 : /* Calculate "eclosure" for all the node in DFA. */
1608 :
1609 : static reg_errcode_t
1610 : calc_eclosure (re_dfa_t *dfa)
1611 : {
1612 : Idx node_idx;
1613 : bool incomplete;
1614 244 : #ifdef DEBUG
1615 : assert (dfa->nodes_len > 0);
1616 1969 : #endif
1617 1725 : incomplete = false;
1618 : /* For each nodes, calculate epsilon closure. */
1619 : for (node_idx = 0; ; ++node_idx)
1620 1969 : {
1621 : reg_errcode_t err;
1622 244 : re_node_set eclosure_elem;
1623 244 : if (node_idx == dfa->nodes_len)
1624 0 : {
1625 0 : if (!incomplete)
1626 : break;
1627 : incomplete = false;
1628 : node_idx = 0;
1629 : }
1630 :
1631 : #ifdef DEBUG
1632 : assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1633 1725 : #endif
1634 998 :
1635 : /* If we have already calculated, skip it. */
1636 727 : if (dfa->eclosures[node_idx].nelem != 0)
1637 727 : continue;
1638 0 : /* Calculate epsilon closure of `node_idx'. */
1639 : err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1640 727 : if (BE (err != REG_NOERROR, 0))
1641 : return err;
1642 0 :
1643 0 : if (dfa->eclosures[node_idx].nelem == 0)
1644 : {
1645 : incomplete = true;
1646 244 : re_node_set_free (&eclosure_elem);
1647 : }
1648 : }
1649 : return REG_NOERROR;
1650 : }
1651 :
1652 1729 : /* Calculate epsilon closure of NODE. */
1653 :
1654 : static reg_errcode_t
1655 : calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1656 : {
1657 : reg_errcode_t err;
1658 : unsigned int constraint;
1659 : Idx i;
1660 1729 : bool incomplete;
1661 1729 : bool ok;
1662 1729 : re_node_set eclosure;
1663 0 : incomplete = false;
1664 : err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1665 : if (BE (err != REG_NOERROR, 0))
1666 : return err;
1667 1729 :
1668 : /* This indicates that we are calculating this node now.
1669 3458 : We reference this value to avoid infinite loop. */
1670 1729 : dfa->eclosures[node].nelem = REG_MISSING;
1671 :
1672 : constraint = ((dfa->nodes[node].type == ANCHOR)
1673 1729 : ? dfa->nodes[node].opr.ctx_type : 0);
1674 136 : /* If the current node has constraints, duplicate all nodes.
1675 136 : Since they must inherit the constraints. */
1676 : if (constraint
1677 134 : && dfa->edests[node].nelem
1678 134 : && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1679 0 : {
1680 : err = duplicate_node_closure (dfa, node, node, node, constraint);
1681 : if (BE (err != REG_NOERROR, 0))
1682 : return err;
1683 1729 : }
1684 1873 :
1685 : /* Expand each epsilon destination nodes. */
1686 : if (IS_EPSILON_NODE(dfa->nodes[node].type))
1687 1132 : for (i = 0; i < dfa->edests[node].nelem; ++i)
1688 : {
1689 : re_node_set eclosure_elem;
1690 1132 : Idx edest = dfa->edests[node].elems[i];
1691 : /* If calculating the epsilon closure of `edest' is in progress,
1692 3 : return intermediate result. */
1693 3 : if (dfa->eclosures[edest].nelem == REG_MISSING)
1694 : {
1695 : incomplete = true;
1696 : continue;
1697 1129 : }
1698 : /* If we haven't calculated the epsilon closure of `edest' yet,
1699 1002 : calculate now. Otherwise use calculated epsilon closure. */
1700 1002 : if (dfa->eclosures[edest].nelem == 0)
1701 0 : {
1702 : err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1703 : if (BE (err != REG_NOERROR, 0))
1704 127 : return err;
1705 : }
1706 1129 : else
1707 : eclosure_elem = dfa->eclosures[edest];
1708 : /* Merge the epsilon closure of `edest'. */
1709 1129 : re_node_set_merge (&eclosure, &eclosure_elem);
1710 : /* If the epsilon closure of `edest' is incomplete,
1711 4 : the epsilon closure of this node is also incomplete. */
1712 4 : if (dfa->eclosures[edest].nelem == 0)
1713 : {
1714 : incomplete = true;
1715 : re_node_set_free (&eclosure_elem);
1716 : }
1717 1729 : }
1718 1729 :
1719 0 : /* Epsilon closures include itself. */
1720 1729 : ok = re_node_set_insert (&eclosure, node);
1721 4 : if (BE (! ok, 0))
1722 : return REG_ESPACE;
1723 1725 : if (incomplete && !root)
1724 1729 : dfa->eclosures[node].nelem = 0;
1725 1729 : else
1726 : dfa->eclosures[node] = eclosure;
1727 : *new_set = eclosure;
1728 : return REG_NOERROR;
1729 : }
1730 :
1731 : /* Functions for token which are used in the parser. */
1732 :
1733 : /* Fetch a token from INPUT.
1734 : We must not use this function inside bracket expressions. */
1735 1489 :
1736 : static void
1737 1489 : internal_function
1738 1489 : fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1739 : {
1740 : re_string_skip_bytes (input, peek_token (result, input, syntax));
1741 : }
1742 :
1743 : /* Peek a token from INPUT, and return the length of the token.
1744 : We must not use this function inside bracket expressions. */
1745 1589 :
1746 : static int
1747 : internal_function
1748 : peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1749 1589 : {
1750 : unsigned char c;
1751 253 :
1752 253 : if (re_string_eoi (input))
1753 : {
1754 : token->type = END_OF_RE;
1755 1336 : return 0;
1756 1336 : }
1757 :
1758 1336 : c = re_string_peek_byte (input, 0);
1759 : token->opr.c = c;
1760 1336 :
1761 1336 : token->word_char = 0;
1762 0 : #ifdef RE_ENABLE_I18N
1763 : token->mb_partial = 0;
1764 0 : if (input->mb_cur_max > 1 &&
1765 0 : !re_string_first_byte (input, re_string_cur_idx (input)))
1766 0 : {
1767 : token->type = CHARACTER;
1768 : token->mb_partial = 1;
1769 1336 : return 1;
1770 : }
1771 : #endif
1772 498 : if (c == '\\')
1773 : {
1774 23 : unsigned char c2;
1775 23 : if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1776 : {
1777 : token->type = BACK_SLASH;
1778 475 : return 1;
1779 475 : }
1780 475 :
1781 : c2 = re_string_peek_byte_case (input, 1);
1782 475 : token->opr.c = c2;
1783 : token->type = CHARACTER;
1784 0 : #ifdef RE_ENABLE_I18N
1785 0 : if (input->mb_cur_max > 1)
1786 0 : {
1787 : wint_t wc = re_string_wchar_at (input,
1788 : re_string_cur_idx (input) + 1);
1789 : token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1790 475 : }
1791 : else
1792 475 : #endif
1793 : token->word_char = IS_WORD_CHAR (c2) != 0;
1794 228 :
1795 228 : switch (c2)
1796 228 : {
1797 228 : case '|':
1798 11 : if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1799 : token->type = OP_ALT;
1800 11 : break;
1801 : case '1': case '2': case '3': case '4': case '5':
1802 11 : case '6': case '7': case '8': case '9':
1803 11 : if (!(syntax & RE_NO_BK_REFS))
1804 : {
1805 11 : token->type = OP_BACK_REF;
1806 4 : token->opr.idx = c2 - '1';
1807 4 : }
1808 : break;
1809 4 : case '<':
1810 4 : if (!(syntax & RE_NO_GNU_OPS))
1811 : {
1812 4 : token->type = ANCHOR;
1813 7 : token->opr.ctx_type = WORD_FIRST;
1814 7 : }
1815 : break;
1816 7 : case '>':
1817 7 : if (!(syntax & RE_NO_GNU_OPS))
1818 : {
1819 7 : token->type = ANCHOR;
1820 6 : token->opr.ctx_type = WORD_LAST;
1821 6 : }
1822 : break;
1823 6 : case 'b':
1824 6 : if (!(syntax & RE_NO_GNU_OPS))
1825 : {
1826 6 : token->type = ANCHOR;
1827 11 : token->opr.ctx_type = WORD_DELIM;
1828 11 : }
1829 : break;
1830 11 : case 'B':
1831 11 : if (!(syntax & RE_NO_GNU_OPS))
1832 : {
1833 11 : token->type = ANCHOR;
1834 5 : token->opr.ctx_type = NOT_WORD_DELIM;
1835 5 : }
1836 5 : break;
1837 5 : case 'w':
1838 5 : if (!(syntax & RE_NO_GNU_OPS))
1839 5 : token->type = OP_WORD;
1840 5 : break;
1841 5 : case 'W':
1842 6 : if (!(syntax & RE_NO_GNU_OPS))
1843 6 : token->type = OP_NOTWORD;
1844 6 : break;
1845 6 : case 's':
1846 8 : if (!(syntax & RE_NO_GNU_OPS))
1847 8 : token->type = OP_SPACE;
1848 8 : break;
1849 8 : case 'S':
1850 4 : if (!(syntax & RE_NO_GNU_OPS))
1851 4 : token->type = OP_NOTSPACE;
1852 : break;
1853 4 : case '`':
1854 4 : if (!(syntax & RE_NO_GNU_OPS))
1855 : {
1856 4 : token->type = ANCHOR;
1857 4 : token->opr.ctx_type = BUF_FIRST;
1858 4 : }
1859 : break;
1860 4 : case '\'':
1861 4 : if (!(syntax & RE_NO_GNU_OPS))
1862 : {
1863 4 : token->type = ANCHOR;
1864 79 : token->opr.ctx_type = BUF_LAST;
1865 79 : }
1866 79 : break;
1867 79 : case '(':
1868 75 : if (!(syntax & RE_NO_BK_PARENS))
1869 75 : token->type = OP_OPEN_SUBEXP;
1870 75 : break;
1871 75 : case ')':
1872 5 : if (!(syntax & RE_NO_BK_PARENS))
1873 5 : token->type = OP_CLOSE_SUBEXP;
1874 3 : break;
1875 5 : case '+':
1876 4 : if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1877 4 : token->type = OP_DUP_PLUS;
1878 3 : break;
1879 4 : case '?':
1880 9 : if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1881 9 : token->type = OP_DUP_QUESTION;
1882 7 : break;
1883 9 : case '{':
1884 4 : if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1885 4 : token->type = OP_OPEN_DUP_NUM;
1886 2 : break;
1887 4 : case '}':
1888 0 : if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1889 0 : token->type = OP_CLOSE_DUP_NUM;
1890 : break;
1891 475 : default:
1892 : break;
1893 : }
1894 838 : return 2;
1895 : }
1896 838 :
1897 : token->type = CHARACTER;
1898 0 : #ifdef RE_ENABLE_I18N
1899 0 : if (input->mb_cur_max > 1)
1900 : {
1901 : wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1902 : token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1903 838 : }
1904 : else
1905 838 : #endif
1906 : token->word_char = IS_WORD_CHAR (token->opr.c);
1907 15 :
1908 15 : switch (c)
1909 0 : {
1910 15 : case '\n':
1911 18 : if (syntax & RE_NEWLINE_ALT)
1912 18 : token->type = OP_ALT;
1913 0 : break;
1914 18 : case '|':
1915 163 : if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1916 163 : token->type = OP_ALT;
1917 163 : break;
1918 21 : case '*':
1919 21 : token->type = OP_DUP_ASTERISK;
1920 5 : break;
1921 21 : case '+':
1922 5 : if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1923 5 : token->type = OP_DUP_PLUS;
1924 3 : break;
1925 5 : case '?':
1926 4 : if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1927 4 : token->type = OP_DUP_QUESTION;
1928 0 : break;
1929 4 : case '{':
1930 3 : if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1931 3 : token->type = OP_OPEN_DUP_NUM;
1932 0 : break;
1933 3 : case '}':
1934 5 : if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1935 5 : token->type = OP_CLOSE_DUP_NUM;
1936 0 : break;
1937 5 : case '(':
1938 3 : if (syntax & RE_NO_BK_PARENS)
1939 3 : token->type = OP_OPEN_SUBEXP;
1940 0 : break;
1941 3 : case ')':
1942 260 : if (syntax & RE_NO_BK_PARENS)
1943 260 : token->type = OP_CLOSE_SUBEXP;
1944 260 : break;
1945 5 : case '[':
1946 5 : token->type = OP_OPEN_BRACKET;
1947 5 : break;
1948 8 : case '.':
1949 11 : token->type = OP_PERIOD;
1950 3 : break;
1951 : case '^':
1952 3 : if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1953 3 : re_string_cur_idx (input) != 0)
1954 : {
1955 : char prev = re_string_peek_byte (input, -1);
1956 5 : if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1957 5 : break;
1958 5 : }
1959 107 : token->type = ANCHOR;
1960 214 : token->opr.ctx_type = LINE_FIRST;
1961 107 : break;
1962 : case '$':
1963 : if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1964 100 : re_string_cur_idx (input) + 1 != re_string_length (input))
1965 100 : {
1966 100 : re_token_t next;
1967 100 : re_string_skip_bytes (input, 1);
1968 25 : peek_token (&next, input, syntax);
1969 : re_string_skip_bytes (input, -1);
1970 82 : if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1971 82 : break;
1972 82 : }
1973 221 : token->type = ANCHOR;
1974 221 : token->opr.ctx_type = LINE_LAST;
1975 : break;
1976 841 : default:
1977 : break;
1978 : }
1979 : return 1;
1980 : }
1981 :
1982 : /* Peek a token from INPUT, and return the length of the token.
1983 : We must not use this function out of bracket expressions. */
1984 1061 :
1985 : static int
1986 : internal_function
1987 1061 : peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1988 : {
1989 23 : unsigned char c;
1990 23 : if (re_string_eoi (input))
1991 : {
1992 1038 : token->type = END_OF_RE;
1993 1038 : return 0;
1994 : }
1995 : c = re_string_peek_byte (input, 0);
1996 1038 : token->opr.c = c;
1997 0 :
1998 : #ifdef RE_ENABLE_I18N
1999 0 : if (input->mb_cur_max > 1 &&
2000 0 : !re_string_first_byte (input, re_string_cur_idx (input)))
2001 : {
2002 : token->type = CHARACTER;
2003 : return 1;
2004 1038 : }
2005 0 : #endif /* RE_ENABLE_I18N */
2006 :
2007 : if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2008 : && re_string_cur_idx (input) + 1 < re_string_length (input))
2009 0 : {
2010 0 : /* In this case, '\' escape a character. */
2011 0 : unsigned char c2;
2012 0 : re_string_skip_bytes (input, 1);
2013 0 : c2 = re_string_peek_byte (input, 0);
2014 : token->opr.c = c2;
2015 1038 : token->type = CHARACTER;
2016 : return 1;
2017 : }
2018 : if (c == '[') /* '[' is a special char in a bracket exps. */
2019 22 : {
2020 15 : unsigned char c2;
2021 : int token_len;
2022 7 : if (re_string_cur_idx (input) + 1 < re_string_length (input))
2023 22 : c2 = re_string_peek_byte (input, 1);
2024 22 : else
2025 22 : c2 = 0;
2026 : token->opr.c = c2;
2027 2 : token_len = 2;
2028 2 : switch (c2)
2029 2 : {
2030 11 : case '.':
2031 11 : token->type = OP_OPEN_COLL_ELEM;
2032 11 : break;
2033 1 : case '=':
2034 1 : token->type = OP_OPEN_EQUIV_CLASS;
2035 : break;
2036 1 : case ':':
2037 1 : if (syntax & RE_CHAR_CLASSES)
2038 : {
2039 : token->type = OP_OPEN_CHAR_CLASS;
2040 : break;
2041 8 : }
2042 8 : /* else fall through. */
2043 8 : default:
2044 8 : token->type = CHARACTER;
2045 : token->opr.c = c;
2046 22 : token_len = 1;
2047 : break;
2048 1016 : }
2049 : return token_len;
2050 19 : }
2051 19 : switch (c)
2052 19 : {
2053 287 : case '-':
2054 287 : token->type = OP_CHARSET_RANGE;
2055 287 : break;
2056 6 : case ']':
2057 6 : token->type = OP_CLOSE_BRACKET;
2058 6 : break;
2059 704 : case '^':
2060 704 : token->type = OP_NON_MATCH_LIST;
2061 : break;
2062 1016 : default:
2063 : token->type = CHARACTER;
2064 : }
2065 : return 1;
2066 : }
2067 :
2068 : /* Functions for parser. */
2069 :
2070 : /* Entry point of the parser.
2071 : Parse the regular expression REGEXP and return the structure tree.
2072 : If an error is occured, ERR is set by error code, and return NULL.
2073 : This function build the following tree, from regular expression <reg_exp>:
2074 : CAT
2075 : / \
2076 : / \
2077 : <reg_exp> EOR
2078 :
2079 : CAT means concatenation.
2080 321 : EOR means end of regular expression. */
2081 :
2082 : static bin_tree_t *
2083 321 : parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2084 : reg_errcode_t *err)
2085 : {
2086 321 : re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2087 321 : bin_tree_t *tree, *eor, *root;
2088 321 : re_token_t current_token;
2089 321 : dfa->syntax = syntax;
2090 77 : fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2091 244 : tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2092 244 : if (BE (*err != REG_NOERROR && tree == NULL, 0))
2093 209 : return NULL;
2094 : eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2095 35 : if (tree != NULL)
2096 244 : root = create_tree (dfa, tree, eor, CONCAT);
2097 : else
2098 0 : root = eor;
2099 0 : if (BE (eor == NULL || root == NULL, 0))
2100 : {
2101 244 : *err = REG_ESPACE;
2102 : return NULL;
2103 : }
2104 : return root;
2105 : }
2106 :
2107 : /* This function build the following tree, from regular expression
2108 : <branch1>|<branch2>:
2109 : ALT
2110 : / \
2111 : / \
2112 : <branch1> <branch2>
2113 :
2114 399 : ALT means alternative, which represents the operator `|'. */
2115 :
2116 : static bin_tree_t *
2117 399 : parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2118 399 : reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2119 399 : {
2120 399 : re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2121 78 : bin_tree_t *tree, *branch = NULL;
2122 : tree = parse_branch (regexp, preg, token, syntax, nest, err);
2123 794 : if (BE (*err != REG_NOERROR && tree == NULL, 0))
2124 : return NULL;
2125 154 :
2126 154 : while (token->type == OP_ALT)
2127 144 : {
2128 : fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2129 144 : if (token->type != OP_ALT && token->type != END_OF_RE
2130 286 : && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2131 2 : {
2132 : branch = parse_branch (regexp, preg, token, syntax, nest, err);
2133 : if (BE (*err != REG_NOERROR && branch == NULL, 0))
2134 10 : return NULL;
2135 152 : }
2136 152 : else
2137 : branch = NULL;
2138 0 : tree = create_tree (dfa, tree, branch, OP_ALT);
2139 0 : if (BE (tree == NULL, 0))
2140 : {
2141 : *err = REG_ESPACE;
2142 319 : return NULL;
2143 : }
2144 : }
2145 : return tree;
2146 : }
2147 :
2148 : /* This function build the following tree, from regular expression
2149 : <exp1><exp2>:
2150 : CAT
2151 : / \
2152 : / \
2153 : <exp1> <exp2>
2154 :
2155 543 : CAT means concatenation. */
2156 :
2157 : static bin_tree_t *
2158 : parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2159 543 : reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2160 543 : {
2161 543 : bin_tree_t *tree, *expr;
2162 46 : re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2163 : tree = parse_expression (regexp, preg, token, syntax, nest, err);
2164 1303 : if (BE (*err != REG_NOERROR && tree == NULL, 0))
2165 413 : return NULL;
2166 :
2167 343 : while (token->type != OP_ALT && token->type != END_OF_RE
2168 343 : && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2169 : {
2170 34 : expr = parse_expression (regexp, preg, token, syntax, nest, err);
2171 : if (BE (*err != REG_NOERROR && expr == NULL, 0))
2172 309 : {
2173 : return NULL;
2174 309 : }
2175 618 : if (tree != NULL && expr != NULL)
2176 : {
2177 0 : tree = create_tree (dfa, tree, expr, CONCAT);
2178 0 : if (tree == NULL)
2179 : {
2180 : *err = REG_ESPACE;
2181 0 : return NULL;
2182 0 : }
2183 : }
2184 : else if (tree == NULL)
2185 463 : tree = expr;
2186 : /* Otherwise expr == NULL, we don't need to create new tree. */
2187 : }
2188 : return tree;
2189 : }
2190 :
2191 : /* This function build the following tree, from regular expression a*:
2192 : *
2193 : |
2194 : a
2195 886 : */
2196 :
2197 : static bin_tree_t *
2198 886 : parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2199 : reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2200 886 : {
2201 : re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2202 321 : bin_tree_t *tree;
2203 321 : switch (token->type)
2204 321 : {
2205 : case CHARACTER:
2206 0 : tree = create_token_tree (dfa, NULL, NULL, token);
2207 0 : if (BE (tree == NULL, 0))
2208 : {
2209 : *err = REG_ESPACE;
2210 321 : return NULL;
2211 : }
2212 0 : #ifdef RE_ENABLE_I18N
2213 0 : if (dfa->mb_cur_max > 1)
2214 : {
2215 : while (!re_string_eoi (regexp)
2216 0 : && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2217 0 : {
2218 0 : bin_tree_t *mbc_remain;
2219 0 : fetch_token (token, regexp, syntax);
2220 : mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2221 0 : tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2222 0 : if (BE (mbc_remain == NULL || tree == NULL, 0))
2223 : {
2224 : *err = REG_ESPACE;
2225 : return NULL;
2226 : }
2227 321 : }
2228 78 : }
2229 78 : #endif
2230 78 : break;
2231 8 : case OP_OPEN_SUBEXP:
2232 70 : tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2233 247 : if (BE (*err != REG_NOERROR && tree == NULL, 0))
2234 247 : return NULL;
2235 247 : break;
2236 36 : case OP_OPEN_BRACKET:
2237 211 : tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2238 9 : if (BE (*err != REG_NOERROR && tree == NULL, 0))
2239 9 : return NULL;
2240 : break;
2241 9 : case OP_BACK_REF:
2242 9 : if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2243 : {
2244 0 : *err = REG_ESUBREG;
2245 0 : return NULL;
2246 0 : }
2247 : dfa->used_bkref_map |= 1 << token->opr.idx;
2248 0 : tree = create_token_tree (dfa, NULL, NULL, token);
2249 0 : if (BE (tree == NULL, 0))
2250 : {
2251 0 : *err = REG_ESPACE;
2252 0 : return NULL;
2253 0 : }
2254 3 : ++dfa->nbackref;
2255 3 : dfa->has_mb_node = 1;
2256 : break;
2257 0 : case OP_OPEN_DUP_NUM:
2258 0 : if (syntax & RE_CONTEXT_INVALID_DUP)
2259 : {
2260 : *err = REG_BADRPT;
2261 : return NULL;
2262 : }
2263 : /* FALLTHROUGH */
2264 11 : case OP_DUP_ASTERISK:
2265 : case OP_DUP_PLUS:
2266 0 : case OP_DUP_QUESTION:
2267 0 : if (syntax & RE_CONTEXT_INVALID_OPS)
2268 : {
2269 11 : *err = REG_BADRPT;
2270 : return NULL;
2271 0 : }
2272 0 : else if (syntax & RE_CONTEXT_INDEP_OPS)
2273 : {
2274 : fetch_token (token, regexp, syntax);
2275 : return parse_expression (regexp, preg, token, syntax, nest, err);
2276 19 : }
2277 4 : /* else fall through */
2278 : case OP_CLOSE_SUBEXP:
2279 4 : if ((token->type == OP_CLOSE_SUBEXP) &&
2280 4 : !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2281 : {
2282 : *err = REG_ERPAREN;
2283 : return NULL;
2284 : }
2285 : /* else fall through */
2286 : case OP_CLOSE_DUP_NUM:
2287 13 : /* We treat it as a normal character. */
2288 :
2289 : /* Then we can these characters as normal characters. */
2290 13 : token->type = CHARACTER;
2291 13 : /* mb_partial and word_char bits should be initialized already
2292 : by peek_token. */
2293 0 : tree = create_token_tree (dfa, NULL, NULL, token);
2294 0 : if (BE (tree == NULL, 0))
2295 : {
2296 13 : *err = REG_ESPACE;
2297 123 : return NULL;
2298 246 : }
2299 123 : break;
2300 28 : case ANCHOR:
2301 27 : if ((token->opr.ctx_type
2302 123 : & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2303 117 : && dfa->word_ops_used == 0)
2304 17 : init_word_char (dfa);
2305 : if (token->opr.ctx_type == WORD_DELIM
2306 17 : || token->opr.ctx_type == NOT_WORD_DELIM)
2307 : {
2308 6 : bin_tree_t *tree_first, *tree_last;
2309 6 : if (token->opr.ctx_type == WORD_DELIM)
2310 6 : {
2311 : token->opr.ctx_type = WORD_FIRST;
2312 : tree_first = create_token_tree (dfa, NULL, NULL, token);
2313 : token->opr.ctx_type = WORD_LAST;
2314 11 : }
2315 11 : else
2316 11 : {
2317 : token->opr.ctx_type = INSIDE_WORD;
2318 17 : tree_first = create_token_tree (dfa, NULL, NULL, token);
2319 17 : token->opr.ctx_type = INSIDE_NOTWORD;
2320 17 : }
2321 : tree_last = create_token_tree (dfa, NULL, NULL, token);
2322 0 : tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2323 0 : if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2324 : {
2325 : *err = REG_ESPACE;
2326 : return NULL;
2327 : }
2328 106 : }
2329 106 : else
2330 : {
2331 0 : tree = create_token_tree (dfa, NULL, NULL, token);
2332 0 : if (BE (tree == NULL, 0))
2333 : {
2334 : *err = REG_ESPACE;
2335 : return NULL;
2336 : }
2337 : }
2338 : /* We must return here, since ANCHORs can't be followed
2339 123 : by repetition operators.
2340 123 : eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2341 5 : it must not be "<ANCHOR(^)><REPEAT(*)>". */
2342 5 : fetch_token (token, regexp, syntax);
2343 5 : return tree;
2344 : case OP_PERIOD:
2345 0 : tree = create_token_tree (dfa, NULL, NULL, token);
2346 0 : if (BE (tree == NULL, 0))
2347 : {
2348 5 : *err = REG_ESPACE;
2349 0 : return NULL;
2350 5 : }
2351 10 : if (dfa->mb_cur_max > 1)
2352 : dfa->has_mb_node = 1;
2353 10 : break;
2354 : case OP_WORD:
2355 : case OP_NOTWORD:
2356 10 : tree = build_charclass_op (dfa, regexp->trans,
2357 10 : (const unsigned char *) "alnum",
2358 0 : (const unsigned char *) "_",
2359 10 : token->type == OP_NOTWORD, err);
2360 14 : if (BE (*err != REG_NOERROR && tree == NULL, 0))
2361 : return NULL;
2362 14 : break;
2363 : case OP_SPACE:
2364 : case OP_NOTSPACE:
2365 14 : tree = build_charclass_op (dfa, regexp->trans,
2366 14 : (const unsigned char *) "space",
2367 0 : (const unsigned char *) "",
2368 14 : token->type == OP_NOTSPACE, err);
2369 43 : if (BE (*err != REG_NOERROR && tree == NULL, 0))
2370 : return NULL;
2371 43 : break;
2372 19 : case OP_ALT:
2373 19 : case END_OF_RE:
2374 19 : return NULL;
2375 0 : case BACK_SLASH:
2376 : *err = REG_EESCAPE;
2377 : return NULL;
2378 : default:
2379 : /* Must not happen? */
2380 0 : #ifdef DEBUG
2381 : assert (0);
2382 644 : #endif
2383 : return NULL;
2384 644 : }
2385 647 : fetch_token (token, regexp, syntax);
2386 :
2387 165 : while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2388 165 : || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2389 4 : {
2390 : tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2391 161 : if (BE (*err != REG_NOERROR && tree == NULL, 0))
2392 0 : return NULL;
2393 0 : /* In BRE consecutive duplications are not allowed. */
2394 : if ((syntax & RE_CONTEXT_INVALID_DUP)
2395 0 : && (token->type == OP_DUP_ASTERISK
2396 0 : || token->type == OP_OPEN_DUP_NUM))
2397 : {
2398 : *err = REG_BADRPT;
2399 : return NULL;
2400 640 : }
2401 : }
2402 :
2403 : return tree;
2404 : }
2405 :
2406 : /* This function build the following tree, from regular expression
2407 : (<reg_exp>):
2408 : SUBEXP
2409 : |
2410 : <reg_exp>
2411 78 : */
2412 :
2413 : static bin_tree_t *
2414 78 : parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2415 : reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2416 : {
2417 78 : re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2418 : bin_tree_t *tree;
2419 78 : size_t cur_nsub;
2420 : cur_nsub = preg->re_nsub++;
2421 :
2422 78 : fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2423 0 :
2424 : /* The subexpression may be a null string. */
2425 : if (token->type == OP_CLOSE_SUBEXP)
2426 78 : tree = NULL;
2427 78 : else
2428 5 : {
2429 78 : tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2430 8 : if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2431 : *err = REG_EPAREN;
2432 : if (BE (*err != REG_NOERROR, 0))
2433 70 : return NULL;
2434 70 : }
2435 :
2436 70 : if (cur_nsub <= '9' - '1')
2437 70 : dfa->completed_bkref_map |= 1 << cur_nsub;
2438 :
2439 0 : tree = create_tree (dfa, tree, NULL, SUBEXP);
2440 0 : if (BE (tree == NULL, 0))
2441 : {
2442 70 : *err = REG_ESPACE;
2443 70 : return NULL;
2444 : }
2445 : tree->token.opr.idx = cur_nsub;
2446 : return tree;
2447 : }
2448 :
2449 165 : /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2450 :
2451 : static bin_tree_t *
2452 165 : parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2453 165 : re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2454 165 : {
2455 : bin_tree_t *tree = NULL, *old_tree = NULL;
2456 165 : Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2457 : re_token_t start_token = *token;
2458 4 :
2459 4 : if (token->type == OP_OPEN_DUP_NUM)
2460 4 : {
2461 : end = 0;
2462 0 : start = fetch_number (regexp, token, syntax);
2463 0 : if (start == REG_MISSING)
2464 : {
2465 : if (token->type == CHARACTER && token->opr.c == ',')
2466 0 : start = 0; /* We treat "{,m}" as "{0,m}". */
2467 0 : else
2468 : {
2469 : *err = REG_BADBR; /* <re>{} is invalid. */
2470 4 : return NULL;
2471 : }
2472 : }
2473 0 : if (BE (start != REG_ERROR, 1))
2474 0 : {
2475 0 : /* We treat "{n}" as "{n,n}". */
2476 : end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2477 4 : : ((token->type == CHARACTER && token->opr.c == ',')
2478 : ? fetch_number (regexp, token, syntax) : REG_ERROR));
2479 : }
2480 4 : if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2481 : {
2482 4 : /* Invalid sequence. */
2483 4 : if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2484 : {
2485 0 : if (token->type == END_OF_RE)
2486 : *err = REG_EBRACE;
2487 4 : else
2488 : *err = REG_BADBR;
2489 :
2490 : return NULL;
2491 0 : }
2492 0 :
2493 0 : /* If the syntax bit is set, rollback. */
2494 : re_string_set_index (regexp, start_idx);
2495 : *token = start_token;
2496 0 : token->type = CHARACTER;
2497 : /* mb_partial and word_char bits should be already initialized by
2498 : peek_token. */
2499 0 : return elem;
2500 : }
2501 :
2502 0 : if (BE (end != REG_MISSING && start > end, 0))
2503 0 : {
2504 : /* First number greater than second. */
2505 : *err = REG_BADBR;
2506 : return NULL;
2507 : }
2508 161 : }
2509 161 : else
2510 : {
2511 : start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2512 161 : end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2513 : }
2514 161 :
2515 0 : fetch_token (token, regexp, syntax);
2516 161 :
2517 : if (BE (elem == NULL, 0))
2518 0 : return NULL;
2519 0 : if (BE (start == 0 && end == 0, 0))
2520 : {
2521 : postorder (elem, free_tree, NULL);
2522 : return NULL;
2523 161 : }
2524 :
2525 5 : /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2526 5 : if (BE (start > 0, 0))
2527 : {
2528 0 : tree = elem;
2529 0 : for (i = 2; i <= start; ++i)
2530 0 : {
2531 : elem = duplicate_tree (elem, dfa);
2532 : tree = create_tree (dfa, tree, elem, CONCAT);
2533 : if (BE (elem == NULL || tree == NULL, 0))
2534 5 : goto parse_dup_op_espace;
2535 0 : }
2536 :
2537 : if (start == end)
2538 5 : return tree;
2539 5 :
2540 : /* Duplicate ELEM before it is marked optional. */
2541 : elem = duplicate_tree (elem, dfa);
2542 156 : old_tree = tree;
2543 : }
2544 161 : else
2545 0 : old_tree = NULL;
2546 :
2547 161 : if (elem->token.type == SUBEXP)
2548 : postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2549 161 :
2550 0 : tree = create_tree (dfa, elem, NULL,
2551 : (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2552 : if (BE (tree == NULL, 0))
2553 : goto parse_dup_op_espace;
2554 :
2555 161 : /* This loop is actually executed only when end != REG_MISSING,
2556 3 : to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2557 : already created the start+1-th copy. */
2558 0 : if ((Idx) -1 < 0 || end != REG_MISSING)
2559 0 : for (i = start + 2; i <= end; ++i)
2560 0 : {
2561 : elem = duplicate_tree (elem, dfa);
2562 : tree = create_tree (dfa, tree, elem, CONCAT);
2563 0 : if (BE (elem == NULL || tree == NULL, 0))
2564 0 : goto parse_dup_op_espace;
2565 0 :
2566 : tree = create_tree (dfa, tree, NULL, OP_ALT);
2567 : if (BE (tree == NULL, 0))
2568 161 : goto parse_dup_op_espace;
2569 5 : }
2570 :
2571 161 : if (old_tree)
2572 : tree = create_tree (dfa, old_tree, tree, CONCAT);
2573 0 :
2574 0 : return tree;
2575 0 :
2576 : parse_dup_op_espace:
2577 : *err = REG_ESPACE;
2578 : return NULL;
2579 : }
2580 :
2581 : /* Size of the names for collating symbol/equivalence_class/character_class.
2582 : I'm not sure, but maybe enough. */
2583 : #define BRACKET_NAME_BUF_SIZE 32
2584 :
2585 : #ifndef _LIBC
2586 : /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2587 : Build the range expression which starts from START_ELEM, and ends
2588 : at END_ELEM. The result are written to MBCSET and SBCSET.
2589 : RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2590 : mbcset->range_ends, is a pointer argument sinse we may
2591 : update it. */
2592 :
2593 4 : static reg_errcode_t
2594 : internal_function
2595 : # ifdef RE_ENABLE_I18N
2596 : build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2597 : bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2598 : # else /* not RE_ENABLE_I18N */
2599 : build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2600 : bracket_elem_t *end_elem)
2601 : # endif /* not RE_ENABLE_I18N */
2602 4 : {
2603 : unsigned int start_ch, end_ch;
2604 : /* Equivalence Classes and Character Classes can't be a range start/end. */
2605 0 : if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2606 : || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2607 : 0))
2608 : return REG_ERANGE;
2609 4 :
2610 : /* We can handle no multi character collating elements without libc
2611 : support. */
2612 : if (BE ((start_elem->type == COLL_SYM
2613 0 : && strlen ((char *) start_elem->opr.name) > 1)
2614 : || (end_elem->type == COLL_SYM
2615 : && strlen ((char *) end_elem->opr.name) > 1), 0))
2616 : return REG_ECOLLATE;
2617 :
2618 : # ifdef RE_ENABLE_I18N
2619 : {
2620 4 : wchar_t wc;
2621 : wint_t start_wc;
2622 12 : wint_t end_wc;
2623 8 : wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2624 :
2625 12 : start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2626 8 : : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2627 : : 0));
2628 8 : end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2629 8 : : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2630 8 : : 0));
2631 8 : start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2632 4 : ? __btowc (start_ch) : start_elem->opr.wch);
2633 3 : end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2634 3 : ? __btowc (end_ch) : end_elem->opr.wch);
2635 3 : if (start_wc == WEOF || end_wc == WEOF)
2636 3 : return REG_ECOLLATE;
2637 1 : cmp_buf[0] = start_wc;
2638 : cmp_buf[4] = end_wc;
2639 : if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2640 : return REG_ERANGE;
2641 :
2642 : /* Got valid collation sequence values, add them as a new entry.
2643 : However, for !_LIBC we have no collation elements: if the
2644 2 : character set is single byte, the single byte character set
2645 : that we build below suffices. parse_bracket_exp passes
2646 : no MBCSET if dfa->mb_cur_max == 1. */
2647 0 : if (mbcset)
2648 : {
2649 : /* Check the space of the arrays. */
2650 : if (BE (*range_alloc == mbcset->nranges, 0))
2651 : {
2652 : /* There is not enough space, need realloc. */
2653 : wchar_t *new_array_start, *new_array_end;
2654 0 : Idx new_nranges;
2655 :
2656 : /* +1 in case of mbcset->nranges is 0. */
2657 0 : new_nranges = 2 * mbcset->nranges + 1;
2658 : /* Use realloc since mbcset->range_starts and mbcset->range_ends
2659 0 : are NULL if *range_alloc == 0. */
2660 : new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2661 : new_nranges);
2662 0 : new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2663 0 : new_nranges);
2664 :
2665 0 : if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2666 0 : return REG_ESPACE;
2667 0 :
2668 : mbcset->range_starts = new_array_start;
2669 : mbcset->range_ends = new_array_end;
2670 0 : *range_alloc = new_nranges;
2671 0 : }
2672 :
2673 : mbcset->range_starts[mbcset->nranges] = start_wc;
2674 : mbcset->range_ends[mbcset->nranges++] = end_wc;
2675 514 : }
2676 :
2677 512 : /* Build the table for single byte characters. */
2678 512 : for (wc = 0; wc < SBC_MAX; ++wc)
2679 422 : {
2680 48 : cmp_buf[2] = wc;
2681 : if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2682 : && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2683 : bitset_set (sbcset, wc);
2684 : }
2685 : }
2686 : # else /* not RE_ENABLE_I18N */
2687 : {
2688 : unsigned int ch;
2689 : start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2690 : : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2691 : : 0));
2692 : end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2693 : : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2694 : : 0));
2695 : if (start_ch > end_ch)
2696 : return REG_ERANGE;
2697 : /* Build the table for single byte characters. */
2698 : for (ch = 0; ch < SBC_MAX; ++ch)
2699 : if (start_ch <= ch && ch <= end_ch)
2700 2 : bitset_set (sbcset, ch);
2701 : }
2702 : # endif /* not RE_ENABLE_I18N */
2703 : return REG_NOERROR;
2704 : }
2705 : #endif /* not _LIBC */
2706 :
2707 : #ifndef _LIBC
2708 : /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2709 : Build the collating element which is represented by NAME.
2710 : The result are written to MBCSET and SBCSET.
2711 : COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2712 : pointer argument since we may update it. */
2713 0 :
2714 : static reg_errcode_t
2715 : internal_function
2716 : build_collating_symbol (bitset_t sbcset,
2717 : # ifdef RE_ENABLE_I18N
2718 : re_charset_t *mbcset, Idx *coll_sym_alloc,
2719 0 : # endif
2720 0 : const unsigned char *name)
2721 0 : {
2722 : size_t name_len = strlen ((const char *) name);
2723 : if (BE (name_len != 1, 0))
2724 0 : return REG_ECOLLATE;
2725 0 : else
2726 : {
2727 : bitset_set (sbcset, name[0]);
2728 : return REG_NOERROR;
2729 : }
2730 : }
2731 : #endif /* not _LIBC */
2732 :
2733 : /* This function parse bracket expression like "[abc]", "[a-c]",
2734 247 : "[[.a-a.]]" etc. */
2735 :
2736 : static bin_tree_t *
2737 : parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2738 : reg_syntax_t syntax, reg_errcode_t *err)
2739 : {
2740 : #ifdef _LIBC
2741 : const unsigned char *collseqmb;
2742 : const char *collseqwc;
2743 : uint32_t nrules;
2744 : int32_t table_size;
2745 : const int32_t *symb_table;
2746 : const unsigned char *extra;
2747 :
2748 : /* Local function for parse_bracket_exp used in _LIBC environement.
2749 : Seek the collating symbol entry correspondings to NAME.
2750 : Return the index of the symbol in the SYMB_TABLE. */
2751 :
2752 : auto inline int32_t
2753 : __attribute ((always_inline))
2754 : seek_collating_symbol_entry (name, name_len)
2755 : const unsigned char *name;
2756 : size_t name_len;
2757 : {
2758 : int32_t hash = elem_hash ((const char *) name, name_len);
2759 : int32_t elem = hash % table_size;
2760 : if (symb_table[2 * elem] != 0)
2761 : {
2762 : int32_t second = hash % (table_size - 2) + 1;
2763 :
2764 : do
2765 : {
2766 : /* First compare the hashing value. */
2767 : if (symb_table[2 * elem] == hash
2768 : /* Compare the length of the name. */
2769 : && name_len == extra[symb_table[2 * elem + 1]]
2770 : /* Compare the name. */
2771 : && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2772 : name_len) == 0)
2773 : {
2774 : /* Yep, this is the entry. */
2775 : break;
2776 : }
2777 :
2778 : /* Next entry. */
2779 : elem += second;
2780 : }
2781 : while (symb_table[2 * elem] != 0);
2782 : }
2783 : return elem;
2784 : }
2785 :
2786 : /* Local function for parse_bracket_exp used in _LIBC environement.
2787 : Look up the collation sequence value of BR_ELEM.
2788 : Return the value if succeeded, UINT_MAX otherwise. */
2789 :
2790 : auto inline unsigned int
2791 : __attribute ((always_inline))
2792 : lookup_collation_sequence_value (br_elem)
2793 : bracket_elem_t *br_elem;
2794 : {
2795 : if (br_elem->type == SB_CHAR)
2796 : {
2797 : /*
2798 : if (MB_CUR_MAX == 1)
2799 : */
2800 : if (nrules == 0)
2801 : return collseqmb[br_elem->opr.ch];
2802 : else
2803 : {
2804 : wint_t wc = __btowc (br_elem->opr.ch);
2805 : return __collseq_table_lookup (collseqwc, wc);
2806 : }
2807 : }
2808 : else if (br_elem->type == MB_CHAR)
2809 : {
2810 : return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2811 : }
2812 : else if (br_elem->type == COLL_SYM)
2813 : {
2814 : size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2815 : if (nrules != 0)
2816 : {
2817 : int32_t elem, idx;
2818 : elem = seek_collating_symbol_entry (br_elem->opr.name,
2819 : sym_name_len);
2820 : if (symb_table[2 * elem] != 0)
2821 : {
2822 : /* We found the entry. */
2823 : idx = symb_table[2 * elem + 1];
2824 : /* Skip the name of collating element name. */
2825 : idx += 1 + extra[idx];
2826 : /* Skip the byte sequence of the collating element. */
2827 : idx += 1 + extra[idx];
2828 : /* Adjust for the alignment. */
2829 : idx = (idx + 3) & ~3;
2830 : /* Skip the multibyte collation sequence value. */
2831 : idx += sizeof (unsigned int);
2832 : /* Skip the wide char sequence of the collating element. */
2833 : idx += sizeof (unsigned int) *
2834 : (1 + *(unsigned int *) (extra + idx));
2835 : /* Return the collation sequence value. */
2836 : return *(unsigned int *) (extra + idx);
2837 : }
2838 : else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2839 : {
2840 : /* No valid character. Match it as a single byte
2841 : character. */
2842 : return collseqmb[br_elem->opr.name[0]];
2843 : }
2844 : }
2845 : else if (sym_name_len == 1)
2846 : return collseqmb[br_elem->opr.name[0]];
2847 : }
2848 : return UINT_MAX;
2849 : }
2850 :
2851 : /* Local function for parse_bracket_exp used in _LIBC environement.
2852 : Build the range expression which starts from START_ELEM, and ends
2853 : at END_ELEM. The result are written to MBCSET and SBCSET.
2854 : RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2855 : mbcset->range_ends, is a pointer argument sinse we may
2856 : update it. */
2857 :
2858 : auto inline reg_errcode_t
2859 : __attribute ((always_inline))
2860 : build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2861 : re_charset_t *mbcset;
2862 : Idx *range_alloc;
2863 : bitset_t sbcset;
2864 : bracket_elem_t *start_elem, *end_elem;
2865 : {
2866 : unsigned int ch;
2867 : uint32_t start_collseq;
2868 : uint32_t end_collseq;
2869 :
2870 : /* Equivalence Classes and Character Classes can't be a range
2871 : start/end. */
2872 : if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2873 : || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2874 : 0))
2875 : return REG_ERANGE;
2876 :
2877 : start_collseq = lookup_collation_sequence_value (start_elem);
2878 : end_collseq = lookup_collation_sequence_value (end_elem);
2879 : /* Check start/end collation sequence values. */
2880 : if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2881 : return REG_ECOLLATE;
2882 : if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2883 : return REG_ERANGE;
2884 :
2885 : /* Got valid collation sequence values, add them as a new entry.
2886 : However, if we have no collation elements, and the character set
2887 : is single byte, the single byte character set that we
2888 : build below suffices. */
2889 : if (nrules > 0 || dfa->mb_cur_max > 1)
2890 : {
2891 : /* Check the space of the arrays. */
2892 : if (BE (*range_alloc == mbcset->nranges, 0))
2893 : {
2894 : /* There is not enough space, need realloc. */
2895 : uint32_t *new_array_start;
2896 : uint32_t *new_array_end;
2897 : Idx new_nranges;
2898 :
2899 : /* +1 in case of mbcset->nranges is 0. */
2900 : new_nranges = 2 * mbcset->nranges + 1;
2901 : new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2902 : new_nranges);
2903 : new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2904 : new_nranges);
2905 :
2906 : if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2907 : return REG_ESPACE;
2908 :
2909 : mbcset->range_starts = new_array_start;
2910 : mbcset->range_ends = new_array_end;
2911 : *range_alloc = new_nranges;
2912 : }
2913 :
2914 : mbcset->range_starts[mbcset->nranges] = start_collseq;
2915 : mbcset->range_ends[mbcset->nranges++] = end_collseq;
2916 : }
2917 :
2918 : /* Build the table for single byte characters. */
2919 : for (ch = 0; ch < SBC_MAX; ch++)
2920 : {
2921 : uint32_t ch_collseq;
2922 : /*
2923 : if (MB_CUR_MAX == 1)
2924 : */
2925 : if (nrules == 0)
2926 : ch_collseq = collseqmb[ch];
2927 : else
2928 : ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2929 : if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2930 : bitset_set (sbcset, ch);
2931 : }
2932 : return REG_NOERROR;
2933 : }
2934 :
2935 : /* Local function for parse_bracket_exp used in _LIBC environement.
2936 : Build the collating element which is represented by NAME.
2937 : The result are written to MBCSET and SBCSET.
2938 : COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2939 : pointer argument sinse we may update it. */
2940 :
2941 : auto inline reg_errcode_t
2942 : __attribute ((always_inline))
2943 : build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2944 : re_charset_t *mbcset;
2945 : Idx *coll_sym_alloc;
2946 : bitset_t sbcset;
2947 : const unsigned char *name;
2948 : {
2949 : int32_t elem, idx;
2950 : size_t name_len = strlen ((const char *) name);
2951 : if (nrules != 0)
2952 : {
2953 : elem = seek_collating_symbol_entry (name, name_len);
2954 : if (symb_table[2 * elem] != 0)
2955 : {
2956 : /* We found the entry. */
2957 : idx = symb_table[2 * elem + 1];
2958 : /* Skip the name of collating element name. */
2959 : idx += 1 + extra[idx];
2960 : }
2961 : else if (symb_table[2 * elem] == 0 && name_len == 1)
2962 : {
2963 : /* No valid character, treat it as a normal
2964 : character. */
2965 : bitset_set (sbcset, name[0]);
2966 : return REG_NOERROR;
2967 : }
2968 : else
2969 : return REG_ECOLLATE;
2970 :
2971 : /* Got valid collation sequence, add it as a new entry. */
2972 : /* Check the space of the arrays. */
2973 : if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2974 : {
2975 : /* Not enough, realloc it. */
2976 : /* +1 in case of mbcset->ncoll_syms is 0. */
2977 : Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2978 : /* Use realloc since mbcset->coll_syms is NULL
2979 : if *alloc == 0. */
2980 : int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2981 : new_coll_sym_alloc);
2982 : if (BE (new_coll_syms == NULL, 0))
2983 : return REG_ESPACE;
2984 : mbcset->coll_syms = new_coll_syms;
2985 : *coll_sym_alloc = new_coll_sym_alloc;
2986 : }
2987 : mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2988 : return REG_NOERROR;
2989 : }
2990 : else
2991 : {
2992 : if (BE (name_len != 1, 0))
2993 : return REG_ECOLLATE;
2994 : else
2995 : {
2996 : bitset_set (sbcset, name[0]);
2997 : return REG_NOERROR;
2998 : }
2999 : }
3000 : }
3001 : #endif
3002 :
3003 : re_token_t br_token;
3004 247 : re_bitset_ptr_t sbcset;
3005 247 : #ifdef RE_ENABLE_I18N
3006 : re_charset_t *mbcset;
3007 247 : Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3008 : Idx equiv_class_alloc = 0, char_class_alloc = 0;
3009 : #endif /* not RE_ENABLE_I18N */
3010 247 : bool non_match = false;
3011 : bin_tree_t *work_tree;
3012 : int token_len;
3013 : bool first_round = true;
3014 : #ifdef _LIBC
3015 : collseqmb = (const unsigned char *)
3016 : _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3017 : nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3018 : if (nrules)
3019 : {
3020 : /*
3021 : if (MB_CUR_MAX > 1)
3022 : */
3023 : collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3024 : table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3025 : symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3026 : _NL_COLLATE_SYMB_TABLEMB);
3027 : extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3028 247 : _NL_COLLATE_SYMB_EXTRAMB);
3029 : }
3030 247 : #endif
3031 : sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3032 : #ifdef RE_ENABLE_I18N
3033 247 : mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3034 : #endif /* RE_ENABLE_I18N */
3035 : #ifdef RE_ENABLE_I18N
3036 : if (BE (sbcset == NULL || mbcset == NULL, 0))
3037 : #else
3038 0 : if (BE (sbcset == NULL, 0))
3039 0 : #endif /* RE_ENABLE_I18N */
3040 : {
3041 : *err = REG_ESPACE;
3042 247 : return NULL;
3043 247 : }
3044 :
3045 3 : token_len = peek_token_bracket (token, regexp, syntax);
3046 3 : if (BE (token->type == END_OF_RE, 0))
3047 : {
3048 244 : *err = REG_BADPAT;
3049 : goto parse_bracket_exp_free_return;
3050 : }
3051 6 : if (token->type == OP_NON_MATCH_LIST)
3052 : {
3053 6 : #ifdef RE_ENABLE_I18N
3054 6 : mbcset->non_match = 1;
3055 0 : #endif /* not RE_ENABLE_I18N */
3056 6 : non_match = true;
3057 6 : if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3058 6 : bitset_set (sbcset, '\n');
3059 : re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3060 3 : token_len = peek_token_bracket (token, regexp, syntax);
3061 3 : if (BE (token->type == END_OF_RE, 0))
3062 : {
3063 : *err = REG_BADPAT;
3064 : goto parse_bracket_exp_free_return;
3065 : }
3066 241 : }
3067 75 :
3068 : /* We treat the first ']' as a normal character. */
3069 : if (token->type == OP_CLOSE_BRACKET)
3070 569 : token->type = CHARACTER;
3071 :
3072 : while (1)
3073 : {
3074 : bracket_elem_t start_elem, end_elem;
3075 810 : unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3076 810 : unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3077 : reg_errcode_t ret;
3078 : int token_len2 = 0;
3079 810 : bool is_range_exp = false;
3080 810 : re_token_t token2;
3081 :
3082 810 : start_elem.opr.name = start_name_buf;
3083 : ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3084 13 : syntax, first_round);
3085 43 : if (BE (ret != REG_NOERROR, 0))
3086 : {
3087 797 : *err = ret;
3088 : goto parse_bracket_exp_free_return;
3089 : }
3090 797 : first_round = false;
3091 :
3092 : /* Get information about the next token. We need it in any case. */
3093 797 : token_len = peek_token_bracket (token, regexp, syntax);
3094 :
3095 796 : /* Do not check for ranges if we know they are not allowed. */
3096 : if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3097 10 : {
3098 10 : if (BE (token->type == END_OF_RE, 0))
3099 : {
3100 786 : *err = REG_EBRACK;
3101 : goto parse_bracket_exp_free_return;
3102 7 : }
3103 7 : if (token->type == OP_CHARSET_RANGE)
3104 7 : {
3105 : re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3106 2 : token_len2 = peek_token_bracket (&token2, regexp, syntax);
3107 2 : if (BE (token2.type == END_OF_RE, 0))
3108 : {
3109 5 : *err = REG_EBRACK;
3110 : goto parse_bracket_exp_free_return;
3111 : }
3112 1 : if (token2.type == OP_CLOSE_BRACKET)
3113 1 : {
3114 : /* We treat the last '-' as a normal character. */
3115 : re_string_skip_bytes (regexp, -token_len);
3116 4 : token->type = CHARACTER;
3117 : }
3118 : else
3119 : is_range_exp = true;
3120 785 : }
3121 : }
3122 4 :
3123 4 : if (is_range_exp == true)
3124 : {
3125 4 : end_elem.opr.name = end_name_buf;
3126 : ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3127 0 : dfa, syntax, true);
3128 0 : if (BE (ret != REG_NOERROR, 0))
3129 : {
3130 : *err = ret;
3131 4 : goto parse_bracket_exp_free_return;
3132 : }
3133 :
3134 : token_len = peek_token_bracket (token, regexp, syntax);
3135 :
3136 : #ifdef _LIBC
3137 : *err = build_range_exp (sbcset, mbcset, &range_alloc,
3138 4 : &start_elem, &end_elem);
3139 4 : #else
3140 : # ifdef RE_ENABLE_I18N
3141 : *err = build_range_exp (sbcset,
3142 : dfa->mb_cur_max > 1 ? mbcset : NULL,
3143 : &range_alloc, &start_elem, &end_elem);
3144 : # else
3145 4 : *err = build_range_exp (sbcset, &start_elem, &end_elem);
3146 2 : # endif
3147 : #endif /* RE_ENABLE_I18N */
3148 : if (BE (*err != REG_NOERROR, 0))
3149 : goto parse_bracket_exp_free_return;
3150 781 : }
3151 : else
3152 780 : {
3153 780 : switch (start_elem.type)
3154 780 : {
3155 : case SB_CHAR:
3156 0 : bitset_set (sbcset, start_elem.opr.ch);
3157 : break;
3158 0 : #ifdef RE_ENABLE_I18N
3159 : case MB_CHAR:
3160 : /* Check whether the array has enough space. */
3161 : if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3162 : {
3163 0 : wchar_t *new_mbchars;
3164 : /* Not enough, realloc it. */
3165 0 : /* +1 in case of mbcset->nmbchars is 0. */
3166 : mbchar_alloc = 2 * mbcset->nmbchars + 1;
3167 0 : /* Use realloc since array is NULL if *alloc == 0. */
3168 0 : new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3169 0 : mbchar_alloc);
3170 : if (BE (new_mbchars == NULL, 0))
3171 0 : goto parse_bracket_exp_espace;
3172 0 : mbcset->mbchars = new_mbchars;
3173 : }
3174 1 : mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3175 1 : break;
3176 : #endif /* RE_ENABLE_I18N */
3177 : case EQUIV_CLASS:
3178 : *err = build_equiv_class (sbcset,
3179 1 : #ifdef RE_ENABLE_I18N
3180 1 : mbcset, &equiv_class_alloc,
3181 1 : #endif /* RE_ENABLE_I18N */
3182 0 : start_elem.opr.name);
3183 0 : if (BE (*err != REG_NOERROR, 0))
3184 0 : goto parse_bracket_exp_free_return;
3185 : break;
3186 : case COLL_SYM:
3187 : *err = build_collating_symbol (sbcset,
3188 0 : #ifdef RE_ENABLE_I18N
3189 0 : mbcset, &coll_sym_alloc,
3190 0 : #endif /* RE_ENABLE_I18N */
3191 0 : start_elem.opr.name);
3192 0 : if (BE (*err != REG_NOERROR, 0))
3193 0 : goto parse_bracket_exp_free_return;
3194 : break;
3195 : case CHAR_CLASS:
3196 : *err = build_charclass (regexp->trans, sbcset,
3197 0 : #ifdef RE_ENABLE_I18N
3198 0 : mbcset, &char_class_alloc,
3199 0 : #endif /* RE_ENABLE_I18N */
3200 0 : start_elem.opr.name, syntax);
3201 0 : if (BE (*err != REG_NOERROR, 0))
3202 0 : goto parse_bracket_exp_free_return;
3203 : break;
3204 : default:
3205 : assert (0);
3206 782 : break;
3207 : }
3208 2 : }
3209 2 : if (BE (token->type == END_OF_RE, 0))
3210 : {
3211 780 : *err = REG_EBRACK;
3212 211 : goto parse_bracket_exp_free_return;
3213 : }
3214 : if (token->type == OP_CLOSE_BRACKET)
3215 211 : break;
3216 : }
3217 :
3218 211 : re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3219 0 :
3220 : /* If it is non-matching list. */
3221 : if (non_match)
3222 : bitset_not (sbcset);
3223 211 :
3224 0 : #ifdef RE_ENABLE_I18N
3225 : /* Ensure only single byte characters are set. */
3226 211 : if (dfa->mb_cur_max > 1)
3227 211 : bitset_mask (sbcset, dfa->sb_char);
3228 0 :
3229 0 : if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3230 : || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3231 : || mbcset->non_match)))
3232 : {
3233 0 : bin_tree_t *mbc_tree;
3234 0 : int sbc_idx;
3235 0 : /* Build a tree for complex bracket. */
3236 0 : dfa->has_mb_node = 1;
3237 0 : br_token.type = COMPLEX_BRACKET;
3238 0 : br_token.opr.mbcset = mbcset;
3239 0 : mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3240 0 : if (BE (mbc_tree == NULL, 0))
3241 0 : goto parse_bracket_exp_espace;
3242 : for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3243 : if (sbcset[sbc_idx])
3244 0 : break;
3245 : /* If there are no bits set in sbcset, there is no point
3246 : of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3247 0 : if (sbc_idx < BITSET_WORDS)
3248 0 : {
3249 0 : /* Build a tree for simple bracket. */
3250 0 : br_token.type = SIMPLE_BRACKET;
3251 0 : br_token.opr.sbcset = sbcset;
3252 : work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3253 : if (BE (work_tree == NULL, 0))
3254 0 : goto parse_bracket_exp_espace;
3255 0 :
3256 0 : /* Then join them by ALT node. */
3257 : work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3258 : if (BE (work_tree == NULL, 0))
3259 : goto parse_bracket_exp_espace;
3260 0 : }
3261 0 : else
3262 : {
3263 : re_free (sbcset);
3264 : work_tree = mbc_tree;
3265 : }
3266 : }
3267 : else
3268 211 : #endif /* not RE_ENABLE_I18N */
3269 : {
3270 : #ifdef RE_ENABLE_I18N
3271 211 : free_charset (mbcset);
3272 211 : #endif
3273 211 : /* Build a tree for simple bracket. */
3274 211 : br_token.type = SIMPLE_BRACKET;
3275 0 : br_token.opr.sbcset = sbcset;
3276 : work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3277 211 : if (BE (work_tree == NULL, 0))
3278 : goto parse_bracket_exp_espace;
3279 0 : }
3280 0 : return work_tree;
3281 36 :
3282 36 : parse_bracket_exp_espace:
3283 : *err = REG_ESPACE;
3284 36 : parse_bracket_exp_free_return:
3285 : re_free (sbcset);
3286 36 : #ifdef RE_ENABLE_I18N
3287 : free_charset (mbcset);
3288 : #endif /* RE_ENABLE_I18N */
3289 : return NULL;
3290 : }
3291 :
3292 814 : /* Parse an element in the bracket expression. */
3293 :
3294 : static reg_errcode_t
3295 : parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3296 : re_token_t *token, int token_len, re_dfa_t *dfa,
3297 : reg_syntax_t syntax, bool accept_hyphen)
3298 814 : {
3299 814 : #ifdef RE_ENABLE_I18N
3300 : int cur_char_size;
3301 0 : cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3302 0 : if (cur_char_size > 1)
3303 0 : {
3304 0 : elem->type = MB_CHAR;
3305 : elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3306 : re_string_skip_bytes (regexp, cur_char_size);
3307 814 : return REG_NOERROR;
3308 814 : }
3309 811 : #endif /* RE_ENABLE_I18N */
3310 14 : re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3311 800 : if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3312 : || token->type == OP_OPEN_EQUIV_CLASS)
3313 : return parse_bracket_symbol (elem, regexp, token);
3314 : if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3315 : {
3316 0 : /* A '-' must only appear as anything but a range indicator before
3317 0 : the closing bracket. Everything else is an error. */
3318 : re_token_t token2;
3319 : (void) peek_token_bracket (&token2, regexp, syntax);
3320 0 : if (token2.type != OP_CLOSE_BRACKET)
3321 : /* The actual error value is not standardized since this whole
3322 800 : case is undefined. But ERANGE makes good sense. */
3323 800 : return REG_ERANGE;
3324 800 : }
3325 : elem->type = SB_CHAR;
3326 : elem->opr.ch = token->opr.c;
3327 : return REG_NOERROR;
3328 : }
3329 :
3330 : /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3331 : such as [:<character_class>:], [.<collating_element>.], and
3332 14 : [=<equivalent_class>=]. */
3333 :
3334 : static reg_errcode_t
3335 14 : parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3336 14 : re_token_t *token)
3337 14 : {
3338 8 : unsigned char ch, delim = token->opr.c;
3339 2 : int i = 0;
3340 : if (re_string_eoi(regexp))
3341 10 : return REG_EBRACK;
3342 0 : for (;; ++i)
3343 8 : {
3344 1 : if (i >= BRACKET_NAME_BUF_SIZE)
3345 : return REG_EBRACK;
3346 7 : if (token->type == OP_OPEN_CHAR_CLASS)
3347 8 : ch = re_string_fetch_byte_case (regexp);
3348 5 : else
3349 3 : ch = re_string_fetch_byte (regexp);
3350 1 : if (re_string_eoi(regexp))
3351 2 : return REG_EBRACK;
3352 : if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3353 1 : break;
3354 1 : elem->opr.name[i] = ch;
3355 1 : }
3356 : re_string_skip_bytes (regexp, 1);
3357 0 : elem->opr.name[i] = '\0';
3358 0 : switch (token->type)
3359 0 : {
3360 1 : case OP_OPEN_COLL_ELEM:
3361 1 : elem->type = COLL_SYM;
3362 1 : break;
3363 0 : case OP_OPEN_EQUIV_CLASS:
3364 0 : elem->type = EQUIV_CLASS;
3365 0 : break;
3366 0 : case OP_OPEN_CHAR_CLASS:
3367 0 : elem->type = CHAR_CLASS;
3368 : break;
3369 1 : default:
3370 : break;
3371 : }
3372 : return REG_NOERROR;
3373 : }
3374 :
3375 : /* Helper function for parse_bracket_exp.
3376 : Build the equivalence class which is represented by NAME.
3377 : The result are written to MBCSET and SBCSET.
3378 : EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3379 : is a pointer argument sinse we may update it. */
3380 1 :
3381 : static reg_errcode_t
3382 : #ifdef RE_ENABLE_I18N
3383 : build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3384 : Idx *equiv_class_alloc, const unsigned char *name)
3385 : #else /* not RE_ENABLE_I18N */
3386 : build_equiv_class (bitset_t sbcset, const unsigned char *name)
3387 : #endif /* not RE_ENABLE_I18N */
3388 : {
3389 : #ifdef _LIBC
3390 : uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3391 : if (nrules != 0)
3392 : {
3393 : const int32_t *table, *indirect;
3394 : const unsigned char *weights, *extra, *cp;
3395 : unsigned char char_buf[2];
3396 : int32_t idx1, idx2;
3397 : unsigned int ch;
3398 : size_t len;
3399 : /* This #include defines a local function! */
3400 : # include <locale/weight.h>
3401 : /* Calculate the index for equivalence class. */
3402 : cp = name;
3403 : table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3404 : weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3405 : _NL_COLLATE_WEIGHTMB);
3406 : extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3407 : _NL_COLLATE_EXTRAMB);
3408 : indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3409 : _NL_COLLATE_INDIRECTMB);
3410 : idx1 = findidx (&cp);
3411 : if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3412 : /* This isn't a valid character. */
3413 : return REG_ECOLLATE;
3414 :
3415 : /* Build single byte matcing table for this equivalence class. */
3416 : char_buf[1] = (unsigned char) '\0';
3417 : len = weights[idx1];
3418 : for (ch = 0; ch < SBC_MAX; ++ch)
3419 : {
3420 : char_buf[0] = ch;
3421 : cp = char_buf;
3422 : idx2 = findidx (&cp);
3423 : /*
3424 : idx2 = table[ch];
3425 : */
3426 : if (idx2 == 0)
3427 : /* This isn't a valid character. */
3428 : continue;
3429 : if (len == weights[idx2])
3430 : {
3431 : int cnt = 0;
3432 : while (cnt <= len &&
3433 : weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3434 : ++cnt;
3435 :
3436 : if (cnt > len)
3437 : bitset_set (sbcset, ch);
3438 : }
3439 : }
3440 : /* Check whether the array has enough space. */
3441 : if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3442 : {
3443 : /* Not enough, realloc it. */
3444 : /* +1 in case of mbcset->nequiv_classes is 0. */
3445 : Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3446 : /* Use realloc since the array is NULL if *alloc == 0. */
3447 : int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3448 : int32_t,
3449 : new_equiv_class_alloc);
3450 : if (BE (new_equiv_classes == NULL, 0))
3451 : return REG_ESPACE;
3452 : mbcset->equiv_classes = new_equiv_classes;
3453 : *equiv_class_alloc = new_equiv_class_alloc;
3454 : }
3455 : mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3456 : }
3457 1 : else
3458 1 : #endif /* _LIBC */
3459 0 : {
3460 : if (BE (strlen ((const char *) name) != 1, 0))
3461 0 : return REG_ECOLLATE;
3462 : bitset_set (sbcset, *name);
3463 : }
3464 : return REG_NOERROR;
3465 : }
3466 :
3467 : /* Helper function for parse_bracket_exp.
3468 : Build the character class which is represented by NAME.
3469 : The result are written to MBCSET and SBCSET.
3470 : CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3471 : is a pointer argument sinse we may update it. */
3472 24 :
3473 : static reg_errcode_t
3474 : #ifdef RE_ENABLE_I18N
3475 : build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3476 : re_charset_t *mbcset, Idx *char_class_alloc,
3477 : const unsigned char *class_name, reg_syntax_t syntax)
3478 : #else /* not RE_ENABLE_I18N */
3479 : build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3480 : const unsigned char *class_name, reg_syntax_t syntax)
3481 24 : #endif /* not RE_ENABLE_I18N */
3482 : {
3483 : int i;
3484 : const char *name = (const char *) class_name;
3485 24 :
3486 0 : /* In case of REG_ICASE "upper" and "lower" match the both of
3487 0 : upper and lower cases. */
3488 : if ((syntax & RE_ICASE)
3489 : && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3490 : name = "alpha";
3491 24 :
3492 : #ifdef RE_ENABLE_I18N
3493 : /* Check the space of the arrays. */
3494 : if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3495 24 : {
3496 : /* Not enough, realloc it. */
3497 24 : /* +1 in case of mbcset->nchar_classes is 0. */
3498 : Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3499 24 : /* Use realloc since array is NULL if *alloc == 0. */
3500 0 : wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3501 24 : new_char_class_alloc);
3502 24 : if (BE (new_char_classes == NULL, 0))
3503 : return REG_ESPACE;
3504 24 : mbcset->char_classes = new_char_classes;
3505 : *char_class_alloc = new_char_class_alloc;
3506 : }
3507 : mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3508 : #endif /* RE_ENABLE_I18N */
3509 :
3510 : #define BUILD_CHARCLASS_LOOP(ctype_func) \
3511 : do { \
3512 : if (BE (trans != NULL, 0)) \
3513 : { \
3514 : for (i = 0; i < SBC_MAX; ++i) \
3515 : if (ctype_func (i)) \
3516 : bitset_set (sbcset, trans[i]); \
3517 : } \
3518 : else \
3519 : { \
3520 : for (i = 0; i < SBC_MAX; ++i) \
3521 : if (ctype_func (i)) \
3522 : bitset_set (sbcset, i); \
3523 24 : } \
3524 10 : } while (0)
3525 14 :
3526 0 : if (strcmp (name, "alnum") == 0)
3527 14 : BUILD_CHARCLASS_LOOP (isalnum);
3528 0 : else if (strcmp (name, "cntrl") == 0)
3529 14 : BUILD_CHARCLASS_LOOP (iscntrl);
3530 14 : else if (strcmp (name, "lower") == 0)
3531 0 : BUILD_CHARCLASS_LOOP (islower);
3532 0 : else if (strcmp (name, "space") == 0)
3533 0 : BUILD_CHARCLASS_LOOP (isspace);
3534 0 : else if (strcmp (name, "alpha") == 0)
3535 0 : BUILD_CHARCLASS_LOOP (isalpha);
3536 0 : else if (strcmp (name, "digit") == 0)
3537 0 : BUILD_CHARCLASS_LOOP (isdigit);
3538 0 : else if (strcmp (name, "print") == 0)
3539 0 : BUILD_CHARCLASS_LOOP (isprint);
3540 0 : else if (strcmp (name, "upper") == 0)
3541 0 : BUILD_CHARCLASS_LOOP (isupper);
3542 0 : else if (strcmp (name, "blank") == 0)
3543 0 : BUILD_CHARCLASS_LOOP (isblank);
3544 0 : else if (strcmp (name, "graph") == 0)
3545 0 : BUILD_CHARCLASS_LOOP (isgraph);
3546 0 : else if (strcmp (name, "punct") == 0)
3547 : BUILD_CHARCLASS_LOOP (ispunct);
3548 0 : else if (strcmp (name, "xdigit") == 0)
3549 : BUILD_CHARCLASS_LOOP (isxdigit);
3550 24 : else
3551 : return REG_ECTYPE;
3552 :
3553 : return REG_NOERROR;
3554 24 : }
3555 :
3556 : static bin_tree_t *
3557 : build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3558 : const unsigned char *class_name,
3559 : const unsigned char *extra, bool non_match,
3560 : reg_errcode_t *err)
3561 : {
3562 24 : re_bitset_ptr_t sbcset;
3563 : #ifdef RE_ENABLE_I18N
3564 : re_charset_t *mbcset;
3565 : Idx alloc = 0;
3566 : #endif /* not RE_ENABLE_I18N */
3567 : reg_errcode_t ret;
3568 24 : re_token_t br_token;
3569 : bin_tree_t *tree;
3570 24 :
3571 : sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3572 : #ifdef RE_ENABLE_I18N
3573 : mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3574 24 : #endif /* RE_ENABLE_I18N */
3575 :
3576 : #ifdef RE_ENABLE_I18N
3577 : if (BE (sbcset == NULL || mbcset == NULL, 0))
3578 : #else /* not RE_ENABLE_I18N */
3579 0 : if (BE (sbcset == NULL, 0))
3580 0 : #endif /* not RE_ENABLE_I18N */
3581 : {
3582 : *err = REG_ESPACE;
3583 24 : return NULL;
3584 : }
3585 :
3586 13 : if (non_match)
3587 : {
3588 : #ifdef RE_ENABLE_I18N
3589 : mbcset->non_match = 1;
3590 : #endif /* not RE_ENABLE_I18N */
3591 24 : }
3592 :
3593 : /* We don't care the syntax in this case. */
3594 : ret = build_charclass (trans, sbcset,
3595 : #ifdef RE_ENABLE_I18N
3596 : mbcset, &alloc,
3597 24 : #endif /* RE_ENABLE_I18N */
3598 : class_name, 0);
3599 0 :
3600 : if (BE (ret != REG_NOERROR, 0))
3601 0 : {
3602 : re_free (sbcset);
3603 0 : #ifdef RE_ENABLE_I18N
3604 0 : free_charset (mbcset);
3605 : #endif /* RE_ENABLE_I18N */
3606 : *err = ret;
3607 34 : return NULL;
3608 10 : }
3609 : /* \w match '_' also. */
3610 : for (; *extra; extra++)
3611 24 : bitset_set (sbcset, *extra);
3612 13 :
3613 : /* If it is non-matching list. */
3614 : if (non_match)
3615 : bitset_not (sbcset);
3616 24 :
3617 0 : #ifdef RE_ENABLE_I18N
3618 : /* Ensure only single byte characters are set. */
3619 : if (dfa->mb_cur_max > 1)
3620 : bitset_mask (sbcset, dfa->sb_char);
3621 24 : #endif
3622 24 :
3623 24 : /* Build a tree for simple bracket. */
3624 24 : br_token.type = SIMPLE_BRACKET;
3625 0 : br_token.opr.sbcset = sbcset;
3626 : tree = create_token_tree (dfa, NULL, NULL, &br_token);
3627 : if (BE (tree == NULL, 0))
3628 24 : goto build_word_op_espace;
3629 :
3630 : #ifdef RE_ENABLE_I18N
3631 : if (dfa->mb_cur_max > 1)
3632 0 : {
3633 0 : bin_tree_t *mbc_tree;
3634 0 : /* Build a tree for complex bracket. */
3635 0 : br_token.type = COMPLEX_BRACKET;
3636 0 : br_token.opr.mbcset = mbcset;
3637 0 : dfa->has_mb_node = 1;
3638 : mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3639 0 : if (BE (mbc_tree == NULL, 0))
3640 0 : goto build_word_op_espace;
3641 0 : /* Then join them by ALT node. */
3642 : tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3643 : if (BE (mbc_tree != NULL, 1))
3644 : return tree;
3645 24 : }
3646 24 : else
3647 : {
3648 : free_charset (mbcset);
3649 : return tree;
3650 : }
3651 : #else /* not RE_ENABLE_I18N */
3652 0 : return tree;
3653 0 : #endif /* not RE_ENABLE_I18N */
3654 :
3655 0 : build_word_op_espace:
3656 : re_free (sbcset);
3657 0 : #ifdef RE_ENABLE_I18N
3658 0 : free_charset (mbcset);
3659 : #endif /* RE_ENABLE_I18N */
3660 : *err = REG_ESPACE;
3661 : return NULL;
3662 : }
3663 :
3664 : /* This is intended for the expressions like "a{1,3}".
3665 : Fetch a number from `input', and return the number.
3666 : Return REG_MISSING if the number field is empty like "{,1}".
3667 4 : Return REG_ERROR if an error occurred. */
3668 :
3669 4 : static Idx
3670 : fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3671 : {
3672 : Idx num = REG_MISSING;
3673 12 : unsigned char c;
3674 8 : while (1)
3675 8 : {
3676 4 : fetch_token (token, input, syntax);
3677 4 : c = token->opr.c;
3678 : if (BE (token->type == END_OF_RE, 0))
3679 9 : return REG_ERROR;
3680 0 : if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3681 : break;
3682 4 : num = ((token->type != CHARACTER || c < '0' || '9' < c
3683 4 : || num == REG_ERROR)
3684 : ? REG_ERROR
3685 0 : : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3686 : num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3687 : }
3688 : return num;
3689 : }
3690 271 :
3691 : #ifdef RE_ENABLE_I18N
3692 271 : static void
3693 : free_charset (re_charset_t *cset)
3694 : {
3695 : re_free (cset->mbchars);
3696 : # ifdef _LIBC
3697 : re_free (cset->coll_syms);
3698 : re_free (cset->equiv_classes);
3699 271 : re_free (cset->range_starts);
3700 271 : re_free (cset->range_ends);
3701 271 : # endif
3702 : re_free (cset->char_classes);
3703 : re_free (cset);
3704 : }
3705 : #endif /* RE_ENABLE_I18N */
3706 :
3707 : /* Functions for binary tree operation. */
3708 :
3709 1447 : /* Create a tree node. */
3710 :
3711 : static bin_tree_t *
3712 : create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3713 1447 : re_token_type_t type)
3714 1447 : {
3715 : re_token_t t;
3716 : t.type = type;
3717 : return create_token_tree (dfa, left, right, &t);
3718 2174 : }
3719 :
3720 : static bin_tree_t *
3721 : create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3722 2174 : const re_token_t *token)
3723 : {
3724 355 : bin_tree_t *tree;
3725 : if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3726 355 : {
3727 0 : bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3728 355 :
3729 355 : if (storage == NULL)
3730 355 : return NULL;
3731 : storage->next = dfa->str_tree_storage;
3732 2174 : dfa->str_tree_storage = storage;
3733 : dfa->str_tree_storage_idx = 0;
3734 2174 : }
3735 2174 : tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3736 2174 :
3737 2174 : tree->parent = NULL;
3738 2174 : tree->left = left;
3739 2174 : tree->right = right;
3740 2174 : tree->token = *token;
3741 2174 : tree->token.duplicated = 0;
3742 2174 : tree->token.opt_subexp = 0;
3743 : tree->first = NULL;
3744 2174 : tree->next = NULL;
3745 1058 : tree->node_idx = REG_MISSING;
3746 2174 :
3747 822 : if (left != NULL)
3748 2174 : left->parent = tree;
3749 : if (right != NULL)
3750 : right->parent = tree;
3751 : return tree;
3752 : }
3753 :
3754 : /* Mark the tree SRC as an optional subexpression.
3755 0 : To be called from preorder or postorder. */
3756 :
3757 0 : static reg_errcode_t
3758 0 : mark_opt_subexp (void *extra, bin_tree_t *node)
3759 0 : {
3760 : Idx idx = (Idx) (long) extra;
3761 0 : if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3762 : node->token.opt_subexp = 1;
3763 :
3764 : return REG_NOERROR;
3765 : }
3766 :
3767 117 : /* Free the allocated memory inside NODE. */
3768 :
3769 : static void
3770 117 : free_token (re_token_t *node)
3771 0 : {
3772 : #ifdef RE_ENABLE_I18N
3773 : if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3774 117 : free_charset (node->opr.mbcset);
3775 5 : else
3776 117 : #endif /* RE_ENABLE_I18N */
3777 : if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3778 : re_free (node->opr.sbcset);
3779 : }
3780 :
3781 : /* Worker function for tree walking. Free the allocated memory inside NODE
3782 0 : and its children. */
3783 :
3784 0 : static reg_errcode_t
3785 0 : free_tree (void *extra, bin_tree_t *node)
3786 : {
3787 : free_token (&node->token);
3788 : return REG_NOERROR;
3789 : }
3790 :
3791 :
3792 : /* Duplicate the node SRC, and return new node. This is a preorder
3793 : visit similar to the one implemented by the generic visitor, but
3794 : we need more infrastructure to maintain two parallel trees --- so,
3795 5 : it's easier to duplicate. */
3796 :
3797 : static bin_tree_t *
3798 : duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3799 5 : {
3800 : const bin_tree_t *node;
3801 5 : bin_tree_t *dup_root;
3802 : bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3803 :
3804 21 : for (node = root; ; )
3805 13 : {
3806 0 : /* Create a new tree and link it back to the current parent. */
3807 13 : *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3808 13 : if (*p_new == NULL)
3809 13 : return NULL;
3810 : (*p_new)->parent = dup_node;
3811 : (*p_new)->token.duplicated = 1;
3812 13 : dup_node = *p_new;
3813 :
3814 7 : /* Go to the left node, or up and to the right. */
3815 7 : if (node->left)
3816 : {
3817 : node = node->left;
3818 : p_new = &dup_node->left;
3819 6 : }
3820 20 : else
3821 : {
3822 13 : const bin_tree_t *prev = NULL;
3823 13 : while (node->right == prev || node->right == NULL)
3824 13 : {
3825 13 : prev = node;
3826 5 : node = node->parent;
3827 : dup_node = dup_node->parent;
3828 1 : if (!node)
3829 1 : return dup_root;
3830 : }
3831 : node = node->right;
3832 : p_new = &dup_node->right;
3833 : }
3834 : }
3835 : }
|