Line data Source code
1 : /* -*- buffer-read-only: t -*- vi: set ro: */
2 : /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 : #line 1
4 : /* Extended regular expression matching and search library.
5 : Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 Free Software
6 : Foundation, Inc.
7 : This file is part of the GNU C Library.
8 : Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
9 :
10 : This program is free software; you can redistribute it and/or modify
11 : it under the terms of the GNU General Public License as published by
12 : the Free Software Foundation; either version 3, or (at your option)
13 : any later version.
14 :
15 : This program is distributed in the hope that it will be useful,
16 : but WITHOUT ANY WARRANTY; without even the implied warranty of
17 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 : GNU General Public License for more details.
19 :
20 : You should have received a copy of the GNU General Public License along
21 : with this program; if not, write to the Free Software Foundation,
22 : Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
23 :
24 : static void re_string_construct_common (const char *str, Idx len,
25 : re_string_t *pstr,
26 : RE_TRANSLATE_TYPE trans, bool icase,
27 : const re_dfa_t *dfa) internal_function;
28 : static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
29 : const re_node_set *nodes,
30 : re_hashval_t hash) internal_function;
31 : static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
32 : const re_node_set *nodes,
33 : unsigned int context,
34 : re_hashval_t hash) internal_function;
35 :
36 : /* Functions for string operation. */
37 :
38 : /* This function allocate the buffers. It is necessary to call
39 : re_string_reconstruct before using the object. */
40 356 :
41 : static reg_errcode_t
42 : internal_function
43 : re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
44 : RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
45 : {
46 : reg_errcode_t ret;
47 356 : Idx init_buf_len;
48 0 :
49 356 : /* Ensure at least one character fits into the buffers. */
50 356 : if (init_len < dfa->mb_cur_max)
51 : init_len = dfa->mb_cur_max;
52 356 : init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
53 356 : re_string_construct_common (str, len, pstr, trans, icase, dfa);
54 0 :
55 : ret = re_string_realloc_buffers (pstr, init_buf_len);
56 356 : if (BE (ret != REG_NOERROR, 0))
57 356 : return ret;
58 356 :
59 356 : pstr->word_char = dfa->word_char;
60 356 : pstr->word_ops_used = dfa->word_ops_used;
61 356 : pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
62 : pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
63 : pstr->valid_raw_len = pstr->valid_len;
64 : return REG_NOERROR;
65 : }
66 :
67 : /* This function allocate the buffers, and initialize them. */
68 321 :
69 : static reg_errcode_t
70 : internal_function
71 : re_string_construct (re_string_t *pstr, const char *str, Idx len,
72 321 : RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
73 321 : {
74 : reg_errcode_t ret;
75 321 : memset (pstr, '\0', sizeof (re_string_t));
76 : re_string_construct_common (str, len, pstr, trans, icase, dfa);
77 286 :
78 286 : if (len > 0)
79 0 : {
80 : ret = re_string_realloc_buffers (pstr, len + 1);
81 321 : if (BE (ret != REG_NOERROR, 0))
82 : return ret;
83 321 : }
84 : pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
85 :
86 0 : if (icase)
87 : {
88 : #ifdef RE_ENABLE_I18N
89 : if (dfa->mb_cur_max > 1)
90 0 : {
91 0 : while (1)
92 0 : {
93 0 : ret = build_wcs_upper_buffer (pstr);
94 0 : if (BE (ret != REG_NOERROR, 0))
95 0 : return ret;
96 0 : if (pstr->valid_raw_len >= len)
97 0 : break;
98 0 : if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
99 0 : break;
100 : ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
101 : if (BE (ret != REG_NOERROR, 0))
102 : return ret;
103 : }
104 0 : }
105 : else
106 : #endif /* RE_ENABLE_I18N */
107 : build_upper_buffer (pstr);
108 : }
109 321 : else
110 0 : {
111 : #ifdef RE_ENABLE_I18N
112 : if (dfa->mb_cur_max > 1)
113 : build_wcs_buffer (pstr);
114 321 : else
115 0 : #endif /* RE_ENABLE_I18N */
116 : {
117 : if (trans != NULL)
118 321 : re_string_translate_buffer (pstr);
119 321 : else
120 : {
121 : pstr->valid_len = pstr->bufs_len;
122 : pstr->valid_raw_len = pstr->bufs_len;
123 : }
124 321 : }
125 : }
126 :
127 : return REG_NOERROR;
128 : }
129 :
130 : /* Helper functions for re_string_allocate, and re_string_construct. */
131 642 :
132 : static reg_errcode_t
133 : internal_function
134 642 : re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
135 : {
136 : #ifdef RE_ENABLE_I18N
137 : if (pstr->mb_cur_max > 1)
138 : {
139 0 : wint_t *new_wcs;
140 0 :
141 0 : /* Avoid overflow. */
142 : size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
143 0 : if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
144 0 : return REG_ESPACE;
145 0 :
146 0 : new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
147 0 : if (BE (new_wcs == NULL, 0))
148 : return REG_ESPACE;
149 0 : pstr->wcs = new_wcs;
150 0 : if (pstr->offsets != NULL)
151 0 : {
152 0 : Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
153 : if (BE (new_offsets == NULL, 0))
154 : return REG_ESPACE;
155 : pstr->offsets = new_offsets;
156 642 : }
157 : }
158 0 : #endif /* RE_ENABLE_I18N */
159 : if (pstr->mbs_allocated)
160 0 : {
161 0 : unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
162 0 : new_buf_len);
163 : if (BE (new_mbs == NULL, 0))
164 642 : return REG_ESPACE;
165 642 : pstr->mbs = new_mbs;
166 : }
167 : pstr->bufs_len = new_buf_len;
168 : return REG_NOERROR;
169 : }
170 :
171 677 :
172 : static void
173 : internal_function
174 : re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
175 677 : RE_TRANSLATE_TYPE trans, bool icase,
176 677 : const re_dfa_t *dfa)
177 677 : {
178 677 : pstr->raw_mbs = (const unsigned char *) str;
179 677 : pstr->len = len;
180 677 : pstr->raw_len = len;
181 677 : pstr->trans = trans;
182 677 : pstr->icase = icase;
183 677 : pstr->mbs_allocated = (trans != NULL || icase);
184 677 : pstr->mb_cur_max = dfa->mb_cur_max;
185 677 : pstr->is_utf8 = dfa->is_utf8;
186 677 : pstr->map_notascii = dfa->map_notascii;
187 : pstr->stop = pstr->len;
188 : pstr->raw_stop = pstr->stop;
189 : }
190 :
191 : #ifdef RE_ENABLE_I18N
192 :
193 : /* Build wide character buffer PSTR->WCS.
194 : If the byte sequence of the string are:
195 : <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
196 : Then wide character buffer will be:
197 : <wc1> , WEOF , <wc2> , WEOF , <wc3>
198 : We use WEOF for padding, they indicate that the position isn't
199 : a first byte of a multibyte character.
200 :
201 : Note that this function assumes PSTR->VALID_LEN elements are already
202 : built and starts from PSTR->VALID_LEN. */
203 0 :
204 : static void
205 : internal_function
206 : build_wcs_buffer (re_string_t *pstr)
207 : {
208 : #ifdef _LIBC
209 : unsigned char buf[MB_LEN_MAX];
210 : assert (MB_LEN_MAX >= pstr->mb_cur_max);
211 : #else
212 : unsigned char buf[64];
213 : #endif
214 : mbstate_t prev_st;
215 : Idx byte_idx, end_idx, remain_len;
216 : size_t mbclen;
217 0 :
218 0 : /* Build the buffers from pstr->valid_len to either pstr->len or
219 : pstr->bufs_len. */
220 : end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
221 : for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
222 : {
223 0 : wchar_t wc;
224 0 : const char *p;
225 :
226 0 : remain_len = end_idx - byte_idx;
227 : prev_st = pstr->cur_state;
228 : /* Apply the translation if we need. */
229 : if (BE (pstr->trans != NULL, 0))
230 0 : {
231 : int i, ch;
232 0 :
233 0 : for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
234 : {
235 0 : ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
236 : buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
237 : }
238 0 : p = (const char *) buf;
239 0 : }
240 0 : else
241 : p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
242 : mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
243 0 : if (BE (mbclen == (size_t) -2, 0))
244 0 : {
245 : /* The buffer doesn't have enough space, finish to build. */
246 0 : pstr->cur_state = prev_st;
247 : break;
248 : }
249 0 : else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
250 0 : {
251 0 : /* We treat these cases as a singlebyte character. */
252 0 : mbclen = 1;
253 0 : wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
254 : if (BE (pstr->trans != NULL, 0))
255 : wc = pstr->trans[wc];
256 : pstr->cur_state = prev_st;
257 0 : }
258 :
259 0 : /* Write wide character and padding. */
260 0 : pstr->wcs[byte_idx++] = wc;
261 : /* Write paddings. */
262 0 : for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
263 0 : pstr->wcs[byte_idx++] = WEOF;
264 0 : }
265 : pstr->valid_len = byte_idx;
266 : pstr->valid_raw_len = byte_idx;
267 : }
268 :
269 : /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
270 : but for REG_ICASE. */
271 0 :
272 : static reg_errcode_t
273 : internal_function
274 : build_wcs_upper_buffer (re_string_t *pstr)
275 : {
276 : mbstate_t prev_st;
277 : Idx src_idx, byte_idx, end_idx, remain_len;
278 : size_t mbclen;
279 : #ifdef _LIBC
280 : char buf[MB_LEN_MAX];
281 : assert (MB_LEN_MAX >= pstr->mb_cur_max);
282 : #else
283 0 : char buf[64];
284 0 : #endif
285 :
286 : byte_idx = pstr->valid_len;
287 : end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
288 0 :
289 : /* The following optimization assumes that ASCII characters can be
290 0 : mapped to wide characters with a simple cast. */
291 : if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
292 : {
293 : while (byte_idx < end_idx)
294 0 : {
295 0 : wchar_t wc;
296 :
297 : if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
298 0 : && mbsinit (&pstr->cur_state))
299 0 : {
300 : /* In case of a singlebyte character. */
301 : pstr->mbs[byte_idx]
302 0 : = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
303 0 : /* The next step uses the assumption that wchar_t is encoded
304 0 : ASCII-safe: all ASCII values can be converted like this. */
305 : pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
306 : ++byte_idx;
307 0 : continue;
308 0 : }
309 0 :
310 0 : remain_len = end_idx - byte_idx;
311 0 : prev_st = pstr->cur_state;
312 0 : mbclen = mbrtowc (&wc,
313 : ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
314 0 : + byte_idx), remain_len, &pstr->cur_state);
315 0 : if (BE (mbclen < (size_t) -2, 1))
316 : {
317 : wchar_t wcu = wc;
318 : if (iswlower (wc))
319 0 : {
320 0 : size_t mbcdlen;
321 0 :
322 0 : wcu = towupper (wc);
323 : mbcdlen = wcrtomb (buf, wcu, &prev_st);
324 : if (BE (mbclen == mbcdlen, 1))
325 0 : memcpy (pstr->mbs + byte_idx, buf, mbclen);
326 0 : else
327 : {
328 : src_idx = byte_idx;
329 : goto offsets_needed;
330 0 : }
331 0 : }
332 0 : else
333 : memcpy (pstr->mbs + byte_idx,
334 0 : pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
335 0 : pstr->wcs[byte_idx++] = wcu;
336 : /* Write paddings. */
337 0 : for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
338 0 : pstr->wcs[byte_idx++] = WEOF;
339 : }
340 0 : else if (mbclen == (size_t) -1 || mbclen == 0)
341 0 : {
342 : /* It is an invalid character or '\0'. Just use the byte. */
343 0 : int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
344 0 : pstr->mbs[byte_idx] = ch;
345 0 : /* And also cast it to wide char. */
346 : pstr->wcs[byte_idx++] = (wchar_t) ch;
347 : if (BE (mbclen == (size_t) -1, 0))
348 : pstr->cur_state = prev_st;
349 : }
350 0 : else
351 0 : {
352 : /* The buffer doesn't have enough space, finish to build. */
353 : pstr->cur_state = prev_st;
354 0 : break;
355 0 : }
356 0 : }
357 : pstr->valid_len = byte_idx;
358 : pstr->valid_raw_len = byte_idx;
359 0 : return REG_NOERROR;
360 : }
361 : else
362 : for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
363 0 : {
364 0 : wchar_t wc;
365 0 : const char *p;
366 0 : offsets_needed:
367 : remain_len = end_idx - byte_idx;
368 : prev_st = pstr->cur_state;
369 : if (BE (pstr->trans != NULL, 0))
370 0 : {
371 : int i, ch;
372 0 :
373 0 : for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
374 : {
375 0 : ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
376 : buf[i] = pstr->trans[ch];
377 : }
378 0 : p = (const char *) buf;
379 0 : }
380 0 : else
381 : p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
382 0 : mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
383 0 : if (BE (mbclen < (size_t) -2, 1))
384 : {
385 : wchar_t wcu = wc;
386 : if (iswlower (wc))
387 0 : {
388 0 : size_t mbcdlen;
389 0 :
390 0 : wcu = towupper (wc);
391 0 : mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
392 : if (BE (mbclen == mbcdlen, 1))
393 : memcpy (pstr->mbs + byte_idx, buf, mbclen);
394 : else if (mbcdlen != (size_t) -1)
395 0 : {
396 : size_t i;
397 0 :
398 0 : if (byte_idx + mbcdlen > pstr->bufs_len)
399 : {
400 : pstr->cur_state = prev_st;
401 0 : break;
402 : }
403 0 :
404 : if (pstr->offsets == NULL)
405 0 : {
406 0 : pstr->offsets = re_malloc (Idx, pstr->bufs_len);
407 :
408 0 : if (pstr->offsets == NULL)
409 : return REG_ESPACE;
410 0 : }
411 0 : if (!pstr->offsets_needed)
412 0 : {
413 : for (i = 0; i < (size_t) byte_idx; ++i)
414 : pstr->offsets[i] = i;
415 0 : pstr->offsets_needed = 1;
416 0 : }
417 0 :
418 0 : memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
419 : pstr->wcs[byte_idx] = wcu;
420 0 : pstr->offsets[byte_idx] = src_idx;
421 0 : for (i = 1; i < mbcdlen; ++i)
422 0 : {
423 : pstr->offsets[byte_idx + i]
424 0 : = src_idx + (i < mbclen ? i : mbclen - 1);
425 0 : pstr->wcs[byte_idx + i] = WEOF;
426 0 : }
427 0 : pstr->len += mbcdlen - mbclen;
428 : if (pstr->raw_stop > src_idx)
429 0 : pstr->stop += mbcdlen - mbclen;
430 0 : end_idx = (pstr->bufs_len > pstr->len)
431 0 : ? pstr->len : pstr->bufs_len;
432 : byte_idx += mbcdlen;
433 : src_idx += mbclen;
434 0 : continue;
435 : }
436 : else
437 0 : memcpy (pstr->mbs + byte_idx, p, mbclen);
438 : }
439 0 : else
440 : memcpy (pstr->mbs + byte_idx, p, mbclen);
441 :
442 0 : if (BE (pstr->offsets_needed != 0, 0))
443 0 : {
444 : size_t i;
445 0 : for (i = 0; i < mbclen; ++i)
446 : pstr->offsets[byte_idx + i] = src_idx + i;
447 0 : }
448 : src_idx += mbclen;
449 0 :
450 0 : pstr->wcs[byte_idx++] = wcu;
451 : /* Write paddings. */
452 0 : for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
453 0 : pstr->wcs[byte_idx++] = WEOF;
454 : }
455 0 : else if (mbclen == (size_t) -1 || mbclen == 0)
456 : {
457 0 : /* It is an invalid character or '\0'. Just use the byte. */
458 0 : int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
459 0 :
460 : if (BE (pstr->trans != NULL, 0))
461 0 : ch = pstr->trans [ch];
462 0 : pstr->mbs[byte_idx] = ch;
463 0 :
464 : if (BE (pstr->offsets_needed != 0, 0))
465 : pstr->offsets[byte_idx] = src_idx;
466 0 : ++src_idx;
467 0 :
468 0 : /* And also cast it to wide char. */
469 : pstr->wcs[byte_idx++] = (wchar_t) ch;
470 : if (BE (mbclen == (size_t) -1, 0))
471 : pstr->cur_state = prev_st;
472 : }
473 0 : else
474 0 : {
475 : /* The buffer doesn't have enough space, finish to build. */
476 : pstr->cur_state = prev_st;
477 0 : break;
478 0 : }
479 0 : }
480 : pstr->valid_len = byte_idx;
481 : pstr->valid_raw_len = src_idx;
482 : return REG_NOERROR;
483 : }
484 :
485 : /* Skip characters until the index becomes greater than NEW_RAW_IDX.
486 : Return the index. */
487 0 :
488 : static Idx
489 : internal_function
490 : re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
491 : {
492 0 : mbstate_t prev_st;
493 : Idx rawbuf_idx;
494 : size_t mbclen;
495 0 : wint_t wc = WEOF;
496 :
497 : /* Skip the characters which are not necessary to check. */
498 : for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
499 : rawbuf_idx < new_raw_idx;)
500 0 : {
501 0 : wchar_t wc2;
502 0 : Idx remain_len;
503 : remain_len = pstr->len - rawbuf_idx;
504 0 : prev_st = pstr->cur_state;
505 : mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
506 : remain_len, &pstr->cur_state);
507 0 : if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
508 0 : {
509 : /* We treat these cases as a single byte character. */
510 0 : if (mbclen == 0 || remain_len == 0)
511 0 : wc = L'\0';
512 0 : else
513 : wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
514 : mbclen = 1;
515 0 : pstr->cur_state = prev_st;
516 : }
517 0 : else
518 : wc = wc2;
519 0 : /* Then proceed the next character. */
520 0 : rawbuf_idx += mbclen;
521 : }
522 : *last_wc = wc;
523 : return rawbuf_idx;
524 : }
525 : #endif /* RE_ENABLE_I18N */
526 :
527 : /* Build the buffer PSTR->MBS, and apply the translation if we need.
528 : This function is used in case of REG_ICASE. */
529 0 :
530 : static void
531 : internal_function
532 0 : build_upper_buffer (re_string_t *pstr)
533 : {
534 0 : Idx char_idx, end_idx;
535 : end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
536 0 :
537 0 : for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
538 0 : {
539 0 : int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
540 0 : if (BE (pstr->trans != NULL, 0))
541 : ch = pstr->trans[ch];
542 0 : if (islower (ch))
543 : pstr->mbs[char_idx] = toupper (ch);
544 0 : else
545 0 : pstr->mbs[char_idx] = ch;
546 0 : }
547 : pstr->valid_len = char_idx;
548 : pstr->valid_raw_len = char_idx;
549 : }
550 :
551 : /* Apply TRANS to the buffer in PSTR. */
552 0 :
553 : static void
554 : internal_function
555 0 : re_string_translate_buffer (re_string_t *pstr)
556 : {
557 0 : Idx buf_idx, end_idx;
558 : end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
559 0 :
560 0 : for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
561 : {
562 : int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
563 0 : pstr->mbs[buf_idx] = pstr->trans[ch];
564 0 : }
565 0 :
566 : pstr->valid_len = buf_idx;
567 : pstr->valid_raw_len = buf_idx;
568 : }
569 :
570 : /* This function re-construct the buffers.
571 : Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
572 : convert to upper case in case of REG_ICASE, apply translation. */
573 337 :
574 : static reg_errcode_t
575 : internal_function
576 : re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
577 337 : {
578 301 : Idx offset;
579 :
580 : if (BE (pstr->raw_mbs_idx <= idx, 0))
581 : offset = idx - pstr->raw_mbs_idx;
582 : else
583 36 : {
584 0 : /* Reset buffer. */
585 : #ifdef RE_ENABLE_I18N
586 36 : if (pstr->mb_cur_max > 1)
587 36 : memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
588 36 : #endif /* RE_ENABLE_I18N */
589 36 : pstr->len = pstr->raw_len;
590 36 : pstr->stop = pstr->raw_stop;
591 36 : pstr->valid_len = 0;
592 72 : pstr->raw_mbs_idx = 0;
593 36 : pstr->valid_raw_len = 0;
594 36 : pstr->offsets_needed = 0;
595 36 : pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
596 36 : : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
597 : if (!pstr->mbs_allocated)
598 : pstr->mbs = (unsigned char *) pstr->raw_mbs;
599 337 : offset = idx;
600 : }
601 :
602 130 : if (BE (offset != 0, 1))
603 : {
604 : /* Should the already checked characters be kept? */
605 : if (BE (offset < pstr->valid_raw_len, 1))
606 90 : {
607 : /* Yes, move them to the front of the buffer. */
608 0 : #ifdef RE_ENABLE_I18N
609 : if (BE (pstr->offsets_needed, 0))
610 : {
611 0 : Idx low = 0, high = pstr->valid_len, mid;
612 0 : do
613 0 : {
614 0 : mid = (high + low) / 2;
615 0 : if (pstr->offsets[mid] > offset)
616 : high = mid;
617 0 : else if (pstr->offsets[mid] < offset)
618 : low = mid + 1;
619 0 : else
620 0 : break;
621 0 : }
622 0 : while (low < high);
623 : if (pstr->offsets[mid] < offset)
624 : ++mid;
625 : pstr->tip_context = re_string_context_at (pstr, mid - 1,
626 : eflags);
627 : /* This can be quite complicated, so handle specially
628 0 : only the common and easy case where the character with
629 0 : different length representation of lower and upper
630 : case is present at or after offset. */
631 0 : if (pstr->valid_len > offset
632 0 : && mid == offset && pstr->offsets[mid] == offset)
633 0 : {
634 0 : memmove (pstr->wcs, pstr->wcs + offset,
635 0 : (pstr->valid_len - offset) * sizeof (wint_t));
636 0 : memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
637 0 : pstr->valid_len -= offset;
638 : pstr->valid_raw_len -= offset;
639 : for (low = 0; low < pstr->valid_len; low++)
640 : pstr->offsets[low] = pstr->offsets[low + offset] - offset;
641 : }
642 : else
643 0 : {
644 0 : /* Otherwise, just find out how long the partial multibyte
645 0 : character at offset is and fill it with WEOF/255. */
646 0 : pstr->len = pstr->raw_len - idx + offset;
647 0 : pstr->stop = pstr->raw_stop - idx + offset;
648 0 : pstr->offsets_needed = 0;
649 0 : while (mid > 0 && pstr->offsets[mid - 1] == offset)
650 0 : --mid;
651 : while (mid < pstr->valid_len)
652 0 : if (pstr->wcs[mid] != WEOF)
653 0 : break;
654 0 : else
655 : ++mid;
656 : if (mid == pstr->valid_len)
657 0 : pstr->valid_len = 0;
658 0 : else
659 : {
660 0 : pstr->valid_len = pstr->offsets[mid] - offset;
661 0 : if (pstr->valid_len)
662 0 : {
663 : for (low = 0; low < pstr->valid_len; ++low)
664 : pstr->wcs[low] = WEOF;
665 0 : memset (pstr->mbs, 255, pstr->valid_len);
666 : }
667 : }
668 : pstr->valid_raw_len = pstr->valid_len;
669 : }
670 : }
671 90 : else
672 : #endif
673 : {
674 90 : pstr->tip_context = re_string_context_at (pstr, offset - 1,
675 0 : eflags);
676 0 : #ifdef RE_ENABLE_I18N
677 : if (pstr->mb_cur_max > 1)
678 90 : memmove (pstr->wcs, pstr->wcs + offset,
679 0 : (pstr->valid_len - offset) * sizeof (wint_t));
680 0 : #endif /* RE_ENABLE_I18N */
681 90 : if (BE (pstr->mbs_allocated, 0))
682 90 : memmove (pstr->mbs, pstr->mbs + offset,
683 : pstr->valid_len - offset);
684 : pstr->valid_len -= offset;
685 : pstr->valid_raw_len -= offset;
686 : #if DEBUG
687 : assert (pstr->valid_len > 0);
688 : #endif
689 : }
690 : }
691 40 : else
692 : {
693 : /* No, skip all characters until IDX. */
694 40 : Idx prev_valid_len = pstr->valid_len;
695 :
696 0 : #ifdef RE_ENABLE_I18N
697 0 : if (BE (pstr->offsets_needed, 0))
698 0 : {
699 : pstr->len = pstr->raw_len - idx + offset;
700 : pstr->stop = pstr->raw_stop - idx + offset;
701 40 : pstr->offsets_needed = 0;
702 : }
703 40 : #endif
704 : pstr->valid_len = 0;
705 : #ifdef RE_ENABLE_I18N
706 0 : if (pstr->mb_cur_max > 1)
707 : {
708 0 : Idx wcs_idx;
709 : wint_t wc = WEOF;
710 :
711 : if (pstr->is_utf8)
712 : {
713 : const unsigned char *raw, *p, *end;
714 0 :
715 0 : /* Special case UTF-8. Multi-byte chars start with any
716 0 : byte other than 0x80 - 0xbf. */
717 0 : raw = pstr->raw_mbs + pstr->raw_mbs_idx;
718 0 : end = raw + (offset - pstr->mb_cur_max);
719 : if (end < pstr->raw_mbs)
720 : end = pstr->raw_mbs;
721 : p = raw + offset - 1;
722 : #ifdef _LIBC
723 : /* We know the wchar_t encoding is UCS4, so for the simple
724 : case, ASCII characters, skip the conversion step. */
725 : if (isascii (*p) && BE (pstr->trans == NULL, 1))
726 : {
727 : memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
728 : /* pstr->valid_len = 0; */
729 : wc = (wchar_t) *p;
730 0 : }
731 0 : else
732 : #endif
733 : for (; p >= end; --p)
734 : if ((*p & 0xc0) != 0x80)
735 0 : {
736 : mbstate_t cur_state;
737 : wchar_t wc2;
738 : Idx mlen = raw + pstr->len - p;
739 0 : unsigned char buf[6];
740 : size_t mbclen;
741 0 :
742 0 : if (BE (pstr->trans != NULL, 0))
743 0 : {
744 : int i = mlen < 6 ? mlen : 6;
745 : while (--i >= 0)
746 : buf[i] = pstr->trans[p[i]];
747 0 : }
748 0 : /* XXX Don't use mbrtowc, we know which conversion
749 : to use (UTF-8 -> UCS4). */
750 0 : memset (&cur_state, 0, sizeof (cur_state));
751 0 : mbclen = mbrtowc (&wc2, (const char *) p, mlen,
752 : &cur_state);
753 0 : if (raw + offset - p <= mbclen
754 : && mbclen < (size_t) -2)
755 0 : {
756 0 : memset (&pstr->cur_state, '\0',
757 : sizeof (mbstate_t));
758 0 : pstr->valid_len = mbclen - (raw + offset - p);
759 : wc = wc2;
760 : }
761 : break;
762 0 : }
763 0 : }
764 0 :
765 : if (wc == WEOF)
766 0 : pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
767 : if (wc == WEOF)
768 0 : pstr->tip_context
769 0 : = re_string_context_at (pstr, prev_valid_len - 1, eflags);
770 : else
771 0 : pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
772 0 : && IS_WIDE_WORD_CHAR (wc))
773 : ? CONTEXT_WORD
774 0 : : ((IS_WIDE_NEWLINE (wc)
775 : && pstr->newline_anchor)
776 0 : ? CONTEXT_NEWLINE : 0));
777 0 : if (BE (pstr->valid_len, 0))
778 0 : {
779 0 : for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
780 : pstr->wcs[wcs_idx] = WEOF;
781 0 : if (pstr->mbs_allocated)
782 : memset (pstr->mbs, 255, pstr->valid_len);
783 : }
784 : pstr->valid_raw_len = pstr->valid_len;
785 : }
786 40 : else
787 40 : #endif /* RE_ENABLE_I18N */
788 40 : {
789 0 : int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
790 80 : pstr->valid_raw_len = 0;
791 : if (pstr->trans)
792 40 : c = pstr->trans[c];
793 : pstr->tip_context = (bitset_contain (pstr->word_char, c)
794 : ? CONTEXT_WORD
795 : : ((IS_NEWLINE (c) && pstr->newline_anchor)
796 130 : ? CONTEXT_NEWLINE : 0));
797 130 : }
798 : }
799 337 : if (!BE (pstr->mbs_allocated, 0))
800 337 : pstr->mbs += offset;
801 337 : }
802 : pstr->raw_mbs_idx = idx;
803 : pstr->len -= offset;
804 : pstr->stop -= offset;
805 337 :
806 : /* Then build the buffers. */
807 0 : #ifdef RE_ENABLE_I18N
808 : if (pstr->mb_cur_max > 1)
809 0 : {
810 0 : if (pstr->icase)
811 0 : {
812 : reg_errcode_t ret = build_wcs_upper_buffer (pstr);
813 : if (BE (ret != REG_NOERROR, 0))
814 0 : return ret;
815 : }
816 : else
817 : build_wcs_buffer (pstr);
818 337 : }
819 : else
820 0 : #endif /* RE_ENABLE_I18N */
821 0 : if (BE (pstr->mbs_allocated, 0))
822 0 : {
823 0 : if (pstr->icase)
824 : build_upper_buffer (pstr);
825 : else if (pstr->trans != NULL)
826 337 : re_string_translate_buffer (pstr);
827 : }
828 337 : else
829 337 : pstr->valid_len = pstr->len;
830 :
831 : pstr->cur_idx = 0;
832 : return REG_NOERROR;
833 : }
834 475 :
835 : static unsigned char
836 : internal_function __attribute ((pure))
837 : re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
838 : {
839 : int ch;
840 475 : Idx off;
841 475 :
842 : /* Handle the common (easiest) cases first. */
843 : if (BE (!pstr->mbs_allocated, 1))
844 0 : return re_string_peek_byte (pstr, idx);
845 0 :
846 0 : #ifdef RE_ENABLE_I18N
847 : if (pstr->mb_cur_max > 1
848 : && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
849 0 : return re_string_peek_byte (pstr, idx);
850 : #endif
851 0 :
852 0 : off = pstr->cur_idx + idx;
853 : #ifdef RE_ENABLE_I18N
854 : if (pstr->offsets_needed)
855 0 : off = pstr->offsets[off];
856 : #endif
857 :
858 : ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
859 :
860 : #ifdef RE_ENABLE_I18N
861 : /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
862 0 : this function returns CAPITAL LETTER I instead of first byte of
863 0 : DOTLESS SMALL LETTER I. The latter would confuse the parser,
864 : since peek_byte_case doesn't advance cur_idx in any way. */
865 : if (pstr->offsets_needed && !isascii (ch))
866 0 : return re_string_peek_byte (pstr, idx);
867 : #endif
868 :
869 : return ch;
870 : }
871 1 :
872 : static unsigned char
873 1 : internal_function __attribute ((pure))
874 1 : re_string_fetch_byte_case (re_string_t *pstr)
875 : {
876 : if (BE (!pstr->mbs_allocated, 1))
877 0 : return re_string_fetch_byte (pstr);
878 :
879 : #ifdef RE_ENABLE_I18N
880 : if (pstr->offsets_needed)
881 : {
882 : Idx off;
883 : int ch;
884 :
885 : /* For tr_TR.UTF-8 [[:islower:]] there is
886 : [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
887 : in that case the whole multi-byte character and return
888 : the original letter. On the other side, with
889 0 : [[: DOTLESS SMALL LETTER I return [[:I, as doing
890 0 : anything else would complicate things too much. */
891 :
892 0 : if (!re_string_first_byte (pstr, pstr->cur_idx))
893 0 : return re_string_fetch_byte (pstr);
894 :
895 0 : off = pstr->offsets[pstr->cur_idx];
896 0 : ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
897 :
898 0 : if (! isascii (ch))
899 : return re_string_fetch_byte (pstr);
900 0 :
901 : re_string_skip_bytes (pstr,
902 : re_string_char_size_at (pstr, pstr->cur_idx));
903 : return ch;
904 0 : }
905 : #endif
906 :
907 : return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
908 : }
909 677 :
910 : static void
911 : internal_function
912 677 : re_string_destruct (re_string_t *pstr)
913 677 : {
914 : #ifdef RE_ENABLE_I18N
915 677 : re_free (pstr->wcs);
916 0 : re_free (pstr->offsets);
917 677 : #endif /* RE_ENABLE_I18N */
918 : if (pstr->mbs_allocated)
919 : re_free (pstr->mbs);
920 : }
921 :
922 : /* Return the context at IDX in INPUT. */
923 337 :
924 : static unsigned int
925 : internal_function
926 337 : re_string_context_at (const re_string_t *input, Idx idx, int eflags)
927 : {
928 : int c;
929 110 : if (BE (! REG_VALID_INDEX (idx), 0))
930 227 : /* In this case, we use the value stored in input->tip_context,
931 53 : since we can't know the character in input->mbs[-1] here. */
932 53 : return input->tip_context;
933 : if (BE (idx == input->len, 0))
934 174 : return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
935 : : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
936 : #ifdef RE_ENABLE_I18N
937 0 : if (input->mb_cur_max > 1)
938 0 : {
939 : wint_t wc;
940 : Idx wc_idx = idx;
941 : while(input->wcs[wc_idx] == WEOF)
942 : {
943 : #ifdef DEBUG
944 0 : /* It must not happen. */
945 0 : assert (REG_VALID_INDEX (wc_idx));
946 0 : #endif
947 : --wc_idx;
948 0 : if (! REG_VALID_INDEX (wc_idx))
949 0 : return input->tip_context;
950 0 : }
951 0 : wc = input->wcs[wc_idx];
952 0 : if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
953 : return CONTEXT_WORD;
954 : return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
955 : ? CONTEXT_NEWLINE : 0);
956 : }
957 174 : else
958 174 : #endif
959 32 : {
960 142 : c = re_string_byte_at (input, idx);
961 : if (bitset_contain (input->word_char, c))
962 : return CONTEXT_WORD;
963 : return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
964 : }
965 : }
966 :
967 : /* Functions for set operation. */
968 2314 :
969 : static reg_errcode_t
970 2314 : internal_function
971 2314 : re_node_set_alloc (re_node_set *set, Idx size)
972 2314 : {
973 2314 : set->alloc = size;
974 0 : set->nelem = 0;
975 2314 : set->elems = re_malloc (Idx, size);
976 : if (BE (set->elems == NULL, 0))
977 : return REG_ESPACE;
978 : return REG_NOERROR;
979 : }
980 523 :
981 : static reg_errcode_t
982 523 : internal_function
983 523 : re_node_set_init_1 (re_node_set *set, Idx elem)
984 523 : {
985 523 : set->alloc = 1;
986 : set->nelem = 1;
987 0 : set->elems = re_malloc (Idx, 1);
988 0 : if (BE (set->elems == NULL, 0))
989 : {
990 523 : set->alloc = set->nelem = 0;
991 523 : return REG_ESPACE;
992 : }
993 : set->elems[0] = elem;
994 : return REG_NOERROR;
995 : }
996 319 :
997 : static reg_errcode_t
998 319 : internal_function
999 319 : re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1000 319 : {
1001 0 : set->alloc = 2;
1002 319 : set->elems = re_malloc (Idx, 2);
1003 : if (BE (set->elems == NULL, 0))
1004 4 : return REG_ESPACE;
1005 4 : if (elem1 == elem2)
1006 : {
1007 : set->nelem = 1;
1008 : set->elems[0] = elem1;
1009 315 : }
1010 315 : else
1011 : {
1012 314 : set->nelem = 2;
1013 314 : if (elem1 < elem2)
1014 : {
1015 : set->elems[0] = elem1;
1016 : set->elems[1] = elem2;
1017 1 : }
1018 1 : else
1019 : {
1020 : set->elems[0] = elem2;
1021 319 : set->elems[1] = elem1;
1022 : }
1023 : }
1024 : return REG_NOERROR;
1025 : }
1026 945 :
1027 : static reg_errcode_t
1028 945 : internal_function
1029 945 : re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1030 : {
1031 945 : dest->nelem = src->nelem;
1032 945 : if (src->nelem > 0)
1033 945 : {
1034 : dest->alloc = dest->nelem;
1035 0 : dest->elems = re_malloc (Idx, dest->alloc);
1036 0 : if (BE (dest->elems == NULL, 0))
1037 : {
1038 945 : dest->alloc = dest->nelem = 0;
1039 : return REG_ESPACE;
1040 : }
1041 0 : memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1042 945 : }
1043 : else
1044 : re_node_set_init_empty (dest);
1045 : return REG_NOERROR;
1046 : }
1047 :
1048 : /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1049 : DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1050 : Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1051 17 :
1052 : static reg_errcode_t
1053 : internal_function
1054 : re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1055 17 : const re_node_set *src2)
1056 0 : {
1057 : Idx i1, i2, is, id, delta, sbase;
1058 : if (src1->nelem == 0 || src2->nelem == 0)
1059 : return REG_NOERROR;
1060 17 :
1061 : /* We need dest->nelem + 2 * elems_in_intersection; this is a
1062 11 : conservative estimate. */
1063 11 : if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1064 11 : {
1065 0 : Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1066 11 : Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1067 11 : if (BE (new_elems == NULL, 0))
1068 : return REG_ESPACE;
1069 : dest->elems = new_elems;
1070 : dest->alloc = new_alloc;
1071 : }
1072 17 :
1073 17 : /* Find the items in the intersection of SRC1 and SRC2, and copy
1074 17 : into the top of DEST those that are not already in DEST itself. */
1075 17 : sbase = dest->nelem + src1->nelem + src2->nelem;
1076 : i1 = src1->nelem - 1;
1077 : i2 = src2->nelem - 1;
1078 157 : id = dest->nelem - 1;
1079 : for (;;)
1080 : {
1081 139 : if (src1->elems[i1] == src2->elems[i2])
1082 11 : {
1083 : /* Try to find the item in DEST. Maybe we could binary search? */
1084 64 : while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1085 47 : --id;
1086 :
1087 64 : if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1088 : dest->elems[--sbase] = src1->elems[i1];
1089 :
1090 : if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1091 : break;
1092 23 : }
1093 :
1094 0 : /* Lower the highest of the two items. */
1095 0 : else if (src1->elems[i1] < src2->elems[i2])
1096 : {
1097 : if (! REG_VALID_INDEX (--i2))
1098 : break;
1099 23 : }
1100 5 : else
1101 : {
1102 : if (! REG_VALID_INDEX (--i1))
1103 : break;
1104 17 : }
1105 17 : }
1106 17 :
1107 : id = dest->nelem - 1;
1108 : is = dest->nelem + src1->nelem + src2->nelem - 1;
1109 : delta = is - sbase + 1;
1110 :
1111 17 : /* Now copy. When DELTA becomes zero, the remaining
1112 17 : DEST elements are already in place; this is more or
1113 : less the same loop that is in re_node_set_merge. */
1114 : dest->nelem += delta;
1115 11 : if (delta > 0 && REG_VALID_INDEX (id))
1116 : for (;;)
1117 : {
1118 0 : if (dest->elems[is] > dest->elems[id])
1119 0 : {
1120 0 : /* Copy from the top. */
1121 : dest->elems[id + delta--] = dest->elems[is--];
1122 : if (delta == 0)
1123 : break;
1124 : }
1125 11 : else
1126 11 : {
1127 11 : /* Slide from the bottom. */
1128 : dest->elems[id + delta] = dest->elems[id];
1129 : if (! REG_VALID_INDEX (--id))
1130 : break;
1131 : }
1132 17 : }
1133 :
1134 17 : /* Copy remaining SRC elements. */
1135 : memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1136 :
1137 : return REG_NOERROR;
1138 : }
1139 :
1140 : /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1141 : DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1142 0 :
1143 : static reg_errcode_t
1144 : internal_function
1145 : re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1146 0 : const re_node_set *src2)
1147 : {
1148 0 : Idx i1, i2, id;
1149 0 : if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1150 0 : {
1151 0 : dest->alloc = src1->nelem + src2->nelem;
1152 : dest->elems = re_malloc (Idx, dest->alloc);
1153 : if (BE (dest->elems == NULL, 0))
1154 : return REG_ESPACE;
1155 0 : }
1156 0 : else
1157 0 : {
1158 0 : if (src1 != NULL && src1->nelem > 0)
1159 : return re_node_set_init_copy (dest, src1);
1160 0 : else if (src2 != NULL && src2->nelem > 0)
1161 0 : return re_node_set_init_copy (dest, src2);
1162 : else
1163 0 : re_node_set_init_empty (dest);
1164 : return REG_NOERROR;
1165 0 : }
1166 : for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1167 0 : {
1168 0 : if (src1->elems[i1] > src2->elems[i2])
1169 : {
1170 0 : dest->elems[id++] = src2->elems[i2++];
1171 0 : continue;
1172 0 : }
1173 : if (src1->elems[i1] == src2->elems[i2])
1174 0 : ++i2;
1175 : dest->elems[id++] = src1->elems[i1++];
1176 0 : }
1177 0 : if (i1 < src1->nelem)
1178 0 : {
1179 : memcpy (dest->elems + id, src1->elems + i1,
1180 0 : (src1->nelem - i1) * sizeof (Idx));
1181 : id += src1->nelem - i1;
1182 0 : }
1183 0 : else if (i2 < src2->nelem)
1184 0 : {
1185 : memcpy (dest->elems + id, src2->elems + i2,
1186 0 : (src2->nelem - i2) * sizeof (Idx));
1187 0 : id += src2->nelem - i2;
1188 : }
1189 : dest->nelem = id;
1190 : return REG_NOERROR;
1191 : }
1192 :
1193 : /* Calculate the union set of the sets DEST and SRC. And store it to
1194 : DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1195 1239 :
1196 : static reg_errcode_t
1197 : internal_function
1198 1239 : re_node_set_merge (re_node_set *dest, const re_node_set *src)
1199 0 : {
1200 1239 : Idx is, id, sbase, delta;
1201 : if (src == NULL || src->nelem == 0)
1202 566 : return REG_NOERROR;
1203 566 : if (dest->alloc < 2 * src->nelem + dest->nelem)
1204 566 : {
1205 0 : Idx new_alloc = 2 * (src->nelem + dest->alloc);
1206 566 : Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1207 566 : if (BE (new_buffer == NULL, 0))
1208 : return REG_ESPACE;
1209 : dest->elems = new_buffer;
1210 1239 : dest->alloc = new_alloc;
1211 : }
1212 851 :
1213 851 : if (BE (dest->nelem == 0, 0))
1214 851 : {
1215 : dest->nelem = src->nelem;
1216 : memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1217 : return REG_NOERROR;
1218 : }
1219 2793 :
1220 388 : /* Copy into the top of DEST the items of SRC that are not
1221 1629 : found in DEST. Maybe we could binary search in DEST? */
1222 : for (sbase = dest->nelem + 2 * src->nelem,
1223 1629 : is = src->nelem - 1, id = dest->nelem - 1;
1224 5 : REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1225 1624 : {
1226 1039 : if (dest->elems[id] == src->elems[is])
1227 : is--, id--;
1228 585 : else if (dest->elems[id] < src->elems[is])
1229 : dest->elems[--sbase] = src->elems[is--];
1230 : else /* if (dest->elems[id] > src->elems[is]) */
1231 388 : --id;
1232 : }
1233 :
1234 0 : if (REG_VALID_INDEX (is))
1235 0 : {
1236 : /* If DEST is exhausted, the remaining items of SRC must be unique. */
1237 : sbase -= is + 1;
1238 388 : memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1239 388 : }
1240 388 :
1241 388 : id = dest->nelem - 1;
1242 4 : is = dest->nelem + 2 * src->nelem - 1;
1243 : delta = is - sbase + 1;
1244 : if (delta == 0)
1245 : return REG_NOERROR;
1246 384 :
1247 : /* Now copy. When DELTA becomes zero, the remaining
1248 : DEST elements are already in place. */
1249 2862 : dest->nelem += delta;
1250 : for (;;)
1251 : {
1252 1039 : if (dest->elems[is] > dest->elems[id])
1253 1039 : {
1254 384 : /* Copy from the top. */
1255 : dest->elems[id + delta--] = dest->elems[is--];
1256 : if (delta == 0)
1257 : break;
1258 : }
1259 584 : else
1260 584 : {
1261 : /* Slide from the bottom. */
1262 : dest->elems[id + delta] = dest->elems[id];
1263 0 : if (! REG_VALID_INDEX (--id))
1264 : {
1265 0 : /* Copy remaining SRC elements. */
1266 : memcpy (dest->elems, dest->elems + sbase,
1267 : delta * sizeof (Idx));
1268 : break;
1269 : }
1270 384 : }
1271 : }
1272 :
1273 : return REG_NOERROR;
1274 : }
1275 :
1276 : /* Insert the new element ELEM to the re_node_set* SET.
1277 : SET should not already have ELEM.
1278 : Return true if successful. */
1279 2137 :
1280 : static bool
1281 : internal_function
1282 : re_node_set_insert (re_node_set *set, Idx elem)
1283 2137 : {
1284 150 : Idx idx;
1285 : /* In case the set is empty. */
1286 1987 : if (set->alloc == 0)
1287 : return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1288 :
1289 1138 : if (BE (set->nelem, 0) == 0)
1290 1138 : {
1291 1138 : /* We already guaranteed above that set->alloc != 0. */
1292 : set->elems[0] = elem;
1293 : ++set->nelem;
1294 : return true;
1295 849 : }
1296 :
1297 : /* Realloc if we need. */
1298 90 : if (set->alloc == set->nelem)
1299 90 : {
1300 90 : Idx *new_elems;
1301 0 : set->alloc = set->alloc * 2;
1302 90 : new_elems = re_realloc (set->elems, Idx, set->alloc);
1303 : if (BE (new_elems == NULL, 0))
1304 : return false;
1305 : set->elems = new_elems;
1306 : }
1307 849 :
1308 : /* Move the elements which follows the new element. Test the
1309 424 : first element separately to skip a check in the inner loop. */
1310 1977 : if (elem < set->elems[0])
1311 1553 : {
1312 : idx = 0;
1313 : for (idx = set->nelem; idx > 0; idx--)
1314 : set->elems[idx] = set->elems[idx - 1];
1315 1843 : }
1316 1418 : else
1317 : {
1318 : for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1319 : set->elems[idx] = set->elems[idx - 1];
1320 849 : }
1321 849 :
1322 849 : /* Insert the new element. */
1323 : set->elems[idx] = elem;
1324 : ++set->nelem;
1325 : return true;
1326 : }
1327 :
1328 : /* Insert the new element ELEM to the re_node_set* SET.
1329 : SET should not already have any element greater than or equal to ELEM.
1330 : Return true if successful. */
1331 5215 :
1332 : static bool
1333 : internal_function
1334 5215 : re_node_set_insert_last (re_node_set *set, Idx elem)
1335 : {
1336 : /* Realloc if we need. */
1337 2240 : if (set->alloc == set->nelem)
1338 2240 : {
1339 2240 : Idx *new_elems;
1340 0 : set->alloc = (set->alloc + 1) * 2;
1341 2240 : new_elems = re_realloc (set->elems, Idx, set->alloc);
1342 : if (BE (new_elems == NULL, 0))
1343 : return false;
1344 : set->elems = new_elems;
1345 5215 : }
1346 5215 :
1347 : /* Insert the new element. */
1348 : set->elems[set->nelem++] = elem;
1349 : return true;
1350 : }
1351 :
1352 : /* Compare two node sets SET1 and SET2.
1353 : Return true if SET1 and SET2 are equivalent. */
1354 40 :
1355 : static bool
1356 : internal_function __attribute ((pure))
1357 40 : re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1358 0 : {
1359 365 : Idx i;
1360 285 : if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1361 0 : return false;
1362 40 : for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1363 : if (set1->elems[i] != set2->elems[i])
1364 : return false;
1365 : return true;
1366 : }
1367 :
1368 : /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1369 87 :
1370 : static Idx
1371 : internal_function __attribute ((pure))
1372 87 : re_node_set_contains (const re_node_set *set, Idx elem)
1373 0 : {
1374 : __re_size_t idx, right, mid;
1375 : if (! REG_VALID_NONZERO_INDEX (set->nelem))
1376 87 : return 0;
1377 87 :
1378 405 : /* Binary search the element. */
1379 : idx = 0;
1380 231 : right = set->nelem - 1;
1381 231 : while (idx < right)
1382 113 : {
1383 : mid = (idx + right) / 2;
1384 118 : if (set->elems[mid] < elem)
1385 : idx = mid + 1;
1386 87 : else
1387 : right = mid;
1388 : }
1389 : return set->elems[idx] == elem ? idx + 1 : 0;
1390 : }
1391 217 :
1392 : static void
1393 217 : internal_function
1394 0 : re_node_set_remove_at (re_node_set *set, Idx idx)
1395 217 : {
1396 590 : if (idx < 0 || idx >= set->nelem)
1397 373 : return;
1398 : --set->nelem;
1399 : for (; idx < set->nelem; idx++)
1400 : set->elems[idx] = set->elems[idx + 1];
1401 : }
1402 :
1403 :
1404 : /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1405 : Or return REG_MISSING if an error occurred. */
1406 1725 :
1407 : static Idx
1408 1725 : internal_function
1409 : re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1410 30 : {
1411 : if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1412 : {
1413 : size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1414 30 : Idx *new_nexts, *new_indices;
1415 : re_node_set *new_edests, *new_eclosures;
1416 : re_token_t *new_nodes;
1417 : size_t max_object_size =
1418 : MAX (sizeof (re_token_t),
1419 : MAX (sizeof (re_node_set),
1420 30 : sizeof (Idx)));
1421 0 :
1422 : /* Avoid overflows. */
1423 30 : if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1424 30 : return REG_MISSING;
1425 0 :
1426 30 : new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1427 30 : if (BE (new_nodes == NULL, 0))
1428 30 : return REG_MISSING;
1429 30 : dfa->nodes = new_nodes;
1430 30 : new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1431 30 : new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1432 : new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1433 0 : new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1434 30 : if (BE (new_nexts == NULL || new_indices == NULL
1435 30 : || new_edests == NULL || new_eclosures == NULL, 0))
1436 30 : return REG_MISSING;
1437 30 : dfa->nexts = new_nexts;
1438 30 : dfa->org_indices = new_indices;
1439 : dfa->edests = new_edests;
1440 1725 : dfa->eclosures = new_eclosures;
1441 1725 : dfa->nodes_alloc = new_nodes_alloc;
1442 : }
1443 : dfa->nodes[dfa->nodes_len] = token;
1444 1725 : dfa->nodes[dfa->nodes_len].constraint = 0;
1445 3450 : #ifdef RE_ENABLE_I18N
1446 3450 : {
1447 : int type = token.type;
1448 : dfa->nodes[dfa->nodes_len].accept_mb =
1449 1725 : (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1450 1725 : }
1451 1725 : #endif
1452 1725 : dfa->nexts[dfa->nodes_len] = REG_MISSING;
1453 : re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1454 : re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1455 : return dfa->nodes_len++;
1456 : }
1457 536 :
1458 : static inline re_hashval_t
1459 536 : internal_function
1460 : calc_state_hash (const re_node_set *nodes, unsigned int context)
1461 2122 : {
1462 1586 : re_hashval_t hash = nodes->nelem + context;
1463 536 : Idx i;
1464 : for (i = 0 ; i < nodes->nelem ; i++)
1465 : hash += nodes->elems[i];
1466 : return hash;
1467 : }
1468 :
1469 : /* Search for the state whose node_set is equivalent to NODES.
1470 : Return the pointer to the state, if we found it in the DFA.
1471 : Otherwise create the new one and return it. In case of an error
1472 : return NULL and set the error code in ERR.
1473 : Note: - We assume NULL as the invalid state, then it is possible that
1474 : return value is NULL and ERR is REG_NOERROR.
1475 : - We never return non-NULL value in case of any errors, it is for
1476 : optimization. */
1477 34 :
1478 : static re_dfastate_t *
1479 : internal_function
1480 : re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1481 : const re_node_set *nodes)
1482 : {
1483 : re_hashval_t hash;
1484 : re_dfastate_t *new_state;
1485 : struct re_state_table_entry *spot;
1486 : Idx i;
1487 : #ifdef lint
1488 34 : /* Suppress bogus uninitialized-variable warnings. */
1489 : *err = REG_NOERROR;
1490 0 : #endif
1491 0 : if (BE (nodes->nelem == 0, 0))
1492 : {
1493 34 : *err = REG_NOERROR;
1494 34 : return NULL;
1495 : }
1496 39 : hash = calc_state_hash (nodes, 0);
1497 : spot = dfa->state_table + (hash & dfa->state_hash_mask);
1498 17 :
1499 17 : for (i = 0 ; i < spot->num ; i++)
1500 5 : {
1501 12 : re_dfastate_t *state = spot->array[i];
1502 12 : if (hash != state->hash)
1503 : continue;
1504 : if (re_node_set_compare (&state->nodes, nodes))
1505 : return state;
1506 22 : }
1507 22 :
1508 0 : /* There are no appropriate state in the dfa, create the new one. */
1509 : new_state = create_ci_newstate (dfa, nodes, hash);
1510 22 : if (BE (new_state == NULL, 0))
1511 : *err = REG_ESPACE;
1512 :
1513 : return new_state;
1514 : }
1515 :
1516 : /* Search for the state whose node_set is equivalent to NODES and
1517 : whose context is equivalent to CONTEXT.
1518 : Return the pointer to the state, if we found it in the DFA.
1519 : Otherwise create the new one and return it. In case of an error
1520 : return NULL and set the error code in ERR.
1521 : Note: - We assume NULL as the invalid state, then it is possible that
1522 : return value is NULL and ERR is REG_NOERROR.
1523 : - We never return non-NULL value in case of any errors, it is for
1524 : optimization. */
1525 502 :
1526 : static re_dfastate_t *
1527 : internal_function
1528 : re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1529 : const re_node_set *nodes, unsigned int context)
1530 : {
1531 : re_hashval_t hash;
1532 : re_dfastate_t *new_state;
1533 : struct re_state_table_entry *spot;
1534 : Idx i;
1535 : #ifdef lint
1536 502 : /* Suppress bogus uninitialized-variable warnings. */
1537 : *err = REG_NOERROR;
1538 0 : #endif
1539 0 : if (nodes->nelem == 0)
1540 : {
1541 502 : *err = REG_NOERROR;
1542 502 : return NULL;
1543 : }
1544 554 : hash = calc_state_hash (nodes, context);
1545 : spot = dfa->state_table + (hash & dfa->state_hash_mask);
1546 80 :
1547 80 : for (i = 0 ; i < spot->num ; i++)
1548 28 : {
1549 28 : re_dfastate_t *state = spot->array[i];
1550 28 : if (state->hash == hash
1551 : && state->context == context
1552 : && re_node_set_compare (state->entrance_nodes, nodes))
1553 474 : return state;
1554 474 : }
1555 0 : /* There are no appropriate state in `dfa', create the new one. */
1556 : new_state = create_cd_newstate (dfa, nodes, context, hash);
1557 474 : if (BE (new_state == NULL, 0))
1558 : *err = REG_ESPACE;
1559 :
1560 : return new_state;
1561 : }
1562 :
1563 : /* Finish initialization of the new state NEWSTATE, and using its hash value
1564 : HASH put in the appropriate bucket of DFA's state table. Return value
1565 496 : indicates the error code if failed. */
1566 :
1567 : static reg_errcode_t
1568 : register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1569 : re_hashval_t hash)
1570 : {
1571 : struct re_state_table_entry *spot;
1572 496 : reg_errcode_t err;
1573 496 : Idx i;
1574 496 :
1575 0 : newstate->hash = hash;
1576 1580 : err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1577 : if (BE (err != REG_NOERROR, 0))
1578 1084 : return REG_ESPACE;
1579 1084 : for (i = 0; i < newstate->nodes.nelem; i++)
1580 595 : {
1581 0 : Idx elem = newstate->nodes.elems[i];
1582 : if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1583 : if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1584 496 : return REG_ESPACE;
1585 496 : }
1586 :
1587 457 : spot = dfa->state_table + (hash & dfa->state_hash_mask);
1588 457 : if (BE (spot->alloc <= spot->num, 0))
1589 : {
1590 457 : Idx new_alloc = 2 * spot->num + 2;
1591 0 : re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1592 457 : new_alloc);
1593 457 : if (BE (new_array == NULL, 0))
1594 : return REG_ESPACE;
1595 496 : spot->array = new_array;
1596 496 : spot->alloc = new_alloc;
1597 : }
1598 : spot->array[spot->num++] = newstate;
1599 : return REG_NOERROR;
1600 113 : }
1601 :
1602 113 : static void
1603 113 : free_state (re_dfastate_t *state)
1604 113 : {
1605 : re_node_set_free (&state->non_eps_nodes);
1606 53 : re_node_set_free (&state->inveclosure);
1607 53 : if (state->entrance_nodes != &state->nodes)
1608 : {
1609 113 : re_node_set_free (state->entrance_nodes);
1610 113 : re_free (state->entrance_nodes);
1611 113 : }
1612 113 : re_node_set_free (&state->nodes);
1613 113 : re_free (state->word_trtable);
1614 : re_free (state->trtable);
1615 : re_free (state);
1616 : }
1617 :
1618 : /* Create the new state which is independ of contexts.
1619 : Return the new state if succeeded, otherwise return NULL. */
1620 22 :
1621 : static re_dfastate_t *
1622 : internal_function
1623 : create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1624 : re_hashval_t hash)
1625 : {
1626 : Idx i;
1627 22 : reg_errcode_t err;
1628 22 : re_dfastate_t *newstate;
1629 0 :
1630 22 : newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1631 22 : if (BE (newstate == NULL, 0))
1632 : return NULL;
1633 0 : err = re_node_set_init_copy (&newstate->nodes, nodes);
1634 0 : if (BE (err != REG_NOERROR, 0))
1635 : {
1636 : re_free (newstate);
1637 22 : return NULL;
1638 91 : }
1639 :
1640 69 : newstate->entrance_nodes = &newstate->nodes;
1641 69 : for (i = 0 ; i < nodes->nelem ; i++)
1642 69 : {
1643 0 : re_token_t *node = dfa->nodes + nodes->elems[i];
1644 : re_token_type_t type = node->type;
1645 69 : if (type == CHARACTER && !node->constraint)
1646 : continue;
1647 : #ifdef RE_ENABLE_I18N
1648 : newstate->accept_mb |= node->accept_mb;
1649 69 : #endif /* RE_ENABLE_I18N */
1650 12 :
1651 57 : /* If the state has the halt node, the state is a halt state. */
1652 0 : if (type == END_OF_RE)
1653 57 : newstate->halt = 1;
1654 28 : else if (type == OP_BACK_REF)
1655 : newstate->has_backref = 1;
1656 22 : else if (type == ANCHOR || node->constraint)
1657 22 : newstate->has_constraint = 1;
1658 : }
1659 0 : err = register_state (dfa, newstate, hash);
1660 0 : if (BE (err != REG_NOERROR, 0))
1661 : {
1662 22 : free_state (newstate);
1663 : newstate = NULL;
1664 : }
1665 : return newstate;
1666 : }
1667 :
1668 : /* Create the new state which is depend on the context CONTEXT.
1669 : Return the new state if succeeded, otherwise return NULL. */
1670 474 :
1671 : static re_dfastate_t *
1672 : internal_function
1673 474 : create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1674 : unsigned int context, re_hashval_t hash)
1675 : {
1676 : Idx i, nctx_nodes = 0;
1677 474 : reg_errcode_t err;
1678 474 : re_dfastate_t *newstate;
1679 0 :
1680 474 : newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1681 474 : if (BE (newstate == NULL, 0))
1682 : return NULL;
1683 0 : err = re_node_set_init_copy (&newstate->nodes, nodes);
1684 0 : if (BE (err != REG_NOERROR, 0))
1685 : {
1686 : re_free (newstate);
1687 474 : return NULL;
1688 474 : }
1689 :
1690 1706 : newstate->context = context;
1691 : newstate->entrance_nodes = &newstate->nodes;
1692 1232 :
1693 1232 : for (i = 0 ; i < nodes->nelem ; i++)
1694 1232 : {
1695 1232 : unsigned int constraint = 0;
1696 361 : re_token_t *node = dfa->nodes + nodes->elems[i];
1697 : re_token_type_t type = node->type;
1698 1232 : if (node->constraint)
1699 144 : constraint = node->constraint;
1700 :
1701 1088 : if (type == CHARACTER && !constraint)
1702 : continue;
1703 : #ifdef RE_ENABLE_I18N
1704 : newstate->accept_mb |= node->accept_mb;
1705 1088 : #endif /* RE_ENABLE_I18N */
1706 346 :
1707 742 : /* If the state has the halt node, the state is a halt state. */
1708 0 : if (type == END_OF_RE)
1709 742 : newstate->halt = 1;
1710 273 : else if (type == OP_BACK_REF)
1711 : newstate->has_backref = 1;
1712 1088 : else if (type == ANCHOR)
1713 : constraint = node->opr.ctx_type;
1714 626 :
1715 : if (constraint)
1716 205 : {
1717 205 : if (newstate->entrance_nodes == &newstate->nodes)
1718 : {
1719 0 : newstate->entrance_nodes = re_malloc (re_node_set, 1);
1720 0 : if (BE (newstate->entrance_nodes == NULL, 0))
1721 : {
1722 205 : free_state (newstate);
1723 205 : return NULL;
1724 205 : }
1725 : re_node_set_init_copy (newstate->entrance_nodes, nodes);
1726 : nctx_nodes = 0;
1727 626 : newstate->has_constraint = 1;
1728 : }
1729 217 :
1730 217 : if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1731 : {
1732 : re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1733 : ++nctx_nodes;
1734 474 : }
1735 474 : }
1736 : }
1737 0 : err = register_state (dfa, newstate, hash);
1738 0 : if (BE (err != REG_NOERROR, 0))
1739 : {
1740 474 : free_state (newstate);
1741 : newstate = NULL;
1742 : }
1743 : return newstate;
1744 : }
|