gkregex.c
Go to the documentation of this file.
1 /* Extended regular expression matching and search library.
2  Copyright (C) 2002, 2003, 2005 Free Software Foundation, Inc.
3  This file is part of the GNU C Library.
4  Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6  The GNU C Library is free software; you can redistribute it and/or
7  modify it under the terms of the GNU Lesser General Public
8  License as published by the Free Software Foundation; either
9  version 2.1 of the License, or (at your option) any later version.
10 
11  The GNU C Library is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  Lesser General Public License for more details.
15 
16  You should have received a copy of the GNU Lesser General Public
17  License along with the GNU C Library; if not, write to the Free
18  Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19  02111-1307 USA. */
20 
/* Dummy external definition whose only purpose is to silence a compiler
   warning (presumably the "empty translation unit" diagnostic issued when
   USE_GKREGEX is not defined -- confirm against the build).  Declared with
   an explicit (void) parameter list so the definition is a real prototype
   and does not itself trigger strict-prototype warnings.  */
void gkfooo(void) { return; }
23 
24 #ifdef USE_GKREGEX
25 
26 #ifdef HAVE_CONFIG_H
27 #include "config.h"
28 #endif
29 
30 #ifdef _LIBC
31 /* We have to keep the namespace clean. */
32 # define regfree(preg) __regfree (preg)
33 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
34 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
35 # define regerror(errcode, preg, errbuf, errbuf_size) \
36  __regerror(errcode, preg, errbuf, errbuf_size)
37 # define re_set_registers(bu, re, nu, st, en) \
38  __re_set_registers (bu, re, nu, st, en)
39 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
40  __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
41 # define re_match(bufp, string, size, pos, regs) \
42  __re_match (bufp, string, size, pos, regs)
43 # define re_search(bufp, string, size, startpos, range, regs) \
44  __re_search (bufp, string, size, startpos, range, regs)
45 # define re_compile_pattern(pattern, length, bufp) \
46  __re_compile_pattern (pattern, length, bufp)
47 # define re_set_syntax(syntax) __re_set_syntax (syntax)
48 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
49  __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
50 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
51 
52 # include "../locale/localeinfo.h"
53 #endif
54 
55 #include "GKlib.h"
56 
57 
58 /******************************************************************************/
59 /******************************************************************************/
60 /******************************************************************************/
61 /* GKINCLUDE #include "regex_internal.h" */
62 /******************************************************************************/
63 /******************************************************************************/
64 /******************************************************************************/
65 /* Extended regular expression matching and search library.
66  Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
67  This file is part of the GNU C Library.
68  Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
69 
70  The GNU C Library is free software; you can redistribute it and/or
71  modify it under the terms of the GNU Lesser General Public
72  License as published by the Free Software Foundation; either
73  version 2.1 of the License, or (at your option) any later version.
74 
75  The GNU C Library is distributed in the hope that it will be useful,
76  but WITHOUT ANY WARRANTY; without even the implied warranty of
77  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
78  Lesser General Public License for more details.
79 
80  You should have received a copy of the GNU Lesser General Public
81  License along with the GNU C Library; if not, write to the Free
82  Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
83  02111-1307 USA. */
84 
85 #ifndef _REGEX_INTERNAL_H
86 #define _REGEX_INTERNAL_H 1
87 
88 #include <assert.h>
89 #include <ctype.h>
90 #include <stdio.h>
91 #include <stdlib.h>
92 #include <string.h>
93 
94 #if defined(__MINGW32_VERSION) || defined(_MSC_VER)
95 #define strcasecmp stricmp
96 #endif
97 
98 #if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
99 # include <langinfo.h>
100 #endif
101 #if defined HAVE_LOCALE_H || defined _LIBC
102 # include <locale.h>
103 #endif
104 #if defined HAVE_WCHAR_H || defined _LIBC
105 # include <wchar.h>
106 #endif /* HAVE_WCHAR_H || _LIBC */
107 #if defined HAVE_WCTYPE_H || defined _LIBC
108 # include <wctype.h>
109 #endif /* HAVE_WCTYPE_H || _LIBC */
110 #if defined HAVE_STDBOOL_H || defined _LIBC
111 # include <stdbool.h>
112 #else
113 typedef enum { false, true } bool;
114 #endif /* HAVE_STDBOOL_H || _LIBC */
115 #if defined HAVE_STDINT_H || defined _LIBC
116 # include <stdint.h>
117 #endif /* HAVE_STDINT_H || _LIBC */
118 #if defined _LIBC
119 # include <bits/libc-lock.h>
120 #else
121 # define __libc_lock_define(CLASS,NAME)
122 # define __libc_lock_init(NAME) do { } while (0)
123 # define __libc_lock_lock(NAME) do { } while (0)
124 # define __libc_lock_unlock(NAME) do { } while (0)
125 #endif
126 
127 /* In case that the system doesn't have isblank(). */
128 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
129 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
130 #endif
131 
132 #ifdef _LIBC
133 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
134 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
135 # include <locale/localeinfo.h>
136 # include <locale/elem-hash.h>
137 # include <locale/coll-lookup.h>
138 # endif
139 #endif
140 
141 /* This is for other GNU distributions with internationalized messages. */
142 #if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
143 # include <libintl.h>
144 # ifdef _LIBC
145 # undef gettext
146 # define gettext(msgid) \
147  INTUSE(__dcgettext) (_libc_intl_domainname, msgid, LC_MESSAGES)
148 # endif
149 #else
150 # define gettext(msgid) (msgid)
151 #endif
152 
153 #ifndef gettext_noop
154 /* This define is so xgettext can find the internationalizable
155  strings. */
156 # define gettext_noop(String) String
157 #endif
158 
159 /* For systems whose headers do not provide SIZE_MAX. */
160 #ifndef SIZE_MAX
161 # define SIZE_MAX ((size_t) -1)
162 #endif
163 
164 #if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC
165 # define RE_ENABLE_I18N
166 #endif
167 
168 #if __GNUC__ >= 3
169 # define BE(expr, val) __builtin_expect (expr, val)
170 #else
171 # define BE(expr, val) (expr)
172 # define inline
173 #endif
174 
175 /* Number of distinct single-byte characters. */
176 #define SBC_MAX 256
177 
178 #define COLL_ELEM_LEN_MAX 8
179 
180 /* The character which represents newline. */
181 #define NEWLINE_CHAR '\n'
182 #define WIDE_NEWLINE_CHAR L'\n'
183 
184 /* Rename to standard API for using out of glibc. */
185 #ifndef _LIBC
186 # define __wctype wctype
187 # define __iswctype iswctype
188 # define __btowc btowc
189 # define __mempcpy mempcpy
190 # define __wcrtomb wcrtomb
191 # define __regfree regfree
192 # define attribute_hidden
193 #endif /* not _LIBC */
194 
195 #ifdef __GNUC__
196 # define __attribute(arg) __attribute__ (arg)
197 #else
198 # define __attribute(arg)
199 #endif
200 
201 extern const char __re_error_msgid[] attribute_hidden;
202 extern const size_t __re_error_msgid_idx[] attribute_hidden;
203 
204 /* An integer used to represent a set of bits. It must be unsigned,
205  and must be at least as wide as unsigned int. */
206 typedef unsigned long int bitset_word_t;
207 /* All bits set in a bitset_word_t. */
208 #define BITSET_WORD_MAX ULONG_MAX
209 /* Number of bits in a bitset_word_t. */
210 #define BITSET_WORD_BITS (sizeof (bitset_word_t) * CHAR_BIT)
211 /* Number of bitset_word_t in a bit_set. */
212 #define BITSET_WORDS (SBC_MAX / BITSET_WORD_BITS)
213 typedef bitset_word_t bitset_t[BITSET_WORDS];
214 typedef bitset_word_t *re_bitset_ptr_t;
215 typedef const bitset_word_t *re_const_bitset_ptr_t;
216 
/* Single-bit accessors for a bitset.  SET is an array of bitset_word_t and
   I is a bit index: word I / BITSET_WORD_BITS, bit I % BITSET_WORD_BITS.
   Both arguments are fully parenthesized so that an expression argument
   (e.g. `base + off') addresses the intended word and bit.  Note that I is
   evaluated twice, so it must not have side effects.  */
#define bitset_set(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   |= ((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
#define bitset_clear(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   &= ~((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))
/* Non-zero (not necessarily 1) when bit I is set.  */
#define bitset_contain(set,i) \
  ((set)[(i) / BITSET_WORD_BITS] \
   & ((bitset_word_t) 1 << ((i) % BITSET_WORD_BITS)))

/* Whole-set operations; each one touches exactly sizeof (bitset_t) bytes.  */
#define bitset_empty(set) memset ((set), '\0', sizeof (bitset_t))
#define bitset_set_all(set) memset ((set), '\xff', sizeof (bitset_t))
#define bitset_copy(dest,src) memcpy ((dest), (src), sizeof (bitset_t))
226 
227 #define PREV_WORD_CONSTRAINT 0x0001
228 #define PREV_NOTWORD_CONSTRAINT 0x0002
229 #define NEXT_WORD_CONSTRAINT 0x0004
230 #define NEXT_NOTWORD_CONSTRAINT 0x0008
231 #define PREV_NEWLINE_CONSTRAINT 0x0010
232 #define NEXT_NEWLINE_CONSTRAINT 0x0020
233 #define PREV_BEGBUF_CONSTRAINT 0x0040
234 #define NEXT_ENDBUF_CONSTRAINT 0x0080
235 #define WORD_DELIM_CONSTRAINT 0x0100
236 #define NOT_WORD_DELIM_CONSTRAINT 0x0200
237 
/* Named context types, each defined as one of the *_CONSTRAINT flag bits
   (or the bitwise OR of a PREV_* and a NEXT_* bit) declared just above.
   The four word-related values combine a "previous" and a "next"
   constraint; the remaining values are aliases for a single flag bit.
   re_token_t stores a value of this type in opr.ctx_type for ANCHOR
   tokens.  */
238 typedef enum
239 {
240  INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
241  WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
242  WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
243  INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
244  LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
245  LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
246  BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
247  BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
248  WORD_DELIM = WORD_DELIM_CONSTRAINT,
249  NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT
250 } re_context_type;
251 
/* A set of node indices stored in a heap array.
   NELEM is the number of elements currently in the set (the
   re_node_set_empty macro resets it to 0) and ELEMS is the array itself
   (released by the re_node_set_free macro).  ALLOC is presumably the
   number of slots allocated for ELEMS -- confirm against the
   re_node_set_* functions, which are not visible here.  An all-zero
   object is the initial state produced by re_node_set_init_empty.  */
252 typedef struct
253 {
254  int alloc;
255  int nelem;
256  int *elems;
257 } re_node_set;
258 
259 typedef enum
260 {
261  NON_TYPE = 0,
262 
263  /* Node type, These are used by token, node, tree. */
264  CHARACTER = 1,
265  END_OF_RE = 2,
266  SIMPLE_BRACKET = 3,
267  OP_BACK_REF = 4,
268  OP_PERIOD = 5,
269 #ifdef RE_ENABLE_I18N
270  COMPLEX_BRACKET = 6,
271  OP_UTF8_PERIOD = 7,
272 #endif /* RE_ENABLE_I18N */
273 
274  /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used
275  when the debugger shows values of this enum type. */
276 #define EPSILON_BIT 8
277  OP_OPEN_SUBEXP = EPSILON_BIT | 0,
278  OP_CLOSE_SUBEXP = EPSILON_BIT | 1,
279  OP_ALT = EPSILON_BIT | 2,
280  OP_DUP_ASTERISK = EPSILON_BIT | 3,
281  ANCHOR = EPSILON_BIT | 4,
282 
283  /* Tree type, these are used only by tree. */
284  CONCAT = 16,
285  SUBEXP = 17,
286 
287  /* Token type, these are used only by token. */
288  OP_DUP_PLUS = 18,
289  OP_DUP_QUESTION,
290  OP_OPEN_BRACKET,
291  OP_CLOSE_BRACKET,
292  OP_CHARSET_RANGE,
293  OP_OPEN_DUP_NUM,
294  OP_CLOSE_DUP_NUM,
295  OP_NON_MATCH_LIST,
296  OP_OPEN_COLL_ELEM,
297  OP_CLOSE_COLL_ELEM,
298  OP_OPEN_EQUIV_CLASS,
299  OP_CLOSE_EQUIV_CLASS,
300  OP_OPEN_CHAR_CLASS,
301  OP_CLOSE_CHAR_CLASS,
302  OP_WORD,
303  OP_NOTWORD,
304  OP_SPACE,
305  OP_NOTSPACE,
306  BACK_SLASH
307 
308 } re_token_type_t;
309 
310 #ifdef RE_ENABLE_I18N
311 typedef struct
312 {
313  /* Multibyte characters. */
314  wchar_t *mbchars;
315 
316  /* Collating symbols. */
317 # ifdef _LIBC
318  int32_t *coll_syms;
319 # endif
320 
321  /* Equivalence classes. */
322 # ifdef _LIBC
323  int32_t *equiv_classes;
324 # endif
325 
326  /* Range expressions. */
327 # ifdef _LIBC
328  uint32_t *range_starts;
329  uint32_t *range_ends;
330 # else /* not _LIBC */
331  wchar_t *range_starts;
332  wchar_t *range_ends;
333 # endif /* not _LIBC */
334 
335  /* Character classes. */
336  wctype_t *char_classes;
337 
338  /* If this character set is the non-matching list. */
339  unsigned int non_match : 1;
340 
341  /* # of multibyte characters. */
342  int nmbchars;
343 
344  /* # of collating symbols. */
345  int ncoll_syms;
346 
347  /* # of equivalence classes. */
348  int nequiv_classes;
349 
350  /* # of range expressions. */
351  int nranges;
352 
353  /* # of character classes. */
354  int nchar_classes;
355 } re_charset_t;
356 #endif /* RE_ENABLE_I18N */
357 
358 typedef struct
359 {
360  union
361  {
362  unsigned char c; /* for CHARACTER */
363  re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */
364 #ifdef RE_ENABLE_I18N
365  re_charset_t *mbcset; /* for COMPLEX_BRACKET */
366 #endif /* RE_ENABLE_I18N */
367  int idx; /* for BACK_REF */
368  re_context_type ctx_type; /* for ANCHOR */
369  } opr;
370 #if __GNUC__ >= 2
371  re_token_type_t type : 8;
372 #else
373  re_token_type_t type;
374 #endif
375  unsigned int constraint : 10; /* context constraint */
376  unsigned int duplicated : 1;
377  unsigned int opt_subexp : 1;
378 #ifdef RE_ENABLE_I18N
379  unsigned int accept_mb : 1;
380  /* These 2 bits can be moved into the union if needed (e.g. if running out
381  of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */
382  unsigned int mb_partial : 1;
383 #endif
384  unsigned int word_char : 1;
385 } re_token_t;
386 
387 #define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
388 
389 struct re_string_t
390 {
391  /* Indicate the raw buffer which is the original string passed as an
392  argument of regexec(), re_search(), etc.. */
393  const unsigned char *raw_mbs;
394  /* Store the multibyte string. In case of "case insensitive mode" like
395  REG_ICASE, upper cases of the string are stored, otherwise MBS points
396  the same address that RAW_MBS points. */
397  unsigned char *mbs;
398 #ifdef RE_ENABLE_I18N
399  /* Store the wide character string which is corresponding to MBS. */
400  wint_t *wcs;
401  int *offsets;
402  mbstate_t cur_state;
403 #endif
404  /* Index in RAW_MBS. Each character mbs[i] corresponds to
405  raw_mbs[raw_mbs_idx + i]. */
406  int raw_mbs_idx;
407  /* The length of the valid characters in the buffers. */
408  int valid_len;
409  /* The corresponding number of bytes in raw_mbs array. */
410  int valid_raw_len;
411  /* The length of the buffers MBS and WCS. */
412  int bufs_len;
413  /* The index in MBS, which is updated by re_string_fetch_byte. */
414  int cur_idx;
415  /* length of RAW_MBS array. */
416  int raw_len;
417  /* This is RAW_LEN - RAW_MBS_IDX + VALID_LEN - VALID_RAW_LEN. */
418  int len;
419  /* End of the buffer may be shorter than its length in the cases such
420  as re_match_2, re_search_2. Then, we use STOP for end of the buffer
421  instead of LEN. */
422  int raw_stop;
423  /* This is RAW_STOP - RAW_MBS_IDX adjusted through OFFSETS. */
424  int stop;
425 
426  /* The context of mbs[0]. We store the context independently, since
427  the context of mbs[0] may be different from raw_mbs[0], which is
428  the beginning of the input string. */
429  unsigned int tip_context;
430  /* The translation passed as a part of an argument of re_compile_pattern. */
432  /* Copy of re_dfa_t's word_char. */
433  re_const_bitset_ptr_t word_char;
434  /* 1 if REG_ICASE. */
435  unsigned char icase;
436  unsigned char is_utf8;
437  unsigned char map_notascii;
438  unsigned char mbs_allocated;
439  unsigned char offsets_needed;
440  unsigned char newline_anchor;
441  unsigned char word_ops_used;
442  int mb_cur_max;
443 };
444 typedef struct re_string_t re_string_t;
445 
446 
447 struct re_dfa_t;
448 typedef struct re_dfa_t re_dfa_t;
449 
450 #ifndef _LIBC
451 # ifdef __i386__
452 # define internal_function __attribute ((regparm (3), stdcall))
453 # else
454 # define internal_function
455 # endif
456 #endif
457 
458 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
459  int new_buf_len)
460  internal_function;
461 #ifdef RE_ENABLE_I18N
462 static void build_wcs_buffer (re_string_t *pstr) internal_function;
463 static int build_wcs_upper_buffer (re_string_t *pstr) internal_function;
464 #endif /* RE_ENABLE_I18N */
465 static void build_upper_buffer (re_string_t *pstr) internal_function;
466 static void re_string_translate_buffer (re_string_t *pstr) internal_function;
467 static unsigned int re_string_context_at (const re_string_t *input, int idx,
468  int eflags)
469  internal_function __attribute ((pure));
/* Accessor macros for re_string_t.  PSTR is a pointer to the string
   object; IDX / OFFSET are byte positions in its MBS buffer.  None of
   them performs bounds checking.  */

/* Byte at OFFSET bytes past the current index, without advancing.
   OFFSET is parenthesized so that an expression argument (for example a
   conditional expression) is added as a whole.  */
#define re_string_peek_byte(pstr, offset) \
  ((pstr)->mbs[(pstr)->cur_idx + (offset)])
/* Byte at the current index; advances the index by one.  */
#define re_string_fetch_byte(pstr) \
  ((pstr)->mbs[(pstr)->cur_idx++])
/* Non-zero when IDX is the end of the valid data or the WCS slot at IDX
   is not the WEOF padding value.  */
#define re_string_first_byte(pstr, idx) \
  ((idx) == (pstr)->valid_len || (pstr)->wcs[idx] != WEOF)
/* Non-zero when the WCS slot at IDX is not WEOF and the following slot is
   either past the valid data or not WEOF as well.  */
#define re_string_is_single_byte_char(pstr, idx) \
  ((pstr)->wcs[idx] != WEOF && ((pstr)->valid_len == (idx) + 1 \
				|| (pstr)->wcs[(idx) + 1] != WEOF))
/* Non-zero once the current index has reached STOP.  */
#define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
#define re_string_cur_idx(pstr) ((pstr)->cur_idx)
#define re_string_get_buffer(pstr) ((pstr)->mbs)
#define re_string_length(pstr) ((pstr)->len)
#define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx])
/* Move the current index forward by IDX bytes / set it to IDX.  */
#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
486 
487 #ifdef __GNUC__
488 # define alloca(size) __builtin_alloca (size)
489 # define HAVE_ALLOCA 1
490 #elif defined(_MSC_VER)
491 # include <malloc.h>
492 # define alloca _alloca
493 # define HAVE_ALLOCA 1
494 #else
495 # error No alloca()
496 #endif
497 
498 #ifndef _LIBC
499 # if HAVE_ALLOCA
500 /* The OS usually guarantees only one guard page at the bottom of the stack,
501  and a page size can be as small as 4096 bytes. So we cannot safely
502  allocate anything larger than 4096 bytes. Also care for the possibility
503  of a few compiler-allocated temporary stack slots. */
504 # define __libc_use_alloca(n) ((n) < 4032)
505 # else
506 /* alloca is implemented with malloc, so just use malloc. */
507 # define __libc_use_alloca(n) 0
508 # endif
509 #endif
510 
511 #define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
512 #define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
513 #define re_free(p) free (p)
514 
515 struct bin_tree_t
516 {
517  struct bin_tree_t *parent;
518  struct bin_tree_t *left;
519  struct bin_tree_t *right;
520  struct bin_tree_t *first;
521  struct bin_tree_t *next;
522 
523  re_token_t token;
524 
525  /* `node_idx' is the index in dfa->nodes, if `type' == 0.
526  Otherwise `type' indicate the type of this node. */
527  int node_idx;
528 };
529 typedef struct bin_tree_t bin_tree_t;
530 
/* Number of bin_tree_t objects per storage chunk: as many as fit in
   1024 bytes once room for one pointer has been set aside.  */
531 #define BIN_TREE_STORAGE_SIZE \
532  ((1024 - sizeof (void *)) / sizeof (bin_tree_t))
533 
/* One chunk of bin_tree_t nodes.  Chunks are chained through NEXT, which
   is the pointer accounted for in BIN_TREE_STORAGE_SIZE; DATA holds the
   nodes themselves.  */
534 struct bin_tree_storage_t
535 {
536  struct bin_tree_storage_t *next;
537  bin_tree_t data[BIN_TREE_STORAGE_SIZE];
538 };
539 typedef struct bin_tree_storage_t bin_tree_storage_t;
540 
/* Context flag bits; a context value is a bitwise OR of these, and 0 is
   the "ordinary" context.  */
#define CONTEXT_WORD 1
#define CONTEXT_NEWLINE (CONTEXT_WORD << 1)
#define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)
#define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)

/* Tests on a context value C.  The IS_*_CONTEXT forms yield the masked
   bit (non-zero, not necessarily 1) when it is set.  */
#define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
#define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
#define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
#define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
#define IS_ORDINARY_CONTEXT(c) ((c) == 0)

/* Character classification used for word and line boundaries.  */
#define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
#define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
#define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_')
#define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)

/* Non-zero when CONTEXT violates at least one PREV_* bit requested in
   CONSTRAINT.  CONSTRAINT is parenthesized in every term (as in the NEXT
   variant below) so that an expression argument such as `a | b' is
   masked as a whole instead of being split by operator precedence.  */
#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
 ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context))\
  || (((constraint) & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))

/* Non-zero when CONTEXT violates at least one NEXT_* bit requested in
   CONSTRAINT.  */
#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
 ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
  || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))
568 
569 struct re_dfastate_t
570 {
571  unsigned int hash;
572  re_node_set nodes;
573  re_node_set non_eps_nodes;
574  re_node_set inveclosure;
575  re_node_set *entrance_nodes;
576  struct re_dfastate_t **trtable, **word_trtable;
577  unsigned int context : 4;
578  unsigned int halt : 1;
579  /* If this state can accept `multi byte'.
580  Note that we refer to multibyte characters, and multi character
581  collating elements as `multi byte'. */
582  unsigned int accept_mb : 1;
583  /* If this state has backreference node(s). */
584  unsigned int has_backref : 1;
585  unsigned int has_constraint : 1;
586 };
587 typedef struct re_dfastate_t re_dfastate_t;
588 
589 struct re_state_table_entry
590 {
591  int num;
592  int alloc;
593  re_dfastate_t **array;
594 };
595 
596 /* Array type used in re_sub_match_last_t and re_sub_match_top_t. */
597 
598 typedef struct
599 {
600  int next_idx;
601  int alloc;
602  re_dfastate_t **array;
603 } state_array_t;
604 
605 /* Store information about the node NODE whose type is OP_CLOSE_SUBEXP. */
606 
607 typedef struct
608 {
609  int node;
610  int str_idx; /* The position NODE match at. */
611  state_array_t path;
612 } re_sub_match_last_t;
613 
614 /* Store information about the node NODE whose type is OP_OPEN_SUBEXP.
615  And information about the node, whose type is OP_CLOSE_SUBEXP,
616  corresponding to NODE is stored in LASTS. */
617 
618 typedef struct
619 {
620  int str_idx;
621  int node;
622  state_array_t *path;
623  int alasts; /* Allocation size of LASTS. */
624  int nlasts; /* The number of LASTS. */
625  re_sub_match_last_t **lasts;
626 } re_sub_match_top_t;
627 
628 struct re_backref_cache_entry
629 {
630  int node;
631  int str_idx;
632  int subexp_from;
633  int subexp_to;
634  char more;
635  char unused;
636  unsigned short int eps_reachable_subexps_map;
637 };
638 
639 typedef struct
640 {
641  /* The string object corresponding to the input string. */
642  re_string_t input;
643 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
644  const re_dfa_t *const dfa;
645 #else
646  const re_dfa_t *dfa;
647 #endif
648  /* EFLAGS of the argument of regexec. */
649  int eflags;
650  /* Where the matching ends. */
651  int match_last;
652  int last_node;
653  /* The state log used by the matcher. */
654  re_dfastate_t **state_log;
655  int state_log_top;
656  /* Back reference cache. */
657  int nbkref_ents;
658  int abkref_ents;
659  struct re_backref_cache_entry *bkref_ents;
660  int max_mb_elem_len;
661  int nsub_tops;
662  int asub_tops;
663  re_sub_match_top_t **sub_tops;
664 } re_match_context_t;
665 
666 typedef struct
667 {
668  re_dfastate_t **sifted_states;
669  re_dfastate_t **limited_states;
670  int last_node;
671  int last_str_idx;
672  re_node_set limits;
673 } re_sift_context_t;
674 
675 struct re_fail_stack_ent_t
676 {
677  int idx;
678  int node;
679  regmatch_t *regs;
680  re_node_set eps_via_nodes;
681 };
682 
683 struct re_fail_stack_t
684 {
685  int num;
686  int alloc;
687  struct re_fail_stack_ent_t *stack;
688 };
689 
690 struct re_dfa_t
691 {
692  re_token_t *nodes;
693  size_t nodes_alloc;
694  size_t nodes_len;
695  int *nexts;
696  int *org_indices;
697  re_node_set *edests;
698  re_node_set *eclosures;
699  re_node_set *inveclosures;
700  struct re_state_table_entry *state_table;
701  re_dfastate_t *init_state;
702  re_dfastate_t *init_state_word;
703  re_dfastate_t *init_state_nl;
704  re_dfastate_t *init_state_begbuf;
705  bin_tree_t *str_tree;
706  bin_tree_storage_t *str_tree_storage;
707  re_bitset_ptr_t sb_char;
708  int str_tree_storage_idx;
709 
710  /* number of subexpressions `re_nsub' is in regex_t. */
711  unsigned int state_hash_mask;
712  int init_node;
713  int nbackref; /* The number of backreference in this dfa. */
714 
715  /* Bitmap expressing which backreference is used. */
716  bitset_word_t used_bkref_map;
717  bitset_word_t completed_bkref_map;
718 
719  unsigned int has_plural_match : 1;
720  /* If this dfa has "multibyte node", which is a backreference or
721  a node which can accept multibyte character or multi character
722  collating element. */
723  unsigned int has_mb_node : 1;
724  unsigned int is_utf8 : 1;
725  unsigned int map_notascii : 1;
726  unsigned int word_ops_used : 1;
727  int mb_cur_max;
728  bitset_t word_char;
729  reg_syntax_t syntax;
730  int *subexp_map;
731 #ifdef DEBUG
732  char* re_str;
733 #endif
734  __libc_lock_define (, lock)
735 };
736 
737 #define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
738 #define re_node_set_remove(set,id) \
739  (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
740 #define re_node_set_empty(p) ((p)->nelem = 0)
741 #define re_node_set_free(set) re_free ((set)->elems)
742 
743 
744 typedef enum
745 {
746  SB_CHAR,
747  MB_CHAR,
748  EQUIV_CLASS,
749  COLL_SYM,
750  CHAR_CLASS
751 } bracket_elem_type;
752 
753 typedef struct
754 {
755  bracket_elem_type type;
756  union
757  {
758  unsigned char ch;
759  unsigned char *name;
760  wchar_t wch;
761  } opr;
762 } bracket_elem_t;
763 
764 
765 /* Inline functions for bitset operation. */
766 static inline void
767 bitset_not (bitset_t set)
768 {
769  int bitset_i;
770  for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
771  set[bitset_i] = ~set[bitset_i];
772 }
773 
774 static inline void
775 bitset_merge (bitset_t dest, const bitset_t src)
776 {
777  int bitset_i;
778  for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
779  dest[bitset_i] |= src[bitset_i];
780 }
781 
782 static inline void
783 bitset_mask (bitset_t dest, const bitset_t src)
784 {
785  int bitset_i;
786  for (bitset_i = 0; bitset_i < BITSET_WORDS; ++bitset_i)
787  dest[bitset_i] &= src[bitset_i];
788 }
789 
790 #ifdef RE_ENABLE_I18N
791 /* Inline functions for re_string. */
792 static inline int
793 internal_function __attribute ((pure))
794 re_string_char_size_at (const re_string_t *pstr, int idx)
795 {
796  int byte_idx;
797  if (pstr->mb_cur_max == 1)
798  return 1;
799  for (byte_idx = 1; idx + byte_idx < pstr->valid_len; ++byte_idx)
800  if (pstr->wcs[idx + byte_idx] != WEOF)
801  break;
802  return byte_idx;
803 }
804 
805 static inline wint_t
806 internal_function __attribute ((pure))
807 re_string_wchar_at (const re_string_t *pstr, int idx)
808 {
809  if (pstr->mb_cur_max == 1)
810  return (wint_t) pstr->mbs[idx];
811  return (wint_t) pstr->wcs[idx];
812 }
813 
/* Return the size in bytes of the element that starts at byte index IDX
   of PSTR->mbs.  Outside of _LIBC builds the result is always 1.  In
   _LIBC builds with a non-zero LC_COLLATE rule count, the result is the
   distance by which findidx moved the pointer P.  */
814 static int
815 internal_function __attribute ((pure))
816 re_string_elem_size_at (const re_string_t *pstr, int idx)
817 {
818 # ifdef _LIBC
819  const unsigned char *p, *extra;
820  const int32_t *table, *indirect;
821  int32_t tmp;
 /* NOTE(review): findidx is presumably defined by this header and reads
    the locals TABLE, EXTRA and INDIRECT declared above, which is why the
    #include sits inside the function body -- confirm against weight.h.  */
822 # include <locale/weight.h>
823  uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
824 
825  if (nrules != 0)
826  {
827  table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
828  extra = (const unsigned char *)
829  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
830  indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
831  _NL_COLLATE_INDIRECTMB);
832  p = pstr->mbs + idx;
 /* The value returned by findidx is stored in TMP but not used here;
    only the updated position of P matters.  */
833  tmp = findidx (&p);
834  return p - pstr->mbs - idx;
835  }
836  else
837 # endif /* _LIBC */
 /* Reached as the `else' branch in _LIBC builds and unconditionally
    otherwise.  */
838  return 1;
839 }
840 #endif /* RE_ENABLE_I18N */
841 
842 #endif /* _REGEX_INTERNAL_H */
843 
844 /******************************************************************************/
845 /******************************************************************************/
846 /******************************************************************************/
847 /* GKINCLUDE #include "regex_internal.c" */
848 /******************************************************************************/
849 /******************************************************************************/
850 /******************************************************************************/
851 /* Extended regular expression matching and search library.
852  Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
853  This file is part of the GNU C Library.
854  Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
855 
856  The GNU C Library is free software; you can redistribute it and/or
857  modify it under the terms of the GNU Lesser General Public
858  License as published by the Free Software Foundation; either
859  version 2.1 of the License, or (at your option) any later version.
860 
861  The GNU C Library is distributed in the hope that it will be useful,
862  but WITHOUT ANY WARRANTY; without even the implied warranty of
863  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
864  Lesser General Public License for more details.
865 
866  You should have received a copy of the GNU Lesser General Public
867  License along with the GNU C Library; if not, write to the Free
868  Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
869  02111-1307 USA. */
870 
871 static void re_string_construct_common (const char *str, int len,
872  re_string_t *pstr,
873  RE_TRANSLATE_TYPE trans, int icase,
874  const re_dfa_t *dfa) internal_function;
875 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
876  const re_node_set *nodes,
877  unsigned int hash) internal_function;
878 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
879  const re_node_set *nodes,
880  unsigned int context,
881  unsigned int hash) internal_function;
882 
883 /* Functions for string operation. */
884 
885 /* This function allocate the buffers. It is necessary to call
886  re_string_reconstruct before using the object. */
887 
888 static reg_errcode_t
889 internal_function
890 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
891  RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
892 {
894  int init_buf_len;
895 
896  /* Ensure at least one character fits into the buffers. */
897  if (init_len < dfa->mb_cur_max)
898  init_len = dfa->mb_cur_max;
899  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
900  re_string_construct_common (str, len, pstr, trans, icase, dfa);
901 
902  ret = re_string_realloc_buffers (pstr, init_buf_len);
903  if (BE (ret != REG_NOERROR, 0))
904  return ret;
905 
906  pstr->word_char = dfa->word_char;
907  pstr->word_ops_used = dfa->word_ops_used;
908  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
909  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
910  pstr->valid_raw_len = pstr->valid_len;
911  return REG_NOERROR;
912 }
913 
914 /* This function allocate the buffers, and initialize them. */
915 
916 static reg_errcode_t
917 internal_function
918 re_string_construct (re_string_t *pstr, const char *str, int len,
919  RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
920 {
922  memset (pstr, '\0', sizeof (re_string_t));
923  re_string_construct_common (str, len, pstr, trans, icase, dfa);
924 
925  if (len > 0)
926  {
927  ret = re_string_realloc_buffers (pstr, len + 1);
928  if (BE (ret != REG_NOERROR, 0))
929  return ret;
930  }
931  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
932 
933  if (icase)
934  {
935 #ifdef RE_ENABLE_I18N
936  if (dfa->mb_cur_max > 1)
937  {
938  while (1)
939  {
940  ret = build_wcs_upper_buffer (pstr);
941  if (BE (ret != REG_NOERROR, 0))
942  return ret;
943  if (pstr->valid_raw_len >= len)
944  break;
945  if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
946  break;
947  ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
948  if (BE (ret != REG_NOERROR, 0))
949  return ret;
950  }
951  }
952  else
953 #endif /* RE_ENABLE_I18N */
954  build_upper_buffer (pstr);
955  }
956  else
957  {
958 #ifdef RE_ENABLE_I18N
959  if (dfa->mb_cur_max > 1)
960  build_wcs_buffer (pstr);
961  else
962 #endif /* RE_ENABLE_I18N */
963  {
964  if (trans != NULL)
965  re_string_translate_buffer (pstr);
966  else
967  {
968  pstr->valid_len = pstr->bufs_len;
969  pstr->valid_raw_len = pstr->bufs_len;
970  }
971  }
972  }
973 
974  return REG_NOERROR;
975 }
976 
977 /* Helper functions for re_string_allocate, and re_string_construct. */
978 
979 static reg_errcode_t
980 internal_function
981 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
982 {
983 #ifdef RE_ENABLE_I18N
984  if (pstr->mb_cur_max > 1)
985  {
986  wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
987  if (BE (new_wcs == NULL, 0))
988  return REG_ESPACE;
989  pstr->wcs = new_wcs;
990  if (pstr->offsets != NULL)
991  {
992  int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
993  if (BE (new_offsets == NULL, 0))
994  return REG_ESPACE;
995  pstr->offsets = new_offsets;
996  }
997  }
998 #endif /* RE_ENABLE_I18N */
999  if (pstr->mbs_allocated)
1000  {
1001  unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
1002  new_buf_len);
1003  if (BE (new_mbs == NULL, 0))
1004  return REG_ESPACE;
1005  pstr->mbs = new_mbs;
1006  }
1007  pstr->bufs_len = new_buf_len;
1008  return REG_NOERROR;
1009 }
1010 
1011 
1012 static void
1013 internal_function
1014 re_string_construct_common (const char *str, int len, re_string_t *pstr,
1015  RE_TRANSLATE_TYPE trans, int icase,
1016  const re_dfa_t *dfa)
1017 {
1018  pstr->raw_mbs = (const unsigned char *) str;
1019  pstr->len = len;
1020  pstr->raw_len = len;
1021  pstr->trans = trans;
1022  pstr->icase = icase ? 1 : 0;
1023  pstr->mbs_allocated = (trans != NULL || icase);
1024  pstr->mb_cur_max = dfa->mb_cur_max;
1025  pstr->is_utf8 = dfa->is_utf8;
1026  pstr->map_notascii = dfa->map_notascii;
1027  pstr->stop = pstr->len;
1028  pstr->raw_stop = pstr->stop;
1029 }
1030 
1031 #ifdef RE_ENABLE_I18N
1032 
1033 /* Build wide character buffer PSTR->WCS.
1034  If the byte sequence of the string are:
1035  <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
1036  Then wide character buffer will be:
1037  <wc1> , WEOF , <wc2> , WEOF , <wc3>
1038  We use WEOF for padding, they indicate that the position isn't
1039  a first byte of a multibyte character.
1040 
1041  Note that this function assumes PSTR->VALID_LEN elements are already
1042  built and starts from PSTR->VALID_LEN. */
1043 
/* Extend PSTR->wcs (and PSTR->mbs when a translation table is set) from
   PSTR->valid_len up to the smaller of PSTR->len and PSTR->bufs_len.
   On return valid_len and valid_raw_len are both set to the index
   reached.  */
1044 static void
1045 internal_function
1046 build_wcs_buffer (re_string_t *pstr)
1047 {
1048 #ifdef _LIBC
1049  unsigned char buf[MB_LEN_MAX];
1050  assert (MB_LEN_MAX >= pstr->mb_cur_max);
1051 #else
/* NOTE(review): unlike the _LIBC branch, nothing here checks that
   pstr->mb_cur_max fits in this fixed-size scratch buffer; the copy
   loop below is bounded by mb_cur_max -- confirm 64 is always enough.  */
1052  unsigned char buf[64];
1053 #endif
1054  mbstate_t prev_st;
1055  int byte_idx, end_idx, remain_len;
1056  size_t mbclen;
1057 
1058  /* Build the buffers from pstr->valid_len to either pstr->len or
1059  pstr->bufs_len. */
1060  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1061  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
1062  {
1063  wchar_t wc;
1064  const char *p;
1065 
1066  remain_len = end_idx - byte_idx;
/* Snapshot the conversion state so it can be rolled back when the
   conversion below does not yield a complete character.  */
1067  prev_st = pstr->cur_state;
1068  /* Apply the translation if we need. */
1069  if (BE (pstr->trans != NULL, 0))
1070  {
1071  int i, ch;
1072 
/* Translate at most mb_cur_max bytes (bounded by what remains) into
   both the scratch buffer and pstr->mbs, then convert from the
   scratch copy.  */
1073  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
1074  {
1075  ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
1076  buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
1077  }
1078  p = (const char *) buf;
1079  }
1080  else
1081  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
1082  mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
/* (size_t) -2: the remaining bytes form an incomplete character.  */
1083  if (BE (mbclen == (size_t) -2, 0))
1084  {
1085  /* The buffer doesn't have enough space, finish to build. */
1086  pstr->cur_state = prev_st;
1087  break;
1088  }
/* (size_t) -1 is an invalid sequence and 0 is a null character; both
   are stored as one byte taken from the raw input (translated if a
   table is set), with the conversion state restored.  */
1089  else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
1090  {
1091  /* We treat these cases as a singlebyte character. */
1092  mbclen = 1;
1093  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
1094  if (BE (pstr->trans != NULL, 0))
1095  wc = pstr->trans[wc];
1096  pstr->cur_state = prev_st;
1097  }
1098 
1099  /* Write wide character and padding. */
1100  pstr->wcs[byte_idx++] = wc;
1101  /* Write paddings. */
/* remain_len is reused here as the index one past the last byte of
   this character; the trailing mbclen - 1 slots are filled with WEOF.  */
1102  for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1103  pstr->wcs[byte_idx++] = WEOF;
1104  }
1105  pstr->valid_len = byte_idx;
1106  pstr->valid_raw_len = byte_idx;
1107 }
1108 
1109 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
1110  but for REG_ICASE. */
1111 
/* Extend PSTR->mbs and PSTR->wcs with upper-cased characters, starting
   at PSTR->valid_len.  Two paths exist: a fast path used when no
   translation table, non-ASCII mapping or offsets table is in effect,
   and a general path that tracks the raw position (src_idx) separately
   from the output position (byte_idx) and maintains PSTR->offsets once
   an upper-case form has a different byte length.
   Returns REG_NOERROR, or REG_ESPACE if PSTR->offsets cannot be
   allocated.  */
1112 static reg_errcode_t
1113 internal_function
1114 build_wcs_upper_buffer (re_string_t *pstr)
1115 {
1116  mbstate_t prev_st;
1117  int src_idx, byte_idx, end_idx, remain_len;
1118  size_t mbclen;
1119 #ifdef _LIBC
1120  char buf[MB_LEN_MAX];
1121  assert (MB_LEN_MAX >= pstr->mb_cur_max);
1122 #else
1123  char buf[64];
1124 #endif
1125 
1126  byte_idx = pstr->valid_len;
1127  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1128 
1129  /* The following optimization assumes that ASCII characters can be
1130  mapped to wide characters with a simple cast. */
1131  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
1132  {
1133  while (byte_idx < end_idx)
1134  {
1135  wchar_t wc;
1136 
1137  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
1138  && mbsinit (&pstr->cur_state))
1139  {
1140  /* In case of a singlebyte character. */
1141  pstr->mbs[byte_idx]
1142  = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
1143  /* The next step uses the assumption that wchar_t is encoded
1144  ASCII-safe: all ASCII values can be converted like this. */
1145  pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
1146  ++byte_idx;
1147  continue;
1148  }
1149 
1150  remain_len = end_idx - byte_idx;
1151  prev_st = pstr->cur_state;
1152  mbclen = mbrtowc (&wc,
1153  ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
1154  + byte_idx), remain_len, &pstr->cur_state);
/* In size_t arithmetic mbclen + 2 is 2, 1 and 0 for the results 0,
   (size_t) -1 and (size_t) -2, so this test is true only when a
   complete, non-null character was converted.  */
1155  if (BE (mbclen + 2 > 2, 1))
1156  {
1157  wchar_t wcu = wc;
1158  if (iswlower (wc))
1159  {
1160  size_t mbcdlen;
1161 
1162  wcu = towupper (wc);
/* Re-encode the upper-case form.  If it has the same byte length it
   is stored in place; otherwise control jumps into the general loop
   below, which can record source offsets.  */
1163  mbcdlen = wcrtomb (buf, wcu, &prev_st);
1164  if (BE (mbclen == mbcdlen, 1))
1165  memcpy (pstr->mbs + byte_idx, buf, mbclen);
1166  else
1167  {
1168  src_idx = byte_idx;
1169  goto offsets_needed;
1170  }
1171  }
1172  else
1173  memcpy (pstr->mbs + byte_idx,
1174  pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
1175  pstr->wcs[byte_idx++] = wcu;
1176  /* Write paddings. */
1177  for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1178  pstr->wcs[byte_idx++] = WEOF;
1179  }
1180  else if (mbclen == (size_t) -1 || mbclen == 0)
1181  {
1182  /* It is an invalid character or '\0'. Just use the byte. */
1183  int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
1184  pstr->mbs[byte_idx] = ch;
1185  /* And also cast it to wide char. */
1186  pstr->wcs[byte_idx++] = (wchar_t) ch;
1187  if (BE (mbclen == (size_t) -1, 0))
1188  pstr->cur_state = prev_st;
1189  }
1190  else
1191  {
1192  /* The buffer doesn't have enough space, finish to build. */
1193  pstr->cur_state = prev_st;
1194  break;
1195  }
1196  }
/* On this path raw and converted positions coincide, so both validity
   counters receive the same index.  */
1197  pstr->valid_len = byte_idx;
1198  pstr->valid_raw_len = byte_idx;
1199  return REG_NOERROR;
1200  }
1201  else
/* General path: src_idx walks the raw input, byte_idx the output.  */
1202  for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
1203  {
1204  wchar_t wc;
1205  const char *p;
/* Entry point for the goto in the fast path above; src_idx has been
   set to byte_idx before jumping here.  */
1206  offsets_needed:
1207  remain_len = end_idx - byte_idx;
1208  prev_st = pstr->cur_state;
1209  if (BE (pstr->trans != NULL, 0))
1210  {
1211  int i, ch;
1212 
/* Translate up to mb_cur_max raw bytes into the scratch buffer and
   convert from there.  */
1213  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
1214  {
1215  ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
1216  buf[i] = pstr->trans[ch];
1217  }
1218  p = (const char *) buf;
1219  }
1220  else
1221  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
1222  mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
/* Same complete-character test as in the fast path.  */
1223  if (BE (mbclen + 2 > 2, 1))
1224  {
1225  wchar_t wcu = wc;
1226  if (iswlower (wc))
1227  {
1228  size_t mbcdlen;
1229 
1230  wcu = towupper (wc);
1231  mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
1232  if (BE (mbclen == mbcdlen, 1))
1233  memcpy (pstr->mbs + byte_idx, buf, mbclen);
1234  else if (mbcdlen != (size_t) -1)
1235  {
1236  size_t i;
1237 
/* The upper-case form has a different byte length.  If it does not
   fit in the buffers, restore the state and stop so the caller can
   grow them.  */
1238  if (byte_idx + mbcdlen > pstr->bufs_len)
1239  {
1240  pstr->cur_state = prev_st;
1241  break;
1242  }
1243 
/* The offsets table is allocated lazily, the first time it is
   needed.  */
1244  if (pstr->offsets == NULL)
1245  {
1246  pstr->offsets = re_malloc (int, pstr->bufs_len);
1247 
1248  if (pstr->offsets == NULL)
1249  return REG_ESPACE;
1250  }
/* Everything converted so far maps one-to-one onto the raw input.  */
1251  if (!pstr->offsets_needed)
1252  {
1253  for (i = 0; i < (size_t) byte_idx; ++i)
1254  pstr->offsets[i] = i;
1255  pstr->offsets_needed = 1;
1256  }
1257 
1258  memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
1259  pstr->wcs[byte_idx] = wcu;
1260  pstr->offsets[byte_idx] = src_idx;
/* Trailing output bytes map to the matching source byte, clamped to
   the last byte of the source character, and are padded with WEOF.  */
1261  for (i = 1; i < mbcdlen; ++i)
1262  {
1263  pstr->offsets[byte_idx + i]
1264  = src_idx + (i < mbclen ? i : mbclen - 1);
1265  pstr->wcs[byte_idx + i] = WEOF;
1266  }
/* The converted string changes length by the difference between the
   two encodings; len, stop (when the stop point lies beyond this
   character) and end_idx are adjusted accordingly.  */
1267  pstr->len += mbcdlen - mbclen;
1268  if (pstr->raw_stop > src_idx)
1269  pstr->stop += mbcdlen - mbclen;
1270  end_idx = (pstr->bufs_len > pstr->len)
1271  ? pstr->len : pstr->bufs_len;
1272  byte_idx += mbcdlen;
1273  src_idx += mbclen;
1274  continue;
1275  }
1276  else
/* wcrtomb failed: keep the (possibly translated) source bytes.  */
1277  memcpy (pstr->mbs + byte_idx, p, mbclen);
1278  }
1279  else
1280  memcpy (pstr->mbs + byte_idx, p, mbclen);
1281 
1282  if (BE (pstr->offsets_needed != 0, 0))
1283  {
1284  size_t i;
1285  for (i = 0; i < mbclen; ++i)
1286  pstr->offsets[byte_idx + i] = src_idx + i;
1287  }
1288  src_idx += mbclen;
1289 
1290  pstr->wcs[byte_idx++] = wcu;
1291  /* Write paddings. */
1292  for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
1293  pstr->wcs[byte_idx++] = WEOF;
1294  }
1295  else if (mbclen == (size_t) -1 || mbclen == 0)
1296  {
1297  /* It is an invalid character or '\0'. Just use the byte. */
1298  int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
1299 
1300  if (BE (pstr->trans != NULL, 0))
1301  ch = pstr->trans [ch];
1302  pstr->mbs[byte_idx] = ch;
1303 
1304  if (BE (pstr->offsets_needed != 0, 0))
1305  pstr->offsets[byte_idx] = src_idx;
1306  ++src_idx;
1307 
1308  /* And also cast it to wide char. */
1309  pstr->wcs[byte_idx++] = (wchar_t) ch;
1310  if (BE (mbclen == (size_t) -1, 0))
1311  pstr->cur_state = prev_st;
1312  }
1313  else
1314  {
1315  /* The buffer doesn't have enough space, finish to build. */
1316  pstr->cur_state = prev_st;
1317  break;
1318  }
1319  }
/* On the general path the converted length and the raw length
   consumed may differ, so they are recorded separately.  */
1320  pstr->valid_len = byte_idx;
1321  pstr->valid_raw_len = src_idx;
1322  return REG_NOERROR;
1323 }
1324 
1325 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
1326  Return the index. */
1327 
1328 static int
1329 internal_function
1330 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
1331 {
1332  mbstate_t prev_st;
1333  int rawbuf_idx;
1334  size_t mbclen;
1335  wchar_t wc = WEOF;
1336 
1337  /* Skip the characters which are not necessary to check. */
1338  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
1339  rawbuf_idx < new_raw_idx;)
1340  {
1341  int remain_len;
1342  remain_len = pstr->len - rawbuf_idx;
1343  prev_st = pstr->cur_state;
1344  mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
1345  remain_len, &pstr->cur_state);
1346  if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
1347  {
1348  /* We treat these cases as a single byte character. */
1349  if (mbclen == 0 || remain_len == 0)
1350  wc = L'\0';
1351  else
1352  wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
1353  mbclen = 1;
1354  pstr->cur_state = prev_st;
1355  }
1356  /* Then proceed the next character. */
1357  rawbuf_idx += mbclen;
1358  }
1359  *last_wc = (wint_t) wc;
1360  return rawbuf_idx;
1361 }
1362 #endif /* RE_ENABLE_I18N */
1363 
1364 /* Build the buffer PSTR->MBS, and apply the translation if we need.
1365  This function is used in case of REG_ICASE. */
1366 
1367 static void
1368 internal_function
1369 build_upper_buffer (re_string_t *pstr)
1370 {
1371  int char_idx, end_idx;
1372  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1373 
1374  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
1375  {
1376  int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
1377  if (BE (pstr->trans != NULL, 0))
1378  ch = pstr->trans[ch];
1379  if (islower (ch))
1380  pstr->mbs[char_idx] = toupper (ch);
1381  else
1382  pstr->mbs[char_idx] = ch;
1383  }
1384  pstr->valid_len = char_idx;
1385  pstr->valid_raw_len = char_idx;
1386 }
1387 
1388 /* Apply TRANS to the buffer in PSTR. */
1389 
1390 static void
1391 internal_function
1392 re_string_translate_buffer (re_string_t *pstr)
1393 {
1394  int buf_idx, end_idx;
1395  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
1396 
1397  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
1398  {
1399  int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
1400  pstr->mbs[buf_idx] = pstr->trans[ch];
1401  }
1402 
1403  pstr->valid_len = buf_idx;
1404  pstr->valid_raw_len = buf_idx;
1405 }
1406 
1407 /* This function re-construct the buffers.
1408  Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
1409  convert to upper case in case of REG_ICASE, apply translation. */
1410 
1411 static reg_errcode_t
1412 internal_function
1413 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
1414 {
1415  int offset = idx - pstr->raw_mbs_idx;
1416  if (BE (offset < 0, 0))
1417  {
1418  /* Reset buffer. */
1419 #ifdef RE_ENABLE_I18N
1420  if (pstr->mb_cur_max > 1)
1421  memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
1422 #endif /* RE_ENABLE_I18N */
1423  pstr->len = pstr->raw_len;
1424  pstr->stop = pstr->raw_stop;
1425  pstr->valid_len = 0;
1426  pstr->raw_mbs_idx = 0;
1427  pstr->valid_raw_len = 0;
1428  pstr->offsets_needed = 0;
1429  pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
1430  : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
1431  if (!pstr->mbs_allocated)
1432  pstr->mbs = (unsigned char *) pstr->raw_mbs;
1433  offset = idx;
1434  }
1435 
1436  if (BE (offset != 0, 1))
1437  {
1438  /* Should the already checked characters be kept? */
1439  if (BE (offset < pstr->valid_raw_len, 1))
1440  {
1441  /* Yes, move them to the front of the buffer. */
1442 #ifdef RE_ENABLE_I18N
1443  if (BE (pstr->offsets_needed, 0))
1444  {
1445  int low = 0, high = pstr->valid_len, mid;
1446  do
1447  {
1448  mid = (high + low) / 2;
1449  if (pstr->offsets[mid] > offset)
1450  high = mid;
1451  else if (pstr->offsets[mid] < offset)
1452  low = mid + 1;
1453  else
1454  break;
1455  }
1456  while (low < high);
1457  if (pstr->offsets[mid] < offset)
1458  ++mid;
1459  pstr->tip_context = re_string_context_at (pstr, mid - 1,
1460  eflags);
1461  /* This can be quite complicated, so handle specially
1462  only the common and easy case where the character with
1463  different length representation of lower and upper
1464  case is present at or after offset. */
1465  if (pstr->valid_len > offset
1466  && mid == offset && pstr->offsets[mid] == offset)
1467  {
1468  memmove (pstr->wcs, pstr->wcs + offset,
1469  (pstr->valid_len - offset) * sizeof (wint_t));
1470  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
1471  pstr->valid_len -= offset;
1472  pstr->valid_raw_len -= offset;
1473  for (low = 0; low < pstr->valid_len; low++)
1474  pstr->offsets[low] = pstr->offsets[low + offset] - offset;
1475  }
1476  else
1477  {
1478  /* Otherwise, just find out how long the partial multibyte
1479  character at offset is and fill it with WEOF/255. */
1480  pstr->len = pstr->raw_len - idx + offset;
1481  pstr->stop = pstr->raw_stop - idx + offset;
1482  pstr->offsets_needed = 0;
1483  while (mid > 0 && pstr->offsets[mid - 1] == offset)
1484  --mid;
1485  while (mid < pstr->valid_len)
1486  if (pstr->wcs[mid] != WEOF)
1487  break;
1488  else
1489  ++mid;
1490  if (mid == pstr->valid_len)
1491  pstr->valid_len = 0;
1492  else
1493  {
1494  pstr->valid_len = pstr->offsets[mid] - offset;
1495  if (pstr->valid_len)
1496  {
1497  for (low = 0; low < pstr->valid_len; ++low)
1498  pstr->wcs[low] = WEOF;
1499  memset (pstr->mbs, 255, pstr->valid_len);
1500  }
1501  }
1502  pstr->valid_raw_len = pstr->valid_len;
1503  }
1504  }
1505  else
1506 #endif
1507  {
1508  pstr->tip_context = re_string_context_at (pstr, offset - 1,
1509  eflags);
1510 #ifdef RE_ENABLE_I18N
1511  if (pstr->mb_cur_max > 1)
1512  memmove (pstr->wcs, pstr->wcs + offset,
1513  (pstr->valid_len - offset) * sizeof (wint_t));
1514 #endif /* RE_ENABLE_I18N */
1515  if (BE (pstr->mbs_allocated, 0))
1516  memmove (pstr->mbs, pstr->mbs + offset,
1517  pstr->valid_len - offset);
1518  pstr->valid_len -= offset;
1519  pstr->valid_raw_len -= offset;
1520 #if DEBUG
1521  assert (pstr->valid_len > 0);
1522 #endif
1523  }
1524  }
1525  else
1526  {
1527  /* No, skip all characters until IDX. */
1528  int prev_valid_len = pstr->valid_len;
1529 
1530 #ifdef RE_ENABLE_I18N
1531  if (BE (pstr->offsets_needed, 0))
1532  {
1533  pstr->len = pstr->raw_len - idx + offset;
1534  pstr->stop = pstr->raw_stop - idx + offset;
1535  pstr->offsets_needed = 0;
1536  }
1537 #endif
1538  pstr->valid_len = 0;
1539 #ifdef RE_ENABLE_I18N
1540  if (pstr->mb_cur_max > 1)
1541  {
1542  int wcs_idx;
1543  wint_t wc = WEOF;
1544 
1545  if (pstr->is_utf8)
1546  {
1547  const unsigned char *raw, *p, *q, *end;
1548 
1549  /* Special case UTF-8. Multi-byte chars start with any
1550  byte other than 0x80 - 0xbf. */
1551  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
1552  end = raw + (offset - pstr->mb_cur_max);
1553  if (end < pstr->raw_mbs)
1554  end = pstr->raw_mbs;
1555  p = raw + offset - 1;
1556 #ifdef _LIBC
1557  /* We know the wchar_t encoding is UCS4, so for the simple
1558  case, ASCII characters, skip the conversion step. */
1559  if (isascii (*p) && BE (pstr->trans == NULL, 1))
1560  {
1561  memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
1562  /* pstr->valid_len = 0; */
1563  wc = (wchar_t) *p;
1564  }
1565  else
1566 #endif
1567  for (; p >= end; --p)
1568  if ((*p & 0xc0) != 0x80)
1569  {
1570  mbstate_t cur_state;
1571  wchar_t wc2;
1572  int mlen = raw + pstr->len - p;
1573  unsigned char buf[6];
1574  size_t mbclen;
1575 
1576  q = p;
1577  if (BE (pstr->trans != NULL, 0))
1578  {
1579  int i = mlen < 6 ? mlen : 6;
1580  while (--i >= 0)
1581  buf[i] = pstr->trans[p[i]];
1582  q = buf;
1583  }
1584  /* XXX Don't use mbrtowc, we know which conversion
1585  to use (UTF-8 -> UCS4). */
1586  memset (&cur_state, 0, sizeof (cur_state));
1587  mbclen = mbrtowc (&wc2, (const char *) p, mlen,
1588  &cur_state);
1589  if (raw + offset - p <= mbclen
1590  && mbclen < (size_t) -2)
1591  {
1592  memset (&pstr->cur_state, '\0',
1593  sizeof (mbstate_t));
1594  pstr->valid_len = mbclen - (raw + offset - p);
1595  wc = wc2;
1596  }
1597  break;
1598  }
1599  }
1600 
1601  if (wc == WEOF)
1602  pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
1603  if (wc == WEOF)
1604  pstr->tip_context
1605  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
1606  else
1607  pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
1608  && IS_WIDE_WORD_CHAR (wc))
1609  ? CONTEXT_WORD
1610  : ((IS_WIDE_NEWLINE (wc)
1611  && pstr->newline_anchor)
1612  ? CONTEXT_NEWLINE : 0));
1613  if (BE (pstr->valid_len, 0))
1614  {
1615  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
1616  pstr->wcs[wcs_idx] = WEOF;
1617  if (pstr->mbs_allocated)
1618  memset (pstr->mbs, 255, pstr->valid_len);
1619  }
1620  pstr->valid_raw_len = pstr->valid_len;
1621  }
1622  else
1623 #endif /* RE_ENABLE_I18N */
1624  {
1625  int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
1626  pstr->valid_raw_len = 0;
1627  if (pstr->trans)
1628  c = pstr->trans[c];
1629  pstr->tip_context = (bitset_contain (pstr->word_char, c)
1630  ? CONTEXT_WORD
1631  : ((IS_NEWLINE (c) && pstr->newline_anchor)
1632  ? CONTEXT_NEWLINE : 0));
1633  }
1634  }
1635  if (!BE (pstr->mbs_allocated, 0))
1636  pstr->mbs += offset;
1637  }
1638  pstr->raw_mbs_idx = idx;
1639  pstr->len -= offset;
1640  pstr->stop -= offset;
1641 
1642  /* Then build the buffers. */
1643 #ifdef RE_ENABLE_I18N
1644  if (pstr->mb_cur_max > 1)
1645  {
1646  if (pstr->icase)
1647  {
1648  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
1649  if (BE (ret != REG_NOERROR, 0))
1650  return ret;
1651  }
1652  else
1653  build_wcs_buffer (pstr);
1654  }
1655  else
1656 #endif /* RE_ENABLE_I18N */
1657  if (BE (pstr->mbs_allocated, 0))
1658  {
1659  if (pstr->icase)
1660  build_upper_buffer (pstr);
1661  else if (pstr->trans != NULL)
1662  re_string_translate_buffer (pstr);
1663  }
1664  else
1665  pstr->valid_len = pstr->len;
1666 
1667  pstr->cur_idx = 0;
1668  return REG_NOERROR;
1669 }
1670 
1671 static unsigned char
1672 internal_function __attribute ((pure))
1673 re_string_peek_byte_case (const re_string_t *pstr, int idx)
1674 {
1675  int ch, off;
1676 
1677  /* Handle the common (easiest) cases first. */
1678  if (BE (!pstr->mbs_allocated, 1))
1679  return re_string_peek_byte (pstr, idx);
1680 
1681 #ifdef RE_ENABLE_I18N
1682  if (pstr->mb_cur_max > 1
1683  && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
1684  return re_string_peek_byte (pstr, idx);
1685 #endif
1686 
1687  off = pstr->cur_idx + idx;
1688 #ifdef RE_ENABLE_I18N
1689  if (pstr->offsets_needed)
1690  off = pstr->offsets[off];
1691 #endif
1692 
1693  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
1694 
1695 #ifdef RE_ENABLE_I18N
1696  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
1697  this function returns CAPITAL LETTER I instead of first byte of
1698  DOTLESS SMALL LETTER I. The latter would confuse the parser,
1699  since peek_byte_case doesn't advance cur_idx in any way. */
1700  if (pstr->offsets_needed && !isascii (ch))
1701  return re_string_peek_byte (pstr, idx);
1702 #endif
1703 
1704  return ch;
1705 }
1706 
1707 static unsigned char
1708 internal_function __attribute ((pure))
1709 re_string_fetch_byte_case (re_string_t *pstr)
1710 {
1711  if (BE (!pstr->mbs_allocated, 1))
1712  return re_string_fetch_byte (pstr);
1713 
1714 #ifdef RE_ENABLE_I18N
1715  if (pstr->offsets_needed)
1716  {
1717  int off, ch;
1718 
1719  /* For tr_TR.UTF-8 [[:islower:]] there is
1720  [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
1721  in that case the whole multi-byte character and return
1722  the original letter. On the other side, with
1723  [[: DOTLESS SMALL LETTER I return [[:I, as doing
1724  anything else would complicate things too much. */
1725 
1726  if (!re_string_first_byte (pstr, pstr->cur_idx))
1727  return re_string_fetch_byte (pstr);
1728 
1729  off = pstr->offsets[pstr->cur_idx];
1730  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
1731 
1732  if (! isascii (ch))
1733  return re_string_fetch_byte (pstr);
1734 
1735  re_string_skip_bytes (pstr,
1736  re_string_char_size_at (pstr, pstr->cur_idx));
1737  return ch;
1738  }
1739 #endif
1740 
1741  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
1742 }
1743 
1744 static void
1745 internal_function
1746 re_string_destruct (re_string_t *pstr)
1747 {
1748 #ifdef RE_ENABLE_I18N
1749  re_free (pstr->wcs);
1750  re_free (pstr->offsets);
1751 #endif /* RE_ENABLE_I18N */
1752  if (pstr->mbs_allocated)
1753  re_free (pstr->mbs);
1754 }
1755 
1756 /* Return the context at IDX in INPUT. */
1757 
1758 static unsigned int
1759 internal_function
1760 re_string_context_at (const re_string_t *input, int idx, int eflags)
1761 {
1762  int c;
1763  if (BE (idx < 0, 0))
1764  /* In this case, we use the value stored in input->tip_context,
1765  since we can't know the character in input->mbs[-1] here. */
1766  return input->tip_context;
1767  if (BE (idx == input->len, 0))
1768  return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
1769  : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
1770 #ifdef RE_ENABLE_I18N
1771  if (input->mb_cur_max > 1)
1772  {
1773  wint_t wc;
1774  int wc_idx = idx;
1775  while(input->wcs[wc_idx] == WEOF)
1776  {
1777 #ifdef DEBUG
1778  /* It must not happen. */
1779  assert (wc_idx >= 0);
1780 #endif
1781  --wc_idx;
1782  if (wc_idx < 0)
1783  return input->tip_context;
1784  }
1785  wc = input->wcs[wc_idx];
1786  if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
1787  return CONTEXT_WORD;
1788  return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
1789  ? CONTEXT_NEWLINE : 0);
1790  }
1791  else
1792 #endif
1793  {
1794  c = re_string_byte_at (input, idx);
1795  if (bitset_contain (input->word_char, c))
1796  return CONTEXT_WORD;
1797  return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
1798  }
1799 }
1800 
1801 /* Functions for set operation. */
1802 
1803 static reg_errcode_t
1804 internal_function
1805 re_node_set_alloc (re_node_set *set, int size)
1806 {
1807  set->alloc = size;
1808  set->nelem = 0;
1809  set->elems = re_malloc (int, size);
1810  if (BE (set->elems == NULL, 0))
1811  return REG_ESPACE;
1812  return REG_NOERROR;
1813 }
1814 
1815 static reg_errcode_t
1816 internal_function
1817 re_node_set_init_1 (re_node_set *set, int elem)
1818 {
1819  set->alloc = 1;
1820  set->nelem = 1;
1821  set->elems = re_malloc (int, 1);
1822  if (BE (set->elems == NULL, 0))
1823  {
1824  set->alloc = set->nelem = 0;
1825  return REG_ESPACE;
1826  }
1827  set->elems[0] = elem;
1828  return REG_NOERROR;
1829 }
1830 
1831 static reg_errcode_t
1832 internal_function
1833 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1834 {
1835  set->alloc = 2;
1836  set->elems = re_malloc (int, 2);
1837  if (BE (set->elems == NULL, 0))
1838  return REG_ESPACE;
1839  if (elem1 == elem2)
1840  {
1841  set->nelem = 1;
1842  set->elems[0] = elem1;
1843  }
1844  else
1845  {
1846  set->nelem = 2;
1847  if (elem1 < elem2)
1848  {
1849  set->elems[0] = elem1;
1850  set->elems[1] = elem2;
1851  }
1852  else
1853  {
1854  set->elems[0] = elem2;
1855  set->elems[1] = elem1;
1856  }
1857  }
1858  return REG_NOERROR;
1859 }
1860 
1861 static reg_errcode_t
1862 internal_function
1863 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1864 {
1865  dest->nelem = src->nelem;
1866  if (src->nelem > 0)
1867  {
1868  dest->alloc = dest->nelem;
1869  dest->elems = re_malloc (int, dest->alloc);
1870  if (BE (dest->elems == NULL, 0))
1871  {
1872  dest->alloc = dest->nelem = 0;
1873  return REG_ESPACE;
1874  }
1875  memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1876  }
1877  else
1878  re_node_set_init_empty (dest);
1879  return REG_NOERROR;
1880 }
1881 
1882 /* Calculate the intersection of the sets SRC1 and SRC2, and merge it into
1883  DEST. The return value is an error code, or REG_NOERROR on success.
1884  Note: We assume dest->elems is NULL when dest->alloc is 0. */
1885 
/* Implementation outline: elements common to SRC1 and SRC2 that DEST
   lacks are first staged at the top of DEST's array, filling downwards
   from index dest->nelem + src1->nelem + src2->nelem, and are then
   merged with DEST's existing elements in place, from the highest
   index down.  The comparisons below walk all three arrays from their
   last element, which presumes the elements are kept in ascending
   order.  */
1886 static reg_errcode_t
1887 internal_function
1888 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1889  const re_node_set *src2)
1890 {
1891  int i1, i2, is, id, delta, sbase;
/* An empty operand means an empty intersection: DEST is unchanged.  */
1892  if (src1->nelem == 0 || src2->nelem == 0)
1893  return REG_NOERROR;
1894 
1895  /* We need dest->nelem + 2 * elems_in_intersection; this is a
1896  conservative estimate. */
1897  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1898  {
1899  int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1900  int *new_elems = re_realloc (dest->elems, int, new_alloc);
1901  if (BE (new_elems == NULL, 0))
1902  return REG_ESPACE;
1903  dest->elems = new_elems;
1904  dest->alloc = new_alloc;
1905  }
1906 
1907  /* Find the items in the intersection of SRC1 and SRC2, and copy
1908  into the top of DEST those that are not already in DEST itself. */
/* sbase is the index just above the staging area; it is decremented
   before each store.  */
1909  sbase = dest->nelem + src1->nelem + src2->nelem;
1910  i1 = src1->nelem - 1;
1911  i2 = src2->nelem - 1;
1912  id = dest->nelem - 1;
1913  for (;;)
1914  {
1915  if (src1->elems[i1] == src2->elems[i2])
1916  {
1917  /* Try to find the item in DEST. Maybe we could binary search? */
1918  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1919  --id;
1920 
1921  if (id < 0 || dest->elems[id] != src1->elems[i1])
1922  dest->elems[--sbase] = src1->elems[i1];
1923 
/* Both cursors step down; stop as soon as either source runs out.  */
1924  if (--i1 < 0 || --i2 < 0)
1925  break;
1926  }
1927 
1928  /* Lower the highest of the two items. */
1929  else if (src1->elems[i1] < src2->elems[i2])
1930  {
1931  if (--i2 < 0)
1932  break;
1933  }
1934  else
1935  {
1936  if (--i1 < 0)
1937  break;
1938  }
1939  }
1940 
/* id: last existing DEST element; is: top staged element; delta:
   number of staged elements still to be placed.  */
1941  id = dest->nelem - 1;
1942  is = dest->nelem + src1->nelem + src2->nelem - 1;
1943  delta = is - sbase + 1;
1944 
1945  /* Now copy. When DELTA becomes zero, the remaining
1946  DEST elements are already in place; this is more or
1947  less the same loop that is in re_node_set_merge. */
1948  dest->nelem += delta;
1949  if (delta > 0 && id >= 0)
1950  for (;;)
1951  {
1952  if (dest->elems[is] > dest->elems[id])
1953  {
1954  /* Copy from the top. */
1955  dest->elems[id + delta--] = dest->elems[is--];
1956  if (delta == 0)
1957  break;
1958  }
1959  else
1960  {
1961  /* Slide from the bottom. */
1962  dest->elems[id + delta] = dest->elems[id];
1963  if (--id < 0)
1964  break;
1965  }
1966  }
1967 
1968  /* Copy remaining SRC elements. */
/* delta is 0 here when every staged element was placed by the loop
   above, in which case this copies nothing.  */
1969  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1970 
1971  return REG_NOERROR;
1972 }
1972 }
1973 
1974 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1975  DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1976 
1977 static reg_errcode_t
1978 internal_function
1979 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1980  const re_node_set *src2)
1981 {
1982  int i1, i2, id;
1983  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1984  {
1985  dest->alloc = src1->nelem + src2->nelem;
1986  dest->elems = re_malloc (int, dest->alloc);
1987  if (BE (dest->elems == NULL, 0))
1988  return REG_ESPACE;
1989  }
1990  else
1991  {
1992  if (src1 != NULL && src1->nelem > 0)
1993  return re_node_set_init_copy (dest, src1);
1994  else if (src2 != NULL && src2->nelem > 0)
1995  return re_node_set_init_copy (dest, src2);
1996  else
1997  re_node_set_init_empty (dest);
1998  return REG_NOERROR;
1999  }
2000  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
2001  {
2002  if (src1->elems[i1] > src2->elems[i2])
2003  {
2004  dest->elems[id++] = src2->elems[i2++];
2005  continue;
2006  }
2007  if (src1->elems[i1] == src2->elems[i2])
2008  ++i2;
2009  dest->elems[id++] = src1->elems[i1++];
2010  }
2011  if (i1 < src1->nelem)
2012  {
2013  memcpy (dest->elems + id, src1->elems + i1,
2014  (src1->nelem - i1) * sizeof (int));
2015  id += src1->nelem - i1;
2016  }
2017  else if (i2 < src2->nelem)
2018  {
2019  memcpy (dest->elems + id, src2->elems + i2,
2020  (src2->nelem - i2) * sizeof (int));
2021  id += src2->nelem - i2;
2022  }
2023  dest->nelem = id;
2024  return REG_NOERROR;
2025 }
2026 
2027 /* Calculate the union of the sets DEST and SRC, and store it in
2028  DEST. The return value is an error code, or REG_NOERROR on success. */
2029 
/* Implementation outline: SRC elements missing from DEST are staged at
   the top of DEST's array, filling downwards from index
   dest->nelem + 2 * src->nelem, and are then merged with DEST's
   existing elements in place, from the highest index down.  The
   comparisons walk both arrays from their last element, which presumes
   the elements are kept in ascending order.  */
2030 static reg_errcode_t
2031 internal_function
2032 re_node_set_merge (re_node_set *dest, const re_node_set *src)
2033 {
2034  int is, id, sbase, delta;
/* Nothing to add: DEST is left untouched.  */
2035  if (src == NULL || src->nelem == 0)
2036  return REG_NOERROR;
/* Room is needed for DEST's elements plus a staging area of up to
   2 * src->nelem slots above them.  */
2037  if (dest->alloc < 2 * src->nelem + dest->nelem)
2038  {
2039  int new_alloc = 2 * (src->nelem + dest->alloc);
2040  int *new_buffer = re_realloc (dest->elems, int, new_alloc);
2041  if (BE (new_buffer == NULL, 0))
2042  return REG_ESPACE;
2043  dest->elems = new_buffer;
2044  dest->alloc = new_alloc;
2045  }
2046 
/* An empty DEST simply becomes a copy of SRC.  */
2047  if (BE (dest->nelem == 0, 0))
2048  {
2049  dest->nelem = src->nelem;
2050  memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
2051  return REG_NOERROR;
2052  }
2053 
2054  /* Copy into the top of DEST the items of SRC that are not
2055  found in DEST. Maybe we could binary search in DEST? */
2056  for (sbase = dest->nelem + 2 * src->nelem,
2057  is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
2058  {
2059  if (dest->elems[id] == src->elems[is])
2060  is--, id--;
2061  else if (dest->elems[id] < src->elems[is])
2062  dest->elems[--sbase] = src->elems[is--];
2063  else /* if (dest->elems[id] > src->elems[is]) */
2064  --id;
2065  }
2066 
2067  if (is >= 0)
2068  {
2069  /* If DEST is exhausted, the remaining items of SRC must be unique. */
2070  sbase -= is + 1;
2071  memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
2072  }
2073 
/* id: last existing DEST element; is: top staged element; delta:
   number of staged elements.  A zero delta means SRC added nothing
   new.  */
2074  id = dest->nelem - 1;
2075  is = dest->nelem + 2 * src->nelem - 1;
2076  delta = is - sbase + 1;
2077  if (delta == 0)
2078  return REG_NOERROR;
2079 
2080  /* Now copy. When DELTA becomes zero, the remaining
2081  DEST elements are already in place. */
2082  dest->nelem += delta;
2083  for (;;)
2084  {
2085  if (dest->elems[is] > dest->elems[id])
2086  {
2087  /* Copy from the top. */
2088  dest->elems[id + delta--] = dest->elems[is--];
2089  if (delta == 0)
2090  break;
2091  }
2092  else
2093  {
2094  /* Slide from the bottom. */
2095  dest->elems[id + delta] = dest->elems[id];
2096  if (--id < 0)
2097  {
2098  /* Copy remaining SRC elements. */
2099  memcpy (dest->elems, dest->elems + sbase,
2100  delta * sizeof (int));
2101  break;
2102  }
2103  }
2104  }
2105 
2106  return REG_NOERROR;
2107 }
2107 }
2108 
2109 /* Insert the new element ELEM to the re_node_set* SET.
2110  SET should not already have ELEM.
2111  return -1 if an error is occured, return 1 otherwise. */
2112 
2113 static int
2114 internal_function
2115 re_node_set_insert (re_node_set *set, int elem)
2116 {
2117  int idx;
2118  /* In case the set is empty. */
2119  if (set->alloc == 0)
2120  {
2121  if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
2122  return 1;
2123  else
2124  return -1;
2125  }
2126 
2127  if (BE (set->nelem, 0) == 0)
2128  {
2129  /* We already guaranteed above that set->alloc != 0. */
2130  set->elems[0] = elem;
2131  ++set->nelem;
2132  return 1;
2133  }
2134 
2135  /* Realloc if we need. */
2136  if (set->alloc == set->nelem)
2137  {
2138  int *new_elems;
2139  set->alloc = set->alloc * 2;
2140  new_elems = re_realloc (set->elems, int, set->alloc);
2141  if (BE (new_elems == NULL, 0))
2142  return -1;
2143  set->elems = new_elems;
2144  }
2145 
2146  /* Move the elements which follows the new element. Test the
2147  first element separately to skip a check in the inner loop. */
2148  if (elem < set->elems[0])
2149  {
2150  idx = 0;
2151  for (idx = set->nelem; idx > 0; idx--)
2152  set->elems[idx] = set->elems[idx - 1];
2153  }
2154  else
2155  {
2156  for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
2157  set->elems[idx] = set->elems[idx - 1];
2158  }
2159 
2160  /* Insert the new element. */
2161  set->elems[idx] = elem;
2162  ++set->nelem;
2163  return 1;
2164 }
2165 
2166 /* Insert the new element ELEM to the re_node_set* SET.
2167  SET should not already have any element greater than or equal to ELEM.
2168  Return -1 if an error is occured, return 1 otherwise. */
2169 
2170 static int
2171 internal_function
2172 re_node_set_insert_last (re_node_set *set, int elem)
2173 {
2174  /* Realloc if we need. */
2175  if (set->alloc == set->nelem)
2176  {
2177  int *new_elems;
2178  set->alloc = (set->alloc + 1) * 2;
2179  new_elems = re_realloc (set->elems, int, set->alloc);
2180  if (BE (new_elems == NULL, 0))
2181  return -1;
2182  set->elems = new_elems;
2183  }
2184 
2185  /* Insert the new element. */
2186  set->elems[set->nelem++] = elem;
2187  return 1;
2188 }
2189 
2190 /* Compare two node sets SET1 and SET2.
2191  return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
2192 
2193 static int
2194 internal_function __attribute ((pure))
2195 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
2196 {
2197  int i;
2198  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
2199  return 0;
2200  for (i = set1->nelem ; --i >= 0 ; )
2201  if (set1->elems[i] != set2->elems[i])
2202  return 0;
2203  return 1;
2204 }
2205 
2206 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
2207 
2208 static int
2209 internal_function __attribute ((pure))
2210 re_node_set_contains (const re_node_set *set, int elem)
2211 {
2212  unsigned int idx, right, mid;
2213  if (set->nelem <= 0)
2214  return 0;
2215 
2216  /* Binary search the element. */
2217  idx = 0;
2218  right = set->nelem - 1;
2219  while (idx < right)
2220  {
2221  mid = (idx + right) / 2;
2222  if (set->elems[mid] < elem)
2223  idx = mid + 1;
2224  else
2225  right = mid;
2226  }
2227  return set->elems[idx] == elem ? idx + 1 : 0;
2228 }
2229 
2230 static void
2231 internal_function
2232 re_node_set_remove_at (re_node_set *set, int idx)
2233 {
2234  if (idx < 0 || idx >= set->nelem)
2235  return;
2236  --set->nelem;
2237  for (; idx < set->nelem; idx++)
2238  set->elems[idx] = set->elems[idx + 1];
2239 }
2240 
2241 
2242 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
2243  Or return -1, if an error will be occured. */
2244 
2245 static int
2246 internal_function
2247 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
2248 {
2249  int type = token.type;
2250  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
2251  {
2252  size_t new_nodes_alloc = dfa->nodes_alloc * 2;
2253  int *new_nexts, *new_indices;
2254  re_node_set *new_edests, *new_eclosures;
2255  re_token_t *new_nodes;
2256 
2257  /* Avoid overflows. */
2258  if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
2259  return -1;
2260 
2261  new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
2262  if (BE (new_nodes == NULL, 0))
2263  return -1;
2264  dfa->nodes = new_nodes;
2265  new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
2266  new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
2267  new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
2268  new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
2269  if (BE (new_nexts == NULL || new_indices == NULL
2270  || new_edests == NULL || new_eclosures == NULL, 0))
2271  return -1;
2272  dfa->nexts = new_nexts;
2273  dfa->org_indices = new_indices;
2274  dfa->edests = new_edests;
2275  dfa->eclosures = new_eclosures;
2276  dfa->nodes_alloc = new_nodes_alloc;
2277  }
2278  dfa->nodes[dfa->nodes_len] = token;
2279  dfa->nodes[dfa->nodes_len].constraint = 0;
2280 #ifdef RE_ENABLE_I18N
2281  dfa->nodes[dfa->nodes_len].accept_mb =
2282  (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
2283 #endif
2284  dfa->nexts[dfa->nodes_len] = -1;
2285  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
2286  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
2287  return dfa->nodes_len++;
2288 }
2289 
2290 static inline unsigned int
2291 internal_function
2292 calc_state_hash (const re_node_set *nodes, unsigned int context)
2293 {
2294  unsigned int hash = nodes->nelem + context;
2295  int i;
2296  for (i = 0 ; i < nodes->nelem ; i++)
2297  hash += nodes->elems[i];
2298  return hash;
2299 }
2300 
2301 /* Search for the state whose node_set is equivalent to NODES.
2302  Return the pointer to the state, if we found it in the DFA.
2303  Otherwise create the new one and return it. In case of an error
2304  return NULL and set the error code in ERR.
2305  Note: - We assume NULL as the invalid state, then it is possible that
2306  return value is NULL and ERR is REG_NOERROR.
2307  - We never return non-NULL value in case of any errors, it is for
2308  optimization. */
2309 
2310 static re_dfastate_t *
2311 internal_function
2312 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
2313  const re_node_set *nodes)
2314 {
2315  unsigned int hash;
2316  re_dfastate_t *new_state;
2317  struct re_state_table_entry *spot;
2318  int i;
2319  if (BE (nodes->nelem == 0, 0))
2320  {
2321  *err = REG_NOERROR;
2322  return NULL;
2323  }
2324  hash = calc_state_hash (nodes, 0);
2325  spot = dfa->state_table + (hash & dfa->state_hash_mask);
2326 
2327  for (i = 0 ; i < spot->num ; i++)
2328  {
2329  re_dfastate_t *state = spot->array[i];
2330  if (hash != state->hash)
2331  continue;
2332  if (re_node_set_compare (&state->nodes, nodes))
2333  return state;
2334  }
2335 
2336  /* There are no appropriate state in the dfa, create the new one. */
2337  new_state = create_ci_newstate (dfa, nodes, hash);
2338  if (BE (new_state == NULL, 0))
2339  *err = REG_ESPACE;
2340 
2341  return new_state;
2342 }
2343 
2344 /* Search for the state whose node_set is equivalent to NODES and
2345  whose context is equivalent to CONTEXT.
2346  Return the pointer to the state, if we found it in the DFA.
2347  Otherwise create the new one and return it. In case of an error
2348  return NULL and set the error code in ERR.
2349  Note: - We assume NULL as the invalid state, then it is possible that
2350  return value is NULL and ERR is REG_NOERROR.
2351  - We never return non-NULL value in case of any errors, it is for
2352  optimization. */
2353 
2354 static re_dfastate_t *
2355 internal_function
2356 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
2357  const re_node_set *nodes, unsigned int context)
2358 {
2359  unsigned int hash;
2360  re_dfastate_t *new_state;
2361  struct re_state_table_entry *spot;
2362  int i;
2363  if (nodes->nelem == 0)
2364  {
2365  *err = REG_NOERROR;
2366  return NULL;
2367  }
2368  hash = calc_state_hash (nodes, context);
2369  spot = dfa->state_table + (hash & dfa->state_hash_mask);
2370 
2371  for (i = 0 ; i < spot->num ; i++)
2372  {
2373  re_dfastate_t *state = spot->array[i];
2374  if (state->hash == hash
2375  && state->context == context
2376  && re_node_set_compare (state->entrance_nodes, nodes))
2377  return state;
2378  }
2379  /* There are no appropriate state in `dfa', create the new one. */
2380  new_state = create_cd_newstate (dfa, nodes, context, hash);
2381  if (BE (new_state == NULL, 0))
2382  *err = REG_ESPACE;
2383 
2384  return new_state;
2385 }
2386 
2387 /* Finish initialization of the new state NEWSTATE, and using its hash value
2388  HASH put in the appropriate bucket of DFA's state table. Return value
2389  indicates the error code if failed. */
2390 
2391 static reg_errcode_t
2392 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
2393  unsigned int hash)
2394 {
2395  struct re_state_table_entry *spot;
2396  reg_errcode_t err;
2397  int i;
2398 
2399  newstate->hash = hash;
2400  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
2401  if (BE (err != REG_NOERROR, 0))
2402  return REG_ESPACE;
2403  for (i = 0; i < newstate->nodes.nelem; i++)
2404  {
2405  int elem = newstate->nodes.elems[i];
2406  if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
2407  re_node_set_insert_last (&newstate->non_eps_nodes, elem);
2408  }
2409 
2410  spot = dfa->state_table + (hash & dfa->state_hash_mask);
2411  if (BE (spot->alloc <= spot->num, 0))
2412  {
2413  int new_alloc = 2 * spot->num + 2;
2414  re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
2415  new_alloc);
2416  if (BE (new_array == NULL, 0))
2417  return REG_ESPACE;
2418  spot->array = new_array;
2419  spot->alloc = new_alloc;
2420  }
2421  spot->array[spot->num++] = newstate;
2422  return REG_NOERROR;
2423 }
2424 
2425 static void
2426 free_state (re_dfastate_t *state)
2427 {
2428  re_node_set_free (&state->non_eps_nodes);
2429  re_node_set_free (&state->inveclosure);
2430  if (state->entrance_nodes != &state->nodes)
2431  {
2432  re_node_set_free (state->entrance_nodes);
2433  re_free (state->entrance_nodes);
2434  }
2435  re_node_set_free (&state->nodes);
2436  re_free (state->word_trtable);
2437  re_free (state->trtable);
2438  re_free (state);
2439 }
2440 
2441 /* Create the new state which is independ of contexts.
2442  Return the new state if succeeded, otherwise return NULL. */
2443 
2444 static re_dfastate_t *
2445 internal_function
2446 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
2447  unsigned int hash)
2448 {
2449  int i;
2450  reg_errcode_t err;
2451  re_dfastate_t *newstate;
2452 
2453  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
2454  if (BE (newstate == NULL, 0))
2455  return NULL;
2456  err = re_node_set_init_copy (&newstate->nodes, nodes);
2457  if (BE (err != REG_NOERROR, 0))
2458  {
2459  re_free (newstate);
2460  return NULL;
2461  }
2462 
2463  newstate->entrance_nodes = &newstate->nodes;
2464  for (i = 0 ; i < nodes->nelem ; i++)
2465  {
2466  re_token_t *node = dfa->nodes + nodes->elems[i];
2467  re_token_type_t type = node->type;
2468  if (type == CHARACTER && !node->constraint)
2469  continue;
2470 #ifdef RE_ENABLE_I18N
2471  newstate->accept_mb |= node->accept_mb;
2472 #endif /* RE_ENABLE_I18N */
2473 
2474  /* If the state has the halt node, the state is a halt state. */
2475  if (type == END_OF_RE)
2476  newstate->halt = 1;
2477  else if (type == OP_BACK_REF)
2478  newstate->has_backref = 1;
2479  else if (type == ANCHOR || node->constraint)
2480  newstate->has_constraint = 1;
2481  }
2482  err = register_state (dfa, newstate, hash);
2483  if (BE (err != REG_NOERROR, 0))
2484  {
2485  free_state (newstate);
2486  newstate = NULL;
2487  }
2488  return newstate;
2489 }
2490 
2491 /* Create the new state which is depend on the context CONTEXT.
2492  Return the new state if succeeded, otherwise return NULL. */
2493 
2494 static re_dfastate_t *
2495 internal_function
2496 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
2497  unsigned int context, unsigned int hash)
2498 {
2499  int i, nctx_nodes = 0;
2500  reg_errcode_t err;
2501  re_dfastate_t *newstate;
2502 
2503  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
2504  if (BE (newstate == NULL, 0))
2505  return NULL;
2506  err = re_node_set_init_copy (&newstate->nodes, nodes);
2507  if (BE (err != REG_NOERROR, 0))
2508  {
2509  re_free (newstate);
2510  return NULL;
2511  }
2512 
2513  newstate->context = context;
2514  newstate->entrance_nodes = &newstate->nodes;
2515 
2516  for (i = 0 ; i < nodes->nelem ; i++)
2517  {
2518  unsigned int constraint = 0;
2519  re_token_t *node = dfa->nodes + nodes->elems[i];
2520  re_token_type_t type = node->type;
2521  if (node->constraint)
2522  constraint = node->constraint;
2523 
2524  if (type == CHARACTER && !constraint)
2525  continue;
2526 #ifdef RE_ENABLE_I18N
2527  newstate->accept_mb |= node->accept_mb;
2528 #endif /* RE_ENABLE_I18N */
2529 
2530  /* If the state has the halt node, the state is a halt state. */
2531  if (type == END_OF_RE)
2532  newstate->halt = 1;
2533  else if (type == OP_BACK_REF)
2534  newstate->has_backref = 1;
2535  else if (type == ANCHOR)
2536  constraint = node->opr.ctx_type;
2537 
2538  if (constraint)
2539  {
2540  if (newstate->entrance_nodes == &newstate->nodes)
2541  {
2542  newstate->entrance_nodes = re_malloc (re_node_set, 1);
2543  if (BE (newstate->entrance_nodes == NULL, 0))
2544  {
2545  free_state (newstate);
2546  return NULL;
2547  }
2548  re_node_set_init_copy (newstate->entrance_nodes, nodes);
2549  nctx_nodes = 0;
2550  newstate->has_constraint = 1;
2551  }
2552 
2553  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
2554  {
2555  re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
2556  ++nctx_nodes;
2557  }
2558  }
2559  }
2560  err = register_state (dfa, newstate, hash);
2561  if (BE (err != REG_NOERROR, 0))
2562  {
2563  free_state (newstate);
2564  newstate = NULL;
2565  }
2566  return newstate;
2567 }
2568 
2569 /******************************************************************************/
2570 /******************************************************************************/
2571 /******************************************************************************/
2572 /* GKINCLUDE #include "regcomp.c" */
2573 /******************************************************************************/
2574 /******************************************************************************/
2575 /******************************************************************************/
2576 /* Extended regular expression matching and search library.
2577  Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
2578  This file is part of the GNU C Library.
2579  Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
2580 
2581  The GNU C Library is free software; you can redistribute it and/or
2582  modify it under the terms of the GNU Lesser General Public
2583  License as published by the Free Software Foundation; either
2584  version 2.1 of the License, or (at your option) any later version.
2585 
2586  The GNU C Library is distributed in the hope that it will be useful,
2587  but WITHOUT ANY WARRANTY; without even the implied warranty of
2588  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2589  Lesser General Public License for more details.
2590 
2591  You should have received a copy of the GNU Lesser General Public
2592  License along with the GNU C Library; if not, write to the Free
2593  Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
2594  02111-1307 USA. */
2595 
2596 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
2597  size_t length, reg_syntax_t syntax);
2598 static void re_compile_fastmap_iter (regex_t *bufp,
2599  const re_dfastate_t *init_state,
2600  char *fastmap);
2601 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
2602 #ifdef RE_ENABLE_I18N
2603 static void free_charset (re_charset_t *cset);
2604 #endif /* RE_ENABLE_I18N */
2605 static void free_workarea_compile (regex_t *preg);
2606 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
2607 #ifdef RE_ENABLE_I18N
2608 static void optimize_utf8 (re_dfa_t *dfa);
2609 #endif
2610 static reg_errcode_t analyze (regex_t *preg);
2611 static reg_errcode_t preorder (bin_tree_t *root,
2612  reg_errcode_t (fn (void *, bin_tree_t *)),
2613  void *extra);
2614 static reg_errcode_t postorder (bin_tree_t *root,
2615  reg_errcode_t (fn (void *, bin_tree_t *)),
2616  void *extra);
2617 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
2618 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
2619 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
2620  bin_tree_t *node);
2621 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
2622 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
2623 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
2624 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
2625 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
2626  unsigned int constraint);
2627 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
2628 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
2629  int node, int root);
2630 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
2631 static int fetch_number (re_string_t *input, re_token_t *token,
2632  reg_syntax_t syntax);
2633 static int peek_token (re_token_t *token, re_string_t *input,
2634  reg_syntax_t syntax) internal_function;
2635 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
2636  reg_syntax_t syntax, reg_errcode_t *err);
2637 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
2638  re_token_t *token, reg_syntax_t syntax,
2639  int nest, reg_errcode_t *err);
2640 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
2641  re_token_t *token, reg_syntax_t syntax,
2642  int nest, reg_errcode_t *err);
2643 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
2644  re_token_t *token, reg_syntax_t syntax,
2645  int nest, reg_errcode_t *err);
2646 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
2647  re_token_t *token, reg_syntax_t syntax,
2648  int nest, reg_errcode_t *err);
2649 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
2650  re_dfa_t *dfa, re_token_t *token,
2651  reg_syntax_t syntax, reg_errcode_t *err);
2652 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
2653  re_token_t *token, reg_syntax_t syntax,
2654  reg_errcode_t *err);
2655 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
2656  re_string_t *regexp,
2657  re_token_t *token, int token_len,
2658  re_dfa_t *dfa,
2659  reg_syntax_t syntax,
2660  int accept_hyphen);
2661 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
2662  re_string_t *regexp,
2663  re_token_t *token);
2664 #ifdef RE_ENABLE_I18N
2665 static reg_errcode_t build_equiv_class (bitset_t sbcset,
2666  re_charset_t *mbcset,
2667  int *equiv_class_alloc,
2668  const unsigned char *name);
2669 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2670  bitset_t sbcset,
2671  re_charset_t *mbcset,
2672  int *char_class_alloc,
2673  const unsigned char *class_name,
2674  reg_syntax_t syntax);
2675 #else /* not RE_ENABLE_I18N */
2676 static reg_errcode_t build_equiv_class (bitset_t sbcset,
2677  const unsigned char *name);
2678 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
2679  bitset_t sbcset,
2680  const unsigned char *class_name,
2681  reg_syntax_t syntax);
2682 #endif /* not RE_ENABLE_I18N */
2683 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
2685  const unsigned char *class_name,
2686  const unsigned char *extra,
2687  int non_match, reg_errcode_t *err);
2688 static bin_tree_t *create_tree (re_dfa_t *dfa,
2689  bin_tree_t *left, bin_tree_t *right,
2690  re_token_type_t type);
2691 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
2692  bin_tree_t *left, bin_tree_t *right,
2693  const re_token_t *token);
2694 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
2695 static void free_token (re_token_t *node);
2696 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
2697 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
2698 
2699 /* This table gives an error message for each of the error codes listed
2700  in regex.h. Obviously the order here has to be same as there.
2701  POSIX doesn't require that we do anything for REG_NOERROR,
2702  but why not be nice? */
2703 
/* All messages are stored back to back in one char array.  Each *_IDX
   macro is the offset of its message inside that array: the previous
   offset plus `sizeof' of the previous literal, which counts that
   literal's terminating NUL -- the byte supplied here by the explicit
   "\0" piece that follows each message.  The last message is terminated
   by the NUL of the string literal itself.  */
2704 const char __re_error_msgid[] attribute_hidden =
2705  {
2706 #define REG_NOERROR_IDX 0
2707  gettext_noop ("Success") /* REG_NOERROR */
2708  "\0"
2709 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
2710  gettext_noop ("No match") /* REG_NOMATCH */
2711  "\0"
2712 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
2713  gettext_noop ("Invalid regular expression") /* REG_BADPAT */
2714  "\0"
2715 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
2716  gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
2717  "\0"
2718 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
2719  gettext_noop ("Invalid character class name") /* REG_ECTYPE */
2720  "\0"
2721 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
2722  gettext_noop ("Trailing backslash") /* REG_EESCAPE */
2723  "\0"
2724 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
2725  gettext_noop ("Invalid back reference") /* REG_ESUBREG */
2726  "\0"
2727 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
2728  gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
2729  "\0"
2730 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
2731  gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
2732  "\0"
2733 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
2734  gettext_noop ("Unmatched \\{") /* REG_EBRACE */
2735  "\0"
2736 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
2737  gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
2738  "\0"
2739 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
2740  gettext_noop ("Invalid range end") /* REG_ERANGE */
2741  "\0"
2742 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
2743  gettext_noop ("Memory exhausted") /* REG_ESPACE */
2744  "\0"
2745 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
2746  gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
2747  "\0"
2748 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
2749  gettext_noop ("Premature end of regular expression") /* REG_EEND */
2750  "\0"
2751 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
2752  gettext_noop ("Regular expression too big") /* REG_ESIZE */
2753  "\0"
2754 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
2755  gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
2756  };
2757 
/* Offsets into __re_error_msgid, one per message and in the same order as
   the messages are laid out there.  re_compile_pattern indexes this table
   with the error code to find the message text.  */
2758 const size_t __re_error_msgid_idx[] attribute_hidden =
2759  {
2760  REG_NOERROR_IDX,
2761  REG_NOMATCH_IDX,
2762  REG_BADPAT_IDX,
2763  REG_ECOLLATE_IDX,
2764  REG_ECTYPE_IDX,
2765  REG_EESCAPE_IDX,
2766  REG_ESUBREG_IDX,
2767  REG_EBRACK_IDX,
2768  REG_EPAREN_IDX,
2769  REG_EBRACE_IDX,
2770  REG_BADBR_IDX,
2771  REG_ERANGE_IDX,
2772  REG_ESPACE_IDX,
2773  REG_BADRPT_IDX,
2774  REG_EEND_IDX,
2775  REG_ESIZE_IDX,
2776  REG_ERPAREN_IDX
2777  };
2778 
2779 /* Entry points for GNU code. */
2780 
2781 /* re_compile_pattern is the GNU regular expression compiler: it
2782  compiles PATTERN (of length LENGTH) and puts the result in BUFP.
2783  Returns 0 if the pattern was valid, otherwise an error string.
2784 
2785  Assumes the `allocated' (and perhaps `buffer') and `translate' fields
2786  are set in BUFP on entry. */
2787 
2788 const char *
2789 re_compile_pattern (pattern, length, bufp)
2790  const char *pattern;
2791  size_t length;
2792  struct re_pattern_buffer *bufp;
2793 {
2795 
2796  /* And GNU code determines whether or not to get register information
2797  by passing null for the REGS argument to re_match, etc., not by
2798  setting no_sub, unless RE_NO_SUB is set. */
2799  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
2800 
2801  /* Match anchors at newline. */
2802  bufp->newline_anchor = 1;
2803 
2804  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
2805 
2806  if (!ret)
2807  return NULL;
2808  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
2809 }
2810 #ifdef _LIBC
2811 weak_alias (__re_compile_pattern, re_compile_pattern)
2812 #endif
2813 
2814 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
2815  also be assigned to arbitrarily: each pattern buffer stores its own
2816  syntax, so it can be changed between regex compilations. */
2817 /* This has no initializer because initialized variables in Emacs
2818  become read-only after dumping. */
2820 
2821 
2822 /* Specify the precise syntax of regexps for compilation. This provides
2823  for compatibility for various utilities which historically have
2824  different, incompatible syntaxes.
2825 
2826  The argument SYNTAX is a bit mask comprised of the various bits
2827  defined in regex.h. We return the old syntax. */
2828 
2832 {
2834 
2836  return ret;
2837 }
2838 #ifdef _LIBC
2839 weak_alias (__re_set_syntax, re_set_syntax)
2840 #endif
2841 
2842 int
2843 re_compile_fastmap (bufp)
2844  struct re_pattern_buffer *bufp;
2845 {
2846  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
2847  char *fastmap = bufp->fastmap;
2848 
2849  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
2850  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
2851  if (dfa->init_state != dfa->init_state_word)
2852  re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
2853  if (dfa->init_state != dfa->init_state_nl)
2854  re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
2855  if (dfa->init_state != dfa->init_state_begbuf)
2856  re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
2857  bufp->fastmap_accurate = 1;
2858  return 0;
2859 }
2860 #ifdef _LIBC
2861 weak_alias (__re_compile_fastmap, re_compile_fastmap)
2862 #endif
2863 
/* Mark the byte CH in FASTMAP; when ICASE is nonzero also mark what
   tolower returns for CH.  */
static inline void
__attribute ((always_inline))
re_set_fastmap (char *fastmap, int icase, int ch)
{
  fastmap[ch] = 1;
  if (!icase)
    return;
  fastmap[tolower (ch)] = 1;
}
2872 
2873 /* Helper function for re_compile_fastmap.
2874  Compile fastmap for the initial_state INIT_STATE. */
2875 
/* Walk the nodes of INIT_STATE and mark in FASTMAP every byte that such a
   node shows can begin a match.  BUFP supplies the DFA and the syntax
   bits; its can_be_null flag is set when INIT_STATE contains END_OF_RE.  */
2876 static void
2877 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
2878  char *fastmap)
2879 {
2880  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
2881  int node_cnt;
/* ICASE is handed to re_set_fastmap and is nonzero only when mb_cur_max
   is 1; for mb_cur_max > 1 the lower-case form is added separately in
   the RE_ENABLE_I18N branches below.  */
2882  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
2883  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
2884  {
2885  int node = init_state->nodes.elems[node_cnt];
2886  re_token_type_t type = dfa->nodes[node].type;
2887 
2888  if (type == CHARACTER)
2889  {
2890  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
2891 #ifdef RE_ENABLE_I18N
2892  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
2893  {
/* Collect this node's byte and those of the following mb_partial
   CHARACTER nodes into BUF, convert the sequence to a wide character,
   lower-case it, convert back and mark the first resulting byte.  NODE
   is a local copy, so advancing it does not disturb the outer loop.  */
2894  unsigned char *buf = alloca (dfa->mb_cur_max), *p;
2895  wchar_t wc;
2896  mbstate_t state;
2897 
2898  p = buf;
2899  *p++ = dfa->nodes[node].opr.c;
2900  while (++node < dfa->nodes_len
2901  && dfa->nodes[node].type == CHARACTER
2902  && dfa->nodes[node].mb_partial)
2903  *p++ = dfa->nodes[node].opr.c;
2904  memset (&state, '\0', sizeof (state));
2905  if (mbrtowc (&wc, (const char *) buf, p - buf,
2906  &state) == p - buf
2907  && (__wcrtomb ((char *) buf, towlower (wc), &state)
2908  != (size_t) -1))
2909  re_set_fastmap (fastmap, 0, buf[0]);
2910  }
2911 #endif
2912  }
2913  else if (type == SIMPLE_BRACKET)
2914  {
/* Visit every bit of the bitset; CH counts the bits across all words and
   is therefore the byte value the current bit stands for.  */
2915  int i, ch;
2916  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
2917  {
2918  int j;
2919  bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
2920  for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
2921  if (w & ((bitset_word_t) 1 << j))
2922  re_set_fastmap (fastmap, icase, ch);
2923  }
2924  }
2925 #ifdef RE_ENABLE_I18N
2926  else if (type == COMPLEX_BRACKET)
2927  {
2928  int i;
2929  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
2930  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
2931  || cset->nranges || cset->nchar_classes)
2932  {
2933 # ifdef _LIBC
2934  if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
2935  {
2936  /* In this case we want to catch the bytes which are
2937  the first byte of any collation elements.
2938  e.g. In da_DK, we want to catch 'a' since "aa"
2939  is a valid collation element, and don't catch
2940  'b' since 'b' is the only collation element
2941  which starts from 'b'. */
2942  const int32_t *table = (const int32_t *)
2943  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2944  for (i = 0; i < SBC_MAX; ++i)
2945  if (table[i] < 0)
2946  re_set_fastmap (fastmap, icase, i);
2947  }
2948 # else
/* Outside libc: mark every byte value for which __btowc returns WEOF.  */
2949  if (dfa->mb_cur_max > 1)
2950  for (i = 0; i < SBC_MAX; ++i)
2951  if (__btowc (i) == WEOF)
2952  re_set_fastmap (fastmap, icase, i);
2953 # endif /* not _LIBC */
2954  }
/* For each explicitly listed multibyte character, mark the first byte of
   its encoding, and under RE_ICASE with mb_cur_max > 1 the first byte of
   its lower-case form too.  */
2955  for (i = 0; i < cset->nmbchars; ++i)
2956  {
2957  char buf[256];
2958  mbstate_t state;
2959  memset (&state, '\0', sizeof (state));
2960  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
2961  re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
2962  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
2963  {
2964  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
2965  != (size_t) -1)
2966  re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
2967  }
2968  }
2969  }
2970 #endif /* RE_ENABLE_I18N */
2971  else if (type == OP_PERIOD
2972 #ifdef RE_ENABLE_I18N
2973  || type == OP_UTF8_PERIOD
2974 #endif /* RE_ENABLE_I18N */
2975  || type == END_OF_RE)
2976  {
/* For these node types the whole map is set and the scan stops, since no
   further node can add anything.  */
2977  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
2978  if (type == END_OF_RE)
2979  bufp->can_be_null = 1;
2980  return;
2981  }
2982  }
2983 }
2984 
2985 /* Entry point for POSIX code. */
2986 /* regcomp takes a regular expression as a string and compiles it.
2987 
2988  PREG is a regex_t *. We do not expect any fields to be initialized,
2989  since POSIX says we shouldn't. Thus, we set
2990 
2991  `buffer' to the compiled pattern;
2992  `used' to the length of the compiled pattern;
2993  `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
2994  REG_EXTENDED bit in CFLAGS is set; otherwise, to
2995  RE_SYNTAX_POSIX_BASIC;
2996  `newline_anchor' to REG_NEWLINE being set in CFLAGS;
2997  `fastmap' to an allocated space for the fastmap;
2998  `fastmap_accurate' to zero;
2999  `re_nsub' to the number of subexpressions in PATTERN.
3000 
3001  PATTERN is the address of the pattern string.
3002 
3003  CFLAGS is a series of bits which affect compilation.
3004 
3005  If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
3006  use POSIX basic syntax.
3007 
3008  If REG_NEWLINE is set, then . and [^...] don't match newline.
3009  Also, regexec will try a match beginning after every newline.
3010 
3011  If REG_ICASE is set, then we considers upper- and lowercase
3012  versions of letters to be equivalent when matching.
3013 
3014  If REG_NOSUB is set, then when PREG is passed to regexec, that
3015  routine will report only success or failure, and nothing about the
3016  registers.
3017 
3018  It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
3019  the return codes and their meanings.) */
3020 
3021 int
3022 regcomp (preg, pattern, cflags)
3023  regex_t *__restrict preg;
3024  const char *__restrict pattern;
3025  int cflags;
3026 {
3030 
3031  preg->buffer = NULL;
3032  preg->allocated = 0;
3033  preg->used = 0;
3034 
3035  /* Try to allocate space for the fastmap. */
3036  preg->fastmap = re_malloc (char, SBC_MAX);
3037  if (BE (preg->fastmap == NULL, 0))
3038  return REG_ESPACE;
3039 
3040  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
3041 
3042  /* If REG_NEWLINE is set, newlines are treated differently. */
3043  if (cflags & REG_NEWLINE)
3044  { /* REG_NEWLINE implies neither . nor [^...] match newline. */
3045  syntax &= ~RE_DOT_NEWLINE;
3046  syntax |= RE_HAT_LISTS_NOT_NEWLINE;
3047  /* It also changes the matching behavior. */
3048  preg->newline_anchor = 1;
3049  }
3050  else
3051  preg->newline_anchor = 0;
3052  preg->no_sub = !!(cflags & REG_NOSUB);
3053  preg->translate = NULL;
3054 
3055  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
3056 
3057  /* POSIX doesn't distinguish between an unmatched open-group and an
3058  unmatched close-group: both are REG_EPAREN. */
3059  if (ret == REG_ERPAREN)
3060  ret = REG_EPAREN;
3061 
3062  /* We have already checked preg->fastmap != NULL. */
3063  if (BE (ret == REG_NOERROR, 1))
3064  /* Compute the fastmap now, since regexec cannot modify the pattern
3065  buffer. This function never fails in this implementation. */
3066  (void) re_compile_fastmap (preg);
3067  else
3068  {
3069  /* Some error occurred while compiling the expression. */
3070  re_free (preg->fastmap);
3071  preg->fastmap = NULL;
3072  }
3073 
3074  return (int) ret;
3075 }
3076 #ifdef _LIBC
3077 weak_alias (__regcomp, regcomp)
3078 #endif
3079 
3080 /* Returns a message corresponding to an error code, ERRCODE, returned
3081  from either regcomp or regexec. We don't use PREG here. */
3082 
3083 /* regerror ( int errcode, preg, errbuf, errbuf_size) */
3084 size_t
3085 regerror (
3086  int errcode,
3087  const regex_t *__restrict preg,
3088  char *__restrict errbuf,
3089  size_t errbuf_size)
3090 {
3091  const char *msg;
3092  size_t msg_size;
3093 
3094  if (BE (errcode < 0
3095  || errcode >= (int) (sizeof (__re_error_msgid_idx)
3096  / sizeof (__re_error_msgid_idx[0])), 0))
3097  /* Only error codes returned by the rest of the code should be passed
3098  to this routine. If we are given anything else, or if other regex
3099  code generates an invalid error code, then the program has a bug.
3100  Dump core so we can fix it. */
3101  abort ();
3102 
3103  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
3104 
3105  msg_size = strlen (msg) + 1; /* Includes the null. */
3106 
3107  if (BE (errbuf_size != 0, 1))
3108  {
3109  if (BE (msg_size > errbuf_size, 0))
3110  {
3111 #if defined HAVE_MEMPCPY || defined _LIBC
3112  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
3113 #else
3114  memcpy (errbuf, msg, errbuf_size - 1);
3115  errbuf[errbuf_size - 1] = 0;
3116 #endif
3117  }
3118  else
3119  memcpy (errbuf, msg, msg_size);
3120  }
3121 
3122  return msg_size;
3123 }
3124 #ifdef _LIBC
3125 weak_alias (__regerror, regerror)
3126 #endif
3127 
3128 
#ifdef RE_ENABLE_I18N
/* Shared, read-only "is a single-byte character" bitmap for UTF-8.
   init_dfa points dfa->sb_char at this object when the locale is UTF-8
   instead of allocating and filling an identical map for every pattern,
   and free_dfa_content recognises the pointer so it is never freed.  */
static const bitset_t utf8_sb_map =
{
  /* Every word covering bytes 0x00..0x7f is all ones (GNU range
     designator); the remaining words are implicitly zero.  */
  [0 ... (0x80 / BITSET_WORD_BITS) - 1] = BITSET_WORD_MAX
};
#endif
3140 
3141 
3142 static void
3143 free_dfa_content (re_dfa_t *dfa)
3144 {
3145  int i, j;
3146 
3147  if (dfa->nodes)
3148  for (i = 0; i < dfa->nodes_len; ++i)
3149  free_token (dfa->nodes + i);
3150  re_free (dfa->nexts);
3151  for (i = 0; i < dfa->nodes_len; ++i)
3152  {
3153  if (dfa->eclosures != NULL)
3154  re_node_set_free (dfa->eclosures + i);
3155  if (dfa->inveclosures != NULL)
3156  re_node_set_free (dfa->inveclosures + i);
3157  if (dfa->edests != NULL)
3158  re_node_set_free (dfa->edests + i);
3159  }
3160  re_free (dfa->edests);
3161  re_free (dfa->eclosures);
3162  re_free (dfa->inveclosures);
3163  re_free (dfa->nodes);
3164 
3165  if (dfa->state_table)
3166  for (i = 0; i <= dfa->state_hash_mask; ++i)
3167  {
3168  struct re_state_table_entry *entry = dfa->state_table + i;
3169  for (j = 0; j < entry->num; ++j)
3170  {
3171  re_dfastate_t *state = entry->array[j];
3172  free_state (state);
3173  }
3174  re_free (entry->array);
3175  }
3176  re_free (dfa->state_table);
3177 #ifdef RE_ENABLE_I18N
3178  if (dfa->sb_char != utf8_sb_map)
3179  re_free (dfa->sb_char);
3180 #endif
3181  re_free (dfa->subexp_map);
3182 #ifdef DEBUG
3183  re_free (dfa->re_str);
3184 #endif
3185 
3186  re_free (dfa);
3187 }
3188 
3189 
3190 /* Free dynamically allocated space used by PREG. */
3191 
3192 void
3193 regfree (preg)
3194  regex_t *preg;
3195 {
3196  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3197  if (BE (dfa != NULL, 1))
3198  free_dfa_content (dfa);
3199  preg->buffer = NULL;
3200  preg->allocated = 0;
3201 
3202  re_free (preg->fastmap);
3203  preg->fastmap = NULL;
3204 
3205  re_free (preg->translate);
3206  preg->translate = NULL;
3207 }
3208 #ifdef _LIBC
3209 weak_alias (__regfree, regfree)
3210 #endif
3211 
3212 /* Entry points compatible with 4.2 BSD regex library. We don't define
3213  them unless specifically requested. */
3214 
3215 #if defined _REGEX_RE_COMP || defined _LIBC
3216 
3217 /* BSD has one and only one pattern buffer. */
3218 static struct re_pattern_buffer re_comp_buf;
3219 
3220 char *
3221 # ifdef _LIBC
3222 /* Make these definitions weak in libc, so POSIX programs can redefine
3223  these names if they don't use our functions, and still use
3224  regcomp/regexec above without link errors. */
3225 weak_function
3226 # endif
3227 re_comp (s)
3228  const char *s;
3229 {
3231  char *fastmap;
3232 
3233  if (!s)
3234  {
3235  if (!re_comp_buf.buffer)
3236  return gettext ("No previous regular expression");
3237  return 0;
3238  }
3239 
3240  if (re_comp_buf.buffer)
3241  {
3242  fastmap = re_comp_buf.fastmap;
3243  re_comp_buf.fastmap = NULL;
3244  __regfree (&re_comp_buf);
3245  memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
3246  re_comp_buf.fastmap = fastmap;
3247  }
3248 
3249  if (re_comp_buf.fastmap == NULL)
3250  {
3251  re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
3252  if (re_comp_buf.fastmap == NULL)
3253  return (char *) gettext (__re_error_msgid
3254  + __re_error_msgid_idx[(int) REG_ESPACE]);
3255  }
3256 
3257  /* Since `re_exec' always passes NULL for the `regs' argument, we
3258  don't need to initialize the pattern buffer fields which affect it. */
3259 
3260  /* Match anchors at newlines. */
3261  re_comp_buf.newline_anchor = 1;
3262 
3263  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
3264 
3265  if (!ret)
3266  return NULL;
3267 
3268  /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
3269  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
3270 }
3271 
3272 #ifdef _LIBC
3273 libc_freeres_fn (free_mem)
3274 {
3275  __regfree (&re_comp_buf);
3276 }
3277 #endif
3278 
3279 #endif /* _REGEX_RE_COMP */
3280 
3281 /* Internal entry point.
3282  Compile the regular expression PATTERN, whose length is LENGTH.
3283  SYNTAX indicate regular expression's syntax. */
3284 
3285 static reg_errcode_t
3286 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
3288 {
3289  reg_errcode_t err = REG_NOERROR;
3290  re_dfa_t *dfa;
3291  re_string_t regexp;
3292 
3293  /* Initialize the pattern buffer. */
3294  preg->fastmap_accurate = 0;
3295  preg->syntax = syntax;
3296  preg->not_bol = preg->not_eol = 0;
3297  preg->used = 0;
3298  preg->re_nsub = 0;
3299  preg->can_be_null = 0;
3301 
3302  /* Initialize the dfa. */
3303  dfa = (re_dfa_t *) preg->buffer;
3304  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
3305  {
3306  /* If zero allocated, but buffer is non-null, try to realloc
3307  enough space. This loses if buffer's address is bogus, but
3308  that is the user's responsibility. If ->buffer is NULL this
3309  is a simple allocation. */
3310  dfa = re_realloc (preg->buffer, re_dfa_t, 1);
3311  if (dfa == NULL)
3312  return REG_ESPACE;
3313  preg->allocated = sizeof (re_dfa_t);
3314  preg->buffer = (unsigned char *) dfa;
3315  }
3316  preg->used = sizeof (re_dfa_t);
3317 
3318  err = init_dfa (dfa, length);
3319  if (BE (err != REG_NOERROR, 0))
3320  {
3321  free_dfa_content (dfa);
3322  preg->buffer = NULL;
3323  preg->allocated = 0;
3324  return err;
3325  }
3326 #ifdef DEBUG
3327  /* Note: length+1 will not overflow since it is checked in init_dfa. */
3328  dfa->re_str = re_malloc (char, length + 1);
3329  strncpy (dfa->re_str, pattern, length + 1);
3330 #endif
3331 
3332  __libc_lock_init (dfa->lock);
3333 
3334  err = re_string_construct (&regexp, pattern, length, preg->translate,
3335  syntax & RE_ICASE, dfa);
3336  if (BE (err != REG_NOERROR, 0))
3337  {
3338  re_compile_internal_free_return:
3339  free_workarea_compile (preg);
3340  re_string_destruct (&regexp);
3341  free_dfa_content (dfa);
3342  preg->buffer = NULL;
3343  preg->allocated = 0;
3344  return err;
3345  }
3346 
3347  /* Parse the regular expression, and build a structure tree. */
3348  preg->re_nsub = 0;
3349  dfa->str_tree = parse (&regexp, preg, syntax, &err);
3350  if (BE (dfa->str_tree == NULL, 0))
3351  goto re_compile_internal_free_return;
3352 
3353  /* Analyze the tree and create the nfa. */
3354  err = analyze (preg);
3355  if (BE (err != REG_NOERROR, 0))
3356  goto re_compile_internal_free_return;
3357 
3358 #ifdef RE_ENABLE_I18N
3359  /* If possible, do searching in single byte encoding to speed things up. */
3360  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
3361  optimize_utf8 (dfa);
3362 #endif
3363 
3364  /* Then create the initial state of the dfa. */
3365  err = create_initial_state (dfa);
3366 
3367  /* Release work areas. */
3368  free_workarea_compile (preg);
3369  re_string_destruct (&regexp);
3370 
3371  if (BE (err != REG_NOERROR, 0))
3372  {
3373  free_dfa_content (dfa);
3374  preg->buffer = NULL;
3375  preg->allocated = 0;
3376  }
3377 
3378  return err;
3379 }
3380 
3381 /* Initialize DFA. We use the length of the regular expression PAT_LEN
3382  as the initial length of some arrays. */
3383 
3384 static reg_errcode_t
3385 init_dfa (re_dfa_t *dfa, size_t pat_len)
3386 {
3387  unsigned int table_size;
3388 #ifndef _LIBC
3389  char *codeset_name;
3390 #endif
3391 
3392  memset (dfa, '\0', sizeof (re_dfa_t));
3393 
3394  /* Force allocation of str_tree_storage the first time. */
3395  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3396 
3397  /* Avoid overflows. */
3398  if (pat_len == SIZE_MAX)
3399  return REG_ESPACE;
3400 
3401  dfa->nodes_alloc = pat_len + 1;
3402  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
3403 
3404  /* table_size = 2 ^ ceil(log pat_len) */
3405  for (table_size = 1; ; table_size <<= 1)
3406  if (table_size > pat_len)
3407  break;
3408 
3409  dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
3410  dfa->state_hash_mask = table_size - 1;
3411 
3412  dfa->mb_cur_max = MB_CUR_MAX;
3413 #ifdef _LIBC
3414  if (dfa->mb_cur_max == 6
3415  && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
3416  dfa->is_utf8 = 1;
3417  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
3418  != 0);
3419 #else
3420 # ifdef HAVE_LANGINFO_CODESET
3421  codeset_name = nl_langinfo (CODESET);
3422 # else
3423  codeset_name = getenv ("LC_ALL");
3424  if (codeset_name == NULL || codeset_name[0] == '\0')
3425  codeset_name = getenv ("LC_CTYPE");
3426  if (codeset_name == NULL || codeset_name[0] == '\0')
3427  codeset_name = getenv ("LANG");
3428  if (codeset_name == NULL)
3429  codeset_name = "";
3430  else if (strchr (codeset_name, '.') != NULL)
3431  codeset_name = strchr (codeset_name, '.') + 1;
3432 # endif
3433 
3434  if (strcasecmp (codeset_name, "UTF-8") == 0
3435  || strcasecmp (codeset_name, "UTF8") == 0)
3436  dfa->is_utf8 = 1;
3437 
3438  /* We check exhaustively in the loop below if this charset is a
3439  superset of ASCII. */
3440  dfa->map_notascii = 0;
3441 #endif
3442 
3443 #ifdef RE_ENABLE_I18N
3444  if (dfa->mb_cur_max > 1)
3445  {
3446  if (dfa->is_utf8)
3447  dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
3448  else
3449  {
3450  int i, j, ch;
3451 
3452  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3453  if (BE (dfa->sb_char == NULL, 0))
3454  return REG_ESPACE;
3455 
3456  /* Set the bits corresponding to single byte chars. */
3457  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
3458  for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3459  {
3460  wint_t wch = __btowc (ch);
3461  if (wch != WEOF)
3462  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
3463 # ifndef _LIBC
3464  if (isascii (ch) && wch != ch)
3465  dfa->map_notascii = 1;
3466 # endif
3467  }
3468  }
3469  }
3470 #endif
3471 
3472  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
3473  return REG_ESPACE;
3474  return REG_NOERROR;
3475 }
3476 
3477 /* Initialize WORD_CHAR table, which indicate which character is
3478  "word". In this case "word" means that it is the word construction
3479  character used by some operators like "<", ">", etc. */
3480 
3481 static void
3482 internal_function
3483 init_word_char (re_dfa_t *dfa)
3484 {
3485  int i, j, ch;
3486  dfa->word_ops_used = 1;
3487  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
3488  for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
3489  if (isalnum (ch) || ch == '_')
3490  dfa->word_char[i] |= (bitset_word_t) 1 << j;
3491 }
3492 
3493 /* Free the work area which are only used while compiling. */
3494 
3495 static void
3496 free_workarea_compile (regex_t *preg)
3497 {
3498  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3499  bin_tree_storage_t *storage, *next;
3500  for (storage = dfa->str_tree_storage; storage; storage = next)
3501  {
3502  next = storage->next;
3503  re_free (storage);
3504  }
3505  dfa->str_tree_storage = NULL;
3506  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
3507  dfa->str_tree = NULL;
3508  re_free (dfa->org_indices);
3509  dfa->org_indices = NULL;
3510 }
3511 
3512 /* Create initial states for all contexts. */
3513 
3514 static reg_errcode_t
3515 create_initial_state (re_dfa_t *dfa)
3516 {
3517  int first, i;
3518  reg_errcode_t err;
3519  re_node_set init_nodes;
3520 
3521  /* Initial states have the epsilon closure of the node which is
3522  the first node of the regular expression. */
3523  first = dfa->str_tree->first->node_idx;
3524  dfa->init_node = first;
3525  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
3526  if (BE (err != REG_NOERROR, 0))
3527  return err;
3528 
3529  /* The back-references which are in initial states can epsilon transit,
3530  since in this case all of the subexpressions can be null.
3531  Then we add epsilon closures of the nodes which are the next nodes of
3532  the back-references. */
3533  if (dfa->nbackref > 0)
3534  for (i = 0; i < init_nodes.nelem; ++i)
3535  {
3536  int node_idx = init_nodes.elems[i];
3537  re_token_type_t type = dfa->nodes[node_idx].type;
3538 
3539  int clexp_idx;
3540  if (type != OP_BACK_REF)
3541  continue;
3542  for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
3543  {
3544  re_token_t *clexp_node;
3545  clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
3546  if (clexp_node->type == OP_CLOSE_SUBEXP
3547  && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
3548  break;
3549  }
3550  if (clexp_idx == init_nodes.nelem)
3551  continue;
3552 
3553  if (type == OP_BACK_REF)
3554  {
3555  int dest_idx = dfa->edests[node_idx].elems[0];
3556  if (!re_node_set_contains (&init_nodes, dest_idx))
3557  {
3558  re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
3559  i = 0;
3560  }
3561  }
3562  }
3563 
3564  /* It must be the first time to invoke acquire_state. */
3565  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
3566  /* We don't check ERR here, since the initial state must not be NULL. */
3567  if (BE (dfa->init_state == NULL, 0))
3568  return err;
3569  if (dfa->init_state->has_constraint)
3570  {
3571  dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
3572  CONTEXT_WORD);
3573  dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
3574  CONTEXT_NEWLINE);
3575  dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
3576  &init_nodes,
3577  CONTEXT_NEWLINE
3578  | CONTEXT_BEGBUF);
3579  if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
3580  || dfa->init_state_begbuf == NULL, 0))
3581  return err;
3582  }
3583  else
3584  dfa->init_state_word = dfa->init_state_nl
3585  = dfa->init_state_begbuf = dfa->init_state;
3586 
3587  re_node_set_free (&init_nodes);
3588  return REG_NOERROR;
3589 }
3590 
3591 #ifdef RE_ENABLE_I18N
3592 /* If it is possible to do searching in single byte encoding instead of UTF-8
3593  to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
3594  DFA nodes where needed. */
3595 
3596 static void
3597 optimize_utf8 (re_dfa_t *dfa)
3598 {
3599  int node, i, mb_chars = 0, has_period = 0;
3600 
3601  for (node = 0; node < dfa->nodes_len; ++node)
3602  switch (dfa->nodes[node].type)
3603  {
3604  case CHARACTER:
3605  if (dfa->nodes[node].opr.c >= 0x80)
3606  mb_chars = 1;
3607  break;
3608  case ANCHOR:
3609  switch (dfa->nodes[node].opr.idx)
3610  {
3611  case LINE_FIRST:
3612  case LINE_LAST:
3613  case BUF_FIRST:
3614  case BUF_LAST:
3615  break;
3616  default:
3617  /* Word anchors etc. cannot be handled. */
3618  return;
3619  }
3620  break;
3621  case OP_PERIOD:
3622  has_period = 1;
3623  break;
3624  case OP_BACK_REF:
3625  case OP_ALT:
3626  case END_OF_RE:
3627  case OP_DUP_ASTERISK:
3628  case OP_OPEN_SUBEXP:
3629  case OP_CLOSE_SUBEXP:
3630  break;
3631  case COMPLEX_BRACKET:
3632  return;
3633  case SIMPLE_BRACKET:
3634  /* Just double check. The non-ASCII range starts at 0x80. */
3635  assert (0x80 % BITSET_WORD_BITS == 0);
3636  for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
3637  if (dfa->nodes[node].opr.sbcset[i])
3638  return;
3639  break;
3640  default:
3641  abort ();
3642  }
3643 
3644  if (mb_chars || has_period)
3645  for (node = 0; node < dfa->nodes_len; ++node)
3646  {
3647  if (dfa->nodes[node].type == CHARACTER
3648  && dfa->nodes[node].opr.c >= 0x80)
3649  dfa->nodes[node].mb_partial = 0;
3650  else if (dfa->nodes[node].type == OP_PERIOD)
3651  dfa->nodes[node].type = OP_UTF8_PERIOD;
3652  }
3653 
3654  /* The search can be in single byte locale. */
3655  dfa->mb_cur_max = 1;
3656  dfa->is_utf8 = 0;
3657  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
3658 }
3659 #endif
3660 
3661 /* Analyze the structure tree, and calculate "first", "next", "edest",
3662  "eclosure", and "inveclosure". */
3663 
3664 static reg_errcode_t
3665 analyze (regex_t *preg)
3666 {
3667  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3669 
3670  /* Allocate arrays. */
3671  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
3672  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
3673  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
3674  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
3675  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
3676  || dfa->eclosures == NULL, 0))
3677  return REG_ESPACE;
3678 
3679  dfa->subexp_map = re_malloc (int, preg->re_nsub);
3680  if (dfa->subexp_map != NULL)
3681  {
3682  int i;
3683  for (i = 0; i < preg->re_nsub; i++)
3684  dfa->subexp_map[i] = i;
3685  preorder (dfa->str_tree, optimize_subexps, dfa);
3686  for (i = 0; i < preg->re_nsub; i++)
3687  if (dfa->subexp_map[i] != i)
3688  break;
3689  if (i == preg->re_nsub)
3690  {
3691  free (dfa->subexp_map);
3692  dfa->subexp_map = NULL;
3693  }
3694  }
3695 
3696  ret = postorder (dfa->str_tree, lower_subexps, preg);
3697  if (BE (ret != REG_NOERROR, 0))
3698  return ret;
3699  ret = postorder (dfa->str_tree, calc_first, dfa);
3700  if (BE (ret != REG_NOERROR, 0))
3701  return ret;
3702  preorder (dfa->str_tree, calc_next, dfa);
3703  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
3704  if (BE (ret != REG_NOERROR, 0))
3705  return ret;
3706  ret = calc_eclosure (dfa);
3707  if (BE (ret != REG_NOERROR, 0))
3708  return ret;
3709 
3710  /* We only need this during the prune_impossible_nodes pass in regexec.c;
3711  skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
3712  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
3713  || dfa->nbackref)
3714  {
3715  dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
3716  if (BE (dfa->inveclosures == NULL, 0))
3717  return REG_ESPACE;
3718  ret = calc_inveclosure (dfa);
3719  }
3720 
3721  return ret;
3722 }
3723 
3724 /* Our parse trees are very unbalanced, so we cannot use a stack to
3725  implement parse tree visits. Instead, we use parent pointers and
3726  some hairy code in these two functions. */
3727 static reg_errcode_t
3728 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
3729  void *extra)
3730 {
3731  bin_tree_t *node, *prev;
3732 
3733  for (node = root; ; )
3734  {
3735  /* Descend down the tree, preferably to the left (or to the right
3736  if that's the only child). */
3737  while (node->left || node->right)
3738  if (node->left)
3739  node = node->left;
3740  else
3741  node = node->right;
3742 
3743  do
3744  {
3745  reg_errcode_t err = fn (extra, node);
3746  if (BE (err != REG_NOERROR, 0))
3747  return err;
3748  if (node->parent == NULL)
3749  return REG_NOERROR;
3750  prev = node;
3751  node = node->parent;
3752  }
3753  /* Go up while we have a node that is reached from the right. */
3754  while (node->right == prev || node->right == NULL);
3755  node = node->right;
3756  }
3757 }
3758 
3759 static reg_errcode_t
3760 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
3761  void *extra)
3762 {
3763  bin_tree_t *node;
3764 
3765  for (node = root; ; )
3766  {
3767  reg_errcode_t err = fn (extra, node);
3768  if (BE (err != REG_NOERROR, 0))
3769  return err;
3770 
3771  /* Go to the left node, or up and to the right. */
3772  if (node->left)
3773  node = node->left;
3774  else
3775  {
3776  bin_tree_t *prev = NULL;
3777  while (node->right == prev || node->right == NULL)
3778  {
3779  prev = node;
3780  node = node->parent;
3781  if (!node)
3782  return REG_NOERROR;
3783  }
3784  node = node->right;
3785  }
3786  }
3787 }
3788 
3789 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
3790  re_search_internal to map the inner one's opr.idx to this one's. Adjust
3791  backreferences as well. Requires a preorder visit. */
3792 static reg_errcode_t
3793 optimize_subexps (void *extra, bin_tree_t *node)
3794 {
3795  re_dfa_t *dfa = (re_dfa_t *) extra;
3796 
3797  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
3798  {
3799  int idx = node->token.opr.idx;
3800  node->token.opr.idx = dfa->subexp_map[idx];
3801  dfa->used_bkref_map |= 1 << node->token.opr.idx;
3802  }
3803 
3804  else if (node->token.type == SUBEXP
3805  && node->left && node->left->token.type == SUBEXP)
3806  {
3807  int other_idx = node->left->token.opr.idx;
3808 
3809  node->left = node->left->left;
3810  if (node->left)
3811  node->left->parent = node;
3812 
3813  dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
3814  if (other_idx < BITSET_WORD_BITS)
3815  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
3816  }
3817 
3818  return REG_NOERROR;
3819 }
3820 
3821 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
3822  of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
3823 static reg_errcode_t
3824 lower_subexps (void *extra, bin_tree_t *node)
3825 {
3826  regex_t *preg = (regex_t *) extra;
3827  reg_errcode_t err = REG_NOERROR;
3828 
3829  if (node->left && node->left->token.type == SUBEXP)
3830  {
3831  node->left = lower_subexp (&err, preg, node->left);
3832  if (node->left)
3833  node->left->parent = node;
3834  }
3835  if (node->right && node->right->token.type == SUBEXP)
3836  {
3837  node->right = lower_subexp (&err, preg, node->right);
3838  if (node->right)
3839  node->right->parent = node;
3840  }
3841 
3842  return err;
3843 }
3844 
3845 static bin_tree_t *
3846 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
3847 {
3848  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3849  bin_tree_t *body = node->left;
3850  bin_tree_t *op, *cls, *tree1, *tree;
3851 
3852  if (preg->no_sub
3853  /* We do not optimize empty subexpressions, because otherwise we may
3854  have bad CONCAT nodes with NULL children. This is obviously not
3855  very common, so we do not lose much. An example that triggers
3856  this case is the sed "script" /\(\)/x. */
3857  && node->left != NULL
3858  && (node->token.opr.idx >= BITSET_WORD_BITS
3859  || !(dfa->used_bkref_map
3860  & ((bitset_word_t) 1 << node->token.opr.idx))))
3861  return node->left;
3862 
3863  /* Convert the SUBEXP node to the concatenation of an
3864  OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
3865  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
3866  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
3867  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
3868  tree = create_tree (dfa, op, tree1, CONCAT);
3869  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
3870  {
3871  *err = REG_ESPACE;
3872  return NULL;
3873  }
3874 
3875  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
3876  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
3877  return tree;
3878 }
3879 
3880 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
3881  nodes. Requires a postorder visit. */
3882 static reg_errcode_t
3883 calc_first (void *extra, bin_tree_t *node)
3884 {
3885  re_dfa_t *dfa = (re_dfa_t *) extra;
3886  if (node->token.type == CONCAT)
3887  {
3888  node->first = node->left->first;
3889  node->node_idx = node->left->node_idx;
3890  }
3891  else
3892  {
3893  node->first = node;
3894  node->node_idx = re_dfa_add_node (dfa, node->token);
3895  if (BE (node->node_idx == -1, 0))
3896  return REG_ESPACE;
3897  }
3898  return REG_NOERROR;
3899 }
3900 
3901 /* Pass 2: compute NEXT on the tree. Preorder visit. */
3902 static reg_errcode_t
3903 calc_next (void *extra, bin_tree_t *node)
3904 {
3905  switch (node->token.type)
3906  {
3907  case OP_DUP_ASTERISK:
3908  node->left->next = node;
3909  break;
3910  case CONCAT:
3911  node->left->next = node->right->first;
3912  node->right->next = node->next;
3913  break;
3914  default:
3915  if (node->left)
3916  node->left->next = node->next;
3917  if (node->right)
3918  node->right->next = node->next;
3919  break;
3920  }
3921  return REG_NOERROR;
3922 }
3923 
3924 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
3925 static reg_errcode_t
3926 link_nfa_nodes (void *extra, bin_tree_t *node)
3927 {
3928  re_dfa_t *dfa = (re_dfa_t *) extra;
3929  int idx = node->node_idx;
3930  reg_errcode_t err = REG_NOERROR;
3931 
3932  switch (node->token.type)
3933  {
3934  case CONCAT:
3935  break;
3936 
3937  case END_OF_RE:
3938  assert (node->next == NULL);
3939  break;
3940 
3941  case OP_DUP_ASTERISK:
3942  case OP_ALT:
3943  {
3944  int left, right;
3945  dfa->has_plural_match = 1;
3946  if (node->left != NULL)
3947  left = node->left->first->node_idx;
3948  else
3949  left = node->next->node_idx;
3950  if (node->right != NULL)
3951  right = node->right->first->node_idx;
3952  else
3953  right = node->next->node_idx;
3954  assert (left > -1);
3955  assert (right > -1);
3956  err = re_node_set_init_2 (dfa->edests + idx, left, right);
3957  }
3958  break;
3959 
3960  case ANCHOR:
3961  case OP_OPEN_SUBEXP:
3962  case OP_CLOSE_SUBEXP:
3963  err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
3964  break;
3965 
3966  case OP_BACK_REF:
3967  dfa->nexts[idx] = node->next->node_idx;
3968  if (node->token.type == OP_BACK_REF)
3969  re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
3970  break;
3971 
3972  default:
3973  assert (!IS_EPSILON_NODE (node->token.type));
3974  dfa->nexts[idx] = node->next->node_idx;
3975  break;
3976  }
3977 
3978  return err;
3979 }
3980 
3981 /* Duplicate the epsilon closure of the node ROOT_NODE.
3982  Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
3983  to their own constraint. */
3984 
3985 static reg_errcode_t
3986 internal_function
3987 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
3988  int root_node, unsigned int init_constraint)
3989 {
3990  int org_node, clone_node, ret;
3991  unsigned int constraint = init_constraint;
3992  for (org_node = top_org_node, clone_node = top_clone_node;;)
3993  {
3994  int org_dest, clone_dest;
3995  if (dfa->nodes[org_node].type == OP_BACK_REF)
3996  {
3997  /* If the back reference epsilon-transit, its destination must
3998  also have the constraint. Then duplicate the epsilon closure
3999  of the destination of the back reference, and store it in
4000  edests of the back reference. */
4001  org_dest = dfa->nexts[org_node];
4002  re_node_set_empty (dfa->edests + clone_node);
4003  clone_dest = duplicate_node (dfa, org_dest, constraint);
4004  if (BE (clone_dest == -1, 0))
4005  return REG_ESPACE;
4006  dfa->nexts[clone_node] = dfa->nexts[org_node];
4007  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4008  if (BE (ret < 0, 0))
4009  return REG_ESPACE;
4010  }
4011  else if (dfa->edests[org_node].nelem == 0)
4012  {
4013  /* In case of the node can't epsilon-transit, don't duplicate the
4014  destination and store the original destination as the
4015  destination of the node. */
4016  dfa->nexts[clone_node] = dfa->nexts[org_node];
4017  break;
4018  }
4019  else if (dfa->edests[org_node].nelem == 1)
4020  {
4021  /* In case of the node can epsilon-transit, and it has only one
4022  destination. */
4023  org_dest = dfa->edests[org_node].elems[0];
4024  re_node_set_empty (dfa->edests + clone_node);
4025  if (dfa->nodes[org_node].type == ANCHOR)
4026  {
4027  /* In case of the node has another constraint, append it. */
4028  if (org_node == root_node && clone_node != org_node)
4029  {
4030  /* ...but if the node is root_node itself, it means the
4031  epsilon closure have a loop, then tie it to the
4032  destination of the root_node. */
4033  ret = re_node_set_insert (dfa->edests + clone_node,
4034  org_dest);
4035  if (BE (ret < 0, 0))
4036  return REG_ESPACE;
4037  break;
4038  }
4039  constraint |= dfa->nodes[org_node].opr.ctx_type;
4040  }
4041  clone_dest = duplicate_node (dfa, org_dest, constraint);
4042  if (BE (clone_dest == -1, 0))
4043  return REG_ESPACE;
4044  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4045  if (BE (ret < 0, 0))
4046  return REG_ESPACE;
4047  }
4048  else /* dfa->edests[org_node].nelem == 2 */
4049  {
4050  /* In case of the node can epsilon-transit, and it has two
4051  destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
4052  org_dest = dfa->edests[org_node].elems[0];
4053  re_node_set_empty (dfa->edests + clone_node);
4054  /* Search for a duplicated node which satisfies the constraint. */
4055  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
4056  if (clone_dest == -1)
4057  {
4058  /* There are no such a duplicated node, create a new one. */
4059  reg_errcode_t err;
4060  clone_dest = duplicate_node (dfa, org_dest, constraint);
4061  if (BE (clone_dest == -1, 0))
4062  return REG_ESPACE;
4063  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4064  if (BE (ret < 0, 0))
4065  return REG_ESPACE;
4066  err = duplicate_node_closure (dfa, org_dest, clone_dest,
4067  root_node, constraint);
4068  if (BE (err != REG_NOERROR, 0))
4069  return err;
4070  }
4071  else
4072  {
4073  /* There are a duplicated node which satisfy the constraint,
4074  use it to avoid infinite loop. */
4075  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4076  if (BE (ret < 0, 0))
4077  return REG_ESPACE;
4078  }
4079 
4080  org_dest = dfa->edests[org_node].elems[1];
4081  clone_dest = duplicate_node (dfa, org_dest, constraint);
4082  if (BE (clone_dest == -1, 0))
4083  return REG_ESPACE;
4084  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
4085  if (BE (ret < 0, 0))
4086  return REG_ESPACE;
4087  }
4088  org_node = org_dest;
4089  clone_node = clone_dest;
4090  }
4091  return REG_NOERROR;
4092 }
4093 
4094 /* Search for a node which is duplicated from the node ORG_NODE, and
4095  satisfies the constraint CONSTRAINT. */
4096 
4097 static int
4098 search_duplicated_node (const re_dfa_t *dfa, int org_node,
4099  unsigned int constraint)
4100 {
4101  int idx;
4102  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
4103  {
4104  if (org_node == dfa->org_indices[idx]
4105  && constraint == dfa->nodes[idx].constraint)
4106  return idx; /* Found. */
4107  }
4108  return -1; /* Not found. */
4109 }
4110 
4111 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
4112  Return the index of the new node, or -1 if insufficient storage is
4113  available. */
4114 
4115 static int
4116 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
4117 {
4118  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
4119  if (BE (dup_idx != -1, 1))
4120  {
4121  dfa->nodes[dup_idx].constraint = constraint;
4122  if (dfa->nodes[org_idx].type == ANCHOR)
4123  dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
4124  dfa->nodes[dup_idx].duplicated = 1;
4125 
4126  /* Store the index of the original node. */
4127  dfa->org_indices[dup_idx] = org_idx;
4128  }
4129  return dup_idx;
4130 }
4131 
4132 static reg_errcode_t
4133 calc_inveclosure (re_dfa_t *dfa)
4134 {
4135  int src, idx, ret;
4136  for (idx = 0; idx < dfa->nodes_len; ++idx)
4137  re_node_set_init_empty (dfa->inveclosures + idx);
4138 
4139  for (src = 0; src < dfa->nodes_len; ++src)
4140  {
4141  int *elems = dfa->eclosures[src].elems;
4142  for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
4143  {
4144  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
4145  if (BE (ret == -1, 0))
4146  return REG_ESPACE;
4147  }
4148  }
4149 
4150  return REG_NOERROR;
4151 }
4152 
4153 /* Calculate "eclosure" for all the node in DFA. */
4154 
4155 static reg_errcode_t
4156 calc_eclosure (re_dfa_t *dfa)
4157 {
4158  int node_idx, incomplete;
4159 #ifdef DEBUG
4160  assert (dfa->nodes_len > 0);
4161 #endif
4162  incomplete = 0;
4163  /* For each nodes, calculate epsilon closure. */
4164  for (node_idx = 0; ; ++node_idx)
4165  {
4166  reg_errcode_t err;
4167  re_node_set eclosure_elem;
4168  if (node_idx == dfa->nodes_len)
4169  {
4170  if (!incomplete)
4171  break;
4172  incomplete = 0;
4173  node_idx = 0;
4174  }
4175 
4176 #ifdef DEBUG
4177  assert (dfa->eclosures[node_idx].nelem != -1);
4178 #endif
4179 
4180  /* If we have already calculated, skip it. */
4181  if (dfa->eclosures[node_idx].nelem != 0)
4182  continue;
4183  /* Calculate epsilon closure of `node_idx'. */
4184  err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
4185  if (BE (err != REG_NOERROR, 0))
4186  return err;
4187 
4188  if (dfa->eclosures[node_idx].nelem == 0)
4189  {
4190  incomplete = 1;
4191  re_node_set_free (&eclosure_elem);
4192  }
4193  }
4194  return REG_NOERROR;
4195 }
4196 
4197 /* Calculate epsilon closure of NODE. */
4198 
4199 static reg_errcode_t
4200 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
4201 {
4202  reg_errcode_t err;
4203  unsigned int constraint;
4204  int i, incomplete;
4205  re_node_set eclosure;
4206  incomplete = 0;
4207  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
4208  if (BE (err != REG_NOERROR, 0))
4209  return err;
4210 
4211  /* This indicates that we are calculating this node now.
4212  We reference this value to avoid infinite loop. */
4213  dfa->eclosures[node].nelem = -1;
4214 
4215  constraint = ((dfa->nodes[node].type == ANCHOR)
4216  ? dfa->nodes[node].opr.ctx_type : 0);
4217  /* If the current node has constraints, duplicate all nodes.
4218  Since they must inherit the constraints. */
4219  if (constraint
4220  && dfa->edests[node].nelem
4221  && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
4222  {
4223  err = duplicate_node_closure (dfa, node, node, node, constraint);
4224  if (BE (err != REG_NOERROR, 0))
4225  return err;
4226  }
4227 
4228  /* Expand each epsilon destination nodes. */
4229  if (IS_EPSILON_NODE(dfa->nodes[node].type))
4230  for (i = 0; i < dfa->edests[node].nelem; ++i)
4231  {
4232  re_node_set eclosure_elem;
4233  int edest = dfa->edests[node].elems[i];
4234  /* If calculating the epsilon closure of `edest' is in progress,
4235  return intermediate result. */
4236  if (dfa->eclosures[edest].nelem == -1)
4237  {
4238  incomplete = 1;
4239  continue;
4240  }
4241  /* If we haven't calculated the epsilon closure of `edest' yet,
4242  calculate now. Otherwise use calculated epsilon closure. */
4243  if (dfa->eclosures[edest].nelem == 0)
4244  {
4245  err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
4246  if (BE (err != REG_NOERROR, 0))
4247  return err;
4248  }
4249  else
4250  eclosure_elem = dfa->eclosures[edest];
4251  /* Merge the epsilon closure of `edest'. */
4252  re_node_set_merge (&eclosure, &eclosure_elem);
4253  /* If the epsilon closure of `edest' is incomplete,
4254  the epsilon closure of this node is also incomplete. */
4255  if (dfa->eclosures[edest].nelem == 0)
4256  {
4257  incomplete = 1;
4258  re_node_set_free (&eclosure_elem);
4259  }
4260  }
4261 
4262  /* Epsilon closures include itself. */
4263  re_node_set_insert (&eclosure, node);
4264  if (incomplete && !root)
4265  dfa->eclosures[node].nelem = 0;
4266  else
4267  dfa->eclosures[node] = eclosure;
4268  *new_set = eclosure;
4269  return REG_NOERROR;
4270 }
4271 
4272 /* Functions for token which are used in the parser. */
4273 
4274 /* Fetch a token from INPUT.
4275  We must not use this function inside bracket expressions. */
4276 
4277 static void
4278 internal_function
4279 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
4280 {
4281  re_string_skip_bytes (input, peek_token (result, input, syntax));
4282 }
4283 
4284 /* Peek a token from INPUT, and return the length of the token.
4285  We must not use this function inside bracket expressions. */
4286 
4287 static int
4288 internal_function
4289 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
4290 {
4291  unsigned char c;
4292 
4293  if (re_string_eoi (input))
4294  {
4295  token->type = END_OF_RE;
4296  return 0;
4297  }
4298 
4299  c = re_string_peek_byte (input, 0);
4300  token->opr.c = c;
4301 
4302  token->word_char = 0;
4303 #ifdef RE_ENABLE_I18N
4304  token->mb_partial = 0;
4305  if (input->mb_cur_max > 1 &&
4306  !re_string_first_byte (input, re_string_cur_idx (input)))
4307  {
4308  token->type = CHARACTER;
4309  token->mb_partial = 1;
4310  return 1;
4311  }
4312 #endif
4313  if (c == '\\')
4314  {
4315  unsigned char c2;
4316  if (re_string_cur_idx (input) + 1 >= re_string_length (input))
4317  {
4318  token->type = BACK_SLASH;
4319  return 1;
4320  }
4321 
4322  c2 = re_string_peek_byte_case (input, 1);
4323  token->opr.c = c2;
4324  token->type = CHARACTER;
4325 #ifdef RE_ENABLE_I18N
4326  if (input->mb_cur_max > 1)
4327  {
4328  wint_t wc = re_string_wchar_at (input,
4329  re_string_cur_idx (input) + 1);
4330  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
4331  }
4332  else
4333 #endif
4334  token->word_char = IS_WORD_CHAR (c2) != 0;
4335 
4336  switch (c2)
4337  {
4338  case '|':
4339  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
4340  token->type = OP_ALT;
4341  break;
4342  case '1': case '2': case '3': case '4': case '5':
4343  case '6': case '7': case '8': case '9':
4344  if (!(syntax & RE_NO_BK_REFS))
4345  {
4346  token->type = OP_BACK_REF;
4347  token->opr.idx = c2 - '1';
4348  }
4349  break;
4350  case '<':
4351  if (!(syntax & RE_NO_GNU_OPS))
4352  {
4353  token->type = ANCHOR;
4354  token->opr.ctx_type = WORD_FIRST;
4355  }
4356  break;
4357  case '>':
4358  if (!(syntax & RE_NO_GNU_OPS))
4359  {
4360  token->type = ANCHOR;
4361  token->opr.ctx_type = WORD_LAST;
4362  }
4363  break;
4364  case 'b':
4365  if (!(syntax & RE_NO_GNU_OPS))
4366  {
4367  token->type = ANCHOR;
4368  token->opr.ctx_type = WORD_DELIM;
4369  }
4370  break;
4371  case 'B':
4372  if (!(syntax & RE_NO_GNU_OPS))
4373  {
4374  token->type = ANCHOR;
4375  token->opr.ctx_type = NOT_WORD_DELIM;
4376  }
4377  break;
4378  case 'w':
4379  if (!(syntax & RE_NO_GNU_OPS))
4380  token->type = OP_WORD;
4381  break;
4382  case 'W':
4383  if (!(syntax & RE_NO_GNU_OPS))
4384  token->type = OP_NOTWORD;
4385  break;
4386  case 's':
4387  if (!(syntax & RE_NO_GNU_OPS))
4388  token->type = OP_SPACE;
4389  break;
4390  case 'S':
4391  if (!(syntax & RE_NO_GNU_OPS))
4392  token->type = OP_NOTSPACE;
4393  break;
4394  case '`':
4395  if (!(syntax & RE_NO_GNU_OPS))
4396  {
4397  token->type = ANCHOR;
4398  token->opr.ctx_type = BUF_FIRST;
4399  }
4400  break;
4401  case '\'':
4402  if (!(syntax & RE_NO_GNU_OPS))
4403  {
4404  token->type = ANCHOR;
4405  token->opr.ctx_type = BUF_LAST;
4406  }
4407  break;
4408  case '(':
4409  if (!(syntax & RE_NO_BK_PARENS))
4410  token->type = OP_OPEN_SUBEXP;
4411  break;
4412  case ')':
4413  if (!(syntax & RE_NO_BK_PARENS))
4414  token->type = OP_CLOSE_SUBEXP;
4415  break;
4416  case '+':
4417  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
4418  token->type = OP_DUP_PLUS;
4419  break;
4420  case '?':
4421  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
4422  token->type = OP_DUP_QUESTION;
4423  break;
4424  case '{':
4425  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
4426  token->type = OP_OPEN_DUP_NUM;
4427  break;
4428  case '}':
4429  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
4430  token->type = OP_CLOSE_DUP_NUM;
4431  break;
4432  default:
4433  break;
4434  }
4435  return 2;
4436  }
4437 
4438  token->type = CHARACTER;
4439 #ifdef RE_ENABLE_I18N
4440  if (input->mb_cur_max > 1)
4441  {
4442  wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
4443  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
4444  }
4445  else
4446 #endif
4447  token->word_char = IS_WORD_CHAR (token->opr.c);
4448 
4449  switch (c)
4450  {
4451  case '\n':
4452  if (syntax & RE_NEWLINE_ALT)
4453  token->type = OP_ALT;
4454  break;
4455  case '|':
4456  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
4457  token->type = OP_ALT;
4458  break;
4459  case '*':
4460  token->type = OP_DUP_ASTERISK;
4461  break;
4462  case '+':
4463  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
4464  token->type = OP_DUP_PLUS;
4465  break;
4466  case '?':
4467  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
4468  token->type = OP_DUP_QUESTION;
4469  break;
4470  case '{':
4471  if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
4472  token->type = OP_OPEN_DUP_NUM;
4473  break;
4474  case '}':
4475  if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
4476  token->type = OP_CLOSE_DUP_NUM;
4477  break;
4478  case '(':
4479  if (syntax & RE_NO_BK_PARENS)
4480  token->type = OP_OPEN_SUBEXP;
4481  break;
4482  case ')':
4483  if (syntax & RE_NO_BK_PARENS)
4484  token->type = OP_CLOSE_SUBEXP;
4485  break;
4486  case '[':
4487  token->type = OP_OPEN_BRACKET;
4488  break;
4489  case '.':
4490  token->type = OP_PERIOD;
4491  break;
4492  case '^':
4493  if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
4494  re_string_cur_idx (input) != 0)
4495  {
4496  char prev = re_string_peek_byte (input, -1);
4497  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
4498  break;
4499  }
4500  token->type = ANCHOR;
4501  token->opr.ctx_type = LINE_FIRST;
4502  break;
4503  case '$':
4504  if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
4505  re_string_cur_idx (input) + 1 != re_string_length (input))
4506  {
4507  re_token_t next;
4508  re_string_skip_bytes (input, 1);
4509  peek_token (&next, input, syntax);
4510  re_string_skip_bytes (input, -1);
4511  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
4512  break;
4513  }
4514  token->type = ANCHOR;
4515  token->opr.ctx_type = LINE_LAST;
4516  break;
4517  default:
4518  break;
4519  }
4520  return 1;
4521 }
4522 
4523 /* Peek a token from INPUT, and return the length of the token.
4524  We must not use this function out of bracket expressions. */
4525 
4526 static int
4527 internal_function
4528 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
4529 {
4530  unsigned char c;
4531  if (re_string_eoi (input))
4532  {
4533  token->type = END_OF_RE;
4534  return 0;
4535  }
4536  c = re_string_peek_byte (input, 0);
4537  token->opr.c = c;
4538 
4539 #ifdef RE_ENABLE_I18N
4540  if (input->mb_cur_max > 1 &&
4541  !re_string_first_byte (input, re_string_cur_idx (input)))
4542  {
4543  token->type = CHARACTER;
4544  return 1;
4545  }
4546 #endif /* RE_ENABLE_I18N */
4547 
4548  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
4549  && re_string_cur_idx (input) + 1 < re_string_length (input))
4550  {
4551  /* In this case, '\' escape a character. */
4552  unsigned char c2;
4553  re_string_skip_bytes (input, 1);
4554  c2 = re_string_peek_byte (input, 0);
4555  token->opr.c = c2;
4556  token->type = CHARACTER;
4557  return 1;
4558  }
4559  if (c == '[') /* '[' is a special char in a bracket exps. */
4560  {
4561  unsigned char c2;
4562  int token_len;
4563  if (re_string_cur_idx (input) + 1 < re_string_length (input))
4564  c2 = re_string_peek_byte (input, 1);
4565  else
4566  c2 = 0;
4567  token->opr.c = c2;
4568  token_len = 2;
4569  switch (c2)
4570  {
4571  case '.':
4572  token->type = OP_OPEN_COLL_ELEM;
4573  break;
4574  case '=':
4575  token->type = OP_OPEN_EQUIV_CLASS;
4576  break;
4577  case ':':
4578  if (syntax & RE_CHAR_CLASSES)
4579  {
4580  token->type = OP_OPEN_CHAR_CLASS;
4581  break;
4582  }
4583  /* else fall through. */
4584  default:
4585  token->type = CHARACTER;
4586  token->opr.c = c;
4587  token_len = 1;
4588  break;
4589  }
4590  return token_len;
4591  }
4592  switch (c)
4593  {
4594  case '-':
4595  token->type = OP_CHARSET_RANGE;
4596  break;
4597  case ']':
4598  token->type = OP_CLOSE_BRACKET;
4599  break;
4600  case '^':
4601  token->type = OP_NON_MATCH_LIST;
4602  break;
4603  default:
4604  token->type = CHARACTER;
4605  }
4606  return 1;
4607 }
4608 
4609 /* Functions for parser. */
4610 
4611 /* Entry point of the parser.
4612  Parse the regular expression REGEXP and return the structure tree.
4613  If an error is occured, ERR is set by error code, and return NULL.
4614  This function build the following tree, from regular expression <reg_exp>:
4615  CAT
4616  / \
4617  / \
4618  <reg_exp> EOR
4619 
4620  CAT means concatenation.
4621  EOR means end of regular expression. */
4622 
4623 static bin_tree_t *
4624 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
4625  reg_errcode_t *err)
4626 {
4627  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4628  bin_tree_t *tree, *eor, *root;
4629  re_token_t current_token;
4630  dfa->syntax = syntax;
4631  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4632  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
4633  if (BE (*err != REG_NOERROR && tree == NULL, 0))
4634  return NULL;
4635  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
4636  if (tree != NULL)
4637  root = create_tree (dfa, tree, eor, CONCAT);
4638  else
4639  root = eor;
4640  if (BE (eor == NULL || root == NULL, 0))
4641  {
4642  *err = REG_ESPACE;
4643  return NULL;
4644  }
4645  return root;
4646 }
4647 
4648 /* This function build the following tree, from regular expression
4649  <branch1>|<branch2>:
4650  ALT
4651  / \
4652  / \
4653  <branch1> <branch2>
4654 
4655  ALT means alternative, which represents the operator `|'. */
4656 
4657 static bin_tree_t *
4658 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
4659  reg_syntax_t syntax, int nest, reg_errcode_t *err)
4660 {
4661  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4662  bin_tree_t *tree, *branch = NULL;
4663  tree = parse_branch (regexp, preg, token, syntax, nest, err);
4664  if (BE (*err != REG_NOERROR && tree == NULL, 0))
4665  return NULL;
4666 
4667  while (token->type == OP_ALT)
4668  {
4669  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4670  if (token->type != OP_ALT && token->type != END_OF_RE
4671  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
4672  {
4673  branch = parse_branch (regexp, preg, token, syntax, nest, err);
4674  if (BE (*err != REG_NOERROR && branch == NULL, 0))
4675  return NULL;
4676  }
4677  else
4678  branch = NULL;
4679  tree = create_tree (dfa, tree, branch, OP_ALT);
4680  if (BE (tree == NULL, 0))
4681  {
4682  *err = REG_ESPACE;
4683  return NULL;
4684  }
4685  }
4686  return tree;
4687 }
4688 
4689 /* This function build the following tree, from regular expression
4690  <exp1><exp2>:
4691  CAT
4692  / \
4693  / \
4694  <exp1> <exp2>
4695 
4696  CAT means concatenation. */
4697 
4698 static bin_tree_t *
4699 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
4700  reg_syntax_t syntax, int nest, reg_errcode_t *err)
4701 {
4702  bin_tree_t *tree, *exp;
4703  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4704  tree = parse_expression (regexp, preg, token, syntax, nest, err);
4705  if (BE (*err != REG_NOERROR && tree == NULL, 0))
4706  return NULL;
4707 
4708  while (token->type != OP_ALT && token->type != END_OF_RE
4709  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
4710  {
4711  exp = parse_expression (regexp, preg, token, syntax, nest, err);
4712  if (BE (*err != REG_NOERROR && exp == NULL, 0))
4713  {
4714  return NULL;
4715  }
4716  if (tree != NULL && exp != NULL)
4717  {
4718  tree = create_tree (dfa, tree, exp, CONCAT);
4719  if (tree == NULL)
4720  {
4721  *err = REG_ESPACE;
4722  return NULL;
4723  }
4724  }
4725  else if (tree == NULL)
4726  tree = exp;
4727  /* Otherwise exp == NULL, we don't need to create new tree. */
4728  }
4729  return tree;
4730 }
4731 
4732 /* This function build the following tree, from regular expression a*:
4733  *
4734  |
4735  a
4736 */
4737 
4738 static bin_tree_t *
4739 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
4740  reg_syntax_t syntax, int nest, reg_errcode_t *err)
4741 {
4742  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4743  bin_tree_t *tree;
4744  switch (token->type)
4745  {
4746  case CHARACTER:
4747  tree = create_token_tree (dfa, NULL, NULL, token);
4748  if (BE (tree == NULL, 0))
4749  {
4750  *err = REG_ESPACE;
4751  return NULL;
4752  }
4753 #ifdef RE_ENABLE_I18N
4754  if (dfa->mb_cur_max > 1)
4755  {
4756  while (!re_string_eoi (regexp)
4757  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
4758  {
4759  bin_tree_t *mbc_remain;
4760  fetch_token (token, regexp, syntax);
4761  mbc_remain = create_token_tree (dfa, NULL, NULL, token);
4762  tree = create_tree (dfa, tree, mbc_remain, CONCAT);
4763  if (BE (mbc_remain == NULL || tree == NULL, 0))
4764  {
4765  *err = REG_ESPACE;
4766  return NULL;
4767  }
4768  }
4769  }
4770 #endif
4771  break;
4772  case OP_OPEN_SUBEXP:
4773  tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
4774  if (BE (*err != REG_NOERROR && tree == NULL, 0))
4775  return NULL;
4776  break;
4777  case OP_OPEN_BRACKET:
4778  tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
4779  if (BE (*err != REG_NOERROR && tree == NULL, 0))
4780  return NULL;
4781  break;
4782  case OP_BACK_REF:
4783  if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
4784  {
4785  *err = REG_ESUBREG;
4786  return NULL;
4787  }
4788  dfa->used_bkref_map |= 1 << token->opr.idx;
4789  tree = create_token_tree (dfa, NULL, NULL, token);
4790  if (BE (tree == NULL, 0))
4791  {
4792  *err = REG_ESPACE;
4793  return NULL;
4794  }
4795  ++dfa->nbackref;
4796  dfa->has_mb_node = 1;
4797  break;
4798  case OP_OPEN_DUP_NUM:
4800  {
4801  *err = REG_BADRPT;
4802  return NULL;
4803  }
4804  /* FALLTHROUGH */
4805  case OP_DUP_ASTERISK:
4806  case OP_DUP_PLUS:
4807  case OP_DUP_QUESTION:
4809  {
4810  *err = REG_BADRPT;
4811  return NULL;
4812  }
4813  else if (syntax & RE_CONTEXT_INDEP_OPS)
4814  {
4815  fetch_token (token, regexp, syntax);
4816  return parse_expression (regexp, preg, token, syntax, nest, err);
4817  }
4818  /* else fall through */
4819  case OP_CLOSE_SUBEXP:
4820  if ((token->type == OP_CLOSE_SUBEXP) &&
4822  {
4823  *err = REG_ERPAREN;
4824  return NULL;
4825  }
4826  /* else fall through */
4827  case OP_CLOSE_DUP_NUM:
4828  /* We treat it as a normal character. */
4829 
4830  /* Then we can these characters as normal characters. */
4831  token->type = CHARACTER;
4832  /* mb_partial and word_char bits should be initialized already
4833  by peek_token. */
4834  tree = create_token_tree (dfa, NULL, NULL, token);
4835  if (BE (tree == NULL, 0))
4836  {
4837  *err = REG_ESPACE;
4838  return NULL;
4839  }
4840  break;
4841  case ANCHOR:
4842  if ((token->opr.ctx_type
4843  & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
4844  && dfa->word_ops_used == 0)
4845  init_word_char (dfa);
4846  if (token->opr.ctx_type == WORD_DELIM
4847  || token->opr.ctx_type == NOT_WORD_DELIM)
4848  {
4849  bin_tree_t *tree_first, *tree_last;
4850  if (token->opr.ctx_type == WORD_DELIM)
4851  {
4852  token->opr.ctx_type = WORD_FIRST;
4853  tree_first = create_token_tree (dfa, NULL, NULL, token);
4854  token->opr.ctx_type = WORD_LAST;
4855  }
4856  else
4857  {
4858  token->opr.ctx_type = INSIDE_WORD;
4859  tree_first = create_token_tree (dfa, NULL, NULL, token);
4860  token->opr.ctx_type = INSIDE_NOTWORD;
4861  }
4862  tree_last = create_token_tree (dfa, NULL, NULL, token);
4863  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
4864  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
4865  {
4866  *err = REG_ESPACE;
4867  return NULL;
4868  }
4869  }
4870  else
4871  {
4872  tree = create_token_tree (dfa, NULL, NULL, token);
4873  if (BE (tree == NULL, 0))
4874  {
4875  *err = REG_ESPACE;
4876  return NULL;
4877  }
4878  }
4879  /* We must return here, since ANCHORs can't be followed
4880  by repetition operators.
4881  eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
4882  it must not be "<ANCHOR(^)><REPEAT(*)>". */
4883  fetch_token (token, regexp, syntax);
4884  return tree;
4885  case OP_PERIOD:
4886  tree = create_token_tree (dfa, NULL, NULL, token);
4887  if (BE (tree == NULL, 0))
4888  {
4889  *err = REG_ESPACE;
4890  return NULL;
4891  }
4892  if (dfa->mb_cur_max > 1)
4893  dfa->has_mb_node = 1;
4894  break;
4895  case OP_WORD:
4896  case OP_NOTWORD:
4897  tree = build_charclass_op (dfa, regexp->trans,
4898  (const unsigned char *) "alnum",
4899  (const unsigned char *) "_",
4900  token->type == OP_NOTWORD, err);
4901  if (BE (*err != REG_NOERROR && tree == NULL, 0))
4902  return NULL;
4903  break;
4904  case OP_SPACE:
4905  case OP_NOTSPACE:
4906  tree = build_charclass_op (dfa, regexp->trans,
4907  (const unsigned char *) "space",
4908  (const unsigned char *) "",
4909  token->type == OP_NOTSPACE, err);
4910  if (BE (*err != REG_NOERROR && tree == NULL, 0))
4911  return NULL;
4912  break;
4913  case OP_ALT:
4914  case END_OF_RE:
4915  return NULL;
4916  case BACK_SLASH:
4917  *err = REG_EESCAPE;
4918  return NULL;
4919  default:
4920  /* Must not happen? */
4921 #ifdef DEBUG
4922  assert (0);
4923 #endif
4924  return NULL;
4925  }
4926  fetch_token (token, regexp, syntax);
4927 
4928  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
4929  || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
4930  {
4931  tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
4932  if (BE (*err != REG_NOERROR && tree == NULL, 0))
4933  return NULL;
4934  /* In BRE consecutive duplications are not allowed. */
4936  && (token->type == OP_DUP_ASTERISK
4937  || token->type == OP_OPEN_DUP_NUM))
4938  {
4939  *err = REG_BADRPT;
4940  return NULL;
4941  }
4942  }
4943 
4944  return tree;
4945 }
4946 
4947 /* This function build the following tree, from regular expression
4948  (<reg_exp>):
4949  SUBEXP
4950  |
4951  <reg_exp>
4952 */
4953 
4954 static bin_tree_t *
4955 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
4956  reg_syntax_t syntax, int nest, reg_errcode_t *err)
4957 {
4958  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
4959  bin_tree_t *tree;
4960  size_t cur_nsub;
4961  cur_nsub = preg->re_nsub++;
4962 
4963  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
4964 
4965  /* The subexpression may be a null string. */
4966  if (token->type == OP_CLOSE_SUBEXP)
4967  tree = NULL;
4968  else
4969  {
4970  tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
4971  if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
4972  *err = REG_EPAREN;
4973  if (BE (*err != REG_NOERROR, 0))
4974  return NULL;
4975  }
4976 
4977  if (cur_nsub <= '9' - '1')
4978  dfa->completed_bkref_map |= 1 << cur_nsub;
4979 
4980  tree = create_tree (dfa, tree, NULL, SUBEXP);
4981  if (BE (tree == NULL, 0))
4982  {
4983  *err = REG_ESPACE;
4984  return NULL;
4985  }
4986  tree->token.opr.idx = cur_nsub;
4987  return tree;
4988 }
4989 
4990 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
4991 
4992 static bin_tree_t *
4993 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
4994  re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
4995 {
4996  bin_tree_t *tree = NULL, *old_tree = NULL;
4997  int i, start, end, start_idx = re_string_cur_idx (regexp);
4998  re_token_t start_token = *token;
4999 
5000  if (token->type == OP_OPEN_DUP_NUM)
5001  {
5002  end = 0;
5003  start = fetch_number (regexp, token, syntax);
5004  if (start == -1)
5005  {
5006  if (token->type == CHARACTER && token->opr.c == ',')
5007  start = 0; /* We treat "{,m}" as "{0,m}". */
5008  else
5009  {
5010  *err = REG_BADBR; /* <re>{} is invalid. */
5011  return NULL;
5012  }
5013  }
5014  if (BE (start != -2, 1))
5015  {
5016  /* We treat "{n}" as "{n,n}". */
5017  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
5018  : ((token->type == CHARACTER && token->opr.c == ',')
5019  ? fetch_number (regexp, token, syntax) : -2));
5020  }
5021  if (BE (start == -2 || end == -2, 0))
5022  {
5023  /* Invalid sequence. */
5024  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
5025  {
5026  if (token->type == END_OF_RE)
5027  *err = REG_EBRACE;
5028  else
5029  *err = REG_BADBR;
5030 
5031  return NULL;
5032  }
5033 
5034  /* If the syntax bit is set, rollback. */
5035  re_string_set_index (regexp, start_idx);
5036  *token = start_token;
5037  token->type = CHARACTER;
5038  /* mb_partial and word_char bits should be already initialized by
5039  peek_token. */
5040  return elem;
5041  }
5042 
5043  if (BE (end != -1 && start > end, 0))
5044  {
5045  /* First number greater than second. */
5046  *err = REG_BADBR;
5047  return NULL;
5048  }
5049  }
5050  else
5051  {
5052  start = (token->type == OP_DUP_PLUS) ? 1 : 0;
5053  end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
5054  }
5055 
5056  fetch_token (token, regexp, syntax);
5057 
5058  if (BE (elem == NULL, 0))
5059  return NULL;
5060  if (BE (start == 0 && end == 0, 0))
5061  {
5062  postorder (elem, free_tree, NULL);
5063  return NULL;
5064  }
5065 
5066  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
5067  if (BE (start > 0, 0))
5068  {
5069  tree = elem;
5070  for (i = 2; i <= start; ++i)
5071  {
5072  elem = duplicate_tree (elem, dfa);
5073  tree = create_tree (dfa, tree, elem, CONCAT);
5074  if (BE (elem == NULL || tree == NULL, 0))
5075  goto parse_dup_op_espace;
5076  }
5077 
5078  if (start == end)
5079  return tree;
5080 
5081  /* Duplicate ELEM before it is marked optional. */
5082  elem = duplicate_tree (elem, dfa);
5083  old_tree = tree;
5084  }
5085  else
5086  old_tree = NULL;
5087 
5088  if (elem->token.type == SUBEXP)
5089  postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
5090 
5091  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
5092  if (BE (tree == NULL, 0))
5093  goto parse_dup_op_espace;
5094 
5095  /* This loop is actually executed only when end != -1,
5096  to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
5097  already created the start+1-th copy. */
5098  for (i = start + 2; i <= end; ++i)
5099  {
5100  elem = duplicate_tree (elem, dfa);
5101  tree = create_tree (dfa, tree, elem, CONCAT);
5102  if (BE (elem == NULL || tree == NULL, 0))
5103  goto parse_dup_op_espace;
5104 
5105  tree = create_tree (dfa, tree, NULL, OP_ALT);
5106  if (BE (tree == NULL, 0))
5107  goto parse_dup_op_espace;
5108  }
5109 
5110  if (old_tree)
5111  tree = create_tree (dfa, old_tree, tree, CONCAT);
5112 
5113  return tree;
5114 
5115  parse_dup_op_espace:
5116  *err = REG_ESPACE;
5117  return NULL;
5118 }
5119 
5120 /* Size reserved for the name of a collating symbol, an equivalence
5121  class or a character class.
      NOTE(review): the original author was not sure that 32 is always
      enough -- confirm against the longest name that can occur.  */
5122 #define BRACKET_NAME_BUF_SIZE 32
5123 
5124 #ifndef _LIBC
5125  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
5126  Build the range expression which starts from START_ELEM, and ends
5127  at END_ELEM. The result are written to MBCSET and SBCSET.
5128  RANGE_ALLOC is the allocated size of mbcset->range_starts, and
5129  mbcset->range_ends, is a pointer argument sinse we may
5130  update it. */
5131 
5132 static reg_errcode_t
5133 internal_function
5134 # ifdef RE_ENABLE_I18N
5135 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
5136  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
5137 # else /* not RE_ENABLE_I18N */
5138 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
5139  bracket_elem_t *end_elem)
5140 # endif /* not RE_ENABLE_I18N */
5141 {
5142  unsigned int start_ch, end_ch;
5143  /* Equivalence Classes and Character Classes can't be a range start/end. */
5144  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
5145  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
5146  0))
5147  return REG_ERANGE;
5148 
5149  /* We can handle no multi character collating elements without libc
5150  support. */
5151  if (BE ((start_elem->type == COLL_SYM
5152  && strlen ((char *) start_elem->opr.name) > 1)
5153  || (end_elem->type == COLL_SYM
5154  && strlen ((char *) end_elem->opr.name) > 1), 0))
5155  return REG_ECOLLATE;
5156 
5157 # ifdef RE_ENABLE_I18N
5158  {
5159  wchar_t wc;
5160  wint_t start_wc;
5161  wint_t end_wc;
5162  wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
5163 
5164  start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
5165  : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
5166  : 0));
5167  end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
5168  : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
5169  : 0));
5170  start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
5171  ? __btowc (start_ch) : start_elem->opr.wch);
5172  end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
5173  ? __btowc (end_ch) : end_elem->opr.wch);
5174  if (start_wc == WEOF || end_wc == WEOF)
5175  return REG_ECOLLATE;
5176  cmp_buf[0] = start_wc;
5177  cmp_buf[4] = end_wc;
5178  if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
5179  return REG_ERANGE;
5180 
5181  /* Got valid collation sequence values, add them as a new entry.
5182  However, for !_LIBC we have no collation elements: if the
5183  character set is single byte, the single byte character set
5184  that we build below suffices. parse_bracket_exp passes
5185  no MBCSET if dfa->mb_cur_max == 1. */
5186  if (mbcset)
5187  {
5188  /* Check the space of the arrays. */
5189  if (BE (*range_alloc == mbcset->nranges, 0))
5190  {
5191  /* There is not enough space, need realloc. */
5192  wchar_t *new_array_start, *new_array_end;
5193  int new_nranges;
5194 
5195  /* +1 in case of mbcset->nranges is 0. */
5196  new_nranges = 2 * mbcset->nranges + 1;
5197  /* Use realloc since mbcset->range_starts and mbcset->range_ends
5198  are NULL if *range_alloc == 0. */
5199  new_array_start = re_realloc (mbcset->range_starts, wchar_t,
5200  new_nranges);
5201  new_array_end = re_realloc (mbcset->range_ends, wchar_t,
5202  new_nranges);
5203 
5204  if (BE (new_array_start == NULL || new_array_end == NULL, 0))
5205  return REG_ESPACE;
5206 
5207  mbcset->range_starts = new_array_start;
5208  mbcset->range_ends = new_array_end;
5209  *range_alloc = new_nranges;
5210  }
5211 
5212  mbcset->range_starts[mbcset->nranges] = start_wc;
5213  mbcset->range_ends[mbcset->nranges++] = end_wc;
5214  }
5215 
5216  /* Build the table for single byte characters. */
5217  for (wc = 0; wc < SBC_MAX; ++wc)
5218  {
5219  cmp_buf[2] = wc;
5220  if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
5221  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
5222  bitset_set (sbcset, wc);
5223  }
5224  }
5225 # else /* not RE_ENABLE_I18N */
5226  {
5227  unsigned int ch;
5228  start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
5229  : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
5230  : 0));
5231  end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
5232  : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
5233  : 0));
5234  if (start_ch > end_ch)
5235  return REG_ERANGE;
5236  /* Build the table for single byte characters. */
5237  for (ch = 0; ch < SBC_MAX; ++ch)
5238  if (start_ch <= ch && ch <= end_ch)
5239  bitset_set (sbcset, ch);
5240  }
5241 # endif /* not RE_ENABLE_I18N */
5242  return REG_NOERROR;
5243 }
5244 #endif /* not _LIBC */
5245 
5246 #ifndef _LIBC
5247 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
5248  Build the collating element which is represented by NAME.
5249  The result are written to MBCSET and SBCSET.
5250  COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
5251  pointer argument since we may update it. */
5252 
5253 static reg_errcode_t
5254 internal_function
5255 # ifdef RE_ENABLE_I18N
5256 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
5257  int *coll_sym_alloc, const unsigned char *name)
5258 # else /* not RE_ENABLE_I18N */
5259 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
5260 # endif /* not RE_ENABLE_I18N */
5261 {
5262  size_t name_len = strlen ((const char *) name);
5263  if (BE (name_len != 1, 0))
5264  return REG_ECOLLATE;
5265  else
5266  {
5267  bitset_set (sbcset, name[0]);
5268  return REG_NOERROR;
5269  }
5270 }
5271 #endif /* not _LIBC */
5272 
5273 /* This function parse bracket expression like "[abc]", "[a-c]",
5274  "[[.a-a.]]" etc. */
5275 
5276 static bin_tree_t *
5277 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
5279 {
5280 #ifdef _LIBC
5281  const unsigned char *collseqmb;
5282  const char *collseqwc;
5283  uint32_t nrules;
5284  int32_t table_size;
5285  const int32_t *symb_table;
5286  const unsigned char *extra;
5287 
5288  /* Local function for parse_bracket_exp used in _LIBC environement.
5289  Seek the collating symbol entry correspondings to NAME.
5290  Return the index of the symbol in the SYMB_TABLE. */
5291 
5292  auto inline int32_t
5293  __attribute ((always_inline))
5294  seek_collating_symbol_entry (name, name_len)
5295  const unsigned char *name;
5296  size_t name_len;
5297  {
5298  int32_t hash = elem_hash ((const char *) name, name_len);
5299  int32_t elem = hash % table_size;
5300  if (symb_table[2 * elem] != 0)
5301  {
5302  int32_t second = hash % (table_size - 2) + 1;
5303 
5304  do
5305  {
5306  /* First compare the hashing value. */
5307  if (symb_table[2 * elem] == hash
5308  /* Compare the length of the name. */
5309  && name_len == extra[symb_table[2 * elem + 1]]
5310  /* Compare the name. */
5311  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
5312  name_len) == 0)
5313  {
5314  /* Yep, this is the entry. */
5315  break;
5316  }
5317 
5318  /* Next entry. */
5319  elem += second;
5320  }
5321  while (symb_table[2 * elem] != 0);
5322  }
5323  return elem;
5324  }
5325 
5326  /* Local function for parse_bracket_exp used in _LIBC environement.
5327  Look up the collation sequence value of BR_ELEM.
5328  Return the value if succeeded, UINT_MAX otherwise. */
5329 
5330  auto inline unsigned int
5331  __attribute ((always_inline))
5332  lookup_collation_sequence_value (br_elem)
5333  bracket_elem_t *br_elem;
5334  {
5335  if (br_elem->type == SB_CHAR)
5336  {
5337  /*
5338  if (MB_CUR_MAX == 1)
5339  */
5340  if (nrules == 0)
5341  return collseqmb[br_elem->opr.ch];
5342  else
5343  {
5344  wint_t wc = __btowc (br_elem->opr.ch);
5345  return __collseq_table_lookup (collseqwc, wc);
5346  }
5347  }
5348  else if (br_elem->type == MB_CHAR)
5349  {
5350  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
5351  }
5352  else if (br_elem->type == COLL_SYM)
5353  {
5354  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
5355  if (nrules != 0)
5356  {
5357  int32_t elem, idx;
5358  elem = seek_collating_symbol_entry (br_elem->opr.name,
5359  sym_name_len);
5360  if (symb_table[2 * elem] != 0)
5361  {
5362  /* We found the entry. */
5363  idx = symb_table[2 * elem + 1];
5364  /* Skip the name of collating element name. */
5365  idx += 1 + extra[idx];
5366  /* Skip the byte sequence of the collating element. */
5367  idx += 1 + extra[idx];
5368  /* Adjust for the alignment. */
5369  idx = (idx + 3) & ~3;
5370  /* Skip the multibyte collation sequence value. */
5371  idx += sizeof (unsigned int);
5372  /* Skip the wide char sequence of the collating element. */
5373  idx += sizeof (unsigned int) *
5374  (1 + *(unsigned int *) (extra + idx));
5375  /* Return the collation sequence value. */
5376  return *(unsigned int *) (extra + idx);
5377  }
5378  else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
5379  {
5380  /* No valid character. Match it as a single byte
5381  character. */
5382  return collseqmb[br_elem->opr.name[0]];
5383  }
5384  }
5385  else if (sym_name_len == 1)
5386  return collseqmb[br_elem->opr.name[0]];
5387  }
5388  return UINT_MAX;
5389  }
5390 
5391  /* Local function for parse_bracket_exp used in _LIBC environement.
5392  Build the range expression which starts from START_ELEM, and ends
5393  at END_ELEM. The result are written to MBCSET and SBCSET.
5394  RANGE_ALLOC is the allocated size of mbcset->range_starts, and
5395  mbcset->range_ends, is a pointer argument sinse we may
5396  update it. */
5397 
5398  auto inline reg_errcode_t
5399  __attribute ((always_inline))
5400  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
5401  re_charset_t *mbcset;
5402  int *range_alloc;
5403  bitset_t sbcset;
5404  bracket_elem_t *start_elem, *end_elem;
5405  {
5406  unsigned int ch;
5407  uint32_t start_collseq;
5408  uint32_t end_collseq;
5409 
5410  /* Equivalence Classes and Character Classes can't be a range
5411  start/end. */
5412  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
5413  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
5414  0))
5415  return REG_ERANGE;
5416 
5417  start_collseq = lookup_collation_sequence_value (start_elem);
5418  end_collseq = lookup_collation_sequence_value (end_elem);
5419  /* Check start/end collation sequence values. */
5420  if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
5421  return REG_ECOLLATE;
5422  if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
5423  return REG_ERANGE;
5424 
5425  /* Got valid collation sequence values, add them as a new entry.
5426  However, if we have no collation elements, and the character set
5427  is single byte, the single byte character set that we
5428  build below suffices. */
5429  if (nrules > 0 || dfa->mb_cur_max > 1)
5430  {
5431  /* Check the space of the arrays. */
5432  if (BE (*range_alloc == mbcset->nranges, 0))
5433  {
5434  /* There is not enough space, need realloc. */
5435  uint32_t *new_array_start;
5436  uint32_t *new_array_end;
5437  int new_nranges;
5438 
5439  /* +1 in case of mbcset->nranges is 0. */
5440  new_nranges = 2 * mbcset->nranges + 1;
5441  new_array_start = re_realloc (mbcset->range_starts, uint32_t,
5442  new_nranges);
5443  new_array_end = re_realloc (mbcset->range_ends, uint32_t,
5444  new_nranges);
5445 
5446  if (BE (new_array_start == NULL || new_array_end == NULL, 0))
5447  return REG_ESPACE;
5448 
5449  mbcset->range_starts = new_array_start;
5450  mbcset->range_ends = new_array_end;
5451  *range_alloc = new_nranges;
5452  }
5453 
5454  mbcset->range_starts[mbcset->nranges] = start_collseq;
5455  mbcset->range_ends[mbcset->nranges++] = end_collseq;
5456  }
5457 
5458  /* Build the table for single byte characters. */
5459  for (ch = 0; ch < SBC_MAX; ch++)
5460  {
5461  uint32_t ch_collseq;
5462  /*
5463  if (MB_CUR_MAX == 1)
5464  */
5465  if (nrules == 0)
5466  ch_collseq = collseqmb[ch];
5467  else
5468  ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
5469  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
5470  bitset_set (sbcset, ch);
5471  }
5472  return REG_NOERROR;
5473  }
5474 
5475  /* Local function for parse_bracket_exp used in _LIBC environement.
5476  Build the collating element which is represented by NAME.
5477  The result are written to MBCSET and SBCSET.
5478  COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
5479  pointer argument sinse we may update it. */
5480 
5481  auto inline reg_errcode_t
5482  __attribute ((always_inline))
5483  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
5484  re_charset_t *mbcset;
5485  int *coll_sym_alloc;
5486  bitset_t sbcset;
5487  const unsigned char *name;
5488  {
5489  int32_t elem, idx;
5490  size_t name_len = strlen ((const char *) name);
5491  if (nrules != 0)
5492  {
5493  elem = seek_collating_symbol_entry (name, name_len);
5494  if (symb_table[2 * elem] != 0)
5495  {
5496  /* We found the entry. */
5497  idx = symb_table[2 * elem + 1];
5498  /* Skip the name of collating element name. */
5499  idx += 1 + extra[idx];
5500  }
5501  else if (symb_table[2 * elem] == 0 && name_len == 1)
5502  {
5503  /* No valid character, treat it as a normal
5504  character. */
5505  bitset_set (sbcset, name[0]);
5506  return REG_NOERROR;
5507  }
5508  else
5509  return REG_ECOLLATE;
5510 
5511  /* Got valid collation sequence, add it as a new entry. */
5512  /* Check the space of the arrays. */
5513  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
5514  {
5515  /* Not enough, realloc it. */
5516  /* +1 in case of mbcset->ncoll_syms is 0. */
5517  int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
5518  /* Use realloc since mbcset->coll_syms is NULL
5519  if *alloc == 0. */
5520  int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
5521  new_coll_sym_alloc);
5522  if (BE (new_coll_syms == NULL, 0))
5523  return REG_ESPACE;
5524  mbcset->coll_syms = new_coll_syms;
5525  *coll_sym_alloc = new_coll_sym_alloc;
5526  }
5527  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
5528  return REG_NOERROR;
5529  }
5530  else
5531  {
5532  if (BE (name_len != 1, 0))
5533  return REG_ECOLLATE;
5534  else
5535  {
5536  bitset_set (sbcset, name[0]);
5537  return REG_NOERROR;
5538  }
5539  }
5540  }
5541 #endif
5542 
5543  re_token_t br_token;
5544  re_bitset_ptr_t sbcset;
5545 #ifdef RE_ENABLE_I18N
5546  re_charset_t *mbcset;
5547  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
5548  int equiv_class_alloc = 0, char_class_alloc = 0;
5549 #endif /* not RE_ENABLE_I18N */
5550  int non_match = 0;
5551  bin_tree_t *work_tree;
5552  int token_len;
5553  int first_round = 1;
5554 #ifdef _LIBC
5555  collseqmb = (const unsigned char *)
5556  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
5557  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
5558  if (nrules)
5559  {
5560  /*
5561  if (MB_CUR_MAX > 1)
5562  */
5563  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
5564  table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
5565  symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
5566  _NL_COLLATE_SYMB_TABLEMB);
5567  extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5568  _NL_COLLATE_SYMB_EXTRAMB);
5569  }
5570 #endif
5571  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
5572 #ifdef RE_ENABLE_I18N
5573  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
5574 #endif /* RE_ENABLE_I18N */
5575 #ifdef RE_ENABLE_I18N
5576  if (BE (sbcset == NULL || mbcset == NULL, 0))
5577 #else
5578  if (BE (sbcset == NULL, 0))
5579 #endif /* RE_ENABLE_I18N */
5580  {
5581  *err = REG_ESPACE;
5582  return NULL;
5583  }
5584 
5585  token_len = peek_token_bracket (token, regexp, syntax);
5586  if (BE (token->type == END_OF_RE, 0))
5587  {
5588  *err = REG_BADPAT;
5589  goto parse_bracket_exp_free_return;
5590  }
5591  if (token->type == OP_NON_MATCH_LIST)
5592  {
5593 #ifdef RE_ENABLE_I18N
5594  mbcset->non_match = 1;
5595 #endif /* not RE_ENABLE_I18N */
5596  non_match = 1;
5598  bitset_set (sbcset, '\0');
5599  re_string_skip_bytes (regexp, token_len); /* Skip a token. */
5600  token_len = peek_token_bracket (token, regexp, syntax);
5601  if (BE (token->type == END_OF_RE, 0))
5602  {
5603  *err = REG_BADPAT;
5604  goto parse_bracket_exp_free_return;
5605  }
5606  }
5607 
5608  /* We treat the first ']' as a normal character. */
5609  if (token->type == OP_CLOSE_BRACKET)
5610  token->type = CHARACTER;
5611 
5612  while (1)
5613  {
5614  bracket_elem_t start_elem, end_elem;
5615  unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
5616  unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
5618  int token_len2 = 0, is_range_exp = 0;
5619  re_token_t token2;
5620 
5621  start_elem.opr.name = start_name_buf;
5622  ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
5623  syntax, first_round);
5624  if (BE (ret != REG_NOERROR, 0))
5625  {
5626  *err = ret;
5627  goto parse_bracket_exp_free_return;
5628  }
5629  first_round = 0;
5630 
5631  /* Get information about the next token. We need it in any case. */
5632  token_len = peek_token_bracket (token, regexp, syntax);
5633 
5634  /* Do not check for ranges if we know they are not allowed. */
5635  if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
5636  {
5637  if (BE (token->type == END_OF_RE, 0))
5638  {
5639  *err = REG_EBRACK;
5640  goto parse_bracket_exp_free_return;
5641  }
5642  if (token->type == OP_CHARSET_RANGE)
5643  {
5644  re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
5645  token_len2 = peek_token_bracket (&token2, regexp, syntax);
5646  if (BE (token2.type == END_OF_RE, 0))
5647  {
5648  *err = REG_EBRACK;
5649  goto parse_bracket_exp_free_return;
5650  }
5651  if (token2.type == OP_CLOSE_BRACKET)
5652  {
5653  /* We treat the last '-' as a normal character. */
5654  re_string_skip_bytes (regexp, -token_len);
5655  token->type = CHARACTER;
5656  }
5657  else
5658  is_range_exp = 1;
5659  }
5660  }
5661 
5662  if (is_range_exp == 1)
5663  {
5664  end_elem.opr.name = end_name_buf;
5665  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
5666  dfa, syntax, 1);
5667  if (BE (ret != REG_NOERROR, 0))
5668  {
5669  *err = ret;
5670  goto parse_bracket_exp_free_return;
5671  }
5672 
5673  token_len = peek_token_bracket (token, regexp, syntax);
5674 
5675 #ifdef _LIBC
5676  *err = build_range_exp (sbcset, mbcset, &range_alloc,
5677  &start_elem, &end_elem);
5678 #else
5679 # ifdef RE_ENABLE_I18N
5680  *err = build_range_exp (sbcset,
5681  dfa->mb_cur_max > 1 ? mbcset : NULL,
5682  &range_alloc, &start_elem, &end_elem);
5683 # else
5684  *err = build_range_exp (sbcset, &start_elem, &end_elem);
5685 # endif
5686 #endif /* RE_ENABLE_I18N */
5687  if (BE (*err != REG_NOERROR, 0))
5688  goto parse_bracket_exp_free_return;
5689  }
5690  else
5691  {
5692  switch (start_elem.type)
5693  {
5694  case SB_CHAR:
5695  bitset_set (sbcset, start_elem.opr.ch);
5696  break;
5697 #ifdef RE_ENABLE_I18N
5698  case MB_CHAR:
5699  /* Check whether the array has enough space. */
5700  if (BE (mbchar_alloc == mbcset->nmbchars, 0))
5701  {
5702  wchar_t *new_mbchars;
5703  /* Not enough, realloc it. */
5704  /* +1 in case of mbcset->nmbchars is 0. */
5705  mbchar_alloc = 2 * mbcset->nmbchars + 1;
5706  /* Use realloc since array is NULL if *alloc == 0. */
5707  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
5708  mbchar_alloc);
5709  if (BE (new_mbchars == NULL, 0))
5710  goto parse_bracket_exp_espace;
5711  mbcset->mbchars = new_mbchars;
5712  }
5713  mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
5714  break;
5715 #endif /* RE_ENABLE_I18N */
5716  case EQUIV_CLASS:
5717  *err = build_equiv_class (sbcset,
5718 #ifdef RE_ENABLE_I18N
5719  mbcset, &equiv_class_alloc,
5720 #endif /* RE_ENABLE_I18N */
5721  start_elem.opr.name);
5722  if (BE (*err != REG_NOERROR, 0))
5723  goto parse_bracket_exp_free_return;
5724  break;
5725  case COLL_SYM:
5726  *err = build_collating_symbol (sbcset,
5727 #ifdef RE_ENABLE_I18N
5728  mbcset, &coll_sym_alloc,
5729 #endif /* RE_ENABLE_I18N */
5730  start_elem.opr.name);
5731  if (BE (*err != REG_NOERROR, 0))
5732  goto parse_bracket_exp_free_return;
5733  break;
5734  case CHAR_CLASS:
5735  *err = build_charclass (regexp->trans, sbcset,
5736 #ifdef RE_ENABLE_I18N
5737  mbcset, &char_class_alloc,
5738 #endif /* RE_ENABLE_I18N */
5739  start_elem.opr.name, syntax);
5740  if (BE (*err != REG_NOERROR, 0))
5741  goto parse_bracket_exp_free_return;
5742  break;
5743  default:
5744  assert (0);
5745  break;
5746  }
5747  }
5748  if (BE (token->type == END_OF_RE, 0))
5749  {
5750  *err = REG_EBRACK;
5751  goto parse_bracket_exp_free_return;
5752  }
5753  if (token->type == OP_CLOSE_BRACKET)
5754  break;
5755  }
5756 
5757  re_string_skip_bytes (regexp, token_len); /* Skip a token. */
5758 
5759  /* If it is non-matching list. */
5760  if (non_match)
5761  bitset_not (sbcset);
5762 
5763 #ifdef RE_ENABLE_I18N
5764  /* Ensure only single byte characters are set. */
5765  if (dfa->mb_cur_max > 1)
5766  bitset_mask (sbcset, dfa->sb_char);
5767 
5768  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
5769  || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
5770  || mbcset->non_match)))
5771  {
5772  bin_tree_t *mbc_tree;
5773  int sbc_idx;
5774  /* Build a tree for complex bracket. */
5775  dfa->has_mb_node = 1;
5776  br_token.type = COMPLEX_BRACKET;
5777  br_token.opr.mbcset = mbcset;
5778  mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5779  if (BE (mbc_tree == NULL, 0))
5780  goto parse_bracket_exp_espace;
5781  for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
5782  if (sbcset[sbc_idx])
5783  break;
5784  /* If there are no bits set in sbcset, there is no point
5785  of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
5786  if (sbc_idx < BITSET_WORDS)
5787  {
5788  /* Build a tree for simple bracket. */
5789  br_token.type = SIMPLE_BRACKET;
5790  br_token.opr.sbcset = sbcset;
5791  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5792  if (BE (work_tree == NULL, 0))
5793  goto parse_bracket_exp_espace;
5794 
5795  /* Then join them by ALT node. */
5796  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
5797  if (BE (work_tree == NULL, 0))
5798  goto parse_bracket_exp_espace;
5799  }
5800  else
5801  {
5802  re_free (sbcset);
5803  work_tree = mbc_tree;
5804  }
5805  }
5806  else
5807 #endif /* not RE_ENABLE_I18N */
5808  {
5809 #ifdef RE_ENABLE_I18N
5810  free_charset (mbcset);
5811 #endif
5812  /* Build a tree for simple bracket. */
5813  br_token.type = SIMPLE_BRACKET;
5814  br_token.opr.sbcset = sbcset;
5815  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
5816  if (BE (work_tree == NULL, 0))
5817  goto parse_bracket_exp_espace;
5818  }
5819  return work_tree;
5820 
5821  parse_bracket_exp_espace:
5822  *err = REG_ESPACE;
5823  parse_bracket_exp_free_return:
5824  re_free (sbcset);
5825 #ifdef RE_ENABLE_I18N
5826  free_charset (mbcset);
5827 #endif /* RE_ENABLE_I18N */
5828  return NULL;
5829 }
5830 
5831 /* Parse an element in the bracket expression. */
5832 
5833 static reg_errcode_t
5834 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
5835  re_token_t *token, int token_len, re_dfa_t *dfa,
5836  reg_syntax_t syntax, int accept_hyphen)
5837 {
5838 #ifdef RE_ENABLE_I18N
5839  int cur_char_size;
5840  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
5841  if (cur_char_size > 1)
5842  {
5843  elem->type = MB_CHAR;
5844  elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
5845  re_string_skip_bytes (regexp, cur_char_size);
5846  return REG_NOERROR;
5847  }
5848 #endif /* RE_ENABLE_I18N */
5849  re_string_skip_bytes (regexp, token_len); /* Skip a token. */
5850  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
5851  || token->type == OP_OPEN_EQUIV_CLASS)
5852  return parse_bracket_symbol (elem, regexp, token);
5853  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
5854  {
5855  /* A '-' must only appear as anything but a range indicator before
5856  the closing bracket. Everything else is an error. */
5857  re_token_t token2;
5858  (void) peek_token_bracket (&token2, regexp, syntax);
5859  if (token2.type != OP_CLOSE_BRACKET)
5860  /* The actual error value is not standardized since this whole
5861  case is undefined. But ERANGE makes good sense. */
5862  return REG_ERANGE;
5863  }
5864  elem->type = SB_CHAR;
5865  elem->opr.ch = token->opr.c;
5866  return REG_NOERROR;
5867 }
5868 
5869 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
5870  such as [:<character_class>:], [.<collating_element>.], and
5871  [=<equivalent_class>=]. */
5872 
5873 static reg_errcode_t
5874 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
5875  re_token_t *token)
5876 {
5877  unsigned char ch, delim = token->opr.c;
5878  int i = 0;
5879  if (re_string_eoi(regexp))
5880  return REG_EBRACK;
5881  for (;; ++i)
5882  {
5883  if (i >= BRACKET_NAME_BUF_SIZE)
5884  return REG_EBRACK;
5885  if (token->type == OP_OPEN_CHAR_CLASS)
5886  ch = re_string_fetch_byte_case (regexp);
5887  else
5888  ch = re_string_fetch_byte (regexp);
5889  if (re_string_eoi(regexp))
5890  return REG_EBRACK;
5891  if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
5892  break;
5893  elem->opr.name[i] = ch;
5894  }
5895  re_string_skip_bytes (regexp, 1);
5896  elem->opr.name[i] = '\0';
5897  switch (token->type)
5898  {
5899  case OP_OPEN_COLL_ELEM:
5900  elem->type = COLL_SYM;
5901  break;
5902  case OP_OPEN_EQUIV_CLASS:
5903  elem->type = EQUIV_CLASS;
5904  break;
5905  case OP_OPEN_CHAR_CLASS:
5906  elem->type = CHAR_CLASS;
5907  break;
5908  default:
5909  break;
5910  }
5911  return REG_NOERROR;
5912 }
5913 
5914  /* Helper function for parse_bracket_exp.
5915  Build the equivalence class which is represented by NAME.
5916  The result are written to MBCSET and SBCSET.
5917  EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
5918  is a pointer argument sinse we may update it. */
5919 
5920 static reg_errcode_t
5921 #ifdef RE_ENABLE_I18N
5922 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
5923  int *equiv_class_alloc, const unsigned char *name)
5924 #else /* not RE_ENABLE_I18N */
5925 build_equiv_class (bitset_t sbcset, const unsigned char *name)
5926 #endif /* not RE_ENABLE_I18N */
5927 {
5928 #ifdef _LIBC
5929  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
5930  if (nrules != 0)
5931  {
5932  const int32_t *table, *indirect;
5933  const unsigned char *weights, *extra, *cp;
5934  unsigned char char_buf[2];
5935  int32_t idx1, idx2;
5936  unsigned int ch;
5937  size_t len;
5938  /* This #include defines a local function! */
5939 # include <locale/weight.h>
5940  /* Calculate the index for equivalence class. */
5941  cp = name;
5942  table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
5943  weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5944  _NL_COLLATE_WEIGHTMB);
5945  extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
5946  _NL_COLLATE_EXTRAMB);
5947  indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
5948  _NL_COLLATE_INDIRECTMB);
5949  idx1 = findidx (&cp);
5950  if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
5951  /* This isn't a valid character. */
5952  return REG_ECOLLATE;
5953 
5954  /* Build single byte matcing table for this equivalence class. */
5955  char_buf[1] = (unsigned char) '\0';
5956  len = weights[idx1];
5957  for (ch = 0; ch < SBC_MAX; ++ch)
5958  {
5959  char_buf[0] = ch;
5960  cp = char_buf;
5961  idx2 = findidx (&cp);
5962 /*
5963  idx2 = table[ch];
5964 */
5965  if (idx2 == 0)
5966  /* This isn't a valid character. */
5967  continue;
5968  if (len == weights[idx2])
5969  {
5970  int cnt = 0;
5971  while (cnt <= len &&
5972  weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
5973  ++cnt;
5974 
5975  if (cnt > len)
5976  bitset_set (sbcset, ch);
5977  }
5978  }
5979  /* Check whether the array has enough space. */
5980  if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
5981  {
5982  /* Not enough, realloc it. */
5983  /* +1 in case of mbcset->nequiv_classes is 0. */
5984  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
5985  /* Use realloc since the array is NULL if *alloc == 0. */
5986  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
5987  int32_t,
5988  new_equiv_class_alloc);
5989  if (BE (new_equiv_classes == NULL, 0))
5990  return REG_ESPACE;
5991  mbcset->equiv_classes = new_equiv_classes;
5992  *equiv_class_alloc = new_equiv_class_alloc;
5993  }
5994  mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
5995  }
5996  else
5997 #endif /* _LIBC */
5998  {
5999  if (BE (strlen ((const char *) name) != 1, 0))
6000  return REG_ECOLLATE;
6001  bitset_set (sbcset, *name);
6002  }
6003  return REG_NOERROR;
6004 }
6005 
6006  /* Helper function for parse_bracket_exp.
6007  Build the character class which is represented by NAME.
6008  The result are written to MBCSET and SBCSET.
6009  CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
6010  is a pointer argument sinse we may update it. */
6011 
6012 static reg_errcode_t
6013 #ifdef RE_ENABLE_I18N
6014 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
6015  re_charset_t *mbcset, int *char_class_alloc,
6016  const unsigned char *class_name, reg_syntax_t syntax)
6017 #else /* not RE_ENABLE_I18N */
6018 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
6019  const unsigned char *class_name, reg_syntax_t syntax)
6020 #endif /* not RE_ENABLE_I18N */
6021 {
6022  int i;
6023  const char *name = (const char *) class_name;
6024 
6025  /* In case of REG_ICASE "upper" and "lower" match the both of
6026  upper and lower cases. */
6027  if ((syntax & RE_ICASE)
6028  && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
6029  name = "alpha";
6030 
6031 #ifdef RE_ENABLE_I18N
6032  /* Check the space of the arrays. */
6033  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
6034  {
6035  /* Not enough, realloc it. */
6036  /* +1 in case of mbcset->nchar_classes is 0. */
6037  int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
6038  /* Use realloc since array is NULL if *alloc == 0. */
6039  wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
6040  new_char_class_alloc);
6041  if (BE (new_char_classes == NULL, 0))
6042  return REG_ESPACE;
6043  mbcset->char_classes = new_char_classes;
6044  *char_class_alloc = new_char_class_alloc;
6045  }
6046  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
6047 #endif /* RE_ENABLE_I18N */
6048 
6049 #define BUILD_CHARCLASS_LOOP(ctype_func) \
6050  do { \
6051  if (BE (trans != NULL, 0)) \
6052  { \
6053  for (i = 0; i < SBC_MAX; ++i) \
6054  if (ctype_func (i)) \
6055  bitset_set (sbcset, trans[i]); \
6056  } \
6057  else \
6058  { \
6059  for (i = 0; i < SBC_MAX; ++i) \
6060  if (ctype_func (i)) \
6061  bitset_set (sbcset, i); \
6062  } \
6063  } while (0)
6064 
6065  if (strcmp (name, "alnum") == 0)
6066  BUILD_CHARCLASS_LOOP (isalnum);
6067  else if (strcmp (name, "cntrl") == 0)
6068  BUILD_CHARCLASS_LOOP (iscntrl);
6069  else if (strcmp (name, "lower") == 0)
6070  BUILD_CHARCLASS_LOOP (islower);
6071  else if (strcmp (name, "space") == 0)
6072  BUILD_CHARCLASS_LOOP (isspace);
6073  else if (strcmp (name, "alpha") == 0)
6074  BUILD_CHARCLASS_LOOP (isalpha);
6075  else if (strcmp (name, "digit") == 0)
6076  BUILD_CHARCLASS_LOOP (isdigit);
6077  else if (strcmp (name, "print") == 0)
6078  BUILD_CHARCLASS_LOOP (isprint);
6079  else if (strcmp (name, "upper") == 0)
6080  BUILD_CHARCLASS_LOOP (isupper);
6081  else if (strcmp (name, "blank") == 0)
6082  BUILD_CHARCLASS_LOOP (isblank);
6083  else if (strcmp (name, "graph") == 0)
6084  BUILD_CHARCLASS_LOOP (isgraph);
6085  else if (strcmp (name, "punct") == 0)
6086  BUILD_CHARCLASS_LOOP (ispunct);
6087  else if (strcmp (name, "xdigit") == 0)
6088  BUILD_CHARCLASS_LOOP (isxdigit);
6089  else
6090  return REG_ECTYPE;
6091 
6092  return REG_NOERROR;
6093 }
6094 
6095 static bin_tree_t *
6096 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
6097  const unsigned char *class_name,
6098  const unsigned char *extra, int non_match,
6099  reg_errcode_t *err)
6100 {
6101  re_bitset_ptr_t sbcset;
6102 #ifdef RE_ENABLE_I18N
6103  re_charset_t *mbcset;
6104  int alloc = 0;
6105 #endif /* not RE_ENABLE_I18N */
6107  re_token_t br_token;
6108  bin_tree_t *tree;
6109 
6110  sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
6111 #ifdef RE_ENABLE_I18N
6112  mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
6113 #endif /* RE_ENABLE_I18N */
6114 
6115 #ifdef RE_ENABLE_I18N
6116  if (BE (sbcset == NULL || mbcset == NULL, 0))
6117 #else /* not RE_ENABLE_I18N */
6118  if (BE (sbcset == NULL, 0))
6119 #endif /* not RE_ENABLE_I18N */
6120  {
6121  *err = REG_ESPACE;
6122  return NULL;
6123  }
6124 
6125  if (non_match)
6126  {
6127 #ifdef RE_ENABLE_I18N
6128  /*
6129  if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
6130  bitset_set(cset->sbcset, '\0');
6131  */
6132  mbcset->non_match = 1;
6133 #endif /* not RE_ENABLE_I18N */
6134  }
6135 
6136  /* We don't care the syntax in this case. */
6137  ret = build_charclass (trans, sbcset,
6138 #ifdef RE_ENABLE_I18N
6139  mbcset, &alloc,
6140 #endif /* RE_ENABLE_I18N */
6141  class_name, 0);
6142 
6143  if (BE (ret != REG_NOERROR, 0))
6144  {
6145  re_free (sbcset);
6146 #ifdef RE_ENABLE_I18N
6147  free_charset (mbcset);
6148 #endif /* RE_ENABLE_I18N */
6149  *err = ret;
6150  return NULL;
6151  }
6152  /* \w match '_' also. */
6153  for (; *extra; extra++)
6154  bitset_set (sbcset, *extra);
6155 
6156  /* If it is non-matching list. */
6157  if (non_match)
6158  bitset_not (sbcset);
6159 
6160 #ifdef RE_ENABLE_I18N
6161  /* Ensure only single byte characters are set. */
6162  if (dfa->mb_cur_max > 1)
6163  bitset_mask (sbcset, dfa->sb_char);
6164 #endif
6165 
6166  /* Build a tree for simple bracket. */
6167  br_token.type = SIMPLE_BRACKET;
6168  br_token.opr.sbcset = sbcset;
6169  tree = create_token_tree (dfa, NULL, NULL, &br_token);
6170  if (BE (tree == NULL, 0))
6171  goto build_word_op_espace;
6172 
6173 #ifdef RE_ENABLE_I18N
6174  if (dfa->mb_cur_max > 1)
6175  {
6176  bin_tree_t *mbc_tree;
6177  /* Build a tree for complex bracket. */
6178  br_token.type = COMPLEX_BRACKET;
6179  br_token.opr.mbcset = mbcset;
6180  dfa->has_mb_node = 1;
6181  mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
6182  if (BE (mbc_tree == NULL, 0))
6183  goto build_word_op_espace;
6184  /* Then join them by ALT node. */
6185  tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
6186  if (BE (mbc_tree != NULL, 1))
6187  return tree;
6188  }
6189  else
6190  {
6191  free_charset (mbcset);
6192  return tree;
6193  }
6194 #else /* not RE_ENABLE_I18N */
6195  return tree;
6196 #endif /* not RE_ENABLE_I18N */
6197 
6198  build_word_op_espace:
6199  re_free (sbcset);
6200 #ifdef RE_ENABLE_I18N
6201  free_charset (mbcset);
6202 #endif /* RE_ENABLE_I18N */
6203  *err = REG_ESPACE;
6204  return NULL;
6205 }
6206 
6207 /* This is intended for the expressions like "a{1,3}".
6208  Fetch a number from `input', and return the number.
6209  Return -1, if the number field is empty like "{,1}".
6210  Return -2, If an error is occured. */
6211 
6212 static int
6213 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
6214 {
6215  int num = -1;
6216  unsigned char c;
6217  while (1)
6218  {
6219  fetch_token (token, input, syntax);
6220  c = token->opr.c;
6221  if (BE (token->type == END_OF_RE, 0))
6222  return -2;
6223  if (token->type == OP_CLOSE_DUP_NUM || c == ',')
6224  break;
6225  num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
6226  ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
6227  num = (num > RE_DUP_MAX) ? -2 : num;
6228  }
6229  return num;
6230 }
6231 
#ifdef RE_ENABLE_I18N
/* Release the charset CSET: first each array it points to, then the
   structure itself.  CSET must not be NULL, since its members are read
   before it is freed.  */
static void
free_charset (re_charset_t *cset)
{
  re_free (cset->char_classes);
# ifdef _LIBC
  /* These members are only released in the _LIBC build.  */
  re_free (cset->range_ends);
  re_free (cset->range_starts);
  re_free (cset->equiv_classes);
  re_free (cset->coll_syms);
# endif
  re_free (cset->mbchars);

  /* The container goes last.  */
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
6247 
6248 /* Functions for binary tree operation. */
6249 
6250 /* Create a tree node. */
6251 
6252 static bin_tree_t *
6253 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
6254  re_token_type_t type)
6255 {
6256  re_token_t t;
6257  t.type = type;
6258  return create_token_tree (dfa, left, right, &t);
6259 }
6260 
6261 static bin_tree_t *
6262 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
6263  const re_token_t *token)
6264 {
6265  bin_tree_t *tree;
6266  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
6267  {
6268  bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
6269 
6270  if (storage == NULL)
6271  return NULL;
6272  storage->next = dfa->str_tree_storage;
6273  dfa->str_tree_storage = storage;
6274  dfa->str_tree_storage_idx = 0;
6275  }
6276  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
6277 
6278  tree->parent = NULL;
6279  tree->left = left;
6280  tree->right = right;
6281  tree->token = *token;
6282  tree->token.duplicated = 0;
6283  tree->token.opt_subexp = 0;
6284  tree->first = NULL;
6285  tree->next = NULL;
6286  tree->node_idx = -1;
6287 
6288  if (left != NULL)
6289  left->parent = tree;
6290  if (right != NULL)
6291  right->parent = tree;
6292  return tree;
6293 }
6294 
6295 /* Mark the tree SRC as an optional subexpression.
6296  To be called from preorder or postorder. */
6297 
6298 static reg_errcode_t
6299 mark_opt_subexp (void *extra, bin_tree_t *node)
6300 {
6301  int idx = (int) (long) extra;
6302  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
6303  node->token.opt_subexp = 1;
6304 
6305  return REG_NOERROR;
6306 }
6307 
6308 /* Free the allocated memory inside NODE. */
6309 
6310 static void
6311 free_token (re_token_t *node)
6312 {
6313 #ifdef RE_ENABLE_I18N
6314  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
6315  free_charset (node->opr.mbcset);
6316  else
6317 #endif /* RE_ENABLE_I18N */
6318  if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
6319  re_free (node->opr.sbcset);
6320 }
6321 
6322 /* Worker function for tree walking. Free the allocated memory inside NODE
6323  and its children. */
6324 
6325 static reg_errcode_t
6326 free_tree (void *extra, bin_tree_t *node)
6327 {
6328  free_token (&node->token);
6329  return REG_NOERROR;
6330 }
6331 
6332 
6333 /* Duplicate the node SRC, and return new node. This is a preorder
6334  visit similar to the one implemented by the generic visitor, but
6335  we need more infrastructure to maintain two parallel trees --- so,
6336  it's easier to duplicate. */
6337 
6338 static bin_tree_t *
6339 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
6340 {
6341  const bin_tree_t *node;
6342  bin_tree_t *dup_root;
6343  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
6344 
6345  for (node = root; ; )
6346  {
6347  /* Create a new tree and link it back to the current parent. */
6348  *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
6349  if (*p_new == NULL)
6350  return NULL;
6351  (*p_new)->parent = dup_node;
6352  (*p_new)->token.duplicated = 1;
6353  dup_node = *p_new;
6354 
6355  /* Go to the left node, or up and to the right. */
6356  if (node->left)
6357  {
6358  node = node->left;
6359  p_new = &dup_node->left;
6360  }
6361  else
6362  {
6363  const bin_tree_t *prev = NULL;
6364  while (node->right == prev || node->right == NULL)
6365  {
6366  prev = node;
6367  node = node->parent;
6368  dup_node = dup_node->parent;
6369  if (!node)
6370  return dup_root;
6371  }
6372  node = node->right;
6373  p_new = &dup_node->right;
6374  }
6375  }
6376 }
6377 
6378 /******************************************************************************/
6379 /******************************************************************************/
6380 /******************************************************************************/
6381 /* GKINCLUDE #include "regexec.c" */
6382 /******************************************************************************/
6383 /******************************************************************************/
6384 /******************************************************************************/
6385 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
6386  int n) internal_function;
6387 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
6388 static void match_ctx_free (re_match_context_t *cache) internal_function;
6389 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
6390  int str_idx, int from, int to)
6391  internal_function;
6392 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
6393  internal_function;
6394 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
6395  int str_idx) internal_function;
6396 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
6397  int node, int str_idx)
6398  internal_function;
6399 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
6400  re_dfastate_t **limited_sts, int last_node,
6401  int last_str_idx)
6402  internal_function;
6403 static reg_errcode_t re_search_internal (const regex_t *preg,
6404  const char *string, int length,
6405  int start, int range, int stop,
6406  size_t nmatch, regmatch_t pmatch[],
6407  int eflags) internal_function;
6408 static int re_search_2_stub (struct re_pattern_buffer *bufp,
6409  const char *string1, int length1,
6410  const char *string2, int length2,
6411  int start, int range, struct re_registers *regs,
6412  int stop, int ret_len) internal_function;
6413 static int re_search_stub (struct re_pattern_buffer *bufp,
6414  const char *string, int length, int start,
6415  int range, int stop, struct re_registers *regs,
6416  int ret_len) internal_function;
6417 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
6418  int nregs, int regs_allocated) internal_function;
6419 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
6420  internal_function;
6421 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
6422  int *p_match_first) internal_function;
6423 static int check_halt_state_context (const re_match_context_t *mctx,
6424  const re_dfastate_t *state, int idx)
6425  internal_function;
6426 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
6427  regmatch_t *prev_idx_match, int cur_node,
6428  int cur_idx, int nmatch) internal_function;
6429 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
6430  int str_idx, int dest_node, int nregs,
6431  regmatch_t *regs,
6432  re_node_set *eps_via_nodes)
6433  internal_function;
6434 static reg_errcode_t set_regs (const regex_t *preg,
6435  const re_match_context_t *mctx,
6436  size_t nmatch, regmatch_t *pmatch,
6437  int fl_backtrack) internal_function;
6438 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
6439  internal_function;
6440 
6441 #ifdef RE_ENABLE_I18N
6442 static int sift_states_iter_mb (const re_match_context_t *mctx,
6443  re_sift_context_t *sctx,
6444  int node_idx, int str_idx, int max_str_idx)
6445  internal_function;
6446 #endif /* RE_ENABLE_I18N */
6447 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
6448  re_sift_context_t *sctx)
6449  internal_function;
6450 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
6451  re_sift_context_t *sctx, int str_idx,
6452  re_node_set *cur_dest)
6453  internal_function;
6454 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
6455  re_sift_context_t *sctx,
6456  int str_idx,
6457  re_node_set *dest_nodes)
6458  internal_function;
6459 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
6460  re_node_set *dest_nodes,
6461  const re_node_set *candidates)
6462  internal_function;
6463 static int check_dst_limits (const re_match_context_t *mctx,
6464  re_node_set *limits,
6465  int dst_node, int dst_idx, int src_node,
6466  int src_idx) internal_function;
6467 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
6468  int boundaries, int subexp_idx,
6469  int from_node, int bkref_idx)
6470  internal_function;
6471 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
6472  int limit, int subexp_idx,
6473  int node, int str_idx,
6474  int bkref_idx) internal_function;
6475 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
6476  re_node_set *dest_nodes,
6477  const re_node_set *candidates,
6478  re_node_set *limits,
6479  struct re_backref_cache_entry *bkref_ents,
6480  int str_idx) internal_function;
6481 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
6482  re_sift_context_t *sctx,
6483  int str_idx, const re_node_set *candidates)
6484  internal_function;
6485 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
6486  re_dfastate_t **dst,
6487  re_dfastate_t **src, int num)
6488  internal_function;
6489 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
6490  re_match_context_t *mctx) internal_function;
6491 static re_dfastate_t *transit_state (reg_errcode_t *err,
6492  re_match_context_t *mctx,
6493  re_dfastate_t *state) internal_function;
6494 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
6495  re_match_context_t *mctx,
6496  re_dfastate_t *next_state)
6497  internal_function;
6498 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
6499  re_node_set *cur_nodes,
6500  int str_idx) internal_function;
6501 #if 0
6502 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
6503  re_match_context_t *mctx,
6504  re_dfastate_t *pstate)
6505  internal_function;
6506 #endif
6507 #ifdef RE_ENABLE_I18N
6508 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
6509  re_dfastate_t *pstate)
6510  internal_function;
6511 #endif /* RE_ENABLE_I18N */
6512 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
6513  const re_node_set *nodes)
6514  internal_function;
6515 static reg_errcode_t get_subexp (re_match_context_t *mctx,
6516  int bkref_node, int bkref_str_idx)
6517  internal_function;
6518 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
6519  const re_sub_match_top_t *sub_top,
6520  re_sub_match_last_t *sub_last,
6521  int bkref_node, int bkref_str)
6522  internal_function;
6523 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
6524  int subexp_idx, int type) internal_function;
6525 static reg_errcode_t check_arrival (re_match_context_t *mctx,
6526  state_array_t *path, int top_node,
6527  int top_str, int last_node, int last_str,
6528  int type) internal_function;
6529 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
6530  int str_idx,
6531  re_node_set *cur_nodes,
6532  re_node_set *next_nodes)
6533  internal_function;
6534 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
6535  re_node_set *cur_nodes,
6536  int ex_subexp, int type)
6537  internal_function;
6538 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
6539  re_node_set *dst_nodes,
6540  int target, int ex_subexp,
6541  int type) internal_function;
6542 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
6543  re_node_set *cur_nodes, int cur_str,
6544  int subexp_num, int type)
6545  internal_function;
6546 static int build_trtable (const re_dfa_t *dfa,
6547  re_dfastate_t *state) internal_function;
6548 #ifdef RE_ENABLE_I18N
6549 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
6550  const re_string_t *input, int idx)
6551  internal_function;
6552 # ifdef _LIBC
6553 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
6554  size_t name_len)
6555  internal_function;
6556 # endif /* _LIBC */
6557 #endif /* RE_ENABLE_I18N */
6558 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
6559  const re_dfastate_t *state,
6560  re_node_set *states_node,
6561  bitset_t *states_ch) internal_function;
6562 static int check_node_accept (const re_match_context_t *mctx,
6563  const re_token_t *node, int idx)
6564  internal_function;
6565 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
6566  internal_function;
6567 
6568 /* Entry point for POSIX code. */
6569 
6570 /* regexec searches for a given pattern, specified by PREG, in the
6571  string STRING.
6572 
6573  If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6574  `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6575  least NMATCH elements, and we set them to the offsets of the
6576  corresponding matched substrings.
6577 
6578  EFLAGS specifies `execution flags' which affect matching: if
6579  REG_NOTBOL is set, then ^ does not match at the beginning of the
6580  string; if REG_NOTEOL is set, then $ does not match at the end.
6581 
6582  We return 0 if we find a match and REG_NOMATCH if not. */
6583 
6584 int
6585 regexec (preg, string, nmatch, pmatch, eflags)
6586  const regex_t *__restrict preg;
6587  const char *__restrict string;
6588  size_t nmatch;
6589  regmatch_t pmatch[];
6590  int eflags;
6591 {
6592  reg_errcode_t err;
6593  int start, length;
6594  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
6595 
6596  if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
6597  return REG_BADPAT;
6598 
6599  if (eflags & REG_STARTEND)
6600  {
6601  start = pmatch[0].rm_so;
6602  length = pmatch[0].rm_eo;
6603  }
6604  else
6605  {
6606  start = 0;
6607  length = strlen (string);
6608  }
6609 
6610  __libc_lock_lock (dfa->lock);
6611  if (preg->no_sub)
6612  err = re_search_internal (preg, string, length, start, length - start,
6613  length, 0, NULL, eflags);
6614  else
6615  err = re_search_internal (preg, string, length, start, length - start,
6616  length, nmatch, pmatch, eflags);
6617  __libc_lock_unlock (dfa->lock);
6618  return err != REG_NOERROR;
6619 }
6620 
#ifdef _LIBC
# include <shlib-compat.h>
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);

# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
__typeof__ (__regexec) __compat_regexec;

/* Entry point bound to the GLIBC_2_0 symbol version of regexec.  It
   forwards to the current regexec after clearing every execution flag
   other than REG_NOTBOL and REG_NOTEOL.  */
int
attribute_compat_text_section
__compat_regexec (const regex_t *__restrict preg,
                  const char *__restrict string, size_t nmatch,
                  regmatch_t pmatch[], int eflags)
{
  const int legacy_flags = REG_NOTBOL | REG_NOTEOL;

  return regexec (preg, string, nmatch, pmatch, eflags & legacy_flags);
}
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
# endif
#endif
6640 
/* Entry points for GNU code.  */

/* re_match, re_search, re_match_2, re_search_2

   The former two functions operate on STRING with length LENGTH,
   while the latter two operate on the concatenation of STRING1 and
   STRING2, whose lengths are LENGTH1 and LENGTH2 respectively.

   re_match() matches the compiled pattern in BUFP against the string,
   starting at index START.

   re_search() first tries matching at index START, then at index
   START + 1, and so on.  The last start position tried is
   START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
   way as re_match().)

   The parameter STOP of re_{match,search}_2 specifies that no match
   extending beyond the first STOP characters of the concatenation of
   the strings is to be considered.

   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the
   match and of all groups are stored in REGS.  (For the "_2" variants,
   the offsets are computed relative to the concatenation, not relative
   to the individual strings.)

   On success, the re_match* functions return the length of the match
   and the re_search* functions return the position of the start of the
   match.  A return value of -1 means no match was found and -2
   indicates an internal error.  */

int
re_match (bufp, string, length, start, regs)
     struct re_pattern_buffer *bufp;
     const char *string;
     int length, start;
     struct re_registers *regs;
{
  /* A range of 0 pins the attempt to START; a nonzero RET_LEN asks
     the stub for the length of the match.  */
  const int range = 0;
  const int want_length = 1;

  return re_search_stub (bufp, string, length, start, range, length, regs,
                         want_length);
}
#ifdef _LIBC
weak_alias (__re_match, re_match)
#endif
6682 
/* Search STRING (LENGTH bytes) for the pattern in BUFP, trying start
   positions from START through START + RANGE.  The whole string may be
   examined (STOP is LENGTH).  Returns what re_search_stub () returns
   when asked for the match position rather than the match length.  */
int
re_search (bufp, string, length, start, range, regs)
     struct re_pattern_buffer *bufp;
     const char *string;
     int length, start, range;
     struct re_registers *regs;
{
  const int stop = length;
  const int want_length = 0;

  return re_search_stub (bufp, string, length, start, range, stop, regs,
                         want_length);
}
#ifdef _LIBC
weak_alias (__re_search, re_search)
#endif
6695 
/* Like re_match, but the subject is STRING1 (LENGTH1 bytes) followed
   by STRING2 (LENGTH2 bytes), and STOP is forwarded as the stop limit.
   Delegates to re_search_2_stub () with a range of 0 and a request
   for the match length.  */
int
re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
     struct re_pattern_buffer *bufp;
     const char *string1, *string2;
     int length1, length2, start, stop;
     struct re_registers *regs;
{
  const int range = 0;
  const int want_length = 1;

  return re_search_2_stub (bufp, string1, length1, string2, length2,
                           start, range, regs, stop, want_length);
}
#ifdef _LIBC
weak_alias (__re_match_2, re_match_2)
#endif
6709 
/* Like re_search, but the subject is STRING1 (LENGTH1 bytes) followed
   by STRING2 (LENGTH2 bytes), and STOP is forwarded as the stop limit.
   Delegates to re_search_2_stub () with a request for the match
   position.  */
int
re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
     struct re_pattern_buffer *bufp;
     const char *string1, *string2;
     int length1, length2, start, range, stop;
     struct re_registers *regs;
{
  const int want_length = 0;

  return re_search_2_stub (bufp, string1, length1, string2, length2,
                           start, range, regs, stop, want_length);
}
#ifdef _LIBC
weak_alias (__re_search_2, re_search_2)
#endif
6723 
6724 static int
6725 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
6726  stop, ret_len)
6727  struct re_pattern_buffer *bufp;
6728  const char *string1, *string2;
6729  int length1, length2, start, range, stop, ret_len;
6730  struct re_registers *regs;
6731 {
6732  const char *str;
6733  int rval;
6734  int len = length1 + length2;
6735  int free_str = 0;
6736 
6737  if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
6738  return -2;
6739 
6740  /* Concatenate the strings. */
6741  if (length2 > 0)
6742  if (length1 > 0)
6743  {
6744  char *s = re_malloc (char, len);
6745 
6746  if (BE (s == NULL, 0))
6747  return -2;
6748 #ifdef _LIBC
6749  memcpy (__mempcpy (s, string1, length1), string2, length2);
6750 #else
6751  memcpy (s, string1, length1);
6752  memcpy (s + length1, string2, length2);
6753 #endif
6754  str = s;
6755  free_str = 1;
6756  }
6757  else
6758  str = string2;
6759  else
6760  str = string1;
6761 
6762  rval = re_search_stub (bufp, str, len, start, range, stop, regs,
6763  ret_len);
6764  if (free_str)
6765  re_free ((char *) str);
6766  return rval;
6767 }
6768 
6769 /* The parameters have the same meaning as those of re_search.
6770  Additional parameters:
6771  If RET_LEN is nonzero the length of the match is returned (re_match style);
6772  otherwise the position of the match is returned. */
6773 
6774 static int
6775 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
6776  struct re_pattern_buffer *bufp;
6777  const char *string;
6778  int length, start, range, stop, ret_len;
6779  struct re_registers *regs;
6780 {
6782  regmatch_t *pmatch;
6783  int nregs, rval;
6784  int eflags = 0;
6785  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
6786 
6787  /* Check for out-of-range. */
6788  if (BE (start < 0 || start > length, 0))
6789  return -1;
6790  if (BE (start + range > length, 0))
6791  range = length - start;
6792  else if (BE (start + range < 0, 0))
6793  range = -start;
6794 
6795  __libc_lock_lock (dfa->lock);
6796 
6797  eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
6798  eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
6799 
6800  /* Compile fastmap if we haven't yet. */
6801  if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
6802  re_compile_fastmap (bufp);
6803 
6804  if (BE (bufp->no_sub, 0))
6805  regs = NULL;
6806 
6807  /* We need at least 1 register. */
6808  if (regs == NULL)
6809  nregs = 1;
6810  else if (BE (bufp->regs_allocated == REGS_FIXED &&
6811  regs->num_regs < bufp->re_nsub + 1, 0))
6812  {
6813  nregs = regs->num_regs;
6814  if (BE (nregs < 1, 0))
6815  {
6816  /* Nothing can be copied to regs. */
6817  regs = NULL;
6818  nregs = 1;
6819  }
6820  }
6821  else
6822  nregs = bufp->re_nsub + 1;
6823  pmatch = re_malloc (regmatch_t, nregs);
6824  if (BE (pmatch == NULL, 0))
6825  {
6826  rval = -2;
6827  goto out;
6828  }
6829 
6830  result = re_search_internal (bufp, string, length, start, range, stop,
6831  nregs, pmatch, eflags);
6832 
6833  rval = 0;
6834 
6835  /* I hope we needn't fill ther regs with -1's when no match was found. */
6836  if (result != REG_NOERROR)
6837  rval = -1;
6838  else if (regs != NULL)
6839  {
6840  /* If caller wants register contents data back, copy them. */
6841  bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
6842  bufp->regs_allocated);
6843  if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
6844  rval = -2;
6845  }
6846 
6847  if (BE (rval == 0, 1))
6848  {
6849  if (ret_len)
6850  {
6851  assert (pmatch[0].rm_so == start);
6852  rval = pmatch[0].rm_eo - start;
6853  }
6854  else
6855  rval = pmatch[0].rm_so;
6856  }
6857  re_free (pmatch);
6858  out:
6859  __libc_lock_unlock (dfa->lock);
6860  return rval;
6861 }
6862 
6863 static unsigned
6864 re_copy_regs (regs, pmatch, nregs, regs_allocated)
6865  struct re_registers *regs;
6866  regmatch_t *pmatch;
6867  int nregs, regs_allocated;
6868 {
6869  int rval = REGS_REALLOCATE;
6870  int i;
6871  int need_regs = nregs + 1;
6872  /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
6873  uses. */
6874 
6875  /* Have the register data arrays been allocated? */
6876  if (regs_allocated == REGS_UNALLOCATED)
6877  { /* No. So allocate them with malloc. */
6878  regs->start = re_malloc (regoff_t, need_regs);
6879  regs->end = re_malloc (regoff_t, need_regs);
6880  if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
6881  return REGS_UNALLOCATED;
6882  regs->num_regs = need_regs;
6883  }
6884  else if (regs_allocated == REGS_REALLOCATE)
6885  { /* Yes. If we need more elements than were already
6886  allocated, reallocate them. If we need fewer, just
6887  leave it alone. */
6888  if (BE (need_regs > regs->num_regs, 0))
6889  {
6890  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
6891  regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
6892  if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
6893  return REGS_UNALLOCATED;
6894  regs->start = new_start;
6895  regs->end = new_end;
6896  regs->num_regs = need_regs;
6897  }
6898  }
6899  else
6900  {
6901  assert (regs_allocated == REGS_FIXED);
6902  /* This function may not be called with REGS_FIXED and nregs too big. */
6903  assert (regs->num_regs >= nregs);
6904  rval = REGS_FIXED;
6905  }
6906 
6907  /* Copy the regs. */
6908  for (i = 0; i < nregs; ++i)
6909  {
6910  regs->start[i] = pmatch[i].rm_so;
6911  regs->end[i] = pmatch[i].rm_eo;
6912  }
6913  for ( ; i < regs->num_regs; ++i)
6914  regs->start[i] = regs->end[i] = -1;
6915 
6916  return rval;
6917 }
6918 
6919 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
6920  ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
6921  this memory for recording register information. STARTS and ENDS
6922  must be allocated using the malloc library routine, and must each
6923  be at least NUM_REGS * sizeof (regoff_t) bytes long.
6924 
6925  If NUM_REGS == 0, then subsequent matches should allocate their own
6926  register data.
6927 
6928  Unless this function is called, the first search or match using
6929  PATTERN_BUFFER will allocate its own register data, without
6930  freeing the old data. */
6931 
6932 void
6933 re_set_registers (bufp, regs, num_regs, starts, ends)
6934  struct re_pattern_buffer *bufp;
6935  struct re_registers *regs;
6936  unsigned num_regs;
6937  regoff_t *starts, *ends;
6938 {
6939  if (num_regs)
6940  {
6942  regs->num_regs = num_regs;
6943  regs->start = starts;
6944  regs->end = ends;
6945  }
6946  else
6947  {
6949  regs->num_regs = 0;
6950  regs->start = regs->end = (regoff_t *) 0;
6951  }
6952 }
6953 #ifdef _LIBC
6954 weak_alias (__re_set_registers, re_set_registers)
6955 #endif
6956 
/* Entry points compatible with 4.2 BSD regex library.  We don't define
   them unless specifically requested.  */

#if defined _REGEX_RE_COMP || defined _LIBC
/* Match S against the pattern held in re_comp_buf.  Returns 1 when
   regexec () reports a match (returns 0) and 0 otherwise.  */
int
# ifdef _LIBC
weak_function
# endif
re_exec (s)
     const char *s;
{
  const int status = regexec (&re_comp_buf, s, 0, NULL, 0);

  return status == 0;
}
#endif /* _REGEX_RE_COMP */
6971 
6972 /* Internal entry point. */
6973 
6974 /* Searches for a compiled pattern PREG in the string STRING, whose
6975  length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
6976  meanings as for regexec. START and RANGE have the same meanings
6977  as for re_search.
6978  Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
6979  otherwise return the error code.
6980  Note: We assume front end functions already check ranges.
6981  (START + RANGE >= 0 && START + RANGE <= LENGTH) */
6982 
6983 static reg_errcode_t
6984 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
6985  eflags)
6986  const regex_t *preg;
6987  const char *string;
6988  int length, start, range, stop, eflags;
6989  size_t nmatch;
6990  regmatch_t pmatch[];
6991 {
6992  reg_errcode_t err;
6993  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
6994  int left_lim, right_lim, incr;
6995  int fl_longest_match, match_first, match_kind, match_last = -1;
6996  int extra_nmatch;
6997  int sb, ch;
6998 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
6999  re_match_context_t mctx = { .dfa = dfa };
7000 #else
7001  re_match_context_t mctx;
7002 #endif
7003  char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
7004  && range && !preg->can_be_null) ? preg->fastmap : NULL;
7005  RE_TRANSLATE_TYPE t = preg->translate;
7006 
7007 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
7008  memset (&mctx, '\0', sizeof (re_match_context_t));
7009  mctx.dfa = dfa;
7010 #endif
7011 
7012  extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
7013  nmatch -= extra_nmatch;
7014 
7015  /* Check if the DFA haven't been compiled. */
7016  if (BE (preg->used == 0 || dfa->init_state == NULL
7017  || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
7018  || dfa->init_state_begbuf == NULL, 0))
7019  return REG_NOMATCH;
7020 
7021 #ifdef DEBUG
7022  /* We assume front-end functions already check them. */
7023  assert (start + range >= 0 && start + range <= length);
7024 #endif
7025 
7026  /* If initial states with non-begbuf contexts have no elements,
7027  the regex must be anchored. If preg->newline_anchor is set,
7028  we'll never use init_state_nl, so do not check it. */
7029  if (dfa->init_state->nodes.nelem == 0
7030  && dfa->init_state_word->nodes.nelem == 0
7031  && (dfa->init_state_nl->nodes.nelem == 0
7032  || !preg->newline_anchor))
7033  {
7034  if (start != 0 && start + range != 0)
7035  return REG_NOMATCH;
7036  start = range = 0;
7037  }
7038 
7039  /* We must check the longest matching, if nmatch > 0. */
7040  fl_longest_match = (nmatch != 0 || dfa->nbackref);
7041 
7042  err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
7043  preg->translate, preg->syntax & RE_ICASE, dfa);
7044  if (BE (err != REG_NOERROR, 0))
7045  goto free_return;
7046  mctx.input.stop = stop;
7047  mctx.input.raw_stop = stop;
7048  mctx.input.newline_anchor = preg->newline_anchor;
7049 
7050  err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
7051  if (BE (err != REG_NOERROR, 0))
7052  goto free_return;
7053 
7054  /* We will log all the DFA states through which the dfa pass,
7055  if nmatch > 1, or this dfa has "multibyte node", which is a
7056  back-reference or a node which can accept multibyte character or
7057  multi character collating element. */
7058  if (nmatch > 1 || dfa->has_mb_node)
7059  {
7060  mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
7061  if (BE (mctx.state_log == NULL, 0))
7062  {
7063  err = REG_ESPACE;
7064  goto free_return;
7065  }
7066  }
7067  else
7068  mctx.state_log = NULL;
7069 
7070  match_first = start;
7071  mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
7072  : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
7073 
7074  /* Check incrementally whether of not the input string match. */
7075  incr = (range < 0) ? -1 : 1;
7076  left_lim = (range < 0) ? start + range : start;
7077  right_lim = (range < 0) ? start : start + range;
7078  sb = dfa->mb_cur_max == 1;
7079  match_kind =
7080  (fastmap
7081  ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
7082  | (range >= 0 ? 2 : 0)
7083  | (t != NULL ? 1 : 0))
7084  : 8);
7085 
7086  for (;; match_first += incr)
7087  {
7088  err = REG_NOMATCH;
7089  if (match_first < left_lim || right_lim < match_first)
7090  goto free_return;
7091 
7092  /* Advance as rapidly as possible through the string, until we
7093  find a plausible place to start matching. This may be done
7094  with varying efficiency, so there are various possibilities:
7095  only the most common of them are specialized, in order to
7096  save on code size. We use a switch statement for speed. */
7097  switch (match_kind)
7098  {
7099  case 8:
7100  /* No fastmap. */
7101  break;
7102 
7103  case 7:
7104  /* Fastmap with single-byte translation, match forward. */
7105  while (BE (match_first < right_lim, 1)
7106  && !fastmap[t[(unsigned char) string[match_first]]])
7107  ++match_first;
7108  goto forward_match_found_start_or_reached_end;
7109 
7110  case 6:
7111  /* Fastmap without translation, match forward. */
7112  while (BE (match_first < right_lim, 1)
7113  && !fastmap[(unsigned char) string[match_first]])
7114  ++match_first;
7115 
7116  forward_match_found_start_or_reached_end:
7117  if (BE (match_first == right_lim, 0))
7118  {
7119  ch = match_first >= length
7120  ? 0 : (unsigned char) string[match_first];
7121  if (!fastmap[t ? t[ch] : ch])
7122  goto free_return;
7123  }
7124  break;
7125 
7126  case 4:
7127  case 5:
7128  /* Fastmap without multi-byte translation, match backwards. */
7129  while (match_first >= left_lim)
7130  {
7131  ch = match_first >= length
7132  ? 0 : (unsigned char) string[match_first];
7133  if (fastmap[t ? t[ch] : ch])
7134  break;
7135  --match_first;
7136  }
7137  if (match_first < left_lim)
7138  goto free_return;
7139  break;
7140 
7141  default:
7142  /* In this case, we can't determine easily the current byte,
7143  since it might be a component byte of a multibyte
7144  character. Then we use the constructed buffer instead. */
7145  for (;;)
7146  {
7147  /* If MATCH_FIRST is out of the valid range, reconstruct the
7148  buffers. */
7149  unsigned int offset = match_first - mctx.input.raw_mbs_idx;
7150  if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
7151  {
7152  err = re_string_reconstruct (&mctx.input, match_first,
7153  eflags);
7154  if (BE (err != REG_NOERROR, 0))
7155  goto free_return;
7156 
7157  offset = match_first - mctx.input.raw_mbs_idx;
7158  }
7159  /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
7160  Note that MATCH_FIRST must not be smaller than 0. */
7161  ch = (match_first >= length
7162  ? 0 : re_string_byte_at (&mctx.input, offset));
7163  if (fastmap[ch])
7164  break;
7165  match_first += incr;
7166  if (match_first < left_lim || match_first > right_lim)
7167  {
7168  err = REG_NOMATCH;
7169  goto free_return;
7170  }
7171  }
7172  break;
7173  }
7174 
7175  /* Reconstruct the buffers so that the matcher can assume that
7176  the matching starts from the beginning of the buffer. */
7177  err = re_string_reconstruct (&mctx.input, match_first, eflags);
7178  if (BE (err != REG_NOERROR, 0))
7179  goto free_return;
7180 
7181 #ifdef RE_ENABLE_I18N
7182  /* Don't consider this char as a possible match start if it part,
7183  yet isn't the head, of a multibyte character. */
7184  if (!sb && !re_string_first_byte (&mctx.input, 0))
7185  continue;
7186 #endif
7187 
7188  /* It seems to be appropriate one, then use the matcher. */
7189  /* We assume that the matching starts from 0. */
7190  mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
7191  match_last = check_matching (&mctx, fl_longest_match,
7192  range >= 0 ? &match_first : NULL);
7193  if (match_last != -1)
7194  {
7195  if (BE (match_last == -2, 0))
7196  {
7197  err = REG_ESPACE;
7198  goto free_return;
7199  }
7200  else
7201  {
7202  mctx.match_last = match_last;
7203  if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
7204  {
7205  re_dfastate_t *pstate = mctx.state_log[match_last];
7206  mctx.last_node = check_halt_state_context (&mctx, pstate,
7207  match_last);
7208  }
7209  if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
7210  || dfa->nbackref)
7211  {
7212  err = prune_impossible_nodes (&mctx);
7213  if (err == REG_NOERROR)
7214  break;
7215  if (BE (err != REG_NOMATCH, 0))
7216  goto free_return;
7217  match_last = -1;
7218  }
7219  else
7220  break; /* We found a match. */
7221  }
7222  }
7223 
7224  match_ctx_clean (&mctx);
7225  }
7226 
7227 #ifdef DEBUG
7228  assert (match_last != -1);
7229  assert (err == REG_NOERROR);
7230 #endif
7231 
7232  /* Set pmatch[] if we need. */
7233  if (nmatch > 0)
7234  {
7235  int reg_idx;
7236 
7237  /* Initialize registers. */
7238  for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
7239  pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
7240 
7241  /* Set the points where matching start/end. */
7242  pmatch[0].rm_so = 0;
7243  pmatch[0].rm_eo = mctx.match_last;
7244 
7245  if (!preg->no_sub && nmatch > 1)
7246  {
7247  err = set_regs (preg, &mctx, nmatch, pmatch,
7248  dfa->has_plural_match && dfa->nbackref > 0);
7249  if (BE (err != REG_NOERROR, 0))
7250  goto free_return;
7251  }
7252 
7253  /* At last, add the offset to the each registers, since we slided
7254  the buffers so that we could assume that the matching starts
7255  from 0. */
7256  for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
7257  if (pmatch[reg_idx].rm_so != -1)
7258  {
7259 #ifdef RE_ENABLE_I18N
7260  if (BE (mctx.input.offsets_needed != 0, 0))
7261  {
7262  pmatch[reg_idx].rm_so =
7263  (pmatch[reg_idx].rm_so == mctx.input.valid_len
7264  ? mctx.input.valid_raw_len
7265  : mctx.input.offsets[pmatch[reg_idx].rm_so]);
7266  pmatch[reg_idx].rm_eo =
7267  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
7268  ? mctx.input.valid_raw_len
7269  : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
7270  }
7271 #else
7272  assert (mctx.input.offsets_needed == 0);
7273 #endif
7274  pmatch[reg_idx].rm_so += match_first;
7275  pmatch[reg_idx].rm_eo += match_first;
7276  }
7277  for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
7278  {
7279  pmatch[nmatch + reg_idx].rm_so = -1;
7280  pmatch[nmatch + reg_idx].rm_eo = -1;
7281  }
7282 
7283  if (dfa->subexp_map)
7284  for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
7285  if (dfa->subexp_map[reg_idx] != reg_idx)
7286  {
7287  pmatch[reg_idx + 1].rm_so
7288  = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
7289  pmatch[reg_idx + 1].rm_eo
7290  = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
7291  }
7292  }
7293 
7294  free_return:
7295  re_free (mctx.state_log);
7296  if (dfa->nbackref)
7297  match_ctx_free (&mctx);
7298  re_string_destruct (&mctx.input);
7299  return err;
7300 }
7301 
7302 static reg_errcode_t
7303 prune_impossible_nodes (mctx)
7304  re_match_context_t *mctx;
7305 {
7306  const re_dfa_t *const dfa = mctx->dfa;
7307  int halt_node, match_last;
7309  re_dfastate_t **sifted_states;
7310  re_dfastate_t **lim_states = NULL;
7311  re_sift_context_t sctx;
7312 #ifdef DEBUG
7313  assert (mctx->state_log != NULL);
7314 #endif
7315  match_last = mctx->match_last;
7316  halt_node = mctx->last_node;
7317  sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
7318  if (BE (sifted_states == NULL, 0))
7319  {
7320  ret = REG_ESPACE;
7321  goto free_return;
7322  }
7323  if (dfa->nbackref)
7324  {
7325  lim_states = re_malloc (re_dfastate_t *, match_last + 1);
7326  if (BE (lim_states == NULL, 0))
7327  {
7328  ret = REG_ESPACE;
7329  goto free_return;
7330  }
7331  while (1)
7332  {
7333  memset (lim_states, '\0',
7334  sizeof (re_dfastate_t *) * (match_last + 1));
7335  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
7336  match_last);
7337  ret = sift_states_backward (mctx, &sctx);
7338  re_node_set_free (&sctx.limits);
7339  if (BE (ret != REG_NOERROR, 0))
7340  goto free_return;
7341  if (sifted_states[0] != NULL || lim_states[0] != NULL)
7342  break;
7343  do
7344  {
7345  --match_last;
7346  if (match_last < 0)
7347  {
7348  ret = REG_NOMATCH;
7349  goto free_return;
7350  }
7351  } while (mctx->state_log[match_last] == NULL
7352  || !mctx->state_log[match_last]->halt);
7353  halt_node = check_halt_state_context (mctx,
7354  mctx->state_log[match_last],
7355  match_last);
7356  }
7357  ret = merge_state_array (dfa, sifted_states, lim_states,
7358  match_last + 1);
7359  re_free (lim_states);
7360  lim_states = NULL;
7361  if (BE (ret != REG_NOERROR, 0))
7362  goto free_return;
7363  }
7364  else
7365  {
7366  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
7367  ret = sift_states_backward (mctx, &sctx);
7368  re_node_set_free (&sctx.limits);
7369  if (BE (ret != REG_NOERROR, 0))
7370  goto free_return;
7371  }
7372  re_free (mctx->state_log);
7373  mctx->state_log = sifted_states;
7374  sifted_states = NULL;
7375  mctx->last_node = halt_node;
7376  mctx->match_last = match_last;
7377  ret = REG_NOERROR;
7378  free_return:
7379  re_free (sifted_states);
7380  re_free (lim_states);
7381  return ret;
7382 }
7383 
7384 /* Acquire an initial state and return it.
7385  We must select appropriate initial state depending on the context,
7386  since initial states may have constraints like "<", "^", etc.. */
7387 
7388 static inline re_dfastate_t *
7389 __attribute ((always_inline)) internal_function
7390 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
7391  int idx)
7392 {
7393  const re_dfa_t *const dfa = mctx->dfa;
7394  if (dfa->init_state->has_constraint)
7395  {
7396  unsigned int context;
7397  context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
7398  if (IS_WORD_CONTEXT (context))
7399  return dfa->init_state_word;
7400  else if (IS_ORDINARY_CONTEXT (context))
7401  return dfa->init_state;
7402  else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
7403  return dfa->init_state_begbuf;
7404  else if (IS_NEWLINE_CONTEXT (context))
7405  return dfa->init_state_nl;
7406  else if (IS_BEGBUF_CONTEXT (context))
7407  {
7408  /* It is relatively rare case, then calculate on demand. */
7409  return re_acquire_state_context (err, dfa,
7410  dfa->init_state->entrance_nodes,
7411  context);
7412  }
7413  else
7414  /* Must not happen? */
7415  return dfa->init_state;
7416  }
7417  else
7418  return dfa->init_state;
7419 }
7420 
7421 /* Check whether the regular expression match input string INPUT or not,
7422  and return the index where the matching end, return -1 if not match,
7423  or return -2 in case of an error.
7424  FL_LONGEST_MATCH means we want the POSIX longest matching.
7425  If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
7426  next place where we may want to try matching.
7427  Note that the matcher assume that the maching starts from the current
7428  index of the buffer. */
7429 
7430 static int
7431 internal_function
7432 check_matching (re_match_context_t *mctx, int fl_longest_match,
7433  int *p_match_first)
7434 {
7435  const re_dfa_t *const dfa = mctx->dfa;
7436  reg_errcode_t err;
7437  int match = 0;
7438  int match_last = -1;
7439  int cur_str_idx = re_string_cur_idx (&mctx->input);
7440  re_dfastate_t *cur_state;
7441  int at_init_state = p_match_first != NULL;
7442  int next_start_idx = cur_str_idx;
7443 
7444  err = REG_NOERROR;
7445  cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
7446  /* An initial state must not be NULL (invalid). */
7447  if (BE (cur_state == NULL, 0))
7448  {
7449  assert (err == REG_ESPACE);
7450  return -2;
7451  }
7452 
7453  if (mctx->state_log != NULL)
7454  {
7455  mctx->state_log[cur_str_idx] = cur_state;
7456 
7457  /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
7458  later. E.g. Processing back references. */
7459  if (BE (dfa->nbackref, 0))
7460  {
7461  at_init_state = 0;
7462  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
7463  if (BE (err != REG_NOERROR, 0))
7464  return err;
7465 
7466  if (cur_state->has_backref)
7467  {
7468  err = transit_state_bkref (mctx, &cur_state->nodes);
7469  if (BE (err != REG_NOERROR, 0))
7470  return err;
7471  }
7472  }
7473  }
7474 
7475  /* If the RE accepts NULL string. */
7476  if (BE (cur_state->halt, 0))
7477  {
7478  if (!cur_state->has_constraint
7479  || check_halt_state_context (mctx, cur_state, cur_str_idx))
7480  {
7481  if (!fl_longest_match)
7482  return cur_str_idx;
7483  else
7484  {
7485  match_last = cur_str_idx;
7486  match = 1;
7487  }
7488  }
7489  }
7490 
7491  while (!re_string_eoi (&mctx->input))
7492  {
7493  re_dfastate_t *old_state = cur_state;
7494  int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
7495 
7496  if (BE (next_char_idx >= mctx->input.bufs_len, 0)
7497  || (BE (next_char_idx >= mctx->input.valid_len, 0)
7498  && mctx->input.valid_len < mctx->input.len))
7499  {
7500  err = extend_buffers (mctx);
7501  if (BE (err != REG_NOERROR, 0))
7502  {
7503  assert (err == REG_ESPACE);
7504  return -2;
7505  }
7506  }
7507 
7508  cur_state = transit_state (&err, mctx, cur_state);
7509  if (mctx->state_log != NULL)
7510  cur_state = merge_state_with_log (&err, mctx, cur_state);
7511 
7512  if (cur_state == NULL)
7513  {
7514  /* Reached the invalid state or an error. Try to recover a valid
7515  state using the state log, if available and if we have not
7516  already found a valid (even if not the longest) match. */
7517  if (BE (err != REG_NOERROR, 0))
7518  return -2;
7519 
7520  if (mctx->state_log == NULL
7521  || (match && !fl_longest_match)
7522  || (cur_state = find_recover_state (&err, mctx)) == NULL)
7523  break;
7524  }
7525 
7526  if (BE (at_init_state, 0))
7527  {
7528  if (old_state == cur_state)
7529  next_start_idx = next_char_idx;
7530  else
7531  at_init_state = 0;
7532  }
7533 
7534  if (cur_state->halt)
7535  {
7536  /* Reached a halt state.
7537  Check the halt state can satisfy the current context. */
7538  if (!cur_state->has_constraint
7539  || check_halt_state_context (mctx, cur_state,
7540  re_string_cur_idx (&mctx->input)))
7541  {
7542  /* We found an appropriate halt state. */
7543  match_last = re_string_cur_idx (&mctx->input);
7544  match = 1;
7545 
7546  /* We found a match, do not modify match_first below. */
7547  p_match_first = NULL;
7548  if (!fl_longest_match)
7549  break;
7550  }
7551  }
7552  }
7553 
7554  if (p_match_first)
7555  *p_match_first += next_start_idx;
7556 
7557  return match_last;
7558 }
7559 
7560 /* Check NODE match the current context. */
7561 
7562 static int
7563 internal_function
7564 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
7565 {
7566  re_token_type_t type = dfa->nodes[node].type;
7567  unsigned int constraint = dfa->nodes[node].constraint;
7568  if (type != END_OF_RE)
7569  return 0;
7570  if (!constraint)
7571  return 1;
7572  if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
7573  return 0;
7574  return 1;
7575 }
7576 
7577 /* Check the halt state STATE match the current context.
7578  Return 0 if not match, if the node, STATE has, is a halt node and
7579  match the context, return the node. */
7580 
7581 static int
7582 internal_function
7583 check_halt_state_context (const re_match_context_t *mctx,
7584  const re_dfastate_t *state, int idx)
7585 {
7586  int i;
7587  unsigned int context;
7588 #ifdef DEBUG
7589  assert (state->halt);
7590 #endif
7591  context = re_string_context_at (&mctx->input, idx, mctx->eflags);
7592  for (i = 0; i < state->nodes.nelem; ++i)
7593  if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
7594  return state->nodes.elems[i];
7595  return 0;
7596 }
7597 
7598 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
7599  corresponding to the DFA).
7600  Return the destination node, and update EPS_VIA_NODES, return -1 in case
7601  of errors. */
7602 
7603 static int
7604 internal_function
7605 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
7606  int *pidx, int node, re_node_set *eps_via_nodes,
7607  struct re_fail_stack_t *fs)
7608 {
7609  const re_dfa_t *const dfa = mctx->dfa;
7610  int i, err;
7611  if (IS_EPSILON_NODE (dfa->nodes[node].type))
7612  {
7613  re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
7614  re_node_set *edests = &dfa->edests[node];
7615  int dest_node;
7616  err = re_node_set_insert (eps_via_nodes, node);
7617  if (BE (err < 0, 0))
7618  return -2;
7619  /* Pick up a valid destination, or return -1 if none is found. */
7620  for (dest_node = -1, i = 0; i < edests->nelem; ++i)
7621  {
7622  int candidate = edests->elems[i];
7623  if (!re_node_set_contains (cur_nodes, candidate))
7624  continue;
7625  if (dest_node == -1)
7626  dest_node = candidate;
7627 
7628  else
7629  {
7630  /* In order to avoid infinite loop like "(a*)*", return the second
7631  epsilon-transition if the first was already considered. */
7632  if (re_node_set_contains (eps_via_nodes, dest_node))
7633  return candidate;
7634 
7635  /* Otherwise, push the second epsilon-transition on the fail stack. */
7636  else if (fs != NULL
7637  && push_fail_stack (fs, *pidx, candidate, nregs, regs,
7638  eps_via_nodes))
7639  return -2;
7640 
7641  /* We know we are going to exit. */
7642  break;
7643  }
7644  }
7645  return dest_node;
7646  }
7647  else
7648  {
7649  int naccepted = 0;
7650  re_token_type_t type = dfa->nodes[node].type;
7651 
7652 #ifdef RE_ENABLE_I18N
7653  if (dfa->nodes[node].accept_mb)
7654  naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
7655  else
7656 #endif /* RE_ENABLE_I18N */
7657  if (type == OP_BACK_REF)
7658  {
7659  int subexp_idx = dfa->nodes[node].opr.idx + 1;
7660  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
7661  if (fs != NULL)
7662  {
7663  if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
7664  return -1;
7665  else if (naccepted)
7666  {
7667  char *buf = (char *) re_string_get_buffer (&mctx->input);
7668  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
7669  naccepted) != 0)
7670  return -1;
7671  }
7672  }
7673 
7674  if (naccepted == 0)
7675  {
7676  int dest_node;
7677  err = re_node_set_insert (eps_via_nodes, node);
7678  if (BE (err < 0, 0))
7679  return -2;
7680  dest_node = dfa->edests[node].elems[0];
7681  if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
7682  dest_node))
7683  return dest_node;
7684  }
7685  }
7686 
7687  if (naccepted != 0
7688  || check_node_accept (mctx, dfa->nodes + node, *pidx))
7689  {
7690  int dest_node = dfa->nexts[node];
7691  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
7692  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
7693  || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
7694  dest_node)))
7695  return -1;
7696  re_node_set_empty (eps_via_nodes);
7697  return dest_node;
7698  }
7699  }
7700  return -1;
7701 }
7702 
7703 static reg_errcode_t
7704 internal_function
7705 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
7706  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
7707 {
7708  reg_errcode_t err;
7709  int num = fs->num++;
7710  if (fs->num == fs->alloc)
7711  {
7712  struct re_fail_stack_ent_t *new_array;
7713  new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
7714  * fs->alloc * 2));
7715  if (new_array == NULL)
7716  return REG_ESPACE;
7717  fs->alloc *= 2;
7718  fs->stack = new_array;
7719  }
7720  fs->stack[num].idx = str_idx;
7721  fs->stack[num].node = dest_node;
7722  fs->stack[num].regs = re_malloc (regmatch_t, nregs);
7723  if (fs->stack[num].regs == NULL)
7724  return REG_ESPACE;
7725  memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
7726  err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
7727  return err;
7728 }
7729 
7730 static int
7731 internal_function
7732 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
7733  regmatch_t *regs, re_node_set *eps_via_nodes)
7734 {
7735  int num = --fs->num;
7736  assert (num >= 0);
7737  *pidx = fs->stack[num].idx;
7738  memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
7739  re_node_set_free (eps_via_nodes);
7740  re_free (fs->stack[num].regs);
7741  *eps_via_nodes = fs->stack[num].eps_via_nodes;
7742  return fs->stack[num].node;
7743 }
7744 
7745 /* Set the positions where the subexpressions are starts/ends to registers
7746  PMATCH.
7747  Note: We assume that pmatch[0] is already set, and
7748  pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
7749 
7750 static reg_errcode_t
7751 internal_function
7752 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
7753  regmatch_t *pmatch, int fl_backtrack)
7754 {
7755  const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
7756  int idx, cur_node;
7757  re_node_set eps_via_nodes;
7758  struct re_fail_stack_t *fs;
7759  struct re_fail_stack_t fs_body = { 0, 2, NULL };
7760  regmatch_t *prev_idx_match;
7761  int prev_idx_match_malloced = 0;
7762 
7763 #ifdef DEBUG
7764  assert (nmatch > 1);
7765  assert (mctx->state_log != NULL);
7766 #endif
7767  if (fl_backtrack)
7768  {
7769  fs = &fs_body;
7770  fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
7771  if (fs->stack == NULL)
7772  return REG_ESPACE;
7773  }
7774  else
7775  fs = NULL;
7776 
7777  cur_node = dfa->init_node;
7778  re_node_set_init_empty (&eps_via_nodes);
7779 
7780  if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
7781  prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
7782  else
7783  {
7784  prev_idx_match = re_malloc (regmatch_t, nmatch);
7785  if (prev_idx_match == NULL)
7786  {
7787  free_fail_stack_return (fs);
7788  return REG_ESPACE;
7789  }
7790  prev_idx_match_malloced = 1;
7791  }
7792  memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
7793 
7794  for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
7795  {
7796  update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
7797 
7798  if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
7799  {
7800  int reg_idx;
7801  if (fs)
7802  {
7803  for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
7804  if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
7805  break;
7806  if (reg_idx == nmatch)
7807  {
7808  re_node_set_free (&eps_via_nodes);
7809  if (prev_idx_match_malloced)
7810  re_free (prev_idx_match);
7811  return free_fail_stack_return (fs);
7812  }
7813  cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
7814  &eps_via_nodes);
7815  }
7816  else
7817  {
7818  re_node_set_free (&eps_via_nodes);
7819  if (prev_idx_match_malloced)
7820  re_free (prev_idx_match);
7821  return REG_NOERROR;
7822  }
7823  }
7824 
7825  /* Proceed to next node. */
7826  cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
7827  &eps_via_nodes, fs);
7828 
7829  if (BE (cur_node < 0, 0))
7830  {
7831  if (BE (cur_node == -2, 0))
7832  {
7833  re_node_set_free (&eps_via_nodes);
7834  if (prev_idx_match_malloced)
7835  re_free (prev_idx_match);
7836  free_fail_stack_return (fs);
7837  return REG_ESPACE;
7838  }
7839  if (fs)
7840  cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
7841  &eps_via_nodes);
7842  else
7843  {
7844  re_node_set_free (&eps_via_nodes);
7845  if (prev_idx_match_malloced)
7846  re_free (prev_idx_match);
7847  return REG_NOMATCH;
7848  }
7849  }
7850  }
7851  re_node_set_free (&eps_via_nodes);
7852  if (prev_idx_match_malloced)
7853  re_free (prev_idx_match);
7854  return free_fail_stack_return (fs);
7855 }
7856 
7857 static reg_errcode_t
7858 internal_function
7859 free_fail_stack_return (struct re_fail_stack_t *fs)
7860 {
7861  if (fs)
7862  {
7863  int fs_idx;
7864  for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
7865  {
7866  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
7867  re_free (fs->stack[fs_idx].regs);
7868  }
7869  re_free (fs->stack);
7870  }
7871  return REG_NOERROR;
7872 }
7873 
7874 static void
7875 internal_function
7876 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
7877  regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
7878 {
7879  int type = dfa->nodes[cur_node].type;
7880  if (type == OP_OPEN_SUBEXP)
7881  {
7882  int reg_num = dfa->nodes[cur_node].opr.idx + 1;
7883 
7884  /* We are at the first node of this sub expression. */
7885  if (reg_num < nmatch)
7886  {
7887  pmatch[reg_num].rm_so = cur_idx;
7888  pmatch[reg_num].rm_eo = -1;
7889  }
7890  }
7891  else if (type == OP_CLOSE_SUBEXP)
7892  {
7893  int reg_num = dfa->nodes[cur_node].opr.idx + 1;
7894  if (reg_num < nmatch)
7895  {
7896  /* We are at the last node of this sub expression. */
7897  if (pmatch[reg_num].rm_so < cur_idx)
7898  {
7899  pmatch[reg_num].rm_eo = cur_idx;
7900  /* This is a non-empty match or we are not inside an optional
7901  subexpression. Accept this right away. */
7902  memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
7903  }
7904  else
7905  {
7906  if (dfa->nodes[cur_node].opt_subexp
7907  && prev_idx_match[reg_num].rm_so != -1)
7908  /* We transited through an empty match for an optional
7909  subexpression, like (a?)*, and this is not the subexp's
7910  first match. Copy back the old content of the registers
7911  so that matches of an inner subexpression are undone as
7912  well, like in ((a?))*. */
7913  memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
7914  else
7915  /* We completed a subexpression, but it may be part of
7916  an optional one, so do not update PREV_IDX_MATCH. */
7917  pmatch[reg_num].rm_eo = cur_idx;
7918  }
7919  }
7920  }
7921 }
7922 
7923 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
7924  and sift the nodes in each states according to the following rules.
7925  Updated state_log will be written to STATE_LOG.
7926 
7927  Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
7928  1. When STR_IDX == MATCH_LAST(the last index in the state_log):
7929  If `a' isn't the LAST_NODE and `a' can't epsilon transit to
7930  the LAST_NODE, we throw away the node `a'.
7931  2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
7932  string `s' and transit to `b':
7933  i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
7934  away the node `a'.
7935  ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
7936  thrown away, we throw away the node `a'.
7937  3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
7938  i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
7939  node `a'.
7940  ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
7941  we throw away the node `a'. */
7942 
7943 #define STATE_NODE_CONTAINS(state,node) \
7944  ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
7945 
7946 static reg_errcode_t
7947 internal_function
7948 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
7949 {
7950  reg_errcode_t err;
7951  int null_cnt = 0;
7952  int str_idx = sctx->last_str_idx;
7953  re_node_set cur_dest;
7954 
7955 #ifdef DEBUG
7956  assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
7957 #endif
7958 
7959  /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
7960  transit to the last_node and the last_node itself. */
7961  err = re_node_set_init_1 (&cur_dest, sctx->last_node);
7962  if (BE (err != REG_NOERROR, 0))
7963  return err;
7964  err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
7965  if (BE (err != REG_NOERROR, 0))
7966  goto free_return;
7967 
7968  /* Then check each states in the state_log. */
7969  while (str_idx > 0)
7970  {
7971  /* Update counters. */
7972  null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
7973  if (null_cnt > mctx->max_mb_elem_len)
7974  {
7975  memset (sctx->sifted_states, '\0',
7976  sizeof (re_dfastate_t *) * str_idx);
7977  re_node_set_free (&cur_dest);
7978  return REG_NOERROR;
7979  }
7980  re_node_set_empty (&cur_dest);
7981  --str_idx;
7982 
7983  if (mctx->state_log[str_idx])
7984  {
7985  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
7986  if (BE (err != REG_NOERROR, 0))
7987  goto free_return;
7988  }
7989 
7990  /* Add all the nodes which satisfy the following conditions:
7991  - It can epsilon transit to a node in CUR_DEST.
7992  - It is in CUR_SRC.
7993  And update state_log. */
7994  err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
7995  if (BE (err != REG_NOERROR, 0))
7996  goto free_return;
7997  }
7998  err = REG_NOERROR;
7999  free_return:
8000  re_node_set_free (&cur_dest);
8001  return err;
8002 }
8003 
8004 static reg_errcode_t
8005 internal_function
8006 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
8007  int str_idx, re_node_set *cur_dest)
8008 {
8009  const re_dfa_t *const dfa = mctx->dfa;
8010  const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
8011  int i;
8012 
8013  /* Then build the next sifted state.
8014  We build the next sifted state on `cur_dest', and update
8015  `sifted_states[str_idx]' with `cur_dest'.
8016  Note:
8017  `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
8018  `cur_src' points the node_set of the old `state_log[str_idx]'
8019  (with the epsilon nodes pre-filtered out). */
8020  for (i = 0; i < cur_src->nelem; i++)
8021  {
8022  int prev_node = cur_src->elems[i];
8023  int naccepted = 0;
8024  int ret;
8025 
8026 #ifdef DEBUG
8027  re_token_type_t type = dfa->nodes[prev_node].type;
8028  assert (!IS_EPSILON_NODE (type));
8029 #endif
8030 #ifdef RE_ENABLE_I18N
8031  /* If the node may accept `multi byte'. */
8032  if (dfa->nodes[prev_node].accept_mb)
8033  naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
8034  str_idx, sctx->last_str_idx);
8035 #endif /* RE_ENABLE_I18N */
8036 
8037  /* We don't check backreferences here.
8038  See update_cur_sifted_state(). */
8039  if (!naccepted
8040  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
8041  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
8042  dfa->nexts[prev_node]))
8043  naccepted = 1;
8044 
8045  if (naccepted == 0)
8046  continue;
8047 
8048  if (sctx->limits.nelem)
8049  {
8050  int to_idx = str_idx + naccepted;
8051  if (check_dst_limits (mctx, &sctx->limits,
8052  dfa->nexts[prev_node], to_idx,
8053  prev_node, str_idx))
8054  continue;
8055  }
8056  ret = re_node_set_insert (cur_dest, prev_node);
8057  if (BE (ret == -1, 0))
8058  return REG_ESPACE;
8059  }
8060 
8061  return REG_NOERROR;
8062 }
8063 
8064 /* Helper functions. */
8065 
8066 static reg_errcode_t
8067 internal_function
8068 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
8069 {
8070  int top = mctx->state_log_top;
8071 
8072  if (next_state_log_idx >= mctx->input.bufs_len
8073  || (next_state_log_idx >= mctx->input.valid_len
8074  && mctx->input.valid_len < mctx->input.len))
8075  {
8076  reg_errcode_t err;
8077  err = extend_buffers (mctx);
8078  if (BE (err != REG_NOERROR, 0))
8079  return err;
8080  }
8081 
8082  if (top < next_state_log_idx)
8083  {
8084  memset (mctx->state_log + top + 1, '\0',
8085  sizeof (re_dfastate_t *) * (next_state_log_idx - top));
8086  mctx->state_log_top = next_state_log_idx;
8087  }
8088  return REG_NOERROR;
8089 }
8090 
8091 static reg_errcode_t
8092 internal_function
8093 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
8094  re_dfastate_t **src, int num)
8095 {
8096  int st_idx;
8097  reg_errcode_t err;
8098  for (st_idx = 0; st_idx < num; ++st_idx)
8099  {
8100  if (dst[st_idx] == NULL)
8101  dst[st_idx] = src[st_idx];
8102  else if (src[st_idx] != NULL)
8103  {
8104  re_node_set merged_set;
8105  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
8106  &src[st_idx]->nodes);
8107  if (BE (err != REG_NOERROR, 0))
8108  return err;
8109  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
8110  re_node_set_free (&merged_set);
8111  if (BE (err != REG_NOERROR, 0))
8112  return err;
8113  }
8114  }
8115  return REG_NOERROR;
8116 }
8117 
8118 static reg_errcode_t
8119 internal_function
8120 update_cur_sifted_state (const re_match_context_t *mctx,
8121  re_sift_context_t *sctx, int str_idx,
8122  re_node_set *dest_nodes)
8123 {
8124  const re_dfa_t *const dfa = mctx->dfa;
8125  reg_errcode_t err = REG_NOERROR;
8126  const re_node_set *candidates;
8127  candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
8128  : &mctx->state_log[str_idx]->nodes);
8129 
8130  if (dest_nodes->nelem == 0)
8131  sctx->sifted_states[str_idx] = NULL;
8132  else
8133  {
8134  if (candidates)
8135  {
8136  /* At first, add the nodes which can epsilon transit to a node in
8137  DEST_NODE. */
8138  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
8139  if (BE (err != REG_NOERROR, 0))
8140  return err;
8141 
8142  /* Then, check the limitations in the current sift_context. */
8143  if (sctx->limits.nelem)
8144  {
8145  err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
8146  mctx->bkref_ents, str_idx);
8147  if (BE (err != REG_NOERROR, 0))
8148  return err;
8149  }
8150  }
8151 
8152  sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
8153  if (BE (err != REG_NOERROR, 0))
8154  return err;
8155  }
8156 
8157  if (candidates && mctx->state_log[str_idx]->has_backref)
8158  {
8159  err = sift_states_bkref (mctx, sctx, str_idx, candidates);
8160  if (BE (err != REG_NOERROR, 0))
8161  return err;
8162  }
8163  return REG_NOERROR;
8164 }
8165 
8166 static reg_errcode_t
8167 internal_function
8168 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
8169  const re_node_set *candidates)
8170 {
8171  reg_errcode_t err = REG_NOERROR;
8172  int i;
8173 
8174  re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
8175  if (BE (err != REG_NOERROR, 0))
8176  return err;
8177 
8178  if (!state->inveclosure.alloc)
8179  {
8180  err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
8181  if (BE (err != REG_NOERROR, 0))
8182  return REG_ESPACE;
8183  for (i = 0; i < dest_nodes->nelem; i++)
8184  re_node_set_merge (&state->inveclosure,
8185  dfa->inveclosures + dest_nodes->elems[i]);
8186  }
8187  return re_node_set_add_intersect (dest_nodes, candidates,
8188  &state->inveclosure);
8189 }
8190 
8191 static reg_errcode_t
8192 internal_function
8193 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
8194  const re_node_set *candidates)
8195 {
8196  int ecl_idx;
8197  reg_errcode_t err;
8198  re_node_set *inv_eclosure = dfa->inveclosures + node;
8199  re_node_set except_nodes;
8200  re_node_set_init_empty (&except_nodes);
8201  for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
8202  {
8203  int cur_node = inv_eclosure->elems[ecl_idx];
8204  if (cur_node == node)
8205  continue;
8206  if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
8207  {
8208  int edst1 = dfa->edests[cur_node].elems[0];
8209  int edst2 = ((dfa->edests[cur_node].nelem > 1)
8210  ? dfa->edests[cur_node].elems[1] : -1);
8211  if ((!re_node_set_contains (inv_eclosure, edst1)
8212  && re_node_set_contains (dest_nodes, edst1))
8213  || (edst2 > 0
8214  && !re_node_set_contains (inv_eclosure, edst2)
8215  && re_node_set_contains (dest_nodes, edst2)))
8216  {
8217  err = re_node_set_add_intersect (&except_nodes, candidates,
8218  dfa->inveclosures + cur_node);
8219  if (BE (err != REG_NOERROR, 0))
8220  {
8221  re_node_set_free (&except_nodes);
8222  return err;
8223  }
8224  }
8225  }
8226  }
8227  for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
8228  {
8229  int cur_node = inv_eclosure->elems[ecl_idx];
8230  if (!re_node_set_contains (&except_nodes, cur_node))
8231  {
8232  int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
8233  re_node_set_remove_at (dest_nodes, idx);
8234  }
8235  }
8236  re_node_set_free (&except_nodes);
8237  return REG_NOERROR;
8238 }
8239 
8240 static int
8241 internal_function
8242 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
8243  int dst_node, int dst_idx, int src_node, int src_idx)
8244 {
8245  const re_dfa_t *const dfa = mctx->dfa;
8246  int lim_idx, src_pos, dst_pos;
8247 
8248  int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
8249  int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
8250  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
8251  {
8252  int subexp_idx;
8253  struct re_backref_cache_entry *ent;
8254  ent = mctx->bkref_ents + limits->elems[lim_idx];
8255  subexp_idx = dfa->nodes[ent->node].opr.idx;
8256 
8257  dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
8258  subexp_idx, dst_node, dst_idx,
8259  dst_bkref_idx);
8260  src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
8261  subexp_idx, src_node, src_idx,
8262  src_bkref_idx);
8263 
8264  /* In case of:
8265  <src> <dst> ( <subexp> )
8266  ( <subexp> ) <src> <dst>
8267  ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
8268  if (src_pos == dst_pos)
8269  continue; /* This is unrelated limitation. */
8270  else
8271  return 1;
8272  }
8273  return 0;
8274 }
8275 
/* Position helper used when the string index lies on a boundary of the
   limiting subexpression.

   BOUNDARIES is a bit mask: bit value 1 is tested for the OP_OPEN_SUBEXP
   case and bit value 2 for the OP_CLOSE_SUBEXP case.  The epsilon closure
   of FROM_NODE is scanned:
     - an OP_OPEN_SUBEXP node for SUBEXP_IDX with bit 1 set yields -1;
     - an OP_CLOSE_SUBEXP node for SUBEXP_IDX with bit 2 set yields 0;
     - an OP_BACK_REF node is followed through its first epsilon
       destination, once per matching cache entry starting at BKREF_IDX
       (skipped entirely when BKREF_IDX is -1).
   When nothing decisive is found the result is 1 if bit 2 is set and 0
   otherwise.

   Side effect: for SUBEXP_IDX below BITSET_WORD_BITS, the corresponding
   bit of a cache entry's eps_reachable_subexps_map is cleared after a
   recursion through that entry returned no decisive answer, so the entry
   is skipped on later calls.  */
8276 static int
8277 internal_function
8278 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
8279  int subexp_idx, int from_node, int bkref_idx)
8280 {
8281  const re_dfa_t *const dfa = mctx->dfa;
8282  const re_node_set *eclosures = dfa->eclosures + from_node;
8283  int node_idx;
8284 
8285  /* Else, we are on the boundary: examine the nodes on the epsilon
8286  closure. */
8287  for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
8288  {
8289  int node = eclosures->elems[node_idx];
8290  switch (dfa->nodes[node].type)
8291  {
8292  case OP_BACK_REF:
8293  if (bkref_idx != -1)
8294  {
8295  struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
  /* Note: each `continue' below jumps to the do-while condition, so the
     entry pointer is still advanced and the `more' flag still tested.  */
8296  do
8297  {
8298  int dst, cpos;
8299 
  /* Only entries recorded for this back-reference node matter.  */
8300  if (ent->node != node)
8301  continue;
8302 
  /* Skip entries already known not to reach SUBEXP_IDX.  */
8303  if (subexp_idx < BITSET_WORD_BITS
8304  && !(ent->eps_reachable_subexps_map
8305  & ((bitset_word_t) 1 << subexp_idx)))
8306  continue;
8307 
8308  /* Recurse trying to reach the OP_OPEN_SUBEXP and
8309  OP_CLOSE_SUBEXP cases below. But, if the
8310  destination node is the same node as the source
8311  node, don't recurse because it would cause an
8312  infinite loop: a regex that exhibits this behavior
8313  is ()\1*\1* */
8314  dst = dfa->edests[node].elems[0];
8315  if (dst == from_node)
8316  {
8317  if (boundaries & 1)
8318  return -1;
8319  else /* if (boundaries & 2) */
8320  return 0;
8321  }
8322 
8323  cpos =
8324  check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
8325  dst, bkref_idx);
8326  if (cpos == -1 /* && (boundaries & 1) */)
8327  return -1;
8328  if (cpos == 0 && (boundaries & 2))
8329  return 0;
8330 
  /* Nothing decisive was reached through this entry: remember that by
     clearing its bit for SUBEXP_IDX.  */
8331  if (subexp_idx < BITSET_WORD_BITS)
8332  ent->eps_reachable_subexps_map
8333  &= ~((bitset_word_t) 1 << subexp_idx);
8334  }
8335  while (ent++->more);
8336  }
8337  break;
8338 
8339  case OP_OPEN_SUBEXP:
8340  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
8341  return -1;
8342  break;
8343 
8344  case OP_CLOSE_SUBEXP:
8345  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
8346  return 0;
8347  break;
8348 
8349  default:
8350  break;
8351  }
8352  }
8353 
  /* No boundary node found in the closure.  */
8354  return (boundaries & 2) ? 1 : 0;
8355 }
8356 
8357 static int
8358 internal_function
8359 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
8360  int subexp_idx, int from_node, int str_idx,
8361  int bkref_idx)
8362 {
8363  struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
8364  int boundaries;
8365 
8366  /* If we are outside the range of the subexpression, return -1 or 1. */
8367  if (str_idx < lim->subexp_from)
8368  return -1;
8369 
8370  if (lim->subexp_to < str_idx)
8371  return 1;
8372 
8373  /* If we are within the subexpression, return 0. */
8374  boundaries = (str_idx == lim->subexp_from);
8375  boundaries |= (str_idx == lim->subexp_to) << 1;
8376  if (boundaries == 0)
8377  return 0;
8378 
8379  /* Else, examine epsilon closure. */
8380  return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
8381  from_node, bkref_idx);
8382 }
8383 
8384 /* Check the limitations of sub expressions LIMITS, and remove the nodes
8385  which are against limitations from DEST_NODES.

   LIMITS holds indices into BKREF_ENTS.  An entry is ignored when STR_IDX
   is not greater than its subexp_from or is greater than its str_idx.
   For the remaining entries:
     - if STR_IDX equals the entry's subexp_to, the open/close nodes of the
       limited subexpression found in DEST_NODES are located; the epsilon
       sources of the open node are removed, and, when a close node is
       present, so are the epsilon sources of every node that has the
       close node neither in its epsilon closure nor in its inverse
       epsilon closure;
     - otherwise the epsilon sources of every open/close node of the
       limited subexpression are removed.
   Removal is done by sub_epsilon_src_nodes(), with CANDIDATES passed
   through to it.

   Returns REG_NOERROR or the first error from sub_epsilon_src_nodes().  */
8386 
8387 static reg_errcode_t
8388 internal_function
8389 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
8390  const re_node_set *candidates, re_node_set *limits,
8391  struct re_backref_cache_entry *bkref_ents, int str_idx)
8392 {
8393  reg_errcode_t err;
8394  int node_idx, lim_idx;
8395 
8396  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
8397  {
8398  int subexp_idx;
8399  struct re_backref_cache_entry *ent;
8400  ent = bkref_ents + limits->elems[lim_idx];
8401 
8402  if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
8403  continue; /* This is unrelated limitation. */
8404 
8405  subexp_idx = dfa->nodes[ent->node].opr.idx;
8406  if (ent->subexp_to == str_idx)
8407  {
8408  int ops_node = -1;
8409  int cls_node = -1;
  /* Locate the open and close nodes of the limited subexpression; when
     several match, the last one found in DEST_NODES is kept.  */
8410  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8411  {
8412  int node = dest_nodes->elems[node_idx];
8413  re_token_type_t type = dfa->nodes[node].type;
8414  if (type == OP_OPEN_SUBEXP
8415  && subexp_idx == dfa->nodes[node].opr.idx)
8416  ops_node = node;
8417  else if (type == OP_CLOSE_SUBEXP
8418  && subexp_idx == dfa->nodes[node].opr.idx)
8419  cls_node = node;
8420  }
8421 
8422  /* Check the limitation of the open subexpression. */
8423  /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
8424  if (ops_node >= 0)
8425  {
8426  err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
8427  candidates);
8428  if (BE (err != REG_NOERROR, 0))
8429  return err;
8430  }
8431 
8432  /* Check the limitation of the close subexpression. */
8433  if (cls_node >= 0)
8434  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8435  {
8436  int node = dest_nodes->elems[node_idx];
8437  if (!re_node_set_contains (dfa->inveclosures + node,
8438  cls_node)
8439  && !re_node_set_contains (dfa->eclosures + node,
8440  cls_node))
8441  {
8442  /* It is against this limitation.
8443  Remove it from the current sifted state. */
8444  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
8445  candidates);
8446  if (BE (err != REG_NOERROR, 0))
8447  return err;
  /* DEST_NODES was modified by the removal: step back so that the same
     index is examined again on the next iteration.  */
8448  --node_idx;
8449  }
8450  }
8451  }
8452  else /* (ent->subexp_to != str_idx) */
8453  {
8454  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
8455  {
8456  int node = dest_nodes->elems[node_idx];
8457  re_token_type_t type = dfa->nodes[node].type;
8458  if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
8459  {
8460  if (subexp_idx != dfa->nodes[node].opr.idx)
8461  continue;
8462  /* It is against this limitation.
8463  Remove it from the current sifted state. */
8464  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
8465  candidates);
8466  if (BE (err != REG_NOERROR, 0))
8467  return err;
  /* NOTE(review): unlike the close-subexpression loop above, the index is
     not stepped back after this removal -- confirm this is intended.  */
8468  }
8469  }
8470  }
8471  }
8472  return REG_NOERROR;
8473 }
8474 
/* Sift the states reachable through back references at STR_IDX.

   For every OP_BACK_REF node in CANDIDATES (except the node/position pair
   recorded in SCTX as last_node/last_str_idx), the back-reference cache
   entries starting at the index returned by search_cur_bkref_entry() for
   STR_IDX are walked.  For each entry of that node whose destination
   position is within range, whose destination node is present in the
   sifted state there and which passes check_dst_limits(), the backward
   sifting is repeated on a local copy of SCTX with the entry's cache index
   added to the limits.  When SCTX->limited_states is non-NULL the result
   is merged into it for positions 0..STR_IDX.

   The local context is initialised lazily on first use, and only its
   private copy of the limits set is released on exit.

   Returns REG_NOERROR, REG_ESPACE, or the first error from a callee.  */
8475 static reg_errcode_t
8476 internal_function
8477 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
8478  int str_idx, const re_node_set *candidates)
8479 {
8480  const re_dfa_t *const dfa = mctx->dfa;
8481  reg_errcode_t err;
8482  int node_idx, node;
8483  re_sift_context_t local_sctx;
8484  int first_idx = search_cur_bkref_entry (mctx, str_idx);
8485 
  /* No cache entry for this position: nothing to sift.  */
8486  if (first_idx == -1)
8487  return REG_NOERROR;
8488 
8489  local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
8490 
8491  for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
8492  {
8493  int enabled_idx;
8494  re_token_type_t type;
8495  struct re_backref_cache_entry *entry;
8496  node = candidates->elems[node_idx];
8497  type = dfa->nodes[node].type;
8498  /* Avoid infinite loop for the REs like "()\1+". */
8499  if (node == sctx->last_node && str_idx == sctx->last_str_idx)
8500  continue;
8501  if (type != OP_BACK_REF)
8502  continue;
8503 
8504  entry = mctx->bkref_ents + first_idx;
8505  enabled_idx = first_idx;
  /* Each `continue' below jumps to the do-while condition, which advances
     both ENABLED_IDX and ENTRY and tests the previous entry's `more'.  */
8506  do
8507  {
8508  int subexp_len;
8509  int to_idx;
8510  int dst_node;
8511  int ret;
8512  re_dfastate_t *cur_state;
8513 
8514  if (entry->node != node)
8515  continue;
8516  subexp_len = entry->subexp_to - entry->subexp_from;
8517  to_idx = str_idx + subexp_len;
  /* A zero-length match continues at the first epsilon destination of the
     node; otherwise at the node recorded in dfa->nexts.  */
8518  dst_node = (subexp_len ? dfa->nexts[node]
8519  : dfa->edests[node].elems[0]);
8520 
8521  if (to_idx > sctx->last_str_idx
8522  || sctx->sifted_states[to_idx] == NULL
8523  || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
8524  || check_dst_limits (mctx, &sctx->limits, node,
8525  str_idx, dst_node, to_idx))
8526  continue;
8527 
  /* First use: copy the context and give it its own limits set.  */
8528  if (local_sctx.sifted_states == NULL)
8529  {
8530  local_sctx = *sctx;
8531  err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
8532  if (BE (err != REG_NOERROR, 0))
8533  goto free_return;
8534  }
8535  local_sctx.last_node = node;
8536  local_sctx.last_str_idx = str_idx;
8537  ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
8538  if (BE (ret < 0, 0))
8539  {
8540  err = REG_ESPACE;
8541  goto free_return;
8542  }
  /* Save the state at STR_IDX; it is restored after the nested sift.  */
8543  cur_state = local_sctx.sifted_states[str_idx];
8544  err = sift_states_backward (mctx, &local_sctx);
8545  if (BE (err != REG_NOERROR, 0))
8546  goto free_return;
8547  if (sctx->limited_states != NULL)
8548  {
8549  err = merge_state_array (dfa, sctx->limited_states,
8550  local_sctx.sifted_states,
8551  str_idx + 1);
8552  if (BE (err != REG_NOERROR, 0))
8553  goto free_return;
8554  }
8555  local_sctx.sifted_states[str_idx] = cur_state;
8556  re_node_set_remove (&local_sctx.limits, enabled_idx);
8557 
8558  /* mctx->bkref_ents may have changed, reload the pointer. */
8559  entry = mctx->bkref_ents + enabled_idx;
8560  }
8561  while (enabled_idx++, entry++->more);
8562  }
8563  err = REG_NOERROR;
8564  free_return:
  /* Only the private limits copy belongs to the local context.  */
8565  if (local_sctx.sifted_states != NULL)
8566  {
8567  re_node_set_free (&local_sctx.limits);
8568  }
8569 
8570  return err;
8571 }
8572 
8573 
8574 #ifdef RE_ENABLE_I18N
8575 static int
8576 internal_function
8577 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
8578  int node_idx, int str_idx, int max_str_idx)
8579 {
8580  const re_dfa_t *const dfa = mctx->dfa;
8581  int naccepted;
8582  /* Check the node can accept `multi byte'. */
8583  naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
8584  if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
8585  !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
8586  dfa->nexts[node_idx]))
8587  /* The node can't accept the `multi byte', or the
8588  destination was already thrown away, then the node
8589  could't accept the current input `multi byte'. */
8590  naccepted = 0;
8591  /* Otherwise, it is sure that the node could accept
8592  `naccepted' bytes input. */
8593  return naccepted;
8594 }
8595 #endif /* RE_ENABLE_I18N */
8596 
8597 
8598 /* Functions for state transition. */
8599 
8600 /* Return the next state to which the current state STATE will transit by
8601  accepting the current input byte, and update STATE_LOG if necessary.
8602  If STATE can accept a multibyte char/collating element/back reference
8603  update the destination of STATE_LOG. */
8604 
8605 static re_dfastate_t *
8606 internal_function
8607 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
8608  re_dfastate_t *state)
8609 {
8610  re_dfastate_t **trtable;
8611  unsigned char ch;
8612 
8613 #ifdef RE_ENABLE_I18N
8614  /* If the current state can accept multibyte. */
8615  if (BE (state->accept_mb, 0))
8616  {
8617  *err = transit_state_mb (mctx, state);
8618  if (BE (*err != REG_NOERROR, 0))
8619  return NULL;
8620  }
8621 #endif /* RE_ENABLE_I18N */
8622 
8623  /* Then decide the next state with the single byte. */
8624 #if 0
8625  if (0)
8626  /* don't use transition table */
8627  return transit_state_sb (err, mctx, state);
8628 #endif
8629 
8630  /* Use transition table */
8631  ch = re_string_fetch_byte (&mctx->input);
8632  for (;;)
8633  {
8634  trtable = state->trtable;
8635  if (BE (trtable != NULL, 1))
8636  return trtable[ch];
8637 
8638  trtable = state->word_trtable;
8639  if (BE (trtable != NULL, 1))
8640  {
8641  unsigned int context;
8642  context
8643  = re_string_context_at (&mctx->input,
8644  re_string_cur_idx (&mctx->input) - 1,
8645  mctx->eflags);
8646  if (IS_WORD_CONTEXT (context))
8647  return trtable[ch + SBC_MAX];
8648  else
8649  return trtable[ch];
8650  }
8651 
8652  if (!build_trtable (mctx->dfa, state))
8653  {
8654  *err = REG_ESPACE;
8655  return NULL;
8656  }
8657 
8658  /* Retry, we now have a transition table. */
8659  }
8660 }
8661 
8662 /* Update the state_log if we need */
8663 re_dfastate_t *
8664 internal_function
8665 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
8666  re_dfastate_t *next_state)
8667 {
8668  const re_dfa_t *const dfa = mctx->dfa;
8669  int cur_idx = re_string_cur_idx (&mctx->input);
8670 
8671  if (cur_idx > mctx->state_log_top)
8672  {
8673  mctx->state_log[cur_idx] = next_state;
8674  mctx->state_log_top = cur_idx;
8675  }
8676  else if (mctx->state_log[cur_idx] == 0)
8677  {
8678  mctx->state_log[cur_idx] = next_state;
8679  }
8680  else
8681  {
8682  re_dfastate_t *pstate;
8683  unsigned int context;
8684  re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
8685  /* If (state_log[cur_idx] != 0), it implies that cur_idx is
8686  the destination of a multibyte char/collating element/
8687  back reference. Then the next state is the union set of
8688  these destinations and the results of the transition table. */
8689  pstate = mctx->state_log[cur_idx];
8690  log_nodes = pstate->entrance_nodes;
8691  if (next_state != NULL)
8692  {
8693  table_nodes = next_state->entrance_nodes;
8694  *err = re_node_set_init_union (&next_nodes, table_nodes,
8695  log_nodes);
8696  if (BE (*err != REG_NOERROR, 0))
8697  return NULL;
8698  }
8699  else
8700  next_nodes = *log_nodes;
8701  /* Note: We already add the nodes of the initial state,
8702  then we don't need to add them here. */
8703 
8704  context = re_string_context_at (&mctx->input,
8705  re_string_cur_idx (&mctx->input) - 1,
8706  mctx->eflags);
8707  next_state = mctx->state_log[cur_idx]
8708  = re_acquire_state_context (err, dfa, &next_nodes, context);
8709  /* We don't need to check errors here, since the return value of
8710  this function is next_state and ERR is already set. */
8711 
8712  if (table_nodes != NULL)
8713  re_node_set_free (&next_nodes);
8714  }
8715 
8716  if (BE (dfa->nbackref, 0) && next_state != NULL)
8717  {
8718  /* Check OP_OPEN_SUBEXP in the current state in case that we use them
8719  later. We must check them here, since the back references in the
8720  next state might use them. */
8721  *err = check_subexp_matching_top (mctx, &next_state->nodes,
8722  cur_idx);
8723  if (BE (*err != REG_NOERROR, 0))
8724  return NULL;
8725 
8726  /* If the next state has back references. */
8727  if (next_state->has_backref)
8728  {
8729  *err = transit_state_bkref (mctx, &next_state->nodes);
8730  if (BE (*err != REG_NOERROR, 0))
8731  return NULL;
8732  next_state = mctx->state_log[cur_idx];
8733  }
8734  }
8735 
8736  return next_state;
8737 }
8738 
8739 /* Skip bytes in the input that correspond to part of a
8740  multi-byte match, then look in the log for a state
8741  from which to restart matching. */
8742 re_dfastate_t *
8743 internal_function
8744 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
8745 {
8746  re_dfastate_t *cur_state;
8747  do
8748  {
8749  int max = mctx->state_log_top;
8750  int cur_str_idx = re_string_cur_idx (&mctx->input);
8751 
8752  do
8753  {
8754  if (++cur_str_idx > max)
8755  return NULL;
8756  re_string_skip_bytes (&mctx->input, 1);
8757  }
8758  while (mctx->state_log[cur_str_idx] == NULL);
8759 
8760  cur_state = merge_state_with_log (err, mctx, NULL);
8761  }
8762  while (*err == REG_NOERROR && cur_state == NULL);
8763  return cur_state;
8764 }
8765 
8766 /* Helper functions for transit_state. */
8767 
8768 /* From the node set CUR_NODES, pick up the nodes whose types are
8769  OP_OPEN_SUBEXP and which have corresponding back references in the regular
8770  expression. And register them to use them later for evaluating the
8771  correspoding back references. */
8772 
8773 static reg_errcode_t
8774 internal_function
8775 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
8776  int str_idx)
8777 {
8778  const re_dfa_t *const dfa = mctx->dfa;
8779  int node_idx;
8780  reg_errcode_t err;
8781 
8782  /* TODO: This isn't efficient.
8783  Because there might be more than one nodes whose types are
8784  OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
8785  nodes.
8786  E.g. RE: (a){2} */
8787  for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
8788  {
8789  int node = cur_nodes->elems[node_idx];
8790  if (dfa->nodes[node].type == OP_OPEN_SUBEXP
8791  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
8792  && (dfa->used_bkref_map
8793  & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
8794  {
8795  err = match_ctx_add_subtop (mctx, node, str_idx);
8796  if (BE (err != REG_NOERROR, 0))
8797  return err;
8798  }
8799  }
8800  return REG_NOERROR;
8801 }
8802 
8803 #if 0
8804 /* Return the next state to which the current state STATE will transit by
8805  accepting the current input byte. */
8806 
8807 static re_dfastate_t *
8808 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
8809  re_dfastate_t *state)
8810 {
8811  const re_dfa_t *const dfa = mctx->dfa;
8812  re_node_set next_nodes;
8813  re_dfastate_t *next_state;
8814  int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
8815  unsigned int context;
8816 
8817  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
8818  if (BE (*err != REG_NOERROR, 0))
8819  return NULL;
8820  for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
8821  {
8822  int cur_node = state->nodes.elems[node_cnt];
8823  if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
8824  {
8825  *err = re_node_set_merge (&next_nodes,
8826  dfa->eclosures + dfa->nexts[cur_node]);
8827  if (BE (*err != REG_NOERROR, 0))
8828  {
8829  re_node_set_free (&next_nodes);
8830  return NULL;
8831  }
8832  }
8833  }
8834  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
8835  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
8836  /* We don't need to check errors here, since the return value of
8837  this function is next_state and ERR is already set. */
8838 
8839  re_node_set_free (&next_nodes);
8840  re_string_skip_bytes (&mctx->input, 1);
8841  return next_state;
8842 }
8843 #endif
8844 
8845 #ifdef RE_ENABLE_I18N
8846 static reg_errcode_t
8847 internal_function
8848 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
8849 {
8850  const re_dfa_t *const dfa = mctx->dfa;
8851  reg_errcode_t err;
8852  int i;
8853 
8854  for (i = 0; i < pstate->nodes.nelem; ++i)
8855  {
8856  re_node_set dest_nodes, *new_nodes;
8857  int cur_node_idx = pstate->nodes.elems[i];
8858  int naccepted, dest_idx;
8859  unsigned int context;
8860  re_dfastate_t *dest_state;
8861 
8862  if (!dfa->nodes[cur_node_idx].accept_mb)
8863  continue;
8864 
8865  if (dfa->nodes[cur_node_idx].constraint)
8866  {
8867  context = re_string_context_at (&mctx->input,
8868  re_string_cur_idx (&mctx->input),
8869  mctx->eflags);
8870  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
8871  context))
8872  continue;
8873  }
8874 
8875  /* How many bytes the node can accept? */
8876  naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
8877  re_string_cur_idx (&mctx->input));
8878  if (naccepted == 0)
8879  continue;
8880 
8881  /* The node can accepts `naccepted' bytes. */
8882  dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
8883  mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
8884  : mctx->max_mb_elem_len);
8885  err = clean_state_log_if_needed (mctx, dest_idx);
8886  if (BE (err != REG_NOERROR, 0))
8887  return err;
8888 #ifdef DEBUG
8889  assert (dfa->nexts[cur_node_idx] != -1);
8890 #endif
8891  new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
8892 
8893  dest_state = mctx->state_log[dest_idx];
8894  if (dest_state == NULL)
8895  dest_nodes = *new_nodes;
8896  else
8897  {
8898  err = re_node_set_init_union (&dest_nodes,
8899  dest_state->entrance_nodes, new_nodes);
8900  if (BE (err != REG_NOERROR, 0))
8901  return err;
8902  }
8903  context = re_string_context_at (&mctx->input, dest_idx - 1,
8904  mctx->eflags);
8905  mctx->state_log[dest_idx]
8906  = re_acquire_state_context (&err, dfa, &dest_nodes, context);
8907  if (dest_state != NULL)
8908  re_node_set_free (&dest_nodes);
8909  if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
8910  return err;
8911  }
8912  return REG_NOERROR;
8913 }
8914 #endif /* RE_ENABLE_I18N */
8915 
/* Perform the back-reference transitions for the node set NODES at the
   current input index.

   For every OP_BACK_REF node of NODES whose constraint (if any) is
   satisfied, get_subexp() is called to register cache entries.  Each entry
   added by that call for this node and position is then used to merge the
   epsilon closure of the back reference's destination into the state log
   at the position after the referenced substring.  For a zero-length
   match the destination is the node's first epsilon destination and, if
   the state at the current index has grown, the function recurses on the
   new nodes.

   Returns REG_NOERROR or the first error reported by a callee.  */
8916 static reg_errcode_t
8917 internal_function
8918 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
8919 {
8920  const re_dfa_t *const dfa = mctx->dfa;
8921  reg_errcode_t err;
8922  int i;
8923  int cur_str_idx = re_string_cur_idx (&mctx->input);
8924 
8925  for (i = 0; i < nodes->nelem; ++i)
8926  {
8927  int dest_str_idx, prev_nelem, bkc_idx;
8928  int node_idx = nodes->elems[i];
8929  unsigned int context;
8930  const re_token_t *node = dfa->nodes + node_idx;
8931  re_node_set *new_dest_nodes;
8932 
8933  /* Check whether `node' is a backreference or not. */
8934  if (node->type != OP_BACK_REF)
8935  continue;
8936 
8937  if (node->constraint)
8938  {
8939  context = re_string_context_at (&mctx->input, cur_str_idx,
8940  mctx->eflags);
8941  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
8942  continue;
8943  }
8944 
8945  /* `node' is a backreference.
8946  Check the substring which the substring matched. */
  /* Remember the cache size first, so that the loop below starts at the
     entries appended by get_subexp().  */
8947  bkc_idx = mctx->nbkref_ents;
8948  err = get_subexp (mctx, node_idx, cur_str_idx);
8949  if (BE (err != REG_NOERROR, 0))
8950  goto free_return;
8951 
8952  /* And add the epsilon closures (which is `new_dest_nodes') of
8953  the backreference to appropriate state_log. */
8954 #ifdef DEBUG
8955  assert (dfa->nexts[node_idx] != -1);
8956 #endif
8957  for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
8958  {
8959  int subexp_len;
8960  re_dfastate_t *dest_state;
8961  struct re_backref_cache_entry *bkref_ent;
8962  bkref_ent = mctx->bkref_ents + bkc_idx;
8963  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
8964  continue;
8965  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
8966  new_dest_nodes = (subexp_len == 0
8967  ? dfa->eclosures + dfa->edests[node_idx].elems[0]
8968  : dfa->eclosures + dfa->nexts[node_idx]);
8969  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
8970  - bkref_ent->subexp_from);
8971  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
8972  mctx->eflags);
8973  dest_state = mctx->state_log[dest_str_idx];
  /* Node count of the state at the current index before the update; it is
     compared afterwards to detect growth.  */
8974  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
8975  : mctx->state_log[cur_str_idx]->nodes.nelem);
8976  /* Add `new_dest_node' to state_log. */
8977  if (dest_state == NULL)
8978  {
8979  mctx->state_log[dest_str_idx]
8980  = re_acquire_state_context (&err, dfa, new_dest_nodes,
8981  context);
8982  if (BE (mctx->state_log[dest_str_idx] == NULL
8983  && err != REG_NOERROR, 0))
8984  goto free_return;
8985  }
8986  else
8987  {
8988  re_node_set dest_nodes;
8989  err = re_node_set_init_union (&dest_nodes,
8990  dest_state->entrance_nodes,
8991  new_dest_nodes);
8992  if (BE (err != REG_NOERROR, 0))
8993  {
8994  re_node_set_free (&dest_nodes);
8995  goto free_return;
8996  }
8997  mctx->state_log[dest_str_idx]
8998  = re_acquire_state_context (&err, dfa, &dest_nodes, context);
8999  re_node_set_free (&dest_nodes);
9000  if (BE (mctx->state_log[dest_str_idx] == NULL
9001  && err != REG_NOERROR, 0))
9002  goto free_return;
9003  }
9004  /* We need to check recursively if the backreference can epsilon
9005  transit. */
  /* NOTE(review): state_log[cur_str_idx] is dereferenced here without the
     NULL guard used for prev_nelem above -- confirm it cannot be NULL when
     subexp_len is 0.  */
9006  if (subexp_len == 0
9007  && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
9008  {
9009  err = check_subexp_matching_top (mctx, new_dest_nodes,
9010  cur_str_idx);
9011  if (BE (err != REG_NOERROR, 0))
9012  goto free_return;
9013  err = transit_state_bkref (mctx, new_dest_nodes);
9014  if (BE (err != REG_NOERROR, 0))
9015  goto free_return;
9016  }
9017  }
9018  }
9019  err = REG_NOERROR;
9020  free_return:
9021  return err;
9022 }
9023 
9024 /* Enumerate all the candidates which the backreference BKREF_NODE can match
9025  at BKREF_STR_IDX, and register them by match_ctx_add_entry().
9026  Note that we might collect inappropriate candidates here.
9027  However, the cost of checking them strictly here is too high, then we
9028  delay these checking for prune_impossible_nodes(). */
9029 
9030 static reg_errcode_t
9031 internal_function
9032 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
9033 {
9034  const re_dfa_t *const dfa = mctx->dfa;
9035  int subexp_num, sub_top_idx;
9036  const char *buf = (const char *) re_string_get_buffer (&mctx->input);
9037  /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
9038  int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
9039  if (cache_idx != -1)
9040  {
9041  const struct re_backref_cache_entry *entry
9042  = mctx->bkref_ents + cache_idx;
9043  do
9044  if (entry->node == bkref_node)
9045  return REG_NOERROR; /* We already checked it. */
9046  while (entry++->more);
9047  }
9048 
9049  subexp_num = dfa->nodes[bkref_node].opr.idx;
9050 
9051  /* For each sub expression */
9052  for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
9053  {
9054  reg_errcode_t err;
9055  re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
9056  re_sub_match_last_t *sub_last;
9057  int sub_last_idx, sl_str, bkref_str_off;
9058 
9059  if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
9060  continue; /* It isn't related. */
9061 
9062  sl_str = sub_top->str_idx;
9063  bkref_str_off = bkref_str_idx;
9064  /* At first, check the last node of sub expressions we already
9065  evaluated. */
9066  for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
9067  {
9068  int sl_str_diff;
9069  sub_last = sub_top->lasts[sub_last_idx];
9070  sl_str_diff = sub_last->str_idx - sl_str;
9071  /* The matched string by the sub expression match with the substring
9072  at the back reference? */
9073  if (sl_str_diff > 0)
9074  {
9075  if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
9076  {
9077  /* Not enough chars for a successful match. */
9078  if (bkref_str_off + sl_str_diff > mctx->input.len)
9079  break;
9080 
9081  err = clean_state_log_if_needed (mctx,
9082  bkref_str_off
9083  + sl_str_diff);
9084  if (BE (err != REG_NOERROR, 0))
9085  return err;
9086  buf = (const char *) re_string_get_buffer (&mctx->input);
9087  }
9088  if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
9089  /* We don't need to search this sub expression any more. */
9090  break;
9091  }
9092  bkref_str_off += sl_str_diff;
9093  sl_str += sl_str_diff;
9094  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
9095  bkref_str_idx);
9096 
9097  /* Reload buf, since the preceding call might have reallocated
9098  the buffer. */
9099  buf = (const char *) re_string_get_buffer (&mctx->input);
9100 
9101  if (err == REG_NOMATCH)
9102  continue;
9103  if (BE (err != REG_NOERROR, 0))
9104  return err;
9105  }
9106 
9107  if (sub_last_idx < sub_top->nlasts)
9108  continue;
9109  if (sub_last_idx > 0)
9110  ++sl_str;
9111  /* Then, search for the other last nodes of the sub expression. */
9112  for (; sl_str <= bkref_str_idx; ++sl_str)
9113  {
9114  int cls_node, sl_str_off;
9115  const re_node_set *nodes;
9116  sl_str_off = sl_str - sub_top->str_idx;
9117  /* The matched string by the sub expression match with the substring
9118  at the back reference? */
9119  if (sl_str_off > 0)
9120  {
9121  if (BE (bkref_str_off >= mctx->input.valid_len, 0))
9122  {
9123  /* If we are at the end of the input, we cannot match. */
9124  if (bkref_str_off >= mctx->input.len)
9125  break;
9126 
9127  err = extend_buffers (mctx);
9128  if (BE (err != REG_NOERROR, 0))
9129  return err;
9130 
9131  buf = (const char *) re_string_get_buffer (&mctx->input);
9132  }
9133  if (buf [bkref_str_off++] != buf[sl_str - 1])
9134  break; /* We don't need to search this sub expression
9135  any more. */
9136  }
9137  if (mctx->state_log[sl_str] == NULL)
9138  continue;
9139  /* Does this state have a ')' of the sub expression? */
9140  nodes = &mctx->state_log[sl_str]->nodes;
9141  cls_node = find_subexp_node (dfa, nodes, subexp_num,
9142  OP_CLOSE_SUBEXP);
9143  if (cls_node == -1)
9144  continue; /* No. */
9145  if (sub_top->path == NULL)
9146  {
9147  sub_top->path = calloc (sizeof (state_array_t),
9148  sl_str - sub_top->str_idx + 1);
9149  if (sub_top->path == NULL)
9150  return REG_ESPACE;
9151  }
9152  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
9153  in the current context? */
9154  err = check_arrival (mctx, sub_top->path, sub_top->node,
9155  sub_top->str_idx, cls_node, sl_str,
9156  OP_CLOSE_SUBEXP);
9157  if (err == REG_NOMATCH)
9158  continue;
9159  if (BE (err != REG_NOERROR, 0))
9160  return err;
9161  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
9162  if (BE (sub_last == NULL, 0))
9163  return REG_ESPACE;
9164  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
9165  bkref_str_idx);
9166  if (err == REG_NOMATCH)
9167  continue;
9168  }
9169  }
9170  return REG_NOERROR;
9171 }
9172 
9173 /* Helper functions for get_subexp(). */
9174 
9175 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
9176  If it can arrive, register the sub expression expressed with SUB_TOP
9177  and SUB_LAST. */
9178 
9179 static reg_errcode_t
9180 internal_function
9181 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
9182  re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
9183 {
9184  reg_errcode_t err;
9185  int to_idx;
9186  /* Can the subexpression arrive the back reference? */
9187  err = check_arrival (mctx, &sub_last->path, sub_last->node,
9188  sub_last->str_idx, bkref_node, bkref_str,
9189  OP_OPEN_SUBEXP);
9190  if (err != REG_NOERROR)
9191  return err;
9192  err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
9193  sub_last->str_idx);
9194  if (BE (err != REG_NOERROR, 0))
9195  return err;
9196  to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
9197  return clean_state_log_if_needed (mctx, to_idx);
9198 }
9199 
9200 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
9201  Search '(' if FL_OPEN, or search ')' otherwise.
9202  TODO: This function isn't efficient...
9203  Because there might be more than one nodes whose types are
9204  OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
9205  nodes.
9206  E.g. RE: (a){2} */
9207 
9208 static int
9209 internal_function
9210 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
9211  int subexp_idx, int type)
9212 {
9213  int cls_idx;
9214  for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
9215  {
9216  int cls_node = nodes->elems[cls_idx];
9217  const re_token_t *node = dfa->nodes + cls_node;
9218  if (node->type == type
9219  && node->opr.idx == subexp_idx)
9220  return cls_node;
9221  }
9222  return -1;
9223 }
9224 
9225 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
9226  LAST_NODE at LAST_STR. We record the path onto PATH since it will be
9227  heavily reused.
9228  Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
9229 
9230 static reg_errcode_t
9231 internal_function
9232 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
9233  int top_str, int last_node, int last_str, int type)
9234 {
9235  const re_dfa_t *const dfa = mctx->dfa;
9236  reg_errcode_t err = REG_NOERROR;
9237  int subexp_num, backup_cur_idx, str_idx, null_cnt;
9238  re_dfastate_t *cur_state = NULL;
9239  re_node_set *cur_nodes, next_nodes;
9240  re_dfastate_t **backup_state_log;
9241  unsigned int context;
9242 
9243  subexp_num = dfa->nodes[top_node].opr.idx;
9244  /* Extend the buffer if we need. */
9245  if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
9246  {
9247  re_dfastate_t **new_array;
9248  int old_alloc = path->alloc;
9249  path->alloc += last_str + mctx->max_mb_elem_len + 1;
9250  new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
9251  if (BE (new_array == NULL, 0))
9252  {
9253  path->alloc = old_alloc;
9254  return REG_ESPACE;
9255  }
9256  path->array = new_array;
9257  memset (new_array + old_alloc, '\0',
9258  sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
9259  }
9260 
9261  str_idx = path->next_idx ? path->next_idx : top_str;
9262 
9263  /* Temporary modify MCTX. */
9264  backup_state_log = mctx->state_log;
9265  backup_cur_idx = mctx->input.cur_idx;
9266  mctx->state_log = path->array;
9267  mctx->input.cur_idx = str_idx;
9268 
9269  /* Setup initial node set. */
9270  context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
9271  if (str_idx == top_str)
9272  {
9273  err = re_node_set_init_1 (&next_nodes, top_node);
9274  if (BE (err != REG_NOERROR, 0))
9275  return err;
9276  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
9277  if (BE (err != REG_NOERROR, 0))
9278  {
9279  re_node_set_free (&next_nodes);
9280  return err;
9281  }
9282  }
9283  else
9284  {
9285  cur_state = mctx->state_log[str_idx];
9286  if (cur_state && cur_state->has_backref)
9287  {
9288  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
9289  if (BE (err != REG_NOERROR, 0))
9290  return err;
9291  }
9292  else
9293  re_node_set_init_empty (&next_nodes);
9294  }
9295  if (str_idx == top_str || (cur_state && cur_state->has_backref))
9296  {
9297  if (next_nodes.nelem)
9298  {
9299  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
9300  subexp_num, type);
9301  if (BE (err != REG_NOERROR, 0))
9302  {
9303  re_node_set_free (&next_nodes);
9304  return err;
9305  }
9306  }
9307  cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
9308  if (BE (cur_state == NULL && err != REG_NOERROR, 0))
9309  {
9310  re_node_set_free (&next_nodes);
9311  return err;
9312  }
9313  mctx->state_log[str_idx] = cur_state;
9314  }
9315 
9316  for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
9317  {
9318  re_node_set_empty (&next_nodes);
9319  if (mctx->state_log[str_idx + 1])
9320  {
9321  err = re_node_set_merge (&next_nodes,
9322  &mctx->state_log[str_idx + 1]->nodes);
9323  if (BE (err != REG_NOERROR, 0))
9324  {
9325  re_node_set_free (&next_nodes);
9326  return err;
9327  }
9328  }
9329  if (cur_state)
9330  {
9331  err = check_arrival_add_next_nodes (mctx, str_idx,
9332  &cur_state->non_eps_nodes,
9333  &next_nodes);
9334  if (BE (err != REG_NOERROR, 0))
9335  {
9336  re_node_set_free (&next_nodes);
9337  return err;
9338  }
9339  }
9340  ++str_idx;
9341  if (next_nodes.nelem)
9342  {
9343  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
9344  if (BE (err != REG_NOERROR, 0))
9345  {
9346  re_node_set_free (&next_nodes);
9347  return err;
9348  }
9349  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
9350  subexp_num, type);
9351  if (BE (err != REG_NOERROR, 0))
9352  {
9353  re_node_set_free (&next_nodes);
9354  return err;
9355  }
9356  }
9357  context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
9358  cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
9359  if (BE (cur_state == NULL && err != REG_NOERROR, 0))
9360  {
9361  re_node_set_free (&next_nodes);
9362  return err;
9363  }
9364  mctx->state_log[str_idx] = cur_state;
9365  null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
9366  }
9367  re_node_set_free (&next_nodes);
9368  cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
9369  : &mctx->state_log[last_str]->nodes);
9370  path->next_idx = str_idx;
9371 
9372  /* Fix MCTX. */
9373  mctx->state_log = backup_state_log;
9374  mctx->input.cur_idx = backup_cur_idx;
9375 
9376  /* Then check the current node set has the node LAST_NODE. */
9377  if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
9378  return REG_NOERROR;
9379 
9380  return REG_NOMATCH;
9381 }
9382 
9383 /* Helper functions for check_arrival. */
9384 
9385 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
9386  to NEXT_NODES.
9387  TODO: This function is similar to the functions transit_state*(),
9388  however this function has many additional works.
9389  Can't we unify them? */
9390 
9391 static reg_errcode_t
9392 internal_function
9393 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
9394  re_node_set *cur_nodes, re_node_set *next_nodes)
9395 {
9396  const re_dfa_t *const dfa = mctx->dfa;
9397  int result;
9398  int cur_idx;
9399  reg_errcode_t err = REG_NOERROR;
9400  re_node_set union_set;
9401  re_node_set_init_empty (&union_set);
9402  for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
9403  {
9404  int naccepted = 0;
9405  int cur_node = cur_nodes->elems[cur_idx];
9406 #ifdef DEBUG
9407  re_token_type_t type = dfa->nodes[cur_node].type;
9408  assert (!IS_EPSILON_NODE (type));
9409 #endif
9410 #ifdef RE_ENABLE_I18N
9411  /* If the node may accept `multi byte'. */
9412  if (dfa->nodes[cur_node].accept_mb)
9413  {
9414  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
9415  str_idx);
9416  if (naccepted > 1)
9417  {
9418  re_dfastate_t *dest_state;
9419  int next_node = dfa->nexts[cur_node];
9420  int next_idx = str_idx + naccepted;
9421  dest_state = mctx->state_log[next_idx];
9422  re_node_set_empty (&union_set);
9423  if (dest_state)
9424  {
9425  err = re_node_set_merge (&union_set, &dest_state->nodes);
9426  if (BE (err != REG_NOERROR, 0))
9427  {
9428  re_node_set_free (&union_set);
9429  return err;
9430  }
9431  }
9432  result = re_node_set_insert (&union_set, next_node);
9433  if (BE (result < 0, 0))
9434  {
9435  re_node_set_free (&union_set);
9436  return REG_ESPACE;
9437  }
9438  mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
9439  &union_set);
9440  if (BE (mctx->state_log[next_idx] == NULL
9441  && err != REG_NOERROR, 0))
9442  {
9443  re_node_set_free (&union_set);
9444  return err;
9445  }
9446  }
9447  }
9448 #endif /* RE_ENABLE_I18N */
9449  if (naccepted
9450  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
9451  {
9452  result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
9453  if (BE (result < 0, 0))
9454  {
9455  re_node_set_free (&union_set);
9456  return REG_ESPACE;
9457  }
9458  }
9459  }
9460  re_node_set_free (&union_set);
9461  return REG_NOERROR;
9462 }
9463 
9464 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
9465  CUR_NODES, however exclude the nodes which are:
9466  - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
9467  - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
9468 */
9469 
9470 static reg_errcode_t
9471 internal_function
9472 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
9473  int ex_subexp, int type)
9474 {
9475  reg_errcode_t err;
9476  int idx, outside_node;
9477  re_node_set new_nodes;
9478 #ifdef DEBUG
9479  assert (cur_nodes->nelem);
9480 #endif
9481  err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
9482  if (BE (err != REG_NOERROR, 0))
9483  return err;
9484  /* Create a new node set NEW_NODES with the nodes which are epsilon
9485  closures of the node in CUR_NODES. */
9486 
9487  for (idx = 0; idx < cur_nodes->nelem; ++idx)
9488  {
9489  int cur_node = cur_nodes->elems[idx];
9490  const re_node_set *eclosure = dfa->eclosures + cur_node;
9491  outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
9492  if (outside_node == -1)
9493  {
9494  /* There are no problematic nodes, just merge them. */
9495  err = re_node_set_merge (&new_nodes, eclosure);
9496  if (BE (err != REG_NOERROR, 0))
9497  {
9498  re_node_set_free (&new_nodes);
9499  return err;
9500  }
9501  }
9502  else
9503  {
9504  /* There are problematic nodes, re-calculate incrementally. */
9505  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
9506  ex_subexp, type);
9507  if (BE (err != REG_NOERROR, 0))
9508  {
9509  re_node_set_free (&new_nodes);
9510  return err;
9511  }
9512  }
9513  }
9514  re_node_set_free (cur_nodes);
9515  *cur_nodes = new_nodes;
9516  return REG_NOERROR;
9517 }
9518 
9519 /* Helper function for check_arrival_expand_ecl.
9520  Check incrementally the epsilon closure of TARGET, and if it isn't
9521  problematic append it to DST_NODES. */
9522 
9523 static reg_errcode_t
9524 internal_function
9525 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
9526  int target, int ex_subexp, int type)
9527 {
9528  int cur_node;
9529  for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
9530  {
9531  int err;
9532 
9533  if (dfa->nodes[cur_node].type == type
9534  && dfa->nodes[cur_node].opr.idx == ex_subexp)
9535  {
9536  if (type == OP_CLOSE_SUBEXP)
9537  {
9538  err = re_node_set_insert (dst_nodes, cur_node);
9539  if (BE (err == -1, 0))
9540  return REG_ESPACE;
9541  }
9542  break;
9543  }
9544  err = re_node_set_insert (dst_nodes, cur_node);
9545  if (BE (err == -1, 0))
9546  return REG_ESPACE;
9547  if (dfa->edests[cur_node].nelem == 0)
9548  break;
9549  if (dfa->edests[cur_node].nelem == 2)
9550  {
9551  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
9552  dfa->edests[cur_node].elems[1],
9553  ex_subexp, type);
9554  if (BE (err != REG_NOERROR, 0))
9555  return err;
9556  }
9557  cur_node = dfa->edests[cur_node].elems[0];
9558  }
9559  return REG_NOERROR;
9560 }
9561 
9562 
9563 /* For all the back references in the current state, calculate the
9564  destination of the back references by the appropriate entry
9565  in MCTX->BKREF_ENTS. */
9566 
9567 static reg_errcode_t
9568 internal_function
9569 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
9570  int cur_str, int subexp_num, int type)
9571 {
9572  const re_dfa_t *const dfa = mctx->dfa;
9573  reg_errcode_t err;
9574  int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
9575  struct re_backref_cache_entry *ent;
9576 
9577  if (cache_idx_start == -1)
9578  return REG_NOERROR;
9579 
9580  restart:
9581  ent = mctx->bkref_ents + cache_idx_start;
9582  do
9583  {
9584  int to_idx, next_node;
9585 
9586  /* Is this entry ENT is appropriate? */
9587  if (!re_node_set_contains (cur_nodes, ent->node))
9588  continue; /* No. */
9589 
9590  to_idx = cur_str + ent->subexp_to - ent->subexp_from;
9591  /* Calculate the destination of the back reference, and append it
9592  to MCTX->STATE_LOG. */
9593  if (to_idx == cur_str)
9594  {
9595  /* The backreference did epsilon transit, we must re-check all the
9596  node in the current state. */
9597  re_node_set new_dests;
9598  reg_errcode_t err2, err3;
9599  next_node = dfa->edests[ent->node].elems[0];
9600  if (re_node_set_contains (cur_nodes, next_node))
9601  continue;
9602  err = re_node_set_init_1 (&new_dests, next_node);
9603  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
9604  err3 = re_node_set_merge (cur_nodes, &new_dests);
9605  re_node_set_free (&new_dests);
9606  if (BE (err != REG_NOERROR || err2 != REG_NOERROR
9607  || err3 != REG_NOERROR, 0))
9608  {
9609  err = (err != REG_NOERROR ? err
9610  : (err2 != REG_NOERROR ? err2 : err3));
9611  return err;
9612  }
9613  /* TODO: It is still inefficient... */
9614  goto restart;
9615  }
9616  else
9617  {
9618  re_node_set union_set;
9619  next_node = dfa->nexts[ent->node];
9620  if (mctx->state_log[to_idx])
9621  {
9622  int ret;
9623  if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
9624  next_node))
9625  continue;
9626  err = re_node_set_init_copy (&union_set,
9627  &mctx->state_log[to_idx]->nodes);
9628  ret = re_node_set_insert (&union_set, next_node);
9629  if (BE (err != REG_NOERROR || ret < 0, 0))
9630  {
9631  re_node_set_free (&union_set);
9632  err = err != REG_NOERROR ? err : REG_ESPACE;
9633  return err;
9634  }
9635  }
9636  else
9637  {
9638  err = re_node_set_init_1 (&union_set, next_node);
9639  if (BE (err != REG_NOERROR, 0))
9640  return err;
9641  }
9642  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
9643  re_node_set_free (&union_set);
9644  if (BE (mctx->state_log[to_idx] == NULL
9645  && err != REG_NOERROR, 0))
9646  return err;
9647  }
9648  }
9649  while (ent++->more);
9650  return REG_NOERROR;
9651 }
9652 
9653 /* Build transition table for the state.
9654  Return 1 if succeeded, otherwise return NULL. */
9655 
9656 static int
9657 internal_function
9658 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
9659 {
9660  reg_errcode_t err;
9661  int i, j, ch, need_word_trtable = 0;
9662  bitset_word_t elem, mask;
9663  bool dests_node_malloced = false;
9664  bool dest_states_malloced = false;
9665  int ndests; /* Number of the destination states from `state'. */
9666  re_dfastate_t **trtable;
9667  re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
9668  re_node_set follows, *dests_node;
9669  bitset_t *dests_ch;
9670  bitset_t acceptable;
9671 
9672  struct dests_alloc
9673  {
9674  re_node_set dests_node[SBC_MAX];
9675  bitset_t dests_ch[SBC_MAX];
9676  } *dests_alloc;
9677 
9678  /* We build DFA states which corresponds to the destination nodes
9679  from `state'. `dests_node[i]' represents the nodes which i-th
9680  destination state contains, and `dests_ch[i]' represents the
9681  characters which i-th destination state accepts. */
9682  if (__libc_use_alloca (sizeof (struct dests_alloc)))
9683  dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
9684  else
9685  {
9686  dests_alloc = re_malloc (struct dests_alloc, 1);
9687  if (BE (dests_alloc == NULL, 0))
9688  return 0;
9689  dests_node_malloced = true;
9690  }
9691  dests_node = dests_alloc->dests_node;
9692  dests_ch = dests_alloc->dests_ch;
9693 
9694  /* Initialize transiton table. */
9695  state->word_trtable = state->trtable = NULL;
9696 
9697  /* At first, group all nodes belonging to `state' into several
9698  destinations. */
9699  ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
9700  if (BE (ndests <= 0, 0))
9701  {
9702  if (dests_node_malloced)
9703  free (dests_alloc);
9704  /* Return 0 in case of an error, 1 otherwise. */
9705  if (ndests == 0)
9706  {
9707  state->trtable = (re_dfastate_t **)
9708  calloc (sizeof (re_dfastate_t *), SBC_MAX);
9709  return 1;
9710  }
9711  return 0;
9712  }
9713 
9714  err = re_node_set_alloc (&follows, ndests + 1);
9715  if (BE (err != REG_NOERROR, 0))
9716  goto out_free;
9717 
9718  if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
9719  + ndests * 3 * sizeof (re_dfastate_t *)))
9720  dest_states = (re_dfastate_t **)
9721  alloca (ndests * 3 * sizeof (re_dfastate_t *));
9722  else
9723  {
9724  dest_states = (re_dfastate_t **)
9725  malloc (ndests * 3 * sizeof (re_dfastate_t *));
9726  if (BE (dest_states == NULL, 0))
9727  {
9728 out_free:
9729  if (dest_states_malloced)
9730  free (dest_states);
9731  re_node_set_free (&follows);
9732  for (i = 0; i < ndests; ++i)
9733  re_node_set_free (dests_node + i);
9734  if (dests_node_malloced)
9735  free (dests_alloc);
9736  return 0;
9737  }
9738  dest_states_malloced = true;
9739  }
9740  dest_states_word = dest_states + ndests;
9741  dest_states_nl = dest_states_word + ndests;
9742  bitset_empty (acceptable);
9743 
9744  /* Then build the states for all destinations. */
9745  for (i = 0; i < ndests; ++i)
9746  {
9747  int next_node;
9748  re_node_set_empty (&follows);
9749  /* Merge the follows of this destination states. */
9750  for (j = 0; j < dests_node[i].nelem; ++j)
9751  {
9752  next_node = dfa->nexts[dests_node[i].elems[j]];
9753  if (next_node != -1)
9754  {
9755  err = re_node_set_merge (&follows, dfa->eclosures + next_node);
9756  if (BE (err != REG_NOERROR, 0))
9757  goto out_free;
9758  }
9759  }
9760  dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
9761  if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
9762  goto out_free;
9763  /* If the new state has context constraint,
9764  build appropriate states for these contexts. */
9765  if (dest_states[i]->has_constraint)
9766  {
9767  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
9768  CONTEXT_WORD);
9769  if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
9770  goto out_free;
9771 
9772  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
9773  need_word_trtable = 1;
9774 
9775  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
9776  CONTEXT_NEWLINE);
9777  if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
9778  goto out_free;
9779  }
9780  else
9781  {
9782  dest_states_word[i] = dest_states[i];
9783  dest_states_nl[i] = dest_states[i];
9784  }
9785  bitset_merge (acceptable, dests_ch[i]);
9786  }
9787 
9788  if (!BE (need_word_trtable, 0))
9789  {
9790  /* We don't care about whether the following character is a word
9791  character, or we are in a single-byte character set so we can
9792  discern by looking at the character code: allocate a
9793  256-entry transition table. */
9794  trtable = state->trtable =
9795  (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
9796  if (BE (trtable == NULL, 0))
9797  goto out_free;
9798 
9799  /* For all characters ch...: */
9800  for (i = 0; i < BITSET_WORDS; ++i)
9801  for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
9802  elem;
9803  mask <<= 1, elem >>= 1, ++ch)
9804  if (BE (elem & 1, 0))
9805  {
9806  /* There must be exactly one destination which accepts
9807  character ch. See group_nodes_into_DFAstates. */
9808  for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
9809  ;
9810 
9811  /* j-th destination accepts the word character ch. */
9812  if (dfa->word_char[i] & mask)
9813  trtable[ch] = dest_states_word[j];
9814  else
9815  trtable[ch] = dest_states[j];
9816  }
9817  }
9818  else
9819  {
9820  /* We care about whether the following character is a word
9821  character, and we are in a multi-byte character set: discern
9822  by looking at the character code: build two 256-entry
9823  transition tables, one starting at trtable[0] and one
9824  starting at trtable[SBC_MAX]. */
9825  trtable = state->word_trtable =
9826  (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
9827  if (BE (trtable == NULL, 0))
9828  goto out_free;
9829 
9830  /* For all characters ch...: */
9831  for (i = 0; i < BITSET_WORDS; ++i)
9832  for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
9833  elem;
9834  mask <<= 1, elem >>= 1, ++ch)
9835  if (BE (elem & 1, 0))
9836  {
9837  /* There must be exactly one destination which accepts
9838  character ch. See group_nodes_into_DFAstates. */
9839  for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
9840  ;
9841 
9842  /* j-th destination accepts the word character ch. */
9843  trtable[ch] = dest_states[j];
9844  trtable[ch + SBC_MAX] = dest_states_word[j];
9845  }
9846  }
9847 
9848  /* new line */
9849  if (bitset_contain (acceptable, NEWLINE_CHAR))
9850  {
9851  /* The current state accepts newline character. */
9852  for (j = 0; j < ndests; ++j)
9853  if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
9854  {
9855  /* k-th destination accepts newline character. */
9856  trtable[NEWLINE_CHAR] = dest_states_nl[j];
9857  if (need_word_trtable)
9858  trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
9859  /* There must be only one destination which accepts
9860  newline. See group_nodes_into_DFAstates. */
9861  break;
9862  }
9863  }
9864 
9865  if (dest_states_malloced)
9866  free (dest_states);
9867 
9868  re_node_set_free (&follows);
9869  for (i = 0; i < ndests; ++i)
9870  re_node_set_free (dests_node + i);
9871 
9872  if (dests_node_malloced)
9873  free (dests_alloc);
9874 
9875  return 1;
9876 }
9877 
9878 /* Group all nodes belonging to STATE into several destinations.
9879  Then for all destinations, set the nodes belonging to the destination
9880  to DESTS_NODE[i] and set the characters accepted by the destination
9881  to DEST_CH[i]. This function return the number of destinations. */
9882 
9883 static int
9884 internal_function
9885 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
9886  re_node_set *dests_node, bitset_t *dests_ch)
9887 {
9888  reg_errcode_t err;
9889  int result;
9890  int i, j, k;
9891  int ndests; /* Number of the destinations from `state'. */
9892  bitset_t accepts; /* Characters a node can accept. */
9893  const re_node_set *cur_nodes = &state->nodes;
9894  bitset_empty (accepts);
9895  ndests = 0;
9896 
9897  /* For all the nodes belonging to `state', */
9898  for (i = 0; i < cur_nodes->nelem; ++i)
9899  {
9900  re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
9901  re_token_type_t type = node->type;
9902  unsigned int constraint = node->constraint;
9903 
9904  /* Enumerate all single byte character this node can accept. */
9905  if (type == CHARACTER)
9906  bitset_set (accepts, node->opr.c);
9907  else if (type == SIMPLE_BRACKET)
9908  {
9909  bitset_merge (accepts, node->opr.sbcset);
9910  }
9911  else if (type == OP_PERIOD)
9912  {
9913 #ifdef RE_ENABLE_I18N
9914  if (dfa->mb_cur_max > 1)
9915  bitset_merge (accepts, dfa->sb_char);
9916  else
9917 #endif
9918  bitset_set_all (accepts);
9919  if (!(dfa->syntax & RE_DOT_NEWLINE))
9920  bitset_clear (accepts, '\n');
9921  if (dfa->syntax & RE_DOT_NOT_NULL)
9922  bitset_clear (accepts, '\0');
9923  }
9924 #ifdef RE_ENABLE_I18N
9925  else if (type == OP_UTF8_PERIOD)
9926  {
9927  memset (accepts, '\xff', sizeof (bitset_t) / 2);
9928  if (!(dfa->syntax & RE_DOT_NEWLINE))
9929  bitset_clear (accepts, '\n');
9930  if (dfa->syntax & RE_DOT_NOT_NULL)
9931  bitset_clear (accepts, '\0');
9932  }
9933 #endif
9934  else
9935  continue;
9936 
9937  /* Check the `accepts' and sift the characters which are not
9938  match it the context. */
9939  if (constraint)
9940  {
9941  if (constraint & NEXT_NEWLINE_CONSTRAINT)
9942  {
9943  bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
9944  bitset_empty (accepts);
9945  if (accepts_newline)
9946  bitset_set (accepts, NEWLINE_CHAR);
9947  else
9948  continue;
9949  }
9950  if (constraint & NEXT_ENDBUF_CONSTRAINT)
9951  {
9952  bitset_empty (accepts);
9953  continue;
9954  }
9955 
9956  if (constraint & NEXT_WORD_CONSTRAINT)
9957  {
9958  bitset_word_t any_set = 0;
9959  if (type == CHARACTER && !node->word_char)
9960  {
9961  bitset_empty (accepts);
9962  continue;
9963  }
9964 #ifdef RE_ENABLE_I18N
9965  if (dfa->mb_cur_max > 1)
9966  for (j = 0; j < BITSET_WORDS; ++j)
9967  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
9968  else
9969 #endif
9970  for (j = 0; j < BITSET_WORDS; ++j)
9971  any_set |= (accepts[j] &= dfa->word_char[j]);
9972  if (!any_set)
9973  continue;
9974  }
9975  if (constraint & NEXT_NOTWORD_CONSTRAINT)
9976  {
9977  bitset_word_t any_set = 0;
9978  if (type == CHARACTER && node->word_char)
9979  {
9980  bitset_empty (accepts);
9981  continue;
9982  }
9983 #ifdef RE_ENABLE_I18N
9984  if (dfa->mb_cur_max > 1)
9985  for (j = 0; j < BITSET_WORDS; ++j)
9986  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
9987  else
9988 #endif
9989  for (j = 0; j < BITSET_WORDS; ++j)
9990  any_set |= (accepts[j] &= ~dfa->word_char[j]);
9991  if (!any_set)
9992  continue;
9993  }
9994  }
9995 
9996  /* Then divide `accepts' into DFA states, or create a new
9997  state. Above, we make sure that accepts is not empty. */
9998  for (j = 0; j < ndests; ++j)
9999  {
10000  bitset_t intersec; /* Intersection sets, see below. */
10001  bitset_t remains;
10002  /* Flags, see below. */
10003  bitset_word_t has_intersec, not_subset, not_consumed;
10004 
10005  /* Optimization, skip if this state doesn't accept the character. */
10006  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
10007  continue;
10008 
10009  /* Enumerate the intersection set of this state and `accepts'. */
10010  has_intersec = 0;
10011  for (k = 0; k < BITSET_WORDS; ++k)
10012  has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
10013  /* And skip if the intersection set is empty. */
10014  if (!has_intersec)
10015  continue;
10016 
10017  /* Then check if this state is a subset of `accepts'. */
10018  not_subset = not_consumed = 0;
10019  for (k = 0; k < BITSET_WORDS; ++k)
10020  {
10021  not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
10022  not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
10023  }
10024 
10025  /* If this state isn't a subset of `accepts', create a
10026  new group state, which has the `remains'. */
10027  if (not_subset)
10028  {
10029  bitset_copy (dests_ch[ndests], remains);
10030  bitset_copy (dests_ch[j], intersec);
10031  err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
10032  if (BE (err != REG_NOERROR, 0))
10033  goto error_return;
10034  ++ndests;
10035  }
10036 
10037  /* Put the position in the current group. */
10038  result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
10039  if (BE (result < 0, 0))
10040  goto error_return;
10041 
10042  /* If all characters are consumed, go to next node. */
10043  if (!not_consumed)
10044  break;
10045  }
10046  /* Some characters remain, create a new group. */
10047  if (j == ndests)
10048  {
10049  bitset_copy (dests_ch[ndests], accepts);
10050  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
10051  if (BE (err != REG_NOERROR, 0))
10052  goto error_return;
10053  ++ndests;
10054  bitset_empty (accepts);
10055  }
10056  }
10057  return ndests;
10058  error_return:
10059  for (j = 0; j < ndests; ++j)
10060  re_node_set_free (dests_node + j);
10061  return -1;
10062 }
10063 
10064 #ifdef RE_ENABLE_I18N
10065 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts at
10066  byte offset STR_IDX of INPUT.  Return that number of bytes, or 0 if
10067  the node does not accept the input at STR_IDX.
10068 
10069  This function handles the nodes which can accept one character or
10070  one collating element, like '.' and '[a-z]', as opposed to the other
10071  nodes, which can only accept one byte.  Only OP_UTF8_PERIOD, OP_PERIOD
 and COMPLEX_BRACKET nodes can yield a non-zero result; every other
 node type ends at the final `return 0'. */
10072 
10073 static int
10074 internal_function
10075 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
10076  const re_string_t *input, int str_idx)
10077 {
10078  const re_token_t *node = dfa->nodes + node_idx;
10079  int char_len, elem_len;
10080  int i;
10081 
 /* OP_UTF8_PERIOD: inspect the raw bytes directly.  C is the lead
 byte and D the byte that follows it. */
10082  if (BE (node->type == OP_UTF8_PERIOD, 0))
10083  {
10084  unsigned char c = re_string_byte_at (input, str_idx), d;
 /* Lead bytes below 0xc2 are never accepted by this function. */
10085  if (BE (c < 0xc2, 1))
10086  return 0;
10087 
 /* At least two bytes must be available in INPUT. */
10088  if (str_idx + 2 > input->len)
10089  return 0;
10090 
10091  d = re_string_byte_at (input, str_idx + 1);
 /* Lead byte below 0xe0: a two-byte sequence, accepted only when D
 lies in 0x80..0xbf. */
10092  if (c < 0xe0)
10093  return (d < 0x80 || d > 0xbf) ? 0 : 2;
 /* Longer sequences: pick the length from the lead byte and reject
 a second byte that is below the minimum allowed for the smallest
 lead byte of that length. */
10094  else if (c < 0xf0)
10095  {
10096  char_len = 3;
10097  if (c == 0xe0 && d < 0xa0)
10098  return 0;
10099  }
10100  else if (c < 0xf8)
10101  {
10102  char_len = 4;
10103  if (c == 0xf0 && d < 0x90)
10104  return 0;
10105  }
10106  else if (c < 0xfc)
10107  {
10108  char_len = 5;
10109  if (c == 0xf8 && d < 0x88)
10110  return 0;
10111  }
10112  else if (c < 0xfe)
10113  {
10114  char_len = 6;
10115  if (c == 0xfc && d < 0x84)
10116  return 0;
10117  }
10118  else
10119  return 0;
10120 
 /* The whole sequence has to fit inside INPUT. */
10121  if (str_idx + char_len > input->len)
10122  return 0;
10123 
 /* Every byte after the lead byte must lie in 0x80..0xbf. */
10124  for (i = 1; i < char_len; ++i)
10125  {
10126  d = re_string_byte_at (input, str_idx + i);
10127  if (d < 0x80 || d > 0xbf)
10128  return 0;
10129  }
10130  return char_len;
10131  }
10132 
10133  char_len = re_string_char_size_at (input, str_idx);
10134  if (node->type == OP_PERIOD)
10135  {
 /* A character of at most one byte is not accepted here. */
10136  if (char_len <= 1)
10137  return 0;
10138  /* FIXME: I don't think this if is needed, as both '\n'
10139  and '\0' are char_len == 1. */
10140  /* '.' accepts any one character except the following two cases. */
10141  if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
10142  re_string_byte_at (input, str_idx) == '\n') ||
10143  ((dfa->syntax & RE_DOT_NOT_NULL) &&
10144  re_string_byte_at (input, str_idx) == '\0'))
10145  return 0;
10146  return char_len;
10147  }
10148 
 /* Nothing to do when neither the collating element nor the character
 is longer than one byte, or when the character size is 0. */
10149  elem_len = re_string_elem_size_at (input, str_idx);
10150  if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
10151  return 0;
10152 
10153  if (node->type == COMPLEX_BRACKET)
10154  {
10155  const re_charset_t *cset = node->opr.mbcset;
10156 # ifdef _LIBC
10157  const unsigned char *pin
10158  = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
10159  int j;
10160  uint32_t nrules;
10161 # endif /* _LIBC */
 /* MATCH_LEN stays 0 until one of the tests below succeeds. */
10162  int match_len = 0;
 /* The wide character is fetched only when the set has ranges,
 character classes or multibyte characters; otherwise WC is 0. */
10163  wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
10164  ? re_string_wchar_at (input, str_idx) : 0);
10165 
10166  /* match with multibyte character? */
10167  for (i = 0; i < cset->nmbchars; ++i)
10168  if (wc == cset->mbchars[i])
10169  {
10170  match_len = char_len;
10171  goto check_node_accept_bytes_match;
10172  }
10173  /* match with character_class? */
10174  for (i = 0; i < cset->nchar_classes; ++i)
10175  {
10176  wctype_t wt = cset->char_classes[i];
10177  if (__iswctype (wc, wt))
10178  {
10179  match_len = char_len;
10180  goto check_node_accept_bytes_match;
10181  }
10182  }
10183 
10184 # ifdef _LIBC
 /* With a non-zero number of collation rules, consult the
 LC_COLLATE tables; otherwise fall through to the wcscoll-based
 range test in the `else' branch below. */
10185  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
10186  if (nrules != 0)
10187  {
10188  unsigned int in_collseq = 0;
10189  const int32_t *table, *indirect;
10190  const unsigned char *weights, *extra;
10191  const char *collseqwc;
10192  int32_t idx;
10193  /* This #include defines a local function! */
10194 # include <locale/weight.h>
10195 
10196  /* match with collating_symbol? */
10197  if (cset->ncoll_syms)
10198  extra = (const unsigned char *)
10199  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
10200  for (i = 0; i < cset->ncoll_syms; ++i)
10201  {
 /* The first byte at COLL_SYM is the symbol's length; its
 bytes follow. */
10202  const unsigned char *coll_sym = extra + cset->coll_syms[i];
10203  /* Compare the length of input collating element and
10204  the length of current collating element. */
10205  if (*coll_sym != elem_len)
10206  continue;
10207  /* Compare each bytes. */
10208  for (j = 0; j < *coll_sym; j++)
10209  if (pin[j] != coll_sym[1 + j])
10210  break;
10211  if (j == *coll_sym)
10212  {
10213  /* Match if every bytes is equal. */
10214  match_len = j;
10215  goto check_node_accept_bytes_match;
10216  }
10217  }
10218 
 /* Compute the collation sequence value of the input only when
 there are ranges to compare it against: from WC when the
 element is no longer than the character, otherwise from the
 element's bytes. */
10219  if (cset->nranges)
10220  {
10221  if (elem_len <= char_len)
10222  {
10223  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
10224  in_collseq = __collseq_table_lookup (collseqwc, wc);
10225  }
10226  else
10227  in_collseq = find_collation_sequence_value (pin, elem_len);
10228  }
10229  /* match with range expression? */
10230  for (i = 0; i < cset->nranges; ++i)
10231  if (cset->range_starts[i] <= in_collseq
10232  && in_collseq <= cset->range_ends[i])
10233  {
10234  match_len = elem_len;
10235  goto check_node_accept_bytes_match;
10236  }
10237 
10238  /* match with equivalence_class? */
10239  if (cset->nequiv_classes)
10240  {
10241  const unsigned char *cp = pin;
10242  table = (const int32_t *)
10243  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
10244  weights = (const unsigned char *)
10245  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
10246  extra = (const unsigned char *)
10247  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
10248  indirect = (const int32_t *)
10249  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
10250  idx = findidx (&cp);
10251  if (idx > 0)
10252  for (i = 0; i < cset->nequiv_classes; ++i)
10253  {
10254  int32_t equiv_class_idx = cset->equiv_classes[i];
10255  size_t weight_len = weights[idx];
 /* Same weight length first, then compare the weight
 bytes one by one; CNT passing WEIGHT_LEN means no
 difference was found. */
10256  if (weight_len == weights[equiv_class_idx])
10257  {
10258  int cnt = 0;
10259  while (cnt <= weight_len
10260  && (weights[equiv_class_idx + 1 + cnt]
10261  == weights[idx + 1 + cnt]))
10262  ++cnt;
10263  if (cnt > weight_len)
10264  {
10265  match_len = elem_len;
10266  goto check_node_accept_bytes_match;
10267  }
10268  }
10269  }
10270  }
10271  }
10272  else
10273 # endif /* _LIBC */
10274  {
10275  /* match with range expression? */
 /* CMP_BUF holds three one-character, NUL-terminated wide
 strings at offsets 0, 2 and 4: range start, WC, range end. */
10276 #if __GNUC__ >= 2
10277  wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
10278 #else
10279  wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
10280  cmp_buf[2] = wc;
10281 #endif
10282  for (i = 0; i < cset->nranges; ++i)
10283  {
10284  cmp_buf[0] = cset->range_starts[i];
10285  cmp_buf[4] = cset->range_ends[i];
10286  if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
10287  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
10288  {
10289  match_len = char_len;
10290  goto check_node_accept_bytes_match;
10291  }
10292  }
10293  }
 /* Reached both on a successful test (MATCH_LEN > 0) and after all
 tests failed (MATCH_LEN == 0).  When cset->non_match is set the
 outcome is inverted: a match yields 0 and a non-match yields the
 larger of ELEM_LEN and CHAR_LEN. */
10294  check_node_accept_bytes_match:
10295  if (!cset->non_match)
10296  return match_len;
10297  else
10298  {
10299  if (match_len > 0)
10300  return 0;
10301  else
10302  return (elem_len > char_len) ? elem_len : char_len;
10303  }
10304  }
10305  return 0;
10306 }
10307 
10308 # ifdef _LIBC
10309 static unsigned int
10310 internal_function
10311 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
10312 {
10313  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
10314  if (nrules == 0)
10315  {
10316  if (mbs_len == 1)
10317  {
10318  /* No valid character. Match it as a single byte character. */
10319  const unsigned char *collseq = (const unsigned char *)
10320  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
10321  return collseq[mbs[0]];
10322  }
10323  return UINT_MAX;
10324  }
10325  else
10326  {
10327  int32_t idx;
10328  const unsigned char *extra = (const unsigned char *)
10329  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
10330  int32_t extrasize = (const unsigned char *)
10331  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
10332 
10333  for (idx = 0; idx < extrasize;)
10334  {
10335  int mbs_cnt, found = 0;
10336  int32_t elem_mbs_len;
10337  /* Skip the name of collating element name. */
10338  idx = idx + extra[idx] + 1;
10339  elem_mbs_len = extra[idx++];
10340  if (mbs_len == elem_mbs_len)
10341  {
10342  for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
10343  if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
10344  break;
10345  if (mbs_cnt == elem_mbs_len)
10346  /* Found the entry. */
10347  found = 1;
10348  }
10349  /* Skip the byte sequence of the collating element. */
10350  idx += elem_mbs_len;
10351  /* Adjust for the alignment. */
10352  idx = (idx + 3) & ~3;
10353  /* Skip the collation sequence value. */
10354  idx += sizeof (uint32_t);
10355  /* Skip the wide char sequence of the collating element. */
10356  idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
10357  /* If we found the entry, return the sequence value. */
10358  if (found)
10359  return *(uint32_t *) (extra + idx);
10360  /* Skip the collation sequence value. */
10361  idx += sizeof (uint32_t);
10362  }
10363  return UINT_MAX;
10364  }
10365 }
10366 # endif /* _LIBC */
10367 #endif /* RE_ENABLE_I18N */
10368 
10369 /* Check whether the node accepts the byte which is IDX-th
10370  byte of the INPUT. */
10371 
10372 static int
10373 internal_function
10374 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
10375  int idx)
10376 {
10377  unsigned char ch;
10378  ch = re_string_byte_at (&mctx->input, idx);
10379  switch (node->type)
10380  {
10381  case CHARACTER:
10382  if (node->opr.c != ch)
10383  return 0;
10384  break;
10385 
10386  case SIMPLE_BRACKET:
10387  if (!bitset_contain (node->opr.sbcset, ch))
10388  return 0;
10389  break;
10390 
10391 #ifdef RE_ENABLE_I18N
10392  case OP_UTF8_PERIOD:
10393  if (ch >= 0x80)
10394  return 0;
10395  /* FALLTHROUGH */
10396 #endif
10397  case OP_PERIOD:
10398  if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
10399  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
10400  return 0;
10401  break;
10402 
10403  default:
10404  return 0;
10405  }
10406 
10407  if (node->constraint)
10408  {
10409  /* The node has constraints. Check whether the current context
10410  satisfies the constraints. */
10411  unsigned int context = re_string_context_at (&mctx->input, idx,
10412  mctx->eflags);
10413  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
10414  return 0;
10415  }
10416 
10417  return 1;
10418 }
10419 
10420 /* Extend the buffers, if the buffers have run out. */
10421 
10422 static reg_errcode_t
10423 internal_function
10424 extend_buffers (re_match_context_t *mctx)
10425 {
10427  re_string_t *pstr = &mctx->input;
10428 
10429  /* Double the lengthes of the buffers. */
10430  ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
10431  if (BE (ret != REG_NOERROR, 0))
10432  return ret;
10433 
10434  if (mctx->state_log != NULL)
10435  {
10436  /* And double the length of state_log. */
10437  /* XXX We have no indication of the size of this buffer. If this
10438  allocation fail we have no indication that the state_log array
10439  does not have the right size. */
10440  re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
10441  pstr->bufs_len + 1);
10442  if (BE (new_array == NULL, 0))
10443  return REG_ESPACE;
10444  mctx->state_log = new_array;
10445  }
10446 
10447  /* Then reconstruct the buffers. */
10448  if (pstr->icase)
10449  {
10450 #ifdef RE_ENABLE_I18N
10451  if (pstr->mb_cur_max > 1)
10452  {
10453  ret = build_wcs_upper_buffer (pstr);
10454  if (BE (ret != REG_NOERROR, 0))
10455  return ret;
10456  }
10457  else
10458 #endif /* RE_ENABLE_I18N */
10459  build_upper_buffer (pstr);
10460  }
10461  else
10462  {
10463 #ifdef RE_ENABLE_I18N
10464  if (pstr->mb_cur_max > 1)
10465  build_wcs_buffer (pstr);
10466  else
10467 #endif /* RE_ENABLE_I18N */
10468  {
10469  if (pstr->trans != NULL)
10470  re_string_translate_buffer (pstr);
10471  }
10472  }
10473  return REG_NOERROR;
10474 }
10475 
10476 
10477 /* Functions for matching context. */
10478 
10479 /* Initialize MCTX. */
10480 
10481 static reg_errcode_t
10482 internal_function
10483 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
10484 {
10485  mctx->eflags = eflags;
10486  mctx->match_last = -1;
10487  if (n > 0)
10488  {
10489  mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
10490  mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
10491  if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
10492  return REG_ESPACE;
10493  }
10494  /* Already zero-ed by the caller.
10495  else
10496  mctx->bkref_ents = NULL;
10497  mctx->nbkref_ents = 0;
10498  mctx->nsub_tops = 0; */
10499  mctx->abkref_ents = n;
10500  mctx->max_mb_elem_len = 1;
10501  mctx->asub_tops = n;
10502  return REG_NOERROR;
10503 }
10504 
10505 /* Clean the entries which depend on the current input in MCTX.
10506  This function must be invoked when the matcher changes the start index
10507  of the input, or changes the input string. */
10508 
10509 static void
10510 internal_function
10511 match_ctx_clean (re_match_context_t *mctx)
10512 {
10513  int st_idx;
10514  for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
10515  {
10516  int sl_idx;
10517  re_sub_match_top_t *top = mctx->sub_tops[st_idx];
10518  for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
10519  {
10520  re_sub_match_last_t *last = top->lasts[sl_idx];
10521  re_free (last->path.array);
10522  re_free (last);
10523  }
10524  re_free (top->lasts);
10525  if (top->path)
10526  {
10527  re_free (top->path->array);
10528  re_free (top->path);
10529  }
10530  free (top);
10531  }
10532 
10533  mctx->nsub_tops = 0;
10534  mctx->nbkref_ents = 0;
10535 }
10536 
10537 /* Free all the memory associated with MCTX. */
10538 
10539 static void
10540 internal_function
10541 match_ctx_free (re_match_context_t *mctx)
10542 {
10543  /* First, free all the memory associated with MCTX->SUB_TOPS. */
10544  match_ctx_clean (mctx);
10545  re_free (mctx->sub_tops);
10546  re_free (mctx->bkref_ents);
10547 }
10548 
10549 /* Add a new backreference entry to MCTX.
10550  Note that we assume that caller never call this function with duplicate
10551  entry, and call with STR_IDX which isn't smaller than any existing entry.
10552 */
10553 
10554 static reg_errcode_t
10555 internal_function
10556 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
10557  int to)
10558 {
10559  if (mctx->nbkref_ents >= mctx->abkref_ents)
10560  {
10561  struct re_backref_cache_entry* new_entry;
10562  new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
10563  mctx->abkref_ents * 2);
10564  if (BE (new_entry == NULL, 0))
10565  {
10566  re_free (mctx->bkref_ents);
10567  return REG_ESPACE;
10568  }
10569  mctx->bkref_ents = new_entry;
10570  memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
10571  sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
10572  mctx->abkref_ents *= 2;
10573  }
10574  if (mctx->nbkref_ents > 0
10575  && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
10576  mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
10577 
10578  mctx->bkref_ents[mctx->nbkref_ents].node = node;
10579  mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
10580  mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
10581  mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
10582 
10583  /* This is a cache that saves negative results of check_dst_limits_calc_pos.
10584  If bit N is clear, means that this entry won't epsilon-transition to
10585  an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
10586  it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
10587  such node.
10588 
10589  A backreference does not epsilon-transition unless it is empty, so set
10590  to all zeros if FROM != TO. */
10591  mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
10592  = (from == to ? ~0 : 0);
10593 
10594  mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
10595  if (mctx->max_mb_elem_len < to - from)
10596  mctx->max_mb_elem_len = to - from;
10597  return REG_NOERROR;
10598 }
10599 
10600 /* Search for the first entry which has the same str_idx, or -1 if none is
10601  found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
10602 
10603 static int
10604 internal_function
10605 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
10606 {
10607  int left, right, mid, last;
10608  last = right = mctx->nbkref_ents;
10609  for (left = 0; left < right;)
10610  {
10611  mid = (left + right) / 2;
10612  if (mctx->bkref_ents[mid].str_idx < str_idx)
10613  left = mid + 1;
10614  else
10615  right = mid;
10616  }
10617  if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
10618  return left;
10619  else
10620  return -1;
10621 }
10622 
10623 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
10624  at STR_IDX. */
10625 
10626 static reg_errcode_t
10627 internal_function
10628 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
10629 {
10630 #ifdef DEBUG
10631  assert (mctx->sub_tops != NULL);
10632  assert (mctx->asub_tops > 0);
10633 #endif
10634  if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
10635  {
10636  int new_asub_tops = mctx->asub_tops * 2;
10637  re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
10638  re_sub_match_top_t *,
10639  new_asub_tops);
10640  if (BE (new_array == NULL, 0))
10641  return REG_ESPACE;
10642  mctx->sub_tops = new_array;
10643  mctx->asub_tops = new_asub_tops;
10644  }
10645  mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
10646  if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
10647  return REG_ESPACE;
10648  mctx->sub_tops[mctx->nsub_tops]->node = node;
10649  mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
10650  return REG_NOERROR;
10651 }
10652 
10653 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
10654  at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
10655 
10656 static re_sub_match_last_t *
10657 internal_function
10658 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
10659 {
10660  re_sub_match_last_t *new_entry;
10661  if (BE (subtop->nlasts == subtop->alasts, 0))
10662  {
10663  int new_alasts = 2 * subtop->alasts + 1;
10664  re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
10665  re_sub_match_last_t *,
10666  new_alasts);
10667  if (BE (new_array == NULL, 0))
10668  return NULL;
10669  subtop->lasts = new_array;
10670  subtop->alasts = new_alasts;
10671  }
10672  new_entry = calloc (1, sizeof (re_sub_match_last_t));
10673  if (BE (new_entry != NULL, 1))
10674  {
10675  subtop->lasts[subtop->nlasts] = new_entry;
10676  new_entry->node = node;
10677  new_entry->str_idx = str_idx;
10678  ++subtop->nlasts;
10679  }
10680  return new_entry;
10681 }
10682 
10683 static void
10684 internal_function
10685 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
10686  re_dfastate_t **limited_sts, int last_node, int last_str_idx)
10687 {
10688  sctx->sifted_states = sifted_sts;
10689  sctx->limited_states = limited_sts;
10690  sctx->last_node = last_node;
10691  sctx->last_str_idx = last_str_idx;
10692  re_node_set_init_empty (&sctx->limits);
10693 }
10694 
10695 
10696 /* Binary backward compatibility.
 Only when built with _LIBC and when SHLIB_COMPAT (libc, GLIBC_2_0,
 GLIBC_2_3) holds: keep defining the `re_max_failures' variable,
 initialized to 2000, and pass the "obsolete" message below to
 link_warning for it. */
10697 #if _LIBC
10698 # include <shlib-compat.h>
10699 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3)
10700 link_warning (re_max_failures, "the 're_max_failures' variable is obsolete and will go away.")
10701 int re_max_failures = 2000;
10702 # endif
10703 #endif
10704 #endif
regoff_t * end
Definition: gkregex.h:419
void regfree(regex_t *__preg)
ssize_t hash(handle obj)
Definition: pytypes.h:457
int array[24]
#define RE_DOT_NOT_NULL
Definition: gkregex.h:93
reg_syntax_t re_set_syntax(reg_syntax_t __syntax)
static const Key c2
#define max(a, b)
Definition: datatypes.h:20
constexpr int last(int, int result)
#define RE_DOT_NEWLINE
Definition: gkregex.h:89
#define RE_NO_BK_PARENS
Definition: gkregex.h:119
#define REGS_REALLOCATE
Definition: gkregex.h:384
reg_errcode_t
Definition: gkregex.h:303
return int(ret)+1
#define SIZE_MAX
Definition: ms_stdint.h:211
#define REG_ICASE
Definition: gkregex.h:272
for(size_t i=1;i< poses.size();++i)
EIGEN_DEVICE_FUNC const ExpReturnType exp() const
Q id(Eigen::AngleAxisd(0, Q_z_axis))
#define __restrict
Definition: gkregex.h:522
uint32_t uint_fast32_t
Definition: ms_stdint.h:115
KeyVector nodes
Definition: testMFAS.cpp:28
regoff_t rm_so
Definition: gkregex.h:436
int n
#define RE_LIMITED_OPS
Definition: gkregex.h:106
Scalar Scalar * c
Definition: benchVecAdd.cpp:17
#define RE_SYNTAX_POSIX_BASIC
Definition: gkregex.h:230
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
char * fastmap
Definition: gkregex.h:362
MatrixXd L
Definition: LLT_example.cpp:6
#define RE_NO_EMPTY_RANGES
Definition: gkregex.h:133
size_t regerror(int __errcode, const regex_t *__restrict __preg, char *__restrict __errbuf, size_t __errbuf_size)
static char trans
if((m *x).isApprox(y))
#define RE_CONTEXT_INDEP_OPS
Definition: gkregex.h:81
const mpreal root(const mpreal &x, unsigned long int k, mp_rnd_t r=mpreal::get_default_rnd())
Definition: mpreal.h:2194
#define REG_NOTEOL
Definition: gkregex.h:294
reg_syntax_t syntax
Definition: gkregex.h:357
regoff_t rm_eo
Definition: gkregex.h:437
#define RE_CONTEXT_INVALID_OPS
Definition: gkregex.h:85
#define RE_UNMATCHED_RIGHT_PAREN_ORD
Definition: gkregex.h:137
#define REG_NOTBOL
Definition: gkregex.h:291
#define RE_NO_BK_VBAR
Definition: gkregex.h:127
#define RE_TRANSLATE_TYPE
Definition: gkregex.h:340
int re_match(struct re_pattern_buffer *__buffer, const char *__string, int __length, int __start, struct re_registers *__regs)
static char left
#define RE_HAT_LISTS_NOT_NEWLINE
Definition: gkregex.h:97
Scalar Scalar int size
Definition: benchVecAdd.cpp:17
void re_set_registers(struct re_pattern_buffer *__buffer, struct re_registers *__regs, unsigned int __num_regs, regoff_t *__starts, regoff_t *__ends)
int re_match_2(struct re_pattern_buffer *__buffer, const char *__string1, int __length1, const char *__string2, int __length2, int __start, struct re_registers *__regs, int __stop)
constexpr int first(int i)
Implementation details for constexpr functions.
Values result
#define REG_NEWLINE
Definition: gkregex.h:277
int re_search_2(struct re_pattern_buffer *__buffer, const char *__string1, int __length1, const char *__string2, int __length2, int __start, int __range, struct re_registers *__regs, int __stop)
Definition: pytypes.h:928
unsigned num_regs
Definition: gkregex.h:417
int regexec(const regex_t *__restrict __preg, const char *__restrict __string, size_t __nmatch, regmatch_t __pmatch[__restrict_arr], int __eflags)
#define RE_BACKSLASH_ESCAPE_IN_LISTS
Definition: gkregex.h:48
int re_search(struct re_pattern_buffer *__buffer, const char *__string, int __length, int __start, int __range, struct re_registers *__regs)
unsigned fastmap_accurate
Definition: gkregex.h:390
unsigned int uint32_t
Definition: ms_stdint.h:85
size_t re_nsub
Definition: gkregex.h:371
signed int int32_t
Definition: ms_stdint.h:82
#define REGS_FIXED
Definition: gkregex.h:385
const char * re_compile_pattern(const char *__pattern, size_t __length, struct re_pattern_buffer *__buffer)
#define RE_INVALID_INTERVAL_ORD
Definition: gkregex.h:158
int data[]
int regcomp(regex_t *__restrict __preg, const char *__restrict __pattern, int __cflags)
RealScalar s
unsigned long int used
Definition: gkregex.h:354
EIGEN_DEVICE_FUNC const Scalar & q
unsigned not_eol
Definition: gkregex.h:401
static char right
#define NULL
Definition: ccolamd.c:609
regoff_t * start
Definition: gkregex.h:418
unsigned newline_anchor
Definition: gkregex.h:404
RowVector3d w
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set offsets
#define REG_STARTEND
Definition: gkregex.h:298
unsigned can_be_null
Definition: gkregex.h:377
#define RE_NO_BK_REFS
Definition: gkregex.h:123
int regoff_t
Definition: gkregex.h:410
unsigned char * buffer
Definition: gkregex.h:348
#define RE_CONTEXT_INVALID_DUP
Definition: gkregex.h:171
unsigned not_bol
Definition: gkregex.h:398
#define REGS_UNALLOCATED
Definition: gkregex.h:383
ArrayXXf table(10, 4)
unsigned long int allocated
Definition: gkregex.h:351
DenseIndex ret
Definition: level1_impl.h:59
unsigned long int reg_syntax_t
Definition: gkregex.h:44
#define RE_SYNTAX_POSIX_EXTENDED
Definition: gkregex.h:239
#define RE_NO_SUB
Definition: gkregex.h:175
Matrix stack(size_t nrMatrices,...)
Definition: Matrix.cpp:396
float * p
#define RE_NO_GNU_OPS
Definition: gkregex.h:145
void gkfooo()
Definition: gkregex.c:22
#define RE_DUP_MAX
Definition: gkregex.h:261
Annotation for function names.
Definition: attr.h:36
#define REG_EXTENDED
Definition: gkregex.h:268
#define REG_NOSUB
Definition: gkregex.h:281
unsigned no_sub
Definition: gkregex.h:394
#define RE_BK_PLUS_QM
Definition: gkregex.h:53
reg_syntax_t re_syntax_options
size_t len(handle h)
Definition: pytypes.h:1514
#define RE_INTERVALS
Definition: gkregex.h:102
def parse(input_path, output_path, quiet=False, generate_xml_flag=True)
Definition: parse_xml.py:10
unsigned regs_allocated
Definition: gkregex.h:386
#define RE_NEWLINE_ALT
Definition: gkregex.h:110
#define RE_ICASE
Definition: gkregex.h:162
#define RE_CARET_ANCHORS_HERE
Definition: gkregex.h:167
Definition: pytypes.h:1325
#define RE_CONTEXT_INDEP_ANCHORS
Definition: gkregex.h:73
std::ptrdiff_t j
int re_compile_fastmap(struct re_pattern_buffer *__buffer)
#define RE_CHAR_CLASSES
Definition: gkregex.h:59
Point2 t(10, 10)
RE_TRANSLATE_TYPE translate
Definition: gkregex.h:368
Definition: pytypes.h:897
#define RE_NO_BK_BRACES
Definition: gkregex.h:115


gtsam
Author(s):
autogenerated on Sat May 8 2021 02:42:09