2 * Copyright Neil Brown ©2015-2023 <neil@brown.name>
3 * May be distributed under terms of GPLv2 - see file:COPYING
5 * rexel - A Regular EXpression Evaluation Library
6 * because everyone needs their own regex library
8 * This library supports matching without backtracking by providing
9 * a single character at a time. When a match is found, the
10 * length of that match is reported.
12 * Compiled form of a regex is an array of 16 bit unsigned numbers called
13 * rexels, or Regular EXpression ELements.
14 * This involves some cheating as wctype_t (unsigned long int) values
15 * are stored in 16 bits.
16 * This array is comprised of a "regexp" section follow by some a "set"
18 * The first entry in the regex section is the size of that section (including
19 * the length). Adding this size to the start gives the start of the
22 * The "set" section contains some "sets" each of which contains 1 or
23 * more subsections followed by a "zero". Each subsection starts with
24 * its size. The first section can have size zero, others cannot (as
25 * another zero marks the end of the "set").
26 * The first subsection of a set is a list of "character classes".
27 * An internal mapping is created from used character classes (like
28 * "digit" and "lower" etc) to small numbers. If a set should match a
29 * given character class, the small number is stored in this subsection
30 * If a set should *not* match, then the small number is added with the
33 * Subsequent subsections contain a general character-set, each
34 * subsection for a single unicode plane. The top six bits of the first
35 * entry is the plane number, the remaining bits are the size. After
36 * this are "size" 16bit chars in sorted order. The values in even
37 * slots are in the set, values in odd slots are not. Value not in any
38 * slot are treated like the largest value less than it which does have
39 * a slot. So iff a search for "largest entry nor larger than" finds an
40 * even slot, then the target is in the set.
42 * The rexels in the "regexp" section come in 4 groups.
43 * 0: 15 bit unicode number. Other unicode numbers require the plane
44 * to be given first, described below.
45 * 100: address of a "regex" subarray. The match forks at this point,
46 * both the next entry and the addressed entry are considered.
47 * This limits total size to 8192 entries.
49 * 101: address of a char set. This 13 bit address is an offset from the
50 * start of the "set" section.
52 * 1100: start of a capture group. 4096 possible groups.
53 * 1101: end of a capture group.
55 * 1110: must match the most recent instance of the capture group.
57 * 1111 0000 000: A UNICODE plane number - 0..16 The next 16 bit
58 * number is combined with the plane number to produce a code point
59 * to match. The first half of plane zero does not need this
60 * 32bit format - those value can be given in 16 bites.
62 * The last several values have special meanings.
63 * 0xffe0 - match any char
64 * 0xffe1 - match any character except any EOL character
65 * 0xffe2 - match no char - dead end.
66 * 0xffe3 - report success.
67 * 0xffe4 - match at start of line
68 * 0xffe5 - match at end of line
69 * 0xffe6 - match at start of word
70 * 0xffe7 - match at end of word
71 * 0xffe8 - match a word break (start or end)
72 * 0xffe9 - match any point that isn't a word break.
73 * 0xffea - match start of document
74 * 0xffeb - match end of document
75 * 0xffec - match the location indicated by RXL_POINT
77 * 0xfffb - match any quote: " ' ` - lax searching
78 * 0xfffc - match 1 or more spaces/tabs/newlines - lax searching.
79 * 0xfffd - match - or _ - lax searching
80 * 0xfffe - Subsequent chars (0x..) are matched ignoring case
81 * 0xffff - Subsequent chars are match case-correct.
83 * When matching, two pairs of extra arrays are allocated and used.
84 * One pair is 'before', one pair is 'after'. They swap on each char.
85 * One contains a threaded linkage among all points in regex subarray
86 * which are currently matched. A 'zero' marks the end of the chain.
87 * The other records the length of the longest match at that point.
88 * So when a char is matched, the length+1 of the 'before' moves
89 * to the 'after' position.
91 * A match is *before* processing the index command.
93 * "man 7 regex" described POSIX regular expression and notes some area
94 * where implementations differ, using (!). The terminology describes
95 * a Regular Expression (RE) as:
96 * RE -> branch ( '|' branch ) * # 1 or more branches
98 * branch -> piece ( piece ) * # 1 or more pieces,
100 * piece -> atom ( '*' | '+' | '?' | bound )? # an atom, possibly
101 * # followed by repeater
102 * bound -> '{' N ( ',' ( N )? )? '}' # 1 or 2 numbers in braces.
103 * atom -> '(' RE ')' | C | '.' | \?? # dot or char or
104 * # RE in parentheses.
106 * Responding to each implementation difference:
107 * - There must be at least one branch in an RE, and all must be non-empty.
108 * - A branch needs at least one piece.
109 * - This implementation (currently) only allows a *single* '*','+','?'
111 * - Integers in a bound must be less than 256.
112 * - The empty string atom '()' is not permitted
113 * - '\C', where 'C' is a special character: one of ^.[$()|*+?{\ removes any
114 * special meaning from that character. This does not apply inside []
115 * as those characters have no special meaning, or a different meaning there.
116 * - '\C', where 'C' is not in that list is an error except for those used for
117 * some special character classes. Those classes which are not
118 * "everything except" are permitted equally inside character sets ([]).
121 * \p a punctuation character
122 * \s a spacing character
123 * \w a word (alphabetic character)
124 * \D \P \S \W are negation of above. So non-digit, non-punctuation etc.
125 * \A an upper case character
126 * \a a lower case character
128 * - A '{' followed by a non-digit is just a '{'
129 * - Two ranges may *not* share an endpoint.
130 * - equivalence classes and collating elements are not implemented.
131 * - No particular limit on the length of an RE is imposed (yet)
135 * ?: - a group that will never be captured
136 * ?| - don't capture, and within each branch any caputuring uses
138 * ?0 - all remainder is literal
139 * ?nnn:- a group of precisely nnn literal match chars
140 * ?isLn-isLn: - flags before any optional '-' are set. Flags after
143 * L - lax matching for space, hyphen, and quote
144 * s - single line, '.' matches newline
145 * n - no capture in subgroups either
158 #include <sys/mman.h>
159 #include <sys/stat.h>
170 unsigned short *rxl safe;
171 unsigned short * safe link[2];
172 unsigned short * safe leng[2];
173 unsigned long * safe ignorecase;
174 unsigned short active;
180 /* Backtracking state */
182 /* 'buf' is an allocated buffer of 'buf_size'.
183 * There is text up to buf_count which we have been asked
184 * to match against. We are currently considering the
185 * wchar at buf_pos to see if it matches. If buf_pos == buf_count,
186 * rxl_advance_bt() will return so that another char can be provided.
189 unsigned short buf_size;
190 unsigned short buf_pos, buf_count;
191 /* pos in an index in 'rxl' that buf[buf_pos] must be matched against. */
193 /* 'record' is an allocated buffer of record_size of
194 * which record_count entries record details of the current match
195 * of buf..buf_pos from above.
196 * We only record fork points that were taken on the path through the
197 * pattern which achieved the current match. During matching, we may
198 * need to rewind to these, and follow forward from that fork.
199 * During capture extraction, we need to follow those to find
200 * how the match applied.
206 unsigned short record_count;
207 unsigned short record_size;
214 #define wctype_cells (sizeof(wctype_t) / sizeof(unsigned short))
216 /* ignorecase is a bit set */
217 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
218 static inline int BITSET_SIZE(int bits)
220 return (bits + BITS_PER_LONG - 1) / BITS_PER_LONG;
222 static inline bool test_bit(int bit, const unsigned long *set safe)
224 return !!(set[bit / BITS_PER_LONG] & (1 << (bit % BITS_PER_LONG)));
227 static inline void set_bit(int bit, unsigned long *set safe)
229 set[bit / BITS_PER_LONG] |= (1 << (bit % BITS_PER_LONG));
232 static inline void clear_bit(int bit, unsigned long *set safe)
234 set[bit / BITS_PER_LONG] &= ~(1 << (bit % BITS_PER_LONG));
237 #define NO_LINK 0xFFFF
238 #define LOOP_CHECK 0xFFFE
241 #define REC_ANY 0xFFE0
242 #define REC_ANY_NONL 0xFFE1
243 #define REC_NONE 0xFFE2
244 #define REC_MATCH 0xFFE3
246 #define REC_SOL 0xFFE4
247 #define REC_EOL 0xFFE5
248 #define REC_SOW 0xFFE6
249 #define REC_EOW 0xFFE7
250 #define REC_WBRK 0xFFE8
251 #define REC_NOWBRK 0xFFE9
252 #define REC_SOD 0xFFEa /* Start of Document */
253 #define REC_EOD 0xFFEb /* End of Document */
254 #define REC_POINT 0xFFEc
256 #define REC_LAXQUOT 0xFFFb
257 #define REC_LAXSPC 0xFFFc
258 #define REC_LAXDASH 0xFFFd
259 #define REC_IGNCASE 0xFFFe
260 #define REC_USECASE 0xFFFf
262 #define REC_FORK 0x8000
263 #define REC_FORKLAST 0x8000
264 #define REC_FORKFIRST 0x9000
265 #define REC_SET 0xa000
266 #define REC_CAPTURE 0xc000
267 #define REC_CAPTURED 0xc800
269 #define REC_BACKREF 0xe000
270 #define REC_PLANE 0xf000
271 #define REC_ISCHAR(x) (!((x) & 0x8000))
272 #define REC_ISSPEC(x) ((x) >= REC_ANY)
273 #define REC_ISFORK(x) (((x) & 0xe000) == REC_FORK)
274 #define REC_ISSET(x) (((x) & 0xe000) == REC_SET)
275 #define REC_ISCAPTURE(x) (((x) & 0xf000) == REC_CAPTURE)
276 #define REC_ISBACKREF(x) (((x) & 0xf000) == REC_BACKREF)
277 #define REC_BACKREF_MAKE(x) (REC_BACKREF | ((x) & 0x7ff))
278 #define REC_ADDR(x) ((x) & 0x0fff)
279 #define REC_ISFIRST(x) (!!((x) & (REC_FORKFIRST ^ REC_FORKLAST)))
280 #define REC_CAPNUM(x) ((x) & 0x07ff)
281 #define REC_CAPEND(x) ((x) & 0x0800)
282 #define REC_ISPLANE(x) (((x) & 0xFFE0) == 0xF000)
283 #define REC_PLANE_OF(x) ((x) & 0x1F)
284 #define REC_PLANE_MAKE(x, y) ((REC_PLANE_OF(x)<<16) | y)
286 static inline bool rec_noop(unsigned short cmd)
288 /* These commands don't affect matching directly, they
289 * are extracted and used outside the core matcher.
291 return cmd == REC_IGNCASE || cmd == REC_USECASE ||
295 static inline bool rec_zerowidth(unsigned short cmd)
297 return ((cmd >= REC_SOL && cmd <= REC_NOWBRK) ||
301 /* First entry contains start of maps, and flags */
302 #define RXL_PATNLEN(rxl) ((rxl)[0] & 0x3fff)
303 #define RXL_SETSTART(rxl) ((rxl) + RXL_PATNLEN(rxl))
305 static enum rxl_found rxl_advance_bt(struct match_state *st safe, wint_t ch);
306 static void bt_link(struct match_state *st safe, int pos, int len);
307 static int get_capture(struct match_state *s safe, int n, int which,
308 char *buf, int *startp);
311 * The match_state contains several partial matches that lead to "here".
312 * rxl_advance() examines each of these to determine if they will still match
313 * after consuming either a character or a position-type flag (SOL, EOL, etc).
314 * It calls do_link for each case that is still a possible match.
315 * 'pos' is the position in the regexp that matches the new point in the target.
316 * 'dest' is the place in the new threaded list to record this match. i.e.
317 * the slot that it currently the end of the list.
318 * 'len' is the length of the match of to this (new) point in the target.
319 * If there is already a match to this point in the pattern, we just update
320 * the length and don't relink anything.
323 static void do_link(struct match_state *st safe, int pos, int *destp, int len)
325 unsigned short cmd = st->rxl[pos];
327 while (rec_noop(cmd)) {
331 if (cmd == REC_MATCH) {
334 /* Don't accept another start point */
339 bt_link(st, pos, len);
345 if (cmd == REC_NOWBRK) {
346 /* NOWBRK is special because it matches a character
347 * without consuming it. We link them at the start
348 * of the list so they can be found quickly.
350 if (st->link[st->active][pos] == NO_LINK) {
351 st->leng[st->active][pos] = len;
353 if (st->link[st->active][0] == 0)
355 st->link[st->active][pos] =
356 st->link[st->active][0];
357 st->link[st->active][0] = pos;
358 } else if (st->leng[st->active][pos] < len)
359 st->leng[st->active][pos] = len;
360 } else if (!REC_ISFORK(cmd)) {
361 /* not a FORK, so just link it in. */
362 if (st->link[st->active][pos] == NO_LINK) {
363 st->leng[st->active][pos] = len;
365 st->link[st->active][*destp] = pos;
366 st->link[st->active][pos] = 0;
368 } else if (st->leng[st->active][pos] < len)
369 st->leng[st->active][pos] = len;
370 } else if (st->link[st->active][pos] == NO_LINK ||
371 st->leng[st->active][pos] < len) {
372 st->link[st->active][pos] = LOOP_CHECK;
373 st->leng[st->active][pos] = len;
374 do_link(st, REC_ADDR(cmd), destp, len);
375 do_link(st, pos+1, destp, len);
379 static int set_match(struct match_state *st safe, unsigned short addr,
382 unsigned short *set safe = RXL_SETSTART(st->rxl) + addr;
383 wchar_t uch = ch, lch = ch;
388 /* As Unicode has 3 cases, can we be sure that everything
389 * has a 'lower' to map to? Surely everything has at least
390 * and upper or a lower...
395 /* First there is an 'invert' flag and possibly some char classes */
397 invert = !!(len & 0x8000);
400 for ( ; len; set += wctype_cells) {
403 memcpy(&class, set, sizeof(wctype_t));
404 if (iswctype(uch, class) ||
405 (uch != lch && iswctype(lch, class)))
409 /* now there might be some sets. Each set starts with a size with
410 * top 5 bytes indicating top bytes of unicode planes, and bottom
411 * 11 bytes size of table
413 while ((len = *set++) != 0) {
414 int high = (len & 0xF800) << 5;
415 /* Both upper and lower case have been placed in set,
416 * so only need to search for one of them
418 unsigned short target;
422 if ((uch & 0x1f0000) == high)
423 target = uch & 0xffff;
424 else if ((lch & 0x1f0000) == high)
425 target = lch & 0xffff;
430 /* Binary search to find first entry that is greater
433 lo = 0; /* Invar: No entry before lo is greater than target */
434 hi = len; /* Invar: Every entry at or after hi is greater
437 /* Sanity check - array must be sorted */
438 for (lo = 1; lo < len; lo++)
439 if (set[lo-1] >= set[lo]) {
440 printf("Set %d, subset %d not ordered at %u\n",
441 addr, set - RXL_SETSTART(st->rxl) - addr,
449 int mid = (lo + hi) / 2;
450 if (set[mid] > target)
455 /* set[lo] == set[hi] = first entry greater than target.
456 * If 'lo' is even, there was no match. If 'lo' is odd,
467 * Advance the match state to process 'ch' or a flag.
468 * flag indicates start/end of word/line.
469 * Returns -2 if there is no possibility of a match including this ch/flag
470 * Returns -1 if part of the pattern has matched, and more input is needed.
471 * Returns >=0 if a match has been found. The return value is the number
472 * of characters (not flags) in the match.
473 * When a >=0 return is given, it might still be useful to keep calling
474 * rxl_advance if a maximal match is wanted.
475 * If the match must be anchor to the first character, this must have been
476 * advised to rxl_prepare. The caller should keep calling until no chars are
477 * left, or -2 is returned (no match), or >=0 is returned. In the latter case
478 * the call might keep calling to see if a longer match is possible. Until -2
479 * is seen, a longer match is still possible.
482 static int advance_one(struct match_state *st safe, int i,
486 unsigned int cmd = st->rxl[i];
487 wint_t lch = ch, uch = ch;
488 bool ic = test_bit(i, st->ignorecase);
497 if (REC_ISSPEC(cmd)) {
506 if (ch == '\n' || ch == '\r' || ch == '\f')
512 /* cannot match more chars here */
554 if (flag & RXL_POINT)
564 else if (flag & RXL_NOWBRK || !flag)
572 else if (flag & RXL_NOWBRK || !flag)
578 if (flag & (RXL_SOW | RXL_EOW))
580 else if (flag & RXL_NOWBRK || !flag)
586 if (flag & (RXL_SOW | RXL_EOW))
588 else if (flag & RXL_NOWBRK)
594 if (strchr(" \t\r\n\f", ch) != NULL)
602 if (strchr("'`\"", ch) != NULL)
610 if (strchr("-_.", ch) != NULL)
619 /* expecting a char, so ignore position info */
621 } else if (REC_ISCHAR(cmd)) {
623 (ic && (cmd == uch || cmd == lch)))
627 } else if (REC_ISPLANE(cmd) && i+1 < RXL_PATNLEN(st->rxl)) {
628 cmd = REC_PLANE_MAKE(cmd, st->rxl[i+1]);
631 (ic && (cmd == uch || cmd == lch)))
635 } else if (REC_ISSET(cmd)) {
636 if (set_match(st, REC_ADDR(cmd), ch, ic))
640 } else if (REC_ISBACKREF(cmd)) {
641 /* Backref not supported */
644 /* Nothing else is possible here */
648 /* no match on this path */
650 else if (advance == 0)
651 /* Nothing conclusive here */
652 do_link(st, i, eolp, len);
654 /* Need to advance and link the new address in. However
655 * if there is a fork, we might need to link multiple
656 * addresses in. Best use recursion.
658 do_link(st, i+inc, eolp, len + (ch != 0));
662 enum rxl_found rxl_advance(struct match_state *st safe, wint_t ch)
668 wint_t flag = ch & ~(0x1fffff);
669 enum rxl_found ret = RXL_NOMATCH;
672 return rxl_advance_bt(st, ch);
676 if (flag && ((flag & (flag-1)) || ch)) {
679 /* Need to handle flags separately */
680 for (f = RXL_SOD ; f && f <= RXL_LAST; f <<= 1) {
683 r = rxl_advance(st, f);
691 if (flag == RXL_ANCHOR) {
699 if (flag == RXL_NOWBRK &&
700 (st->link[active][0] == NO_LINK ||
701 st->rxl[st->link[active][0]] != REC_NOWBRK))
702 /* Reporting not-a-word-boundary, but no-one cares,
705 return st->match >= 0 ? RXL_MATCH_FLAG :
706 st->len >= 0 ? RXL_CONTINUE : RXL_NOMATCH;
712 /* We haven't found a match yet and search is not anchored,
713 * so possibly start a search here.
716 while (st->link[active][eol])
717 eol = st->link[active][eol];
718 /* Found the end of the list. */
719 do_link(st, 1, &eol, 0);
726 /* Trace shows current length at each state. FORK points
727 * are not state points
728 * At each point we print the Char or Set number, then the
729 * length of a match to there - on the next line.
730 * Allow 4 chars per column
734 int len = RXL_PATNLEN(st->rxl);
736 for (i = 1; i < len; i++)
737 if (!REC_ISFORK(st->rxl[i])) {
738 unsigned short cmd = st->rxl[i];
739 if (REC_ISCHAR(cmd)) {
740 if (cmd > ' ' && cmd < 0x7f)
741 printf("'%c' ", cmd);
744 } else if (REC_ISSET(cmd)) {
745 printf("S%-3d", REC_ADDR(cmd));
746 } else if (REC_ISBACKREF(cmd)) {
747 printf("B%-3d", REC_CAPNUM(cmd));
748 } else if (REC_ISCAPTURE(cmd)) {
750 printf("%3d)", REC_CAPNUM(cmd));
752 printf("(%-3d", REC_CAPNUM(cmd));
754 case REC_ANY: printf(" . "); break;
755 case REC_ANY_NONL: printf(" .? "); break;
756 case REC_NONE:printf("## "); break;
757 case REC_SOL: printf(" ^ "); break;
758 case REC_EOL: printf(" $ "); break;
759 case REC_SOW: printf("\\< "); break;
760 case REC_EOW: printf("\\> "); break;
761 case REC_SOD: printf("\\` "); break;
762 case REC_EOD: printf("\\' "); break;
763 case REC_POINT: printf("\\= "); break;
764 case REC_WBRK: printf("\\b "); break;
765 case REC_NOWBRK: printf("\\B "); break;
766 case REC_MATCH:printf("!!! "); break;
767 case REC_LAXSPC: printf("x20!"); break;
768 case REC_LAXQUOT: printf("'! "); break;
769 case REC_LAXDASH: printf("-! "); break;
770 case REC_IGNCASE: printf("?i:"); break;
771 case REC_USECASE: printf("?c:"); break;
772 default: printf("!%04x", cmd);
776 for (i = 1 ; i < len; i++)
777 if (!REC_ISFORK(st->rxl[i])) {
778 if (st->link[active][i] == NO_LINK)
781 printf("%2d ", st->leng[active][i]);
785 if (flag & RXL_SOD) printf(" SOD");
786 if (flag & RXL_SOL) printf(" SOL");
787 if (flag & RXL_EOL) printf(" EOL");
788 if (flag & RXL_EOD) printf(" EOD");
789 if (flag & RXL_SOW) printf(" SOW");
790 if (flag & RXL_EOW) printf(" EOW");
791 if (flag & RXL_NOWBRK) printf(" NOWBRK");
792 if (flag & RXL_POINT) printf(" POINT");
794 printf("Match %s(%x)",
795 ch < ' ' ? "?" : put_utf8(t, ch) , ch);
797 /* Now check the linkage is correct. The chain should lead
798 * to 0 without seeing any 'NO_LINK' or any ISFORK, and
799 * the number of NO_LINK plus number on chain should make len
804 if (st->link[active][i] == NO_LINK)
806 if (i && REC_ISFORK(st->rxl[i]))
809 i = st->link[active][i];
813 for (i = 0; i < len; i++)
814 if (st->link[active][i] == NO_LINK ||
815 st->link[active][i] == LOOP_CHECK)
821 /* Firstly, clear out next lists */
822 /* This works because NO_LINK is 0xffff */
823 memset(st->link[next], 0xff, RXL_PATNLEN(st->rxl) * 2);
824 memset(st->leng[next], 0, RXL_PATNLEN(st->rxl) * 2);
825 st->link[next][0] = 0;
827 /* Now advance each current match */
828 for (i = st->link[active][0]; i; i = st->link[active][i]) {
829 int len = st->leng[active][i];
831 advance_one(st, i, ch, flag, len, &eol);
833 st->link[next][eol] = 0;
834 if (st->match > st->len) {
836 st->start = st->total - st->len;
840 if (st->match >= 0 || eol != 0)
841 printf(" ... -> %d\n", st->match);
843 printf(" ... -> NOMATCH\n");
846 /* 'ret' is the result of an flags, or RXL_NOMATCH. */
847 if (ret >= RXL_MATCH && !flag && st->match >= 0)
849 if (ret >= RXL_MATCH_FLAG || (st->match >= 0 && flag))
850 return RXL_MATCH_FLAG;
851 if (ret >= RXL_MATCH || st->match >= 0)
853 if (eol == 0 && st->match < 0 && st->anchored) {
854 /* No chance of finding (another) match now */
862 int rxl_info(struct match_state *st safe, int *lenp safe, int *totalp,
863 int *startp, int *since_startp)
878 *since_startp = st->total - st->start;
880 /* Return 'true' if there might be something here and
881 * we cannot safely skip forward
883 return st->anchored ||
884 (st->link[st->active] && st->link[st->active][0] != 0);
887 #define RESIZE(buf) \
889 if ((buf##_size) <= (buf##_count+1)) { \
890 if ((buf##_size) < 8) \
893 (buf) = realloc(buf, (buf##_size) * sizeof((buf)[0]));\
897 static void bt_link(struct match_state *st safe, int pos, int len)
899 unsigned short cmd = st->rxl[pos];
901 if (!REC_ISFORK(cmd)) {
907 st->record[st->record_count].pos = pos;
908 st->record[st->record_count].len = len;
909 st->record_count += 1;
910 if (REC_ISFIRST(cmd))
911 /* priority fork - follow the fork */
912 do_link(st, REC_ADDR(cmd), NULL, len);
914 /* just continue for now, fork later */
915 do_link(st, pos + 1, NULL, len);
918 static enum rxl_found rxl_advance_bt(struct match_state *st safe, wint_t ch)
920 /* This is a back-tracking version of rxl_advance(). If the new
921 * ch/flag matches, we store it and return. If it doesn't, we
922 * backtrack and retry another path until we run out of all
923 * paths, or find a path where more input is needed.
926 if (st->anchored && st->record_count == 0 && st->pos == 0)
933 st->buf[st->buf_count++] = ch;
940 ch = st->buf[st->buf_pos++];
941 flags = ch & ~0x1FFFFF;
944 for (f = RXL_SOD ; f && f <= RXL_EOD*2; f <<= 1) {
951 else if (f == RXL_EOD*2) {
953 printf("%d: Match %s(%x)", i,
954 ch < ' ' ? "?" : put_utf8(t, ch) , ch);
955 } else if (f & flags){
956 printf("%d: Flag:", i);
957 if (f & RXL_SOD) printf(" SOD");
958 if (f & RXL_EOD) printf(" EOD");
959 if (f & RXL_SOL) printf(" SOL");
960 if (f & RXL_EOL) printf(" EOL");
961 if (f & RXL_SOW) printf(" SOW");
962 if (f & RXL_EOW) printf(" EOW");
963 if (f & RXL_NOWBRK) printf(" NOWBRK");
964 if (f & RXL_POINT) printf(" POINT");
968 if (f == RXL_EOD*2 && ch)
969 r = advance_one(st, i, ch, 0,
970 st->buf_pos - 1, NULL);
971 else if (f == RXL_EOD*2)
974 r = advance_one(st, i, 0, f,
975 st->buf_pos - 1, NULL);
980 /* REC_BACKREF cannot be handled generically
981 * by advance_one. We need to find the
982 * referenced string and match - or ask for more.
986 int len = get_capture(st, REC_CAPNUM(st->rxl[i]), 0,
989 prevpos = st->buf_pos;
990 while (len > 0 && st->buf_pos < st->buf_count) {
991 ch = st->buf[st->buf_pos++] & 0x1FFFFF;
992 if ((st->buf[start] & 0x1FFFFF) == ch) {
996 /* Failure to match */
1001 /* Need more input */
1002 st->buf_pos = prevpos;
1003 return RXL_CONTINUE;
1006 /* Successful match */
1007 do_link(st, i+1, NULL, st->buf_pos);
1013 /* st->pos has been advanced if needed */
1014 if (st->match > st->len) {
1015 st->len = st->match;
1016 st->start = st->total - st->buf_count;
1020 printf(" -- OK(%d %d %d/%d)\n", r,
1022 st->start, st->len);
1029 /* match failed - backtrack */
1030 if (st->record_count > 0) {
1031 st->record_count -= 1;
1032 st->pos = st->record[st->record_count].pos;
1033 st->buf_pos = st->record[st->record_count].len;
1036 printf(" -- NAK backtrack to %d/%d\n",
1037 st->pos, st->buf_pos);
1039 if (REC_ISFIRST(st->rxl[st->pos]))
1040 /* priority jump, just step forward now */
1041 do_link(st, st->pos+1, NULL, st->buf_pos);
1043 /* delayed jump, take it now */
1044 do_link(st, REC_ADDR(st->rxl[st->pos]),
1047 /* cannot backtrack, so skip first char unless anchored */
1050 printf(" -- NAK - %s\n",
1051 st->anchored?"abort":"fail");
1058 memmove(st->buf, st->buf+1,
1059 sizeof(st->buf[0]) * st->buf_count);
1061 do_link(st, 1, NULL, st->buf_pos);
1065 } while (st->buf_pos < st->buf_count);
1070 return RXL_CONTINUE;
1078 CapturePerBranch=16,
1081 struct parse_state {
1082 const char *patn safe;
1083 unsigned short *rxl;
1085 unsigned short *sets;
1086 int set; /* Next offset to store a set */
1087 enum modifier mod; /* Multiple 'or'ed together */
1090 /* Details of set currently being parsed */
1094 static void add_cmd(struct parse_state *st safe, unsigned short cmd)
1097 st->rxl[st->next] = cmd;
1101 static void relocate(struct parse_state *st safe, unsigned short start, int len)
1108 for (i = st->next-1; i >= start; i-=1) {
1109 unsigned short cmd = st->rxl[i];
1110 if (REC_ISFORK(cmd) &&
1111 REC_ADDR(cmd) >= start)
1113 st->rxl[i+len] = cmd;
1118 static wint_t cvt_hex(const char *s safe, int len)
1122 if (!*s || !isxdigit(*s))
1128 rv += *s - 'A' + 10;
1130 rv += *s - 'a' + 10;
1139 static wint_t cvt_oct(const char **sp safe, int maxlen)
1141 const char *s = *sp;
1144 if (!s || !*s || !isdigit(*s) || *s == '8' || *s == '9')
1155 static bool _add_range(struct parse_state *st safe, wchar_t start, wchar_t end,
1156 int plane, int *planes safe, int *newplane safe)
1161 unsigned short *set;
1165 /* guess 2 entries for each plane, plus 1
1166 * if we add a plane. Each plane needs an extra slot
1167 * if the set is inverted.
1169 for (p = (start & 0x1F0000)>>16;
1170 p <= (end & 0x1F0000)>>16 ;
1172 if (!((*planes) & (1 << p))) {
1178 /* All planes handled, so set *newplane beyond
1181 *newplane = 0x11 << 16;
1184 /* OK, for real this time, need to build up set 'plane' */
1185 if (start >= ((plane+1) << 16)) {
1186 /* Nothing to do for this plane, move to 'start' */
1187 *newplane = start >> 16;
1190 if (end < (plane << 16)) {
1191 /* nothing more to do */
1192 *newplane = 0x11 << 16;
1195 /* Contract range to this plane */
1196 if (start < (plane << 16))
1197 start = plane << 16;
1198 if (end >= ((plane+1) << 16))
1199 end = ((plane+1) << 16) - 1;
1200 if (!((*planes) & (1 << plane))) {
1201 st->sets[st->set] = (plane << 11);
1202 *planes |= 1 << plane;
1204 /* now clip to 16bit */
1208 /* Now insert range into the list.
1209 * 1/ Perform search for 'start'.
1210 * 2/ If at 'even' offset then not present yet.
1211 * 2a/ if 'start-1' is present, update that to end
1212 * 2b/ if next is <= end, update that to start
1213 * 2c/ otherwise shift up and insert range - done.
1214 * 3/ if at 'odd' offset then is in already
1215 * 3a/ if next is beyond 'end', then done
1216 * 3b/ otherwise update next to end
1217 * 4/ while ranges over-lap, delete two endpoints
1222 set = st->sets + st->set + 1;
1223 /* Binary search to find first entry that is greater
1226 lo = 0; /* Invar: No entry before lo is greater than target */
1227 hi = len; /* Invar: Every entry at or after hi is greater than target */
1229 int mid = (lo + hi) / 2;
1230 if (set[mid] > start)
1235 /* set[lo] == set[hi] = first entry greater than target.
1236 * If 'lo' is even, there was no match. If 'lo' is odd,
1239 if ((lo & 1) == 0) {
1240 /* Not yet present.*/
1241 if (lo > 0 && set[lo-1] == start) {
1242 /* Extend the earlier range */
1248 } else if (lo < len && set[lo] <= end+1)
1251 /* need to insert */
1252 memmove(set+lo+2, set+lo, sizeof(set[lo])*(len-lo));
1262 /* Already present, lo is end of a range, or beyond len */
1263 if (lo == len || set[lo] > end)
1264 /* nothing to do */;
1269 /* Lo now points to the end of a range. If it overlaps the next,
1272 while (lo+1 < len && set[lo] >= set[lo+1]) {
1273 /* Need to merge these ranges */
1275 if (set[lo] > set[lo+2])
1276 set[lo+2] = set[lo];
1277 memmove(set+lo, set+lo+2,
1278 sizeof(set[lo])*(len - (lo+2)));
1286 static bool add_range(struct parse_state *st safe, wchar_t start, wchar_t end,
1287 int plane, int *planes safe, int *newplane safe)
1289 if (!(st->mod & IgnoreCase) ||
1290 !iswalpha(start) || !iswalpha(end))
1291 return _add_range(st, start, end, plane, planes, newplane);
1292 if (!_add_range(st, towlower(start), towlower(end),
1293 plane, planes, newplane))
1295 return _add_range(st, towupper(start), towupper(end),
1296 plane, planes, newplane);
1299 static void add_class(struct parse_state *st safe, int plane, wctype_t cls)
1302 /* already handled. */
1306 memcpy(&st->sets[st->set + st->len + 1], &cls,
1308 st->len += wctype_cells;
1311 static bool is_set_element(const char *p safe)
1315 if (p[0] != '.' && p[0] != '=' && p[0] != ':')
1317 for (i = 1; p[i]; i++)
1319 if (p[i-1] == p[1] && i > 1)
1327 static int do_parse_set(struct parse_state *st safe, int plane)
1329 const char *p safe = st->patn;
1331 int newplane = 0xFFFFFF;
1334 /* first characters are special... */
1342 ch = get_utf8(&p, NULL);
1343 if (ch == '\\' && p[0] && strchr("0xuUnrft", p[0]) != NULL) {
1345 case '0': ch = cvt_oct(&p, 3); break;
1346 case 'x': ch = cvt_hex(p, 2); p += 2; break;
1347 case 'u': ch = cvt_hex(p, 4); p += 4; break;
1348 case 'U': ch = cvt_hex(p, 8); p += 8; break;
1349 case 't': ch = '\t'; break;
1350 case 'n': ch = '\n'; break;
1351 case 'r': ch = '\r'; break;
1352 case 'f': ch = '\f'; break;
1360 } else if (ch == '[' && is_set_element(p)) {
1362 case '.': /* collating set */
1363 case '=': /* collating element */
1367 case ':': /* character class */
1376 cls = strndup(p, e-p);
1382 while (*p && *p != ']')
1385 add_class(st, plane, wct);
1389 } else if (p[0] == '-' && p[1] != ']') {
1393 ch2 = get_utf8(&p, NULL);
1395 !add_range(st, ch, ch2, plane, &planes, &newplane))
1397 } else if (p[0] == ']' && (p == st->patn ||
1398 (invert && p == st->patn+1))) {
1399 if (!add_range(st, ch, ch, plane, &planes, &newplane))
1401 } else if (ch == '\\' && p[0] > 0 && p[0] < 0x7f && p[1] != '-'
1402 && strchr("daApsw", p[0]) != NULL) {
1404 case 'd': add_class(st, plane, wctype("digit")); break;
1405 case 'a': add_class(st, plane, wctype("lower")); break;
1406 case 'A': add_class(st, plane, wctype("upper")); break;
1407 case 'p': add_class(st, plane, wctype("punct")); break;
1408 case 's': add_class(st, plane, wctype("space")); break;
1409 case 'h': add_class(st, plane, wctype("blank")); break;
1410 case 'w': add_class(st, plane, wctype("alpha")); break;
1414 if (!add_range(st, ch, ch, plane, &planes, &newplane))
1417 } while (*p != ']');
1421 /* We have a (possibly empty) class list. Record size */
1422 unsigned short l = st->len;
1425 st->sets[st->set] = l;
1427 /* We have a set, not empty. Store size */
1428 st->sets[st->set] = st->len | (plane << 11);
1431 st->set += st->len+1;
1435 static bool parse_set(struct parse_state *st safe)
1441 if (*st->patn++ != '[')
1443 /* parse the set description multiple times if necessary
1444 * building up each sub table one at a time.
1445 * First time through we do classes, and report which set
1446 * to do next. Then again for each Unicode plane that
1448 * do_parse_set returns -1 on error, next plane number needed,
1449 * or a number larger than any valid plane number when done.
1450 * When pre-parsing to calculate sizes, we guess the sizes on a single
1451 * walk through - possibly over-estimating.
1454 plane = -1; /* Code for "parse classes" */
1458 plane = do_parse_set(st, plane);
1459 } while (plane >= 0 && plane <= 0x100000);
1463 st->sets[st->set] = 0;
1465 add_cmd(st, REC_SET | set);
1469 static unsigned short add_class_set(struct parse_state *st safe,
1470 char *cls safe, int in)
1474 /* Need the length header, a class, and another zero length */
1475 st->set += 1 + wctype_cells + 1;
1478 st->sets[set] = wctype_cells + (in ? 0 : 0x8000);
1480 add_class(st, -1, wctype(cls));
1481 st->sets[set + 1 + wctype_cells] = 0;
1482 st->set += 1 + wctype_cells + 1;
1483 return REC_SET | set;
1485 static bool parse_re(struct parse_state *st safe, int capture);
1486 static bool parse_atom(struct parse_state *st safe)
1488 /* parse out an atom: one of:
1495 * char - including UTF8
1497 * If there is a syntax error, return False, else return True;
1501 bool is_char = False;
1503 if (*st->patn == '\0')
1505 if (*st->patn == '.') {
1506 if (st->mod & SingleLine)
1507 add_cmd(st, REC_ANY);
1509 add_cmd(st, REC_ANY_NONL);
1513 if (*st->patn == '(') {
1515 if (!parse_re(st, 0))
1517 if (*st->patn != ')')
1522 if (*st->patn == '^') {
1523 add_cmd(st, REC_SOL);
1527 if (*st->patn == '$') {
1528 if (isdigit(st->patn[1])) {
1529 /* $n and $nn is a backref */
1530 ch = st->patn[1] - '0';
1532 if (isdigit(st->patn[0])) {
1533 ch = ch * 10 + st->patn[0] - '0';
1536 add_cmd(st, REC_BACKREF_MAKE(ch));
1539 add_cmd(st, REC_EOL);
1543 if (*st->patn == '[')
1544 return parse_set(st);
1545 if ((st->mod & LaxMatch) &&
1546 st->patn[0] == ' ' && st->patn[1] != ' ' && st->patn[1] != '\t' &&
1547 (st->next == 1 || (st->patn[-1] != ' ' && st->patn[-1] != '\t'))) {
1548 add_cmd(st, REC_LAXSPC);
1549 /* LAXSPC can be repeated */
1550 add_cmd(st, REC_FORKFIRST | (st->next - 1));
1554 if ((st->mod & LaxMatch) &&
1555 (st->patn[0] == '"' || st->patn[0] == '`' || st->patn[0] == '\'')) {
1556 add_cmd(st, REC_LAXQUOT);
1560 if ((st->mod & LaxMatch) &&
1561 (st->patn[0] == '-' || st->patn[0] == '_')) {
1562 add_cmd(st, REC_LAXDASH);
1566 if (*st->patn & 0x80) {
1567 ch = get_utf8(&st->patn, NULL);
1578 /* These just fall through and are interpreted
1595 /* These are simple translations */
1596 case '`': ch = REC_SOD; break;
1597 case '\'':ch = REC_EOD; break;
1598 case '=': ch = REC_POINT; break;
1599 case '<': ch = REC_SOW; break;
1600 case '>': ch = REC_EOW; break;
1601 case 'b': ch = REC_WBRK; break;
1602 case 'B': ch = REC_NOWBRK; break;
1603 case 't': ch = '\t'; break;
1604 case 'n': ch = '\n'; break;
1605 case 'r': ch = '\r'; break;
1606 case 'f': ch = '\f'; break;
1607 case '0': ch = cvt_oct(&st->patn, 4);
1611 case 'x': ch = cvt_hex(st->patn+1, 2);
1617 case 'u': ch = cvt_hex(st->patn+1, 4);
1623 case 'U': ch = cvt_hex(st->patn+1, 8);
1629 case '1'...'9': ch = st->patn[0] - '0';
1630 while (st->patn[1] >= '0' && st->patn[1] <= '9') {
1631 ch = ch * 10 + st->patn[1] - '0';
1634 ch = REC_BACKREF_MAKE(ch);
1636 case 'd': ch = add_class_set(st, "digit", 1); break;
1637 case 'D': ch = add_class_set(st, "digit", 0); break;
1638 case 's': ch = add_class_set(st, "space", 1); break;
1639 case 'S': ch = add_class_set(st, "space", 0); break;
1640 case 'h': ch = add_class_set(st, "blank", 1); break;
1641 case 'H': ch = add_class_set(st, "blank", 0); break;
1642 case 'w': ch = add_class_set(st, "alpha", 1); break;
1643 case 'W': ch = add_class_set(st, "alpha", 0); break;
1644 case 'p': ch = add_class_set(st, "punct", 1); break;
1645 case 'P': ch = add_class_set(st, "punct", 0); break;
1647 case 'a': ch = add_class_set(st, "lower", 1); break;
1648 case 'A': ch = add_class_set(st, "upper", 0); break;
1650 /* Anything else is an error (e.g. \0) or
1651 * reserved for future use.
1653 default: return False;
1656 if (ch < 0x8000 || !is_char) {
1658 } else if (ch <= 0x10FFFF) {
1659 add_cmd(st, REC_PLANE | (ch >> 16));
1660 add_cmd(st, ch & 0xFFFF);
1666 static bool parse_piece(struct parse_state *st safe)
1668 int start = st->next;
1675 if (!parse_atom(st))
1678 if (c != '*' && c != '+' && c != '?' &&
1679 !(c=='{' && isdigit(st->patn[1])))
1685 /* make spare for 'jump forward' */
1686 relocate(st, start, 1);
1687 if (st->patn[0] == '?') {
1689 /* non-greedy match */
1690 add_cmd(st, REC_FORKLAST | (start+1));
1692 st->rxl[start] = REC_FORKFIRST | st->next;
1694 add_cmd(st, REC_FORKFIRST | (start+1));
1696 st->rxl[start] = REC_FORKLAST | st->next;
1700 /* just (optional) jump back */
1701 if (st->patn[0] == '?') {
1704 add_cmd(st, REC_FORKLAST | start);
1706 add_cmd(st, REC_FORKFIRST | start);
1709 /* Just a jump-forward */
1710 relocate(st, start, 1);
1711 if (st->patn[0] == '?') {
1714 st->rxl[start] = REC_FORKFIRST | st->next;
1717 st->rxl[start] = REC_FORKLAST | st->next;
1720 case '{':/* Need a number, maybe a comma, if not maybe a number,
1721 * then }, and optionally a '?' after that.
1723 min = strtoul(st->patn, &ep, 10);
1724 if (min > 256 || !ep)
1731 max = strtoul(ep, &ep, 10);
1732 if (max > 256 || max < min || !ep)
1739 nongreedy = (REC_FORKLAST ^ REC_FORKFIRST);
1743 /* Atom need to be repeated min times, and maybe as many
1744 * as 'max', or indefinitely if max < 0
1747 /* Make a duplicate */
1748 int newstart = st->next;
1749 relocate(st, start, st->next - start);
1755 /* Need to allow the atom to be skipped */
1756 relocate(st, start, 1);
1758 st->rxl[start] = (REC_FORKLAST^nongreedy) | st->next;
1764 add_cmd(st, (REC_FORKFIRST^nongreedy) | start);
1765 } else if (max > 1) {
1766 /* need to duplicate atom but make each one optional */
1767 int len = st->next - start;
1768 int last = st->next + (len + 1) * (max-1);
1769 if (skip && st->rxl) {
1770 st->rxl[skip] = (REC_FORKLAST^nongreedy) | last;
1774 add_cmd(st, (REC_FORKLAST^nongreedy) | last);
1775 newstart = st->next;
1776 relocate(st, start, len+1);
1781 if (last != st->next) {
1794 static bool parse_branch(struct parse_state *st safe)
1797 if (!parse_piece(st))
1799 switch (*st->patn) {
1803 /* repeated modifier - illegal */
1806 } while (*st->patn && *st->patn != '|' && *st->patn != ')');
1811 * like 'strstr', but length is passed for both 'needle' and
1812 * 'haystack', and it also finds a match for a prefix of needle
1814 * This can be used to accelerate search for verbatim content.
1816 int rxl_fast_match(const char *needle safe, int nlen,
1817 const char *haystack safe, int hlen)
1821 while (hlen >= nlen) {
1825 while (haystack[i] == needle[i]) {
1835 /* Maybe a suffix of haystack is a prefix of needle */
1840 while (haystack[i] == needle[i]) {
1850 /* No match - return original hlen, when is now in ret */
1854 static int parse_prefix(struct parse_state *st safe)
1856 /* A prefix to an re starts with '?' and 1 or more
1857 * chars depending on what the next char is.
1858 * A () group that starts '?' is never captured.
1860 * ?: no-op. Only effect is to disable capture This applies
1862 * ?| groups in each branch use same capture numbering
1863 * ?0 All chars to end of pattern are literal matches. This cannot
1865 * ?N..: where N is a digit 1-9, it and all digits upto ':' give the
1866 * number of chars for literal pattern. This must be the whole re,
1867 * so either ) follows, or end-of-pattern.
1869 * Each letter represents a flag which is set if it appears before
1870 * '-' and cleared if it appears after. The '-' is optional, but
1872 * i ignore case in this group
1873 * L lax matching for space and hyphen in this group
1874 * s single-line. '.' matches newline in this group
1875 * n no capture in subgroups unless 'n' is explicitly re-enabled.
1876 * So if 'n' is already disabled, you need
1878 * to create a captured group
1879 * any other flag before the ':' causes an error
1884 * 1 good prefix, st->mod updated if needed.
1885 * 2 literal parsed, don't parse further for this re.
1892 if (*st->patn != '?')
1901 st->mod |= CapturePerBranch;
1911 for (; *s && *s != ':'; s++)
1917 st->mod |= IgnoreCase;
1919 st->mod &= ~IgnoreCase;
1922 st->mod |= LaxMatch;
1924 st->mod &= ~LaxMatch;
1927 st->mod |= SingleLine;
1929 st->mod &= ~SingleLine;
1932 st->mod &= ~DoCapture;
1934 st->mod |= DoCapture;
1945 verblen = strtoul(s, &ve, 10);
1946 if (!ve || *ve != ':')
1955 while (verblen && *s) {
1958 ch = get_utf8(&s, NULL);
1970 static bool parse_re(struct parse_state *st safe, int capture_flag)
1974 int save_mod = st->mod;
1976 int capture = st->capture;
1977 int capture_start, capture_max;
1978 int do_capture = st->mod & DoCapture;
1980 switch (parse_prefix(st)) {
1981 case -1: /* error */
1986 /* Don't capture if inside () */
1987 do_capture = capture_flag;
1990 /* Fully parsed - no more is permitted */
1993 if ((st->mod ^ save_mod) & IgnoreCase) {
1994 /* change of ignore-case status */
1995 if (st->mod & IgnoreCase)
1996 add_cmd(st, REC_IGNCASE);
1998 add_cmd(st, REC_USECASE);
2002 add_cmd(st, REC_CAPTURE | capture);
2005 capture_start = capture_max = st->capture;
2006 start = re_start = st->next;
2008 while ((ret = parse_branch(st)) != 0 && *st->patn == '|') {
2010 relocate(st, start, 1);
2012 st->rxl[start] = REC_FORKLAST | (st->next + 2);
2013 add_cmd(st, REC_FORKLAST | start); /* will become 'jump to end' */
2014 add_cmd(st, REC_NONE);
2016 if (st->mod & CapturePerBranch) {
2017 /* Restart capture index each time */
2018 if (st->capture > capture_max)
2019 capture_max = st->capture;
2020 st->capture = capture_start;
2023 if (ret && st->rxl) {
2024 /* Need to patch all the "jump to end" links */
2025 while (start > re_start) {
2026 unsigned short cmd = st->rxl[start - 2];
2027 st->rxl[start - 2] = REC_FORKLAST | st->next;
2028 start = REC_ADDR(cmd);
2031 if (st->mod & CapturePerBranch && st->capture < capture_max)
2032 st->capture = capture_max;
2034 add_cmd(st, REC_CAPTURED | capture);
2035 if ((st->mod ^ save_mod) & IgnoreCase) {
2036 /* restore from char of ignore-case status */
2037 if (save_mod & IgnoreCase)
2038 add_cmd(st, REC_IGNCASE);
2040 add_cmd(st, REC_USECASE);
2046 unsigned short *rxl_parse(const char *patn safe, int *lenp, int nocase)
2048 struct parse_state st;
2050 st.mod = nocase ? IgnoreCase | LaxMatch : 0;
2051 st.mod |= DoCapture;
2058 add_cmd(&st, REC_IGNCASE);
2059 if (!parse_re(&st, DoCapture) || *st.patn != '\0') {
2061 *lenp = st.patn - patn;
2064 add_cmd(&st, REC_MATCH);
2065 st.rxl = malloc((st.next + st.set) * sizeof(st.rxl[0]));
2066 st.rxl[0] = st.next;
2067 st.sets = st.rxl + st.next;
2069 st.mod = nocase ? IgnoreCase | LaxMatch : 0;
2070 st.mod |= DoCapture;
2075 add_cmd(&st, REC_IGNCASE);
2076 if (!parse_re(&st, DoCapture)) {
2084 add_cmd(&st, REC_MATCH);
2088 unsigned short *safe rxl_parse_verbatim(const char *patn safe, int nocase)
2090 struct parse_state st;
2093 st.next = 1 + !!nocase + strlen(patn) + 1;
2094 st.rxl = malloc(st.next * sizeof(st.rxl[0]));
2095 st.rxl[0] = st.next;
2098 add_cmd(&st, REC_IGNCASE);
2099 while ((ch = get_utf8(&patn, NULL)) < WERR)
2101 add_cmd(&st, REC_MATCH);
2105 int rxl_prefix(unsigned short *rxl safe, char *ret safe, int max)
2107 /* Return in ret a contant prefix of the search string,
2108 * if there is one. Report the number of bytes.
2110 int len = RXL_PATNLEN(rxl);
2114 for (i = 1; i < len; i++) {
2115 if (REC_ISCHAR(rxl[i])) {
2116 int l = utf8_bytes(rxl[i]);
2117 if (l == 0 || found + l > max)
2120 put_utf8(ret + found, rxl[i]);
2124 if (REC_ISPLANE(rxl[i]) && i+1 < len) {
2125 int ch = REC_PLANE_MAKE(rxl[i], rxl[i+1]);
2126 int l = utf8_bytes(ch);
2128 if (l == 0 || found + l > max)
2131 put_utf8(ret + found, ch);
2135 if (rxl[i] >= REC_SOL && rxl[i] <= REC_POINT)
2136 /* zero-width match doesn't change prefix */
2138 if (rxl[i] == REC_USECASE)
2139 /* Insisting on case-correct doesn't change prefix */
2141 if (REC_ISCAPTURE(rxl[i]))
2143 /* Nothing more that is fixed */
2149 struct match_state *safe rxl_prepare(unsigned short *rxl safe, int flags)
2151 struct match_state *st;
2152 int len = RXL_PATNLEN(rxl);
2157 st = malloc(sizeof(*st));
2158 memset(st, 0, sizeof(*st));
2160 st->anchored = flags & RXLF_ANCHORED;
2161 st->backtrack = flags & RXLF_BACKTRACK;
2162 if (!st->backtrack) {
2163 st->link[0] = malloc(len * sizeof(unsigned short) * 4);
2164 st->link[1] = st->link[0] + len;
2165 st->leng[0] = st->link[1] + len;
2166 st->leng[1] = st->leng[0] + len;
2168 st->ignorecase = calloc(BITSET_SIZE(len), sizeof(*st->ignorecase));
2171 if (!st->backtrack) {
2175 for (i = 1; i < len; i++) {
2176 if (rxl[i] == REC_IGNCASE)
2178 if (rxl[i] == REC_USECASE)
2181 set_bit(i, st->ignorecase);
2182 if (!st->backtrack) {
2183 st->link[0][i] = NO_LINK;
2184 st->link[1][i] = NO_LINK;
2188 /* Set linkage to say we have a zero-length match
2189 * at the start state.
2193 do_link(st, 1, &eol, 0);
2198 void rxl_free_state(struct match_state *s)
2202 free(s->ignorecase);
2209 static int get_capture(struct match_state *s safe, int n, int which,
2210 char *buf, int *startp)
2212 int bufp = 0; /* Index in s->buf */
2213 int rxlp = 1; /* Index in s->rxl */
2214 int recp = 0; /* Index in s->rec */
2215 int rxll = RXL_PATNLEN(s->rxl);
2219 while (rxlp < rxll && recp <= s->record_count) {
2220 unsigned short cmd = s->rxl[rxlp];
2222 if (REC_ISFORK(cmd)) {
2223 bool recorded = False;
2224 if (recp < s->record_count &&
2225 s->record[recp].pos == rxlp &&
2226 s->record[recp].len == bufp) {
2233 /* This fork was taken */
2234 rxlp = REC_ADDR(cmd);
2236 /* Fork not taken */
2240 if (REC_ISCAPTURE(cmd) &&
2241 REC_CAPNUM(cmd) == n) {
2242 if (REC_CAPEND(cmd)) {
2251 if (rec_zerowidth(cmd)) {
2262 if (len > 0 && buf) {
2264 for (i = 0; i < len; i++)
2265 buf[i] = s->buf[start+i] & 0xff;
2271 int rxl_capture(struct match_state *st safe, int cap, int which,
2272 int *startp safe, int *lenp safe)
2274 int len = get_capture(st, cap, which, NULL, startp);
2280 static int interp(struct match_state *s safe, const char *form safe, char *buf)
2288 if (form[0] != '\\') {
2295 if (form[1] == '\\') {
2302 /* must be \NN to get last of group NN or
2303 * \:NN:MM to get MMth of grop NN
2305 if (isdigit(form[1])) {
2306 which = strtoul(form+1, &e, 10);
2307 } else if (form[1] == ':' && isdigit(form[2])) {
2308 which = strtoul(form+2, &e, 10);
2309 if (!e || e[0] != ':' || !isdigit(e[1]))
2311 n = strtoul(e+2, &e, 10);
2315 n = get_capture(s, which, n, buf ? buf+len : buf, NULL);
2326 char *rxl_interp(struct match_state *s safe, const char *form safe)
2333 size = interp(s, form, NULL);
2336 ret = malloc(size+1);
2337 interp(s, form, ret);
2343 static void printc(unsigned int c)
2345 if (c <= ' ' || c >= 0x7f)
2346 printf("\\x%02x", c);
2351 static void print_set(unsigned short *set safe)
2354 int invert = len & 0x8000;
2360 printf("[{%d}", sizeof(wctype_t));
2363 memcpy(&class, set, sizeof(wctype_t));
2364 set += wctype_cells;
2365 len -= wctype_cells;
2366 printf(":%lx", class);
2370 while ((len = *set++) != 0) {
2371 unsigned int plane = (len & 0xF800) << 5;
2373 printf("%d:[", len);
2375 printc(*set | plane);
2379 printc(set[0] + plane);
2391 void rxl_print(unsigned short *rxl safe)
2393 unsigned short *set = RXL_SETSTART(rxl);
2396 for (i = rxl+1 ; i < set; i++) {
2397 unsigned int cmd = *i;
2399 printf("%04u: ", i-rxl);
2400 if (REC_ISCHAR(cmd))
2401 printf("match %s (#%x)\n", put_utf8(t, cmd), cmd);
2402 else if (REC_ISSPEC(cmd)) {
2404 case REC_ANY: printf("match ANY\n"); break;
2405 case REC_ANY_NONL: printf("match ANY-non-NL\n"); break;
2406 case REC_NONE: printf("DEAD END\n"); break;
2407 case REC_SOL: printf("match start-of-line\n"); break;
2408 case REC_EOL: printf("match end-of-line\n"); break;
2409 case REC_SOW: printf("match start-of-word\n"); break;
2410 case REC_EOW: printf("match end-of-word\n"); break;
2411 case REC_SOD: printf("match start-of-doc\n"); break;
2412 case REC_EOD: printf("match end-of-doc\n"); break;
2413 case REC_POINT: printf("match focus-point\n"); break;
2414 case REC_MATCH: printf("MATCHING COMPLETE\n"); break;
2415 case REC_WBRK: printf("match word-break\n"); break;
2416 case REC_NOWBRK: printf("match non-wordbreak\n"); break;
2417 case REC_LAXSPC: printf("match lax-space\n"); break;
2418 case REC_LAXQUOT: printf("match lax-quote\n"); break;
2419 case REC_LAXDASH: printf("match lax-dash\n"); break;
2420 case REC_IGNCASE: printf("switch ignore-case\n"); break;
2421 case REC_USECASE: printf("switch use-case\n"); break;
2422 default: printf("ERROR %x\n", cmd); break;
2424 } else if (REC_ISPLANE(cmd) && i + 1 < set) {
2425 cmd = REC_PLANE_MAKE(cmd, i[1]);
2427 printf("match %s (#%x)\n", put_utf8(t, cmd), cmd);
2428 } else if (REC_ISFORK(cmd))
2429 printf("branch to %d %s\n", REC_ADDR(cmd),
2430 REC_ISFIRST(cmd) ? "first" : "later");
2431 else if (REC_ISSET(cmd)) {
2432 printf("Match from set %d: ", REC_ADDR(cmd));
2433 print_set(set + REC_ADDR(cmd));
2435 } else if (REC_ISBACKREF(cmd)) {
2436 printf("Backref to capture %d\n", REC_CAPNUM(cmd));
2437 } else if (REC_ISCAPTURE(cmd) && !REC_CAPEND(cmd)) {
2438 printf("Start Capture %d\n", REC_CAPNUM(cmd));
2439 } else if (REC_ISCAPTURE(cmd)) {
2440 printf("End Capture %d\n", REC_CAPNUM(cmd));
2442 printf("ERROR %x\n", cmd);
2452 static struct test {
2453 char *patn, *target;
2454 int flags, start, len;
2455 char *form, *replacement;
2457 { "abc", "the abc", 0, 4, 3},
2458 { "abc\\'", "the abc", 0, 4, 3},
2459 { "\\`abc", "the abc", 0, -1, -1},
2460 { "abc\\=", "abcabcPointabc", 0, 3, 3},
2461 { "abc", "the ABC", F_ICASE, 4, 3},
2462 { "a*", " aaaaac", 0, 0, 0},
2463 { "a*", "aaaaac", 0, 0, 5},
2464 { "a*", "AaAaac", F_ICASE, 0, 5},
2465 { "a+", " aaaaac", 0, 1, 5},
2466 { "?0()+*", " (()+***", 0, 2, 4},
2467 { ".(?2:..).", "abcdefg..hijk", 0, 6, 4},
2468 { "?L:1 2", "hello 1 \t\n 2", 0, 6, 6},
2469 { "?s:a.c", "a\nc abc", 0, 0, 3},
2470 { "?-s:a.c", "a\nc abc", 0, 4, 3},
2471 { "ab[]^-]*cd", "xyab-^^]-cd", 0, 2, 9},
2473 { "?i:(from|to|cc|subject|in-reply-to):", "From: bob", 0, 0, 5 },
2474 // Inverting set of multiple classes
2475 { "[^\\A\\a]", "a", 0, -1, -1},
2476 // Search for start of a C function: non-label at start of line
2477 { "^([^ a-zA-Z0-9#]|[\\A\\a\\d_]+[\\s]*[^: a-zA-Z0-9_])", "hello: ",
2479 // Match an empty string
2480 { "[^a-z]*", "abc", 0, 0, 0},
2481 { "^[^a-z]*", "abc", 0, 0, 0},
2482 // repeated long string - should match longest
2483 { "(hello |there )+", "I said hello there hello to you", 0, 7, 18},
2484 // pattern not complete
2485 { "extra close paren)", "anyting", F_PERR, -2, -2},
2487 { "\\bab", "xyabc acab abc a", 0, 11, 2},
2488 { "\\<ab", "xyabc acab abc a", 0, 11, 2},
2489 { "ab\\>", "ababa abaca ab a", 0, 12, 2},
2490 { "ab\\b", "ababa abaca ab", 0, 12, 2},
2491 { "\\Bab", "ababa abaca ab", 0, 2, 2},
2492 { "[0-9].\\Bab", "012+ab 45abc", 0, 7, 4},
2493 { "[\\060-\\x09].\\Bab", "012+ab 45abc", 0, 7, 4},
2495 { "[\\0101\\0102\\x43\\u0064]*", "ABCddCBA1234", 0, 0, 8},
2496 { "\\011", "a\tb", 0, 1, 1},
2498 { "[\t\r\f\n]*\t\r\f\n", "trfn\t\r\f\n\t\r\f\n", 0, 4, 8},
2499 // backref for backtracking only
2500 { "(.(.).)\\1", "123123", F_BACKTRACK, 0, 6},
2501 // backre must skip partial match
2502 { "a(bcdef)$1", "abcdefbc abcdefbcdefg", F_BACKTRACK, 9, 11},
2504 { "hello there-all", "Hello\t There_ALL-youse", F_ICASE, 0, 17},
2505 { "hello there-all", "Hello\t There_ALL-youse", 0, -1, -1},
2506 { "^[^a-zA-Z0-9\n]*$", "=======", 0, 0, 7},
2507 { "^$", "", 0, 0, 0},
2508 { "a\\S+b", " a b axyb ", 0, 5, 4},
2509 { "a[^\\s]+b", " a b axyb ", 0, 5, 4},
2510 { "a[^\\s123]+b", " a b a12b axyb ", 0, 10, 4},
2511 { "([\\w\\d]+)\\s*=\\s*(.*[^\\s])", " name = some value ", 0, 1, 17,
2512 "\\1,\\2", "name,some value"},
2513 { "\\brl([123]*|end)\\b", " ->rlend = ", 0, 3, 5,
2514 "->ri\\1 = ", "->riend = "},
2515 { "?|foo(bar)|(bat)foo", "foobar", 0, 0, 6, "\\1", "bar"},
2516 { "?|foo(bar)|(bat)foo", "batfoo", 0, 0, 6, "\\1", "bat"},
2517 // compare greedy and non-greedy
2518 { "(.*)=(.*)", "assign=var=val", 0, 0, 14, "\\1..\\2", "assign=var..val"},
2519 { "(.*?)=(.*)", "assign=var=val", 0, 0, 14, "\\1..\\2", "assign..var=val"},
2520 { "(.+)=(.*)", "assign=var=val", 0, 0, 14, "\\1..\\2", "assign=var..val"},
2521 { "(.+?)=(.*)", "assign=var=val", 0, 0, 14, "\\1..\\2", "assign..var=val"},
2522 { "(.{5,15})=(.*)", "assign=var=val", 0, 0, 14, "\\1..\\2", "assign=var..val"},
2523 { "(.{5,15}?)=(.*)", "assign=var=val", 0, 0, 14, "\\1..\\2", "assign..var=val"},
2524 { "(.?)[a-e]*f", "abcdef", 0, 0, 6, "\\1", "a"},
2525 { "(.?""?)[a-e]*f", "abcdef", 0, 0, 6, "\\1", ""},
2526 { "diff|(stg|git).*show", "git diff", 0, 4, 4},
2527 // \h matches space but not newline
2528 { "spa\\hce", "spa\nce spa ce", 0, 7, 6},
2529 // \s matches newline
2530 { "spa\\sce", "spa\nce spa ce", 0, 0, 6},
2531 { "x🗑x\U0001f5d1\\U0001f5d1x[x\U0001f5d1y]x", "x🗑x🗑🗑x🗑x", 0, 0, 8},
2534 static void run_tests(bool trace)
2536 int cnt = sizeof(tests) / sizeof(tests[0]);
2540 for (i = 0, alg = 0;
2542 i < cnt-1 ? i++ : (i = 0, alg++)) {
2544 int f = tests[i].flags;
2545 char *patn = tests[i].patn;
2546 const char *target = tests[i].target;
2547 unsigned short *rxl;
2552 struct match_state *st;
2555 printf("Match %s against %s using %s\n", patn, target,
2556 alg ? "backtrack" : "parallel");
2558 rxl = rxl_parse_verbatim(patn, f & F_ICASE);
2560 rxl = rxl_parse(patn, &len, f & F_ICASE);
2564 printf("Parse error as expected\n\n");
2567 printf("test %d(%s): Parse error at %d\n",
2572 printf("test %d(%s): No parse error found\n", i, patn);
2578 st = rxl_prepare(rxl, alg ? RXLF_BACKTRACK : 0);
2581 flags = RXL_SOL|RXL_SOD;
2584 wint_t wc = get_utf8(&target, NULL);
2587 if (iswalnum(prev) && !iswalnum(wc))
2589 else if (!iswalnum(prev) && iswalnum(wc))
2592 flags |= RXL_NOWBRK;
2593 if (wc == 'P' && strstarts(target, "oint"))
2596 r = rxl_advance(st, wc | flags);
2599 } while (r != RXL_DONE);
2601 flags |= RXL_EOL|RXL_EOD;
2604 rxl_advance(st, flags);
2608 rxl_info(st, &mlen, NULL, &mstart, NULL);
2609 if ((f & F_BACKTRACK) && !alg) {
2610 /* This is only expected to work for backtracking */
2611 if (mstart != -1 || mlen != -1) {
2612 printf("test %d(%s) %s: succeeded unexpectedly %d/%d\n",
2613 i, patn, "parallel",
2616 } else if (tests[i].start != mstart ||
2617 tests[i].len != mlen) {
2618 printf("test %d(%s) %s: found %d/%d instead of %d/%d\n",
2620 alg ? "backtracking" : "parallel",
2621 mstart, mlen, tests[i].start, tests[i].len);
2624 if (alg && tests[i].form && tests[i].replacement) {
2628 printf("Backtrack:");
2629 for (j=0; j<st->record_count; j++)
2630 printf(" (%d,%d)", st->record[j].pos,
2634 new = rxl_interp(st, tests[i].form);
2635 if (!new || strcmp(new, tests[i].replacement) != 0) {
2636 printf("test %d(%s) replace is <%s>, not <%s>\n",
2638 new, tests[i].replacement);
2647 static void usage(void)
2649 fprintf(stderr, "Usage: rexel -itvB pattern target\n");
2650 fprintf(stderr, " : rexel -itvB -f pattern file\n");
2651 fprintf(stderr, " : rexel -T # run tests\n");
2652 fprintf(stderr, " : -i=ignore case, -t=trace -v=verbatim pattern, -B=backtrack\n");
2655 int main(int argc, char *argv[])
2657 unsigned short *rxl;
2658 struct match_state *st;
2667 int ignore_case = 0;
2672 const char *patn, *target, *t;
2676 while ((opt = getopt(argc, argv, "itvTfB")) > 0)
2679 use_file = 1; break;
2681 ignore_case = 1; break;
2683 verbatim = 1; break;
2685 trace = True; break;
2687 backtrack = RXLF_BACKTRACK; break;
2690 printf("All tests passed successfully\n");
2697 if (optind + 2 != argc) {
2701 patn = argv[optind];
2703 int fd = open(argv[optind+1], O_RDONLY);
2710 target = mmap(NULL, stb.st_size, PROT_READ, MAP_SHARED, fd, 0);
2713 target = argv[optind+1];
2715 setlocale(LC_ALL, "");
2716 setlocale(LC_CTYPE, "enUS.UTF-8");
2719 rxl = rxl_parse_verbatim(patn, ignore_case);
2721 rxl = rxl_parse(patn, &len, ignore_case);
2724 printf("Failed to parse: %s at %s\n", patn, patn+len);
2729 plen = rxl_prefix(rxl, prefix, sizeof(prefix)-1);
2731 printf("Prefix: \"%.*s\"\n", plen, prefix);
2733 printf("No static prefix\n");
2735 st = rxl_prepare(rxl, backtrack);
2738 flags = RXL_SOL|RXL_SOD;
2740 wint_t wc = get_utf8(&t, NULL);
2742 rxl_advance(st, RXL_EOL|RXL_EOD);
2745 r = rxl_advance(st, wc | flags);
2748 } while (r != RXL_DONE);
2749 rxl_info(st, &thelen, NULL, &start, NULL);
2751 printf("No match\n");
2755 const char *tstart, *tend;
2758 tstart = target + start;
2759 while (tstart > target && tstart[-1] != '\n')
2762 while (*tend && *tend != '\n')
2765 tend = tstart + strlen(tstart);
2767 printf("%0.*s\n", tend-tstart, tstart);
2769 ccnt = tstart-target;
2770 while ((wc = get_utf8(&t, NULL)) < WERR) {
2773 else if (ccnt == start)
2775 else if (ccnt < start + thelen)