]> git.neil.brown.name Git - edlib.git/blob - lib-search.c
lib-search: add "reverse" command for search commands.
[edlib.git] / lib-search.c
1 /*
2  * Copyright Neil Brown ©2015-2023 <neil@brown.name>
3  * May be distributed under terms of GPLv2 - see file:COPYING
4  *
5  * Searching.
6  * "text-search" command searches from given mark until it
7  * finds the given pattern or end of buffer.
8  * If the pattern is found, then 'm' is left at the extremity of
9  * the match in the direction of search: so the start if search backwards
10  * or the end if searching forwards.
11  * The returned value is the length of the match + 1, or an Efail
12  * In the case of an error, the location of ->mark is undefined.
13  * If mark2 is given, don't go beyond there.
14  *
15  * "text-match" is similar to text-search forwards, but requires that
16  * the match starts at ->mark.  ->mark is moved to the end of the
17  * match if the text does, in fact, match.
18  * If the match fails, Efalse is returned (different to "text-search")
19  */
20
21 #include <stdlib.h>
22 #include <wctype.h>
23 #include <string.h>
24 #include "core.h"
25 #include "rexel.h"
26
27 struct search_state {
28         struct match_state *st safe;
29         struct mark *end;
30         struct mark *endmark;
31         struct mark *point;
32         wint_t prev_ch;
33         bool prev_point;
34         struct command c;
35         char prefix[64];
36         int prefix_len;
37         bool anchor_at_end;
38
39         unsigned short *rxl safe;
40 };
41
42 static void state_free(struct command *c safe)
43 {
44         struct search_state *ss = container_of(c, struct search_state, c);
45
46         free(ss->rxl);
47         rxl_free_state(ss->st);
48         mark_free(ss->end);
49         mark_free(ss->endmark);
50         mark_free(ss->point);
51         free(ss);
52 }
53
54 static int is_word(wint_t ch)
55 {
56         return ch == '_' || iswalnum(ch);
57 }
58
59 /*
60  * 'search_test' together with 'stuct search_state' encapsulates
61  * a parsed regexp and some matching state.  If called as 'consume'
62  * (or anything starting 'c') it processes one char into the match
63  * and returns 1 if it is worth providing more characters.
64  * Other options for ci->key are:
65  * - reinit - state is re-initialised with flags from ->num, end and
66  *            endmark from ->mark and ->mark2
67  * - getinfo - extract total, start, len, since-start from match
68  * - getcapture - get "start" or "len" for a capture in ->num
69  * - interp - interpolate \N captures in ->str
70  */
71 DEF_CB(search_test)
72 {
73         struct search_state *ss = container_of(ci->comm,
74                                                struct search_state, c);
75
76         if (ci->key[0] == 'c') {
77                 /* consume */
78                 wint_t wch = ci->num & 0xFFFFF;
79                 wint_t flags = 0;
80                 int maxlen, since_start;
81                 enum rxl_found found;
82                 int anchored;
83
84                 if ((unsigned int)ci->num == WEOF) {
85                         wch = 0;
86                         flags |= RXL_EOD;
87                 }
88                 if (ss->prev_ch == WEOF)
89                         flags |= RXL_SOD;
90                 if (is_eol(ss->prev_ch) || ss->prev_ch == WEOF ||
91                     ss->prev_ch == 0)
92                         flags |= RXL_SOL;
93                 switch (is_word(ss->prev_ch) * 2 + is_word(wch)) {
94                 case 0: /* in space */
95                 case 3: /* within word */
96                         flags |= RXL_NOWBRK;
97                         break;
98                 case 1: /* start of word */
99                         flags |= RXL_SOW;
100                         break;
101                 case 2: /* end of word */
102                         flags |= RXL_EOW;
103                         break;
104                 }
105                 if (is_eol(wch))
106                         flags |= RXL_EOL;
107                 if (ss->prev_point) {
108                         flags |= RXL_POINT;
109                         ss->prev_point = False;
110                 }
111                 if (ss->point && ci->mark && mark_same(ss->point, ci->mark))
112                         /* Need to assert POINT before next char */
113                         ss->prev_point = True;
114
115                 found = rxl_advance(ss->st, wch | flags);
116                 anchored = rxl_info(ss->st, &maxlen, NULL, NULL, &since_start);
117
118                 if (found >= RXL_MATCH && ss->endmark && ci->mark &&
119                     since_start - maxlen <= 1) {
120                         mark_to_mark(ss->endmark, ci->mark);
121                         if (found == RXL_MATCH_FLAG)
122                                 doc_prev(ci->home, ss->endmark);
123                 }
124                 if (ss->end && ci->mark &&
125                     mark_ordered_not_same(ss->end, ci->mark)) {
126                         /* Mark is *after* the char, so if end and mark
127                          * are the same, we haven't passed the 'end' yet,
128                          * and it is too early to abort.  Hence 'not' above
129                          */
130                         if (ss->anchor_at_end) {
131                                 found = rxl_advance(ss->st, RXL_ANCHOR);
132                                 anchored = 1;
133                         } else
134                                 return Efalse;
135                 }
136                 if (found == RXL_DONE)
137                         /* No match here */
138                         return Efalse;
139                 if (!anchored && ci->str &&
140                     ss->prefix_len && ci->num2 > ss->prefix_len) {
141                         /* It is worth searching for the prefix to improve
142                          * search speed.
143                          */
144                         int pstart = rxl_fast_match(ss->prefix, ss->prefix_len,
145                                                     ci->str, ci->num2);
146                         /* This may not be a full match even for the prefix,
147                          * but it is a good place to skip to.
148                          * If there was no match, pstart is ci->num2,
149                          * so we skip the entire chunk.
150                          * We reposition to just before the possible match
151                          * so that ->end processesing is handled before the
152                          * match can start.
153                          */
154                         if (pstart > 1)
155                                 pstart = utf8_round_len(ci->str, pstart - 1);
156                         if (pstart > 0) {
157                                 const char *s;
158                                 int prev = utf8_round_len(ci->str, pstart - 1);
159                                 s = ci->str + prev;
160                                 ss->prev_ch = get_utf8(&s, NULL);
161                                 return pstart + 1;
162                         }
163                 }
164                 ss->prev_ch = wch;
165                 return 1;
166         }
167         if (strcmp(ci->key, "reinit") == 0) {
168                 rxl_free_state(ss->st);
169                 ss->st = rxl_prepare(ss->rxl, ci->num & 3);
170                 ss->prev_ch = (unsigned int)ci->num2 ?: WEOF;
171                 mark_free(ss->end);
172                 mark_free(ss->endmark);
173                 if (ci->mark) {
174                         ss->end = mark_dup(ci->mark);
175                         ss->anchor_at_end = ci->num & 4;
176                 } else
177                         ss->end = NULL;
178                 if (ci->mark2)
179                         ss->endmark = mark_dup(ci->mark2);
180                 else
181                         ss->endmark = NULL;
182                 return 1;
183         }
184         if (strcmp(ci->key, "setpoint") == 0 && ci->mark) {
185                 mark_free(ss->point);
186                 ss->point = mark_dup(ci->mark);
187                 return 1;
188         }
189         if (strcmp(ci->key, "getinfo") == 0 && ci->str) {
190                 int len, total, start, since_start;
191                 rxl_info(ss->st, &len, &total, &start, &since_start);
192                 if (strcmp(ci->str, "len") == 0)
193                         return len < 0 ? Efalse : len+1;
194                 if (strcmp(ci->str, "total") == 0)
195                         return total+1;
196                 if (strcmp(ci->str, "start") == 0)
197                         return start < 0 ? Efalse : start + 1;
198                 if (strcmp(ci->str, "since-start") == 0)
199                         return since_start < 0 ? Efalse : since_start + 1;
200                 return Einval;
201         }
202         if (strcmp(ci->key, "getcapture") == 0 && ci->str) {
203                 int start, len;
204                 if (rxl_capture(ss->st, ci->num, ci->num2, &start, &len)) {
205                         if (strcmp(ci->str, "start") == 0)
206                                 return start + 1;
207                         if (strcmp(ci->str, "len") == 0)
208                                 return len + 1;
209                         return Einval;
210                 }
211                 return Efalse;
212         }
213         if (strcmp(ci->key, "interp") == 0 && ci->str) {
214                 char *ret;
215                 ret = rxl_interp(ss->st, ci->str);
216                 comm_call(ci->comm2, "cb", ci->focus, 0, NULL, ret);
217                 free(ret);
218                 return 1;
219         }
220         if (strcmp(ci->key, "reverse") == 0) {
221                 /* Search backward from @mark in @focus for a match, or
222                  * until we hit @mark2.  Leave @mark at the start of the
223                  * match.  Return length of match, or negative.
224                  *
225                  * rexel only lets us search forwards, and stepping back one
226                  * char at a time to match the pattern is too slow.
227                  * So we step back a steadily growing number of chars and search
228                  * forward as far as the previous location.  Once we
229                  * find any match, we check if there is a later one that
230                  * still satisfies.
231                  */
232                 int step_size = 65536;
233                 int maxlen;
234                 int ret = -1;
235                 struct mark *m, *start, *end, *endmark;
236                 struct mark *m2 = ci->mark2;
237                 struct pane *p = ci->focus;
238
239                 if (!ci->mark)
240                         return Enoarg;
241                 m = mark_dup(ci->mark); /* search cursor */
242                 start = mark_dup(ci->mark); /* start of the range being searched */
243                 end = mark_dup(ci->mark); /* end of the range being searched */
244
245                 ss->end = end;
246                 if (!ss->endmark)
247                         ss->endmark = ci->mark;
248                 endmark = ss->endmark;
249                 ss->anchor_at_end = True;
250
251                 pane_set_time(p);
252
253                 while (!m2 || m2->seq < start->seq) {
254                         mark_to_mark(end, start);
255                         call("doc:char", p, -step_size, start, NULL, 0, m2);
256                         if (mark_same(start, end))
257                                 /* We have hit the start(m2), don't continue */
258                                 break;
259                         step_size *= 2;
260                         ss->prev_ch = doc_prior(p, start);
261                         ss->st = rxl_prepare(ss->rxl, 0);
262                         ss->prev_point = ss->point ? mark_same(ss->point, m) : False;
263
264                         mark_to_mark(m, start);
265                         call_comm("doc:content", p, &ss->c, 0, m);
266                         rxl_info(ss->st, &maxlen, NULL, NULL, NULL);
267                         rxl_free_state(ss->st);
268                         if (maxlen >= 0) {
269                                 /* found a match */
270                                 ret = maxlen;
271                                 break;
272                         }
273
274                         if (pane_too_long(p, 2000)) {
275                                 /* FIXME returning success is wrong if
276                                  * we timed out But I want to move the
277                                  * point, and this is easiest.  What do
278                                  * I really want here?  Do I just need
279                                  * to make reverse search faster?
280                                  */
281                                 mark_to_mark(endmark, start);
282                                 ret = 0;
283                                 break;
284                         }
285                 }
286                 while (maxlen >= 0) {
287                         /* There is a match starting at 'endmark'.
288                          * The might be a later match - check for it.
289                          */
290                         call("doc:char", p, -maxlen, ss->endmark);
291                         if (mark_ordered_not_same(end, ss->endmark))
292                                 break;
293                         ret = maxlen;
294                         if (endmark != ss->endmark &&
295                             mark_ordered_or_same(ss->endmark, endmark))
296                                 /* Didn't move forward!!  Presumably
297                                  * buggy doc:step implementation.
298                                  */
299                                 break;
300
301                         mark_to_mark(endmark, ss->endmark);
302                         ss->endmark = m;
303                         mark_to_mark(start, endmark);
304                         ss->prev_ch = doc_next(p, start);
305                         ss->st = rxl_prepare(ss->rxl, 0);
306                         call_comm("doc:content", p, &ss->c, 0, start);
307                         rxl_info(ss->st, &maxlen, NULL, NULL, NULL);
308                         rxl_free_state(ss->st);
309                 }
310                 mark_free(start);
311                 mark_free(end);
312                 mark_free(m);
313                 return ret;
314         }
315         return Efail;
316 }
317
318 static int search_forward(struct pane *p safe,
319                           struct mark *m safe, struct mark *m2,
320                           struct mark *point,
321                           unsigned short *rxl safe,
322                           struct mark *endmark, bool anchored)
323 {
324         /* Search forward from @m in @p for @rxl looking as far as @m2,
325          * and leaving @endmark at the end point, and returning the
326          * length of the match, or -1.
327          */
328         struct search_state ss;
329         int maxlen;
330
331         if (m2 && m->seq >= m2->seq)
332                 return -1;
333         ss.st = rxl_prepare(rxl, anchored ? RXLF_ANCHORED : 0);
334         ss.prefix_len = rxl_prefix(rxl, ss.prefix, sizeof(ss.prefix));
335         ss.end = m2;
336         ss.endmark = endmark;
337         ss.point = point;
338         ss.prev_point = point ? mark_same(point, m) : False;
339         ss.c = search_test;
340         ss.prev_ch = doc_prior(p, m);
341         ss.anchor_at_end = False;
342         call_comm("doc:content", p, &ss.c, 0, m, NULL, 0, m2);
343         rxl_info(ss.st, &maxlen, NULL, NULL, NULL);
344         rxl_free_state(ss.st);
345         return maxlen;
346 }
347
348 static int search_backward(struct pane *p safe,
349                            struct mark *m safe, struct mark *m2,
350                            struct mark *point,
351                            unsigned short *rxl safe,
352                            struct mark *endmark safe)
353 {
354         /* Search backward from @m in @p for a match of @s.  The match
355          * must start at or before m, but may finish later.  Only search
356          * as far as @m2 (if set), and leave endmark pointing at the
357          * start of the match, if one is found.
358          * Return length of match, or negative.
359          *
360          * rexel only lets us search forwards, and stepping back
361          * one char at a time to match the pattern is too slow.
362          * So we step back a steadily growing number of
363          * chars, and search forward as pfar as the previous location.
364          * Once we find any match, we check if there is a later one
365          * that still satisfies.
366          */
367         struct search_state ss;
368
369         if (m2 && m->seq <= m2->seq)
370                 return -1;
371
372         ss.rxl = rxl;
373         ss.st = rxl_prepare(rxl, 0);
374         ss.prefix_len = rxl_prefix(rxl, ss.prefix, sizeof(ss.prefix));
375         ss.end = m2;
376         ss.endmark = endmark;
377         ss.point = point;
378         ss.prev_point = point ? mark_same(point, m) : False;
379         ss.c = search_test;
380         return comm_call(&ss.c, "reverse", p, 0, m, NULL, 0, m2);
381 }
382
383 DEF_CMD(text_search)
384 {
385         struct mark *m, *endmark = NULL;
386         unsigned short *rxl;
387         int since_start;
388         int ret;
389
390         if (!ci->str)
391                 return Enoarg;
392
393         rxl = rxl_parse(ci->str, NULL, ci->num);
394         if (!rxl)
395                 return Einval;
396
397         if (ci->mark) {
398                 struct mark *point;
399
400                 m = ci->mark;
401                 endmark = mark_dup(m);
402                 point = call_ret(mark, "doc:point", ci->focus);
403                 if (!endmark)
404                         return Efail;
405                 if (strcmp(ci->key, "text-match") == 0)
406                         since_start = search_forward(ci->focus, m, ci->mark2,
407                                                      point, rxl, endmark, True);
408                 else if (ci->num2)
409                         since_start = search_backward(ci->focus, m, ci->mark2,
410                                                       point, rxl, endmark);
411                 else
412                         since_start = search_forward(ci->focus, m, ci->mark2,
413                                                      point, rxl, endmark, False);
414
415                 if (since_start >= 0)
416                         mark_to_mark(m, endmark);
417                 mark_free(endmark);
418                 if (since_start < 0) {
419                         if (strcmp(ci->key, "text-match") == 0)
420                                 ret = Efalse; /* non-fatal */
421                         else
422                                 ret = Efail;
423                 } else
424                         ret = since_start + 1;
425         } else if (ci->str2) {
426                 struct match_state *st = rxl_prepare(
427                         rxl, strcmp(ci->key, "text-match") == 0 ? RXLF_ANCHORED : 0);
428                 int flags = RXL_SOL|RXL_SOD;
429                 const char *t = ci->str2;
430                 int thelen = -1, start = 0;
431                 enum rxl_found r;
432                 wint_t prev_ch = WEOF;
433
434                 do {
435                         wint_t wc = get_utf8(&t, NULL);
436                         if (wc >= WERR|| (ci->num2 > 0 && t > ci->str2 + ci->num2)) {
437                                 rxl_advance(st, RXL_EOL|RXL_EOD);
438                                 break;
439                         }
440                         switch (is_word(prev_ch) * 2 + is_word(wc)) {
441                         case 0: /* in space */
442                         case 3: /* within word */
443                                 flags |= RXL_NOWBRK;
444                                 break;
445                         case 1: /* start of word */
446                                 flags |= RXL_SOW;
447                                 break;
448                         case 2: /* end of word */
449                                 flags |= RXL_EOW;
450                                 break;
451                         }
452                         if (is_eol(wc))
453                                 flags |= RXL_EOL;
454                         if (prev_ch == WEOF || is_eol(prev_ch))
455                                 flags |= RXL_SOL;
456                         prev_ch = wc;
457                         r = rxl_advance(st, wc | flags);
458                         flags = 0;
459                         if (r >= RXL_MATCH) {
460                                 /* "start" is in chars, not bytes, so we cannot.
461                                  * use it.  Need since_start and then count
462                                  * back.
463                                  */
464                                 rxl_info(st, &thelen, NULL, NULL, &since_start);
465                                 start = t - ci->str2;
466                                 while (since_start > 0) {
467                                         start = utf8_round_len(ci->str2, start-1);
468                                         since_start -= 1;
469                                 }
470                         }
471                 } while (r != RXL_DONE);
472                 rxl_free_state(st);
473                 if (thelen < 0)
474                         ret = Efalse;
475                 else if (strcmp(ci->key, "text-match") == 0)
476                         ret = thelen + 1;
477                 else
478                         ret = start + 1;
479         } else {
480                 ret = Einval;
481         }
482         free(rxl);
483         return ret;
484 }
485
486 DEF_CMD(make_search)
487 {
488         struct search_state *ss;
489         unsigned short *rxl;
490
491         if (!ci->str)
492                 return Enoarg;
493
494         rxl = rxl_parse(ci->str, NULL, ci->num2);
495         if (!rxl)
496                 return Einval;
497         ss = calloc(1, sizeof(*ss));
498         ss->rxl = rxl;
499         ss->prefix_len = rxl_prefix(rxl, ss->prefix, sizeof(ss->prefix));
500         ss->c = search_test;
501         ss->c.free = state_free;
502         command_get(&ss->c);
503         comm_call(&ss->c, "reinit", ci->focus,
504                   ci->num, ci->mark, NULL, 0, ci->mark2);
505         comm_call(ci->comm2, "cb", ci->focus,
506                   0, NULL, NULL,
507                   0, NULL, NULL, 0,0, &ss->c);
508         command_put(&ss->c);
509         return 1;
510 }
511
512 struct texteql {
513         struct command c;
514         const char *text safe;
515         bool matched;
516 };
517
518 DEF_CB(equal_test)
519 {
520         struct texteql *te = container_of(ci->comm, struct texteql, c);
521         wint_t have, want;
522         int i;
523
524         if (!te->text[0])
525                 return Efalse;
526         have = ci->num & 0xFFFFF;
527         want = get_utf8(&te->text, NULL);
528         if (have != want)
529                 return Efalse;
530         for (i = 0;
531              i < ci->num2 && ci->str;
532              i++)
533                 if (!te->text[i] || te->text[i] != ci->str[i])
534                         break;
535         te->text += i;
536         if (!te->text[0])
537                 te->matched = True;
538         if (ci->str && i < ci->num2)
539                 /* Stop looking */
540                 return Efalse;
541         return 1 + i;
542 }
543
544 DEF_CMD(text_equals)
545 {
546         struct texteql te;
547
548         if (!ci->str || !ci->mark)
549                 return Enoarg;
550
551         te.c = equal_test;
552         te.text = ci->str;
553         te.matched = False;
554         call_comm("doc:content", ci->focus, &te.c, 0, ci->mark);
555         return te.matched ? 1 : Efalse;
556 }
557
558 void edlib_init(struct pane *ed safe)
559 {
560         call_comm("global-set-command", ed, &text_search, 0, NULL,
561                   "text-search");
562         call_comm("global-set-command", ed, &text_search, 0, NULL,
563                   "text-match");
564         call_comm("global-set-command", ed, &make_search, 0, NULL,
565                   "make-search");
566         call_comm("global-set-command", ed, &text_equals, 0, NULL,
567                   "text-equals");
568 }