blob: 969c46a8591d3a408e7744f0f8bdfe8092efd425 [file] [log] [blame]
Wolfgang Denka5ecbe62013-03-23 23:50:31 +00001/*
2 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
3 * All rights reserved
4 *
5 * "THE BEER-WARE LICENSE" (Revision 42):
6 * Sergey Lyubka wrote this file. As long as you retain this notice you
7 * can do whatever you want with this stuff. If we meet some day, and you think
8 * this stuff is worth it, you can buy me a beer in return.
9 */
10
11/*
12 * Downloaded Sat Nov 5 17:43:06 CET 2011 at
13 * http://slre.sourceforge.net/1.0/slre.c
14 */
15
16#ifdef SLRE_TEST
17#include <stdio.h>
18#include <assert.h>
19#include <ctype.h>
20#include <stdlib.h>
21#include <string.h>
22#else
23#include <common.h>
24#include <linux/ctype.h>
25#endif /* SLRE_TEST */
26
27#include <errno.h>
28
29#include <slre.h>
30
31enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
32 STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
33
34#ifdef SLRE_TEST
35static struct {
36 const char *name;
37 int narg;
38 const char *flags;
39} opcodes[] = {
40 {"END", 0, ""}, /* End of code block or program */
41 {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */
42 {"ANY", 0, ""}, /* Match any character, "." */
43 {"EXACT", 2, "d"}, /* Match exact string */
44 {"ANYOF", 2, "D"}, /* Match any from set, "[]" */
45 {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/
46 {"OPEN ", 1, "i"}, /* Capture start, "(" */
47 {"CLOSE", 1, "i"}, /* Capture end, ")" */
48 {"BOL", 0, ""}, /* Beginning of string, "^" */
49 {"EOL", 0, ""}, /* End of string, "$" */
50 {"STAR", 1, "o"}, /* Match zero or more times "*" */
51 {"PLUS", 1, "o"}, /* Match one or more times, "+" */
52 {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */
53 {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */
54 {"QUEST", 1, "o"}, /* Match zero or one time, "?" */
55 {"SPACE", 0, ""}, /* Match whitespace, "\s" */
56 {"NONSPACE", 0, ""}, /* Match non-space, "\S" */
57 {"DIGIT", 0, ""} /* Match digit, "\d" */
58};
59#endif /* SLRE_TEST */
60
61/*
62 * Commands and operands are all unsigned char (1 byte long). All code offsets
63 * are relative to current address, and positive (always point forward). Data
64 * offsets are absolute. Commands with operands:
65 *
66 * BRANCH offset1 offset2
67 * Try to match the code block that follows the BRANCH instruction
68 * (code block ends with END). If no match, try to match code block that
69 * starts at offset1. If either of these match, jump to offset2.
70 *
71 * EXACT data_offset data_length
72 * Try to match exact string. String is recorded in data section from
73 * data_offset, and has length data_length.
74 *
75 * OPEN capture_number
76 * CLOSE capture_number
77 * If the user have passed 'struct cap' array for captures, OPEN
78 * records the beginning of the matched substring (cap->ptr), CLOSE
79 * sets the length (cap->len) for respective capture_number.
80 *
81 * STAR code_offset
82 * PLUS code_offset
83 * QUEST code_offset
84 * *, +, ?, respectively. Try to gobble as much as possible from the
85 * matched buffer, until code block that follows these instructions
86 * matches. When the longest possible string is matched,
87 * jump to code_offset
88 *
89 * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
90 */
91
92static const char *meta_chars = "|.^$*+?()[\\";
93
94#ifdef SLRE_TEST
95
96static void
97print_character_set(FILE *fp, const unsigned char *p, int len)
98{
99 int i;
100
101 for (i = 0; i < len; i++) {
102 if (i > 0)
103 (void) fputc(',', fp);
104 if (p[i] == 0) {
105 i++;
106 if (p[i] == 0)
107 (void) fprintf(fp, "\\x%02x", p[i]);
108 else
109 (void) fprintf(fp, "%s", opcodes[p[i]].name);
110 } else if (isprint(p[i])) {
111 (void) fputc(p[i], fp);
112 } else {
113 (void) fprintf(fp, "\\x%02x", p[i]);
114 }
115 }
116}
117
118void
119slre_dump(const struct slre *r, FILE *fp)
120{
121 int i, j, ch, op, pc;
122
123 for (pc = 0; pc < r->code_size; pc++) {
124
125 op = r->code[pc];
126 (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
127
128 for (i = 0; opcodes[op].flags[i] != '\0'; i++)
129 switch (opcodes[op].flags[i]) {
130 case 'i':
131 (void) fprintf(fp, "%d ", r->code[pc + 1]);
132 pc++;
133 break;
134 case 'o':
135 (void) fprintf(fp, "%d ",
136 pc + r->code[pc + 1] - i);
137 pc++;
138 break;
139 case 'D':
140 print_character_set(fp, r->data +
141 r->code[pc + 1], r->code[pc + 2]);
142 pc += 2;
143 break;
144 case 'd':
145 (void) fputc('"', fp);
146 for (j = 0; j < r->code[pc + 2]; j++) {
147 ch = r->data[r->code[pc + 1] + j];
148 if (isprint(ch)) {
149 (void) fputc(ch, fp);
150 } else {
151 (void) fprintf(fp,
152 "\\x%02x", ch);
153 }
154 }
155 (void) fputc('"', fp);
156 pc += 2;
157 break;
158 }
159
160 (void) fputc('\n', fp);
161 }
162}
163#endif /* SLRE_TEST */
164
165static void
166set_jump_offset(struct slre *r, int pc, int offset)
167{
168 assert(offset < r->code_size);
169
170 if (r->code_size - offset > 0xff)
171 r->err_str = "Jump offset is too big";
172 else
173 r->code[pc] = (unsigned char) (r->code_size - offset);
174}
175
176static void
177emit(struct slre *r, int code)
178{
179 if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
180 r->err_str = "RE is too long (code overflow)";
181 else
182 r->code[r->code_size++] = (unsigned char) code;
183}
184
185static void
186store_char_in_data(struct slre *r, int ch)
187{
188 if (r->data_size >= (int) sizeof(r->data))
189 r->err_str = "RE is too long (data overflow)";
190 else
191 r->data[r->data_size++] = ch;
192}
193
194static void
195exact(struct slre *r, const char **re)
196{
197 int old_data_size = r->data_size;
198
199 while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
200 store_char_in_data(r, *(*re)++);
201
202 emit(r, EXACT);
203 emit(r, old_data_size);
204 emit(r, r->data_size - old_data_size);
205}
206
207static int
208get_escape_char(const char **re)
209{
210 int res;
211
212 switch (*(*re)++) {
213 case 'n':
214 res = '\n';
215 break;
216 case 'r':
217 res = '\r';
218 break;
219 case 't':
220 res = '\t';
221 break;
222 case '0':
223 res = 0;
224 break;
225 case 'S':
226 res = NONSPACE << 8;
227 break;
228 case 's':
229 res = SPACE << 8;
230 break;
231 case 'd':
232 res = DIGIT << 8;
233 break;
234 default:
235 res = (*re)[-1];
236 break;
237 }
238
239 return res;
240}
241
242static void
243anyof(struct slre *r, const char **re)
244{
245 int esc, old_data_size = r->data_size, op = ANYOF;
246
247 if (**re == '^') {
248 op = ANYBUT;
249 (*re)++;
250 }
251
252 while (**re != '\0')
253
254 switch (*(*re)++) {
255 case ']':
256 emit(r, op);
257 emit(r, old_data_size);
258 emit(r, r->data_size - old_data_size);
259 return;
260 /* NOTREACHED */
261 break;
262 case '\\':
263 esc = get_escape_char(re);
264 if ((esc & 0xff) == 0) {
265 store_char_in_data(r, 0);
266 store_char_in_data(r, esc >> 8);
267 } else {
268 store_char_in_data(r, esc);
269 }
270 break;
271 default:
272 store_char_in_data(r, (*re)[-1]);
273 break;
274 }
275
276 r->err_str = "No closing ']' bracket";
277}
278
279static void
280relocate(struct slre *r, int begin, int shift)
281{
282 emit(r, END);
283 memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
284 r->code_size += shift;
285}
286
287static void
288quantifier(struct slre *r, int prev, int op)
289{
290 if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
291 r->code[prev + 2]--;
292 emit(r, EXACT);
293 emit(r, r->code[prev + 1] + r->code[prev + 2]);
294 emit(r, 1);
295 prev = r->code_size - 3;
296 }
297 relocate(r, prev, 2);
298 r->code[prev] = op;
299 set_jump_offset(r, prev + 1, prev);
300}
301
302static void
303exact_one_char(struct slre *r, int ch)
304{
305 emit(r, EXACT);
306 emit(r, r->data_size);
307 emit(r, 1);
308 store_char_in_data(r, ch);
309}
310
311static void
312fixup_branch(struct slre *r, int fixup)
313{
314 if (fixup > 0) {
315 emit(r, END);
316 set_jump_offset(r, fixup, fixup - 2);
317 }
318}
319
320static void
321compile(struct slre *r, const char **re)
322{
323 int op, esc, branch_start, last_op, fixup, cap_no, level;
324
325 fixup = 0;
326 level = r->num_caps;
327 branch_start = last_op = r->code_size;
328
329 for (;;)
330 switch (*(*re)++) {
331 case '\0':
332 (*re)--;
333 return;
334 /* NOTREACHED */
335 break;
336 case '^':
337 emit(r, BOL);
338 break;
339 case '$':
340 emit(r, EOL);
341 break;
342 case '.':
343 last_op = r->code_size;
344 emit(r, ANY);
345 break;
346 case '[':
347 last_op = r->code_size;
348 anyof(r, re);
349 break;
350 case '\\':
351 last_op = r->code_size;
352 esc = get_escape_char(re);
353 if (esc & 0xff00)
354 emit(r, esc >> 8);
355 else
356 exact_one_char(r, esc);
357 break;
358 case '(':
359 last_op = r->code_size;
360 cap_no = ++r->num_caps;
361 emit(r, OPEN);
362 emit(r, cap_no);
363
364 compile(r, re);
365 if (*(*re)++ != ')') {
366 r->err_str = "No closing bracket";
367 return;
368 }
369
370 emit(r, CLOSE);
371 emit(r, cap_no);
372 break;
373 case ')':
374 (*re)--;
375 fixup_branch(r, fixup);
376 if (level == 0) {
377 r->err_str = "Unbalanced brackets";
378 return;
379 }
380 return;
381 /* NOTREACHED */
382 break;
383 case '+':
384 case '*':
385 op = (*re)[-1] == '*' ? STAR : PLUS;
386 if (**re == '?') {
387 (*re)++;
388 op = op == STAR ? STARQ : PLUSQ;
389 }
390 quantifier(r, last_op, op);
391 break;
392 case '?':
393 quantifier(r, last_op, QUEST);
394 break;
395 case '|':
396 fixup_branch(r, fixup);
397 relocate(r, branch_start, 3);
398 r->code[branch_start] = BRANCH;
399 set_jump_offset(r, branch_start + 1, branch_start);
400 fixup = branch_start + 2;
401 r->code[fixup] = 0xff;
402 break;
403 default:
404 (*re)--;
405 last_op = r->code_size;
406 exact(r, re);
407 break;
408 }
409}
410
411int
412slre_compile(struct slre *r, const char *re)
413{
414 r->err_str = NULL;
415 r->code_size = r->data_size = r->num_caps = r->anchored = 0;
416
417 if (*re == '^')
418 r->anchored++;
419
420 emit(r, OPEN); /* This will capture what matches full RE */
421 emit(r, 0);
422
423 while (*re != '\0')
424 compile(r, &re);
425
426 if (r->code[2] == BRANCH)
427 fixup_branch(r, 4);
428
429 emit(r, CLOSE);
430 emit(r, 0);
431 emit(r, END);
432
433 return (r->err_str == NULL ? 1 : 0);
434}
435
436static int match(const struct slre *, int,
437 const char *, int, int *, struct cap *);
438
439static void
440loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
441{
442 int saved_offset, matched_offset;
443
xypron.glpk@gmx.deb8865e62017-05-08 21:13:43 +0200444 matched_offset = *ofs;
Wolfgang Denka5ecbe62013-03-23 23:50:31 +0000445
446 while (match(r, pc + 2, s, len, ofs, NULL)) {
447 saved_offset = *ofs;
448 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
449 matched_offset = saved_offset;
450 *ofs = saved_offset;
451 }
452
453 *ofs = matched_offset;
454}
455
456static void
457loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
458{
459 int saved_offset = *ofs;
460
461 while (match(r, pc + 2, s, len, ofs, NULL)) {
462 saved_offset = *ofs;
463 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
464 break;
465 }
466
467 *ofs = saved_offset;
468}
469
470static int
471is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
472{
473 int i, ch;
474
475 ch = s[*ofs];
476
477 for (i = 0; i < len; i++)
478 if (p[i] == ch) {
479 (*ofs)++;
480 return 1;
481 }
482
483 return 0;
484}
485
486static int
487is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
488{
489 int i, ch;
490
491 ch = s[*ofs];
492
493 for (i = 0; i < len; i++) {
494 if (p[i] == ch)
495 return 0;
496 }
497
498 (*ofs)++;
499 return 1;
500}
501
502static int
503match(const struct slre *r, int pc, const char *s, int len,
504 int *ofs, struct cap *caps)
505{
506 int n, saved_offset, res = 1;
507
508 while (res && r->code[pc] != END) {
509
510 assert(pc < r->code_size);
511 assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
512
513 switch (r->code[pc]) {
514 case BRANCH:
515 saved_offset = *ofs;
516 res = match(r, pc + 3, s, len, ofs, caps);
517 if (res == 0) {
518 *ofs = saved_offset;
519 res = match(r, pc + r->code[pc + 1],
520 s, len, ofs, caps);
521 }
522 pc += r->code[pc + 2];
523 break;
524 case EXACT:
525 res = 0;
526 n = r->code[pc + 2]; /* String length */
527 if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
528 r->code[pc + 1], n)) {
529 (*ofs) += n;
530 res = 1;
531 }
532 pc += 3;
533 break;
534 case QUEST:
535 res = 1;
536 saved_offset = *ofs;
537 if (!match(r, pc + 2, s, len, ofs, caps))
538 *ofs = saved_offset;
539 pc += r->code[pc + 1];
540 break;
541 case STAR:
542 res = 1;
543 loop_greedy(r, pc, s, len, ofs);
544 pc += r->code[pc + 1];
545 break;
546 case STARQ:
547 res = 1;
548 loop_non_greedy(r, pc, s, len, ofs);
549 pc += r->code[pc + 1];
550 break;
551 case PLUS:
552 res = match(r, pc + 2, s, len, ofs, caps);
553 if (res == 0)
554 break;
555
556 loop_greedy(r, pc, s, len, ofs);
557 pc += r->code[pc + 1];
558 break;
559 case PLUSQ:
560 res = match(r, pc + 2, s, len, ofs, caps);
561 if (res == 0)
562 break;
563
564 loop_non_greedy(r, pc, s, len, ofs);
565 pc += r->code[pc + 1];
566 break;
567 case SPACE:
568 res = 0;
569 if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
570 (*ofs)++;
571 res = 1;
572 }
573 pc++;
574 break;
575 case NONSPACE:
576 res = 0;
577 if (*ofs < len &&
578 !isspace(((unsigned char *)s)[*ofs])) {
579 (*ofs)++;
580 res = 1;
581 }
582 pc++;
583 break;
584 case DIGIT:
585 res = 0;
586 if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
587 (*ofs)++;
588 res = 1;
589 }
590 pc++;
591 break;
592 case ANY:
593 res = 0;
594 if (*ofs < len) {
595 (*ofs)++;
596 res = 1;
597 }
598 pc++;
599 break;
600 case ANYOF:
601 res = 0;
602 if (*ofs < len)
603 res = is_any_of(r->data + r->code[pc + 1],
604 r->code[pc + 2], s, ofs);
605 pc += 3;
606 break;
607 case ANYBUT:
608 res = 0;
609 if (*ofs < len)
610 res = is_any_but(r->data + r->code[pc + 1],
611 r->code[pc + 2], s, ofs);
612 pc += 3;
613 break;
614 case BOL:
615 res = *ofs == 0 ? 1 : 0;
616 pc++;
617 break;
618 case EOL:
619 res = *ofs == len ? 1 : 0;
620 pc++;
621 break;
622 case OPEN:
623 if (caps != NULL)
624 caps[r->code[pc + 1]].ptr = s + *ofs;
625 pc += 2;
626 break;
627 case CLOSE:
628 if (caps != NULL)
629 caps[r->code[pc + 1]].len = (s + *ofs) -
630 caps[r->code[pc + 1]].ptr;
631 pc += 2;
632 break;
633 case END:
634 pc++;
635 break;
636 default:
637 printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
638 assert(0);
639 break;
640 }
641 }
642
643 return res;
644}
645
646int
647slre_match(const struct slre *r, const char *buf, int len,
648 struct cap *caps)
649{
650 int i, ofs = 0, res = 0;
651
652 if (r->anchored) {
653 res = match(r, 0, buf, len, &ofs, caps);
654 } else {
655 for (i = 0; i < len && res == 0; i++) {
656 ofs = i;
657 res = match(r, 0, buf, len, &ofs, caps);
658 }
659 }
660
661 return res;
662}
663
664#ifdef SLRE_TEST
665#define N_CAPS 5
666
667int main(int argc, char *argv[])
668{
669 struct slre slre;
670 struct cap caps[N_CAPS];
671 unsigned char data[1 * 1024 * 1024];
672 FILE *fp;
673 int i, res, len;
674
675 if (argc < 2) {
676 fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
677 return 1;
678 }
679
680 fp = fopen(argv[2], "rb");
681 if (fp == NULL) {
682 fprintf(stderr, "Error: cannot open %s:%s\n",
683 argv[2], strerror(errno));
684 return 1;
685 }
686
687 if (!slre_compile(&slre, argv[1])) {
688 fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
689 return 1;
690 }
Wolfgang Denk3765b3e2013-10-07 13:07:26 +0200691
Wolfgang Denka5ecbe62013-03-23 23:50:31 +0000692 slre_dump(&slre, stderr);
693
694 while (fgets(data, sizeof(data), fp) != NULL) {
695 len = strlen(data);
696
697 if ((len > 0) && (data[len-1] == '\n')) {
698 data[len-1] = '\0';
699 --len;
700 }
701
702 printf("Data = \"%s\"\n", data);
703
704 (void) memset(caps, 0, sizeof(caps));
705
Wolfgang Denka5ecbe62013-03-23 23:50:31 +0000706 res = slre_match(&slre, data, len, caps);
707 printf("Result [%d]: %d\n", i, res);
708
709 for (i = 0; i < N_CAPS; i++) {
710 if (caps[i].len > 0) {
711 printf("Substring %d: len=%d [%.*s]\n", i,
712 caps[i].len,
713 caps[i].len, caps[i].ptr);
714 }
715 }
716 printf("----------------------------------------------------\n");
717 }
718 (void) fclose(fp);
719
720 return 0;
721}
722#endif /* SLRE_TEST */