2 * Copyright (C) 1994-1996, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.12 1996-06-04 10:20:02 adam
8 * Added support for character mapping.
10 * Revision 1.11 1996/01/08 19:15:24 adam
11 * Allow single $ in expressions.
13 * Revision 1.10 1996/01/08 09:09:17 adam
14 * Function dfa_parse got 'const' string argument.
15 * New functions to define char mappings made public.
17 * Revision 1.9 1995/12/06 12:24:58 adam
18 * Removed verbatim mode code.
20 * Revision 1.8 1995/12/06 09:09:58 adam
21 * Work on left and right anchors.
23 * Revision 1.7 1995/11/27 09:23:02 adam
24 * New berbatim hook in regular expressions. "[]n ..".
26 * Revision 1.6 1995/10/16 09:31:25 adam
29 * Revision 1.5 1995/10/02 15:17:58 adam
30 * Bug fix in dfa_delete.
32 * Revision 1.4 1995/09/28 09:18:52 adam
33 * Removed various preprocessor defines.
35 * Revision 1.3 1995/09/04 12:33:26 adam
36 * Various cleanup. YAZ util used instead.
38 * Revision 1.2 1995/01/25 11:30:50 adam
39 * Simple error reporting when parsing regular expressions.
40 * Memory usage reduced.
42 * Revision 1.1 1995/01/24 16:02:52 adam
43 * New private header file in dfa module (dfap.h).
44 * Module no longer uses yacc to parse regular expressions.
66 struct Tnode *p[2]; /* CAT,OR,STAR,PLUS (left,right) */
67 short ch[2]; /* if ch[0] >= 0 then this Tnode represents */
68 /* a character in range [ch[0]..ch[1]] */
69 /* otherwise ch[0] represents */
70 /* accepting no (-acceptno) */
72 unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */
73 unsigned nullable : 1;
74 Set firstpos; /* first positions */
75 Set lastpos; /* last positions */
78 struct Tblock { /* Tnode bucket (block) */
79 struct Tblock *next; /* pointer to next bucket */
80 struct Tnode *tarray; /* array of nodes in bucket */
84 #define STATE_HASH 199
85 #define POSET_CHUNK 100
87 int debug_dfa_trav = 0;
88 int debug_dfa_tran = 0;
89 int debug_dfa_followpos = 0;
92 static struct DFA_parse *parse_info = NULL;
95 static int inside_string;
96 static const unsigned char *expr_ptr;
97 static struct Tnode **posar;
100 static Set *followpos;
102 static struct Tnode *mk_Tnode (void);
103 static struct Tnode *mk_Tnode_cset (BSet charset);
104 static void term_Tnode (void);
107 del_followpos (void),
110 mk_dfa_tran (struct DFA_states *dfas),
111 add_follow (Set lastpos, Set firstpos),
112 dfa_trav (struct Tnode *n),
113 init_followpos (void),
114 pr_tran (struct DFA_states *dfas),
115 pr_verbose (struct DFA_states *dfas),
125 *str_char (unsigned c);
141 static int lookahead;
142 static unsigned look_ch;
143 static BSet look_chars;
145 static struct Tnode *expr_1 (void),
150 static struct Tnode *expr_1 (void)
152 struct Tnode *t1, *t2, *tn;
154 if (!(t1 = expr_2 ()))
156 while (lookahead == L_ALT)
159 if (!(t2 = expr_2 ()))
171 static struct Tnode *expr_2 (void)
173 struct Tnode *t1, *t2, *tn;
175 if (!(t1 = expr_3 ()))
177 while (lookahead == L_WILD ||
178 lookahead == L_ANYZ ||
179 lookahead == L_ANY ||
181 lookahead == L_CHAR ||
182 lookahead == L_CHARS)
184 if (!(t2 = expr_3 ()))
196 static struct Tnode *expr_3 (void)
198 struct Tnode *t1, *tn;
200 if (!(t1 = expr_4 ()))
202 if (lookahead == L_CLOS0)
210 else if (lookahead == L_CLOS1)
218 else if (lookahead == L_QUEST)
224 tn->u.p[1] = mk_Tnode();
225 tn->u.p[1]->pos = EPSILON;
231 static struct Tnode *expr_4 (void)
239 if (!(t1 = expr_1 ()))
241 if (lookahead == L_RP)
248 t1->pos = ++parse_info->position;
249 t1->u.ch[1] = t1->u.ch[0] = look_ch;
253 t1 = mk_Tnode_cset (look_chars);
257 t1 = mk_Tnode_cset (parse_info->anyset);
263 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
264 t1->u.p[1] = mk_Tnode();
265 t1->u.p[1]->pos = EPSILON;
271 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
279 static void do_parse (struct DFA_parse *dfap, const char **s,
282 int start_anchor_flag = 0;
283 struct Tnode *t1, *t2, *tn;
287 expr_ptr = (const unsigned char *) *s;
291 if (lookahead == L_START)
293 start_anchor_flag = 1;
296 if (lookahead == L_END)
299 t1->pos = ++parse_info->position;
300 t1->u.ch[1] = t1->u.ch[0] = '\n';
306 if (t1 && lookahead == L_END)
309 t2->pos = ++parse_info->position;
310 t2->u.ch[1] = t2->u.ch[0] = '\n';
321 if (t1 && lookahead == 0)
324 t2->pos = ++parse_info->position;
325 t2->u.ch[0] = -(++parse_info->rule);
326 t2->u.ch[1] = start_anchor_flag ? 0 : -(parse_info->rule);
337 if (lookahead == L_RP)
338 err_code = DFA_ERR_RP;
339 else if (lookahead == L_LP)
340 err_code = DFA_ERR_LP;
342 err_code = DFA_ERR_SYNTAX;
345 *s = (const char *) expr_ptr;
348 static int nextchar (int *esc)
351 if (*expr_ptr == '\0')
353 else if (*expr_ptr != '\\')
382 static int nextchar_set (int *esc)
384 if (*expr_ptr == ' ')
389 return nextchar (esc);
392 static int read_charset (void)
394 int i, ch0, ch1, esc0, esc1, cc = 0;
395 look_chars = mk_BSet (&parse_info->charset);
396 res_BSet (parse_info->charset, look_chars);
398 ch0 = nextchar_set (&esc0);
399 if (!esc0 && ch0 == '^')
402 ch0 = nextchar_set (&esc0);
406 if (!esc0 && ch0 == ']')
408 add_BSet (parse_info->charset, look_chars, ch0);
409 ch1 = nextchar_set (&esc1);
410 if (!esc1 && ch1 == '-')
412 if ((ch1 = nextchar_set (&esc1)) == 0)
414 if (!esc1 && ch1 == ']')
416 add_BSet (parse_info->charset, look_chars, '-');
419 for (i=ch0; ++i<=ch1;)
420 add_BSet (parse_info->charset, look_chars, i);
421 ch0 = nextchar_set (&esc0);
430 com_BSet (parse_info->charset, look_chars);
434 static int map_l_char (void)
437 const char *cp0 = expr_ptr-1;
438 int i = 0, len = strlen(cp0);
440 if (cp0[0] == 1 && cp0[1])
446 if (!parse_info->cmap)
449 mapto = (*parse_info->cmap) (&cp0, len);
453 look_ch = mapto[i][0];
454 logf (LOG_DEBUG, "map from %c to %d", expr_ptr[-1], look_ch);
458 static int lex_sub(void)
461 while ((look_ch = nextchar (&esc)) != 0)
465 return map_l_char ();
466 inside_string = !inside_string;
468 else if (esc || inside_string)
469 return map_l_char ();
470 else if (look_ch == '[')
471 return read_charset();
475 for (cc = parse_info->charMap; *cc; cc += 2)
482 return map_l_char ();
487 static void lex (void)
489 lookahead = lex_sub ();
492 static const char *str_char (unsigned c)
509 sprintf (s+1, "x%02x", c);
531 static void out_char (int c)
533 printf ("%s", str_char (c));
536 static void term_Tnode (void)
538 struct Tblock *t, *t1;
539 for (t = parse_info->start; (t1 = t);)
547 static struct Tnode *mk_Tnode (void)
551 if (parse_info->use_Tnode == parse_info->max_Tnode)
553 tnew = (struct Tblock *) imalloc (sizeof(struct Tblock));
554 tnew->tarray = (struct Tnode *) imalloc (TADD*sizeof(struct Tnode));
557 if (parse_info->use_Tnode == 0)
558 parse_info->start = tnew;
560 parse_info->end->next = tnew;
561 parse_info->end = tnew;
563 parse_info->max_Tnode += TADD;
565 return parse_info->end->tarray+(parse_info->use_Tnode++ % TADD);
568 struct Tnode *mk_Tnode_cset (BSet charset)
570 struct Tnode *tn1, *tn0 = mk_Tnode();
571 int ch1, ch0 = trav_BSet (parse_info->charset, charset, 0);
577 tn0->pos = ++parse_info->position;
578 ch1 = travi_BSet (parse_info->charset, charset, ch0+1) ;
580 tn0->u.ch[1] = parse_info->charset->size;
583 tn0->u.ch[1] = ch1-1;
584 while ((ch0=trav_BSet (parse_info->charset, charset, ch1)) != -1)
590 tn1 = tn0->u.p[1] = mk_Tnode();
592 tn1->pos = ++(parse_info->position);
593 if ((ch1 = travi_BSet (parse_info->charset, charset,
596 tn1->u.ch[1] = parse_info->charset->size;
599 tn1->u.ch[1] = ch1-1;
606 static void del_followpos (void)
611 static void init_pos (void)
613 posar = (struct Tnode **) imalloc (sizeof(struct Tnode*)
614 * (1+parse_info->position));
617 static void del_pos (void)
622 static void add_follow (Set lastpos, Set firstpos)
626 followpos[lastpos->value] =
627 union_Set (poset, followpos[lastpos->value], firstpos);
628 lastpos = lastpos->next;
632 static void dfa_trav (struct Tnode *n)
637 dfa_trav (n->u.p[0]);
638 dfa_trav (n->u.p[1]);
639 n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable;
640 n->firstpos = mk_Set (poset);
641 n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos);
642 if (n->u.p[0]->nullable)
643 n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos);
644 n->lastpos = mk_Set (poset);
645 n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos);
646 if (n->u.p[1]->nullable)
647 n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos);
649 add_follow (n->u.p[0]->lastpos, n->u.p[1]->firstpos);
651 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
652 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
653 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
654 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
659 dfa_trav (n->u.p[0]);
660 dfa_trav (n->u.p[1]);
661 n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable;
663 n->firstpos = merge_Set (poset, n->u.p[0]->firstpos,
664 n->u.p[1]->firstpos);
665 n->lastpos = merge_Set (poset, n->u.p[0]->lastpos,
667 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
668 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
669 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
670 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
675 dfa_trav (n->u.p[0]);
676 n->nullable = n->u.p[0]->nullable;
677 n->firstpos = n->u.p[0]->firstpos;
678 n->lastpos = n->u.p[0]->lastpos;
679 add_follow (n->lastpos, n->firstpos);
684 dfa_trav (n->u.p[0]);
686 n->firstpos = n->u.p[0]->firstpos;
687 n->lastpos = n->u.p[0]->lastpos;
688 add_follow (n->lastpos, n->firstpos);
694 n->lastpos = mk_Set (poset);
695 n->firstpos = mk_Set (poset);
702 n->firstpos = mk_Set (poset);
703 n->firstpos = add_Set (poset, n->firstpos, n->pos);
704 n->lastpos = mk_Set (poset);
705 n->lastpos = add_Set (poset, n->lastpos, n->pos);
708 printf ("#%d (n#%d)", -n->u.ch[0], -n->u.ch[1]);
709 else if (n->u.ch[1] > n->u.ch[0])
712 out_char (n->u.ch[0]);
713 if (n->u.ch[1] > n->u.ch[0]+1)
715 out_char (n->u.ch[1]);
719 out_char (n->u.ch[0]);
723 printf ("\n nullable : %c\n", n->nullable ? '1' : '0');
724 printf (" firstpos :");
725 pr_Set (poset, n->firstpos);
726 printf (" lastpos :");
727 pr_Set (poset, n->lastpos);
731 static void init_followpos (void)
735 followpos = fa = (Set *) imalloc (sizeof(Set) * (1+parse_info->position));
736 for (i = parse_info->position+1; --i >= 0; fa++)
737 *fa = mk_Set (poset);
740 static void mk_dfa_tran (struct DFA_states *dfas)
747 struct DFA_state *dfa_from, *dfa_to;
750 max_char = parse_info->charset->size;
751 pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1));
753 tran_set = cp_Set (poset, parse_info->root->firstpos);
754 i = add_DFA_state (dfas, &tran_set, &dfa_from);
756 dfa_from->rule_no = 0;
757 while ((dfa_from = get_DFA_state (dfas)))
761 for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next)
762 if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char)
763 *pos_i++ = tran_set->value;
768 c = posar[tran_set->value]->u.ch[1];
773 dfa_from->rule_no = -i;
774 dfa_from->rule_nno = -j;
776 for (char_1 = 0; char_1 <= max_char; char_1++)
779 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
780 if (posar[i]->u.ch[1] >= char_1
781 && (c=posar[i]->u.ch[0]) < char_0)
787 if (char_0 > max_char)
792 tran_set = mk_Set (poset);
793 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
795 if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1)
796 char_1 = c - 1; /* forward chunk */
797 else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1)
798 char_1 = c; /* backward chunk */
799 if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0)
800 tran_set = union_Set (poset, tran_set, followpos[i]);
804 add_DFA_state (dfas, &tran_set, &dfa_to);
805 add_DFA_tran (dfas, dfa_from, char_0, char_1, dfa_to->no);
810 sort_DFA_states (dfas);
813 static void pr_tran (struct DFA_states *dfas)
816 struct DFA_tran *tran;
817 int prev_no, i, c, no;
819 for (no=0; no < dfas->no; ++no)
821 s = dfas->sortarray[no];
822 assert (s->no == no);
823 printf ("DFA state %d", no);
825 printf ("#%d", s->rule_no);
827 printf (" #%d", s->rule_nno);
830 pr_Set (poset, s->set);
832 for (i=s->tran_no, tran=s->trans; --i >= 0; tran++)
834 if (prev_no != tran->to)
838 printf (" goto %d on [", tran->to);
841 for (c = tran->ch[0]; c <= tran->ch[1]; c++)
842 printf ("%s", str_char(c));
850 static void pr_verbose (struct DFA_states *dfas)
854 printf ("%d/%d tree nodes used, %d bytes each\n",
855 parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode));
856 k = inf_BSetHandle (parse_info->charset, &i, &j);
857 printf ("%ld/%ld character sets, %d bytes each\n",
858 i/k, j/k, k*sizeof(BSetWord));
859 k = inf_SetType (poset, &i, &j);
860 printf ("%ld/%ld poset items, %d bytes each\n", i, j, k);
861 printf ("%d DFA states\n", dfas->no);
864 static void pr_followpos (void)
867 printf ("\nfollowsets:\n");
868 for (i=1; i <= parse_info->position; i++)
871 pr_Set (poset, followpos[i]);
874 if (posar[i]->u.ch[0] < 0)
875 printf ("#%d", -posar[i]->u.ch[0]);
876 else if (posar[i]->u.ch[1] > posar[i]->u.ch[0])
879 out_char (posar[i]->u.ch[0]);
880 if (posar[i]->u.ch[1] > posar[i]->u.ch[0]+1)
882 out_char (posar[i]->u.ch[1]);
886 out_char (posar[i]->u.ch[0]);
892 void dfa_parse_cmap_clean (struct DFA *d)
894 struct DFA_parse *dfa = d->parse_info;
899 dfa->charMapSize = 7;
900 dfa->charMap = imalloc (dfa->charMapSize * sizeof(*dfa->charMap));
905 void dfa_parse_cmap_new (struct DFA *d, const int *cmap)
907 struct DFA_parse *dfa = d->parse_info;
912 for (cp = cmap; *cp; cp += 2)
914 size = cp - cmap + 1;
915 if (size > dfa->charMapSize)
918 ifree (dfa->charMap);
919 dfa->charMapSize = size;
920 dfa->charMap = imalloc (size * sizeof(*dfa->charMap));
922 memcpy (dfa->charMap, cmap, size * sizeof(*dfa->charMap));
925 void dfa_parse_cmap_del (struct DFA *d, int from)
927 struct DFA_parse *dfa = d->parse_info;
931 for (cc = dfa->charMap; *cc; cc += 2)
934 while ((cc[0] = cc[2]))
943 void dfa_parse_cmap_add (struct DFA *d, int from, int to)
945 struct DFA_parse *dfa = d->parse_info;
950 for (cc = dfa->charMap; *cc; cc += 2)
956 indx = cc - dfa->charMap;
957 size = dfa->charMapSize;
960 int *cn = imalloc ((size+16) * sizeof(*dfa->charMap));
961 memcpy (cn, dfa->charMap, indx*sizeof(*dfa->charMap));
962 ifree (dfa->charMap);
964 dfa->charMapSize = size+16;
966 dfa->charMap[indx] = from;
967 dfa->charMap[indx+1] = to;
968 dfa->charMap[indx+2] = 0;
971 void dfa_parse_cmap_thompson (struct DFA *d)
973 static int thompson_chars[] =
989 dfa_parse_cmap_new (d, thompson_chars);
992 static struct DFA_parse *dfa_parse_init (void)
994 parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse));
996 parse_info->charset = mk_BSetHandle (255, 20);
997 parse_info->position = 0;
998 parse_info->rule = 0;
999 parse_info->root = NULL;
1001 parse_info->anyset = mk_BSet (&parse_info->charset);
1002 res_BSet (parse_info->charset, parse_info->anyset);
1003 add_BSet (parse_info->charset, parse_info->anyset, '\n');
1004 com_BSet (parse_info->charset, parse_info->anyset);
1005 parse_info->use_Tnode = parse_info->max_Tnode = 0;
1006 parse_info->charMap = NULL;
1007 parse_info->charMapSize = 0;
1008 parse_info->cmap = NULL;
1012 static void rm_dfa_parse (struct DFA_parse **dfap)
1015 assert (parse_info);
1017 rm_BSetHandle (&parse_info->charset);
1018 ifree (parse_info->charMap);
1023 static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk)
1025 struct DFA_states *dfas;
1027 assert (poset_chunk > 10);
1031 poset = mk_SetType (poset_chunk);
1034 assert (parse_info->root);
1035 dfa_trav (parse_info->root);
1037 if (debug_dfa_followpos)
1039 init_DFA_states (&dfas, poset, STATE_HASH);
1051 struct DFA *dfa_init (void)
1055 dfa = imalloc (sizeof(*dfa));
1056 dfa->parse_info = dfa_parse_init ();
1057 dfa->state_info = NULL;
1059 dfa_parse_cmap_thompson (dfa);
1063 void dfa_set_cmap (struct DFA *dfa, char **(*cmap)(const char **from, int len))
1065 dfa->parse_info->cmap = cmap;
1068 int dfa_parse (struct DFA *dfa, const char **pattern)
1073 assert (dfa->parse_info);
1074 do_parse (dfa->parse_info, pattern, &top);
1077 if ( !dfa->parse_info->root )
1078 dfa->parse_info->root = top;
1085 n->u.p[0] = dfa->parse_info->root;
1087 dfa->parse_info->root = n;
1092 void dfa_mkstate (struct DFA *dfa)
1096 dfa->state_info = mk_dfas (dfa->parse_info, POSET_CHUNK);
1097 dfa->no_states = dfa->state_info->no;
1098 dfa->states = dfa->state_info->sortarray;
1099 rm_dfa_parse (&dfa->parse_info);
1102 void dfa_delete (struct DFA **dfap)
1106 if ((*dfap)->parse_info)
1107 rm_dfa_parse (&(*dfap)->parse_info);
1108 if ((*dfap)->state_info)
1109 rm_DFA_states (&(*dfap)->state_info);