2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.23 1999-05-20 12:57:18 adam
8 * Implemented TCL filter. Updated recctrl system.
10 * Revision 1.22 1998/11/03 16:07:13 adam
13 * Revision 1.21 1998/11/03 15:43:39 adam
14 * Fixed bug introduced by previous commit.
16 * Revision 1.20 1998/11/03 14:51:28 adam
17 * Changed code so that it creates as few data1 nodes as possible.
19 * Revision 1.19 1998/11/03 10:22:39 adam
20 * Fixed memory leak that could occur when large data1 nodes were
21 * concatenated. Data-type data1_nodes may have multiple nodes.
23 * Revision 1.18 1998/10/15 13:11:47 adam
24 * Added support for option -record for "end element". When specified
25 * end element will mark end-of-record when at outer-level.
27 * Revision 1.17 1998/07/01 10:13:51 adam
30 * Revision 1.16 1998/06/30 15:15:09 adam
31 * Tags are trimmed: white space removed before- and after the tag.
33 * Revision 1.15 1998/06/30 12:55:45 adam
36 * Revision 1.14 1998/03/05 08:41:00 adam
37 * Implemented rule contexts.
39 * Revision 1.13 1997/12/12 06:33:58 adam
40 * Fixed bug that showed up when multiple filters were used.
41 * Made one routine thread-safe.
43 * Revision 1.12 1997/11/18 10:03:24 adam
44 * Member num_children removed from data1_node.
46 * Revision 1.11 1997/11/06 11:41:01 adam
47 * Implemented "begin variant" for the sgml.regx filter.
49 * Revision 1.10 1997/10/31 12:36:12 adam
50 * Minor change that avoids compiler warning.
52 * Revision 1.9 1997/09/29 09:02:49 adam
53 * Fixed small bug (introduced by previous commit).
55 * Revision 1.8 1997/09/17 12:19:22 adam
56 * Zebra version corresponds to YAZ version 1.4.
57 * Changed Zebra server so that it doesn't depend on global common_resource.
59 * Revision 1.7 1997/07/15 16:33:07 adam
60 * Check for zero length in execData.
62 * Revision 1.6 1997/02/24 10:41:51 adam
63 * Cleanup of code and commented out the "end element-end-record" code.
65 * Revision 1.5 1997/02/19 16:22:33 adam
66 * Fixed "end element" to terminate record in outer-most level.
68 * Revision 1.4 1997/02/12 20:42:58 adam
69 * Changed some log messages.
71 * Revision 1.3 1996/11/08 14:05:33 adam
72 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
74 * Revision 1.2 1996/10/29 14:02:09 adam
75 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
76 * data1_get_tabpath is used.
78 * Revision 1.1 1996/10/11 10:57:30 adam
79 * New module recctrl. Used to manage records (extract/retrieval).
81 * Revision 1.24 1996/06/17 14:25:31 adam
82 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
84 * Revision 1.23 1996/06/04 10:19:00 adam
85 * Minor changes - removed include of ctype.h.
87 * Revision 1.22 1996/06/03 15:23:13 adam
88 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
90 * Revision 1.21 1996/05/14 16:58:38 adam
93 * Revision 1.20 1996/05/01 13:46:36 adam
94 * First work on multiple records in one file.
95 * New option, -offset, to the "unread" command in the filter module.
97 * Revision 1.19 1996/02/12 16:18:20 adam
98 * Yet another bug fix in implementation of unread command.
100 * Revision 1.18 1996/02/12 16:07:54 adam
101 * Bug fix in new unread command.
103 * Revision 1.17 1996/02/12 15:56:11 adam
104 * New code command: unread.
106 * Revision 1.16 1996/01/17 14:57:51 adam
107 * Prototype changed for reader functions in extract/retrieve. File
108 * is identified by 'void *' instead of 'int'.
110 * Revision 1.15 1996/01/08 19:15:47 adam
111 * New input filter that works!
113 * Revision 1.14 1996/01/08 09:10:38 adam
114 * Yet another complete rework on this module.
116 * Revision 1.13 1995/12/15 17:21:50 adam
117 * This version is able to set data.formatted_text in data1-nodes.
119 * Revision 1.12 1995/12/15 16:20:10 adam
120 * The filter files (*.flt) are read from the path given by data1_tabpath.
122 * Revision 1.11 1995/12/15 12:35:16 adam
125 * Revision 1.10 1995/12/15 10:35:36 adam
128 * Revision 1.9 1995/12/14 16:38:48 adam
129 * Completely new attempt to make regular expression parsing.
131 * Revision 1.8 1995/12/13 17:16:59 adam
134 * Revision 1.7 1995/12/13 16:51:58 adam
135 * Modified to set last_child in data1_nodes.
136 * Uses destroy handler to free up data text nodes.
138 * Revision 1.6 1995/12/13 13:45:37 quinn
139 * Changed data1 to use nmem.
141 * Revision 1.5 1995/12/11 09:12:52 adam
142 * The rec_get function returns NULL if record doesn't exist - will
143 * happen in the server if the result set records have been deleted since
144 * the creation of the set (i.e. the search).
145 * The server saves a result temporarily if it is 'volatile', i.e. the
146 * set is register dependent.
148 * Revision 1.4 1995/12/05 16:57:40 adam
149 * More work on regular patterns.
151 * Revision 1.3 1995/12/05 09:37:09 adam
152 * One malloc was renamed to xmalloc.
154 * Revision 1.2 1995/12/04 17:59:24 adam
155 * More work on regular expression conversion.
157 * Revision 1.1 1995/12/04 14:25:30 adam
158 * Started work on regular expression parsed input to structured records.
167 #include <zebrautl.h>
/* Sentinel file position: read pointers are compared against this value
 * to detect that the end of the input has been reached. */
177 #define F_WIN_EOF 2000000000
/* Token/action codes. REGX_PATTERN labels the pattern member of
 * struct lexRuleAction; REGX_CONTEXT is compared with the result of
 * readParseToken in readOneSpec. */
181 #define REGX_PATTERN 1
186 #define REGX_CONTEXT 6
/*
 * Data structures of the filter. As far as the members shown here tell:
 * a lexRuleAction is one element of a singly linked action list (a
 * compiled pattern or a piece of code); rules carry an action list and
 * are chained per context; a lexContext owns its rules, a lookup table
 * by rule number (fastRule) and begin/end/init action lists; lexSpec
 * holds the context list and context stack, an optional Tcl
 * interpreter, the file window and the data1 node stack.
 */
193 struct lexRuleAction {
197 struct DFA *dfa; /* REGX_PATTERN */
200 struct regxCode *code; /* REGX_CODE */
202 struct lexRuleAction *next;
207 struct lexRuleAction *actionList;
211 struct lexRuleInfo info;
212 struct lexRule *next;
218 struct lexRule *rules;
/* indexed by rule number; filled in by readFileSpec */
219 struct lexRuleInfo **fastRule;
223 struct lexRuleAction *beginActionList;
224 struct lexRuleAction *endActionList;
225 struct lexRuleAction *initActionList;
226 struct lexContext *next;
229 struct lexConcatBuf {
/* head of the list of all contexts read from the filter file */
237 struct lexContext *context;
/* stack of active contexts; context_stack_top indexes the current one */
239 struct lexContext **context_stack;
240 int context_stack_size;
241 int context_stack_top;
/* non-NULL selects Tcl execution of code actions (see execAction) */
247 Tcl_Interp *tcl_interp;
/* callback taken from grs_read_info.endf; called by lexNode with a position */
250 void (*f_win_ef)(void *, off_t);
/* NOTE(review): the window offsets below are int while f_win_get takes
 * off_t positions -- confirm that offsets beyond int range cannot occur. */
252 int f_win_start; /* first byte of buffer is this file offset */
253 int f_win_end; /* last byte of buffer is this offset - 1 */
254 int f_win_size; /* size of buffer */
255 char *f_win_buf; /* buffer itself */
/* read and seek callbacks taken from grs_read_info.readf / seekf */
256 int (*f_win_rf)(void *, char *, size_t);
257 off_t (*f_win_sf)(void *, off_t);
/* one text accumulation buffer per nesting level (see execData) */
259 struct lexConcatBuf **concatBuf;
/* data1 nodes of the tree under construction, one slot per level */
261 data1_node **d1_stack;
272 struct lexSpec *spec;
/*
 * f_win_get: return a pointer to the buffered bytes for the file range
 * [start_pos, end_pos) and store the number of bytes available in *size
 * (never more than end_pos - start_pos).
 *
 * Three cases are handled: the range is already inside the window (no
 * I/O); start_pos is outside the window (seek, then refill the buffer
 * from its beginning); start_pos is inside the window but the range
 * runs past its end (shift the remaining tail to the front of the
 * buffer and read more data behind it).
 */
275 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
278 int i, r, off = start_pos - spec->f_win_start;
/* requested range lies completely inside the current window */
280 if (off >= 0 && end_pos <= spec->f_win_end)
282 *size = end_pos - start_pos;
283 return spec->f_win_buf + off;
/* start is outside the window: reposition the file and refill */
285 if (off < 0 || start_pos >= spec->f_win_end)
287 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
288 spec->f_win_start = start_pos;
/* the window buffer is allocated lazily on first use */
290 if (!spec->f_win_buf)
291 spec->f_win_buf = xmalloc (spec->f_win_size);
292 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
294 spec->f_win_end = spec->f_win_start + *size;
296 if (*size > end_pos - start_pos)
297 *size = end_pos - start_pos;
298 return spec->f_win_buf;
/* start is inside the window: move the tail beginning at start_pos to
 * the front of the buffer, then read more data */
300 for (i = 0; i<spec->f_win_end - start_pos; i++)
301 spec->f_win_buf[i] = spec->f_win_buf[i + off];
302 r = (*spec->f_win_rf)(spec->f_win_fh,
304 spec->f_win_size - i);
305 spec->f_win_start = start_pos;
306 spec->f_win_end += r;
308 if (*size > end_pos - start_pos)
309 *size = end_pos - start_pos;
310 return spec->f_win_buf;
/*
 * f_win_advance: fetch the byte at position *pos. When *pos lies inside
 * the current window the byte is returned directly and *pos is
 * incremented. Otherwise *pos is compared with the F_WIN_EOF sentinel
 * and one more byte is requested through f_win_get.
 */
313 static int f_win_advance (struct lexSpec *spec, int *pos)
318 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
319 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
320 if (*pos == F_WIN_EOF)
322 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: operates on the regxCode object that *pp refers to.
 * Called from actionListDel for code actions; counterpart of regxCodeMk.
 */
332 static void regxCodeDel (struct regxCode **pp)
334 struct regxCode *p = *pp;
/*
 * regxCodeMk: allocate a regxCode object and give it its own copy of
 * the first len bytes of buf in p->str (len+1 bytes are allocated).
 */
343 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
347 p = xmalloc (sizeof(*p));
348 p->str = xmalloc (len+1);
349 memcpy (p->str, buf, len);
/*
 * lexSpecDFA: provide a DFA whose parser character map has been
 * adjusted for filter patterns: the entries for blank and tab are
 * deleted and an entry for '/' is added with value 0.
 */
354 static struct DFA *lexSpecDFA (void)
359 dfa_parse_cmap_del (dfa, ' ');
360 dfa_parse_cmap_del (dfa, '\t');
361 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: walk the action list starting at *rap and release
 * what each action owns: the DFA of a pattern action (dfa_delete) or
 * the code of a code action (regxCodeDel). ra1 carries the successor
 * across iterations.
 */
365 static void actionListDel (struct lexRuleAction **rap)
367 struct lexRuleAction *ra1, *ra;
369 for (ra = *rap; ra; ra = ra1)
375 dfa_delete (&ra->u.pattern.dfa);
378 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a context with a duplicated copy of name,
 * a DFA obtained from lexSpecDFA and empty begin/end/init action lists.
 */
386 static struct lexContext *lexContextCreate (const char *name)
388 struct lexContext *p = xmalloc (sizeof(*p));
390 p->name = xstrdup (name);
393 p->dfa = lexSpecDFA ();
396 p->beginActionList = NULL;
397 p->endActionList = NULL;
398 p->initActionList = NULL;
/*
 * lexContextDestroy: release a context. The action list of every rule
 * is deleted while walking the rule chain (rp1 holds the successor),
 * followed by the context's begin and end action lists.
 */
403 static void lexContextDestroy (struct lexContext *p)
405 struct lexRule *rp, *rp1;
408 for (rp = p->rules; rp; rp = rp1)
411 actionListDel (&rp->info.actionList);
414 actionListDel (&p->beginActionList);
415 actionListDel (&p->endActionList);
/*
 * lexSpecCreate: allocate a lexSpec carrying a private copy of name.
 * Sets up a context stack with room for 100 entries, maxLevel text
 * accumulation buffers (each starting out empty, buf == 0) and a
 * data1 node stack of maxLevel slots.
 */
420 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
425 p = xmalloc (sizeof(*p));
426 p->name = xmalloc (strlen(name)+1);
427 strcpy (p->name, name);
434 p->context_stack_size = 100;
435 p->context_stack = xmalloc (sizeof(*p->context_stack) *
436 p->context_stack_size);
440 p->concatBuf = xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
441 for (i = 0; i < p->maxLevel; i++)
443 p->concatBuf[i] = xmalloc (sizeof(**p->concatBuf));
444 p->concatBuf[i]->len = p->concatBuf[i]->max = 0;
445 p->concatBuf[i]->buf = 0;
447 p->d1_stack = xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: tear down the lexSpec referenced by *pp: the
 * per-level concat buffer objects and their array, every context in
 * the context list, the Tcl interpreter, the file window buffer and
 * the context stack.
 */
452 static void lexSpecDestroy (struct lexSpec **pp)
455 struct lexContext *lt;
/* NOTE(review): this loop frees the lexConcatBuf objects only; their
 * buf members (allocated with xmalloc in execData) do not appear to be
 * released here -- confirm whether that is a leak. */
463 for (i = 0; i < p->maxLevel; i++)
464 xfree (p->concatBuf[i]);
465 xfree (p->concatBuf);
/* the successor is saved before the context is destroyed */
470 struct lexContext *lt_next = lt->next;
471 lexContextDestroy (lt);
476 Tcl_DeleteInterp (p->tcl_interp);
479 xfree (p->f_win_buf);
480 xfree (p->context_stack);
/*
 * readParseToken: tokenizer for filter specification text at *cpp.
 * Leading blanks, tabs and newlines are skipped. Alphabetic command
 * words are collected in lower case into cmd (bounded by sizeof(cmd))
 * and the words "begin", "end", "body", "context" and "init" are
 * recognized; anything else is reported as a bad command. Unexpected
 * characters are reported and skipped up to the next white space.
 */
486 static int readParseToken (const char **cpp, int *len)
488 const char *cp = *cpp;
492 while (*cp == ' ' || *cp == '\t' || *cp == '\n')
521 if (*cp >= 'a' && *cp <= 'z')
523 else if (*cp >= 'A' && *cp <= 'Z')
/* fold upper case to lower case */
524 cmd[i] = *cp + 'a' - 'A';
527 if (i < sizeof(cmd)-2)
534 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
536 while (*cp && *cp != ' ' && *cp != '\t' && *cp != '\n')
542 if (!strcmp (cmd, "begin"))
544 else if (!strcmp (cmd, "end"))
546 else if (!strcmp (cmd, "body"))
548 else if (!strcmp (cmd, "context"))
550 else if (!strcmp (cmd, "init"))
554 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action part s of a rule into lexRuleAction
 * objects allocated at *ap. Tokens come from readParseToken. Code
 * tokens are stored through regxCodeMk; patterns get a DFA from
 * lexSpecDFA that is parsed with dfa_parse (errors are logged with the
 * offending text) and completed with dfa_mkstate, and they record the
 * current bodyMark. BEGIN and INIT are rejected with a warning.
 */
560 static int actionListMk (struct lexSpec *spec, const char *s,
561 struct lexRuleAction **ap)
567 while ((tok = readParseToken (&s, &len)))
575 *ap = xmalloc (sizeof(**ap));
577 regxCodeMk (&(*ap)->u.code, s, len);
581 *ap = xmalloc (sizeof(**ap));
583 (*ap)->u.pattern.body = bodyMark;
585 (*ap)->u.pattern.dfa = lexSpecDFA ();
587 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
592 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
595 dfa_mkstate ((*ap)->u.pattern.dfa);
599 logf (LOG_WARN, "cannot use BEGIN here");
602 logf (LOG_WARN, "cannot use INIT here");
605 *ap = xmalloc (sizeof(**ap));
/*
 * readOneSpec: process one specification entry s.
 *
 * - CONTEXT <name>: a new context is created from the name (copied
 *   into the local context_name buffer) and linked in front of
 *   spec->context; a missing name is reported.
 * - A context called "main" is created by this function as well
 *   (presumably when none exists yet -- the condition is not shown).
 * - BEGIN / END / INIT: the current context's corresponding action
 *   list is deleted and rebuilt from s with actionListMk.
 * - Otherwise the entry is a pattern rule: the pattern is added to the
 *   context DFA with dfa_parse, a lexRule with the next rule number is
 *   pushed onto the front of the context's rule list and its actions
 *   are parsed with actionListMk.
 */
615 int readOneSpec (struct lexSpec *spec, const char *s)
619 struct lexContext *lc;
621 tok = readParseToken (&s, &len);
622 if (tok == REGX_CONTEXT)
624 char context_name[32];
625 tok = readParseToken (&s, &len);
626 if (tok != REGX_CODE)
628 logf (LOG_WARN, "missing name after CONTEXT keyword");
633 memcpy (context_name, s, len);
634 context_name[len] = '\0';
635 lc = lexContextCreate (context_name);
636 lc->next = spec->context;
641 spec->context = lexContextCreate ("main");
646 actionListDel (&spec->context->beginActionList);
647 actionListMk (spec, s, &spec->context->beginActionList);
650 actionListDel (&spec->context->endActionList);
651 actionListMk (spec, s, &spec->context->endActionList);
654 actionListDel (&spec->context->initActionList);
655 actionListMk (spec, s, &spec->context->initActionList);
659 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
661 r = dfa_parse (spec->context->dfa, &s);
664 logf (LOG_WARN, "regular expression error. r=%d", r);
669 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* new rule: numbered consecutively, linked at the head of the list */
673 rp = xmalloc (sizeof(*rp));
674 rp->info.no = spec->context->ruleNo++;
675 rp->next = spec->context->rules;
676 spec->context->rules = rp;
677 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: load the filter file "<spec->name>.flt", located via
 * yaz_path_fopen on the data1 tab path. The file is read character by
 * character into lineBuf; assembled entries are handed to readOneSpec
 * and spec->lineNo is advanced by the lines consumed. Afterwards, for
 * every context, the fastRule table (one slot per rule number,
 * pointing at the rule's info) is built and the context DFA is
 * completed with dfa_mkstate.
 */
682 int readFileSpec (struct lexSpec *spec)
684 struct lexContext *lc;
687 int c, i, errors = 0;
690 lineBuf = xmalloc (1+lineSize);
691 logf (LOG_LOG, "reading regx filter %s.flt", spec->name);
/* lineBuf doubles as scratch space for the file name */
692 sprintf (lineBuf, "%s.flt", spec->name);
693 if (!(spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh),
696 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
705 if (c == '#' || c == '\n' || c == ' ' || c == '\t')
707 while (c != '\n' && c != EOF)
726 if (c != ' ' && c != '\t')
735 readOneSpec (spec, lineBuf);
736 spec->lineNo += addLine;
745 debug_dfa_followpos = 1;
748 for (lc = spec->context; lc; lc = lc->next)
/* table for direct lookup of a rule by its number */
751 lc->fastRule = xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
752 for (i = 0; i < lc->ruleNo; i++)
753 lc->fastRule[i] = NULL;
754 for (rp = lc->rules; rp; rp = rp->next)
755 lc->fastRule[rp->info.no] = &rp->info;
756 dfa_mkstate (lc->dfa);
/* File-scope lexSpec pointer, initially NULL.
 * NOTE(review): grs_read_regx and grs_read_tcl declare local variables
 * of the same name that shadow this one -- confirm it is still used. */
765 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add elen bytes of text (ebuf) to the data node at the
 * current level of the data1 stack.
 *
 * If the node on top of the stack is already a DATA1N_data node its
 * length is taken as org_len, so the new text is appended. Otherwise
 * a fresh text data node is created under the parent, linked into the
 * tree and put on the stack. The text itself is accumulated in
 * concatBuf[d1_level], which is reallocated (org_len + elen + 256
 * bytes) when too small; the node's length grows by elen.
 * formatted_text is stored in newly created nodes.
 */
768 static void execData (struct lexSpec *spec,
769 const char *ebuf, int elen, int formatted_text)
771 struct data1_node *res, *parent;
774 if (elen == 0) /* shouldn't happen, but it does! */
/* long data is logged as its first and last 15 bytes */
778 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
779 ebuf, 15, ebuf + elen-15);
781 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
783 logf (LOG_DEBUG, "data (%d bytes)", elen);
786 if (spec->d1_level <= 1)
789 parent = spec->d1_stack[spec->d1_level -1];
/* an existing data node on top of the stack means: append */
792 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
793 org_len = res->u.data.len;
798 res = data1_mk_node (spec->dh, spec->m);
799 res->parent = parent;
800 res->which = DATA1N_data;
801 res->u.data.what = DATA1I_text;
803 res->u.data.formatted_text = formatted_text;
805 if (elen > DATA1_LOCALDATA)
806 res->u.data.data = nmem_malloc (spec->m, elen);
808 res->u.data.data = res->lbuf;
809 memcpy (res->u.data.data, ebuf, elen);
811 res->u.data.data = 0;
813 res->root = parent->root;
815 parent->last_child = res;
816 if (spec->d1_stack[spec->d1_level])
817 spec->d1_stack[spec->d1_level]->next = res;
820 spec->d1_stack[spec->d1_level] = res;
/* grow the accumulation buffer for this level when it is too small */
822 if (org_len + elen >= spec->concatBuf[spec->d1_level]->max)
824 char *old_buf, *new_buf;
826 spec->concatBuf[spec->d1_level]->max = org_len + elen + 256;
827 new_buf = xmalloc (spec->concatBuf[spec->d1_level]->max);
828 if ((old_buf = spec->concatBuf[spec->d1_level]->buf))
830 memcpy (new_buf, old_buf, org_len);
833 spec->concatBuf[spec->d1_level]->buf = new_buf;
835 assert (spec->concatBuf[spec->d1_level]);
836 memcpy (spec->concatBuf[spec->d1_level]->buf + org_len, ebuf, elen);
837 res->u.data.len += elen;
/* execDataP: forwards its arguments unchanged to execData. */
840 static void execDataP (struct lexSpec *spec,
841 const char *ebuf, int elen, int formatted_text)
843 execData (spec, ebuf, elen, formatted_text);
/*
 * tagDataRelease: finalize the text node on top of the data1 stack, if
 * there is one. The text accumulated in concatBuf[d1_level] is copied
 * into node storage: nmem memory when it is longer than
 * DATA1_LOCALDATA, otherwise the node's own lbuf. The asserts require
 * that the node has no data pointer yet and a positive length.
 */
846 static void tagDataRelease (struct lexSpec *spec)
850 if ((res = spec->d1_stack[spec->d1_level]) &&
851 res->which == DATA1N_data &&
852 res->u.data.what == DATA1I_text)
854 assert (!res->u.data.data);
855 assert (res->u.data.len > 0);
856 if (res->u.data.len > DATA1_LOCALDATA)
857 res->u.data.data = nmem_malloc (spec->m, res->u.data.len);
859 res->u.data.data = res->lbuf;
860 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level]->buf,
/*
 * variantBegin: open a variant node below the current parent.
 *
 * class_str/type_str are copied into local buffers, truncated to
 * DATA1_MAX_SYMBOL-1 characters, and used to look up the variant type
 * with data1_getvartypebyct. When the parent is not itself a variant
 * node, an empty variant node (type 0, value 0) is inserted and pushed
 * first. The enclosing variant levels are then scanned for a node of
 * the same type, and finally a variant node with the looked-up type is
 * created; its value is value_str truncated to DATA1_LOCALDATA-1
 * characters and stored in the node's lbuf. Pending text at the
 * current level is released before a sibling is linked.
 *
 * NOTE(review): parent is initialized from d1_stack[d1_level - 1]
 * before the d1_level == 0 check below, so the stack is indexed with
 * -1 when no record has been started -- confirm and consider moving
 * the read after the check.
 */
865 static void variantBegin (struct lexSpec *spec,
866 const char *class_str, int class_len,
867 const char *type_str, int type_len,
868 const char *value_str, int value_len)
870 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
871 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
876 if (spec->d1_level == 0)
878 logf (LOG_WARN, "in variant begin. No record type defined");
/* bounded copies of class and type names */
881 if (class_len >= DATA1_MAX_SYMBOL)
882 class_len = DATA1_MAX_SYMBOL-1;
883 memcpy (tclass, class_str, class_len);
884 tclass[class_len] = '\0';
886 if (type_len >= DATA1_MAX_SYMBOL)
887 type_len = DATA1_MAX_SYMBOL-1;
888 memcpy (ttype, type_str, type_len);
889 ttype[type_len] = '\0';
892 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
897 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* parent is not a variant: insert an empty variant node first */
901 if (parent->which != DATA1N_variant)
903 res = data1_mk_node (spec->dh, spec->m);
904 res->parent = parent;
905 res->which = DATA1N_variant;
906 res->u.variant.type = 0;
907 res->u.variant.value = 0;
908 res->root = parent->root;
910 parent->last_child = res;
911 if (spec->d1_stack[spec->d1_level])
913 tagDataRelease (spec);
914 spec->d1_stack[spec->d1_level]->next = res;
918 spec->d1_stack[spec->d1_level] = res;
919 spec->d1_stack[++(spec->d1_level)] = NULL;
/* look through the enclosing variant nodes for one of the same type */
921 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
922 if (spec->d1_stack[i]->u.variant.type == tp)
929 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
/* create the variant node proper */
931 parent = spec->d1_stack[spec->d1_level-1];
932 res = data1_mk_node (spec->dh, spec->m);
933 res->parent = parent;
934 res->which = DATA1N_variant;
935 res->root = parent->root;
936 res->u.variant.type = tp;
938 if (value_len >= DATA1_LOCALDATA)
939 value_len =DATA1_LOCALDATA-1;
940 memcpy (res->lbuf, value_str, value_len);
941 res->lbuf[value_len] = '\0';
943 res->u.variant.value = res->lbuf;
945 parent->last_child = res;
946 if (spec->d1_stack[spec->d1_level])
948 tagDataRelease (spec);
949 spec->d1_stack[spec->d1_level]->next = res;
953 spec->d1_stack[spec->d1_level] = res;
954 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: trim white space around the tag described by (*tag, *len).
 * The first loop scans backwards over trailing white space, the second
 * scans forwards over leading white space.
 * NOTE(review): isspace is called with a plain char; for bytes >= 0x80
 * on platforms where char is signed the argument should be cast to
 * unsigned char -- confirm.
 */
957 static void tagStrip (const char **tag, int *len)
961 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
964 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element (tag node) named by tag/len below the
 * current parent and push it on the data1 stack.
 *
 * The tag is stripped of surrounding white space and copied into the
 * node: into nmem memory when len >= DATA1_LOCALDATA, otherwise into
 * the node's lbuf. The element definition is looked up by tag name
 * with data1_getelementbytagname against the root's abstract syntax.
 * Pending text at the current level is released before the new node is
 * linked as a sibling; otherwise it becomes the parent's child.
 *
 * NOTE(review): parent and partag are initialized from
 * d1_stack[d1_level - 1] before the d1_level == 0 check below, so the
 * stack is indexed with -1 (and get_parent_tag is called on that
 * value) when no record has been started -- confirm.
 */
970 static void tagBegin (struct lexSpec *spec,
971 const char *tag, int len)
973 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
974 data1_element *elem = NULL;
975 data1_node *partag = get_parent_tag(spec->dh, parent);
977 data1_element *e = NULL;
980 if (spec->d1_level == 0)
982 logf (LOG_WARN, "in element begin. No record type defined");
985 tagStrip (&tag, &len);
987 res = data1_mk_node (spec->dh, spec->m);
988 res->parent = parent;
989 res->which = DATA1N_tag;
990 res->u.tag.get_bytes = -1;
/* long tag names do not fit into the node's local buffer */
992 if (len >= DATA1_LOCALDATA)
993 res->u.tag.tag = nmem_malloc (spec->m, len+1);
995 res->u.tag.tag = res->lbuf;
997 memcpy (res->u.tag.tag, tag, len);
998 res->u.tag.tag[len] = '\0';
1001 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1003 if (parent->which == DATA1N_variant)
1006 if (!(e = partag->u.tag.element))
1009 elem = data1_getelementbytagname (spec->dh,
1010 spec->d1_stack[0]->u.root.absyn,
1012 res->u.tag.element = elem;
1013 res->u.tag.node_selected = 0;
1014 res->u.tag.make_variantlist = 0;
1015 res->u.tag.no_data_requested = 0;
1016 res->root = parent->root;
1018 parent->last_child = res;
1019 if (spec->d1_stack[spec->d1_level])
1021 tagDataRelease (spec);
1022 spec->d1_stack[spec->d1_level]->next = res;
1025 parent->child = res;
/* push: the new node becomes current, the next level starts empty */
1026 spec->d1_stack[spec->d1_level] = res;
1027 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagEnd: close elements. The tag name is stripped of surrounding
 * white space; then, while the current level is above min_level,
 * pending text is released and the stack is unwound. A tag node whose
 * name equals tag (same length and bytes) is tested for inside the
 * loop.
 */
1030 static void tagEnd (struct lexSpec *spec, int min_level,
1031 const char *tag, int len)
1033 tagStrip (&tag, &len);
1034 while (spec->d1_level > min_level)
1036 tagDataRelease (spec);
1038 if (spec->d1_level == 0)
1040 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1042 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1044 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1048 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
/*
 * tryMatch: run dfa over the input starting at *pptr. Characters are
 * fetched with f_win_advance and the state is advanced along the
 * transition whose character range contains the character; the most
 * recent rule number seen is remembered in last_rule. On a match *mptr
 * receives the position where the match starts and *pptr the position
 * just behind it.
 */
1053 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1056 struct DFA_state *state = dfa->states[0];
1059 unsigned char c_prev = 0;
1060 int ptr = *pptr; /* current pointer */
1061 int start_ptr = *pptr; /* first char of match */
1062 int last_ptr = 0; /* last char of match */
1063 int last_rule = 0; /* rule number of current match */
1068 c = f_win_advance (spec, &ptr);
1069 if (ptr == F_WIN_EOF)
1086 *mptr = start_ptr; /* match starts here */
1087 *pptr = last_ptr; /* match end here (+1) */
/* restart from the initial state */
1090 state = dfa->states[0];
1095 else if (c >= t->ch[0] && c <= t->ch[1])
1097 state = dfa->states[t->to];
1102 last_rule = state->rule_no;
1107 last_rule = state->rule_nno;
/*
 * execTok: fetch the next token from the code string at *src and
 * report it through *tokBuf / *tokLen.
 *
 * Leading blanks and tabs are skipped. Recognized forms:
 *  - $N  : reference to match argument N; its text is fetched from the
 *          file window with f_win_get (N is checked against arg_no)
 *  - "..." : quoted string
 *  - newline or ';' : separator
 *  - anything else : a word ending at blank, tab, newline or ';'
 */
1119 static int execTok (struct lexSpec *spec, const char **src,
1120 const char **tokBuf, int *tokLen)
1122 const char *s = *src;
1124 while (*s == ' ' || *s == '\t')
1128 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
/* decimal argument number */
1132 while (*s >= '0' && *s <= '9')
1133 n = n*10 + (*s++ -'0');
1134 if (spec->arg_no == 0)
1141 if (n >= spec->arg_no)
1143 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1147 else if (*s == '\"')
1150 while (*s && *s != '\"')
1152 *tokLen = s - *tokBuf;
1157 else if (*s == '\n' || *s == ';')
1165 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1167 *tokLen = s - *tokBuf;
1174 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1176 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy len bytes from src into the caller-supplied buffer
 * str. Callers in execCode pass a local scratch buffer and compare the
 * returned string with strcmp.
 */
1182 static char *regxStrz (const char *src, int len, char *str)
1186 memcpy (str, src, len);
/*
 * cmd_tcl_begin: implementation of the Tcl command "begin"
 * (clientData is the lexSpec). Sub-commands, selected by argv[1]:
 *   record <absyn>                 (argc == 3) create the root node
 *   element <tag>                  (argc == 3) tagBegin
 *   variant <class> <type> <value> (argc == 5) variantBegin
 *   context <name>                 (argc == 3) push the named context
 */
1192 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1193 int argc, char **argv)
1195 struct lexSpec *spec = clientData;
1198 if (!strcmp(argv[1], "record") && argc == 3)
1200 char *absynName = argv[2];
1204 logf (LOG_DEBUG, "begin record %s", absynName);
1206 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1207 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1212 res = data1_mk_node (spec->dh, spec->m);
1213 res->which = DATA1N_root;
/* NOTE(review): absynName is argv[2]; the node keeps this pointer, so
 * it must outlive the Tcl command invocation -- confirm the lifetime
 * (execCode uses a static buffer for the same field). */
1214 res->u.root.type = absynName;
1215 res->u.root.absyn = absyn;
1218 spec->d1_stack[spec->d1_level] = res;
1219 spec->d1_stack[++(spec->d1_level)] = NULL;
1222 else if (!strcmp(argv[1], "element") && argc == 3)
1224 tagBegin (spec, argv[2], strlen(argv[2]));
1226 else if (!strcmp (argv[1], "variant") && argc == 5)
1228 variantBegin (spec, argv[2], strlen(argv[2]),
1229 argv[3], strlen(argv[3]),
1230 argv[4], strlen(argv[4]));
1232 else if (!strcmp (argv[1], "context") && argc == 3)
1234 struct lexContext *lc = spec->context;
1236 logf (LOG_DEBUG, "begin context %s",argv[2]);
/* search the context list by name */
1238 while (lc && strcmp (argv[2], lc->name))
/* NOTE(review): no check of context_stack_top against
 * context_stack_size is visible before this push -- confirm. */
1242 spec->context_stack[++(spec->context_stack_top)] = lc;
1245 logf (LOG_WARN, "unknown context %s", argv[2]);
/*
 * cmd_tcl_end: implementation of the Tcl command "end" (clientData is
 * the lexSpec). Sub-commands, selected by argv[1]:
 *   record  - release pending text on all open levels and set
 *             stop_flag
 *   element - close elements through tagEnd; the option "-record" is
 *             recognized in argv[2], and stop_flag is set on the path
 *             that follows the d1_level == 0 test
 *   context - pop the context stack unless it is already at the bottom
 */
1252 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1253 int argc, char **argv)
1255 struct lexSpec *spec = clientData;
1259 if (!strcmp (argv[1], "record"))
1261 while (spec->d1_level)
1263 tagDataRelease (spec);
1267 logf (LOG_DEBUG, "end record");
1269 spec->stop_flag = 1;
1271 else if (!strcmp (argv[1], "element"))
1275 if (!strcmp(argv[2], "-record"))
/* element may be NULL, in which case a length of 0 is passed */
1286 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1287 if (spec->d1_level == 0)
1290 logf (LOG_DEBUG, "end element end records");
1292 spec->stop_flag = 1;
1295 else if (!strcmp (argv[1], "context"))
1298 logf (LOG_DEBUG, "end context");
1300 if (spec->context_stack_top)
1301 (spec->context_stack_top)--;
/*
 * cmd_tcl_data: implementation of the Tcl command "data" (clientData
 * is the lexSpec). Options "-text" and "-element <name>" are
 * recognized while scanning argv. When an element was given, the data
 * is wrapped: tagBegin(element), execData(argument), tagEnd back to
 * level 1.
 */
1308 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1309 int argc, char **argv)
1313 const char *element = 0;
1314 struct lexSpec *spec = clientData;
1318 if (!strcmp("-text", argv[argi]))
1323 else if (!strcmp("-element", argv[argi]))
1327 element = argv[argi++];
1333 tagBegin (spec, element, strlen(element));
1337 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1341 tagEnd (spec, 1, NULL, 0);
/*
 * cmd_tcl_unread: implementation of the Tcl command "unread"
 * (clientData is the lexSpec). Repositions the read pointer spec->ptr
 * to the start of match argument <no>, plus the offset given with
 * "-offset <n>". An index beyond the last argument is clamped to
 * arg_no - 1.
 */
1345 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1346 int argc, char **argv)
1348 struct lexSpec *spec = clientData;
1355 if (!strcmp("-offset", argv[argi]))
1360 offset = atoi(argv[argi]);
/* NOTE(review): atoi reports no errors, and a negative value is not
 * rejected before it is used as an index into arg_start -- confirm. */
1369 no = atoi(argv[argi]);
1370 if (no >= spec->arg_no)
1371 no = spec->arg_no - 1;
1372 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run a code action through the Tcl interpreter. Each match
 * argument i is first exported as the Tcl variable named by its decimal
 * index ("0", "1", ...): its text is fetched from the file window and
 * temporarily NUL-terminated in place for Tcl_SetVar, after which the
 * overwritten byte is restored. Then code->str is evaluated.
 */
1376 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1379 for (i = 0; i < spec->arg_no; i++)
1381 char var_name[10], *var_buf;
1384 sprintf (var_name, "%d", i);
1385 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* save the byte behind the argument, terminate, set, restore */
1389 ch = var_buf[var_len];
1390 var_buf[var_len] = '\0';
1391 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1392 var_buf[var_len] = ch;
1395 Tcl_Eval (spec->tcl_interp, code->str);
/*
 * execCode: built-in interpreter for code actions (used when no Tcl
 * interpreter is present, see execAction). code->str is split into
 * tokens with execTok and the following commands are understood:
 *
 *   begin record <absyn> | element <tag> |
 *         variant <class> <type> <value> | context <name>
 *   end   record | element [-record] [<tag>] | context
 *   data  [-text] [-element <name>] <item> ...
 *   unread [-offset <n>] <index>
 *   context <name>      (replaces the context on top of the stack)
 *
 * Unknown commands and stray tokens are logged and skipped.
 */
1400 static void execCode (struct lexSpec *spec, struct regxCode *code)
1402 const char *s = code->str;
1404 const char *cmd_str;
1406 r = execTok (spec, &s, &cmd_str, &cmd_len);
1413 r = execTok (spec, &s, &cmd_str, &cmd_len);
1416 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1417 if (!strcmp (p, "begin"))
1419 r = execTok (spec, &s, &cmd_str, &cmd_len);
1422 logf (LOG_WARN, "missing keyword after 'begin'");
1425 p = regxStrz (cmd_str, cmd_len, ptmp);
1426 if (!strcmp (p, "record"))
1428 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* a root node is only created when no record is open */
1431 if (spec->d1_level == 0)
/* static: the root node keeps a pointer to this name */
1433 static char absynName[64];
1438 memcpy (absynName, cmd_str, cmd_len);
1439 absynName[cmd_len] = '\0';
1442 logf (LOG_DEBUG, "begin record %s", absynName);
1444 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1445 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1450 res = data1_mk_node (spec->dh, spec->m);
1451 res->which = DATA1N_root;
1452 res->u.root.type = absynName;
1453 res->u.root.absyn = absyn;
1456 spec->d1_stack[spec->d1_level] = res;
1457 spec->d1_stack[++(spec->d1_level)] = NULL;
1460 r = execTok (spec, &s, &cmd_str, &cmd_len);
1462 else if (!strcmp (p, "element"))
1464 r = execTok (spec, &s, &cmd_str, &cmd_len);
1467 tagBegin (spec, cmd_str, cmd_len);
1468 r = execTok (spec, &s, &cmd_str, &cmd_len);
1470 else if (!strcmp (p, "variant"))
1473 const char *class_str = NULL;
1475 const char *type_str = NULL;
1477 const char *value_str = NULL;
/* three operands: class, type, value */
1478 r = execTok (spec, &s, &cmd_str, &cmd_len);
1481 class_str = cmd_str;
1482 class_len = cmd_len;
1483 r = execTok (spec, &s, &cmd_str, &cmd_len);
1489 r = execTok (spec, &s, &cmd_str, &cmd_len);
1492 value_str = cmd_str;
1493 value_len = cmd_len;
1495 variantBegin (spec, class_str, class_len,
1496 type_str, type_len, value_str, value_len);
1499 r = execTok (spec, &s, &cmd_str, &cmd_len);
1501 else if (!strcmp (p, "context"))
1505 struct lexContext *lc = spec->context;
1506 r = execTok (spec, &s, &cmd_str, &cmd_len);
1507 p = regxStrz (cmd_str, cmd_len, ptmp);
1509 logf (LOG_DEBUG, "begin context %s", p);
/* find the context by name, then push it */
1511 while (lc && strcmp (p, lc->name))
1514 spec->context_stack[++(spec->context_stack_top)] = lc;
1516 logf (LOG_WARN, "unknown context %s", p);
1519 r = execTok (spec, &s, &cmd_str, &cmd_len);
1523 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1526 else if (!strcmp (p, "end"))
1528 r = execTok (spec, &s, &cmd_str, &cmd_len);
1531 logf (LOG_WARN, "missing keyword after 'end'");
1534 p = regxStrz (cmd_str, cmd_len, ptmp);
1535 if (!strcmp (p, "record"))
/* release pending text on every open level */
1537 while (spec->d1_level)
1539 tagDataRelease (spec);
1542 r = execTok (spec, &s, &cmd_str, &cmd_len);
1544 logf (LOG_DEBUG, "end record");
1546 spec->stop_flag = 1;
1548 else if (!strcmp (p, "element"))
/* option tokens are consumed first */
1551 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1553 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1558 tagEnd (spec, min_level, cmd_str, cmd_len);
1559 r = execTok (spec, &s, &cmd_str, &cmd_len);
1562 tagEnd (spec, min_level, NULL, 0);
1563 if (spec->d1_level == 0)
1566 logf (LOG_DEBUG, "end element end records");
1568 spec->stop_flag = 1;
1572 else if (!strcmp (p, "context"))
1575 logf (LOG_DEBUG, "end context");
/* pop, but never below the bottom entry */
1577 if (spec->context_stack_top)
1578 (spec->context_stack_top)--;
1579 r = execTok (spec, &s, &cmd_str, &cmd_len);
1582 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data ... ---- */
1584 else if (!strcmp (p, "data"))
1588 const char *element_str = NULL;
1590 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1592 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1594 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1596 r = execTok (spec, &s, &element_str, &element_len);
1601 logf (LOG_WARN, "bad data option: %.*s",
1606 logf (LOG_WARN, "missing data item after data");
1610 tagBegin (spec, element_str, element_len);
1613 execData (spec, cmd_str, cmd_len,textFlag);
1614 r = execTok (spec, &s, &cmd_str, &cmd_len);
1617 tagEnd (spec, 1, NULL, 0);
/* ---- unread ... ---- */
1619 else if (!strcmp (p, "unread"))
1622 r = execTok (spec, &s, &cmd_str, &cmd_len);
1623 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1625 r = execTok (spec, &s, &cmd_str, &cmd_len);
1628 logf (LOG_WARN, "missing number after -offset");
1631 p = regxStrz (cmd_str, cmd_len, ptmp);
1633 r = execTok (spec, &s, &cmd_str, &cmd_len);
1639 logf (LOG_WARN, "missing index after unread command");
/* the index must be a single decimal digit */
1642 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1644 logf (LOG_WARN, "bad index after unread command");
1649 no = *cmd_str - '0';
1650 if (no >= spec->arg_no)
1651 no = spec->arg_no - 1;
1652 spec->ptr = spec->arg_start[no] + offset;
1654 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ---- context <name>: replace the top of the context stack ---- */
1656 else if (!strcmp (p, "context"))
1660 struct lexContext *lc = spec->context;
1661 r = execTok (spec, &s, &cmd_str, &cmd_len);
1662 p = regxStrz (cmd_str, cmd_len, ptmp);
1664 while (lc && strcmp (p, lc->name))
1667 spec->context_stack[spec->context_stack_top] = lc;
1669 logf (LOG_WARN, "unknown context %s", p);
1672 r = execTok (spec, &s, &cmd_str, &cmd_len);
1676 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1677 r = execTok (spec, &s, &cmd_str, &cmd_len);
1682 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1684 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: execute the action list ap for a match that started at
 * start_ptr; *pptr is the current read position and is advanced by
 * pattern actions.
 *
 * The local arrays arg_start/arg_end record the file spans of the
 * match arguments (argument 0 starts at start_ptr) and are published
 * through spec->arg_start / spec->arg_end. Pattern actions call
 * tryMatch; with the body flag set, the span skipped before the match
 * and the match itself are recorded, and F_WIN_EOF is stored when the
 * pattern does not match. Code actions run through execTcl when a Tcl
 * interpreter exists, otherwise through execCode. spec->stop_flag is
 * tested after running code.
 */
1691 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1692 int start_ptr, int *pptr)
1701 arg_start[0] = start_ptr;
1703 spec->arg_start = arg_start;
1704 spec->arg_end = arg_end;
1711 if (ap->u.pattern.body)
1713 arg_start[arg_no] = *pptr;
1714 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1716 arg_end[arg_no] = F_WIN_EOF;
1718 arg_start[arg_no] = F_WIN_EOF;
1719 arg_end[arg_no] = F_WIN_EOF;
/* body text ends where the match begins */
1724 arg_end[arg_no] = sptr;
1726 arg_start[arg_no] = sptr;
1727 arg_end[arg_no] = *pptr;
1732 arg_start[arg_no] = *pptr;
1733 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1735 if (sptr != arg_start[arg_no])
1737 arg_end[arg_no] = *pptr;
1742 spec->arg_no = arg_no;
1745 if (spec->tcl_interp)
1746 execTcl(spec, ap->u.code);
1748 execCode (spec, ap->u.code);
1750 execCode (spec, ap->u.code);
1753 if (spec->stop_flag)
1757 arg_start[arg_no] = *pptr;
1758 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: run the action list of rule number ruleNo of the given
 * context, looked up through the context's fastRule table, and return
 * the result of execAction.
 */
1767 static int execRule (struct lexSpec *spec, struct lexContext *context,
1768 int ruleNo, int start_ptr, int *pptr)
1771 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1773 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: main scanning loop. Starting at *ptr, input characters are
 * fed through the DFA of the context on top of the context stack.
 * Input that no rule matches (the run from skip_ptr up to the start of
 * a match) is emitted as plain data through execDataP; when a rule
 * matched, its actions are run with execRule. After a rule has run,
 * the end-of-record callback f_win_ef, if set, is called with the
 * current position, the current context is re-read from the stack and
 * scanning restarts from the DFA's initial state.
 */
1777 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1779 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1780 struct DFA_state *state = context->dfa->states[0];
1783 unsigned char c_prev = '\n';
1785 int last_rule = 0; /* rule number of current match */
1786 int last_ptr = *ptr; /* last char of match */
1787 int start_ptr = *ptr; /* first char of match */
1788 int skip_ptr = *ptr; /* first char of run */
1792 c = f_win_advance (spec, ptr);
1793 if (*ptr == F_WIN_EOF)
1795 /* end of file met */
1798 /* there was a match */
1799 if (skip_ptr < start_ptr)
1801 /* deal with chars that didn't match */
1804 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1805 execDataP (spec, buf, size, 0);
1807 /* restore pointer */
1810 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1812 /* restore skip pointer */
1816 else if (skip_ptr < *ptr)
1818 /* deal with chars that didn't match */
1821 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1822 execDataP (spec, buf, size, 0);
1824 if (*ptr == F_WIN_EOF)
1831 { /* no transition for character c ... */
1834 if (skip_ptr < start_ptr)
1836 /* deal with chars that didn't match */
1839 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1840 execDataP (spec, buf, size, 0);
1842 /* restore pointer */
1844 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1846 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1849 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1851 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* actions may have changed the context stack */
1855 context = spec->context_stack[spec->context_stack_top];
1858 last_ptr = start_ptr = *ptr;
1862 c_prev = f_win_advance (spec, &start_ptr);
1867 c_prev = f_win_advance (spec, &start_ptr);
1870 state = context->dfa->states[0];
1873 else if (c >= t->ch[0] && c <= t->ch[1])
1874 { /* transition ... */
1875 state = context->dfa->states[t->to];
1880 last_rule = state->rule_no;
1883 else if (state->rule_nno)
1885 last_rule = state->rule_nno;
/*
 * lexRoot: parse one record and return its root node.
 *
 * The context named context_name is looked up in the spec's context
 * list (a warning is logged when it cannot be found) and installed at
 * the bottom of the context stack; stop_flag and the data1 stack are
 * reset. The context's init and begin action lists are run, lexNode
 * scans the input, pending text on all still-open levels is released,
 * the end action list is run and d1_stack[0] is returned.
 */
1897 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1898 const char *context_name)
1900 struct lexContext *lt = spec->context;
1903 spec->stop_flag = 0;
1905 spec->context_stack_top = 0;
1908 if (!strcmp (lt->name, context_name))
1914 logf (LOG_WARN, "cannot find context %s", context_name);
1917 spec->context_stack[spec->context_stack_top] = lt;
1918 spec->d1_stack[spec->d1_level] = NULL;
1923 execAction (spec, lt->initActionList, ptr, &ptr);
1926 execAction (spec, lt->beginActionList, ptr, &ptr);
1927 lexNode (spec, &ptr);
/* close whatever the filter left open */
1928 while (spec->d1_level)
1930 tagDataRelease (spec);
1933 execAction (spec, lt->endActionList, ptr, &ptr);
1934 return spec->d1_stack[0];
/*
 * grs_destroy: release the filter state. clientData is the lexSpecs
 * object; the lexSpec it holds is destroyed with lexSpecDestroy.
 */
1937 void grs_destroy(void *clientData)
1939 struct lexSpecs *specs = clientData;
1942 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate the lexSpecs object that serves as the filter's
 * client data (see grs_read_regx, grs_read_tcl and grs_destroy).
 */
1947 void *grs_init(void)
1949 struct lexSpecs *specs = xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read one record with the regx filter.
 *
 * The lexSpec cached in the client data is reused as long as its name
 * equals p->type; otherwise it is destroyed, re-created and its filter
 * file is read with readFileSpec (the spec is destroyed again on the
 * path following that call, presumably when reading failed). The file
 * window is then reset and bound to the caller's read/seek/end
 * callbacks and file handle, the window size is set to 500000 bytes,
 * and parsing starts in the "main" context at p->offset.
 */
1954 data1_node *grs_read_regx (struct grs_read_info *p)
1957 struct lexSpecs *specs = p->clientData;
1958 struct lexSpec **curLexSpec = &specs->spec;
1961 logf (LOG_DEBUG, "grs_read_regx");
1963 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1966 lexSpecDestroy (curLexSpec);
1967 *curLexSpec = lexSpecCreate (p->type, p->dh);
1968 res = readFileSpec (*curLexSpec);
1971 lexSpecDestroy (curLexSpec);
1975 (*curLexSpec)->dh = p->dh;
/* empty window: the first access triggers a read */
1978 (*curLexSpec)->f_win_start = 0;
1979 (*curLexSpec)->f_win_end = 0;
1980 (*curLexSpec)->f_win_rf = p->readf;
1981 (*curLexSpec)->f_win_sf = p->seekf;
1982 (*curLexSpec)->f_win_fh = p->fh;
1983 (*curLexSpec)->f_win_ef = p->endf;
1984 (*curLexSpec)->f_win_size = 500000;
1986 (*curLexSpec)->m = p->mem;
1987 return lexRoot (*curLexSpec, p->offset, "main");
/* Record-type descriptor regx_type and the exported handle
 * recTypeGrs_regx, which holds the descriptor's address (same
 * arrangement as tcl_type / recTypeGrs_tcl). */
1990 static struct recTypeGrs regx_type = {
1997 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read one record with the Tcl variant of the filter.
 *
 * Works like grs_read_regx, except that a newly created lexSpec also
 * gets a Tcl interpreter in which the commands "begin", "end", "data"
 * and "unread" are registered with the lexSpec as client data; code
 * actions are then evaluated by Tcl (see execAction). The file window
 * is reset and bound to the caller's callbacks, and parsing starts in
 * the "main" context at p->offset.
 */
2000 data1_node *grs_read_tcl (struct grs_read_info *p)
2003 struct lexSpecs *specs = p->clientData;
2004 struct lexSpec **curLexSpec = &specs->spec;
2007 logf (LOG_DEBUG, "grs_read_tcl");
2009 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2011 Tcl_Interp *tcl_interp;
2013 lexSpecDestroy (curLexSpec);
2014 *curLexSpec = lexSpecCreate (p->type, p->dh);
2015 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
2016 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2017 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2018 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2019 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2021 res = readFileSpec (*curLexSpec);
2024 lexSpecDestroy (curLexSpec);
2028 (*curLexSpec)->dh = p->dh;
/* empty window: the first access triggers a read */
2031 (*curLexSpec)->f_win_start = 0;
2032 (*curLexSpec)->f_win_end = 0;
2033 (*curLexSpec)->f_win_rf = p->readf;
2034 (*curLexSpec)->f_win_sf = p->seekf;
2035 (*curLexSpec)->f_win_fh = p->fh;
2036 (*curLexSpec)->f_win_ef = p->endf;
2037 (*curLexSpec)->f_win_size = 500000;
2039 (*curLexSpec)->m = p->mem;
2040 return lexRoot (*curLexSpec, p->offset, "main");
/* Record-type descriptor tcl_type and the exported handle
 * recTypeGrs_tcl, which holds the descriptor's address. */
2043 static struct recTypeGrs tcl_type = {
2050 RecTypeGrs recTypeGrs_tcl = &tcl_type;