2 * Copyright (C) 1994-2002, Index Data
5 * $Id: regxread.c,v 1.40 2002-05-03 13:50:25 adam Exp $
12 #include <yaz/tpath.h>
20 #if MAJOR_VERSION >= 8
21 #define HAVE_TCL_OBJECTS
27 #define F_WIN_EOF 2000000000
31 #define REGX_PATTERN 1
36 #define REGX_CONTEXT 6
46 struct lexRuleAction {
50 struct DFA *dfa; /* REGX_PATTERN */
53 struct regxCode *code; /* REGX_CODE */
55 struct lexRuleAction *next;
60 struct lexRuleAction *actionList;
64 struct lexRuleInfo info;
71 struct lexRule *rules;
72 struct lexRuleInfo **fastRule;
76 struct lexRuleAction *beginActionList;
77 struct lexRuleAction *endActionList;
78 struct lexRuleAction *initActionList;
79 struct lexContext *next;
89 struct lexContext *context;
91 struct lexContext **context_stack;
92 int context_stack_size;
93 int context_stack_top;
99 Tcl_Interp *tcl_interp;
102 void (*f_win_ef)(void *, off_t);
104 int f_win_start; /* first byte of buffer is this file offset */
105 int f_win_end; /* last byte of buffer is this offset - 1 */
106 int f_win_size; /* size of buffer */
107 char *f_win_buf; /* buffer itself */
108 int (*f_win_rf)(void *, char *, size_t);
109 off_t (*f_win_sf)(void *, off_t);
111 struct lexConcatBuf *concatBuf;
113 data1_node **d1_stack;
124 struct lexSpec *spec;
127 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
130 int i, r, off = start_pos - spec->f_win_start;
132 if (off >= 0 && end_pos <= spec->f_win_end)
134 *size = end_pos - start_pos;
135 return spec->f_win_buf + off;
137 if (off < 0 || start_pos >= spec->f_win_end)
139 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
140 spec->f_win_start = start_pos;
142 if (!spec->f_win_buf)
143 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
144 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
146 spec->f_win_end = spec->f_win_start + *size;
148 if (*size > end_pos - start_pos)
149 *size = end_pos - start_pos;
150 return spec->f_win_buf;
152 for (i = 0; i<spec->f_win_end - start_pos; i++)
153 spec->f_win_buf[i] = spec->f_win_buf[i + off];
154 r = (*spec->f_win_rf)(spec->f_win_fh,
156 spec->f_win_size - i);
157 spec->f_win_start = start_pos;
158 spec->f_win_end += r;
160 if (*size > end_pos - start_pos)
161 *size = end_pos - start_pos;
162 return spec->f_win_buf;
165 static int f_win_advance (struct lexSpec *spec, int *pos)
170 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
171 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
172 if (*pos == F_WIN_EOF)
174 buf = f_win_get (spec, *pos, *pos+1, &size);
184 static void regxCodeDel (struct regxCode **pp)
186 struct regxCode *p = *pp;
191 Tcl_DecrRefCount (p->tcl_obj);
199 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
203 p = (struct regxCode *) xmalloc (sizeof(*p));
204 p->str = (char *) xmalloc (len+1);
205 memcpy (p->str, buf, len);
208 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
210 Tcl_IncrRefCount (p->tcl_obj);
215 static struct DFA *lexSpecDFA (void)
220 dfa_parse_cmap_del (dfa, ' ');
221 dfa_parse_cmap_del (dfa, '\t');
222 dfa_parse_cmap_add (dfa, '/', 0);
226 static void actionListDel (struct lexRuleAction **rap)
228 struct lexRuleAction *ra1, *ra;
230 for (ra = *rap; ra; ra = ra1)
236 dfa_delete (&ra->u.pattern.dfa);
239 regxCodeDel (&ra->u.code);
247 static struct lexContext *lexContextCreate (const char *name)
249 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
251 p->name = xstrdup (name);
254 p->dfa = lexSpecDFA ();
257 p->beginActionList = NULL;
258 p->endActionList = NULL;
259 p->initActionList = NULL;
264 static void lexContextDestroy (struct lexContext *p)
266 struct lexRule *rp, *rp1;
268 dfa_delete (&p->dfa);
270 for (rp = p->rules; rp; rp = rp1)
273 actionListDel (&rp->info.actionList);
276 actionListDel (&p->beginActionList);
277 actionListDel (&p->endActionList);
278 actionListDel (&p->initActionList);
283 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
288 p = (struct lexSpec *) xmalloc (sizeof(*p));
289 p->name = (char *) xmalloc (strlen(name)+1);
290 strcpy (p->name, name);
297 p->context_stack_size = 100;
298 p->context_stack = (struct lexContext **)
299 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
303 p->concatBuf = (struct lexConcatBuf *)
304 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
305 for (i = 0; i < p->maxLevel; i++)
307 p->concatBuf[i].max = 0;
308 p->concatBuf[i].buf = 0;
310 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
315 static void lexSpecDestroy (struct lexSpec **pp)
318 struct lexContext *lt;
326 for (i = 0; i < p->maxLevel; i++)
327 xfree (p->concatBuf[i].buf);
328 xfree (p->concatBuf);
333 struct lexContext *lt_next = lt->next;
334 lexContextDestroy (lt);
339 Tcl_DeleteInterp (p->tcl_interp);
342 xfree (p->f_win_buf);
343 xfree (p->context_stack);
349 static int readParseToken (const char **cpp, int *len)
351 const char *cp = *cpp;
355 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
384 if (*cp >= 'a' && *cp <= 'z')
386 else if (*cp >= 'A' && *cp <= 'Z')
387 cmd[i] = *cp + 'a' - 'A';
390 if (i < (int) sizeof(cmd)-2)
397 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
399 while (*cp && *cp != ' ' && *cp != '\t' &&
400 *cp != '\n' && *cp != '\r')
406 if (!strcmp (cmd, "begin"))
408 else if (!strcmp (cmd, "end"))
410 else if (!strcmp (cmd, "body"))
412 else if (!strcmp (cmd, "context"))
414 else if (!strcmp (cmd, "init"))
418 logf (LOG_WARN, "bad command %s", cmd);
424 static int actionListMk (struct lexSpec *spec, const char *s,
425 struct lexRuleAction **ap)
431 while ((tok = readParseToken (&s, &len)))
439 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
441 regxCodeMk (&(*ap)->u.code, s, len);
445 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
447 (*ap)->u.pattern.body = bodyMark;
449 (*ap)->u.pattern.dfa = lexSpecDFA ();
451 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
456 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
459 dfa_mkstate ((*ap)->u.pattern.dfa);
463 logf (LOG_WARN, "cannot use BEGIN here");
466 logf (LOG_WARN, "cannot use INIT here");
469 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
479 int readOneSpec (struct lexSpec *spec, const char *s)
483 struct lexContext *lc;
485 tok = readParseToken (&s, &len);
486 if (tok == REGX_CONTEXT)
488 char context_name[32];
489 tok = readParseToken (&s, &len);
490 if (tok != REGX_CODE)
492 logf (LOG_WARN, "missing name after CONTEXT keyword");
497 memcpy (context_name, s, len);
498 context_name[len] = '\0';
499 lc = lexContextCreate (context_name);
500 lc->next = spec->context;
505 spec->context = lexContextCreate ("main");
510 actionListDel (&spec->context->beginActionList);
511 actionListMk (spec, s, &spec->context->beginActionList);
514 actionListDel (&spec->context->endActionList);
515 actionListMk (spec, s, &spec->context->endActionList);
518 actionListDel (&spec->context->initActionList);
519 actionListMk (spec, s, &spec->context->initActionList);
523 logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s);
525 r = dfa_parse (spec->context->dfa, &s);
528 logf (LOG_WARN, "regular expression error. r=%d", r);
533 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
537 rp = (struct lexRule *) xmalloc (sizeof(*rp));
538 rp->info.no = spec->context->ruleNo++;
539 rp->next = spec->context->rules;
540 spec->context->rules = rp;
541 actionListMk (spec, s, &rp->info.actionList);
546 int readFileSpec (struct lexSpec *spec)
548 struct lexContext *lc;
549 int c, i, errors = 0;
555 if (spec->tcl_interp)
557 sprintf (fname, "%s.tflt", spec->name);
558 spec_inf = data1_path_fopen (spec->dh, fname, "r");
563 sprintf (fname, "%s.flt", spec->name);
564 spec_inf = data1_path_fopen (spec->dh, fname, "r");
568 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
571 logf (LOG_LOG, "reading regx filter %s", fname);
573 if (spec->tcl_interp)
574 logf (LOG_LOG, "Tcl enabled");
576 lineBuf = wrbuf_alloc();
581 wrbuf_rewind (lineBuf);
582 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
584 while (c != '\n' && c != EOF)
597 wrbuf_putc(lineBuf, c);
605 if (c != ' ' && c != '\t')
610 wrbuf_putc(lineBuf, '\0');
611 readOneSpec (spec, wrbuf_buf(lineBuf));
612 spec->lineNo += addLine;
616 wrbuf_free(lineBuf, 1);
621 debug_dfa_followpos = 1;
624 for (lc = spec->context; lc; lc = lc->next)
627 lc->fastRule = (struct lexRuleInfo **)
628 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
629 for (i = 0; i < lc->ruleNo; i++)
630 lc->fastRule[i] = NULL;
631 for (rp = lc->rules; rp; rp = rp->next)
632 lc->fastRule[rp->info.no] = &rp->info;
633 dfa_mkstate (lc->dfa);
642 static struct lexSpec *curLexSpec = NULL;
645 static void execData (struct lexSpec *spec,
646 const char *ebuf, int elen, int formatted_text)
648 struct data1_node *res, *parent;
651 if (elen == 0) /* shouldn't happen, but it does! */
655 logf (LOG_LOG, "data (%d bytes) %.15s ... %.*s", elen,
656 ebuf, 15, ebuf + elen-15);
658 logf (LOG_LOG, "data (%d bytes) %.*s", elen, elen, ebuf);
660 logf (LOG_LOG, "data (%d bytes)", elen);
663 if (spec->d1_level <= 1)
666 parent = spec->d1_stack[spec->d1_level -1];
669 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
670 org_len = res->u.data.len;
675 res = data1_mk_node (spec->dh, spec->m, DATA1N_data, parent);
676 res->u.data.what = DATA1I_text;
678 res->u.data.formatted_text = formatted_text;
679 res->u.data.data = 0;
681 if (spec->d1_stack[spec->d1_level])
682 spec->d1_stack[spec->d1_level]->next = res;
683 spec->d1_stack[spec->d1_level] = res;
685 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
687 char *old_buf, *new_buf;
689 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
690 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
691 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
693 memcpy (new_buf, old_buf, org_len);
696 spec->concatBuf[spec->d1_level].buf = new_buf;
698 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
699 res->u.data.len += elen;
702 static void execDataP (struct lexSpec *spec,
703 const char *ebuf, int elen, int formatted_text)
705 execData (spec, ebuf, elen, formatted_text);
708 static void tagDataRelease (struct lexSpec *spec)
712 if ((res = spec->d1_stack[spec->d1_level]) &&
713 res->which == DATA1N_data &&
714 res->u.data.what == DATA1I_text)
716 assert (!res->u.data.data);
717 assert (res->u.data.len > 0);
718 if (res->u.data.len > DATA1_LOCALDATA)
719 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
721 res->u.data.data = res->lbuf;
722 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
727 static void variantBegin (struct lexSpec *spec,
728 const char *class_str, int class_len,
729 const char *type_str, int type_len,
730 const char *value_str, int value_len)
732 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
733 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
738 if (spec->d1_level == 0)
740 logf (LOG_WARN, "in variant begin. No record type defined");
743 if (class_len >= DATA1_MAX_SYMBOL)
744 class_len = DATA1_MAX_SYMBOL-1;
745 memcpy (tclass, class_str, class_len);
746 tclass[class_len] = '\0';
748 if (type_len >= DATA1_MAX_SYMBOL)
749 type_len = DATA1_MAX_SYMBOL-1;
750 memcpy (ttype, type_str, type_len);
751 ttype[type_len] = '\0';
754 logf (LOG_LOG, "variant begin %s %s (%d)", tclass, ttype,
759 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
763 if (parent->which != DATA1N_variant)
765 res = data1_mk_node (spec->dh, spec->m, DATA1N_variant, parent);
766 if (spec->d1_stack[spec->d1_level])
767 tagDataRelease (spec);
768 spec->d1_stack[spec->d1_level] = res;
769 spec->d1_stack[++(spec->d1_level)] = NULL;
771 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
772 if (spec->d1_stack[i]->u.variant.type == tp)
779 logf (LOG_LOG, "variant node (%d)", spec->d1_level);
781 parent = spec->d1_stack[spec->d1_level-1];
782 res = data1_mk_node (spec->dh, spec->m, DATA1N_variant, parent);
783 res->u.variant.type = tp;
785 if (value_len >= DATA1_LOCALDATA)
786 value_len =DATA1_LOCALDATA-1;
787 memcpy (res->lbuf, value_str, value_len);
788 res->lbuf[value_len] = '\0';
790 res->u.variant.value = res->lbuf;
792 if (spec->d1_stack[spec->d1_level])
793 tagDataRelease (spec);
794 spec->d1_stack[spec->d1_level] = res;
795 spec->d1_stack[++(spec->d1_level)] = NULL;
798 static void tagStrip (const char **tag, int *len)
802 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
805 for (i = 0; i < *len && isspace((*tag)[i]); i++)
811 static void tagBegin (struct lexSpec *spec,
812 const char *tag, int len)
814 struct data1_node *parent;
815 data1_element *elem = NULL;
818 data1_element *e = NULL;
821 if (spec->d1_level == 0)
823 logf (LOG_WARN, "in element begin. No record type defined");
826 tagStrip (&tag, &len);
828 parent = spec->d1_stack[spec->d1_level -1];
829 partag = get_parent_tag(spec->dh, parent);
831 res = data1_mk_node (spec->dh, spec->m, DATA1N_tag, parent);
833 if (len >= DATA1_LOCALDATA)
834 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
836 res->u.tag.tag = res->lbuf;
838 memcpy (res->u.tag.tag, tag, len);
839 res->u.tag.tag[len] = '\0';
842 logf (LOG_LOG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
844 if (parent->which == DATA1N_variant)
847 if (!(e = partag->u.tag.element))
850 elem = data1_getelementbytagname (spec->dh,
851 spec->d1_stack[0]->u.root.absyn,
853 res->u.tag.element = elem;
855 if (spec->d1_stack[spec->d1_level])
856 tagDataRelease (spec);
857 spec->d1_stack[spec->d1_level] = res;
858 spec->d1_stack[++(spec->d1_level)] = NULL;
861 static void tagEnd (struct lexSpec *spec, int min_level,
862 const char *tag, int len)
864 tagStrip (&tag, &len);
865 while (spec->d1_level > min_level)
867 tagDataRelease (spec);
869 if (spec->d1_level == 0)
871 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
873 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
875 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
879 logf (LOG_LOG, "end tag (%d)", spec->d1_level);
884 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
887 struct DFA_state *state = dfa->states[0];
890 unsigned char c_prev = 0;
891 int ptr = *pptr; /* current pointer */
892 int start_ptr = *pptr; /* first char of match */
893 int last_ptr = 0; /* last char of match */
894 int last_rule = 0; /* rule number of current match */
899 c = f_win_advance (spec, &ptr);
900 if (ptr == F_WIN_EOF)
917 *mptr = start_ptr; /* match starts here */
918 *pptr = last_ptr; /* match end here (+1) */
921 state = dfa->states[0];
926 else if (c >= t->ch[0] && c <= t->ch[1])
928 state = dfa->states[t->to];
933 last_rule = state->rule_no;
938 last_rule = state->rule_nno;
950 static int execTok (struct lexSpec *spec, const char **src,
951 const char **tokBuf, int *tokLen)
953 const char *s = *src;
955 while (*s == ' ' || *s == '\t')
959 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
963 while (*s >= '0' && *s <= '9')
964 n = n*10 + (*s++ -'0');
965 if (spec->arg_no == 0)
972 if (n >= spec->arg_no)
974 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
981 while (*s && *s != '\"')
983 *tokLen = s - *tokBuf;
988 else if (*s == '\n' || *s == ';')
996 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
999 *tokLen = s - *tokBuf;
1006 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1009 *tokLen = s - *tokBuf;
1015 static char *regxStrz (const char *src, int len, char *str)
1019 memcpy (str, src, len);
1025 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1026 int argc, char **argv)
1028 struct lexSpec *spec = (struct lexSpec *) clientData;
1031 if (!strcmp(argv[1], "record") && argc == 3)
1033 char *absynName = argv[2];
1038 logf (LOG_LOG, "begin record %s", absynName);
1040 absyn = data1_get_absyn (spec->dh, absynName);
1042 res = data1_mk_node (spec->dh, spec->m);
1043 res->which = DATA1N_root;
1045 data1_insert_string(spec->dh, res, spec->m, absynName);
1046 res->u.root.absyn = absyn;
1049 spec->d1_stack[spec->d1_level] = res;
1050 spec->d1_stack[++(spec->d1_level)] = NULL;
1052 else if (!strcmp(argv[1], "element") && argc == 3)
1054 tagBegin (spec, argv[2], strlen(argv[2]));
1056 else if (!strcmp (argv[1], "variant") && argc == 5)
1058 variantBegin (spec, argv[2], strlen(argv[2]),
1059 argv[3], strlen(argv[3]),
1060 argv[4], strlen(argv[4]));
1062 else if (!strcmp (argv[1], "context") && argc == 3)
1064 struct lexContext *lc = spec->context;
1066 logf (LOG_LOG, "begin context %s",argv[2]);
1068 while (lc && strcmp (argv[2], lc->name))
1072 spec->context_stack[++(spec->context_stack_top)] = lc;
1075 logf (LOG_WARN, "unknown context %s", argv[2]);
1082 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1083 int argc, char **argv)
1085 struct lexSpec *spec = (struct lexSpec *) clientData;
1089 if (!strcmp (argv[1], "record"))
1091 while (spec->d1_level)
1093 tagDataRelease (spec);
1097 logf (LOG_LOG, "end record");
1099 spec->stop_flag = 1;
1101 else if (!strcmp (argv[1], "element"))
1105 if (argc >= 3 && !strcmp(argv[2], "-record"))
1114 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1115 if (spec->d1_level == 0)
1118 logf (LOG_LOG, "end element end records");
1120 spec->stop_flag = 1;
1123 else if (!strcmp (argv[1], "context"))
1126 logf (LOG_LOG, "end context");
1128 if (spec->context_stack_top)
1129 (spec->context_stack_top)--;
1136 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1137 int argc, char **argv)
1141 const char *element = 0;
1142 struct lexSpec *spec = (struct lexSpec *) clientData;
1146 if (!strcmp("-text", argv[argi]))
1151 else if (!strcmp("-element", argv[argi]))
1155 element = argv[argi++];
1161 tagBegin (spec, element, strlen(element));
1165 #if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
1167 char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
1168 execData (spec, native, strlen(native), textFlag);
1169 Tcl_DStringFree (&ds);
1171 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1176 tagEnd (spec, 1, NULL, 0);
1180 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1181 int argc, char **argv)
1183 struct lexSpec *spec = (struct lexSpec *) clientData;
1190 if (!strcmp("-offset", argv[argi]))
1195 offset = atoi(argv[argi]);
1204 no = atoi(argv[argi]);
1205 if (no >= spec->arg_no)
1206 no = spec->arg_no - 1;
1207 spec->ptr = spec->arg_start[no] + offset;
1211 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1215 for (i = 0; i < spec->arg_no; i++)
1217 char var_name[10], *var_buf;
1220 sprintf (var_name, "%d", i);
1221 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
1225 ch = var_buf[var_len];
1226 var_buf[var_len] = '\0';
1227 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1228 var_buf[var_len] = ch;
1231 #if HAVE_TCL_OBJECTS
1232 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1234 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1238 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1239 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1240 spec->tcl_interp->errorLine,
1241 spec->tcl_interp->result,
1242 err ? err : "[NO ERRORINFO]");
1248 static void execCode (struct lexSpec *spec, struct regxCode *code)
1250 const char *s = code->str;
1252 const char *cmd_str;
1254 r = execTok (spec, &s, &cmd_str, &cmd_len);
1261 r = execTok (spec, &s, &cmd_str, &cmd_len);
1264 p = regxStrz (cmd_str, cmd_len, ptmp);
1265 if (!strcmp (p, "begin"))
1267 r = execTok (spec, &s, &cmd_str, &cmd_len);
1270 logf (LOG_WARN, "missing keyword after 'begin'");
1273 p = regxStrz (cmd_str, cmd_len, ptmp);
1274 if (!strcmp (p, "record"))
1276 r = execTok (spec, &s, &cmd_str, &cmd_len);
1279 if (spec->d1_level == 0)
1281 static char absynName[64];
1287 memcpy (absynName, cmd_str, cmd_len);
1288 absynName[cmd_len] = '\0';
1291 logf (LOG_LOG, "begin record %s", absynName);
1293 absyn = data1_get_absyn (spec->dh, absynName);
1295 res = data1_mk_node (spec->dh, spec->m, DATA1N_root, 0);
1296 res->u.root.type = absynName;
1297 res->u.root.absyn = absyn;
1299 spec->d1_stack[spec->d1_level] = res;
1300 spec->d1_stack[++(spec->d1_level)] = NULL;
1302 r = execTok (spec, &s, &cmd_str, &cmd_len);
1304 else if (!strcmp (p, "element"))
1306 r = execTok (spec, &s, &cmd_str, &cmd_len);
1309 tagBegin (spec, cmd_str, cmd_len);
1310 r = execTok (spec, &s, &cmd_str, &cmd_len);
1312 else if (!strcmp (p, "variant"))
1315 const char *class_str = NULL;
1317 const char *type_str = NULL;
1319 const char *value_str = NULL;
1320 r = execTok (spec, &s, &cmd_str, &cmd_len);
1323 class_str = cmd_str;
1324 class_len = cmd_len;
1325 r = execTok (spec, &s, &cmd_str, &cmd_len);
1331 r = execTok (spec, &s, &cmd_str, &cmd_len);
1334 value_str = cmd_str;
1335 value_len = cmd_len;
1337 variantBegin (spec, class_str, class_len,
1338 type_str, type_len, value_str, value_len);
1341 r = execTok (spec, &s, &cmd_str, &cmd_len);
1343 else if (!strcmp (p, "context"))
1347 struct lexContext *lc = spec->context;
1348 r = execTok (spec, &s, &cmd_str, &cmd_len);
1349 p = regxStrz (cmd_str, cmd_len, ptmp);
1351 logf (LOG_LOG, "begin context %s", p);
1353 while (lc && strcmp (p, lc->name))
1356 spec->context_stack[++(spec->context_stack_top)] = lc;
1358 logf (LOG_WARN, "unknown context %s", p);
1361 r = execTok (spec, &s, &cmd_str, &cmd_len);
1365 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1368 else if (!strcmp (p, "end"))
1370 r = execTok (spec, &s, &cmd_str, &cmd_len);
1373 logf (LOG_WARN, "missing keyword after 'end'");
1376 p = regxStrz (cmd_str, cmd_len, ptmp);
1377 if (!strcmp (p, "record"))
1379 while (spec->d1_level)
1381 tagDataRelease (spec);
1384 r = execTok (spec, &s, &cmd_str, &cmd_len);
1386 logf (LOG_LOG, "end record");
1388 spec->stop_flag = 1;
1390 else if (!strcmp (p, "element"))
1393 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1395 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1400 tagEnd (spec, min_level, cmd_str, cmd_len);
1401 r = execTok (spec, &s, &cmd_str, &cmd_len);
1404 tagEnd (spec, min_level, NULL, 0);
1405 if (spec->d1_level == 0)
1408 logf (LOG_LOG, "end element end records");
1410 spec->stop_flag = 1;
1414 else if (!strcmp (p, "context"))
1417 logf (LOG_LOG, "end context");
1419 if (spec->context_stack_top)
1420 (spec->context_stack_top)--;
1421 r = execTok (spec, &s, &cmd_str, &cmd_len);
1424 logf (LOG_WARN, "bad keyword '%s' after end", p);
1426 else if (!strcmp (p, "data"))
1430 const char *element_str = NULL;
1432 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1434 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1436 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1438 r = execTok (spec, &s, &element_str, &element_len);
1443 logf (LOG_WARN, "bad data option: %.*s",
1448 logf (LOG_WARN, "missing data item after data");
1452 tagBegin (spec, element_str, element_len);
1455 execData (spec, cmd_str, cmd_len,textFlag);
1456 r = execTok (spec, &s, &cmd_str, &cmd_len);
1459 tagEnd (spec, 1, NULL, 0);
1461 else if (!strcmp (p, "unread"))
1464 r = execTok (spec, &s, &cmd_str, &cmd_len);
1465 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1467 r = execTok (spec, &s, &cmd_str, &cmd_len);
1470 logf (LOG_WARN, "missing number after -offset");
1473 p = regxStrz (cmd_str, cmd_len, ptmp);
1475 r = execTok (spec, &s, &cmd_str, &cmd_len);
1481 logf (LOG_WARN, "missing index after unread command");
1484 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1486 logf (LOG_WARN, "bad index after unread command");
1491 no = *cmd_str - '0';
1492 if (no >= spec->arg_no)
1493 no = spec->arg_no - 1;
1494 spec->ptr = spec->arg_start[no] + offset;
1496 r = execTok (spec, &s, &cmd_str, &cmd_len);
1498 else if (!strcmp (p, "context"))
1502 struct lexContext *lc = spec->context;
1503 r = execTok (spec, &s, &cmd_str, &cmd_len);
1504 p = regxStrz (cmd_str, cmd_len, ptmp);
1506 while (lc && strcmp (p, lc->name))
1509 spec->context_stack[spec->context_stack_top] = lc;
1511 logf (LOG_WARN, "unknown context %s", p);
1514 r = execTok (spec, &s, &cmd_str, &cmd_len);
1518 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1519 r = execTok (spec, &s, &cmd_str, &cmd_len);
1524 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1526 r = execTok (spec, &s, &cmd_str, &cmd_len);
1533 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1534 int start_ptr, int *pptr)
1543 arg_start[0] = start_ptr;
1545 spec->arg_start = arg_start;
1546 spec->arg_end = arg_end;
1553 if (ap->u.pattern.body)
1555 arg_start[arg_no] = *pptr;
1556 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1558 arg_end[arg_no] = F_WIN_EOF;
1560 arg_start[arg_no] = F_WIN_EOF;
1561 arg_end[arg_no] = F_WIN_EOF;
1566 arg_end[arg_no] = sptr;
1568 arg_start[arg_no] = sptr;
1569 arg_end[arg_no] = *pptr;
1574 arg_start[arg_no] = *pptr;
1575 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1577 if (sptr != arg_start[arg_no])
1579 arg_end[arg_no] = *pptr;
1584 spec->arg_no = arg_no;
1587 if (spec->tcl_interp)
1588 execTcl(spec, ap->u.code);
1590 execCode (spec, ap->u.code);
1592 execCode (spec, ap->u.code);
1595 if (spec->stop_flag)
1599 arg_start[arg_no] = *pptr;
1600 arg_end[arg_no] = F_WIN_EOF;
1609 static int execRule (struct lexSpec *spec, struct lexContext *context,
1610 int ruleNo, int start_ptr, int *pptr)
1613 logf (LOG_LOG, "exec rule %d", ruleNo);
1615 return execAction (spec, context->fastRule[ruleNo]->actionList,
1619 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1621 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1622 struct DFA_state *state = context->dfa->states[0];
1625 unsigned char c_prev = '\n';
1627 int last_rule = 0; /* rule number of current match */
1628 int last_ptr = *ptr; /* last char of match */
1629 int start_ptr = *ptr; /* first char of match */
1630 int skip_ptr = *ptr; /* first char of run */
1634 c = f_win_advance (spec, ptr);
1635 if (*ptr == F_WIN_EOF)
1637 /* end of file met */
1640 /* there was a match */
1641 if (skip_ptr < start_ptr)
1643 /* deal with chars that didn't match */
1646 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1647 execDataP (spec, buf, size, 0);
1649 /* restore pointer */
1652 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1654 /* restore skip pointer */
1658 else if (skip_ptr < *ptr)
1660 /* deal with chars that didn't match */
1663 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1664 execDataP (spec, buf, size, 0);
1666 if (*ptr == F_WIN_EOF)
1673 { /* no transition for character c ... */
1676 if (skip_ptr < start_ptr)
1678 /* deal with chars that didn't match */
1681 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1682 execDataP (spec, buf, size, 0);
1684 /* restore pointer */
1686 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1688 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1691 logf (LOG_LOG, "regx: endf ptr=%d", *ptr);
1693 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
1697 context = spec->context_stack[spec->context_stack_top];
1700 last_ptr = start_ptr = *ptr;
1704 c_prev = f_win_advance (spec, &start_ptr);
1709 c_prev = f_win_advance (spec, &start_ptr);
1712 state = context->dfa->states[0];
1715 else if (c >= t->ch[0] && c <= t->ch[1])
1716 { /* transition ... */
1717 state = context->dfa->states[t->to];
1722 last_rule = state->rule_no;
1725 else if (state->rule_nno)
1727 last_rule = state->rule_nno;
1739 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1740 const char *context_name)
1742 struct lexContext *lt = spec->context;
1745 spec->stop_flag = 0;
1747 spec->context_stack_top = 0;
1750 if (!strcmp (lt->name, context_name))
1756 logf (LOG_WARN, "cannot find context %s", context_name);
1759 spec->context_stack[spec->context_stack_top] = lt;
1760 spec->d1_stack[spec->d1_level] = NULL;
1765 execAction (spec, lt->initActionList, ptr, &ptr);
1768 execAction (spec, lt->beginActionList, ptr, &ptr);
1769 lexNode (spec, &ptr);
1770 while (spec->d1_level)
1772 tagDataRelease (spec);
1775 execAction (spec, lt->endActionList, ptr, &ptr);
1776 return spec->d1_stack[0];
1779 void grs_destroy(void *clientData)
1781 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1784 lexSpecDestroy(&specs->spec);
1789 void *grs_init(void)
1791 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
1796 data1_node *grs_read_regx (struct grs_read_info *p)
1799 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1800 struct lexSpec **curLexSpec = &specs->spec;
1803 logf (LOG_LOG, "grs_read_regx");
1805 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1808 lexSpecDestroy (curLexSpec);
1809 *curLexSpec = lexSpecCreate (p->type, p->dh);
1810 res = readFileSpec (*curLexSpec);
1813 lexSpecDestroy (curLexSpec);
1817 (*curLexSpec)->dh = p->dh;
1820 (*curLexSpec)->f_win_start = 0;
1821 (*curLexSpec)->f_win_end = 0;
1822 (*curLexSpec)->f_win_rf = p->readf;
1823 (*curLexSpec)->f_win_sf = p->seekf;
1824 (*curLexSpec)->f_win_fh = p->fh;
1825 (*curLexSpec)->f_win_ef = p->endf;
1826 (*curLexSpec)->f_win_size = 500000;
1828 (*curLexSpec)->m = p->mem;
1829 return lexRoot (*curLexSpec, p->offset, "main");
1832 static struct recTypeGrs regx_type = {
1839 RecTypeGrs recTypeGrs_regx = &regx_type;
1842 data1_node *grs_read_tcl (struct grs_read_info *p)
1845 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1846 struct lexSpec **curLexSpec = &specs->spec;
1849 logf (LOG_LOG, "grs_read_tcl");
1851 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1853 Tcl_Interp *tcl_interp;
1855 lexSpecDestroy (curLexSpec);
1856 *curLexSpec = lexSpecCreate (p->type, p->dh);
1857 Tcl_FindExecutable("");
1858 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
1859 Tcl_Init(tcl_interp);
1860 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
1861 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
1862 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
1863 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
1865 res = readFileSpec (*curLexSpec);
1868 lexSpecDestroy (curLexSpec);
1872 (*curLexSpec)->dh = p->dh;
1875 (*curLexSpec)->f_win_start = 0;
1876 (*curLexSpec)->f_win_end = 0;
1877 (*curLexSpec)->f_win_rf = p->readf;
1878 (*curLexSpec)->f_win_sf = p->seekf;
1879 (*curLexSpec)->f_win_fh = p->fh;
1880 (*curLexSpec)->f_win_ef = p->endf;
1881 (*curLexSpec)->f_win_size = 500000;
1883 (*curLexSpec)->m = p->mem;
1884 return lexRoot (*curLexSpec, p->offset, "main");
1887 static struct recTypeGrs tcl_type = {
1894 RecTypeGrs recTypeGrs_tcl = &tcl_type;