2 * Copyright (C) 1994-2002, Index Data
5 * $Id: regxread.c,v 1.42 2002-05-07 11:05:20 adam Exp $
12 #include <yaz/tpath.h>
/* NOTE(review): MAJOR_VERSION is not defined in the visible source, and a later
 * conditional in this file tests TCL_MAJOR_VERSION instead -- confirm which
 * macro is intended here. */
20 #if MAJOR_VERSION >= 8
21 #define HAVE_TCL_OBJECTS
/* Sentinel position: read positions are compared with it to detect end of input. */
27 #define F_WIN_EOF 2000000000
/* Token/action kinds; readOneSpec() checks readParseToken()'s result against
 * REGX_CONTEXT. */
31 #define REGX_PATTERN 1
36 #define REGX_CONTEXT 6
/* One action of a rule.  Elsewhere in this file the payload is reached as
 * u.pattern.dfa (pattern actions) or u.code (code actions). */
46 struct lexRuleAction {
50 struct DFA *dfa; /* REGX_PATTERN */
53 struct regxCode *code; /* REGX_CODE */
/* pointer to another action; presumably the list link -- confirm */
55 struct lexRuleAction *next;
/* Action list of a rule (used as rp->info.actionList and fastRule[n]->actionList). */
60 struct lexRuleAction *actionList;
/* Per-rule info, accessed through struct lexRule pointers as rp->info. */
64 struct lexRuleInfo info;
/* The members below are accessed through struct lexContext pointers in this file. */
71 struct lexRule *rules;
/* array indexed by rule number; filled in by readFileSpec() */
72 struct lexRuleInfo **fastRule;
76 struct lexRuleAction *beginActionList;
77 struct lexRuleAction *endActionList;
78 struct lexRuleAction *initActionList;
79 struct lexContext *next;
/* The members below are accessed through struct lexSpec pointers in this file. */
89 struct lexContext *context;
/* stack of contexts; context_stack[context_stack_top] is the one in use */
91 struct lexContext **context_stack;
92 int context_stack_size;
93 int context_stack_top;
99 Tcl_Interp *tcl_interp;
/* callbacks and state of the buffered read window used by f_win_get() */
102 void (*f_win_ef)(void *, off_t);
104 int f_win_start; /* first byte of buffer is this file offset */
105 int f_win_end; /* last byte of buffer is this offset - 1 */
106 int f_win_size; /* size of buffer */
107 char *f_win_buf; /* buffer itself */
108 int (*f_win_rf)(void *, char *, size_t);
109 off_t (*f_win_sf)(void *, off_t);
/* both arrays are allocated with maxLevel entries and indexed by d1_level */
111 struct lexConcatBuf *concatBuf;
113 data1_node **d1_stack;
/* Member read as specs->spec through struct lexSpecs pointers. */
124 struct lexSpec *spec;
/* Return a pointer to buffered input for the file range start_pos..end_pos and
 * store the number of usable bytes in *size (clamped to end_pos - start_pos).
 * Three cases are visible: the range is already in the window, the start lies
 * outside the window (seek and refill), or the window is slid down and
 * topped up with a further read. */
127 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
/* NOTE(review): off is an int computed from off_t values -- check for
 * truncation with large offsets. */
130 int i, r, off = start_pos - spec->f_win_start;
/* range already buffered: hand back a pointer into the window */
132 if (off >= 0 && end_pos <= spec->f_win_end)
134 *size = end_pos - start_pos;
135 return spec->f_win_buf + off;
/* start is outside the window: reposition the source and refill from there */
137 if (off < 0 || start_pos >= spec->f_win_end)
139 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
140 spec->f_win_start = start_pos;
/* the window buffer is allocated on first use */
142 if (!spec->f_win_buf)
143 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
144 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
146 spec->f_win_end = spec->f_win_start + *size;
148 if (*size > end_pos - start_pos)
149 *size = end_pos - start_pos;
150 return spec->f_win_buf;
/* move the bytes from start_pos onward to the front of the buffer ... */
152 for (i = 0; i<spec->f_win_end - start_pos; i++)
153 spec->f_win_buf[i] = spec->f_win_buf[i + off];
/* ... then read more data for the space left after them */
154 r = (*spec->f_win_rf)(spec->f_win_fh,
156 spec->f_win_size - i);
157 spec->f_win_start = start_pos;
158 spec->f_win_end += r;
160 if (*size > end_pos - start_pos)
161 *size = end_pos - start_pos;
162 return spec->f_win_buf;
/* Return the byte at *pos and step *pos forward when the position lies inside
 * the buffered window.  Otherwise *pos is tested against the F_WIN_EOF
 * sentinel before one byte is requested through f_win_get(). */
165 static int f_win_advance (struct lexSpec *spec, int *pos)
170 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
171 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
172 if (*pos == F_WIN_EOF)
174 buf = f_win_get (spec, *pos, *pos+1, &size);
/* Tear down the regxCode referenced by *pp; the visible code drops one
 * reference on its Tcl object. */
184 static void regxCodeDel (struct regxCode **pp)
186 struct regxCode *p = *pp;
191 Tcl_DecrRefCount (p->tcl_obj);
/* Allocate a regxCode for len bytes of buf: the text is copied into a freshly
 * allocated len+1 byte string, and a Tcl string object is created from the
 * same bytes and given one reference. */
199 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
203 p = (struct regxCode *) xmalloc (sizeof(*p));
204 p->str = (char *) xmalloc (len+1);
205 memcpy (p->str, buf, len);
208 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
210 Tcl_IncrRefCount (p->tcl_obj);
/* Produce a DFA whose parser character map has been adjusted: the entries for
 * space and tab are deleted and an entry for '/' with value 0 is added. */
215 static struct DFA *lexSpecDFA (void)
220 dfa_parse_cmap_del (dfa, ' ');
221 dfa_parse_cmap_del (dfa, '\t');
222 dfa_parse_cmap_add (dfa, '/', 0);
/* Walk the action list starting at *rap and release each action's payload:
 * dfa_delete() for a pattern's DFA, regxCodeDel() for code. */
226 static void actionListDel (struct lexRuleAction **rap)
228 struct lexRuleAction *ra1, *ra;
/* ra1 carries the successor so the loop can continue after ra is handled */
230 for (ra = *rap; ra; ra = ra1)
236 dfa_delete (&ra->u.pattern.dfa);
239 regxCodeDel (&ra->u.code);
/* Allocate a lexContext with a duplicated copy of name, a DFA obtained from
 * lexSpecDFA(), and empty begin/end/init action lists. */
247 static struct lexContext *lexContextCreate (const char *name)
249 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
251 p->name = xstrdup (name);
254 p->dfa = lexSpecDFA ();
257 p->beginActionList = NULL;
258 p->endActionList = NULL;
259 p->initActionList = NULL;
/* Release what a context owns: its DFA, the action list of every rule, and
 * the begin/end/init action lists. */
264 static void lexContextDestroy (struct lexContext *p)
266 struct lexRule *rp, *rp1;
268 dfa_delete (&p->dfa);
/* rp1 carries the successor while rp is being processed */
270 for (rp = p->rules; rp; rp = rp1)
273 actionListDel (&rp->info.actionList);
276 actionListDel (&p->beginActionList);
277 actionListDel (&p->endActionList);
278 actionListDel (&p->initActionList);
/* Allocate a lexSpec with its own copy of name, a 100-entry context stack,
 * and per-level concatenation buffers and node stack sized by maxLevel. */
283 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
288 p = (struct lexSpec *) xmalloc (sizeof(*p));
289 p->name = (char *) xmalloc (strlen(name)+1);
290 strcpy (p->name, name);
297 p->context_stack_size = 100;
298 p->context_stack = (struct lexContext **)
299 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
303 p->concatBuf = (struct lexConcatBuf *)
304 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
/* every concatenation buffer starts out empty and unallocated */
305 for (i = 0; i < p->maxLevel; i++)
307 p->concatBuf[i].max = 0;
308 p->concatBuf[i].buf = 0;
310 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/* Release a lexSpec: the per-level concatenation buffers, each lexContext,
 * the Tcl interpreter, the read window buffer and the context stack. */
315 static void lexSpecDestroy (struct lexSpec **pp)
318 struct lexContext *lt;
326 for (i = 0; i < p->maxLevel; i++)
327 xfree (p->concatBuf[i].buf);
328 xfree (p->concatBuf);
/* the successor is saved before the context itself is destroyed */
333 struct lexContext *lt_next = lt->next;
334 lexContextDestroy (lt);
339 Tcl_DeleteInterp (p->tcl_interp);
342 xfree (p->f_win_buf);
343 xfree (p->context_stack);
/* Scan the next token of a filter specification starting at *cpp.  Leading
 * blanks, tabs and line breaks are skipped; command words are collected in
 * lower case and matched against begin/end/body/context/init.  Unknown
 * characters and commands are reported with LOG_WARN. */
349 static int readParseToken (const char **cpp, int *len)
351 const char *cp = *cpp;
355 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
384 if (*cp >= 'a' && *cp <= 'z')
/* upper-case letters are folded to lower case */
386 else if (*cp >= 'A' && *cp <= 'Z')
387 cmd[i] = *cp + 'a' - 'A';
/* guard against running past the end of cmd */
390 if (i < (int) sizeof(cmd)-2)
397 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
/* skip ahead to the next whitespace character */
399 while (*cp && *cp != ' ' && *cp != '\t' &&
400 *cp != '\n' && *cp != '\r')
406 if (!strcmp (cmd, "begin"))
408 else if (!strcmp (cmd, "end"))
410 else if (!strcmp (cmd, "body"))
412 else if (!strcmp (cmd, "context"))
414 else if (!strcmp (cmd, "init"))
418 logf (LOG_WARN, "bad command %s", cmd);
/* Parse the action part of a rule from s, allocating one lexRuleAction per
 * token: code tokens go through regxCodeMk(), patterns are compiled into a
 * DFA of their own.  BEGIN and INIT are rejected here with a warning. */
424 static int actionListMk (struct lexSpec *spec, const char *s,
425 struct lexRuleAction **ap)
431 while ((tok = readParseToken (&s, &len)))
439 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
441 regxCodeMk (&(*ap)->u.code, s, len);
445 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
447 (*ap)->u.pattern.body = bodyMark;
449 (*ap)->u.pattern.dfa = lexSpecDFA ();
451 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
/* NOTE(review): the precision for %.*s must be an int; s-s0 looks like a
 * pointer difference -- confirm and cast if so. */
456 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
459 dfa_mkstate ((*ap)->u.pattern.dfa);
463 logf (LOG_WARN, "cannot use BEGIN here");
466 logf (LOG_WARN, "cannot use INIT here");
469 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/* Process one logical line of a filter specification.  A CONTEXT keyword
 * creates a named context and links it in front of spec->context; the visible
 * code also replaces the current context's begin/end/init action lists and
 * compiles a pattern into the context DFA, prepending a new numbered rule. */
479 int readOneSpec (struct lexSpec *spec, const char *s)
483 struct lexContext *lc;
485 tok = readParseToken (&s, &len);
486 if (tok == REGX_CONTEXT)
488 char context_name[32];
489 tok = readParseToken (&s, &len);
490 if (tok != REGX_CODE)
492 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): len must stay below sizeof(context_name); a clamp is not
 * visible in this excerpt -- confirm it exists. */
497 memcpy (context_name, s, len);
498 context_name[len] = '\0';
499 lc = lexContextCreate (context_name);
500 lc->next = spec->context;
/* a context named "main" is created here */
505 spec->context = lexContextCreate ("main");
/* any previous list is released before the replacement is parsed */
510 actionListDel (&spec->context->beginActionList);
511 actionListMk (spec, s, &spec->context->beginActionList);
514 actionListDel (&spec->context->endActionList);
515 actionListMk (spec, s, &spec->context->endActionList);
518 actionListDel (&spec->context->initActionList);
519 actionListMk (spec, s, &spec->context->initActionList);
523 logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s);
525 r = dfa_parse (spec->context->dfa, &s);
528 logf (LOG_WARN, "regular expression error. r=%d", r);
533 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* the new rule takes the next rule number and goes to the head of the list */
537 rp = (struct lexRule *) xmalloc (sizeof(*rp));
538 rp->info.no = spec->context->ruleNo++;
539 rp->next = spec->context->rules;
540 spec->context->rules = rp;
541 actionListMk (spec, s, &rp->info.actionList);
/* Load the filter specification for spec->name.  The file "<name>.tflt" is
 * opened when a Tcl interpreter is attached and "<name>.flt" is also tried;
 * the text is gathered into logical lines handed to readOneSpec(), after
 * which each context gets its fastRule table and a finished DFA. */
546 int readFileSpec (struct lexSpec *spec)
548 struct lexContext *lc;
549 int c, i, errors = 0;
555 if (spec->tcl_interp)
/* NOTE(review): unbounded sprintf into fname, whose size is not visible here;
 * confirm it cannot overflow for long names. */
557 sprintf (fname, "%s.tflt", spec->name);
558 spec_inf = data1_path_fopen (spec->dh, fname, "r");
563 sprintf (fname, "%s.flt", spec->name);
564 spec_inf = data1_path_fopen (spec->dh, fname, "r");
/* the message reports spec->name rather than the file name that was tried */
568 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
571 logf (LOG_LOG, "reading regx filter %s", fname);
573 if (spec->tcl_interp)
574 logf (LOG_LOG, "Tcl enabled");
576 lineBuf = wrbuf_alloc();
581 wrbuf_rewind (lineBuf);
/* lines starting with '#', whitespace or a line break are consumed up to the
 * end of the line */
582 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
584 while (c != '\n' && c != EOF)
597 wrbuf_putc(lineBuf, c);
605 if (c != ' ' && c != '\t')
/* terminate the collected text and parse it as one specification */
610 wrbuf_putc(lineBuf, '\0');
611 readOneSpec (spec, wrbuf_buf(lineBuf));
612 spec->lineNo += addLine;
616 wrbuf_free(lineBuf, 1);
621 debug_dfa_followpos = 1;
/* build, per context, an array mapping rule number to rule info */
624 for (lc = spec->context; lc; lc = lc->next)
627 lc->fastRule = (struct lexRuleInfo **)
628 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
629 for (i = 0; i < lc->ruleNo; i++)
630 lc->fastRule[i] = NULL;
631 for (rp = lc->rules; rp; rp = rp->next)
632 lc->fastRule[rp->info.no] = &rp->info;
633 dfa_mkstate (lc->dfa);
/* File-scope pointer, initially NULL.  grs_read_regx() and grs_read_tcl()
 * declare a local variable with the same name, which hides this one there. */
642 static struct lexSpec *curLexSpec = NULL;
/* Add elen bytes of text from ebuf to the data node at the current level.
 * The bytes are accumulated in the level's concatenation buffer, which is
 * enlarged when needed; the node's data pointer stays 0 at this stage and
 * only its length is updated. */
645 static void execData (struct lexSpec *spec,
646 const char *ebuf, int elen, int formatted_text)
648 struct data1_node *res, *parent;
651 if (elen == 0) /* shouldn't happen, but it does! */
/* logging variants: first and last 15 bytes, the whole text, or size only */
655 logf (LOG_LOG, "data (%d bytes) %.15s ... %.*s", elen,
656 ebuf, 15, ebuf + elen-15);
658 logf (LOG_LOG, "data (%d bytes) %.*s", elen, elen, ebuf);
660 logf (LOG_LOG, "data (%d bytes)", elen);
663 if (spec->d1_level <= 1)
666 parent = spec->d1_stack[spec->d1_level -1];
/* an existing data node at this level supplies the length already stored */
669 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
670 org_len = res->u.data.len;
/* creation of a text data node under parent */
675 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_data, parent);
676 res->u.data.what = DATA1I_text;
678 res->u.data.formatted_text = formatted_text;
679 res->u.data.data = 0;
/* the node becomes the current one at this level, linked after any previous */
681 if (spec->d1_stack[spec->d1_level])
682 spec->d1_stack[spec->d1_level]->next = res;
683 spec->d1_stack[spec->d1_level] = res;
/* enlarge the buffer, with 256 bytes of slack, when the text would not fit */
685 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
687 char *old_buf, *new_buf;
689 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
690 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
691 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
693 memcpy (new_buf, old_buf, org_len);
696 spec->concatBuf[spec->d1_level].buf = new_buf;
/* append the new bytes after the existing org_len bytes */
698 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
699 res->u.data.len += elen;
/* Wrapper that passes its arguments to execData(). */
702 static void execDataP (struct lexSpec *spec,
703 const char *ebuf, int elen, int formatted_text)
705 execData (spec, ebuf, elen, formatted_text);
/* Finalize the text data node at the current level, if there is one: its data
 * pointer is set to the node's local buffer, or to nmem storage when the
 * length exceeds DATA1_LOCALDATA, and the accumulated text is copied in from
 * the level's concatenation buffer. */
708 static void tagDataRelease (struct lexSpec *spec)
712 if ((res = spec->d1_stack[spec->d1_level]) &&
713 res->which == DATA1N_data &&
714 res->u.data.what == DATA1I_text)
/* the node must not have been finalized already and must hold some text */
716 assert (!res->u.data.data);
717 assert (res->u.data.len > 0);
718 if (res->u.data.len > DATA1_LOCALDATA)
719 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
721 res->u.data.data = res->lbuf;
722 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
/* Open a variant node.  Class and type names are copied into local buffers
 * (truncated to DATA1_MAX_SYMBOL-1 characters) and used to look up the variant
 * type; the value is copied into the new node's local buffer (truncated to
 * DATA1_LOCALDATA-1 characters).  The new node is pushed on the node stack. */
727 static void variantBegin (struct lexSpec *spec,
728 const char *class_str, int class_len,
729 const char *type_str, int type_len,
730 const char *value_str, int value_len)
/* NOTE(review): this initializer indexes d1_level - 1 before the
 * d1_level == 0 check below has run. */
732 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
733 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
738 if (spec->d1_level == 0)
740 logf (LOG_WARN, "in variant begin. No record type defined");
743 if (class_len >= DATA1_MAX_SYMBOL)
744 class_len = DATA1_MAX_SYMBOL-1;
745 memcpy (tclass, class_str, class_len);
746 tclass[class_len] = '\0';
748 if (type_len >= DATA1_MAX_SYMBOL)
749 type_len = DATA1_MAX_SYMBOL-1;
750 memcpy (ttype, type_str, type_len);
751 ttype[type_len] = '\0';
754 logf (LOG_LOG, "variant begin %s %s (%d)", tclass, ttype,
759 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* when the parent is not a variant node, a variant node is created and pushed */
763 if (parent->which != DATA1N_variant)
765 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
766 if (spec->d1_stack[spec->d1_level])
767 tagDataRelease (spec);
768 spec->d1_stack[spec->d1_level] = res;
769 spec->d1_stack[++(spec->d1_level)] = NULL;
/* scan the variant nodes at the top of the stack for one of the same type */
771 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
772 if (spec->d1_stack[i]->u.variant.type == tp)
779 logf (LOG_LOG, "variant node (%d)", spec->d1_level);
781 parent = spec->d1_stack[spec->d1_level-1];
782 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
783 res->u.variant.type = tp;
785 if (value_len >= DATA1_LOCALDATA)
786 value_len =DATA1_LOCALDATA-1;
787 memcpy (res->lbuf, value_str, value_len);
788 res->lbuf[value_len] = '\0';
790 res->u.variant.value = res->lbuf;
/* finalize pending text at this level, then push the new node */
792 if (spec->d1_stack[spec->d1_level])
793 tagDataRelease (spec);
794 spec->d1_stack[spec->d1_level] = res;
795 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Scan for whitespace at the end and at the start of the tag described by
 * *tag / *len.
 * NOTE(review): isspace() is given a plain char; consider casting to
 * unsigned char for bytes above 127. */
798 static void tagStrip (const char **tag, int *len)
802 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
805 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/* Open a tag node named by tag/len under the node one level up.  The name is
 * stripped, stored in the node's local buffer (or nmem storage when it does
 * not fit), looked up with data1_getelementbytagname(), and the node is
 * pushed on the node stack. */
811 static void tagBegin (struct lexSpec *spec,
812 const char *tag, int len)
814 struct data1_node *parent;
815 data1_element *elem = NULL;
818 data1_element *e = NULL;
821 if (spec->d1_level == 0)
823 logf (LOG_WARN, "in element begin. No record type defined");
826 tagStrip (&tag, &len);
828 parent = spec->d1_stack[spec->d1_level -1];
829 partag = get_parent_tag(spec->dh, parent);
831 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_tag, parent);
/* names of DATA1_LOCALDATA characters or more get separately allocated storage */
833 if (len >= DATA1_LOCALDATA)
834 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
836 res->u.tag.tag = res->lbuf;
838 memcpy (res->u.tag.tag, tag, len);
839 res->u.tag.tag[len] = '\0';
842 logf (LOG_LOG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
844 if (parent->which == DATA1N_variant)
847 if (!(e = partag->u.tag.element))
850 elem = data1_getelementbytagname (spec->dh,
851 spec->d1_stack[0]->u.root.absyn,
853 res->u.tag.element = elem;
/* finalize pending text at this level, then push the new node */
855 if (spec->d1_stack[spec->d1_level])
856 tagDataRelease (spec);
857 spec->d1_stack[spec->d1_level] = res;
858 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Close elements while the stack depth is above min_level, finalizing pending
 * text on the way.  The visible comparison tests the tag node at the current
 * level against tag/len by length and content. */
861 static void tagEnd (struct lexSpec *spec, int min_level,
862 const char *tag, int len)
864 tagStrip (&tag, &len);
865 while (spec->d1_level > min_level)
867 tagDataRelease (spec);
869 if (spec->d1_level == 0)
871 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
873 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
875 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
879 logf (LOG_LOG, "end tag (%d)", spec->d1_level);
/* Run a DFA over the input beginning at *pptr, starting in state 0.  When a
 * match is reported, *mptr receives its start and *pptr the position just
 * past its end. */
884 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
887 struct DFA_state *state = dfa->states[0];
890 unsigned char c_prev = 0;
891 int ptr = *pptr; /* current pointer */
892 int start_ptr = *pptr; /* first char of match */
893 int last_ptr = 0; /* last char of match */
894 int last_rule = 0; /* rule number of current match */
899 c = f_win_advance (spec, &ptr);
900 if (ptr == F_WIN_EOF)
917 *mptr = start_ptr; /* match starts here */
918 *pptr = last_ptr; /* match end here (+1) */
/* the automaton is put back in its start state */
921 state = dfa->states[0];
/* c falls inside the transition's character range: follow it */
926 else if (c >= t->ch[0] && c <= t->ch[1])
928 state = dfa->states[t->to];
933 last_rule = state->rule_no;
938 last_rule = state->rule_nno;
/* Fetch the next token of an action's code text at *src, returning its text
 * through *tokBuf / *tokLen.  Visible forms: "$<digits>" argument references
 * (resolved through f_win_get() from the recorded argument ranges), text up
 * to a double quote, statement ends (newline or ';'), and words delimited by
 * whitespace. */
950 static int execTok (struct lexSpec *spec, const char **src,
951 const char **tokBuf, int *tokLen)
953 const char *s = *src;
955 while (*s == ' ' || *s == '\t')
959 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
/* accumulate the decimal argument number */
963 while (*s >= '0' && *s <= '9')
964 n = n*10 + (*s++ -'0');
965 if (spec->arg_no == 0)
972 if (n >= spec->arg_no)
974 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
981 while (*s && *s != '\"')
983 *tokLen = s - *tokBuf;
988 else if (*s == '\n' || *s == ';')
996 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
999 *tokLen = s - *tokBuf;
1006 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1009 *tokLen = s - *tokBuf;
/* Copy len bytes of src into the caller-supplied buffer str.
 * NOTE(review): no length limit is visible in this excerpt -- confirm len is
 * bounded by the size of the caller's buffer. */
1015 static char *regxStrz (const char *src, int len, char *str)
1019 memcpy (str, src, len);
/* Tcl command procedure registered as "begin"; clientData is the lexSpec.
 * Sub-commands: "record <absyn>", "element <tag>",
 * "variant <class> <type> <value>" and "context <name>".
 * NOTE(review): argv[1] is compared before argc is tested in the visible
 * conditions -- confirm a call without arguments is rejected earlier. */
1025 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1026 int argc, char **argv)
1028 struct lexSpec *spec = (struct lexSpec *) clientData;
1031 if (!strcmp(argv[1], "record") && argc == 3)
1033 char *absynName = argv[2];
1037 logf (LOG_LOG, "begin record %s", absynName);
/* the root node is pushed on the node stack */
1039 res = data1_mk_root (spec->dh, spec->m, absynName);
1041 spec->d1_stack[spec->d1_level] = res;
1042 spec->d1_stack[++(spec->d1_level)] = NULL;
1044 else if (!strcmp(argv[1], "element") && argc == 3)
1046 tagBegin (spec, argv[2], strlen(argv[2]));
1048 else if (!strcmp (argv[1], "variant") && argc == 5)
1050 variantBegin (spec, argv[2], strlen(argv[2]),
1051 argv[3], strlen(argv[3]),
1052 argv[4], strlen(argv[4]));
1054 else if (!strcmp (argv[1], "context") && argc == 3)
1056 struct lexContext *lc = spec->context;
1058 logf (LOG_LOG, "begin context %s",argv[2]);
/* search the context list for the requested name */
1060 while (lc && strcmp (argv[2], lc->name))
/* NOTE(review): no comparison with context_stack_size is visible before
 * this push -- confirm the stack cannot overflow. */
1064 spec->context_stack[++(spec->context_stack_top)] = lc;
1067 logf (LOG_WARN, "unknown context %s", argv[2]);
/* Tcl command procedure registered as "end"; clientData is the lexSpec.
 * "record" unwinds the node stack and sets stop_flag; "element" closes
 * elements through tagEnd() (with an optional "-record" argument) and sets
 * stop_flag when the stack becomes empty; "context" pops the context stack
 * unless it is already at the bottom. */
1074 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1075 int argc, char **argv)
1077 struct lexSpec *spec = (struct lexSpec *) clientData;
1081 if (!strcmp (argv[1], "record"))
1083 while (spec->d1_level)
1085 tagDataRelease (spec);
1089 logf (LOG_LOG, "end record");
1091 spec->stop_flag = 1;
1093 else if (!strcmp (argv[1], "element"))
1097 if (argc >= 3 && !strcmp(argv[2], "-record"))
/* element may be NULL, in which case a length of 0 is passed */
1106 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1107 if (spec->d1_level == 0)
1110 logf (LOG_LOG, "end element end records");
1112 spec->stop_flag = 1;
1115 else if (!strcmp (argv[1], "context"))
1118 logf (LOG_LOG, "end context");
1120 if (spec->context_stack_top)
1121 (spec->context_stack_top)--;
/* Tcl command procedure registered as "data"; clientData is the lexSpec.
 * Recognizes the options "-text" and "-element <name>"; the data argument is
 * passed to execData(), wrapped in tagBegin()/tagEnd() calls for the element
 * case.  With Tcl newer than 8.0 the text is first converted with
 * Tcl_UtfToExternalDString(). */
1128 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1129 int argc, char **argv)
1133 const char *element = 0;
1134 struct lexSpec *spec = (struct lexSpec *) clientData;
1138 if (!strcmp("-text", argv[argi]))
1143 else if (!strcmp("-element", argv[argi]))
1147 element = argv[argi++];
1153 tagBegin (spec, element, strlen(element));
1157 #if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
/* convert, use, then free the dynamic string holding the converted text */
1159 char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
1160 execData (spec, native, strlen(native), textFlag);
1161 Tcl_DStringFree (&ds);
1163 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1168 tagEnd (spec, 1, NULL, 0);
/* Tcl command procedure registered as "unread"; clientData is the lexSpec.
 * Moves the read position to the start of match argument `no`, plus an
 * optional "-offset <n>" displacement.
 * NOTE(review): only the upper bound of `no` is checked in the visible code;
 * confirm a negative value (or arg_no == 0) cannot reach the array index. */
1172 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1173 int argc, char **argv)
1175 struct lexSpec *spec = (struct lexSpec *) clientData;
1182 if (!strcmp("-offset", argv[argi]))
1187 offset = atoi(argv[argi]);
1196 no = atoi(argv[argi]);
1197 if (no >= spec->arg_no)
1198 no = spec->arg_no - 1;
1199 spec->ptr = spec->arg_start[no] + offset;
/* Run a code action through the Tcl interpreter.  Each match argument i is
 * first published as the Tcl variable named by its decimal index; then the
 * code is evaluated (object form or string form, depending on
 * HAVE_TCL_OBJECTS) and an evaluation error is logged with errorInfo. */
1203 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1207 for (i = 0; i < spec->arg_no; i++)
1209 char var_name[10], *var_buf;
1212 sprintf (var_name, "%d", i);
1213 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* the byte after the argument is temporarily replaced by a terminator so the
 * window text can be passed as a C string, then restored */
1217 ch = var_buf[var_len];
1218 var_buf[var_len] = '\0';
1219 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1220 var_buf[var_len] = ch;
1223 #if HAVE_TCL_OBJECTS
1224 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1226 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1230 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1231 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1232 spec->tcl_interp->errorLine,
1233 spec->tcl_interp->result,
1234 err ? err : "[NO ERRORINFO]");
/* Interpret the built-in command language of a code action.  Tokens come from
 * execTok(); the commands visible here are "begin" (record, element, variant,
 * context), "end" (record, element, context), "data", "unread" and "context".
 * Unknown commands and stray tokens are reported with LOG_WARN. */
1240 static void execCode (struct lexSpec *spec, struct regxCode *code)
1242 const char *s = code->str;
1244 const char *cmd_str;
1246 r = execTok (spec, &s, &cmd_str, &cmd_len);
1253 r = execTok (spec, &s, &cmd_str, &cmd_len);
1256 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin <keyword> ... ---- */
1257 if (!strcmp (p, "begin"))
1259 r = execTok (spec, &s, &cmd_str, &cmd_len);
1262 logf (LOG_WARN, "missing keyword after 'begin'");
1265 p = regxStrz (cmd_str, cmd_len, ptmp);
1266 if (!strcmp (p, "record"))
1268 r = execTok (spec, &s, &cmd_str, &cmd_len);
1271 if (spec->d1_level == 0)
/* NOTE(review): cmd_len must stay below sizeof(absynName); a clamp is not
 * visible in this excerpt -- confirm it exists. */
1273 static char absynName[64];
1278 memcpy (absynName, cmd_str, cmd_len);
1279 absynName[cmd_len] = '\0';
1281 logf (LOG_LOG, "begin record %s", absynName);
/* the root node is pushed on the node stack */
1283 res = data1_mk_root (spec->dh, spec->m, absynName);
1285 spec->d1_stack[spec->d1_level] = res;
1286 spec->d1_stack[++(spec->d1_level)] = NULL;
1288 r = execTok (spec, &s, &cmd_str, &cmd_len);
1290 else if (!strcmp (p, "element"))
1292 r = execTok (spec, &s, &cmd_str, &cmd_len);
1295 tagBegin (spec, cmd_str, cmd_len);
1296 r = execTok (spec, &s, &cmd_str, &cmd_len);
1298 else if (!strcmp (p, "variant"))
/* three tokens are read for the variant before variantBegin() is called */
1301 const char *class_str = NULL;
1303 const char *type_str = NULL;
1305 const char *value_str = NULL;
1306 r = execTok (spec, &s, &cmd_str, &cmd_len);
1309 class_str = cmd_str;
1310 class_len = cmd_len;
1311 r = execTok (spec, &s, &cmd_str, &cmd_len);
1317 r = execTok (spec, &s, &cmd_str, &cmd_len);
1320 value_str = cmd_str;
1321 value_len = cmd_len;
1323 variantBegin (spec, class_str, class_len,
1324 type_str, type_len, value_str, value_len);
1327 r = execTok (spec, &s, &cmd_str, &cmd_len);
1329 else if (!strcmp (p, "context"))
1333 struct lexContext *lc = spec->context;
1334 r = execTok (spec, &s, &cmd_str, &cmd_len);
1335 p = regxStrz (cmd_str, cmd_len, ptmp);
1337 logf (LOG_LOG, "begin context %s", p);
/* look the context up by name and push it on the context stack */
1339 while (lc && strcmp (p, lc->name))
1342 spec->context_stack[++(spec->context_stack_top)] = lc;
1344 logf (LOG_WARN, "unknown context %s", p);
1347 r = execTok (spec, &s, &cmd_str, &cmd_len);
1351 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end <keyword> ... ---- */
1354 else if (!strcmp (p, "end"))
1356 r = execTok (spec, &s, &cmd_str, &cmd_len);
1359 logf (LOG_WARN, "missing keyword after 'end'");
1362 p = regxStrz (cmd_str, cmd_len, ptmp);
1363 if (!strcmp (p, "record"))
/* unwind the whole node stack, finalizing pending text, then stop */
1365 while (spec->d1_level)
1367 tagDataRelease (spec);
1370 r = execTok (spec, &s, &cmd_str, &cmd_len);
1372 logf (LOG_LOG, "end record");
1374 spec->stop_flag = 1;
1376 else if (!strcmp (p, "element"))
/* option tokens are read in a loop; "-record" is the one recognized here */
1379 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1381 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1386 tagEnd (spec, min_level, cmd_str, cmd_len);
1387 r = execTok (spec, &s, &cmd_str, &cmd_len);
1390 tagEnd (spec, min_level, NULL, 0);
1391 if (spec->d1_level == 0)
1394 logf (LOG_LOG, "end element end records");
1396 spec->stop_flag = 1;
1400 else if (!strcmp (p, "context"))
1403 logf (LOG_LOG, "end context");
/* pop the context stack unless it is already at the bottom */
1405 if (spec->context_stack_top)
1406 (spec->context_stack_top)--;
1407 r = execTok (spec, &s, &cmd_str, &cmd_len);
1410 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data [-text] [-element <name>] <items> ---- */
1412 else if (!strcmp (p, "data"))
1416 const char *element_str = NULL;
1418 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1420 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1422 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1424 r = execTok (spec, &s, &element_str, &element_len);
1429 logf (LOG_WARN, "bad data option: %.*s",
1434 logf (LOG_WARN, "missing data item after data");
1438 tagBegin (spec, element_str, element_len);
1441 execData (spec, cmd_str, cmd_len,textFlag);
1442 r = execTok (spec, &s, &cmd_str, &cmd_len);
1445 tagEnd (spec, 1, NULL, 0);
/* ---- unread [-offset <n>] <index> ---- */
1447 else if (!strcmp (p, "unread"))
1450 r = execTok (spec, &s, &cmd_str, &cmd_len);
1451 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1453 r = execTok (spec, &s, &cmd_str, &cmd_len);
1456 logf (LOG_WARN, "missing number after -offset");
1459 p = regxStrz (cmd_str, cmd_len, ptmp);
1461 r = execTok (spec, &s, &cmd_str, &cmd_len);
1467 logf (LOG_WARN, "missing index after unread command");
/* the index must be exactly one decimal digit */
1470 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1472 logf (LOG_WARN, "bad index after unread command");
1477 no = *cmd_str - '0';
/* an index beyond the last argument is replaced by the last one */
1478 if (no >= spec->arg_no)
1479 no = spec->arg_no - 1;
1480 spec->ptr = spec->arg_start[no] + offset;
1482 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ---- context <name>: stores into the current top slot, no push ---- */
1484 else if (!strcmp (p, "context"))
1488 struct lexContext *lc = spec->context;
1489 r = execTok (spec, &s, &cmd_str, &cmd_len);
1490 p = regxStrz (cmd_str, cmd_len, ptmp);
1492 while (lc && strcmp (p, lc->name))
1495 spec->context_stack[spec->context_stack_top] = lc;
1497 logf (LOG_WARN, "unknown context %s", p);
1500 r = execTok (spec, &s, &cmd_str, &cmd_len);
1504 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1505 r = execTok (spec, &s, &cmd_str, &cmd_len);
1510 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1512 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* Execute an action list for a match that starts at start_ptr.  Argument 0
 * begins at start_ptr; pattern actions call tryMatch() from *pptr and record
 * further argument ranges in arg_start / arg_end; code actions run through
 * execTcl() when a Tcl interpreter is attached and through execCode()
 * otherwise. */
1519 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1520 int start_ptr, int *pptr)
1529 arg_start[0] = start_ptr;
/* publish the local argument tables through the spec */
1531 spec->arg_start = arg_start;
1532 spec->arg_end = arg_end;
/* pattern carrying the body mark */
1539 if (ap->u.pattern.body)
1541 arg_start[arg_no] = *pptr;
1542 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
/* no match: the argument ranges are filled with the EOF sentinel */
1544 arg_end[arg_no] = F_WIN_EOF;
1546 arg_start[arg_no] = F_WIN_EOF;
1547 arg_end[arg_no] = F_WIN_EOF;
/* one range ends where the match starts; another spans the match itself */
1552 arg_end[arg_no] = sptr;
1554 arg_start[arg_no] = sptr;
1555 arg_end[arg_no] = *pptr;
1560 arg_start[arg_no] = *pptr;
1561 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
/* the match position is compared with the position the argument started at */
1563 if (sptr != arg_start[arg_no])
1565 arg_end[arg_no] = *pptr;
1570 spec->arg_no = arg_no;
1573 if (spec->tcl_interp)
1574 execTcl(spec, ap->u.code);
1576 execCode (spec, ap->u.code);
1578 execCode (spec, ap->u.code);
1581 if (spec->stop_flag)
1585 arg_start[arg_no] = *pptr;
1586 arg_end[arg_no] = F_WIN_EOF;
/* Run the action list of rule ruleNo of context (found through the fastRule
 * table) via execAction() and return its result. */
1595 static int execRule (struct lexSpec *spec, struct lexContext *context,
1596 int ruleNo, int start_ptr, int *pptr)
1599 logf (LOG_LOG, "exec rule %d", ruleNo);
1601 return execAction (spec, context->fastRule[ruleNo]->actionList,
/* Scanner loop: feed input bytes from *ptr through the DFA of the context on
 * top of the context stack.  Text that no rule matched is passed to
 * execDataP(); a matched rule is run through execRule().  The end-of-file
 * callback f_win_ef, when set, is called with the current position. */
1605 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1607 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1608 struct DFA_state *state = context->dfa->states[0];
1611 unsigned char c_prev = '\n';
1613 int last_rule = 0; /* rule number of current match */
1614 int last_ptr = *ptr; /* last char of match */
1615 int start_ptr = *ptr; /* first char of match */
1616 int skip_ptr = *ptr; /* first char of run */
1620 c = f_win_advance (spec, ptr);
1621 if (*ptr == F_WIN_EOF)
1623 /* end of file met */
1626 /* there was a match */
1627 if (skip_ptr < start_ptr)
1629 /* deal with chars that didn't match */
1632 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1633 execDataP (spec, buf, size, 0);
1635 /* restore pointer */
1638 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1640 /* restore skip pointer */
1644 else if (skip_ptr < *ptr)
1646 /* deal with chars that didn't match */
1649 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1650 execDataP (spec, buf, size, 0);
1652 if (*ptr == F_WIN_EOF)
1659 { /* no transition for character c ... */
1662 if (skip_ptr < start_ptr)
1664 /* deal with chars that didn't match */
1667 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1668 execDataP (spec, buf, size, 0);
1670 /* restore pointer */
1672 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1674 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1677 logf (LOG_LOG, "regx: endf ptr=%d", *ptr);
1679 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* the context is re-read from the top of the context stack */
1683 context = spec->context_stack[spec->context_stack_top];
1686 last_ptr = start_ptr = *ptr;
1690 c_prev = f_win_advance (spec, &start_ptr);
1695 c_prev = f_win_advance (spec, &start_ptr);
1698 state = context->dfa->states[0];
1701 else if (c >= t->ch[0] && c <= t->ch[1])
1702 { /* transition ... */
1703 state = context->dfa->states[t->to];
1708 last_rule = state->rule_no;
1711 else if (state->rule_nno)
1713 last_rule = state->rule_nno;
/* Process one record: find the context called context_name, make it the
 * bottom of the context stack, run its init and begin actions, scan with
 * lexNode(), unwind whatever is left on the node stack, run the end actions
 * and return d1_stack[0]. */
1725 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1726 const char *context_name)
1728 struct lexContext *lt = spec->context;
1731 spec->stop_flag = 0;
1733 spec->context_stack_top = 0;
1736 if (!strcmp (lt->name, context_name))
1742 logf (LOG_WARN, "cannot find context %s", context_name);
1745 spec->context_stack[spec->context_stack_top] = lt;
1746 spec->d1_stack[spec->d1_level] = NULL;
1751 execAction (spec, lt->initActionList, ptr, &ptr);
1754 execAction (spec, lt->beginActionList, ptr, &ptr);
1755 lexNode (spec, &ptr);
/* finalize pending text on every level that is still open */
1756 while (spec->d1_level)
1758 tagDataRelease (spec);
1761 execAction (spec, lt->endActionList, ptr, &ptr);
1762 return spec->d1_stack[0];
/* Destroy the lexSpec held by the lexSpecs structure passed as clientData. */
1765 void grs_destroy(void *clientData)
1767 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1770 lexSpecDestroy(&specs->spec);
/* Allocate the lexSpecs structure that the other grs_* functions in this file
 * receive as clientData. */
1775 void *grs_init(void)
1777 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/* Read one record with the regx filter named by p->type.  The stored lexSpec
 * is reused when its name matches; otherwise it is destroyed and a new one is
 * created and loaded with readFileSpec().  The read window is then reset and
 * wired to the caller's read/seek/end callbacks before lexRoot() runs in the
 * "main" context. */
1782 data1_node *grs_read_regx (struct grs_read_info *p)
1785 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
/* local alias for the stored spec pointer */
1786 struct lexSpec **curLexSpec = &specs->spec;
1789 logf (LOG_LOG, "grs_read_regx");
1791 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1794 lexSpecDestroy (curLexSpec);
1795 *curLexSpec = lexSpecCreate (p->type, p->dh);
1796 res = readFileSpec (*curLexSpec);
1799 lexSpecDestroy (curLexSpec);
1803 (*curLexSpec)->dh = p->dh;
/* start with an empty window of 500000 bytes */
1806 (*curLexSpec)->f_win_start = 0;
1807 (*curLexSpec)->f_win_end = 0;
1808 (*curLexSpec)->f_win_rf = p->readf;
1809 (*curLexSpec)->f_win_sf = p->seekf;
1810 (*curLexSpec)->f_win_fh = p->fh;
1811 (*curLexSpec)->f_win_ef = p->endf;
1812 (*curLexSpec)->f_win_size = 500000;
1814 (*curLexSpec)->m = p->mem;
1815 return lexRoot (*curLexSpec, p->offset, "main");
/* Static record-type descriptor; its initializer is not visible in this excerpt. */
1818 static struct recTypeGrs regx_type = {
/* Exported handle for the regx record type; points at the regx_type descriptor. */
1825 RecTypeGrs recTypeGrs_regx = &regx_type;
/* Read one record with the Tcl-enabled filter named by p->type.  When no
 * matching lexSpec is stored, a new one is created together with a Tcl
 * interpreter in which the commands begin, end, data and unread are
 * registered; the specification is then loaded with readFileSpec().  The read
 * window is reset and wired to the caller's callbacks before lexRoot() runs
 * in the "main" context. */
1828 data1_node *grs_read_tcl (struct grs_read_info *p)
1831 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
/* local alias for the stored spec pointer */
1832 struct lexSpec **curLexSpec = &specs->spec;
1835 logf (LOG_LOG, "grs_read_tcl");
1837 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1839 Tcl_Interp *tcl_interp;
1841 lexSpecDestroy (curLexSpec);
1842 *curLexSpec = lexSpecCreate (p->type, p->dh);
1843 Tcl_FindExecutable("");
1844 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
1845 Tcl_Init(tcl_interp);
/* each command procedure receives the lexSpec as its clientData */
1846 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
1847 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
1848 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
1849 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
1851 res = readFileSpec (*curLexSpec);
1854 lexSpecDestroy (curLexSpec);
1858 (*curLexSpec)->dh = p->dh;
/* start with an empty window of 500000 bytes */
1861 (*curLexSpec)->f_win_start = 0;
1862 (*curLexSpec)->f_win_end = 0;
1863 (*curLexSpec)->f_win_rf = p->readf;
1864 (*curLexSpec)->f_win_sf = p->seekf;
1865 (*curLexSpec)->f_win_fh = p->fh;
1866 (*curLexSpec)->f_win_ef = p->endf;
1867 (*curLexSpec)->f_win_size = 500000;
1869 (*curLexSpec)->m = p->mem;
1870 return lexRoot (*curLexSpec, p->offset, "main");
/* Static record-type descriptor; its initializer is not visible in this excerpt. */
1873 static struct recTypeGrs tcl_type = {
/* Exported handle that points at the tcl_type descriptor. */
1880 RecTypeGrs recTypeGrs_tcl = &tcl_type;