X-Git-Url: http://jsfdemo.indexdata.com/?a=blobdiff_plain;f=dfa%2Fdfa.c;h=f55b69abb036372b5a2431cf7992a3ce41e79f80;hb=fd12cf9b8e16c109f3c0f7aedb0e0efd65209e16;hp=255324975dfdc14c64eb49e3b58d04871801612a;hpb=e27ce02d4d96ac2b8220134c837c53cfef8eba23;p=idzebra-moved-to-github.git diff --git a/dfa/dfa.c b/dfa/dfa.c index 2553249..f55b69a 100644 --- a/dfa/dfa.c +++ b/dfa/dfa.c @@ -1,10 +1,32 @@ /* - * Copyright (C) 1994, Index Data I/S + * Copyright (C) 1994-1997, Index Data I/S * All rights reserved. * Sebastian Hammer, Adam Dickmeiss * * $Log: dfa.c,v $ - * Revision 1.10 1996-01-08 09:09:17 adam + * Revision 1.17 1997-09-18 08:59:17 adam + * Extra generic handle for the character mapping routines. + * + * Revision 1.16 1997/09/05 15:29:57 adam + * Changed prototype for chr_map_input - added const. + * Added support for C++, headers uses extern "C" for public definitions. + * + * Revision 1.15 1997/02/10 10:19:20 adam + * Added facility for open character sets, eg [a-]. + * + * Revision 1.14 1996/10/29 13:57:22 adam + * Include of zebrautl.h instead of alexutil.h. + * + * Revision 1.13 1996/06/17 14:24:08 adam + * Bug fix: read_charset didn't handle character mapping. + * + * Revision 1.12 1996/06/04 10:20:02 adam + * Added support for character mapping. + * + * Revision 1.11 1996/01/08 19:15:24 adam + * Allow single $ in expressions. + * + * Revision 1.10 1996/01/08 09:09:17 adam * Function dfa_parse got 'const' string argument. * New functions to define char mappings made public. * @@ -45,10 +67,12 @@ #include #include -#include +#include #include "dfap.h" #include "imalloc.h" +#define DFA_OPEN_RANGE 1 + #define CAT 16000 #define OR 16001 #define STAR 16002 @@ -270,15 +294,15 @@ static struct Tnode *expr_4 (void) return t1; } -static void do_parse (struct DFA_parse *dfap, const char **s, struct Tnode **tnp) +static void do_parse (struct DFA_parse *dfap, const char **s, + struct Tnode **tnp) { - int anchor_flag = 0; int start_anchor_flag = 0; struct Tnode *t1, *t2, *tn; parse_info = dfap; err_code = 0; - expr_ptr = (unsigned char *) *s; + expr_ptr = (const unsigned char *) *s; inside_string = 0; lex (); @@ -287,31 +311,32 @@ static void do_parse (struct DFA_parse *dfap, const char **s, struct Tnode **tnp start_anchor_flag = 1; lex (); } - t1 = expr_1 (); - if (anchor_flag) + if (lookahead == L_END) { - tn = mk_Tnode (); - tn->pos = CAT; - tn->u.p[0] = t2; - tn->u.p[1] = t1; - t1 = tn; + t1 = mk_Tnode (); + t1->pos = ++parse_info->position; + t1->u.ch[1] = t1->u.ch[0] = '\n'; + lex (); } - if (lookahead == L_END && t1) + else { - t2 = mk_Tnode (); - t2->pos = ++parse_info->position; - t2->u.ch[1] = t2->u.ch[0] = '\n'; - - tn = mk_Tnode (); - tn->pos = CAT; - tn->u.p[0] = t1; - tn->u.p[1] = t2; - t1 = tn; - - anchor_flag |= 2; - lex (); + t1 = expr_1 (); + if (t1 && lookahead == L_END) + { + t2 = mk_Tnode (); + t2->pos = ++parse_info->position; + t2->u.ch[1] = t2->u.ch[0] = '\n'; + + tn = mk_Tnode (); + tn->pos = CAT; + tn->u.p[0] = t1; + tn->u.p[1] = t2; + t1 = tn; + + lex (); + } } - if (lookahead == 0 && t1) + if (t1 && lookahead == 0) { t2 = mk_Tnode(); t2->pos = ++parse_info->position; @@ -335,7 +360,7 @@ static void do_parse (struct DFA_parse *dfap, const char **s, struct Tnode **tnp err_code = DFA_ERR_SYNTAX; } } - *s = (char *) expr_ptr; + *s = (const char *) expr_ptr; } static int nextchar (int *esc) @@ -398,20 +423,52 @@ static int read_charset (void) { if (!esc0 && ch0 == ']') break; + if (parse_info->cmap) + { + const char **mapto; + char mapfrom[2]; + const char *mcp = mapfrom; + mapfrom[0] = ch0; + mapto = (*parse_info->cmap)(parse_info->cmap_data, &mcp, 1); + assert (mapto); + ch0 = mapto[0][0]; + } add_BSet (parse_info->charset, look_chars, ch0); ch1 = nextchar_set (&esc1); if (!esc1 && ch1 == '-') { + int open_range = 0; if ((ch1 = nextchar_set (&esc1)) == 0) break; +#if DFA_OPEN_RANGE + if (!esc1 && ch1 == ']') + { + ch1 = 255; + open_range = 1; + } +#else if (!esc1 && ch1 == ']') { add_BSet (parse_info->charset, look_chars, '-'); break; } +#endif + if (!open_range && parse_info->cmap) + { + const char **mapto; + char mapfrom[2]; + const char *mcp = mapfrom; + mapfrom[0] = ch1; + mapto = (*parse_info->cmap) (parse_info->cmap_data, &mcp, 1); + assert (mapto); + ch1 = mapto[0][0]; + } for (i=ch0; ++i<=ch1;) add_BSet (parse_info->charset, look_chars, i); - ch0 = nextchar_set (&esc0); + if (!open_range) + ch0 = nextchar_set (&esc0); + else + break; } else { @@ -424,6 +481,30 @@ static int read_charset (void) return L_CHARS; } +static int map_l_char (void) +{ + const char **mapto; + const char *cp0 = (const char *) (expr_ptr-1); + int i = 0, len = strlen(cp0); + + if (cp0[0] == 1 && cp0[1]) + { + expr_ptr++; + look_ch = cp0[1]; + return L_CHAR; + } + if (!parse_info->cmap) + return L_CHAR; + + mapto = (*parse_info->cmap) (parse_info->cmap_data, &cp0, len); + assert (mapto); + + expr_ptr = (const unsigned char *) cp0; + look_ch = mapto[i][0]; + logf (LOG_DEBUG, "map from %c to %d", expr_ptr[-1], look_ch); + return L_CHAR; +} + static int lex_sub(void) { int esc; @@ -431,18 +512,16 @@ static int lex_sub(void) if (look_ch == '\"') { if (esc) - return L_CHAR; + return map_l_char (); inside_string = !inside_string; } else if (esc || inside_string) - return L_CHAR; + return map_l_char (); else if (look_ch == '[') return read_charset(); else { const int *cc; - if (look_ch == '/') - logf (LOG_DEBUG, "xxxx / xxx"); for (cc = parse_info->charMap; *cc; cc += 2) if (*cc == look_ch) { @@ -450,7 +529,7 @@ static int lex_sub(void) --expr_ptr; return cc[1]; } - return L_CHAR; + return map_l_char (); } return 0; } @@ -976,6 +1055,7 @@ static struct DFA_parse *dfa_parse_init (void) parse_info->use_Tnode = parse_info->max_Tnode = 0; parse_info->charMap = NULL; parse_info->charMapSize = 0; + parse_info->cmap = NULL; return parse_info; } @@ -1030,6 +1110,13 @@ struct DFA *dfa_init (void) return dfa; } +void dfa_set_cmap (struct DFA *dfa, void *vp, + const char **(*cmap)(void *vp, const char **from, int len)) +{ + dfa->parse_info->cmap = cmap; + dfa->parse_info->cmap_data = vp; +} + int dfa_parse (struct DFA *dfa, const char **pattern) { struct Tnode *top;