From 440f23de23b0c2bce38451617cd69eb106b5c2df Mon Sep 17 00:00:00 2001 From: Heikki Levanto Date: Fri, 5 May 2006 09:14:42 +0000 Subject: [PATCH] Fixed the interface to match, merged with Adam's whitespace patch, First converters work --- include/yaz/nfa.h | 81 ++++++++++++++++++++++++++++++-- src/nfa.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++-- test/nfatest1.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 331 insertions(+), 14 deletions(-) diff --git a/include/yaz/nfa.h b/include/yaz/nfa.h index 7e256af..76c131e 100644 --- a/include/yaz/nfa.h +++ b/include/yaz/nfa.h @@ -1,6 +1,6 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. - * $Id: nfa.h,v 1.4 2006-05-04 18:58:24 adam Exp $ + * $Id: nfa.h,v 1.5 2006-05-05 09:14:42 heikki Exp $ */ /** @@ -15,6 +15,13 @@ * possible sequence of input characters that match the ranges in the * conditions, and that leads into a terminal state. * + * Separate from this we have converters. Those can often be used + * together with a NFA (think match-pattern and replace-pattern). + * + * A converter is a routine that produces some output. It can translate a + * range of characters into another range, emit a constant string, or + * something like that. + * */ #ifndef YAZ_NFA_H @@ -39,6 +46,11 @@ typedef struct yaz_nfa_state yaz_nfa_state; typedef struct yaz_nfa_transition yaz_nfa_transition; +/** brief Simple character range converter */ +typedef struct yaz_nfa_converter yaz_nfa_converter; + + + /** \brief Initialize the NFA without any states in it * * \return a pointer to the newly created NFA @@ -124,7 +136,7 @@ int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s, * Add a transition between two existing states. The condition * is (as always) a range of yaz_nfa_chars. * \param n the nfa - * \param from_state which state the transition is from + * \param from_state which state the transition is from. null=initial * \param to_state where the transition goes to * \param range_start is the beginning of the range of values * \param range_end is the end of the range of values @@ -176,7 +188,7 @@ yaz_nfa_state *yaz_nfa_add_sequence( yaz_nfa *n, * * \param n the nfa itself * \param inbuff buffer of input data. Will be incremented when match - * \param incharsleft max number of inchars to use from inbuff + * \param incharsleft max number of inchars to use from inbuff. decrements. * \param result the result pointer from the nfa (what ever that is) * * In case of errors, returns the best match so far, @@ -189,7 +201,7 @@ yaz_nfa_state *yaz_nfa_add_sequence( yaz_nfa *n, * */ -int yaz_nfa_match(yaz_nfa *n, yaz_nfa_char **inbuff, size_t incharsleft, +int yaz_nfa_match(yaz_nfa *n, yaz_nfa_char **inbuff, size_t *incharsleft, void **result ); /** yaz_nfa_match return codes */ @@ -224,6 +236,64 @@ int yaz_nfa_get_backref( yaz_nfa *n, yaz_nfa_char **start, yaz_nfa_char **end ); +/** \brief Create a string converter. + * \param n the nfa + * \param string the string to output + * \param length how many chars in the string + * + * This converter produces a constant string in the output + */ +yaz_nfa_converter *yaz_nfa_create_string_converter ( + yaz_nfa *n, + yaz_nfa_char *string, + size_t length ); + +/** \brief Create a backref converter + * \param n the nfa + * \param backref_no The backreference to reproduce + * + * This converter copies a backref into the output buffer + */ +yaz_nfa_converter *yaz_nfa_create_backref_converter ( + yaz_nfa *n, int backref_no ); + + + +/** \brief Connects converters in a chain. + * \param n the nfa (mostly for nmem access) + * \param startpoint the first converter in the chain + * \param newconverter + * + * Places the new converter at the end of the chain that starts from + * startpoint. + * + */ +void yaz_nfa_append_converter ( + yaz_nfa *n, + yaz_nfa_converter *startpoint, + yaz_nfa_converter *newconverter); + +/** brief Runs the chain of converters. + * \param n the nfa (mostly for nmem access) + * \param c the first converter in a chain + * \param outbuff buffer to write the output in. Increments the ptr. + * \param outcharsleft how many may we write + * + * Runs the converters in the chain, placing output into outbuff + * (and incrementing the pointer). + * + * \retval 0 OK + * \retval 1 no match to get backrefs from + * \retval 2 no room in outbuf + * + */ +int yaz_nfa_run_converters( + yaz_nfa *n, + yaz_nfa_converter *c, + yaz_nfa_char **outbuff, + size_t *outcharsleft); + + /** \brief Get the first state of the NFA. * * \param n the nfa @@ -259,6 +329,9 @@ yaz_nfa_state *yaz_nfa_get_next(yaz_nfa *n, yaz_nfa_state *s); void yaz_nfa_dump(FILE *F, yaz_nfa *n, char *(*strfunc)(void *) ); + + + YAZ_END_CDECL #endif diff --git a/src/nfa.c b/src/nfa.c index 0eee5e8..863af23 100644 --- a/src/nfa.c +++ b/src/nfa.c @@ -1,7 +1,7 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. * - * $Id: nfa.c,v 1.5 2006-05-04 18:58:54 adam Exp $ + * $Id: nfa.c,v 1.6 2006-05-05 09:14:42 heikki Exp $ */ /** @@ -75,6 +75,20 @@ struct yaz_nfa_transition { #define LOOP_LIMIT 100 +typedef enum { + conv_none, + conv_string, + conv_backref +} yaz_nfa_converter_type; + +struct yaz_nfa_converter { + yaz_nfa_converter *next; + yaz_nfa_converter_type type; + yaz_nfa_char *string; + size_t strlen; + int backref_no; + int char_diff; +}; /* * * * * * * @@ -177,6 +191,8 @@ void yaz_nfa_add_transition(yaz_nfa *n, yaz_nfa_char range_start, yaz_nfa_char range_end) { yaz_nfa_transition *t = nmem_malloc(n->nmem, sizeof(yaz_nfa_transition)); + if (!from_state) + from_state = n->firststate; t->range_start = range_start; t->range_end = range_end; t->to_state = to_state; @@ -335,7 +351,7 @@ static void match_state( int yaz_nfa_match(yaz_nfa *n, yaz_nfa_char **inbuff, - size_t incharsleft, + size_t *incharsleft, void **result ){ struct matcher m; int sz; @@ -349,6 +365,7 @@ int yaz_nfa_match(yaz_nfa *n, m.bestnode = n->nstates; m.result = 0; m.errorcode = 0; + m.empties = 0; sz = sizeof( struct yaz_nfa_backref_info) * n->nbackrefs; if (!n->curr_backrefs) { n->curr_backrefs = nmem_malloc( n->nmem, sz); @@ -361,8 +378,9 @@ int yaz_nfa_match(yaz_nfa *n, n->best_backrefs[i].end = 0; } - match_state(n->firststate, *inbuff, incharsleft, &m); + match_state(n->firststate, *inbuff, *incharsleft, &m); if (m.result) { + *incharsleft -= (m.longest-*inbuff); *result = m.result; *inbuff = m.longest; if (m.errorcode) @@ -392,6 +410,115 @@ int yaz_nfa_get_backref( yaz_nfa *n, return 0; } +/* * * * * * * * * * * * * * + * Converters + * * * * * * * * * * * * * */ + +static yaz_nfa_converter *create_null_converter ( yaz_nfa *n) { + yaz_nfa_converter *c; + c=nmem_malloc(n->nmem, sizeof(yaz_nfa_converter)); + c->next=0; + c->type=conv_none; + c->string=0; + c->strlen=0; + c->backref_no=0; + c->char_diff=0; + return c; +} + +yaz_nfa_converter *yaz_nfa_create_string_converter ( + yaz_nfa *n, + yaz_nfa_char *string, + size_t length){ + yaz_nfa_converter *c; + int i; + c=create_null_converter(n); + c->type=conv_string; + c->string=nmem_malloc(n->nmem, length*sizeof(yaz_nfa_char)); + for (i=0;istring[i]=string[i]; + c->strlen=length; + return c; +} + +yaz_nfa_converter *yaz_nfa_create_backref_converter ( + yaz_nfa *n, int backref_no ) { + yaz_nfa_converter *c; + c=create_null_converter(n); + c->type=conv_backref; + c->backref_no=backref_no; + return c; +} + + +void yaz_nfa_append_converter ( + yaz_nfa *n, + yaz_nfa_converter *startpoint, + yaz_nfa_converter *newconverter) { + while (startpoint->next) + startpoint=startpoint->next; + startpoint->next=newconverter; +} + +static int string_convert ( + yaz_nfa *n, + yaz_nfa_converter *c, + yaz_nfa_char **outbuff, + size_t *outcharsleft){ + size_t sz=c->strlen; + yaz_nfa_char *p=c->string; + while (sz--) { + if ((*outcharsleft)-- <= 0) + return 2; + **outbuff=*p++; + (*outbuff)++; + } + return 0; +} +static int backref_convert ( + yaz_nfa *n, + yaz_nfa_converter *c, + yaz_nfa_char **outbuff, + size_t *outcharsleft){ + yaz_nfa_char *cp1,*cp2; + int i; + i=yaz_nfa_get_backref(n,c->backref_no, &cp1, &cp2); + if (i==2) /* no backref, produce no output, that's ok */ + return 0; + if (i==1) /* no match in dfa */ + return 1; /* should not happen */ + while (cp2>cp1) { + if ((*outcharsleft)-- <= 0) + return 2; + **outbuff=*cp1++; + (*outbuff)++; + } + return 0; +} + + +int yaz_nfa_run_converters( + yaz_nfa *n, + yaz_nfa_converter *c, + yaz_nfa_char **outbuff, + size_t *outcharsleft){ + int rc=0; + // yaz_nfa_char *bufstart=*outbuff; + while (c && !rc) { + switch(c->type) { + case conv_string: + rc=string_convert(n,c,outbuff,outcharsleft); + break; + case conv_backref: + rc=backref_convert(n,c,outbuff,outcharsleft); + break; + default: + rc=3; /* internal error */ + } + c=c->next; + } + return rc; +} /* * * * * * * * * * Debug routines diff --git a/test/nfatest1.c b/test/nfatest1.c index 2b730eb..7ae8b29 100644 --- a/test/nfatest1.c +++ b/test/nfatest1.c @@ -1,7 +1,7 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. * - * $Id: nfatest1.c,v 1.3 2006-05-04 18:59:13 adam Exp $ + * $Id: nfatest1.c,v 1.4 2006-05-05 09:14:42 heikki Exp $ * */ @@ -20,18 +20,26 @@ char *printfunc(void *result) { return buf; } +char *printfunc2(void *result) { + static char buf[200]; + sprintf(buf,"(%p)", result); + return buf; +} + void test_match(yaz_nfa *n, - yaz_nfa_char *buf, int buflen, + yaz_nfa_char *buf, size_t buflen, int expcode, char *expstr) { yaz_nfa_char *c = buf; yaz_nfa_char *cp1, *cp2; void *resptr = 0; int i, bi; - i = yaz_nfa_match(n,&c, buflen,&resptr); + size_t buflen2 = buflen; + i = yaz_nfa_match(n,&c, &buflen2,&resptr); #if VERBOSE printf("\n'%s' returned %d. Moved c by %d, and resulted in '%s'\n", expstr, i, (c-buf),(char*)resptr); #endif + YAZ_CHECK_EQ(buflen-buflen2, c-buf); YAZ_CHECK_EQ(i, expcode); if (i!=1) YAZ_CHECK_EQ(strcmp(expstr,(char*)resptr), 0); @@ -66,6 +74,7 @@ void construction_test() { yaz_nfa_char tst5[]={'y', 'k', 'l', 'k', 'k', 'l', 'k', 'd', 0}; yaz_nfa_char tst6[]={'x', 'z', 'k', 'a', 'b', 0}; void *p; + size_t sz; YAZ_CHECK(n); @@ -167,23 +176,131 @@ void construction_test() { test_match(n, tst5, 9, YAZ_NFA_SUCCESS, "y k+ d"); cp = tst6; - i = yaz_nfa_match(n,&cp, 8,&p); + sz = 8; + i = yaz_nfa_match(n, &cp, &sz, &p); YAZ_CHECK_EQ(i, YAZ_NFA_SUCCESS); i = yaz_nfa_get_backref(n, 2, &cp1, &cp2 ); YAZ_CHECK_EQ(i, 0); #if VERBOSE - printf("backref from %p to %p is %d long\n", - cp1, cp2, cp2-cp1 ); + printf("backref from %p to %p is %d long. sz is now %d\n", + cp1,cp2, cp2-cp1,sz ); #endif yaz_nfa_destroy(n); } +void converter_test() { + yaz_nfa* n= yaz_nfa_init(); + yaz_nfa_converter *c1,*c2; + yaz_nfa_char str1[]={'a','b','c'}; + yaz_nfa_char seq1[]={'A','B','C',0}; + yaz_nfa_char seq2[]={'k','m','n','m','x','P','Q','X',0}; + yaz_nfa_char outbuf[1024]; + yaz_nfa_char *outp,*cp, *cp1, *cp2; + yaz_nfa_state *s, *s2; + void *vp; + int i; + size_t sz; + + c1=yaz_nfa_create_string_converter(n,str1,3); + + for(i=0;i<1024;i++) + outbuf[i]=10000+i; + outp=outbuf; + sz=1; + i=yaz_nfa_run_converters(n, c1, &outp, &sz); + YAZ_CHECK_EQ(i,2); /* overrun */ + YAZ_CHECK_EQ(outbuf[0],'a'); + YAZ_CHECK_EQ(outbuf[1],10000+1); + + for(i=0;i<1024;i++) + outbuf[i]=10000+i; + outp=outbuf; + sz=3; + i=yaz_nfa_run_converters(n, c1, &outp, &sz); + YAZ_CHECK_EQ(i,0); + YAZ_CHECK_EQ(outbuf[0],'a'); + YAZ_CHECK_EQ(outbuf[1],'b'); + YAZ_CHECK_EQ(outbuf[2],'c'); + YAZ_CHECK_EQ(outbuf[3],10000+3); + YAZ_CHECK_EQ(sz,0); + + c2=yaz_nfa_create_string_converter(n,str1,2); + yaz_nfa_append_converter(n,c1,c2); + + for(i=0;i<1024;i++) + outbuf[i]=10000+i; + outp=outbuf; + sz=10; + i=yaz_nfa_run_converters(n, c1, &outp, &sz); + YAZ_CHECK_EQ(i,0); + YAZ_CHECK_EQ(outbuf[0],'a'); + YAZ_CHECK_EQ(outbuf[1],'b'); + YAZ_CHECK_EQ(outbuf[2],'c'); + YAZ_CHECK_EQ(outbuf[3],'a'); + YAZ_CHECK_EQ(outbuf[4],'b'); + YAZ_CHECK_EQ(outbuf[5],10000+5); + YAZ_CHECK_EQ(sz,5); + + /* ABC -> abcab */ + (void) yaz_nfa_add_state(n);/* start state */ + s=yaz_nfa_add_state(n); + yaz_nfa_add_empty_transition(n,0,s); + yaz_nfa_set_backref_point(n,s,1,1); + s=yaz_nfa_add_sequence(n, s, seq1 ); + yaz_nfa_set_result(n,s,c1); + yaz_nfa_set_backref_point(n,s,1,0); + + /* [k-o][m-n]*x -> m-n sequence */ + s=yaz_nfa_add_state(n); + yaz_nfa_add_empty_transition(n,0,s); + yaz_nfa_set_backref_point(n,s,2,1); + s2=yaz_nfa_add_state(n); + yaz_nfa_add_transition(n,s,s2,'k','o'); + yaz_nfa_add_transition(n,s2,s2,'m','n'); + s=yaz_nfa_add_state(n); + yaz_nfa_add_transition(n,s2,s,'x','x'); + yaz_nfa_set_backref_point(n,s,2,0); + + c1=yaz_nfa_create_backref_converter(n,2); + yaz_nfa_set_result(n,s,c1); + +#if VERBOSE + yaz_nfa_dump(0,n, printfunc2); +#endif + + cp=seq2; + sz=18; + i=yaz_nfa_match(n,&cp,&sz,&vp); + c2=vp; + YAZ_CHECK_EQ(i,YAZ_NFA_SUCCESS); + i=yaz_nfa_get_backref(n, 2, &cp1, &cp2 ); + YAZ_CHECK_EQ(i,0); + YAZ_CHECK_EQ((int)c1,(int)c2); + for(i=0;i<1024;i++) + outbuf[i]=10000+i; + outp=outbuf; + sz=11; + i=yaz_nfa_run_converters(n, c2, &outp, &sz); + YAZ_CHECK_EQ(i,0); + YAZ_CHECK_EQ(outbuf[0],'k'); + YAZ_CHECK_EQ(outbuf[1],'m'); + YAZ_CHECK_EQ(outbuf[2],'n'); + YAZ_CHECK_EQ(outbuf[3],'m'); + YAZ_CHECK_EQ(outbuf[4],'x'); + YAZ_CHECK_EQ(outbuf[5],10000+5); + YAZ_CHECK_EQ(sz,11-5); + + yaz_nfa_destroy(n); +} + + int main(int argc, char **argv) { YAZ_CHECK_INIT(argc, argv); nmem_init (); - construction_test(); + construction_test(); + converter_test(); nmem_exit (); YAZ_CHECK_TERM; } -- 1.7.10.4