From: Heikki Levanto Date: Fri, 14 Jul 2006 13:06:37 +0000 (+0000) Subject: Loading of the nfa now possible from an xml file. X-Git-Tag: YAZ.2.1.26~23 X-Git-Url: http://jsfdemo.indexdata.com/cgi-bin?a=commitdiff_plain;h=fd3a0b7302ceed6425c05b6537cb654201cb1494;p=yaz-moved-to-github.git Loading of the nfa now possible from an xml file. Had to disable two tests, because make distcheck could not find the files I wanted to load. After that distcheck passes all right. --- diff --git a/include/yaz/nfa.h b/include/yaz/nfa.h index addea16..7ff29bc 100644 --- a/include/yaz/nfa.h +++ b/include/yaz/nfa.h @@ -1,6 +1,6 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. - * $Id: nfa.h,v 1.8 2006-07-04 12:59:56 heikki Exp $ + * $Id: nfa.h,v 1.9 2006-07-14 13:06:37 heikki Exp $ */ /** @@ -534,6 +534,11 @@ void yaz_nfa_dump(FILE *F, yaz_nfa *n, char *(*strfunc)(void *) ); +/** \brief Helper to dump converters + * + */ +char *yaz_nfa_dump_converter(void *conv); + /* \} */ diff --git a/include/yaz/nfaxml.h b/include/yaz/nfaxml.h index 6c7436d..98fd8a6 100644 --- a/include/yaz/nfaxml.h +++ b/include/yaz/nfaxml.h @@ -1,6 +1,6 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. - * $Id: nfaxml.h,v 1.4 2006-07-06 13:10:29 heikki Exp $ + * $Id: nfaxml.h,v 1.5 2006-07-14 13:06:37 heikki Exp $ */ /** @@ -10,7 +10,7 @@ * The xml file is something like this (using round brakcets * on tags, not to confuse our documentation tools) * (?xml ...) - * (charmap) + * (ruleset) * (rule) * (fromstring) FOO (/fromstring) * (tostring) BAR (/tostring) @@ -48,7 +48,7 @@ YAZ_BEGIN_CDECL /** \brief Parse the NFA from a XML document * * \param doc the xml tree to parse - * \param error_info will be filled in case of errors + * \param filename used for info in error messages * * \returns either the NFA, or null in case of errors * @@ -62,7 +62,7 @@ YAZ_BEGIN_CDECL * logged in yazlog. * */ -yaz_nfa *yaz_nfa_parse_xml_doc(xmlDocPtr doc); +yaz_nfa *yaz_nfa_parse_xml_doc(xmlDocPtr doc, const char *filename); /** \brief Parse the NFA from a file @@ -102,7 +102,7 @@ yaz_nfa *yaz_nfa_parse_xml_file(const char *filepath); * logged in yazlog. * */ -yaz_nfa *yaz_nfa_parse_xml_memory(const char *xmlbuff); +yaz_nfa *yaz_nfa_parse_xml_memory(const char *xmlbuff, const char *filename); YAZ_END_CDECL diff --git a/src/nfa.c b/src/nfa.c index 6046b3d..fe6362e 100644 --- a/src/nfa.c +++ b/src/nfa.c @@ -1,7 +1,7 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. * - * $Id: nfa.c,v 1.11 2006-07-07 08:36:36 adam Exp $ + * $Id: nfa.c,v 1.12 2006-07-14 13:06:38 heikki Exp $ */ /** @@ -748,6 +748,45 @@ void yaz_nfa_dump(FILE *F, yaz_nfa *n, } } +static char buf[5000]=""; +char *yaz_nfa_dump_converter(void *conv) +{ + char onebuf[500]=""; + yaz_nfa_converter *c=conv; + yaz_nfa_char *cp; + size_t len; + *buf=0; + while (c) { + switch(c->type) { + case conv_none: + sprintf(onebuf,"(none)" ); + break; + case conv_string: + sprintf(onebuf,"(string '" ); + strcat(buf,onebuf); + cp=c->string; + len=c->strlen; + while (len--) { + onebuf[0]=*cp++; + onebuf[1]=0; + strcat(buf,onebuf); + } + strcat(buf,"')"); + onebuf[0]=0; + break; + case conv_backref: + sprintf(onebuf,"(backref %d) ",c->backref_no); + break; + case conv_range: + sprintf(onebuf,"(range %d) ",c->char_diff); + break; + } + strcat(buf,onebuf); + c=c->next; + } /* while */ + return buf; +} + /* * Local variables: diff --git a/src/nfaxml.c b/src/nfaxml.c index 84a8b0b..94ef5e6 100644 --- a/src/nfaxml.c +++ b/src/nfaxml.c @@ -1,7 +1,7 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. * - * $Id: nfaxml.c,v 1.8 2006-07-06 14:06:17 heikki Exp $ + * $Id: nfaxml.c,v 1.9 2006-07-14 13:06:38 heikki Exp $ */ /** @@ -25,25 +25,267 @@ #include #include +/** \brief How long strings we are willing to handle here */ +#define MAXDATALEN 200 + +/** \brief Get content of a node, in utf16, for yaz_nfa */ +static int utf16_content(xmlNodePtr node, yaz_nfa_char *buf, int maxlen, + const char *filename, int rulenumber) +{ + int bufidx=0; + xmlChar *content = xmlNodeGetContent(node); + xmlChar *cp=content; + int conlen=strlen((char *)content); + int len; + int res; + while (*cp && (bufidxchildren; node; node = node->next) + { + if (node->type != XML_ELEMENT_NODE) + continue; + clauses++; + if (!strcmp((const char *) node->name, "fromstring")) + { + state = parse_fromstring(nfa, node, filename, rulenumber ); + if (!state) + return 0; + } else if (!strcmp((const char *) node->name, "tostring")) + { + conv = parse_tostring(nfa, node, filename, rulenumber ); + if (!conv) + return 0; + } else if (!strcmp((const char *) node->name, "fromrange")) + { + state = parse_fromrange(nfa, node, + &range_begin, &range_end, filename, rulenumber ); + if (!state) + return 0; + } else if (!strcmp((const char *) node->name, "torange")) + { + conv = parse_torange(nfa, node, + range_begin, range_end, filename, rulenumber ); + if (!conv) + return 0; + } else { + yaz_log(YLOG_FATAL,"Unknown clause '%s' in %s rule %d", + node->name, filename,rulenumber); + return 0; + } + } /* for child */ + if (!state) { + yaz_log(YLOG_FATAL,"No 'from' clause in a rule %d in %s", + rulenumber,filename); + return 0; + } + if (!conv) { + yaz_log(YLOG_FATAL,"No 'to' clause in a rule %d in %s", + rulenumber,filename); + return 0; + } + if (clauses != 2) { + yaz_log(YLOG_FATAL,"Must have exactly one 'from' and one 'to' clause " + "in rule %d in %s", rulenumber,filename); + return 0; + } + if ( YAZ_NFA_SUCCESS == yaz_nfa_set_result(nfa,state,conv)) + return 1; + yaz_log(YLOG_FATAL,"Conflicting rules in %s rule %d", + filename, rulenumber); + return 0; +} /* parse_rule */ + + /** \brief Parse the NFA from a XML document */ -yaz_nfa *yaz_nfa_parse_xml_doc(xmlDocPtr doc) +yaz_nfa *yaz_nfa_parse_xml_doc(xmlDocPtr doc, const char *filename) { - libxml2_error_to_yazlog(YLOG_FATAL, "yaz_nfa_parse_doc"); + xmlNodePtr node; + yaz_nfa *nfa; + int rulenumber=0; if (!doc) return 0; - - return 0; -} + libxml2_error_to_yazlog(YLOG_FATAL, "yaz_nfa_parse_doc"); + node = xmlDocGetRootElement(doc); + if (!node || node->type != XML_ELEMENT_NODE || + strcmp((const char *) node->name, "ruleset")) + { + yaz_log(YLOG_FATAL,"nfa_parse_xml: Could not find root element 'ruleset' " + "in %s", filename); + return 0; + } + nfa= yaz_nfa_init(); + if (!nfa) + { + yaz_log(YLOG_FATAL,"nfa_parse_xml: Creating nfa failed, can't parse %s", + filename); + return 0; + } + + for (node = node->children; node; node = node->next) + { + if (node->type != XML_ELEMENT_NODE) + continue; + if (!strcmp((const char *) node->name, "rule")) { + if (!parse_rule(nfa,node,filename,rulenumber++)) + return 0; + } else { + yaz_log(YLOG_FATAL,"nfa_parse_xml: " + "expected 'rule', found '%s' in %s", + (const char *) node->name,filename); + return 0; + } + } /* for */ + return nfa; +} /* yaz_nfa_parse_xml_doc */ /** \brief Parse the NFA from a file */ -yaz_nfa *yaz_nfa_parse_xml_file(const char *filepath) { +yaz_nfa *yaz_nfa_parse_xml_file(const char *filepath) +{ int nSubst; - xmlDocPtr doc; + if (!filepath) + { + yaz_log(YLOG_FATAL,"yaz_nfa_parse_xml_file called with NULL"); + return 0; + } libxml2_error_to_yazlog(YLOG_FATAL, "yaz_nfa_parse_xml_file"); doc = xmlParseFile(filepath); @@ -54,16 +296,21 @@ yaz_nfa *yaz_nfa_parse_xml_file(const char *filepath) { if (nSubst==-1) { return 0; } - return yaz_nfa_parse_xml_doc(doc); + return yaz_nfa_parse_xml_doc(doc, filepath); } /** \brief Parse the NFA from a memory buffer */ -yaz_nfa *yaz_nfa_parse_xml_memory(const char *xmlbuff) { +yaz_nfa *yaz_nfa_parse_xml_memory(const char *xmlbuff, const char *filename) { xmlDocPtr doc; + if (!xmlbuff) + { + yaz_log(YLOG_FATAL,"yaz_nfa_parse_memroy called with NULL"); + return 0; + } libxml2_error_to_yazlog(YLOG_FATAL, "yaz_nfa_parse_xml_memory"); doc = xmlParseMemory(xmlbuff, strlen(xmlbuff)); - return yaz_nfa_parse_xml_doc(doc); + return yaz_nfa_parse_xml_doc(doc,filename); } diff --git a/test/Makefile.am b/test/Makefile.am index bbce667..6605a2a 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1,6 +1,6 @@ ## Copyright (C) 1994-2006, Index Data ApS ## All rights reserved. -## $Id: Makefile.am,v 1.23 2006-07-07 13:39:05 heikki Exp $ +## $Id: Makefile.am,v 1.24 2006-07-14 13:06:38 heikki Exp $ check_PROGRAMS = tsticonv tstnmem tstmatchstr tstwrbuf tstodr tstccl tstlog \ tstsoap1 tstsoap2 tstodrstack tstlogthread tstxmlquery tstpquery \ @@ -19,7 +19,9 @@ EXTRA_DIST = tstodr.asn tstodrcodec.c tstodrcodec.h cqlsample \ marc4 marc4.xml marc4.chr marc4.xml.marc \ marc5 marc5.xml marc5.chr marc5.xml.marc \ marc6 marc6.xml marc6.chr marc6.xml.marc \ - tst_record_conv.xsl + tst_record_conv.xsl \ + nfaxml-simple.xml nfaxml-main.xml \ + nfaxml-include.xml nfaxml-badinclude.xml YAZCOMP = ../util/yaz-asncomp YAZCOMPLINE = $(YAZCOMP) -d z.tcl -i yaz -I../include $(YCFLAGS) diff --git a/test/nfaxmltest1.c b/test/nfaxmltest1.c index b6cf6d5..754be3b 100644 --- a/test/nfaxmltest1.c +++ b/test/nfaxmltest1.c @@ -1,7 +1,7 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. * - * $Id: nfaxmltest1.c,v 1.6 2006-07-07 13:39:05 heikki Exp $ + * $Id: nfaxmltest1.c,v 1.7 2006-07-14 13:06:38 heikki Exp $ * */ @@ -25,8 +25,9 @@ void test1() { " bar " "" ""; - yaz_nfa *nfa = yaz_nfa_parse_xml_memory(xmlstr); - YAZ_CHECK_TODO(nfa); + yaz_nfa *nfa = yaz_nfa_parse_xml_memory(xmlstr,"test1"); + YAZ_CHECK(nfa); + yaz_nfa_destroy(nfa); } @@ -40,7 +41,9 @@ void test2() { ""; /* missing "" */ yaz_log(YLOG_LOG,"Parsing bad xml, expecting errors:"); - nfa = yaz_nfa_parse_xml_memory(xmlstr); + nfa = yaz_nfa_parse_xml_memory(xmlstr,"test2"); + YAZ_CHECK(!nfa); + nfa = yaz_nfa_parse_xml_memory(0,"test2-null"); YAZ_CHECK(!nfa); } @@ -59,7 +62,7 @@ void test3() { do { yaz_log(YLOG_LOG,"Parsing (good) xml file '%s'", *f); nfa=yaz_nfa_parse_xml_file(*f); - YAZ_CHECK_TODO(nfa); + YAZ_CHECK_TODO(nfa); /* fails on make distcheck, can't find the files*/ } while (*++f); f = badfilenames; @@ -70,6 +73,232 @@ void test3() { } while (*++f); } +/** \brief Test parsing of a few minimal xml strings, with logical errors */ +void test4() { + yaz_nfa *nfa; + char *xmls[] = { + /*a*/" ", + /*b*/" ", + /*c*/" ", + /*d*/"" + "MissingTo" + "", + /*e*/"" + "DuplicateFrom" + "Another Fromstring" + "", + /*f*/"" + "MissingFrom" + "", + /*g*/"" + "DuplicateTo" + "AnotherTo" + "", + /*h*/"" + "GoodUTF:æøå" + "", + /*i*/"" + "BadUtf8:Ø" + "", + /*j*/"" + "" + "ConflictingRules" + "IdenticalStrings" + "" + "" + "ConflictingRules" + "IdenticalStrings" + "" + "", + /*k*/"", /* empty string! */ + /*l*/"" + "" + "A-Z" + "a-x" + "" + "", + 0 }; + char **xmlp=xmls; + char label[]= { 'a', 0 }; + while ( *xmlp ) { + yaz_log(YLOG_LOG,"test4-%s: Parsing bad xml, expecting errors:", + label); + nfa = yaz_nfa_parse_xml_memory(*xmlp,label); + YAZ_CHECK(!nfa); + xmlp++; + label[0]++; + } +} /* test4 */ + +static void test5() { + struct conv_test { + unsigned char *name; + int expresult; + unsigned char *xml; + unsigned char *from; + unsigned char *to; + }; + struct conv_test tests[]= { + { "test5-1", YAZ_NFA_SUCCESS, + "" + "" + "foo" + "bar" + "" + "", + "this is a foo test ofoofo fo foofoo fofoofooofoooo ", + "this is a bar test obarfo fo barbar fobarbarobaroo " + }, + { "test5-2", YAZ_NFA_SUCCESS, + "" + "" + "ooooo" + "five " + "" + "" + "oooo" + "four " + "" + "" + "ooo" + "three " + "" + "" + "oo" + "two " + "" + "", + "oo-oooo-", + "two -four -" + }, + { "test5-4", YAZ_NFA_SUCCESS, 0, /* same xml */ + "oo-oooo-ooooooo-", + "two -four -five two -" + }, + { "test5-3", YAZ_NFA_OVERRUN, 0, /* could match further oo's */ + "oo-oooo-ooooooo", + "two -four -five " + }, + { "test5-4 (lowercase)", YAZ_NFA_SUCCESS, + "" + "" + "A-Z" + "a-z" + "" + "", + "LowerCase TEST with A-Z and a-z", + "lowercase test with a-z and a-z" + }, + { "test5-5 (lowercase entities)", YAZ_NFA_SUCCESS, + "" + "" + "A-Z" + "a-z" + "" + "", + "LowerCase TEST with A-Z and a-z (and ) A; )", + "lowercase test with a-z and a-z (and ) a; )" + }, + { "test5-6 (danish lowercase)", YAZ_NFA_SUCCESS, + "" + "" + "A-Z" + "a-z" + "" + "" + "À-Ö" + "à-ö" + "" + "" + "Ø-ß" + "ø-ÿ" + "" + "" + "Å" + "å" + "" + "" + "Dänish" + "DÄNISH" + "" + "", + "LowerCase TEST with Dänish Å !? åæø ÅÆØ XYZ", + "lowercase test with DÄNISH å !? åæø åæø xyz" + }, + {0,0,0,0} + }; + char *xml=0; +#define MAXBUF 2048 + yaz_nfa *nfa; + yaz_nfa_char frombuf[MAXBUF]; + yaz_nfa_char tobuf[MAXBUF]; + unsigned char charbuf[MAXBUF]; + struct conv_test *thistest=tests; + unsigned char *cp; + yaz_nfa_char *ycp; + size_t incharsleft; + size_t outcharsleft; + size_t prev_incharsleft; + int rc; + yaz_nfa_char *fromp; + yaz_nfa_char *top; + while (thistest->name) { + yaz_log(YLOG_DEBUG,"Starting test %s",thistest->name); + if (thistest->xml) + xml=thistest->xml; + nfa = yaz_nfa_parse_xml_memory(xml, thistest->name); + YAZ_CHECK(nfa); + if (nfa) { + if ( yaz_test_get_verbosity() > 3) { + yaz_nfa_dump(0,nfa,yaz_nfa_dump_converter); + } + ycp=frombuf; + cp=thistest->from; + while ( (*ycp++ = *cp++) ) + ; /* strcpy, but expand to yaz_nfa_chars */ + incharsleft = strlen(thistest->from); + prev_incharsleft = 0; + outcharsleft = MAXBUF-1; + fromp = frombuf; + top = tobuf; + rc = YAZ_NFA_SUCCESS; + while ( (rc == YAZ_NFA_SUCCESS) && (incharsleft>0) && + (prev_incharsleft != incharsleft ) ) /* prevent loops */ + { + prev_incharsleft=incharsleft; + rc=yaz_nfa_convert_slice(nfa, &fromp, &incharsleft, + &top, &outcharsleft); + } + YAZ_CHECK_EQ(rc, thistest->expresult); + if ( (rc == thistest->expresult) && + (rc == YAZ_NFA_SUCCESS)) { + YAZ_CHECK_EQ(incharsleft, 0); + YAZ_CHECK( prev_incharsleft != incharsleft ); + } + ycp=tobuf; + cp=charbuf; + while (ycp != top ) + *cp++ = *ycp++; + *cp=0; + if ( yaz_test_get_verbosity() > 2) { + printf("%s from: '%s' \n",thistest->name, thistest->from); + printf("%s result: '%s' \n",thistest->name, charbuf); + printf("%s expect: '%s' \n",thistest->name, thistest->to); + } + YAZ_CHECK( 0==strcmp(thistest->to,charbuf) ); + yaz_nfa_destroy(nfa); + } + thistest++; + } + +} /* test5 */ + + +/* More things to test: + * + * - Empty strings in to/from + * - ranges, length mismatches, etc + */ int main(int argc, char **argv) { @@ -80,6 +309,8 @@ int main(int argc, char **argv) test1(); test2(); test3(); + test4(); + test5(); nmem_exit (); YAZ_CHECK_TERM;