From 63f558c67e4818ce6d02747cfa3769f26a273cf0 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Thu, 16 Sep 2004 14:07:48 +0000 Subject: [PATCH] Merge from head the facility that removes leading articles. --- configure.in | 4 +- doc/recordmodel.xml | 372 +++++++++++++++++++++++++++--------------------- include/charmap.h | 6 +- include/zebramap.h | 4 +- index/extract.c | 230 ++++++------------------------ index/zrpn.c | 64 +++------ tab/default.idx | 2 +- tab/scan.chr | 7 +- test/Makefile.am | 2 +- test/sort2/Makefile.am | 10 ++ test/sort2/default.idx | 55 +++++++ test/sort2/my.abs | 14 ++ test/sort2/rec1.xml | 3 + test/sort2/rec2.xml | 3 + test/sort2/rec3.xml | 3 + test/sort2/rec4.xml | 3 + test/sort2/sort.chr | 34 +++++ test/sort2/test1.sh | 34 +++++ test/sort2/zebra.cfg | 14 ++ util/charmap.c | 61 +++++--- util/zebramap.c | 6 +- 21 files changed, 502 insertions(+), 429 deletions(-) create mode 100644 test/sort2/Makefile.am create mode 100644 test/sort2/default.idx create mode 100644 test/sort2/my.abs create mode 100644 test/sort2/rec1.xml create mode 100644 test/sort2/rec2.xml create mode 100644 test/sort2/rec3.xml create mode 100644 test/sort2/rec4.xml create mode 100644 test/sort2/sort.chr create mode 100755 test/sort2/test1.sh create mode 100644 test/sort2/zebra.cfg diff --git a/configure.in b/configure.in index 5dc565f..345c77a 100644 --- a/configure.in +++ b/configure.in @@ -1,5 +1,5 @@ dnl Zebra, Index Data Aps, 1995-2004 -dnl $Id: configure.in,v 1.91.2.2 2004-08-20 11:07:32 adam Exp $ +dnl $Id: configure.in,v 1.91.2.3 2004-09-16 14:07:48 adam Exp $ dnl AC_INIT(include/zebraver.h) AM_INIT_AUTOMAKE(idzebra,1.3.18) @@ -387,7 +387,7 @@ AC_OUTPUT([ doc/tkl.xsl test/Makefile test/gils/Makefile test/usmarc/Makefile test/api/Makefile test/rusmarc/Makefile test/cddb/Makefile test/malxml/Makefile - test/config/Makefile + test/config/Makefile test/sort2/Makefile perl/Makefile.PL test/xelm/Makefile test/dmoz/Makefile test/xpath/Makefile test/sort/Makefile 
test/zsh/Makefile test/marcxml/Makefile test/charmap/Makefile test/codec/Makefile diff --git a/doc/recordmodel.xml b/doc/recordmodel.xml index e052f12..b83fa67 100644 --- a/doc/recordmodel.xml +++ b/doc/recordmodel.xml @@ -1,5 +1,5 @@ - + The Record Model @@ -1786,174 +1786,216 @@ special-purpose fields such as WWW-style linkages (URx). - - The field types, and hence character sets, are associated with data - elements by the .abs files (see above). - The file default.idx - provides the association between field type codes (as used in the .abs - files) and the character map files (with the .chr suffix). The format - of the .idx file is as follows - - - - - - - index field type code - - - This directive introduces a new search index code. - The argument is a one-character code to be used in the - .abs files to select this particular index type. An index, roughly, - corresponds to a particular structure attribute during search. Refer - to . - - - - sort field code type - - - This directive introduces a - sort index. The argument is a one-character code to be used in the - .abs fie to select this particular index type. The corresponding - use attribute must be used in the sort request to refer to this - particular sort index. The corresponding character map (see below) - is used in the sort process. - - - - completeness boolean - - - This directive enables or disables complete field indexing. - The value of the boolean should be 0 - (disable) or 1. If completeness is enabled, the index entry will - contain the complete contents of the field (up to a limit), with words - (non-space characters) separated by single space characters - (normalized to " " on display). When completeness is - disabled, each word is indexed as a separate entry. Complete subfield - indexing is most useful for fields which are typically browsed (eg. - titles, authors, or subjects), or instances where a match on a - complete subfield is essential (eg. exact title searching). 
For fields - where completeness is disabled, the search engine will interpret a - search containing space characters as a word proximity search. - - - - charmap filename - - - This is the filename of the character - map to be used for this index for field type. - - - - - - - The contents of the character map files are structured as follows: - - - - - - - lowercase value-set - - - This directive introduces the basic value set of the field type. - The format is an ordered list (without spaces) of the - characters which may occur in "words" of the given type. - The order of the entries in the list determines the - sort order of the index. In addition to single characters, the - following combinations are legal: - - - - - - - - Backslashes may be used to introduce three-digit octal, or - two-digit hex representations of single characters - (preceded by x). - In addition, the combinations - \\, \\r, \\n, \\t, \\s (space — remember that real - space-characters may not occur in the value definition), and - \\ are recognized, with their usual interpretation. - - - - - - Curly braces {} may be used to enclose ranges of single - characters (possibly using the escape convention described in the - preceding point), eg. {a-z} to introduce the - standard range of ASCII characters. - Note that the interpretation of such a range depends on - the concrete representation in your local, physical character set. - - - - - - paranthesises () may be used to enclose multi-byte characters - - eg. diacritics or special national combinations (eg. Spanish - "ll"). When found in the input stream (or a search term), - these characters are viewed and sorted as a single character, with a - sorting value depending on the position of the group in the value - statement. - - + + The default.idx file + + The field types, and hence character sets, are associated with data + elements by the .abs files (see above). 
+ The file default.idx + provides the association between field type codes (as used in the .abs + files) and the character map files (with the .chr suffix). The format + of the .idx file is as follows + - + + + + + index field type code + + + This directive introduces a new search index code. + The argument is a one-character code to be used in the + .abs files to select this particular index type. An index, roughly, + corresponds to a particular structure attribute during search. Refer + to . + + + + sort field code type + + + This directive introduces a + sort index. The argument is a one-character code to be used in the + .abs file to select this particular index type. The corresponding + use attribute must be used in the sort request to refer to this + particular sort index. The corresponding character map (see below) + is used in the sort process. + + + + completeness boolean + + + This directive enables or disables complete field indexing. + The value of the boolean should be 0 + (disable) or 1. If completeness is enabled, the index entry will + contain the complete contents of the field (up to a limit), with words + (non-space characters) separated by single space characters + (normalized to " " on display). When completeness is + disabled, each word is indexed as a separate entry. Complete subfield + indexing is most useful for fields which are typically browsed (eg. + titles, authors, or subjects), or instances where a match on a + complete subfield is essential (eg. exact title searching). For fields + where completeness is disabled, the search engine will interpret a + search containing space characters as a word proximity search. + + + + charmap filename + + + This is the filename of the character + map to be used for this index for field type. + + + + + - + + + + uppercase value-set - - - This directive introduces the - upper-case equivalencis to the value set (if any). 
The number and - order of the entries in the list should be the same as in the - lowercase directive. - - - - space value-set - - - This directive introduces the character - which separate words in the input stream. Depending on the - completeness mode of the field in question, these characters either - terminate an index entry, or delimit individual "words" in - the input stream. The order of the elements is not significant — - otherwise the representation is the same as for the - uppercase and lowercase - directives. - - - - map value-set - target - - - This directive introduces a - mapping between each of the members of the value-set on the left to - the character on the right. The character on the right must occur in - the value set (the lowercase directive) of - the character set, but - it may be a paranthesis-enclosed multi-octet character. This directive - may be used to map diacritics to their base characters, or to map - HTML-style character-representations to their natural form, etc. - - - - + + The character map file format + + The contents of the character map files are structured as follows: + + + + + + lowercase value-set + + + This directive introduces the basic value set of the field type. + The format is an ordered list (without spaces) of the + characters which may occur in "words" of the given type. + The order of the entries in the list determines the + sort order of the index. In addition to single characters, the + following combinations are legal: + + + + + + + + Backslashes may be used to introduce three-digit octal, or + two-digit hex representations of single characters + (preceded by x). + In addition, the combinations + \\, \\r, \\n, \\t, \\s (space — remember that real + space-characters may not occur in the value definition), and + \\ are recognized, with their usual interpretation. 
+ + + + + + Curly braces {} may be used to enclose ranges of single + characters (possibly using the escape convention described in the + preceding point), eg. {a-z} to introduce the + standard range of ASCII characters. + Note that the interpretation of such a range depends on + the concrete representation in your local, physical character set. + + + + + + Parentheses () may be used to enclose multi-byte characters - + eg. diacritics or special national combinations (eg. Spanish + "ll"). When found in the input stream (or a search term), + these characters are viewed and sorted as a single character, with a + sorting value depending on the position of the group in the value + statement. + + + + + + + + + uppercase value-set + + + This directive introduces the + upper-case equivalents to the value set (if any). The number and + order of the entries in the list should be the same as in the + lowercase directive. + + + + space value-set + + + This directive introduces the characters + which separate words in the input stream. Depending on the + completeness mode of the field in question, these characters either + terminate an index entry, or delimit individual "words" in + the input stream. The order of the elements is not significant — + otherwise the representation is the same as for the + uppercase and lowercase + directives. + + + + map value-set + target + + + This directive introduces a + mapping between each of the members of the value-set on the left to + the character on the right. The character on the right must occur in + the value set (the lowercase directive) of + the character set, but + it may be a parenthesis-enclosed multi-octet character. This directive + may be used to map diacritics to their base characters, or to map + HTML-style character-representations to their natural form, etc. The map directive + can also be used to ignore leading articles in searching and/or sorting, and to perform + other special transformations. See section . 
+ + + + + + + Ignoring leading articles + + In addition to specifying sort orders, space (blank) handling, and upper/lowercase folding, + you can also use the character map files to make Zebra ignore leading articles in sorting + records, or when doing complete field searching. + + + This is done using the map directive in the character map file. In a + nutshell, what you do is map certain sequences of characters, when they occur + in the beginning of a field, to a space. Assuming that the character "@" is + defined as a space character in your file, you can do: + + map (^The\s) @ + map (^the\s) @ + + The effect of these directives is to map either 'the' or 'The', followed by a space + character, to a space. The hat ^ character denotes beginning-of-field only when + complete-subfield indexing or sort indexing is taking place; otherwise, it is treated just + as any other character. + + + Because the default.idx file can be used to associate different + character maps with different indexing types -- and you can create additional indexing + types, should the need arise -- it is possible to specify that leading articles should be + ignored either in sorting, in complete-field searching, or both. + + + If you ignore certain prefixes in sorting, then these will be eliminated from the index, + and sorting will take place as if they weren't there. However, if you set the system up + to ignore certain prefixes in searching, then these are deleted both + from the indexes and from query terms, when the client specifies complete-field + searching. This has the effect that a search for 'the science journal' and 'science + journal' would both produce the same results. 
+ + - diff --git a/include/charmap.h b/include/charmap.h index 365b6ab..facf776 100644 --- a/include/charmap.h +++ b/include/charmap.h @@ -1,4 +1,4 @@ -/* $Id: charmap.h,v 1.9 2004-07-28 09:47:41 adam Exp $ +/* $Id: charmap.h,v 1.9.2.1 2004-09-16 14:07:49 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -43,9 +43,9 @@ YAZ_EXPORT chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, const char *tabroot); YAZ_EXPORT void chrmaptab_destroy (chrmaptab tab); -YAZ_EXPORT const char **chr_map_input(chrmaptab t, const char **from, int len); +YAZ_EXPORT const char **chr_map_input(chrmaptab t, const char **from, int len, int first); YAZ_EXPORT const char **chr_map_input_x(chrmaptab t, - const char **from, int *len); + const char **from, int *len, int first); YAZ_EXPORT const char **chr_map_input_q(chrmaptab maptab, const char **from, int len, const char **qmap); diff --git a/include/zebramap.h b/include/zebramap.h index 62845c0..60e8ebc 100644 --- a/include/zebramap.h +++ b/include/zebramap.h @@ -1,4 +1,4 @@ -/* $Id: zebramap.h,v 1.15 2004-07-28 09:47:41 adam Exp $ +/* $Id: zebramap.h,v 1.15.2.1 2004-09-16 14:07:49 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -34,7 +34,7 @@ ZebraMaps zebra_maps_open (Res res, const char *base); void zebra_maps_close (ZebraMaps zm); const char **zebra_maps_input (ZebraMaps zms, unsigned reg_id, - const char **from, int len); + const char **from, int len, int first); const char *zebra_maps_output(ZebraMaps, unsigned reg_id, const char **from); int zebra_maps_attr (ZebraMaps zms, Z_AttributesPlusTerm *zapt, diff --git a/index/extract.c b/index/extract.c index 9561c6e..1b8fdb7 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.158 2004-08-04 08:35:23 adam Exp $ +/* $Id: extract.c,v 1.157.2.1 2004-09-16 14:07:50 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index 
Data Aps @@ -20,6 +20,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + #include #include #include @@ -92,51 +93,6 @@ static const char **searchRecordKey (ZebraHandle zh, struct recKeys *reckeys, int attrSetS, int attrUseS) { -#if IT_KEY_NEW -/* #error searchRecordKey does not work yet in this mode.. */ - static const char *ws[32]; - void *decode_handle = iscz1_start(); - int off = 0; - int startSeq = -1; - int seqno = 0; - int i; - - for (i = 0; i<32; i++) - ws[i] = NULL; - - while (off < reckeys->buf_used) - { - const char *src = reckeys->buf + off; - struct it_key key; - char *dst = (char*) &key; - int attrSet, attrUse; - - iscz1_decode(decode_handle, &dst, &src); - assert(key.len < 4 && key.len > 2); - - attrSet = key.mem[0]; - attrUse = key.mem[1]; - seqno = key.mem[2]; - - if (attrUseS == attrUse && attrSetS == attrSet) - { - int woff; - - if (startSeq == -1) - startSeq = seqno; - woff = seqno - startSeq; - if (woff >= 0 && woff < 31) - ws[woff] = src; - } - - while (*src++) - ; - off = src - reckeys->buf; - } - iscz1_stop(decode_handle); - assert (off == reckeys->buf_used); - return ws; -#else static const char *ws[32]; int off = 0; int startSeq = -1; @@ -215,7 +171,6 @@ static const char **searchRecordKey (ZebraHandle zh, } assert (off == reckeys->buf_used); return ws; -#endif } struct file_read_info { @@ -460,18 +415,6 @@ struct recordLogInfo { int recordOffset; struct recordGroup *rGroup; }; - -void create_rec_keys_codec(struct recKeys *keys) -{ - keys->buf_used = 0; -#if IT_KEY_NEW - iscz1_reset(keys->codec_handle); -#else - keys->prevAttrUse = -1; - keys->prevAttrSet = -1; - keys->prevSeqNo = 0; -#endif -} static int file_extract_record(ZebraHandle zh, SYSNO *sysno, const char *fname, @@ -512,8 +455,10 @@ static int file_extract_record(ZebraHandle zh, /* we are going to read from a file, so prepare the extraction */ int i; - create_rec_keys_codec(&zh->reg->keys); - + zh->reg->keys.buf_used = 0; + 
zh->reg->keys.prevAttrUse = -1; + zh->reg->keys.prevAttrSet = -1; + zh->reg->keys.prevSeqNo = 0; zh->reg->sortKeys.buf_used = 0; recordOffset = fi->file_moffset; @@ -606,10 +551,7 @@ static int file_extract_record(ZebraHandle zh, { rinfo = dict_lookup (zh->reg->matchDict, matchStr); if (rinfo) - { - assert(*rinfo == sizeof(*sysno)); memcpy (sysno, rinfo+1, sizeof(*sysno)); - } } else { @@ -900,7 +842,7 @@ int buffer_extract_record (ZebraHandle zh, int delete_flag, int test_mode, const char *recordType, - SYSNO *sysno, + int *sysno, const char *match_criteria, const char *fname, int force_update, @@ -935,8 +877,10 @@ int buffer_extract_record (ZebraHandle zh, extractCtrl.endf = zebra_record_int_end; extractCtrl.fh = &fc; - create_rec_keys_codec(&zh->reg->keys); - + zh->reg->keys.buf_used = 0; + zh->reg->keys.prevAttrUse = -1; + zh->reg->keys.prevAttrSet = -1; + zh->reg->keys.prevSeqNo = 0; zh->reg->sortKeys.buf_used = 0; if (zebraExplain_curDatabase (zh->reg->zei, zh->basenames[0])) @@ -1029,10 +973,7 @@ int buffer_extract_record (ZebraHandle zh, if (matchStr) { rinfo = dict_lookup (zh->reg->matchDict, matchStr); if (rinfo) - { - assert(*rinfo == sizeof(*sysno)); memcpy (sysno, rinfo+1, sizeof(*sysno)); - } } } @@ -1237,8 +1178,10 @@ int explain_extract (void *handle, Record rec, data1_node *n) abort (); } - create_rec_keys_codec(&zh->reg->keys); - + zh->reg->keys.buf_used = 0; + zh->reg->keys.prevAttrUse = -1; + zh->reg->keys.prevAttrSet = -1; + zh->reg->keys.prevSeqNo = 0; zh->reg->sortKeys.buf_used = 0; extractCtrl.init = extract_init; @@ -1289,16 +1232,12 @@ int explain_extract (void *handle, Record rec, data1_node *n) void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, int cmd, struct recKeys *reckeys) { -#if IT_KEY_NEW - void *decode_handle = iscz1_start(); -#else - int seqno = 0; #if SU_SCHEME #else unsigned char attrSet = (unsigned char) -1; unsigned short attrUse = (unsigned short) -1; #endif -#endif + int seqno = 0; int off = 0; int ch = 0; 
ZebraExplainInfo zei = zh->reg->zei; @@ -1320,53 +1259,6 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, zh->reg->key_file_no = 0; } zebraExplain_recordCountIncrement (zei, cmd ? 1 : -1); -#if IT_KEY_NEW - while (off < reckeys->buf_used) - { - const char *src = reckeys->buf + off; - struct it_key key; - char *dst = (char*) &key; - int attrSet, attrUse; - - iscz1_decode(decode_handle, &dst, &src); - assert(key.len < 4 && key.len > 2); - - attrSet = key.mem[0]; - attrUse = key.mem[1]; /* sequence in mem[2] */ - - if (zh->reg->key_buf_used + 1024 > - (zh->reg->ptr_top -zh->reg->ptr_i)*sizeof(char*)) - extract_flushWriteKeys (zh,0); - assert(zh->reg->ptr_i >= 0); - ++(zh->reg->ptr_i); - assert(zh->reg->ptr_i > 0); - (zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] = - (char*)zh->reg->key_buf + zh->reg->key_buf_used; - - ch = zebraExplain_lookupSU (zei, attrSet, attrUse); - if (ch < 0) - ch = zebraExplain_addSU (zei, attrSet, attrUse); - - assert (ch > 0); - zh->reg->key_buf_used += - key_SU_encode (ch,((char*)zh->reg->key_buf) + - zh->reg->key_buf_used); - while (*src) - ((char*)zh->reg->key_buf) [(zh->reg->key_buf_used)++] = *src++; - src++; - ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = '\0'; - ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = cmd; - - key.len = 2; - key.mem[0] = sysno; - key.mem[1] = key.mem[2]; /* sequence .. 
*/ - - memcpy ((char*)zh->reg->key_buf + zh->reg->key_buf_used, - &key, sizeof(key)); - (zh->reg->key_buf_used) += sizeof(key); - off = src - reckeys->buf; - } -#else while (off < reckeys->buf_used) { const char *src = reckeys->buf + off; @@ -1431,11 +1323,7 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, (zh->reg->key_buf_used) += sizeof(key); off = src - reckeys->buf; } -#endif assert (off == reckeys->buf_used); -#if IT_KEY_NEW - iscz1_stop(decode_handle); -#endif } void extract_flushWriteKeys (ZebraHandle zh, int final) @@ -1572,22 +1460,18 @@ void extract_flushWriteKeys (ZebraHandle zh, int final) zh->reg->key_buf_used = 0; } -void extract_add_index_string (RecWord *p, const char *str, int length) +void extract_add_index_string (RecWord *p, const char *string, + int length) { char *dst; - ZebraHandle zh = p->extractCtrl->handle; - struct recKeys *keys = &zh->reg->keys; -#if IT_KEY_NEW - struct it_key key; - const char *src = (char*) &key; -#else unsigned char attrSet; unsigned short attrUse; int lead = 0; int diff = 0; int *pseqno = &p->seqno; + ZebraHandle zh = p->extractCtrl->handle; ZebraExplainInfo zei = zh->reg->zei; -#endif + struct recKeys *keys = &zh->reg->keys; if (keys->buf_used+1024 > keys->buf_max) { @@ -1601,25 +1485,6 @@ void extract_add_index_string (RecWord *p, const char *str, int length) } dst = keys->buf + keys->buf_used; -#if IT_KEY_NEW - key.len = 3; - key.mem[0] = p->attrSet; - key.mem[1] = p->attrUse; - key.mem[2] = p->seqno; - -#if 0 - /* just for debugging .. 
*/ - yaz_log(LOG_LOG, "set=%d use=%d seqno=%d", p->attrSet, p->attrUse, - p->seqno); -#endif - - iscz1_encode(keys->codec_handle, &dst, &src); - - *dst++ = p->reg_type; - memcpy (dst, str, length); - dst += length; - *dst++ = '\0'; -#else /* leader byte is encoded as follows: bit 0 : 1 if attrset is unchanged; 0 if attrset is changed bit 1 : 1 if attruse is unchanged; 0 if attruse is changed @@ -1634,13 +1499,13 @@ void extract_add_index_string (RecWord *p, const char *str, int length) lead |= 2; else keys->prevAttrUse = attrUse; - +#if 1 diff = 1 + *pseqno - keys->prevSeqNo; if (diff >= 1 && diff <= 15) lead |= (diff << 2); else diff = 0; - +#endif keys->prevSeqNo = *pseqno; *dst++ = lead; @@ -1672,7 +1537,7 @@ void extract_add_index_string (RecWord *p, const char *str, int length) } #endif *dst++ = p->reg_type; - memcpy (dst, str, length); + memcpy (dst, string, length); dst += length; *dst++ = '\0'; @@ -1681,11 +1546,10 @@ void extract_add_index_string (RecWord *p, const char *str, int length) memcpy (dst, pseqno, sizeof(*pseqno)); dst += sizeof(*pseqno); } -#endif keys->buf_used = dst - keys->buf; } -static void extract_add_sort_string (RecWord *p, const char *str, +static void extract_add_sort_string (RecWord *p, const char *string, int length) { ZebraHandle zh = p->extractCtrl->handle; @@ -1718,7 +1582,7 @@ static void extract_add_sort_string (RecWord *p, const char *str, off += key_SU_encode(p->attrSet, sk->buf + off); off += key_SU_encode(p->attrUse, sk->buf + off); off += key_SU_encode(length, sk->buf + off); - memcpy (sk->buf + off, str, length); + memcpy (sk->buf + off, string, length); sk->buf_used = off + length; } @@ -1738,7 +1602,7 @@ static void extract_add_incomplete_field (RecWord *p) const char **map = 0; if (remain > 0) - map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0); while (map) { @@ -1750,7 +1614,7 @@ static void extract_add_incomplete_field (RecWord *p) { 
remain = p->length - (b - p->string); if (remain > 0) - map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0); else map = 0; } @@ -1765,7 +1629,7 @@ static void extract_add_incomplete_field (RecWord *p) buf[i++] = *(cp++); remain = p->length - (b - p->string); if (remain > 0) - map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0); else map = 0; } @@ -1782,9 +1646,12 @@ static void extract_add_complete_field (RecWord *p) char buf[IT_MAX_WORD+1]; const char **map = 0; int i = 0, remain = p->length; + int first; /* first position */ + +yaz_log(LOG_DEBUG, "Complete field, w='%s'", p->string); if (remain > 0) - map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain); + map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain, 1); while (remain > 0 && i < IT_MAX_WORD) { @@ -1793,7 +1660,10 @@ static void extract_add_complete_field (RecWord *p) remain = p->length - (b - p->string); if (remain > 0) - map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + { + first = i ? 
0 : 1; + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, first); + } else map = 0; } @@ -1814,13 +1684,16 @@ static void extract_add_complete_field (RecWord *p) { if (i >= IT_MAX_WORD) break; +yaz_log(LOG_DEBUG, "Adding string to index '%d'", *map); while (i < IT_MAX_WORD && *cp) buf[i++] = *(cp++); } remain = p->length - (b - p->string); if (remain > 0) + { map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, - remain); + remain, 0); + } else map = 0; } @@ -1891,13 +1764,8 @@ void encode_key_init (struct encode_info *i) i->prevseq=0; i->prevcmd=-1; i->keylen=0; -#if IT_KEY_NEW - i->encode_handle = iscz1_start(); -#endif } -#if IT_KEY_NEW -#else char *encode_key_int (int d, char *bp) { if (d <= 63) @@ -1922,8 +1790,6 @@ char *encode_key_int (int d, char *bp) } return bp; } -#endif - #define OLDENCODE 1 #ifdef OLDENCODE @@ -1934,19 +1800,11 @@ char *encode_key_int (int d, char *bp) void encode_key_write (char *k, struct encode_info *i, FILE *outf) { struct it_key key; - char *bp = i->buf, *bp0; - const char *src = (char *) &key; + char *bp = i->buf; - /* copy term to output buf */ while ((*bp++ = *k++)) ; - /* and copy & align key so we can mangle */ - memcpy (&key, k+1, sizeof(struct it_key)); /* *k is insert/delete */ -#if IT_KEY_NEW - bp0 = bp++; - iscz1_encode(i->encode_handle, &bp, &src); - *bp0 = (*k * 128) + bp - bp0 - 1; /* length and insert/delete combined */ -#else + memcpy (&key, k+1, sizeof(struct it_key)); bp = encode_key_int ( (key.sysno - i->sysno) * 2 + *k, bp); if (i->sysno != key.sysno) { @@ -1958,7 +1816,6 @@ void encode_key_write (char *k, struct encode_info *i, FILE *outf) bp = encode_key_int (key.seqno - i->seqno, bp); i->seqno = key.seqno; i->cmd = *k; -#endif if (fwrite (i->buf, bp - i->buf, 1, outf) != 1) { logf (LOG_FATAL|LOG_ERRNO, "fwrite"); @@ -1968,9 +1825,6 @@ void encode_key_write (char *k, struct encode_info *i, FILE *outf) void encode_key_flush (struct encode_info *i, FILE *outf) { /* dummy routine */ -#if 
IT_KEY_NEW - iscz1_stop(i->encode_handle); -#endif } #else diff --git a/index/zrpn.c b/index/zrpn.c index 7fefb9b..7f1411f 100644 --- a/index/zrpn.c +++ b/index/zrpn.c @@ -1,4 +1,4 @@ -/* $Id: zrpn.c,v 1.142 2004-08-04 08:35:23 adam Exp $ +/* $Id: zrpn.c,v 1.141.2.1 2004-09-16 14:07:50 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -56,7 +56,7 @@ typedef struct { static const char **rpn_char_map_handler (void *vp, const char **from, int len) { struct rpn_char_map_info *p = (struct rpn_char_map_info *) vp; - const char **out = zebra_maps_input (p->zm, p->reg_type, from, len); + const char **out = zebra_maps_input (p->zm, p->reg_type, from, len, 0); #if 0 if (out && *out) { @@ -164,7 +164,7 @@ struct grep_info { #ifdef TERM_COUNT int *term_no; #endif - ISAMC_P *isam_p_buf; + ISAMS_P *isam_p_buf; int isam_p_size; int isam_p_indx; ZebraHandle zh; @@ -194,12 +194,12 @@ static void add_isam_p (const char *name, const char *info, { if (p->isam_p_indx == p->isam_p_size) { - ISAMC_P *new_isam_p_buf; + ISAMS_P *new_isam_p_buf; #ifdef TERM_COUNT int *new_term_no; #endif p->isam_p_size = 2*p->isam_p_size + 100; - new_isam_p_buf = (ISAMC_P *) xmalloc (sizeof(*new_isam_p_buf) * + new_isam_p_buf = (ISAMS_P *) xmalloc (sizeof(*new_isam_p_buf) * p->isam_p_size); if (p->isam_p_buf) { @@ -253,7 +253,7 @@ static int grep_handle (char *name, const char *info, void *p) } static int term_pre (ZebraMaps zebra_maps, int reg_type, const char **src, - const char *ct1, const char *ct2) + const char *ct1, const char *ct2, int first) { const char *s1, *s0 = *src; const char **map; @@ -266,7 +266,7 @@ static int term_pre (ZebraMaps zebra_maps, int reg_type, const char **src, if (ct2 && strchr (ct2, *s0)) break; s1 = s0; - map = zebra_maps_input (zebra_maps, reg_type, &s1, strlen(s1)); + map = zebra_maps_input (zebra_maps, reg_type, &s1, strlen(s1), first); if (**map != *CHR_SPACE) break; s0 = s1; @@ -290,13 +290,13 @@ static int term_100 (ZebraMaps 
zebra_maps, int reg_type, const char *space_start = 0; const char *space_end = 0; - if (!term_pre (zebra_maps, reg_type, src, NULL, NULL)) + if (!term_pre (zebra_maps, reg_type, src, NULL, NULL, !space_split)) return 0; s0 = *src; while (*s0) { s1 = s0; - map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0)); + map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0); if (space_split) { if (**map == *CHR_SPACE) @@ -348,7 +348,7 @@ static int term_101 (ZebraMaps zebra_maps, int reg_type, int i = 0; int j = 0; - if (!term_pre (zebra_maps, reg_type, src, "#", "#")) + if (!term_pre (zebra_maps, reg_type, src, "#", "#", !space_split)) return 0; s0 = *src; while (*s0) @@ -362,7 +362,7 @@ static int term_101 (ZebraMaps zebra_maps, int reg_type, else { s1 = s0; - map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0)); + map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0); if (space_split && **map == *CHR_SPACE) break; while (s1 < s0) @@ -390,7 +390,7 @@ static int term_103 (ZebraMaps zebra_maps, int reg_type, const char **src, const char *s0, *s1; const char **map; - if (!term_pre (zebra_maps, reg_type, src, "^\\()[].*+?|", "(")) + if (!term_pre (zebra_maps, reg_type, src, "^\\()[].*+?|", "(", !space_split)) return 0; s0 = *src; if (errors && *s0 == '+' && s0[1] && s0[2] == '+' && s0[3] && @@ -411,7 +411,7 @@ static int term_103 (ZebraMaps zebra_maps, int reg_type, const char **src, else { s1 = s0; - map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0)); + map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0); if (**map == *CHR_SPACE) break; while (s1 < s0) @@ -448,7 +448,7 @@ static int term_104 (ZebraMaps zebra_maps, int reg_type, int i = 0; int j = 0; - if (!term_pre (zebra_maps, reg_type, src, "?*#", "?*#")) + if (!term_pre (zebra_maps, reg_type, src, "?*#", "?*#", !space_split)) return 0; s0 = *src; while (*s0) @@ -491,7 +491,7 @@ static int term_104 (ZebraMaps zebra_maps, int reg_type, } { s1 = s0; - map = 
zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0)); + map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0); if (space_split && **map == *CHR_SPACE) break; while (s1 < s0) @@ -519,7 +519,7 @@ static int term_105 (ZebraMaps zebra_maps, int reg_type, int i = 0; int j = 0; - if (!term_pre (zebra_maps, reg_type, src, "*!", "*!")) + if (!term_pre (zebra_maps, reg_type, src, "*!", "*!", !space_split)) return 0; s0 = *src; while (*s0) @@ -537,7 +537,7 @@ static int term_105 (ZebraMaps zebra_maps, int reg_type, } { s1 = s0; - map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0)); + map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0), 0); if (space_split && **map == *CHR_SPACE) break; while (s1 < s0) @@ -1235,7 +1235,7 @@ static int trans_scan_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt, while ((len = (cp_end - cp)) > 0) { - map = zebra_maps_input (zh->reg->zebra_maps, reg_type, &cp, len); + map = zebra_maps_input (zh->reg->zebra_maps, reg_type, &cp, len, 0); if (**map == *CHR_SPACE) space_map = *map; else @@ -1365,7 +1365,7 @@ static RSET rpn_search_APT_phrase (ZebraHandle zh, { char term_dst[IT_MAX_WORD+1]; RSET rset[60], result; - int rset_no = 0; + int rset_no = 0; struct grep_info grep_info; char *termz = normalize_term(zh, zapt, termz_org, stream, reg_type); const char *termp = termz; @@ -1787,7 +1787,6 @@ static RSET rpn_search_APT_local (ZebraHandle zh, Z_AttributesPlusTerm *zapt, RSET result; RSFD rsfd; struct it_key key; - int sys; rset_temp_parms parms; parms.rset_term = rset_term_create (termz, -1, rank_type, @@ -1798,19 +1797,10 @@ static RSET rpn_search_APT_local (ZebraHandle zh, Z_AttributesPlusTerm *zapt, result = rset_create (rset_kind_temp, &parms); rsfd = rset_open (result, RSETF_WRITE); - sys = atoi(termz); - if (sys <= 0) - sys = 1; -#if IT_KEY_NEW - key.mem[0] = sys; - key.mem[1] = 1; - key.len = 2; -#else - key.sysno = sys; + key.sysno = atoi (termz); key.seqno = 1; if (key.sysno <= 0) key.sysno = 1; -#endif 
rset_write (result, rsfd, &key); rset_close (result, rsfd); return result; @@ -2400,7 +2390,7 @@ RSET rpn_search (ZebraHandle zh, NMEM nmem, struct scan_info_entry { char *term; - ISAMC_P isam_p; + ISAMS_P isam_p; }; struct scan_info { @@ -2424,8 +2414,8 @@ static int scan_handle (char *name, const char *info, int pos, void *client) scan_info->list[idx].term = (char *) odr_malloc (scan_info->odr, strlen(name + len_prefix)+1); strcpy (scan_info->list[idx].term, name + len_prefix); - assert (*info == sizeof(ISAMC_P)); - memcpy (&scan_info->list[idx].isam_p, info+1, sizeof(ISAMC_P)); + assert (*info == sizeof(ISAMS_P)); + memcpy (&scan_info->list[idx].isam_p, info+1, sizeof(ISAMS_P)); return 0; } @@ -2475,19 +2465,11 @@ static void count_set (RSET r, int *count) rfd = rset_open (r, RSETF_READ); while (rset_read (r, rfd, &key, &term_index)) { -#if IT_KEY_NEW - if (key.mem[0] != psysno) - { - psysno = key.mem[0]; - (*count)++; - } -#else if (key.sysno != psysno) { psysno = key.sysno; (*count)++; } -#endif kno++; } rset_close (r, rfd); diff --git a/tab/default.idx b/tab/default.idx index 9e2cb81..e146213 100644 --- a/tab/default.idx +++ b/tab/default.idx @@ -1,5 +1,5 @@ # Zebra indexes as referred to from the *.abs-files. -# $Id: default.idx,v 1.10 2004-07-28 09:40:46 adam Exp $ +# $Id: default.idx,v 1.10.2.1 2004-09-16 14:07:50 adam Exp $ # # Traditional word index diff --git a/tab/scan.chr b/tab/scan.chr index 599dd7c..26e0f45 100644 --- a/tab/scan.chr +++ b/tab/scan.chr @@ -1,6 +1,6 @@ # Danish/Swedish character map. # -# $Id: scan.chr,v 1.1 1999-09-07 07:19:21 adam Exp $ +# $Id: scan.chr,v 1.1.6.1 2004-09-16 14:07:50 adam Exp $ # Define the basic value-set. *Beware* of changing this without re-indexing # your databases. 
@@ -32,6 +32,11 @@ map (Ø) map (Å) Å map (Ö) Ö +map (^the ) # +map (^The ) # +map (^a ) # +map (^A ) # + map éÉ e map á a map ó o diff --git a/test/Makefile.am b/test/Makefile.am index f61aaba..4682121 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS=codec api gils malxml config usmarc dmoz xpath sort xelm cddb \ +SUBDIRS=codec api gils malxml config usmarc dmoz xpath sort sort2 xelm cddb \ rusmarc zsh marcxml charmap diff --git a/test/sort2/Makefile.am b/test/sort2/Makefile.am new file mode 100644 index 0000000..473f338 --- /dev/null +++ b/test/sort2/Makefile.am @@ -0,0 +1,10 @@ +# $Id: Makefile.am,v 1.2.2.1 2004-09-16 14:07:51 adam Exp $ + +check_SCRIPTS = test1.sh + +TESTS = $(check_SCRIPTS) + +EXTRA_DIST = zebra.cfg default.idx \ + rec1.xml rec2.xml rec3.xml rec4.xml zebra.cfg my.abs sort.chr \ + $(check_SCRIPTS) + diff --git a/test/sort2/default.idx b/test/sort2/default.idx new file mode 100644 index 0000000..cb378f5 --- /dev/null +++ b/test/sort2/default.idx @@ -0,0 +1,55 @@ +# Zebra indexes as referred to from the *.abs-files. 
+# $Id: default.idx,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $ +# + +# Traditional word index +# Used if completeness is 'incomplete field' (@attr 6=1) and +# structure is word/phrase/word-list/free-form-text/document-text +index w +completeness 0 +position 1 +charmap sort.chr + +# Phrase index +# Used if completeness is 'complete {sub}field' (@attr 6=2, @attr 6=1) +# and structure is word/phrase/word-list/free-form-text/document-text +index p +completeness 1 +charmap sort.chr + +# URX (URL) index +# Used if structure=urx (@attr 4=104) +index u +completeness 0 +charmap urx.chr + +# Numeric index +# Used if structure=numeric (@attr 4=109) +index n +completeness 0 +charmap numeric.chr + +# Null map index (no mapping at all) +# Used if structure=key (@attr 4=3) +index 0 +completeness 0 +position 1 +charmap @ + +# Year +# Used if structure=year (@attr 4=4) +index y +completeness 0 +charmap @ + +# Date +# Used if structure=date (@attr 4=5) +index d +completeness 0 +charmap @ + +# Sort, with prefixes to ignore +sort s +completeness 1 +charmap sort.chr + diff --git a/test/sort2/my.abs b/test/sort2/my.abs new file mode 100644 index 0000000..36dd566 --- /dev/null +++ b/test/sort2/my.abs @@ -0,0 +1,14 @@ +# $Id: my.abs,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $ + +name my +reference WAIS-schema +attset bib1.att +tagset generic.tag +xpath enable + +varset var1.var + +esetname B @ +esetname F @ + +elm title Title !:p,!:w,!:s diff --git a/test/sort2/rec1.xml b/test/sort2/rec1.xml new file mode 100644 index 0000000..6dbf26f --- /dev/null +++ b/test/sort2/rec1.xml @@ -0,0 +1,3 @@ + + first computer + diff --git a/test/sort2/rec2.xml b/test/sort2/rec2.xml new file mode 100644 index 0000000..23bb030 --- /dev/null +++ b/test/sort2/rec2.xml @@ -0,0 +1,3 @@ + + second computer + diff --git a/test/sort2/rec3.xml b/test/sort2/rec3.xml new file mode 100644 index 0000000..245c6e1 --- /dev/null +++ b/test/sort2/rec3.xml @@ -0,0 +1,3 @@ + + A third computer + diff --git a/test/sort2/rec4.xml 
b/test/sort2/rec4.xml new file mode 100644 index 0000000..8ed6c2c --- /dev/null +++ b/test/sort2/rec4.xml @@ -0,0 +1,3 @@ + + the fourth computer + diff --git a/test/sort2/sort.chr b/test/sort2/sort.chr new file mode 100644 index 0000000..c802b61 --- /dev/null +++ b/test/sort2/sort.chr @@ -0,0 +1,34 @@ +# character map that removes some leading prefixes +# +# $Id: sort.chr,v 1.2.2.1 2004-09-16 14:07:51 adam Exp $ + +# Define the basic value-set. *Beware* of changing this without re-indexing +# your databases. + +lowercase {0-9}{a-y}üzæäøöå +uppercase {0-9}{A-Y}ÜZÆÄØÖÅ + +# Breaking characters + +space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]^_`\{|}~ + +# Characters to be considered equivalent for searching purposes. + +# equivalent æä(ae) +# equivalent øö(oe) +# equivalent å(aa) +# equivalent uü + +map (^The\s) @ +map (^the\s) @ +map (^a\s) @ +map (^A\s) @ + +#map éÉ e +#map á a +#map ó o +#map í i + +#map (Aa) (AA) + +#map (aa) a diff --git a/test/sort2/test1.sh b/test/sort2/test1.sh new file mode 100755 index 0000000..604aed0 --- /dev/null +++ b/test/sort2/test1.sh @@ -0,0 +1,34 @@ +#!/bin/sh +# $Id: test1.sh,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $ + +pp=${srcdir:-"."} + +ulimit -c 10000 +LOG=test1.log +rm -f $LOG +rm -fr lock +mkdir lock +rm -fr reg +mkdir reg +rm -fr recs +mkdir recs +cp $pp/rec*.xml recs +../../index/zebraidx -c $pp/zebra.cfg -l $LOG update recs || exit 1 +../../index/zebrasrv -c $pp/zebra.cfg -l $LOG unix:socket & +sleep 1 +test -f lock/zebrasrv.pid || exit 2 +../api/testclient -n4 unix:socket '@or computer @attr 7=1 @attr 1=4 0' >tmp1 + +kill `cat lock/zebrasrv.pid` + +echo 'Result count: 4 +my: + title: first computer +my: + title: the fourth computer +my: + title: second computer +my: + title: A third computer' >tmp2 + +diff tmp1 tmp2 diff --git a/test/sort2/zebra.cfg b/test/sort2/zebra.cfg new file mode 100644 index 0000000..a495694 --- /dev/null +++ b/test/sort2/zebra.cfg @@ -0,0 +1,14 @@ +# Simple Zebra configuration file +# $Id: 
zebra.cfg,v 1.1.2.1 2004-09-16 14:07:51 adam Exp $ +# +# Where the schema files, attribute files, etc are located. +profilePath: ${srcdir:-.}:${srcdir:-.}/../../tab + +# Files that describe the attribute sets supported. +attset: bib1.att +attset: explain.att + +recordtype.xml: grs.sgml +lockdir: lock +register: reg:20M +isam: b diff --git a/util/charmap.c b/util/charmap.c index a4f834c..898f133 100644 --- a/util/charmap.c +++ b/util/charmap.c @@ -1,4 +1,4 @@ -/* $Id: charmap.c,v 1.29.2.1 2004-08-06 10:08:19 adam Exp $ +/* $Id: charmap.c,v 1.29.2.2 2004-09-16 14:07:51 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -40,6 +40,8 @@ typedef unsigned ucs4_t; #define CHR_MAXSTR 1024 #define CHR_MAXEQUIV 32 +const unsigned char CHR_FIELD_BEGIN = '^'; + const char *CHR_UNKNOWN = "\001"; const char *CHR_SPACE = "\002"; const char *CHR_BASE = "\003"; @@ -142,7 +144,7 @@ static chr_t_entry *find_entry(chr_t_entry *t, const char **from, int len) return t->target ? 
t : 0; } -static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len) +static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len, int first) { chr_t_entry *res; @@ -153,35 +155,49 @@ static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len) from++; len++; } - if (*len > 0 && t->children && t->children[(unsigned char) **from]) + if (*len > 0 && t->children) { const char *old_from = *from; int old_len = *len; + + res = 0; + + if (first && t->children[CHR_FIELD_BEGIN]) + { + if ((res = find_entry_x(t->children[CHR_FIELD_BEGIN], from, len, 0)) && res != t->children[CHR_FIELD_BEGIN]) + return res; + else + res = 0; + /* otherwhise there was no match on beginning of field, move on */ + } - (*len)--; - (*from)++; - if ((res = find_entry_x(t->children[(unsigned char) *old_from], - from, len))) - return res; - /* no match */ - *len = old_len; - *from = old_from; + if (!res && t->children[(unsigned char) **from]) + { + (*len)--; + (*from)++; + if ((res = find_entry_x(t->children[(unsigned char) *old_from], + from, len, 0))) + return res; + /* no match */ + *len = old_len; + *from = old_from; + } } /* no children match. use ourselves, if we have a target */ return t->target ? 
t : 0; } -const char **chr_map_input_x(chrmaptab maptab, const char **from, int *len) +const char **chr_map_input_x(chrmaptab maptab, const char **from, int *len, int first) { chr_t_entry *t = maptab->input; chr_t_entry *res; - if (!(res = find_entry_x(t, from, len))) + if (!(res = find_entry_x(t, from, len, first))) abort(); return (const char **) (res->target); } -const char **chr_map_input(chrmaptab maptab, const char **from, int len) +const char **chr_map_input(chrmaptab maptab, const char **from, int len, int first) { chr_t_entry *t = maptab->input; chr_t_entry *res; @@ -189,7 +205,7 @@ const char **chr_map_input(chrmaptab maptab, const char **from, int len) len_tmp[0] = len; len_tmp[1] = -1; - if (!(res = find_entry_x(t, from, len_tmp))) + if (!(res = find_entry_x(t, from, len_tmp, first))) abort(); return (const char **) (res->target); } @@ -259,7 +275,7 @@ ucs4_t zebra_prim_w(ucs4_t **s) ucs4_t i = 0; char fmtstr[8]; - yaz_log (LOG_DEBUG, "prim %.3s", (char *) *s); + yaz_log (LOG_DEBUG, "prim_w %.3s", (char *) *s); if (**s == '\\') { (*s)++; @@ -374,7 +390,7 @@ static void fun_mkstring(const char *s, void *data, int num) chrwork *arg = (chrwork *) data; const char **res, *p = s; - res = chr_map_input(arg->map, &s, strlen(s)); + res = chr_map_input(arg->map, &s, strlen(s), 0); if (*res == (char*) CHR_UNKNOWN) logf(LOG_WARN, "Map: '%s' has no mapping", p); strncat(arg->string, *res, CHR_MAXSTR - strlen(arg->string)); @@ -443,6 +459,7 @@ static int scan_string(char *s_native, char str[1024]; ucs4_t arg[512]; + ucs4_t arg_prim[512]; ucs4_t *s0, *s = arg; ucs4_t c, begin, end; size_t i; @@ -498,11 +515,11 @@ static int scan_string(char *s_native, case '[': s++; abort(); break; case '(': ++s; - s0 = s; - while (*s != ')' || s[-1] == '\\') - s++; - *s = 0; - if (scan_to_utf8 (t_utf8, s0, s - s0, str, sizeof(str)-1)) + s0 = s; i = 0; + while (*s != ')' || s[-1] == '\\') + arg_prim[i++] = zebra_prim_w(&s); + arg_prim[i] = 0; + if (scan_to_utf8 (t_utf8, arg_prim, 
zebra_ucs4_strlen(arg_prim), str, sizeof(str)-1)) return -1; (*fun)(str, data, num ? (*num)++ : 0); s++; diff --git a/util/zebramap.c b/util/zebramap.c index 0d1cf07..c1983e3 100644 --- a/util/zebramap.c +++ b/util/zebramap.c @@ -1,4 +1,4 @@ -/* $Id: zebramap.c,v 1.32 2004-06-16 20:30:47 adam Exp $ +/* $Id: zebramap.c,v 1.32.2.1 2004-09-16 14:07:51 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -291,13 +291,13 @@ chrmaptab zebra_charmap_get (ZebraMaps zms, unsigned reg_id) } const char **zebra_maps_input (ZebraMaps zms, unsigned reg_id, - const char **from, int len) + const char **from, int len, int first) { chrmaptab maptab; maptab = zebra_charmap_get (zms, reg_id); if (maptab) - return chr_map_input(maptab, from, len); + return chr_map_input(maptab, from, len, first); zms->temp_map_str[0] = **from; -- 1.7.10.4