hex translation for .chr files. Added test/charmap test cases.
+
+Added a feature to charmap (.chr) files so that characters may be
+specified in \LXXXX hex notation.
+
+Fixed problem with encoding directive for charmap(.chr) files.
+
--- 1.3.15 2004/01/15
Fix bug. X-Path attribute expressions with spaces in them now works.
dnl Zebra, Index Data Aps, 1995-2004
-dnl $Id: configure.in,v 1.88 2004-01-15 14:22:21 adam Exp $
+dnl $Id: configure.in,v 1.89 2004-03-09 15:12:14 adam Exp $
dnl
AC_INIT(include/zebraver.h)
AM_INIT_AUTOMAKE(idzebra,1.3.15)
test/config/Makefile
perl/Makefile.PL test/xelm/Makefile
test/dmoz/Makefile test/xpath/Makefile test/sort/Makefile test/zsh/Makefile
- test/marcxml/Makefile
+ test/marcxml/Makefile test/charmap/Makefile
examples/Makefile examples/gils/Makefile examples/zthes/Makefile
idzebra.spec
])
-SUBDIRS=api gils malxml config usmarc dmoz xpath sort xelm cddb rusmarc zsh marcxml
+SUBDIRS=api gils malxml config usmarc dmoz xpath sort xelm cddb \
+ rusmarc zsh marcxml charmap
--- /dev/null
+# $Id: Makefile.am,v 1.1 2004-03-09 15:12:15 adam Exp $
+
+check_SCRIPTS = test1.sh test2.sh
+
+TESTS = $(check_SCRIPTS)
+
+EXTRA_DIST = zebra.cfg x.xml default.idx string.utf8.chr \
+ $(check_SCRIPTS)
+
--- /dev/null
+# Zebra indexes as referred to from the *.abs-files.
+# $Id: default.idx,v 1.1 2004-03-09 15:12:15 adam Exp $
+#
+# Traditional word index
+index w
+completeness 0
+position 1
+charmap string.utf8.chr
+
+# Phrase index
+index p
+completeness 1
+charmap string.chr
+
+# URX (URL) index
+index u
+completeness 0
+charmap urx.chr
+
+# Numeric index (integer only)
+index n
+completeness 0
+charmap numeric.chr
+
+# Null map index (no mapping at all)
+index 0
+completeness 0
+position 1
+charmap @
+
+# Sort register (uses the string.chr mapping)
+sort s
+completeness 1
+charmap string.chr
+
+index y
+completeness 0
+charmap @
--- /dev/null
+# $Id: string.utf8.chr,v 1.1 2004-03-09 15:12:15 adam Exp $
+
+# Define the basic value-set. *Beware* of changing this without re-indexing
+# your databases.
+
+# This specifies that _this_ file is in UTF-8.
+encoding utf-8
+
+lowercase {0-9}{a-y}üzæäøöå
+uppercase {0-9}{A-Y}ÜZÆÄØÖÅ
+
+# Breaking characters
+
+space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]^_`\{|}~
+
+# Characters to be considered equivalent for searching purposes.
+
+# equivalent æä(ae)
+# equivalent øö(oe)
+# equivalent å(aa)
+# equivalent uü
+
+# Supplemental mappings
+
+# Latin small letter h with dot below (U+1E25)
+map \L1E25 h
+# Latin capital letter H with dot below (U+1E24)
+map \L1E24 h
--- /dev/null
+#!/bin/sh
+LOG=test1.log # log file handed to every zebraidx/zebrasrv call below via -l
+rm -f $LOG # start from an empty log
+if ../../index/zebraidx -l $LOG -V|grep Expat >/dev/null; then # go on only if the -V output mentions Expat
+ ../../index/zebraidx -l$LOG init
+else
+ exit 0 # Expat not reported: leave with success status without running the test
+fi
+../../index/zebraidx -l$LOG update *.xml # index every .xml file in the current directory
+../../index/zebrasrv -l$LOG unix:socket & # run the server in the background on unix:socket
+sleep 1 # presumably gives the server time to start listening -- TODO confirm 1s is enough
+../api/testclient unix:socket '@term string æ' >tmp1 # client output is captured in tmp1
+echo 'Result count: 1' >tmp2 # expected client output
+kill `cat zebrasrv.pid` || exit 1 # stop the process whose pid is in zebrasrv.pid; status 1 if kill fails
+diff tmp1 tmp2 || exit 2 # status 2 if actual and expected output differ
+rm -f tmp1 tmp2 # reached only on success, so the temp files survive a failed run
--- /dev/null
+#!/bin/sh
+LOG=test2.log # log file handed to every zebraidx/zebrasrv call below via -l
+rm -f $LOG # start from an empty log
+if ../../index/zebraidx -l $LOG -V|grep Expat >/dev/null; then # go on only if the -V output mentions Expat
+ ../../index/zebraidx -l$LOG init
+else
+ exit 0 # Expat not reported: leave with success status without running the test
+fi
+../../index/zebraidx -l$LOG update *.xml # index every .xml file in the current directory
+../../index/zebrasrv -l$LOG unix:socket & # run the server in the background on unix:socket
+sleep 1 # presumably gives the server time to start listening -- TODO confirm 1s is enough
+# search for UNICODE 1E25 - letter h with dot below
+../api/testclient unix:socket '@term string ḥ' >tmp1 # client output is captured in tmp1
+echo 'Result count: 1' >tmp2 # expected client output
+kill `cat zebrasrv.pid` || exit 1 # stop the process whose pid is in zebrasrv.pid; status 1 if kill fails
+diff tmp1 tmp2 || exit 2 # status 2 if actual and expected output differ
+rm -f tmp1 tmp2 # reached only on success, so the temp files survive a failed run
--- /dev/null
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<gils>
+ <Title>
+ h æ
+ <Acronym>
+ UUCCSEIS
+ </Acronym>
+ </Title>
+</gils>
--- /dev/null
+# Simple Zebra configuration file
+# $Id: zebra.cfg,v 1.1 2004-03-09 15:12:15 adam Exp $
+#
+# Where the schema files, attribute files, etc are located.
+profilePath: .:../../tab
+
+# Files that describe the attribute sets supported.
+attset: bib1.att
+attset: gils.att
+attset: explain.att
+
+recordtype: grs.xml
+
+isam: b
-/* $Id: charmap.c,v 1.27 2003-01-13 10:53:16 oleg Exp $
- Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
+/* $Id: charmap.c,v 1.28 2004-03-09 15:12:15 adam Exp $
+ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
Index Data Aps
This file is part of the Zebra server.
return c;
}
+static int zebra_ucs4_strlen(ucs4_t *s) /* length of a 0-terminated ucs4_t string, counted in elements */
+{
+ int i = 0; /* elements counted so far */
+ while (*s++) /* walk forward until the terminating 0 element is read */
+ i++;
+ return i; /* the terminator itself is not included in the count */
+}
+
ucs4_t zebra_prim_w(ucs4_t **s)
{
ucs4_t c;
case 't': c = '\t'; (*s)++; break;
case 's': c = ' '; (*s)++; break;
case 'x':
- fmtstr[0] = (*s)[0];
- fmtstr[1] = (*s)[1];
- fmtstr[2] = (*s)[2];
- fmtstr[3] = 0;
- sscanf(fmtstr, "x%2x", &i);
- c = i;
- *s += 3; break;
+ if (zebra_ucs4_strlen(*s) >= 3)
+ {
+ fmtstr[0] = (*s)[1];
+ fmtstr[1] = (*s)[2];
+ fmtstr[2] = 0;
+ sscanf(fmtstr, "%x", &i);
+ c = i;
+ *s += 3;
+ }
+ break;
case '0':
case '1':
case '2':
case '7':
case '8':
case '9':
- fmtstr[0] = (*s)[0];
- fmtstr[1] = (*s)[1];
- fmtstr[2] = (*s)[2];
- fmtstr[3] = 0;
- sscanf(fmtstr, "%3o", &i);
- c = i;
- *s += 3;
+ if (zebra_ucs4_strlen(*s) >= 3)
+ {
+ fmtstr[0] = (*s)[0];
+ fmtstr[1] = (*s)[1];
+ fmtstr[2] = (*s)[2];
+ fmtstr[3] = 0;
+ sscanf(fmtstr, "%o", &i);
+ c = i;
+ *s += 3;
+ }
break;
+ case 'L':
+ if (zebra_ucs4_strlen(*s) >= 5)
+ {
+ fmtstr[0] = (*s)[1];
+ fmtstr[1] = (*s)[2];
+ fmtstr[2] = (*s)[3];
+ fmtstr[3] = (*s)[4];
+ fmtstr[4] = 0;
+ sscanf(fmtstr, "%x", &i);
+ c = i;
+ *s += 5;
+ }
+ break;
default:
(*s)++;
}
ret = yaz_iconv (t, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
if (ret == (size_t) (-1))
{
+ yaz_log(LOG_LOG, "from: %2X %2X %2X %2X",
+ from[0], from[1], from[2], from[3]);
yaz_log (LOG_WARN|LOG_ERRNO, "bad unicode sequence");
return -1;
}
* zebra need to comment next part of code.
*/
- /*
+ /* Original code */
+#if 1
if (t_unicode != 0)
yaz_iconv_close (t_unicode);
t_unicode = yaz_iconv_open (ucs4_native, argv[1]);
- */
-
+#endif
/*
* Fix me. It is additional staff for conversion of characters from local encoding
* of *.chr file to UTF-8 (internal encoding).
* NOTE: The derective encoding must be first directive in *.chr file.
*/
+ /* For whatever reason Oleg enabled this.. */
+#if 0
if (t_utf8 != 0)
yaz_iconv_close(t_utf8);
t_utf8 = yaz_iconv_open ("UTF-8", argv[1]);
+#endif
}
else
{