From 0ffb25b0bf78f53399267664b71206c88980c4da Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Mon, 22 Oct 2007 12:21:38 +0000 Subject: [PATCH] Added ICU chain component - which used to be part of Pazpar2. --- NEWS | 2 + configure.ac | 8 +- include/yaz/Makefile.am | 3 +- include/yaz/icu_I18N.h | 284 +++++++++++ src/Makefile.am | 9 +- src/icu_I18N.c | 1219 ++++++++++++++++++++++++++++++++++++++++++++++ test/Makefile.am | 7 +- test/tst_icu_I18N.c | 693 ++++++++++++++++++++++++++ util/Makefile.am | 10 +- util/yaz-icu-example.xml | 30 ++ util/yaz-icu.c | 556 +++++++++++++++++++++ yaz-config.in | 17 +- 12 files changed, 2826 insertions(+), 12 deletions(-) create mode 100644 include/yaz/icu_I18N.h create mode 100644 src/icu_I18N.c create mode 100644 test/tst_icu_I18N.c create mode 100644 util/yaz-icu-example.xml create mode 100644 util/yaz-icu.c diff --git a/NEWS b/NEWS index 1bae7fd..50a5065 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,5 @@ +Added ICU chain component - which used to be part of Pazpar2. + Added HTTP tunnel facility for COMSTACK, bug #1752. This is a facility that allows a Web proxy, such as squid, to tunnel Z39.50 traffic. This facility is "transparent" to must applications diff --git a/configure.ac b/configure.ac index 8388f03..1170ef3 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ dnl YAZ Toolkit, Index Data 1995-2007 dnl See the file LICENSE for details. 
-dnl $Id: configure.ac,v 1.94 2007-09-26 19:14:04 adam Exp $ +dnl $Id: configure.ac,v 1.95 2007-10-22 12:21:38 adam Exp $ AC_PREREQ(2.59) AC_INIT([yaz],[3.0.15],[yaz-help@indexdata.dk]) AC_CONFIG_SRCDIR(configure.ac) @@ -313,9 +313,13 @@ fi dnl AC_SUBST(YAZ_CONFIG_CFLAGS) dnl -HAVETHREADS=0 +dnl +AC_CHECK_ICU(3.6, [], + AC_MSG_WARN([For ICU internationalizing support please install libicu36-dev + or similar])) dnl dnl ------ GNU threads +HAVETHREADS=0 AC_ARG_ENABLE(pth, [ --enable-pth enable GNU threads],[enable_pth=$enableval],[enable_pth=no]) AC_SUBST(LIBPTH) if test "$enable_pth" = "yes"; then diff --git a/include/yaz/Makefile.am b/include/yaz/Makefile.am index 4dfa8ce..55de6c6 100644 --- a/include/yaz/Makefile.am +++ b/include/yaz/Makefile.am @@ -1,4 +1,4 @@ -## $Id: Makefile.am,v 1.46 2007-04-30 08:29:07 adam Exp $ +## $Id: Makefile.am,v 1.47 2007-10-22 12:21:39 adam Exp $ pkginclude_HEADERS= backend.h ccl.h ccl_xml.h cql.h comstack.h \ diagbib1.h diagsrw.h diagsru_update.h sortspec.h log.h logrpn.h marcdisp.h \ @@ -9,6 +9,7 @@ pkginclude_HEADERS= backend.h ccl.h ccl_xml.h cql.h comstack.h \ yaz-ccl.h yaz-iconv.h yaz-util.h yaz-version.h yconfig.h proto.h \ xmlquery.h libxml2_error.h xmltypes.h snprintf.h query-charset.h \ mutex.h oid_db.h oid_util.h oid_std.h tokenizer.h copy_types.h \ + icu_I18N.h \ \ ill.h ill-core.h item-req.h oclc-ill-req-ext.h z-accdes1.h z-accform1.h \ z-acckrb1.h z-core.h z-date.h z-diag1.h z-espec1.h z-estask.h z-exp.h \ diff --git a/include/yaz/icu_I18N.h b/include/yaz/icu_I18N.h new file mode 100644 index 0000000..efcd033 --- /dev/null +++ b/include/yaz/icu_I18N.h @@ -0,0 +1,284 @@ +/* $Id: icu_I18N.h,v 1.1 2007-10-22 12:21:39 adam Exp $ + Copyright (c) 2006-2007, Index Data. + + This file is part of Pazpar2. 
+ + Pazpar2 is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2, or (at your option) any later + version. + + Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with Pazpar2; see the file LICENSE. If not, write to the + Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. +*/ + +#ifndef ICU_I18NL_H +#define ICU_I18NL_H + +#include + +#include +#include + +#include /* Basic ICU data types */ +#include /* char names */ + +//#include +#include +//#include /* C Converter API */ +//#include /* some more string fcns*/ +//#include +#include +//#include +#include + + + +// declared structs and functions + +int icu_check_status (UErrorCode status); + +struct icu_buf_utf16 +{ + UChar * utf16; + int32_t utf16_len; + int32_t utf16_cap; +}; + +struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity); +struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16, + size_t capacity); +struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16); +void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16); + + + +struct icu_buf_utf8 +{ + uint8_t * utf8; + int32_t utf8_len; + int32_t utf8_cap; +}; + +struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity); +struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8, + size_t capacity); +void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8); + + +UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16, + struct icu_buf_utf8 * src8, + UErrorCode * status); + +UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16, + const char * src8cstr, 
+ UErrorCode * status); + + +UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8, + struct icu_buf_utf16 * src16, + UErrorCode * status); + +struct icu_casemap +{ + char locale[16]; + char action; +}; + +struct icu_casemap * icu_casemap_create(const char *locale, char action, + UErrorCode *status); + +void icu_casemap_destroy(struct icu_casemap * casemap); + +int icu_casemap_casemap(struct icu_casemap * casemap, + struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16, + UErrorCode *status); + +int icu_utf16_casemap(struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16, + const char *locale, char action, + UErrorCode *status); + +UErrorCode icu_sortkey8_from_utf16(UCollator *coll, + struct icu_buf_utf8 * dest8, + struct icu_buf_utf16 * src16, + UErrorCode * status); + +struct icu_tokenizer +{ + char locale[16]; + char action; + UBreakIterator* bi; + struct icu_buf_utf16 * buf16; + int32_t token_count; + int32_t token_id; + int32_t token_start; + int32_t token_end; + // keep always invariant + // 0 <= token_start + // <= token_end + // <= buf16->utf16_len + // and invariant + // 0 <= token_id <= token_count +}; + +struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, + UErrorCode *status); + +void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer); + +int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, + struct icu_buf_utf16 * src16, UErrorCode *status); + +int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, + struct icu_buf_utf16 * tkn16, + UErrorCode *status); + +int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer); +int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer); +int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer); +int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer); +int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer); + + + +struct icu_normalizer +{ + char action; + struct icu_buf_utf16 * rules16; + UParseError 
parse_error[256]; + UTransliterator * trans; +}; + +struct icu_normalizer * icu_normalizer_create(const char *rules, char action, + UErrorCode *status); + + +void icu_normalizer_destroy(struct icu_normalizer * normalizer); + +int icu_normalizer_normalize(struct icu_normalizer * normalizer, + struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16, + UErrorCode *status); + + +#if 0 +struct icu_token +{ + int32_t token_id; + uint8_t * display8; + uint8_t * norm8; + uint8_t * sort8; +} +#endif + + +enum icu_chain_step_type { + ICU_chain_step_type_none, // + ICU_chain_step_type_display, // convert to utf8 display format + ICU_chain_step_type_index, // convert to utf8 index format + ICU_chain_step_type_sortkey, // convert to utf8 sortkey format + ICU_chain_step_type_casemap, // apply utf16 charmap + ICU_chain_step_type_normalize, // apply utf16 normalization + ICU_chain_step_type_tokenize // apply utf16 tokenization +}; + + + +struct icu_chain_step +{ + // type and action object + enum icu_chain_step_type type; + union { + struct icu_casemap * casemap; + struct icu_normalizer * normalizer; + struct icu_tokenizer * tokenizer; + } u; + // temprary post-action utf16 buffer + struct icu_buf_utf16 * buf16; + struct icu_chain_step * previous; + int more_tokens; + int need_new_token; +}; + + +struct icu_chain; + +struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain, + enum icu_chain_step_type type, + const uint8_t * rule, + struct icu_buf_utf16 * buf16, + UErrorCode *status); + + +void icu_chain_step_destroy(struct icu_chain_step * step); + + +struct icu_chain +{ + uint8_t identifier[128]; + uint8_t locale[16]; + + // number of tokens returned so far + int32_t token_count; + + // utf8 output buffers + struct icu_buf_utf8 * display8; + struct icu_buf_utf8 * norm8; + struct icu_buf_utf8 * sort8; + + // utf16 source buffer + struct icu_buf_utf16 * src16; + + // linked list of chain steps + struct icu_chain_step * steps; +}; + +struct icu_chain * 
icu_chain_create(const uint8_t * identifier, + const uint8_t * locale); + +void icu_chain_destroy(struct icu_chain * chain); + +struct icu_chain * icu_chain_xml_config(xmlNode *xml_node, + UErrorCode * status); + + +struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain, + enum icu_chain_step_type type, + const uint8_t * rule, + UErrorCode *status); + + +int icu_chain_step_next_token(struct icu_chain * chain, + struct icu_chain_step * step, + UErrorCode *status); + +int icu_chain_assign_cstr(struct icu_chain * chain, + const char * src8cstr, + UErrorCode *status); + +int icu_chain_next_token(struct icu_chain * chain, + UErrorCode *status); + +int icu_chain_get_token_count(struct icu_chain * chain); + +const char * icu_chain_get_display(struct icu_chain * chain); + +const char * icu_chain_get_norm(struct icu_chain * chain); + +const char * icu_chain_get_sort(struct icu_chain * chain); + + + + + +#endif // ICU_I18NL_H diff --git a/src/Makefile.am b/src/Makefile.am index b4f6cf8..c4fa09d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,10 +1,10 @@ ## This file is part of the YAZ toolkit. ## Copyright (C) 1995-2007, Index Data, All rights reserved. 
-## $Id: Makefile.am,v 1.68 2007-05-07 13:18:32 adam Exp $ +## $Id: Makefile.am,v 1.69 2007-10-22 12:21:39 adam Exp $ YAZ_VERSION_INFO=3:0:0 -lib_LTLIBRARIES = libyaz.la libyaz_server.la +lib_LTLIBRARIES = libyaz.la libyaz_server.la libyaz_icu.la dist-hook: test -f $(srcdir)/cql.c || exit 1 @@ -28,7 +28,7 @@ YAZCOMP_I = $(YAZCOMP) -d $(srcdir)/ill.tcl -i yaz -I$(top_srcdir)/include AM_CPPFLAGS=-I$(top_srcdir)/include $(XML2_CFLAGS) $(SSL_CFLAGS) libyaz_server_la_CFLAGS = $(PTHREAD_CFLAGS) - +libyaz_icu_la_CPPFLAGS = $(AM_CPPFLAGS) $(ICU_CPPFLAGS) AM_YFLAGS=-p cql_ @@ -103,6 +103,9 @@ libyaz_server_la_SOURCES = statserv.c seshigh.c eventl.c \ libyaz_server_la_LDFLAGS=-version-info $(YAZ_VERSION_INFO) +libyaz_icu_la_SOURCES = icu_I18N.c +libyaz_icu_la_LDFLAGS=-version-info $(YAZ_VERSION_INFO) + # Rules for Z39.50 V3 z-accdes1.c \ z-accform1.c \ diff --git a/src/icu_I18N.c b/src/icu_I18N.c new file mode 100644 index 0000000..a085caa --- /dev/null +++ b/src/icu_I18N.c @@ -0,0 +1,1219 @@ +/* $Id: icu_I18N.c,v 1.1 2007-10-22 12:21:39 adam Exp $ + Copyright (c) 2006-2007, Index Data. + + This file is part of Pazpar2. + + Pazpar2 is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2, or (at your option) any later + version. + + Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with Pazpar2; see the file LICENSE. If not, write to the + Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. 
+*/ + +#if HAVE_CONFIG_H +#include "cconfig.h" +#endif + +#define USE_TIMING 0 +#if USE_TIMING +#include +#endif + + +#ifdef HAVE_ICU +#include + +#include + +#include +#include +#include + +#include /* some more string fcns*/ +#include /* char names */ + + +//#include +//#include /* Basic ICU data types */ +#include +//#include /* C Converter API */ +//#include +//#include +/* #include */ + + + + +int icu_check_status (UErrorCode status) +{ + if(U_FAILURE(status)){ + yaz_log(YLOG_WARN, + "ICU: %d %s\n", status, u_errorName(status)); + return 0; + } + return 1; + +} + + + +struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity) +{ + struct icu_buf_utf16 * buf16 + = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16)); + + buf16->utf16 = 0; + buf16->utf16_len = 0; + buf16->utf16_cap = 0; + + if (capacity > 0){ + buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity); + buf16->utf16[0] = (UChar) 0; + buf16->utf16_cap = capacity; + } + return buf16; +}; + +struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16, + size_t capacity) +{ + if (buf16){ + if (capacity > 0){ + if (0 == buf16->utf16) + buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity); + else + buf16->utf16 + = (UChar *) realloc(buf16->utf16, sizeof(UChar) * capacity); + buf16->utf16[0] = (UChar) 0; + buf16->utf16_len = 0; + buf16->utf16_cap = capacity; + } + else { + if (buf16->utf16) + free(buf16->utf16); + buf16->utf16 = 0; + buf16->utf16_len = 0; + buf16->utf16_cap = 0; + } + } + + return buf16; +}; + + +struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16) +{ + if(!dest16 || !src16 + || dest16 == src16) + return 0; + + if (dest16->utf16_cap < src16->utf16_len) + icu_buf_utf16_resize(dest16, src16->utf16_len * 2); + + u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len); + dest16->utf16_len = src16->utf16_len; + + return dest16; +}; + + +void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16) +{ + if (buf16){ 
+ if (buf16->utf16) + free(buf16->utf16); + free(buf16); + } +}; + + + + + + +struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity) +{ + struct icu_buf_utf8 * buf8 + = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8)); + + buf8->utf8 = 0; + buf8->utf8_len = 0; + buf8->utf8_cap = 0; + + if (capacity > 0){ + buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity); + buf8->utf8[0] = (uint8_t) 0; + buf8->utf8_cap = capacity; + } + return buf8; +}; + + + +struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8, + size_t capacity) +{ + if (buf8){ + if (capacity > 0){ + if (0 == buf8->utf8) + buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity); + else + buf8->utf8 + = (uint8_t *) realloc(buf8->utf8, + sizeof(uint8_t) * capacity); + buf8->utf8_cap = capacity; + } + else { + if (buf8->utf8) + free(buf8->utf8); + buf8->utf8 = 0; + buf8->utf8_len = 0; + buf8->utf8_cap = 0; + } + } + + return buf8; +}; + + +struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8, + struct icu_buf_utf8 * src8) +{ + if(!dest8 || !src8 + || dest8 == src8) + return 0; + + + if (dest8->utf8_cap < src8->utf8_len) + icu_buf_utf8_resize(dest8, src8->utf8_len * 2); + + strncpy((char*) dest8->utf8, (char*) src8->utf8, src8->utf8_len); + + return dest8; +}; + + +const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8) +{ + if (!src8 || src8->utf8_len == 0) + return ""; + if (src8->utf8_len == src8->utf8_cap) + src8 = icu_buf_utf8_resize(src8, src8->utf8_len * 2 + 1); + src8->utf8[src8->utf8_len] = '\0'; + return (const char *) src8->utf8; +} + + +void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8) +{ + if (buf8){ + if (buf8->utf8) + free(buf8->utf8); + free(buf8); + } +}; + + + +UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16, + struct icu_buf_utf8 * src8, + UErrorCode * status) +{ + int32_t utf16_len = 0; + + u_strFromUTF8(dest16->utf16, dest16->utf16_cap, + &utf16_len, + (const char *) src8->utf8, src8->utf8_len, status); + + // 
check for buffer overflow, resize and retry + if (*status == U_BUFFER_OVERFLOW_ERROR + //|| dest16->utf16_len > dest16->utf16_cap + ){ + icu_buf_utf16_resize(dest16, utf16_len * 2); + *status = U_ZERO_ERROR; + u_strFromUTF8(dest16->utf16, dest16->utf16_cap, + &utf16_len, + (const char *) src8->utf8, src8->utf8_len, status); + } + + //if (*status != U_BUFFER_OVERFLOW_ERROR + if (U_SUCCESS(*status) + && utf16_len <= dest16->utf16_cap) + dest16->utf16_len = utf16_len; + else { + dest16->utf16[0] = (UChar) 0; + dest16->utf16_len = 0; + } + + return *status; +}; + + + +UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16, + const char * src8cstr, + UErrorCode * status) +{ + size_t src8cstr_len = 0; + int32_t utf16_len = 0; + + src8cstr_len = strlen(src8cstr); + + u_strFromUTF8(dest16->utf16, dest16->utf16_cap, + &utf16_len, + src8cstr, src8cstr_len, status); + + // check for buffer overflow, resize and retry + if (*status == U_BUFFER_OVERFLOW_ERROR + //|| dest16->utf16_len > dest16->utf16_cap + ){ + icu_buf_utf16_resize(dest16, utf16_len * 2); + *status = U_ZERO_ERROR; + u_strFromUTF8(dest16->utf16, dest16->utf16_cap, + &utf16_len, + src8cstr, src8cstr_len, status); + } + + // if (*status != U_BUFFER_OVERFLOW_ERROR + if (U_SUCCESS(*status) + && utf16_len <= dest16->utf16_cap) + dest16->utf16_len = utf16_len; + else { + dest16->utf16[0] = (UChar) 0; + dest16->utf16_len = 0; + } + + return *status; +}; + + + + +UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8, + struct icu_buf_utf16 * src16, + UErrorCode * status) +{ + int32_t utf8_len = 0; + + u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap, + &utf8_len, + src16->utf16, src16->utf16_len, status); + + // check for buffer overflow, resize and retry + if (*status == U_BUFFER_OVERFLOW_ERROR + //|| dest8->utf8_len > dest8->utf8_cap + ){ + icu_buf_utf8_resize(dest8, utf8_len * 2); + *status = U_ZERO_ERROR; + u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap, + &utf8_len, + src16->utf16, src16->utf16_len, 
status); + + } + + //if (*status != U_BUFFER_OVERFLOW_ERROR + if (U_SUCCESS(*status) + && utf8_len <= dest8->utf8_cap) + dest8->utf8_len = utf8_len; + else { + dest8->utf8[0] = (uint8_t) 0; + dest8->utf8_len = 0; + } + + return *status; +}; + + + +struct icu_casemap * icu_casemap_create(const char *locale, char action, + UErrorCode *status) +{ + struct icu_casemap * casemap + = (struct icu_casemap *) malloc(sizeof(struct icu_casemap)); + strcpy(casemap->locale, locale); + casemap->action = action; + + switch(casemap->action) { + case 'l': + break; + case 'u': + break; + case 't': + break; + case 'f': + break; + default: + icu_casemap_destroy(casemap); + return 0; + } + + return casemap; +}; + +void icu_casemap_destroy(struct icu_casemap * casemap) +{ + if (casemap) + free(casemap); +}; + + +int icu_casemap_casemap(struct icu_casemap * casemap, + struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16, + UErrorCode *status) +{ + if(!casemap) + return 0; + + return icu_utf16_casemap(dest16, src16, + casemap->locale, casemap->action, status); +}; + + +int icu_utf16_casemap(struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16, + const char *locale, char action, + UErrorCode *status) +{ + int32_t dest16_len = 0; + + switch(action) { + case 'l': + dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + locale, status); + break; + case 'u': + dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + locale, status); + break; + case 't': + dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + 0, locale, status); + break; + case 'f': + dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + U_FOLD_CASE_DEFAULT, status); + break; + + default: + return U_UNSUPPORTED_ERROR; + break; + } + + // check for buffer overflow, resize and retry + if (*status == U_BUFFER_OVERFLOW_ERROR + && dest16 != src16 // do 
not resize if in-place conversion + //|| dest16_len > dest16->utf16_cap + ){ + icu_buf_utf16_resize(dest16, dest16_len * 2); + *status = U_ZERO_ERROR; + + + switch(action) { + case 'l': + dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + locale, status); + break; + case 'u': + dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + locale, status); + break; + case 't': + dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + 0, locale, status); + break; + case 'f': + dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + U_FOLD_CASE_DEFAULT, status); + break; + + default: + return U_UNSUPPORTED_ERROR; + break; + } + } + + if (U_SUCCESS(*status) + && dest16_len <= dest16->utf16_cap) + dest16->utf16_len = dest16_len; + else { + dest16->utf16[0] = (UChar) 0; + dest16->utf16_len = 0; + } + + return *status; +}; + + + +UErrorCode icu_sortkey8_from_utf16(UCollator *coll, + struct icu_buf_utf8 * dest8, + struct icu_buf_utf16 * src16, + UErrorCode * status) +{ + + int32_t sortkey_len = 0; + + sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len, + dest8->utf8, dest8->utf8_cap); + + // check for buffer overflow, resize and retry + if (sortkey_len > dest8->utf8_cap) { + icu_buf_utf8_resize(dest8, sortkey_len * 2); + sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len, + dest8->utf8, dest8->utf8_cap); + } + + if (U_SUCCESS(*status) + && sortkey_len > 0) + dest8->utf8_len = sortkey_len; + else { + dest8->utf8[0] = (UChar) 0; + dest8->utf8_len = 0; + } + + return sortkey_len; +}; + + + +struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, + UErrorCode *status) +{ + struct icu_tokenizer * tokenizer + = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer)); + + strcpy(tokenizer->locale, locale); + tokenizer->action = action; + tokenizer->bi = 0; + 
tokenizer->buf16 = 0; + tokenizer->token_count = 0; + tokenizer->token_id = 0; + tokenizer->token_start = 0; + tokenizer->token_end = 0; + + + switch(tokenizer->action) { + case 'l': + tokenizer->bi + = ubrk_open(UBRK_LINE, tokenizer->locale, + 0, 0, status); + break; + case 's': + tokenizer->bi + = ubrk_open(UBRK_SENTENCE, tokenizer->locale, + 0, 0, status); + break; + case 'w': + tokenizer->bi + = ubrk_open(UBRK_WORD, tokenizer->locale, + 0, 0, status); + break; + case 'c': + tokenizer->bi + = ubrk_open(UBRK_CHARACTER, tokenizer->locale, + 0, 0, status); + break; + case 't': + tokenizer->bi + = ubrk_open(UBRK_TITLE, tokenizer->locale, + 0, 0, status); + break; + default: + *status = U_UNSUPPORTED_ERROR; + return 0; + break; + } + + // ICU error stuff is a very funny business + if (U_SUCCESS(*status)) + return tokenizer; + + // freeing if failed + icu_tokenizer_destroy(tokenizer); + return 0; +}; + +void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer) +{ + if (tokenizer) { + if (tokenizer->bi) + ubrk_close(tokenizer->bi); + free(tokenizer); + } +}; + +int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, + struct icu_buf_utf16 * src16, + UErrorCode *status) +{ + if (!tokenizer || !tokenizer->bi || !src16) + return 0; + + + tokenizer->buf16 = src16; + tokenizer->token_count = 0; + tokenizer->token_id = 0; + tokenizer->token_start = 0; + tokenizer->token_end = 0; + + ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status); + + + if (U_FAILURE(*status)) + return 0; + + return 1; +}; + +int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, + struct icu_buf_utf16 * tkn16, + UErrorCode *status) +{ + int32_t tkn_start = 0; + int32_t tkn_end = 0; + int32_t tkn_len = 0; + + + if (!tokenizer || !tokenizer->bi + || !tokenizer->buf16 || !tokenizer->buf16->utf16_len) + return 0; + + // never change tokenizer->buf16 and keep always invariant + // 0 <= tokenizer->token_start + // <= tokenizer->token_end + // <= tokenizer->buf16->utf16_len + 
// returns length of token + + if (0 == tokenizer->token_end) // first call + tkn_start = ubrk_first(tokenizer->bi); + else //successive calls + tkn_start = tokenizer->token_end; + + // get next position + tkn_end = ubrk_next(tokenizer->bi); + + // repairing invariant at end of ubrk, which is UBRK_DONE = -1 + if (UBRK_DONE == tkn_end) + tkn_end = tokenizer->buf16->utf16_len; + + // copy out if everything is well + if(U_FAILURE(*status)) + return 0; + + // everything OK, now update internal state + tkn_len = tkn_end - tkn_start; + + if (0 < tkn_len){ + tokenizer->token_count++; + tokenizer->token_id++; + } else { + tokenizer->token_id = 0; + } + tokenizer->token_start = tkn_start; + tokenizer->token_end = tkn_end; + + + // copying into token buffer if it exists + if (tkn16){ + if (tkn16->utf16_cap < tkn_len) + icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2); + + u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start], + tkn_len); + + tkn16->utf16_len = tkn_len; + } + + return tkn_len; +} + + +int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer) +{ + return tokenizer->token_id; +}; + +int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer) +{ + return tokenizer->token_start; +}; + +int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer) +{ + return tokenizer->token_end; +}; + +int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer) +{ + return (tokenizer->token_end - tokenizer->token_start); +}; + +int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer) +{ + return tokenizer->token_count; +}; + + + +struct icu_normalizer * icu_normalizer_create(const char *rules, char action, + UErrorCode *status) +{ + + struct icu_normalizer * normalizer + = (struct icu_normalizer *) malloc(sizeof(struct icu_normalizer)); + + normalizer->action = action; + normalizer->trans = 0; + normalizer->rules16 = icu_buf_utf16_create(0); + icu_utf16_from_utf8_cstr(normalizer->rules16, rules, status); + + 
switch(normalizer->action) { + case 'f': + normalizer->trans + = utrans_openU(normalizer->rules16->utf16, + normalizer->rules16->utf16_len, + UTRANS_FORWARD, + 0, 0, + normalizer->parse_error, status); + // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans); + break; + case 'r': + normalizer->trans + = utrans_openU(normalizer->rules16->utf16, + normalizer->rules16->utf16_len, + UTRANS_REVERSE , + 0, 0, + normalizer->parse_error, status); + // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans); + break; + default: + *status = U_UNSUPPORTED_ERROR; + return 0; + break; + } + + if (U_SUCCESS(*status)) + return normalizer; + + // freeing if failed + icu_normalizer_destroy(normalizer); + return 0; +}; + + +void icu_normalizer_destroy(struct icu_normalizer * normalizer){ + if (normalizer) { + if (normalizer->rules16) + icu_buf_utf16_destroy(normalizer->rules16); + if (normalizer->trans) + { + // yaz_log(YLOG_LOG, "utrans_close %p", normalizer->trans); + utrans_close(normalizer->trans); + } + free(normalizer); + } +}; + + + +int icu_normalizer_normalize(struct icu_normalizer * normalizer, + struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16, + UErrorCode *status) +{ + if (!normalizer || !normalizer->trans || !src16 || !dest16) + return 0; + + if (!icu_buf_utf16_copy(dest16, src16)) + return 0; + + utrans_transUChars (normalizer->trans, + dest16->utf16, &(dest16->utf16_len), + dest16->utf16_cap, + 0, &(src16->utf16_len), status); + + if (U_FAILURE(*status)){ + dest16->utf16[0] = (UChar) 0; + dest16->utf16_len = 0; + } + + return dest16->utf16_len; +} + + + + +struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain, + enum icu_chain_step_type type, + const uint8_t * rule, + struct icu_buf_utf16 * buf16, + UErrorCode *status) +{ + struct icu_chain_step * step = 0; + + if(!chain || !type || !rule) + return 0; + + step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step)); + + step->type = type; + + step->buf16 = buf16; + + // 
create auxilary objects + switch(step->type) { + case ICU_chain_step_type_display: + break; + case ICU_chain_step_type_index: + break; + case ICU_chain_step_type_sortkey: + break; + case ICU_chain_step_type_casemap: + step->u.casemap = icu_casemap_create((char *) chain->locale, + (char) rule[0], status); + break; + case ICU_chain_step_type_normalize: + step->u.normalizer = icu_normalizer_create((char *) rule, 'f', status); + break; + case ICU_chain_step_type_tokenize: + step->u.tokenizer = icu_tokenizer_create((char *) chain->locale, + (char) rule[0], status); + break; + default: + break; + } + + return step; +}; + + +void icu_chain_step_destroy(struct icu_chain_step * step){ + + if (!step) + return; + + icu_chain_step_destroy(step->previous); + + switch(step->type) { + case ICU_chain_step_type_display: + break; + case ICU_chain_step_type_index: + break; + case ICU_chain_step_type_sortkey: + break; + case ICU_chain_step_type_casemap: + icu_casemap_destroy(step->u.casemap); + icu_buf_utf16_destroy(step->buf16); + break; + case ICU_chain_step_type_normalize: + icu_normalizer_destroy(step->u.normalizer); + icu_buf_utf16_destroy(step->buf16); + break; + case ICU_chain_step_type_tokenize: + icu_tokenizer_destroy(step->u.tokenizer); + icu_buf_utf16_destroy(step->buf16); + break; + default: + break; + } + free(step); +}; + + + +struct icu_chain * icu_chain_create(const uint8_t * identifier, + const uint8_t * locale) +{ + + struct icu_chain * chain + = (struct icu_chain *) malloc(sizeof(struct icu_chain)); + + strncpy((char *) chain->identifier, (const char *) identifier, 128); + chain->identifier[128 - 1] = '\0'; + strncpy((char *) chain->locale, (const char *) locale, 16); + chain->locale[16 - 1] = '\0'; + + chain->token_count = 0; + + chain->display8 = icu_buf_utf8_create(0); + chain->norm8 = icu_buf_utf8_create(0); + chain->sort8 = icu_buf_utf8_create(0); + + chain->src16 = icu_buf_utf16_create(0); + + chain->steps = 0; + + return chain; +}; + + +void 
icu_chain_destroy(struct icu_chain * chain)
+{
+    /* release the UTF-8 output buffers, the UTF-16 input buffer and
+       whatever is reachable from chain->steps, then the chain itself */
+    if (chain){
+        icu_buf_utf8_destroy(chain->display8);
+        icu_buf_utf8_destroy(chain->norm8);
+        icu_buf_utf8_destroy(chain->sort8);
+
+        icu_buf_utf16_destroy(chain->src16);
+
+        icu_chain_step_destroy(chain->steps);
+        free(chain);
+    }
+}
+
+
+
+/*
+ * Build a chain from an XML element named "icu_chain".
+ *
+ * The element must carry non-empty "id" and "locale" attributes. Every
+ * child element is turned into one step, in document order:
+ *   casemap, normalize, tokenize  - use the child's "rule" attribute
+ *   display, index, sortkey       - are inserted with an empty rule
+ * Any other child element name leaves the step unset and is treated as
+ * an error.
+ *
+ * Returns the new chain, or 0 when the node is not an "icu_chain"
+ * element, an attribute is missing/empty, the chain cannot be created,
+ * a step cannot be inserted, or *status reports a failure. On failure
+ * nothing allocated here is left behind.
+ */
+struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
+                                        UErrorCode * status)
+{
+    xmlNode *node = 0;
+    struct icu_chain * chain = 0;
+    xmlChar *xml_id = 0;
+    xmlChar *xml_locale = 0;
+
+    if (!xml_node
+        || xml_node->type != XML_ELEMENT_NODE
+        || strcmp((const char *) xml_node->name, "icu_chain"))
+        return 0;
+
+    xml_id = xmlGetProp(xml_node, (xmlChar *) "id");
+    xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
+
+    /* both attributes are mandatory and must be non-empty */
+    if (xml_id && *xml_id && xml_locale && *xml_locale)
+        chain = icu_chain_create((const uint8_t *) xml_id,
+                                 (const uint8_t *) xml_locale);
+
+    /* the attribute values are allocated copies: release them on every
+       path, including the "attribute missing" one */
+    if (xml_id)
+        xmlFree(xml_id);
+    if (xml_locale)
+        xmlFree(xml_locale);
+
+    if (!chain)
+        return 0;
+
+    for (node = xml_node->children; node; node = node->next)
+    {
+        xmlChar *xml_rule = 0;
+        struct icu_chain_step * step = 0;
+        const char *name = 0;
+
+        if (node->type != XML_ELEMENT_NODE)
+            continue;
+
+        name = (const char *) node->name;
+        xml_rule = xmlGetProp(node, (xmlChar *) "rule");
+
+        /* a missing "rule" attribute yields a null rule, which
+           icu_chain_insert_step() rejects by returning 0 */
+        if (!strcmp(name, "casemap")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
+                                         (const uint8_t *) xml_rule, status);
+        }
+        else if (!strcmp(name, "normalize")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
+                                         (const uint8_t *) xml_rule, status);
+        }
+        else if (!strcmp(name, "tokenize")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
+                                         (const uint8_t *) xml_rule, status);
+        }
+        else if (!strcmp(name, "display")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
+                                         (const uint8_t *) "", status);
+        }
+        else if (!strcmp(name, "index")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_index,
+                                         (const uint8_t *) "", status);
+        }
+        else if (!strcmp(name, "sortkey")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey,
+                                         (const uint8_t *) "", status);
+        }
+
+        if (xml_rule)
+            xmlFree(xml_rule);
+
+        if (!step || U_FAILURE(*status)){
+            icu_chain_destroy(chain);
+            return 0;
+        }
+    }
+
+    return chain;
+}
+
+
+
+/*
+ * Create a step of the given type and make it the new head of
+ * chain->steps; the former head becomes step->previous.
+ *
+ * The step reads from the buffer of the former head step when there is
+ * one, otherwise from chain->src16. display/index/sortkey steps reuse
+ * that source buffer as their own buffer; casemap/normalize/tokenize
+ * steps get a freshly created UTF-16 buffer.
+ *
+ * Returns the new step, or 0 when an argument is null/zero, no source
+ * buffer is available, or the step could not be created.
+ */
+struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
+                                              enum icu_chain_step_type type,
+                                              const uint8_t * rule,
+                                              UErrorCode *status)
+{
+    struct icu_chain_step * step = 0;
+    struct icu_buf_utf16 * src16 = 0;
+    struct icu_buf_utf16 * buf16 = 0;
+
+    if (!chain || !type || !rule)
+        return 0;
+
+    /* assign utf16 src buffers as needed */
+    if (chain->steps && chain->steps->buf16)
+        src16 = chain->steps->buf16;
+    else if (chain->src16)
+        src16 = chain->src16;
+    else
+        return 0;
+
+    /* create utf16 destination buffers as needed, or share the source */
+    switch(type) {
+    case ICU_chain_step_type_display:
+    case ICU_chain_step_type_index:
+    case ICU_chain_step_type_sortkey:
+        buf16 = src16;
+        break;
+    case ICU_chain_step_type_casemap:
+    case ICU_chain_step_type_normalize:
+    case ICU_chain_step_type_tokenize:
+        buf16 = icu_buf_utf16_create(0);
+        break;
+    default:
+        break;
+    }
+
+    /* create actual chain step with this buffer */
+    step = icu_chain_step_create(chain, type, rule, buf16, status);
+
+    /* never link (or dereference) a step that was not created.
+       NOTE(review): whether icu_chain_step_create() releases buf16 on
+       failure is not visible here, so buf16 is deliberately left alone */
+    if (!step)
+        return 0;
+
+    step->previous = chain->steps;
+    chain->steps = step;
+
+    return step;
+}
+
+
+/*
+ * Produce the next token for `step`, pulling new input from
+ * step->previous (recursively) when required.
+ *
+ * Output goes to step->buf16 for casemap/normalize/tokenize steps and to
+ * chain->display8 / norm8 / sort8 for display/index/sortkey steps.
+ *
+ * Returns 1 when a token was produced, 0 when there is nothing more to
+ * process, the step type is unknown, or *status reports a failure.
+ */
+int icu_chain_step_next_token(struct icu_chain * chain,
+                              struct icu_chain_step * step,
+                              UErrorCode *status)
+{
+    struct icu_buf_utf16 * src16 = 0;
+
+    if (!chain || !chain->src16 || !step || !step->more_tokens)
+        return 0;
+
+    // assign utf16 src buffers as needed, advance in previous steps
+    // tokens, and set the stop condition
+    if (step->previous){
+        src16 = step->previous->buf16;
+        if (step->need_new_token)
+            step->more_tokens
+                = icu_chain_step_next_token(chain, step->previous, status);
+    }
+    else { // first step can only work once on chain->src16 input buffer
+        src16 = chain->src16;
+        step->more_tokens = 1;
+    }
+
+    // stop if nothing to process
+    // i.e new token source was not properly assigned
+    // (zero-length source tokens are not filtered out here)
+    if (!step->more_tokens || !src16)
+        return 0;
+
+    // perform the work, eventually put this steps output in
+    // step->buf16 or the chains UTF8 output buffers
+    switch(step->type) {
+    case ICU_chain_step_type_display:
+        icu_utf16_to_utf8(chain->display8, src16, status);
+        break;
+    case ICU_chain_step_type_index:
+        icu_utf16_to_utf8(chain->norm8, src16, status);
+        break;
+    case ICU_chain_step_type_sortkey:
+        icu_utf16_to_utf8(chain->sort8, src16, status);
+        break;
+    case ICU_chain_step_type_casemap:
+        icu_casemap_casemap(step->u.casemap,
+                            step->buf16, src16, status);
+        break;
+    case ICU_chain_step_type_normalize:
+        icu_normalizer_normalize(step->u.normalizer,
+                                 step->buf16, src16, status);
+        break;
+    case ICU_chain_step_type_tokenize:
+        // attach to new src16 token only first time during splitting
+        if (step->need_new_token){
+            icu_tokenizer_attach(step->u.tokenizer, src16, status);
+            step->need_new_token = 0;
+        }
+        // splitting one src16 token into multiple buf16 tokens
+        step->more_tokens
+            = icu_tokenizer_next_token(step->u.tokenizer,
+                                       step->buf16, status);
+        // make sure to get new previous token if this one had been used up
+        if (step->previous && !step->more_tokens){
+            if (icu_chain_step_next_token(chain, step->previous, status)){
+                icu_tokenizer_attach(step->u.tokenizer, src16, status);
+                step->need_new_token = 0;
+                step->more_tokens
+                    = icu_tokenizer_next_token(step->u.tokenizer,
+                                               step->buf16, status);
+            }
+        }
+        if (0 == step->more_tokens)
+            return 0;
+        break;
+    default:
+        return 0;
+        break;
+    }
+
+    // stop further token processing if last step and
+    // new tokens are needed from previous (non-existing) step
+    if (!step->previous && step->need_new_token)
+        step->more_tokens = 0;
+
+    if (U_FAILURE(*status))
+        return 0;
+
+    return 1;
+}
+
+
+
+/*
+ * Load a new UTF-8 C string into the chain: resets the token counter,
+ * re-arms every step (more_tokens = need_new_token = 1) and converts the
+ * string into chain->src16.
+ *
+ * Returns 1 on success, 0 on null arguments or conversion failure.
+ */
+int icu_chain_assign_cstr(struct icu_chain * chain,
+                          const char * src8cstr,
+                          UErrorCode *status)
+{
+    struct icu_chain_step * stp = 0;
+
+    if (!chain || !src8cstr)
+        return 0;
+
+    // clear token count
+    chain->token_count = 0;
+
+    // clear all steps stop states
+    for (stp = chain->steps; stp; stp = stp->previous){
+        stp->more_tokens = 1;
+        stp->need_new_token = 1;
+    }
+
+    // finally convert UTF8 to UTF16 string
+    icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status);
+
+    if (U_FAILURE(*status))
+        return 0;
+
+    return 1;
+}
+
+
+
+/*
+ * Advance the chain by one token, starting from the head step.
+ *
+ * Returns the running token count (>= 1) when a token was produced and
+ * 0 when the input is exhausted or the chain has no steps.
+ */
+int icu_chain_next_token(struct icu_chain * chain,
+                         UErrorCode *status)
+{
+    if (!chain || !chain->steps)
+        return 0;
+
+    if (!icu_chain_step_next_token(chain, chain->steps, status))
+        return 0;
+
+    chain->token_count++;
+    return chain->token_count;
+}
+
+/* Number of tokens produced since the last icu_chain_assign_cstr();
+   0 for a null chain. */
+int icu_chain_get_token_count(struct icu_chain * chain)
+{
+    if (!chain)
+        return 0;
+
+    return chain->token_count;
+}
+
+
+
+/* Current display token as a C string; 0 for a null chain or when the
+   chain has no display buffer. */
+const char * icu_chain_get_display(struct icu_chain * chain)
+{
+    if (chain && chain->display8)
+        return icu_buf_utf8_to_cstr(chain->display8);
+
+    return 0;
+}
+
+/* Current index (normalized) token as a C string; 0 for a null chain or
+   when the chain has no norm buffer. */
+const char * icu_chain_get_norm(struct icu_chain * chain)
+{
+    if (chain && chain->norm8)
+        return icu_buf_utf8_to_cstr(chain->norm8);
+
+    return 0;
+}
+
+/* Current sort token as a C string; 0 for a null chain or when the
+   chain has no sort buffer. */
+const char * icu_chain_get_sort(struct icu_chain * chain)
+{
+    if (chain && chain->sort8)
+        return icu_buf_utf8_to_cstr(chain->sort8);
+
+    return 0;
+}
+
+
+
+
+#endif // HAVE_ICU
+
+
+
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+
* End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ diff --git a/test/Makefile.am b/test/Makefile.am index 652bf0f..131cfcd 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1,11 +1,11 @@ ## Copyright (C) 1995-2007, Index Data ApS ## All rights reserved. -## $Id: Makefile.am,v 1.38 2007-10-15 20:45:46 adam Exp $ +## $Id: Makefile.am,v 1.39 2007-10-22 12:21:39 adam Exp $ check_PROGRAMS = tsticonv tstnmem tstmatchstr tstwrbuf tstodr tstccl tstlog \ tstsoap1 tstsoap2 tstodrstack tstlogthread tstxmlquery tstpquery \ tst_comstack tst_filepath tst_record_conv tst_retrieval tst_tpath \ - tst_timing tst_query_charset tst_oid + tst_timing tst_query_charset tst_oid tst_icu_I18N check_SCRIPTS = tstcql.sh tstmarc.sh tstmarccol.sh TESTS = $(check_PROGRAMS) $(check_SCRIPTS) @@ -32,6 +32,7 @@ YAZCOMP = ../util/yaz-asncomp YAZCOMPLINE = $(YAZCOMP) -d z.tcl -i yaz -I../include $(YCFLAGS) AM_CPPFLAGS=-I$(top_srcdir)/include $(XML2_CFLAGS) +tst_icu_I18N_CPPFLAGS=$(AM_CPPFLAGS) $(ICU_CPPFLAGS) dist-hook: mkdir -p $(distdir)/cql @@ -43,6 +44,7 @@ tstodrcodec.c tstodrcodec.h: tstodr.asn $(YAZCOMP) cd $(srcdir); $(YAZCOMP) tstodr.asn LDADD = ../src/libyaz.la +tst_icu_I18N_LDADD = ../src/libyaz_icu.la $(LDADD) $(ICU_LIBS) CONFIG_CLEAN_FILES=*.log @@ -67,3 +69,4 @@ tst_retrieval_SOURCES = tst_retrieval.c tst_tpath_SOURCES = tst_tpath.c tst_timing_SOURCES = tst_timing.c tst_query_charset_SOURCES = tst_query_charset.c +tst_icu_I18N_SOURCES = tst_icu_I18N.c diff --git a/test/tst_icu_I18N.c b/test/tst_icu_I18N.c new file mode 100644 index 0000000..768e387 --- /dev/null +++ b/test/tst_icu_I18N.c @@ -0,0 +1,693 @@ +/* $Id: tst_icu_I18N.c,v 1.1 2007-10-22 12:21:39 adam Exp $ + Copyright (c) 2006-2007, Index Data. + + This file is part of Pazpar2. + + Pazpar2 is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2, or (at your option) any later + version. 
+ + Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with Pazpar2; see the file LICENSE. If not, write to the + Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. +*/ + +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + + +#if HAVE_CONFIG_H +#include "cconfig.h" +#endif + +#define USE_TIMING 0 +#if USE_TIMING +#include +#endif + +#include + + + +#ifdef HAVE_ICU +#include + +#include +#include + +//#include +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + + +#define MAX_KEY_SIZE 256 +struct icu_termmap +{ + uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated + char disp_term[MAX_KEY_SIZE]; // standard C utf-8 string +}; + + + +int icu_termmap_cmp(const void *vp1, const void *vp2) +{ + struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1; + struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2; + + int cmp = 0; + + cmp = strcmp((const char *)itmp1->sort_key, + (const char *)itmp2->sort_key); + return cmp; +}; + + + + +int test_icu_casemap(const char * locale, char action, + const char * src8cstr, const char * chk8cstr) +{ + int success = 0; + UErrorCode status = U_ZERO_ERROR; + + struct icu_buf_utf8 * src8 = icu_buf_utf8_create(0); + struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0); + struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0); + struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0); + + + int src8cstr_len = strlen(src8cstr); + int chk8cstr_len = strlen(chk8cstr); + + // converting to UTF16 + icu_utf16_from_utf8_cstr(src16, src8cstr, &status); + + // perform case mapping + icu_utf16_casemap(dest16, src16, locale, action, &status); + + // converting to UTF8 + icu_utf16_to_utf8(dest8, dest16, &status); + 
+ + + // determine success + if (dest8->utf8 + && (dest8->utf8_len == strlen(chk8cstr)) + && !strcmp(chk8cstr, (const char *) dest8->utf8)) + success = 1; + else + success = 0; + + // report failures + if (!success){ + printf("\nERROR\n"); + printf("original string: '%s' (%d)\n", src8cstr, src8cstr_len); + printf("icu_casemap '%s:%c' '%s' (%d)\n", + locale, action, dest8->utf8, dest8->utf8_len); + printf("expected string: '%s' (%d)\n", chk8cstr, chk8cstr_len); + } + + // clean the buffers + icu_buf_utf8_destroy(src8); + icu_buf_utf8_destroy(dest8); + icu_buf_utf16_destroy(src16); + icu_buf_utf16_destroy(dest16); + + + return success; +} + + + +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +void test_icu_I18N_casemap(int argc, char **argv) +{ + + // Locale 'en' + + // sucessful tests + YAZ_CHECK(test_icu_casemap("en", 'l', + "A ReD fOx hunTS sQUirriLs", + "a red fox hunts squirrils")); + + YAZ_CHECK(test_icu_casemap("en", 'u', + "A ReD fOx hunTS sQUirriLs", + "A RED FOX HUNTS SQUIRRILS")); + + YAZ_CHECK(test_icu_casemap("en", 'f', + "A ReD fOx hunTS sQUirriLs", + "a red fox hunts squirrils")); + + YAZ_CHECK(test_icu_casemap("en", 't', + "A ReD fOx hunTS sQUirriLs", + "A Red Fox Hunts Squirrils")); + + + // Locale 'da' + + // sucess expected + YAZ_CHECK(test_icu_casemap("da", 'l', + "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN", + "åh æble, øs fløde i åen efter blåbærgrøden")); + + YAZ_CHECK(test_icu_casemap("da", 'u', + "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN", + "ÅH ÆBLE, ØS FLØDE I ÅEN EFTER BLÅBÆRGRØDEN")); + + YAZ_CHECK(test_icu_casemap("da", 'f', + "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN", + "åh æble, øs fløde i åen efter blåbærgrøden")); + + YAZ_CHECK(test_icu_casemap("da", 't', + "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN", + "Åh Æble, Øs Fløde I Åen Efter Blåbærgrøden")); + + // Locale 'de' + + // sucess expected + YAZ_CHECK(test_icu_casemap("de", 'l', + "zWÖlf ärgerliche Würste rollen ÜBer die StRAße", + "zwölf ärgerliche würste 
rollen über die straße"));
+
+    YAZ_CHECK(test_icu_casemap("de", 'u',
+                               "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
+                               "ZWÖLF ÄRGERLICHE WÜRSTE ROLLEN ÜBER DIE STRASSE"));
+
+    YAZ_CHECK(test_icu_casemap("de", 'f',
+                               "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
+                               "zwölf ärgerliche würste rollen über die strasse"));
+
+    YAZ_CHECK(test_icu_casemap("de", 't',
+                               "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
+                               "Zwölf Ärgerliche Würste Rollen Über Die Straße"));
+
+}
+
+
+// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+
+/*
+ * Sort src_list with the collator of `locale` and compare the outcome
+ * with chk_list.
+ *
+ * For every input term a struct icu_termmap is filled with the display
+ * term and its sort key; the entries are ordered with qsort() through
+ * icu_termmap_cmp(), which compares the sort keys with strcmp(). The
+ * entries are therefore zero-initialised and a key is only accepted when
+ * it leaves room for a terminating '\0'.
+ *
+ * Returns 1 when the sorted terms equal chk_list element by element,
+ * 0 otherwise (including collator/allocation failure and terms or keys
+ * that do not fit into MAX_KEY_SIZE). A mismatch is reported on stdout.
+ */
+int test_icu_sortmap(const char * locale, int src_list_len,
+                     const char ** src_list, const char ** chk_list)
+{
+    int success = 1;
+    int i;
+
+    UErrorCode status = U_ZERO_ERROR;
+
+    struct icu_buf_utf8 * buf8 = icu_buf_utf8_create(0);
+    struct icu_buf_utf16 * buf16 = icu_buf_utf16_create(0);
+    struct icu_termmap ** list = 0;
+
+    UCollator *coll = ucol_open(locale, &status);
+    icu_check_status(status);
+
+    /* leave through the common cleanup so the buffers are not leaked */
+    if (U_FAILURE(status)){
+        success = 0;
+        goto cleanup;
+    }
+
+    /* heap array instead of a variable length array: the length comes
+       from the caller and may be zero or negative */
+    if (src_list_len > 0){
+        list = calloc((size_t) src_list_len, sizeof *list);
+        if (!list){
+            success = 0;
+            goto cleanup;
+        }
+    }
+
+    // assigning display terms and sort keys using buf 8 and buf16
+    for (i = 0; i < src_list_len; i++)
+    {
+        size_t key_len = 0;
+
+        list[i] = calloc(1, sizeof *list[i]);
+        if (!list[i]
+            || strlen(src_list[i]) >= sizeof list[i]->disp_term){
+            success = 0;
+            goto cleanup;
+        }
+
+        // copy display term (length checked above)
+        strcpy(list[i]->disp_term, src_list[i]);
+
+        // transforming to UTF16
+        icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
+        icu_check_status(status);
+
+        // computing sortkeys
+        icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
+        icu_check_status(status);
+
+        // assigning sortkeys; a negative length turns into a huge size_t
+        // and is rejected together with keys that are too long
+        key_len = (size_t) buf8->utf8_len;
+        if (key_len >= sizeof list[i]->sort_key){
+            success = 0;
+            goto cleanup;
+        }
+        if (key_len > 0)
+            memcpy(list[i]->sort_key, buf8->utf8, key_len);
+    }
+
+    // do the sorting
+    if (src_list_len > 1)
+        qsort(list, (size_t) src_list_len,
+              sizeof(struct icu_termmap *), icu_termmap_cmp);
+
+    // checking correct sorting
+    for (i = 0; i < src_list_len; i++){
+        if (0 != strcmp(list[i]->disp_term, chk_list[i])){
+            success = 0;
+        }
+    }
+
+    if (!success){
+        printf("\nERROR\n");
+        printf("Input str: '%s' : ", locale);
+        for (i = 0; i < src_list_len; i++) {
+            /* the unsorted input, not the sorted list */
+            printf(" '%s'", src_list[i]);
+        }
+        printf("\n");
+        printf("ICU sort: '%s' : ", locale);
+        for (i = 0; i < src_list_len; i++) {
+            printf(" '%s'", list[i]->disp_term);
+        }
+        printf("\n");
+        printf("Expected: '%s' : ", locale);
+        for (i = 0; i < src_list_len; i++) {
+            printf(" '%s'", chk_list[i]);
+        }
+        printf("\n");
+    }
+
+ cleanup:
+    if (list){
+        for (i = 0; i < src_list_len; i++)
+            free(list[i]);
+        free(list);
+    }
+
+    if (coll)
+        ucol_close(coll);
+
+    icu_buf_utf8_destroy(buf8);
+    icu_buf_utf16_destroy(buf16);
+
+    return success;
+}
+
+
+// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+
+/* Collation checks for the English, Danish and German locales; each
+   *_src list is expected to sort into the matching *_cck list. */
+void test_icu_I18N_sortmap(int argc, char **argv)
+{
+
+    // successful tests
+    size_t en_1_len = 6;
+    const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
+    const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"};
+    YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
+    YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
+    YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
+    YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
+    YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
+
+    // successful tests
+    size_t da_1_len = 6;
+    const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
+    const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
+    YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
+    YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
+
+    // successful tests
+    size_t de_1_len = 9;
+    const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
+    const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
+    YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
+    YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
+    YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
+
+}
+
+ +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + + + + +int test_icu_normalizer(const char * rules8cstr, + const char * src8cstr, + const char * chk8cstr) +{ + int success = 0; + + UErrorCode status = U_ZERO_ERROR; + + struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0); + struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0); + struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0); + struct icu_normalizer * normalizer + = icu_normalizer_create(rules8cstr, 'f', &status); + icu_check_status(status); + + icu_utf16_from_utf8_cstr(src16, src8cstr, &status); + icu_check_status(status); + + icu_normalizer_normalize(normalizer, dest16, src16, &status); + icu_check_status(status); + + icu_utf16_to_utf8(dest8, dest16, &status); + icu_check_status(status); + + + if(!strcmp((const char *) dest8->utf8, + (const char *) chk8cstr)) + success = 1; + else { + success = 0; + printf("Normalization\n"); + printf("Rules: '%s'\n", rules8cstr); + printf("Input: '%s'\n", src8cstr); + printf("Normalized: '%s'\n", dest8->utf8); + printf("Expected: '%s'\n", chk8cstr); + } + + + icu_normalizer_destroy(normalizer); + icu_buf_utf16_destroy(src16); + icu_buf_utf16_destroy(dest16); + icu_buf_utf8_destroy(dest8); + + return success; +}; + + +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +void test_icu_I18N_normalizer(int argc, char **argv) +{ + + YAZ_CHECK(test_icu_normalizer("[:Punctuation:] Any-Remove", + "Don't shoot!", + "Dont shoot")); + + YAZ_CHECK(test_icu_normalizer("[:Control:] Any-Remove", + "Don't\n shoot!", + "Don't shoot!")); + + YAZ_CHECK(test_icu_normalizer("[:Decimal_Number:] Any-Remove", + "This is 4 you!", + "This is you!")); + + YAZ_CHECK(test_icu_normalizer("Lower; [:^Letter:] Remove", + "Don't shoot!", + "dontshoot")); + + YAZ_CHECK(test_icu_normalizer("[:^Number:] Remove", + "Monday 15th of April", + "15")); + + YAZ_CHECK(test_icu_normalizer("Lower;" + "[[:WhiteSpace:][:Punctuation:]] Remove", + " word4you? 
", + "word4you")); + + + YAZ_CHECK(test_icu_normalizer("NFD; [:Nonspacing Mark:] Remove; NFC", + "à côté de l'alcôve ovoïde", + "a cote de l'alcove ovoide")); + +} + + +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +int test_icu_tokenizer(const char * locale, char action, + const char * src8cstr, int count) +{ + int success = 1; + + UErrorCode status = U_ZERO_ERROR; + struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0); + struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0); + struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0); + + //printf("Input: '%s'\n", src8cstr); + + // transforming to UTF16 + icu_utf16_from_utf8_cstr(src16, src8cstr, &status); + icu_check_status(status); + + // set up tokenizer + struct icu_tokenizer * tokenizer + = icu_tokenizer_create(locale, action, &status); + icu_check_status(status); + YAZ_CHECK(tokenizer); + + // attach text buffer to tokenizer + icu_tokenizer_attach(tokenizer, src16, &status); + icu_check_status(status); + YAZ_CHECK(tokenizer->bi); + + // perform work on tokens + //printf("Tokens: "); + while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){ + icu_check_status(status); + + // converting to UTF8 + icu_utf16_to_utf8(tkn8, tkn16, &status); + + //printf("token %d %d %d %d '%s'\n", + // + // icu_tokenizer_token_start(tokenizer), + // icu_tokenizer_token_end(tokenizer), + // icu_tokenizer_token_length(tokenizer), + // tkn8->utf8); + } + + if (count != icu_tokenizer_token_count(tokenizer)){ + success = 0; + printf("\nTokenizer '%s:%c' Error: \n", locale, action); + printf("Input: '%s'\n", src8cstr); + printf("Tokens: %d", icu_tokenizer_token_count(tokenizer)); + printf(", expected: %d\n", count); + } + + icu_tokenizer_destroy(tokenizer); + icu_buf_utf16_destroy(src16); + icu_buf_utf16_destroy(tkn16); + icu_buf_utf8_destroy(tkn8); + + return success; +} + + +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +void test_icu_I18N_tokenizer(int argc, char **argv) +{ + + + const char * 
en_str + = "O Romeo, Romeo! wherefore art thou Romeo?"; + + YAZ_CHECK(test_icu_tokenizer("en", 's', en_str, 2)); + YAZ_CHECK(test_icu_tokenizer("en", 'l', en_str, 7)); + YAZ_CHECK(test_icu_tokenizer("en", 'w', en_str, 16)); + YAZ_CHECK(test_icu_tokenizer("en", 'c', en_str, 41)); + + + + const char * da_str + = "Blåbærtærte. Denne kage stammer fra Finland. " + "Den er med blåbær, men alle sommerens forskellige bær kan bruges."; + + YAZ_CHECK(test_icu_tokenizer("da", 's', da_str, 3)); + YAZ_CHECK(test_icu_tokenizer("dar", 'l', da_str, 17)); + YAZ_CHECK(test_icu_tokenizer("da", 'w', da_str, 37)); + YAZ_CHECK(test_icu_tokenizer("da", 'c', da_str, 110)); + +} + + +void test_icu_I18N_chain(int argc, char **argv) +{ + const char * en_str + = "O Romeo, Romeo! wherefore art thou\t Romeo?"; + + printf("ICU chain:\ninput: '%s'\n", en_str); + + UErrorCode status = U_ZERO_ERROR; + //struct icu_chain_step * step = 0; + struct icu_chain * chain = 0; + + + const char * xml_str = "" + "" + "" + "" + "" + "" + "" + "" + ""; + + + xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str)); + xmlNode *xml_node = xmlDocGetRootElement(doc); + YAZ_CHECK(xml_node); + + + chain = icu_chain_xml_config(xml_node, &status); + +#if 0 + chain = icu_chain_create((uint8_t *) "en:word", (uint8_t *) "en"); + step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, + (const uint8_t *) "[:Control:] Any-Remove", + &status); + step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, + (const uint8_t *) "s", + &status); + step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, + (const uint8_t *) "l", + &status); + step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, + (const uint8_t *) + "[[:WhiteSpace:][:Punctuation:]] Any-Remove", + &status); + step = icu_chain_insert_step(chain, ICU_chain_step_type_display, + (const uint8_t *)"", + &status); +/* step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, */ +/* (const uint8_t *) "Lower", */ +/* &status); 
*/ + step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap, + (const uint8_t *) "l", + &status); + step = icu_chain_insert_step(chain, ICU_chain_step_type_index, + (const uint8_t *)"", + &status); +/* step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey, */ +/* (const uint8_t *)"", */ +/* &status); */ + +#endif + + xmlFreeDoc(doc); + YAZ_CHECK(chain); + + YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status)); + + while (icu_chain_next_token(chain, &status)){ + printf("%d '%s' '%s'\n", + icu_chain_get_token_count(chain), + icu_chain_get_norm(chain), + icu_chain_get_display(chain)); + } + + YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 7); + + + YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status)); + + while (icu_chain_next_token(chain, &status)){ + printf("%d '%s' '%s'\n", + icu_chain_get_token_count(chain), + icu_chain_get_norm(chain), + icu_chain_get_display(chain)); + } + + + YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 3); + + icu_chain_destroy(chain); +} + + +void test_bug_1140(void) +{ + const char * en_str + = "O Romeo, Romeo! wherefore art thou\t Romeo?"; + + printf("ICU chain:\ninput: '%s'\n", en_str); + + UErrorCode status = U_ZERO_ERROR; + //struct icu_chain_step * step = 0; + struct icu_chain * chain = 0; + + const char * xml_str = "" + + /* if the first rule is normalize instead. Then it works */ +#if 0 + "" +#endif + "" + "" + "" + "" + "" + "" + ""; + + + xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str)); + xmlNode *xml_node = xmlDocGetRootElement(doc); + YAZ_CHECK(xml_node); + + chain = icu_chain_xml_config(xml_node, &status); + + xmlFreeDoc(doc); + YAZ_CHECK(chain); + + YAZ_CHECK(icu_chain_assign_cstr( + chain, "O Romeo, Romeo! 
wherefore art thou\t Romeo?", + &status)); + + while (icu_chain_next_token(chain, &status)) + ; + + YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 7); + + YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status)); + + while (icu_chain_next_token(chain, &status)){ + printf("%d '%s' '%s'\n", + icu_chain_get_token_count(chain), + icu_chain_get_norm(chain), + icu_chain_get_display(chain)); + } + + /* we expect 'what' 'is' 'this', i.e. 3 tokens */ + YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 3); + + icu_chain_destroy(chain); +} + +#endif // HAVE_ICU + +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +int main(int argc, char **argv) +{ + + YAZ_CHECK_INIT(argc, argv); + YAZ_CHECK_LOG(); + +#ifdef HAVE_ICU + + //test_icu_I18N_casemap_failures(argc, argv); + test_icu_I18N_casemap(argc, argv); + test_icu_I18N_sortmap(argc, argv); + test_icu_I18N_normalizer(argc, argv); + test_icu_I18N_tokenizer(argc, argv); + test_icu_I18N_chain(argc, argv); + test_bug_1140(); + +#else // HAVE_ICU + + printf("ICU unit tests omitted.\n" + "Please install libicu36-dev and icu-doc or similar\n"); + YAZ_CHECK(0 == 0); + +#endif // HAVE_ICU + + YAZ_CHECK_TERM; +} + + +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + + + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ diff --git a/util/Makefile.am b/util/Makefile.am index 08ae41a..ce451b3 100644 --- a/util/Makefile.am +++ b/util/Makefile.am @@ -1,16 +1,17 @@ ## Copyright (C) 1995-2007, Index Data ## All rights reserved. 
-## $Id: Makefile.am,v 1.32 2007-06-03 09:57:25 adam Exp $ +## $Id: Makefile.am,v 1.33 2007-10-22 12:21:40 adam Exp $ bin_SCRIPTS = yaz-asncomp yaz-config -EXTRA_DIST = yaz-asncomp +EXTRA_DIST = yaz-asncomp yaz-icu-example.xml DISTCLEANFILES = yaz-config AM_CPPFLAGS=-I$(top_srcdir)/include $(XML2_CFLAGS) +yaz_icu_CPPFLAGS=$(AM_CPPFLAGS) $(ICU_CPPFLAGS) -bin_PROGRAMS = yaz-marcdump yaz-iconv yaz-illclient +bin_PROGRAMS = yaz-marcdump yaz-iconv yaz-illclient yaz-icu noinst_PROGRAMS = cclsh cql2pqf cql2xcql srwtst yaz-benchmark yaz-xmlquery # MARC dumper utility @@ -41,3 +42,6 @@ yaz_xmlquery_LDADD = ../src/libyaz.la yaz_illclient_SOURCES = yaz-illclient.c yaz_illclient_LDADD = ../src/libyaz.la $(READLINE_LIBS) + +yaz_icu_SOURCES = yaz-icu.c +yaz_icu_LDADD =../src/libyaz_icu.la ../src/libyaz.la $(ICU_LIBS) diff --git a/util/yaz-icu-example.xml b/util/yaz-icu-example.xml new file mode 100644 index 0000000..ea7e7d2 --- /dev/null +++ b/util/yaz-icu-example.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/util/yaz-icu.c b/util/yaz-icu.c new file mode 100644 index 0000000..4247f80 --- /dev/null +++ b/util/yaz-icu.c @@ -0,0 +1,556 @@ +/* $Id: yaz-icu.c,v 1.1 2007-10-22 12:21:40 adam Exp $ + Copyright (c) 2006-2007, Index Data. + +This file is part of Pazpar2. + +Pazpar2 is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Pazpar2; see the file LICENSE. 
If not, write to the
+Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+02111-1307, USA.
+ */
+
+#if HAVE_CONFIG_H
+#include "cconfig.h"
+#endif
+
+#include
+
+#include
+#include
+
+//#include
+#include
+
+
+#ifdef HAVE_ICU
+
+#include
+#include
+
+#include
+
+/* command line and config parameters */
+static struct config_t {
+    char conffile[1024];      /* argument of -c, empty when not given */
+    char print[1024];         /* argument of -p, empty when not given */
+    int xmloutput;            /* set to 1 by -x */
+    struct icu_chain * chain;
+    FILE * infile;
+    FILE * outfile;
+} config;
+
+
+
+/* Print the usage text to stderr and terminate with exit status 1. */
+void print_option_error(const struct config_t *p_config)
+{
+    fprintf(stderr, "Calling error, valid options are :\n");
+    fprintf(stderr, "yaz-icu\n"
+            " [-c (path/to/config/file.xml)]\n"
+            " [-p (a|c|l|t)] print ICU info \n"
+            " [-x] XML output\n"
+            "\n"
+            "Examples:\n"
+            "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
+            "./yaz-icu -p c\n"
+            "./yaz-icu -p l -x\n"
+            "./yaz-icu -p t -x\n"
+            "\n"
+            "Example ICU chain XML configuration file:\n"
+            "\n"
+            " \n"
+            " \n"
+            " \n"
+            " \n"
+            " \n"
+            " \n"
+            "\n"
+          );
+    exit(1);
+}
+
+/* Copy an option argument into a fixed-size field of the configuration.
+   The argument comes straight from the command line, so its length is
+   checked first; a missing or oversized argument ends the program
+   through print_option_error(). */
+static void copy_option_arg(char *dst, size_t dst_size, const char *src,
+                            int opt, const struct config_t *p_config)
+{
+    size_t len = src ? strlen(src) : 0;
+
+    if (!src || len >= dst_size){
+        fprintf(stderr, "yaz-icu: argument of option -%c is missing "
+                "or too long\n", opt);
+        print_option_error(p_config);
+    }
+
+    memcpy(dst, src, len + 1);
+}
+
+/* Reset *p_config to its defaults (no config file, nothing to print,
+   plain output, stdin/stdout) and fill it from the command line.
+   Recognised options: -c <file>, -p <what>, -x. Unknown options, or
+   giving neither -c nor -p, end the program through
+   print_option_error(). */
+void read_params(int argc, char **argv, struct config_t *p_config)
+{
+    char *arg;
+    int ret;
+
+    /* set default parameters */
+    p_config->conffile[0] = 0;
+    p_config->print[0] = 0;
+    p_config->xmloutput = 0;
+    p_config->chain = 0;
+    p_config->infile = stdin;
+    p_config->outfile = stdout;
+
+    /* set up command line parameters */
+
+    while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
+    {
+        switch (ret)
+        {
+        case 'c':
+            copy_option_arg(p_config->conffile, sizeof p_config->conffile,
+                            arg, 'c', p_config);
+            break;
+        case 'p':
+            copy_option_arg(p_config->print, sizeof p_config->print,
+                            arg, 'p', p_config);
+            break;
+        case 'x':
+            p_config->xmloutput = 1;
+            break;
+        default:
+            print_option_error(p_config);
+        }
+    }
+
+    /* validate the structure that was just filled in, not the global */
+    if ((!p_config->conffile[0] && !p_config->print[0])
+        || !p_config->infile
+        || !p_config->outfile)
+        print_option_error(p_config);
+}
+
+
+/* UConverter *conv; */
+/* conv = ucnv_open("utf-8", &status); */
+/* assert(U_SUCCESS(status)); */
+
+/* *ustr16_len */
+/* = ucnv_toUChars(conv, ustr16, 1024, */
+/* (const char *) *xstr8,
strlen((const char *) *xstr8), */ +/* &status); */ + + + +/* ucnv_fromUChars(conv, */ +/* (char *) *xstr8, strlen((const char *) *xstr8), */ +/* ustr16, *ustr16_len, */ +/* &status); */ +/* ucnv_close(conv); */ + + +static void print_icu_converters(const struct config_t *p_config) +{ + int32_t count; + int32_t i; + + count = ucnv_countAvailable(); + if (p_config->xmloutput) + fprintf(config.outfile, "\n", + count, ucnv_getDefaultName()); + else { + fprintf(config.outfile, "Available ICU converters: %d\n", count); + fprintf(config.outfile, "Default ICU Converter is: '%s'\n", + ucnv_getDefaultName()); + } + + for(i=0;ixmloutput) + fprintf(config.outfile, "\n", + ucnv_getAvailableName(i)); + else + fprintf(config.outfile, "%s ", ucnv_getAvailableName(i)); + } + + if (p_config->xmloutput) + fprintf(config.outfile, "\n"); + else + fprintf(config.outfile, "\n"); +} + +static void print_icu_transliterators(const struct config_t *p_config) +{ + int32_t count; + int32_t i; + + count = utrans_countAvailableIDs(); + + int32_t buf_cap = 128; + char buf[buf_cap]; + + if (p_config->xmloutput) + fprintf(config.outfile, "\n", count); + else + fprintf(config.outfile, "Available ICU transliterators: %d\n", count); + + for(i = 0; i xmloutput) + fprintf(config.outfile, "\n", buf); + else + fprintf(config.outfile, " %s", buf); + } + + if (p_config->xmloutput){ + fprintf(config.outfile, "\n"); + } + else + { + fprintf(config.outfile, "\n\nUnicode Set Patterns:\n" + " Pattern Description\n" + " Ranges [a-z] The lower case letters a through z\n" + " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n" + " String [abc{def}] chars a, b and c, and string 'def'\n" + " Categories [\\p{Letter}] Perl General Category 'Letter'.\n" + " Categories [:Letter:] Posix General Category 'Letter'.\n" + "\n" + " Combination Example\n" + " Union [[:Greek:] [:letter:]]\n" + " Intersection [[:Greek:] & [:letter:]]\n" + " Set Complement [[:Greek:] - [:letter:]]\n" + " Complement [^[:Greek:] 
[:letter:]]\n" + "\n" + "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n" + "\n" + "Examples:\n" + " [:Punctuation:] Any-Remove\n" + " [:Cased-Letter:] Any-Upper\n" + " [:Control:] Any-Remove\n" + " [:Decimal_Number:] Any-Remove\n" + " [:Final_Punctuation:] Any-Remove\n" + " [:Georgian:] Any-Upper\n" + " [:Katakana:] Any-Remove\n" + " [:Arabic:] Any-Remove\n" + " [:Punctuation:] Remove\n" + " [[:Punctuation:]-[.,]] Remove\n" + " [:Line_Separator:] Any-Remove\n" + " [:Math_Symbol:] Any-Remove\n" + " Lower; [:^Letter:] Remove (word tokenization)\n" + " [:^Number:] Remove (numeric tokenization)\n" + " [:^Katagana:] Remove (remove everything except Katagana)\n" + " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n" + " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents from characters)\n" + " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n" + " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n" + "\n" + "see http://icu.sourceforge.net/userguide/Transform.html\n" + " http://www.unicode.org/Public/UNIDATA/UCD.html\n" + " http://icu.sourceforge.net/userguide/Transform.html\n" + " http://icu.sourceforge.net/userguide/TransformRule.html\n" + ); + + + fprintf(config.outfile, "\n\n"); + + } +} + +static void print_icu_xml_locales(const struct config_t *p_config) +{ + int32_t count; + int32_t i; + UErrorCode status = U_ZERO_ERROR; + + UChar keyword[64]; + int32_t keyword_len = 0; + char keyword_str[128]; + int32_t keyword_str_len = 0; + + UChar language[64]; + int32_t language_len = 0; + char lang_str[128]; + int32_t lang_str_len = 0; + + UChar script[64]; + int32_t script_len = 0; + char script_str[128]; + int32_t script_str_len = 0; + + UChar location[64]; + int32_t location_len = 0; + char location_str[128]; + int32_t location_str_len = 0; + + UChar variant[64]; + int32_t variant_len = 0; + char variant_str[128]; + int32_t variant_str_len = 0; + + UChar name[64]; + int32_t 
name_len = 0; + char name_str[128]; + int32_t name_str_len = 0; + + UChar localname[64]; + int32_t localname_len = 0; + char localname_str[128]; + int32_t localname_str_len = 0; + + count = uloc_countAvailable() ; + + if (p_config->xmloutput){ + + fprintf(config.outfile, "\n", + count, uloc_getDefault(), ucol_countAvailable()); + } + + for(i=0;ixmloutput){ + fprintf(config.outfile, ""); + if (strlen(localname_str)) + fprintf(config.outfile, "%s", localname_str); + fprintf(config.outfile, "\n"); + } + else if (1 == p_config->xmloutput){ + fprintf(config.outfile, "%s", uloc_getAvailable(i)); + fprintf(config.outfile, " | "); + if (strlen(name_str)) + fprintf(config.outfile, "%s", name_str); + fprintf(config.outfile, " | "); + if (strlen(localname_str)) + fprintf(config.outfile, "%s", localname_str); + fprintf(config.outfile, "\n"); + } + else + fprintf(config.outfile, "%s ", uloc_getAvailable(i)); + } + if (p_config->xmloutput) + fprintf(config.outfile, "\n"); + else + fprintf(config.outfile, "\n"); + + if(U_FAILURE(status)) { + fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status)); + exit(status); + } +} + + +static void print_info(const struct config_t *p_config) +{ + if (p_config->xmloutput) + fprintf(config.outfile, "\n" + "\n"); + + if ('c' == config.print[0]) + print_icu_converters(&config); + else if ('l' == config.print[0]) + print_icu_xml_locales(&config); + else if ('t' == config.print[0]) + print_icu_transliterators(&config); + else { + print_icu_converters(&config); + print_icu_xml_locales(&config); + print_icu_transliterators(&config); + } + + if (p_config->xmloutput) + fprintf(config.outfile, "\n"); + + exit(0); +}; + + + +static void process_text_file(const struct config_t *p_config) +{ + char *line = 0; + char linebuf[1024]; + + xmlDoc *doc = xmlParseFile(config.conffile); + xmlNode *xml_node = xmlDocGetRootElement(doc); + + long unsigned int token_count = 0; + long unsigned int line_count = 0; + + UErrorCode status = U_ZERO_ERROR; + int 
success = 0; + + if (! xml_node) { + printf("Could not parse XML config file '%s' \n", + config.conffile); + exit (1); + } + + + config.chain = icu_chain_xml_config(xml_node, &status); + + if (config.chain && U_SUCCESS(status)) + success = 1; + else { + printf("Could not set up ICU chain from config file '%s' \n", + config.conffile); + exit (1); + } + + if (p_config->xmloutput) + fprintf(config.outfile, + "\n" + "\n" + "\n"); + + // read input lines for processing + while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile))) + { + success = icu_chain_assign_cstr(config.chain, line, &status); + line_count++; + + while (success && icu_chain_next_token(config.chain, &status)){ + if (U_FAILURE(status)) + success = 0; + else { + token_count++; + if (p_config->xmloutput) + fprintf(config.outfile, + "\n", + token_count, + line_count, + icu_chain_get_norm(config.chain), + icu_chain_get_display(config.chain)); + else + fprintf(config.outfile, "%lu %lu '%s' '%s'\n", + token_count, + line_count, + icu_chain_get_norm(config.chain), + icu_chain_get_display(config.chain)); + } + } + + } + + if (p_config->xmloutput) + fprintf(config.outfile, + "\n" + "\n"); + + icu_chain_destroy(config.chain); + xmlFreeDoc(doc); + if (line) + free(line); +}; + +#endif // HAVE_ICU + + +int main(int argc, char **argv) +{ + +#ifdef HAVE_ICU + + read_params(argc, argv, &config); + + if (config.conffile && strlen(config.conffile)) + process_text_file(&config); + + if (config.print && strlen(config.print)) + print_info(&config); + +#else // HAVE_ICU + + printf("ICU not available on your system.\n" + "Please install libicu36-dev and icu-doc or similar, " + "re-configure and re-compile\n"); + + +#endif // HAVE_ICU + + return(0); +}; + + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/yaz-config.in b/yaz-config.in index 5eba756..76b9152 100644 --- a/yaz-config.in +++ b/yaz-config.in @@ -1,5 +1,5 @@ 
#!/bin/sh -# $Id: yaz-config.in,v 1.30 2007-04-18 18:46:37 adam Exp $ +# $Id: yaz-config.in,v 1.31 2007-10-22 12:21:38 adam Exp $ yazprefix=@prefix@ yaz_echo_cflags=no yaz_echo_libs=no @@ -11,6 +11,8 @@ yaz_src_root=@YAZ_SRC_ROOT@ yaz_build_root=@YAZ_BUILD_ROOT@ PTHREAD_LIBS="@PTHREAD_LIBS@" PTHREAD_CFLAGS="@PTHREAD_CFLAGS@" +ICU_LIBS="@ICU_LIBS@" +ICU_CPPFLAGS="@ICU_CPPFLAGS@" yazextralibs="@LIBS@" YAZVERSION=@VERSION@ @@ -65,6 +67,9 @@ while test $# -gt 0; do threads) lib_thread=yes ;; + icu) + lib_icu=yes + ;; server) lib_thread=yes lib_server=yes @@ -84,6 +89,10 @@ if test "$lib_server" = "yes"; then lyaz="-lyaz_server $lyaz" fi +if test "$lib_icu" = "yes"; then + lyaz="-lyaz_icu $lyaz" +fi + YAZINC="@YAZ_CONFIG_CFLAGS@" if test "$yaz_echo_source" = "yes"; then @@ -127,6 +136,12 @@ if test "$lib_thread" = "yes"; then YAZLALIB="$YAZLALIB $PTHREAD_LIBS" fi +if test "$lib_icu" = "yes"; then + YAZINC="$YAZINC $ICU_CPPFLAGS" + YAZLIB="$YAZLIB $ICU_LIBS" + YAZLALIB="$YAZLALIB $ICU_LIBS" +fi + if test "$yaz_echo_help" = "yes"; then usage 1 1>&2 fi -- 1.7.10.4