From d22d19e9cecc3a50dfca023434580b38e53a9ef7 Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Mon, 7 May 2007 09:31:36 +0000 Subject: [PATCH] moved working ICU sorting into YAZ unittest test_icu_I18N.c commented casemapping out for the time being, need to integrate with new dynamic ICU buffers --- src/icu_I18N.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++++- src/icu_I18N.h | 62 +++++++++++++- src/test_icu_I18N.c | 157 +++++++++++++++++++++++++++++++--- 3 files changed, 437 insertions(+), 19 deletions(-) diff --git a/src/icu_I18N.c b/src/icu_I18N.c index c0a7407..6dd150e 100644 --- a/src/icu_I18N.c +++ b/src/icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.c,v 1.4 2007-05-02 14:01:36 marc Exp $ +/* $Id: icu_I18N.c,v 1.5 2007-05-07 09:31:36 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -35,6 +35,8 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include +#include +#include #include /* some more string fcns*/ #include /* char names */ @@ -49,6 +51,238 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA /* #include */ + + +int icu_check_status (UErrorCode status) +{ + //if(U_FAILURE(status)) + if(!U_SUCCESS(status)) + yaz_log(YLOG_WARN, + "ICU: %d %s\n", status, u_errorName(status)); + return status; +} + + + +struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity) +{ + struct icu_buf_utf16 * buf16 + = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16)); + + buf16->utf16 = 0; + buf16->utf16_len = 0; + buf16->utf16_cap = 0; + + if (capacity > 0){ + buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity); + buf16->utf16[0] = (UChar) 0; + buf16->utf16_cap = capacity; + } + return buf16; +}; + + +struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16, + size_t capacity) +{ + if (buf16){ + if (capacity > 0){ + if (0 == buf16->utf16) + buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity); + else + buf16->utf16 + = (UChar *) realloc(buf16->utf16, 
sizeof(UChar) * capacity); + buf16->utf16[0] = (UChar) 0; + buf16->utf16_len = 0; + buf16->utf16_cap = capacity; + } + else { + if (buf16->utf16) + free(buf16->utf16); + buf16->utf16 = 0; + buf16->utf16_len = 0; + buf16->utf16_cap = 0; + } + } + + return buf16; +}; + + +void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16) +{ + if (buf16){ + if (buf16->utf16) + free(buf16->utf16); + free(buf16); + } +}; + + + + + + +struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity) +{ + struct icu_buf_utf8 * buf8 + = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8)); + + buf8->utf8 = 0; + buf8->utf8_len = 0; + buf8->utf8_cap = 0; + + if (capacity > 0){ + buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity); + buf8->utf8[0] = (uint8_t) 0; + buf8->utf8_cap = capacity; + } + return buf8; +}; + + + +struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8, + size_t capacity) +{ + if (buf8){ + if (capacity > 0){ + if (0 == buf8->utf8) + buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity); + else + buf8->utf8 + = (uint8_t *) realloc(buf8->utf8, sizeof(uint8_t) * capacity); + buf8->utf8[0] = (uint8_t) 0; + buf8->utf8_len = 0; + buf8->utf8_cap = capacity; + } + else { + if (buf8->utf8) + free(buf8->utf8); + buf8->utf8 = 0; + buf8->utf8_len = 0; + buf8->utf8_cap = 0; + } + } + + return buf8; +}; + + + +void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8) +{ + if (buf8){ + if (buf8->utf8) + free(buf8->utf8); + free(buf8); + } +}; + + + +UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16, + struct icu_buf_utf8 * src8, + UErrorCode * status) +{ + int32_t utf16_len = 0; + + u_strFromUTF8(dest16->utf16, dest16->utf16_cap, + &utf16_len, + (const char *) src8->utf8, src8->utf8_len, status); + + // check for buffer overflow, resize and retry + if (*status == U_BUFFER_OVERFLOW_ERROR + //|| dest16->utf16_len > dest16->utf16_cap + ){ + icu_buf_utf16_resize(dest16, utf16_len * 2); + *status = U_ZERO_ERROR; + u_strFromUTF8(dest16->utf16, 
dest16->utf16_cap, + &utf16_len, + (const char *) src8->utf8, src8->utf8_len, status); + } + + if (*status != U_BUFFER_OVERFLOW_ERROR + && utf16_len < dest16->utf16_cap) + dest16->utf16_len = utf16_len; + else { + dest16->utf16[0] = (UChar) 0; + dest16->utf16_len = 0; + } + + return *status; +}; + + + +UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16, + const char * src8cstr, + UErrorCode * status) +{ + size_t src8cstr_len = 0; + int32_t utf16_len = 0; + + src8cstr_len = strlen(src8cstr); + + u_strFromUTF8(dest16->utf16, dest16->utf16_cap, + &utf16_len, + src8cstr, src8cstr_len, status); + + // check for buffer overflow, resize and retry + if (*status == U_BUFFER_OVERFLOW_ERROR + //|| dest16->utf16_len > dest16->utf16_cap + ){ + icu_buf_utf16_resize(dest16, utf16_len * 2); + *status = U_ZERO_ERROR; + u_strFromUTF8(dest16->utf16, dest16->utf16_cap, + &utf16_len, + src8cstr, src8cstr_len, status); + } + + if (*status != U_BUFFER_OVERFLOW_ERROR + && utf16_len < dest16->utf16_cap) + dest16->utf16_len = utf16_len; + else { + dest16->utf16[0] = (UChar) 0; + dest16->utf16_len = 0; + } + + return *status; +}; + + +UErrorCode icu_sortkey8_from_utf16(UCollator *coll, + struct icu_buf_utf8 * dest8, + struct icu_buf_utf16 * src16, + UErrorCode * status) +{ + + int32_t sortkey_len = 0; + + sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len, + dest8->utf8, dest8->utf8_cap); + + // check for buffer overflow, resize and retry + if (sortkey_len > dest8->utf8_cap) { + icu_buf_utf8_resize(dest8, sortkey_len * 2); + sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len, + dest8->utf8, dest8->utf8_cap); + } + + if (sortkey_len > 0) + dest8->utf8_len = sortkey_len; + + return *status; +}; + + + + + +/// CRAP FOLLOWING HERE ... 
+ +#if 0 + // forward declarations for helper functions int icu_check_status (UErrorCode status); @@ -277,6 +511,7 @@ char * icu_sortmap(NMEM nmem, char *buf, size_t buf_cap, return dest8; } +#endif diff --git a/src/icu_I18N.h b/src/icu_I18N.h index 6dc7096..eb44204 100644 --- a/src/icu_I18N.h +++ b/src/icu_I18N.h @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.h,v 1.4 2007-05-02 14:01:36 marc Exp $ +/* $Id: icu_I18N.h,v 1.5 2007-05-07 09:31:36 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -27,11 +27,11 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include -//#include /* Basic ICU data types */ -//#include /* char names */ +#include /* Basic ICU data types */ +#include /* char names */ //#include -//#include +#include //#include /* C Converter API */ //#include /* some more string fcns*/ //#include @@ -39,7 +39,59 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA //#include +int icu_check_status (UErrorCode status); +struct icu_buf_utf16 +{ + UChar * utf16; + int32_t utf16_len; + int32_t utf16_cap; +}; + +struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity); +struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16, + size_t capacity); +void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16); + + + +struct icu_buf_utf8 +{ + uint8_t * utf8; + int32_t utf8_len; + int32_t utf8_cap; +}; + +struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity); +struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8, + size_t capacity); +void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8); + + +UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16, + struct icu_buf_utf8 * src8, + UErrorCode * status); + +UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16, + const char * src8cstr, + UErrorCode * status); + +UErrorCode icu_sortkey8_from_utf16(UCollator *coll, + struct icu_buf_utf8 * dest8, + struct icu_buf_utf16 * src16, + UErrorCode * status); + + + + + + + 
+ + +// CRAP to Follow here ... + +#if 0 struct icu_termmap { char * sort_key; // standard C string '\0' terminated @@ -59,6 +111,8 @@ char * icu_sortmap(NMEM nmem, char *buf, size_t buf_cap, size_t *dest8_len, const char *src8, const char *locale); +#endif // 0 + #endif // HAVE_ICU #endif // ICU_I18NL_H diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c index 62b5255..065a8f0 100644 --- a/src/test_icu_I18N.c +++ b/src/test_icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: test_icu_I18N.c,v 1.7 2007-05-02 14:01:36 marc Exp $ +/* $Id: test_icu_I18N.c,v 1.8 2007-05-07 09:31:36 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -39,10 +39,45 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include "icu_I18N.h" #include #include +#include + + +#include /* some more string fcns*/ +#include /* char names */ +//#include +//#include /* Basic ICU data types */ +#include // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +#define MAX_KEY_SIZE 256 + +struct icu_termmap +{ + uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated + char disp_term[MAX_KEY_SIZE]; // standard C utf-8 string +}; + + + +int icu_termmap_cmp(const void *vp1, const void *vp2) +{ + struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1; + struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2; + + int cmp = 0; + + cmp = strcmp((const char *)itmp1->sort_key, + (const char *)itmp2->sort_key); + return cmp; +} + + + +#if 0 + int test_icu_casemap(const char * locale, char action, const char * src8, const char * check8) { @@ -193,8 +228,11 @@ void test_icu_I18N_casemap_failures(int argc, char **argv) nmem_destroy(nmem); } +#endif + // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 +#if 0 int test_icu_sortmap(const char * locale, size_t list_len, const char ** src8_list, const char ** check8_list) { @@ -260,6 +298,97 @@ int test_icu_sortmap(const char * locale, size_t list_len, return sucess; } +#else + +int 
test_icu_sortmap(const char * locale, int src_list_len, + const char ** src_list, const char ** chk_list) +{ + int success = 1; + + UErrorCode status = U_ZERO_ERROR; + + struct icu_buf_utf8 * buf8 = icu_buf_utf8_create(0); + struct icu_buf_utf16 * buf16 = icu_buf_utf16_create(0); + + int i; + + struct icu_termmap * list[src_list_len]; + + UCollator *coll = ucol_open(locale, &status); + icu_check_status(status); + + if(!U_SUCCESS(status)) + return 0; + + // assigning display terms and sort keys using buf 8 and buf16 + for( i = 0; i < src_list_len; i++) + { + + list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap)); + + // copy display term + strcpy(list[i]->disp_term, src_list[i]); + + // transforming to UTF16 + icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status); + icu_check_status(status); + + // computing sortkeys + icu_sortkey8_from_utf16(coll, buf8, buf16, &status); + icu_check_status(status); + + // assigning sortkeys + memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len); + //strncpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len); + //strcpy((char *) list[i]->sort_key, (const char *) buf8->utf8); + } + + + // do the sorting + qsort(list, src_list_len, + sizeof(struct icu_termmap *), icu_termmap_cmp); + + // checking correct sorting + for (i = 0; i < src_list_len; i++){ + if (0 != strcmp(list[i]->disp_term, chk_list[i])){ + success = 0; + } + } + + if(!success){ + printf("\nERROR\n"); + printf("Input str: '%s' : ", locale); + for (i = 0; i < src_list_len; i++) { + printf(" '%s'", list[i]->disp_term); + } + printf("\n"); + printf("ICU sort: '%s' : ", locale); + for (i = 0; i < src_list_len; i++) { + printf(" '%s'", list[i]->disp_term); + //printf("(%d|%d)", list[i]->sort_key[0],list[i]->sort_key[1]); + } + printf("\n"); + printf("Expected: '%s' : ", locale); + for (i = 0; i < src_list_len; i++) { + printf(" '%s'", chk_list[i]); + } + printf("\n"); + } + + + ucol_close(coll); + + icu_buf_utf8_destroy(buf8); + icu_buf_utf16_destroy(buf16); 
+ + + + return success; +} + + +#endif + // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 @@ -269,27 +398,27 @@ void test_icu_I18N_sortmap(int argc, char **argv) // sucessful tests size_t en_1_len = 6; const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"}; - const char * en_1_cck[6] = {"a", "A", "K", "k", "z", "Z"}; + const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"}; YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck)); - YAZ_CHECK(0 == test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck)); - YAZ_CHECK(0 == test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck)); - YAZ_CHECK(0 == test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck)); - YAZ_CHECK(0 == test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck)); + YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck)); + YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck)); + YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck)); + YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck)); - // sucessful tests - this one fails and should not!!! 
+ // sucessful tests size_t da_1_len = 6; const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"}; const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"}; - YAZ_CHECK(0 == test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck)); - YAZ_CHECK(0 == test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck)); + YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck)); + YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck)); // sucessful tests size_t de_1_len = 9; const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"}; - const char * de_1_cck[9] = {"ä", "a", "o", "ö", "s", "ß", "t", "u", "ü"}; + const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"}; YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck)); - YAZ_CHECK(0 == test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck)); - YAZ_CHECK(0 == test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck)); + YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck)); + YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck)); } @@ -306,8 +435,8 @@ int main(int argc, char **argv) #ifdef HAVE_ICU - test_icu_I18N_casemap_failures(argc, argv); - test_icu_I18N_casemap(argc, argv); + //test_icu_I18N_casemap_failures(argc, argv); + //test_icu_I18N_casemap(argc, argv); test_icu_I18N_sortmap(argc, argv); #else -- 1.7.10.4