-/* $Id: icu_I18N.c,v 1.4 2007-05-02 14:01:36 marc Exp $
+/* $Id: icu_I18N.c,v 1.5 2007-05-07 09:31:36 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
#include <yaz/log.h>
#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
#include <unicode/ustring.h> /* some more string fcns*/
#include <unicode/uchar.h> /* char names */
/* #include <unicode/unistr.h> */
+
+
+/*
+ * Report a non-success ICU status through yaz_log at warning level.
+ *
+ * status: the ICU error code to inspect.
+ *
+ * Returns status unchanged, so the call can wrap an expression.
+ */
+int icu_check_status (UErrorCode status)
+{
+    const int succeeded = U_SUCCESS(status);
+
+    if (!succeeded) {
+        yaz_log(YLOG_WARN,
+                "ICU: %d %s\n", status, u_errorName(status));
+    }
+
+    return status;
+}
+
+
+
+/*
+ * Allocate a UTF-16 buffer object.
+ *
+ * capacity: number of UChar units to pre-allocate; 0 creates an empty
+ *           buffer with no storage attached.
+ *
+ * Returns the new buffer, to be released with icu_buf_utf16_destroy(),
+ * or 0 when memory cannot be allocated.
+ */
+struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
+{
+    struct icu_buf_utf16 * buf16
+        = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16));
+
+    if (!buf16)
+        return 0;
+
+    buf16->utf16 = 0;
+    buf16->utf16_len = 0;
+    buf16->utf16_cap = 0;
+
+    if (capacity > 0){
+        buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
+        if (!buf16->utf16){
+            /* never hand out a half-built object */
+            free(buf16);
+            return 0;
+        }
+        buf16->utf16[0] = (UChar) 0;
+        buf16->utf16_cap = (int32_t) capacity;
+    }
+    return buf16;
+}
+
+
+/*
+ * Change the storage size of a UTF-16 buffer.
+ *
+ * buf16:    buffer to resize; a null pointer is returned untouched.
+ * capacity: new size in UChar units; 0 releases the storage.
+ *
+ * On success the content is discarded: the length is reset to 0 and the
+ * first unit is set to 0. If the allocation fails the buffer is left
+ * exactly as it was, so callers can detect the failure by checking
+ * utf16_cap against the capacity they asked for.
+ *
+ * Returns buf16.
+ */
+struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
+                                            size_t capacity)
+{
+    UChar * grown;
+
+    if (!buf16)
+        return buf16;
+
+    if (capacity == 0){
+        free(buf16->utf16);
+        buf16->utf16 = 0;
+        buf16->utf16_len = 0;
+        buf16->utf16_cap = 0;
+        return buf16;
+    }
+
+    /* realloc on a null pointer acts like malloc, so one call covers
+       both the first allocation and later regrowth; going through a
+       temporary keeps the old block reachable when realloc fails */
+    grown = (UChar *) realloc(buf16->utf16, sizeof(UChar) * capacity);
+    if (!grown)
+        return buf16;
+
+    buf16->utf16 = grown;
+    buf16->utf16[0] = (UChar) 0;
+    buf16->utf16_len = 0;
+    buf16->utf16_cap = (int32_t) capacity;
+
+    return buf16;
+}
+
+
+/*
+ * Release a UTF-16 buffer object together with the storage it owns.
+ * A null buf16 is ignored.
+ */
+void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
+{
+    if (!buf16)
+        return;
+
+    free(buf16->utf16);   /* free() accepts a null pointer */
+    free(buf16);
+}
+
+
+
+
+
+
+/*
+ * Allocate a byte buffer object.
+ *
+ * capacity: number of bytes to pre-allocate; 0 creates an empty buffer
+ *           with no storage attached.
+ *
+ * Returns the new buffer, to be released with icu_buf_utf8_destroy(),
+ * or 0 when memory cannot be allocated.
+ */
+struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
+{
+    struct icu_buf_utf8 * buf8
+        = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8));
+
+    if (!buf8)
+        return 0;
+
+    buf8->utf8 = 0;
+    buf8->utf8_len = 0;
+    buf8->utf8_cap = 0;
+
+    if (capacity > 0){
+        buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
+        if (!buf8->utf8){
+            /* never hand out a half-built object */
+            free(buf8);
+            return 0;
+        }
+        buf8->utf8[0] = (uint8_t) 0;
+        buf8->utf8_cap = (int32_t) capacity;
+    }
+    return buf8;
+}
+
+
+
+/*
+ * Change the storage size of a byte buffer.
+ *
+ * buf8:     buffer to resize; a null pointer is returned untouched.
+ * capacity: new size in bytes; 0 releases the storage.
+ *
+ * On success the content is discarded: the length is reset to 0 and the
+ * first byte is set to 0. If the allocation fails the buffer is left
+ * exactly as it was, so callers can detect the failure by checking
+ * utf8_cap against the capacity they asked for.
+ *
+ * Returns buf8.
+ */
+struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
+                                          size_t capacity)
+{
+    uint8_t * grown;
+
+    if (!buf8)
+        return buf8;
+
+    if (capacity == 0){
+        free(buf8->utf8);
+        buf8->utf8 = 0;
+        buf8->utf8_len = 0;
+        buf8->utf8_cap = 0;
+        return buf8;
+    }
+
+    /* realloc on a null pointer acts like malloc, so one call covers
+       both the first allocation and later regrowth; going through a
+       temporary keeps the old block reachable when realloc fails */
+    grown = (uint8_t *) realloc(buf8->utf8, sizeof(uint8_t) * capacity);
+    if (!grown)
+        return buf8;
+
+    buf8->utf8 = grown;
+    buf8->utf8[0] = (uint8_t) 0;
+    buf8->utf8_len = 0;
+    buf8->utf8_cap = (int32_t) capacity;
+
+    return buf8;
+}
+
+
+
+/*
+ * Release a byte buffer object together with the storage it owns.
+ * A null buf8 is ignored.
+ */
+void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
+{
+    if (!buf8)
+        return;
+
+    free(buf8->utf8);   /* free() accepts a null pointer */
+    free(buf8);
+}
+
+
+
+/*
+ * Convert the UTF-8 bytes held in src8 into UTF-16 stored in dest16.
+ *
+ * dest16: receives the converted text; its storage is resized when the
+ *         result, including the terminating 0, does not fit.
+ * src8:   source bytes; utf8_len bytes are read from utf8.
+ * status: in/out ICU error code.
+ *
+ * When the returned status is a success code, dest16 holds the complete
+ * 0-terminated result and utf16_len is its length. In every other case
+ * dest16 is left empty (utf16_len == 0) and the status is a failure:
+ * the code reported by ICU, U_MEMORY_ALLOCATION_ERROR when the storage
+ * could not be enlarged, or U_BUFFER_OVERFLOW_ERROR when the result
+ * still does not fit after the retry.
+ *
+ * Returns *status.
+ */
+UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
+                               struct icu_buf_utf8 * src8,
+                               UErrorCode * status)
+{
+    int32_t utf16_len = 0;
+
+    u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
+                  &utf16_len,
+                  (const char *) src8->utf8, src8->utf8_len, status);
+
+    /* Resize and retry when the text overflowed, and also when it
+       filled the storage exactly and left no room for the terminator. */
+    if (*status == U_BUFFER_OVERFLOW_ERROR
+        || (U_SUCCESS(*status) && utf16_len >= dest16->utf16_cap)){
+        /* at least one unit, so an empty result still gets a terminator */
+        size_t wanted = utf16_len > 0 ? (size_t) utf16_len * 2 : 1;
+
+        icu_buf_utf16_resize(dest16, wanted);
+
+        if (dest16->utf16_cap > utf16_len){
+            *status = U_ZERO_ERROR;
+            u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
+                          &utf16_len,
+                          (const char *) src8->utf8, src8->utf8_len,
+                          status);
+        }
+        else
+            *status = U_MEMORY_ALLOCATION_ERROR;
+    }
+
+    if (U_SUCCESS(*status) && utf16_len < dest16->utf16_cap)
+        dest16->utf16_len = utf16_len;
+    else {
+        /* never report success for a result that was thrown away */
+        if (U_SUCCESS(*status))
+            *status = U_BUFFER_OVERFLOW_ERROR;
+        /* the buffer may own no storage at all, so guard the write */
+        if (dest16->utf16 && dest16->utf16_cap > 0)
+            dest16->utf16[0] = (UChar) 0;
+        dest16->utf16_len = 0;
+    }
+
+    return *status;
+}
+
+
+
+/*
+ * Convert a 0-terminated UTF-8 C string into UTF-16 stored in dest16.
+ *
+ * dest16:   receives the converted text; its storage is resized when
+ *           the result, including the terminating 0, does not fit.
+ * src8cstr: 0-terminated source string; must not be null.
+ * status:   in/out ICU error code.
+ *
+ * When the returned status is a success code, dest16 holds the complete
+ * 0-terminated result and utf16_len is its length. In every other case
+ * dest16 is left empty (utf16_len == 0) and the status is a failure:
+ * the code reported by ICU, U_MEMORY_ALLOCATION_ERROR when the storage
+ * could not be enlarged, or U_BUFFER_OVERFLOW_ERROR when the result
+ * still does not fit after the retry.
+ *
+ * Returns *status.
+ */
+UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
+                                    const char * src8cstr,
+                                    UErrorCode * status)
+{
+    int32_t src8cstr_len = (int32_t) strlen(src8cstr);
+    int32_t utf16_len = 0;
+
+    u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
+                  &utf16_len,
+                  src8cstr, src8cstr_len, status);
+
+    /* Resize and retry when the text overflowed, and also when it
+       filled the storage exactly and left no room for the terminator. */
+    if (*status == U_BUFFER_OVERFLOW_ERROR
+        || (U_SUCCESS(*status) && utf16_len >= dest16->utf16_cap)){
+        /* at least one unit, so an empty result still gets a terminator */
+        size_t wanted = utf16_len > 0 ? (size_t) utf16_len * 2 : 1;
+
+        icu_buf_utf16_resize(dest16, wanted);
+
+        if (dest16->utf16_cap > utf16_len){
+            *status = U_ZERO_ERROR;
+            u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
+                          &utf16_len,
+                          src8cstr, src8cstr_len, status);
+        }
+        else
+            *status = U_MEMORY_ALLOCATION_ERROR;
+    }
+
+    if (U_SUCCESS(*status) && utf16_len < dest16->utf16_cap)
+        dest16->utf16_len = utf16_len;
+    else {
+        /* never report success for a result that was thrown away */
+        if (U_SUCCESS(*status))
+            *status = U_BUFFER_OVERFLOW_ERROR;
+        /* the buffer may own no storage at all, so guard the write */
+        if (dest16->utf16 && dest16->utf16_cap > 0)
+            dest16->utf16[0] = (UChar) 0;
+        dest16->utf16_len = 0;
+    }
+
+    return *status;
+}
+
+
+/*
+ * Compute the collation sort key of src16 and store it in dest8.
+ *
+ * coll:   collator used to build the key.
+ * dest8:  receives the key bytes; its storage is resized when the key
+ *         does not fit.
+ * src16:  UTF-16 text; utf16_len units are read from utf16.
+ * status: in/out ICU error code.
+ *
+ * On success dest8->utf8_len is the key length reported by
+ * ucol_getSortKey(). When no key could be stored, utf8_len is set to 0
+ * and *status becomes U_MEMORY_ALLOCATION_ERROR (storage could not be
+ * enlarged) or U_INTERNAL_PROGRAM_ERROR (the collator produced no key),
+ * unless *status already held a failure code.
+ *
+ * Returns *status.
+ */
+UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
+                                   struct icu_buf_utf8 * dest8,
+                                   struct icu_buf_utf16 * src16,
+                                   UErrorCode * status)
+{
+    int32_t sortkey_len
+        = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
+                          dest8->utf8, dest8->utf8_cap);
+
+    /* check for buffer overflow, resize and retry */
+    if (sortkey_len > dest8->utf8_cap) {
+        icu_buf_utf8_resize(dest8, (size_t) sortkey_len * 2);
+
+        if (dest8->utf8_cap >= sortkey_len)
+            sortkey_len = ucol_getSortKey(coll,
+                                          src16->utf16, src16->utf16_len,
+                                          dest8->utf8, dest8->utf8_cap);
+        else {
+            /* storage did not grow: nothing usable was written */
+            if (U_SUCCESS(*status))
+                *status = U_MEMORY_ALLOCATION_ERROR;
+            sortkey_len = 0;
+        }
+    }
+
+    if (sortkey_len > 0 && sortkey_len <= dest8->utf8_cap)
+        dest8->utf8_len = sortkey_len;
+    else {
+        /* do not leave a stale length from an earlier key behind */
+        dest8->utf8_len = 0;
+        if (U_SUCCESS(*status))
+            *status = U_INTERNAL_PROGRAM_ERROR;
+    }
+
+    return *status;
+}
+
+
+
+
+
+/* Legacy code below is disabled with #if 0. */
+
+#if 0
+
// forward declarations for helper functions
int icu_check_status (UErrorCode status);
return dest8;
}
+#endif
-/* $Id: icu_I18N.h,v 1.4 2007-05-02 14:01:36 marc Exp $
+/* $Id: icu_I18N.h,v 1.5 2007-05-07 09:31:36 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
#include <yaz/nmem.h>
-//#include <unicode/utypes.h> /* Basic ICU data types */
-//#include <unicode/uchar.h> /* char names */
+#include <unicode/utypes.h> /* Basic ICU data types */
+#include <unicode/uchar.h> /* char names */
//#include <unicode/ustdio.h>
-//#include <unicode/ucol.h>
+#include <unicode/ucol.h>
//#include <unicode/ucnv.h> /* C Converter API */
//#include <unicode/ustring.h> /* some more string fcns*/
//#include <unicode/uloc.h>
//#include <unicode/unistr.h>
+int icu_check_status (UErrorCode status); /* logs a warning for a non-success status and returns status */
+struct icu_buf_utf16 /* resizable UTF-16 buffer handled by the icu_buf_utf16_* functions */
+{
+ UChar * utf16; /* storage, or 0 while no capacity is allocated */
+ int32_t utf16_len; /* number of UChar units in use */
+ int32_t utf16_cap; /* number of UChar units the storage can hold */
+};
+
+struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity); /* new buffer with room for capacity UChar units; 0 allocates no storage */
+struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
+ size_t capacity); /* sets the capacity and resets the length to 0; capacity 0 frees the storage; returns buf16 */
+void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16); /* frees the storage and the buffer object; null is ignored */
+
+
+
+struct icu_buf_utf8 /* resizable byte buffer handled by the icu_buf_utf8_* functions */
+{
+ uint8_t * utf8; /* storage, or 0 while no capacity is allocated */
+ int32_t utf8_len; /* number of bytes in use */
+ int32_t utf8_cap; /* number of bytes the storage can hold */
+};
+
+struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity); /* new buffer with room for capacity bytes; 0 allocates no storage */
+struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
+ size_t capacity); /* sets the capacity and resets the length to 0; capacity 0 frees the storage; returns buf8 */
+void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8); /* frees the storage and the buffer object; null is ignored */
+
+
+UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
+ struct icu_buf_utf8 * src8,
+ UErrorCode * status); /* converts the UTF-8 bytes of src8 into dest16, resizing dest16 on overflow; returns *status */
+
+UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
+ const char * src8cstr,
+ UErrorCode * status); /* same conversion for a 0-terminated C string; returns *status */
+
+UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
+ struct icu_buf_utf8 * dest8,
+ struct icu_buf_utf16 * src16,
+ UErrorCode * status); /* stores the collation sort key of src16 in dest8, resizing dest8 when too small; returns *status */
+
+
+
+
+
+
+
+
+
+// Legacy declarations below are disabled with #if 0.
+
+#if 0
struct icu_termmap
{
char * sort_key; // standard C string '\0' terminated
size_t *dest8_len, const char *src8,
const char *locale);
+#endif // 0
+
#endif // HAVE_ICU
#endif // ICU_I18NL_H
-/* $Id: test_icu_I18N.c,v 1.7 2007-05-02 14:01:36 marc Exp $
+/* $Id: test_icu_I18N.c,v 1.8 2007-05-07 09:31:36 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
#include "icu_I18N.h"
#include <string.h>
#include <stdlib.h>
+#include <stdio.h>
+
+
+#include <unicode/ustring.h> /* some more string fcns*/
+#include <unicode/uchar.h> /* char names */
+//#include <unicode/ustdio.h>
+//#include <unicode/utypes.h> /* Basic ICU data types */
+#include <unicode/ucol.h>
// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+
+#define MAX_KEY_SIZE 256 /* size in bytes of each fixed array in struct icu_termmap */
+
+struct icu_termmap /* one sortable entry: a sort key plus the term it belongs to */
+{
+ uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated
+ char disp_term[MAX_KEY_SIZE]; // standard C utf-8 string
+};
+
+
+
+/*
+ * qsort() comparator for an array of struct icu_termmap pointers.
+ *
+ * vp1, vp2: addresses of two array elements, i.e. pointers to
+ *           struct icu_termmap pointers.
+ *
+ * Returns the strcmp() result of the two sort_key byte strings.
+ */
+int icu_termmap_cmp(const void *vp1, const void *vp2)
+{
+    struct icu_termmap * const * lhs = (struct icu_termmap * const *) vp1;
+    struct icu_termmap * const * rhs = (struct icu_termmap * const *) vp2;
+
+    return strcmp((const char *) (*lhs)->sort_key,
+                  (const char *) (*rhs)->sort_key);
+}
+
+
+
+#if 0
+
int test_icu_casemap(const char * locale, char action,
const char * src8, const char * check8)
{
nmem_destroy(nmem);
}
+#endif
+
// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+#if 0
int test_icu_sortmap(const char * locale, size_t list_len,
const char ** src8_list, const char ** check8_list)
{
return sucess;
}
+#else
+
+/*
+ * Sort src_list with the ICU collator for locale and compare the
+ * outcome with chk_list.
+ *
+ * locale:       ICU locale name handed to ucol_open().
+ * src_list_len: number of entries in both src_list and chk_list.
+ * src_list:     UTF-8 terms to sort; each must be shorter than
+ *               MAX_KEY_SIZE bytes.
+ * chk_list:     the same terms in the expected order.
+ *
+ * Returns 1 when the sorted terms equal chk_list entry by entry.
+ * Returns 0 when they differ (a report is printed to stdout), and also
+ * when the lists are null or empty, the collator cannot be opened,
+ * memory runs out, an ICU call fails, or a term or its sort key does
+ * not fit into struct icu_termmap.
+ */
+int test_icu_sortmap(const char * locale, int src_list_len,
+                     const char ** src_list, const char ** chk_list)
+{
+    int success = 1;
+    int i;
+
+    UErrorCode status = U_ZERO_ERROR;
+
+    UCollator * coll = 0;
+    struct icu_buf_utf8 * buf8 = 0;
+    struct icu_buf_utf16 * buf16 = 0;
+    struct icu_termmap ** list = 0;
+
+    if (src_list_len <= 0 || !src_list || !chk_list)
+        return 0;
+
+    coll = ucol_open(locale, &status);
+    icu_check_status(status);
+
+    if (!U_SUCCESS(status)){
+        success = 0;
+        goto cleanup;
+    }
+
+    buf8 = icu_buf_utf8_create(0);
+    buf16 = icu_buf_utf16_create(0);
+
+    /* heap array instead of a variable-length array on the stack;
+       calloc leaves unused slots null so cleanup can free them all */
+    list = (struct icu_termmap **)
+        calloc((size_t) src_list_len, sizeof(struct icu_termmap *));
+
+    if (!buf8 || !buf16 || !list){
+        success = 0;
+        goto cleanup;
+    }
+
+    // assigning display terms and sort keys using buf 8 and buf16
+    for (i = 0; i < src_list_len; i++){
+        const char * term = src_list[i];
+
+        // the term and its terminator must fit into disp_term
+        if (!term || strlen(term) >= MAX_KEY_SIZE){
+            success = 0;
+            goto cleanup;
+        }
+
+        // zero-filled, so sort_key stays '\0' terminated after the copy
+        list[i] = (struct icu_termmap *)
+            calloc(1, sizeof(struct icu_termmap));
+        if (!list[i]){
+            success = 0;
+            goto cleanup;
+        }
+
+        // copy display term
+        strcpy(list[i]->disp_term, term);
+
+        // transforming to UTF16
+        icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
+        icu_check_status(status);
+
+        // computing sortkeys
+        icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
+        icu_check_status(status);
+
+        // the key must exist and fit into sort_key
+        if (!U_SUCCESS(status)
+            || buf8->utf8_len <= 0 || buf8->utf8_len > MAX_KEY_SIZE){
+            success = 0;
+            goto cleanup;
+        }
+
+        // assigning sortkeys
+        memcpy(list[i]->sort_key, buf8->utf8, (size_t) buf8->utf8_len);
+    }
+
+    // do the sorting
+    qsort(list, (size_t) src_list_len,
+          sizeof(struct icu_termmap *), icu_termmap_cmp);
+
+    // checking correct sorting
+    for (i = 0; i < src_list_len; i++){
+        if (0 != strcmp(list[i]->disp_term, chk_list[i])){
+            success = 0;
+        }
+    }
+
+    if (!success){
+        printf("\nERROR\n");
+        printf("Input str: '%s' : ", locale);
+        for (i = 0; i < src_list_len; i++) {
+            // the unsorted input, not the sorted list
+            printf(" '%s'", src_list[i]);
+        }
+        printf("\n");
+        printf("ICU sort: '%s' : ", locale);
+        for (i = 0; i < src_list_len; i++) {
+            printf(" '%s'", list[i]->disp_term);
+        }
+        printf("\n");
+        printf("Expected: '%s' : ", locale);
+        for (i = 0; i < src_list_len; i++) {
+            printf(" '%s'", chk_list[i]);
+        }
+        printf("\n");
+    }
+
+cleanup:
+    if (list){
+        for (i = 0; i < src_list_len; i++)
+            free(list[i]);
+        free(list);
+    }
+
+    if (coll)
+        ucol_close(coll);
+
+    // both destroy functions ignore a null buffer
+    icu_buf_utf8_destroy(buf8);
+    icu_buf_utf16_destroy(buf16);
+
+    return success;
+}
+
+
+#endif
+
// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
// sucessful tests
size_t en_1_len = 6;
const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
- const char * en_1_cck[6] = {"a", "A", "K", "k", "z", "Z"};
+ const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"};
YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
- YAZ_CHECK(0 == test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
- YAZ_CHECK(0 == test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
- YAZ_CHECK(0 == test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
- YAZ_CHECK(0 == test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
+ YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
+ YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
+ YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
+ YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
- // sucessful tests - this one fails and should not!!!
+ // successful tests
size_t da_1_len = 6;
const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
- YAZ_CHECK(0 == test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
- YAZ_CHECK(0 == test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
+ YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
+ YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
// sucessful tests
size_t de_1_len = 9;
const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
- const char * de_1_cck[9] = {"ä", "a", "o", "ö", "s", "ß", "t", "u", "ü"};
+ const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
- YAZ_CHECK(0 == test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
- YAZ_CHECK(0 == test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
+ YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
+ YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
}
#ifdef HAVE_ICU
- test_icu_I18N_casemap_failures(argc, argv);
- test_icu_I18N_casemap(argc, argv);
+ //test_icu_I18N_casemap_failures(argc, argv);
+ //test_icu_I18N_casemap(argc, argv);
test_icu_I18N_sortmap(argc, argv);
#else