From d22d19e9cecc3a50dfca023434580b38e53a9ef7 Mon Sep 17 00:00:00 2001
From: Marc Cromme <marc@indexdata.dk>
Date: Mon, 7 May 2007 09:31:36 +0000
Subject: [PATCH] moved working ICU sorting into YAZ unittest test_icu_I18N.c
 commented casemapping out for the time being, need to
 integrate with new dynamic ICU buffers

---
 src/icu_I18N.c      |  237 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/icu_I18N.h      |   62 +++++++++++++-
 src/test_icu_I18N.c |  157 +++++++++++++++++++++++++++++++---
 3 files changed, 437 insertions(+), 19 deletions(-)

diff --git a/src/icu_I18N.c b/src/icu_I18N.c
index c0a7407..6dd150e 100644
--- a/src/icu_I18N.c
+++ b/src/icu_I18N.c
@@ -1,4 +1,4 @@
-/* $Id: icu_I18N.c,v 1.4 2007-05-02 14:01:36 marc Exp $
+/* $Id: icu_I18N.c,v 1.5 2007-05-07 09:31:36 marc Exp $
    Copyright (c) 2006-2007, Index Data.
 
 This file is part of Pazpar2.
@@ -35,6 +35,8 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 #include <yaz/log.h>
 
 #include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
 
 #include <unicode/ustring.h>  /* some more string fcns*/
 #include <unicode/uchar.h>    /* char names           */
@@ -49,6 +51,238 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 /* #include <unicode/unistr.h> */
 
 
+
+
+int icu_check_status (UErrorCode status)
+{
+  /* Warn through yaz_log() when status is an ICU failure code; the status
+     is handed back unchanged either way. */
+  if (U_FAILURE(status))
+    yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
+  return status;
+}
+
+
+
+/* Allocate a UTF-16 buffer with room for `capacity` UChars (none when 0).
+   Returns 0 if memory is exhausted; free with icu_buf_utf16_destroy(). */
+struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
+{
+  struct icu_buf_utf16 * buf16 = calloc(1, sizeof *buf16); /* all fields 0 */
+  if (buf16 && capacity > 0){
+    buf16->utf16 = malloc(sizeof(UChar) * capacity);
+    if (!buf16->utf16){    /* fail cleanly instead of writing through NULL */
+      free(buf16);
+      return 0;
+    }
+    buf16->utf16[0] = (UChar) 0;
+    buf16->utf16_cap = (int32_t) capacity;
+  }
+  return buf16;
+}
+
+
+/* Set the capacity of buf16 to `capacity` UChars and empty it; capacity 0
+   releases the storage.  If the allocation fails the buffer is left
+   unchanged.  Returns buf16 (0 when buf16 is 0). */
+struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
+                                            size_t capacity)
+{
+  if (!buf16)
+    return 0;
+  if (capacity > 0){
+    /* realloc(0, n) acts as malloc(n); a failed realloc keeps the old block */
+    UChar * grown = realloc(buf16->utf16, sizeof(UChar) * capacity);
+    if (!grown)
+      return buf16;
+    buf16->utf16 = grown;
+    buf16->utf16[0] = (UChar) 0;
+    buf16->utf16_cap = (int32_t) capacity;
+  }
+  else {
+    free(buf16->utf16);           /* free(0) is a no-op */
+    buf16->utf16 = 0;
+    buf16->utf16_cap = 0;
+  }
+  buf16->utf16_len = 0;
+  return buf16;
+}
+
+
+/* Free a buffer made by icu_buf_utf16_create(); a 0 pointer is accepted. */
+void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
+{
+  if (!buf16)
+    return;
+  free(buf16->utf16);             /* free(0) is a no-op */
+  free(buf16);
+}
+
+
+
+
+
+
+/* Allocate a byte buffer with room for `capacity` bytes (none when 0).
+   Returns 0 if memory is exhausted; free with icu_buf_utf8_destroy(). */
+struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
+{
+  struct icu_buf_utf8 * buf8 = calloc(1, sizeof *buf8);  /* all fields 0 */
+  if (buf8 && capacity > 0){
+    buf8->utf8 = malloc(sizeof(uint8_t) * capacity);
+    if (!buf8->utf8){      /* fail cleanly instead of writing through NULL */
+      free(buf8);
+      return 0;
+    }
+    buf8->utf8[0] = (uint8_t) 0;
+    buf8->utf8_cap = (int32_t) capacity;
+  }
+  return buf8;
+}
+
+
+
+/* Set the capacity of buf8 to `capacity` bytes and empty it; capacity 0
+   releases the storage.  If the allocation fails the buffer is left
+   unchanged.  Returns buf8 (0 when buf8 is 0). */
+struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
+                                          size_t capacity)
+{
+  if (!buf8)
+    return 0;
+  if (capacity > 0){
+    /* realloc(0, n) acts as malloc(n); a failed realloc keeps the old block */
+    uint8_t * grown = realloc(buf8->utf8, sizeof(uint8_t) * capacity);
+    if (!grown)
+      return buf8;
+    buf8->utf8 = grown;
+    buf8->utf8[0] = (uint8_t) 0;
+    buf8->utf8_cap = (int32_t) capacity;
+  }
+  else {
+    free(buf8->utf8);             /* free(0) is a no-op */
+    buf8->utf8 = 0;
+    buf8->utf8_cap = 0;
+  }
+  buf8->utf8_len = 0;
+  return buf8;
+}
+
+
+
+/* Free a buffer made by icu_buf_utf8_create(); a 0 pointer is accepted. */
+void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
+{
+  if (!buf8)
+    return;
+  free(buf8->utf8);               /* free(0) is a no-op */
+  free(buf8);
+}
+
+
+
+/* Convert the UTF-8 text held in src8 to UTF-16 in dest16, enlarging dest16
+   when it is too small.  On success dest16 holds the NUL-terminated text
+   and its length; otherwise dest16 is emptied.  The ICU error code is
+   left in *status and also returned. */
+UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
+                               struct icu_buf_utf8 * src8,
+                               UErrorCode * status)
+{
+  int32_t utf16_len = 0;
+  u_strFromUTF8(dest16->utf16, dest16->utf16_cap, &utf16_len,
+                (const char *) src8->utf8, src8->utf8_len, status);
+
+  /* No room for the text plus its terminator - an exact fit only earns a
+     warning from ICU, so test the length as well: resize and retry. */
+  if (*status == U_BUFFER_OVERFLOW_ERROR
+      || (U_SUCCESS(*status) && utf16_len >= dest16->utf16_cap)){
+    icu_buf_utf16_resize(dest16, (size_t) utf16_len * 2 + 1);
+    *status = U_ZERO_ERROR;
+    u_strFromUTF8(dest16->utf16, dest16->utf16_cap, &utf16_len,
+                  (const char *) src8->utf8, src8->utf8_len, status);
+  }
+
+  if (U_SUCCESS(*status) && utf16_len < dest16->utf16_cap)
+    dest16->utf16_len = utf16_len;
+  else {
+    if (dest16->utf16)        /* the buffer may own no storage at all */
+      dest16->utf16[0] = (UChar) 0;
+    dest16->utf16_len = 0;
+  }
+  return *status;
+}
+
+ 
+
+/* Convert the '\0' terminated UTF-8 string src8cstr to UTF-16 in dest16,
+   enlarging dest16 when it is too small.  On success dest16 holds the
+   NUL-terminated text and its length; otherwise dest16 is emptied.  The
+   ICU error code is left in *status and also returned. */
+UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
+                                    const char * src8cstr,
+                                    UErrorCode * status)
+{
+  int32_t src8cstr_len = (int32_t) strlen(src8cstr);
+  int32_t utf16_len = 0;
+
+  u_strFromUTF8(dest16->utf16, dest16->utf16_cap, &utf16_len,
+                src8cstr, src8cstr_len, status);
+
+  /* No room for the text plus its terminator - an exact fit only earns a
+     warning from ICU, so test the length as well: resize and retry. */
+  if (*status == U_BUFFER_OVERFLOW_ERROR
+      || (U_SUCCESS(*status) && utf16_len >= dest16->utf16_cap)){
+    icu_buf_utf16_resize(dest16, (size_t) utf16_len * 2 + 1);
+    *status = U_ZERO_ERROR;
+    u_strFromUTF8(dest16->utf16, dest16->utf16_cap, &utf16_len,
+                  src8cstr, src8cstr_len, status);
+  }
+
+  if (U_SUCCESS(*status) && utf16_len < dest16->utf16_cap)
+    dest16->utf16_len = utf16_len;
+  else {
+    if (dest16->utf16)        /* the buffer may own no storage at all */
+      dest16->utf16[0] = (UChar) 0;
+    dest16->utf16_len = 0;
+  }
+
+  return *status;
+}
+
+
+/* Store the collation sort key of src16 under coll in dest8, enlarging
+   dest8 when it is too small.  dest8->utf8_len becomes the key length
+   reported by ucol_getSortKey(), or 0 when no complete key was produced.
+   *status is not modified; its current value is returned. */
+UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
+                                   struct icu_buf_utf8 * dest8,
+                                   struct icu_buf_utf16 * src16,
+                                   UErrorCode * status)
+{
+  int32_t sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
+                                        dest8->utf8, dest8->utf8_cap);
+
+  // check for buffer overflow, resize and retry
+  if (sortkey_len > dest8->utf8_cap) {
+    icu_buf_utf8_resize(dest8, (size_t) sortkey_len * 2);
+    sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
+                                  dest8->utf8, dest8->utf8_cap);
+  }
+
+  /* a failed or truncated call must not leave the previous key's length */
+  if (sortkey_len <= 0 || sortkey_len > dest8->utf8_cap)
+    sortkey_len = 0;
+  dest8->utf8_len = sortkey_len;
+
+  return *status;
+}
+
+
+/* Legacy code below is compiled out until it is integrated with the dynamic ICU buffers above. */
+
+#if 0
+
 // forward declarations for helper functions
 
 int icu_check_status (UErrorCode status);
@@ -277,6 +511,7 @@ char * icu_sortmap(NMEM nmem, char *buf, size_t buf_cap,
     return dest8;
 }
 
+#endif
 
 
 
diff --git a/src/icu_I18N.h b/src/icu_I18N.h
index 6dc7096..eb44204 100644
--- a/src/icu_I18N.h
+++ b/src/icu_I18N.h
@@ -1,4 +1,4 @@
-/* $Id: icu_I18N.h,v 1.4 2007-05-02 14:01:36 marc Exp $
+/* $Id: icu_I18N.h,v 1.5 2007-05-07 09:31:36 marc Exp $
    Copyright (c) 2006-2007, Index Data.
 
 This file is part of Pazpar2.
@@ -27,11 +27,11 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 #include <yaz/nmem.h>
 
 
-//#include <unicode/utypes.h>   /* Basic ICU data types */
-//#include <unicode/uchar.h>    /* char names           */
+#include <unicode/utypes.h>   /* Basic ICU data types */
+#include <unicode/uchar.h>    /* char names           */
 
 //#include <unicode/ustdio.h>
-//#include <unicode/ucol.h> 
+#include <unicode/ucol.h> 
 //#include <unicode/ucnv.h>     /* C   Converter API    */
 //#include <unicode/ustring.h>  /* some more string fcns*/
 //#include <unicode/uloc.h>
@@ -39,7 +39,59 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 //#include <unicode/unistr.h>
 
 
+int icu_check_status (UErrorCode status); // logs ICU failures, returns status
 
+struct icu_buf_utf16
+{
+  UChar * utf16;      // heap storage for the text, 0 while utf16_cap is 0
+  int32_t utf16_len;  // length of the stored text, counted in UChars
+  int32_t utf16_cap;  // size of the storage, counted in UChars
+};
+
+// Create with room for `capacity` UChars, change the capacity of (which
+// empties the buffer) and free a UTF-16 buffer.
+struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
+struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
+                                            size_t capacity);
+void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
+
+struct icu_buf_utf8
+{
+  uint8_t * utf8;    // heap storage (UTF-8 text or a sort key), 0 while utf8_cap is 0
+  int32_t utf8_len;  // number of bytes in use
+  int32_t utf8_cap;  // size of the storage in bytes
+};
+
+// Create, change the capacity of (which empties it) and free a byte buffer.
+struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
+struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
+                                          size_t capacity);
+void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
+
+// Convert the UTF-8 bytes of src8 to UTF-16 in dest16, resizing dest16 if
+// needed.  The ICU error code is stored in *status and returned.
+UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
+                               struct icu_buf_utf8 * src8,
+                               UErrorCode * status);
+
+// Same conversion for a '\0' terminated UTF-8 C string.
+UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
+                                    const char * src8cstr,
+                                    UErrorCode * status);
+
+// Compute the sort key of src16 under collator coll into dest8, resizing
+// dest8 if needed.  Returns *status, which the function never changes.
+UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
+                                   struct icu_buf_utf8 * dest8, 
+                                   struct icu_buf_utf16 * src16,
+                                   UErrorCode * status);
+
+
+
+
+// Legacy declarations below are compiled out until they are ported to the dynamic ICU buffers.
+
+#if 0
 struct icu_termmap
 {
     char * sort_key;   // standard C string '\0' terminated 
@@ -59,6 +111,8 @@ char * icu_sortmap(NMEM nmem, char *buf, size_t buf_cap,
                    size_t *dest8_len,  const char *src8,
                    const char *locale);
 
+#endif // 0
+
 
 #endif // HAVE_ICU
 #endif // ICU_I18NL_H
diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c
index 62b5255..065a8f0 100644
--- a/src/test_icu_I18N.c
+++ b/src/test_icu_I18N.c
@@ -1,4 +1,4 @@
-/* $Id: test_icu_I18N.c,v 1.7 2007-05-02 14:01:36 marc Exp $
+/* $Id: test_icu_I18N.c,v 1.8 2007-05-07 09:31:36 marc Exp $
    Copyright (c) 2006-2007, Index Data.
 
 This file is part of Pazpar2.
@@ -39,10 +39,45 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 #include "icu_I18N.h"
 #include <string.h>
 #include <stdlib.h>
+#include <stdio.h>
+
+
+#include <unicode/ustring.h>  /* some more string fcns*/
+#include <unicode/uchar.h>    /* char names           */
+//#include <unicode/ustdio.h>
+//#include <unicode/utypes.h>   /* Basic ICU data types */
+#include <unicode/ucol.h> 
 
 
 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 
+
+#define MAX_KEY_SIZE 256  /* size in bytes of both arrays in struct icu_termmap */
+
+struct icu_termmap
+{
+  uint8_t sort_key[MAX_KEY_SIZE]; // collation sort key, compared with strcmp()
+  char disp_term[MAX_KEY_SIZE];  // the term itself, standard C utf-8 string
+};
+
+
+
+/* qsort() comparison callback for an array of struct icu_termmap pointers.
+   Entries are ordered by strcmp() on their sort keys. */
+int icu_termmap_cmp(const void *vp1, const void *vp2)
+{
+  const struct icu_termmap *lhs = *(struct icu_termmap * const *) vp1;
+  const struct icu_termmap *rhs = *(struct icu_termmap * const *) vp2;
+  const char *lhs_key = (const char *) lhs->sort_key;
+  const char *rhs_key = (const char *) rhs->sort_key;
+
+  return strcmp(lhs_key, rhs_key);
+}
+
+
+
+#if 0
+
 int test_icu_casemap(const char * locale, char action,
                      const char * src8, const char * check8)
 {
@@ -193,8 +228,11 @@ void test_icu_I18N_casemap_failures(int argc, char **argv)
     nmem_destroy(nmem);
 }
 
+#endif
+
 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 
+#if 0
 int test_icu_sortmap(const char * locale, size_t list_len,
                      const char ** src8_list, const char ** check8_list)
 {
@@ -260,6 +298,97 @@ int test_icu_sortmap(const char * locale, size_t list_len,
     return sucess;
 }
 
+#else
+
+/* Sort src_list with the ICU collation rules of `locale` and compare the
+   outcome with chk_list.  Each term is converted to UTF-16, given a sort
+   key and stored in a struct icu_termmap; the entries are then qsort()ed
+   by key.  Returns 1 when the sorted terms equal chk_list, 0 on a mismatch
+   (reported on stdout) or when any step fails. */
+int test_icu_sortmap(const char * locale, int src_list_len,
+                     const char ** src_list, const char ** chk_list)
+{
+  int success = 1;
+  int created = 0;  // number of list[] entries to free at the end
+  int i;
+  UErrorCode status = U_ZERO_ERROR;
+  struct icu_buf_utf8 * buf8 = 0;
+  struct icu_buf_utf16 * buf16 = 0;
+  struct icu_termmap * list[src_list_len > 0 ? src_list_len : 1];
+
+  UCollator *coll = ucol_open(locale, &status);
+  icu_check_status(status);
+  if (!U_SUCCESS(status))
+    return 0;  // nothing has been allocated yet, so nothing leaks
+
+  buf8 = icu_buf_utf8_create(0);
+  buf16 = icu_buf_utf16_create(0);
+  if (!buf8 || !buf16)
+    success = 0;
+
+  // assigning display terms and sort keys using buf8 and buf16
+  for (i = 0; success && i < src_list_len; i++)
+    {
+      // copy display term, which must fit including its '\0'
+      if (strlen(src_list[i]) >= MAX_KEY_SIZE
+          || !(list[i] = malloc(sizeof *list[i]))){
+        success = 0;
+        break;
+      }
+      created++;
+      strcpy(list[i]->disp_term, src_list[i]);
+
+      // transforming to UTF16
+      icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
+      icu_check_status(status);
+
+      // computing sortkeys
+      icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
+      icu_check_status(status);
+
+      // assigning sortkeys; they must fit and end in '\0' for strcmp()
+      if (!U_SUCCESS(status)
+          || buf8->utf8_len <= 0 || buf8->utf8_len >= MAX_KEY_SIZE)
+        success = 0;
+      else {
+        memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
+        list[i]->sort_key[buf8->utf8_len] = 0;
+      }
+    }
+
+  if (success){
+    // do the sorting
+    qsort(list, src_list_len,
+          sizeof(struct icu_termmap *), icu_termmap_cmp);
+    // checking correct sorting
+    for (i = 0; i < src_list_len; i++){
+      if (0 != strcmp(list[i]->disp_term, chk_list[i]))
+        success = 0;
+    }
+    if (!success){
+      printf("\nERROR\nInput str: '%s' : ", locale);
+      for (i = 0; i < src_list_len; i++)
+        printf(" '%s'", src_list[i]);  // the input order, not the sorted one
+      printf("\nICU sort:  '%s' : ", locale);
+      for (i = 0; i < src_list_len; i++)
+        printf(" '%s'", list[i]->disp_term);
+      printf("\nExpected:  '%s' : ", locale);
+      for (i = 0; i < src_list_len; i++)
+        printf(" '%s'", chk_list[i]);
+      printf("\n");
+    }
+  }
+
+  for (i = 0; i < created; i++)
+    free(list[i]);
+  ucol_close(coll);
+  icu_buf_utf8_destroy(buf8);
+  icu_buf_utf16_destroy(buf16);
+  return success;
+}
+
+#endif
+
 
 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 
@@ -269,27 +398,27 @@ void test_icu_I18N_sortmap(int argc, char **argv)
     // sucessful tests
     size_t en_1_len = 6;
     const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
-    const char * en_1_cck[6] = {"a", "A", "K", "k", "z", "Z"};
+    const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"};
     YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
-    YAZ_CHECK(0 == test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
-    YAZ_CHECK(0 == test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
-    YAZ_CHECK(0 == test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
-    YAZ_CHECK(0 == test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
+    YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
+    YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
+    YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
+    YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
     
-    // sucessful tests - this one fails and should not!!!
+    // successful tests
     size_t da_1_len = 6;
     const char * da_1_src[6] = {"z", "Ã¥", "o", "Ã¦", "a", "Ã¸"};
     const char * da_1_cck[6] = {"a", "o", "z", "Ã¦", "Ã¸", "Ã¥"};
-    YAZ_CHECK(0 == test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
-    YAZ_CHECK(0 == test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
+    YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
+    YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
     
     // sucessful tests
     size_t de_1_len = 9;
     const char * de_1_src[9] = {"u", "Ã¤", "o", "t", "s", "Ã", "Ã¼", "Ã¶", "a"};
-    const char * de_1_cck[9] = {"Ã¤", "a", "o", "Ã¶", "s", "Ã", "t", "u", "Ã¼"};
+    const char * de_1_cck[9] = {"a","Ã¤", "o", "Ã¶", "s", "Ã", "t", "u", "Ã¼"};
     YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
-    YAZ_CHECK(0 == test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
-    YAZ_CHECK(0 == test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
+    YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
+    YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
     
 }
 
@@ -306,8 +435,8 @@ int main(int argc, char **argv)
 
 #ifdef HAVE_ICU
 
-    test_icu_I18N_casemap_failures(argc, argv);
-    test_icu_I18N_casemap(argc, argv);
+    //test_icu_I18N_casemap_failures(argc, argv);
+    //test_icu_I18N_casemap(argc, argv);
     test_icu_I18N_sortmap(argc, argv);
  
 #else
-- 
1.7.10.4