From: Marc Cromme <marc@indexdata.dk>
Date: Thu, 10 May 2007 12:11:42 +0000 (+0000)
Subject: started ICU transliterator integration for more complex normalization rules than lowercasing
X-Git-Tag: PAZPAR2.1.0.0~160
X-Git-Url: http://jsfdemo.indexdata.com/cgi-bin?a=commitdiff_plain;h=22ab526ca79529370276260b37538c676b3816ee;p=pazpar2-moved-to-github.git

started ICU transliterator integration for more complex normalization rules than lowercasing
---

diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c
index 151ae09..ce795e6 100644
--- a/src/test_icu_I18N.c
+++ b/src/test_icu_I18N.c
@@ -1,4 +1,4 @@
-/* $Id: test_icu_I18N.c,v 1.13 2007-05-10 11:53:47 marc Exp $
+/* $Id: test_icu_I18N.c,v 1.14 2007-05-10 12:11:42 marc Exp $
    Copyright (c) 2006-2007, Index Data.
 
    This file is part of Pazpar2.
@@ -314,9 +314,89 @@ void test_icu_I18N_sortmap(int argc, char **argv)
 
 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 
-void test_icu_I18N_normmap(int argc, char **argv)
+/* Test stub for ICU transliterator support.  Everything inside the
+   function is compiled out with "#if 0", so calling it does nothing yet.
+   argc and argv are accepted but not used by the visible code. */
+void test_icu_I18N_transliterator(int argc, char **argv)
 {
 
+  /* setting up transliterator */
+
+#if 0
+    /* NOTE(review): trans, id_len, ustr16, ustr16_len and ustr16_cap are
+       not declared in this function; declare them before enabling this. */
+
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError parse_error[256];
+
+    int32_t id_cap = 256;
+    UChar id[256];
+    id[0] = 0;
+
+    trans = utrans_openU(id, id_len, UTRANS_FORWARD,
+                         0, 0, parse_error, &status);
+
+    /* print the parse position when opening the transliterator failed */
+    if(U_FAILURE(status)) {
+      printf("Parse Error: line %d offset %d \n",
+              parse_error->line, parse_error->offset);
+    }
+    icu_check_status(status);
+
+    /* the initial limit is the current length of the string */
+  int32_t ustr16_lim = *ustr16_len;
+    /* Transliterate a segment of a UChar* string */
+
+    utrans_transUChars (trans, ustr16, &*ustr16_len,
+                        ustr16_cap,
+                        0, &ustr16_lim, &status);
+
+    utrans_close (trans);
+
+    /* cheat sheet of UnicodeSet patterns and transliteration rules */
+    printf("\n\nUnicode Set Patterns:\n"
+             "   Pattern         Description\n"
+             "   Ranges          [a-z]  The lower case letters a through z\n"
+             "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
+             "   String          [abc{def}] chars a, b and c, and string 'def'\n"
+             "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
+             "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
+             "\n"
+             "   Combination     Example\n"
+             "   Union           [[:Greek:] [:letter:]]\n"
+             "   Intersection    [[:Greek:] & [:letter:]]\n"
+             "   Set Complement  [[:Greek:] - [:letter:]]\n"
+             "   Complement      [^[:Greek:] [:letter:]]\n"
+             "\n"
+             "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
+             "\n"
+             "Examples:\n"
+             "   [:Punctuation:] Any-Remove\n"
+             "   [:Cased-Letter:] Any-Upper\n"
+             "   [:Control:] Any-Remove\n"
+             "   [:Decimal_Number:] Any-Remove\n"
+             "   [:Final_Punctuation:] Any-Remove\n"
+             "   [:Georgian:] Any-Upper\n"
+             "   [:Katakana:] Any-Remove\n"
+             "   [:Arabic:] Any-Remove\n"
+             "   [:Punctuation:] Remove\n"
+             "   [[:Punctuation:]-[.,]] Remove\n"
+             "   [:Line_Separator:] Any-Remove\n"
+             "   [:Math_Symbol:] Any-Remove\n"
+             "   Lower; [:^Letter:] Remove (word tokenization)\n"
+             "   [:^Number:] Remove (numeric tokenization)\n"
+             "   [:^Katakana:] Remove (remove everything except Katakana)\n"
+             "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
+             "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
+             "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katakana to hiragana)\n"
+             "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
+             "\n"
+             "see http://icu.sourceforge.net/userguide/Transform.html\n"
+             "    http://www.unicode.org/Public/UNIDATA/UCD.html\n"
+             "    http://icu.sourceforge.net/userguide/Transform.html\n"
+             "    http://icu.sourceforge.net/userguide/TransformRule.html\n"
+             );
+#endif
 
 }
 
@@ -430,7 +510,7 @@ int main(int argc, char **argv)
     //test_icu_I18N_casemap_failures(argc, argv);
     test_icu_I18N_casemap(argc, argv);
     test_icu_I18N_sortmap(argc, argv);
-    test_icu_I18N_normmap(argc, argv);
+    test_icu_I18N_transliterator(argc, argv);
     test_icu_I18N_tokenizer(argc, argv);
 
 #else // HAVE_ICU