-/* $Id: icu_I18N.c,v 1.17 2007-05-20 19:00:17 marc Exp $
+/* $Id: icu_I18N.c,v 1.18 2007-05-21 10:14:08 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
void icu_chain_destroy(struct icu_chain * chain)
{
- icu_buf_utf8_destroy(chain->display8);
- icu_buf_utf8_destroy(chain->norm8);
- icu_buf_utf8_destroy(chain->sort8);
+ if (chain){
+ icu_buf_utf8_destroy(chain->display8);
+ icu_buf_utf8_destroy(chain->norm8);
+ icu_buf_utf8_destroy(chain->sort8);
+
+ icu_buf_utf16_destroy(chain->src16);
+
+ icu_chain_step_destroy(chain->steps);
+ }
+};
+
+
+
+/*
+ * Build an ICU chain from an XML element of the form
+ *
+ *   <icu_chain id="..." locale="...">
+ *     <normalize rule="..."/> <tokenize rule="..."/> <casemap rule="..."/>
+ *     <display/> <normal/> <sort/>
+ *   </icu_chain>
+ *
+ * Child elements are turned into chain steps in document order; the
+ * "rule" attribute is forwarded for casemap/normalize/tokenize, while
+ * display/normal/sort steps get an empty rule. Non-element children
+ * (text, comments) are skipped.
+ *
+ * Returns the new chain, or 0 when xml_node is null, is not an element
+ * named "icu_chain", lacks a non-empty "id" or "locale" attribute, the
+ * chain cannot be created, or any child (including one with an unknown
+ * name) fails to yield a step / sets a failing *status. On such a late
+ * failure icu_chain_destroy() is called on the partial chain.
+ */
+struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
+                                        UErrorCode * status)
+{
+    xmlNode *node = 0;
+    struct icu_chain * chain = 0;
+    xmlChar *xml_id = 0;
+    xmlChar *xml_locale = 0;
+
+    if (!xml_node
+        || xml_node->type != XML_ELEMENT_NODE
+        || strcmp((const char *) xml_node->name, "icu_chain"))
+        return 0;
+
+    xml_id = xmlGetProp(xml_node, (xmlChar *) "id");
+    xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
+
+    /* both attributes are mandatory and must be non-empty */
+    if (xml_id && strlen((const char *) xml_id)
+        && xml_locale && strlen((const char *) xml_locale))
+        chain = icu_chain_create((const uint8_t *) xml_id,
+                                 (const uint8_t *) xml_locale);
+
+    /* xmlGetProp() returns copies owned by the caller; release them on
+       every path.
+       NOTE(review): assumes icu_chain_create() copies identifier and
+       locale rather than keeping these pointers -- confirm. */
+    if (xml_id)
+        xmlFree(xml_id);
+    if (xml_locale)
+        xmlFree(xml_locale);
+
+    if (!chain)
+        return 0;
+
+    for (node = xml_node->children; node; node = node->next)
+    {
+        xmlChar *xml_rule = 0;
+        struct icu_chain_step * step = 0;
+        const char *name = 0;
+
+        if (node->type != XML_ELEMENT_NODE)
+            continue;
+
+        name = (const char *) node->name;
+
+        /* may be 0 when the attribute is absent; passed through as is */
+        xml_rule = xmlGetProp(node, (xmlChar *) "rule");
+
+        if (!strcmp(name, "casemap"))
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
+                                         (const uint8_t *) xml_rule, status);
+        else if (!strcmp(name, "normalize"))
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
+                                         (const uint8_t *) xml_rule, status);
+        else if (!strcmp(name, "tokenize"))
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
+                                         (const uint8_t *) xml_rule, status);
+        else if (!strcmp(name, "display"))
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
+                                         (const uint8_t *) "", status);
+        else if (!strcmp(name, "normal"))
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_norm,
+                                         (const uint8_t *) "", status);
+        else if (!strcmp(name, "sort"))
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_sort,
+                                         (const uint8_t *) "", status);
- icu_buf_utf16_destroy(chain->src16);
+
+        /* the rule copy is no longer needed once the step is inserted.
+           NOTE(review): assumes icu_chain_insert_step() does not keep
+           the rule pointer -- confirm. */
+        if (xml_rule)
+            xmlFree(xml_rule);
+
+        /* unknown element names leave step == 0 and abort the config */
+        if (!step || U_FAILURE(*status)) {
+            icu_chain_destroy(chain);
+            return 0;
+        }
+    }
- icu_chain_step_destroy(chain->steps);
+    return chain;
};
+
struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
enum icu_chain_step_type type,
const uint8_t * rule,
const char * src8cstr,
UErrorCode *status)
{
- struct icu_chain_step * stp = chain->steps;
+ struct icu_chain_step * stp = 0;
if (!chain || !src8cstr)
return 0;
+
+ stp = chain->steps;
// clear token count
chain->token_count = 0;
-/* $Id: icu_I18N.h,v 1.15 2007-05-20 19:00:17 marc Exp $
+/* $Id: icu_I18N.h,v 1.16 2007-05-21 10:14:08 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
#include <yaz/nmem.h>
+#include <libxml/parser.h>
+#include <libxml/tree.h>
#include <unicode/utypes.h> /* Basic ICU data types */
#include <unicode/uchar.h> /* char names */
struct icu_chain * icu_chain_create(const uint8_t * identifier,
const uint8_t * locale);
+
void icu_chain_destroy(struct icu_chain * chain);
+/*
+ * Create an ICU chain from an XML element named "icu_chain" carrying
+ * non-empty "id" and "locale" attributes. Child elements casemap,
+ * normalize, tokenize (each with a "rule" attribute), display, normal
+ * and sort are inserted as chain steps in document order.
+ * Returns 0 if the node is null or not such an element, if an
+ * attribute is missing or empty, or if any child cannot be turned
+ * into a step or leaves *status in a failure state.
+ */
+struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
+ UErrorCode * status);
+
+
struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
enum icu_chain_step_type type,
const uint8_t * rule,
-/* $Id: test_icu_I18N.c,v 1.22 2007-05-20 19:00:17 marc Exp $
+/* $Id: test_icu_I18N.c,v 1.23 2007-05-21 10:14:08 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
void test_icu_I18N_chain(int argc, char **argv)
{
const char * en_str
- = "O Romeo, Romeo! wherefore art\nthou\tRomeo?";
+ = "O Romeo, Romeo! wherefore art thou\t Romeo?";
printf("ICU chain:\ninput: '%s'\n", en_str);
UErrorCode status = U_ZERO_ERROR;
- struct icu_chain_step * step = 0;
- struct icu_chain * chain
- = icu_chain_create((uint8_t *) "en:word", (uint8_t *) "en");
+ //struct icu_chain_step * step = 0;
+ struct icu_chain * chain = 0;
+
+
+ const char * xml_str = "<icu_chain id=\"en:word\" locale=\"en\">"
+ "<normalize rule=\"[:Control:] Any-Remove\"/>"
+ "<tokenize rule=\"l\"/>"
+ "<normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
+ "<display/>"
+ "<casemap rule=\"l\"/>"
+ "<normal/>"
+ "<sort/>"
+ "</icu_chain>";
+
+
+ xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
+ xmlNode *xml_node = xmlDocGetRootElement(doc);
+ YAZ_CHECK(xml_node);
+
+
+ chain = icu_chain_xml_config(xml_node, &status);
+
+#if 0
+ chain = icu_chain_create((uint8_t *) "en:word", (uint8_t *) "en");
step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
(const uint8_t *) "[:Control:] Any-Remove",
&status);
/* (const uint8_t *)"", */
/* &status); */
+#endif
-
+ YAZ_CHECK(chain);
YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status));
icu_chain_get_display(chain));
}
+ YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 7);
+
icu_chain_destroy(chain);
}