X-Git-Url: http://jsfdemo.indexdata.com/?a=blobdiff_plain;f=src%2Ficu_I18N.c;h=d791abadd5f4b6e7e4c439439dc185327e82f83f;hb=d293bd2470d6d93f44c89ca292ae3dfbb22e9b09;hp=76b9ed7d30a3f901faa74fdcd58c3f9b933142c4;hpb=316bd768750e074a7ba30e24a46f7e95d0302c8c;p=pazpar2-moved-to-github.git diff --git a/src/icu_I18N.c b/src/icu_I18N.c index 76b9ed7..d791aba 100644 --- a/src/icu_I18N.c +++ b/src/icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.c,v 1.10 2007-05-11 09:35:50 marc Exp $ +/* $Id: icu_I18N.c,v 1.16 2007-05-16 19:50:01 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -111,6 +111,23 @@ struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16, }; +struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16) +{ + if(!dest16 || !src16 + || dest16 == src16) + return 0; + + if (dest16->utf16_cap < src16->utf16_len) + icu_buf_utf16_resize(dest16, src16->utf16_len * 2); + + u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len); + dest16->utf16_len = src16->utf16_len; + + return dest16; +}; + + void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16) { if (buf16){ @@ -172,6 +189,23 @@ struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8, }; +struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8, + struct icu_buf_utf8 * src8) +{ + if(!dest8 || !src8 + || dest8 == src8) + return 0; + + + if (dest8->utf8_cap < src8->utf8_len) + icu_buf_utf8_resize(dest8, src8->utf8_len * 2); + + strncpy((char*) dest8->utf8, (char*) src8->utf8, src8->utf8_len); + + return dest8; +}; + + void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8) { @@ -404,7 +438,7 @@ UErrorCode icu_sortkey8_from_utf16(UCollator *coll, dest8->utf8_len = 0; } - return *status; + return sortkey_len; }; @@ -461,13 +495,8 @@ struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, if (U_SUCCESS(*status)) return tokenizer; - // reestablishing zero error state - //if (*status == U_USING_DEFAULT_WARNING) - // *status = U_ZERO_ERROR; - - // freeing if failed - free(tokenizer); + icu_tokenizer_destroy(tokenizer); return 0; }; @@ -611,8 +640,9 @@ struct icu_normalizer * icu_normalizer_create(const char *rules, char action, normalizer->action = action; normalizer->trans = 0; + normalizer->rules16 = icu_buf_utf16_create(0); icu_utf16_from_utf8_cstr(normalizer->rules16, rules, status); - + switch(normalizer->action) { case 'f': normalizer->trans @@ -622,14 +652,14 @@ struct icu_normalizer * icu_normalizer_create(const char *rules, char action, 0, 0, normalizer->parse_error, status); break; -/* case 'b': */ -/* normalizer->trans */ -/* = utrans_openU(normalizer->rules16->utf16, */ -/* normalizer->rules16->utf16_len, */ -/* UTRANS_BACKWARD, */ -/* 0, 0, */ -/* normalizer->parse_error, status); */ -/* break; */ + case 'r': + normalizer->trans + = utrans_openU(normalizer->rules16->utf16, + normalizer->rules16->utf16_len, + UTRANS_REVERSE , + 0, 0, + normalizer->parse_error, status); + break; default: *status = U_UNSUPPORTED_ERROR; return 0; @@ -640,21 +670,383 @@ struct icu_normalizer * icu_normalizer_create(const char *rules, char action, return normalizer; // freeing if failed - free(normalizer); + icu_normalizer_destroy(normalizer); return 0; }; void icu_normalizer_destroy(struct icu_normalizer * normalizer){ if (normalizer) { + if (normalizer->rules16) + icu_buf_utf16_destroy(normalizer->rules16); if (normalizer->trans) - utrans_close (normalizer->trans); + utrans_close(normalizer->trans); free(normalizer); } }; +int icu_normalizer_normalize(struct icu_normalizer * normalizer, + struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16, + UErrorCode *status) +{ + if (!normalizer || !normalizer->trans || !src16 || !dest16) + return 0; + + if (!icu_buf_utf16_copy(dest16, src16)) + return 0; + + utrans_transUChars (normalizer->trans, + dest16->utf16, &(dest16->utf16_len), + dest16->utf16_cap, + 0, &(src16->utf16_len), status); + + if (U_FAILURE(*status)){ + dest16->utf16[0] = (UChar) 0; + dest16->utf16_len = 0; + } + + return dest16->utf16_len; +} + + + + +struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain, + enum icu_chain_step_type type, + const uint8_t * rule, + struct icu_buf_utf16 * buf16, + UErrorCode *status) +{ + struct icu_chain_step * step = 0; + + if(!chain || !type || !rule) + return 0; + + step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step)); + + step->type = type; + step->more_tokens = 0; + + if (buf16) + step->buf16 = buf16; + else + step->buf16 = 0; + + // create auxilary objects + switch(step->type) { + case ICU_chain_step_type_display: + break; + case ICU_chain_step_type_norm: + break; + case ICU_chain_step_type_sort: + break; + case ICU_chain_step_type_charmap: + break; + case ICU_chain_step_type_normalize: + step->u.normalizer = icu_normalizer_create((char *) rule, 'f', status); + break; + case ICU_chain_step_type_tokenize: + step->u.tokenizer = icu_tokenizer_create((char *) chain->locale, + (char) rule[0], status); + break; + default: + break; + } + + return step; +}; + + +void icu_chain_step_destroy(struct icu_chain_step * step){ + + if (!step) + return; + + icu_chain_step_destroy(step->previous); + + switch(step->type) { + case ICU_chain_step_type_display: + break; + case ICU_chain_step_type_norm: + break; + case ICU_chain_step_type_sort: + break; + case ICU_chain_step_type_charmap: + icu_buf_utf16_destroy(step->buf16); + break; + case ICU_chain_step_type_normalize: + icu_normalizer_destroy(step->u.normalizer); + icu_buf_utf16_destroy(step->buf16); + break; + case ICU_chain_step_type_tokenize: + icu_tokenizer_destroy(step->u.tokenizer); + icu_buf_utf16_destroy(step->buf16); + break; + default: + break; + } + + +}; + + + +struct icu_chain * icu_chain_create(const uint8_t * identifier, + const uint8_t * locale) +{ + + struct icu_chain * chain + = (struct icu_chain *) malloc(sizeof(struct icu_chain)); + + strncpy((char *) chain->identifier, (const char *) identifier, 128); + chain->identifier[128 - 1] = '\0'; + strncpy((char *) chain->locale, (const char *) locale, 16); + chain->locale[16 - 1] = '\0'; + + chain->token_count = 0; + + chain->display8 = icu_buf_utf8_create(0); + chain->norm8 = icu_buf_utf8_create(0); + chain->sort8 = icu_buf_utf8_create(0); + + chain->src16 = icu_buf_utf16_create(0); + + chain->steps = 0; + + return chain; +}; + + +void icu_chain_destroy(struct icu_chain * chain) +{ + icu_buf_utf8_destroy(chain->display8); + icu_buf_utf8_destroy(chain->norm8); + icu_buf_utf8_destroy(chain->sort8); + + icu_buf_utf16_destroy(chain->src16); + + icu_chain_step_destroy(chain->steps); +}; + + +struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain, + enum icu_chain_step_type type, + const uint8_t * rule, + UErrorCode *status) +{ + struct icu_chain_step * step = 0; + struct icu_buf_utf16 * src16 = 0; + struct icu_buf_utf16 * buf16 = 0; + + if (!chain || !type || !rule) + return 0; + + // assign utf16 src buffers as needed + if (chain->steps && chain->steps->buf16) + src16 = chain->steps->buf16; + else if (chain->src16) + src16 = chain->src16; + else + return 0; + + + // assign utf16 destination buffers as needed, or + // re-use previous uft18 buffer if this step does not touch it + switch(type) { + case ICU_chain_step_type_display: + buf16 = src16; + break; + case ICU_chain_step_type_norm: + buf16 = src16; + break; + case ICU_chain_step_type_sort: + buf16 = src16; + break; + case ICU_chain_step_type_charmap: + buf16 = icu_buf_utf16_create(0); + break; + case ICU_chain_step_type_normalize: + buf16 = icu_buf_utf16_create(0); + break; + case ICU_chain_step_type_tokenize: + buf16 = icu_buf_utf16_create(0); + break; + default: + break; + } + + // create actual chain step with this buffer + step = icu_chain_step_create(chain, type, rule, buf16, status); + + step->previous = chain->steps; + chain->steps = step; + + return step; +}; + + +int icu_chain_step_next_token(struct icu_chain * chain, + struct icu_chain_step * step, + UErrorCode *status) +{ + struct icu_buf_utf16 * src16 = 0; + + printf("icu_chain_step_next_token %d\n", (int) step); + + if (!chain || !chain->src16 || !step || !step->more_tokens) + return 0; + + // assign utf16 src buffers as neeed, advance in previous steps + // tokens, and setting stop condition + if (step->previous){ + src16 = step->previous->buf16; + step->more_tokens + = icu_chain_step_next_token(chain, step->previous, status); + } + else { // first step can only work once on chain->src16 input buffer + src16 = chain->src16; + step->more_tokens = 1; + } + + // stop if nothing to process + // i.e new token source was not properly assigned + if (!step->more_tokens || !src16 || !src16->utf16_len) // + return 0; + + printf("icu_chain_step_next_token %d working\n", (int) step); + + + // perform the work, eventually put this steps output in + // step->buf16 or the chains UTF8 output buffers + switch(step->type) { + case ICU_chain_step_type_display: + icu_utf16_to_utf8(chain->display8, src16, status); + break; + case ICU_chain_step_type_norm: + icu_utf16_to_utf8(chain->norm8, src16, status); + break; + case ICU_chain_step_type_sort: + icu_utf16_to_utf8(chain->sort8, src16, status); + break; + case ICU_chain_step_type_charmap: + break; + case ICU_chain_step_type_normalize: + icu_normalizer_normalize(step->u.normalizer, + step->buf16, src16, status); + break; + case ICU_chain_step_type_tokenize: + icu_tokenizer_attach(step->u.tokenizer, src16, status); + step->more_tokens + = icu_tokenizer_next_token(step->u.tokenizer, + step->buf16, status); + break; + default: + return 0; + break; + } + + + // stop further token processing if last step + if (!step->previous) + step->more_tokens = 0; + + + if (U_FAILURE(*status)) + return 0; + + return 1; +}; + + + +int icu_chain_assign_cstr(struct icu_chain * chain, + const char * src8cstr, + UErrorCode *status) +{ + struct icu_chain_step * stp = chain->steps; + + if (!chain || !src8cstr) + return 0; + + // clear token count + chain->token_count = 0; + + // clear all steps stop states + + while (stp){ + stp->more_tokens = 1; + stp = stp->previous; + } + + // finally convert UTF8 to UTF16 string + icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status); + + if (U_FAILURE(*status)) + return 0; + + return 1; +}; + + + +int icu_chain_next_token(struct icu_chain * chain, + UErrorCode *status) +{ + int success = 0; + + if (!chain || !chain->steps) + return 0; + + success = icu_chain_step_next_token(chain, chain->steps, status); + + if (success){ + chain->token_count++; + return chain->token_count; + } + + return 0; +}; + +int icu_chain_get_token_count(struct icu_chain * chain) +{ + if (!chain) + return 0; + + return chain->token_count; +}; + + + +const char * icu_chain_get_display(struct icu_chain * chain) +{ + if (chain->display8) + return (const char *) chain->display8->utf8; + + return 0; +}; + +const char * icu_chain_get_norm(struct icu_chain * chain) +{ + if (chain->norm8) + return (const char *) chain->norm8->utf8; + + return 0; +}; + +const char * icu_chain_get_sort(struct icu_chain * chain) +{ + if (chain->sort8) + return (const char *) chain->sort8->utf8; + + return 0; +}; + + + + #endif // HAVE_ICU