From 94b1547e5951e1e01bf5180159e74095cd0527f4 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Fri, 31 May 2013 23:05:12 +0200 Subject: [PATCH] First go at returning start+offset The offset+size are based on UChar, however. --- include/yaz/icu.h | 16 ++++++++++++++++ include/yaz/icu_I18N.h | 3 ++- src/icu_chain.c | 21 ++++++++++++++++++++- src/icu_tokenizer.c | 6 +++++- src/nmemsdup.c | 2 +- util/yaz-icu.c | 9 +++++++-- 6 files changed, 51 insertions(+), 6 deletions(-) diff --git a/include/yaz/icu.h b/include/yaz/icu.h index 488071b..007e29b 100644 --- a/include/yaz/icu.h +++ b/include/yaz/icu.h @@ -109,6 +109,14 @@ YAZ_EXPORT const char * icu_chain_token_norm(yaz_icu_chain_t chain); */ YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain); +/** \brief returns token as it relates to original text + \param chain ICU chain + \param start offset in original text + \param len number of uchars in original text +*/ +YAZ_EXPORT void icu_chain_get_org_info(yaz_icu_chain_t chain, + size_t *start, size_t *len); + /** \brief ICU tokenizer iterator type (opaque) */ typedef struct icu_iter *yaz_icu_iter_t; @@ -170,6 +178,14 @@ const char *icu_iter_get_display(yaz_icu_iter_t iter); YAZ_EXPORT int icu_iter_get_token_number(yaz_icu_iter_t iter); +/** \brief returns ICU original token start (offset) and length + \param iter ICU tokenizer iterator + \param start offset of last token in original text + \param len length of last token in original text +*/ +YAZ_EXPORT +void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len); + YAZ_END_CDECL #endif /* YAZ_ICU_H */ diff --git a/include/yaz/icu_I18N.h b/include/yaz/icu_I18N.h index b26cb60..d61c007 100644 --- a/include/yaz/icu_I18N.h +++ b/include/yaz/icu_I18N.h @@ -139,7 +139,8 @@ int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, struct icu_buf_utf16 * tkn16, - UErrorCode *status); + UErrorCode *status, + size_t 
*start, size_t *len); int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer); diff --git a/src/icu_chain.c b/src/icu_chain.c index 2ac1960..de2e627 100644 --- a/src/icu_chain.c +++ b/src/icu_chain.c @@ -368,6 +368,8 @@ struct icu_iter { struct icu_buf_utf8 *result; struct icu_buf_utf16 *input; int token_count; + size_t org_start; + size_t org_len; struct icu_chain_step *steps; }; @@ -423,7 +425,8 @@ struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter, } dst = icu_buf_utf16_create(0); iter->status = U_ZERO_ERROR; - if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status)) + if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status, + &iter->org_start, &iter->org_len)) { icu_buf_utf16_destroy(dst); dst = 0; @@ -499,6 +502,8 @@ void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr) /* fill and assign input string.. It will be 0 after first iteration */ icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status); + iter->org_start = 0; + iter->org_len = iter->input->utf16_len; } void icu_iter_destroy(yaz_icu_iter_t iter) @@ -564,6 +569,13 @@ int icu_iter_get_token_number(yaz_icu_iter_t iter) return iter->token_count; } + +void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len) +{ + *start = iter->org_start; + *len = iter->org_len; +} + int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr, UErrorCode *status) { @@ -608,6 +620,13 @@ const char *icu_chain_token_sortkey(struct icu_chain *chain) return 0; } +void icu_chain_get_org_info(struct icu_chain *chain, size_t *start, size_t *len) +{ + if (chain->iter) + icu_iter_get_org_info(chain->iter, start, len); +} + + #endif /* YAZ_HAVE_ICU */ /* diff --git a/src/icu_tokenizer.c b/src/icu_tokenizer.c index 67246ea..7e2fc3f 100644 --- a/src/icu_tokenizer.c +++ b/src/icu_tokenizer.c @@ -155,7 +155,8 @@ int icu_tokenizer_attach(struct icu_tokenizer *tokenizer, int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer, struct 
icu_buf_utf16 *tkn16, - UErrorCode *status) + UErrorCode *status, + size_t *start, size_t *len) { int32_t tkn_start = 0; int32_t tkn_end = 0; @@ -202,6 +203,9 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer, tokenizer->token_start = tkn_start; tokenizer->token_end = tkn_end; + *start = tkn_start; + *len = tkn_end - tkn_start; + /* copying into token buffer if it exists */ if (tkn16) { diff --git a/src/nmemsdup.c b/src/nmemsdup.c index b0ea602..3126aac 100644 --- a/src/nmemsdup.c +++ b/src/nmemsdup.c @@ -33,7 +33,7 @@ char *nmem_strdup_null(NMEM mem, const char *src) char *nmem_strdupn(NMEM mem, const char *src, size_t n) { char *dst = (char *) nmem_malloc(mem, n+1); - memcpy (dst, src, n); + memcpy(dst, src, n); dst[n] = '\0'; return dst; } diff --git a/util/yaz-icu.c b/util/yaz-icu.c index 048e45e..3e0c1d8 100644 --- a/util/yaz-icu.c +++ b/util/yaz-icu.c @@ -473,7 +473,10 @@ static void process_text_file(struct config_t *p_config) success = 0; else { + size_t start, len; const char *sortkey = icu_chain_token_sortkey(p_config->chain); + + icu_chain_get_org_info(p_config->chain, &start, &len); wrbuf_rewind(sw); wrbuf_puts_escaped(sw, sortkey); token_count++; @@ -504,11 +507,13 @@ static void process_text_file(struct config_t *p_config) } else { - fprintf(p_config->outfile, "%lu %lu '%s' '%s'", + fprintf(p_config->outfile, "%lu %lu '%s' '%s' %ld+%ld", token_count, line_count, icu_chain_token_norm(p_config->chain), - icu_chain_token_display(p_config->chain)); + icu_chain_token_display(p_config->chain), + (long) start, + (long) len); if (p_config->sortoutput) { fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw)); -- 1.7.10.4