The offset+size however are based on UChar however.
*/
YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain);
+/** \brief returns last token's position as it relates to the original text
+    \param chain ICU chain
+    \param start offset in original text
+    \param len number of UChars in original text
+*/
+YAZ_EXPORT void icu_chain_get_org_info(yaz_icu_chain_t chain,
+ size_t *start, size_t *len);
+
/** \brief ICU tokenizer iterator type (opaque) */
typedef struct icu_iter *yaz_icu_iter_t;
YAZ_EXPORT
int icu_iter_get_token_number(yaz_icu_iter_t iter);
+/** \brief returns ICU original token start (offset) and length
+ \param iter ICU tokenizer iterator
+ \param start offset of last token in original text
+ \param len length of last token in original text
+*/
+YAZ_EXPORT
+void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len);
+
YAZ_END_CDECL
#endif /* YAZ_ICU_H */
int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
struct icu_buf_utf16 * tkn16,
- UErrorCode *status);
+ UErrorCode *status,
+ size_t *start, size_t *len);
int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
struct icu_buf_utf8 *result;
struct icu_buf_utf16 *input;
int token_count;
+ size_t org_start;
+ size_t org_len;
struct icu_chain_step *steps;
};
}
dst = icu_buf_utf16_create(0);
iter->status = U_ZERO_ERROR;
- if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status))
+ if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status,
+ &iter->org_start, &iter->org_len))
{
icu_buf_utf16_destroy(dst);
dst = 0;
/* fill and assign input string.. It will be 0 after
first iteration */
icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status);
+ iter->org_start = 0;
+ iter->org_len = iter->input->utf16_len;
}
void icu_iter_destroy(yaz_icu_iter_t iter)
return iter->token_count;
}
+
+/* Copy out where the last token was found in the original text.
+   Offsets/lengths are in UChar (UTF-16) units, per the header comment;
+   values are whatever the tokenizer stored in org_start/org_len. */
+void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len)
+{
+    *start = iter->org_start;
+    *len = iter->org_len;
+}
+
int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
UErrorCode *status)
{
return 0;
}
+/** \brief returns last token's start/length in the original text
+    \param chain ICU chain
+    \param start resulting offset (UChar units) in original text
+    \param len resulting length (UChar units) in original text
+
+    If no tokenization has taken place yet (chain->iter is 0) both
+    outputs are set to 0 rather than left uninitialized — callers
+    read *start / *len unconditionally.
+*/
+void icu_chain_get_org_info(struct icu_chain *chain, size_t *start, size_t *len)
+{
+    if (chain->iter)
+        icu_iter_get_org_info(chain->iter, start, len);
+    else
+    {
+        /* no iterator: report an empty span instead of garbage */
+        *start = 0;
+        *len = 0;
+    }
+}
+
+
#endif /* YAZ_HAVE_ICU */
/*
int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
struct icu_buf_utf16 *tkn16,
- UErrorCode *status)
+ UErrorCode *status,
+ size_t *start, size_t *len)
{
int32_t tkn_start = 0;
int32_t tkn_end = 0;
tokenizer->token_start = tkn_start;
tokenizer->token_end = tkn_end;
+ *start = tkn_start;
+ *len = tkn_end - tkn_start;
+
/* copying into token buffer if it exists */
if (tkn16)
{
+/** \brief allocates a NUL-terminated copy of the first n bytes of src
+    \param mem NMEM handle the copy is allocated from
+    \param src source bytes (need not be NUL-terminated; must hold n bytes)
+    \param n number of bytes to copy
+    \returns newly allocated string of length n
+
+    NOTE(review): allocation comes from nmem_malloc, so the result's
+    lifetime presumably follows the NMEM block — confirm against NMEM docs.
+*/
char *nmem_strdupn(NMEM mem, const char *src, size_t n)
{
    char *dst = (char *) nmem_malloc(mem, n+1);
-    memcpy (dst, src, n);
+    memcpy(dst, src, n);
    dst[n] = '\0';
    return dst;
}
success = 0;
else
{
+ size_t start, len;
const char *sortkey = icu_chain_token_sortkey(p_config->chain);
+
+ icu_chain_get_org_info(p_config->chain, &start, &len);
wrbuf_rewind(sw);
wrbuf_puts_escaped(sw, sortkey);
token_count++;
}
else
{
- fprintf(p_config->outfile, "%lu %lu '%s' '%s'",
+ fprintf(p_config->outfile, "%lu %lu '%s' '%s' %ld+%ld",
token_count,
line_count,
icu_chain_token_norm(p_config->chain),
- icu_chain_token_display(p_config->chain));
+ icu_chain_token_display(p_config->chain),
+ (long) start,
+ (long) len);
if (p_config->sortoutput)
{
fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));