*/
YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain);
+/** \brief ICU tokenizer iterator type (opaque) */
+typedef struct icu_iter *yaz_icu_iter_t;
+
+/** \brief create ICU tokenizer iterator from chain
+ \param chain ICU chain
+ \returns ICU iterator
+*/
+YAZ_EXPORT
+yaz_icu_iter_t icu_iter_create(struct icu_chain *chain);
+
+/** \brief starts iteration over string
+ \param iter ICU tokenizer iterator
+ \param src8cstr input string (0-terminated)
+
+ Call icu_iter_next to iterate over each token.
+*/
+YAZ_EXPORT
+void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr);
+
+/** \brief iterates over one token
+ \param iter ICU tokenizer iterator
+ \retval 0 no more tokens (EOF); also returned when called before
+           icu_iter_first has supplied an input string
+ \retval 1 got one token (read it with the icu_iter_get_* functions)
+*/
+YAZ_EXPORT
+int icu_iter_next(yaz_icu_iter_t iter);
+
+/** \brief destroy ICU tokenizer iterator
+ \param iter ICU tokenizer iterator
+*/
+YAZ_EXPORT
+void icu_iter_destroy(yaz_icu_iter_t iter);
+
+/** \brief returns ICU normalized token
+ \param iter ICU tokenizer iterator
+ \returns string (0-terminated)
+*/
+YAZ_EXPORT
+const char *icu_iter_get_norm(yaz_icu_iter_t iter);
+
+/** \brief returns ICU sortkey string
+ \param iter ICU tokenizer iterator
+ \returns string (0-terminated)
+*/
+YAZ_EXPORT
+const char *icu_iter_get_sortkey(yaz_icu_iter_t iter);
+
+/** \brief returns ICU display string
+ \param iter ICU tokenizer iterator
+ \returns string (0-terminated)
+*/
+YAZ_EXPORT
+const char *icu_iter_get_display(yaz_icu_iter_t iter);
+
+/** \brief returns ICU token count for iterator
+ \param iter ICU tokenizer iterator
+ \returns token count (1, 2, 3...)
+
+ The result is an integer count, not a string, so the return type is int.
+*/
+YAZ_EXPORT
+int icu_iter_get_token_number(yaz_icu_iter_t iter);
+
YAZ_END_CDECL
#endif /* YAZ_ICU_H */
int sort,
UErrorCode * status);
-
-struct icu_iter;
-struct icu_iter *icu_iter_create(struct icu_chain *chain);
-void icu_iter_first(struct icu_iter *iter, const char *src8cstr);
-void icu_iter_destroy(struct icu_iter *iter);
-int icu_iter_next(struct icu_iter *iter, struct icu_buf_utf8 *result);
-const char *icu_iter_get_sortkey(struct icu_iter *iter);
-const char *icu_iter_get_display(struct icu_iter *iter);
-
#endif /* ICU_I18NL_H */
/*
struct icu_chain
{
- struct icu_iter *iter;
+ yaz_icu_iter_t iter;
char *locale;
int sort;
UCollator * coll;
- /* utf8 output buffers */
- struct icu_buf_utf8 * norm8;
-
/* linked list of chain steps */
struct icu_chain_step * csteps;
};
if (!chain || !type || !rule)
return 0;
- step = (struct icu_chain_step *) xmalloc(sizeof(struct icu_chain_step));
+ step = (struct icu_chain_step *) xmalloc(sizeof(*step));
step->type = type;
/* create auxiliary objects */
UErrorCode * status)
{
struct icu_chain * chain
- = (struct icu_chain *) xmalloc(sizeof(struct icu_chain));
+ = (struct icu_chain *) xmalloc(sizeof(*chain));
*status = U_ZERO_ERROR;
if (U_FAILURE(*status))
return 0;
- chain->norm8 = icu_buf_utf8_create(0);
chain->csteps = 0;
return chain;
if (chain->coll)
ucol_close(chain->coll);
- icu_buf_utf8_destroy(chain->norm8);
if (chain->iter)
icu_iter_destroy(chain->iter);
icu_chain_step_destroy(chain->csteps);
UErrorCode status;
struct icu_buf_utf8 *display;
struct icu_buf_utf8 *sort8;
+ struct icu_buf_utf8 *result;
struct icu_buf_utf16 *input;
int token_count;
struct icu_chain_step *steps;
icu_buf_utf8_destroy(dst8);
}
-struct icu_buf_utf16 *icu_iter_invoke(struct icu_iter *iter,
+struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter,
struct icu_chain_step *step,
struct icu_buf_utf16 *src)
{
}
}
-struct icu_iter *icu_iter_create(struct icu_chain *chain)
+yaz_icu_iter_t icu_iter_create(struct icu_chain *chain)
{
- struct icu_iter *iter = xmalloc(sizeof(*iter));
+ yaz_icu_iter_t iter = xmalloc(sizeof(*iter));
iter->chain = chain;
iter->status = U_ZERO_ERROR;
iter->display = icu_buf_utf8_create(0);
iter->sort8 = icu_buf_utf8_create(0);
+ iter->result = icu_buf_utf8_create(0);
iter->last = 0; /* no last returned string (yet) */
iter->steps = icu_chain_step_clone(chain->csteps);
iter->input = 0;
return iter;
}
-void icu_iter_first(struct icu_iter *iter, const char *src8cstr)
+void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
{
if (iter->input)
icu_buf_utf16_destroy(iter->input);
icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status);
}
-void icu_iter_destroy(struct icu_iter *iter)
+void icu_iter_destroy(yaz_icu_iter_t iter)
{
if (iter)
{
icu_buf_utf8_destroy(iter->display);
icu_buf_utf8_destroy(iter->sort8);
+ icu_buf_utf8_destroy(iter->result);
if (iter->input)
icu_buf_utf16_destroy(iter->input);
icu_chain_step_destroy(iter->steps);
}
}
-int icu_iter_next(struct icu_iter *iter, struct icu_buf_utf8 *result)
+int icu_iter_next(yaz_icu_iter_t iter)
{
if (!iter->input && iter->last == 0)
return 0;
iter->sort8, iter->last,
&iter->status);
}
- icu_utf16_to_utf8(result, iter->last, &iter->status);
+ icu_utf16_to_utf8(iter->result, iter->last, &iter->status);
icu_buf_utf16_destroy(iter->last);
return 1;
}
}
-const char *icu_iter_get_sortkey(struct icu_iter *iter)
+const char *icu_iter_get_norm(yaz_icu_iter_t iter)
+{
+ return icu_buf_utf8_to_cstr(iter->result);
+}
+
+const char *icu_iter_get_sortkey(yaz_icu_iter_t iter)
{
return icu_buf_utf8_to_cstr(iter->sort8);
}
-const char *icu_iter_get_display(struct icu_iter *iter)
+const char *icu_iter_get_display(yaz_icu_iter_t iter)
{
return icu_buf_utf8_to_cstr(iter->display);
}
+/* Returns the iterator's token_count field. That field is an int, so the
+   function returns int rather than a string pointer. */
+int icu_iter_get_token_number(yaz_icu_iter_t iter)
+{
+ return iter->token_count;
+}
+
int icu_chain_assign_cstr(struct icu_chain * chain, const char * src8cstr,
UErrorCode *status)
{
int icu_chain_next_token(struct icu_chain * chain, UErrorCode *status)
{
*status = U_ZERO_ERROR;
- return icu_iter_next(chain->iter, chain->norm8);
+ return icu_iter_next(chain->iter);
}
int icu_chain_token_number(struct icu_chain * chain)
const char * icu_chain_token_norm(struct icu_chain * chain)
{
- if (chain->norm8)
- return icu_buf_utf8_to_cstr(chain->norm8);
+ if (chain->iter)
+ return icu_iter_get_norm(chain->iter);
return 0;
}
UErrorCode status = U_ZERO_ERROR;
struct icu_chain * chain = 0;
xmlNode *xml_node;
- struct icu_iter *iter;
- struct icu_buf_utf8 *token;
+ yaz_icu_iter_t iter;
const char * xml_str = "<icu locale=\"en\">"
"<tokenize rule=\"w\"/>"
YAZ_CHECK(iter);
if (!iter)
return;
- token = icu_buf_utf8_create(0);
- while (icu_iter_next(iter, token))
+ while (icu_iter_next(iter))
{
- yaz_log(YLOG_LOG, "[%.*s]", (int) token->utf8_len, token->utf8);
+ yaz_log(YLOG_LOG, "[%s]", icu_iter_get_norm(iter));
}
- icu_buf_utf8_destroy(token);
-
icu_iter_destroy(iter);
icu_chain_destroy(chain);
}
static int test_iter(struct icu_chain *chain, const char *input,
const char *expected)
{
- struct icu_iter *iter = icu_iter_create(chain);
+ yaz_icu_iter_t iter = icu_iter_create(chain);
WRBUF result, second;
int success = 1;
- struct icu_buf_utf8 *token;
if (!iter)
{
return 0;
}
- token = icu_buf_utf8_create(0);
-
- if (icu_iter_next(iter, token))
+ if (icu_iter_next(iter))
{
yaz_log(YLOG_WARN, "test_iter: expecting 0 before icu_iter_first");
return 0;
result = wrbuf_alloc();
icu_iter_first(iter, input);
- while (icu_iter_next(iter, token))
+ while (icu_iter_next(iter))
{
wrbuf_puts(result, "[");
- wrbuf_write(result, (const char *) token->utf8, (int) token->utf8_len);
+ wrbuf_puts(result, icu_iter_get_norm(iter));
wrbuf_puts(result, "]");
}
-
second = wrbuf_alloc();
icu_iter_first(iter, input);
- while (icu_iter_next(iter, token))
+ while (icu_iter_next(iter))
{
wrbuf_puts(second, "[");
- wrbuf_write(second, (const char *) token->utf8, (int) token->utf8_len);
+ wrbuf_puts(second, icu_iter_get_norm(iter));
wrbuf_puts(second, "]");
}
- icu_buf_utf8_destroy(token);
icu_iter_destroy(iter);
if (strcmp(expected, wrbuf_cstr(result)))