1 /* $Id: icu_I18N.c,v 1.8 2007-05-09 14:01:21 marc Exp $
2 Copyright (c) 2006-2007, Index Data.
4 This file is part of Pazpar2.
6 Pazpar2 is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with Pazpar2; see the file LICENSE. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
28 #include <yaz/timing.h>
41 #include <unicode/ustring.h> /* some more string fcns*/
42 #include <unicode/uchar.h> /* char names */
45 //#include <unicode/ustdio.h>
46 //#include <unicode/utypes.h> /* Basic ICU data types */
47 #include <unicode/ucol.h>
48 //#include <unicode/ucnv.h> /* C Converter API */
49 //#include <unicode/uloc.h>
50 //#include <unicode/ubrk.h>
51 /* #include <unicode/unistr.h> */
/*
 * Tests an ICU status code with U_FAILURE(). On failure the numeric code
 * and its u_errorName() text are formatted with "ICU: %d %s\n".
 *
 * NOTE(review): the logging call that consumes the format string and both
 * return statements are elided from this extract - confirm which int value
 * means "ok" before relying on the result.
 */
56 int icu_check_status (UErrorCode status)
58 if(U_FAILURE(status)){
60 "ICU: %d %s\n", status, u_errorName(status));
/*
 * Allocates a struct icu_buf_utf16 with malloc() and gives it a UChar array
 * of `capacity` elements. The first element is set to 0 and utf16_cap
 * records the requested capacity.
 *
 * NOTE(review): no NULL check on either malloc() result is visible in this
 * extract; utf16[0] is written directly after the array allocation. Confirm
 * that the elided lines guard capacity == 0 and allocation failure.
 */
69 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
71 struct icu_buf_utf16 * buf16
72 = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16));
79 buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
80 buf16->utf16[0] = (UChar) 0;
81 buf16->utf16_cap = capacity;
/*
 * Changes the storage of buf16 to `capacity` UChar elements: malloc() when
 * the buffer has no array yet, realloc() otherwise. After (re)allocation the
 * first element is overwritten with 0, so callers should treat any previous
 * string content as discarded.
 *
 * NOTE(review): the realloc() result appears to be assigned straight back
 * (its left-hand side is on an elided line); if that is buf16->utf16, the old
 * array leaks when realloc() fails, and utf16[0] is then written through a
 * NULL pointer. Confirm against the full definition.
 * NOTE(review): the two trailing assignments zero utf16_len and utf16_cap -
 * presumably the capacity == 0 branch; the branch structure is not visible.
 */
87 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
92 if (0 == buf16->utf16)
93 buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
96 = (UChar *) realloc(buf16->utf16, sizeof(UChar) * capacity);
/* Reset to an empty string; the old content is not kept as a valid string. */
97 buf16->utf16[0] = (UChar) 0;
99 buf16->utf16_cap = capacity;
105 buf16->utf16_len = 0;
106 buf16->utf16_cap = 0;
/*
 * NOTE(review): only the signature is visible in this extract. By its name
 * this is presumably the teardown counterpart of icu_buf_utf16_create() -
 * confirm what it frees and whether it accepts NULL.
 */
114 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
/*
 * Allocates a struct icu_buf_utf8 with malloc() and gives it a uint8_t array
 * of `capacity` bytes. The first byte is set to 0 and utf8_cap records the
 * requested capacity.
 *
 * NOTE(review): no NULL check on either malloc() result is visible in this
 * extract; utf8[0] is written directly after the array allocation. Confirm
 * that the elided lines guard capacity == 0 and allocation failure.
 */
128 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
130 struct icu_buf_utf8 * buf8
131 = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8));
138 buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
139 buf8->utf8[0] = (uint8_t) 0;
140 buf8->utf8_cap = capacity;
/*
 * Changes the storage of buf8 to `capacity` bytes using malloc() or
 * realloc(). After (re)allocation the first byte is overwritten with 0, so
 * callers should treat any previous content as discarded.
 *
 * NOTE(review): the condition choosing between malloc() and realloc(), and
 * the left-hand side of the realloc() assignment, are on elided lines. If the
 * result is assigned straight back to buf8->utf8, the old array leaks when
 * realloc() fails. Confirm against the full definition.
 */
147 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
153 buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
156 = (uint8_t *) realloc(buf8->utf8,
157 sizeof(uint8_t) * capacity);
/* Reset to an empty string; the old content is not kept as a valid string. */
158 buf8->utf8[0] = (uint8_t) 0;
160 buf8->utf8_cap = capacity;
/*
 * NOTE(review): only the signature is visible in this extract. By its name
 * this is presumably the teardown counterpart of icu_buf_utf8_create() -
 * confirm what it frees and whether it accepts NULL.
 */
176 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
/*
 * Converts the UTF-8 bytes held in src8 (utf8, utf8_len) into UTF-16 in
 * dest16 via u_strFromUTF8().
 *
 * If the first attempt ends with U_BUFFER_OVERFLOW_ERROR, dest16 is resized
 * to twice utf16_len, *status is cleared to U_ZERO_ERROR and the conversion
 * is run a second time. When *status is a success code and utf16_len is
 * strictly smaller than the destination capacity, dest16->utf16_len is set;
 * the remaining visible assignments empty dest16 (first unit 0, length 0).
 *
 * NOTE(review): utf16_len is presumably the length output argument of
 * u_strFromUTF8() on the elided line between the visible arguments - confirm.
 * NOTE(review): the return statement is elided; the declared return type is
 * UErrorCode.
 */
187 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
188 struct icu_buf_utf8 * src8,
191 int32_t utf16_len = 0;
193 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
195 (const char *) src8->utf8, src8->utf8_len, status);
197 // check for buffer overflow, resize and retry
198 if (*status == U_BUFFER_OVERFLOW_ERROR
199 //|| dest16->utf16_len > dest16->utf16_cap
/* Twice the reported length; the resize also empties the buffer. */
201 icu_buf_utf16_resize(dest16, utf16_len * 2);
/* The overflow code must be cleared before the retry call. */
202 *status = U_ZERO_ERROR;
203 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
205 (const char *) src8->utf8, src8->utf8_len, status);
208 //if (*status != U_BUFFER_OVERFLOW_ERROR
/* Strict '<': the result is accepted only if one spare unit remains -
   presumably room for the terminating NUL; confirm. */
209 if (U_SUCCESS(*status)
210 && utf16_len < dest16->utf16_cap)
211 dest16->utf16_len = utf16_len;
213 dest16->utf16[0] = (UChar) 0;
214 dest16->utf16_len = 0;
/*
 * Converts a NUL-terminated UTF-8 C string into UTF-16 in dest16 via
 * u_strFromUTF8(). The source length is taken with strlen().
 *
 * If the first attempt ends with U_BUFFER_OVERFLOW_ERROR, dest16 is resized
 * to twice utf16_len, *status is cleared to U_ZERO_ERROR and the conversion
 * is run a second time. When *status is a success code and utf16_len is
 * strictly smaller than the destination capacity, dest16->utf16_len is set;
 * the remaining visible assignments empty dest16 (first unit 0, length 0).
 *
 * NOTE(review): src8cstr is passed to strlen() with no NULL check visible in
 * this extract - callers presumably must pass a valid string; confirm.
 * NOTE(review): utf16_len is presumably the length output argument of
 * u_strFromUTF8() on an elided line; the return statement is also elided.
 */
222 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
223 const char * src8cstr,
226 size_t src8cstr_len = 0;
227 int32_t utf16_len = 0;
229 src8cstr_len = strlen(src8cstr);
231 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
233 src8cstr, src8cstr_len, status);
235 // check for buffer overflow, resize and retry
236 if (*status == U_BUFFER_OVERFLOW_ERROR
237 //|| dest16->utf16_len > dest16->utf16_cap
/* Twice the reported length; the resize also empties the buffer. */
239 icu_buf_utf16_resize(dest16, utf16_len * 2);
/* The overflow code must be cleared before the retry call. */
240 *status = U_ZERO_ERROR;
241 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
243 src8cstr, src8cstr_len, status);
246 // if (*status != U_BUFFER_OVERFLOW_ERROR
/* Strict '<': the result is accepted only if one spare unit remains -
   presumably room for the terminating NUL; confirm. */
247 if (U_SUCCESS(*status)
248 && utf16_len < dest16->utf16_cap)
249 dest16->utf16_len = utf16_len;
251 dest16->utf16[0] = (UChar) 0;
252 dest16->utf16_len = 0;
/*
 * Converts the UTF-16 units held in src16 (utf16, utf16_len) into UTF-8 in
 * dest8 via u_strToUTF8().
 *
 * If the first attempt ends with U_BUFFER_OVERFLOW_ERROR, dest8 is resized to
 * twice utf8_len, *status is cleared to U_ZERO_ERROR and the conversion is
 * run a second time. When *status is a success code and utf8_len is strictly
 * smaller than the destination capacity, dest8->utf8_len is set; the last
 * visible assignment resets the first byte of dest8 to 0.
 *
 * NOTE(review): utf8_len is presumably the length output argument of
 * u_strToUTF8() on an elided line; the return statement is also elided.
 * NOTE(review): no reset of dest8->utf8_len is visible on the failure path in
 * this extract - confirm it is cleared there, as the UTF-16 variants do.
 */
261 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
262 struct icu_buf_utf16 * src16,
265 int32_t utf8_len = 0;
267 u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
269 src16->utf16, src16->utf16_len, status);
271 // check for buffer overflow, resize and retry
272 if (*status == U_BUFFER_OVERFLOW_ERROR
273 //|| dest8->utf8_len > dest8->utf8_cap
/* Twice the reported length; the resize also empties the buffer. */
275 icu_buf_utf8_resize(dest8, utf8_len * 2);
/* The overflow code must be cleared before the retry call. */
276 *status = U_ZERO_ERROR;
277 u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
279 src16->utf16, src16->utf16_len, status);
283 //if (*status != U_BUFFER_OVERFLOW_ERROR
/* Strict '<': the result is accepted only if one spare byte remains -
   presumably room for the terminating NUL; confirm. */
284 if (U_SUCCESS(*status)
285 && utf8_len < dest8->utf8_cap)
286 dest8->utf8_len = utf8_len;
288 dest8->utf8[0] = (uint8_t) 0;
/*
 * Applies one ICU case mapping to src16 and stores the result in dest16.
 * The visible calls are u_strToLower(), u_strToUpper(), u_strToTitle() and
 * u_strFoldCase() (with U_FOLD_CASE_DEFAULT); `action` selects among them.
 *
 * If the first attempt ends with U_BUFFER_OVERFLOW_ERROR, dest16 is resized
 * to twice the reported length, *status is cleared and the same set of calls
 * is repeated. When *status is a success code and dest16_len is strictly
 * smaller than the destination capacity, dest16->utf16_len is set; the
 * remaining visible assignments empty dest16 (first unit 0, length 0).
 *
 * NOTE(review): the case labels mapping `action` characters to each call are
 * elided from this extract - confirm the accepted action values.
 * NOTE(review): the function is declared to return int, but the visible
 * early exits return the UErrorCode constant U_UNSUPPORTED_ERROR; the normal
 * return statement is elided. Confirm what callers expect.
 */
297 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
298 struct icu_buf_utf16 * src16,
299 const char *locale, char action,
302 int32_t dest16_len = 0;
306 dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
307 src16->utf16, src16->utf16_len,
311 dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
312 src16->utf16, src16->utf16_len,
316 dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
317 src16->utf16, src16->utf16_len,
321 dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
322 src16->utf16, src16->utf16_len,
323 U_FOLD_CASE_DEFAULT, status);
/* Presumably the default branch for an unrecognised action - confirm. */
327 return U_UNSUPPORTED_ERROR;
331 // check for buffer overflow, resize and retry
332 if (*status == U_BUFFER_OVERFLOW_ERROR
333 //|| dest16_len > dest16->utf16_cap
/* Twice the reported length; the resize also empties the buffer. */
335 icu_buf_utf16_resize(dest16, dest16_len * 2);
/* The overflow code must be cleared before the retry calls. */
336 *status = U_ZERO_ERROR;
/* Second pass: the same mapping calls as above, now on the larger buffer. */
341 dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
342 src16->utf16, src16->utf16_len,
346 dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
347 src16->utf16, src16->utf16_len,
351 dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
352 src16->utf16, src16->utf16_len,
356 dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
357 src16->utf16, src16->utf16_len,
358 U_FOLD_CASE_DEFAULT, status);
362 return U_UNSUPPORTED_ERROR;
/* Strict '<': the result is accepted only if one spare unit remains -
   presumably room for the terminating NUL; confirm. */
367 if (U_SUCCESS(*status)
368 && dest16_len < dest16->utf16_cap)
369 dest16->utf16_len = dest16_len;
371 dest16->utf16[0] = (UChar) 0;
372 dest16->utf16_len = 0;
/*
 * Builds the collation sort key for src16 with ucol_getSortKey() and stores
 * it in dest8. If the key length reported by the first call exceeds the
 * capacity of dest8, the buffer is resized to twice that length and the key
 * is generated again. On success dest8->utf8_len is set to the key length;
 * the last visible assignment resets the first byte of dest8 to 0.
 *
 * NOTE(review): *status is tested below, but neither visible
 * ucol_getSortKey() call receives it, so in the visible code it only carries
 * whatever the caller passed in - confirm that is intended.
 * NOTE(review): the second operand of the success condition is on an elided
 * line, as is the return statement.
 * NOTE(review): the final reset casts 0 to UChar although the utf8 array is
 * allocated as uint8_t elsewhere in this file; the stored value is 0 either
 * way, but the cast is inconsistent with the other UTF-8 helpers.
 */
380 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
381 struct icu_buf_utf8 * dest8,
382 struct icu_buf_utf16 * src16,
386 int32_t sortkey_len = 0;
388 sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
389 dest8->utf8, dest8->utf8_cap);
391 // check for buffer overflow, resize and retry
/* The returned length is compared with the capacity to detect truncation. */
392 if (sortkey_len > dest8->utf8_cap) {
393 icu_buf_utf8_resize(dest8, sortkey_len * 2);
394 sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
395 dest8->utf8, dest8->utf8_cap);
398 if (U_SUCCESS(*status)
400 dest8->utf8_len = sortkey_len;
402 dest8->utf8[0] = (UChar) 0;
/*
 * Allocates a struct icu_tokenizer, copies `locale` and `action` into it,
 * clears the attached buffer pointer and the token id/start/end fields, and
 * opens an ICU break iterator with ubrk_open(). The switch on `action` picks
 * one of UBRK_LINE, UBRK_SENTENCE, UBRK_WORD, UBRK_CHARACTER or UBRK_TITLE;
 * one branch instead sets *status to U_UNSUPPORTED_ERROR.
 *
 * NOTE(review): the case labels mapping `action` characters to iterator
 * types are elided from this extract - confirm the accepted values.
 * NOTE(review): no NULL check on the malloc() result is visible, and `locale`
 * is copied with strcpy() into tokenizer->locale, whose size is not visible
 * here - confirm the field is large enough for every locale passed in.
 * NOTE(review): the statements following the final U_SUCCESS() test
 * (presumably the success return and the failure cleanup) are elided.
 */
411 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
414 struct icu_tokenizer * tokenizer
415 = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer));
417 strcpy(tokenizer->locale, locale);
418 tokenizer->action = action;
420 tokenizer->buf16 = 0;
421 tokenizer->token_id = 0;
422 tokenizer->token_start = 0;
423 tokenizer->token_end = 0;
426 switch(tokenizer->action) {
429 = ubrk_open(UBRK_LINE, tokenizer->locale,
434 = ubrk_open(UBRK_SENTENCE, tokenizer->locale,
439 = ubrk_open(UBRK_WORD, tokenizer->locale,
444 = ubrk_open(UBRK_CHARACTER, tokenizer->locale,
449 = ubrk_open(UBRK_TITLE, tokenizer->locale,
/* Presumably the default branch for an unrecognised action - confirm. */
453 *status = U_UNSUPPORTED_ERROR;
458 // ICU error stuff is a very funny business
459 if (U_SUCCESS(*status))
462 // reestablishing zero error state
463 //if (*status == U_USING_DEFAULT_WARNING)
464 // *status = U_ZERO_ERROR;
/*
 * Tears down a tokenizer; the visible statement closes its ICU break
 * iterator with ubrk_close().
 *
 * NOTE(review): any guards around the call and the release of the tokenizer
 * struct itself are on elided lines - confirm against the full definition.
 */
472 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
477 ubrk_close(tokenizer->bi);
/*
 * Points the tokenizer at a UTF-16 buffer. The guard rejects a missing
 * tokenizer, a tokenizer without a break iterator, or a missing src16. The
 * buffer pointer itself is stored (the text is not copied), then the text is
 * handed to the break iterator with ubrk_setText().
 *
 * NOTE(review): src16 is referenced, not copied, so it presumably must
 * outlive the tokenization - confirm ownership with callers.
 * NOTE(review): token_start, token_end and token_id are not reset in the
 * visible lines. icu_tokenizer_next_token() treats token_end == 0 as "first
 * call", so attaching a second buffer to a used tokenizer may resume at the
 * old offset - confirm whether a reset is on an elided line.
 * NOTE(review): the return statements are elided.
 */
482 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
483 struct icu_buf_utf16 * src16,
486 if (!tokenizer || !tokenizer->bi || !src16)
489 tokenizer->buf16 = src16;
491 ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
494 if (U_FAILURE(*status))
/*
 * Advances the tokenizer by one break-iterator segment and returns the
 * length of that segment in UTF-16 units (token_end - token_start).
 *
 * The start offset is ubrk_first() on the first call (detected by
 * token_end == 0) and the previous token_end afterwards; the end offset is
 * ubrk_next(), with UBRK_DONE replaced by the length of the attached buffer.
 * token_id is incremented and token_start/token_end are updated. The segment
 * is also copied into tkn16, which is grown to twice the segment length when
 * its capacity is smaller than the segment.
 *
 * Once the iterator is exhausted, start and end both equal the buffer
 * length, so the returned length is 0.
 *
 * NOTE(review): the declaration of tkn_end, the early-return statements and
 * the guard implied by "if it exists" around the tkn16 copy are on elided
 * lines - confirm that tkn16 may be NULL.
 * NOTE(review): the resize test uses '<', so a segment exactly as long as
 * the capacity is copied without room for a terminating NUL; readers of
 * tkn16 presumably must rely on utf16_len - confirm.
 */
500 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
501 struct icu_buf_utf16 * tkn16,
504 int32_t tkn_start = 0;
/* Nothing to do without a tokenizer, an iterator, or non-empty attached text. */
508 if (!tokenizer || !tokenizer->bi
509 || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
512 // never change tokenizer->buf16 and keep always invariant
513 // 0 <= tokenizer->token_start
514 // <= tokenizer->token_end
515 // <= tokenizer->buf16->utf16_len
516 // returns length of token
518 if (0 == tokenizer->token_end) // first call
519 tkn_start = ubrk_first(tokenizer->bi);
520 else //successive calls
521 tkn_start = tokenizer->token_end;
524 tkn_end = ubrk_next(tokenizer->bi);
526 // repairing invariant at end of ubrk, which is UBRK_DONE = -1
527 if (UBRK_DONE == tkn_end)
528 tkn_end = tokenizer->buf16->utf16_len;
530 // copy out if everything is well
531 if(U_FAILURE(*status))
/* Commit the new token position before copying the text out. */
534 tokenizer->token_id++;
535 tokenizer->token_start = tkn_start;
536 tokenizer->token_end = tkn_end;
538 // copying into token buffer if it exists
540 if (tkn16->utf16_cap < (tkn_end - tkn_start))
541 icu_buf_utf16_resize(tkn16, (size_t) (tkn_end - tkn_start) * 2);
543 u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
544 (tkn_end - tkn_start));
546 tkn16->utf16_len = (tkn_end - tkn_start);
549 return (tokenizer->token_end - tokenizer->token_start);
/*
 * Returns the tokenizer's token_id field, which
 * icu_tokenizer_next_token() increments for each token it commits.
 * NOTE(review): no NULL check on `tokenizer` is visible in this extract.
 */
553 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
555 return tokenizer->token_id;
/*
 * Returns the tokenizer's token_start field: the start offset of the most
 * recently committed token within the attached UTF-16 buffer.
 * NOTE(review): no NULL check on `tokenizer` is visible in this extract.
 */
558 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
560 return tokenizer->token_start;
/*
 * Returns the tokenizer's token_end field: the end offset of the most
 * recently committed token within the attached UTF-16 buffer.
 * NOTE(review): no NULL check on `tokenizer` is visible in this extract.
 */
563 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
565 return tokenizer->token_end;
/*
 * Returns the length of the most recently committed token, computed as
 * token_end - token_start.
 * NOTE(review): no NULL check on `tokenizer` is visible in this extract.
 */
568 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
570 return (tokenizer->token_end - tokenizer->token_start);
/*
 * Returns the tokenizer's token_count field.
 * NOTE(review): no line visible in this extract assigns token_count (the
 * create routine visibly initialises token_id, token_start and token_end
 * only) - confirm it is initialised and maintained, otherwise this reads an
 * indeterminate value.
 * NOTE(review): no NULL check on `tokenizer` is visible in this extract.
 */
573 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
575 return tokenizer->token_count;
589 * indent-tabs-mode: nil
591 * vim: shiftwidth=4 tabstop=8 expandtab