2 * Copyright (C) 1995-2006, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.21 2006-04-19 23:48:06 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
32 #include <yaz/yaz-util.h>
/* MARC-8 table routines defined outside this file (a comment in
   yaz_read_marc8 says the yaz_marc8_?_conv handlers are generated by
   charconv.tcl).  Each one is handed the input bytes and the number of
   bytes available, stores the number of bytes it consumed in *no_read
   and returns the converted character; per that same comment a
   sequence that does not match yields 0 with *no_read set to 1.
   *combining is an output flag; its exact meaning is defined by the
   generated code, which is not part of this file.
   yaz_read_marc8_comb selects one of tables 1..9 from the current
   escape mode. */
34 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
35 size_t *no_read, int *combining);
36 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
37 size_t *no_read, int *combining);
38 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
39 size_t *no_read, int *combining);
40 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
41 size_t *no_read, int *combining);
42 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
43 size_t *no_read, int *combining);
44 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
45 size_t *no_read, int *combining);
46 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
47 size_t *no_read, int *combining);
48 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
49 size_t *no_read, int *combining);
50 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
51 size_t *no_read, int *combining);
/* Counterparts with an "r" in the name.  lookup_marc8 feeds them the
   UTF-8 encoding of a character and tries tables 1..9 in turn
   (presumably the reverse, Unicode to MARC-8, direction -- confirm
   against the generator). */
54 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
55 size_t *no_read, int *combining);
56 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
57 size_t *no_read, int *combining);
58 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
59 size_t *no_read, int *combining);
60 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
61 size_t *no_read, int *combining);
62 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
63 size_t *no_read, int *combining);
64 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
65 size_t *no_read, int *combining);
66 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
67 size_t *no_read, int *combining);
68 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
69 size_t *no_read, int *combining);
70 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
71 size_t *no_read, int *combining);
/* Conversion descriptor state behind yaz_iconv_t.
   NOTE(review): members used elsewhere in this file (my_errno,
   iconv_cd, comb_offset, comb_size, marc8_esc_mode, no_read_x, ...)
   are not visible in this excerpt of the definition. */
73 struct yaz_iconv_struct {
/* Handler yaz_iconv runs on the input ahead of the read handler; the
   byte count it reports in *no_read is subtracted from the remaining
   input.  yaz_iconv_open installs one for UTF-8 input only. */
76 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
77 size_t inbytesleft, size_t *no_read);
/* Decodes one character from the input and reports the bytes used. */
78 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
79 size_t inbytesleft, size_t *no_read);
/* Encodes character x into the output buffer.  yaz_iconv passes a
   final argument that is 1 when no input remains after this
   character and 0 otherwise. */
80 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
81 char **outbuf, size_t *outbytesleft,
/* Characters buffered by yaz_read_marc8 and, for each, the number of
   input bytes it was read from. */
87 unsigned long comb_x[8];
88 size_t comb_no_read[8];
/* NOTE(review): no use of unget_x is visible in this excerpt. */
90 unsigned long unget_x;
/* Character held back by yaz_write_ISO8859_1 so that it can be merged
   with the character that follows it; 0 when nothing is pending. */
94 unsigned long compose_char;
/* MARC-8 writer state: queued combining characters and how many are
   queued, the last character not yet written, and the escape
   sequence of the page most recently written to the output. */
96 unsigned long write_marc8_comb_ch[8];
97 size_t write_marc8_comb_no;
98 unsigned long write_marc8_last;
99 const char *write_marc8_page_chr;
/* Read handler for ISO-8859-1 input (installed by yaz_iconv_open for
   "ISO88591"): the first input byte is taken directly as the
   character value. */
102 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
103 size_t inbytesleft, size_t *no_read)
105 unsigned long x = inp[0];
/* Init handler for UTF-8 input (installed by yaz_iconv_open together
   with yaz_read_UTF8).  yaz_iconv calls it before reading and skips
   the number of bytes it stores in *no_read.
   Sets cd->my_errno to YAZ_ICONV_EINVAL on one of its paths.
   NOTE(review): the test on inp[0], the length check and the
   statements guarded by the condition below are not visible in this
   excerpt -- presumably this skips a leading 3-byte byte order
   mark; confirm against the full source. */
110 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
111 size_t inbytesleft, size_t *no_read)
120 cd->my_errno = YAZ_ICONV_EINVAL;
/* The UTF-8 byte order mark is EF BB BF, so both trailing bytes have
   to match.  Requiring inp[1] to differ from 0xbb would make the
   test fail for every real mark. */
123 if (inp[1] == 0xbb && inp[2] == 0xbf)
/* Read handler for UTF-8 input: decodes one character from inp.
   The length of the sequence is chosen from the lead byte, and each
   multi-byte branch is only taken when inbytesleft covers the whole
   sequence.  Forms of up to six bytes are accepted.
   Errors are reported through cd->my_errno (YAZ_ICONV_EILSEQ or
   YAZ_ICONV_EINVAL).
   NOTE(review): the one-byte branch, the assignments to *no_read and
   the conditions guarding the per-branch EILSEQ assignments are not
   visible in this excerpt. */
130 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
131 size_t inbytesleft, size_t *no_read)
/* lead bytes up to 0xbf that the preceding branch did not accept, and
   0xfe / 0xff, cannot start a sequence */
140 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
143 cd->my_errno = YAZ_ICONV_EILSEQ;
/* two bytes: 5 payload bits from the lead byte, 6 from the next */
145 else if (inp[0] <= 0xdf && inbytesleft >= 2)
147 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
153 cd->my_errno = YAZ_ICONV_EILSEQ;
/* three bytes: 4 + 6 + 6 payload bits */
156 else if (inp[0] <= 0xef && inbytesleft >= 3)
158 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
165 cd->my_errno = YAZ_ICONV_EILSEQ;
/* four bytes: 3 + 6 + 6 + 6 payload bits */
168 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
170 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
171 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
177 cd->my_errno = YAZ_ICONV_EILSEQ;
/* five bytes: 2 payload bits in the lead byte */
180 else if (inp[0] <= 0xfb && inbytesleft >= 5)
182 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
183 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
190 cd->my_errno = YAZ_ICONV_EILSEQ;
/* six bytes: 1 payload bit in the lead byte */
193 else if (inp[0] <= 0xfd && inbytesleft >= 6)
195 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
196 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
197 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
203 cd->my_errno = YAZ_ICONV_EILSEQ;
/* presumably the fall-through for a valid lead byte whose sequence is
   longer than inbytesleft -- the enclosing else is not visible */
209 cd->my_errno = YAZ_ICONV_EINVAL;
/* Read handler for "UCS4" input: one character is four bytes, most
   significant byte first.  When the length check (not visible in this
   excerpt) finds too few bytes, cd->my_errno is set to
   YAZ_ICONV_EINVAL. */
214 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
215 size_t inbytesleft, size_t *no_read)
221 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Widen before shifting: inp[0] promotes to int, and shifting a value
   of 0x80 or more left by 24 does not fit in an int (undefined
   behaviour; in practice it sign-extends into unsigned long). */
226 x = ((unsigned long) inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/* Read handler for "UCS4LE" input: one character is four bytes, least
   significant byte first.  When the length check (not visible in this
   excerpt) finds too few bytes, cd->my_errno is set to
   YAZ_ICONV_EINVAL. */
232 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
233 size_t inbytesleft, size_t *no_read)
239 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Widen before shifting: inp[3] promotes to int, and shifting a value
   of 0x80 or more left by 24 does not fit in an int (undefined
   behaviour; in practice it sign-extends into unsigned long). */
244 x = ((unsigned long) inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* Read handler for "WCHAR_T" input: one character occupies
   sizeof(wchar_t) bytes and is copied out of the input as stored. */
251 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
252 size_t inbytesleft, size_t *no_read)
/* fewer bytes than one wchar_t are available */
256 if (inbytesleft < sizeof(wchar_t))
258 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* copied byte-wise into a local wchar_t instead of dereferencing inp
   through a cast (presumably because inp need not be aligned) */
264 memcpy (&wch, inp, sizeof(wch));
266 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8 calls this function, which is
   defined after it.
   NOTE(review): the last parameter line of the declaration is not
   visible in this excerpt. */
273 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
274 size_t inbytesleft, size_t *no_read,
/* Read handler for MARC-8 input.
   Characters are read in batches through yaz_read_marc8_comb: up to 8
   characters and their input byte counts are stored in cd->comb_x and
   cd->comb_no_read, with cd->comb_size counting the entries.  While
   cd->comb_offset is below cd->comb_size a call is answered from that
   buffer instead of reading new input.
   NOTE(review): the condition that ends a batch and the way
   comb_offset advances are not visible in this excerpt; presumably a
   batch is a run of combining characters plus the character they
   apply to -- confirm against the full source. */
277 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
278 size_t inbytesleft, size_t *no_read)
/* entries remain from an earlier batch: hand out the next one */
281 if (cd->comb_offset < cd->comb_size)
283 *no_read = cd->comb_no_read[cd->comb_offset];
284 x = cd->comb_x[cd->comb_offset];
286 /* special case for double-diacritic combining characters,
287 INVERTED BREVE and DOUBLE TILDE.
288 We'll increment the no_read counter by 1, since we want to skip over
289 the processing of the closing ligature character
291 /* this code is no longer necessary.. our handlers code in
292 yaz_marc8_?_conv (generated by charconv.tcl) now returns
293 0 and no_read=1 when a sequence does not match the input.
294 The SECOND HALFs in codetables.xml produces a non-existant
295 entry in the conversion trie.. Hence when met, the input byte is
296 skipped as it should (in yaz_iconv)
299 if (x == 0x0361 || x == 0x0360)
307 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
310 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
313 cd->comb_x[cd->comb_size] = x;
314 cd->comb_no_read[cd->comb_size] = *no_read;
316 inbytesleft = inbytesleft - *no_read;
/* Reads one character of MARC-8 input.
   Leading escape sequences are consumed first: after each ESC byte
   (27) any bytes from the set "(,$!" are skipped and the byte that
   follows is stored in cd->marc8_esc_mode.  The character itself is
   then decoded by the table routine selected from that mode.
   *no_read receives the bytes used by the escape sequences plus the
   bytes used by the character; the table routine also sets *comb.
   cd->my_errno is set to YAZ_ICONV_EINVAL when the input ends inside
   an escape sequence and to YAZ_ICONV_EILSEQ on the last switch arm
   (presumably the default for an unknown mode -- the label is not
   visible in this excerpt). */
321 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
322 size_t inbytesleft, size_t *no_read,
/* one iteration per escape sequence */
326 while(inbytesleft >= 1 && inp[0] == 27)
/* remembered so the length of this escape sequence can be computed */
328 size_t inbytesleft0 = inbytesleft;
/* skip the intermediate bytes of the escape sequence */
331 while(inbytesleft > 0 && strchr("(,$!", *inp))
/* input ended before the byte that names the character set */
336 if (inbytesleft <= 0)
339 cd->my_errno = YAZ_ICONV_EINVAL;
342 cd->marc8_esc_mode = *inp++;
344 (*no_read) += inbytesleft0 - inbytesleft;
/* nothing is left after the escape sequences */
346 if (inbytesleft <= 0)
/* bytes used by the character itself, added to *no_read at the end */
351 size_t no_read_sub = 0;
354 switch(cd->marc8_esc_mode)
356 case 'B': /* Basic ASCII */
357 case 'E': /* ANSEL */
358 case 's': /* ASCII */
359 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
361 case 'g': /* Greek */
362 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
364 case 'b': /* Subscripts */
365 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
367 case 'p': /* Superscripts */
368 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
370 case '2': /* Basic Hebrew */
371 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
373 case 'N': /* Basic Cyrillic */
374 case 'Q': /* Extended Cyrillic */
375 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
377 case '3': /* Basic Arabic */
378 case '4': /* Extended Arabic */
379 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
381 case 'S': /* Greek */
382 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
384 case '1': /* Chinese, Japanese, Korean (EACC) */
385 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
389 cd->my_errno = YAZ_ICONV_EILSEQ;
392 *no_read += no_read_sub;
/* Write handler for UTF-8 output: encodes character x at *outbuf.
   The shortest form whose value range holds x is used, from one byte
   (x <= 0x7f) up to six bytes.  Each branch also requires that many
   bytes of room, so when the room is insufficient every branch fails
   and cd->my_errno is set to YAZ_ICONV_E2BIG.
   On success *outbytesleft is reduced by the bytes written and
   *outbuf is moved past them.
   NOTE(review): the final parameter and the return statements are not
   visible in this excerpt. */
397 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
398 char **outbuf, size_t *outbytesleft,
/* write cursor; stored back into *outbuf at the end */
401 unsigned char *outp = (unsigned char *) *outbuf;
/* one byte: 7 bits */
403 if (x <= 0x7f && *outbytesleft >= 1)
405 *outp++ = (unsigned char) x;
/* two bytes: 110xxxxx 10xxxxxx */
408 else if (x <= 0x7ff && *outbytesleft >= 2)
410 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
411 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
412 (*outbytesleft) -= 2;
/* three bytes: 1110xxxx plus two continuation bytes */
414 else if (x <= 0xffff && *outbytesleft >= 3)
416 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
417 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
418 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
419 (*outbytesleft) -= 3;
/* four bytes: 11110xxx plus three continuation bytes */
421 else if (x <= 0x1fffff && *outbytesleft >= 4)
423 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
424 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
425 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
426 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
427 (*outbytesleft) -= 4;
/* five bytes: 111110xx plus four continuation bytes */
429 else if (x <= 0x3ffffff && *outbytesleft >= 5)
431 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
432 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
433 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
434 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
435 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
436 (*outbytesleft) -= 5;
/* six bytes: taken for every larger value, no upper bound on x is
   tested here */
438 else if (*outbytesleft >= 6)
440 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
441 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
442 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
443 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
444 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
445 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
446 (*outbytesleft) -= 6;
450 cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */
453 *outbuf = (char *) outp;
/* Write handler for ISO-8859-1 output.
   A character in the range 33..126 that is not the last one of the
   input is not written at once but parked in cd->compose_char.  On
   the next call the pair (compose_char, x) is looked up in the
   latin1_comb table; a hit replaces x by the single Latin-1
   character, a miss writes compose_char unchanged before x is
   handled.  Characters below 1 or above 255 set cd->my_errno to
   YAZ_ICONV_EILSEQ; lack of output room sets YAZ_ICONV_E2BIG.
   NOTE(review): the table declaration, the updates of *outbytesleft
   and the return statements are not visible in this excerpt. */
458 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
459 char **outbuf, size_t *outbytesleft,
462 /* list of two char unicode sequence that, when combined, are
463 equivalent to single unicode chars that can be represented in
465 Regular iconv on Linux at least does not seem to convert these,
466 but since MARC-8 to UTF-8 generates these composed sequence
467 we get a better chance of a successful MARC-8 -> ISO-8859-1
470 unsigned long x1, x2;
473 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
474 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
475 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
476 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
477 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
478 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
479 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
480 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
481 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
482 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
483 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
484 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
485 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
486 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
487 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
488 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
489 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
490 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
491 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
492 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
493 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
494 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
495 /* omitted: 0xd7 MULTIPLICATION SIGN */
496 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
497 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
498 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
499 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
500 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
501 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
502 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
503 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
504 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
505 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
506 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
507 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
508 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
509 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
510 /* omitted: 0xe6 LATIN SMALL LETTER AE */
511 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
512 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
513 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
514 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
515 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
516 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
517 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
518 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
519 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
520 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
521 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
522 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
523 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
524 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
525 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
526 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
527 /* omitted: 0xf7 DIVISION SIGN */
528 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
529 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
530 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
531 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
532 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
533 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
534 /* omitted: 0xfe LATIN SMALL LETTER THORN */
535 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* write cursor; stored back into *outbuf after each write */
539 unsigned char *outp = (unsigned char *) *outbuf;
/* a character was parked by the previous call: try to merge it with x */
541 if (cd->compose_char)
/* the scan stops at the first table entry whose x1 is 0 */
544 for (i = 0; latin1_comb[i].x1; i++)
545 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
547 x = latin1_comb[i].y;
550 if (*outbytesleft < 1)
551 { /* no room. Retain compose_char and bail out */
552 cd->my_errno = YAZ_ICONV_E2BIG;
555 if (!latin1_comb[i].x1)
556 { /* not found. Just write compose_char */
557 *outp++ = (unsigned char) cd->compose_char;
559 *outbuf = (char *) outp;
561 /* compose_char used so reset it. x now holds current char */
562 cd->compose_char = 0;
/* park x when it is in 33..126, more input follows and nothing is
   parked already; the next character may combine with it */
565 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
567 cd->compose_char = x;
/* outside 1..255: not representable in a single Latin-1 byte */
570 else if (x > 255 || x < 1)
572 cd->my_errno = YAZ_ICONV_EILSEQ;
575 else if (*outbytesleft < 1)
577 cd->my_errno = YAZ_ICONV_E2BIG;
580 *outp++ = (unsigned char) x;
582 *outbuf = (char *) outp;
/* Write handler for "UCS4" output: stores x as four bytes, most
   significant byte first, when at least four bytes of room remain;
   otherwise cd->my_errno is set to YAZ_ICONV_E2BIG.  *outbytesleft
   and *outbuf are advanced by the bytes written.
   NOTE(review): the final parameter and the return statements are not
   visible in this excerpt. */
587 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
588 char **outbuf, size_t *outbytesleft,
591 unsigned char *outp = (unsigned char *) *outbuf;
592 if (*outbytesleft >= 4)
/* each cast keeps only the low 8 bits of the shifted value */
594 *outp++ = (unsigned char) (x>>24);
595 *outp++ = (unsigned char) (x>>16);
596 *outp++ = (unsigned char) (x>>8);
597 *outp++ = (unsigned char) x;
598 (*outbytesleft) -= 4;
602 cd->my_errno = YAZ_ICONV_E2BIG;
605 *outbuf = (char *) outp;
/* Write handler for "UCS4LE" output: stores x as four bytes, least
   significant byte first, when at least four bytes of room remain;
   otherwise cd->my_errno is set to YAZ_ICONV_E2BIG.  *outbytesleft
   and *outbuf are advanced by the bytes written.
   NOTE(review): the final parameter and the return statements are not
   visible in this excerpt. */
609 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
610 char **outbuf, size_t *outbytesleft,
613 unsigned char *outp = (unsigned char *) *outbuf;
614 if (*outbytesleft >= 4)
/* each cast keeps only the low 8 bits of the shifted value */
616 *outp++ = (unsigned char) x;
617 *outp++ = (unsigned char) (x>>8);
618 *outp++ = (unsigned char) (x>>16);
619 *outp++ = (unsigned char) (x>>24);
620 (*outbytesleft) -= 4;
624 cd->my_errno = YAZ_ICONV_E2BIG;
627 *outbuf = (char *) outp;
/* Looks up the MARC-8 code for character x.
   x is first encoded as UTF-8 into a local buffer with
   yaz_write_UTF8; those bytes are then offered to the
   yaz_marc8r_N_conv routines, tables 1 through 9 in that order.
   *page_chr is set to the escape sequence of the table that applies
   and *comb is filled in by the table routine.
   cd->my_errno is set to YAZ_ICONV_EILSEQ when the UTF-8 step fails
   and again after the last table has been tried.
   NOTE(review): the tests between the table calls, several of the
   *page_chr assignments and the return statements are not visible in
   this excerpt. */
631 static unsigned long lookup_marc8(yaz_iconv_t cd,
632 unsigned long x, int *comb,
633 const char **page_chr)
636 char *utf8_outbuf = utf8_buf;
/* one byte of utf8_buf is kept back, presumably for a terminating
   NUL since strlen is applied to the buffer below */
637 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
639 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
640 if (r == (size_t)(-1))
642 cd->my_errno = YAZ_ICONV_EILSEQ;
648 size_t inbytesleft, no_read_sub = 0;
/* the UTF-8 bytes become the input of the table routines */
652 inp = (unsigned char *) utf8_buf;
653 inbytesleft = strlen(utf8_buf);
655 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
658 *page_chr = "\033(B";
661 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
667 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
673 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
679 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
682 *page_chr = "\033(2";
685 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
688 *page_chr = "\033(N";
691 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
694 *page_chr = "\033(3";
697 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
700 *page_chr = "\033(S";
703 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
706 *page_chr = "\033(1";
709 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Writes out the state queued by yaz_write_marc8: the pending
   character cd->write_marc8_last and the combining characters in
   cd->write_marc8_comb_ch, then clears both.
   The pending character is split into up to three bytes, most
   significant first, and collected in out_buf.  In this function the
   loop over the queued combining characters comes before the copy of
   out_buf to the output.
   Returns (size_t)(-1) with cd->my_errno set to YAZ_ICONV_E2BIG when
   the output has too little room.
   NOTE(review): the conditions under which each of the three bytes is
   kept, the writes inside the loop and the assignments to second_half
   are not visible in this excerpt. */
714 static size_t flush_combos(yaz_iconv_t cd,
715 char **outbuf, size_t *outbytesleft)
717 unsigned long y = cd->write_marc8_last;
718 unsigned char byte, second_half = 0;
720 size_t i, out_no = 0;
725 byte = (unsigned char )((y>>16) & 0xff);
727 out_buf[out_no++] = byte;
728 byte = (unsigned char)((y>>8) & 0xff);
730 out_buf[out_no++] = byte;
731 byte = (unsigned char )(y & 0xff);
733 out_buf[out_no++] = byte;
/* room is needed for the character bytes, one byte per queued
   combining character and one more byte (presumably second_half) */
735 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
737 cd->my_errno = YAZ_ICONV_E2BIG;
738 return (size_t) (-1);
741 for (i = 0; i < cd->write_marc8_comb_no; i++)
743 /* all MARC-8 combined characters are simple bytes */
744 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
747 else if (byte == 0xFA)
753 memcpy(*outbuf, out_buf, out_no);
755 (*outbytesleft) -= out_no;
758 *(*outbuf)++ = second_half;
/* everything queued has been written */
762 cd->write_marc8_last = 0;
763 cd->write_marc8_comb_no = 0;
/* Write handler for MARC-8 output.
   x is translated with lookup_marc8, which also reports whether the
   result is a combining character and which page (escape sequence)
   it belongs to.  Combining characters are queued in
   cd->write_marc8_comb_ch.  Any other character first flushes what
   is queued (flush_combos), emits the page escape sequence when it
   differs from cd->write_marc8_page_chr, and is then remembered in
   cd->write_marc8_last rather than written at once.
   Returns (size_t)(-1) on failure; lack of room for the escape
   sequence sets cd->my_errno to YAZ_ICONV_E2BIG.
   NOTE(review): the tests on y, comb, last and on the results of
   flush_combos are not visible in this excerpt. */
767 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
768 char **outbuf, size_t *outbytesleft,
772 const char *page_chr = 0;
773 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
776 return (size_t) (-1);
/* at most 6 combining characters are queued; the array holds 8 */
780 if (cd->write_marc8_comb_no < 6)
781 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
/* write out the previous character with its combining characters */
785 size_t r = flush_combos(cd, outbuf, outbytesleft);
/* page changed: emit the escape sequence that selects it */
788 if (strcmp(page_chr, cd->write_marc8_page_chr))
790 size_t plen = strlen(page_chr);
792 if (*outbytesleft < plen)
794 cd->my_errno = YAZ_ICONV_E2BIG;
795 return (size_t) (-1);
797 memcpy(*outbuf, page_chr, plen);
799 (*outbytesleft) -= plen;
800 cd->write_marc8_page_chr = page_chr;
/* held back until the next flush */
802 cd->write_marc8_last = y;
/* second flush, presumably for the last character of the input; the
   two statements after it presumably undo the queuing done above
   when that flush fails -- confirm against the full source */
806 size_t r = flush_combos(cd, outbuf, outbytesleft);
810 cd->write_marc8_comb_no--;
812 cd->write_marc8_last = 0;
/* Write handler for "WCHAR_T" output: stores the character as one
   wchar_t, copied byte-wise into the output, when at least
   sizeof(wchar_t) bytes of room remain; otherwise cd->my_errno is set
   to YAZ_ICONV_E2BIG.
   NOTE(review): the declaration and assignment of wch, the final
   parameter and the return statements are not visible in this
   excerpt. */
820 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
821 char **outbuf, size_t *outbytesleft,
824 unsigned char *outp = (unsigned char *) *outbuf;
826 if (*outbytesleft >= sizeof(wchar_t))
829 memcpy(outp, &wch, sizeof(wch));
831 (*outbytesleft) -= sizeof(wch);
835 cd->my_errno = YAZ_ICONV_E2BIG;
838 *outbuf = (char *) outp;
/* Returns non-zero when both the read and the write handler of cd are
   set, i.e. when the conversion is carried out by the handlers in
   this file. */
843 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
845 return cd->read_handle && cd->write_handle;
/* Creates a conversion descriptor for converting fromcode to tocode.
   The descriptor is allocated with xmalloc and its state is reset
   (escape mode 'B', nothing buffered, output page "\033(B").  The
   names are compared with yaz_matchstr against the encodings this
   file implements: UTF8, ISO88591, UCS4, UCS4LE, MARC8 and WCHAR_T.
   When either side has no handler, iconv_open is called with the same
   names instead.
   NOTE(review): the effect of a leading '@' in fromcode, the
   preprocessor conditions around the iconv and WCHAR_T parts, the
   failure paths and the return statements are not visible in this
   excerpt. */
848 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
850 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
852 cd->write_handle = 0;
855 cd->my_errno = YAZ_ICONV_UNKNOWN;
856 cd->marc8_esc_mode = 'B';
857 cd->comb_offset = cd->comb_size = 0;
858 cd->compose_char = 0;
860 cd->write_marc8_comb_no = 0;
861 cd->write_marc8_last = 0;
862 cd->write_marc8_page_chr = "\033(B";
864 /* a useful hack: if fromcode has leading @,
865 the library not use YAZ's own conversions .. */
866 if (fromcode[0] == '@')
/* pick the read handler; UTF-8 is the only input with an init handler */
870 if (!yaz_matchstr(fromcode, "UTF8"))
872 cd->read_handle = yaz_read_UTF8;
873 cd->init_handle = yaz_init_UTF8;
875 else if (!yaz_matchstr(fromcode, "ISO88591"))
876 cd->read_handle = yaz_read_ISO8859_1;
877 else if (!yaz_matchstr(fromcode, "UCS4"))
878 cd->read_handle = yaz_read_UCS4;
879 else if (!yaz_matchstr(fromcode, "UCS4LE"))
880 cd->read_handle = yaz_read_UCS4LE;
881 else if (!yaz_matchstr(fromcode, "MARC8"))
882 cd->read_handle = yaz_read_marc8;
884 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
885 cd->read_handle = yaz_read_wchar_t;
/* pick the write handler */
888 if (!yaz_matchstr(tocode, "UTF8"))
889 cd->write_handle = yaz_write_UTF8;
890 else if (!yaz_matchstr(tocode, "ISO88591"))
891 cd->write_handle = yaz_write_ISO8859_1;
892 else if (!yaz_matchstr (tocode, "UCS4"))
893 cd->write_handle = yaz_write_UCS4;
894 else if (!yaz_matchstr(tocode, "UCS4LE"))
895 cd->write_handle = yaz_write_UCS4LE;
896 else if (!yaz_matchstr(tocode, "MARC8"))
897 cd->write_handle = yaz_write_marc8;
899 else if (!yaz_matchstr(tocode, "WCHAR_T"))
900 cd->write_handle = yaz_write_wchar_t;
/* one side is not implemented here: hand the pair to the system iconv */
905 if (!cd->read_handle || !cd->write_handle)
907 cd->iconv_cd = iconv_open (tocode, fromcode);
908 if (cd->iconv_cd == (iconv_t) (-1))
/* same test again, presumably for builds without iconv */
915 if (!cd->read_handle || !cd->write_handle)
/* Converts characters from *inbuf to *outbuf, with an interface
   modelled on iconv.
   One path passes the call to iconv using cd->iconv_cd and maps a
   failure to cd->my_errno (YAZ_ICONV_E2BIG, YAZ_ICONV_EINVAL,
   YAZ_ICONV_EILSEQ or YAZ_ICONV_UNKNOWN).  The other path uses the
   handlers of cd: the init handler is run on the input, then
   characters are read with cd->read_handle and written with
   cd->write_handle until *inbytesleft reaches 0.  After each
   character that was written, *inbuf and *inbytesleft are advanced
   by the bytes it was read from.  When the write handler reports
   YAZ_ICONV_E2BIG the byte count is kept in cd->no_read_x, and a
   later call takes it from there.
   NOTE(review): the tests that select between the paths, the loop
   construct, the handling of a null inbuf and the return statements
   are not visible in this excerpt. */
925 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
926 char **outbuf, size_t *outbytesleft)
935 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
936 if (r == (size_t)(-1))
941 cd->my_errno = YAZ_ICONV_E2BIG;
944 cd->my_errno = YAZ_ICONV_EINVAL;
947 cd->my_errno = YAZ_ICONV_EILSEQ;
950 cd->my_errno = YAZ_ICONV_UNKNOWN;
956 if (inbuf == 0 || *inbuf == 0)
959 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* let the init handler inspect the start of the input */
969 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
970 *inbytesleft, &no_read);
973 if (cd->my_errno == YAZ_ICONV_EINVAL)
/* skip the bytes the init handler consumed */
978 *inbytesleft -= no_read;
990 if (*inbytesleft == 0)
997 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
/* byte count saved by an earlier call whose write did not fit */
1008 no_read = cd->no_read_x;
/* the last argument is 1 when this character ends the input */
1012 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1013 (*inbytesleft - no_read) == 0 ? 1 : 0);
1016 /* unable to write it. save it because read_handle cannot
1018 if (cd->my_errno == YAZ_ICONV_E2BIG)
1021 cd->no_read_x = no_read;
1027 *inbytesleft -= no_read;
1028 (*inbuf) += no_read;
/* Returns the error code stored in cd (one of the YAZ_ICONV_* values
   assigned to my_errno by the conversion routines). */
1033 int yaz_iconv_error (yaz_iconv_t cd)
1035 return cd->my_errno;
/* Closes the conversion descriptor cd; the iconv descriptor it holds
   is released with iconv_close.
   NOTE(review): the condition guarding iconv_close, the release of cd
   itself and the return value are not visible in this excerpt. */
1038 int yaz_iconv_close (yaz_iconv_t cd)
1042 iconv_close (cd->iconv_cd);
1051 * indent-tabs-mode: nil
1053 * vim: shiftwidth=4 tabstop=8 expandtab