2 * Copyright (C) 1995-2006, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.22 2006-04-24 23:21:26 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
32 #include <yaz/yaz-util.h>
/* MARC-8 page decoders, one per character-set page.  Each takes the input
   pointer, the number of input bytes left, and the out-parameters no_read
   and combining, and returns an unsigned long.  yaz_read_marc8_comb picks
   one of them from the current escape mode:
     1: 'B' Basic ASCII, 'E' ANSEL, 's' ASCII
     2: 'g' Greek            3: 'b' Subscripts     4: 'p' Superscripts
     5: '2' Basic Hebrew     6: 'N'/'Q' Basic/Extended Cyrillic
     7: '3'/'4' Basic/Extended Arabic
     8: 'S' Greek            9: '1' Chinese, Japanese, Korean (EACC)
   A comment in yaz_read_marc8 describes these handlers as generated by
   charconv.tcl and as returning 0 with no_read=1 when a sequence does not
   match the input. */
34 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
35 size_t *no_read, int *combining);
36 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
37 size_t *no_read, int *combining);
38 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
39 size_t *no_read, int *combining);
40 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
41 size_t *no_read, int *combining);
42 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
43 size_t *no_read, int *combining);
44 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
45 size_t *no_read, int *combining);
46 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
47 size_t *no_read, int *combining);
48 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
49 size_t *no_read, int *combining);
50 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
51 size_t *no_read, int *combining);
/* Reverse (Unicode to MARC-8) lookups, same signature as the forward
   decoders.  lookup_marc8 calls them in order 1..9 on the UTF-8 encoding of
   a single character and pairs a hit with the escape sequence of the page
   it came from (e.g. "\033(B" for 1, "\033(2" for 5, "\033(N" for 6,
   "\033(3" for 7, "\033(S" for 8, "\033(1" for 9). */
54 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
55 size_t *no_read, int *combining);
56 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
57 size_t *no_read, int *combining);
58 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
59 size_t *no_read, int *combining);
60 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
61 size_t *no_read, int *combining);
62 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
63 size_t *no_read, int *combining);
64 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
65 size_t *no_read, int *combining);
66 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
67 size_t *no_read, int *combining);
68 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
69 size_t *no_read, int *combining);
70 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
71 size_t *no_read, int *combining);
/* Conversion state behind a yaz_iconv_t handle.  Besides the members
   visible here, the functions in this file also use my_errno,
   marc8_esc_mode, comb_offset, comb_size, no_read_x and iconv_cd. */
73 struct yaz_iconv_struct {
/* Optional hook run by yaz_iconv on the start of the input; set to
   yaz_init_UTF8 for UTF-8 input. */
76 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
77 size_t inbytesleft, size_t *no_read);
/* Decodes one character from inbuf and reports the bytes used in no_read. */
78 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
79 size_t inbytesleft, size_t *no_read);
/* Encodes character x into *outbuf.  yaz_iconv passes a final argument that
   is 1 when x is the last character of the input. */
80 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
81 char **outbuf, size_t *outbytesleft,
/* Characters read ahead by yaz_read_marc8 (at most 8) and the number of
   input bytes each one consumed. */
87 unsigned long comb_x[8];
88 size_t comb_no_read[8];
/* NOTE(review): presumably the character yaz_iconv saves when the output
   buffer is full (see the E2BIG handling there) -- confirm. */
90 unsigned long unget_x;
/* Character held back by yaz_write_ISO8859_1 until the next one shows
   whether the pair composes to a Latin-1 code; 0 when nothing is held. */
94 unsigned long compose_char;
/* MARC-8 output state: combining characters queued by yaz_write_marc8,
   their count, the pending non-combining character, and the escape
   sequence of the page most recently written (starts as "\033(B"). */
96 unsigned long write_marc8_comb_ch[8];
97 size_t write_marc8_comb_no;
98 unsigned long write_marc8_last;
99 const char *write_marc8_page_chr;
/* Composition table: x1 is a base letter, x2 a Unicode combining mark, and
   the third value (read as .y elsewhere in this file) the precomposed
   ISO-8859-1 code.  yaz_read_marc8s and yaz_write_ISO8859_1 scan it until
   they meet an entry whose x1 is 0. */
103 unsigned long x1, x2;
106 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
107 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
108 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
109 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
110 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
111 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
112 /* omitted: 0xc6 LATIN CAPITAL LETTER AE */
113 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
114 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
115 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
116 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
117 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
118 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
119 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
120 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
121 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
/* omitted: 0xd0 LATIN CAPITAL LETTER ETH */
122 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
123 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
124 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
125 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
126 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
127 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
128 /* omitted: 0xd7 MULTIPLICATION SIGN */
129 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
130 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
131 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
132 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
133 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
134 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
135 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
136 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
137 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
138 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
139 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
140 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
141 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
142 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
143 /* omitted: 0xe6 LATIN SMALL LETTER AE */
144 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
145 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
146 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
147 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
148 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
149 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
150 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
151 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
152 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
153 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
154 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
155 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
156 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
157 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
158 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
159 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
160 /* omitted: 0xf7 DIVISION SIGN */
161 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
162 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
163 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
164 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
165 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
166 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
167 /* omitted: 0xfe LATIN SMALL LETTER THORN */
168 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* Read handler for ISO-8859-1: the character is simply the first input
   byte.  NOTE(review): the lines that set *no_read and return x are not
   visible here. */
173 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
174 size_t inbytesleft, size_t *no_read)
176 unsigned long x = inp[0];
/* Init handler for UTF-8 input: examines the start of the input for the
   UTF-8 byte order mark (bytes EF BB BF).
   cd          conversion handle; my_errno is set to YAZ_ICONV_EINVAL on the
               error path visible below.
   inp         start of the input.
   inbytesleft number of bytes available at inp.
   no_read     out: number of leading bytes the caller should skip.
   NOTE(review): only part of the body is visible here; the statement guarded
   by the BOM test presumably sets *no_read to 3 -- confirm. */
181 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
182 size_t inbytesleft, size_t *no_read)
191 cd->my_errno = YAZ_ICONV_EINVAL;
/* Both trailing bytes must match the mark.  The earlier test used
   inp[1] != 0xbb, which is false for every genuine BOM, so a real mark was
   never recognised. */
194 if (inp[1] == 0xbb && inp[2] == 0xbf)
/* Read handler for UTF-8: decodes one character starting at inp, selecting
   the sequence length from the lead byte (2 to 6 byte forms are handled
   below).  Errors are reported through cd->my_errno.
   NOTE(review): continuation bytes are masked with 0x3f, but no check that
   they have the form 10xxxxxx is visible here. */
201 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
202 size_t inbytesleft, size_t *no_read)
/* a continuation byte (80..BF) or FE/FF cannot start a sequence */
211 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
214 cd->my_errno = YAZ_ICONV_EILSEQ;
/* two-byte form: 5 + 6 payload bits */
216 else if (inp[0] <= 0xdf && inbytesleft >= 2)
218 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
224 cd->my_errno = YAZ_ICONV_EILSEQ;
/* three-byte form: 4 + 6 + 6 payload bits */
227 else if (inp[0] <= 0xef && inbytesleft >= 3)
229 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
236 cd->my_errno = YAZ_ICONV_EILSEQ;
/* four-byte form */
239 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
241 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
242 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
248 cd->my_errno = YAZ_ICONV_EILSEQ;
/* five-byte form */
251 else if (inp[0] <= 0xfb && inbytesleft >= 5)
253 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
254 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
261 cd->my_errno = YAZ_ICONV_EILSEQ;
/* six-byte form */
264 else if (inp[0] <= 0xfd && inbytesleft >= 6)
266 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
267 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
268 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
274 cd->my_errno = YAZ_ICONV_EILSEQ;
/* no branch matched: every branch above also requires enough input bytes,
   so this presumably means the sequence is truncated */
280 cd->my_errno = YAZ_ICONV_EINVAL;
/* Read handler for big-endian UCS-4: assembles one character from four
   input bytes, most significant first.  The visible error path sets
   cd->my_errno to YAZ_ICONV_EINVAL for incomplete input.
   NOTE(review): the guard condition, the *no_read update and the return
   statement are not visible here. */
285 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
286 size_t inbytesleft, size_t *no_read)
292 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Widen the top byte before shifting: an unsigned char promotes to int, and
   shifting a value >= 0x80 left by 24 overflows int (undefined behaviour,
   and in practice sign-extends into a wider unsigned long). */
297 x = ((unsigned long) inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/* Read handler for little-endian UCS-4: assembles one character from four
   input bytes, least significant first.  The visible error path sets
   cd->my_errno to YAZ_ICONV_EINVAL for incomplete input.
   NOTE(review): the guard condition, the *no_read update and the return
   statement are not visible here. */
303 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
304 size_t inbytesleft, size_t *no_read)
310 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Widen the top byte before shifting: an unsigned char promotes to int, and
   shifting a value >= 0x80 left by 24 overflows int (undefined behaviour,
   and in practice sign-extends into a wider unsigned long). */
315 x = ((unsigned long) inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* Read handler for native wchar_t input: one character occupies
   sizeof(wchar_t) bytes in host representation. */
322 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
323 size_t inbytesleft, size_t *no_read)
327 if (inbytesleft < sizeof(wchar_t))
329 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* copy rather than cast, so inp need not be aligned for wchar_t */
335 memcpy (&wch, inp, sizeof(wch));
337 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8 calls this before its definition
   further down. */
344 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
345 size_t inbytesleft, size_t *no_read,
/* Read handler for MARC-8.  While cd->comb_offset is below cd->comb_size a
   previously queued character is handed out: its value comes from
   cd->comb_x and its byte count from cd->comb_no_read.  Otherwise up to 8
   characters are read with yaz_read_marc8_comb, each stored in comb_x /
   comb_no_read, with inbytesleft reduced by what was consumed.
   NOTE(review): the loop exit and the return statements are not visible
   here; presumably reading stops at the first non-combining character so
   that it can be returned ahead of the combining ones -- confirm. */
348 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
349 size_t inbytesleft, size_t *no_read)
352 if (cd->comb_offset < cd->comb_size)
354 *no_read = cd->comb_no_read[cd->comb_offset];
355 x = cd->comb_x[cd->comb_offset];
357 /* special case for double-diacritic combining characters,
358 INVERTED BREVE and DOUBLE TILDE.
359 We'll increment the no_read counter by 1, since we want to skip over
360 the processing of the closing ligature character
362 /* this code is no longer necessary.. our handlers code in
363 yaz_marc8_?_conv (generated by charconv.tcl) now returns
364 0 and no_read=1 when a sequence does not match the input.
365 The SECOND HALFs in codetables.xml produces a non-existant
366 entry in the conversion trie.. Hence when met, the input byte is
367 skipped as it should (in yaz_iconv)
370 if (x == 0x0361 || x == 0x0360)
378 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
381 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
384 cd->comb_x[cd->comb_size] = x;
385 cd->comb_no_read[cd->comb_size] = *no_read;
387 inbytesleft = inbytesleft - *no_read;
392 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
393 size_t inbytesleft, size_t *no_read)
395 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
396 if (x && cd->comb_size == 1)
398 /* For MARC8s we try to get a Latin-1 page code out of it: when
       yaz_read_marc8 returned a character and exactly one character is
       queued, the pair is looked up in latin1_comb (x against x1, the
       queued character against x2).  On a hit the queued character's byte
       count is added to *no_read and x becomes the precomposed code. */
400 for (i = 0; latin1_comb[i].x1; i++)
401 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
403 *no_read += cd->comb_no_read[0];
405 x = latin1_comb[i].y;
/* Reads one MARC-8 character.  Leading escape sequences (ESC, byte 27) are
   consumed first and update cd->marc8_esc_mode; the character itself is
   then decoded by the page decoder matching that mode.  *no_read receives
   the bytes used by the escape sequences plus those used by the character. */
412 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
413 size_t inbytesleft, size_t *no_read,
417 while(inbytesleft >= 1 && inp[0] == 27)
419 size_t inbytesleft0 = inbytesleft;
/* skip the intermediate bytes of the escape sequence.
   NOTE(review): strchr also matches the string terminator, so a NUL input
   byte is treated as an intermediate here -- confirm this is intended. */
422 while(inbytesleft > 0 && strchr("(,$!", *inp))
/* input ended inside the escape sequence */
427 if (inbytesleft <= 0)
430 cd->my_errno = YAZ_ICONV_EINVAL;
/* the byte after the intermediates selects the character-set page */
433 cd->marc8_esc_mode = *inp++;
435 (*no_read) += inbytesleft0 - inbytesleft;
437 if (inbytesleft <= 0)
442 size_t no_read_sub = 0;
445 switch(cd->marc8_esc_mode)
447 case 'B': /* Basic ASCII */
448 case 'E': /* ANSEL */
449 case 's': /* ASCII */
450 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
452 case 'g': /* Greek */
453 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
455 case 'b': /* Subscripts */
456 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
458 case 'p': /* Superscripts */
459 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
461 case '2': /* Basic Hebrew */
462 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
464 case 'N': /* Basic Cyrillic */
465 case 'Q': /* Extended Cyrillic */
466 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
468 case '3': /* Basic Arabic */
469 case '4': /* Extended Arabic */
470 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
472 case 'S': /* Greek */
473 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
475 case '1': /* Chinese, Japanese, Korean (EACC) */
476 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
/* presumably the default case: no decoder for this escape mode */
480 cd->my_errno = YAZ_ICONV_EILSEQ;
/* add the bytes used by the character to those used by the escapes */
483 *no_read += no_read_sub;
/* Write handler for UTF-8: encodes x with the shortest form whose range
   contains it (1 to 6 bytes), provided *outbytesleft has room; otherwise
   cd->my_errno becomes YAZ_ICONV_E2BIG.  A longer form is never chosen for
   a small value, because every later branch needs more room than the one
   that was rejected.  On success *outbuf is advanced past what was written. */
488 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
489 char **outbuf, size_t *outbytesleft,
492 unsigned char *outp = (unsigned char *) *outbuf;
494 if (x <= 0x7f && *outbytesleft >= 1)
496 *outp++ = (unsigned char) x;
499 else if (x <= 0x7ff && *outbytesleft >= 2)
501 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
502 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
503 (*outbytesleft) -= 2;
505 else if (x <= 0xffff && *outbytesleft >= 3)
507 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
508 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
509 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
510 (*outbytesleft) -= 3;
512 else if (x <= 0x1fffff && *outbytesleft >= 4)
514 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
515 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
516 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
517 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
518 (*outbytesleft) -= 4;
520 else if (x <= 0x3ffffff && *outbytesleft >= 5)
522 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
523 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
524 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
525 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
526 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
527 (*outbytesleft) -= 5;
/* no upper bound is tested here: everything above 0x3ffffff takes the
   six-byte form */
529 else if (*outbytesleft >= 6)
531 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
532 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
533 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
534 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
535 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
536 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
537 (*outbytesleft) -= 6;
541 cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */
544 *outbuf = (char *) outp;
/* Write handler for ISO-8859-1.  A character in the range 33..126 that is
   not the last of the input is not written at once but kept in
   cd->compose_char.  On the next call the pair (compose_char, x) is looked
   up in latin1_comb: a hit replaces x with the precomposed code, a miss
   writes compose_char on its own first.  x is then written as one byte;
   values outside 1..255 give YAZ_ICONV_EILSEQ and a full output buffer
   gives YAZ_ICONV_E2BIG. */
549 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
550 char **outbuf, size_t *outbytesleft,
553 /* list of two char unicode sequence that, when combined, are
554 equivalent to single unicode chars that can be represented in
556 Regular iconv on Linux at least does not seem to convert these,
557 but since MARC-8 to UTF-8 generates these composed sequence
558 we get a better chance of a successful MARC-8 -> ISO-8859-1
560 unsigned char *outp = (unsigned char *) *outbuf;
562 if (cd->compose_char)
565 for (i = 0; latin1_comb[i].x1; i++)
566 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
568 x = latin1_comb[i].y;
571 if (*outbytesleft < 1)
572 { /* no room. Retain compose_char and bail out */
573 cd->my_errno = YAZ_ICONV_E2BIG;
/* the scan stopped on the terminating entry, so the pair does not compose */
576 if (!latin1_comb[i].x1)
577 { /* not found. Just write compose_char */
578 *outp++ = (unsigned char) cd->compose_char;
580 *outbuf = (char *) outp;
582 /* compose_char used so reset it. x now holds current char */
583 cd->compose_char = 0;
/* hold back a candidate base character unless it is the last one */
586 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
588 cd->compose_char = x;
/* not representable in a single Latin-1 byte */
591 else if (x > 255 || x < 1)
593 cd->my_errno = YAZ_ICONV_EILSEQ;
596 else if (*outbytesleft < 1)
598 cd->my_errno = YAZ_ICONV_E2BIG;
601 *outp++ = (unsigned char) x;
603 *outbuf = (char *) outp;
/* Write handler for big-endian UCS-4: emits x as four bytes, most
   significant first, and reduces *outbytesleft by 4.  With fewer than four
   bytes of room cd->my_errno is set to YAZ_ICONV_E2BIG. */
608 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
609 char **outbuf, size_t *outbytesleft,
612 unsigned char *outp = (unsigned char *) *outbuf;
613 if (*outbytesleft >= 4)
615 *outp++ = (unsigned char) (x>>24);
616 *outp++ = (unsigned char) (x>>16);
617 *outp++ = (unsigned char) (x>>8);
618 *outp++ = (unsigned char) x;
619 (*outbytesleft) -= 4;
623 cd->my_errno = YAZ_ICONV_E2BIG;
626 *outbuf = (char *) outp;
/* Write handler for little-endian UCS-4: emits x as four bytes, least
   significant first, and reduces *outbytesleft by 4.  With fewer than four
   bytes of room cd->my_errno is set to YAZ_ICONV_E2BIG. */
630 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
631 char **outbuf, size_t *outbytesleft,
634 unsigned char *outp = (unsigned char *) *outbuf;
635 if (*outbytesleft >= 4)
637 *outp++ = (unsigned char) x;
638 *outp++ = (unsigned char) (x>>8);
639 *outp++ = (unsigned char) (x>>16);
640 *outp++ = (unsigned char) (x>>24);
641 (*outbytesleft) -= 4;
645 cd->my_errno = YAZ_ICONV_E2BIG;
648 *outbuf = (char *) outp;
/* Maps Unicode character x to MARC-8.  x is first encoded as UTF-8 into a
   local buffer; the reverse tables yaz_marc8r_1_conv .. yaz_marc8r_9_conv
   are then tried in that order on those bytes.  *page_chr receives the
   escape sequence of the page that matched and *comb is filled in by the
   table lookup.  cd->my_errno is set to YAZ_ICONV_EILSEQ when x cannot be
   encoded or, apparently, when no table matches.
   NOTE(review): the input length is taken with strlen, so the buffer must
   be NUL-terminated (the terminating store is not visible here) and x == 0
   would yield a length of 0 -- confirm. */
652 static unsigned long lookup_marc8(yaz_iconv_t cd,
653 unsigned long x, int *comb,
654 const char **page_chr)
657 char *utf8_outbuf = utf8_buf;
/* one byte of the buffer is kept back from the encoder */
658 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
660 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
661 if (r == (size_t)(-1))
663 cd->my_errno = YAZ_ICONV_EILSEQ;
669 size_t inbytesleft, no_read_sub = 0;
673 inp = (unsigned char *) utf8_buf;
674 inbytesleft = strlen(utf8_buf);
676 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
679 *page_chr = "\033(B";
682 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
688 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
694 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
700 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
703 *page_chr = "\033(2";
706 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
709 *page_chr = "\033(N";
712 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
715 *page_chr = "\033(3";
718 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
721 *page_chr = "\033(S";
724 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
727 *page_chr = "\033(1";
730 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Writes out the pending MARC-8 character (cd->write_marc8_last) together
   with the combining characters queued in cd->write_marc8_comb_ch, then
   clears both.  Returns (size_t)(-1) with cd->my_errno set to
   YAZ_ICONV_E2BIG when the output buffer is too small.
   NOTE(review): the conditions around the three byte extractions and the
   stores inside the combining loop are not visible here. */
735 static size_t flush_combos(yaz_iconv_t cd,
736 char **outbuf, size_t *outbytesleft)
738 unsigned long y = cd->write_marc8_last;
739 unsigned char byte, second_half = 0;
741 size_t i, out_no = 0;
/* the pending character may occupy up to three bytes, taken from the high
   end of y downwards */
746 byte = (unsigned char )((y>>16) & 0xff);
748 out_buf[out_no++] = byte;
749 byte = (unsigned char)((y>>8) & 0xff);
751 out_buf[out_no++] = byte;
752 byte = (unsigned char )(y & 0xff);
754 out_buf[out_no++] = byte;
/* room needed: the character bytes, one byte per combining character, and
   one extra byte (presumably for second_half) */
756 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
758 cd->my_errno = YAZ_ICONV_E2BIG;
759 return (size_t) (-1);
/* this loop comes before the copy of out_buf, so the combining characters
   are handled ahead of the character they belong to */
762 for (i = 0; i < cd->write_marc8_comb_no; i++)
764 /* all MARC-8 combined characters are simple bytes */
765 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
768 else if (byte == 0xFA)
774 memcpy(*outbuf, out_buf, out_no);
776 (*outbytesleft) -= out_no;
779 *(*outbuf)++ = second_half;
783 cd->write_marc8_last = 0;
784 cd->write_marc8_comb_no = 0;
/* Write handler for MARC-8.  x is translated with lookup_marc8, which also
   yields the page escape sequence and the combining flag.  Output is
   deferred: combining characters are queued in cd->write_marc8_comb_ch and
   a non-combining character is remembered in cd->write_marc8_last, while
   flush_combos writes what was pending before.  When the page differs from
   the one last written, its escape sequence is emitted first.
   NOTE(review): the if/else structure between the visible statements is
   missing in this view; the per-line notes below are therefore tentative. */
788 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
789 char **outbuf, size_t *outbytesleft,
793 const char *page_chr = 0;
794 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
797 return (size_t) (-1);
/* queue a combining character; at most 6 of the 8 slots are used */
801 if (cd->write_marc8_comb_no < 6)
802 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
806 size_t r = flush_combos(cd, outbuf, outbytesleft);
/* strcmp is non-zero when the page changed: emit its escape sequence */
809 if (strcmp(page_chr, cd->write_marc8_page_chr))
811 size_t plen = strlen(page_chr);
813 if (*outbytesleft < plen)
815 cd->my_errno = YAZ_ICONV_E2BIG;
816 return (size_t) (-1);
818 memcpy(*outbuf, page_chr, plen);
820 (*outbytesleft) -= plen;
821 cd->write_marc8_page_chr = page_chr;
823 cd->write_marc8_last = y;
/* presumably reached for the last character of the input: flush now */
827 size_t r = flush_combos(cd, outbuf, outbytesleft);
831 cd->write_marc8_comb_no--;
833 cd->write_marc8_last = 0;
/* Write handler for native wchar_t output: stores one wchar_t in host
   representation and reduces *outbytesleft by its size.  Without room for
   a whole wchar_t cd->my_errno is set to YAZ_ICONV_E2BIG. */
841 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
842 char **outbuf, size_t *outbytesleft,
845 unsigned char *outp = (unsigned char *) *outbuf;
847 if (*outbytesleft >= sizeof(wchar_t))
/* copy rather than cast, so outp need not be aligned for wchar_t */
850 memcpy(outp, &wch, sizeof(wch));
852 (*outbytesleft) -= sizeof(wch);
856 cd->my_errno = YAZ_ICONV_E2BIG;
859 *outbuf = (char *) outp;
/* Returns non-zero when both the read and the write handler of cd are set,
   i.e. the conversion is carried out by the handlers in this file. */
864 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
866 return cd->read_handle && cd->write_handle;
/* Creates a conversion handle from encoding fromcode to encoding tocode.
   The handle is allocated with xmalloc and its state set to defaults.  Read
   and write handlers are chosen by name with yaz_matchstr (a zero result is
   treated as a match); when either side has no built-in handler the code
   falls back to iconv_open.
   NOTE(review): the return statements and the failure clean-up are not
   visible here. */
869 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
871 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
873 cd->write_handle = 0;
876 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* MARC-8 input starts on the 'B' page (Basic ASCII in
   yaz_read_marc8_comb) */
877 cd->marc8_esc_mode = 'B';
878 cd->comb_offset = cd->comb_size = 0;
879 cd->compose_char = 0;
881 cd->write_marc8_comb_no = 0;
882 cd->write_marc8_last = 0;
/* MARC-8 output starts on the same page */
883 cd->write_marc8_page_chr = "\033(B";
885 /* a useful hack: if fromcode has a leading @,
886 the library does not use YAZ's own conversions .. */
887 if (fromcode[0] == '@')
891 if (!yaz_matchstr(fromcode, "UTF8"))
893 cd->read_handle = yaz_read_UTF8;
/* UTF-8 is the only input encoding given an init handler */
894 cd->init_handle = yaz_init_UTF8;
896 else if (!yaz_matchstr(fromcode, "ISO88591"))
897 cd->read_handle = yaz_read_ISO8859_1;
898 else if (!yaz_matchstr(fromcode, "UCS4"))
899 cd->read_handle = yaz_read_UCS4;
900 else if (!yaz_matchstr(fromcode, "UCS4LE"))
901 cd->read_handle = yaz_read_UCS4LE;
902 else if (!yaz_matchstr(fromcode, "MARC8"))
903 cd->read_handle = yaz_read_marc8;
904 else if (!yaz_matchstr(fromcode, "MARC8s"))
905 cd->read_handle = yaz_read_marc8s;
907 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
908 cd->read_handle = yaz_read_wchar_t;
911 if (!yaz_matchstr(tocode, "UTF8"))
912 cd->write_handle = yaz_write_UTF8;
913 else if (!yaz_matchstr(tocode, "ISO88591"))
914 cd->write_handle = yaz_write_ISO8859_1;
915 else if (!yaz_matchstr (tocode, "UCS4"))
916 cd->write_handle = yaz_write_UCS4;
917 else if (!yaz_matchstr(tocode, "UCS4LE"))
918 cd->write_handle = yaz_write_UCS4LE;
919 else if (!yaz_matchstr(tocode, "MARC8"))
920 cd->write_handle = yaz_write_marc8;
/* MARC8s output shares the plain MARC-8 writer */
921 else if (!yaz_matchstr(tocode, "MARC8s"))
922 cd->write_handle = yaz_write_marc8;
924 else if (!yaz_matchstr(tocode, "WCHAR_T"))
925 cd->write_handle = yaz_write_wchar_t;
/* one side has no built-in handler: try the system iconv */
930 if (!cd->read_handle || !cd->write_handle)
932 cd->iconv_cd = iconv_open (tocode, fromcode);
933 if (cd->iconv_cd == (iconv_t) (-1))
940 if (!cd->read_handle || !cd->write_handle)
/* Converts input at *inbuf to output at *outbuf, with an interface modelled
   on iconv: the buffer pointers and byte counts are advanced as data is
   consumed and produced, and failures are recorded in cd->my_errno.  The
   first visible part hands the work to the system iconv and maps its
   failure to a YAZ_ICONV_* code; the rest is the built-in path, which
   moves one character per step from read_handle to write_handle.
   NOTE(review): the loop structure and the return statements are not
   visible here. */
950 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
951 char **outbuf, size_t *outbytesleft)
960 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
961 if (r == (size_t)(-1))
966 cd->my_errno = YAZ_ICONV_E2BIG;
969 cd->my_errno = YAZ_ICONV_EINVAL;
972 cd->my_errno = YAZ_ICONV_EILSEQ;
975 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* a call without input: my_errno is reset on this path */
981 if (inbuf == 0 || *inbuf == 0)
984 cd->my_errno = YAZ_ICONV_UNKNOWN;
/* let the init handler inspect the start of the input; the bytes it
   reports in no_read are removed from the input count */
994 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
995 *inbytesleft, &no_read);
998 if (cd->my_errno == YAZ_ICONV_EINVAL)
1003 *inbytesleft -= no_read;
/* input used up: r becomes the number of input bytes consumed */
1015 if (*inbytesleft == 0)
1017 r = *inbuf - inbuf0;
1022 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
/* presumably taken when a previously saved character is replayed */
1033 no_read = cd->no_read_x;
/* the last argument tells the writer whether x ends the input */
1037 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1038 (*inbytesleft - no_read) == 0 ? 1 : 0);
1041 /* unable to write it. save it because read_handle cannot
1043 if (cd->my_errno == YAZ_ICONV_E2BIG)
1046 cd->no_read_x = no_read;
1052 *inbytesleft -= no_read;
1053 (*inbuf) += no_read;
/* Returns the error code currently stored in the handle (one of the
   YAZ_ICONV_* values assigned to my_errno throughout this file). */
1058 int yaz_iconv_error (yaz_iconv_t cd)
1060 return cd->my_errno;
/* Closes a conversion handle; the visible part closes the system iconv
   descriptor.  NOTE(review): the guard around that call, the release of cd
   itself and the return value are not visible here. */
1063 int yaz_iconv_close (yaz_iconv_t cd)
1067 iconv_close (cd->iconv_cd);
1076 * indent-tabs-mode: nil
1078 * vim: shiftwidth=4 tabstop=8 expandtab