Added my new NFA character normalizer. Not yet ready, but want to
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2006, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.22 2006-04-24 23:21:26 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  */
16
17 #if HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20
21 #include <errno.h>
22 #include <string.h>
23 #include <ctype.h>
24 #if HAVE_WCHAR_H
25 #include <wchar.h>
26 #endif
27
28 #if HAVE_ICONV_H
29 #include <iconv.h>
30 #endif
31
32 #include <yaz/yaz-util.h>
33
/*
 * Conversion routines that are defined outside this file.
 *
 * All of them share one signature: they are given a pointer to the input
 * (inp) and the number of bytes available (inbytesleft), return a character
 * value, and report through *no_read how many input bytes were used and
 * through *combining whether the character is a combining one.
 *
 * yaz_marc8_N_conv is used by the MARC-8 reader in this file and
 * yaz_marc8r_N_conv by the MARC-8 writer (reverse direction).
 */
typedef unsigned long yaz_marc8_conv_fn(unsigned char *inp,
                                        size_t inbytesleft,
                                        size_t *no_read, int *combining);

yaz_marc8_conv_fn yaz_marc8_1_conv;
yaz_marc8_conv_fn yaz_marc8_2_conv;
yaz_marc8_conv_fn yaz_marc8_3_conv;
yaz_marc8_conv_fn yaz_marc8_4_conv;
yaz_marc8_conv_fn yaz_marc8_5_conv;
yaz_marc8_conv_fn yaz_marc8_6_conv;
yaz_marc8_conv_fn yaz_marc8_7_conv;
yaz_marc8_conv_fn yaz_marc8_8_conv;
yaz_marc8_conv_fn yaz_marc8_9_conv;

yaz_marc8_conv_fn yaz_marc8r_1_conv;
yaz_marc8_conv_fn yaz_marc8r_2_conv;
yaz_marc8_conv_fn yaz_marc8r_3_conv;
yaz_marc8_conv_fn yaz_marc8r_4_conv;
yaz_marc8_conv_fn yaz_marc8r_5_conv;
yaz_marc8_conv_fn yaz_marc8r_6_conv;
yaz_marc8_conv_fn yaz_marc8r_7_conv;
yaz_marc8_conv_fn yaz_marc8r_8_conv;
yaz_marc8_conv_fn yaz_marc8r_9_conv;
72
73 struct yaz_iconv_struct {
74     int my_errno;
75     int init_flag;
76     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
77                           size_t inbytesleft, size_t *no_read);
78     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
79                                  size_t inbytesleft, size_t *no_read);
80     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
81                            char **outbuf, size_t *outbytesleft,
82                            int last);
83     int marc8_esc_mode;
84
85     int comb_offset;
86     int comb_size;
87     unsigned long comb_x[8];
88     size_t comb_no_read[8];
89     size_t no_read_x;
90     unsigned long unget_x;
91 #if HAVE_ICONV_H
92     iconv_t iconv_cd;
93 #endif
94     unsigned long compose_char;
95
96     unsigned long write_marc8_comb_ch[8];
97     size_t write_marc8_comb_no;
98     unsigned long write_marc8_last;
99     const char *write_marc8_page_chr;
100 };
101
/**
 * Pairs of (base character x1, Unicode combining character x2) and the
 * single ISO-8859-1 code y that the pair is equivalent to.
 *
 * The table is only ever read, so it is const. It is terminated by an
 * entry whose x1 is 0.
 */
static const struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* omitted:    0xc6      LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted:    0xd7      MULTIPLICATION SIGN */
    /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
    /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted:    0xe6      LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted:    0xf0      LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted:    0xf7      DIVISION SIGN */
    /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted:    0xfe      LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

    { 0, 0, 0}            /* terminator */
};
172
173 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
174                                          size_t inbytesleft, size_t *no_read)
175 {
176     unsigned long x = inp[0];
177     *no_read = 1;
178     return x;
179 }
180
181 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
182                              size_t inbytesleft, size_t *no_read)
183 {
184     if (inp[0] != 0xef)
185     {
186         *no_read = 0;
187         return 0;
188     }
189     if (inbytesleft < 3)
190     {
191         cd->my_errno = YAZ_ICONV_EINVAL;
192         return (size_t) -1;
193     }
194     if (inp[1] != 0xbb && inp[2] == 0xbf)
195         *no_read = 3;
196     else
197         *no_read = 0;
198     return 0;
199 }
200
201 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
202                                     size_t inbytesleft, size_t *no_read)
203 {
204     unsigned long x = 0;
205
206     if (inp[0] <= 0x7f)
207     {
208         x = inp[0];
209         *no_read = 1;
210     }
211     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
212     {
213         *no_read = 0;
214         cd->my_errno = YAZ_ICONV_EILSEQ;
215     }
216     else if (inp[0] <= 0xdf && inbytesleft >= 2)
217     {
218         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
219         if (x >= 0x80)
220             *no_read = 2;
221         else
222         {
223             *no_read = 0;
224             cd->my_errno = YAZ_ICONV_EILSEQ;
225         }
226     }
227     else if (inp[0] <= 0xef && inbytesleft >= 3)
228     {
229         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
230             (inp[2] & 0x3f);
231         if (x >= 0x800)
232             *no_read = 3;
233         else
234         {
235             *no_read = 0;
236             cd->my_errno = YAZ_ICONV_EILSEQ;
237         }
238     }
239     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
240     {
241         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
242             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
243         if (x >= 0x10000)
244             *no_read = 4;
245         else
246         {
247             *no_read = 0;
248             cd->my_errno = YAZ_ICONV_EILSEQ;
249         }
250     }
251     else if (inp[0] <= 0xfb && inbytesleft >= 5)
252     {
253         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
254             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
255             (inp[4] & 0x3f);
256         if (x >= 0x200000)
257             *no_read = 5;
258         else
259         {
260             *no_read = 0;
261             cd->my_errno = YAZ_ICONV_EILSEQ;
262         }
263     }
264     else if (inp[0] <= 0xfd && inbytesleft >= 6)
265     {
266         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
267             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
268             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
269         if (x >= 0x4000000)
270             *no_read = 6;
271         else
272         {
273             *no_read = 0;
274             cd->my_errno = YAZ_ICONV_EILSEQ;
275         }
276     }
277     else
278     {
279         *no_read = 0;
280         cd->my_errno = YAZ_ICONV_EINVAL;
281     }
282     return x;
283 }
284
285 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
286                                     size_t inbytesleft, size_t *no_read)
287 {
288     unsigned long x = 0;
289     
290     if (inbytesleft < 4)
291     {
292         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
293         *no_read = 0;
294     }
295     else
296     {
297         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
298         *no_read = 4;
299     }
300     return x;
301 }
302
303 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
304                                       size_t inbytesleft, size_t *no_read)
305 {
306     unsigned long x = 0;
307     
308     if (inbytesleft < 4)
309     {
310         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
311         *no_read = 0;
312     }
313     else
314     {
315         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
316         *no_read = 4;
317     }
318     return x;
319 }
320
321 #if HAVE_WCHAR_H
322 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
323                                        size_t inbytesleft, size_t *no_read)
324 {
325     unsigned long x = 0;
326     
327     if (inbytesleft < sizeof(wchar_t))
328     {
329         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
330         *no_read = 0;
331     }
332     else
333     {
334         wchar_t wch;
335         memcpy (&wch, inp, sizeof(wch));
336         x = wch;
337         *no_read = sizeof(wch);
338     }
339     return x;
340 }
341 #endif
342
343
344 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
345                                           size_t inbytesleft, size_t *no_read,
346                                           int *comb);
347
348 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
349                                      size_t inbytesleft, size_t *no_read)
350 {
351     unsigned long x;
352     if (cd->comb_offset < cd->comb_size)
353     {
354         *no_read = cd->comb_no_read[cd->comb_offset];
355         x = cd->comb_x[cd->comb_offset];
356
357         /* special case for double-diacritic combining characters, 
358            INVERTED BREVE and DOUBLE TILDE.
359            We'll increment the no_read counter by 1, since we want to skip over
360            the processing of the closing ligature character
361         */
362         /* this code is no longer necessary.. our handlers code in
363            yaz_marc8_?_conv (generated by charconv.tcl) now returns
364            0 and no_read=1 when a sequence does not match the input.
365            The SECOND HALFs in codetables.xml produces a non-existant
366            entry in the conversion trie.. Hence when met, the input byte is
367            skipped as it should (in yaz_iconv)
368         */
369 #if 0
370         if (x == 0x0361 || x == 0x0360)
371             *no_read += 1;
372 #endif
373         cd->comb_offset++;
374         return x;
375     }
376
377     cd->comb_offset = 0;
378     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
379     {
380         int comb = 0;
381         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
382         if (!comb || !x)
383             break;
384         cd->comb_x[cd->comb_size] = x;
385         cd->comb_no_read[cd->comb_size] = *no_read;
386         inp += *no_read;
387         inbytesleft = inbytesleft - *no_read;
388     }
389     return x;
390 }
391
392 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
393                                      size_t inbytesleft, size_t *no_read)
394 {
395     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
396     if (x && cd->comb_size == 1)
397     {
398         /* For MARC8s we try to get a Latin-1 page code out of it */
399         int i;
400         for (i = 0; latin1_comb[i].x1; i++)
401             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
402             {
403                 *no_read += cd->comb_no_read[0];
404                 cd->comb_size = 0;
405                 x = latin1_comb[i].y;
406                 break;
407             }
408     }
409     return x;
410 }
411
412 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
413                                          size_t inbytesleft, size_t *no_read,
414                                          int *comb)
415 {
416     *no_read = 0;
417     while(inbytesleft >= 1 && inp[0] == 27)
418     {
419         size_t inbytesleft0 = inbytesleft;
420         inp++;
421         inbytesleft--;
422         while(inbytesleft > 0 && strchr("(,$!", *inp))
423         {
424             inbytesleft--;
425             inp++;
426         }
427         if (inbytesleft <= 0)
428         {
429             *no_read = 0;
430             cd->my_errno = YAZ_ICONV_EINVAL;
431             return 0;
432         }
433         cd->marc8_esc_mode = *inp++;
434         inbytesleft--;
435         (*no_read) += inbytesleft0 - inbytesleft;
436     }
437     if (inbytesleft <= 0)
438         return 0;
439     else
440     {
441         unsigned long x;
442         size_t no_read_sub = 0;
443         *comb = 0;
444
445         switch(cd->marc8_esc_mode)
446         {
447         case 'B':  /* Basic ASCII */
448         case 'E':  /* ANSEL */
449         case 's':  /* ASCII */
450             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
451             break;
452         case 'g':  /* Greek */
453             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
454             break;
455         case 'b':  /* Subscripts */
456             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
457             break;
458         case 'p':  /* Superscripts */
459             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
460             break;
461         case '2':  /* Basic Hebrew */
462             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
463             break;
464         case 'N':  /* Basic Cyrillic */
465         case 'Q':  /* Extended Cyrillic */
466             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
467             break;
468         case '3':  /* Basic Arabic */
469         case '4':  /* Extended Arabic */
470             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
471             break;
472         case 'S':  /* Greek */
473             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
474             break;
475         case '1':  /* Chinese, Japanese, Korean (EACC) */
476             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
477             break;
478         default:
479             *no_read = 0;
480             cd->my_errno = YAZ_ICONV_EILSEQ;
481             return 0;
482         }
483         *no_read += no_read_sub;
484         return x;
485     }
486 }
487
488 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
489                               char **outbuf, size_t *outbytesleft,
490                               int last)
491 {
492     unsigned char *outp = (unsigned char *) *outbuf;
493
494     if (x <= 0x7f && *outbytesleft >= 1)
495     {
496         *outp++ = (unsigned char) x;
497         (*outbytesleft)--;
498     } 
499     else if (x <= 0x7ff && *outbytesleft >= 2)
500     {
501         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
502         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
503         (*outbytesleft) -= 2;
504     }
505     else if (x <= 0xffff && *outbytesleft >= 3)
506     {
507         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
508         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
509         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
510         (*outbytesleft) -= 3;
511     }
512     else if (x <= 0x1fffff && *outbytesleft >= 4)
513     {
514         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
515         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
516         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
517         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
518         (*outbytesleft) -= 4;
519     }
520     else if (x <= 0x3ffffff && *outbytesleft >= 5)
521     {
522         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
523         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
524         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
525         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
526         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
527         (*outbytesleft) -= 5;
528     }
529     else if (*outbytesleft >= 6)
530     {
531         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
532         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
533         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
534         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
535         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
536         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
537         (*outbytesleft) -= 6;
538     }
539     else 
540     {
541         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
542         return (size_t)(-1);
543     }
544     *outbuf = (char *) outp;
545     return 0;
546 }
547
548
549 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
550                                    char **outbuf, size_t *outbytesleft,
551                                    int last)
552 {
553     /* list of two char unicode sequence that, when combined, are
554        equivalent to single unicode chars that can be represented in
555        ISO-8859-1/Latin-1.
556        Regular iconv on Linux at least does not seem to convert these,
557        but since MARC-8 to UTF-8 generates these composed sequence
558        we get a better chance of a successful MARC-8 -> ISO-8859-1
559        conversion */
560     unsigned char *outp = (unsigned char *) *outbuf;
561
562     if (cd->compose_char)
563     {
564         int i;
565         for (i = 0; latin1_comb[i].x1; i++)
566             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
567             {
568                 x = latin1_comb[i].y;
569                 break;
570             }
571         if (*outbytesleft < 1)
572         {  /* no room. Retain compose_char and bail out */
573             cd->my_errno = YAZ_ICONV_E2BIG;
574             return (size_t)(-1);
575         }
576         if (!latin1_comb[i].x1) 
577         {   /* not found. Just write compose_char */
578             *outp++ = (unsigned char) cd->compose_char;
579             (*outbytesleft)--;
580             *outbuf = (char *) outp;
581         }
582         /* compose_char used so reset it. x now holds current char */
583         cd->compose_char = 0;
584     }
585
586     if (!last && x > 32 && x < 127 && cd->compose_char == 0)
587     {
588         cd->compose_char = x;
589         return 0;
590     }
591     else if (x > 255 || x < 1)
592     {
593         cd->my_errno = YAZ_ICONV_EILSEQ;
594         return (size_t) -1;
595     }
596     else if (*outbytesleft < 1)
597     {
598         cd->my_errno = YAZ_ICONV_E2BIG;
599         return (size_t)(-1);
600     }
601     *outp++ = (unsigned char) x;
602     (*outbytesleft)--;
603     *outbuf = (char *) outp;
604     return 0;
605 }
606
607
608 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
609                               char **outbuf, size_t *outbytesleft,
610                               int last)
611 {
612     unsigned char *outp = (unsigned char *) *outbuf;
613     if (*outbytesleft >= 4)
614     {
615         *outp++ = (unsigned char) (x>>24);
616         *outp++ = (unsigned char) (x>>16);
617         *outp++ = (unsigned char) (x>>8);
618         *outp++ = (unsigned char) x;
619         (*outbytesleft) -= 4;
620     }
621     else
622     {
623         cd->my_errno = YAZ_ICONV_E2BIG;
624         return (size_t)(-1);
625     }
626     *outbuf = (char *) outp;
627     return 0;
628 }
629
630 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
631                                 char **outbuf, size_t *outbytesleft,
632                                 int last)
633 {
634     unsigned char *outp = (unsigned char *) *outbuf;
635     if (*outbytesleft >= 4)
636     {
637         *outp++ = (unsigned char) x;
638         *outp++ = (unsigned char) (x>>8);
639         *outp++ = (unsigned char) (x>>16);
640         *outp++ = (unsigned char) (x>>24);
641         (*outbytesleft) -= 4;
642     }
643     else
644     {
645         cd->my_errno = YAZ_ICONV_E2BIG;
646         return (size_t)(-1);
647     }
648     *outbuf = (char *) outp;
649     return 0;
650 }
651
652 static unsigned long lookup_marc8(yaz_iconv_t cd,
653                                   unsigned long x, int *comb,
654                                   const char **page_chr)
655 {
656     char utf8_buf[7];
657     char *utf8_outbuf = utf8_buf;
658     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
659
660     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
661     if (r == (size_t)(-1))
662     {
663         cd->my_errno = YAZ_ICONV_EILSEQ;
664         return 0;
665     }
666     else
667     {
668         unsigned char *inp;
669         size_t inbytesleft, no_read_sub = 0;
670         unsigned long x;
671
672         *utf8_outbuf = '\0';        
673         inp = (unsigned char *) utf8_buf;
674         inbytesleft = strlen(utf8_buf);
675         
676         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
677         if (x)
678         {
679             *page_chr = "\033(B";
680             return x;
681         }
682         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
683         if (x)
684         {
685             *page_chr = "\033g";
686             return x;
687         }
688         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
689         if (x)
690         {
691             *page_chr = "\033b";
692             return x;
693         }
694         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
695         if (x)
696         {
697             *page_chr = "\033p";
698             return x;
699         }
700         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
701         if (x)
702         {
703             *page_chr = "\033(2";
704             return x;
705         }
706         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
707         if (x)
708         {
709             *page_chr = "\033(N";
710             return x;
711         }
712         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
713         if (x)
714         {
715             *page_chr = "\033(3";
716             return x;
717         }
718         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
719         if (x)
720         {
721             *page_chr = "\033(S";
722             return x;
723         }
724         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
725         if (x)
726         {
727             *page_chr = "\033(1";
728             return x;
729         }
730         cd->my_errno = YAZ_ICONV_EILSEQ;
731         return x;
732     }
733 }
734
735 static size_t flush_combos(yaz_iconv_t cd,
736                            char **outbuf, size_t *outbytesleft)
737 {
738     unsigned long y = cd->write_marc8_last;
739     unsigned char byte, second_half = 0;
740     char out_buf[10];
741     size_t i, out_no = 0;
742
743     if (!y)
744         return 0;
745
746     byte = (unsigned char )((y>>16) & 0xff);
747     if (byte)
748         out_buf[out_no++] = byte;
749     byte = (unsigned char)((y>>8) & 0xff);
750     if (byte)
751         out_buf[out_no++] = byte;
752     byte = (unsigned char )(y & 0xff);
753     if (byte)
754         out_buf[out_no++] = byte;
755
756     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
757     {
758         cd->my_errno = YAZ_ICONV_E2BIG;
759         return (size_t) (-1);
760     }
761
762     for (i = 0; i < cd->write_marc8_comb_no; i++)
763     {
764         /* all MARC-8 combined characters are simple bytes */
765         byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
766         if (byte == 0xEB)
767             second_half = 0xEC;
768         else if (byte == 0xFA)
769             second_half = 0xFB;
770
771         *(*outbuf)++ = byte;
772         (*outbytesleft)--;
773     }
774     memcpy(*outbuf, out_buf, out_no);
775     *outbuf += out_no;
776     (*outbytesleft) -= out_no;
777     if (second_half)
778     {
779         *(*outbuf)++ = second_half;
780         (*outbytesleft)--;
781     }        
782
783     cd->write_marc8_last = 0;
784     cd->write_marc8_comb_no = 0;
785     return 0;
786 }
787
788 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
789                               char **outbuf, size_t *outbytesleft,
790                               int last)
791 {
792     int comb = 0;
793     const char *page_chr = 0;
794     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
795
796     if (!y)
797         return (size_t) (-1);
798
799     if (comb)
800     {
801         if (cd->write_marc8_comb_no < 6)
802             cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
803     }
804     else
805     {
806         size_t r = flush_combos(cd, outbuf, outbytesleft);
807         if (r)
808             return r;
809         if (strcmp(page_chr, cd->write_marc8_page_chr))
810         {
811             size_t plen = strlen(page_chr);
812
813             if (*outbytesleft < plen)
814             {
815                 cd->my_errno = YAZ_ICONV_E2BIG;
816                 return (size_t) (-1);
817             }
818             memcpy(*outbuf, page_chr, plen);
819             (*outbuf) += plen;
820             (*outbytesleft) -= plen;
821             cd->write_marc8_page_chr = page_chr;            
822         }
823         cd->write_marc8_last = y;
824     }
825     if (last)
826     {
827         size_t r = flush_combos(cd, outbuf, outbytesleft);
828         if (r)
829         {
830             if (comb)
831                 cd->write_marc8_comb_no--;
832             else
833                 cd->write_marc8_last = 0;
834             return r;
835         }
836     }
837     return 0;
838 }
839
840 #if HAVE_WCHAR_H
841 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
842                                  char **outbuf, size_t *outbytesleft,
843                                  int last)
844 {
845     unsigned char *outp = (unsigned char *) *outbuf;
846
847     if (*outbytesleft >= sizeof(wchar_t))
848     {
849         wchar_t wch = x;
850         memcpy(outp, &wch, sizeof(wch));
851         outp += sizeof(wch);
852         (*outbytesleft) -= sizeof(wch);
853     }
854     else
855     {
856         cd->my_errno = YAZ_ICONV_E2BIG;
857         return (size_t)(-1);
858     }
859     *outbuf = (char *) outp;
860     return 0;
861 }
862 #endif
863
864 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
865 {
866     return cd->read_handle && cd->write_handle;
867 }
868
869 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
870 {
871     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
872
873     cd->write_handle = 0;
874     cd->read_handle = 0;
875     cd->init_handle = 0;
876     cd->my_errno = YAZ_ICONV_UNKNOWN;
877     cd->marc8_esc_mode = 'B';
878     cd->comb_offset = cd->comb_size = 0;
879     cd->compose_char = 0;
880
881     cd->write_marc8_comb_no = 0;
882     cd->write_marc8_last = 0;
883     cd->write_marc8_page_chr = "\033(B";
884
885     /* a useful hack: if fromcode has leading @,
886        the library not use YAZ's own conversions .. */
887     if (fromcode[0] == '@')
888         fromcode++;
889     else
890     {
891         if (!yaz_matchstr(fromcode, "UTF8"))
892         {
893             cd->read_handle = yaz_read_UTF8;
894             cd->init_handle = yaz_init_UTF8;
895         }
896         else if (!yaz_matchstr(fromcode, "ISO88591"))
897             cd->read_handle = yaz_read_ISO8859_1;
898         else if (!yaz_matchstr(fromcode, "UCS4"))
899             cd->read_handle = yaz_read_UCS4;
900         else if (!yaz_matchstr(fromcode, "UCS4LE"))
901             cd->read_handle = yaz_read_UCS4LE;
902         else if (!yaz_matchstr(fromcode, "MARC8"))
903             cd->read_handle = yaz_read_marc8;
904         else if (!yaz_matchstr(fromcode, "MARC8s"))
905             cd->read_handle = yaz_read_marc8s;
906 #if HAVE_WCHAR_H
907         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
908             cd->read_handle = yaz_read_wchar_t;
909 #endif
910         
911         if (!yaz_matchstr(tocode, "UTF8"))
912             cd->write_handle = yaz_write_UTF8;
913         else if (!yaz_matchstr(tocode, "ISO88591"))
914             cd->write_handle = yaz_write_ISO8859_1;
915         else if (!yaz_matchstr (tocode, "UCS4"))
916             cd->write_handle = yaz_write_UCS4;
917         else if (!yaz_matchstr(tocode, "UCS4LE"))
918             cd->write_handle = yaz_write_UCS4LE;
919         else if (!yaz_matchstr(tocode, "MARC8"))
920             cd->write_handle = yaz_write_marc8;
921         else if (!yaz_matchstr(tocode, "MARC8s"))
922             cd->write_handle = yaz_write_marc8;
923 #if HAVE_WCHAR_H
924         else if (!yaz_matchstr(tocode, "WCHAR_T"))
925             cd->write_handle = yaz_write_wchar_t;
926 #endif
927     }
928 #if HAVE_ICONV_H
929     cd->iconv_cd = 0;
930     if (!cd->read_handle || !cd->write_handle)
931     {
932         cd->iconv_cd = iconv_open (tocode, fromcode);
933         if (cd->iconv_cd == (iconv_t) (-1))
934         {
935             xfree (cd);
936             return 0;
937         }
938     }
939 #else
940     if (!cd->read_handle || !cd->write_handle)
941     {
942         xfree (cd);
943         return 0;
944     }
945 #endif
946     cd->init_flag = 1;
947     return cd;
948 }
949
950 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
951                  char **outbuf, size_t *outbytesleft)
952 {
953     char *inbuf0;
954     size_t r = 0;
955
956 #if HAVE_ICONV_H
957     if (cd->iconv_cd)
958     {
959         size_t r =
960             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
961         if (r == (size_t)(-1))
962         {
963             switch (yaz_errno())
964             {
965             case E2BIG:
966                 cd->my_errno = YAZ_ICONV_E2BIG;
967                 break;
968             case EINVAL:
969                 cd->my_errno = YAZ_ICONV_EINVAL;
970                 break;
971             case EILSEQ:
972                 cd->my_errno = YAZ_ICONV_EILSEQ;
973                 break;
974             default:
975                 cd->my_errno = YAZ_ICONV_UNKNOWN;
976             }
977         }
978         return r;
979     }
980 #endif
981     if (inbuf == 0 || *inbuf == 0)
982     {
983         cd->init_flag = 1;
984         cd->my_errno = YAZ_ICONV_UNKNOWN;
985         return 0;
986     }
987     inbuf0 = *inbuf;
988
989     if (cd->init_flag)
990     {
991         if (cd->init_handle)
992         {
993             size_t no_read;
994             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
995                                          *inbytesleft, &no_read);
996             if (r)
997             {
998                 if (cd->my_errno == YAZ_ICONV_EINVAL)
999                     return r;
1000                 cd->init_flag = 0;
1001                 return r;
1002             }
1003             *inbytesleft -= no_read;
1004             *inbuf += no_read;
1005         }
1006         cd->init_flag = 0;
1007         cd->unget_x = 0;
1008         cd->no_read_x = 0;
1009     }
1010     while (1)
1011     {
1012         unsigned long x;
1013         size_t no_read;
1014
1015         if (*inbytesleft == 0)
1016         {
1017             r = *inbuf - inbuf0;
1018             break;
1019         }
1020         if (!cd->unget_x)
1021         {
1022             x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1023                                   &no_read);
1024             if (no_read == 0)
1025             {
1026                 r = (size_t)(-1);
1027                 break;
1028             }
1029         }
1030         else
1031         {
1032             x = cd->unget_x;
1033             no_read = cd->no_read_x;
1034         }
1035         if (x)
1036         {
1037             r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1038                                    (*inbytesleft - no_read) == 0 ? 1 : 0);
1039             if (r)
1040             {
1041                 /* unable to write it. save it because read_handle cannot
1042                    rewind .. */
1043                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1044                 {
1045                     cd->unget_x = x;
1046                     cd->no_read_x = no_read;
1047                     break;
1048                 }
1049             }
1050             cd->unget_x = 0;
1051         }
1052         *inbytesleft -= no_read;
1053         (*inbuf) += no_read;
1054     }
1055     return r;
1056 }
1057
1058 int yaz_iconv_error (yaz_iconv_t cd)
1059 {
1060     return cd->my_errno;
1061 }
1062
1063 int yaz_iconv_close (yaz_iconv_t cd)
1064 {
1065 #if HAVE_ICONV_H
1066     if (cd->iconv_cd)
1067         iconv_close (cd->iconv_cd);
1068 #endif
1069     xfree (cd);
1070     return 0;
1071 }
1072
1073 /*
1074  * Local variables:
1075  * c-basic-offset: 4
1076  * indent-tabs-mode: nil
1077  * End:
1078  * vim: shiftwidth=4 tabstop=8 expandtab
1079  */
1080