1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2008 Index Data
3 * See the file LICENSE for details.
7 * \brief Implements simple ICONV
9 * This implements an interface similar to that of iconv and
10 * is used by YAZ to interface with iconv (if present).
11 * For systems where iconv is not present, this layer
12 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
32 #include <yaz/xmalloc.h>
/**
 * MARC-8 table conversion routines, one per character set (named
 * yaz_marc8_?_conv and generated by charconv.tcl, per the note in
 * yaz_read_marc8).
 *
 * The hex digits in each name are the ASCII code of the escape-sequence
 * final character that selects the set in yaz_read_marc8_comb:
 *   42 'B' (also used for 's') ASCII      45 'E' ANSEL
 *   67 'g' Greek                          62 'b' Subscripts
 *   70 'p' Superscripts                   32 '2' Basic Hebrew
 *   4E 'N' Basic Cyrillic                 51 'Q' Extended Cyrillic
 *   33 '3' Basic Arabic                   34 '4' Extended Arabic
 *   53 'S' Greek                          31 '1' Chinese/Japanese/Korean (EACC)
 */
36 yaz_conv_func_t yaz_marc8_42_conv;
37 yaz_conv_func_t yaz_marc8_45_conv;
38 yaz_conv_func_t yaz_marc8_67_conv;
39 yaz_conv_func_t yaz_marc8_62_conv;
40 yaz_conv_func_t yaz_marc8_70_conv;
41 yaz_conv_func_t yaz_marc8_32_conv;
42 yaz_conv_func_t yaz_marc8_4E_conv;
43 yaz_conv_func_t yaz_marc8_51_conv;
44 yaz_conv_func_t yaz_marc8_33_conv;
45 yaz_conv_func_t yaz_marc8_34_conv;
46 yaz_conv_func_t yaz_marc8_53_conv;
47 yaz_conv_func_t yaz_marc8_31_conv;
/**
 * State of one conversion descriptor (the object a yaz_iconv_t refers to).
 * It holds the input decoder callbacks, the MARC-8 look-ahead buffer and
 * the output encoder.
 */
49 struct yaz_iconv_struct {
/* Optional input initializer (yaz_iconv_open sets it for UTF-8 only).
   yaz_iconv calls it on the input buffer and subtracts the reported
   no_read from *inbytesleft. */
52 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
53 size_t inbytesleft, size_t *no_read);
/* Input decoder: returns one character value read from inbuf and reports
   the number of bytes it consumed through no_read. */
54 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
55 size_t inbytesleft, size_t *no_read);
/* Look-ahead buffer filled by yaz_read_marc8: character values returned by
   yaz_read_marc8_comb and, in parallel, the byte count of each one. */
61 unsigned long comb_x[8];
62 size_t comb_no_read[8];
/* Character handed to the encoder's write_handle when yaz_iconv is called
   without input. NOTE(review): presumably a character whose earlier write
   failed for lack of output space - confirm. */
64 unsigned long unget_x;
/* Output encoder state (data plus write/flush/init/destroy handles),
   filled in by prepare_encoders. */
68 struct yaz_iconv_encoder_s encoder;
/**
 * Input decoder for ISO-8859-1 (installed by yaz_iconv_open for
 * "ISO88591"). The character value is taken unchanged from the first
 * input byte.
 * NOTE(review): the byte count stored in *no_read is presumably 1 - confirm.
 */
72 static unsigned long yaz_read_ISO8859_1(yaz_iconv_t cd, unsigned char *inp,
73 size_t inbytesleft, size_t *no_read)
75 unsigned long x = inp[0];
/**
 * Input decoder for native wchar_t data (installed by yaz_iconv_open for
 * "WCHAR_T").
 *
 * When fewer than sizeof(wchar_t) bytes remain, cd->my_errno is set to
 * YAZ_ICONV_EINVAL. Otherwise one wchar_t is copied out of the buffer with
 * memcpy, which avoids reading through a possibly misaligned wchar_t
 * pointer, and *no_read is set to sizeof(wchar_t).
 */
81 static unsigned long yaz_read_wchar_t(yaz_iconv_t cd, unsigned char *inp,
82 size_t inbytesleft, size_t *no_read)
86 if (inbytesleft < sizeof(wchar_t))
88 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
94 memcpy(&wch, inp, sizeof(wch));
96 *no_read = sizeof(wch);
/* Forward declaration: yaz_read_marc8 calls this routine before its
   definition appears further down in the file. */
103 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
104 size_t inbytesleft, size_t *no_read,
/**
 * Input decoder for MARC-8 (installed by yaz_iconv_open for "MARC8").
 *
 * Characters come from yaz_read_marc8_comb and pass through the look-ahead
 * buffer cd->comb_x / cd->comb_no_read, which holds at most 8 entries:
 *  - while buffered entries remain (comb_offset < comb_size), the next
 *    entry is returned together with the byte count recorded for it;
 *  - otherwise the buffer is refilled, storing each value returned by
 *    yaz_read_marc8_comb with its *no_read and reducing inbytesleft by
 *    that amount;
 *  - if the input runs out while at least one entry is already buffered,
 *    cd->my_errno is set to YAZ_ICONV_EINVAL.
 *
 * NOTE(review): the comb flag written by yaz_read_marc8_comb presumably
 * marks a combining character and decides when the refill loop stops;
 * confirm against the full loop body.
 */
107 static unsigned long yaz_read_marc8(yaz_iconv_t cd, unsigned char *inp,
108 size_t inbytesleft, size_t *no_read)
111 if (cd->comb_offset < cd->comb_size)
113 *no_read = cd->comb_no_read[cd->comb_offset];
114 x = cd->comb_x[cd->comb_offset];
116 /* special case for double-diacritic combining characters,
117 INVERTED BREVE and DOUBLE TILDE.
118 We'll increment the no_read counter by 1, since we want to skip over
119 the processing of the closing ligature character
121 /* this code is no longer necessary.. our handlers code in
122 yaz_marc8_?_conv (generated by charconv.tcl) now returns
123 0 and no_read=1 when a sequence does not match the input.
124 The SECOND HALFs in codetables.xml produces a non-existant
125 entry in the conversion trie.. Hence when met, the input byte is
126 skipped as it should (in yaz_iconv)
129 if (x == 0x0361 || x == 0x0360)
137 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
141 if (inbytesleft == 0 && cd->comb_size)
143 cd->my_errno = YAZ_ICONV_EINVAL;
148 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
151 cd->comb_x[cd->comb_size] = x;
152 cd->comb_no_read[cd->comb_size] = *no_read;
154 inbytesleft = inbytesleft - *no_read;
/**
 * Variant of the MARC-8 decoder installed by yaz_iconv_open for "MARC8s".
 *
 * It reads a character with yaz_read_marc8. When that character is
 * non-zero and exactly one entry is buffered (comb_size == 1), it calls
 * yaz_iso_8859_1_lookup_x12 with x and comb_x[0]. If the lookup returns
 * non-zero, x has been replaced through the output argument and the byte
 * count of the buffered entry is added to *no_read.
 * NOTE(review): presumably this folds a base character and one combining
 * mark into a single precomposed Latin-1 character - confirm.
 */
159 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
160 size_t inbytesleft, size_t *no_read)
162 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
163 if (x && cd->comb_size == 1)
165 if (yaz_iso_8859_1_lookup_x12(x, cd->comb_x[0], &x))
167 *no_read += cd->comb_no_read[0];
/**
 * Reads one character from MARC-8 input, handling character-set switching.
 *
 * Escape sequences (introduced by byte 27) in front of the character are
 * consumed first. The final byte of each sequence is stored as the new
 * mode of G0 (cd->g0_mode) or G1 (cd->g1_mode), and the length of the
 * sequence is added to *no_read.
 *
 * The character itself is converted by the table routine that belongs to
 * the active mode: g0_mode for bytes below 128, g1_mode for the rest. The
 * bytes that routine consumed (no_read_sub) are added to *no_read.
 *
 * cd->my_errno is set to YAZ_ICONV_EILSEQ or YAZ_ICONV_EINVAL on failure.
 * NOTE(review): EILSEQ presumably belongs to the default branch of the
 * switch (unknown mode) and EINVAL to input that ends inside an escape
 * sequence - confirm against the complete control flow.
 */
174 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
175 size_t inbytesleft, size_t *no_read,
/* Consume every escape sequence that precedes the character. */
179 while (inbytesleft > 0 && *inp == 27)
/* The sequence targets G0 unless a G1 intermediate byte is seen below. */
181 int *modep = &cd->g0_mode;
/* Remembered so the length of the sequence can be computed afterwards. */
182 size_t inbytesleft0 = inbytesleft;
186 if (inbytesleft == 0)
188 if (*inp == '$') /* set with multiple bytes */
193 if (inbytesleft == 0)
195 if (*inp == '(' || *inp == ',') /* G0 */
200 else if (*inp == ')' || *inp == '-') /* G1 */
204 modep = &cd->g1_mode;
206 if (inbytesleft == 0)
208 if (*inp == '!') /* ANSEL is a special case */
213 if (inbytesleft == 0)
215 *modep = *inp++; /* Final character */
/* Account for the bytes taken by this escape sequence. */
218 (*no_read) += inbytesleft0 - inbytesleft;
220 if (inbytesleft == 0)
222 else if (*inp == ' ')
230 size_t no_read_sub = 0;
/* Bytes below 128 are decoded with the G0 set, all others with G1. */
231 int mode = *inp < 128 ? cd->g0_mode : cd->g1_mode;
/* Every table routine receives the same arguments; only ANSEL is called
   with 128 instead of 0 as the last one. NOTE(review): the meaning of the
   two trailing arguments should be confirmed against yaz_conv_func_t. */
236 case 'B': /* Basic ASCII */
237 case 's': /* ASCII */
238 x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
240 case 'E': /* ANSEL */
241 x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
243 case 'g': /* Greek */
244 x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
246 case 'b': /* Subscripts */
247 x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
249 case 'p': /* Superscripts */
250 x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
252 case '2': /* Basic Hebrew */
253 x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
255 case 'N': /* Basic Cyrillic */
256 x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
258 case 'Q': /* Extended Cyrillic */
259 x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
261 case '3': /* Basic Arabic */
262 x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
264 case '4': /* Extended Arabic */
265 x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
267 case 'S': /* Greek */
268 x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
270 case '1': /* Chinese, Japanese, Korean (EACC) */
271 x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
275 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Add the bytes consumed by the table routine to the escape bytes. */
278 *no_read += no_read_sub;
283 cd->my_errno = YAZ_ICONV_EINVAL;
/**
 * Tells whether the descriptor converts with YAZ's own code.
 * Returns non-zero when both a built-in input decoder (read_handle) and a
 * built-in output encoder (encoder.write_handle) are installed, 0 otherwise.
 */
289 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
291 return cd->read_handle && cd->encoder.write_handle;
/**
 * Offers the target encoding name to each built-in encoder in a fixed
 * order: MARC-8, UTF-8, UCS-4, ISO-8859-1, ISO-5428, Advance Greek and
 * wchar_t. Each candidate receives tocode and &cd->encoder so that it can
 * install its handles there.
 * NOTE(review): presumably the chain stops at the first encoder that
 * accepts tocode and the return value reports whether one did; the only
 * caller, yaz_iconv_open, ignores the result and tests
 * cd->encoder.write_handle instead.
 */
295 static int prepare_encoders(yaz_iconv_t cd, const char *tocode)
297 if (yaz_marc8_encoder(tocode, &cd->encoder))
299 if (yaz_utf8_encoder(tocode, &cd->encoder))
301 if (yaz_ucs4_encoder(tocode, &cd->encoder))
303 if (yaz_iso_8859_1_encoder(tocode, &cd->encoder))
305 if (yaz_iso_5428_encoder(tocode, &cd->encoder))
307 if (yaz_advancegreek_encoder(tocode, &cd->encoder))
309 if (yaz_wchar_encoder(tocode, &cd->encoder))
/**
 * Creates a conversion descriptor for converting fromcode to tocode.
 *
 * The descriptor is allocated with xmalloc, all encoder handles are
 * cleared and my_errno starts as YAZ_ICONV_UNKNOWN. A built-in input
 * decoder is then chosen by comparing fromcode, via yaz_matchstr, against
 * UTF8, ISO88591, UCS4, UCS4LE, MARC8, MARC8s, advancegreek, iso54281984,
 * iso5428:1984 and WCHAR_T. UTF8 additionally installs an init_handle.
 * prepare_encoders selects the output encoder for tocode.
 *
 * If both a decoder and an encoder were found, the built-in conversion is
 * used. Otherwise iconv_open(tocode, fromcode) is tried, and its result is
 * compared with (iconv_t)(-1) to detect failure.
 * NOTE(review): what is returned on failure and how a leading '@' in
 * fromcode bypasses the built-in selection should be confirmed against
 * the complete function.
 */
314 yaz_iconv_t yaz_iconv_open(const char *tocode, const char *fromcode)
316 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
318 cd->encoder.data = 0;
319 cd->encoder.write_handle = 0;
320 cd->encoder.flush_handle = 0;
321 cd->encoder.init_handle = 0;
322 cd->encoder.destroy_handle = 0;
326 cd->my_errno = YAZ_ICONV_UNKNOWN;
328 /* a useful hack: if fromcode has leading @,
329 the library will not use YAZ's own conversions .. */
330 if (fromcode[0] == '@')
334 if (!yaz_matchstr(fromcode, "UTF8"))
336 cd->read_handle = yaz_read_UTF8;
337 cd->init_handle = yaz_init_UTF8;
339 else if (!yaz_matchstr(fromcode, "ISO88591"))
340 cd->read_handle = yaz_read_ISO8859_1;
341 else if (!yaz_matchstr(fromcode, "UCS4"))
342 cd->read_handle = yaz_read_UCS4;
343 else if (!yaz_matchstr(fromcode, "UCS4LE"))
344 cd->read_handle = yaz_read_UCS4LE;
345 else if (!yaz_matchstr(fromcode, "MARC8"))
346 cd->read_handle = yaz_read_marc8;
347 else if (!yaz_matchstr(fromcode, "MARC8s"))
348 cd->read_handle = yaz_read_marc8s;
349 else if (!yaz_matchstr(fromcode, "advancegreek"))
350 cd->read_handle = yaz_read_advancegreek;
351 else if (!yaz_matchstr(fromcode, "iso54281984"))
352 cd->read_handle = yaz_read_iso5428_1984;
353 else if (!yaz_matchstr(fromcode, "iso5428:1984"))
354 cd->read_handle = yaz_read_iso5428_1984;
356 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
357 cd->read_handle = yaz_read_wchar_t;
/* The return value is ignored; success shows up as a non-null
   cd->encoder.write_handle, which is tested next. */
359 prepare_encoders(cd, tocode);
361 if (cd->read_handle && cd->encoder.write_handle)
371 cd->iconv_cd = iconv_open(tocode, fromcode);
372 if (cd->iconv_cd == (iconv_t) (-1))
/**
 * Converts characters from *inbuf to *outbuf; the argument list mirrors
 * that of iconv().
 *
 * System iconv path: the call is forwarded to iconv() on cd->iconv_cd. A
 * result of (size_t)(-1) is translated into one of YAZ_ICONV_E2BIG,
 * YAZ_ICONV_EINVAL, YAZ_ICONV_EILSEQ or YAZ_ICONV_UNKNOWN in cd->my_errno.
 *
 * Built-in path:
 *  - my_errno is reset to YAZ_ICONV_UNKNOWN, the MARC-8 look-ahead buffer
 *    is emptied (comb_offset = comb_size = 0) and the encoder's
 *    init_handle, if any, is run. NOTE(review): presumably only at the
 *    start of a conversion - confirm the guarding condition.
 *  - If cd->init_handle is set and input is present, it is run on the
 *    input and the bytes it reports are subtracted from *inbytesleft; an
 *    YAZ_ICONV_EINVAL outcome is checked for.
 *  - When inbuf or *inbuf is null and an output buffer is given,
 *    cd->unget_x is passed to the encoder's write_handle and the encoder's
 *    flush_handle, if any, is called.
 *  - Otherwise characters are fetched with cd->read_handle and handed to
 *    the encoder's write_handle, and *inbytesleft is reduced by the bytes
 *    the reader reported.
 *  - If a write fails with YAZ_ICONV_E2BIG, the reader's byte count is
 *    kept in cd->no_read_x and taken from there again later.
 *    NOTE(review): presumably the character itself is kept in
 *    cd->unget_x for the retry - confirm.
 */
386 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
387 char **outbuf, size_t *outbytesleft)
396 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
397 if (r == (size_t)(-1))
402 cd->my_errno = YAZ_ICONV_E2BIG;
405 cd->my_errno = YAZ_ICONV_EINVAL;
408 cd->my_errno = YAZ_ICONV_EILSEQ;
411 cd->my_errno = YAZ_ICONV_UNKNOWN;
423 cd->my_errno = YAZ_ICONV_UNKNOWN;
427 cd->comb_offset = cd->comb_size = 0;
429 if (cd->encoder.init_handle)
430 (*cd->encoder.init_handle)(&cd->encoder);
435 if (cd->init_handle && inbuf && *inbuf)
438 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
439 *inbytesleft, &no_read);
442 if (cd->my_errno == YAZ_ICONV_EINVAL)
447 *inbytesleft -= no_read;
453 if (!inbuf || !*inbuf)
455 if (outbuf && *outbuf)
458 r = (*cd->encoder.write_handle)(cd, &cd->encoder,
459 cd->unget_x, outbuf, outbytesleft);
460 if (cd->encoder.flush_handle)
461 r = (*cd->encoder.flush_handle)(cd, &cd->encoder,
462 outbuf, outbytesleft);
477 no_read = cd->no_read_x;
481 if (*inbytesleft == 0)
486 x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
496 r = (*cd->encoder.write_handle)(cd, &cd->encoder,
497 x, outbuf, outbytesleft);
500 /* unable to write it. save it because read_handle cannot
502 if (cd->my_errno == YAZ_ICONV_E2BIG)
505 cd->no_read_x = no_read;
511 *inbytesleft -= no_read;
/**
 * Reports the error state of a conversion descriptor.
 * NOTE(review): presumably returns cd->my_errno, one of the YAZ_ICONV_*
 * codes set by yaz_iconv - confirm against the function body.
 */
517 int yaz_iconv_error(yaz_iconv_t cd)
/**
 * Releases a conversion descriptor: closes the system iconv handle with
 * iconv_close() and runs the encoder's destroy_handle when one is set.
 * NOTE(review): the condition guarding iconv_close, the freeing of cd
 * itself and the return value should be confirmed against the complete
 * function.
 */
522 int yaz_iconv_close(yaz_iconv_t cd)
526 iconv_close(cd->iconv_cd);
528 if (cd->encoder.destroy_handle)
529 (*cd->encoder.destroy_handle)(&cd->encoder);
/**
 * Sets the error state of a conversion descriptor.
 * NOTE(review): presumably stores no in cd->my_errno so that encoders and
 * decoders in other files can report errors - confirm against the body.
 */
534 void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
542 * indent-tabs-mode: nil
544 * vim: shiftwidth=4 tabstop=8 expandtab