* Copyright (C) 1995-2005, Index Data ApS
* See the file LICENSE for details.
*
- * $Id: siconv.c,v 1.11 2005-02-07 11:23:18 adam Exp $
+ * $Id: siconv.c,v 1.12 2005-05-08 07:35:23 adam Exp $
*/
/**
* \file siconv.c
unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
size_t inbytesleft, size_t *no_read);
size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
- char **outbuf, size_t *outbytesleft);
+ char **outbuf, size_t *outbytesleft,
+ int last);
int marc8_esc_mode;
#if NEW_COMB
int comb_offset;
int marc8_comb_no_read;
#endif
size_t no_read_x;
- unsigned unget_x;
+ unsigned long unget_x;
#if HAVE_ICONV_H
iconv_t iconv_cd;
#endif
+ unsigned long compose_char;
};
static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
#endif
static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
- char **outbuf, size_t *outbytesleft)
+ char **outbuf, size_t *outbytesleft,
+ int last)
{
unsigned char *outp = (unsigned char *) *outbuf;
if (x <= 0x7f && *outbytesleft >= 1)
return 0;
}
+
static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
- char **outbuf, size_t *outbytesleft)
+ char **outbuf, size_t *outbytesleft,
+ int last)
{
+ /* List of two-character Unicode sequences that, when combined, are
+ equivalent to single Unicode characters that can be represented in
+ ISO-8859-1/Latin-1.
+ Regular iconv, on Linux at least, does not seem to convert these,
+ but since MARC-8 to UTF-8 conversion generates these sequences,
+ handling them here gives a better chance of a successful
+ MARC-8 -> ISO-8859-1 conversion. */
+ static struct {
+ unsigned long x1, x2;
+ unsigned y;
+ } comb[] = {
+ { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
+ { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
+ { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
+ { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
+ { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
+ { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
+ /* omitted: 0xc6 LATIN CAPITAL LETTER AE */
+ { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
+ { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
+ { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
+ { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
+ { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
+ { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
+ { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
+ { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
+ { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
+ { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
+ { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
+ { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
+ { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
+ { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
+ { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
+ /* omitted: 0xd7 MULTIPLICATION SIGN */
+ /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
+ { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
+ { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
+ { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
+ { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
+ { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
+ /* omitted: 0xde LATIN CAPITAL LETTER THORN */
+ /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
+ { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
+ { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
+ { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
+ { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
+ { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
+ { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
+ /* omitted: 0xe6 LATIN SMALL LETTER AE */
+ { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
+ { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
+ { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
+ { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
+ { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
+ { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
+ { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
+ { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
+ { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
+ /* omitted: 0xf0 LATIN SMALL LETTER ETH */
+ { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
+ { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
+ { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
+ { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
+ { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
+ { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
+ /* omitted: 0xf7 DIVISION SIGN */
+ /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
+ { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
+ { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
+ { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
+ { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
+ { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
+ /* omitted: 0xfe LATIN SMALL LETTER THORN */
+ { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
+
+ { 0, 0, 0}
+ };
unsigned char *outp = (unsigned char *) *outbuf;
+
+ if (!last && x > 32 && x < 127 && cd->compose_char == 0)
+ {
+ cd->compose_char = x;
+ return 0;
+ }
+ else if (cd->compose_char)
+ {
+ int i;
+ for (i = 0; comb[i].x1; i++)
+ if (cd->compose_char == comb[i].x1 && x == comb[i].x2)
+ {
+ x = comb[i].y;
+ break;
+ }
+ if (!comb[i].x1)
+ { /* not found */
+ if (*outbytesleft >= 1)
+ {
+ *outp++ = (unsigned char) cd->compose_char;
+ (*outbytesleft)--;
+ *outbuf = (char *) outp;
+ if (!last && x > 32 && x < 127)
+ {
+ cd->compose_char = x;
+ return 0;
+ }
+ }
+ else
+ {
+ cd->my_errno = YAZ_ICONV_E2BIG;
+ return (size_t)(-1);
+ }
+ }
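+ /* compose_char has been consumed: either it was combined with the
+ old x into one new char (now in x), or it was written out on its
+ own above and x is still to be written */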
+ cd->compose_char = 0;
+ }
if (x > 255 || x < 1)
{
cd->my_errno = YAZ_ICONV_EILSEQ;
static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
- char **outbuf, size_t *outbytesleft)
+ char **outbuf, size_t *outbytesleft,
+ int last)
{
unsigned char *outp = (unsigned char *) *outbuf;
if (*outbytesleft >= 4)
}
static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
- char **outbuf, size_t *outbytesleft)
+ char **outbuf, size_t *outbytesleft,
+ int last)
{
unsigned char *outp = (unsigned char *) *outbuf;
if (*outbytesleft >= 4)
#if HAVE_WCHAR_H
static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
- char **outbuf, size_t *outbytesleft)
+ char **outbuf, size_t *outbytesleft,
+ int last)
{
unsigned char *outp = (unsigned char *) *outbuf;
#else
cd->marc8_comb_x = 0;
#endif
+ cd->compose_char = 0;
/* a useful hack: if fromcode has a leading @,
the library does not use YAZ's own conversions .. */
}
if (x)
{
- r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
+ r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
+ (*inbytesleft - no_read) == 0 ? 1 : 0);
if (r)
{
/* unable to write it. save it because read_handle cannot