1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2011 Index Data
3 * See the file LICENSE for details.
7 * \brief UTF-8 encoding / decoding
/* Decoder init hook for UTF-8 input: looks for a leading byte order mark.
 *
 * NOTE(review): this excerpt omits some original lines (the embedded line
 * numbers skip), so only the visible statements are described here.
 *
 *  cd           conversion handle; receives the error code on failure.
 *  d            decoder state (not referenced in the visible lines).
 *  inbytesleft  number of bytes available at inp.
 *  no_read      out: presumably the number of BOM bytes to skip -- the
 *               assignments are not visible here; confirm.
 */
21 static size_t init_utf8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
23 size_t inbytesleft, size_t *no_read)
/* No input, or the first byte is not 0xef: cannot be a UTF-8 BOM. */
25 if (!inp || inp[0] != 0xef)
/* Error path reporting EINVAL; its guard is not visible here (presumably
 * fewer than three bytes available -- confirm). */
32 yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL);
/* The UTF-8 BOM is the byte sequence EF BB BF, so BOTH trailing bytes
 * must match.  Testing inp[1] with != would reject the real BOM and
 * accept EF xx BF for any xx other than BB. */
35 if (inp[1] == 0xbb && inp[2] == 0xbf)
/* Decode a single UTF-8 encoded character starting at inp.
 *
 * NOTE(review): this excerpt omits some original lines (the embedded line
 * numbers skip); comments below describe only the visible statements and
 * hedge where a guard or assignment is not shown.
 *
 *  inp          input bytes.
 *  inbytesleft  number of bytes available at inp.
 *  no_read      out: reset to 0 up front; presumably set to the number of
 *               bytes consumed on success (assignments not visible).
 *  error        out (parameter declaration not visible in this excerpt):
 *               receives YAZ_ICONV_EILSEQ or YAZ_ICONV_EINVAL on failure.
 *
 * Returns the decoded value as an unsigned long.  Sequences of two to six
 * bytes are handled by the visible branches, selected by the lead byte
 * inp[0]; each branch also requires that enough input bytes remain.
 */
42 unsigned long yaz_read_UTF8_char(unsigned char *inp,
43 size_t inbytesleft, size_t *no_read,
48 *no_read = 0; /* by default */
/* Lead byte at or below 0xbf, or 0xfe/0xff: not a valid multi-byte lead.
 * (This is an else-branch; the preceding test is not visible -- presumably
 * it handles single-byte values, confirm.) */
54 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
56 *error = YAZ_ICONV_EILSEQ;
/* Two-byte form: lead byte up to 0xdf, needs 2 bytes of input. */
58 else if (inp[0] <= 0xdf && inbytesleft >= 2)
/* Continuation bytes must have the bit pattern 10xxxxxx. */
60 if ((inp[1] & 0xc0) == 0x80)
/* 5 payload bits from the lead byte, 6 from the continuation byte. */
62 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
/* Two illegal-sequence exits; their guards are not visible here (one is
 * presumably the failed continuation check above -- confirm). */
66 *error = YAZ_ICONV_EILSEQ;
69 *error = YAZ_ICONV_EILSEQ;
/* Three-byte form: lead byte up to 0xef, needs 3 bytes of input. */
71 else if (inp[0] <= 0xef && inbytesleft >= 3)
73 if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80)
/* 4 payload bits from the lead byte; the rest of this expression
 * continues on a line not visible here. */
75 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
80 *error = YAZ_ICONV_EILSEQ;
83 *error = YAZ_ICONV_EILSEQ;
/* Four-byte form: lead byte up to 0xf7, needs 4 bytes of input. */
85 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
87 if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80
88 && (inp[3] & 0xc0) == 0x80)
/* 3 payload bits from the lead byte, 6 from each continuation byte. */
90 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
91 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
95 *error = YAZ_ICONV_EILSEQ;
98 *error = YAZ_ICONV_EILSEQ;
/* Five-byte form: lead byte up to 0xfb, needs 5 bytes of input. */
100 else if (inp[0] <= 0xfb && inbytesleft >= 5)
102 if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80
103 && (inp[3] & 0xc0) == 0x80 && (inp[4] & 0xc0) == 0x80)
/* 2 payload bits from the lead byte; the tail of this expression
 * continues on a line not visible here. */
105 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
106 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
111 *error = YAZ_ICONV_EILSEQ;
114 *error = YAZ_ICONV_EILSEQ;
/* Six-byte form: lead byte up to 0xfd, needs 6 bytes of input. */
116 else if (inp[0] <= 0xfd && inbytesleft >= 6)
118 if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80
119 && (inp[3] & 0xc0) == 0x80 && (inp[4] & 0xc0) == 0x80
120 && (inp[5] & 0xc0) == 0x80)
/* 1 payload bit from the lead byte, 6 from each of five continuation
 * bytes: at most 31 significant bits. */
122 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
123 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
124 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
128 *error = YAZ_ICONV_EILSEQ;
131 *error = YAZ_ICONV_EILSEQ;
/* Presumably the final else of the chain: no branch matched, which for an
 * otherwise acceptable lead byte means too few input bytes remain. */
134 *error = YAZ_ICONV_EINVAL; /* incomplete sentence */
/* Decoder read hook: decodes one character via yaz_read_UTF8_char and
 * records the resulting error code on the conversion handle cd.
 *
 * NOTE(review): this excerpt omits some original lines (the declaration of
 * err and the return statement are not visible).
 */
139 static unsigned long read_utf8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
141 size_t inbytesleft, size_t *no_read)
/* NOTE(review): yaz_read_UTF8_char returns unsigned long but the result is
 * held in an int here; consider unsigned long to avoid the narrowing. */
144 int r = yaz_read_UTF8_char(inp, inbytesleft, no_read, &err);
/* Called unconditionally in the visible code, so err is forwarded to cd
 * whether or not decoding failed. */
145 yaz_iconv_set_errno(cd, err);
/* Encoder write hook: encodes one character via yaz_write_UTF8_char and
 * records the resulting error code on the conversion handle cd.
 *
 * NOTE(review): this excerpt omits some original lines (the parameter
 * carrying the character x, the declaration of err and the return
 * statement are not visible).
 */
150 static size_t write_UTF8(yaz_iconv_t cd, yaz_iconv_encoder_t en,
152 char **outbuf, size_t *outbytesleft)
/* NOTE(review): yaz_write_UTF8_char returns size_t but the result is held
 * in an int here. */
155 int r = yaz_write_UTF8_char(x, outbuf, outbytesleft, &err);
/* Called unconditionally in the visible code, so err is forwarded to cd
 * whether or not encoding failed. */
156 yaz_iconv_set_errno(cd, err);
/* Encode the value x as UTF-8 into the caller's output buffer.
 *
 * NOTE(review): this excerpt omits some original lines (the embedded line
 * numbers skip); comments below describe only the visible statements.
 *
 *  x             value to encode.
 *  outbuf        in/out: pointer to the write position; updated to point
 *                past the bytes written.
 *  outbytesleft  in/out: space remaining; decremented by the number of
 *                bytes written in the visible branches.
 *  error         out (parameter declaration not visible in this excerpt):
 *                receives YAZ_ICONV_E2BIG when the output does not fit.
 *
 * Each branch is chosen by the magnitude of x AND by the space remaining.
 * When x fits a form but the buffer is too small for it, every later form
 * needs even more room, so control falls through to the E2BIG path.
 */
160 size_t yaz_write_UTF8_char(unsigned long x,
161 char **outbuf, size_t *outbytesleft,
/* Work on an unsigned byte pointer; written back to *outbuf at the end. */
164 unsigned char *outp = (unsigned char *) *outbuf;
/* One byte: values up to 0x7f are stored unchanged. */
166 if (x <= 0x7f && *outbytesleft >= 1)
168 *outp++ = (unsigned char) x;
/* Two bytes: 110xxxxx 10xxxxxx. */
171 else if (x <= 0x7ff && *outbytesleft >= 2)
173 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
174 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
175 (*outbytesleft) -= 2;
/* Three bytes: 1110xxxx followed by two continuation bytes. */
177 else if (x <= 0xffff && *outbytesleft >= 3)
179 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
180 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
181 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
182 (*outbytesleft) -= 3;
/* Four bytes: 11110xxx followed by three continuation bytes. */
184 else if (x <= 0x1fffff && *outbytesleft >= 4)
186 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
187 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
188 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
189 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
190 (*outbytesleft) -= 4;
/* Five bytes: 111110xx followed by four continuation bytes. */
192 else if (x <= 0x3ffffff && *outbytesleft >= 5)
194 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
195 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
196 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
197 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
198 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
199 (*outbytesleft) -= 5;
/* Six bytes: 1111110x followed by five continuation bytes.
 * NOTE(review): there is no upper bound on x here; for x above 0x7fffffff
 * (x >> 30) exceeds one bit and the lead byte leaves the 0xfc..0xfd range. */
201 else if (*outbytesleft >= 6)
203 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
204 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
205 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
206 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
207 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
208 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
209 (*outbytesleft) -= 6;
/* Presumably the final else of the chain: no form fits the space left. */
213 *error = YAZ_ICONV_E2BIG; /* not room for output */
/* Publish the advanced write position to the caller. */
216 *outbuf = (char *) outp;
/* Set up encoder e for UTF-8 output when tocode names UTF-8.
 *
 * NOTE(review): this excerpt omits some original lines; the return
 * statements are not visible.
 */
220 yaz_iconv_encoder_t yaz_utf8_encoder(const char *tocode,
221 yaz_iconv_encoder_t e)
/* yaz_matchstr is defined elsewhere; the negation suggests it returns 0 on
 * a match (strcmp-style) -- confirm. */
224 if (!yaz_matchstr(tocode, "UTF8"))
/* Install the UTF-8 writer as this encoder's write hook. */
226 e->write_handle = write_UTF8;
/* Set up decoder d for UTF-8 input when fromcode names UTF-8.
 *
 * NOTE(review): this excerpt omits some original lines; the return
 * statements are not visible.
 */
232 yaz_iconv_decoder_t yaz_utf8_decoder(const char *fromcode,
233 yaz_iconv_decoder_t d)
/* yaz_matchstr is defined elsewhere; the negation suggests it returns 0 on
 * a match (strcmp-style) -- confirm. */
235 if (!yaz_matchstr(fromcode, "UTF8"))
/* Install the BOM-detecting init hook and the UTF-8 reader. */
237 d->init_handle = init_utf8;
238 d->read_handle = read_utf8;
248 * c-file-style: "Stroustrup"
249 * indent-tabs-mode: nil
251 * vim: shiftwidth=4 tabstop=8 expandtab