--- /dev/null
+/* This file is part of the YAZ toolkit.
+ * Copyright (C) 1995-2008 Index Data
+ * See the file LICENSE for details.
+ */
+/**
+ * \file
+ * \brief MARC-8 decoding
+ *
+ * MARC-8 reference:
+ * http://www.loc.gov/marc/specifications/speccharmarc8.html
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <ctype.h>
+
+#include <yaz/xmalloc.h>
+#include "iconv-p.h"
+
+/** \brief State kept by the MARC-8 decoder between read calls. */
+struct decoder_data {
+ int g0_mode; /* final character of the set used for input bytes < 128 */
+ int g1_mode; /* final character of the set used for input bytes >= 128 */
+
+ int comb_offset; /* index of the next buffered combining character to hand out */
+ int comb_size; /* number of combining characters currently buffered */
+ unsigned long comb_x[8]; /* buffered combining characters, as returned by the table lookups */
+ size_t comb_no_read[8]; /* input bytes consumed for the matching comb_x entry */
+};
+
+/* Character table lookups, one per MARC-8 character set (generated by
+ * charconv.tcl). The two hex digits in each name are the ASCII code of the
+ * final character that selects the set in yaz_read_marc8_comb, e.g.
+ * 42 = 'B', 45 = 'E', 67 = 'g', 31 = '1'.
+ */
+yaz_conv_func_t yaz_marc8_42_conv;
+yaz_conv_func_t yaz_marc8_45_conv;
+yaz_conv_func_t yaz_marc8_67_conv;
+yaz_conv_func_t yaz_marc8_62_conv;
+yaz_conv_func_t yaz_marc8_70_conv;
+yaz_conv_func_t yaz_marc8_32_conv;
+yaz_conv_func_t yaz_marc8_4E_conv;
+yaz_conv_func_t yaz_marc8_51_conv;
+yaz_conv_func_t yaz_marc8_33_conv;
+yaz_conv_func_t yaz_marc8_34_conv;
+yaz_conv_func_t yaz_marc8_53_conv;
+yaz_conv_func_t yaz_marc8_31_conv;
+
+
+/* Forward declaration: reads a single character (after any escape
+ * sequences) and reports through comb whether it is a combining character.
+ * Defined further down; needed here because read_marc8 calls it.
+ */
+static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd,
+ struct decoder_data *data,
+ unsigned char *inp,
+ size_t inbytesleft, size_t *no_read,
+ int *comb);
+
+/**
+ * \brief Reads one character from MARC-8 input, moving diacritics behind
+ *        their base character.
+ *
+ * The function looks ahead over a run of combining characters, parks them
+ * in data->comb_x / data->comb_no_read and returns the first non-combining
+ * character that follows. The parked characters are then handed out, one
+ * per call, by the following calls. Each call reports in *no_read the
+ * number of input bytes belonging to the character it returns, so the sum
+ * over all calls equals the bytes actually parsed.
+ *
+ * \param cd          conversion handle; receives the error code on failure
+ * \param d           decoder whose data member is a struct decoder_data
+ * \param inp         current input position
+ * \param inbytesleft number of bytes available at inp
+ * \param no_read     out: bytes consumed for the returned character;
+ *                    0 when an error was flagged
+ * \return the character read, or 0 if nothing could be produced
+ */
+static unsigned long read_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
+                                unsigned char *inp,
+                                size_t inbytesleft, size_t *no_read)
+{
+    struct decoder_data *data = (struct decoder_data *) d->data;
+    /* capacity of the look-ahead buffer, taken from the array itself */
+    const int comb_max =
+        (int) (sizeof(data->comb_x) / sizeof(data->comb_x[0]));
+    unsigned long x = 0;
+
+    if (data->comb_offset < data->comb_size)
+    {
+        /* A previous call parked combining characters: hand out the next.
+           No special treatment is needed for the double diacritics
+           (INVERTED BREVE, DOUBLE TILDE): the yaz_marc8_?_conv handlers
+           (generated by charconv.tcl) return 0 with no_read=1 for a
+           sequence that does not match, and the SECOND HALFs in
+           codetables.xml produce a non-existent entry in the conversion
+           trie, so the closing byte is skipped in yaz_iconv. */
+        *no_read = data->comb_no_read[data->comb_offset];
+        x = data->comb_x[data->comb_offset];
+        data->comb_offset++;
+        return x;
+    }
+
+    data->comb_offset = 0;
+    for (data->comb_size = 0; data->comb_size < comb_max; data->comb_size++)
+    {
+        int comb = 0;
+
+        if (inbytesleft == 0 && data->comb_size)
+        {
+            /* combining characters seen, but their base is not here yet */
+            yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL);
+            x = 0;
+            *no_read = 0;
+            break;
+        }
+        x = yaz_read_marc8_comb(cd, data, inp, inbytesleft, no_read, &comb);
+        if (!comb || !x)
+            break;
+        data->comb_x[data->comb_size] = x;
+        data->comb_no_read[data->comb_size] = *no_read;
+        inp += *no_read;
+        inbytesleft = inbytesleft - *no_read;
+    }
+
+    if (data->comb_size == comb_max)
+    {
+        /* The buffer filled up with combining characters only. The last
+           one is both in x and in the final slot; drop the slot so it is
+           not returned twice and its bytes are not counted twice. */
+        data->comb_size--;
+    }
+    else if (x == 0 && *no_read == 0)
+    {
+        /* Nothing was consumed, so the caller has to come back to the same
+           input position. Forget the look-ahead; otherwise the parked
+           combining characters would be replayed ahead of their base. */
+        data->comb_size = 0;
+    }
+    return x;
+}
+
+/**
+ * \brief Like read_marc8, but folds a base character and a single
+ *        combining character into one character where a lookup exists.
+ *
+ * When read_marc8 returned a character and exactly one combining
+ * character is buffered, yaz_iso_8859_1_lookup_x12 is asked for a combined
+ * form. On success the combined character is returned, the bytes of the
+ * combining character are added to *no_read and the buffer is emptied.
+ *
+ * \param cd          conversion handle
+ * \param d           decoder whose data member is a struct decoder_data
+ * \param inp         current input position
+ * \param inbytesleft number of bytes available at inp
+ * \param no_read     out: bytes consumed for the returned character
+ * \return the character read (possibly combined), or 0
+ */
+static unsigned long read_marc8s(yaz_iconv_t cd, yaz_iconv_decoder_t d,
+                                 unsigned char *inp,
+                                 size_t inbytesleft, size_t *no_read)
+{
+    struct decoder_data *state = (struct decoder_data *) d->data;
+    unsigned long ch = read_marc8(cd, d, inp, inbytesleft, no_read);
+
+    /* only a character with exactly one pending diacritic can be folded */
+    if (ch == 0 || state->comb_size != 1)
+        return ch;
+
+    if (yaz_iso_8859_1_lookup_x12(ch, state->comb_x[0], &ch))
+    {
+        /* the diacritic is now part of ch: account for its bytes here */
+        *no_read += state->comb_no_read[0];
+        state->comb_size = 0;
+    }
+    return ch;
+}
+
+/**
+ * \brief Reads one character, processing any escape sequences before it.
+ *
+ * Each escape sequence (ESC, optional '$', optional G0/G1 designator,
+ * optional '!', final character) stores its final character in
+ * data->g0_mode or data->g1_mode. The byte that follows is looked up in
+ * the table selected by g0_mode (byte < 128) or g1_mode (otherwise). A
+ * space is returned as is, whatever set is active.
+ *
+ * \param cd          conversion handle; receives the error code on failure
+ * \param data        decoder state (active character sets)
+ * \param inp         current input position
+ * \param inbytesleft number of bytes available at inp
+ * \param no_read     out: bytes consumed, escape sequences included;
+ *                    0 when an error was flagged
+ * \param comb        out: set by the table lookup; cleared before it
+ * \return the character, or 0 (end of input after escape sequences,
+ *         truncated escape sequence -> YAZ_ICONV_EINVAL, unknown
+ *         character set -> YAZ_ICONV_EILSEQ, or whatever the table returns)
+ */
+static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd,
+                                         struct decoder_data *data,
+                                         unsigned char *inp,
+                                         size_t inbytesleft, size_t *no_read,
+                                         int *comb)
+{
+    unsigned char *cur = inp;       /* read cursor */
+    size_t left = inbytesleft;      /* bytes available at cur */
+    size_t table_read = 0;          /* bytes consumed by the table lookup */
+    yaz_conv_func_t *conv;
+    int last_arg = 0;               /* final argument of the table lookup */
+    int charset;
+    unsigned long code;
+
+    *no_read = 0;
+
+    /* swallow every escape sequence in front of the character */
+    while (left > 0 && *cur == 27)
+    {
+        int *target = &data->g0_mode;
+
+        cur++;                      /* ESC */
+        left--;
+        if (left == 0)
+            goto incomplete;
+
+        if (*cur == '$')            /* set with multiple bytes */
+        {
+            cur++;
+            left--;
+        }
+        if (left == 0)
+            goto incomplete;
+
+        switch (*cur)
+        {
+        case '(':                   /* G0 */
+        case ',':
+            cur++;
+            left--;
+            break;
+        case ')':                   /* G1 */
+        case '-':
+            cur++;
+            left--;
+            target = &data->g1_mode;
+            break;
+        default:                    /* no designator: G0 */
+            break;
+        }
+        if (left == 0)
+            goto incomplete;
+
+        if (*cur == '!')            /* ANSEL is a special case */
+        {
+            cur++;
+            left--;
+        }
+        if (left == 0)
+            goto incomplete;
+
+        *target = *cur;             /* final character */
+        cur++;
+        left--;
+    }
+    /* everything skipped so far belongs to complete escape sequences */
+    *no_read = inbytesleft - left;
+
+    if (left == 0)
+        return 0;
+    if (*cur == ' ')
+    {
+        *no_read += 1;
+        return ' ';
+    }
+
+    charset = *cur < 128 ? data->g0_mode : data->g1_mode;
+    *comb = 0;
+
+    switch (charset)
+    {
+    case 'B':                       /* Basic ASCII */
+    case 's':                       /* ASCII */
+        conv = yaz_marc8_42_conv;
+        break;
+    case 'E':                       /* ANSEL */
+        conv = yaz_marc8_45_conv;
+        last_arg = 128;             /* the only table called with 128 */
+        break;
+    case 'g':                       /* Greek symbols */
+        conv = yaz_marc8_67_conv;
+        break;
+    case 'b':                       /* Subscripts */
+        conv = yaz_marc8_62_conv;
+        break;
+    case 'p':                       /* Superscripts */
+        conv = yaz_marc8_70_conv;
+        break;
+    case '2':                       /* Basic Hebrew */
+        conv = yaz_marc8_32_conv;
+        break;
+    case 'N':                       /* Basic Cyrillic */
+        conv = yaz_marc8_4E_conv;
+        break;
+    case 'Q':                       /* Extended Cyrillic */
+        conv = yaz_marc8_51_conv;
+        break;
+    case '3':                       /* Basic Arabic */
+        conv = yaz_marc8_33_conv;
+        break;
+    case '4':                       /* Extended Arabic */
+        conv = yaz_marc8_34_conv;
+        break;
+    case 'S':                       /* Basic Greek */
+        conv = yaz_marc8_53_conv;
+        break;
+    case '1':                       /* Chinese, Japanese, Korean (EACC) */
+        conv = yaz_marc8_31_conv;
+        break;
+    default:                        /* unknown character set */
+        *no_read = 0;
+        yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
+        return 0;
+    }
+
+    code = conv(cur, left, &table_read, comb, 127, last_arg);
+    *no_read += table_read;
+    return code;
+
+incomplete:
+    /* input ended inside an escape sequence */
+    *no_read = 0;
+    yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL);
+    return 0;
+}
+
+
+/**
+ * \brief Puts the decoder into its start state.
+ *
+ * G0 is set to 'B' and G1 to 'E', and the buffer of combining characters
+ * is emptied. The input arguments are not examined.
+ *
+ * \return always 0
+ */
+static size_t init_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
+                         unsigned char *inp,
+                         size_t inbytesleft, size_t *no_read)
+{
+    struct decoder_data *state = (struct decoder_data *) d->data;
+
+    /* no combining characters pending */
+    state->comb_size = 0;
+    state->comb_offset = 0;
+
+    /* default character sets */
+    state->g0_mode = 'B';
+    state->g1_mode = 'E';
+    return 0;
+}
+
+/**
+ * \brief Releases the decoder state allocated by yaz_marc8_decoder.
+ * \param d decoder whose data member is to be freed
+ */
+void destroy_marc8(yaz_iconv_decoder_t d)
+{
+    xfree(d->data);
+}
+
+/**
+ * \brief Sets up d as a MARC-8 decoder if fromcode names a MARC-8 variant.
+ *
+ * "MARC8", "ANSEL" and "ISO5426" select read_marc8; "MARC8s" selects
+ * read_marc8s. Names are compared with yaz_matchstr. For a recognised
+ * name the decoder state is allocated and the read, init and destroy
+ * handlers are installed. The state is filled in by the init handler.
+ *
+ * \param fromcode name of the source character set
+ * \param d        decoder to set up
+ * \return d if fromcode was recognised, 0 otherwise (d is left untouched)
+ */
+yaz_iconv_decoder_t yaz_marc8_decoder(const char *fromcode,
+                                      yaz_iconv_decoder_t d)
+{
+    struct decoder_data *state;
+
+    if (!yaz_matchstr(fromcode, "MARC8")
+        || !yaz_matchstr(fromcode, "ANSEL")
+        || !yaz_matchstr(fromcode, "ISO5426"))
+    {
+        d->read_handle = read_marc8;
+    }
+    else if (!yaz_matchstr(fromcode, "MARC8s"))
+    {
+        d->read_handle = read_marc8s;
+    }
+    else
+    {
+        return 0;                   /* not one of ours */
+    }
+
+    state = (struct decoder_data *) xmalloc(sizeof(*state));
+    d->data = state;
+    d->init_handle = init_marc8;
+    d->destroy_handle = destroy_marc8;
+    return d;
+}
+
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */