1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2013 Index Data
3 * See the file LICENSE for details.
13 #include <yaz/yaz-util.h>
/* Compare the buffer a conversion produced against the expected bytes.
 *
 * msg, no      label and index; only used in the (disabled) debug dump.
 * expect_len,
 * expect_buf   expected byte count and bytes.
 * got_len,
 * got_buf      produced byte count and bytes.
 *
 * The return statements are not visible in this excerpt; callers pass the
 * result to YAZ_CHECK, so presumably non-zero means "buffers match" --
 * TODO confirm.
 */
18 static int compare_buffers(char *msg, int no,
19 int expect_len, const char *expect_buf,
20 int got_len, const char *got_buf)
/* A match needs equal lengths and identical bytes. */
22 if (expect_len == got_len
23 && !memcmp(expect_buf, got_buf, expect_len))
/* Side-by-side hex dump of both buffers; switched off by the constant 0. */
26 if (0) /* use 1 see how the buffers differ (for debug purposes) */
29 printf("tsticonv test=%s i=%d failed\n", msg, no);
30 printf("off got exp\n");
/* Runs until the longer of the two buffers is exhausted. */
31 for (i = 0; i<got_len || i<expect_len; i++)
/* NOTE(review): got_buf[i]/expect_buf[i] are plain char promoted to int;
 * with a signed char a byte >= 0x80 would format as more than two hex
 * digits -- confirm got_char/expect_char are large enough, or cast to
 * unsigned char. */
37 sprintf(got_char, "%02X", got_buf[i]);
39 sprintf(got_char, "? ");
42 sprintf(expect_char, "%02X", expect_buf[i]);
44 sprintf(expect_char, "? ");
/* NOTE(review): got_buf[i] == expect_buf[i] below is evaluated for every
 * loop index, and the loop condition lets i run past the shorter length,
 * so this appears to read beyond the shorter buffer when the dump is
 * enabled -- confirm. */
46 printf("%02d %s %s %c\n",
47 i, got_char, expect_char, got_buf[i] == expect_buf[i] ?
/* Convert in_buf with the open converter cd and compare the output with
 * expect_buf/expect_len using compare_buffers().
 *
 * in_len       number of input bytes; when 0 the length is taken with
 *              strlen(in_buf), so inputs containing NUL bytes must pass an
 *              explicit length.
 * Returns whatever compare_buffers() returns.
 */
55 static int tst_convert_l(yaz_iconv_t cd, size_t in_len, const char *in_buf,
56 size_t expect_len, const char *expect_buf)
/* Cast drops const because yaz_iconv takes char ** for its input. */
59 char *inbuf= (char*) in_buf;
60 size_t inbytesleft = in_len > 0 ? in_len : strlen(in_buf);
62 char *outbuf = outbuf0;
/* Room left in outbuf0 from the current write position. */
66 size_t outbytesleft = outbuf0 + sizeof(outbuf0) - outbuf;
/* The statement guarded by this test is not visible here; presumably it
 * caps the per-call output window at 12 bytes -- TODO confirm. */
67 if (outbytesleft > 12)
69 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
70 if (r == (size_t) (-1))
72 int e = yaz_iconv_error(cd);
/* YAZ_ICONV_E2BIG is treated differently from every other error. */
73 if (e != YAZ_ICONV_E2BIG)
/* Call without input; presumably flushes pending output, following the
 * iconv convention -- TODO confirm. */
78 yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
/* outbuf - outbuf0 is the number of bytes written in total. */
83 return compare_buffers("tsticonv 22", 0,
84 expect_len, expect_buf,
85 outbuf - outbuf0, outbuf0);
/* Convert the NUL-terminated string buf with cd, collecting the output in
 * a WRBUF, and compare the collected bytes with cmpbuf.
 *
 * The last parameter line of the signature is not visible in this excerpt;
 * the body uses expect_error, and callers pass 0 or YAZ_ICONV_EINVAL for it.
 */
88 static int tst_convert_x(yaz_iconv_t cd, const char *buf, const char *cmpbuf,
/* b accumulates everything the converter writes across all rounds. */
92 WRBUF b = wrbuf_alloc();
94 size_t inbytesleft = strlen(buf);
95 const char *inp = buf;
/* Stop when the input is consumed or after sizeof(outbuf) rounds. */
97 for (rounds = 0; inbytesleft && rounds < (int) sizeof(outbuf); rounds++)
/* Every round starts with the full outbuf available again. */
99 size_t outbytesleft = sizeof(outbuf);
101 size_t r = yaz_iconv(cd, (char**) &inp, &inbytesleft,
102 &outp, &outbytesleft);
/* Append what this round produced, even if the call reported an error. */
103 wrbuf_write(b, outbuf, outp - outbuf);
104 if (r == (size_t) (-1))
106 int e = yaz_iconv_error(cd);
/* Errors other than YAZ_ICONV_E2BIG are checked against expect_error;
 * an expect_error of -1 skips that check. */
107 if (e != YAZ_ICONV_E2BIG)
109 if (expect_error != -1)
110 if (e != expect_error)
117 size_t outbytesleft = sizeof(outbuf);
/* Call without input; presumably flushes pending output -- TODO confirm. */
119 r = yaz_iconv(cd, 0, 0, &outp, &outbytesleft);
120 wrbuf_write(b, outbuf, outp - outbuf);
121 if (expect_error != -1)
/* Compare length first, then bytes, so embedded NULs in the output are
 * handled by memcmp rather than strcmp. */
127 if (wrbuf_len(b) == strlen(cmpbuf)
128 && !memcmp(cmpbuf, wrbuf_buf(b), wrbuf_len(b)))
/* Log input, actual and expected output in escaped form; presumably this
 * is the mismatch branch (surrounding if/else is not visible here). */
132 WRBUF w = wrbuf_alloc();
136 wrbuf_puts_escaped(w, buf);
137 yaz_log(YLOG_LOG, "input %s", wrbuf_cstr(w));
140 wrbuf_write_escaped(w, wrbuf_buf(b), wrbuf_len(b));
141 yaz_log(YLOG_LOG, "got %s", wrbuf_cstr(w));
144 wrbuf_puts_escaped(w, cmpbuf);
145 yaz_log(YLOG_LOG, "exp %s", wrbuf_cstr(w));
/* Convenience wrapper: run tst_convert_x() with an expected error of 0. */
154 static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf)
156 return tst_convert_x(cd, buf, cmpbuf, 0);
159 /* some test strings in ISO-8859-1 format */
/* dconvert() walks this array until it meets a null entry, so the
 * initializer (not visible in this excerpt) must end with one. */
160 static const char *iso_8859_1_a[] = {
/* MARC8 input converted to UCS4. Every check pairs a MARC-8 byte sequence
 * with the expected output written as 4-byte, most-significant-byte-first
 * code points. Some argument lines of the calls are not visible in this
 * excerpt. */
169 static void tst_marc8_to_ucs4b(void)
/* yaz_iconv_open takes the target encoding first, then the source. */
171 yaz_iconv_t cd = yaz_iconv_open("UCS4", "MARC8");
176 YAZ_CHECK(tst_convert_l(
179 "\033$1" "\x21\x2B\x3B" /* FF1F */ "\033(B" "o",
181 "\x00\x00\xFF\x1F" "\x00\x00\x00o"));
182 YAZ_CHECK(tst_convert_l(
185 "\033$1" "\x6F\x77\x29" /* AE0E */
186 "\x6F\x52\x7C" /* c0F4 */ "\033(B",
188 "\x00\x00\xAE\x0E" "\x00\x00\xC0\xF4"));
189 YAZ_CHECK(tst_convert_l(
193 "\x21\x50\x6E" /* UCS 7CFB */
194 "\x21\x51\x31" /* UCS 7D71 */
195 "\x21\x3A\x67" /* UCS 5B89 */
196 "\x21\x33\x22" /* UCS 5168 */
197 "\x21\x33\x53" /* UCS 5206 */
198 "\x21\x44\x2B" /* UCS 6790 */
206 "\x00\x00\x67\x90"));
208 YAZ_CHECK(tst_convert_l(
211 "\xB0\xB2", /* AYN and oSLASH */
213 "\x00\x00\x02\xBB" "\x00\x00\x00\xF8"));
/* In the next expectation the combining mark (U+0332) follows the base
 * letter, although it precedes it in the MARC-8 input. */
214 YAZ_CHECK(tst_convert_l(
217 "\xF6\x61", /* a underscore */
219 "\x00\x00\x00\x61" "\x00\x00\x03\x32"));
221 YAZ_CHECK(tst_convert_l(
224 "\x61\xC2", /* a, phonorecord mark */
226 "\x00\x00\x00\x61" "\x00\x00\x21\x17"));
229 YAZ_CHECK(tst_convert_l(
232 "el" "\xe8" "am\xe8" "an", /* elaman where a is a" */
241 "\x00\x00\x00" "n"));
243 YAZ_CHECK(tst_convert_l(
248 "\x00\x00\x00\x41" "\x00\x00\x03\x04" "\x00\x00\x03\x08"));
250 YAZ_CHECK(tst_convert_l(
255 "\x00\x00\x00\x74" "\x00\x00\x03\x61" "\x00\x00\x00\x73"));
257 YAZ_CHECK(tst_convert_l(
262 "\x00\x00\x00\x74" "\x00\x00\x03\x60" "\x00\x00\x00\x73"));
/* UCS4 input converted to UTF8. The inputs start with NUL bytes, so the
 * calls need an explicit in_len (tst_convert_l falls back to strlen when
 * in_len is 0); that argument line is not visible in this excerpt. */
267 static void tst_ucs4b_to_utf8(void)
269 yaz_iconv_t cd = yaz_iconv_open("UTF8", "UCS4");
/* U+FF1F followed by 'o'. */
273 YAZ_CHECK(tst_convert_l(
276 "\x00\x00\xFF\x1F\x00\x00\x00o",
278 "\xEF\xBC\x9F\x6F"));
/* U+AE0E followed by U+C0F4. */
280 YAZ_CHECK(tst_convert_l(
283 "\x00\x00\xAE\x0E\x00\x00\xC0\xF4",
285 "\xEA\xB8\x8E\xEC\x83\xB4"));
/* Round-trip test: convert every string of iso_8859_1_a from ISO-8859-1 to
 * tmpcode and back again, then compare the result with the original.
 *
 * mandatory    when non-zero, failing to open a converter for tmpcode is a
 *              test failure; when zero it is tolerated.
 * tmpcode      name of the intermediate encoding.
 */
289 static void dconvert(int mandatory, const char *tmpcode)
/* The array is terminated by a null entry. */
294 for (i = 0; iso_8859_1_a[i]; i++)
297 char *inbuf = (char*) iso_8859_1_a[i];
298 size_t inbytesleft = strlen(inbuf);
301 char *outbuf = outbuf0;
302 size_t outbytesleft = sizeof(outbuf0);
/* First leg: ISO-8859-1 -> tmpcode, written to outbuf0. */
304 cd = yaz_iconv_open(tmpcode, "ISO-8859-1");
305 YAZ_CHECK(cd || !mandatory);
308 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
309 YAZ_CHECK(r != (size_t) (-1));
/* Call without input; presumably flushes pending output -- TODO confirm. */
311 r = yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
312 YAZ_CHECK(r != (size_t) (-1));
314 if (r == (size_t) (-1))
/* Second leg: tmpcode -> ISO-8859-1, written to outbuf1. */
317 cd = yaz_iconv_open("ISO-8859-1", tmpcode);
318 YAZ_CHECK(cd || !mandatory);
/* Input length of the second leg is what the first leg wrote to outbuf0. */
322 inbytesleft = sizeof(outbuf0) - outbytesleft;
325 outbytesleft = sizeof(outbuf1);
326 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
327 YAZ_CHECK(r != (size_t) (-1));
329 r = yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
330 if (r == (size_t)(-1))
332 fprintf(stderr, "failed\n");
334 YAZ_CHECK(r != (size_t) (-1));
336 if (r != (size_t)(-1))
/* The round-tripped bytes in outbuf1 must equal the original string. */
338 ret = compare_buffers("dconvert", i,
339 strlen(iso_8859_1_a[i]), iso_8859_1_a[i],
340 sizeof(outbuf1) - outbytesleft, outbuf1);
/* Round-trip a single code point c: UCS4LE -> UTF-8 -> UCS4LE, then compare
 * the four resulting bytes with the four source bytes. The return
 * statements are not visible in this excerpt; callers wrap the result in
 * YAZ_CHECK, so presumably non-zero means success -- TODO confirm. */
347 int utf8_check(unsigned c)
/* One UCS4 code unit is four bytes. */
356 size_t inbytesleft = 4;
357 char *outbuf = utf8buf;
358 size_t outbytesleft = sizeof(utf8buf);
360 yaz_iconv_t cd = yaz_iconv_open("UTF-8", "UCS4LE");
/* Loop over the four source bytes; its body is not visible here. */
363 for (i = 0; i<4; i++)
366 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
369 if (r == (size_t)(-1))
/* Reverse direction for the way back. */
372 cd = yaz_iconv_open("UCS4LE", "UTF-8");
/* Number of UTF-8 bytes the first conversion produced. */
375 inbytesleft = sizeof(utf8buf) - outbytesleft;
381 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
382 if (r == (size_t)(-1))
387 if (memcmp(src, dst, 4))
/* MARC8 input converted to UTF-8, including combining characters, truncated
 * input and escape sequences. Some lines of the calls are not visible in
 * this excerpt. */
393 static void tst_marc8_to_utf8(void)
395 yaz_iconv_t cd = yaz_iconv_open("UTF-8", "MARC8");
401 YAZ_CHECK(tst_convert(cd, "Cours de math",
403 /* COMBINING ACUTE ACCENT */
404 YAZ_CHECK(tst_convert(cd, "Cours de mathâe",
405 "Cours de mathe\xcc\x81"));
/* In MARC-8 input the combining byte 0xEA comes before its base character;
 * the expected UTF-8 has the combining mark (CC 8A) after it. */
407 YAZ_CHECK(tst_convert(cd, "\xea" "a", "a\xcc\x8a"));
408 YAZ_CHECK(tst_convert(cd, "a" "\xea" "\x1e", "a" "\x1e\xcc\x8a"));
409 YAZ_CHECK(tst_convert(cd, "a" "\xea" "p", "a" "p\xcc\x8a"));
/* Input ends right after a combining byte: EINVAL is expected, and the
 * next call's expectation shows the pending mark being emitted while its
 * base character is dropped. */
411 YAZ_CHECK(tst_convert_x(cd, "a\xea", "a", YAZ_ICONV_EINVAL));
412 YAZ_CHECK(tst_convert(cd, "p", "\xcc\x8a")); /* note: missing p */
413 yaz_iconv(cd, 0, 0, 0, 0); /* incomplete. so we have to reset */
416 YAZ_CHECK(tst_convert(cd, ESC "(N" ESC ")Qp" ESC "(B", "\xd0\x9f"));
/* Incomplete escape sequences are expected to yield EINVAL; a complete one
 * produces no output and no error. */
418 YAZ_CHECK(tst_convert_x(cd, ESC , "", YAZ_ICONV_EINVAL));
419 YAZ_CHECK(tst_convert_x(cd, ESC "(", "", YAZ_ICONV_EINVAL));
420 YAZ_CHECK(tst_convert_x(cd, ESC "(B", "", 0));
422 YAZ_CHECK(tst_convert(cd, ESC "(B" "\x31", "1")); /* ASCII in G0 */
423 YAZ_CHECK(tst_convert(cd, ESC ")B" "\xB1", "1")); /* ASCII in G1 */
/* "MARC8s" input converted to UTF-8. Unlike the MARC8 test, the expectation
 * for an accented e here is the single precomposed character (C3 A9). */
428 static void tst_marc8s_to_utf8(void)
430 yaz_iconv_t cd = yaz_iconv_open("UTF-8", "MARC8s");
436 YAZ_CHECK(tst_convert(cd, "Cours de math",
438 /* E9: LATIN SMALL LETTER E WITH ACUTE */
/* NOTE(review): the input literal holds a raw non-ASCII byte, so its value
 * depends on the encoding this file is stored in -- confirm, or spell it
 * as an \x escape. */
439 YAZ_CHECK(tst_convert(cd, "Cours de mathâe",
440 "Cours de math\xc3\xa9"));
/* MARC8 input converted to ISO-8859-1. The expected-output lines of several
 * calls are not visible in this excerpt. */
446 static void tst_marc8_to_latin1(void)
448 yaz_iconv_t cd = yaz_iconv_open("ISO-8859-1", "MARC8");
454 YAZ_CHECK(tst_convert(cd, "ax", "ax"));
456 /* latin capital letter o with stroke */
457 YAZ_CHECK(tst_convert(cd, "\xa2", "\xd8"));
459 /* with latin small letter ae */
460 YAZ_CHECK(tst_convert(cd, "eneb\xb5r", "eneb\346r"));
/* 0xEA followed by 'a' is expected to come out as the single byte 0xE5. */
462 YAZ_CHECK(tst_convert(cd, "\xea" "a\xa2", "\xe5" "\xd8"));
464 YAZ_CHECK(tst_convert(cd, "\xea" "a\xa2" "b", "\xe5" "\xd8" "b"));
466 YAZ_CHECK(tst_convert(cd, "\xea" "a" "\xea" "a", "\xe5" "\xe5"));
/* NOTE(review): the literals below hold raw non-ASCII bytes, so the test
 * data depends on the encoding this file is stored in -- confirm, or
 * spell them as \x escapes. The inputs place the accented pair at
 * successive offsets. */
468 YAZ_CHECK(tst_convert(cd, "Cours de math",
470 YAZ_CHECK(tst_convert(cd, "Cours de mathâe",
472 YAZ_CHECK(tst_convert(cd, "12345678âe",
474 YAZ_CHECK(tst_convert(cd, "123456789âe",
476 YAZ_CHECK(tst_convert(cd, "1234567890âe",
478 YAZ_CHECK(tst_convert(cd, "12345678901âe",
480 YAZ_CHECK(tst_convert(cd, "Cours de mathâem",
482 YAZ_CHECK(tst_convert(cd, "Cours de mathâematiques",
483 "Cours de mathématiques"));
/* UTF-8 input converted to the MARC-8 variant named by marc8_type; main()
 * runs it for "marc8", "marc8lossy" and "marc8lossless". Some lines of the
 * calls are not visible in this excerpt. */
488 static void tst_utf8_to_marc8(const char *marc8_type)
490 yaz_iconv_t cd = yaz_iconv_open(marc8_type, "UTF-8");
496 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
/* NOTE(review): the three literals below are 12, 13 and 14 characters
 * long, one more than each comment states -- confirm the intended relation
 * to sizeof(outbuf) in tst_convert_x. */
498 /** Pure ASCII. 11 characters (sizeof(outbuf)-1) */
499 YAZ_CHECK(tst_convert(cd, "Cours de mat", "Cours de mat"));
501 /** Pure ASCII. 12 characters (sizeof(outbuf)) */
502 YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math"));
504 /** Pure ASCII. 13 characters (sizeof(outbuf)+1) */
505 YAZ_CHECK(tst_convert(cd, "Cours de math.", "Cours de math."));
507 /** UPPERCASE SCANDINAVIAN O */
508 YAZ_CHECK(tst_convert(cd, "S\xc3\x98", "S\xa2"));
/* Decomposed (A + CC 8A) and precomposed (C3 85) input share the same
 * expected MARC-8 output. */
511 YAZ_CHECK(tst_convert(cd, "A" "\xCC\x8A", "\xEA" "A"));
514 YAZ_CHECK(tst_convert(cd, "\xC3\x85", "\xEA" "A"));
516 /** A MACRON + UMLAUT, DIAERESIS */
517 YAZ_CHECK(tst_convert(cd, "A" "\xCC\x84" "\xCC\x88",
520 /* Ligature spanning two characters */
521 YAZ_CHECK(tst_convert(cd,
522 "\x74" "\xCD\xA1" "\x73", /* UTF-8 */
523 "\xEB\x74\xEC\x73")); /* MARC-8 */
525 /* Double title spanning two characters */
526 YAZ_CHECK(tst_convert(cd,
527 "\x74" "\xCD\xA0" "\x73", /* UTF-8 */
528 "\xFA\x74\xFB\x73")); /* MARC-8 */
530 /** Ideographic question mark (Unicode FF1F) */
531 YAZ_CHECK(tst_convert(cd,
532 "\xEF\xBC\x9F" "o", /* UTF-8 */
533 "\033$1" "\x21\x2B\x3B" "\033(B" "o" ));
536 /** Ideographic space per ANSI Z39.64 */
537 YAZ_CHECK(tst_convert(cd,
538 "\xe3\x80\x80" "o", /* UTF-8 */
539 "\033$1" "\x21\x23\x21" "\033(B" "o" ));
541 /** Superscript 0 . bug #642 */
542 YAZ_CHECK(tst_convert(cd,
543 "(\xe2\x81\xb0)", /* UTF-8 */
548 YAZ_CHECK(tst_convert(cd,
549 /* offset 0x530 in UTF-8 rec marccol4.u8.marc */
550 "\xE3\x83\xB3" "\xE3\x82\xBF"
551 "\xCC\x84" "\xCC\x84" "\xE3\x83\xBC" /* UTF-8 */,
552 "\x1B\x24\x31" "\x69\x25\x73"
553 "\x1B\x28\x42" "\xE5\xE5" "\x1B\x24\x31"
555 "\x69\x21\x3C" "\x1B\x28\x42"));
559 YAZ_CHECK(tst_convert(cd,
560 "\xCE\x94\xCE\xB5\xCF\x84"
561 "\xCE\xBF\xCF\x81\xCE\xB1"
562 "\xCE\xBA\xCE\xB7\xCF\x82\x2C",
564 "\x1B\x28\x53\x45\x66\x78\x72\x75"
/* Manual conversion of superscript zero (E2 81 B0) so that the output can
 * be logged both before and after the final no-input call. */
570 char *inbuf0 = "\xe2\x81\xb0";
571 char *inbuf = inbuf0;
572 size_t inbytesleft = strlen(inbuf);
574 char *outbuf = outbuf0;
/* One byte is held back for the terminator stored through *outbuf below. */
575 size_t outbytesleft = sizeof(outbuf0)-1;
580 r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
581 YAZ_CHECK(r != (size_t) (-1));
584 *outbuf = '\0'; /* so we know when to stop printing */
585 for (i = 0; outbuf0[i]; i++)
/* Mask to 0..255 so bytes >= 0x80 do not log as negative values. */
587 int ch = outbuf0[i] & 0xff;
588 yaz_log(YLOG_LOG, "ch%d %02X %c", i, ch, ch >= ' ' ? ch : '?');
/* Call without input; the final expectation below ends with an escape
 * sequence, presumably appended by this call -- TODO confirm. */
592 r = yaz_iconv(cd, 0, 0, &outbuf, &outbytesleft);
593 YAZ_CHECK(r != (size_t) (-1));
594 *outbuf = '\0'; /* for strcmp test below and printing */
596 for (i = 0; outbuf0[i]; i++)
598 int ch = outbuf0[i] & 0xff;
599 yaz_log(YLOG_LOG, "ch%d %02X %c", i, ch, ch >= ' ' ? ch : '?');
602 YAZ_CHECK(strcmp("\033p0\x1bs", outbuf0) == 0);
604 yaz_iconv(cd, 0, 0, 0, 0);
/* "advancegreek" input converted to utf-8; the visible check only covers
 * plain ASCII passing through unchanged. */
608 static void tst_advance_to_utf8(void)
610 yaz_iconv_t cd = yaz_iconv_open("utf-8", "advancegreek");
616 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
/* utf-8 input converted to "advancegreek"; the visible check only covers
 * plain ASCII passing through unchanged. */
620 static void tst_utf8_to_advance(void)
622 yaz_iconv_t cd = yaz_iconv_open("advancegreek", "utf-8");
628 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
/* ISO-8859-1 input converted to MARC8. One continuation line of the calls
 * is not visible in this excerpt. */
632 static void tst_latin1_to_marc8(void)
634 yaz_iconv_t cd = yaz_iconv_open("MARC8", "ISO-8859-1");
640 YAZ_CHECK(tst_convert(cd, "Cours ", "Cours "));
/* NOTE(review): the three literals below are 12, 13 and 14 characters
 * long, one more than each comment states -- confirm the intended relation
 * to sizeof(outbuf) in tst_convert_x. */
642 /** Pure ASCII. 11 characters (sizeof(outbuf)-1) */
643 YAZ_CHECK(tst_convert(cd, "Cours de mat", "Cours de mat"));
645 /** Pure ASCII. 12 characters (sizeof(outbuf)) */
646 YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math"));
648 /** Pure ASCII. 13 characters (sizeof(outbuf)) */
649 YAZ_CHECK(tst_convert(cd, "Cours de math.", "Cours de math."));
651 /** D8: UPPERCASE SCANDINAVIAN O */
652 YAZ_CHECK(tst_convert(cd, "S\xd8", "S\xa2"));
654 /** E9: LATIN SMALL LETTER E WITH ACUTE */
655 YAZ_CHECK(tst_convert(cd, "Cours de math\xe9", "Cours de mathâe"));
656 YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math"
/* NOTE(review): the literals below hold raw non-ASCII bytes on both the
 * input and the expected side, so the test data depends on the encoding
 * this file is stored in -- confirm, or spell them as \x escapes as in the
 * E9 check above. */
658 YAZ_CHECK(tst_convert(cd, "Cours de mathé", "Cours de mathâe" ));
659 YAZ_CHECK(tst_convert(cd, "12345678é","12345678âe"));
660 YAZ_CHECK(tst_convert(cd, "123456789é", "123456789âe"));
661 YAZ_CHECK(tst_convert(cd, "1234567890é","1234567890âe"));
662 YAZ_CHECK(tst_convert(cd, "12345678901é", "12345678901âe"));
663 YAZ_CHECK(tst_convert(cd, "Cours de mathém", "Cours de mathâem"));
664 YAZ_CHECK(tst_convert(cd, "Cours de mathématiques",
665 "Cours de mathâematiques"));
/* Run utf8_check() on a spread of code point values, from 3 up to
 * 100000000, including the neighbours 127/128 and 255/256. */
669 static void tst_utf8_codes(void)
671 YAZ_CHECK(utf8_check(3));
672 YAZ_CHECK(utf8_check(127));
673 YAZ_CHECK(utf8_check(128));
674 YAZ_CHECK(utf8_check(255));
675 YAZ_CHECK(utf8_check(256));
676 YAZ_CHECK(utf8_check(900));
677 YAZ_CHECK(utf8_check(1000));
678 YAZ_CHECK(utf8_check(10000));
679 YAZ_CHECK(utf8_check(100000));
680 YAZ_CHECK(utf8_check(1000000));
681 YAZ_CHECK(utf8_check(10000000));
682 YAZ_CHECK(utf8_check(100000000));
/* "danmarc" input converted to iso-8859-1. The expectations below show the
 * handling of '@' in the input: "@@" yields a single '@', "@000a" yields a
 * newline, and '@' before 0xE5 / 0xC5 yields "aa" / "Aa". */
685 static void tst_danmarc_to_latin1(void)
687 yaz_iconv_t cd = yaz_iconv_open("iso-8859-1", "danmarc");
693 YAZ_CHECK(tst_convert(cd, "ax", "ax"));
695 YAZ_CHECK(tst_convert(cd, "a@@b", "a@b"));
696 YAZ_CHECK(tst_convert(cd, "a@@@@b", "a@@b"));
697 YAZ_CHECK(tst_convert(cd, "@000ab", "\nb"));
699 YAZ_CHECK(tst_convert(cd, "@\xe5", "aa"));
700 YAZ_CHECK(tst_convert(cd, "@\xc5.", "Aa."));
/* Test driver: initialise the YAZ_CHECK framework and run the conversion
 * tests. Some calls are not visible in this excerpt. */
706 int main (int argc, char **argv)
708 YAZ_CHECK_INIT(argc, argv);
714 tst_marc8s_to_utf8();
716 tst_marc8_to_latin1();
718 tst_advance_to_utf8();
719 tst_utf8_to_advance();
/* Same test body for each of the three MARC-8 target names. */
721 tst_utf8_to_marc8("marc8");
722 tst_utf8_to_marc8("marc8lossy");
723 tst_utf8_to_marc8("marc8lossless");
725 tst_danmarc_to_latin1();
727 tst_latin1_to_marc8();
729 tst_marc8_to_ucs4b();
/* First argument is dconvert's mandatory flag: with 0, failing to open a
 * converter for CP865 is not counted as a test failure. */
732 dconvert(1, "UTF-8");
733 dconvert(1, "ISO-8859-1");
735 dconvert(1, "UCS4LE");
736 dconvert(0, "CP865");
743 * c-file-style: "Stroustrup"
744 * indent-tabs-mode: nil
746 * vim: shiftwidth=4 tabstop=8 expandtab