cec66ab347d4320f050eaa2499c31524bfb3ad69
[yaz-moved-to-github.git] / util / marcdump.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
5
6 #define _FILE_OFFSET_BITS 64
7
8 #if HAVE_CONFIG_H
9 #include <config.h>
10 #endif
11
12 #if YAZ_HAVE_XML2
13 #include <libxml/parser.h>
14 #include <libxml/tree.h>
15 #include <libxml/xpath.h>
16 #include <libxml/xpathInternals.h>
17
18 /* Libxml2 version < 2.6.15. xmlreader not reliable/present */
19 #if LIBXML_VERSION < 20615
20 #define USE_XMLREADER 0
21 #else
22 #define USE_XMLREADER 1
23 #endif
24
25 #if USE_XMLREADER
26 #include <libxml/xmlreader.h>
27 #endif
28
29 #endif
30
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <errno.h>
35 #include <assert.h>
36
37 #if HAVE_LOCALE_H
38 #include <locale.h>
39 #endif
40 #if HAVE_LANGINFO_H
41 #include <langinfo.h>
42 #endif
43
44 #include <yaz/marcdisp.h>
45 #include <yaz/json.h>
46 #include <yaz/yaz-util.h>
47 #include <yaz/xmalloc.h>
48 #include <yaz/options.h>
49
50 #ifndef SEEK_SET
51 #define SEEK_SET 0
52 #endif
53 #ifndef SEEK_END
54 #define SEEK_END 2
55 #endif
56
57
/* Program name as invoked; main() sets it from argv[0] and the
   diagnostics in this file use it as a message prefix. */
static char *prog;

/* Number of errors counted while processing input. main() exits with
   status 5 when this is non-zero. */
static int no_errors = 0;
61
/* Print the one-line command synopsis to stderr.
   prog is the program name shown after "Usage:". */
static void usage(const char *prog)
{
    /* The option summary is split off from the program name so that the
       long literal stays readable; the bytes written are unchanged. */
    fprintf(stderr, "Usage: %s", prog);
    fputs(" [-i format] [-o format] [-f from] [-t to] "
          "[-l pos=value] [-c cfile] [-s prefix] [-C size] [-n] "
          "[-p] [-v] [-V] file...\n",
          stderr);
}
69
70 static void show_version(void)
71 {
72     char vstr[20], sha1_str[41];
73
74     yaz_version(vstr, sha1_str);
75     printf("YAZ version: %s %s\n", YAZ_VERSION, YAZ_VERSION_SHA1);
76     if (strcmp(sha1_str, YAZ_VERSION_SHA1))
77         printf("YAZ DLL/SO: %s %s\n", vstr, sha1_str);
78     exit(0);
79 }
80
/* Byte-reader callback: client_data is the FILE * to read from.
   Returns the next byte, or 0 when fgetc() reports EOF (so a NUL byte
   in the input cannot be told apart from end of input). */
static int getbyte_stream(void *client_data)
{
    int ch = fgetc((FILE *) client_data);

    return ch == EOF ? 0 : ch;
}
90
/* Push-back callback matching getbyte_stream: client_data is the FILE *.
   The value 0 (getbyte_stream's end-of-input marker) is mapped back to
   EOF before being handed to ungetc(). */
static void ungetbyte_stream(int c, void *client_data)
{
    FILE *stream = (FILE *) client_data;

    ungetc(c == 0 ? EOF : c, stream);
}
99
100 static void marcdump_read_line(yaz_marc_t mt, const char *fname)
101 {
102     FILE *inf = fopen(fname, "rb");
103     if (!inf)
104     {
105         fprintf(stderr, "%s: cannot open %s:%s\n",
106                 prog, fname, strerror(errno));
107         exit(1);
108     }
109
110     while (yaz_marc_read_line(mt, getbyte_stream,
111                               ungetbyte_stream, inf) == 0)
112     {
113         WRBUF wrbuf = wrbuf_alloc();
114         yaz_marc_write_mode(mt, wrbuf);
115         fputs(wrbuf_cstr(wrbuf), stdout);
116         wrbuf_destroy(wrbuf);
117     }
118     fclose(inf);
119 }
120
121 static void marcdump_read_json(yaz_marc_t mt, const char *fname)
122 {
123     FILE *inf = fopen(fname, "rb");
124     if (!inf)
125     {
126         fprintf(stderr, "%s: cannot open %s:%s\n",
127                 prog, fname, strerror(errno));
128         exit(1);
129     }
130     else
131     {
132         const char *errmsg;
133         size_t errpos;
134         WRBUF w = wrbuf_alloc();
135         struct json_node *n;
136         int c;
137
138         while ((c = getc(inf)) != EOF)
139             wrbuf_putc(w, c);
140         n = json_parse2(wrbuf_cstr(w), &errmsg, &errpos);
141         if (n)
142         {
143             int r = yaz_marc_read_json_node(mt, n);
144             if (r == 0)
145             {
146                 wrbuf_rewind(w);
147                 yaz_marc_write_mode(mt, w);
148                 fputs(wrbuf_cstr(w), stdout);
149                 wrbuf_rewind(w);
150             }
151             else
152             {
153                 fprintf(stderr, "%s: JSON MARC parsing failed ret=%d\n", fname,
154                         r);
155             }
156         }
157         else
158         {
159             fprintf(stderr, "%s: JSON parse error: %s . pos=%ld\n", fname,
160                     errmsg, (long) errpos);
161         }
162         wrbuf_destroy(w);
163         fclose(inf);
164     }
165 }
166
167
168 #if YAZ_HAVE_XML2
169 static void marcdump_read_xml(yaz_marc_t mt, const char *fname)
170 {
171     WRBUF wrbuf = wrbuf_alloc();
172 #if USE_XMLREADER
173     xmlTextReaderPtr reader = xmlReaderForFile(fname, 0 /* encoding */,
174                                                0 /* options */);
175
176     if (reader)
177     {
178         int ret;
179         while ((ret = xmlTextReaderRead(reader)) == 1)
180         {
181             int type = xmlTextReaderNodeType(reader);
182             if (type == XML_READER_TYPE_ELEMENT)
183             {
184                 char *name = (char *) xmlTextReaderLocalName(reader);
185                 if (!strcmp(name, "record") || !strcmp(name, "r"))
186                 {
187                     xmlNodePtr ptr = xmlTextReaderExpand(reader);
188
189                     int r = yaz_marc_read_xml(mt, ptr);
190                     if (r)
191                     {
192                         no_errors++;
193                         fprintf(stderr, "yaz_marc_read_xml failed\n");
194                     }
195                     else
196                     {
197                         int write_rc = yaz_marc_write_mode(mt, wrbuf);
198                         if (write_rc)
199                         {
200                             yaz_log(YLOG_WARN, "yaz_marc_write_mode: "
201                                     "write error: %d", write_rc);
202                             no_errors++;
203                         }
204                         fputs(wrbuf_cstr(wrbuf), stdout);
205                         wrbuf_rewind(wrbuf);
206                     }
207                 }
208                 xmlFree(name);
209             }
210         }
211         xmlFreeTextReader(reader);
212     }
213 #else
214     xmlDocPtr doc = xmlParseFile(fname);
215     if (doc)
216     {
217         xmlNodePtr ptr = xmlDocGetRootElement(doc);
218         for (; ptr; ptr = ptr->next)
219         {
220             if (ptr->type == XML_ELEMENT_NODE)
221             {
222                 if (!strcmp((const char *) ptr->name, "collection"))
223                 {
224                     ptr = ptr->children;
225                     continue;
226                 }
227                 if (!strcmp((const char *) ptr->name, "record") ||
228                     !strcmp((const char *) ptr->name, "r"))
229                 {
230                     int r = yaz_marc_read_xml(mt, ptr);
231                     if (r)
232                     {
233                         no_errors++;
234                         fprintf(stderr, "yaz_marc_read_xml failed\n");
235                     }
236                     else
237                     {
238                         yaz_marc_write_mode(mt, wrbuf);
239
240                         fputs(wrbuf_cstr(wrbuf), stdout);
241                         wrbuf_rewind(wrbuf);
242                     }
243                 }
244             }
245         }
246         xmlFreeDoc(doc);
247     }
248 #endif
249     fputs(wrbuf_cstr(wrbuf), stdout);
250     wrbuf_destroy(wrbuf);
251 }
252 #endif
253
254 static void dump(const char *fname, const char *from, const char *to,
255                  int input_format, int output_format,
256                  int write_using_libxml2,
257                  int print_offset, const char *split_fname, int split_chunk,
258                  int verbose, FILE *cfile, const char *leader_spec)
259 {
260     yaz_marc_t mt = yaz_marc_create();
261     yaz_iconv_t cd = 0;
262
263     if (yaz_marc_leader_spec(mt, leader_spec))
264     {
265         fprintf(stderr, "bad leader spec: %s\n", leader_spec);
266         yaz_marc_destroy(mt);
267         exit(2);
268     }
269     if (from && to)
270     {
271         cd = yaz_iconv_open(to, from);
272         if (!cd)
273         {
274             fprintf(stderr, "conversion from %s to %s "
275                     "unsupported\n", from, to);
276             yaz_marc_destroy(mt);
277             exit(2);
278         }
279         yaz_marc_iconv(mt, cd);
280     }
281     yaz_marc_enable_collection(mt);
282     yaz_marc_xml(mt, output_format);
283     yaz_marc_write_using_libxml2(mt, write_using_libxml2);
284     yaz_marc_debug(mt, verbose);
285
286     if (input_format == YAZ_MARC_MARCXML || input_format == YAZ_MARC_TURBOMARC || input_format == YAZ_MARC_XCHANGE)
287     {
288 #if YAZ_HAVE_XML2
289         marcdump_read_xml(mt, fname);
290 #endif
291     }
292     else if (input_format == YAZ_MARC_LINE)
293     {
294         marcdump_read_line(mt, fname);
295     }
296     else if (input_format == YAZ_MARC_JSON)
297     {
298         marcdump_read_json(mt, fname);
299     }
300     else if (input_format == YAZ_MARC_ISO2709)
301     {
302         FILE *inf = fopen(fname, "rb");
303         int num = 1;
304         int marc_no = 0;
305         int split_file_no = -1;
306         if (!inf)
307         {
308             fprintf(stderr, "%s: cannot open %s:%s\n",
309                     prog, fname, strerror(errno));
310             exit(1);
311         }
312         if (cfile)
313             fprintf(cfile, "char *marc_records[] = {\n");
314         for(;; marc_no++)
315         {
316             const char *result = 0;
317             size_t len;
318             size_t rlen;
319             size_t len_result;
320             size_t r;
321             char buf[100001];
322
323             r = fread(buf, 1, 5, inf);
324             if (r < 5)
325             {
326                 if (r == 0) /* normal EOF, all good */
327                     break;
328                 if (print_offset && verbose)
329                 {
330                     printf("<!-- Extra %ld bytes at end of file -->\n",
331                            (long) r);
332                 }
333                 break;
334             }
335             while (*buf < '0' || *buf > '9')
336             {
337                 int i;
338                 long off = ftell(inf) - 5;
339                 printf("<!-- Skipping bad byte %d (0x%02X) at offset "
340                        "%ld (0x%lx) -->\n",
341                        *buf & 0xff, *buf & 0xff,
342                        off, off);
343                 for (i = 0; i<4; i++)
344                     buf[i] = buf[i+1];
345                 r = fread(buf+4, 1, 1, inf);
346                 no_errors++;
347                 if (r < 1)
348                     break;
349             }
350             if (r < 1)
351             {
352                 if (verbose || print_offset)
353                     printf("<!-- End of file with data -->\n");
354                 break;
355             }
356             if (print_offset)
357             {
358                 long off = ftell(inf) - 5;
359                 printf("<!-- Record %d offset %ld (0x%lx) -->\n",
360                        num, off, off);
361             }
362             len = atoi_n(buf, 5);
363             if (len < 25 || len > 100000)
364             {
365                 long off = ftell(inf) - 5;
366                 printf("<!-- Bad Length %ld read at offset %ld (%lx) -->\n",
367                        (long)len, (long) off, (long) off);
368                 no_errors++;
369                 break;
370             }
371             rlen = len - 5;
372             r = fread(buf + 5, 1, rlen, inf);
373             if (r < rlen)
374             {
375                 long off = ftell(inf);
376                 printf("<!-- Premature EOF at offset %ld (%lx) -->\n",
377                        (long) off, (long) off);
378                 no_errors++;
379                 break;
380             }
381             while (buf[len-1] != ISO2709_RS)
382             {
383                 if (len > sizeof(buf)-2)
384                 {
385                     r = 0;
386                     break;
387                 }
388                 r = fread(buf + len, 1, 1, inf);
389                 if (r != 1)
390                     break;
391                 len++;
392             }
393             if (r < 1)
394             {
395                 printf("<!-- EOF while searching for RS -->\n");
396                 no_errors++;
397                 break;
398             }
399             if (split_fname)
400             {
401                 char fname[256];
402                 const char *mode = 0;
403                 FILE *sf;
404                 if ((marc_no % split_chunk) == 0)
405                 {
406                     mode = "wb";
407                     split_file_no++;
408                 }
409                 else
410                     mode = "ab";
411                 sprintf(fname, "%.200s%07d", split_fname, split_file_no);
412                 sf = fopen(fname, mode);
413                 if (!sf)
414                 {
415                     fprintf(stderr, "Could not open %s\n", fname);
416                     split_fname = 0;
417                 }
418                 else
419                 {
420                     if (fwrite(buf, 1, len, sf) != len)
421                     {
422                         fprintf(stderr, "Could write content to %s\n",
423                                 fname);
424                         split_fname = 0;
425                         no_errors++;
426                     }
427                     fclose(sf);
428                 }
429             }
430             len_result = rlen;
431             r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
432             if (r == -1)
433                 no_errors++;
434             if (r > 0 && result && len_result)
435             {
436                 if (fwrite(result, len_result, 1, stdout) != 1)
437                 {
438                     fprintf(stderr, "Write to stdout failed\n");
439                     no_errors++;
440                     break;
441                 }
442             }
443             if (r > 0 && cfile)
444             {
445                 char *p = buf;
446                 size_t i;
447                 if (marc_no)
448                     fprintf(cfile, ",");
449                 fprintf(cfile, "\n");
450                 for (i = 0; i < r; i++)
451                 {
452                     if ((i & 15) == 0)
453                         fprintf(cfile, "  \"");
454                     if (p[i] < 32 || p[i] > 126)
455                         fprintf(cfile, "\" \"\\x%02X\" \"", p[i] & 255);
456                     else
457                         fputc(p[i], cfile);
458
459                     if (i < r - 1 && (i & 15) == 15)
460                         fprintf(cfile, "\"\n");
461
462                 }
463                 fprintf(cfile, "\"\n");
464             }
465             num++;
466             if (verbose)
467                 printf("\n");
468         }
469         if (cfile)
470             fprintf(cfile, "};\n");
471         fclose(inf);
472     }
473     {
474         WRBUF wrbuf = wrbuf_alloc();
475         yaz_marc_write_trailer(mt, wrbuf);
476         fputs(wrbuf_cstr(wrbuf), stdout);
477         wrbuf_destroy(wrbuf);
478     }
479     if (cd)
480         yaz_iconv_close(cd);
481     yaz_marc_destroy(mt);
482 }
483
484 int main (int argc, char **argv)
485 {
486     int r;
487     int print_offset = 0;
488     char *arg;
489     int verbose = 0;
490     int no = 0;
491     int output_format = YAZ_MARC_LINE;
492     FILE *cfile = 0;
493     char *from = 0, *to = 0;
494     int input_format = YAZ_MARC_ISO2709;
495     int split_chunk = 1;
496     const char *split_fname = 0;
497     const char *leader_spec = 0;
498     int write_using_libxml2 = 0;
499
500 #if HAVE_LOCALE_H
501     setlocale(LC_CTYPE, "");
502 #endif
503 #if HAVE_LANGINFO_H
504 #ifdef CODESET
505     to = nl_langinfo(CODESET);
506 #endif
507 #endif
508
509     prog = *argv;
510     while ((r = options("i:o:C:npc:xOeXIf:t:s:l:Vv", argv, argc, &arg)) != -2)
511     {
512         no++;
513         switch (r)
514         {
515         case 'i':
516             input_format = yaz_marc_decode_formatstr(arg);
517             if (input_format == -1)
518             {
519                 fprintf(stderr, "%s: bad input format: %s\n", prog, arg);
520                 exit(1);
521             }
522 #if YAZ_HAVE_XML2
523 #else
524             if (input_format == YAZ_MARC_MARCXML
525                 || input_format == YAZ_MARC_XCHANGE)
526             {
527                 fprintf(stderr, "%s: Libxml2 support not enabled\n", prog);
528                 exit(3);
529             }
530 #endif
531             break;
532         case 'o':
533             /* dirty hack so we can make Libxml2 do the writing ..
534                rather than WRBUF */
535             if (strlen(arg) > 4 && strncmp(arg, "xml,", 4) == 0)
536             {
537                 /* Only supported for Libxml2 2.6.0 or later */
538 #if LIBXML_VERSION >= 20600
539                 arg = arg + 4;
540                 write_using_libxml2 = 1;
541 #else
542                 fprintf(stderr, "%s: output using Libxml2 unsupported\n", prog);
543                 exit(4);
544 #endif
545             }
546             output_format = yaz_marc_decode_formatstr(arg);
547             if (output_format == -1)
548             {
549                 fprintf(stderr, "%s: bad output format: %s\n", prog, arg);
550                 exit(1);
551             }
552             break;
553         case 'l':
554             leader_spec = arg;
555             break;
556         case 'f':
557             from = arg;
558             break;
559         case 't':
560             to = arg;
561             break;
562         case 'c':
563             if (cfile)
564                 fclose(cfile);
565             cfile = fopen(arg, "w");
566             break;
567         case 'x':
568             fprintf(stderr, "%s: -x no longer supported. "
569                     "Use -i marcxml instead\n", prog);
570             exit(1);
571             break;
572         case 'O':
573             fprintf(stderr, "%s: OAI MARC no longer supported."
574                     " Use MARCXML instead.\n", prog);
575             exit(1);
576             break;
577         case 'e':
578             fprintf(stderr, "%s: -e no longer supported. "
579                     "Use -o marcxchange instead\n", prog);
580             exit(1);
581             break;
582         case 'X':
583             fprintf(stderr, "%s: -X no longer supported. "
584                     "Use -o marcxml instead\n", prog);
585             exit(1);
586             break;
587         case 'I':
588             fprintf(stderr, "%s: -I no longer supported. "
589                     "Use -o marc instead\n", prog);
590             exit(1);
591             break;
592         case 'n':
593             output_format = YAZ_MARC_CHECK;
594             break;
595         case 'p':
596             print_offset = 1;
597             break;
598         case 's':
599             split_fname = arg;
600             break;
601         case 'C':
602             split_chunk = atoi(arg);
603             break;
604         case 0:
605             dump(arg, from, to, input_format, output_format,
606                  write_using_libxml2,
607                  print_offset, split_fname, split_chunk,
608                  verbose, cfile, leader_spec);
609             break;
610         case 'v':
611             verbose++;
612             break;
613         case 'V':
614             show_version();
615             break;
616         default:
617             usage(prog);
618             exit(1);
619         }
620     }
621     if (cfile)
622         fclose(cfile);
623     if (!no)
624     {
625         usage(prog);
626         exit(1);
627     }
628     if (no_errors)
629         exit(5);
630     exit(0);
631 }
632 /*
633  * Local variables:
634  * c-basic-offset: 4
635  * c-file-style: "Stroustrup"
636  * indent-tabs-mode: nil
637  * End:
638  * vim: shiftwidth=4 tabstop=8 expandtab
639  */
640