1 /* $Id: recgrs.c,v 1.60 2002-08-17 07:59:54 adam Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
27 #include <sys/types.h>
/* NOTE(review): no use of GRS_MAX_WORD appears in the visible part of this
 * file; presumably an upper bound for word buffers -- TODO confirm. */
38 #define GRS_MAX_WORD 512
/* Link to the following handler node; read_grs_type advances through it. */
44 struct grs_handler *next;
/* Head of the handler list; read_grs_type starts its scan here. */
48 struct grs_handler *handlers;
/*
 * read_grs_type: pick a GRS handler for `type` and let it read one record.
 *
 * h     handler registry; its list is scanned starting at h->handlers.
 * p     read context passed to the handler's read function; p->type and
 *       p->clientData are written here.
 * type  handler name, optionally followed by '.' and a sub-type string.
 * root  receives whatever the selected handler's read function returns.
 *
 * The return statements are not among the lines visible here;
 * grs_extract_sub treats a non-zero result as an extraction error.
 */
51 static int read_grs_type (struct grs_handlers *h,
52 struct grs_read_info *p, const char *type,
55 struct grs_handler *gh = h->handlers;
56 const char *cp = strchr (type, '.');
/* No '.' at all, or a leading '.': the handler name spans the whole string,
 * so cp is moved to the terminating NUL. */
58 if (cp == NULL || cp == type)
60 cp = strlen(type) + type;
/* The text after the '.' is copied into p->type. The copy is unbounded:
 * assumes p->type is large enough -- TODO confirm its declared size. */
64 strcpy (p->type, cp+1);
65 for (gh = h->handlers; gh; gh = gh->next)
/* NOTE(review): only the first cp-type bytes are compared, so a registered
 * name that merely begins with this prefix also matches -- confirm that this
 * is intended. */
67 if (!memcmp (type, gh->type->type, cp-type))
/* Handler state comes from the type's init hook (presumably run once per
 * handler; the guarding condition is not visible here). */
72 gh->clientData = (*gh->type->init)();
/* Pass the handler's private state to the reader, then store back whatever
 * the reader left in p->clientData. */
74 p->clientData = gh->clientData;
75 *root = (gh->type->read)(p);
76 gh->clientData = p->clientData;
/*
 * grs_add_handler: allocate a new handler list node for registry h.
 * The node's next pointer is set to the current list head, i.e. it is
 * prepared for insertion at the front of h->handlers.
 * NOTE(review): storing t in the node and updating h->handlers presumably
 * happen on lines not visible here -- TODO confirm.
 */
83 static void grs_add_handler (struct grs_handlers *h, RecTypeGrs t)
85 struct grs_handler *gh = (struct grs_handler *) xmalloc (sizeof(*gh));
86 gh->next = h->handlers;
/*
 * grs_init: allocate the handler registry and register the GRS readers
 * (sgml, regx, tcl, marc, xml) with it.
 * recType is not referenced in the lines visible here.
 * NOTE(review): presumably returns h, since grs_destroy and grs_extract cast
 * their clientData back to struct grs_handlers -- TODO confirm.
 */
93 static void *grs_init(RecType recType)
95 struct grs_handlers *h = (struct grs_handlers *) xmalloc (sizeof(*h));
98 grs_add_handler (h, recTypeGrs_sgml);
99 grs_add_handler (h, recTypeGrs_regx);
/* NOTE(review): the gaps in the original numbering around the next three
 * registrations suggest conditional compilation -- TODO confirm. */
101 grs_add_handler (h, recTypeGrs_tcl);
103 grs_add_handler (h, recTypeGrs_marc);
105 grs_add_handler (h, recTypeGrs_xml);
/*
 * grs_destroy: tear down the handler registry passed as clientData.
 * A handler type's destroy hook is invoked with that handler's clientData;
 * the gh_next local suggests the list is walked node by node (the loop
 * itself is not visible here).
 */
110 static void grs_destroy(void *clientData)
112 struct grs_handlers *h = (struct grs_handlers *) clientData;
113 struct grs_handler *gh = h->handlers, *gh_next;
118 (*gh->type->destroy)(gh->clientData);
126 1 start element (tag)
128 3 start attr (and attr-exact)
/*
 * index_xpath: build XPath-style index words for node n and hand them to
 * p->tokenAdd.
 *
 * n      data1 node being indexed.
 * p      extraction control; supplies tokenAdd and flagShowRecords.
 * level  nesting depth, used to indent the flagShowRecords output.
 * wrd    word record that is filled in before each tokenAdd call.
 * use    kind of entry requested; dumpkeys passes 1 before a tag's children,
 *        2 after them, and 1016 for data nodes.
 */
135 static void index_xpath (data1_node *n, struct recExtractCtrl *p,
136 int level, RecWord *wrd, int use)
139 char tag_path_full[1024];
/* The word is taken straight from the node's data buffer. */
147 wrd->string = n->u.data.data;
148 wrd->length = n->u.data.len;
/* NOTE(review): this line ends in ',' rather than ';', so it is joined to
 * the following statement by the comma operator -- looks like a typo. */
149 wrd->attrSet = VAL_IDXPATH,
/* Record listing: show at most the first 8 bytes of the data. */
151 if (p->flagShowRecords)
153 printf("%*s data=", (level + 1) * 4, "");
154 for (i = 0; i<wrd->length && i < 8; i++)
155 fputc (wrd->string[i], stdout);
/* Walk from n up through its ancestors; each tag name is appended to
 * tag_path_full followed by '/', so the innermost tag comes first. */
164 for (nn = n; nn; nn = nn->parent)
166 if (nn->which == DATA1N_tag)
168 size_t tlen = strlen(nn->u.tag.tag);
/* Capacity check for the next tag; what happens when it does not fit is on
 * a line not visible here. */
169 if (tlen + flen > (sizeof(tag_path_full)-2))
171 memcpy (tag_path_full + flen, nn->u.tag.tag, tlen);
173 tag_path_full[flen++] = '/';
175 else if (nn->which == DATA1N_root)
179 wrd->string = tag_path_full;
181 wrd->attrSet = VAL_IDXPATH;
/* Record listing: show at most the first 40 bytes of the path. */
183 if (p->flagShowRecords)
185 printf("%*s tag=", (level + 1) * 4, "");
186 for (i = 0; i<wrd->length && i < 40; i++)
187 fputc (wrd->string[i], stdout);
195 (*p->tokenAdd)(wrd); /* index element path (AKA tag path) */
/* First pass over the attributes: the name alone, then name and value
 * combined in comb. */
198 for (xp = n->u.tag.attributes; xp; xp = xp->next)
201 /* attribute (no value) */
204 wrd->string = xp->name;
205 wrd->length = strlen(xp->name);
211 strlen(xp->name) + strlen(xp->value) < sizeof(comb)-2)
213 /* attribute value exact */
/* The length test above keeps name plus value within comb. */
214 strcpy (comb, xp->name);
216 strcat (comb, xp->value);
221 wrd->length = strlen(comb);
/* Second pass over the attributes: words keyed by "@name/" plus the path. */
227 for (xp = n->u.tag.attributes; xp; xp = xp->next)
229 char attr_tag_path_full[1024];
/* NOTE(review): this buffer has the same size as tag_path_full, yet it
 * receives '@', the attribute name and '/' in addition to up to flen bytes
 * of the path, and nothing visible bounds the total -- consider snprintf.
 * The precision argument of the last conversion must be an int; the
 * declaration of flen is not visible here -- TODO confirm its type. */
231 sprintf (attr_tag_path_full, "@%s/%.*s",
232 xp->name, flen, tag_path_full);
236 wrd->string = attr_tag_path_full;
237 wrd->length = strlen(attr_tag_path_full);
/* The attribute's value becomes the word. */
242 wrd->string = xp->value;
243 wrd->length = strlen(xp->value);
249 wrd->string = attr_tag_path_full;
250 wrd->length = strlen(attr_tag_path_full);
/*
 * index_termlist: fill wrd from node n according to the termlists of the
 * nearest element definition found at or above par.
 *
 * par    tag node at which the search for an element definition starts.
 * n      node supplying the text (data, tag name, or an attribute value).
 * p      extraction control (dh, flagShowRecords).
 * level  indentation depth for the flagShowRecords output.
 * wrd    word record receiving string/length and the attribute settings.
 */
258 static void index_termlist (data1_node *par, data1_node *n,
259 struct recExtractCtrl *p, int level, RecWord *wrd)
261 data1_termlist *tlist = 0;
262 data1_datatype dtype = DATA1K_string;
264 * cycle up towards the root until we find a tag with an att..
265 * this has the effect of indexing locally defined tags with
266 * the attribute of their ancestor in the record.
269 while (!par->u.tag.element)
270 if (!par->parent || !(par=get_parent_tag(p->dh, par->parent)))
/* Nothing to do without an element definition that carries termlists. */
272 if (!par || !(tlist = par->u.tag.element->termlists))
/* The default dtype (string) is replaced by the element tag's kind. */
274 if (par->u.tag.element->tag)
275 dtype = par->u.tag.element->tag->kind;
/* One pass per termlist entry. */
277 for (; tlist; tlist = tlist->next)
280 /* consider source */
/* Source "data": the text of a data node. */
283 if (!strcmp (tlist->source, "data") && n->which == DATA1N_data)
285 wrd->string = n->u.data.data;
286 wrd->length = n->u.data.len;
/* Source "tag": the name of a tag node. */
288 else if (!strcmp (tlist->source, "tag") && n->which == DATA1N_tag)
290 wrd->string = n->u.tag.tag;
291 wrd->length = strlen(n->u.tag.tag);
/* Source "attr(NAME)": the value of the named attribute. The scan width
 * limits NAME to 511 characters; assumes xattr holds at least 512 bytes --
 * its declaration is not visible here. */
293 else if (sscanf (tlist->source, "attr(%511[^)])", xattr) == 1 &&
294 n->which == DATA1N_tag)
/* NOTE(review): this local p shadows the function parameter p inside the
 * branch. */
296 data1_xattr *p = n->u.tag.attributes;
297 while (p && strcmp (p->name, xattr))
301 wrd->string = p->value;
302 wrd->length = strlen(p->value);
/* Record listing of what would be indexed, with at most 8 bytes of text. */
307 if (p->flagShowRecords)
310 printf("%*sIdx: [%s]", (level + 1) * 4, "",
312 printf("%s:%s [%d] %s",
313 tlist->att->parent->name,
314 tlist->att->name, tlist->att->value,
317 for (i = 0; i<wrd->length && i < 8; i++)
318 fputc (wrd->string[i], stdout);
322 fputc ('\n', stdout);
/* Register type is the first element tlist->structure points at; attribute
 * set and use come from the termlist's attribute. */
326 wrd->reg_type = *tlist->structure;
327 wrd->attrSet = (int) (tlist->att->parent->reference);
328 wrd->attrUse = tlist->att->locals->local;
/*
 * dumpkeys: walk the sibling list starting at n, recursing into children,
 * and generate index entries for tag and data nodes.
 *
 * n      first node of the sibling list to process.
 * p      extraction control (dh, flagShowRecords).
 * level  current depth; one more per recursion, also indents the listing.
 * wrd    word record passed on to index_termlist and index_xpath.
 *
 * The recursive call's result is tested for being negative, and
 * grs_extract_sub treats a negative return as an extraction error.
 */
335 static int dumpkeys(data1_node *n, struct recExtractCtrl *p, int level,
338 for (; n; n = n->next)
340 if (p->flagShowRecords) /* display element description to user */
342 if (n->which == DATA1N_root)
344 printf("%*s", level * 4, "");
345 printf("Record type: '%s'\n", n->u.root.type);
347 else if (n->which == DATA1N_tag)
351 printf("%*s", level * 4, "");
/* A tag without an element definition is reported as a local tag. */
352 if (!(e = n->u.tag.element))
353 printf("Local tag: '%s'\n", n->u.tag.tag);
356 printf("Elm: '%s' ", e->name);
359 data1_tag *t = e->tag;
361 printf("TagNam: '%s' ", t->names->name);
364 printf("%s[%d],", t->tagset->name, t->tagset->type);
/* Tag values are either numeric or strings. */
367 if (t->which == DATA1T_numeric)
368 printf("%d)", t->value.numeric);
370 printf("'%s')", t->value.string);
377 if (n->which == DATA1N_tag)
379 index_termlist (n, n, p, level, wrd);
380 /* index start tag */
/* NOTE(review): this assert demands a non-NULL absyn, yet the next test
 * handles a NULL absyn; with assertions enabled that branch cannot be
 * reached, unless a preprocessor guard on a line not visible here disables
 * one of the two -- confirm which is intended. */
381 assert (n->root->u.root.absyn);
/* XPath entries are produced when the root has no abstract syntax, or when
 * the abstract syntax enables XPath indexing. */
383 if (!n->root->u.root.absyn)
384 index_xpath (n, p, level, wrd, 1);
385 else if (n->root->u.root.absyn->enable_xpath_indexing)
386 index_xpath (n, p, level, wrd, 1);
/* Children are processed one level deeper. */
390 if (dumpkeys(n->child, p, level + 1, wrd) < 0)
394 if (n->which == DATA1N_data)
396 data1_node *par = get_parent_tag(p->dh, n);
398 if (p->flagShowRecords)
400 printf("%*s", level * 4, "");
/* Data longer than 32 bytes is abbreviated to its first 24 and last 6. */
402 if (n->u.data.len > 32)
403 printf("'%.24s ... %.6s'\n", n->u.data.data,
404 n->u.data.data + n->u.data.len-6);
405 else if (n->u.data.len > 0)
406 printf("'%.*s'\n", n->u.data.len, n->u.data.data);
/* Data is indexed against its parent tag, then as XPath use 1016. */
412 index_termlist (par, n, p, level, wrd);
413 if (!n->root->u.root.absyn)
414 index_xpath (n, p, level, wrd, 1016);
415 else if (n->root->u.root.absyn->enable_xpath_indexing)
416 index_xpath (n, p, level, wrd, 1016);
/* After the children: XPath use 2 for the same tag. */
419 if (n->which == DATA1N_tag)
422 if (!n->root->u.root.absyn)
423 index_xpath (n, p, level, wrd, 2);
424 else if (n->root->u.root.absyn->enable_xpath_indexing)
425 index_xpath (n, p, level, wrd, 2);
/* A separator line closes the listing of each root node. */
428 if (p->flagShowRecords && n->which == DATA1N_root)
430 printf("%*s-------------\n\n", level * 4, "");
/*
 * grs_extract_tree: index a data1 tree that has already been built.
 * The schema OID derived from the root's abstract syntax is reported through
 * p->schemaAdd when the OID lookup succeeds; the result of dumpkeys on the
 * tree is returned.
 * NOTE(review): n->u.root.absyn is dereferenced below with no NULL check in
 * the visible lines, whereas grs_extract_sub does test it -- confirm against
 * the lines not shown.
 */
436 int grs_extract_tree(struct recExtractCtrl *p, data1_node *n)
439 int oidtmp[OID_SIZE];
442 oe.proto = PROTO_Z3950;
443 oe.oclass = CLASS_SCHEMA;
446 oe.value = n->u.root.absyn->reference;
448 if ((oid_ent_to_oid (&oe, oidtmp)))
449 (*p->schemaAdd)(p, oidtmp);
453 return dumpkeys(n, p, 0, &wrd);
/*
 * grs_extract_sub: read one record with the handler selected by p->subType
 * and index it.
 *
 * Returns RECCTRL_EXTRACT_ERROR when read_grs_type fails, when the root has
 * no abstract syntax, or when dumpkeys reports a negative result;
 * RECCTRL_EXTRACT_EOF on a condition that is not visible here (presumably
 * when no tree was produced -- TODO confirm); RECCTRL_EXTRACT_OK otherwise.
 */
456 static int grs_extract_sub(struct grs_handlers *h, struct recExtractCtrl *p,
460 struct grs_read_info gri;
462 int oidtmp[OID_SIZE];
/* The handler reads through the caller's I/O callbacks and offset. */
465 gri.readf = p->readf;
466 gri.seekf = p->seekf;
467 gri.tellf = p->tellf;
470 gri.offset = p->offset;
474 if (read_grs_type (h, &gri, p->subType, &n))
475 return RECCTRL_EXTRACT_ERROR;
477 return RECCTRL_EXTRACT_EOF;
478 oe.proto = PROTO_Z3950;
479 oe.oclass = CLASS_SCHEMA;
/* NOTE(review): this exit returns without data1_free_tree, unlike the two
 * exits at the end of the function -- possible leak, confirm. */
481 if (!n->u.root.absyn)
482 return RECCTRL_EXTRACT_ERROR;
/* Report the record's schema OID when the lookup succeeds. */
486 oe.value = n->u.root.absyn->reference;
487 if ((oid_ent_to_oid (&oe, oidtmp)))
488 (*p->schemaAdd)(p, oidtmp);
491 /* ensure our data1 tree is UTF-8 */
492 data1_iconv (p->dh, mem, n, "UTF-8", data1_get_encoding(p->dh, n));
/* NOTE(review): tree dump to stdout; presumably compiled only in debug
 * builds -- any guard is not visible here. */
495 data1_pr_tree (p->dh, n, stdout);
/* The tree is released on both the error and the success path. */
499 if (dumpkeys(n, p, 0, &wrd) < 0)
501 data1_free_tree(p->dh, n);
502 return RECCTRL_EXTRACT_ERROR;
504 data1_free_tree(p->dh, n);
505 return RECCTRL_EXTRACT_OK;
/*
 * grs_extract: extract entry point taking the registry as clientData.
 * Creates an NMEM pool and runs grs_extract_sub with it, keeping the status
 * in ret.
 * NOTE(review): releasing mem and returning ret presumably follow on lines
 * not visible here -- TODO confirm.
 */
508 static int grs_extract(void *clientData, struct recExtractCtrl *p)
511 NMEM mem = nmem_create ();
512 struct grs_handlers *h = (struct grs_handlers *) clientData;
514 ret = grs_extract_sub(h, p, mem);
520 * Return: -1: Nothing done. 0: Ok. >0: Bib-1 diagnostic.
/*
 * process_comp: apply record composition c to the tree rooted at n.
 *
 * dh  data1 handle used for element-set lookup and Espec-1 processing.
 * n   root node; its abstract syntax is used to resolve element-set names.
 * c   requested composition, in simple or complex form.
 *
 * Diagnostics returned in the visible lines: 26 when a simple composition is
 * not in generic form, 25 for an unknown element-set name or an unrecognised
 * external spec. On the Espec-1 match path the result of data1_doespec1 is
 * returned.
 */
522 static int process_comp(data1_handle dh, data1_node *n, Z_RecordComposition *c)
524 data1_esetname *eset;
/* Simple form: a generic element-set name resolved against the abstract
 * syntax. */
530 case Z_RecordComp_simple:
531 if (c->u.simple->which != Z_ElementSetNames_generic)
532 return 26; /* only generic form supported. Fix this later */
533 if (!(eset = data1_getesetbyname(dh, n->u.root.absyn,
534 c->u.simple->u.generic)))
536 logf(LOG_LOG, "Unknown esetname '%s'", c->u.simple->u.generic);
537 return 25; /* invalid esetname */
539 logf(LOG_DEBUG, "Esetname '%s' in simple compspec",
540 c->u.simple->u.generic);
/* Complex form: only the generic part is examined in the visible lines. */
543 case Z_RecordComp_complex:
544 if (c->u.complex->generic)
546 /* insert check for schema */
547 if ((p = c->u.complex->generic->elementSpec))
/* The element spec is either an element-set name or an external spec. */
551 case Z_ElementSpec_elementSetName:
553 data1_getesetbyname(dh, n->u.root.absyn,
554 p->u.elementSetName)))
556 logf(LOG_LOG, "Unknown esetname '%s'",
557 p->u.elementSetName);
558 return 25; /* invalid esetname */
560 logf(LOG_DEBUG, "Esetname '%s' in complex compspec",
561 p->u.elementSetName);
/* Espec-1 is the only external spec recognised here. */
564 case Z_ElementSpec_externalSpec:
565 if (p->u.externalSpec->which == Z_External_espec1)
567 logf(LOG_DEBUG, "Got Espec-1");
568 espec = p->u.externalSpec-> u.espec1;
572 logf(LOG_LOG, "Unknown external espec.");
573 return 25; /* bad. what is proper diagnostic? */
584 logf (LOG_DEBUG, "Element: Espec-1 match");
585 return data1_doespec1(dh, n, espec);
589 logf (LOG_DEBUG, "Element: all match");
/*
 * add_idzebra_info: decorate top with idzebra metadata.
 * Adds the xmlns:idzebra namespace attribute and creates the child tags
 * idzebra:size, idzebra:score, idzebra:localnumber and idzebra:filename.
 * NOTE(review): any conditions guarding the individual tags are not visible
 * here.
 */
594 static void add_idzebra_info (struct recRetrieveCtrl *p, data1_node *top,
/* Attribute name followed by its value; presumably terminated by a NULL
 * entry set on a line not visible here -- TODO confirm. */
597 const char *idzebra_ns[7];
599 idzebra_ns[0] = "xmlns:idzebra";
600 idzebra_ns[1] = "http://www.indexdata.dk/zebra/";
603 data1_tag_add_attr (p->dh, mem, top, idzebra_ns);
605 data1_mk_tag_data_int (p->dh, top, "idzebra:size", p->recordSize,
608 data1_mk_tag_data_int (p->dh, top, "idzebra:score",
611 data1_mk_tag_data_int (p->dh, top, "idzebra:localnumber", p->localno,
614 data1_mk_tag_data_text(p->dh, top, "idzebra:filename",
/*
 * grs_retrieve: read a stored record and render it in the requested syntax.
 *
 * clientData  handler registry (struct grs_handlers).
 * p           retrieval control: I/O callbacks, sub-type, requested
 *             composition and format on input; rec_buf, rec_len,
 *             output_format and diagnostic on output.
 *
 * Stages, in the order they appear below:
 *  1. Copy the I/O callbacks into gri and read the record with the handler
 *     selected by p->subType.
 *  2. Convert the tree to UTF-8 and locate its root tag.
 *  3. Add a "size" tag, a rank tag (resource "tagrank", default "rank") and
 *     a system-number tag (resource "tagsysno", default
 *     "localControlNumber") under the root tag.
 *  4. If a complex composition names a schema, look for a maptab targeting
 *     it, map the record, and compare the result's abstract-syntax reference
 *     with the request.
 *  5. Look for a maptab targeting p->input_format and map the record.
 *  6. For GRS-1 output with a known schema reference, add a
 *     "schemaIdentifier" tag holding the OID rendered as text.
 *  7. Apply the element specification through process_comp.
 *  8. Choose the output format (p->input_format, or SUTRS when none was
 *     given) and call the matching data1_nodeto... converter.
 *  9. Free the tree(s).
 *
 * NOTE(review): stage 4 dereferences node->u.root.absyn->maptabs without the
 * NULL test that stages 4 (later), 5 and 6 apply to absyn -- confirm.
 * NOTE(review): in stage 6 an inner variable named p (used with sprintf and
 * in "p - tmp") shadows the parameter p; dh is copied from p->dh beforehand.
 */
618 static int grs_retrieve(void *clientData, struct recRetrieveCtrl *p)
620 data1_node *node = 0, *onode = 0, *top;
623 int res, selected = 0;
625 struct grs_read_info gri;
627 struct grs_handlers *h = (struct grs_handlers *) clientData;
628 int requested_schema = VAL_NONE;
629 data1_marctab *marctab;
/* The handler reads through the caller's I/O callbacks. */
633 gri.readf = p->readf;
634 gri.seekf = p->seekf;
635 gri.tellf = p->tellf;
642 logf (LOG_DEBUG, "grs_retrieve");
643 if (read_grs_type (h, &gri, p->subType, &node))
655 /* ensure our data1 tree is UTF-8 */
656 data1_iconv (p->dh, mem, node, "UTF-8", data1_get_encoding(p->dh, node));
659 data1_pr_tree (p->dh, node, stdout);
661 top = data1_get_root_tag (p->dh, node);
/* Record size, formatted as decimal text into the node's own lbuf. */
663 logf (LOG_DEBUG, "grs_retrieve: size");
664 if ((dnew = data1_mk_tag_data_wd(p->dh, top, "size", mem)))
666 dnew->u.data.what = DATA1I_text;
667 dnew->u.data.data = dnew->lbuf;
668 sprintf(dnew->u.data.data, "%d", p->recordSize);
669 dnew->u.data.len = strlen(dnew->u.data.data);
/* Rank tag: skipped when the resource value is "0" or the score is
 * negative. */
672 tagname = res_get_def(p->res, "tagrank", "rank");
673 if (strcmp(tagname, "0") && p->score >= 0 &&
674 (dnew = data1_mk_tag_data_wd(p->dh, top, tagname, mem)))
676 logf (LOG_DEBUG, "grs_retrieve: %s", tagname);
677 dnew->u.data.what = DATA1I_num;
678 dnew->u.data.data = dnew->lbuf;
679 sprintf(dnew->u.data.data, "%d", p->score);
680 dnew->u.data.len = strlen(dnew->u.data.data);
/* System-number tag: skipped when the resource value is "0" or localno is
 * not positive. */
683 tagname = res_get_def(p->res, "tagsysno", "localControlNumber");
684 if (strcmp(tagname, "0") && p->localno > 0 &&
685 (dnew = data1_mk_tag_data_wd(p->dh, top, tagname, mem)))
687 logf (LOG_DEBUG, "grs_retrieve: %s", tagname);
688 dnew->u.data.what = DATA1I_text;
689 dnew->u.data.data = dnew->lbuf;
691 sprintf(dnew->u.data.data, "%d", p->localno);
692 dnew->u.data.len = strlen(dnew->u.data.data);
695 data1_pr_tree (p->dh, node, stdout);
/* A schema can only be requested through a complex composition. */
697 if (p->comp && p->comp->which == Z_RecordComp_complex &&
698 p->comp->u.complex->generic &&
699 p->comp->u.complex->generic->schema)
701 oident *oe = oid_getentbyoid (p->comp->u.complex->generic->schema);
703 requested_schema = oe->value;
706 /* If schema has been specified, map if possible, then check that
707 * we got the right one
709 if (requested_schema != VAL_NONE)
711 logf (LOG_DEBUG, "grs_retrieve: schema mapping");
712 for (map = node->u.root.absyn->maptabs; map; map = map->next)
714 if (map->target_absyn_ref == requested_schema)
717 if (!(node = data1_map_record(p->dh, onode, map, mem)))
726 if (node->u.root.absyn &&
727 requested_schema != node->u.root.absyn->reference)
735 * Does the requested format match a known syntax-mapping? (this reflects
736 * the overlap of schema and formatting which is inherent in the MARC
739 yaz_log (LOG_DEBUG, "grs_retrieve: syntax mapping");
740 if (node->u.root.absyn)
741 for (map = node->u.root.absyn->maptabs; map; map = map->next)
743 if (map->target_absyn_ref == p->input_format)
746 if (!(node = data1_map_record(p->dh, onode, map, mem)))
755 yaz_log (LOG_DEBUG, "grs_retrieve: schemaIdentifier");
756 if (node->u.root.absyn &&
757 node->u.root.absyn->reference != VAL_NONE &&
758 p->input_format == VAL_GRS1)
762 int oidtmp[OID_SIZE];
764 oe.proto = PROTO_Z3950;
765 oe.oclass = CLASS_SCHEMA;
766 oe.value = node->u.root.absyn->reference;
768 if ((oid = oid_ent_to_oid (&oe, oidtmp)))
771 data1_handle dh = p->dh;
775 for (ii = oid; *ii >= 0; ii++)
779 sprintf(p, "%d", *ii);
784 if ((dnew = data1_mk_tag_data_wd(dh, node,
785 "schemaIdentifier", mem)))
787 dnew->u.data.what = DATA1I_oid;
788 dnew->u.data.data = (char *) nmem_malloc(mem, p - tmp);
789 memcpy(dnew->u.data.data, tmp, p - tmp);
790 dnew->u.data.len = p - tmp;
795 logf (LOG_DEBUG, "grs_retrieve: element spec");
796 if (p->comp && (res = process_comp(p->dh, node, p->comp)) > 0)
800 data1_free_tree(p->dh, onode);
801 data1_free_tree(p->dh, node);
805 else if (p->comp && !res)
809 data1_pr_tree (p->dh, node, stdout);
811 logf (LOG_DEBUG, "grs_retrieve: transfer syntax mapping");
812 switch (p->output_format = (p->input_format != VAL_NONE ?
813 p->input_format : VAL_SUTRS))
816 add_idzebra_info (p, top, mem);
819 data1_iconv (p->dh, mem, node, p->encoding, "UTF-8");
821 if (!(p->rec_buf = data1_nodetoidsgml(p->dh, node, selected,
826 char *new_buf = (char*) odr_malloc (p->odr, p->rec_len);
827 memcpy (new_buf, p->rec_buf, p->rec_len);
828 p->rec_buf = new_buf;
833 if (!(p->rec_buf = data1_nodetogr(p->dh, node, selected,
835 p->diagnostic = 238; /* not available in requested syntax */
837 p->rec_len = (size_t) (-1);
840 if (!(p->rec_buf = data1_nodetoexplain(p->dh, node, selected,
844 p->rec_len = (size_t) (-1);
847 if (!(p->rec_buf = data1_nodetosummary(p->dh, node, selected,
851 p->rec_len = (size_t) (-1);
/* The tree is converted from UTF-8 to the requested encoding before it is
 * rendered to a buffer. */
855 data1_iconv (p->dh, mem, node, p->encoding, "UTF-8");
856 if (!(p->rec_buf = data1_nodetobuf(p->dh, node, selected,
/* The rendered bytes are copied into memory allocated from p->odr. */
861 char *new_buf = (char*) odr_malloc (p->odr, p->rec_len);
862 memcpy (new_buf, p->rec_buf, p->rec_len);
863 p->rec_buf = new_buf;
867 if (!(p->rec_buf = data1_nodetosoif(p->dh, node, selected,
872 char *new_buf = (char*) odr_malloc (p->odr, p->rec_len);
873 memcpy (new_buf, p->rec_buf, p->rec_len);
874 p->rec_buf = new_buf;
/* The marc tables hang off the abstract syntax, so one is required here. */
878 if (!node->u.root.absyn)
/* Find the marc table whose reference equals the requested format. */
883 for (marctab = node->u.root.absyn->marc; marctab;
884 marctab = marctab->next)
885 if (marctab->reference == p->input_format)
893 data1_iconv (p->dh, mem, node, p->encoding, "UTF-8");
894 if (!(p->rec_buf = data1_nodetomarc(p->dh, marctab, node,
895 selected, &p->rec_len)))
899 char *new_buf = (char*) odr_malloc (p->odr, p->rec_len);
900 memcpy (new_buf, p->rec_buf, p->rec_len);
901 p->rec_buf = new_buf;
/* Release the working tree and the pre-mapping tree kept in onode. */
905 data1_free_tree(p->dh, node);
907 data1_free_tree(p->dh, onode);
/* Record-type descriptor for GRS; its initializer is on lines not visible
 * here. */
912 static struct recType grs_type =
/* Non-static pointer to the descriptor above, so it can be referenced from
 * other translation units. */
921 RecType recTypeGrs = &grs_type;