1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2011 Index Data
3 * See the file LICENSE for details.
6 * \file solrtransform.c
7 * \brief Implements SOLR transform (SOLR to RPN conversion).
16 #include <yaz/rpn2solr.h>
17 #include <yaz/xmalloc.h>
18 #include <yaz/diagsrw.h>
19 #include <yaz/tokenizer.h>
20 #include <yaz/wrbuf.h>
21 #include <yaz/z-core.h>
22 #include <yaz/matchstr.h>
23 #include <yaz/oid_db.h>
/* One entry in the list of properties kept by a transform handle.
 * Only part of the definition is visible here; other code in this file
 * also reads and writes pattern and value members of this struct. */
27 struct solr_prop_entry {
/* attribute list filled in when the value of the entry is parsed */
30 Z_AttributeList attr_list;
/* following entry; list walks in this file stop at a null pointer */
31 struct solr_prop_entry *next;
/* State of one transform; presumably the struct behind the solr_transform_t
 * handle (the typedef is not visible here - confirm).
 * Only part of the definition is visible; other code in this file also
 * uses w, nmem, error and addinfo members through the handle. */
34 struct solr_transform_t_ {
/* head of the property entry list */
35 struct solr_prop_entry *entry;
/* tokenizer configuration used when property values are parsed */
36 yaz_tok_cfg_t tok_cfg;
44 /* TODO: utility functions; possibly split out into a separate file */
/* String comparison helper of this module: forwards both arguments to
 * cql_strcmp() and returns its result unchanged. */
45 int solr_strcmp(const char *s1, const char *s2) {
46 return cql_strcmp(s1, s2);
/* Length-limited string comparison helper of this module: forwards s1, s2
 * and n to cql_strncmp() and returns its result unchanged. */
49 int solr_strncmp(const char *s1, const char *s2, size_t n) {
50 return cql_strncmp(s1, s2, n);
/* Returns the SOLR URI string used by this module. The value is still a
 * placeholder literal; solr_transform_r() compares the index URI of a
 * search term against it. */
54 const char *solr_uri(void)
56 return "TODO:SOLR URI";
/* Output callback that copies the text b into the buffer described by
 * client_data, which is cast to struct solr_buf_write_info.
 * Only part of the body is visible here: l is declared on a line that is
 * not shown (presumably the length of b - confirm), and the action taken
 * when the bounds check succeeds is not shown either. */
59 void solr_buf_write_handler (const char *b, void *client_data)
61 struct solr_buf_write_info *info = (struct solr_buf_write_info *)client_data;
/* bounds check: offset already negative, or offset plus l reaches max */
63 if (info->off < 0 || (info->off + l >= info->max))
/* copy l bytes of b to the current write offset */
68 memcpy (info->buf + info->off, b, l);
73 /* Utility functions end */
/* Allocates a new transform handle with xmalloc() and sets up its
 * tokenizer configuration, its work buffer w and its NMEM handle.
 * Further initialisation and the return statement are on lines that are
 * not visible here. */
75 solr_transform_t solr_transform_create(void)
77 solr_transform_t ct = (solr_transform_t) xmalloc(sizeof(*ct));
78 ct->tok_cfg = yaz_tok_cfg_create();
79 ct->w = wrbuf_alloc();
83 ct->nmem = nmem_create();
/* Parses the attribute specifications that follow a pattern on one
 * property line and, when parsing succeeded, stores a new entry for
 * pattern in the entry list of ct.
 *
 * Each specification has the form attset type=value or type=value (see
 * the comment inside the loop). The loop runs while the current token is a
 * string and fewer than 20 elements have been collected in ae. The text of
 * the specifications is also accumulated in ct->w and a copy of it becomes
 * the value of the new entry.
 *
 * ret is 0 for success and non-zero for failure; the return statement
 * itself, the remaining parameters and several conditions are on lines
 * that are not visible here, so this description covers only the visible
 * statements. */
87 static int solr_transform_parse_tok_line(solr_transform_t ct,
92 Z_AttributeElement *ae[20];
93 int ret = 0; /* 0=OK, != 0 FAIL */
97 while (t == YAZ_TOK_STRING && ae_num < 20)
99 WRBUF type_str = wrbuf_alloc();
101 Z_AttributeElement *elem = 0;
102 const char *value_str = 0;
103 /* attset type=value OR type=value */
/* elements are allocated from the NMEM handle of ct */
105 elem = (Z_AttributeElement *) nmem_malloc(ct->nmem, sizeof(*elem));
106 elem->attributeSet = 0;
108 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
109 wrbuf_puts(type_str, yaz_tok_parse_string(tp));
110 t = yaz_tok_move(tp);
111 if (t == YAZ_TOK_EOF)
113 wrbuf_destroy(type_str);
115 wrbuf_destroy(set_str);
/* a second string token means the first one named an attribute set */
118 if (t == YAZ_TOK_STRING)
120 wrbuf_puts(ct->w, " ");
121 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
125 yaz_string_to_oid_nmem(yaz_oid_std(), CLASS_ATTSET,
126 wrbuf_cstr(set_str), ct->nmem);
128 type_str = wrbuf_alloc();
129 wrbuf_puts(type_str, yaz_tok_parse_string(tp));
130 t = yaz_tok_move(tp);
/* attributeType starts as 0 and is filled in by the sscanf() call below */
132 elem->attributeType = nmem_intdup(ct->nmem, 0);
133 if (sscanf(wrbuf_cstr(type_str), ODR_INT_PRINTF, elem->attributeType)
136 wrbuf_destroy(type_str);
138 wrbuf_destroy(set_str);
139 yaz_log(YLOG_WARN, "Expected numeric attribute type");
144 wrbuf_destroy(type_str);
146 wrbuf_destroy(set_str);
150 yaz_log(YLOG_WARN, "Expected = after after attribute type");
154 t = yaz_tok_move(tp);
155 if (t != YAZ_TOK_STRING) /* value */
157 yaz_log(YLOG_WARN, "Missing attribute value");
161 value_str = yaz_tok_parse_string(tp);
/* a value starting with a digit is stored as a numeric attribute value;
 * the statements further down build a complex value holding one string
 * (the else keyword is presumably on a line that is not shown) */
162 if (yaz_isdigit(*value_str))
164 elem->which = Z_AttributeValue_numeric;
165 elem->value.numeric =
166 nmem_intdup(ct->nmem, atoi(value_str));
170 Z_ComplexAttribute *ca = (Z_ComplexAttribute *)
171 nmem_malloc(ct->nmem, sizeof(*ca));
172 elem->which = Z_AttributeValue_complex;
173 elem->value.complex = ca;
175 ca->list = (Z_StringOrNumeric **)
176 nmem_malloc(ct->nmem, sizeof(Z_StringOrNumeric *));
177 ca->list[0] = (Z_StringOrNumeric *)
178 nmem_malloc(ct->nmem, sizeof(Z_StringOrNumeric));
179 ca->list[0]->which = Z_StringOrNumeric_string;
180 ca->list[0]->u.string = nmem_strdup(ct->nmem, value_str);
181 ca->num_semanticAction = 0;
182 ca->semanticAction = 0;
184 wrbuf_puts(ct->w, "=");
185 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
186 t = yaz_tok_move(tp);
187 wrbuf_puts(ct->w, " ");
190 if (ret == 0) /* OK? */
/* pp starts at the list head; how it is advanced before the new entry is
 * stored is on lines that are not shown */
192 struct solr_prop_entry **pp = &ct->entry;
195 *pp = (struct solr_prop_entry *) xmalloc(sizeof(**pp));
196 (*pp)->pattern = xstrdup(pattern);
197 (*pp)->value = xstrdup(wrbuf_cstr(ct->w));
199 (*pp)->attr_list.num_attributes = ae_num;
201 (*pp)->attr_list.attributes = 0;
/* the collected element pointers are copied into NMEM-owned storage */
204 (*pp)->attr_list.attributes = (Z_AttributeElement **)
205 nmem_malloc(ct->nmem,
206 ae_num * sizeof(Z_AttributeElement *));
207 memcpy((*pp)->attr_list.attributes, ae,
208 ae_num * sizeof(Z_AttributeElement *));
/* print the attribute list of the new entry to the yaz log file through a
 * print-mode ODR */
214 ODR pr = odr_createmem(ODR_PRINT);
215 Z_AttributeList *alp = &(*pp)->attr_list;
216 odr_setprint(pr, yaz_log_file());
217 z_AttributeList(pr, &alp, 0, 0);
/* Defines a single pattern on ct from a pattern name and a value string.
 * A tokenizer parser is created over value with the tokenizer
 * configuration of ct, the equals sign is registered as a single-character
 * token, and the tokens are handed to solr_transform_parse_tok_line(),
 * whose result is kept in r. The parser is destroyed afterwards.
 * The remaining parameter declaration and the return statement are on
 * lines that are not visible here. */
225 int solr_transform_define_pattern(solr_transform_t ct, const char *pattern,
229 yaz_tok_parse_t tp = yaz_tok_parse_buf(ct->tok_cfg, value);
230 yaz_tok_cfg_single_tokens(ct->tok_cfg, "=");
231 r = solr_transform_parse_tok_line(ct, pattern, tp);
232 yaz_tok_parse_destroy(tp);
/* Builds a transform handle from the open property file f.
 * The file is read line by line with fgets(). For each line whose first
 * token is a string, that token is duplicated as the pattern and the rest
 * of the line is parsed by solr_transform_parse_tok_line(). On the visible
 * failure paths the parser is destroyed and the handle is closed.
 * Return statements and several conditions are on lines that are not
 * visible here. */
236 solr_transform_t solr_transform_open_FILE(FILE *f)
238 solr_transform_t ct = solr_transform_create();
241 yaz_tok_cfg_single_tokens(ct->tok_cfg, "=");
243 while (fgets(line, sizeof(line)-1, f))
245 yaz_tok_parse_t tp = yaz_tok_parse_buf(ct->tok_cfg, line);
248 t = yaz_tok_move(tp);
249 if (t == YAZ_TOK_STRING)
/* NOTE(review): pattern is an xstrdup() copy and no matching release is
 * visible in this excerpt - confirm it is freed on every path, including
 * the two failure paths below */
251 char * pattern = xstrdup(yaz_tok_parse_string(tp));
252 t = yaz_tok_move(tp);
255 yaz_tok_parse_destroy(tp);
256 solr_transform_close(ct);
259 if (solr_transform_parse_tok_line(ct, pattern, tp))
261 yaz_tok_parse_destroy(tp);
262 solr_transform_close(ct);
/* a line that starts with anything other than a string or end of input
 * is rejected */
267 else if (t != YAZ_TOK_EOF)
269 yaz_tok_parse_destroy(tp);
270 solr_transform_close(ct);
273 yaz_tok_parse_destroy(tp);
/* Releases a transform handle. The entry list is walked with pe, saving
 * the successor in pe_next before the current entry is dealt with; then
 * the tokenizer configuration, the work buffer and the NMEM handle of ct
 * are destroyed. The per-entry statements and any further cleanup are on
 * lines that are not visible here. */
278 void solr_transform_close(solr_transform_t ct)
280 struct solr_prop_entry *pe;
286 struct solr_prop_entry *pe_next = pe->next;
293 yaz_tok_cfg_destroy(ct->tok_cfg);
294 wrbuf_destroy(ct->w);
295 nmem_destroy(ct->nmem);
/* Opens the file named fname for reading and builds a transform handle
 * from it with solr_transform_open_FILE(). Any check of the fopen()
 * result, the closing of f and the return statement are not visible
 * here. */
299 solr_transform_t solr_transform_open_fname(const char *fname)
302 FILE *f = fopen(fname, "r");
305 ct = solr_transform_open_FILE(f);
/* Fragment of a Z_AttributeElement definition together with the two
 * discriminator constants for its value.
 * NOTE(review): this looks like a reference copy of a type that the rest
 * of the file uses from the included headers; whether it is actually
 * compiled (for example excluded by a preprocessor conditional) cannot be
 * seen here - confirm. */
311 struct Z_AttributeElement {
312 Z_AttributeSetId *attributeSet; /* OPT */
317 Z_ComplexAttribute *complex;
318 #define Z_AttributeValue_numeric 1
319 #define Z_AttributeValue_complex 2
/* Compares two attribute elements by encoding each of them with its own
 * encode-mode ODR and comparing the two encoded buffers with
 * yaz_memcmp(); the comparison result is kept in ret.
 * Release of the two ODR handles and the return statement are not visible
 * here. */
324 static int compare_attr(Z_AttributeElement *a, Z_AttributeElement *b)
326 ODR odr_a = odr_createmem(ODR_ENCODE);
327 ODR odr_b = odr_createmem(ODR_ENCODE);
332 z_AttributeElement(odr_a, &a, 0, 0);
333 z_AttributeElement(odr_b, &b, 0, 0);
/* fetch the encoded bytes and their lengths */
335 buf_a = odr_getbuf(odr_a, &len_a, 0);
336 buf_b = odr_getbuf(odr_b, &len_b, 0);
338 ret = yaz_memcmp(buf_a, buf_b, len_a, len_b);
/* Reverse lookup from an attribute list to a property name.
 * Searches the entry list of ct for the first entry whose pattern starts
 * with category and whose attributes are all found among the given
 * attributes (elements are compared with compare_attr()).
 * On a match, returns the part of the pattern that follows the category
 * prefix. The result for the case that nothing matches is on a line that
 * is not visible here. */
345 const char *solr_lookup_reverse(solr_transform_t ct,
346 const char *category,
347 Z_AttributeList *attributes)
349 struct solr_prop_entry *e;
350 size_t clen = strlen(category);
351 for (e = ct->entry; e; e = e->next)
/* prefix match on the first clen characters of the pattern */
353 if (!strncmp(e->pattern, category, clen))
355 /* category matches.. See if attributes in pattern value
356 are all listed in actual attributes */
358 for (i = 0; i < e->attr_list.num_attributes; i++)
360 /* entry attribute */
361 Z_AttributeElement *e_ae = e->attr_list.attributes[i];
363 for (j = 0; j < attributes->num_attributes; j++)
365 /* actual attribute */
366 Z_AttributeElement *a_ae = attributes->attributes[j];
367 int r = compare_attr(e_ae, a_ae);
371 if (j == attributes->num_attributes)
372 break; /* i was not found at all.. try next pattern */
/* the outer loop ran to completion, so every entry attribute was found */
375 if (i == e->attr_list.num_attributes)
376 return e->pattern + clen;
/* Looks up a property by a dotted name assembled from up to three parts.
 * The parts that are used are joined with dots, each limited to at most 39
 * characters by the format, and the entry list of ct is then searched
 * with solr_strcmp() for a pattern equal to that name.
 * The declaration of the third part and of the local pattern buffer, and
 * the values returned for a hit and for a miss, are on lines that are not
 * visible here. */
382 static const char *solr_lookup_property(solr_transform_t ct,
383 const char *pat1, const char *pat2,
387 struct solr_prop_entry *e;
/* choose the name layout according to which parts are non-null */
389 if (pat1 && pat2 && pat3)
390 sprintf(pattern, "%.39s.%.39s.%.39s", pat1, pat2, pat3);
391 else if (pat1 && pat2)
392 sprintf(pattern, "%.39s.%.39s", pat1, pat2);
393 else if (pat1 && pat3)
394 sprintf(pattern, "%.39s.%.39s", pat1, pat3);
396 sprintf(pattern, "%.39s", pat1);
400 for (e = ct->entry; e; e = e->next)
402 if (!solr_strcmp(e->pattern, pattern))
/* Emits, through the callback pr, the attributes configured for a
 * category and value.
 *
 * eval is val, or default_val when val is null. The visible loop searches
 * the entry list for an entry whose pattern starts with the four
 * characters set. and whose value equals uri; the rest of that pattern
 * becomes the prefix. The property is then looked up with
 * solr_lookup_property() using category, prefix and eval. When nothing was
 * found, there is no prefix and category is relation, the relation
 * spellings compared below are retried under the names exact, eq, le and
 * ge. A lookup with a star as last part is also made; its condition is not
 * visible here.
 *
 * A value that was found is scanned for items containing an equals sign;
 * for each of them the text @attr followed by a space is written before
 * the item. If errcode is non-zero and ct has no error recorded yet,
 * error information is stored, with a copy of val as addinfo.
 *
 * The remaining parameters, several conditions and the return statements
 * are on lines that are not visible here. */
408 int solr_pr_attr_uri(solr_transform_t ct, const char *category,
409 const char *uri, const char *val, const char *default_val,
410 void (*pr)(const char *buf, void *client_data),
415 const char *eval = val ? val : default_val;
416 const char *prefix = 0;
420 struct solr_prop_entry *e;
422 for (e = ct->entry; e; e = e->next)
423 if (!memcmp(e->pattern, "set.", 4) && e->value &&
424 !strcmp(e->value, uri))
426 prefix = e->pattern+4;
429 /* must have a prefix now - if not it's an error */
435 res = solr_lookup_property(ct, category, prefix, eval);
436 /* we have some aliases for some relations unfortunately.. */
437 if (!res && !prefix && !strcmp(category, "relation"))
/* NOTE(review): val is passed to strcmp() below although the eval
 * initialiser above allows val to be null; confirm that this branch is
 * never reached with a null val */
439 if (!strcmp(val, "=="))
440 res = solr_lookup_property(ct, category, prefix, "exact");
441 if (!strcmp(val, "="))
442 res = solr_lookup_property(ct, category, prefix, "eq");
443 if (!strcmp(val, "<="))
444 res = solr_lookup_property(ct, category, prefix, "le");
445 if (!strcmp(val, ">="))
446 res = solr_lookup_property(ct, category, prefix, "ge");
449 res = solr_lookup_property(ct, category, prefix, "*");
/* cp0 and cp1 delimit one item of the value that was found */
455 const char *cp0 = res, *cp1;
456 while ((cp1 = strchr(cp0, '=')))
/* advance cp1 to the next space or to the end of the string */
459 while (*cp1 && *cp1 != ' ')
/* size check against the local buffer before the item is copied */
461 if (cp1 - cp0 >= (ptrdiff_t) sizeof(buf))
463 memcpy(buf, cp0, cp1 - cp0);
465 (*pr)("@attr ", client_data);
467 for (i = 0; buf[i]; i++)
470 (*pr)(eval, client_data);
476 (*pr)(tmp, client_data);
479 (*pr)(" ", client_data);
/* only the first error is kept */
487 if (errcode && !ct->error)
491 ct->addinfo = xstrdup(val);
/* Convenience wrapper around solr_pr_attr_uri() that passes 0 as the uri.
 * All other arguments are passed through and the result of the call is
 * returned. The remaining parameter declarations are on lines that are
 * not visible here. */
498 int solr_pr_attr(solr_transform_t ct, const char *category,
499 const char *val, const char *default_val,
500 void (*pr)(const char *buf, void *client_data),
504 return solr_pr_attr_uri(ct, category, 0 /* uri */,
505 val, default_val, pr, client_data, errcode);
/* Formats val as decimal text in a local buffer and sends it to the
 * callback pr, followed by a separate call that sends a single space. */
509 static void solr_pr_int(int val,
510 void (*pr)(const char *buf, void *client_data),
513 char buf[21]; /* enough characters to 2^64 */
514 sprintf(buf, "%d", val);
515 (*pr)(buf, client_data);
516 (*pr)(" ", client_data);
/* Emits the proximity parameters of a prox operator from its modifier
 * list mods.
 *
 * The list is walked through u.st.modifiers. The modifier names distance,
 * ordered, unordered and unit are recognised. An unsupported relation,
 * unit or modifier name records the corresponding YAZ_SRW_UNSUPP error
 * code and a copy of the offending text in ct. Defaults visible here are
 * proxrel 2 (less than or equal) and unit 2 (word); when no distance
 * modifier was seen, distance becomes 1 if unit is 2 and 0 otherwise.
 *
 * Output order: exclusion, distance, ordered, proxrel, the letter k
 * followed by a space, then unit. The values assigned in the individual
 * branches, the loop header and the return statements are on lines that
 * are not visible here. */
520 static int solr_pr_prox(solr_transform_t ct, struct solr_node *mods,
521 void (*pr)(const char *buf, void *client_data),
525 int distance; /* to be filled in later depending on unit */
526 int distance_defined = 0;
528 int proxrel = 2; /* less than or equal */
529 int unit = 2; /* word */
533 const char *name = mods->u.st.index;
534 const char *term = mods->u.st.term;
535 const char *relation = mods->u.st.relation;
537 if (!strcmp(name, "distance")) {
/* base 0 lets strtol() accept decimal, octal and hexadecimal notation;
 * the long result is stored in an int */
538 distance = strtol(term, (char**) 0, 0);
539 distance_defined = 1;
540 if (!strcmp(relation, "="))
542 else if (!strcmp(relation, ">"))
544 else if (!strcmp(relation, "<"))
546 else if (!strcmp(relation, ">="))
548 else if (!strcmp(relation, "<="))
550 else if (!strcmp(relation, "<>"))
554 ct->error = YAZ_SRW_UNSUPP_PROX_RELATION;
555 ct->addinfo = xstrdup(relation);
559 else if (!strcmp(name, "ordered"))
561 else if (!strcmp(name, "unordered"))
563 else if (!strcmp(name, "unit"))
565 if (!strcmp(term, "word"))
567 else if (!strcmp(term, "sentence"))
569 else if (!strcmp(term, "paragraph"))
571 else if (!strcmp(term, "element"))
575 ct->error = YAZ_SRW_UNSUPP_PROX_UNIT;
576 ct->addinfo = xstrdup(term);
582 ct->error = YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER;
583 ct->addinfo = xstrdup(name);
/* step to the next modifier in the chain */
586 mods = mods->u.st.modifiers;
589 if (!distance_defined)
590 distance = (unit == 2) ? 1 : 0;
592 solr_pr_int(exclusion, pr, client_data);
593 solr_pr_int(distance, pr, client_data);
594 solr_pr_int(ordered, pr, client_data);
595 solr_pr_int(proxrel, pr, client_data);
596 (*pr)("k ", client_data);
597 solr_pr_int(unit, pr, client_data);
602 /* Returns location of first wildcard character in the `length'
603 * characters starting at `term', or a null pointer if there are
604 * none -- like memchr().
/* Wildcard scan helper. A non-zero start short-circuits the test below so
 * that term[-1] is not read; the callers in emit_term() pass 1 when term
 * points at the beginning of the term and 0 when it points past an earlier
 * match. A character preceded by a backslash is not treated as a wildcard.
 * The loop over length and the return statements are on lines that are not
 * visible here.
 * NOTE(review): strchr() also matches the terminating null character of
 * its first argument, so a NUL byte inside the scanned range would count
 * as a wildcard - confirm that callers never include one. */
606 static const char *wcchar(int start, const char *term, int length)
610 if (start || term[-1] != '\\')
611 if (strchr("*?", *term))
/* Walks the modifier chain of cn, linked through u.st.modifiers, and
 * compares the index of each modifier with name using strcmp().
 * The values returned for found and not found are on lines that are not
 * visible here. */
621 /* ### checks for SOLR relation-name rather than Type-1 attribute */
622 static int has_modifier(struct solr_node *cn, const char *name) {
623 struct solr_node *mod;
624 for (mod = cn->u.st.modifiers; mod != 0; mod = mod->u.st.modifiers) {
625 if (!strcmp(mod->u.st.index, name))
/* Writes one search term, preceded by its attributes, through pr.
 *
 * Unless cn carries a regexp modifier, a non-empty term is inspected
 * twice. First for anchoring: a caret at both ends, at the start or at the
 * end selects the position attribute firstAndLast, first or last, and the
 * remaining visible call uses any. Then for wildcards located with
 * wcchar(), which select the truncation attribute both, left, right,
 * z3958 or none.
 * After that the index attribute (default serverChoice, with the index
 * URI of cn) and one relationModifier attribute per modifier of cn are
 * emitted, followed by the term itself between double quotes.
 *
 * Adjustments of term and length around the anchors, the details of the
 * z3958 rewrite and several conditions are on lines that are not visible
 * here. */
633 static void emit_term(solr_transform_t ct,
634 struct solr_node *cn,
635 const char *term, int length,
636 void (*pr)(const char *buf, void *client_data),
640 const char *ns = cn->u.st.index_uri;
641 int process_term = !has_modifier(cn, "regexp");
644 assert(cn->which == SOLR_NODE_ST);
646 if (process_term && length > 0)
648 if (length > 1 && term[0] == '^' && term[length-1] == '^')
650 solr_pr_attr(ct, "position", "firstAndLast", 0,
651 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
655 else if (term[0] == '^')
657 solr_pr_attr(ct, "position", "first", 0,
658 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
662 else if (term[length-1] == '^')
664 solr_pr_attr(ct, "position", "last", 0,
665 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
670 solr_pr_attr(ct, "position", "any", 0,
671 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
675 if (process_term && length > 0)
677 const char *first_wc = wcchar(1, term, length);
678 const char *second_wc = first_wc ?
679 wcchar(0, first_wc+1, length-(first_wc-term)-1) : 0;
681 /* Check for well-known globbing patterns that represent
682 * simple truncation attributes as expected by, for example,
683 * Bath-compliant server. If we find such a pattern but
684 * there's no mapping for it, that's fine: we just use a
685 * general pattern-matching attribute.
687 if (first_wc == term && second_wc == term + length-1
688 && *first_wc == '*' && *second_wc == '*'
689 && solr_pr_attr(ct, "truncation", "both", 0, pr, client_data, 0))
694 else if (first_wc == term && second_wc == 0 && *first_wc == '*'
695 && solr_pr_attr(ct, "truncation", "left", 0,
701 else if (first_wc == term + length-1 && second_wc == 0
703 && solr_pr_attr(ct, "truncation", "right", 0,
710 /* We have one or more wildcard characters, but not in a
711 * way that can be dealt with using only the standard
712 * left-, right- and both-truncation attributes. We need
713 * to translate the pattern into a Z39.58-type pattern,
714 * which has been supported in BIB-1 since 1996. If
715 * there's no configuration element for "truncation.z3958"
716 * we indicate this as error 28 "Masking character not
720 solr_pr_attr(ct, "truncation", "z3958", 0,
721 pr, client_data, YAZ_SRW_MASKING_CHAR_UNSUPP);
722 z3958_mem = (char *) xmalloc(length+1);
723 for (i = 0; i < length; i++)
725 if (i > 0 && term[i-1] == '\\')
726 z3958_mem[i] = term[i];
727 else if (term[i] == '*')
729 else if (term[i] == '?')
732 z3958_mem[i] = term[i];
734 z3958_mem[length] = '\0';
738 /* No masking characters. Use "truncation.none" if given. */
739 solr_pr_attr(ct, "truncation", "none", 0,
744 solr_pr_attr_uri(ct, "index", ns,
745 cn->u.st.index, "serverChoice",
746 pr, client_data, YAZ_SRW_UNSUPP_INDEX);
748 if (cn->u.st.modifiers)
750 struct solr_node *mod = cn->u.st.modifiers;
751 for (; mod; mod = mod->u.st.modifiers)
753 solr_pr_attr(ct, "relationModifier", mod->u.st.index, 0,
754 pr, client_data, YAZ_SRW_UNSUPP_RELATION_MODIFIER);
758 (*pr)("\"", client_data);
759 for (i = 0; i<length; i++)
761 /* pr(int) each character */
762 /* we do not need to deal with \-sequences because the
763 SOLR and PQF terms have same \-format, bug #1988 */
768 (*pr)(buf, client_data);
770 (*pr)("\" ", client_data);
/* Emits the term of cn followed by every term on its extra_terms chain.
 * The operator op is written as an at sign, op and a space; inside the
 * loop this happens only before a term that has a successor on the chain.
 * Every term is emitted with cn as the node argument of emit_term().
 * The condition guarding the first operator output, the declaration of op
 * and the trailing arguments of the emit_term() calls are on lines that
 * are not visible here. */
774 static void emit_terms(solr_transform_t ct,
775 struct solr_node *cn,
776 void (*pr)(const char *buf, void *client_data),
780 struct solr_node *ne = cn->u.st.extra_terms;
783 (*pr)("@", client_data);
784 (*pr)(op, client_data);
785 (*pr)(" ", client_data);
787 emit_term(ct, cn, cn->u.st.term, strlen(cn->u.st.term),
789 for (; ne; ne = ne->u.st.extra_terms)
791 if (ne->u.st.extra_terms)
793 (*pr)("@", client_data);
794 (*pr)(op, client_data);
795 (*pr)(" ", client_data);
797 emit_term(ct, cn, ne->u.st.term, strlen(ne->u.st.term),
/* Splits the term of cn at space characters and emits the words with
 * emit_term(). While scanning, a previously remembered word is emitted
 * after the operator prefix (at sign, op, space); the word remembered last
 * is emitted at the end without a prefix. The length of a word is the
 * distance to the next space, or the rest of the string when there is no
 * further space (presumably selected by a condition on cp1 - confirm).
 * Loop control and several conditions are on lines that are not visible
 * here. */
802 static void emit_wordlist(solr_transform_t ct,
803 struct solr_node *cn,
804 void (*pr)(const char *buf, void *client_data),
808 const char *cp0 = cn->u.st.term;
810 const char *last_term = 0;
816 cp1 = strchr(cp0, ' ');
819 (*pr)("@", client_data);
820 (*pr)(op, client_data);
821 (*pr)(" ", client_data);
822 emit_term(ct, cn, last_term, last_length, pr, client_data);
826 last_length = cp1 - cp0;
828 last_length = strlen(cp0);
832 emit_term(ct, cn, last_term, last_length, pr, client_data);
/* Recursive worker that writes the transformed form of the query tree at
 * cn through pr.
 *
 * Search-term nodes: when the index URI equals solr_uri() and the index
 * is resultSet, the term is written as a quoted argument after the text
 * @set. In the other visible path the always, relation and structure
 * attributes are emitted, and the term is written by emit_wordlist() with
 * and for relation all, with or for relation any, and by emit_terms()
 * with and in the remaining call.
 * Boolean nodes: the operator value is written after an at sign; a prox
 * operator gets its parameters from solr_pr_prox(), while modifiers on
 * other operators record YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER. Both operands
 * are then transformed recursively, left before right.
 * An unknown node type is reported on stderr.
 *
 * The switch structure, several conditions and the remaining parameter
 * declaration are on lines that are not visible here. */
835 void solr_transform_r(solr_transform_t ct,
836 struct solr_node *cn,
837 void (*pr)(const char *buf, void *client_data),
841 struct solr_node *mods;
848 ns = cn->u.st.index_uri;
851 /* TODO If relevant fix with solr_uri */
852 if (!strcmp(ns, solr_uri())
853 && cn->u.st.index && !solr_strcmp(cn->u.st.index, "resultSet"))
855 (*pr)("@set \"", client_data);
856 (*pr)(cn->u.st.term, client_data);
857 (*pr)("\" ", client_data);
865 ct->error = YAZ_SRW_UNSUPP_CONTEXT_SET;
869 solr_pr_attr(ct, "always", 0, 0, pr, client_data, 0);
870 solr_pr_attr(ct, "relation", cn->u.st.relation, 0, pr, client_data,
871 YAZ_SRW_UNSUPP_RELATION);
872 solr_pr_attr(ct, "structure", cn->u.st.relation, 0,
873 pr, client_data, YAZ_SRW_UNSUPP_COMBI_OF_RELATION_AND_TERM);
874 if (cn->u.st.relation && !solr_strcmp(cn->u.st.relation, "all"))
875 emit_wordlist(ct, cn, pr, client_data, "and");
876 else if (cn->u.st.relation && !solr_strcmp(cn->u.st.relation, "any"))
877 emit_wordlist(ct, cn, pr, client_data, "or");
879 emit_terms(ct, cn, pr, client_data, "and");
882 (*pr)("@", client_data);
883 (*pr)(cn->u.boolean.value, client_data);
884 (*pr)(" ", client_data);
885 mods = cn->u.boolean.modifiers;
886 if (!strcmp(cn->u.boolean.value, "prox"))
888 if (!solr_pr_prox(ct, mods, pr, client_data))
893 /* Boolean modifiers other than on proximity not supported */
894 ct->error = YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER;
895 ct->addinfo = xstrdup(mods->u.st.index);
899 solr_transform_r(ct, cn->u.boolean.left, pr, client_data);
900 solr_transform_r(ct, cn->u.boolean.right, pr, client_data);
904 fprintf(stderr, "Fatal: impossible SOLR node-type %d\n", cn->which);
/* Public entry point: transforms the query tree cn and writes the result
 * through pr by calling solr_transform_r(). In the visible lines the loop
 * over the entries of ct contains only commented-out set-prefix handling.
 * A local NMEM handle is created here; its release, the remaining
 * parameter declaration and the return statement are on lines that are
 * not visible here. */
909 int solr_transform(solr_transform_t ct, struct solr_node *cn,
910 void (*pr)(const char *buf, void *client_data),
913 struct solr_prop_entry *e;
914 NMEM nmem = nmem_create();
920 for (e = ct->entry; e ; e = e->next)
922 /* TODO remove as SOLR dont supports sets.
923 if (!solr_strncmp(e->pattern, "set.", 4))
924 solr_apply_prefix(nmem, cn, e->pattern+4, e->value);
925 else if (!solr_strcmp(e->pattern, "set"))
926 solr_apply_prefix(nmem, cn, 0, e->value);
929 solr_transform_r(ct, cn, pr, client_data);
/* Transforms cn and writes the output to the stdio stream f: calls
 * solr_transform() with cql_fputs as the output callback and f as its
 * client data, and returns the result of that call. */
935 int solr_transform_FILE(solr_transform_t ct, struct solr_node *cn, FILE *f)
937 /* We can use the cql_fputs util */
938 return solr_transform(ct, cn, cql_fputs, f);
/* Transforms cn into a caller-supplied character buffer (out and max are
 * presumably stored in info on lines that are not shown - confirm).
 * In the overflow case the error YAZ_SRW_TOO_MANY_CHARS_IN_QUERY is
 * recorded on ct with info.max, printed as a number, as addinfo; the
 * other visible path terminates the output with a null character at
 * info.off. The setup of info, the overflow condition and the return
 * statement are on lines that are not visible here.
 * NOTE(review): the callback passed is cql_buf_write_handler while info
 * is a struct solr_buf_write_info and this file defines
 * solr_buf_write_handler - confirm that this pairing is intended. */
941 int solr_transform_buf(solr_transform_t ct, struct solr_node *cn, char *out, int max)
943 struct solr_buf_write_info info;
949 r = solr_transform(ct, cn, cql_buf_write_handler, &info);
951 /* Attempt to write past end of buffer. For some reason, this
952 SRW diagnostic is deprecated, but it's so perfect for our
953 purposes that it would be stupid not to use it. */
955 ct->error = YAZ_SRW_TOO_MANY_CHARS_IN_QUERY;
956 sprintf(numbuf, "%ld", (long) info.max);
957 ct->addinfo = xstrdup(numbuf);
961 info.buf[info.off] = '\0';
/* Reports the error state of ct: stores the addinfo pointer of ct in the
 * location addinfo points to. The return statement is not visible here
 * (presumably the recorded error code - confirm). */
965 int solr_transform_error(solr_transform_t ct, const char **addinfo)
967 *addinfo = ct->addinfo;
/* Records error information on ct. A non-null addinfo is duplicated with
 * xstrdup(); a null addinfo is stored as 0. The statement that stores the
 * error code itself is not visible here. */
971 void solr_transform_set_error(solr_transform_t ct, int error, const char *addinfo)
974 ct->addinfo = addinfo ? xstrdup(addinfo) : 0;
981 * c-file-style: "Stroustrup"
982 * indent-tabs-mode: nil
984 * vim: shiftwidth=4 tabstop=8 expandtab