1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2013 Index Data
3 * See the file LICENSE for details.
6 * \file solrtransform.c
7 * \brief Implements SOLR transform (SOLR to RPN conversion).
16 #include <yaz/rpn2solr.h>
17 #include <yaz/xmalloc.h>
18 #include <yaz/diagsrw.h>
19 #include <yaz/tokenizer.h>
20 #include <yaz/wrbuf.h>
21 #include <yaz/z-core.h>
22 #include <yaz/matchstr.h>
23 #include <yaz/oid_db.h>
/* One configuration entry of a transform handle, kept in a singly linked
 * list through next. Code further down in this file also uses members
 * named pattern and value on this struct; their declarations are outside
 * the visible lines. attr_list holds the parsed attribute elements. */
27 struct solr_prop_entry {
30 Z_AttributeList attr_list;
31 struct solr_prop_entry *next;
/* State behind a solr_transform_t handle. Visible members: the head of the
 * configuration entry list and the tokenizer configuration. Other code in
 * this file also uses members w, nmem, error and addinfo; their
 * declarations are outside the visible lines. */
34 struct solr_transform_t_ {
35 struct solr_prop_entry *entry;
36 yaz_tok_cfg_t tok_cfg;
44 /* TODO Utility functions, split out into separate file */
/* Compare two strings for the SOLR code by forwarding both arguments
 * unchanged to cql_strcmp and returning its result. */
45 int solr_strcmp(const char *s1, const char *s2) {
46 return cql_strcmp(s1, s2);
/* Length-limited variant of solr_strcmp: forwards s1, s2 and n unchanged
 * to cql_strncmp and returns its result. */
49 int solr_strncmp(const char *s1, const char *s2, size_t n) {
50 return cql_strncmp(s1, s2, n);
/* Return the URI string used for the SOLR context set. At present this is
 * only the placeholder literal below, not a real URI. */
54 const char *solr_uri(void)
56 return "TODO:SOLR URI";
/* Output callback that copies the string b into the buffer described by
 * client_data, which is cast to struct solr_buf_write_info (members used
 * here: buf, off and max). */
59 void solr_buf_write_handler (const char *b, void *client_data)
61 struct solr_buf_write_info *info = (struct solr_buf_write_info *)client_data;
/* Refuse the write when the offset is already negative or when off + l
 * would reach max. The action taken on that path is outside the visible
 * lines. l is declared outside the visible lines as well - presumably the
 * length of b; TODO confirm. */
63 if (info->off < 0 || (info->off + l >= info->max))
68 memcpy (info->buf + info->off, b, l);
73 /* Utility functions end */
/* Allocate a new transform handle with xmalloc and create the helper
 * objects stored in it: a tokenizer configuration, a WRBUF and an NMEM.
 * Initialisation of the remaining members and the return statement are
 * outside the visible lines. */
75 solr_transform_t solr_transform_create(void)
77 solr_transform_t ct = (solr_transform_t) xmalloc(sizeof(*ct));
78 ct->tok_cfg = yaz_tok_cfg_create();
79 ct->w = wrbuf_alloc();
83 ct->nmem = nmem_create();
/* Parse the attribute specifications that follow a pattern on one
 * configuration line and append the result to the entry list of ct.
 *
 * The tokens come from the tokenizer tp. Each specification has the form
 * attset type=value or type=value (see the comment in the loop). For every
 * specification a Z_AttributeElement is allocated from ct->nmem, and the
 * textual form is accumulated in ct->w so it can be stored as the entry
 * value.
 *
 * ret is 0 on success and non-zero on failure; the parameter list after ct
 * and the return statement are outside the visible lines.
 */
87 static int solr_transform_parse_tok_line(solr_transform_t ct,
92 Z_AttributeElement *ae[20];
93 int ret = 0; /* 0=OK, != 0 FAIL */
/* The loop stops after 20 elements because ae has only 20 slots. */
97 while (t == YAZ_TOK_STRING && ae_num < 20)
99 WRBUF type_str = wrbuf_alloc();
101 Z_AttributeElement *elem = 0;
102 const char *value_str = 0;
103 /* attset type=value OR type=value */
105 elem = (Z_AttributeElement *) nmem_malloc(ct->nmem, sizeof(*elem));
106 elem->attributeSet = 0;
/* The first token is copied both to the entry text and to type_str. */
108 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
109 wrbuf_puts(type_str, yaz_tok_parse_string(tp));
110 t = yaz_tok_move(tp);
111 if (t == YAZ_TOK_EOF)
113 wrbuf_destroy(type_str);
115 wrbuf_destroy(set_str);
/* A second string token means the first one named an attribute set: the
 * set name is resolved to an OID and the new token becomes the type. */
118 if (t == YAZ_TOK_STRING)
120 wrbuf_puts(ct->w, " ");
121 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
125 yaz_string_to_oid_nmem(yaz_oid_std(), CLASS_ATTSET,
126 wrbuf_cstr(set_str), ct->nmem);
/* type_str gets a fresh buffer here; the previous one presumably lives on
 * as set_str (that assignment is outside the visible lines - TODO
 * confirm, otherwise the first buffer would leak). */
128 type_str = wrbuf_alloc();
129 wrbuf_puts(type_str, yaz_tok_parse_string(tp));
130 t = yaz_tok_move(tp);
132 elem->attributeType = nmem_intdup(ct->nmem, 0);
/* NOTE(review): sscanf is given the printf-style macro ODR_INT_PRINTF;
 * confirm the macro is also a valid scanf conversion for the integer type
 * that attributeType points to. */
133 if (sscanf(wrbuf_cstr(type_str), ODR_INT_PRINTF, elem->attributeType)
136 wrbuf_destroy(type_str);
138 wrbuf_destroy(set_str);
139 yaz_log(YLOG_WARN, "Expected numeric attribute type");
144 wrbuf_destroy(type_str);
146 wrbuf_destroy(set_str);
/* NOTE(review): the message below contains the word after twice. */
150 yaz_log(YLOG_WARN, "Expected = after after attribute type");
154 t = yaz_tok_move(tp);
155 if (t != YAZ_TOK_STRING) /* value */
157 yaz_log(YLOG_WARN, "Missing attribute value");
161 value_str = yaz_tok_parse_string(tp);
/* A value that starts with a digit is stored as a numeric attribute value
 * (converted with atoi, which does no error checking). */
162 if (yaz_isdigit(*value_str))
164 elem->which = Z_AttributeValue_numeric;
165 elem->value.numeric =
166 nmem_intdup(ct->nmem, atoi(value_str));
/* The code below stores the value as a complex attribute holding a single
 * string and no semantic actions. */
170 Z_ComplexAttribute *ca = (Z_ComplexAttribute *)
171 nmem_malloc(ct->nmem, sizeof(*ca));
172 elem->which = Z_AttributeValue_complex;
173 elem->value.complex = ca;
175 ca->list = (Z_StringOrNumeric **)
176 nmem_malloc(ct->nmem, sizeof(Z_StringOrNumeric *));
177 ca->list[0] = (Z_StringOrNumeric *)
178 nmem_malloc(ct->nmem, sizeof(Z_StringOrNumeric));
179 ca->list[0]->which = Z_StringOrNumeric_string;
180 ca->list[0]->u.string = nmem_strdup(ct->nmem, value_str);
181 ca->num_semanticAction = 0;
182 ca->semanticAction = 0;
/* Append =value and a separating space to the entry text, then move on to
 * the next token. */
184 wrbuf_puts(ct->w, "=");
185 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
186 t = yaz_tok_move(tp);
187 wrbuf_puts(ct->w, " ");
190 if (ret == 0) /* OK? */
/* pp starts at the head of the entry list; the new entry receives private
 * copies (xstrdup) of the pattern and of the accumulated text. */
192 struct solr_prop_entry **pp = &ct->entry;
195 *pp = (struct solr_prop_entry *) xmalloc(sizeof(**pp));
196 (*pp)->pattern = xstrdup(pattern);
197 (*pp)->value = xstrdup(wrbuf_cstr(ct->w));
199 (*pp)->attr_list.num_attributes = ae_num;
201 (*pp)->attr_list.attributes = 0;
/* The collected element pointers are copied from the local array into an
 * array allocated from ct->nmem. */
204 (*pp)->attr_list.attributes = (Z_AttributeElement **)
205 nmem_malloc(ct->nmem,
206 ae_num * sizeof(Z_AttributeElement *));
207 memcpy((*pp)->attr_list.attributes, ae,
208 ae_num * sizeof(Z_AttributeElement *));
/* Run the attribute list through the z_AttributeList codec on an ODR_PRINT
 * stream whose output is directed to yaz_log_file(). The condition around
 * this and the release of pr are outside the visible lines. */
214 ODR pr = odr_createmem(ODR_PRINT);
215 Z_AttributeList *alp = &(*pp)->attr_list;
216 odr_setprint(pr, yaz_log_file());
217 z_AttributeList(pr, &alp, 0, 0);
/* Define one pattern/value mapping on ct: the value text is tokenized and
 * the token stream is handed, together with pattern, to
 * solr_transform_parse_tok_line. The parser object is destroyed again
 * afterwards. r receives the result of the parse; the remaining parameters
 * and the return statement are outside the visible lines. */
225 int solr_transform_define_pattern(solr_transform_t ct, const char *pattern,
/* NOTE(review): the parser is created from ct->tok_cfg one statement
 * before = is registered as a single token on that same configuration;
 * confirm that the parser still sees the later change. */
229 yaz_tok_parse_t tp = yaz_tok_parse_buf(ct->tok_cfg, value);
230 yaz_tok_cfg_single_tokens(ct->tok_cfg, "=");
231 r = solr_transform_parse_tok_line(ct, pattern, tp);
232 yaz_tok_parse_destroy(tp);
/* Build a transform handle from the open configuration stream f.
 *
 * A fresh handle is created and = is registered as a single token. Each
 * line read with fgets is tokenized; when its first token is a string,
 * that token is duplicated as the pattern and the rest of the line is
 * passed to solr_transform_parse_tok_line. On the error paths visible here
 * the parser is destroyed and the partly built handle is closed with
 * solr_transform_close. The return statements are outside the visible
 * lines.
 */
236 solr_transform_t solr_transform_open_FILE(FILE *f)
238 solr_transform_t ct = solr_transform_create();
241 yaz_tok_cfg_single_tokens(ct->tok_cfg, "=");
/* NOTE(review): fgets already leaves room for the terminating NUL, so the
 * -1 only shortens the usable line length by one byte. */
243 while (fgets(line, sizeof(line)-1, f))
245 yaz_tok_parse_t tp = yaz_tok_parse_buf(ct->tok_cfg, line);
248 t = yaz_tok_move(tp);
249 if (t == YAZ_TOK_STRING)
/* pattern is an xstrdup copy; its release is outside the visible lines -
 * TODO confirm it is freed on every path. */
251 char * pattern = xstrdup(yaz_tok_parse_string(tp));
252 t = yaz_tok_move(tp);
255 yaz_tok_parse_destroy(tp);
256 solr_transform_close(ct);
/* A non-zero result from the line parser is treated as a failure. */
259 if (solr_transform_parse_tok_line(ct, pattern, tp))
261 yaz_tok_parse_destroy(tp);
262 solr_transform_close(ct);
/* A first token that is neither a string nor end-of-input also leads to
 * the parser and the handle being torn down. */
267 else if (t != YAZ_TOK_EOF)
269 yaz_tok_parse_destroy(tp);
270 solr_transform_close(ct);
273 yaz_tok_parse_destroy(tp);
/* Release a transform handle. The entry list is walked (the next pointer
 * is saved in pe_next before each step), then the tokenizer configuration,
 * the WRBUF and the NMEM of ct are destroyed. Freeing of the individual
 * entries and of ct itself is outside the visible lines. */
278 void solr_transform_close(solr_transform_t ct)
280 struct solr_prop_entry *pe;
286 struct solr_prop_entry *pe_next = pe->next;
293 yaz_tok_cfg_destroy(ct->tok_cfg);
294 wrbuf_destroy(ct->w);
295 nmem_destroy(ct->nmem);
/* Open the file fname for reading and build a transform handle from it
 * with solr_transform_open_FILE. Handling of a failed fopen, closing of f
 * and the return statement are outside the visible lines. */
299 solr_transform_t solr_transform_open_fname(const char *fname)
302 FILE *f = fopen(fname, "r");
305 ct = solr_transform_open_FILE(f);
/* NOTE(review): these lines look like a reference copy of the
 * Z_AttributeElement layout (the type is already used earlier in this file
 * without a local definition) - confirm that they are excluded from
 * compilation. */
311 struct Z_AttributeElement {
312 Z_AttributeSetId *attributeSet; /* OPT */
317 Z_ComplexAttribute *complex;
318 #define Z_AttributeValue_numeric 1
319 #define Z_AttributeValue_complex 2
/* Compare two attribute elements by value. Each element is run through the
 * z_AttributeElement codec on its own ODR_ENCODE stream, and the two
 * resulting buffers are compared with yaz_memcmp. The comparison result is
 * stored in ret; release of the two ODR streams and the return statement
 * are outside the visible lines. */
324 static int compare_attr(Z_AttributeElement *a, Z_AttributeElement *b)
326 ODR odr_a = odr_createmem(ODR_ENCODE);
327 ODR odr_b = odr_createmem(ODR_ENCODE);
332 z_AttributeElement(odr_a, &a, 0, 0);
333 z_AttributeElement(odr_b, &b, 0, 0);
335 buf_a = odr_getbuf(odr_a, &len_a, 0);
336 buf_b = odr_getbuf(odr_b, &len_b, 0);
338 ret = yaz_memcmp(buf_a, buf_b, len_a, len_b);
/* Reverse lookup: find a configuration entry whose pattern starts with
 * category and whose attributes are all present in the given attribute
 * list. For such an entry the function returns the part of the pattern
 * that follows the category prefix (e->pattern + clen). The value returned
 * when no entry qualifies is outside the visible lines. */
345 const char *solr_lookup_reverse(solr_transform_t ct,
346 const char *category,
347 Z_AttributeList *attributes)
349 struct solr_prop_entry *e;
350 size_t clen = strlen(category);
351 for (e = ct->entry; e; e = e->next)
/* Prefix match: only the first clen characters are compared. */
353 if (!strncmp(e->pattern, category, clen))
355 /* category matches.. See if attributes in pattern value
356 are all listed in actual attributes */
358 for (i = 0; i < e->attr_list.num_attributes; i++)
360 /* entry attribute */
361 Z_AttributeElement *e_ae = e->attr_list.attributes[i];
363 for (j = 0; j < attributes->num_attributes; j++)
365 /* actual attribute */
366 Z_AttributeElement *a_ae = attributes->attributes[j];
/* The use of r is outside the visible lines - presumably the inner loop
 * is left when the two elements compare equal; TODO confirm. */
367 int r = compare_attr(e_ae, a_ae);
371 if (j == attributes->num_attributes)
372 break; /* i was not found at all.. try next pattern */
/* The outer loop ran to completion, so every entry attribute was found. */
375 if (i == e->attr_list.num_attributes)
376 return e->pattern + clen;
/* Look up a configuration entry by a dotted key built from pat1 and the
 * optional parts pat2 and pat3 (pat1.pat2.pat3, pat1.pat2, pat1.pat3 or
 * pat1 alone). Each part is limited to 39 characters by the format
 * precision. The key is compared with every entry pattern using
 * solr_strcmp. The declaration of the pattern buffer, the last parameter
 * and the return statements are outside the visible lines. */
382 static const char *solr_lookup_property(solr_transform_t ct,
383 const char *pat1, const char *pat2,
387 struct solr_prop_entry *e;
389 if (pat1 && pat2 && pat3)
390 sprintf(pattern, "%.39s.%.39s.%.39s", pat1, pat2, pat3);
391 else if (pat1 && pat2)
392 sprintf(pattern, "%.39s.%.39s", pat1, pat2);
393 else if (pat1 && pat3)
394 sprintf(pattern, "%.39s.%.39s", pat1, pat3);
396 sprintf(pattern, "%.39s", pat1);
400 for (e = ct->entry; e; e = e->next)
402 if (!solr_strcmp(e->pattern, pattern))
/* Emit the attribute specification configured for a category and value.
 *
 * ct           transform handle holding the configuration entries
 * category     first part of the lookup key
 * uri          context-set URI; it is compared with the value of the set.*
 *              entries to find a prefix (the guard around that search is
 *              outside the visible lines; solr_pr_attr passes 0 here)
 * val          value to look up
 * default_val  used in place of val when val is null
 * pr           output callback, invoked with client_data
 * errcode      tested at the end of the function: when it is non-zero and
 *              ct has no error yet, error information is recorded on ct
 *
 * The final parameters and the return statements are outside the visible
 * lines.
 */
408 int solr_pr_attr_uri(solr_transform_t ct, const char *category,
409 const char *uri, const char *val, const char *default_val,
410 void (*pr)(const char *buf, void *client_data),
415 const char *eval = val ? val : default_val;
416 const char *prefix = 0;
420 struct solr_prop_entry *e;
/* An entry named set.NAME whose value equals uri yields NAME as prefix. */
422 for (e = ct->entry; e; e = e->next)
423 if (!memcmp(e->pattern, "set.", 4) && e->value &&
424 !strcmp(e->value, uri))
426 prefix = e->pattern+4;
429 /* must have a prefix now - if not it's an error */
435 res = solr_lookup_property(ct, category, prefix, eval);
436 /* we have some aliases for some relations unfortunately.. */
/* NOTE(review): the alias tests below call strcmp on val, not on eval.
 * val can be null (eval exists for that case) - confirm that this branch
 * is never reached with a null val. */
437 if (!res && !prefix && !strcmp(category, "relation"))
439 if (!strcmp(val, "=="))
440 res = solr_lookup_property(ct, category, prefix, "exact");
441 if (!strcmp(val, "="))
442 res = solr_lookup_property(ct, category, prefix, "eq");
443 if (!strcmp(val, "<="))
444 res = solr_lookup_property(ct, category, prefix, "le");
445 if (!strcmp(val, ">="))
446 res = solr_lookup_property(ct, category, prefix, "ge");
/* Wildcard lookup with the value *; the condition guarding it is outside
 * the visible lines. */
449 res = solr_lookup_property(ct, category, prefix, "*");
/* res is scanned for = characters. For each one, cp1 is advanced to the
 * next space or the end of the string, the span from cp0 to cp1 is copied
 * to buf (only if it fits) and output follows an @attr prefix. */
455 const char *cp0 = res, *cp1;
456 while ((cp1 = strchr(cp0, '=')))
459 while (*cp1 && *cp1 != ' ')
461 if (cp1 - cp0 >= (ptrdiff_t) sizeof(buf))
463 memcpy(buf, cp0, cp1 - cp0);
465 (*pr)("@attr ", client_data);
467 for (i = 0; buf[i]; i++)
470 (*pr)(eval, client_data);
476 (*pr)(tmp, client_data);
479 (*pr)(" ", client_data);
487 if (errcode && !ct->error)
491 ct->addinfo = xstrdup(val);
/* Convenience wrapper around solr_pr_attr_uri that passes 0 as the URI and
 * forwards every other argument unchanged. */
498 int solr_pr_attr(solr_transform_t ct, const char *category,
499 const char *val, const char *default_val,
500 void (*pr)(const char *buf, void *client_data),
504 return solr_pr_attr_uri(ct, category, 0 /* uri */,
505 val, default_val, pr, client_data, errcode);
/* Write val as a decimal number, followed by a single space, through the
 * output callback pr. */
509 static void solr_pr_int(int val,
510 void (*pr)(const char *buf, void *client_data),
513 char buf[21]; /* enough characters to 2^64 */
514 sprintf(buf, "%d", val);
515 (*pr)(buf, client_data);
516 (*pr)(" ", client_data);
/* Translate the proximity modifiers in the chain mods into the argument
 * sequence written through pr: exclusion, distance, ordered, relation, the
 * literal k and the unit, each followed by a space.
 *
 * Recognised modifier names are distance, ordered, unordered and unit.
 * An unsupported distance relation, an unsupported unit or an unknown
 * modifier name records a YAZ_SRW_UNSUPP_* error and a copy of the
 * offending text on ct.
 *
 * The return statements are outside the visible lines; the caller tests
 * the result with the ! operator.
 */
520 static int solr_pr_prox(solr_transform_t ct, struct solr_node *mods,
521 void (*pr)(const char *buf, void *client_data),
527 int proxrel = 2; /* less than or equal */
528 int unit = 2; /* word */
532 const char *name = mods->u.st.index;
533 const char *term = mods->u.st.term;
534 const char *relation = mods->u.st.relation;
536 if (!strcmp(name, "distance")) {
/* Base 0 lets strtol accept decimal, octal and hexadecimal notation. */
537 distance = strtol(term, (char**) 0, 0);
538 if (!strcmp(relation, "="))
540 else if (!strcmp(relation, ">"))
542 else if (!strcmp(relation, "<"))
544 else if (!strcmp(relation, ">="))
546 else if (!strcmp(relation, "<="))
548 else if (!strcmp(relation, "<>"))
552 ct->error = YAZ_SRW_UNSUPP_PROX_RELATION;
553 ct->addinfo = xstrdup(relation);
557 else if (!strcmp(name, "ordered"))
559 else if (!strcmp(name, "unordered"))
561 else if (!strcmp(name, "unit"))
563 if (!strcmp(term, "word"))
565 else if (!strcmp(term, "sentence"))
567 else if (!strcmp(term, "paragraph"))
569 else if (!strcmp(term, "element"))
573 ct->error = YAZ_SRW_UNSUPP_PROX_UNIT;
574 ct->addinfo = xstrdup(term);
580 ct->error = YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER;
581 ct->addinfo = xstrdup(name);
/* Advance to the next modifier in the chain. */
584 mods = mods->u.st.modifiers;
/* Default distance: 1 when the unit is word (2), otherwise 0. The condition
 * guarding this default is outside the visible lines. */
588 distance = (unit == 2) ? 1 : 0;
590 solr_pr_int(exclusion, pr, client_data);
591 solr_pr_int(distance, pr, client_data);
592 solr_pr_int(ordered, pr, client_data);
593 solr_pr_int(proxrel, pr, client_data);
594 (*pr)("k ", client_data);
595 solr_pr_int(unit, pr, client_data);
600 /* Returns location of first wildcard character in the `length'
601 * characters starting at `term', or a null pointer if there are
602 * none -- like memchr().
/* Scan for an unescaped wildcard. A character is tested against the
 * wildcard set (asterisk, question mark) only when start is non-zero or
 * the preceding character is not a backslash, so term[-1] is not read
 * while start is set. The loop around these lines and the return
 * statements are outside the visible lines.
 * NOTE(review): strchr also matches the terminating NUL of its first
 * argument, so a NUL byte inside the scanned range would count as a
 * wildcard - confirm callers never pass a length beyond the string. */
604 static const char *wcchar(int start, const char *term, int length)
608 if (start || term[-1] != '\\')
609 if (strchr("*?", *term))
/* Walk the modifier chain of cn and compare the index name of each
 * modifier with name using strcmp. The values returned for found and not
 * found are outside the visible lines; emit_term uses the negated result
 * as a flag. */
619 /* ### checks for SOLR relation-name rather than Type-1 attribute */
620 static int has_modifier(struct solr_node *cn, const char *name) {
621 struct solr_node *mod;
622 for (mod = cn->u.st.modifiers; mod != 0; mod = mod->u.st.modifiers) {
623 if (!strcmp(mod->u.st.index, name))
/* Emit one search term through pr: position, truncation, index and
 * relation-modifier attributes first, then the term text enclosed in
 * double quotes.
 *
 * ct      transform handle (configuration and error state)
 * cn      search-term node supplying index, index_uri and modifiers
 * term    start of the term text
 * length  number of characters of term to use
 * pr      output callback, invoked with client_data
 *
 * Anchor (^) and wildcard analysis is done only when the term is not empty
 * and cn carries no regexp modifier (process_term).
 */
631 static void emit_term(solr_transform_t ct,
632 struct solr_node *cn,
633 const char *term, int length,
634 void (*pr)(const char *buf, void *client_data),
638 const char *ns = cn->u.st.index_uri;
639 int process_term = !has_modifier(cn, "regexp");
642 assert(cn->which == SOLR_NODE_ST);
644 if (process_term && length > 0)
646 if (length > 1 && term[0] == '^' && term[length-1] == '^')
648 solr_pr_attr(ct, "position", "firstAndLast", 0,
649 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
653 else if (term[0] == '^')
655 solr_pr_attr(ct, "position", "first", 0,
656 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
660 else if (term[length-1] == '^')
662 solr_pr_attr(ct, "position", "last", 0,
663 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
668 solr_pr_attr(ct, "position", "any", 0,
669 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
673 if (process_term && length > 0)
675 const char *first_wc = wcchar(1, term, length);
676 const char *second_wc = first_wc ?
677 wcchar(0, first_wc+1, length-(first_wc-term)-1) : 0;
679 /* Check for well-known globbing patterns that represent
680 * simple truncation attributes as expected by, for example,
681 * Bath-compliant server. If we find such a pattern but
682 * there's no mapping for it, that's fine: we just use a
683 * general pattern-matching attribute.
685 if (first_wc == term && second_wc == term + length-1
686 && *first_wc == '*' && *second_wc == '*'
687 && solr_pr_attr(ct, "truncation", "both", 0, pr, client_data, 0))
692 else if (first_wc == term && second_wc == 0 && *first_wc == '*'
693 && solr_pr_attr(ct, "truncation", "left", 0,
699 else if (first_wc == term + length-1 && second_wc == 0
701 && solr_pr_attr(ct, "truncation", "right", 0,
708 /* We have one or more wildcard characters, but not in a
709 * way that can be dealt with using only the standard
710 * left-, right- and both-truncation attributes. We need
711 * to translate the pattern into a Z39.58-type pattern,
712 * which has been supported in BIB-1 since 1996. If
713 * there's no configuration element for "truncation.z3958"
714 * we indicate this as error 28 "Masking character not
718 solr_pr_attr(ct, "truncation", "z3958", 0,
719 pr, client_data, YAZ_SRW_MASKING_CHAR_UNSUPP);
720 z3958_mem = (char *) xmalloc(length+1);
721 for (i = 0; i < length; i++)
723 if (i > 0 && term[i-1] == '\\')
724 z3958_mem[i] = term[i];
725 else if (term[i] == '*')
727 else if (term[i] == '?')
730 z3958_mem[i] = term[i];
732 z3958_mem[length] = '\0';
736 /* No masking characters. Use "truncation.none" if given. */
737 solr_pr_attr(ct, "truncation", "none", 0,
742 solr_pr_attr_uri(ct, "index", ns,
743 cn->u.st.index, "serverChoice",
744 pr, client_data, YAZ_SRW_UNSUPP_INDEX);
/* Each modifier on the node is looked up as a relationModifier. */
746 if (cn->u.st.modifiers)
748 struct solr_node *mod = cn->u.st.modifiers;
749 for (; mod; mod = mod->u.st.modifiers)
751 solr_pr_attr(ct, "relationModifier", mod->u.st.index, 0,
752 pr, client_data, YAZ_SRW_UNSUPP_RELATION_MODIFIER);
/* The term is written between an opening quote and a closing quote plus
 * space. */
756 (*pr)("\"", client_data);
757 for (i = 0; i<length; i++)
759 /* pr(int) each character */
760 /* we do not need to deal with \-sequences because the
761 SOLR and PQF terms have same \-format, bug #1988 */
766 (*pr)(buf, client_data);
768 (*pr)("\" ", client_data);
/* Emit the term of cn followed by the term of every node on its
 * extra_terms chain; each term is emitted with cn as the attribute source.
 * An operator prefix (@, op, space) is written ahead of terms that have a
 * further term following them. The condition guarding the first prefix and
 * the final parameters are outside the visible lines. */
772 static void emit_terms(solr_transform_t ct,
773 struct solr_node *cn,
774 void (*pr)(const char *buf, void *client_data),
778 struct solr_node *ne = cn->u.st.extra_terms;
781 (*pr)("@", client_data);
782 (*pr)(op, client_data);
783 (*pr)(" ", client_data);
785 emit_term(ct, cn, cn->u.st.term, strlen(cn->u.st.term),
787 for (; ne; ne = ne->u.st.extra_terms)
/* Another prefix is needed only while more extra terms follow this one. */
789 if (ne->u.st.extra_terms)
791 (*pr)("@", client_data);
792 (*pr)(op, client_data);
793 (*pr)(" ", client_data);
795 emit_term(ct, cn, ne->u.st.term, strlen(ne->u.st.term),
/* Split the term of cn at space characters and emit the words as separate
 * terms combined with the operator op (written as @, op, space).
 * last_term and last_length describe a word that has been found but not
 * yet emitted; it is emitted after an operator prefix inside the loop, and
 * the final word is emitted without a prefix after the loop. The loop
 * header and the conditions inside it are outside the visible lines. */
800 static void emit_wordlist(solr_transform_t ct,
801 struct solr_node *cn,
802 void (*pr)(const char *buf, void *client_data),
806 const char *cp0 = cn->u.st.term;
808 const char *last_term = 0;
814 cp1 = strchr(cp0, ' ');
817 (*pr)("@", client_data);
818 (*pr)(op, client_data);
819 (*pr)(" ", client_data);
820 emit_term(ct, cn, last_term, last_length, pr, client_data);
/* Word length: up to the space when one was found, else the rest of the
 * string. */
824 last_length = cp1 - cp0;
826 last_length = strlen(cp0);
830 emit_term(ct, cn, last_term, last_length, pr, client_data);
/* Recursively convert the node cn, writing the result through pr.
 *
 * Cases visible in this excerpt:
 *  - search-term node: an index named resultSet in the solr_uri() context
 *    set is written as @set with the quoted term; otherwise the always,
 *    relation and structure attributes are emitted and the term is written
 *    as a word list (relation all uses and, relation any uses or) or as
 *    plain terms combined with and.
 *  - boolean node: @ and the operator name are written, prox modifiers are
 *    handled by solr_pr_prox, other modifiers are reported as unsupported,
 *    then the left and right operands are converted recursively.
 *  - any other node type is reported on stderr.
 *
 * The switch skeleton and the conditions around several of these
 * statements are outside the visible lines.
 */
833 void solr_transform_r(solr_transform_t ct,
834 struct solr_node *cn,
835 void (*pr)(const char *buf, void *client_data),
839 struct solr_node *mods;
846 ns = cn->u.st.index_uri;
849 /* TODO If relevant fix with solr_uri */
850 if (!strcmp(ns, solr_uri())
851 && cn->u.st.index && !solr_strcmp(cn->u.st.index, "resultSet"))
853 (*pr)("@set \"", client_data);
854 (*pr)(cn->u.st.term, client_data);
855 (*pr)("\" ", client_data);
863 ct->error = YAZ_SRW_UNSUPP_CONTEXT_SET;
/* The always lookup passes no value and no error code. */
867 solr_pr_attr(ct, "always", 0, 0, pr, client_data, 0);
868 solr_pr_attr(ct, "relation", cn->u.st.relation, 0, pr, client_data,
869 YAZ_SRW_UNSUPP_RELATION);
870 solr_pr_attr(ct, "structure", cn->u.st.relation, 0,
871 pr, client_data, YAZ_SRW_UNSUPP_COMBI_OF_RELATION_AND_TERM);
872 if (cn->u.st.relation && !solr_strcmp(cn->u.st.relation, "all"))
873 emit_wordlist(ct, cn, pr, client_data, "and");
874 else if (cn->u.st.relation && !solr_strcmp(cn->u.st.relation, "any"))
875 emit_wordlist(ct, cn, pr, client_data, "or");
877 emit_terms(ct, cn, pr, client_data, "and");
880 (*pr)("@", client_data);
881 (*pr)(cn->u.boolean.value, client_data);
882 (*pr)(" ", client_data);
883 mods = cn->u.boolean.modifiers;
884 if (!strcmp(cn->u.boolean.value, "prox"))
886 if (!solr_pr_prox(ct, mods, pr, client_data))
891 /* Boolean modifiers other than on proximity not supported */
892 ct->error = YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER;
893 ct->addinfo = xstrdup(mods->u.st.index);
897 solr_transform_r(ct, cn->u.boolean.left, pr, client_data);
898 solr_transform_r(ct, cn->u.boolean.right, pr, client_data);
902 fprintf(stderr, "Fatal: impossible SOLR node-type %d\n", cn->which);
/* Entry point of the conversion: runs solr_transform_r on the tree cn with
 * the output callback pr and client_data. An NMEM is created at the start,
 * and the loop over the configuration entries appears to contain only
 * prefix handling that has been commented out (see the TODO inside).
 * Release of the NMEM, the last parameter and the return statement are
 * outside the visible lines. */
907 int solr_transform(solr_transform_t ct, struct solr_node *cn,
908 void (*pr)(const char *buf, void *client_data),
911 struct solr_prop_entry *e;
912 NMEM nmem = nmem_create();
918 for (e = ct->entry; e ; e = e->next)
920 /* TODO remove as SOLR dont supports sets.
921 if (!solr_strncmp(e->pattern, "set.", 4))
922 solr_apply_prefix(nmem, cn, e->pattern+4, e->value);
923 else if (!solr_strcmp(e->pattern, "set"))
924 solr_apply_prefix(nmem, cn, 0, e->value);
927 solr_transform_r(ct, cn, pr, client_data);
/* Convert cn and write the result to the stdio stream f, using cql_fputs
 * as the output callback. Returns whatever solr_transform returns. */
933 int solr_transform_FILE(solr_transform_t ct, struct solr_node *cn, FILE *f)
935 /* We can use the cql_fputs util */
936 return solr_transform(ct, cn, cql_fputs, f);
/* Convert cn into the caller-supplied buffer out, whose size is max.
 * A struct solr_buf_write_info describes the buffer to the write callback.
 * On the overflow path the error YAZ_SRW_TOO_MANY_CHARS_IN_QUERY is
 * recorded on ct with the buffer size as additional info; on the other
 * path the output is NUL-terminated at info.off. The set-up of info, the
 * overflow test and the return statement are outside the visible lines. */
939 int solr_transform_buf(solr_transform_t ct, struct solr_node *cn, char *out, int max)
941 struct solr_buf_write_info info;
/* NOTE(review): the callback is cql_buf_write_handler although info is a
 * struct solr_buf_write_info and this file defines
 * solr_buf_write_handler - confirm the two info structs share a layout. */
947 r = solr_transform(ct, cn, cql_buf_write_handler, &info);
949 /* Attempt to write past end of buffer. For some reason, this
950 SRW diagnostic is deprecated, but it's so perfect for our
951 purposes that it would be stupid not to use it. */
953 ct->error = YAZ_SRW_TOO_MANY_CHARS_IN_QUERY;
954 sprintf(numbuf, "%ld", (long) info.max);
955 ct->addinfo = xstrdup(numbuf);
959 info.buf[info.off] = '\0';
/* Report the error state of ct: the additional-info string held by ct is
 * stored in *addinfo (the pointer is shared, not copied). The return
 * statement is outside the visible lines. */
963 int solr_transform_error(solr_transform_t ct, const char **addinfo)
965 *addinfo = ct->addinfo;
/* Set the additional info on ct to a private copy (xstrdup) of addinfo, or
 * to null when addinfo is null. The assignment of the error code and any
 * release of an earlier addinfo string are outside the visible lines. */
969 void solr_transform_set_error(solr_transform_t ct, int error, const char *addinfo)
972 ct->addinfo = addinfo ? xstrdup(addinfo) : 0;
979 * c-file-style: "Stroustrup"
980 * indent-tabs-mode: nil
982 * vim: shiftwidth=4 tabstop=8 expandtab