1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2011 Index Data
3 * See the file LICENSE for details.
6 * \file solrtransform.c
7 * \brief Implements SOLR transform (SOLR to RPN conversion).
17 #include <yaz/rpn2solr.h>
18 #include <yaz/xmalloc.h>
19 #include <yaz/diagsrw.h>
20 #include <yaz/tokenizer.h>
21 #include <yaz/wrbuf.h>
22 #include <yaz/z-core.h>
23 #include <yaz/matchstr.h>
24 #include <yaz/oid_db.h>
/* One configured mapping; entries are chained through `next' and the chain
 * is walked until a null pointer is reached. */
28 struct solr_prop_entry {
/* attribute elements belonging to this entry (embedded, not a pointer) */
31 Z_AttributeList attr_list;
/* following entry in the chain */
32 struct solr_prop_entry *next;
/* Transform state.  The functions in this file reach these fields through a
 * solr_transform_t handle (presumably a pointer to this struct -- the
 * typedef is not part of this file section). */
35 struct solr_transform_t_ {
/* head of the solr_prop_entry chain */
36 struct solr_prop_entry *entry;
/* tokenizer configuration used when specification text is parsed */
37 yaz_tok_cfg_t tok_cfg;
45 /* TODO: Utility functions; possibly split out into a separate file */
/* String comparison used for SOLR names; forwards directly to cql_strcmp()
 * and returns its result. */
46 int solr_strcmp(const char *s1, const char *s2) {
47 return cql_strcmp(s1, s2);
/* Length-limited counterpart of solr_strcmp(); forwards directly to
 * cql_strncmp() with the same three arguments. */
50 int solr_strncmp(const char *s1, const char *s2, size_t n) {
51 return cql_strncmp(s1, s2, n);
/* Returns the URI string that solr_transform_r() compares a node's index URI
 * against.  The returned literal is still a placeholder. */
55 const char *solr_uri(void)
57 return "TODO:SOLR URI";
/* Output callback that copies b into the buffer described by client_data,
 * which is interpreted as a struct solr_buf_write_info. */
60 void solr_buf_write_handler (const char *b, void *client_data)
62 struct solr_buf_write_info *info = (struct solr_buf_write_info *)client_data;
/* taken when the offset is already negative or when off + l would reach max
 * (l is declared on a line not shown here; presumably the length of b) */
64 if (info->off < 0 || (info->off + l >= info->max))
/* copy l bytes of b to the current offset inside the buffer */
69 memcpy (info->buf + info->off, b, l);
74 /* Utility functions end */
/* Allocates a transform handle with xmalloc() and creates the three helper
 * objects that solr_transform_close() later destroys: the tokenizer
 * configuration, the work buffer `w' and the NMEM pool. */
76 solr_transform_t solr_transform_create(void)
78 solr_transform_t ct = (solr_transform_t) xmalloc(sizeof(*ct));
79 ct->tok_cfg = yaz_tok_cfg_create();
80 ct->w = wrbuf_alloc();
84 ct->nmem = nmem_create();
/* Parses the attribute specification that remains in tp.  While parsing, a
 * textual form of the specification is accumulated in ct->w.  When nothing
 * went wrong (ret == 0) a new solr_prop_entry holding copies of pattern, of
 * that text and of the parsed attribute elements is stored so that it is
 * reachable from ct->entry.  Callers treat a non-zero result as failure. */
88 static int solr_transform_parse_tok_line(solr_transform_t ct,
/* parsed elements; the loop below stops once 20 have been collected */
93 Z_AttributeElement *ae[20];
94 int ret = 0; /* 0=OK, != 0 FAIL */
98 while (t == YAZ_TOK_STRING && ae_num < 20)
100 WRBUF type_str = wrbuf_alloc();
102 Z_AttributeElement *elem = 0;
103 const char *value_str = 0;
104 /* attset type=value OR type=value */
/* the element itself is allocated from ct->nmem */
106 elem = (Z_AttributeElement *) nmem_malloc(ct->nmem, sizeof(*elem));
107 elem->attributeSet = 0;
/* the first token is recorded both in the textual form and in type_str */
109 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
110 wrbuf_puts(type_str, yaz_tok_parse_string(tp));
111 t = yaz_tok_move(tp);
112 if (t == YAZ_TOK_EOF)
114 wrbuf_destroy(type_str);
116 wrbuf_destroy(set_str);
/* a second string token: set_str is looked up as an attribute-set OID and
 * a fresh type_str receives the token that follows */
119 if (t == YAZ_TOK_STRING)
121 wrbuf_puts(ct->w, " ");
122 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
126 yaz_string_to_oid_nmem(yaz_oid_std(), CLASS_ATTSET,
127 wrbuf_cstr(set_str), ct->nmem);
129 type_str = wrbuf_alloc();
130 wrbuf_puts(type_str, yaz_tok_parse_string(tp));
131 t = yaz_tok_move(tp);
/* attributeType starts out as 0 and is then filled in by sscanf() */
133 elem->attributeType = nmem_intdup(ct->nmem, 0);
134 if (sscanf(wrbuf_cstr(type_str), ODR_INT_PRINTF, elem->attributeType)
137 wrbuf_destroy(type_str);
139 wrbuf_destroy(set_str);
140 yaz_log(YLOG_WARN, "Expected numeric attribute type");
145 wrbuf_destroy(type_str);
147 wrbuf_destroy(set_str);
151 yaz_log(YLOG_WARN, "Expected = after after attribute type");
155 t = yaz_tok_move(tp);
156 if (t != YAZ_TOK_STRING) /* value */
158 yaz_log(YLOG_WARN, "Missing attribute value");
162 value_str = yaz_tok_parse_string(tp);
/* a value starting with a digit becomes a numeric attribute via atoi() */
/* NOTE(review): isdigit() receives a plain char here; the C library expects
 * an unsigned char value or EOF -- confirm whether a cast is needed */
163 if (isdigit(*value_str))
165 elem->which = Z_AttributeValue_numeric;
166 elem->value.numeric =
167 nmem_intdup(ct->nmem, atoi(value_str));
/* complex attribute: a one-slot list holding a copy of the string and no
 * semantic actions */
171 Z_ComplexAttribute *ca = (Z_ComplexAttribute *)
172 nmem_malloc(ct->nmem, sizeof(*ca));
173 elem->which = Z_AttributeValue_complex;
174 elem->value.complex = ca;
176 ca->list = (Z_StringOrNumeric **)
177 nmem_malloc(ct->nmem, sizeof(Z_StringOrNumeric *));
178 ca->list[0] = (Z_StringOrNumeric *)
179 nmem_malloc(ct->nmem, sizeof(Z_StringOrNumeric));
180 ca->list[0]->which = Z_StringOrNumeric_string;
181 ca->list[0]->u.string = nmem_strdup(ct->nmem, value_str);
182 ca->num_semanticAction = 0;
183 ca->semanticAction = 0;
/* complete the textual form with "=", the value token and a space */
185 wrbuf_puts(ct->w, "=");
186 wrbuf_puts(ct->w, yaz_tok_parse_string(tp));
187 t = yaz_tok_move(tp);
188 wrbuf_puts(ct->w, " ");
191 if (ret == 0) /* OK? */
193 struct solr_prop_entry **pp = &ct->entry;
/* the entry and its two strings come from xmalloc()/xstrdup(); the
 * attribute pointer array is taken from ct->nmem */
196 *pp = (struct solr_prop_entry *) xmalloc(sizeof(**pp));
197 (*pp)->pattern = xstrdup(pattern);
198 (*pp)->value = xstrdup(wrbuf_cstr(ct->w));
200 (*pp)->attr_list.num_attributes = ae_num;
202 (*pp)->attr_list.attributes = 0;
205 (*pp)->attr_list.attributes = (Z_AttributeElement **)
206 nmem_malloc(ct->nmem,
207 ae_num * sizeof(Z_AttributeElement *));
208 memcpy((*pp)->attr_list.attributes, ae,
209 ae_num * sizeof(Z_AttributeElement *));
/* print the attribute list through an ODR_PRINT stream that writes to the
 * YAZ log file */
215 ODR pr = odr_createmem(ODR_PRINT);
216 Z_AttributeList *alp = &(*pp)->attr_list;
217 odr_setprint(pr, yaz_log_file());
218 z_AttributeList(pr, &alp, 0, 0);
/* Defines one pattern from a specification string: a tokenizer is created
 * over value, the tokens are handed to solr_transform_parse_tok_line()
 * together with pattern, and the tokenizer is destroyed afterwards. */
226 int solr_transform_define_pattern(solr_transform_t ct, const char *pattern,
230 yaz_tok_parse_t tp = yaz_tok_parse_buf(ct->tok_cfg, value);
/* NOTE(review): "=" is registered as a single-character token only after
 * the parser for value has been created -- confirm this order is intended */
231 yaz_tok_cfg_single_tokens(ct->tok_cfg, "=");
232 r = solr_transform_parse_tok_line(ct, pattern, tp);
233 yaz_tok_parse_destroy(tp);
/* Builds a transform from specification text read from f.  Lines are read
 * with fgets() and tokenized with "=" as a single-character token.  When a
 * line starts with a string token, that token is the pattern and the rest of
 * the line goes to solr_transform_parse_tok_line().  Each error path visible
 * here destroys the line's tokenizer and closes the transform. */
237 solr_transform_t solr_transform_open_FILE(FILE *f)
239 solr_transform_t ct = solr_transform_create();
242 yaz_tok_cfg_single_tokens(ct->tok_cfg, "=");
244 while (fgets(line, sizeof(line)-1, f))
246 yaz_tok_parse_t tp = yaz_tok_parse_buf(ct->tok_cfg, line);
249 t = yaz_tok_move(tp);
250 if (t == YAZ_TOK_STRING)
/* NOTE(review): pattern is a heap copy; its release is not visible in this
 * section -- confirm it is freed on every path */
252 char * pattern = xstrdup(yaz_tok_parse_string(tp));
253 t = yaz_tok_move(tp);
256 yaz_tok_parse_destroy(tp);
257 solr_transform_close(ct);
/* a non-zero result from the line parser aborts loading */
260 if (solr_transform_parse_tok_line(ct, pattern, tp))
262 yaz_tok_parse_destroy(tp);
263 solr_transform_close(ct);
/* a first token that is neither a string nor end-of-input is an error */
268 else if (t != YAZ_TOK_EOF)
270 yaz_tok_parse_destroy(tp);
271 solr_transform_close(ct);
274 yaz_tok_parse_destroy(tp);
/* Releases a transform: the entry chain is walked, then the tokenizer
 * configuration, the work buffer and the NMEM pool are destroyed. */
279 void solr_transform_close(solr_transform_t ct)
281 struct solr_prop_entry *pe;
/* the successor is fetched first so the walk can continue once the
 * current entry has been dealt with */
287 struct solr_prop_entry *pe_next = pe->next;
294 yaz_tok_cfg_destroy(ct->tok_cfg);
295 wrbuf_destroy(ct->w);
296 nmem_destroy(ct->nmem);
/* Opens fname for reading and builds a transform from its contents via
 * solr_transform_open_FILE(). */
300 solr_transform_t solr_transform_open_fname(const char *fname)
/* NOTE(review): the null check and fclose() for f are not visible in this
 * section -- confirm both exist */
303 FILE *f = fopen(fname, "r");
306 ct = solr_transform_open_FILE(f);
/* NOTE(review): partial copy of the Z_AttributeElement layout, apparently
 * kept next to compare_attr() for reference only -- confirm that it is
 * excluded from compilation and still agrees with the real declaration. */
312 struct Z_AttributeElement {
313 Z_AttributeSetId *attributeSet; /* OPT */
318 Z_ComplexAttribute *complex;
319 #define Z_AttributeValue_numeric 1
320 #define Z_AttributeValue_complex 2
/* Compares two attribute elements by value: each one is encoded with its own
 * ODR encoder and the two encoded byte buffers are compared with
 * yaz_memcmp(), whose result is kept in ret. */
325 static int compare_attr(Z_AttributeElement *a, Z_AttributeElement *b)
327 ODR odr_a = odr_createmem(ODR_ENCODE);
328 ODR odr_b = odr_createmem(ODR_ENCODE);
333 z_AttributeElement(odr_a, &a, 0, 0);
334 z_AttributeElement(odr_b, &b, 0, 0);
/* fetch the encoded bytes and their lengths */
336 buf_a = odr_getbuf(odr_a, &len_a, 0);
337 buf_b = odr_getbuf(odr_b, &len_b, 0);
339 ret = yaz_memcmp(buf_a, buf_b, len_a, len_b);
/* Reverse lookup: scans the entry chain for an entry whose pattern begins
 * with category and whose attribute elements are each searched for in
 * attributes using compare_attr().  For the first entry that passes, the
 * part of its pattern following the category prefix is returned. */
346 const char *solr_lookup_reverse(solr_transform_t ct,
347 const char *category,
348 Z_AttributeList *attributes)
350 struct solr_prop_entry *e;
351 size_t clen = strlen(category);
352 for (e = ct->entry; e; e = e->next)
/* prefix match on the first clen characters only */
354 if (!strncmp(e->pattern, category, clen))
356 /* category matches.. See if attributes in pattern value
357 are all listed in actual attributes */
359 for (i = 0; i < e->attr_list.num_attributes; i++)
361 /* entry attribute */
362 Z_AttributeElement *e_ae = e->attr_list.attributes[i];
364 for (j = 0; j < attributes->num_attributes; j++)
366 /* actual attribute */
367 Z_AttributeElement *a_ae = attributes->attributes[j];
368 int r = compare_attr(e_ae, a_ae);
/* the inner loop ran to its end, so nothing matched entry attribute i */
372 if (j == attributes->num_attributes)
373 break; /* i was not found at all.. try next pattern */
/* the outer loop ran to its end, so every entry attribute was matched */
376 if (i == e->attr_list.num_attributes)
377 return e->pattern + clen;
/* Builds a dotted lookup key from pat1 and whichever of pat2 and pat3 are
 * non-null, each part limited to 39 characters, and searches the entry chain
 * for a pattern that equals the key under solr_strcmp(). */
383 static const char *solr_lookup_property(solr_transform_t ct,
384 const char *pat1, const char *pat2,
388 struct solr_prop_entry *e;
390 if (pat1 && pat2 && pat3)
391 sprintf(pattern, "%.39s.%.39s.%.39s", pat1, pat2, pat3);
392 else if (pat1 && pat2)
393 sprintf(pattern, "%.39s.%.39s", pat1, pat2);
394 else if (pat1 && pat3)
395 sprintf(pattern, "%.39s.%.39s", pat1, pat3);
397 sprintf(pattern, "%.39s", pat1);
/* linear search for an entry whose pattern compares equal to the key */
401 for (e = ct->entry; e; e = e->next)
403 if (!solr_strcmp(e->pattern, pattern))
/* Looks up the property for category and val (default_val is used when val
 * is null) and writes the items of the result through pr, each introduced
 * by "@attr ".  A prefix is taken from the "set." entry whose value equals
 * uri.  If errcode is non-zero and no error has been recorded on ct yet, an
 * error is recorded with a copy of val as additional info. */
409 int solr_pr_attr_uri(solr_transform_t ct, const char *category,
410 const char *uri, const char *val, const char *default_val,
411 void (*pr)(const char *buf, void *client_data),
416 const char *eval = val ? val : default_val;
417 const char *prefix = 0;
421 struct solr_prop_entry *e;
/* find the "set.<prefix>" entry whose value is the given URI */
423 for (e = ct->entry; e; e = e->next)
424 if (!memcmp(e->pattern, "set.", 4) && e->value &&
425 !strcmp(e->value, uri))
427 prefix = e->pattern+4;
430 /* must have a prefix now - if not it's an error */
436 res = solr_lookup_property(ct, category, prefix, eval);
437 /* we have some aliases for some relations unfortunately.. */
/* NOTE(review): the alias tests below compare val, not eval; val may be
 * null at this point -- confirm that callers never reach this branch with
 * a null val */
438 if (!res && !prefix && !strcmp(category, "relation"))
440 if (!strcmp(val, "=="))
441 res = solr_lookup_property(ct, category, prefix, "exact");
442 if (!strcmp(val, "="))
443 res = solr_lookup_property(ct, category, prefix, "eq");
444 if (!strcmp(val, "<="))
445 res = solr_lookup_property(ct, category, prefix, "le");
446 if (!strcmp(val, ">="))
447 res = solr_lookup_property(ct, category, prefix, "ge");
/* further lookup using the wildcard key "*" */
450 res = solr_lookup_property(ct, category, prefix, "*");
/* res is scanned for '=' characters; the text up to the following space is
 * copied into buf (bounded by sizeof(buf)) and written out after "@attr " */
456 const char *cp0 = res, *cp1;
457 while ((cp1 = strchr(cp0, '=')))
460 while (*cp1 && *cp1 != ' ')
462 if (cp1 - cp0 >= (ptrdiff_t) sizeof(buf))
464 memcpy(buf, cp0, cp1 - cp0);
466 (*pr)("@attr ", client_data);
/* buf is written piece by piece; eval is written at one point inside this
 * loop (its condition is on a line not shown -- presumably for a
 * placeholder character, TODO confirm) */
468 for (i = 0; buf[i]; i++)
471 (*pr)(eval, client_data);
477 (*pr)(tmp, client_data);
480 (*pr)(" ", client_data);
/* only the first error is kept */
488 if (errcode && !ct->error)
492 ct->addinfo = xstrdup(val);
/* Convenience form of solr_pr_attr_uri() for callers without a URI: passes
 * 0 as uri and forwards every other argument unchanged. */
499 int solr_pr_attr(solr_transform_t ct, const char *category,
500 const char *val, const char *default_val,
501 void (*pr)(const char *buf, void *client_data),
505 return solr_pr_attr_uri(ct, category, 0 /* uri */,
506 val, default_val, pr, client_data, errcode);
/* Writes val in decimal through pr, followed by a single space. */
510 static void solr_pr_int(int val,
511 void (*pr)(const char *buf, void *client_data),
514 char buf[21]; /* enough characters to 2^64 */
515 sprintf(buf, "%d", val);
516 (*pr)(buf, client_data);
517 (*pr)(" ", client_data);
/* Writes the proximity parameters derived from the modifier chain mods:
 * exclusion, distance, ordered and relation, the literal "k ", and finally
 * the unit.  An unsupported relation, unit or modifier name is recorded on
 * ct as an error with a copy of the offending text.  The caller tests the
 * result with `!' (presumably zero means failure -- TODO confirm). */
521 static int solr_pr_prox(solr_transform_t ct, struct solr_node *mods,
522 void (*pr)(const char *buf, void *client_data),
526 int distance; /* to be filled in later depending on unit */
527 int distance_defined = 0;
529 int proxrel = 2; /* less than or equal */
530 int unit = 2; /* word */
534 const char *name = mods->u.st.index;
535 const char *term = mods->u.st.term;
536 const char *relation = mods->u.st.relation;
/* "distance": the term is parsed with strtol() (base 0) and the relation
 * text selects the proximity relation */
538 if (!strcmp(name, "distance")) {
539 distance = strtol(term, (char**) 0, 0);
540 distance_defined = 1;
541 if (!strcmp(relation, "="))
543 else if (!strcmp(relation, ">"))
545 else if (!strcmp(relation, "<"))
547 else if (!strcmp(relation, ">="))
549 else if (!strcmp(relation, "<="))
551 else if (!strcmp(relation, "<>"))
555 ct->error = YAZ_SRW_UNSUPP_PROX_RELATION;
556 ct->addinfo = xstrdup(relation);
560 else if (!strcmp(name, "ordered"))
562 else if (!strcmp(name, "unordered"))
/* "unit": word, sentence, paragraph and element are recognised */
564 else if (!strcmp(name, "unit"))
566 if (!strcmp(term, "word"))
568 else if (!strcmp(term, "sentence"))
570 else if (!strcmp(term, "paragraph"))
572 else if (!strcmp(term, "element"))
576 ct->error = YAZ_SRW_UNSUPP_PROX_UNIT;
577 ct->addinfo = xstrdup(term);
/* any other modifier name is unsupported */
583 ct->error = YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER;
584 ct->addinfo = xstrdup(name);
587 mods = mods->u.st.modifiers;
/* no explicit distance: 1 when the unit is 2 (word), otherwise 0 */
590 if (!distance_defined)
591 distance = (unit == 2) ? 1 : 0;
593 solr_pr_int(exclusion, pr, client_data);
594 solr_pr_int(distance, pr, client_data);
595 solr_pr_int(ordered, pr, client_data);
596 solr_pr_int(proxrel, pr, client_data);
597 (*pr)("k ", client_data);
598 solr_pr_int(unit, pr, client_data);
603 /* Returns location of first wildcard character in the `length'
604 * characters starting at `term', or a null pointer if there are
605 * none -- like memchr().
/* Wildcard scan over term.  A character is tested against the set "*?" only
 * when start is non-zero or the preceding character is not a backslash;
 * because of the short-circuit, term[-1] is read only when start is zero. */
607 static const char *wcchar(int start, const char *term, int length)
611 if (start || term[-1] != '\\')
612 if (strchr("*?", *term))
622 /* ### checks for SOLR relation-name rather than Type-1 attribute */
/* Walks cn's modifier chain looking for a modifier whose index equals name
 * (exact strcmp match).  emit_term() negates the result to decide whether a
 * term should be processed. */
623 static int has_modifier(struct solr_node *cn, const char *name) {
624 struct solr_node *mod;
625 for (mod = cn->u.st.modifiers; mod != 0; mod = mod->u.st.modifiers) {
626 if (!strcmp(mod->u.st.index, name))
/* Writes one search term through pr: attributes for position, truncation,
 * index and relation modifiers, and then the first `length' characters of
 * term enclosed in double quotes.  Anchor and wildcard handling is skipped
 * when cn carries a "regexp" modifier.  cn must be a search-term node. */
634 static void emit_term(solr_transform_t ct,
635 struct solr_node *cn,
636 const char *term, int length,
637 void (*pr)(const char *buf, void *client_data),
641 const char *ns = cn->u.st.index_uri;
642 int process_term = !has_modifier(cn, "regexp");
645 assert(cn->which == SOLR_NODE_ST);
/* '^' at both ends, at the start only or at the end only selects
 * firstAndLast, first or last; otherwise the position is "any" */
647 if (process_term && length > 0)
649 if (length > 1 && term[0] == '^' && term[length-1] == '^')
651 solr_pr_attr(ct, "position", "firstAndLast", 0,
652 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
656 else if (term[0] == '^')
658 solr_pr_attr(ct, "position", "first", 0,
659 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
663 else if (term[length-1] == '^')
665 solr_pr_attr(ct, "position", "last", 0,
666 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
671 solr_pr_attr(ct, "position", "any", 0,
672 pr, client_data, YAZ_SRW_ANCHORING_CHAR_IN_UNSUPP_POSITION);
/* truncation: locate the first wildcard and, if there is one, the next */
676 if (process_term && length > 0)
678 const char *first_wc = wcchar(1, term, length);
679 const char *second_wc = first_wc ?
680 wcchar(0, first_wc+1, length-(first_wc-term)-1) : 0;
682 /* Check for well-known globbing patterns that represent
683 * simple truncation attributes as expected by, for example,
684 * Bath-compliant server. If we find such a pattern but
685 * there's no mapping for it, that's fine: we just use a
686 * general pattern-matching attribute.
688 if (first_wc == term && second_wc == term + length-1
689 && *first_wc == '*' && *second_wc == '*'
690 && solr_pr_attr(ct, "truncation", "both", 0, pr, client_data, 0))
695 else if (first_wc == term && second_wc == 0 && *first_wc == '*'
696 && solr_pr_attr(ct, "truncation", "left", 0,
702 else if (first_wc == term + length-1 && second_wc == 0
704 && solr_pr_attr(ct, "truncation", "right", 0,
711 /* We have one or more wildcard characters, but not in a
712 * way that can be dealt with using only the standard
713 * left-, right- and both-truncation attributes. We need
714 * to translate the pattern into a Z39.58-type pattern,
715 * which has been supported in BIB-1 since 1996. If
716 * there's no configuration element for "truncation.z3958"
717 * we indicate this as error 28 "Masking character not
721 solr_pr_attr(ct, "truncation", "z3958", 0,
722 pr, client_data, YAZ_SRW_MASKING_CHAR_UNSUPP);
723 z3958_mem = (char *) xmalloc(length+1);
724 for (i = 0; i < length; i++)
726 if (i > 0 && term[i-1] == '\\')
727 z3958_mem[i] = term[i];
728 else if (term[i] == '*')
730 else if (term[i] == '?')
733 z3958_mem[i] = term[i];
735 z3958_mem[length] = '\0';
739 /* No masking characters. Use "truncation.none" if given. */
740 solr_pr_attr(ct, "truncation", "none", 0,
/* index attribute; "serverChoice" is the default when the node has no index */
745 solr_pr_attr_uri(ct, "index", ns,
746 cn->u.st.index, "serverChoice",
747 pr, client_data, YAZ_SRW_UNSUPP_INDEX);
/* one relationModifier attribute per modifier on the node */
749 if (cn->u.st.modifiers)
751 struct solr_node *mod = cn->u.st.modifiers;
752 for (; mod; mod = mod->u.st.modifiers)
754 solr_pr_attr(ct, "relationModifier", mod->u.st.index, 0,
755 pr, client_data, YAZ_SRW_UNSUPP_RELATION_MODIFIER);
/* the term itself, written between double quotes */
759 (*pr)("\"", client_data);
760 for (i = 0; i<length; i++)
762 /* pr(int) each character */
763 /* we do not need to deal with \-sequences because the
764 SOLR and PQF terms have same \-format, bug #1988 */
769 (*pr)(buf, client_data);
771 (*pr)("\" ", client_data);
/* Writes cn's own term and then every term on its extra_terms chain, with
 * "@<op> " operators written ahead of the terms they combine.  All terms,
 * including the extra ones, are emitted with cn as the node argument. */
775 static void emit_terms(solr_transform_t ct,
776 struct solr_node *cn,
777 void (*pr)(const char *buf, void *client_data),
781 struct solr_node *ne = cn->u.st.extra_terms;
784 (*pr)("@", client_data);
785 (*pr)(op, client_data);
786 (*pr)(" ", client_data);
788 emit_term(ct, cn, cn->u.st.term, strlen(cn->u.st.term),
790 for (; ne; ne = ne->u.st.extra_terms)
/* another operator is needed only while more extra terms follow */
792 if (ne->u.st.extra_terms)
794 (*pr)("@", client_data);
795 (*pr)(op, client_data);
796 (*pr)(" ", client_data);
798 emit_term(ct, cn, ne->u.st.term, strlen(ne->u.st.term),
/* Splits cn's term at space characters and emits the words through
 * emit_term().  The most recently found word is remembered in
 * last_term/last_length; inside the loop it is written after an "@<op> "
 * prefix, and the final word is written without a prefix. */
803 static void emit_wordlist(solr_transform_t ct,
804 struct solr_node *cn,
805 void (*pr)(const char *buf, void *client_data),
809 const char *cp0 = cn->u.st.term;
811 const char *last_term = 0;
817 cp1 = strchr(cp0, ' ');
820 (*pr)("@", client_data);
821 (*pr)(op, client_data);
822 (*pr)(" ", client_data);
823 emit_term(ct, cn, last_term, last_length, pr, client_data);
/* word length: up to the space that was found, or to the end of the string */
827 last_length = cp1 - cp0;
829 last_length = strlen(cp0);
833 emit_term(ct, cn, last_term, last_length, pr, client_data);
/* Recursive worker that writes the converted form of node cn through pr.
 * Search-term nodes: a "resultSet" index in the solr_uri() namespace yields
 * an "@set" reference; otherwise the always, relation and structure
 * attributes are written, followed by the term(s).  Boolean nodes: the
 * operator is written as "@<value> ", proximity parameters are handled by
 * solr_pr_prox(), and both operands are processed recursively.  An unknown
 * node type is reported on stderr. */
836 void solr_transform_r(solr_transform_t ct,
837 struct solr_node *cn,
838 void (*pr)(const char *buf, void *client_data),
842 struct solr_node *mods;
849 ns = cn->u.st.index_uri;
852 /* TODO If relevant fix with solr_uri */
853 if (!strcmp(ns, solr_uri())
854 && cn->u.st.index && !solr_strcmp(cn->u.st.index, "resultSet"))
856 (*pr)("@set \"", client_data);
857 (*pr)(cn->u.st.term, client_data);
858 (*pr)("\" ", client_data);
866 ct->error = YAZ_SRW_UNSUPP_CONTEXT_SET;
/* "always" carries no value and no error code; relation and structure are
 * both looked up using the node's relation */
870 solr_pr_attr(ct, "always", 0, 0, pr, client_data, 0);
871 solr_pr_attr(ct, "relation", cn->u.st.relation, 0, pr, client_data,
872 YAZ_SRW_UNSUPP_RELATION);
873 solr_pr_attr(ct, "structure", cn->u.st.relation, 0,
874 pr, client_data, YAZ_SRW_UNSUPP_COMBI_OF_RELATION_AND_TERM);
/* relation "all" joins the words with "and", "any" with "or"; every other
 * relation goes through emit_terms() with "and" */
875 if (cn->u.st.relation && !solr_strcmp(cn->u.st.relation, "all"))
876 emit_wordlist(ct, cn, pr, client_data, "and");
877 else if (cn->u.st.relation && !solr_strcmp(cn->u.st.relation, "any"))
878 emit_wordlist(ct, cn, pr, client_data, "or");
880 emit_terms(ct, cn, pr, client_data, "and");
883 (*pr)("@", client_data);
884 (*pr)(cn->u.boolean.value, client_data);
885 (*pr)(" ", client_data);
886 mods = cn->u.boolean.modifiers;
887 if (!strcmp(cn->u.boolean.value, "prox"))
889 if (!solr_pr_prox(ct, mods, pr, client_data))
894 /* Boolean modifiers other than on proximity not supported */
895 ct->error = YAZ_SRW_UNSUPP_BOOLEAN_MODIFIER;
896 ct->addinfo = xstrdup(mods->u.st.index);
/* operands follow the operator: left first, then right */
900 solr_transform_r(ct, cn->u.boolean.left, pr, client_data);
901 solr_transform_r(ct, cn->u.boolean.right, pr, client_data);
905 fprintf(stderr, "Fatal: impossible SOLR node-type %d\n", cn->which);
/* Public entry point: converts the tree rooted at cn by running
 * solr_transform_r() with pr as the output callback.  A temporary NMEM pool
 * is created first, and the entry chain is walked before the conversion;
 * the prefix handling in that loop is commented out. */
910 int solr_transform(solr_transform_t ct, struct solr_node *cn,
911 void (*pr)(const char *buf, void *client_data),
914 struct solr_prop_entry *e;
915 NMEM nmem = nmem_create();
921 for (e = ct->entry; e ; e = e->next)
923 /* TODO remove as SOLR dont supports sets.
924 if (!solr_strncmp(e->pattern, "set.", 4))
925 solr_apply_prefix(nmem, cn, e->pattern+4, e->value);
926 else if (!solr_strcmp(e->pattern, "set"))
927 solr_apply_prefix(nmem, cn, 0, e->value);
930 solr_transform_r(ct, cn, pr, client_data);
/* Converts cn and writes the output to the stdio stream f by calling
 * solr_transform() with cql_fputs as the output callback; returns its
 * result. */
936 int solr_transform_FILE(solr_transform_t ct, struct solr_node *cn, FILE *f)
938 /* We can use the cql_fputs util */
939 return solr_transform(ct, cn, cql_fputs, f);
/* Converts cn into the caller's character buffer `out'.  On overflow the
 * error YAZ_SRW_TOO_MANY_CHARS_IN_QUERY is recorded with info.max, printed
 * in decimal, as additional info; otherwise the output is NUL-terminated at
 * info.off.  (How info is initialised from out and max is on lines not
 * shown here.) */
942 int solr_transform_buf(solr_transform_t ct, struct solr_node *cn, char *out, int max)
944 struct solr_buf_write_info info;
/* NOTE(review): info is a struct solr_buf_write_info but the callback
 * passed is cql_buf_write_handler, although this file also defines
 * solr_buf_write_handler -- confirm the two info layouts are identical */
950 r = solr_transform(ct, cn, cql_buf_write_handler, &info);
952 /* Attempt to write past end of buffer. For some reason, this
953 SRW diagnostic is deprecated, but it's so perfect for our
954 purposes that it would be stupid not to use it. */
956 ct->error = YAZ_SRW_TOO_MANY_CHARS_IN_QUERY;
957 sprintf(numbuf, "%ld", (long) info.max);
958 ct->addinfo = xstrdup(numbuf);
962 info.buf[info.off] = '\0';
/* Error query: stores ct's additional-info string in *addinfo.  The pointer
 * stored is ct's own, not a copy.  (The return statement is not visible in
 * this section; presumably the recorded error code -- TODO confirm.) */
966 int solr_transform_error(solr_transform_t ct, const char **addinfo)
968 *addinfo = ct->addinfo;
/* Sets ct's additional-info to a private xstrdup() copy of addinfo, or to 0
 * when addinfo is null.  (The handling of `error' and of any previous
 * addinfo string is on lines not shown here.) */
972 void solr_transform_set_error(solr_transform_t ct, int error, const char *addinfo)
975 ct->addinfo = addinfo ? xstrdup(addinfo) : 0;
982 * c-file-style: "Stroustrup"
983 * indent-tabs-mode: nil
985 * vim: shiftwidth=4 tabstop=8 expandtab