yaz-client: fix bad encoding of "itemorder item"

[yaz-moved-to-github.git] / src / cclfind.c
diff --git a/src/cclfind.c b/src/cclfind.c

index 69f059c..caceeea 100644 (file)
--- a/src/cclfind.c
+++ b/src/cclfind.c
@@ -1,47 +1,7 @@
-/*
- * Copyright (c) 1995, the EUROPAGATE consortium (see below).
- *
- * The EUROPAGATE consortium members are:
- *
- *    University College Dublin
- *    Danmarks Teknologiske Videnscenter
- *    An Chomhairle Leabharlanna
- *    Consejo Superior de Investigaciones Cientificas
- *
- * Permission to use, copy, modify, distribute, and sell this software and
- * its documentation, in whole or in part, for any purpose, is hereby granted,
- * provided that:
- *
- * 1. This copyright and permission notice appear in all copies of the
- * software and its documentation. Notices of copyright or attribution
- * which appear at the beginning of any file must remain unchanged.
- *
- * 2. The names of EUROPAGATE or the project partners may not be used to
- * endorse or promote products derived from this software without specific
- * prior written permission.
- *
- * 3. Users of this software (implementors and gateway operators) agree to
- * inform the EUROPAGATE consortium of their use of the software. This
- * information will be used to evaluate the EUROPAGATE project and the
- * software, and to plan further developments. The consortium may use
- * the information in later publications.
- * 
- * 4. Users of this software agree to make their best efforts, when
- * documenting their use of the software, to acknowledge the EUROPAGATE
- * consortium, and the role played by the software in their work.
- *
- * THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS, IMPLIED, OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
- * IN NO EVENT SHALL THE EUROPAGATE CONSORTIUM OR ITS MEMBERS BE LIABLE
- * FOR ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF
- * ANY KIND, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
- * OR PROFITS, WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND
- * ON ANY THEORY OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE
- * USE OR PERFORMANCE OF THIS SOFTWARE.
- *
+/* This file is part of the YAZ toolkit.
+ * Copyright (C) 1995-2012 Index Data
+ * See the file LICENSE for details.
   */
-
  /** 
   * \file cclfind.c
   * \brief Implements parsing of a CCL FIND query.
@@ -51,67 +11,13 @@
   * of lookahead in the handling of relational operations.. So
   * it's not really pure.
   */
-
-
-/* CCL find (to rpn conversion)
- * Europagate, 1995
- *
- * $Id: cclfind.c,v 1.13 2007-04-30 19:55:40 adam Exp $
- *
- * Old Europagate log:
- *
- * Revision 1.16  1996/01/08  08:41:13  adam
- * Removed unused function.
- *
- * Revision 1.15  1995/07/20  08:14:34  adam
- * Qualifiers were observed too often. Instead tokens are treated as
- * qualifiers only when separated by comma.
- *
- * Revision 1.14  1995/05/16  09:39:26  adam
- * LICENSE.
- *
- * Revision 1.13  1995/04/17  09:31:42  adam
- * Improved handling of qualifiers. Aliases or reserved words.
- *
- * Revision 1.12  1995/03/20  15:27:43  adam
- * Minor changes.
- *
- * Revision 1.11  1995/02/23  08:31:59  adam
- * Changed header.
- *
- * Revision 1.9  1995/02/16  13:20:06  adam
- * Spell fix.
- *
- * Revision 1.8  1995/02/14  19:59:42  adam
- * Removed a syntax error.
- *
- * Revision 1.7  1995/02/14  19:55:10  adam
- * Header files ccl.h/cclp.h are gone! They have been merged an
- * moved to ../include/ccl.h.
- * Node kind(s) in ccl_rpn_node have changed names.
- *
- * Revision 1.6  1995/02/14  16:20:55  adam
- * Qualifiers are read from a file now.
- *
- * Revision 1.5  1995/02/14  14:12:41  adam
- * Ranges for ordered qualfiers implemented (e.g. pd=1980-1990).
- *
- * Revision 1.4  1995/02/14  13:16:29  adam
- * Left and/or right truncation implemented.
- *
- * Revision 1.3  1995/02/14  10:25:56  adam
- * The constructions 'qualifier rel term ...' implemented.
- *
- * Revision 1.2  1995/02/13  15:15:07  adam
- * Added handling of qualifiers. Not finished yet.
- *
- * Revision 1.1  1995/02/13  12:35:20  adam
- * First version of CCL. Qualifiers aren't handled yet.
- *
- */
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
  
  #include <stdlib.h>
  #include <string.h>
+#include <assert.h>
  
  #include "cclp.h"
  
@@ -199,6 +105,7 @@ struct ccl_rpn_node *ccl_rpn_node_create(enum ccl_rpn_kind kind)
      case CCL_RPN_TERM:
          p->u.t.attr_list = 0;
          p->u.t.term = 0;
+        p->u.t.qual = 0;
          break;
      default:
          break;
@@ -225,6 +132,7 @@ void ccl_rpn_delete(struct ccl_rpn_node *rpn)
          break;
      case CCL_RPN_TERM:
          xfree(rpn->u.t.term);
+        xfree(rpn->u.t.qual);
          for (attr = rpn->u.t.attr_list; attr; attr = attr1)
          {
              attr1 = attr->next;
@@ -304,7 +212,22 @@ void ccl_add_attr_string(struct ccl_rpn_node *p, const char *set,
      n->value.str = xstrdup(value);
  }
  
+/**
+ * \brief Tests whether \a input begins with one of the operator aliases.
+ * \param aliases array of alias strings, terminated by a NULL entry
+ * \param input   text to test; it is compared character by character
+ *                against each alias until the first mismatch
+ * \return length of the first alias that is a prefix of \a input,
+ *         or 0 when no alias matches (or an argument is NULL)
+ *
+ * Aliases are tried in array order, so the first matching entry wins
+ * even if a later one is longer.
+ */
+static size_t cmp_operator(const char **aliases, const char *input)
+{
+    if (!aliases || !input)
+        return 0;
+    for (; *aliases; aliases++)
+    {
+        const char *cp = *aliases;
+        size_t i;
+        for (i = 0; *cp && *cp == input[i]; i++, cp++)
+            ;
+        /* i > 0 skips an empty alias: it would otherwise end the search
+         * with a zero-length "match" before later aliases are tried */
+        if (*cp == '\0' && i > 0)
+            return i;
+    }
+    return 0;
+}
  
+#define REGEX_CHARS "^[]{}()|.*+?!$"
+#define CCL_CHARS "#?\\"
  /**
   * search_term: Parse CCL search term. 
   * cclp:   CCL Parser
@@ -343,15 +266,16 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
          struct ccl_rpn_node *p;
          size_t no, i;
          int no_spaces = 0;
-        int left_trunc = 0;
-        int right_trunc = 0;
-        int mid_trunc = 0;
          int relation_value = -1;
          int position_value = -1;
          int structure_value = -1;
          int truncation_value = -1;
          int completeness_value = -1;
          int len = 0;
+        int left_trunc = 0;
+        int right_trunc = 0;
+        int regex_trunc = 0;
+        int z3958_trunc = 0;
          size_t max = 200;
          if (and_list || or_list || !multi)
              max = 1;
@@ -371,16 +295,6 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
              for (i = 0; i<lookahead->len; i++)
                  if (lookahead->name[i] == ' ')
                      no_spaces++;
-                else if (strchr(truncation_aliases[0], lookahead->name[i]))
-                {
-                    if (no == 0 && i == 0 && lookahead->len >= 1)
-                        left_trunc = 1;
-                    else if (!is_term_ok(lookahead->next->kind, term_list) &&
-                             i == lookahead->len-1 && i >= 1)
-                        right_trunc = 1;
-                    else
-                        mid_trunc = 1;
-                }
              len += 1+lookahead->len+lookahead->ws_prefix_len;
              lookahead = lookahead->next;
          }
@@ -392,6 +306,12 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
          p = ccl_rpn_node_create(CCL_RPN_TERM);
          p->u.t.attr_list = NULL;
          p->u.t.term = NULL;
+        if (qa && qa[0])
+        {
+            const char *n = ccl_qual_get_name(qa[0]);
+            if (n)
+                p->u.t.qual = xstrdup(n);
+        }
  
          /* go through all attributes and add them to the attribute list */
          for (i=0; qa && qa[i]; i++)
@@ -429,7 +349,6 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
                              if (truncation_value != -1)
                                  continue;
                              truncation_value = attr->value.numeric;
-                            left_trunc = right_trunc = mid_trunc = 0;
                              break;
                          case CCL_BIB1_COM:
                              if (completeness_value != -1)
@@ -455,46 +374,116 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
                  ccl_add_attr_numeric(p, attset, CCL_BIB1_STR, 1);
          }
  
+        if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_REGEX,
+                          &attset))
+        {
+            regex_trunc = 1; /* regex trunc (102) allowed */
+        }
+        else if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_Z3958,
+                          &attset))
+        {
+            z3958_trunc = 1; /* Z39.58 trunc (CCL) trunc allowed */
+        }
+
          /* make the RPN token */
-        p->u.t.term = (char *)xmalloc(len);
+        p->u.t.term = (char *)xmalloc(len * 2 + 2);
          ccl_assert(p->u.t.term);
          p->u.t.term[0] = '\0';
          for (i = 0; i<no; i++)
          {
              const char *src_str = cclp->look_token->name;
              size_t src_len = cclp->look_token->len;
-            
-            if (i == 0 && left_trunc)
+            int j;
+            int quote_mode = 0;
+
+            if (p->u.t.term[0] && cclp->look_token->ws_prefix_len)
              {
-                src_len--;
-                src_str++;
+                strxcat(p->u.t.term, cclp->look_token->ws_prefix_buf,
+                        cclp->look_token->ws_prefix_len);
              }
-            if (i == no-1 && right_trunc)
-                src_len--;
-            if (!ccl_qual_match_stop(cclp->bibset, qa, src_str, src_len))
+            for (j = 0; j < src_len; j++)
              {
-#if 0
-                fprintf(stderr, "[%s %.*s]",
-                        ccl_qual_get_name(qa[0]), src_len, src_str);
-#endif
-                if (p->u.t.term[0] && cclp->look_token->ws_prefix_len)
+                size_t op_size;
+                if (j > 0 && src_str[j-1] == '\\')
+                {
+                    if (regex_trunc && strchr(REGEX_CHARS "\\", src_str[j]))
+                    {
+                        regex_trunc = 2;
+                        strcat(p->u.t.term, "\\");
+                    }
+                    else if (z3958_trunc && strchr(CCL_CHARS "\\", src_str[j]))
+                    {
+                        z3958_trunc = 2;
+                        strcat(p->u.t.term, "\\");
+                    }
+                    strxcat(p->u.t.term, src_str + j, 1);
+                }
+                else if (src_str[j] == '"')
+                    quote_mode = !quote_mode;
+                else if (!quote_mode &&
+                         (op_size = cmp_operator(truncation_aliases,
+                                                 src_str + j))
+                    )
                  {
-                    size_t len = strlen(p->u.t.term);
-                    memcpy(p->u.t.term + len, cclp->look_token->ws_prefix_buf,
-                           cclp->look_token->ws_prefix_len);
-                    p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0';
+                    j += (op_size - 1);  /* j++ in for loop */
+                    if (regex_trunc)
+                    {
+                        strcat(p->u.t.term, ".*");
+                        regex_trunc = 2; /* regex trunc is really needed */
+                    }
+                    else if (z3958_trunc)
+                    {
+                        strcat(p->u.t.term, "?");
+                        z3958_trunc = 2;
+                    }
+                    else if (i == 0 && j == 0)
+                        left_trunc = 1;
+                    else if (i == no - 1 && j == src_len - 1)
+                        right_trunc = 1;
+                    else
+                    {
+                        cclp->error_code = CCL_ERR_TRUNC_NOT_EMBED;
+                        ccl_rpn_delete(p);
+                        return NULL;
+                    }
+                }
+                else if (!quote_mode && src_str[j] == '#')
+                {
+                    if (regex_trunc)
+                    {
+                        strcat(p->u.t.term, ".");
+                        regex_trunc = 2; /* regex trunc is really needed */
+                    }
+                    else if (z3958_trunc)
+                    {
+                        strcat(p->u.t.term, "#");
+                        z3958_trunc = 2;
+                    }
+                    else
+                    {
+                        cclp->error_code = CCL_ERR_TRUNC_NOT_SINGLE;
+                        ccl_rpn_delete(p);
+                        return NULL;
+                    }
+                }
+                else if (src_str[j] != '\\')
+                {
+                    if (regex_trunc && strchr(REGEX_CHARS, src_str[j]))
+                    {
+                        regex_trunc = 2;
+                        strcat(p->u.t.term, "\\");
+                    }
+                    else if (z3958_trunc && strchr(CCL_CHARS, src_str[j]))
+                    {
+                        z3958_trunc = 2;
+                        strcat(p->u.t.term, "\\");
+                    }
+                    strxcat(p->u.t.term, src_str + j, 1);                    
                  }
-                strxcat(p->u.t.term, src_str, src_len);
              }
              ADVANCE;
          }
  
-        if (p->u.t.term[0] == 0)
-        {
-            ccl_rpn_delete(p);
-            continue;
-        }
-
          /* make the top node point to us.. */
          if (p_top)
          {
@@ -548,6 +537,14 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
              }
              ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 2);
          }
+        else if (regex_trunc == 2)
+        {
+            ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 102);
+        }
+        else if (z3958_trunc == 2)
+        {
+            ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 104);
+        }
          else
          {
              if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_NONE,
@@ -568,6 +565,37 @@ static struct ccl_rpn_node *search_term(CCL_parser cclp, ccl_qualifier_t *qa)
      return search_term_x(cclp, qa, list, 0);
  }
  
+
+/**
+ * search_terms2: Parse either a parenthesised sub-expression or a
+ * sequence of search terms.
+ * cclp:   CCL Parser
+ * qa:     Qualifier attributes passed on to the sub-parsers.
+ * return: pointer to node(s); NULL on error.
+ *
+ * When the current token is a left parenthesis, the contents are parsed
+ * by find_spec and a closing right parenthesis is required; if it is
+ * missing, CCL_ERR_RP_EXPECTED is set and the sub-tree is released.
+ * Any other token is handed to search_term_x with its last argument
+ * set to 1.
+ */
+static struct ccl_rpn_node *search_terms2(CCL_parser cclp,
+                                          ccl_qualifier_t *qa)
+{
+    /* token kinds accepted as part of a term; -1 ends the table */
+    static int term_kinds[] = {
+        CCL_TOK_TERM, CCL_TOK_COMMA, CCL_TOK_EQ,
+        CCL_TOK_REL, CCL_TOK_SET, -1
+    };
+    struct ccl_rpn_node *sub;
+
+    if (KIND != CCL_TOK_LP)
+        return search_term_x(cclp, qa, term_kinds, 1);
+
+    ADVANCE;    /* step over the '(' */
+    sub = find_spec(cclp, qa);
+    if (sub == NULL)
+        return NULL;
+    if (KIND == CCL_TOK_RP)
+    {
+        ADVANCE;    /* step over the ')' */
+        return sub;
+    }
+    cclp->error_code = CCL_ERR_RP_EXPECTED;
+    ccl_rpn_delete(sub);
+    return NULL;
+}
+
+
+
  static
  struct ccl_rpn_node *qualifiers_order(CCL_parser cclp,
                                        ccl_qualifier_t *ap, char *attset)
@@ -710,20 +738,6 @@ struct ccl_rpn_node *qualifiers_order(CCL_parser cclp,
          ccl_add_attr_numeric(p, attset, CCL_BIB1_REL, 2);
          return p;
      }
-    else if (KIND == CCL_TOK_LP)
-    {
-        ADVANCE;
-        if (!(p = find_spec(cclp, ap)))
-            return NULL;
-        if (KIND != CCL_TOK_RP)
-        {
-            cclp->error_code = CCL_ERR_RP_EXPECTED;
-            ccl_rpn_delete(p);
-            return NULL;
-        }
-        ADVANCE;
-        return p;
-    }
      else
      {
          if (!(p = search_terms(cclp, ap)))
@@ -739,7 +753,6 @@ static
  struct ccl_rpn_node *qualifier_relation(CCL_parser cclp, ccl_qualifier_t *ap)
  {
      char *attset;
-    struct ccl_rpn_node *p;
      
      if (qual_val_type(ap, CCL_BIB1_REL, CCL_BIB1_REL_ORDER, &attset)
          || qual_val_type(ap, CCL_BIB1_REL, CCL_BIB1_REL_PORDER, &attset))
@@ -752,24 +765,7 @@ struct ccl_rpn_node *qualifier_relation(CCL_parser cclp, ccl_qualifier_t *ap)
          return NULL;
      }
      ADVANCE;
-    if (KIND == CCL_TOK_LP)
-    {
-        ADVANCE;
-        if (!(p = find_spec(cclp, ap)))
-        {
-            return NULL;
-        }
-        if (KIND != CCL_TOK_RP)
-        {
-            cclp->error_code = CCL_ERR_RP_EXPECTED;
-            ccl_rpn_delete(p);
-            return NULL;
-        }
-        ADVANCE;
-    }
-    else
-        p = search_terms(cclp, ap);
-    return p;
+    return search_terms(cclp, ap);
  }
  
  /**
@@ -936,9 +932,10 @@ static struct ccl_rpn_node *qualifier_list(CCL_parser cclp,
  static struct ccl_rpn_node *search_terms(CCL_parser cclp, ccl_qualifier_t *qa)
  {
      static int list[] = {
-        CCL_TOK_TERM, CCL_TOK_COMMA,CCL_TOK_EQ, CCL_TOK_REL, CCL_TOK_SET, -1};
+        CCL_TOK_TERM, CCL_TOK_COMMA,CCL_TOK_EQ,
+        CCL_TOK_REL, CCL_TOK_SET, -1};
      struct ccl_rpn_node *p1, *p2, *pn;
-    p1 = search_term_x(cclp, qa, list, 1);
+    p1 = search_terms2(cclp, qa);
      if (!p1)
          return NULL;
      while (1)
@@ -956,7 +953,7 @@ static struct ccl_rpn_node *search_terms(CCL_parser cclp, ccl_qualifier_t *qa)
              p_prox->u.t.attr_list = 0;
  
              ADVANCE;
-            p2 = search_term_x(cclp, qa, list, 1);
+            p2 = search_terms2(cclp, qa);
              if (!p2)
              {
                  ccl_rpn_delete(p1);
@@ -970,7 +967,7 @@ static struct ccl_rpn_node *search_terms(CCL_parser cclp, ccl_qualifier_t *qa)
          }
          else if (is_term_ok(KIND, list))
          {
-            p2 = search_term_x(cclp, qa, list, 1);
+            p2 = search_terms2(cclp, qa);
              if (!p2)
              {
                  ccl_rpn_delete(p1);
@@ -999,22 +996,7 @@ static struct ccl_rpn_node *search_elements(CCL_parser cclp,
  {
      struct ccl_rpn_node *p1;
      struct ccl_token *lookahead;
-    if (KIND == CCL_TOK_LP)
-    {
-        ADVANCE;
-        p1 = find_spec(cclp, qa);
-        if (!p1)
-            return NULL;
-        if (KIND != CCL_TOK_RP)
-        {
-            cclp->error_code = CCL_ERR_RP_EXPECTED;
-            ccl_rpn_delete(p1);
-            return NULL;
-        }
-        ADVANCE;
-        return p1;
-    }
-    else if (KIND == CCL_TOK_SET)
+    if (KIND == CCL_TOK_SET)
      {
          ADVANCE;
          if (KIND == CCL_TOK_EQ)
@@ -1040,7 +1022,7 @@ static struct ccl_rpn_node *search_elements(CCL_parser cclp,
              break;
          lookahead = lookahead->next;
      }
-    if (qa)
+    if (qa || lookahead->kind == CCL_TOK_LP)
          return search_terms(cclp, qa);
      else
      {
@@ -1204,9 +1186,11 @@ struct ccl_rpn_node *ccl_find_str(CCL_bibset bibset, const char *str,
      ccl_token_del(list);
      return p;
  }
+
  /*
   * Local variables:
   * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
   * indent-tabs-mode: nil
   * End:
   * vim: shiftwidth=4 tabstop=8 expandtab