Allow record filters to return 'skip this record' (RECCTRL_EXTRACT_SKIP).

author Adam Dickmeiss <adam@indexdata.dk>

Thu, 1 Mar 2007 10:35:46 +0000 (10:35 +0000)

committer Adam Dickmeiss <adam@indexdata.dk>

Thu, 1 Mar 2007 10:35:46 +0000 (10:35 +0000)
author Adam Dickmeiss <adam@indexdata.dk>
Thu, 1 Mar 2007 10:35:46 +0000 (10:35 +0000)
committer Adam Dickmeiss <adam@indexdata.dk>
Thu, 1 Mar 2007 10:35:46 +0000 (10:35 +0000)
diff --git a/include/idzebra/recctrl.h b/include/idzebra/recctrl.h

index 0ac6639..8a147ba 100644 (file)
--- a/include/idzebra/recctrl.h
+++ b/include/idzebra/recctrl.h
@@ -1,4 +1,4 @@
-/* $Id: recctrl.h,v 1.30 2007-01-15 20:08:24 adam Exp $
+/* $Id: recctrl.h,v 1.31 2007-03-01 10:35:46 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -147,6 +147,7 @@ struct recType
  #define RECCTRL_EXTRACT_EOF   1
  #define RECCTRL_EXTRACT_ERROR_GENERIC 2
  #define RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER 3
+#define RECCTRL_EXTRACT_SKIP  4
  
  typedef struct recTypeClass *RecTypeClass;
  typedef struct recTypes *RecTypes;
diff --git a/index/extract.c b/index/extract.c

index efc5c71..51d2b94 100644 (file)
--- a/index/extract.c
+++ b/index/extract.c
@@ -1,4 +1,4 @@
-/* $Id: extract.c,v 1.249 2007-02-06 09:34:56 adam Exp $
+/* $Id: extract.c,v 1.250 2007-03-01 10:35:46 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -576,38 +576,48 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
          extract_set_store_data_prepare(&extractCtrl);
          
          r = (*recType->extract)(recTypeClientData, &extractCtrl);
-        
-        if (r == RECCTRL_EXTRACT_EOF)
-            return ZEBRA_FAIL;
-        else if (r == RECCTRL_EXTRACT_ERROR_GENERIC)
+
+        switch (r)
          {
+        case RECCTRL_EXTRACT_EOF:
+            return ZEBRA_FAIL;
+        case RECCTRL_EXTRACT_ERROR_GENERIC:
              /* error occured during extraction ... */
              yaz_log (YLOG_WARN, "extract error: generic");
              return ZEBRA_FAIL;
-        }
-        else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER)
-        {
+        case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
              /* error occured during extraction ... */
              yaz_log (YLOG_WARN, "extract error: no such filter");
              return ZEBRA_FAIL;
+        case RECCTRL_EXTRACT_SKIP:
+            if (show_progress)
+                yaz_log (YLOG_LOG, "skip %s %s " ZINT_FORMAT,
+                         recordType, pr_fname, (zint) start_offset);
+            *more = 1;
+            
+            end_offset = stream->endf(stream, 0);
+            if (end_offset)
+                stream->seekf(stream, end_offset);
+
+            return ZEBRA_OK;
+        case RECCTRL_EXTRACT_OK:
+            break;
+        default:
+            yaz_log (YLOG_WARN, "extract error: unknown error: %d", r);
+            return ZEBRA_FAIL;
          }
-        
+        end_offset = stream->endf(stream, 0);
+        if (end_offset)
+            stream->seekf(stream, end_offset);
+        else
+            end_offset = stream->tellf(stream);
+
          all_matches_add(&extractCtrl);
          
          if (extractCtrl.match_criteria[0])
              match_criteria = extractCtrl.match_criteria;
-
-
-        end_offset = stream->endf(stream, 0);
-
-        if (!end_offset)
-            end_offset = stream->tellf(stream);
-        else
-            stream->seekf(stream, end_offset);
-
      }
  
-
      *more = 1;
      if (!sysno)
      {
diff --git a/index/mod_dom.c b/index/mod_dom.c

index d62c796..e65a3bf 100644 (file)
--- a/index/mod_dom.c
+++ b/index/mod_dom.c
@@ -1,4 +1,4 @@
-/* $Id: mod_dom.c,v 1.24 2007-02-28 16:46:19 marc Exp $
+/* $Id: mod_dom.c,v 1.25 2007-03-01 10:35:46 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -108,6 +108,7 @@ struct filter_info {
      struct filter_retrieve *retrieve_list;
      struct filter_input *input_list;
      struct filter_store *store;
+    int record_info_invoked;
  };
  
  
@@ -183,6 +184,7 @@ static void *filter_init(Res res, RecType recType)
      tinfo->input_list = 0;
      tinfo->store = 0;
      tinfo->doc_config = 0;
+    tinfo->record_info_invoked = 0;
  
  #if YAZ_HAVE_EXSLT
      exsltRegisterAll(); 
@@ -713,86 +715,102 @@ static void index_value_of(struct filter_info *tinfo,
                             xmlNodePtr node, 
                             xmlChar * index_p)
  {
-    xmlChar *text = xmlNodeGetContent(node);
-    size_t text_len = strlen((const char *)text);    
-
-    /* if there is no text, we do not need to proceed */
-    if (text_len)
-    {            
-        xmlChar *look = index_p;
-        xmlChar *bval;
-        xmlChar *eval;
-
-        xmlChar index[256];
-        xmlChar type[256];
+    if (tinfo->record_info_invoked == 1)
+    {
+        xmlChar *text = xmlNodeGetContent(node);
+        size_t text_len = strlen((const char *)text);
+        
+        yaz_log(YLOG_LOG, "Indexing :%.*s:", text_len, text);
+        
+        /* if there is no text, we do not need to proceed */
+        if (text_len)
+        {            
+            xmlChar *look = index_p;
+            xmlChar *bval;
+            xmlChar *eval;
+
+            xmlChar index[256];
+            xmlChar type[256];
  
-        /* assingning text to be indexed */
-        recword->term_buf = (const char *)text;
-        recword->term_len = text_len;
+            /* assingning text to be indexed */
+            recword->term_buf = (const char *)text;
+            recword->term_len = text_len;
  
-        /* parsing all index name/type pairs */
-        /* may not start with ' ' or ':' */
-        while (*look && ' ' != *look && ':' != *look)
-        {
-            /* setting name and type to zero */
-            *index = '\0';
-            *type = '\0';
-    
-            /* parsing one index name */
-            bval = look;
-            while (*look && ':' != *look && ' ' != *look)
+            /* parsing all index name/type pairs */
+            /* may not start with ' ' or ':' */
+            while (*look && ' ' != *look && ':' != *look)
              {
-                look++;
-            }
-            eval = look;
-            strncpy((char *)index, (const char *)bval, eval - bval);
-            index[eval - bval] = '\0';
+                /* setting name and type to zero */
+                *index = '\0';
+                *type = '\0';
      
-    
-            /* parsing one index type, if existing */
-            if (':' == *look)
-            {
-                look++;
-      
+                /* parsing one index name */
                  bval = look;
-                while (*look && ' ' != *look)
+                while (*look && ':' != *look && ' ' != *look)
                  {
                      look++;
                  }
                  eval = look;
-                strncpy((char *)type, (const char *)bval, eval - bval);
-                type[eval - bval] = '\0';
-            }
+                strncpy((char *)index, (const char *)bval, eval - bval);
+                index[eval - bval] = '\0';
+    
+    
+                /* parsing one index type, if existing */
+                if (':' == *look)
+                {
+                    look++;
+      
+                    bval = look;
+                    while (*look && ' ' != *look)
+                    {
+                        look++;
+                    }
+                    eval = look;
+                    strncpy((char *)type, (const char *)bval, eval - bval);
+                    type[eval - bval] = '\0';
+                }
  
-            /* writing debug out */
-            if (extctr->flagShowRecords)
-                dom_log(YLOG_LOG, tinfo, 0, 
+                /* actually indexing the text given */
+                dom_log(YLOG_DEBUG, tinfo, 0, 
                          "INDEX '%s:%s' '%s'", 
                          index ? (const char *) index : "null",
                          type ? (const char *) type : "null", 
                          text ? (const char *) text : "null");
  
-            /* actually indexing the text given */
-            recword->index_name = (const char *)index;
-            if (type && *type)
-                recword->index_type = *type;
-            (extctr->tokenAdd)(recword);
-
-            /* eat whitespaces */
-            if (*look && ' ' == *look && *(look+1))
-            {
-                look++;
-            } 
+                recword->index_name = (const char *)index;
+                if (type && *type)
+                    recword->index_type = *type;
+
+                /* writing debug out */
+                if (extctr->flagShowRecords)
+                    dom_log(YLOG_LOG, tinfo, 0, 
+                            "INDEX '%s:%s' '%s'", 
+                            index ? (const char *) index : "null",
+                            type ? (const char *) type : "null", 
+                            text ? (const char *) text : "null");
+                
+                /* actually indexing the text given */
+                recword->index_name = (const char *)index;
+                if (type && *type)
+                    recword->index_type = *type;
+                (extctr->tokenAdd)(recword);
+
+                /* eat whitespaces */
+                if (*look && ' ' == *look && *(look+1))
+                {
+                    look++;
+                } 
+            }
          }
+        xmlFree(text); 
      }
-    
-    xmlFree(text); 
  }
  
  
  /* DOM filter style indexing */
  static void set_record_info(struct filter_info *tinfo, 
                              struct recExtractCtrl *extctr, 
+                            xmlNodePtr node, 
                              xmlChar * id_p, 
                              xmlChar * rank_p, 
                              xmlChar * type_p)
@@ -820,6 +838,12 @@ static void set_record_info(struct filter_info *tinfo,
      /*     else */
      /*         dom_log(YLOG_WARN, tinfo, ptr, "dom filter: unknown record type '%s'",  */
      /*                 type_str); */
+    if (tinfo->record_info_invoked == 1)
+    {
+        /* warn about multiple only once */
+        dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
+    }
+    tinfo->record_info_invoked++;
  
  }
  
@@ -881,7 +905,7 @@ static void process_xml_element_zebra_node(struct filter_info *tinfo,
                              attr->name);
                  }
              }
-            set_record_info(tinfo, extctr, id_p, rank_p, type_p);
+            set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
          } 
          else
          {
@@ -965,7 +989,7 @@ static void process_xml_pi_node(struct filter_info *tinfo,
                          pi_p, look);
              }
              else 
-                set_record_info(tinfo, extctr, id, rank, 0);
+                set_record_info(tinfo, extctr, node, id, rank, 0);
  
          } 
          /* parsing index instruction */
@@ -1031,9 +1055,6 @@ static void extract_dom_doc_node(struct filter_info *tinfo,
                                   struct recExtractCtrl *extctr, 
                                   xmlDocPtr doc)
  {
-    xmlChar *buf_out;
-    int len_out;
-
      /* only need to do the initialization once, reuse recword for all terms */
      RecWord recword;
      (*extctr->init)(extctr, &recword);
@@ -1046,7 +1067,7 @@ static void extract_dom_doc_node(struct filter_info *tinfo,
          xmlFree(buf_out);
      }
      */
-
+    tinfo->record_info_invoked = 0;
      process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
  }
  
@@ -1104,6 +1125,8 @@ static int convert_extract_doc(struct filter_info *tinfo,
      if (doc)
         xmlFreeDoc(doc);
  
+    if (tinfo->record_info_invoked == 0)
+        return RECCTRL_EXTRACT_SKIP;
      return RECCTRL_EXTRACT_OK;
  }
  
diff --git a/test/xslt/dom-index-pi.xsl b/test/xslt/dom-index-pi.xsl

index 3f11dfd..8e0432f 100644 (file)
--- a/test/xslt/dom-index-pi.xsl
+++ b/test/xslt/dom-index-pi.xsl
@@ -2,7 +2,7 @@
    xmlns:m="http://www.loc.gov/MARC21/slim"
    exclude-result-prefixes="m"
    version="1.0">
-  <!-- $Id: dom-index-pi.xsl,v 1.1 2007-02-15 13:01:00 marc Exp $ -->
+  <!-- $Id: dom-index-pi.xsl,v 1.2 2007-03-01 10:35:46 adam Exp $ -->
    <xsl:output indent="yes" method="xml" version="1.0" encoding="UTF-8"/>
    
  
@@ -17,7 +17,7 @@
    <xsl:template match="/m:record">
      <xsl:processing-instruction name="zebra-2.0">
        <xsl:text>record id=</xsl:text>
-      <xsl:value-of select="normalize-space(m:controlfield[@tag='001'])"/>
+      <xsl:value-of select="translate(normalize-space(m:controlfield[@tag='001']),' ','_')"/>
        <xsl:text> rank=</xsl:text>
        <xsl:value-of select="normalize-space(m:rank)"/>
      </xsl:processing-instruction>
author	Adam Dickmeiss <adam@indexdata.dk>
	Thu, 1 Mar 2007 10:35:46 +0000 (10:35 +0000)
committer	Adam Dickmeiss <adam@indexdata.dk>
	Thu, 1 Mar 2007 10:35:46 +0000 (10:35 +0000)
include/idzebra/recctrl.h		patch \| blob \| history
index/extract.c		patch \| blob \| history
index/mod_dom.c		patch \| blob \| history
test/xslt/dom-index-pi.xsl		patch \| blob \| history