ICU functionality for scan and snippets.

author Adam Dickmeiss <adam@indexdata.dk>

Thu, 13 Dec 2007 11:09:20 +0000 (11:09 +0000)

committer Adam Dickmeiss <adam@indexdata.dk>

Thu, 13 Dec 2007 11:09:20 +0000 (11:09 +0000)
author Adam Dickmeiss <adam@indexdata.dk>
Thu, 13 Dec 2007 11:09:20 +0000 (11:09 +0000)
committer Adam Dickmeiss <adam@indexdata.dk>
Thu, 13 Dec 2007 11:09:20 +0000 (11:09 +0000)
diff --git a/index/Makefile.am b/index/Makefile.am

index 7a218e6..b08296f 100644 (file)
--- a/index/Makefile.am
+++ b/index/Makefile.am
@@ -1,4 +1,4 @@
-## $Id: Makefile.am,v 1.68 2007-12-03 16:54:49 adam Exp $
+## $Id: Makefile.am,v 1.69 2007-12-13 11:09:20 adam Exp $
  
  aux_libs = \
   ../rset/libidzebra-rset.la \
@@ -95,7 +95,7 @@ libidzebra_2_0_la_SOURCES = \
    rank.h rank1.c ranksimilarity.c rankstatic.c \
    records.c recindex.c recindex.h reckeys.c reckeys.h \
    retrieve.c \
-  rpnscan.c rpnsearch.c rpnfacet.c sortidx.c stream.c \
+  rpnscan.c rpnsearch.c sortidx.c stream.c \
    update_path.c update_file.c trunc.c untrans.c isam_methods.c \
    zaptterm.c zebraapi.c zinfo.c zinfo.h zsets.c key_block.c key_block.h \
    check_res.c rset_isam.c
diff --git a/index/extract.c b/index/extract.c

index cc54d67..aed0650 100644 (file)
--- a/index/extract.c
+++ b/index/extract.c
@@ -1,4 +1,4 @@
-/* $Id: extract.c,v 1.272 2007-12-10 17:06:08 adam Exp $
+/* $Id: extract.c,v 1.273 2007-12-13 11:09:20 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -265,6 +265,26 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
  
  }
  
+static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
+{   /* Run zm's tokenizer over p's term buffer; append each token's display string to the snippet list under ord. */
+    struct snip_rec_info *h = p->extractCtrl->handle;
+    /* res_* receive the tokenizer's first output pair; this function never reads them */
+    const char *res_buf = 0;
+    size_t res_len = 0;
+    /* display_* receive the second output pair; this is what gets stored in the snippets */
+    const char *display_buf = 0;
+    size_t display_len = 0;
+    /* hand the whole term (term_buf, term_len) to the tokenizer, then drain it until it returns 0 */
+    zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
+    while (zebra_map_tokenize_next(zm, &res_buf, &res_len,
+                                   &display_buf, &display_len))
+    {
+        zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
+                               display_buf, display_len);
+        p->seqno++; /* advance the caller-visible sequence number once per token */
+    }
+}
+
  static void snippet_token_add(RecWord *p)
  {
      struct snip_rec_info *h = p->extractCtrl->handle;
@@ -277,10 +297,15 @@ static void snippet_token_add(RecWord *p)
          int ch = zebraExplain_lookup_attr_str(
              zei, zinfo_index_category_index, p->index_type, p->index_name);
  
-        if (zebra_maps_is_complete(zm))
-            snippet_add_complete_field(p, ch, zm);
+        if (zebra_maps_is_icu(zm))
+            snippet_add_icu(p, ch, zm);
          else
-            snippet_add_incomplete_field(p, ch, zm);
+        {
+            if (zebra_maps_is_complete(zm))
+                snippet_add_complete_field(p, ch, zm);
+            else
+                snippet_add_incomplete_field(p, ch, zm);
+        }
      }
  }
  
@@ -1456,7 +1481,7 @@ void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
  }
  
  
-ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh,
+ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
                                       zebra_rec_keys_t reckeys,
                                       zebra_snippets *snippets)
  {
@@ -1760,8 +1785,8 @@ static void extract_add_icu(RecWord *p, zebra_map_t zm)
      \param p token data to be indexed
  
      Call sequence:
-    extract_token
-    zebra_add_{in}_complete
+    extract_token_add
+    extract_add_{in}_complete
      extract_add_string
      
      extract_add_index_string
diff --git a/index/index.h b/index/index.h

index a123b30..8ef0835 100644 (file)
--- a/index/index.h
+++ b/index/index.h
@@ -1,4 +1,4 @@
-/* $Id: index.h,v 1.212 2007-12-03 13:04:04 adam Exp $
+/* $Id: index.h,v 1.213 2007-12-13 11:09:20 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -377,14 +377,14 @@ Dict dict_open_res(BFiles bfs, const char *name, int cache, int rw,
  void zebra_setError(ZebraHandle zh, int code, const char *addinfo);
  void zebra_setError_zint(ZebraHandle zh, int code, zint i);
  
-void zebra_term_untrans_iconv(ZebraHandle zh, NMEM stream, 
-                              const char *index_type,
-                             char **dst, const char *src);
+int zebra_term_untrans_iconv(ZebraHandle zh, NMEM stream, 
+                             const char *index_type,
+                             char **dst, const char *src);
  
  ZEBRA_RES zebra_get_hit_vector(ZebraHandle zh, const char *setname, zint sysno);
  
-void zebra_term_untrans(ZebraHandle zh, const char *index_type,
-                       char *dst, const char *src);
+int zebra_term_untrans(ZebraHandle zh, const char *index_type,
+                       char *dst, const char *src);
  
  ZEBRA_RES zebra_apt_get_ord(ZebraHandle zh,
                              Z_AttributesPlusTerm *zapt,
@@ -437,13 +437,6 @@ ZEBRA_RES zebra_term_limits_APT(ZebraHandle zh,
                                  const char **term_ref_id_str,
                                  NMEM nmem);
  
-ZEBRA_RES rpn_facet(ZebraHandle zh, ODR stream,
-                    Z_AttributesPlusTerm *zapt,
-                    const Odr_oid *attributeset,
-                    int *position, int *num_entries, 
-                    ZebraScanEntry **list,
-                    int *is_partial, const char *set_name);
-
  ZEBRA_RES zebra_result_recid_to_sysno(ZebraHandle zh, 
                                        const char *setname,
                                        zint recid,
diff --git a/index/rpnfacet.c b/index/rpnfacet.c

deleted file mode 100644 (file)

index 2bebbfd..0000000
--- a/index/rpnfacet.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/* $Id: rpnfacet.c,v 1.3 2007-11-05 11:20:39 adam Exp $
-   Copyright (C) 1995-2007
-   Index Data ApS
-
-This file is part of the Zebra server.
-
-Zebra is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free
-Software Foundation; either version 2, or (at your option) any later
-version.
-
-Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or
-FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-*/
-
-#include <stdio.h>
-#include <assert.h>
-#if HAVE_UNISTD_H
-#include <unistd.h>
-#endif
-#include <ctype.h>
-
-#include <yaz/diagbib1.h>
-#include "index.h"
-#include <zebra_xpath.h>
-#include <yaz/wrbuf.h>
-#include <attrfind.h>
-#include <charmap.h>
-#include <rset.h>
-#include <yaz/oid_db.h>
-
-ZEBRA_RES rpn_facet(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt,
-                    const Odr_oid *attributeset,
-                    int *position, int *num_entries, 
-                    ZebraScanEntry **list, int *is_partial, 
-                    const char *set_name)
-{
-    int ord;
-    int use_sort_idx = 1;
-    ZEBRA_RES res = zebra_attr_list_get_ord(zh,
-                                            zapt->attributes,
-                                            zinfo_index_category_sort,
-                                            0 /* index_type */,
-                                            attributeset, &ord);
-    if (res != ZEBRA_OK)
-        return res;
-    else if (use_sort_idx)
-    {
-        const char *index_type = 0;
-        const char *db = 0;
-        const char *string_index = 0;
-        /* for each ord .. */
-        /*   check that sort idx exist for ord */
-        /*   sweep through result set and sort_idx at the same time */
-        char *this_entry_buf = xmalloc(SORT_IDX_ENTRYSIZE);
-        char *dst_buf = xmalloc(SORT_IDX_ENTRYSIZE);
-        size_t sysno_mem_index = 0;
-        RSET rset = resultSetRef(zh, set_name);
-        zint p_this_sys = 0;
-        RSFD rfd;
-        TERMID termid;
-        struct it_key key;
-
-        if (zebraExplain_lookup_ord(zh->reg->zei,
-                                    ord, &index_type, &db, &string_index))
-        {
-            yaz_log(YLOG_WARN, "zebraExplain_lookup_ord failed");
-        }
-        
-        if (zh->m_staticrank)
-            sysno_mem_index = 1;
-        
-        rfd = rset_open(rset, RSETF_READ);
-        while (rset_read(rfd, &key, &termid))
-        {
-            zint sysno = key.mem[sysno_mem_index];
-            if (sysno != p_this_sys)
-            {
-                p_this_sys = sysno;
-                zebra_sort_sysno(zh->reg->sort_index, sysno);
-                zebra_sort_type(zh->reg->sort_index, ord);
-                zebra_sort_read(zh->reg->sort_index, this_entry_buf);
-
-                zebra_term_untrans(zh, index_type, dst_buf, this_entry_buf);
-                yaz_log(YLOG_LOG, "dst_buf=%s", dst_buf);
-            }
-        }
-        rset_close(rfd);
-        xfree(this_entry_buf);
-        xfree(dst_buf);
-        zebra_setError(zh, YAZ_BIB1_TEMPORARY_SYSTEM_ERROR, "facet not done1");
-        return ZEBRA_FAIL;
-    }
-    else
-    {
-        int num = 100; /* to be customizable */
-        int i;
-
-        ZebraMetaRecord *meta = zebra_meta_records_create_range(
-            zh, set_name, 0, num);
-
-        for (i = 0; i < num; i++)
-        {
-            zint sysno = meta[i].sysno;
-            Record rec = rec_get(zh->reg->records, sysno);
-            if (!rec)
-            {
-                yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT,
-                        sysno);
-                break;
-            }
-            else
-            {
-                
-
-                rec_free(&rec);
-            }
-        }
-        zebra_meta_records_destroy(zh, meta, num);
-        zebra_setError(zh, YAZ_BIB1_TEMPORARY_SYSTEM_ERROR, "facet not done2");
-        return ZEBRA_FAIL;
-    }
-}
-
-/*
- * Local variables:
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- * vim: shiftwidth=4 tabstop=8 expandtab
- */
-
diff --git a/index/rpnscan.c b/index/rpnscan.c

index 3312157..fa12bbd 100644 (file)
--- a/index/rpnscan.c
+++ b/index/rpnscan.c
@@ -1,4 +1,4 @@
-/* $Id: rpnscan.c,v 1.23 2007-12-03 11:49:11 adam Exp $
+/* $Id: rpnscan.c,v 1.24 2007-12-13 11:09:20 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -45,14 +45,28 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  static ZEBRA_RES trans_scan_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
                                  char *termz, zebra_map_t zm)
  {
-    char termz0[IT_MAX_WORD];
+    char term_utf8[IT_MAX_WORD];
  
-    if (zapt_term_to_utf8(zh, zapt, termz0) == ZEBRA_FAIL)
+    if (zapt_term_to_utf8(zh, zapt, term_utf8) == ZEBRA_FAIL)
          return ZEBRA_FAIL;    /* error */
+    else if (zebra_maps_is_icu(zm))
+    {
+        const char *res_buf;
+        size_t res_len;
+        zebra_map_tokenize_start(zm, term_utf8, strlen(term_utf8));
+        
+        if (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
+        {
+            memcpy(termz, res_buf, res_len);
+            termz[res_len] = '\0';
+        }
+        else
+            termz[0] = '\0';
+    }
      else
      {
          const char **map;
-        const char *cp = (const char *) termz0;
+        const char *cp = (const char *) term_utf8;
          const char *cp_end = cp + strlen(cp);
          const char *src;
          int i = 0;
@@ -218,8 +232,6 @@ static int scan_save_set(ZebraHandle zh, ODR stream, NMEM nmem,
          if (pos != -1)
          {
              zint sysno;
-            int code = -1;
-            zebra_snippets *rec_snippets = zebra_snippets_create();
              zebra_snippets *hit_snippets = zebra_snippets_create();
  
              glist[pos].term = 0;
@@ -227,22 +239,28 @@ static int scan_save_set(ZebraHandle zh, ODR stream, NMEM nmem,
              
              get_first_snippet_from_rset(zh, rset, hit_snippets, &sysno);
              if (sysno)
-                code = zebra_get_rec_snippets(zh, sysno, rec_snippets);
-         
-            if (code == 0)
              {
-                const struct zebra_snippet_word *w = 
-                    zebra_snippets_lookup(rec_snippets, hit_snippets);
-                if (w)
+                zebra_snippets *rec_snippets = zebra_snippets_create();
+                int code = zebra_get_rec_snippets(zh, sysno, rec_snippets);
+                if (code == 0)
                  {
-                    glist[pos].display_term = odr_strdup(stream, w->term);
+                    const struct zebra_snippet_word *w = 
+                        zebra_snippets_lookup(rec_snippets, hit_snippets);
+                    if (w)
+                    {
+                        glist[pos].display_term = odr_strdup(stream, w->term);
+                    }
                  }
+                zebra_snippets_destroy(rec_snippets);
              }
-            if (!glist[pos].term)
-                zebra_term_untrans_iconv(zh, stream->mem, index_type,
-                                         &glist[pos].term, term);
+            if (zebra_term_untrans_iconv(zh, stream->mem, index_type,
+                                         &glist[pos].term, term))
+            {
+                /* failed.. use display_term instead (which could be 0) */
+                glist[pos].term = glist[pos].display_term;
+            }
+
              glist[pos].occurrences = count;
-            zebra_snippets_destroy(rec_snippets);
              zebra_snippets_destroy(hit_snippets);
          }
          rset_delete(rset);
@@ -517,11 +535,6 @@ ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt,
         zebra_setError(zh, YAZ_BIB1_TOO_MANY_DATABASES_SPECIFIED, 0);
          return ZEBRA_FAIL;
      }
-    if (sort_flag)
-    {
-        return rpn_facet(zh, stream, zapt, attributeset, position, num_entries,
-                         list, is_partial, set_name);
-    }
      for (base_no = 0; base_no < num_bases; base_no++)
      {
         int ord;
diff --git a/index/untrans.c b/index/untrans.c

index 0551e3c..904e5e5 100644 (file)
--- a/index/untrans.c
+++ b/index/untrans.c
@@ -1,4 +1,4 @@
-/* $Id: untrans.c,v 1.5 2007-10-31 16:56:14 adam Exp $
+/* $Id: untrans.c,v 1.6 2007-12-13 11:09:20 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -28,35 +28,44 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  #include "index.h"
  #include <charmap.h>
  
-void zebra_term_untrans(ZebraHandle zh, const char *index_type,
-                       char *dst, const char *src)
+int zebra_term_untrans(ZebraHandle zh, const char *index_type,
+                       char *dst, const char *src)
  {
      zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, index_type);
-    int len = 0;
-    while (*src)
+    if (zebra_maps_is_icu(zm))
+        return -1;
+    else
      {
-        const char *cp = zebra_maps_output(zm, &src);
-       if (!cp)
-       {
-           if (len < IT_MAX_WORD-1)
-               dst[len++] = *src;
-           src++;
-       }
-        else
-            while (*cp && len < IT_MAX_WORD-1)
-                dst[len++] = *cp++;
+        int len = 0;
+        while (*src)
+        {
+            const char *cp = zebra_maps_output(zm, &src);
+            if (!cp)
+            {
+                if (len < IT_MAX_WORD-1)
+                    dst[len++] = *src;
+                src++;
+            }
+            else
+                while (*cp && len < IT_MAX_WORD-1)
+                    dst[len++] = *cp++;
+        }
+        dst[len] = '\0';
      }
-    dst[len] = '\0';
+    return 0;
  }
  
-void zebra_term_untrans_iconv(ZebraHandle zh, NMEM stream, 
-                              const char *index_type,
-                             char **dst, const char *src)
+int zebra_term_untrans_iconv(ZebraHandle zh, NMEM stream, 
+                             const char *index_type,
+                             char **dst, const char *src)
  {
      char term_src[IT_MAX_WORD];
      char term_dst[IT_MAX_WORD];
+    int r;
      
-    zebra_term_untrans (zh, index_type, term_src, src);
+    r = zebra_term_untrans (zh, index_type, term_src, src);
+    if (r)
+        return r;
  
      if (zh->iconv_from_utf8 != 0)
      {
@@ -83,6 +92,7 @@ void zebra_term_untrans_iconv(ZebraHandle zh, NMEM stream,
      }
      else
          *dst = nmem_strdup(stream, term_src);
+    return 0;
  }
  
  
diff --git a/test/api/t17.c b/test/api/t17.c

index 0c99133..b945f3d 100644 (file)
--- a/test/api/t17.c
+++ b/test/api/t17.c
@@ -1,4 +1,4 @@
-/* $Id: t17.c,v 1.8 2007-12-07 14:17:37 adam Exp $
+/* $Id: t17.c,v 1.9 2007-12-13 11:09:20 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -26,10 +26,17 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  #include <yaz/test.h>
  #include "testlib.h"
  
+#define char_ae "\xc3\xa6"
+#define char_AE "\xc3\x86"
+#define char_oslash "\xc3\xb8"
+#define char_Oslash "\xc3\x98"
+
  const char *myrec[] = {
          "<gils>\n<title>My computer</title>\n</gils>\n",
          "<gils>\n<title>My x computer</title>\n</gils>\n",
          "<gils>\n<title>My computer x</title>\n</gils>\n" ,
+
+        "<gils>\n<title>" char_ae "</title>\n</gils>\n" ,
         0} ;
         
  static void tst(int argc, char **argv)
@@ -51,6 +58,13 @@ static void tst(int argc, char **argv)
  
      YAZ_CHECK(tl_query(zh, "@attr 1=title my", 3));
  
+    YAZ_CHECK(tl_query(zh, "@attr 1=title mY", 3));
+
+    YAZ_CHECK(tl_query(zh, char_ae, 1));
+#if 0
+    YAZ_CHECK(tl_query(zh, char_AE, 1));
+#endif
+
      /* phrase search */
      YAZ_CHECK(tl_query(zh, "@attr 1=title {my computer}", 2));
      YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 6=1 {my computer}", 2));
@@ -59,7 +73,9 @@ static void tst(int argc, char **argv)
      /* complete-subfield search */
      YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 6=2 {my computer}", 1));
      YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 6=2 {my}", 0));
- 
+
+    /* scan */
+    
      YAZ_CHECK(tl_close_down(zh, zs));
  #endif
  }
diff --git a/test/api/t17.idx b/test/api/t17.idx

index 16e9985..922ddbe 100644 (file)
--- a/test/api/t17.idx
+++ b/test/api/t17.idx
@@ -1,5 +1,5 @@
  # Zebra indexes as referred to from the *.abs-files.
-#  $Id: t17.idx,v 1.5 2007-12-07 14:17:37 adam Exp $
+#  $Id: t17.idx,v 1.6 2007-12-13 11:09:20 adam Exp $
  #
  
  # Traditional word index
@@ -12,14 +12,15 @@ alwaysmatches 1
  firstinfield 1
  # simplechain dummy
  icuchain words-icu.xml
-debug 1
+# debug 1
  
  # Phrase index
  # Used if completeness is 'complete {sub}field' (@attr 6=2, @attr 6=1)
  # and structure is word/phrase/word-list/free-form-text/document-text
  index p
  completeness 1
-charmap phrases-icu.xml
+icuchain phrases-icu.xml
+# debug 1
  
  # Sort register
  sort s
diff --git a/win/makefile b/win/makefile

index 7ac8fde..fde5806 100644 (file)
--- a/win/makefile
+++ b/win/makefile
@@ -1,5 +1,5 @@
  # Zebra makefile for MS NMAKE
-# $Id: makefile,v 1.71 2007-12-03 17:16:48 adam Exp $
+# $Id: makefile,v 1.72 2007-12-13 11:09:20 adam Exp $
   
  ###########################################################
  ############### Parameters 
@@ -400,7 +400,6 @@ ZEBRALIB_OBJS= \
         $(OBJDIR)\regxread.obj \
         $(OBJDIR)\res.obj \
         $(OBJDIR)\retrieve.obj \
-       $(OBJDIR)\rpnfacet.obj \
         $(OBJDIR)\rpnscan.obj \
         $(OBJDIR)\rpnsearch.obj \
         $(OBJDIR)\rsbetween.obj \
author	Adam Dickmeiss <adam@indexdata.dk>
	Thu, 13 Dec 2007 11:09:20 +0000 (11:09 +0000)
committer	Adam Dickmeiss <adam@indexdata.dk>
	Thu, 13 Dec 2007 11:09:20 +0000 (11:09 +0000)
index/Makefile.am		patch \| blob \| history
index/extract.c		patch \| blob \| history
index/index.h		patch \| blob \| history
index/rpnfacet.c	[deleted file]	patch \| blob \| history
index/rpnscan.c		patch \| blob \| history
index/untrans.c		patch \| blob \| history
test/api/t17.c		patch \| blob \| history
test/api/t17.idx		patch \| blob \| history
win/makefile		patch \| blob \| history