added section on static and dynamic ranking

author Marc Cromme <marc@indexdata.dk>

Wed, 15 Feb 2006 12:08:47 +0000 (12:08 +0000)

committer Marc Cromme <marc@indexdata.dk>

Wed, 15 Feb 2006 12:08:47 +0000 (12:08 +0000)
author Marc Cromme <marc@indexdata.dk>
Wed, 15 Feb 2006 12:08:47 +0000 (12:08 +0000)
committer Marc Cromme <marc@indexdata.dk>
Wed, 15 Feb 2006 12:08:47 +0000 (12:08 +0000)
diff --git a/doc/Makefile.am b/doc/Makefile.am

index 622fbc7..5983f23 100644 (file)
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -1,4 +1,4 @@
-## $Id: Makefile.am,v 1.34 2006-02-15 11:07:47 marc Exp $
+## $Id: Makefile.am,v 1.35 2006-02-15 12:08:47 marc Exp $
  docdir=$(datadir)/doc/@PACKAGE@
  
  SUPPORTFILES = \
@@ -50,7 +50,7 @@ HTMLFILES =  administration.html \
     introduction.html \
     license.html \
     locating-records.html \
-   maincomponents.html \
+   architecture-maincomponents.html \
     protocol-support.html \
     quick-start.html \
     record-model-alvisxslt-conf.html \
diff --git a/doc/administration.xml b/doc/administration.xml

index b29790c..eee315e 100644 (file)
--- a/doc/administration.xml
+++ b/doc/administration.xml
@@ -1,5 +1,5 @@
  <chapter id="administration">
- <!-- $Id: administration.xml,v 1.22 2006-02-15 11:07:47 marc Exp $ -->
+ <!-- $Id: administration.xml,v 1.23 2006-02-15 12:08:47 marc Exp $ -->
   <title>Administrating Zebra</title>
   <!-- ### It's a bit daft that this chapter (which describes half of
            the configuration-file formats) is separated from
@@ -843,7 +843,6 @@
      
      <screen>
       register: /d1:500M
-     
       shadow: /scratch1:100M /scratch2:200M
      </screen>
      
@@ -921,8 +920,112 @@
    </sect2>
    
   </sect1>
+
+
+ <sect1 id="administration-ranking">
+  <title>Static and Dynamic Ranking</title>
+  
+   <para>
+    Zebra internally uses inverted indexes to look up term occurrences
+    in documents. Multiple queries from different indexes can be
+    combined by the binary boolean operations <literal>AND</literal>, 
+    <literal>OR</literal> and/or <literal>NOT</literal> (which
+    is in fact a binary <literal>AND NOT</literal> operation). 
+    To ensure fast query execution
+    speed, all indexes have to be sorted in the same order.
+   </para>
+   <para>
+    The indexes are normally sorted according to document 
+    <literal>ID</literal> in
+    ascending order, and any query which does not invoke a special
+    re-ranking function will therefore retrieve the result set in
+    document 
+    <literal>ID</literal>
+    order.
+   </para>
+   <para>
+    If one defines the 
+    <screen>
+    staticrank: 1 
+    </screen> 
+    directive in the main core Zebra config file, the internal document
+    keys used for ordering are augmented by a preceding integer, which
+    contains the static rank of a given document, and the index lists
+    are ordered 
+    first by ascending static rank,
+    then by ascending document <literal>ID</literal>.
+   </para>
+   <para>
+    This implies that the default rank <literal>0</literal> 
+    is the best rank at the
+    beginning of the list, and <literal>max int</literal> 
+    is the worst static rank.
+   </para>
+   <para>
+    The experimental <literal>alvis</literal> filter provides a
+    directive to fetch static rank information out of the indexed XML
+    records, thus making <emphasis>all</emphasis> hit sets ordered
+    after <emphasis>ascending</emphasis> static
+    rank, and for those documents which have the same static rank, ordered
+    after <emphasis>ascending</emphasis> doc <literal>ID</literal>.
+    See <xref linkend="record-model-alvisxslt"/> for the gory details.
+   </para>
+   <para>
+    If one wants to do a little fiddling with the static rank order,
+    one has to invoke additional re-ranking/re-ordering using dynamic 
+    reranking or score functions. These functions return positive
+    integer scores, where <emphasis>highest</emphasis> score is 
+    <emphasis>best</emphasis>, which means that the
+    hit sets will be sorted according to
+    <emphasis>descending</emphasis> 
+    scores (in contrast
+    to the index lists which are sorted according to
+    <emphasis>ascending</emphasis> rank number and document ID).
+   </para>
+   <!--
+   <para>
+    Those are defined in the zebra C source files 
+    <screen>     
+    "rank-1" : zebra/index/rank1.c  
+               default TF/IDF like zebra dynamic ranking
+    "rank-static" : zebra/index/rankstatic.c
+               do-nothing dummy static ranking (this is just to prove
+               that the static rank can be used in dynamic ranking functions)  
+     "zvrank" : zebra/index/zvrank.c
+               many different dynamic TF/IDF ranking functions 
+    </screen> 
+   </para>
+   -->
+   <para>
+    Those are in the zebra config file enabled by a directive like (use
+    only one of these at a time!):
+    <screen> 
+    rank: rank-1        # default
+    rank: rank-static   # dummy 
+    rank: zvrank        # TF-IDF like
+    </screen>
+    Notice that the <literal>rank-1</literal> and
+    <literal>zvrank</literal> do not use the static rank 
+    information in the list keys, and will produce the same ordering
+    with or without static ranking enabled.
+   </para>
+   <para>
+    The dummy <literal>rank-static</literal> reranking/scoring
+    function returns just 
+    <literal>score = max int - staticrank</literal>
+    in order to preserve the ordering of hit sets with and without its
+    call.
+     Obviously, to combine static and dynamic ranking usefully, one wants
+    to make a new ranking 
+    function, which is left
+    as an exercise for the reader. 
+   </para>
+   
+ </sect1>
+
   
  </chapter>
+
   <!-- Keep this comment at the end of the file
   Local variables:
   mode: sgml
diff --git a/doc/architecture.xml b/doc/architecture.xml

index 37afaee..0738242 100644 (file)
--- a/doc/architecture.xml
+++ b/doc/architecture.xml
@@ -1,9 +1,9 @@
   <chapter id="architecture">
-  <!-- $Id: architecture.xml,v 1.3 2006-02-15 11:07:47 marc Exp $ -->
+  <!-- $Id: architecture.xml,v 1.4 2006-02-15 12:08:48 marc Exp $ -->
    <title>Overview of Zebra Architecture</title>
    
  
-  <sect1 id="local-representation">
+  <sect1 id="architecture-representation">
     <title>Local Representation</title>
  
     <para>
@@ -32,7 +32,7 @@
     </para>
    </sect1>
  
-  <sect1 id="workflow">
+  <sect1 id="architecture-workflow">
     <title>Indexing and Retrieval Workflow</title>
  
    <para>
@@ -85,7 +85,7 @@
    </sect1>
  
  
-  <sect1 id="maincomponents">
+  <sect1 id="architecture-maincomponents">
     <title>Main Components</title>
     <para>
      The Zebra system is designed to support a wide range of data management
@@ -242,6 +242,7 @@ IDZebra filter grs.danbib (DBC DanBib records)
     <sect3 id="componentmodulesalvis">
      <title>ALVIS Record Model and Filter Module</title>
       <para>
+      <xref linkend="record-model-alvisxslt"/>
        - alvis          Experimental Alvis XSLT filter
        <literal>mod-alvis.so</literal>
        <literal>libidzebra1.4-mod-alvis</literal>
diff --git a/doc/zebra.xml.in b/doc/zebra.xml.in

index 0b39514..3d696cf 100644 (file)
--- a/doc/zebra.xml.in
+++ b/doc/zebra.xml.in
@@ -22,8 +22,8 @@
          <!ENTITY zebrasrv-virtual SYSTEM "zebrasrv-virtual.xml">
          <!ENTITY gfs-synopsis-app "zebrasrv">
  
-       <!ENTITY ref-local-representation '
-               <xref linkend="local-representation"/>'>
+       <!ENTITY ref-architecture-representation '
+               <xref linkend="architecture-representation"/>'>
         <!ENTITY ref-record-types '
                 <xref linkend="record-types"/>'>
         <!ENTITY ref-configuration-file '
@@ -31,7 +31,7 @@
         <!ENTITY ref-shadow-registers '
                 <xref linkend="shadow-registers"/>'>
  ]>
-<!-- $Id: zebra.xml.in,v 1.23 2006-02-15 11:07:47 marc Exp $ -->
+<!-- $Id: zebra.xml.in,v 1.24 2006-02-15 12:08:48 marc Exp $ -->
  <book id="zebra">
   <bookinfo>
    <title>Zebra - User's Guide and Reference</title>
diff --git a/doc/zebraidx-options.xml b/doc/zebraidx-options.xml

index 6d8f7b7..7a01825 100644 (file)
--- a/doc/zebraidx-options.xml
+++ b/doc/zebraidx-options.xml
@@ -1,5 +1,5 @@
  <!-- 
-   $Id: zebraidx-options.xml,v 1.3 2003-12-03 13:57:16 adam Exp $
+   $Id: zebraidx-options.xml,v 1.4 2006-02-15 12:08:48 marc Exp $
     Options for zebraidx.
     Included in both manual and man page for zebraidx
  -->
@@ -14,7 +14,7 @@
      <literal>grs</literal><replaceable>.subtype</replaceable>.
      If no <replaceable>subtype</replaceable> is provided for the GRS
      (General Record Structure) type, the canonical input format
-    is assumed (see &ref-local-representation;).
+    is assumed (see &ref-architecture-representation;).
      Generally, it is probably advisable to specify the record types
      in the <literal>zebra.cfg</literal> file (see
      &ref-record-types;), to avoid confusion at
author	Marc Cromme <marc@indexdata.dk>
	Wed, 15 Feb 2006 12:08:47 +0000 (12:08 +0000)
committer	Marc Cromme <marc@indexdata.dk>
	Wed, 15 Feb 2006 12:08:47 +0000 (12:08 +0000)
doc/Makefile.am		patch \| blob \| history
doc/administration.xml		patch \| blob \| history
doc/architecture.xml		patch \| blob \| history
doc/zebra.xml.in		patch \| blob \| history
doc/zebraidx-options.xml		patch \| blob \| history