added initial DOM XML filter documentation. Much is missing yet ...

author Marc Cromme <marc@indexdata.dk>

Tue, 20 Feb 2007 14:28:31 +0000 (14:28 +0000)

committer Marc Cromme <marc@indexdata.dk>

Tue, 20 Feb 2007 14:28:31 +0000 (14:28 +0000)
author Marc Cromme <marc@indexdata.dk>
Tue, 20 Feb 2007 14:28:31 +0000 (14:28 +0000)
committer Marc Cromme <marc@indexdata.dk>
Tue, 20 Feb 2007 14:28:31 +0000 (14:28 +0000)
diff --git a/doc/Makefile.am b/doc/Makefile.am

index 5f6a183..001d575 100644 (file)
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -1,4 +1,4 @@
-## $Id: Makefile.am,v 1.63 2007-01-15 20:04:34 adam Exp $
+## $Id: Makefile.am,v 1.64 2007-02-20 14:28:31 marc Exp $
  docdir=$(datadir)/doc/$(PACKAGE)$(PACKAGE_SUFFIX)
  
  SUBDIRS = common
@@ -17,6 +17,7 @@ XMLFILES =  \
     marc_indexing.xml \
     querymodel.xml \
     quickstart.xml \
+   recordmodel-domxml.xml \
     recordmodel-alvisxslt.xml \
     recordmodel-grs.xml \
     manref.xml \
diff --git a/doc/architecture.xml b/doc/architecture.xml

index fd89051..cecd978 100644 (file)
--- a/doc/architecture.xml
+++ b/doc/architecture.xml
@@ -1,5 +1,5 @@
   <chapter id="architecture">
-  <!-- $Id: architecture.xml,v 1.20 2007-02-02 11:10:08 marc Exp $ -->
+  <!-- $Id: architecture.xml,v 1.21 2007-02-20 14:28:31 marc Exp $ -->
    <title>Overview of &zebra; Architecture</title>
  
    <section id="architecture-representation">
@@ -207,9 +207,64 @@
       modules. 
      </para>
  
+   <section id="componentmodulesdom">
+    <title>&dom; &xml; Record Model and Filter Module</title>
+     <para>
+      The &dom; &xml; filter uses a standard &dom; &xml; structure as
+      internal data model, and can thus parse, index, and display 
+      any &xml; document.
+    </para>
+    <para>
+      A parser for binary &marc; records based on the ISO2709 library
+      standard is provided; it transforms these to the internal
+      &marcxml; &dom; representation.  
+    </para>
+    <para>
+      The internal &dom; &xml; representation can be fed into four
+      different pipelines, consisting of arbitrarily many successive
+      &xslt; transformations; these are for  
+     <itemizedlist>
+       <listitem><para>input parsing and initial
+          transformations,</para></listitem>
+       <listitem><para>indexing term extraction
+          transformations</para></listitem>
+       <listitem><para>transformations before internal document
+          storage, and </para></listitem>
+       <listitem><para>retrieve transformations from storage to output
+          format</para></listitem>
+      </itemizedlist>
+    </para>
+    <para>
+      The &dom; &xml; filter pipelines use &xslt; (and, if supported on
+      your platform, even &exslt;), and thus bring full &xpath;
+      support to the indexing, storage and display rules of not only
+      &xml; documents, but also binary &marc; records.
+    </para>
+    <para>
+      Finally, the &dom; &xml; filter allows for static ranking at index
+      time, and for sorting hit lists according to predefined
+      static ranks.
+    </para>
+    <para>
+      Details on the experimental &dom; &xml; filter are found in 
+      <xref linkend="record-model-domxml"/>.
+      </para>
+     <para>
+      The Debian package <literal>libidzebra-2.0-mod-dom</literal>
+      contains the &dom; filter module.
+     </para>
+    </section>
  
     <section id="componentmodulesalvis">
      <title>ALVIS &xml; Record Model and Filter Module</title>
+     <note>
+      <para>
+        The functionality of this record model has been improved and
+        replaced by the &dom; &xml; record model. See 
+        <xref linkend="componentmodulesdom"/>.
+      </para>
+     </note>
+
       <para>
        The Alvis filter for &xml; files is an &xslt; based input
        filter. 
@@ -252,6 +307,13 @@
  
     <section id="componentmodulesgrs">
      <title>&grs1; Record Model and Filter Modules</title>
+     <note>
+      <para>
+        The functionality of this record model has been improved and
+        replaced by the &dom; &xml; record model. See 
+        <xref linkend="componentmodulesdom"/>.
+      </para>
+     </note>
      <para>
      The &grs1; filter modules described in 
      <xref linkend="grs"/>
diff --git a/doc/entities.ent b/doc/entities.ent

index 4c82c41..f2150fa 100644 (file)
--- a/doc/entities.ent
+++ b/doc/entities.ent
@@ -1,4 +1,4 @@
-<!-- $Id: entities.ent,v 1.4 2006-09-05 12:01:31 adam Exp $ -->
+<!-- $Id: entities.ent,v 1.5 2007-02-20 14:28:31 marc Exp $ -->
  <!ENTITY chap-introduction SYSTEM "introduction.xml">
  <!ENTITY chap-installation SYSTEM "installation.xml">
  <!ENTITY chap-quickstart SYSTEM "quickstart.xml">
@@ -7,8 +7,9 @@
  <!ENTITY chap-administration SYSTEM "administration.xml">
  <!ENTITY chap-querymodel SYSTEM "querymodel.xml">
  <!ENTITY chap-zebraidx SYSTEM "zebraidx.xml">
-<!ENTITY chap-recordmodel-grs SYSTEM "recordmodel-grs.xml">
  <!ENTITY chap-recordmodel-alvisxslt SYSTEM "recordmodel-alvisxslt.xml">
+<!ENTITY chap-recordmodel-domxml SYSTEM "recordmodel-domxml.xml">
+<!ENTITY chap-recordmodel-grs SYSTEM "recordmodel-grs.xml">
  <!ENTITY chap-field-structure SYSTEM "field-structure.xml">
  <!ENTITY app-license SYSTEM "license.xml">
  <!ENTITY app-indexdata SYSTEM "indexdata.xml">
diff --git a/doc/recordmodel-alvisxslt.xml b/doc/recordmodel-alvisxslt.xml

index 93ce649..8eee9b9 100644 (file)
--- a/doc/recordmodel-alvisxslt.xml
+++ b/doc/recordmodel-alvisxslt.xml
@@ -1,15 +1,20 @@
   <chapter id="record-model-alvisxslt">
-  <!-- $Id: recordmodel-alvisxslt.xml,v 1.15 2007-02-02 11:10:08 marc Exp $ -->
+  <!-- $Id: recordmodel-alvisxslt.xml,v 1.16 2007-02-20 14:28:31 marc Exp $ -->
    <title>ALVIS &xml; Record Model and Filter Module</title>
-  
+
+     <note>
+      <para>
+        The functionality of this record model has been improved and
+        replaced by the &dom; &xml; record model. See 
+        <xref linkend="record-model-domxml"/>.
+      </para>
+     </note>  
  
    <para>
     The record model described in this chapter applies to the fundamental,
     structured &xml;
     record type <literal>alvis</literal>, introduced in
-   <xref linkend="componentmodulesalvis"/>. The ALVIS &xml; record model
-   is experimental, and it's inner workings might change in future
-   releases of the &zebra; Information Server.
+   <xref linkend="componentmodulesalvis"/>.
    </para>
  
    <para> This filter has been developed under the 
diff --git a/doc/recordmodel-domxml.xml b/doc/recordmodel-domxml.xml

new file mode 100644 (file)

index 0000000..201299a
--- /dev/null
+++ b/doc/recordmodel-domxml.xml
@@ -0,0 +1,621 @@
+<chapter id="record-model-domxml">
+  <!-- $Id: recordmodel-domxml.xml,v 1.1 2007-02-20 14:28:31 marc Exp $ -->
+  <title>&dom; &xml; Record Model and Filter Module</title>
+
+  <para>
+   The record model described in this chapter applies to the fundamental,
+   structured &xml;
+   record type <literal>dom</literal>, introduced in
+   <xref linkend="componentmodulesdom"/>. The &dom; &xml; record model
+   is experimental, and its inner workings might change in future
+   releases of the &zebra; Information Server.
+  </para>
+
+  
+  
+  <section id="record-model-domxml-filter">
+   <title>&dom; Record Filter</title>
+
+     <para>
+      The &dom; &xml; filter uses a standard &dom; &xml; structure as
+      internal data model, and can therefore parse, index, and display 
+      any &xml; document type. It is well suited to work on 
+      standardized &xml;-based formats such as Dublin Core, MODS, METS,
+      MARCXML, OAI-PMH, RSS, and performs equally  well on any other
+      non-standard &xml; format. 
+    </para>
+    <para>
+      A parser for binary &marc; records based on the ISO2709 library
+      standard is provided; it transforms these to the internal
+      &marcxml; &dom; representation. Other binary document parsers
+      are planned to follow.  
+    </para>
+   </section>
+
+
+   <section id="record-model-domxml-architecture">
+    <title>&dom; &xml; filter architecture</title>   
+
+    <para>
+      The internal &dom; &xml; representation can be fed into four
+      different pipelines, consisting of arbitrarily many successive
+      &xslt; transformations.
+    </para>
+
+    <table id="record-model-domxml-architecture-table" frame="top">
+      <title>&dom; &xml; filter pipelines overview</title>
+      <tgroup cols="5">
+       <thead>
+        <row>
+         <entry>Name</entry>
+         <entry>When</entry>
+         <entry>Description</entry>
+         <entry>Input</entry>
+         <entry>Output</entry>
+        </row>
+       </thead>
+       
+       <tbody>
+        <row>
+         <entry><literal>input</literal></entry>
+         <entry>first</entry>
+         <entry>input parsing and initial
+          transformations to common &xml; format</entry>
+         <entry>raw &xml; record buffers, &xml;  streams and 
+                binary &marc; buffers</entry>
+         <entry>single &dom; &xml; documents suitable for indexing and
+                internal storage</entry>
+        </row>
+        <row>
+         <entry><literal>extract</literal></entry>
+         <entry>second</entry>
+         <entry>indexing term extraction
+          transformations</entry>
+         <entry>common single &dom; &xml; format</entry>
+         <entry>&zebra; internal indexing &dom; &xml; document</entry>
+        </row>
+        <row>
+         <entry><literal>store</literal></entry>
+         <entry>second</entry>
+         <entry> transformations before internal document
+          storage</entry>
+         <entry>common single &dom; &xml; format</entry>
+         <entry>&zebra; internal storage &dom; &xml; document</entry>
+        </row>
+        <row>
+         <entry><literal>retrieve</literal></entry>
+         <entry>third</entry>
+         <entry>document retrieve transformations from storage to output
+          syntax and format</entry>
+         <entry>&zebra; internal storage &dom; &xml; document</entry>
+         <entry>requested output syntax and format</entry>
+        </row>
+       </tbody>
+      </tgroup>
+     </table>
+
+    <para>
+      The &dom; &xml; filter pipelines use &xslt; (and, if supported on
+      your platform, even &exslt;), and thus bring full &xpath;
+      support to the indexing, storage and display rules of not only
+      &xml; documents, but also binary &marc; records.
+    </para>
+   </section>
+
+
+   <section id="record-model-domxml-pipeline">
+    <title>&dom; &xml; filter pipeline configuration</title>   
+
+   <para>
+    The experimental, loadable  &dom; &xml;/&xslt; filter module
+   <literal>mod-dom.so</literal> is packaged in the GNU/Debian package
+    <literal>libidzebra-2.0-mod-dom</literal>.
+    It is invoked by the <filename>zebra.cfg</filename> configuration statement
+    <screen>
+     recordtype.xml: dom.db/filter_dom_conf.xml
+    </screen>
+    In this example, the filter is applied to all data files with suffix 
+    <filename>*.xml</filename>, and the
+    &dom; &xslt; filter configuration file is found in the
+    path <filename>db/filter_dom_conf.xml</filename>.
+   </para>
+
+
+
+
+
+   <para>The &dom; &xslt; filter configuration file must be
+    valid &xml;. It might look like this (This example is
+    used for indexing and display of &oai; harvested records):
+    <screen>
+    &lt;?xml version="1.0" encoding="UTF-8"?&gt;
+      &lt;schemaInfo&gt;
+        &lt;schema name="identity" stylesheet="xsl/identity.xsl" /&gt;
+        &lt;schema name="index" identifier="http://indexdata.dk/zebra/xslt/1"
+            stylesheet="xsl/oai2index.xsl" /&gt;
+        &lt;schema name="dc" stylesheet="xsl/oai2dc.xsl" /&gt;
+        &lt;!-- use split level 2 when indexing whole &oai; Record lists --&gt;
+        &lt;split level="2"/&gt;
+      &lt;/schemaInfo&gt;
+    </screen> 
+   </para>
+   <para>
+    All named stylesheets defined inside
+    <literal>schema</literal> element tags 
+    are for presentation after search, including
+    the indexing stylesheet (which is a great debugging help). The
+    names defined in the <literal>name</literal> attributes must be
+    unique, these are the literal <literal>schema</literal> or 
+    <literal>element set</literal> names used in 
+      <ulink url="http://www.loc.gov/standards/sru/srw/">&srw;</ulink>,
+      <ulink url="&url.sru;">&sru;</ulink> and
+    &z3950; protocol queries.
+    The paths in the <literal>stylesheet</literal> attributes
+    are relative to &zebra;'s working directory, or absolute to file
+    system root.
+   </para>
+   <para>
+    The <literal>&lt;split level="2"/&gt;</literal> decides where the
+    &xml; Reader shall split the
+    collections of records into individual records, which then are
+    loaded into &dom;, and have the indexing &xslt; stylesheet applied.
+   </para>
+   <para>
+    There must be exactly one indexing &xslt; stylesheet, which is
+    defined by the magic attribute  
+    <literal>identifier="http://indexdata.dk/zebra/xslt/1"</literal>.
+   </para>
+
+   <section id="record-model-domxml-internal">
+    <title>&dom; Internal Record Representation</title>   
+    <para>When indexing, an &xml; Reader is invoked to split the input
+    files into suitable record &xml; pieces. Each record piece is then
+    transformed to an &xml; &dom; structure, which is essentially the
+    record model. Only &xslt; transformations can be applied during
+    index, search and retrieval. Consequently, output formats are
+    restricted to whatever &xslt; can deliver from the record &xml;
+    structure, be it other &xml; formats, HTML, or plain text. In case
+    you have <literal>libxslt1</literal> running with &exslt; support,
+    you can use this functionality inside the &dom;
+    filter configuration &xslt; stylesheets.
+    </para>
+   </section>
+
+   <section id="record-model-domxml-canonical">
+    <title>&dom; Canonical Indexing Format</title>   
+    <para>The output of the indexing &xslt; stylesheets must contain
+    certain elements in the magic 
+     <literal>xmlns:z="http://indexdata.dk/zebra/xslt/1"</literal>
+    namespace. The output of the &xslt; indexing transformation is then
+    parsed using &dom; methods, and the contained instructions are
+    performed on the <emphasis>magic elements and their
+    subtrees</emphasis>.
+    </para>
+    <para>
+    For example, the output of the command
+     <screen>  
+      xsltproc xsl/oai2index.xsl one-record.xml
+     </screen> 
+     might look like this:
+     <screen>
+      &lt;?xml version="1.0" encoding="UTF-8"?&gt;
+      &lt;z:record xmlns:z="http://indexdata.dk/zebra/xslt/1" 
+           z:id="oai:JTRS:CP-3290---Volume-I" 
+           z:rank="47896"
+           z:type="update"&gt;
+       &lt;z:index name="oai_identifier" type="0"&gt;
+                oai:JTRS:CP-3290---Volume-I&lt;/z:index&gt;
+       &lt;z:index name="oai_datestamp" type="0"&gt;2004-07-09&lt;/z:index&gt;
+       &lt;z:index name="oai_setspec" type="0"&gt;jtrs&lt;/z:index&gt;
+       &lt;z:index name="dc_all" type="w"&gt;
+          &lt;z:index name="dc_title" type="w"&gt;Proceedings of the 4th 
+                International Conference and Exhibition:
+                World Congress on Superconductivity - Volume I&lt;/z:index&gt;
+          &lt;z:index name="dc_creator" type="w"&gt;Kumar Krishen and *Calvin
+                Burnham, Editors&lt;/z:index&gt;
+       &lt;/z:index&gt;
+     &lt;/z:record&gt;
+     </screen>
+    </para>
+    <para>This means the following: From the original &xml; file 
+     <literal>one-record.xml</literal> (or from the &xml; record &dom; of the
+     same form coming from a split input file), the indexing
+     stylesheet produces an indexing &xml; record, which is defined by
+     the <literal>record</literal> element in the magic namespace
+     <literal>xmlns:z="http://indexdata.dk/zebra/xslt/1"</literal>.
+     &zebra; uses the content of 
+     <literal>z:id="oai:JTRS:CP-3290---Volume-I"</literal> as internal
+     record ID, and - in case static ranking is set - the content of 
+     <literal>z:rank="47896"</literal> as static rank. Following the
+     discussion in <xref linkend="administration-ranking"/>
+     we see that this record is internally ordered
+     lexicographically according to the value of the string
+     <literal>oai:JTRS:CP-3290---Volume-I47896</literal>.
+     The type of action performed during indexing is defined by
+     <literal>z:type="update"</literal>, with recognized values
+     <literal>insert</literal>, <literal>update</literal>, and 
+     <literal>delete</literal>. 
+    </para>
+    <para>In this example, the following literal indexes are constructed:
+     <screen>
+       oai_identifier
+       oai_datestamp
+       oai_setspec
+       dc_all
+       dc_title
+       dc_creator
+     </screen>
+     where the indexing type is defined in the 
+     <literal>type</literal> attribute 
+     (any value from the standard configuration
+     file <filename>default.idx</filename> will do). Finally, any 
+     <literal>text()</literal> node content recursively contained
+     inside the <literal>index</literal> will be filtered through the
+     appropriate charmap for character normalization, and will be
+     inserted in the index.
+    </para>
+    <para>
+     Specific to this example, we see that the single word
+     <literal>oai:JTRS:CP-3290---Volume-I</literal> will be literally,
+     byte for byte without any form of character normalization,
+     inserted into the index named <literal>oai_identifier</literal>,
+     the text 
+     <literal>Kumar Krishen and *Calvin Burnham, Editors</literal>
+     will be inserted using the <literal>w</literal> character
+     normalization defined in <filename>default.idx</filename> into
+     the index <literal>dc_creator</literal> (that is, after character
+     normalization the index will keep the individual words 
+     <literal>kumar</literal>, <literal>krishen</literal>, 
+     <literal>and</literal>, <literal>calvin</literal>,
+     <literal>burnham</literal>, and <literal>editors</literal>), and
+     finally both the texts
+     <literal>Proceedings of the 4th International Conference and Exhibition:
+      World Congress on Superconductivity - Volume I</literal> 
+     and
+     <literal>Kumar Krishen and *Calvin Burnham, Editors</literal> 
+     will be inserted into the index <literal>dc_all</literal> using
+     the same character normalization map <literal>w</literal>. 
+    </para>
+    <para>
+     Finally, this example configuration can be queried using &pqf;
+     queries, either transported by &z3950;, (here using a yaz-client) 
+     <screen>
+      <![CDATA[
+      Z> open localhost:9999
+      Z> elem dc
+      Z> form xml
+      Z>
+      Z> f @attr 1=dc_creator Kumar
+      Z> scan @attr 1=dc_creator adam
+      Z>
+      Z> f @attr 1=dc_title @attr 4=2 "proceeding congress superconductivity"
+      Z> scan @attr 1=dc_title abc
+      ]]>
+     </screen>
+     or the proprietary
+     extensions <literal>x-pquery</literal> and
+     <literal>x-pScanClause</literal> to
+     &sru;, and &srw;
+     <screen>
+      <![CDATA[
+      http://localhost:9999/?version=1.1&operation=searchRetrieve&x-pquery=%40attr+1%3Ddc_creator+%40attr+4%3D6+%22the
+      http://localhost:9999/?version=1.1&operation=scan&x-pScanClause=@attr+1=dc_date+@attr+4=2+a
+      ]]>
+     </screen>
+     See <xref linkend="zebrasrv-sru"/> for more information on &sru;/&srw;
+     configuration, and <xref linkend="gfs-config"/> or the &yaz;
+     <ulink url="&url.yaz.cql;">&cql; section</ulink>
+     for the details of the &yaz; frontend server.
+    </para>
+    <para>
+     Notice that there are no <filename>*.abs</filename>,
+     <filename>*.est</filename>, <filename>*.map</filename>, or other &grs1;
+     filter configuration files involved in this process, and that the
+     literal index names are used during search and retrieval.
+    </para>
+   </section>
+  </section>
+
+
+  <section id="record-model-domxml-conf">
+   <title>&dom; Record Model Configuration</title>
+
+
+  <section id="record-model-domxml-index">
+   <title>&dom; Indexing Configuration</title>
+    <para>
+     As mentioned above, there can be only one indexing
+     stylesheet, and configuration of the indexing process is synonymous
+     with writing an &xslt; stylesheet which produces &xml; output containing the
+     magic elements discussed in  
+     <xref linkend="record-model-domxml-canonical"/>. 
+     Obviously, there are millions of different ways to accomplish this
+     task, and some comments and code snippets are in order to lead
+     our padawans on the right track to the good side of the force.
+    </para>
+    <para>
+     Stylesheets can be written in the <emphasis>pull</emphasis> or
+     the <emphasis>push</emphasis> style: <emphasis>pull</emphasis>
+     means that the output &xml; structure is taken as starting point of
+     the internal structure of the &xslt; stylesheet, and portions of
+     the input &xml; are <emphasis>pulled</emphasis> out and inserted
+     into the right spots of the output &xml; structure. On the other
+     side, <emphasis>push</emphasis> &xslt; stylesheets are recursively
+     calling their template definitions, a process which is commanded
+     by the input &xml; structure, and are triggered to produce some output &xml;
+     whenever some special conditions in the input stylesheets are
+     met. The <emphasis>pull</emphasis> type is well-suited for input
+     &xml; with strong and well-defined structure and semantics, like the
+     following &oai; indexing example, whereas the
+     <emphasis>push</emphasis> type might be the only possible way to
+     sort out deeply recursive input &xml; formats.
+    </para>
+    <para> 
+     A <emphasis>pull</emphasis> stylesheet example used to index
+     &oai; harvested records could use some of the following template
+     definitions:
+     <screen>
+      <![CDATA[
+      <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+       xmlns:z="http://indexdata.dk/zebra/xslt/1"
+       xmlns:oai="http://www.openarchives.org/OAI/2.0/" 
+       xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" 
+       xmlns:dc="http://purl.org/dc/elements/1.1/"
+       version="1.0">
+
+       <xsl:output indent="yes" method="xml" version="1.0" encoding="UTF-8"/>
+
+        <!-- disable all default text node output -->
+        <xsl:template match="text()"/>
+
+         <!-- match on oai xml record root -->
+         <xsl:template match="/">    
+          <z:record z:id="{normalize-space(oai:record/oai:header/oai:identifier)}" 
+           z:type="update">
+           <!-- you might want to use z:rank="{some XSLT function here}" --> 
+           <xsl:apply-templates/>
+          </z:record>
+         </xsl:template>
+
+         <!-- OAI indexing templates -->
+         <xsl:template match="oai:record/oai:header/oai:identifier">
+          <z:index name="oai_identifier" type="0">
+           <xsl:value-of select="."/>
+          </z:index>    
+         </xsl:template>
+
+         <!-- etc, etc -->
+
+         <!-- DC specific indexing templates -->
+         <xsl:template match="oai:record/oai:metadata/oai_dc:dc/dc:title">
+          <z:index name="dc_title" type="w">
+           <xsl:value-of select="."/>
+          </z:index>
+         </xsl:template>
+
+         <!-- etc, etc -->
+ 
+      </xsl:stylesheet>
+      ]]>
+     </screen>
+    </para>
+    <para>
+     Notice also,
+     that the names and types of the indexes can be defined in the
+     indexing &xslt; stylesheet <emphasis>dynamically according to
+     content in the original &xml; records</emphasis>, which has
+     opportunities for great power and wizardry as well as grand
+     disaster.  
+    </para>
+    <para>
+     The following excerpt of a <emphasis>push</emphasis> stylesheet
+     <emphasis>might</emphasis> 
+     be a good idea according to your strict control of the &xml;
+     input format (due to rigorous checking against well-defined and
+     tight RelaxNG or &xml; Schemas, for example):
+     <screen>
+      <![CDATA[
+      <xsl:template name="element-name-indexes">     
+       <z:index name="{name()}" type="w">
+        <xsl:value-of select="'1'"/>
+       </z:index>
+      </xsl:template>
+      ]]>
+     </screen>
+     This template creates indexes which have the name of the working 
+     node of any input  &xml; file, and assigns a '1' to the index.
+     The example query 
+     <literal>find @attr 1=xyz 1</literal> 
+     finds all files which contain at least one
+     <literal>xyz</literal> &xml; element. In case you can not control
+     which element names the input files contain, you might ask for
+     disaster and bad karma using this technique.
+    </para>
+    <para>
+     One variation on the theme <emphasis>dynamically created
+     indexes</emphasis> will definitely be unwise:
+     <screen>
+      <![CDATA[  
+      <!-- match on oai xml record root -->
+      <xsl:template match="/">    
+       <z:record z:type="update">
+      
+        <!-- create dynamic index name from input content --> 
+        <xsl:variable name="dynamic_content">
+         <xsl:value-of select="oai:record/oai:header/oai:identifier"/>
+        </xsl:variable>
+        
+        <!-- create zillions of indexes with unknown names -->
+        <z:index name="{$dynamic_content}" type="w">
+         <xsl:value-of select="oai:record/oai:metadata/oai_dc:dc"/>
+        </z:index>          
+       </z:record>
+       
+      </xsl:template>
+      ]]>
+     </screen>
+     Don't be tempted to cross
+     the line to the dark side of the force, padawan; this leads
+     to suffering and pain, and universal
+     disintegration of your project schedule.
+    </para>
+  </section>
+
+  <section id="record-model-domxml-elementset">
+   <title>&dom; Exchange Formats</title>
+   <para>
+     An exchange format can be anything which can be the outcome of an
+     &xslt; transformation, as far as the stylesheet is registered in
+     the main &dom; &xslt; filter configuration file, see
+     <xref linkend="record-model-domxml-filter"/>.
+     In principle anything that can be expressed in  &xml;, HTML, and
+     TEXT can be the output of a <literal>schema</literal> or 
+    <literal>element set</literal> directive during search, as long as
+     the information comes from the 
+     <emphasis>original input record &xml; &dom; tree</emphasis>
+     (and not the transformed and <emphasis>indexed</emphasis> &xml;!!).
+    </para>
+    <para>
+     In addition, internal administrative information from the &zebra;
+     indexer can be accessed during record retrieval. The following
+     example is a summary of the possibilities:
+     <screen>
+      <![CDATA[  
+      <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+       xmlns:z="http://indexdata.dk/zebra/xslt/1"
+       version="1.0">
+
+       <!-- register internal zebra parameters -->       
+       <xsl:param name="id" select="''"/>
+       <xsl:param name="filename" select="''"/>
+       <xsl:param name="score" select="''"/>
+       <xsl:param name="schema" select="''"/>
+           
+       <xsl:output indent="yes" method="xml" version="1.0" encoding="UTF-8"/>
+
+       <!-- use them for display of internal information -->
+       <xsl:template match="/">
+         <z:zebra>
+           <id><xsl:value-of select="$id"/></id>
+           <filename><xsl:value-of select="$filename"/></filename>
+           <score><xsl:value-of select="$score"/></score>
+           <schema><xsl:value-of select="$schema"/></schema>
+         </z:zebra>
+       </xsl:template>
+
+      </xsl:stylesheet>
+      ]]>
+     </screen>
+    </para>
+
+  </section>
+
+  <section id="record-model-domxml-example">
+   <title>&dom; Filter &oai; Indexing Example</title>
+   <para>
+     The source code tarball contains a working &dom; filter example in
+     the directory <filename>examples/dom-oai/</filename>, which
+     should get you started.  
+    </para>
+    <para>
+     More example data can be harvested from any &oai; compliant server,
+     see details at the  &oai; 
+     <ulink url="http://www.openarchives.org/">
+      http://www.openarchives.org/</ulink> web site, and the community
+      links at 
+     <ulink url="http://www.openarchives.org/community/index.html">
+      http://www.openarchives.org/community/index.html</ulink>.
+     There is a  tutorial
+     found at
+     <ulink url="http://www.oaforum.org/tutorial/">
+      http://www.oaforum.org/tutorial/</ulink>.
+    </para>
+   </section>
+
+  </section>
+
+  
+ </chapter>
+
+
+<!--
+
+c)  Main "dom" &xslt; filter config file:
+  cat db/filter_dom_conf.xml 
+
+  <?xml version="1.0" encoding="UTF8"?>
+  <schemaInfo>
+    <schema name="dom" stylesheet="db/dom2dom.xsl" />
+    <schema name="index" identifier="http://indexdata.dk/zebra/xslt/1"
+            stylesheet="db/dom2index.xsl" />
+    <schema name="dc" stylesheet="db/dom2dc.xsl" />
+    <schema name="dc-short" stylesheet="db/dom2dc_short.xsl" />
+    <schema name="snippet" snippet="25" stylesheet="db/dom2snippet.xsl" />
+    <schema name="help" stylesheet="db/dom2help.xsl" />
+    <split level="1"/>
+  </schemaInfo>
+
+  the paths are relative to the directory where zebra.init is placed
+  and is started up.
+
+  The split level decides where the SAX parser shall split the
+  collections of records into individual records, which then are
+  loaded into &dom;, and have the indexing &xslt; stylesheet applied.
+
+  The indexing stylesheet is found by its identifier.
+
+  All the other stylesheets are for presentation after search.
+
+- in data/ a short sample of harvested carnivorous plants
+  ZEBRA_INDEX_DIRS=data/carnivor_20050118_2200_short-346.xml
+
+- in root also one single data record - nice for testing the xslt
+  stylesheets,
+  
+  xsltproc db/dom2index.xsl carni*.xml
+
+  and so on.
+
+- in db/ a cql2pqf.txt yaz-client config file 
+  which is also used in the yaz-server <ulink url="&url.cql;">&cql;</ulink>-to-&pqf; process
+
+   see: http://www.indexdata.com/yaz/doc/tools.tkl#tools.cql.map
+
+- in db/ an indexing &xslt; stylesheet. This is a PULL-type XSLT thing,
+  as it constructs the new &xml; structure by pulling data out of the
+  respective elements/attributes of the old structure.
+
+  Notice the special zebra namespace, and the special elements in this
+  namespace which indicate to the zebra indexer what to do.
+
+  <z:record id="67ht7" rank="675" type="update">
+  indicates that a new record with given id and static rank has to be updated. 
+
+  <z:index name="title" type="w">
+   encloses all the text/&xml; which shall be indexed in the index named
+   "title" and of index type "w" (see  file default.idx in your zebra
+   installation) 
+
+
+   </para>
+
+   <para>
+-->
+
+
+
+
+ <!-- Keep this comment at the end of the file
+ Local variables:
+ mode: sgml
+ sgml-omittag:t
+ sgml-shorttag:t
+ sgml-minimize-attributes:nil
+ sgml-always-quote-attributes:t
+ sgml-indent-step:1
+ sgml-indent-data:t
+ sgml-parent-document: "zebra.xml"
+ sgml-local-catalogs: nil
+ sgml-namecase-general:t
+ End:
+ -->
diff --git a/doc/recordmodel-grs.xml b/doc/recordmodel-grs.xml

index 848db70..7ba26d3 100644 (file)
--- a/doc/recordmodel-grs.xml
+++ b/doc/recordmodel-grs.xml
@@ -1,7 +1,15 @@
   <chapter id="grs">
-  <!-- $Id: recordmodel-grs.xml,v 1.7 2007-02-02 11:10:08 marc Exp $ -->
+  <!-- $Id: recordmodel-grs.xml,v 1.8 2007-02-20 14:28:31 marc Exp $ -->
    <title>&grs1; Record Model and Filter Modules</title>
  
+     <note>
+      <para>
+        The functionality of this record model has been improved and
+        replaced by the &dom; &xml; record model. See 
+        <xref linkend="record-model-domxml"/>.
+      </para>
+     </note>
+
    <para>
     The record model described in this chapter applies to the fundamental,
     structured
diff --git a/doc/zebra.xml b/doc/zebra.xml

index 5110f33..540bbd4 100644 (file)
--- a/doc/zebra.xml
+++ b/doc/zebra.xml
@@ -11,7 +11,7 @@
  
       <!ENTITY test SYSTEM "test.xml">
  ]>
-<!-- $Id: zebra.xml,v 1.16 2007-02-02 11:10:08 marc Exp $ -->
+<!-- $Id: zebra.xml,v 1.17 2007-02-20 14:28:31 marc Exp $ -->
  <book id="zebra">
   <bookinfo>
    <title>&zebra; - User's Guide and Reference</title>
@@ -62,6 +62,7 @@
    &chap-architecture;
    &chap-querymodel;
    &chap-administration;
+  &chap-recordmodel-domxml;
    &chap-recordmodel-alvisxslt;
    &chap-recordmodel-grs;
    &chap-field-structure;
author	Marc Cromme <marc@indexdata.dk>
	Tue, 20 Feb 2007 14:28:31 +0000 (14:28 +0000)
committer	Marc Cromme <marc@indexdata.dk>
	Tue, 20 Feb 2007 14:28:31 +0000 (14:28 +0000)
doc/Makefile.am		patch \| blob \| history
doc/architecture.xml		patch \| blob \| history
doc/entities.ent		patch \| blob \| history
doc/recordmodel-alvisxslt.xml		patch \| blob \| history
doc/recordmodel-domxml.xml	[new file with mode: 0644]	patch \| blob
doc/recordmodel-grs.xml		patch \| blob \| history
doc/zebra.xml		patch \| blob \| history