solr and dbc tests
authorHeikki Levanto <heikki@indexdata.dk>
Wed, 27 Nov 2013 14:57:42 +0000 (15:57 +0100)
committerHeikki Levanto <heikki@indexdata.dk>
Wed, 27 Nov 2013 14:57:42 +0000 (15:57 +0100)
heikki/README-HEIKKI
heikki/dbc-os/bibliotek.dk.xml
heikki/dbc-os/dbc-opensearch-gw.cfg
heikki/dbc-os/test2.cfg
heikki/dbc-os/test2.sh
heikki/solr/opencontent.xml [new file with mode: 0644]
heikki/solr/plot1.cmd [new file with mode: 0644]
heikki/solr/solr-pz2.xsl [new file with mode: 0644]
heikki/solr/solr.lui.xml [new file with mode: 0644]
heikki/solr/test3.cfg [new file with mode: 0644]
heikki/solr/test3.sh [new file with mode: 0755]

index 4e1eebf..0571886 100644 (file)
@@ -47,3 +47,18 @@ I should also add stuff directly to the client, and to the record, as I need.
 
 Next: Plot the tf/idf scores against round-robin sorted order. Will be messy,
 but later when we get a target that returns sorted records, it will make sense.
+
+
+Wed 27-Nov
+Setting up multiple SOLR targets in the same pazpar2
+ - Add #999 to the z-urls, so pazpar2 won't merge them. Different number for each
+
+This URL shows the databases, with their numbers
+http://lui.indexdata.com/solr/select?q=database:*&facet=true&facet.method=fc&facet.field=author_exact&facet.field=subject_exact&facet.field=date&facet.field=medium_exact&facet.field=database&rows=0&facet.mincount=1
+
+Add this to the target defs
+<set name="pz:extra_args" value="fq=database:4902"/>
+
+After this, it should be possible to get records from different databases, some
+with many records, some with a few. This is a good testing ground for merging
+rankings! Test first with a round-robin, and plot the scores.
index 5320b6f..91d297e 100644 (file)
@@ -3,7 +3,8 @@
   <set name="pz:apdulog" value="1"/>
 
   <!-- mapping for unqualified search -->
-  <set name="pz:cclmap:term" value="u=1016 t=l,r s=al"/>
+  <!--<set name="pz:cclmap:term" value="u=1016 t=l,r s=al"/>  -->
+  <set name="pz:cclmap:term" value="t=l,r s=al"/>
 
   <!-- field-specific mappings -->
   <set name="pz:cclmap:ti" value="u=4 s=al"/>
index 47bb1ba..2b09c34 100644 (file)
@@ -14,7 +14,8 @@ database: Default
 baseurl: http://openbibdk.addi.dk/0.8/
 objectformat: dkabm
 #constantparams: action=search&facets.numberOfTerms=10&facets.facetName=facet.creator&facets.facetName=facet.type&facets.facetName=facet.subject&agency=100200&profile=test&collectionType=work
-constantparams: action=search&agency=100200&profile=test&collectionType=work&rank=rank_general
+#constantparams: action=search&agency=100200&profile=test&collectionType=work&rank=rank_general
+constantparams: action=search&agency=100200&profile=test&collectionType=work
 fields: bibliotek.dk.fields.txt
 
 database: bibliotek.work
index da81bfc..86540f5 100644 (file)
@@ -32,7 +32,9 @@
             <transform rule="Title"/>
         </icu_chain>
 
-        <rank debug="yes"/>
+        <!--rank debug="yes"/-->
+        <!-- rank cluster="yes" lead="1" length="log" debug="no"/  Autographics settings-->
+        <rank cluster="yes" lead="1" length="log" debug="yes"/>
 
         <!-- we try to keep same order as in marc21.xsl -->
         <metadata name="id" brief="yes"/>
         <metadata name="physical-specified"/>
 
         <metadata name="series-title"/>
-
         <metadata name="description" brief="yes" merge="unique" rank="3"/>
         <metadata name="subject-long" rank="3"/>
         <metadata name="subject" termlist="yes" rank="0" limitmap="ccl: su" />
+
         <metadata name="snippet" brief="yes" merge="unique"/>
         <metadata name="electronic-url" brief="yes" merge="no"/>
         <metadata name="electronic-format-type" />
         <metadata name="available"/>
         <metadata name="due"/>
         <metadata name="thumburl" brief="yes" merge="unique"/>
-        <metadata name="score"/>
+
     </service>
 
   </server>
index 9dfe54a..f187620 100755 (executable)
@@ -51,9 +51,11 @@ else
 fi
 QRY=`echo $Q | sed 's/ /+/g' `
 
+SORT="sort=score"
 #SEARCH="command=search$SES&$QRY&rank=1&sort=relevance"
 #SEARCH="command=search$SES&$QRY"
-SEARCH="command=search$SES&query=$QRY&sort=relevance"
+#SEARCH="command=search$SES&query=$QRY&sort=relevance"
+SEARCH="command=search$SES&query=$QRY&$SORT"
 echo $SEARCH
 curl -s "$URL?$SEARCH" > search.out
 cat search.out | grep search
@@ -80,7 +82,7 @@ do
 done
 
 
-SHOW="command=show$SES&sort=relevance_h&start=0&num=100"
+SHOW="command=show$SES&start=0&num=100&$SORT"
 echo $SHOW
 curl -s "http://localhost:9017/?$SHOW" > show.out
 #grep "relevance" show.out | grep += | grep -v "(0)"
@@ -101,11 +103,12 @@ grep "round-robin" show.out |
 echo '\
   set term png
   set out "plot.png"
-  set yrange [0:300000]
+  #set yrange [0:300000]
+  set logscale y
   plot \' > plot.cmd
 for F in *.data
 do
-  BF=`basename $F .data`
+  BF=`basename $F .data | sed 's/_/ /g' `
   echo -n " \"$F\" using 1:2  with points  title \"$BF\", " >> plot.cmd
 done
 echo "0 notitle" >> plot.cmd
diff --git a/heikki/solr/opencontent.xml b/heikki/solr/opencontent.xml
new file mode 100644 (file)
index 0000000..804dd2c
--- /dev/null
@@ -0,0 +1,33 @@
+<settings target="localhost:9999/Default">
+
+  <!-- Open content targets -->
+
+  <set target="localhost:9999/Default"
+                 name="pz:name" value="OpenSearch"/>
+
+  <!-- settings apply to all targets -->
+
+  <!-- mapping for unqualified search -->
+  <!--
+  <set name="pz:cclmap:term" value="u=1016 t=l,r s=al 2=102"/>
+  -->
+  <set name="pz:cclmap:term" value="u=1016 t=l,r s=al"/>
+
+  <!-- field-specific mappings -->
+  <set name="pz:cclmap:ti" value="u=4 s=al 2=102"/>
+  <set name="pz:cclmap:su" value="u=21 s=al 2=102"/>
+  <set name="pz:cclmap:isbn" value="u=7 2=102"/>
+  <set name="pz:cclmap:issn" value="u=8 2=102"/>
+  <set name="pz:cclmap:date" value="u=30 r=r 2=102"/>
+
+  <!-- Retrieval settings -->
+
+  <set name="pz:requestsyntax" value="marc21"/>
+  <set name="pz:elements" value="F"/>
+
+  <!-- Result normalization settings -->
+
+  <set name="pz:nativesyntax" value="iso2709"/>
+  <set name="pz:xslt" value="marc21.xsl"/>
+
+</settings>
diff --git a/heikki/solr/plot1.cmd b/heikki/solr/plot1.cmd
new file mode 100644 (file)
index 0000000..bcf1627
--- /dev/null
@@ -0,0 +1,9 @@
+\
+  set term png
+  set out "plot.png"
+  #set yrange [0:300000]
+  plot \
+ "hp.data" using 0:1  with points  title "harry potter",  \
+ "vw.data" using 0:1  with points  title "vietnam war",  \
+ "wa.data" using 0:1  with points  title "water or fire or ice"
+
diff --git a/heikki/solr/solr-pz2.xsl b/heikki/solr/solr-pz2.xsl
new file mode 100644 (file)
index 0000000..4fe7bc1
--- /dev/null
@@ -0,0 +1,74 @@
+<?xml version="1.0"?>
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" 
+               xmlns:pz="http://www.indexdata.com/pazpar2/1.0" >
+
+  <xsl:param name="medium" />
+
+  <xsl:template  match="/">
+      <xsl:apply-templates></xsl:apply-templates>
+  </xsl:template>
+
+  <xsl:template  match="response">
+      <xsl:apply-templates></xsl:apply-templates>
+  </xsl:template>
+
+  <xsl:template  match="records">
+      <xsl:apply-templates></xsl:apply-templates>
+  </xsl:template>
+
+  <xsl:template match="doc">
+    <pz:record>
+      <xsl:apply-templates></xsl:apply-templates>
+    </pz:record>
+  </xsl:template>
+
+  <xsl:template match="float[@name]">
+    <pz:metadata>
+       <xsl:attribute  name="type">
+         <xsl:value-of select="@name"/>
+       </xsl:attribute>
+       <xsl:value-of select="."/>
+    </pz:metadata>
+  </xsl:template>
+
+  <xsl:template match="str[@name]">
+    <pz:metadata>
+       <xsl:attribute  name="type">
+         <xsl:value-of select="@name"/>
+       </xsl:attribute>
+       <xsl:value-of select="."/>
+    </pz:metadata>
+  </xsl:template>
+
+  <xsl:template match="date[@name]">
+    <pz:metadata>
+       <xsl:attribute  name="type">
+         <xsl:value-of select="@name"/>
+       </xsl:attribute>
+       <xsl:value-of select="."/>
+    </pz:metadata>
+  </xsl:template>
+
+  <xsl:template match="arr">
+    <xsl:for-each select="str">
+      <xsl:call-template name="string"/>
+    </xsl:for-each>
+  </xsl:template>
+
+  <xsl:template name="string">
+      <pz:metadata>
+       <xsl:attribute  name="type">
+         <xsl:value-of select="../@name"/>
+       </xsl:attribute>
+       <xsl:choose>
+         <xsl:when test="../@name = 'medium' and string-length($medium) > 0">
+           <xsl:value-of select="$medium"/>
+         </xsl:when>
+         <xsl:otherwise>
+           <xsl:value-of select="."/>
+         </xsl:otherwise>
+       </xsl:choose>
+      </pz:metadata>
+  </xsl:template>
+
+</xsl:stylesheet>
diff --git a/heikki/solr/solr.lui.xml b/heikki/solr/solr.lui.xml
new file mode 100644 (file)
index 0000000..4e5905d
--- /dev/null
@@ -0,0 +1,40 @@
+<!-- Solr target -->
+<settings target="LUI Solr Test">
+  <set name="pz:name"  value="LUI Solr Test" />
+  <set name="pz:url"   value="lui.indexdata.com/solr" />
+  <!-- Limit maps are PQF prefixes: every attribute needs its own @attr -->
+  <set name="pz:limitmap:author"  value="rpn:@attr 1=author_exact @attr 6=3" />
+  <set name="pz:limitmap:subject" value="rpn:@attr 1=subject_exact" />
+  <set name="pz:limitmap:date"    value="rpn:@attr 1=date @attr 6=3" />
+  <set name="pz:limitmap:medium"  value="rpn:@attr 1=medium_exact @attr 6=3" />
+
+  <set name="full_text_target"  value="=NO" />
+  <!-- Configure native facets -->
+<!--
+  <set name="pz:termlist_term_count" value="10"/>
+  <set name="pz:facetmap:author"  value="author_exact"  />
+  <set name="pz:facetmap:subject" value="subject_exact" />
+  <set name="pz:facetmap:medium"  value="medium_exact"  />
+  <set name="pz:facetmap:date"  value="date" />
+-->
+
+  <set name="use_url_proxy"  value="0" />
+  <set name="pz:piggyback"   value="1" />
+  <set name="pz:preferred"   value="1" />
+  <set name="pz:block_timeout"  value="2" />
+  <set name="pz:cclmap:term"  value="1=text s=Dal" />
+  <set name="pz:cclmap:au"    value="1=author t=z"   />
+  <set name="pz:cclmap:su"    value="1=subject t=z"  />
+  <set name="pz:cclmap:date"  value="1=date t=z" />
+<!--
+  <set name="pz:cclmap:issn"  value="1=issn" />
+-->
+  <set name="pz:cclmap:ti"  value="1=title" />
+  <set name="pz:cclmap:isbn"  value="1=isbn" />
+  <set name="pz:cclmap:author_phrase" value="1=author_exact 6=3"/>
+  <set name="pz:sru"         value="solr"  />
+  <set name="pz:xslt"  value="solr-pz2.xsl" />
+  <set name="use_thumbnails" value="0" />
+  <set name="pz:queryencoding"  value="UTF-8" />
+
+</settings>
diff --git a/heikki/solr/test3.cfg b/heikki/solr/test3.cfg
new file mode 100644 (file)
index 0000000..7816b85
--- /dev/null
@@ -0,0 +1,132 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pazpar2 xmlns="http://www.indexdata.com/pazpar2/1.0">
+  
+  <server>
+    <listen port="9017"/>
+
+    <service>
+        <timeout session="60" z3950_operation="30" z3950_session="180"/>
+
+        <!-- settings src="bibliotek.dk.xml"/-->
+        <settings src="solr.lui.xml"/>
+
+        <icu_chain id="relevance" locale="en">
+            <transform rule="[:Control:] Any-Remove"/>
+            <tokenize rule="l"/>
+            <transform rule="[[:WhiteSpace:][:Punctuation:]`] Remove"/>
+            <casemap rule="l"/>
+        </icu_chain>
+
+        <icu_chain id="sort" locale="en">
+            <transform rule="[[:Control:][:WhiteSpace:][:Punctuation:]`] Remove"/>
+            <casemap rule="l"/>
+        </icu_chain>
+
+        <icu_chain id="mergekey" locale="en">
+            <tokenize rule="l"/>
+            <transform rule="[[:Control:][:WhiteSpace:][:Punctuation:]`] Remove"/>
+            <casemap rule="l"/>
+        </icu_chain>
+
+        <icu_chain id="facet" locale="en">
+            <display/>
+            <transform rule="Title"/>
+        </icu_chain>
+
+        <!--rank debug="yes"/-->
+        <!-- rank cluster="yes" lead="1" length="log" debug="no"/  Autographics settings-->
+        <rank cluster="yes" lead="1" length="log" debug="yes"/>
+
+        <!-- we try to keep same order as in marc21.xsl -->
+        <metadata name="id" brief="yes"/>
+        <metadata name="lccn" merge="unique"/>
+        <metadata name="isbn"/>
+        <metadata name="issn"/>
+        <metadata name="tech-rep-nr"/>
+        <metadata name="author" brief="yes" termlist="yes" merge="unique"
+                    rank="2" mergekey="optional" limitmap="ccl: au" />
+        <metadata name="author-title"/>
+        <metadata name="author-date"/>
+        <metadata name="corporate-name"/>
+        <metadata name="corporate-location"/>
+        <metadata name="corporate-date"/>
+        <metadata name="meeting-name"/>
+        <metadata name="meeting-location"/>
+        <metadata name="meeting-date"/>
+        <metadata name="date" brief="yes" sortkey="numeric" type="year"
+                    merge="range" termlist="yes"/>
+        <metadata name="title" brief="yes" sortkey="skiparticle"
+                    merge="longest" rank="6" mergekey="required"/>
+        <metadata name="title-remainder" brief="yes" merge="longest" rank="5"/>
+        <metadata name="title-responsibility" brief="yes" merge="longest"/>
+        <metadata name="title-dates" brief="yes" merge="longest"/>
+        <metadata name="title-medium" brief="yes" merge="longest"/>
+        <metadata name="title-number-section" brief="yes" merge="longest"/>
+        <metadata name="title-complete"/>
+        <metadata name="title-uniform"/>
+        <metadata name="medium" brief="yes" merge="longest" mergekey="optional" limitmap="local:" />
+        <metadata name="iii-id" brief="yes" merge="no"/>
+        <metadata name="edition"/>
+        <metadata name="publication-place"/>
+        <metadata name="publication-name"/>
+        <metadata name="publication-date"/>
+        <metadata name="physical-extent"/>
+        <metadata name="physical-format"/>
+        <metadata name="physical-dimensions"/>
+        <metadata name="physical-accomp"/>
+        <metadata name="physical-unittype"/>
+        <metadata name="physical-unitsize"/>
+        <metadata name="physical-specified"/>
+
+        <metadata name="series-title"/>
+        <metadata name="description" brief="yes" merge="unique" rank="3"/>
+        <metadata name="subject-long" rank="3"/>
+        <metadata name="subject" termlist="yes" rank="0" limitmap="ccl: su" />
+
+        <metadata name="snippet" brief="yes" merge="unique"/>
+        <metadata name="electronic-url" brief="yes" merge="no"/>
+        <metadata name="electronic-format-type" />
+        <metadata name="electronic-format-instruction" />
+        <metadata name="electronic-text" brief="yes" merge="no"/>
+        <metadata name="electronic-note"/>
+        <metadata name="citation"/>
+        <metadata name="holding"/>
+        <metadata name="fulltext"/>
+        <metadata name="has-fulltext"/>
+        <metadata name="oclc-number"/>
+        <metadata name="system-control-nr"/>
+        <metadata name="locallocation" brief="yes"/>
+        <metadata name="callnumber" brief="yes"/>
+        <metadata name="publicnote" brief="yes"/>
+
+        <!-- journals -->
+        <metadata name="journal-title"/>
+        <metadata name="journal-subpart"/>
+        <metadata name="volume-number"/>
+        <metadata name="issue-date"/>
+        <metadata name="issue-number"/>
+        <metadata name="pages-number"/>
+
+        <metadata name="url_recipe" setting="postproc" brief="yes" merge="no"/>
+        <metadata name="open_url_resolver" setting="parameter" merge="no"/>
+        <metadata name="open-url" merge="longest"/>
+        <metadata name="use_url_proxy" setting="postproc" brief="yes" merge="no"/>
+
+        <metadata name="publisher"/>
+        <metadata name="available"/>
+        <metadata name="due"/>
+        <metadata name="thumburl" brief="yes" merge="unique"/>
+
+        <metadata name="score" brief="yes" sortkey="numeric" merge="range"/>
+    </service>
+
+  </server>
+  
+</pazpar2>
+
+<!-- Keep this comment at the end of the file
+     Local variables:
+     mode: nxml
+     End:
+-->
+
diff --git a/heikki/solr/test3.sh b/heikki/solr/test3.sh
new file mode 100755 (executable)
index 0000000..f40845d
--- /dev/null
@@ -0,0 +1,116 @@
+#!/bin/bash
+#
+# Simple script (and config) to get pz2 to run against the Solr target set up
+# in test3.cfg, and plot the scores. Usage: test3.sh [QUERY | clean]
+#
+
+if [ "$1" == "clean" ]
+then
+  echo "Cleaning up"
+  rm -f "${PIDFILE:-pz2.pid}" *.out *.log *.data *~ plot.cmd # PIDFILE is only set further down
+  exit
+fi
+killall pazpar2
+# Drop the output and the logs of the previous run
+rm -f *.out *.log
+
+URL="http://localhost:9017/"
+CFG="test3.cfg"
+
+PZ="../../src/pazpar2"
+if [ ! -x "$PZ" ]
+then
+  echo "$PZ not executable. Panic" >&2
+  exit 1
+fi
+
+PIDFILE=pz2.pid
+# Run pazpar2 in the background; the pid file is used to stop it at the end
+"$PZ" -f "$CFG" -l pz2.log -p "$PIDFILE" &
+sleep 0.2 # make sure it has time to start
+echo "Init"
+curl -s "$URL?command=init" > init.out
+SESSION=$(xml_grep --text_only "//session" init.out)
+# cat init.out; echo
+echo "Got session $SESSION"
+SES="&session=$SESSION"
+
+
+# The query is the first argument; without one, search for "computer".
+# Spaces are turned into '+' so the query can go into the URL. Nothing else
+# is URL-encoded, so a '&' or '#' in the query will still break the request.
+# $Q is quoted here, since unquoted a query like wa* could be glob-expanded
+# into the names of the files in this directory.
+Q="${1:-computer}"
+QRY=$(printf '%s\n' "$Q" | sed 's/ /+/g')
+
+SORT="sort=score"
+#SEARCH="command=search$SES&$QRY&rank=1&sort=relevance"
+#SEARCH="command=search$SES&$QRY"
+#SEARCH="command=search$SES&query=$QRY&sort=relevance"
+SEARCH="command=search$SES&query=$QRY&$SORT"
+echo "$SEARCH"
+curl -s "$URL?$SEARCH" > search.out
+grep search search.out
+echo
+sleep 0.5 # let the search start working
+
+STAT="command=stat$SES"
+echo "" > stat.out
+# Poll until no client is active any more. Give up after 120 rounds (at least
+# a minute), so that a dead pazpar2 or a bad session can not hang the script.
+TRIES=0
+while [ "$TRIES" -lt 120 ]
+do
+  TRIES=$((TRIES + 1))
+  sleep 0.5
+  curl -s "$URL?$STAT" > stat.out
+  ACT=$(xml_grep --text_only "//activeclients" stat.out)
+  HIT=$(xml_grep --text_only "//hits" stat.out)
+  REC=$(xml_grep --text_only "//records" stat.out)
+  echo "$ACT $HIT $REC"
+  echo >> stats.out
+  cat stat.out >> stats.out
+  grep -q "<activeclients>0</activeclients>" stat.out && break
+done
+
+
+SHOW="command=show$SES&start=0&num=100&$SORT"
+echo "$SHOW"
+curl -s "$URL?$SHOW" > show.out
+#grep "relevance" show.out | grep += | grep -v "(0)"
+#grep "round-robin" show.out
+grep '^ <md-title>' show.out | head -11
+# No dbc-opensearch-gw.log to quote from: this script starts no gateway
+grep '^ <md-title>' show.out >> titles.out
+
+# Plot it. The data file is named after the query ('+' and '"' become '_')
+DF=$(printf '%s\n' "$QRY" | sed -e 's/@//g' -e 's/[+"]/_/g' -e "s/'//g")
+grep "round-robin" show.out |
+  cut -d' ' -f 6,7 |
+  sed 's/[^0-9 ]//g' |
+  awk '{print FNR,$0}' > "$DF.data"
+
+
+
+echo '\
+  set term png
+  set out "plot.png"
+  #set yrange [0:300000]
+  set logscale y
+  plot \' > plot.cmd
+for F in *.data
+do
+  BF=$(basename "$F" .data | sed 's/_/ /g')
+  printf ' "%s" using 1:2  with points  title "%s", ' "$F" "$BF" >> plot.cmd
+done
+# A dummy last entry, so that the list does not end in a comma
+echo "0 notitle" >> plot.cmd
+gnuplot < plot.cmd
+
+echo
+echo "All done"
+# Shut down the pazpar2 instance that was started for this run
+kill "$(cat "$PIDFILE")"
+rm -f "$PIDFILE"
+