Normalizing scores

author Heikki Levanto <heikki@indexdata.dk>

Wed, 4 Dec 2013 10:13:56 +0000 (11:13 +0100)

committer Heikki Levanto <heikki@indexdata.dk>

Wed, 4 Dec 2013 10:13:56 +0000 (11:13 +0100)
author Heikki Levanto <heikki@indexdata.dk>
Wed, 4 Dec 2013 10:13:56 +0000 (11:13 +0100)
committer Heikki Levanto <heikki@indexdata.dk>
Wed, 4 Dec 2013 10:13:56 +0000 (11:13 +0100)
diff --git a/heikki/dbc-os/dbc-opensearch-gw.cfg b/heikki/dbc-os/dbc-opensearch-gw.cfg

index 2b09c34..8b7fedd 100644 (file)
--- a/heikki/dbc-os/dbc-opensearch-gw.cfg
+++ b/heikki/dbc-os/dbc-opensearch-gw.cfg
@@ -15,7 +15,7 @@ baseurl: http://openbibdk.addi.dk/0.8/
  objectformat: dkabm
  #constantparams: action=search&facets.numberOfTerms=10&facets.facetName=facet.creator&facets.facetName=facet.type&facets.facetName=facet.subject&agency=100200&profile=test&collectionType=work
  #constantparams: action=search&agency=100200&profile=test&collectionType=work&rank=rank_general
-constantparams: action=search&agency=100200&profile=test&collectionType=work
+constantparams: action=search&agency=100200&profile=test&collectionType=work&objectFormat=score
  fields: bibliotek.dk.fields.txt
  
  database: bibliotek.work
diff --git a/heikki/dbc-os/test2.sh b/heikki/dbc-os/test2.sh

index f187620..bb96855 100755 (executable)
--- a/heikki/dbc-os/test2.sh
+++ b/heikki/dbc-os/test2.sh
@@ -51,7 +51,7 @@ else
  fi
  QRY=`echo $Q | sed 's/ /+/g' `
  
-SORT="sort=score"
+SORT="sort=relevance_h"
  #SEARCH="command=search$SES&$QRY&rank=1&sort=relevance"
  #SEARCH="command=search$SES&$QRY"
  #SEARCH="command=search$SES&query=$QRY&sort=relevance"
diff --git a/heikki/solr/run.sh b/heikki/solr/run.sh

index ea40894..761fe67 100755 (executable)
--- a/heikki/solr/run.sh
+++ b/heikki/solr/run.sh
@@ -9,18 +9,21 @@ then
    echo "It will be in the title of all plots, together with the query"
    exit 1
  fi
+(cd ../../src; rm -f pazpar2; make; grep '###' relevance.c )
  TITLE="$1"
  OUTFILE=`echo $1.txt | sed 's/ /_/g'`
-echo "$TITLE" > $OUTFILE
+echo "Run $TITLE" > $OUTFILE
  ./test3.sh clean
+rm *.png
  
  function onerun() {
      QRY="$1"
      echo "" >> $OUTFILE
      echo "Query: $QRY" >> $OUTFILE
-    PNG=`echo "solr_$TITLE $QRY.png" | sed 's/ /_/g' `
+    PNG=`echo "$TITLE $QRY.png" | sed 's/ /_/g' `
      echo "Graph: $PNG" >> $OUTFILE
      ./test3.sh "$QRY" "$TITLE"
+    cat stat.line >> $OUTFILE
      grep "plotline" show.out | head -10 >> $OUTFILE
      cp plot.png $PNG
  }
@@ -28,6 +31,8 @@ function onerun() {
  onerun "harry potter"
  onerun "vietnam war"
  onerun "water or fire or ice"
+onerun "zen and motorcycle"
  echo "" >> $OUTFILE
-echo "client#, position, tf/idf, roundrobin, solr # database # title" >> $OUTFILE
+echo "client#, position, tf/idf, roundrobin, solr, normalized # database # title" >> $OUTFILE
  
+rm plot.png
diff --git a/heikki/solr/test3.sh b/heikki/solr/test3.sh

index 741a722..68bd69e 100755 (executable)
--- a/heikki/solr/test3.sh
+++ b/heikki/solr/test3.sh
@@ -75,6 +75,7 @@ do
    HIT=`xml_grep --text_only "//hits" stat.out`
    REC=`xml_grep --text_only "//records" stat.out`
    echo "$ACT $HIT $REC"
+  echo "Hits/Fetched: $HIT / $REC" > stat.line
    if grep -q "<activeclients>0</activeclients>" stat.out
    then
      LOOPING=0
@@ -96,19 +97,24 @@ echo "Client numbers"
  cat scores.data | cut -d' ' -f2 | sort -u
  head -10 scores.data
  
+T1=`grep ": 1 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T2=`grep ": 2 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T3=`grep ": 3 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T4=`grep ": 4 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T5=`grep ": 5 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T6=`grep ": 6 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+
  echo "
    set term png
    set out \"plot.png\"
    set title \"$HEADLINE\"
+  plot \"scores.data\" using 0:(\$2==1?\$6:1/0) with points title \"1: $T1\", \
+       \"scores.data\" using 0:(\$2==2?\$6:1/0) with points title \"2: $T2\", \
+       \"scores.data\" using 0:(\$2==3?\$6:1/0) with points title \"3: $T3\", \
+       \"scores.data\" using 0:(\$2==4?\$6:1/0) with points title \"4: $T4\", \
+       \"scores.data\" using 0:(\$2==5?\$6:1/0) with points title \"5: $T5\", \
+       \"scores.data\" using 0:(\$2==6?\$6:1/0) with points title \"6: $T6\"
  " > plot.cmd
-echo '
-  plot "scores.data" using 0:($2==0?$6:1/0) with points title "db-1", \
-       "scores.data" using 0:($2==1?$6:1/0) with points title "db-2", \
-       "scores.data" using 0:($2==2?$6:1/0) with points title "db-3", \
-       "scores.data" using 0:($2==3?$6:1/0) with points title "db-4", \
-       "scores.data" using 0:($2==4?$6:1/0) with points title "db-5", \
-       "scores.data" using 0:($2==5?$6:1/0) with points title "db-6" \
-' >> plot.cmd
  cat plot.cmd | gnuplot
  
  
diff --git a/src/relevance.c b/src/relevance.c

index 5284686..e7f8585 100644 (file)
--- a/src/relevance.c
+++ b/src/relevance.c
@@ -47,6 +47,18 @@ struct relevance
      double lead_decay;
      int length_divide;
      NMEM nmem;
+    struct normalizing *norm;
+};
+
+// Per-client score statistics used for normalizing scores; entries form a singly linked list headed at relevance.norm
+struct normalizing
+{
+    int num;                  // sequence number set by findnorm(): 1 for the first entry, then previous head's num + 1
+    float sum;                // running total of the "score" values added in relevance_donerecord()
+    float max;                // largest score seen so far (starts at 0.0); used as the divisor when normalizing
+    int count;                // number of clusters counted for this client
+    struct client *client;    // lookup key: findnorm() compares this pointer
+    struct normalizing *next; // next (older) entry, or 0 at the end of the list
  };
  
  struct word_entry {
@@ -57,6 +69,29 @@ struct word_entry {
      struct word_entry *next;
  };
  
+// Return the normalizing entry whose client pointer equals 'client'; if there is none, allocate one from rel->nmem, zero its statistics and prepend it to rel->norm
+struct normalizing *findnorm( struct relevance *rel, struct client* client)
+{
+    struct normalizing *n = rel->norm;
+    while (n) { // linear search of the list, matching on pointer identity
+        if (n->client == client )
+            return n;
+        n = n->next;
+    }
+    n = nmem_malloc(rel->nmem, sizeof(struct normalizing) ); // not found: create one. NOTE(review): used without a NULL check - presumably nmem_malloc never returns NULL, confirm
+    if ( rel->norm )
+        n->num = rel->norm->num +1; // the head holds the highest number so far, because new entries are prepended
+    else
+        n->num = 1; // first entry: numbering starts at 1
+    n->sum = 0.0;
+    n->count = 0;
+    n->max = 0.0;
+    n->client = client;
+    n->next = rel->norm; // push onto the front of the list
+    rel->norm = n;
+    return n;
+}
+
  static struct word_entry *word_entry_match(struct relevance *r,
                                             const char *norm_str,
                                             const char *rank, int *weight)
@@ -307,6 +342,8 @@ struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
          nmem_malloc(res->nmem, res->vec_len * sizeof(*res->term_pos));
  
      relevance_clear(res);
+
+    res->norm = 0; 
      return res;
  }
  
@@ -342,17 +379,6 @@ void relevance_newrec(struct relevance *r, struct record_cluster *rec)
      }
  }
  
-void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
-{
-    int i;
-
-    for (i = 1; i < r->vec_len; i++)
-        if (cluster->term_frequency_vec[i] > 0)
-            r->doc_frequency_vec[i]++;
-
-    r->doc_frequency_vec[0]++;
-}
-
  static const char *getfield(struct record *bestrecord, const char *tag)
  {
      struct session *se = client_get_session(bestrecord->client);
@@ -361,11 +387,39 @@ static const char *getfield(struct record *bestrecord, const char *tag)
      if (md_field_id <0)
          return "";
      md = bestrecord->metadata[md_field_id];
-    if ( md) 
+    if ( md)
          return md->data.text.disp;
      return "";
  }
  
+void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
+{
+    int i;
+    
+    // Pick one record of the cluster (prototype heuristic, a better choice is planned) and add its "score" field to that client's normalizing statistics
+    // NOTE(review): the test below keeps the record with the HIGHEST position, although this was described as "lowest position" - confirm intent
+    struct record *bestrecord = 0;
+    struct record *record;
+    struct normalizing *n;
+    float score;
+    for (record = cluster->records; record; record = record->next) 
+        if ( bestrecord == 0 || bestrecord->position < record->position )
+            bestrecord = record;
+    n = findnorm(r,bestrecord->client); // NOTE(review): bestrecord is still 0 here if the cluster has no records - confirm callers never pass an empty cluster
+    n->count ++;
+    score = atof( getfield(bestrecord,"score") ); // getfield() returns "" when the field is missing, so score is then 0.0
+    n->sum += score;
+    if ( n->max < score )
+        n->max = score;
+
+    for (i = 1; i < r->vec_len; i++) // count this cluster once for every term slot (1..vec_len-1) it has a non-zero frequency in
+        if (cluster->term_frequency_vec[i] > 0)
+            r->doc_frequency_vec[i]++;
+
+    r->doc_frequency_vec[0]++; // slot 0 is incremented once per call, whatever terms matched
+}
+
+
  // Prepare for a relevance-sorted read
  void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
                              enum conf_sortkey_type type)
@@ -373,11 +427,6 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
      int i;
      float *idfvec = xmalloc(rel->vec_len * sizeof(float));
      int n_clients = clients_count();
-    struct client * clients[n_clients];
-    yaz_log(YLOG_LOG,"round-robin: have %d clients", n_clients);
-    for (i = 0; i < n_clients; i++)
-        clients[i] = 0;
-
  
      reclist_enter(reclist);
      // Calculate document frequency vector for each term.
@@ -439,50 +488,55 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
          // get the log entries
          if (type == Metadata_sortkey_relevance_h) {
              struct record *record;
-            int thisclient = 0;
+            struct normalizing *norm;
              struct record *bestrecord = 0;
              int nclust = 0;
+            int tfrel = relevance; // keep the old tf/idf score;
+            int robinscore;
+            int solrscore;
+            int normscore;
              // Find the best record in a cluster - the one with lowest position
              for (record = rec->records; record; record = record->next) {
                  if ( bestrecord == 0 || bestrecord->position < record->position )
                      bestrecord = record;
                  nclust++; // and count them all, for logging
              }
-            // find the client number for the record (we only have a pointer
-            while ( clients[thisclient] != 0
-                    && clients[thisclient] != bestrecord->client )
-                thisclient++;
-            if ( clients[thisclient] == 0 )
-            {
-                yaz_log(YLOG_LOG,"round-robin: found new client at %d: p=%p\n", thisclient, bestrecord->client);
-                clients[thisclient] = bestrecord->client;
-            }
+            norm = findnorm(rel, bestrecord->client);
              // Calculate a round-robin score
-            int tfrel = relevance; // keep the old tf/idf score
-            int robinscore = -(bestrecord->position * n_clients + thisclient) ;
+            robinscore = -(bestrecord->position * n_clients + norm->num) ;
              wrbuf_printf(w,"round-robin score: pos=%d client=%d ncl=%d tfscore=%d score=%d\n",
-                         bestrecord->position, thisclient, nclust, tfrel, relevance );
+                         bestrecord->position, norm->num, nclust, tfrel, relevance );
              yaz_log(YLOG_LOG,"round-robin score: pos=%d client=%d ncl=%d score=%d",
-                         bestrecord->position, thisclient, nclust, relevance );
+                         bestrecord->position, norm->num, nclust, relevance );
  
              // Check if the record has a score field
-            const char *score = getfield(bestrecord,"score");
-            int solrscore = 10000.0 * atof(score);
-            const char *id = getfield(bestrecord, "id");
-            // clear the id, we only want the first numerical part
-            char idbuf[64];
-            i=0;
-            while( id[i] >= '0' && id[i] <= '9' ) {
-                idbuf[i] = id[i];
-                i++;
+            {
+                const char *score = getfield(bestrecord,"score");
+                const char *id = getfield(bestrecord, "id");
+                const char *title = getfield(bestrecord, "title");
+                // clear the id, we only want the first numerical part
+                char idbuf[64];
+                solrscore = 10000.0 * atof(score);
+                i=0;
+                while( id[i] >= '0' && id[i] <= '9' ) {
+                    idbuf[i] = id[i];
+                    i++;
+                }
+                idbuf[i] = '\0';
+                if ( norm->count )
+                {
+                    float avg = norm->sum / norm->count;
+                    normscore = 10000.0 * (  atof(score) / norm->max );
+                    wrbuf_printf(w, "normscore: score(%s) / max(%f) *10000 = %d\n",
+                          score, norm->max, normscore);
+                } else
+                    yaz_log(YLOG_LOG, "normscore: no count, can not normalize %s ", score );
+
+                wrbuf_printf(w,"plotline: %d %d %d %d %d %d # %s %s\n",
+                                norm->num, bestrecord->position,
+                                tfrel, robinscore, solrscore, normscore, idbuf, title );
              }
-            idbuf[i] = '\0';
-            
-            const char *title = getfield(bestrecord, "title");
-            wrbuf_printf(w,"plotline: %d %d %d %d %d # %s %s\n",
-                            thisclient, bestrecord->position,
-                            tfrel, robinscore, solrscore, idbuf, title );
-            relevance = solrscore;
+            relevance = normscore; // ###
          }
          rec->relevance_score = relevance;
      }
author	Heikki Levanto <heikki@indexdata.dk>
	Wed, 4 Dec 2013 10:13:56 +0000 (11:13 +0100)
committer	Heikki Levanto <heikki@indexdata.dk>
	Wed, 4 Dec 2013 10:13:56 +0000 (11:13 +0100)
heikki/dbc-os/dbc-opensearch-gw.cfg		patch \| blob \| history
heikki/dbc-os/test2.sh		patch \| blob \| history
heikki/solr/run.sh		patch \| blob \| history
heikki/solr/test3.sh		patch \| blob \| history
src/relevance.c		patch \| blob \| history