Merge branch 'ranking-h' of ssh://git.indexdata.com:222/home/git/pub/pazpar2 into...
author Heikki Levanto <heikki@indexdata.dk>
Wed, 4 Dec 2013 11:58:56 +0000 (12:58 +0100)
committer Heikki Levanto <heikki@indexdata.dk>
Wed, 4 Dec 2013 11:59:16 +0000 (12:59 +0100)
Conflicts:
heikki/solr/test3.sh
src/relevance.c
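Also fixed a detail with sorting of the score numbers: the qsort comparator sort_float now returns 1, -1 or 0 from explicit comparisons instead of returning *fx - *fy, which is often too close to zero.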

1  2 
heikki/solr/test3.sh
src/relevance.c

@@@ -97,13 -96,8 +97,15 @@@ echo "Client numbers
  cat scores.data | cut -d' ' -f2 | sort -u
  head -10 scores.data
  
+ exit 1
 +T1=`grep ": 1 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
 +T2=`grep ": 2 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
 +T3=`grep ": 3 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
 +T4=`grep ": 4 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
 +T5=`grep ": 5 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
 +T6=`grep ": 6 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
 +
  echo "
    set term png
    set out \"plot.png\"
diff --cc src/relevance.c
@@@ -392,34 -366,14 +392,47 @@@ static const char *getfield(struct reco
      return "";
  }
  
 +void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
 +{
 +    int i;
 +    
 +    // Find the best record in a cluster - the one with lowest position
 +    // (in this proto. Later, find a better one)
 +    struct record *bestrecord = 0;
 +    struct record *record;
 +    struct normalizing *n;
 +    float score;
 +    for (record = cluster->records; record; record = record->next) 
 +        if ( bestrecord == 0 || bestrecord->position < record->position )
 +            bestrecord = record;
 +    n = findnorm(r,bestrecord->client);
 +    n->count ++;
 +    score = atof( getfield(bestrecord,"score") );
 +    n->sum += score;
 +    if ( n->max < score )
 +        n->max = score;
 +
 +    for (i = 1; i < r->vec_len; i++)
 +        if (cluster->term_frequency_vec[i] > 0)
 +            r->doc_frequency_vec[i]++;
 +
 +    r->doc_frequency_vec[0]++;
 +}
 +
 +
+ // Helper to compare floats, for qsort
+ static int sort_float(const void *x, const void *y)
+ {
+     const float *fx = x;
+     const float *fy = y;
 -    return *fx - *fy;
++    //yaz_log(YLOG_LOG,"sorting %f and %f", *fx, *fy);  // ###
++    if ( *fx > *fy )
++        return 1;
++    if ( *fx < *fy )
++        return -1;
++    return 0;   // do not return *fx-*fy, it is often too close to zero.
+ }
  // Prepare for a relevance-sorted read
  void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
                              enum conf_sortkey_type type)
      int i;
      float *idfvec = xmalloc(rel->vec_len * sizeof(float));
      int n_clients = clients_count();
 -    struct client * clients[n_clients];
+     int clusternumber = 0;
+     yaz_log(YLOG_LOG,"round-robin: have %d clients", n_clients);
 -    for (i = 0; i < n_clients; i++)
 -        clients[i] = 0;
 -
  
      reclist_enter(reclist);
      // Calculate document frequency vector for each term.
          // get the log entries
          if (type == Metadata_sortkey_relevance_h) {
              struct record *record;
 -            int thisclient = 0;
 +            struct normalizing *norm;
              struct record *bestrecord = 0;
              int nclust = 0;
-             int tfrel = relevance; // keep the old tf/idf score;
-             int robinscore;
-             int solrscore;
+             int tfrel = relevance; // keep the old tf/idf score
+             int robinscore = 0;
+             int solrscore = 0;
 +            int normscore;
+             const char *score;
+             const char *id;
+             const char *title;
+             char idbuf[64];
+             int mergescore = 0;
              // Find the best record in a cluster - the one with lowest position
              for (record = rec->records; record; record = record->next) {
                  if ( bestrecord == 0 || bestrecord->position < record->position )
                      bestrecord = record;
                  nclust++; // and count them all, for logging
              }
 -            // find the client number for the record (we only have a pointer
 -            while ( clients[thisclient] != 0
 -                    && clients[thisclient] != bestrecord->client )
 -                thisclient++;
 -            if ( clients[thisclient] == 0 )
 -            {
 -                yaz_log(YLOG_LOG,"round-robin: found new client at %d: p=%p\n", thisclient, bestrecord->client);
 -                clients[thisclient] = bestrecord->client;
 -            }
 +            norm = findnorm(rel, bestrecord->client);
              // Calculate a round-robin score
 -            robinscore = -(bestrecord->position * n_clients + thisclient) ;
 +            robinscore = -(bestrecord->position * n_clients + norm->num) ;
              wrbuf_printf(w,"round-robin score: pos=%d client=%d ncl=%d tfscore=%d score=%d\n",
 -                         bestrecord->position, thisclient, nclust, tfrel, relevance );
 +                         bestrecord->position, norm->num, nclust, tfrel, relevance );
              yaz_log(YLOG_LOG,"round-robin score: pos=%d client=%d ncl=%d score=%d",
 -                         bestrecord->position, thisclient, nclust, relevance );
 +                         bestrecord->position, norm->num, nclust, relevance );
  
              // Check if the record has a score field
+             score = getfield(bestrecord,"score");
++            id = getfield(bestrecord, "id");
++            title = getfield(bestrecord, "title");
+             solrscore = 10000.0 * atof(score);
 -            
++            // clear the id, we only want the first numerical part
++            i=0;
++            while( id[i] >= '0' && id[i] <= '9' ) {
++                idbuf[i] = id[i];
++                i++;
++            }
++            idbuf[i] = '\0';
++            if ( norm->count )
 +            {
-                 const char *score = getfield(bestrecord,"score");
-                 const char *id = getfield(bestrecord, "id");
-                 const char *title = getfield(bestrecord, "title");
-                 // clear the id, we only want the first numerical part
-                 char idbuf[64];
-                 solrscore = 10000.0 * atof(score);
-                 i=0;
-                 while( id[i] >= '0' && id[i] <= '9' ) {
-                     idbuf[i] = id[i];
-                     i++;
++                //float avg = norm->sum / norm->count;
++                normscore = 10000.0 * (  atof(score) / norm->max );
++                wrbuf_printf(w, "normscore: score(%s) / max(%f) *10000 = %d\n",
++                        score, norm->max, normscore);
++            } else
++                yaz_log(YLOG_LOG, "normscore: no count, can not normalize %s ", score );
++
+             // If we have a score in the best record, we probably have in them all
+             // and we can try to merge scores
+             if ( *score ) {
+                 float scores[nclust];
+                 float s = 0.0;
+                 int i=0;
 -                for (record = rec->records; record; record = record->next, i++) {
 -                    scores[i] = atof( getfield(record,"score") );
 -                    yaz_log(YLOG_LOG,"mergescore %d: %f", i, scores[i] );
 -                    wrbuf_printf(w,"mergeplot %d: %f x\n", clusternumber, 10000*scores[i] );
++                if ( rec->records && rec->records->next ) 
++                { // have more than one record
++                    for (record = rec->records; record; record = record->next, i++)
++                    {
++                        scores[i] = atof( getfield(record,"score") );
++                        yaz_log(YLOG_LOG,"mergescore %d: %f", i, scores[i] );
++                        wrbuf_printf(w,"mergeplot %d: %f x\n", clusternumber, 10000*scores[i] );
++                    }
++                    qsort(scores, nclust, sizeof(float), sort_float );
++                    for (i = 0; i<nclust; i++)
++                    {
++                        yaz_log(YLOG_LOG,"Sorted mergescore %d: %f + %f/%d = %f", i, s,scores[i],i+1, s+scores[i] / (i+1) );
++                        wrbuf_printf(w,"Sorted mergescore %d: %f + %f/%d = %f\n",  i, s,scores[i],i+1, s+scores[i] / (i+1));
++                        s += scores[i] / (i+1);
++                    }
++                    mergescore = s * 10000;
                  }
-                 idbuf[i] = '\0';
-                 if ( norm->count )
-                 {
-                     float avg = norm->sum / norm->count;
-                     normscore = 10000.0 * (  atof(score) / norm->max );
-                     wrbuf_printf(w, "normscore: score(%s) / max(%f) *10000 = %d\n",
-                           score, norm->max, normscore);
-                 } else
-                     yaz_log(YLOG_LOG, "normscore: no count, can not normalize %s ", score );
-                 wrbuf_printf(w,"plotline: %d %d %d %d %d %d # %s %s\n",
-                                 norm->num, bestrecord->position,
-                                 tfrel, robinscore, solrscore, normscore, idbuf, title );
 -                qsort(scores, nclust, sizeof(float), sort_float );
 -                for (i = 0; i<nclust; i++) {
 -                    s += scores[i] / (i+1);
 -                    yaz_log(YLOG_LOG,"Sorted mergescore %d: %f makes %f", i, scores[i], s );
 -                    wrbuf_printf(w,"Sorted mergescore %d: %f makes %f\n", i, scores[i], s );
++                else
++                { // only one record, take the easy way out of merging
++                    mergescore = atof( score ) * 10000;
+                 }
 -                mergescore = s * 10000;
+                 wrbuf_printf(w,"mergeplot %d: x %d \n", clusternumber, mergescore );
+                 // TODO - Should not use bestrecord->position, but something from rec that
+                 // corresponds to the hit number, for plotting.
+             } // merge score
+             id = getfield(bestrecord, "id");
+             // clear the id, we only want the first numerical part
+             i=0;
+             while( id[i] >= '0' && id[i] <= '9' ) {
+                 idbuf[i] = id[i];
+                 i++;
              }
-             relevance = normscore; // ###
+             idbuf[i] = '\0';
+             
+             title = getfield(bestrecord, "title");
 -            wrbuf_printf(w,"plotline: %d %d %d %d %d %d # %s %s\n",
 -                            thisclient, bestrecord->position,
 -                            tfrel, robinscore, solrscore, mergescore, idbuf, title );
++            wrbuf_printf(w,"plotline: %d %d %d %d %d %d %d # %s %s\n",
++                            norm->num, bestrecord->position,
++                            tfrel, robinscore, solrscore, normscore, mergescore, idbuf, title );
+             relevance = mergescore;
          }
          rec->relevance_score = relevance;
      }