test of least-square normalizing
author: Heikki Levanto <heikki@indexdata.dk>
Thu, 12 Dec 2013 14:39:16 +0000 (15:39 +0100)
committer: Heikki Levanto <heikki@indexdata.dk>
Thu, 12 Dec 2013 14:39:16 +0000 (15:39 +0100)
Just a Perl script to normalize the numbers I got from the other tests,
and to plot them. Run ./fit.pl *.in for a nice plot.

heikki/fitting/fit.pl [new file with mode: 0755]
heikki/fitting/os-potter.in [new file with mode: 0644]
heikki/fitting/os-water_or.in [new file with mode: 0644]
heikki/fitting/os-zen.in [new file with mode: 0644]
heikki/fitting/primo1.in [new file with mode: 0644]
heikki/fitting/primo2.in [new file with mode: 0644]
heikki/fitting/primo3.in [new file with mode: 0644]
heikki/fitting/solr.4.in [new file with mode: 0644]
heikki/fitting/solr.5.in [new file with mode: 0644]
heikki/fitting/solr.6.in [new file with mode: 0644]
src/relevance.c

diff --git a/heikki/fitting/fit.pl b/heikki/fitting/fit.pl
new file mode 100755 (executable)
index 0000000..913eca2
--- /dev/null
@@ -0,0 +1,137 @@
+#!/usr/bin/perl -w
+# fit.pl - experiments in curve fitting
+# for pazpar2 ranking normalization
+
+# We have a number of data points (position, score) from
+# different sources. The task is to normalize them so that
+# they all fall near the curve y=1/p, where p is the position.
+# This is done by adjusting the ranks R so that Rn = a*R + b.
+# We need to find parameters a,b that minimize the sum of the
+# squared differences from y=1/p (reported as chisq).
+
+use strict;
+use warnings;
+
+# State shared by all the processed files
+my $plotnr = 1;   # sequence number of the next tmp file for plotting
+my $plotcmd = ""; # the plot clauses for gnuplot, one per data file
+
+# Squared distance between a normalized rank and the target curve 1/p
+# Params
+#  position = position in the result list (x), must not be zero
+#  rank     = raw rank, not normalized
+#  scale, offset = normalizing params (a and b in Rn = a*R + b)
+# Returns (a*R + b - 1/p) squared
+sub diff {
+    my ( $position, $rank, $scale, $offset ) = @_;
+    # normalized rank minus the target value at this position
+    my $residual = ( $rank * $scale + $offset ) - ( 1.0 / $position );
+    return $residual * $residual;
+}
+
+# Sum of the squared differences from 1/p over all the data points
+# Params
+#  data = ref to the data array, indexed by position starting from 1
+#  scale, offset = normalizing params (a and b in Rn = a*R + b)
+sub _sumsq {
+    my ( $data, $scale, $offset ) = @_;
+    my $sum = 0.0;
+    for my $p ( 1 .. $#{$data} ) {
+        $sum += diff( $p, $data->[$p], $scale, $offset );
+    }
+    return $sum;
+}
+
+# Read and process one data file
+# Just one float number per line, nothing else. The first line of the
+# file is used as the plot title, and lines that do not start with a
+# digit are skipped.
+# Fits a and b so that a*R+b follows 1/position, writes the normalized
+# points into /tmp/plot.N.data and appends a plot clause to $plotcmd.
+# A file without usable data (no points, or a first value of zero) is
+# reported with a warning and skipped.
+# Dies if the data file can not be read or the plot file not written.
+sub onefile {
+    my $fn = shift;
+    my @d;     # raw scores, $d[$p] is the score at position $p (from 1)
+    my $n = 1; # index of the next data point
+    my $title; # first line of the file, whatever it is
+    # 3-arg open, so the name is never taken as a mode or a pipe
+    open my $in, '<', $fn or die "Could not open $fn: $!\n";
+    while ( my $line = <$in> ) {
+        chomp $line;
+        $title = $line unless defined($title);
+        next unless $line =~ /^[0-9]/; # skip comments etc
+        $d[$n++] = 1.0 * $line;
+    }
+    close $in;
+
+    my $count = $n - 1; # $n is one past the last point
+    if ( $count < 1 ) {
+        warn "$fn: no data points, skipping\n";
+        return;
+    }
+    my $first = $d[1];
+    my $last = $d[$count];
+    if ( $first == 0 ) {
+        # the initial guess for a would divide by zero
+        warn "$fn: first value is zero, can not normalize, skipping\n";
+        return;
+    }
+    $title =~ s/^[# ]+//; # clean the '#' and leading space
+    print "$fn: '$title' $count points: $first - $last \n";
+
+    # Initial guess Rn = a*R + b
+    # NOTE(review): b starts at -last and its step is derived from it,
+    # so b is never adjusted when the last score is 0. Left as it was,
+    # confirm whether that is intended.
+    my $scale = 1.0 / $first;
+    my $offset = - $last;
+    # step sizes for a and b
+    my $da = $scale / 3;
+    my $db = - $offset / 3;
+    my $iteration = 0;
+    my $prev = 0.0;
+    while (1) {
+        $iteration++;
+        # 5 sums: at (a,b) (a+,b), (a-,b), (a,b+), (a,b-)
+        my $sab = _sumsq( \@d, $scale, $offset );
+        my $sap = _sumsq( \@d, $scale + $da, $offset );
+        my $sam = _sumsq( \@d, $scale - $da, $offset );
+        my $sbp = _sumsq( \@d, $scale, $offset + $db );
+        my $sbm = _sumsq( \@d, $scale, $offset - $db );
+        my $dif = $sab - $prev;
+        # stop when both steps are small compared to the values, when
+        # the sum no longer changes, or after 100 rounds at the latest
+        if ( (abs($da) < abs($scale)/100.0 && abs($db) < abs($offset)/100.0) ||
+             ($iteration >= 100 ) ||
+             (abs($dif) < 0.00001 ) ) {
+            print "it-$iteration: a=$scale +- $da   b=$offset +- $db chisq=$sab dif=$dif\n";
+            last;
+        }
+        $prev = $sab;
+        # adjust a: step towards the better side, or halve the step
+        if ( $sap < $sab && $sap < $sam ) {
+            $scale += $da;
+        } elsif ( $sam < $sab && $sam < $sap ) {
+            $scale -= $da;
+        } else {
+            $da = $da / 2;
+        }
+        $da = $da * 0.99;
+        # adjust b the same way, using the sums from before a was moved
+        if ( $sbp < $sab && $sbp < $sbm ) {
+            $offset += $db;
+        } elsif ( $sbm < $sab && $sbm < $sbp ) {
+            $offset -= $db;
+        } else {
+            $db = $db / 2;
+        }
+        $db = $db * 0.99;
+    }
+
+    # plot the file
+    my $pf = "/tmp/plot.$plotnr.data";
+    $plotnr++;
+    open my $out, '>', $pf or die "Could not open plot file $pf: $!\n";
+    for my $p ( 1 .. $count ) {
+        my $rn = $d[$p] * $scale + $offset;
+        print {$out} "$p $rn\n";
+    }
+    # buffered write errors only show up at close
+    close $out or die "Could not write plot file $pf: $!\n";
+
+    # the title goes inside a double-quoted gnuplot string, so quotes
+    # and backslashes in it must be escaped
+    ( my $gptitle = $title ) =~ s/(["\\])/\\$1/g;
+    $plotcmd .= "," if ($plotcmd);
+    $plotcmd .= "\"$pf\" using 1:2 with points title \"$gptitle\"";
+    return;
+}
+
+# main
+# Process every file named on the command line, then feed the
+# collected plot clauses to gnuplot, which writes plot.png
+
+if ( !@ARGV ) {
+    die "Need at least one file to plot\n";
+}
+# Loop over the list itself, so that a file named "0" is not taken
+# as the end of the arguments
+for my $file (@ARGV) {
+    onefile($file);
+}
+# an empty plot command would only produce a gnuplot syntax error
+die "Nothing to plot\n" unless ($plotcmd);
+
+my $cmd =
+    "set term png\n" .
+    "set out \"plot.png\" \n" .
+    "plot $plotcmd \n";
+
+print "$cmd \n";
+
+open my $gp, '|-', 'gnuplot'
+    or die "Could not open a pipe to gnuplot: $!\n";
+print {$gp} $cmd;
+# closing a pipe waits for the command; it fails with $! set if the
+# close itself failed, or with the exit status in $? otherwise
+close $gp
+    or die( $! ? "Could not close the pipe to gnuplot: $!\n"
+               : "gnuplot exited with status " . ( $? >> 8 ) . "\n" );
\ No newline at end of file
diff --git a/heikki/fitting/os-potter.in b/heikki/fitting/os-potter.in
new file mode 100644 (file)
index 0000000..fc77330
--- /dev/null
@@ -0,0 +1,99 @@
+# OpenSearch: Harry Potter
+35632
+6386
+39669
+62696
+62696
+32809
+32809
+39669
+55836
+39669
+55836
+62696
+41044
+39544
+41976
+49630
+49630
+50795
+6043
+34662
+34506
+14020
+6825
+6825
+34506
+11982
+12767
+27727
+2452
+11077
+31873
+32809
+30702
+35632
+35632
+1252
+58113
+16620
+24931
+37956
+34031
+38090
+5895
+32809
+39669
+32809
+39669
+30702
+35632
+6825
+13456
+0
+3021
+37548
+11876
+45461
+43659
+10559
+5538
+6386
+13285
+34762
+34762
+37584
+0
+59435
+40863
+41406
+37300
+32439
+32370
+0
+63142
+11535
+47107
+28198
+50795
+20776
+32809
+29717
+32809
+27727
+35353
+10885
+30702
+30756
+27796
+27727
+34363
+37369
+32439
+30794
+36301
+37369
+104072
+13650
+5767
+0
diff --git a/heikki/fitting/os-water_or.in b/heikki/fitting/os-water_or.in
new file mode 100644 (file)
index 0000000..21003d0
--- /dev/null
@@ -0,0 +1,101 @@
+# OpenSearch: Water or Fire or Ice
+1072620
+1072620
+227252
+953170
+0
+190130
+132687
+357539
+41227
+653127
+182406
+0
+342992
+186997
+32487
+158852
+265375
+103032
+190130
+295437
+244746
+357539
+124766
+504118
+0
+158861
+254715
+0
+0
+0
+0
+23663
+11977
+11977
+286400
+26359
+154715
+25311
+0
+82271
+126715
+91603
+286400
+254715
+38714
+38714
+25311
+82747
+0
+278475
+91514
+161559
+161559
+161559
+161559
+161559
+0
+293315
+91603
+20613
+21422
+20982
+27976
+73574
+86354
+64764
+225581
+0
+35025
+138314
+138314
+25311
+108300
+238359
+253717
+0
+32468
+0
+0
+0
+0
+0
+16234
+0
+0
+26359
+0
+50624
+284612
+12720
+13988
+0
+0
+59792
+0
+0
+0
+83929
+0
+0
diff --git a/heikki/fitting/os-zen.in b/heikki/fitting/os-zen.in
new file mode 100644 (file)
index 0000000..559faff
--- /dev/null
@@ -0,0 +1,9 @@
+# OpenSearch: Zen and motorcycle
+949202
+413772
+59799
+105466
+17462
+64071
+0
+0
diff --git a/heikki/fitting/primo1.in b/heikki/fitting/primo1.in
new file mode 100644 (file)
index 0000000..667c1f9
--- /dev/null
@@ -0,0 +1,101 @@
+#primo-1
+0.20756114
+0.13844302
+0.10148811
+0.10148811
+0.0888021
+0.0888021
+0.0888021
+0.0888021
+0.0888021
+0.0888021
+0.0888021
+0.0888021
+0.06343007
+0.06343007
+0.06343007
+0.054605015
+0.050744057
+0.050744057
+0.04440105
+0.04440105
+0.04440105
+0.04440105
+0.04288424
+0.038058043
+0.038058043
+0.038058043
+0.038058043
+0.038058043
+0.033645514
+0.032335546
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.031715035
+0.03139628
+0.03019823
+0.025889121
+0.02537203
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
+0.025372028
diff --git a/heikki/fitting/primo2.in b/heikki/fitting/primo2.in
new file mode 100644 (file)
index 0000000..7ef4dde
--- /dev/null
@@ -0,0 +1,101 @@
+#primo-2
+0.20761509
+0.02813717
+0.013241022
+0.013241022
+0.011585894
+0.011585894
+0.011585894
+0.011585894
+0.011585894
+0.011585894
+0.011585894
+0.011585894
+0.009792839
+0.008275638
+0.008275638
+0.008275638
+0.008137711
+0.0077239294
+0.0077239294
+0.0077239294
+0.0066707116
+0.006620511
+0.006620511
+0.00641362
+0.0060688015
+0.005792947
+0.005792947
+0.005792947
+0.005792947
+0.005792947
+0.0053102016
+0.005189515
+0.004965385
+0.0049653836
+0.0049653836
+0.0049653836
+0.0049653836
+0.004827456
+0.004827456
+0.004787209
+0.004758492
+0.004593435
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004137819
+0.004096232
+0.0039643953
+0.0038964467
+0.0038964467
+0.0038921367
+0.003827483
+0.003793001
+0.0037412783
+0.0036817277
+0.0035235784
+0.0034085026
+0.003379219
+0.003379219
+0.0033444054
+0.0033102573
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
+0.0033102555
diff --git a/heikki/fitting/primo3.in b/heikki/fitting/primo3.in
new file mode 100644 (file)
index 0000000..9b80bf6
--- /dev/null
@@ -0,0 +1,101 @@
+#primo-3
+0.9688704
+0.48564208
+0.4844352
+0.34602517
+0.24282716
+0.2422176
+0.2422176
+0.17301258
+0.13841006
+0.13841006
+0.13841006
+0.08650733
+0.06920503
+0.0605544
+0.0605544
+0.04555319
+0.043436013
+0.04338245
+0.043360904
+0.043360904
+0.043360904
+0.043360904
+0.043360904
+0.04333935
+0.04333935
+0.04329782
+0.04328929
+0.043283623
+0.036257647
+0.034602515
+0.034602515
+0.034602515
+0.032769855
+0.031989623
+0.030320304
+0.0302772
+0.0302772
+0.0302772
+0.0302772
+0.0302772
+0.0302772
+0.0302772
+0.0302772
+0.0302772
+0.0302772
+0.0302772
+0.0302772
+0.030199146
+0.029892702
+0.02957932
+0.028755857
+0.026976993
+0.026775088
+0.026775088
+0.026708232
+0.026482046
+0.026482044
+0.026206192
+0.026116762
+0.025731273
+0.025316
+0.024804201
+0.024745526
+0.024551062
+0.024409074
+0.024283035
+0.024211751
+0.023766499
+0.023736173
+0.023624163
+0.023554381
+0.023495784
+0.023482127
+0.02325794
+0.023171788
+0.023171788
+0.023115499
+0.02302765
+0.022964898
+0.022964898
+0.022910973
+0.02275256
+0.02262008
+0.022522116
+0.022344224
+0.022237735
+0.022181446
+0.022108069
+0.022106145
+0.02174285
+0.021742849
+0.021723554
+0.021723554
+0.021666314
+0.021626573
+0.021626573
+0.021626573
+0.021568384
+0.021453962
+0.021378037
diff --git a/heikki/fitting/solr.4.in b/heikki/fitting/solr.4.in
new file mode 100644 (file)
index 0000000..3838e38
--- /dev/null
@@ -0,0 +1,60 @@
+#solr-4
+23010
+21476
+21256
+21089
+20581
+20351
+20351
+20351
+20351
+20351
+20089
+20089
+20089
+19555
+19525
+19525
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19404
+19358
+19358
+19358
+18975
+18975
+18975
+18975
+18975
+18975
+18975
+18975
+18975
+18975
+18975
+18975
+18788
+18788
+18788
+18788
+18788
+18788
+18788
+18788
+18788
diff --git a/heikki/fitting/solr.5.in b/heikki/fitting/solr.5.in
new file mode 100644 (file)
index 0000000..219b7fe
--- /dev/null
@@ -0,0 +1,20 @@
+#solr-5
+21694
+20581
+20581
+20581
+20581
+20351
+20293
+20293
+20293
+20293
+20293
+20293
+20293
+20293
+20293
+20293
+19404
+19404
+19404
diff --git a/heikki/fitting/solr.6.in b/heikki/fitting/solr.6.in
new file mode 100644 (file)
index 0000000..2925d58
--- /dev/null
@@ -0,0 +1,23 @@
+#solr-6
+22137
+20581
+20119
+19525
+19358
+19358
+19358
+18788
+18788
+18788
+18788
+18788
+18788
+18788
+18788
+18788
+18788
+18788
+18788
+18788
+18788
+18788
index 1f3eb28..377cdb7 100644 (file)
@@ -53,7 +53,7 @@ struct relevance
 // Structure to keep data for normalizing scores from one client
 struct normalizing
 {
-    int num;
+    int num; // number of the client
     float sum;
     float max;
     int count;
@@ -612,7 +612,7 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
             wrbuf_printf(w,"plotline: %d %d %d %d %d %d %d # %s %s\n",
                             norm->num, bestrecord->position,
                             tfrel, robinscore, solrscore, normscore, mergescore, idbuf, title );
-            relevance = normscore;
+            relevance = solrscore;
         }
         rec->relevance_score = relevance;
     }