1 /* $Id: zvrank.c,v 1.21 2006-05-10 08:13:26 adam Exp $
2 Copyright (C) 1995-2005
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
24 Zvrank: an experimental ranking algorithm. See doc/zvrank.txt and
25 source in index/zvrank.c. Enable this by using rank: zvrank in zebra.cfg.
26 Contributed by Johannes Leveling <Johannes.Leveling at
30 /* Zebra Vector Space Model RANKing
32 ** six (seven) letter identifier for weighting scheme
33 ** best document weighting:
34 ** tfc nfc (tpc npc) [original naming]
35 ** ntc atc npc apc [SMART naming, used here]
36 ** best query weighting:
37 ** nfx tfx bfx (npx tpx bpx) [original naming]
38 ** atn ntn btn apn npn bpn [SMART naming]
39 ** -> should set zvrank.weighting-scheme to one of
40 ** "ntc-atn", "atc-atn", etc.
44 #include <math.h> /* for log */
57 static int log_level = 0;
58 static int log_initialized = 0;
60 static double blog(double x) {
61 /* log_2, log_e or log_10 is used, best to change it here if necessary */
64 return log(x); /* / log(base) */
69 struct rank_class_info {
70 char rscheme[8]; /* name of weighting scheme */
74 struct rs_info { /* for result set */
75 int db_docs; /* number of documents in database (collection) */
76 int db_terms; /* number of distinct terms in database (debugging?) */
77 int db_f_max; /* maximum of f_t in database (debugging?) */
78 char *db_f_max_str; /* string (most frequent term) - for debugging */
80 char rscheme[8]; /* name of weighting scheme */
84 void (*d_tf_fct)(void *, void *); /* doc term frequency function */
85 void (*d_idf_fct)(void *, void *); /* doc idf function */
86 void (*d_norm_fct)(void *, void *); /* doc normalization function */
88 void (*q_tf_fct)(void *, void *); /* query term frequency function */
89 void (*q_idf_fct)(void *, void *); /* query idf function */
90 void (*q_norm_fct)(void *, void *); /* query normalization function */
92 double (*sim_fct)(void *, void *); /* similarity function (scoring function) */
96 typedef struct rs_info *RS;
98 static void prn_rs(RS rs) { /* for debugging */
99 yaz_log(log_level, "* RS:");
100 yaz_log(log_level, " db_docs: %d", rs->db_docs);
101 yaz_log(log_level, " db_terms: %d", rs->db_terms);
102 yaz_log(log_level, " f_max: %d", rs->db_f_max);
103 yaz_log(log_level, " f_max_str: %s", rs->db_f_max_str);
104 yaz_log(log_level, " veclen: %d", rs->veclen);
105 /* rscheme implies functions */
106 yaz_log(log_level, " rscheme: %s", rs->rscheme);
110 struct ds_info { /* document info */
111 char *docid; /* unique doc identifier */
112 int docno; /* doc number */
113 int doclen; /* document length */
114 int d_f_max; /* maximum number of any term in doc (needed) */
115 char *d_f_max_str; /* most frequent term in d - for debugging */
116 int veclen; /* vector length */
117 struct ts_info *terms;
118 double docsim; /* similarity in [0, ..., 1] (= score/1000) */
120 typedef struct ds_info* DS;
123 static void prn_ds(DS ds) { /* for debugging */
124 yaz_log(log_level, " * DS:");
125 yaz_log(log_level, " docid: %s", ds->docid);
126 yaz_log(log_level, " docno: %d", ds->docno);
127 yaz_log(log_level, " doclen: %d", ds->doclen);
128 yaz_log(log_level, " d_f_max: %d", ds->d_f_max);
129 yaz_log(log_level, " d_f_max_str:%s", ds->d_f_max_str);
130 yaz_log(log_level, " veclen: %d", ds->veclen);
135 struct ts_info { /* term info */
145 typedef struct ts_info *TS;
148 static void prn_ts(TS ts) { /* for debugging */
149 yaz_log(log_level, " * TERM:%s gocc:%d locc:%d tf:%f idf:%f wt:%f",
150 ts->name, ts->gocc, ts->locc, ts->tf, ts->idf, ts->wt);
160 ** weighting functions
161 ** check: RS is not needed anymore
164 /* calculate and store new term frequency vector */
165 static void tf_none(void *rsi, void *dsi) {
168 /* no conversion. 1 <= tf */
170 for (i=0; i < veclen; i++) {
171 freq=ds->terms[i].locc;
172 ds->terms[i].tf=freq;
177 static void tf_binary(void *rsi, void *dsi) {
182 for (i=0; i < veclen; i++) {
183 freq=ds->terms[i].locc;
192 static void tf_max_norm(void *rsi, void *dsi) {
196 /* divide each term by max, so 0 <= tf <= 1 */
197 tf_max=ds->d_f_max; /* largest frequency of t in document */
199 for (i=0; i < veclen; i++) {
200 freq=ds->terms[i].locc;
203 ds->terms[i].tf=freq/tf_max;
210 static void tf_aug_norm(void *rsi, void *dsi) {
215 /* augmented normalized tf. 0.5 <= tf <= 1 for K = 0.5 */
216 tf_max=ds->d_f_max; /* largest frequency of t in document */
218 K=0.5; /* zvrank.const-K */
219 for (i=0; i < veclen; i++) {
220 freq=ds->terms[i].locc;
223 ds->terms[i].tf=K+(1.0-K)*(freq/tf_max);
230 static void tf_square(void *rsi, void *dsi) {
235 for (i=0; i < veclen; i++) {
236 freq=ds->terms[i].locc;
238 ds->terms[i].tf=freq*freq;
245 static void tf_log(void *rsi, void *dsi) {
250 for (i=0; i < veclen; i++) {
251 freq=ds->terms[i].locc;
253 ds->terms[i].tf=1.0+blog(freq);
260 /* calculate and store inverse document frequency vector */
261 static void idf_none(void *rsi, void *dsi) {
266 for (i=0; i < veclen; i++) {
267 ds->terms[i].idf=1.0;
272 static void idf_tfidf(void *rsi, void *dsi) {
278 /* normal tfidf weight */
280 num_docs=rs->db_docs;
281 for (i=0; i < veclen; i++) {
282 gocc=ds->terms[i].gocc;
286 idf=blog((double) (num_docs/gocc));
287 ds->terms[i].idf=idf;
292 static void idf_prob(void *rsi, void *dsi) {
298 /* probabilistic formulation */
300 num_docs=rs->db_docs;
301 for (i=0; i < veclen; i++) {
302 gocc=ds->terms[i].gocc;
306 idf=blog((double) ((num_docs-gocc)/gocc));
307 ds->terms[i].idf=idf;
312 static void idf_freq(void *rsi, void *dsi) {
318 /* frequency formulation */
320 num_docs=rs->db_docs;
325 for (i=0; i < veclen; i++) {
326 ds->terms[i].idf=idf;
331 static void idf_squared(void *rsi, void *dsi) {
339 num_docs=rs->db_docs;
340 yaz_log(log_level, "idf_squared: db_docs required");
341 for (i=0; i < veclen; i++) {
342 gocc=ds->terms[i].gocc;
346 idf=blog(CAST_ZINT_TO_DOUBLE(num_docs/gocc));
348 ds->terms[i].idf=idf;
353 /* calculate and store normalized weight (tf-idf) vector */
354 static void norm_none(void *rsi, void *dsi) {
357 /* no normalization */
359 for (i=0; i < veclen; i++) {
360 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
365 static void norm_sum(void *rsi, void *dsi) {
371 for (i=0; i < veclen; i++) {
372 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
373 tfs+=ds->terms[i].wt;
376 for (i=0; i < veclen; i++) {
377 ds->terms[i].wt=ds->terms[i].wt/tfs;
379 /* else: tfs==0 && ds->terms[i].wt==0 */
383 static void norm_cosine(void *rsi, void *dsi) {
389 for (i=0; i < veclen; i++) {
390 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
391 tfs+=(ds->terms[i].wt*ds->terms[i].wt);
395 for (i=0; i < veclen; i++) {
396 ds->terms[i].wt=ds->terms[i].wt/tfs;
398 /* else: tfs==0 && ds->terms[i].wt==0 */
402 static void norm_fourth(void *rsi, void *dsi) {
408 for (i=0; i < veclen; i++) {
409 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
410 fr=(ds->terms[i].wt*ds->terms[i].wt);
415 for (i=0; i < veclen; i++) {
416 ds->terms[i].wt=ds->terms[i].wt/tfs;
418 /* else: tfs==0 && ds->terms[i].wt==0 */
422 static void norm_max(void *rsi, void *dsi) {
428 for (i=0; i < veclen; i++) {
429 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
430 if (ds->terms[i].wt > tfm)
434 for (i=0; i < veclen; i++) {
435 ds->terms[i].wt=ds->terms[i].wt/tfm;
437 /* else: tfs==0 && ds->terms[i].wt==0 */
441 /* FIXME add norm_pivot, ... */
443 static double sim_cosine(void *dsi1, void *dsi2) {
447 double smul=0.0, sdiv=0.0, sqr11=0.0, sqr22=0.0;
450 veclen=ds1->veclen; /* and ds2->veclen */
451 for (i=0; i < veclen; i++) {
458 sdiv=sqrt(sqr11*sqr22);
464 /* FIXME: add norm_jaccard, norm_dice, ... */
466 /* end weighting functions */
470 static void zv_init_scheme(RS rs, const char *sname) {
472 char c0, c1, c2, c3, c4, c5, c6;
473 const char *def_rscheme="ntc-atn"; /* a good default */
475 yaz_log(log_level, "zv_init_scheme");
478 yaz_log(log_level, "zvrank: invalid weighting-scheme \"%s\"", sname);
479 if (slen > 0) c0=sname[0]; else c0=def_rscheme[0];
480 if (slen > 1) c1=sname[1]; else c1=def_rscheme[1];
481 if (slen > 2) c2=sname[2]; else c2=def_rscheme[2];
483 if (slen > 4) c4=sname[4]; else c4=def_rscheme[4];
484 if (slen > 5) c5=sname[5]; else c5=def_rscheme[5];
485 if (slen > 6) c6=sname[6]; else c6=def_rscheme[6];
487 /* assign doc functions */
491 rs->d_tf_fct=tf_binary;
495 rs->d_tf_fct=tf_max_norm;
497 yaz_log(log_level, "tf_max_norm: d_f_max required");
500 rs->d_tf_fct=tf_aug_norm;
502 yaz_log(log_level, "tf_aug_norm: d_f_max required");
505 rs->d_tf_fct=tf_square;
513 rs->d_tf_fct=tf_none;
519 rs->d_idf_fct=idf_tfidf;
521 yaz_log(log_level, "idf_tfidf: db_docs required");
524 rs->d_idf_fct=idf_prob;
526 yaz_log(log_level, "idf_prob: db_docs required");
529 rs->d_idf_fct=idf_freq;
531 yaz_log(log_level, "idf_freq: db_docs required");
534 rs->d_idf_fct=idf_squared;
536 yaz_log(log_level, "idf_squared: db_docs required");
539 rs->d_idf_fct=idf_none;
545 rs->d_norm_fct=norm_sum;
549 rs->d_norm_fct=norm_cosine;
553 rs->d_norm_fct=norm_fourth;
557 rs->d_norm_fct=norm_max;
561 rs->d_norm_fct=norm_none;
565 /* assign query functions */
569 rs->q_tf_fct=tf_binary;
573 rs->q_tf_fct=tf_max_norm;
574 yaz_log(log_level, "tf_max_norm: d_f_max required");
578 rs->q_tf_fct=tf_aug_norm;
580 yaz_log(log_level, "tf_aug_norm: d_f_max required");
583 rs->q_tf_fct=tf_square;
591 rs->q_tf_fct=tf_none;
597 rs->q_idf_fct=idf_tfidf;
599 yaz_log(log_level, "idf_tfidf: db_docs required");
602 rs->q_idf_fct=idf_prob;
604 yaz_log(log_level, "idf_prob: db_docs required");
607 rs->q_idf_fct=idf_freq;
609 yaz_log(log_level, "idf_freq: db_docs required");
612 rs->q_idf_fct=idf_squared;
614 yaz_log(log_level, "idf_squared: db_docs required");
617 rs->q_idf_fct=idf_none;
623 rs->q_norm_fct=norm_sum;
627 rs->q_norm_fct=norm_cosine;
631 rs->q_norm_fct=norm_fourth;
635 rs->q_norm_fct=norm_max;
639 rs->q_norm_fct=norm_none;
643 rs->sim_fct=sim_cosine;
644 yaz_log(log_level, "zv_scheme %s", rs->rscheme);
647 static void zv_init(RS rs, const char *rscheme) {
648 yaz_log(log_level, "zv_init");
650 rs->db_docs=100000; /* assign correct value here */
651 rs->db_terms=500000; /* assign correct value here (for debugging) */
652 rs->db_f_max=50; /* assign correct value here */
653 rs->db_f_max_str="a"; /* assign correct value here (for debugging) */
654 /* FIXME - get those values from somewhere */
655 zv_init_scheme(rs, rscheme);
662 * zv_create: Creates/Initialises this rank handler. This routine is
663 * called exactly once. The routine returns the class_handle.
665 static void *zv_create (ZebraHandle zh) {
669 struct rank_class_info *ci = (struct rank_class_info *)
670 xmalloc (sizeof(*ci));
671 if (!log_initialized)
673 log_level = yaz_log_module_level("zvrank");
677 yaz_log(log_level, "zv_create");
678 wscheme= res_get_def(res, "zvrank.weighting-scheme", "");
679 for (i = 0; wscheme[i] && i < 8; i++)
680 ci->rscheme[i]=wscheme[i];
681 ci->rscheme[i] = '\0';
686 * zv_destroy: Destroys this rank handler. This routine is called
687 * when the handler is no longer needed - i.e. when the server
688 * dies. The class_handle was previously returned by create.
690 static void zv_destroy (struct zebra_register *reg, void *class_handle) {
691 struct rank_class_info *ci = (struct rank_class_info *) class_handle;
692 yaz_log(log_level, "zv_destroy");
698 * zv_begin: Prepares beginning of "real" ranking. Called once for
699 * each result set. The returned handle is a "set handle" and
700 * will be used in each of the handlers below.
702 static void *zv_begin(struct zebra_register *reg, void *class_handle,
703 RSET rset, NMEM nmem, TERMID *terms, int numterms)
705 struct rs_info *rs=(struct rs_info *)nmem_malloc(nmem,sizeof(*rs));
706 struct rank_class_info *ci=(struct rank_class_info *)class_handle;
712 yaz_log(log_level, "zv_begin");
714 zv_init(rs, ci->rscheme);
719 rs->qdoc=(struct ds_info *)nmem_malloc(nmem,sizeof(*rs->qdoc));
720 rs->qdoc->terms=(struct ts_info *)nmem_malloc(nmem,
721 sizeof(*rs->qdoc->terms)*rs->veclen);
722 rs->qdoc->veclen=veclen;
723 rs->qdoc->d_f_max=1; /* no duplicates */
724 rs->qdoc->d_f_max_str="";
726 rs->rdoc=(struct ds_info *)nmem_malloc(nmem,sizeof(*rs->rdoc));
727 rs->rdoc->terms=(struct ts_info *)nmem_malloc(nmem,
728 sizeof(*rs->rdoc->terms)*rs->veclen);
729 rs->rdoc->veclen=veclen;
730 rs->rdoc->d_f_max=10; /* just a guess */
731 rs->rdoc->d_f_max_str="";
732 /* yaz_log(log_level, "zv_begin_init"); */
733 for (i = 0; i < rs->veclen; i++)
735 gocc= rset_count(terms[i]->rset);
736 terms[i]->rankpriv=ip=nmem_malloc(nmem, sizeof(int));
737 *ip=i; /* save the index for add() */
738 /* yaz_log(log_level, "zv_begin_init i=%d gocc=%d", i, gocc); */
739 rs->qdoc->terms[i].gocc=gocc;
740 rs->qdoc->terms[i].locc=1; /* assume query has no duplicate terms */
741 rs->rdoc->terms[i].gocc=gocc;
742 rs->rdoc->terms[i].locc=0;
744 (*rs->q_tf_fct)(rs, rs->qdoc); /* we do this once only */
745 (*rs->q_idf_fct)(rs, rs->qdoc);
746 (*rs->q_norm_fct)(rs, rs->qdoc);
751 * zv_end: Terminates ranking process. Called after a result set
754 static void zv_end (struct zebra_register *reg, void *rsi)
756 yaz_log(log_level, "zv_end");
757 /* they all are nmem'd */
762 * zv_add: Called for each word occurence in a result set. This routine
763 * should be as fast as possible. This routine should "incrementally"
766 static void zv_add (void *rsi, int seqno, TERMID term) {
768 int *ip = term->rankpriv;
772 yaz_log(log_level, "zvrank zv_add seqno=%d NULL term",seqno );
775 rs->rdoc->terms[i].locc++;
776 yaz_log(log_level, "zvrank zv_add seqno=%d '%s' term_index=%d cnt=%d",
777 seqno, term->name, i, rs->rdoc->terms[i].locc );
781 * zv_calc: Called for each document in a result. This handler should
782 * produce a score based on previous call(s) to the add handler. The
783 * score should be between 0 and 1000. If score cannot be obtained
784 * -1 should be returned.
786 static int zv_calc (void *rsi, zint sysno, zint staticrank, int *stop_flag)
792 /* yaz_log(log_level, "zv_calc"); */
797 for (i = 0; i < veclen; i++) {
798 /* qdoc weight has already been calculated */
799 (*rs->d_tf_fct)(rs, rs->rdoc);
800 (*rs->d_idf_fct)(rs, rs->rdoc);
801 (*rs->d_norm_fct)(rs, rs->rdoc);
802 dscore=rs->sim_fct(rs->qdoc, rs->rdoc);
804 score = (int) (dscore * 1000 +.5);
805 yaz_log (log_level, "zv_calc: sysno=" ZINT_FORMAT " score=%d",
807 if (score > 1000) /* should not happen */
809 /* reset counts for the next record */
810 for (i = 0; i < rs->veclen; i++)
811 rs->rdoc->terms[i].locc=0;
816 * Pseudo-meta code with sequence of calls as they occur in a
817 * server. Handlers are prefixed by --:
833 static struct rank_control rank_control_vsm = {
843 struct rank_control *rank_zv_class = &rank_control_vsm;
848 * indent-tabs-mode: nil
850 * vim: shiftwidth=4 tabstop=8 expandtab