1 /* $Id: zvrank.c,v 1.19 2005-08-19 11:04:23 adam Exp $
2 Copyright (C) 1995-2005
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
24 Zvrank: an experimental ranking algorithm. See doc/zvrank.txt and
25 source in index/zvrank.c. Enable this by using rank: zvrank in zebra.cfg.
26 Contributed by Johannes Leveling <Johannes.Leveling at
30 /* Zebra Vector Space Model RANKing
32 ** six (seven) letter identifier for weighting scheme
33 ** best document weighting:
34 ** tfc nfc (tpc npc) [original naming]
35 ** ntc atc npc apc [SMART naming, used here]
36 ** best query weighting:
37 ** nfx tfx bfx (npx tpx bpx) [original naming]
38 ** atn ntn btn apn npn bpn [SMART naming]
39 ** -> should set zvrank.weighting-scheme to one of
40 ** "ntc-atn", "atc-atn", etc.
44 #include <math.h> /* for log */
56 static int log_level = 0;
57 static int log_initialized = 0;
59 static double blog(double x) {
60 /* log_2, log_e or log_10 is used, best to change it here if necessary */
63 return log(x); /* / log(base) */
68 struct rank_class_info {
69 char rscheme[8]; /* name of weighting scheme */
73 struct rs_info { /* for result set */
74 int db_docs; /* number of documents in database (collection) */
75 int db_terms; /* number of distinct terms in database (debugging?) */
76 int db_f_max; /* maximum of f_t in database (debugging?) */
77 char *db_f_max_str; /* string (most frequent term) - for debugging */
79 char rscheme[8]; /* name of weighting scheme */
83 void (*d_tf_fct)(void *, void *); /* doc term frequency function */
84 void (*d_idf_fct)(void *, void *); /* doc idf function */
85 void (*d_norm_fct)(void *, void *); /* doc normalization function */
87 void (*q_tf_fct)(void *, void *); /* query term frequency function */
88 void (*q_idf_fct)(void *, void *); /* query idf function */
89 void (*q_norm_fct)(void *, void *); /* query normalization function */
91 double (*sim_fct)(void *, void *); /* similarity function (scoring function) */
95 typedef struct rs_info *RS;
97 static void prn_rs(RS rs) { /* for debugging */
98 yaz_log(log_level, "* RS:");
99 yaz_log(log_level, " db_docs: %d", rs->db_docs);
100 yaz_log(log_level, " db_terms: %d", rs->db_terms);
101 yaz_log(log_level, " f_max: %d", rs->db_f_max);
102 yaz_log(log_level, " f_max_str: %s", rs->db_f_max_str);
103 yaz_log(log_level, " veclen: %d", rs->veclen);
104 /* rscheme implies functions */
105 yaz_log(log_level, " rscheme: %s", rs->rscheme);
109 struct ds_info { /* document info */
110 char *docid; /* unique doc identifier */
111 int docno; /* doc number */
112 int doclen; /* document length */
113 int d_f_max; /* maximum number of any term in doc (needed) */
114 char *d_f_max_str; /* most frequent term in d - for debugging */
115 int veclen; /* vector length */
116 struct ts_info *terms;
117 double docsim; /* similarity in [0, ..., 1] (= score/1000) */
119 typedef struct ds_info* DS;
122 static void prn_ds(DS ds) { /* for debugging */
123 yaz_log(log_level, " * DS:");
124 yaz_log(log_level, " docid: %s", ds->docid);
125 yaz_log(log_level, " docno: %d", ds->docno);
126 yaz_log(log_level, " doclen: %d", ds->doclen);
127 yaz_log(log_level, " d_f_max: %d", ds->d_f_max);
128 yaz_log(log_level, " d_f_max_str:%s", ds->d_f_max_str);
129 yaz_log(log_level, " veclen: %d", ds->veclen);
134 struct ts_info { /* term info */
144 typedef struct ts_info *TS;
147 static void prn_ts(TS ts) { /* for debugging */
148 yaz_log(log_level, " * TERM:%s gocc:%d locc:%d tf:%f idf:%f wt:%f",
149 ts->name, ts->gocc, ts->locc, ts->tf, ts->idf, ts->wt);
159 ** weighting functions
160 ** check: RS is not needed anymore
163 /* calculate and store new term frequency vector */
164 static void tf_none(void *rsi, void *dsi) {
167 /* no conversion. 1 <= tf */
169 for (i=0; i < veclen; i++) {
170 freq=ds->terms[i].locc;
171 ds->terms[i].tf=freq;
176 static void tf_binary(void *rsi, void *dsi) {
181 for (i=0; i < veclen; i++) {
182 freq=ds->terms[i].locc;
191 static void tf_max_norm(void *rsi, void *dsi) {
195 /* divide each term by max, so 0 <= tf <= 1 */
196 tf_max=ds->d_f_max; /* largest frequency of t in document */
198 for (i=0; i < veclen; i++) {
199 freq=ds->terms[i].locc;
202 ds->terms[i].tf=freq/tf_max;
209 static void tf_aug_norm(void *rsi, void *dsi) {
214 /* augmented normalized tf. 0.5 <= tf <= 1 for K = 0.5 */
215 tf_max=ds->d_f_max; /* largest frequency of t in document */
217 K=0.5; /* zvrank.const-K */
218 for (i=0; i < veclen; i++) {
219 freq=ds->terms[i].locc;
222 ds->terms[i].tf=K+(1.0-K)*(freq/tf_max);
229 static void tf_square(void *rsi, void *dsi) {
234 for (i=0; i < veclen; i++) {
235 freq=ds->terms[i].locc;
237 ds->terms[i].tf=freq*freq;
244 static void tf_log(void *rsi, void *dsi) {
249 for (i=0; i < veclen; i++) {
250 freq=ds->terms[i].locc;
252 ds->terms[i].tf=1.0+blog(freq);
259 /* calculate and store inverse document frequency vector */
260 static void idf_none(void *rsi, void *dsi) {
265 for (i=0; i < veclen; i++) {
266 ds->terms[i].idf=1.0;
271 static void idf_tfidf(void *rsi, void *dsi) {
277 /* normal tfidf weight */
279 num_docs=rs->db_docs;
280 for (i=0; i < veclen; i++) {
281 gocc=ds->terms[i].gocc;
285 idf=blog((double) (num_docs/gocc));
286 ds->terms[i].idf=idf;
291 static void idf_prob(void *rsi, void *dsi) {
297 /* probabilistic formulation */
299 num_docs=rs->db_docs;
300 for (i=0; i < veclen; i++) {
301 gocc=ds->terms[i].gocc;
305 idf=blog((double) ((num_docs-gocc)/gocc));
306 ds->terms[i].idf=idf;
311 static void idf_freq(void *rsi, void *dsi) {
317 /* frequency formulation */
319 num_docs=rs->db_docs;
324 for (i=0; i < veclen; i++) {
325 ds->terms[i].idf=idf;
330 static void idf_squared(void *rsi, void *dsi) {
338 num_docs=rs->db_docs;
339 yaz_log(log_level, "idf_squared: db_docs required");
340 for (i=0; i < veclen; i++) {
341 gocc=ds->terms[i].gocc;
345 idf=blog(CAST_ZINT_TO_DOUBLE(num_docs/gocc));
347 ds->terms[i].idf=idf;
352 /* calculate and store normalized weight (tf-idf) vector */
353 static void norm_none(void *rsi, void *dsi) {
356 /* no normalization */
358 for (i=0; i < veclen; i++) {
359 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
364 static void norm_sum(void *rsi, void *dsi) {
370 for (i=0; i < veclen; i++) {
371 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
372 tfs+=ds->terms[i].wt;
375 for (i=0; i < veclen; i++) {
376 ds->terms[i].wt=ds->terms[i].wt/tfs;
378 /* else: tfs==0 && ds->terms[i].wt==0 */
382 static void norm_cosine(void *rsi, void *dsi) {
388 for (i=0; i < veclen; i++) {
389 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
390 tfs+=(ds->terms[i].wt*ds->terms[i].wt);
394 for (i=0; i < veclen; i++) {
395 ds->terms[i].wt=ds->terms[i].wt/tfs;
397 /* else: tfs==0 && ds->terms[i].wt==0 */
401 static void norm_fourth(void *rsi, void *dsi) {
407 for (i=0; i < veclen; i++) {
408 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
409 fr=(ds->terms[i].wt*ds->terms[i].wt);
414 for (i=0; i < veclen; i++) {
415 ds->terms[i].wt=ds->terms[i].wt/tfs;
417 /* else: tfs==0 && ds->terms[i].wt==0 */
421 static void norm_max(void *rsi, void *dsi) {
427 for (i=0; i < veclen; i++) {
428 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
429 if (ds->terms[i].wt > tfm)
433 for (i=0; i < veclen; i++) {
434 ds->terms[i].wt=ds->terms[i].wt/tfm;
436 /* else: tfs==0 && ds->terms[i].wt==0 */
440 /* FIXME add norm_pivot, ... */
442 static double sim_cosine(void *dsi1, void *dsi2) {
446 double smul=0.0, sdiv=0.0, sqr11=0.0, sqr22=0.0;
449 veclen=ds1->veclen; /* and ds2->veclen */
450 for (i=0; i < veclen; i++) {
457 sdiv=sqrt(sqr11*sqr22);
463 /* FIXME: add norm_jaccard, norm_dice, ... */
465 /* end weighting functions */
469 static void zv_init_scheme(RS rs, const char *sname) {
471 char c0, c1, c2, c3, c4, c5, c6;
472 const char *def_rscheme="ntc-atn"; /* a good default */
474 yaz_log(log_level, "zv_init_scheme");
477 yaz_log(log_level, "zvrank: invalid weighting-scheme \"%s\"", sname);
478 if (slen > 0) c0=sname[0]; else c0=def_rscheme[0];
479 if (slen > 1) c1=sname[1]; else c1=def_rscheme[1];
480 if (slen > 2) c2=sname[2]; else c2=def_rscheme[2];
482 if (slen > 4) c4=sname[4]; else c4=def_rscheme[4];
483 if (slen > 5) c5=sname[5]; else c5=def_rscheme[5];
484 if (slen > 6) c6=sname[6]; else c6=def_rscheme[6];
486 /* assign doc functions */
490 rs->d_tf_fct=tf_binary;
494 rs->d_tf_fct=tf_max_norm;
496 yaz_log(log_level, "tf_max_norm: d_f_max required");
499 rs->d_tf_fct=tf_aug_norm;
501 yaz_log(log_level, "tf_aug_norm: d_f_max required");
504 rs->d_tf_fct=tf_square;
512 rs->d_tf_fct=tf_none;
518 rs->d_idf_fct=idf_tfidf;
520 yaz_log(log_level, "idf_tfidf: db_docs required");
523 rs->d_idf_fct=idf_prob;
525 yaz_log(log_level, "idf_prob: db_docs required");
528 rs->d_idf_fct=idf_freq;
530 yaz_log(log_level, "idf_freq: db_docs required");
533 rs->d_idf_fct=idf_squared;
535 yaz_log(log_level, "idf_squared: db_docs required");
538 rs->d_idf_fct=idf_none;
544 rs->d_norm_fct=norm_sum;
548 rs->d_norm_fct=norm_cosine;
552 rs->d_norm_fct=norm_fourth;
556 rs->d_norm_fct=norm_max;
560 rs->d_norm_fct=norm_none;
564 /* assign query functions */
568 rs->q_tf_fct=tf_binary;
572 rs->q_tf_fct=tf_max_norm;
573 yaz_log(log_level, "tf_max_norm: d_f_max required");
577 rs->q_tf_fct=tf_aug_norm;
579 yaz_log(log_level, "tf_aug_norm: d_f_max required");
582 rs->q_tf_fct=tf_square;
590 rs->q_tf_fct=tf_none;
596 rs->q_idf_fct=idf_tfidf;
598 yaz_log(log_level, "idf_tfidf: db_docs required");
601 rs->q_idf_fct=idf_prob;
603 yaz_log(log_level, "idf_prob: db_docs required");
606 rs->q_idf_fct=idf_freq;
608 yaz_log(log_level, "idf_freq: db_docs required");
611 rs->q_idf_fct=idf_squared;
613 yaz_log(log_level, "idf_squared: db_docs required");
616 rs->q_idf_fct=idf_none;
622 rs->q_norm_fct=norm_sum;
626 rs->q_norm_fct=norm_cosine;
630 rs->q_norm_fct=norm_fourth;
634 rs->q_norm_fct=norm_max;
638 rs->q_norm_fct=norm_none;
642 rs->sim_fct=sim_cosine;
643 yaz_log(log_level, "zv_scheme %s", rs->rscheme);
646 static void zv_init(RS rs, const char *rscheme) {
647 yaz_log(log_level, "zv_init");
649 rs->db_docs=100000; /* assign correct value here */
650 rs->db_terms=500000; /* assign correct value here (for debugging) */
651 rs->db_f_max=50; /* assign correct value here */
652 rs->db_f_max_str="a"; /* assign correct value here (for debugging) */
653 /* FIXME - get those values from somewhere */
654 zv_init_scheme(rs, rscheme);
661 * zv_create: Creates/Initialises this rank handler. This routine is
662 * called exactly once. The routine returns the class_handle.
664 static void *zv_create (ZebraHandle zh) {
668 struct rank_class_info *ci = (struct rank_class_info *)
669 xmalloc (sizeof(*ci));
670 if (!log_initialized)
672 log_level = yaz_log_module_level("zvrank");
676 yaz_log(log_level, "zv_create");
677 wscheme= res_get_def(res, "zvrank.weighting-scheme", "");
678 for (i = 0; wscheme[i] && i < 8; i++)
679 ci->rscheme[i]=wscheme[i];
680 ci->rscheme[i] = '\0';
685 * zv_destroy: Destroys this rank handler. This routine is called
686 * when the handler is no longer needed - i.e. when the server
687 * dies. The class_handle was previously returned by create.
689 static void zv_destroy (struct zebra_register *reg, void *class_handle) {
690 struct rank_class_info *ci = (struct rank_class_info *) class_handle;
691 yaz_log(log_level, "zv_destroy");
697 * zv_begin: Prepares beginning of "real" ranking. Called once for
698 * each result set. The returned handle is a "set handle" and
699 * will be used in each of the handlers below.
701 static void *zv_begin(struct zebra_register *reg, void *class_handle,
702 RSET rset, NMEM nmem, TERMID *terms, int numterms)
704 struct rs_info *rs=(struct rs_info *)nmem_malloc(nmem,sizeof(*rs));
705 struct rank_class_info *ci=(struct rank_class_info *)class_handle;
711 yaz_log(log_level, "zv_begin");
713 zv_init(rs, ci->rscheme);
718 rs->qdoc=(struct ds_info *)nmem_malloc(nmem,sizeof(*rs->qdoc));
719 rs->qdoc->terms=(struct ts_info *)nmem_malloc(nmem,
720 sizeof(*rs->qdoc->terms)*rs->veclen);
721 rs->qdoc->veclen=veclen;
722 rs->qdoc->d_f_max=1; /* no duplicates */
723 rs->qdoc->d_f_max_str="";
725 rs->rdoc=(struct ds_info *)nmem_malloc(nmem,sizeof(*rs->rdoc));
726 rs->rdoc->terms=(struct ts_info *)nmem_malloc(nmem,
727 sizeof(*rs->rdoc->terms)*rs->veclen);
728 rs->rdoc->veclen=veclen;
729 rs->rdoc->d_f_max=10; /* just a guess */
730 rs->rdoc->d_f_max_str="";
731 /* yaz_log(log_level, "zv_begin_init"); */
732 for (i = 0; i < rs->veclen; i++)
734 gocc= rset_count(terms[i]->rset);
735 terms[i]->rankpriv=ip=nmem_malloc(nmem, sizeof(int));
736 *ip=i; /* save the index for add() */
737 /* yaz_log(log_level, "zv_begin_init i=%d gocc=%d", i, gocc); */
738 rs->qdoc->terms[i].gocc=gocc;
739 rs->qdoc->terms[i].locc=1; /* assume query has no duplicate terms */
740 rs->rdoc->terms[i].gocc=gocc;
741 rs->rdoc->terms[i].locc=0;
743 (*rs->q_tf_fct)(rs, rs->qdoc); /* we do this once only */
744 (*rs->q_idf_fct)(rs, rs->qdoc);
745 (*rs->q_norm_fct)(rs, rs->qdoc);
750 * zv_end: Terminates ranking process. Called after a result set
753 static void zv_end (struct zebra_register *reg, void *rsi)
755 yaz_log(log_level, "zv_end");
756 /* they all are nmem'd */
761 * zv_add: Called for each word occurence in a result set. This routine
762 * should be as fast as possible. This routine should "incrementally"
765 static void zv_add (void *rsi, int seqno, TERMID term) {
767 int *ip = term->rankpriv;
771 yaz_log(log_level, "zvrank zv_add seqno=%d NULL term",seqno );
774 rs->rdoc->terms[i].locc++;
775 yaz_log(log_level, "zvrank zv_add seqno=%d '%s' term_index=%d cnt=%d",
776 seqno, term->name, i, rs->rdoc->terms[i].locc );
780 * zv_calc: Called for each document in a result. This handler should
781 * produce a score based on previous call(s) to the add handler. The
782 * score should be between 0 and 1000. If score cannot be obtained
783 * -1 should be returned.
785 static int zv_calc (void *rsi, zint sysno, zint staticrank, int *stop_flag)
791 /* yaz_log(log_level, "zv_calc"); */
796 for (i = 0; i < veclen; i++) {
797 /* qdoc weight has already been calculated */
798 (*rs->d_tf_fct)(rs, rs->rdoc);
799 (*rs->d_idf_fct)(rs, rs->rdoc);
800 (*rs->d_norm_fct)(rs, rs->rdoc);
801 dscore=rs->sim_fct(rs->qdoc, rs->rdoc);
803 score = (int) (dscore * 1000 +.5);
804 yaz_log (log_level, "zv_calc: sysno=" ZINT_FORMAT " score=%d",
806 if (score > 1000) /* should not happen */
808 /* reset counts for the next record */
809 for (i = 0; i < rs->veclen; i++)
810 rs->rdoc->terms[i].locc=0;
815 * Pseudo-meta code with sequence of calls as they occur in a
816 * server. Handlers are prefixed by --:
832 static struct rank_control rank_control_vsm = {
842 struct rank_control *rank_zv_class = &rank_control_vsm;