From f6300536016759df5f7d5279bcceaba2e87f3f6e Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Mon, 24 Sep 2012 19:28:54 +0200 Subject: [PATCH] Change semantics of rank lead. --- NEWS | 7 ++++--- src/pazpar2_config.c | 4 ++-- src/pazpar2_config.h | 2 +- src/relevance.c | 29 +++++++++++++++-------------- src/relevance.h | 2 +- 5 files changed, 23 insertions(+), 21 deletions(-) diff --git a/NEWS b/NEWS index 0611b78..a73f2f9 100644 --- a/NEWS +++ b/NEWS @@ -4,9 +4,10 @@ occur next to each other; number-1 if they are one term apart , .. 0 if they are number a part (all in order). Default is 0 (following terms has no effect). -Rank tweak: lead=number will increase mult by number if term is first -term in field, number-1 if second, ... 0 if term is at offset -number of more. Default value is 0 (position irrelevant). +Rank tweak: lead=k will divide mult by 1 + log2(1+k*l) where k is the +value given by lead and l is the offset from the beginning of the field +where the term occurs (l=0 for first term, l=1 for second term, ..). +Default value of k is 0.0 (position irrelevant). Rank tweak: length=strategy. 
length="linear" if mult is to be divided by length (existing, default behavior), length="log" if mult is to be diff --git a/src/pazpar2_config.c b/src/pazpar2_config.c index 73176b8..b45d825 100644 --- a/src/pazpar2_config.c +++ b/src/pazpar2_config.c @@ -135,7 +135,7 @@ struct conf_service *service_init(struct conf_server *server, service->rank_cluster = 1; service->rank_debug = 0; service->rank_follow = 0; - service->rank_lead = 0; + service->rank_lead = 0.0; service->rank_length = 2; service->charsets = 0; @@ -658,7 +658,7 @@ static struct conf_service *service_create_static(struct conf_server *server, } if (rank_lead) { - service->rank_lead = atoi(rank_lead); + service->rank_lead = atof(rank_lead); } if (rank_length) { diff --git a/src/pazpar2_config.h b/src/pazpar2_config.h index af416a7..86900fc 100644 --- a/src/pazpar2_config.h +++ b/src/pazpar2_config.h @@ -119,7 +119,7 @@ struct conf_service int rank_cluster; int rank_debug; int rank_follow; - int rank_lead; + double rank_lead; int rank_length; char *default_sort; diff --git a/src/relevance.c b/src/relevance.c index a002a7b..10e8cc4 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -37,7 +37,7 @@ struct relevance pp2_charset_token_t prt; int rank_cluster; int follow_boost; - int lead_boost; + double lead_decay; int length_divide; NMEM nmem; }; @@ -51,8 +51,9 @@ struct word_entry { struct word_entry *next; }; -static int word_entry_match(struct relevance *r, const char *norm_str, - const char *rank, int *mult) +static struct word_entry *word_entry_match(struct relevance *r, + const char *norm_str, + const char *rank, int *mult) { int i = 1; struct word_entry *entries = r->entries; @@ -79,7 +80,7 @@ static int word_entry_match(struct relevance *r, const char *norm_str, { e_follow->follow_boost = extra--; } - return entries->termno; + return entries; } entries->follow_boost = 0; } @@ -93,7 +94,7 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster, int *mult = 
r->term_frequency_vec_tmp; const char *norm_str; int i, length = 0; - int lead_mult = r->lead_boost; + double lead_decay = r->lead_decay; struct word_entry *e; WRBUF w = cluster->relevance_explain1; @@ -108,14 +109,14 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster, while ((norm_str = pp2_charset_token_next(r->prt))) { int local_mult = 0; - int res = word_entry_match(r, norm_str, rank, &local_mult); - if (res) + e = word_entry_match(r, norm_str, rank, &local_mult); + if (e) { + int res = e->termno; assert(res < r->vec_len); - mult[res] += local_mult + lead_mult; + mult[res] += local_mult / (1 + log2(1 + lead_decay * length)); + wrbuf_printf(w, "%s: mult[%d] += local_mult(%d) / (1+log2(1+lead_decay(%f) * length(%d)));\n", e->display_str, res, local_mult, lead_decay, length); } - if (lead_mult > 0) - --lead_mult; length++; } @@ -123,8 +124,8 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster, { if (length == 0 || mult[i] == 0) continue; - wrbuf_printf(w, "%s: field=%s vecf[%d] += mult(%d)", - e->display_str, name, i, mult[i]); + wrbuf_printf(w, "%s: field=%s vecf[%d] += mult[%d](%d)", + e->display_str, name, i, i, mult[i]); switch (r->length_divide) { case 0: @@ -193,7 +194,7 @@ static void pull_terms(struct relevance *res, struct ccl_rpn_node *n) struct relevance *relevance_create_ccl(pp2_charset_fact_t pft, struct ccl_rpn_node *query, int rank_cluster, - int follow_boost, int lead_boost, + int follow_boost, double lead_decay, int length_divide) { NMEM nmem = nmem_create(); @@ -205,7 +206,7 @@ struct relevance *relevance_create_ccl(pp2_charset_fact_t pft, res->vec_len = 1; res->rank_cluster = rank_cluster; res->follow_boost = follow_boost; - res->lead_boost = lead_boost; + res->lead_decay = lead_decay; res->length_divide = length_divide; res->prt = pp2_charset_token_create(pft, "relevance"); diff --git a/src/relevance.h b/src/relevance.h index f585899..8b868bc 100644 --- a/src/relevance.h +++ 
b/src/relevance.h @@ -31,7 +31,7 @@ struct reclist; struct relevance *relevance_create_ccl(pp2_charset_fact_t pft, struct ccl_rpn_node *query, int rank_cluster, int follow_boost, - int lead_boost, int length_divide); + double lead_decay, int length_divide); void relevance_destroy(struct relevance **rp); void relevance_newrec(struct relevance *r, struct record_cluster *cluster); void relevance_countwords(struct relevance *r, struct record_cluster *cluster, -- 1.7.10.4