X-Git-Url: http://jsfdemo.indexdata.com/?a=blobdiff_plain;f=src%2Frelevance.c;h=ddf511fdc8aa1bd64a67d45d9f70a14b5748404c;hb=0ff1a97b2a69905755b9adb24a474d30f1c52150;hp=5e1fd76225ff7f9f2bb26990b353840777bf8d9b;hpb=cc29eab9f928f6cd0f4231cb2e554e2ac7b0b1f3;p=pazpar2-moved-to-github.git

diff --git a/src/relevance.c b/src/relevance.c
index 5e1fd76..ddf511f 100644
--- a/src/relevance.c
+++ b/src/relevance.c
@@ -1,4 +1,4 @@
-/* $Id: relevance.c,v 1.9 2007-04-10 08:48:56 adam Exp $
+/* $Id: relevance.c,v 1.13 2007-05-10 11:46:09 adam Exp $
    Copyright (c) 2006-2007, Index Data.
 
 This file is part of Pazpar2.
@@ -30,14 +30,25 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 #include "relevance.h"
 #include "pazpar2.h"
 
+#define USE_TRIE 0
+
 struct relevance
 {
     int *doc_frequency_vec;
     int vec_len;
+#if USE_TRIE
     struct word_trie *wt;
+#else
+    struct word_entry *entries;
+    pp2_charset_t pct;
+#endif
     NMEM nmem;
 };
 
+#if USE_TRIE
+#define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
+
+
 // We use this data structure to recognize terms in input records,
 // and map them to record term vectors for counting.
 struct word_trie
@@ -63,6 +74,7 @@ static struct word_trie *create_word_trie_node(NMEM nmem)
 static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term,
                               int num)
 {
+
     while (*term) {
         int c = tolower(*term);
         if (c < 'a' || c > 'z')
@@ -86,8 +98,6 @@ static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term,
     }
 }
 
-#define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
-
 static int word_trie_match(struct word_trie *t, const char *word, int *skipped)
 {
     int c = raw_char(tolower(*word));
@@ -128,48 +138,23 @@ static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
     return res;
 }
 
-struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs)
-{
-    struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
-    const char **p;
-    int i;
-
-    for (p = terms, i = 0; *p; p++, i++)
-        ;
-    res->vec_len = ++i;
-    res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
-    memset(res->doc_frequency_vec, 0, res->vec_len * sizeof(int));
-    res->nmem = nmem;
-    res->wt = build_word_trie(nmem, terms);
-    return res;
-}
-
-void relevance_newrec(struct relevance *r, struct record_cluster *rec)
-{
-    if (!rec->term_frequency_vec)
-    {
-        rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
-        memset(rec->term_frequency_vec, 0, r->vec_len * sizeof(int));
-    }
-}
-
 // FIXME. The definition of a word is crude here.. should support
 // some form of localization mechanism?
 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
-          const char *words, int multiplier)
+                          const char *words, int multiplier)
 {
     while (*words)
     {
        char c;
        int res;
-        int skipped;
+        int skipped = 0;
 
        while (*words && (c = raw_char(tolower(*words))) < 0)
            words++;
        if (!*words)
-            return;
-        skipped = 0;
-        if ((res = word_trie_match(r->wt, words, &skipped)))
+            break;
+        res = word_trie_match(r->wt, words, &skipped);
+        if (res)
        {
            words += skipped;
            cluster->term_frequency_vec[res] += multiplier;
@@ -183,6 +168,113 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
     }
 }
 
+#else
+
+struct word_entry {
+    const char *norm_str;
+    int termno;
+    struct word_entry *next;
+};
+
+static void add_word_entry(NMEM nmem,
+                           struct word_entry **entries,
+                           const char *norm_str,
+                           int term_no)
+{
+    struct word_entry *ne = nmem_malloc(nmem, sizeof(*ne));
+    ne->norm_str = nmem_strdup(nmem, norm_str);
+    ne->termno = term_no;
+
+    ne->next = *entries;
+    *entries = ne;
+}
+
+
+int word_entry_match(struct word_entry *entries, const char *norm_str)
+{
+    for (; entries; entries = entries->next)
+    {
+        if (!strcmp(norm_str, entries->norm_str))
+            return entries->termno;
+    }
+    return 0;
+}
+
+static struct word_entry *build_word_entries(pp2_charset_t pct, NMEM nmem,
+                                             const char **terms)
+{
+    int termno = 1; /* >0 signals THERE is an entry */
+    struct word_entry *entries = 0;
+    const char **p = terms;
+
+    for (; *p; p++)
+    {
+        pp2_relevance_token_t prt = pp2_relevance_tokenize(pct, *p);
+        const char *norm_str;
+
+        while ((norm_str = pp2_relevance_token_next(prt)))
+            add_word_entry(nmem, &entries, norm_str, termno);
+
+        pp2_relevance_token_destroy(prt);
+
+        termno++;
+    }
+    return entries;
+}
+
+void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
+                          const char *words, int multiplier)
+{
+    pp2_relevance_token_t prt = pp2_relevance_tokenize(r->pct, words);
+
+    const char *norm_str;
+
+    while ((norm_str = pp2_relevance_token_next(prt)))
+    {
+        int res = word_entry_match(r->entries, norm_str);
+        if (res)
+            cluster->term_frequency_vec[res] += multiplier;
+        cluster->term_frequency_vec[0]++;
+    }
+    pp2_relevance_token_destroy(prt);
+}
+
+#endif
+
+
+
+struct relevance *relevance_create(pp2_charset_t pct,
+                                   NMEM nmem, const char **terms, int numrecs)
+{
+    struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
+    const char **p;
+    int i;
+
+    for (p = terms, i = 0; *p; p++, i++)
+        ;
+    res->vec_len = ++i;
+    res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
+    memset(res->doc_frequency_vec, 0, res->vec_len * sizeof(int));
+    res->nmem = nmem;
+#if USE_TRIE
+    res->wt = build_word_trie(nmem, terms);
+#else
+    res->entries = build_word_entries(pct, nmem, terms);
+    res->pct = pct;
+#endif
+    return res;
+}
+
+void relevance_newrec(struct relevance *r, struct record_cluster *rec)
+{
+    if (!rec->term_frequency_vec)
+    {
+        rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
+        memset(rec->term_frequency_vec, 0, r->vec_len * sizeof(int));
+    }
+}
+
+
 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
 {
     int i;
@@ -231,7 +323,17 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
         if (!rel->doc_frequency_vec[i])
             idfvec[i] = 0;
         else
-            idfvec[i] = log((float) rel->doc_frequency_vec[0] / rel->doc_frequency_vec[i]);
+        {
+            // This conditional may be terribly wrong
+            // It was there to address the situation where vec[0] == vec[i]
+            // which leads to idfvec[i] == 0... not sure about this
+            // Traditional TF-IDF may assume that a word that occurs in every
+            // record is irrelevant, but this is actually something we will
+            // see a lot
+            if ((idfvec[i] = log((float) rel->doc_frequency_vec[0] /
+                             rel->doc_frequency_vec[i])) < 0.0000001)
+                idfvec[i] = 1;
+        }
     }
     // Calculate relevance for each document
     for (i = 0; i < reclist->num_records; i++)
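For reference, below is a minimal standalone sketch of the word_entry scheme the patch switches to when USE_TRIE is 0: each query term is tokenized into normalized words that go into a flat linked list, and every word of an incoming record is then looked up in that list to bump the matching term's slot in the term frequency vector (slot 0 counts all words). The tokenize_next() helper and the malloc-based allocation are hypothetical stand-ins for the pp2_charset tokenizer (pp2_relevance_tokenize / pp2_relevance_token_next) and the NMEM allocator, neither of which is defined in this diff; the stand-in merely lowercases ASCII and splits on non-alphanumerics, whereas the real tokenizer presumably applies the configured charset normalization.

    /* Standalone sketch of the word_entry matching scheme (not the real
     * pazpar2 code): tokenize_next() stands in for the pp2_charset
     * tokenizer, malloc/strcpy stand in for NMEM. */
    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct word_entry {
        char *norm_str;            /* normalized form of one query token */
        int termno;                /* 1-based query term number (0 = no match) */
        struct word_entry *next;
    };

    /* Pull the next lowercased token out of *s; returns NULL when exhausted. */
    static char *tokenize_next(const char **s, char *buf, size_t buflen)
    {
        size_t n = 0;
        while (**s && !isalnum((unsigned char) **s))
            (*s)++;
        if (!**s)
            return NULL;
        while (**s && isalnum((unsigned char) **s) && n + 1 < buflen)
            buf[n++] = (char) tolower((unsigned char) *(*s)++);
        buf[n] = '\0';
        return buf;
    }

    /* Build the entry list: one entry per normalized token of each query term. */
    static struct word_entry *build_entries(const char **terms)
    {
        struct word_entry *entries = NULL;
        int termno = 1;
        for (; *terms; terms++, termno++)
        {
            const char *p = *terms;
            char buf[64], *tok;
            while ((tok = tokenize_next(&p, buf, sizeof buf)))
            {
                struct word_entry *ne = malloc(sizeof *ne);
                ne->norm_str = malloc(strlen(tok) + 1);
                strcpy(ne->norm_str, tok);
                ne->termno = termno;
                ne->next = entries;
                entries = ne;
            }
        }
        return entries;
    }

    /* Linear scan, like word_entry_match() in the patch. */
    static int entry_match(struct word_entry *entries, const char *norm_str)
    {
        for (; entries; entries = entries->next)
            if (!strcmp(norm_str, entries->norm_str))
                return entries->termno;
        return 0;
    }

    int main(void)
    {
        const char *terms[] = { "digital library", "preservation", NULL };
        struct word_entry *entries = build_entries(terms);
        int tf[3] = { 0, 0, 0 };   /* slot 0 = total word count, like vec[0] */

        const char *record = "Digital preservation in the research LIBRARY";
        const char *p = record;
        char buf[64], *tok;
        while ((tok = tokenize_next(&p, buf, sizeof buf)))
        {
            int t = entry_match(entries, tok);
            if (t)
                tf[t]++;
            tf[0]++;
        }
        printf("words=%d term1=%d term2=%d\n", tf[0], tf[1], tf[2]);
        return 0;
    }

Compared with the word_trie kept under #if USE_TRIE, this is a linear strcmp() scan per token rather than a per-character trie walk, but it means query terms and record words can be run through the same tokenizer before comparison.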
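The final hunk is the ranking tweak described by the in-code comment: with the plain formula idfvec[i] = log(doc_frequency_vec[0] / doc_frequency_vec[i]), a term occurring in every record gives vec[0] == vec[i] and therefore an IDF of log(1) = 0, wiping out that term's contribution entirely, so the patch clamps anything below 0.0000001 up to 1. A toy illustration with made-up counts (doc_frequency_vec[0] appears to track the total record count; here 100 records, one term occurring in all of them, another in only 10):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical counts, not from the patch: vec[0] = number of records,
         * vec[1] = records containing term 1, vec[2] = records containing term 2. */
        int doc_frequency_vec[] = { 100, 100, 10 };
        int i;

        for (i = 1; i < 3; i++)
        {
            float idf = log((float) doc_frequency_vec[0] / doc_frequency_vec[i]);
            if (idf < 0.0000001)   /* the patch's clamp: log(100/100) == 0 */
                idf = 1;
            printf("term %d: idf = %f\n", i, idf);
        }
        return 0;
    }

This prints idf = 1.000000 for the ubiquitous term and idf = 2.302585 for the rarer one, so the ubiquitous term still contributes a weight of 1 instead of vanishing, which is what the comment means by not treating a word that occurs in every record as irrelevant.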