X-Git-Url: http://jsfdemo.indexdata.com/?a=blobdiff_plain;f=src%2Frelevance.c;h=bb081033c3e0852e0f6e89c434f32ba2211a8829;hb=f8892da7570d4e365d36d2de128cc581f5240980;hp=92f51a88b3c38cc4a1821d1ecea51b192bde557c;hpb=7eed289231a792736f9a9b500e74778e32376155;p=pazpar2-moved-to-github.git diff --git a/src/relevance.c b/src/relevance.c index 92f51a8..bb08103 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -1,7 +1,5 @@ -/* $Id: relevance.c,v 1.10 2007-04-16 13:54:55 marc Exp $ - Copyright (c) 2006-2007, Index Data. - -This file is part of Pazpar2. +/* This file is part of Pazpar2. + Copyright (C) 2006-2008 Index Data Pazpar2 is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -14,30 +12,40 @@ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Pazpar2; see the file LICENSE. If not, write to the -Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA -02111-1307, USA. - */ +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -#include -#include -#include +*/ #if HAVE_CONFIG_H -#include +#include #endif +#include +#include + #include "relevance.h" #include "pazpar2.h" +#define USE_TRIE 0 + struct relevance { int *doc_frequency_vec; int vec_len; +#if USE_TRIE struct word_trie *wt; +#else + struct word_entry *entries; + pp2_charset_t pct; +#endif NMEM nmem; }; +#if USE_TRIE +#define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1) + + // We use this data structure to recognize terms in input records, // and map them to record term vectors for counting. struct word_trie @@ -87,8 +95,6 @@ static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, } } -#define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1) - static int word_trie_match(struct word_trie *t, const char *word, int *skipped) { int c = raw_char(tolower(*word)); @@ -129,48 +135,23 @@ static struct word_trie *build_word_trie(NMEM nmem, const char **terms) return res; } -struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs) -{ - struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance)); - const char **p; - int i; - - for (p = terms, i = 0; *p; p++, i++) - ; - res->vec_len = ++i; - res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int)); - memset(res->doc_frequency_vec, 0, res->vec_len * sizeof(int)); - res->nmem = nmem; - res->wt = build_word_trie(nmem, terms); - return res; -} - -void relevance_newrec(struct relevance *r, struct record_cluster *rec) -{ - if (!rec->term_frequency_vec) - { - rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int)); - memset(rec->term_frequency_vec, 0, r->vec_len * sizeof(int)); - } -} - // FIXME. The definition of a word is crude here.. should support // some form of localization mechanism? void relevance_countwords(struct relevance *r, struct record_cluster *cluster, - const char *words, int multiplier) + const char *words, int multiplier) { while (*words) { char c; int res; - int skipped; + int skipped = 0; while (*words && (c = raw_char(tolower(*words))) < 0) words++; if (!*words) - return; - skipped = 0; - if ((res = word_trie_match(r->wt, words, &skipped))) + break; + res = word_trie_match(r->wt, words, &skipped); + if (res) { words += skipped; cluster->term_frequency_vec[res] += multiplier; @@ -184,41 +165,123 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster, } } -void relevance_donerecord(struct relevance *r, struct record_cluster *cluster) +#else + +struct word_entry { + const char *norm_str; + int termno; + struct word_entry *next; +}; + +static void add_word_entry(NMEM nmem, + struct word_entry **entries, + const char *norm_str, + int term_no) { - int i; + struct word_entry *ne = nmem_malloc(nmem, sizeof(*ne)); + ne->norm_str = nmem_strdup(nmem, norm_str); + ne->termno = term_no; + + ne->next = *entries; + *entries = ne; +} - for (i = 1; i < r->vec_len; i++) - if (cluster->term_frequency_vec[i] > 0) - r->doc_frequency_vec[i]++; - r->doc_frequency_vec[0]++; +int word_entry_match(struct word_entry *entries, const char *norm_str) +{ + for (; entries; entries = entries->next) + { + if (!strcmp(norm_str, entries->norm_str)) + return entries->termno; + } + return 0; } -#ifdef GAGA -#ifdef FLOAT_REL -static int comp(const void *p1, const void *p2) +static struct word_entry *build_word_entries(pp2_charset_t pct, NMEM nmem, + const char **terms) { - float res; - struct record **r1 = (struct record **) p1; - struct record **r2 = (struct record **) p2; - res = (*r2)->relevance - (*r1)->relevance; - if (res > 0) - return 1; - else if (res < 0) - return -1; - else - return 0; + int termno = 1; /* >0 signals THERE is an entry */ + struct word_entry *entries = 0; + const char **p = terms; + + for (; *p; p++) + { + pp2_relevance_token_t prt = pp2_relevance_tokenize(pct, *p); + const char *norm_str; + + while ((norm_str = pp2_relevance_token_next(prt))) + add_word_entry(nmem, &entries, norm_str, termno); + + pp2_relevance_token_destroy(prt); + + termno++; + } + return entries; } -#else -static int comp(const void *p1, const void *p2) + +void relevance_countwords(struct relevance *r, struct record_cluster *cluster, + const char *words, int multiplier) { - struct record_cluster **r1 = (struct record_cluster **) p1; - struct record_cluster **r2 = (struct record_cluster **) p2; - return (*r2)->relevance - (*r1)->relevance; + pp2_relevance_token_t prt = pp2_relevance_tokenize(r->pct, words); + + const char *norm_str; + + while ((norm_str = pp2_relevance_token_next(prt))) + { + int res = word_entry_match(r->entries, norm_str); + if (res) + cluster->term_frequency_vec[res] += multiplier; + cluster->term_frequency_vec[0]++; + } + pp2_relevance_token_destroy(prt); } + #endif + + + +struct relevance *relevance_create(pp2_charset_t pct, + NMEM nmem, const char **terms, int numrecs) +{ + struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance)); + const char **p; + int i; + + for (p = terms, i = 0; *p; p++, i++) + ; + res->vec_len = ++i; + res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int)); + memset(res->doc_frequency_vec, 0, res->vec_len * sizeof(int)); + res->nmem = nmem; +#if USE_TRIE + res->wt = build_word_trie(nmem, terms); +#else + res->entries = build_word_entries(pct, nmem, terms); + res->pct = pct; #endif + return res; +} + +void relevance_newrec(struct relevance *r, struct record_cluster *rec) +{ + if (!rec->term_frequency_vec) + { + rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int)); + memset(rec->term_frequency_vec, 0, r->vec_len * sizeof(int)); + } +} + + +void relevance_donerecord(struct relevance *r, struct record_cluster *cluster) +{ + int i; + + for (i = 1; i < r->vec_len; i++) + if (cluster->term_frequency_vec[i] > 0) + r->doc_frequency_vec[i]++; + + r->doc_frequency_vec[0]++; +} // Prepare for a relevance-sorted read void relevance_prepare_read(struct relevance *rel, struct reclist *reclist) @@ -232,7 +295,17 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist) if (!rel->doc_frequency_vec[i]) idfvec[i] = 0; else - idfvec[i] = log((float) rel->doc_frequency_vec[0] / rel->doc_frequency_vec[i]); + { + // This conditional may be terribly wrong + // It was there to address the situation where vec[0] == vec[i] + // which leads to idfvec[i] == 0... not sure about this + // Traditional TF-IDF may assume that a word that occurs in every + // record is irrelevant, but this is actually something we will + // see a lot + if ((idfvec[i] = log((float) rel->doc_frequency_vec[0] / + rel->doc_frequency_vec[i])) < 0.0000001) + idfvec[i] = 1; + } } // Calculate relevance for each document for (i = 0; i < reclist->num_records; i++) @@ -251,9 +324,6 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist) } rec->relevance = (int) (relevance * 100000); } -#ifdef GAGA - qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp); -#endif reclist->pointer = 0; xfree(idfvec); }