X-Git-Url: http://jsfdemo.indexdata.com/?a=blobdiff_plain;f=src%2Frelevance.c;h=86907634a9452864f3616600f1c8a74ab18894b8;hb=5b4ea0cf66dd82c871ed7d69a5801d78789087b2;hp=9d2f47d3cbc12a3d65fea134c605ea32619efdae;hpb=4d37a7d84107f77bd61f5eb057ed99c59b84607d;p=pazpar2-moved-to-github.git diff --git a/src/relevance.c b/src/relevance.c index 9d2f47d..8690763 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -1,7 +1,5 @@ -/* $Id: relevance.c,v 1.12 2007-05-10 09:26:19 adam Exp $ - Copyright (c) 2006-2007, Index Data. - -This file is part of Pazpar2. +/* This file is part of Pazpar2. + Copyright (C) 2006-2009 Index Data Pazpar2 is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -14,19 +12,18 @@ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Pazpar2; see the file LICENSE. If not, write to the -Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA -02111-1307, USA. - */ +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -#include -#include -#include +*/ #if HAVE_CONFIG_H -#include +#include #endif +#include +#include + #include "relevance.h" #include "pazpar2.h" @@ -40,14 +37,15 @@ struct relevance struct word_trie *wt; #else struct word_entry *entries; + pp2_charset_t pct; #endif NMEM nmem; }; +#if USE_TRIE #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1) -#if USE_TRIE // We use this data structure to recognize terms in input records, // and map them to record term vectors for counting. struct word_trie @@ -137,6 +135,36 @@ static struct word_trie *build_word_trie(NMEM nmem, const char **terms) return res; } + +// FIXME. The definition of a word is crude here.. should support +// some form of localization mechanism? +void relevance_countwords(struct relevance *r, struct record_cluster *cluster, + const char *words, int multiplier) +{ + while (*words) + { + char c; + int res; + int skipped = 0; + while (*words && (c = raw_char(tolower(*words))) < 0) + words++; + if (!*words) + break; + res = word_trie_match(r->wt, words, &skipped); + if (res) + { + words += skipped; + cluster->term_frequency_vec[res] += multiplier; + } + else + { + while (*words && (c = raw_char(tolower(*words))) >= 0) + words++; + } + cluster->term_frequency_vec[0]++; + } +} + #else struct word_entry { @@ -169,46 +197,51 @@ int word_entry_match(struct word_entry *entries, const char *norm_str) return 0; } -static struct word_entry *build_word_entries(NMEM nmem, +static struct word_entry *build_word_entries(pp2_charset_t pct, NMEM nmem, const char **terms) { int termno = 1; /* >0 signals THERE is an entry */ struct word_entry *entries = 0; const char **p = terms; - WRBUF norm_str = wrbuf_alloc(); for (; *p; p++) { - const char *cp = *p; - for (; *cp; cp++) - { - int c = raw_char(*cp); - if (c >= 0) - wrbuf_putc(norm_str, c); - else - { - if (wrbuf_len(norm_str)) - add_word_entry(nmem, &entries, wrbuf_cstr(norm_str), - termno); - wrbuf_rewind(norm_str); - } - } - if (wrbuf_len(norm_str)) - add_word_entry(nmem, &entries, wrbuf_cstr(norm_str), termno); - wrbuf_rewind(norm_str); + pp2_relevance_token_t prt = pp2_relevance_tokenize(pct, *p); + const char *norm_str; + + while ((norm_str = pp2_relevance_token_next(prt))) + add_word_entry(nmem, &entries, norm_str, termno); + + pp2_relevance_token_destroy(prt); + termno++; } - wrbuf_destroy(norm_str); return entries; } - +void relevance_countwords(struct relevance *r, struct record_cluster *cluster, + const char *words, int multiplier) +{ + pp2_relevance_token_t prt = pp2_relevance_tokenize(r->pct, words); + + const char *norm_str; + + while ((norm_str = pp2_relevance_token_next(prt))) + { + int res = word_entry_match(r->entries, norm_str); + if (res) + cluster->term_frequency_vec[res] += multiplier; + cluster->term_frequency_vec[0]++; + } + pp2_relevance_token_destroy(prt); +} #endif -struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs) +struct relevance *relevance_create(pp2_charset_t pct, + NMEM nmem, const char **terms, int numrecs) { struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance)); const char **p; @@ -223,7 +256,8 @@ struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs) #if USE_TRIE res->wt = build_word_trie(nmem, terms); #else - res->entries = build_word_entries(nmem, terms); + res->entries = build_word_entries(pct, nmem, terms); + res->pct = pct; #endif return res; } @@ -238,55 +272,6 @@ void relevance_newrec(struct relevance *r, struct record_cluster *rec) } -// FIXME. The definition of a word is crude here.. should support -// some form of localization mechanism? -void relevance_countwords(struct relevance *r, struct record_cluster *cluster, - const char *words, int multiplier) -{ -#if !USE_TRIE - WRBUF norm_str = wrbuf_alloc(); -#endif - while (*words) - { - char c; - int res; -#if USE_TRIE - int skipped = 0; -#endif - while (*words && (c = raw_char(tolower(*words))) < 0) - words++; - if (!*words) - return; -#if USE_TRIE - res = word_trie_match(r->wt, words, &skipped); - if (res) - { - words += skipped; - cluster->term_frequency_vec[res] += multiplier; - } - else - { - while (*words && (c = raw_char(tolower(*words))) >= 0) - words++; - } -#else - while (*words && (c = raw_char(tolower(*words))) >= 0) - { - wrbuf_putc(norm_str, c); - words++; - } - res = word_entry_match(r->entries, wrbuf_cstr(norm_str)); - if (res) - cluster->term_frequency_vec[res] += multiplier; - wrbuf_rewind(norm_str); -#endif - cluster->term_frequency_vec[0]++; - } -#if !USE_TRIE - wrbuf_destroy(norm_str); -#endif -} - void relevance_donerecord(struct relevance *r, struct record_cluster *cluster) { int i; @@ -298,31 +283,6 @@ void relevance_donerecord(struct relevance *r, struct record_cluster *cluster) r->doc_frequency_vec[0]++; } -#ifdef GAGA -#ifdef FLOAT_REL -static int comp(const void *p1, const void *p2) -{ - float res; - struct record **r1 = (struct record **) p1; - struct record **r2 = (struct record **) p2; - res = (*r2)->relevance - (*r1)->relevance; - if (res > 0) - return 1; - else if (res < 0) - return -1; - else - return 0; -} -#else -static int comp(const void *p1, const void *p2) -{ - struct record_cluster **r1 = (struct record_cluster **) p1; - struct record_cluster **r2 = (struct record_cluster **) p2; - return (*r2)->relevance - (*r1)->relevance; -} -#endif -#endif - // Prepare for a relevance-sorted read void relevance_prepare_read(struct relevance *rel, struct reclist *reclist) { @@ -364,9 +324,6 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist) } rec->relevance = (int) (relevance * 100000); } -#ifdef GAGA - qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp); -#endif reclist->pointer = 0; xfree(idfvec); } @@ -374,7 +331,9 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist) /* * Local variables: * c-basic-offset: 4 + * c-file-style: "Stroustrup" * indent-tabs-mode: nil * End: * vim: shiftwidth=4 tabstop=8 expandtab */ +