X-Git-Url: http://jsfdemo.indexdata.com/?a=blobdiff_plain;f=src%2Frelevance.c;h=b4177c06d3a92f222dcc20887225b1df4ec29c37;hb=6bd7fce99de886558fa6d3770ec9d866b5e37ef8;hp=bc841e4e5e18396ee988824eb57343ca545a0205;hpb=8ca1269eac32c1ddc19d16dc4f74e9a1e3e0b8f9;p=pazpar2-moved-to-github.git diff --git a/src/relevance.c b/src/relevance.c index bc841e4..b4177c0 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -1,5 +1,22 @@ -/* - * $Id: relevance.c,v 1.8 2007-01-15 04:34:28 quinn Exp $ +/* $Id: relevance.c,v 1.11 2007-05-01 05:04:53 quinn Exp $ + Copyright (c) 2006-2007, Index Data. + +This file is part of Pazpar2. + +Pazpar2 is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Pazpar2; see the file LICENSE. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. */ #include @@ -46,6 +63,7 @@ static struct word_trie *create_word_trie_node(NMEM nmem) static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, int num) { + while (*term) { int c = tolower(*term); if (c < 'a' || c > 'z') @@ -214,7 +232,17 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist) if (!rel->doc_frequency_vec[i]) idfvec[i] = 0; else - idfvec[i] = log((float) rel->doc_frequency_vec[0] / rel->doc_frequency_vec[i]); + { + // This conditional may be terribly wrong + // It was there to address the situation where vec[0] == vec[i] + // which leads to idfvec[i] == 0... not sure about this + // Traditional TF-IDF may assume that a word that occurs in every + // record is irrelevant, but this is actually something we will + // see a lot + if ((idfvec[i] = log((float) rel->doc_frequency_vec[0] / + rel->doc_frequency_vec[i])) < 0.0000001) + idfvec[i] = 1; + } } // Calculate relevance for each document for (i = 0; i < reclist->num_records; i++)