src/relevance.c

   1 /* This file is part of Pazpar2.
   2    Copyright (C) 2006-2009 Index Data
   3
   4 Pazpar2 is free software; you can redistribute it and/or modify it under
   5 the terms of the GNU General Public License as published by the Free
   6 Software Foundation; either version 2, or (at your option) any later
   7 version.
   8
   9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 for more details.
  13
  14 You should have received a copy of the GNU General Public License
  15 along with this program; if not, write to the Free Software
  16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 */
  19
  20 #if HAVE_CONFIG_H
  21 #include <config.h>
  22 #endif
  23
  24 #include <math.h>
  25 #include <stdlib.h>
  26
  27 #include "relevance.h"
  28 #include "pazpar2.h"
  29
  30 #define USE_TRIE 0
  31
  32 struct relevance
  33 {
  34     int *doc_frequency_vec;
  35     int vec_len;
  36 #if USE_TRIE
  37     struct word_trie *wt;
  38 #else
  39     struct word_entry *entries;
  40     pp2_charset_t pct;
  41 #endif
  42     NMEM nmem;
  43 };
  44
  45 #if USE_TRIE
  46 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
  47
  48
  49 // We use this data structure to recognize terms in input records,
  50 // and map them to record term vectors for counting.
  51 struct word_trie
  52 {
  53     struct
  54     {
  55         struct word_trie *child;
  56         int termno;
  57     } list[26];
  58 };
  59
  60 static struct word_trie *create_word_trie_node(NMEM nmem)
  61 {
  62     struct word_trie *res = nmem_malloc(nmem, sizeof(struct word_trie));
  63     int i;
  64     for (i = 0; i < 26; i++)
  65     {
  66         res->list[i].child = 0;
  67         res->list[i].termno = -1;
  68     }
  69     return res;
  70 }
  71
  72 static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, int num)
  73 {
  74
  75     while (*term) {
  76         int c = tolower(*term);
  77         if (c < 'a' || c > 'z')
  78             term++;
  79         else
  80         {
  81             c -= 'a';
  82             if (!*(++term))
  83                 n->list[c].termno = num;
  84             else
  85             {
  86                 if (!n->list[c].child)
  87                 {
  88                     struct word_trie *new = create_word_trie_node(nmem);
  89                     n->list[c].child = new;
  90                 }
  91                 word_trie_addterm(nmem, n->list[c].child, term, num);
  92             }
  93             break;
  94         }
  95     }
  96 }
  97
  98 static int word_trie_match(struct word_trie *t, const char *word, int *skipped)
  99 {
 100     int c = raw_char(tolower(*word));
 101
 102     if (!*word)
 103         return 0;
 104
 105     word++;
 106     (*skipped)++;
 107     if (!*word || raw_char(*word) < 0)
 108     {
 109         if (t->list[c].termno > 0)
 110             return t->list[c].termno;
 111         else
 112             return 0;
 113     }
 114     else
 115     {
 116         if (t->list[c].child)
 117         {
 118             return word_trie_match(t->list[c].child, word, skipped);
 119         }
 120         else
 121             return 0;
 122     }
 123
 124 }
 125
 126
 127 static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
 128 {
 129     struct word_trie *res = create_word_trie_node(nmem);
 130     const char **p;
 131     int i;
 132
 133     for (i = 1, p = terms; *p; p++, i++)
 134         word_trie_addterm(nmem, res, *p, i);
 135     return res;
 136 }
 137
 138
 139 // FIXME. The definition of a word is crude here.. should support
 140 // some form of localization mechanism?
 141 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
 142                           const char *words, int multiplier)
 143 {
 144     while (*words)
 145     {
 146         char c;
 147         int res;
 148         int skipped = 0;
 149         while (*words && (c = raw_char(tolower(*words))) < 0)
 150             words++;
 151         if (!*words)
 152             break;
 153         res = word_trie_match(r->wt, words, &skipped);
 154         if (res)
 155         {
 156             words += skipped;
 157             cluster->term_frequency_vec[res] += multiplier;
 158         }
 159         else
 160         {
 161             while (*words && (c = raw_char(tolower(*words))) >= 0)
 162                 words++;
 163         }
 164         cluster->term_frequency_vec[0]++;
 165     }
 166 }
 167
 168 #else
 169
 170 struct word_entry {
 171     const char *norm_str;
 172     int termno;
 173     struct word_entry *next;
 174 };
 175
 176 static void add_word_entry(NMEM nmem,
 177                            struct word_entry **entries,
 178                            const char *norm_str,
 179                            int term_no)
 180 {
 181     struct word_entry *ne = nmem_malloc(nmem, sizeof(*ne));
 182     ne->norm_str = nmem_strdup(nmem, norm_str);
 183     ne->termno = term_no;
 184
 185     ne->next = *entries;
 186     *entries = ne;
 187 }
 188
 189
 190 int word_entry_match(struct word_entry *entries, const char *norm_str)
 191 {
 192     for (; entries; entries = entries->next)
 193     {
 194         if (!strcmp(norm_str, entries->norm_str))
 195             return entries->termno;
 196     }
 197     return 0;
 198 }
 199
 200 static struct word_entry *build_word_entries(pp2_charset_t pct, NMEM nmem,
 201                                              const char **terms)
 202 {
 203     int termno = 1; /* >0 signals THERE is an entry */
 204     struct word_entry *entries = 0;
 205     const char **p = terms;
 206
 207     for (; *p; p++)
 208     {
 209         pp2_relevance_token_t prt = pp2_relevance_tokenize(pct, *p);
 210         const char *norm_str;
 211
 212         while ((norm_str = pp2_relevance_token_next(prt)))
 213             add_word_entry(nmem, &entries, norm_str, termno);
 214
 215         pp2_relevance_token_destroy(prt);
 216
 217         termno++;
 218     }
 219     return entries;
 220 }
 221
 222 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
 223         const char *words, int multiplier)
 224 {
 225     pp2_relevance_token_t prt = pp2_relevance_tokenize(r->pct, words);
 226
 227     const char *norm_str;
 228
 229     while ((norm_str = pp2_relevance_token_next(prt)))
 230     {
 231         int res = word_entry_match(r->entries, norm_str);
 232         if (res)
 233             cluster->term_frequency_vec[res] += multiplier;
 234         cluster->term_frequency_vec[0]++;
 235     }
 236     pp2_relevance_token_destroy(prt);
 237 }
 238
 239 #endif
 240
 241
 242
 243 struct relevance *relevance_create(pp2_charset_t pct,
 244                                    NMEM nmem, const char **terms, int numrecs)
 245 {
 246     struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
 247     const char **p;
 248     int i;
 249
 250     for (p = terms, i = 0; *p; p++, i++)
 251         ;
 252     res->vec_len = ++i;
 253     res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
 254     memset(res->doc_frequency_vec, 0, res->vec_len * sizeof(int));
 255     res->nmem = nmem;
 256 #if USE_TRIE
 257     res->wt = build_word_trie(nmem, terms);
 258 #else
 259     res->entries = build_word_entries(pct, nmem, terms);
 260     res->pct = pct;
 261 #endif
 262     return res;
 263 }
 264
 265 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
 266 {
 267     if (!rec->term_frequency_vec)
 268     {
 269         rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
 270         memset(rec->term_frequency_vec, 0, r->vec_len * sizeof(int));
 271     }
 272 }
 273
 274
 275 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
 276 {
 277     int i;
 278
 279     for (i = 1; i < r->vec_len; i++)
 280         if (cluster->term_frequency_vec[i] > 0)
 281             r->doc_frequency_vec[i]++;
 282
 283     r->doc_frequency_vec[0]++;
 284 }
 285
 286 // Prepare for a relevance-sorted read
 287 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
 288 {
 289     int i;
 290     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
 291
 292     // Calculate document frequency vector for each term.
 293     for (i = 1; i < rel->vec_len; i++)
 294     {
 295         if (!rel->doc_frequency_vec[i])
 296             idfvec[i] = 0;
 297         else
 298         {
 299             // This conditional may be terribly wrong
 300             // It was there to address the situation where vec[0] == vec[i]
 301             // which leads to idfvec[i] == 0... not sure about this
 302             // Traditional TF-IDF may assume that a word that occurs in every
 303             // record is irrelevant, but this is actually something we will
 304             // see a lot
 305             if ((idfvec[i] = log((float) rel->doc_frequency_vec[0] /
 306                             rel->doc_frequency_vec[i])) < 0.0000001)
 307                 idfvec[i] = 1;
 308         }
 309     }
 310     // Calculate relevance for each document
 311     for (i = 0; i < reclist->num_records; i++)
 312     {
 313         int t;
 314         struct record_cluster *rec = reclist->flatlist[i];
 315         float relevance;
 316         relevance = 0;
 317         for (t = 1; t < rel->vec_len; t++)
 318         {
 319             float termfreq;
 320             if (!rec->term_frequency_vec[0])
 321                 break;
 322             termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0];
 323             relevance += termfreq * idfvec[t];
 324         }
 325         rec->relevance = (int) (relevance * 100000);
 326     }
 327     reclist->pointer = 0;
 328     xfree(idfvec);
 329 }
 330
 331 /*
 332  * Local variables:
 333  * c-basic-offset: 4
 334  * c-file-style: "Stroustrup"
 335  * indent-tabs-mode: nil
 336  * End:
 337  * vim: shiftwidth=4 tabstop=8 expandtab
 338  */
 339