2 * $Id: relevance.c,v 1.5 2007-01-08 18:32:35 quinn Exp $
13 #include "relevance.h"
18 int *doc_frequency_vec;
24 // We use this data structure to recognize terms in input records,
25 // and map them to record term vectors for counting.
30 struct word_trie *child;
35 static struct word_trie *create_word_trie_node(NMEM nmem)
37 struct word_trie *res = nmem_malloc(nmem, sizeof(struct word_trie));
39 for (i = 0; i < 26; i++)
41 res->list[i].child = 0;
42 res->list[i].termno = -1;
47 static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, int num)
50 int c = tolower(*term);
51 if (c < 'a' || c > 'z')
57 n->list[c].termno = num;
60 if (!n->list[c].child)
62 struct word_trie *new = create_word_trie_node(nmem);
63 n->list[c].child = new;
65 word_trie_addterm(nmem, n->list[c].child, term, num);
72 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
74 static int word_trie_match(struct word_trie *t, const char *word, int *skipped)
76 int c = raw_char(tolower(*word));
83 if (!*word || raw_char(*word) < 0)
85 if (t->list[c].termno > 0)
86 return t->list[c].termno;
94 return word_trie_match(t->list[c].child, word, skipped);
103 static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
105 struct word_trie *res = create_word_trie_node(nmem);
109 for (i = 1, p = terms; *p; p++, i++)
110 word_trie_addterm(nmem, res, *p, i);
114 struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs)
116 struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
120 for (p = terms, i = 0; *p; p++, i++)
123 res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
124 bzero(res->doc_frequency_vec, res->vec_len * sizeof(int));
126 res->wt = build_word_trie(nmem, terms);
130 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
132 if (!rec->term_frequency_vec)
134 rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
135 bzero(rec->term_frequency_vec, r->vec_len * sizeof(int));
140 // FIXME. The definition of a word is crude here.. should support
141 // some form of localization mechanism?
142 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
143 const char *words, int multiplier)
150 while (*words && (c = raw_char(tolower(*words))) < 0)
155 if ((res = word_trie_match(r->wt, words, &skipped)))
158 cluster->term_frequency_vec[res] += multiplier;
162 while (*words && (c = raw_char(tolower(*words))) >= 0)
165 cluster->term_frequency_vec[0]++;
169 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
173 for (i = 1; i < r->vec_len; i++)
174 if (cluster->term_frequency_vec[i] > 0)
175 r->doc_frequency_vec[i]++;
177 r->doc_frequency_vec[0]++;
181 static int comp(const void *p1, const void *p2)
184 struct record **r1 = (struct record **) p1;
185 struct record **r2 = (struct record **) p2;
186 res = (*r2)->relevance - (*r1)->relevance;
195 static int comp(const void *p1, const void *p2)
197 struct record **r1 = (struct record **) p1;
198 struct record **r2 = (struct record **) p2;
199 return (*r2)->relevance - (*r1)->relevance;
203 // Prepare for a relevance-sorted read of up to num entries
204 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
207 float *idfvec = xmalloc(rel->vec_len * sizeof(float));
209 // Calculate document frequency vector for each term.
210 for (i = 1; i < rel->vec_len; i++)
212 if (!rel->doc_frequency_vec[i])
215 idfvec[i] = log((float) rel->doc_frequency_vec[0] / rel->doc_frequency_vec[i]);
217 // Calculate relevance for each document
218 for (i = 0; i < reclist->num_records; i++)
221 struct record_cluster *rec = reclist->flatlist[i];
224 for (t = 1; t < rel->vec_len; t++)
227 if (!rec->term_frequency_vec[0])
229 termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0];
230 relevance += termfreq * idfvec[t];
232 rec->relevance = (int) (relevance * 100000);
234 qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp);
235 reclist->pointer = 0;
242 * indent-tabs-mode: nil
244 * vim: shiftwidth=4 tabstop=8 expandtab