1 /* $Id: ranksimilarity.c,v 1.7 2006-05-10 08:13:22 adam Exp $
2 Copyright (C) 1995-2005
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
/* Log level passed to every yaz_log() call in this file; create() assigns
   it from yaz_log_module_level("rank-similarity"). */
static int log_level = 0;
/* create() only looks up the log level while this flag is zero. */
static int log_initialized = 0;
39 struct ranksimilarity_class_info {
43 /** term specific info and statistics to be used under ranking */
44 struct ranksimilarity_term_info {
46 /** frequency of term within document field */
47 int freq_term_docfield;
49 /** frequency of term within result set of given term */
50 zint freq_term_resset;
52 /** number of docs within result set */
55 /** number of terms in this field */
58 /** number of docs with this field in database */
61 /** sum of size of all docs with this field in database
62 (in bytes or terms ??) */
65 /** rank flag is one if term is to be included in ranking */
68 /** relative ranking weight of term field */
71 /** term id used to access term name and other info */
74 /** index number in terms[i] array */
78 struct ranksimilarity_set_info {
81 /** number of terms in query */
84 /** number of terms in query which are included in ranking */
85 int no_ranked_terms_query;
87 /** number of documents in entire collection */
88 zint no_docs_database;
90 /** sum of size of all documents in entire collection
91 (in bytes or terms ?? best implemented as sum of size of
92 all indexes/fields in db ??)*/
93 zint size_docs_database;
95 /** array of size no_terms_query with statistics gathered per term */
96 struct ranksimilarity_term_info *entries;
102 /* local clean-up function */
103 static void ranksimilar_rec_reset(struct ranksimilarity_set_info *si)
107 for (i = 0; i < si->no_terms_query; i++)
109 si->entries[i].freq_term_docfield = 0;
115 * create: Creates/Initialises this rank handler. This routine is
116 * called exactly once. The routine returns the class_handle.
118 static void *create (ZebraHandle zh)
120 struct ranksimilarity_class_info *ci =
121 (struct ranksimilarity_class_info *) xmalloc (sizeof(*ci));
123 if (!log_initialized)
125 log_level = yaz_log_module_level("rank-similarity");
128 yaz_log(log_level, "create()");
133 * destroy: Destroys this rank handler. This routine is called
134 * when the handler is no longer needed - i.e. when the server
135 * dies. The class_handle was previously returned by create.
137 static void destroy (struct zebra_register *reg, void *class_handle)
139 struct ranksimilarity_class_info *ci
140 = (struct ranksimilarity_class_info *) class_handle;
141 yaz_log(log_level, "destroy()");
147 * begin: Prepares beginning of "real" ranking. Called once for
148 * each result set. The returned handle is a "set handle" and
149 * will be used in each of the handlers below.
151 static void *begin (struct zebra_register *reg,
152 void *class_handle, RSET rset, NMEM nmem,
153 TERMID *terms, int numterms)
155 struct ranksimilarity_set_info *si =
156 (struct ranksimilarity_set_info *) nmem_malloc (nmem, sizeof(*si));
159 yaz_log(log_level, "begin() numterms=%d", numterms);
161 /* setting database global statistics */
162 si->no_docs_database = -1; /* TODO */
163 si->size_docs_database = -1; /* TODO */
165 /* setting query statistics */
166 si->no_terms_query = numterms;
167 si->no_ranked_terms_query = 0;
169 /* setting internal data structures */
171 si->entries = (struct ranksimilarity_term_info *)
172 nmem_malloc (si->nmem, sizeof(*si->entries)*numterms);
174 /* reset the counts for the next term */
175 ranksimilar_rec_reset(si);
178 /* looping all terms in a specific field of query */
179 for (i = 0; i < numterms; i++)
181 struct ord_list *ol = NULL;
184 /* adding to number of rank entries */
185 if (strncmp (terms[i]->flags, "rank,", 5))
187 si->entries[i].rank_flag = 0;
188 yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s not ranked",
189 i, terms[i]->name, terms[i]->flags);
193 const char *cp = strstr(terms[i]->flags+4, ",w=");
195 yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s",
196 i, terms[i]->name, terms[i]->flags);
198 (si->no_ranked_terms_query)++;
201 si->entries[i].rank_flag = 1;
202 /* notice that the call to rset_count(rset) has he side-effect
203 of setting rset->hits_limit = rset_count(rset) ??? */
204 si->entries[i].freq_term_resset = rset_count(terms[i]->rset);
205 si->entries[i].no_docs_resset = terms[i]->rset->hits_count;
206 si->entries[i].no_docs_field = -1; /*TODO*/
207 si->entries[i].size_docs_field = -1; /*TODO*/
208 si->entries[i].no_terms_field = -1; /*TODO*/
211 si->entries[i].field_weight = atoi (cp+3);
213 si->entries[i].field_weight = 34; /* sqrroot of 1000 */
217 yaz_log(log_level, "begin() rset_count(terms[%d]->rset) = "
218 ZINT_FORMAT, i, rset_count(terms[i]->rset));
219 yaz_log(log_level, "begin() terms[%d]->rset->hits_limit = "
220 ZINT_FORMAT, i, terms[i]->rset->hits_limit);
221 yaz_log(log_level, "begin() terms[%d]->rset->hits_count = "
222 ZINT_FORMAT, i, terms[i]->rset->hits_count);
223 yaz_log(log_level, "begin() terms[%d]->rset->hits_round = "
224 ZINT_FORMAT, i, terms[i]->rset->hits_round);
225 yaz_log(log_level, "begin() terms[%d]->rset->hits_approx = %d",
226 i, terms[i]->rset->hits_approx);
229 /* looping indexes where term terms[i] is found */
230 for (; ol; ol = ol->next)
234 const char *string_index = 0;
238 zebraExplain_lookup_ord(reg->zei,
239 ol->ord, &index_type, &db, &set, &use,
244 "begin() index: ord=%d type=%c db=%s str-index=%s",
245 ol->ord, index_type, db, string_index);
248 "begin() index: ord=%d type=%c db=%s set=%d use=%d",
249 ol->ord, index_type, db, set, use);
254 si->entries[i].term = terms[i];
255 si->entries[i].term_index=i;
257 /* setting next entry in term */
258 terms[i]->rankpriv = &(si->entries[i]);
265 * end: Terminates ranking process. Called after a result set
268 static void end (struct zebra_register *reg, void *set_handle)
270 yaz_log(log_level, "end()");
276 * add: Called for each word occurence in a result set. This routine
277 * should be as fast as possible. This routine should "incrementally"
280 static void add (void *set_handle, int seqno, TERMID term)
282 struct ranksimilarity_set_info *si
283 = (struct ranksimilarity_set_info *) set_handle;
284 struct ranksimilarity_term_info *ti;
288 /* yaz_log(log_level, "add() seqno=%d NULL term", seqno); */
292 ti= (struct ranksimilarity_term_info *) term->rankpriv;
294 si->last_pos = seqno;
295 ti->freq_term_docfield++;
296 /*yaz_log(log_level, "add() seqno=%d term=%s freq_term_docfield=%d",
297 seqno, term->name, ti->freq_term_docfield); */
301 * calc: Called for each document in a result. This handler should
302 * produce a score based on previous call(s) to the add handler. The
303 * score should be between 0 and 1000. If score cannot be obtained
304 * -1 should be returned.
306 static int calc (void *set_handle, zint sysno, zint staticrank,
310 struct ranksimilarity_set_info *si
311 = (struct ranksimilarity_set_info *) set_handle;
314 yaz_log(log_level, "calc() sysno = " ZINT_FORMAT, sysno);
315 yaz_log(log_level, "calc() staticrank = " ZINT_FORMAT, staticrank);
317 yaz_log(log_level, "calc() si->no_terms_query = %d",
319 yaz_log(log_level, "calc() si->no_ranked_terms_query = %d",
320 si->no_ranked_terms_query);
321 yaz_log(log_level, "calc() si->no_docs_database = " ZINT_FORMAT,
322 si->no_docs_database);
323 yaz_log(log_level, "calc() si->size_docs_database = " ZINT_FORMAT,
324 si->size_docs_database);
327 if (!si->no_ranked_terms_query)
328 return -1; /* ranking not enabled for any terms */
331 /* if we set *stop_flag = 1, we stop processing (of result set list) */
334 /* here goes your formula to compute a scoring function */
335 /* you may use all the gathered statistics here */
336 for (i = 0; i < si->no_terms_query; i++)
338 yaz_log(log_level, "calc() entries[%d] termid %p",
339 i, si->entries[i].term);
340 if (si->entries[i].term){
341 yaz_log(log_level, "calc() entries[%d] term '%s' flags=%s",
342 i, si->entries[i].term->name, si->entries[i].term->flags);
343 yaz_log(log_level, "calc() entries[%d] rank_flag %d",
344 i, si->entries[i].rank_flag );
345 yaz_log(log_level, "calc() entries[%d] field_weight %d",
346 i, si->entries[i].field_weight );
347 yaz_log(log_level, "calc() entries[%d] freq_term_docfield %d",
348 i, si->entries[i].freq_term_docfield );
349 yaz_log(log_level, "calc() entries[%d] freq_term_resset " ZINT_FORMAT,
350 i, si->entries[i].freq_term_resset );
351 yaz_log(log_level, "calc() entries[%d] no_docs_resset " ZINT_FORMAT,
352 i, si->entries[i].no_docs_resset );
353 yaz_log(log_level, "calc() entries[%d] no_docs_field " ZINT_FORMAT,
354 i, si->entries[i].no_docs_field );
355 yaz_log(log_level, "calc() entries[%d] size_docs_field " ZINT_FORMAT,
356 i, si->entries[i].size_docs_field );
357 yaz_log(log_level, "calc() entries[%d] no_terms_field " ZINT_FORMAT,
358 i, si->entries[i].no_terms_field );
363 /* reset the counts for the next term */
364 ranksimilar_rec_reset(si);
367 /* staticrank = 0 is highest, MAXINT lowest */
368 score = INT_MAX - staticrank; /* but score is reverse (logical) */
371 /* debugging statistics output */
372 yaz_log(log_level, "calc() statistics: score = %d", score);
378 * Pseudo-meta code with sequence of calls as they occur in a
379 * server. Handlers are prefixed by --:
395 static struct rank_control rank_control = {
/* Non-static pointer to this file's rank_control table, so code outside
   this file can refer to the handler. */
struct rank_control *rank_similarity_class = &rank_control;
409 * indent-tabs-mode: nil
411 * vim: shiftwidth=4 tabstop=8 expandtab