XSLT filter gets snippet as simple string for now.
-# $Id: Makefile.am,v 1.6 2005-03-30 09:25:23 adam Exp $
+# $Id: Makefile.am,v 1.7 2005-06-07 11:36:38 adam Exp $
pkginclude_HEADERS=api.h version.h res.h recctrl.h data1.h recgrs.h \
- zebramap.h bfile.h dict.h isam-codec.h isams.h isamc.h isamb.h util.h
+ zebramap.h bfile.h dict.h isam-codec.h isams.h isamc.h isamb.h util.h \
+ snippet.h
-/* $Id: recctrl.h,v 1.10 2005-04-28 08:20:39 adam Exp $
+/* $Id: recctrl.h,v 1.11 2005-06-07 11:36:38 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
#include <yaz/odr.h>
#include <idzebra/res.h>
#include <idzebra/data1.h>
+#include <idzebra/snippet.h>
#include <idzebra/zebramap.h>
YAZ_BEGIN_CDECL
int recordSize; /* size of record in bytes */
char *fname; /* name of file (or NULL if internal) */
data1_handle dh;
+ zebra_snippets *hit_snippet;
+ zebra_snippets *doc_snippet;
/* response */
oid_value output_format;
--- /dev/null
+/* $Id: snippet.h,v 1.1 2005-06-07 11:36:38 adam Exp $
+ Copyright (C) 1995-2005
+ Index Data ApS
+
+This file is part of the Zebra server.
+
+Zebra is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with Zebra; see the file LICENSE.zebra. If not, write to the
+Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+02111-1307, USA.
+*/
+
+#ifndef SNIPPET_H
+#define SNIPPET_H
+
+#include <idzebra/util.h>
+
+YAZ_BEGIN_CDECL
+
+/* One word of a snippet list; words are chained through 'next'. */
+struct zebra_snippet_word {
+    zint seqno;                      /* sequence number of the word */
+    int ord;                         /* ordinal the word is filed under */
+    char *term;                      /* word text (copied into the list's
+                                        memory by the append functions) */
+    int match;                       /* non-zero when flagged as a hit */
+    struct zebra_snippet_word *next; /* following word, 0 at end of list */
+};
+
+typedef struct zebra_snippets zebra_snippets;
+typedef struct zebra_snippet_word zebra_snippet_word;
+
+/* Creates an empty snippet list. Release it with zebra_snippets_destroy.
+   Declared with (void) so that the declaration is a real prototype. */
+zebra_snippets *zebra_snippets_create(void);
+
+/* Releases a list and every word stored in it. A null list is ignored. */
+void zebra_snippets_destroy(zebra_snippets *l);
+
+/* Appends a word with its match flag cleared. The term is copied. */
+void zebra_snippets_append(zebra_snippets *l,
+                           zint seqno, int ord, const char *term);
+
+/* Appends a word with an explicit match flag. The term is copied. */
+void zebra_snippets_append_match(zebra_snippets *l,
+                                 zint seqno, int ord, const char *term,
+                                 int match);
+
+/* Returns the first word of the list, or 0 when the list is empty. */
+zebra_snippet_word *zebra_snippets_list(zebra_snippets *l);
+
+/* Writes one log line per word using the given YAZ log level. */
+void zebra_snippets_log(zebra_snippets *l, int log_level);
+
+/* Builds a new list holding, for each ord present in 'hit', the words of
+   'doc' that fall inside the window of 'window_size' sequence numbers with
+   the most hits. Words that coincide with a hit get their match flag set.
+   The returned list is separate from 'doc' and 'hit' and must be released
+   with zebra_snippets_destroy. */
+zebra_snippets *zebra_snippets_window(zebra_snippets *doc, zebra_snippets *hit,
+                                      int window_size);
+
+YAZ_END_CDECL
+
+#endif
-/* $Id: extract.c,v 1.183 2005-05-31 13:01:36 adam Exp $
+/* $Id: extract.c,v 1.184 2005-06-07 11:36:38 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
keys->buf_used = dst - keys->buf;
}
+/*
+ * Decodes the key buffer of a record and appends every entry found in it
+ * to 'snippets'.
+ *
+ * zh:       handle passed on to zebra_term_untrans_iconv
+ * reckeys:  encoded keys; reckeys->buf must be non-null, and buf_used
+ *           gives the number of bytes to decode
+ * snippets: list that receives one word per decoded key
+ *
+ * Always returns ZEBRA_OK.
+ */
+ZEBRA_RES zebra_snippets_rec_keys(ZebraHandle zh, struct recKeys *reckeys,
+                                  zebra_snippets *snippets)
+{
+    void *decode_handle = iscz1_start();
+    int off = 0;
+    NMEM nmem = nmem_create();
+
+    yaz_log(YLOG_LOG, "zebra_rec_keys_snippets buf=%p sz=%d", reckeys->buf,
+            reckeys->buf_used);
+    assert(reckeys->buf);
+    while (off < reckeys->buf_used)
+    {
+        const char *src = reckeys->buf + off;
+        struct it_key key;
+        char *dst = (char*) &key;
+        char dst_buf[IT_MAX_WORD];
+        char *dst_term = dst_buf;
+        zint seqno;
+
+        /* fills 'key' and advances 'src' past the encoded key */
+        iscz1_decode(decode_handle, &dst, &src);
+        assert(key.len <= 4 && key.len > 2);
+
+        /* The last key component is the sequence number. It is kept as
+           zint, the type zebra_snippets_append takes, rather than being
+           narrowed to int, which would truncate large values. */
+        seqno = key.mem[key.len-1];
+
+        /* src[0] is handed over separately from the term text that follows
+           it (presumably the register type - confirm against
+           zebra_term_untrans_iconv) */
+        zebra_term_untrans_iconv(zh, nmem, src[0], &dst_term, src+1);
+        zebra_snippets_append(snippets, seqno, key.mem[0], dst_term);
+
+        /* step over the zero-terminated term to reach the next entry */
+        while (*src++)
+            ;
+        off = src - reckeys->buf;
+
+        /* the list made its own copy of the term; drop the scratch memory */
+        nmem_reset(nmem);
+    }
+    nmem_destroy(nmem);
+    iscz1_stop(decode_handle);
+    return ZEBRA_OK;
+}
+
+
void print_rec_keys(ZebraHandle zh, struct recKeys *reckeys)
{
void *decode_handle = iscz1_start();
-/* $Id: index.h,v 1.140 2005-06-06 21:31:08 adam Exp $
+/* $Id: index.h,v 1.141 2005-06-07 11:36:38 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
extern struct rank_control *rankzv_class;
extern struct rank_control *rankliv_class;
-int zebra_record_fetch (ZebraHandle zh, SYSNO sysno, int score, ODR stream,
+int zebra_record_fetch (ZebraHandle zh, SYSNO sysno, int score,
+ zebra_snippets *hit_snippet, ODR stream,
oid_value input_format, Z_RecordComposition *comp,
oid_value *output_format, char **rec_bufp,
int *rec_lenp, char **basenamep,
void zebra_record_int_end (void *fh, off_t offset);
void print_rec_keys(ZebraHandle zh, struct recKeys *reckeys);
+ZEBRA_RES zebra_snippets_rec_keys(ZebraHandle zh, struct recKeys *reckeys,
+ zebra_snippets *snippets);
+ZEBRA_RES zebra_snippets_hit_vector(ZebraHandle zh, const char *setname,
+ zint sysno, zebra_snippets *snippets);
void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno,
int cmd, struct recKeys *reckeys);
-/* $Id: retrieve.c,v 1.30 2005-05-31 13:01:37 adam Exp $
+/* $Id: retrieve.c,v 1.31 2005-06-07 11:36:38 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
fc->offset_end = off;
}
-int zebra_record_fetch (ZebraHandle zh, SYSNO sysno, int score, ODR stream,
+int zebra_record_fetch (ZebraHandle zh, SYSNO sysno, int score,
+ zebra_snippets *hit_snippet, ODR stream,
oid_value input_format, Z_RecordComposition *comp,
oid_value *output_format, char **rec_bufp,
int *rec_lenp, char **basenamep,
retrieveCtrl.res = zh->res;
retrieveCtrl.rec_buf = 0;
retrieveCtrl.rec_len = -1;
-
+ retrieveCtrl.hit_snippet = hit_snippet;
+ retrieveCtrl.doc_snippet = zebra_snippets_create();
+
if (1)
{
+ /* snippets code */
struct recKeys reckeys;
+ zebra_snippets *snippet;
reckeys.buf = rec->info[recInfo_delKeys];
reckeys.buf_used = rec->size[recInfo_delKeys];
- print_rec_keys(zh, &reckeys);
+ zebra_snippets_rec_keys(zh, &reckeys, retrieveCtrl.doc_snippet);
+
+
+ yaz_log(YLOG_LOG, "DOC SNIPPET:");
+ zebra_snippets_log(retrieveCtrl.doc_snippet, YLOG_LOG);
+ yaz_log(YLOG_LOG, "HIT SNIPPET:");
+ zebra_snippets_log(retrieveCtrl.hit_snippet, YLOG_LOG);
+
+ snippet = zebra_snippets_window(retrieveCtrl.doc_snippet,
+ retrieveCtrl.hit_snippet,
+ 10);
+
+ yaz_log(YLOG_LOG, "WINDOW SNIPPET:");
+ zebra_snippets_log(snippet, YLOG_LOG);
+
+ (*rt->retrieve)(clientData, &retrieveCtrl);
+
+ zebra_snippets_destroy(snippet);
}
- (*rt->retrieve)(clientData, &retrieveCtrl);
+ else
+ {
+ (*rt->retrieve)(clientData, &retrieveCtrl);
+ }
+
+ zebra_snippets_destroy(retrieveCtrl.doc_snippet);
+
*output_format = retrieveCtrl.output_format;
*rec_bufp = (char *) retrieveCtrl.rec_buf;
*rec_lenp = retrieveCtrl.rec_len;
-/* $Id: zebraapi.c,v 1.173 2005-06-02 11:59:53 adam Exp $
+/* $Id: zebraapi.c,v 1.174 2005-06-07 11:36:38 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
{
char *buf;
int len;
+ zebra_snippets *hit_snippet = zebra_snippets_create();
- zebra_get_hit_vector(zh, setname, poset[i].sysno);
+ zebra_snippets_hit_vector(zh, setname, poset[i].sysno,
+ hit_snippet);
recs[i].errCode =
zebra_record_fetch(zh, poset[i].sysno, poset[i].score,
+ hit_snippet,
stream, input_format, comp,
&recs[i].format, &buf, &len,
&recs[i].base, &recs[i].errString);
+
recs[i].len = len;
if (len > 0)
{
recs[i].buf = buf;
recs[i].score = poset[i].score;
recs[i].sysno = poset[i].sysno;
+ zebra_snippets_destroy(hit_snippet);
}
else
{
-/* $Id: zsets.c,v 1.86 2005-06-06 21:31:08 adam Exp $
+/* $Id: zsets.c,v 1.87 2005-06-07 11:36:38 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
return ZEBRA_FAIL;
}
-ZEBRA_RES zebra_get_hit_vector(ZebraHandle zh, const char *setname,
- zint sysno)
+ZEBRA_RES zebra_snippets_hit_vector(ZebraHandle zh, const char *setname,
+ zint sysno, zebra_snippets *snippets)
{
ZebraSet sset = resultSetGet(zh, setname);
yaz_log(YLOG_LOG, "zebra_get_hit_vector setname=%s zysno=" ZINT_FORMAT,
{
struct ord_list *ol;
key_logdump_txt(YLOG_LOG, &key, termid->name);
- yaz_log(YLOG_LOG, " type=%d", termid->type);
for (ol = termid->ol; ol; ol = ol->next)
+ {
yaz_log(YLOG_LOG, " ord=%d", ol->ord);
+ zebra_snippets_append(snippets, key.mem[key.len-1],
+ ol->ord, termid->name);
+ }
}
}
rset_close(rsfd);
rset_delete(rset_comb);
+ nmem_destroy(nmem);
}
return ZEBRA_OK;
}
-/* $Id: xslt.c,v 1.7 2005-06-01 07:32:46 adam Exp $
+/* $Id: xslt.c,v 1.8 2005-06-07 11:36:38 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
#define ZEBRA_SCHEMA_IDENTITY_NS "http://indexdata.dk/zebra/identity/1"
static const char *zebra_index_ns = ZEBRA_INDEX_NS;
+/*
+ * Adds a name/value pair at the end of the zero-terminated parameter
+ * array and re-terminates it. Both pointers are stored as given; nothing
+ * is copied or quoted here. The array must have room for three more
+ * entries. 'odr' is not used.
+ */
+static void set_param_xml(const char **params, const char *name,
+                          const char *value, ODR odr)
+{
+    int end = 0;
+
+    /* locate the current terminator */
+    while (params[end])
+        end++;
+
+    params[end] = name;
+    params[end + 1] = value;
+    params[end + 2] = 0;
+}
+
static void set_param_str(const char **params, const char *name,
const char *value, ODR odr)
{
}
+/*
+ * Renders the snippet for the record being retrieved as a string suitable
+ * for use as a stylesheet parameter.
+ *
+ * A window of 10 sequence numbers is selected with zebra_snippets_window
+ * from p->doc_snippet and p->hit_snippet. Only the words of the first ord
+ * met in the result are written. Matching words are wrapped in '*'.
+ * In the active (#if 1) form the whole text is enclosed in single quotes.
+ *
+ * Returns a string allocated with odr_strdup on p->odr.
+ *
+ * NOTE(review): the terms are written verbatim, so a term containing a
+ * single quote would end the quoted literal early - confirm how the
+ * caller's XSLT processor treats the value.
+ */
+static const char *snippet_doc(struct recRetrieveCtrl *p)
+{
+    const char *xml_doc_str;
+    int ord = 0; /* 0 means "no ord chosen yet" */
+    WRBUF wrbuf = wrbuf_alloc();
+    zebra_snippets *res =
+        zebra_snippets_window(p->doc_snippet, p->hit_snippet, 10);
+    zebra_snippet_word *w = zebra_snippets_list(res);
+
+#if 1
+    wrbuf_printf(wrbuf, "\'");
+#else
+    wrbuf_printf(wrbuf, "<snippet>\n");
+#endif
+    for (; w; w = w->next)
+    {
+        /* stick to the first ord; stop once another one shows up */
+        if (ord == 0)
+            ord = w->ord;
+        else if (ord != w->ord)
+            break;
+#if 1
+        wrbuf_printf(wrbuf, "%s%s%s ",
+                     w->match ? "*" : "",
+                     w->term,
+                     w->match ? "*" : "");
+#else
+        /* seqno is a zint, so it is formatted with ZINT_FORMAT, not %d */
+        wrbuf_printf(wrbuf, " <term %s ord='%d' seqno='" ZINT_FORMAT "'>",
+                     (w->match ? "match='1'" : ""),
+                     w->ord, w->seqno);
+        wrbuf_xmlputs(wrbuf, w->term);
+        wrbuf_printf(wrbuf, "</term>\n");
+#endif
+    }
+#if 1
+    wrbuf_printf(wrbuf, "\'");
+#else
+    wrbuf_printf(wrbuf, "</snippet>\n");
+#endif
+    /* copy into ODR memory before the work buffer is released */
+    xml_doc_str = odr_strdup(p->odr, wrbuf_buf(wrbuf));
+
+    zebra_snippets_destroy(res);
+    wrbuf_free(wrbuf, 1);
+    return xml_doc_str;
+}
+
+
static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
{
const char *esn = ZEBRA_SCHEMA_IDENTITY_NS;
set_param_int(params, "score", p->score, p->odr);
set_param_int(params, "size", p->recordSize, p->odr);
+ set_param_xml(params, "snippet", snippet_doc(p), p->odr);
doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
0 /* URL */,
0 /* encoding */,
<schemaInfo>
- <!-- $Id: marcschema.xml,v 1.1 2005-05-31 14:18:17 adam Exp $ -->
+ <!-- $Id: marcschema.xml,v 1.2 2005-06-07 11:36:40 adam Exp $ -->
<schema name="index" identifier="http://indexdata.dk/zebra/indexing/1"
stylesheet="index.xsl" />
<schema name="F" stylesheet="id.xsl" />
+ <schema name="snippet" stylesheet="snippet.xsl" />
</schemaInfo>
-## $Id: Makefile.am,v 1.13 2005-03-30 09:25:25 adam Exp $
+## $Id: Makefile.am,v 1.14 2005-06-07 11:36:42 adam Exp $
lib_LTLIBRARIES = libidzebra-util.la
LDADD = libidzebra-util.la $(YAZLALIB)
libidzebra_util_la_SOURCES = zint.c res.c charmap.c zebramap.c passwddb.c \
- zebra-lock.c dirent.c xpath.c atoi_zn.c
+ zebra-lock.c dirent.c xpath.c atoi_zn.c snippet.c
passtest_SOURCES = passtest.c
--- /dev/null
+/* $Id: snippet.c,v 1.1 2005-06-07 11:36:43 adam Exp $
+ Copyright (C) 1995-2005
+ Index Data ApS
+
+This file is part of the Zebra server.
+
+Zebra is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with Zebra; see the file LICENSE.zebra. If not, write to the
+Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+02111-1307, USA.
+*/
+
+#include <stddef.h>
+#include <yaz/nmem.h>
+#include <yaz/log.h>
+#include <idzebra/snippet.h>
+
+/* A snippet list: a chain of words plus the memory they are stored in. */
+struct zebra_snippets {
+    NMEM nmem;                 /* holds this struct and every word/term */
+    zebra_snippet_word *front; /* first word, 0 when the list is empty */
+    zebra_snippet_word *tail;  /* last word, 0 when the list is empty */
+};
+
+/*
+ * Creates an empty snippet list. The list header itself is allocated from
+ * the list's own NMEM, so zebra_snippets_destroy frees everything at once.
+ * The parameter list is spelled (void) so the definition is a prototype.
+ */
+zebra_snippets *zebra_snippets_create(void)
+{
+    NMEM nmem = nmem_create();
+    zebra_snippets *l = nmem_malloc(nmem, sizeof(*l));
+
+    l->nmem = nmem;
+    l->front = l->tail = 0;
+    return l;
+}
+
+/* Frees a snippet list by destroying its NMEM. A null list is ignored. */
+void zebra_snippets_destroy(zebra_snippets *l)
+{
+    if (!l)
+        return;
+    nmem_destroy(l->nmem);
+}
+
+/* Appends a word whose match flag is 0; see zebra_snippets_append_match. */
+void zebra_snippets_append(zebra_snippets *l,
+                           zint seqno, int ord, const char *term)
+{
+    const int not_a_match = 0;
+
+    zebra_snippets_append_match(l, seqno, ord, term, not_a_match);
+}
+
+/*
+ * Appends a word to the end of the list. The node and a private copy of
+ * 'term' are both allocated from the list's NMEM.
+ */
+void zebra_snippets_append_match(zebra_snippets *l,
+                                 zint seqno, int ord, const char *term,
+                                 int match)
+{
+    struct zebra_snippet_word *node = nmem_malloc(l->nmem, sizeof(*node));
+
+    /* fill in the new node */
+    node->seqno = seqno;
+    node->ord = ord;
+    node->term = nmem_strdup(l->nmem, term);
+    node->match = match;
+    node->next = 0;
+
+    /* hook it on: first node of an empty list, otherwise after the tail */
+    if (!l->tail)
+        l->front = node;
+    else
+        l->tail->next = node;
+    l->tail = node;
+}
+
+/* Gives access to the word chain: returns its head, 0 for an empty list. */
+zebra_snippet_word *zebra_snippets_list(zebra_snippets *l)
+{
+    zebra_snippet_word *first = l->front;
+
+    return first;
+}
+
+/*
+ * Logs every word of the list at 'log_level', one line per word. A '*'
+ * follows the term when its match flag is set.
+ */
+void zebra_snippets_log(zebra_snippets *l, int log_level)
+{
+    zebra_snippet_word *cur = l->front;
+
+    while (cur)
+    {
+        const char *mark = cur->match ? "*" : "";
+
+        yaz_log(log_level, "term=%s%s seqno=" ZINT_FORMAT " ord=%d",
+                cur->term, mark, cur->seqno, cur->ord);
+        cur = cur->next;
+    }
+}
+
+/*
+ * Selects the part of a document that best shows the hits.
+ *
+ * The ords present in 'hit' are handled in increasing order. For each one
+ * the window of 'window_size' sequence numbers holding the most hits is
+ * located, the window is centred on those hits, and every word of 'doc'
+ * with that ord inside the window is appended to the result. A word whose
+ * ord and seqno coincide with a hit is appended with match set to 1.
+ *
+ * doc:         words of the document
+ * hit:         words that matched the query
+ * window_size: width of the window in sequence numbers
+ *
+ * Returns a newly created list (empty if 'doc' or 'hit' is null, or if
+ * there are no hits). The caller releases it with zebra_snippets_destroy.
+ */
+zebra_snippets *zebra_snippets_window(zebra_snippets *doc, zebra_snippets *hit,
+                                      int window_size)
+{
+    int ord = -1;
+    zebra_snippets *result = zebra_snippets_create();
+
+    /* nothing to select from: hand back the empty list */
+    if (!doc || !hit)
+        return result;
+
+    while (1)
+    {
+        /* sequence numbers are zint in the word struct; keep them that
+           wide here so large values are not truncated */
+        zint window_start;
+        zint first_seq_no_best_window = 0;
+        zint last_seq_no_best_window = 0;
+        int number_best_window = 0;
+        zebra_snippet_word *hit_w, *doc_w;
+        int min_ord = 0;
+        int min_ord_found = 0; /* explicit flag, so that ord 0 is usable */
+
+        /* next ord to handle: smallest hit ord above the previous one */
+        for (hit_w = zebra_snippets_list(hit); hit_w; hit_w = hit_w->next)
+            if (hit_w->ord > ord &&
+                (!min_ord_found || hit_w->ord < min_ord))
+            {
+                min_ord = hit_w->ord;
+                min_ord_found = 1;
+            }
+        if (!min_ord_found)
+            break;
+        ord = min_ord;
+
+        /* try a window starting at each hit of this ord; keep the one
+           that covers the most hits */
+        for (hit_w = zebra_snippets_list(hit); hit_w; hit_w = hit_w->next)
+        {
+            if (hit_w->ord == ord)
+            {
+                zebra_snippet_word *look_w = hit_w;
+                int number_this = 0;
+                zint seq_no_last = 0;
+
+                /* scans forward from this hit and stops at the first
+                   entry at or beyond the window end, whatever its ord */
+                while (look_w && look_w->seqno < hit_w->seqno + window_size)
+                {
+                    if (look_w->ord == ord)
+                    {
+                        seq_no_last = look_w->seqno;
+                        number_this++;
+                    }
+                    look_w = look_w->next;
+                }
+                if (number_this > number_best_window)
+                {
+                    number_best_window = number_this;
+                    first_seq_no_best_window = hit_w->seqno;
+                    last_seq_no_best_window = seq_no_last;
+                }
+            }
+        }
+        yaz_log(YLOG_LOG, "ord=%d", ord);
+        yaz_log(YLOG_LOG, "first_seq_no_best_window=" ZINT_FORMAT,
+                first_seq_no_best_window);
+        yaz_log(YLOG_LOG, "last_seq_no_best_window=" ZINT_FORMAT,
+                last_seq_no_best_window);
+        yaz_log(YLOG_LOG, "number_best_window=%d", number_best_window);
+
+        /* centre the window on the span between first and last hit */
+        window_start = (first_seq_no_best_window + last_seq_no_best_window -
+                        window_size) / 2;
+        for (doc_w = zebra_snippets_list(doc); doc_w; doc_w = doc_w->next)
+            if (doc_w->ord == ord
+                && doc_w->seqno >= window_start
+                && doc_w->seqno < window_start + window_size)
+            {
+                int match = 0;
+
+                /* flag the word if a hit sits at the same position */
+                for (hit_w = zebra_snippets_list(hit); hit_w;
+                     hit_w = hit_w->next)
+                {
+                    if (hit_w->ord == ord && hit_w->seqno == doc_w->seqno)
+                    {
+                        match = 1;
+                        break;
+                    }
+                }
+                zebra_snippets_append_match(result, doc_w->seqno, ord,
+                                            doc_w->term, match);
+            }
+    }
+    return result;
+}
+