From 4b903c542156253ebb2a4f004b528fe9e3af5212 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Mon, 29 Oct 2007 13:43:57 +0000 Subject: [PATCH] First indexing using index_types system (ICU). --- include/idzebra/recctrl.h | 2 +- index/extract.c | 87 +++++++++++++++++++++++++++++++++++++++++++-- test/api/Makefile.am | 7 ++-- test/api/indextypes17.xml | 18 ++++++++++ test/api/t17.c | 66 ++++++++++++++++++++++++++++++++++ test/api/zebra17.cfg | 12 +++++++ 6 files changed, 186 insertions(+), 6 deletions(-) create mode 100644 test/api/indextypes17.xml create mode 100644 test/api/t17.c create mode 100644 test/api/zebra17.cfg diff --git a/include/idzebra/recctrl.h b/include/idzebra/recctrl.h index c7c92cc..92f46e4 100644 --- a/include/idzebra/recctrl.h +++ b/include/idzebra/recctrl.h @@ -1,4 +1,4 @@ -/* $Id: recctrl.h,v 1.35 2007-05-08 12:50:04 adam Exp $ +/* $Id: recctrl.h,v 1.36 2007-10-29 13:43:57 adam Exp $ Copyright (C) 1995-2007 Index Data ApS diff --git a/index/extract.c b/index/extract.c index 24e5b9e..0eec47e 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.263 2007-10-29 09:25:40 adam Exp $ +/* $Id: extract.c,v 1.264 2007-10-29 13:43:57 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -20,6 +20,10 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +/** \file + \brief indexes records and extract tokens for indexing and sorting +*/ + #include #include #include @@ -31,10 +35,12 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #endif #include + #include "index.h" #include "orddict.h" #include #include +#include static int log_level_extract = 0; static int log_level_details = 0; @@ -68,6 +74,7 @@ static void extract_flush_sort_keys(ZebraHandle zh, zint sysno, int cmd, zebra_rec_keys_t skp); static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid); static void extract_token_add(RecWord *p); +static void extract_token_add2(RecWord *p); static void 
check_log_limit(ZebraHandle zh) { @@ -828,7 +835,14 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, stream->endf(stream, &null_offset);; extractCtrl.init = extract_init; - extractCtrl.tokenAdd = extract_token_add; + if (zh->reg->index_types) /* when zh->reg->index_types is set, tokens are handed to extract_token_add2; otherwise to extract_token_add */ + { + extractCtrl.tokenAdd = extract_token_add2; + } + else + { + extractCtrl.tokenAdd = extract_token_add; + } extractCtrl.schemaAdd = extract_schema_add; extractCtrl.dh = zh->reg->dh; extractCtrl.handle = zh; @@ -1744,6 +1758,75 @@ static void extract_add_complete_field(RecWord *p) extract_add_string(p, buf, i); } +/** \brief tokenizes the term in p with the given index type and writes one key per token to zh->reg->keys \param zh Zebra handle \param type index type handed to zebra_index_type_tokenize \param p token data; term_buf/term_len are tokenized, and seqno is incremented once for every token written */ static void extract_token_add2_index(ZebraHandle zh, zebra_index_type_t type, + RecWord *p) +{ + struct it_key key; + const char *res_buf = 0; + size_t res_len = 0; + int r = zebra_index_type_tokenize(type, p->term_buf, p->term_len, + &res_buf, &res_len); /* first call supplies the term; a non-zero r is treated below as "a token is available in res_buf/res_len" (NOTE(review): confirm against the zebra_index_type_tokenize contract) */ + int cat = zinfo_index_category_index; + int ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, p->index_type, p->index_name); /* look up the entry for this index type/name in the index category */ + if (ch < 0) /* lookup returned a negative value: add the entry and use the value returned by the add call */ + ch = zebraExplain_add_attr_str(zh->reg->zei, cat, p->index_type, p->index_name); + while (r) /* one iteration per token */ + { + int i = 0; /* key layout: ch, record_id, section_id, segment (only with m_segment_indexing), seqno */ + key.mem[i++] = ch; + key.mem[i++] = p->record_id; + key.mem[i++] = p->section_id; + + if (zh->m_segment_indexing) + key.mem[i++] = p->segment; + key.mem[i++] = p->seqno; + key.len = i; /* number of key.mem slots filled above */ + + yaz_log(YLOG_LOG, "keys_write %.*s", (int) res_len, res_buf); /* logged for every token at YLOG_LOG level */ + zebra_rec_keys_write(zh->reg->keys, res_buf, res_len, &key); + + p->seqno++; + r = zebra_index_type_tokenize(type, 0, 0, &res_buf, &res_len); /* buffer arguments are 0 here; presumably this continues with the term given in the first call (TODO confirm) */ + } +} + +/** \brief token handler used when index_types are configured; dispatches on the index type looked up in zh->reg->index_types \param p token data; p->index_type (a single char) selects the type. Nothing is done when no type is found */ static void extract_token_add2(RecWord *p) +{ + zebra_index_type_t type; + ZebraHandle zh = p->extractCtrl->handle; + char type_tmp[2]; /* the single index_type char wrapped in a NUL-terminated buffer for zebra_index_type_get */ + type_tmp[0] = p->index_type; + type_tmp[1] = '\0'; + type = zebra_index_type_get(zh->reg->index_types, type_tmp); + if (type) + { + if (zebra_index_type_is_index(type)) + { + extract_token_add2_index(zh, type, p); + } + else if (zebra_index_type_is_sort(type)) + { + ; /* sort types: empty statement, nothing is written for them yet */ + + } + } +} + +/** \brief top-level indexing handler for recctrl system + \param p token data to be 
indexed + + Call sequence: + extract_token + zebra_add_{in}_complete + extract_add_string + + extract_add_index_string + or + extract_add_sort_string + or + extract_add_staticrank_string + +*/ static void extract_token_add(RecWord *p) { ZebraHandle zh = p->extractCtrl->handle; diff --git a/test/api/Makefile.am b/test/api/Makefile.am index 3c5f376..bec97ea 100644 --- a/test/api/Makefile.am +++ b/test/api/Makefile.am @@ -1,15 +1,15 @@ -# $Id: Makefile.am,v 1.40 2006-11-16 12:48:28 adam Exp $ +# $Id: Makefile.am,v 1.41 2007-10-29 13:43:58 adam Exp $ noinst_PROGRAMS = testclient testclient_SOURCES = testclient.c -simpletests = t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 t16 +simpletests = t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 t16 t17 safaritests = safari1 check_PROGRAMS = $(simpletests) $(safaritests) TESTS = $(check_PROGRAMS) EXTRA_DIST=zebra.cfg zebra6.cfg zebra8.cfg zebra10.cfg zebra15.cfg safari.cfg \ - t10.att t10.abs + t10.att t10.abs zebra17.cfg indextypes17.xml noinst_LIBRARIES = libtestlib.a @@ -32,6 +32,7 @@ t13_SOURCES = t13.c t14_SOURCES = t14.c t15_SOURCES = t15.c t16_SOURCES = t16.c +t17_SOURCES = t17.c safari1_SOURCES = safari1.c testlib.c diff --git a/test/api/indextypes17.xml b/test/api/indextypes17.xml new file mode 100644 index 0000000..49b4d21 --- /dev/null +++ b/test/api/indextypes17.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + diff --git a/test/api/t17.c b/test/api/t17.c new file mode 100644 index 0000000..84b1ae3 --- /dev/null +++ b/test/api/t17.c @@ -0,0 +1,66 @@ +/* $Id: t17.c,v 1.1 2007-10-29 13:43:58 adam Exp $ + Copyright (C) 1995-2007 + Index Data ApS + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. 
+ +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ + +/** \file + \brief tests unicode enabled searching (index_types) +*/ +#include +#include "testlib.h" + +/* records handed to tl_init_data in tst; the list is terminated by a 0 entry */ const char *myrec[] = { + "\nMy title\n\n", + "\nMy x title\n\n", + "\nMy title x\n\n" , + 0} ; + +/** \brief starts a service from zebra17.cfg, loads myrec, runs two title queries and shuts down \param argc argument count forwarded to tl_start_up \param argv argument vector forwarded to tl_start_up */ static void tst(int argc, char **argv) +{ + ZebraService zs = tl_start_up("zebra17.cfg", argc, argv); + ZebraHandle zh = zebra_open(zs, 0); + + YAZ_CHECK(tl_init_data(zh, myrec)); + + /* simple term */ + YAZ_CHECK(tl_query(zh, "@attr 1=title notfound", 0)); /* last argument is presumably the expected hit count (see the 3-vs-0 note below) */ + + /* we should get 3 hits. But 0 for now */ +#if 0 + + YAZ_CHECK(tl_query(zh, "@attr 1=title title", 3)); +#else + YAZ_CHECK(tl_query(zh, "@attr 1=title title", 0)); /* active branch: the check currently expects 0 for the term "title" */ +#endif + + + YAZ_CHECK(tl_close_down(zh, zs)); +} + +TL_MAIN + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/test/api/zebra17.cfg b/test/api/zebra17.cfg new file mode 100644 index 0000000..9c2fb1e --- /dev/null +++ b/test/api/zebra17.cfg @@ -0,0 +1,12 @@ +# $Id: zebra17.cfg,v 1.1 2007-10-29 13:43:58 adam Exp $ +profilepath: ${srcdir:-.}:${srcdir:-.}/../../tab + +attset: bib1.att +attset: explain.att + +recordType: grs.sgml + +indextypes: indextypes17.xml + +isam: b + -- 1.7.10.4