1 /* $Id: charsets.c,v 1.2 2007-05-23 14:44:18 marc Exp $
2 Copyright (c) 2006-2007, Index Data.
4 This file is part of Pazpar2.
6 Pazpar2 is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with Pazpar2; see the file LICENSE. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
23 \brief Pazpar2 Character set facilities
30 #include <yaz/xmalloc.h>
31 #include <yaz/wrbuf.h>
38 //#include "parameters.h"
/* Charset handle.  The fields visible here are a tokenizer callback and an
 * ICU chain pointer; pp2_charset_create() decides which tokenizer the
 * callback points at. */
45 struct pp2_charset_s {
/* Invoked by pp2_relevance_token_next() to fetch the next token. */
46 const char *(*token_next_handler)(pp2_relevance_token_t prt);
47 /* other handlers will come as we see fit */
/* Chain pointer stored by pp2_charset_create() and passed to the
 * icu_chain_* calls elsewhere in this file. */
49 struct icu_chain * icu_chn;
/* Forward declarations of the two tokenizer callbacks that
 * pp2_charset_create() can install as token_next_handler. */
54 static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt);
57 static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt);
60 /* tokenizer handle */
61 struct pp2_relevance_token_s {
62 const char *cp; /* unnormalized buffer we're tokenizing */
63 pp2_charset_t pct; /* our main charset handle (type+config) */
64 WRBUF norm_str; /* normalized string we return (temporarily) */
/* Allocate a charset handle with xmalloc() and select its tokenizer.
 * NOTE(review): the lines visible here assign token_next_handler three
 * times (ICU once, a-to-z twice); the conditions choosing between them are
 * not visible in this view - presumably keyed on icu_chn and a build-time
 * ICU switch, confirm against the full file. */
67 pp2_charset_t pp2_charset_create(struct icu_chain * icu_chn)
69 pp2_charset_t pct = xmalloc(sizeof(*pct));
/* Store the caller's chain pointer and reset the ICU status code. */
73 pct->icu_chn = icu_chn;
74 pct->icu_sts = U_ZERO_ERROR;
75 pct->token_next_handler = pp2_relevance_token_icu;
79 pct->token_next_handler = pp2_relevance_token_a_to_z;
82 pct->token_next_handler = pp2_relevance_token_a_to_z;
/* Takes a charset handle; the body is not visible in this view.
 * NOTE(review): presumably releases the handle allocated by
 * pp2_charset_create() - confirm against the full file. */
88 void pp2_charset_destroy(pp2_charset_t pct)
/* Begin tokenizing a buffer with the given charset handle.  The visible
 * lines allocate a token handle with xmalloc(), reset the ICU status, hand
 * buf to the ICU chain via icu_chain_assign_cstr(), and allocate the WRBUF
 * used for normalized output.  Whether the ICU steps are conditional is not
 * visible in this view. */
93 pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct,
96 pp2_relevance_token_t prt = xmalloc(sizeof(*prt));
/* Reset the ICU status before assigning the new buffer to the chain. */
102 pct->icu_sts = U_ZERO_ERROR;
104 ok = icu_chain_assign_cstr(pct->icu_chn, buf, &pct->icu_sts);
/* NOTE(review): debug output written straight to stdout - looks like
 * leftover tracing; consider removing it or using the project's logger. */
105 printf("\nfield ok: %d '%s'\n", ok, buf);
114 prt->norm_str = wrbuf_alloc();
/* Tear down a token handle.  The visible line destroys the WRBUF holding
 * the normalized token text; any further cleanup is not visible in this
 * view. */
125 void pp2_relevance_token_destroy(pp2_relevance_token_t prt)
129 wrbuf_destroy(prt->norm_str);
/* Return the next token by dispatching to the tokenizer callback stored in
 * the charset handle that prt refers to. */
133 const char *pp2_relevance_token_next(pp2_relevance_token_t prt)
136 return (prt->pct->token_next_handler)(prt);
/* Map the letters a..z to 1..26; every other value maps to -1.
 * NOTE: the argument is expanded up to three times, so do not pass an
 * expression with side effects. */
139 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 1 : -1)
/* Tokenizer callback for the a-to-z case.  From the visible lines: it
 * first loops over characters for which raw_char() yields -1, then rewinds
 * norm_str and appends the raw_char() code of each following letter, and
 * finally returns norm_str as a C string.  The loop bodies themselves are
 * not visible in this view. */
140 /* original tokenizer with our tokenize interface, but we
141 add +1 to ensure no '\0' are in our string (except for EOF)
143 static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt)
145 const char *cp = prt->cp;
148 /* skip white space */
/* NOTE(review): tolower() requires a value representable as unsigned char
 * (or EOF); *cp is a plain char, so bytes >= 0x80 can be negative where
 * char is signed - consider casting to unsigned char. */
149 while (*cp && (c = raw_char(tolower(*cp))) < 0)
156 /* now read the term itself */
157 wrbuf_rewind(prt->norm_str);
158 while (*cp && (c = raw_char(tolower(*cp))) >= 0)
/* c is the 1..26 code from raw_char(), never 0, so no NUL byte is written
 * into the middle of the buffer. */
160 wrbuf_putc(prt->norm_str, c);
164 return wrbuf_cstr(prt->norm_str);
/* Tokenizer callback for the ICU case: asks the ICU chain for its next
 * token and returns whatever icu_chain_get_norm() reports for the chain.
 * What is returned once the chain has no more tokens is not visible in
 * this view. */
169 static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt)
171 //&& U_SUCCESS(pct->icu_sts))
172 if (icu_chain_next_token(prt->pct->icu_chn, &prt->pct->icu_sts)){
/* NOTE(review): the printf calls below write debug text to stdout - looks
 * like leftover tracing; consider removing or using the project's logger. */
173 printf("'%s' ", icu_chain_get_norm(prt->pct->icu_chn));
174 if (U_FAILURE(prt->pct->icu_sts))
176 printf("ICU status failure\n ");
180 return icu_chain_get_norm(prt->pct->icu_chn);
192 * indent-tabs-mode: nil
194 * vim: shiftwidth=4 tabstop=8 expandtab