1 /* Copyright (C) 2006, Index Data ApS
2 * See the file LICENSE for details.
4 * $Id: nfa.c,v 1.1 2006-05-03 09:04:33 heikki Exp $
6 * This is a simple NFA-based system for character set normalizing
7 * in yaz and zebra. Unlike standard NFA's, this operates on ranges of
8 * characters at a time, so as to keep the size small.
10 * All characters are internally handled as unsigned 32-bit ints, so
11 * this works well with unicode. Translating to and from utf-8 is trivial, if
14 * The NFA stores a translation-thing in the terminal nodes. It does not
15 * concern itself with what that thing is, for us it is juts a void*
17 * The module consists of two parts: Building the NFA, and translating
29 typedef struct yaz_nfa_backref_info yaz_backref_info;
31 struct yaz_nfa_backref_info {
38 int nstates; /* how many states do we have */
39 int nbackrefs; /* how many backrefs do we have */
40 yaz_nfa_state *laststate; /* points to the last in the circular list */
41 yaz_nfa_state *firststate; /* the initial state */
42 yaz_backref_info *curr_backrefs; /* the backrefs we are matching */
43 yaz_backref_info *best_backrefs; /* backrefs of the best match so far*/
46 struct yaz_nfa_state {
47 int num; /* state number. used for resoving ambiguities, and for dumping */
48 void *result; /* signals a terminal state, and gives the result */
49 yaz_nfa_transition *lasttrans; /* circular list of transitions */
50 yaz_nfa_state *next; /* Circular linked list */
51 int backref_start; /* which backreference starts here. 0 if none */
52 int backref_end; /* which backreference ends here. 0 if none */
55 struct yaz_nfa_transition {
56 yaz_nfa_state *to_state; /* where to */
57 yaz_nfa_char range_start;
58 yaz_nfa_char range_end;
59 yaz_nfa_transition *next; /* a linked list of them */
62 /* a range from 1 down to 0 is impossible, and is used to denote an */
63 /* empty (epsilon) transition */
67 /* a limit for the number of empty transitions we can have in a row before */
68 /* we declare this a loop */
69 #define LOOP_LIMIT 100
75 * Initializing and destroying whole nfa's
78 yaz_nfa *yaz_nfa_init() {
79 NMEM my_nmem = nmem_create();
80 yaz_nfa *n=nmem_malloc(my_nmem, sizeof(yaz_nfa));
87 void yaz_nfa_destroy(yaz_nfa *n) {
88 nmem_destroy(n->nmem);
93 * Low-level interface to building the NFA
96 yaz_nfa_state *yaz_nfa_add_state(yaz_nfa *n) {
97 yaz_nfa_state *s = nmem_malloc(n->nmem,sizeof(yaz_nfa_state));
98 s->num = (n->nstates)++;
104 s->next=n->laststate->next;
105 n->laststate->next=s;
107 } else { /* first state */
115 int yaz_nfa_set_result(yaz_nfa *n, yaz_nfa_state *s,void *result) {
116 if ((s->result)&&result)
122 void *yaz_nfa_get_result(yaz_nfa *n, yaz_nfa_state *s) {
128 int yaz_nfa_set_backref(yaz_nfa *n, yaz_nfa_state *s,
132 if (s->backref_start)
134 s->backref_start=backref_number;
135 if (n->nbackrefs<backref_number)
136 n->nbackrefs=backref_number;
140 s->backref_end=backref_number;
145 int yaz_nfa_get_backref(yaz_nfa *n, yaz_nfa_state *s,
150 return s->backref_start;
152 return s->backref_end;
155 void yaz_nfa_add_transition(yaz_nfa *n,
156 yaz_nfa_state *from_state,
157 yaz_nfa_state *to_state,
158 yaz_nfa_char range_start,
159 yaz_nfa_char range_end) {
160 yaz_nfa_transition *t=nmem_malloc(n->nmem,sizeof(yaz_nfa_transition));
161 t->range_start=range_start;
162 t->range_end=range_end;
163 t->to_state=to_state;
164 if (from_state->lasttrans) {
165 t->next= from_state->lasttrans->next;
166 from_state->lasttrans->next=t;
167 from_state->lasttrans=t;
168 } else { /* first trans */
169 from_state->lasttrans=t;
174 void yaz_nfa_add_empty_transition( yaz_nfa *n,
175 yaz_nfa_state *from_state,
176 yaz_nfa_state *to_state) {
177 yaz_nfa_add_transition(n,from_state,to_state,
178 EMPTY_START,EMPTY_END);
182 * Medium-level interface
185 /* Finds a transition from s where the range is exactly c..c */
186 /* There should only be one such transition */
187 static yaz_nfa_state *find_single_trans(
189 yaz_nfa_char range_start,
190 yaz_nfa_char range_end) {
191 yaz_nfa_transition *t=s->lasttrans;
196 if ( ( t->range_start == range_start ) && ( t->range_end == range_end) )
198 } while (t != s->lasttrans );
203 yaz_nfa_state *yaz_nfa_add_range(yaz_nfa *n,
205 yaz_nfa_char range_start,
206 yaz_nfa_char range_end) {
207 yaz_nfa_state *nextstate;
208 if (!s) /* default to top-level of the nfa */
210 nextstate=find_single_trans(s,range_start,range_end);
212 nextstate = yaz_nfa_add_state(n);
213 yaz_nfa_add_transition(n, s, nextstate, range_start, range_end);
218 yaz_nfa_state *yaz_nfa_add_sequence(yaz_nfa *n,
221 yaz_nfa_state *nextstate;
222 if (!s) /* default to top-level of the nfa */
224 nextstate=find_single_trans(s,*seq,*seq);
227 if (!*seq) /* whole sequence matched */
230 return yaz_nfa_add_sequence(n, nextstate, seq);
231 } else { /* no next state, build the rest */
233 s=yaz_nfa_add_range(n,s, *seq, *seq);
238 return 0; /* compiler shut up, we return somewhere above */
248 yaz_nfa_char *longest;
252 int empties; /* count how many consecutive empty transitions */
255 /* Finds the longest match. In case of ties, chooses the one with the
256 * lowest numbered state */
257 static void match_state(yaz_nfa_state *s,
258 yaz_nfa_char *inchar,
260 struct matcher *m ) {
261 yaz_nfa_transition *t=s->lasttrans;
267 if ( (( t->range_start <= *inchar ) && ( t->range_end >= *inchar )) ){
269 match_state(t->to_state, inchar+1,incharsleft-1,m);
270 /* yes, descent to all matching nodes, even if overrun, */
271 /* since we can find a better one later */
272 } else if (( t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) {
273 if ( m->empties++ > LOOP_LIMIT )
274 m->errorcode= YAZ_NFA_LOOP;
276 match_state(t->to_state, inchar,incharsleft,m);
278 } while (t != s->lasttrans );
280 m->errorcode=YAZ_NFA_OVERRUN;
283 if (s->result) { /* terminal node */
284 if ( (m->longest < inchar) || /* longer result */
285 ((m->longest == inchar)&&(m->bestnode<s->num)) ){
286 /* or as long, but with lower node number. Still better */
294 int yaz_nfa_match(yaz_nfa *n,
295 yaz_nfa_char **inbuff,
300 return YAZ_NFA_NOMATCH;
302 m.bestnode=n->nstates;
305 if (!n->curr_backrefs) {
306 int sz=sizeof( struct yaz_nfa_backref_info);
307 n->curr_backrefs=nmem_malloc( n->nmem, n->nbackrefs* sz);
308 n->best_backrefs=nmem_malloc( n->nmem, n->nbackrefs* sz);
312 match_state(n->firststate, *inbuff, incharsleft, &m);
319 return YAZ_NFA_SUCCESS;
321 return YAZ_NFA_NOMATCH;
331 yaz_nfa_state *yaz_nfa_get_first(yaz_nfa *n){
334 return n->firststate;
337 yaz_nfa_state *yaz_nfa_get_next(yaz_nfa *n, yaz_nfa_state *s){
347 static void dump_trans(FILE *F, yaz_nfa_transition *t ) {
353 if ( (t->range_start <= ' ') || (t->range_start>'z'))
355 if ( (t->range_end <= ' ') || (t->range_end>'z'))
357 if ((t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) {
360 fprintf(F," -> [%d] %s '%c' %x - '%c' %x \n",
366 static void dump_state(FILE *F, yaz_nfa_state *s,
367 char *(*strfunc)(void *) ) {
368 yaz_nfa_transition *t;
369 char *resultstring="";
372 resultstring=(*strfunc)(s->result);
375 resultstring=s->result;
377 fprintf(F," state [%d] %s %s",
378 s->num, s->result?"(FINAL)":"",resultstring );
379 if (s->backref_start) {
380 fprintf(F," start-%d",s->backref_start);
382 if (s->backref_end) {
383 fprintf(F," end-%d",s->backref_end);
388 fprintf(F," (no transitions)\n");
393 } while (t != s->lasttrans);
398 void yaz_nfa_dump(FILE *F, yaz_nfa *n,
399 char *(*strfunc)(void *) ) {
401 if (!F) /* lazy programmers can just pass 0 for F */
403 fprintf(F,"The NFA has %d states \n",n->nstates);
408 dump_state(F,s, strfunc);
409 } while (s != n->laststate);
418 * indent-tabs-mode: nil
420 * vim: shiftwidth=4 tabstop=8 expandtab