1 /* Copyright (C) 2006, Index Data ApS
2 * See the file LICENSE for details.
4 * $Id: nfa.c,v 1.2 2006-05-03 11:10:00 heikki Exp $
6 * This is a simple NFA-based system for character set normalizing
7 * in yaz and zebra. Unlike standard NFA's, this operates on ranges of
8 * characters at a time, so as to keep the size small.
10 * All characters are internally handled as unsigned 32-bit ints, so
11 * this works well with unicode. Translating to and from utf-8 is trivial, if
14 * The NFA stores a translation-thing in the terminal nodes. It does not
15 * concern itself with what that thing is, for us it is juts a void*
17 * The module consists of two parts: Building the NFA, and translating
29 typedef struct yaz_nfa_backref_info yaz_backref_info;
31 struct yaz_nfa_backref_info {
38 int nstates; /* how many states do we have */
39 int nbackrefs; /* how many backrefs do we have */
40 yaz_nfa_state *laststate; /* points to the last in the circular list */
41 yaz_nfa_state *firststate; /* the initial state */
42 yaz_backref_info *curr_backrefs; /* the backrefs we are matching */
43 yaz_backref_info *best_backrefs; /* backrefs of the best match so far*/
46 struct yaz_nfa_state {
47 int num; /* state number. used for resoving ambiguities, and for dumping */
48 void *result; /* signals a terminal state, and gives the result */
49 yaz_nfa_transition *lasttrans; /* circular list of transitions */
50 yaz_nfa_state *next; /* Circular linked list */
51 int backref_start; /* which backreference starts here. 0 if none */
52 int backref_end; /* which backreference ends here. 0 if none */
55 struct yaz_nfa_transition {
56 yaz_nfa_state *to_state; /* where to */
57 yaz_nfa_char range_start;
58 yaz_nfa_char range_end;
59 yaz_nfa_transition *next; /* a linked list of them */
62 /* a range from 1 down to 0 is impossible, and is used to denote an */
63 /* empty (epsilon) transition */
67 /* a limit for the number of empty transitions we can have in a row before */
68 /* we declare this a loop */
69 #define LOOP_LIMIT 100
75 * Initializing and destroying whole nfa's
78 yaz_nfa *yaz_nfa_init() {
79 NMEM my_nmem = nmem_create();
80 yaz_nfa *n=nmem_malloc(my_nmem, sizeof(yaz_nfa));
91 void yaz_nfa_destroy(yaz_nfa *n) {
92 nmem_destroy(n->nmem);
97 * Low-level interface to building the NFA
100 yaz_nfa_state *yaz_nfa_add_state(yaz_nfa *n) {
101 yaz_nfa_state *s = nmem_malloc(n->nmem,sizeof(yaz_nfa_state));
102 s->num = (n->nstates)++;
108 s->next=n->laststate->next;
109 n->laststate->next=s;
111 } else { /* first state */
119 int yaz_nfa_set_result(yaz_nfa *n, yaz_nfa_state *s,void *result) {
120 if ((s->result)&&result)
126 void *yaz_nfa_get_result(yaz_nfa *n, yaz_nfa_state *s) {
132 int yaz_nfa_set_backref(yaz_nfa *n, yaz_nfa_state *s,
136 if (s->backref_start)
138 s->backref_start=backref_number;
139 if (n->nbackrefs<backref_number) {
140 n->nbackrefs=backref_number;
143 /* clear them just in case we have already matched on */
144 /* with this nfa, and created a too small backref table */
145 /* we will reallocate when matching. */
150 if (n->nbackrefs<backref_number)
151 return 2; /* can't end a backref that has not been started */
152 s->backref_end=backref_number;
157 int yaz_nfa_get_backref(yaz_nfa *n, yaz_nfa_state *s,
162 return s->backref_start;
164 return s->backref_end;
167 void yaz_nfa_add_transition(yaz_nfa *n,
168 yaz_nfa_state *from_state,
169 yaz_nfa_state *to_state,
170 yaz_nfa_char range_start,
171 yaz_nfa_char range_end) {
172 yaz_nfa_transition *t=nmem_malloc(n->nmem,sizeof(yaz_nfa_transition));
173 t->range_start=range_start;
174 t->range_end=range_end;
175 t->to_state=to_state;
176 if (from_state->lasttrans) {
177 t->next= from_state->lasttrans->next;
178 from_state->lasttrans->next=t;
179 from_state->lasttrans=t;
180 } else { /* first trans */
181 from_state->lasttrans=t;
186 void yaz_nfa_add_empty_transition( yaz_nfa *n,
187 yaz_nfa_state *from_state,
188 yaz_nfa_state *to_state) {
189 yaz_nfa_add_transition(n,from_state,to_state,
190 EMPTY_START,EMPTY_END);
194 * Medium-level interface
197 /* Finds a transition from s where the range is exactly c..c */
198 /* There should only be one such transition */
199 static yaz_nfa_state *find_single_trans(
201 yaz_nfa_char range_start,
202 yaz_nfa_char range_end) {
203 yaz_nfa_transition *t=s->lasttrans;
208 if ( ( t->range_start == range_start ) && ( t->range_end == range_end) )
210 } while (t != s->lasttrans );
215 yaz_nfa_state *yaz_nfa_add_range(yaz_nfa *n,
217 yaz_nfa_char range_start,
218 yaz_nfa_char range_end) {
219 yaz_nfa_state *nextstate;
220 if (!s) /* default to top-level of the nfa */
222 nextstate=find_single_trans(s,range_start,range_end);
224 nextstate = yaz_nfa_add_state(n);
225 yaz_nfa_add_transition(n, s, nextstate, range_start, range_end);
230 yaz_nfa_state *yaz_nfa_add_sequence(yaz_nfa *n,
233 yaz_nfa_state *nextstate;
234 if (!s) /* default to top-level of the nfa */
236 nextstate=find_single_trans(s,*seq,*seq);
239 if (!*seq) /* whole sequence matched */
242 return yaz_nfa_add_sequence(n, nextstate, seq);
243 } else { /* no next state, build the rest */
245 s=yaz_nfa_add_range(n,s, *seq, *seq);
250 return 0; /* compiler shut up, we return somewhere above */
261 yaz_nfa_char *longest;
265 int empties; /* count how many consecutive empty transitions */
268 /* Finds the longest match. In case of ties, chooses the one with the
269 * lowest numbered state. Keep track of the back references. Recursively
270 * traverses the NFA. Keeps track of the best hit it has found. */
272 static void match_state(
274 yaz_nfa_char *inchar,
276 struct matcher *m ) {
277 yaz_nfa_transition *t=s->lasttrans;
278 if (s->backref_start)
279 m->n->curr_backrefs[s->backref_start].start=inchar;
281 m->n->curr_backrefs[s->backref_end].end=inchar;
286 if ( (( t->range_start <= *inchar ) && ( t->range_end >= *inchar )) ){
288 match_state(t->to_state, inchar+1,incharsleft-1,m);
289 /* yes, descent to all matching nodes, even if overrun, */
290 /* since we can find a better one later */
291 } else if (( t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) {
292 if ( m->empties++ > LOOP_LIMIT )
293 m->errorcode= YAZ_NFA_LOOP;
295 match_state(t->to_state, inchar,incharsleft,m);
297 } while (t != s->lasttrans );
299 m->errorcode=YAZ_NFA_OVERRUN;
302 if (s->result) { /* terminal node */
303 if ( (m->longest < inchar) || /* longer result */
304 ((m->longest == inchar)&&(m->bestnode<s->num)) ){
305 /* or as long, but with lower node number. Still better */
310 if (m->n->curr_backrefs)
311 for (i=0; i<m->n->nbackrefs;i++)
312 m->n->best_backrefs[i]=m->n->curr_backrefs[i];
315 if (s->backref_start)
316 m->n->curr_backrefs[s->backref_start].start=0;
318 m->n->curr_backrefs[s->backref_end].end=0;
321 int yaz_nfa_match(yaz_nfa *n,
322 yaz_nfa_char **inbuff,
327 return YAZ_NFA_NOMATCH;
330 m.bestnode=n->nstates;
333 if ((!n->curr_backrefs) && (n->nbackrefs)){
334 int sz=sizeof( struct yaz_nfa_backref_info) * (n->nbackrefs+1);
335 n->curr_backrefs=nmem_malloc( n->nmem, sz);
336 n->best_backrefs=nmem_malloc( n->nmem, sz);
340 match_state(n->firststate, *inbuff, incharsleft, &m);
347 return YAZ_NFA_SUCCESS;
349 return YAZ_NFA_NOMATCH;
359 yaz_nfa_state *yaz_nfa_get_first(yaz_nfa *n){
362 return n->firststate;
365 yaz_nfa_state *yaz_nfa_get_next(yaz_nfa *n, yaz_nfa_state *s){
375 static void dump_trans(FILE *F, yaz_nfa_transition *t ) {
381 if ( (t->range_start <= ' ') || (t->range_start>'z'))
383 if ( (t->range_end <= ' ') || (t->range_end>'z'))
385 if ((t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) {
388 fprintf(F," -> [%d] %s '%c' %x - '%c' %x \n",
394 static void dump_state(FILE *F, yaz_nfa_state *s,
395 char *(*strfunc)(void *) ) {
396 yaz_nfa_transition *t;
397 char *resultstring="";
400 resultstring=(*strfunc)(s->result);
403 resultstring=s->result;
405 fprintf(F," state [%d] %s %s",
406 s->num, s->result?"(FINAL)":"",resultstring );
407 if (s->backref_start) {
408 fprintf(F," start-%d",s->backref_start);
410 if (s->backref_end) {
411 fprintf(F," end-%d",s->backref_end);
416 fprintf(F," (no transitions)\n");
421 } while (t != s->lasttrans);
426 void yaz_nfa_dump(FILE *F, yaz_nfa *n,
427 char *(*strfunc)(void *) ) {
429 if (!F) /* lazy programmers can just pass 0 for F */
431 fprintf(F,"The NFA has %d states \n",n->nstates);
436 dump_state(F,s, strfunc);
437 } while (s != n->laststate);
446 * indent-tabs-mode: nil
448 * vim: shiftwidth=4 tabstop=8 expandtab