2 * Copyright (c) 1995-2006, Index Data
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Index Data nor the names of its contributors
13 * may be used to endorse or promote products derived from this
14 * software without specific prior written permission.
16 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 /* $Id: nfa.h,v 1.12 2006-10-09 21:02:41 adam Exp $ */
30 * \brief NFA for character set normalizing
32 * The NFA is a character mathcing device. It consists of states
33 * and transitions between them. Each transition has a condition, which
34 * is a range of values.
36 * When matching, we always start at the first state, and find the longest
37 * possible sequence of input characters that match the ranges in the
38 * conditions, and that leads into a terminal state.
40 * Separate from this we have converters. Those can often be used
41 * together with a NFA (think match-pattern and replace-pattern).
43 * A converter is a routine that produces some output. It can translate a
44 * range of characters into another range, emit a constant string, or
45 * something like that.
52 #include <yaz/yconfig.h>
57 /** \name return codes and data types*/
60 #define YAZ_NFA_SUCCESS 0
62 /** \brief no match found */
63 #define YAZ_NFA_NOMATCH 1
65 /** \brief Need more input */
66 #define YAZ_NFA_OVERRUN 2
68 /** \brief The NFA is looping */
69 #define YAZ_NFA_LOOP 3
71 /** \brief no room in output buffer */
72 #define YAZ_NFA_NOSPACE 4
74 /** \brief tryig to set a result when one already exists*/
75 #define YAZ_NFA_ALREADY 5
77 /** \brief Attempting to set an end to a backref that has not been started */
78 #define YAZ_NFA_NOSTART 6
80 /** \brief Asking for a non-existing backref */
81 #define YAZ_NFA_NOSUCHBACKREF 7
83 /** \brief Internal error, should never happen */
84 #define YAZ_NFA_INTERNAL 8
87 /** \brief Internal character type. 32-bit unicode! */
88 typedef unsigned int yaz_nfa_char;
91 /** \brief The NFA itself
92 * The internals are hidden in nfa.c */
93 typedef struct yaz_nfa yaz_nfa;
95 /** \brief A state of the NFA */
96 typedef struct yaz_nfa_state yaz_nfa_state;
98 /** \brief Transition from one state to another */
99 typedef struct yaz_nfa_transition yaz_nfa_transition;
101 /** \brief A converter produces some output to a buffer */
102 typedef struct yaz_nfa_converter yaz_nfa_converter;
106 /** \name Low-level interface to building the NFA */
109 /** \brief Initialize the NFA without any states in it
111 * \return a pointer to the newly created NFA
114 yaz_nfa *yaz_nfa_init(void);
116 /** \brief Destroy the whole thing */
117 void yaz_nfa_destroy(
118 yaz_nfa *n /** the nfa to destroy */
121 /** \brief Add a normal state to the NFA.
123 * The first state added will become the starting point.
124 * Returns a pointer to it, which you can safely ignore, or use in building
127 yaz_nfa_state *yaz_nfa_add_state(
128 yaz_nfa *n /** The NFA to add the state to */
132 /** \brief Sets the result pointer to a state
134 * \param n the NFA itself
135 * \param s the state to which the result will be added
136 * \param result the result pointer
138 * Sets the result pointer of a state. If already set, returns an error. Call
139 * with a NULL pointer to clear the result, before setting a new one.
141 * \retval YAZ_NFA_SUCCESS ok
142 * \retval YAZ_NFA_ALREADY The state already has a result!
144 int yaz_nfa_set_result(
150 /** \brief Gets the result pointer from a state
152 * \retval NULL if no result set
154 void *yaz_nfa_get_result(
155 yaz_nfa *n /** The NFA itself */,
156 yaz_nfa_state *s /** The state whose result you want */);
158 /** \brief Set a backref point to a state.
160 * Each state can be the beginning and/or ending point of a backref
161 * sequence. This call sets one of those flags in the state. After
162 * matching, we can get hold of the backrefs that matched, and use
163 * them in our translations. The numbering of backrefs start at 1,
167 * \param s the state to add to
168 * \param backref_number is the number of the back reference.
169 * \param is_start is 1 for start of the backref, 0 for end
171 * \retval YAZ_NFA_SUCCESS for OK
172 * \retval YAZ_NFA_ALREADY if the backref is already set
173 * \retval YAZ_NFA_NOSTART for ending a backref that has not been started
177 int yaz_nfa_set_backref_point(yaz_nfa *n, yaz_nfa_state *s,
181 /** \brief Get the backref point of a state
184 * \param s the state to add to
185 * \param is_start is 1 for start of the backref, 0 for end
186 * \return the backref number associated with the state, or 0 if none.
189 int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s,
192 /** \brief Add a transition to the NFA.
194 * Add a transition between two existing states. The condition
195 * is (as always) a range of yaz_nfa_chars.
197 * \param from_state which state the transition is from. null=initial
198 * \param to_state where the transition goes to
199 * \param range_start is the beginning of the range of values
200 * \param range_end is the end of the range of values
202 void yaz_nfa_add_transition(yaz_nfa *n,
203 yaz_nfa_state *from_state,
204 yaz_nfa_state *to_state,
205 yaz_nfa_char range_start,
206 yaz_nfa_char range_end);
208 /** \brief Add an empty (epsilon) transition.
211 * \param from_state which state the transition is from
212 * \param to_state where the transition goes to
214 void yaz_nfa_add_empty_transition( yaz_nfa *n,
215 yaz_nfa_state *from_state,
216 yaz_nfa_state *to_state);
218 /** \brief Add a translation from a range to the NFA.
221 * \param st the state to add this to. If null, adds to the initial state
222 * \param range_start is the beginning of the range of values
223 * \param range_end is the end of the range of values
225 * Checks if we already have a transition like this. If so, does not add
226 * a new one, but returns the target state. Otherwise creates a new state,
227 * and a transition to it.
229 yaz_nfa_state *yaz_nfa_add_range( yaz_nfa *n,
231 yaz_nfa_char range_start,
232 yaz_nfa_char range_end );
234 /** \brief Add a sequence of transitions and states.
237 * \param s the state to add this to. If null, adds to the initial state
238 * \param seq is a sequence of yaz_fna_chars.
239 * \param seq_len is the length of the sequence
240 * \return the final state
242 * Starting from state s (or from the initial state, if s is
243 * null), finds as much of seq as possible and inserts the rest.
245 yaz_nfa_state *yaz_nfa_add_sequence( yaz_nfa *n,
252 /** \name Low-level interface for mathcing the NFA. */
254 * These do the actual matching. They know nothing of
255 * the type of the result pointers
259 /** \brief Find the longest possible match.
261 * \param n the nfa itself
262 * \param inbuff buffer of input data. Will be incremented when match
263 * \param incharsleft max number of inchars to use from inbuff. decrements.
264 * \param result the result pointer from the nfa (what ever that is)
266 * In case of errors, returns the best match so far,
267 * which the caller is free to ignore.
269 * \retval YAZ_NFA_SUCCESS success
270 * \retval YAZ_NFA_NOMATCH no match found
271 * \retval YAZ_NFA_OVERRUN overrun of input buffer
272 * \retval YAZ_NFA_LOOP looping too far
276 int yaz_nfa_match(yaz_nfa *n, yaz_nfa_char **inbuff, size_t *incharsleft,
279 /** \brief Get a back reference after a successfull match.
282 * \param backref_no the number of the backref to get
283 * \param start beginning of the matching substring
284 * \param end end of the matching substring
286 * Returns pointers to the beginning and end of a backref, or null
287 * pointers if one endpoint not met. Those pointers point to the
288 * original buffer that was matched, so the caller will not have to
289 * worry about freeing anything special.
291 * It is technically possible to create NFAs that meet the start but
292 * not the end of a backref. It is up to the caller to decide how
293 * to handle such a situation.
295 * \retval YAZ_NFA_SUCCESS OK
296 * \retval YAZ_NFA_NOMATCH The NFA hasn't matched anything, no backref
297 * \retval YAZ_NFA_NOSUCHBACKREF no such backref
300 int yaz_nfa_get_backref( yaz_nfa *n,
302 yaz_nfa_char **start,
303 yaz_nfa_char **end );
307 /** \name Low-level interface to the converters */
308 /* These produce some output text into a buffer. There are a few
309 * kinds of converters, each producing different type of output.
313 /** \brief Create a string converter.
315 * \param string the string to output
316 * \param length how many chars in the string
318 * This converter produces a constant string in the output
320 yaz_nfa_converter *yaz_nfa_create_string_converter (
322 yaz_nfa_char *string,
325 /** \brief Create a backref converter
327 * \param backref_no The backreference to reproduce
329 * This converter copies a backref into the output buffer
331 yaz_nfa_converter *yaz_nfa_create_backref_converter (
332 yaz_nfa *n, int backref_no );
335 /** \brief Create a charcater range converter
337 * \param backref_no The backreference to reproduce
338 * \param from_char the first character of the original range
339 * \param to_char the first character of the target range
341 * This converter takes a backreference, and shifts the characters
342 * by a constant value. For example, translating a-z to A-Z.
343 * Note that backref 0 is always the last character that matched a
344 * range, even if no backrefs were defined in teh nfa. This makes
345 * it pretty useful with this converter.
348 yaz_nfa_converter *yaz_nfa_create_range_converter (
349 yaz_nfa *n, int backref_no,
350 yaz_nfa_char from_char,
351 yaz_nfa_char to_char);
354 /** \brief Connects converters in a chain.
355 * \param n the nfa (mostly for nmem access)
356 * \param startpoint the first converter in the chain
357 * \param newconverter
359 * Places the new converter at the end of the chain that starts from
363 void yaz_nfa_append_converter (
365 yaz_nfa_converter *startpoint,
366 yaz_nfa_converter *newconverter);
368 /** brief Runs the chain of converters.
369 * \param n the nfa (mostly for nmem access)
370 * \param c the first converter in a chain
371 * \param outbuff buffer to write the output in. Increments the ptr.
372 * \param outcharsleft how many may we write
374 * Runs the converters in the chain, placing output into outbuff
375 * (and incrementing the pointer).
377 * \retval YAZ_NFA_SUCCESS OK
378 * \retval YAZ_NFA_NOMATCH no match to get backrefs from
379 * \retval YAZ_NFA_NOSPACE no room in outbuf
380 * \retval YAZ_NFA_INTERNAL Should never happen
383 int yaz_nfa_run_converters(
385 yaz_nfa_converter *c,
386 yaz_nfa_char **outbuff,
387 size_t *outcharsleft);
391 /** \name High-level interface to the NFA */
392 /* This interface combines the NFA and converters, for ease of
393 * access. There are a few calls to build a complete system, and a call
394 * to do the actual conversion.
398 /** \brief Add a rule that converts one string to another ('IX' -> '9')
400 * \param n The nfa itself
401 * \param from_string the string to match
402 * \param from_length length of the from_string
403 * \param to_string the string to write in the output
404 * \param to_length length of the to_string
406 * Adds a matching rule and a string converter to the NFA.
407 * Can be used for converting strings into nothing, for example,
410 * \retval YAZ_NFA_SUCCESS OK
411 * \retval YAZ_NFA_ALREADY Conflict with some other rule
414 int yaz_nfa_add_string_rule( yaz_nfa *n,
415 yaz_nfa_char *from_string,
417 yaz_nfa_char *to_string,
420 /** brief Just like yaz_nfa_add_string_rule, but takes the strings in ascii
422 * \param n The nfa itself
423 * \param from_string the string to match
424 * \param to_string the string to write in the output
426 * Like yaz_nfa_add_string_rule, this adds a rule to translate a string
427 * into another. The only difference is that this one takes the strings as
428 * normal char *, which means that no high-valued unicodes can be handled,
429 * and that this one uses null-terminated strings. In short, this is a
430 * simplified version mostly intended for tests and other small uses.
432 * \retval YAZ_NFA_SUCCESS OK
433 * \retval YAZ_NFA_ALREADY Conflict with some other rule
435 int yaz_nfa_add_ascii_string_rule( yaz_nfa *n,
440 /** \brief Add a rule that converts a character range
442 * \param n The nfa itself
443 * \param range_start Where the matching range starts
444 * \param range_end Where the matching range ends
445 * \param output_range_start Where the resulting range starts
448 * Adds a character range rule to the NFA. The range to be converted
449 * is defined by the range_start and range_end parameters. The output
450 * range starts at output_range_start, and is automatically as long
451 * as the input range.
453 * Useful for alphabet normalizing [a-z] -> [A-Z]
455 * \retval YAZ_NFA_SUCCESS OK
456 * \retval YAZ_NFA_ALREADY Conflict with some other rule
458 int yaz_nfa_add_char_range_rule (yaz_nfa *n,
459 yaz_nfa_char range_start,
460 yaz_nfa_char range_end,
461 yaz_nfa_char output_range_start);
463 /** \brief Add a rule that converts a character range to a string
465 * \param n The nfa itself
466 * \param range_start Where the matching range starts
467 * \param range_end Where the matching range ends
468 * \param to_string the string to write in the output
469 * \param to_length length of the to_string
471 * \retval YAZ_NFA_SUCCESS OK
472 * \retval YAZ_NFA_ALREADY Conflict with some other rule
474 * Adds a character range match rule, and a string converter.
476 * Useful in converting a range of special characters into (short?)
477 * strings of whitespace, or even to nothing at all.
479 int yaz_nfa_add_char_string_rule (yaz_nfa *n,
480 yaz_nfa_char range_start,
481 yaz_nfa_char range_end,
482 yaz_nfa_char* to_string,
485 /** \brief Converts one 'slice' that is, the best matching rule.
487 * \param n the nfa itself
488 * \param inbuff buffer of input data. Will be incremented when match
489 * \param incharsleft max number of inchars to use from inbuff. decrements.
490 * \param outbuff buffer for output data. Will be incremented when match
491 * \param outcharsleft max number of chars to write to outbuff.
493 * \retval YAZ_NFA_SUCCESS OK
494 * \retval YAZ_NFA_OVERRUN No more input data, some pattern could match
495 * \retval YAZ_NFA_NOSPACE No room in the putput buffer
496 * \retval YAZ_NFA_NOSUCHBACKREF NFA refers to a non-existing backref
498 * Finds the best match at the beginning of inbuf, and fires its converter(s)
499 * to produce output in outbuff. Increments both inbuf and outbuf pointers and
500 * decrements the *charsleft values, so all is ready for calling again, until
501 * the buffer is exhausted. That loop is left to the caller, so he can load
502 * more data in the buffer in good time.
504 * If no match is found, converts one character into itself. If the matcher
505 * returns any sort of error, leaves the pointers where they were.
507 int yaz_nfa_convert_slice (yaz_nfa *n,
508 yaz_nfa_char **inbuff,
510 yaz_nfa_char **outbuff,
511 size_t *outcharsleft);
516 /** \name Debug routines */
517 /* These provide a method for traversing all the states defined
518 * in the NFA, for example to release memory allocated in the results,
519 * and a simple debug routine to dump the NFA */
523 /** \brief Get the first state of the NFA.
527 * Useful for iterating through all states, probably calling get_result
528 * for each, and doing something to the results (freeing memory?)
530 * \returns a pointer to the first state, or NULL if none.
532 yaz_nfa_state *yaz_nfa_get_first(yaz_nfa *n);
534 /** \brief Get the next state of the NFA.
537 * \param s the state to add to
538 * \return the next state, or NULL if no more.
540 yaz_nfa_state *yaz_nfa_get_next(yaz_nfa *n, yaz_nfa_state *s);
542 /** \brief Dump the NFA into a file .
544 * \param F The file handle to dump into (null => stdout)
546 * \param strfunc can be used for converting the resultinfo a string.
548 * strfunc is a function like
549 * char *f( void *result);
550 * it takes the result, and converts into a printable string (which
551 * must be allocated somewhere by the caller). If the results are
552 * already printable, passing a null pointer here prints them with a %s
555 void yaz_nfa_dump(FILE *F,
557 char *(*strfunc)(void *) );
559 /** \brief Helper to dump converters
562 char *yaz_nfa_dump_converter(void *conv);