/* Copyright (C) 2006, Index Data ApS
* See the file LICENSE for details.
- * $Id: nfa.h,v 1.6 2006-05-05 14:02:27 heikki Exp $
+ * $Id: nfa.h,v 1.9 2006-07-14 13:06:37 heikki Exp $
*/
/**
#define YAZ_NFA_H
#include <yaz/yconfig.h>
+#include <stdio.h>
YAZ_BEGIN_CDECL
-/** \brief Internal character type */
+/** \name return codes and data types*/
+/* \{ */
+/** \brief Success */
+#define YAZ_NFA_SUCCESS 0
+
+/** \brief no match found */
+#define YAZ_NFA_NOMATCH 1
+
+/** \brief Need more input */
+#define YAZ_NFA_OVERRUN 2
+
+/** \brief The NFA is looping */
+#define YAZ_NFA_LOOP 3
+
+/** \brief no room in output buffer */
+#define YAZ_NFA_NOSPACE 4
+
+/** \brief tryig to set a result when one already exists*/
+#define YAZ_NFA_ALREADY 5
+
+/** \brief Attempting to set an end to a backref that has not been started */
+#define YAZ_NFA_NOSTART 6
+
+/** \brief Asking for a non-existing backref */
+#define YAZ_NFA_NOSUCHBACKREF 7
+
+/** \brief Internal error, should never happen */
+#define YAZ_NFA_INTERNAL 8
+
+
+/** \brief Internal character type. 32-bit unicode! */
typedef unsigned int yaz_nfa_char;
/** \brief Transition from one state to another */
typedef struct yaz_nfa_transition yaz_nfa_transition;
-
-/** brief Simple character range converter */
+/** \brief A converter produces some output to a buffer */
typedef struct yaz_nfa_converter yaz_nfa_converter;
+/* \} */
+/** \name Low-level interface to building the NFA */
+/* \{ */
/** \brief Initialize the NFA without any states in it
*
/** \brief Sets the result pointer to a state
*
- * Call with null to clear the pointer.
+ * \param n the NFA itself
+ * \param s the state to which the result will be added
+ * \param result the result pointer
+ *
+ * Sets the result pointer of a state. If already set, returns an error. Call
+ * with a NULL pointer to clear the result, before setting a new one.
*
- * \retval 0 ok
- * \retval 1 The state already has a result!
+ * \retval YAZ_NFA_SUCCESS ok
+ * \retval YAZ_NFA_ALREADY The state already has a result!
*/
int yaz_nfa_set_result(
- /** The NFA itsef */
yaz_nfa *n,
- /** The state to which the result is added */
yaz_nfa_state *s,
- /** The result. The NFA does not care what it is, just stores it */
void *result
);
*
* \param n the nfa
* \param s the state to add to
- * \param backref_number is the number of the back reference. 0 for clearing
+ * \param backref_number is the number of the back reference.
* \param is_start is 1 for start of the backref, 0 for end
- * \retval 0 for OK
- * \retval 1 if the backref is already set
- * \retval 2 for ending a backref that has not been started
+ *
+ * \retval YAZ_NFA_SUCCESS for OK
+ * \retval YAZ_NFA_ALREADY if the backref is already set
+ * \retval YAZ_NFA_NOSTART for ending a backref that has not been started
*
*/
/** \brief Add a sequence of transitions and states.
*
+ * \param n the nfa
+ * \param s the state to add this to. If null, adds to the initial state
+ * \param seq is a sequence of yaz_fna_chars.
+ * \param seq_len is the length of the sequence
+ * \Return the final state
+ *
* Starting from state s (or from the initial state, if s is
* null), finds as much of seq as possible and inserts the rest.
- * \Return the final state
*/
yaz_nfa_state *yaz_nfa_add_sequence( yaz_nfa *n,
yaz_nfa_state *s,
- yaz_nfa_char *seq );
+ yaz_nfa_char *seq,
+ size_t seq_len );
+/** \} */
+
+/** \name Low-level interface for mathcing the NFA. */
+/*
+ * These do the actual matching. They know nothing of
+ * the type of the result pointers
+ */
+/** \{ */
/** \brief Find the longest possible match.
*
* In case of errors, returns the best match so far,
* which the caller is free to ignore.
*
- * \retval 0 success
- * \retval 1 no match found
- * \retval 2 overrun'of input buffer
- * \retval 3 looping too far
+ * \retval YAZ_NFA_SUCCESS success
+ * \retval YAZ_NFA_NOMATCH no match found
+ * \retval YAZ_NFA_OVERRUN overrun of input buffer
+ * \retval YAZ_NFA_LOOP looping too far
*
*/
int yaz_nfa_match(yaz_nfa *n, yaz_nfa_char **inbuff, size_t *incharsleft,
void **result );
-/** yaz_nfa_match return codes */
-#define YAZ_NFA_SUCCESS 0
-#define YAZ_NFA_NOMATCH 1
-#define YAZ_NFA_OVERRUN 2
-#define YAZ_NFA_LOOP 3
-
/** \brief Get a back reference after a successfull match.
*
* \param n the nfa
* not the end of a backref. It is up to the caller to decide how
* to handle such a situation.
*
- * \retval 0 OK
- * \retval 1 no match
- * \retval 2 no such backref
+ * \retval YAZ_NFA_SUCCESS OK
+ * \retval YAZ_NFA_NOMATCH The NFA hasn't matched anything, no backref
+ * \retval YAZ_NFA_NOSUCHBACKREF no such backref
*/
int yaz_nfa_get_backref( yaz_nfa *n,
yaz_nfa_char **start,
yaz_nfa_char **end );
+/* \} */
+
+/** \name Low-level interface to the converters */
+/* These produce some output text into a buffer. There are a few
+ * kinds of converters, each producing different type of output.
+ */
+/* \{ */
+
/** \brief Create a string converter.
* \param n the nfa
* \param string the string to output
* Runs the converters in the chain, placing output into outbuff
* (and incrementing the pointer).
*
- * \retval 0 OK
- * \retval 1 no match to get backrefs from
- * \retval 2 no room in outbuf
+ * \retval YAZ_NFA_SUCCESS OK
+ * \retval YAZ_NFA_NOMATCH no match to get backrefs from
+ * \retval YAZ_NFA_NOSPACE no room in outbuf
+ * \retval YAZ_NFA_INTERNAL Should never happen
*
*/
int yaz_nfa_run_converters(
yaz_nfa_char **outbuff,
size_t *outcharsleft);
+/** \} */
+
+/** \name High-level interface to the NFA */
+/* This interface combines the NFA and converters, for ease of
+ * access. There are a few calls to build a complete system, and a call
+ * to do the actual conversion.
+ */
+/* \{ */
+
+/** \brief Add a rule that converts one string to another ('IX' -> '9')
+ *
+ * \param n The nfa itself
+ * \param from_string the string to match
+ * \param from_length length of the from_string
+ * \param to_string the string to write in the output
+ * \param to_length length of the to_string
+ *
+ * Adds a matching rule and a string converter to the NFA.
+ * Can be used for converting strings into nothing, for example,
+ * to remove markup.
+ *
+ * \retval YAZ_NFA_SUCCESS OK
+ * \retval YAZ_NFA_ALREADY Conflict with some other rule
+ *
+ */
+int yaz_nfa_add_string_rule( yaz_nfa *n,
+ yaz_nfa_char *from_string,
+ size_t from_length,
+ yaz_nfa_char *to_string,
+ size_t to_length);
+
+/** brief Just like yaz_nfa_add_string_rule, but takes the strings in ascii
+ *
+ * \param n The nfa itself
+ * \param from_string the string to match
+ * \param to_string the string to write in the output
+ *
+ * Like yaz_nfa_add_string_rule, this adds a rule to translate a string
+ * into another. The only difference is that this one takes the strings as
+ * normal char *, which means that no high-valued unicodes can be handled,
+ * and that this one uses null-terminated strings. In short, this is a
+ * simplified version mostly intended for tests and other small uses.
+ *
+ * \retval YAZ_NFA_SUCCESS OK
+ * \retval YAZ_NFA_ALREADY Conflict with some other rule
+ */
+int yaz_nfa_add_ascii_string_rule( yaz_nfa *n,
+ char *from_string,
+ char *to_string);
+
+
+/** \brief Add a rule that converts a character range
+ *
+ * \param n The nfa itself
+ * \param range_start Where the matching range starts
+ * \param range_end Where the matching range ends
+ * \param output_range_start Where the resulting range starts
+ *
+ *
+ * Adds a character range rule to the NFA. The range to be converted
+ * is defined by the range_start and range_end parameters. The output
+ * range starts at output_range_start, and is automatically as long
+ * as the input range.
+ *
+ * Useful for alphabet normalizing [a-z] -> [A-Z]
+ *
+ * \retval YAZ_NFA_SUCCESS OK
+ * \retval YAZ_NFA_ALREADY Conflict with some other rule
+ */
+int yaz_nfa_add_char_range_rule (yaz_nfa *n,
+ yaz_nfa_char range_start,
+ yaz_nfa_char range_end,
+ yaz_nfa_char output_range_start);
+
+/** \brief Add a rule that converts a character range to a string
+ *
+ * \param n The nfa itself
+ * \param range_start Where the matching range starts
+ * \param range_end Where the matching range ends
+ * \param to_string the string to write in the output
+ * \param to_length length of the to_string
+ *
+ * \retval YAZ_NFA_SUCCESS OK
+ * \retval YAZ_NFA_ALREADY Conflict with some other rule
+ *
+ * Adds a character range match rule, and a string converter.
+ *
+ * Useful in converting a range of special characters into (short?)
+ * strings of whitespace, or even to nothing at all.
+ */
+int yaz_nfa_add_char_string_rule (yaz_nfa *n,
+ yaz_nfa_char range_start,
+ yaz_nfa_char range_end,
+ yaz_nfa_char* to_string,
+ size_t to_length);
+
+/** \brief Converts one 'slice' that is, the best matching rule.
+ *
+ * \param n the nfa itself
+ * \param inbuff buffer of input data. Will be incremented when match
+ * \param incharsleft max number of inchars to use from inbuff. decrements.
+ * \param outbuff buffer for output data. Will be incremented when match
+ * \param outcharsleft max number of chars to write to outbuff.
+ *
+ * \retval YAZ_NFA_SUCCESS OK
+ * \retval YAZ_NFA_OVERRUN No more input data, some pattern could match
+ * \retval YAZ_NFA_NOSPACE No room in the putput buffer
+ * \retval YAZ_NFA_NOSUCHBACKREF NFA refers to a non-existing backref
+ *
+ * Finds the best match at the beginning of inbuf, and fires its converter(s)
+ * to produce output in outbuff. Increments both inbuf and outbuf pointers and
+ * decrements the *charsleft values, so all is ready for calling again, until
+ * the buffer is exhausted. That loop is left to the caller, so he can load
+ * more data in the buffer in good time.
+ *
+ * If no match is found, converts one character into itself. If the matcher
+ * returns any sort of error, leaves the pointers where they were.
+ */
+int yaz_nfa_convert_slice (yaz_nfa *n,
+ yaz_nfa_char **inbuff,
+ size_t *incharsleft,
+ yaz_nfa_char **outbuff,
+ size_t *outcharsleft);
+
+
+/* \} */
+
+/** \name Debug routines */
+/* These provide a method for traversing all the states defined
+ * in the NFA, for example to release memory allocated in the results,
+ * and a simple debug routine to dump the NFA */
+/* \{ */
+
/** \brief Get the first state of the NFA.
*
* already printable, passing a null pointer here prints them with a %s
*
*/
-void yaz_nfa_dump(FILE *F, yaz_nfa *n, char *(*strfunc)(void *) );
+void yaz_nfa_dump(FILE *F,
+ yaz_nfa *n,
+ char *(*strfunc)(void *) );
+/** \brief Helper to dump converters
+ *
+ */
+char *yaz_nfa_dump_converter(void *conv);
+/* \} */