X-Git-Url: http://jsfdemo.indexdata.com/?a=blobdiff_plain;ds=sidebyside;f=src%2Ficu_chain_test.c;h=873551c948bd3f3edd3c5cdfbacae0f0e90abe2c;hb=be895e49ab7887f5423529b6f11c377218f61c1a;hp=6c630ae7154bc1b4214a71728d7199ed18cfb6e0;hpb=d6b6f45fcc03e7d59583fac94a21599126faf0b6;p=pazpar2-moved-to-github.git diff --git a/src/icu_chain_test.c b/src/icu_chain_test.c index 6c630ae..873551c 100644 --- a/src/icu_chain_test.c +++ b/src/icu_chain_test.c @@ -1,12 +1,26 @@ -/** - gcc -I/usr/include/libxml2 -lxml2 -o icu-xml-convert icu-xml-convert.c - */ +/* This file is part of Pazpar2. + Copyright (C) 2006-2008 Index Data + +Pazpar2 is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ #if HAVE_CONFIG_H -#include "cconfig.h" +#include #endif -#define _GNU_SOURCE #include #include @@ -37,63 +51,71 @@ static struct config_t { void print_option_error(const struct config_t *p_config) { - fprintf(stderr, "Calling error, valid options are :\n"); - fprintf(stderr, "icu_chain_test\n" - " [-c (path/to/config/file.xml)]\n" - " [-p (a|c|l|t)] print ICU info \n" - " [-x] XML output\n" - "\n" - "Examples:\n" - "cat hugetextfile.txt | ./icu_chain_test -c config.xml \n" - "./icu_chain_test -p c\n" - "./icu_chain_test -p l -x\n" - "./icu_chain_test -p t -x\n" + fprintf(stderr, "Calling error, valid options are :\n"); + fprintf(stderr, "icu_chain_test\n" + " [-c (path/to/config/file.xml)]\n" + " [-p (a|c|l|t)] print ICU info \n" + " [-x] XML output\n" + "\n" + "Examples:\n" + "cat hugetextfile.txt | ./icu_chain_test -c config.xml \n" + "./icu_chain_test -p c\n" + "./icu_chain_test -p l -x\n" + "./icu_chain_test -p t -x\n" + "\n" + "Example ICU chain XML configuration file:\n" + "\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "\n" ); - exit(1); + exit(1); } -void read_params(int argc, char **argv, struct config_t *p_config){ - char *arg; - int ret; - - /* set default parameters */ - p_config->conffile[0] = 0; - p_config->print[0] = 0; - p_config->xmloutput = 0; - p_config->chain = 0; - p_config->infile = stdin; - p_config->outfile = stdout; - - /* set up command line parameters */ - - while ((ret = options("c:p:x", argv, argc, &arg)) != -2) +void read_params(int argc, char **argv, struct config_t *p_config) +{ + char *arg; + int ret; + + /* set default parameters */ + p_config->conffile[0] = 0; + p_config->print[0] = 0; + p_config->xmloutput = 0; + p_config->chain = 0; + p_config->infile = stdin; + p_config->outfile = stdout; + + /* set up command line parameters */ + + while ((ret = options("c:p:x", argv, argc, &arg)) != -2) { - switch (ret) + switch (ret) { case 'c': - strcpy(p_config->conffile, arg); - break; + strcpy(p_config->conffile, arg); + break; case 'p': - strcpy(p_config->print, arg); - break; + strcpy(p_config->print, arg); + break; case 'x': p_config->xmloutput = 1; - break; + break; default: - print_option_error(p_config); + print_option_error(p_config); } } - //p_config->infile = fopen("/etc/passwd", "r"); - - - - if ((!strlen(p_config->conffile) - && !strlen(p_config->print)) - || !config.infile - || !config.outfile) - - print_option_error(p_config); + if ((!strlen(p_config->conffile) + && !strlen(p_config->print)) + || !config.infile + || !config.outfile) + + print_option_error(p_config); }; @@ -123,15 +145,17 @@ static void print_icu_converters(const struct config_t *p_config) count = ucnv_countAvailable(); if (p_config->xmloutput) fprintf(config.outfile, "\n", - count, ucnv_getDefaultName()); + count, ucnv_getDefaultName()); else { fprintf(config.outfile, "Available ICU converters: %d\n", count); - fprintf(config.outfile, "Default ICU Converter is: '%s'\n", ucnv_getDefaultName()); + fprintf(config.outfile, "Default ICU Converter is: '%s'\n", + ucnv_getDefaultName()); } for(i=0;ixmloutput) - fprintf(config.outfile, "\n", ucnv_getAvailableName(i)); + fprintf(config.outfile, "\n", + ucnv_getAvailableName(i)); else fprintf(config.outfile, "%s ", ucnv_getAvailableName(i)); } @@ -144,254 +168,254 @@ static void print_icu_converters(const struct config_t *p_config) static void print_icu_transliterators(const struct config_t *p_config) { - int32_t count; - int32_t i; - - count = utrans_countAvailableIDs(); - - int32_t buf_cap = 128; - char buf[buf_cap]; - - if (p_config->xmloutput) - fprintf(config.outfile, "\n", count); - else - fprintf(config.outfile, "Available ICU transliterators: %d\n", count); - - for(i=0;ixmloutput) + fprintf(config.outfile, "\n", count); + else + fprintf(config.outfile, "Available ICU transliterators: %d\n", count); + + for(i = 0; i xmloutput) - fprintf(config.outfile, "\n", buf); - else - fprintf(config.outfile, " %s", buf); + utrans_getAvailableID(i, buf, buf_cap); + if (p_config->xmloutput) + fprintf(config.outfile, "\n", buf); + else + fprintf(config.outfile, " %s", buf); } - - if (p_config->xmloutput){ - fprintf(config.outfile, "\n"); - } - else + + if (p_config->xmloutput){ + fprintf(config.outfile, "\n"); + } + else { - fprintf(config.outfile, "\n\nUnicode Set Patterns:\n" - " Pattern Description\n" - " Ranges [a-z] The lower case letters a through z\n" - " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n" - " String [abc{def}] chars a, b and c, and string 'def'\n" - " Categories [\\p{Letter}] Perl General Category 'Letter'.\n" - " Categories [:Letter:] Posix General Category 'Letter'.\n" - "\n" - " Combination Example\n" - " Union [[:Greek:] [:letter:]]\n" - " Intersection [[:Greek:] & [:letter:]]\n" - " Set Complement [[:Greek:] - [:letter:]]\n" - " Complement [^[:Greek:] [:letter:]]\n" - "\n" + fprintf(config.outfile, "\n\nUnicode Set Patterns:\n" + " Pattern Description\n" + " Ranges [a-z] The lower case letters a through z\n" + " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n" + " String [abc{def}] chars a, b and c, and string 'def'\n" + " Categories [\\p{Letter}] Perl General Category 'Letter'.\n" + " Categories [:Letter:] Posix General Category 'Letter'.\n" + "\n" + " Combination Example\n" + " Union [[:Greek:] [:letter:]]\n" + " Intersection [[:Greek:] & [:letter:]]\n" + " Set Complement [[:Greek:] - [:letter:]]\n" + " Complement [^[:Greek:] [:letter:]]\n" + "\n" "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n" - "\n" - "Examples:\n" - " [:Punctuation:] Any-Remove\n" - " [:Cased-Letter:] Any-Upper\n" - " [:Control:] Any-Remove\n" - " [:Decimal_Number:] Any-Remove\n" - " [:Final_Punctuation:] Any-Remove\n" - " [:Georgian:] Any-Upper\n" - " [:Katakana:] Any-Remove\n" - " [:Arabic:] Any-Remove\n" - " [:Punctuation:] Remove\n" - " [[:Punctuation:]-[.,]] Remove\n" - " [:Line_Separator:] Any-Remove\n" - " [:Math_Symbol:] Any-Remove\n" - " Lower; [:^Letter:] Remove (word tokenization)\n" - " [:^Number:] Remove (numeric tokenization)\n" - " [:^Katagana:] Remove (remove everything except Katagana)\n" - " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n" - " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents from characters)\n" - " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n" - " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n" - "\n" - "see http://icu.sourceforge.net/userguide/Transform.html\n" - " http://www.unicode.org/Public/UNIDATA/UCD.html\n" - " http://icu.sourceforge.net/userguide/Transform.html\n" - " http://icu.sourceforge.net/userguide/TransformRule.html\n" - ); - - - fprintf(config.outfile, "\n\n"); - + "\n" + "Examples:\n" + " [:Punctuation:] Any-Remove\n" + " [:Cased-Letter:] Any-Upper\n" + " [:Control:] Any-Remove\n" + " [:Decimal_Number:] Any-Remove\n" + " [:Final_Punctuation:] Any-Remove\n" + " [:Georgian:] Any-Upper\n" + " [:Katakana:] Any-Remove\n" + " [:Arabic:] Any-Remove\n" + " [:Punctuation:] Remove\n" + " [[:Punctuation:]-[.,]] Remove\n" + " [:Line_Separator:] Any-Remove\n" + " [:Math_Symbol:] Any-Remove\n" + " Lower; [:^Letter:] Remove (word tokenization)\n" + " [:^Number:] Remove (numeric tokenization)\n" + " [:^Katagana:] Remove (remove everything except Katagana)\n" + " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n" + " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents from characters)\n" + " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n" + " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n" + "\n" + "see http://icu.sourceforge.net/userguide/Transform.html\n" + " http://www.unicode.org/Public/UNIDATA/UCD.html\n" + " http://icu.sourceforge.net/userguide/Transform.html\n" + " http://icu.sourceforge.net/userguide/TransformRule.html\n" + ); + + + fprintf(config.outfile, "\n\n"); + } } static void print_icu_xml_locales(const struct config_t *p_config) { - int32_t count; - int32_t i; - UErrorCode status = U_ZERO_ERROR; - - UChar keyword[64]; - int32_t keyword_len = 0; - char keyword_str[128]; - int32_t keyword_str_len = 0; - - UChar language[64]; - int32_t language_len = 0; - char lang_str[128]; - int32_t lang_str_len = 0; - - UChar script[64]; - int32_t script_len = 0; - char script_str[128]; - int32_t script_str_len = 0; - - UChar location[64]; - int32_t location_len = 0; - char location_str[128]; - int32_t location_str_len = 0; - - UChar variant[64]; - int32_t variant_len = 0; - char variant_str[128]; - int32_t variant_str_len = 0; - - UChar name[64]; - int32_t name_len = 0; - char name_str[128]; - int32_t name_str_len = 0; - - UChar localname[64]; - int32_t localname_len = 0; - char localname_str[128]; - int32_t localname_str_len = 0; - - count = uloc_countAvailable() ; - - if (p_config->xmloutput){ + int32_t count; + int32_t i; + UErrorCode status = U_ZERO_ERROR; + + UChar keyword[64]; + int32_t keyword_len = 0; + char keyword_str[128]; + int32_t keyword_str_len = 0; + + UChar language[64]; + int32_t language_len = 0; + char lang_str[128]; + int32_t lang_str_len = 0; + + UChar script[64]; + int32_t script_len = 0; + char script_str[128]; + int32_t script_str_len = 0; + + UChar location[64]; + int32_t location_len = 0; + char location_str[128]; + int32_t location_str_len = 0; + + UChar variant[64]; + int32_t variant_len = 0; + char variant_str[128]; + int32_t variant_str_len = 0; + + UChar name[64]; + int32_t name_len = 0; + char name_str[128]; + int32_t name_str_len = 0; + + UChar localname[64]; + int32_t localname_len = 0; + char localname_str[128]; + int32_t localname_str_len = 0; + + count = uloc_countAvailable() ; + + if (p_config->xmloutput){ - fprintf(config.outfile, "\n", - count, uloc_getDefault(), ucol_countAvailable()); - } + fprintf(config.outfile, "\n", + count, uloc_getDefault(), ucol_countAvailable()); + } - for(i=0;ixmloutput){ - fprintf(config.outfile, ""); - if (strlen(localname_str)) - fprintf(config.outfile, "%s", localname_str); - fprintf(config.outfile, "\n"); - } - else if (1 == p_config->xmloutput){ - fprintf(config.outfile, "%s", uloc_getAvailable(i)); - fprintf(config.outfile, " | "); - if (strlen(name_str)) - fprintf(config.outfile, "%s", name_str); - fprintf(config.outfile, " | "); - if (strlen(localname_str)) - fprintf(config.outfile, "%s", localname_str); - fprintf(config.outfile, "\n"); + language_len + = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", + language, 64, + &status); + + u_strToUTF8(lang_str, 128, &lang_str_len, + language, language_len, + &status); + + + script_len + = uloc_getDisplayScript(uloc_getAvailable(i), "en", + script, 64, + &status); + + u_strToUTF8(script_str, 128, &script_str_len, + script, script_len, + &status); + + location_len + = uloc_getDisplayCountry(uloc_getAvailable(i), "en", + location, 64, + &status); + + u_strToUTF8(location_str, 128, &location_str_len, + location, location_len, + &status); + + variant_len + = uloc_getDisplayVariant(uloc_getAvailable(i), "en", + variant, 64, + &status); + + u_strToUTF8(variant_str, 128, &variant_str_len, + variant, variant_len, + &status); + + name_len + = uloc_getDisplayName(uloc_getAvailable(i), "en", + name, 64, + &status); + + u_strToUTF8(name_str, 128, &name_str_len, + name, name_len, + &status); + + localname_len + = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), + localname, 64, + &status); + + u_strToUTF8(localname_str, 128, &localname_str_len, + localname, localname_len, + &status); + + + if (p_config->xmloutput){ + fprintf(config.outfile, ""); + if (strlen(localname_str)) + fprintf(config.outfile, "%s", localname_str); + fprintf(config.outfile, "\n"); + } + else if (1 == p_config->xmloutput){ + fprintf(config.outfile, "%s", uloc_getAvailable(i)); + fprintf(config.outfile, " | "); + if (strlen(name_str)) + fprintf(config.outfile, "%s", name_str); + fprintf(config.outfile, " | "); + if (strlen(localname_str)) + fprintf(config.outfile, "%s", localname_str); + fprintf(config.outfile, "\n"); + } + else + fprintf(config.outfile, "%s ", uloc_getAvailable(i)); } + if (p_config->xmloutput) + fprintf(config.outfile, "\n"); else - fprintf(config.outfile, "%s ", uloc_getAvailable(i)); - } - if (p_config->xmloutput) - fprintf(config.outfile, "\n"); - else - fprintf(config.outfile, "\n"); - - if(U_FAILURE(status)) { - fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status)); - exit(status); - } + fprintf(config.outfile, "\n"); + + if(U_FAILURE(status)) { + fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status)); + exit(status); + } } static void print_info(const struct config_t *p_config) { - if (p_config->xmloutput) - fprintf(config.outfile, "\n" - "\n"); + if (p_config->xmloutput) + fprintf(config.outfile, "\n" + "\n"); if ('c' == config.print[0]) print_icu_converters(&config); @@ -408,16 +432,15 @@ static void print_info(const struct config_t *p_config) if (p_config->xmloutput) fprintf(config.outfile, "\n"); - exit(0); + exit(0); }; static void process_text_file(const struct config_t *p_config) { - char * line = 0; - size_t line_cap = 0; - ssize_t line_len; + char *line = 0; + char linebuf[1024]; xmlDoc *doc = xmlParseFile(config.conffile); xmlNode *xml_node = xmlDocGetRootElement(doc); @@ -428,12 +451,23 @@ static void process_text_file(const struct config_t *p_config) UErrorCode status = U_ZERO_ERROR; int success = 0; + if (! xml_node) { + printf("Could not parse XML config file '%s' \n", + config.conffile); + exit (1); + } + config.chain = icu_chain_xml_config(xml_node, &status); if (config.chain && U_SUCCESS(status)) success = 1; - + else { + printf("Could not set up ICU chain from config file '%s' \n", + config.conffile); + exit (1); + } + if (p_config->xmloutput) fprintf(config.outfile, "\n" @@ -441,7 +475,8 @@ static void process_text_file(const struct config_t *p_config) "\n"); // read input lines for processing - while ((line_len = getline(&line, &line_cap, config.infile)) != -1) { + while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile))) + { success = icu_chain_assign_cstr(config.chain, line, &status); line_count++; @@ -469,7 +504,7 @@ static void process_text_file(const struct config_t *p_config) } - if (p_config->xmloutput) + if (p_config->xmloutput) fprintf(config.outfile, "\n" "\n");