1 /* $Id: icu_chain_test.c,v 1.7 2007-08-30 08:45:08 marc Exp $
2 Copyright (c) 2006-2007, Index Data.
4 This file is part of Pazpar2.
6 Pazpar2 is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with Pazpar2; see the file LICENSE. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
31 //#include <yaz/xmalloc.h>
32 #include <yaz/options.h>
37 #include <unicode/ucnv.h>
38 #include <unicode/ustring.h>
/* Program-wide settings.  Only part of the declaration is visible in this
 * listing; elsewhere in this file the members conffile, print, xmloutput,
 * infile, outfile and chain are accessed through the file-scope `config`
 * object or through a `struct config_t *` parameter. */
42 /* command line and config parameters */
43 static struct config_t {
/* ICU chain handle: assigned from icu_chain_xml_config() and released with
 * icu_chain_destroy() in process_text_file(). */
47 struct icu_chain * chain;
/* Write the usage text for icu_chain_test to stderr: a "Calling error"
 * banner, the accepted options, example invocations and a sample ICU chain
 * XML configuration.
 * p_config is not referenced in any of the lines visible in this listing. */
54 void print_option_error(const struct config_t *p_config)
56 fprintf(stderr, "Calling error, valid options are :\n");
57 fprintf(stderr, "icu_chain_test\n"
58 " [-c (path/to/config/file.xml)]\n"
59 " [-p (a|c|l|t)] print ICU info \n"
63 "cat hugetextfile.txt | ./icu_chain_test -c config.xml \n"
64 "./icu_chain_test -p c\n"
65 "./icu_chain_test -p l -x\n"
66 "./icu_chain_test -p t -x\n"
68 "Example ICU chain XML configuration file:\n"
69 "<icu_chain id=\"en:word\" locale=\"en\">\n"
70 " <normalize rule=\"[:Control:] Any-Remove\"/>\n"
71 " <tokenize rule=\"l\"/>\n"
72 " <normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
74 " <casemap rule=\"l\"/>\n"
/* Fill *p_config from the command line.
 *
 * Defaults: empty conffile and print strings, XML output off, input from
 * stdin and output to stdout.  Arguments are parsed with options() using
 * the spec "c:p:x"; the option argument is copied into conffile or print,
 * and xmloutput is set to 1 (presumably for -c, -p and -x respectively -
 * the case labels are not visible in this listing).  print_option_error()
 * is called from the option loop and again when both conffile and print
 * are still empty afterwards (further conditions may sit on lines not
 * visible here).
 *
 * The option arguments are copied with snprintf() bounded by the size of
 * the destination, because an unbounded strcpy() of a user-supplied
 * argument can overrun the buffer.  Over-long arguments are truncated.
 * NOTE(review): sizeof assumes conffile and print are char arrays inside
 * struct config_t - their declarations are not visible here; confirm. */
82 void read_params(int argc, char **argv, struct config_t *p_config)
87 /* set default parameters */
88 p_config->conffile[0] = 0;
89 p_config->print[0] = 0;
90 p_config->xmloutput = 0;
92 p_config->infile = stdin;
93 p_config->outfile = stdout;
95 /* set up command line parameters */
97 while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
102 snprintf(p_config->conffile, sizeof(p_config->conffile), "%s", arg);
105 snprintf(p_config->print, sizeof(p_config->print), "%s", arg);
108 p_config->xmloutput = 1;
111 print_option_error(p_config);
115 if ((!strlen(p_config->conffile)
116 && !strlen(p_config->print))
120 print_option_error(p_config);
124 /* UConverter *conv; */
125 /* conv = ucnv_open("utf-8", &status); */
126 /* assert(U_SUCCESS(status)); */
129 /* = ucnv_toUChars(conv, ustr16, 1024, */
130 /* (const char *) *xstr8, strlen((const char *) *xstr8), */
135 /* ucnv_fromUChars(conv, */
136 /* (char *) *xstr8, strlen((const char *) *xstr8), */
137 /* ustr16, *ustr16_len, */
139 /* ucnv_close(conv); */
/* List the converters reported by ICU (ucnv_countAvailable() and
 * ucnv_getAvailableName()) on config.outfile.
 *
 * With p_config->xmloutput set, a <converters> element carrying the count
 * and the default converter name is written, with one <converter id="..."/>
 * line per converter.  The other fprintf calls produce a plain-text
 * summary followed by the names separated by spaces; they presumably sit
 * in else branches whose lines are not visible in this listing.
 *
 * NOTE(review): the flag is read through p_config while the stream is taken
 * from the file-scope `config` object; the two agree only as long as
 * callers pass &config. */
142 static void print_icu_converters(const struct config_t *p_config)
147 count = ucnv_countAvailable();
148 if (p_config->xmloutput)
149 fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
150 count, ucnv_getDefaultName());
152 fprintf(config.outfile, "Available ICU converters: %d\n", count);
153 fprintf(config.outfile, "Default ICU Converter is: '%s'\n",
154 ucnv_getDefaultName());
157 for(i=0;i<count;i++){
158 if (p_config->xmloutput)
159 fprintf(config.outfile, "<converter id=\"%s\"/>\n",
160 ucnv_getAvailableName(i));
162 fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
165 if (p_config->xmloutput)
166 fprintf(config.outfile, "</converters>\n");
168 fprintf(config.outfile, "\n");
/* List the transliterator IDs reported by ICU on config.outfile.
 *
 * The number of IDs comes from utrans_countAvailableIDs(); each ID is
 * fetched into buf with utrans_getAvailableID() using a capacity of
 * buf_cap (128).  With p_config->xmloutput set, the IDs are wrapped in a
 * <transliterators> element with one <transliterator id="..."/> line each.
 * The remaining output is plain text: a count line, the IDs, and a block
 * of help text on UnicodeSet patterns and example transform rules
 * (presumably emitted only in the non-XML branch - the else lines are not
 * visible in this listing).
 *
 * NOTE(review): the value returned by utrans_getAvailableID() is not
 * checked in the visible code.  The declaration of buf is not visible. */
171 static void print_icu_transliterators(const struct config_t *p_config)
176 count = utrans_countAvailableIDs();
178 int32_t buf_cap = 128;
181 if (p_config->xmloutput)
182 fprintf(config.outfile, "<transliterators count=\"%d\">\n", count);
184 fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
186 for(i = 0; i <count; i++)
188 utrans_getAvailableID(i, buf, buf_cap);
189 if (p_config->xmloutput)
190 fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
192 fprintf(config.outfile, " %s", buf);
195 if (p_config->xmloutput){
196 fprintf(config.outfile, "</transliterators>\n");
200 fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
201 " Pattern Description\n"
202 " Ranges [a-z] The lower case letters a through z\n"
203 " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n"
204 " String [abc{def}] chars a, b and c, and string 'def'\n"
205 " Categories [\\p{Letter}] Perl General Category 'Letter'.\n"
206 " Categories [:Letter:] Posix General Category 'Letter'.\n"
208 " Combination Example\n"
209 " Union [[:Greek:] [:letter:]]\n"
210 " Intersection [[:Greek:] & [:letter:]]\n"
211 " Set Complement [[:Greek:] - [:letter:]]\n"
212 " Complement [^[:Greek:] [:letter:]]\n"
214 "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
217 " [:Punctuation:] Any-Remove\n"
218 " [:Cased-Letter:] Any-Upper\n"
219 " [:Control:] Any-Remove\n"
220 " [:Decimal_Number:] Any-Remove\n"
221 " [:Final_Punctuation:] Any-Remove\n"
222 " [:Georgian:] Any-Upper\n"
223 " [:Katakana:] Any-Remove\n"
224 " [:Arabic:] Any-Remove\n"
225 " [:Punctuation:] Remove\n"
226 " [[:Punctuation:]-[.,]] Remove\n"
227 " [:Line_Separator:] Any-Remove\n"
228 " [:Math_Symbol:] Any-Remove\n"
229 " Lower; [:^Letter:] Remove (word tokenization)\n"
230 " [:^Number:] Remove (numeric tokenization)\n"
231 " [:^Katagana:] Remove (remove everything except Katagana)\n"
232 " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
233 " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents from characters)\n"
234 " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
235 " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
237 "see http://icu.sourceforge.net/userguide/Transform.html\n"
238 " http://www.unicode.org/Public/UNIDATA/UCD.html\n"
239 " http://icu.sourceforge.net/userguide/Transform.html\n"
240 " http://icu.sourceforge.net/userguide/TransformRule.html\n"
244 fprintf(config.outfile, "\n\n");
/* List the locales reported by ICU on config.outfile.
 *
 * The locale count comes from uloc_countAvailable().  For locale index i
 * (the loop header is not visible in this listing) the code asks ICU for
 * the display keyword, language, script, country, variant and name in
 * English ("en"), plus the display name in the locale's own language, and
 * converts each result to UTF-8 with u_strToUTF8() using a destination
 * capacity of 128.
 *
 * With p_config->xmloutput set, the output is a <locales> element (count,
 * default locale and ucol_countAvailable() as attributes) containing one
 * <locale> element per locale; empty strings are left out of the attribute
 * list and the local name is repeated as element text.  Otherwise the
 * locale IDs are printed separated by spaces.
 *
 * All ICU calls visible here share one UErrorCode; if it ends up in a
 * failure state, the code and its u_errorName() are reported on stderr.
 *
 * NOTE(review): the branch `else if (1 == p_config->xmloutput)` follows
 * `if (p_config->xmloutput)` and therefore looks unreachable - the
 * "id | name | localname" text format it produces is apparently never
 * printed.  Confirm the intended condition.
 * NOTE(review): attribute values are written with plain "%s" and no XML
 * escaping in the visible code. */
249 static void print_icu_xml_locales(const struct config_t *p_config)
253 UErrorCode status = U_ZERO_ERROR;
256 int32_t keyword_len = 0;
257 char keyword_str[128];
258 int32_t keyword_str_len = 0;
261 int32_t language_len = 0;
263 int32_t lang_str_len = 0;
266 int32_t script_len = 0;
267 char script_str[128];
268 int32_t script_str_len = 0;
271 int32_t location_len = 0;
272 char location_str[128];
273 int32_t location_str_len = 0;
276 int32_t variant_len = 0;
277 char variant_str[128];
278 int32_t variant_str_len = 0;
281 int32_t name_len = 0;
283 int32_t name_str_len = 0;
286 int32_t localname_len = 0;
287 char localname_str[128];
288 int32_t localname_str_len = 0;
290 count = uloc_countAvailable() ;
292 if (p_config->xmloutput){
294 fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
295 count, uloc_getDefault(), ucol_countAvailable());
302 = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
306 u_strToUTF8(keyword_str, 128, &keyword_str_len,
307 keyword, keyword_len,
312 = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
316 u_strToUTF8(lang_str, 128, &lang_str_len,
317 language, language_len,
322 = uloc_getDisplayScript(uloc_getAvailable(i), "en",
326 u_strToUTF8(script_str, 128, &script_str_len,
331 = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
335 u_strToUTF8(location_str, 128, &location_str_len,
336 location, location_len,
340 = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
344 u_strToUTF8(variant_str, 128, &variant_str_len,
345 variant, variant_len,
349 = uloc_getDisplayName(uloc_getAvailable(i), "en",
353 u_strToUTF8(name_str, 128, &name_str_len,
358 = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
362 u_strToUTF8(localname_str, 128, &localname_str_len,
363 localname, localname_len,
367 if (p_config->xmloutput){
368 fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
369 /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
370 /* if (strlen(keyword_str)) */
371 /* fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
372 /* if (ucol_getAvailable(i)) */
373 /* fprintf(config.outfile, " collation=\"1\""); */
374 if (strlen(lang_str))
375 fprintf(config.outfile, " language=\"%s\"", lang_str);
376 if (strlen(script_str))
377 fprintf(config.outfile, " script=\"%s\"", script_str);
378 if (strlen(location_str))
379 fprintf(config.outfile, " location=\"%s\"", location_str);
380 if (strlen(variant_str))
381 fprintf(config.outfile, " variant=\"%s\"", variant_str);
382 if (strlen(name_str))
383 fprintf(config.outfile, " name=\"%s\"", name_str);
384 if (strlen(localname_str))
385 fprintf(config.outfile, " localname=\"%s\"", localname_str);
386 fprintf(config.outfile, ">");
387 if (strlen(localname_str))
388 fprintf(config.outfile, "%s", localname_str);
389 fprintf(config.outfile, "</locale>\n");
391 else if (1 == p_config->xmloutput){
392 fprintf(config.outfile, "%s", uloc_getAvailable(i));
393 fprintf(config.outfile, " | ");
394 if (strlen(name_str))
395 fprintf(config.outfile, "%s", name_str);
396 fprintf(config.outfile, " | ");
397 if (strlen(localname_str))
398 fprintf(config.outfile, "%s", localname_str);
399 fprintf(config.outfile, "\n");
402 fprintf(config.outfile, "%s ", uloc_getAvailable(i));
404 if (p_config->xmloutput)
405 fprintf(config.outfile, "</locales>\n");
407 fprintf(config.outfile, "\n");
409 if(U_FAILURE(status)) {
410 fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
/* Print the ICU information selected by the first character of
 * config.print: 'c' lists converters, 'l' lists locales and 't' lists
 * transliterators.  The three calls that follow the chain print all three
 * listings (presumably the final else branch, whose line is not visible in
 * this listing).
 *
 * With p_config->xmloutput set, the output starts with an XML declaration
 * and ends with a closing </icu> tag.
 *
 * NOTE(review): the selector and the argument passed on are taken from the
 * file-scope `config` object rather than from p_config. */
416 static void print_info(const struct config_t *p_config)
418 if (p_config->xmloutput)
419 fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
422 if ('c' == config.print[0])
423 print_icu_converters(&config);
424 else if ('l' == config.print[0])
425 print_icu_xml_locales(&config);
426 else if ('t' == config.print[0])
427 print_icu_transliterators(&config);
429 print_icu_converters(&config);
430 print_icu_xml_locales(&config);
431 print_icu_transliterators(&config);
434 if (p_config->xmloutput)
435 fprintf(config.outfile, "</icu>\n");
/* Run the text read from config.infile through the ICU chain described by
 * the XML file config.conffile and print the resulting tokens.
 *
 * The configuration is parsed with xmlParseFile() and its root element is
 * handed to icu_chain_xml_config(), which yields config.chain.  Failures of
 * either step are reported with printf().  Input is then read line by line
 * with fgets(); each line is assigned to the chain with
 * icu_chain_assign_cstr() and tokens are pulled with
 * icu_chain_next_token() for as long as it returns non-zero.  For every
 * token the normalized form (icu_chain_get_norm) and the display form
 * (icu_chain_get_display) are written to config.outfile, either as a
 * <token .../> element when p_config->xmloutput is set or as a plain text
 * line.  The chain is released with icu_chain_destroy() at the end.
 *
 * Fix: the XML token format used to read "<token id=\%lu\" ...", where
 * "\%" is not a valid escape sequence and the opening quote of the id
 * attribute was missing, so the emitted element was not well-formed.  It
 * now reads id=\"%lu\". */
442 static void process_text_file(const struct config_t *p_config)
447 xmlDoc *doc = xmlParseFile(config.conffile);
448 xmlNode *xml_node = xmlDocGetRootElement(doc);
450 long unsigned int token_count = 0;
451 long unsigned int line_count = 0;
453 UErrorCode status = U_ZERO_ERROR;
457 printf("Could not parse XML config file '%s' \n",
463 config.chain = icu_chain_xml_config(xml_node, &status);
465 if (config.chain && U_SUCCESS(status))
468 printf("Could not set up ICU chain from config file '%s' \n",
473 if (p_config->xmloutput)
474 fprintf(config.outfile,
475 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
479 // read input lines for processing
480 while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
482 success = icu_chain_assign_cstr(config.chain, line, &status);
485 while (success && icu_chain_next_token(config.chain, &status)){
486 if (U_FAILURE(status))
490 if (p_config->xmloutput)
491 fprintf(config.outfile,
492 "<token id=\"%lu\" line=\"%lu\""
493 " norm=\"%s\" display=\"%s\"/>\n",
496 icu_chain_get_norm(config.chain),
497 icu_chain_get_display(config.chain));
499 fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
502 icu_chain_get_norm(config.chain),
503 icu_chain_get_display(config.chain));
509 if (p_config->xmloutput)
510 fprintf(config.outfile,
514 icu_chain_destroy(config.chain);
/* Program entry point: read_params() fills the file-scope `config` from the
 * command line, after which process_text_file() runs when a configuration
 * file name is present.  A second test checks config.print; the statement
 * it guards is not visible in this listing.
 *
 * The trailing printf() tells the user that ICU is not available; it is
 * presumably compiled only when ICU support is missing - the preprocessor
 * lines that would show this are not visible here.
 *
 * NOTE(review): if conffile and print are char arrays, the `config.conffile
 * &&` and `config.print &&` operands are always true and only the strlen()
 * checks have an effect - confirm against the struct declaration. */
523 int main(int argc, char **argv)
528 read_params(argc, argv, &config);
530 if (config.conffile && strlen(config.conffile))
531 process_text_file(&config);
533 if (config.print && strlen(config.print))
538 printf("ICU not available on your system.\n"
539 "Please install libicu36-dev and icu-doc or similar, "
540 "re-configure and re-compile\n");
552 * indent-tabs-mode: nil
554 * vim: shiftwidth=4 tabstop=8 expandtab