2 gcc -I/usr/include/libxml2 -lxml2 -o icu-xml-convert icu-xml-convert.c
12 //#include <yaz/xmalloc.h>
13 #include <yaz/options.h>
18 #include <unicode/ucnv.h>
19 #include <unicode/ustring.h>
23 /* command line and config parameters */
/* Program configuration record; only part of its declaration is visible here. */
24 static struct config_t {
/* ICU chain handle; process_text_file() stores the result of icu_chain_xml_config() here. */
28 struct icu_chain * chain;
/*
 * Write the usage text (accepted options and example invocations) to stderr.
 * p_config is not referenced in the lines visible here.
 */
35 void print_option_error(const struct config_t *p_config)
37 fprintf(stderr, "Calling error, valid options are :\n");
38 fprintf(stderr, "icu_chain_test\n"
39 " [-c (path/to/config/file.xml)]\n"
40 " [-p (a|c|l|t)] print ICU info \n"
44 "cat hugetextfile.txt | ./icu_chain_test -c config.xml \n"
45 "./icu_chain_test -p c\n"
46 "./icu_chain_test -p l -x\n"
47 "./icu_chain_test -p t -x\n"
/*
 * Fill the configuration pointed to by p_config from the command line.
 * Defaults are applied first (empty conffile and print strings, xmloutput
 * off, stdin as input, stdout as output); options are then read with
 * options() using the spec "c:p:x" until it returns -2.
 */
52 void read_params(int argc, char **argv, struct config_t *p_config){
56 /* set default parameters */
57 p_config->conffile[0] = 0;
58 p_config->print[0] = 0;
59 p_config->xmloutput = 0;
61 p_config->infile = stdin;
62 p_config->outfile = stdout;
64 /* set up command line parameters */
66 while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
/* NOTE(review): strcpy() copies arg without a length limit; confirm that
   conffile is large enough for any argument, or bound the copy. */
71 strcpy(p_config->conffile, arg);
/* NOTE(review): same unbounded copy concern for print. */
74 strcpy(p_config->print, arg);
77 p_config->xmloutput = 1;
80 print_option_error(p_config);
84 //p_config->infile = fopen("/etc/passwd", "r");
/* The usage text is shown when both conffile and print are empty; part of
   this condition lies on lines not visible here - TODO confirm. */
88 if ((!strlen(p_config->conffile)
89 && !strlen(p_config->print))
93 print_option_error(p_config);
97 /* UConverter *conv; */
98 /* conv = ucnv_open("utf-8", &status); */
99 /* assert(U_SUCCESS(status)); */
102 /* = ucnv_toUChars(conv, ustr16, 1024, */
103 /* (const char *) *xstr8, strlen((const char *) *xstr8), */
108 /* ucnv_fromUChars(conv, */
109 /* (char *) *xstr8, strlen((const char *) *xstr8), */
110 /* ustr16, *ustr16_len, */
112 /* ucnv_close(conv); */
/*
 * List the ICU converters: the count from ucnv_countAvailable(), the default
 * name from ucnv_getDefaultName(), and each name from ucnv_getAvailableName().
 * With p_config->xmloutput set the list is written as <converters> and
 * <converter> elements, otherwise as plain text.
 * Output goes to the global config.outfile rather than through p_config.
 */
115 static void print_icu_converters(const struct config_t *p_config)
120 count = ucnv_countAvailable();
121 if (p_config->xmloutput)
122 fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
123 count, ucnv_getDefaultName());
125 fprintf(config.outfile, "Available ICU converters: %d\n", count);
126 fprintf(config.outfile, "Default ICU Converter is: '%s'\n", ucnv_getDefaultName());
/* One entry per available converter. */
129 for(i=0;i<count;i++){
130 if (p_config->xmloutput)
131 fprintf(config.outfile, "<converter id=\"%s\"/>\n", ucnv_getAvailableName(i));
133 fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
136 if (p_config->xmloutput)
137 fprintf(config.outfile, "</converters>\n");
139 fprintf(config.outfile, "\n");
/*
 * List the ICU transliterator IDs: the count comes from
 * utrans_countAvailableIDs() and each ID from utrans_getAvailableID().
 * With p_config->xmloutput set the IDs are written as <transliterators> and
 * <transliterator> elements, otherwise as plain text.
 * A built-in reference of Unicode set patterns and example rules is also
 * written. Output goes to the global config.outfile.
 */
142 static void print_icu_transliterators(const struct config_t *p_config)
147 count = utrans_countAvailableIDs();
/* Capacity passed to utrans_getAvailableID() for buf. */
149 int32_t buf_cap = 128;
152 if (p_config->xmloutput)
153 fprintf(config.outfile, "<transliterators count=\"%d\">\n", count);
155 fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
159 utrans_getAvailableID(i, buf, buf_cap);
160 if (p_config->xmloutput)
161 fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
163 fprintf(config.outfile, " %s", buf);
166 if (p_config->xmloutput){
167 fprintf(config.outfile, "</transliterators>\n");
/* Reference text on Unicode set patterns and transliteration rules.
   NOTE(review): presumably only emitted on the non-XML path; the branch
   keyword is not visible here - confirm. */
171 fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
172 " Pattern Description\n"
173 " Ranges [a-z] The lower case letters a through z\n"
174 " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n"
175 " String [abc{def}] chars a, b and c, and string 'def'\n"
176 " Categories [\\p{Letter}] Perl General Category 'Letter'.\n"
177 " Categories [:Letter:] Posix General Category 'Letter'.\n"
179 " Combination Example\n"
180 " Union [[:Greek:] [:letter:]]\n"
181 " Intersection [[:Greek:] & [:letter:]]\n"
182 " Set Complement [[:Greek:] - [:letter:]]\n"
183 " Complement [^[:Greek:] [:letter:]]\n"
185 "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
188 " [:Punctuation:] Any-Remove\n"
189 " [:Cased-Letter:] Any-Upper\n"
190 " [:Control:] Any-Remove\n"
191 " [:Decimal_Number:] Any-Remove\n"
192 " [:Final_Punctuation:] Any-Remove\n"
193 " [:Georgian:] Any-Upper\n"
194 " [:Katakana:] Any-Remove\n"
195 " [:Arabic:] Any-Remove\n"
196 " [:Punctuation:] Remove\n"
197 " [[:Punctuation:]-[.,]] Remove\n"
198 " [:Line_Separator:] Any-Remove\n"
199 " [:Math_Symbol:] Any-Remove\n"
200 " Lower; [:^Letter:] Remove (word tokenization)\n"
201 " [:^Number:] Remove (numeric tokenization)\n"
202 " [:^Katagana:] Remove (remove everything except Katagana)\n"
203 " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
204 " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents from characters)\n"
205 " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
206 " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
208 "see http://icu.sourceforge.net/userguide/Transform.html\n"
209 " http://www.unicode.org/Public/UNIDATA/UCD.html\n"
210 " http://icu.sourceforge.net/userguide/Transform.html\n"
211 " http://icu.sourceforge.net/userguide/TransformRule.html\n"
215 fprintf(config.outfile, "\n\n");
/*
 * List the ICU locales reported by uloc_countAvailable()/uloc_getAvailable().
 * For each locale the display keyword, language, script, country, variant
 * and name are requested with "en" as the display locale, plus the name in
 * the locale's own language; each result is converted with u_strToUTF8()
 * using a destination capacity of 128.
 * With p_config->xmloutput set every locale becomes a <locale> element whose
 * attributes are the non-empty strings. Output goes to the global
 * config.outfile; ICU failures are reported on stderr.
 */
220 static void print_icu_xml_locales(const struct config_t *p_config)
224 UErrorCode status = U_ZERO_ERROR;
227 int32_t keyword_len = 0;
228 char keyword_str[128];
229 int32_t keyword_str_len = 0;
232 int32_t language_len = 0;
234 int32_t lang_str_len = 0;
237 int32_t script_len = 0;
238 char script_str[128];
239 int32_t script_str_len = 0;
242 int32_t location_len = 0;
243 char location_str[128];
244 int32_t location_str_len = 0;
247 int32_t variant_len = 0;
248 char variant_str[128];
249 int32_t variant_str_len = 0;
252 int32_t name_len = 0;
254 int32_t name_str_len = 0;
257 int32_t localname_len = 0;
258 char localname_str[128];
259 int32_t localname_str_len = 0;
261 count = uloc_countAvailable() ;
263 if (p_config->xmloutput){
265 fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
266 count, uloc_getDefault(), ucol_countAvailable());
273 = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
277 u_strToUTF8(keyword_str, 128, &keyword_str_len,
278 keyword, keyword_len,
283 = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
287 u_strToUTF8(lang_str, 128, &lang_str_len,
288 language, language_len,
293 = uloc_getDisplayScript(uloc_getAvailable(i), "en",
297 u_strToUTF8(script_str, 128, &script_str_len,
302 = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
306 u_strToUTF8(location_str, 128, &location_str_len,
307 location, location_len,
311 = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
315 u_strToUTF8(variant_str, 128, &variant_str_len,
316 variant, variant_len,
320 = uloc_getDisplayName(uloc_getAvailable(i), "en",
324 u_strToUTF8(name_str, 128, &name_str_len,
329 = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
333 u_strToUTF8(localname_str, 128, &localname_str_len,
334 localname, localname_len,
/* XML form: one <locale> element; attributes are written only for non-empty
   strings, and the localized name is repeated as the element text. */
338 if (p_config->xmloutput){
339 fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
340 /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
341 /* if (strlen(keyword_str)) */
342 /* fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
343 /* if (ucol_getAvailable(i)) */
344 /* fprintf(config.outfile, " collation=\"1\""); */
345 if (strlen(lang_str))
346 fprintf(config.outfile, " language=\"%s\"", lang_str);
347 if (strlen(script_str))
348 fprintf(config.outfile, " script=\"%s\"", script_str);
349 if (strlen(location_str))
350 fprintf(config.outfile, " location=\"%s\"", location_str);
351 if (strlen(variant_str))
352 fprintf(config.outfile, " variant=\"%s\"", variant_str);
353 if (strlen(name_str))
354 fprintf(config.outfile, " name=\"%s\"", name_str);
355 if (strlen(localname_str))
356 fprintf(config.outfile, " localname=\"%s\"", localname_str);
357 fprintf(config.outfile, ">");
358 if (strlen(localname_str))
359 fprintf(config.outfile, "%s", localname_str);
360 fprintf(config.outfile, "</locale>\n");
/* NOTE(review): this tests xmloutput == 1 in the else-path of a test on
   xmloutput itself, and read_params() only stores 0 or 1, so the
   "id | name | localname" table form looks unreachable - confirm intent. */
362 else if (1 == p_config->xmloutput){
363 fprintf(config.outfile, "%s", uloc_getAvailable(i));
364 fprintf(config.outfile, " | ");
365 if (strlen(name_str))
366 fprintf(config.outfile, "%s", name_str);
367 fprintf(config.outfile, " | ");
368 if (strlen(localname_str))
369 fprintf(config.outfile, "%s", localname_str);
370 fprintf(config.outfile, "\n");
373 fprintf(config.outfile, "%s ", uloc_getAvailable(i));
375 if (p_config->xmloutput)
376 fprintf(config.outfile, "</locales>\n");
378 fprintf(config.outfile, "\n");
/* Report any ICU failure recorded in status. */
380 if(U_FAILURE(status)) {
381 fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
/*
 * Print ICU information selected by the first character of config.print:
 * 'c' lists converters, 'l' lists locales, 't' lists transliterators.
 * The three calls after that chain print all sections (presumably the
 * fallback branch; its keyword is not visible here).
 * With p_config->xmloutput set an XML declaration is written first and a
 * closing </icu> tag last.
 */
387 static void print_info(const struct config_t *p_config)
389 if (p_config->xmloutput)
390 fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
393 if ('c' == config.print[0])
394 print_icu_converters(&config);
395 else if ('l' == config.print[0])
396 print_icu_xml_locales(&config);
397 else if ('t' == config.print[0])
398 print_icu_transliterators(&config);
400 print_icu_converters(&config);
401 print_icu_xml_locales(&config);
402 print_icu_transliterators(&config);
405 if (p_config->xmloutput)
406 fprintf(config.outfile, "</icu>\n");
/*
 * Tokenize text read from config.infile with an ICU chain.
 * The chain is built by icu_chain_xml_config() from the root element of the
 * XML file named by config.conffile. Each input line is handed to the chain
 * with icu_chain_assign_cstr() and its tokens are pulled with
 * icu_chain_next_token(); for every token the normalized and display forms
 * are written to config.outfile, as <token/> elements when
 * p_config->xmloutput is set and as plain text otherwise.
 * The chain is released with icu_chain_destroy().
 */
413 static void process_text_file(const struct config_t *p_config)
419 xmlDoc *doc = xmlParseFile(config.conffile);
420 xmlNode *xml_node = xmlDocGetRootElement(doc);
422 long unsigned int token_count = 0;
423 long unsigned int line_count = 0;
425 UErrorCode status = U_ZERO_ERROR;
429 config.chain = icu_chain_xml_config(xml_node, &status);
431 if (config.chain && U_SUCCESS(status))
434 if (p_config->xmloutput)
435 fprintf(config.outfile,
436 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
440 // read input lines for processing
441 while ((line_len = getline(&line, &line_cap, config.infile)) != -1) {
442 success = icu_chain_assign_cstr(config.chain, line, &status);
445 while (success && icu_chain_next_token(config.chain, &status)){
446 if (U_FAILURE(status))
/* Every attribute value is wrapped in escaped double quotes so the
   emitted <token/> element is well-formed XML. */
450 if (p_config->xmloutput)
451 fprintf(config.outfile,
452 "<token id=\"%lu\" line=\"%lu\""
453 " norm=\"%s\" display=\"%s\"/>\n",
456 icu_chain_get_norm(config.chain),
457 icu_chain_get_display(config.chain));
459 fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
462 icu_chain_get_norm(config.chain),
463 icu_chain_get_display(config.chain));
469 if (p_config->xmloutput)
470 fprintf(config.outfile,
474 icu_chain_destroy(config.chain);
/*
 * Program entry point: parse the command line into the global config, then
 * process text when a config file was named. A second test checks for a
 * print selector; the statement it guards is not visible here.
 */
483 int main(int argc, char **argv)
488 read_params(argc, argv, &config);
/* NOTE(review): if conffile and print are array members (read_params()
   indexes them), the bare `config.conffile &&` and `config.print &&` tests
   are always true and only strlen() decides - confirm. */
490 if (config.conffile && strlen(config.conffile))
491 process_text_file(&config);
493 if (config.print && strlen(config.print))
/* Message for builds without ICU; presumably selected by a preprocessor
   guard that is not visible here - TODO confirm. */
498 printf("ICU not available on your system.\n"
499 "Please install libicu36-dev and icu-doc or similar, "
500 "re-configure and re-compile\n");
512 * indent-tabs-mode: nil
514 * vim: shiftwidth=4 tabstop=8 expandtab