/* Short aliases for the unsigned integer types used throughout this tool. */
typedef unsigned char uchar;
typedef unsigned short uint16;
typedef unsigned int uint;

/* Maximum number of weight tokens accepted on one input line. */
#define MY_UCA_MAXWEIGHT_TO_PARSE 64
/* Maximum number of weights stored (and dumped) per item and level. */
#define MY_UCA_MAXWEIGHT_TO_DUMP 8
/* Number of collation levels (primary .. quaternary). */
#define MY_UCA_MAXLEVEL 4
/* Size in bytes of the buffer holding the "@version" string. */
#define MY_UCA_VERSION_SIZE 32
/* Maximum number of code points in one contraction. */
#define MY_UCA_MAX_CONTRACTION 6
/* Maximum number of contractions that can be loaded. */
#define MY_UCA_NCONTRACTIONS 1024
/* One past the highest Unicode code point. */
#define MY_UCA_MAXCHAR (0x10FFFF+1)
/* Characters per page; a code point is split into page and offset. */
#define MY_UCA_NCHARS 256
/* Mask for the offset-within-page part (MY_UCA_NCHARS - 1). */
#define MY_UCA_CMASK 255
/* Right shift that turns a code point into its page number. */
#define MY_UCA_PSHIFT 8
/*
  Number of pages. The expansion is parenthesized so that the macro behaves
  as a single operand: without the parentheses "x % MY_UCA_NPAGES" or
  "x / MY_UCA_NPAGES" would bind to MY_UCA_MAXCHAR first and then divide
  the result by MY_UCA_NCHARS.
*/
#define MY_UCA_NPAGES (MY_UCA_MAXCHAR/MY_UCA_NCHARS)
/*
  NOTE(review): the lines below are members of structures whose
  declarations are not visible in this extract; the comments describe only
  how the visible code uses each member.
*/
/* Weights of one item, accessed as weight[level][position]. */
39 uint16 weight[MY_UCA_MAXLEVEL+1][MY_UCA_MAXWEIGHT_TO_DUMP];
/* Code points of one contraction; load_uca_file() stores 0 in unused slots. */
46 uint ch[MY_UCA_MAX_CONTRACTION];
/* Version string copied from the "@version" line of the input. */
52 char version[MY_UCA_VERSION_SIZE];
/* Set from the --optimize-contractions=NUM command line option. */
56 int optimize_contractions;
/*
  Load UCA weight data from stdin into *uca.

  Each input line is read with fgets(). The visible code recognises empty
  lines and '#' comment lines, an "@version <value>" line, and data lines
  of the form "<hex code points> ; <weights> # <comment>".

  @param uca         Destination structure (version, items, contractions).
  @param maxchar     Code points above this value are treated as out of range.
  @param pageloaded  Per-page counters; the entry for a loaded code point's
                     page (code >> MY_UCA_PSHIFT) is incremented.

  NOTE(review): several diagnostics below print size_t values (lineno, i,
  out_of_range_chars) with "%d"; the specifiers do not match the argument
  types.
*/
64 static int load_uca_file(
MY_UCA *uca,
65 size_t maxchar,
int *pageloaded)
68 size_t lineno, out_of_range_chars= 0;
/* Pointers to the weight tokens of the line currently being parsed. */
69 char *weights[MY_UCA_MAXWEIGHT_TO_PARSE];
/* One iteration per input line. */
71 for (lineno= 0; fgets(str,
sizeof(str), stdin); lineno++)
75 char *s, *ch[MY_UCA_MAX_CONTRACTION];
76 size_t codenum,
i,
code;
/* Empty lines and '#' comment lines. */
81 if (*str==
'\r' || *str ==
'\n' || *str ==
'#')
/* "@version <value>": the second whitespace-separated token is the version. */
85 if (*str ==
'@' && !strncmp(str,
"@version ", 9))
88 if (strtok(str,
" \r\n\t") && (value= strtok(NULL,
" \r\n\t")))
/*
  The value comes straight from the input file, so it is passed as data
  with an explicit "%s" format. Using it as the format string itself would
  let '%' sequences in the file drive snprintf().
*/
89 snprintf(uca->version, MY_UCA_VERSION_SIZE, "%s", value);
/*
  The leading hexadecimal number on the line is the (first) code point.
  NOTE(review): the test is '>' rather than '>=', so code == maxchar is
  accepted - confirm uca->item has room for that index.
*/
94 if ((code= strtol(str,NULL,16)) > maxchar)
/* Locate the trailing '#' comment and skip the spaces that start it. */
101 if ((comment= strchr(str,
'#')))
104 for ( ; *comment ==
' '; comment++);
108 fprintf(stderr,
"Warning: could not parse line #%d:\n'%s'\n",
/* Locate the ';' that separates code points from weights. */
114 if ((weight= strchr(str,
';')))
117 for ( ; *weight==
' '; weight++);
121 fprintf(stderr,
"Warning: could not parse line #%d:\n%s\n", lineno, str);
/* Split the code point list on spaces and tabs, counting the tokens. */
125 for (codenum= 0, s= strtok(str,
" \t"); s;
126 codenum++, s= strtok(NULL,
" \t"))
128 if (codenum == MY_UCA_MAX_CONTRACTION)
130 fprintf(stderr,
"Contraction length is too long (%d) line #%d",
/*
  NOTE(review): the message below names MY_UCA_MAXCONTRACTIONS, but the
  limit actually checked is MY_UCA_NCONTRACTIONS.
*/
144 if (uca->ncontractions >= MY_UCA_NCONTRACTIONS)
147 "Too many contractions (%d) at line #%d\n"
148 "Rebuild with a bigger MY_UCA_MAXCONTRACTIONS value\n",
149 uca->ncontractions, lineno);
/* Store the parsed code points; slots past codenum are set to 0. */
153 for (i= 0; i < MY_UCA_MAX_CONTRACTION; i++)
155 c->ch[
i]= (i < codenum) ? (uint) strtol(ch[i], NULL, 16) : 0;
159 fprintf(stderr,
"Contraction: %04X-%04X-%04X\n",
160 c->ch[0], c->ch[1], c->ch[2]);
/* Single code point: the item is looked up directly by code. */
165 item= &uca->item[code];
/* Split the weight list on spaces and square brackets. */
176 s= strtok(weight,
" []");
179 if (item->num >= MY_UCA_MAXWEIGHT_TO_PARSE)
181 fprintf(stderr,
"Line #%d has more than %d weights\n",
182 lineno, MY_UCA_MAXWEIGHT_TO_PARSE);
183 fprintf(stderr,
"Can't continue.\n");
186 weights[item->num]= s;
187 s= strtok(NULL,
" []");
/* Convert each collected weight token into per-level numeric weights. */
191 for (i= 0; i < item->num; i++)
/* Only the first MY_UCA_MAXWEIGHT_TO_DUMP weights are kept. */
195 if (i >= MY_UCA_MAXWEIGHT_TO_DUMP)
198 "Warning: at line %d: character %04X has"
199 " more than %d many weights (%d). "
200 "Skipping the extra weights.\n",
201 lineno, code, MY_UCA_MAXWEIGHT_TO_DUMP, item->num);
202 item->num= MY_UCA_MAXWEIGHT_TO_DUMP;
206 for (s= weights[i]; *s; )
/* s + 1 skips one separator character in front of each hexadecimal part. */
209 size_t part= strtol(s + 1, &endptr, 16);
210 if (i < MY_UCA_MAXWEIGHT_TO_DUMP)
212 item->weight[
level][
i]= part;
216 fprintf(stderr,
"Too many weights (%d) at line %d\n", i, lineno);
/* Count this code point against the page it belongs to. */
224 pageloaded[code >> MY_UCA_PSHIFT]++;
227 if (out_of_range_chars)
228 fprintf(stderr,
"%d out-of-range characters skipped\n", out_of_range_chars);
/*
  Assign implicit weights to items, walking every code point below maxchar.

  The primary level gets a two-element weight derived from the code point:
    aaaa = base + (code >> 15)
    bbbb = (code & 0x7FFF) | 0x8000
  where base depends on which of the two ranges tested below the code point
  falls into (the assignments to base are not visible in this extract).
  The secondary, tertiary and quaternary levels get the fixed first weights
  0x0020, 0x0002 and 0x0001, each followed by a 0x0000 weight.

  NOTE(review): the condition selecting which items receive implicit
  weights is not visible in this extract.

  @param uca      Structure whose items are filled in.
  @param maxchar  Exclusive upper bound of the code points processed.
*/
240 set_implicit_weights(
MY_UCA *uca,
size_t maxchar)
244 for (code= 0; code < maxchar; code++)
246 size_t base, aaaa, bbbb;
259 if (code >= 0x3400 && code <= 0x4DB5)
261 else if (code >= 0x4E00 && code <= 0x9FA5)
266 aaaa= base + (code >> 15);
267 bbbb= (code & 0x7FFF) | 0x8000;
268 item->weight[0][0]= aaaa;
269 item->weight[0][1]= bbbb;
271 item->weight[1][0]= 0x0020;
272 item->weight[1][1]= 0x0000;
274 item->weight[2][0]= 0x0002;
275 item->weight[2][1]= 0x0000;
277 item->weight[3][0]= 0x0001;
/* Second weight of the level, matching levels 0..2 (index was [3][2]). */
278 item->weight[3][1]= 0x0000;
/*
  Collect statistics for one page of MY_UCA_NCHARS items at a given level.

  For every item on the page the non-zero weights at this level are
  examined, and *maxnum is raised to the largest per-item count seen.
  Items that have exactly one weight equal to the implicit secondary value
  (0x0020, level 1) or tertiary value (0x0002, level 2) are detected
  separately.
  NOTE(review): the statements that update num and *ndefs are not visible
  in this extract; presumably *ndefs counts such default-only items.

  @param uca     Loaded data.
  @param page    Page number; items page*MY_UCA_NCHARS .. +MY_UCA_NCHARS-1.
  @param level   Weight level to examine.
  @param maxnum  In/out: running maximum of weights per item.
  @param ndefs   In/out: see the note above.
*/
286 get_page_statistics(
MY_UCA *uca,
size_t page,
size_t level,
287 size_t *maxnum,
size_t *ndefs)
291 for (offs= 0; offs < MY_UCA_NCHARS; offs++)
294 MY_UCA_ITEM *item= &uca->item[page * MY_UCA_NCHARS + offs];
/* Look at every stored weight of the item at this level. */
297 for (num= 0, i= 0; i < item->num; i++)
299 if (item->weight[level][i])
/* Keep the running maximum. */
302 *maxnum= *maxnum < num ? num : *maxnum;
/* Single secondary weight equal to the implicit value 0x0020. */
305 if (level == 1 && num == 1)
308 if (item->weight[level][0] == 0x0020)
/* Single tertiary weight equal to the implicit value 0x0002. */
311 else if (level == 2 && num == 1)
314 if (item->weight[level][0] == 0x0002)
/*
  Per-level suffix appended to generated identifiers, indexed by level
  (0 = no suffix).
*/
321 static const char *pname[]= {
"",
"_s",
"_t",
"_q"};
/* Human-readable level names, indexed by level; used in generated comments. */
322 static const char *lname[]= {
"primary",
"secondary",
"tertiary",
"quaternary"};
/*
  Body fragment that builds an identifier prefix starting with "uca" and
  followed by characters taken from uca->version.
  NOTE(review): the enclosing function header is not visible here;
  presumably this is prefix_name(), which the printing code calls.
*/
/* Static buffer: the result stays valid after return but is shared by all calls. */
327 static char prefix[MY_UCA_VERSION_SIZE];
329 strcpy(prefix,
"uca");
/* d starts right after "uca"; s walks the version string. */
330 for (s= uca->version, d= prefix + strlen(prefix); *s; s++)
/*
  Only digits and lower-case ASCII letters pass this test.
  NOTE(review): prefix has the same size as uca->version but already holds
  "uca", so a version with enough accepted characters could overrun it -
  confirm against the copy statement, which is not visible here.
*/
332 if ((*s >=
'0' && *s <=
'9') || (*s >=
'a' && *s <=
'z'))
/*
  Format the identifier of a generated page array into a static buffer.

  The returned pointer refers to that static buffer, so each call
  overwrites the result of the previous one.
  NOTE(review): the format string and its arguments are not visible in
  this extract.

  @param uca    Loaded data.
  @param page   Page number.
  @param level  Weight level.
  @return       Pointer to the static, NUL-terminated name buffer.
*/
341 page_name(
MY_UCA *uca,
size_t page,
size_t level)
343 static char page_name_buf[120];
345 snprintf(page_name_buf,
sizeof(page_name_buf),
349 return page_name_buf;
/*
  Fragment of a routine that copies the non-zero weights of an item at one
  level into the caller's weight[] array.
  NOTE(review): the function header is only partly visible; callers invoke
  it as normalize_weight(item, level, weight, weight_elements) and one of
  them adds the return value to a write pointer, so it presumably returns
  the number of weights stored.
*/
358 uint16 *weight,
size_t weight_elements)
/* Start from an all-zero output array. */
362 memset(weight, 0, weight_elements *
sizeof(*weight));
/* At most MY_UCA_MAXWEIGHT_TO_DUMP source weights are considered. */
374 for (num= 0, i= 0; i < item->num && i < MY_UCA_MAXWEIGHT_TO_DUMP; i++)
/* Zero weights are skipped, which packs the output. */
376 if (item->weight[level][i])
378 weight[num]= item->weight[
level][
i];
/* Optional build-time transformation of weights (tertiary, per the message). */
379 #ifdef INVERT_TERTIARY_WEIGHTS
386 if (weight[num] >= 0x20)
387 fprintf(stderr,
"Tertiary weight is too big: %02X\n", weight[num]);
/* Replace the weight by 0x20 minus the weight. */
388 weight[num]= (uint) (0x20) - weight[num];
/*
  Print one page of weights to stdout as a C "uint16 name[]= {...}" array.

  Every item of the page is normalized with normalize_weight() and maxnum
  values are printed for it.

  @param uca     Loaded data.
  @param level   Weight level to print.
  @param page    Page number.
  @param maxnum  Number of weights printed per character.

  NOTE(review): size_t arguments are printed with "%04X" and "%d" in
  several printf calls below; the specifiers do not match the types.
*/
399 print_one_page(
MY_UCA *uca,
size_t level,
400 size_t page,
size_t maxnum)
402 size_t offs, mchars, nchars= 0, chars_per_line;
/* Array header: generated name, first code point of the page, weights per char. */
404 printf(
"uint16 %s[]= { /* %04X (%d weights per char) */\n",
405 page_name(uca, page, level), page * MY_UCA_NCHARS, maxnum);
/* Output layout: fewer characters per line as the per-character count grows. */
410 case 0: mchars= 8; chars_per_line= 8;
break;
411 case 1: mchars= 8; chars_per_line= 8;
break;
412 case 2: mchars= 8; chars_per_line= 4;
break;
413 case 3: mchars= 9; chars_per_line= 3;
break;
414 case 4: mchars= 8; chars_per_line= 2;
break;
/*
  NOTE(review): offs appears to be read here before the loop below assigns
  it - confirm this line is not reachable with offs uninitialized.
*/
416 mchars= uca->item[page * MY_UCA_NCHARS + offs].num;
/* One iteration per character of the page. */
422 for (offs=0; offs < MY_UCA_NCHARS; offs++)
/* One spare element beyond MY_UCA_MAXWEIGHT_TO_DUMP. */
424 uint16 weight[MY_UCA_MAXWEIGHT_TO_DUMP + 1];
425 size_t num,
i, code= page * MY_UCA_NCHARS + offs;
428 normalize_weight(item, level, weight,
sizeof(weight)/
sizeof(weight[0]));
/* Always print maxnum values per character. */
431 for (i= 0; i < maxnum; i++)
435 printf(
"0x%04X", tmp);
/* Sanity check that the value fits in 16 bits. */
437 if (tmp > 0xFFFF || tmp < 0)
440 "Error: Too big weight for code point %04X level %d: %08X\n",
/* True for every value except the very last one of the page. */
445 if ((offs + 1 != MY_UCA_NCHARS) || (i + 1 != maxnum))
/* End-of-line comment: code point of the first character on the line. */
455 printf(
" /* %04X */\n", (code + 1) - chars_per_line);
/*
  Compare the first len elements of two weight arrays.
  NOTE(review): the loop body and return statements are not visible in
  this extract; the caller treats a zero result as "equal".
*/
473 weight_cmp(uint16 *w1, uint16 *w2,
size_t len)
476 for (i= 0; i < len; i++)
/*
  Body fragment that prints one contraction as a C initializer:
  MY_UCA_MAX_CONTRACTION code points followed by MY_UCA_MAXWEIGHT_TO_DUMP
  weights. A NULL contraction is printed as all zeros.
  NOTE(review): the function header is not visible here; the calls in
  print_contractions() show it is print_contraction(uca, c, level).
*/
489 uint16 weight[MY_UCA_MAXWEIGHT_TO_DUMP + 1];
/* Weights of the contraction itself at this level. */
494 normalize_weight(&c->item, level,
495 weight,
sizeof(weight)/
sizeof(weight[0]));
/* Optional check, enabled by --optimize-contractions. */
497 if (uca->optimize_contractions)
/* Buffer for the concatenated weights of the individual characters. */
524 uint16 sweight[MY_UCA_MAXWEIGHT_TO_DUMP*MY_UCA_MAX_CONTRACTION + 1], *sw;
/*
  Walk the code points of the contraction until a zero entry.
  NOTE(review): the loop has no explicit bound on i; it relies on a zero
  entry being present within c->ch[] - confirm for contractions that use
  all MY_UCA_MAX_CONTRACTION slots.
*/
527 for (sw= sweight, i= 0; c->ch[
i]; i++)
/* Append this character's weights and advance by the returned count. */
530 sw+= normalize_weight(item, level, sw, MY_UCA_MAXWEIGHT_TO_DUMP);
/* The concatenation is short enough and equals the contraction's weights. */
533 if (sw - sweight < MY_UCA_MAXWEIGHT_TO_DUMP &&
534 !weight_cmp(sweight, weight, MY_UCA_MAXWEIGHT_TO_DUMP))
537 fprintf(stderr,
"Equal[%d]: %04X [%04X-%04X-%04X] == {%04X,%04X,%04X} [%04X-%04X-%04X]\n",
539 c->ch[0], sweight[0], sweight[1], sweight[2],
540 c->ch[0], c->ch[1], c->ch[2],
541 weight[0], weight[1], weight[2]);
/* When optimize is set, the entry is emitted inside a C comment. */
547 printf(
"%s{", optimize ?
"/* " :
"");
/* Code points, comma separated. */
548 for (ch= 0; ch < MY_UCA_MAX_CONTRACTION; ch++)
550 uint codepoint= c ? c->ch[ch] : 0;
551 printf(
"%s", ch > 0 ?
"," :
"");
553 printf(
"0x%04X", codepoint);
/* Weights, comma separated. */
559 for (ch= 0; ch < MY_UCA_MAXWEIGHT_TO_DUMP; ch++)
561 uint w= c ? weight[ch] : 0;
562 printf(
"%s", ch > 0 ?
"," :
"");
570 printf(
"},%s\n", optimize ?
" */" :
"");
/*
  Print the contraction table of one level to stdout as a C array named
  "<prefix>_default_contraction<suffix>".

  @param uca    Loaded data, including uca->ncontractions contractions.
  @param level  Weight level; selects the lname[] and pname[] entries.
*/
575 print_contractions(
MY_UCA *uca,
size_t level)
580 printf(
"/* Contractions, %s level */\n", lname[level]);
581 printf(
"MY_CONTRACTION %s_default_contraction%s[]= {\n",
582 prefix_name(uca), pname[level]);
/* One entry per loaded contraction. */
583 for (i= 0; i < uca->ncontractions; i++)
586 print_contraction(uca, c, level);
/* A NULL contraction prints an all-zero entry. */
588 print_contraction(uca, NULL, level);
/* Value of the --contractions=NUM option; 0 by default. */
593 static int contractions= 0;
/* Value of the --levels=NUM option: how many levels main() dumps; 1 by default. */
594 static int nlevels= 1;
/*
  Print the command line help text to the given stream.

  @param file  Output stream.
  @param rc    Not used by the statements visible in this extract.
               NOTE(review): presumably the process exit code - confirm
               against the full function.
*/
597 usage(FILE *
file,
int rc)
599 fprintf(file,
"Usage:\n");
600 fprintf(file,
"uca-dump [options...] < /path/to/allkeys.txt\n");
602 fprintf(file,
"Options:\n");
603 fprintf(file,
"--levels=NUM How many levels to dump, 1-4, default 1.\n");
/* Help text spelling corrected ("contractions"). */
604 fprintf(file,
"--contractions=NUM Whether to dump contractions, 0-1, default 0.\n");
605 fprintf(file,
"--optimize-contractions=NUM Whether to optimize contractions, 0-1, default 0.\n");
606 fprintf(file,
"--debug=NUM Print debug information, 0-1, default 0.\n");
607 fprintf(file,
"\n\n");
/*
  Try to parse one command line argument as "<name><integer>".

  @param str   The argument.
  @param name  Option prefix including the '=' sign, e.g. "--levels=".
  @param num   Receives the parsed value when str starts with name.

  NOTE(review): the return statements are not visible in this extract;
  process_options() treats a zero result as "option did not match".
*/
613 get_int_option(
const char *str,
const char *
name,
int *num)
615 size_t namelen= strlen(name);
/* Only handle the argument if it starts with this option's prefix. */
616 if (!strncmp(str, name, namelen))
618 *num= atoi(str + namelen);
/*
  atoi() also returns 0 for non-numeric text, so a zero result is accepted
  only when the value really starts with the digit '0'.
*/
619 if (*num == 0 && str[namelen] !=
'0')
621 fprintf(stderr,
"\nBad numeric option value: %s\n\n", str);
/*
  Parse the command line arguments.

  Each argument is offered to the known integer options in turn; an
  argument matched by none of them is reported as unknown.

  @param ac   Argument count, as passed to main().
  @param av   Argument vector, as passed to main().
  @param uca  Receives the --debug and --optimize-contractions values.
*/
631 process_options(
int ac,
char **av,
MY_UCA *uca)
/* av[0] is skipped. */
634 for (i= 1; i < ac ; i++)
/* The && chain stops at the first option that recognises the argument. */
637 if (!get_int_option(av[i],
"--levels=", &nlevels) &&
638 !get_int_option(av[i],
"--contractions=", &contractions) &&
639 !get_int_option(av[i],
"--debug=", &uca->debug) &&
640 !get_int_option(av[i],
"--optimize-contractions=", &uca->optimize_contractions))
642 fprintf(stderr,
"\nUnknown option: %s\n\n", av[i]);
/*
  Program entry point.

  Zeroes the UCA structure, parses the command line, loads the weight data
  from stdin, fills in implicit weights, and then prints generated C source
  to stdout: the MY_UCA_* defines, and for each requested level the page
  arrays, a per-page length table and a per-page weight pointer table.

  @param ac  Argument count.
  @param av  Argument vector.
*/
650 main(
int ac,
char **av)
653 size_t level, maxchar= MY_UCA_MAXCHAR;
/* Per-page count of code points loaded from the input. */
654 static int pageloaded[MY_UCA_NPAGES];
656 memset(&uca, 0,
sizeof(uca));
658 process_options(ac, av, &uca);
660 memset(pageloaded, 0,
sizeof(pageloaded));
662 load_uca_file(&uca, maxchar, pageloaded);
665 set_implicit_weights(&uca, maxchar);
/* Start of the generated source. */
668 printf(
"#include \"my_uca.h\"\n");
670 printf(
"#define MY_UCA_NPAGES %d\n", MY_UCA_NPAGES);
671 printf(
"#define MY_UCA_NCHARS %d\n", MY_UCA_NCHARS);
672 printf(
"#define MY_UCA_CMASK %d\n", MY_UCA_CMASK);
673 printf(
"#define MY_UCA_PSHIFT %d\n", MY_UCA_PSHIFT);
675 printf(
"/* Created from allkeys.txt. Unicode version '%s'. */\n\n", uca.version);
/*
  NOTE(review): nlevels comes from --levels with no range check visible in
  this extract, and it is compared against a size_t; a negative or too
  large value would index pname[]/lname[] out of bounds - confirm that
  validation exists elsewhere.
*/
677 for (level= 0; level < nlevels; level++)
/* Weights per character of every page at this level; 0 means no array. */
680 int pagemaxlen[MY_UCA_NPAGES];
682 for (page=0; page < MY_UCA_NPAGES; page++)
/* Pages with no code points loaded from the input. */
690 if (!pageloaded[page])
697 get_page_statistics(&uca, page, level, &maxnum, &ndefs);
/* ndefs reached the full page size. */
707 if (ndefs == MY_UCA_NCHARS)
710 pagemaxlen[
page]= maxnum;
714 print_one_page(&uca, level, page, maxnum);
/* Per-page length table. */
719 printf(
"uchar %s_length%s[%d]={\n",
720 prefix_name(&uca), pname[level], MY_UCA_NPAGES);
721 for (page=0; page < MY_UCA_NPAGES; page++)
/* No comma after the last entry; line break after every 16 entries. */
725 page < MY_UCA_NPAGES - 1 ?
"," :
"" ,
726 (page + 1) % 16 ?
"" :
"\n");
/* Per-page table of pointers to the weight arrays. */
732 printf(
"uint16 *%s_weight%s[%d]={\n",
733 prefix_name(&uca), pname[level], MY_UCA_NPAGES);
734 for (page=0; page < MY_UCA_NPAGES; page++)
736 const char *comma= page < MY_UCA_NPAGES - 1 ?
"," :
"";
/* Line break after every 4 entries. */
737 const char *nline= (page + 1) % 4 ?
"" :
"\n";
/* Pages without an array are emitted as NULL. */
738 if (!pagemaxlen[page])
739 printf(
"NULL %s%s%s", level ?
" ":
"", comma , nline);
741 printf(
"%s%s%s", page_name(&uca, page, level), comma, nline);
747 print_contractions(&uca, level);
/*
  The generated file ends with a trivial main().
  NOTE(review): the emitted text has a stray ';' after the function body.
*/
751 printf(
"int main(void){ return 0;};\n");