1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 2000-2009 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * *
19 ***********************************************************************/
20 #pragma prototyped
21 /*
22 * Glenn Fowler
23 * AT&T Research
24 */
25
/*
 * Option and manual description string handed to optget() in main().
 * It declares the -h/--html, -m/--msg and -r/--raw options that main()
 * switches on; the text itself is runtime data and must not be reworded
 * casually.
 */
static const char usage[] =
"[-?\n@(#)$Id: msgcvt (AT&T Research) 2000-05-01 $\n]"
USAGE_LICENSE
"[+NAME?msgcvt - convert message file to/from html]"
"[+DESCRIPTION?\bmsgcvt\b reads a \bgencat\b(1) format file on the standard"
" input and converts it to \bhtml\b on the standard output. The input"
" file must contain the control statement \b$quote \"\b and use the \""
" character to quote message text. The output is in a form suitable for"
" automatic translation by web sites like"
" \bhttp://babelfish.altavista.com/\b or filters like"
" \btranslate\b(1).]"
"[h:html?Generate \bhtml\b from \bgencat\b(1) input. This is the default.]"
"[m:msg?Generate a \bgencat\b(1) message file from (presumably translated)"
" \bhtml\b. Wide characters are UTF-8 encoded.]"
"[r:raw?The message file is raw message text, one message per line, with no"
" quoting or line numbering.]"
"[+SEE ALSO?\bgencat\b(1), \bmsgcc\b(1), \bmsggen\b(1), \btranslate\b(1)]"
;
44
45 #include <ast.h>
46 #include <ctype.h>
47 #include <error.h>
48
/* conversion flags passed to the Convert_f functions */
#define MSG_RAW		(1<<0)	/* set by -r in main(): raw one-message-per-line text */
#define MSG_SPLICE	(1<<1)	/* set by msg2html() when a line ends with a backslash */

/*
 * SPACE(s): s must be a modifiable char pointer lvalue.
 * If s points at a white space character it is advanced by 1; if it
 * points at a literal \n or \t escape (backslash followed by 'n' or 't')
 * it is advanced by 2. Yields non-zero when s was advanced, 0 otherwise
 * (s is then left untouched).
 *
 * The character is converted to unsigned char before isspace() because
 * plain char may be signed and negative values are undefined for the
 * <ctype.h> classifiers.
 */
#define SPACE(s)	((isspace((unsigned char)*(s)) && ((s) += 1)) || \
			 (*(s) == '\\' && (*((s) + 1) == 'n' || *((s) + 1) == 't') && ((s) += 2)))
53
/* converter entry point: main() calls it as (input stream, output stream, flags) */
typedef void (*Convert_f)(Sfio_t*, Sfio_t*, int);
55
/* one named HTML character entity and the character code it stands for */
typedef struct
{
	const char*	name;
	int		code;
} Code_t;

/*
 * Named entities recognized by decode(); searched linearly with an exact,
 * case sensitive name match. Note that "nbsp" maps to a plain space.
 */
static const Code_t	codes[] =
{
	{ "aacute",	225 },
	{ "Aacute",	193 },
	{ "acirc",	226 },
	{ "Acirc",	194 },
	{ "aelig",	230 },
	{ "AElig",	198 },
	{ "agrave",	224 },
	{ "Agrave",	192 },
	{ "amp",	'&' },
	{ "aring",	229 },
	{ "Aring",	197 },
	{ "atilde",	227 },
	{ "Atilde",	195 },
	{ "auml",	228 },
	{ "Auml",	196 },
	{ "ccedil",	231 },
	{ "Ccedil",	199 },
	{ "copy",	169 },
	{ "eacute",	233 },
	{ "Eacute",	201 },
	{ "ecirc",	234 },
	{ "Ecirc",	202 },
	{ "egrave",	232 },
	{ "Egrave",	200 },
	{ "euml",	235 },
	{ "Euml",	203 },
	{ "gt",		'>' },
	{ "iacute",	237 },
	{ "Iacute",	205 },
	{ "icirc",	238 },
	{ "Icirc",	206 },
	{ "igrave",	236 },
	{ "Igrave",	204 },
	{ "iuml",	239 },
	{ "Iuml",	207 },
	{ "lt",		'<' },
	{ "nbsp",	' ' },
	{ "ntilde",	241 },
	{ "Ntilde",	209 },
	{ "oacute",	243 },
	{ "Oacute",	211 },
	{ "ocirc",	244 },
	{ "Ocirc",	212 },
	{ "ograve",	242 },
	{ "Ograve",	210 },
	{ "oslash",	248 },
	{ "Oslash",	216 },
	{ "otilde",	245 },
	{ "Otilde",	213 },
	{ "ouml",	246 },
	{ "Ouml",	214 },
	{ "quot",	'"' },
	{ "reg",	174 },
	{ "szlig",	223 },
	{ "uacute",	250 },
	{ "Uacute",	218 },
	{ "ucirc",	251 },
	{ "Ucirc",	219 },
	{ "ugrave",	249 },
	{ "Ugrave",	217 },
	{ "uuml",	252 },
	{ "Uuml",	220 },
	{ "yuml",	255 },
};
128
129 static int
decode(Sfio_t * ip)130 decode(Sfio_t* ip)
131 {
132 register int c;
133 register int i;
134 char name[32];
135
136 if ((c = sfgetc(ip)) == EOF)
137 return '&';
138 name[0] = c;
139 i = 1;
140 if (c != '#' && !isalpha(c))
141 goto bad;
142 while ((c = sfgetc(ip)) != EOF && c != ';')
143 {
144 if (c == '&')
145 i = 0;
146 else
147 {
148 name[i++] = c;
149 if (!isalnum(c) && (i > 1 || c != '#') || i >= (elementsof(name) - 1))
150 goto bad;
151 }
152 }
153 name[i] = 0;
154 if (name[0] == '#')
155 {
156 switch (c = strtol(name + 1, NiL, 10))
157 {
158 case 91:
159 c = '[';
160 break;
161 case 93:
162 c = ']';
163 break;
164 }
165 }
166 else
167 {
168 for (i = 0; i < elementsof(codes); i++)
169 if (streq(codes[i].name, name))
170 {
171 c = codes[i].code;
172 break;
173 }
174 if (i >= elementsof(codes))
175 goto bad;
176 }
177 return c;
178 bad:
179 name[i] = 0;
180 if (c == ';')
181 error(1, "&%s: unknown HTML special character -- & assumed", name);
182 else
183 error(1, "&%s: invalid HTML special character -- & assumed", name);
184 while (i--)
185 sfungetc(ip, name[i]);
186 return '&';
187 }
188
189 static int
sfpututf(Sfio_t * op,register int w)190 sfpututf(Sfio_t* op, register int w)
191 {
192 if (!(w & ~0x7F))
193 return sfputc(op, w);
194 else if (!(w & ~0x7FF))
195 sfputc(op, 0xC0 + (w >> 6));
196 else if (!(w & ~0xFFFF))
197 {
198 sfputc(op, 0xE0 + (w >> 12));
199 sfputc(op, 0x80 + (w >> 6 ) & 0x3F);
200 }
201 else
202 return sfputc(op, '?');
203 return sfputc(op, 0x80 + (w & 0x3F));
204 }
205
206 static int
sfnext(Sfio_t * ip)207 sfnext(Sfio_t* ip)
208 {
209 register int c;
210
211 while (isspace(c = sfgetc(ip)));
212 return c;
213 }
214
/*
 * Convert the HTML on ip (in the form written by msg2html()) back to
 * message text on op.
 *
 * Input is skipped up to an <OL START="550717"> tag, then translated
 * until </OL>, at which point the scan for the next such list resumes.
 * Within a list:
 *
 *	&...;		decoded with decode() and written as UTF-8
 *	<LI>digits	starts a new message: newline, the number, a space
 *			and an opening double quote
 *	<BR>		a space
 *	<P>		a newline
 *	<P CLASS="x">	the attribute value x, entities decoded
 *	other tags	dropped
 *	"		preceded by a backslash unless flags is non-zero
 *	white space	runs collapsed to one space
 *	\r [ ]		dropped
 *
 * flags is only tested for zero/non-zero here; main() passes MSG_RAW
 * when -r was given.
 *
 * NOTE(review): the value returned by decode() is passed to isspace();
 * a numeric reference can exceed the unsigned char range -- confirm the
 * ctype implementation in use tolerates that.
 */
static void
html2msg(register Sfio_t* ip, register Sfio_t* op, int flags)
{
	register int	c;
	register int	q;	/* quote character owed to the current message, 0 if none */

 again:
	/* look for the next <OL START="550717"> list marker */
	while ((c = sfgetc(ip)) != EOF)
		if (c == '<')
		{
			if ((c = sfnext(ip)) == 'O' &&
			    (c = sfnext(ip)) == 'L' &&
			    isspace(c = sfgetc(ip)) &&
			    (c = sfnext(ip)) == 'S' &&
			    (c = sfnext(ip)) == 'T' &&
			    (c = sfnext(ip)) == 'A' &&
			    (c = sfnext(ip)) == 'R' &&
			    (c = sfnext(ip)) == 'T' &&
			    (c = sfnext(ip)) == '=' &&
			    (c = sfnext(ip)) == '"' &&
			    (c = sfnext(ip)) == '5' &&
			    (c = sfnext(ip)) == '5' &&
			    (c = sfnext(ip)) == '0' &&
			    (c = sfnext(ip)) == '7' &&
			    (c = sfnext(ip)) == '1' &&
			    (c = sfnext(ip)) == '7' &&
			    (c = sfnext(ip)) == '"' &&
			    (c = sfnext(ip)) == '>')
				break;
			/* some other tag: skip to its end */
			while (c != EOF && c != '>')
				c = sfgetc(ip);
		}
	/* drop white space after the marker */
	if ((c = sfnext(ip)) != EOF)
		sfungetc(ip, c);
	q = 0;
	for (;;)
	{
		switch (c = sfgetc(ip))
		{
		case EOF:
			break;
		case '&':
			c = decode(ip);
			sfpututf(op, c);
			/* an entity that decodes to white space absorbs the white space after it */
			if (isspace(c))
			{
				while (isspace(c = sfgetc(ip)));
				if (c == EOF)
					break;
				sfungetc(ip, c);
			}
			continue;
		case '<':
			switch (c = sfnext(ip))
			{
			case '/':
				/* </OL> ends the list: close the open message and rescan */
				if ((c = sfnext(ip)) == 'O' &&
				    (c = sfgetc(ip)) == 'L' &&
				    (c = sfnext(ip)) == '>')
				{
					if (q)
					{
						sfputc(op, q);
						q = '"';
					}
					goto again;
				}
				break;
			case 'B':
				/* <BR> */
				if ((c = sfgetc(ip)) == 'R' &&
				    (c = sfnext(ip)) == '>')
					sfputc(op, ' ');
				break;
			case 'L':
				/* <LI> followed by a digit starts a new numbered message */
				if ((c = sfgetc(ip)) == 'I' &&
				    (c = sfnext(ip)) == '>' &&
				    isdigit(c = sfnext(ip)))
				{
					/* close the previous message, if any */
					if (q)
						sfputc(op, q);
					else
						q = '"';
					sfputc(op, '\n');
					do
					{
						sfputc(op, c);
					} while (isdigit(c = sfgetc(ip)));
					if (c == EOF)
						break;
					sfputc(op, ' ');
					sfputc(op, '"');
					if (isspace(c))
						c = sfnext(ip);
					/* msg2html() writes a second <LI> after the number; consume it */
					if (c == '<' &&
					    (c = sfnext(ip)) == 'L' &&
					    (c = sfgetc(ip)) == 'I' &&
					    (c = sfnext(ip)) == '>')
						/* great */;
					continue;
				}
				break;
			case 'P':
				if ((c = sfnext(ip)) == '>')
					sfputc(op, '\n');
				else if (c == 'C' &&
				    (c = sfgetc(ip)) == 'L' &&
				    (c = sfgetc(ip)) == 'A' &&
				    (c = sfgetc(ip)) == 'S' &&
				    (c = sfgetc(ip)) == 'S' &&
				    (c = sfnext(ip)) == '=' &&
				    (c = sfnext(ip)) == '"')
					/* copy the CLASS attribute value up to the closing quote */
					for (;;)
					{
						switch (c = sfgetc(ip))
						{
						case EOF:
						case '"':
							break;
						case '&':
							c = decode(ip);
							sfpututf(op, c);
							continue;
						default:
							sfpututf(op, c);
							continue;
						}
						break;
					}
				break;
			}
			/* skip whatever remains of the tag */
			while (c != EOF && c != '>')
				c = sfgetc(ip);
			if (c == EOF || (c = sfgetc(ip)) == EOF)
				break;
			sfungetc(ip, c);
			continue;
		case '"':
			if (!flags)
				sfputc(op, '\\');
			sfputc(op, c);
			continue;
		case '\n':
			if (flags)
			{
				sfputc(op, c);
				continue;
			}
			/*FALLTHROUGH*/
		case ' ':
		case '\t':
			/* collapse the white space run; decide whether it yields a space */
			while ((c = sfgetc(ip)) != EOF)
				if (c == '&')
				{
					c = decode(ip);
					if (!isspace(c))
						sfputc(op, ' ');
					sfpututf(op, c);
					break;
				}
				else if (!isspace(c))
				{
					if (c == '<')
					{
						/* peek at the tag: no space before <L... or </... */
						c = sfgetc(ip);
						if (c == EOF)
							break;
						sfungetc(ip, c);
						sfungetc(ip, '<');
						if (c != 'L' && c != '/')
							sfputc(op, ' ');
					}
					else
					{
						if (c != EOF)
							sfungetc(ip, c);
						sfputc(op, ' ');
					}
					break;
				}
			continue;
		case '\r':
		case '[':
		case ']':
			continue;
		default:
			sfpututf(op, c);
			continue;
		}
		break;
	}
	/* end of input: close the last message */
	if (q)
		sfputc(op, q);
	sfputc(op, '\n');
}
409
410 static void
encode(Sfio_t * op,register int c)411 encode(Sfio_t* op, register int c)
412 {
413 if (c == '<')
414 sfprintf(op, "<");
415 else if (c == '>')
416 sfprintf(op, ">");
417 else if (c == '"')
418 sfprintf(op, """);
419 else if (c == '&')
420 sfprintf(op, "&");
421 else if (c == '[')
422 sfprintf(op, "[");
423 else if (c == ']')
424 sfprintf(op, "]");
425 else
426 sfputc(op, c);
427 }
428
429 static void
msg2html(register Sfio_t * ip,register Sfio_t * op,register int flags)430 msg2html(register Sfio_t* ip, register Sfio_t* op, register int flags)
431 {
432 register char* s;
433 register int c;
434 register int q;
435 register int p;
436
437 sfprintf(op, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\"><HTML><HEAD><!-- text massaged for external translation --></HEAD><BODY>\n");
438 sfprintf(op, "<OL START=\"550717\">\n");
439 p = q = 0;
440 while (s = sfgetr(ip, '\n', 1))
441 {
442 error_info.line++;
443 if (flags)
444 sfprintf(op, "<P>");
445 else
446 {
447 if (*s == '$')
448 {
449 if (p)
450 sfprintf(op, "<P>");
451 else
452 p = 1;
453 sfprintf(op, "<P CLASS=\"", s);
454 while (c = *s++)
455 encode(op, c);
456 sfprintf(op, "\">\n");
457 continue;
458 }
459 p = 0;
460 if (!isdigit(*s))
461 continue;
462 sfprintf(op, "<LI>");
463 while (isdigit(c = *s++))
464 sfputc(op, c);
465 sfprintf(op, "<LI>");
466 while (c && c != '"')
467 c = *s++;
468 if (!c)
469 s--;
470 else if (isspace(*s))
471 {
472 s++;
473 sfprintf(op, "<BR>");
474 }
475 }
476 for (;;)
477 {
478 switch (c = *s++)
479 {
480 case 0:
481 flags &= ~MSG_SPLICE;
482 if (q)
483 {
484 q = 0;
485 sfprintf(op, "\">");
486 }
487 sfputc(op, '\n');
488 break;
489 case '<':
490 sfprintf(op, "<");
491 continue;
492 case '>':
493 sfprintf(op, ">");
494 continue;
495 case '&':
496 sfprintf(op, "&");
497 continue;
498 case '[':
499 sfprintf(op, "[");
500 continue;
501 case ']':
502 sfprintf(op, "]");
503 continue;
504 case '$':
505 if (!q)
506 {
507 q = 1;
508 sfprintf(op, "<P CLASS=\"");
509 }
510 sfputc(op, c);
511 while (isalnum(c = *s++))
512 sfputc(op, c);
513 s--;
514 continue;
515 case '%':
516 if (!q)
517 {
518 q = 1;
519 sfprintf(op, "<P CLASS=\"");
520 }
521 sfputc(op, c);
522 if (*s == '%')
523 sfputc(op, *s++);
524 else
525 do
526 {
527 if (!(c = *s++) || c == '"')
528 {
529 s--;
530 break;
531 }
532 encode(op, c);
533 } while (!isalpha(c) || (!islower(c) || c == 'h' || c == 'l') && isalpha(*s));
534 if (SPACE(s))
535 sfprintf(op, " ");
536 continue;
537 case '"':
538 if (!(flags & MSG_RAW))
539 {
540 s = "";
541 continue;
542 }
543 /*FALLTHROUGH*/
544 case '\'':
545 case ':':
546 case '/':
547 case '+':
548 case '@':
549 if (!q)
550 {
551 q = 1;
552 sfprintf(op, "<P CLASS=\"");
553 }
554 /*FALLTHROUGH*/
555 case '.':
556 case ',':
557 sfputc(op, c);
558 if (SPACE(s))
559 sfprintf(op, " ");
560 continue;
561 case '\\':
562 if (!(c = *s++))
563 {
564 flags |= MSG_SPLICE;
565 break;
566 }
567 if (c != 'n' && c != 't')
568 {
569 if (!q)
570 {
571 q = 1;
572 sfprintf(op, "<P CLASS=\"");
573 }
574 sfputc(op, '\\');
575 encode(op, c);
576 if (c == 'b')
577 {
578 for (;;)
579 {
580 if (!(c = *s++) || c == '"')
581 {
582 s--;
583 break;
584 }
585 if (c == '?')
586 {
587 if (*s != '?')
588 {
589 s--;
590 break;
591 }
592 sfputc(op, c);
593 sfputc(op, *s++);
594 continue;
595 }
596 if (c == '\\')
597 {
598 if (!*s)
599 break;
600 sfputc(op, c);
601 if (*s == 'a' || *s == 'b' || *s == '0')
602 {
603 sfputc(op, *s++);
604 break;
605 }
606 c = *s++;
607 }
608 encode(op, c);
609 }
610 }
611 else if (isdigit(c) && isdigit(*s))
612 {
613 sfputc(op, *s++);
614 if (isdigit(*s))
615 sfputc(op, *s++);
616 }
617 if (SPACE(s))
618 sfprintf(op, " ");
619 continue;
620 }
621 /*FALLTHROUGH*/
622 case ' ':
623 case '\t':
624 while (isspace(*s) || *s == '\\' && (*(s + 1) == 'n' || *(s + 1) == 't') && s++)
625 s++;
626 if (*s == '"')
627 {
628 if (q)
629 {
630 q = 0;
631 sfprintf(op, " \">");
632 }
633 else
634 sfprintf(op, "<BR>");
635 continue;
636 }
637 c = ' ';
638 /*FALLTHROUGH*/
639 default:
640 if (q)
641 {
642 q = 0;
643 sfprintf(op, "\">");
644 }
645 sfputc(op, c);
646 continue;
647 }
648 break;
649 }
650 }
651 sfprintf(op, "</OL>\n");
652 sfprintf(op, "</BODY></HTML>\n");
653 error_info.line = 0;
654 }
655
656 int
main(int argc,char ** argv)657 main(int argc, char** argv)
658 {
659 int flags = 0;
660 Convert_f convert = msg2html;
661
662 NoP(argc);
663 error_info.id = "msgcvt";
664 for (;;)
665 {
666 switch (optget(argv, usage))
667 {
668 case 'h':
669 convert = msg2html;
670 continue;
671 case 'm':
672 convert = html2msg;
673 continue;
674 case 'r':
675 flags |= MSG_RAW;
676 continue;
677 case '?':
678 error(ERROR_USAGE|4, "%s", opt_info.arg);
679 continue;
680 case ':':
681 error(2, "%s", opt_info.arg);
682 continue;
683 }
684 break;
685 }
686 argv += opt_info.index;
687 if (error_info.errors)
688 error(ERROR_USAGE|4, "%s", optusage(NiL));
689 (*convert)(sfstdin, sfstdout, flags);
690 return error_info.errors != 0;
691 }
692