Logo Search packages:      
Sourcecode: ksh version File versions

msgcvt.c

/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*           Copyright (c) 2000-2007 AT&T Knowledge Ventures            *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                      by AT&T Knowledge Ventures                      *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                                                                      *
***********************************************************************/
#pragma prototyped
/*
 * Glenn Fowler
 * AT&T Research
 */

/*
 * option description string handed to optget() in main()
 * it declares the -h/--html, -m/--msg and -r/--raw options
 */
static const char usage[] =
"[-?\n@(#)$Id: msgcvt (AT&T Research) 2000-05-01 $\n]"
USAGE_LICENSE
"[+NAME?msgcvt - convert message file to/from html]"
"[+DESCRIPTION?\bmsgcvt\b reads a \bgencat\b(1) format file on the standard"
"     input and converts it to \bhtml\b on the standard output. The input"
"     file must contain the control statement \b$quote \"\b and use the \""
"     character to quote message text. The output is in a form suitable for"
"     automatic translation by web sites like"
"     \bhttp://babelfish.altavista.com/\b or filters like"
"     \btranslate\b(1).]"
"[h:html?Generate \bhtml\b from \bgencat\b(1) input. This is the default.]"
"[m:msg?Generate a \bgencat\b(1) message file from (presumably translated)"
"     \bhtml\b. Wide characters are UTF-8 encoded.]"
"[r:raw?The message file is raw message text, one message per line, with no"
"     quoting or line numbering.]"
"[+SEE ALSO?\bgencat\b(1), \bmsgcc\b(1), \bmsggen\b(1), \btranslate\b(1)]"
;

#include <ast.h>
#include <ctype.h>
#include <error.h>

/*
 * conversion flag bits
 */
#define MSG_RAW         0x01    /* set by -r in main()                      */
#define MSG_SPLICE      0x02    /* set by msg2html() after a trailing '\\'  */

/*
 * SPACE(s) is non-zero when s points at a white space character or at
 * a literal \n or \t escape; in that case s is advanced past it
 * (by 1 for white space, by 2 for an escape), otherwise s is unchanged
 */
#define SPACE(s) \
      ((isspace(*(s)) && ((s) += 1)) || \
       (*(s) == '\\' && (*((s) + 1) == 'n' || *((s) + 1) == 't') && ((s) += 2)))

/*
 * converter entry point; main() calls it as
 * (*convert)(input stream, output stream, flags)
 */
typedef void (*Convert_f)(Sfio_t*, Sfio_t*, int);

/*
 * one named HTML character entity and the character code it stands for
 */
typedef struct
{
      const char* name;       /* entity name without the & and ;  */
      int         code;       /* character code it decodes to     */
} Code_t;

/*
 * entity names recognized by decode(), searched linearly
 * note that nbsp decodes to an ordinary space
 */
static const Code_t     codes[] =
{
      { "aacute", 225 },
      { "Aacute", 193 },
      { "acirc",  226 },
      { "Acirc",  194 },
      { "aelig",  230 },
      { "AElig",  198 },
      { "agrave", 224 },
      { "Agrave", 192 },
      { "amp",    '&' },
      { "aring",  229 },
      { "Aring",  197 },
      { "atilde", 227 },
      { "Atilde", 195 },
      { "auml",   228 },
      { "Auml",   196 },
      { "ccedil", 231 },
      { "Ccedil", 199 },
      { "copy",   169 },
      { "eacute", 233 },
      { "Eacute", 201 },
      { "ecirc",  234 },
      { "Ecirc",  202 },
      { "egrave", 232 },
      { "Egrave", 200 },
      { "euml",   235 },
      { "Euml",   203 },
      { "gt",     '>' },
      { "iacute", 237 },
      { "Iacute", 205 },
      { "icirc",  238 },
      { "Icirc",  206 },
      { "igrave", 236 },
      { "Igrave", 204 },
      { "iuml",   239 },
      { "Iuml",   207 },
      { "lt",     '<' },
      { "nbsp",   ' ' },
      { "ntilde", 241 },
      { "Ntilde", 209 },
      { "oacute", 243 },
      { "Oacute", 211 },
      { "ocirc",  244 },
      { "Ocirc",  212 },
      { "ograve", 242 },
      { "Ograve", 210 },
      { "oslash", 248 },
      { "Oslash", 216 },
      { "otilde", 245 },
      { "Otilde", 213 },
      { "ouml",   246 },
      { "Ouml",   214 },
      { "quot",   '"' },
      { "reg",    174 },
      { "szlig",  223 },
      { "uacute", 250 },
      { "Uacute", 218 },
      { "ucirc",  251 },
      { "Ucirc",  219 },
      { "ugrave", 249 },
      { "Ugrave", 217 },
      { "uuml",   252 },
      { "Uuml",   220 },
      { "yuml",   255 },
};

/*
 * decode the HTML character reference that follows a '&' already
 * read from ip and return its character code
 *
 *    &#NNN;      decimal numeric reference (strtol base 10)
 *    &name;      named reference looked up in codes[]
 *
 * a '&' met while collecting the name restarts the collection
 *
 * if the reference is malformed or the name is not in codes[] a
 * message is issued with error(1,...), the collected name characters
 * are pushed back onto ip and '&' is returned, so the caller sees the
 * original text again; the terminating ';' of an unknown name is not
 * pushed back
 *
 * '&' is also returned, silently, when EOF directly follows the '&'
 */

static int
decode(Sfio_t* ip)
{
      register int      c;
      register int      i;      /* number of characters held in name[]  */
      register int      k;      /* codes[] search index                 */
      char        name[32];

      if ((c = sfgetc(ip)) == EOF)
            return '&';
      name[0] = c;
      i = 1;
      if (c != '#' && !isalpha(c))
            goto bad;
      while ((c = sfgetc(ip)) != EOF && c != ';')
      {
            if (c == '&')
                  i = 0;
            else
            {
                  name[i++] = c;

                  /*
                   * only alphanumerics are valid, except for a leading '#';
                   * one slot is always kept free for the terminating 0
                   */

                  if ((!isalnum(c) && (i > 1 || c != '#')) || i >= (elementsof(name) - 1))
                        goto bad;
            }
      }
      name[i] = 0;
      if (name[0] == '#')
      {
            /*
             * 91 and 93 are mapped explicitly to the native '[' and ']'
             * NOTE(review): presumably for non-ASCII code sets -- confirm
             */

            switch (c = strtol(name + 1, NiL, 10))
            {
            case 91:
                  c = '[';
                  break;
            case 93:
                  c = ']';
                  break;
            }
      }
      else
      {
            /*
             * the search uses its own index: i must keep the length of
             * name[] because the bad: path terminates name[] at i and
             * pushes back exactly i characters -- reusing i here left it
             * at elementsof(codes) for an unknown name and overran name[]
             */

            for (k = 0; k < elementsof(codes); k++)
                  if (streq(codes[k].name, name))
                        break;
            if (k >= elementsof(codes))
                  goto bad;
            c = codes[k].code;
      }
      return c;
 bad:
      name[i] = 0;

      /*
       * c is still the last character read: ';' means the reference was
       * well formed but the name is not in codes[]
       */

      if (c == ';')
            error(1, "&%s: unknown HTML special character -- & assumed", name);
      else
            error(1, "&%s: invalid HTML special character -- & assumed", name);

      /* push back in reverse so the characters are re-read in order */

      while (i--)
            sfungetc(ip, name[i]);
      return '&';
}

/*
 * write the character code w to op as UTF-8
 *
 *    0x00     .. 0x7F        1 byte
 *    0x80     .. 0x7FF       2 bytes
 *    0x800    .. 0xFFFF      3 bytes
 *    0x10000  .. 0x10FFFF    4 bytes
 *
 * any other value (including negative values) is written as '?'
 * the return value is that of the last sfputc() issued
 */

static int
sfpututf(Sfio_t* op, register int w)
{
      if (!(w & ~0x7F))
            return sfputc(op, w);
      if (!(w & ~0x7FF))
            sfputc(op, 0xC0 | (w >> 6));
      else if (!(w & ~0xFFFF))
      {
            sfputc(op, 0xE0 | (w >> 12));

            /*
             * the mask must be applied before the 0x80 marker is added:
             * 0x80 + (w >> 6) & 0x3F groups as (0x80 + (w >> 6)) & 0x3F
             * and drops the continuation byte marker
             */

            sfputc(op, 0x80 | ((w >> 6) & 0x3F));
      }
      else if (!(w & ~0x1FFFFF) && w <= 0x10FFFF)
      {
            sfputc(op, 0xF0 | (w >> 18));
            sfputc(op, 0x80 | ((w >> 12) & 0x3F));
            sfputc(op, 0x80 | ((w >> 6) & 0x3F));
      }
      else
            return sfputc(op, '?');

      /* every multi-byte form ends with the low 6 bits */

      return sfputc(op, 0x80 | (w & 0x3F));
}

/*
 * read characters from ip until one that is not white space is found
 * and return it; EOF is returned at end of input
 */

static int
sfnext(Sfio_t* ip)
{
      register int      ch;

      do
      {
            ch = sfgetc(ip);
      } while (isspace(ch));
      return ch;
}

/*
 * convert the html on ip back to message text on op
 *
 * only the text between an <OL START="550717"> tag and the matching
 * </OL> is converted; everything outside such a section is skipped
 *
 * a non-zero flags selects the unquoted form: '"' is not escaped and
 * newlines in the text are copied through
 */

static void
html2msg(register Sfio_t* ip, register Sfio_t* op, int flags)
{
      register int      c;
      register int      q;

      /*
       * skip input up to and including the next <OL START="550717"> tag
       */

 again:
      while ((c = sfgetc(ip)) != EOF)
            if (c == '<')
            {
                  if ((c = sfnext(ip)) == 'O' &&
                      (c = sfnext(ip)) == 'L' &&
                      isspace(c = sfgetc(ip)) &&
                      (c = sfnext(ip)) == 'S' &&
                      (c = sfnext(ip)) == 'T' &&
                      (c = sfnext(ip)) == 'A' &&
                      (c = sfnext(ip)) == 'R' &&
                      (c = sfnext(ip)) == 'T' &&
                      (c = sfnext(ip)) == '=' &&
                      (c = sfnext(ip)) == '"' &&
                      (c = sfnext(ip)) == '5' &&
                      (c = sfnext(ip)) == '5' &&
                      (c = sfnext(ip)) == '0' &&
                      (c = sfnext(ip)) == '7' &&
                      (c = sfnext(ip)) == '1' &&
                      (c = sfnext(ip)) == '7' &&
                      (c = sfnext(ip)) == '"' &&
                      (c = sfnext(ip)) == '>')
                        break;
                  /* some other tag: discard the rest of it */
                  while (c != EOF && c != '>')
                        c = sfgetc(ip);
            }
      /* drop white space after the start tag */
      if ((c = sfnext(ip)) != EOF)
            sfungetc(ip, c);
      /* q is the closing quote owed to the current message, 0 if none */
      q = 0;
      for (;;)
      {
            switch (c = sfgetc(ip))
            {
            case EOF:
                  break;
            case '&':
                  /* character reference: emit the decoded character */
                  c = decode(ip);
                  sfpututf(op, c);
                  /* a decoded space absorbs the white space after it */
                  if (isspace(c))
                  {
                        while (isspace(c = sfgetc(ip)));
                        if (c == EOF)
                              break;
                        sfungetc(ip, c);
                  }
                  continue;
            case '<':
                  switch (c = sfnext(ip))
                  {
                  case '/':
                        /* </OL> ends the section: close the message and rescan */
                        if ((c = sfnext(ip)) == 'O' &&
                            (c = sfgetc(ip)) == 'L' &&
                            (c = sfnext(ip)) == '>')
                        {
                              if (q)
                              {
                                    sfputc(op, q);
                                    q = '"';
                              }
                              goto again;
                        }
                        break;
                  case 'B':
                        /* <BR> becomes a single space */
                        if ((c = sfgetc(ip)) == 'R' &&
                            (c = sfnext(ip)) == '>')
                              sfputc(op, ' ');
                        break;
                  case 'L':
                        /* <LI> followed by a digit starts a new message */
                        if ((c = sfgetc(ip)) == 'I' &&
                            (c = sfnext(ip)) == '>' &&
                             isdigit(c = sfnext(ip)))
                        {
                              /* close the previous message, if there was one */
                              if (q)
                                    sfputc(op, q);
                              else
                                    q = '"';
                              sfputc(op, '\n');
                              /* copy the message number */
                              do
                              {
                                    sfputc(op, c);
                              } while (isdigit(c = sfgetc(ip)));
                              if (c == EOF)
                                    break;
                              /* open the quoted message text */
                              sfputc(op, ' ');
                              sfputc(op, '"');
                              if (isspace(c))
                                    c = sfnext(ip);
                              /*
                               * consume the second <LI> that follows the number
                               * NOTE(review): the characters read here are not
                               * pushed back when they are not <LI>
                               */
                              if (c == '<' &&
                                  (c = sfnext(ip)) == 'L' &&
                                  (c = sfgetc(ip)) == 'I' &&
                                  (c = sfnext(ip)) == '>')
                                    /* great */;
                              continue;
                        }
                        break;
                  case 'P':
                        /* <P> becomes a newline */
                        if ((c = sfnext(ip)) == '>')
                              sfputc(op, '\n');
                        /*
                         * <P CLASS="..."> : copy the attribute value to the
                         * output, decoding character references
                         */
                        else if (c == 'C' &&
                               (c = sfgetc(ip)) == 'L' &&
                               (c = sfgetc(ip)) == 'A' &&
                               (c = sfgetc(ip)) == 'S' &&
                               (c = sfgetc(ip)) == 'S' &&
                               (c = sfnext(ip)) == '=' &&
                               (c = sfnext(ip)) == '"')
                              for (;;)
                              {
                                    switch (c = sfgetc(ip))
                                    {
                                    case EOF:
                                    case '"':
                                          break;
                                    case '&':
                                          c = decode(ip);
                                          sfpututf(op, c);
                                          continue;
                                    default:
                                          sfpututf(op, c);
                                          continue;
                                    }
                                    break;
                              }
                        break;
                  }
                  /* discard the rest of the tag, then check for EOF after it */
                  while (c != EOF && c != '>')
                        c = sfgetc(ip);
                  if (c == EOF || (c = sfgetc(ip)) == EOF)
                        break;
                  sfungetc(ip, c);
                  continue;
            case '"':
                  /* the quote is escaped unless flags is set */
                  if (!flags)
                        sfputc(op, '\\');
                  sfputc(op, c);
                  continue;
            case '\n':
                  /* with flags set a newline is copied as is */
                  if (flags)
                  {
                        sfputc(op, c);
                        continue;
                  }
                  /*FALLTHROUGH*/
            case ' ':
            case '\t':
                  /*
                   * collapse a run of white space into at most one space;
                   * no space is emitted before a decoded white space
                   * character, before a tag starting with <L or </,
                   * or at EOF
                   */
                  while ((c = sfgetc(ip)) != EOF)
                        if (c == '&')
                        {
                              c = decode(ip);
                              if (!isspace(c))
                                    sfputc(op, ' ');
                              sfpututf(op, c);
                              break;
                        }
                        else if (!isspace(c))
                        {
                              if (c == '<')
                              {
                                    /* peek at the tag, then push it back whole */
                                    c = sfgetc(ip);
                                    if (c == EOF)
                                          break;
                                    sfungetc(ip, c);
                                    sfungetc(ip, '<');
                                    if (c != 'L' && c != '/')
                                          sfputc(op, ' ');
                              }
                              else
                              {
                                    if (c != EOF)
                                          sfungetc(ip, c);
                                    sfputc(op, ' ');
                              }
                              break;
                        }
                  continue;
            case '\r':
            case '[':
            case ']':
                  /* these characters are dropped from the output */
                  continue;
            default:
                  sfpututf(op, c);
                  continue;
            }
            break;
      }
      /* close the last message and end the output line */
      if (q)
            sfputc(op, q);
      sfputc(op, '\n');
}

/*
 * write c to op, replacing the characters that are special in the
 * generated html ( < > " & [ ] ) by character references
 */

static void
encode(Sfio_t* op, register int c)
{
      switch (c)
      {
      case '<':
            sfprintf(op, "&lt;");
            return;
      case '>':
            sfprintf(op, "&gt;");
            return;
      case '"':
            sfprintf(op, "&quot;");
            return;
      case '&':
            sfprintf(op, "&amp;");
            return;
      case '[':
            sfprintf(op, "&#091;");
            return;
      case ']':
            sfprintf(op, "&#093;");
            return;
      }
      sfputc(op, c);
}

/*
 * convert the message text on ip to html on op
 *
 * the messages are written as items of an <OL START="550717"> list,
 * which is the marker html2msg() looks for; each numbered message
 * becomes <LI>number<LI>text
 *
 * text that starts with $ % \ or certain punctuation is written inside
 * a <P CLASS="..."> attribute value, which html2msg() copies back
 * unchanged
 * NOTE(review): presumably this keeps it away from the translator --
 * confirm
 *
 * flags is a set of MSG_* bits; any non-zero value makes a line be
 * scanned as plain message text (no $ directive, message number or
 * quote handling up front) -- this covers both MSG_RAW input and the
 * continuation of a line that ended in \ (MSG_SPLICE)
 */

static void
msg2html(register Sfio_t* ip, register Sfio_t* op, register int flags)
{
      register char*    s;
      register int      c;
      register int      q;      /* inside an open <P CLASS=" attribute  */
      register int      p;      /* previous line was a $ line           */

      sfprintf(op, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\"><HTML><HEAD><!-- text massaged for external translation --></HEAD><BODY>\n");
      sfprintf(op, "<OL START=\"550717\">\n");
      p = q = 0;
      while (s = sfgetr(ip, '\n', 1))
      {
            error_info.line++;
            if (flags)
                  sfprintf(op, "<P>");
            else
            {
                  if (*s == '$')
                  {
                        /*
                         * $ line: the whole line goes into a CLASS
                         * attribute; consecutive $ lines are separated
                         * by an extra <P>
                         */

                        if (p)
                              sfprintf(op, "<P>");
                        else
                              p = 1;

                        /* the format has no conversions so no argument is passed */

                        sfprintf(op, "<P CLASS=\"");
                        while (c = *s++)
                              encode(op, c);
                        sfprintf(op, "\">\n");
                        continue;
                  }
                  p = 0;

                  /* lines that do not start with a message number are dropped */

                  if (!isdigit(*s))
                        continue;
                  sfprintf(op, "<LI>");
                  while (isdigit(c = *s++))
                        sfputc(op, c);
                  sfprintf(op, "<LI>");

                  /* skip to just after the opening quote of the text */

                  while (c && c != '"')
                        c = *s++;
                  if (!c)
                        s--;
                  else if (isspace(*s))
                  {
                        /* leading white space in the text is written as <BR> */

                        s++;
                        sfprintf(op, "<BR>");
                  }
            }
            for (;;)
            {
                  switch (c = *s++)
                  {
                  case 0:
                        /* end of line: close any open attribute */

                        flags &= ~MSG_SPLICE;
                        if (q)
                        {
                              q = 0;
                              sfprintf(op, "\">");
                        }
                        sfputc(op, '\n');
                        break;
                  case '<':
                        sfprintf(op, "&lt;");
                        continue;
                  case '>':
                        sfprintf(op, "&gt;");
                        continue;
                  case '&':
                        sfprintf(op, "&amp;");
                        continue;
                  case '[':
                        sfprintf(op, "&#091;");
                        continue;
                  case ']':
                        sfprintf(op, "&#093;");
                        continue;
                  case '$':
                        /* $ and the alphanumerics after it go into the attribute */

                        if (!q)
                        {
                              q = 1;
                              sfprintf(op, "<P CLASS=\"");
                        }
                        sfputc(op, c);
                        while (isalnum(c = *s++))
                              sfputc(op, c);
                        s--;
                        continue;
                  case '%':
                        /*
                         * %% or a % format specification, copied into the
                         * attribute up to and including its conversion letter
                         */

                        if (!q)
                        {
                              q = 1;
                              sfprintf(op, "<P CLASS=\"");
                        }
                        sfputc(op, c);
                        if (*s == '%')
                              sfputc(op, *s++);
                        else
                              do
                              {
                                    if (!(c = *s++) || c == '"')
                                    {
                                          s--;
                                          break;
                                    }
                                    encode(op, c);
                              } while (!isalpha(c) || (!islower(c) || c == 'h' || c == 'l') && isalpha(*s));
                        if (SPACE(s))
                              sfprintf(op, "&nbsp;");
                        continue;
                  case '"':
                        /* unless raw, an unescaped quote ends the message text */

                        if (!(flags & MSG_RAW))
                        {
                              s = "";
                              continue;
                        }
                        /*FALLTHROUGH*/
                  case '\'':
                  case ':':
                  case '/':
                  case '+':
                  case '@':
                        /* this punctuation opens an attribute if none is open */

                        if (!q)
                        {
                              q = 1;
                              sfprintf(op, "<P CLASS=\"");
                        }
                        /*FALLTHROUGH*/
                  case '.':
                  case ',':
                        /* white space after punctuation is kept as &nbsp; */

                        sfputc(op, c);
                        if (SPACE(s))
                              sfprintf(op, "&nbsp;");
                        continue;
                  case '\\':
                        if (!(c = *s++))
                        {
                              /* trailing \ : the message continues on the next line */

                              flags |= MSG_SPLICE;
                              break;
                        }
                        if (c != 'n' && c != 't')
                        {
                              /* any escape other than \n and \t goes into the attribute */

                              if (!q)
                              {
                                    q = 1;
                                    sfprintf(op, "<P CLASS=\"");
                              }
                              sfputc(op, '\\');
                              encode(op, c);
                              if (c == 'b')
                              {
                                    /*
                                     * \b starts a span copied into the attribute until
                                     * the closing quote, a single ?, or a following
                                     * \a \b or \0 escape; ?? is copied through
                                     */

                                    for (;;)
                                    {
                                          if (!(c = *s++) || c == '"')
                                          {
                                                s--;
                                                break;
                                          }
                                          if (c == '?')
                                          {
                                                if (*s != '?')
                                                {
                                                      s--;
                                                      break;
                                                }
                                                sfputc(op, c);
                                                sfputc(op, *s++);
                                                continue;
                                          }
                                          if (c == '\\')
                                          {
                                                if (!*s)
                                                      break;
                                                sfputc(op, c);
                                                if (*s == 'a' || *s == 'b' || *s == '0')
                                                {
                                                      sfputc(op, *s++);
                                                      break;
                                                }
                                                c = *s++;
                                          }
                                          encode(op, c);
                                    }
                              }
                              else if (isdigit(c) && isdigit(*s))
                              {
                                    /* up to two more digits of a numeric escape */

                                    sfputc(op, *s++);
                                    if (isdigit(*s))
                                          sfputc(op, *s++);
                              }
                              if (SPACE(s))
                                    sfprintf(op, "&nbsp;");
                              continue;
                        }
                        /*FALLTHROUGH*/
                  case ' ':
                  case '\t':
                        /* skip a run of white space and \n \t escapes */

                        while (isspace(*s) || *s == '\\' && (*(s + 1) == 'n' || *(s + 1) == 't') && s++)
                              s++;
                        if (*s == '"')
                        {
                              /* white space just before the closing quote */

                              if (q)
                              {
                                    q = 0;
                                    sfprintf(op, " \">");
                              }
                              else
                                    sfprintf(op, "<BR>");
                              continue;
                        }
                        c = ' ';
                        /*FALLTHROUGH*/
                  default:
                        /* ordinary text closes any open attribute */

                        if (q)
                        {
                              q = 0;
                              sfprintf(op, "\">");
                        }
                        sfputc(op, c);
                        continue;
                  }
                  break;
            }
      }
      sfprintf(op, "</OL>\n");
      sfprintf(op, "</BODY></HTML>\n");
      error_info.line = 0;
}

/*
 * parse the options, then run the selected converter from the
 * standard input to the standard output
 * the exit status is 1 if any errors were counted, 0 otherwise
 */

int
main(int argc, char** argv)
{
      Convert_f   convert = msg2html;
      int         flags = 0;
      int         opt;

      NoP(argc);
      error_info.id = "msgcvt";

      /* option parsing stops at the first value that is not handled here */

      for (;;)
      {
            opt = optget(argv, usage);
            if (opt == 'h')
                  convert = msg2html;
            else if (opt == 'm')
                  convert = html2msg;
            else if (opt == 'r')
                  flags |= MSG_RAW;
            else if (opt == '?')
                  error(ERROR_USAGE|4, "%s", opt_info.arg);
            else if (opt == ':')
                  error(2, "%s", opt_info.arg);
            else
                  break;
      }
      argv += opt_info.index;
      if (error_info.errors)
            error(ERROR_USAGE|4, "%s", optusage(NiL));
      (*convert)(sfstdin, sfstdout, flags);
      return error_info.errors ? 1 : 0;
}

Generated by  Doxygen 1.6.0   Back to index