Logo Search packages:      
Sourcecode: ksh version File versions

html2rtf.c

/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*           Copyright (c) 1996-2007 AT&T Knowledge Ventures            *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                      by AT&T Knowledge Ventures                      *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                                                                      *
***********************************************************************/
#pragma prototyped
/*
 * Glenn Fowler
 * AT&T Research
 *
 * html to rtf filter
 */

/*
 * command usage and option description text
 * start_html() also scans this string: it emits the text that starts
 * 5 characters after the first newline (the $Id...$ stamp) as the
 * generator comment of the RTF output
 */
static const char usage[] =
"[-?\n@(#)$Id: html2rtf (AT&T Research) 1999-01-01 $\n]"
USAGE_LICENSE
"[+NAME?html2rtf - html to rtf filter]"
"[+DESCRIPTION?\bhtml2rtf\b converts input \bhtml\b documents to an \bRTF\b"
"     document on the standard output. \bhtml2rtf\b expects properly nested"
"     begin/end tags in the input \bhtml\b and warns about imbalance.]"

"[d:debug?Set the debug trace level to \alevel\a. Higher levels produce"
"     more output.]#[level]"
"[f:font-size?Set the initial font size to \asize\a points.]#[size:=12]"
"[p:project-file?Appends MS HELP project information to the help project file"
"     \afile\a. This file combines individual RTF files into a"
"     hyper-linked collection. Note that MS expects \afile\a to have a"
"     \b.hlp\b extension.]:[file]"
"[v:verbose?Enable verbose error and warning messages. Some \bhtml\b source"
"     can't stand the heat.]"

"\n"
"\n[ file ... ]\n"
"\n"

"[+SEE ALSO?\bman\b(1), \bmm\b(1), \bmm2html\b(1), \btroff\b(1),"
"     \btroff2html\b(1)]"
;

#include "html2rtf.h"

#include <error.h>

/*
 * LIST_INDENT is the amount added to state.hanging/state.indent for each
 * list or block quote level; STK_LIST_COMPACT is set in a stack entry's
 * flags by the list start handlers when the COMPACT attribute is present
 * and makes par() omit the extra \line before a hanging paragraph
 */
#define LIST_INDENT           140
#define STK_LIST_COMPACT      (STK_TAG<<0)

/*
 * the names below alias the per-tag data[] slots of a tag stack entry
 * (used as state.sp->NAME); each group belongs to one family of tags and
 * the groups reuse the same slots
 */

/* <a>: a_close is set by anchor() when end_a() must emit a closing '}' */
#define a_close               data[0].number
#define a_label               data[1].string

/* font and heading tags: font size to restore when the tag ends */
#define font_size       data[0].number

/* list tags: item counter, saved margins, bullet label and label type */
#define list_counter          data[0].number
#define list_hanging          data[1].number
#define list_indent           data[2].number
#define list_label            data[3].string
#define list_type       data[4].number

/* TAG_IGNORE tags: process() state saved while output is diverted */
#define title_cc        data[0].number
#define title_lastlastc       data[1].number
#define title_op        data[2].io
#define title_tc        data[3].number

/* global filter state shared by all handlers in this file */
State_t                 state;

/*
 * look up name (case ignored) in the attribute list ap
 * the list ends at the first entry with a null name
 * return the matching entry, or 0 if ap is 0 or nothing matches
 */

static Attribute_t*
attribute(register Attribute_t* ap, const char* name)
{
      if (!ap)
            return 0;
      while (ap->name)
      {
            if (strcasecmp(ap->name, name) == 0)
                  return ap;
            ap++;
      }
      return 0;
}

/*
 * start a new RTF paragraph on state.out
 * hanging!=0 adds a first line indent of (state.hanging - state.indent)
 * and, unless the current stack entry is STK_LIST_COMPACT, a leading \line
 * tail, if not 0, is appended after the paragraph controls
 * state.sep is set so the next text character is separated from them
 */

static void
par(int hanging, const char* tail)
{
      const char* suffix;

      suffix = tail ? tail : "";
      if (hanging)
      {
            if (!(state.sp->flags & STK_LIST_COMPACT))
                  sfputr(state.out, "\\line", -1);
      }
      sfputr(state.out, "\\par\\pard", -1);
      if (state.center)
            sfputr(state.out, "\\qc", -1);
      if (hanging)
            sfprintf(state.out, "\\fi%d", twips(state.hanging - state.indent));
      sfprintf(state.out, "\\li%d\\tx%d\\tx20000%s", twips(state.indent), twips(state.indent), suffix);
      state.sep = 1;
}

/*
 * emit the RTF for an anchor whose HREF (ref!=0) or NAME (ref==0) is s
 * only local references ("#label") are supported for HREF; anything else
 * draws a warning and produces no output
 * the label is written as state.prefix followed by '.' and s with every
 * non-alphanumeric character replaced by '.'
 * for a reference a comment group is left open and a_close is set so that
 * end_a() closes it
 */

static void
anchor(int ref, register char* s)
{
      register int      ch;

      if (!s)
            return;
      if (!ref)
            sfputr(state.out, "#{\\footnote", -1);
      else if (*s == '#')
      {
            s++;
            sfprintf(state.out, "{\\uldb %s}{\\v", s);
      }
      else
      {
            error(1, "%s: unknown link", s);
            return;
      }
      sfprintf(state.out, " %s.", state.prefix);
      for (; ch = *s; s++)
            sfputc(state.out, isalnum(ch) ? ch : '.');
      sfputc(state.out, '}');
      if (!ref)
            return;
      sfputr(state.out, "{\\*\\comment", -1);
      state.sep = 1;
      state.sp->a_close = 1;
}

/*
 * anchor start: clear a_close, then hand the HREF attribute (if any) and
 * the NAME attribute (if any) to anchor(), in that order
 * always returns 1
 */

static int
start_a(Tag_t* tp, Attribute_t* ap)
{
      Attribute_t*      href;
      Attribute_t*      name;

      NoP(tp);
      state.sp->a_close = 0;
      href = attribute(ap, "HREF");
      if (href)
            anchor(1, href->value);
      name = attribute(ap, "NAME");
      if (name)
            anchor(0, name->value);
      return 1;
}

/*
 * anchor end: close the group anchor() left open, if a_close was set
 */

static int
end_a(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      if (!state.sp->a_close)
            return 0;
      sfputc(state.out, '}');
      state.sep = 0;
      return 0;
}

/*
 * open a bold group; a separator is requested before the next text
 */

static int
start_b(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      state.sep = 1;
      sfputr(state.out, "{\\b", -1);
      return 1;
}

/*
 * block quote start: save the current margins in the stack entry, move
 * both margins in by LIST_INDENT, start a paragraph and set a right
 * indent equal to the new left indent
 */

static int
start_bq(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      /* saved for end_bq() */
      state.sp->list_indent = state.indent;
      state.sp->list_hanging = state.hanging;
      state.hanging += LIST_INDENT;
      state.indent = state.hanging;
      par(0, NiL);
      sfprintf(state.out, "\\ri%d", state.indent);
      state.sep = 1;
      return 1;
}

/*
 * block quote end: restore the margins saved by start_bq() and reset the
 * right indent to 0
 */

static int
end_bq(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      state.indent = state.sp->list_indent;
      state.hanging = state.sp->list_hanging;
      sfprintf(state.out, "\\ri0");
      state.sep = 1;
      return 1;
}

/*
 * body start: no output
 * the 0 return makes process() mark the stack entry STK_NOEND when the
 * tag also has an end handler
 */

static int
start_body(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 0;
}

/*
 * line break: emit \line and request a separator before the next text
 */

static int
start_br(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      state.sep = 1;
      sfputr(state.out, "\\line", -1);
      return 0;
}

/*
 * caption start: no output; returns 0 (see process() for STK_NOEND)
 */

static int
start_caption(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 0;
}

/*
 * center start: bump the centering and preformatted counts, then start a
 * paragraph (par() adds \qc while state.center is non-zero)
 */

static int
start_center(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      ++state.pre;
      ++state.center;
      par(0, NiL);
      return 1;
}

/*
 * center end: undo the counts raised by start_center(), never going
 * below 0
 */

static int
end_center(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      if (state.pre > 0)
            --state.pre;
      if (state.center > 0)
            --state.center;
      return 1;
}

/*
 * definition description: a tab after the term, or a fresh paragraph
 * when the current stack entry has been flagged STK_HEADING
 */

static int
start_dd(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      if (!(state.sp->flags & STK_HEADING))
            sfputr(state.out, "\\tab", -1);
      else
            par(0, NiL);
      state.sep = 1;
      return 1;
}

/*
 * definition list start: note COMPACT, save the current margins in the
 * stack entry and indent: hanging moves in one LIST_INDENT, the body two
 * more beyond that
 */

static int
start_dl(Tag_t* tp, Attribute_t* ap)
{
      NoP(tp);
      if (attribute(ap, "COMPACT") != 0)
            state.sp->flags |= STK_LIST_COMPACT;
      /* restored by end_LIST() */
      state.sp->list_indent = state.indent;
      state.sp->list_hanging = state.hanging;
      state.hanging += LIST_INDENT;
      state.indent = state.hanging + 2 * LIST_INDENT;
      return 1;
}

/*
 * generic list end: restore the margins saved in the stack entry by the
 * list start handler
 */

static int
end_LIST(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      state.indent = state.sp->list_indent;
      state.hanging = state.sp->list_hanging;
      return 1;
}

/*
 * definition term: start a hanging paragraph
 */

static int
start_dt(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      par(1, NiL);
      return 1;
}

/*
 * footnote start: emit "[" with an RTF \~ on each side
 */

static int
start_fn(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      sfputr(state.out, "\\~[\\~", -1);
      return 1;
}

/*
 * footnote end: emit "]" with an RTF \~ on each side
 */

static int
end_fn(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      sfputr(state.out, "\\~]\\~", -1);
      return 0;
}

/*
 * font start: only a SIZE attribute holding a complete non-zero decimal
 * number is honored; a leading '+' or '-' makes it relative to the
 * current font size
 * the previous size is saved in the stack entry for end_font()
 * returns 0 (nothing opened) when SIZE is absent or not usable
 */

static int
start_font(Tag_t* tp, Attribute_t* ap)
{
      char*       text;
      char*       rest;
      int         size;
      Attribute_t*      op;

      NoP(tp);
      op = attribute(ap, "SIZE");
      if (!op)
            return 0;
      text = op->value;
      if (!text)
            return 0;
      size = strtol(text, &rest, 10);
      if (!size || *rest)
            return 0;
      if (*text == '+' || *text == '-')
            size += state.fontsize;
      state.sp->font_size = state.fontsize;
      state.fontsize = size;
      sfprintf(state.out, "{\\fs%d", twips(size));
      state.sep = 1;
      return 1;
}

/*
 * font end: restore the size saved by start_font() and close the group
 */

static int
end_font(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      sfputc(state.out, '}');
      state.fontsize = state.sp->font_size;
      state.sep = 0;
      return 1;
}

/*
 * heading start: flag the enclosing stack entry STK_HEADING, save the
 * current font size and open a bold group whose size grows by
 * (7 - level), where level is the digit in the second character of the
 * tag name
 */

static int
start_H(Tag_t* tp, Attribute_t* ap)
{
      int   level;

      NoP(tp);
      NoP(ap);
      level = tp->name[1] - '0';
      (state.sp - 1)->flags |= STK_HEADING;
      state.sp->font_size = state.fontsize;
      state.fontsize += 7 - level;
      sfprintf(state.out, "{\\b\\fs%d", twips(state.fontsize));
      state.sep = 1;
      return 1;
}

/*
 * heading end: restore the font size saved by start_H() and close the
 * group
 */

static int
end_H(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      sfputc(state.out, '}');
      state.fontsize = state.sp->font_size;
      state.sep = 0;
      return 1;
}

/*
 * head start: no output; returns 0 (see process() for STK_NOEND)
 */

static int
start_head(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 0;
}

/*
 * head end: no output
 */

static int
end_head(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 0;
}

/*
 * horizontal rule: an empty paragraph with a shadowed top border
 */

static int
start_hr(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      state.sep = 0;
      sfputr(state.out, "{\\brdrt\\brdrsh\\par}", -1);
      return 0;
}

/*
 * html start: emit the RTF header, a generator comment, the font table
 * and the initial font size
 *
 * the generator text is the part of usage[] that starts 5 characters
 * after the first newline (skipping "\n@(#)") and runs up to, but not
 * including, the next newline
 */

static int
start_html(Tag_t* tp, Attribute_t* ap)
{
      char* s;
      char* e;

      NoP(tp);
      NoP(ap);
      sfputr(state.out, "{\\rtf1 \\ansi \\deff0", '\n');
      s = strchr(usage, '\n') + 5;
      /*
       * the length must be measured from s to the newline that follows
       * it; measuring from the first newline gives a negative precision
       * that is ignored, so the rest of usage[] would be printed
       */
      e = strchr(s, '\n');
      sfprintf(state.out, "{\\*\\comment generator: %-.*s}\n", (int)(e - s), s);
      sfputr(state.out, "{\\fonttbl", '\n');
      sfputr(state.out, "{\\f0 \\froman Times New Roman;}", '\n');
      sfputr(state.out, "{\\f1 \\fmodern Line Printer;}", '\n');
      sfputr(state.out, "{\\f2 \\froman Symbol;}", '\n');
      sfputr(state.out, "{\\f3 \\fswiss Ariel;}", '\n');
      sfputr(state.out, "}", '\n');
      sfprintf(state.out, "\\fs%d\n", twips(state.fontsize));
      return 1;
}

/*
 * html end: close the outermost RTF group on a line of its own
 */

static int
end_html(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      sfputr(state.out, "\n}", '\n');
      return 1;
}

/*
 * open an italic group; a separator is requested before the next text
 */

static int
start_i(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      state.sep = 1;
      sfputr(state.out, "{\\i", -1);
      return 1;
}

/*
 * image: no output; returns 0 (see process() for STK_NOEND)
 */

static int
start_img(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 0;
}

/*
 * write n as a roman numeral on state.out
 * lower case symbols are used when format is a lower case letter
 * values at or beyond +-40000 are written as <n>, 0 as "0", and
 * negative values get a leading '-'
 *
 * NOTE: roman() transcribed from GNU groff
 */

static void
roman(register int n, int format)
{
      register char*    sym;
      register int      unit;
      register int      digit;

      /*
       * for the current decimal place sym[2] is the "one" symbol,
       * sym[1] the "five" symbol and sym[0] the "ten" symbol
       */
      if (islower(format))
            sym = "zwmdclxvi";
      else
            sym = "ZWMDCLXVI";
      if (n >= 40000 || n <= -40000)
      {
            sfprintf(state.out, "<%d>", n);
            return;
      }
      if (!n)
      {
            sfputc(state.out, '0');
            return;
      }
      if (n < 0)
      {
            sfputc(state.out, '-');
            n = -n;
      }
      for (; n >= 10000; n -= 10000)
            sfputc(state.out, sym[0]);
      for (unit = 1000; unit > 0; unit /= 10, sym += 2)
      {
            digit = n / unit;
            n -= digit * unit;
            if (digit == 9)
            {
                  /* one before ten */
                  sfputc(state.out, sym[2]);
                  sfputc(state.out, sym[0]);
            }
            else if (digit == 4)
            {
                  /* one before five */
                  sfputc(state.out, sym[2]);
                  sfputc(state.out, sym[1]);
            }
            else
            {
                  if (digit >= 5)
                  {
                        sfputc(state.out, sym[1]);
                        digit -= 5;
                  }
                  while (digit-- > 0)
                        sfputc(state.out, sym[2]);
            }
      }
}

/*
 * list item: start a hanging paragraph and emit the item label in bold,
 * followed by a tab
 *
 * the label depends on list_type of the current stack entry:
 *    '1'         decimal counter followed by '.'
 *    'A' 'a'     letter followed by ')'
 *    'I' 'i'     roman numeral followed by ')'
 *    other       list_label, or \bullet when no label is set
 * the counter is advanced after every item
 */

static int
start_li(Tag_t* tp, Attribute_t* ap)
{
      NoP(tp);
      NoP(ap);
      par(1, "{\\b ");
      switch (state.sp->list_type)
      {
      case '1':
            sfprintf(state.out, "%d.", state.sp->list_counter);
            break;
      case 'A':
            /* start_ol() starts the counter at 1, so 1 maps to 'A' */
            sfprintf(state.out, "%c)", 'A' + state.sp->list_counter - 1);
            break;
      case 'a':
            sfprintf(state.out, "%c)", 'a' + state.sp->list_counter - 1);
            break;
      case 'I':
      case 'i':
            roman(state.sp->list_counter, state.sp->list_type);
            sfputc(state.out, ')');
            break;
      default:
            sfputr(state.out, state.sp->list_label ? state.sp->list_label : "\\bullet", -1);
            break;
      }
      state.sp->list_counter++;
      sfputr(state.out, "}\\tab", -1);
      state.sep = 1;
      return 1;
}

/*
 * meta: when a NAME attribute with a value is present, emit an RTF
 * comment group "NAME" or "NAME: CONTENT" on a line of its own
 */

static int
start_meta(Tag_t* tp, Attribute_t* ap)
{
      Attribute_t*      op;

      NoP(tp);
      op = attribute(ap, "NAME");
      if (!op || !op->value)
            return 0;
      sfprintf(state.out, "{\\*\\comment %s", op->value);
      op = attribute(ap, "CONTENT");
      if (op && op->value)
            sfprintf(state.out, ": %s", op->value);
      sfputr(state.out, "}", '\n');
      return 0;
}

/*
 * ordered list start: note COMPACT, take the first counter value from
 * START (1 when START is missing, negative or not a complete number) and
 * the label type from the first character of TYPE ('1' by default), then
 * save the current margins and indent one level
 */

static int
start_ol(Tag_t* tp, Attribute_t* ap)
{
      char*       rest;
      Attribute_t*      op;

      NoP(tp);
      if (attribute(ap, "COMPACT") != 0)
            state.sp->flags |= STK_LIST_COMPACT;
      state.sp->list_counter = 1;
      op = attribute(ap, "START");
      if (op && op->value)
      {
            state.sp->list_counter = strtol(op->value, &rest, 10);
            if (state.sp->list_counter < 0 || *rest)
                  state.sp->list_counter = 1;
      }
      op = attribute(ap, "TYPE");
      if (op && op->value)
            state.sp->list_type = *op->value;
      else
            state.sp->list_type = '1';
      /* restored by end_LIST() */
      state.sp->list_indent = state.indent;
      state.sp->list_hanging = state.hanging;
      state.hanging += LIST_INDENT;
      state.indent = state.hanging + LIST_INDENT;
      return 1;
}

/*
 * paragraph start: new paragraph plus the justification control that
 * matches an ALIGN value of CENTER, LEFT or RIGHT (case ignored)
 */

static int
start_p(Tag_t* tp, Attribute_t* ap)
{
      register char*    s;
      Attribute_t*      op;
      const char* ctl;

      NoP(tp);
      par(0, NiL);
      op = attribute(ap, "ALIGN");
      if (!op || !(s = op->value))
            return 1;
      ctl = 0;
      if (strcasecmp(s, "CENTER") == 0)
            ctl = "\\qc";
      else if (strcasecmp(s, "LEFT") == 0)
            ctl = "\\ql";
      else if (strcasecmp(s, "RIGHT") == 0)
            ctl = "\\qr";
      if (ctl)
            sfputr(state.out, ctl, -1);
      return 1;
}

/*
 * preformatted start: raise state.pre, which process() consults to keep
 * newlines and spacing
 */

static int
start_pre(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      ++state.pre;
      return 1;
}

/*
 * preformatted end: lower state.pre, never below 0
 */

static int
end_pre(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      if (state.pre > 0)
            --state.pre;
      return 1;
}

/*
 * start handler for a tag defined by start_render(): call the start
 * handler of each tag in its Render_t list, first to last, passing the
 * original attributes
 */

static int
start_rendering(register Tag_t* tp, Attribute_t* ap)
{
      register Render_t*      rp;
      register int            i;

      rp = (Render_t*)tp->data;
      if (!rp)
            return 1;
      for (i = 0; i < rp->tags; i++)
      {
            tp = rp->tag[i];
            if (tp && tp->start)
                  (*tp->start)(tp, ap);
      }
      return 1;
}

/*
 * end handler for a tag defined by start_render(): call the end handler
 * of each tag in its Render_t list in reverse order, so that the groups
 * opened by start_rendering() are closed innermost first
 */

static int
end_rendering(register Tag_t* tp, Attribute_t* ap)
{
      register Render_t*      rp;
      register int            i;

      if (rp = (Render_t*)tp->data)
            /* index 0 is included: start_rendering() starts tag[0] too */
            for (i = rp->tags - 1; i >= 0; i--)
                  if ((tp = rp->tag[i]) && tp->end)
                        (*tp->end)(tp, ap);
      return 1;
}

/*
 * render: (re)define the tag named by the TAG attribute as the
 * combination of the tags listed, comma separated, in the STYLE
 * attribute
 *
 * an existing tag has its handlers and data cleared first; an unknown
 * tag is allocated and entered in state.tags
 * STYLE names that are not in state.tags are dropped; if none remain
 * the tag is left with no handlers
 * always returns 0
 */

static int
start_render(register Tag_t* tp, Attribute_t* ap)
{
      register Render_t*      rp;
      register char*          s;
      register char*          e;
      register int            n;
      Attribute_t*            op;

      if ((op = attribute(ap, "TAG")) && (s = op->value))
      {
            if (tp = (Tag_t*)hashget(state.tags, s))
            {
                  /* redefinition: discard the previous rendering */
                  if (tp->data)
                        free(tp->data);
                  tp->start = 0;
                  tp->end = 0;
                  tp->data = 0;
            }
            /* NOTE(review): hashput() with a 0 name presumably reuses the name of the hashget() just above -- confirm */
            else if (!(tp = newof(NiL, Tag_t, 1, 0)) || !(tp->name = hashput(state.tags, 0, tp)))
                  error(ERROR_SYSTEM|3, "out of space [tag]");
            if ((op = attribute(ap, "STYLE")) && (s = op->value))
            {
                  /* n counts the commas: one slot is needed per name, n + 1 in all */
                  for (n = 0, e = s; e && (e = strchr(e, ',')); n++, e++);
                  /* NOTE(review): assumes Render_t itself already holds one tag slot -- confirm */
                  if (!(rp = newof(NiL, Render_t, 1, n * sizeof(Tag_t*))))
                        error(ERROR_SYSTEM|3, "out of space [render]");
                  n = 0;
                  do
                  {
                        /* split the list in place; the attribute value is modified */
                        if (e = strchr(s, ','))
                              *e++ = 0;
                        /* only names found in state.tags keep their slot */
                        if (rp->tag[n] = (Tag_t*)hashget(state.tags, s))
                              n++;
                  } while (s = e);
                  if (!(rp->tags = n))
                        free(rp);
                  else
                  {
                        tp->start = start_rendering;
                        tp->end = end_rendering;
                        tp->data = (void*)rp;
                  }
            }
      }
      return 0;
}

/*
 * subscript start: no output; returns 0 (see process() for STK_NOEND)
 */

static int
start_sub(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 0;
}

/*
 * subscript end: no output
 */

static int
end_sub(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 0;
}

/*
 * superscript start: no output; returns 0 (see process() for STK_NOEND)
 */

static int
start_sup(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 0;
}

/*
 * superscript end: no output
 */

static int
end_sup(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 0;
}

/*
 * table start: raise the centering count and start a paragraph, which
 * par() then centers
 */

static int
start_table(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      ++state.center;
      par(0, NiL);
      return 1;
}

/*
 * table end: undo the centering count raised by start_table() (never
 * below 0) and emit a closing '}' on its own line
 */

static int
end_table(Tag_t* tp, Attribute_t* ap)
{
      NoP(tp);
      NoP(ap);
      /* must decrement: incrementing here leaves all later text centered */
      if (state.center > 0)
            state.center--;
      sfputr(state.out, "}", '\n');
      return 0;
}

/*
 * table data start: no output; returns 0 (see process() for STK_NOEND)
 */

static int
start_td(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 0;
}

/*
 * table data end: no output
 */

static int
end_td(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 1;
}

/*
 * table header: emit twelve \~ as padding unless the ALIGN value starts
 * with 'l' or 'L'
 */

static int
start_th(Tag_t* tp, Attribute_t* ap)
{
      register Attribute_t*   op;

      NoP(tp);
      op = attribute(ap, "ALIGN");
      if (op && op->value && (*op->value == 'l' || *op->value == 'L'))
            return 0;
      sfputr(state.out, "\\~\\~\\~\\~\\~\\~\\~\\~\\~\\~\\~\\~", -1);
      return 0;
}

/*
 * title start: raise state.pre for the duration of the title
 */

static int
start_title(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      ++state.pre;
      return 1;
}

/*
 * title end: lower state.pre, never below 0
 */

static int
end_title(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      if (state.pre > 0)
            --state.pre;
      return 1;
}

/*
 * table row start: no output; returns 0 (see process() for STK_NOEND)
 */

static int
start_tr(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 0;
}

/*
 * table row end: no output
 */

static int
end_tr(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      return 1;
}

/*
 * open a group in font f1 (declared \fmodern in the font table written
 * by start_html())
 */

static int
start_tt(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      state.sep = 1;
      sfputr(state.out, "{\\f1", -1);
      return 1;
}

/*
 * unordered list start: note COMPACT, pick the item label from the first
 * character of TYPE ('c' and 's' select \'b0 and \'a4, anything else
 * \bullet), then save the current margins and indent one level
 */

static int
start_ul(Tag_t* tp, Attribute_t* ap)
{
      Attribute_t*      op;
      int         kind;

      NoP(tp);
      if (attribute(ap, "COMPACT") != 0)
            state.sp->flags |= STK_LIST_COMPACT;
      state.sp->list_type = 0;
      op = attribute(ap, "TYPE");
      kind = (op && op->value) ? *op->value : 0;
      if (kind == 'c')
            state.sp->list_label = "\\'b0";
      else if (kind == 's')
            state.sp->list_label = "\\'a4";
      else
            state.sp->list_label = "\\bullet";
      /* restored by end_LIST() */
      state.sp->list_indent = state.indent;
      state.sp->list_hanging = state.hanging;
      state.hanging += LIST_INDENT;
      state.indent = state.hanging + LIST_INDENT;
      return 1;
}

/*
 * open an italic group in font f3 (declared \fswiss in the font table
 * written by start_html())
 */

static int
start_var(Tag_t* tp, Attribute_t* ap)
{
      NoP(ap);
      NoP(tp);
      state.sep = 1;
      sfputr(state.out, "{\\f3\\i", -1);
      return 1;
}

/*
 * generic tag end: close the RTF group opened by the start handler and
 * cancel any pending separator
 */

static int
end(Tag_t* tp, Attribute_t* ap)
{
      state.sep = 0;
      sfputc(state.out, '}');
      return 1;
}

/*
 * convert html file in to rtf file out
 *
 * file is used for diagnostics only (error_info.file)
 * ip is read one character at a time; op is the current output stream:
 * state.out for document text, state.tmp while a <tag> or &entity; is
 * being collected, state.nul while inside a TAG_IGNORE tag
 *
 * while a tag is collected, attribute names and values are recorded in
 * attributes[] as offsets into the state.tmp buffer shifted left by PUN;
 * they are converted to pointers when the closing '>' is seen
 * attributes[0] is the tag name itself and a null name ends the list
 *
 * open tags that have an end handler are tracked on the state.sp stack;
 * an end tag pops (and warns about) unclosed inner tags
 *
 * cc counts output characters on the current line, tc the text column
 * used for tab expansion in preformatted text
 */

#define COMMENT   1
#define PUN 4
#define STRING    2

static void
process(char* file, register Sfio_t* ip, register Sfio_t* op)
{
      register int      c;
      register int      lastc;
      register int      item;
      register int      cc;
      register int      tc;
      register char*    s;
      int         lastlastc;
      int         quote;
      int         n;
      Entity_t*   ep;
      Tag_t*            tp;
      Attribute_t attributes[16];
      Attribute_t*      ap;
      Stack_t*    sp;

      error_info.file = file;
      error_info.line = 1;
      state.center = 0;
      state.in = ip;
      state.out = op;
      state.pre = 0;
      state.sp = state.sp_min;
      ap = 0;
      item = 0;
      lastc = 0;
      cc = tc = 0;
      for (;;)
      {
            switch (c = sfgetc(ip))
            {
            case EOF:
                  goto done;
            case '<':
                  if (!item)
                  {
                        /* tag start: collect it in state.tmp */
                        item = c;
                        lastlastc = lastc;
                        quote = 0;
                        ap = attributes;
                        ap->name = 0;
                        ap->value = 0;
                        op = state.tmp;
                        /* peek: "<!" starts a comment/declaration */
                        if ((c = sfgetc(ip)) != EOF)
                        {
                              sfungetc(ip, c);
                              if (c == '!')
                                    quote |= COMMENT;
                        }
                        continue;
                  }
                  break;
            case '>':
                  if (item == '<' && !(quote & STRING))
                  {
                        item = 0;
                        if (!(s = sfstruse(op)))
                              error(ERROR_SYSTEM|3, "out of space");
                        op = state.out;
                        if (*s == '!')
                        {
                              /* comment: discard, eating a newline that directly follows */
                              if ((cc -= strlen(s)) <= 0)
                              {
                                    cc = 0;
                                    if ((c = sfgetc(ip)) != EOF)
                                    {
                                          if (c == '\n')
                                                error_info.line++;
                                          else
                                                sfungetc(ip, c);
                                    }
                              }
                              continue;
                        }
                        /* terminate the list, then turn offsets into pointers, last to first */
                        (ap + 1)->name = 0;
                        for (;;)
                        {
                              ap->name = s + (((unsigned int)ap->name) >> PUN);
                              if (!*ap->name)
                                    ap->name = 0;
                              else if (ap->value)
                              {
                                    ap->value = s + (((unsigned int)ap->value) >> PUN);
                                    if (!*ap->value)
                                          ap->value = 0;
                              }
                              if (ap == attributes)
                                    break;
                              ap--;
                        }
                        /* c is non-zero for an end tag */
                        if (c = *s == '/')
                              s++;
                        if (!(tp = (Tag_t*)hashget(state.tags, s)))
                              error(1, "<%s>: unknown tag", s);
                        else if (!c)
                        {
                              if (tp->end)
                              {
                                    /* push a stack entry, doubling the stack when full */
                                    if (state.sp >= state.sp_max)
                                    {
                                          c = state.sp - state.sp_min;
                                          n = (state.sp_max - state.sp_min + 1) * 2;
                                          if (!(state.sp_min = oldof(state.sp_min, Stack_t, n, 0)))
                                                error(ERROR_SYSTEM|3, "out of space [tag stack]");
                                          state.sp_max = state.sp_min + n - 1;
                                          state.sp = state.sp_min + c;
                                    }
                                    state.sp++;
                                    state.sp->tag = tp;
                                    state.sp->line = error_info.line;
                                    state.sp->flags = 0;
                                    if (tp->flags & TAG_IGNORE)
                                    {
                                          /* save the text state and divert output to state.nul */
                                          state.sp->title_cc = cc;
                                          state.sp->title_lastlastc = lastlastc;
                                          state.sp->title_op = op;
                                          state.sp->title_tc = tc;
                                          op = state.nul;
                                          sfstrseek(op, 0, SEEK_SET);
                                    }
                              }
                              /* a start handler returning 0 suppresses the end handler */
                              if (tp->start && !(*tp->start)(tp, ap) && tp->end)
                                    state.sp->flags |= STK_NOEND;
                        }
                        else
                        {
                              sp = state.sp;
                              if (state.sp->tag != tp)
                              {
                                    /* search down the stack for the matching start tag */
                                    for (;;)
                                    {
                                          if (sp == state.sp_min)
                                          {
                                                if (!(tp->flags & TAG_UNBALANCED))
                                                      error(1, "</%s> has no matching <%s>", tp->name, tp->name);
                                                sp = 0;
                                                break;
                                          }
                                          if (sp->tag == tp)
                                                break;
                                          sp--;
                                    }
                                    if (sp)
                                    {
                                          /* close the unclosed tags above the match */
                                          while (state.sp > sp)
                                          {
                                                if (state.sp->tag->end && !(state.sp->flags & STK_NOEND))
                                                {
                                                      if (!(state.sp->tag->flags & TAG_UNBALANCED))
                                                            error(1, "<%s> on line %d has no matching </%s>", state.sp->tag->name, state.sp->line, state.sp->tag->name);
                                                      (*state.sp->tag->end)(state.sp->tag, NiL);
                                                }
                                                state.sp--;
                                          }
                                    }
                              }
                              if (sp)
                              {
                                    if (tp->end && !(state.sp->flags & STK_NOEND))
                                          (*tp->end)(tp, ap);
                                    if (tp->flags & TAG_IGNORE)
                                    {
                                          /* resume the output diverted at the start tag */
                                          cc = state.sp->title_cc;
                                          lastlastc = state.sp->title_lastlastc;
                                          op = state.sp->title_op;
                                          tc = state.sp->title_tc;
                                    }
                                    state.sp--;
                              }
                        }
                        ap = 0;
                        lastc = lastlastc;
                        continue;
                  }
                  break;
            case '=':
                  if (ap && !ap->value)
                  {
                        /* end the name and note where the value starts */
                        sfputc(op, 0);
                        ap->value = (char*)(sfstrtell(op) << PUN);
                        continue;
                  }
                  break;
            case '"':
                  if (ap)
                  {
                        /* quotes are dropped from tags but kept in comments */
                        quote ^= STRING;
                        if (!(quote & COMMENT))
                              continue;
                  }
                  break;
            case '&':
                  if (!item)
                  {
                        /* entity start: collect the name in state.tmp */
                        item = c;
                        op = state.tmp;
                        continue;
                  }
                  break;
            case ';':
                  if (item == '&')
                  {
                        item = 0;
                        if (!(s = sfstruse(op)))
                              error(ERROR_SYSTEM|3, "out of space");
                        op = state.out;
                        if (*s == '#')
                        {
                              /* numeric reference: low 8 bits as \'xx */
                              n = (int)strtol(s + 1, NiL, 10) & 0377;
                              cc += sfprintf(op, "\\'%02x", n);
                              tc++;
                              if (isspace(n))
                                    lastc = ' ';
                        }
                        else if (ep = (Entity_t*)hashget(state.entities, s))
                        {
                              cc += sfputr(op, ep->value, -1);
                              tc++;
                              if (ep->flags & ENT_SPACE)
                                    lastc = ' ';
                        }
                        else
                        {
                              /* unknown entities are passed through unchanged */
                              error(1, "&%s;: unknown entity reference", s);
                              cc += sfprintf(op, "&%s;", s);
                              tc++;
                        }
                        continue;
                  }
                  break;
            case '{':
            case '}':
            case '\\':
                  /* RTF special characters are escaped with a backslash */
                  sfputc(op, '\\');
                  cc++;
                  state.sep = 0;
                  break;
            case '\n':
                  error_info.line++;
                  if (state.pre && !item)
                  {
                        /* preformatted text keeps its line breaks */
                        state.sep = 0;
                        sfputr(op, "\\line", -1);
                        cc += 5;
                        tc = 0;
                        break;
                  }
                  /*FALLTHROUGH*/
            case ' ':
            case '\t':
            case '\v':
                  if (ap)
                  {
                        if (!quote)
                        {
                              /*
                               * unquoted space in a tag starts the next attribute
                               * the last array slot is reserved for the null name
                               * terminator stored at (ap + 1) when '>' is seen, so
                               * ap itself must stop one slot earlier
                               */
                              if (lastc != ' ' && ap < &attributes[elementsof(attributes) - 2])
                              {
                                    sfputc(op, 0);
                                    ap++;
                                    ap->name = (char*)(sfstrtell(op) << PUN);
                                    ap->value = 0;
                                    lastc = ' ';
                              }
                              continue;
                        }
                  }
                  else if (!state.pre)
                  {
                        /* runs of space collapse to one; long lines are folded */
                        if (lastc == ' ')
                              continue;
                        c = ' ';
                        if (cc >= 72)
                        {
                              cc = 0;
                              sfputc(op, '\n');
                        }
                  }
                  else if (c == ' ')
                  {
                        sfputr(op, "\\~", -1);
                        cc += 2;
                        tc++;
                        state.sep = 0;
                        continue;
                  }
                  else if (c == '\t')
                  {
                        /* expand the tab to the next multiple of 8 columns */
                        do
                        {
                              sfputr(op, "\\~", -1);
                              cc += 2;
                              tc++;
                        } while (tc % 8);
                        state.sep = 0;
                        continue;
                  }
                  break;
            default:
                  if (iscntrl(c))
                        continue;
                  if (c > 0177)
                  {
                        /* 8 bit characters are written as \'xx */
                        cc += sfprintf(op, "\\'%02x", c & 0377);
                        tc++;
                        continue;
                  }
                  break;
            }
            /* a pending separator keeps text apart from the preceding control word */
            if (state.sep && op == state.out)
            {
                  state.sep = 0;
                  if (c != ' ')
                  {
                        sfputc(op, ' ');
                        cc++;
                        tc++;
                  }
            }
            lastc = c;
            sfputc(op, c);
            cc++;
            tc++;
      }
 done:
      /* warn about every tag still open at end of input */
      while (state.sp > state.sp_min)
      {
            error(1, "<%s> on line %d has no matching </%s>", state.sp->tag->name, state.sp->line, state.sp->tag->name);
            state.sp--;
      }
      error_info.file = 0;
      error_info.line = 0;
}

/*
 * hashwalk() callback: return 1 if project file must be updated,
 * i.e. when the entry value v is the handle h itself
 * (project() stores &state for names read from the existing project
 * file and passes state.files as h)
 */

static int
project_update(const char* s, char* v, void* h)
{
      char* mark;

      NoP(s);
      mark = (char*)h;
      return v == mark;
}

/*
 * list project file names
 */

static int
project_list(const char* s, char* v, void* h)
{
      NoP(v);
      sfputr((Sfio_t*)h, s, '\n');
      return 0;
}

/*
 * create or update the help project file named by file
 *
 * nothing happens unless state.files has been allocated
 *
 * when file can be read, every line outside the body of its [FILES]
 * section is copied to state.tmp, and each name listed in that section
 * is entered in state.files with the value &state; when file cannot be
 * read a default [OPTIONS] section is used instead
 *
 * file is then rewritten (saved text, a [FILES] header and the names
 * in state.files) only if the project_update walk returns non-zero
 */

static void
project(char* file)
{
      register char*    line;
      const char*       head;
      Sfio_t*           in;
      Sfio_t*           out;

      if (!state.files)
            return;
      in = sfopen(NiL, file, "r");
      if (!in)
            head =
                  "[OPTIONS]\n"
                  "COMPRESS=TRUE\n"
                  "REPORT=ON\n"
                  "TITLE=Manual\n";
      else
      {
            for (;;)
            {
                  line = sfgetr(in, '\n', 1);
                  if (!line)
                        break;
                  if (*line == '[' && !strncasecmp(line, "[FILES]", 7))
                  {
                        /* collect names up to the next section header */
                        for (;;)
                        {
                              line = sfgetr(in, '\n', 1);
                              if (!line || *line == '[')
                                    break;
                              hashput(state.files, line, &state);
                        }
                        /* end of file inside [FILES]: nothing more to copy */
                        if (!line)
                              break;
                  }
                  sfputr(state.tmp, line, '\n');
            }
            sfclose(in);
            head = sfstruse(state.tmp);
            if (!head)
                  error(ERROR_SYSTEM|3, "out of space");
      }
      if (!hashwalk(state.files, 0, project_update, state.files))
            return;
      out = sfopen(NiL, file, "w");
      if (!out)
      {
            error(ERROR_SYSTEM|2, "%s: cannot write", file);
            return;
      }
      sfputr(out, head, -1);
      sfputr(out, "[FILES]", '\n');
      hashwalk(state.files, 0, project_list, out);
      sfclose(out);
}

/*
 * html to rtf entity reference map
 */

static const Entity_t entities[] =
{
      "AElig",    "\\'c6",    0,
      "Aacute",   "\\'c1",    0,
      "Acirc",    "\\'c2",    0,
      "Agrave",   "\\'c0",    0,
      "Aring",    "\\'c5",    0,
      "Atilde",   "\\'c3",    0,
      "Auml",           "\\'c4",    0,
      "Ccedil",   "\\'c7",    0,
      "ETH",            "\\'d0",    0,
      "Eacute",   "\\'c9",    0,
      "Ecirc",    "\\'ca",    0,
      "Egrave",   "\\'c8",    0,
      "Euml",           "\\'cb",    0,
      "Iacute",   "\\'cd",    0,
      "Icirc",    "\\'ce",    0,
      "Igrave",   "\\'cc",    0,
      "Iuml",           "\\'cf",    0,
      "Ntilde",   "\\'d1",    0,
      "Oacute",   "\\'d3",    0,
      "Ocirc",    "\\'d4",    0,
      "Ograve",   "\\'d2",    0,
      "Oslash",   "\\'d8",    0,
      "Otilde",   "\\'d5",    0,
      "Ouml",           "\\'d6",    0,
      "THORN",    "\\'de",    0,
      "Uacute",   "\\'da",    0,
      "Ucirc",    "\\'db",    0,
      "Ugrave",   "\\'d9",    0,
      "Uuml",           "\\'dc",    0,
      "Yacute",   "\\'dd",    0,
      "aacute",   "\\'e1",    0,
      "acirc",    "\\'e2",    0,
      "acute",    "\\'b4",    0,
      "aelig",    "\\'e6",    0,
      "agrave",   "\\'e0",    0,
      "amp",            "&",        0,
      "aring",    "\\'e5",    0,
      "atilde",   "\\'e3",    0,
      "auml",           "\\'e4",    0,
      "brvbar",   "\\'a6",    0,
      "ccedil",   "\\'e7",    0,
      "cedil",    "\\'b8",    0,
      "cent",           "\\'a2",    0,
      "copy",           "\\'a9",    0,
      "curren",   "\\'a4",    0,
      "deg",            "\\'b0",    0,
      "divide",   "\\'f7",    0,
      "eacute",   "\\'e9",    0,
      "ecirc",    "\\'ea",    0,
      "egrave",   "\\'e8",    0,
      "emdash",   "\\emdash", 0,
      "emspace",  "\\emspace",      ENT_SPACE,
      "endash",   "\\endash", 0,
      "enspace",  "\\enspace",      ENT_SPACE,
      "eth",            "\\'f0",    0,
      "euml",           "\\'eb",    0,
      "frac12",   "\\'bd",    0,
      "frac14",   "\\'bc",    0,
      "frac34",   "\\'be",    0,
      "gt",       ">",        0,
      "iacute",   "\\'ed",    0,
      "icirc",    "\\'ee",    0,
      "iexcl",    "\\'a1",    0,
      "igrave",   "\\'ec",    0,
      "iquest",   "\\'bf",    0,
      "iuml",           "\\'ef",    0,
      "laquo",    "\\'ab",    0,
      "lt",       "<",        0,
      "macr",           "\\'af",    0,
      "micro",    "\\'b5",    0,
      "middot",   "\\bullet", 0,
      "nbsp",           "\\~",            ENT_SPACE,
      "not",            "\\'ac",    0,
      "ntilde",   "\\'f1",    0,
      "oacute",   "\\'f3",    0,
      "ocirc",    "\\'f4",    0,
      "ograve",   "\\'f2",    0,
      "ordf",           "\\'aa",    0,
      "ordm",           "\\'ba",    0,
      "oslash",   "\\'f8",    0,
      "otilde",   "\\'f5",    0,
      "ouml",           "\\'f6",    0,
      "para",           "\\'b6",    0,
      "plusmn",   "\\'b1",    0,
      "pound",    "\\'a3",    0,
      "quot",           "\"",       0,
      "raquo",    "\\'bb",    0,
      "reg",            "\\'ae",    0,
      "sect",           "\\'a7",    0,
      "shy",            "\\'ad",    0,
      "sup1",           "\\'b9",    0,
      "sup2",           "\\'b2",    0,
      "sup3",           "\\'b3",    0,
      "szlig",    "\\'df",    0,
      "thorn",    "\\'fe",    0,
      "times",    "\\'d7",    0,
      "uacute",   "\\'fa",    0,
      "ucirc",    "\\'fb",    0,
      "ugrave",   "\\'f9",    0,
      "uml",            "\\'a8",    0,
      "uuml",           "\\'fc",    0,
      "yacute",   "\\'fd",    0,
      "yen",            "\\'a5",    0,
      "yuml",           "\\'ff",    0,
#if 0
      "trademark",      "",         0,
#endif
};

/*
 * html tag table
 */

static const Tag_t tags[] =
{
      "A",        start_a,    end_a,            0,0,
      "ADDRESS",  start_i,    end,        0,0,
      "B",        start_b,    end,        0,0,
      "BLOCKQUOTE",     start_bq,   end_bq,           0,0,
      "BQ",       start_bq,   end_bq,           0,0,
      "BODY",           start_body, end,        0,0,
      "BR",       start_br,   0,          0,0,
      "CAPTION",  start_caption,    end,        0,0,
      "CENTER",   start_center,     end_center, 0,0,
      "CITE",           start_i,    end,        0,0,
      "CODE",           start_tt,   end,        0,0,
      "DD",       start_dd,   0,          0,0,
      "DIR",            start_ul,   end_LIST,   0,0,
      "DL",       start_dl,   end_LIST,   0,0,
      "DT",       start_dt,   0,          0,0,
      "EM",       start_i,    end,        0,0,
      "FN",       start_fn,   end_fn,           0,0,
      "FONT",           start_font, end_font,   0,0,
      "H1",       start_H,    end_H,            0,0,
      "H2",       start_H,    end_H,            0,0,
      "H3",       start_H,    end_H,            0,0,
      "H4",       start_H,    end_H,            0,0,
      "H5",       start_H,    end_H,            0,0,
      "H6",       start_H,    end_H,            0,0,
      "HEAD",           start_head, end_head,   0,TAG_UNBALANCED,
      "HR",       start_hr,   0,          0,0,
      "HTML",           start_html, end_html,   0,0,
      "I",        start_i,    end,        0,0,
      "IMG",            start_img,  0,          0,0,
      "KBD",            start_tt,   end,        0,0,
      "LI",       start_li,   0,          0,TAG_UNBALANCED,
      "META",           start_meta, 0,          0,0,
      "MENU",           start_ul,   end_LIST,   0,0,
      "NULL",           0,          0,          0,0,
      "OL",       start_ol,   end_LIST,   0,0,
      "P",        start_p,    0,          0,TAG_UNBALANCED,
      "PRE",            start_pre,  end_pre,    0,0,
      "RENDER",   start_render,     0,          0,0,
      "SAMP",           start_tt,   end,        0,0,
      "STRONG",   start_b,    end,        0,0,
      "SUB",            start_sub,  end_sub,    0,0,
      "SUP",            start_sup,  end_sup,    0,0,
      "TABLE",    start_table,      end_table,  0,0,
      "TD",       start_td,   end_td,           0,0,
      "TH",       start_th,   0,          0,0,
      "TITLE",    start_title,      end_title,  0,TAG_IGNORE,
      "TR",       start_tr,   end_tr,           0,0,
      "TT",       start_tt,   end,        0,0,
      "UL",       start_ul,   end_LIST,   0,0,
      "UNKNOWN",  0,          0,          0,0,
      "VAR",            start_var,  end,        0,0,
};

/*
 * hash the 0-terminated string s without regard to case
 *
 * upper case characters are folded to lower case before being mixed
 * into the running value with HASHPART(), so strings that differ only
 * in case yield the same result
 */

static unsigned int
strcasehash(const char* s)
{
      register const unsigned char* cp;
      register unsigned int         sum;
      register unsigned int         ch;

      sum = 0;
      for (cp = (const unsigned char*)s; (ch = *cp) != 0; cp++)
      {
            if (isupper(ch))
                  ch = tolower(ch);
            HASHPART(sum, ch);
      }
      return sum;
}

/*
 * initialize the global data
 */

static void
init(void)
{
      register int      i;

      if (!state.nul && !(state.nul = sfstropen()))
            error(ERROR_SYSTEM|3, "out of space [nul buffer]");
      if (!state.tmp && !(state.tmp = sfstropen()))
            error(ERROR_SYSTEM|3, "out of space [tmp buffer]");
      i = 1024;
      if (!(state.sp_min = oldof(NiL, Stack_t, i, 0)))
            error(ERROR_SYSTEM|3, "out of space [tag stack]");
      state.sp_max = state.sp_min + i - 1;
      if (!(state.entities = hashalloc(NiL, HASH_name, "entities", 0)))
            error(ERROR_SYSTEM|3, "out of space [entity hash]");
      if (!(state.tags = hashalloc(NiL, HASH_compare, strcasecmp, HASH_hash, strcasehash, HASH_name, "tags", 0)))
            error(ERROR_SYSTEM|3, "out of space [tag hash]");
      if (state.project && !(state.files = hashalloc(state.tags, HASH_set, HASH_ALLOCATE, HASH_name, "files", 0)))
            error(ERROR_SYSTEM|3, "out of space [file hash]");
      for (i = 0; i < elementsof(entities); i++)
            if (!(hashput(state.entities, entities[i].name, &entities[i])))
                  error(ERROR_SYSTEM|3, "out of space [entity hash put]");
      for (i = 0; i < elementsof(tags); i++)
            if (!(hashput(state.tags, tags[i].name, &tags[i])))
                  error(ERROR_SYSTEM|3, "out of space [tag hash put]");
      hashset(state.tags, HASH_ALLOCATE);
}

/*
 * html2rtf entry point
 *
 * options:
 *    d     set error_info.trace to the negated numeric argument
 *    f     set state.fontsize to the numeric argument
 *    p     remember the project file name in state.project
 *    v     set state.verbose
 *
 * with no file operands the standard input is converted to the standard
 * output; that is a fatal error when a project file was requested
 *
 * otherwise each operand is converted in turn: to the standard output
 * when there is no project file, or to <base>.rtf in the current
 * directory when there is one, where <base> is the operand with any
 * directory part and the last suffix removed; the project file is
 * created or updated after all operands have been processed
 *
 * exits with status 1 if any errors were counted, 0 otherwise
 */

int
main(int argc, char** argv)
{
      register int            c;
      register char*          s;
      register char*          t;
      register char*          u;
      register Sfio_t*  ip;
      register Sfio_t*  op;

      NoP(argc);
      error_info.id = "html2rtf";
      state.fontsize = FONTSIZE;
      for (;;)
      {
            switch (optget(argv, usage))
            {
            case 'd':
                  error_info.trace = -opt_info.num;
                  continue;
            case 'f':
                  state.fontsize = opt_info.num;
                  continue;
            case 'p':
                  state.project = opt_info.arg;
                  continue;
            case 'v':
                  state.verbose = 1;
                  continue;
            case '?':
                  error(ERROR_USAGE|4, "%s", opt_info.arg);
                  continue;
            case ':':
                  error(2, "%s", opt_info.arg);
                  continue;
            }
            break;
      }
      argv += opt_info.index;
      if (error_info.errors)
            error(ERROR_USAGE|4, "%s", optusage(NiL));
      init();
      if (!*argv)
      {
            if (state.project)
                  error(ERROR_SYSTEM|3, "%s: input files required when project file specified", state.project);
            /*
             * NOTE(review): state.prefix is not assigned on this path,
             * unlike the file operand path below -- confirm that
             * process() tolerates that
             */
            process(NiL, sfstdin, sfstdout);
      }
      else while (s = *argv++)
      {
            if (ip = sfopen(NiL, s, "r"))
            {
                  if (state.project)
                  {
                        /*
                         * strrchr() returns the address of the '/'
                         * itself, so step past it; otherwise the
                         * output name would start with '/' and land
                         * in the root directory
                         */
                        if (t = strrchr(s, '/'))
                              t++;
                        else
                              t = s;
                        if (u = strrchr(t, '.'))
                              c = u - t;
                        else
                              c = strlen(t);
                        sfprintf(state.tmp, "%-.*s.rtf", c, t);
                        if (!(u = sfstruse(state.tmp)))
                              error(ERROR_SYSTEM|3, "out of space");
                        if (!(op = sfopen(NiL, u, "w")))
                        {
                              error(ERROR_SYSTEM|2, "%s: cannot write", u);
                              sfclose(ip);
                              continue;
                        }
                        hashput(state.files, u, state.files);
                        /*
                         * the prefix is the base name with every non
                         * alphanumeric replaced by '.'; the bytes are
                         * read as unsigned char because isalnum() is
                         * undefined for negative values
                         */
                        while (c = *(unsigned char*)t++)
                              sfputc(state.tmp, isalnum(c) ? c : '.');
                        if (!(state.prefix = strdup(sfstruse(state.tmp))))
                              error(ERROR_SYSTEM|3, "out of space");
                  }
                  else
                  {
                        state.prefix = "HTML2RTF";
                        op = sfstdout;
                  }
                  process(s, ip, op);
                  sfclose(ip);
                  if (state.project)
                  {
                        sfclose(op);
                        free(state.prefix);
                  }
            }
            else error(ERROR_SYSTEM|2, "%s: cannot read", s);
      }
      if (state.project)
            project(state.project);
      exit(error_info.errors != 0);
}

/* Generated by Doxygen 1.6.0 -- Back to index (web page footer, not part of the program) */