Logo Search packages:      
Sourcecode: ksh version File versions  Download package

recfmt.c

/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*                  Copyright (c) 1985-2005 AT&T Corp.                  *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                            by AT&T Corp.                             *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                  David Korn <dgk@research.att.com>                   *
*                   Phong Vo <kpv@research.att.com>                    *
*                                                                      *
***********************************************************************/
#pragma prototyped

/*
 * determine record format by sampling data in <buf,size>
 * total is the total file size, <=0 if not available
 * return r:
 *    -1                      could not determine
 *    RECTYPE(r)==REC_fixed         fixed length REC_F_SIZE(r)
 *    RECTYPE(r)==REC_delimited     variable length delimiter=REC_D_DELIMITER(r)
 *    RECTYPE(r)==REC_variable      variable length
 */

#include <recfmt.h>

/*
 * scratch tables for the fixed length record check in recfmt()
 */

typedef struct
{
      /* rep[d] counts how often a byte value recurred at distance d */
      unsigned int rep[4096];

      /* hit[c] is the sample index at which byte value c was last seen */
      unsigned int hit[UCHAR_MAX + 1];
} Sample_t;

/*
 * guess the record format of the data sampled in <buf,size>
 *
 *    buf   start of the sample
 *    size  number of bytes in the sample
 *    total total file size, <=0 if not available
 *
 * three checks are tried in order and the first match is returned:
 *
 *    REC_V_TYPE(4, 0, 2, 0, 1)     the sample is a chain of records that
 *                                  each start with a 2 byte big-endian
 *                                  length followed by 2 zero bytes
 *    REC_D_TYPE(c)                 terminator c closes every record of
 *                                  the length found for the first one
 *    REC_F_TYPE(n)                 the data repeats with period n
 *
 * REC_N_TYPE() is returned when no check matches or when the scratch
 * tables cannot be allocated
 */

Recfmt_t
recfmt(const void* buf, size_t size, off_t total)
{
      register unsigned char*       s;
      register unsigned char*       t;
      register Sample_t*            q;
      register unsigned int*        h;
      register unsigned int         i;
      unsigned int                  j;
      unsigned int                  k;
      unsigned int                  n;
      unsigned int                  m;
      unsigned int                  x;
      unsigned long                 f;
      unsigned long                 g;

      /*
       * candidate record terminators
       * NOTE(review): 0x15 and 0x25 are presumably the EBCDIC
       * newline/linefeed codes -- confirm
       */

      static const unsigned char    terminators[] = { '\n', 0x15, 0x25 };

      /*
       * check for V format
       *
       * follow the chain of record lengths for as long as each header
       * looks valid; k is the number of sample bytes left unconsumed
       */

      s = (unsigned char*)buf;
      t = s + size;
      while ((k = (t - s)) >= 4 && !s[2] && !s[3])
      {
            /*
             * a zero length would leave s where it is and loop forever,
             * so it ends the chain just like a length that overruns
             * the sample
             */

            if (!(i = (s[0]<<8)|s[1]) || i > k)
                  break;
            s += i;
      }

      /*
       * accept if the chain consumed the whole sample or more than
       * half of it
       * NOTE(review): an empty sample (size == 0) leaves k == 0 and is
       * therefore reported as V format -- confirm callers never pass one
       */

      if (!k || size > 2 * k)
            return REC_V_TYPE(4, 0, 2, 0, 1);
      s = (unsigned char*)buf;

      /*
       * check for terminated records
       *
       * the first occurrence of a terminator within the first half of
       * the sample fixes the record length n (terminator included);
       * n must divide total when total is known, and every n-th byte
       * of the sample must then be that same terminator
       */

      for (i = 0; i < elementsof(terminators); i++)
            if ((t = (unsigned char*)memchr((void*)s, k = terminators[i], size / 2)) && (n = t - s + 1) > 1 && (total <= 0 || !(total % n)))
            {
                  for (j = n - 1; j < size; j += n)
                        if (s[j] != k)
                        {
                              /* n == 0 marks this terminator as rejected */
                              n = 0;
                              break;
                        }
                  if (n)
                        return REC_D_TYPE(terminators[i]);
            }

      /*
       * check fixed length record frequencies
       *
       * newof() is relied upon to hand back zeroed tables
       * (TODO confirm against its definition)
       */

      if (!(q = newof(0, Sample_t, 1, 0)))
            return REC_N_TYPE();

      /*
       * for every byte tally the distance to the previous occurrence
       * of the same byte value; x is the largest distance tallied
       */

      x = 0;
      for (i = 0; i < size; i++)
      {
            h = q->hit + s[i];
            m = i - *h;
            *h = i;
            if (m < elementsof(q->rep))
            {
                  if (m > x)
                        x = m;
                  q->rep[m]++;
            }
      }

      /*
       * walk the candidate lengths from x down to 2
       *
       * n is the best length so far (0 if none), m counts the
       * candidates examined and f is the lowest score seen
       */

      n = 0;
      m = 0;
      f = ~0;
      for (i = x; i > 1; i--)
      {
            /*
             * a candidate must divide total when total is known and
             * must have been tallied more often than the current best
             */

            if ((total <= 0 || !(total % i)) && q->rep[i] > q->rep[n])
            {
                  m++;

                  /*
                   * g counts the bytes that differ from the byte
                   * one record length earlier
                   */

                  g = 0;
                  for (j = i; j < size - i; j += i)
                        for (k = 0; k < i; k++)
                              if (s[j + k] != s[j + k - i])
                                    g++;

                  /*
                   * scale by record length and tally count;
                   * q->rep[i] is nonzero because it exceeded q->rep[n]
                   */

                  g = (((g * 100) / i) * 100) / q->rep[i];

                  /*
                   * lower score wins, ties go to the smaller length
                   */

                  if (g <= f)
                  {
                        f = g;
                        n = i;
                  }
            }
      }

      /*
       * with no convincing period and a small known total, treat the
       * whole file as one fixed record unless the sample contains
       * a terminator byte
       */

      if (m <= 1 && n <= 2 && total > 1 && total < 256)
      {
            n = 0;
            for (i = 0; i < size; i++)
                  for (j = 0; j < elementsof(terminators); j++)
                        if (s[i] == terminators[j])
                              n++;
            n = n ? 0 : total;
      }
      free(q);
      return n ? REC_F_TYPE(n) : REC_N_TYPE();
}

#if MAIN

/*
 * test driver, compiled only when MAIN is set
 *
 * passes the data reserved from sfstdin to recfmt() and prints the
 * result on sfstdout
 *
 * returns 1 after reporting "read error" if nothing could be reserved,
 * 0 otherwise
 */

int
main(void)
{
      void* s;
      size_t      size;
      off_t total;

      if (!(s = sfreserve(sfstdin, SF_UNBOUND, 0)))
      {
            sfprintf(sfstderr, "read error\n");
            return 1;
      }

      /*
       * sample size comes from sfvalue(), total file size from sfsize()
       */

      size = sfvalue(sfstdin);
      total = sfsize(sfstdin);
      sfprintf(sfstdout, "%d\n", recfmt(s, size, total));
      return 0;
}

#endif

Generated by  Doxygen 1.6.0   Back to index