Logo Search packages:      
Sourcecode: ksh version File versions  Download package

reglib.h

/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*          Copyright (c) 1985-2010 AT&T Intellectual Property          *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                    by AT&T Intellectual Property                     *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                  David Korn <dgk@research.att.com>                   *
*                   Phong Vo <kpv@research.att.com>                    *
*                                                                      *
***********************************************************************/
#pragma prototyped

/*
 * posix regex implementation
 *
 * based on Doug McIlroy's C++ implementation
 * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest
 * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume
 */

#ifndef _REGLIB_H
#define _REGLIB_H

#define REG_VERSION_EXEC      20020509L
#define REG_VERSION_MAP       20030916L   /* regdisc_t.re_map     */

#define re_info         env

#define alloc           _reg_alloc
#define classfun  _reg_classfun
#define drop            _reg_drop
#define fatal           _reg_fatal
#define state           _reg_state

typedef struct regsubop_s
{
      int         op;         /* REG_SUB_LOWER,REG_SUB_UPPER      */
      int         off;        /* re_rhs or match[] offset   */
      int         len;        /* re_rhs len or len==0 match[]     */
} regsubop_t;

#define _REG_SUB_PRIVATE_ \
      char*       re_cur;           /* re_buf cursor        */ \
      char*       re_end;           /* re_buf end                 */ \
      regsubop_t* re_ops;           /* rhs ops              */ \
      char        re_rhs[1];  /* substitution rhs           */

#include <ast.h>
#include <cdt.h>
#include <stk.h>

#include "regex.h"

#include <ctype.h>
#include <errno.h>

#if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG)
#define _AST_REGEX_DEBUG      1
#endif

#define MBSIZE(p) ((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1)

#undef      RE_DUP_MAX              /* posix puts this in limits.h!     */
#define RE_DUP_MAX      (INT_MAX/2-1)     /* 2*RE_DUP_MAX won't overflow      */
#define RE_DUP_INF      (RE_DUP_MAX+1)    /* infinity, for *            */
#define BACK_REF_MAX    9

#define REG_COMP  (REG_DELIMITED|REG_ESCAPE|REG_EXTENDED|REG_FIRST|REG_ICASE|REG_NOSUB|REG_NEWLINE|REG_SHELL|REG_AUGMENTED|REG_LEFT|REG_LITERAL|REG_MINIMAL|REG_MULTIREF|REG_NULL|REG_RIGHT|REG_LENIENT|REG_MUSTDELIM)
#define REG_EXEC  (REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND)

#define REX_NULL        0     /* null string (internal)     */
#define REX_ALT               1     /* a|b                        */
#define REX_ALT_CATCH         2     /* REX_ALT catcher            */
#define REX_BACK        3     /* \1, \2, etc                */
#define REX_BEG               4     /* initial ^                  */
#define REX_BEG_STR           5     /* initial ^ w/ no newline    */
#define REX_BM                6     /* Boyer-Moore                */
#define REX_CAT               7     /* catenation catcher         */
#define REX_CLASS       8     /* [...]                */
#define REX_COLL_CLASS        9     /* collation order [...]      */
#define REX_CONJ        10    /* a&b                        */
#define REX_CONJ_LEFT         11    /* REX_CONJ left catcher      */
#define REX_CONJ_RIGHT        12    /* REX_CONJ right catcher     */
#define REX_DONE        13    /* completed match (internal) */
#define REX_DOT               14    /* .                    */
#define REX_END               15    /* final $              */
#define REX_END_STR           16    /* final $ before tail newline      */
#define REX_EXEC        17    /* call re.re_exec()          */
#define REX_FIN_STR           18    /* final $ w/ no newline      */
#define REX_GROUP       19    /* \(...\)              */
#define REX_GROUP_CATCH       20    /* REX_GROUP catcher          */
#define REX_GROUP_AHEAD       21    /* 0-width lookahead          */
#define REX_GROUP_AHEAD_CATCH 22    /* REX_GROUP_AHEAD catcher    */
#define REX_GROUP_AHEAD_NOT   23    /* inverted 0-width lookahead */
#define REX_GROUP_BEHIND      24    /* 0-width lookbehind         */
#define REX_GROUP_BEHIND_CATCH      25    /* REX_GROUP_BEHIND catcher   */
#define REX_GROUP_BEHIND_NOT  26    /* inverted 0-width lookbehind      */
#define REX_GROUP_BEHIND_NOT_CATCH 27     /* REX_GROUP_BEHIND_NOT catcher     */
#define REX_GROUP_COND        28    /* conditional group          */
#define REX_GROUP_COND_CATCH  29    /* conditional group catcher  */
#define REX_GROUP_CUT         30    /* don't backtrack over this  */
#define REX_GROUP_CUT_CATCH   31    /* REX_GROUP_CUT catcher      */
#define REX_KMP               32    /* Knuth-Morris-Pratt         */
#define REX_NEG               33    /* negation             */
#define REX_NEG_CATCH         34    /* REX_NEG catcher            */
#define REX_NEST        35    /* nested match               */
#define REX_ONECHAR           36    /* a single-character literal */
#define REX_REP               37    /* Kleene closure       */
#define REX_REP_CATCH         38    /* REX_REP catcher            */
#define REX_STRING            39    /* some chars                 */
#define REX_TRIE        40    /* alternation of strings     */
#define REX_WBEG        41    /* <                    */
#define REX_WEND        42    /* >                    */
#define REX_WORD        43    /* word boundary        */
#define REX_WORD_NOT          44    /* not word boundary          */

#define T_META          ((int)UCHAR_MAX+1)
#define T_STAR          (T_META+0)
#define T_PLUS          (T_META+1)
#define T_QUES          (T_META+2)
#define T_BANG          (T_META+3)
#define T_AT            (T_META+4)
#define T_TILDE         (T_META+5)
#define T_PERCENT (T_META+6)
#define T_LEFT          (T_META+7)
#define T_OPEN          (T_META+8)
#define T_CLOSE         (T_OPEN+1)
#define T_RIGHT         (T_OPEN+2)
#define T_CFLX          (T_OPEN+3)
#define T_DOT           (T_OPEN+4)
#define T_DOTSTAR (T_OPEN+5)
#define T_END           (T_OPEN+6)
#define T_BAD           (T_OPEN+7)
#define T_DOLL          (T_OPEN+8)
#define T_BRA           (T_OPEN+9)
#define T_BAR           (T_OPEN+10)
#define T_AND           (T_OPEN+11)
#define T_LT            (T_OPEN+12)
#define T_GT            (T_OPEN+13)
#define T_SLASHPLUS     (T_OPEN+14)
#define T_GROUP         (T_OPEN+15)
#define T_WORD          (T_OPEN+16)
#define T_WORD_NOT      (T_WORD+1)
#define T_BEG_STR (T_WORD+2)
#define T_END_STR (T_WORD+3)
#define T_FIN_STR (T_WORD+4)
#define T_ESCAPE  (T_WORD+5)
#define T_ALNUM         (T_WORD+6)
#define T_ALNUM_NOT     (T_ALNUM+1)
#define T_DIGIT         (T_ALNUM+2)
#define T_DIGIT_NOT     (T_ALNUM+3)
#define T_SPACE         (T_ALNUM+4)
#define T_SPACE_NOT     (T_ALNUM+5)
#define T_BACK          (T_ALNUM+6)

#define BRE       0
#define ERE       3
#define ARE       6
#define SRE       9
#define KRE       12

#define HIT       SSIZE_MAX

#define bitclr(p,c)     ((p)[((c)>>3)&037]&=(~(1<<((c)&07))))
#define bitset(p,c)     ((p)[((c)>>3)&037]|=(1<<((c)&07)))
#define bittst(p,c)     ((p)[((c)>>3)&037]&(1<<((c)&07)))

#define setadd(p,c)     bitset((p)->bits,c)
#define setclr(p,c)     bitclr((p)->bits,c)
#define settst(p,c)     bittst((p)->bits,c)

#if _hdr_wchar && _lib_wctype && _lib_iswctype

#include <stdio.h> /* because <wchar.h> includes it and we generate it */
#include <wchar.h>
#if _hdr_wctype
#include <wctype.h>
#endif

#if !defined(iswblank) && !_lib_iswblank
#define _need_iswblank  1
#define iswblank(x)     _reg_iswblank(x)
extern int        _reg_iswblank(wint_t);
#endif

#if !defined(towupper) && !_lib_towupper
#define towupper(x)     toupper(x)
#endif

#if !defined(towlower) && !_lib_towlower
#define towlower(x)     tolower(x)
#endif

#else

#undef      _lib_wctype

#ifndef iswalnum
#define iswalnum(x)     isalnum(x)
#endif
#ifndef iswalpha
#define iswalpha(x)     isalpha(x)
#endif
#ifndef iswcntrl
#define iswcntrl(x)     iscntrl(x)
#endif
#ifndef iswdigit
#define iswdigit(x)     isdigit(x)
#endif
#ifndef iswgraph
#define iswgraph(x)     isgraph(x)
#endif
#ifndef iswlower
#define iswlower(x)     islower(x)
#endif
#ifndef iswprint
#define iswprint(x)     isprint(x)
#endif
#ifndef iswpunct
#define iswpunct(x)     ispunct(x)
#endif
#ifndef iswspace
#define iswspace(x)     isspace(x)
#endif
#ifndef iswupper
#define iswupper(x)     isupper(x)
#endif
#ifndef iswxdigit
#define iswxdigit(x)    isxdigit(x)
#endif

#ifndef towlower
#define towlower(x)     tolower(x)
#endif
#ifndef towupper
#define towupper(x)     toupper(x)
#endif

#endif

#ifndef     iswblank
#define     iswblank(x) ((x)==' '||(x)=='\t')
#endif

#ifndef iswgraph
#define     iswgraph(x) (iswprint(x)&&!iswblank(x))
#endif

#define isword(x) (isalnum(x)||(x)=='_')

/*
 * collation element support
 */

#define COLL_KEY_MAX    32

#if COLL_KEY_MAX < MB_LEN_MAX
#undef      COLL_KEY_MAX
#define COLL_KEY_MAX    MB_LEN_MAX
#endif

typedef unsigned char Ckey_t[COLL_KEY_MAX+1];

#define COLL_end  0
#define COLL_call 1
#define COLL_char 2
#define COLL_range      3
#define COLL_range_lc   4
#define COLL_range_uc   5

typedef struct Celt_s
{
      short       typ;
      short       min;
      short       max;
      regclass_t  fun;
      Ckey_t            beg;
      Ckey_t            end;
} Celt_t;

/*
 * private stuff hanging off regex_t
 */

typedef struct Stk_pos_s
{
      off_t       offset;
      char*       base;
} Stk_pos_t;

typedef struct Vector_s
{
      Stk_t*            stk;        /* stack pointer        */
      char*       vec;        /* the data             */
      int         inc;        /* growth increment           */
      int         siz;        /* element size               */
      int         max;        /* max index                  */
      int         cur;        /* current index -- user domain     */
} Vector_t;

/*
 * Rex_t subtypes
 */

typedef struct Cond_s
{
      unsigned char*    beg;        /* beginning of next match    */
      struct Rex_s*     next[2];    /* 0:no 1:yes next pattern    */
      struct Rex_s*     cont;       /* right catcher        */
      int         yes;        /* yes condition hit          */
} Cond_t;

typedef struct Conj_left_s
{
      unsigned char*    beg;        /* beginning of left match    */
      struct Rex_s*     right;            /* right pattern        */
      struct Rex_s*     cont;       /* right catcher        */
} Conj_left_t;

typedef struct Conj_right_s
{
      unsigned char*    end;        /* end of left match          */
      struct Rex_s*     cont;       /* ambient continuation       */
} Conj_right_t;

typedef unsigned int Bm_mask_t;

typedef struct Bm_s
{
      Bm_mask_t** mask;
      size_t*           skip;
      size_t*           fail;
      size_t            size;
      ssize_t           back;
      ssize_t           left;
      ssize_t           right;
      size_t            complete;
} Bm_t;

typedef struct String_s
{
      int*        fail;
      unsigned char*    base;
      size_t            size;
} String_t;

typedef struct Set_s
{
      unsigned char     bits[(UCHAR_MAX+1)/CHAR_BIT];
} Set_t;

typedef struct Collate_s
{
      int         invert;
      Celt_t*           elements;
} Collate_t;

typedef struct Binary_s
{
      struct Rex_s*     left;
      struct Rex_s*     right;
      int         serial;
} Binary_t;

typedef struct Group_s
{
      int         number;           /* group number               */
      int         last;       /* last contained group number      */
      int         size;       /* lookbehind size            */
      int         back;       /* backreferenced       */
      regflags_t  flags;            /* group flags                */
      union
      {
      Binary_t    binary;
      struct Rex_s*     rex;
      }           expr;
} Group_t;

typedef struct Exec_s
{
      void*       data;
      const char* text;
      size_t            size;
} Exec_t;

#define REX_NEST_open         0x01
#define REX_NEST_close        0x02
#define REX_NEST_escape       0x04
#define REX_NEST_quote        0x08
#define REX_NEST_literal      0x10
#define REX_NEST_delimiter    0x20
#define REX_NEST_terminator   0x40
#define REX_NEST_separator    0x80

#define REX_NEST_SHIFT        8

typedef struct Nest_s
{
      int         primary;
      unsigned short    none;       /* for Nest_t.type[-1] */
      unsigned short    type[1];
} Nest_t;

/*
 * REX_ALT catcher, solely to get control at the end of an
 * alternative to keep records for comparing matches.
 */

typedef struct Alt_catch_s
{
      struct Rex_s*     cont;
} Alt_catch_t;

typedef struct Group_catch_s
{
      struct Rex_s*     cont;
      regoff_t*   eo;
} Group_catch_t;

typedef struct Behind_catch_s
{
      struct Rex_s*     cont;
      unsigned char*    beg;
      unsigned char*    end;
} Behind_catch_t;

/*
 * REX_NEG catcher determines what string lengths can be matched,
 * then Neg investigates continuations of other lengths.
 * This is inefficient.  For !POSITIONS expressions, we can do better:
 * since matches to rex will be enumerated in decreasing order,
 * we can investigate continuations whenever a length is skipped.
 */

typedef struct Neg_catch_s
{
      unsigned char*    beg;
      unsigned char*    index;
} Neg_catch_t;

/*
 * REX_REP catcher.  One is created on the stack for
 * each iteration of a complex repetition.
 */

typedef struct Rep_catch_s
{
      struct Rex_s*     cont;
      struct Rex_s*     ref;
      unsigned char*    beg;
      int         n;
} Rep_catch_t;

/*
 * data structure for an alternation of pure strings
 * son points to a subtree of all strings with a common
 * prefix ending in character c.  sib links alternate
 * letters in the same position of a word.  end=1 if
 * some word ends with c.  the order of strings is
 * irrelevant, except long words must be investigated
 * before short ones.
 */

typedef struct Trie_node_s
{
      unsigned char           c;
      unsigned char           end;
      struct Trie_node_s*     son;
      struct Trie_node_s*     sib;
} Trie_node_t;

typedef struct Trie_s
{
      Trie_node_t**     root;
      int         min;
      int         max;
} Trie_t;

/*
 * Rex_t is a node in a regular expression
 */

typedef struct Rex_s
{
      unsigned char     type;             /* node type            */
      unsigned char     marked;                 /* already marked */
      short       serial;                 /* subpattern number    */
      regflags_t  flags;                  /* scoped flags         */
      int         explicit;         /* scoped explicit match*/
      struct Rex_s*     next;             /* remaining parts      */
      int         lo;               /* lo dup count         */
      int         hi;               /* hi dup count         */
      unsigned char*    map;              /* fold and/or ccode map*/
      union
      {
      Alt_catch_t alt_catch;        /* alt catcher          */
      Bm_t        bm;               /* bm             */
      Behind_catch_t    behind_catch;           /* behind catcher */
      Set_t*            charclass;        /* char class           */
      Collate_t   collate;          /* collation class      */
      Cond_t            cond_catch;       /* cond catcher         */
      Conj_left_t conj_left;        /* conj left catcher    */
      Conj_right_t      conj_right;       /* conj right catcher   */
      void*       data;             /* data after Rex_t     */
      Exec_t            exec;             /* re.re_exec() args    */
      Group_t           group;                  /* a|b or rep           */
      Group_catch_t     group_catch;            /* group catcher  */
      Neg_catch_t neg_catch;        /* neg catcher          */
      Nest_t            nest;             /* nested match         */
      unsigned char     onechar;          /* single char          */
      Rep_catch_t rep_catch;        /* rep catcher          */
      String_t    string;                 /* string/kmp           */
      Trie_t            trie;             /* trie                 */
      }           re;
} Rex_t;

typedef struct reglib_s             /* library private regex_t info     */
{
      struct Rex_s*     rex;        /* compiled expression        */
      regdisc_t*  disc;       /* REG_DISCIPLINE discipline  */
      const regex_t*    regex;            /* from regexec               */
      unsigned char*    beg;        /* beginning of string        */
      unsigned char*    end;        /* end of string        */
      Vector_t*   pos;        /* posns of certain subpatterns     */
      Vector_t*   bestpos;    /* ditto for best match       */
      regmatch_t* match;            /* subexrs in current match   */
      regmatch_t* best;       /* ditto in best match yet    */
      Stk_pos_t   stk;        /* exec stack pos       */
      size_t            min;        /* minimum match length       */
      size_t            nsub;       /* internal re_nsub           */
      regflags_t  flags;            /* flags from regcomp()       */
      int         error;            /* last error                 */
      int         explicit;   /* explicit match on this char      */
      int         leading;    /* leading match on this char */
      int         refs;       /* regcomp()+regdup() references*/
      Rex_t       done;       /* the last continuation      */
      regstat_t   stats;            /* for regstat()        */
      unsigned char     fold[UCHAR_MAX+1]; /* REG_ICASE map       */
      unsigned char     hard;       /* hard comp                  */
      unsigned char     once;       /* if 1st parse fails, quit   */
      unsigned char     separate;   /* cannot combine       */
      unsigned char     stack;            /* hard comp or exec          */
      unsigned char     sub;        /* re_sub is valid            */
      unsigned char     test;       /* debug/test bitmask         */
} Env_t;

typedef struct State_s                    /* shared state         */
{
      regmatch_t  nomatch;
      struct
      {
      unsigned char     key;
      short       val[15];
      }           escape[52];
      short*            magic[UCHAR_MAX+1];
      regdisc_t   disc;
      int         fatal;
      int         initialized;
      Dt_t*       attrs;
      Dt_t*       names;
      Dtdisc_t    dtdisc;
} State_t;

extern State_t          state;

extern void*            alloc(regdisc_t*, void*, size_t);
extern regclass_t classfun(int);
extern void       drop(regdisc_t*, Rex_t*);
extern int        fatal(regdisc_t*, int, const char*);

#endif

Generated by  Doxygen 1.6.0   Back to index