G:/ScriptBasic/source/extensions/re/regcomp.c

Go to the documentation of this file.
00001 /*-
00002  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
00003  * Copyright (c) 1992, 1993, 1994
00004  *      The Regents of the University of California.  All rights reserved.
00005  *
00006  * This code is derived from software contributed to Berkeley by
00007  * Henry Spencer.
00008  *
00009  * Redistribution and use in source and binary forms, with or without
00010  * modification, are permitted provided that the following conditions
00011  * are met:
00012  * 1. Redistributions of source code must retain the above copyright
00013  *    notice, this list of conditions and the following disclaimer.
00014  * 2. Redistributions in binary form must reproduce the above copyright
00015  *    notice, this list of conditions and the following disclaimer in the
00016  *    documentation and/or other materials provided with the distribution.
00017  * 3. All advertising materials mentioning features or use of this software
00018  *    must display the following acknowledgement:
00019  *      This product includes software developed by the University of
00020  *      California, Berkeley and its contributors.
00021  * 4. Neither the name of the University nor the names of its contributors
00022  *    may be used to endorse or promote products derived from this software
00023  *    without specific prior written permission.
00024  *
00025  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00026  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00028  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00029  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00030  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00031  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00032  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00033  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00034  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00035  * SUCH DAMAGE.
00036  *
00037  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
00038  */
00039 
00040 #if defined(LIBC_SCCS) && !defined(lint)
00041 static char sccsid[] = "@(#)regcomp.c   8.5 (Berkeley) 3/20/94";
00042 #endif /* LIBC_SCCS and not lint */
00043 
00044 #include <sys/types.h>
00045 #include <stdio.h>
00046 #include <string.h>
00047 #include <ctype.h>
00048 #include <limits.h>
00049 #include <stdlib.h>
00050 #include "regex.h"
00051 
00052 #include "utils.h"
00053 #include "regex2.h"
00054 
00055 #include "cclass.h"
00056 #include "cname.h"
00057 
00058 /*
00059  * parse structure, passed up and down to avoid global variables and
00060  * other clumsinesses
00061  */
00062 struct parse {
00063         char *next;             /* next character in RE */
00064         char *end;              /* end of string (-> NUL normally) */
00065         int error;              /* has an error been seen? */
00066         sop *strip;             /* malloced strip */
00067         sopno ssize;            /* malloced strip size (allocated) */
00068         sopno slen;             /* malloced strip length (used) */
00069         int ncsalloc;           /* number of csets allocated */
00070         struct re_guts *g;
00071 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */
00072         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
00073         sopno pend[NPAREN];     /* -> ) ([0] unused) */
00074 };
00075 
00076 /* ========= begin header generated by ./mkh ========= */
00077 #ifdef __cplusplus
00078 extern "C" {
00079 #endif
00080 
00081 /* === regcomp.c === */
00082 static void p_ere __P((struct parse *p, int stop));
00083 static void p_ere_exp __P((struct parse *p));
00084 static void p_str __P((struct parse *p));
00085 static void p_bre __P((struct parse *p, int end1, int end2));
00086 static int p_simp_re __P((struct parse *p, int starordinary));
00087 static int p_count __P((struct parse *p));
00088 static void p_bracket __P((struct parse *p));
00089 static void p_b_term __P((struct parse *p, cset *cs));
00090 static void p_b_cclass __P((struct parse *p, cset *cs));
00091 static void p_b_eclass __P((struct parse *p, cset *cs));
00092 static char p_b_symbol __P((struct parse *p));
00093 static char p_b_coll_elem __P((struct parse *p, int endc));
00094 static char othercase __P((int ch));
00095 static void bothcases __P((struct parse *p, int ch));
00096 static void ordinary __P((struct parse *p, int ch));
00097 static void nonnewline __P((struct parse *p));
00098 static void repeat __P((struct parse *p, sopno start, int from, int to));
00099 static int seterr __P((struct parse *p, int e));
00100 static cset *allocset __P((struct parse *p));
00101 static void freeset __P((struct parse *p, cset *cs));
00102 static int freezeset __P((struct parse *p, cset *cs));
00103 static int firstch __P((struct parse *p, cset *cs));
00104 static int nch __P((struct parse *p, cset *cs));
00105 static void mcadd __P((struct parse *p, cset *cs, char *cp));
00106 static void mcsub __P((cset *cs, char *cp));
00107 static int mcin __P((cset *cs, char *cp));
00108 static char *mcfind __P((cset *cs, char *cp));
00109 static void mcinvert __P((struct parse *p, cset *cs));
00110 static void mccase __P((struct parse *p, cset *cs));
00111 static int isinsets __P((struct re_guts *g, int c));
00112 static int samesets __P((struct re_guts *g, int c1, int c2));
00113 static void categorize __P((struct parse *p, struct re_guts *g));
00114 static sopno dupl __P((struct parse *p, sopno start, sopno finish));
00115 static void doemit __P((struct parse *p, sop op, size_t opnd));
00116 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
00117 static void dofwd __P((struct parse *p, sopno pos, sop value));
00118 static void enlarge __P((struct parse *p, sopno size));
00119 static void stripsnug __P((struct parse *p, struct re_guts *g));
00120 static void findmust __P((struct parse *p, struct re_guts *g));
00121 static sopno pluscount __P((struct parse *p, struct re_guts *g));
00122 
00123 #ifdef __cplusplus
00124 }
00125 #endif
00126 /* ========= end header generated by ./mkh ========= */
00127 
00128 static char nuls[10];           /* place to point scanner in event of error */
00129 
00130 /*
00131  * macros for use with parse structure
00132  * BEWARE:  these know that the parse structure is named `p' !!!
00133  */
00134 #define PEEK()  (*p->next)
00135 #define PEEK2() (*(p->next+1))
00136 #define MORE()  (p->next < p->end)
00137 #define MORE2() (p->next+1 < p->end)
00138 #define SEE(c)  (MORE() && PEEK() == (c))
00139 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
00140 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
00141 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
00142 #define NEXT()  (p->next++)
00143 #define NEXT2() (p->next += 2)
00144 #define NEXTn(n)        (p->next += (n))
00145 #define GETNEXT()       (*p->next++)
00146 #define SETERROR(e)     seterr(p, (e))
00147 #define REQUIRE(co, e)  ((co) || SETERROR(e))
00148 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))
00149 #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e))
00150 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
00151 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
00152 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
00153 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
00154 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
00155 #define HERE()          (p->slen)
00156 #define THERE()         (p->slen - 1)
00157 #define THERETHERE()    (p->slen - 2)
00158 #define DROP(n) (p->slen -= (n))
00159 
00160 #ifndef NDEBUG
00161 static int never = 0;           /* for use in asserts; shuts lint up */
00162 #else
00163 #define never   0               /* some <assert.h>s have bugs too */
00164 #endif
00165 
00166 /*
00167  - regcomp - interface for parser and compilation
00168  = extern int regcomp(regex_t *, const char *, int);
00169  = #define      REG_BASIC       0000
00170  = #define      REG_EXTENDED    0001
00171  = #define      REG_ICASE       0002
00172  = #define      REG_NOSUB       0004
00173  = #define      REG_NEWLINE     0010
00174  = #define      REG_NOSPEC      0020
00175  = #define      REG_PEND        0040
00176  = #define      REG_DUMP        0200
00177  */
00178 int     __stdcall                       /* 0 success, otherwise REG_something */
00179 regcomp(preg, pattern, cflags)
00180 regex_t *preg;
00181 const char *pattern;
00182 int cflags;
00183 {
00184         struct parse pa;
00185         register struct re_guts *g;
00186         register struct parse *p = &pa;
00187         register int i;
00188         register size_t len;
00189 #ifdef REDEBUG
00190 #       define  GOODFLAGS(f)    (f)
00191 #else
00192 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP)
00193 #endif
00194 
00195         cflags = GOODFLAGS(cflags);
00196         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
00197                 return(REG_INVARG);
00198 
00199         if (cflags&REG_PEND) {
00200                 if (preg->re_endp < pattern)
00201                         return(REG_INVARG);
00202                 len = preg->re_endp - pattern;
00203         } else
00204                 len = strlen((char *)pattern);
00205 
00206         /* do the mallocs early so failure handling is easy */
00207         g = (struct re_guts *)malloc(sizeof(struct re_guts) +
00208                                                         (NC-1)*sizeof(cat_t));
00209         if (g == NULL)
00210                 return(REG_ESPACE);
00211         p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
00212         p->strip = (sop *)malloc(p->ssize * sizeof(sop));
00213         p->slen = 0;
00214         if (p->strip == NULL) {
00215                 free((char *)g);
00216                 return(REG_ESPACE);
00217         }
00218 
00219         /* set things up */
00220         p->g = g;
00221         p->next = (char *)pattern;      /* convenience; we do not modify it */
00222         p->end = p->next + len;
00223         p->error = 0;
00224         p->ncsalloc = 0;
00225         for (i = 0; i < NPAREN; i++) {
00226                 p->pbegin[i] = 0;
00227                 p->pend[i] = 0;
00228         }
00229         g->csetsize = NC;
00230         g->sets = NULL;
00231         g->setbits = NULL;
00232         g->ncsets = 0;
00233         g->cflags = cflags;
00234         g->iflags = 0;
00235         g->nbol = 0;
00236         g->neol = 0;
00237         g->must = NULL;
00238         g->mlen = 0;
00239         g->nsub = 0;
00240         g->ncategories = 1;     /* category 0 is "everything else" */
00241         g->categories = &g->catspace[-(CHAR_MIN)];
00242         (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
00243         g->backrefs = 0;
00244 
00245         /* do it */
00246         EMIT(OEND, 0);
00247         g->firststate = THERE();
00248         if (cflags&REG_EXTENDED)
00249                 p_ere(p, OUT);
00250         else if (cflags&REG_NOSPEC)
00251                 p_str(p);
00252         else
00253                 p_bre(p, OUT, OUT);
00254         EMIT(OEND, 0);
00255         g->laststate = THERE();
00256 
00257         /* tidy up loose ends and fill things in */
00258         categorize(p, g);
00259         stripsnug(p, g);
00260         findmust(p, g);
00261         g->nplus = pluscount(p, g);
00262         g->magic = MAGIC2;
00263         preg->re_nsub = g->nsub;
00264         preg->re_g = g;
00265         preg->re_magic = MAGIC1;
00266 #ifndef REDEBUG
00267         /* not debugging, so can't rely on the assert() in regexec() */
00268         if (g->iflags&BAD)
00269                 SETERROR(REG_ASSERT);
00270 #endif
00271 
00272         /* win or lose, we're done */
00273         if (p->error != 0)      /* lose */
00274                 regfree(preg);
00275         return(p->error);
00276 }
00277 
00278 /*
00279  - p_ere - ERE parser top level, concatenation and alternation
00280  == static void p_ere(register struct parse *p, int stop);
00281  */
00282 static void
00283 p_ere(p, stop)
00284 register struct parse *p;
00285 int stop;                       /* character this ERE should end at */
00286 {
00287         register char c;
00288         register sopno prevback;
00289         register sopno prevfwd;
00290         register sopno conc;
00291         register int first = 1;         /* is this the first alternative? */
00292 
00293         for (;;) {
00294                 /* do a bunch of concatenated expressions */
00295                 conc = HERE();
00296                 while (MORE() && (c = PEEK()) != '|' && c != stop)
00297                         p_ere_exp(p);
00298                 REQUIRE(HERE() != conc, REG_EMPTY);     /* require nonempty */
00299 
00300                 if (!EAT('|'))
00301                         break;          /* NOTE BREAK OUT */
00302 
00303                 if (first) {
00304                         INSERT(OCH_, conc);     /* offset is wrong */
00305                         prevfwd = conc;
00306                         prevback = conc;
00307                         first = 0;
00308                 }
00309                 ASTERN(OOR1, prevback);
00310                 prevback = THERE();
00311                 AHEAD(prevfwd);                 /* fix previous offset */
00312                 prevfwd = HERE();
00313                 EMIT(OOR2, 0);                  /* offset is very wrong */
00314         }
00315 
00316         if (!first) {           /* tail-end fixups */
00317                 AHEAD(prevfwd);
00318                 ASTERN(O_CH, prevback);
00319         }
00320 
00321         assert(!MORE() || SEE(stop));
00322 }
00323 
00324 /*
00325  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
00326  == static void p_ere_exp(register struct parse *p);
00327  */
00328 static void
00329 p_ere_exp(p)
00330 register struct parse *p;
00331 {
00332         register char c;
00333         register sopno pos;
00334         register int count;
00335         register int count2;
00336         register sopno subno;
00337         int wascaret = 0;
00338 
00339         assert(MORE());         /* caller should have ensured this */
00340         c = GETNEXT();
00341 
00342         pos = HERE();
00343         switch (c) {
00344         case '(':
00345                 REQUIRE(MORE(), REG_EPAREN);
00346                 p->g->nsub++;
00347                 subno = p->g->nsub;
00348                 if (subno < NPAREN)
00349                         p->pbegin[subno] = HERE();
00350                 EMIT(OLPAREN, subno);
00351                 if (!SEE(')'))
00352                         p_ere(p, ')');
00353                 if (subno < NPAREN) {
00354                         p->pend[subno] = HERE();
00355                         assert(p->pend[subno] != 0);
00356                 }
00357                 EMIT(ORPAREN, subno);
00358                 MUSTEAT(')', REG_EPAREN);
00359                 break;
00360 #ifndef POSIX_MISTAKE
00361         case ')':               /* happens only if no current unmatched ( */
00362                 /*
00363                  * You may ask, why the ifndef?  Because I didn't notice
00364                  * this until slightly too late for 1003.2, and none of the
00365                  * other 1003.2 regular-expression reviewers noticed it at
00366                  * all.  So an unmatched ) is legal POSIX, at least until
00367                  * we can get it fixed.
00368                  */
00369                 SETERROR(REG_EPAREN);
00370                 break;
00371 #endif
00372         case '^':
00373                 EMIT(OBOL, 0);
00374                 p->g->iflags |= USEBOL;
00375                 p->g->nbol++;
00376                 wascaret = 1;
00377                 break;
00378         case '$':
00379                 EMIT(OEOL, 0);
00380                 p->g->iflags |= USEEOL;
00381                 p->g->neol++;
00382                 break;
00383         case '|':
00384                 SETERROR(REG_EMPTY);
00385                 break;
00386         case '*':
00387         case '+':
00388         case '?':
00389                 SETERROR(REG_BADRPT);
00390                 break;
00391         case '.':
00392                 if (p->g->cflags&REG_NEWLINE)
00393                         nonnewline(p);
00394                 else
00395                         EMIT(OANY, 0);
00396                 break;
00397         case '[':
00398                 p_bracket(p);
00399                 break;
00400         case '\\':
00401                 REQUIRE(MORE(), REG_EESCAPE);
00402                 c = GETNEXT();
00403                 ordinary(p, c);
00404                 break;
00405         case '{':               /* okay as ordinary except if digit follows */
00406                 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
00407                 /* FALLTHROUGH */
00408         default:
00409                 ordinary(p, c);
00410                 break;
00411         }
00412 
00413         if (!MORE())
00414                 return;
00415         c = PEEK();
00416         /* we call { a repetition if followed by a digit */
00417         if (!( c == '*' || c == '+' || c == '?' ||
00418                                 (c == '{' && MORE2() && isdigit(PEEK2())) ))
00419                 return;         /* no repetition, we're done */
00420         NEXT();
00421 
00422         REQUIRE(!wascaret, REG_BADRPT);
00423         switch (c) {
00424         case '*':       /* implemented as +? */
00425                 /* this case does not require the (y|) trick, noKLUDGE */
00426                 INSERT(OPLUS_, pos);
00427                 ASTERN(O_PLUS, pos);
00428                 INSERT(OQUEST_, pos);
00429                 ASTERN(O_QUEST, pos);
00430                 break;
00431         case '+':
00432                 INSERT(OPLUS_, pos);
00433                 ASTERN(O_PLUS, pos);
00434                 break;
00435         case '?':
00436                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
00437                 INSERT(OCH_, pos);              /* offset slightly wrong */
00438                 ASTERN(OOR1, pos);              /* this one's right */
00439                 AHEAD(pos);                     /* fix the OCH_ */
00440                 EMIT(OOR2, 0);                  /* offset very wrong... */
00441                 AHEAD(THERE());                 /* ...so fix it */
00442                 ASTERN(O_CH, THERETHERE());
00443                 break;
00444         case '{':
00445                 count = p_count(p);
00446                 if (EAT(',')) {
00447                         if (isdigit(PEEK())) {
00448                                 count2 = p_count(p);
00449                                 REQUIRE(count <= count2, REG_BADBR);
00450                         } else          /* single number with comma */
00451                                 count2 = INFINITY;
00452                 } else          /* just a single number */
00453                         count2 = count;
00454                 repeat(p, pos, count, count2);
00455                 if (!EAT('}')) {        /* error heuristics */
00456                         while (MORE() && PEEK() != '}')
00457                                 NEXT();
00458                         REQUIRE(MORE(), REG_EBRACE);
00459                         SETERROR(REG_BADBR);
00460                 }
00461                 break;
00462         }
00463 
00464         if (!MORE())
00465                 return;
00466         c = PEEK();
00467         if (!( c == '*' || c == '+' || c == '?' ||
00468                                 (c == '{' && MORE2() && isdigit(PEEK2())) ) )
00469                 return;
00470         SETERROR(REG_BADRPT);
00471 }
00472 
00473 /*
00474  - p_str - string (no metacharacters) "parser"
00475  == static void p_str(register struct parse *p);
00476  */
00477 static void
00478 p_str(p)
00479 register struct parse *p;
00480 {
00481         REQUIRE(MORE(), REG_EMPTY);
00482         while (MORE())
00483                 ordinary(p, GETNEXT());
00484 }
00485 
00486 /*
00487  - p_bre - BRE parser top level, anchoring and concatenation
00488  == static void p_bre(register struct parse *p, register int end1, \
00489  ==     register int end2);
00490  * Giving end1 as OUT essentially eliminates the end1/end2 check.
00491  *
00492  * This implementation is a bit of a kludge, in that a trailing $ is first
00493  * taken as an ordinary character and then revised to be an anchor.  The
00494  * only undesirable side effect is that '$' gets included as a character
00495  * category in such cases.  This is fairly harmless; not worth fixing.
00496  * The amount of lookahead needed to avoid this kludge is excessive.
00497  */
00498 static void
00499 p_bre(p, end1, end2)
00500 register struct parse *p;
00501 register int end1;              /* first terminating character */
00502 register int end2;              /* second terminating character */
00503 {
00504         register sopno start = HERE();
00505         register int first = 1;                 /* first subexpression? */
00506         register int wasdollar = 0;
00507 
00508         if (EAT('^')) {
00509                 EMIT(OBOL, 0);
00510                 p->g->iflags |= USEBOL;
00511                 p->g->nbol++;
00512         }
00513         while (MORE() && !SEETWO(end1, end2)) {
00514                 wasdollar = p_simp_re(p, first);
00515                 first = 0;
00516         }
00517         if (wasdollar) {        /* oops, that was a trailing anchor */
00518                 DROP(1);
00519                 EMIT(OEOL, 0);
00520                 p->g->iflags |= USEEOL;
00521                 p->g->neol++;
00522         }
00523 
00524         REQUIRE(HERE() != start, REG_EMPTY);    /* require nonempty */
00525 }
00526 
00527 /*
00528  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
00529  == static int p_simp_re(register struct parse *p, int starordinary);
00530  */
00531 static int                      /* was the simple RE an unbackslashed $? */
00532 p_simp_re(p, starordinary)
00533 register struct parse *p;
00534 int starordinary;               /* is a leading * an ordinary character? */
00535 {
00536         register int c;
00537         register int count;
00538         register int count2;
00539         register sopno pos;
00540         register int i;
00541         register sopno subno;
00542 #       define  BACKSL  (1<<CHAR_BIT)
00543 
00544         pos = HERE();           /* repetion op, if any, covers from here */
00545 
00546         assert(MORE());         /* caller should have ensured this */
00547         c = GETNEXT();
00548         if (c == '\\') {
00549                 REQUIRE(MORE(), REG_EESCAPE);
00550                 c = BACKSL | (unsigned char)GETNEXT();
00551         }
00552         switch (c) {
00553         case '.':
00554                 if (p->g->cflags&REG_NEWLINE)
00555                         nonnewline(p);
00556                 else
00557                         EMIT(OANY, 0);
00558                 break;
00559         case '[':
00560                 p_bracket(p);
00561                 break;
00562         case BACKSL|'{':
00563                 SETERROR(REG_BADRPT);
00564                 break;
00565         case BACKSL|'(':
00566                 p->g->nsub++;
00567                 subno = p->g->nsub;
00568                 if (subno < NPAREN)
00569                         p->pbegin[subno] = HERE();
00570                 EMIT(OLPAREN, subno);
00571                 /* the MORE here is an error heuristic */
00572                 if (MORE() && !SEETWO('\\', ')'))
00573                         p_bre(p, '\\', ')');
00574                 if (subno < NPAREN) {
00575                         p->pend[subno] = HERE();
00576                         assert(p->pend[subno] != 0);
00577                 }
00578                 EMIT(ORPAREN, subno);
00579                 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
00580                 break;
00581         case BACKSL|')':        /* should not get here -- must be user */
00582         case BACKSL|'}':
00583                 SETERROR(REG_EPAREN);
00584                 break;
00585         case BACKSL|'1':
00586         case BACKSL|'2':
00587         case BACKSL|'3':
00588         case BACKSL|'4':
00589         case BACKSL|'5':
00590         case BACKSL|'6':
00591         case BACKSL|'7':
00592         case BACKSL|'8':
00593         case BACKSL|'9':
00594                 i = (c&~BACKSL) - '0';
00595                 assert(i < NPAREN);
00596                 if (p->pend[i] != 0) {
00597                         assert(i <= p->g->nsub);
00598                         EMIT(OBACK_, i);
00599                         assert(p->pbegin[i] != 0);
00600                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
00601                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
00602                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
00603                         EMIT(O_BACK, i);
00604                 } else
00605                         SETERROR(REG_ESUBREG);
00606                 p->g->backrefs = 1;
00607                 break;
00608         case '*':
00609                 REQUIRE(starordinary, REG_BADRPT);
00610                 /* FALLTHROUGH */
00611         default:
00612                 ordinary(p, c &~ BACKSL);
00613                 break;
00614         }
00615 
00616         if (EAT('*')) {         /* implemented as +? */
00617                 /* this case does not require the (y|) trick, noKLUDGE */
00618                 INSERT(OPLUS_, pos);
00619                 ASTERN(O_PLUS, pos);
00620                 INSERT(OQUEST_, pos);
00621                 ASTERN(O_QUEST, pos);
00622         } else if (EATTWO('\\', '{')) {
00623                 count = p_count(p);
00624                 if (EAT(',')) {
00625                         if (MORE() && isdigit(PEEK())) {
00626                                 count2 = p_count(p);
00627                                 REQUIRE(count <= count2, REG_BADBR);
00628                         } else          /* single number with comma */
00629                                 count2 = INFINITY;
00630                 } else          /* just a single number */
00631                         count2 = count;
00632                 repeat(p, pos, count, count2);
00633                 if (!EATTWO('\\', '}')) {       /* error heuristics */
00634                         while (MORE() && !SEETWO('\\', '}'))
00635                                 NEXT();
00636                         REQUIRE(MORE(), REG_EBRACE);
00637                         SETERROR(REG_BADBR);
00638                 }
00639         } else if (c == (unsigned char)'$')     /* $ (but not \$) ends it */
00640                 return(1);
00641 
00642         return(0);
00643 }
00644 
00645 /*
00646  - p_count - parse a repetition count
00647  == static int p_count(register struct parse *p);
00648  */
00649 static int                      /* the value */
00650 p_count(p)
00651 register struct parse *p;
00652 {
00653         register int count = 0;
00654         register int ndigits = 0;
00655 
00656         while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
00657                 count = count*10 + (GETNEXT() - '0');
00658                 ndigits++;
00659         }
00660 
00661         REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
00662         return(count);
00663 }
00664 
00665 /*
00666  - p_bracket - parse a bracketed character list
00667  == static void p_bracket(register struct parse *p);
00668  *
00669  * Note a significant property of this code:  if the allocset() did SETERROR,
00670  * no set operations are done.
00671  */
00672 static void
00673 p_bracket(p)
00674 register struct parse *p;
00675 {
00676         register char c;
00677         register cset *cs = allocset(p);
00678         register int invert = 0;
00679 
00680         /* Dept of Truly Sickening Special-Case Kludges */
00681         if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
00682                 EMIT(OBOW, 0);
00683                 NEXTn(6);
00684                 return;
00685         }
00686         if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
00687                 EMIT(OEOW, 0);
00688                 NEXTn(6);
00689                 return;
00690         }
00691 
00692         if (EAT('^'))
00693                 invert++;       /* make note to invert set at end */
00694         if (EAT(']'))
00695                 CHadd(cs, ']');
00696         else if (EAT('-'))
00697                 CHadd(cs, '-');
00698         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
00699                 p_b_term(p, cs);
00700         if (EAT('-'))
00701                 CHadd(cs, '-');
00702         MUSTEAT(']', REG_EBRACK);
00703 
00704         if (p->error != 0)      /* don't mess things up further */
00705                 return;
00706 
00707         if (p->g->cflags&REG_ICASE) {
00708                 register int i;
00709                 register int ci;
00710 
00711                 for (i = p->g->csetsize - 1; i >= 0; i--)
00712                         if (CHIN(cs, i) && isalpha(i)) {
00713                                 ci = othercase(i);
00714                                 if (ci != i)
00715                                         CHadd(cs, ci);
00716                         }
00717                 if (cs->multis != NULL)
00718                         mccase(p, cs);
00719         }
00720         if (invert) {
00721                 register int i;
00722 
00723                 for (i = p->g->csetsize - 1; i >= 0; i--)
00724                         if (CHIN(cs, i))
00725                                 CHsub(cs, i);
00726                         else
00727                                 CHadd(cs, i);
00728                 if (p->g->cflags&REG_NEWLINE)
00729                         CHsub(cs, '\n');
00730                 if (cs->multis != NULL)
00731                         mcinvert(p, cs);
00732         }
00733 
00734         assert(cs->multis == NULL);             /* xxx */
00735 
00736         if (nch(p, cs) == 1) {          /* optimize singleton sets */
00737                 ordinary(p, firstch(p, cs));
00738                 freeset(p, cs);
00739         } else
00740                 EMIT(OANYOF, freezeset(p, cs));
00741 }
00742 
00743 /*
00744  - p_b_term - parse one term of a bracketed character list
00745  == static void p_b_term(register struct parse *p, register cset *cs);
00746  */
00747 static void
00748 p_b_term(p, cs)
00749 register struct parse *p;
00750 register cset *cs;
00751 {
00752         register char c;
00753         register char start, finish;
00754         register int i;
00755 
00756         /* classify what we've got */
00757         switch ((MORE()) ? PEEK() : '\0') {
00758         case '[':
00759                 c = (MORE2()) ? PEEK2() : '\0';
00760                 break;
00761         case '-':
00762                 SETERROR(REG_ERANGE);
00763                 return;                 /* NOTE RETURN */
00764                 break;
00765         default:
00766                 c = '\0';
00767                 break;
00768         }
00769 
00770         switch (c) {
00771         case ':':               /* character class */
00772                 NEXT2();
00773                 REQUIRE(MORE(), REG_EBRACK);
00774                 c = PEEK();
00775                 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
00776                 p_b_cclass(p, cs);
00777                 REQUIRE(MORE(), REG_EBRACK);
00778                 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
00779                 break;
00780         case '=':               /* equivalence class */
00781                 NEXT2();
00782                 REQUIRE(MORE(), REG_EBRACK);
00783                 c = PEEK();
00784                 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
00785                 p_b_eclass(p, cs);
00786                 REQUIRE(MORE(), REG_EBRACK);
00787                 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
00788                 break;
00789         default:                /* symbol, ordinary character, or range */
00790 /* xxx revision needed for multichar stuff */
00791                 start = p_b_symbol(p);
00792                 if (SEE('-') && MORE2() && PEEK2() != ']') {
00793                         /* range */
00794                         NEXT();
00795                         if (EAT('-'))
00796                                 finish = '-';
00797                         else
00798                                 finish = p_b_symbol(p);
00799                 } else
00800                         finish = start;
00801 /* xxx what about signed chars here... */
00802                 REQUIRE(start <= finish, REG_ERANGE);
00803                 for (i = start; i <= finish; i++)
00804                         CHadd(cs, i);
00805                 break;
00806         }
00807 }
00808 
00809 /*
00810  - p_b_cclass - parse a character-class name and deal with it
00811  == static void p_b_cclass(register struct parse *p, register cset *cs);
00812  */
00813 static void
00814 p_b_cclass(p, cs)
00815 register struct parse *p;
00816 register cset *cs;
00817 {
00818         register char *sp = p->next;
00819         register struct cclass *cp;
00820         register size_t len;
00821         register char *u;
00822         register char c;
00823 
00824         while (MORE() && isalpha(PEEK()))
00825                 NEXT();
00826         len = p->next - sp;
00827         for (cp = cclasses; cp->name != NULL; cp++)
00828                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
00829                         break;
00830         if (cp->name == NULL) {
00831                 /* oops, didn't find it */
00832                 SETERROR(REG_ECTYPE);
00833                 return;
00834         }
00835 
00836         u = cp->chars;
00837         while ((c = *u++) != '\0')
00838                 CHadd(cs, c);
00839         for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
00840                 MCadd(p, cs, u);
00841 }
00842 
00843 /*
00844  - p_b_eclass - parse an equivalence-class name and deal with it
00845  == static void p_b_eclass(register struct parse *p, register cset *cs);
00846  *
00847  * This implementation is incomplete. xxx
00848  */
00849 static void
00850 p_b_eclass(p, cs)
00851 register struct parse *p;
00852 register cset *cs;
00853 {
00854         register char c;
00855 
00856         c = p_b_coll_elem(p, '=');
00857         CHadd(cs, c);
00858 }
00859 
00860 /*
00861  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
00862  == static char p_b_symbol(register struct parse *p);
00863  */
00864 static char                     /* value of symbol */
00865 p_b_symbol(p)
00866 register struct parse *p;
00867 {
00868         register char value;
00869 
00870         REQUIRE(MORE(), REG_EBRACK);
00871         if (!EATTWO('[', '.'))
00872                 return(GETNEXT());
00873 
00874         /* collating symbol */
00875         value = p_b_coll_elem(p, '.');
00876         REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
00877         return(value);
00878 }
00879 
00880 /*
00881  - p_b_coll_elem - parse a collating-element name and look it up
00882  == static char p_b_coll_elem(register struct parse *p, int endc);
00883  */
00884 static char                     /* value of collating element */
00885 p_b_coll_elem(p, endc)
00886 register struct parse *p;
00887 int endc;                       /* name ended by endc,']' */
00888 {
00889         register char *sp = p->next;
00890         register struct cname *cp;
00891         register int len;
00892         register char c;
00893 
00894         while (MORE() && !SEETWO(endc, ']'))
00895                 NEXT();
00896         if (!MORE()) {
00897                 SETERROR(REG_EBRACK);
00898                 return(0);
00899         }
00900         len = p->next - sp;
00901         for (cp = cnames; cp->name != NULL; cp++)
00902                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
00903                         return(cp->code);       /* known name */
00904         if (len == 1)
00905                 return(*sp);    /* single character */
00906         SETERROR(REG_ECOLLATE);                 /* neither */
00907         return(0);
00908 }
00909 
00910 /*
00911  - othercase - return the case counterpart of an alphabetic
00912  == static char othercase(int ch);
00913  */
00914 static char                     /* if no counterpart, return ch */
00915 othercase(ch)
00916 int ch;
00917 {
00918         assert(isalpha(ch));
00919         if (isupper(ch))
00920                 return(tolower(ch));
00921         else if (islower(ch))
00922                 return(toupper(ch));
00923         else                    /* peculiar, but could happen */
00924                 return(ch);
00925 }
00926 
00927 /*
00928  - bothcases - emit a dualcase version of a two-case character
00929  == static void bothcases(register struct parse *p, int ch);
00930  *
00931  * Boy, is this implementation ever a kludge...
00932  */
00933 static void
00934 bothcases(p, ch)
00935 register struct parse *p;
00936 int ch;
00937 {
00938         register char *oldnext = p->next;
00939         register char *oldend = p->end;
00940         char bracket[3];
00941 
00942         assert(othercase(ch) != ch);    /* p_bracket() would recurse */
00943         p->next = bracket;
00944         p->end = bracket+2;
00945         bracket[0] = ch;
00946         bracket[1] = ']';
00947         bracket[2] = '\0';
00948         p_bracket(p);
00949         assert(p->next == bracket+2);
00950         p->next = oldnext;
00951         p->end = oldend;
00952 }
00953 
00954 /*
00955  - ordinary - emit an ordinary character
00956  == static void ordinary(register struct parse *p, register int ch);
00957  */
00958 static void
00959 ordinary(p, ch)
00960 register struct parse *p;
00961 register int ch;
00962 {
00963         register cat_t *cap = p->g->categories;
00964 
00965         if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
00966                 bothcases(p, ch);
00967         else {
00968                 EMIT(OCHAR, (unsigned char)ch);
00969                 if (cap[ch] == 0)
00970                         cap[ch] = p->g->ncategories++;
00971         }
00972 }
00973 
00974 /*
00975  - nonnewline - emit REG_NEWLINE version of OANY
00976  == static void nonnewline(register struct parse *p);
00977  *
00978  * Boy, is this implementation ever a kludge...
00979  */
00980 static void
00981 nonnewline(p)
00982 register struct parse *p;
00983 {
00984         register char *oldnext = p->next;
00985         register char *oldend = p->end;
00986         char bracket[4];
00987 
00988         p->next = bracket;
00989         p->end = bracket+3;
00990         bracket[0] = '^';
00991         bracket[1] = '\n';
00992         bracket[2] = ']';
00993         bracket[3] = '\0';
00994         p_bracket(p);
00995         assert(p->next == bracket+3);
00996         p->next = oldnext;
00997         p->end = oldend;
00998 }
00999 
01000 /*
01001  - repeat - generate code for a bounded repetition, recursively if needed
01002  == static void repeat(register struct parse *p, sopno start, int from, int to);
01003  */
01004 static void
01005 repeat(p, start, from, to)
01006 register struct parse *p;
01007 sopno start;                    /* operand from here to end of strip */
01008 int from;                       /* repeated from this number */
01009 int to;                         /* to this number of times (maybe INFINITY) */
01010 {
01011         register sopno finish = HERE();
01012 #       define  N       2
01013 #       define  INF     3
01014 #       define  REP(f, t)       ((f)*8 + (t))
01015 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
01016         register sopno copy;
01017 
01018         if (p->error != 0)      /* head off possible runaway recursion */
01019                 return;
01020 
01021         assert(from <= to);
01022 
01023         switch (REP(MAP(from), MAP(to))) {
01024         case REP(0, 0):                 /* must be user doing this */
01025                 DROP(finish-start);     /* drop the operand */
01026                 break;
01027         case REP(0, 1):                 /* as x{1,1}? */
01028         case REP(0, N):                 /* as x{1,n}? */
01029         case REP(0, INF):               /* as x{1,}? */
01030                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
01031                 INSERT(OCH_, start);            /* offset is wrong... */
01032                 repeat(p, start+1, 1, to);
01033                 ASTERN(OOR1, start);
01034                 AHEAD(start);                   /* ... fix it */
01035                 EMIT(OOR2, 0);
01036                 AHEAD(THERE());
01037                 ASTERN(O_CH, THERETHERE());
01038                 break;
01039         case REP(1, 1):                 /* trivial case */
01040                 /* done */
01041                 break;
01042         case REP(1, N):                 /* as x?x{1,n-1} */
01043                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
01044                 INSERT(OCH_, start);
01045                 ASTERN(OOR1, start);
01046                 AHEAD(start);
01047                 EMIT(OOR2, 0);                  /* offset very wrong... */
01048                 AHEAD(THERE());                 /* ...so fix it */
01049                 ASTERN(O_CH, THERETHERE());
01050                 copy = dupl(p, start+1, finish+1);
01051                 assert(copy == finish+4);
01052                 repeat(p, copy, 1, to-1);
01053                 break;
01054         case REP(1, INF):               /* as x+ */
01055                 INSERT(OPLUS_, start);
01056                 ASTERN(O_PLUS, start);
01057                 break;
01058         case REP(N, N):                 /* as xx{m-1,n-1} */
01059                 copy = dupl(p, start, finish);
01060                 repeat(p, copy, from-1, to-1);
01061                 break;
01062         case REP(N, INF):               /* as xx{n-1,INF} */
01063                 copy = dupl(p, start, finish);
01064                 repeat(p, copy, from-1, to);
01065                 break;
01066         default:                        /* "can't happen" */
01067                 SETERROR(REG_ASSERT);   /* just in case */
01068                 break;
01069         }
01070 }
01071 
01072 /*
01073  - seterr - set an error condition
01074  == static int seterr(register struct parse *p, int e);
01075  */
01076 static int                      /* useless but makes type checking happy */
01077 seterr(p, e)
01078 register struct parse *p;
01079 int e;
01080 {
01081         if (p->error == 0)      /* keep earliest error condition */
01082                 p->error = e;
01083         p->next = nuls;         /* try to bring things to a halt */
01084         p->end = nuls;
01085         return(0);              /* make the return value well-defined */
01086 }
01087 
01088 /*
01089  - allocset - allocate a set of characters for []
01090  == static cset *allocset(register struct parse *p);
01091  */
01092 static cset *
01093 allocset(p)
01094 register struct parse *p;
01095 {
01096         register int no = p->g->ncsets++;
01097         register size_t nc;
01098         register size_t nbytes;
01099         register cset *cs;
01100         register size_t css = (size_t)p->g->csetsize;
01101         register int i;
01102 
01103         if (no >= p->ncsalloc) {        /* need another column of space */
01104                 p->ncsalloc += CHAR_BIT;
01105                 nc = p->ncsalloc;
01106                 assert(nc % CHAR_BIT == 0);
01107                 nbytes = nc / CHAR_BIT * css;
01108                 if (p->g->sets == NULL)
01109                         p->g->sets = (cset *)malloc(nc * sizeof(cset));
01110                 else
01111                         p->g->sets = (cset *)realloc((char *)p->g->sets,
01112                                                         nc * sizeof(cset));
01113                 if (p->g->setbits == NULL)
01114                         p->g->setbits = (uch *)malloc(nbytes);
01115                 else {
01116                         p->g->setbits = (uch *)realloc((char *)p->g->setbits,
01117                                                                 nbytes);
01118                         /* xxx this isn't right if setbits is now NULL */
01119                         for (i = 0; i < no; i++)
01120                                 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
01121                 }
01122                 if (p->g->sets != NULL && p->g->setbits != NULL)
01123                         (void) memset((char *)p->g->setbits + (nbytes - css),
01124                                                                 0, css);
01125                 else {
01126                         no = 0;
01127                         SETERROR(REG_ESPACE);
01128                         /* caller's responsibility not to do set ops */
01129                 }
01130         }
01131 
01132         assert(p->g->sets != NULL);     /* xxx */
01133         cs = &p->g->sets[no];
01134         cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
01135         cs->mask = 1 << ((no) % CHAR_BIT);
01136         cs->hash = 0;
01137         cs->smultis = 0;
01138         cs->multis = NULL;
01139 
01140         return(cs);
01141 }
01142 
01143 /*
01144  - freeset - free a now-unused set
01145  == static void freeset(register struct parse *p, register cset *cs);
01146  */
01147 static void
01148 freeset(p, cs)
01149 register struct parse *p;
01150 register cset *cs;
01151 {
01152         register int i;
01153         register cset *top = &p->g->sets[p->g->ncsets];
01154         register size_t css = (size_t)p->g->csetsize;
01155 
01156         for (i = 0; i < css; i++)
01157                 CHsub(cs, i);
01158         if (cs == top-1)        /* recover only the easy case */
01159                 p->g->ncsets--;
01160 }
01161 
01162 /*
01163  - freezeset - final processing on a set of characters
01164  == static int freezeset(register struct parse *p, register cset *cs);
01165  *
01166  * The main task here is merging identical sets.  This is usually a waste
01167  * of time (although the hash code minimizes the overhead), but can win
01168  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
01169  * is done using addition rather than xor -- all ASCII [aA] sets xor to
01170  * the same value!
01171  */
01172 static int                      /* set number */
01173 freezeset(p, cs)
01174 register struct parse *p;
01175 register cset *cs;
01176 {
01177         register uch h = cs->hash;
01178         register int i;
01179         register cset *top = &p->g->sets[p->g->ncsets];
01180         register cset *cs2;
01181         register size_t css = (size_t)p->g->csetsize;
01182 
01183         /* look for an earlier one which is the same */
01184         for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
01185                 if (cs2->hash == h && cs2 != cs) {
01186                         /* maybe */
01187                         for (i = 0; i < css; i++)
01188                                 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
01189                                         break;          /* no */
01190                         if (i == css)
01191                                 break;                  /* yes */
01192                 }
01193 
01194         if (cs2 < top) {        /* found one */
01195                 freeset(p, cs);
01196                 cs = cs2;
01197         }
01198 
01199         return((int)(cs - p->g->sets));
01200 }
01201 
01202 /*
01203  - firstch - return first character in a set (which must have at least one)
01204  == static int firstch(register struct parse *p, register cset *cs);
01205  */
01206 static int                      /* character; there is no "none" value */
01207 firstch(p, cs)
01208 register struct parse *p;
01209 register cset *cs;
01210 {
01211         register int i;
01212         register size_t css = (size_t)p->g->csetsize;
01213 
01214         for (i = 0; i < css; i++)
01215                 if (CHIN(cs, i))
01216                         return((char)i);
01217         assert(never);
01218         return(0);              /* arbitrary */
01219 }
01220 
01221 /*
01222  - nch - number of characters in a set
01223  == static int nch(register struct parse *p, register cset *cs);
01224  */
01225 static int
01226 nch(p, cs)
01227 register struct parse *p;
01228 register cset *cs;
01229 {
01230         register int i;
01231         register size_t css = (size_t)p->g->csetsize;
01232         register int n = 0;
01233 
01234         for (i = 0; i < css; i++)
01235                 if (CHIN(cs, i))
01236                         n++;
01237         return(n);
01238 }
01239 
01240 /*
01241  - mcadd - add a collating element to a cset
01242  == static void mcadd(register struct parse *p, register cset *cs, \
01243  ==     register char *cp);
01244  */
01245 static void
01246 mcadd(p, cs, cp)
01247 register struct parse *p;
01248 register cset *cs;
01249 register char *cp;
01250 {
01251         register size_t oldend = cs->smultis;
01252 
01253         cs->smultis += strlen(cp) + 1;
01254         if (cs->multis == NULL)
01255                 cs->multis = malloc(cs->smultis);
01256         else
01257                 cs->multis = realloc(cs->multis, cs->smultis);
01258         if (cs->multis == NULL) {
01259                 SETERROR(REG_ESPACE);
01260                 return;
01261         }
01262 
01263         (void) strcpy(cs->multis + oldend - 1, cp);
01264         cs->multis[cs->smultis - 1] = '\0';
01265 }
01266 
01267 /*
01268  - mcsub - subtract a collating element from a cset
01269  == static void mcsub(register cset *cs, register char *cp);
01270  */
01271 static void
01272 mcsub(cs, cp)
01273 register cset *cs;
01274 register char *cp;
01275 {
01276         register char *fp = mcfind(cs, cp);
01277         register size_t len = strlen(fp);
01278 
01279         assert(fp != NULL);
01280         (void) memmove(fp, fp + len + 1,
01281                                 cs->smultis - (fp + len + 1 - cs->multis));
01282         cs->smultis -= len;
01283 
01284         if (cs->smultis == 0) {
01285                 free(cs->multis);
01286                 cs->multis = NULL;
01287                 return;
01288         }
01289 
01290         cs->multis = realloc(cs->multis, cs->smultis);
01291         assert(cs->multis != NULL);
01292 }
01293 
01294 /*
01295  - mcin - is a collating element in a cset?
01296  == static int mcin(register cset *cs, register char *cp);
01297  */
01298 static int
01299 mcin(cs, cp)
01300 register cset *cs;
01301 register char *cp;
01302 {
01303         return(mcfind(cs, cp) != NULL);
01304 }
01305 
01306 /*
01307  - mcfind - find a collating element in a cset
01308  == static char *mcfind(register cset *cs, register char *cp);
01309  */
01310 static char *
01311 mcfind(cs, cp)
01312 register cset *cs;
01313 register char *cp;
01314 {
01315         register char *p;
01316 
01317         if (cs->multis == NULL)
01318                 return(NULL);
01319         for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
01320                 if (strcmp(cp, p) == 0)
01321                         return(p);
01322         return(NULL);
01323 }
01324 
01325 /*
01326  - mcinvert - invert the list of collating elements in a cset
01327  == static void mcinvert(register struct parse *p, register cset *cs);
01328  *
01329  * This would have to know the set of possibilities.  Implementation
01330  * is deferred.
01331  */
01332 static void
01333 mcinvert(p, cs)
01334 register struct parse *p;
01335 register cset *cs;
01336 {
01337         assert(cs->multis == NULL);     /* xxx */
01338 }
01339 
01340 /*
01341  - mccase - add case counterparts of the list of collating elements in a cset
01342  == static void mccase(register struct parse *p, register cset *cs);
01343  *
01344  * This would have to know the set of possibilities.  Implementation
01345  * is deferred.
01346  */
01347 static void
01348 mccase(p, cs)
01349 register struct parse *p;
01350 register cset *cs;
01351 {
01352         assert(cs->multis == NULL);     /* xxx */
01353 }
01354 
01355 /*
01356  - isinsets - is this character in any sets?
01357  == static int isinsets(register struct re_guts *g, int c);
01358  */
01359 static int                      /* predicate */
01360 isinsets(g, c)
01361 register struct re_guts *g;
01362 int c;
01363 {
01364         register uch *col;
01365         register int i;
01366         register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
01367         register unsigned uc = (unsigned char)c;
01368 
01369         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
01370                 if (col[uc] != 0)
01371                         return(1);
01372         return(0);
01373 }
01374 
01375 /*
01376  - samesets - are these two characters in exactly the same sets?
01377  == static int samesets(register struct re_guts *g, int c1, int c2);
01378  */
01379 static int                      /* predicate */
01380 samesets(g, c1, c2)
01381 register struct re_guts *g;
01382 int c1;
01383 int c2;
01384 {
01385         register uch *col;
01386         register int i;
01387         register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
01388         register unsigned uc1 = (unsigned char)c1;
01389         register unsigned uc2 = (unsigned char)c2;
01390 
01391         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
01392                 if (col[uc1] != col[uc2])
01393                         return(0);
01394         return(1);
01395 }
01396 
01397 /*
01398  - categorize - sort out character categories
01399  == static void categorize(struct parse *p, register struct re_guts *g);
01400  */
01401 static void
01402 categorize(p, g)
01403 struct parse *p;
01404 register struct re_guts *g;
01405 {
01406         register cat_t *cats = g->categories;
01407         register int c;
01408         register int c2;
01409         register cat_t cat;
01410 
01411         /* avoid making error situations worse */
01412         if (p->error != 0)
01413                 return;
01414 
01415         for (c = CHAR_MIN; c <= CHAR_MAX; c++)
01416                 if (cats[c] == 0 && isinsets(g, c)) {
01417                         cat = g->ncategories++;
01418                         cats[c] = cat;
01419                         for (c2 = c+1; c2 <= CHAR_MAX; c2++)
01420                                 if (cats[c2] == 0 && samesets(g, c, c2))
01421                                         cats[c2] = cat;
01422                 }
01423 }
01424 
01425 /*
01426  - dupl - emit a duplicate of a bunch of sops
01427  == static sopno dupl(register struct parse *p, sopno start, sopno finish);
01428  */
01429 static sopno                    /* start of duplicate */
01430 dupl(p, start, finish)
01431 register struct parse *p;
01432 sopno start;                    /* from here */
01433 sopno finish;                   /* to this less one */
01434 {
01435         register sopno ret = HERE();
01436         register sopno len = finish - start;
01437 
01438         assert(finish >= start);
01439         if (len == 0)
01440                 return(ret);
01441         enlarge(p, p->ssize + len);     /* this many unexpected additions */
01442         assert(p->ssize >= p->slen + len);
01443         (void) memcpy((char *)(p->strip + p->slen),
01444                 (char *)(p->strip + start), (size_t)len*sizeof(sop));
01445         p->slen += len;
01446         return(ret);
01447 }
01448 
01449 /*
01450  - doemit - emit a strip operator
01451  == static void doemit(register struct parse *p, sop op, size_t opnd);
01452  *
01453  * It might seem better to implement this as a macro with a function as
01454  * hard-case backup, but it's just too big and messy unless there are
01455  * some changes to the data structures.  Maybe later.
01456  */
01457 static void
01458 doemit(p, op, opnd)
01459 register struct parse *p;
01460 sop op;
01461 size_t opnd;
01462 {
01463         /* avoid making error situations worse */
01464         if (p->error != 0)
01465                 return;
01466 
01467         /* deal with oversize operands ("can't happen", more or less) */
01468         assert(opnd < 1<<OPSHIFT);
01469 
01470         /* deal with undersized strip */
01471         if (p->slen >= p->ssize)
01472                 enlarge(p, (p->ssize+1) / 2 * 3);       /* +50% */
01473         assert(p->slen < p->ssize);
01474 
01475         /* finally, it's all reduced to the easy case */
01476         p->strip[p->slen++] = SOP(op, opnd);
01477 }
01478 
01479 /*
01480  - doinsert - insert a sop into the strip
01481  == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
01482  */
01483 static void
01484 doinsert(p, op, opnd, pos)
01485 register struct parse *p;
01486 sop op;
01487 size_t opnd;
01488 sopno pos;
01489 {
01490         register sopno sn;
01491         register sop s;
01492         register int i;
01493 
01494         /* avoid making error situations worse */
01495         if (p->error != 0)
01496                 return;
01497 
01498         sn = HERE();
01499         EMIT(op, opnd);         /* do checks, ensure space */
01500         assert(HERE() == sn+1);
01501         s = p->strip[sn];
01502 
01503         /* adjust paren pointers */
01504         assert(pos > 0);
01505         for (i = 1; i < NPAREN; i++) {
01506                 if (p->pbegin[i] >= pos) {
01507                         p->pbegin[i]++;
01508                 }
01509                 if (p->pend[i] >= pos) {
01510                         p->pend[i]++;
01511                 }
01512         }
01513 
01514         memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
01515                                                 (HERE()-pos-1)*sizeof(sop));
01516         p->strip[pos] = s;
01517 }
01518 
01519 /*
01520  - dofwd - complete a forward reference
01521  == static void dofwd(register struct parse *p, sopno pos, sop value);
01522  */
01523 static void
01524 dofwd(p, pos, value)
01525 register struct parse *p;
01526 register sopno pos;
01527 sop value;
01528 {
01529         /* avoid making error situations worse */
01530         if (p->error != 0)
01531                 return;
01532 
01533         assert(value < 1<<OPSHIFT);
01534         p->strip[pos] = OP(p->strip[pos]) | value;
01535 }
01536 
01537 /*
01538  - enlarge - enlarge the strip
01539  == static void enlarge(register struct parse *p, sopno size);
01540  */
01541 static void
01542 enlarge(p, size)
01543 register struct parse *p;
01544 register sopno size;
01545 {
01546         register sop *sp;
01547 
01548         if (p->ssize >= size)
01549                 return;
01550 
01551         sp = (sop *)realloc(p->strip, size*sizeof(sop));
01552         if (sp == NULL) {
01553                 SETERROR(REG_ESPACE);
01554                 return;
01555         }
01556         p->strip = sp;
01557         p->ssize = size;
01558 }
01559 
01560 /*
01561  - stripsnug - compact the strip
01562  == static void stripsnug(register struct parse *p, register struct re_guts *g);
01563  */
01564 static void
01565 stripsnug(p, g)
01566 register struct parse *p;
01567 register struct re_guts *g;
01568 {
01569         g->nstates = p->slen;
01570         g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
01571         if (g->strip == NULL) {
01572                 SETERROR(REG_ESPACE);
01573                 g->strip = p->strip;
01574         }
01575 }
01576 
01577 /*
01578  - findmust - fill in must and mlen with longest mandatory literal string
01579  == static void findmust(register struct parse *p, register struct re_guts *g);
01580  *
01581  * This algorithm could do fancy things like analyzing the operands of |
01582  * for common subsequences.  Someday.  This code is simple and finds most
01583  * of the interesting cases.
01584  *
01585  * Note that must and mlen got initialized during setup.
01586  */
01587 static void
01588 findmust(p, g)
01589 struct parse *p;
01590 register struct re_guts *g;
01591 {
01592         register sop *scan;
01593         sop *start;
01594         register sop *newstart;
01595         register sopno newlen;
01596         register sop s;
01597         register char *cp;
01598         register sopno i;
01599 
01600         /* avoid making error situations worse */
01601         if (p->error != 0)
01602                 return;
01603 
01604         /* find the longest OCHAR sequence in strip */
01605         newlen = 0;
01606         scan = g->strip + 1;
01607         do {
01608                 s = *scan++;
01609                 switch (OP(s)) {
01610                 case OCHAR:             /* sequence member */
01611                         if (newlen == 0)                /* new sequence */
01612                                 newstart = scan - 1;
01613                         newlen++;
01614                         break;
01615                 case OPLUS_:            /* things that don't break one */
01616                 case OLPAREN:
01617                 case ORPAREN:
01618                         break;
01619                 case OQUEST_:           /* things that must be skipped */
01620                 case OCH_:
01621                         scan--;
01622                         do {
01623                                 scan += OPND(s);
01624                                 s = *scan;
01625                                 /* assert() interferes w debug printouts */
01626                                 if (OP(s) != O_QUEST && OP(s) != O_CH &&
01627                                                         OP(s) != OOR2) {
01628                                         g->iflags |= BAD;
01629                                         return;
01630                                 }
01631                         } while (OP(s) != O_QUEST && OP(s) != O_CH);
01632                         /* fallthrough */
01633                 default:                /* things that break a sequence */
01634                         if (newlen > g->mlen) {         /* ends one */
01635                                 start = newstart;
01636                                 g->mlen = newlen;
01637                         }
01638                         newlen = 0;
01639                         break;
01640                 }
01641         } while (OP(s) != OEND);
01642 
01643         if (g->mlen == 0)               /* there isn't one */
01644                 return;
01645 
01646         /* turn it into a character string */
01647         g->must = malloc((size_t)g->mlen + 1);
01648         if (g->must == NULL) {          /* argh; just forget it */
01649                 g->mlen = 0;
01650                 return;
01651         }
01652         cp = g->must;
01653         scan = start;
01654         for (i = g->mlen; i > 0; i--) {
01655                 while (OP(s = *scan++) != OCHAR)
01656                         continue;
01657                 assert(cp < g->must + g->mlen);
01658                 *cp++ = (char)OPND(s);
01659         }
01660         assert(cp == g->must + g->mlen);
01661         *cp++ = '\0';           /* just on general principles */
01662 }
01663 
01664 /*
01665  - pluscount - count + nesting
01666  == static sopno pluscount(register struct parse *p, register struct re_guts *g);
01667  */
01668 static sopno                    /* nesting depth */
01669 pluscount(p, g)
01670 struct parse *p;
01671 register struct re_guts *g;
01672 {
01673         register sop *scan;
01674         register sop s;
01675         register sopno plusnest = 0;
01676         register sopno maxnest = 0;
01677 
01678         if (p->error != 0)
01679                 return(0);      /* there may not be an OEND */
01680 
01681         scan = g->strip + 1;
01682         do {
01683                 s = *scan++;
01684                 switch (OP(s)) {
01685                 case OPLUS_:
01686                         plusnest++;
01687                         break;
01688                 case O_PLUS:
01689                         if (plusnest > maxnest)
01690                                 maxnest = plusnest;
01691                         plusnest--;
01692                         break;
01693                 }
01694         } while (OP(s) != OEND);
01695         if (plusnest != 0)
01696                 g->iflags |= BAD;
01697         return(maxnest);
01698 }

Generated on Sun Mar 12 23:56:30 2006 for ScriptBasic by  doxygen 1.4.6-NO