windows-nt/Source/XPSP1/NT/sdktools/qgrep/qmatch.c
2020-09-26 16:20:57 +08:00

736 lines
33 KiB
C

/*static char *SCCSID = "@(#)qmatch.c 13.7 90/08/13";*/
#include <stdio.h>
#include <ctype.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
/*
 * Sizes and limits.  ASCLEN/8 is the byte size of every character-class
 * bit vector used in this file (one bit per ASCII code).
 */
#define ASCLEN 128 /* Number of ascii characters */
#define BUFLEN 256 /* Temporary buffer length */
#define EOS ('\r') /* End of string character */
#define PATMAX 512 /* Maximum parsed pattern length */
/*
 * Bits tested against the global `flags' word (and addexpr's local copy).
 */
#define BEGLINE 0x08 /* Match at beginning of line */
#define DEBUG 0x20 /* Print debugging output */
#define ENDLINE 0x10 /* Match at end of line */
/*
 * Token types of the parsed pattern produced by exprparse():
 *   T_STRING  is followed by a length byte and that many characters,
 *   T_SINGLE  is followed by the one character to match,
 *   T_CLASS   is followed by an ASCLEN/8-byte bit vector,
 *   T_ANY     has no operand,
 *   T_STAR    is a prefix placed in front of T_SINGLE, T_CLASS or T_ANY,
 *   T_END     terminates the token stream.
 */
#define T_END 0 /* End of expression */
#define T_STRING 1 /* String to match */
#define T_SINGLE 2 /* Single character to match */
#define T_CLASS 3 /* Class to match */
#define T_ANY 4 /* Match any character */
#define T_STAR 5 /* *-expr */
/*
 * EXPR - one link in a singly linked list of parsed patterns.
 * addexpr() pushes nodes onto the lists in stringlist[]; findexpr()
 * walks them through ex_next and hands ex_pattern to exprmatch().
 */
typedef struct exprnode EXPR;

struct exprnode
{
    struct exprnode *ex_next;       /* Following node, NULL at list end */
    unsigned char   *ex_pattern;    /* Parsed pattern this node refers to */
};
/*
 * Next unused slot in stringlist[].  Slot 0 is never handed out because
 * a transtab[] entry of 0 is how addexpr()/findexpr() recognise
 * "no list for this character".
 */
static int clists = 1; /* One is first available index */
/*
 * Size in bytes, type byte included, of each token that may follow a
 * T_STAR prefix.  match() indexes this by token type to step over the
 * repeated token.  Types that cannot be repeated carry the value 32767.
 */
static int toklen[] = /* Table of token lengths */
{
32767, /* T_END: invalid */
32767, /* T_STRING: invalid */
2, /* T_SINGLE */
ASCLEN/8+1, /* T_CLASS */
1, /* T_ANY */
32767 /* T_STAR: invalid */
};
/* Comparison routine for T_STRING tokens; exprmatch() assigns it on every call. */
int ( __cdecl *ncmp)(const char *,const char *,size_t);
/* String comparison pointer */
unsigned char *exprparse(unsigned char *p);
extern int casesen; /* Case-sensitivity flag */
extern char *(*find)(); /* Pointer to search function */
extern int flags; /* Flags */
extern int strcnt; /* String count */
/* Maps a character to an index into stringlist[]; 0 means no list. */
extern char transtab[]; /* Translation table */
/* Heads of the pattern lists, selected through transtab[]. */
EXPR *stringlist[ASCLEN];
/* String table */
void addexpr(char *e, int n); /* Add expression */
char *get1stcharset(unsigned char *e, char *bitvec);
extern char *alloc(); /* User-defined heap allocator */
unsigned char *simpleprefix();/* Match simple prefix */
char *strnupr(); /* See QGREPSUB.ASM */
/*
 * simpleprefix - match the star-free leading part of a parsed pattern.
 *
 * Tokens are consumed from *pp until T_END or T_STAR is reached, and each
 * one is compared against the text at s.
 *
 * s   text to match
 * pp  in: first token to match; out (success only): the T_END or T_STAR
 *     token at which matching stopped
 *
 * Returns the text position just past the matched prefix, or NULL when
 * some token does not match (in that case *pp is left untouched).
 */
unsigned char *simpleprefix(s,pp)
register unsigned char *s; /* String pointer */
unsigned char **pp; /* Pointer to pattern pointer */
{
    register unsigned char *pat;    /* Cursor in the token stream */
    register int ch;                /* Text character under test */
    unsigned char tok;              /* Current token type */

    pat = *pp;
    for(;;)
    {
        tok = *pat;
        if(tok == T_END || tok == T_STAR) break;
        ++pat;                      /* Step onto the token operand */
        if(tok == T_STRING)
        {
            /* Operand is a length byte followed by the characters */
            if((*ncmp)(s,pat + 1,*pat) != 0) return(NULL);
            s += *pat;
            pat += *pat + 1;
        }
        else if(tok == T_SINGLE)
        {
            ch = *s++;
            if(!casesen) ch = toupper(ch);
            if(ch != (int)*pat++) return(NULL);
        }
        else if(tok == T_CLASS)
        {
            /* Only ASCII text can be a member of a class bit vector */
            if(!isascii(*s) || !(pat[*s >> 3] & (1 << (*s & 7)))) return(NULL);
            pat += ASCLEN/8;
            ++s;
        }
        else if(tok == T_ANY)
        {
            /* Anything but the end-of-string marker */
            if(*s++ == EOS) return(NULL);
        }
    }
    *pp = pat;
    return(s);
}
/*
 * match - test whether parsed pattern p matches the text starting at s.
 *
 * Any star-free prefix is handled by simpleprefix().  What follows is
 * either T_END (success) or a T_STAR prefix with its repeated token.  The
 * remainder of the pattern is tried first, and one more repetition is
 * consumed only after that attempt fails.
 *
 * Returns 1 on a match and 0 otherwise.
 */
int match(s,p)
register unsigned char *s; /* String to match */
unsigned char *p; /* Pattern to match against */
{
    register unsigned char *operand;    /* Operand of the repeated token */
    unsigned char *rest;                /* Pattern after the repeated token */
    register int ch;                    /* Text character under test */
    unsigned char type;                 /* Type of the repeated token */

    if(*p != T_END && *p != T_STAR)
    {
        s = simpleprefix(s,&p);
        if(s == NULL) return(0);
    }
    if(*p == T_END) return(1);
    ++p;                                /* Step over T_STAR */
    type = *p;
    rest = p + toklen[type];
    operand = p + 1;

    if(type == T_ANY)
    {
        for(;;)
        {
            if(match(s,rest) != 0) return(1);
            if(*s++ == EOS) return(0);  /* '.' never consumes EOS */
        }
    }
    if(type == T_SINGLE)
    {
        for(;;)
        {
            if(match(s,rest) != 0) return(1);
            ch = *s++;
            if(!casesen) ch = toupper(ch);
            if((unsigned char) ch != *operand) return(0);
        }
    }
    if(type == T_CLASS)
    {
        for(;;)
        {
            if(match(s,rest) != 0) return(1);
            if(!isascii(*s) || !(operand[*s >> 3] & (1 << (*s & 7)))) return(0);
            ++s;
        }
    }
    return(0);                          /* Token cannot be repeated */
}
/*
 * exprmatch - entry point used by findexpr() to test one pattern.
 *
 * Chooses the T_STRING comparison routine according to casesen and then
 * runs match().  Returns whatever match() returns.
 */
int exprmatch(s,p)
char *s; /* String */
char *p; /* Pattern */
{
    if(casesen)
    {
        ncmp = strncmp;         /* Exact comparison */
    }
    else
    {
        ncmp = _strnicmp;       /* Comparison ignoring case */
    }
    return(match(s,p));
}
/*
 * bitset - give bits first..last (inclusive) of a bit vector the value
 * bitval.
 *
 * Bit k is stored in byte k >> 3 at position k & 7.  Nothing is written
 * when first > last.  No bounds check is made: the vector has to cover
 * bit `last'.
 */
void bitset(bitvec,first,last,bitval)
char *bitvec; /* Bit vector */
int first; /* First character */
int last; /* Last character */
int bitval; /* Bit value (0 or 1) */
{
    char *cur;              /* Byte holding bit `first' */
    int bit;                /* Position of bit `first' inside *cur */
    unsigned char byte;     /* Working copy of *cur */

    cur = bitvec + (first >> 3);
    bit = first & 7;
    while(first <= last)
    {
        if(bit == 0 && first + 8 <= last)
        {
            /* Byte aligned with a whole byte still to do: store it at once */
            *cur++ = (char)(bitval? '\xFF': '\0');
            first += 8;
        }
        else
        {
            /* Rewrite one bit, leaving its neighbours alone */
            byte = (unsigned char)*cur;
            byte &= (unsigned char)~(1 << bit);
            byte |= (unsigned char)(bitval << bit);
            *cur = (char)byte;
            bit = (bit + 1) & 7;
            if(bit == 0) ++cur;     /* Wrapped into the next byte */
            ++first;
        }
    }
}
/*
 * exprparse - translate a raw, NUL-terminated pattern into a token stream.
 *
 * Output layout:
 *   an optional literal '^' byte copied from the start of the pattern,
 *   then a sequence of tokens
 *     T_SINGLE c            one character
 *     T_STRING len c...     `len' characters (len fits in one byte)
 *     T_ANY                 '.'
 *     T_CLASS vector        '[...]', vector is ASCLEN/8 bytes of bits
 *     T_STAR <token>        '*' applied to T_SINGLE, T_ANY or T_CLASS
 *   and a closing T_END.
 *
 * When casesen is zero, strnupr() is first applied to the caller's buffer
 * (the pattern is modified in place).
 *
 * Returns a copy of the token stream obtained from alloc(), or NULL when
 * the pattern is rejected:
 *   - a backslash is the last character,
 *   - '*' has nothing to repeat,
 *   - a '[' is never closed,
 *   - the token stream would not fit in PATMAX bytes,
 *   - an ordinary string is longer than a length byte can describe.
 *
 * The last two checks are deliberate: the token stream is built in a
 * PATMAX-byte stack buffer and a class costs ASCLEN/8 + 1 bytes for as
 * little as two pattern characters, so short patterns can need more than
 * PATMAX bytes.  Class members outside the ASCII range are ignored because
 * the class vector only has ASCLEN bits (and the matcher never tests a
 * non-ASCII character against a class).
 */
unsigned char *exprparse(p)
register unsigned char *p; /* Raw pattern */
{
    register char *cp;      /* Output cursor in buffer */
    unsigned char *cp2;     /* String length byte, later the result */
    int i;                  /* Index of last backslash / range start */
    int j;                  /* Range end */
    int m;                  /* Scratch: range clamp / string length */
    int n;                  /* Length of ordinary run / range clamp */
    int bitval;             /* Value written into the class vector */
    char buffer[PATMAX];    /* Token stream under construction */
    char *lim;              /* Tokens must end at or before this byte */

    lim = buffer + PATMAX - 1;          /* Final byte is kept for T_END */
    if(!casesen) strnupr(p,strlen(p));  /* Force pattern to upper case */
    cp = buffer;
    if(*p == '^') *cp++ = *p++;         /* Copy leading caret if any */
    while(*p != '\0')
    {
        /*
         * Delimit the ordinary characters in front of the next special
         * one.  i remembers where the last backslash of the run is.
         */
        i = -2;
        for(n = 0;;)
        {
            n += strcspn(p + n,".\\[*");
            if(p[n] != '\\') break;
            i = n;
            if(p[++n] == '\0') return(NULL);    /* Backslash at very end */
            ++n;                                /* Skip escaped character */
        }
        if(p[n] == '*')
        {
            /* The character in front of '*' belongs to the *-expr. */
            if(n-- == 0) return(NULL);          /* Nothing to repeat */
            if(i == n - 1) n = i;               /* It was escaped: drop '\' too */
        }
        if(n > 0)
        {
            if(n == 1 || (n == 2 && *p == '\\'))
            {
                if(lim - cp < 2) return(NULL);  /* Type + character */
                *cp++ = T_SINGLE;
                if(*p == '\\') ++p;
                *cp++ = *p++;
            }
            else
            {
                if(lim - cp < 2) return(NULL);  /* Type + length byte */
                *cp++ = T_STRING;
                cp2 = (unsigned char *)cp++;    /* Length is filled in below */
                while(n-- > 0)
                {
                    if(*p == '\\')
                    {
                        ++p;                    /* Skip escape */
                        --n;                    /* It took one of the n bytes */
                    }
                    if(cp >= lim) return(NULL); /* Token stream is full */
                    *cp++ = *p++;
                }
                m = (int)(cp - (char *)cp2) - 1;
                if(m > 255) return(NULL);       /* Too long for a length byte */
                *cp2 = (unsigned char)m;
            }
        }
        if(*p == '\0') break;
        if(*p == '.')
        {
            if(*++p == '*')
            {
                if(cp >= lim) return(NULL);
                ++p;
                *cp++ = T_STAR;                 /* Prefix ahead of token */
            }
            if(cp >= lim) return(NULL);
            *cp++ = T_ANY;
            continue;
        }
        if(*p == '[')
        {
            if(*++p == '\0') return(NULL);      /* Skip '[' */
            if(lim - cp < ASCLEN/8 + 1) return(NULL);
                                                /* Type + bit vector */
            *cp++ = T_CLASS;
            memset(cp,'\0',ASCLEN/8);
            bitval = 1;                         /* Members set bits */
            if(*p == '^')
            {
                /* Inverted class: start full, members clear bits */
                ++p;
                memset(cp,'\xFF',ASCLEN/8);
                bitset(cp,EOS,EOS,0);           /* Never end-of-string */
                bitset(cp,'\n','\n',0);         /* Never linefeed */
                bitval = 0;
            }
            while(*p != ']')
            {
                if(*p == '\0') return(NULL);    /* Unterminated class */
                if(*p == '\\')
                {
                    if(*++p == '\0') return(NULL);
                }
                i = *p++;                       /* First character of range */
                if(*p == '-' && p[1] != '\0' && p[1] != ']')
                {
                    ++p;                        /* Skip hyphen */
                    if(*p == '\\' && p[1] != '\0') ++p;
                    j = *p++;                   /* End of range */
                }
                else j = i;                     /* Single member */
                /* Keep the write inside the ASCLEN-bit vector */
                if(i < ASCLEN) bitset(cp,i,(j < ASCLEN)? j: ASCLEN - 1,bitval);
                if(!casesen)
                {
                    /* Apply the same change to the other letter case */
                    m = (i < 'A')? 'A': i;      /* m = max(i,'A') */
                    n = (j > 'Z')? 'Z': j;      /* n = min(j,'Z') */
                    if(m <= n) bitset(cp,tolower(m),tolower(n),bitval);
                    m = (i < 'a')? 'a': i;      /* m = max(i,'a') */
                    n = (j > 'z')? 'z': j;      /* n = min(j,'z') */
                    if(m <= n) bitset(cp,toupper(m),toupper(n),bitval);
                }
            }
            if(*++p == '*')
            {
                /* Make room for T_STAR in front of the finished class */
                if(lim - cp < ASCLEN/8 + 1) return(NULL);
                memmove(cp,cp - 1,ASCLEN/8 + 1);
                cp[-1] = T_STAR;
                ++cp;                           /* Back onto the vector */
                ++p;                            /* Skip star */
            }
            cp += ASCLEN/8;                     /* Skip over vector */
            continue;
        }
        /* Only "c*" or "\c*" can be left at this point */
        if(lim - cp < 3) return(NULL);
        *cp++ = T_STAR;
        *cp++ = T_SINGLE;
        if(*p == '\\') ++p;
        *cp++ = *p++;
        assert(*p == '*');
        ++p;
    }
    *cp++ = T_END;                              /* Always fits, see lim */
    cp2 = (unsigned char *)alloc(cp - buffer);
    memmove(cp2,buffer,(size_t)(cp - buffer));
    return(cp2);
}
/*
 * istoken - does the n-byte expression at s begin with "\<" or end with
 * an unescaped-backslash "\>"?
 *
 * Backslash pairs are skipped as units while scanning, so in "\\>" the
 * second backslash is an escaped character and the result is 0.
 *
 * Returns 1 for a token expression and 0 otherwise.
 */
int istoken(s,n)
unsigned char *s; /* String */
int n; /* Length */
{
    int pos;    /* Scan position */

    if(n >= 2 && s[0] == '\\' && s[1] == '<') return(1);
    pos = 0;
    while(pos < n)
    {
        if(s[pos] != '\\')
        {
            ++pos;
            continue;
        }
        /* Backslash found: a token only if '>' is the final character */
        if(pos + 2 == n && s[pos + 1] == '>') return(1);
        pos += 2;                   /* Skip escape and escaped character */
    }
    return(0);
}
/*
 * isexpr - decide whether the n-byte string at s needs expression
 * matching.
 *
 * Token expressions (see istoken) always qualify.  Otherwise the string
 * is parsed: a leading '^' or an empty parse qualifies, and so does the
 * presence of a T_STAR, T_CLASS or T_ANY token.  A string that fails to
 * parse, or that parses into T_SINGLE/T_STRING tokens only, does not.
 *
 * Returns 1 for an expression and 0 otherwise.
 *
 * NOTE(review): the string is copied into a BUFLEN-byte local, so n is
 * presumably kept below BUFLEN by the caller - confirm.
 */
int isexpr(s,n)
unsigned char *s; /* String */
int n; /* Length */
{
    unsigned char *tok;     /* Cursor in the parsed expression */
    int found;              /* Result */
    char buffer[BUFLEN];    /* NUL-terminated copy for the parser */

    if(istoken(s,n)) return(1);
    memmove(buffer,s,n);
    buffer[n] = '\0';
    s = exprparse(buffer);
    if(s == NULL) return(0);
    found = 1;
    if(*s != '^' && *s != T_END)
    {
        found = 0;
        tok = s;
        do
        {
            unsigned char type = *tok++;

            if(type == T_STAR || type == T_CLASS || type == T_ANY)
            {
                found = 1;              /* Special token seen */
            }
            else if(type == T_SINGLE)
            {
                ++tok;                  /* Skip the character */
            }
            else if(type == T_STRING)
            {
                tok += *tok + 1;        /* Skip length byte and string */
            }
        }
        while(!found && *tok != T_END);
    }
    free(s);
    return(found);
}
/*
 * exprprint - write a parsed expression to fo in pattern syntax, followed
 * by a newline.
 *
 * EOS inside a string is shown as '$'; the characters "*.[\$" are shown
 * with a backslash in front.  A class is printed as the printable
 * characters (' ' through '~') whose bits are set, with runs collapsed
 * into ranges.  A T_END or T_STAR in token position is an internal error
 * and terminates the program with status 2.
 */
void exprprint(p,fo)
unsigned char *p; /* Pointer to expression */
FILE *fo; /* File pointer */
{
    int bit;        /* Bit of the class vector under test */
    int count;      /* Characters left to print / class loop index */
    int first;      /* Start of the pending class range, -1 if none */
    int last;       /* End of the pending class range */
    int star;       /* Nonzero when the token carries a T_STAR prefix */

    if(*p == '^') fputc(*p++,fo);
    while(*p != T_END)
    {
        star = (*p == T_STAR);
        if(star) ++p;
        switch(*p++)
        {
          case T_END:
          case T_STAR:
            fprintf(stderr,"Internal error: exprprint\n");
            exit(2);
          case T_STRING:
          case T_SINGLE:
            /* A string carries its length, a single is one character */
            count = 1;
            if(p[-1] == T_STRING) count = *p++;
            for(; count > 0; --count)
            {
                if(*p == EOS)
                {
                    ++p;
                    fputc('$',fo);
                    continue;
                }
                if(strchr("*.[\\$",*p) != NULL) fputc('\\',fo);
                fputc(*p++,fo);
            }
            break;
          case T_ANY:
            fputc('.',fo);
            break;
          case T_CLASS:
            first = -1;
            last = -1;
            fputc('[',fo);
            for(count = ' '; count <= '~'; ++count)
            {
                bit = p[count >> 3] & (1 << (count & 7));
                if(bit != 0)
                {
                    if(first == -1) first = count;
                    last = count;
                }
                /* Flush only at a gap or at the final printable character */
                if(first == -1 || (bit && count != '~')) continue;
                if(strchr("\\]-",first) != NULL) fputc('\\',fo);
                fputc(first,fo);
                if(last != first)
                {
                    /* Two neighbours are printed without a hyphen */
                    if(last > first + 1) fputc('-',fo);
                    if(strchr("\\]-",last) != NULL) fputc('\\',fo);
                    fputc(last,fo);
                }
                first = -1;
            }
            fputc(']',fo);
            p += ASCLEN/8;
            break;
        }
        if(star) fputc('*',fo);
    }
    fputc('\n',fo);
}
/*
 * get1stcharset - compute the set of characters a match can start with.
 *
 * e       parsed expression; a leading '^' byte is skipped
 * bitvec  ASCLEN/8-byte vector; cleared, then one bit is set for every
 *         ASCII character that can begin a match
 *
 * Leading *-expressions are walked, each adding its characters to the
 * set, up to and including the first token without a T_STAR prefix.
 *
 * Returns a pointer to that first non-repeated token, or to T_END when
 * the expression consists of *-expressions only.
 *
 * A first character outside the ASCII range has no bit in the vector and
 * is left out of the set; writing it through bitset() would land outside
 * the caller's ASCLEN/8 bytes.
 *
 * A T_END or T_STAR in token position, a repeated T_STRING, or a
 * T_STRING of length zero is an internal error and terminates the
 * program with status 2.
 */
char *get1stcharset(e,bitvec)
unsigned char *e; /* Pointer to expression */
char *bitvec; /* Pointer to bit vector */
{
    unsigned char *cp;      /* First token not yet known to be repeated */
    int i;                  /* Index into the class vector */
    int star;               /* Nonzero when the token has a T_STAR prefix */

    if(*e == '^') ++e;                  /* Skip leading caret if any */
    memset(bitvec,'\0',ASCLEN/8);
    cp = e;
    while(*e != T_END)
    {
        star = 0;
        if(*e == T_STAR)
        {
            ++star;
            ++e;
        }
        switch(*e++)
        {
          case T_END:
          case T_STAR:
            fprintf(stderr,"Internal error: get1stcharset\n");
            exit(2);
          case T_STRING:
            if(star || *e++ == '\0')    /* Repeated or empty string */
            {
                fprintf(stderr,"Internal error: get1stcharset\n");
                exit(2);
            }
            /* Drop through: e is now on the string's first character */
          case T_SINGLE:
            if(*e < ASCLEN) bitset(bitvec,*e,*e,1);
            ++e;
            break;
          case T_ANY:
            memset(bitvec,'\xFF',ASCLEN/8);
            bitset(bitvec,EOS,EOS,0);   /* Except end-of-string */
            bitset(bitvec,'\n','\n',0); /* And linefeed */
            break;
          case T_CLASS:
            for(i = 0; i < ASCLEN/8; ++i) bitvec[i] |= *e++;
            break;
        }
        if(!star) break;                /* First mandatory token reached */
        cp = e;                         /* Token was optional: move past it */
    }
    return((char *)cp);
}
/*
 * findall - search routine that reports a match at the very start of any
 * non-empty buffer.
 *
 * Returns buffer, or NULL when buffer is not below bufend.
 */
char *findall(buffer,bufend)
char *buffer; /* Buffer in which to search */
char *bufend; /* End of buffer */
{
    if(buffer >= bufend)
    {
        return(NULL);       /* Nothing to search */
    }
    return(buffer);
}
/*
 * addtoken - expand a token expression into ordinary expressions and add
 * them with addexpr().
 *
 * A leading "\<" is removed and the remainder is added with the prefix
 * "[^A-Za-z0-9_]" (skipped when flags has BEGLINE) and with the prefix
 * "^".  Otherwise the expression must end in "\>"; that is removed and
 * the remainder is added with the suffix "[^A-Za-z0-9_]" (skipped when
 * flags has ENDLINE) and with the suffix "$".
 *
 * NOTE(review): the expansion is assembled in a BUFLEN-byte local and is
 * up to 11 bytes longer than the input - presumably the caller keeps n
 * small enough; confirm.
 */
void addtoken(e,n)
char *e; /* Raw token expression */
int n; /* Length of expression */
{
    static char achpref[] = "^";                /* Line-start prefix */
    static char achprefsuf[] = "[^A-Za-z0-9_]"; /* Non-word character */
    static char achsuf[] = "$";                 /* Line-end suffix */
    char buffer[BUFLEN];                        /* Expanded expression */
    int bodylen;                                /* Length without the marker */
    int wordlen;                                /* Length of achprefsuf text */
    int preflen;                                /* Length of achpref text */
    int suflen;                                 /* Length of achsuf text */

    assert(n >= 2);
    bodylen = n - 2;
    wordlen = (int)(sizeof achprefsuf - 1);
    preflen = (int)(sizeof achpref - 1);
    suflen = (int)(sizeof achsuf - 1);

    if(e[0] == '\\' && e[1] == '<')
    {
        /* Begin token: what precedes the body is a non-word char or BOL */
        if(!(flags & BEGLINE))
        {
            memcpy(buffer,achprefsuf,wordlen);
            memcpy(buffer + wordlen,e + 2,bodylen);
            addexpr(buffer,wordlen + bodylen);
        }
        memcpy(buffer,achpref,preflen);
        memcpy(buffer + preflen,e + 2,bodylen);
        addexpr(buffer,preflen + bodylen);
    }
    else
    {
        /* End token: what follows the body is a non-word char or EOL */
        assert(e[n-2] == '\\' && e[n - 1] == '>');
        if(!(flags & ENDLINE))
        {
            memcpy(buffer,e,bodylen);
            memcpy(buffer + bodylen,achprefsuf,wordlen);
            addexpr(buffer,bodylen + wordlen);
        }
        memcpy(buffer,e,bodylen);
        memcpy(buffer + bodylen,achsuf,suflen);
        addexpr(buffer,bodylen + suflen);
    }
}
void addexpr(e,n)
char *e; /* Expression to add */
int n; /* Length of expression */
{
EXPR *expr; /* Expression node pointer */
int i; /* Index */
int j; /* Index */
int locflags; /* Local copy of flags */
char bitvec[ASCLEN/8];
/* First char. bit vector */
char buffer[BUFLEN]; /* Temporary buffer */
if(find == findall) return; /* Return if matching everything */
if(istoken(e,n)) /* If expr is token */
{
addtoken(e,n); /* Convert and add tokens */
return; /* Done */
}
locflags = flags; /* Initialize local copy */
if(*e == '^') locflags |= BEGLINE; /* Set flag if match must begin line */
j = -2; /* Assume no escapes in string */
for(i = 0; i < n - 1; ++i) /* Loop to find last escape */
{
if(e[i] == '\\') j = i++; /* Save index of last escape */
}
if(n > 0 && e[n-1] == '$' && j != n-2)
{ /* If expr. ends in unescaped '$' */
--n; /* Skip dollar sign */
locflags |= ENDLINE; /* Match must be at end */
}
strncpy(buffer,e,n); /* Copy pattern to buffer */
if(locflags & ENDLINE) buffer[n++] = EOS;
/* Add end character if needed */
buffer[n] = '\0'; /* Null-terminate string */
if((e = exprparse(buffer)) == NULL) return;
/* Return if invalid expression */
++strcnt; /* Increment string count */
if(!(locflags & BEGLINE)) /* If match needn't be at beginning */
{
e = get1stcharset(e,bitvec); /* Remove leading *-expr.s */
}
/*
* E now points to a buffer containing a preprocessed expression.
* We need to find the set of allowable first characters and make
* the appropriate entries in the string node table.
*/
if(*get1stcharset(e,bitvec) == T_END)
{ /* If expression will match anything */
find = findall; /* Match everything */
return; /* All done */
}
for(j = 0; j < ASCLEN; ++j) /* Loop to examine bit vector */
{
if(bitvec[j >> 3] & (1 << (j & 7)))
{ /* If the bit is set */
expr = (EXPR *) alloc(sizeof(EXPR));
/* Allocate record */
expr->ex_pattern = e; /* Point it at pattern */
if((i = transtab[j]) == 0) /* If no existing list */
{
if((i = clists++) >= ASCLEN)
{ /* If too many string lists */
fprintf(stderr,"Too many string lists\n");
/* Error message */
exit(2); /* Die */
}
stringlist[i] = NULL; /* Initialize */
transtab[j] = (char) i; /* Set pointer to new list */
if(!casesen && isalpha(j)) transtab[j ^ 0x20] = (char) i;
/* Set pointer for other case */
}
expr->ex_next = stringlist[i];
/* Link new record into table */
stringlist[i] = expr;
}
}
if(locflags & DEBUG) exprprint(e,stderr);
/* Print the expression if debugging */
}
/*
 * findexpr - find the first position in [buffer, bufend) where one of
 * the registered expressions matches.
 *
 * Each byte is looked up in transtab[]; 0 means no pattern can start
 * there.  Otherwise every pattern on the selected stringlist[] entry is
 * tried with exprmatch().  A pattern whose first byte is '^' is tried
 * only when the byte in front of the current position is '\n'.
 *
 * Returns the position of the first match, or NULL when there is none.
 * A nonzero transtab[] entry that selects an empty list is an internal
 * error and terminates the program with status 2.
 *
 * NOTE(review): the '^' test reads buffer[-1], so the caller presumably
 * guarantees a readable byte in front of the search area - confirm.
 */
char *findexpr(buffer,bufend)
unsigned char *buffer; /* Buffer in which to search */
char *bufend; /* End of buffer */
{
    EXPR *expr;                 /* Cursor in the pattern list */
    unsigned char *pattern;     /* Pattern under test */
    int i;                      /* List index from transtab[] */

    for(; buffer < bufend; ++buffer)
    {
        i = transtab[*buffer];
        if(i == 0) continue;            /* Not a valid first character */
        expr = (EXPR *) stringlist[i];
        if(expr == NULL)
        {
            fprintf(stderr,"Internal error: findexpr\n");
            exit(2);
        }
        for(; expr != NULL; expr = expr->ex_next)
        {
            pattern = expr->ex_pattern;
            if(pattern[0] == '^')
            {
                /* Anchored: only worth trying at the start of a line */
                if(buffer[-1] != '\n') continue;
                ++pattern;              /* Skip caret */
            }
            if(exprmatch(buffer,pattern)) return(buffer);
        }
    }
    return(NULL);
}