/* $Header: /usr/cvsroot/libutf/ure.c,v 1.1.1.1 2006/05/26 20:14:56 agc Exp $ */

/*
 * Copyright © 1996-2006 Alistair Crooks.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <config.h>

#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif

#include <stdio.h>
#include <ctype.h>

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef HAVE_STRING_H
#include <string.h>
#endif

#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif

#include "utf.h"
#include "ure.h"

/*************************************************************************/
/* structural regular expression routines */

/*
 *	Changed by agc for various uses to include URE_Subst,
 *	and to make the functions use ANSI C type arguments.
 *	Subsequent changes to:
 *	- make regexps into SREs
 *	- include \<\>, \s\S, \w\W, \p\P, \d\D recognition
 *	- added '@' metacharacter for any char including \n
 *	- added {min,max} recognition of strings
 *	- do case insensitive searching on execution, not compilation
 *	- added SRE_SetAlphabet, and use the alphabets
 *	- calculate character ranges on execution, not compilation
 *	- modified to use 16-bit unicode characters, rename SRE to URE.
 */

/*
 * urecomp and ureexec -- URE_Subst and ureerror are elsewhere
 *
 *	Copyright (c) 1986 by University of Toronto.
 *	Written by Henry Spencer.  Not derived from licensed software.
 *
 *	Permission is granted to anyone to use this software for any
 *	purpose on any computer system, and to redistribute it freely,
 *	subject to the following restrictions:
 *
 *	1. The author is not responsible for the consequences of use of
 *		this software, no matter how awful, even if they arise
 *		from defects in it.
 *
 *	2. The origin of this software must not be misrepresented, either
 *		by explicit claim or by omission.
 *
 *	3. Altered versions must be plainly marked as such, and must not
 *		be misrepresented as being the original software.
 *
 * Beware that some of this code is subtly aware of the way operator
 * precedence is structured in regular expressions.  Serious changes in
 * regular-expression syntax might require a total rethink.
 */
/*
 * The "internal use only" fields in regexp.h are present to pass info from
 * compile to execute that permits the execute phase to run lots faster on
 * simple cases.  They are:
 *
 * u_start	char that must begin a match; '\0' if none obvious
 * u_anch	is the match anchored (at beginning-of-line only)?
 * u_must	string (pointer into u_prog) that match must include, or NULL
 * u_mlen	length of u_must string
 *
 * u_start and u_anch permit very fast decisions on suitable
 * starting points for a match, cutting down the work a lot.  u_must
 * permits fast rejection of lines that cannot possibly match.  The
 * u_must tests are costly enough that urecomp() supplies a
 * u_must only if the r.e.  contains something potentially expensive
 * (at present, the only such thing detected is * or + at the start of
 * the r.e., which can involve a lot of backup).  u_mlen is supplied
 * because the test in ureexec() needs it and urecomp() is
 * computing it anyway.
 */

/*
 * Structure for URE "program".  This is essentially a linear encoding
 * of a nondeterministic finite-state machine (aka syntax charts or
 * "railroad normal form" in parsing technology).  Each node is an opcode
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
 * all nodes except BRANCH implement concatenation; a "next" pointer with
 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
 * have one of the subtle syntax dependencies:  an individual BRANCH (as
 * opposed to a collection of them) is never concatenated with anything
 * because of operator precedence.)  The operand of some types of node is
 * a literal string; for others, it is a node leading into a sub-FSM.  In
 * particular, the operand of a BRANCH node is the first node of the branch.
 * (NB this is *not* a tree structure:  the tail of the branch connects
 * to the thing following the set of BRANCHes.)  The opcodes are:
 */

enum URE_Opcodes {
	END,		/* no	End of program. */
	BOL,		/* no	Match "" at beginning of line. */
	EOL,		/* no	Match "" at end of line. */
	ANYNONL,	/* no	Match any one character (not \n). */
	ANYOF,		/* str	Match any character in this string. */
	ANYBUT,		/* str	Match any character not in this string. */
	BRANCH,		/* node	Match this alternative, or the next... */
	BACK,		/* no	Match "", "next" ptr points backward. */
	EXACTLY,	/* str	Match this string. */
	NOTHING,	/* no	Match empty string. */
	STAR,		/* node	Match this (simple) thing 0 or more times. */
	PLUS,		/* node	Match this (simple) thing 1 or more times. */
	BEGWORD,	/* node Match "" between nonword and word. */
	ENDWORD,	/* node Match "" between word and nonword. */
	WHITESP,	/* node Match any whitespace character */
	NWHITESP,	/* node Match any nonwhitespace character */
	ALNUM,		/* node Match any alphanumeric, include _ */
	NALNUM,		/* node inverse above, including BOL and EOL */
	DIGIT,		/* node Match any digit */
	NDIGIT,		/* node Match any non-digit */
	PRINT,		/* node Match any printable char (including whitesp) */
	NPRINT,		/* node Match any non-printable char */
	MINMAX,		/* node Match this thing (min, max) times */
	ANY,		/* no	Match any character, even \n */
	BORDER,		/* node Match "" border between nonword and word. */
	NBORDER,	/* node Match "" non-border between nonword and word. */
	CLASSRANGE,	/*      used internally for class range matching */
	CLASSPOSIX,	/*      used internally for posix class matching */
	CLASSRUNE,	/*      used internally for class rune matching */
	OPEN = 0x2000,	/* no   Mark this point in input as start of #n. */
			/*      OPEN+1 is number 1, etc. */
	CLOSE = 0x4000	/* no   Analogous to OPEN. */
};

/*
 * Opcode notes:
 *
 * BRANCH	The set of branches constituting a single choice are hooked
 *		together with their "next" pointers, since precedence prevents
 *		anything being concatenated to any individual branch.  The
 *		"next" pointer of the last BRANCH in a choice points to the
 *		thing following the whole choice.  This is also where the
 *		final "next" pointer of each individual branch points; each
 *		branch starts with the operand node of a BRANCH node.
 *
 * BACK		Normal "next" pointers all implicitly point forward; BACK
 *		exists to make loop structures possible.
 *
 * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
 *		BRANCH structures using BACK.  Simple cases (one character
 *		per match) are implemented with STAR and PLUS for speed
 *		and to minimize recursive plunges.
 *
 * OPEN,CLOSE	...are numbered at compile time.
 */

/*
 *	This used to be in "regmagic.h" - agc
 */
 
/*
 * The first rune of the ure_t internal "program" is actually this magic
 * number; the start node begins in the second byte.
 */

#define	URE_MAGIC	0x2341

/*
 * A node is one char of opcode followed by two chars of "next" pointer.
 * "Next" pointers are stored as two 16-bit pieces, high order first.  The
 * value is a positive offset from the opcode of the node containing it.
 * An operand, if any, simply follows the node.  (Note that much of the
 * code generation knows about this implicit relationship.)
 *
 * Using four bytes for the "next" pointer is VAST overkill for most things,
 * but allows patterns to get big without disasters.
 */
#define	OP(p)	(*(p))
#define	NEXT(p)	(((*((p)+1) & 0xffff)<<16) + (*((p)+2) & 0xffff))
#define	OPERAND(p)	((p) + 3)

/*
 * Utility definitions.
 */
#define UCHARAT(p)	(*(p))

#define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
#define	META		"^$.@[()|?+*\\{"

/*
 * Flags to be passed up and down.
 */
#define	HASWIDTH	01	/* Known never to match null string. */
#define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
#define	SPSTART		04	/* Starts with * or +. */
#define	WORST		0	/* Worst case. */

/* This struct describes the internals of utf re's. Used in compiling */
typedef struct gubbins {
	unsigned int	g_parc;		/* # of parens */
	long		g_size;		/* size of expr */
	Rune		*g_code;	/* pointer to position in expr */
	char		*g_parse;	/* position in input string */
} gubbins_t;

/* Sorry, my little boy's going through an "anatomy" phase. */
/* used in matching phase */
typedef struct tummy {
	char		*t_input;	/* input string */
	char		*t_bol;		/* pointer to beginning of line */
	char		*t_start;	/* start of string */
	int		t_matchc;	/* # of slots in t_matchv */
	urematch_t	*t_matchv;	/* matching array */
	ureoff_t	t_off;		/* offset from start of string */
} tummy_t;

/* maximum number of distinct ranges that can be specified in one class */
#ifndef MAXRANGE
#define MAXRANGE	1024
#endif

/*
 * Forward declarations for urecomp()'s friends.
 */
static Rune	*ure(int paren, int *flagp, int *ep, int sizing, gubbins_t *gp);

/*
 - ureNext - dig the "next" pointer out of a node
 */
static Rune *
ureNext(Rune *p, int sizing)
{
	register int offset;

	if (sizing) {
		return NULL;
	}

	offset = NEXT(p);
	if (offset == 0) {
		return NULL;
	}

	if (OP(p) == BACK) {
		return p-offset;
	} else {
		return p+offset;
	}
}

/*
 - ureTail - set the next-pointer at the end of a node chain
 */
static void
ureTail(Rune *p, Rune *val, int sizing, gubbins_t *gp)
{
	register Rune	*scan;
	register Rune	*temp;
	register int	offset;

	if (!sizing) {
		/* Find last node. */
		scan = p;
		for (;;) {
			temp = ureNext(scan, sizing);
			if (temp == NULL) {
				break;
			}
			scan = temp;
		}

		if (OP(scan) == BACK) {
			offset = scan - val;
		} else {
			offset = val - scan;
		}
		*(scan+1) = (offset>>16) & 0xffff;
		*(scan+2) = offset & 0xffff;
	}
}

/*
 - ureOpTail - ureTail on operand of first argument; nop if operandless
 */
static void
ureOpTail(Rune *p, Rune *val, int sizing, gubbins_t *gp)
{
	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
	if (p != NULL && !sizing && OP(p) == BRANCH) {
		ureTail(OPERAND(p), val, sizing, gp);
	}
}

/*
 - ureInsert - insert an operator in front of already-emitted operand
 *
 * Means relocating the operand.
 */
static void
ureInsert(Rune op, Rune *opnd, int sizing, gubbins_t *gp)
{
	register Rune *src;
	register Rune *dst;
	register Rune *place;

	if (sizing) {
		gp->g_size += 3;
	} else {
		src = gp->g_code;
		gp->g_code += 3;
		dst = gp->g_code;
		while (src > opnd) {
			*--dst = *--src;
		}

		place = opnd;		/* Op node, where operand used to be. */
		*place++ = op;
		*place++ = '\0';
		*place++ = '\0';
	}
}

/*
 - ureC - emit (if appropriate) a byte of code
 */
static void
ureC(Rune b, int sizing, gubbins_t *gp)
{
	if (sizing) {
		gp->g_size++;
	} else {
		*gp->g_code++ = b;
	}
}

/*
 - ureNode - emit a node
 */
static Rune *			/* Location. */
ureNode(Rune op, int sizing, gubbins_t *gp)
{
	register Rune *ret;
	register Rune *ptr;

	if (sizing) {
		gp->g_size += 3;
		return gp->g_code;
	}

	ptr = ret = gp->g_code;
	*ptr++ = op;
	*ptr++ = '\0';		/* Null "next" pointer. */
	*ptr++ = '\0';
	gp->g_code = ptr;

	return ret;
}

static Rune *
compileRange(int sizing, gubbins_t *gp, int *errp)
{
	Rune	rangev[MAXRANGE][2];
	Rune	*ret;
	Rune	r2;
	Rune	r;
	int	rangec;
	int	i;

	i = chartorune(&r, gp->g_parse);
	if (r == '^') {
		/* Complement of range. */
		ret = ureNode(ANYBUT, sizing, gp);
		gp->g_parse += i;
		i = chartorune(&r, gp->g_parse);
	} else {
		ret = ureNode(ANYOF, sizing, gp);
	}
	if (r == ']' || r == '-') {
		ureC(r, sizing, gp);
		gp->g_parse += i;
		i = chartorune(&r, gp->g_parse);
	}
	for (rangec = 0 ; r != 0 && r != ']' ; ) {
		/* XXX agc escape \\ ??? XXX */
		(void) chartorune(&r2, gp->g_parse + i);
		if (r2 == '-') {
			rangev[rangec][0] = r;
			gp->g_parse += i;
			i = chartorune(&r, gp->g_parse);
			gp->g_parse += i;
			i = chartorune(&r, gp->g_parse);
			rangev[rangec++][1] = r;
		} else {
			ureC(r, sizing, gp);
		}
		gp->g_parse += i;
		i = chartorune(&r, gp->g_parse);
	}
	if (r != ']') {
		*errp = URE_ERR_UNMATCHED_BRACKET;
		return (Rune *) NULL;
	}
	gp->g_parse += i;
	ureC(0, sizing, gp);
	ureC(rangec, sizing, gp);
	for (i = 0 ; i < rangec ; i++) {
		ureC(rangev[i][0], sizing, gp);
		ureC(rangev[i][1], sizing, gp);
	}
	return ret;
}

	/*
	if (strncmp(gp->g_parse, "alnum:]", 7) == 0) {
	} else if (strncmp(gp->g_parse, "alpha:]", 7) == 0) {
	} else if (strncmp(gp->g_parse, "blank:]", 7) == 0) {
	} else if (strncmp(gp->g_parse, "cntrl:]", 7) == 0) {
	} else if (strncmp(gp->g_parse, "digit:]", 7) == 0) {
	} else if (strncmp(gp->g_parse, "graph:]", 7) == 0) {
	} else if (strncmp(gp->g_parse, "lower:]", 7) == 0) {
	} else if (strncmp(gp->g_parse, "print:]", 7) == 0) {
	} else if (strncmp(gp->g_parse, "punct:]", 7) == 0) {
	} else if (strncmp(gp->g_parse, "space:]", 7) == 0) {
	} else if (strncmp(gp->g_parse, "upper:]", 7) == 0) {
	} else if (strncmp(gp->g_parse, "xdigit:]", 8) == 0) {
	}
	*/

/*
 - ureAtom - the lowest level
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 */
static Rune *
ureAtom(int *flagp, int *errp, int sizing, gubbins_t *gp)
{
	register Rune	*ret;
	Rune		r;
	int		flags;
	int		i;

	*flagp = WORST;		/* Tentatively. */

	i = chartorune(&r, gp->g_parse);
	gp->g_parse += i;
	switch (r) {
	case '^':
		ret = ureNode(BOL, sizing, gp);
		break;
	case '$':
		ret = ureNode(EOL, sizing, gp);
		break;
	case '.':
		ret = ureNode(ANYNONL, sizing, gp);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '@':
		ret = ureNode(ANY, sizing, gp);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '[':
		if ((ret = compileRange(sizing, gp, errp)) == (Rune *) NULL) {
			return (Rune *) NULL;
		}
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case '(':
		if ((ret = ure(1, &flags, errp, sizing, gp)) == (Rune *) NULL) {
			return (Rune *) NULL;
		}
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case '\0':
	case '|':
	case ')':
		/* Supposed to be caught earlier. */
		*errp = URE_ERR_INTERNAL_MESS;
		return (Rune *) NULL;
	case '?':
	case '+':
	case '*':
	case '{':
		*errp = URE_ERR_REPEAT_FOLLOWS_NOTHING;
		return (Rune *) NULL;
	case '\\':
		i = chartorune(&r, gp->g_parse);
		if (r == 0) {
			*errp = URE_ERR_TRAILING_BACKSLASH;
			return (Rune *) NULL;
		}
		switch(r) {
		case '<':
			ret = ureNode(BEGWORD, sizing, gp);
			break;
		case '>':
			ret = ureNode(ENDWORD, sizing, gp);
			break;
		case 'b':
			ret = ureNode(BORDER, sizing, gp);
			break;
		case 'B':
			ret = ureNode(NBORDER, sizing, gp);
			break;
		case 'd':
			ret = ureNode(DIGIT, sizing, gp);
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'D':
			ret = ureNode(NDIGIT, sizing, gp);
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'n' :
			ret = ureNode(EXACTLY, sizing, gp);
			ureC('\n', sizing, gp);
			ureC('\0', sizing, gp);
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'p':
			ret = ureNode(PRINT, sizing, gp);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 'P':
			ret = ureNode(NPRINT, sizing, gp);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 's':
			ret = ureNode(WHITESP, sizing, gp);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 'S':
			ret = ureNode(NWHITESP, sizing, gp);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 't' :
			ret = ureNode(EXACTLY, sizing, gp);
			ureC('\t', sizing, gp);
			ureC('\0', sizing, gp);
			*flagp |= (HASWIDTH|SIMPLE);
			break;
		case 'w':
			ret = ureNode(ALNUM, sizing, gp);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case 'W':
			ret = ureNode(NALNUM, sizing, gp);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		default :
			if (r == 'u' && isxdigit(*(gp->g_parse + i)) &&
			    isxdigit(*(gp->g_parse + i + 1)) &&
			    isxdigit(*(gp->g_parse + i + 2)) &&
			    isxdigit(*(gp->g_parse + i + 3))) {
				/* check next 4 chars are hex digits, so rune */
				r = AsciiToNumber(gp->g_parse + i, 4, 16);
				gp->g_parse += 4;
			}
			ret = ureNode(EXACTLY, sizing, gp);
			ureC(r, sizing, gp);
			ureC('\0', sizing, gp);
			*flagp |= HASWIDTH|SIMPLE;
		}
		gp->g_parse += i;
		break;
	default: {
			register int	len;
			Rune		ender;
			int		rc;

			gp->g_parse -= i;
			/* look for a meta in the rest of gp->g_parse */
			len = utfcspan(gp->g_parse, META, &rc);
			if (len <= 0) {
				*errp = URE_ERR_INTERNAL_DISASTER;
				return (Rune *) NULL;
			}
			(void) chartorune(&ender, gp->g_parse + len);
			if (rc > 1 && ISMULT(ender)) {
				/* Back off clear of ?+* operand. */
				rc--;
			}
			*flagp |= HASWIDTH;
			if (rc == 1) {
				*flagp |= SIMPLE;
			}
			ret = ureNode(EXACTLY, sizing, gp);
			while (rc > 0) {
				i = chartorune(&r, gp->g_parse);
				ureC(r, sizing, gp);
				gp->g_parse += i;
				rc--;
			}
			ureC(0, sizing, gp);
		}
		break;
	}

	return ret;
}

/*
 - urePiece - something followed by possible [*+?{]
 *
 * Note that the branching code sequences used for ? and the general cases
 * of * and + are somewhat optimized:  they use the same NOTHING node as
 * both the endmarker for their branch list and the body of the last branch.
 * It might seem that this node could be dispensed with entirely, but the
 * endmarker role is not redundant.
 */
static Rune *
urePiece(int *flagp, int *errp, int sizing, gubbins_t *gp)
{
	register Rune	*next;
	register Rune	*ret;
	Rune		op;
	Rune		r;
	Rune		max;
	Rune		min;
	int		flags;
	int		i;

	ret = ureAtom(&flags, errp, sizing, gp);
	if (ret == NULL) {
		return NULL;
	}

	i = chartorune(&op, gp->g_parse);
	if (!ISMULT(op)) {
		*flagp = flags;
		return ret;
	}

	if (!(flags&HASWIDTH) && op != '?') {
		*errp = URE_ERR_OPERAND_EMPTY;
		return (Rune *) NULL;
	}
	*flagp = (op != '+' && op != '{') ? (WORST|SPSTART) : (WORST|HASWIDTH);

	if (op == '*' && (flags&SIMPLE)) {
		ureInsert(STAR, ret, sizing, gp);
	} else if (op == '*') {
		/* Emit x* as (x&|), where & means "self". */
		ureInsert(BRANCH, ret, sizing, gp);		/* Either x */
		ureOpTail(ret, ureNode(BACK, sizing, gp), sizing, gp);	/* and loop */
		ureOpTail(ret, ret, sizing, gp);		/* back */
		ureTail(ret, ureNode(BRANCH, sizing, gp), sizing, gp);	/* or */
		ureTail(ret, ureNode(NOTHING, sizing, gp), sizing, gp);	/* null. */
	} else if (op == '+' && (flags&SIMPLE)) {
		ureInsert(PLUS, ret, sizing, gp);
	} else if (op == '+') {
		/* Emit x+ as x(&|), where & means "self". */
		next = ureNode(BRANCH, sizing, gp);		/* Either */
		ureTail(ret, next, sizing, gp);
		ureTail(ureNode(BACK, sizing, gp), ret, sizing, gp);	/* loop back */
		ureTail(next, ureNode(BRANCH, sizing, gp), sizing, gp);	/* or */
		ureTail(ret, ureNode(NOTHING, sizing, gp), sizing, gp);	/* null. */
	} else if (op == '{') {
		min = max = 0;
		gp->g_parse += i;
		for (;;) {
			i = chartorune(&r, gp->g_parse);
			if (r == 0 || !isdigit(r)) {
				break;
			}
			min = (min * 10) + (r - '0');
			gp->g_parse += i;
		}
		gp->g_parse += i;
		for (;;) {
			i = chartorune(&r, gp->g_parse);
			if (r == 0 || !isdigit(r)) {
				break;
			}
			max = (max * 10) + (r - '0');
			gp->g_parse += i;
		}
		ureInsert(max, ret, sizing, gp);
		next = OPERAND(ret);
		ureInsert(min, ret, sizing, gp);
		next = OPERAND(next);
		ureInsert(MINMAX, ret, sizing, gp);
		ureTail(ret, OPERAND(next), sizing, gp);	/* MINMAX->next = x */
	} else if (op == '?') {
		/* Emit x? as (x|) */
		ureInsert(BRANCH, ret, sizing, gp);		/* Either x */
		ureTail(ret, ureNode(BRANCH, sizing, gp), sizing, gp);	/* or */
		next = ureNode(NOTHING, sizing, gp);		/* null. */
		ureTail(ret, next, sizing, gp);
		ureOpTail(ret, next, sizing, gp);
	}
	gp->g_parse += i;
	(void) chartorune(&r, gp->g_parse);
	if (ISMULT(r)) {
		*errp = URE_ERR_NESTED_REPEAT;
		return (Rune *) NULL;
	}

	return ret;
}

/*
 - ureBranch - one alternative of an | operator
 *
 * Implements the concatenation operator.
 */
static Rune *
ureBranch(int *flagp, int *errp, int sizing, gubbins_t *gp)
{
	register Rune	*latest;
	register Rune	*chain;
	register Rune	*ret;
	Rune		r;
	int 		flags;
	int 		i;

	*flagp = WORST;		/* Tentatively. */

	ret = ureNode(BRANCH, sizing, gp);
	chain = NULL;
	for (;;) {
		i = chartorune(&r, gp->g_parse);
		if (r == 0 || r == '|' || r == ')') {
			break;
		}
		latest = urePiece(&flags, errp, sizing, gp);
		if (latest == NULL) {
			return NULL;
		}
		*flagp |= flags&HASWIDTH;
		if (chain == NULL) {	/* First piece. */
			*flagp |= flags&SPSTART;
		} else {
			ureTail(chain, latest, sizing, gp);
		}
		chain = latest;
	}
	if (chain == NULL) {	/* Loop ran zero times. */
		(void) ureNode(NOTHING, sizing, gp);
	}

	return ret;
}

/*
 - ure - regular expression, i.e. main body or parenthesized thing
 *
 * Caller must absorb opening parenthesis.
 *
 * Combining parenthesis handling with the base level of regular expression
 * is a trifle forced, but the need to tie the tails of the branches to what
 * follows makes it hard to avoid.
 */
static Rune *
ure(int paren, int *flagp, int *errp, int sizing, gubbins_t *gp)
{
	register Rune	*ender;
	register Rune	*ret;
	register Rune	*br;
	register int	parno;
	Rune		r;
	int		flags;
	int		i;

	*flagp = HASWIDTH;	/* Tentatively. */

	/* Make an OPEN node, if parenthesized. */
	if (paren) {
		if (gp->g_parc >= URE_NSUBEXP) {
			*errp = URE_ERR_TOO_MANY_PARENS;
			return (Rune *) NULL;
		}
		parno = gp->g_parc++;
		ret = ureNode(OPEN+parno, sizing, gp);
	} else {
		ret = NULL;
		parno = 0;
	}

	/* Pick up the branches, linking them together. */
	br = ureBranch(&flags, errp, sizing, gp);
	if (br == NULL) {
		return (Rune *) NULL;
	}
	if (ret != NULL) {
		ureTail(ret, br, sizing, gp);	/* OPEN -> first. */
	} else {
		ret = br;
	}
	if (!(flags&HASWIDTH)) {
		*flagp &= ~HASWIDTH;
	}
	*flagp |= flags&SPSTART;

	for (;;) {
		i = chartorune(&r, gp->g_parse);
		if (r != '|') {
			break;
		}
		gp->g_parse += i;
		br = ureBranch(&flags, errp, sizing, gp);
		if (br == NULL) {
			return (Rune *) NULL;
		}
		ureTail(ret, br, sizing, gp);	/* BRANCH -> BRANCH. */
		if (!(flags&HASWIDTH)) {
			*flagp &= ~HASWIDTH;
		}
		*flagp |= flags&SPSTART;
	}

	/* Make a closing node, and hook it on the end. */
	ender = ureNode((paren) ? CLOSE+parno : END, sizing, gp);	
	ureTail(ret, ender, sizing, gp);

	/* Hook the tails of the branches to the closing node. */
	for (br = ret; br != NULL; br = ureNext(br, sizing)) {
		ureOpTail(br, ender, sizing, gp);
	}

	/* Check for proper termination. */
	i = chartorune(&r, gp->g_parse);
	if (paren) {
		gp->g_parse += i;
		if (r != ')') {
			*errp = URE_ERR_UNMATCHED_PAREN;
			return (Rune *) NULL;
		}
	} else if (!paren && r != 0) {
		if (r == ')') {
			*errp = URE_ERR_UNMATCHED_PAREN;
			return (Rune *) NULL;
		}
		/* "Can't happen". */
		*errp = URE_ERR_JUNK_ON_END;
		return (Rune *) NULL;
	}

	return ret;
}

/*
 - urecomp - compile a regular expression into internal code
 *
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because free() must be able to free it all.)
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled URE.
 */
int
urecomp(ure_t *up, char *exp, int cflags)
{
	register Rune	*longest;
	register Rune	*scan;
	register int	len;
	gubbins_t	g;
	Rune		dummy;
	int		flags;
	int		err;

	if (exp == (char *) NULL) {
		return URE_ERR_NULL_ARG;
	}

	/* First pass: determine size, legality. */
	g.g_parc = 1;
	g.g_size = 0L;

	/* arbirary value (non-null) for g_code here */
	g.g_code = &dummy;

	g.g_parse = exp;
	ureC(URE_MAGIC, 1, &g);
	if (ure(0, &flags, &err, 1, &g) == NULL) {
		return err;
	}

	/* Small enough for pointer-storage convention? */
	/* agc - actually, this limit is arbitrary -
	 * just enough to trap errors */
	if (g.g_size >= 0xffffff) {
		return URE_ERR_TOO_BIG;
	}

	/* Allocate space. */
	up->u_prog = (Rune *)malloc((unsigned)(g.g_size * sizeof(Rune)));
	if (up->u_prog == (Rune *) NULL) {
		return URE_ERR_OUT_OF_SPACE;
	}

	/* Second pass: emit code. */
	g.g_parc = 1;
	g.g_code = up->u_prog;
	g.g_parse = exp;

	ureC(URE_MAGIC, 0, &g);
	if (ure(0, &flags, &err, 0, &g) == NULL) {
		(void) free(up);
		return err;
	}

	up->u_subc = g.g_parc;	/* Actual count of sub-expressions */
	up->u_flags = cflags;

	/* Dig out information for optimizations. */
	up->u_start = '\0';	/* Worst-case defaults. */
	up->u_anch = 0;
	up->u_must = NULL;
	up->u_mlen = 0;
	scan = up->u_prog+1;			/* First BRANCH */
	if (OP(ureNext(scan, 0)) == END) {	/* Only one top-level choice */
		scan = OPERAND(scan);

		/* Starting-point info. */
		if (OP(scan) == EXACTLY) {
			up->u_start = *OPERAND(scan);
		} else if ((OP(scan) == BEGWORD || OP(scan) == BORDER) &&
			   OP(ureNext(scan, 0)) == EXACTLY) {
			up->u_start = *OPERAND(ureNext(scan, 0));
		} else if (OP(scan) == BOL) {
			up->u_anch++;
		}

		/*
		 * If there's something expensive in the r.e., find the
		 * longest literal string that must appear and make it the
		 * u_must.  Resolve ties in favor of later strings, since
		 * the u_start check works with the beginning of the r.e.
		 * and avoiding duplication strengthens checking.  Not a
		 * strong reason, but sufficient in the absence of others.
		 */
		if (flags&SPSTART) {
			longest = NULL;
			len = 0;
			for (; scan != NULL; scan = ureNext(scan, 0)) {
				if (OP(scan) == EXACTLY &&
				    UNICODE_strlen(OPERAND(scan)) >= len) {
					longest = OPERAND(scan);
					len = UNICODE_strlen(OPERAND(scan));
				}
			}
			up->u_must = longest;
			up->u_mlen = len;
		}
	}

	return URE_SUCCESS;
}

/*
 * ureexec and friends
 */
static int
MatchClass(Rune *cl, Rune ch)
{
	Rune	*cp;
	int	rangec;
	int	i;

	if (UNICODE_strchr(cp = cl, ch) != (Rune *) NULL) {
		return 1;
	}
	cp += UNICODE_strlen(cp) + 1;
	for (rangec = *cp++, i = 0 ; i < rangec ; i++, cp += 2) {
		if (UNICODE_InRange(*cp, *(cp + 1), ch)) {
			return 1;
		}
	}
	return 0;
}

/*
 - ureRepeat - repeatedly match something simple, report how many
 */
static int
ureRepeat(Rune *p, int *errp, tummy_t *tp)
{
	register Rune	*opnd;
	register char	*scan;
	register int	count = 0;
	Rune		r;
	int		len;

	scan = tp->t_input;
	opnd = OPERAND(p);
	switch (OP(p)) {
	case ANYNONL:
		/* make sure '.' doesn't match a \n character */
		for (len = chartorune(&r, scan) ; r != 0 && r != '\n' ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case ANY:
		/* make sure '@' matches any character */
		for (len = chartorune(&r, scan) ; r != 0 ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case EXACTLY:
		for (len = chartorune(&r, scan) ; *opnd == r ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case ANYOF:
		for (len = chartorune(&r, scan) ; r != 0 && MatchClass(opnd, r) ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case ANYBUT:
		for (len = chartorune(&r, scan) ; r != 0 && !MatchClass(opnd, r) ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case WHITESP :
		for (len = chartorune(&r, scan) ; r != 0 && UNICODE_isspace(r) ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case NWHITESP :
		for (len = chartorune(&r, scan) ; r != 0 && !UNICODE_isspace(r) ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case DIGIT :
		for (len = chartorune(&r, scan) ; r != 0 && UNICODE_isdigit(r) ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case NDIGIT :
		for (len = chartorune(&r, scan) ; r != 0 && !UNICODE_isdigit(r) ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case ALNUM :
		for (len = chartorune(&r, scan) ; r != 0 && UNICODE_isalnum(r) ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case NALNUM :
		for (len = chartorune(&r, scan) ; r != 0 && !UNICODE_isalnum(r) ; ) {
			/* XXX - while (*scan != 0 && !UNICODE_IsIdent(*scan)) */
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case PRINT :
		for (len = chartorune(&r, scan) ; r != 0 && (UNICODE_isprint(r) || UNICODE_isspace(r)) ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	case NPRINT :
		for (len = chartorune(&r, scan) ; r != 0 && !(UNICODE_isprint(r) || UNICODE_isspace(r)) ; ) {
			count++;
			scan += len;
			len = chartorune(&r, scan);
		}
		break;
	default:		/* Oh dear.  Called inappropriately. */
		*errp = URE_ERR_EXECUTION_BUG;
		count = 0;	/* Best compromise. */
		break;
	}
	tp->t_input = scan;

	return count;
}

/* return 1 if t_input is at the beginning of a word */
static int
IsBegWord(tummy_t *tp)
{
	Rune	r;

	(void) chartorune(&r, tp->t_input);
	if (UNICODE_IsIdent(r)) {
		if (tp->t_input == tp->t_bol) {
			return 1;
		}
		(void) priorrune(&r, tp->t_input);
		if (!UNICODE_IsIdent(r)) {
			return 1;
		}
	}
	return 0;
}

/* return 1 if t_input is at the end of a word */
static int
IsEndWord(tummy_t *tp)
{
	Rune	r;

	(void) chartorune(&r, tp->t_input);
	if (!UNICODE_IsIdent(r)) {
		(void) priorrune(&r, tp->t_input);
		if (tp->t_input != tp->t_bol && UNICODE_IsIdent(r)) {
			return 1;
		}
	}
	return 0;
}

/* return 1 if t_input is at the beginning of a line */
static int
IsBegLine(tummy_t *tp)
{
	Rune	r;

	if (tp->t_input == tp->t_bol) {
		return 1;
	}
	(void) priorrune(&r, tp->t_input);
	return (r == '\n');
}

/* compare, case-insensitively, a rune array to a utf string */
static int
runeutfncasecmp(Rune *rp, char *up, int len)
{
	Rune	r;
	int	diff;
	int	rc;
	int	i;

	for (diff = rc = 0 ; rc < len ; rc++) {
		i = chartorune(&r, up);
		up += i;
		if ((diff = UNICODE_tolower(r) - UNICODE_tolower(rp[rc])) != 0) {
			break;
		}
	}
	return diff;
}

/*
 - ureMatch - main matching routine
 *
 * Conceptually the strategy is simple:  check to see whether the current
 * node matches, call self recursively to see whether the rest matches,
 * and then act accordingly.  In practice we make some effort to avoid
 * recursion, in particular by going through "ordinary" nodes (that don't
 * need to know whether the rest of the match failed) by a loop instead of
 * by recursion.
 */
static int			/* 0 failure, 1 success */
ureMatch(Rune *prog, int flags, int *errp, tummy_t *tp)
{
	register Rune	*scan;		/* Current node. */
	Rune 		*next;		/* Next node. */
	Rune		r;
	int		i;

	for (scan = prog; scan != (Rune *) NULL ; ) {
		next = ureNext(scan, 0);

		switch (OP(scan)) {
		case BOL:
			if (!IsBegLine(tp)) {
				return 0;
			}
			break;
		case EOL:
			i = chartorune(&r, tp->t_input);
			if (r != '\0' && r != '\n') {
				return 0;
			}
			break;
		case BEGWORD:
			/* Match if current char is the beginning of a word */
			if (!IsBegWord(tp)) {
				return 0;
			}
			break;
		case ENDWORD:
			if (!IsEndWord(tp)) {
				return 0;
			}
 			break;
		case BORDER:
			if (!IsBegWord(tp) && !IsEndWord(tp)) {
				return 0;
			}
			break;
		case NBORDER:
			if (IsBegWord(tp) || IsEndWord(tp)) {
				return 0;
			}
			break;
		case WHITESP:
			/* match single whitespace */
			i = chartorune(&r, tp->t_input);
			if (r != 0 && !isspace(r)) {
				return 0;
			}
			tp->t_input += i;
			break;
		case NWHITESP:
			/* don't match eol, or space or tab */
			i = chartorune(&r, tp->t_input);
			if (r == 0 || isspace(r)) {
				return 0;
			}
			tp->t_input += i;
			break;
		case ALNUM: /* includes _ */
			i = chartorune(&r, tp->t_input);
			if (r == 0 || !UNICODE_IsIdent(r)) {
				return 0;
			}
			tp->t_input += i;
			break;
		case NALNUM:
			i = chartorune(&r, tp->t_input);
			if (r == 0 || UNICODE_IsIdent(r)) {
				return 0;
			}
			tp->t_input += i;
			break;
		case DIGIT:
			i = chartorune(&r, tp->t_input);
			if (r == 0 || !UNICODE_isdigit(r)) {
				return 0;
			}
			tp->t_input += i;
			break;
		case NDIGIT:
			i = chartorune(&r, tp->t_input);
			if (r == 0 || UNICODE_isdigit(r)) {
				return 0;
			}
			tp->t_input += i;
			break;
		case PRINT:
			i = chartorune(&r, tp->t_input);
			if (r == 0 || !(isprint(r) || isspace(r))) {
				return 0;
			}
			tp->t_input += i;
			break;
		case NPRINT:
			i = chartorune(&r, tp->t_input);
			if (r == 0 || isprint(r) || isspace(r)) {
				return 0;
			}
			tp->t_input += i;
			break;
		case ANYNONL:
			i = chartorune(&r, tp->t_input);
			if (r == '\0' || r == '\n') {
				return 0;
			}
			tp->t_input += i;
			break;
		case ANY:
			i = chartorune(&r, tp->t_input);
			if (r == '\0') {
				return 0;
			}
			tp->t_input += i;
			break;
		case EXACTLY: {
				register Rune	*opnd;
				register int	len;

				opnd = OPERAND(scan);
				/* Inline the first character, for speed. */
				if (flags & URE_ICASE) {
					i = chartorune(&r, tp->t_input);
					if (UNICODE_tolower(*opnd) !=
					    UNICODE_tolower(r)) {
						return 0;
					}
					if ((len = UNICODE_strlen(opnd)) > 1 &&
					    runeutfncasecmp(opnd, tp->t_input, len) != 0) {
						return 0;
					}
				} else {
					i = chartorune(&r, tp->t_input);
					if (*opnd != r) {
						return 0;
					}
					if ((len = UNICODE_strlen(opnd)) > 1 &&
					    runeutfncmp(opnd, tp->t_input, len) != 0) {
						return 0;
					}
				}
				while (len-- > 0) {
					i = chartorune(&r, tp->t_input);
					tp->t_input += i;
				}
			}
			break;
		case ANYOF:
			i = chartorune(&r, tp->t_input);
			if (flags & URE_ICASE) {
				if (!MatchClass(OPERAND(scan), UNICODE_tolower(r)) &&
				    !MatchClass(OPERAND(scan), UNICODE_toupper(r))) {
					return 0;
				}
			} else {
				if (!MatchClass(OPERAND(scan), r)) {
					return 0;
				}
			}
			tp->t_input += i;
			break;
		case ANYBUT:
			i = chartorune(&r, tp->t_input);
			if (flags & URE_ICASE) {
				if (MatchClass(OPERAND(scan), UNICODE_tolower(r)) &&
				    MatchClass(OPERAND(scan), UNICODE_toupper(r))) {
					return 0;
				}
			} else {
				if (MatchClass(OPERAND(scan), r)) {
					return 0;
				}
			}
			tp->t_input += i;
			break;
		case NOTHING:
			break;
		case BACK:
			break;
		case BRANCH: {
				register char	*save;
				int		c;

				if (OP(next) != BRANCH) {	/* No choice. */
					next = OPERAND(scan);	/* Avoid recursion. */
				} else {
					do {
						save = tp->t_input;
						if ((c = ureMatch(OPERAND(scan),
						    flags, errp, tp)) == 1 || c == -1) {
							return c;
						}
						tp->t_input = save;
						scan = ureNext(scan, 0);
					} while (scan != NULL &&
						 OP(scan) == BRANCH);
					return 0;
					/* NOTREACHED */
				}
			}
			break;
		case STAR:
		case PLUS: {
				register Rune	nextch;
				register char	*save;
				register char	*cp;
				register int	min;
				register int	num;
				int		j;
				int		c;

				/*
				 * Lookahead to avoid useless match attempts
				 * when we know what character comes next.
				 */
				nextch = '\0';
				if (OP(next) == EXACTLY) {
					nextch = *OPERAND(next);
				}
				min = (OP(scan) == STAR) ? 0 : 1;
				save = tp->t_input;
				num = ureRepeat(OPERAND(scan), errp, tp);
				if (num < 0) {
					return -1;
				}
				while (num >= min) {
					/* If it could work, try it. */
					i = chartorune(&r, tp->t_input);
					if (nextch == '\0' || r == nextch) {
						if ((c = ureMatch(next, flags, errp, tp)) == 1 || c == -1) {
							return c;
						}
					}
					/* Couldn't or didn't -- back up. */
					for (cp = save, j = 0, --num ; j < num ; j++, cp += i) {
						i = chartorune(&r, cp);
					}
					tp->t_input = cp;
				}
				return 0;
			}
			break;
		case MINMAX: {
				register char	*save;
				register int	no;
				Rune		min;
				Rune		max;
				int		c;

				next = OPERAND(scan);
				min = OP(next);
				next = OPERAND(next);
				max = OP(next);
				next = OPERAND(next);
				save = tp->t_input;
				for (no = 0 ; no < min ; no++) {
					c = ureMatch(next, flags, errp, tp);
					if (c == 0 || c == -1) {
						tp->t_input = save;
						return c;
					}
				}
				for ( ; no < max ; no++) {
					c = ureMatch(next, flags, errp, tp);
					if (c == -1) {
						return -1;
					}
					if (c == 0) {
						break;
					}
				}
				return 1;
			}
			break;
		case END:
			return 1;	/* Success! */
		default:
			if (OP(scan) & OPEN) {
				register char	*save;
				register int	no;
				int		c;

				no = OP(scan) - OPEN;
				save = tp->t_input;

				if ((c = ureMatch(next, flags, errp, tp)) == 1) {
					/*
					 * Don't set startp if some later
					 * invocation of the same parentheses
					 * already has.
					 */
					if (tp->t_matchv[no].rm_so == -1) {
						tp->t_matchv[no].rm_so = tp->t_off + (save - tp->t_start);
					}
					return 1;
				}
				return c;
			}
			if (OP(scan) & CLOSE) {
				register char	*save;
				register int	no;
				int		c;

				no = OP(scan) - CLOSE;
				save = tp->t_input;

				if ((c = ureMatch(next, flags, errp, tp)) == 1) {
					/*
					 * Don't set endp if some later
					 * invocation of the same parentheses
					 * already has.
					 */
					if (tp->t_matchv[no].rm_eo == -1) {
						tp->t_matchv[no].rm_eo = tp->t_off + (save - tp->t_start);
					}
					return 1;
				}
				return c;
			}
			*errp = URE_ERR_MEMORY_CORRUPTION;
			return -1;
		}

		scan = next;
	}

	/*
	 * We get here only if there's trouble -- normally "case END" is
	 * the terminating point.
	 */
	*errp = URE_ERR_CORRUPTED_POINTERS;
	return -1;
}

/*
 - ureTry - try match at specific point
 */
static int			/* 0 failure, 1 success */
ureTry(ure_t *prog, tummy_t *tp, int flags, int *errp)
{
	register int	i;

	for (i = 0 ; i < tp->t_matchc ; i++) {
		tp->t_matchv[i].rm_so = tp->t_matchv[i].rm_eo = (ureoff_t) -1;
	}
	if (ureMatch(prog->u_prog + 1, flags, errp, tp)) {
		if (tp->t_matchc > 0) {
			tp->t_matchv[0].rm_so = (ureoff_t) tp->t_off;
			tp->t_matchv[0].rm_eo = (ureoff_t) tp->t_off + (tp->t_input - tp->t_start);
		}
		return 1;
	}
	return 0;
}

/*
 - ureexec - match a URE against a string
 */
int
ureexec(ure_t *up, char *string, int matchc, urematch_t *matchv, int eflags, char *collseq)
{
	register char	*s;
	tummy_t		t;
	Rune		r;
	char		str[20];
	int		err;
	int		cc;
	int		i;

	/* Be paranoid... */
	if (up == (ure_t *) NULL || string == (char *) NULL) {
		return URE_ERR_NULL_PARAM;
	}

	/* Check validity of program. */
	if (UCHARAT(up->u_prog) != URE_MAGIC) {
		return URE_ERR_BAD_MAGIC;
	}

	/* sort out our language */
	urecollseq(collseq);

	/*
	 * if we were compiled ignoring case-sensitivity, add
	 * case-insensitivity at run-time
	 */
	if ((up->u_flags & URE_FLAG_BITS) & URE_ICASE) {
		eflags |= URE_ICASE;
	}

	/* funny range to search */
	if ((eflags & URE_FLAG_BITS) & URE_STARTEND) {
		string += matchv[0].rm_so;
		/* ends at matchv[0].rm_eo; */
	}

	/* If there is a "must appear" string, look for it. */
	if (up->u_must != NULL) {
		if ((eflags & URE_FLAG_BITS) & URE_ICASE) {
			s = string;
			r = UNICODE_toupper(up->u_must[0]);
			cc = runetochar(str, &r);
			r = UNICODE_tolower(up->u_must[0]);
			cc += runetochar(&str[cc], &r);
			r = 0;
			(void) runetochar(&str[cc], &r);
			while ((s = utffindrune(s, str)) != NULL) {
				if (runeutfncasecmp(up->u_must, s,
							up->u_mlen) == 0) {
					break;	/* Found it. */
				}
				i = chartorune(&r, s);
				s += i;
			}
			if (s == NULL)	/* Not present. */
				return URE_NOMATCH;
		} else {
			s = string;
			while ((s = utfrune(s, up->u_must[0])) != NULL) {
				if (runeutfncmp(up->u_must, s, up->u_mlen) == 0) {
					break;	/* Found it. */
				}
				i = chartorune(&r, s);
				s += i;
			}
			if (s == NULL)	/* Not present. */
				return URE_NOMATCH;
		}
	}

	/* Mark beginning of line for ^ . */
	t.t_bol = ((eflags & URE_FLAG_BITS) & URE_NOTBOL) ?
			(char *) NULL : string;

	t.t_matchc = matchc;
	t.t_matchv = matchv;

	/* Simplest case:  try anchored match for each beginning of line */
	if (up->u_anch) {
		t.t_off = 0;
		t.t_start = t.t_input = s = string;
		if (*s == 0) {
			/* don't match ^ with 0 byte */
			return URE_NOMATCH;
		}
		if (ureTry(up, &t, eflags & URE_FLAG_BITS, &err)) {
			return URE_SUCCESS;
		}
		while ((s = utfrune(s, '\n')) != (char *) NULL) {
			s += chartorune(&r, s);
			if (*s == 0) {
				/* don't match ^ with 0 byte */
				return URE_NOMATCH;
			}
			t.t_bol = s;
			t.t_off = s - string;
			t.t_input = t.t_start = s;
			if (ureTry(up, &t, eflags & URE_FLAG_BITS, &err)) {
				return URE_SUCCESS;
			}  
		}
		return URE_NOMATCH;
	}

	/* Messy cases:  unanchored match. */
	s = string;
	if (up->u_start != '\0') {
		if ((eflags & URE_FLAG_BITS) & URE_ICASE) {
			/* We know what char it must start with. */
			r = UNICODE_toupper(up->u_start);
			cc = runetochar(str, &r);
			r = UNICODE_tolower(up->u_start);
			cc += runetochar(&str[cc], &r);
			r = 0;
			(void) runetochar(&str[cc], &r);
			while ((s = utffindrune(s, str)) != NULL) {
				t.t_off = s - string;
				t.t_input = t.t_start = s;
				if (ureTry(up, &t, eflags & URE_FLAG_BITS, &err)) {
					return URE_SUCCESS;
				}  
				i = chartorune(&r, s);
				s += i;
				if (r == '\n') {
					t.t_bol = s;
				}
			}
		} else {
			/* We know what char it must start with. */
			while ((s = utfrune(s, up->u_start)) != NULL) {
				t.t_off = s - string;
				t.t_input = t.t_start = s;
				if (ureTry(up, &t, eflags & URE_FLAG_BITS, &err)) {
					return URE_SUCCESS;
				}
				i = chartorune(&r, s);
				s += i;
				if (r == '\n') {
					t.t_bol = s;
				}
			}
		}
	} else {
		/* We don't -- general case. */
		do {
			t.t_off = s - string;
			t.t_input = t.t_start = s;
			if (ureTry(up, &t, eflags & URE_FLAG_BITS, &err)) {
				return URE_SUCCESS;
			}
			i = chartorune(&r, s);
			s += i;
			if (r == '\n') {
				t.t_bol = s;
			}
		} while (r != 0);
	}

	/* Failure. */
	return URE_NOMATCH;
}


/* Textual error messages */
static char	*ureErrMsgs[] = {
	/* URE_SUCCESS */			"No error",
	/* URE_NOMATCH */			"No matches found",
	/* URE_ERR_UNMATCHED_BRACKET */		"Unmatched []",
	/* URE_ERR_INTERNAL_MESS */		"internal urp",
	/* URE_ERR_REPEAT_FOLLOWS_NOTHING */	"?+*{ follows nothing",
	/* URE_ERR_TRAILING_BACKSLASH */	"trailing \\",
	/* URE_ERR_INTERNAL_DISASTER */		"internal disaster",
	/* URE_ERR_OPERAND_EMPTY */		"*+{ operand could be empty",
	/* URE_ERR_NESTED_REPEAT */		"nested *?+{",
	/* URE_ERR_TOO_MANY_PARENS */		"too many ()",
	/* URE_ERR_UNMATCHED_PAREN */		"unmatched ()",
	/* URE_ERR_JUNK_ON_END */		"junk on end",
	/* URE_ERR_NULL_ARG */			"NULL argument",
	/* URE_ERR_TOO_BIG */			"URE too big",
	/* URE_ERR_EXECUTION_BUG */		"internal execution problems",
	/* URE_ERR_MEMORY_CORRUPTION */		"memory corruption",
	/* URE_ERR_CORRUPTED_POINTERS */	"corrupted pointers",
	/* URE_ERR_NULL_PARAM */		"NULL parameter",
	/* URE_ERR_BAD_MAGIC */			"corrupted program",
	/* URE_ERR_CORRUPTED_OPCODE */		"corrupted opcode",
	/* URE_ERR_SUBST_NULL_ARG */		"NULL parm to URE_Subst",
	/* URE_ERR_SUBST_BAD_MAGIC */		"damaged URE in URE_Subst",
	/* URE_ERR_SUBST_DAMAGED_MATCH */	"damaged match string",
	/* URE_ERR_OUT_OF_SPACE */		"out of space",
	/* URE_ERR_UNKNOWN */			"unknown error"
};

/*
 - ureerror - the interface to errors
 */
int
ureerror(int errnum, ure_t *up, char *buf, int size)
{
	char	*s;
	int	len;

	if (errnum < 0 || errnum > URE_ERR_OUT_OF_SPACE) {
		errnum = URE_ERR_UNKNOWN;
	}
	s = ureErrMsgs[errnum];
	len = strlen(s) + 1;
	if (size > 0) {
		if (size > len)
			(void) strcpy(buf, s);
		else {
			(void) strncpy(buf, s, size-1);
			buf[size-1] = '\0';
		}
	}
	return len;
}

/* Free the space we allocated */
void
urefree(ure_t *up)
{
	(void) free(up->u_prog);
}
