/*
 * SimpleText(str, n, start, clip)
 *
 *   Writes text to the framebuffer, a DEC QVSS.  Takes (n) characters from
 * the string (str) and writes them starting at (start), subject to the
 * clipping rectangle in (clip).  Uses StandardFont[], a fixed-size font in
 * which every character occupies 8 pixels horizontally and 16 vertically.
 *
 * General Outline:
 *
 *   This version of SimpleText has been tuned in an attempt to make its
 * performance on a MicroVAX II almost acceptable.  The main bottleneck is the
 * access path to the framebuffer - a Q-BUS, which has a 16-bit data path and
 * a long cycle time.  We win by trying, whenever possible, to ensure that
 * Q-BUS accesses are longword-aligned [the bus does handle 32-bit requests,
 * and these go no slower than two 16-bit requests] rather than aligned on
 * arbitrary bit boundaries.
 *
 *   We regard the QVSS as 30 columns, each 32 bits wide and aligned on a
 * 32-bit boundary.  When we write an arbitrary string, possibly clipped, it
 * will consist of a left section (0..31 bits), a centre section (0 or more
 * columns) and a right section (0..31 bits).  If the clipped string is at most
 * 32 bits wide, we treat it as a left section that may lap over a column
 * boundary.
 *
 *   The general procedure for the centre section is
 *	for each column do
 *	    for each row do
 *	      {
 *		get bytes from the font table for this row of all the
 *		    characters in this column.  Note that in general five, not
 *		    four, characters are (partially) displayed in a column.
 *		shift the assembled bytes to drop bits we don't need on the
 *		    left of the column (i.e. least significant bits, so do a
 *		    right shift)
 *		write the resulting 32-bit value to the current (row,column)
 *	      }
 *   For the right section, we don't write out a full 32 bits.  We also only
 * look up as many font-table entries as we need (this may be five, but will
 * often be less).  For the left section, we must also start writing at an
 * arbitrary bit position in the left column.
 *
 *   We use registers r1-r5 as pointers into the font table for the five
 * characters in the column.  Because of overlap, r5 in one column will become
 * r1 in the next.  The code for the centre and right sections assumes that r1
 * already contains the correct value; there are some hacks in the left section
 * to ensure that this assumption is reasonable.
 *
 *   If the string happens to be aligned on a byte boundary, rather than any
 * old bit boundary, then there is no overlap between columns.  We could write
 * code so that only four font-table pointers were used and there was no carry-
 * over from one column to the next, but at present we don't; we are content to
 * load and ignore one byte.  We choose to ignore the leftmost (least
 * significant) byte rather than the rightmost one.  The (not very solid)
 * reason for this goes: it would be unfortunate to read one byte beyond the
 * end of the string if the string happened to end on a page boundary and the
 * next page was not readable.  It might actually be cleaner to write special-
 * case code for the non-overlap case (there is already some special-casing in
 * the left section which could be abolished).
 *
 * Speed of bit-mangling operations:
 *
 *   As implied above the VAX insv instruction is best avoided, both because
 * it involves a lot of Q-BUS traffic and because the instruction is slow in
 * its own right.  We still have to use it, though, for the left and right
 * sections.
 *
 *   Alternative instructions, and the relative times they take when all their
 * operands live in registers, are:
 *	    rotl (15..18), ashl (14..25), ashq (28..36), extv (>= 41).
 * [The numbers are garnered from some crude timing tests and should not be
 *  treated with too much reverence].
 * Hence we use rotl whenever possible (in the left and right sections where we
 * are manipulating at most 4 bytes).  For dealing with five bytes, it turns
 * out to be about as fast to {copy the bytes to a scratch area on the stack
 * and then do an extv from there} as it is to do anything else.
 *
 *   Another set of timing tests suggested that autopredecrement addressing was
 * somewhat faster than autopostincrement (beats me why), so the row loops have
 * been coded to run from the bottom row to the top.  One could conceivably
 * recode the column loops to run from right to left, but it's not clear that
 * this would be such a win.
 *
 * Registers and the C/assembler interface
 *
 *   Various crocks have been used to keep as much data as possible in
 * registers, particularly in the inner loops.
 *
 * WARNING: We assume quite a lot about the C compiler -
 *	- r11 down to r6 are used for C register variables if requested, and
 *	  are otherwise untouched
 *	- r0 to r5 are used by C for scratch values, but we assume in some
 *	  places that r1 will be preserved across C statements (dangerous)
 *	- non-register variables are allocated at successively larger negative
 *	  offsets from the frame pointer, so that the first one starts just
 *	  below 0(fp).
 *
 *   In the first part of the code (the part without asm()s), I've attempted a
 * manual register allocation for r6-r11.  Some of the registers are used for
 * more than one purpose.  In an attempt to prevent complete anarchy, we have:
 *	- r6 to r10 are declared as int registers (r11 always holds the string
 *	  pointer, so we treat it specially)
 *	- we use #defines to indicate what the current contents of each
 *	  register is; the definition may include a type-cast to something
 *	  other than int.  The name we define always includes the register-name
 *	  to make it obvious what we're up to.  We #undef the name when we use
 *	  the register for something else.
 *	- since we can't type-cast lvalues, we always assign to the unadorned
 *	  register-name, and type-cast the rvalue to int if necessary.
 *
 *   In the parts of the code which draw the left, centre and right sections,
 * we use as many of r0-r11 as we need.  There are times when another register
 * or two might be useful; we could use ap and even sp (just preserve fp), but
 * it doesn't seem quite kosher.  The registers are used roughly as follows:
 *	r11	pointer to the string
 *	     or scratch: assemble the bytes for one column
 *	r10	pointer to the next column
 *	     or pointer to current row of current column
 *	r9	bit shift used to throw away low-order bits we don't want
 *		  (positive if we're using extv, or negative for rotl).
 *	r8	number of rows to display (1..16)
 *	     or scratch: down-counter of number of rows left in current column
 *	     or width (number of bits) of left section
 *	r7	pointer to current row of current column (centre section)
 *	     or offset from left column-boundary to start of left section
 *	     or width (number of bits) of right section
 *	r6	pointer to the font table (precisely: address of the font-table
 *		  entry for character 0x00, plus an offset to the byte for the
 *		  row immediately after the last one we would actually display)
 *	r5	pointer to font-table entry for rightmost character in column
 *	r4	pointer to font-table entry
 *	r3	pointer to font-table entry
 *	r2	pointer to font-table entry
 *	     or scratch: assemble the bytes for left section
 *	r1	pointer to font-table entry for leftmost  character in column
 *	     or pointer to current row of left section
 *	r0	down-counter of number of rows left in current column
 *
 * Sundries:
 *
 *   We don't perform any consistency checks on the string's length, the start
 * vector or the clipping rectangle.  If these tell us to write data at an
 * arbitrary location in memory, we will do so.
 *
 *   This file must be compiled WITHOUT THE C OPTIMIZER, which seems to think
 * that the value in r0 becomes dead at the end of basic blocks (well, it
 * blithely removes some assignments to r0 that we need).
 *
 * Results:
 *
 *   All timing tests were conducted with a full 16 rows.  Left and right
 * sections are markedly more expensive than centre columns, no great surprise.
 * For a given width (i.e. clip->size.h, assuming sensible things about n and
 * clip->start.h), start->h was varied from 0 to 7 and clip->start.h was varied
 * from start->h to start->h + 31; each combination was sampled twice, giving
 * 512 data points.  This version has an overhead of about 500 usec and then
 * takes roughly 11 usec per pixel of width (for all 16 rows).  The previous
 * incarnation of this routine had a similar overhead and then took about 30
 * usec per pixel.  This translates to about 7.2 msec to write an 80-character
 * line with this version, or about 19.5 with the previous version.
 *
 *   The above are best-case results.  There are some cases where this version
 * actually loses by 5% or so.  All such cases are for widths of at most 16
 * bits.
 * 
 * Further work:
 *
 *   It is possible that things might run faster if we copied all the desired
 * font data into a large scratchpad, then shifted it and wrote it to the QVSS.
 * This has not been tried.  If one interpreted "all" literally, then the
 * scratchpad would have to hold 16 rows of pixels, i.e. would be 2Kbytes.  One
 * could, I suppose, compromise and do the thing in chunks of, say, 16 rows by
 * 256 bits (32 characters).
 *
 *   Given that shifting 5 bytes is such a pain, one might hope to do better by
 * using 16- instead of 32-bit columns.  Preliminary tests suggest that this is
 * not the case: the 16-bit version wins in some cases but loses overall.
 * Using 48-bit columns seemed just a bit too strange.
 *
 *   So much for major changes (though if anyone finds one which is an overall
 * win, I'll stand them a beer).  There are minor changes which can still be
 * made and which should slightly reduce the length-independent overhead:
 * essentially, use asm()s wherever possible.  This both removes some of the
 * inanely inefficient code generated by cc and also gives us more control over
 * the clobbering of registers.  Some of the code has to be in C, because we're
 * using struct definitions for Vector and Rectangle and don't have assembler
 * equivalents.
 *
 *   It may also be worth introducing special-case code in the centre and right
 * sections for byte-aligned characters, as suggested above.
 */

#include <Vfont.h>
#include <Vio.h>

/*
 * Font table.  The drawing code below addresses it as
 *   StandardFont + 16*character + row, reading one byte per row.
 */
extern u_char StandardFont[];
/*
 * Screen raster descriptor(s).  The routine below takes the frame-buffer
 *   base address from WholeScreen[-1].start.
 *   NOTE(review): the reason for index -1 is not visible in this file -
 *   confirm against the definition of WholeScreen.
 */
extern VRaster *WholeScreen;

#ifdef DEBUG
extern int Debug_SimpleText;	/* Debug flag: non-zero value means print    */
#endif DEBUG			/*   debugging info (to stdout)		     */

#define NROWS 16		/* Number of rows per character 	     */
#define FbStride (1024>>3)	/* Offset (in bytes) from one row to the next*/
;asm("	.set	X_FbStride,	1024>>3");	/* Same, used in asm()s      */
/*
 * Note that the C code in this file spells the row stride as "<< 7" rather
 *   than using FbStride; the three must be kept in step by hand.
 */

#ifndef ROUTINE_NAME		/* This crock lets us build versions with    */
#define ROUTINE_NAME SimpleText	/*   different names, so we can compare them.*/
#endif  ROUTINE_NAME

/*
 * Internal-error reporter, defined at the end of this file.  Called from the
 *   default: arms of the left- and right-section switches.
 */
static void logic_error();

/*
 * ROUTINE_NAME(str, n, start, clip)	[ROUTINE_NAME defaults to SimpleText]
 *
 *   Draws (n) characters of (str) into the framebuffer with their top-left
 * corner at (start), showing only the part that lies inside (clip).
 *
 *	str	characters to draw (need not be NUL-terminated; n is used)
 *	n	number of characters
 *	start	position of the top-left pixel of the first character
 *	clip	clipping rectangle (start + size)
 *
 *   Returns no value.  Returns early, without drawing, if the clipped area
 * has no rows or no columns.  No validation of the arguments is done.
 *
 *   Structure: a C prologue works out the clipped extent and splits it into
 * a left section, zero or more whole 32-bit centre columns and a right
 * section; then three asm() stages draw them in that order.  The asm() code
 * depends on the register declarations and on the order of the first three
 * stack variables below - do not reorder them.
 */
ROUTINE_NAME(str, n, start, clip)
    unsigned char *str;		/* String to display       */
    int		   n;		/* Number of characters    */
    Vector	  *start;	/* Coordinates of top left */
    Rectangle	  *clip;	/* Clipping rectangle      */
  {
    register unsigned char *r11;
    register int	    r10;
    register int	    r9;
    register int	    r8;
    register int	    r7;
    register int	    r6;

/*
 * Variables that (initially) live in registers:
 */

#define	r11_str		    		      r11	/* )		     */
#define	r10_start	    ((Vector        *)r10)	/* ) see above	     */
#define	r9_clip		    ((Rectangle     *)r9 )	/* )		     */
#define	r7_min_row			      r7	/* First row 	     */
							/* displayed (0..15) */
#define	r6_max_row_p1			      r6	/* Row after last row*/
							/* displayed         */
	/* yes folks, r8 is underutilized */

/*
 * Variables that live on the stack, used by C code and by asm()s:
 *   (their frame offsets are hard-wired in the .set directives below)
 */

    int      nrows;		/* -4(fp) Number of rows displayed ( <= 16 ) */
    char     char_scratch[8];	/*-12(fp) Scratchpad for extv instructions   */
				/*        We only use 5 bytes, but this keeps*/
				/*	  the stack longword-aligned.	     */
    char     temp[8];		/*-20(fp) Used to save r10, r11		     */

/*
 * Variables that live on the stack, used only by C code:
 */

    unsigned centre_extv_pos;	/* Offset for 'extv' instruction in centre   */
				/*   and right sections			     */
    BitUnit *right_boundary;	/* Pointer to right section (strictly, to the*/
				/*   row after the last row in the right     */
				/*   section).  If there is no right section */
				/*   and no centre section, then this can    */
				/*   take any value less than or equal to a  */
				/*   pointer to the left section.	     */
    unsigned right_length;	/* Length (bits) of right section ( <= 31 )  */
    int	     left_nbytes;	/* Number of characters used in left section */
    int      right_nbytes;	/* Number of characters used in right section*/
				/*   (if data is byte-aligned, this includes */
				/*    the character that we ignore)	     */
    int	     right_bit_p1;	/* Number of bit after rightmost bit we show */

/*
 * Assembler equivalents of the variables shared by C and by asm()s
 */

    asm("	.set	X_nrows,		 -4");
    asm("	.set	X_char_scratch,		-12");
    asm("	.set	X_temp,			-20");


    r11 =       str;	/* Yes, we could have just declared the parameters */
    r10 = (int) start;	/*   to be register parameters, but then we'd be   */
    r9  = (int) clip;	/*   stuck with funny types for r9 and r10.	   */

#ifdef DEBUG
    if (Debug_SimpleText)
	printf("SimpleText(%s, %d, (%d,%d), [(%d,%d),(%d,%d)])\n",
	    r11_str, n, r10_start->v, r10_start->h,
	    r9_clip->start.v,r9_clip->start.h,r9_clip->size.v,r9_clip->size.h);
#endif DEBUG

    /*
     * Vertical clipping: r7 = first character row shown, r6 = one past the
     *   last character row shown, both relative to the top of the character
     *   cell and clamped to 0..NROWS.
     */
    if ((r7 = r9_clip->start.v - r10_start->v) > 0)
	 /* # of lines to clip at top */
      {
#ifdef DEBUG
	if (Debug_SimpleText) printf("[Top clip: %d]\n", r7_min_row);
#endif DEBUG
      }
    else
      {
	r7 = 0;
      }

    if ( (r6 = r9_clip->start.v + r9_clip->size.v - r10_start->v) <
	 NROWS )
      { /* bottom clipping */
#ifdef DEBUG
	if (Debug_SimpleText)
	    printf("[Bottom clip: %d; t=%d,r=%d]\n",
		    NROWS-r6_max_row_p1, r7_min_row,
		    r6_max_row_p1 - r7_min_row);
#endif DEBUG
      }
    else
      {
	r6 = NROWS;
      }

    if ( (nrows = r6_max_row_p1 - r7_min_row) <= 0 )
	return;			/* Clipped away completely (vertically) */

#undef	r7_min_row
#define	r7_left_bit	r7	 /* Number of leftmost bit displayed */

    /*
     * Horizontal clipping: r7 = leftmost bit shown, right_bit_p1 = one past
     *   the rightmost bit shown, both in screen coordinates.
     */
    if ( (r7 = r10_start->h) < r9_clip->start.h)
	r7 = r9_clip->start.h;

    if ( (right_bit_p1 = r10_start->h + 8*n) >
		       r9_clip->start.h + r9_clip->size.h)
	right_bit_p1 = r9_clip->start.h + r9_clip->size.h;

#undef	r9_clip
#define r9_scratch      r9
#define	r8_left_length	r8  /* Length (bits) of left section */

    if ( (r8 = right_bit_p1 - r7_left_bit) <= 32 )
      {
	/* Short string: everything is drawn as a single "left section". */
	if (r8_left_length <= 0)
	    return;		/* Clipped away completely (horizontally) */
	right_length   = 0;	/* There's no right section       */
	right_boundary = 0;	/*   and no centre section either */
	/*
	 * Neither of the next two is needed to draw a short string, but
	 *   both are read later on (centre_extv_pos is copied into r9
	 *   before the centre loop, and both are printed under DEBUG), so
	 *   give them defined values instead of leaving stack garbage.
	 */
	centre_extv_pos= 0;
	right_nbytes   = 0;
      }
    else
      {
	r8	       = (-r7_left_bit) & 31;	/* Bits up to next column */
	centre_extv_pos= 8 - (r10_start->h & 7);
	    /* If no shifting needed, start at 8 (not 0 as you'd expect), */
	    /*   i.e. use bytes 2..5, not 1..4				  */
	right_nbytes   = (centre_extv_pos + (right_length=right_bit_p1&31) +7)
			   >> 3;
	/* Byte offset, within a row, of the column holding the right section */
	right_boundary = (BitUnit *) ( (right_bit_p1 >> 3) & ~3 );
      }

    r9   = r7_left_bit - r10_start->h;	   /* Bits clipped off on the left   */
    r11 += r9_scratch >> 3;		   /* Ignore left-clipped characters */
    r9   = - (r9_scratch & 7);
#undef  r9_scratch
#define	r9_left_rotl_count  r9		   /* Bitcount for 'rotl' instruction*/
					   /*   (ranges from -7 to 0)	     */
    left_nbytes    = (r8_left_length - r9_left_rotl_count + 7) >> 3;
    /*
     * "<< 7" multiplies the row number by the row stride (128 bytes, the
     *   same value as FbStride).
     *   NOTE(review): why the base comes from WholeScreen[-1] rather than
     *   WholeScreen[0] is not explained in this file - confirm.
     */
    r10		   = (int)WholeScreen[-1].start +
				((r10_start->v + r6_max_row_p1) <<7 );
#undef	r10_start
    /*
     * r10 now points to the start of the row after the last row we display.
     *   We use this to generate right_boundary and then update it to point
     *   to the leftmost column (or partial column) we display.  Both of these
     *   point to the last-row-plus-one because we'll use autopredecrement
     *   addressing.
     */
    right_boundary+= (unsigned)r10;
    r10		  += (r7_left_bit >> 3) & ~3;
#define	r10_start_addr	    ((BitUnit *)r10)
    r7		  &=  31;
#undef	r7_left_bit				/* Note that r7 changes      */
#define	r7_left_start	    ((unsigned )r7)	/* meaning in mid-expression */

    r6		  += (int)StandardFont;
#undef	r6_max_row_p1				/* Note that r6 changes	     */
#define	r6_fontStart	    ((BitUnit *)r6)	/* meaning in mid-expression */

#ifdef DEBUG
    if (Debug_SimpleText)
      {
	printf("  str->'%c', start_addr=0x%x, left_rotl_count=%d, left_length=%d\n",
		*r11_str, r10_start_addr, r9_left_rotl_count, r8_left_length);
	printf(
	"  left_start=%d, nrows=%d, centre_extv_pos=%d, right_boundary=0x%x\n",
		r7_left_start, nrows, centre_extv_pos, right_boundary);
	printf("  right_length=%d, left_nbytes=%d, right_nbytes=%d\n",
		right_length, left_nbytes, right_nbytes);
      }
#endif DEBUG

/*
 * DEBUG_CHAR echoes the next character to be consumed.  It saves and restores
 *   r0-r5 (mask 63) around the printf so the asm() code's scratch registers
 *   survive the call.
 */
#ifdef DEBUG
#define DEBUG_CHAR		\
    asm("	pushr	$63");	\
    if (Debug_SimpleText)	\
	printf("%c", *r11_str);	\
    0 == 1; /* Make sure the branch doesn't miss the popr */ \
    asm("	popr	$63");
#else DEBUG
#define DEBUG_CHAR
#endif DEBUG

    /*
     * ---- Left section ----
     *
     * In every case below a font pointer is built as
     *	    r6 + 16 * character		("rotl $4" does the multiply)
     *   i.e. the address just past the last displayed row of that character,
     *   ready for autopredecrement.  On exit from each case r1 holds r5 put
     *   back to that position, because the rightmost character of this
     *   section is the leftmost one of the next column.
     */
    if (r8_left_length)
	switch (left_nbytes)
	    /*
	     * We use a switch rather than asm("caseb ...") because the latter
	     *   confuses the optimizer (it gets the labels wrong).  We could
	     *   have one general piece of code instead of five cases, but it
	     *   would be slower and probably even less comprehensible.
	     */
	  {
	    case 1:	/* Just one character; r5 points to font entry for it*/
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r5");
		asm("	rotl	$4,	r5,	r5");
		asm("	addl2	r6,		r5");

		asm("	movl	X_nrows(fp),r0");
		asm("	moval	(r10)+,		r1");
		asm("left_5_loop:");
		asm("	subl2	$X_FbStride,	r1");
		asm("	movb	-(r5),	r2");
		asm("	rotl	r9,	r2,	r2");
		asm("	insv	r2,	r7,	r8,	(r1)");
		asm("	sobgtr	r0,		left_5_loop");
		asm("	addl3	X_nrows(fp),	r5,	r1");
		break;

	    case 2:	/* Two characters; similar but use r4 for first */
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r4");
		asm("	rotl	$4,	r4,	r4");
		asm("	addl2	r6,		r4");
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r5");
		asm("	rotl	$4,	r5,	r5");
		asm("	addl2	r6,		r5");

		asm("	movl	X_nrows(fp),r0");
		asm("	moval	(r10)+,		r1");
		asm("left_45_loop:");
		asm("	subl2	$X_FbStride,	r1");
		asm("	movb	-(r5),	r2");
		asm("	rotl	$8,	r2,	r2");
		asm("	movb	-(r4),	r2");
		asm("	rotl	r9,	r2,	r2");
		asm("	insv	r2,  r7, r8, (r1)");
		asm("	sobgtr	r0,		left_45_loop");
		asm("	addl3	X_nrows(fp),	r5,	r1");
		break;

	    case 3:	/* Three characters; more of the same */
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r3");
		asm("	rotl	$4,	r3,	r3");
		asm("	addl2	r6,		r3");
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r4");
		asm("	rotl	$4,	r4,	r4");
		asm("	addl2	r6,		r4");
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r5");
		asm("	rotl	$4,	r5,	r5");
		asm("	addl2	r6,		r5");
		asm("	movl	X_nrows(fp),r0");
		asm("	moval	(r10)+,		r1");
		asm("left_345_loop:");
		asm("	subl2	$X_FbStride,	r1");
		asm("	movb	-(r5),	r2");
		asm("	rotl	$8,	r2,	r2");
		asm("	movb	-(r4),	r2");
		asm("	rotl	$8,	r2,	r2");
		asm("	movb	-(r3),	r2");
		asm("	rotl	r9,	r2,	r2");
		asm("	insv	r2,  r7, r8, (r1)");
		asm("	sobgtr	r0,		left_345_loop");
		asm("	addl3	X_nrows(fp),	r5,	r1");
		break;

	    case 4:	/* Four characters.  Use r2 as a font pointer, so we */
			/* use r11 instead of r2 as a scratch register, hence*/
			/* we have to save/restore r11.			     */
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r2");
		asm("	rotl	$4,	r2,	r2");
		asm("	addl2	r6,		r2");
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r3");
		asm("	rotl	$4,	r3,	r3");
		asm("	addl2	r6,		r3");
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r4");
		asm("	rotl	$4,	r4,	r4");
		asm("	addl2	r6,		r4");
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r5");
		asm("	rotl	$4,	r5,	r5");
		asm("	addl2	r6,		r5");
		asm("	movl	r11,	X_temp(fp)");	/* Faster than push? */
		asm("	movl	X_nrows(fp),r0");
		asm("	moval	(r10)+,	r1");
		asm("left_2345_loop:");
		asm("	subl2	$X_FbStride,	r1");
		asm("	movb	-(r5),	r11");
		asm("	rotl	$8,	r11,	r11");
		asm("	movb	-(r4),	r11");
		asm("	rotl	$8,	r11,	r11");
		asm("	movb	-(r3),	r11");
		asm("	rotl	$8,	r11,	r11");
		asm("	movb	-(r2),	r11");
		asm("	rotl	r9,   	r11,	r11");
		asm("	insv	r11,  r7, r8, (r1)");
		asm("	sobgtr	r0,		left_2345_loop");
		asm("	movl	X_temp(fp),	r11");	/* Restore r11 */
		asm("	addl3	X_nrows(fp),	r5,	r1");
		break;

	    case 5:	/* Five characters.  Instead of putting them in a    */
			/* register and shifting/rotating/extv'ing, we put   */
			/* them in scratch RAM and extv from there; it takes */
			/* about the same time and uses one register fewer.  */
			/* Again, we use r11 as scratch; we also use r10 as  */
			/* the frame-buffer pointer, since we can't use r1.  */
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r1");
		asm("	rotl	$4,	r1,	r1");
		asm("	addl2	r6,		r1");
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r2");
		asm("	rotl	$4,	r2,	r2");
		asm("	addl2	r6,		r2");
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r3");
		asm("	rotl	$4,	r3,	r3");
		asm("	addl2	r6,		r3");
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r4");
		asm("	rotl	$4,	r4,	r4");
		asm("	addl2	r6,		r4");
		DEBUG_CHAR
		asm("	movzbl	(r11)+,	r5");
		asm("	rotl	$4,	r5,	r5");
		asm("	addl2	r6,		r5");

		asm("	movq	r10,	X_temp(fp)");	/* Saves r10 AND r11 */
		asm("	movl	X_nrows(fp),r0");
		/*
		 * We're doing an extv, not a rotl, so negate the count.  The
		 *   count cannot be 0, or we'd be moving > 32 bits, since at
		 *   least 1 bit from -(r5) is always used.
		 */
		asm("	mnegl	r9,	r9");

		asm("left_12345_loop:");
		asm("	subl2	$X_FbStride,	r10");
		asm("	movb	-(r1), X_char_scratch+0(fp)");
		asm("	movb	-(r2), X_char_scratch+1(fp)");
		asm("	movb	-(r3), X_char_scratch+2(fp)");
		asm("	movb	-(r4), X_char_scratch+3(fp)");
		asm("	movb	-(r5), X_char_scratch+4(fp)");
		asm("	extv	r9,   r8, X_char_scratch  (fp), r11");
		asm("	insv	r11,  r7, r8, (r10)");
		asm("	sobgtr	r0,		left_12345_loop");
		asm("	movq	X_temp(fp),	r10");
		/*
		 * Adjust (the restored) r10 to point to the next column.
		 *   In the cases above we got this as a side-effect of
		 *   "moval (r10)+, r1".
		 */
		asm("	addl2	$4,	r10");
		asm("	addl3	X_nrows(fp),	r5,	r1");
		break;

	    default:
		logic_error("switch() for left section: bogus left_nbytes",
			str, n, start, clip);
	  }
	else /* r8_left_length == 0: there is no left section */
	  {
	    /*
	     * Note that this else pairs with "if (r8_left_length)", not with
	     *   left_nbytes; left_nbytes itself may be 1 here when the string
	     *   is not byte-aligned.
	     */
	    if (centre_extv_pos == 8)
	      {
		/*
		 * The data is byte-aligned, so we'll ignore all the stuff
		 *   pointed to by r1, but it had better point somewhere safe
		 */
		;asm("	movl	r6,	r1");	/* Safe dummy value */
	      }
	    else
	      {
		/*
		 * centre_extv_pos < 8, so r1 must point to the font-table
		 *   entry for the first character, even though there was no
		 *   left section.  Something tells me this shouldn't be such
		 *   a special case.
		 */
		;asm("	movzbl	(r11)+,	r1");
		asm("	rotl	$4,	r1,	r1");
		asm("	addl2	r6,	r1");
	      }
	  }
    /*
     * Intermixing C and assembler in the code above can only be regarded as
     *   playing with fire.  In particular, we're blithely assuming that the
     *   compiler won't clobber r1.
     */

#undef r7_left_start
#undef r8_left_length
#undef r9_left_rotl_count

    /*
     * ---- Centre section ----
     *
     * One iteration per whole 32-bit column.  r1 is carried in from the
     *   previous column (or the left section); r2-r5 are loaded from the
     *   next four characters.
     */
    r9 = centre_extv_pos;
    r8 = nrows;
    while ( r10_start_addr < right_boundary )
      {
	;
	asm("	moval	(r10)+,	r7");		/* Address of new column     */
	asm("	movl	r8,		r0");	/* Number of rows to display */
	asm("	movzbl	(r11)+,	r2");
	asm("	rotl	$4,	r2,	r2");
	asm("	addl2	r6,		r2");
	asm("	movzbl	(r11)+,	r3");
	asm("	rotl	$4,	r3,	r3");
	asm("	addl2	r6,		r3");
	asm("	movzbl	(r11)+,	r4");
	asm("	rotl	$4,	r4,	r4");
	asm("	addl2	r6,		r4");
	asm("	movzbl	(r11)+,	r5");
	asm("	rotl	$4,	r5,	r5");
	asm("	addl2	r6,		r5");
	asm("row_loop:");
	asm("	subl2	$X_FbStride,	r7");
	asm("	movb	-(r1), X_char_scratch+0(fp)");
	asm("	movb	-(r2), X_char_scratch+1(fp)");
	asm("	movb	-(r3), X_char_scratch+2(fp)");
	asm("	movb	-(r4), X_char_scratch+3(fp)");
	asm("	movb	-(r5), X_char_scratch+4(fp)");
	asm("	extv	r9,  $32, X_char_scratch  (fp),	(r7)");
	asm("	sobgtr	r0,		row_loop");
	asm("	addl3	r5,	r8,	r1");	/* Byte 5 in this column is */
						/*   byte 1 in the next.    */
      }

    /*
     * ---- Right section ----
     *
     * r10 now addresses the column holding the right section (row after the
     *   last one), r8 still holds the row count and is used directly as the
     *   loop counter, and r7 is the number of bits to write.
     */
    if ( (r7 = (int)right_length) != 0)
	switch (right_nbytes)
	  {
	    case 1:	/* Just one character.  Much the same as the one-    */
			/* character case in the left section, but we can do */
			/* what we like with r11.			     */
			/* Note that r1 is already valid, since it represents*/
			/* overlap from the centre (or from the left if there*/
			/* was no centre; there must be a centre or a left). */
			/* The shift we apply is the same as for the centre  */
			/* section, and lives in r9; if we're using rotl     */
			/* instead of extv, we have to negate it.	     */
		asm("	mnegl	r9,	r9");
		asm("right_1_loop:");
		asm("	subl2	$X_FbStride,	r10");
		asm("	movb	-(r1),	r11");
		asm("	rotl	r9,	r11,	r11");
		asm("	insv	r11,  $0, r7, (r10)");
		asm("	sobgtr	r8,	right_1_loop");
		break;

	    case 2:
		asm("	mnegl	r9,	r9");
		asm("	movzbl	(r11),		r2");
		asm("	rotl	$4,	r2,	r2");
		asm("	addl2	r6,		r2");
		asm("right_12_loop:");
		asm("	subl2	$X_FbStride,	r10");
		asm("	movb	-(r2),	r11");
		asm("	rotl	$8,	r11,	r11");
		asm("	movb	-(r1),	r11");
		asm("	rotl	r9,	r11,	r11");
		asm("	insv	r11,  $0, r7, (r10)");
		asm("	sobgtr	r8,	right_12_loop");
		break;

	    case 3:	/* We use offsets from r11, not autoincrement; it   */
			/* seems to be faster.  In the left and centre      */
			/* sections we needed the incremented value (and it */
			/* is NOT faster if you have to do an addl2 $4, r10)*/
		asm("	mnegl	r9,	r9");
		asm("	movzbl	(r11),		r2");
		asm("	rotl	$4,	r2,	r2");
		asm("	addl2	r6,		r2");
		asm("	movzbl	1(r11),		r3");
		asm("	rotl	$4,	r3,	r3");
		asm("	addl2	r6,		r3");
		asm("right_123_loop:");
		asm("	subl2	$X_FbStride,	r10");
		asm("	movb	-(r3),	r11");
		asm("	rotl	$8,	r11,	r11");
		asm("	movb	-(r2),	r11");
		asm("	rotl	$8,	r11,	r11");
		asm("	movb	-(r1),	r11");
		asm("	rotl	r9,	r11,	r11");
		asm("	insv	r11,  $0, r7, (r10)");
		asm("	sobgtr	r8,	right_123_loop");
		break;

	    case 4:
		asm("	mnegl	r9,	r9");
		asm("	movzbl	(r11),		r2");
		asm("	rotl	$4,	r2,	r2");
		asm("	addl2	r6,		r2");
		asm("	movzbl	1(r11),		r3");
		asm("	rotl	$4,	r3,	r3");
		asm("	addl2	r6,		r3");
		asm("	movzbl	2(r11),		r4");
		asm("	rotl	$4,	r4,	r4");
		asm("	addl2	r6,		r4");
		asm("right_1234_loop:");
		asm("	subl2	$X_FbStride,	r10");
		asm("	movb	-(r4),	r11");
		asm("	rotl	$8,	r11,	r11");
		asm("	movb	-(r3),	r11");
		asm("	rotl	$8,	r11,	r11");
		asm("	movb	-(r2),	r11");
		asm("	rotl	$8,	r11,	r11");
		asm("	movb	-(r1),	r11");
		asm("	rotl	r9,	r11,	r11");
		asm("	insv	r11,  $0, r7, (r10)");
		asm("	sobgtr	r8,	right_1234_loop");
		break;

	    case 5:	/* Five bytes: go via the scratchpad and extv, so   */
			/* r9 is used as it stands (no mnegl).		    */
		asm("	movzbl	(r11),		r2");
		asm("	rotl	$4,	r2,	r2");
		asm("	addl2	r6,		r2");
		asm("	movzbl	1(r11),		r3");
		asm("	rotl	$4,	r3,	r3");
		asm("	addl2	r6,		r3");
		asm("	movzbl	2(r11),		r4");
		asm("	rotl	$4,	r4,	r4");
		asm("	addl2	r6,		r4");
		asm("	movzbl	3(r11),		r5");
		asm("	rotl	$4,	r5,	r5");
		asm("	addl2	r6,		r5");
		asm("right_12345_loop:");
		asm("	subl2	$X_FbStride,	r10");
		asm("	movb	-(r1), X_char_scratch+0(fp)");
		asm("	movb	-(r2), X_char_scratch+1(fp)");
		asm("	movb	-(r3), X_char_scratch+2(fp)");
		asm("	movb	-(r4), X_char_scratch+3(fp)");
		asm("	movb	-(r5), X_char_scratch+4(fp)");
		asm("	extv	r9,   r7, X_char_scratch  (fp), r11");
		asm("	insv	r11,  $0, r7, (r10)");
		asm("	sobgtr	r8,	right_12345_loop");
		break;

	    default:
		logic_error("switch() for right section: bogus right_nbytes",
			str, n, start, clip);
	  }
    return;
  }

/*
 * Nesting depth of logic_error().  It is incremented on entry and put back
 *   to its old value once a report has been printed, so a value above 1
 *   means logic_error() was entered again while a report was in progress.
 */
static int bug_count = 0;
/* printf format for the report header; %s receives the caller's message. */
static char *bug_msg = 
		"***** SimpleText() suffered an internal error -\n    %s\n";

/*
 * logic_error(message, str, n, start, clip)
 *
 *   Reports an internal inconsistency found by the text routine above.
 * (message) describes the problem; the remaining arguments are the
 * parameters the text routine was called with, and are printed to help
 * reproduce the failure.
 *
 *   On the outermost call the report is printed and we return, so the caller
 * can try to carry on.  If we are entered a second time before the first
 * report has finished, we call abort(); any deeper than that (or if abort()
 * comes back) we execute a halt instruction.
 */
static void logic_error(message, str, n, start, clip)
    char *message, *str;
    int		    n;
    Vector	   *start;
    Rectangle	   *clip;
  {
    int depth;		/* Nesting depth of this call (1 = outermost) */
    int left;		/* Characters of str still to be printed      */

    depth = ++bug_count;

    if (depth == 1)
      {
	printf(bug_msg, message);
	printf("      n = %d, start = (%d,%d), clip = (%d,%d,%d,%d)\n",
	       n, start->h, start->v, clip->start.h, clip->start.v,
	       clip->size.h, clip->size.v );
	printf("      str (at 0x%x): \"", str);
	/* str is not assumed to be NUL-terminated; print exactly n bytes */
	for (left = n; left > 0; left--)
	    putchar(*str++);
	printf("\"\n----- Will try to continue\n");
	bug_count--;		/* Report complete; allow another one */
	return;
      }

    if (depth == 2)
      {
	abort(bug_msg, message);
      }

    /*
     * Reached for depth > 2, or if abort() returned.  The null statement
     *   in front of the asm() follows the habit used elsewhere in this file.
     */
    ;asm("halt");	/* rather drastic */
    return;
  }
