/* dither.S */

/*
 * Entry points in this file: _dither and _dither2.  Both read the same
 * ten stack arguments, in this order:
 *
 *   dither(d,l1,l2,cb,cr,mod1,mod2,mod3,width,height)
 *
 * d is the pointer that gets written; l1, l2, cb and cr are pointers
 * that are only read (presumably two luminance rows and the two chroma
 * planes, going by the names -- confirm against the caller).
 * mod1, mod2 and mod3 are added to the pointers after every row.
 */

	.text
	.align	2

/*
 * 4-byte inner-loop counter.  Every general register is busy inside the
 * loops, so the counter lives in memory.  It is static storage shared
 * by both routines, so overlapping calls would disturb each other.
 */
	.lcomm	loop_count,4

/*
 * Stack offsets of the arguments, valid once the prologue has pushed
 * its four registers: 4 saved registers (16 bytes) plus the return
 * address (4 bytes) put the first argument at 20(%esp).
 */
#define PAR_D		20(%esp)
#define PAR_L1		24(%esp)
#define PAR_L2		28(%esp)
#define PAR_CB		32(%esp)
#define PAR_CR		36(%esp)
#define PAR_MOD1	40(%esp)
#define PAR_MOD2	44(%esp)
#define PAR_MOD3	48(%esp)
#define PAR_WIDTH	52(%esp)
#define PAR_HEIGHT	56(%esp)

/*
 * Registers holding the running pointers.  %ebp is used as an ordinary
 * pointer register here, so there is no frame pointer; %eax and %ebx
 * are left free for assembling the output values.
 */
#define V_D		%edi
#define V_L1		%ecx
#define V_L2		%edx
#define V_CB		%esi
#define V_CR		%ebp

	.align	2

/*
 * _dither(d, l1, l2, cb, cr, mod1, mod2, mod3, width, height)
 *
 * Arguments are taken from the stack through the PAR_* offsets.
 *
 * The inner loop runs width/2 times per row and the outer loop runs
 * height times.  One inner iteration reads two bytes from cb, two bytes
 * from cr, the bytes at offsets 1 and 3 of l1 (l1 is advanced by one
 * before the loops start) and the bytes at offsets 0 and 2 of l2, and
 * writes two 32-bit words:
 *
 *   d[0..3]     = cb[0]|0x80, l1[1],      cb[1]|0x80, l1[3]
 *   d[320..323] = l2[0],      cr[0]|0x40, l2[2],      cr[1]|0x40
 *
 * Afterwards cb and cr advance by 2 and l1, l2 and d advance by 4.
 * At the end of a row mod1 is added to cb and cr, mod2 to l1 and l2,
 * and mod3 to d.
 *
 * If width/2 or height is zero nothing is written.  Without that check
 * the decrement-then-test loops would wrap their counters around and
 * run roughly 2^32 times.
 *
 * Preserves %ebx, %esi, %edi, %ebp.  Clobbers %eax, %ecx, %edx, flags,
 * the static loop_count, and the width/height argument slots on the
 * stack.
 */
	.globl	_dither
_dither:
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	PAR_D,V_D
	movl	PAR_L1,V_L1
	movl	PAR_L2,V_L2
	movl	PAR_CB,V_CB
	movl	PAR_CR,V_CR
	shrl	PAR_WIDTH		/* inner loop count = width / 2 */
	jz	dither_done		/* width < 2: no iteration to run */
	cmpl	$0,PAR_HEIGHT
	je	dither_done		/* no rows */
	incl	V_L1			/* l1 is sampled at odd offsets */
loop_1:
	movl	PAR_WIDTH,%eax
	movl	%eax,loop_count
	.align	2
loop_2:
	/*
	 * Build the upper 16 bits of both output words first.  The two
	 * variants give the same result; the first one relies on BSWAP
	 * (presumably the reason for the CPU_NUMBER check), which leaves
	 * junk in the low 16 bits that is overwritten just below.
	 */
#if CPU_NUMBER > 3
	movb	1(V_CB),%ah
	movb	1(V_CR),%bl
	movb	2(V_L1),%al
	movb	2(V_L2),%bh
	bswap	%eax
	bswap	%ebx
#else
	movb	1(V_CB),%al
	movb	1(V_CR),%bh
	movb	2(V_L1),%ah
	movb	2(V_L2),%bl
	shll	$16,%eax
	shll	$16,%ebx
#endif

	/* Now the lower 16 bits. */
	movb	(V_CB),%al
	movb	(V_CR),%bh
	movb	(V_L1),%ah
	movb	(V_L2),%bl
	orl	$0x00800080,%eax	/* set bit 7 in both cb bytes */
	orl	$0x40004000,%ebx	/* set bit 6 in both cr bytes */
	movl	%eax,(V_D)
	movl	%ebx,320(V_D)		/* second word goes 320 bytes further on */

	addl	$2,V_CB
	addl	$2,V_CR
	addl	$4,V_L1
	addl	$4,V_L2
	addl	$4,V_D
	decl	loop_count
	jnz	loop_2

	/* End of row: apply the per-row pointer adjustments. */
	addl	PAR_MOD1,V_CB
	addl	PAR_MOD2,V_L1
	addl	PAR_MOD3,V_D
	addl	PAR_MOD1,V_CR
	addl	PAR_MOD2,V_L2
	decl	PAR_HEIGHT
	jnz	loop_1
dither_done:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	.align	2

/*
 * _dither2(d, l1, l2, cb, cr, mod1, mod2, mod3, width, height)
 *
 * Arguments are taken from the stack through the PAR_* offsets.
 *
 * The inner loop runs width times per row and the outer loop runs
 * height times.  One inner iteration reads one byte from cb, one byte
 * from cr, two bytes from l1 and two bytes from l2, and writes eight
 * 16-bit values (this describes the enabled #else variant below):
 *
 *   d[0..3]     = cb|0x80, l1[0],   cb|0x80, l1[1]
 *   d[320..323] = l1[0],   cr|0x40, l1[1],   cr|0x40
 *   d[640..643] = cb|0x80, l2[0],   cb|0x80, l2[1]
 *   d[960..963] = l2[0],   cr|0x40, l2[1],   cr|0x40
 *
 * Afterwards cb and cr advance by 1, l1 and l2 by 2, and d by 4.
 * At the end of a row mod1 is added to cb and cr, mod2 to l1 and l2,
 * and mod3 to d.
 *
 * If width or height is zero nothing is written.  Without that check
 * the decrement-then-test loops would wrap their counters around and
 * run roughly 2^32 times.
 *
 * Preserves %ebx, %esi, %edi, %ebp.  Clobbers %eax, %ecx, %edx, flags,
 * the static loop_count, and the height argument slot on the stack.
 */
	.globl	_dither2
_dither2:
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	PAR_D,V_D
	movl	PAR_L1,V_L1
	movl	PAR_L2,V_L2
	movl	PAR_CB,V_CB
	movl	PAR_CR,V_CR
	cmpl	$0,PAR_WIDTH
	je	dither2_done		/* empty rows */
	cmpl	$0,PAR_HEIGHT
	je	dither2_done		/* no rows */
dloop_1:
	movl	PAR_WIDTH,%eax
	movl	%eax,loop_count
	.align	2
dloop_2:
#if 0
/* writes are more efficient, but this is somehow slower on my computer */
#if CPU_NUMBER > 3
	movb	(V_CR),%bl
	movb	(V_CB),%bh
	orl	$0x8040,%ebx
	movb	%bh,%ah
	bswap	%eax
	movb	%bh,%al
	movb	%bl,%ah
	bswap	%ebx
#else
	movb	(V_CB),%bl
	movb	(V_CR),%bh
	orl	$0x4080,%ebx
	movb	%bl,%ah
	roll	$8,%eax
	movb	%bl,%al
	movb	%bh,%ah
	rorl	$8,%ebx
#endif
	movb	%ah,%bh

	movb	1(V_L1),%ah
	movb	%ah,%bl
	roll	$16,%eax
	roll	$16,%ebx
	movb	(V_L1),%ah
	movb	%ah,%bl

	movl	%eax,(V_D)
	movl	%ebx,320(V_D)

	movb	1(V_L2),%ah
	movb	%ah,%bl
	roll	$16,%eax
	roll	$16,%ebx
	movb	(V_L2),%ah
	movb	%ah,%bl

	movl	%eax,640(V_D)
	movl	%ebx,960(V_D)

	incl	V_CB
	incl	V_CR
	addl	$2,V_L1
	addl	$2,V_L2
	addl	$4,V_D
#else
	/*
	 * %al and %bh keep the two chroma bytes for the whole iteration;
	 * %ah and %bl are reloaded with each luminance byte, so that
	 * %ax and %bx are the 16-bit values to store.
	 */
	movb	(V_CB),%al
	movb	(V_CR),%bh
	orb	$128,%al		/* set bit 7 in the cb byte */
	orb	$64,%bh			/* set bit 6 in the cr byte */

	/* first l1 byte -> offsets 0 and 320 */
	movb	(V_L1),%ah
	incl	V_L1
	movb	%ah,%bl
	movw	%ax,(V_D)
	movw	%bx,320(V_D)

	/* second l1 byte -> offsets 2 and 322 */
	movb	(V_L1),%ah
	incl	V_CB
	movb	%ah,%bl
	movw	%ax,2(V_D)
	movw	%bx,322(V_D)

	/* first l2 byte -> offsets 640 and 960 */
	movb	(V_L2),%ah
	incl	V_L2
	movb	%ah,%bl
	movw	%ax,640(V_D)
	movw	%bx,960(V_D)

	/* second l2 byte -> offsets 642 and 962 */
	movb	(V_L2),%ah
	incl	V_CR
	movb	%ah,%bl
	movw	%ax,642(V_D)
	movw	%bx,962(V_D)

	incl	V_L1
	incl	V_L2
	addl	$4,V_D
#endif

	decl	loop_count
	jnz	dloop_2

	/* End of row: apply the per-row pointer adjustments. */
	addl	PAR_MOD1,V_CB
	addl	PAR_MOD2,V_L1
	addl	PAR_MOD3,V_D
	addl	PAR_MOD1,V_CR
	addl	PAR_MOD2,V_L2
	decl	PAR_HEIGHT
	jnz	dloop_1
dither2_done:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
