;; mulsi3
;; 32x32 multiply
;; 
;; I think that shift and add may be sufficient for this. Using the
;; supplied 8x8->16 would need 10 ops of 14 cycles each + overhead.  This way
;; the inner loop uses maybe 20 cycles + overhead, but terminates
;; quickly on small args.
;;
;; Steve Chamberlain
;;
;; A0/A1 src_a
;; A2/A3 src_b
;; 
;;
;;  while (a) 
;;  {
;;    if(a & 1)
;;     r += b;
;;    a>>=1;
;;
;;    b<<=1;
;;  }
;;    

#include "defines.h"

#ifdef __H8300__

	.global	___mulsi3
;; ___mulsi3 (H8/300 variant): 32x32 -> 32 bit multiply by shift-and-add.
;;
;; In:   A0/A1 = a  (A0 is the high word, A1 the low word)
;;       A2/A3 = b  (A2 is the high word, A3 the low word)
;; Out:  A0/A1 = low 32 bits of a * b
;; Uses: S0/S1 hold the running sum r; they are saved with PUSHP on entry
;;       and restored with POPP on exit (both macros come from defines.h).
;;       A2/A3 are overwritten with the shifted copy of b.
;; The loop runs once per bit of a up to its highest set bit, so small
;; values of a finish quickly.
___mulsi3:
	PUSHP	S0P
	PUSHP	S1P

	; r = 0
	sub.w	S0,S0
	sub.w	S1,S1

	; while (a)  -- a is two words, so test the high word, then the low
_top:	mov.w	A0,A0
	bne	_more
	mov.w	A1,A1
	beq	_done
_more:
	; a >>= 1
	; The shift ripples through all four bytes via carry; the bit that
	; falls out of A1L (the old bit 0 of a) is left in carry, so no
	; separate bit test is needed.
	shlr	A0H
	rotxr	A0L
	rotxr	A1H
	rotxr	A1L

	; if the old low bit of a was clear, skip the add
	bcc	_nobit
	; r += b  (low word first, then propagate carry through the high bytes)
	add.w	A3,S1
	addx	A2L,S0L
	addx	A2H,S0H
_nobit:
	; b <<= 1  (done as b += b, with carry into the high bytes)
	add.w	A3,A3
	addx	A2L,A2L
	addx	A2H,A2H
	bra 	_top

_done:
	; return r in A0/A1
	mov.w	S0,A0
	mov.w	S1,A1
	POPP	S1P
	POPP	S0P
	rts

#else /* __H8300H__ */

	.h8300h

	.global	___mulsi3
;; ___mulsi3 (H8/300H variant): 32x32 -> 32 bit multiply by shift-and-add.
;;
;; In:   A0P = a
;;       A1P = b
;; Out:  A0P = low 32 bits of a * b
;; Uses: A2P holds the running sum r and is overwritten;
;;       A1P is overwritten with the shifted copy of b.
;; The loop runs once per bit of a up to its highest set bit, so small
;; values of a finish quickly.
___mulsi3:
	; r = 0
	sub.l	A2P,A2P

	; while (a)
_top:	mov.l	A0P,A0P
	beq	_done

	; a >>= 1
	; The bit shifted out (the old bit 0 of a) is left in carry, so the
	; shift doubles as the "if (a & 1)" test.
	shlr.l	A0P
	bcc	_nobit

	; r += b
	add.l	A1P,A2P

_nobit:
	; b <<= 1
	shll.l	A1P
	bra	_top

_done:
	; return r in A0P
	mov.l	A2P,A0P
	rts

#endif
