
/* Quad precision square root calculation
 *
 * Copyright (c) 2008, Andrew Vaught
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *
 * * The name of Andrew Vaught may not be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "runtime.h"


/* unpacked_sqrt16()-- Square root of a normalized number, computed
 * in place.  We use Newton-Raphson iteration at increasing
 * precisions, each stage seeding the next:
 *
 *   1. a 32-bit integer root of the leading mantissa bits,
 *   2. a 64-bit integer root of the leading mantissa bits,
 *   3. full precision iteration built on divide_unpacked() and
 *      add_unpacked().
 *
 * The first stage operand is assembled from m[0] and m[1] so that it
 * lies in 0x80000000 to 0xFFFFFFFF (this assumes the leading word of
 * a normalized mantissa is in 0x00010000..0x0001FFFF, the same range
 * the convergence tests below rely on).  The starting guess is
 * 0xDDB4, the square root of 0xC0000000 rounded to an integer, i.e.
 * the root of the middle of that range.
 *
 * sq  The operand on entry, overwritten with its square root.  No
 *     checks are made here for zero, negative, infinite or NaN
 *     operands; sqrt_r16() filters those out before calling. */

void unpacked_sqrt16(unpacked16 *sq) {
unsigned long long y0, y1, sq2;
unpacked16 z0, z1, q, t;
unsigned x0, x1, sq1;
int i, z0_larger;

    /* Stage 1: integer Newton iteration on the top 32 mantissa bits.
     * The loop stops once two successive guesses agree to within
     * one. */

    sq1 = (sq->m[0] << 15) | (sq->m[1] >> 17);

    /* NOTE(review): the correction term divides m[0] itself, which
     * presumably never exceeds 0x1FFFF, so it contributes at most 1.
     * The iteration below does the real work. */
    x1 = 0xDDB4 + sq->m[0] / (2*0xDDB4);

    do {
	x0 = x1;
	x1 = (x1 + sq1 / x1) >> 1;
    } while(x1 != x0+1 && x1 != x0 && x1 != x0-1);

    /* Stage 2: x1 now provides a guess to y1, the square root of the
     * top 64 mantissa bits. */

    sq2 = sq->m[0];
    sq2 = (sq2 << 32) |  sq->m[1];
    sq2 = (sq2 << 15) | (sq->m[2] >> 17);

    /* Widen before shifting so that a 17-bit x1 could not wrap to
     * zero and end up as a divisor. */
    y1 = (unsigned long long) x1 << 16;

    do {
	y0 = y1;
	y1 = (y1 + sq2 / y1) >> 1;
    } while(y1 != y0+1 && y1 != y0 && y1 != y0-1);

    /* y1 is now a guess to z1, a guess at the original square root of
     * the whole thing. */

    if (sq->exp & 1)
	/* For odd exponents, we have to multiply the guess by the
	 * square root of two.  y1 has at most 32 significant bits.
	 * Now, 2^29 * \sqrt(2) = 759250124.99401248, which is awfully
	 * close to 759250125, so \sqrt(2) \approx 759250125 / 2^29. */

	y1 = (759250125 * y1) >> 29;

    /* Halve the unbiased exponent and start with a positive result. */

    z1.exp  = ((sq->exp - EXP16_BIAS + 1) >> 1) + EXP16_BIAS - 1;
    z1.sign = 0;

    z1.m[0] = y1 >> 32;
    z1.m[1] = y1;
    z1.m[2] = 0;
    z1.m[3] = 0;

    /* Special normalization, the top word might be zero, or not.
     * Either way the leading one bit has to end up in bit 16 of
     * m[0]. */

    if (z1.m[0] != 0) {
	i = top_bit(z1.m[0]);
	z1.exp -= i - 1;

	i = 16 - i;   /* Left shift, pulling bits up from m[1] */
	z1.m[0] = (z1.m[0] << i) | (z1.m[1] >> (32 - i));
	z1.m[1] = (z1.m[1] << i);

    } else {
	i = top_bit(z1.m[1]);
	z1.exp += 31 - i;

	i = i - 16;   /* Right shifts */

	if (i > 0) {
	    /* m[1] moves up a whole word and then right by i; the
	     * bits that fall off m[0] become the top of m[1]. */
	    z1.m[0] = z1.m[1] >> i;
	    z1.m[1] = z1.m[1] << (32 - i);

	} else if (i < 0) {
	    /* m[1] moves up a whole word and then left by -i.
	     * Nothing falls off m[0], so the low word is empty.  The
	     * source word is m[1]; the shift count must not be used
	     * as an index into m[]. */
	    i = -i;

	    z1.m[0] = z1.m[1] << i;
	    z1.m[1] = 0;

	} else {
	    z1.m[0] = z1.m[1];
	    z1.m[1] = 0;
	}
    }

    /* At this point we've got a guess of the square root to about 30
     * bits.  The quadratic convergence of Newton-Raphson doubles the
     * number of digits on each iteration, so we expect that two
     * iterations should suffice for 120 bits, with a third iteration
     * to convince us that we've converged. */

    for(;;) {
	z0 = z1;

	/* Newton step: new guess = (old guess + quotient) / 2, where
	 * the quotient is presumably sq / z1.  The halving is done
	 * by decrementing the exponent of the sum. */

	divide_unpacked(&z1, sq, &q);
	add_unpacked(&q, &z1, &t);

	z1 = t;
	z1.exp--;

	/* Compare z1 and z0 for equality, or one bit off.  Quickly
	 * disposed of special cases first: the two guesses straddle
	 * a power of two, one being all ones and the other the next
	 * value up with a larger exponent. */

	if (z1.exp+1 == z0.exp &&
	    z1.m[0] == 0x00010000 && z1.m[1] == 0 &&
	    z1.m[2] == 0          && z1.m[3] == 0 &&
	    z0.m[0] == 0x0001FFFF && z0.m[1] == 0xFFFFFFFF &&
	    z0.m[2] == 0xFFFFFFFF && z0.m[3] == 0xFFFFFFFF)
	    break;

	if (z1.exp-1 == z0.exp &&
	    z0.m[0] == 0x00010000 && z0.m[1] == 0 &&
	    z0.m[2] == 0          && z0.m[3] == 0 &&
	    z1.m[0] == 0x0001FFFF && z1.m[1] == 0xFFFFFFFF &&
	    z1.m[2] == 0xFFFFFFFF && z1.m[3] == 0xFFFFFFFF)
	    break;

	if (z1.exp != z0.exp)
	    continue;

	if (z1.m[0] == z0.m[0] && z1.m[1] == z0.m[1] &&
	    z1.m[2] == z0.m[2] && z1.m[3] == z0.m[3])
	    break;   /* Equal */

	/* Off-by-one test.  Figure out which is larger, then
	 * increment or decrement z0 before making a final test for
	 * equality.  The mantissas are known to differ here, so the
	 * most significant differing word decides the ordering. */

	z0_larger = 0;

	for(i=0; i<4; i++)
	    if (z0.m[i] != z1.m[i]) {
		z0_larger = z0.m[i] > z1.m[i];
		break;
	    }

	if (z0_larger) {  /* z0 > z1, step z0 down with borrow */

	    if (z0.m[3]-- == 0 && z0.m[2]-- == 0 && z0.m[1]-- == 0)
		z0.m[0]--;

	} else {  /* z0 < z1, step z0 up with carry */

	    if (++z0.m[3] == 0 && ++z0.m[2] == 0 && ++z0.m[1] == 0)
		z0.m[0]++;
	}

	if (z1.m[0] == z0.m[0] && z1.m[1] == z0.m[1] &&
	    z1.m[2] == z0.m[2] && z1.m[3] == z0.m[3])
	    break;   /* Off by one */
    }

    *sq = z1;
}



#define sqrt_r16 prefix(sqrt_r16)

/* sqrt_r16()-- Square root of a packed quad precision real.
 *
 * q  Pointer to the packed operand; it is only read.
 *
 * Returns the packed result.  The operand is unpacked, classified and
 * then repacked:
 *
 *   - exponent EXP16_NAN: plus infinity is passed through, anything
 *     else (negative sign or nonzero mantissa) becomes a NaN,
 *   - exponent zero (zero or denorm): passed through unchanged,
 *   - negative nonzero: becomes a NaN,
 *   - everything else: handed to unpacked_sqrt16(). */

packed16 sqrt_r16(packed16 *q) {
unpacked16 value;
packed16 result;
int mantissa_set;

    unpack_real_16(q, value.m, &value.exp, &value.sign);

    if (value.exp == EXP16_NAN) {
	/* Only plus infinity survives: positive sign, all mantissa
	 * words clear. */
	mantissa_set =
	    (value.m[0] | value.m[1] | value.m[2] | value.m[3]) != 0;

	if (value.sign || mantissa_set)
	    set_nan16(&value);

    } else if (value.exp != 0) {
	/* Ordinary nonzero number.  A zero exponent (zero or denorm)
	 * skips this branch and is repacked untouched. */
	if (value.sign)
	    set_nan16(&value);   /* Negative non-zero */
	else
	    unpacked_sqrt16(&value);
    }

    pack_real_16(&result, value.m, &value.exp, &value.sign);
    return result;
}


