/*
 * $Id: math-sll.c,v 1.15 2002/08/20 18:01:54 andrewm Exp $
 *
 * Purpose
 *	A fixed point (31.32 bit) math library.
 *
 * Description
 *	Floating point packs the most accuracy in the available bits, but it
 *	often provides more accuracy than is required.  It is time consuming to
 *	carry the extra precision around, particularly on platforms that don't
 *	have a dedicated floating point processor.
 *
 *	This library is a compromise.  All math is done using the 64 bit signed
 *	"long long" format (sll), and is not intended to be portable, just as
 *	fast as possible.  Since "long long" is a elementary type, it can be
 *	passed around without resorting to the use of pointers.  Since the
 *	format used is fixed point, there is never a need to do time consuming
 *	checks and adjustments to maintain normalized numbers, as is the case
 *	in floating point.
 *
 *	Simply put, this library is limited to handling numbers with a whole
 *	part of up to 2^31 - 1 = 2.147483647e9 in magnitude, and fractional
 *	parts down to 2^-32 = 2.3283064365e-10 in magnitude.  This yields a
 *	decent range and accuracy for many applications.
 *
 * IMPORTANT
 *	No checking for arguments out of range (error).
 *	No checking for divide by zero (error).
 *	No checking for overflow (error).
 *	No checking for underflow (warning).
 *	Chops, doesn't round.
 *
 * Functions
 *	sll dbl2sll(double x)			double -> sll
 *	double slldbl(sll x)			sll -> double
 *
 *	sll slladd(sll x, sll y)		x + y
 *	sll sllsub(sll x, sll y)		x - y
 *	sll sllmul(sll x, sll y)		x * y
 *	sll slldiv(sll x, sll y)		x / y
 *
 *	sll sllinv(sll v)			1 / x
 *	sll sllmul2(sll x)			x * 2
 *	sll sllmul4(sll x)			x * 4
 *	sll sllmul2n(sll x, int n)		x * 2^n, 0 <= n <= 31
 *	sll slldiv2(sll x)			x / 2
 *	sll slldiv4(sll x)			x / 4
 *	sll slldiv2n(sll x, int n)		x / 2^n, 0 <= n <= 31
 *
 *	sll sllcos(sll x)			cos x
 *	sll sllsin(sll x)			sin x
 *	sll slltan(sll x)			tan x
 *	sll sllatan(sll x)			atan x
 *
 *	sll sllexp(sll x)			e^x
 *	sll slllog(sll x)			ln x
 *
 *	sll sllpow(sll x, sll y)		x^y
 *	sll sllsqrt(sll x)			x^(1 / 2)
 *
 * History
 *	* Aug 20 2002 Nicolas Pitre <nico@cam.org> v1.15
 *	- Replaced all shifting assembly with C equivalents
 *	- Reformated ARM asm and changed comments to begin with @
 *	- Updated C version of sllmul()
 *	- Removed the unsupported architecture #error - should be portable now
 *
 *	* Aug 17 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.14
 *	- Fixed sign handling of ARM sllmul()
 *	- Ported sllmul() to x86 - it can be inlined now
 *	- Updated the sllmul() comments to reflect my changes
 *	- Updated the header comments
 *
 *	* Aug 17 2002 Nicolas Pitre <nico@cam.org> v1.13
 *	- Corrected and expanded upon Andrew's sllmul() comments
 *	- Added in an non-optimal but portable C version of sllmul()
 *
 *	* Aug 16 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.12
 *	- Added in corrected optimized sllmul() for ARM by Nicolas Pitre
 *	- Changed comments on multiplication to describe Nicolas's method
 *
 *	* Jun 17 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.11
 *	- Reverted optimized sllmul() for ARM because of bug
 *
 *	* Jun 17 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.10
 *	- Added in optimized sllmul() for ARM by Nicolas Pitre
 *	- Changed comments on multiplication to describe Nicolas's method
 *	- Optimized multiplications and divisions by powers of 2
 *
 *	* Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.9
 *	- Optimized multiplcations and divisions by powers of 2
 *
 *	* Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.8
 *	- Consolidated constants
 *	- Added macro for _slladd() _sllsub()
 *	- Removed __inline__ from slladd() sllsub()
 *	- Renamed umul() to ullmul() and made global
 *	- Added function prototypes
 *	- Corrected header comment about fractional range
 *	- Added warning for non-Linux operating systems
 *
 *	* Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.7
 *	- Corrected some i386 assembly comments
 *	- Renamed calc_*() to _sll*()
 *	- Moved _sllexp() closer to sllexp()
 *
 *	* Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.6
 *	- Added sllmul2() sllmul4() sllmul2n() for i386
 *	- Added slldiv2() slldiv4() slldiv2n() for i386
 *	- Removed input constraints on sllmul2() sllmul4() sllmul2n() for ARM
 *	- Removed input constraints on slldiv2() slldiv4() slldiv2n() for ARM
 *	- Modified ARM assembly for WYSIWYG output
 *	- Changed asm to __asm__
 *
 *	* Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.5
 *	- Fixed umul() for i386
 *	- Fixed dbl2sll() and sll2dbl() - I forgot ARM doubles are big-endian
 *	- Very lightly tested on ARM and i386 and it seems okay
 *
 *	* Feb  4 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.4
 *	- Added umul() for i386
 *
 *	* Jan 20 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.3
 *	- Added fast multiplication functions sllmul2(), sllmul4(), sllmul2n()
 *	- Added fast division functions slldiv2() slldiv(), slldiv4n()
 *	- Added square root function sllsqrt()
 *	- Added library roll-call
 *	- Reformatted the history to RPM format (ick)
 *	- Moved sllexp() closer to related functions
 *	- Added algorithm description to sllpow()
 *
 *	* Jan 19 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.1
 *	- Corrected constants, thanks to Mark A. Lisher for noticing
 *	- Put source under CVS control
 *
 *	* Jan 18 2002 Andrew E. Mileski <andrewm@isoar.ca>
 *	- Added some more explanation to calc_cos() and calc_sin()
 *	- Added sllatan() and documented it fairly verbosely
 *
 *	* July 13 2000 Andrew E. Mileski <andrewm@isoar.ca>
 *	- Corrected documentation for multiplication algorithm
 *
 *	* May 21 2000 Andrew E. Mileski <andrewm@isoar.ca>
 *	- Rewrote slltanx() to avoid scaling argument for both sine and cosine
 *
 *	* May 19 2000  Andrew E. Mileski <andrewm@isoar.ca>
 *	- Unrolled loops
 *	- Added sllinv(), and sllneg()
 *	- Changed constants to type "LL" (was "UL" - oops)
 *	- Changed all routines to use inverse constants instead of division
 *
 *	* May 15, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *	- Fixed slltan() - used sin/cos instead of sllsin/sllcos.  Doh!
 *	- Added slllog(x) and sllpow(x,y)
 *
 *	* May 11, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *	- Added simple tan(x) that could stand some optimization
 *	- Added more constants (see <math.h>)
 *
 *	* May 3, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *	- Added sllsin(), sllcos(), and trig constants
 *
 *	* May 2, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *	- All routines and macros now have sll their identifiers
 *	- Changed mul() to umul() and added sllmul() to handle signed numbers
 *	- Added and tested sllexp(), sllint(), and sllfrac()
 *	- Added some constants
 *
 *	* Apr 26, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *	- Added mul(), and began testing it (unsigned only)
 *
 *	* Apr 25, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *	- Added sll2dbl() [convert a signed long long to a double]
 *	- Began testing.  Well gee whiz it works! :)
 *
 *	* Apr 24, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *	- Added dbl2sll() [convert a double to signed long long]
 *	- Began documenting
 *
 *	* Apr ??, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
 *	- Conceived, written, and fiddled with
 *
 *
 *		Copyright (C) 2000 Andrew E. Mileski
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License version 2 as published by the Free Software Foundation.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
 */
#if !defined(__GNUC__)
#  error Requires support for type long long (64 bits)
#endif
#if !defined(linux)
#  warn Not tested on this operating system!
#endif

/* Data types */
typedef signed long long sll;
typedef unsigned long long ull;

/* Function prototypes */
sll slladd(sll x, sll y);
sll sllsub(sll x, sll y);
sll sllmul(sll left, sll right);
sll slldiv(sll left, sll right);
sll sllmul2(sll x);
sll sllmul4(sll x);
sll sllmul2n(sll x, int n);
sll slldiv2(sll x);
sll slldiv4(sll x);
sll slldiv2n(sll x, int n);
sll sllsin(sll x);
sll sllcos(sll x);
sll slltan(sll x);
sll sllatan(sll x);
sll sllexp(sll x);
sll slllog(sll x);
sll sllpow(sll x, sll y);
sll sllinv(sll v);
sll sllsqrt(sll x);
sll dbl2sll(double dbl);
double sll2dbl(sll s);
sll _sllsin(sll x);
sll _sllcos(sll x);
sll _slltan(sll x);
sll _sllatan(sll x);
sll _sllexp(sll x);

/* Macros */
#define int2sll(X)	(((sll) (X)) << 32)
#define sll2int(X)	((int) ((X) >> 32))
#define sllint(X)	((X) & 0xffffffff00000000LL)
#define sllfrac(X)	((X) & 0x00000000ffffffffLL)
#define sllneg(X)	(-(X))
#define _slladd(X,Y)	((X) + (Y))
#define _sllsub(X,Y)	((X) - (Y))

/* Constants (converted from double) */
#define CONST_0		0x0000000000000000LL
#define CONST_1		0x0000000100000000LL
#define CONST_2		0x0000000200000000LL
#define CONST_3		0x0000000300000000LL
#define CONST_4		0x0000000400000000LL
#define CONST_10	0x0000000a00000000LL
#define CONST_1_2	0x0000000080000000LL
#define CONST_1_3	0x0000000055555555LL
#define CONST_1_4	0x0000000040000000LL
#define CONST_1_5	0x0000000033333333LL
#define CONST_1_6	0x000000002aaaaaaaLL
#define CONST_1_7	0x0000000024924924LL
#define CONST_1_8	0x0000000020000000LL
#define CONST_1_9	0x000000001c71c71cLL
#define CONST_1_10	0x0000000019999999LL
#define CONST_1_11	0x000000001745d174LL
#define CONST_1_12	0x0000000015555555LL
#define CONST_1_20	0x000000000cccccccLL
#define CONST_1_30	0x0000000008888888LL
#define CONST_1_42	0x0000000006186186LL
#define CONST_1_56	0x0000000004924924LL
#define CONST_1_72	0x00000000038e38e3LL
#define CONST_1_90	0x0000000002d82d82LL
#define CONST_1_110	0x000000000253c825LL
#define CONST_1_132	0x0000000001f07c1fLL
#define CONST_1_156	0x0000000001a41a41LL
#define CONST_E		0x00000002b7e15162LL
#define CONST_1_E	0x000000005e2d58d8LL
#define CONST_SQRTE	0x00000001a61298e1LL
#define CONST_1_SQRTE	0x000000009b4597e3LL
#define CONST_LOG2_E	0x0000000171547652LL
#define CONST_LOG10_E	0x000000006f2dec54LL
#define CONST_LN2	0x00000000b17217f7LL
#define CONST_LN10	0x000000024d763776LL
#define CONST_PI	0x00000003243f6a88LL
#define CONST_PI_2	0x00000001921fb544LL
#define CONST_PI_4	0x00000000c90fdaa2LL
#define CONST_1_PI	0x00000000517cc1b7LL
#define CONST_2_PI	0x00000000a2f9836eLL
#define CONST_2_SQRTPI	0x0000000120dd7504LL
#define CONST_SQRT2	0x000000016a09e667LL
#define CONST_1_SQRT2	0x00000000b504f333LL

sll slladd(sll x, sll y)
{
	return (x + y);
}

sll sllsub(sll x, sll y)
{
	return (x - y);
}

/*
 * Let a = A * 2^32 + a_hi * 2^0 + a_lo * 2^(-32)
 * Let b = B * 2^32 + b_hi * 2^0 + b_lo * 2^(-32)
 *
 * Where:
 *   *_hi is the integer part
 *   *_lo the fractional part
 *   A and B are the sign (0 for positive, -1 for negative).
 *
 * a * b = (A * 2^32 + a_hi * 2^0 + a_lo * 2^-32)
 *       * (B * 2^32 + b_hi * 2^0 + b_lo * 2^-32)
 *
 * Expanding the terms, we get:
 *
 *	 = A * B * 2^64 + A * b_h * 2^32 + A * b_l * 2^0
 *	 + a_h * B * 2^32 + a_h * b_h * 2^0 + a_h * b_l * 2^-32
 *	 + a_l * B * 2^0 + a_l * b_h * 2^-32 + a_l * b_l * 2^-64
 *
 * Grouping by powers of 2, we get:
 *
 *	 = A * B * 2^64
 *	 Meaningless overflow from sign extension - ignore
 *
 *	 + (A * b_h + a_h * B) * 2^32
 *	 Overflow which we can't handle - ignore
 *
 *	 + (A * b_l + a_h * b_h + a_l * B) * 2^0
 *	 We only need the low 32 bits of this term, as the rest is overflow
 *
 *	 + (a_h * b_l + a_l * b_h) * 2^-32
 *	 We need all 64 bits of this term
 *
 *	 +  a_l * b_l * 2^-64
 *	 We only need the high 32 bits of this term, as the rest is underflow
 *
 * Note that:
 *   a > 0 && b > 0: A =  0, B =  0 and the third term is a_h * b_h
 *   a < 0 && b > 0: A = -1, B =  0 and the third term is a_h * b_h - b_l
 *   a > 0 && b < 0: A =  0, B = -1 and the third term is a_h * b_h - a_l
 *   a < 0 && b < 0: A = -1, B = -1 and the third term is a_h * b_h - a_l - b_l
 */
#if defined(__arm__)
__inline__ sll sllmul(sll left, sll right)
{
	/*
	 * From gcc/config/arm/arm.h:
	 *   In a pair of registers containing a DI or DF value the 'Q'
	 *   operand returns the register number of the register containing
	 *   the least significant part of the value.  The 'R' operand returns
	 *   the register number of the register containing the most
	 *   significant part of the value.
	 */
	sll retval;

	__asm__ (
		"@ sllmul\n\t"
		"umull	%R0, %Q0, %Q1, %Q2\n\t"
		"mul	%R0, %R1, %R2\n\t"
		"umlal	%Q0, %R0, %Q1, %R2\n\t"
		"umlal	%Q0, %R0, %Q2, %R1\n\t"
	        "tst	%R1, #0x80000000\n\t"
	        "subne	%R0, %R0, %Q2\n\t"
	        "tst	%R2, #0x80000000\n\t"
	        "subne	%R0, %R0, %Q1\n\t"
		: "=&r" (retval)
		: "%r" (left), "r" (right)
		: "cc"
	);

	return retval;
}
#elif defined(__i386__)
__inline__ sll sllmul(sll left, sll right)
{
	register sll retval;
	__asm__(
		"# sllmul\n\t"
		"	movl	%1, %%eax\n\t"
		"	mull 	%3\n\t"
		"	movl	%%edx, %%ebx\n\t"
		"\n\t"
		"	movl	%2, %%eax\n\t"
		"	mull 	%4\n\t"
		"	movl	%%eax, %%ecx\n\t"
		"\n\t"
		"	movl	%1, %%eax\n\t"
		"	mull	%4\n\t"
		"	addl	%%eax, %%ebx\n\t"
		"	adcl	%%edx, %%ecx\n\t"
		"\n\t"
		"	movl	%2, %%eax\n\t"
		"	mull	%3\n\t"
		"	addl	%%ebx, %%eax\n\t"
		"	adcl	%%ecx, %%edx\n\t"
		"\n\t"
		"	btl	$31, %2\n\t"
		"	jnc	1f\n\t"
		"	subl	%3, %%edx\n\t"
		"1:	btl	$31, %4\n\t"
		"	jnc	1f\n\t"
		"	subl	%1, %%edx\n\t"
		"1:\n\t"
		: "=&A" (retval)
		: "m" (left), "m" (((unsigned *) &left)[1]),
		  "m" (right), "m" (((unsigned *) &right)[1])
		: "ebx", "ecx", "cc"
	);
	return retval;
}
#else
/* Plain C version: not optimal but portable. */
sll sllmul(sll a, sll b)
{
	unsigned int a_lo, b_lo;
	signed int a_hi, b_hi;
	sll x;

	a_lo = a;
	a_hi = (ull) a >> 32;
	b_lo = b;
	b_hi = (ull) b >> 32;

	x = ((ull) (a_hi * b_hi) << 32)
	  + (((ull) a_lo * b_lo) >> 32)
	  + (sll) a_lo * b_hi
	  + (sll) b_lo * a_hi;

	return x;
}
#endif

sll sllinv(sll v)
{
	int sgn = 0;
	sll u;
	ull s = -1;

	/* Use positive numbers, or the approximation won't work */
	if (v < CONST_0) {
		v = sllneg(v);
		sgn = 1;
	}

	/* An approximation - must be larger than the actual value */
	for (u = v; u; ((ull)u) >>= 1)
		s >>= 1;

	/* Newton's Method */
	u = sllmul(s, _sllsub(CONST_2, sllmul(v, s)));
	u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
	u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
	u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
	u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
	u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));

	return ((sgn) ? sllneg(u): u);
}

__inline__ sll slldiv(sll left, sll right)
{
	return sllmul(left, sllinv(right));
}

__inline__ sll sllmul2(sll x)
{
	return x << 1;
}

__inline__ sll sllmul4(sll x)
{
	return x << 2;
}

__inline__ sll sllmul2n(sll x, int n)
{
	sll y;

#if defined(__arm__)
	/* 
	 * On ARM we need to do explicit assembly since the compiler
	 * doesn't know the range of n is limited and decides to call
	 * a library function instead.
	 */
	__asm__ (
		"@ sllmul2n\n\t"
		"mov	%R0, %R1, lsl %2\n\t"
		"orr	%R0, %R0, %Q1, lsr %3\n\t"
		"mov	%Q0, %Q1, lsl %2\n\t"
		: "=r" (y)
		: "r" (x), "rM" (n), "rM" (32 - n)
	);
#else
	y = x << n;
#endif

	return y;
}

__inline__ sll slldiv2(sll x)
{
	return x >> 1;
}

__inline__ sll slldiv4(sll x)
{
	return x >> 2;
}

__inline__ sll slldiv2n(sll x, int n)
{
	sll y;

#if defined(__arm__)
	/* 
	 * On ARM we need to do explicit assembly since the compiler
	 * doesn't know the range of n is limited and decides to call
	 * a library function instead.
	 */
	__asm__ (
		"@ slldiv2n\n\t"
		"mov	%Q0, %Q1, lsr %2\n\t"
		"orr	%Q0, %Q0, %R1, lsl %3\n\t"
		"mov	%R0, %R1, asr %2\n\t"
		: "=r" (y)
		: "r" (x), "rM" (n), "rM" (32 - n)
	);
#else
	y = x >> n;
#endif

	return y;
}

/*
 * Unpack the IEEE floating point double format and put it in fixed point
 * sll format.
 */
sll dbl2sll(double dbl)
{
	union {
		double d;
		unsigned u[2];
		ull ull;
		sll sll;
	} in, retval;
	register unsigned exp;

	/* Move into memory as args might be passed in regs */
	in.d = dbl;

#if defined(__arm__)

	/* ARM architecture has a big-endian double */
	exp = in.u[0];
	in.u[0] = in.u[1];
	in.u[1] = exp;

#endif /* defined(__arm__) */

	/* Leading 1 is assumed by IEEE */
	retval.u[1] = 0x40000000;

	/* Unpack the mantissa into the unsigned long */
	retval.u[1] |= (in.u[1] << 10) & 0x3ffffc00;
	retval.u[1] |= (in.u[0] >> 22) & 0x000003ff;
	retval.u[0] = in.u[0] << 10;

	/* Extract the exponent and align the decimals */
	exp = (in.u[1] >> 20) & 0x7ff;
	if (exp)
		retval.ull >>= 1053 - exp;
	else
		return 0L;

	/* Negate if negative flag set */
	if (in.u[1] & 0x80000000)
		retval.sll = -retval.sll;

	return retval.sll;
}

double sll2dbl(sll s)
{
	union {
		double d;
		unsigned u[2];
		ull ull;
		sll sll;
	} in, retval;
	register unsigned exp;
	register unsigned flag;

	if (s == 0)
		return 0.0;

	/* Move into memory as args might be passed in regs */
	in.sll = s;

	/* Handle the negative flag */
	if (in.sll < 1) {
		flag = 0x80000000;
		in.ull = sllneg(in.sll);
	} else
		flag = 0x00000000;

	/* Normalize */
	for (exp = 1053; in.ull && (in.u[1] & 0x80000000) == 0; exp--) {
		in.ull <<= 1;
	}
	in.ull <<= 1;
	exp++;
	in.ull >>= 12;
	retval.ull = in.ull;
	retval.u[1] |= flag | (exp << 20);

#if defined(__arm__)

	/* ARM architecture has a big-endian double */
	exp = retval.u[0];
	retval.u[0] = retval.u[1];
	retval.u[1] = exp;

#endif /* defined(__arm__) */

	return retval.d;
}

/*
 * Calculate cos x where -pi/4 <= x <= pi/4
 *
 * Description:
 *	cos x = 1 - x^2 / 2! + x^4 / 4! - ... + x^(2N) / (2N)!
 *	Note that (pi/4)^12 / 12! < 2^-32 which is the smallest possible number.
 */
sll _sllcos(sll x)
{
	sll retval, x2;
	x2 = sllmul(x, x);
	/*
	 * cos x = t0 + t1 + t2 + t3 + t4 + t5 + t6
	 *
	 * f0 =  0! =  1
	 * f1 =  2! =  2 *  1 * f0 =   2 * f0
	 * f2 =  4! =  4 *  3 * f1 =  12 x f1
	 * f3 =  6! =  6 *  5 * f2 =  30 * f2
	 * f4 =  8! =  8 *  7 * f3 =  56 * f3
	 * f5 = 10! = 10 *  9 * f4 =  90 * f4
	 * f6 = 12! = 12 * 11 * f5 = 132 * f5
	 *
	 * t0 = 1
	 * t1 = -t0 * x2 /   2 = -t0 * x2 * CONST_1_2
	 * t2 = -t1 * x2 /  12 = -t1 * x2 * CONST_1_12
	 * t3 = -t2 * x2 /  30 = -t2 * x2 * CONST_1_30
	 * t4 = -t3 * x2 /  56 = -t3 * x2 * CONST_1_56
	 * t5 = -t4 * x2 /  90 = -t4 * x2 * CONST_1_90
	 * t6 = -t5 * x2 / 132 = -t5 * x2 * CONST_1_132
	 */
	retval = _sllsub(CONST_1, sllmul(x2, CONST_1_132));
	retval = _sllsub(CONST_1, sllmul(sllmul(x2, retval), CONST_1_90));
	retval = _sllsub(CONST_1, sllmul(sllmul(x2, retval), CONST_1_56));
	retval = _sllsub(CONST_1, sllmul(sllmul(x2, retval), CONST_1_30));
	retval = _sllsub(CONST_1, sllmul(sllmul(x2, retval), CONST_1_12));
	retval = _sllsub(CONST_1, slldiv2(sllmul(x2, retval)));
	return retval;
}

/*
 * Calculate sin x where -pi/4 <= x <= pi/4
 *
 * Description:
 *	sin x = x - x^3 / 3! + x^5 / 5! - ... + x^(2N+1) / (2N+1)!
 *	Note that (pi/4)^13 / 13! < 2^-32 which is the smallest possible number.
 */
sll _sllsin(sll x)
{
	sll retval, x2;
	x2 = sllmul(x, x);
	/*
	 * sin x = t0 + t1 + t2 + t3 + t4 + t5 + t6
	 *
	 * f0 =  0! =  1
	 * f1 =  3! =  3 *  2 * f0 =   6 * f0
	 * f2 =  5! =  5 *  4 * f1 =  20 x f1
	 * f3 =  7! =  7 *  6 * f2 =  42 * f2
	 * f4 =  9! =  9 *  8 * f3 =  72 * f3
	 * f5 = 11! = 11 * 10 * f4 = 110 * f4
	 * f6 = 13! = 13 * 12 * f5 = 156 * f5
	 *
	 * t0 = 1
	 * t1 = -t0 * x2 /   6 = -t0 * x2 * CONST_1_6
	 * t2 = -t1 * x2 /  20 = -t1 * x2 * CONST_1_20
	 * t3 = -t2 * x2 /  42 = -t2 * x2 * CONST_1_42
	 * t4 = -t3 * x2 /  72 = -t3 * x2 * CONST_1_72
	 * t5 = -t4 * x2 / 110 = -t4 * x2 * CONST_1_110
	 * t6 = -t5 * x2 / 156 = -t5 * x2 * CONST_1_156
	 */
	retval = _sllsub(x, sllmul(x2, CONST_1_156));
	retval = _sllsub(x, sllmul(sllmul(x2, retval), CONST_1_110));
	retval = _sllsub(x, sllmul(sllmul(x2, retval), CONST_1_72));
	retval = _sllsub(x, sllmul(sllmul(x2, retval), CONST_1_42));
	retval = _sllsub(x, sllmul(sllmul(x2, retval), CONST_1_20));
	retval = _sllsub(x, sllmul(sllmul(x2, retval), CONST_1_6));
	return retval;
}

sll sllcos(sll x)
{
	int i;
	sll retval;

	/* Calculate cos (x - i * pi/2), where -pi/4 <= x - i * pi/2 <= pi/4  */
	i = sll2int(_slladd(sllmul(x, CONST_2_PI), CONST_1_2));
	x = _sllsub(x, sllmul(int2sll(i), CONST_PI_2));

	switch (i & 3) {
		default:
		case 0:
			retval = _sllcos(x);
			break;
		case 1:
			retval = sllneg(_sllsin(x));
			break;
		case 2:
			retval = sllneg(_sllcos(x));
			break;
		case 3:
			retval = _sllsin(x);
			break;
	}
	return retval;
}

sll sllsin(sll x)
{
	int i;
	sll retval;

	/* Calculate sin (x - n * pi/2), where -pi/4 <= x - i * pi/2 <= pi/4 */
	i = sll2int(_slladd(sllmul(x, CONST_2_PI), CONST_1_2));
	x = _sllsub(x, sllmul(int2sll(i), CONST_PI_2));

	switch (i & 3) {
		default:
		case 0:
			retval = _sllsin(x);
			break;
		case 1:
			retval = _sllcos(x);
			break;
		case 2:
			retval = sllneg(_sllsin(x));
			break;
		case 3:
			retval = sllneg(_sllcos(x));
			break;
	}
	return retval;
}

sll slltan(sll x)
{
	int i;
	sll retval;
	i = sll2int(_slladd(sllmul(x, CONST_2_PI), CONST_1_2));
	x = _sllsub(x, sllmul(int2sll(i), CONST_PI_2));
	switch (i & 3) {
		default:
		case 0:
		case 2:
			retval = slldiv(_sllsin(x), _sllcos(x));
			break;
		case 1:
		case 3:
			retval = sllneg(slldiv(_sllcos(x), _sllsin(x)));
			break;
	}
	return retval;
}

/*
 * atan x = SUM[n=0,) (-1)^n * x^(2n + 1)/(2n + 1), |x| < 1
 *
 * Two term approximation
 *	a = x - x^3/3
 * Gives us
 *	atan x = a + ??
 * Let ?? = arctan ?
 *	atan x = a + arctan ?
 * Rearrange
 *	atan x - a = arctan ?
 * Apply tan to both sides
 *	tan (atan x - a) = tan arctan ?
 *	tan (atan x - a) = ?
 * Applying the standard formula
 *	tan (u - v) = (tan u - tan v) / (1 + tan u * tan v)
 * Gives us
 *	tan (atan x - a) = (tan atan x - tan a) / (1 + tan arctan x * tan a)
 * Let t = tan a
 *	tan (atan x - a) = (x - t) / (1 + x * t)
 * So finally
 *	arctan x = a + arctan ((tan x - t) / (1 + x * t))
 * And the typical worst case is x = 1.0 which converges in 3 iterations.
 */
sll _sllatan(sll x)
{
	sll a, t, retval;

	/* First iteration */
	a = sllmul(x, _sllsub(CONST_1, sllmul(x, sllmul(x, CONST_1_3))));
	retval = a;

	/* Second iteration */
	t = slldiv(_sllsin(a), _sllcos(a));
	x = slldiv(_sllsub(x, t), _slladd(CONST_1, sllmul(t, x)));
	a = sllmul(x, _sllsub(CONST_1, sllmul(x, sllmul(x, CONST_1_3))));
	retval = _slladd(retval, a);

	/* Third  iteration */
	t = slldiv(_sllsin(a), _sllcos(a));
	x = slldiv(_sllsub(x, t), _slladd(CONST_1, sllmul(t, x)));
	a = sllmul(x, _sllsub(CONST_1, sllmul(x, sllmul(x, CONST_1_3))));
	return _slladd(retval, a);
}

sll sllatan(sll x)
{
	sll retval;

	if (x < -sllneg(CONST_1))
		retval = sllneg(CONST_PI_2);
	else if (x > CONST_1)
		retval = CONST_PI_2;
	else
		return _sllatan(x);
	return _sllsub(retval, _sllatan(sllinv(x)));
}

/*
 * Calculate e^x where -0.5 <= x <= 0.5
 *
 * Description:
 *	e^x = x^0 / 0! + x^1 / 1! + ... + x^N / N!
 *	Note that 0.5^11 / 11! < 2^-32 which is the smallest possible number.
 */
sll _sllexp(sll x)
{
	sll retval;
	retval = _slladd(CONST_1, sllmul(0, sllmul(x, CONST_1_11)));
	retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_11)));
	retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_10)));
	retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_9)));
	retval = _slladd(CONST_1, sllmul(retval, slldiv2n(x, 3)));
	retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_7)));
	retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_6)));
	retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_5)));
	retval = _slladd(CONST_1, sllmul(retval, slldiv4(x)));
	retval = _slladd(CONST_1, sllmul(retval, sllmul(x, CONST_1_3)));
	retval = _slladd(CONST_1, sllmul(retval, slldiv2(x)));
	return retval;
}

/*
 * Calculate e^x where x is arbitrary
 */
sll sllexp(sll x)
{
	int i;
	sll e, retval;

	e = CONST_E;

	/* -0.5 <= x <= 0.5  */
	i = sll2int(_slladd(x, CONST_1_2));
	retval = _sllexp(_sllsub(x, int2sll(i)));

	/* i >= 0 */
	if (i < 0) {
		i = -i;
		e = CONST_1_E;
	}

	/* Scale the result */
	for (;i; i >>= 1) {
		if (i & 1)
			retval = sllmul(retval, e);
		e = sllmul(e, e);
	}
	return retval;
}

/*
 * Calculate natural logarithm using Netwton-Raphson method
 */
sll slllog(sll x)
{
	sll x1, ln = 0;

	/* Scale: e^(-1/2) <= x <= e^(1/2) */
	while (x < CONST_1_SQRTE) {
		ln = _sllsub(ln, CONST_1);
		x = sllmul(x, CONST_E);
	}
	while (x > CONST_SQRTE) {
		ln = _slladd(ln, CONST_1);
		x = sllmul(x, CONST_1_E);
	}

	/* First iteration */
	x1 = sllmul(_sllsub(x, CONST_1), slldiv2(_sllsub(x, CONST_3)));
	ln = _sllsub(ln, x1);
	x = sllmul(x, _sllexp(x1));

	/* Second iteration */
	x1 = sllmul(_sllsub(x, CONST_1), slldiv2(_sllsub(x, CONST_3)));
	ln = _sllsub(ln, x1);
	x = sllmul(x, _sllexp(x1));

	/* Third iteration */
	x1 = sllmul(_sllsub(x, CONST_1), slldiv2(_sllsub(x, CONST_3)));
	ln = _sllsub(ln, x1);

	return ln;
}

/*
 * ln x^y = y * log x
 * e^(ln x^y) = e^(y * log x)
 * x^y = e^(y * ln x)
 */
sll sllpow(sll x, sll y)
{
	if (y == CONST_0)
		return CONST_1;
	return sllexp(sllmul(y, slllog(x)));
}

/*
 * Consider a parabola centered on the y-axis
 * 	y = a * x^2 + b
 * Has zeros (y = 0)  at
 *	a * x^2 + b = 0
 *	a * x^2 = -b
 *	x^2 = -b / a
 *	x = +- (-b / a)^(1 / 2)
 * Letting a = 1 and b = -X
 *	y = x^2 - X
 *	x = +- X^(1 / 2)
 * Which is convenient since we want to find the square root of X, and we can
 * use Newton's Method to find the zeros of any f(x)
 *	xn = x - f(x) / f'(x)
 * Applied Newton's Method to our parabola
 *	f(x) = x^2 - X
 *	xn = x - (x^2 - X) / (2 * x)
 *	xn = x - (x - X / x) / 2
 * To make this converge quickly, we scale X so that
 *	X = 4^N * z
 * Taking the roots of both sides
 *	X^(1 / 2) = (4^n * z)^(1 / 2)
 *	X^(1 / 2) = 2^n * z^(1 / 2)
 * Let N = 2^n
 *	x^(1 / 2) = N * z^(1 / 2)
 * We want this to converge to the positive root, so we must start at a point
 *	0 < start <= x^(1 / 2)
 * or
 *	x^(1/2) <= start <= infinity
 * since
 *	(1/2)^(1/2) = 0.707
 *	2^(1/2) = 1.414
 * A good choice is 1 which lies in the middle, and takes 4 iterations to
 * converge from either extreme.
 */
sll sllsqrt(sll x)
{
	sll n, xn;
       
	/* Start with a scaling factor of 1 */
	n = CONST_1;

	/* Quick solutions for the simple cases */
	if (x <= CONST_0 || x == CONST_1)
		return x;

	/* Scale x so that 0.5 <= x < 2 */
	while (x >= CONST_2) {
		x = slldiv4(x);
		n = sllmul2(n);
	}
	while (x < CONST_1_2) {
		x = sllmul4(x);
		n = slldiv2(n);
	}

	/* Simple solution if x = 4^n */
	if (x == CONST_1)
		return n;

	/* The starting point */
	xn = CONST_1;

	/* Four iterations will be enough */
	xn = _sllsub(xn, slldiv2(_sllsub(xn, slldiv(x, xn))));
	xn = _sllsub(xn, slldiv2(_sllsub(xn, slldiv(x, xn))));
	xn = _sllsub(xn, slldiv2(_sllsub(xn, slldiv(x, xn))));
	xn = _sllsub(xn, slldiv2(_sllsub(xn, slldiv(x, xn))));

	/* Scale the result */
	return sllmul(n, xn);
}