/* > ecdl2K-108.32bit.c
 * Purpose: Fast arithmetic for computing discrete logs on elliptic curves.
 * Copyright: Robert J. Harley, 1997-1999.
 * Contact: Robert.Harley@inria.fr
 * Legalese: This source code is subject to the GNU General Public Licence v2.
 */

/* This is Rob's 32-bit ECDL program for Unix, version 1.1.0.
 *
 * Consult the following URL for more information:
 *   http://cristal.inria.fr/~harley/ecdl7/
 */


/*** Quickstart ***/

/* Do this:

cc -O3 ecdl2K-108.32bit.c -o ecdl.exe
echo Machine name is `hostname`
mkdir `hostname`
cd `hostname`
nice ../ecdl.exe mail by my@email.address on `hostname` > ecdl.log &

 *
 * Then go sign up, using the exact same email address, at this form:
 *
 *   http://cristal.inria.fr/bin/ecdldb
 *
 */


/*** Detailed instructions ***/

/** 1. Signing up **
 *
 * Go to this form to sign up:
 *
 *   http://cristal.inria.fr/bin/ecdldb
 *
 * Give your email address (or at least something that identifies you
 * uniquely) and a passphrase you just invented.  Keep a safe copy of
 * the passphrase!
 *
 * Note: If you prefer to stay anonymous and not sign up, that's fine.
 *
 **/

/** 2. Compiling **
 *
 * Compile with something like:
 *   gcc -O3 ecdl2K-108.32bit.c -o ecdl.exe
 *
 * You can test the binary like this:
 *   ./ecdl.exe test by me@here on bla
 *
 * The test output should match this sample except for the reported
 * times and rates:

> ./ecdl.exe test by me@here on bla
This is Rob's ECDL program for Unix32, version 1.1.0 MMX
Mode ...... test
Email ..... me@here
Machine ... bla
Starting at Sun Dec 12 15:34:13 1999
Generating 32 new starting points.
Computing iterations...

Test point found at: Sun Dec 12 15:34:18 1999
TEST-L0|Unix32|1.1.0 MMX|
TEST-L1|EF7DC6C12ACAF01D48CE4D44EB6|5163373649C868AD3265FACCF99|
TEST-L2|me@here|
TEST-L3|000000004887|
TEST-L4|bla|
Iterations used = 18567.
Total iterations = 594144.
Rate = 150416 per second.

Test point found at: Sun Dec 12 15:34:45 1999
TEST-L0|Unix32|1.1.0 MMX|
TEST-L1|B0152E85E4E22B6EE3C7120FC46|026063744AFC9DB003098294BD6|
TEST-L2|me@here|
TEST-L3|00000001BACA|
TEST-L4|bla|
Iterations used = 113354.
Total iterations = 3.62733e+06.
Rate = 148600 per second.

Test point found at: Sun Dec 12 15:34:54 1999
TEST-L0|Unix32|1.1.0 MMX|
TEST-L1|607BB600DECAF410A7B62ED3DD3|F43D17AAF642BBF4EBFAA9C1984|
TEST-L2|me@here|
TEST-L3|000000023C69|
TEST-L4|bla|
Iterations used = 146537.
Total iterations = 4.68918e+06.
Rate = 147598 per second.

[...and so on...]

 * If the program complains that it cannot find sendmail, you can
 * compile with -DSENDMAIL='"/usr/sbin/sendmail"' or something similar.
 *
 **/


/** 3. Optimising **
 *
 * You can compile with various optimisation flags to maximise speed
 * before starting on the real calculations.  Speeds to aim for are
 * roughly 150 k iterations per second for a 500 MHz x86 (80 k without
 * MMX), 100 k for a 450 MHz PowerPC G3 and 40 k for a 275 MHz StrongARM.
 *
 * Good flags to use with gcc are:
 *   -fomit-frame-pointer -funroll-loops -fexpensive-optimizations
 *
 * You might also consider specifying "-march=pentiumpro" or similar.
 *
 * If you have a chip with MMX instructions, definitely use "-DMMX" as
 * the program will run about twice as fast!  (Compiling the MMX
 * version requires a recent version of gcc).  For non-MMX chips, try
 * adding -DPROD=2 (or 3, 4 or 5) to the compilation flags to use a
 * different implementation of speed critical functions.  It might be
 * a little faster; then again, maybe not!
 *
 * You can also try tweaking the value of PARAL.  Just add -DPARAL=40,
 * for instance.  The default is 32 and useful values are roughly in
 * the range 20 to 50.  Note that the test output will no longer
 * exactly match the sample given above.
 *
 **/

/** 4. Starting up **
 *
 * When you're ready to start up: on each machine you want to run on
 * make a directory for that machine, "cd" into it and go.
 *
 *   echo Machine name is `hostname`
 *   mkdir `hostname`
 *   cd `hostname`
 *   unlimit cputime || (echo Using ulimit instead; ulimit -t unlimited)
 *   nice +15 ../ecdl.exe [...arguments, see below...] > ecdl.log &
 *
 * Note: Some shells need "nice -15" instead.
 * Note: You can keep an eye on the output using "tail -f ecdl.log".
 * Note: If you have a machine with several CPUs, make a directory for
 *       each CPU and start up an ecdl process in each one.
 *
 * The arguments have the following syntax:
 *   ecdl <mode> by <email> on <machine>
 *
 * <mode>:      Either "test", "batch", "mail", "alt" or "http".
 * <email>:     Something to identify you uniquely e.g., your email address.
 * <machine>:   The machine name e.g., as given by "hostname".
 *
 * Note: For <email> give the exact same string you used when signing up,
 *       since otherwise the Web pages will not be able to show correct
 *       statistics for your contribution.
 *
 * Examples:
 *   ecdl mail by John.Smith@example.com on avalon
 *   ecdl batch by anonymous on 'Mystery machine'
 *
 * Test mode quickly produces some (useless) test output to stdout along
 * with info on iterations per second.  This allows you to check that
 * your binary is OK, to compare compiler optimisation flags etc.
 *
 * The other modes produce things called "distinguished points" every
 * few hours.  Each distinguished point looks something like this:

Distinguished point found at: Wed Nov 24 03:52:02 1999
ECC-L0|Unix32|1.1.0|
ECC-L1|6B445FC02D031AEA01F28B7AF92|ABD2FF8960D196A4AB43719A64F|
ECC-L2|me@here|
ECC-L3|000015AE4C35|
ECC-L4|bla|

 * These points are valuable!  The ECDL project needs to collect
 * roughly 1.3 million of them to solve Certicom's ECC2K-108 problem.
 * The points are always appended to a file called "dist.points" and
 * also written to stdout along with some verbose info.
 *
 * When running in "mail" mode each point is automatically emailed
 * (using "sendmail") to ecdl2K-108@cristal.inria.fr as well.  INRIA's
 * excessively aggressive spam filters might refuse the email; in that
 * case switch to "alt" mode which sends to ecdl2K-108@rupture.net instead.
 *
 * When running in "http" mode each point is automatically sent (using
 * the HTTP POST protocol) to the Web server on cristal.inria.fr.  If
 * your machine is behind a firewall, you'll need to specify a Web
 * proxy to use for sending the points via HTTP.  Specify it on the
 * command line as follows:
 *
 *   ecdl http by <email> on <machine> proxy <proxy-host>:<proxy-port>
 *
 * where <proxy-host> is the host name of the proxy, e.g.
 * "myproxy.mydomain.com", and <proxy-port> is the port number of the
 * HTTP proxy on this host, e.g. "3128".
 *
 * For machines unable to send email or make HTTP connections,
 * for instance not permanently connected to the Net, use "batch" mode.
 * Then every few days do:
 *
 *   mv dist.points dist.points.nov.25
 *
 * or similar using the correct date, and email the "dist.points.nov.25"
 * file manually.  When the next distinguished point is found, a new
 * "dist.points" file will be created for it and further points will
 * be appended to this new file until the next time you "mv" it.
 *
 * Mode:                           test    batch   mail    alt     http
 * ----------------------------------------------------------------------
 *
 * Writes info on stdout            x       x       x       x       x
 *
 * Writes points to "dist.points"           x       x       x       x
 *
 * Sends points via sendmail                        x       x
 *
 * Sends points via http                                            x
 *
 **/

/** 5. Shutting down (and saved states) **
 *
 * When the program is running it saves its internal state in a file
 * called "saved.state" from time to time.  If you want to stop the
 * program, send it a SIGINT or SIGTERM signal so it can save its
 * state before exiting.  If it is running in the foreground, you can
 * send SIGINT by typing Ctrl-C.  If it is running in the background
 * send SIGTERM by "killall ecdl.exe".  If you don't have "killall"
 * installed, find the PID (first column of the "ps" output) and use "kill":
 *
 *   ps x | grep ecdl | grep -v grep
 *   kill <the PID>
 *
 * When you restart in the same directory:
 *
 *   nice +15 ../ecdl.exe [...arguments...] >> ecdl.log &
 *
 * the program will look for the saved state file.  If no saved state
 * is found it will generate a new state and work from there.  If one
 * is found, the program reads it and continues where it left off.
 * Note that it would be wasteful to use a given saved state on more
 * than one CPU because work would just be duplicated.
 *
 * The reason each machine should have its own directory is to avoid
 * the dist.points and saved.state files getting mixed up!  Note that
 * test mode does not mess with these files.
 *
 **/


/*== #includes =============================================================*/

/** Ansi includes. **/

#include <stdio.h>

/* For strerror(), strcmp() and strlen(). */
#include <string.h>

/* For signal(), SIGINT and SIGTERM. */
#include <signal.h>

/* For errno. */
#include <errno.h>

/* For time and date: time() and ctime(). */
#include <time.h>

/* For malloc(). */
#include <stdlib.h>


/** Unix includes. **/

/* For getpid(), access() and timing. */
#include <unistd.h>

/* For timing: getrusage() and struct rusage. */
#include <sys/time.h>
#if !MACOS
#include <sys/resource.h>
#endif

/* Socket stuff */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>


/** Windows (Cygwin32) includes. **/

#if __CYGWIN__
/* For SetPriorityClass */
#include <winbase.h>
/* For times() */
#include <sys/times.h>
#endif


/** MacOS includes. **/

#if MACOS
#include "main.h"
#endif


/*== Types =================================================================*/

/* Abbreviations used when bit-length does not matter. */
typedef unsigned int uint;

/* Integers. */
typedef unsigned char u8;
typedef unsigned int u32;

typedef struct { u32 hi, lo; } u64;
typedef struct { u32 t, h, m, l; } u128;

/* Polynomials over the two-element field GF(2) = Z/2Z. */
typedef struct { u32 t, h, m, l; } poly128;


/* MMX types.  Endianness, alignment and packing matter here. */
#ifdef MMX

typedef struct {
  u32 lo, hi;
} __attribute__((packed)) __attribute__ ((aligned (8))) mmx64;

typedef struct { mmx64 hi, lo; } mmx128;

#endif


/* Operating modes. */
typedef enum { Bad, Test, Batch, Mail, Alt, Http } modeType;


/*== #defines ==============================================================*/

/*-= Stuff that you can change =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/

/* The place to find sendmail.  Often it's in /usr/sbin/sendmail but
 * then /usr/lib/sendmail is a link to it, so this works fairly universally.
 */
#ifndef SENDMAIL
#define SENDMAIL "/usr/lib/sendmail"
#endif

/* Name of device for reading entropy from (it isn't strictly necessary). */
#ifndef RANDOM_DEV_FILENAME
#define RANDOM_DEV_FILENAME "/dev/urandom"
#endif

/* Name of file for writing distinguished points to (as well as stdout). */
#ifndef DIST_POINTS_FILENAME
#define DIST_POINTS_FILENAME "dist.points"
#endif

/* Name of file for saving state after each distinguished point or when
 * stopping due to a SIGINT or SIGTERM.  It is read when restarting.
 */
#ifndef SAVED_STATE_FILENAME
#define SAVED_STATE_FILENAME "saved.state"
#endif


/* The state is checked and saved every 2^SAVED_STATE_SHIFT iterations
 * and also after a distinguished point is found.
 */
#ifndef SAVED_STATE_SHIFT
#define SAVED_STATE_SHIFT (26)
#endif

#if SAVED_STATE_SHIFT < 20 || SAVED_STATE_SHIFT > 31
#error Please define SAVED_STATE_SHIFT in the range 20..31.
#endif


/* Number of iterations to run "in parallel".  This is done so that
 * PARAL inversions can be replaced by one inversion and 3*PARAL-3 mults.
 * Try something roughly in the range 20 to 50.
 */
#ifndef PARAL
#define PARAL (32)
#endif

#if PARAL < 1
#error Please define PARAL to be greater than 0.
#endif


#ifdef MMX

#ifdef PROD
#warning PROD is ignored in MMX mode...
#undef PROD
#endif

#else

/* Define PROD to choose a different implementation of the critical
 * GF2ProductNNxNN() and product() functions.  The default is 1.
 * Try 2, 3, 4 or 5.  They might be faster.  Then again maybe not!
 */
#ifndef PROD
#define PROD 1
#endif

#if PROD < 1 || PROD > 5
#error Please define PROD in the range 1..5.
#endif

#endif


#ifdef MMX

/* For good performance with MMX, variables of type mmx64 need 8-byte
 * alignment.  On systems that do not align the stack sufficiently, it
 * is best to force such variables into the data area with "static".
 * Even if the stack is aligned this does not hurt performance, so we
 * do it by default.
 */

/*#define MMX_STATIC*/
#define MMX_STATIC static

#endif


/* Keyword for giving "inline" hint.
 * Adjust for your compiler or leave it out.
 */
#if defined(__GNUC__)
#define INLINE __inline
#else
#define INLINE
#endif


/*-= Stuff that must not be changed =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/* Version 0.9.0: 31 September 1999
 * - Does 66 k iters/sec with gcc 2.95+Linux+500 MHz Pentium III (PARAL=20).
 * Version 0.99.0: 24 November 1999
 * - Does 82 k iters/sec with gcc 2.95+Linux+500 MHz Pentium III (PARAL=32)
 *   and 145 k when using MMX!
 * Version 1.0.0: 3 December 1999
 * Version 1.0.1: 7 December 1999
 * - Added HTTP mode (Xavier Leroy).
 * - Ported to MS Windows with Cygwin32 (Xavier Leroy).
 * Version 1.0.2: 9 December 1999
 * - Put variables of type mmx64 in the data area (Jacques Garrigue).
 * - Added MacOS stuff (Damien Doligez).
 * - Gross hack to work around spurious error reports in http mode.
 * Version 1.1.0: 12 December 1999
 * - Print time at start-up.
 * - Proper solution for http mode: use HTTP/1.0 (Roy T. Fielding).
 */
#if __CYGWIN__
#define VARIANT "Cygwin32"
#elif MACOS
#define VARIANT "MacOS"
#else
#define VARIANT "Unix32"
#endif

#ifdef MMX
#define VERSION "1.1.0 MMX"
#else
#define VERSION "1.1.0"
#endif


/* These are used a few places for types poly128 and u128. */
#define ZERO { 0, 0, 0, 0 }
#define ONE { 0, 0, 0, 1 }


/* The number of cases used in pseudo-random function. */
#define CASES (7)


/* Population count used to decide if a point is distinguished.
 * Expect a given popc to occur one time in 2^108/binomial(109, popc)
 * if it's odd (even ones never occur).
 */
/* 1 time in 1422267 or so (for testing). */
#define TEST_POP_COUNT (29)

/* 1 time in 1407700775 or so (for real). */
#define DIST_POP_COUNT (23)

/* For clarity. */
#define square(x) (squareNTimes((x), 1))


/*-= Stuff for MMX =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/

/* Define this for a version that uses lots of MMX instructions.
 * It is roughly twice as fast with MMX as without.
 */
/*#define MMX*/


/* The MMX instructions are emitted using gcc-specific asm()s. */
#if defined(MMX) && !defined(__GNUC__)
#error Compiling for MMX requires gcc.
#endif


#ifdef MMX

#define STRNG(x) #x
#define STRING(x) STRNG(x)

extern mmx64 mmx_regs[8]; /* Dummy memory area. */

#define START_MMX() \
  asm volatile ("/* Start of MMX section */" : :  )

#define STR(MEM, RVAL) \
  asm("movq %%mm" STRING(RVAL) ", %0" : "=m" (MEM) : "m" (mmx_regs[RVAL]))

#define LOD(LVAL, MEM) \
  asm("movq %1, %%mm" STRING(LVAL) : "=m" (mmx_regs[LVAL]) : "m" (MEM))

#define CPY(LVAL, RVAL) \
  asm( "movq %%mm" STRING(RVAL) ", %%mm" STRING(LVAL)                   \
     : "=m" (mmx_regs[LVAL]) : "m" (mmx_regs[RVAL])                     \
     )

#define SLL(LVAL, SH) \
  asm("psllq $" STRING(SH) ", %%mm" STRING(LVAL) : "+m" (mmx_regs[LVAL]) : )

#define SRL(LVAL, SH) \
  asm("psrlq $" STRING(SH) ", %%mm" STRING(LVAL) : "+m" (mmx_regs[LVAL]) : )

#define ORR(LVAL, RVAL) \
  asm( "por %%mm" STRING(RVAL) ", %%mm" STRING(LVAL)                    \
     : "+m" (mmx_regs[LVAL]) : "m" (mmx_regs[RVAL])                     \
     )

#define AND(LVAL, RVAL) \
  asm( "pand %%mm" STRING(RVAL) ", %%mm" STRING(LVAL)                   \
     : "+m" (mmx_regs[LVAL]) : "m" (mmx_regs[RVAL])                     \
     )

#define XOR(LVAL, RVAL) \
  asm( "pxor %%mm" STRING(RVAL) ", %%mm" STRING(LVAL)                   \
     : "+m" (mmx_regs[LVAL]) : "m" (mmx_regs[RVAL])                     \
     )

#define AND_MEM(LVAL, MEM) \
  asm("pand %1, %%mm" STRING(LVAL) : "+m" (mmx_regs[LVAL]) : "m" (MEM))

#define ORR_MEM(LVAL, MEM) \
  asm("por %1, %%mm" STRING(LVAL) : "+m" (mmx_regs[LVAL]) : "m" (MEM))

#define XOR_MEM(LVAL, MEM) \
  asm("pxor %1, %%mm" STRING(LVAL) : "+m" (mmx_regs[LVAL]) : "m" (MEM))

#define END_MMX() \
  asm volatile ("emms /* End of MMX section */" : : )

#endif
/* defined(MMX) ... */


/*-= Data defining which ECDL problem to solve =-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/

/* Field is GF(2^109) represented as (Z/2Z)[t] / (t^109+t^9+t^2+t+1)
 *   (t^109+t^9+t^2+t+1 is irreducible in (Z/2Z)[t]).
 */

/* Curve from Certicom ECC2K-108 problem is y^2 + x*y = x^3 + x^2 + 1
 * over GF(2^109).
 *
 * Group order is G = 2^109+1-w^109-conj(w)^109 where w = (1-sqrt(-7))/2.
 * Use subgroup of order G/2 = 324518553658426701487448656461467 (prime).
 *
 * Frobenius is multiplication by w i.e., by
 * 138423345589698157369693034392981 modulo g.
 */
#define A       { 0, 0, 0, 1 }
#define B       { 0, 0, 0, 1 }
#define ORDER   { 0x0FFF, 0xFFFFFFFF, 0xFFA621B0, 0x2C383E9B }

/* Points from Certicom ECC2K-108 problem: */
#define XP      { 0x0478, 0xC46CC963, 0x38CED915, 0x65E17257 }
#define YP      { 0x1E79, 0x65E4A3AF, 0xB73A48FC, 0x9AB790E9 }
#define XQ      { 0x1FF0, 0xCE5EC618, 0x93F2119C, 0x3077C59E }
#define YQ      { 0x1F20, 0xE9B010AC, 0x691C9B87, 0xB438241D }


/*== Function declarations =================================================*/

/*-= Short little functions =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

static INLINE int ge(u128 x, u128 y);
static INLINE u128 diff(u128 x, u128 y);
static INLINE u128 sumMod(u128 x, u128 y, u128 m);
static INLINE u128 doubleMod(u128 x, u128 m);
static INLINE u128 incMod(u128 x, u128 m);
static INLINE int equal(poly128 x, poly128 y);
static INLINE poly128 xor(poly128 x, poly128 y);
static INLINE u32 topBit(u32 x);
static INLINE u128 squareMod(u128 x, u128 m);


/*-= Main and friends =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

static modeType parse(int argc, char *argv[]);
void catchSignal(int sigNum);
int main(int argc, char *argv[]);


/*-= Stuff for checking that a point is distinguished =-=-=-=-=-=-=-=-=-=-=-*/

static uint popc(poly128 n);
static poly128 changeBasis(poly128 x);


/*-= Arithmetic in the field GF(2^109) -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

static poly128 squareNTimes(poly128 x, uint n);

#if PROD == 1
static u32 GF2Product27x27(u32 a, u32 b, u32 *ph);
#elif PROD == 3
static u32 GF2Product28x28(u32 a, u32 b, u32 *ph);
#elif PROD == 5
static u32 GF2Product32x32(u32 a, u32 b, u32 *ph);
#endif

#ifdef MMX
static void MMX_GF2Product56x56
  (u32 xh, u32 xl, const mmx64 *py, mmx64 *ph, mmx64 *pl);
#elif PROD == 1 || PROD == 2
static u32 GF2Product54x54
  (u32 xh, u32 xl, u32 yh, u32 yl, u32 *pt, u32 *ph, u32 *pm);
#else
static u32 GF2Product56x56
  (u32 xh, u32 xl, u32 yh, u32 yl, u32 *pt, u32 *ph, u32 *pm);
#endif

static poly128 product(poly128 x, poly128 y);
static poly128 inverse(poly128 y);
static INLINE poly128 quotient(poly128 x, poly128 y);


/*-= Arithmetic mod the group order =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

static u128 productMod(u128 x, u128 y, u128 m);
static u128 powerMod(u128 x, u64 y, u128 m);
static u128 findMultiplier(u64 *cases);


/*-= Arithmetic on elliptic curve over the field =-=-=-=-=-=-=-=-=-=-=-=-=-=*/

static int ellipticDouble
  (poly128 x, poly128 y, int z, poly128 *px2, poly128 *py2);
static int ellipticSum
  ( poly128 x1, poly128 y1, int z1, poly128 x2, poly128 y2, int z2
  , poly128 *px3, poly128 *py3
  );
static int ellipticProduct
  (poly128 x, poly128 y, int z, u128 fac, poly128 *px2, poly128 *py2);


/*-= Various file manipulation thingies =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

static int syncAndCheck(u64 *cases, u128 *pu, u128 *pv, poly128 x, poly128 y);
static void reportDistinguished
  ( modeType mode, char *argv[]
  , u64 iters, u128 u, u128 v, u64 total, double dStart
  );
static void u32ToBytes(u32 data, u8 *p);
static u32 bytesToU32(u8 *p);
static void writeState
  (u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT);
static uint readState
  (u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT);


/*-= Stuff for http mode =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/

static int httpPost(char * payload);
static int httpParseProxy(char * proxy);
static int httpInit(void);


/*== Global variables ======================================================*/

int gotSignal = 0;


/*== Function definitions ==================================================*/

/*-= Short little functions =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/*-- ge --------------------------------------------------------------------*/

/* Return whether x >= y, for 0 <= x, y < 2^128. */
static INLINE int ge(u128 x, u128 y) {

  return x.t > y.t
         || x.t == y.t
            && ( x.h > y.h
                 || x.h == y.h && (x.m > y.m || x.m == y.m && x.l >= y.l)
               )
         ;
} /* end function ge */


/*-- diff ------------------------------------------------------------------*/

/* Return x-y for 0 <= y <= x < 2^128. */
static INLINE u128 diff(u128 x, u128 y) {
  u32 c, d;

  c = x.l < y.l; x.l -= y.l;
  d = x.m < c; x.m -= c; c = d | x.m < y.m; x.m -= y.m;
  d = x.h < c; x.h -= c; c = d | x.h < y.h; x.h -= y.h;
  x.t -= c; x.t -= y.t;

  return x;
} /* end function diff */


/*-- sumMod ----------------------------------------------------------------*/

/* Return sum x+y modulo m, 0 <= x, y < m <= 2^127. */
static INLINE u128 sumMod(u128 x, u128 y, u128 m) {
  u32 c;

  x.l += y.l; c = x.l < y.l;
  x.m += c; c = x.m < c; x.m += y.m; c |= x.m < y.m;
  x.h += c; c = x.h < c; x.h += y.h; c |= x.h < y.h;
  x.t += c; x.t += y.t;

  if (ge(x, m)) x = diff(x, m);

  return x;
} /* end function sumMod */


/*-- doubleMod -------------------------------------------------------------*/

/* Return double x<<1 modulo m, where 0 <= x < m <= 2^127. */
static INLINE u128 doubleMod(u128 x, u128 m) {

  x.t <<= 1; x.t |= x.h>>31;
  x.h <<= 1; x.h |= x.m>>31;
  x.m <<= 1; x.m |= x.l>>31;
  x.l <<= 1;

  if (ge(x, m)) x = diff(x, m);

  return x;
} /* end function doubleMod */


/*-- incMod ----------------------------------------------------------------*/

/* Increment x modulo m, for 0 <= x < m < 2^128. */
static INLINE u128 incMod(u128 x, u128 m) {
  const u128 intZero = ZERO;

  if (!++x.l) if (!++x.m) if (!++x.h) ++x.t;

  if (x.l == m.l && x.m == m.m && x.h == m.h && x.t == m.t) {
    x = intZero;
  } /* end if */

  return x;
} /* end function incMod */


/*-- equal -----------------------------------------------------------------*/

/* Return whether x == y, for polys over Z/2Z. */
static INLINE int equal(poly128 x, poly128 y) {

  return x.l == y.l && x.m == y.m && x.h == y.h && x.t == y.t;
} /* end function equal */


/*-- xor -------------------------------------------------------------------*/

/* Return x XOR y, in other words add them as polys over Z/2Z. */
static INLINE poly128 xor(poly128 x, poly128 y) {

  x.l ^= y.l; x.m ^= y.m; x.h ^= y.h; x.t ^= y.t;

  return x;
} /* end function xor */


/*-- topBit ----------------------------------------------------------------*/

/* Returns top bit of x, or 0 if x = 0. */
static INLINE u32 topBit(u32 x) {

  x |= x>>1; x |= x>>2; x |= x>>4; x |= x>>8; x |= x>>16;

  return x ^ x>>1;
} /* end function topBit */


/*-- squareMod -------------------------------------------------------------*/

static INLINE u128 squareMod(u128 x, u128 m) {

  return productMod(x, x, m);
} /* end function squareMod */


/*-= Main and friends =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/*-- parse -----------------------------------------------------------------*/

/* Parse command line and return mode Bad, Batch, Test, Mail, Alt or Http. */
static modeType parse(int argc, char *argv[]) {
  char ch, *str;
  modeType mode;

  if (argc != 6 && argc != 8) return Bad;

  str = argv[1];

  if (!strcmp(str, "test")) mode = Test;
  else if (!strcmp(str, "batch")) mode = Batch;
  else if (!strcmp(str, "mail")) mode = Mail;
  else if (!strcmp(str, "alt")) mode = Alt;
  else if (!strcmp(str, "http")) mode = Http;
  else return Bad;

  if (strcmp(argv[2], "by") ) return Bad;

  for (str = argv[3]; (ch = *str) != '\0'; ++str) if (ch == '|') return Bad;

  if (strcmp(argv[4], "on")) return Bad;

  for (str = argv[5]; (ch = *str) != '\0'; ++str) if (ch == '|') return Bad;

  if ( argc == 8
       && ( mode != Http || strcmp(argv[6], "proxy")
            || httpParseProxy(argv[7]) == -1
          )
     ) {
    return Bad;
  } /* end if */

  return mode;
} /* end function parse */


/*-- catchSignal -----------------------------------------------------------*/

/* Signal catcher for SIGINT or SIGTERM.
 * Allows program to stop cleanly and save state.
 * User can send the signal by Ctrl-C or "kill <PID>", for instance.
 */
void catchSignal(int sigNum) {

  gotSignal = 1;

} /* end function catchSignal */


/*-- main ------------------------------------------------------------------*/

int main(int argc, char *argv[]) {
  /* Constant stuff. */
  const int zP = 1, zQ = 1;
  const u128 order = ORDER;
  const poly128 xP = XP, yP = YP, xQ = XQ, yQ = YQ;

  /* These get initialised once and for all. */
  uint distPopc;
  modeType mode;

  /* These are variable. */
  u64 total;
  double dStart; /* For timing. */

  uint pops[PARAL];
  u64 casesT[PARAL][CASES], itersT[PARAL];
  u128 uT[PARAL], vT[PARAL];
  poly128 denT[PARAL], x1T[PARAL], x2T[PARAL], y1T[PARAL], y2T[PARAL];


  /** Simple sanity check. **/

  if (sizeof(u32) != 4) {
    puts("Fatal error: *** the size of u32 is wrong. ***");
    return 1;
  } /* end if */


#if MACOS
  init_mac(&argc, &argv, &total);
#endif


  /** Parse command line. **/

  mode = parse(argc, argv);
  if (mode == Bad) {
    printf( "Syntax: %s <mode> by <email> on <machine> "
              "[proxy <host>:<port>]\n"
            "See top of source code or README file for details.  Stopping.\n"
          , argv[0]
          );
    return 1;
  } /* end if */


  /** See if we can find sendmail with execute permission. **/

#if !MACOS
/* Note: Mac mailing is different... */

  if ((mode == Mail || mode == Alt) && access(SENDMAIL, X_OK)) {
    printf( "Fatal error: *** sendmail program " SENDMAIL " does not exist\n"
            "                 or is not executable ***\n"
            "             (errno = %d: %s).\n"
          , errno, strerror(errno)
          );
    return 1;
  } /* end if */

#endif


  /** Initialize HTTP stuff if http mode selected */

  if (mode == Http && httpInit() == -1) {
    puts("Fatal error: *** http initialisation failed. ***");
    return 1;
  } /* end if */


  /** Put up banner. **/

  printf( "This is Rob's ECDL program for " VARIANT ", version " VERSION "\n"
          "Mode ...... %s\n"
          "Email ..... %s\n"
          "Machine ... %s\n"
        , argv[1], argv[3], argv[5]
        );
  { time_t now; /* For time and date. */

    now = time(NULL);
    if (now == -1) puts("Warning: can't determine time and date.");
    else printf("Starting at %s", ctime(&now));
  } /* end block */

  fflush(stdout);


  /** Lower our priority (under Unix, this is done by the shell) */

#ifdef _WIN32
  SetPriorityClass(GetCurrentProcess(), IDLE_PRIORITY_CLASS);
#endif


  /** Initialisation. **/

  /* Read starting points from the saved state or generate new random ones. */
  { uint i;

    if (mode == Test) i = 0;
    else i = readState(itersT, uT, vT, x1T, y1T);

    if (i < PARAL) {
      u128 seed;

      printf("Generating %u new starting points.\n", (uint)PARAL-i);
      fflush(stdout);

      if (mode == Test) {
        /* Fixed seed for testing. */
        seed.t = 0;
        seed.h = 0;
        seed.m = 0x01234567;
        seed.l = 0x89ABCDEF;
      } else {
        FILE *handle;

        /* Get really random seed. */
        handle = fopen(RANDOM_DEV_FILENAME, "rb");
        if (handle) {
          int status;

          fread((void *)&seed, sizeof(seed), (size_t)1, handle);
          status = fclose(handle);
          if (status == EOF) {
            printf( "Warning: fclose() of random device " RANDOM_DEV_FILENAME
                      " failed\n"
                    "         (errno = %d: %s).\n"
                  , errno, strerror(errno)
                  );
          } /* end if */
        } else {
          printf( "Warning: random device " RANDOM_DEV_FILENAME " not found\n"
                  "  (errno = %d: %s).  Making do without...\n"
                , errno, strerror(errno)
                );
        } /* end if */

        /* Randomise seed some more (if random device was not found,
         * seed is stack junk which is OK but may not be random enough!)
         */
        seed.h ^= getpid(); seed.l ^= time(NULL);

        /* Ensure seed < order. */
        seed.t &= 0x00000FFF; seed.h &= 0xFFFFFFFE;
      } /* end if/else */


      /* Generate random points. */
      for (/* i was set above */ ; i < PARAL; ++i) {
        int z, zu, zv;
        poly128 xu, xv, yu, yv;
        /* Euler gamma times 2^108. */
        const u128 gamma = { 0x0000093C, 0x467E37DB, 0xC7A4D1B, 0xE3F81015 };

        itersT[i].hi = itersT[i].lo = 0;
        do {
          /* Not terribly random, but it will do! */
          uT[i] = seed;
          zu = ellipticProduct(xP, yP, zP, seed, &xu, &yu);
          seed = sumMod(seed, gamma, order);

          vT[i] = seed;
          zv = ellipticProduct(xQ, yQ, zQ, seed, &xv, &yv);
          seed = sumMod(seed, gamma, order);

          z = ellipticSum(xu, yu, zu, xv, yv, zv, &x1T[i], &y1T[i]);
        } while (!z);
      } /* end for (i) */
    } /* end if */

    /* Initialise counts of cases to 0. */
    for (i = 0; i < PARAL; ++i) {
      uint j;
      for (j = 0; j < CASES; ++j) casesT[i][j].hi = casesT[i][j].lo = 0;
    } /* end for (i) */

  } /* end block */


  /* Timing (measured in seconds of user time for this process). */
#if MACOS
  dStart = get_time();
#elif __CYGWIN__
  { struct tms tm;

    times(&tm);
    dStart = (double)tm.tms_utime/(double)CLK_TCK;
  } /* end block */
#else
  { struct rusage ru;

    getrusage(RUSAGE_SELF, &ru);
    dStart = (double)ru.ru_utime.tv_sec+(double)ru.ru_utime.tv_usec*0.000001;
  } /* end block */
#endif


  /** Main loop. **/

  /* Sketch of main loop structure:
   * for ever {
   *   for (all PARAL points) {
   *     while (this one is distinguished) { output it; replace by new one; }
   *   }
   *   for (all PARAL points) {
   *     get denominator for this one;
   *   }
   *   get inverses of all PARAL denominators at once;
   *   for (all PARAL points) {
   *     apply pseudo-random function to this one using inverse(denominator);
   *   }
   * }
   */


  puts("Computing iterations...");
  fflush(stdout);

  distPopc = mode == Test ? TEST_POP_COUNT : DIST_POP_COUNT;

  signal(SIGINT, catchSignal);
  signal(SIGTERM, catchSignal);

  total.hi = total.lo = 0;
  while (!gotSignal) {
    uint i;
    int someWereDistinguished = 0;

#if MACOS
    ROTATECURSOR_MAGIC();
#endif

    /** Check for distinguished points. **/
    for (i = 0; i < PARAL; ++i) {

      while ((pops[i] = popc(changeBasis(x1T[i]))) == distPopc) {
        /* Point is distinguished. */
        int z;

        /* Check that the point is OK. */
        if (!syncAndCheck(&casesT[i][0], &uT[i], &vT[i], x1T[i], y1T[i])) {
          puts("Fatal error: *** bad distinguished point detected! ***");
          return 2;
        } /* end if */

        /* Report it. */
        reportDistinguished( mode, argv
                           , itersT[i], uT[i], vT[i], total, dStart
                           );

        /* Restart this point by incrementing v. */
        itersT[i].hi = itersT[i].lo = 0;
        do {
	  vT[i] = incMod(vT[i], order);
	  z = ellipticSum(x1T[i], y1T[i], 1, xQ, yQ, zQ, &x1T[i], &y1T[i]);
        } while (!z);

        /* Note this for use immediately below. */
        someWereDistinguished = 1;

      } /* end while (distinguished point) */
    } /* end for (i) */


    /** Save state after a bunch of iterations or a distinguished point. **/
    if ( (total.lo | total.hi)
         && (total.lo & ((1UL<<SAVED_STATE_SHIFT)-1)) < PARAL
         || someWereDistinguished
       ) {

      /* Check all points. */
      for (i = 0; i < PARAL; ++i) {
        if (!syncAndCheck(&casesT[i][0], &uT[i], &vT[i], x1T[i], y1T[i])) {
          puts("Fatal error: *** bad point detected! ***");
          return 3;
        } /* end if */
      } /* end for */

      /* Save state (except when testing). */
      if (mode != Test) writeState(itersT, uT, vT, x1T, y1T);
    } /* end if */


    /** Get denominators for additions of points. **/
    for (i = 0; i < PARAL; ++i) {
      uint m;

      m = pops[i]; /* Note: m <= 109 */
      m -= 7*(m*9>>6); if (m >= 7) m -= 7; /* Reduce modulo 7. */
      ++casesT[i][m].lo; if (!casesT[i][m].lo) ++casesT[i][m].hi;

      m += 2; if (m == 3) m = 1;
      x2T[i] = squareNTimes(x1T[i], m);
      denT[i] = xor(x1T[i], x2T[i]);
      y2T[i] = squareNTimes(y1T[i], m);
    } /* end for (i) */


    /** Invert PARAL denominators with 1 inversion and 3*PARAL-3 mults. **/
    { poly128 ix, prod, q, prodT[PARAL];

      prod = denT[0];
      for (i = 1; i < PARAL; ++i) {
        prodT[i] = prod;
        prod = product(prod, denT[i]);
      } /* end for */

      q = inverse(prod);

      for (i = PARAL; --i; ) {
        ix = product(q, prodT[i]);
        q = product(q, denT[i]);
        denT[i] = ix;
      } /* end for */
      denT[0] = q;

    } /* end block */


    /** Get new points. **/
    for (i = 0; i < PARAL; ++i) { /* Always in general case. */
      poly128 nx, y1 = y1T[i];
      poly128 lam = product(xor(y1, y2T[i]), denT[i]);
      poly128 t = xor(xor(square(lam), lam), x2T[i]);
      const poly128 a = A;

      t.l ^= a.l; /* t = xor(t, a) with a = 0 or 1. */
      x1T[i] = nx = xor(t, x1T[i]);
      y1T[i] = xor(xor(product(lam, t), nx), y1);
      ++itersT[i].lo; if (!itersT[i].lo) ++itersT[i].hi;
    } /* end for (i) */

    total.lo += PARAL;
    total.hi += total.lo < PARAL;
  } /* end main loop */

  puts("Received signal.");


  /** Save state after receiving signal (except when testing). **/
  { uint i;

    /* Check all points. */
    for (i = 0; i < PARAL; ++i) {
      if (!syncAndCheck(&casesT[i][0], &uT[i], &vT[i], x1T[i], y1T[i])) {
        puts("Fatal error: *** bad point detected after signal! ***");
        return 4;
      } /* end if */
    } /* end for */

    /* Save state. */
    if (mode != Test) writeState(itersT, uT, vT, x1T, y1T);
  } /* end block */


  puts("Bye!");
  return 0;
} /* end function main */


/*-= Stuff for checking that a point is distinguished =-=-=-=-=-=-=-=-=-=-=-*/

/*-- popc ------------------------------------------------------------------*/

/* Counts bits set in a 109-bit poly over Z/2Z. */
static uint popc(poly128 n) {
  u32 n1, n2, nt;
  const u32 mask1 = 0x55555555, mask2 = 0x33333333, mask3 = 0x0F0F0F0F;

  /* Full adder. */
  n1 = n.l ^ n.m; n2 = n.l & n.m;
  n2 |= n1 & n.h; n1 ^= n.h;

  /* Partial popc. */
  n2 -= n2>>1 & mask1;                  /* Count 2-by-2. */
  n2 = (n2 & mask2)+(n2>>2 & mask2);    /* Count 4-by-4. */
  n2 += n2>>4; n2 &= mask3;             /* Count 8-by-8. */

  /* Partial popc. */
  n1 -= n1>>1 & mask1;
  n1 = (n1 & mask2)+(n1>>2 & mask2);
  n1 += n1>>4; n1 &= mask3;

  nt = n.t;

  /* Partial popc. */
  nt -= nt>>1 & mask1;
  nt = (nt & mask2)+(nt>>2 & mask2);
  nt += nt>>4; nt &= mask3;

  /* Accumulate pieces. */
  n1 += n2<<1;
  n1 += nt;
  n1 += n1>> 8;
  n1 += n1>>16;

  return n1 & 255;
} /* end function popc */


/*-- changeBasis -----------------------------------------------------------*/

/* Go from basis { t^k | 0 <= k < 109 } to basis { (t+1)^2^k | 0 <= k < 109 }
 * (both over Z/2Z).
 */
static poly128 changeBasis(poly128 x) {

#ifdef MMX

  static const mmx128 matrix[218] =
  { { { 0, 0 }, { 0, 0 } }
  , { { 0xFFFFFFFF, 0x1FFF }, { 0xFFFFFFFF, 0xFFFFFFFF } }
  , { { 0xFFFFFFFF, 0x1FFF }, { 0xFFFFFFFE, 0xFFFFFFFF } }
  , { { 0x00000000, 0x0000 }, { 0x00000001, 0x00000000 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xFFFFFFFF, 0x1FFF }, { 0xFFFFFFFD, 0xFFFFFFFF } }
  , { { 0x1A34BEEC, 0x09EA }, { 0xBDEDA6AB, 0x0A908559 } }
  , { { 0xE5CB4113, 0x1615 }, { 0x42125956, 0xF56F7AA6 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xFFFFFFFF, 0x1FFF }, { 0xFFFFFFFB, 0xFFFFFFFF } }
  , { { 0x810828A6, 0x0536 }, { 0x2FC46FD6, 0x03436150 } }
  , { { 0x7EF7D759, 0x1AC9 }, { 0xD03B902D, 0xFCBC9EAF } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x34697DD8, 0x13D4 }, { 0x7BDB4D56, 0x15210AB3 } }
  , { { 0xB0C11920, 0x1748 }, { 0x09A60AC2, 0x5DC6E0E9 } }
  , { { 0x84A864F8, 0x049C }, { 0x727D4794, 0x48E7EA5A } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xFFFFFFFF, 0x1FFF }, { 0xFFFFFFF7, 0xFFFFFFFF } }
  , { { 0xE07B050A, 0x1E31 }, { 0x30FAD343, 0x23A45550 } }
  , { { 0x1F84FAF5, 0x01CE }, { 0xCF052CB4, 0xDC5BAAAF } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x0210514C, 0x0A6D }, { 0x5F88DFAC, 0x0686C2A0 } }
  , { { 0x94F5E66A, 0x02C7 }, { 0x6F735E1B, 0xAA1EE736 } }
  , { { 0x96E5B726, 0x08AA }, { 0x30FB81B7, 0xAC982596 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x68D2FBB0, 0x07A8 }, { 0xF7B69AAD, 0x2A421566 } }
  , { { 0x9E40B570, 0x0346 }, { 0x71B8CCC4, 0xCE15C900 } }
  , { { 0xF6924EC0, 0x04EE }, { 0x860E5669, 0xE457DC66 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x61823240, 0x0E91 }, { 0x134C1585, 0xBB8DC1D2 } }
  , { { 0x6CC8E146, 0x0A5E }, { 0x0ACD96E5, 0x486897C2 } }
  , { { 0x0D4AD306, 0x04CF }, { 0x19818360, 0xF3E55610 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xFFFFFFFF, 0x1FFF }, { 0xFFFFFFEF, 0xFFFFFFFF } }
  , { { 0xDF8084BB, 0x1DE5 }, { 0xA3BF99C1, 0x9970280C } }
  , { { 0x207F7B44, 0x021A }, { 0x5C40662E, 0x668FD7F3 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xC0F60A14, 0x1C63 }, { 0x61F5A687, 0x4748AAA0 } }
  , { { 0x01B79A31, 0x1F55 }, { 0x9FBDDCC1, 0xD77E3E4D } }
  , { { 0xC1419025, 0x0336 }, { 0xFE487A46, 0x903694ED } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x0420A298, 0x14DA }, { 0xBF11BF58, 0x0D0D8540 } }
  , { { 0x396CA3D7, 0x0952 }, { 0xF9CBB7EC, 0x5297C756 } }
  , { { 0x3D4C014F, 0x1D88 }, { 0x46DA08B4, 0x5F9A4216 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x29EBCCD5, 0x058F }, { 0xDEE6BC36, 0x543DCE6C } }
  , { { 0x18AA70FE, 0x1BFA }, { 0xCD057310, 0x45B2D97A } }
  , { { 0x3141BC2B, 0x1E75 }, { 0x13E3CF26, 0x118F1716 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xD1A5F760, 0x0F50 }, { 0xEF6D355A, 0x54842ACD } }
  , { { 0xAFD26253, 0x192A }, { 0x56604C4E, 0x0FD989FC } }
  , { { 0x7E779533, 0x167A }, { 0xB90D7914, 0x5B5DA331 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x3C816AE1, 0x068D }, { 0xE3719988, 0x9C2B9200 } }
  , { { 0xEAC2D27F, 0x0095 }, { 0xB6D89E6C, 0xAD933BEF } }
  , { { 0xD643B89E, 0x0618 }, { 0x55A907E4, 0x31B8A9EF } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xC3046481, 0x1D22 }, { 0x26982B0A, 0x771B83A4 } }
  , { { 0x542E870A, 0x0A5E }, { 0x4E573666, 0x9F98AD6E } }
  , { { 0x972AE38B, 0x177C }, { 0x68CF1D6C, 0xE8832ECA } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xD991C28C, 0x14BC }, { 0x159B2DCA, 0x90D12F84 } }
  , { { 0x27449B58, 0x09C5 }, { 0x2E783164, 0x9E98DABF } }
  , { { 0xFED559D4, 0x1D79 }, { 0x3BE31CAE, 0x0E49F53B } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xFFFFFFFF, 0x1FFF }, { 0xFFFFFFDF, 0xFFFFFFFF } }
  , { { 0xE648868C, 0x1C9F }, { 0x22FDFD76, 0xB844216D } }
  , { { 0x19B77973, 0x0360 }, { 0xDD0202A9, 0x47BBDE92 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xBF010977, 0x1BCB }, { 0x477F3383, 0x32E05019 } }
  , { { 0x4DACA9C6, 0x00C4 }, { 0x2356535F, 0xF0CE03AB } }
  , { { 0xF2ADA0B1, 0x1B0F }, { 0x642960DC, 0xC22E53B2 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x81EC1428, 0x18C7 }, { 0xC3EB4D0F, 0x8E915540 } }
  , { { 0x982774E6, 0x1003 }, { 0x52DD5203, 0xEC619F31 } }
  , { { 0x19CB60CE, 0x08C4 }, { 0x91361F0C, 0x62F0CA71 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x036F3463, 0x1EAA }, { 0x3F7BB983, 0xAEFC7C9B } }
  , { { 0xDA6A67E3, 0x0BE2 }, { 0x32ADBEF9, 0x9960FC56 } }
  , { { 0xD9055380, 0x1548 }, { 0x0DD6077A, 0x379C80CD } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x08414530, 0x09B4 }, { 0x7E237EB1, 0x1A1B0A81 } }
  , { { 0x11FD1F88, 0x1CF9 }, { 0xD921E2F2, 0x9BF5DF89 } }
  , { { 0x19BC5AB8, 0x154D }, { 0xA7029C43, 0x81EED508 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x72D947AE, 0x12A4 }, { 0xF3976FD8, 0xA52F8EAD } }
  , { { 0x8176618A, 0x1CE4 }, { 0x362CFC89, 0x5063CF7B } }
  , { { 0xF3AF2624, 0x0E40 }, { 0xC5BB9351, 0xF54C41D6 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x53D799AA, 0x0B1E }, { 0xBDCD786C, 0xA87B9CD9 } }
  , { { 0xA61FFC6D, 0x0048 }, { 0xB0EC51FA, 0x7BBE85CF } }
  , { { 0xF5C865C7, 0x0B56 }, { 0x0D212996, 0xD3C51916 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x3154E1FC, 0x17F4 }, { 0x9A0AE621, 0x8B65B2F5 } }
  , { { 0xB34684B4, 0x19E3 }, { 0x14E90A5D, 0xAE959235 } }
  , { { 0x82126548, 0x0E17 }, { 0x8EE3EC7C, 0x25F020C0 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xA34BEEC0, 0x1EA1 }, { 0xDEDA6AB4, 0xA908559B } }
  , { { 0xFFAC3CF8, 0x128C }, { 0x4DB5CD19, 0x8C2931BE } }
  , { { 0x5CE7D238, 0x0C2D }, { 0x936FA7AD, 0x25216425 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x5FA4C4A6, 0x1255 }, { 0xACC0989D, 0x1FB313F8 } }
  , { { 0xF28924AA, 0x0EEF }, { 0x1C93544F, 0x2F887351 } }
  , { { 0xAD2DE00C, 0x1CBA }, { 0xB053CCD2, 0x303B60A9 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x7902D5C3, 0x0D1A }, { 0xC6E33310, 0x38572401 } }
  , { { 0x19AD60B0, 0x0C5C }, { 0x09CBC60F, 0x5E11225D } }
  , { { 0x60AFB573, 0x0146 }, { 0xCF28F51F, 0x6646065C } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xD585A4FF, 0x012B }, { 0x6DB13CD8, 0x5B2677DF } }
  , { { 0x8C1277D0, 0x01C3 }, { 0xF132BC82, 0x060B23FC } }
  , { { 0x5997D32F, 0x00E8 }, { 0x9C83805A, 0x5D2D5423 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x8608C902, 0x1A45 }, { 0x4D305615, 0xEE370748 } }
  , { { 0xB2113F0F, 0x179D }, { 0x277A9EE3, 0x7814A56C } }
  , { { 0x3419F60D, 0x0DD8 }, { 0x6A4AC8F6, 0x9623A224 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xA85D0E15, 0x14BC }, { 0x9CAE6CCC, 0x3F315ADC } }
  , { { 0x5B345C1C, 0x157C }, { 0x30FA7A39, 0x643A6D33 } }
  , { { 0xF3695209, 0x01C0 }, { 0xAC5416F5, 0x5B0B37EF } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xB3238519, 0x0979 }, { 0x2B365B95, 0x21A25F08 } }
  , { { 0x5D70D551, 0x0103 }, { 0x5B6FF9C9, 0xB4E6A8BE } }
  , { { 0xEE535048, 0x087A }, { 0x7059A25C, 0x9544F7B6 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x4E8936B1, 0x138A }, { 0x5CF062C8, 0x3D31B57E } }
  , { { 0x11203F3F, 0x1C2F }, { 0xDF433D07, 0xCAB69770 } }
  , { { 0x5FA9098E, 0x0FA5 }, { 0x83B35FCF, 0xF787220E } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xFFFFFFFF, 0x1FFF }, { 0xFFFFFFBF, 0xFFFFFFFF } }
  , { { 0xE8DE6EB8, 0x01CD }, { 0x7FD9AA80, 0x69E4FFE2 } }
  , { { 0x17219147, 0x1E32 }, { 0x8026553F, 0x961B001D } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xCC910D19, 0x193F }, { 0x45FBFAED, 0x708842DA } }
  , { { 0xE34869DD, 0x127C }, { 0x225B3C14, 0x0640B805 } }
  , { { 0x2FD964C4, 0x0B43 }, { 0x67A0C6F9, 0x76C8FADF } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x7E0212EE, 0x1797 }, { 0x8EFE6707, 0x65C0A032 } }
  , { { 0xD4CA755E, 0x14C6 }, { 0x2567C9A5, 0x9F969267 } }
  , { { 0xAAC867B0, 0x0351 }, { 0xAB99AEA2, 0xFA563255 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x9B59538D, 0x0188 }, { 0x46ACA6BE, 0xE19C0756 } }
  , { { 0x331E30C9, 0x0A9A }, { 0x5AA1F939, 0xEFA2FE39 } }
  , { { 0xA8476344, 0x0B12 }, { 0x1C0D5F87, 0x0E3EF96F } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x03D82851, 0x118F }, { 0x87D69A1F, 0x1D22AA81 } }
  , { { 0xB83B634D, 0x095F }, { 0xE280D9AC, 0x284C5684 } }
  , { { 0xBBE34B1C, 0x18D0 }, { 0x655643B3, 0x356EFC05 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x304EE9CD, 0x0007 }, { 0xA5BAA407, 0xD8C33E62 } }
  , { { 0x5EFB7E85, 0x0076 }, { 0xD82D749F, 0x388546D3 } }
  , { { 0x6EB59748, 0x0071 }, { 0x7D97D098, 0xE04678B1 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x06DE68C7, 0x1D54 }, { 0x7EF77307, 0x5DF8F936 } }
  , { { 0x78C41EED, 0x07BA }, { 0x29DF40AF, 0x02B46968 } }
  , { { 0x7E1A762A, 0x1AEE }, { 0x572833A8, 0x5F4C905E } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xB4D4CFC7, 0x17C5 }, { 0x655B7DF2, 0x32C1F8AC } }
  , { { 0x7D6E6970, 0x1D45 }, { 0xB0A43683, 0xC19185E5 } }
  , { { 0xC9BAA6B7, 0x0A80 }, { 0xD5FF4B71, 0xF3507D49 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x10828A60, 0x1368 }, { 0xFC46FD62, 0x34361502 } }
  , { { 0x0759C297, 0x1F9F }, { 0x64DC124E, 0x1F06E180 } }
  , { { 0x17DB48F7, 0x0CF7 }, { 0x989AEF2C, 0x2B30F482 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x23FA3F11, 0x19F2 }, { 0xB243C5E5, 0x37EBBF13 } }
  , { { 0x46F4B00F, 0x17B1 }, { 0xE76AB97D, 0xA9CBE82C } }
  , { { 0x650E8F1E, 0x0E43 }, { 0x55297C98, 0x9E20573F } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xE5B28F5D, 0x0548 }, { 0xE72EDFB1, 0x4A5F1D5B } }
  , { { 0x4CC047A9, 0x0797 }, { 0x4F38005C, 0x517EC6F3 } }
  , { { 0xA972C8F4, 0x02DF }, { 0xA816DFED, 0x1B21DBA8 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x02ECC314, 0x19C9 }, { 0x6C59F913, 0xA0C79EF6 } }
  , { { 0xFBA4F1D8, 0x0544 }, { 0xDEE1E43F, 0xE3F79DAF } }
  , { { 0xF94832CC, 0x1C8D }, { 0xB2B81D2C, 0x43300359 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xA7AF3355, 0x163C }, { 0x7B9AF0D8, 0x50F739B3 } }
  , { { 0xE42CECEE, 0x0408 }, { 0x2E18F5E8, 0xD1B4C9D2 } }
  , { { 0x4383DFBB, 0x1234 }, { 0x55820530, 0x8143F061 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x4C3FF8DA, 0x0091 }, { 0x61D8A3F4, 0xF77D0B9F } }
  , { { 0x7ABA658A, 0x186E }, { 0x16AA6668, 0x7FF0C893 } }
  , { { 0x36859D50, 0x18FF }, { 0x7772C59C, 0x888DC30C } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x62A9C3F9, 0x0FE8 }, { 0x3415CC43, 0x16CB65EB } }
  , { { 0xD9C93DA7, 0x1A79 }, { 0xC83CF966, 0x2891C56B } }
  , { { 0xBB60FE5E, 0x1591 }, { 0xFC293525, 0x3E5AA080 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x668D0969, 0x13C7 }, { 0x29D214BB, 0x5D2B246A } }
  , { { 0x97345AA9, 0x08A6 }, { 0x2816E691, 0xBB2DDE90 } }
  , { { 0xF1B953C0, 0x1B61 }, { 0x01C4F22A, 0xE606FAFA } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x4697DD81, 0x1D43 }, { 0xBDB4D569, 0x5210AB37 } }
  , { { 0xE982BE06, 0x146E }, { 0x6A2904E5, 0x27B2F0E0 } }
  , { { 0xAF156387, 0x092D }, { 0xD79DD18C, 0x75A25BD7 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xFF5879F1, 0x0519 }, { 0x9B6B9A33, 0x1852637C } }
  , { { 0x16F88427, 0x1CF7 }, { 0x6100D523, 0xA0B5B4D1 } }
  , { { 0xE9A0FDD6, 0x19EE }, { 0xFA6B4F10, 0xB8E7D7AD } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xBF49894C, 0x04AA }, { 0x5981313B, 0x3F6627F1 } }
  , { { 0xE6B11399, 0x0E63 }, { 0xF86F51E9, 0xC8C3EC99 } }
  , { { 0x59F89AD5, 0x0AC9 }, { 0xA1EE60D2, 0xF7A5CB68 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xE5124954, 0x1DDF }, { 0x3926A89E, 0x5F10E6A2 } }
  , { { 0x19BC4158, 0x1A9C }, { 0xC1EAE3F5, 0x11BBB1FB } }
  , { { 0xFCAE080C, 0x0743 }, { 0xF8CC4B6B, 0x4EAB5759 } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xF205AB86, 0x1A34 }, { 0x8DC66620, 0x70AE4803 } }
  , { { 0x811DFA1A, 0x0553 }, { 0xF16331C3, 0xC53F7E1C } }
  , { { 0x7318519C, 0x1F67 }, { 0x7CA557E3, 0xB591361F } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0x335AC160, 0x18B8 }, { 0x13978C1E, 0xBC2244BA } }
  , { { 0xB2233CDC, 0x0E4F }, { 0x9BFF7FF6, 0x78EC2FD5 } }
  , { { 0x8179FDBC, 0x16F7 }, { 0x8868F3E8, 0xC4CE6B6F } }
  , { { 0, 0 }, { 0, 0 } }
  , { { 0xAB0B49FE, 0x0257 }, { 0xDB6279B0, 0xB64CEFBE } }
  }
  ;

  uint i;
  u32 s, t;
  MMX_STATIC mmx64 yhi, ylo;
  const mmx128 *p;

  p = &matrix[0];

  START_MMX();

#define mm_XHI 0
#define mm_XLO 1
#define mm_YHI 2
#define mm_YLO 3

  XOR(mm_YHI, mm_YHI);
  XOR(mm_YLO, mm_YLO);

#define FOR_BLOCK \
  { s = t & 3; XOR_MEM(mm_YHI, p[s].hi); XOR_MEM(mm_YLO, p[s].lo); \
    t >>= 2; p += 4; \
    s = t & 3; XOR_MEM(mm_YHI, p[s].hi); XOR_MEM(mm_YLO, p[s].lo); \
    t >>= 2; p += 4; \
  }

  t = x.l; for (i = 8; i; --i) FOR_BLOCK
  t = x.m; for (i = 8; i; --i) FOR_BLOCK
  t = x.h; for (i = 8; i; --i) FOR_BLOCK
  t = x.t; for (i = 3; i; --i) FOR_BLOCK

  /* Last bit. */
  s = t & 1; XOR_MEM(mm_YHI, p[s].hi); XOR_MEM(mm_YLO, p[s].lo);

#undef FOR_BLOCK

  STR(yhi, mm_YHI);
  STR(ylo, mm_YLO);

#undef mm_XHI
#undef mm_XLO
#undef mm_YHI
#undef mm_YLO

  END_MMX();

  { poly128 res;
    res.t = yhi.hi; res.h = yhi.lo; res.m = ylo.hi; res.l = ylo.lo;
    return res;
  } /* end block */

#else
/* !defined(MMX) */

  static const poly128 matrix[218] =
  { { 0, 0, 0, 0 }
  , { 0x1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
  , { 0x1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE }
  , { 0x0000, 0x00000000, 0x00000000, 0x00000001 }
  , { 0, 0, 0, 0 }
  , { 0x1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFD }
  , { 0x09EA, 0x1A34BEEC, 0x0A908559, 0xBDEDA6AB }
  , { 0x1615, 0xE5CB4113, 0xF56F7AA6, 0x42125956 }
  , { 0, 0, 0, 0 }
  , { 0x1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFB }
  , { 0x0536, 0x810828A6, 0x03436150, 0x2FC46FD6 }
  , { 0x1AC9, 0x7EF7D759, 0xFCBC9EAF, 0xD03B902D }
  , { 0, 0, 0, 0 }
  , { 0x13D4, 0x34697DD8, 0x15210AB3, 0x7BDB4D56 }
  , { 0x1748, 0xB0C11920, 0x5DC6E0E9, 0x09A60AC2 }
  , { 0x049C, 0x84A864F8, 0x48E7EA5A, 0x727D4794 }
  , { 0, 0, 0, 0 }
  , { 0x1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF7 }
  , { 0x1E31, 0xE07B050A, 0x23A45550, 0x30FAD343 }
  , { 0x01CE, 0x1F84FAF5, 0xDC5BAAAF, 0xCF052CB4 }
  , { 0, 0, 0, 0 }
  , { 0x0A6D, 0x0210514C, 0x0686C2A0, 0x5F88DFAC }
  , { 0x02C7, 0x94F5E66A, 0xAA1EE736, 0x6F735E1B }
  , { 0x08AA, 0x96E5B726, 0xAC982596, 0x30FB81B7 }
  , { 0, 0, 0, 0 }
  , { 0x07A8, 0x68D2FBB0, 0x2A421566, 0xF7B69AAD }
  , { 0x0346, 0x9E40B570, 0xCE15C900, 0x71B8CCC4 }
  , { 0x04EE, 0xF6924EC0, 0xE457DC66, 0x860E5669 }
  , { 0, 0, 0, 0 }
  , { 0x0E91, 0x61823240, 0xBB8DC1D2, 0x134C1585 }
  , { 0x0A5E, 0x6CC8E146, 0x486897C2, 0x0ACD96E5 }
  , { 0x04CF, 0x0D4AD306, 0xF3E55610, 0x19818360 }
  , { 0, 0, 0, 0 }
  , { 0x1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFEF }
  , { 0x1DE5, 0xDF8084BB, 0x9970280C, 0xA3BF99C1 }
  , { 0x021A, 0x207F7B44, 0x668FD7F3, 0x5C40662E }
  , { 0, 0, 0, 0 }
  , { 0x1C63, 0xC0F60A14, 0x4748AAA0, 0x61F5A687 }
  , { 0x1F55, 0x01B79A31, 0xD77E3E4D, 0x9FBDDCC1 }
  , { 0x0336, 0xC1419025, 0x903694ED, 0xFE487A46 }
  , { 0, 0, 0, 0 }
  , { 0x14DA, 0x0420A298, 0x0D0D8540, 0xBF11BF58 }
  , { 0x0952, 0x396CA3D7, 0x5297C756, 0xF9CBB7EC }
  , { 0x1D88, 0x3D4C014F, 0x5F9A4216, 0x46DA08B4 }
  , { 0, 0, 0, 0 }
  , { 0x058F, 0x29EBCCD5, 0x543DCE6C, 0xDEE6BC36 }
  , { 0x1BFA, 0x18AA70FE, 0x45B2D97A, 0xCD057310 }
  , { 0x1E75, 0x3141BC2B, 0x118F1716, 0x13E3CF26 }
  , { 0, 0, 0, 0 }
  , { 0x0F50, 0xD1A5F760, 0x54842ACD, 0xEF6D355A }
  , { 0x192A, 0xAFD26253, 0x0FD989FC, 0x56604C4E }
  , { 0x167A, 0x7E779533, 0x5B5DA331, 0xB90D7914 }
  , { 0, 0, 0, 0 }
  , { 0x068D, 0x3C816AE1, 0x9C2B9200, 0xE3719988 }
  , { 0x0095, 0xEAC2D27F, 0xAD933BEF, 0xB6D89E6C }
  , { 0x0618, 0xD643B89E, 0x31B8A9EF, 0x55A907E4 }
  , { 0, 0, 0, 0 }
  , { 0x1D22, 0xC3046481, 0x771B83A4, 0x26982B0A }
  , { 0x0A5E, 0x542E870A, 0x9F98AD6E, 0x4E573666 }
  , { 0x177C, 0x972AE38B, 0xE8832ECA, 0x68CF1D6C }
  , { 0, 0, 0, 0 }
  , { 0x14BC, 0xD991C28C, 0x90D12F84, 0x159B2DCA }
  , { 0x09C5, 0x27449B58, 0x9E98DABF, 0x2E783164 }
  , { 0x1D79, 0xFED559D4, 0x0E49F53B, 0x3BE31CAE }
  , { 0, 0, 0, 0 }
  , { 0x1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFDF }
  , { 0x1C9F, 0xE648868C, 0xB844216D, 0x22FDFD76 }
  , { 0x0360, 0x19B77973, 0x47BBDE92, 0xDD0202A9 }
  , { 0, 0, 0, 0 }
  , { 0x1BCB, 0xBF010977, 0x32E05019, 0x477F3383 }
  , { 0x00C4, 0x4DACA9C6, 0xF0CE03AB, 0x2356535F }
  , { 0x1B0F, 0xF2ADA0B1, 0xC22E53B2, 0x642960DC }
  , { 0, 0, 0, 0 }
  , { 0x18C7, 0x81EC1428, 0x8E915540, 0xC3EB4D0F }
  , { 0x1003, 0x982774E6, 0xEC619F31, 0x52DD5203 }
  , { 0x08C4, 0x19CB60CE, 0x62F0CA71, 0x91361F0C }
  , { 0, 0, 0, 0 }
  , { 0x1EAA, 0x036F3463, 0xAEFC7C9B, 0x3F7BB983 }
  , { 0x0BE2, 0xDA6A67E3, 0x9960FC56, 0x32ADBEF9 }
  , { 0x1548, 0xD9055380, 0x379C80CD, 0x0DD6077A }
  , { 0, 0, 0, 0 }
  , { 0x09B4, 0x08414530, 0x1A1B0A81, 0x7E237EB1 }
  , { 0x1CF9, 0x11FD1F88, 0x9BF5DF89, 0xD921E2F2 }
  , { 0x154D, 0x19BC5AB8, 0x81EED508, 0xA7029C43 }
  , { 0, 0, 0, 0 }
  , { 0x12A4, 0x72D947AE, 0xA52F8EAD, 0xF3976FD8 }
  , { 0x1CE4, 0x8176618A, 0x5063CF7B, 0x362CFC89 }
  , { 0x0E40, 0xF3AF2624, 0xF54C41D6, 0xC5BB9351 }
  , { 0, 0, 0, 0 }
  , { 0x0B1E, 0x53D799AA, 0xA87B9CD9, 0xBDCD786C }
  , { 0x0048, 0xA61FFC6D, 0x7BBE85CF, 0xB0EC51FA }
  , { 0x0B56, 0xF5C865C7, 0xD3C51916, 0x0D212996 }
  , { 0, 0, 0, 0 }
  , { 0x17F4, 0x3154E1FC, 0x8B65B2F5, 0x9A0AE621 }
  , { 0x19E3, 0xB34684B4, 0xAE959235, 0x14E90A5D }
  , { 0x0E17, 0x82126548, 0x25F020C0, 0x8EE3EC7C }
  , { 0, 0, 0, 0 }
  , { 0x1EA1, 0xA34BEEC0, 0xA908559B, 0xDEDA6AB4 }
  , { 0x128C, 0xFFAC3CF8, 0x8C2931BE, 0x4DB5CD19 }
  , { 0x0C2D, 0x5CE7D238, 0x25216425, 0x936FA7AD }
  , { 0, 0, 0, 0 }
  , { 0x1255, 0x5FA4C4A6, 0x1FB313F8, 0xACC0989D }
  , { 0x0EEF, 0xF28924AA, 0x2F887351, 0x1C93544F }
  , { 0x1CBA, 0xAD2DE00C, 0x303B60A9, 0xB053CCD2 }
  , { 0, 0, 0, 0 }
  , { 0x0D1A, 0x7902D5C3, 0x38572401, 0xC6E33310 }
  , { 0x0C5C, 0x19AD60B0, 0x5E11225D, 0x09CBC60F }
  , { 0x0146, 0x60AFB573, 0x6646065C, 0xCF28F51F }
  , { 0, 0, 0, 0 }
  , { 0x012B, 0xD585A4FF, 0x5B2677DF, 0x6DB13CD8 }
  , { 0x01C3, 0x8C1277D0, 0x060B23FC, 0xF132BC82 }
  , { 0x00E8, 0x5997D32F, 0x5D2D5423, 0x9C83805A }
  , { 0, 0, 0, 0 }
  , { 0x1A45, 0x8608C902, 0xEE370748, 0x4D305615 }
  , { 0x179D, 0xB2113F0F, 0x7814A56C, 0x277A9EE3 }
  , { 0x0DD8, 0x3419F60D, 0x9623A224, 0x6A4AC8F6 }
  , { 0, 0, 0, 0 }
  , { 0x14BC, 0xA85D0E15, 0x3F315ADC, 0x9CAE6CCC }
  , { 0x157C, 0x5B345C1C, 0x643A6D33, 0x30FA7A39 }
  , { 0x01C0, 0xF3695209, 0x5B0B37EF, 0xAC5416F5 }
  , { 0, 0, 0, 0 }
  , { 0x0979, 0xB3238519, 0x21A25F08, 0x2B365B95 }
  , { 0x0103, 0x5D70D551, 0xB4E6A8BE, 0x5B6FF9C9 }
  , { 0x087A, 0xEE535048, 0x9544F7B6, 0x7059A25C }
  , { 0, 0, 0, 0 }
  , { 0x138A, 0x4E8936B1, 0x3D31B57E, 0x5CF062C8 }
  , { 0x1C2F, 0x11203F3F, 0xCAB69770, 0xDF433D07 }
  , { 0x0FA5, 0x5FA9098E, 0xF787220E, 0x83B35FCF }
  , { 0, 0, 0, 0 }
  , { 0x1FFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFBF }
  , { 0x01CD, 0xE8DE6EB8, 0x69E4FFE2, 0x7FD9AA80 }
  , { 0x1E32, 0x17219147, 0x961B001D, 0x8026553F }
  , { 0, 0, 0, 0 }
  , { 0x193F, 0xCC910D19, 0x708842DA, 0x45FBFAED }
  , { 0x127C, 0xE34869DD, 0x0640B805, 0x225B3C14 }
  , { 0x0B43, 0x2FD964C4, 0x76C8FADF, 0x67A0C6F9 }
  , { 0, 0, 0, 0 }
  , { 0x1797, 0x7E0212EE, 0x65C0A032, 0x8EFE6707 }
  , { 0x14C6, 0xD4CA755E, 0x9F969267, 0x2567C9A5 }
  , { 0x0351, 0xAAC867B0, 0xFA563255, 0xAB99AEA2 }
  , { 0, 0, 0, 0 }
  , { 0x0188, 0x9B59538D, 0xE19C0756, 0x46ACA6BE }
  , { 0x0A9A, 0x331E30C9, 0xEFA2FE39, 0x5AA1F939 }
  , { 0x0B12, 0xA8476344, 0x0E3EF96F, 0x1C0D5F87 }
  , { 0, 0, 0, 0 }
  , { 0x118F, 0x03D82851, 0x1D22AA81, 0x87D69A1F }
  , { 0x095F, 0xB83B634D, 0x284C5684, 0xE280D9AC }
  , { 0x18D0, 0xBBE34B1C, 0x356EFC05, 0x655643B3 }
  , { 0, 0, 0, 0 }
  , { 0x0007, 0x304EE9CD, 0xD8C33E62, 0xA5BAA407 }
  , { 0x0076, 0x5EFB7E85, 0x388546D3, 0xD82D749F }
  , { 0x0071, 0x6EB59748, 0xE04678B1, 0x7D97D098 }
  , { 0, 0, 0, 0 }
  , { 0x1D54, 0x06DE68C7, 0x5DF8F936, 0x7EF77307 }
  , { 0x07BA, 0x78C41EED, 0x02B46968, 0x29DF40AF }
  , { 0x1AEE, 0x7E1A762A, 0x5F4C905E, 0x572833A8 }
  , { 0, 0, 0, 0 }
  , { 0x17C5, 0xB4D4CFC7, 0x32C1F8AC, 0x655B7DF2 }
  , { 0x1D45, 0x7D6E6970, 0xC19185E5, 0xB0A43683 }
  , { 0x0A80, 0xC9BAA6B7, 0xF3507D49, 0xD5FF4B71 }
  , { 0, 0, 0, 0 }
  , { 0x1368, 0x10828A60, 0x34361502, 0xFC46FD62 }
  , { 0x1F9F, 0x0759C297, 0x1F06E180, 0x64DC124E }
  , { 0x0CF7, 0x17DB48F7, 0x2B30F482, 0x989AEF2C }
  , { 0, 0, 0, 0 }
  , { 0x19F2, 0x23FA3F11, 0x37EBBF13, 0xB243C5E5 }
  , { 0x17B1, 0x46F4B00F, 0xA9CBE82C, 0xE76AB97D }
  , { 0x0E43, 0x650E8F1E, 0x9E20573F, 0x55297C98 }
  , { 0, 0, 0, 0 }
  , { 0x0548, 0xE5B28F5D, 0x4A5F1D5B, 0xE72EDFB1 }
  , { 0x0797, 0x4CC047A9, 0x517EC6F3, 0x4F38005C }
  , { 0x02DF, 0xA972C8F4, 0x1B21DBA8, 0xA816DFED }
  , { 0, 0, 0, 0 }
  , { 0x19C9, 0x02ECC314, 0xA0C79EF6, 0x6C59F913 }
  , { 0x0544, 0xFBA4F1D8, 0xE3F79DAF, 0xDEE1E43F }
  , { 0x1C8D, 0xF94832CC, 0x43300359, 0xB2B81D2C }
  , { 0, 0, 0, 0 }
  , { 0x163C, 0xA7AF3355, 0x50F739B3, 0x7B9AF0D8 }
  , { 0x0408, 0xE42CECEE, 0xD1B4C9D2, 0x2E18F5E8 }
  , { 0x1234, 0x4383DFBB, 0x8143F061, 0x55820530 }
  , { 0, 0, 0, 0 }
  , { 0x0091, 0x4C3FF8DA, 0xF77D0B9F, 0x61D8A3F4 }
  , { 0x186E, 0x7ABA658A, 0x7FF0C893, 0x16AA6668 }
  , { 0x18FF, 0x36859D50, 0x888DC30C, 0x7772C59C }
  , { 0, 0, 0, 0 }
  , { 0x0FE8, 0x62A9C3F9, 0x16CB65EB, 0x3415CC43 }
  , { 0x1A79, 0xD9C93DA7, 0x2891C56B, 0xC83CF966 }
  , { 0x1591, 0xBB60FE5E, 0x3E5AA080, 0xFC293525 }
  , { 0, 0, 0, 0 }
  , { 0x13C7, 0x668D0969, 0x5D2B246A, 0x29D214BB }
  , { 0x08A6, 0x97345AA9, 0xBB2DDE90, 0x2816E691 }
  , { 0x1B61, 0xF1B953C0, 0xE606FAFA, 0x01C4F22A }
  , { 0, 0, 0, 0 }
  , { 0x1D43, 0x4697DD81, 0x5210AB37, 0xBDB4D569 }
  , { 0x146E, 0xE982BE06, 0x27B2F0E0, 0x6A2904E5 }
  , { 0x092D, 0xAF156387, 0x75A25BD7, 0xD79DD18C }
  , { 0, 0, 0, 0 }
  , { 0x0519, 0xFF5879F1, 0x1852637C, 0x9B6B9A33 }
  , { 0x1CF7, 0x16F88427, 0xA0B5B4D1, 0x6100D523 }
  , { 0x19EE, 0xE9A0FDD6, 0xB8E7D7AD, 0xFA6B4F10 }
  , { 0, 0, 0, 0 }
  , { 0x04AA, 0xBF49894C, 0x3F6627F1, 0x5981313B }
  , { 0x0E63, 0xE6B11399, 0xC8C3EC99, 0xF86F51E9 }
  , { 0x0AC9, 0x59F89AD5, 0xF7A5CB68, 0xA1EE60D2 }
  , { 0, 0, 0, 0 }
  , { 0x1DDF, 0xE5124954, 0x5F10E6A2, 0x3926A89E }
  , { 0x1A9C, 0x19BC4158, 0x11BBB1FB, 0xC1EAE3F5 }
  , { 0x0743, 0xFCAE080C, 0x4EAB5759, 0xF8CC4B6B }
  , { 0, 0, 0, 0 }
  , { 0x1A34, 0xF205AB86, 0x70AE4803, 0x8DC66620 }
  , { 0x0553, 0x811DFA1A, 0xC53F7E1C, 0xF16331C3 }
  , { 0x1F67, 0x7318519C, 0xB591361F, 0x7CA557E3 }
  , { 0, 0, 0, 0 }
  , { 0x18B8, 0x335AC160, 0xBC2244BA, 0x13978C1E }
  , { 0x0E4F, 0xB2233CDC, 0x78EC2FD5, 0x9BFF7FF6 }
  , { 0x16F7, 0x8179FDBC, 0xC4CE6B6F, 0x8868F3E8 }
  , { 0, 0, 0, 0 }
  , { 0x0257, 0xAB0B49FE, 0xB64CEFBE, 0xDB6279B0 }
  }
  ;

  uint i;
  u32 t, yt,yh,ym,yl;
  const poly128 *p, *q;

  yt = yh = ym = yl = 0;
  p = &matrix[0];

#define FOR_BLOCK \
  { q = &p[t & 3]; yt ^= q->t; yh ^= q->h; ym ^= q->m; yl ^= q->l; \
    t >>= 2; p += 4; \
    q = &p[t & 3]; yt ^= q->t; yh ^= q->h; ym ^= q->m; yl ^= q->l; \
    t >>= 2; p += 4; \
  }

  t = x.l; for (i = 8; i; --i) FOR_BLOCK
  t = x.m; for (i = 8; i; --i) FOR_BLOCK
  t = x.h; for (i = 8; i; --i) FOR_BLOCK
  t = x.t; for (i = 3; i; --i) FOR_BLOCK

  /* Last bit. */
  q = &p[t & 1]; yt ^= q->t; yh ^= q->h; ym ^= q->m; yl ^= q->l;

#undef FOR_BLOCK

  { poly128 res;
    res.t = yt; res.h = yh; res.m = ym; res.l = yl;
    return res;
  } /* end block */

#endif
/* defined(MMX) ... #else ... */

} /* end function changeBasis */


/*-= Arithmetic in the field GF(2^109) -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/*-- squareNTimes ----------------------------------------------------------*/

/* Square x, n times, in the field GF(2^109) i.e., as a poly in (Z/2Z)[t]
 * reduced modulo t^109+t^9+t^2+t+1.  Degree x < 109.
 */
static poly128 squareNTimes(poly128 x, uint n) {

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#ifdef MMX

  static const mmx64 tab2[256] =
  { {     0, 0 }, {     1, 0 }, {     4, 0 }, {     5, 0 }, {    16, 0 }
  , {    17, 0 }, {    20, 0 }, {    21, 0 }, {    64, 0 }, {    65, 0 }
  , {    68, 0 }, {    69, 0 }, {    80, 0 }, {    81, 0 }, {    84, 0 }
  , {    85, 0 }, {   256, 0 }, {   257, 0 }, {   260, 0 }, {   261, 0 }
  , {   272, 0 }, {   273, 0 }, {   276, 0 }, {   277, 0 }, {   320, 0 }
  , {   321, 0 }, {   324, 0 }, {   325, 0 }, {   336, 0 }, {   337, 0 }
  , {   340, 0 }, {   341, 0 }, {  1024, 0 }, {  1025, 0 }, {  1028, 0 }
  , {  1029, 0 }, {  1040, 0 }, {  1041, 0 }, {  1044, 0 }, {  1045, 0 }
  , {  1088, 0 }, {  1089, 0 }, {  1092, 0 }, {  1093, 0 }, {  1104, 0 }
  , {  1105, 0 }, {  1108, 0 }, {  1109, 0 }, {  1280, 0 }, {  1281, 0 }
  , {  1284, 0 }, {  1285, 0 }, {  1296, 0 }, {  1297, 0 }, {  1300, 0 }
  , {  1301, 0 }, {  1344, 0 }, {  1345, 0 }, {  1348, 0 }, {  1349, 0 }
  , {  1360, 0 }, {  1361, 0 }, {  1364, 0 }, {  1365, 0 }, {  4096, 0 }
  , {  4097, 0 }, {  4100, 0 }, {  4101, 0 }, {  4112, 0 }, {  4113, 0 }
  , {  4116, 0 }, {  4117, 0 }, {  4160, 0 }, {  4161, 0 }, {  4164, 0 }
  , {  4165, 0 }, {  4176, 0 }, {  4177, 0 }, {  4180, 0 }, {  4181, 0 }
  , {  4352, 0 }, {  4353, 0 }, {  4356, 0 }, {  4357, 0 }, {  4368, 0 }
  , {  4369, 0 }, {  4372, 0 }, {  4373, 0 }, {  4416, 0 }, {  4417, 0 }
  , {  4420, 0 }, {  4421, 0 }, {  4432, 0 }, {  4433, 0 }, {  4436, 0 }
  , {  4437, 0 }, {  5120, 0 }, {  5121, 0 }, {  5124, 0 }, {  5125, 0 }
  , {  5136, 0 }, {  5137, 0 }, {  5140, 0 }, {  5141, 0 }, {  5184, 0 }
  , {  5185, 0 }, {  5188, 0 }, {  5189, 0 }, {  5200, 0 }, {  5201, 0 }
  , {  5204, 0 }, {  5205, 0 }, {  5376, 0 }, {  5377, 0 }, {  5380, 0 }
  , {  5381, 0 }, {  5392, 0 }, {  5393, 0 }, {  5396, 0 }, {  5397, 0 }
  , {  5440, 0 }, {  5441, 0 }, {  5444, 0 }, {  5445, 0 }, {  5456, 0 }
  , {  5457, 0 }, {  5460, 0 }, {  5461, 0 }, { 16384, 0 }, { 16385, 0 }
  , { 16388, 0 }, { 16389, 0 }, { 16400, 0 }, { 16401, 0 }, { 16404, 0 }
  , { 16405, 0 }, { 16448, 0 }, { 16449, 0 }, { 16452, 0 }, { 16453, 0 }
  , { 16464, 0 }, { 16465, 0 }, { 16468, 0 }, { 16469, 0 }, { 16640, 0 }
  , { 16641, 0 }, { 16644, 0 }, { 16645, 0 }, { 16656, 0 }, { 16657, 0 }
  , { 16660, 0 }, { 16661, 0 }, { 16704, 0 }, { 16705, 0 }, { 16708, 0 }
  , { 16709, 0 }, { 16720, 0 }, { 16721, 0 }, { 16724, 0 }, { 16725, 0 }
  , { 17408, 0 }, { 17409, 0 }, { 17412, 0 }, { 17413, 0 }, { 17424, 0 }
  , { 17425, 0 }, { 17428, 0 }, { 17429, 0 }, { 17472, 0 }, { 17473, 0 }
  , { 17476, 0 }, { 17477, 0 }, { 17488, 0 }, { 17489, 0 }, { 17492, 0 }
  , { 17493, 0 }, { 17664, 0 }, { 17665, 0 }, { 17668, 0 }, { 17669, 0 }
  , { 17680, 0 }, { 17681, 0 }, { 17684, 0 }, { 17685, 0 }, { 17728, 0 }
  , { 17729, 0 }, { 17732, 0 }, { 17733, 0 }, { 17744, 0 }, { 17745, 0 }
  , { 17748, 0 }, { 17749, 0 }, { 20480, 0 }, { 20481, 0 }, { 20484, 0 }
  , { 20485, 0 }, { 20496, 0 }, { 20497, 0 }, { 20500, 0 }, { 20501, 0 }
  , { 20544, 0 }, { 20545, 0 }, { 20548, 0 }, { 20549, 0 }, { 20560, 0 }
  , { 20561, 0 }, { 20564, 0 }, { 20565, 0 }, { 20736, 0 }, { 20737, 0 }
  , { 20740, 0 }, { 20741, 0 }, { 20752, 0 }, { 20753, 0 }, { 20756, 0 }
  , { 20757, 0 }, { 20800, 0 }, { 20801, 0 }, { 20804, 0 }, { 20805, 0 }
  , { 20816, 0 }, { 20817, 0 }, { 20820, 0 }, { 20821, 0 }, { 21504, 0 }
  , { 21505, 0 }, { 21508, 0 }, { 21509, 0 }, { 21520, 0 }, { 21521, 0 }
  , { 21524, 0 }, { 21525, 0 }, { 21568, 0 }, { 21569, 0 }, { 21572, 0 }
  , { 21573, 0 }, { 21584, 0 }, { 21585, 0 }, { 21588, 0 }, { 21589, 0 }
  , { 21760, 0 }, { 21761, 0 }, { 21764, 0 }, { 21765, 0 }, { 21776, 0 }
  , { 21777, 0 }, { 21780, 0 }, { 21781, 0 }, { 21824, 0 }, { 21825, 0 }
  , { 21828, 0 }, { 21829, 0 }, { 21840, 0 }, { 21841, 0 }, { 21844, 0 }
  , { 21845, 0 }
  };
  static const mmx64 tab4[256] =
  { { 0x00000000, 0 }, { 0x00000001, 0 }, { 0x00000010, 0 }, { 0x00000011, 0 }
  , { 0x00000100, 0 }, { 0x00000101, 0 }, { 0x00000110, 0 }, { 0x00000111, 0 }
  , { 0x00001000, 0 }, { 0x00001001, 0 }, { 0x00001010, 0 }, { 0x00001011, 0 }
  , { 0x00001100, 0 }, { 0x00001101, 0 }, { 0x00001110, 0 }, { 0x00001111, 0 }
  , { 0x00010000, 0 }, { 0x00010001, 0 }, { 0x00010010, 0 }, { 0x00010011, 0 }
  , { 0x00010100, 0 }, { 0x00010101, 0 }, { 0x00010110, 0 }, { 0x00010111, 0 }
  , { 0x00011000, 0 }, { 0x00011001, 0 }, { 0x00011010, 0 }, { 0x00011011, 0 }
  , { 0x00011100, 0 }, { 0x00011101, 0 }, { 0x00011110, 0 }, { 0x00011111, 0 }
  , { 0x00100000, 0 }, { 0x00100001, 0 }, { 0x00100010, 0 }, { 0x00100011, 0 }
  , { 0x00100100, 0 }, { 0x00100101, 0 }, { 0x00100110, 0 }, { 0x00100111, 0 }
  , { 0x00101000, 0 }, { 0x00101001, 0 }, { 0x00101010, 0 }, { 0x00101011, 0 }
  , { 0x00101100, 0 }, { 0x00101101, 0 }, { 0x00101110, 0 }, { 0x00101111, 0 }
  , { 0x00110000, 0 }, { 0x00110001, 0 }, { 0x00110010, 0 }, { 0x00110011, 0 }
  , { 0x00110100, 0 }, { 0x00110101, 0 }, { 0x00110110, 0 }, { 0x00110111, 0 }
  , { 0x00111000, 0 }, { 0x00111001, 0 }, { 0x00111010, 0 }, { 0x00111011, 0 }
  , { 0x00111100, 0 }, { 0x00111101, 0 }, { 0x00111110, 0 }, { 0x00111111, 0 }
  , { 0x01000000, 0 }, { 0x01000001, 0 }, { 0x01000010, 0 }, { 0x01000011, 0 }
  , { 0x01000100, 0 }, { 0x01000101, 0 }, { 0x01000110, 0 }, { 0x01000111, 0 }
  , { 0x01001000, 0 }, { 0x01001001, 0 }, { 0x01001010, 0 }, { 0x01001011, 0 }
  , { 0x01001100, 0 }, { 0x01001101, 0 }, { 0x01001110, 0 }, { 0x01001111, 0 }
  , { 0x01010000, 0 }, { 0x01010001, 0 }, { 0x01010010, 0 }, { 0x01010011, 0 }
  , { 0x01010100, 0 }, { 0x01010101, 0 }, { 0x01010110, 0 }, { 0x01010111, 0 }
  , { 0x01011000, 0 }, { 0x01011001, 0 }, { 0x01011010, 0 }, { 0x01011011, 0 }
  , { 0x01011100, 0 }, { 0x01011101, 0 }, { 0x01011110, 0 }, { 0x01011111, 0 }
  , { 0x01100000, 0 }, { 0x01100001, 0 }, { 0x01100010, 0 }, { 0x01100011, 0 }
  , { 0x01100100, 0 }, { 0x01100101, 0 }, { 0x01100110, 0 }, { 0x01100111, 0 }
  , { 0x01101000, 0 }, { 0x01101001, 0 }, { 0x01101010, 0 }, { 0x01101011, 0 }
  , { 0x01101100, 0 }, { 0x01101101, 0 }, { 0x01101110, 0 }, { 0x01101111, 0 }
  , { 0x01110000, 0 }, { 0x01110001, 0 }, { 0x01110010, 0 }, { 0x01110011, 0 }
  , { 0x01110100, 0 }, { 0x01110101, 0 }, { 0x01110110, 0 }, { 0x01110111, 0 }
  , { 0x01111000, 0 }, { 0x01111001, 0 }, { 0x01111010, 0 }, { 0x01111011, 0 }
  , { 0x01111100, 0 }, { 0x01111101, 0 }, { 0x01111110, 0 }, { 0x01111111, 0 }
  , { 0x10000000, 0 }, { 0x10000001, 0 }, { 0x10000010, 0 }, { 0x10000011, 0 }
  , { 0x10000100, 0 }, { 0x10000101, 0 }, { 0x10000110, 0 }, { 0x10000111, 0 }
  , { 0x10001000, 0 }, { 0x10001001, 0 }, { 0x10001010, 0 }, { 0x10001011, 0 }
  , { 0x10001100, 0 }, { 0x10001101, 0 }, { 0x10001110, 0 }, { 0x10001111, 0 }
  , { 0x10010000, 0 }, { 0x10010001, 0 }, { 0x10010010, 0 }, { 0x10010011, 0 }
  , { 0x10010100, 0 }, { 0x10010101, 0 }, { 0x10010110, 0 }, { 0x10010111, 0 }
  , { 0x10011000, 0 }, { 0x10011001, 0 }, { 0x10011010, 0 }, { 0x10011011, 0 }
  , { 0x10011100, 0 }, { 0x10011101, 0 }, { 0x10011110, 0 }, { 0x10011111, 0 }
  , { 0x10100000, 0 }, { 0x10100001, 0 }, { 0x10100010, 0 }, { 0x10100011, 0 }
  , { 0x10100100, 0 }, { 0x10100101, 0 }, { 0x10100110, 0 }, { 0x10100111, 0 }
  , { 0x10101000, 0 }, { 0x10101001, 0 }, { 0x10101010, 0 }, { 0x10101011, 0 }
  , { 0x10101100, 0 }, { 0x10101101, 0 }, { 0x10101110, 0 }, { 0x10101111, 0 }
  , { 0x10110000, 0 }, { 0x10110001, 0 }, { 0x10110010, 0 }, { 0x10110011, 0 }
  , { 0x10110100, 0 }, { 0x10110101, 0 }, { 0x10110110, 0 }, { 0x10110111, 0 }
  , { 0x10111000, 0 }, { 0x10111001, 0 }, { 0x10111010, 0 }, { 0x10111011, 0 }
  , { 0x10111100, 0 }, { 0x10111101, 0 }, { 0x10111110, 0 }, { 0x10111111, 0 }
  , { 0x11000000, 0 }, { 0x11000001, 0 }, { 0x11000010, 0 }, { 0x11000011, 0 }
  , { 0x11000100, 0 }, { 0x11000101, 0 }, { 0x11000110, 0 }, { 0x11000111, 0 }
  , { 0x11001000, 0 }, { 0x11001001, 0 }, { 0x11001010, 0 }, { 0x11001011, 0 }
  , { 0x11001100, 0 }, { 0x11001101, 0 }, { 0x11001110, 0 }, { 0x11001111, 0 }
  , { 0x11010000, 0 }, { 0x11010001, 0 }, { 0x11010010, 0 }, { 0x11010011, 0 }
  , { 0x11010100, 0 }, { 0x11010101, 0 }, { 0x11010110, 0 }, { 0x11010111, 0 }
  , { 0x11011000, 0 }, { 0x11011001, 0 }, { 0x11011010, 0 }, { 0x11011011, 0 }
  , { 0x11011100, 0 }, { 0x11011101, 0 }, { 0x11011110, 0 }, { 0x11011111, 0 }
  , { 0x11100000, 0 }, { 0x11100001, 0 }, { 0x11100010, 0 }, { 0x11100011, 0 }
  , { 0x11100100, 0 }, { 0x11100101, 0 }, { 0x11100110, 0 }, { 0x11100111, 0 }
  , { 0x11101000, 0 }, { 0x11101001, 0 }, { 0x11101010, 0 }, { 0x11101011, 0 }
  , { 0x11101100, 0 }, { 0x11101101, 0 }, { 0x11101110, 0 }, { 0x11101111, 0 }
  , { 0x11110000, 0 }, { 0x11110001, 0 }, { 0x11110010, 0 }, { 0x11110011, 0 }
  , { 0x11110100, 0 }, { 0x11110101, 0 }, { 0x11110110, 0 }, { 0x11110111, 0 }
  , { 0x11111000, 0 }, { 0x11111001, 0 }, { 0x11111010, 0 }, { 0x11111011, 0 }
  , { 0x11111100, 0 }, { 0x11111101, 0 }, { 0x11111110, 0 }, { 0x11111111, 0 }
  };

  u32 xt,xh,xm,xl;

  xt = x.t; xh = x.h; xm = x.m; xl = x.l;

  START_MMX();

#define mm_YL 0
#define mm_YH 1
#define mm_T 2
#define mm_U 3

#define MUNGE \
  CPY(mm_U, mm_T); SRL(mm_U, 36); SLL(mm_T, 19);        \
  XOR(mm_YH, mm_U); XOR(mm_YL, mm_T);                   \
  SRL(mm_U, 7); SLL(mm_T, 1);                           \
  XOR(mm_YH, mm_U); XOR(mm_YL, mm_T);                   \
  SRL(mm_U, 1); SLL(mm_T, 1);                           \
  XOR(mm_YH, mm_U); XOR(mm_YL, mm_T);                   \
  SRL(mm_U, 1); SLL(mm_T, 7);                           \
  XOR(mm_YH, mm_U); XOR(mm_YL, mm_T) /* ; */

  /* Two at a time. */
  for ( ; n >= 2; n -= 2) {
    MMX_STATIC mmx64 yh, yl;

    LOD(mm_T, tab4[xt>> 8 /* & 31 */ ]);
    LOD(mm_YH, tab4[xh>>24]);
    LOD(mm_YL, tab4[xh>> 8 & 255]);

    SLL(mm_T, 32);
    SLL(mm_YH, 32);
    SLL(mm_YL, 32);

    XOR_MEM(mm_T, tab4[xt & 255]);
    XOR_MEM(mm_YH, tab4[xh>>16 & 255]);
    XOR_MEM(mm_YL, tab4[xh & 255]);

    MUNGE;

    CPY(mm_T, mm_YH); CPY(mm_YH, mm_YL);
    LOD(mm_YL, tab4[xm>>24]); SLL(mm_YL, 32);
    XOR_MEM(mm_YL, tab4[xm>>16 & 255]);

    MUNGE;

    CPY(mm_T, mm_YH); CPY(mm_YH, mm_YL);
    LOD(mm_YL, tab4[xm>> 8 & 255]); SLL(mm_YL, 32);
    XOR_MEM(mm_YL, tab4[xm & 255]);

    MUNGE;

    CPY(mm_T, mm_YH); CPY(mm_YH, mm_YL);
    LOD(mm_YL, tab4[xl>>24]); SLL(mm_YL, 32);
    XOR_MEM(mm_YL, tab4[xl>>16 & 255]);

    MUNGE;

    CPY(mm_T, mm_YH); CPY(mm_YH, mm_YL);
    LOD(mm_YL, tab4[xl>> 8 & 255]); SLL(mm_YL, 32);
    XOR_MEM(mm_YL, tab4[xl & 255]);

    MUNGE;

    /* Reduce modulo t^109+t^9+t^2+t+1. */
    CPY(mm_T, mm_YH); SRL(mm_T, 45);
    SLL(mm_YH, 19); SRL(mm_YH, 19);

    XOR(mm_YL, mm_T);
    SLL(mm_T, 1); XOR(mm_YL, mm_T);
    SLL(mm_T, 1); XOR(mm_YL, mm_T);
    SLL(mm_T, 7); XOR(mm_YL, mm_T);

    STR(yh, mm_YH);
    STR(yl, mm_YL);

    xt = yh.hi; xh = yh.lo;
    xm = yl.hi; xl = yl.lo;
  } /* end for */

#undef MUNGE


#define mm_TMP 4
#define mm_TMP2 5

  /* Last one (if n was odd). */
  if (n) {
    MMX_STATIC mmx64 yh,yl;

    LOD(mm_U, tab2[xt>> 8 /* & 31 */ ]);
    LOD(mm_T, tab2[xh>>24]);
    LOD(mm_YH, tab2[xm>>24]);
    LOD(mm_YL, tab2[xl>>24]);

    SLL(mm_U, 16); ORR_MEM(mm_U, tab2[xt     & 255]);
    SLL(mm_T, 16); ORR_MEM(mm_T, tab2[xh>>16 & 255]);
    SLL(mm_YH, 16); ORR_MEM(mm_YH, tab2[xm>>16 & 255]);
    SLL(mm_YL, 16); ORR_MEM(mm_YL, tab2[xl>>16 & 255]);

    SLL(mm_T, 16); ORR_MEM(mm_T, tab2[xh>> 8 & 255]);
    SLL(mm_YH, 16); ORR_MEM(mm_YH, tab2[xm>> 8 & 255]);
    SLL(mm_YL, 16); ORR_MEM(mm_YL, tab2[xl>> 8 & 255]);

    SLL(mm_T, 16); ORR_MEM(mm_T, tab2[xh & 255]);
    SLL(mm_YH, 16); ORR_MEM(mm_YH, tab2[xm & 255]);
    SLL(mm_YL, 16); ORR_MEM(mm_YL, tab2[xl & 255]);

    /* Reduce modulo t^109+t^9+t^2+t+1. */
    CPY(mm_TMP, mm_T);

    SLL(mm_U, 19);  XOR(mm_YH, mm_U);
    SRL(mm_TMP, 36); XOR(mm_YH, mm_TMP);
    SLL(mm_T, 19);  XOR(mm_YL, mm_T);

    SLL(mm_U, 1);  XOR(mm_YH, mm_U);
    SRL(mm_TMP, 7); XOR(mm_YH, mm_TMP);
    SLL(mm_T, 1);  XOR(mm_YL, mm_T);

    SLL(mm_U, 1);  XOR(mm_YH, mm_U);
    SRL(mm_TMP, 1); XOR(mm_YH, mm_TMP);
    SLL(mm_T, 1);  XOR(mm_YL, mm_T);

    SLL(mm_U, 7);  XOR(mm_YH, mm_U);
    SRL(mm_TMP, 1); XOR(mm_YH, mm_TMP);
    SLL(mm_T, 7);  XOR(mm_YL, mm_T);


    CPY(mm_TMP, mm_YH); SRL(mm_TMP, 45);
    SLL(mm_YH, 19); SRL(mm_YH, 19);

    XOR(mm_YL, mm_TMP);
    SLL(mm_TMP, 1); XOR(mm_YL, mm_TMP);
    SLL(mm_TMP, 1); XOR(mm_YL, mm_TMP);
    SLL(mm_TMP, 7); XOR(mm_YL, mm_TMP);

    STR(yh, mm_YH); STR(yl, mm_YL);

    xt = yh.hi; xh = yh.lo;
    xm = yl.hi; xl = yl.lo;
  } /* end if */

#undef mm_YL
#undef mm_YH
#undef mm_T
#undef mm_U
#undef mm_TMP
#undef mm_TMP2

  END_MMX();

  { poly128 r;
    r.t = xt; r.h = xh; r.m = xm; r.l = xl;
    return r;
  } /* end block */

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#else
/* !defined(MMX) */

  static const u32 tab[256] =
  {     0,     1,     4,     5,    16,    17,    20,    21
  ,    64,    65,    68,    69,    80,    81,    84,    85
  ,   256,   257,   260,   261,   272,   273,   276,   277
  ,   320,   321,   324,   325,   336,   337,   340,   341
  ,  1024,  1025,  1028,  1029,  1040,  1041,  1044,  1045
  ,  1088,  1089,  1092,  1093,  1104,  1105,  1108,  1109
  ,  1280,  1281,  1284,  1285,  1296,  1297,  1300,  1301
  ,  1344,  1345,  1348,  1349,  1360,  1361,  1364,  1365
  ,  4096,  4097,  4100,  4101,  4112,  4113,  4116,  4117
  ,  4160,  4161,  4164,  4165,  4176,  4177,  4180,  4181
  ,  4352,  4353,  4356,  4357,  4368,  4369,  4372,  4373
  ,  4416,  4417,  4420,  4421,  4432,  4433,  4436,  4437
  ,  5120,  5121,  5124,  5125,  5136,  5137,  5140,  5141
  ,  5184,  5185,  5188,  5189,  5200,  5201,  5204,  5205
  ,  5376,  5377,  5380,  5381,  5392,  5393,  5396,  5397
  ,  5440,  5441,  5444,  5445,  5456,  5457,  5460,  5461
  , 16384, 16385, 16388, 16389, 16400, 16401, 16404, 16405
  , 16448, 16449, 16452, 16453, 16464, 16465, 16468, 16469
  , 16640, 16641, 16644, 16645, 16656, 16657, 16660, 16661
  , 16704, 16705, 16708, 16709, 16720, 16721, 16724, 16725
  , 17408, 17409, 17412, 17413, 17424, 17425, 17428, 17429
  , 17472, 17473, 17476, 17477, 17488, 17489, 17492, 17493
  , 17664, 17665, 17668, 17669, 17680, 17681, 17684, 17685
  , 17728, 17729, 17732, 17733, 17744, 17745, 17748, 17749
  , 20480, 20481, 20484, 20485, 20496, 20497, 20500, 20501
  , 20544, 20545, 20548, 20549, 20560, 20561, 20564, 20565
  , 20736, 20737, 20740, 20741, 20752, 20753, 20756, 20757
  , 20800, 20801, 20804, 20805, 20816, 20817, 20820, 20821
  , 21504, 21505, 21508, 21509, 21520, 21521, 21524, 21525
  , 21568, 21569, 21572, 21573, 21584, 21585, 21588, 21589
  , 21760, 21761, 21764, 21765, 21776, 21777, 21780, 21781
  , 21824, 21825, 21828, 21829, 21840, 21841, 21844, 21845
  };

  u32 xt,xh,xm,xl;

  xt = x.t; xh = x.h; xm = x.m; xl = x.l;

  for ( ; n; --n) {
    u32 t0,t1,t2,t3,t4,t5,t6, w;

    t6 = tab[xt & 255] | tab[xt>>8 & 255]<<16;
    t5 = tab[xh>>16 & 255] | tab[xh>>24]<<16;
    t4 = tab[xh & 255] | tab[xh>>8 & 255]<<16;
    t3 = tab[xm>>16 & 255] | tab[xm>>24]<<16;
    t2 = tab[xm & 255] | tab[xm>>8 & 255]<<16;
    t1 = tab[xl>>16 & 255] | tab[xl>>24]<<16;
    t0 = tab[xl & 255] | tab[xl>>8 & 255]<<16;

    /* Reduce modulo t^109+t^9+t^2+t+1. */
    t3 ^= t6>>13; t3 ^= t6>>12; t3 ^= t6>>11; t3 ^= t6>>4;
    t2 ^= t6<<19; t2 ^= t6<<20; t2 ^= t6<<21; t2 ^= t6<<28;
    t2 ^= t5>>13; t2 ^= t5>>12; t2 ^= t5>>11; t2 ^= t5>>4;
    t1 ^= t5<<19; t1 ^= t5<<20; t1 ^= t5<<21; t1 ^= t5<<28;
    t1 ^= t4>>13; t1 ^= t4>>12; t1 ^= t4>>11; t1 ^= t4>>4;
    t0 ^= t4<<19; t0 ^= t4<<20; t0 ^= t4<<21; t0 ^= t4<<28;

    w = t3>>13; t3 ^= w<<13; t0 ^= w; t0 ^= w<<1; t0 ^= w<<2; t0 ^= w<<9;
    xt = t3; xh = t2; xm = t1; xl = t0;
  } /* end for */

  { poly128 r;
    r.t = xt; r.h = xh; r.m = xm; r.l = xl;
    return r;
  } /* end block */

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#endif
/* defined(MMX) ... #else ... */

} /* end function squareNTimes */


/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#if PROD == 1

/*-- GF2Product27x27 -------------------------------------------------------*/

/* Computes 53-bit product of polynomials over Z/2Z with degree < 27.
 * Returns low 32 bits of result, and leaves top 21 bits in *ph.
 */
static u32 GF2Product27x27(u32 a, u32 b, u32 *ph) {
  u32 a2,a4, h, l;

  a2 = a<<1; a4 = a<<2;

  { u32 s, t, tab[8];

    tab[0] = 0;  tab[1] = a;      tab[2] = a2;      tab[3] = a ^ a2;
    tab[4] = a4; tab[5] = a ^ a4; tab[6] = a2 ^ a4; tab[7] = a ^ a2 ^ a4;

#if 0
    l = tab[b>>24];
    l = l<<3 ^ tab[b>>21 & 7];
    h = l>>29; l = l<<3 ^ tab[b>>18 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>15 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>12 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>9 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>6 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>3 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b & 7];
#else
    s = tab[b     & 7];
    t = tab[b>> 3 & 7]; l = s;
    s = tab[b>> 6 & 7]; l ^= t<< 3;
    t = tab[b>> 9 & 7]; l ^= s<< 6; h = s>>26;
    s = tab[b>>12 & 7]; l ^= t<< 9; h ^= t>>23;
    t = tab[b>>15 & 7]; l ^= s<<12; h ^= s>>20;
    s = tab[b>>18 & 7]; l ^= t<<15; h ^= t>>17;
    t = tab[b>>21 & 7]; l ^= s<<18; h ^= s>>14;
    s = tab[b>>24    ]; l ^= t<<21; h ^= t>>11;
                        l ^= s<<24; h ^= s>> 8;
#endif

    *ph = h;
    return l;
  } /* end block */

} /* end function GF2Product27x27 */

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#elif PROD == 3

/*-- GF2Product28x28 -------------------------------------------------------*/

/* Computes 55-bit product of polynomials over Z/2Z with degree < 28.
 * Returns low 32 bits of result, and leaves top 23 bits in *ph.
 */
static u32 GF2Product28x28(u32 a, u32 b, u32 *ph) {
  u32 a2,a4, h, l;

  a2 = a<<1; a4 = a<<2;

  { u32 s, t, tab[8];

    tab[0] = 0;  tab[1] = a;      tab[2] = a2;      tab[3] = a ^ a2;
    tab[4] = a4; tab[5] = a ^ a4; tab[6] = a2 ^ a4; tab[7] = a ^ a2 ^ a4;

#if 0
    l = tab[b>>27];
    l = l<<3 ^ tab[b>>24 & 7];
    h = l>>29; l = l<<3 ^ tab[b>>21 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>18 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>15 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>12 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>9 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>6 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b>>3 & 7];
    h = h<<3 | l>>29; l = l<<3 ^ tab[b & 7];
#else
    s = tab[b     & 7];
    t = tab[b>> 3 & 7]; l = s;
    s = tab[b>> 6 & 7]; l ^= t<< 3; h = t>>29;
    t = tab[b>> 9 & 7]; l ^= s<< 6; h ^= s>>26;
    s = tab[b>>12 & 7]; l ^= t<< 9; h ^= t>>23;
    t = tab[b>>15 & 7]; l ^= s<<12; h ^= s>>20;
    s = tab[b>>18 & 7]; l ^= t<<15; h ^= t>>17;
    t = tab[b>>21 & 7]; l ^= s<<18; h ^= s>>14;
    s = tab[b>>24 & 7]; l ^= t<<21; h ^= t>>11;
    t = tab[b>>27    ]; l ^= s<<24; h ^= s>> 8;
                        l ^= t<<27; h ^= t>> 5;
#endif

    *ph = h;
    return l;
  } /* end block */

} /* end function GF2Product28x28 */

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#elif PROD == 5

/*-- GF2Product32x32 -------------------------------------------------------*/

/* Computes 63-bit product of polynomials over Z/2Z with degree < 32.
 * Returns low 32 bits of result, and leaves top 31 bits in *ph.
 */
static u32 GF2Product32x32(u32 a, u32 b, u32 *ph) {
  u32 a1,a2,a4, h, l;

  a4 = a<<2; a2 = a4>>1; a1 = a4>>2;

  { u32 s, t, tab[8];

    tab[0] = 0;  tab[1] = a1;      tab[2] = a2;      tab[3] = a1 ^ a2;
    tab[4] = a4; tab[5] = a1 ^ a4; tab[6] = a2 ^ a4; tab[7] = a1 ^ a2 ^ a4;

#if 0
    l = h = 0;
    if (a>>31 & 1) { l = b<<31; h = b>>1; }
    if (a>>30 & 1) { l ^= b<<30; h ^= b>>2; }

    s = tab[b     & 7];
    t = tab[b>> 3 & 7]; l ^= s;
    s = tab[b>> 6 & 7]; l ^= t<< 3; h ^= t>>29;
    t = tab[b>> 9 & 7]; l ^= s<< 6; h ^= s>>26;
    s = tab[b>>12 & 7]; l ^= t<< 9; h ^= t>>23;
    t = tab[b>>15 & 7]; l ^= s<<12; h ^= s>>20;
    s = tab[b>>18 & 7]; l ^= t<<15; h ^= t>>17;
    t = tab[b>>21 & 7]; l ^= s<<18; h ^= s>>14;
    s = tab[b>>24 & 7]; l ^= t<<21; h ^= t>>11;
    t = tab[b>>27 & 7]; l ^= s<<24; h ^= s>> 8;
    s = tab[b>>30    ]; l ^= t<<27; h ^= t>> 5;
                        l ^= s<<30; h ^= s>> 2;
#else
    s = tab[b     & 7];
    t = tab[b>> 3 & 7]; l = s;
    s = tab[b>> 6 & 7]; l ^= t<< 3; h = t>>29;
    t = tab[b>> 9 & 7]; l ^= s<< 6; h ^= s>>26;
    s = tab[b>>12 & 7]; l ^= t<< 9; h ^= t>>23;
    t = tab[b>>15 & 7]; l ^= s<<12; h ^= s>>20;
    s = tab[b>>18 & 7]; l ^= t<<15; h ^= t>>17;
    t = tab[b>>21 & 7]; l ^= s<<18; h ^= s>>14;
    s = tab[b>>24 & 7]; l ^= t<<21; h ^= t>>11;
    t = tab[b>>27 & 7]; l ^= s<<24; h ^= s>> 8;
    s = tab[b>>30    ]; l ^= t<<27; h ^= t>> 5;
                        l ^= s<<30; h ^= s>> 2;

    if (a>>31 & 1) { l ^= b<<31; h ^= b>>1; }
    if (a>>30 & 1) { l ^= b<<30; h ^= b>>2; }
#endif
  } /* end block */

  *ph = h;
  return l;
} /* end function GF2Product32x32 */

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#endif
/* PROD == 1 ... #elif PROD == 3 ... #elif PROD == 5 ... */


/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#ifdef MMX

/*-- MMX_GF2Product56x56 ---------------------------------------------------*/

/* Computes 111-bit product of polynomials xh:xl and *py over Z/2Z
 * with degree < 56.  Speed-critical auxiliary function used for product().
 * Returns with low 64 bits of result in *pl and high 47 bits in *ph.
 */
static void MMX_GF2Product56x56
  ( u32 xh, u32 xl, const mmx64 *py, mmx64 *ph, mmx64 *pl
  ) {
  u32 w;
  MMX_STATIC mmx64 tab[16];

  /* Note: No START_MMX() here.  It is done at start of product(). */

#define mm_E 0
#define mm_Y1 1
#define mm_Y2 2
#define mm_Y4 3
#define mm_Y8 4

  LOD(mm_Y1, *py);
  CPY(mm_Y2, mm_Y1); SLL(mm_Y2, 1);
  CPY(mm_Y4, mm_Y1); SLL(mm_Y4, 2);
  CPY(mm_Y8, mm_Y1); SLL(mm_Y8, 3);

  /* Gray code walk through table. */
  XOR(mm_E, mm_E);  STR(tab[ 0], mm_E);
  CPY(mm_E, mm_Y1); STR(tab[ 1], mm_Y1);
  XOR(mm_E, mm_Y2); STR(tab[ 3], mm_E);
  CPY(mm_E, mm_Y2); STR(tab[ 2], mm_Y2);
  XOR(mm_E, mm_Y4); STR(tab[ 6], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[ 7], mm_E);
  XOR(mm_E, mm_Y2); STR(tab[ 5], mm_E);
  CPY(mm_E, mm_Y4); STR(tab[ 4], mm_Y4);
  XOR(mm_E, mm_Y8); STR(tab[12], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[13], mm_E);
  XOR(mm_E, mm_Y2); STR(tab[15], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[14], mm_E);
  XOR(mm_E, mm_Y4); STR(tab[10], mm_E);
  XOR(mm_E, mm_Y1); STR(tab[11], mm_E);
  XOR(mm_E, mm_Y2); STR(tab[ 9], mm_E);
                    STR(tab[ 8], mm_Y8);

#undef mm_E
#undef mm_Y1
#undef mm_Y2
#undef mm_Y4
#undef mm_Y8

#define mm_A 0
#define mm_B 1
#define mm_C 2
#define mm_D 3
#define mm_E 4
#define mm_F 5
#define mm_G 6
#define mm_T 7

  w = xl;

  LOD(mm_A, tab[w>> 4 & 15]); LOD(mm_B, tab[w>>12 & 15]);
  SLL(mm_A, 4); SLL(mm_B, 4);
  XOR_MEM(mm_A, tab[w     & 15]); XOR_MEM(mm_B, tab[w>> 8 & 15]);

  LOD(mm_C, tab[w>>20 & 15]); LOD(mm_D, tab[w>>28/* & 15*/]);
  SLL(mm_C, 4); SLL(mm_D, 4);
  XOR_MEM(mm_C, tab[w>>16 & 15]); XOR_MEM(mm_D, tab[w>>24 & 15]);

  w = xh;

  LOD(mm_E, tab[w>> 4 & 15]); LOD(mm_F, tab[w>>12 & 15]);
  SLL(mm_E, 4); SLL(mm_F, 4);
  XOR_MEM(mm_E, tab[w     & 15]); XOR_MEM(mm_F, tab[w>> 8 & 15]);

  LOD(mm_G, tab[w>>20/* & 15*/]);
  SLL(mm_G, 4);
  XOR_MEM(mm_G, tab[w>>16 & 15]);

  CPY(mm_T, mm_B); SLL(mm_B,  8); SRL(mm_T,  8); XOR(mm_A, mm_B);
  XOR(mm_T, mm_C); SLL(mm_C, 16); SRL(mm_T,  8); XOR(mm_A, mm_C);
  XOR(mm_T, mm_D); SLL(mm_D, 24); SRL(mm_T,  8); XOR(mm_A, mm_D);
  XOR(mm_T, mm_E); SLL(mm_E, 32); SRL(mm_T,  8); XOR(mm_A, mm_E);
  XOR(mm_T, mm_F); SLL(mm_F, 40); SRL(mm_T,  8); XOR(mm_A, mm_F);
  XOR(mm_T, mm_G); SLL(mm_G, 48); SRL(mm_T, 16); XOR(mm_A, mm_G);

  STR(*ph, mm_T); STR(*pl, mm_A);

#undef mm_A
#undef mm_B
#undef mm_C
#undef mm_D
#undef mm_E
#undef mm_F
#undef mm_G
#undef mm_T

  /* Note: No END_MMX() here.  It is done at end of product(). */

} /* end function MMX_GF2Product56x56 */

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#elif PROD == 1 || PROD == 2

/*-- GF2Product54x54 -------------------------------------------------------*/

/* Computes 107-bit product of polynomials xh:xl and yh:yl over Z/2Z
 * with degree < 54.  Speed-critical auxiliary function used for product().
 * Returns low 32 bits of result, leaves middle 64 bits in *pm and *ph
 * and high 11 bits in *pt.
 */
static u32 GF2Product54x54
  ( u32 xh, u32 xl, u32 yh, u32 yl, u32 *pt, u32 *ph, u32 *pm
  ) {

#if PROD == 1

  u32 l0,l1, m0,m1, h0,h1;
  const u32 mask = (1UL<<27)-1;

  xh = xh<<5 | xl>>27; xl &= mask;
  yh = yh<<5 | yl>>27; yl &= mask;

  h0 = GF2Product27x27(xh, yh, &h1);
  l0 = GF2Product27x27(xl, yl, &l1);
  m0 = GF2Product27x27(xl ^ xh, yl ^ yh, &m1);

  m0 ^= l0; m1 ^= l1; m0 ^= h0; m1 ^= h1;
  l1 ^= h0<<22; h0 = h0>>10 | h1<<22; h1 >>= 10;
  l0 ^= m0<<27; l1 ^= m0>>5 | m1<<27; h0 ^= m1>>5;

  *pt = h1; *ph = h0; *pm = l1;
  return l0;

#else
/* PROD == 2 */

  u32 tab[32], w0, w1;

  { u32 yh2, yh4, yh8;

    yh2 = yh<<1 ^ yl>>31;
    yh4 = yh<<2 ^ yl>>30;
    yh8 = yh<<3 ^ yl>>29;

#define STASH(t) do { tab[2*t] = w0; tab[2*t+1] = w1; } while (0)

    /* Gray code walk through table. */
    /* 0000 */ w0 = 0;          w1 = 0;         STASH(0);
    /* 0001 */ w0 = yl;         w1 = yh;        STASH(1);
    /* 0011 */ w0 = yl ^ yl<<1; w1 = yh ^ yh2;  STASH(3);
    /* 0010 */ w0 = yl<<1;      w1 = yh2;       STASH(2);
    /* 0110 */ w0 ^= yl<<2;     w1 = yh2 ^ yh4; STASH(6);
    /* 0111 */ w0 ^= yl;        w1 ^= yh;       STASH(7);
    /* 0101 */ w0 = yl ^ yl<<2; w1 = yh ^ yh4;  STASH(5);
    /* 0100 */ w0 = yl<<2;      w1 = yh4;       STASH(4);
    /* 1100 */ w0 ^= yl<<3;     w1 = yh4 ^ yh8; STASH(12);
    /* 1101 */ w0 ^= yl;        w1 ^= yh;       STASH(13);
    /* 1111 */ w0 ^= yl<<1;     w1 ^= yh2;      STASH(15);
    /* 1110 */ w0 ^= yl;        w1 ^= yh;       STASH(14);
    /* 1010 */ w0 ^= yl<<2;     w1 = yh2 ^ yh8; STASH(10);
    /* 1011 */ w0 ^= yl;        w1 ^= yh;       STASH(11);
    /* 1001 */ w0 = yl ^ yl<<3; w1 = yh ^ yh8;  STASH(9);
    /* 1000 */ w0 = yl<<3;      w1 = yh8;       STASH(8);

#undef STASH
  } /* end block */

  { u32 a0,a1,a2,a3, t;

#define GRAB do { w0 = tab[2*t]; w1 = tab[2*t+1]; } while (0)

    t = xl & 15;     GRAB;
    a0 = w0;      a1 = w1;
    t = xl>> 4 & 15;  GRAB;
    a0 ^= w0<< 4; a1 ^= w0>>28; a1 ^= w1<< 4;
    t = xl>> 8 & 15;  GRAB;
    a0 ^= w0<< 8; a1 ^= w0>>24; a1 ^= w1<< 8;  a2 = w1>>24;
    t = xl>>12 & 15; GRAB;
    a0 ^= w0<<12; a1 ^= w0>>20; a1 ^= w1<<12; a2 ^= w1>>20;
    t = xl>>16 & 15; GRAB;
    a0 ^= w0<<16; a1 ^= w0>>16; a1 ^= w1<<16; a2 ^= w1>>16;
    t = xl>>20 & 15; GRAB;
    a0 ^= w0<<20; a1 ^= w0>>12; a1 ^= w1<<20; a2 ^= w1>>12;
    t = xl>>24 & 15; GRAB;
    a0 ^= w0<<24; a1 ^= w0>> 8; a1 ^= w1<<24; a2 ^= w1>>8;
    t = xl>>28; GRAB;
    a0 ^= w0<<28; a1 ^= w0>> 4; a1 ^= w1<<28; a2 ^= w1>>4;

    t = xh & 15;     GRAB;
    a1 ^= w0;     a2 ^= w1;
    t = xh>> 4 & 15;  GRAB;
    a1 ^= w0<< 4; a2 ^= w0>>28; a2 ^= w1<< 4;
    t = xh>> 8 & 15;  GRAB;
    a1 ^= w0<< 8; a2 ^= w0>>24; a2 ^= w1<< 8; a3 = w1>>24;
    t = xh>>12 & 15; GRAB;
    a1 ^= w0<<12; a2 ^= w0>>20; a2 ^= w1<<12; a3 ^= w1>>20;
    t = xh>>16 & 15; GRAB;
    a1 ^= w0<<16; a2 ^= w0>>16; a2 ^= w1<<16; a3 ^= w1>>16;
    t = xh>>20 /* & 3 */;  GRAB;
    a1 ^= w0<<20; a2 ^= w0>>12; a2 ^= w1<<20; a3 ^= w1>>12;

#undef GRAB

    *pt = a3; *ph = a2; *pm = a1;
    return a0;
  } /* end block */

#endif
/* PROD == 1 ... #else ... */

} /* end function GF2Product54x54 */

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#else
/* PROD == 3 || PROD == 4 || PROD = 5 */

/*-- GF2Product56x56 -------------------------------------------------------*/

/* Computes 111-bit product of polynomials xh:xl and yh:yl over Z/2Z
 * with degree < 56.  Speed-critical auxiliary function used for product().
 * Returns low 32 bits of result, leaves middle 64 bits in *pm and *ph
 * and high 15 bits in *pt.
 */
static u32 GF2Product56x56
  ( u32 xh, u32 xl, u32 yh, u32 yl, u32 *pt, u32 *ph, u32 *pm
  ) {

#if PROD == 3

  u32 l0,l1, m0,m1, h0,h1;
  const u32 mask = (1UL<<28)-1;

  xh = xh<<4 | xl>>28; xl &= mask;
  yh = yh<<4 | yl>>28; yl &= mask;

  h0 = GF2Product28x28(xh, yh, &h1);
  l0 = GF2Product28x28(xl, yl, &l1);
  m0 = GF2Product28x28(xl ^ xh, yl ^ yh, &m1);

  m0 ^= l0; m1 ^= l1; m0 ^= h0; m1 ^= h1;
  l1 ^= h0<<24; h0 = h0>>8 | h1<<24; h1 >>= 8;
  l0 ^= m0<<28; l1 ^= m0>>4 | m1<<28; h0 ^= m1>>4;

  *pt = h1; *ph = h0; *pm = l1;
  return l0;

#elif PROD == 4

  u32 tab[32], w0, w1;

  { u32 yh2, yh4, yh8;

    yh2 = yh<<1 ^ yl>>31;
    yh4 = yh<<2 ^ yl>>30;
    yh8 = yh<<3 ^ yl>>29;

#define STASH(t) do { tab[2*t] = w0; tab[2*t+1] = w1; } while (0)

    /* Gray code walk through table. */
    /* 0000 */ w0 = 0;          w1 = 0;         STASH(0);
    /* 0001 */ w0 = yl;         w1 = yh;        STASH(1);
    /* 0011 */ w0 = yl ^ yl<<1; w1 = yh ^ yh2;  STASH(3);
    /* 0010 */ w0 = yl<<1;      w1 = yh2;       STASH(2);
    /* 0110 */ w0 ^= yl<<2;     w1 = yh2 ^ yh4; STASH(6);
    /* 0111 */ w0 ^= yl;        w1 ^= yh;       STASH(7);
    /* 0101 */ w0 = yl ^ yl<<2; w1 = yh ^ yh4;  STASH(5);
    /* 0100 */ w0 = yl<<2;      w1 = yh4;       STASH(4);
    /* 1100 */ w0 ^= yl<<3;     w1 = yh4 ^ yh8; STASH(12);
    /* 1101 */ w0 ^= yl;        w1 ^= yh;       STASH(13);
    /* 1111 */ w0 ^= yl<<1;     w1 ^= yh2;      STASH(15);
    /* 1110 */ w0 ^= yl;        w1 ^= yh;       STASH(14);
    /* 1010 */ w0 ^= yl<<2;     w1 = yh2 ^ yh8; STASH(10);
    /* 1011 */ w0 ^= yl;        w1 ^= yh;       STASH(11);
    /* 1001 */ w0 = yl ^ yl<<3; w1 = yh ^ yh8;  STASH(9);
    /* 1000 */ w0 = yl<<3;      w1 = yh8;       STASH(8);

#undef STASH
  } /* end block */

  { u32 a0,a1,a2,a3, t;

#define GRAB do { w0 = tab[2*t]; w1 = tab[2*t+1]; } while (0)

    t = xl & 15;     GRAB;
    a0 = w0;      a1 = w1;
    t = xl>> 4 & 15;  GRAB;
    a0 ^= w0<< 4; a1 ^= w0>>28; a1 ^= w1<< 4;
    t = xl>> 8 & 15;  GRAB;
    a0 ^= w0<< 8; a1 ^= w0>>24; a1 ^= w1<< 8; a2 = w1>>24;
    t = xl>>12 & 15; GRAB;
    a0 ^= w0<<12; a1 ^= w0>>20; a1 ^= w1<<12; a2 ^= w1>>20;
    t = xl>>16 & 15; GRAB;
    a0 ^= w0<<16; a1 ^= w0>>16; a1 ^= w1<<16; a2 ^= w1>>16;
    t = xl>>20 & 15; GRAB;
    a0 ^= w0<<20; a1 ^= w0>>12; a1 ^= w1<<20; a2 ^= w1>>12;
    t = xl>>24 & 15; GRAB;
    a0 ^= w0<<24; a1 ^= w0>> 8; a1 ^= w1<<24; a2 ^= w1>>8;
    t = xl>>28; GRAB;
    a0 ^= w0<<28; a1 ^= w0>> 4; a1 ^= w1<<28; a2 ^= w1>>4;

    t = xh & 15;     GRAB;
    a1 ^= w0;     a2 ^= w1;
    t = xh>> 4 & 15;  GRAB;
    a1 ^= w0<< 4; a2 ^= w0>>28; a2 ^= w1<< 4;
    t = xh>> 8 & 15;  GRAB;
    a1 ^= w0<< 8; a2 ^= w0>>24; a2 ^= w1<< 8; a3 = w1>>24;
    t = xh>>12 & 15; GRAB;
    a1 ^= w0<<12; a2 ^= w0>>20; a2 ^= w1<<12; a3 ^= w1>>20;
    t = xh>>16 & 15; GRAB;
    a1 ^= w0<<16; a2 ^= w0>>16; a2 ^= w1<<16; a3 ^= w1>>16;
    t = xh>>20 /* & 15 */;  GRAB;
    a1 ^= w0<<20; a2 ^= w0>>12; a2 ^= w1<<20; a3 ^= w1>>12;

#undef GRAB

    *pt = a3; *ph = a2; *pm = a1;
    return a0;
  } /* end block */


#else
/* PROD == 5 */

  u32 l0,l1, m0,m1, h0,h1;

  m0 = GF2Product32x32(xl ^ xh, yl ^ yh, &m1);
  l0 = GF2Product32x32(xl, yl, &l1);
  h0 = GF2Product32x32(xh, yh, &h1);

  { u32 t; t = l1 ^ h0; l1 = m0 ^ l0 ^ t; h0 = m1 ^ h1 ^ t; }

  *pt = h1; *ph = h0; *pm = l1;
  return l0;

#endif
/* PROD == 3 ... #elif PROD == 4 ... #else ... */

} /* end function GF2Product56x56 */

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#endif
/* defined(MMX) ... #elif PROD == 1 || PROD == 2 ... #else ... */


/*-- product ---------------------------------------------------------------*/

/* Product of x and y in the field GF(2^109) i.e., as polys in (Z/2Z)[t]
 * reduced modulo t^109+t^9+t^2+t+1.  Degree x, y < 109.
 */
static poly128 product(poly128 x, poly128 y) {

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#ifdef MMX

  MMX_STATIC mmx64 hh,hl, lh,ll, mh,ml;

  START_MMX();

  { const u32 mask = (1UL<<24)-1;
    u32 xt,xh, yt,yh;
    MMX_STATIC mmx64 t;

    xt = x.t<<8 | x.h>>24; xh = x.h<<8 | x.m>>24;
    yt = y.t<<8 | y.h>>24; yh = y.h<<8 | y.m>>24;

    t.hi = yt & mask; t.lo = yh;
    MMX_GF2Product56x56(xt & mask, xh, &t, &hh, &hl);
    t.hi = (yt ^ y.m) & mask; t.lo = yh ^ y.l;
    MMX_GF2Product56x56((xt ^ x.m) & mask, xh ^ x.l, &t, &mh, &ml);
    t.hi = y.m & mask; t.lo = y.l;
    MMX_GF2Product56x56(x.m & mask, x.l, &t, &lh, &ll);
  } /* end block */

#define mm_T0 0
#define mm_T1 1
#define mm_T2 2
#define mm_T3 3
#define mm_MH 4
#define mm_ML 5
#define mm_TMP 6

  LOD(mm_T0, ll); LOD(mm_T1, lh);
  LOD(mm_T2, hl); LOD(mm_T3, hh);
  LOD(mm_ML, hl); LOD(mm_MH, hh);
  SRL(mm_T2, 16); SRL(mm_T3, 16);
  SLL(mm_ML, 48); SLL(mm_MH, 48);
  ORR(mm_T1, mm_ML); ORR(mm_T2, mm_MH);

  LOD(mm_ML, ml); LOD(mm_MH, mh);
  XOR_MEM(mm_ML, ll); XOR_MEM(mm_MH, lh);
  XOR_MEM(mm_ML, hl); XOR_MEM(mm_MH, hh);

  CPY(mm_TMP, mm_ML); SLL(mm_TMP, 56); XOR(mm_T0, mm_TMP);
  SRL(mm_ML, 8); XOR(mm_T1, mm_ML);
  CPY(mm_TMP, mm_MH); SLL(mm_TMP, 56); XOR(mm_T1, mm_TMP);
  SRL(mm_MH, 8); XOR(mm_T2, mm_MH);

  /* Reduce modulo t^109+t^9+t^2+t+1. */
  CPY(mm_TMP, mm_T2);

  SLL(mm_T3, 19);  XOR(mm_T1, mm_T3);
  SRL(mm_TMP, 36); XOR(mm_T1, mm_TMP);
  SLL(mm_T2, 19);  XOR(mm_T0, mm_T2);

  SLL(mm_T3, 1);  XOR(mm_T1, mm_T3);
  SRL(mm_TMP, 7); XOR(mm_T1, mm_TMP);
  SLL(mm_T2, 1);  XOR(mm_T0, mm_T2);

  SLL(mm_T3, 1);  XOR(mm_T1, mm_T3);
  SRL(mm_TMP, 1); XOR(mm_T1, mm_TMP);
  SLL(mm_T2, 1);  XOR(mm_T0, mm_T2);

  SLL(mm_T3, 7);  XOR(mm_T1, mm_T3);
  SRL(mm_TMP, 1); XOR(mm_T1, mm_TMP);
  SLL(mm_T2, 7);  XOR(mm_T0, mm_T2);


  CPY(mm_TMP, mm_T1); SRL(mm_TMP, 45);
  SLL(mm_T1, 19); SRL(mm_T1, 19);

  XOR(mm_T0, mm_TMP);
  SLL(mm_TMP, 1); XOR(mm_T0, mm_TMP);
  SLL(mm_TMP, 1); XOR(mm_T0, mm_TMP);
  SLL(mm_TMP, 7); XOR(mm_T0, mm_TMP);

  { MMX_STATIC mmx64 rh, rl;
    poly128 r;

    STR(rh, mm_T1); STR(rl, mm_T0);

#undef mm_T0
#undef mm_T1
#undef mm_T2
#undef mm_T3
#undef mm_MH
#undef mm_ML
#undef mm_TMP

    END_MMX();

    r.t = rh.hi; r.h = rh.lo; r.m = rl.hi; r.l = rl.lo;
    return r;
  } /* end block */

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#else
/* !defined(MMX) */

  u32 lt,lh,lm,ll, mt,mh,mm,ml, ht,hh,hm,hl, t0,t1,t2,t3,t4,t5,t6, w;

#if PROD == 1 || PROD == 2

  { const u32 mask = (1UL<<22)-1;
    u32 xt,xh, yt,yh;

    ll = GF2Product54x54(x.m & mask, x.l, y.m & mask, y.l, &lt, &lh, &lm);

    xt = x.t<<10 | x.h>>22; xh = x.h<<10 | x.m>>22;
    yt = y.t<<10 | y.h>>22; yh = y.h<<10 | y.m>>22;

    ml = GF2Product54x54( (xt ^ x.m) & mask, xh ^ x.l
                        , (yt ^ y.m) & mask, yh ^ y.l, &mt, &mh, &mm
                        );
    hl = GF2Product54x54(xt & mask, xh, yt & mask, yh, &ht, &hh, &hm);
  } /* end block */

  ml ^= hl; mm ^= hm; mh ^= lh; mt ^= lt;
  ml ^= ll; mm ^= lm; mh ^= hh; mt ^= ht;

  t0 = ll; t1 = lm; t2 = lh; t3 = lt | hl<<12;
  t4 = hl>>20 | hm<<12; t5 = hm>>20 | hh<<12; t6 = hh>>20 | ht<<12;
  t1 ^= ml<<22; t2 ^= ml>>10 | mm<<22; t3 ^= mm>>10 | mh<<22;
  t4 ^= mh>>10 | mt<<22; t5 ^= mt>>10;

  /* Handle 109th bit. */
  /* TO DO: the "& 1" is not necessary */
  if (x.t>>12 & 1) {
    t3 ^= y.l<<12; t4 ^= y.l>>20 | y.m<<12;
    t5 ^= y.m>>20 | y.h<<12; t6 ^= y.h>>20 | y.t<<12;
  } /* end if */
  if (y.t>>12 & 1) {
    t3 ^= x.l<<12; t4 ^= x.l>>20 | x.m<<12;
    t5 ^= x.m>>20 | x.h<<12; t6 ^= x.h>>20 | x.t<<12;
  } /* end if */
  if ((x.t & y.t)>>12 & 1) t6 ^= 1UL<<24;

#else
/* PROD == 3 || PROD == 4 || PROD == 5 */

  { const u32 mask = (1UL<<24)-1;
    u32 xt,xh, yt,yh;

    xt = x.t<<8 | x.h>>24; xh = x.h<<8 | x.m>>24;
    yt = y.t<<8 | y.h>>24; yh = y.h<<8 | y.m>>24;

    ml = GF2Product56x56( (xt ^ x.m) & mask, xh ^ x.l
                        , (yt ^ y.m) & mask, yh ^ y.l, &mt, &mh, &mm
                        );
    hl = GF2Product56x56(xt & mask, xh, yt & mask, yh, &ht, &hh, &hm);
    ll = GF2Product56x56(x.m & mask, x.l, y.m & mask, y.l, &lt, &lh, &lm);
  } /* end block */

  ml ^= ll; mm ^= lm; mh ^= hh; mt ^= ht;
  ml ^= hl; mm ^= hm; mh ^= lh; mt ^= lt;

  t0 = ll; t1 = lm; t2 = lh; t3 = lt | hl<<16;
  t4 = hl>>16 | hm<<16; t5 = hm>>16 | hh<<16; t6 = hh>>16 | ht<<16;
  t1 ^= ml<<24; t2 ^= ml>>8 | mm<<24; t3 ^= mm>>8 | mh<<24;
  t4 ^= mh>>8 | mt<<24; t5 ^= mt>>8;

#endif
/* PROD == 1 || PROD == 2 ... #else ... */

  /* Reduce modulo t^109+t^9+t^2+t+1. */
  t3 ^= t6>>13; t3 ^= t6>>12; t3 ^= t6>>11; t3 ^= t6>>4;
  t2 ^= t6<<19; t2 ^= t6<<20; t2 ^= t6<<21; t2 ^= t6<<28;
  t2 ^= t5>>13; t2 ^= t5>>12; t2 ^= t5>>11; t2 ^= t5>>4;
  t1 ^= t5<<19; t1 ^= t5<<20; t1 ^= t5<<21; t1 ^= t5<<28;
  t1 ^= t4>>13; t1 ^= t4>>12; t1 ^= t4>>11; t1 ^= t4>>4;
  t0 ^= t4<<19; t0 ^= t4<<20; t0 ^= t4<<21; t0 ^= t4<<28;

  w = t3>>13; t3 ^= w<<13; t0 ^= w; t0 ^= w<<1; t0 ^= w<<2; t0 ^= w<<9;

  { poly128 r;
    r.t = t3; r.h = t2; r.m = t1; r.l = t0;
    return r;
  } /* end block */

/*# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # */
#endif
/* defined(MMX) ... #else ... */

} /* end function product */


/*-- inverse ---------------------------------------------------------------*/

/* Useful macros:
 *   SHR(al, am, ah, at, sh, th) shifts at:ah:am:al right by sh bits.
 *     (where 0 < sh < 32 and th = 32-sh).
 *   SHR1(al, am, ah, at) shifts at:ah:am:al right by one bit.
 */

#define SHR(al, am, ah, at, sh, th) \
  do {                                  \
    al = al>>sh | am<<th;               \
    am = am>>sh | ah<<th;               \
    ah = ah>>sh | at<<th;               \
    at >>= sh;                          \
  } while (0)

#define SHR1(al, am, ah, at) \
  do {                                  \
    al = al>>1 | am<<31;                \
    am = am>>1 | ah<<31;                \
    ah = ah>>1 | at<<31;                \
    at >>= 1;                           \
  } while (0)

/* Invert y in the field GF(2^109) i.e., as a poly in (Z/2Z)[t]
 * reduced modulo t^109+t^9+t^2+t+1.  Degree y < 109, y != 0.
 * Note: 1/(1+t+t^2+t^9+O(t^10)) = 1+t+t^3+t^4+t^6+t^7+O(t^10)
 */
static poly128 inverse(poly128 y) {
  uint sh, th;
  u32 al,am,ah,at, bl,bm,bh,bt, t, ul,um,uh,ut, vl,vm,vh,vt;
  const u32 mt = 1UL<<13, ml = 519; /* Note: mm and mh are 0 */
  static const u8 shiftTab[64] =
  { 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
  , 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
  };
  static const u8 xorTab[64] =
  {  0, 27, 54, 45, 44, 55, 26,  1, 24,  3, 46, 53, 52, 47,  2, 25, 48, 43
  ,  6, 29, 28,  7, 42, 49, 40, 51, 30,  5,  4, 31, 50, 41, 32, 59, 22, 13
  , 12, 23, 58, 33, 56, 35, 14, 21, 20, 15, 34, 57, 16, 11, 38, 61, 60, 39
  , 10, 17,  8, 19, 62, 37, 36, 63, 18,  9
  };
  static const u32 xorTab2[64] =
  {     0, 13889, 27778, 23235, 22724, 28293, 13382,   519, 12360,  1545
  , 23754, 27275, 26764, 24269,  1038, 12879, 24720, 22225,  3090, 14931
  , 14420,  3605, 21718, 25239, 20696, 26265, 15450,  2587,  2076, 15965
  , 25758, 21215, 16608, 30369, 11362,  6691,  6180, 11877, 29862, 17127
  , 28840, 18153,  7210, 10859, 10348,  7725, 17646, 29359,  8304,  5681
  , 19698, 31411, 30900, 20213,  5174,  8823,  4152,  9849, 31930, 19195
  , 18684, 32445,  9342,  4671
  };


  /* Maintain: u.y = a and v.y = b modulo t^109+t^9+t^2+t+1. */

  al = y.l; am = y.m; ah = y.h; at = y.t;
  ul = 1; um = uh = ut = 0;

  while (!(al & 1)) {
    t = ul & 1; ut ^= t<<13; if (t) ul ^= ml;
    SHR1(al, am, ah, at);
    SHR1(ul, um, uh, ut);
  } /* end while */

  bl = ml; bm = bh = 0; bt = mt;
  vl = vm = vh = vt = 0;

  do {
    do {
      do {
        do {

          do {
            bl ^= al; bm ^= am; bh ^= ah; bt ^= at;
            vl ^= ul; vm ^= um; vh ^= uh; vt ^= ut;

            sh = shiftTab[bl & 63]; th = 32-sh;
            t = vl & 63; vt ^= (u32)xorTab[t]<<13; vl ^= xorTab2[t];
            SHR(bl, bm, bh, bt, sh, th);
            SHR(vl, vm, vh, vt, sh, th);
            while (!(bl & 1)) {
              t = vl & 1; vt ^= t<<13; if (t) vl ^= ml;
              SHR1(bl, bm, bh, bt);
	      SHR1(vl, vm, vh, vt);
            } /* end while */
          } while ( at < bt
                    || at == bt
                         && ( ah < bh
                              || ah == bh && (am < bm || am == bm && al < bl)
                            )
                  );

          if (al == bl && am == bm && ah == bh && at == bt) break;

          do {
            al ^= bl; am ^= bm; ah ^= bh; at ^= bt;
            ul ^= vl; um ^= vm; uh ^= vh; ut ^= vt;

            sh = shiftTab[al & 63]; th = 32-sh;
            t = ul & 63; ut ^= (u32)xorTab[t]<<13; ul ^= xorTab2[t];
            SHR(al, am, ah, at, sh, th);
            SHR(ul, um, uh, ut, sh, th);
            while (!(al & 1)) {
              t = ul & 1; ut ^= t<<13; if (t) ul ^= ml;
              SHR1(al, am, ah, at);
	      SHR1(ul, um, uh, ut);
            } /* end while */
          } while ( at > bt
                    || at == bt
                         && ( ah > bh
                              || ah == bh && (am > bm || am == bm && al > bl)
                            )
                  );

        } while (al != bl);
      } while (am != bm);
    } while (ah != bh);
  } while (at != bt);

  /* Now a (and b) must equal 1. */

  /* Reduce u modulo t^109+t^9+t^2+t+1. */
  t = ut>>13; ut ^= t<<13; ul ^= t; ul ^= t<<1; ul ^= t<<2; ul ^= t<<9;

  { poly128 r;
    r.t = ut; r.h = uh; r.m = um; r.l = ul;
    return r;
  } /* end block */
} /* end function inverse */

#undef SHR
#undef SHR1


/*-- quotient --------------------------------------------------------------*/

/* Divide x by y in the field GF(2^109) i.e., as polys in (Z/2Z)[t]
 * reduced modulo t^109+t^9+t^2+t+1.  Degree x, y < 109, y != 0.
 */
static INLINE poly128 quotient(poly128 x, poly128 y) {

  return product(x, inverse(y));
} /* end function quotient */


/*-= Arithmetic mod the group order =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/*-- productMod ------------------------------------------------------------*/

/* Return product x*y modulo m, 0 <= x, y < m <= 2^127.
 * Could this possibly be any slower?  =:-)
 */
static u128 productMod(u128 x, u128 y, u128 m) {
  uint i;
  u32 b, w, yy[3];
  u128 z;
  const u128 uZero = ZERO;

  yy[0] = y.l; yy[1] = y.m; yy[2] = y.h;
  w = y.t;
  for (i = 3; !w && i; ) w = yy[--i];
  if (!w) return uZero;

  b = topBit(w);
  z = x;
  while (b >>= 1) {
    z = doubleMod(z, m);
    if (w & b) z = sumMod(z, x, m);
  } /* end while */

  while (i) {
    b = 1UL<<31;
    w = yy[--i];
    do {
      z = doubleMod(z, m);
      if (w & b) z = sumMod(z, x, m);
      b >>= 1;
    } while (b);
  } /* end if */

  return z;
} /* end function productMod */


/*-- powerMod --------------------------------------------------------------*/

/* Return power x^y modulo m, 0 <= x < m <= 2^127, 0 <= y < 2^64.
 * Note that 0^0 returns 1 (even when m = 1).
 */
static u128 powerMod(u128 x, u64 y, u128 m) {
  u32 b, w;
  u128 z;
  const u128 uOne = ONE;

  w = y.hi;
  if (!w) { w = y.lo; if (!w) return uOne; }

  b = topBit(w);
  z = x;
  while (b >>= 1) {
    z = squareMod(z, m);
    if (w & b) z = productMod(z, x, m);
  } /* end while */

  if (y.hi) {
    w = y.lo;
    b = 1UL<<31;
    do {
      z = squareMod(z, m);
      if (w & b) z = productMod(z, x, m);
    } while (b >>= 1);
  } /* end if */

  return z;
} /* end function powerMod */


/*-- findMultiplier --------------------------------------------------------*/

/* Computes multiplier Prod_i pow(multipliers[i], cases[i]) modulo order. */
static u128 findMultiplier(u64 *cases) {
  uint i;
  u128 f;
  const u128 order = ORDER;
  /* Here multiplier[i] = (Frobenius to power i) + 1 modulo group order. */
  static const u128 multipliers[CASES] =
  { { 0x6D3, 0x25F3CBE9, 0x20245E26, 0x59822D94 }
  , { 0x6D3, 0x25F3CBE9, 0x20245E26, 0x59822D96 }
  , { 0xB86, 0x8E249C44, 0x9EDF28ED, 0x4BE9F47A }
  , { 0x92C, 0xDA0C3416, 0xDF81C389, 0xD2B6110D }
  , { 0x21F, 0xBDC2FB8D, 0xA169935F, 0x671A66B6 }
  , { 0xFC6, 0x09AA935F, 0xE1B24FAC, 0x1A1EC1D4 }
  , { 0xB86, 0x8E249C44, 0x9EDF28ED, 0x4BE9F46A }
  };

  f = powerMod(multipliers[0], cases[0], order);
  cases[0].hi = cases[0].lo = 0;

  for (i = 1; i < CASES; ++i) {
    f = productMod(f, powerMod(multipliers[i], cases[i], order), order);
    cases[i].hi = cases[i].lo = 0;
  } /* end for */

  return f;
} /* end function findMultiplier */


/*-= Arithmetic on elliptic curve over the field =-=-=-=-=-=-=-=-=-=-=-=-=-=*/

/*-- ellipticDouble --------------------------------------------------------*/

/* Given a point (x:y:z) on y^2 + x*y = x^3 + a*x^2 + b, this computes
 * its double (x2:y2:z2) by the group law.
 * It puts x2 in *px2, y2 in *py2 and returns z2.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticDouble
  ( poly128 x, poly128 y, int z, poly128 *px2, poly128 *py2
  ) {
  poly128 lam, x2,y2;
  const poly128 a = A, one = ONE, zero = ZERO;

  if (!z || equal(x, zero)) { *px2 = zero; *py2 = one; return 0; }

  lam = xor(x, quotient(y, x));
  x2 = xor(xor(square(lam), lam), a);
  y2 = xor(xor(product(lam, xor(x, x2)), x2), y);

  *px2 = x2; *py2 = y2; return 1;
} /* end function ellipticDouble */


/*-- ellipticSum -----------------------------------------------------------*/

/* Given points (x1:y1:z1) and (x2:y2:z2) on y^2 + x*y = x^3 + a*x^2 + b,
 * this computes their sum (x3:y3:z3) by the group law.
 * It puts x3 in *px3, y3 in *py3 and returns z3.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticSum
  ( poly128 x1, poly128 y1, int z1, poly128 x2, poly128 y2, int z2
  , poly128 *px3, poly128 *py3
  ) {
  poly128 lam, t, x3,y3;
  const poly128 a = A;

  if (!z1) { *px3 = x2; *py3 = y2; return z2; }
  if (!z2) { *px3 = x1; *py3 = y1; return z1; }

  if (equal(x1, x2)) {
    const poly128 zero = ZERO, one = ONE;

    if (equal(y1, y2)) return ellipticDouble(x1, y1, z1, px3, py3);
    *px3 = zero; *py3 = one; return 0;
  } /* end if */

  lam = quotient(xor(y1, y2), xor(x1, x2));
  t = xor(xor(xor(square(lam), lam), x2), a);
  x3 = xor(t, x1);
  y3 = xor(xor(product(lam, t), x3), y1);

  *px3 = x3; *py3 = y3; return 1;
} /* end function ellipticSum */


/*-- ellipticProduct -------------------------------------------------------*/

/* Given a point (x:y:z) on y^2 + x*y = x^3 + a*x^2 + b, this computes
 * its fac-th multiple (x2:y2:z2) by the group law.
 * It puts x2 in *px2, y2 in *py2 and returns z2.
 * The point at infinity is represented by (0:1:0).
 * Finite points are represented by (x:y:1).
 */
static int ellipticProduct
  ( poly128 x, poly128 y, int z, u128 fac, poly128 *px2, poly128 *py2
  ) {
  int z2;
  u32 f, i, m, tab[4];
  poly128 x2,y2;

  /* Get most significant 32-bit word of fac. */
  tab[0] = fac.l; tab[1] = fac.m; tab[2] = fac.h; tab[3] = fac.t;
  for (i = 3; !(f = tab[i]) && i; --i) ;

  /* Quick exit if fac was 0. */
  if (!f) {
    const poly128 zero = ZERO, one = ONE;

    *px2 = zero; *py2 = one; return 0;
  } /* end if */

  /* Run through bits of most significant word. */
  m = topBit(f);
  x2 = x; y2 = y; z2 = z;
  while (m >>= 1) {
    z2 = ellipticDouble(x2, y2, z2, &x2, &y2);
    if (f & m) z2 = ellipticSum(x, y, z, x2, y2, z2, &x2, &y2);
  } /* end while */

  /* Run through bits of less significant words. */
  while (i--) {
    f = tab[i];
    m = 1U<<31;
    do {
      z2 = ellipticDouble(x2, y2, z2, &x2, &y2);
      if (f & m) z2 = ellipticSum(x, y, z, x2, y2, z2, &x2, &y2);
      m >>= 1;
    } while (m);
  } /* end while */

  *px2 = x2; *py2 = y2; return z2;
} /* end function ellipticProduct */


/*-= Various file manipulation thingies =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/*-- syncAndCheck ----------------------------------------------------------*/

/* Check that *pu, *pv are in range, update them using cases[] and reset it.
 * Then check that x, y are in range and that [u]P+[v]Q = (x:y:1).
 * Note: used before reporting a distinguished point or saving state.
 */
static int syncAndCheck(u64 *cases, u128 *pu, u128 *pv, poly128 x, poly128 y) {
  int zc, zu, zv;
  u128 f;
  poly128 xc,yc, xu,yu, xv,yv;

  const int zP = 1, zQ = 1;
  const poly128 xP = XP, xQ = XQ, yP = YP, yQ = YQ;
  const u128 order = ORDER;


  if (ge(*pu, order)) return 0;
  if (ge(*pv, order)) return 0;

  f = findMultiplier(&cases[0]); /* Also resets cases[]. */
  *pu = productMod(f, *pu, order);
  *pv = productMod(f, *pv, order);

  if (x.t > 0x00001FFF) return 0;
  if (y.t > 0x00001FFF) return 0;

  zu = ellipticProduct(xP, yP, zP, *pu, &xu, &yu);
  zv = ellipticProduct(xQ, yQ, zQ, *pv, &xv, &yv);
  zc = ellipticSum(xu, yu, zu, xv, yv, zv, &xc, &yc);

  if (!equal(xc, x) || !equal(yc, y) || zc != 1) return 0;

  return 1;
} /* end function syncAndCheck */


/*-- reportDistinguished ---------------------------------------------------*/

/* Reporting of distinguished points back to headquarters. */
static void reportDistinguished
  ( modeType mode, char *argv[]
  , u64 iters, u128 u, u128 v, u64 total, double dStart
  ) {
  const char format[] =
    "ECC-L0|" VARIANT "|" VERSION "|\n"
    "ECC-L1|%03X%08X%08X%08X|%03X%08X%08X%08X|\n"
    "ECC-L2|%s|\n"
    "ECC-L3|%04X%08X|\n"
    "ECC-L4|%s|\n"
  , test_format[] =
    "TEST-L0|" VARIANT "|" VERSION "|\n"
    "TEST-L1|%03X%08X%08X%08X|%03X%08X%08X%08X|\n"
    "TEST-L2|%s|\n"
    "TEST-L3|%04X%08X|\n"
    "TEST-L4|%s|\n"
  ;
  int status;
  FILE *handle;
  time_t now; /* For time and date. */
  static char *buf = NULL;


  /** Allocate buffer first time around. **/

  if (!buf) {
    buf =
      malloc( strlen(format)    /* Upper bound on size of fixed text. */
              + 27 + 27         /* Two 27-digit hex numbers on line 1. */
              + strlen(argv[3]) /* The user ID on line 2. */
              + 12              /* One 12-digit hex number on line 3. */
              + strlen(argv[5]) /* The machine ID on line 4. */
              + 1               /* The terminating '\0'. */
            );
  } /* end if */


  /** Output the distinguished point to stdout and the points file. **/

  if (mode == Test) handle = NULL;
  else {
    handle = fopen(DIST_POINTS_FILENAME, "a");
    if (!handle) {
      printf( "Warning: couldn't fopen() file " DIST_POINTS_FILENAME
                " for appending\n"
              "         (errno = %d: %s).  Following point is not saved!\n"
            , errno, strerror(errno)
            );
    } /* end if */
  } /* end if */

  putchar('\n');
  if (handle) putc('\n', handle);

  /* Time and date. */
  now = time(NULL);
  if (now == -1) puts("Warning: can't determine time and date.");
  else {
    char *str1, *str2;

    str1 = mode == Test ? "Test" : "Distinguished";
    str2 = ctime(&now);
    printf( "%s point found at: %s", str1, str2);
    if (handle) fprintf(handle, "%s point found at: %s", str1, str2);
  } /* end if/else */

  /* The distinguished point. */
  printf( mode == Test ? test_format : format
        , u.t, u.h, u.m, u.l, v.t, v.h, v.m, v.l
        , argv[3], iters.hi, iters.lo, argv[5]
        );
  if (handle) { /* Note: mode != Test. */
    fprintf( handle, format
           , u.t, u.h, u.m, u.l, v.t, v.h, v.m, v.l
           , argv[3], iters.hi, iters.lo, argv[5]
           );
  } /* end if */

  if (handle) {
    status = fclose(handle);
    if (status == EOF) {
      printf( "Warning: fclose() of file " DIST_POINTS_FILENAME " failed\n"
              "         (errno = %d: %s).\n"
            , errno, strerror(errno)
            );
    } /* end if */
  } /* end if */


  /** Output extra verbosity to stdout. **/

  { double dIters, dTotal
         , dNow, dUserTime /* For timing. */
    ;

    /* Iteration counts. */
    dIters = (double)iters.hi*4294967296.0+(double)iters.lo;
    dTotal = (double)total.hi*4294967296.0+(double)total.lo;
    printf( "Iterations used = %g.\n"
            "Total iterations = %g.\n"
          , dIters, dTotal
          );

    /* Iteration rate. */
#if MACOS
    dNow = get_time();
#elif __CYGWIN__
    { struct tms tm;

      times(&tm);
      dNow = (double)tm.tms_utime/(double)CLK_TCK;
    } /* end block */
#else
    { struct rusage ru;

      getrusage(RUSAGE_SELF, &ru);
      dNow = (double)ru.ru_utime.tv_sec+(double)ru.ru_utime.tv_usec*0.000001;
    } /* end block */
#endif
    dUserTime = dNow-dStart;
    if (dUserTime) printf("Rate = %g per second.\n", dTotal/dUserTime);
  } /* end block */


  /** Send the distinguished point by email. **/

#if MACOS

  if (mode == Batch || mode == Mail || mode == Alt) {
    if (!buf) {
      puts( "Warning: couldn't allocate temp buffer for sending mail, "
            "send by hand!"
          );
    } else {
      sprintf( buf, format
             , u.t, u.h, u.m, u.l, v.t, v.h, v.m, v.l
             , argv[3], iters.hi, iters.lo, argv[5]
             );
      switch (mode){
      case Mail: destination = "ecdl2K-108@cristal.inria.fr"; break;
      case Alt:  destination = "ecdl2K-108@rupture.net"; break;
      default:   destination = ""; break;
      } /* end switch */
      resultsOut(buf);
    } /* end if/else */
  } /* end if */

#else
/* !MACOS */

  if (mode == Mail || mode == Alt) {

    handle = popen(SENDMAIL " -t", "w");
    if (handle == NULL) {
      puts("Warning: couldn't pipe to " SENDMAIL ", send by hand!");
    } else {
      int status;

      fprintf( handle
             , "To: %s\n\n"
             , mode == Mail
               ? "ecdl2K-108@cristal.inria.fr"
               : "ecdl2K-108@rupture.net"
             );

      /* The distinguished point.  Note: mode != Test. */
      fprintf( handle, format
             , u.t, u.h, u.m, u.l, v.t, v.h, v.m, v.l
             , argv[3], iters.hi, iters.lo, argv[5]
             );

      status = fflush(handle);
      if (status == EOF) {
        printf( "Warning: fflush() of pipe failed\n"
                "         (errno = %d: %s).\n"
              , errno, strerror(errno)
              );
      } /* end if */

      status = pclose(handle);
      if (status) {
        if (status == -1) {
          printf( "Warning: pclose() failed\n"
                  "         (errno = %d: %s).\n"
                , errno, strerror(errno)
                );
        } else printf("Warning: " SENDMAIL " returned status %d.\n", status);
      } /* end if */

    } /* end if/else */
  } /* end if (mode == Mail || mode == Alt) */

#endif
/* MACOS */


  /** Send the distinguished point by HTTP. **/

  if (mode == Http) {
    if (!buf) {
      puts( "Warning: couldn't allocate temp buffer for HTTP sending, "
            "send by hand!"
          );
    } else {
      sprintf( buf, format
             , u.t, u.h, u.m, u.l, v.t, v.h, v.m, v.l
             , argv[3], iters.hi, iters.lo, argv[5]
             );
      if (httpPost(buf) == -1) {
        puts( "Warning: an error occurred while sending point via HTTP, "
              "send by hand!"
            );
      }
#if MACOS
      else ++points_sent;
#endif
    } /* end if/else */
  } /* end if (mode == Http) */


  fflush(stdout);
} /* end function reportDistinguished */


/*-- u32ToBytes ------------------------------------------------------------*/

static void u32ToBytes(u32 data, u8 *p) {

  p[0] = data;
  p[1] = data>> 8;
  p[2] = data>>16;
  p[3] = data>>24;

} /* end function u32ToBytes */


/*-- bytesToU32 ------------------------------------------------------------*/

static u32 bytesToU32(u8 *p) {

  return p[0] | (u32)p[1]<<8 | (u32)p[2]<<16 | (u32)p[3]<<24;
} /* end function bytesToU32 */


/*-- writeState ------------------------------------------------------------*/

/* Write state out to file in a portable binary format. */
static void writeState
  ( u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT
  ) {
  FILE *handle;


  puts("Saving state to " SAVED_STATE_FILENAME ".");
  handle = fopen(SAVED_STATE_FILENAME, "wb");
  if (handle) {
    uint i;
    int status;
    u8 buf[72];

    for (i = 0; i < PARAL; ++i) {

#define OUT128(offset, X) \
  u32ToBytes(X.l, buf+offset); \
  u32ToBytes(X.m, buf+offset+4); \
  u32ToBytes(X.h, buf+offset+8); \
  u32ToBytes(X.t, buf+offset+12) /* ; */

      u32ToBytes(itersT[i].lo, buf);
      u32ToBytes(itersT[i].hi, buf+4);
      OUT128( 8, uT[i]);
      OUT128(24, vT[i]);
      OUT128(40, xT[i]);
      OUT128(56, yT[i]);

#undef OUT128

      if (fwrite((void *)buf, sizeof(buf), 1, handle) != 1) {
        printf( "Warning: couldn't fwrite() to saved state file\n"
                "         (errno = %d: %s).\n"
              , errno, strerror(errno)
              );
        break;
      } /* end if */
    } /* end for */

    status = fclose(handle);
    if (status == EOF) {
      printf( "Warning: fclose() of saved state file failed\n"
              "         (errno = %d: %s).\n"
            , errno, strerror(errno)
            );
    } /* end if */

  } else printf( "Warning: couldn't fopen() saved state file for writing\n"
                 "         (errno = %d: %s).\n"
               , errno, strerror(errno)
               );

  fflush(stdout);
} /* end function writeState */


/*-- readState -------------------------------------------------------------*/

/* Reads state back from file (if it exists) in a portable binary format
 * and returns number of valid items (i.e., u,v,x,y,z data) found.
 * Note: There is nothing to check for the itersT[] array.
 */
static uint readState
  ( u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT
  ) {
  uint i;
  FILE *handle;

  i = 0;
  handle = fopen(SAVED_STATE_FILENAME, "rb");
  if (handle) {
    int status;
    u8 buf[72];

    for (i = 0; i < PARAL; ++i) {
      int OK;

      if (fread((void *)buf, sizeof(buf), 1, handle) != 1) {
        if (feof(handle)) puts("Warning: short saved state file.");
        else {
          printf( "Warning: couldn't fread() from saved state file\n"
                  "         (errno = %d: %s).\n"
                , errno, strerror(errno)
                );
        } /* end if/else */
        break;
      } /* end if */

#define IN128(offset, X) \
  X.l = bytesToU32(buf+offset); \
  X.m = bytesToU32(buf+offset+4); \
  X.h = bytesToU32(buf+offset+8); \
  X.t = bytesToU32(buf+offset+12) /* ; */

      itersT[i].lo = bytesToU32(buf);
      itersT[i].hi = bytesToU32(buf+4);
      IN128( 8, uT[i]);
      IN128(24, vT[i]);
      IN128(40, xT[i]);
      IN128(56, yT[i]);

#undef IN128

      OK = 0;
      do {
        int z, zu, zv;
        poly128 x,y, xu,yu, xv,yv;

        const int zP = 1, zQ = 1;
        const poly128 xP = XP, xQ = XQ, yP = YP, yQ = YQ;
        const u128 order = ORDER;

        if (ge(uT[i], order)) break;
        if (ge(vT[i], order)) break;

        if (xT[i].t > 0x00001FFF) break;
        if (yT[i].t > 0x00001FFF) break;

        zu = ellipticProduct(xP, yP, zP, uT[i], &xu, &yu);
        zv = ellipticProduct(xQ, yQ, zQ, vT[i], &xv, &yv);
        z = ellipticSum(xu, yu, zu, xv, yv, zv, &x, &y);

        if (!equal(x, xT[i]) || !equal(y, yT[i]) || z != 1) break;

        OK = 1;
      } while (0);

      if (!OK) {
        puts("Warning: invalid starting point found in saved state!");
        break;
      } /* end if */

    } /* end for */

    printf( "Read %u of %u starting points from saved state.\n"
          , i, (uint)PARAL
          );

    status = fclose(handle);
    if (status == EOF) {
      printf( "Warning: fclose() of saved state file failed\n"
              "         (errno = %d: %s).\n"
            , errno, strerror(errno)
            );
    } /* end if */

  } else printf( "Warning: couldn't fopen() saved state file for reading\n"
                 "         (errno = %d: %s).\n"
                 "Note: 'No such file' is normal for the first run.\n"
               , errno, strerror(errno)
               );

  fflush(stdout);

  return i;
} /* end function readState */


/*-= Stuff for http mode =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/

/*-- httpPost --------------------------------------------------------------*/

/* Configuration */

#define HTTP_URL "http://cristal.inria.fr/bin/ecdl7"
#define HTTP_SERVER_HOST "cristal.inria.fr"
#define HTTP_SERVER_PORT 80

/* INET address of HTTP server or proxy */
static struct sockaddr_in http_addr;
/* Name of HTTP server or proxy */
static char * http_host = HTTP_SERVER_HOST;
/* Port number on HTTP server or proxy */
static int http_port = HTTP_SERVER_PORT;

/* Send a string back to the base using the HTTP POST protocol */

static int httpPost(char * payload)
{
  int s, retcode;
  FILE * f, * r;

  /* Connect to HTTP server or proxy */
  s = socket(AF_INET, SOCK_STREAM, 0);
  if (s == -1) {
    printf("Warning: socket() failed (errno = %d: %s).\n",
           errno, strerror(errno));
    return -1;
  }
  retcode = connect(s, (struct sockaddr *) &http_addr, sizeof(http_addr));
  if (retcode == -1) {
    printf("Warning: couldn't connect to host %s port %d (errno = %d: %s).\n",
           http_host, http_port, errno, strerror(errno));
    return -1;
  }
  /* Prepare to send data */
  f = fdopen(s, "wb");
  if (f == NULL) {
    fclose(f);
    printf("Warning: fdopen(socket, \"wb\") failed (errno = %d: %s).\n",
           errno, strerror(errno));
    return -1;
  }
  /* Send POST request + data */
  fprintf(f, "POST " HTTP_URL " HTTP/1.0\r\n");
  fprintf(f, "Host: %s:%d\r\n", HTTP_SERVER_HOST, HTTP_SERVER_PORT);
  fprintf(f, "Content-length: %d\r\n", strlen(payload));
  fprintf(f, "\r\n");
  fputs(payload, f);
  fflush(f);
  if (ferror(f)) {
    fclose(f);
    printf("Warning: error while sending data via HTTP to %s.\n", http_host);
    return -1;
  }
  /* Say that we're done sending data */
  shutdown(s, 1);
  /* Read response from server or proxy */
  r = fdopen(s, "rb");
  if (r == NULL) {
    close(s);
    printf("Warning: fdopen(socket, \"rb\") failed (errno = %d: %s).\n",
           errno, strerror(errno));
    return -1;
  }
  /* First line of response is the status line, with format
     HTTP-Version SP Status-Code SP Reason-Phrase CRLF. */
  retcode = -1;
  fscanf(r, "%*s %d ", &retcode);
  /* Discard remainder of response */
  fclose(f);
  fclose(r);
  /* Check status code -- should be 2xx */
  if (! (retcode >= 200 && retcode < 300)) {
    printf("Warning: Web server or proxy %s returned an error "
           "(status code = %d)\n",
           http_host, retcode);
    return -1;
  }
  return 0;
}


/*-- httpParseProxy --------------------------------------------------------*/

/* Parse proxy specification and store proxy host name and port number
 * in http_host and http_port */

static int httpParseProxy(char * proxy)
{
  http_host = malloc(strlen(proxy));
  if (http_host == NULL) {
    printf("Error: cannot allocate proxy host name\n");
    return -1;
  }
  if (sscanf(proxy, "%[^:]:%d", http_host, &http_port) != 2) {
    printf("Error: bad proxy specification `%s'\n", proxy);
  return -1;
  }
  return 0;
}


/*-- httpInit --------------------------------------------------------------*/

/* Initialization of the HTTP machinery.  Determine INET address of
 * server or proxy and store it in http_addr */

static int httpInit(void)
{
  struct hostent * entry;

  entry = gethostbyname(http_host);
  if (entry == NULL) {
    printf("Error: unknown host %s\n", http_host);
    return -1;
  }
  memset(&http_addr, 0, sizeof(struct sockaddr_in));
  http_addr.sin_family = AF_INET;
  memcpy(&http_addr.sin_addr.s_addr, entry->h_addr, entry->h_length);
  http_addr.sin_port = htons(http_port);
  return 0;
}


/*== end of file ecdl2K-108.32bit.c ========================================*/

