/*
 * nasd_cheops_nwayxor.c
 *
 * nWayXorN xors N input buffers into the destination buffer.
 * adapted from danner's longword_bxor code.
 *
 * Khalil Amiri, CMU ECE/SCS
 * Adapted from raidframe (Original Authors: Mark Holland, Daniel Stodolsky)
 */
/*
 * Copyright (c) of Carnegie Mellon University, 1995,1996,1997,1998,1999.
 *
 * Permission to reproduce, use, and prepare derivative works of
 * this software for internal use is granted provided the copyright
 * and "No Warranty" statements are included with all reproductions
 * and derivative works. This software may also be redistributed
 * without charge provided that the copyright and "No Warranty"
 * statements are included in all redistributions.
 *
 * NO WARRANTY. THIS SOFTWARE IS FURNISHED ON AN "AS IS" BASIS.
 * CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER
 * EXPRESSED OR IMPLIED AS TO THE MATTER INCLUDING, BUT NOT LIMITED
 * TO: WARRANTY OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY
 * OF RESULTS OR RESULTS OBTAINED FROM USE OF THIS SOFTWARE. CARNEGIE
 * MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT
 * TO FREEDOM FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 */


#include <nasd/nasd_options.h>
#include <nasd/nasd_cheops_types.h>
#include <nasd/nasd_cheops_common.h>

/*
 * __REGISTER__ is placed in front of the hot local declarations below.
 * It expands to the `register` storage class only when the build
 * defines ALLOW_REGISTER_DECLARATIONS; otherwise it expands to nothing.
 */
#ifdef ALLOW_REGISTER_DECLARATIONS
# define __REGISTER__ register
#else
# define __REGISTER__
#endif /* ALLOW_REGISTER_DECLARATIONS */

/*
 * Invocation counters: callcount[N] is bumped on every call to
 * _nasd_cheops_nWayXorN (N = 1..9). Slot 0 is not used by the routines
 * in this file. The increments are plain (non-atomic) int updates.
 */
static int callcount[10] = { 0 };

/*
 * _nasd_cheops_nWayXor1
 *
 * XORs a single source buffer into the destination buffer in place:
 *   dest[i] ^= src_rbs[0][i]      for 0 <= i < len
 *
 * src_rbs  - array whose first entry is the source buffer
 * dest_rb  - destination buffer; read and overwritten
 * len      - number of unsigned long words to process
 *
 * Both buffers are accessed as arrays of unsigned long.
 */
void
_nasd_cheops_nWayXor1(
  _nasd_cheops_buffer_t  **src_rbs,
  _nasd_cheops_buffer_t   *dest_rb,
  int                      len)
{
  __REGISTER__ unsigned long *in    = (unsigned long *) src_rbs[0];
  __REGISTER__ unsigned long *out   = (unsigned long *) dest_rb;
  __REGISTER__ unsigned long *limit = in + len;   /* one past last source word */
  __REGISTER__ unsigned long o0, o1, o2, o3, i0, i1, i2, i3;

  callcount[1] += 1;

  /* main body: four words per round, all loads issued before any store */
  for (; len >= 4; len -= 4, in += 4, out += 4) {
    o0 = out[0];
    o1 = out[1];
    o2 = out[2];
    o3 = out[3];
    i0 = in[0];
    i1 = in[1];
    i2 = in[2];
    i3 = in[3];
    out[0] = o0 ^ i0;
    out[1] = o1 ^ i1;
    out[2] = o2 ^ i2;
    out[3] = o3 ^ i3;
  }

  /* leftover words (fewer than four), bounded by the precomputed end */
  for (; in < limit; in++, out++) {
    *out ^= *in;
  }
}

/*
 * _nasd_cheops_nWayXor2
 *
 * XORs two source buffers into the destination buffer in place:
 *   dest[i] = dest[i] ^ src_rbs[0][i] ^ src_rbs[1][i]   for 0 <= i < len
 *
 * src_rbs  - array whose first two entries are the source buffers
 * dest_rb  - destination buffer; read and overwritten
 * len      - number of unsigned long words to process in each buffer
 *
 * All buffers are accessed as arrays of unsigned long. When len <= 0 no
 * buffer memory is touched.
 */
void
_nasd_cheops_nWayXor2(
  _nasd_cheops_buffer_t  **src_rbs,
  _nasd_cheops_buffer_t   *dest_rb,
  int                      len)
{
  __REGISTER__ unsigned long *dst = (unsigned long *) dest_rb;
  __REGISTER__ unsigned long *a   = dst;   /* read cursor over the old dest contents */
  __REGISTER__ unsigned long *b   = (unsigned long *) src_rbs[0];
  __REGISTER__ unsigned long *c   = (unsigned long *) src_rbs[1];
  unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
  
  callcount[2]++;
  /*
   * Align dest to a 32-byte boundary (cache line), one word at a time.
   * The loop is also bounded by len: without that bound a buffer shorter
   * than the distance to the boundary drove len negative, and the tail
   * loop below then ran far past the end of every buffer.
   */
  while (len > 0 && (((unsigned long) dst) & 0x1f))
    {
      *dst++ = *a++ ^ *b++ ^ *c++;
      len--;
    }
  /* unrolled body: four words per round; a remainder of exactly four
     words is left to the tail loop */
  while (len > 4 )
    {
      a0 = a[0]; len -= 4;
      a1 = a[1];
      a2 = a[2];
      a3 = a[3];  a += 4;
      b0 = b[0];
      b1 = b[1];
      b2 = b[2];
      b3 = b[3];
      /* start dual issue */
      a0 ^= b0; b0 =  c[0];
      b += 4;  a1 ^= b1;
      a2 ^= b2; a3 ^= b3;
      b1 =  c[1]; a0 ^= b0;
      b2 =  c[2]; a1 ^= b1;
      b3 =  c[3]; a2 ^= b2;
      dst[0] = a0; a3 ^= b3;
      dst[1] = a1; c += 4;
      dst[2] = a2;
      dst[3] = a3; dst += 4;
    }
  /* tail: remaining words; "> 0" so a non-positive len can never spin */
  while (len > 0)
    {
      *dst++ = *a++ ^ *b++ ^ *c++;
      len--;
    }
}

/*
 * LOAD_FIRST(_dst, _b): begin one four-word round.
 * Loads _dst[0..3] into a0..a3 and _b[0..3] into b0..b3, and subtracts
 * four from len.
 * note that first arg is not incremented but 2nd arg is.
 * The expansion site must have a0..a3, b0..b3 and len in scope.
 * Wrapped in do { } while (0) so the expansion is a single statement
 * (safe under an unbraced if/else); arguments are parenthesized.
 */
#define LOAD_FIRST(_dst,_b)                                             \
  do {                                                                  \
    a0 = (_dst)[0]; len -= 4;                                           \
    a1 = (_dst)[1];                                                     \
    a2 = (_dst)[2];                                                     \
    a3 = (_dst)[3];                                                     \
    b0 = (_b)[0];                                                       \
    b1 = (_b)[1];                                                       \
    b2 = (_b)[2];                                                       \
    b3 = (_b)[3];  (_b) += 4;                                           \
  } while (0)

/*
 * XOR_AND_LOAD_NEXT(_n): fold the pending source words b0..b3 into the
 * accumulators a0..a3, then load the next four words from _n into
 * b0..b3.
 * note: arg is incremented (by four words).
 * The expansion site must have a0..a3 and b0..b3 in scope.
 * Wrapped in do { } while (0) so the expansion is a single statement
 * (safe under an unbraced if/else); the argument is parenthesized.
 */
#define XOR_AND_LOAD_NEXT(_n)                                           \
  do {                                                                  \
    a0 ^= b0; b0 = (_n)[0];                                             \
    a1 ^= b1; b1 = (_n)[1];                                             \
    a2 ^= b2; b2 = (_n)[2];                                             \
    a3 ^= b3; b3 = (_n)[3];                                             \
    (_n) += 4;                                                          \
  } while (0)

/*
 * XOR_AND_STORE(_dst): fold the last pending source words b0..b3 into
 * the accumulators a0..a3 and store the four results to _dst[0..3].
 * arg is incremented (by four words).
 * The expansion site must have a0..a3 and b0..b3 in scope.
 * Wrapped in do { } while (0) so the expansion is a single statement
 * (safe under an unbraced if/else); the argument is parenthesized.
 */
#define XOR_AND_STORE(_dst)                                             \
  do {                                                                  \
    a0 ^= b0; (_dst)[0] = a0;                                           \
    a1 ^= b1; (_dst)[1] = a1;                                           \
    a2 ^= b2; (_dst)[2] = a2;                                           \
    a3 ^= b3; (_dst)[3] = a3;                                           \
    (_dst) += 4;                                                        \
  } while (0)


/*
 * _nasd_cheops_nWayXor3
 *
 * XORs three source buffers into the destination buffer in place:
 *   dest[i] ^= src_rbs[0][i] ^ src_rbs[1][i] ^ src_rbs[2][i]
 * for 0 <= i < len.
 *
 * src_rbs  - array whose first three entries are the source buffers
 * dest_rb  - destination buffer; read and overwritten
 * len      - number of unsigned long words to process in each buffer
 *
 * All buffers are accessed as arrays of unsigned long. When len <= 0 no
 * buffer memory is touched.
 */
void
_nasd_cheops_nWayXor3(
  _nasd_cheops_buffer_t  **src_rbs,
  _nasd_cheops_buffer_t   *dest_rb,
  int                      len)
{
  __REGISTER__ unsigned long *dst = (unsigned long *) dest_rb;
  __REGISTER__ unsigned long *b   = (unsigned long *) src_rbs[0];
  __REGISTER__ unsigned long *c   = (unsigned long *) src_rbs[1];
  __REGISTER__ unsigned long *d   = (unsigned long *) src_rbs[2];
  unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
  
  callcount[3]++;
  /*
   * Align dest to a 32-byte boundary (cache line), one word at a time.
   * Bounded by len so a buffer shorter than the distance to the
   * boundary cannot drive len negative and overrun in the tail loop.
   */
  while (len > 0 && (((unsigned long) dst) & 0x1f)) {
    *dst++ ^= *b++ ^ *c++ ^ *d++;
    len--;
  }
  /* unrolled body: four words per round (LOAD_FIRST subtracts 4 from len) */
  while (len > 4 ) {
    LOAD_FIRST(dst,b);
    XOR_AND_LOAD_NEXT(c);
    XOR_AND_LOAD_NEXT(d);
    XOR_AND_STORE(dst);
  }
  /* tail: remaining words; "> 0" so a non-positive len can never spin */
  while (len > 0) {
    *dst++ ^=  *b++ ^ *c++ ^ *d++;
    len--;
  }
}

/*
 * _nasd_cheops_nWayXor4
 *
 * XORs four source buffers into the destination buffer in place:
 *   dest[i] ^= src_rbs[0][i] ^ ... ^ src_rbs[3][i]    for 0 <= i < len
 *
 * src_rbs  - array whose first four entries are the source buffers
 * dest_rb  - destination buffer; read and overwritten
 * len      - number of unsigned long words to process in each buffer
 *
 * All buffers are accessed as arrays of unsigned long. When len <= 0 no
 * buffer memory is touched.
 */
void
_nasd_cheops_nWayXor4(
  _nasd_cheops_buffer_t  **src_rbs,
  _nasd_cheops_buffer_t   *dest_rb,
  int                      len)
{
  __REGISTER__ unsigned long *dst = (unsigned long *) dest_rb;
  __REGISTER__ unsigned long *b   = (unsigned long *) src_rbs[0];
  __REGISTER__ unsigned long *c   = (unsigned long *) src_rbs[1];
  __REGISTER__ unsigned long *d   = (unsigned long *) src_rbs[2];
  __REGISTER__ unsigned long *e   = (unsigned long *) src_rbs[3];
  unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
  
  callcount[4]++;
  /*
   * Align dest to a 32-byte boundary (cache line), one word at a time.
   * Bounded by len so a buffer shorter than the distance to the
   * boundary cannot drive len negative and overrun in the tail loop.
   */
  while (len > 0 && (((unsigned long) dst) & 0x1f)) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++;
    len--;
  }
  /* unrolled body: four words per round (LOAD_FIRST subtracts 4 from len) */
  while (len > 4 ) {
    LOAD_FIRST(dst,b);
    XOR_AND_LOAD_NEXT(c);
    XOR_AND_LOAD_NEXT(d);
    XOR_AND_LOAD_NEXT(e);
    XOR_AND_STORE(dst);
  }
  /* tail: remaining words; "> 0" so a non-positive len can never spin */
  while (len > 0) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++;
    len--;
  }
}

/*
 * _nasd_cheops_nWayXor5
 *
 * XORs five source buffers into the destination buffer in place:
 *   dest[i] ^= src_rbs[0][i] ^ ... ^ src_rbs[4][i]    for 0 <= i < len
 *
 * src_rbs  - array whose first five entries are the source buffers
 * dest_rb  - destination buffer; read and overwritten
 * len      - number of unsigned long words to process in each buffer
 *
 * All buffers are accessed as arrays of unsigned long. When len <= 0 no
 * buffer memory is touched.
 */
void
_nasd_cheops_nWayXor5(
  _nasd_cheops_buffer_t  **src_rbs,
  _nasd_cheops_buffer_t   *dest_rb,
  int                      len)
{
  __REGISTER__ unsigned long *dst = (unsigned long *) dest_rb;
  __REGISTER__ unsigned long *b   = (unsigned long *) src_rbs[0];
  __REGISTER__ unsigned long *c   = (unsigned long *) src_rbs[1];
  __REGISTER__ unsigned long *d   = (unsigned long *) src_rbs[2];
  __REGISTER__ unsigned long *e   = (unsigned long *) src_rbs[3];
  __REGISTER__ unsigned long *f   = (unsigned long *) src_rbs[4];
  unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
  
  callcount[5]++;
  /*
   * Align dest to a 32-byte boundary (cache line), one word at a time.
   * Bounded by len so a buffer shorter than the distance to the
   * boundary cannot drive len negative and overrun in the tail loop.
   */
  while (len > 0 && (((unsigned long) dst) & 0x1f)) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++;
    len--;
  }
  /* unrolled body: four words per round (LOAD_FIRST subtracts 4 from len) */
  while (len > 4 ) {
    LOAD_FIRST(dst,b);
    XOR_AND_LOAD_NEXT(c);
    XOR_AND_LOAD_NEXT(d);
    XOR_AND_LOAD_NEXT(e);
    XOR_AND_LOAD_NEXT(f);
    XOR_AND_STORE(dst);
  }
  /* tail: remaining words; "> 0" so a non-positive len can never spin */
  while (len > 0) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++;
    len--;
  }
}

/*
 * _nasd_cheops_nWayXor6
 *
 * XORs six source buffers into the destination buffer in place:
 *   dest[i] ^= src_rbs[0][i] ^ ... ^ src_rbs[5][i]    for 0 <= i < len
 *
 * src_rbs  - array whose first six entries are the source buffers
 * dest_rb  - destination buffer; read and overwritten
 * len      - number of unsigned long words to process in each buffer
 *
 * All buffers are accessed as arrays of unsigned long. When len <= 0 no
 * buffer memory is touched.
 */
void
_nasd_cheops_nWayXor6(
  _nasd_cheops_buffer_t  **src_rbs,
  _nasd_cheops_buffer_t   *dest_rb,
  int                      len)
{
  __REGISTER__ unsigned long *dst = (unsigned long *) dest_rb;
  __REGISTER__ unsigned long *b   = (unsigned long *) src_rbs[0];
  __REGISTER__ unsigned long *c   = (unsigned long *) src_rbs[1];
  __REGISTER__ unsigned long *d   = (unsigned long *) src_rbs[2];
  __REGISTER__ unsigned long *e   = (unsigned long *) src_rbs[3];
  __REGISTER__ unsigned long *f   = (unsigned long *) src_rbs[4];
  __REGISTER__ unsigned long *g   = (unsigned long *) src_rbs[5];
  unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
  
  callcount[6]++;
  /*
   * Align dest to a 32-byte boundary (cache line), one word at a time.
   * Bounded by len so a buffer shorter than the distance to the
   * boundary cannot drive len negative and overrun in the tail loop.
   */
  while (len > 0 && (((unsigned long) dst) & 0x1f)) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++;
    len--;
  }
  /* unrolled body: four words per round (LOAD_FIRST subtracts 4 from len) */
  while (len > 4 ) {
    LOAD_FIRST(dst,b);
    XOR_AND_LOAD_NEXT(c);
    XOR_AND_LOAD_NEXT(d);
    XOR_AND_LOAD_NEXT(e);
    XOR_AND_LOAD_NEXT(f);
    XOR_AND_LOAD_NEXT(g);
    XOR_AND_STORE(dst);
  }
  /* tail: remaining words; "> 0" so a non-positive len can never spin */
  while (len > 0) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++;
    len--;
  }
}

/*
 * _nasd_cheops_nWayXor7
 *
 * XORs seven source buffers into the destination buffer in place:
 *   dest[i] ^= src_rbs[0][i] ^ ... ^ src_rbs[6][i]    for 0 <= i < len
 *
 * src_rbs  - array whose first seven entries are the source buffers
 * dest_rb  - destination buffer; read and overwritten
 * len      - number of unsigned long words to process in each buffer
 *
 * All buffers are accessed as arrays of unsigned long. When len <= 0 no
 * buffer memory is touched.
 */
void
_nasd_cheops_nWayXor7(
  _nasd_cheops_buffer_t  **src_rbs,
  _nasd_cheops_buffer_t   *dest_rb,
  int                      len)
{
  __REGISTER__ unsigned long *dst = (unsigned long *) dest_rb;
  __REGISTER__ unsigned long *b   = (unsigned long *) src_rbs[0];
  __REGISTER__ unsigned long *c   = (unsigned long *) src_rbs[1];
  __REGISTER__ unsigned long *d   = (unsigned long *) src_rbs[2];
  __REGISTER__ unsigned long *e   = (unsigned long *) src_rbs[3];
  __REGISTER__ unsigned long *f   = (unsigned long *) src_rbs[4];
  __REGISTER__ unsigned long *g   = (unsigned long *) src_rbs[5];
  __REGISTER__ unsigned long *h   = (unsigned long *) src_rbs[6];
  unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
  
  callcount[7]++;
  /*
   * Align dest to a 32-byte boundary (cache line), one word at a time.
   * Bounded by len so a buffer shorter than the distance to the
   * boundary cannot drive len negative and overrun in the tail loop.
   */
  while (len > 0 && (((unsigned long) dst) & 0x1f)) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++;
    len--;
  }
  /* unrolled body: four words per round (LOAD_FIRST subtracts 4 from len) */
  while (len > 4 ) {
    LOAD_FIRST(dst,b);
    XOR_AND_LOAD_NEXT(c);
    XOR_AND_LOAD_NEXT(d);
    XOR_AND_LOAD_NEXT(e);
    XOR_AND_LOAD_NEXT(f);
    XOR_AND_LOAD_NEXT(g);
    XOR_AND_LOAD_NEXT(h);
    XOR_AND_STORE(dst);
  }
  /* tail: remaining words; "> 0" so a non-positive len can never spin */
  while (len > 0) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++;
    len--;
  }
}

/*
 * _nasd_cheops_nWayXor8
 *
 * XORs eight source buffers into the destination buffer in place:
 *   dest[i] ^= src_rbs[0][i] ^ ... ^ src_rbs[7][i]    for 0 <= i < len
 *
 * src_rbs  - array whose first eight entries are the source buffers
 * dest_rb  - destination buffer; read and overwritten
 * len      - number of unsigned long words to process in each buffer
 *
 * All buffers are accessed as arrays of unsigned long. When len <= 0 no
 * buffer memory is touched.
 */
void
_nasd_cheops_nWayXor8(
  _nasd_cheops_buffer_t  **src_rbs,
  _nasd_cheops_buffer_t   *dest_rb,
  int                      len)
{
  __REGISTER__ unsigned long *dst = (unsigned long *) dest_rb;
  __REGISTER__ unsigned long *b   = (unsigned long *) src_rbs[0];
  __REGISTER__ unsigned long *c   = (unsigned long *) src_rbs[1];
  __REGISTER__ unsigned long *d   = (unsigned long *) src_rbs[2];
  __REGISTER__ unsigned long *e   = (unsigned long *) src_rbs[3];
  __REGISTER__ unsigned long *f   = (unsigned long *) src_rbs[4];
  __REGISTER__ unsigned long *g   = (unsigned long *) src_rbs[5];
  __REGISTER__ unsigned long *h   = (unsigned long *) src_rbs[6];
  __REGISTER__ unsigned long *i   = (unsigned long *) src_rbs[7];
  unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
  
  callcount[8]++;
  /*
   * Align dest to a 32-byte boundary (cache line), one word at a time.
   * Bounded by len so a buffer shorter than the distance to the
   * boundary cannot drive len negative and overrun in the tail loop.
   */
  while (len > 0 && (((unsigned long) dst) & 0x1f)) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++;
    len--;
  }
  /* unrolled body: four words per round (LOAD_FIRST subtracts 4 from len) */
  while (len > 4 ) {
    LOAD_FIRST(dst,b);
    XOR_AND_LOAD_NEXT(c);
    XOR_AND_LOAD_NEXT(d);
    XOR_AND_LOAD_NEXT(e);
    XOR_AND_LOAD_NEXT(f);
    XOR_AND_LOAD_NEXT(g);
    XOR_AND_LOAD_NEXT(h);
    XOR_AND_LOAD_NEXT(i);
    XOR_AND_STORE(dst);
  }
  /* tail: remaining words; "> 0" so a non-positive len can never spin */
  while (len > 0) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++;
    len--;
  }
}


/*
 * _nasd_cheops_nWayXor9
 *
 * XORs nine source buffers into the destination buffer in place:
 *   dest[i] ^= src_rbs[0][i] ^ ... ^ src_rbs[8][i]    for 0 <= i < len
 *
 * src_rbs  - array whose first nine entries are the source buffers
 * dest_rb  - destination buffer; read and overwritten
 * len      - number of unsigned long words to process in each buffer
 *
 * All buffers are accessed as arrays of unsigned long. When len <= 0 no
 * buffer memory is touched.
 */
void
_nasd_cheops_nWayXor9(
  _nasd_cheops_buffer_t   **src_rbs,
  _nasd_cheops_buffer_t    *dest_rb,
  int                       len)
{
  __REGISTER__ unsigned long *dst = (unsigned long *) dest_rb;
  __REGISTER__ unsigned long *b   = (unsigned long *) src_rbs[0];
  __REGISTER__ unsigned long *c   = (unsigned long *) src_rbs[1];
  __REGISTER__ unsigned long *d   = (unsigned long *) src_rbs[2];
  __REGISTER__ unsigned long *e   = (unsigned long *) src_rbs[3];
  __REGISTER__ unsigned long *f   = (unsigned long *) src_rbs[4];
  __REGISTER__ unsigned long *g   = (unsigned long *) src_rbs[5];
  __REGISTER__ unsigned long *h   = (unsigned long *) src_rbs[6];
  __REGISTER__ unsigned long *i   = (unsigned long *) src_rbs[7];
  __REGISTER__ unsigned long *j   = (unsigned long *) src_rbs[8];
  unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
  
  callcount[9]++;
  /*
   * Align dest to a 32-byte boundary (cache line), one word at a time.
   * Bounded by len so a buffer shorter than the distance to the
   * boundary cannot drive len negative and overrun in the tail loop.
   */
  while (len > 0 && (((unsigned long) dst) & 0x1f)) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++;
    len--;
  }
  /* unrolled body: four words per round (LOAD_FIRST subtracts 4 from len) */
  while (len > 4 ) {
    LOAD_FIRST(dst,b);
    XOR_AND_LOAD_NEXT(c);
    XOR_AND_LOAD_NEXT(d);
    XOR_AND_LOAD_NEXT(e);
    XOR_AND_LOAD_NEXT(f);
    XOR_AND_LOAD_NEXT(g);
    XOR_AND_LOAD_NEXT(h);
    XOR_AND_LOAD_NEXT(i);
    XOR_AND_LOAD_NEXT(j);
    XOR_AND_STORE(dst);
  }
  /* tail: remaining words; "> 0" so a non-positive len can never spin */
  while (len > 0) {
    *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++;
    len--;
  }
}

/* Local Variables:  */
/* indent-tabs-mode: nil */
/* tab-width: 2 */
/* End: */
