/*      $NetBSD: rf_nwayxor.c,v 1.11 2006/11/16 01:33:23 christos Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
*
* Author: Mark Holland, Daniel Stodolsky
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
*  Software Distribution Coordinator  or  [email protected]
*  School of Computer Science
*  Carnegie Mellon University
*  Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/

/************************************************************
*
* nwayxor.c -- code to do N-way xors for reconstruction
*
* nWayXorN xors N input buffers into the destination buffer.
* adapted from danner's longword_bxor code.
*
************************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_nwayxor.c,v 1.11 2006/11/16 01:33:23 christos Exp $");

#include "rf_nwayxor.h"
#include "rf_shutdown.h"

/*
 * Invocation counters for the xor routines: rf_nWayXorN increments
 * callcount[N].  Slot 0 is not incremented by any routine visible in this
 * file.  The counters are zeroed by rf_ConfigureNWayXor and printed by
 * rf_ShutdownNWayXor.
 */
static int callcount[10];
/* Forward declaration of the shutdown hook that reports the counters. */
static void rf_ShutdownNWayXor(void *);

/*
 * Shutdown hook registered by rf_ConfigureNWayXor.  When
 * rf_showXorCallCounts is non-zero, prints the ten call counters on a
 * single line; otherwise does nothing.  The argument is not used.
 */
static void
rf_ShutdownNWayXor(void *ignored)
{
       int     slot = 0;

       if (rf_showXorCallCounts != 0) {
               printf("Call counts for n-way xor routines:  ");
               while (slot < 10) {
                       printf("%d ", callcount[slot]);
                       slot++;
               }
               printf("\n");
       }
}

/*
 * Module setup: clear every call counter, then register
 * rf_ShutdownNWayXor (with a NULL argument) on the caller's shutdown list.
 * Always returns 0.
 */
int
rf_ConfigureNWayXor(RF_ShutdownList_t **listp)
{
       int    *slot = callcount;
       int    *const limit = callcount + 10;

       while (slot != limit)
               *slot++ = 0;

       rf_ShutdownCreate(listp, rf_ShutdownNWayXor, NULL);
       return 0;
}

/*
 * XOR a single source buffer into the destination buffer:
 *     dest[i] ^= src_rbs[0][i]
 * for len unsigned-long words.  Words are processed four at a time, with
 * any remainder handled one word at a time.
 */
void
rf_nWayXor1(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len)
{
       unsigned long *in = (unsigned long *) src_rbs[0]->buffer;
       unsigned long *out = (unsigned long *) dest_rb->buffer;
       unsigned long *const in_end = in + len;
       unsigned long x0, x1, x2, x3;
       int     left;

       callcount[1]++;

       for (left = len; left >= 4; left -= 4) {
               /* fetch all eight words before any of them is written back */
               x0 = out[0];
               x1 = out[1];
               x2 = out[2];
               x3 = out[3];
               x0 ^= in[0];
               x1 ^= in[1];
               x2 ^= in[2];
               x3 ^= in[3];
               out[0] = x0;
               out[1] = x1;
               out[2] = x2;
               out[3] = x3;
               in += 4;
               out += 4;
       }

       /* leftover words (fewer than four) */
       for (; in < in_end; in++, out++)
               *out ^= *in;
}

/*
 * XOR two source buffers into the destination buffer:
 *     dest[i] ^= src_rbs[0][i] ^ src_rbs[1][i]
 * for len unsigned-long words.
 *
 * The work is split into three phases: single words until the destination
 * pointer reaches a 32-byte boundary (mask 0x1f), groups of four words,
 * then single words for whatever is left.  Every phase is bounded by len,
 * so a request shorter than the alignment gap never touches memory beyond
 * the len words it was asked to process.
 */
void
rf_nWayXor2(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len)
{
       unsigned long *dst = (unsigned long *) dest_rb->buffer;
       unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
       unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
       unsigned long a0, a1, a2, a3;

       callcount[2]++;
       /* align dest to cache line, but never consume more than len words */
       while (len > 0 && (((unsigned long) dst) & 0x1f)) {
               *dst++ ^= *b++ ^ *c++;
               len--;
       }
       while (len >= 4) {
               /* compute all four results before storing any of them */
               a0 = dst[0] ^ b[0] ^ c[0];
               a1 = dst[1] ^ b[1] ^ c[1];
               a2 = dst[2] ^ b[2] ^ c[2];
               a3 = dst[3] ^ b[3] ^ c[3];
               dst[0] = a0;
               dst[1] = a1;
               dst[2] = a2;
               dst[3] = a3;
               dst += 4;
               b += 4;
               c += 4;
               len -= 4;
       }
       /* remaining words; "> 0" so a non-positive len can never loop */
       while (len > 0) {
               *dst++ ^= *b++ ^ *c++;
               len--;
       }
}
/*
 * Building blocks for the four-words-per-pass loops in rf_nWayXor3 ..
 * rf_nWayXor9.  They are not self-contained: the expanding function must
 * have locals a0..a3 (running xor of the current four words), b0..b3
 * (the four words most recently loaded) and len in scope.
 *
 * Each macro is wrapped in do { } while (0) so that it behaves as a single
 * statement (e.g. under an unbraced if), and its arguments are
 * parenthesized.  Arguments that are advanced must be modifiable pointer
 * lvalues.
 */

/*
 * Load four destination words into a0..a3 and four words of _b into
 * b0..b3; subtracts 4 from len.
 * note that first arg is not incremented but 2nd arg is
 */
#define LOAD_FIRST(_dst,_b) do { \
 a0 = (_dst)[0]; len -= 4;   \
 a1 = (_dst)[1];             \
 a2 = (_dst)[2];             \
 a3 = (_dst)[3];             \
 b0 = (_b)[0];               \
 b1 = (_b)[1];               \
 b2 = (_b)[2];               \
 b3 = (_b)[3];  (_b) += 4;   \
} while (0)

/*
 * Fold b0..b3 into a0..a3, then reload b0..b3 from _n.
 * note: arg is incremented
 */
#define XOR_AND_LOAD_NEXT(_n) do { \
 a0 ^= b0; b0 = (_n)[0];     \
 a1 ^= b1; b1 = (_n)[1];     \
 a2 ^= b2; b2 = (_n)[2];     \
 a3 ^= b3; b3 = (_n)[3];     \
 (_n) += 4;                  \
} while (0)

/*
 * Fold the last loaded words into a0..a3 and write the result to _dst.
 * arg is incremented
 */
#define XOR_AND_STORE(_dst) do {  \
 a0 ^= b0; (_dst)[0] = a0;       \
 a1 ^= b1; (_dst)[1] = a1;       \
 a2 ^= b2; (_dst)[2] = a2;       \
 a3 ^= b3; (_dst)[3] = a3;       \
 (_dst) += 4;                    \
} while (0)


/*
 * XOR three source buffers into the destination buffer:
 *     dest[i] ^= src_rbs[0][i] ^ src_rbs[1][i] ^ src_rbs[2][i]
 * for len unsigned-long words.
 *
 * Phases: single words until dest reaches a 32-byte boundary (mask 0x1f),
 * groups of four words via the LOAD_FIRST / XOR_AND_LOAD_NEXT /
 * XOR_AND_STORE macros (LOAD_FIRST subtracts 4 from len), then single
 * words for the remainder.  Every phase is bounded by len so that short
 * requests on an unaligned destination stay within len words.
 */
void
rf_nWayXor3(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len)
{
       unsigned long *dst = (unsigned long *) dest_rb->buffer;
       unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
       unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
       unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
       unsigned long a0, a1, a2, a3, b0, b1, b2, b3;

       callcount[3]++;
       /* align dest to cache line, but never consume more than len words */
       while (len > 0 && (((unsigned long) dst) & 0x1f)) {
               *dst++ ^= *b++ ^ *c++ ^ *d++;
               len--;
       }
       while (len >= 4) {
               LOAD_FIRST(dst, b);
               XOR_AND_LOAD_NEXT(c);
               XOR_AND_LOAD_NEXT(d);
               XOR_AND_STORE(dst);
       }
       /* remaining words; "> 0" so a non-positive len can never loop */
       while (len > 0) {
               *dst++ ^= *b++ ^ *c++ ^ *d++;
               len--;
       }
}

/*
 * XOR four source buffers (src_rbs[0..3]) into the destination buffer,
 * word by word, for len unsigned-long words.
 *
 * Phases: single words until dest reaches a 32-byte boundary (mask 0x1f),
 * groups of four words via the LOAD_FIRST / XOR_AND_LOAD_NEXT /
 * XOR_AND_STORE macros (LOAD_FIRST subtracts 4 from len), then single
 * words for the remainder.  Every phase is bounded by len so that short
 * requests on an unaligned destination stay within len words.
 */
void
rf_nWayXor4(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len)
{
       unsigned long *dst = (unsigned long *) dest_rb->buffer;
       unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
       unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
       unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
       unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
       unsigned long a0, a1, a2, a3, b0, b1, b2, b3;

       callcount[4]++;
       /* align dest to cache line, but never consume more than len words */
       while (len > 0 && (((unsigned long) dst) & 0x1f)) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++;
               len--;
       }
       while (len >= 4) {
               LOAD_FIRST(dst, b);
               XOR_AND_LOAD_NEXT(c);
               XOR_AND_LOAD_NEXT(d);
               XOR_AND_LOAD_NEXT(e);
               XOR_AND_STORE(dst);
       }
       /* remaining words; "> 0" so a non-positive len can never loop */
       while (len > 0) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++;
               len--;
       }
}

/*
 * XOR five source buffers (src_rbs[0..4]) into the destination buffer,
 * word by word, for len unsigned-long words.
 *
 * Phases: single words until dest reaches a 32-byte boundary (mask 0x1f),
 * groups of four words via the LOAD_FIRST / XOR_AND_LOAD_NEXT /
 * XOR_AND_STORE macros (LOAD_FIRST subtracts 4 from len), then single
 * words for the remainder.  Every phase is bounded by len so that short
 * requests on an unaligned destination stay within len words.
 */
void
rf_nWayXor5(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len)
{
       unsigned long *dst = (unsigned long *) dest_rb->buffer;
       unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
       unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
       unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
       unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
       unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
       unsigned long a0, a1, a2, a3, b0, b1, b2, b3;

       callcount[5]++;
       /* align dest to cache line, but never consume more than len words */
       while (len > 0 && (((unsigned long) dst) & 0x1f)) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++;
               len--;
       }
       while (len >= 4) {
               LOAD_FIRST(dst, b);
               XOR_AND_LOAD_NEXT(c);
               XOR_AND_LOAD_NEXT(d);
               XOR_AND_LOAD_NEXT(e);
               XOR_AND_LOAD_NEXT(f);
               XOR_AND_STORE(dst);
       }
       /* remaining words; "> 0" so a non-positive len can never loop */
       while (len > 0) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++;
               len--;
       }
}

/*
 * XOR six source buffers (src_rbs[0..5]) into the destination buffer,
 * word by word, for len unsigned-long words.
 *
 * Phases: single words until dest reaches a 32-byte boundary (mask 0x1f),
 * groups of four words via the LOAD_FIRST / XOR_AND_LOAD_NEXT /
 * XOR_AND_STORE macros (LOAD_FIRST subtracts 4 from len), then single
 * words for the remainder.  Every phase is bounded by len so that short
 * requests on an unaligned destination stay within len words.
 */
void
rf_nWayXor6(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len)
{
       unsigned long *dst = (unsigned long *) dest_rb->buffer;
       unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
       unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
       unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
       unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
       unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
       unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
       unsigned long a0, a1, a2, a3, b0, b1, b2, b3;

       callcount[6]++;
       /* align dest to cache line, but never consume more than len words */
       while (len > 0 && (((unsigned long) dst) & 0x1f)) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++;
               len--;
       }
       while (len >= 4) {
               LOAD_FIRST(dst, b);
               XOR_AND_LOAD_NEXT(c);
               XOR_AND_LOAD_NEXT(d);
               XOR_AND_LOAD_NEXT(e);
               XOR_AND_LOAD_NEXT(f);
               XOR_AND_LOAD_NEXT(g);
               XOR_AND_STORE(dst);
       }
       /* remaining words; "> 0" so a non-positive len can never loop */
       while (len > 0) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++;
               len--;
       }
}

/*
 * XOR seven source buffers (src_rbs[0..6]) into the destination buffer,
 * word by word, for len unsigned-long words.
 *
 * Phases: single words until dest reaches a 32-byte boundary (mask 0x1f),
 * groups of four words via the LOAD_FIRST / XOR_AND_LOAD_NEXT /
 * XOR_AND_STORE macros (LOAD_FIRST subtracts 4 from len), then single
 * words for the remainder.  Every phase is bounded by len so that short
 * requests on an unaligned destination stay within len words.
 */
void
rf_nWayXor7(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len)
{
       unsigned long *dst = (unsigned long *) dest_rb->buffer;
       unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
       unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
       unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
       unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
       unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
       unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
       unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
       unsigned long a0, a1, a2, a3, b0, b1, b2, b3;

       callcount[7]++;
       /* align dest to cache line, but never consume more than len words */
       while (len > 0 && (((unsigned long) dst) & 0x1f)) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++;
               len--;
       }
       while (len >= 4) {
               LOAD_FIRST(dst, b);
               XOR_AND_LOAD_NEXT(c);
               XOR_AND_LOAD_NEXT(d);
               XOR_AND_LOAD_NEXT(e);
               XOR_AND_LOAD_NEXT(f);
               XOR_AND_LOAD_NEXT(g);
               XOR_AND_LOAD_NEXT(h);
               XOR_AND_STORE(dst);
       }
       /* remaining words; "> 0" so a non-positive len can never loop */
       while (len > 0) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++;
               len--;
       }
}

/*
 * XOR eight source buffers (src_rbs[0..7]) into the destination buffer,
 * word by word, for len unsigned-long words.
 *
 * Phases: single words until dest reaches a 32-byte boundary (mask 0x1f),
 * groups of four words via the LOAD_FIRST / XOR_AND_LOAD_NEXT /
 * XOR_AND_STORE macros (LOAD_FIRST subtracts 4 from len), then single
 * words for the remainder.  Every phase is bounded by len so that short
 * requests on an unaligned destination stay within len words.
 */
void
rf_nWayXor8(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len)
{
       unsigned long *dst = (unsigned long *) dest_rb->buffer;
       unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
       unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
       unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
       unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
       unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
       unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
       unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
       unsigned long *i = (unsigned long *) src_rbs[7]->buffer;
       unsigned long a0, a1, a2, a3, b0, b1, b2, b3;

       callcount[8]++;
       /* align dest to cache line, but never consume more than len words */
       while (len > 0 && (((unsigned long) dst) & 0x1f)) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++;
               len--;
       }
       while (len >= 4) {
               LOAD_FIRST(dst, b);
               XOR_AND_LOAD_NEXT(c);
               XOR_AND_LOAD_NEXT(d);
               XOR_AND_LOAD_NEXT(e);
               XOR_AND_LOAD_NEXT(f);
               XOR_AND_LOAD_NEXT(g);
               XOR_AND_LOAD_NEXT(h);
               XOR_AND_LOAD_NEXT(i);
               XOR_AND_STORE(dst);
       }
       /* remaining words; "> 0" so a non-positive len can never loop */
       while (len > 0) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++;
               len--;
       }
}


/*
 * XOR nine source buffers (src_rbs[0..8]) into the destination buffer,
 * word by word, for len unsigned-long words.
 *
 * Phases: single words until dest reaches a 32-byte boundary (mask 0x1f),
 * groups of four words via the LOAD_FIRST / XOR_AND_LOAD_NEXT /
 * XOR_AND_STORE macros (LOAD_FIRST subtracts 4 from len), then single
 * words for the remainder.  Every phase is bounded by len so that short
 * requests on an unaligned destination stay within len words.
 */
void
rf_nWayXor9(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len)
{
       unsigned long *dst = (unsigned long *) dest_rb->buffer;
       unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
       unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
       unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
       unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
       unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
       unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
       unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
       unsigned long *i = (unsigned long *) src_rbs[7]->buffer;
       unsigned long *j = (unsigned long *) src_rbs[8]->buffer;
       unsigned long a0, a1, a2, a3, b0, b1, b2, b3;

       callcount[9]++;
       /* align dest to cache line, but never consume more than len words */
       while (len > 0 && (((unsigned long) dst) & 0x1f)) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++;
               len--;
       }
       while (len >= 4) {
               LOAD_FIRST(dst, b);
               XOR_AND_LOAD_NEXT(c);
               XOR_AND_LOAD_NEXT(d);
               XOR_AND_LOAD_NEXT(e);
               XOR_AND_LOAD_NEXT(f);
               XOR_AND_LOAD_NEXT(g);
               XOR_AND_LOAD_NEXT(h);
               XOR_AND_LOAD_NEXT(i);
               XOR_AND_LOAD_NEXT(j);
               XOR_AND_STORE(dst);
       }
       /* remaining words; "> 0" so a non-positive len can never loop */
       while (len > 0) {
               *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++;
               len--;
       }
}