#include <stdio.h>
#include <string.h>
#include <time.h>
/*
 * mmxcpy(dest, src, len) - copy memory in 32-byte chunks through MMX registers.
 *
 *   dest  destination buffer (any pointer type)
 *   src   source buffer (any pointer type); must not overlap dest
 *   len   byte count; only the largest multiple of 32 that fits in len is
 *         copied (len >> 5 chunks), any tail of 1..31 bytes is left untouched
 *
 * Statement-like macro; every argument is evaluated exactly once.
 *
 * Notes on the assembly:
 *  - The pointers and the chunk counter are read-write ("+r") operands held
 *    in private temporaries, because the loop advances/decrements them.
 *  - The loop label is a numeric local label ("1:" / "1b") so the macro can
 *    be expanded more than once per translation unit.
 *  - Suffix-less add/dec take their width from the register operand, so the
 *    same template assembles for both 32-bit and 64-bit x86.
 *  - The zero-chunk case is skipped in C; entering the dec/jnz loop with a
 *    zero counter would wrap around instead of terminating.
 *  - "movq (%0), %%mm4" reads the destination before it is overwritten and
 *    the value is never used; presumably it is there to pull the destination
 *    line into cache -- NOTE(review): confirm it still pays off.
 *  - "emms" resets the x87 tag word so floating-point code that runs after
 *    the copy keeps working.
 *  - MMX registers are named as clobbers only when the compiler has MMX code
 *    generation enabled (__MMX__); some compilers reject the names otherwise.
 *
 * On non-x86 targets the macro falls back to memcpy() with the same
 * "whole 32-byte chunks only" length rule.
 */
#if defined(__i386__) || defined(__x86_64__)
# if defined(__MMX__)
#  define MMXCPY_CLOBBERS "mm0", "mm1", "mm2", "mm3", "mm4", "memory", "cc"
# else
#  define MMXCPY_CLOBBERS "memory", "cc"
# endif
# define mmxcpy(dest, src, len) \
do { \
    unsigned char *mmxcpy_dst_ = (unsigned char *) (dest); \
    const unsigned char *mmxcpy_src_ = (const unsigned char *) (src); \
    size_t mmxcpy_chunks_ = (size_t) (len) >> 5; \
    if (mmxcpy_chunks_ != 0) { \
        __asm__ __volatile__ ( \
            "1:\n\t" \
            "movq (%1), %%mm0\n\t" \
            "movq 8(%1), %%mm1\n\t" \
            "movq 16(%1), %%mm2\n\t" \
            "movq 24(%1), %%mm3\n\t" \
            "movq (%0), %%mm4\n\t" \
            "movq %%mm0, (%0)\n\t" \
            "movq %%mm1, 8(%0)\n\t" \
            "movq %%mm2, 16(%0)\n\t" \
            "movq %%mm3, 24(%0)\n\t" \
            "add $32, %1\n\t" \
            "add $32, %0\n\t" \
            "dec %2\n\t" \
            "jnz 1b\n\t" \
            "emms\n\t" \
            : "+r" (mmxcpy_dst_), "+r" (mmxcpy_src_), "+r" (mmxcpy_chunks_) \
            : \
            : MMXCPY_CLOBBERS); \
    } \
} while (0)
#else
# define mmxcpy(dest, src, len) \
do { \
    memcpy((dest), (src), (size_t) (len) & ~(size_t) 31); \
} while (0)
#endif
/* Copy destination used by both timed loops in main(); 32-byte aligned. */
static __attribute__ ((__aligned__ (32))) unsigned char a[4000];
/* Copy source used by both timed loops in main(); 32-byte aligned. */
static __attribute__ ((__aligned__ (32))) unsigned char b[4000];
/*
 * Spin until the value returned by clock() changes, then return a fresh
 * reading, so that a timed region starts right after a clock tick.
 */
static clock_t wait_for_clock_tick(void)
{
    clock_t start = clock();

    while (clock() == start) {
        /* busy-wait for the next tick */
    }
    return clock();
}

/*
 * Benchmark driver: copies b into a 200000 times with mmxcpy() and then
 * 200000 times with memcpy(), printing the elapsed clock() ticks (raw
 * clock_t units, not seconds) for each variant.
 *
 * Returns 0.
 */
int main(void)
{
    enum { ITERATIONS = 200000 };
    clock_t t1, t2;
    int i;

    t1 = wait_for_clock_tick();
    for (i = 0; i < ITERATIONS; i++) {
        mmxcpy(a, b, sizeof a);
    }
    t2 = clock();
    /* clock_t is not guaranteed to be int; convert explicitly for printf. */
    printf("mmxcpy: %ld\n", (long) (t2 - t1));

    t1 = wait_for_clock_tick();
    for (i = 0; i < ITERATIONS; i++) {
        memcpy(a, b, sizeof a);
    }
    t2 = clock();
    printf("memcpy: %ld\n", (long) (t2 - t1));

    return 0;
}