/*
* Written by J.T. Conklin <[email protected]>
* Public domain.
*/

#include <machine/asm.h>

#if defined(LIBC_SCCS)
       RCSID("$NetBSD: strcpy.S,v 1.2 2014/03/22 19:38:46 jakllsch Exp $")
#endif

/*
* This strcpy implementation copies a byte at a time until the
* source pointer is aligned to a word boundary; it then copies by
* words until it finds a word containing a zero byte, and finally
* copies by bytes until the end of the string is reached.
*
* While this may result in unaligned stores if the source and
* destination pointers are unaligned with respect to each other,
* it is still faster than either byte copies or the overhead of
* an implementation suitable for machines with strict alignment
* requirements.
*/

/*
 * char *strcpy(char *dst, const char *src)
 *
 * Arguments are read from the stack. After the pushl of %ebx below:
 *      4(%esp)  return address
 *      8(%esp)  dst
 *      12(%esp) src
 *
 * Register roles:
 *      %eax    source cursor (reloaded with dst just before returning)
 *      %ecx    destination cursor
 *      %ebx    byte/word currently being copied (saved and restored)
 *      %edx    scratch for the zero-byte test
 *
 * Returns the original dst pointer in %eax.
 *
 * All branch targets are assembler-local labels; each definition must
 * carry the same ".L" prefix that the branches use, otherwise the
 * targets are left undefined.
 */
ENTRY(strcpy)
       pushl   %ebx
       movl    8(%esp),%ecx            /* %ecx = dst */
       movl    12(%esp),%eax           /* %eax = src */

       /*
        * Align source to a word boundary.
        * Consider unrolling loop?
        */
       _ALIGN_TEXT
.Lalign:
       testl   $3,%eax                 /* low two bits clear => aligned */
       je      .Lword_aligned
       movb    (%eax),%bl
       incl    %eax
       movb    %bl,(%ecx)
       incl    %ecx
       testb   %bl,%bl                 /* was that the terminating NUL? */
       jne     .Lalign
       jmp     .Ldone

       /*
        * Word loop. The store of the previous word sits at the top so
        * that a word is only written once it is known to need no
        * byte-wise handling; the first pass enters at .Lword_aligned.
        */
       _ALIGN_TEXT
.Lloop:
       movl    %ebx,(%ecx)
       addl    $4,%ecx
.Lword_aligned:
       movl    (%eax),%ebx
       addl    $4,%eax
       /*
        * (word - 0x01010101) & 0x80808080 is non-zero whenever some
        * byte of the word is zero. It can also be non-zero when a byte
        * is 0x81 or greater, so a non-zero result only means "look at
        * the bytes individually".
        */
       leal    -0x01010101(%ebx),%edx
       testl   $0x80808080,%edx
       je      .Lloop

       /*
        * In rare cases, the above loop may exit prematurely. We must
        * return to the loop if none of the bytes in the word equal 0.
        *
        * Each byte is stored before it is tested, so the terminating
        * NUL is copied as well.
        */

       movb    %bl,(%ecx)              /* byte 0 */
       incl    %ecx
       testb   %bl,%bl
       je      .Ldone

       movb    %bh,(%ecx)              /* byte 1 */
       incl    %ecx
       testb   %bh,%bh
       je      .Ldone

       shrl    $16,%ebx                /* bring bytes 2 and 3 into %bl/%bh */
       movb    %bl,(%ecx)              /* byte 2 */
       incl    %ecx
       testb   %bl,%bl
       je      .Ldone

       movb    %bh,(%ecx)              /* byte 3 */
       incl    %ecx
       testb   %bh,%bh
       /*
        * All four bytes were non-zero and have already been stored;
        * %eax and %ecx have both advanced by 4, so resume with the
        * next word.
        */
       jne     .Lword_aligned

.Ldone:
       movl    8(%esp),%eax            /* return value: original dst */
       popl    %ebx
       ret
END(strcpy)