/*
 * Written by J.T. Conklin <jtc@acorntoolworks.com>
 * Public domain.
 */

#include <machine/asm.h>

/*
 * Emit the revision identification string only when LIBC_SCCS is defined.
 * NOTE(review): RCSID is not defined in this file; presumably it comes
 * from <machine/asm.h> - confirm.
 */
#if defined(LIBC_SCCS)
RCSID("$NetBSD: strcpy.S,v 1.2 2014/03/22 19:38:46 jakllsch Exp $")
#endif

/*
 * This strcpy implementation copies a byte at a time until the
 * source pointer is aligned to a word boundary.  It then copies by
 * words until it finds a word that may contain a zero byte, and
 * finally copies that word byte by byte until the end of the string
 * is reached.
 *
 * While this may result in unaligned stores if the source and
 * destination pointers are unaligned with respect to each other,
 * it is still faster than either byte copies or the overhead of
 * an implementation suitable for machines with strict alignment
 * requirements.
 */

/*
 * char *strcpy(char *dst, const char *src)
 *
 * i386, AT&T syntax; both arguments are read from the stack.
 * After the pushl of %ebx below:
 *	 8(%esp) = dst
 *	12(%esp) = src
 *
 * Register use:
 *	%eax	source cursor (reloaded with dst just before returning)
 *	%ecx	destination cursor
 *	%ebx	byte or word currently being copied (saved and restored)
 *	%edx	scratch for the zero-byte test
 *
 * Returns dst in %eax.  Modifies %ecx, %edx and the flags.
 */
ENTRY(strcpy)
	pushl	%ebx
	movl	8(%esp),%ecx		/* %ecx = dst */
	movl	12(%esp),%eax		/* %eax = src */

	/*
	 * Align source to a word boundary: copy one byte at a time
	 * until the low two bits of the source pointer are clear, and
	 * finish early if the terminating NUL is copied first.
	 * Consider unrolling loop?
	 */
	_ALIGN_TEXT
.Lalign:
	testl	$3,%eax
	je	.Lword_aligned
	movb	(%eax),%bl
	incl	%eax
	movb	%bl,(%ecx)
	incl	%ecx
	testb	%bl,%bl
	jne	.Lalign
	jmp	.Ldone

	/*
	 * Word loop.  The source pointer is 4-byte aligned here, so a
	 * load may read up to three bytes beyond the terminator but
	 * never outside the aligned word that contains it.
	 *
	 * (word - 0x01010101) & 0x80808080 is non-zero whenever some
	 * byte of the word is zero.  A word is stored whole only when
	 * the test yields zero; otherwise it is handled byte by byte
	 * below.
	 */
	_ALIGN_TEXT
.Lloop:
	movl	%ebx,(%ecx)
	addl	$4,%ecx
.Lword_aligned:
	movl	(%eax),%ebx
	addl	$4,%eax
	leal	-0x01010101(%ebx),%edx
	testl	$0x80808080,%edx
	je	.Lloop

	/*
	 * In rare cases, the above loop may exit prematurely: bytes
	 * with the high bit set can also trip the test.  Copy the four
	 * bytes of the word individually, lowest address first, and
	 * return to the loop if none of them equals 0.
	 */

	movb	%bl,(%ecx)		/* byte 0 */
	incl	%ecx
	testb	%bl,%bl
	je	.Ldone

	movb	%bh,(%ecx)		/* byte 1 */
	incl	%ecx
	testb	%bh,%bh
	je	.Ldone

	shrl	$16,%ebx		/* bytes 2 and 3 -> %bl and %bh */
	movb	%bl,(%ecx)		/* byte 2 */
	incl	%ecx
	testb	%bl,%bl
	je	.Ldone

	movb	%bh,(%ecx)		/* byte 3 */
	incl	%ecx
	testb	%bh,%bh
	jne	.Lword_aligned		/* false positive: resume word loop */

.Ldone:
	movl	8(%esp),%eax		/* return the original dst */
	popl	%ebx
	ret
END(strcpy)