/* $NetBSD: memcmp.S,v 1.3 2018/07/09 06:07:06 ryo Exp $ */

/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Matt Thomas of 3am Software Foundry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

#include <machine/asm.h>

RCSID("$NetBSD: memcmp.S,v 1.3 2018/07/09 06:07:06 ryo Exp $")

ENTRY(memcmp)
	mov	x9, x0			/* stash src1 so x0 can hold the result */
	mov	x10, x1			/* stash src2 */
	mov	x0, xzr			/* default result: equal */
	cbz	x2, .Lmemcmp_ret	/* nothing to compare */
#ifdef _KERNEL
       cmp     x2, #6
       b.eq    .Lmemcmp_6bytes
#endif
	cmp	x2, #8
	b.ls	.Lmemcmp_lessthan8	/* 8 bytes or fewer: byte by byte */

	ands	x3, x9, #7		/* x3 = src1 % 8 */
	b.eq	.Lmemcmp_dword_loop	/* src1 already dword aligned */

/*
 * The src1 address is not dword aligned.  Back it up to a dword boundary
 * and compare the leading partial dword first.
 */
       add     x2, x2, x3              /* add unalignment to length */
       sub     x2, x2, #8              /* now subtract a dword */

       sub     x9, x9, x3              /* dword align src1 */

       ldr     x6, [x10], #8           /* load dword from src2 */
	sub	x10, x10, x3		/* back up src2 to mirror src1's alignment */
       lsl     x3, x3, #3              /* convert bytes to bits */
       ldr     x4, [x9], #8            /* load dword from src1 */
#ifdef __AARCH64EB__
       lsl     x4, x4, x3              /* discard leading bytes from data1 */
       lsr     x6, x6, x3              /* discard leading bytes from data2 */
       lsl     x6, x6, x3              /* get back bit position */
#else
       lsr     x4, x4, x3              /* discard leading bytes from data1 */
       lsl     x6, x6, x3              /* discard leading bytes from data2 */
       lsr     x6, x6, x3              /* get back bit position */
#endif
       subs    x0, x4, x6              /* compare data */
       b.ne    .Lmemcmp_last_compare   /* difference.  find it */
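
/*
 * A rough C model of the head comparison above (editorial sketch for the
 * little-endian case only; hypothetical names, not part of the build):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static int head_matches_le(const char *s1, const char *s2)
 *	{
 *		unsigned n = (uintptr_t)s1 & 7;		// 1-7 on this path
 *		uint64_t a, b;
 *		memcpy(&a, s1 - n, 8);			// aligned; n junk bytes low
 *		memcpy(&b, s2, 8);			// 8 valid bytes
 *		a >>= n * 8;				// drop the junk bytes
 *		b = (b << (n * 8)) >> (n * 8);		// keep only 8 - n bytes
 *		return a == b;				// else find the difference
 *	}
 */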

.Lmemcmp_dword_loop:
	subs	x2, x2, #8
	b.mi	.Lmemcmp_finish_dword	/* fewer than 8 bytes remain */
       ldr     x4, [x9], #8
       ldr     x6, [x10], #8
       subs    x0, x4, x6
       b.eq    .Lmemcmp_dword_loop     /* no difference.  go to loop */
       b       .Lmemcmp_last_compare   /* go find the difference. */

.Lmemcmp_finish_dword:
	/*
	 * We might have gotten here with nothing left.  If so, just bail.
	 */
	tst	x2, #7
	b.eq	.Lmemcmp_ret
	mov	x4, xzr			/* clear the data accumulators */
	mov	x6, xzr
	/*
	 * If at least a word (4 bytes) remains, load and consume it.
	 */
       tbz     x2, #2, .Lmemcmp_finish_word
       ldr     w4, [x9], #4
       ldr     w6, [x10], #4
#ifdef __AARCH64EB__
       lsl     x4, x4, #32             /* move to MSW */
       lsl     x6, x6, #32             /* move to MSW */
#endif

.Lmemcmp_finish_word:
	tbz	x2, #1, .Lmemcmp_finish_hword	/* skip if no halfword remains */
       ldrh    w5, [x9], #2
       ldrh    w7, [x10], #2
#ifdef __AARCH64EB__
       orr     x4, x4, x5, lsl #16
       orr     x6, x6, x7, lsl #16
#else
       orr     x4, x4, x5, lsl #32
       orr     x6, x6, x7, lsl #32
#endif

.Lmemcmp_finish_hword:
	tbz	x2, #0, .Lmemcmp_last_compare0	/* skip if no final byte */

       ldrb    w5, [x9]
       ldrb    w7, [x10]
#ifdef __AARCH64EB__
       orr     x4, x4, x5, lsl #8
       orr     x6, x6, x7, lsl #8
#else
       orr     x4, x4, x5, lsl #48
       orr     x6, x6, x7, lsl #48
#endif
       b       .Lmemcmp_last_compare0  /* go find the difference. */
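
/*
 * A rough C model of the tail gathering above (editorial sketch for the
 * little-endian case only; hypothetical names, not part of the build).
 * The last 1-7 bytes of each source are packed into a single dword, the
 * word lowest, so one subtraction in .Lmemcmp_last_compare0 can find any
 * difference:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static uint64_t gather_tail_le(const char *p, size_t len)
 *	{
 *		uint64_t v = 0;
 *		uint32_t w;
 *		uint16_t h;
 *		if (len & 4) { memcpy(&w, p, 4); p += 4; v = w; }
 *		if (len & 2) { memcpy(&h, p, 2); p += 2; v |= (uint64_t)h << 32; }
 *		if (len & 1) { v |= (uint64_t)(uint8_t)*p << 48; }
 *		return v;
 *	}
 */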

/*
 * Deal with fewer than 8 bytes: compare byte by byte.
 */
.Lmemcmp_lessthan8:
	sub	x2, x2, #1		/* count the first byte now */
1:	ldrb	w4, [x9], #1		/* load a byte from each string */
	ldrb	w5, [x10], #1
	subs	x2, x2, #1		/* more bytes to compare? */
	ccmp	x4, x5, #0, cs		/* if so, compare them; else force NE */
	b.eq	1b			/* loop while equal */
	sub	x0, x4, x5		/* difference of the last pair */
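
/*
 * The ccmp above does double duty: while the count has not wrapped (carry
 * still set from the subs), it compares the two bytes; once the count
 * wraps it forces the flags to "not equal" (#0) so the loop exits with
 * the final pair still in w4/w5.  Roughly, assuming len > 0 (an editorial
 * sketch with hypothetical names, not part of the build):
 *
 *	static int memcmp_bytes(const unsigned char *p,
 *	    const unsigned char *q, unsigned long len)
 *	{
 *		unsigned char a, b;
 *		unsigned long n = len - 1;
 *		do {
 *			a = *p++;
 *			b = *q++;
 *		} while (n-- != 0 && a == b);
 *		return a - b;
 *	}
 */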

.Lmemcmp_ret:
       ret

#ifdef _KERNEL
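/*
 * Kernel-only fast path for exactly 6 bytes (a common length in the
 * kernel, e.g. Ethernet addresses): load each source as a word plus a
 * halfword, merge, and fall into the final compare below.
 */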
.Lmemcmp_6bytes:
	ldr	w4, [x9], #4		/* src1: first 4 bytes... */
	ldrh	w5, [x9]		/* ...plus the last 2 */
#if __AARCH64EB__
	lsl	x4, x4, #32		/* big-endian: bytes 0-3 on top, */
	orr	x4, x4, x5, lsl #16	/* bytes 4-5 just below them */
#else
	orr	x4, x4, x5, lsl #32	/* little-endian: bytes 4-5 above 0-3 */
#endif
	ldr	w6, [x10], #4		/* src2: likewise */
	ldrh	w7, [x10]
#if __AARCH64EB__
	lsl	x6, x6, #32
	orr	x6, x6, x7, lsl #16
#else
	orr	x6, x6, x7, lsl #32
#endif
#endif /* _KERNEL */

/*
 * We have loaded the final bytes into x4 and x6 in host byte order.  Now we
 * have to figure out what the difference is (if any).  First we subtract;
 * bytes that are equal up to the first difference come out as 0.  To find
 * the first non-zero byte we byte-reverse (on little-endian) and use clz,
 * then mask the bit position down to the start of that byte and shift the
 * difference so the differing byte sits in the LSB.  Finally we compare the
 * dwords in big-endian order to get the sign, turn the borrow into a mask
 * of sign bits with csetm, and insert the difference byte so the result is
 * non-zero with the correct sign.
 */
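
/*
 * A rough C model of the sequence below (editorial sketch for the
 * little-endian case only; hypothetical names, not part of the build).
 * The caller guarantees a != b:
 *
 *	#include <stdint.h>
 *
 *	static int last_compare_le(uint64_t a, uint64_t b)
 *	{
 *		uint64_t diff = a - b;		// trailing equal bytes are 0
 *		unsigned bit = (unsigned)__builtin_ctzll(diff) & ~7u;
 *						// rev + clz + bfi in the asm
 *		uint64_t byte = (diff >> bit) & 0xff;	// non-zero by construction
 *		uint64_t sign = __builtin_bswap64(a) < __builtin_bswap64(b)
 *		    ? ~(uint64_t)0 : 0;		// subs + csetm cc
 *		return (int)(int64_t)((sign & ~(uint64_t)0xff) | byte);	// bfm
 *	}
 */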
.Lmemcmp_last_compare0:
       subs    x0, x4, x6
       b.eq    .Lmemcmp_ret
.Lmemcmp_last_compare:
#if __AARCH64EB__
       clz     x1, x0          /* find first non-zero byte */
       rev     x0, x0
#else
       rev     x1, x0
       clz     x1, x1          /* find first non-zero byte */
#endif
       bfi     x1, xzr, #0, #3 /* make it byte aligned */
       lsr     x1, x0, x1      /* shift to LSB */
#if __AARCH64EL__
       rev     x4, x4          /* byte reverse */
       rev     x6, x6          /* byte reverse */
#endif
       subs    x0, x4, x6
	csetm	x0, cc		/* all ones if x4 < x6, else all zeroes */
	bfm	x0, x1, #0, #7	/* insert the difference byte into the sign mask */
       ret
END(memcmp)