/*      $NetBSD: bcopy.S,v 1.17 2021/08/09 19:57:58 andvar Exp $        */

/*
* Copyright (c) 2002 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Matthew Fredette.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

/*
* Copy routines for NetBSD/hppa.
*/

#undef _LOCORE
#define _LOCORE /* XXX fredette - unfortunate */

#if defined(SPCOPY) && !defined(_STANDALONE)

#include "opt_multiprocessor.h"

#include <machine/cpu.h>

#endif

#include <machine/asm.h>
#include <machine/frame.h>
#include <machine/reg.h>

#if defined(LIBC_SCCS) && !defined(lint)
RCSID("$NetBSD: bcopy.S,v 1.17 2021/08/09 19:57:58 andvar Exp $")
#endif /* LIBC_SCCS and not lint */

/*
* The stbys instruction is a little asymmetric.  When (%r2 & 3)
* is zero, stbys,b,m %r1, 4(%r2) works like stws,ma.  You
* might then wish that when (%r2 & 3) == 0, stbys,e,m %r1, -4(%r2)
* worked like stws,mb.  But it doesn't.
*
* This macro works around this problem.  It requires that %t2
* hold the number of bytes that will be written by this store
* (meaning that it ranges from one to four).
*
* Arguments:
*   r        register holding the word to store
*   dst_spc  space register for the destination
*   dst_off  register holding the destination offset; both stores
*            use a base-modifying completer, so it is updated by
*            whichever one executes
*
* Watch the delay-slot trickery here.  The comib is used to set
* up which instruction, either the stws or the stbys, is run
* in the delay slot of the b instruction:
*
*   - %t2 == 4: the comib is not taken, so the b executes with
*     the stws,mb in its delay slot and the stbys is skipped.
*   - %t2 != 4: the comib is taken with the b in its own delay
*     slot, so execution continues at the stbys, which thereby
*     becomes the instruction run in the delay slot of the b.
*
* Either way exactly one of the two stores is executed, and
* control resumes at the instruction following the macro.
*
* NOTE(review): this reading assumes the bare numeric branch
* targets (4) are displacements relative to the address of the
* branch plus 8 - confirm against the assembler in use.
*/
#define _STBYS_E_M(r, dst_spc, dst_off)                           \
       comib,<>        4, %t2, 4                               ! \
       b               4                                       ! \
       stws,mb         r, -4(dst_spc, dst_off)                 ! \
       stbys,e,m       r, 0(dst_spc, dst_off)

/*
* This macro does a bulk copy with no shifting.  cmplt and m are
* the completer and displacement multiplier, respectively, for
* the load and store instructions.
*
* Arguments:
*   src_spc, src_off  source space register and offset register
*   dst_spc, dst_off  destination space register and offset register
*   count             register holding the number of bytes that
*                     still have to be stored
*   cmplt             completer applied to every load and store
*                     (the users in this file pass ma for a forward
*                     copy and mb for a reverse copy)
*   m                 displacement multiplier: 1 for a forward copy,
*                     -1 for a reverse copy
*
* %t1 through %t4 are used as scratch registers.
*
* The expansion defines _LABEL(_loop16), _LABEL(_skip16),
* _LABEL(_loop4), _LABEL(_skip4), _LABEL(_do1), _LABEL(_loop1) and
* _LABEL(_skip1), so it can be expanded only once per _LABEL prefix.
* _LABEL(_do1) is the entry point of the byte-at-a-time copier and
* is branched to from other macros in this file.  The expansion
* ends by branching to _LABEL(_done), which the user of the macro
* must define.
*
* Statements are separated by `!', and labels are defined with the
* assembler's .label directive.
*/
#define _COPY(src_spc, src_off, dst_spc, dst_off, count, cmplt, m) \
                                                               ! \
       /*                                                      ! \
        * Loop storing 16 bytes at a time.  Since count        ! \
        * may be > INT_MAX, we have to be careful and          ! \
        * avoid comparisons that treat it as a signed          ! \
        * quantity, until after this loop, when count          ! \
        * is guaranteed to be less than 16.                    ! \
        */                                                     ! \
       comib,>>=,n     15, count, _LABEL(_skip16)              ! \
.label _LABEL(_loop16)                                          ! \
       addi            -16, count, count                       ! \
       ldws,cmplt      m*4(src_spc, src_off), %t1              ! \
       ldws,cmplt      m*4(src_spc, src_off), %t2              ! \
       ldws,cmplt      m*4(src_spc, src_off), %t3              ! \
       ldws,cmplt      m*4(src_spc, src_off), %t4              ! \
       stws,cmplt      %t1, m*4(dst_spc, dst_off)              ! \
       stws,cmplt      %t2, m*4(dst_spc, dst_off)              ! \
       stws,cmplt      %t3, m*4(dst_spc, dst_off)              ! \
       comib,<<        15, count, _LABEL(_loop16)              ! \
       stws,cmplt      %t4, m*4(dst_spc, dst_off)              ! \
.label _LABEL(_skip16)                                          ! \
                                                               ! \
       /* Loop storing 4 bytes at a time. */                   ! \
       addib,<,n       -4, count, _LABEL(_skip4)               ! \
.label _LABEL(_loop4)                                           ! \
       ldws,cmplt      m*4(src_spc, src_off), %t1              ! \
       addib,>=        -4, count, _LABEL(_loop4)               ! \
       stws,cmplt      %t1, m*4(dst_spc, dst_off)              ! \
.label _LABEL(_skip4)                                           ! \
       /* Restore the correct count. */                        ! \
       addi            4, count, count                         ! \
                                                               ! \
.label _LABEL(_do1)                                             ! \
                                                               ! \
       /* Loop storing 1 byte at a time. */                    ! \
       addib,<,n       -1, count, _LABEL(_skip1)               ! \
.label _LABEL(_loop1)                                           ! \
       ldbs,cmplt      m*1(src_spc, src_off), %t1              ! \
       addib,>=        -1, count, _LABEL(_loop1)               ! \
       stbs,cmplt      %t1, m*1(dst_spc, dst_off)              ! \
.label _LABEL(_skip1)                                           ! \
       /* Restore the correct count. */                        ! \
       b               _LABEL(_done)                           ! \
       addi            1, count, count

/*
* This macro is definitely strange.  It exists purely to
* allow the _COPYS macro to be reused, but because it
* requires this long attempt to explain it, I'm starting
* to doubt the value of that.
*
* Part of the expansion of the _COPYS macro below are loops
* that copy four words or one word at a time, performing shifts
* to get data to line up correctly in the destination buffer.
*
* The _COPYS macro is used when copying backwards, as well
* as forwards.  The 4-word loop always loads into %t1, %t2, %t3,
* and %t4 in that order.  This means that when copying forward,
* %t1 will have the word from the lowest address, and %t4 will
* have the word from the highest address.  When copying
* backwards, the opposite is true.
*
* The shift instructions need pairs of registers with adjacent
* words, with the register containing the word from the lowest
* address *always* coming first.  It is this asymmetry that
* gives rise to this macro - depending on which direction
* we're copying in, these ordered pairs are different.
*
* Fortunately, we can compute those register numbers at compile
* time, and assemble them manually into a shift instruction.
* That's what this macro does: it emits a single .word holding
* the hand-assembled instruction.
*
* This macro takes three arguments.  n ranges from 0 to 3 and
* is the "shift number", i.e., n = 0 means we're doing the
* shift for what will be the first store.
*
* m is the displacement multiplier from the _COPYS macro call.
* This is 1 for a forward copy and -1 for a backwards copy.
* So, the ((m + 1) / 2) term yields 0 for a backwards copy and
* 1 for a forward copy, and the ((m - 1) / 2) term yields
* 0 for a forward copy, and -1 for a backwards copy.
* These terms are used to discriminate the register computations
* below.
*
* t is the number of the register that receives the result of
* the shift; it is OR-ed unchanged into the low bits of the
* word.  _COPYS passes 1 for %r1, 22 for %t1, 21 for %t2 and
* 20 for %t3.
*
* The register numbers are computed as 19 + (3 - k) with k in
* the range 0 to 3, so that k = 3 selects register 19 (%t4) and
* k = 0 selects register 22 (%t1).
*
* When copying forward, then, the first register used with
* the first vshd will be 19 + (3 - ((0 - 1) & 3)), or %t4,
* which matches _COPYS' requirement that the word last loaded
* be in %t4.  The first register used for the second vshd
* will then "wrap" around to 19 + (3 - ((1 - 1) & 3)), or %t1.
* And so on to %t2 and %t3.
*
* When copying forward, the second register used with the first
* vshd will be 19 + (3 - ((0 + 0) & 3)), or %t1.  It will
* continue to be %t2, then %t3, and finally %t4.
*
* When copying backwards, the values for the first and second
* register for each vshd are reversed from the forwards case.
* (Symmetry reclaimed!)  Proving this is "left as an exercise
* for the reader" (remember the different discriminating values!)
*
* In the emitted word the first register of the pair is placed
* at bit position 16 and the second at bit position 21.
*/
#define _VSHD(n, m, t)                                            \
       .word (0xd0000000                                       | \
       ((19 + (3 - ((n - 1 * ((m + 1) / 2)) & 3))) << 16)      | \
       ((19 + (3 - ((n + 1 * ((m - 1) / 2)) & 3))) << 21)      | \
       (t))

/*
* This macro does a bulk copy with shifting.  cmplt and m are
* the completer and displacement multiplier, respectively, for
* the load and store instructions.  It is assumed that the
* word last loaded is already in %t4.
*
* Arguments:
*   src_spc, src_off  source space register and offset register;
*                     src_off must be 4-byte aligned on entry
*   dst_spc, dst_off  destination space register and offset register
*   count             register holding the number of bytes that
*                     still have to be stored
*   cmplt             completer applied to every load and store
*                     (the users in this file pass ma for a forward
*                     copy and mb for a reverse copy)
*   m                 displacement multiplier: 1 for a forward copy,
*                     -1 for a reverse copy
*
* The shift amount, in bits, must already have been loaded into
* %cr11 by the caller; it is read back at the end to work out how
* many source bytes are still sitting unconsumed in %t4.
*
* %r1 and %t1 through %t4 are used as scratch registers.
*
* The expansion defines _LABEL(S_loop16), _LABEL(S_skip16),
* _LABEL(S_loop4) and _LABEL(S_skip4), and finishes by branching
* to _LABEL(_do1), the byte-at-a-time copier defined by the _COPY
* macro, which therefore has to be expanded under the same _LABEL
* prefix.
*
* Statements are separated by `!', and labels are defined with the
* assembler's .label directive.
*/
#define _COPYS(src_spc, src_off, dst_spc, dst_off, count, cmplt, m) \
                                                               ! \
       /*                                                      ! \
        * Loop storing 16 bytes at a time.  Since count        ! \
        * may be > INT_MAX, we have to be careful and          ! \
        * avoid comparisons that treat it as a signed          ! \
        * quantity, until after this loop, when count          ! \
        * is guaranteed to be less than 16.                    ! \
        */                                                     ! \
       comib,>>=,n     15, count, _LABEL(S_skip16)             ! \
.label _LABEL(S_loop16)                                         ! \
       addi            -16, count, count                       ! \
       ldws,cmplt      m*4(src_spc, src_off), %t1              ! \
       ldws,cmplt      m*4(src_spc, src_off), %t2              ! \
       ldws,cmplt      m*4(src_spc, src_off), %t3              ! \
       _VSHD(0, m, 1)  /* vshd %t4, %t1, %r1 */                ! \
       ldws,cmplt      m*4(src_spc, src_off), %t4              ! \
       _VSHD(1, m, 22) /* vshd %t1, %t2, %t1 */                ! \
       _VSHD(2, m, 21) /* vshd %t2, %t3, %t2 */                ! \
       _VSHD(3, m, 20) /* vshd %t3, %t4, %t3 */                ! \
       stws,cmplt      %r1, m*4(dst_spc, dst_off)              ! \
       stws,cmplt      %t1, m*4(dst_spc, dst_off)              ! \
       stws,cmplt      %t2, m*4(dst_spc, dst_off)              ! \
       comib,<<        15, count, _LABEL(S_loop16)             ! \
       stws,cmplt      %t3, m*4(dst_spc, dst_off)              ! \
.label _LABEL(S_skip16)                                         ! \
                                                               ! \
       /* Loop storing 4 bytes at a time. */                   ! \
       addib,<,n       -4, count, _LABEL(S_skip4)              ! \
.label _LABEL(S_loop4)                                          ! \
       ldws,cmplt      m*4(src_spc, src_off), %t1              ! \
       _VSHD(0, m, 1)  /* into %r1 (1) */                      ! \
       copy            %t1, %t4                                ! \
       addib,>=        -4, count, _LABEL(S_loop4)              ! \
       stws,cmplt      %r1, m*4(dst_spc, dst_off)              ! \
.label _LABEL(S_skip4)                                          ! \
                                                               ! \
       /*                                                      ! \
        * We now need to "back up" src_off by the              ! \
        * number of bytes remaining in the FIFO                ! \
        * (i.e., the number of bytes remaining in %t4),        ! \
        * because (the correct) count still includes           ! \
        * these bytes, and we intend to keep it that           ! \
        * way, and finish with the single-byte copier.         ! \
        *                                                      ! \
        * The number of bytes remaining in the FIFO is         ! \
        * related to the shift count, so recover it,           ! \
        * restoring the correct count at the same time.        ! \
        * The shd by 3 turns the bit count read from           ! \
        * %cr11 back into a byte count.                        ! \
        */                                                     ! \
       mfctl   %cr11, %t1                                      ! \
       addi    4, count, count                                 ! \
       shd     %r0, %t1, 3, %t1                                ! \
                                                               ! \
       /*                                                      ! \
        * If we're copying forward, the shift count            ! \
        * is the number of bytes remaining in the              ! \
        * FIFO, and we want to subtract it from src_off.       ! \
        * If we're copying backwards, (4 - shift count)        ! \
        * is the number of bytes remaining in the FIFO,        ! \
        * and we want to add it to src_off.                    ! \
        *                                                      ! \
        * We observe that x + (4 - y) = x - (y - 4),           ! \
        * and introduce this instruction to add -4 when        ! \
        * m is -1, although this does mean one extra           ! \
        * instruction in the forward case.                     ! \
        */                                                     ! \
       addi    4*((m - 1) / 2), %t1, %t1                       ! \
                                                               ! \
       /* Now branch to the byte-at-a-time loop. */            ! \
       b       _LABEL(_do1)                                    ! \
       sub     src_off, %t1, src_off

/*
* This macro copies a region in the forward direction.
*
* Arguments:
*   src_spc, src_off  source space register and offset register
*   dst_spc, dst_off  destination space register and offset register
*   count             register holding the number of bytes to copy
*
* All three registers are modified.  %r1 and %t1 through %t4 are
* used as scratch, and %cr11 is written when the source and
* destination are not aligned alike.
*
* Fewer than 8 bytes are handed straight to the byte-at-a-time
* copier.  Otherwise the first, possibly partial, destination word
* is written with stbys,b,m and the remainder is handled by _COPY
* (same alignment) or _COPYS (different alignment).
*
* The expansion defines _LABEL(_shifting) and _LABEL(_shiftingrt)
* in addition to the labels defined by _COPY and _COPYS, and ends
* up at _LABEL(_done), which the user of the macro must define.
*
* Statements are separated by `!', and labels are defined with the
* assembler's .label directive.
*/
#define _COPY_FORWARD(src_spc, src_off, dst_spc, dst_off, count)  \
                                                               ! \
       /*                                                      ! \
        * Since in the shifting-left case we will              ! \
        * load 8 bytes before checking count, to               ! \
        * keep things simple, branch to the byte               ! \
        * copier unless we're copying at least 8.              ! \
        */                                                     ! \
       comib,>>,n      8, count, _LABEL(_do1)                  ! \
                                                               ! \
       /*                                                      ! \
        * Once we 4-byte align the source offset,              ! \
        * figure out how many bytes from the region            ! \
        * will be in the first 4-byte word we read.            ! \
        * Ditto for writing the destination offset.            ! \
        */                                                     ! \
       extru           src_off, 31, 2, %t1                     ! \
       extru           dst_off, 31, 2, %t2                     ! \
       subi            4, %t1, %t1                             ! \
       subi            4, %t2, %t2                             ! \
                                                               ! \
       /*                                                      ! \
        * Calculate the byte shift required.  A                ! \
        * positive value means a source 4-byte word            ! \
        * has to be shifted to the right to line up            ! \
        * as a destination 4-byte word.                        ! \
        */                                                     ! \
       sub             %t1, %t2, %t1                           ! \
                                                               ! \
       /* 4-byte align src_off. */                             ! \
       depi            0, 31, 2, src_off                       ! \
                                                               ! \
       /*                                                      ! \
        * It's somewhat important to note that this            ! \
        * code thinks of count as "the number of bytes         ! \
        * that haven't been stored yet", as opposed to         ! \
        * "the number of bytes that haven't been copied        ! \
        * yet".  The distinction is subtle, but becomes        ! \
        * apparent at the end of the shifting code, where      ! \
        * we "back up" src_off to correspond to count,         ! \
        * as opposed to flushing the FIFO.                     ! \
        *                                                      ! \
        * We calculated above how many bytes our first         ! \
        * store will store, so update count now.               ! \
        *                                                      ! \
        * If the shift is zero, strictly as an optimization    ! \
        * we use a copy loop that does no shifting.            ! \
        */                                                     ! \
       comb,<>         %r0, %t1, _LABEL(_shifting)             ! \
       sub             count, %t2, count                       ! \
                                                               ! \
       /* Load and store the first word. */                    ! \
       ldws,ma         4(src_spc, src_off), %t4                ! \
       stbys,b,m       %t4, 4(dst_spc, dst_off)                ! \
                                                               ! \
       /* Do the rest of the copy. */                          ! \
       _COPY(src_spc,src_off,dst_spc,dst_off,count,ma,1)       ! \
                                                               ! \
.label _LABEL(_shifting)                                        ! \
                                                               ! \
       /*                                                      ! \
        * If shift < 0, we need to shift words to the          ! \
        * left.  Since we can't do this directly, we           ! \
        * adjust the shift so it's a shift to the right        ! \
        * and load the first word into the high word of        ! \
        * the FIFO.  Otherwise, we load a zero into the        ! \
        * high word of the FIFO.                               ! \
        */                                                     ! \
       comb,<=         %r0, %t1, _LABEL(_shiftingrt)           ! \
       copy            %r0, %t3                                ! \
       addi            4, %t1, %t1                             ! \
       ldws,ma         4(src_spc, src_off), %t3                ! \
.label _LABEL(_shiftingrt)                                      ! \
                                                               ! \
       /*                                                      ! \
        * Turn the shift byte count into a bit count,          ! \
        * load the next word, set the Shift Amount             ! \
        * Register, and form and store the first word.         ! \
        */                                                     ! \
       sh3add          %t1, %r0, %t1                           ! \
       ldws,ma         4(src_spc, src_off), %t4                ! \
       mtctl           %t1, %cr11                              ! \
       vshd            %t3, %t4, %r1                           ! \
       stbys,b,m       %r1, 4(dst_spc, dst_off)                ! \
                                                               ! \
       /* Do the rest of the copy. */                          ! \
       _COPYS(src_spc,src_off,dst_spc,dst_off,count,ma,1)

/*
* This macro copies a region in the reverse direction, i.e.
* starting with the last byte and working down to the first.
*
* Arguments:
*   src_spc, src_off  source space register and offset register
*   dst_spc, dst_off  destination space register and offset register
*   count             register holding the number of bytes to copy
*
* src_off and dst_off are given as the start of the regions; the
* macro itself advances them to the end before copying.  All three
* registers are modified.  %r1 and %t1 through %t4 are used as
* scratch, and %cr11 is written when the source and destination
* are not aligned alike.
*
* Fewer than 8 bytes are handed straight to the byte-at-a-time
* copier.  Otherwise the first, possibly partial, destination word
* is written through _STBYS_E_M and the remainder is handled by
* _COPY (same alignment) or _COPYS (different alignment).
*
* The expansion defines _LABEL(_shifting) in addition to the
* labels defined by _COPY and _COPYS, and ends up at _LABEL(_done),
* which the user of the macro must define.
*
* Statements are separated by `!', and labels are defined with the
* assembler's .label directive.
*/
#define _COPY_REVERSE(src_spc, src_off, dst_spc, dst_off, count)  \
                                                               ! \
       /* Immediately add count to both offsets. */            ! \
       add     src_off, count, src_off                         ! \
       add     dst_off, count, dst_off                         ! \
                                                               ! \
       /*                                                      ! \
        * Since in the shifting-right case we                  ! \
        * will load 8 bytes before checking                    ! \
        * count, to keep things simple, branch                 ! \
        * to the byte copier unless we're                      ! \
        * copying at least 8 bytes.                            ! \
        */                                                     ! \
       comib,>>,n      8, count, _LABEL(_do1)                  ! \
                                                               ! \
       /*                                                      ! \
        * Once we 4-byte align the source offset,              ! \
        * figure out how many bytes from the region            ! \
        * will be in the first 4-byte word we read.            ! \
        * Ditto for writing the destination offset.            ! \
        * A low-order value of zero means a full word,         ! \
        * hence the conditionally nullified ldi 4.             ! \
        */                                                     ! \
       extru,<>        src_off, 31, 2, %t1                     ! \
       ldi             4, %t1                                  ! \
       extru,<>        dst_off, 31, 2, %t2                     ! \
       ldi             4, %t2                                  ! \
                                                               ! \
       /*                                                      ! \
        * Calculate the byte shift required.  A                ! \
        * positive value means a source 4-byte                 ! \
        * word has to be shifted to the right to               ! \
        * line up as a destination 4-byte word.                ! \
        */                                                     ! \
       sub             %t2, %t1, %t1                           ! \
                                                               ! \
       /*                                                      ! \
        * 4-byte align src_off, leaving it pointing            ! \
        * to the 4-byte word *after* the next word             ! \
        * we intend to load.                                   ! \
        *                                                      ! \
        * It's somewhat important to note that this            ! \
        * code thinks of count as "the number of bytes         ! \
        * that haven't been stored yet", as opposed to         ! \
        * "the number of bytes that haven't been copied        ! \
        * yet".  The distinction is subtle, but becomes        ! \
        * apparent at the end of the shifting code, where      ! \
        * we "back up" src_off to correspond to count,         ! \
        * as opposed to flushing the FIFO.                     ! \
        *                                                      ! \
        * We calculated above how many bytes our first         ! \
        * store will store, so update count now.               ! \
        *                                                      ! \
        * If the shift is zero, we use a copy loop that        ! \
        * does no shifting.  NB: unlike the forward case,      ! \
        * this is NOT strictly an optimization.  If the        ! \
        * SAR is zero the vshds do NOT do the right thing.     ! \
        * This is another asymmetry more or less the "fault"   ! \
        * of vshd.                                             ! \
        */                                                     ! \
       addi            3, src_off, src_off                     ! \
       sub             count, %t2, count                       ! \
       comb,<>         %r0, %t1, _LABEL(_shifting)             ! \
       depi            0, 31, 2, src_off                       ! \
                                                               ! \
       /* Load and store the first word. */                    ! \
       ldws,mb         -4(src_spc, src_off), %t4               ! \
       _STBYS_E_M(%t4, dst_spc, dst_off)                       ! \
                                                               ! \
       /* Do the rest of the copy. */                          ! \
       _COPY(src_spc,src_off,dst_spc,dst_off,count,mb,-1)      ! \
                                                               ! \
.label _LABEL(_shifting)                                        ! \
                                                               ! \
       /*                                                      ! \
        * If shift < 0, we need to shift words to the          ! \
        * left.  Since we can't do this directly, we           ! \
        * adjust the shift so it's a shift to the right        ! \
        * and load a zero into the low word of the FIFO.       ! \
        * Otherwise, we load the first word into the           ! \
        * low word of the FIFO.                                ! \
        *                                                      ! \
        * Note the nullification trickery here.  We            ! \
        * assume that we're shifting to the left, and          ! \
        * load zero into the low word of the FIFO.  Then       ! \
        * we nullify the addi if we're shifting to the         ! \
        * right.  If the addi is not nullified, we are         ! \
        * shifting to the left, so we nullify the load.        ! \
        * The comb only ever "branches" to the instruction     ! \
        * after its delay slot; it is used purely for its      ! \
        * ability to nullify the addi.                         ! \
        */                                                     ! \
       copy            %r0, %t3                                ! \
       comb,<=,n       %r0, %t1, 0                             ! \
       addi,tr         4, %t1, %t1                             ! \
       ldws,mb         -4(src_spc, src_off), %t3               ! \
                                                               ! \
       /*                                                      ! \
        * Turn the shift byte count into a bit count,          ! \
        * load the next word, set the Shift Amount             ! \
        * Register, and form and store the first word.         ! \
        */                                                     ! \
       sh3add          %t1, %r0, %t1                           ! \
       ldws,mb         -4(src_spc, src_off), %t4               ! \
       mtctl           %t1, %cr11                              ! \
       vshd            %t4, %t3, %r1                           ! \
       _STBYS_E_M(%r1, dst_spc, dst_off)                       ! \
                                                               ! \
       /* Do the rest of the copy. */                          ! \
       _COPYS(src_spc,src_off,dst_spc,dst_off,count,mb,-1)

/*
* For paranoia, when things aren't going well, enable this
* code to assemble byte-at-a-time-only copying.
*
* The condition below is currently 1, so these definitions are
* the ones in effect: they replace the _COPY_FORWARD and
* _COPY_REVERSE macros defined earlier in this file.
*
* Both replacements take the same arguments as the macros they
* replace, branch to _LABEL(_done) right away when count is zero,
* and otherwise move one byte per iteration through %r1 until
* count reaches zero.  Neither defines any label of its own; the
* loop branch is written as the bare displacement -12, which is
* meant to land on the ldbs immediately before the addib, with
* the stbs executing in the delay slot of the addib.
*
* NOTE(review): the -12 assumes displacements are relative to the
* address of the branch plus 8 - confirm against the assembler in
* use.
*/
#if 1
#undef _COPY_FORWARD
/* Forward: post-incrementing (,ma) byte loads and stores. */
#define _COPY_FORWARD(src_spc, src_off, dst_spc, dst_off, count)  \
       comb,=,n        %r0, count, _LABEL(_done)               ! \
       ldbs,ma         1(src_spc, src_off), %r1                ! \
       addib,<>        -1, count, -12                          ! \
       stbs,ma         %r1, 1(dst_spc, dst_off)                ! \
       b,n             _LABEL(_done)
#undef _COPY_REVERSE
/*
* Reverse: both offsets are first advanced by count, then
* pre-decrementing (,mb) byte loads and stores walk back down.
* The initial comb is not nullifying, so the first add sits in its
* delay slot; when count is zero that add is harmless.
*/
#define _COPY_REVERSE(src_spc, src_off, dst_spc, dst_off, count)  \
       comb,=          %r0, count, _LABEL(_done)               ! \
       add             src_off, count, src_off                 ! \
       add             dst_off, count, dst_off                 ! \
       ldbs,mb         -1(src_spc, src_off), %r1               ! \
       addib,<>        -1, count, -12                          ! \
       stbs,mb         %r1, -1(dst_spc, dst_off)               ! \
       b,n             _LABEL(_done)
#endif

/*
* If none of the following are defined, define BCOPY.
*
* SPCOPY, MEMCPY, MEMMOVE and BCOPY each enable one of the
* routines assembled below, so bcopy is what this file produces
* when no other routine has been requested.
*/
#if !(defined(SPCOPY) || defined(MEMCPY) || defined(MEMMOVE))
#define BCOPY
#endif

#if defined(SPCOPY) && !defined(_STANDALONE)

#include <sys/errno.h>
#include "assym.h"

/*
* int spcopy(pa_space_t ssp, const void *src, pa_space_t dsp, void *dst,
*      size_t len)
*
* We assume that the regions do not overlap.
*
* Copies len bytes from offset src in space ssp to offset dst in
* space dsp, always in the forward direction.
*
* Register usage, as visible below:
*   %arg0  ssp on entry; reloaded with len just before the copy
*   %arg1  src offset
*   %arg2  dsp
*   %arg3  dst offset
*   %r31   curlwp, then the value loaded from L_PCB(curlwp)
*   %ret1  saved value of %sr2, restored on the way out
*   %sr1   source space during the copy
*   %sr2   destination space during the copy
*
* Returns zero in %ret0 when the copy runs to completion.  The
* address of spcopy_fault is stored at PCB_ONFAULT for the duration
* of the copy; presumably a fault resumes execution there with
* %ret0 already filled in by the fault handler, so that the normal
* and the fault case share the same exit path.
*/
LEAF_ENTRY(spcopy)

       /*
        * Setup the fault handler, which will fill in %ret0 if triggered.
        */
       GET_CURLWP(%r31)
#ifdef  DIAGNOSTIC
       /*
        * Panic if curlwp is NULL: load the address of panic into
        * %r1 and of the message into %arg0, then jump to panic.
        * The check is skipped over when %r31 is non-zero.
        */
       comb,<>,n %r0, %r31, Lspcopy_curlwp_ok
       ldil    L%panic, %r1
       ldil    L%Lspcopy_curlwp_bad, %arg0
       ldo     R%panic(%r1), %r1
       ldo     R%Lspcopy_curlwp_bad(%arg0), %arg0
       .call
       bv,n    %r0(%r1)
       nop
Lspcopy_curlwp_bad:
       .asciz  "spcopy: curlwp == NULL\n"
       .align  8
Lspcopy_curlwp_ok:
#endif /* DIAGNOSTIC */
       /*
        * Build the address of spcopy_fault in %r1, replace
        * curlwp in %r31 with the word at L_PCB(curlwp), and
        * store the handler address at PCB_ONFAULT.
        */
       ldil    L%spcopy_fault, %r1
       ldw     L_PCB(%r31), %r31
       ldo     R%spcopy_fault(%r1), %r1
       stw     %r1, PCB_ONFAULT(%r31)

       /* Setup the space registers. */
       mfsp    %sr2, %ret1             /* save %sr2 for the exit path */
       mtsp    %arg0, %sr1             /* source space */
       mtsp    %arg2, %sr2             /* destination space */

       /* Get the len argument and do the copy. */
       ldw     HPPA_FRAME_ARG(4)(%sp), %arg0
#define _LABEL(l) __CONCAT(spcopy,l)
       _COPY_FORWARD(%sr1,%arg1,%sr2,%arg3,%arg0)
_LABEL(_done):

       /* Return. */
       copy    %r0, %ret0              /* success: return zero */
ALTENTRY(spcopy_fault)
       /*
        * Common exit: clear PCB_ONFAULT and return, restoring
        * %sr2 in the delay slot of the bv.  %ret0 is left alone
        * here, so it keeps whatever value it had on arrival.
        */
       stw     %r0, PCB_ONFAULT(%r31)
       bv      %r0(%rp)
       mtsp    %ret1, %sr2
EXIT(spcopy)
#endif /* SPCOPY && !_STANDALONE */

#ifdef MEMCPY
/*
* void *memcpy(void *restrict dst, const void *restrict src, size_t len);
*
* memcpy is specifically restricted to working on
* non-overlapping regions, so we can just copy forward.
*
* On entry %arg0 is dst, %arg1 is src and %arg2 is len.  All three
* are handed to _COPY_FORWARD as its working registers, with %sr0
* as the space for both source and destination, so dst is copied
* into %ret0 first to serve as the return value.
*/
LEAF_ENTRY(memcpy)
       copy    %arg0, %ret0            /* return value: original dst */
#define _LABEL(l) __CONCAT(memcpy,l)
       _COPY_FORWARD(%sr0,%arg1,%sr0,%arg0,%arg2)
_LABEL(_done):
       /* The copy macro ends up here; return to the caller. */
       bv,n    %r0(%rp)
       nop
EXIT(memcpy)
#endif /* MEMCPY */

#ifdef BCOPY
/*
* void bcopy(const void *src, void *dst, size_t len);
*
* bcopy takes its pointers in the opposite order from memmove.
* The entry code swaps %arg0 and %arg1 through %r1, leaving dst in
* %arg0 and src in %arg1, and then falls through into the code
* shared with memmove below.  No return value is set up.
*/
LEAF_ENTRY(bcopy)
       copy    %arg0, %r1              /* %r1 = src */
       copy    %arg1, %arg0            /* %arg0 = dst */
       copy    %r1, %arg1              /* %arg1 = src */
       /* FALLTHROUGH */
/* Label prefixes for the forward and reverse copy expansions. */
#define _LABEL_F(l) __CONCAT(bcopy_F,l)
#define _LABEL_R(l) __CONCAT(bcopy_R,l)
#endif /* BCOPY */

#ifdef MEMMOVE
/*
* void *memmove(void *dst, const void *src, size_t len);
*
* On entry %arg0 is dst, %arg1 is src and %arg2 is len.  dst is
* copied into %ret0 as the return value before the shared code
* below, which modifies %arg0, is entered by falling through.
*/
LEAF_ENTRY(memmove)
/* Label prefixes for the forward and reverse copy expansions. */
#define _LABEL_F(l) __CONCAT(memmove_F,l)
#define _LABEL_R(l) __CONCAT(memmove_R,l)
       copy    %arg0, %ret0            /* return value: original dst */
#endif /* MEMMOVE */

#if defined(BCOPY) || defined(MEMMOVE)

       /*
        * Shared body of bcopy and memmove, entered by falling
        * through from either entry point with dst in %arg0, src
        * in %arg1 and len in %arg2.
        *
        * If src >= dst or src + len <= dst, we copy
        * forward, else we copy in reverse.
        *
        * %r1 is set to src + len.  The first comb, when src >= dst,
        * nullifies the second comb and so drops into the forward
        * copy.  Otherwise the second comb goes to the reverse copy
        * when src + len > dst, nullifying the first instruction of
        * the forward copy that sits in its delay slot.  Both
        * comparisons are unsigned.
        */
       add             %arg1, %arg2, %r1
       comb,>>=,n      %arg1, %arg0, 0
       comb,>>,n       %r1, %arg0, _LABEL_R(_go)

       /* Forward copy, with its labels under the _F prefix. */
#define _LABEL _LABEL_F
       _COPY_FORWARD(%sr0,%arg1,%sr0,%arg0,%arg2)
#undef _LABEL

       /* Reverse copy, with its labels under the _R prefix. */
_LABEL_R(_go):
#define _LABEL _LABEL_R
       _COPY_REVERSE(%sr0,%arg1,%sr0,%arg0,%arg2)
#undef _LABEL

       /* Both copy directions finish here; return to the caller. */
_LABEL_F(_done):
_LABEL_R(_done):
       bv,n    %r0(%rp)
       nop
#ifdef BCOPY
EXIT(bcopy)
#else
EXIT(memmove)
#endif
#endif /* BCOPY || MEMMOVE */