/* $NetBSD: memset.S,v 1.3 2020/04/11 05:12:52 ryo Exp $ */

/*-
 * Copyright (c) 2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

ENTRY(memset)
       cbz     x2, .Lret
       mov     x15, x0                 /* working data pointer */
        cbz     x1, .Lzerofill
       /*
        * Non zero fill, replicate to all 64 bits of x1.
        */
       and     x1, x1, #0xff
       orr     x1, x1, x1, lsl #8
       orr     x1, x1, x1, lsl #16
       orr     x1, x1, x1, lsl #32
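        /*
         * For illustration only: with c = 0xab the value grows
         * 0xab -> 0xabab -> 0xabababab -> 0xababababababababab,
         * so every byte lane of x1 now holds the fill byte.
         */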
.Lfilled:
       cmp     x2, #15                 /* if it's small, ignore alignment */
       b.ls    .Llast_subqword

       mov     x6, x1
       tst     x15, #15
       b.eq    .Lqword_loop

/*
 * We have at least 16 bytes to write, which means we can reach qword
 * alignment without having to check the amount left.
 */
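/*
 * For example: a pointer whose low four bits are 0b0101 takes the byte
 * store (-> 0b0110), the halfword store (-> 0b1000), skips the word
 * store, and takes the dword store (-> 0b10000), i.e. 1 + 2 + 8 = 11
 * bytes to reach 16-byte alignment.
 */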
       tbz     x15, #0, .Lhword_aligned
       strb    w1, [x15], #1
.Lhword_aligned:
       tbz     x15, #1, .Lword_aligned
       strh    w1, [x15], #2
.Lword_aligned:
       tbz     x15, #2, .Ldword_aligned
       str     w1, [x15], #4
.Ldword_aligned:
       tbz     x15, #3, .Lqword_aligned
       str     x1, [x15], #8
/*
 * Now we are qword aligned.  Figure out how much we had to write to get
 * here, then subtract that from the length.  If we get 0, we're done.
 */
.Lqword_aligned:
       sub     x5, x15, x0
       subs    x2, x2, x5
       b.eq    .Lret

/*
 * Write 16 bytes at a time.  If we don't have 16 bytes left to write,
 * bail.  Keep looping while there's data to set.
 */
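/*
 * The b.ne below relies on the flags set by the subs surviving the
 * stp; we fall through to the ret only when the count hit exactly zero.
 */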
.Lqword_loop:
       subs    x2, x2, #16
       b.mi    .Llast_subqword
       stp     x1, x6, [x15], #16
       b.ne    .Lqword_loop
       ret

/*
 * We have less than a qword to write.  We would like to be aligned,
 * but since unaligned accesses work, we don't have to be.
 */
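/*
 * Note that when we arrive here from the qword loop, x2 has gone
 * negative, but only bits 3..0 are tested below and those are unchanged
 * by the subtraction of 16: an original remainder of 13 (0b1101) still
 * selects the 8-, 4- and 1-byte stores.
 */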
.Llast_subqword:
       tbz     x2, #3, .Llast_subdword
       str     x1, [x15], #8
.Llast_subdword:
       tbz     x2, #2, .Llast_subword
       str     w1, [x15], #4
.Llast_subword:
       tbz     x2, #1, .Llast_subhword
       strh    w1, [x15], #2
.Llast_subhword:
       tbz     x2, #0, .Lret
       strb    w1, [x15]
.Lret:  ret

/*
 * If we are filling with zeros then let's see if we can use the
 *	dc zva, <Xt>
 * instruction to speed things up.
 */
.Lzerofill:
       mrs     x9, dczid_el0
        /*
         * Make sure use of the instruction isn't prohibited (the DZP bit).
         */
       tbnz    x9, #4, .Lfilled
       /*
        * Now find out the block size.
        */
       ubfx    x9, x9, #0, #4  /* extract low 4 bits */
       add     x9, x9, #2      /* add log2(word) */
       mov     x10, #1         /* the value is log2(words) */
       lsl     x10, x10, x9    /* shift to get the block size */
        cmp     x2, x10         /* are we even filling a block? */
       b.lt    .Lfilled        /*   no, do it 16 bytes at a time */
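        /*
         * For example (a common configuration, not guaranteed): a BS
         * field of 4 gives x9 = 4 + 2 = 6, so x10 = 1 << 6 = 64, i.e.
         * 64-byte blocks.
         */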
       /*
        * Now we figure out how many aligned blocks we have
        */
       sub     x11, x10, #1    /* make block size a mask */
       add     x12, x15, x11   /* round start to a block boundary */
       asr     x12, x12, x9    /* "starting" block number */
       add     x13, x15, x2    /* get ending address */
        asr     x13, x13, x9    /* "ending" block number */
       cmp     x13, x12        /* how many blocks? */
       b.ls    .Lfilled        /*   none, do it 16 bytes at a time */
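        /*
         * Illustrative check, assuming 64-byte blocks: for a buffer at
         * 0x1008 of length 0x100, x12 = (0x1008 + 0x3f) >> 6 = 0x41 and
         * x13 = (0x1008 + 0x100) >> 6 = 0x44; since 0x44 > 0x41, at
         * least one whole block lies inside the buffer.
         */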

       /*
        * Now we have one or more blocks to deal with.  First now we need
        * to get block aligned.
        */
        and     x7, x15, x11    /* are we already aligned on a block boundary? */
       cbz     x7, .Lblock_aligned

       sub     x7, x10, x7     /* subtract offset from block length */
       sub     x2, x2, x7      /* subtract that from length */
        asr     x7, x7, #4      /* bytes -> 16-byte chunks */
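        /*
         * x7 is now the number of whole 16-byte chunks between the next
         * 16-byte boundary and the block boundary; the sub-16-byte part
         * of the distance is consumed by the alignment stores below.
         */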

       tbz     x15, #0, .Lzero_hword_aligned
       strb    wzr, [x15], #1
.Lzero_hword_aligned:
       tbz     x15, #1, .Lzero_word_aligned
       strh    wzr, [x15], #2
.Lzero_word_aligned:
       tbz     x15, #2, .Lzero_dword_aligned
       str     wzr, [x15], #4
.Lzero_dword_aligned:
       tbz     x15, #3, .Lzero_qword_aligned
       str     xzr, [x15], #8
.Lzero_qword_aligned:
       cbz     x7, .Lblock_aligned     /* aligned? just branch */

       /* align to DCZID_EL0:BS boundary */
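        /*
         * Bit 0 of x7 selects one 16-byte store, bit 1 selects two, and
         * the remaining chunks (x7 >> 2) are written 64 bytes at a time
         * in the loop below.
         */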
        tbz     x7, #0, 0f              /* fill 16 bytes? */
       stp     xzr, xzr, [x15], #16
0:
        tbz     x7, #1, 1f              /* fill 32 bytes? */
       stp     xzr, xzr, [x15], #16
       stp     xzr, xzr, [x15], #16
1:
       lsr     x7, x7, #2
       cbz     x7, 9f
.L64bytes_fill:
       sub     x7, x7, #1
       stp     xzr, xzr, [x15], #16
       stp     xzr, xzr, [x15], #16
       stp     xzr, xzr, [x15], #16
       stp     xzr, xzr, [x15], #16
       cbnz    x7, .L64bytes_fill
9:

/*
 * Now we are block aligned.
 */
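/*
 * Each dc zva zeroes one whole block, typically without first pulling
 * the line into the cache, which is why it can beat the stp loop for
 * large zero fills.
 */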
.Lblock_aligned:
       subs    x2, x2, x10
       b.mi    .Lblock_done
       dc      zva, x15
       add     x15, x15, x10
       b.ne    .Lblock_aligned
       ret

.Lblock_done:
        and     x2, x2, x11     /* make the remainder positive again */
       mov     x6, xzr         /* fill 2nd xword */
       b       .Lqword_loop    /* and finish filling */
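/*
 * The masking above works because the block size is a power of two:
 * with 64-byte blocks and, say, 10 bytes left, x2 went to 10 - 64 = -54
 * in .Lblock_aligned, and -54 & 63 = 10 recovers the remaining count.
 */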

END(memset)