/*      $NetBSD: muldi3.S,v 1.2 2020/05/31 12:37:07 rin Exp $   */

/*
* Copyright (c) 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Rin Okuyama.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

#include <machine/asm.h>

RCSID("$NetBSD: muldi3.S,v 1.2 2020/05/31 12:37:07 rin Exp $")

| int64_t __muldi3(int64_t X, int64_t Y);
|
| * Return the lower 64 bits of (X * Y) in %d0:%d1.
|
| * Intended for 68060:
|   - GCC does not emit calls to __muldi3() for 68020-40, which have a
|     32 * 32 --> 64 mulul instruction.
|   - mulsl (and moveml) are not implemented for 68010.
|
| * Notation:
|   - H32:L32 --> higher:lower 32bit of variable
|   - H:L     --> higher:lower 16bit of variable/register

/* Refuse to build for the 68010: __muldi3 below uses mulsl. */
#ifdef __mc68010__
#error "not for 68010"
#endif

/*
 * Offsets of the argument halves from %sp, valid once __muldi3 has pushed
 * %d2-%d4.  The base of 4 longwords is the three saved registers plus the
 * return address.  Each 64-bit argument is laid out with its higher 32 bits
 * at the lower address, and Y directly follows X.
 */
#define X_H32 (4 * 4)
#define X_L32 (X_H32 + 4)
#define Y_H32 (X_L32 + 4)
#define Y_L32 (Y_H32 + 4)

/*
 * __muldi3: multiply two 64-bit integers, keeping the lower 64 bits.
 *
 * In:   X and Y on the stack, each as a higher/lower pair of 32-bit words,
 *       read through the X_H32, X_L32, Y_H32 and Y_L32 offsets.
 * Out:  %d0 = higher 32 bits, %d1 = lower 32 bits of the product.
 * Regs: %d0, %d1, %a0 and %a1 are overwritten; %d2-%d4 are saved on entry
 *       and restored before returning.
 *
 * Method:
 *   X * Y mod 2^64 =
 *       X_L32 * Y_L32 + ((X_L32 * Y_H32 + X_H32 * Y_L32) << 32)
 *
 * The X_H32 * Y_H32 term is shifted out of the result entirely.  The first
 * term is needed as a full 64-bit value and is assembled from four
 * 16 x 16 --> 32 partial products (A, B, C, D below).  Only the lower
 * 32 bits of the two cross terms contribute, so a 32 x 32 --> 32 multiply
 * is enough for them.
 */
ENTRY(__muldi3)
       moveml  %d2-%d4, -(%sp) | push %d2-%d4

| First, calculate (X_L32 * Y_L32) as a 64bit integer.

       /* Keep both lower words in address registers; they are needed again
        * for the cross terms after every data register has been reused. */
       movel   X_L32(%sp), %a0 | save X_L32
       movel   Y_L32(%sp), %a1 | save Y_L32

       movel   %a0, %d2        | prepare for X_L32(H) in L
       movel   %a1, %d3        | prepare for Y_L32(H) in L

       movel   %a0, %d4        | X_L32(L) in L
       movel   %a1, %d1        | Y_L32(L) in L
       movel   %a0, %d0        | X_L32(L) in L

       swap    %d2             | X_L32(H) in L
       swap    %d3             | Y_L32(H) in L

       /*
        * Four unsigned 16 x 16 --> 32 partial products.  muluw only reads
        * the lower 16 bits of each operand, so the upper halves left in
        * %d4, %d1 and %d0 do not matter.
        *
        *   X_L32 * Y_L32 = (C << 32) + ((B + D) << 16) + A
        */
       muluw   %d1, %d4        | A = X_L32(L) * Y_L32(L)
       muluw   %d2, %d1        | B = X_L32(H) * Y_L32(L)
       muluw   %d3, %d2        | C = X_L32(H) * Y_L32(H)
       muluw   %d0, %d3        | D = X_L32(L) * Y_L32(H)

       /* %d0 = A >> 16: the part of A that overlaps the middle terms. */
       movel   %d4, %d0        | extract A(H)
       clrw    %d0
       swap    %d0

       addl    %d0, %d1        | B += A(H) (no carry; max 0xffff0000)

       /* The middle sum sits 16 bits up, so a carry out of its 32 bits has
        * weight 2^48, which is bit 16 of C. */
       addl    %d3, %d1        | B += D
       bccs    1f              | if (carry)
       addil   #0x10000, %d2   |       C += 0x10000

1:      swap    %d1             | B(H) <--> B(L)

| (%d0), (%d1), %d2 = C, %d3 = free, %d4 = A

       /* After the swap the lower word of %d1 holds B(H); zero-extend it
        * into %d3 so it can be added to C. */
       clrl    %d3             | extract B(H)
       movew   %d1, %d3

       movew   %d4, %d1        | %d1 = (B(L) << 16) + A(L)

       addl    %d3, %d2        | C += B(H)

| We have (X_L32 * Y_L32) in %d2:%d1. Lower 32bit was completed.
| Add (X_L32 * Y_H32 + X_H32 * Y_L32) to higher 32bit.
|
| (%d0), (%d1), %d2 = C, %d3 = free, %d4 = free

       /* Only the lower 32 bits of each cross product are kept, and those
        * bits are the same for a signed and an unsigned multiply. */
       movel   %a0, %d0        | restore X_L32
       movel   %a1, %d3        | restore Y_L32
       mulsl   Y_H32(%sp), %d0 | E = X_L32 * Y_H32
       mulsl   X_H32(%sp), %d3 | F = X_H32 * Y_L32
       addl    %d2, %d0        | E += C
       addl    %d3, %d0        | %d0 = E + F

       /* Result is now in %d0:%d1; restore the saved registers. */
       moveml  (%sp)+, %d2-%d4 | pop %d2-%d4
       rts
END(__muldi3)