From 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2005 15:20:36 -0700 Subject: Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! --- arch/m68k/math-emu/Makefile | 11 + arch/m68k/math-emu/fp_arith.c | 701 ++++++++++++++++++ arch/m68k/math-emu/fp_arith.h | 52 ++ arch/m68k/math-emu/fp_cond.S | 334 +++++++++ arch/m68k/math-emu/fp_decode.h | 417 +++++++++++ arch/m68k/math-emu/fp_emu.h | 146 ++++ arch/m68k/math-emu/fp_entry.S | 325 +++++++++ arch/m68k/math-emu/fp_log.c | 223 ++++++ arch/m68k/math-emu/fp_move.S | 244 +++++++ arch/m68k/math-emu/fp_movem.S | 368 ++++++++++ arch/m68k/math-emu/fp_scan.S | 478 +++++++++++++ arch/m68k/math-emu/fp_trig.c | 183 +++++ arch/m68k/math-emu/fp_trig.h | 32 + arch/m68k/math-emu/fp_util.S | 1455 ++++++++++++++++++++++++++++++++++++++ arch/m68k/math-emu/multi_arith.h | 819 +++++++++++++++++++++ 15 files changed, 5788 insertions(+) create mode 100644 arch/m68k/math-emu/Makefile create mode 100644 arch/m68k/math-emu/fp_arith.c create mode 100644 arch/m68k/math-emu/fp_arith.h create mode 100644 arch/m68k/math-emu/fp_cond.S create mode 100644 arch/m68k/math-emu/fp_decode.h create mode 100644 arch/m68k/math-emu/fp_emu.h create mode 100644 arch/m68k/math-emu/fp_entry.S create mode 100644 arch/m68k/math-emu/fp_log.c create mode 100644 arch/m68k/math-emu/fp_move.S create mode 100644 arch/m68k/math-emu/fp_movem.S create mode 100644 arch/m68k/math-emu/fp_scan.S create mode 100644 arch/m68k/math-emu/fp_trig.c create mode 100644 arch/m68k/math-emu/fp_trig.h create mode 100644 arch/m68k/math-emu/fp_util.S create mode 100644 arch/m68k/math-emu/multi_arith.h (limited to 'arch/m68k/math-emu') diff --git a/arch/m68k/math-emu/Makefile b/arch/m68k/math-emu/Makefile new file mode 100644 index 00000000000..53994040181 --- /dev/null +++ b/arch/m68k/math-emu/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for the linux kernel. +# + +EXTRA_AFLAGS := -traditional + +#EXTRA_AFLAGS += -DFPU_EMU_DEBUG +#EXTRA_CFLAGS += -DFPU_EMU_DEBUG + +obj-y := fp_entry.o fp_scan.o fp_util.o fp_move.o fp_movem.o \ + fp_cond.o fp_arith.o fp_log.o fp_trig.o diff --git a/arch/m68k/math-emu/fp_arith.c b/arch/m68k/math-emu/fp_arith.c new file mode 100644 index 00000000000..08f286db3c5 --- /dev/null +++ b/arch/m68k/math-emu/fp_arith.c @@ -0,0 +1,701 @@ +/* + + fp_arith.c: floating-point math routines for the Linux-m68k + floating point emulator. + + Copyright (c) 1998-1999 David Huggins-Daines. + + Somewhat based on the AlphaLinux floating point emulator, by David + Mosberger-Tang. + + You may copy, modify, and redistribute this file under the terms of + the GNU General Public License, version 2, or any later version, at + your convenience. + */ + +#include "fp_emu.h" +#include "multi_arith.h" +#include "fp_arith.h" + +const struct fp_ext fp_QNaN = +{ + .exp = 0x7fff, + .mant = { .m64 = ~0 } +}; + +const struct fp_ext fp_Inf = +{ + .exp = 0x7fff, +}; + +/* let's start with the easy ones */ + +struct fp_ext * +fp_fabs(struct fp_ext *dest, struct fp_ext *src) +{ + dprint(PINSTR, "fabs\n"); + + fp_monadic_check(dest, src); + + dest->sign = 0; + + return dest; +} + +struct fp_ext * +fp_fneg(struct fp_ext *dest, struct fp_ext *src) +{ + dprint(PINSTR, "fneg\n"); + + fp_monadic_check(dest, src); + + dest->sign = !dest->sign; + + return dest; +} + +/* Now, the slightly harder ones */ + +/* fp_fadd: Implements the kernel of the FADD, FSADD, FDADD, FSUB, + FDSUB, and FCMP instructions. */ + +struct fp_ext * +fp_fadd(struct fp_ext *dest, struct fp_ext *src) +{ + int diff; + + dprint(PINSTR, "fadd\n"); + + fp_dyadic_check(dest, src); + + if (IS_INF(dest)) { + /* infinity - infinity == NaN */ + if (IS_INF(src) && (src->sign != dest->sign)) + fp_set_nan(dest); + return dest; + } + if (IS_INF(src)) { + fp_copy_ext(dest, src); + return dest; + } + + if (IS_ZERO(dest)) { + if (IS_ZERO(src)) { + if (src->sign != dest->sign) { + if (FPDATA->rnd == FPCR_ROUND_RM) + dest->sign = 1; + else + dest->sign = 0; + } + } else + fp_copy_ext(dest, src); + return dest; + } + + dest->lowmant = src->lowmant = 0; + + if ((diff = dest->exp - src->exp) > 0) + fp_denormalize(src, diff); + else if ((diff = -diff) > 0) + fp_denormalize(dest, diff); + + if (dest->sign == src->sign) { + if (fp_addmant(dest, src)) + if (!fp_addcarry(dest)) + return dest; + } else { + if (dest->mant.m64 < src->mant.m64) { + fp_submant(dest, src, dest); + dest->sign = !dest->sign; + } else + fp_submant(dest, dest, src); + } + + return dest; +} + +/* fp_fsub: Implements the kernel of the FSUB, FSSUB, and FDSUB + instructions. + + Remember that the arguments are in assembler-syntax order! */ + +struct fp_ext * +fp_fsub(struct fp_ext *dest, struct fp_ext *src) +{ + dprint(PINSTR, "fsub "); + + src->sign = !src->sign; + return fp_fadd(dest, src); +} + + +struct fp_ext * +fp_fcmp(struct fp_ext *dest, struct fp_ext *src) +{ + dprint(PINSTR, "fcmp "); + + FPDATA->temp[1] = *dest; + src->sign = !src->sign; + return fp_fadd(&FPDATA->temp[1], src); +} + +struct fp_ext * +fp_ftst(struct fp_ext *dest, struct fp_ext *src) +{ + dprint(PINSTR, "ftst\n"); + + (void)dest; + + return src; +} + +struct fp_ext * +fp_fmul(struct fp_ext *dest, struct fp_ext *src) +{ + union fp_mant128 temp; + int exp; + + dprint(PINSTR, "fmul\n"); + + fp_dyadic_check(dest, src); + + /* calculate the correct sign now, as it's necessary for infinities */ + dest->sign = src->sign ^ dest->sign; + + /* Handle infinities */ + if (IS_INF(dest)) { + if (IS_ZERO(src)) + fp_set_nan(dest); + return dest; + } + if (IS_INF(src)) { + if (IS_ZERO(dest)) + fp_set_nan(dest); + else + fp_copy_ext(dest, src); + return dest; + } + + /* Of course, as we all know, zero * anything = zero. You may + not have known that it might be a positive or negative + zero... */ + if (IS_ZERO(dest) || IS_ZERO(src)) { + dest->exp = 0; + dest->mant.m64 = 0; + dest->lowmant = 0; + + return dest; + } + + exp = dest->exp + src->exp - 0x3ffe; + + /* shift up the mantissa for denormalized numbers, + so that the highest bit is set, this makes the + shift of the result below easier */ + if ((long)dest->mant.m32[0] >= 0) + exp -= fp_overnormalize(dest); + if ((long)src->mant.m32[0] >= 0) + exp -= fp_overnormalize(src); + + /* now, do a 64-bit multiply with expansion */ + fp_multiplymant(&temp, dest, src); + + /* normalize it back to 64 bits and stuff it back into the + destination struct */ + if ((long)temp.m32[0] > 0) { + exp--; + fp_putmant128(dest, &temp, 1); + } else + fp_putmant128(dest, &temp, 0); + + if (exp >= 0x7fff) { + fp_set_ovrflw(dest); + return dest; + } + dest->exp = exp; + if (exp < 0) { + fp_set_sr(FPSR_EXC_UNFL); + fp_denormalize(dest, -exp); + } + + return dest; +} + +/* fp_fdiv: Implements the "kernel" of the FDIV, FSDIV, FDDIV and + FSGLDIV instructions. + + Note that the order of the operands is counter-intuitive: instead + of src / dest, the result is actually dest / src. */ + +struct fp_ext * +fp_fdiv(struct fp_ext *dest, struct fp_ext *src) +{ + union fp_mant128 temp; + int exp; + + dprint(PINSTR, "fdiv\n"); + + fp_dyadic_check(dest, src); + + /* calculate the correct sign now, as it's necessary for infinities */ + dest->sign = src->sign ^ dest->sign; + + /* Handle infinities */ + if (IS_INF(dest)) { + /* infinity / infinity = NaN (quiet, as always) */ + if (IS_INF(src)) + fp_set_nan(dest); + /* infinity / anything else = infinity (with approprate sign) */ + return dest; + } + if (IS_INF(src)) { + /* anything / infinity = zero (with appropriate sign) */ + dest->exp = 0; + dest->mant.m64 = 0; + dest->lowmant = 0; + + return dest; + } + + /* zeroes */ + if (IS_ZERO(dest)) { + /* zero / zero = NaN */ + if (IS_ZERO(src)) + fp_set_nan(dest); + /* zero / anything else = zero */ + return dest; + } + if (IS_ZERO(src)) { + /* anything / zero = infinity (with appropriate sign) */ + fp_set_sr(FPSR_EXC_DZ); + dest->exp = 0x7fff; + dest->mant.m64 = 0; + + return dest; + } + + exp = dest->exp - src->exp + 0x3fff; + + /* shift up the mantissa for denormalized numbers, + so that the highest bit is set, this makes lots + of things below easier */ + if ((long)dest->mant.m32[0] >= 0) + exp -= fp_overnormalize(dest); + if ((long)src->mant.m32[0] >= 0) + exp -= fp_overnormalize(src); + + /* now, do the 64-bit divide */ + fp_dividemant(&temp, dest, src); + + /* normalize it back to 64 bits and stuff it back into the + destination struct */ + if (!temp.m32[0]) { + exp--; + fp_putmant128(dest, &temp, 32); + } else + fp_putmant128(dest, &temp, 31); + + if (exp >= 0x7fff) { + fp_set_ovrflw(dest); + return dest; + } + dest->exp = exp; + if (exp < 0) { + fp_set_sr(FPSR_EXC_UNFL); + fp_denormalize(dest, -exp); + } + + return dest; +} + +struct fp_ext * +fp_fsglmul(struct fp_ext *dest, struct fp_ext *src) +{ + int exp; + + dprint(PINSTR, "fsglmul\n"); + + fp_dyadic_check(dest, src); + + /* calculate the correct sign now, as it's necessary for infinities */ + dest->sign = src->sign ^ dest->sign; + + /* Handle infinities */ + if (IS_INF(dest)) { + if (IS_ZERO(src)) + fp_set_nan(dest); + return dest; + } + if (IS_INF(src)) { + if (IS_ZERO(dest)) + fp_set_nan(dest); + else + fp_copy_ext(dest, src); + return dest; + } + + /* Of course, as we all know, zero * anything = zero. You may + not have known that it might be a positive or negative + zero... */ + if (IS_ZERO(dest) || IS_ZERO(src)) { + dest->exp = 0; + dest->mant.m64 = 0; + dest->lowmant = 0; + + return dest; + } + + exp = dest->exp + src->exp - 0x3ffe; + + /* do a 32-bit multiply */ + fp_mul64(dest->mant.m32[0], dest->mant.m32[1], + dest->mant.m32[0] & 0xffffff00, + src->mant.m32[0] & 0xffffff00); + + if (exp >= 0x7fff) { + fp_set_ovrflw(dest); + return dest; + } + dest->exp = exp; + if (exp < 0) { + fp_set_sr(FPSR_EXC_UNFL); + fp_denormalize(dest, -exp); + } + + return dest; +} + +struct fp_ext * +fp_fsgldiv(struct fp_ext *dest, struct fp_ext *src) +{ + int exp; + unsigned long quot, rem; + + dprint(PINSTR, "fsgldiv\n"); + + fp_dyadic_check(dest, src); + + /* calculate the correct sign now, as it's necessary for infinities */ + dest->sign = src->sign ^ dest->sign; + + /* Handle infinities */ + if (IS_INF(dest)) { + /* infinity / infinity = NaN (quiet, as always) */ + if (IS_INF(src)) + fp_set_nan(dest); + /* infinity / anything else = infinity (with approprate sign) */ + return dest; + } + if (IS_INF(src)) { + /* anything / infinity = zero (with appropriate sign) */ + dest->exp = 0; + dest->mant.m64 = 0; + dest->lowmant = 0; + + return dest; + } + + /* zeroes */ + if (IS_ZERO(dest)) { + /* zero / zero = NaN */ + if (IS_ZERO(src)) + fp_set_nan(dest); + /* zero / anything else = zero */ + return dest; + } + if (IS_ZERO(src)) { + /* anything / zero = infinity (with appropriate sign) */ + fp_set_sr(FPSR_EXC_DZ); + dest->exp = 0x7fff; + dest->mant.m64 = 0; + + return dest; + } + + exp = dest->exp - src->exp + 0x3fff; + + dest->mant.m32[0] &= 0xffffff00; + src->mant.m32[0] &= 0xffffff00; + + /* do the 32-bit divide */ + if (dest->mant.m32[0] >= src->mant.m32[0]) { + fp_sub64(dest->mant, src->mant); + fp_div64(quot, rem, dest->mant.m32[0], 0, src->mant.m32[0]); + dest->mant.m32[0] = 0x80000000 | (quot >> 1); + dest->mant.m32[1] = (quot & 1) | rem; /* only for rounding */ + } else { + fp_div64(quot, rem, dest->mant.m32[0], 0, src->mant.m32[0]); + dest->mant.m32[0] = quot; + dest->mant.m32[1] = rem; /* only for rounding */ + exp--; + } + + if (exp >= 0x7fff) { + fp_set_ovrflw(dest); + return dest; + } + dest->exp = exp; + if (exp < 0) { + fp_set_sr(FPSR_EXC_UNFL); + fp_denormalize(dest, -exp); + } + + return dest; +} + +/* fp_roundint: Internal rounding function for use by several of these + emulated instructions. + + This one rounds off the fractional part using the rounding mode + specified. */ + +static void fp_roundint(struct fp_ext *dest, int mode) +{ + union fp_mant64 oldmant; + unsigned long mask; + + if (!fp_normalize_ext(dest)) + return; + + /* infinities and zeroes */ + if (IS_INF(dest) || IS_ZERO(dest)) + return; + + /* first truncate the lower bits */ + oldmant = dest->mant; + switch (dest->exp) { + case 0 ... 0x3ffe: + dest->mant.m64 = 0; + break; + case 0x3fff ... 0x401e: + dest->mant.m32[0] &= 0xffffffffU << (0x401e - dest->exp); + dest->mant.m32[1] = 0; + if (oldmant.m64 == dest->mant.m64) + return; + break; + case 0x401f ... 0x403e: + dest->mant.m32[1] &= 0xffffffffU << (0x403e - dest->exp); + if (oldmant.m32[1] == dest->mant.m32[1]) + return; + break; + default: + return; + } + fp_set_sr(FPSR_EXC_INEX2); + + /* We might want to normalize upwards here... however, since + we know that this is only called on the output of fp_fdiv, + or with the input to fp_fint or fp_fintrz, and the inputs + to all these functions are either normal or denormalized + (no subnormals allowed!), there's really no need. + + In the case of fp_fdiv, observe that 0x80000000 / 0xffff = + 0xffff8000, and the same holds for 128-bit / 64-bit. (i.e. the + smallest possible normal dividend and the largest possible normal + divisor will still produce a normal quotient, therefore, (normal + << 64) / normal is normal in all cases) */ + + switch (mode) { + case FPCR_ROUND_RN: + switch (dest->exp) { + case 0 ... 0x3ffd: + return; + case 0x3ffe: + /* As noted above, the input is always normal, so the + guard bit (bit 63) is always set. therefore, the + only case in which we will NOT round to 1.0 is when + the input is exactly 0.5. */ + if (oldmant.m64 == (1ULL << 63)) + return; + break; + case 0x3fff ... 0x401d: + mask = 1 << (0x401d - dest->exp); + if (!(oldmant.m32[0] & mask)) + return; + if (oldmant.m32[0] & (mask << 1)) + break; + if (!(oldmant.m32[0] << (dest->exp - 0x3ffd)) && + !oldmant.m32[1]) + return; + break; + case 0x401e: + if (!(oldmant.m32[1] >= 0)) + return; + if (oldmant.m32[0] & 1) + break; + if (!(oldmant.m32[1] << 1)) + return; + break; + case 0x401f ... 0x403d: + mask = 1 << (0x403d - dest->exp); + if (!(oldmant.m32[1] & mask)) + return; + if (oldmant.m32[1] & (mask << 1)) + break; + if (!(oldmant.m32[1] << (dest->exp - 0x401d))) + return; + break; + default: + return; + } + break; + case FPCR_ROUND_RZ: + return; + default: + if (dest->sign ^ (mode - FPCR_ROUND_RM)) + break; + return; + } + + switch (dest->exp) { + case 0 ... 0x3ffe: + dest->exp = 0x3fff; + dest->mant.m64 = 1ULL << 63; + break; + case 0x3fff ... 0x401e: + mask = 1 << (0x401e - dest->exp); + if (dest->mant.m32[0] += mask) + break; + dest->mant.m32[0] = 0x80000000; + dest->exp++; + break; + case 0x401f ... 0x403e: + mask = 1 << (0x403e - dest->exp); + if (dest->mant.m32[1] += mask) + break; + if (dest->mant.m32[0] += 1) + break; + dest->mant.m32[0] = 0x80000000; + dest->exp++; + break; + } +} + +/* modrem_kernel: Implementation of the FREM and FMOD instructions + (which are exactly the same, except for the rounding used on the + intermediate value) */ + +static struct fp_ext * +modrem_kernel(struct fp_ext *dest, struct fp_ext *src, int mode) +{ + struct fp_ext tmp; + + fp_dyadic_check(dest, src); + + /* Infinities and zeros */ + if (IS_INF(dest) || IS_ZERO(src)) { + fp_set_nan(dest); + return dest; + } + if (IS_ZERO(dest) || IS_INF(src)) + return dest; + + /* FIXME: there is almost certainly a smarter way to do this */ + fp_copy_ext(&tmp, dest); + fp_fdiv(&tmp, src); /* NOTE: src might be modified */ + fp_roundint(&tmp, mode); + fp_fmul(&tmp, src); + fp_fsub(dest, &tmp); + + /* set the quotient byte */ + fp_set_quotient((dest->mant.m64 & 0x7f) | (dest->sign << 7)); + return dest; +} + +/* fp_fmod: Implements the kernel of the FMOD instruction. + + Again, the argument order is backwards. The result, as defined in + the Motorola manuals, is: + + fmod(src,dest) = (dest - (src * floor(dest / src))) */ + +struct fp_ext * +fp_fmod(struct fp_ext *dest, struct fp_ext *src) +{ + dprint(PINSTR, "fmod\n"); + return modrem_kernel(dest, src, FPCR_ROUND_RZ); +} + +/* fp_frem: Implements the kernel of the FREM instruction. + + frem(src,dest) = (dest - (src * round(dest / src))) + */ + +struct fp_ext * +fp_frem(struct fp_ext *dest, struct fp_ext *src) +{ + dprint(PINSTR, "frem\n"); + return modrem_kernel(dest, src, FPCR_ROUND_RN); +} + +struct fp_ext * +fp_fint(struct fp_ext *dest, struct fp_ext *src) +{ + dprint(PINSTR, "fint\n"); + + fp_copy_ext(dest, src); + + fp_roundint(dest, FPDATA->rnd); + + return dest; +} + +struct fp_ext * +fp_fintrz(struct fp_ext *dest, struct fp_ext *src) +{ + dprint(PINSTR, "fintrz\n"); + + fp_copy_ext(dest, src); + + fp_roundint(dest, FPCR_ROUND_RZ); + + return dest; +} + +struct fp_ext * +fp_fscale(struct fp_ext *dest, struct fp_ext *src) +{ + int scale, oldround; + + dprint(PINSTR, "fscale\n"); + + fp_dyadic_check(dest, src); + + /* Infinities */ + if (IS_INF(src)) { + fp_set_nan(dest); + return dest; + } + if (IS_INF(dest)) + return dest; + + /* zeroes */ + if (IS_ZERO(src) || IS_ZERO(dest)) + return dest; + + /* Source exponent out of range */ + if (src->exp >= 0x400c) { + fp_set_ovrflw(dest); + return dest; + } + + /* src must be rounded with round to zero. */ + oldround = FPDATA->rnd; + FPDATA->rnd = FPCR_ROUND_RZ; + scale = fp_conv_ext2long(src); + FPDATA->rnd = oldround; + + /* new exponent */ + scale += dest->exp; + + if (scale >= 0x7fff) { + fp_set_ovrflw(dest); + } else if (scale <= 0) { + fp_set_sr(FPSR_EXC_UNFL); + fp_denormalize(dest, -scale); + } else + dest->exp = scale; + + return dest; +} + diff --git a/arch/m68k/math-emu/fp_arith.h b/arch/m68k/math-emu/fp_arith.h new file mode 100644 index 00000000000..2cc3f846c39 --- /dev/null +++ b/arch/m68k/math-emu/fp_arith.h @@ -0,0 +1,52 @@ +/* + + fp_arith.h: floating-point math routines for the Linux-m68k + floating point emulator. + + Copyright (c) 1998 David Huggins-Daines. + + Somewhat based on the AlphaLinux floating point emulator, by David + Mosberger-Tang. + + You may copy, modify, and redistribute this file under the terms of + the GNU General Public License, version 2, or any later version, at + your convenience. + + */ + +#ifndef FP_ARITH_H +#define FP_ARITH_H + +/* easy ones */ +struct fp_ext * +fp_fabs(struct fp_ext *dest, struct fp_ext *src); +struct fp_ext * +fp_fneg(struct fp_ext *dest, struct fp_ext *src); + +/* straightforward arithmetic */ +struct fp_ext * +fp_fadd(struct fp_ext *dest, struct fp_ext *src); +struct fp_ext * +fp_fsub(struct fp_ext *dest, struct fp_ext *src); +struct fp_ext * +fp_fcmp(struct fp_ext *dest, struct fp_ext *src); +struct fp_ext * +fp_ftst(struct fp_ext *dest, struct fp_ext *src); +struct fp_ext * +fp_fmul(struct fp_ext *dest, struct fp_ext *src); +struct fp_ext * +fp_fdiv(struct fp_ext *dest, struct fp_ext *src); + +/* ones that do rounding and integer conversions */ +struct fp_ext * +fp_fmod(struct fp_ext *dest, struct fp_ext *src); +struct fp_ext * +fp_frem(struct fp_ext *dest, struct fp_ext *src); +struct fp_ext * +fp_fint(struct fp_ext *dest, struct fp_ext *src); +struct fp_ext * +fp_fintrz(struct fp_ext *dest, struct fp_ext *src); +struct fp_ext * +fp_fscale(struct fp_ext *dest, struct fp_ext *src); + +#endif /* FP_ARITH__H */ diff --git a/arch/m68k/math-emu/fp_cond.S b/arch/m68k/math-emu/fp_cond.S new file mode 100644 index 00000000000..ddae8b1b8b8 --- /dev/null +++ b/arch/m68k/math-emu/fp_cond.S @@ -0,0 +1,334 @@ +/* + * fp_cond.S + * + * Copyright Roman Zippel, 1997. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU General Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fp_emu.h" +#include "fp_decode.h" + + .globl fp_fscc, fp_fbccw, fp_fbccl + +#ifdef FPU_EMU_DEBUG +fp_fnop: + printf PDECODE,"fnop\n" + jra fp_end +#else +#define fp_fnop fp_end +#endif + +fp_fbccw: + tst.w %d2 + jeq fp_fnop + printf PDECODE,"fbccw " + fp_get_pc %a0 + lea (-2,%a0,%d2.w),%a0 + jra 1f + +fp_fbccl: + printf PDECODE,"fbccl " + fp_get_pc %a0 + move.l %d2,%d0 + swap %d0 + fp_get_instr_word %d0,fp_err_ua1 + lea (-2,%a0,%d0.l),%a0 +1: printf PDECODE,"%x",1,%a0 + move.l %d2,%d0 + swap %d0 + jsr fp_compute_cond + tst.l %d0 + jeq 1f + fp_put_pc %a0,1 +1: printf PDECODE,"\n" + jra fp_end + +fp_fdbcc: + printf PDECODE,"fdbcc " + fp_get_pc %a1 | calculate new pc + fp_get_instr_word %d0,fp_err_ua1 + add.w %d0,%a1 + fp_decode_addr_reg + printf PDECODE,"d%d,%x\n",2,%d0,%a1 + swap %d1 | test condition in %d1 + tst.w %d1 + jne 2f + move.l %d0,%d1 + jsr fp_get_data_reg + subq.w #1,%d0 + jcs 1f + fp_put_pc %a1,1 +1: jsr fp_put_data_reg +2: jra fp_end + +| set flags for decode macros for fs +do_fscc=1 +do_no_pc_mode=1 + +fp_fscc: + printf PDECODE,"fscc " + move.l %d2,%d0 + jsr fp_compute_cond + move.w %d0,%d1 + swap %d1 + + | decode addressing mode + fp_decode_addr_mode + + .long fp_data, fp_fdbcc + .long fp_indirect, fp_postinc + .long fp_predecr, fp_disp16 + .long fp_extmode0, fp_extmode1 + + | addressing mode: data register direct +fp_data: + fp_mode_data_direct + move.w %d0,%d1 | save register nr + jsr fp_get_data_reg + swap %d1 + move.b %d1,%d0 + swap %d1 + jsr fp_put_data_reg + printf PDECODE,"\n" + jra fp_end + +fp_indirect: + fp_mode_addr_indirect + jra fp_do_scc + +fp_postinc: + fp_mode_addr_indirect_postinc + jra fp_do_scc + +fp_predecr: + fp_mode_addr_indirect_predec + jra fp_do_scc + +fp_disp16: + fp_mode_addr_indirect_disp16 + jra fp_do_scc + +fp_extmode0: + fp_mode_addr_indirect_extmode0 + jra fp_do_scc + +fp_extmode1: + bfextu %d2{#13,#3},%d0 + jmp ([0f:w,%pc,%d0*4]) + + .align 4 +0: + .long fp_absolute_short, fp_absolute_long + .long fp_ill, fp_ill | NOTE: jump here to ftrap.x + .long fp_ill, fp_ill + .long fp_ill, fp_ill + +fp_absolute_short: + fp_mode_abs_short + jra fp_do_scc + +fp_absolute_long: + fp_mode_abs_long +| jra fp_do_scc + +fp_do_scc: + swap %d1 + putuser.b %d1,(%a0),fp_err_ua1,%a0 + printf PDECODE,"\n" + jra fp_end + + +#define tst_NAN btst #24,%d1 +#define tst_Z btst #26,%d1 +#define tst_N btst #27,%d1 + +fp_compute_cond: + move.l (FPD_FPSR,FPDATA),%d1 + btst #4,%d0 + jeq 1f + tst_NAN + jeq 1f + bset #15,%d1 + bset #7,%d1 + move.l %d1,(FPD_FPSR,FPDATA) +1: and.w #0xf,%d0 + jmp ([0f:w,%pc,%d0.w*4]) + + .align 4 +0: + .long fp_f , fp_eq , fp_ogt, fp_oge + .long fp_olt, fp_ole, fp_ogl, fp_or + .long fp_un , fp_ueq, fp_ugt, fp_uge + .long fp_ult, fp_ule, fp_ne , fp_t + +fp_f: + moveq #0,%d0 + rts + +fp_eq: + moveq #0,%d0 + tst_Z + jeq 1f + moveq #-1,%d0 +1: rts + +fp_ogt: + moveq #0,%d0 + tst_NAN + jne 1f + tst_Z + jne 1f + tst_N + jne 1f + moveq #-1,%d0 +1: rts + +fp_oge: + moveq #-1,%d0 + tst_Z + jne 2f + tst_NAN + jne 1f + tst_N + jeq 2f +1: moveq #0,%d0 +2: rts + +fp_olt: + moveq #0,%d0 + tst_NAN + jne 1f + tst_Z + jne 1f + tst_N + jeq 1f + moveq #-1,%d0 +1: rts + +fp_ole: + moveq #-1,%d0 + tst_Z + jne 2f + tst_NAN + jne 1f + tst_N + jne 2f +1: moveq #0,%d0 +2: rts + +fp_ogl: + moveq #0,%d0 + tst_NAN + jne 1f + tst_Z + jne 1f + moveq #-1,%d0 +1: rts + +fp_or: + moveq #0,%d0 + tst_NAN + jne 1f + moveq #-1,%d0 +1: rts + +fp_un: + moveq #0,%d0 + tst_NAN + jeq 1f + moveq #-1,%d0 + rts + +fp_ueq: + moveq #-1,%d0 + tst_NAN + jne 1f + tst_Z + jne 1f + moveq #0,%d0 +1: rts + +fp_ugt: + moveq #-1,%d0 + tst_NAN + jne 2f + tst_N + jne 1f + tst_Z + jeq 2f +1: moveq #0,%d0 +2: rts + +fp_uge: + moveq #-1,%d0 + tst_NAN + jne 1f + tst_Z + jne 1f + tst_N + jeq 1f + moveq #0,%d0 +1: rts + +fp_ult: + moveq #-1,%d0 + tst_NAN + jne 2f + tst_Z + jne 1f + tst_N + jne 2f +1: moveq #0,%d0 +2: rts + +fp_ule: + moveq #-1,%d0 + tst_NAN + jne 1f + tst_Z + jne 1f + tst_N + jne 1f + moveq #0,%d0 +1: rts + +fp_ne: + moveq #0,%d0 + tst_Z + jne 1f + moveq #-1,%d0 +1: rts + +fp_t: + moveq #-1,%d0 + rts diff --git a/arch/m68k/math-emu/fp_decode.h b/arch/m68k/math-emu/fp_decode.h new file mode 100644 index 00000000000..759679d9ab9 --- /dev/null +++ b/arch/m68k/math-emu/fp_decode.h @@ -0,0 +1,417 @@ +/* + * fp_decode.h + * + * Copyright Roman Zippel, 1997. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU General Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _FP_DECODE_H +#define _FP_DECODE_H + +/* These macros do the dirty work of the instr decoding, several variables + * can be defined in the source file to modify the work of these macros, + * currently the following variables are used: + * ... + * The register usage: + * d0 - will contain source operand for data direct mode, + * otherwise scratch register + * d1 - upper 16bit are reserved for caller + * lower 16bit may contain further arguments, + * is destroyed during decoding + * d2 - contains first two instruction words, + * first word will be used for extension word + * a0 - will point to source/dest operand for any indirect mode + * otherwise scratch register + * a1 - scratch register + * a2 - base addr to the task structure + * + * the current implementation doesn't check for every disallowed + * addressing mode (e.g. pc relative modes as destination), as long + * as it only means a new addressing mode, which should not appear + * in a program and that doesn't crash the emulation, I think it's + * not a problem to allow these modes. + */ + +do_fmovem=0 +do_fmovem_cr=0 +do_no_pc_mode=0 +do_fscc=0 + +| first decoding of the instr type +| this separates the conditional instr +.macro fp_decode_cond_instr_type + bfextu %d2{#8,#2},%d0 + jmp ([0f:w,%pc,%d0*4]) + + .align 4 +0: +| .long "f","fscc/fdbcc" +| .long "fbccw","fbccl" +.endm + +| second decoding of the instr type +| this separates most move instr +.macro fp_decode_move_instr_type + bfextu %d2{#16,#3},%d0 + jmp ([0f:w,%pc,%d0*4]) + + .align 4 +0: +| .long "f fpx,fpx","invalid instr" +| .long "f ,fpx","fmove fpx," +| .long "fmovem ,fpcr","fmovem ,fpx" +| .long "fmovem fpcr,","fmovem fpx," +.endm + +| extract the source specifier, specifies +| either source fp register or data format +.macro fp_decode_sourcespec + bfextu %d2{#19,#3},%d0 +.endm + +| decode destination format for fmove reg,ea +.macro fp_decode_dest_format + bfextu %d2{#19,#3},%d0 +.endm + +| decode source register for fmove reg,ea +.macro fp_decode_src_reg + bfextu %d2{#22,#3},%d0 +.endm + +| extract the addressing mode +| it depends on the instr which of the modes is valid +.macro fp_decode_addr_mode + bfextu %d2{#10,#3},%d0 + jmp ([0f:w,%pc,%d0*4]) + + .align 4 +0: +| .long "data register direct","addr register direct" +| .long "addr register indirect" +| .long "addr register indirect postincrement" +| .long "addr register indirect predecrement" +| .long "addr register + index16" +| .long "extension mode1","extension mode2" +.endm + +| extract the register for the addressing mode +.macro fp_decode_addr_reg + bfextu %d2{#13,#3},%d0 +.endm + +| decode the 8bit diplacement from the brief extension word +.macro fp_decode_disp8 + move.b %d2,%d0 + ext.w %d0 +.endm + +| decode the index of the brief/full extension word +.macro fp_decode_index + bfextu %d2{#17,#3},%d0 | get the register nr + btst #15,%d2 | test for data/addr register + jne 1\@f + printf PDECODE,"d%d",1,%d0 + jsr fp_get_data_reg + jra 2\@f +1\@: printf PDECODE,"a%d",1,%d0 + jsr fp_get_addr_reg + move.l %a0,%d0 +2\@: +debug lea "'l'.w,%a0" + btst #11,%d2 | 16/32 bit size? + jne 3\@f +debug lea "'w'.w,%a0" + ext.l %d0 +3\@: printf PDECODE,":%c",1,%a0 + move.w %d2,%d1 | scale factor + rol.w #7,%d1 + and.w #3,%d1 +debug move.l "%d1,-(%sp)" +debug ext.l "%d1" + printf PDECODE,":%d",1,%d1 +debug move.l "(%sp)+,%d1" + lsl.l %d1,%d0 +.endm + +| decode the base displacement size +.macro fp_decode_basedisp + bfextu %d2{#26,#2},%d0 + jmp ([0f:w,%pc,%d0*4]) + + .align 4 +0: +| .long "reserved","null displacement" +| .long "word displacement","long displacement" +.endm + +.macro fp_decode_outerdisp + bfextu %d2{#30,#2},%d0 + jmp ([0f:w,%pc,%d0*4]) + + .align 4 +0: +| .long "no memory indirect action/reserved","null outer displacement" +| .long "word outer displacement","long outer displacement" +.endm + +| get the extension word and test for brief or full extension type +.macro fp_get_test_extword label + fp_get_instr_word %d2,fp_err_ua1 + btst #8,%d2 + jne \label +.endm + + +| test if %pc is the base register for the indirect addr mode +.macro fp_test_basereg_d16 label + btst #20,%d2 + jeq \label +.endm + +| test if %pc is the base register for one of the extended modes +.macro fp_test_basereg_ext label + btst #19,%d2 + jeq \label +.endm + +.macro fp_test_suppr_index label + btst #6,%d2 + jne \label +.endm + + +| addressing mode: data register direct +.macro fp_mode_data_direct + fp_decode_addr_reg + printf PDECODE,"d%d",1,%d0 +.endm + +| addressing mode: address register indirect +.macro fp_mode_addr_indirect + fp_decode_addr_reg + printf PDECODE,"(a%d)",1,%d0 + jsr fp_get_addr_reg +.endm + +| adjust stack for byte moves from/to stack +.macro fp_test_sp_byte_move + .if !do_fmovem + .if do_fscc + move.w #6,%d1 + .endif + cmp.w #7,%d0 + jne 1\@f + .if !do_fscc + cmp.w #6,%d1 + jne 1\@f + .endif + move.w #4,%d1 +1\@: + .endif +.endm + +| addressing mode: address register indirect with postincrement +.macro fp_mode_addr_indirect_postinc + fp_decode_addr_reg + printf PDECODE,"(a%d)+",1,%d0 + fp_test_sp_byte_move + jsr fp_get_addr_reg + move.l %a0,%a1 | save addr + .if do_fmovem + lea (%a0,%d1.w*4),%a0 + .if !do_fmovem_cr + lea (%a0,%d1.w*8),%a0 + .endif + .else + add.w (fp_datasize,%d1.w*2),%a0 + .endif + jsr fp_put_addr_reg + move.l %a1,%a0 +.endm + +| addressing mode: address register indirect with predecrement +.macro fp_mode_addr_indirect_predec + fp_decode_addr_reg + printf PDECODE,"-(a%d)",1,%d0 + fp_test_sp_byte_move + jsr fp_get_addr_reg + .if do_fmovem + .if !do_fmovem_cr + lea (-12,%a0),%a1 | setup to addr of 1st reg to move + neg.w %d1 + lea (%a0,%d1.w*4),%a0 + add.w %d1,%d1 + lea (%a0,%d1.w*4),%a0 + jsr fp_put_addr_reg + move.l %a1,%a0 + .else + neg.w %d1 + lea (%a0,%d1.w*4),%a0 + jsr fp_put_addr_reg + .endif + .else + sub.w (fp_datasize,%d1.w*2),%a0 + jsr fp_put_addr_reg + .endif +.endm + +| addressing mode: address register/programm counter indirect +| with 16bit displacement +.macro fp_mode_addr_indirect_disp16 + .if !do_no_pc_mode + fp_test_basereg_d16 1f + printf PDECODE,"pc" + fp_get_pc %a0 + jra 2f + .endif +1: fp_decode_addr_reg + printf PDECODE,"a%d",1,%d0 + jsr fp_get_addr_reg +2: fp_get_instr_word %a1,fp_err_ua1 + printf PDECODE,"@(%x)",1,%a1 + add.l %a1,%a0 +.endm + +| perform preindex (if I/IS == 0xx and xx != 00) +.macro fp_do_preindex + moveq #3,%d0 + and.w %d2,%d0 + jeq 1f + btst #2,%d2 + jne 1f + printf PDECODE,")@(" + getuser.l (%a1),%a1,fp_err_ua1,%a1 +debug jra "2f" +1: printf PDECODE,"," +2: +.endm + +| perform postindex (if I/IS == 1xx) +.macro fp_do_postindex + btst #2,%d2 + jeq 1f + printf PDECODE,")@(" + getuser.l (%a1),%a1,fp_err_ua1,%a1 +debug jra "2f" +1: printf PDECODE,"," +2: +.endm + +| all other indirect addressing modes will finally end up here +.macro fp_mode_addr_indirect_extmode0 + .if !do_no_pc_mode + fp_test_basereg_ext 1f + printf PDECODE,"pc" + fp_get_pc %a0 + jra 2f + .endif +1: fp_decode_addr_reg + printf PDECODE,"a%d",1,%d0 + jsr fp_get_addr_reg +2: move.l %a0,%a1 + swap %d2 + fp_get_test_extword 3f + | addressing mode: address register/programm counter indirect + | with index and 8bit displacement + fp_decode_disp8 +debug ext.l "%d0" + printf PDECODE,"@(%x,",1,%d0 + add.w %d0,%a1 + fp_decode_index + add.l %d0,%a1 + printf PDECODE,")" + jra 9f +3: | addressing mode: address register/programm counter memory indirect + | with base and/or outer displacement + btst #7,%d2 | base register suppressed? + jeq 1f + printf PDECODE,"!" + sub.l %a1,%a1 +1: printf PDECODE,"@(" + fp_decode_basedisp + + .long fp_ill,1f + .long 2f,3f + +#ifdef FPU_EMU_DEBUG +1: printf PDECODE,"0" | null base displacement + jra 1f +#endif +2: fp_get_instr_word %a0,fp_err_ua1 | 16bit base displacement + printf PDECODE,"%x:w",1,%a0 + jra 4f +3: fp_get_instr_long %a0,fp_err_ua1 | 32bit base displacement + printf PDECODE,"%x:l",1,%a0 +4: add.l %a0,%a1 +1: + fp_do_postindex + fp_test_suppr_index 1f + fp_decode_index + add.l %d0,%a1 +1: fp_do_preindex + + fp_decode_outerdisp + + .long 5f,1f + .long 2f,3f + +#ifdef FPU_EMU_DEBUG +1: printf PDECODE,"0" | null outer displacement + jra 1f +#endif +2: fp_get_instr_word %a0,fp_err_ua1 | 16bit outer displacement + printf PDECODE,"%x:w",1,%a0 + jra 4f +3: fp_get_instr_long %a0,fp_err_ua1 | 32bit outer displacement + printf PDECODE,"%x:l",1,%a0 +4: add.l %a0,%a1 +1: +5: printf PDECODE,")" +9: move.l %a1,%a0 + swap %d2 +.endm + +| get the absolute short address from user space +.macro fp_mode_abs_short + fp_get_instr_word %a0,fp_err_ua1 + printf PDECODE,"%x.w",1,%a0 +.endm + +| get the absolute long address from user space +.macro fp_mode_abs_long + fp_get_instr_long %a0,fp_err_ua1 + printf PDECODE,"%x.l",1,%a0 +.endm + +#endif /* _FP_DECODE_H */ diff --git a/arch/m68k/math-emu/fp_emu.h b/arch/m68k/math-emu/fp_emu.h new file mode 100644 index 00000000000..1d6edc975d8 --- /dev/null +++ b/arch/m68k/math-emu/fp_emu.h @@ -0,0 +1,146 @@ +/* + * fp_emu.h + * + * Copyright Roman Zippel, 1997. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU General Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _FP_EMU_H +#define _FP_EMU_H + +#ifdef __ASSEMBLY__ +#include +#endif +#include + +#ifndef __ASSEMBLY__ + +#define IS_INF(a) ((a)->exp == 0x7fff) +#define IS_ZERO(a) ((a)->mant.m64 == 0) + + +#define fp_set_sr(bit) ({ \ + FPDATA->fpsr |= 1 << (bit); \ +}) + +#define fp_set_quotient(quotient) ({ \ + FPDATA->fpsr &= 0xff00ffff; \ + FPDATA->fpsr |= ((quotient) & 0xff) << 16; \ +}) + +/* linkage for several useful functions */ + +/* Normalize the extended struct, return 0 for a NaN */ +#define fp_normalize_ext(fpreg) ({ \ + register struct fp_ext *reg asm ("a0") = fpreg; \ + register int res asm ("d0"); \ + \ + asm volatile ("jsr fp_conv_ext2ext" \ + : "=d" (res) : "a" (reg) \ + : "a1", "d1", "d2", "memory"); \ + res; \ +}) + +#define fp_copy_ext(dest, src) ({ \ + *dest = *src; \ +}) + +#define fp_monadic_check(dest, src) ({ \ + fp_copy_ext(dest, src); \ + if (!fp_normalize_ext(dest)) \ + return dest; \ +}) + +#define fp_dyadic_check(dest, src) ({ \ + if (!fp_normalize_ext(dest)) \ + return dest; \ + if (!fp_normalize_ext(src)) { \ + fp_copy_ext(dest, src); \ + return dest; \ + } \ +}) + +extern const struct fp_ext fp_QNaN; +extern const struct fp_ext fp_Inf; + +#define fp_set_nan(dest) ({ \ + fp_set_sr(FPSR_EXC_OPERR); \ + *dest = fp_QNaN; \ +}) + +/* TODO check rounding mode? */ +#define fp_set_ovrflw(dest) ({ \ + fp_set_sr(FPSR_EXC_OVFL); \ + dest->exp = 0x7fff; \ + dest->mant.m64 = 0; \ +}) + +#define fp_conv_ext2long(src) ({ \ + register struct fp_ext *__src asm ("a0") = src; \ + register int __res asm ("d0"); \ + \ + asm volatile ("jsr fp_conv_ext2long" \ + : "=d" (__res) : "a" (__src) \ + : "a1", "d1", "d2", "memory"); \ + __res; \ +}) + +#define fp_conv_long2ext(dest, src) ({ \ + register struct fp_ext *__dest asm ("a0") = dest; \ + register int __src asm ("d0") = src; \ + \ + asm volatile ("jsr fp_conv_ext2long" \ + : : "d" (__src), "a" (__dest) \ + : "a1", "d1", "d2", "memory"); \ +}) + +#else /* __ASSEMBLY__ */ + +/* + * set, reset or clear a bit in the fp status register + */ +.macro fp_set_sr bit + bset #(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA) +.endm + +.macro fp_clr_sr bit + bclr #(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA) +.endm + +.macro fp_tst_sr bit + btst #(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA) +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* _FP_EMU_H */ diff --git a/arch/m68k/math-emu/fp_entry.S b/arch/m68k/math-emu/fp_entry.S new file mode 100644 index 00000000000..5ec2d9101ea --- /dev/null +++ b/arch/m68k/math-emu/fp_entry.S @@ -0,0 +1,325 @@ +/* + * fp_emu.S + * + * Copyright Roman Zippel, 1997. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU General Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +#include "fp_emu.h" + + .globl fpu_emu + .globl fp_debugprint + .globl fp_err_ua1,fp_err_ua2 + + .text +fpu_emu: + SAVE_ALL_INT + GET_CURRENT(%d0) + +#if defined(CPU_M68020_OR_M68030) && defined(CPU_M68040_OR_M68060) + tst.l m68k_is040or060 + jeq 1f +#endif +#if defined(CPU_M68040_OR_M68060) + move.l (FPS_PC2,%sp),(FPS_PC,%sp) +#endif +1: + | emulate the instruction + jsr fp_scan + +#if defined(CONFIG_M68060) +#if !defined(CPU_M68060_ONLY) + btst #3,m68k_cputype+3 + jeq 1f +#endif + btst #7,(FPS_SR,%sp) + jne fp_sendtrace060 +#endif +1: + | emulation successful? + tst.l %d0 + jeq ret_from_exception + + | send some signal to program here + + jra ret_from_exception + + | we jump here after an access error while trying to access + | user space, we correct stackpointer and send a SIGSEGV to + | the user process +fp_err_ua2: + addq.l #4,%sp +fp_err_ua1: + addq.l #4,%sp + move.l %a0,-(%sp) + pea SEGV_MAPERR + pea SIGSEGV + jsr fpemu_signal + add.w #12,%sp + jra ret_from_exception + +#if defined(CONFIG_M68060) + | send a trace signal if we are debugged + | it does not really belong here, but... +fp_sendtrace060: + move.l (FPS_PC,%sp),-(%sp) + pea TRAP_TRACE + pea SIGTRAP + jsr fpemu_signal + add.w #12,%sp + jra ret_from_exception +#endif + + .globl fp_get_data_reg, fp_put_data_reg + .globl fp_get_addr_reg, fp_put_addr_reg + + | Entry points to get/put a register. Some of them can be get/put + | directly, others are on the stack, as we read/write the stack + | directly here, these function may only be called from within + | instruction decoding, otherwise the stack pointer is incorrect + | and the stack gets corrupted. +fp_get_data_reg: + jmp ([0f:w,%pc,%d0.w*4]) + + .align 4 +0: + .long fp_get_d0, fp_get_d1 + .long fp_get_d2, fp_get_d3 + .long fp_get_d4, fp_get_d5 + .long fp_get_d6, fp_get_d7 + +fp_get_d0: + move.l (PT_D0+8,%sp),%d0 + printf PREGISTER,"{d0->%08x}",1,%d0 + rts + +fp_get_d1: + move.l (PT_D1+8,%sp),%d0 + printf PREGISTER,"{d1->%08x}",1,%d0 + rts + +fp_get_d2: + move.l (PT_D2+8,%sp),%d0 + printf PREGISTER,"{d2->%08x}",1,%d0 + rts + +fp_get_d3: + move.l %d3,%d0 + printf PREGISTER,"{d3->%08x}",1,%d0 + rts + +fp_get_d4: + move.l %d4,%d0 + printf PREGISTER,"{d4->%08x}",1,%d0 + rts + +fp_get_d5: + move.l %d5,%d0 + printf PREGISTER,"{d5->%08x}",1,%d0 + rts + +fp_get_d6: + move.l %d6,%d0 + printf PREGISTER,"{d6->%08x}",1,%d0 + rts + +fp_get_d7: + move.l %d7,%d0 + printf PREGISTER,"{d7->%08x}",1,%d0 + rts + +fp_put_data_reg: + jmp ([0f:w,%pc,%d1.w*4]) + + .align 4 +0: + .long fp_put_d0, fp_put_d1 + .long fp_put_d2, fp_put_d3 + .long fp_put_d4, fp_put_d5 + .long fp_put_d6, fp_put_d7 + +fp_put_d0: + printf PREGISTER,"{d0<-%08x}",1,%d0 + move.l %d0,(PT_D0+8,%sp) + rts + +fp_put_d1: + printf PREGISTER,"{d1<-%08x}",1,%d0 + move.l %d0,(PT_D1+8,%sp) + rts + +fp_put_d2: + printf PREGISTER,"{d2<-%08x}",1,%d0 + move.l %d0,(PT_D2+8,%sp) + rts + +fp_put_d3: + printf PREGISTER,"{d3<-%08x}",1,%d0 +| move.l %d0,%d3 + move.l %d0,(PT_D3+8,%sp) + rts + +fp_put_d4: + printf PREGISTER,"{d4<-%08x}",1,%d0 +| move.l %d0,%d4 + move.l %d0,(PT_D4+8,%sp) + rts + +fp_put_d5: + printf PREGISTER,"{d5<-%08x}",1,%d0 +| move.l %d0,%d5 + move.l %d0,(PT_D5+8,%sp) + rts + +fp_put_d6: + printf PREGISTER,"{d6<-%08x}",1,%d0 + move.l %d0,%d6 + rts + +fp_put_d7: + printf PREGISTER,"{d7<-%08x}",1,%d0 + move.l %d0,%d7 + rts + +fp_get_addr_reg: + jmp ([0f:w,%pc,%d0.w*4]) + + .align 4 +0: + .long fp_get_a0, fp_get_a1 + .long fp_get_a2, fp_get_a3 + .long fp_get_a4, fp_get_a5 + .long fp_get_a6, fp_get_a7 + +fp_get_a0: + move.l (PT_A0+8,%sp),%a0 + printf PREGISTER,"{a0->%08x}",1,%a0 + rts + +fp_get_a1: + move.l (PT_A1+8,%sp),%a0 + printf PREGISTER,"{a1->%08x}",1,%a0 + rts + +fp_get_a2: + move.l (PT_A2+8,%sp),%a0 + printf PREGISTER,"{a2->%08x}",1,%a0 + rts + +fp_get_a3: + move.l %a3,%a0 + printf PREGISTER,"{a3->%08x}",1,%a0 + rts + +fp_get_a4: + move.l %a4,%a0 + printf PREGISTER,"{a4->%08x}",1,%a0 + rts + +fp_get_a5: + move.l %a5,%a0 + printf PREGISTER,"{a5->%08x}",1,%a0 + rts + +fp_get_a6: + move.l %a6,%a0 + printf PREGISTER,"{a6->%08x}",1,%a0 + rts + +fp_get_a7: + move.l %usp,%a0 + printf PREGISTER,"{a7->%08x}",1,%a0 + rts + +fp_put_addr_reg: + jmp ([0f:w,%pc,%d0.w*4]) + + .align 4 +0: + .long fp_put_a0, fp_put_a1 + .long fp_put_a2, fp_put_a3 + .long fp_put_a4, fp_put_a5 + .long fp_put_a6, fp_put_a7 + +fp_put_a0: + printf PREGISTER,"{a0<-%08x}",1,%a0 + move.l %a0,(PT_A0+8,%sp) + rts + +fp_put_a1: + printf PREGISTER,"{a1<-%08x}",1,%a0 + move.l %a0,(PT_A1+8,%sp) + rts + +fp_put_a2: + printf PREGISTER,"{a2<-%08x}",1,%a0 + move.l %a0,(PT_A2+8,%sp) + rts + +fp_put_a3: + printf PREGISTER,"{a3<-%08x}",1,%a0 + move.l %a0,%a3 + rts + +fp_put_a4: + printf PREGISTER,"{a4<-%08x}",1,%a0 + move.l %a0,%a4 + rts + +fp_put_a5: + printf PREGISTER,"{a5<-%08x}",1,%a0 + move.l %a0,%a5 + rts + +fp_put_a6: + printf PREGISTER,"{a6<-%08x}",1,%a0 + move.l %a0,%a6 + rts + +fp_put_a7: + printf PREGISTER,"{a7<-%08x}",1,%a0 + move.l %a0,%usp + rts + + .data + .align 4 + +fp_debugprint: +| .long PMDECODE + .long PMINSTR+PMDECODE+PMCONV+PMNORM +| .long PMCONV+PMNORM+PMINSTR +| .long 0 diff --git a/arch/m68k/math-emu/fp_log.c b/arch/m68k/math-emu/fp_log.c new file mode 100644 index 00000000000..87b4f015856 --- /dev/null +++ b/arch/m68k/math-emu/fp_log.c @@ -0,0 +1,223 @@ +/* + + fp_trig.c: floating-point math routines for the Linux-m68k + floating point emulator. + + Copyright (c) 1998-1999 David Huggins-Daines / Roman Zippel. + + I hereby give permission, free of charge, to copy, modify, and + redistribute this software, in source or binary form, provided that + the above copyright notice and the following disclaimer are included + in all such copies. + + THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL + OR IMPLIED. + +*/ + +#include "fp_emu.h" + +static const struct fp_ext fp_one = +{ + .exp = 0x3fff, +}; + +extern struct fp_ext *fp_fadd(struct fp_ext *dest, const struct fp_ext *src); +extern struct fp_ext *fp_fdiv(struct fp_ext *dest, const struct fp_ext *src); +extern struct fp_ext *fp_fmul(struct fp_ext *dest, const struct fp_ext *src); + +struct fp_ext * +fp_fsqrt(struct fp_ext *dest, struct fp_ext *src) +{ + struct fp_ext tmp, src2; + int i, exp; + + dprint(PINSTR, "fsqrt\n"); + + fp_monadic_check(dest, src); + + if (IS_ZERO(dest)) + return dest; + + if (dest->sign) { + fp_set_nan(dest); + return dest; + } + if (IS_INF(dest)) + return dest; + + /* + * sqrt(m) * 2^(p) , if e = 2*p + * sqrt(m*2^e) = + * sqrt(2*m) * 2^(p) , if e = 2*p + 1 + * + * So we use the last bit of the exponent to decide wether to + * use the m or 2*m. + * + * Since only the fractional part of the mantissa is stored and + * the integer part is assumed to be one, we place a 1 or 2 into + * the fixed point representation. + */ + exp = dest->exp; + dest->exp = 0x3FFF; + if (!(exp & 1)) /* lowest bit of exponent is set */ + dest->exp++; + fp_copy_ext(&src2, dest); + + /* + * The taylor row arround a for sqrt(x) is: + * sqrt(x) = sqrt(a) + 1/(2*sqrt(a))*(x-a) + R + * With a=1 this gives: + * sqrt(x) = 1 + 1/2*(x-1) + * = 1/2*(1+x) + */ + fp_fadd(dest, &fp_one); + dest->exp--; /* * 1/2 */ + + /* + * We now apply the newton rule to the function + * f(x) := x^2 - r + * which has a null point on x = sqrt(r). + * + * It gives: + * x' := x - f(x)/f'(x) + * = x - (x^2 -r)/(2*x) + * = x - (x - r/x)/2 + * = (2*x - x + r/x)/2 + * = (x + r/x)/2 + */ + for (i = 0; i < 9; i++) { + fp_copy_ext(&tmp, &src2); + + fp_fdiv(&tmp, dest); + fp_fadd(dest, &tmp); + dest->exp--; + } + + dest->exp += (exp - 0x3FFF) / 2; + + return dest; +} + +struct fp_ext * +fp_fetoxm1(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fetoxm1\n"); + + fp_monadic_check(dest, src); + + if (IS_ZERO(dest)) + return dest; + + return dest; +} + +struct fp_ext * +fp_fetox(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fetox\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_ftwotox(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("ftwotox\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_ftentox(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("ftentox\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_flogn(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("flogn\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_flognp1(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("flognp1\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_flog10(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("flog10\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_flog2(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("flog2\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_fgetexp(struct fp_ext *dest, struct fp_ext *src) +{ + dprint(PINSTR, "fgetexp\n"); + + fp_monadic_check(dest, src); + + if (IS_INF(dest)) { + fp_set_nan(dest); + return dest; + } + if (IS_ZERO(dest)) + return dest; + + fp_conv_long2ext(dest, (int)dest->exp - 0x3FFF); + + fp_normalize_ext(dest); + + return dest; +} + +struct fp_ext * +fp_fgetman(struct fp_ext *dest, struct fp_ext *src) +{ + dprint(PINSTR, "fgetman\n"); + + fp_monadic_check(dest, src); + + if (IS_ZERO(dest)) + return dest; + + if (IS_INF(dest)) + return dest; + + dest->exp = 0x3FFF; + + return dest; +} + diff --git a/arch/m68k/math-emu/fp_move.S b/arch/m68k/math-emu/fp_move.S new file mode 100644 index 00000000000..71bdf83ba61 --- /dev/null +++ b/arch/m68k/math-emu/fp_move.S @@ -0,0 +1,244 @@ +/* + * fp_move.S + * + * Copyright Roman Zippel, 1997. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU General Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fp_emu.h" +#include "fp_decode.h" + +do_no_pc_mode=1 + + .globl fp_fmove_fp2mem + +fp_fmove_fp2mem: + clr.b (2+FPD_FPSR,FPDATA) + fp_decode_dest_format + move.w %d0,%d1 | store data size twice in %d1 + swap %d1 | one can be trashed below + move.w %d0,%d1 +#ifdef FPU_EMU_DEBUG + lea 0f,%a0 + clr.l %d0 + move.b (%a0,%d1.w),%d0 + printf PDECODE,"fmove.%c ",1,%d0 + fp_decode_src_reg + printf PDECODE,"fp%d,",1,%d0 + + .data +0: .byte 'l','s','x','p','w','d','b','p' + .previous +#endif + + | encode addressing mode for dest + fp_decode_addr_mode + + .long fp_data, fp_ill + .long fp_indirect, fp_postinc + .long fp_predecr, fp_disp16 + .long fp_extmode0, fp_extmode1 + + | addressing mode: data register direct +fp_data: + fp_mode_data_direct + move.w %d0,%d1 + fp_decode_src_reg + fp_get_fp_reg + lea (FPD_TEMPFP1,FPDATA),%a1 + move.l (%a0)+,(%a1)+ + move.l (%a0)+,(%a1)+ + move.l (%a0),(%a1) + lea (-8,%a1),%a0 + swap %d1 + move.l %d1,%d2 + printf PDECODE,"\n" + jmp ([0f:w,%pc,%d1.w*4]) + + .align 4 +0: + .long fp_data_long, fp_data_single + .long fp_ill, fp_ill + .long fp_data_word, fp_ill + .long fp_data_byte, fp_ill + +fp_data_byte: + jsr fp_normalize_ext + jsr fp_conv_ext2byte + move.l %d0,%d1 + swap %d2 + move.w %d2,%d0 + jsr fp_get_data_reg + move.b %d1,%d0 + move.w %d2,%d1 + jsr fp_put_data_reg + jra fp_final + +fp_data_word: + jsr fp_normalize_ext + jsr fp_conv_ext2short + move.l %d0,%d1 + swap %d2 + move.w %d2,%d0 + jsr fp_get_data_reg + move.w %d1,%d0 + move.l %d2,%d1 + jsr fp_put_data_reg + jra fp_final + +fp_data_long: + jsr fp_normalize_ext + jsr fp_conv_ext2long + swap %d2 + move.w %d2,%d1 + jsr fp_put_data_reg + jra fp_final + +fp_data_single: + jsr fp_normalize_ext + jsr fp_conv_ext2single + swap %d2 + move.w %d2,%d1 + jsr fp_put_data_reg + jra fp_final + + | addressing mode: address register indirect +fp_indirect: + fp_mode_addr_indirect + jra fp_putdest + + | addressing mode: address register indirect with postincrement +fp_postinc: + fp_mode_addr_indirect_postinc + jra fp_putdest + + | addressing mode: address register indirect with predecrement +fp_predecr: + fp_mode_addr_indirect_predec + jra fp_putdest + + | addressing mode: address register indirect with 16bit displacement +fp_disp16: + fp_mode_addr_indirect_disp16 + jra fp_putdest + +fp_extmode0: + fp_mode_addr_indirect_extmode0 + jra fp_putdest + +fp_extmode1: + fp_decode_addr_reg + jmp ([0f:w,%pc,%d0*4]) + + .align 4 +0: + .long fp_abs_short, fp_abs_long + .long fp_ill, fp_ill + .long fp_ill, fp_ill + .long fp_ill, fp_ill + +fp_abs_short: + fp_mode_abs_short + jra fp_putdest + +fp_abs_long: + fp_mode_abs_long + jra fp_putdest + +fp_putdest: + move.l %a0,%a1 + fp_decode_src_reg + move.l %d1,%d2 | save size + fp_get_fp_reg + printf PDECODE,"\n" + addq.l #8,%a0 + move.l (%a0),-(%sp) + move.l -(%a0),-(%sp) + move.l -(%a0),-(%sp) + move.l %sp,%a0 + jsr fp_normalize_ext + + swap %d2 + jmp ([0f:w,%pc,%d2.w*4]) + + .align 4 +0: + .long fp_format_long, fp_format_single + .long fp_format_extended, fp_format_packed + .long fp_format_word, fp_format_double + .long fp_format_byte, fp_format_packed + +fp_format_long: + jsr fp_conv_ext2long + putuser.l %d0,(%a1),fp_err_ua1,%a1 + jra fp_finish_move + +fp_format_single: + jsr fp_conv_ext2single + putuser.l %d0,(%a1),fp_err_ua1,%a1 + jra fp_finish_move + +fp_format_extended: + move.l (%a0)+,%d0 + lsl.w #1,%d0 + lsl.l #7,%d0 + lsl.l #8,%d0 + putuser.l %d0,(%a1)+,fp_err_ua1,%a1 + move.l (%a0)+,%d0 + putuser.l %d0,(%a1)+,fp_err_ua1,%a1 + move.l (%a0),%d0 + putuser.l %d0,(%a1),fp_err_ua1,%a1 + jra fp_finish_move + +fp_format_packed: + /* not supported yet */ + lea (12,%sp),%sp + jra fp_ill + +fp_format_word: + jsr fp_conv_ext2short + putuser.w %d0,(%a1),fp_err_ua1,%a1 + jra fp_finish_move + +fp_format_double: + jsr fp_conv_ext2double + jra fp_finish_move + +fp_format_byte: + jsr fp_conv_ext2byte + putuser.b %d0,(%a1),fp_err_ua1,%a1 +| jra fp_finish_move + +fp_finish_move: + lea (12,%sp),%sp + jra fp_final diff --git a/arch/m68k/math-emu/fp_movem.S b/arch/m68k/math-emu/fp_movem.S new file mode 100644 index 00000000000..8354d39e6c4 --- /dev/null +++ b/arch/m68k/math-emu/fp_movem.S @@ -0,0 +1,368 @@ +/* + * fp_movem.S + * + * Copyright Roman Zippel, 1997. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU General Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fp_emu.h" +#include "fp_decode.h" + +| set flags for decode macros for fmovem +do_fmovem=1 + + .globl fp_fmovem_fp, fp_fmovem_cr + +| %d1 contains the mask and count of the register list +| for other register usage see fp_decode.h + +fp_fmovem_fp: + printf PDECODE,"fmovem.x " + | get register list and count them + btst #11,%d2 + jne 1f + bfextu %d2{#24,#8},%d0 | static register list + jra 2f +1: bfextu %d2{#25,#3},%d0 | dynamic register list + jsr fp_get_data_reg +2: move.l %d0,%d1 + swap %d1 + jra 2f +1: addq.w #1,%d1 | count the # of registers in +2: lsr.b #1,%d0 | register list and keep it in %d1 + jcs 1b + jne 2b + printf PDECODE,"#%08x",1,%d1 +#ifdef FPU_EMU_DEBUG + btst #12,%d2 + jne 1f + printf PDECODE,"-" | decremental move + jra 2f +1: printf PDECODE,"+" | incremental move +2: btst #13,%d2 + jeq 1f + printf PDECODE,"->" | fpu -> cpu + jra 2f +1: printf PDECODE,"<-" | fpu <- cpu +2: +#endif + + | decode address mode + fp_decode_addr_mode + + .long fp_ill, fp_ill + .long fpr_indirect, fpr_postinc + .long fpr_predecr, fpr_disp16 + .long fpr_extmode0, fpr_extmode1 + + | addressing mode: address register indirect +fpr_indirect: + fp_mode_addr_indirect + jra fpr_do_movem + + | addressing mode: address register indirect with postincrement +fpr_postinc: + fp_mode_addr_indirect_postinc + jra fpr_do_movem + +fpr_predecr: + fp_mode_addr_indirect_predec + jra fpr_do_movem + + | addressing mode: address register/programm counter indirect + | with 16bit displacement +fpr_disp16: + fp_mode_addr_indirect_disp16 + jra fpr_do_movem + +fpr_extmode0: + fp_mode_addr_indirect_extmode0 + jra fpr_do_movem + +fpr_extmode1: + fp_decode_addr_reg + jmp ([0f:w,%pc,%d0*4]) + + .align 4 +0: + .long fpr_absolute_short, fpr_absolute_long + .long fpr_disp16, fpr_extmode0 + .long fp_ill, fp_ill + .long fp_ill, fp_ill + +fpr_absolute_short: + fp_mode_abs_short + jra fpr_do_movem + +fpr_absolute_long: + fp_mode_abs_long +| jra fpr_do_movem + +fpr_do_movem: + swap %d1 | get fpu register list + lea (FPD_FPREG,FPDATA),%a1 + moveq #12,%d0 + btst #12,%d2 + jne 1f + lea (-12,%a1,%d0*8),%a1 + neg.l %d0 +1: btst #13,%d2 + jne 4f + | move register from memory into fpu + jra 3f +1: printf PMOVEM,"(%p>%p)",2,%a0,%a1 + getuser.l (%a0)+,%d2,fp_err_ua1,%a0 + lsr.l #8,%d2 + lsr.l #7,%d2 + lsr.w #1,%d2 + move.l %d2,(%a1)+ + getuser.l (%a0)+,%d2,fp_err_ua1,%a0 + move.l %d2,(%a1)+ + getuser.l (%a0),%d2,fp_err_ua1,%a0 + move.l %d2,(%a1) + subq.l #8,%a0 + subq.l #8,%a1 + add.l %d0,%a0 +2: add.l %d0,%a1 +3: lsl.b #1,%d1 + jcs 1b + jne 2b + jra 5f + | move register from fpu into memory +1: printf PMOVEM,"(%p>%p)",2,%a1,%a0 + move.l (%a1)+,%d2 + lsl.w #1,%d2 + lsl.l #7,%d2 + lsl.l #8,%d2 + putuser.l %d2,(%a0)+,fp_err_ua1,%a0 + move.l (%a1)+,%d2 + putuser.l %d2,(%a0)+,fp_err_ua1,%a0 + move.l (%a1),%d2 + putuser.l %d2,(%a0),fp_err_ua1,%a0 + subq.l #8,%a1 + subq.l #8,%a0 + add.l %d0,%a0 +2: add.l %d0,%a1 +4: lsl.b #1,%d1 + jcs 1b + jne 2b +5: + printf PDECODE,"\n" +#if 0 + lea (FPD_FPREG,FPDATA),%a0 + printf PMOVEM,"fp:" + printx PMOVEM,%a0@(0) + printx PMOVEM,%a0@(12) + printf PMOVEM,"\n " + printx PMOVEM,%a0@(24) + printx PMOVEM,%a0@(36) + printf PMOVEM,"\n " + printx PMOVEM,%a0@(48) + printx PMOVEM,%a0@(60) + printf PMOVEM,"\n " + printx PMOVEM,%a0@(72) + printx PMOVEM,%a0@(84) + printf PMOVEM,"\n" +#endif + jra fp_end + +| set flags for decode macros for fmovem control register +do_fmovem=1 +do_fmovem_cr=1 + +fp_fmovem_cr: + printf PDECODE,"fmovem.cr " + | get register list and count them + bfextu %d2{#19,#3},%d0 + move.l %d0,%d1 + swap %d1 + jra 2f +1: addq.w #1,%d1 +2: lsr.l #1,%d0 + jcs 1b + jne 2b + printf PDECODE,"#%08x",1,%d1 +#ifdef FPU_EMU_DEBUG + btst #13,%d2 + jeq 1f + printf PDECODE,"->" | fpu -> cpu + jra 2f +1: printf PDECODE,"<-" | fpu <- cpu +2: +#endif + + | decode address mode + fp_decode_addr_mode + + .long fpc_data, fpc_addr + .long fpc_indirect, fpc_postinc + .long fpc_predecr, fpc_disp16 + .long fpc_extmode0, fpc_extmode1 + +fpc_data: + fp_mode_data_direct + move.w %d0,%d1 + bfffo %d2{#19,#3},%d0 + sub.w #19,%d0 + lea (FPD_FPCR,FPDATA,%d0.w*4),%a1 + btst #13,%d2 + jne 1f + move.w %d1,%d0 + jsr fp_get_data_reg + move.l %d0,(%a1) + jra fpc_movem_fin +1: move.l (%a1),%d0 + jsr fp_put_data_reg + jra fpc_movem_fin + +fpc_addr: + fp_decode_addr_reg + printf PDECODE,"a%d",1,%d0 + btst #13,%d2 + jne 1f + jsr fp_get_addr_reg + move.l %a0,(FPD_FPIAR,FPDATA) + jra fpc_movem_fin +1: move.l (FPD_FPIAR,FPDATA),%a0 + jsr fp_put_addr_reg + jra fpc_movem_fin + +fpc_indirect: + fp_mode_addr_indirect + jra fpc_do_movem + +fpc_postinc: + fp_mode_addr_indirect_postinc + jra fpc_do_movem + +fpc_predecr: + fp_mode_addr_indirect_predec + jra fpc_do_movem + +fpc_disp16: + fp_mode_addr_indirect_disp16 + jra fpc_do_movem + +fpc_extmode0: + fp_mode_addr_indirect_extmode0 + jra fpc_do_movem + +fpc_extmode1: + fp_decode_addr_reg + jmp ([0f:w,%pc,%d0*4]) + + .align 4 +0: + .long fpc_absolute_short, fpc_absolute_long + .long fpc_disp16, fpc_extmode0 + .long fpc_immediate, fp_ill + .long fp_ill, fp_ill + +fpc_absolute_short: + fp_mode_abs_short + jra fpc_do_movem + +fpc_absolute_long: + fp_mode_abs_long + jra fpc_do_movem + +fpc_immediate: + fp_get_pc %a0 + lea (%a0,%d1.w*4),%a1 + fp_put_pc %a1 + printf PDECODE,"#imm" +| jra fpc_do_movem +#if 0 + swap %d1 + lsl.l #5,%d1 + lea (FPD_FPCR,FPDATA),%a0 + jra 3f +1: move.l %d0,(%a0) +2: addq.l #4,%a0 +3: lsl.b #1,%d1 + jcs 1b + jne 2b + jra fpc_movem_fin +#endif + +fpc_do_movem: + swap %d1 | get fpu register list + lsl.l #5,%d1 + lea (FPD_FPCR,FPDATA),%a1 +1: btst #13,%d2 + jne 4f + + | move register from memory into fpu + jra 3f +1: printf PMOVEM,"(%p>%p)",2,%a0,%a1 + getuser.l (%a0)+,%d0,fp_err_ua1,%a0 + move.l %d0,(%a1) +2: addq.l #4,%a1 +3: lsl.b #1,%d1 + jcs 1b + jne 2b + jra fpc_movem_fin + + | move register from fpu into memory +1: printf PMOVEM,"(%p>%p)",2,%a1,%a0 + move.l (%a1),%d0 + putuser.l %d0,(%a0)+,fp_err_ua1,%a0 +2: addq.l #4,%a1 +4: lsl.b #1,%d1 + jcs 1b + jne 2b + +fpc_movem_fin: + and.l #0x0000fff0,(FPD_FPCR,FPDATA) + and.l #0x0ffffff8,(FPD_FPSR,FPDATA) + move.l (FPD_FPCR,FPDATA),%d0 + lsr.l #4,%d0 + moveq #3,%d1 + and.l %d0,%d1 + move.w %d1,(FPD_RND,FPDATA) + lsr.l #2,%d0 + moveq #3,%d1 + and.l %d0,%d1 + move.w %d1,(FPD_PREC,FPDATA) + printf PDECODE,"\n" +#if 0 + printf PMOVEM,"fpcr : %08x\n",1,FPDATA@(FPD_FPCR) + printf PMOVEM,"fpsr : %08x\n",1,FPDATA@(FPD_FPSR) + printf PMOVEM,"fpiar: %08x\n",1,FPDATA@(FPD_FPIAR) + clr.l %d0 + move.w (FPD_PREC,FPDATA),%d0 + printf PMOVEM,"prec : %04x\n",1,%d0 + move.w (FPD_RND,FPDATA),%d0 + printf PMOVEM,"rnd : %04x\n",1,%d0 +#endif + jra fp_end diff --git a/arch/m68k/math-emu/fp_scan.S b/arch/m68k/math-emu/fp_scan.S new file mode 100644 index 00000000000..e4146ed574d --- /dev/null +++ b/arch/m68k/math-emu/fp_scan.S @@ -0,0 +1,478 @@ +/* + * fp_scan.S + * + * Copyright Roman Zippel, 1997. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU General Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fp_emu.h" +#include "fp_decode.h" + + .globl fp_scan, fp_datasize + + .data + +| %d2 - first two instr words +| %d1 - operand size + +/* operand formats are: + + Long = 0, i.e. fmove.l + Single, i.e. fmove.s + Extended, i.e. fmove.x + Packed-BCD, i.e. fmove.p + Word, i.e. fmove.w + Double, i.e. fmove.d +*/ + + .text + +| On entry: +| FPDATA - base of emulated FPU registers + +fp_scan: +| normal fpu instruction? (this excludes fsave/frestore) + fp_get_pc %a0 + printf PDECODE,"%08x: ",1,%a0 + getuser.b (%a0),%d0,fp_err_ua1,%a0 +#if 1 + cmp.b #0xf2,%d0 | cpid = 1 +#else + cmp.b #0xfc,%d0 | cpid = 6 +#endif + jne fp_nonstd +| first two instruction words are kept in %d2 + getuser.l (%a0)+,%d2,fp_err_ua1,%a0 + fp_put_pc %a0 +fp_decode_cond: | separate conditional instr + fp_decode_cond_instr_type + + .long fp_decode_move, fp_fscc + .long fp_fbccw, fp_fbccl + +fp_decode_move: | separate move instr + fp_decode_move_instr_type + + .long fp_fgen_fp, fp_ill + .long fp_fgen_ea, fp_fmove_fp2mem + .long fp_fmovem_cr, fp_fmovem_cr + .long fp_fmovem_fp, fp_fmovem_fp + +| now all arithmetic instr and a few move instr are left +fp_fgen_fp: | source is a fpu register + clr.b (FPD_FPSR+2,FPDATA) | clear the exception byte + fp_decode_sourcespec + printf PDECODE,"f.x fp%d",1,%d0 + fp_get_fp_reg + lea (FPD_TEMPFP1,FPDATA),%a1 | copy src into a temp location + move.l (%a0)+,(%a1)+ + move.l (%a0)+,(%a1)+ + move.l (%a0),(%a1) + lea (-8,%a1),%a0 + jra fp_getdest + +fp_fgen_ea: | source is + clr.b (FPD_FPSR+2,FPDATA) | clear the exception byte + | sort out fmovecr, keep data size in %d1 + fp_decode_sourcespec + cmp.w #7,%d0 + jeq fp_fmovecr + move.w %d0,%d1 | store data size twice in %d1 + swap %d1 | one can be trashed below + move.w %d0,%d1 +#ifdef FPU_EMU_DEBUG + lea 0f,%a0 + clr.l %d0 + move.b (%a0,%d1.w),%d0 + printf PDECODE,"f.%c ",1,%d0 + + .data +0: .byte 'l','s','x','p','w','d','b',0 + .previous +#endif + +/* + fp_getsource, fp_getdest + + basically, we end up with a pointer to the source operand in + %a1, and a pointer to the destination operand in %a0. both + are, of course, 96-bit extended floating point numbers. +*/ + +fp_getsource: + | decode addressing mode for source + fp_decode_addr_mode + + .long fp_data, fp_ill + .long fp_indirect, fp_postinc + .long fp_predecr, fp_disp16 + .long fp_extmode0, fp_extmode1 + + | addressing mode: data register direct +fp_data: + fp_mode_data_direct + jsr fp_get_data_reg + lea (FPD_TEMPFP1,FPDATA),%a0 + jmp ([0f:w,%pc,%d1.w*4]) + + .align 4 +0: + .long fp_data_long, fp_data_single + .long fp_ill, fp_ill + .long fp_data_word, fp_ill + .long fp_data_byte, fp_ill + + | data types that fit in an integer data register +fp_data_byte: + extb.l %d0 + jra fp_data_long + +fp_data_word: + ext.l %d0 + +fp_data_long: + jsr fp_conv_long2ext + jra fp_getdest + +fp_data_single: + jsr fp_conv_single2ext + jra fp_getdest + + | addressing mode: address register indirect +fp_indirect: + fp_mode_addr_indirect + jra fp_fetchsource + + | addressing mode: address register indirect with postincrement +fp_postinc: + fp_mode_addr_indirect_postinc + jra fp_fetchsource + + | addressing mode: address register indirect with predecrement +fp_predecr: + fp_mode_addr_indirect_predec + jra fp_fetchsource + + | addressing mode: address register/programm counter indirect + | with 16bit displacement +fp_disp16: + fp_mode_addr_indirect_disp16 + jra fp_fetchsource + + | all other indirect addressing modes will finally end up here +fp_extmode0: + fp_mode_addr_indirect_extmode0 + jra fp_fetchsource + +| all pc relative addressing modes and immediate/absolute modes end up here +| the first ones are sent to fp_extmode0 or fp_disp16 +| and only the latter are handled here +fp_extmode1: + fp_decode_addr_reg + jmp ([0f:w,%pc,%d0*4]) + + .align 4 +0: + .long fp_abs_short, fp_abs_long + .long fp_disp16, fp_extmode0 + .long fp_immediate, fp_ill + .long fp_ill, fp_ill + + | addressing mode: absolute short +fp_abs_short: + fp_mode_abs_short + jra fp_fetchsource + + | addressing mode: absolute long +fp_abs_long: + fp_mode_abs_long + jra fp_fetchsource + + | addressing mode: immediate data +fp_immediate: + printf PDECODE,"#" + fp_get_pc %a0 + move.w (fp_datasize,%d1.w*2),%d0 + addq.w #1,%d0 + and.w #-2,%d0 +#ifdef FPU_EMU_DEBUG + movem.l %d0/%d1,-(%sp) + movel %a0,%a1 + clr.l %d1 + jra 2f +1: getuser.b (%a1)+,%d1,fp_err_ua1,%a1 + printf PDECODE,"%02x",1,%d1 +2: dbra %d0,1b + movem.l (%sp)+,%d0/%d1 +#endif + lea (%a0,%d0.w),%a1 + fp_put_pc %a1 +| jra fp_fetchsource + +fp_fetchsource: + move.l %a0,%a1 + swap %d1 + lea (FPD_TEMPFP1,FPDATA),%a0 + jmp ([0f:w,%pc,%d1.w*4]) + + .align 4 +0: .long fp_long, fp_single + .long fp_ext, fp_pack + .long fp_word, fp_double + .long fp_byte, fp_ill + +fp_long: + getuser.l (%a1),%d0,fp_err_ua1,%a1 + jsr fp_conv_long2ext + jra fp_getdest + +fp_single: + getuser.l (%a1),%d0,fp_err_ua1,%a1 + jsr fp_conv_single2ext + jra fp_getdest + +fp_ext: + getuser.l (%a1)+,%d0,fp_err_ua1,%a1 + lsr.l #8,%d0 + lsr.l #7,%d0 + lsr.w #1,%d0 + move.l %d0,(%a0)+ + getuser.l (%a1)+,%d0,fp_err_ua1,%a1 + move.l %d0,(%a0)+ + getuser.l (%a1),%d0,fp_err_ua1,%a1 + move.l %d0,(%a0) + subq.l #8,%a0 + jra fp_getdest + +fp_pack: + /* not supported yet */ + jra fp_ill + +fp_word: + getuser.w (%a1),%d0,fp_err_ua1,%a1 + ext.l %d0 + jsr fp_conv_long2ext + jra fp_getdest + +fp_double: + jsr fp_conv_double2ext + jra fp_getdest + +fp_byte: + getuser.b (%a1),%d0,fp_err_ua1,%a1 + extb.l %d0 + jsr fp_conv_long2ext +| jra fp_getdest + +fp_getdest: + move.l %a0,%a1 + bfextu %d2{#22,#3},%d0 + printf PDECODE,",fp%d\n",1,%d0 + fp_get_fp_reg + movem.l %a0/%a1,-(%sp) + pea fp_finalrounding + bfextu %d2{#25,#7},%d0 + jmp ([0f:w,%pc,%d0*4]) + + .align 4 +0: + .long fp_fmove_mem2fp, fp_fint, fp_fsinh, fp_fintrz + .long fp_fsqrt, fp_ill, fp_flognp1, fp_ill + .long fp_fetoxm1, fp_ftanh, fp_fatan, fp_ill + .long fp_fasin, fp_fatanh, fp_fsin, fp_ftan + .long fp_fetox, fp_ftwotox, fp_ftentox, fp_ill + .long fp_flogn, fp_flog10, fp_flog2, fp_ill + .long fp_fabs, fp_fcosh, fp_fneg, fp_ill + .long fp_facos, fp_fcos, fp_fgetexp, fp_fgetman + .long fp_fdiv, fp_fmod, fp_fadd, fp_fmul + .long fpa_fsgldiv, fp_frem, fp_fscale, fpa_fsglmul + .long fp_fsub, fp_ill, fp_ill, fp_ill + .long fp_ill, fp_ill, fp_ill, fp_ill + .long fp_fsincos0, fp_fsincos1, fp_fsincos2, fp_fsincos3 + .long fp_fsincos4, fp_fsincos5, fp_fsincos6, fp_fsincos7 + .long fp_fcmp, fp_ill, fp_ftst, fp_ill + .long fp_ill, fp_ill, fp_ill, fp_ill + .long fp_fsmove, fp_fssqrt, fp_ill, fp_ill + .long fp_fdmove, fp_fdsqrt, fp_ill, fp_ill + .long fp_ill, fp_ill, fp_ill, fp_ill + .long fp_ill, fp_ill, fp_ill, fp_ill + .long fp_ill, fp_ill, fp_ill, fp_ill + .long fp_ill, fp_ill, fp_ill, fp_ill + .long fp_fsabs, fp_ill, fp_fsneg, fp_ill + .long fp_fdabs, fp_ill, fp_fdneg, fp_ill + .long fp_fsdiv, fp_ill, fp_fsadd, fp_fsmul + .long fp_fddiv, fp_ill, fp_fdadd, fp_fdmul + .long fp_fssub, fp_ill, fp_ill, fp_ill + .long fp_fdsub, fp_ill, fp_ill, fp_ill + .long fp_ill, fp_ill, fp_ill, fp_ill + .long fp_ill, fp_ill, fp_ill, fp_ill + .long fp_ill, fp_ill, fp_ill, fp_ill + .long fp_ill, fp_ill, fp_ill, fp_ill + + | Instructions follow + + | Move an (emulated) ROM constant +fp_fmovecr: + bfextu %d2{#27,#5},%d0 + printf PINSTR,"fp_fmovecr #%d",1,%d0 + move.l %d0,%d1 + add.l %d0,%d0 + add.l %d1,%d0 + lea (fp_constants,%d0*4),%a0 + move.l #0x801cc0ff,%d0 + addq.l #1,%d1 + lsl.l %d1,%d0 + jcc 1f + fp_set_sr FPSR_EXC_INEX2 | INEX2 exception +1: moveq #-128,%d0 | continue with fmove + and.l %d0,%d2 + jra fp_getdest + + .data + .align 4 +fp_constants: + .long 0x00004000,0xc90fdaa2,0x2168c235 | pi + .extend 0,0,0,0,0,0,0,0,0,0 + .long 0x00003ffd,0x9a209a84,0xfbcff798 | log10(2) + .long 0x00004000,0xadf85458,0xa2bb4a9a | e + .long 0x00003fff,0xb8aa3b29,0x5c17f0bc | log2(e) + .long 0x00003ffd,0xde5bd8a9,0x37287195 | log10(e) + .long 0x00000000,0x00000000,0x00000000 | 0.0 + .long 0x00003ffe,0xb17217f7,0xd1cf79ac | 1n(2) + .long 0x00004000,0x935d8ddd,0xaaa8ac17 | 1n(10) + | read this as "1.0 * 2^0" - note the high bit in the mantissa + .long 0x00003fff,0x80000000,0x00000000 | 10^0 + .long 0x00004002,0xa0000000,0x00000000 | 10^1 + .long 0x00004005,0xc8000000,0x00000000 | 10^2 + .long 0x0000400c,0x9c400000,0x00000000 | 10^4 + .long 0x00004019,0xbebc2000,0x00000000 | 10^8 + .long 0x00004034,0x8e1bc9bf,0x04000000 | 10^16 + .long 0x00004069,0x9dc5ada8,0x2b70b59e | 10^32 + .long 0x000040d3,0xc2781f49,0xffcfa6d5 | 10^64 + .long 0x000041a8,0x93ba47c9,0x80e98ce0 | 10^128 + .long 0x00004351,0xaa7eebfb,0x9df9de8e | 10^256 + .long 0x000046a3,0xe319a0ae,0xa60e91c7 | 10^512 + .long 0x00004d48,0xc9767586,0x81750c17 | 10^1024 + .long 0x00005a92,0x9e8b3b5d,0xc53d5de5 | 10^2048 + .long 0x00007525,0xc4605202,0x8a20979b | 10^4096 + .previous + +fp_fmove_mem2fp: + printf PINSTR,"fmove %p,%p\n",2,%a0,%a1 + move.l (%a1)+,(%a0)+ + move.l (%a1)+,(%a0)+ + move.l (%a1),(%a0) + subq.l #8,%a0 + rts + +fpa_fsglmul: + move.l #fp_finalrounding_single_fast,(%sp) + jra fp_fsglmul + +fpa_fsgldiv: + move.l #fp_finalrounding_single_fast,(%sp) + jra fp_fsgldiv + +.macro fp_dosingleprec instr + printf PINSTR,"single " + move.l #fp_finalrounding_single,(%sp) + jra \instr +.endm + +.macro fp_dodoubleprec instr + printf PINSTR,"double " + move.l #fp_finalrounding_double,(%sp) + jra \instr +.endm + +fp_fsmove: + fp_dosingleprec fp_fmove_mem2fp + +fp_fssqrt: + fp_dosingleprec fp_fsqrt + +fp_fdmove: + fp_dodoubleprec fp_fmove_mem2fp + +fp_fdsqrt: + fp_dodoubleprec fp_fsqrt + +fp_fsabs: + fp_dosingleprec fp_fabs + +fp_fsneg: + fp_dosingleprec fp_fneg + +fp_fdabs: + fp_dodoubleprec fp_fabs + +fp_fdneg: + fp_dodoubleprec fp_fneg + +fp_fsdiv: + fp_dosingleprec fp_fdiv + +fp_fsadd: + fp_dosingleprec fp_fadd + +fp_fsmul: + fp_dosingleprec fp_fmul + +fp_fddiv: + fp_dodoubleprec fp_fdiv + +fp_fdadd: + fp_dodoubleprec fp_fadd + +fp_fdmul: + fp_dodoubleprec fp_fmul + +fp_fssub: + fp_dosingleprec fp_fsub + +fp_fdsub: + fp_dodoubleprec fp_fsub + +fp_nonstd: + fp_get_pc %a0 + getuser.l (%a0),%d0,fp_err_ua1,%a0 + printf ,"nonstd ((%08x)=%08x)\n",2,%a0,%d0 + moveq #-1,%d0 + rts + + .data + .align 4 + + | data sizes corresponding to the operand formats +fp_datasize: + .word 4, 4, 12, 12, 2, 8, 1, 0 diff --git a/arch/m68k/math-emu/fp_trig.c b/arch/m68k/math-emu/fp_trig.c new file mode 100644 index 00000000000..6361d0784df --- /dev/null +++ b/arch/m68k/math-emu/fp_trig.c @@ -0,0 +1,183 @@ +/* + + fp_trig.c: floating-point math routines for the Linux-m68k + floating point emulator. + + Copyright (c) 1998-1999 David Huggins-Daines / Roman Zippel. + + I hereby give permission, free of charge, to copy, modify, and + redistribute this software, in source or binary form, provided that + the above copyright notice and the following disclaimer are included + in all such copies. + + THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL + OR IMPLIED. + +*/ + +#include "fp_emu.h" +#include "fp_trig.h" + +struct fp_ext * +fp_fsin(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fsin\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_fcos(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fcos\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_ftan(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("ftan\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_fasin(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fasin\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_facos(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("facos\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_fatan(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fatan\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_fsinh(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fsinh\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_fcosh(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fcosh\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_ftanh(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("ftanh\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_fatanh(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fatanh\n"); + + fp_monadic_check(dest, src); + + return dest; +} + +struct fp_ext * +fp_fsincos0(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fsincos0\n"); + + return dest; +} + +struct fp_ext * +fp_fsincos1(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fsincos1\n"); + + return dest; +} + +struct fp_ext * +fp_fsincos2(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fsincos2\n"); + + return dest; +} + +struct fp_ext * +fp_fsincos3(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fsincos3\n"); + + return dest; +} + +struct fp_ext * +fp_fsincos4(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fsincos4\n"); + + return dest; +} + +struct fp_ext * +fp_fsincos5(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fsincos5\n"); + + return dest; +} + +struct fp_ext * +fp_fsincos6(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fsincos6\n"); + + return dest; +} + +struct fp_ext * +fp_fsincos7(struct fp_ext *dest, struct fp_ext *src) +{ + uprint("fsincos7\n"); + + return dest; +} diff --git a/arch/m68k/math-emu/fp_trig.h b/arch/m68k/math-emu/fp_trig.h new file mode 100644 index 00000000000..af8b247e9c9 --- /dev/null +++ b/arch/m68k/math-emu/fp_trig.h @@ -0,0 +1,32 @@ +/* + + fp_trig.h: floating-point math routines for the Linux-m68k + floating point emulator. + + Copyright (c) 1998 David Huggins-Daines. + + I hereby give permission, free of charge, to copy, modify, and + redistribute this software, in source or binary form, provided that + the above copyright notice and the following disclaimer are included + in all such copies. + + THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL + OR IMPLIED. + +*/ + +#ifndef FP_TRIG_H +#define FP_TRIG_H + +#include "fp_emu.h" + +/* floating point trigonometric instructions: + + the arguments to these are in the "internal" extended format, that + is, an "exploded" version of the 96-bit extended fp format used by + the 68881. + + they return a status code, which should end up in %d0, if all goes + well. */ + +#endif /* FP_TRIG__H */ diff --git a/arch/m68k/math-emu/fp_util.S b/arch/m68k/math-emu/fp_util.S new file mode 100644 index 00000000000..a9f7f012906 --- /dev/null +++ b/arch/m68k/math-emu/fp_util.S @@ -0,0 +1,1455 @@ +/* + * fp_util.S + * + * Copyright Roman Zippel, 1997. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU General Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "fp_emu.h" + +/* + * Here are lots of conversion and normalization functions mainly + * used by fp_scan.S + * Note that these functions are optimized for "normal" numbers, + * these are handled first and exit as fast as possible, this is + * especially important for fp_normalize_ext/fp_conv_ext2ext, as + * it's called very often. + * The register usage is optimized for fp_scan.S and which register + * is currently at that time unused, be careful if you want change + * something here. %d0 and %d1 is always usable, sometimes %d2 (or + * only the lower half) most function have to return the %a0 + * unmodified, so that the caller can immediately reuse it. + */ + + .globl fp_ill, fp_end + + | exits from fp_scan: + | illegal instruction +fp_ill: + printf ,"fp_illegal\n" + rts + | completed instruction +fp_end: + tst.l (TASK_MM-8,%a2) + jmi 1f + tst.l (TASK_MM-4,%a2) + jmi 1f + tst.l (TASK_MM,%a2) + jpl 2f +1: printf ,"oops:%p,%p,%p\n",3,%a2@(TASK_MM-8),%a2@(TASK_MM-4),%a2@(TASK_MM) +2: clr.l %d0 + rts + + .globl fp_conv_long2ext, fp_conv_single2ext + .globl fp_conv_double2ext, fp_conv_ext2ext + .globl fp_normalize_ext, fp_normalize_double + .globl fp_normalize_single, fp_normalize_single_fast + .globl fp_conv_ext2double, fp_conv_ext2single + .globl fp_conv_ext2long, fp_conv_ext2short + .globl fp_conv_ext2byte + .globl fp_finalrounding_single, fp_finalrounding_single_fast + .globl fp_finalrounding_double + .globl fp_finalrounding, fp_finaltest, fp_final + +/* + * First several conversion functions from a source operand + * into the extended format. Note, that only fp_conv_ext2ext + * normalizes the number and is always called after the other + * conversion functions, which only move the information into + * fp_ext structure. + */ + + | fp_conv_long2ext: + | + | args: %d0 = source (32-bit long) + | %a0 = destination (ptr to struct fp_ext) + +fp_conv_long2ext: + printf PCONV,"l2e: %p -> %p(",2,%d0,%a0 + clr.l %d1 | sign defaults to zero + tst.l %d0 + jeq fp_l2e_zero | is source zero? + jpl 1f | positive? + moveq #1,%d1 + neg.l %d0 +1: swap %d1 + move.w #0x3fff+31,%d1 + move.l %d1,(%a0)+ | set sign / exp + move.l %d0,(%a0)+ | set mantissa + clr.l (%a0) + subq.l #8,%a0 | restore %a0 + printx PCONV,%a0@ + printf PCONV,")\n" + rts + | source is zero +fp_l2e_zero: + clr.l (%a0)+ + clr.l (%a0)+ + clr.l (%a0) + subq.l #8,%a0 + printx PCONV,%a0@ + printf PCONV,")\n" + rts + + | fp_conv_single2ext + | args: %d0 = source (single-precision fp value) + | %a0 = dest (struct fp_ext *) + +fp_conv_single2ext: + printf PCONV,"s2e: %p -> %p(",2,%d0,%a0 + move.l %d0,%d1 + lsl.l #8,%d0 | shift mantissa + lsr.l #8,%d1 | exponent / sign + lsr.l #7,%d1 + lsr.w #8,%d1 + jeq fp_s2e_small | zero / denormal? + cmp.w #0xff,%d1 | NaN / Inf? + jeq fp_s2e_large + bset #31,%d0 | set explizit bit + add.w #0x3fff-0x7f,%d1 | re-bias the exponent. +9: move.l %d1,(%a0)+ | fp_ext.sign, fp_ext.exp + move.l %d0,(%a0)+ | high lword of fp_ext.mant + clr.l (%a0) | low lword = 0 + subq.l #8,%a0 + printx PCONV,%a0@ + printf PCONV,")\n" + rts + | zeros and denormalized +fp_s2e_small: + | exponent is zero, so explizit bit is already zero too + tst.l %d0 + jeq 9b + move.w #0x4000-0x7f,%d1 + jra 9b + | infinities and NAN +fp_s2e_large: + bclr #31,%d0 | clear explizit bit + move.w #0x7fff,%d1 + jra 9b + +fp_conv_double2ext: +#ifdef FPU_EMU_DEBUG + getuser.l %a1@(0),%d0,fp_err_ua2,%a1 + getuser.l %a1@(4),%d1,fp_err_ua2,%a1 + printf PCONV,"d2e: %p%p -> %p(",3,%d0,%d1,%a0 +#endif + getuser.l (%a1)+,%d0,fp_err_ua2,%a1 + move.l %d0,%d1 + lsl.l #8,%d0 | shift high mantissa + lsl.l #3,%d0 + lsr.l #8,%d1 | exponent / sign + lsr.l #7,%d1 + lsr.w #5,%d1 + jeq fp_d2e_small | zero / denormal? + cmp.w #0x7ff,%d1 | NaN / Inf? + jeq fp_d2e_large + bset #31,%d0 | set explizit bit + add.w #0x3fff-0x3ff,%d1 | re-bias the exponent. +9: move.l %d1,(%a0)+ | fp_ext.sign, fp_ext.exp + move.l %d0,(%a0)+ + getuser.l (%a1)+,%d0,fp_err_ua2,%a1 + move.l %d0,%d1 + lsl.l #8,%d0 + lsl.l #3,%d0 + move.l %d0,(%a0) + moveq #21,%d0 + lsr.l %d0,%d1 + or.l %d1,-(%a0) + subq.l #4,%a0 + printx PCONV,%a0@ + printf PCONV,")\n" + rts + | zeros and denormalized +fp_d2e_small: + | exponent is zero, so explizit bit is already zero too + tst.l %d0 + jeq 9b + move.w #0x4000-0x3ff,%d1 + jra 9b + | infinities and NAN +fp_d2e_large: + bclr #31,%d0 | clear explizit bit + move.w #0x7fff,%d1 + jra 9b + + | fp_conv_ext2ext: + | originally used to get longdouble from userspace, now it's + | called before arithmetic operations to make sure the number + | is normalized [maybe rename it?]. + | args: %a0 = dest (struct fp_ext *) + | returns 0 in %d0 for a NaN, otherwise 1 + +fp_conv_ext2ext: + printf PCONV,"e2e: %p(",1,%a0 + printx PCONV,%a0@ + printf PCONV,"), " + move.l (%a0)+,%d0 + cmp.w #0x7fff,%d0 | Inf / NaN? + jeq fp_e2e_large + move.l (%a0),%d0 + jpl fp_e2e_small | zero / denorm? + | The high bit is set, so normalization is irrelevant. +fp_e2e_checkround: + subq.l #4,%a0 +#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC + move.b (%a0),%d0 + jne fp_e2e_round +#endif + printf PCONV,"%p(",1,%a0 + printx PCONV,%a0@ + printf PCONV,")\n" + moveq #1,%d0 + rts +#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC +fp_e2e_round: + fp_set_sr FPSR_EXC_INEX2 + clr.b (%a0) + move.w (FPD_RND,FPDATA),%d2 + jne fp_e2e_roundother | %d2 == 0, round to nearest + tst.b %d0 | test guard bit + jpl 9f | zero is closer + btst #0,(11,%a0) | test lsb bit + jne fp_e2e_doroundup | round to infinity + lsl.b #1,%d0 | check low bits + jeq 9f | round to zero +fp_e2e_doroundup: + addq.l #1,(8,%a0) + jcc 9f + addq.l #1,(4,%a0) + jcc 9f + move.w #0x8000,(4,%a0) + addq.w #1,(2,%a0) +9: printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts +fp_e2e_roundother: + subq.w #2,%d2 + jcs 9b | %d2 < 2, round to zero + jhi 1f | %d2 > 2, round to +infinity + tst.b (1,%a0) | to -inf + jne fp_e2e_doroundup | negative, round to infinity + jra 9b | positive, round to zero +1: tst.b (1,%a0) | to +inf + jeq fp_e2e_doroundup | positive, round to infinity + jra 9b | negative, round to zero +#endif + | zeros and subnormals: + | try to normalize these anyway. +fp_e2e_small: + jne fp_e2e_small1 | high lword zero? + move.l (4,%a0),%d0 + jne fp_e2e_small2 +#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC + clr.l %d0 + move.b (-4,%a0),%d0 + jne fp_e2e_small3 +#endif + | Genuine zero. + clr.w -(%a0) + subq.l #2,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + moveq #1,%d0 + rts + | definitely subnormal, need to shift all 64 bits +fp_e2e_small1: + bfffo %d0{#0,#32},%d1 + move.w -(%a0),%d2 + sub.w %d1,%d2 + jcc 1f + | Pathologically small, denormalize. + add.w %d2,%d1 + clr.w %d2 +1: move.w %d2,(%a0)+ + move.w %d1,%d2 + jeq fp_e2e_checkround + | fancy 64-bit double-shift begins here + lsl.l %d2,%d0 + move.l %d0,(%a0)+ + move.l (%a0),%d0 + move.l %d0,%d1 + lsl.l %d2,%d0 + move.l %d0,(%a0) + neg.w %d2 + and.w #0x1f,%d2 + lsr.l %d2,%d1 + or.l %d1,-(%a0) +#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC +fp_e2e_extra1: + clr.l %d0 + move.b (-4,%a0),%d0 + neg.w %d2 + add.w #24,%d2 + jcc 1f + clr.b (-4,%a0) + lsl.l %d2,%d0 + or.l %d0,(4,%a0) + jra fp_e2e_checkround +1: addq.w #8,%d2 + lsl.l %d2,%d0 + move.b %d0,(-4,%a0) + lsr.l #8,%d0 + or.l %d0,(4,%a0) +#endif + jra fp_e2e_checkround + | pathologically small subnormal +fp_e2e_small2: + bfffo %d0{#0,#32},%d1 + add.w #32,%d1 + move.w -(%a0),%d2 + sub.w %d1,%d2 + jcc 1f + | Beyond pathologically small, denormalize. + add.w %d2,%d1 + clr.w %d2 +1: move.w %d2,(%a0)+ + ext.l %d1 + jeq fp_e2e_checkround + clr.l (4,%a0) + sub.w #32,%d2 + jcs 1f + lsl.l %d1,%d0 | lower lword needs only to be shifted + move.l %d0,(%a0) | into the higher lword +#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC + clr.l %d0 + move.b (-4,%a0),%d0 + clr.b (-4,%a0) + neg.w %d1 + add.w #32,%d1 + bfins %d0,(%a0){%d1,#8} +#endif + jra fp_e2e_checkround +1: neg.w %d1 | lower lword is splitted between + bfins %d0,(%a0){%d1,#32} | higher and lower lword +#ifndef CONFIG_M68KFPU_EMU_EXTRAPREC + jra fp_e2e_checkround +#else + move.w %d1,%d2 + jra fp_e2e_extra1 + | These are extremely small numbers, that will mostly end up as zero + | anyway, so this is only important for correct rounding. +fp_e2e_small3: + bfffo %d0{#24,#8},%d1 + add.w #40,%d1 + move.w -(%a0),%d2 + sub.w %d1,%d2 + jcc 1f + | Pathologically small, denormalize. + add.w %d2,%d1 + clr.w %d2 +1: move.w %d2,(%a0)+ + ext.l %d1 + jeq fp_e2e_checkround + cmp.w #8,%d1 + jcs 2f +1: clr.b (-4,%a0) + sub.w #64,%d1 + jcs 1f + add.w #24,%d1 + lsl.l %d1,%d0 + move.l %d0,(%a0) + jra fp_e2e_checkround +1: neg.w %d1 + bfins %d0,(%a0){%d1,#8} + jra fp_e2e_checkround +2: lsl.l %d1,%d0 + move.b %d0,(-4,%a0) + lsr.l #8,%d0 + move.b %d0,(7,%a0) + jra fp_e2e_checkround +#endif +1: move.l %d0,%d1 | lower lword is splitted between + lsl.l %d2,%d0 | higher and lower lword + move.l %d0,(%a0) + move.l %d1,%d0 + neg.w %d2 + add.w #32,%d2 + lsr.l %d2,%d0 + move.l %d0,-(%a0) + jra fp_e2e_checkround + | Infinities and NaNs +fp_e2e_large: + move.l (%a0)+,%d0 + jne 3f +1: tst.l (%a0) + jne 4f + moveq #1,%d0 +2: subq.l #8,%a0 + printf PCONV,"%p(",1,%a0 + printx PCONV,%a0@ + printf PCONV,")\n" + rts + | we have maybe a NaN, shift off the highest bit +3: lsl.l #1,%d0 + jeq 1b + | we have a NaN, clear the return value +4: clrl %d0 + jra 2b + + +/* + * Normalization functions. Call these on the output of general + * FP operators, and before any conversion into the destination + * formats. fp_normalize_ext has always to be called first, the + * following conversion functions expect an already normalized + * number. + */ + + | fp_normalize_ext: + | normalize an extended in extended (unpacked) format, basically + | it does the same as fp_conv_ext2ext, additionally it also does + | the necessary postprocessing checks. + | args: %a0 (struct fp_ext *) + | NOTE: it does _not_ modify %a0/%a1 and the upper word of %d2 + +fp_normalize_ext: + printf PNORM,"ne: %p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,"), " + move.l (%a0)+,%d0 + cmp.w #0x7fff,%d0 | Inf / NaN? + jeq fp_ne_large + move.l (%a0),%d0 + jpl fp_ne_small | zero / denorm? + | The high bit is set, so normalization is irrelevant. +fp_ne_checkround: + subq.l #4,%a0 +#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC + move.b (%a0),%d0 + jne fp_ne_round +#endif + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts +#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC +fp_ne_round: + fp_set_sr FPSR_EXC_INEX2 + clr.b (%a0) + move.w (FPD_RND,FPDATA),%d2 + jne fp_ne_roundother | %d2 == 0, round to nearest + tst.b %d0 | test guard bit + jpl 9f | zero is closer + btst #0,(11,%a0) | test lsb bit + jne fp_ne_doroundup | round to infinity + lsl.b #1,%d0 | check low bits + jeq 9f | round to zero +fp_ne_doroundup: + addq.l #1,(8,%a0) + jcc 9f + addq.l #1,(4,%a0) + jcc 9f + addq.w #1,(2,%a0) + move.w #0x8000,(4,%a0) +9: printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts +fp_ne_roundother: + subq.w #2,%d2 + jcs 9b | %d2 < 2, round to zero + jhi 1f | %d2 > 2, round to +infinity + tst.b (1,%a0) | to -inf + jne fp_ne_doroundup | negative, round to infinity + jra 9b | positive, round to zero +1: tst.b (1,%a0) | to +inf + jeq fp_ne_doroundup | positive, round to infinity + jra 9b | negative, round to zero +#endif + | Zeros and subnormal numbers + | These are probably merely subnormal, rather than "denormalized" + | numbers, so we will try to make them normal again. +fp_ne_small: + jne fp_ne_small1 | high lword zero? + move.l (4,%a0),%d0 + jne fp_ne_small2 +#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC + clr.l %d0 + move.b (-4,%a0),%d0 + jne fp_ne_small3 +#endif + | Genuine zero. + clr.w -(%a0) + subq.l #2,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts + | Subnormal. +fp_ne_small1: + bfffo %d0{#0,#32},%d1 + move.w -(%a0),%d2 + sub.w %d1,%d2 + jcc 1f + | Pathologically small, denormalize. + add.w %d2,%d1 + clr.w %d2 + fp_set_sr FPSR_EXC_UNFL +1: move.w %d2,(%a0)+ + move.w %d1,%d2 + jeq fp_ne_checkround + | This is exactly the same 64-bit double shift as seen above. + lsl.l %d2,%d0 + move.l %d0,(%a0)+ + move.l (%a0),%d0 + move.l %d0,%d1 + lsl.l %d2,%d0 + move.l %d0,(%a0) + neg.w %d2 + and.w #0x1f,%d2 + lsr.l %d2,%d1 + or.l %d1,-(%a0) +#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC +fp_ne_extra1: + clr.l %d0 + move.b (-4,%a0),%d0 + neg.w %d2 + add.w #24,%d2 + jcc 1f + clr.b (-4,%a0) + lsl.l %d2,%d0 + or.l %d0,(4,%a0) + jra fp_ne_checkround +1: addq.w #8,%d2 + lsl.l %d2,%d0 + move.b %d0,(-4,%a0) + lsr.l #8,%d0 + or.l %d0,(4,%a0) +#endif + jra fp_ne_checkround + | May or may not be subnormal, if so, only 32 bits to shift. +fp_ne_small2: + bfffo %d0{#0,#32},%d1 + add.w #32,%d1 + move.w -(%a0),%d2 + sub.w %d1,%d2 + jcc 1f + | Beyond pathologically small, denormalize. + add.w %d2,%d1 + clr.w %d2 + fp_set_sr FPSR_EXC_UNFL +1: move.w %d2,(%a0)+ + ext.l %d1 + jeq fp_ne_checkround + clr.l (4,%a0) + sub.w #32,%d1 + jcs 1f + lsl.l %d1,%d0 | lower lword needs only to be shifted + move.l %d0,(%a0) | into the higher lword +#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC + clr.l %d0 + move.b (-4,%a0),%d0 + clr.b (-4,%a0) + neg.w %d1 + add.w #32,%d1 + bfins %d0,(%a0){%d1,#8} +#endif + jra fp_ne_checkround +1: neg.w %d1 | lower lword is splitted between + bfins %d0,(%a0){%d1,#32} | higher and lower lword +#ifndef CONFIG_M68KFPU_EMU_EXTRAPREC + jra fp_ne_checkround +#else + move.w %d1,%d2 + jra fp_ne_extra1 + | These are extremely small numbers, that will mostly end up as zero + | anyway, so this is only important for correct rounding. +fp_ne_small3: + bfffo %d0{#24,#8},%d1 + add.w #40,%d1 + move.w -(%a0),%d2 + sub.w %d1,%d2 + jcc 1f + | Pathologically small, denormalize. + add.w %d2,%d1 + clr.w %d2 +1: move.w %d2,(%a0)+ + ext.l %d1 + jeq fp_ne_checkround + cmp.w #8,%d1 + jcs 2f +1: clr.b (-4,%a0) + sub.w #64,%d1 + jcs 1f + add.w #24,%d1 + lsl.l %d1,%d0 + move.l %d0,(%a0) + jra fp_ne_checkround +1: neg.w %d1 + bfins %d0,(%a0){%d1,#8} + jra fp_ne_checkround +2: lsl.l %d1,%d0 + move.b %d0,(-4,%a0) + lsr.l #8,%d0 + move.b %d0,(7,%a0) + jra fp_ne_checkround +#endif + | Infinities and NaNs, again, same as above. +fp_ne_large: + move.l (%a0)+,%d0 + jne 3f +1: tst.l (%a0) + jne 4f +2: subq.l #8,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts + | we have maybe a NaN, shift off the highest bit +3: move.l %d0,%d1 + lsl.l #1,%d1 + jne 4f + clr.l (-4,%a0) + jra 1b + | we have a NaN, test if it is signaling +4: bset #30,%d0 + jne 2b + fp_set_sr FPSR_EXC_SNAN + move.l %d0,(-4,%a0) + jra 2b + + | these next two do rounding as per the IEEE standard. + | values for the rounding modes appear to be: + | 0: Round to nearest + | 1: Round to zero + | 2: Round to -Infinity + | 3: Round to +Infinity + | both functions expect that fp_normalize was already + | called (and extended argument is already normalized + | as far as possible), these are used if there is different + | rounding precision is selected and before converting + | into single/double + + | fp_normalize_double: + | normalize an extended with double (52-bit) precision + | args: %a0 (struct fp_ext *) + +fp_normalize_double: + printf PNORM,"nd: %p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,"), " + move.l (%a0)+,%d2 + tst.w %d2 + jeq fp_nd_zero | zero / denormalized + cmp.w #0x7fff,%d2 + jeq fp_nd_huge | NaN / infinitive. + sub.w #0x4000-0x3ff,%d2 | will the exponent fit? + jcs fp_nd_small | too small. + cmp.w #0x7fe,%d2 + jcc fp_nd_large | too big. + addq.l #4,%a0 + move.l (%a0),%d0 | low lword of mantissa + | now, round off the low 11 bits. +fp_nd_round: + moveq #21,%d1 + lsl.l %d1,%d0 | keep 11 low bits. + jne fp_nd_checkround | Are they non-zero? + | nothing to do here +9: subq.l #8,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts + | Be careful with the X bit! It contains the lsb + | from the shift above, it is needed for round to nearest. +fp_nd_checkround: + fp_set_sr FPSR_EXC_INEX2 | INEX2 bit + and.w #0xf800,(2,%a0) | clear bits 0-10 + move.w (FPD_RND,FPDATA),%d2 | rounding mode + jne 2f | %d2 == 0, round to nearest + tst.l %d0 | test guard bit + jpl 9b | zero is closer + | here we test the X bit by adding it to %d2 + clr.w %d2 | first set z bit, addx only clears it + addx.w %d2,%d2 | test lsb bit + | IEEE754-specified "round to even" behaviour. If the guard + | bit is set, then the number is odd, so rounding works like + | in grade-school arithmetic (i.e. 1.5 rounds to 2.0) + | Otherwise, an equal distance rounds towards zero, so as not + | to produce an odd number. This is strange, but it is what + | the standard says. + jne fp_nd_doroundup | round to infinity + lsl.l #1,%d0 | check low bits + jeq 9b | round to zero +fp_nd_doroundup: + | round (the mantissa, that is) towards infinity + add.l #0x800,(%a0) + jcc 9b | no overflow, good. + addq.l #1,-(%a0) | extend to high lword + jcc 1f | no overflow, good. + | Yow! we have managed to overflow the mantissa. Since this + | only happens when %d1 was 0xfffff800, it is now zero, so + | reset the high bit, and increment the exponent. + move.w #0x8000,(%a0) + addq.w #1,-(%a0) + cmp.w #0x43ff,(%a0)+ | exponent now overflown? + jeq fp_nd_large | yes, so make it infinity. +1: subq.l #4,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts +2: subq.w #2,%d2 + jcs 9b | %d2 < 2, round to zero + jhi 3f | %d2 > 2, round to +infinity + | Round to +Inf or -Inf. High word of %d2 contains the + | sign of the number, by the way. + swap %d2 | to -inf + tst.b %d2 + jne fp_nd_doroundup | negative, round to infinity + jra 9b | positive, round to zero +3: swap %d2 | to +inf + tst.b %d2 + jeq fp_nd_doroundup | positive, round to infinity + jra 9b | negative, round to zero + | Exponent underflow. Try to make a denormal, and set it to + | the smallest possible fraction if this fails. +fp_nd_small: + fp_set_sr FPSR_EXC_UNFL | set UNFL bit + move.w #0x3c01,(-2,%a0) | 2**-1022 + neg.w %d2 | degree of underflow + cmp.w #32,%d2 | single or double shift? + jcc 1f + | Again, another 64-bit double shift. + move.l (%a0),%d0 + move.l %d0,%d1 + lsr.l %d2,%d0 + move.l %d0,(%a0)+ + move.l (%a0),%d0 + lsr.l %d2,%d0 + neg.w %d2 + add.w #32,%d2 + lsl.l %d2,%d1 + or.l %d1,%d0 + move.l (%a0),%d1 + move.l %d0,(%a0) + | Check to see if we shifted off any significant bits + lsl.l %d2,%d1 + jeq fp_nd_round | Nope, round. + bset #0,%d0 | Yes, so set the "sticky bit". + jra fp_nd_round | Now, round. + | Another 64-bit single shift and store +1: sub.w #32,%d2 + cmp.w #32,%d2 | Do we really need to shift? + jcc 2f | No, the number is too small. + move.l (%a0),%d0 + clr.l (%a0)+ + move.l %d0,%d1 + lsr.l %d2,%d0 + neg.w %d2 + add.w #32,%d2 + | Again, check to see if we shifted off any significant bits. + tst.l (%a0) + jeq 1f + bset #0,%d0 | Sticky bit. +1: move.l %d0,(%a0) + lsl.l %d2,%d1 + jeq fp_nd_round + bset #0,%d0 + jra fp_nd_round + | Sorry, the number is just too small. +2: clr.l (%a0)+ + clr.l (%a0) + moveq #1,%d0 | Smallest possible fraction, + jra fp_nd_round | round as desired. + | zero and denormalized +fp_nd_zero: + tst.l (%a0)+ + jne 1f + tst.l (%a0) + jne 1f + subq.l #8,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts | zero. nothing to do. + | These are not merely subnormal numbers, but true denormals, + | i.e. pathologically small (exponent is 2**-16383) numbers. + | It is clearly impossible for even a normal extended number + | with that exponent to fit into double precision, so just + | write these ones off as "too darn small". +1: fp_set_sr FPSR_EXC_UNFL | Set UNFL bit + clr.l (%a0) + clr.l -(%a0) + move.w #0x3c01,-(%a0) | i.e. 2**-1022 + addq.l #6,%a0 + moveq #1,%d0 + jra fp_nd_round | round. + | Exponent overflow. Just call it infinity. +fp_nd_large: + move.w #0x7ff,%d0 + and.w (6,%a0),%d0 + jeq 1f + fp_set_sr FPSR_EXC_INEX2 +1: fp_set_sr FPSR_EXC_OVFL + move.w (FPD_RND,FPDATA),%d2 + jne 3f | %d2 = 0 round to nearest +1: move.w #0x7fff,(-2,%a0) + clr.l (%a0)+ + clr.l (%a0) +2: subq.l #8,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts +3: subq.w #2,%d2 + jcs 5f | %d2 < 2, round to zero + jhi 4f | %d2 > 2, round to +infinity + tst.b (-3,%a0) | to -inf + jne 1b + jra 5f +4: tst.b (-3,%a0) | to +inf + jeq 1b +5: move.w #0x43fe,(-2,%a0) + moveq #-1,%d0 + move.l %d0,(%a0)+ + move.w #0xf800,%d0 + move.l %d0,(%a0) + jra 2b + | Infinities or NaNs +fp_nd_huge: + subq.l #4,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts + + | fp_normalize_single: + | normalize an extended with single (23-bit) precision + | args: %a0 (struct fp_ext *) + +fp_normalize_single: + printf PNORM,"ns: %p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,") " + addq.l #2,%a0 + move.w (%a0)+,%d2 + jeq fp_ns_zero | zero / denormalized + cmp.w #0x7fff,%d2 + jeq fp_ns_huge | NaN / infinitive. + sub.w #0x4000-0x7f,%d2 | will the exponent fit? + jcs fp_ns_small | too small. + cmp.w #0xfe,%d2 + jcc fp_ns_large | too big. + move.l (%a0)+,%d0 | get high lword of mantissa +fp_ns_round: + tst.l (%a0) | check the low lword + jeq 1f + | Set a sticky bit if it is non-zero. This should only + | affect the rounding in what would otherwise be equal- + | distance situations, which is what we want it to do. + bset #0,%d0 +1: clr.l (%a0) | zap it from memory. + | now, round off the low 8 bits of the hi lword. + tst.b %d0 | 8 low bits. + jne fp_ns_checkround | Are they non-zero? + | nothing to do here + subq.l #8,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts +fp_ns_checkround: + fp_set_sr FPSR_EXC_INEX2 | INEX2 bit + clr.b -(%a0) | clear low byte of high lword + subq.l #3,%a0 + move.w (FPD_RND,FPDATA),%d2 | rounding mode + jne 2f | %d2 == 0, round to nearest + tst.b %d0 | test guard bit + jpl 9f | zero is closer + btst #8,%d0 | test lsb bit + | round to even behaviour, see above. + jne fp_ns_doroundup | round to infinity + lsl.b #1,%d0 | check low bits + jeq 9f | round to zero +fp_ns_doroundup: + | round (the mantissa, that is) towards infinity + add.l #0x100,(%a0) + jcc 9f | no overflow, good. + | Overflow. This means that the %d1 was 0xffffff00, so it + | is now zero. We will set the mantissa to reflect this, and + | increment the exponent (checking for overflow there too) + move.w #0x8000,(%a0) + addq.w #1,-(%a0) + cmp.w #0x407f,(%a0)+ | exponent now overflown? + jeq fp_ns_large | yes, so make it infinity. +9: subq.l #4,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts + | check nondefault rounding modes +2: subq.w #2,%d2 + jcs 9b | %d2 < 2, round to zero + jhi 3f | %d2 > 2, round to +infinity + tst.b (-3,%a0) | to -inf + jne fp_ns_doroundup | negative, round to infinity + jra 9b | positive, round to zero +3: tst.b (-3,%a0) | to +inf + jeq fp_ns_doroundup | positive, round to infinity + jra 9b | negative, round to zero + | Exponent underflow. Try to make a denormal, and set it to + | the smallest possible fraction if this fails. +fp_ns_small: + fp_set_sr FPSR_EXC_UNFL | set UNFL bit + move.w #0x3f81,(-2,%a0) | 2**-126 + neg.w %d2 | degree of underflow + cmp.w #32,%d2 | single or double shift? + jcc 2f + | a 32-bit shift. + move.l (%a0),%d0 + move.l %d0,%d1 + lsr.l %d2,%d0 + move.l %d0,(%a0)+ + | Check to see if we shifted off any significant bits. + neg.w %d2 + add.w #32,%d2 + lsl.l %d2,%d1 + jeq 1f + bset #0,%d0 | Sticky bit. + | Check the lower lword +1: tst.l (%a0) + jeq fp_ns_round + clr (%a0) + bset #0,%d0 | Sticky bit. + jra fp_ns_round + | Sorry, the number is just too small. +2: clr.l (%a0)+ + clr.l (%a0) + moveq #1,%d0 | Smallest possible fraction, + jra fp_ns_round | round as desired. + | Exponent overflow. Just call it infinity. +fp_ns_large: + tst.b (3,%a0) + jeq 1f + fp_set_sr FPSR_EXC_INEX2 +1: fp_set_sr FPSR_EXC_OVFL + move.w (FPD_RND,FPDATA),%d2 + jne 3f | %d2 = 0 round to nearest +1: move.w #0x7fff,(-2,%a0) + clr.l (%a0)+ + clr.l (%a0) +2: subq.l #8,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts +3: subq.w #2,%d2 + jcs 5f | %d2 < 2, round to zero + jhi 4f | %d2 > 2, round to +infinity + tst.b (-3,%a0) | to -inf + jne 1b + jra 5f +4: tst.b (-3,%a0) | to +inf + jeq 1b +5: move.w #0x407e,(-2,%a0) + move.l #0xffffff00,(%a0)+ + clr.l (%a0) + jra 2b + | zero and denormalized +fp_ns_zero: + tst.l (%a0)+ + jne 1f + tst.l (%a0) + jne 1f + subq.l #8,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts | zero. nothing to do. + | These are not merely subnormal numbers, but true denormals, + | i.e. pathologically small (exponent is 2**-16383) numbers. + | It is clearly impossible for even a normal extended number + | with that exponent to fit into single precision, so just + | write these ones off as "too darn small". +1: fp_set_sr FPSR_EXC_UNFL | Set UNFL bit + clr.l (%a0) + clr.l -(%a0) + move.w #0x3f81,-(%a0) | i.e. 2**-126 + addq.l #6,%a0 + moveq #1,%d0 + jra fp_ns_round | round. + | Infinities or NaNs +fp_ns_huge: + subq.l #4,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts + + | fp_normalize_single_fast: + | normalize an extended with single (23-bit) precision + | this is only used by fsgldiv/fsgdlmul, where the + | operand is not completly normalized. + | args: %a0 (struct fp_ext *) + +fp_normalize_single_fast: + printf PNORM,"nsf: %p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,") " + addq.l #2,%a0 + move.w (%a0)+,%d2 + cmp.w #0x7fff,%d2 + jeq fp_nsf_huge | NaN / infinitive. + move.l (%a0)+,%d0 | get high lword of mantissa +fp_nsf_round: + tst.l (%a0) | check the low lword + jeq 1f + | Set a sticky bit if it is non-zero. This should only + | affect the rounding in what would otherwise be equal- + | distance situations, which is what we want it to do. + bset #0,%d0 +1: clr.l (%a0) | zap it from memory. + | now, round off the low 8 bits of the hi lword. + tst.b %d0 | 8 low bits. + jne fp_nsf_checkround | Are they non-zero? + | nothing to do here + subq.l #8,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts +fp_nsf_checkround: + fp_set_sr FPSR_EXC_INEX2 | INEX2 bit + clr.b -(%a0) | clear low byte of high lword + subq.l #3,%a0 + move.w (FPD_RND,FPDATA),%d2 | rounding mode + jne 2f | %d2 == 0, round to nearest + tst.b %d0 | test guard bit + jpl 9f | zero is closer + btst #8,%d0 | test lsb bit + | round to even behaviour, see above. + jne fp_nsf_doroundup | round to infinity + lsl.b #1,%d0 | check low bits + jeq 9f | round to zero +fp_nsf_doroundup: + | round (the mantissa, that is) towards infinity + add.l #0x100,(%a0) + jcc 9f | no overflow, good. + | Overflow. This means that the %d1 was 0xffffff00, so it + | is now zero. We will set the mantissa to reflect this, and + | increment the exponent (checking for overflow there too) + move.w #0x8000,(%a0) + addq.w #1,-(%a0) + cmp.w #0x407f,(%a0)+ | exponent now overflown? + jeq fp_nsf_large | yes, so make it infinity. +9: subq.l #4,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts + | check nondefault rounding modes +2: subq.w #2,%d2 + jcs 9b | %d2 < 2, round to zero + jhi 3f | %d2 > 2, round to +infinity + tst.b (-3,%a0) | to -inf + jne fp_nsf_doroundup | negative, round to infinity + jra 9b | positive, round to zero +3: tst.b (-3,%a0) | to +inf + jeq fp_nsf_doroundup | positive, round to infinity + jra 9b | negative, round to zero + | Exponent overflow. Just call it infinity. +fp_nsf_large: + tst.b (3,%a0) + jeq 1f + fp_set_sr FPSR_EXC_INEX2 +1: fp_set_sr FPSR_EXC_OVFL + move.w (FPD_RND,FPDATA),%d2 + jne 3f | %d2 = 0 round to nearest +1: move.w #0x7fff,(-2,%a0) + clr.l (%a0)+ + clr.l (%a0) +2: subq.l #8,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts +3: subq.w #2,%d2 + jcs 5f | %d2 < 2, round to zero + jhi 4f | %d2 > 2, round to +infinity + tst.b (-3,%a0) | to -inf + jne 1b + jra 5f +4: tst.b (-3,%a0) | to +inf + jeq 1b +5: move.w #0x407e,(-2,%a0) + move.l #0xffffff00,(%a0)+ + clr.l (%a0) + jra 2b + | Infinities or NaNs +fp_nsf_huge: + subq.l #4,%a0 + printf PNORM,"%p(",1,%a0 + printx PNORM,%a0@ + printf PNORM,")\n" + rts + + | conv_ext2int (macro): + | Generates a subroutine that converts an extended value to an + | integer of a given size, again, with the appropriate type of + | rounding. + + | Macro arguments: + | s: size, as given in an assembly instruction. + | b: number of bits in that size. + + | Subroutine arguments: + | %a0: source (struct fp_ext *) + + | Returns the integer in %d0 (like it should) + +.macro conv_ext2int s,b + .set inf,(1<<(\b-1))-1 | i.e. MAXINT + printf PCONV,"e2i%d: %p(",2,#\b,%a0 + printx PCONV,%a0@ + printf PCONV,") " + addq.l #2,%a0 + move.w (%a0)+,%d2 | exponent + jeq fp_e2i_zero\b | zero / denorm (== 0, here) + cmp.w #0x7fff,%d2 + jeq fp_e2i_huge\b | Inf / NaN + sub.w #0x3ffe,%d2 + jcs fp_e2i_small\b + cmp.w #\b,%d2 + jhi fp_e2i_large\b + move.l (%a0),%d0 + move.l %d0,%d1 + lsl.l %d2,%d1 + jne fp_e2i_round\b + tst.l (4,%a0) + jne fp_e2i_round\b + neg.w %d2 + add.w #32,%d2 + lsr.l %d2,%d0 +9: tst.w (-4,%a0) + jne 1f + tst.\s %d0 + jmi fp_e2i_large\b + printf PCONV,"-> %p\n",1,%d0 + rts +1: neg.\s %d0 + jeq 1f + jpl fp_e2i_large\b +1: printf PCONV,"-> %p\n",1,%d0 + rts +fp_e2i_round\b: + fp_set_sr FPSR_EXC_INEX2 | INEX2 bit + neg.w %d2 + add.w #32,%d2 + .if \b>16 + jeq 5f + .endif + lsr.l %d2,%d0 + move.w (FPD_RND,FPDATA),%d2 | rounding mode + jne 2f | %d2 == 0, round to nearest + tst.l %d1 | test guard bit + jpl 9b | zero is closer + btst %d2,%d0 | test lsb bit (%d2 still 0) + jne fp_e2i_doroundup\b + lsl.l #1,%d1 | check low bits + jne fp_e2i_doroundup\b + tst.l (4,%a0) + jeq 9b +fp_e2i_doroundup\b: + addq.l #1,%d0 + jra 9b + | check nondefault rounding modes +2: subq.w #2,%d2 + jcs 9b | %d2 < 2, round to zero + jhi 3f | %d2 > 2, round to +infinity + tst.w (-4,%a0) | to -inf + jne fp_e2i_doroundup\b | negative, round to infinity + jra 9b | positive, round to zero +3: tst.w (-4,%a0) | to +inf + jeq fp_e2i_doroundup\b | positive, round to infinity + jra 9b | negative, round to zero + | we are only want -2**127 get correctly rounded here, + | since the guard bit is in the lower lword. + | everything else ends up anyway as overflow. + .if \b>16 +5: move.w (FPD_RND,FPDATA),%d2 | rounding mode + jne 2b | %d2 == 0, round to nearest + move.l (4,%a0),%d1 | test guard bit + jpl 9b | zero is closer + lsl.l #1,%d1 | check low bits + jne fp_e2i_doroundup\b + jra 9b + .endif +fp_e2i_zero\b: + clr.l %d0 + tst.l (%a0)+ + jne 1f + tst.l (%a0) + jeq 3f +1: subq.l #4,%a0 + fp_clr_sr FPSR_EXC_UNFL | fp_normalize_ext has set this bit +fp_e2i_small\b: + fp_set_sr FPSR_EXC_INEX2 + clr.l %d0 + move.w (FPD_RND,FPDATA),%d2 | rounding mode + subq.w #2,%d2 + jcs 3f | %d2 < 2, round to nearest/zero + jhi 2f | %d2 > 2, round to +infinity + tst.w (-4,%a0) | to -inf + jeq 3f + subq.\s #1,%d0 + jra 3f +2: tst.w (-4,%a0) | to +inf + jne 3f + addq.\s #1,%d0 +3: printf PCONV,"-> %p\n",1,%d0 + rts +fp_e2i_large\b: + fp_set_sr FPSR_EXC_OPERR + move.\s #inf,%d0 + tst.w (-4,%a0) + jeq 1f + addq.\s #1,%d0 +1: printf PCONV,"-> %p\n",1,%d0 + rts +fp_e2i_huge\b: + move.\s (%a0),%d0 + tst.l (%a0) + jne 1f + tst.l (%a0) + jeq fp_e2i_large\b + | fp_normalize_ext has set this bit already + | and made the number nonsignaling +1: fp_tst_sr FPSR_EXC_SNAN + jne 1f + fp_set_sr FPSR_EXC_OPERR +1: printf PCONV,"-> %p\n",1,%d0 + rts +.endm + +fp_conv_ext2long: + conv_ext2int l,32 + +fp_conv_ext2short: + conv_ext2int w,16 + +fp_conv_ext2byte: + conv_ext2int b,8 + +fp_conv_ext2double: + jsr fp_normalize_double + printf PCONV,"e2d: %p(",1,%a0 + printx PCONV,%a0@ + printf PCONV,"), " + move.l (%a0)+,%d2 + cmp.w #0x7fff,%d2 + jne 1f + move.w #0x7ff,%d2 + move.l (%a0)+,%d0 + jra 2f +1: sub.w #0x3fff-0x3ff,%d2 + move.l (%a0)+,%d0 + jmi 2f + clr.w %d2 +2: lsl.w #5,%d2 + lsl.l #7,%d2 + lsl.l #8,%d2 + move.l %d0,%d1 + lsl.l #1,%d0 + lsr.l #4,%d0 + lsr.l #8,%d0 + or.l %d2,%d0 + putuser.l %d0,(%a1)+,fp_err_ua2,%a1 + moveq #21,%d0 + lsl.l %d0,%d1 + move.l (%a0),%d0 + lsr.l #4,%d0 + lsr.l #7,%d0 + or.l %d1,%d0 + putuser.l %d0,(%a1),fp_err_ua2,%a1 +#ifdef FPU_EMU_DEBUG + getuser.l %a1@(-4),%d0,fp_err_ua2,%a1 + getuser.l %a1@(0),%d1,fp_err_ua2,%a1 + printf PCONV,"%p(%08x%08x)\n",3,%a1,%d0,%d1 +#endif + rts + +fp_conv_ext2single: + jsr fp_normalize_single + printf PCONV,"e2s: %p(",1,%a0 + printx PCONV,%a0@ + printf PCONV,"), " + move.l (%a0)+,%d1 + cmp.w #0x7fff,%d1 + jne 1f + move.w #0xff,%d1 + move.l (%a0)+,%d0 + jra 2f +1: sub.w #0x3fff-0x7f,%d1 + move.l (%a0)+,%d0 + jmi 2f + clr.w %d1 +2: lsl.w #8,%d1 + lsl.l #7,%d1 + lsl.l #8,%d1 + bclr #31,%d0 + lsr.l #8,%d0 + or.l %d1,%d0 + printf PCONV,"%08x\n",1,%d0 + rts + + | special return addresses for instr that + | encode the rounding precision in the opcode + | (e.g. fsmove,fdmove) + +fp_finalrounding_single: + addq.l #8,%sp + jsr fp_normalize_ext + jsr fp_normalize_single + jra fp_finaltest + +fp_finalrounding_single_fast: + addq.l #8,%sp + jsr fp_normalize_ext + jsr fp_normalize_single_fast + jra fp_finaltest + +fp_finalrounding_double: + addq.l #8,%sp + jsr fp_normalize_ext + jsr fp_normalize_double + jra fp_finaltest + + | fp_finaltest: + | set the emulated status register based on the outcome of an + | emulated instruction. + +fp_finalrounding: + addq.l #8,%sp +| printf ,"f: %p\n",1,%a0 + jsr fp_normalize_ext + move.w (FPD_PREC,FPDATA),%d0 + subq.w #1,%d0 + jcs fp_finaltest + jne 1f + jsr fp_normalize_single + jra 2f +1: jsr fp_normalize_double +2:| printf ,"f: %p\n",1,%a0 +fp_finaltest: + | First, we do some of the obvious tests for the exception + | status byte and condition code bytes of fp_sr here, so that + | they do not have to be handled individually by every + | emulated instruction. + clr.l %d0 + addq.l #1,%a0 + tst.b (%a0)+ | sign + jeq 1f + bset #FPSR_CC_NEG-24,%d0 | N bit +1: cmp.w #0x7fff,(%a0)+ | exponent + jeq 2f + | test for zero + moveq #FPSR_CC_Z-24,%d1 + tst.l (%a0)+ + jne 9f + tst.l (%a0) + jne 9f + jra 8f + | infinitiv and NAN +2: moveq #FPSR_CC_NAN-24,%d1 + move.l (%a0)+,%d2 + lsl.l #1,%d2 | ignore high bit + jne 8f + tst.l (%a0) + jne 8f + moveq #FPSR_CC_INF-24,%d1 +8: bset %d1,%d0 +9: move.b %d0,(FPD_FPSR+0,FPDATA) | set condition test result + | move instructions enter here + | Here, we test things in the exception status byte, and set + | other things in the accrued exception byte accordingly. + | Emulated instructions can set various things in the former, + | as defined in fp_emu.h. +fp_final: + move.l (FPD_FPSR,FPDATA),%d0 +#if 0 + btst #FPSR_EXC_SNAN,%d0 | EXC_SNAN + jne 1f + btst #FPSR_EXC_OPERR,%d0 | EXC_OPERR + jeq 2f +1: bset #FPSR_AEXC_IOP,%d0 | set IOP bit +2: btst #FPSR_EXC_OVFL,%d0 | EXC_OVFL + jeq 1f + bset #FPSR_AEXC_OVFL,%d0 | set OVFL bit +1: btst #FPSR_EXC_UNFL,%d0 | EXC_UNFL + jeq 1f + btst #FPSR_EXC_INEX2,%d0 | EXC_INEX2 + jeq 1f + bset #FPSR_AEXC_UNFL,%d0 | set UNFL bit +1: btst #FPSR_EXC_DZ,%d0 | EXC_INEX1 + jeq 1f + bset #FPSR_AEXC_DZ,%d0 | set DZ bit +1: btst #FPSR_EXC_OVFL,%d0 | EXC_OVFL + jne 1f + btst #FPSR_EXC_INEX2,%d0 | EXC_INEX2 + jne 1f + btst #FPSR_EXC_INEX1,%d0 | EXC_INEX1 + jeq 2f +1: bset #FPSR_AEXC_INEX,%d0 | set INEX bit +2: move.l %d0,(FPD_FPSR,FPDATA) +#else + | same as above, greatly optimized, but untested (yet) + move.l %d0,%d2 + lsr.l #5,%d0 + move.l %d0,%d1 + lsr.l #4,%d1 + or.l %d0,%d1 + and.b #0x08,%d1 + move.l %d2,%d0 + lsr.l #6,%d0 + or.l %d1,%d0 + move.l %d2,%d1 + lsr.l #4,%d1 + or.b #0xdf,%d1 + and.b %d1,%d0 + move.l %d2,%d1 + lsr.l #7,%d1 + and.b #0x80,%d1 + or.b %d1,%d0 + and.b #0xf8,%d0 + or.b %d0,%d2 + move.l %d2,(FPD_FPSR,FPDATA) +#endif + move.b (FPD_FPSR+2,FPDATA),%d0 + and.b (FPD_FPCR+2,FPDATA),%d0 + jeq 1f + printf ,"send signal!!!\n" +1: jra fp_end diff --git a/arch/m68k/math-emu/multi_arith.h b/arch/m68k/math-emu/multi_arith.h new file mode 100644 index 00000000000..02251e5afd8 --- /dev/null +++ b/arch/m68k/math-emu/multi_arith.h @@ -0,0 +1,819 @@ +/* multi_arith.h: multi-precision integer arithmetic functions, needed + to do extended-precision floating point. + + (c) 1998 David Huggins-Daines. + + Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c) + David Mosberger-Tang. + + You may copy, modify, and redistribute this file under the terms of + the GNU General Public License, version 2, or any later version, at + your convenience. */ + +/* Note: + + These are not general multi-precision math routines. Rather, they + implement the subset of integer arithmetic that we need in order to + multiply, divide, and normalize 128-bit unsigned mantissae. */ + +#ifndef MULTI_ARITH_H +#define MULTI_ARITH_H + +#if 0 /* old code... */ + +/* Unsigned only, because we don't need signs to multiply and divide. */ +typedef unsigned int int128[4]; + +/* Word order */ +enum { + MSW128, + NMSW128, + NLSW128, + LSW128 +}; + +/* big-endian */ +#define LO_WORD(ll) (((unsigned int *) &ll)[1]) +#define HI_WORD(ll) (((unsigned int *) &ll)[0]) + +/* Convenience functions to stuff various integer values into int128s */ + +static inline void zero128(int128 a) +{ + a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0; +} + +/* Human-readable word order in the arguments */ +static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1, + unsigned int i0, int128 a) +{ + a[LSW128] = i0; + a[NLSW128] = i1; + a[NMSW128] = i2; + a[MSW128] = i3; +} + +/* Convenience functions (for testing as well) */ +static inline void int64_to_128(unsigned long long src, int128 dest) +{ + dest[LSW128] = (unsigned int) src; + dest[NLSW128] = src >> 32; + dest[NMSW128] = dest[MSW128] = 0; +} + +static inline void int128_to_64(const int128 src, unsigned long long *dest) +{ + *dest = src[LSW128] | (long long) src[NLSW128] << 32; +} + +static inline void put_i128(const int128 a) +{ + printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128], + a[NLSW128], a[LSW128]); +} + +/* Internal shifters: + + Note that these are only good for 0 < count < 32. + */ + +static inline void _lsl128(unsigned int count, int128 a) +{ + a[MSW128] = (a[MSW128] << count) | (a[NMSW128] >> (32 - count)); + a[NMSW128] = (a[NMSW128] << count) | (a[NLSW128] >> (32 - count)); + a[NLSW128] = (a[NLSW128] << count) | (a[LSW128] >> (32 - count)); + a[LSW128] <<= count; +} + +static inline void _lsr128(unsigned int count, int128 a) +{ + a[LSW128] = (a[LSW128] >> count) | (a[NLSW128] << (32 - count)); + a[NLSW128] = (a[NLSW128] >> count) | (a[NMSW128] << (32 - count)); + a[NMSW128] = (a[NMSW128] >> count) | (a[MSW128] << (32 - count)); + a[MSW128] >>= count; +} + +/* Should be faster, one would hope */ + +static inline void lslone128(int128 a) +{ + asm volatile ("lsl.l #1,%0\n" + "roxl.l #1,%1\n" + "roxl.l #1,%2\n" + "roxl.l #1,%3\n" + : + "=d" (a[LSW128]), + "=d"(a[NLSW128]), + "=d"(a[NMSW128]), + "=d"(a[MSW128]) + : + "0"(a[LSW128]), + "1"(a[NLSW128]), + "2"(a[NMSW128]), + "3"(a[MSW128])); +} + +static inline void lsrone128(int128 a) +{ + asm volatile ("lsr.l #1,%0\n" + "roxr.l #1,%1\n" + "roxr.l #1,%2\n" + "roxr.l #1,%3\n" + : + "=d" (a[MSW128]), + "=d"(a[NMSW128]), + "=d"(a[NLSW128]), + "=d"(a[LSW128]) + : + "0"(a[MSW128]), + "1"(a[NMSW128]), + "2"(a[NLSW128]), + "3"(a[LSW128])); +} + +/* Generalized 128-bit shifters: + + These bit-shift to a multiple of 32, then move whole longwords. */ + +static inline void lsl128(unsigned int count, int128 a) +{ + int wordcount, i; + + if (count % 32) + _lsl128(count % 32, a); + + if (0 == (wordcount = count / 32)) + return; + + /* argh, gak, endian-sensitive */ + for (i = 0; i < 4 - wordcount; i++) { + a[i] = a[i + wordcount]; + } + for (i = 3; i >= 4 - wordcount; --i) { + a[i] = 0; + } +} + +static inline void lsr128(unsigned int count, int128 a) +{ + int wordcount, i; + + if (count % 32) + _lsr128(count % 32, a); + + if (0 == (wordcount = count / 32)) + return; + + for (i = 3; i >= wordcount; --i) { + a[i] = a[i - wordcount]; + } + for (i = 0; i < wordcount; i++) { + a[i] = 0; + } +} + +static inline int orl128(int a, int128 b) +{ + b[LSW128] |= a; +} + +static inline int btsthi128(const int128 a) +{ + return a[MSW128] & 0x80000000; +} + +/* test bits (numbered from 0 = LSB) up to and including "top" */ +static inline int bftestlo128(int top, const int128 a) +{ + int r = 0; + + if (top > 31) + r |= a[LSW128]; + if (top > 63) + r |= a[NLSW128]; + if (top > 95) + r |= a[NMSW128]; + + r |= a[3 - (top / 32)] & ((1 << (top % 32 + 1)) - 1); + + return (r != 0); +} + +/* Aargh. We need these because GCC is broken */ +/* FIXME: do them in assembly, for goodness' sake! */ +static inline void mask64(int pos, unsigned long long *mask) +{ + *mask = 0; + + if (pos < 32) { + LO_WORD(*mask) = (1 << pos) - 1; + return; + } + LO_WORD(*mask) = -1; + HI_WORD(*mask) = (1 << (pos - 32)) - 1; +} + +static inline void bset64(int pos, unsigned long long *dest) +{ + /* This conditional will be optimized away. Thanks, GCC! */ + if (pos < 32) + asm volatile ("bset %1,%0":"=m" + (LO_WORD(*dest)):"id"(pos)); + else + asm volatile ("bset %1,%0":"=m" + (HI_WORD(*dest)):"id"(pos - 32)); +} + +static inline int btst64(int pos, unsigned long long dest) +{ + if (pos < 32) + return (0 != (LO_WORD(dest) & (1 << pos))); + else + return (0 != (HI_WORD(dest) & (1 << (pos - 32)))); +} + +static inline void lsl64(int count, unsigned long long *dest) +{ + if (count < 32) { + HI_WORD(*dest) = (HI_WORD(*dest) << count) + | (LO_WORD(*dest) >> count); + LO_WORD(*dest) <<= count; + return; + } + count -= 32; + HI_WORD(*dest) = LO_WORD(*dest) << count; + LO_WORD(*dest) = 0; +} + +static inline void lsr64(int count, unsigned long long *dest) +{ + if (count < 32) { + LO_WORD(*dest) = (LO_WORD(*dest) >> count) + | (HI_WORD(*dest) << (32 - count)); + HI_WORD(*dest) >>= count; + return; + } + count -= 32; + LO_WORD(*dest) = HI_WORD(*dest) >> count; + HI_WORD(*dest) = 0; +} +#endif + +static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt) +{ + reg->exp += cnt; + + switch (cnt) { + case 0 ... 8: + reg->lowmant = reg->mant.m32[1] << (8 - cnt); + reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) | + (reg->mant.m32[0] << (32 - cnt)); + reg->mant.m32[0] = reg->mant.m32[0] >> cnt; + break; + case 9 ... 32: + reg->lowmant = reg->mant.m32[1] >> (cnt - 8); + if (reg->mant.m32[1] << (40 - cnt)) + reg->lowmant |= 1; + reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) | + (reg->mant.m32[0] << (32 - cnt)); + reg->mant.m32[0] = reg->mant.m32[0] >> cnt; + break; + case 33 ... 39: + asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant) + : "m" (reg->mant.m32[0]), "d" (64 - cnt)); + if (reg->mant.m32[1] << (40 - cnt)) + reg->lowmant |= 1; + reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32); + reg->mant.m32[0] = 0; + break; + case 40 ... 71: + reg->lowmant = reg->mant.m32[0] >> (cnt - 40); + if ((reg->mant.m32[0] << (72 - cnt)) || reg->mant.m32[1]) + reg->lowmant |= 1; + reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32); + reg->mant.m32[0] = 0; + break; + default: + reg->lowmant = reg->mant.m32[0] || reg->mant.m32[1]; + reg->mant.m32[0] = 0; + reg->mant.m32[1] = 0; + break; + } +} + +static inline int fp_overnormalize(struct fp_ext *reg) +{ + int shift; + + if (reg->mant.m32[0]) { + asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[0])); + reg->mant.m32[0] = (reg->mant.m32[0] << shift) | (reg->mant.m32[1] >> (32 - shift)); + reg->mant.m32[1] = (reg->mant.m32[1] << shift); + } else { + asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[1])); + reg->mant.m32[0] = (reg->mant.m32[1] << shift); + reg->mant.m32[1] = 0; + shift += 32; + } + + return shift; +} + +static inline int fp_addmant(struct fp_ext *dest, struct fp_ext *src) +{ + int carry; + + /* we assume here, gcc only insert move and a clr instr */ + asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant) + : "g,d" (src->lowmant), "0,0" (dest->lowmant)); + asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1]) + : "d" (src->mant.m32[1]), "0" (dest->mant.m32[1])); + asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0]) + : "d" (src->mant.m32[0]), "0" (dest->mant.m32[0])); + asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0)); + + return carry; +} + +static inline int fp_addcarry(struct fp_ext *reg) +{ + if (++reg->exp == 0x7fff) { + if (reg->mant.m64) + fp_set_sr(FPSR_EXC_INEX2); + reg->mant.m64 = 0; + fp_set_sr(FPSR_EXC_OVFL); + return 0; + } + reg->lowmant = (reg->mant.m32[1] << 7) | (reg->lowmant ? 1 : 0); + reg->mant.m32[1] = (reg->mant.m32[1] >> 1) | + (reg->mant.m32[0] << 31); + reg->mant.m32[0] = (reg->mant.m32[0] >> 1) | 0x80000000; + + return 1; +} + +static inline void fp_submant(struct fp_ext *dest, struct fp_ext *src1, + struct fp_ext *src2) +{ + /* we assume here, gcc only insert move and a clr instr */ + asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant) + : "g,d" (src2->lowmant), "0,0" (src1->lowmant)); + asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1]) + : "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1])); + asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0]) + : "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0])); +} + +#define fp_mul64(desth, destl, src1, src2) ({ \ + asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth) \ + : "g" (src1), "0" (src2)); \ +}) +#define fp_div64(quot, rem, srch, srcl, div) \ + asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem) \ + : "dm" (div), "1" (srch), "0" (srcl)) +#define fp_add64(dest1, dest2, src1, src2) ({ \ + asm ("add.l %1,%0" : "=d,dm" (dest2) \ + : "dm,d" (src2), "0,0" (dest2)); \ + asm ("addx.l %1,%0" : "=d" (dest1) \ + : "d" (src1), "0" (dest1)); \ +}) +#define fp_addx96(dest, src) ({ \ + /* we assume here, gcc only insert move and a clr instr */ \ + asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2]) \ + : "g,d" (temp.m32[1]), "0,0" (dest->m32[2])); \ + asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1]) \ + : "d" (temp.m32[0]), "0" (dest->m32[1])); \ + asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0]) \ + : "d" (0), "0" (dest->m32[0])); \ +}) +#define fp_sub64(dest, src) ({ \ + asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1]) \ + : "dm,d" (src.m32[1]), "0,0" (dest.m32[1])); \ + asm ("subx.l %1,%0" : "=d" (dest.m32[0]) \ + : "d" (src.m32[0]), "0" (dest.m32[0])); \ +}) +#define fp_sub96c(dest, srch, srcm, srcl) ({ \ + char carry; \ + asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2]) \ + : "dm,d" (srcl), "0,0" (dest.m32[2])); \ + asm ("subx.l %1,%0" : "=d" (dest.m32[1]) \ + : "d" (srcm), "0" (dest.m32[1])); \ + asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0]) \ + : "d" (srch), "1" (dest.m32[0])); \ + carry; \ +}) + +static inline void fp_multiplymant(union fp_mant128 *dest, struct fp_ext *src1, + struct fp_ext *src2) +{ + union fp_mant64 temp; + + fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]); + fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]); + + fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]); + fp_addx96(dest, temp); + + fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]); + fp_addx96(dest, temp); +} + +static inline void fp_dividemant(union fp_mant128 *dest, struct fp_ext *src, + struct fp_ext *div) +{ + union fp_mant128 tmp; + union fp_mant64 tmp64; + unsigned long *mantp = dest->m32; + unsigned long fix, rem, first, dummy; + int i; + + /* the algorithm below requires dest to be smaller than div, + but both have the high bit set */ + if (src->mant.m64 >= div->mant.m64) { + fp_sub64(src->mant, div->mant); + *mantp = 1; + } else + *mantp = 0; + mantp++; + + /* basic idea behind this algorithm: we can't divide two 64bit numbers + (AB/CD) directly, but we can calculate AB/C0, but this means this + quotient is off by C0/CD, so we have to multiply the first result + to fix the result, after that we have nearly the correct result + and only a few corrections are needed. */ + + /* C0/CD can be precalculated, but it's an 64bit division again, but + we can make it a bit easier, by dividing first through C so we get + 10/1D and now only a single shift and the value fits into 32bit. */ + fix = 0x80000000; + dummy = div->mant.m32[1] / div->mant.m32[0] + 1; + dummy = (dummy >> 1) | fix; + fp_div64(fix, dummy, fix, 0, dummy); + fix--; + + for (i = 0; i < 3; i++, mantp++) { + if (src->mant.m32[0] == div->mant.m32[0]) { + fp_div64(first, rem, 0, src->mant.m32[1], div->mant.m32[0]); + + fp_mul64(*mantp, dummy, first, fix); + *mantp += fix; + } else { + fp_div64(first, rem, src->mant.m32[0], src->mant.m32[1], div->mant.m32[0]); + + fp_mul64(*mantp, dummy, first, fix); + } + + fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0], first - *mantp); + fp_add64(tmp.m32[0], tmp.m32[1], 0, rem); + tmp.m32[2] = 0; + + fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp, div->mant.m32[1]); + fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]); + + src->mant.m32[0] = tmp.m32[1]; + src->mant.m32[1] = tmp.m32[2]; + + while (!fp_sub96c(tmp, 0, div->mant.m32[0], div->mant.m32[1])) { + src->mant.m32[0] = tmp.m32[1]; + src->mant.m32[1] = tmp.m32[2]; + *mantp += 1; + } + } +} + +#if 0 +static inline unsigned int fp_fls128(union fp_mant128 *src) +{ + unsigned long data; + unsigned int res, off; + + if ((data = src->m32[0])) + off = 0; + else if ((data = src->m32[1])) + off = 32; + else if ((data = src->m32[2])) + off = 64; + else if ((data = src->m32[3])) + off = 96; + else + return 128; + + asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data)); + return res + off; +} + +static inline void fp_shiftmant128(union fp_mant128 *src, int shift) +{ + unsigned long sticky; + + switch (shift) { + case 0: + return; + case 1: + asm volatile ("lsl.l #1,%0" + : "=d" (src->m32[3]) : "0" (src->m32[3])); + asm volatile ("roxl.l #1,%0" + : "=d" (src->m32[2]) : "0" (src->m32[2])); + asm volatile ("roxl.l #1,%0" + : "=d" (src->m32[1]) : "0" (src->m32[1])); + asm volatile ("roxl.l #1,%0" + : "=d" (src->m32[0]) : "0" (src->m32[0])); + return; + case 2 ... 31: + src->m32[0] = (src->m32[0] << shift) | (src->m32[1] >> (32 - shift)); + src->m32[1] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift)); + src->m32[2] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift)); + src->m32[3] = (src->m32[3] << shift); + return; + case 32 ... 63: + shift -= 32; + src->m32[0] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift)); + src->m32[1] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift)); + src->m32[2] = (src->m32[3] << shift); + src->m32[3] = 0; + return; + case 64 ... 95: + shift -= 64; + src->m32[0] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift)); + src->m32[1] = (src->m32[3] << shift); + src->m32[2] = src->m32[3] = 0; + return; + case 96 ... 127: + shift -= 96; + src->m32[0] = (src->m32[3] << shift); + src->m32[1] = src->m32[2] = src->m32[3] = 0; + return; + case -31 ... -1: + shift = -shift; + sticky = 0; + if (src->m32[3] << (32 - shift)) + sticky = 1; + src->m32[3] = (src->m32[3] >> shift) | (src->m32[2] << (32 - shift)) | sticky; + src->m32[2] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)); + src->m32[1] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)); + src->m32[0] = (src->m32[0] >> shift); + return; + case -63 ... -32: + shift = -shift - 32; + sticky = 0; + if ((src->m32[2] << (32 - shift)) || src->m32[3]) + sticky = 1; + src->m32[3] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)) | sticky; + src->m32[2] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)); + src->m32[1] = (src->m32[0] >> shift); + src->m32[0] = 0; + return; + case -95 ... -64: + shift = -shift - 64; + sticky = 0; + if ((src->m32[1] << (32 - shift)) || src->m32[2] || src->m32[3]) + sticky = 1; + src->m32[3] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)) | sticky; + src->m32[2] = (src->m32[0] >> shift); + src->m32[1] = src->m32[0] = 0; + return; + case -127 ... -96: + shift = -shift - 96; + sticky = 0; + if ((src->m32[0] << (32 - shift)) || src->m32[1] || src->m32[2] || src->m32[3]) + sticky = 1; + src->m32[3] = (src->m32[0] >> shift) | sticky; + src->m32[2] = src->m32[1] = src->m32[0] = 0; + return; + } + + if (shift < 0 && (src->m32[0] || src->m32[1] || src->m32[2] || src->m32[3])) + src->m32[3] = 1; + else + src->m32[3] = 0; + src->m32[2] = 0; + src->m32[1] = 0; + src->m32[0] = 0; +} +#endif + +static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src, + int shift) +{ + unsigned long tmp; + + switch (shift) { + case 0: + dest->mant.m64 = src->m64[0]; + dest->lowmant = src->m32[2] >> 24; + if (src->m32[3] || (src->m32[2] << 8)) + dest->lowmant |= 1; + break; + case 1: + asm volatile ("lsl.l #1,%0" + : "=d" (tmp) : "0" (src->m32[2])); + asm volatile ("roxl.l #1,%0" + : "=d" (dest->mant.m32[1]) : "0" (src->m32[1])); + asm volatile ("roxl.l #1,%0" + : "=d" (dest->mant.m32[0]) : "0" (src->m32[0])); + dest->lowmant = tmp >> 24; + if (src->m32[3] || (tmp << 8)) + dest->lowmant |= 1; + break; + case 31: + asm volatile ("lsr.l #1,%1; roxr.l #1,%0" + : "=d" (dest->mant.m32[0]) + : "d" (src->m32[0]), "0" (src->m32[1])); + asm volatile ("roxr.l #1,%0" + : "=d" (dest->mant.m32[1]) : "0" (src->m32[2])); + asm volatile ("roxr.l #1,%0" + : "=d" (tmp) : "0" (src->m32[3])); + dest->lowmant = tmp >> 24; + if (src->m32[3] << 7) + dest->lowmant |= 1; + break; + case 32: + dest->mant.m32[0] = src->m32[1]; + dest->mant.m32[1] = src->m32[2]; + dest->lowmant = src->m32[3] >> 24; + if (src->m32[3] << 8) + dest->lowmant |= 1; + break; + } +} + +#if 0 /* old code... */ +static inline int fls(unsigned int a) +{ + int r; + + asm volatile ("bfffo %1{#0,#32},%0" + : "=d" (r) : "md" (a)); + return r; +} + +/* fls = "find last set" (cf. ffs(3)) */ +static inline int fls128(const int128 a) +{ + if (a[MSW128]) + return fls(a[MSW128]); + if (a[NMSW128]) + return fls(a[NMSW128]) + 32; + /* XXX: it probably never gets beyond this point in actual + use, but that's indicative of a more general problem in the + algorithm (i.e. as per the actual 68881 implementation, we + really only need at most 67 bits of precision [plus + overflow]) so I'm not going to fix it. */ + if (a[NLSW128]) + return fls(a[NLSW128]) + 64; + if (a[LSW128]) + return fls(a[LSW128]) + 96; + else + return -1; +} + +static inline int zerop128(const int128 a) +{ + return !(a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]); +} + +static inline int nonzerop128(const int128 a) +{ + return (a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]); +} + +/* Addition and subtraction */ +/* Do these in "pure" assembly, because "extended" asm is unmanageable + here */ +static inline void add128(const int128 a, int128 b) +{ + /* rotating carry flags */ + unsigned int carry[2]; + + carry[0] = a[LSW128] > (0xffffffff - b[LSW128]); + b[LSW128] += a[LSW128]; + + carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]); + b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0]; + + carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]); + b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1]; + + b[MSW128] = a[MSW128] + b[MSW128] + carry[0]; +} + +/* Note: assembler semantics: "b -= a" */ +static inline void sub128(const int128 a, int128 b) +{ + /* rotating borrow flags */ + unsigned int borrow[2]; + + borrow[0] = b[LSW128] < a[LSW128]; + b[LSW128] -= a[LSW128]; + + borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0]; + b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0]; + + borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1]; + b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1]; + + b[MSW128] = b[MSW128] - a[MSW128] - borrow[0]; +} + +/* Poor man's 64-bit expanding multiply */ +static inline void mul64(unsigned long long a, unsigned long long b, int128 c) +{ + unsigned long long acc; + int128 acc128; + + zero128(acc128); + zero128(c); + + /* first the low words */ + if (LO_WORD(a) && LO_WORD(b)) { + acc = (long long) LO_WORD(a) * LO_WORD(b); + c[NLSW128] = HI_WORD(acc); + c[LSW128] = LO_WORD(acc); + } + /* Next the high words */ + if (HI_WORD(a) && HI_WORD(b)) { + acc = (long long) HI_WORD(a) * HI_WORD(b); + c[MSW128] = HI_WORD(acc); + c[NMSW128] = LO_WORD(acc); + } + /* The middle words */ + if (LO_WORD(a) && HI_WORD(b)) { + acc = (long long) LO_WORD(a) * HI_WORD(b); + acc128[NMSW128] = HI_WORD(acc); + acc128[NLSW128] = LO_WORD(acc); + add128(acc128, c); + } + /* The first and last words */ + if (HI_WORD(a) && LO_WORD(b)) { + acc = (long long) HI_WORD(a) * LO_WORD(b); + acc128[NMSW128] = HI_WORD(acc); + acc128[NLSW128] = LO_WORD(acc); + add128(acc128, c); + } +} + +/* Note: unsigned */ +static inline int cmp128(int128 a, int128 b) +{ + if (a[MSW128] < b[MSW128]) + return -1; + if (a[MSW128] > b[MSW128]) + return 1; + if (a[NMSW128] < b[NMSW128]) + return -1; + if (a[NMSW128] > b[NMSW128]) + return 1; + if (a[NLSW128] < b[NLSW128]) + return -1; + if (a[NLSW128] > b[NLSW128]) + return 1; + + return (signed) a[LSW128] - b[LSW128]; +} + +inline void div128(int128 a, int128 b, int128 c) +{ + int128 mask; + + /* Algorithm: + + Shift the divisor until it's at least as big as the + dividend, keeping track of the position to which we've + shifted it, i.e. the power of 2 which we've multiplied it + by. + + Then, for this power of 2 (the mask), and every one smaller + than it, subtract the mask from the dividend and add it to + the quotient until the dividend is smaller than the raised + divisor. At this point, divide the dividend and the mask + by 2 (i.e. shift one place to the right). Lather, rinse, + and repeat, until there are no more powers of 2 left. */ + + /* FIXME: needless to say, there's room for improvement here too. */ + + /* Shift up */ + /* XXX: since it just has to be "at least as big", we can + probably eliminate this horribly wasteful loop. I will + have to prove this first, though */ + set128(0, 0, 0, 1, mask); + while (cmp128(b, a) < 0 && !btsthi128(b)) { + lslone128(b); + lslone128(mask); + } + + /* Shift down */ + zero128(c); + do { + if (cmp128(a, b) >= 0) { + sub128(b, a); + add128(mask, c); + } + lsrone128(mask); + lsrone128(b); + } while (nonzerop128(mask)); + + /* The remainder is in a... */ +} +#endif + +#endif /* MULTI_ARITH_H */ -- cgit v1.2.3