dolphin-emulator/Source/Core/Common/FloatUtils.h

// Copyright 2018 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#pragma once

#include <array>
#include <bit>
#include <limits>

#include "Common/CommonTypes.h"
#include "Core/PowerPC/Gekko.h"

namespace Common
{
// Returns the value std::numeric_limits reports as the signaling NaN of type T.
template <typename T>
constexpr T SNANConstant()
{
  using Limits = std::numeric_limits<T>;
  return Limits::signaling_NaN();
}

// Bit masks and field widths for the raw 64-bit (double) and 32-bit (float) patterns
// that the helpers in this file operate on. In both layouts the top bit is the sign,
// followed by the exponent field, followed by the fraction field.

// The most significant bit of the fraction is an is-quiet bit on all architectures we care about.
static constexpr u64 DOUBLE_QBIT = 0x0008000000000000ULL;  // bit 51 (top fraction bit)
static constexpr u64 DOUBLE_SIGN = 0x8000000000000000ULL;  // bit 63
static constexpr u64 DOUBLE_EXP = 0x7FF0000000000000ULL;   // bits 52-62
static constexpr u64 DOUBLE_FRAC = 0x000FFFFFFFFFFFFFULL;  // bits 0-51
static constexpr u64 DOUBLE_ZERO = 0x0000000000000000ULL;  // all bits clear
static constexpr int DOUBLE_EXP_WIDTH = 11;                // number of bits set in DOUBLE_EXP
static constexpr int DOUBLE_FRAC_WIDTH = 52;               // number of bits set in DOUBLE_FRAC

static constexpr u32 FLOAT_SIGN = 0x80000000;  // bit 31
static constexpr u32 FLOAT_EXP = 0x7F800000;   // bits 23-30
static constexpr u32 FLOAT_FRAC = 0x007FFFFF;  // bits 0-22
static constexpr u32 FLOAT_ZERO = 0x00000000;  // all bits clear
static constexpr int FLOAT_EXP_WIDTH = 8;      // number of bits set in FLOAT_EXP
static constexpr int FLOAT_FRAC_WIDTH = 23;    // number of bits set in FLOAT_FRAC

inline bool IsQNAN(double d)
{
  const u64 i = std::bit_cast<u64>(d);
  return ((i & DOUBLE_EXP) == DOUBLE_EXP) && ((i & DOUBLE_QBIT) == DOUBLE_QBIT);
}

inline bool IsSNAN(double d)
{
  const u64 i = std::bit_cast<u64>(d);
  return ((i & DOUBLE_EXP) == DOUBLE_EXP) && ((i & DOUBLE_FRAC) != DOUBLE_ZERO) &&
         ((i & DOUBLE_QBIT) == DOUBLE_ZERO);
}

inline float FlushToZero(float f)
{
  u32 i = std::bit_cast<u32>(f);
  if ((i & FLOAT_EXP) == 0)
  {
    // Turn into signed zero
    i &= FLOAT_SIGN;
  }
  return std::bit_cast<float>(i);
}

inline double FlushToZero(double d)
{
  u64 i = std::bit_cast<u64>(d);
  if ((i & DOUBLE_EXP) == 0)
  {
    // Turn into signed zero
    i &= DOUBLE_SIGN;
  }
  return std::bit_cast<double>(i);
}

// Floating-point class codes. The names combine a sign (N = negative, P = positive)
// with a category (INF, N = normalized, D = denormalized, Z = zero), plus QNAN.
// NOTE(review): the values line up with the 5-bit FPRF result-flags encoding of the
// PowerPC FPSCR - confirm against ClassifyDouble/ClassifyFloat and their callers.
enum PPCFpClass
{
  PPC_FPCLASS_QNAN = 0x11,  // quiet NaN
  PPC_FPCLASS_NINF = 0x09,  // negative infinity
  PPC_FPCLASS_NN = 0x08,    // negative normalized
  PPC_FPCLASS_ND = 0x18,    // negative denormalized
  PPC_FPCLASS_NZ = 0x12,    // negative zero
  PPC_FPCLASS_PZ = 0x02,    // positive zero
  PPC_FPCLASS_PD = 0x14,    // positive denormalized
  PPC_FPCLASS_PN = 0x04,    // positive normalized
  PPC_FPCLASS_PINF = 0x05,  // positive infinity
};

// Uses PowerPC conventions for the return value, so it can be easily
// used directly in CPU emulation.
// Only declared here; the definitions live outside this header.
// NOTE(review): the returned u32 is presumably one of the PPCFpClass values -
// confirm against the definitions.
u32 ClassifyDouble(double dvalue);
u32 ClassifyFloat(float fvalue);

// Element type of the two 32-entry tables declared below.
// NOTE(review): m_base / m_dec presumably hold a base value and a decrement applied
// per step within a table segment - confirm against the code that reads the tables.
struct BaseAndDec
{
  int m_base;
  int m_dec;
};
// Tables defined outside this header.
// NOTE(review): judging by the names, these back the frsqrte and fres estimate
// instructions respectively - confirm against their users.
extern const std::array<BaseAndDec, 32> frsqrte_expected;
extern const std::array<BaseAndDec, 32> fres_expected;

// PowerPC approximation algorithms
// Only declared here; the definitions live outside this header.
// ApproximateReciprocal additionally receives the FPSCR register state by const reference.
// NOTE(review): presumably so the estimate can depend on FPSCR settings - confirm
// against the definition.
double ApproximateReciprocalSquareRoot(double val);
double ApproximateReciprocal(const UReg_FPSCR& fpscr, double val);

// Instructions which move data without performing operations round a bit weirdly.
// Specifically, they round the mantissa to be like that of a 32-bit float,
// going as far as to honor the rounding mode, but never actually care about
// making sure the exponent fits the 32-bit format.
// Either this, or they'll truncate the mantissa down, which will always happen to
// PS1 OR PS0 in ps_rsqrte.
inline u64 TruncateMantissaBits(u64 bits)
{
  // Takes the raw bit pattern of a double and returns it with the low fraction bits
  // cleared. Sign, exponent and the remaining high fraction bits are unchanged, and
  // no rounding takes place.
  //
  // Truncation can be done by simply cutting off the mantissa bits that don't
  // exist in a single precision float
  constexpr u64 remove_bits = Common::DOUBLE_FRAC_WIDTH - Common::FLOAT_FRAC_WIDTH;
  static_assert(remove_bits < 64, "shift count must be smaller than the width of u64");

  // Shift a 64-bit one so the mask is computed in the type it is used in. A plain
  // `1 << remove_bits` is evaluated as int and only widened afterwards.
  constexpr u64 remove_mask = (u64{1} << remove_bits) - 1;
  return bits & ~remove_mask;
}

// Double-typed convenience wrapper around TruncateMantissaBits: reinterprets the
// value as its bit pattern, truncates it, and reinterprets the result as a double.
inline double TruncateMantissa(double value)
{
  return std::bit_cast<double>(TruncateMantissaBits(std::bit_cast<u64>(value)));
}

// Rounds only the fraction of a double's bit pattern to single-precision width,
// leaving the exponent at double range. The input is treated as finite; no check
// for an all-ones exponent is made here.
inline u64 RoundMantissaBitsFinite(u64 bits)
{
  // Stand-in exponent used while rounding, chosen so that the double -> float
  // conversion below doesn't run into trouble because of the exponent.
  constexpr u64 stand_in_exp = 0x4000000000000000ull;
  constexpr u64 sign_and_frac_mask = Common::DOUBLE_SIGN | Common::DOUBLE_FRAC;

  const u64 source_exp = bits & Common::DOUBLE_EXP;

  // Swap in the stand-in exponent, let the double -> float conversion do the
  // rounding of the fraction, then widen back to a double.
  const double proxy = std::bit_cast<double>((bits & sign_and_frac_mask) | stand_in_exp);
  const float narrowed = static_cast<float>(proxy);
  const u64 widened_bits = std::bit_cast<u64>(static_cast<double>(narrowed));

  if (source_exp != 0)
  {
    // Undo the stand-in exponent arithmetically rather than with a mask: if rounding
    // carried out of the fraction into the exponent, that carry has to survive and
    // end up in the restored exponent.
    return (widened_bits - stand_in_exp) + source_exp;
  }

  // The exponent isn't incremented for double subnormals, so the whole exponent
  // field (including any carry from rounding) is simply cleared.
  return widened_bits & ~Common::DOUBLE_EXP;
}

// Reduces the fraction of a double's bit pattern to single-precision width:
// finite values are rounded, infinities and NaNs are simply truncated.
inline u64 RoundMantissaBits(u64 bits)
{
  // An all-ones exponent field marks an infinity or a NaN.
  const bool is_finite = (bits & Common::DOUBLE_EXP) != Common::DOUBLE_EXP;
  return is_finite ? RoundMantissaBitsFinite(bits) : TruncateMantissaBits(bits);
}

// Double-typed wrapper around RoundMantissaBitsFinite.
// This function is only ever used by ps_sum1, because for some reason it assumes
// that ps0 should be rounded as a finite value rather than checking whether it
// might be infinite.
inline double RoundMantissaFinite(double value)
{
  return std::bit_cast<double>(RoundMantissaBitsFinite(std::bit_cast<u64>(value)));
}

// Double-typed wrapper around RoundMantissaBits. The work is done on the bit
// pattern either way, so this only converts to bits and back again.
inline double RoundMantissa(double value)
{
  return std::bit_cast<double>(RoundMantissaBits(std::bit_cast<u64>(value)));
}

}  // namespace Common
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00			`// Copyright 2018 Dolphin Emulator Project`
treewide: convert GPLv2+ license info to SPDX tags SPDX standardizes how source code conveys its copyright and licensing information. See https://spdx.github.io/spdx-spec/1-rationale/ . SPDX tags are adopted in many large projects, including things like the Linux kernel. 2021-07-05 03:22:19 +02:00			`// SPDX-License-Identifier: GPL-2.0-or-later`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00
			`#pragma once`

			`#include <array>`
Replace Common::BitCast with std::bit_cast 2022-08-05 21:10:17 -07:00			`#include <bit>`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00			`#include <limits>`

			`#include "Common/CommonTypes.h"`
Fix To Match Hwtests Also changes >= 0 to > 0.0 This technically leads to fewer branches taken ^^; More importantly it looks/feels nicer to me Fixes the approximate reciprocal function - Currently not optimized - Considering rewrite for cleanliness Moves PS rounding to FloatUtils - Done because it's used in more places now Changes TruncateMantissa to occur on read - This is to account for reciprocal cases Adds PS1 getting function for reciprocals Fixes ps_sum1 edge case with rounding TODO: Test what ops can set PS1 edge case - ps_merge is known to be able to 2024-10-02 01:09:23 -05:00			`#include "Core/PowerPC/Gekko.h"`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00
			`namespace Common`
			`{`
			`template <typename T>`
			`constexpr T SNANConstant()`
			`{`
			`return std::numeric_limits<T>::signaling_NaN();`
			`}`

			`// The most significant bit of the fraction is an is-quiet bit on all architectures we care about.`
JitArm64: Resolve deprecated enum conversion warning Resolve warning caused by using values from two different enums in a conditional expression which was deprecated in c++20. The warning in question is clang -Wdeprecated-anon-enum-enum-conversion and gcc -Wenum-compare. 2023-08-23 10:36:52 -07:00			`static constexpr u64 DOUBLE_QBIT = 0x0008000000000000ULL;`
			`static constexpr u64 DOUBLE_SIGN = 0x8000000000000000ULL;`
			`static constexpr u64 DOUBLE_EXP = 0x7FF0000000000000ULL;`
			`static constexpr u64 DOUBLE_FRAC = 0x000FFFFFFFFFFFFFULL;`
			`static constexpr u64 DOUBLE_ZERO = 0x0000000000000000ULL;`
JitArm64: Use LSL+CLS for classifying floats This is a little trick I came up with that lets us restructure our float classification code so we can exit earlier when the float is normal, which is the case more often than not. First we shift left by 1 to get rid of the sign bit, and then we count the number of leading sign bits. If the result is less than 10 (for doubles) or 7 (for floats), the float is normal. This is because, if the float isn't normal, the exponent is either all zeroes or all ones. 2023-10-13 19:27:03 +02:00			`static constexpr int DOUBLE_EXP_WIDTH = 11;`
			`static constexpr int DOUBLE_FRAC_WIDTH = 52;`
JitArm64: Resolve deprecated enum conversion warning Resolve warning caused by using values from two different enums in a conditional expression which was deprecated in c++20. The warning in question is clang -Wdeprecated-anon-enum-enum-conversion and gcc -Wenum-compare. 2023-08-23 10:36:52 -07:00
			`static constexpr u32 FLOAT_SIGN = 0x80000000;`
			`static constexpr u32 FLOAT_EXP = 0x7F800000;`
			`static constexpr u32 FLOAT_FRAC = 0x007FFFFF;`
			`static constexpr u32 FLOAT_ZERO = 0x00000000;`
JitArm64: Use LSL+CLS for classifying floats This is a little trick I came up with that lets us restructure our float classification code so we can exit earlier when the float is normal, which is the case more often than not. First we shift left by 1 to get rid of the sign bit, and then we count the number of leading sign bits. If the result is less than 10 (for doubles) or 7 (for floats), the float is normal. This is because, if the float isn't normal, the exponent is either all zeroes or all ones. 2023-10-13 19:27:03 +02:00			`static constexpr int FLOAT_EXP_WIDTH = 8;`
			`static constexpr int FLOAT_FRAC_WIDTH = 23;`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00
			`inline bool IsQNAN(double d)`
			`{`
Replace Common::BitCast with std::bit_cast 2022-08-05 21:10:17 -07:00			`const u64 i = std::bit_cast<u64>(d);`
FloatUtils: Remove IntDouble and IntFloat Type punning via unions in C++ invokes undefined behavior. Instead, leverage BitCast, our variant of C++2a's std::bit_cast 2018-05-10 10:35:00 -04:00			`return ((i & DOUBLE_EXP) == DOUBLE_EXP) && ((i & DOUBLE_QBIT) == DOUBLE_QBIT);`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00			`}`

			`inline bool IsSNAN(double d)`
			`{`
Replace Common::BitCast with std::bit_cast 2022-08-05 21:10:17 -07:00			`const u64 i = std::bit_cast<u64>(d);`
FloatUtils: Remove IntDouble and IntFloat Type punning via unions in C++ invokes undefined behavior. Instead, leverage BitCast, our variant of C++2a's std::bit_cast 2018-05-10 10:35:00 -04:00			`return ((i & DOUBLE_EXP) == DOUBLE_EXP) && ((i & DOUBLE_FRAC) != DOUBLE_ZERO) &&`
			`((i & DOUBLE_QBIT) == DOUBLE_ZERO);`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00			`}`

			`inline float FlushToZero(float f)`
			`{`
Replace Common::BitCast with std::bit_cast 2022-08-05 21:10:17 -07:00			`u32 i = std::bit_cast<u32>(f);`
FloatUtils: Remove IntDouble and IntFloat Type punning via unions in C++ invokes undefined behavior. Instead, leverage BitCast, our variant of C++2a's std::bit_cast 2018-05-10 10:35:00 -04:00			`if ((i & FLOAT_EXP) == 0)`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00			`{`
FloatUtils: Remove IntDouble and IntFloat Type punning via unions in C++ invokes undefined behavior. Instead, leverage BitCast, our variant of C++2a's std::bit_cast 2018-05-10 10:35:00 -04:00			`// Turn into signed zero`
			`i &= FLOAT_SIGN;`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00			`}`
Replace Common::BitCast with std::bit_cast 2022-08-05 21:10:17 -07:00			`return std::bit_cast<float>(i);`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00			`}`

			`inline double FlushToZero(double d)`
			`{`
Replace Common::BitCast with std::bit_cast 2022-08-05 21:10:17 -07:00			`u64 i = std::bit_cast<u64>(d);`
FloatUtils: Remove IntDouble and IntFloat Type punning via unions in C++ invokes undefined behavior. Instead, leverage BitCast, our variant of C++2a's std::bit_cast 2018-05-10 10:35:00 -04:00			`if ((i & DOUBLE_EXP) == 0)`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00			`{`
FloatUtils: Remove IntDouble and IntFloat Type punning via unions in C++ invokes undefined behavior. Instead, leverage BitCast, our variant of C++2a's std::bit_cast 2018-05-10 10:35:00 -04:00			`// Turn into signed zero`
			`i &= DOUBLE_SIGN;`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00			`}`
Replace Common::BitCast with std::bit_cast 2022-08-05 21:10:17 -07:00			`return std::bit_cast<double>(i);`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00			`}`

			`enum PPCFpClass`
			`{`
			`PPC_FPCLASS_QNAN = 0x11,`
			`PPC_FPCLASS_NINF = 0x9,`
			`PPC_FPCLASS_NN = 0x8,`
			`PPC_FPCLASS_ND = 0x18,`
			`PPC_FPCLASS_NZ = 0x12,`
			`PPC_FPCLASS_PZ = 0x2,`
			`PPC_FPCLASS_PD = 0x14,`
			`PPC_FPCLASS_PN = 0x4,`
			`PPC_FPCLASS_PINF = 0x5,`
			`};`

			`// Uses PowerPC conventions for the return value, so it can be easily`
			`// used directly in CPU emulation.`
			`u32 ClassifyDouble(double dvalue);`
			`u32 ClassifyFloat(float fvalue);`

			`struct BaseAndDec`
			`{`
			`int m_base;`
			`int m_dec;`
			`};`
			`extern const std::array<BaseAndDec, 32> frsqrte_expected;`
			`extern const std::array<BaseAndDec, 32> fres_expected;`

			`// PowerPC approximation algorithms`
			`double ApproximateReciprocalSquareRoot(double val);`
Fix To Match Hwtests Also changes >= 0 to > 0.0 This technically leads to fewer branches taken ^^; More importantly it looks/feels nicer to me Fixes the approximate reciprocal function - Currently not optimized - Considering rewrite for cleanliness Moves PS rounding to FloatUtils - Done because it's used in more places now Changes TruncateMantissa to occur on read - This is to account for reciprocal cases Adds PS1 getting function for reciprocals Fixes ps_sum1 edge case with rounding TODO: Test what ops can set PS1 edge case - ps_merge is known to be able to 2024-10-02 01:09:23 -05:00			`double ApproximateReciprocal(const UReg_FPSCR& fpscr, double val);`

			`// Instructions which move data without performing operations round a bit weirdly`
			`// Specifically they rounding the mantissa to be like that of a 32-bit float,`
			`// going as far as to focus on the rounding mode, but never actually care about`
			`// making sure the exponent becomes 32-bit`
			`// Either this, or they'll truncate the mantissa down, which will always happen to`
			`// PS1 OR PS0 in ps_rsqrte`
			`inline u64 TruncateMantissaBits(u64 bits)`
			`{`
			`// Truncation can be done by simply cutting off the mantissa bits that don't`
			`// exist in a single precision float`
			`constexpr u64 remove_bits = Common::DOUBLE_FRAC_WIDTH - Common::FLOAT_FRAC_WIDTH;`
			`constexpr u64 remove_mask = (1 << remove_bits) - 1;`
			`return bits & ~remove_mask;`
			`}`

			`inline double TruncateMantissa(double value)`
			`{`
			`u64 bits = std::bit_cast<u64>(value);`
			`u64 trunc_bits = TruncateMantissaBits(bits);`
			`return std::bit_cast<double>(trunc_bits);`
			`}`

			`inline u64 RoundMantissaBitsFinite(u64 bits)`
			`{`
			`const u64 replacement_exp = 0x4000000000000000ull;`

			`// To round only the mantissa, we assume the CPU can change the rounding mode,`
			`// create new double with an exponent that won't cause issues, round to a single,`
			`// and convert back to a double while restoring the original exponent again!`
			`// The removing the exponent is done via subtraction instead of bitwise`
			`// operations due to the possibility that the rounding will cause an overflow`
			`// into the exponent`
			`u64 resized_bits = (bits & (Common::DOUBLE_FRAC \| Common::DOUBLE_SIGN)) \| replacement_exp;`

			`float rounded_float = static_cast<float>(std::bit_cast<double>(resized_bits));`
			`double extended_float = static_cast<double>(rounded_float);`
			`u64 rounded_bits = std::bit_cast<u64>(extended_float);`

			`u64 orig_exp_bits = bits & Common::DOUBLE_EXP;`

			`if (orig_exp_bits == 0)`
			`{`
			`// The exponent isn't incremented for double subnormals`
			`return rounded_bits & ~Common::DOUBLE_EXP;`
			`}`

			`// Handle the change accordingly otherwise!`
			`rounded_bits = (rounded_bits - replacement_exp) + orig_exp_bits;`
			`return rounded_bits;`
			`}`

			`inline u64 RoundMantissaBits(u64 bits)`
			`{`
			`// Checking if the value is non-finite`
			`if ((bits & Common::DOUBLE_EXP) == Common::DOUBLE_EXP)`
			`{`
			`// For infinite and NaN values, the mantissa is simply truncated`
			`return TruncateMantissaBits(bits);`
			`}`

			`return RoundMantissaBitsFinite(bits);`
			`}`

			`inline double RoundMantissaFinite(double value)`
			`{`
			`// This function is only ever used by ps_sum1, because`
			`// for some reason it assumes that ps0 should be rounded with`
			`// finite values rather than checking if they might be infinite`
			`u64 bits = std::bit_cast<u64>(value);`
			`u64 rounded_bits = RoundMantissaBitsFinite(bits);`
			`return std::bit_cast<double>(rounded_bits);`
			`}`

			`inline double RoundMantissa(double value)`
			`{`
			`// The double version of the function just converts to and from bits again`
			`// This would be a necessary step anyways, so it just simplifies code`
			`u64 bits = std::bit_cast<u64>(value);`
			`u64 rounded_bits = RoundMantissaBits(bits);`
			`return std::bit_cast<double>(rounded_bits);`
			`}`
Common: Move floating-point utility functions to FloatUtils.h/.cpp Keeps all of the floating-point utility functions in their own file to keep them all together. This also provides a place for other general-purpose floating-point functions to be added in the future, which will be necessary when improving the flag-setting within the interpreter. 2018-05-07 01:18:41 -04:00
			`} // namespace Common`