It is well known that fixed point arithmetics is equavalent to integer arithmetics with some constraints. In this way I caught the issue. Let's see:
#include <stdio.h>
#include <stdint.h>
#define USE_C_FUNC 1
#if USE_C_FUNC != 0
uint64_t fixmd(uint64_t a, uint64_t b)
{
uint64_t c = a * b / 10000000000LL;
return c;
}
#else
uint64_t fixmd(uint64_t a, uint64_t b)
{
uint64_t c;
asm (
"mov %1, %%rax\n"
"mul %2\n"
"mov $10000000000, %%r8\n"
"div %%r8\n"
"mov %%rax, %0\n"
: "=r" (c)
: "r" (a), "r" (b)
: "rax", "rdx", "r8"
);
return c;
}
#endif
void main(void)
{
uint64_t x = 12589254118LL;
uint64_t y = fixmd(x, x);
printf("x=0x%llX=%llu, y=0x%llX=%llu\n", x, x, y, y);
}
In case #define USE_C_FUNC 1
program prints WRONG result:
x=0x2EE60C5E6=12589254118, y=0x410F8719=1091536665
In case #define USE_C_FUNC 0
program (with inline assembly) prints RIGHT result:
x=0x2EE60C5E6=12589254118, y=0x3B0AB8254=15848931924
I had compiling this example without any optimization (with gcc option -O0).
$ gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/10/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa:hsa
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Debian 10.2.1-6' --with-bugurl=file:///usr/share/doc/gcc-10/README.Bugs --enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-10 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-10-Km9U7s/gcc-10-10.2.1/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-10-Km9U7s/gcc-10-10.2.1/debian/tmp-gcn/usr,hsa --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-mutex
Thread model: posix
Supported LTO compression algorithms: zlib zstd
gcc version 10.2.1 20210110 (Debian 10.2.1-6)
objdump -D uint64
for WRONG result for function fixmd() is:
0000000000001135 <fixmd>:
1135: 55 push %rbp
1136: 48 89 e5 mov %rsp,%rbp
1139: 48 89 7d e8 mov %rdi,-0x18(%rbp)
113d: 48 89 75 e0 mov %rsi,-0x20(%rbp)
1141: 48 8b 45 e8 mov -0x18(%rbp),%rax
1145: 48 0f af 45 e0 imul -0x20(%rbp),%rax
114a: 48 ba bf d5 ed bd ce movabs $0xdbe6fecebdedd5bf,%rdx
1151: fe e6 db
1154: 48 f7 e2 mul %rdx
1157: 48 89 d0 mov %rdx,%rax
115a: 48 c1 e8 21 shr $0x21,%rax
115e: 48 89 45 f8 mov %rax,-0x8(%rbp)
1162: 48 8b 45 f8 mov -0x8(%rbp),%rax
1166: 5d pop %rbp
1167: c3 retq
objdump -D uint64
for GOOD result for function fixmd() is:
0000000000001135 <fixmd>:
1135: 55 push %rbp
1136: 48 89 e5 mov %rsp,%rbp
1139: 48 89 7d e8 mov %rdi,-0x18(%rbp)
113d: 48 89 75 e0 mov %rsi,-0x20(%rbp)
1141: 48 8b 4d e8 mov -0x18(%rbp),%rcx
1145: 48 8b 75 e0 mov -0x20(%rbp),%rsi
1149: 48 89 c8 mov %rcx,%rax
114c: 48 f7 e6 mul %rsi
114f: 49 b8 00 e4 0b 54 02 movabs $0x2540be400,%r8
1156: 00 00 00
1159: 49 f7 f0 div %r8
115c: 48 89 c1 mov %rax,%rcx
115f: 48 89 4d f8 mov %rcx,-0x8(%rbp)
1163: 48 8b 45 f8 mov -0x8(%rbp),%rax
1167: 5d pop %rbp
1168: c3 retq
In the code for fixmd() in C the problem is quite clean: gcc uses signed multiplication (imul) for unsigned values and substitute division with multiplication on the coefficient (with overflow) and binary shifting. Seems, it is the bug, is not it?
This is not a bug. Your expectations are incorrect.
Unsigned integer arithmetic is performed modulo the largest value the type can hold +1. Loosely speaking, this means that for a 64 bit type that anything that overflows 64 bits is truncated.
In your particular case, you're multiplying 12589254118 * 12589254118. The arithmetic result of this is 158,489,319,247,579,957,924. This is larger than what will fit in a 64 bit type, so the result is modulo 2 64 which gives us 10,915,366,657,903,544,996, and dividing by 10000000000 gives us 1,091,536,665 which matches what the C code generates.
gcc supports 128 bit types, so you can fix this by casting one of the operands to __int128
to perform the math with 128 bits.
uint64_t c = (unsigned __int128)a * b / 10000000000LL;
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.