Two uint64_t numbers multiplication and division on coefficient in the C program are wrong compiled with GCC

Question

It is well known that fixed point arithmetics is equavalent to integer arithmetics with some constraints. In this way I caught the issue. Let's see:

#include <stdio.h>
#include <stdint.h>

#define USE_C_FUNC 1

#if USE_C_FUNC != 0
uint64_t fixmd(uint64_t a, uint64_t b)
{
    uint64_t c = a * b / 10000000000LL;
    return c;
}
#else
uint64_t fixmd(uint64_t a, uint64_t b)
{
    uint64_t c;

    asm (
    "mov %1, %%rax\n"
    "mul %2\n"
    "mov $10000000000, %%r8\n"
    "div %%r8\n"
    "mov %%rax, %0\n"
    : "=r" (c) 
    : "r" (a), "r" (b)
    : "rax", "rdx", "r8"
    );
    return c;
}
#endif

void main(void)
{
    uint64_t x = 12589254118LL;
    uint64_t y = fixmd(x, x);
    printf("x=0x%llX=%llu, y=0x%llX=%llu\n", x, x, y, y);
}

In case #define USE_C_FUNC 1 program prints WRONG result:

x=0x2EE60C5E6=12589254118, y=0x410F8719=1091536665

In case #define USE_C_FUNC 0 program (with inline assembly) prints RIGHT result:

x=0x2EE60C5E6=12589254118, y=0x3B0AB8254=15848931924

I had compiling this example without any optimization (with gcc option -O0).

$ gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/10/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa:hsa
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Debian 10.2.1-6' --with-bugurl=file:///usr/share/doc/gcc-10/README.Bugs --enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-10 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-10-Km9U7s/gcc-10-10.2.1/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-10-Km9U7s/gcc-10-10.2.1/debian/tmp-gcn/usr,hsa --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-mutex
Thread model: posix
Supported LTO compression algorithms: zlib zstd
gcc version 10.2.1 20210110 (Debian 10.2.1-6)

objdump -D uint64 for WRONG result for function fixmd() is:

0000000000001135 <fixmd>:
    1135:   55                      push   %rbp
    1136:   48 89 e5                mov    %rsp,%rbp
    1139:   48 89 7d e8             mov    %rdi,-0x18(%rbp)
    113d:   48 89 75 e0             mov    %rsi,-0x20(%rbp)
    1141:   48 8b 45 e8             mov    -0x18(%rbp),%rax
    1145:   48 0f af 45 e0          imul   -0x20(%rbp),%rax
    114a:   48 ba bf d5 ed bd ce    movabs $0xdbe6fecebdedd5bf,%rdx
    1151:   fe e6 db 
    1154:   48 f7 e2                mul    %rdx
    1157:   48 89 d0                mov    %rdx,%rax
    115a:   48 c1 e8 21             shr    $0x21,%rax
    115e:   48 89 45 f8             mov    %rax,-0x8(%rbp)
    1162:   48 8b 45 f8             mov    -0x8(%rbp),%rax
    1166:   5d                      pop    %rbp
    1167:   c3                      retq

objdump -D uint64 for GOOD result for function fixmd() is:

0000000000001135 <fixmd>:
    1135:   55                      push   %rbp
    1136:   48 89 e5                mov    %rsp,%rbp
    1139:   48 89 7d e8             mov    %rdi,-0x18(%rbp)
    113d:   48 89 75 e0             mov    %rsi,-0x20(%rbp)
    1141:   48 8b 4d e8             mov    -0x18(%rbp),%rcx
    1145:   48 8b 75 e0             mov    -0x20(%rbp),%rsi
    1149:   48 89 c8                mov    %rcx,%rax
    114c:   48 f7 e6                mul    %rsi
    114f:   49 b8 00 e4 0b 54 02    movabs $0x2540be400,%r8
    1156:   00 00 00 
    1159:   49 f7 f0                div    %r8
    115c:   48 89 c1                mov    %rax,%rcx
    115f:   48 89 4d f8             mov    %rcx,-0x8(%rbp)
    1163:   48 8b 45 f8             mov    -0x8(%rbp),%rax
    1167:   5d                      pop    %rbp
    1168:   c3                      retq

In the code for fixmd() in C the problem is quite clean: gcc uses signed multiplication (imul) for unsigned values and substitute division with multiplication on the coefficient (with overflow) and binary shifting. Seems, it is the bug, is not it?

Answer 1

This is not a bug. Your expectations are incorrect.

Unsigned integer arithmetic is performed modulo the largest value the type can hold +1. Loosely speaking, this means that for a 64 bit type that anything that overflows 64 bits is truncated.

In your particular case, you're multiplying 12589254118 * 12589254118. The arithmetic result of this is 158,489,319,247,579,957,924. This is larger than what will fit in a 64 bit type, so the result is modulo 2 ⁶⁴ which gives us 10,915,366,657,903,544,996, and dividing by 10000000000 gives us 1,091,536,665 which matches what the C code generates.

gcc supports 128 bit types, so you can fix this by casting one of the operands to __int128 to perform the math with 128 bits.

uint64_t c = (unsigned __int128)a * b / 10000000000LL;

Two uint64_t numbers multiplication and division on coefficient in the C program are wrong compiled with GCC

Question

1 answers

solution1
5 2021-05-27 15:45:33

Two uint64_t numbers multiplication and division on coefficient in the C program are wrong compiled with GCC

Question

1 answers

solution1 5 2021-05-27 15:45:33

solution1
5 2021-05-27 15:45:33