gcc -O3 removing live code in int-to-string converter

Question

I was 64-bit-izing Terje Mathisen's itoa function , which takes a char* which must point to a buffer of at least 20 characters and a number, and I created this:

#define LOOP_WORK(number, shift) high *= 5; low *= 5; buf[number] = (high >> shift) + '0'; \
    buf[number+10] = (low >> shift) + '0'; high &= andop; low &= andop; andop >>= 1

#define uint128_t __uint128_t
#define uint64_t  unsigned long

void u2s(char* buf, unsigned long num) {
    // Split number into low/high pair.
    uint128_t split = num * 7922816251426433760;
    split += num >> 1;
    uint64_t high = split >> 96;
    uint64_t low = num - (high * 10000000000);
    // Transform numbers into usable decimal fractions.
    split = high * 18446744074;
    buf[0]  = (split >> 64) + '0';
    high = (uint64_t)split;
    split = low  * 18446744074;
    buf[10] = (split >> 64) + '0';
    low  = (uint64_t)split;
    // Adjust numbers and multiply by 2 (so we don't have to multiply by 10 later)
    high = (high + 7) >> 3;
    low  = (low  + 7) >> 3;
    // Store special and number
    uint64_t andop = 0x0fffffffffffffff;
    LOOP_WORK(1, 60);
    LOOP_WORK(2, 59);
    LOOP_WORK(3, 58);
    LOOP_WORK(4, 57);
    LOOP_WORK(5, 56);
    LOOP_WORK(6, 55);
    LOOP_WORK(7, 54);
    LOOP_WORK(8, 53);
    // Final loop, without extra stuffs
    high *= 5;
    low  *= 5;
    buf[9]  = (high >> 52) + '0';
    buf[19] = (low  >> 52) + '0';
}

#undef LOOP_WORK

Here's an equivalent version in assembly (handwritten in AT&T):

u2s:
    // tmp128(rax:rdx) = num * 7922816251426433760
    movq $7922816251426433760, %rax
    mulq %rsi
    // tmp128(rax:rdx) += num >> 1
    movq %rsi, %rcx
    shrq $0x1, %rcx
    addq %rcx, %rax
    adcq $0x0, %rdx
    // high(rdx) = tmp128(rax:rdx) >> 96
    shrq $32,  %rdx
    // high(rcx); low(rsi) = num - (high * 10^10)
    movq %rdx, %rcx
    movq $10000000000, %rax
    mulq %rdx
    subq %rax, %rsi
    // high2(rax:rdx) = high * 18446744074
    movq $18446744074, %rax
    mulq %rcx
    // buf[0] = (high2 >> 64) + '0'
    addb $'0', %dl
    movb %dl,  (%rdi)
    // low2(rax:rdx) = low * 18446744074
    movq %rax, %rcx
    movq $18446744074, %rax
    mulq %rsi
    // buf[10] = (low2 >> 64) + '0'
    addb $'0', %dl
    movb %dl,  10(%rdi)
    // high(rcx) = (u64)high2
    // low(rax)  = (u64)low2
    // high = (high + 7) >> 3
    addb $0x7, %cl
    shrq $0x3, %rcx
    // low  = (low  + 7) >> 3
    addb $0x7, %al
    shrq $0x3, %rax
    // low (rax) *= 5
    leaq (%rax,%rax,4), %rax
    // high(rcx) *= 5
    leaq (%rcx,%rcx,4), %rcx
    // buf[1]  = (high >> 60) + '0'
    movq %rcx, %rdx
    shrq $60,  %rdx
    addb $'0', %dl
    movb %dl,  1(%rdi)
    // buf[11] = (low  >> 60) + '0'
    movq %rax, %rdx
    shrq $60,  %rdx
    addb $'0', %dl
    movb %dl,  11(%rdi)
    // Store number 0x0fffffffffffffff
    movq $0x0fffffffffffffff, %rsi
    // high &= 0x0fffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x0fffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[2]  = (high >> 59) + '0'
    movq %rcx, %rdx
    shrq $59,  %rdx
    addb $'0', %dl
    movb %dl,  2(%rdi)
    // buf[12] = (low  >> 59) + '0'
    movq %rax, %rdx
    shrq $59,  %rdx
    addb $'0', %dl
    movb %dl,  12(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x07ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x07ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low *= 5
    leaq (%rax,%rax,4), %rax
    // buf[3]  = (high >> 58) + '0'
    movq %rcx, %rdx
    shrq $58,  %rdx
    addb $'0', %dl
    movb %dl,  3(%rdi)
    // buf[13] = (low  >> 58) + '0'
    movq %rax, %rdx
    shrq $58,  %rdx
    addb $'0', %dl
    movb %dl,  13(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x03ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x03ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[4]  = (high >> 57) + '0'
    movq %rcx, %rdx
    shrq $57,  %rdx
    addb $'0', %dl
    movb %dl,  4(%rdi)
    // buf[14] = (low  >> 57) + '0'
    movq %rax, %rdx
    shrq $57,  %rdx
    addb $'0', %dl
    movb %dl,  14(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x01ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x01ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[5]  = (high >> 56) + '0'
    movq %rcx, %rdx
    shrq $56,  %rdx
    addb $'0', %dl
    movb %dl,  5(%rdi)
    // buf[15] = (low  >> 56) + '0'
    movq %rax, %rdx
    shrq $56,  %rdx
    addb $'0', %dl
    movb %dl,  15(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x00ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x00ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[6]  = (high >> 55) + '0'
    movq %rcx, %rdx
    shrq $55,  %rdx
    addb $'0', %dl
    movb %dl,  6(%rdi)
    // buf[16] = (low  >> 55) + '0'
    movq %rax, %rdx
    shrq $55,  %rdx
    addb $'0', %dl
    movb %dl,  16(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x007fffffffffffff
    andq %rsi, %rcx
    // low  &= 0x007fffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[7]  = (high >> 54) + '0'
    movq %rcx, %rdx
    shrq $54,  %rdx
    addb $'0', %dl
    movb %dl,  7(%rdi)
    // buf[17] = (low  >> 54) + '0'
    movq %rax, %rdx
    shrq $54,  %rdx
    addb $'0', %dl
    movb %dl,  17(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x003fffffffffffff
    andq %rsi, %rcx
    // low  &= 0x003fffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[8]  = (high >> 53) + '0'
    movq %rcx, %rdx
    shrq $53,  %rdx
    addb $'0', %dl
    movb %dl,  8(%rdi)
    // buf[18] = (low  >> 53) + '0e'
    movq %rax, %rdx
    shrq $53,  %rdx
    addb $'0', %dl
    movb %dl,  18(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x001fffffffffffff
    andq %rsi, %rcx
    // low  &= 0x001fffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[9]  = (high >> 52) + '0'
    shrq $52,  %rcx
    addb $'0', %cl
    movb %cl,  9(%rdi)
    // buf[19] = (high >> 52) + '0'
    shrq $52,  %rax
    addb $'0', %al
    movb %al,  19(%rdi)
    retq

When I compile the C version in GCC with optimizations -O3 , I find that code for one of the high/low variables is optimized out, it has hardcoded values for andop everywhere, and the code at the start to load in buf[0] and buf[10] inputs hard-coded values ( 48 and 3472328296227680304 , respectively). When I run GCC with -fverbose-asm -S , I find that GCC optimized away high completely! I'm guessing that my C code's the problem (I'm not too great at C), but I don't know why. Terje Mathisen's post has it's own version in C, but it does not include the handwritten assembly optimizations also given there. Why is GCC messing me up so much?

BTW, here's the code from gcc (Gentoo 7.3.0-r1 p1.1) 7.3.0 (USE flags: -cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv ) with flags -O3 -c (an objdump with flags -d ):

u2s.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <u2s>:
   0:   48 b8 0a fa 82 4b 04    movabs $0x44b82fa0a,%rax
   7:   00 00 00 
   a:   48 b9 30 30 30 30 30    movabs $0x3030303030303030,%rcx
  11:   30 30 30 
  14:   c6 47 0a 30             movb   $0x30,0xa(%rdi) // What?
  18:   48 0f af f0             imul   %rax,%rsi
  1c:   48 89 0f                mov    %rcx,(%rdi) // What?
  1f:   48 83 c6 07             add    $0x7,%rsi
  23:   48 c1 ee 03             shr    $0x3,%rsi
  27:   48 8d 04 b6             lea    (%rsi,%rsi,4),%rax
  2b:   48 89 c2                mov    %rax,%rdx
  2e:   48 c1 ea 3c             shr    $0x3c,%rdx
  32:   83 c2 30                add    $0x30,%edx
  35:   88 57 0b                mov    %dl,0xb(%rdi)
  38:   48 ba ff ff ff ff ff    movabs $0xfffffffffffffff,%rdx
  3f:   ff ff 0f 
  42:   48 21 d0                and    %rdx,%rax // There should be another AND
  45:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  49:   48 89 c2                mov    %rax,%rdx
  4c:   48 c1 ea 3b             shr    $0x3b,%rdx
  50:   83 c2 30                add    $0x30,%edx
  53:   88 57 0c                mov    %dl,0xc(%rdi)
  56:   48 ba ff ff ff ff ff    movabs $0x7ffffffffffffff,%rdx
  5d:   ff ff 07 
  60:   48 21 d0                and    %rdx,%rax
  63:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  67:   48 89 c2                mov    %rax,%rdx
  6a:   48 c1 ea 3a             shr    $0x3a,%rdx
  6e:   83 c2 30                add    $0x30,%edx
  71:   88 57 0d                mov    %dl,0xd(%rdi)
  74:   48 ba ff ff ff ff ff    movabs $0x3ffffffffffffff,%rdx
  7b:   ff ff 03 
  7e:   48 21 d0                and    %rdx,%rax
  81:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  85:   48 89 c2                mov    %rax,%rdx
  88:   48 c1 ea 39             shr    $0x39,%rdx
  8c:   83 c2 30                add    $0x30,%edx
  8f:   88 57 0e                mov    %dl,0xe(%rdi)
  92:   48 ba ff ff ff ff ff    movabs $0x1ffffffffffffff,%rdx
  99:   ff ff 01 
  9c:   48 21 d0                and    %rdx,%rax
  9f:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  a3:   48 89 c2                mov    %rax,%rdx
  a6:   48 c1 ea 38             shr    $0x38,%rdx
  aa:   83 c2 30                add    $0x30,%edx
  ad:   88 57 0f                mov    %dl,0xf(%rdi)
  b0:   48 ba ff ff ff ff ff    movabs $0xffffffffffffff,%rdx
  b7:   ff ff 00 
  ba:   48 21 d0                and    %rdx,%rax
  bd:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  c1:   48 89 c2                mov    %rax,%rdx
  c4:   48 c1 ea 37             shr    $0x37,%rdx
  c8:   83 c2 30                add    $0x30,%edx
  cb:   88 57 10                mov    %dl,0x10(%rdi)
  ce:   48 ba ff ff ff ff ff    movabs $0x7fffffffffffff,%rdx
  d5:   ff 7f 00 
  d8:   48 21 d0                and    %rdx,%rax
  db:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  df:   48 89 c2                mov    %rax,%rdx
  e2:   48 c1 ea 36             shr    $0x36,%rdx
  e6:   83 c2 30                add    $0x30,%edx
  e9:   88 57 11                mov    %dl,0x11(%rdi)
  ec:   48 ba ff ff ff ff ff    movabs $0x3fffffffffffff,%rdx
  f3:   ff 3f 00 
  f6:   48 21 d0                and    %rdx,%rax
  f9:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  fd:   48 89 c2                mov    %rax,%rdx
 100:   48 c1 ea 35             shr    $0x35,%rdx
 104:   83 c2 30                add    $0x30,%edx
 107:   88 57 12                mov    %dl,0x12(%rdi)
 10a:   48 ba ff ff ff ff ff    movabs $0x1fffffffffffff,%rdx
 111:   ff 1f 00 
 114:   48 21 d0                and    %rdx,%rax
 117:   ba 30 30 00 00          mov    $0x3030,%edx
 11c:   48 8d 04 80             lea    (%rax,%rax,4),%rax
 120:   66 89 57 08             mov    %dx,0x8(%rdi)
 124:   48 c1 e8 34             shr    $0x34,%rax
 128:   83 c0 30                add    $0x30,%eax
 12b:   88 47 13                mov    %al,0x13(%rdi)
 12e:   c3                      retq

PS: I've tested my handwritten assembly well, and so most differences between that and the objdump are mostly either code reordering or errors.

PPS: @Peter_Corde's answer has prevented the optimization-away of high , but the starting code is still messed up! Here's an excerpt:

   0:   48 b9 e0 ea f6 5e 67    movabs $0x6df37f675ef6eae0,%rcx
   7:   7f f3 6d 
   a:   49 b8 00 e4 0b 54 02    movabs $0x2540be400,%r8
  11:   00 00 00 
  14:   c6 07 30                movb   $0x30,(%rdi) // NOT GOOD
  17:   48 89 c8                mov    %rcx,%rax
  1a:   48 89 f1                mov    %rsi,%rcx
  1d:   c6 47 0a 30             movb   $0x30,0xa(%rdi) // NOT GOOD
  21:   48 f7 e6                mul    %rsi
  24:   48 d1 e9                shr    %rcx
  27:   49 89 c1                mov    %rax,%r9
  2a:   48 89 c8                mov    %rcx,%rax
  2d:   49 89 d2                mov    %rdx,%r10
  30:   31 d2                   xor    %edx,%edx
  32:   4c 01 c8                add    %r9,%rax
  35:   48 b9 0a fa 82 4b 04    movabs $0x44b82fa0a,%rcx
  3c:   00 00 00 
  3f:   4c 11 d2                adc    %r10,%rdx
  42:   48 c1 ea 20             shr    $0x20,%rdx
  46:   48 89 d0                mov    %rdx,%rax
  49:   49 0f af d0             imul   %r8,%rdx
  4d:   48 0f af c1             imul   %rcx,%rax
  51:   48 29 d6                sub    %rdx,%rsi
  54:   48 0f af f1             imul   %rcx,%rsi
  58:   48 83 c0 07             add    $0x7,%rax
  5c:   48 c1 e8 03             shr    $0x3,%rax

Answer 1

Yes, when the compiler shows you that some of your function outputs unexpectedly don't depend on the input, that's usually a sign that your C source doesn't mean what you thought it did.

In this case it looks like uint128_t split = num * 7922816251426433760; is the problem. num is an unsigned long ( uint64_t in the x86-64 SysV ABI which you're compiling for). Thus, the * operator produces a 64-bit result which is zero-extended as an initializer for uint128_t split .

uint128_t split = (unsigned __int128) num * 7922816251426433760;

casts num to a 128-bit integer before the multiply, so you get a full 128-bit result with mulq . ( gcc7.3 -O3 on Godbolt ).

I didn't look into the full details of the rest of your function; there may be other problems, but that's the first one I saw.

re: update:

Is split = high * 18446744074; supposed to be split = high * (unsigned __int128)18446744074; ? Looks like exactly the same bug. Check the rest of your C code for any more cases where you assign the result of a calculation to a wider variable.

gcc -O3 removing live code in int-to-string converter

Question

1 answers

solution1
2 ACCPTED 2018-05-12 16:44:29

gcc -O3 removing live code in int-to-string converter

Question

1 answers

solution1 2 ACCPTED 2018-05-12 16:44:29

solution1
2 ACCPTED 2018-05-12 16:44:29