简体   繁体   English

gcc -O3删除int到字符串转换器中的实时代码

[英]gcc -O3 removing live code in int-to-string converter

I was porting Terje Mathisen's itoa function to 64 bits. It takes a number and a char* that must point to a buffer of at least 20 characters, and I came up with this: 我正在把Terje Mathisen的itoa函数改写为64位版本。该函数接收一个数字和一个char* ,该char*必须指向至少能容纳20个字符的缓冲区。我写出了以下代码:

#define LOOP_WORK(number, shift) high *= 5; low *= 5; buf[number] = (high >> shift) + '0'; \
    buf[number+10] = (low >> shift) + '0'; high &= andop; low &= andop; andop >>= 1

#define uint128_t __uint128_t
#define uint64_t  unsigned long

void u2s(char* buf, unsigned long num) {
    // Split number into low/high pair.
    uint128_t split = num * 7922816251426433760;
    split += num >> 1;
    uint64_t high = split >> 96;
    uint64_t low = num - (high * 10000000000);
    // Transform numbers into usable decimal fractions.
    split = high * 18446744074;
    buf[0]  = (split >> 64) + '0';
    high = (uint64_t)split;
    split = low  * 18446744074;
    buf[10] = (split >> 64) + '0';
    low  = (uint64_t)split;
    // Adjust numbers and multiply by 2 (so we don't have to multiply by 10 later)
    high = (high + 7) >> 3;
    low  = (low  + 7) >> 3;
    // Store special and number
    uint64_t andop = 0x0fffffffffffffff;
    LOOP_WORK(1, 60);
    LOOP_WORK(2, 59);
    LOOP_WORK(3, 58);
    LOOP_WORK(4, 57);
    LOOP_WORK(5, 56);
    LOOP_WORK(6, 55);
    LOOP_WORK(7, 54);
    LOOP_WORK(8, 53);
    // Final loop, without extra stuffs
    high *= 5;
    low  *= 5;
    buf[9]  = (high >> 52) + '0';
    buf[19] = (low  >> 52) + '0';
}

#undef LOOP_WORK

Here's an equivalent version in assembly (handwritten in AT&T syntax): 这是等效的汇编版本(用AT&T语法手写):

//-----------------------------------------------------------------------
// void u2s(char *buf, unsigned long num)
// Syntax: AT&T (GNU as), x86-64
// ABI:    SysV AMD64
// In:     rdi = buf (bytes 0..19 are written, no NUL terminator)
//         rsi = num
// Out:    buf[0..19] = num as 20 zero-padded ASCII decimal digits
// Clobb:  rax, rcx, rdx, rsi, flags
// Stack:  none (leaf routine)
//
// Register roles after the split:
//         rcx = fraction for the high 10 digits
//         rax = fraction for the low 10 digits
//         rsi = mask that strips the digit just emitted
//         rdx = scratch
//-----------------------------------------------------------------------
u2s:
    // high(rdx) = num / 10^10, exactly.
    // 0xDBE6FECEBDEDD5BF = ceil(2^97 / 10^10); the upper half of the
    // 128-bit product shifted right by 33 is the exact quotient for every
    // 64-bit num. (The former 2^96-based constant plus num >> 1
    // over-estimated the quotient for large inputs.)
    movabsq $0xDBE6FECEBDEDD5BF, %rax
    mulq %rsi
    shrq $33,  %rdx
    // high(rcx); low(rsi) = num - (high * 10^10)
    movq %rdx, %rcx
    movq $10000000000, %rax
    mulq %rdx
    subq %rax, %rsi
    // high2(rax:rdx) = high * 18446744074   (2^64 / 10^9, rounded up)
    movq $18446744074, %rax
    mulq %rcx
    // buf[0] = (high2 >> 64) + '0'
    addb $'0', %dl
    movb %dl,  (%rdi)
    // low2(rax:rdx) = low * 18446744074
    movq %rax, %rcx
    movq $18446744074, %rax
    mulq %rsi
    // buf[10] = (low2 >> 64) + '0'
    addb $'0', %dl
    movb %dl,  10(%rdi)
    // high(rcx) = (u64)high2
    // low(rax)  = (u64)low2
    // high = (high + 7) >> 3
    // The add must be 64 bits wide: a byte-sized add drops the carry out of
    // the low byte and rounds the fraction down instead of up.
    addq $0x7, %rcx
    shrq $0x3, %rcx
    // low  = (low  + 7) >> 3
    addq $0x7, %rax
    shrq $0x3, %rax
    // low (rax) *= 5
    leaq (%rax,%rax,4), %rax
    // high(rcx) *= 5
    leaq (%rcx,%rcx,4), %rcx
    // buf[1]  = (high >> 60) + '0'
    movq %rcx, %rdx
    shrq $60,  %rdx
    addb $'0', %dl
    movb %dl,  1(%rdi)
    // buf[11] = (low  >> 60) + '0'
    movq %rax, %rdx
    shrq $60,  %rdx
    addb $'0', %dl
    movb %dl,  11(%rdi)
    // Store number 0x0fffffffffffffff
    movq $0x0fffffffffffffff, %rsi
    // high &= 0x0fffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x0fffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[2]  = (high >> 59) + '0'
    movq %rcx, %rdx
    shrq $59,  %rdx
    addb $'0', %dl
    movb %dl,  2(%rdi)
    // buf[12] = (low  >> 59) + '0'
    movq %rax, %rdx
    shrq $59,  %rdx
    addb $'0', %dl
    movb %dl,  12(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x07ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x07ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[3]  = (high >> 58) + '0'
    movq %rcx, %rdx
    shrq $58,  %rdx
    addb $'0', %dl
    movb %dl,  3(%rdi)
    // buf[13] = (low  >> 58) + '0'
    movq %rax, %rdx
    shrq $58,  %rdx
    addb $'0', %dl
    movb %dl,  13(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x03ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x03ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[4]  = (high >> 57) + '0'
    movq %rcx, %rdx
    shrq $57,  %rdx
    addb $'0', %dl
    movb %dl,  4(%rdi)
    // buf[14] = (low  >> 57) + '0'
    movq %rax, %rdx
    shrq $57,  %rdx
    addb $'0', %dl
    movb %dl,  14(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x01ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x01ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[5]  = (high >> 56) + '0'
    movq %rcx, %rdx
    shrq $56,  %rdx
    addb $'0', %dl
    movb %dl,  5(%rdi)
    // buf[15] = (low  >> 56) + '0'
    movq %rax, %rdx
    shrq $56,  %rdx
    addb $'0', %dl
    movb %dl,  15(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x00ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x00ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[6]  = (high >> 55) + '0'
    movq %rcx, %rdx
    shrq $55,  %rdx
    addb $'0', %dl
    movb %dl,  6(%rdi)
    // buf[16] = (low  >> 55) + '0'
    movq %rax, %rdx
    shrq $55,  %rdx
    addb $'0', %dl
    movb %dl,  16(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x007fffffffffffff
    andq %rsi, %rcx
    // low  &= 0x007fffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[7]  = (high >> 54) + '0'
    movq %rcx, %rdx
    shrq $54,  %rdx
    addb $'0', %dl
    movb %dl,  7(%rdi)
    // buf[17] = (low  >> 54) + '0'
    movq %rax, %rdx
    shrq $54,  %rdx
    addb $'0', %dl
    movb %dl,  17(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x003fffffffffffff
    andq %rsi, %rcx
    // low  &= 0x003fffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[8]  = (high >> 53) + '0'
    movq %rcx, %rdx
    shrq $53,  %rdx
    addb $'0', %dl
    movb %dl,  8(%rdi)
    // buf[18] = (low  >> 53) + '0'
    movq %rax, %rdx
    shrq $53,  %rdx
    addb $'0', %dl
    movb %dl,  18(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x001fffffffffffff
    andq %rsi, %rcx
    // low  &= 0x001fffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[9]  = (high >> 52) + '0'
    shrq $52,  %rcx
    addb $'0', %cl
    movb %cl,  9(%rdi)
    // buf[19] = (low  >> 52) + '0'
    shrq $52,  %rax
    addb $'0', %al
    movb %al,  19(%rdi)
    retq

When I compile the C version with GCC at -O3 , I find that the code for one of the high/low variables is optimized out, that andop is replaced by hard-coded values everywhere, and that the code at the start that fills buf[0] and buf[10] stores hard-coded values ( 48 and 3472328296227680304 , respectively). 当我用GCC以-O3优化编译C版本时,我发现high/low其中一个变量的代码被优化掉了, andop在各处都被替换成了硬编码值,并且开头填充buf[0]和buf[10]的代码写入的是硬编码值(分别为48和3472328296227680304 )。 When I run GCC with -fverbose-asm -S , I find that GCC optimized away high completely! 当我用-fverbose-asm -S运行GCC时,我发现GCC把high完全优化掉了! I'm guessing that my C code is the problem (I'm not too great at C), but I don't know why. 我猜问题出在我的C代码上(我不太擅长C语言),但我不知道为什么。 Terje Mathisen's post has its own version in C, but it does not include the handwritten assembly optimizations also given there. Terje Mathisen的帖子里有他自己的C语言版本,但其中不包含那里同时给出的手写汇编优化。 Why is GCC messing me up so much? 为什么GCC把我的代码搞得这么乱?

BTW, here's the code from gcc (Gentoo 7.3.0-r1 p1.1) 7.3.0 (USE flags: -cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv ) with flags -O3 -c (an objdump with flags -d ): 顺便说一句,这是gcc (Gentoo 7.3.0-r1 p1.1) 7.3.0 (USE标志: -cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv )使用标志-O3 -c生成的代码(由带-d标志的objdump输出):

u2s.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <u2s>:
   0:   48 b8 0a fa 82 4b 04    movabs $0x44b82fa0a,%rax
   7:   00 00 00 
   a:   48 b9 30 30 30 30 30    movabs $0x3030303030303030,%rcx
  11:   30 30 30 
  14:   c6 47 0a 30             movb   $0x30,0xa(%rdi) // What?
  18:   48 0f af f0             imul   %rax,%rsi
  1c:   48 89 0f                mov    %rcx,(%rdi) // What?
  1f:   48 83 c6 07             add    $0x7,%rsi
  23:   48 c1 ee 03             shr    $0x3,%rsi
  27:   48 8d 04 b6             lea    (%rsi,%rsi,4),%rax
  2b:   48 89 c2                mov    %rax,%rdx
  2e:   48 c1 ea 3c             shr    $0x3c,%rdx
  32:   83 c2 30                add    $0x30,%edx
  35:   88 57 0b                mov    %dl,0xb(%rdi)
  38:   48 ba ff ff ff ff ff    movabs $0xfffffffffffffff,%rdx
  3f:   ff ff 0f 
  42:   48 21 d0                and    %rdx,%rax // There should be another AND
  45:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  49:   48 89 c2                mov    %rax,%rdx
  4c:   48 c1 ea 3b             shr    $0x3b,%rdx
  50:   83 c2 30                add    $0x30,%edx
  53:   88 57 0c                mov    %dl,0xc(%rdi)
  56:   48 ba ff ff ff ff ff    movabs $0x7ffffffffffffff,%rdx
  5d:   ff ff 07 
  60:   48 21 d0                and    %rdx,%rax
  63:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  67:   48 89 c2                mov    %rax,%rdx
  6a:   48 c1 ea 3a             shr    $0x3a,%rdx
  6e:   83 c2 30                add    $0x30,%edx
  71:   88 57 0d                mov    %dl,0xd(%rdi)
  74:   48 ba ff ff ff ff ff    movabs $0x3ffffffffffffff,%rdx
  7b:   ff ff 03 
  7e:   48 21 d0                and    %rdx,%rax
  81:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  85:   48 89 c2                mov    %rax,%rdx
  88:   48 c1 ea 39             shr    $0x39,%rdx
  8c:   83 c2 30                add    $0x30,%edx
  8f:   88 57 0e                mov    %dl,0xe(%rdi)
  92:   48 ba ff ff ff ff ff    movabs $0x1ffffffffffffff,%rdx
  99:   ff ff 01 
  9c:   48 21 d0                and    %rdx,%rax
  9f:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  a3:   48 89 c2                mov    %rax,%rdx
  a6:   48 c1 ea 38             shr    $0x38,%rdx
  aa:   83 c2 30                add    $0x30,%edx
  ad:   88 57 0f                mov    %dl,0xf(%rdi)
  b0:   48 ba ff ff ff ff ff    movabs $0xffffffffffffff,%rdx
  b7:   ff ff 00 
  ba:   48 21 d0                and    %rdx,%rax
  bd:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  c1:   48 89 c2                mov    %rax,%rdx
  c4:   48 c1 ea 37             shr    $0x37,%rdx
  c8:   83 c2 30                add    $0x30,%edx
  cb:   88 57 10                mov    %dl,0x10(%rdi)
  ce:   48 ba ff ff ff ff ff    movabs $0x7fffffffffffff,%rdx
  d5:   ff 7f 00 
  d8:   48 21 d0                and    %rdx,%rax
  db:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  df:   48 89 c2                mov    %rax,%rdx
  e2:   48 c1 ea 36             shr    $0x36,%rdx
  e6:   83 c2 30                add    $0x30,%edx
  e9:   88 57 11                mov    %dl,0x11(%rdi)
  ec:   48 ba ff ff ff ff ff    movabs $0x3fffffffffffff,%rdx
  f3:   ff 3f 00 
  f6:   48 21 d0                and    %rdx,%rax
  f9:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  fd:   48 89 c2                mov    %rax,%rdx
 100:   48 c1 ea 35             shr    $0x35,%rdx
 104:   83 c2 30                add    $0x30,%edx
 107:   88 57 12                mov    %dl,0x12(%rdi)
 10a:   48 ba ff ff ff ff ff    movabs $0x1fffffffffffff,%rdx
 111:   ff 1f 00 
 114:   48 21 d0                and    %rdx,%rax
 117:   ba 30 30 00 00          mov    $0x3030,%edx
 11c:   48 8d 04 80             lea    (%rax,%rax,4),%rax
 120:   66 89 57 08             mov    %dx,0x8(%rdi)
 124:   48 c1 e8 34             shr    $0x34,%rax
 128:   83 c0 30                add    $0x30,%eax
 12b:   88 47 13                mov    %al,0x13(%rdi)
 12e:   c3                      retq

PS: I've tested my handwritten assembly thoroughly, so the differences between it and the objdump output are mostly either instruction reordering or errors. PS:我已经对手写汇编做了充分测试,因此它与objdump输出之间的差异大多是指令重排或错误。

PPS: @Peter_Corde's answer has prevented the optimization-away of high , but the starting code is still messed up! PPS:@Peter_Corde的回答阻止了high的优化,但是起始代码仍然很混乱! Here's an excerpt: 摘录如下:

   0:   48 b9 e0 ea f6 5e 67    movabs $0x6df37f675ef6eae0,%rcx
   7:   7f f3 6d 
   a:   49 b8 00 e4 0b 54 02    movabs $0x2540be400,%r8
  11:   00 00 00 
  14:   c6 07 30                movb   $0x30,(%rdi) // NOT GOOD
  17:   48 89 c8                mov    %rcx,%rax
  1a:   48 89 f1                mov    %rsi,%rcx
  1d:   c6 47 0a 30             movb   $0x30,0xa(%rdi) // NOT GOOD
  21:   48 f7 e6                mul    %rsi
  24:   48 d1 e9                shr    %rcx
  27:   49 89 c1                mov    %rax,%r9
  2a:   48 89 c8                mov    %rcx,%rax
  2d:   49 89 d2                mov    %rdx,%r10
  30:   31 d2                   xor    %edx,%edx
  32:   4c 01 c8                add    %r9,%rax
  35:   48 b9 0a fa 82 4b 04    movabs $0x44b82fa0a,%rcx
  3c:   00 00 00 
  3f:   4c 11 d2                adc    %r10,%rdx
  42:   48 c1 ea 20             shr    $0x20,%rdx
  46:   48 89 d0                mov    %rdx,%rax
  49:   49 0f af d0             imul   %r8,%rdx
  4d:   48 0f af c1             imul   %rcx,%rax
  51:   48 29 d6                sub    %rdx,%rsi
  54:   48 0f af f1             imul   %rcx,%rsi
  58:   48 83 c0 07             add    $0x7,%rax
  5c:   48 c1 e8 03             shr    $0x3,%rax

Yes, when the compiler shows you that some of your function outputs unexpectedly don't depend on the input, that's usually a sign that your C source doesn't mean what you thought it did. 是的,当编译器向您显示某些函数输出意外地不依赖于输入时,通常表明您的C源代码并不意味着您认为它做了什么。

In this case it looks like uint128_t split = num * 7922816251426433760; is the problem. 在这种情况下,问题看起来出在uint128_t split = num * 7922816251426433760;这一行。 num is an unsigned long ( uint64_t in the x86-64 SysV ABI which you're compiling for). num是一个unsigned long (在您所编译的x86-64 SysV ABI中即uint64_t )。 Thus, the * operator produces a 64-bit result, which is then zero-extended to serve as the initializer for uint128_t split . 因此, *运算符产生的是一个64位结果,该结果随后被零扩展,用作uint128_t split的初始值。

uint128_t split = (unsigned __int128) num * 7922816251426433760;

casts num to a 128-bit integer before the multiply, so you get a full 128-bit result with mulq . 在乘法之前将num强制转换为128位整数,因此您可以使用mulq获得完整的128位结果。 ( gcc7.3 -O3 on Godbolt ). (在Godbolt上为gcc7.3 -O3 )。

I didn't look into the full details of the rest of your function; 我没有研究其余功能的全部细节。 there may be other problems, but that's the first one I saw. 可能还有其他问题,但这是我看到的第一个问题。


re: update: 回复:更新:

Is split = high * 18446744074; supposed to be split = high * (unsigned __int128)18446744074; ? split = high * 18446744074;是不是应该写成split = high * (unsigned __int128)18446744074; ? Looks like exactly the same bug. 看起来是完全相同的错误。 Check the rest of your C code for any more cases where you assign the result of a calculation to a wider variable. 请检查其余C代码中是否还有把计算结果赋给更宽变量的情况。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM