簡體   English   中英

gcc -O3 刪除了 int 轉字符串函數中仍然有用(live)的代碼

[英]gcc -O3 removing live code in int-to-string converter

我正在編寫 Terje Mathisen 的 itoa 函數的 64 位版本。該函數接受一個 char*(必須指向至少能容納 20 個字符的緩沖區)和一個要轉換的數字。我寫出了以下代碼:

#define LOOP_WORK(number, shift) high *= 5; low *= 5; buf[number] = (high >> shift) + '0'; \
    buf[number+10] = (low >> shift) + '0'; high &= andop; low &= andop; andop >>= 1

#define uint128_t __uint128_t
#define uint64_t  unsigned long

void u2s(char* buf, unsigned long num) {
    // Split number into low/high pair.
    uint128_t split = num * 7922816251426433760;
    split += num >> 1;
    uint64_t high = split >> 96;
    uint64_t low = num - (high * 10000000000);
    // Transform numbers into usable decimal fractions.
    split = high * 18446744074;
    buf[0]  = (split >> 64) + '0';
    high = (uint64_t)split;
    split = low  * 18446744074;
    buf[10] = (split >> 64) + '0';
    low  = (uint64_t)split;
    // Adjust numbers and multiply by 2 (so we don't have to multiply by 10 later)
    high = (high + 7) >> 3;
    low  = (low  + 7) >> 3;
    // Store special and number
    uint64_t andop = 0x0fffffffffffffff;
    LOOP_WORK(1, 60);
    LOOP_WORK(2, 59);
    LOOP_WORK(3, 58);
    LOOP_WORK(4, 57);
    LOOP_WORK(5, 56);
    LOOP_WORK(6, 55);
    LOOP_WORK(7, 54);
    LOOP_WORK(8, 53);
    // Final loop, without extra stuffs
    high *= 5;
    low  *= 5;
    buf[9]  = (high >> 52) + '0';
    buf[19] = (low  >> 52) + '0';
}

#undef LOOP_WORK

這是等效的匯編版本(手寫,AT&T 語法):

// u2s: write a 64-bit unsigned integer as exactly 20 zero-padded decimal
// digits (no NUL terminator).
//   %rdi = destination buffer (at least 20 bytes)
//   %rsi = number to convert
// Clobbers %rax, %rcx, %rdx, %rsi and the flags.
u2s:
    // tmp128(rax:rdx) = num * 7922816251426433760
    movq $7922816251426433760, %rax
    mulq %rsi
    // tmp128(rax:rdx) += num >> 1
    movq %rsi, %rcx
    shrq $0x1, %rcx
    addq %rcx, %rax
    adcq $0x0, %rdx
    // high(rdx) = tmp128(rax:rdx) >> 96
    shrq $32,  %rdx
    // high(rcx); low(rsi) = num - (high * 10^10)
    movq %rdx, %rcx
    movq $10000000000, %rax
    mulq %rdx
    subq %rax, %rsi
    // The reciprocal is rounded up, so high may be one too large; the
    // subtraction above then borrows (CF = 1).  Correct without branching:
    // rax = overshoot ? -1 : 0
    sbbq %rax, %rax
    // high -= overshoot
    addq %rax, %rcx
    // low += overshoot ? 10^10 : 0
    movq $10000000000, %rdx
    andq %rax, %rdx
    addq %rdx, %rsi
    // high2(rax:rdx) = high * 18446744074
    movq $18446744074, %rax
    mulq %rcx
    // buf[0] = (high2 >> 64) + '0'
    addb $'0', %dl
    movb %dl,  (%rdi)
    // high(rcx) = (u64)high2
    movq %rax, %rcx
    // low2(rax:rdx) = low * 18446744074
    movq $18446744074, %rax
    mulq %rsi
    // buf[10] = (low2 >> 64) + '0'
    addb $'0', %dl
    movb %dl,  10(%rdi)
    // low(rax) = (u64)low2 (already in place)
    // high = (high + 7) >> 3
    // (full-width add: a byte-sized add would drop the carry out of %cl)
    addq $0x7, %rcx
    shrq $0x3, %rcx
    // low  = (low  + 7) >> 3
    addq $0x7, %rax
    shrq $0x3, %rax
    // low (rax) *= 5
    leaq (%rax,%rax,4), %rax
    // high(rcx) *= 5
    leaq (%rcx,%rcx,4), %rcx
    // buf[1]  = (high >> 60) + '0'
    movq %rcx, %rdx
    shrq $60,  %rdx
    addb $'0', %dl
    movb %dl,  1(%rdi)
    // buf[11] = (low  >> 60) + '0'
    movq %rax, %rdx
    shrq $60,  %rdx
    addb $'0', %dl
    movb %dl,  11(%rdi)
    // Store number 0x0fffffffffffffff
    movq $0x0fffffffffffffff, %rsi
    // high &= 0x0fffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x0fffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[2]  = (high >> 59) + '0'
    movq %rcx, %rdx
    shrq $59,  %rdx
    addb $'0', %dl
    movb %dl,  2(%rdi)
    // buf[12] = (low  >> 59) + '0'
    movq %rax, %rdx
    shrq $59,  %rdx
    addb $'0', %dl
    movb %dl,  12(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x07ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x07ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low *= 5
    leaq (%rax,%rax,4), %rax
    // buf[3]  = (high >> 58) + '0'
    movq %rcx, %rdx
    shrq $58,  %rdx
    addb $'0', %dl
    movb %dl,  3(%rdi)
    // buf[13] = (low  >> 58) + '0'
    movq %rax, %rdx
    shrq $58,  %rdx
    addb $'0', %dl
    movb %dl,  13(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x03ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x03ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[4]  = (high >> 57) + '0'
    movq %rcx, %rdx
    shrq $57,  %rdx
    addb $'0', %dl
    movb %dl,  4(%rdi)
    // buf[14] = (low  >> 57) + '0'
    movq %rax, %rdx
    shrq $57,  %rdx
    addb $'0', %dl
    movb %dl,  14(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x01ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x01ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[5]  = (high >> 56) + '0'
    movq %rcx, %rdx
    shrq $56,  %rdx
    addb $'0', %dl
    movb %dl,  5(%rdi)
    // buf[15] = (low  >> 56) + '0'
    movq %rax, %rdx
    shrq $56,  %rdx
    addb $'0', %dl
    movb %dl,  15(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x00ffffffffffffff
    andq %rsi, %rcx
    // low  &= 0x00ffffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[6]  = (high >> 55) + '0'
    movq %rcx, %rdx
    shrq $55,  %rdx
    addb $'0', %dl
    movb %dl,  6(%rdi)
    // buf[16] = (low  >> 55) + '0'
    movq %rax, %rdx
    shrq $55,  %rdx
    addb $'0', %dl
    movb %dl,  16(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x007fffffffffffff
    andq %rsi, %rcx
    // low  &= 0x007fffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[7]  = (high >> 54) + '0'
    movq %rcx, %rdx
    shrq $54,  %rdx
    addb $'0', %dl
    movb %dl,  7(%rdi)
    // buf[17] = (low  >> 54) + '0'
    movq %rax, %rdx
    shrq $54,  %rdx
    addb $'0', %dl
    movb %dl,  17(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x003fffffffffffff
    andq %rsi, %rcx
    // low  &= 0x003fffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[8]  = (high >> 53) + '0'
    movq %rcx, %rdx
    shrq $53,  %rdx
    addb $'0', %dl
    movb %dl,  8(%rdi)
    // buf[18] = (low  >> 53) + '0'
    movq %rax, %rdx
    shrq $53,  %rdx
    addb $'0', %dl
    movb %dl,  18(%rdi)
    // update the `and` number
    shrq $0x1, %rsi
    // high &= 0x001fffffffffffff
    andq %rsi, %rcx
    // low  &= 0x001fffffffffffff
    andq %rsi, %rax
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low  *= 5
    leaq (%rax,%rax,4), %rax
    // buf[9]  = (high >> 52) + '0'
    shrq $52,  %rcx
    addb $'0', %cl
    movb %cl,  9(%rdi)
    // buf[19] = (low  >> 52) + '0'
    shrq $52,  %rax
    addb $'0', %al
    movb %al,  19(%rdi)
    retq

當我用 GCC 以 -O3 優化編譯 C 版本時,我發現 high/low 兩個變量中有一個的相關代碼被優化掉了,各處的 andop 都變成了硬編碼的常量,而且代碼一開始就向 buf[0] 和 buf[10] 寫入硬編碼值(分別為 3472328296227680304 和 48)。 當我運行 GCC -fverbose-asm -S 時,我發現 GCC 把 high 徹底優化掉了! 我猜問題出在我的 C 代碼上(我不太擅長 C 語言),但我不知道原因。 Terje Mathisen 的帖子里有他自己用 C 語言編寫的版本,但不包含此處給出的手寫匯編優化。 為什么 GCC 會這樣坑我?

順便說一句,下面是用 gcc (Gentoo 7.3.0-r1 p1.1) 7.3.0(USE 標志: -cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv)以 -O3 -c 編譯,再用 objdump -d 反匯編得到的輸出:

u2s.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <u2s>:
   0:   48 b8 0a fa 82 4b 04    movabs $0x44b82fa0a,%rax
   7:   00 00 00 
   a:   48 b9 30 30 30 30 30    movabs $0x3030303030303030,%rcx
  11:   30 30 30 
  14:   c6 47 0a 30             movb   $0x30,0xa(%rdi) // What?
  18:   48 0f af f0             imul   %rax,%rsi
  1c:   48 89 0f                mov    %rcx,(%rdi) // What?
  1f:   48 83 c6 07             add    $0x7,%rsi
  23:   48 c1 ee 03             shr    $0x3,%rsi
  27:   48 8d 04 b6             lea    (%rsi,%rsi,4),%rax
  2b:   48 89 c2                mov    %rax,%rdx
  2e:   48 c1 ea 3c             shr    $0x3c,%rdx
  32:   83 c2 30                add    $0x30,%edx
  35:   88 57 0b                mov    %dl,0xb(%rdi)
  38:   48 ba ff ff ff ff ff    movabs $0xfffffffffffffff,%rdx
  3f:   ff ff 0f 
  42:   48 21 d0                and    %rdx,%rax // There should be another AND
  45:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  49:   48 89 c2                mov    %rax,%rdx
  4c:   48 c1 ea 3b             shr    $0x3b,%rdx
  50:   83 c2 30                add    $0x30,%edx
  53:   88 57 0c                mov    %dl,0xc(%rdi)
  56:   48 ba ff ff ff ff ff    movabs $0x7ffffffffffffff,%rdx
  5d:   ff ff 07 
  60:   48 21 d0                and    %rdx,%rax
  63:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  67:   48 89 c2                mov    %rax,%rdx
  6a:   48 c1 ea 3a             shr    $0x3a,%rdx
  6e:   83 c2 30                add    $0x30,%edx
  71:   88 57 0d                mov    %dl,0xd(%rdi)
  74:   48 ba ff ff ff ff ff    movabs $0x3ffffffffffffff,%rdx
  7b:   ff ff 03 
  7e:   48 21 d0                and    %rdx,%rax
  81:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  85:   48 89 c2                mov    %rax,%rdx
  88:   48 c1 ea 39             shr    $0x39,%rdx
  8c:   83 c2 30                add    $0x30,%edx
  8f:   88 57 0e                mov    %dl,0xe(%rdi)
  92:   48 ba ff ff ff ff ff    movabs $0x1ffffffffffffff,%rdx
  99:   ff ff 01 
  9c:   48 21 d0                and    %rdx,%rax
  9f:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  a3:   48 89 c2                mov    %rax,%rdx
  a6:   48 c1 ea 38             shr    $0x38,%rdx
  aa:   83 c2 30                add    $0x30,%edx
  ad:   88 57 0f                mov    %dl,0xf(%rdi)
  b0:   48 ba ff ff ff ff ff    movabs $0xffffffffffffff,%rdx
  b7:   ff ff 00 
  ba:   48 21 d0                and    %rdx,%rax
  bd:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  c1:   48 89 c2                mov    %rax,%rdx
  c4:   48 c1 ea 37             shr    $0x37,%rdx
  c8:   83 c2 30                add    $0x30,%edx
  cb:   88 57 10                mov    %dl,0x10(%rdi)
  ce:   48 ba ff ff ff ff ff    movabs $0x7fffffffffffff,%rdx
  d5:   ff 7f 00 
  d8:   48 21 d0                and    %rdx,%rax
  db:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  df:   48 89 c2                mov    %rax,%rdx
  e2:   48 c1 ea 36             shr    $0x36,%rdx
  e6:   83 c2 30                add    $0x30,%edx
  e9:   88 57 11                mov    %dl,0x11(%rdi)
  ec:   48 ba ff ff ff ff ff    movabs $0x3fffffffffffff,%rdx
  f3:   ff 3f 00 
  f6:   48 21 d0                and    %rdx,%rax
  f9:   48 8d 04 80             lea    (%rax,%rax,4),%rax
  fd:   48 89 c2                mov    %rax,%rdx
 100:   48 c1 ea 35             shr    $0x35,%rdx
 104:   83 c2 30                add    $0x30,%edx
 107:   88 57 12                mov    %dl,0x12(%rdi)
 10a:   48 ba ff ff ff ff ff    movabs $0x1fffffffffffff,%rdx
 111:   ff 1f 00 
 114:   48 21 d0                and    %rdx,%rax
 117:   ba 30 30 00 00          mov    $0x3030,%edx
 11c:   48 8d 04 80             lea    (%rax,%rax,4),%rax
 120:   66 89 57 08             mov    %dx,0x8(%rdi)
 124:   48 c1 e8 34             shr    $0x34,%rax
 128:   83 c0 30                add    $0x30,%eax
 12b:   88 47 13                mov    %al,0x13(%rdi)
 12e:   c3                      retq

PS:我已經對手寫匯編進行了很好的測試,因此它與objdump之間的大部分差異主要是代碼重新排序或錯誤。

PPS:@Peter_Corde的回答阻止了high的優化,但是起始代碼仍然很混亂! 摘錄如下:

   0:   48 b9 e0 ea f6 5e 67    movabs $0x6df37f675ef6eae0,%rcx
   7:   7f f3 6d 
   a:   49 b8 00 e4 0b 54 02    movabs $0x2540be400,%r8
  11:   00 00 00 
  14:   c6 07 30                movb   $0x30,(%rdi) // NOT GOOD
  17:   48 89 c8                mov    %rcx,%rax
  1a:   48 89 f1                mov    %rsi,%rcx
  1d:   c6 47 0a 30             movb   $0x30,0xa(%rdi) // NOT GOOD
  21:   48 f7 e6                mul    %rsi
  24:   48 d1 e9                shr    %rcx
  27:   49 89 c1                mov    %rax,%r9
  2a:   48 89 c8                mov    %rcx,%rax
  2d:   49 89 d2                mov    %rdx,%r10
  30:   31 d2                   xor    %edx,%edx
  32:   4c 01 c8                add    %r9,%rax
  35:   48 b9 0a fa 82 4b 04    movabs $0x44b82fa0a,%rcx
  3c:   00 00 00 
  3f:   4c 11 d2                adc    %r10,%rdx
  42:   48 c1 ea 20             shr    $0x20,%rdx
  46:   48 89 d0                mov    %rdx,%rax
  49:   49 0f af d0             imul   %r8,%rdx
  4d:   48 0f af c1             imul   %rcx,%rax
  51:   48 29 d6                sub    %rdx,%rsi
  54:   48 0f af f1             imul   %rcx,%rsi
  58:   48 83 c0 07             add    $0x7,%rax
  5c:   48 c1 e8 03             shr    $0x3,%rax

是的,當編譯器的輸出顯示某個函數的結果出人意料地不依賴於輸入時,通常說明您的 C 源代碼的實際含義與您以為的不一樣。

在這種情況下,問題看起來出在 uint128_t split = num * 7922816251426433760; 這一行。 num 是 unsigned long(在您編譯所針對的 x86-64 SysV ABI 中即 uint64_t)。 因此 * 運算符只產生 64 位結果,然後該結果才被零擴展,用作 uint128_t split 的初始值。

uint128_t split = (unsigned __int128) num * 7922816251426433760;

在乘法之前將num強制轉換為128位整數,因此您可以使用mulq獲得完整的128位結果。 (在Godbolt上為gcc7.3 -O3 )。

我沒有研究該函數其余部分的全部細節。 可能還有其他問題,但這是我看到的第一個問題。


回復:更新:

split = high * 18446744074; 應該寫成 split = high * (unsigned __int128)18446744074; 看起來是完全相同的錯誤。 請檢查其余 C 代碼中是否還有把計算結果賦給更寬變量的情況。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM