[英]gcc -O3 removing live code in int-to-string converter
我正在使用64位大小的Terje Mathisen的itoa函數 ,該函數需要一個char*
,該char*
必須指向至少包含20個字符和一個數字的緩沖區,並且創建了以下代碼:
#define LOOP_WORK(number, shift) high *= 5; low *= 5; buf[number] = (high >> shift) + '0'; \
buf[number+10] = (low >> shift) + '0'; high &= andop; low &= andop; andop >>= 1
#define uint128_t __uint128_t
#define uint64_t unsigned long
void u2s(char* buf, unsigned long num) {
// Split number into low/high pair.
uint128_t split = num * 7922816251426433760;
split += num >> 1;
uint64_t high = split >> 96;
uint64_t low = num - (high * 10000000000);
// Transform numbers into usable decimal fractions.
split = high * 18446744074;
buf[0] = (split >> 64) + '0';
high = (uint64_t)split;
split = low * 18446744074;
buf[10] = (split >> 64) + '0';
low = (uint64_t)split;
// Adjust numbers and multiply by 2 (so we don't have to multiply by 10 later)
high = (high + 7) >> 3;
low = (low + 7) >> 3;
// Store special and number
uint64_t andop = 0x0fffffffffffffff;
LOOP_WORK(1, 60);
LOOP_WORK(2, 59);
LOOP_WORK(3, 58);
LOOP_WORK(4, 57);
LOOP_WORK(5, 56);
LOOP_WORK(6, 55);
LOOP_WORK(7, 54);
LOOP_WORK(8, 53);
// Final loop, without extra stuffs
high *= 5;
low *= 5;
buf[9] = (high >> 52) + '0';
buf[19] = (low >> 52) + '0';
}
#undef LOOP_WORK
這是匯編中的等效版本(在AT&T中手寫):
u2s:
// tmp128(rax:rdx) = num * 7922816251426433760
movq $7922816251426433760, %rax
mulq %rsi
// tmp128(rax:rdx) += num >> 1
movq %rsi, %rcx
shrq $0x1, %rcx
addq %rcx, %rax
adcq $0x0, %rdx
// high(rdx) = tmp128(rax:rdx) >> 96
shrq $32, %rdx
// high(rcx); low(rsi) = num - (high * 10^10)
movq %rdx, %rcx
movq $10000000000, %rax
mulq %rdx
subq %rax, %rsi
// high2(rax:rdx) = high * 18446744074
movq $18446744074, %rax
mulq %rcx
// buf[0] = (high2 >> 64) + '0'
addb $'0', %dl
movb %dl, (%rdi)
// low2(rax:rdx) = low * 18446744074
movq %rax, %rcx
movq $18446744074, %rax
mulq %rsi
// buf[10] = (low2 >> 64) + '0'
addb $'0', %dl
movb %dl, 10(%rdi)
// high(rcx) = (u64)high2
// low(rax) = (u64)low2
// high = (high + 7) >> 3
addb $0x7, %cl
shrq $0x3, %rcx
// low = (low + 7) >> 3
addb $0x7, %al
shrq $0x3, %rax
// low (rax) *= 5
leaq (%rax,%rax,4), %rax
// high(rcx) *= 5
leaq (%rcx,%rcx,4), %rcx
// buf[1] = (high >> 60) + '0'
movq %rcx, %rdx
shrq $60, %rdx
addb $'0', %dl
movb %dl, 1(%rdi)
// buf[11] = (low >> 60) + '0'
movq %rax, %rdx
shrq $60, %rdx
addb $'0', %dl
movb %dl, 11(%rdi)
// Store number 0x0fffffffffffffff
movq $0x0fffffffffffffff, %rsi
// high &= 0x0fffffffffffffff
andq %rsi, %rcx
// low &= 0x0fffffffffffffff
andq %rsi, %rax
// high *= 5
leaq (%rcx,%rcx,4), %rcx
// low *= 5
leaq (%rax,%rax,4), %rax
// buf[2] = (high >> 59) + '0'
movq %rcx, %rdx
shrq $59, %rdx
addb $'0', %dl
movb %dl, 2(%rdi)
// buf[12] = (low >> 59) + '0'
movq %rax, %rdx
shrq $59, %rdx
addb $'0', %dl
movb %dl, 12(%rdi)
// update the `and` number
shrq $0x1, %rsi
// high &= 0x07ffffffffffffff
andq %rsi, %rcx
// low &= 0x07ffffffffffffff
andq %rsi, %rax
// high *= 5
leaq (%rcx,%rcx,4), %rcx
// low *= 5
leaq (%rax,%rax,4), %rax
// buf[3] = (high >> 58) + '0'
movq %rcx, %rdx
shrq $58, %rdx
addb $'0', %dl
movb %dl, 3(%rdi)
// buf[13] = (low >> 58) + '0'
movq %rax, %rdx
shrq $58, %rdx
addb $'0', %dl
movb %dl, 13(%rdi)
// update the `and` number
shrq $0x1, %rsi
// high &= 0x03ffffffffffffff
andq %rsi, %rcx
// low &= 0x03ffffffffffffff
andq %rsi, %rax
// high *= 5
leaq (%rcx,%rcx,4), %rcx
// low *= 5
leaq (%rax,%rax,4), %rax
// buf[4] = (high >> 57) + '0'
movq %rcx, %rdx
shrq $57, %rdx
addb $'0', %dl
movb %dl, 4(%rdi)
// buf[14] = (low >> 57) + '0'
movq %rax, %rdx
shrq $57, %rdx
addb $'0', %dl
movb %dl, 14(%rdi)
// update the `and` number
shrq $0x1, %rsi
// high &= 0x01ffffffffffffff
andq %rsi, %rcx
// low &= 0x01ffffffffffffff
andq %rsi, %rax
// high *= 5
leaq (%rcx,%rcx,4), %rcx
// low *= 5
leaq (%rax,%rax,4), %rax
// buf[5] = (high >> 56) + '0'
movq %rcx, %rdx
shrq $56, %rdx
addb $'0', %dl
movb %dl, 5(%rdi)
// buf[15] = (low >> 56) + '0'
movq %rax, %rdx
shrq $56, %rdx
addb $'0', %dl
movb %dl, 15(%rdi)
// update the `and` number
shrq $0x1, %rsi
// high &= 0x00ffffffffffffff
andq %rsi, %rcx
// low &= 0x00ffffffffffffff
andq %rsi, %rax
// high *= 5
leaq (%rcx,%rcx,4), %rcx
// low *= 5
leaq (%rax,%rax,4), %rax
// buf[6] = (high >> 55) + '0'
movq %rcx, %rdx
shrq $55, %rdx
addb $'0', %dl
movb %dl, 6(%rdi)
// buf[16] = (low >> 55) + '0'
movq %rax, %rdx
shrq $55, %rdx
addb $'0', %dl
movb %dl, 16(%rdi)
// update the `and` number
shrq $0x1, %rsi
// high &= 0x007fffffffffffff
andq %rsi, %rcx
// low &= 0x007fffffffffffff
andq %rsi, %rax
// high *= 5
leaq (%rcx,%rcx,4), %rcx
// low *= 5
leaq (%rax,%rax,4), %rax
// buf[7] = (high >> 54) + '0'
movq %rcx, %rdx
shrq $54, %rdx
addb $'0', %dl
movb %dl, 7(%rdi)
// buf[17] = (low >> 54) + '0'
movq %rax, %rdx
shrq $54, %rdx
addb $'0', %dl
movb %dl, 17(%rdi)
// update the `and` number
shrq $0x1, %rsi
// high &= 0x003fffffffffffff
andq %rsi, %rcx
// low &= 0x003fffffffffffff
andq %rsi, %rax
// high *= 5
leaq (%rcx,%rcx,4), %rcx
// low *= 5
leaq (%rax,%rax,4), %rax
// buf[8] = (high >> 53) + '0'
movq %rcx, %rdx
shrq $53, %rdx
addb $'0', %dl
movb %dl, 8(%rdi)
// buf[18] = (low >> 53) + '0e'
movq %rax, %rdx
shrq $53, %rdx
addb $'0', %dl
movb %dl, 18(%rdi)
// update the `and` number
shrq $0x1, %rsi
// high &= 0x001fffffffffffff
andq %rsi, %rcx
// low &= 0x001fffffffffffff
andq %rsi, %rax
// high *= 5
leaq (%rcx,%rcx,4), %rcx
// low *= 5
leaq (%rax,%rax,4), %rax
// buf[9] = (high >> 52) + '0'
shrq $52, %rcx
addb $'0', %cl
movb %cl, 9(%rdi)
// buf[19] = (high >> 52) + '0'
shrq $52, %rax
addb $'0', %al
movb %al, 19(%rdi)
retq
當我使用-O3
優化在GCC中編譯C版本時,我發現其中一個高/低變量的代碼已被優化,它在各處都有對andop
硬編碼值,並且該代碼開始加載到buf[0]
和buf[10]
輸入硬編碼值(分別為48
和3472328296227680304
)。 當我運行GCC -fverbose-asm -S
我發現,GCC優化掉high
徹底! 我猜我的C代碼就是問題所在(我不太擅長C語言),但我不知道為什么。 Terje Mathisen的帖子使用C語言編寫了自己的版本,但不包括此處給出的手寫匯編優化。 為什么GCC這么把我搞砸了?
順便說一句,這是gcc (Gentoo 7.3.0-r1 p1.1) 7.3.0
(USE標志: -cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv
帶有標志-O3 -c
(帶有標志-d
的objdump)的-cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv
):
u2s.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <u2s>:
0: 48 b8 0a fa 82 4b 04 movabs $0x44b82fa0a,%rax
7: 00 00 00
a: 48 b9 30 30 30 30 30 movabs $0x3030303030303030,%rcx
11: 30 30 30
14: c6 47 0a 30 movb $0x30,0xa(%rdi) // What?
18: 48 0f af f0 imul %rax,%rsi
1c: 48 89 0f mov %rcx,(%rdi) // What?
1f: 48 83 c6 07 add $0x7,%rsi
23: 48 c1 ee 03 shr $0x3,%rsi
27: 48 8d 04 b6 lea (%rsi,%rsi,4),%rax
2b: 48 89 c2 mov %rax,%rdx
2e: 48 c1 ea 3c shr $0x3c,%rdx
32: 83 c2 30 add $0x30,%edx
35: 88 57 0b mov %dl,0xb(%rdi)
38: 48 ba ff ff ff ff ff movabs $0xfffffffffffffff,%rdx
3f: ff ff 0f
42: 48 21 d0 and %rdx,%rax // There should be another AND
45: 48 8d 04 80 lea (%rax,%rax,4),%rax
49: 48 89 c2 mov %rax,%rdx
4c: 48 c1 ea 3b shr $0x3b,%rdx
50: 83 c2 30 add $0x30,%edx
53: 88 57 0c mov %dl,0xc(%rdi)
56: 48 ba ff ff ff ff ff movabs $0x7ffffffffffffff,%rdx
5d: ff ff 07
60: 48 21 d0 and %rdx,%rax
63: 48 8d 04 80 lea (%rax,%rax,4),%rax
67: 48 89 c2 mov %rax,%rdx
6a: 48 c1 ea 3a shr $0x3a,%rdx
6e: 83 c2 30 add $0x30,%edx
71: 88 57 0d mov %dl,0xd(%rdi)
74: 48 ba ff ff ff ff ff movabs $0x3ffffffffffffff,%rdx
7b: ff ff 03
7e: 48 21 d0 and %rdx,%rax
81: 48 8d 04 80 lea (%rax,%rax,4),%rax
85: 48 89 c2 mov %rax,%rdx
88: 48 c1 ea 39 shr $0x39,%rdx
8c: 83 c2 30 add $0x30,%edx
8f: 88 57 0e mov %dl,0xe(%rdi)
92: 48 ba ff ff ff ff ff movabs $0x1ffffffffffffff,%rdx
99: ff ff 01
9c: 48 21 d0 and %rdx,%rax
9f: 48 8d 04 80 lea (%rax,%rax,4),%rax
a3: 48 89 c2 mov %rax,%rdx
a6: 48 c1 ea 38 shr $0x38,%rdx
aa: 83 c2 30 add $0x30,%edx
ad: 88 57 0f mov %dl,0xf(%rdi)
b0: 48 ba ff ff ff ff ff movabs $0xffffffffffffff,%rdx
b7: ff ff 00
ba: 48 21 d0 and %rdx,%rax
bd: 48 8d 04 80 lea (%rax,%rax,4),%rax
c1: 48 89 c2 mov %rax,%rdx
c4: 48 c1 ea 37 shr $0x37,%rdx
c8: 83 c2 30 add $0x30,%edx
cb: 88 57 10 mov %dl,0x10(%rdi)
ce: 48 ba ff ff ff ff ff movabs $0x7fffffffffffff,%rdx
d5: ff 7f 00
d8: 48 21 d0 and %rdx,%rax
db: 48 8d 04 80 lea (%rax,%rax,4),%rax
df: 48 89 c2 mov %rax,%rdx
e2: 48 c1 ea 36 shr $0x36,%rdx
e6: 83 c2 30 add $0x30,%edx
e9: 88 57 11 mov %dl,0x11(%rdi)
ec: 48 ba ff ff ff ff ff movabs $0x3fffffffffffff,%rdx
f3: ff 3f 00
f6: 48 21 d0 and %rdx,%rax
f9: 48 8d 04 80 lea (%rax,%rax,4),%rax
fd: 48 89 c2 mov %rax,%rdx
100: 48 c1 ea 35 shr $0x35,%rdx
104: 83 c2 30 add $0x30,%edx
107: 88 57 12 mov %dl,0x12(%rdi)
10a: 48 ba ff ff ff ff ff movabs $0x1fffffffffffff,%rdx
111: ff 1f 00
114: 48 21 d0 and %rdx,%rax
117: ba 30 30 00 00 mov $0x3030,%edx
11c: 48 8d 04 80 lea (%rax,%rax,4),%rax
120: 66 89 57 08 mov %dx,0x8(%rdi)
124: 48 c1 e8 34 shr $0x34,%rax
128: 83 c0 30 add $0x30,%eax
12b: 88 47 13 mov %al,0x13(%rdi)
12e: c3 retq
PS:我已經對手寫匯編進行了很好的測試,因此它與objdump
之間的大部分差異主要是代碼重新排序或錯誤。
PPS:@Peter_Corde的回答阻止了high
的優化,但是起始代碼仍然很混亂! 摘錄如下:
0: 48 b9 e0 ea f6 5e 67 movabs $0x6df37f675ef6eae0,%rcx
7: 7f f3 6d
a: 49 b8 00 e4 0b 54 02 movabs $0x2540be400,%r8
11: 00 00 00
14: c6 07 30 movb $0x30,(%rdi) // NOT GOOD
17: 48 89 c8 mov %rcx,%rax
1a: 48 89 f1 mov %rsi,%rcx
1d: c6 47 0a 30 movb $0x30,0xa(%rdi) // NOT GOOD
21: 48 f7 e6 mul %rsi
24: 48 d1 e9 shr %rcx
27: 49 89 c1 mov %rax,%r9
2a: 48 89 c8 mov %rcx,%rax
2d: 49 89 d2 mov %rdx,%r10
30: 31 d2 xor %edx,%edx
32: 4c 01 c8 add %r9,%rax
35: 48 b9 0a fa 82 4b 04 movabs $0x44b82fa0a,%rcx
3c: 00 00 00
3f: 4c 11 d2 adc %r10,%rdx
42: 48 c1 ea 20 shr $0x20,%rdx
46: 48 89 d0 mov %rdx,%rax
49: 49 0f af d0 imul %r8,%rdx
4d: 48 0f af c1 imul %rcx,%rax
51: 48 29 d6 sub %rdx,%rsi
54: 48 0f af f1 imul %rcx,%rsi
58: 48 83 c0 07 add $0x7,%rax
5c: 48 c1 e8 03 shr $0x3,%rax
是的,當編譯器向您顯示某些函數輸出意外地不依賴於輸入時,通常表明您的C源代碼並不意味着您認為它做了什么。
在這種情況下,看起來像uint128_t split = num * 7922816251426433760;
是問題。 num
是一個unsigned long
uint64_t
(您要為其編譯的x86-64 SysV ABI中的uint64_t
)。 因此, *
運算符會產生一個64位結果,該結果將零擴展作為uint128_t split
的初始化uint128_t split
。
uint128_t split = (unsigned __int128) num * 7922816251426433760;
在乘法之前將num
強制轉換為128位整數,因此您可以使用mulq
獲得完整的128位結果。 (在Godbolt上為gcc7.3 -O3 )。
我沒有研究其余功能的全部細節。 可能還有其他問題,但這是我看到的第一個問題。
回復:更新:
被split = high * 18446744074;
應該split = high * (unsigned __int128)18446744074;
? 看起來完全一樣的錯誤。 在將計算結果分配給更大的變量的更多情況下,請檢查其余C代碼。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.