[英]gcc -O3 removing live code in int-to-string converter
I was 64-bit-izing Terje Mathisen's itoa function , which takes a char*
which must point to a buffer of at least 20 characters and a number, and I created this: 我正在使用64位大小的Terje Mathisen的itoa函数 ,该函数需要一个
char*
,该char*
必须指向至少包含20个字符和一个数字的缓冲区,并且创建了以下代码:
#define LOOP_WORK(number, shift) high *= 5; low *= 5; buf[number] = (high >> shift) + '0'; \
buf[number+10] = (low >> shift) + '0'; high &= andop; low &= andop; andop >>= 1
#define uint128_t __uint128_t
#define uint64_t unsigned long
void u2s(char* buf, unsigned long num) {
// Split number into low/high pair.
uint128_t split = num * 7922816251426433760;
split += num >> 1;
uint64_t high = split >> 96;
uint64_t low = num - (high * 10000000000);
// Transform numbers into usable decimal fractions.
split = high * 18446744074;
buf[0] = (split >> 64) + '0';
high = (uint64_t)split;
split = low * 18446744074;
buf[10] = (split >> 64) + '0';
low = (uint64_t)split;
// Adjust numbers and multiply by 2 (so we don't have to multiply by 10 later)
high = (high + 7) >> 3;
low = (low + 7) >> 3;
// Store special and number
uint64_t andop = 0x0fffffffffffffff;
LOOP_WORK(1, 60);
LOOP_WORK(2, 59);
LOOP_WORK(3, 58);
LOOP_WORK(4, 57);
LOOP_WORK(5, 56);
LOOP_WORK(6, 55);
LOOP_WORK(7, 54);
LOOP_WORK(8, 53);
// Final loop, without extra stuffs
high *= 5;
low *= 5;
buf[9] = (high >> 52) + '0';
buf[19] = (low >> 52) + '0';
}
#undef LOOP_WORK
Here's an equivalent version in assembly (handwritten in AT&T): 这是汇编中的等效版本(在AT&T中手写):
/*
 * void u2s(char *buf, unsigned long num)
 *
 * Syntax: AT&T (GNU as), x86-64.
 * ABI:    System V AMD64, leaf function, no stack usage.
 * In:     rdi = buf (at least 20 bytes), rsi = num
 * Out:    buf[0..19] = num as 20 zero-padded decimal digits (no NUL written)
 * Clobb:  rax, rcx, rdx, rsi, flags
 *
 * The number is split into two 10-digit halves (high, low). Each half is
 * turned into a leading digit plus a binary fraction, and the remaining nine
 * digits are peeled off the fraction one at a time.
 */
        .set    U2S_RECIP_1E10, 7922816251426433759     /* floor(2^96 / 10^10) */
        .set    U2S_TEN_POW_10, 10000000000
        .set    U2S_RECIP_1E9,  18446744074             /* ceil(2^64 / 10^9) */
        .set    U2S_DIGIT_MASK, 0x0fffffffffffffff      /* low 60 bits */

/*
 * u2s_digit_step idx, shift
 * Emits digit `idx` of both halves: high(rcx) and low(rax) are multiplied by
 * 5, the digit now sitting above bit `shift` is stored to buf[idx] and
 * buf[idx+10], then the digit is masked off with rsi and the mask is shortened
 * by one bit for the next step. Uses rdx as scratch.
 */
        .macro  u2s_digit_step idx, shift
        leaq    (%rcx,%rcx,4), %rcx
        leaq    (%rax,%rax,4), %rax
        movq    %rcx, %rdx
        shrq    $\shift, %rdx
        addb    $'0', %dl
        movb    %dl, \idx(%rdi)
        movq    %rax, %rdx
        shrq    $\shift, %rdx
        addb    $'0', %dl
        movb    %dl, \idx+10(%rdi)
        andq    %rsi, %rcx
        andq    %rsi, %rax
        shrq    $1, %rsi
        .endm

        .text
        .globl  u2s
u2s:
        /* split(rdx:rax) = num * floor(2^96 / 10^10) */
        movabsq $U2S_RECIP_1E10, %rax
        mulq    %rsi
        /* split(rdx:rax) += num >> 1 (compensates the truncated reciprocal) */
        movq    %rsi, %rcx
        shrq    $1, %rcx
        addq    %rcx, %rax
        adcq    $0, %rdx
        /* high(rdx) = split >> 96 */
        shrq    $32, %rdx
        /* keep high in rcx; low(rsi) = num - high * 10^10 */
        movq    %rdx, %rcx
        movabsq $U2S_TEN_POW_10, %rax
        mulq    %rdx
        subq    %rax, %rsi

        /* high2(rdx:rax) = high * ceil(2^64 / 10^9) */
        movabsq $U2S_RECIP_1E9, %rax
        mulq    %rcx
        /* buf[0] = (high2 >> 64) + '0' */
        addb    $'0', %dl
        movb    %dl, (%rdi)
        /* high(rcx) = (u64)high2, the fraction holding the other nine digits */
        movq    %rax, %rcx
        /* low2(rdx:rax) = low * ceil(2^64 / 10^9); low(rax) = (u64)low2 */
        movabsq $U2S_RECIP_1E9, %rax
        mulq    %rsi
        /* buf[10] = (low2 >> 64) + '0' */
        addb    $'0', %dl
        movb    %dl, 10(%rdi)

        /*
         * Rescale both fractions from 2^64 to 2^61, rounding up. The add must
         * be a full 64-bit add: a byte-sized add would drop the carry out of
         * the low byte and under-estimate the fraction (e.g. num = 25).
         */
        addq    $7, %rcx
        shrq    $3, %rcx
        addq    $7, %rax
        shrq    $3, %rax

        /* rsi is free now (low lives in rax): use it for the digit mask */
        movabsq $U2S_DIGIT_MASK, %rsi
        u2s_digit_step 1, 60
        u2s_digit_step 2, 59
        u2s_digit_step 3, 58
        u2s_digit_step 4, 57
        u2s_digit_step 5, 56
        u2s_digit_step 6, 55
        u2s_digit_step 7, 54
        u2s_digit_step 8, 53

        /* final digit: nothing follows, so no masking and no scratch copy */
        leaq    (%rcx,%rcx,4), %rcx
        leaq    (%rax,%rax,4), %rax
        /* buf[9] = (high >> 52) + '0' */
        shrq    $52, %rcx
        addb    $'0', %cl
        movb    %cl, 9(%rdi)
        /* buf[19] = (low >> 52) + '0' */
        shrq    $52, %rax
        addb    $'0', %al
        movb    %al, 19(%rdi)
        retq

        .purgem u2s_digit_step
When I compile the C version in GCC with optimizations -O3
, I find that the code for one of the high/low variables is optimized out, the values of andop
are hard-coded everywhere, and the code at the start that fills in buf[0]
and buf[10]
stores hard-coded values ( 48
and 3472328296227680304
, respectively). 当我使用
-O3
优化在GCC中编译C版本时,我发现其中一个高/低变量的代码已被优化,它在各处都有对andop
硬编码值,并且该代码开始加载到buf[0]
和buf[10]
输入硬编码值(分别为48
和3472328296227680304
)。 When I run GCC with -fverbose-asm -S
, I find that GCC optimized away high
completely! 当我运行GCC
-fverbose-asm -S
我发现,GCC优化掉high
彻底! I'm guessing that my C code's the problem (I'm not too great at C), but I don't know why. 我猜我的C代码就是问题所在(我不太擅长C语言),但我不知道为什么。 Terje Mathisen's post has its own version in C, but it does not include the handwritten assembly optimizations also given there.
Terje Mathisen的帖子使用C语言编写了自己的版本,但不包括此处给出的手写汇编优化。 Why is GCC messing me up so much?
为什么GCC这么把我搞砸了?
BTW, here's the code from gcc (Gentoo 7.3.0-r1 p1.1) 7.3.0
(USE flags: -cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv
) with flags -O3 -c
(an objdump with flags -d
): 顺便说一句,这是
gcc (Gentoo 7.3.0-r1 p1.1) 7.3.0
(USE标志: -cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv
带有标志-O3 -c
(带有标志-d
的objdump)的-cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv
):
u2s.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <u2s>:
0: 48 b8 0a fa 82 4b 04 movabs $0x44b82fa0a,%rax
7: 00 00 00
a: 48 b9 30 30 30 30 30 movabs $0x3030303030303030,%rcx
11: 30 30 30
14: c6 47 0a 30 movb $0x30,0xa(%rdi) // What?
18: 48 0f af f0 imul %rax,%rsi
1c: 48 89 0f mov %rcx,(%rdi) // What?
1f: 48 83 c6 07 add $0x7,%rsi
23: 48 c1 ee 03 shr $0x3,%rsi
27: 48 8d 04 b6 lea (%rsi,%rsi,4),%rax
2b: 48 89 c2 mov %rax,%rdx
2e: 48 c1 ea 3c shr $0x3c,%rdx
32: 83 c2 30 add $0x30,%edx
35: 88 57 0b mov %dl,0xb(%rdi)
38: 48 ba ff ff ff ff ff movabs $0xfffffffffffffff,%rdx
3f: ff ff 0f
42: 48 21 d0 and %rdx,%rax // There should be another AND
45: 48 8d 04 80 lea (%rax,%rax,4),%rax
49: 48 89 c2 mov %rax,%rdx
4c: 48 c1 ea 3b shr $0x3b,%rdx
50: 83 c2 30 add $0x30,%edx
53: 88 57 0c mov %dl,0xc(%rdi)
56: 48 ba ff ff ff ff ff movabs $0x7ffffffffffffff,%rdx
5d: ff ff 07
60: 48 21 d0 and %rdx,%rax
63: 48 8d 04 80 lea (%rax,%rax,4),%rax
67: 48 89 c2 mov %rax,%rdx
6a: 48 c1 ea 3a shr $0x3a,%rdx
6e: 83 c2 30 add $0x30,%edx
71: 88 57 0d mov %dl,0xd(%rdi)
74: 48 ba ff ff ff ff ff movabs $0x3ffffffffffffff,%rdx
7b: ff ff 03
7e: 48 21 d0 and %rdx,%rax
81: 48 8d 04 80 lea (%rax,%rax,4),%rax
85: 48 89 c2 mov %rax,%rdx
88: 48 c1 ea 39 shr $0x39,%rdx
8c: 83 c2 30 add $0x30,%edx
8f: 88 57 0e mov %dl,0xe(%rdi)
92: 48 ba ff ff ff ff ff movabs $0x1ffffffffffffff,%rdx
99: ff ff 01
9c: 48 21 d0 and %rdx,%rax
9f: 48 8d 04 80 lea (%rax,%rax,4),%rax
a3: 48 89 c2 mov %rax,%rdx
a6: 48 c1 ea 38 shr $0x38,%rdx
aa: 83 c2 30 add $0x30,%edx
ad: 88 57 0f mov %dl,0xf(%rdi)
b0: 48 ba ff ff ff ff ff movabs $0xffffffffffffff,%rdx
b7: ff ff 00
ba: 48 21 d0 and %rdx,%rax
bd: 48 8d 04 80 lea (%rax,%rax,4),%rax
c1: 48 89 c2 mov %rax,%rdx
c4: 48 c1 ea 37 shr $0x37,%rdx
c8: 83 c2 30 add $0x30,%edx
cb: 88 57 10 mov %dl,0x10(%rdi)
ce: 48 ba ff ff ff ff ff movabs $0x7fffffffffffff,%rdx
d5: ff 7f 00
d8: 48 21 d0 and %rdx,%rax
db: 48 8d 04 80 lea (%rax,%rax,4),%rax
df: 48 89 c2 mov %rax,%rdx
e2: 48 c1 ea 36 shr $0x36,%rdx
e6: 83 c2 30 add $0x30,%edx
e9: 88 57 11 mov %dl,0x11(%rdi)
ec: 48 ba ff ff ff ff ff movabs $0x3fffffffffffff,%rdx
f3: ff 3f 00
f6: 48 21 d0 and %rdx,%rax
f9: 48 8d 04 80 lea (%rax,%rax,4),%rax
fd: 48 89 c2 mov %rax,%rdx
100: 48 c1 ea 35 shr $0x35,%rdx
104: 83 c2 30 add $0x30,%edx
107: 88 57 12 mov %dl,0x12(%rdi)
10a: 48 ba ff ff ff ff ff movabs $0x1fffffffffffff,%rdx
111: ff 1f 00
114: 48 21 d0 and %rdx,%rax
117: ba 30 30 00 00 mov $0x3030,%edx
11c: 48 8d 04 80 lea (%rax,%rax,4),%rax
120: 66 89 57 08 mov %dx,0x8(%rdi)
124: 48 c1 e8 34 shr $0x34,%rax
128: 83 c0 30 add $0x30,%eax
12b: 88 47 13 mov %al,0x13(%rdi)
12e: c3 retq
PS: I've tested my handwritten assembly well, so most differences between it and the objdump
output are either code reordering or errors. PS:我已经对手写汇编进行了很好的测试,因此它与
objdump
之间的大部分差异主要是代码重新排序或错误。
PPS: @Peter_Corde's answer has prevented the optimization-away of high
, but the starting code is still messed up! PPS:@Peter_Corde的回答阻止了
high
的优化,但是起始代码仍然很混乱! Here's an excerpt: 摘录如下:
0: 48 b9 e0 ea f6 5e 67 movabs $0x6df37f675ef6eae0,%rcx
7: 7f f3 6d
a: 49 b8 00 e4 0b 54 02 movabs $0x2540be400,%r8
11: 00 00 00
14: c6 07 30 movb $0x30,(%rdi) // NOT GOOD
17: 48 89 c8 mov %rcx,%rax
1a: 48 89 f1 mov %rsi,%rcx
1d: c6 47 0a 30 movb $0x30,0xa(%rdi) // NOT GOOD
21: 48 f7 e6 mul %rsi
24: 48 d1 e9 shr %rcx
27: 49 89 c1 mov %rax,%r9
2a: 48 89 c8 mov %rcx,%rax
2d: 49 89 d2 mov %rdx,%r10
30: 31 d2 xor %edx,%edx
32: 4c 01 c8 add %r9,%rax
35: 48 b9 0a fa 82 4b 04 movabs $0x44b82fa0a,%rcx
3c: 00 00 00
3f: 4c 11 d2 adc %r10,%rdx
42: 48 c1 ea 20 shr $0x20,%rdx
46: 48 89 d0 mov %rdx,%rax
49: 49 0f af d0 imul %r8,%rdx
4d: 48 0f af c1 imul %rcx,%rax
51: 48 29 d6 sub %rdx,%rsi
54: 48 0f af f1 imul %rcx,%rsi
58: 48 83 c0 07 add $0x7,%rax
5c: 48 c1 e8 03 shr $0x3,%rax
Yes, when the compiler shows you that some of your function outputs unexpectedly don't depend on the input, that's usually a sign that your C source doesn't mean what you thought it did. 是的,当编译器向您显示某些函数输出意外地不依赖于输入时,通常表明您的C源代码并不意味着您认为它做了什么。
In this case it looks like uint128_t split = num * 7922816251426433760;
在这种情况下,看起来像
uint128_t split = num * 7922816251426433760;
is the problem. 是问题。
num
is an unsigned long
( uint64_t
in the x86-64 SysV ABI which you're compiling for). num
是一个unsigned long
uint64_t
(您要为其编译的x86-64 SysV ABI中的uint64_t
)。 Thus, the *
operator produces a 64-bit result which is zero-extended as an initializer for uint128_t split
. 因此,
*
运算符会产生一个64位结果,该结果将零扩展作为uint128_t split
的初始化uint128_t split
。
uint128_t split = (unsigned __int128) num * 7922816251426433760;
casts num
to a 128-bit integer before the multiply, so you get a full 128-bit result with mulq
. 在乘法之前将
num
强制转换为128位整数,因此您可以使用mulq
获得完整的128位结果。 ( gcc7.3 -O3 on Godbolt ). (在Godbolt上为gcc7.3 -O3 )。
I didn't look into the full details of the rest of your function; 我没有研究其余功能的全部细节。 there may be other problems, but that's the first one I saw.
可能还有其他问题,但这是我看到的第一个问题。
re: update: 回复:更新:
Is split = high * 18446744074;
被
split = high * 18446744074;
supposed to be split = high * (unsigned __int128)18446744074;
应该
split = high * (unsigned __int128)18446744074;
? ? Looks like exactly the same bug.
看起来完全一样的错误。 Check the rest of your C code for any more cases where you assign the result of a calculation to a wider variable.
在将计算结果分配给更大的变量的更多情况下,请检查其余C代码。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.