I was 64-bit-izing Terje Mathisen's itoa function, which takes a number and a char*
that must point to a buffer of at least 20 characters, and I created this:
#define LOOP_WORK(number, shift) high *= 5; low *= 5; buf[number] = (high >> shift) + '0'; \
buf[number+10] = (low >> shift) + '0'; high &= andop; low &= andop; andop >>= 1
#define uint128_t __uint128_t
#define uint64_t unsigned long
void u2s(char* buf, unsigned long num) {
// Split number into low/high pair.
uint128_t split = num * 7922816251426433760;
split += num >> 1;
uint64_t high = split >> 96;
uint64_t low = num - (high * 10000000000);
// Transform numbers into usable decimal fractions.
split = high * 18446744074;
buf[0] = (split >> 64) + '0';
high = (uint64_t)split;
split = low * 18446744074;
buf[10] = (split >> 64) + '0';
low = (uint64_t)split;
// Adjust numbers and multiply by 2 (so we don't have to multiply by 10 later)
high = (high + 7) >> 3;
low = (low + 7) >> 3;
// Store special and number
uint64_t andop = 0x0fffffffffffffff;
LOOP_WORK(1, 60);
LOOP_WORK(2, 59);
LOOP_WORK(3, 58);
LOOP_WORK(4, 57);
LOOP_WORK(5, 56);
LOOP_WORK(6, 55);
LOOP_WORK(7, 54);
LOOP_WORK(8, 53);
// Final loop, without extra stuffs
high *= 5;
low *= 5;
buf[9] = (high >> 52) + '0';
buf[19] = (low >> 52) + '0';
}
#undef LOOP_WORK
Here's an equivalent version in assembly (handwritten in AT&T):
// One digit step for both halves at once.
//   %rcx = running fraction of the high half, %rax = running fraction of the
//   low half, %rsi = mask covering the bits below the current binary point.
// Multiplies both fractions by 5 (x10 with the binary point moved down one
// bit), stores the bits from `shift` upward as ASCII digits at buf[idx] and
// buf[idx+10], clears those bits, and shrinks the mask by one bit.
.macro U2S_DIGIT idx, shift
    leaq (%rcx,%rcx,4), %rcx
    leaq (%rax,%rax,4), %rax
    movq %rcx, %rdx
    shrq $\shift, %rdx
    addb $'0', %dl
    movb %dl, \idx(%rdi)
    movq %rax, %rdx
    shrq $\shift, %rdx
    addb $'0', %dl
    movb %dl, \idx+10(%rdi)
    andq %rsi, %rcx
    andq %rsi, %rax
    shrq $0x1, %rsi
.endm

// u2s: write num as exactly 20 zero-padded decimal digits.
//   in:  %rdi = buf (at least 20 bytes; no NUL is stored), %rsi = num
//   clobbers: %rax, %rcx, %rdx, %rsi
u2s:
    // high(rdx) = num / 10^10, computed as (num * ceil(2^97 / 10^10)) >> 97.
    // ceil(2^97 / 10^10) * 10^10 - 2^97 = 2912099328 <= 2^33, so the quotient
    // is exact for every 64-bit num.
    movabsq $0xDBE6FECEBDEDD5BF, %rax
    mulq %rsi
    shrq $33, %rdx
    // high(rcx); low(rsi) = num - (high * 10^10)
    movq %rdx, %rcx
    movabsq $10000000000, %rax
    mulq %rdx
    subq %rax, %rsi
    // high2(rax:rdx) = high * 18446744074   (18446744074 = ceil(2^64 / 10^9))
    movabsq $18446744074, %rax
    mulq %rcx
    // buf[0] = (high2 >> 64) + '0'
    addb $'0', %dl
    movb %dl, (%rdi)
    // high(rcx) = (u64)high2
    movq %rax, %rcx
    // low2(rax:rdx) = low * 18446744074
    movabsq $18446744074, %rax
    mulq %rsi
    // buf[10] = (low2 >> 64) + '0'
    addb $'0', %dl
    movb %dl, 10(%rdi)
    // low(rax) = (u64)low2
    // high = (high + 7) >> 3
    // The add must be 64 bits wide: a byte-sized add loses the carry out of
    // the low byte and rounds the fraction down instead of up.
    addq $0x7, %rcx
    shrq $0x3, %rcx
    // low = (low + 7) >> 3
    addq $0x7, %rax
    shrq $0x3, %rax
    // mask = 0x0fffffffffffffff (bits below the binary point after digit 1)
    movabsq $0x0fffffffffffffff, %rsi
    // buf[1..8] and buf[11..18]
    U2S_DIGIT 1, 60
    U2S_DIGIT 2, 59
    U2S_DIGIT 3, 58
    U2S_DIGIT 4, 57
    U2S_DIGIT 5, 56
    U2S_DIGIT 6, 55
    U2S_DIGIT 7, 54
    U2S_DIGIT 8, 53
    // Final digit: nothing follows, so no masking is needed afterwards.
    // high *= 5
    leaq (%rcx,%rcx,4), %rcx
    // low *= 5
    leaq (%rax,%rax,4), %rax
    // buf[9] = (high >> 52) + '0'
    shrq $52, %rcx
    addb $'0', %cl
    movb %cl, 9(%rdi)
    // buf[19] = (low >> 52) + '0'
    shrq $52, %rax
    addb $'0', %al
    movb %al, 19(%rdi)
    retq
.purgem U2S_DIGIT
When I compile the C version in GCC with optimizations -O3
, I find that code for one of the high/low variables is optimized out, it has hardcoded values for andop
everywhere, and the code at the start to load in buf[0]
and buf[10]
inputs hard-coded values ( 48
and 3472328296227680304
, respectively). When I run GCC with -fverbose-asm -S
, I find that GCC optimized away high
completely! I'm guessing that my C code is the problem (I'm not too great at C), but I don't know why. Terje Mathisen's post has its own version in C, but it does not include the handwritten assembly optimizations also given there. Why is GCC messing me up so much?
BTW, here's the code from gcc (Gentoo 7.3.0-r1 p1.1) 7.3.0
(USE flags: -cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv
) with flags -O3 -c
(an objdump with flags -d
):
u2s.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <u2s>:
0: 48 b8 0a fa 82 4b 04 movabs $0x44b82fa0a,%rax
7: 00 00 00
a: 48 b9 30 30 30 30 30 movabs $0x3030303030303030,%rcx
11: 30 30 30
14: c6 47 0a 30 movb $0x30,0xa(%rdi) // What?
18: 48 0f af f0 imul %rax,%rsi
1c: 48 89 0f mov %rcx,(%rdi) // What?
1f: 48 83 c6 07 add $0x7,%rsi
23: 48 c1 ee 03 shr $0x3,%rsi
27: 48 8d 04 b6 lea (%rsi,%rsi,4),%rax
2b: 48 89 c2 mov %rax,%rdx
2e: 48 c1 ea 3c shr $0x3c,%rdx
32: 83 c2 30 add $0x30,%edx
35: 88 57 0b mov %dl,0xb(%rdi)
38: 48 ba ff ff ff ff ff movabs $0xfffffffffffffff,%rdx
3f: ff ff 0f
42: 48 21 d0 and %rdx,%rax // There should be another AND
45: 48 8d 04 80 lea (%rax,%rax,4),%rax
49: 48 89 c2 mov %rax,%rdx
4c: 48 c1 ea 3b shr $0x3b,%rdx
50: 83 c2 30 add $0x30,%edx
53: 88 57 0c mov %dl,0xc(%rdi)
56: 48 ba ff ff ff ff ff movabs $0x7ffffffffffffff,%rdx
5d: ff ff 07
60: 48 21 d0 and %rdx,%rax
63: 48 8d 04 80 lea (%rax,%rax,4),%rax
67: 48 89 c2 mov %rax,%rdx
6a: 48 c1 ea 3a shr $0x3a,%rdx
6e: 83 c2 30 add $0x30,%edx
71: 88 57 0d mov %dl,0xd(%rdi)
74: 48 ba ff ff ff ff ff movabs $0x3ffffffffffffff,%rdx
7b: ff ff 03
7e: 48 21 d0 and %rdx,%rax
81: 48 8d 04 80 lea (%rax,%rax,4),%rax
85: 48 89 c2 mov %rax,%rdx
88: 48 c1 ea 39 shr $0x39,%rdx
8c: 83 c2 30 add $0x30,%edx
8f: 88 57 0e mov %dl,0xe(%rdi)
92: 48 ba ff ff ff ff ff movabs $0x1ffffffffffffff,%rdx
99: ff ff 01
9c: 48 21 d0 and %rdx,%rax
9f: 48 8d 04 80 lea (%rax,%rax,4),%rax
a3: 48 89 c2 mov %rax,%rdx
a6: 48 c1 ea 38 shr $0x38,%rdx
aa: 83 c2 30 add $0x30,%edx
ad: 88 57 0f mov %dl,0xf(%rdi)
b0: 48 ba ff ff ff ff ff movabs $0xffffffffffffff,%rdx
b7: ff ff 00
ba: 48 21 d0 and %rdx,%rax
bd: 48 8d 04 80 lea (%rax,%rax,4),%rax
c1: 48 89 c2 mov %rax,%rdx
c4: 48 c1 ea 37 shr $0x37,%rdx
c8: 83 c2 30 add $0x30,%edx
cb: 88 57 10 mov %dl,0x10(%rdi)
ce: 48 ba ff ff ff ff ff movabs $0x7fffffffffffff,%rdx
d5: ff 7f 00
d8: 48 21 d0 and %rdx,%rax
db: 48 8d 04 80 lea (%rax,%rax,4),%rax
df: 48 89 c2 mov %rax,%rdx
e2: 48 c1 ea 36 shr $0x36,%rdx
e6: 83 c2 30 add $0x30,%edx
e9: 88 57 11 mov %dl,0x11(%rdi)
ec: 48 ba ff ff ff ff ff movabs $0x3fffffffffffff,%rdx
f3: ff 3f 00
f6: 48 21 d0 and %rdx,%rax
f9: 48 8d 04 80 lea (%rax,%rax,4),%rax
fd: 48 89 c2 mov %rax,%rdx
100: 48 c1 ea 35 shr $0x35,%rdx
104: 83 c2 30 add $0x30,%edx
107: 88 57 12 mov %dl,0x12(%rdi)
10a: 48 ba ff ff ff ff ff movabs $0x1fffffffffffff,%rdx
111: ff 1f 00
114: 48 21 d0 and %rdx,%rax
117: ba 30 30 00 00 mov $0x3030,%edx
11c: 48 8d 04 80 lea (%rax,%rax,4),%rax
120: 66 89 57 08 mov %dx,0x8(%rdi)
124: 48 c1 e8 34 shr $0x34,%rax
128: 83 c0 30 add $0x30,%eax
12b: 88 47 13 mov %al,0x13(%rdi)
12e: c3 retq
PS: I've tested my handwritten assembly well, so most differences between it and the objdump
are either code reordering or errors.
PPS: @Peter_Cordes's answer has prevented the optimization-away of high
, but the starting code is still messed up! Here's an excerpt:
0: 48 b9 e0 ea f6 5e 67 movabs $0x6df37f675ef6eae0,%rcx
7: 7f f3 6d
a: 49 b8 00 e4 0b 54 02 movabs $0x2540be400,%r8
11: 00 00 00
14: c6 07 30 movb $0x30,(%rdi) // NOT GOOD
17: 48 89 c8 mov %rcx,%rax
1a: 48 89 f1 mov %rsi,%rcx
1d: c6 47 0a 30 movb $0x30,0xa(%rdi) // NOT GOOD
21: 48 f7 e6 mul %rsi
24: 48 d1 e9 shr %rcx
27: 49 89 c1 mov %rax,%r9
2a: 48 89 c8 mov %rcx,%rax
2d: 49 89 d2 mov %rdx,%r10
30: 31 d2 xor %edx,%edx
32: 4c 01 c8 add %r9,%rax
35: 48 b9 0a fa 82 4b 04 movabs $0x44b82fa0a,%rcx
3c: 00 00 00
3f: 4c 11 d2 adc %r10,%rdx
42: 48 c1 ea 20 shr $0x20,%rdx
46: 48 89 d0 mov %rdx,%rax
49: 49 0f af d0 imul %r8,%rdx
4d: 48 0f af c1 imul %rcx,%rax
51: 48 29 d6 sub %rdx,%rsi
54: 48 0f af f1 imul %rcx,%rsi
58: 48 83 c0 07 add $0x7,%rax
5c: 48 c1 e8 03 shr $0x3,%rax
Yes, when the compiler shows you that some of your function outputs unexpectedly don't depend on the input, that's usually a sign that your C source doesn't mean what you thought it did.
In this case it looks like uint128_t split = num * 7922816251426433760;
is the problem. num
is an unsigned long
( uint64_t
in the x86-64 SysV ABI which you're compiling for). Thus, the *
operator produces a 64-bit result which is zero-extended as an initializer for uint128_t split
.
uint128_t split = (unsigned __int128) num * 7922816251426433760;
casts num
to a 128-bit integer before the multiply, so you get a full 128-bit result with mulq
. ( gcc7.3 -O3 on Godbolt ).
I didn't look into the full details of the rest of your function; there may be other problems, but that's the first one I saw.
re: update:
Is split = high * 18446744074;
supposed to be split = high * (unsigned __int128)18446744074;
? Looks like exactly the same bug. Check the rest of your C code for any more cases where you assign the result of a calculation to a wider variable.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.