Which way of computing the difference of squares in C++ is faster: a*a - b*b or (a+b)*(a-b)? The first expression uses two multiplications and one subtraction, while the second needs one addition, one subtraction, and one multiplication. Since a multiplication typically costs more than an addition, the second approach seems faster. On the other hand, the first approach needs fewer loads of data into registers, and this might compensate for the cost of the extra multiplication.
If you run this code
#include <iostream>
int main()
{
    int a = 6, b = 7;
    int c1 = a*a - b*b;
    int c2 = (a-b)*(a+b);
    return 0;
}
say in an online compiler and without optimization flags (-O), then the number of assembler instructions will be the same. For the line int c1 = a*a - b*b; you get:
mov eax,DWORD PTR [rbp-0x4]
imul eax,eax
mov edx,eax
mov eax,DWORD PTR [rbp-0x8]
imul eax,eax
sub edx,eax
mov DWORD PTR [rbp-0xc],edx
And for the line int c2 = (a-b)*(a+b); you get:
mov eax,DWORD PTR [rbp-0x4]
sub eax,DWORD PTR [rbp-0x8]
mov ecx,DWORD PTR [rbp-0x4]
mov edx,DWORD PTR [rbp-0x8]
add edx,ecx
imul eax,edx
mov DWORD PTR [rbp-0x10],eax
On the other hand, the first sequence contains four operations performed purely between registers, while the second sequence contains only two such register-only operations; the others use both memory and registers.
So the question is also: is it possible to estimate which of these instruction sequences is faster?
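One practical way to estimate this, beyond counting instructions, is simply to measure. A minimal timing sketch along these lines (the volatile qualifiers only keep the compiler from folding the work away, and the numbers it prints are merely indicative):

#include <chrono>
#include <iostream>

int main()
{
    volatile int a = 6, b = 7;   // volatile: values cannot be constant-folded
    volatile int sink = 0;       // volatile: the stores cannot be eliminated
    const int n = 100000000;

    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < n; ++i)
        sink = a*a - b*b;
    auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < n; ++i)
        sink = (a+b)*(a-b);
    auto t2 = std::chrono::steady_clock::now();

    std::chrono::duration<double, std::milli> d1 = t1 - t0, d2 = t2 - t1;
    std::cout << "a*a - b*b:   " << d1.count() << " ms\n";
    std::cout << "(a+b)*(a-b): " << d2.count() << " ms\n";
    return 0;
}

Note that with volatile operands the loads tend to dominate, so the difference between one imul and one add may be hard to see; still, measuring on the target CPU is the only reliable comparison.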
Added after answers.
Thank you for responding; I found the answer. Look at the following code:
#include <iostream>
int dsq1(int a, int b)
{
    return a*a - b*b;
}
int dsq2(int a, int b)
{
    return (a+b)*(a-b);
}
int main()
{
    int a, b;
    // just to be sure that the compiler does not know the
    // precise values of a and b and will not optimize them away
    std::cin >> a;
    std::cin >> b;
    volatile int c1 = dsq1(a, b);
    volatile int c2 = dsq2(a, b);
    return 0;
}
Now the first function, for a*a - b*b, takes the following 5 assembler instructions with two multiplications:
mov esi,eax
mov ecx,edx
imul esi,eax
imul ecx,edx
sub ecx,esi
while (a-b)*(a+b) takes only 4 instructions and only one multiplication:
mov ecx,edx
sub ecx,eax
add eax,edx
imul eax,ecx
It seems that (a-b)*(a+b) should be faster than a*a - b*b.
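As an aside, if the compiler inlines such small functions into main, their code gets mixed with the surrounding instructions. A sketch of one way to keep them out-of-line for inspection, assuming a GCC/Clang build (__attribute__((noinline)) is a compiler extension; MSVC spells it __declspec(noinline)):

// Keep the functions out-of-line so their generated code can be read in isolation.
__attribute__((noinline)) int dsq1(int a, int b) { return a*a - b*b; }
__attribute__((noinline)) int dsq2(int a, int b) { return (a+b)*(a-b); }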
Now this really depends on the compiler and the architecture. Let's look at these two functions:
int f1(int a, int b) {
    return a*a - b*b;
}
int f2(int a, int b) {
    return (a-b)*(a+b);
}
Let's look at what that produces on x86_64:
MSVC
a$ = 8
b$ = 16
int f1(int,int) PROC ; f1, COMDAT
imul ecx, ecx
imul edx, edx
sub ecx, edx
mov eax, ecx
ret 0
int f1(int,int) ENDP ; f1
a$ = 8
b$ = 16
int f2(int,int) PROC ; f2, COMDAT
mov eax, ecx
add ecx, edx
sub eax, edx
imul eax, ecx
ret 0
int f2(int,int) ENDP ; f2
gcc 12.1
f1(int, int):
imul edi, edi
imul esi, esi
mov eax, edi
sub eax, esi
ret
f2(int, int):
mov eax, edi
add edi, esi
sub eax, esi
imul eax, edi
ret
clang 14.0
f1(int, int): # @f1(int, int)
mov eax, edi
imul eax, edi
imul esi, esi
sub eax, esi
ret
f2(int, int): # @f2(int, int)
lea eax, [rsi + rdi]
mov ecx, edi
sub ecx, esi
imul eax, ecx
ret
All are just permutations of the same 4 opcodes. You are trading an imul for an add, which might be faster, or rather might allow more execution units to run in parallel.
I find clang's f2 the most interesting, because it uses the address-calculation unit (for the lea) instead of the arithmetic adder. So all 4 opcodes use different execution units.
Now contrast that with ARM/ARM64:
ARM MSVC
|int f1(int,int)| PROC ; f1
mul r2,r0,r0
mul r3,r1,r1
subs r0,r2,r3
|$M4|
bx lr
ENDP ; |int f1(int,int)|, f1
|int f2(int,int)| PROC ; f2
subs r2,r0,r1
adds r3,r0,r1
mul r0,r2,r3
|$M4|
bx lr
ENDP ; |int f2(int,int)|, f2
ARM64 MSVC
|int f1(int,int)| PROC ; f1
mul w8,w0,w0
msub w0,w1,w1,w8
ret
ENDP ; |int f1(int,int)|, f1
|int f2(int,int)| PROC ; f2
sub w9,w0,w1
add w8,w0,w1
mul w0,w9,w8
ret
ENDP ; |int f2(int,int)|, f2
ARM gcc 12.1
f1(int, int):
mul r0, r0, r0
mls r0, r1, r1, r0
bx lr
f2(int, int):
subs r3, r0, r1
add r0, r0, r1
mul r0, r3, r0
bx lr
ARM64 gcc 12.1
f1(int, int):
mul w0, w0, w0
msub w0, w1, w1, w0
ret
f2(int, int):
sub w2, w0, w1
add w0, w0, w1
mul w0, w2, w0
ret
ARM clang 11.0.1
f1(int, int):
mul r2, r1, r1
mul r1, r0, r0
sub r0, r1, r2
bx lr
f2(int, int):
add r2, r1, r0
sub r1, r0, r1
mul r0, r1, r2
bx lr
ARM64 clang 11.0.1
f1(int, int): // @f1(int, int)
mul w8, w1, w1
neg w8, w8
madd w0, w0, w0, w8
ret
f2(int, int): // @f2(int, int)
sub w8, w0, w1
add w9, w1, w0
mul w0, w8, w9
ret
All compilers have eliminated the mov instruction, since there is more choice of which input and output registers to use. But there is a big difference in the generated code: not all compilers seem to know that ARM/ARM64 has a multiply-and-subtract opcode. clang does seem to know about multiply-and-add, though.
Now the question becomes: is an mls faster or slower than an add plus a sub? With gcc, f1 seems to be better; with MSVC only on ARM64; and with clang I think it is undecided.
And now for something completely different:
AVR gcc 11.1.0
f1(int, int):
mov r19,r22
mov r18,r23
mov r22,r24
mov r23,r25
rcall __mulhi3
mov r31,r25
mov r30,r24
mov r24,r19
mov r25,r18
mov r22,r19
mov r23,r18
rcall __mulhi3
mov r19,r31
mov r18,r30
sub r18,r24
sbc r19,r25
mov r25,r19
mov r24,r18
ret
f2(int, int):
mov r18,r22
mov r19,r23
mov r23,r25
mov r22,r24
add r22,r18
adc r23,r19
sub r24,r18
sbc r25,r19
rcall __mulhi3
ret
I think there is no argument that f2 is worlds better.
PS: Beware that the two functions are not equivalent: their behavior differs under overflow, or rather in when they overflow.
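To make that concrete, here is a small illustration (the values are chosen so that the intermediate product a*a exceeds INT_MAX while the mathematical result does not):

#include <iostream>

int main()
{
    int a = 46341, b = 46340;   // 46341*46341 = 2147488281 > INT_MAX
    // int c1 = a*a - b*b;      // undefined behavior: a*a overflows int
    int c2 = (a-b)*(a+b);       // well-defined: 1 * 92681
    std::cout << c2 << "\n";    // prints 92681, the exact value of a^2 - b^2
    return 0;
}

For these inputs (a-b)*(a+b) produces the mathematically correct answer, while a*a - b*b overflows, which is undefined behavior for signed int in C++.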