
Understanding exactly how the increased efficiency is achieved in Assembly language

I have generated two assembly files - one compiled with optimization and one without. The assembly-language code generated with optimization on should be more efficient than the other assembly-language code, but I am more interested in how that efficiency is achieved. To my understanding, the non-optimized version always has to make an offset call to the register %rbp to find each variable's address. In the optimized version, the values are kept in registers, so you don't have to rely on and call %rbp to find them.

Am I correct? And if so, would there ever be a time when the optimized version will not be advantageous? Thank you for your time.

Here is a function that converts from RGB to CMYK.

void rgb2cmyk(int r, int g, int b, int ret[]) {
    int c = 255 - r;
    int m = 255 - g;
    int y = 255 - b;
    int k = (c < m) ? (c < y ? c : y) : (m < y ? m : y);
    c -= k; m -= k; y -= k;
    ret[0] = c; ret[1] = m; ret[2] = y; ret[3] = k;
}
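
For reference, here is a minimal test driver for it (a sketch of my own, assuming the function above is compiled alongside it); pure red (255, 0, 0) should produce C=0, M=255, Y=255, K=0 on the 0-255 scale the function uses.

#include <stdio.h>

void rgb2cmyk(int r, int g, int b, int ret[]);

int main(void) {
    int cmyk[4];
    /* Pure red: expect C=0, M=255, Y=255, K=0 */
    rgb2cmyk(255, 0, 0, cmyk);
    printf("C=%d M=%d Y=%d K=%d\n", cmyk[0], cmyk[1], cmyk[2], cmyk[3]);
    return 0;
}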

Here is the assembly-language code that has not been optimized. Note that I have added my own comments using ;; in the code.

No Opt:

   .section   __TEXT,__text,regular,pure_instructions
   .globl   _rgb2cmyk
   .align   4, 0x90
_rgb2cmyk: ## @rgb2cmyk
   .cfi_startproc
## BB#0:
   pushq   %rbp
Ltmp2:
   .cfi_def_cfa_offset 16
Ltmp3:
   .cfi_offset %rbp, -16
   movq   %rsp, %rbp
Ltmp4:
   .cfi_def_cfa_register %rbp
   ;;initializing variable c, m, y
   movl   $255, %eax
   movl   %edi, -4(%rbp)
   movl   %esi, -8(%rbp)
   movl   %edx, -12(%rbp)
   movq   %rcx, -24(%rbp)
   movl   %eax, %edx
   subl   -4(%rbp), %edx
   movl   %edx, -28(%rbp)
   movl   %eax, %edx
   subl   -8(%rbp), %edx
   movl   %edx, -32(%rbp)
   subl   -12(%rbp), %eax
   movl   %eax, -36(%rbp)

   movl   -28(%rbp), %eax
   ;;compare
   cmpl   -32(%rbp), %eax
   jge   LBB0_5
## BB#1:
   movl   -28(%rbp), %eax
   cmpl   -36(%rbp), %eax
   jge   LBB0_3
## BB#2:
   movl   -28(%rbp), %eax
   movl   %eax, -44(%rbp) ## 4-byte Spill
   jmp   LBB0_4
LBB0_3:
   movl   -36(%rbp), %eax
   movl   %eax, -44(%rbp) ## 4-byte Spill
LBB0_4:
   movl   -44(%rbp), %eax ## 4-byte Reload
   movl   %eax, -48(%rbp) ## 4-byte Spill
   jmp   LBB0_9
LBB0_5:
   movl   -32(%rbp), %eax
   cmpl   -36(%rbp), %eax
   jge   LBB0_7
## BB#6:
   movl   -32(%rbp), %eax
   movl   %eax, -52(%rbp) ## 4-byte Spill
   jmp   LBB0_8
LBB0_7:
   movl   -36(%rbp), %eax
   movl   %eax, -52(%rbp) ## 4-byte Spill
LBB0_8:
   movl   -52(%rbp), %eax ## 4-byte Reload
   movl   %eax, -48(%rbp) ## 4-byte Spill
LBB0_9:
   movl   -48(%rbp), %eax ## 4-byte Reload
   movl   %eax, -40(%rbp)
   movl   -40(%rbp), %eax
   movl   -28(%rbp), %ecx
   subl   %eax, %ecx
   movl   %ecx, -28(%rbp)
   movl   -40(%rbp), %eax
   movl   -32(%rbp), %ecx
   subl   %eax, %ecx
   movl   %ecx, -32(%rbp)
   movl   -40(%rbp), %eax
   movl   -36(%rbp), %ecx
   subl   %eax, %ecx
   movl   %ecx, -36(%rbp)
   movl   -28(%rbp), %eax
   movq   -24(%rbp), %rdx
   movl   %eax, (%rdx)
   movl   -32(%rbp), %eax
   movq   -24(%rbp), %rdx
   movl   %eax, 4(%rdx)
   movl   -36(%rbp), %eax
   movq   -24(%rbp), %rdx
   movl   %eax, 8(%rdx)
   movl   -40(%rbp), %eax
   movq   -24(%rbp), %rdx
   movl   %eax, 12(%rdx)
   popq   %rbp
   retq
   .cfi_endproc


.subsections_via_symbols

Optimization:

   .section   __TEXT,__text,regular,pure_instructions
   .globl   _rgb2cmyk
   .align   4, 0x90
_rgb2cmyk: ## @rgb2cmyk
   .cfi_startproc
## BB#0:
   pushq   %rbp
Ltmp2:
   .cfi_def_cfa_offset 16
Ltmp3:
   .cfi_offset %rbp, -16
   movq   %rsp, %rbp
Ltmp4:
   .cfi_def_cfa_register %rbp
   movl   $255, %r8d
   movl   $255, %eax
   subl   %edi, %eax
   movl   $255, %edi
   subl   %esi, %edi
   subl   %edx, %r8d
   cmpl   %edi, %eax ##;; compare m and c
   jge   LBB0_2
## BB#1: ;; c < m
   cmpl   %r8d, %eax ## compare y and c
   movl   %r8d, %edx
   cmovlel   %eax, %edx
   jmp   LBB0_3
LBB0_2: ##;; c >= m
   cmpl   %r8d, %edi ## compare y and m
   movl   %r8d, %edx
   cmovlel   %edi, %edx
LBB0_3:
   subl   %edx, %eax
   subl   %edx, %edi
   subl   %edx, %r8d
   movl   %eax, (%rcx)
   movl   %edi, 4(%rcx)
   movl   %r8d, 8(%rcx)
   movl   %edx, 12(%rcx)
   popq   %rbp
   retq
   .cfi_endproc


.subsections_via_symbols

Yes. The optimized version performs many fewer memory read operations by storing intermediate values in registers and not reloading them over and over.
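
To make that concrete in C terms, here is a minimal sketch (not from the original code; the name rgb2cmyk_memory_bound is just for illustration). Declaring the locals volatile forces the compiler to store each value to its stack slot and reload it on every access, which approximates the %rbp-offset traffic you see in the unoptimized listing. Without volatile, the optimizer is free to keep c, m, y, and k in registers, as in the optimized listing.

void rgb2cmyk_memory_bound(int r, int g, int b, int ret[]) {
    /* volatile forces a store and a reload for every access,
       approximating the stack-slot traffic of the -O0 output */
    volatile int c = 255 - r;
    volatile int m = 255 - g;
    volatile int y = 255 - b;
    volatile int k = (c < m) ? (c < y ? c : y) : (m < y ? m : y);
    c -= k; m -= k; y -= k;
    ret[0] = c; ret[1] = m; ret[2] = y; ret[3] = k;
}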

You are using the word call incorrectly. Call is a technical term that means to push a return address on the stack and branch to a new location for instructions. The term you want is simply to use (or read) the register.

Can you think of a reason that longer, slower code is "better"?
