准确了解如何以汇编语言实现提高的效率

Question

I have generated two assembly files - one that is optimized, and one that is not. 我生成了两个汇编文件——一个已优化，另一个未优化。 The assembly-language code generated with optimization on should be more efficient than the other assembly-language code. 启用优化后生成的汇编语言代码应比另一份汇编语言代码更高效。 I am more interested in how the efficiency is achieved. 我更感兴趣的是这种效率是如何实现的。 To my understanding, in the non-optimized version there will always have to be an offset call to the register %rbp to find the address. 据我了解，在非优化版本中，始终必须对寄存器%rbp进行偏移调用以查找地址。 In the optimized version, the addresses are being stored in the registers, so you don't have to rely on and call %rbp to find them. 在优化版本中，地址被存储在寄存器中，因此您不必依赖并调用%rbp来查找它们。

Am I correct? 我说得对吗？ And if so, would there ever be a time when the optimized version will not be advantageous? 如果是这样，是否会有优化版本并不占优势的时候？ Thank you for your time. 感谢您抽出时间。

Here is a function that converts from RGB to CMYK. 这是一个将RGB转换为CMYK的函数。

/*
 * Convert an RGB colour to CMYK.
 *
 * r, g, b : colour channels; each is subtracted from 255 to form the
 *           complementary cyan / magenta / yellow value.
 * ret     : caller-provided array with room for four ints; receives
 *           ret[0] = cyan - black, ret[1] = magenta - black,
 *           ret[2] = yellow - black, ret[3] = black,
 *           where black is the smallest of the three complements.
 */
void rgb2cmyk(int r, int g, int b, int ret[]) {
    int cyan = 255 - r;
    int magenta = 255 - g;
    int yellow = 255 - b;

    /* black = min(cyan, magenta, yellow) */
    int black = cyan;
    if (magenta < black) {
        black = magenta;
    }
    if (yellow < black) {
        black = yellow;
    }

    ret[0] = cyan - black;
    ret[1] = magenta - black;
    ret[2] = yellow - black;
    ret[3] = black;
}

Here is the assembly-language code that has not been optimized. 这是尚未优化的汇编语言代码。 Note I have made notes using ;; in the code. 注意，我已在代码中使用 ;; 做了注释。

No Opt: 未优化：

   .section   __TEXT,__text,regular,pure_instructions
   .globl   _rgb2cmyk
   .align   4, 0x90
## ---------------------------------------------------------------------
## void rgb2cmyk(int r, int g, int b, int ret[])      (unoptimized build)
## Syntax: AT&T (GAS), x86-64, Mach-O
## In:     %edi = r, %esi = g, %edx = b, %rcx = ret
## Out:    ret[0] = c-k, ret[1] = m-k, ret[2] = y-k, ret[3] = k
##         with c = 255-r, m = 255-g, y = 255-b, k = min(c, m, y)
## Clobb:  %rax, %rcx, %rdx, flags
## Stack:  every argument and local is kept in a slot below %rbp and is
##         reloaded from memory before each use. %rsp is never adjusted:
##         the function makes no calls and the lowest slot is -52(%rbp),
##         which stays inside the 128-byte System V red zone.
## Slots:  -4 r   -8 g   -12 b   -24 ret
##         -28 c  -32 m  -36 y   -40 k
##         -44 / -52 inner ternary result, -48 outer ternary result
## Note:   the author's ";;" notes are kept behind "##" because ";" is a
##         statement separator, not a comment character, in this syntax.
## ---------------------------------------------------------------------
_rgb2cmyk: ## @rgb2cmyk
   .cfi_startproc
## BB#0:
   pushq   %rbp
Ltmp2:
   .cfi_def_cfa_offset 16
Ltmp3:
   .cfi_offset %rbp, -16
   movq   %rsp, %rbp
Ltmp4:
   .cfi_def_cfa_register %rbp
   ## ;; initializing variable c, m, y
   movl   $255, %eax
   movl   %edi, -4(%rbp) ## store r
   movl   %esi, -8(%rbp) ## store g
   movl   %edx, -12(%rbp) ## store b
   movq   %rcx, -24(%rbp) ## store ret
   movl   %eax, %edx
   subl   -4(%rbp), %edx
   movl   %edx, -28(%rbp) ## c = 255 - r
   movl   %eax, %edx
   subl   -8(%rbp), %edx
   movl   %edx, -32(%rbp) ## m = 255 - g
   subl   -12(%rbp), %eax
   movl   %eax, -36(%rbp) ## y = 255 - b

   movl   -28(%rbp), %eax
   ## ;; compare
   cmpl   -32(%rbp), %eax ## c vs m
   jge   LBB0_5 ## c >= m: choose between m and y
## BB#1: c < m, choose between c and y
   movl   -28(%rbp), %eax
   cmpl   -36(%rbp), %eax ## c vs y
   jge   LBB0_3 ## c >= y: take y
## BB#2: c < y, take c
   movl   -28(%rbp), %eax
   movl   %eax, -44(%rbp) ## 4-byte Spill
   jmp   LBB0_4
LBB0_3:
   movl   -36(%rbp), %eax
   movl   %eax, -44(%rbp) ## 4-byte Spill
LBB0_4:
   movl   -44(%rbp), %eax ## 4-byte Reload
   movl   %eax, -48(%rbp) ## 4-byte Spill
   jmp   LBB0_9
LBB0_5:
   movl   -32(%rbp), %eax
   cmpl   -36(%rbp), %eax ## m vs y
   jge   LBB0_7 ## m >= y: take y
## BB#6: m < y, take m
   movl   -32(%rbp), %eax
   movl   %eax, -52(%rbp) ## 4-byte Spill
   jmp   LBB0_8
LBB0_7:
   movl   -36(%rbp), %eax
   movl   %eax, -52(%rbp) ## 4-byte Spill
LBB0_8:
   movl   -52(%rbp), %eax ## 4-byte Reload
   movl   %eax, -48(%rbp) ## 4-byte Spill
LBB0_9:
   movl   -48(%rbp), %eax ## 4-byte Reload
   movl   %eax, -40(%rbp) ## k = result of the ternary
   movl   -40(%rbp), %eax
   movl   -28(%rbp), %ecx
   subl   %eax, %ecx
   movl   %ecx, -28(%rbp) ## c -= k
   movl   -40(%rbp), %eax
   movl   -32(%rbp), %ecx
   subl   %eax, %ecx
   movl   %ecx, -32(%rbp) ## m -= k
   movl   -40(%rbp), %eax
   movl   -36(%rbp), %ecx
   subl   %eax, %ecx
   movl   %ecx, -36(%rbp) ## y -= k
   movl   -28(%rbp), %eax
   movq   -24(%rbp), %rdx ## ret pointer is reloaded before every store
   movl   %eax, (%rdx) ## ret[0] = c
   movl   -32(%rbp), %eax
   movq   -24(%rbp), %rdx
   movl   %eax, 4(%rdx) ## ret[1] = m
   movl   -36(%rbp), %eax
   movq   -24(%rbp), %rdx
   movl   %eax, 8(%rdx) ## ret[2] = y
   movl   -40(%rbp), %eax
   movq   -24(%rbp), %rdx
   movl   %eax, 12(%rdx) ## ret[3] = k
   popq   %rbp
   retq
   .cfi_endproc


.subsections_via_symbols

Optimization: 优化：

   .section   __TEXT,__text,regular,pure_instructions
   .globl   _rgb2cmyk
   .align   4, 0x90
## ---------------------------------------------------------------------
## void rgb2cmyk(int r, int g, int b, int ret[])        (optimized build)
## Syntax: AT&T (GAS), x86-64, Mach-O
## In:     %edi = r, %esi = g, %edx = b, %rcx = ret
## Out:    ret[0] = c-k, ret[1] = m-k, ret[2] = y-k, ret[3] = k
##         with c = 255-r, m = 255-g, y = 255-b, k = min(c, m, y)
## Clobb:  %eax, %edx, %edi, %r8d, flags
## Regs:   %eax = c, %edi = m, %r8d = y, %edx = k; %rcx keeps ret throughout.
##         No value is written to or reloaded from the stack frame; the only
##         memory traffic is the push/pop of %rbp and the four result stores.
## ---------------------------------------------------------------------
_rgb2cmyk: ## @rgb2cmyk
   .cfi_startproc
## BB#0:
   pushq   %rbp
Ltmp2:
   .cfi_def_cfa_offset 16
Ltmp3:
   .cfi_offset %rbp, -16
   movq   %rsp, %rbp
Ltmp4:
   .cfi_def_cfa_register %rbp
   movl   $255, %r8d
   movl   $255, %eax
   subl   %edi, %eax ## eax = c = 255 - r
   movl   $255, %edi ## r has been consumed, so %edi is reused
   subl   %esi, %edi ## edi = m = 255 - g
   subl   %edx, %r8d ## r8d = y = 255 - b
   cmpl   %edi, %eax ##;; compare c with m
   jge   LBB0_2 ## taken when c >= m
## BB#1: ;; c < m -> k = min(c, y)
   cmpl   %r8d, %eax ## compare c with y
   movl   %r8d, %edx ## edx = y (mov leaves the flags from cmpl untouched)
   cmovlel   %eax, %edx ## if c <= y then edx = c
   jmp   LBB0_3
LBB0_2: ##;; c >= m -> k = min(m, y)
   cmpl   %r8d, %edi ## compare m with y
   movl   %r8d, %edx ## edx = y (mov leaves the flags from cmpl untouched)
   cmovlel   %edi, %edx ## if m <= y then edx = m
LBB0_3: ## edx = k on both paths
   subl   %edx, %eax ## c -= k
   subl   %edx, %edi ## m -= k
   subl   %edx, %r8d ## y -= k
   movl   %eax, (%rcx) ## ret[0] = c
   movl   %edi, 4(%rcx) ## ret[1] = m
   movl   %r8d, 8(%rcx) ## ret[2] = y
   movl   %edx, 12(%rcx) ## ret[3] = k
   popq   %rbp
   retq
   .cfi_endproc


.subsections_via_symbols

Answer 1

Yes. 是。 The optimized version performs many fewer memory read operations by storing intermediate values in registers and not reloading them over and over. 通过将中间值存储在寄存器中，而不是一遍又一遍地重新加载，优化后的版本执行的内存读取操作要少得多。

You are using the term "call" incorrectly. 您对“call”一词的用法有误。 It is a technical term that means to push a return address on the stack and branch to a new location for instructions. 这是一个技术术语，表示将返回地址压入堆栈，然后跳转到新的位置以获取指令。 What you mean is simply to "use" the register. 您想表达的只是“使用”该寄存器。

Can you think of a reason that longer, slower code is "better"? 您能想到更长，更慢的代码“更好”的原因吗？

准确了解如何以汇编语言实现提高的效率

问题描述

1 个解决方案

解决方案1
1 2014-10-12 23:04:08

准确了解如何以汇编语言实现提高的效率

问题描述

1 个解决方案

解决方案1 1 2014-10-12 23:04:08

解决方案1
1 2014-10-12 23:04:08