[英]Understanding exactly how the increased efficiency is achieved in Assembly language
I have generated two assembly files - one that is optimized, and one that is not. 我生成了两个程序集文件-一个已优化,而另一个未优化。 The assembly-language code generated with optimization on should be more efficient than the other assembly-language code.
启用优化后生成的汇编语言代码应比其他汇编语言代码更有效。 I am more interested in how the efficiency is achieved.
我对如何实现效率更感兴趣。 To my understanding, in the non-optimized version there will always have to be an offset call to the register %rbp to find the address.
据我了解,在非优化版本中,始终必须对寄存器%rbp进行偏移调用以查找地址。 In the optimized version, the addresses are being stored in the registers, so you don't have to rely and call on %rbp to find them.
在优化版本中,地址被存储在寄存器中,因此您不必依赖并调用%rbp来查找它们。
Am I correct? 我对么? And if so, would there ever be a time when the optimized version will not be advantageous?
如果是这样,优化版本将永远不会有优势吗? Thank you for your time.
感谢您的时间。
Here is a function that converts from 42 GIF to CYMK. 这是一个将42 GIF转换为CYMK的函数。
void rgb2cmyk(int r, int g, int b, int ret[]) {
int c = 255 - r;
int m = 255 - g;
int y = 255 - b;
int k = (c < m) ? (c < y ? c : y) : (m < y ? m : y);
c -= k; m -= k; y -= k;
ret[0] = c; ret[1] = m; ret[2] = y; ret[3] = k;
}
Here is the assembly-language code that has not been optimized. 这是尚未优化的汇编语言代码。 Note I have made notes using ;;
注意我已经使用;;做过注释。 in the code.
在代码中。
No Opt: 没有选择:
.section __TEXT,__text,regular,pure_instructions
.globl _rgb2cmyk
.align 4, 0x90
_rgb2cmyk: ## @rgb2cmyk
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
;;initializing variable c, m, y
movl $255, %eax
movl %edi, -4(%rbp)
movl %esi, -8(%rbp)
movl %edx, -12(%rbp)
movq %rcx, -24(%rbp)
movl %eax, %edx
subl -4(%rbp), %edx
movl %edx, -28(%rbp)
movl %eax, %edx
subl -8(%rbp), %edx
movl %edx, -32(%rbp)
subl -12(%rbp), %eax
movl %eax, -36(%rbp)
movl -28(%rbp), %eax
;;compare
cmpl -32(%rbp), %eax
jge LBB0_5
## BB#1:
movl -28(%rbp), %eax
cmpl -36(%rbp), %eax
jge LBB0_3
## BB#2:
movl -28(%rbp), %eax
movl %eax, -44(%rbp) ## 4-byte Spill
jmp LBB0_4
LBB0_3:
movl -36(%rbp), %eax
movl %eax, -44(%rbp) ## 4-byte Spill
LBB0_4:
movl -44(%rbp), %eax ## 4-byte Reload
movl %eax, -48(%rbp) ## 4-byte Spill
jmp LBB0_9
LBB0_5:
movl -32(%rbp), %eax
cmpl -36(%rbp), %eax
jge LBB0_7
## BB#6:
movl -32(%rbp), %eax
movl %eax, -52(%rbp) ## 4-byte Spill
jmp LBB0_8
LBB0_7:
movl -36(%rbp), %eax
movl %eax, -52(%rbp) ## 4-byte Spill
LBB0_8:
movl -52(%rbp), %eax ## 4-byte Reload
movl %eax, -48(%rbp) ## 4-byte Spill
LBB0_9:
movl -48(%rbp), %eax ## 4-byte Reload
movl %eax, -40(%rbp)
movl -40(%rbp), %eax
movl -28(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -28(%rbp)
movl -40(%rbp), %eax
movl -32(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -32(%rbp)
movl -40(%rbp), %eax
movl -36(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -36(%rbp)
movl -28(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, (%rdx)
movl -32(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 4(%rdx)
movl -36(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 8(%rdx)
movl -40(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 12(%rdx)
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
Optimization: 优化:
.section __TEXT,__text,regular,pure_instructions
.globl _rgb2cmyk
.align 4, 0x90
_rgb2cmyk: ## @rgb2cmyk
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
movl $255, %r8d
movl $255, %eax
subl %edi, %eax
movl $255, %edi
subl %esi, %edi
subl %edx, %r8d
cmpl %edi, %eax ##;; compare m and c
jge LBB0_2
## BB#1: ;; c < m
cmpl %r8d, %eax ## compare y and c
movl %r8d, %edx
cmovlel %eax, %edx
jmp LBB0_3
LBB0_2: ##;; c >= m
cmpl %r8d, %edi ## compare y and m
movl %r8d, %edx
cmovlel %edi, %edx
LBB0_3:
subl %edx, %eax
subl %edx, %edi
subl %edx, %r8d
movl %eax, (%rcx)
movl %edi, 4(%rcx)
movl %r8d, 8(%rcx)
movl %edx, 12(%rcx)
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
Yes. 是。 The optimized version performs many fewer memory read operations by storing intermediate values in registers and not reloading them over and over.
通过将中间值存储在寄存器中,而不是一遍又一遍地重新加载,优化后的版本执行的内存读取操作要少得多。
You are using call
wrong. 您使用的
call
错误。 It is a technical term that means to push a return address on the stack and branch to a new location for instructions. 这是一个技术术语,表示将返回地址压入堆栈,然后跳转到新的位置以获取指令。 The term you mean is simply to use the register.
您的意思是简单地使用寄存器。
Can you think of a reason that longer, slower code is "better"? 您能想到更长,更慢的代码“更好”的原因吗?
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.