[英]can you suggest me better solutions for this in C++ inline-assembly?
i am learning assembly and i started experiments on SSE and MMX registers within the Digital-Mars C++ compiler (intel sytanx more easily readable). 我正在学习汇编程序,并开始在Digital-Mars C ++编译器(更易于阅读的intel sytanx)中的SSE和MMX寄存器上进行实验。 I have finished a program that takes var_1 as a value and converts it to the var_2 number system(this is in 8 bit for now. will expand it to 32 64 128 later) . 我已经完成了一个将var_1作为值并将其转换为var_2数字系统的程序(目前为8位。稍后将其扩展为32 64 128)。 Program does this by two ways: 程序通过两种方式执行此操作:
__asm
inlining __asm
内联
Usual C++ way of %(modulo) operator. %(模)运算符的通常C ++方式。
Question: Can you tell me more efficient way to use xmm0-7 and mm0-7 registers and can you tell me how to exchange exact bytes of them with al,ah... 8 bit registers? 问题:您能告诉我使用xmm0-7和mm0-7寄存器的更有效方法吗?您能告诉我如何与al,ah ... 8位寄存器交换它们的确切字节吗?
Usual %(modulo) operator in the C++ usual way is very slow in comparison with __asm on my computer(pentium-m centrino 2.0GHz). 与计算机上的__asm(pentium-m centrino 2.0GHz)相比,C ++常用方式的%(modulo)运算符非常慢。 If you can tell me how to get rid of division instruction in __asmm, it will be even faster. 如果您能告诉我如何摆脱__asmm中的除法指令,它将更快。
When i run the program it gives me: 当我运行程序时,它给了我:
(for the values: var_1=17,var_2=2,all loops are 200M times)
17 is 10001 in number system 2
__asm(clock)...........: 7250 <------too bad. it is 8-bit calc.
C++(clock).............: 12250 <------not very slow(var_2 is a power of 2)
(for the values: var_1=33,var_2=7,all loops are 200M times)
33 is 45 in number system 7
__asm(clock)..........: 2875 <-------not good. it is 8-bit calc.
C++(clock)............: 6328 <----------------really slow(var_2 is not a power of 2)
The second C++ code(the one with % operator): ///////////////////////////////////////////////////////// 第二个C ++代码(带有%运算符的代码):////////////////////////////////////// ///////////////////
t1=clock();//reference time
for(int i=0;i<200000000;i++)
{
y=x;
counter=0;
while(y>g)
{
var_3[counter]=y%g;
y/=g;
counter++;
}
var_3[counter]=y%g;
}
t2=clock();//final time
_asm code://////////////////////////////////////////////////////////////////////////////////////////////////////////// _asm代码://////////////////////////////////////////////// ////////////////////////////////////////////////// ///////////
__asm // i love assembly in some parts of C++
{
pushf //here does register backup
push eax
push ebx
push ecx
push edx
push edi
mov eax,0h //this will be outer loop counter init to zero
//init of medium-big registers to zero
movd xmm0,eax //cannot set to immediate constant: xmm0=outer loop counter
shufps xmm0,xmm0,0h //this makes all bits zero
movd xmm1,eax
movd xmm2,eax
shufps xmm1,xmm1,0h
shufps xmm2,xmm2,0h
movd xmm2,eax
shufps xmm3,xmm3,0h//could have made pxor xmm3,xmm3(single instruction)
//init complete(xmm0,xmm1,xmm2,xmm3 are zero)
movd xmm1,[var_1] //storing variable_1 to register
movd xmm2,[var_2] //storing var_2 to register
lea ebx,var_3 //calculate var_3 address
movd xmm3,ebx //storing var_3's address to register
for_loop:
mov eax,0h
//this line is index-init to zero(digit array index)
movd edx,xmm2
mov cl,dl //this is the var_1 stored in cl
movd edx,xmm1
mov al,dl //this is the var_2 stored in al
mov edx,0h
dng:
mov ah,00h //preparation for a 8-bit division
div cl //divide
movd ebx,xmm3 //get var_3 address
add ebx,edx //i couldnt find a way to multiply with 4
add ebx,edx //so i added 4 times ^^
add ebx,edx //add
add ebx,edx //last adding
//below, mov [ebx],ah is the only memory accessing instruction
mov [ebx],ah //(8 bit)this line is equivalent to var_3[i]=remainder
inc edx //i++;
cmp al,00h //is division zero?
jne dng //if no, loop again
//here edi register has the number of digits
movd eax,xmm0 //get the outer loop counter from medium-big register
add eax,01h //j++;
movd xmm0,eax //store the new counter to medium-big register
cmp eax,0BEBC200h //is j<(200,000,000) ?
jb for_loop //if yes, go loop again
mov [var_3_size],edx //now we have number of digits too!
//here does registers revert back to old values
pop edi
pop edx
pop ecx
pop ebx
pop eax
popf
}
Whole code:///////////////////////////////////////////////////////////////////////////////////////// 整个代码:///////////////////////////////////////////////// //////////////////////////////////////////
#include <iostream.h>
#include <cmath>
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
int main()
{
srand(time(0));
clock_t t1=clock();
clock_t t2=clock();
int var_1=17; //number itself
int var_2=2; //number system
int var_3[100]; //digits to be showed(maximum 100 as seen )
int var_3_size=0;//asm block will decide what will the number of digits be
for(int i=0;i<100;i++)
{
var_3[i]=0; //here we initialize digits to zeroes
}
t1=clock();//reference time to take
__asm // i love assembly in some parts of C++
{
pushf //here does register backup
push eax
push ebx
push ecx
push edx
push edi
mov eax,0h //this will be outer loop counter init to zero
//init of medium-big registers to zero
movd xmm0,eax //cannot set to immediate constant: xmm0=outer loop counter
shufps xmm0,xmm0,0h //this makes all bits zero
movd xmm1,eax
movd xmm2,eax
shufps xmm1,xmm1,0h
shufps xmm2,xmm2,0h
movd xmm2,eax
shufps xmm3,xmm3,0h
//init complete(xmm0,xmm1,xmm2,xmm3 are zero)
movd xmm1,[var_1] //storing variable_1 to register
movd xmm2,[var_2] //storing var_2 to register
lea ebx,var_3 //calculate var_3 address
movd xmm3,ebx //storing var_3's address to register
for_loop:
mov eax,0h
//this line is index-init to zero(digit array index)
movd edx,xmm2
mov cl,dl //this is the var_1 stored in cl
movd edx,xmm1
mov al,dl //this is the var_2 stored in al
mov edx,0h
dng:
mov ah,00h //preparation for a 8-bit division
div cl //divide
movd ebx,xmm3 //get var_3 address
add ebx,edx //i couldnt find a way to multiply with 4
add ebx,edx //so i added 4 times ^^
add ebx,edx //add
add ebx,edx //last adding
//below, mov [ebx],ah is the only memory accessing instruction
mov [ebx],ah //(8 bit)this line is equivalent to var_3[i]=remainder
inc edx //i++;
cmp al,00h //is division zero?
jne dng //if no, loop again
//here edi register has the number of digits
movd eax,xmm0 //get the outer loop counter from medium-big register
add eax,01h //j++;
movd xmm0,eax //store the new counter to medium-big register
cmp eax,0BEBC200h //is j<(200,000,000) ?
jb for_loop //if yes, go loop again
mov [var_3_size],edx //now we have number of digits too!
//here does registers revert back to old values
pop edi
pop edx
pop ecx
pop ebx
pop eax
popf
}
t2=clock(); //finish time
printf("\n assembly_inline(clocks): %i for the 200 million calculations",(t2-t1));
printf("\n value %i(in decimal) is: ",var_1);
for(int i=var_3_size-1;i>=0;i--)
{
printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",var_2);
//and: more readable form(end easier)
int counter=var_3_size;
int x=var_1;
int g=var_2;
int y=x;// backup
t1=clock();//reference time
for(int i=0;i<200000000;i++)
{
y=x;
counter=0;
while(y>g)
{
var_3[counter]=y%g;
y/=g;
counter++;
}
var_3[counter]=y%g;
}
t2=clock();//final time
printf("\n C++(clocks): %i for the 200 million calculations",(t2-t1));
printf("\n value %i(in decimal) is: ",x);
for(int i=var_3_size-1;i>=0;i--)
{
printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",g);
return 0;
} }
edit: this is 32-bit version 编辑:这是32位版本
void get_digits_asm()
{
__asm
{
pushf //couldnt store this in other registers
movd xmm0,eax//storing in xmm registers instead of pushing
movd xmm1,ebx//
movd xmm2,ecx//
movd xmm3,edx//
movd xmm4,edi//end of push backups
mov eax,[variable_x]
mov ebx,[number_system]
mov ecx,0h
mov edi,0h
begin_loop:
mov edx,0h
div ebx
lea edi,digits
mov [edi+ecx*4],edx
add ecx,01h
cmp eax,ebx
ja begin_loop
mov edx,0
div ebx
lea edi,digits
mov [edi+ecx*4],edx
inc ecx
mov [digits_total],ecx
movd edi,xmm4//pop edi
movd edx,xmm3//pop edx
movd ecx,xmm2//pop ecx
movd ebx,xmm1//pop ebx
movd eax,xmm0//pop eax
popf
}
}
The code can be much simpler of course: (modeled after the C++ version, does not include pushes and pops, and not tested) 代码当然可以简单得多:(以C ++版本为模型,不包含push和pop,并且未经测试)
mov esi,200000000
_bigloop:
mov eax,[y]
mov ebx,[g]
lea edi,var_3
; eax = y
; ebx = g
; edi = var_3
xor ecx,ecx
; ecx = counter
_loop:
xor edx,edx
div ebx
mov [edi+ecx*4],edx
add ecx,1
test eax,eax
jnz _loop
sub esi,1
jnz _bigloop
But I would be surprised if it was faster than the C++ version, and in fact it'll almost certainly be slower if the base is a power of two - all sane compilers know how to turn a division and/or modulo by a power of two into bitshifts and bitwise ands. 但是如果它比C ++版本快,我会感到惊讶,实际上,如果基数是2的幂,它几乎肯定会变慢-所有理智的编译器都知道如何使用p的幂进行除法和/或模分为移位和按位与。
Here's a version that uses ab 8-bit division. 这是使用8位除法的版本。 Similar caveats apply, but now the division could even overflow (if y / g
is more than 255). 适用类似的警告,但现在除法甚至可能溢出(如果y / g
大于255)。
mov esi,200000000
_bigloop:
mov eax,[y]
mov ebx,[g]
lea edi,var_3
; eax = y
; ebx = g
; edi = var_3
xor ecx,ecx
; ecx = counter
_loop:
div bl
mov [edi+ecx],ah
add ecx,1
and eax,0xFF
jnz _loop
sub esi,1
jnz _bigloop
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.