優化2D旋轉

Question

給出在2D空間中旋轉點的經典公式：

cv::Point pt[NPOINTS];
cv::Point rotated[NPOINTS];
float angle = WHATEVER;
float cosine = cos(angle);
float sine = sin(angle);

for (int i = 0; i < NPOINTS; i++)
{
    rotated[i].x = pt[i].x * cosine - pt[i].y * sine;
    rotated[i].y = pt[i].x * sine   + pt[i].y * cosine;
}

鑒於NPOINTS是32並且陣列是對齊的，如何優化SSE或AVX的代碼？ 在這里和其他地方搜索沒有發現任何有用的東西，我在這里迷路了：

__m128i onePoint = _mm_set_epi32(pt[i].x, pt[i].y, pt[i].x, pt[i].y);
__m128 onefPoint = _m128_cvtepi32_ps(onePoint);
__m128 sinCos = _mm_set_ps(cosine, -sine, sine, cosine);
__m128 rotated = _mm_mul_ps(onefPoint, sinCos);

但是如何從[y*cosine, -x*sine, x*sine, y*cosine]到[y*cosine + -x*sine, x*sine + y*cosine] ？ 這是最好的方法嗎？ 它容易擴展到__m512嗎？

更新：我做了一些研究，現在我有大約：

__m128i onePoint = _mm_set_epi32(pt[i].x, pt[i].y, pt[i].x, pt[i].y);
__m128 onefPoint = _m128_cvtepi32_ps(onePoint);
__m128i twoPoint = _mm_set_epi32(pt[i+1].x, pt[i+1].y, pt[i+1].x, pt[i+1].y);
__m128 twofPoint = _m128_cvtepi32_ps(twoPoint);
__m128 sinCos = _mm_set_ps(cosine, -sine, sine, cosine);
__m128 rotated1 = _mm_mul_ps(onefPoint, sinCos);
__m128 rotated2 = _mm_mul_ps(twofPoint, sinCos);
__m128 added = _mm_hadd_ps(rotated1, rotated2);
__m128i intResult = _mm_cvtps_epi32(added);
int results[4];
_mm_storeu_si128((__m128i*)results, intResult);

這使得從處理器時間的11％到約6％的速度提高了50％。 擴展到__m256並一次做四個點可以提高速度。 這看起來非常糟糕，但我正朝着正確的方向前進嗎？

Answer 1

使用數組結構數組（AoSoA）並一次處理八個點。 在下面的代碼中， point8是包含八個點的數組結構。 函數rotate_point8旋轉八個點並具有與旋轉單個點的函數rotate_point相同的代數結構。 函數rotate_all8使用AoSoA point8*旋轉32個點。

單點旋轉代碼進行4次乘法，一次加法和一次減法。

如果我們查看rotate_point8的程序集，我們會看到GCC展開循環並進行4次SIMD乘法，一次SIMD加法，每次展開一次SIMD減法。 這是你能做的最好的事情：一個價格八個。

#include <x86intrin.h>
#include <stdio.h>
#include <math.h>

struct point8 {
  __m256 x;
  __m256 y;
};

struct point {
  float x;
  float y;
};

static point rotate_point(point p, float a, float b) {
  point r;
  r.x = p.x*a - p.y*b;
  r.y = p.x*b + p.y*a;
  return r;
}

static point8 rotate_point8(point8 p, float a, float b) {
  __m256 va = _mm256_set1_ps(a), vb = _mm256_set1_ps(b);
  point8 r;
  r.x = _mm256_sub_ps(_mm256_mul_ps(p.x,va), _mm256_mul_ps(p.y,vb));
  r.y = _mm256_add_ps(_mm256_mul_ps(p.x,vb), _mm256_mul_ps(p.y,va));
  return r;
}

void rotate_all(point* points, point* r, float angle) {
  float a = cos(angle), b = sin(angle);
  for(int i=0; i<32; i++) r[i] = rotate_point(points[i], a, b);
}

void rotate_all8(point8* points, point8* r8, float angle) {
  float a = cos(angle), b = sin(angle);
  for(int i=0; i<4; i++) r8[i] = rotate_point8(points[i], a, b);
}

int main(void) {
  float x[32], y[32];
  point p[32], r[32];
  point8 p8[4], r8[4];
  float angle = 3.14159f/4;

  for(int i=0; i<32; i++) y[i] = 1.0*i/31, x[i] = sqrt(1-y[i]*y[i]);
  for(int i=0; i<32; i++) p[i].x = x[i], p[i].y = y[i];
  for(int i=0; i<4; i++) p8[i].x = _mm256_load_ps(&x[8*i]), p8[i].y = _mm256_load_ps(&y[8*i]); 

  for(int i=0; i<32; i++) printf("%f %f\n", p[i].x, p[i].y); puts("");

  rotate_all(p, r, angle);
  for(int i=0; i<32; i++) printf("%f %f\n", r[i].x, r[i].y); puts("");

  rotate_all8(p8, r8, angle);
  for(int i=0; i<4; i++) {
    _mm256_storeu_ps(x, r8[i].x), _mm256_storeu_ps(y, r8[i].y);
    for(int j=0; j<8; j++) printf("%f %f\n", x[j], y[j]);
  }
}

優化2D旋轉

問題描述

1 個解決方案

解決方案1
1 已采納 2017-02-24 10:14:11

優化2D旋轉

問題描述

1 個解決方案

解決方案1 1 已采納 2017-02-24 10:14:11

解決方案1
1 已采納 2017-02-24 10:14:11