我怎样才能优化这个计算? (x ^ a + y ^ a + z ^ a)^(1 / a)

正如标题所示。 我需要做很多这样的计算:

re = (x^a + y^a + z^a)^(1/a).

其中{x,y,z}> = 0.更具体地说,a是正浮点常量,x,y,z是浮点数。 ^是一个指数运算符。

目前,我不想使用SIMD,但希望其他一些技巧加快速度。

static void heavy_load(void) {
  static struct xyz_t {
    float x,y,z;
  };
  struct xyz_t xyzs[10000];
  float re[10000] = {.0f};
  const float a = 0.2;

  /* here fill xyzs using some random positive floating point values */

  for (i = 0; i < 10000; ++ i) {
    /* here is what we need to optimize */
    re[i] = pow((pow(xyzs[i].x, a) + pow(xyzs[i].y, a) + pow(xyzs[i].z, a)), 1.0/a);
  }
}

首先要做的是通过找出其中一个术语来摆脱其中一个幂指数。 下面的代码使用

(x^a + y^a + z^a)^(1/a) = x * ((1 + (y/x)^a + (z/x)^a)^(1/a))

分解三者中最大的一个会更安全,也许更准确。

另一个优化是利用a是0.1或0.2的事实。 使用Chebychev多项式近似逼近x ^ a。 下面的代码只有x ^ 0.1的近似值; x ^ 0.2就是那个平方。 最后,由于1 / a是一个小整数(5或10),最终的乘方可以用少量的乘法代替。

要查看函数powtenthpowtenthnorm中发生了powtenth ,请参阅此答案:使用const非整数指数优化pow()? 。

#include <stdlib.h>
#include <math.h>


float powfive (float x);
float powtenth (float x);
float powtenthnorm (float x);

// Returns (x^0.2 + y^0.2 + z^0.2)^5
float pnormfifth (float x, float y, float z) {
   float u = powtenth(y/x);
   float v = powtenth(z/x);
   return x * powfive (1.0f + u*u + v*v);
}

// Returns (x^0.1 + y^0.1 + z^0.1)^10
float pnormtenth (float x, float y, float z) {
   float u = powtenth(y/x);
   float v = powtenth(z/x);
   float w = powfive (1.0f + u + v);
   return x * w * w;
}

// Returns x^5
float powfive (float x) {
   float xsq = x*x;
   return xsq*xsq*x;
}

// Returns x^0.1.
float powtenth (float x) {
   static const float pow2_tenth[10] = {
      1.0,
      pow(2.0, 0.1),
      pow(4.0, 0.1),
      pow(8.0, 0.1),
      pow(16.0, 0.1),
      pow(32.0, 0.1),
      pow(64.0, 0.1),
      pow(128.0, 0.1),
      pow(256.0, 0.1),
      pow(512.0, 0.1)
   };

   float s;
   int iexp;

   s = frexpf (x, &iexp);
   s *= 2.0;
   iexp -= 1;

   div_t qr = div (iexp, 10);
   if (qr.rem < 0) {
      qr.quot -= 1;
      qr.rem += 10;
   }

   return ldexpf (powtenthnorm(s)*pow2_tenth[qr.rem], qr.quot);
}

// Returns x^0.1 for x in [1,2), to within 1.2e-7 (relative error).
// Want more precision? Add more Chebychev polynomial coefs.
float powtenthnorm (float x) {
   static const int N = 8;

   // Chebychev polynomial terms.
   // Non-zero terms calculated via
   //   integrate (2/pi)*ChebyshevT[n,u]/sqrt(1-u^2)*((u+3)/2)^0.1
   //   from -1 to 1
   // Zeroth term is similar except it uses 1/pi rather than 2/pi.
   static const float Cn[N] = {
       1.0386703502389972,
       3.55833786872637476e-2,
      -2.7458105122604368629e-3,
       2.9828558990819401155e-4,
      -3.70977182883591745389e-5,
       4.96412322412154169407e-6,
      -6.9550743747592849e-7,
       1.00572368333558264698e-7};

   float Tn[N];

   float u = 2.0*x - 3.0;


   Tn[0] = 1.0;
   Tn[1] = u;
   for (int ii = 2; ii < N; ++ii) {
      Tn[ii] = 2*u*Tn[ii-1] - Tn[ii-2];
   }

   float y = 0.0;
   for (int ii = N-1; ii > 0; --ii) {
      y += Cn[ii]*Tn[ii];
   }

   return y + Cn[0];
}

一般的优化是改善缓存局部性。 将结果与参数一起保存。 使用powf而不是pow可以避免浪费时间浪费双向往返。 任何合理的编译器都会将1/a提升出循环,所以这不成问题。

你应该剖析一下,编译器可能完成的自动矢量化是否会因为没有对指向数组的指针使用显式索引而被愚弄。

typedef struct {
  float x,y,z, re;
} element_t;

void heavy_load(element_t * const elements, const int num_elts, const float a) {
  element_t * elts = elements;
  for (i = 0; i < num_elts; ++ i) {
    elts->re = 
      powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
    elts ++;
  }
}

使用SSE2 SIMD,它将如下所示。 我正在使用并包括Julien Pommier的sse_mathfun.h摘录的摘录; 许可证附在下面。

#include <emmintrin.h>
#include <math.h>
#include <assert.h>

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

typedef __m128 v4sf;  // vector of 4 float (sse1)
typedef __m128i v4si; // vector of 4 int (sse2)

#define _PS_CONST(Name, Val)                                            
    static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val)                                            
    static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(exp_hi,   88.3762626647949f);
_PS_CONST(exp_lo,   -88.3762626647949f);
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

v4sf exp_ps(v4sf x) {
    v4sf tmp = _mm_setzero_ps(), fx;
    v4si emm0;
    v4sf one = *(v4sf*)_ps_1;

    x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
    x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

    fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
    fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

    emm0 = _mm_cvttps_epi32(fx);
    tmp  = _mm_cvtepi32_ps(emm0);

    v4sf mask = _mm_cmpgt_ps(tmp, fx);
    mask = _mm_and_ps(mask, one);
    fx = _mm_sub_ps(tmp, mask);

    tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
    v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
    x = _mm_sub_ps(x, tmp);
    x = _mm_sub_ps(x, z);

    z = _mm_mul_ps(x,x);

    v4sf y = *(v4sf*)_ps_cephes_exp_p0;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, x);
    y = _mm_add_ps(y, one);

    emm0 = _mm_cvttps_epi32(fx);
    emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
    emm0 = _mm_slli_epi32(emm0, 23);
    v4sf pow2n = _mm_castsi128_ps(emm0);
    y = _mm_mul_ps(y, pow2n);
    return y;
}

typedef struct {
    float x,y,z, re;
} element_t;   

void fast(element_t * const elements, const int num_elts, const float a) {
    element_t * elts = elements;
    float logf_a = logf(a);
    float logf_1_a = logf(1.0/a);
    v4sf log_a = _mm_load1_ps(&logf_a);
    v4sf log_1_a = _mm_load1_ps(&logf_1_a);
    assert(num_elts % 3 == 0); // operates on 3 elements at a time

    // elts->re = powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
    for (int i = 0; i < num_elts; i += 3) {
        // transpose
        // we save one operation over _MM_TRANSPOSE4_PS by skipping the last row of output
        v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,0
        v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,0
        v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,0
        v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0
        v4sf t0 = _mm_unpacklo_ps(r0, r1); //  x1,x2,y1,y2
        v4sf t1 = _mm_unpacklo_ps(r2, r3); //  x3,0, y3,0
        v4sf t2 = _mm_unpackhi_ps(r0, r1); //  z1,z2,0, 0
        v4sf t3 = _mm_unpackhi_ps(r2, r3); //  z3,0, 0, 0
        r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
        r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
        r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0
        // perform pow(x,a),.. using the fact that pow(x,a) = exp(x * log(a))
        v4sf r0a = _mm_mul_ps(r0, log_a); // x1*log(a), x2*log(a), x3*log(a), 0
        v4sf r1a = _mm_mul_ps(r1, log_a); // y1*log(a), y2*log(a), y3*log(a), 0
        v4sf r2a = _mm_mul_ps(r2, log_a); // z1*log(a), z2*log(a), z3*log(a), 0
        v4sf ex0 = exp_ps(r0a); // pow(x1, a), ..., 0
        v4sf ex1 = exp_ps(r1a); // pow(y1, a), ..., 0
        v4sf ex2 = exp_ps(r2a); // pow(z1, a), ..., 0
        // sum
        v4sf s1 = _mm_add_ps(ex0, ex1);
        v4sf s2 = _mm_add_ps(sum1, ex2);
        // pow(sum, 1/a) = exp(sum * log(1/a))
        v4sf ps = _mm_mul_ps(s2, log_1_a);
        v4sf es = exp_ps(ps);
        ALIGN16_BEG float re[4] ALIGN16_END;
        _mm_store_ps(re, es);
        elts[0].re = re[0];
        elts[1].re = re[1];
        elts[2].re = re[2];
        elts += 3;
    }
}


/* Copyright (C) 2007  Julien Pommier
  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.
  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:
  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution. */

一些C优化 - 并不多,但是有些东西。

[编辑]对不起 - 回头来烦恼:如果FP值变化很大,那么通常是re = a (或b或c)。 做一个大幅度差异的测试会否定在x,y或z的某些区域调用pow()的需要。 这有助于平均时间,但不是最坏时间。

用在循环之前设置的a_inverse代替1.0/a

powf()替换pow() ,否则你调用pow()double版本。

次要:除了清理内存高速缓存之外,不需要使用float re[10000] = { 0.0f}初始化。

次要:使用指针而不是索引数组可能会节省一点。

次要:对于选择的平台,使用类型double可能运行得更快。 - 颠倒我上面的pow()/powf()评论。

轻微:尝试3个独立的x,y,z阵列。 每个float *

显然,分析有助于理解,但让我们假设这里已经很清楚。

链接地址: http://www.djcxy.com/p/15027.html

上一篇: How could I optimize this calculation ? (x^a + y^a +z^a)^(1/a)

下一篇: 1,000,000,000 calculations per microsecond?