C++优化方法

jf_B3xDfZry 2022-10-24 323

嵌入式技术

1335人已加入

描述

C++ 优化方法

1.单个线程中，将加锁变量取出来，传递给局部变量，然后释放加锁变量，可有效减少加锁时间。

void QCJSynchronizer::core() {
    while (run_flag_) {
        std::deque data;
        {
            std::unique_lock lock(mutex_);
            if (msg_deque_.empty()) {
                condition_.wait(lock);
                if (!run_flag_) {
                    continue;
                }
            }
            data = msg_deque_;   // msg指针传递给局部变量data指针
            msg_deque_.clear();  // msg指针清空，不再指向内存空间
        }
  // 后续就是线程内部对局部变量的处理，不需要加速
        while (!data.empty()) {
   // work
            // 1. 正常情况,所有的消息序列都在均匀地接收消息数据
            if (deque.size() == (size_t) 1) {
    // work
            } else {
            }

            // 2. 不正常情况,某一个已经超过了queue_size
   // work
        }
    }
}

2.4个变量的向量需要计算,可使用vmulq_f32并行处理加速

// ××××××××××××××  x86平台   ××××××××××××××
void so3(const float *src, float *tgt) const {
    __m128 p0 = _mm_mul_ps(_mm_load_ps1(&src[0]), c[0]);   // 并行处理
    __m128 p1 = _mm_mul_ps(_mm_load_ps1(&src[1]), c[1]);
    __m128 p2 = _mm_mul_ps(_mm_load_ps1(&src[2]), c[2]);
    _mm_store_ps(tgt, _mm_add_ps(p0, _mm_add_ps(p1, p2)));
}

_mm_mul_ps:

Multiplies the four single-precision, floating-point values of a and b.

3.Strength reduction

3.1 由于不同指令本身的速度就是不一样的，比较、整型的加减、位操作速度都是最快的，而除法/取余却很慢。

#include 
#include 


// 获取一个整数对应10进制的位数
uint32_t digits10_v1(uint64_t v) {
    uint32_t result = 0;
    do {
        ++result;
        v /= 10;
    } while (v);
    return result;
}

uint32_t digits10_v2(uint64_t v) {
  uint32_t result = 1;
  for (;;) {
    if (v < 10) return result;
    if (v < 100) return result + 1;
    if (v < 1000) return result + 2;
    if (v < 10000) return result + 3;
    // Skip ahead by 4 orders of magnitude
    v /= 10000U;
    result += 4;
  }
}

uint32_t digits10_v3(uint64_t v) {
    if (v < 10) return 1;
    if (v < 100) return 2;
    if (v < 1000) return 3;
    if (v < 1000000000000) {    // 10^12
        if (v < 100000000) {    // 10^7
            if (v < 1000000) {  // 10^6
                if (v < 10000) return 4;
                return 5 + (v >= 100000); // 10^5
            }
            return 7 + (v >= 10000000); // 10^7
        }
        if (v < 10000000000) {  // 10^10
            return 9 + (v >= 1000000000); // 10^9
        }
        return 11 + (v >= 100000000000); // 10^11
    }
    return 12 + digits10_v3(v / 1000000000000); // 10^12
}

#define ITEM_COUNT 100
#define RUN_TIMES 99

int main(int argc, char **argv)
{
    srand(100);
    uint64_t digit10_array[ITEM_COUNT];
    for( int i = 0; i < ITEM_COUNT; ++i )
    {
        digit10_array[i] = rand();
    }

  struct timeval start, end;

  // digits10_v1
    uint64_t sum1 = 0;
    uint64_t time1 = 0;
    gettimeofday(&start,NULL);
    for( int i = 0; i < RUN_TIMES; ++i )
    {
        sum1 += digits10_v1(digit10_array[i]);
    }
    gettimeofday(&end,NULL);
    time1 = ( end.tv_sec - start.tv_sec ) * 1000 * 1000 +  end.tv_usec - start.tv_usec;

// digits10_v2
    uint64_t sum2 = 0;
    uint64_t time2 = 0;
    gettimeofday(&start,NULL);
    for( int i = 0; i < RUN_TIMES; ++i )
    {
        sum2 += digits10_v2(digit10_array[i]);
    }
    gettimeofday(&end,NULL);
    time2 = ( end.tv_sec - start.tv_sec ) * 1000 * 1000 +  end.tv_usec - start.tv_usec;

// digits10_v3
    uint64_t sum3 = 0;
    uint64_t time3 = 0;
    gettimeofday(&start,NULL);
    for( int i = 0; i < RUN_TIMES; ++i )
    {
        sum3 += digits10_v3(digit10_array[i]);
    }
    gettimeofday(&end,NULL);
    time3 = ( end.tv_sec - start.tv_sec ) * 1000 * 1000 +  end.tv_usec - start.tv_usec;


    std::cout << "sum1:" << sum1 << "	 sum2:" << sum2 << "	 sum3:" << sum3 << std::endl;
    std::cout << "cost1:" << time1 << "us	 cost2:" << time2 << "us	 cost3:" << time3 << "us"
         << "	 cost2/cost1:" << (1.0*time2)/time1
         << "	 cost3/cost1:" << (1.0*time3)/time1 << std::endl;

  return 0;
}

编译一下:

qiancj@qianchengjun:~/ShareFolder/5Codes/test/build$ g++ -g -O2 ../optimizatize.cpp  && ./a.out

结果:

sum1:935         sum2:935        sum3:935
cost1:6us        cost2:3us       cost3:1us       cost2/cost1:0.5         cost3/cost1:0.166667

核心原因是 digits10_v2 用比较和加法来减少除法 (/=) 操作，digits10_v3 通过搜索的方式进一步减少了除法操作。

3.2 整形:

1.类型转换：

int--> short/char (0~1 clock cycle)

int --> float/double (4~16个clock cycle), signed int 快于 unsigned int，唯一一个场景 signed 比 unsigned 快的

short/char 的计算通常使用 32bit 存储，只是返回的时候做了截取，故只在要考虑内存大小的时候才使用 short/char，如 array

注：隐式类型转换可能会溢出，有符号的溢出变成负数，无符号的溢出变成小的整数

2.运算：

除法、取余运算unsigned int 快于signed int

除以常量比除以变量效率高，因为可以在编译期做优化，尤其是常量可以表示成2^n时

++i和i++本身性能一样，但不同的语境效果不一样，如array[i++]比arry[++i]性能好；当依赖自增结果时，++i性能更好，如a=++b,a和b可复用同一个寄存器

代码示例

// div和mod效率
Timer timer;
int a, b, c;
timer.tic();
a = b / c; // This is slow
double time4 = timer.toc(true);
std::cout << "a = b / c " << time4 * 1000.0 << " us" << std::endl;

timer.tic();
a = b / 10; // Division by a constant is faster
double time5 = timer.toc(true);
std::cout << "a = b / 10 " << time5 * 1000.0 << " us" << std::endl;

timer.tic();
a = (unsigned int)b / 10; // Still faster if unsigned
double time6 = timer.toc(true);
std::cout << "a = (unsigned int)b / 10 " << time6 * 1000.0 << " us" << std::endl;

timer.tic();
a = b / 16; // Faster if divisor is a power of 2
double time7 = timer.toc(true);
std::cout << "a = b / 16 " << time7 * 1000.0 << " us" << std::endl;

timer.tic();
a = (unsigned int)b / 16; // Still faster if unsigned
double time8 = timer.toc(true);
std::cout << "a = (unsigned int)b / 16 " << time8 * 1000.0 << " us" << std::endl;

结果:

公式	耗时(us)
a = b / c	0.054
a = b / 10	0.039
a = (unsigned int)b / 10	0.038
a = b / 16	0.036
a = (unsigned int)b / 16	0.037

3.3 浮点型:

单精度、双精度的计算性能是一样的

常量的默认精度是双精度

不要混淆单精度、双精度，混合精度计算会带来额外的精度转换开销

// 混用
float a, b;
a = b * 1.2; // bad. 先将b转换成double，返回结果转回成float

float a, b;
a = b * 1.2f; // ok. everything is float

double a, b;
a = b * 1.2; // ok. everything is double

浮点除法比乘法慢很多，故可以利用乘法来加快速度

double y, a1, a2, b1, b2;
y = a1/b1 + a2/b2;              // slow
double y, a1, a2, b1, b2;
y = (a1*b2 + a2*b1) / (b1*b2); // faster

实际结果差不多,在除法较多的情况下,第二种方式肯定要快一点.

y = a1/b1 + a2/b2              0.026 us
y = (a1*b2 + a2*b1) / (b1*b2); 0.026 us

时间比较：

comparisons                         (1 clock cycle)
(u)int add, subtract, bitops, shift (1 clock cycle)
floating point add, sub             (3~6 clock cycle) 
indexed array access                (cache effects)
(u)int32 mul                        (3~4 clock cycle)
Floating point mul                  (4~8 clock cycle)
Float Point division, remainder     (14~45 clock cycle)
(u)int division, remainder          (40~80 clock cycle)

3.4 减少内存写操作 一个很自然的优化想法，应该尽量避免内存写操作；对于内存读取,尽可能顺序访问内存

struct Bitfield {
int a:4;
int b:2;
int c:2;
};

Bitfield x;
int A, B, C;
x.a = A;
x.b = B;
x.c = C;

假定 A、B、C 都很小，且不会溢出，可以写成

union Bitfield {
struct {
int a:4;
int b:2;
int c:2;
};
char abc;
};
Bitfield x;
int A, B, C;
x.abc = A | (B << 4) | (C << 6);

如果需要考虑溢出，也可以改为

x.abc = (A & 0x0F) | ((B & 3) << 4) | ((C & 3) <<6 );

3.5 避免不必要的函数，特别在最底层的循环，应该尽量让代码在一个函数内。看起来与良好的编码习惯冲突(一个函数最好不要超过80行)，其实不然，跟这个系列其他优化一样，我们应该知道何时去使用这些优化，而不是一上来就让代码不可读。

3.6 尝试使用inline函数，让函数调用的地方直接用函数体替换。inline对编译器来说是个建议，而且不是inline了性能就好，一般当函数比较小或者只有一个地方调用的时候，inline效果会比较好

3.7 在函数内部使用循环

(e.g., change for(i=0;i<100;i++) DoSomething(); into DoSomething() { for(i=0;i<100;i++) { ... } } )

3.8 减少函数的间接调用，如偏向静态链接而不是动态链接，尽量少用或者不用多继承、虚拟继承

3.9 优先使用迭代而不是递归

3.10 使用函数来替换define，从而避免多次求值。宏的其他缺点：不能overload和限制作用域（undef除外）

// Use macro as inline function
#define MAX(a,b) ((a) > (b) ? (a) : (b))
y = MAX(f(x), g(x));

// Replace macro by template
template 
static inline T max(T const & a, T const & b) {
return a > b ? a : b;
}

3.11 ** 尽可能的减少跳转和分支**,对于长的if...else，使用switch case，以减少后面条件的判断，把最容易出现的条件放在最前面

3.12 在跳转之间的代码尽量减少数据依赖

3.13 多使用引用传递参数，减少值传递

4 常见的优化手段

1. 消除条件分支

代码实例

if (a < b) {
  r = c;
} else {
  r = d;
}

优化版本1

int mask = (a-b) >> 31;
r = (mask & c) | (~mask & d);

优化版本2

int mask = (a-b) >> 31;
r = d + mask & (c-d);

优化版本3

// cmovg版本
r = (a < b) ？c : d;

bool 类型变换

实例代码

bool a, b, c, d;
c = a && b;
d = a || b;

编译器的行为是

bool a, b, c, d;
if (a != 0) {
    if (b != 0) {
        c = 1;
    }
    else {
        goto CFALSE;
    }
}
else {
CFALSE:
    c = 0;
}
if (a == 0) {
    if (b == 0) {
        d = 0;
    }
    else {
        goto DTRUE;
    }
}
else {
DTRUE:
    d = 1;
}

优化版本

char a = 0, b = 0, c, d;
c = a & b;
d = a | b;

实例代码2

bool a, b;
b = !a;
// 优化成
char a = 0, b;
b = a ^ 1;

反例

a && b 何时不能转换成 a & b，当 a 不可能为 false 的情况下

a | | b 何时不能转换成 a | b，当 a 不可能为 true 的情况下

2. 循环展开

实例代码

int i;
for (i = 0; i < 20; i++) {
    if (i % 2 == 0) {
        FuncA(i);
    }
    else {
        FuncB(i);
    }
    FuncC(i);
}

优化版本

int i;
for (i = 0; i < 20; i += 2) {
    FuncA(i);
    FuncC(i);
    FuncB(i+1);
    FuncC(i+1);
}

优化说明

优点：减少比较次数、某些CPU上重复次数越少预测越准、去掉了if判断

缺点：需要更多的code cache or micro-op cache、有些处理器(core 2)对于小循环性能很好(小于65bytes code)、循环的次数和展开的个数不匹配

一般编译器会自动展开循环，程序员不需要主动去做，除非有一些明显优点，比如减少上面的if判断

3. 边界检查

实例代码1

const int size = 16; int i;
float list[size];
...
if (i < 0 || i >= size) {
    cout << "Error: Index out of range";
}
else {
    list[i] += 1.0f;
}

// 优化版本

if ((unsigned int)i >= (unsigned int)size) {
    cout << "Error: Index out of range";
}else {
    list[i] += 1.0f;
}

实例代码2

const int min = 100, max = 110; int i;
...
if (i >= min && i <= max) { ...

//优化版本

if ((unsigned int)(i - min) <= (unsigned int)(max - min)) { ...

4. 使用数组

实例代码1

float a; int b;
a = (b == 0) ? 1.0f : 2.5f;

// 使用静态数组
float a; int b;
static const float OneOrTwo5[2] = {1.0f, 2.5f};
a = OneOrTwo5[b & 1];

实例代码2

// 数组的长度是2的幂
float list[16]; int i;
...
list[i & 15] += 1.0f;

5. 整形的 bit array 语义，适用于 enum、const、define

enum Weekdays {
    Sunday, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday
};
Weekdays Day;
if (Day == Tuesday || Day == Wednesday || Day == Friday) {
    DoThisThreeTimesAWeek();
}

// 优化版本 using &
enum Weekdays {
    Sunday = 1, Monday = 2, Tuesday = 4, Wednesday = 8,
    Thursday = 0x10, Friday = 0x20, Saturday = 0x40
};
Weekdays Day;
if (Day & (Tuesday | Wednesday | Friday)) {
    DoThisThreeTimesAWeek();
}

　　审核编辑：汤梓红

打开APP阅读更多精彩内容