SIMD与普通代码的效率比较
#include "testfun.h"

#include <intrin.h>
#include <time.h>

#include <iostream>
#include <new>

#include "global_tmpl.h"
// Number of float elements in each benchmark buffer; also the loop bound
// for every kernel in this file. At 1e9 elements each buffer is roughly
// 4 GB (assuming a 4-byte float), so the three buffers need about 12 GB.
#define dataLen 1000000000
namespace
{
// Owns one 16-byte-aligned float array for the duration of a benchmark run.
// _mm_load_ps/_mm_store_ps require 16-byte-aligned addresses, which
// operator new[] is not required to provide for float arrays, so the
// storage is obtained from _mm_malloc and released in the destructor.
struct TestfunAlignedBuffer
{
    float *data;

    explicit TestfunAlignedBuffer(size_t count)
        : data(static_cast<float*>(_mm_malloc(count * sizeof(float), 16)))
    {
        // Keep the failure mode of the new[] this replaces.
        if (!data)
            throw std::bad_alloc();
    }

    ~TestfunAlignedBuffer() { _mm_free(data); }

private:
    // Non-copyable: a copy would free the same allocation twice.
    TestfunAlignedBuffer(const TestfunAlignedBuffer&);
    TestfunAlignedBuffer& operator=(const TestfunAlignedBuffer&);
};

// Puts the buffers into the state both timed runs start from:
// lhs[i] = i, rhs[i] = 1, out[i] = 0 for every i in [0, count).
void testfunResetBuffers(float *lhs, float *rhs, float *out, int count)
{
    for (int i = 0; i < count; i++)
    {
        lhs[i] = static_cast<float>(i);
        rhs[i] = 1;
        out[i] = 0;
    }
}

// Converts a clock() tick difference into whole milliseconds.
// clock() counts in units of 1/CLOCKS_PER_SEC seconds, which is not
// milliseconds on every platform.
long testfunTicksToMs(clock_t ticks)
{
    return static_cast<long>(ticks * 1000.0 / CLOCKS_PER_SEC);
}
} // namespace

// Times an element-wise multiply of two dataLen-element float arrays,
// first with SSE intrinsics (4 floats per iteration) and then with a plain
// scalar loop, printing the elapsed time of each run to std::cout.
//
// Throws std::bad_alloc if any of the three buffers cannot be allocated.
// All buffers are released on every exit path.
void testfun()
{
    TestfunAlignedBuffer storage1(dataLen);
    TestfunAlignedBuffer storage2(dataLen);
    TestfunAlignedBuffer storage3(dataLen);
    float *buf1 = storage1.data;
    float *buf2 = storage2.data;
    float *buf3 = storage3.data;

    testfunResetBuffers(buf1, buf2, buf3, dataLen);

    //================== SIMD code ==================
    clock_t t = clock();
    const int vecCount = dataLen / 4;   // number of complete 4-float groups
    float *p1 = buf1;
    float *p2 = buf2;
    float *p3 = buf3;
    for (int v = 0; v < vecCount; v++)
    {
        const __m128 data1 = _mm_load_ps(p1);
        const __m128 data2 = _mm_load_ps(p2);
        _mm_store_ps(p3, _mm_mul_ps(data1, data2));
        p1 += 4;
        p2 += 4;
        p3 += 4;
    }
    // Scalar tail so no element is skipped when dataLen is not a multiple of 4.
    for (int i = vecCount * 4; i < dataLen; i++)
    {
        buf3[i] = buf1[i] * buf2[i];
    }
    long timePassed = testfunTicksToMs(clock() - t);
    std::cout<<"total time used: "<<timePassed<<"ms"<<std::endl;

    //================== plain scalar code ==================
    testfunResetBuffers(buf1, buf2, buf3, dataLen);

    t = clock();
    float *pp1 = buf1;
    float *pp2 = buf2;
    float *pp3 = buf3;
    for (int i = 0; i < dataLen; i++)
    {
        *pp3 = (*pp1) * (*pp2);
        pp1++;
        pp2++;
        pp3++;
    }
    timePassed = testfunTicksToMs(clock() - t);
    std::cout<<"total time used: "<<timePassed<<"ms"<<std::endl;
}
我32G内存, 不过一般时候只有12G可用, 20G放系统盘了, 哎, 以后跑测试代码都要到正常系统里跑了...
还是感到奇怪, mulps 怎么跟 mulss * 4 的运行时间差不多? 我把代码改成下面这样, 希望编译器能优化成 mulps, 结果生成的还是 mulss * 4 (VS2010 和 gcc 4.7.2 都是). 看了下, cl 生成的代码看起来好看些, 但几个版本的运行时间基本都一样, 哎, 搞不懂...
// Shared input (buf1, buf2) and output (buf3) arrays used by testSIMD() and
// test() below. They are not allocated or assigned anywhere in this view.
// NOTE(review): ALIGN is defined outside this view (presumably in
// global_tmpl.h). If it expands to an alignment attribute, it aligns these
// pointer variables themselves, not the memory they point to -- confirm that
// whoever allocates the buffers provides the 16-byte alignment testSIMD()
// relies on.
ALIGN(16) float *buf1, *buf2, *buf3;
void testSIMD()
{
__m128 *p1= (__m128*)buf1 , *p1end = p1 + dataLen / 4;
__m128 *p2= (__m128*)buf2;
__m128 *p3= (__m128*)buf3;
for(; p1 < p1end; p1 += 4 , p2 += 4 , p3 += 4 )
{
p3[0] = _mm_mul_ps(p1[0], p2[0]);
p3[1] = _mm_mul_ps(p1[1], p2[1]);
p3[2] = _mm_mul_ps(p1[2], p2[2]);
p3[3] = _mm_mul_ps(p1[3], p2[3]);
}
}
void test()
{
ALIGN(16) float *pp1 = buf1 , *pp1end = pp1 + dataLen;
ALIGN(16) float *pp2 = buf2;
ALIGN(16) float *pp3 = buf3;
for(; pp1 < pp1end; pp1 += 4 , pp2 += 4 , pp3 += 4)
{
pp3[0] = pp1[0] * pp2[0];
pp3[1] = pp1[1] * pp2[1];
pp3[2] = pp1[2] * pp2[2];
pp3[3] = pp1[3] * pp2[3];
}
}