小彭老师,麻烦使用SSE优化一下下面的代码
以下的calSimilarity函数在算法过程中会执行(-180°~180°步长为0.1)很多次,下面时从工程中摘出来的代码,运行单次时,在编译器开O0优化,耗时为2046.41ms;在编译器开O3优化,耗时为310.06ms
gcc normal_calSimilarity.cpp -O0 -o normal_calSimilarity_gcc -lstdc++ -lm
硬件 CPU: i7-10700 [email protected] x 16**
//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
struct templateFeat
{
int x;
int y;
short dx;
short dy;
float mag;
};
struct matchResult
{
int i;
int j;
int angle;
float score;
};
struct searchFeat
{
short dx;
short dy;
float mag;
};
bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
std::ofstream file(txt_path);
if (file.is_open()) {
for (int i = 0; i < match_result.size(); ++i) {
file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
}
file.close();
std::cout << "Match result datas saved to match_result.txt!" << std::endl;
} else {
std::cout << "Unable to open file" << std::endl;
}
return true;
}
std::vector<matchResult> calSimilarity(const std::vector<templateFeat> template_point,const std::vector<searchFeat> search_point)
{
int template_feat_size = template_point.size();
int search_feat_size = search_point.size();
std::vector<matchResult> results0Deg;
float anMinScore = 0.4 - 1;
float NormMinScore = 0.4 / template_feat_size;
float NormGreediness = ((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size;
for(int i = 0; i < 1920; i++)
{
for(int j = 0; j < 1200; j++)
{
float PartialScore = 0;
float PartialSum = 0;
int SumOfCoords = 0;
for(int m = 0; m < template_feat_size; m++)
{
int curX = i + template_point[m].x;
int curY = j + template_point[m].y;
if(curX < 0 || curY < 0 || curX > 230 || curY > 350)
{
continue;
}
short iTx = template_point[m].dx;
short iTy = template_point[m].dy;
float iTm = template_point[m].mag;
int offSet = curY * 350 + curX;
short iSx = search_point[offSet].dx;
short iSy = search_point[offSet].dy;
float iSm = search_point[offSet].mag;
if((iSx != 0 || iSy != 0) && (iTx != 0 || iTy != 0))
{
PartialSum += ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm);
}
SumOfCoords = m + 1;
PartialScore = PartialSum / SumOfCoords;
if(PartialScore < (std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords)))
{
break;
}
}
if(PartialScore > 0.4)
{
results0Deg.push_back({i, j, 0, PartialScore});
}
}
}
return results0Deg;
}
int main()
{
//为了测试函数而生成的随机数:template_point/search_point
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<int> intXDist(-18, 32);
std::uniform_int_distribution<int> intYDist(-48, 49);
std::uniform_int_distribution<short> shortDXDist(-348, 349);
std::uniform_int_distribution<short> shortDYDist(-421, 352);
std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);
std::vector<templateFeat> template_point(215);
for (auto &feat : template_point)
{
feat.x = intXDist(gen);
feat.y = intYDist(gen);
feat.dx = shortDXDist(gen);
feat.dy = shortDYDist(gen);
feat.mag = floatDist(gen);
}
std::uniform_int_distribution<short> shortDxDist(-460, 460);
std::uniform_int_distribution<short> shortDyDist(-476, 460);
std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);
std::vector<searchFeat> search_point(2304000);
for (auto &feat : search_point)
{
feat.dx = shortDxDist(gen);
feat.dy = shortDyDist(gen);
feat.mag = float_Dist(gen);
}
//运行算法,计算耗时
auto start = std::chrono::high_resolution_clock::now();
std::vector<matchResult> results0Deg = calSimilarity(template_point,search_point);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;
//保存匹配结果
saveDataToTxt("./match_result.txt",results0Deg);
return true;
}
原版 418.72ms
SSE 50.32ms
SSE+OpenMP 21.99ms
g++ normal_calSimilarity.cpp -O3 -fopenmp -mavx2 -o normal_calSimilarity_gcc
#include <iostream>
#include <immintrin.h>
#include <atomic>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#define REP4(x) (x, x, x, x)
struct templateFeat
{
__m128i x;
__m128i y;
__m64 dx;
__m64 dy;
__m128 mag;
};
struct matchResult
{
int i;
int j;
int angle;
float score;
};
struct searchFeat
{
short dx;
short dy;
float mag;
};
bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
std::ofstream file(txt_path);
if (file.is_open()) {
for (int i = 0; i < match_result.size(); ++i) {
file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
}
file.close();
std::cout << "Match result datas saved to " << txt_path << "!" << std::endl;
} else {
std::cout << "Unable to open file" << std::endl;
}
return true;
}
std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,size_t template_feat_size,const std::vector<searchFeat> &search_point,int search_feat_size)
{
static std::vector<matchResult> results0Deg;
std::atomic<size_t> results0DegSize{0};
results0Deg.resize(1920 * 1200);
auto $anMinScore = _mm_set1_ps(0.4f - 1);
auto $NormMinScore = _mm_set1_ps(0.4f / template_feat_size);
auto $NormGreediness = _mm_set1_ps(((1 - 0.8f * 0.4f) / (1 - 0.8f)) / template_feat_size);
#pragma omp parallel for collapse(2)
for(int i = 0; i < 1920; i++)
{
for(int j = 0; j < 1200; j++)
{
auto $PartialSum = _mm_setzero_ps();
auto $i = _mm_set1_epi32(i);
auto $j = _mm_set1_epi32(j);
auto $m1 = _mm_setr_ps(1, 2, 3, 4);
auto $0 = _mm_setzero_si128();
auto $4 = _mm_set1_ps(4);
auto $230 = _mm_set1_epi32(230);
auto $350 = _mm_set1_epi32(350);
for(int m = 0; m < template_feat_size / 4 * 4; m += 4)
{
auto $curX = _mm_add_epi32($i, template_point[m / 4].x);
auto $curY = _mm_add_epi32($j, template_point[m / 4].y);
auto $ok = _mm_or_si128(
_mm_or_si128(_mm_cmplt_epi32($curX, $0), _mm_cmplt_epi32($curY, $0)),
_mm_or_si128(_mm_cmpgt_epi32($curX, $230), _mm_cmpgt_epi32($curY, $350)));
auto $iTx = _mm_cvtpi16_ps(template_point[m / 4].dx);
auto $iTy = _mm_cvtpi16_ps(template_point[m / 4].dy);
auto $iTm = template_point[m / 4].mag;
auto $offSet = _mm_and_si128($curY * $350 + $curX, $ok);
auto $iSxy = _mm_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
// isx 0 isx 0 isx 0 isx 0
// isy 0 isy 0 isy 0 isy 0
auto $iSx = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16($0, $iSxy), 16));
auto $iSy = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16($0, $iSxy), 16));
auto $iSm = _mm_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));
$PartialSum = _mm_add_ps($PartialSum,
_mm_and_ps(_mm_castsi128_ps($ok),
_mm_mul_ps(
_mm_mul_ps($iSm, $iTm),
_mm_add_ps(
_mm_mul_ps($iSx, $iTx),
_mm_mul_ps($iSy, $iTy))
)));
auto $cond = _mm_mul_ps($m1,
_mm_min_ps(
_mm_add_ps($anMinScore, _mm_mul_ps($NormGreediness, $m1)),
_mm_mul_ps($NormMinScore, $m1)));
/* for (int _ = 0; _ < 4; _++) */
/* printf("%f %f %f %f %f %f %f %f\n", $iSx[_], $iSy[_], $iTx[_], $iTy[_], $iSm[_], $PartialSum[_], $m1[_], $cond[_]); */
auto $cmp = _mm_cmplt_ps($PartialSum, $cond);
int cmpmask = _mm_movemask_ps($cmp);
if (cmpmask) [[unlikely]] {
$PartialSum = _mm_setzero_ps();
break;
}
$m1 = _mm_add_ps($m1, $4);
}
$PartialSum = _mm_hadd_ps($PartialSum, $PartialSum);
$PartialSum = _mm_hadd_ps($PartialSum, $PartialSum);
auto PartialScore = _mm_cvtss_f32($PartialSum) / (template_feat_size + 1);
if(PartialScore > 0.4f)
{
results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j, 0, PartialScore};
}
}
}
results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
return results0Deg;
}
int main()
{
//为了测试函数而生成的随机数:template_point/search_point
std::mt19937 gen;
std::uniform_int_distribution<int> intXDist(-18, 32);
std::uniform_int_distribution<int> intYDist(-48, 49);
std::uniform_int_distribution<short> shortDXDist(-348, 349);
std::uniform_int_distribution<short> shortDYDist(-421, 352);
std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);
std::vector<templateFeat> template_point(215 / 4);
for (auto &feat : template_point)
{
feat.x = _mm_setr_epi32 REP4(intXDist(gen));
feat.y = _mm_setr_epi32 REP4(intYDist(gen));
feat.dx = _mm_setr_pi16 REP4(shortDXDist(gen));
feat.dy = _mm_setr_pi16 REP4(shortDYDist(gen));
feat.mag = _mm_setr_ps REP4(floatDist(gen));
}
std::uniform_int_distribution<short> shortDxDist(-460, 460);
std::uniform_int_distribution<short> shortDyDist(-476, 460);
std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);
std::vector<searchFeat> search_point(2304000);
for (auto &feat : search_point)
{
feat.dx = (shortDxDist(gen));
feat.dy = (shortDyDist(gen));
feat.mag = (float_Dist(gen));
}
//运行算法,计算耗时
auto start = std::chrono::high_resolution_clock::now();
std::vector<matchResult> const &results0Deg = calSimilarity(template_point,215,search_point,2304000);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;
//保存匹配结果
saveDataToTxt("/tmp/match_result.txt",results0Deg);
return 0;
}
速度是上去了,但是:将随机数据改成固定数据,运行费SSE与SSE代码,产生的结果不一致,match_result.txt文件内容不一致。
- 问题项
for循环内部计算:非SSE :PartialScore = PartialSum / SumOfCoords; 的SumOfCoords值不应该是 SSE:auto PartialScore = _mm_cvtss_f32($PartialSum) / (template_feat_size + 1); 中的template_feat_size固定值
修复后的SSE版本
#include <iostream>
#include <immintrin.h>
#include <atomic>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#define REP4(x) (x, x, x, x)
struct templateFeat
{
__m128i x;
__m128i y;
__m64 dx;
__m64 dy;
__m128 mag;
};
struct matchResult
{
int i;
int j;
int angle;
float score;
};
struct searchFeat
{
short dx;
short dy;
float mag;
};
bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
std::ofstream file(txt_path);
if (file.is_open()) {
for (int i = 0; i < match_result.size(); ++i) {
file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
}
file.close();
std::cout << "Match result datas saved to " << txt_path << "!" << std::endl;
} else {
std::cout << "Unable to open file" << std::endl;
}
return true;
}
std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,size_t template_feat_size,const std::vector<searchFeat> &search_point,int search_feat_size)
{
static std::vector<matchResult> results0Deg;
std::atomic<size_t> results0DegSize{0};
results0Deg.resize(1920 * 1200);
auto anMinScore = (0.4 - 1);
auto NormMinScore = (0.4 / template_feat_size);
auto NormGreediness = (((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size);
static std::vector<float> thresholds;
thresholds.resize(((template_feat_size + 3) & ~3));
for (int m = 0; m < ((template_feat_size + 3) & ~3); m++) {
auto SumOfCoords = m + 1;
thresholds[m] = SumOfCoords * std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords);
}
/* #pragma omp parallel for collapse(2) */
for(int i = 0; i < 1920; i++)
{
for(int j = 0; j < 1200; j++)
{
float PartialScore = -999999;
auto $PartialScore = _mm_setzero_ps();
auto $PartialSum = _mm_setzero_ps();
auto $SumOfCoords = _mm_setr_ps(-3, -2, -1, 0);
auto $4 = _mm_set1_ps(4);
auto $template_feat_size = _mm_set1_ps(template_feat_size);
auto $i = _mm_set1_epi32(i);
auto $j = _mm_set1_epi32(j);
auto $0 = _mm_setzero_si128();
auto $230 = _mm_set1_epi32(230);
auto $350 = _mm_set1_epi32(350);
for(int m = 0; m < ((template_feat_size + 3) & ~3); m += 4)
{
$SumOfCoords = _mm_add_ps($SumOfCoords, $4);
auto $curX = _mm_add_epi32($i, template_point[m / 4].x);
auto $curY = _mm_add_epi32($j, template_point[m / 4].y);
auto $notok = _mm_or_si128(_mm_or_si128(_mm_castps_si128(_mm_cmpge_ps($SumOfCoords, $template_feat_size)),
_mm_or_si128(_mm_cmplt_epi32($curX, $0), _mm_cmplt_epi32($curY, $0))),
_mm_or_si128(_mm_cmpgt_epi32($curX, $230), _mm_cmpgt_epi32($curY, $350)));
/* if (0xffff == _mm_movemask_epi8($notok)) { */
/* continue; */
/* } */
auto $iTx = _mm_cvtepi16_epi32(_mm_set1_epi64(template_point[m / 4].dx));
auto $iTy = _mm_cvtepi16_epi32(_mm_set1_epi64(template_point[m / 4].dy));
auto $iTm = template_point[m / 4].mag;
auto $offSet = _mm_andnot_si128($notok, _mm_add_epi32(_mm_mullo_epi32($curY, $350), $curX));
auto $iSxy = _mm_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
auto $iSx = _mm_srai_epi32(_mm_slli_epi32($iSxy, 16), 16);
auto $iSy = _mm_srai_epi32($iSxy, 16);
auto $iSm = _mm_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));
auto $accum = _mm_andnot_ps(_mm_castsi128_ps($notok),
_mm_mul_ps(
_mm_mul_ps($iSm, $iTm),
_mm_cvtepi32_ps(
_mm_add_epi32(
_mm_mullo_epi32($iSx, $iTx),
_mm_mullo_epi32($iSy, $iTy)))));
/* auto $cond = _mm_mul_ps($m1, */
/* _mm_min_ps( */
/* _mm_add_ps($anMinScore, _mm_mul_ps($NormGreediness, $m1)), */
/* _mm_mul_ps($NormMinScore, $m1))); */
/* for (int _ = 0; _ < 4; _++) */
/* printf("%d %f %f %f %f %f %f %f %f %f %f %f\n", m + _, $accum[_], _mm_cvtepi32_ps($offSet)[_], _mm_cvtepi32_ps($curX)[_], _mm_cvtepi32_ps($curY)[_], _mm_cvtepi32_ps($iSx)[_], _mm_cvtepi32_ps($iSy)[_], _mm_cvtepi32_ps($iTx)[_], _mm_cvtepi32_ps($iTy)[_], $iSm[_], $iTm[_], _mm_cvtepi32_ps($notok)[_]); */
/* exit(1); */
// 0 1 2 3 (HI)
$accum = _mm_add_ps($accum, _mm_castsi128_ps(_mm_bslli_si128(_mm_castps_si128($accum), 4)));
// 0 1+0 2+1 3+2
$accum = _mm_add_ps($accum, _mm_castsi128_ps(_mm_bslli_si128(_mm_castps_si128($accum), 8)));
// 0 1+0 2+1+0 3+2+1+0
$PartialScore = _mm_add_ps($PartialSum, $accum);
/* auto $cond = _mm_mul_ps($SumOfCoords, _mm_min_ps( */
/* _mm_add_ps(_mm_set1_ps(anMinScore), _mm_mul_ps(_mm_set1_ps(NormGreediness), $SumOfCoords)), */
/* _mm_mul_ps(_mm_set1_ps(NormMinScore), $SumOfCoords))); */
auto $cond = _mm_load_ps(thresholds.data() + m);
auto $cmp = _mm_andnot_ps(_mm_castsi128_ps($notok), _mm_cmplt_ps($PartialScore, $cond));
int cmpmask = _mm_movemask_ps($cmp);
if (cmpmask) {
int bit = __builtin_ctz(cmpmask);
PartialScore = $PartialScore[bit] / (m + bit + 1);
goto out;
}
$PartialSum = _mm_add_ps($PartialSum, _mm_shuffle_ps($accum, $accum, 0xff));
}
$PartialScore = _mm_div_ps($PartialScore, $SumOfCoords);
PartialScore = $PartialScore[(template_feat_size - 1) & 3];
out:
if(PartialScore > 0.4f)
{
results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j, 0, PartialScore};
}
}
}
results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
return results0Deg;
}
int main()
{
//为了测试函数而生成的随机数:template_point/search_point
std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
std::uniform_int_distribution<int> intXDist(-18, 32);
std::uniform_int_distribution<int> intYDist(-48, 49);
std::uniform_int_distribution<short> shortDXDist(-348, 349);
std::uniform_int_distribution<short> shortDYDist(-421, 352);
std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);
std::vector<templateFeat> template_point((215 + 3) / 4);
for (auto &feat : template_point)
{
feat.x = _mm_set_epi32 REP4(intXDist(gen1));
feat.y = _mm_set_epi32 REP4(intYDist(gen2));
feat.dx = _mm_set_pi16 REP4(shortDXDist(gen3));
feat.dy = _mm_set_pi16 REP4(shortDYDist(gen4));
feat.mag = _mm_set_ps REP4(floatDist(gen5));
}
std::uniform_int_distribution<short> shortDxDist(-460, 460);
std::uniform_int_distribution<short> shortDyDist(-476, 460);
std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);
std::vector<searchFeat> search_point(2304000);
for (auto &feat : search_point)
{
feat.dx = (shortDxDist(gen));
/* static int once = printf("%d\n", feat.dx); */
feat.dy = (shortDyDist(gen));
feat.mag = (float_Dist(gen));
}
//运行算法,计算耗时
auto start = std::chrono::high_resolution_clock::now();
std::vector<matchResult> const &results0Deg = calSimilarity(template_point,215,search_point,2304000);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;
//保存匹配结果
saveDataToTxt("/tmp/match_result.txt",results0Deg);
return 0;
}
旧版本
//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
struct templateFeat
{
int x;
int y;
short dx;
short dy;
float mag;
};
struct matchResult
{
int i;
int j;
int angle;
float score;
};
struct searchFeat
{
short dx;
short dy;
float mag;
};
bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
std::ofstream file(txt_path);
if (file.is_open()) {
for (int i = 0; i < match_result.size(); ++i) {
file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
}
file.close();
std::cout << "Match result datas saved to match_result.txt!" << std::endl;
} else {
std::cout << "Unable to open file" << std::endl;
}
return true;
}
std::vector<matchResult> calSimilarity(const std::vector<templateFeat> template_point,const std::vector<searchFeat> search_point)
{
int template_feat_size = template_point.size();
int search_feat_size = search_point.size();
std::vector<matchResult> results0Deg;
float anMinScore = 0.4 - 1;
float NormMinScore = 0.4 / template_feat_size;
float NormGreediness = ((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size;
for(int i = 0; i < 1920; i++)
{
for(int j = 0; j < 1200; j++)
{
float PartialScore = 0;
float PartialSum = 0;
int SumOfCoords = 0;
for(int m = 0; m < template_feat_size; m++)
{
int curX = i + template_point[m].x;
int curY = j + template_point[m].y;
if(curX < 0 || curY < 0 || curX > 230 || curY > 350)
{
continue;
}
int iTx = template_point[m].dx;
int iTy = template_point[m].dy;
float iTm = template_point[m].mag;
int offSet = curY * 350 + curX;
int iSx = search_point[offSet].dx;
int iSy = search_point[offSet].dy;
float iSm = search_point[offSet].mag;
if((iSx != 0 || iSy != 0) && (iTx != 0 || iTy != 0))
{
PartialSum += ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm);
}
SumOfCoords = m + 1;
PartialScore = PartialSum / SumOfCoords;
/* printf("%d %f %d %d %d %d %d %d %d %f %f %f\n", m, ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm), offSet, curX, curY, iSx, iSy, iTx, iTy, iSm, iTm, PartialSum); */
/* exit(1); */
/* */
/* printf("%d %f %f\n", m, PartialSum, ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm)); */
if(PartialScore < (std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords)))
{
break;
}
}
/* printf("%f %d\n", PartialScore, SumOfCoords); */
if(PartialScore > 0.4)
{
/* printf("%d %d %f\n", i, j, PartialScore); */
/* exit(1); */
results0Deg.push_back({i, j, 0, PartialScore});
}
}
}
return results0Deg;
}
int main()
{
//为了测试函数而生成的随机数:template_point/search_point
std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
std::uniform_int_distribution<int> intXDist(-18, 32);
std::uniform_int_distribution<int> intYDist(-48, 49);
std::uniform_int_distribution<short> shortDXDist(-348, 349);
std::uniform_int_distribution<short> shortDYDist(-421, 352);
std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);
std::vector<templateFeat> template_point(215);
for (auto &feat : template_point)
{
feat.x = intXDist(gen1);
feat.y = intYDist(gen2);
feat.dx = shortDXDist(gen3);
feat.dy = shortDYDist(gen4);
feat.mag = floatDist(gen5);
}
std::uniform_int_distribution<short> shortDxDist(-460, 460);
std::uniform_int_distribution<short> shortDyDist(-476, 460);
std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);
std::vector<searchFeat> search_point(2304000);
for (auto &feat : search_point)
{
feat.dx = shortDxDist(gen);
/* static int once = printf("%d\n", feat.dx); */
feat.dy = shortDyDist(gen);
feat.mag = float_Dist(gen);
}
//运行算法,计算耗时
auto start = std::chrono::high_resolution_clock::now();
std::vector<matchResult> results0Deg = calSimilarity(template_point,search_point);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;
//保存匹配结果
saveDataToTxt("./match_result.txt",results0Deg);
return 0;
}
最新版本的测试结果未有加速效果,测试记录:
- 非SSE版本
$ g++ normal_calSimilarity.cpp -O3 -o normal_calSimilarity_gcc -lstdc++ -lm
$ ./normal_calSimilarity_gcc
Function took 311.048 milliseconds.
Match result datas saved to match_result.txt!
耗时:311.048毫秒
- SSE版本
$ g++ normal_calSimilaritySSE.cpp -O3 -mavx2 -o normal_calSimilarity_gccSSE
$ ./normal_calSimilarity_gccSSE
Function took 817.517 milliseconds.
Match result datas saved to /tmp/match_result.txt!
耗时:817.517毫秒
考虑到:
测试数据的不可变性
数据的计算复杂度
数据类型转换
非必要条件判断
我将原先的非SSE代码修改如下:
- 代码
//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
struct templateFeat
{
int x;
int y;
float dx;
float dy;
};
struct matchResult
{
int i;
int j;
float angle;
float score;
};
bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
std::ofstream file(txt_path);
if (file.is_open()) {
for (int i = 0; i < match_result.size(); ++i) {
file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
}
file.close();
std::cout << "Match result datas saved to match_result.txt!" << std::endl;
} else {
std::cout << "Unable to open file" << std::endl;
}
return true;
}
std::vector<matchResult> calSimilarity(const std::vector<float> search_grad_x,const std::vector<float> search_grad_y,const std::vector<templateFeat> template_grad,int width,int height)
{
std::vector<matchResult> results0Deg;
float anMinScore = 0.5 - 1;
float NormMinScore = 0.5 / template_grad.size();
float NormGreediness = ((1 - 0.9 * 0.5) / (1 - 0.9))/template_grad.size();
for (int i = 12; i < 117; i++)
{
for (int j = 13; j < 123; j++)
{
// 去掉条件[1]
std::vector<templateFeat> template_grad_copy = template_grad;
for(auto it = template_grad_copy.begin(); it != template_grad_copy.end();)
{
int curX = i + it->x;
int curY = j + it->y;
if(curX < 0 || curY < 0 || curX > width - 1 || curY > height - 1)
{
it = template_grad_copy.erase(it);
}
else
{
++it;
}
}
float PartialScore = 0;
float PartialSum = 0;
int SumOfCoords = 0;
for (int m = 0; m < template_grad.size(); m++)
{
int curX = i + template_grad[m].x;
int curY = j + template_grad[m].y;
// 条件[1]
// if (curX < 0 || curY < 0 || curX > width - 1 || curY > height - 1) {
// continue;
// }
int offSet = curY * width + curX;
float iTx = template_grad[m].dx;
float iTy = template_grad[m].dy;
float iSx = search_grad_x[offSet];
float iSy = search_grad_y[offSet];
if ((iSx != 0 || iSy != 0) && (iTx != 0 || iTy != 0))
{
PartialSum += (iSx * iTx) + (iSy * iTy);
}
SumOfCoords = m + 1;
PartialScore = PartialSum / SumOfCoords;
if (PartialScore < (std::min(anMinScore + NormGreediness * SumOfCoords,NormMinScore * SumOfCoords)))
break;
}
if(PartialScore > 0.5)
{
results0Deg.push_back({i, j, 0, PartialScore});
}
}
}
return results0Deg;
}
int main()
{
int width = 1273;
int height = 1231;
int buffer_size = width * height;
std::vector<float> search_grad_x(buffer_size);
std::vector<float> search_grad_y(buffer_size);
for (int i = 0; i < buffer_size; ++i) {
search_grad_x[i] = static_cast<float>(i) * 0.5f;
}
for (int i = 0; i < buffer_size; ++i) {
search_grad_y[i] = static_cast<float>(i) * 1.5f;
}
std::vector<templateFeat> template_grad;
for (int i = 0; i < 125; ++i) {
templateFeat feat;
feat.x = i * 2.0;
feat.y = i * 2.0 + 1.0;
feat.dx = static_cast<float>(i) * 0.25f;
feat.dy = static_cast<float>(i) * 1.5f;
template_grad.push_back(feat);
}
//运行算法,计算耗时:std::milli 就是毫秒,std::micro 就是微秒seconds
auto start = std::chrono::high_resolution_clock::now();
std::vector<matchResult> results0Deg = calSimilarity(search_grad_x,search_grad_y,template_grad,width,height);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Function took " << duration.count() << " ms." << std::endl;
//保存匹配结果
saveDataToTxt("./match_result.txt",results0Deg);
return 1;
}
- 运行结果
$ g++ normal_calSimilarity.cpp -O3 -o normal_calSimilarity_gcc -lstdc++ -lm
$ ./normal_calSimilarity_gcc
Function took 10.2154 ms.
Match result datas saved to match_result.txt!
新版本的耗时:10.2154 ms
请在上面新的基准代码上进行SSE修改吧!谢谢~
对比结果文件 match_result.txt 可以使用Meld工具www.meldmerge.org/ 进行对比测试~~~
//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
//#define USE_MULTISCORE//causing wrong result
#define USE_FIRSTUNROLL
#define USE_OPENMP
#define USE_UNROLLM4
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#include <immintrin.h>
#ifdef USE_OPENMP
#include <atomic>
#endif
struct templateFeat
{
int x;
int y;
short dx;
short dy;
float mag;
};
#ifdef USE_MULTISCORE
struct templateFeatSimd
{
__m256i x;
__m256i y;
__m128i dx;
__m128i dy;
__m256 mag;
};
#endif
struct matchResult
{
int i;
int j;
int angle;
float score;
};
struct searchFeat
{
short dx;
short dy;
float mag;
};
bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
std::ofstream file(txt_path);
if (file.is_open()) {
for (int i = 0; i < match_result.size(); ++i) {
file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
}
file.close();
std::cout << "Match result datas saved to match_result.txt!" << std::endl;
} else {
std::cout << "Unable to open file" << std::endl;
}
return true;
}
#ifdef USE_MULTISCORE
static float multiScore(templateFeatSimd const *template_point_simd,
searchFeat const *search_point,
float const *thresholds,
int template_feat_size,
int search_feat_size,
int i, int j, int mbase,
float PartialSumBase) {
auto $PartialScore = _mm256_setzero_ps();
auto $PartialSum = _mm256_set1_ps(PartialSumBase);
auto $SumOfCoords = _mm256_add_ps(_mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8), _mm256_set1_ps(mbase));
auto $8 = _mm256_set1_ps(8);
auto $template_feat_size_plus1 = _mm256_set1_ps(template_feat_size + 1);
auto $i = _mm256_set1_epi32(i);
auto $j = _mm256_set1_epi32(j);
auto $0 = _mm256_setzero_si256();
auto $230 = _mm256_set1_epi32(230);
auto $350 = _mm256_set1_epi32(350);
/* printf("%d\n", mbase); */
for(int m = mbase; m < ((template_feat_size + 7) & ~7); m += 8, $SumOfCoords = _mm256_add_ps($SumOfCoords, $8))
{
auto $curX = _mm256_add_epi32($i, template_point_simd[m / 8].x);
auto $curY = _mm256_add_epi32($j, template_point_simd[m / 8].y);
auto $notok = _mm256_or_si256(_mm256_or_si256(_mm256_castps_si256(_mm256_cmp_ps($SumOfCoords, $template_feat_size_plus1, _CMP_GE_OQ)),
_mm256_or_si256(_mm256_srai_epi32($curX, 31), _mm256_srai_epi32($curY, 31))),
_mm256_or_si256(_mm256_cmpgt_epi32($curX, $230), _mm256_cmpgt_epi32($curY, $350)));
/* if (0xffff == _mm256_movemask_epi8($notok)) { */
/* continue; */
/* } */
auto $iTx = _mm256_cvtepi16_epi32(template_point_simd[m / 8].dx);
auto $iTy = _mm256_cvtepi16_epi32(template_point_simd[m / 8].dy);
auto $iTm = template_point_simd[m / 8].mag;
auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), $curX));
auto $iSxy = _mm256_i32gather_epi32((int *)search_point, $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
auto $iSy = _mm256_srai_epi32($iSxy, 16);
auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point, $offSet, sizeof(searchFeat));
auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
_mm256_mul_ps(
_mm256_mul_ps($iSm, $iTm),
_mm256_cvtepi32_ps(
_mm256_add_epi32(
_mm256_mullo_epi32($iSx, $iTx),
_mm256_mullo_epi32($iSy, $iTy)))));
/* auto $cond = _mm256_mul_ps($m1, */
/* _mm256_min_ps( */
/* _mm256_add_ps($anMinScore, _mm256_mul_ps($NormGreediness, $m1)), */
/* _mm256_mul_ps($NormMinScore, $m1))); */
/* for (int _ = 0; _ < 4; _++) */
/* printf("%d %f %f %f %f %f %f %f %f %f %f %f\n", m + _, $accum[_], _mm256_cvtepi32_ps($offSet)[_], _mm256_cvtepi32_ps($curX)[_], _mm256_cvtepi32_ps($curY)[_], _mm256_cvtepi32_ps($iSx)[_], _mm256_cvtepi32_ps($iSy)[_], _mm256_cvtepi32_ps($iTx)[_], _mm256_cvtepi32_ps($iTy)[_], $iSm[_], $iTm[_], _mm256_cvtepi32_ps($notok)[_]); */
/* exit(1); */
/* for (int _ = 0; _ < 8; _++) */
/* printf("%d %f\n", _, $accum[_]); */
// 0 1 2 3 4 5 6 7
$accum = _mm256_add_ps($accum, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256($accum), 4)));
// 0 1+0 2+1 3+2 4 5+4 6+5 7+6
$accum = _mm256_add_ps($accum, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256($accum), 8)));
// 0 1+0 2+1+0 3+2+1+0 4 5+4 6+5+4 7+6+5+4
$accum = _mm256_add_ps($accum, _mm256_set_m128(_mm_permute_ps(_mm256_castps256_ps128($accum), 0xff), _mm_setzero_ps()));
// 0 1+0 2+1+0 3+2+1+0 3+2+1+0+4 3+2+1+0+5+4 3+2+1+0+6+5+4 3+2+1+0+7+6+5+4
/* for (int _ = 0; _ < 8; _++) */
/* printf("%d %f\n", _, $accum[_]); */
/* exit(1); */
$PartialScore = _mm256_add_ps($PartialSum, $accum);
/* auto $cond = _mm256_mul_ps($SumOfCoords, _mm256_min_ps( */
/* _mm256_add_ps(_mm256_set1_ps(anMinScore), _mm256_mul_ps(_mm256_set1_ps(NormGreediness), $SumOfCoords)), */
/* _mm256_mul_ps(_mm256_set1_ps(NormMinScore), $SumOfCoords))); */
auto $cond = _mm256_load_ps(thresholds + m);
auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialScore, $cond, _CMP_LT_OQ));
int cmpmask = _mm256_movemask_ps($cmp);
if (cmpmask) [[unlikely]] {
int bit = __builtin_ctz(cmpmask);
return $PartialScore[bit] / (m + bit + 1);
}
$PartialSum = _mm256_add_ps($PartialSum, _mm256_broadcastss_ps(_mm256_castps256_ps128(_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(_mm256_permute_ps($accum, 0xff)), 0xf)))));
}
/* for (int _ = 0; _ < 8; _++) */
/* printf("%d %f %f\n", _, $PartialSum[_], $SumOfCoords[_]); */
/* exit(1); */
$PartialScore = _mm256_div_ps($PartialScore, $SumOfCoords);
return $PartialScore[(template_feat_size - 1) & 7];
}
#endif
std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,const std::vector<searchFeat> &search_point)
{
int template_feat_size = template_point.size();
int search_feat_size = search_point.size();
#ifdef USE_MULTISCORE
static std::vector<templateFeatSimd> template_point_simd;
template_point_simd.resize((template_feat_size + 7) & ~7);
for (int m = 0; m < template_point_simd.size() * 8; m += 8) {
template_point_simd[m / 8].x = _mm256_setr_epi32(template_point[(m) % template_feat_size].x, template_point[(m + 1) % template_feat_size].x, template_point[(m + 2) % template_feat_size].x, template_point[(m + 3) % template_feat_size].x, template_point[(m + 4) % template_feat_size].x, template_point[(m + 4 + 1) % template_feat_size].x, template_point[(m + 4 + 2) % template_feat_size].x, template_point[(m + 4 + 3) % template_feat_size].x);
template_point_simd[m / 8].y = _mm256_setr_epi32(template_point[(m) % template_feat_size].y, template_point[(m + 1) % template_feat_size].y, template_point[(m + 2) % template_feat_size].y, template_point[(m + 3) % template_feat_size].y, template_point[(m + 4) % template_feat_size].y, template_point[(m + 4 + 1) % template_feat_size].y, template_point[(m + 4 + 2) % template_feat_size].y, template_point[(m + 4 + 3) % template_feat_size].y);
template_point_simd[m / 8].dx = _mm_setr_epi16(template_point[(m) % template_feat_size].dx, template_point[(m + 1) % template_feat_size].dx, template_point[(m + 2) % template_feat_size].dx, template_point[(m + 3) % template_feat_size].dx, template_point[(m + 4) % template_feat_size].dx, template_point[(m + 4 + 1) % template_feat_size].dx, template_point[(m + 4 + 2) % template_feat_size].dx, template_point[(m + 4 + 3) % template_feat_size].dx);
template_point_simd[m / 8].dy = _mm_setr_epi16(template_point[(m) % template_feat_size].dy, template_point[(m + 1) % template_feat_size].dy, template_point[(m + 2) % template_feat_size].dy, template_point[(m + 3) % template_feat_size].dy, template_point[(m + 4) % template_feat_size].dy, template_point[(m + 4 + 1) % template_feat_size].dy, template_point[(m + 4 + 2) % template_feat_size].dy, template_point[(m + 4 + 3) % template_feat_size].dy);
template_point_simd[m / 8].mag = _mm256_setr_ps(template_point[(m) % template_feat_size].mag, template_point[(m + 1) % template_feat_size].mag, template_point[(m + 2) % template_feat_size].mag, template_point[(m + 3) % template_feat_size].mag, template_point[(m + 4) % template_feat_size].mag, template_point[(m + 4 + 1) % template_feat_size].mag, template_point[(m + 4 + 2) % template_feat_size].mag, template_point[(m + 4 + 3) % template_feat_size].mag);
}
#endif
static std::vector<matchResult> results0Deg;
results0Deg.resize(1920 * 1200);
#ifdef USE_OPENMP
std::atomic<size_t> results0DegSize = 0;
#else
size_t results0DegSize = 0;
#endif
float anMinScore = 0.4 - 1;
float NormMinScore = 0.4 / template_feat_size;
float NormGreediness = ((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size;
static std::vector<float> thresholds;
thresholds.resize(((template_feat_size + 7) & ~7));
for (int m = 0; m < ((template_feat_size + 7) & ~7); m++) {
auto SumOfCoords = m + 1;
thresholds[m] = SumOfCoords * std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords);
}
#ifdef USE_OPENMP
#pragma omp parallel for schedule(dynamic, 8)
#endif
for(int i = 0; i < 1920; i++)
{
for(int j = 0; j < 1200; j += 8)
{
int m;
auto $PartialSum = _mm256_setzero_ps();
#ifdef USE_FIRSTUNROLL
auto $SumOfCoords = _mm256_set1_epi32(1);
#else
auto $SumOfCoords = _mm256_set1_epi32(1);
#endif
auto $j = _mm256_add_epi32(_mm256_set1_epi32(j), _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7));
auto $350 = _mm256_set1_epi32(350);
auto $loopterm = _mm256_setzero_si256();
#ifdef USE_FIRSTUNROLL
{
int curX = i + template_point[0].x;
if (!(curX < 0 || curX > 230)) [[likely]] {
auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[0].y));
auto $notok = _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350));
int iTx = template_point[0].dx;
int iTy = template_point[0].dy;
float iTm = template_point[0].mag;
/* int offSet = curY * 350 + curX; */
auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
auto $iSy = _mm256_srai_epi32($iSxy, 16);
auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));
auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
_mm256_mul_ps(
_mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
_mm256_cvtepi32_ps(
_mm256_add_epi32(
_mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
_mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
$PartialSum = _mm256_add_ps($PartialSum, $accum);
auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[0]), _CMP_LT_OQ));
$loopterm = _mm256_castps_si256($cmp);
$SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)));
}
}
if (_mm256_movemask_epi8($loopterm) != -1) {
int curX = i + template_point[1].x;
if (!(curX < 0 || curX > 230)) [[likely]] {
auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[1].y));
auto $notok = _mm256_or_si256($loopterm, _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350)));
int iTx = template_point[1].dx;
int iTy = template_point[1].dy;
float iTm = template_point[1].mag;
/* int offSet = curY * 350 + curX; */
auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
auto $iSy = _mm256_srai_epi32($iSxy, 16);
auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));
auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
_mm256_mul_ps(
_mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
_mm256_cvtepi32_ps(
_mm256_add_epi32(
_mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
_mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
$PartialSum = _mm256_add_ps($PartialSum, $accum);
auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[1]), _CMP_LT_OQ));
$loopterm = _mm256_or_si256($loopterm, _mm256_castps_si256($cmp));
$SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)));
}
if (_mm256_movemask_epi8($loopterm) != -1) [[unlikely]] {
m = 2;
#else
m = 0;
#endif
#ifdef USE_UNROLLM4
#pragma GCC unroll 4
#endif
for(;
m < template_feat_size;
m++, $SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)))
) {
int curX = i + template_point[m].x;
if (curX < 0 || curX > 230) [[unlikely]] { continue; }
auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[m].y));
auto $notok = _mm256_or_si256($loopterm, _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350)));
int iTx = template_point[m].dx;
int iTy = template_point[m].dy;
float iTm = template_point[m].mag;
/* int offSet = curY * 350 + curX; */
auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
auto $iSy = _mm256_srai_epi32($iSxy, 16);
auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));
auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
_mm256_mul_ps(
_mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
_mm256_cvtepi32_ps(
_mm256_add_epi32(
_mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
_mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
$PartialSum = _mm256_add_ps($PartialSum, $accum);
auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[m]), _CMP_LT_OQ));
$loopterm = _mm256_or_si256($loopterm, _mm256_castps_si256($cmp));
#ifndef USE_MULTISCORE
if (_mm256_movemask_epi8($loopterm) == -1) {
break;
}
#else
auto popcorn = __builtin_popcount(_mm256_movemask_ps(_mm256_castsi256_ps($loopterm)));
if ((popcorn == 8 || popcorn >= 7) && !(m & 7)) [[unlikely]] {
break;
}
#endif
}
#ifdef USE_FIRSTUNROLL
}
}
#endif
auto $PartialScore = _mm256_div_ps($PartialSum, _mm256_cvtepi32_ps($SumOfCoords));
#ifdef USE_MULTISCORE
auto mask = _mm256_movemask_ps(_mm256_castsi256_ps($loopterm));
if (mask != 0xff && m < template_feat_size) {
auto k = __builtin_ctz(~mask);
$PartialScore[k] = multiScore(
template_point_simd.data(), search_point.data(), thresholds.data(),
template_feat_size, search_feat_size, i, j, m, $PartialSum[k]);
}
#endif
auto $PartialScoreAbove = _mm256_cmp_ps($PartialScore, _mm256_set1_ps(0.4f), _CMP_GT_OQ);
for (int k = 0; k < 8; k++) {
if($PartialScoreAbove[k]) {
/* printf("%d %d %f\n", i, j, PartialScore); */
/* exit(1); */
#ifdef USE_OPENMP
results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j + k, 0, $PartialScore[k]};
#else
results0Deg[results0DegSize++] = {i, j + k, 0, $PartialScore[k]};
#endif
}
}
}
}
#ifdef USE_OPENMP
results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
#else
results0Deg.resize(results0DegSize);
#endif
return results0Deg;
}
int main()
{
//为了测试函数而生成的随机数:template_point/search_point
std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
std::uniform_int_distribution<int> intXDist(-18, 32);
std::uniform_int_distribution<int> intYDist(-48, 49);
std::uniform_int_distribution<short> shortDXDist(-348, 349);
std::uniform_int_distribution<short> shortDYDist(-421, 352);
std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);
std::vector<templateFeat> template_point(215);
for (auto &feat : template_point)
{
feat.x = intXDist(gen1);
feat.y = intYDist(gen2);
feat.dx = shortDXDist(gen3);
feat.dy = shortDYDist(gen4);
feat.mag = floatDist(gen5);
}
std::uniform_int_distribution<short> shortDxDist(-460, 460);
std::uniform_int_distribution<short> shortDyDist(-476, 460);
std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);
std::vector<searchFeat> search_point(2304000);
for (auto &feat : search_point)
{
feat.dx = shortDxDist(gen);
/* static int once = printf("%d\n", feat.dx); */
feat.dy = shortDyDist(gen);
feat.mag = float_Dist(gen);
}
//运行算法,计算耗时
auto start = std::chrono::high_resolution_clock::now();
/* for (int i = 0; i < 100; i++) */
/* calSimilarity(template_point,search_point); */
std::vector<matchResult> const &results0Deg = calSimilarity(template_point,search_point);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;
//保存匹配结果
saveDataToTxt("/tmp/match_result.txt",results0Deg);
return 0;
}
修复了性能提升不大的问题,现在是418ms -> 21ms了。