小彭老师，麻烦使用SSE优化一下下面的代码

Open zhangnatha opened this issue 2 years ago • 5 comments

以下的calSimilarity函数在算法过程中会执行（-180°~180°步长为0.1）很多次，下面时从工程中摘出来的代码，运行单次时，在编译器开O0优化，耗时为2046.41ms；在编译器开O3优化，耗时为310.06ms

gcc normal_calSimilarity.cpp -O0 -o normal_calSimilarity_gcc -lstdc++ -lm

硬件 CPU: i7-10700 [email protected] x 16**

//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>

struct templateFeat
{
	int x;
	int y;
	short dx;
	short dy;
	float mag;
};

struct matchResult
{
	int i;
	int j;
	int angle;
	float score;
};

struct searchFeat
{
	short dx;
	short dy;
	float mag;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to match_result.txt!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

std::vector<matchResult> calSimilarity(const std::vector<templateFeat> template_point,const std::vector<searchFeat> search_point)
{
    int template_feat_size =  template_point.size();
    int search_feat_size =  search_point.size();

    std::vector<matchResult> results0Deg;
    float anMinScore     = 0.4 - 1;
    float NormMinScore   = 0.4 / template_feat_size;
    float NormGreediness = ((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size;
    
    for(int i = 0; i < 1920; i++)
    {
        for(int j = 0; j < 1200; j++)
        {
            float PartialScore = 0;
            float PartialSum   = 0;
            int   SumOfCoords  = 0;

            for(int m = 0; m < template_feat_size; m++)
            {
                int curX = i + template_point[m].x;
                int curY = j + template_point[m].y;
				
                if(curX < 0 || curY < 0 || curX > 230 || curY > 350)
                {
                    continue;
                }

                short iTx = template_point[m].dx;
                short iTy = template_point[m].dy;
                float iTm = template_point[m].mag;

                int offSet = curY * 350 + curX;
                short iSx        = search_point[offSet].dx;
                short iSy        = search_point[offSet].dy; 
                float iSm        = search_point[offSet].mag;

                if((iSx != 0 || iSy != 0) && (iTx != 0 || iTy != 0))
                {
                    PartialSum += ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm);
                }
                SumOfCoords  = m + 1;
                PartialScore = PartialSum / SumOfCoords;

                if(PartialScore < (std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords)))
                {
                    break;
                }
            }

            if(PartialScore > 0.4)
            {
                results0Deg.push_back({i, j, 0, PartialScore});
            } 
        }
    }

    return results0Deg;
}

int main()
{
    //为了测试函数而生成的随机数：template_point/search_point
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<int> intXDist(-18, 32);
    std::uniform_int_distribution<int> intYDist(-48, 49);
    std::uniform_int_distribution<short> shortDXDist(-348, 349);
    std::uniform_int_distribution<short> shortDYDist(-421, 352);
    std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);

    std::vector<templateFeat> template_point(215);
    for (auto &feat : template_point)
    {
        feat.x = intXDist(gen);
        feat.y = intYDist(gen);
        feat.dx = shortDXDist(gen);
        feat.dy = shortDYDist(gen);
        feat.mag = floatDist(gen);
    }

    std::uniform_int_distribution<short> shortDxDist(-460, 460);
    std::uniform_int_distribution<short> shortDyDist(-476, 460);
    std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);

    std::vector<searchFeat> search_point(2304000);
    for (auto &feat : search_point)
    {
        feat.dx = shortDxDist(gen);
        feat.dy = shortDyDist(gen);
        feat.mag = float_Dist(gen);
    }

    //运行算法,计算耗时
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<matchResult> results0Deg = calSimilarity(template_point,search_point);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;

    //保存匹配结果
    saveDataToTxt("./match_result.txt",results0Deg);

    return true;
}

Aug 30 '23 09:08 zhangnatha

原版 418.72ms SSE 50.32ms SSE+OpenMP 21.99ms g++ normal_calSimilarity.cpp -O3 -fopenmp -mavx2 -o normal_calSimilarity_gcc

#include <iostream>
#include <immintrin.h>
#include <atomic>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#define REP4(x) (x, x, x, x)

struct templateFeat
{
	__m128i x;
	__m128i y;
	__m64 dx;
	__m64 dy;
	__m128 mag;
};

struct matchResult
{
	int i;
	int j;
	int angle;
	float score;
};

struct searchFeat
{
	short dx;
	short dy;
	float mag;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to " << txt_path << "!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,size_t template_feat_size,const std::vector<searchFeat> &search_point,int search_feat_size)
{
    static std::vector<matchResult> results0Deg;
    std::atomic<size_t> results0DegSize{0};
    results0Deg.resize(1920 * 1200);
    auto $anMinScore     = _mm_set1_ps(0.4f - 1);
    auto $NormMinScore   = _mm_set1_ps(0.4f / template_feat_size);
    auto $NormGreediness = _mm_set1_ps(((1 - 0.8f * 0.4f) / (1 - 0.8f)) / template_feat_size);
    
    #pragma omp parallel for collapse(2)
    for(int i = 0; i < 1920; i++)
    {
        for(int j = 0; j < 1200; j++)
        {
            auto $PartialSum = _mm_setzero_ps();

            auto $i = _mm_set1_epi32(i);
            auto $j = _mm_set1_epi32(j);
            auto $m1 = _mm_setr_ps(1, 2, 3, 4);
            auto $0 = _mm_setzero_si128();
            auto $4 = _mm_set1_ps(4);
            auto $230 = _mm_set1_epi32(230);
            auto $350 = _mm_set1_epi32(350);

            for(int m = 0; m < template_feat_size / 4 * 4; m += 4)
            {
                auto $curX = _mm_add_epi32($i, template_point[m / 4].x);
                auto $curY = _mm_add_epi32($j, template_point[m / 4].y);

                auto $ok = _mm_or_si128(
                    _mm_or_si128(_mm_cmplt_epi32($curX, $0), _mm_cmplt_epi32($curY, $0)),
                    _mm_or_si128(_mm_cmpgt_epi32($curX, $230), _mm_cmpgt_epi32($curY, $350)));

                auto $iTx = _mm_cvtpi16_ps(template_point[m / 4].dx);
                auto $iTy = _mm_cvtpi16_ps(template_point[m / 4].dy);
                auto $iTm = template_point[m / 4].mag;

                auto $offSet = _mm_and_si128($curY * $350 + $curX, $ok);
                auto $iSxy = _mm_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
                // isx isy isx isy isx isy isx isy
                // isx 0 isx 0 isx 0 isx 0
                // isy 0 isy 0 isy 0 isy 0
                auto $iSx = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16($0, $iSxy), 16));
                auto $iSy = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16($0, $iSxy), 16));
                auto $iSm = _mm_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));

                $PartialSum = _mm_add_ps($PartialSum,
                                         _mm_and_ps(_mm_castsi128_ps($ok),
                                                    _mm_mul_ps(
                                                    _mm_mul_ps($iSm, $iTm),
                                                    _mm_add_ps(
                                                    _mm_mul_ps($iSx, $iTx),
                                                    _mm_mul_ps($iSy, $iTy))
                                                    )));
                auto $cond = _mm_mul_ps($m1,
                                        _mm_min_ps(
                                        _mm_add_ps($anMinScore, _mm_mul_ps($NormGreediness, $m1)),
                                        _mm_mul_ps($NormMinScore, $m1)));
                /* for (int _ = 0; _ < 4; _++) */
                /*      printf("%f %f %f %f %f %f %f %f\n", $iSx[_], $iSy[_], $iTx[_], $iTy[_], $iSm[_], $PartialSum[_], $m1[_], $cond[_]); */

                auto $cmp = _mm_cmplt_ps($PartialSum, $cond);
                int cmpmask = _mm_movemask_ps($cmp);
                if (cmpmask) [[unlikely]] {
                    $PartialSum = _mm_setzero_ps();
                    break;
                }
                $m1 = _mm_add_ps($m1, $4);
            }
            $PartialSum = _mm_hadd_ps($PartialSum, $PartialSum);
            $PartialSum = _mm_hadd_ps($PartialSum, $PartialSum);
            auto PartialScore = _mm_cvtss_f32($PartialSum) / (template_feat_size + 1);
            if(PartialScore > 0.4f)
            {
                results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j, 0, PartialScore};
            }
        }
    }

    results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
    return results0Deg;
}

int main()
{
    //为了测试函数而生成的随机数：template_point/search_point
    std::mt19937 gen;
    std::uniform_int_distribution<int> intXDist(-18, 32);
    std::uniform_int_distribution<int> intYDist(-48, 49);
    std::uniform_int_distribution<short> shortDXDist(-348, 349);
    std::uniform_int_distribution<short> shortDYDist(-421, 352);
    std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);

    std::vector<templateFeat> template_point(215 / 4);
    for (auto &feat : template_point)
    {
        feat.x = _mm_setr_epi32 REP4(intXDist(gen));
        feat.y = _mm_setr_epi32 REP4(intYDist(gen));
        feat.dx = _mm_setr_pi16 REP4(shortDXDist(gen));
        feat.dy = _mm_setr_pi16 REP4(shortDYDist(gen));
        feat.mag = _mm_setr_ps REP4(floatDist(gen));
    }

    std::uniform_int_distribution<short> shortDxDist(-460, 460);
    std::uniform_int_distribution<short> shortDyDist(-476, 460);
    std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);

    std::vector<searchFeat> search_point(2304000);
    for (auto &feat : search_point)
    {
        feat.dx = (shortDxDist(gen));
        feat.dy = (shortDyDist(gen));
        feat.mag = (float_Dist(gen));
    }

    //运行算法,计算耗时
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<matchResult> const &results0Deg = calSimilarity(template_point,215,search_point,2304000);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;

    //保存匹配结果
    saveDataToTxt("/tmp/match_result.txt",results0Deg);

    return 0;
}

Aug 30 '23 21:08 archibate

速度是上去了，但是：将随机数据改成固定数据，运行费SSE与SSE代码，产生的结果不一致，match_result.txt文件内容不一致。

问题项

for循环内部计算：非SSE ：PartialScore = PartialSum / SumOfCoords; 的SumOfCoords值不应该是 SSE：auto PartialScore = _mm_cvtss_f32($PartialSum) / (template_feat_size + 1); 中的template_feat_size固定值

Aug 31 '23 07:08 zhangnatha

修复后的SSE版本

#include <iostream>
#include <immintrin.h>
#include <atomic>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#define REP4(x) (x, x, x, x)

struct templateFeat
{
	__m128i x;
	__m128i y;
	__m64 dx;
	__m64 dy;
	__m128 mag;
};

struct matchResult
{
	int i;
	int j;
	int angle;
	float score;
};

struct searchFeat
{
	short dx;
	short dy;
	float mag;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to " << txt_path << "!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,size_t template_feat_size,const std::vector<searchFeat> &search_point,int search_feat_size)
{
    static std::vector<matchResult> results0Deg;
    std::atomic<size_t> results0DegSize{0};
    results0Deg.resize(1920 * 1200);
    auto anMinScore     = (0.4 - 1);
    auto NormMinScore   = (0.4 / template_feat_size);
    auto NormGreediness = (((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size);

    static std::vector<float> thresholds;
    thresholds.resize(((template_feat_size + 3) & ~3));
    for (int m = 0; m < ((template_feat_size + 3) & ~3); m++) {
        auto SumOfCoords = m + 1;
        thresholds[m] = SumOfCoords * std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords);
    }
    
    /* #pragma omp parallel for collapse(2) */
    for(int i = 0; i < 1920; i++)
    {
        for(int j = 0; j < 1200; j++)
        {
            float PartialScore = -999999;
            auto $PartialScore = _mm_setzero_ps();
            auto $PartialSum = _mm_setzero_ps();
            auto $SumOfCoords = _mm_setr_ps(-3, -2, -1, 0);
            auto $4 = _mm_set1_ps(4);
            auto $template_feat_size = _mm_set1_ps(template_feat_size);
            auto $i = _mm_set1_epi32(i);
            auto $j = _mm_set1_epi32(j);
            auto $0 = _mm_setzero_si128();
            auto $230 = _mm_set1_epi32(230);
            auto $350 = _mm_set1_epi32(350);

            for(int m = 0; m < ((template_feat_size + 3) & ~3); m += 4)
            {
                $SumOfCoords = _mm_add_ps($SumOfCoords, $4);

                auto $curX = _mm_add_epi32($i, template_point[m / 4].x);
                auto $curY = _mm_add_epi32($j, template_point[m / 4].y);

                auto $notok = _mm_or_si128(_mm_or_si128(_mm_castps_si128(_mm_cmpge_ps($SumOfCoords, $template_feat_size)),
                    _mm_or_si128(_mm_cmplt_epi32($curX, $0), _mm_cmplt_epi32($curY, $0))),
                    _mm_or_si128(_mm_cmpgt_epi32($curX, $230), _mm_cmpgt_epi32($curY, $350)));
                /* if (0xffff == _mm_movemask_epi8($notok)) { */
                /*     continue; */
                /* } */

                auto $iTx = _mm_cvtepi16_epi32(_mm_set1_epi64(template_point[m / 4].dx));
                auto $iTy = _mm_cvtepi16_epi32(_mm_set1_epi64(template_point[m / 4].dy));
                auto $iTm = template_point[m / 4].mag;

                auto $offSet = _mm_andnot_si128($notok, _mm_add_epi32(_mm_mullo_epi32($curY, $350), $curX));
                auto $iSxy = _mm_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
                // isx isy isx isy isx isy isx isy
                auto $iSx = _mm_srai_epi32(_mm_slli_epi32($iSxy, 16), 16);
                auto $iSy = _mm_srai_epi32($iSxy, 16);
                auto $iSm = _mm_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));

                auto $accum = _mm_andnot_ps(_mm_castsi128_ps($notok),
                                            _mm_mul_ps(
                                            _mm_mul_ps($iSm, $iTm),
                                            _mm_cvtepi32_ps(
                                            _mm_add_epi32(
                                            _mm_mullo_epi32($iSx, $iTx),
                                            _mm_mullo_epi32($iSy, $iTy)))));
                /* auto $cond = _mm_mul_ps($m1, */
                /*                         _mm_min_ps( */
                /*                         _mm_add_ps($anMinScore, _mm_mul_ps($NormGreediness, $m1)), */
                /*                         _mm_mul_ps($NormMinScore, $m1))); */
                /* for (int _ = 0; _ < 4; _++) */
                /*      printf("%d %f %f %f %f %f %f %f %f %f %f %f\n", m + _, $accum[_], _mm_cvtepi32_ps($offSet)[_], _mm_cvtepi32_ps($curX)[_], _mm_cvtepi32_ps($curY)[_], _mm_cvtepi32_ps($iSx)[_], _mm_cvtepi32_ps($iSy)[_], _mm_cvtepi32_ps($iTx)[_], _mm_cvtepi32_ps($iTy)[_], $iSm[_], $iTm[_], _mm_cvtepi32_ps($notok)[_]); */
                /* exit(1); */

                // 0 1 2 3 (HI)
                $accum = _mm_add_ps($accum, _mm_castsi128_ps(_mm_bslli_si128(_mm_castps_si128($accum), 4)));
                // 0 1+0 2+1 3+2
                $accum = _mm_add_ps($accum, _mm_castsi128_ps(_mm_bslli_si128(_mm_castps_si128($accum), 8)));
                // 0 1+0 2+1+0 3+2+1+0

                $PartialScore = _mm_add_ps($PartialSum, $accum);
                /* auto $cond = _mm_mul_ps($SumOfCoords, _mm_min_ps( */
                /*     _mm_add_ps(_mm_set1_ps(anMinScore), _mm_mul_ps(_mm_set1_ps(NormGreediness), $SumOfCoords)), */
                /*     _mm_mul_ps(_mm_set1_ps(NormMinScore), $SumOfCoords))); */
                auto $cond = _mm_load_ps(thresholds.data() + m);
                auto $cmp = _mm_andnot_ps(_mm_castsi128_ps($notok), _mm_cmplt_ps($PartialScore, $cond));
                int cmpmask = _mm_movemask_ps($cmp);
                if (cmpmask) {
                    int bit = __builtin_ctz(cmpmask);
                    PartialScore = $PartialScore[bit] / (m + bit + 1);
                    goto out;
                }
                $PartialSum = _mm_add_ps($PartialSum, _mm_shuffle_ps($accum, $accum, 0xff));
            }
            $PartialScore = _mm_div_ps($PartialScore, $SumOfCoords);
            PartialScore = $PartialScore[(template_feat_size - 1) & 3];
        out:
            if(PartialScore > 0.4f)
            {
                results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j, 0, PartialScore};
            }
        }
    }

    results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
    return results0Deg;
}

int main()
{
    //为了测试函数而生成的随机数：template_point/search_point
    std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
    std::uniform_int_distribution<int> intXDist(-18, 32);
    std::uniform_int_distribution<int> intYDist(-48, 49);
    std::uniform_int_distribution<short> shortDXDist(-348, 349);
    std::uniform_int_distribution<short> shortDYDist(-421, 352);
    std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);

    std::vector<templateFeat> template_point((215 + 3) / 4);
    for (auto &feat : template_point)
    {
        feat.x = _mm_set_epi32 REP4(intXDist(gen1));
        feat.y = _mm_set_epi32 REP4(intYDist(gen2));
        feat.dx = _mm_set_pi16 REP4(shortDXDist(gen3));
        feat.dy = _mm_set_pi16 REP4(shortDYDist(gen4));
        feat.mag = _mm_set_ps REP4(floatDist(gen5));
    }

    std::uniform_int_distribution<short> shortDxDist(-460, 460);
    std::uniform_int_distribution<short> shortDyDist(-476, 460);
    std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);

    std::vector<searchFeat> search_point(2304000);
    for (auto &feat : search_point)
    {
        feat.dx = (shortDxDist(gen));
        /* static int once = printf("%d\n", feat.dx); */
        feat.dy = (shortDyDist(gen));
        feat.mag = (float_Dist(gen));
    }

    //运行算法,计算耗时
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<matchResult> const &results0Deg = calSimilarity(template_point,215,search_point,2304000);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;

    //保存匹配结果
    saveDataToTxt("/tmp/match_result.txt",results0Deg);

    return 0;
}

旧版本

//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>

struct templateFeat
{
	int x;
	int y;
	short dx;
	short dy;
	float mag;
};

struct matchResult
{
	int i;
	int j;
	int angle;
	float score;
};

struct searchFeat
{
	short dx;
	short dy;
	float mag;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to match_result.txt!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

std::vector<matchResult> calSimilarity(const std::vector<templateFeat> template_point,const std::vector<searchFeat> search_point)
{
    int template_feat_size =  template_point.size();
    int search_feat_size =  search_point.size();

    std::vector<matchResult> results0Deg;
    float anMinScore     = 0.4 - 1;
    float NormMinScore   = 0.4 / template_feat_size;
    float NormGreediness = ((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size;
    
    for(int i = 0; i < 1920; i++)
    {
        for(int j = 0; j < 1200; j++)
        {
            float PartialScore = 0;
            float PartialSum   = 0;
            int   SumOfCoords  = 0;

            for(int m = 0; m < template_feat_size; m++)
            {
                int curX = i + template_point[m].x;
                int curY = j + template_point[m].y;
				
                if(curX < 0 || curY < 0 || curX > 230 || curY > 350)
                {
                    continue;
                }

                int iTx = template_point[m].dx;
                int iTy = template_point[m].dy;
                float iTm = template_point[m].mag;

                int offSet = curY * 350 + curX;
                int iSx        = search_point[offSet].dx;
                int iSy        = search_point[offSet].dy; 
                float iSm        = search_point[offSet].mag;

                if((iSx != 0 || iSy != 0) && (iTx != 0 || iTy != 0))
                {
                    PartialSum += ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm);
                }
                SumOfCoords  = m + 1;
                PartialScore = PartialSum / SumOfCoords;

                /* printf("%d %f %d %d %d %d %d %d %d %f %f %f\n", m, ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm), offSet, curX, curY, iSx, iSy, iTx, iTy, iSm, iTm, PartialSum); */
                /* exit(1); */
                /*  */
                /* printf("%d %f %f\n", m, PartialSum, ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm)); */

                if(PartialScore < (std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords)))
                {
                    break;
                }
            }

            /* printf("%f %d\n", PartialScore, SumOfCoords); */
            if(PartialScore > 0.4)
            {
            /* printf("%d %d %f\n", i, j, PartialScore); */
            /*     exit(1); */
                results0Deg.push_back({i, j, 0, PartialScore});
            } 
        }
    }

    return results0Deg;
}

int main()
{
    //为了测试函数而生成的随机数：template_point/search_point
    std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
    std::uniform_int_distribution<int> intXDist(-18, 32);
    std::uniform_int_distribution<int> intYDist(-48, 49);
    std::uniform_int_distribution<short> shortDXDist(-348, 349);
    std::uniform_int_distribution<short> shortDYDist(-421, 352);
    std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);

    std::vector<templateFeat> template_point(215);
    for (auto &feat : template_point)
    {
        feat.x = intXDist(gen1);
        feat.y = intYDist(gen2);
        feat.dx = shortDXDist(gen3);
        feat.dy = shortDYDist(gen4);
        feat.mag = floatDist(gen5);
    }

    std::uniform_int_distribution<short> shortDxDist(-460, 460);
    std::uniform_int_distribution<short> shortDyDist(-476, 460);
    std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);

    std::vector<searchFeat> search_point(2304000);
    for (auto &feat : search_point)
    {
        feat.dx = shortDxDist(gen);
        /* static int once = printf("%d\n", feat.dx); */
        feat.dy = shortDyDist(gen);
        feat.mag = float_Dist(gen);
    }

    //运行算法,计算耗时
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<matchResult> results0Deg = calSimilarity(template_point,search_point);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;

    //保存匹配结果
    saveDataToTxt("./match_result.txt",results0Deg);

    return 0;
}

Aug 31 '23 15:08 archibate

最新版本的测试结果未有加速效果，测试记录：

非SSE版本

$ g++ normal_calSimilarity.cpp -O3 -o normal_calSimilarity_gcc -lstdc++ -lm
$ ./normal_calSimilarity_gcc
Function took 311.048 milliseconds.
Match result datas saved to match_result.txt!

耗时：311.048毫秒

SSE版本

$ g++ normal_calSimilaritySSE.cpp -O3 -mavx2 -o normal_calSimilarity_gccSSE
$ ./normal_calSimilarity_gccSSE 
Function took 817.517 milliseconds.
Match result datas saved to /tmp/match_result.txt!

耗时：817.517毫秒

考虑到：

测试数据的不可变性

数据的计算复杂度

数据类型转换

非必要条件判断

我将原先的非SSE代码修改如下：

代码

//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>

struct templateFeat
{
	int x;
	int y;
	float dx;
	float dy;
};

struct matchResult
{
	int i;
	int j;
	float angle;
	float score;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to match_result.txt!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

std::vector<matchResult> calSimilarity(const std::vector<float> search_grad_x,const std::vector<float> search_grad_y,const std::vector<templateFeat> template_grad,int width,int height)
{
	std::vector<matchResult> results0Deg;

	float anMinScore = 0.5 - 1;
	float NormMinScore = 0.5 / template_grad.size();
	float NormGreediness = ((1 - 0.9 * 0.5) / (1 - 0.9))/template_grad.size();

	for (int i = 12; i < 117; i++) 
	{
	  for (int j = 13; j < 123; j++) 
	  {
		// 去掉条件[1]
		std::vector<templateFeat> template_grad_copy = template_grad;
		for(auto it = template_grad_copy.begin(); it != template_grad_copy.end();)
		{
			int curX = i + it->x;
			int curY = j + it->y;
			if(curX < 0 || curY < 0 || curX > width - 1 || curY > height - 1)
			{
				it = template_grad_copy.erase(it);
			}
			else
			{
				++it;
			}
		}

		float PartialScore = 0;
		float PartialSum = 0;
		int SumOfCoords = 0;

		for (int m = 0; m < template_grad.size(); m++) 
		{
		  int curX = i + template_grad[m].x;
		  int curY = j + template_grad[m].y;

		//   条件[1]
		//   if (curX < 0 || curY < 0 || curX > width - 1 || curY > height - 1) {
		// 	continue; 
		//   }

		  int offSet = curY * width + curX;
		  float iTx = template_grad[m].dx;
		  float iTy = template_grad[m].dy;
		  
		  float iSx = search_grad_x[offSet];
		  float iSy = search_grad_y[offSet];

		  if ((iSx != 0 || iSy != 0) && (iTx != 0 || iTy != 0)) 
		  {
			PartialSum += (iSx * iTx) + (iSy * iTy);
		  }
		  SumOfCoords = m + 1;
		  PartialScore = PartialSum / SumOfCoords;

		  if (PartialScore < (std::min(anMinScore + NormGreediness * SumOfCoords,NormMinScore * SumOfCoords)))
			break;
		}
		if(PartialScore > 0.5)
		{
			results0Deg.push_back({i, j, 0, PartialScore});
		} 
	  }
	}
	return results0Deg;
}

int main()
{	
    int width = 1273;
    int height = 1231;
    int buffer_size = width * height;
	
    std::vector<float> search_grad_x(buffer_size);
    std::vector<float> search_grad_y(buffer_size);

    for (int i = 0; i < buffer_size; ++i) {
        search_grad_x[i] = static_cast<float>(i) * 0.5f;
    }

    for (int i = 0; i < buffer_size; ++i) {
        search_grad_y[i] = static_cast<float>(i) * 1.5f;
    }
    
    std::vector<templateFeat> template_grad;
    for (int i = 0; i < 125; ++i) {
        templateFeat feat;
        feat.x = i * 2.0;
        feat.y = i * 2.0 + 1.0;
		feat.dx = static_cast<float>(i) * 0.25f;
		feat.dy = static_cast<float>(i) * 1.5f;
        template_grad.push_back(feat);
    }
	
    //运行算法,计算耗时:std::milli 就是毫秒，std::micro 就是微秒seconds
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<matchResult> results0Deg = calSimilarity(search_grad_x,search_grad_y,template_grad,width,height);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " ms." << std::endl;
	
	//保存匹配结果
    saveDataToTxt("./match_result.txt",results0Deg);
	
    return 1;
}

运行结果

$ g++ normal_calSimilarity.cpp -O3 -o normal_calSimilarity_gcc -lstdc++ -lm
$ ./normal_calSimilarity_gcc
Function took 10.2154 ms.
Match result datas saved to match_result.txt!

新版本的耗时：10.2154 ms

请在上面新的基准代码上进行SSE修改吧！谢谢～

对比结果文件 match_result.txt 可以使用Meld工具www.meldmerge.org/ 进行对比测试～～～

Sep 01 '23 02:09 zhangnatha

//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
//#define USE_MULTISCORE//causing wrong result
#define USE_FIRSTUNROLL
#define USE_OPENMP
#define USE_UNROLLM4
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#include <immintrin.h>
#ifdef USE_OPENMP
#include <atomic>
#endif

struct templateFeat
{
	int x;
	int y;
	short dx;
	short dy;
	float mag;
};

#ifdef USE_MULTISCORE
struct templateFeatSimd
{
	__m256i x;
	__m256i y;
	__m128i dx;
	__m128i dy;
	__m256 mag;
};
#endif

struct matchResult
{
	int i;
	int j;
	int angle;
	float score;
};

struct searchFeat
{
	short dx;
	short dy;
	float mag;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to match_result.txt!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

#ifdef USE_MULTISCORE
static float multiScore(templateFeatSimd const *template_point_simd,
                        searchFeat const *search_point,
                        float const *thresholds,
                        int template_feat_size,
                        int search_feat_size,
                        int i, int j, int mbase,
                        float PartialSumBase) {
    auto $PartialScore = _mm256_setzero_ps();
    auto $PartialSum = _mm256_set1_ps(PartialSumBase);
    auto $SumOfCoords = _mm256_add_ps(_mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8), _mm256_set1_ps(mbase));
    auto $8 = _mm256_set1_ps(8);
    auto $template_feat_size_plus1 = _mm256_set1_ps(template_feat_size + 1);
    auto $i = _mm256_set1_epi32(i);
    auto $j = _mm256_set1_epi32(j);
    auto $0 = _mm256_setzero_si256();
    auto $230 = _mm256_set1_epi32(230);
    auto $350 = _mm256_set1_epi32(350);

    /* printf("%d\n", mbase); */
    for(int m = mbase; m < ((template_feat_size + 7) & ~7); m += 8, $SumOfCoords = _mm256_add_ps($SumOfCoords, $8))
    {
        auto $curX = _mm256_add_epi32($i, template_point_simd[m / 8].x);
        auto $curY = _mm256_add_epi32($j, template_point_simd[m / 8].y);

        auto $notok = _mm256_or_si256(_mm256_or_si256(_mm256_castps_si256(_mm256_cmp_ps($SumOfCoords, $template_feat_size_plus1, _CMP_GE_OQ)),
                                                _mm256_or_si256(_mm256_srai_epi32($curX, 31), _mm256_srai_epi32($curY, 31))),
                                   _mm256_or_si256(_mm256_cmpgt_epi32($curX, $230), _mm256_cmpgt_epi32($curY, $350)));
        /* if (0xffff == _mm256_movemask_epi8($notok)) { */
        /*     continue; */
        /* } */

        auto $iTx = _mm256_cvtepi16_epi32(template_point_simd[m / 8].dx);
        auto $iTy = _mm256_cvtepi16_epi32(template_point_simd[m / 8].dy);
        auto $iTm = template_point_simd[m / 8].mag;

        auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), $curX));
        auto $iSxy = _mm256_i32gather_epi32((int *)search_point, $offSet, sizeof(searchFeat));
        // isx isy isx isy isx isy isx isy
        auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
        auto $iSy = _mm256_srai_epi32($iSxy, 16);
        auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point, $offSet, sizeof(searchFeat));

        auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
                                    _mm256_mul_ps(
                                    _mm256_mul_ps($iSm, $iTm),
                                    _mm256_cvtepi32_ps(
                                    _mm256_add_epi32(
                                    _mm256_mullo_epi32($iSx, $iTx),
                                    _mm256_mullo_epi32($iSy, $iTy)))));
        /* auto $cond = _mm256_mul_ps($m1, */
        /*                         _mm256_min_ps( */
        /*                         _mm256_add_ps($anMinScore, _mm256_mul_ps($NormGreediness, $m1)), */
        /*                         _mm256_mul_ps($NormMinScore, $m1))); */
        /* for (int _ = 0; _ < 4; _++) */
        /*      printf("%d %f %f %f %f %f %f %f %f %f %f %f\n", m + _, $accum[_], _mm256_cvtepi32_ps($offSet)[_], _mm256_cvtepi32_ps($curX)[_], _mm256_cvtepi32_ps($curY)[_], _mm256_cvtepi32_ps($iSx)[_], _mm256_cvtepi32_ps($iSy)[_], _mm256_cvtepi32_ps($iTx)[_], _mm256_cvtepi32_ps($iTy)[_], $iSm[_], $iTm[_], _mm256_cvtepi32_ps($notok)[_]); */
        /* exit(1); */

        /* for (int _ = 0; _ < 8; _++) */
        /*      printf("%d %f\n", _, $accum[_]); */

        // 0 1 2 3 4 5 6 7
        $accum = _mm256_add_ps($accum, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256($accum), 4)));
        // 0 1+0 2+1 3+2 4 5+4 6+5 7+6
        $accum = _mm256_add_ps($accum, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256($accum), 8)));
        // 0 1+0 2+1+0 3+2+1+0 4 5+4 6+5+4 7+6+5+4
        $accum = _mm256_add_ps($accum, _mm256_set_m128(_mm_permute_ps(_mm256_castps256_ps128($accum), 0xff), _mm_setzero_ps()));
        // 0 1+0 2+1+0 3+2+1+0 3+2+1+0+4 3+2+1+0+5+4 3+2+1+0+6+5+4 3+2+1+0+7+6+5+4

        /* for (int _ = 0; _ < 8; _++) */
        /*      printf("%d %f\n", _, $accum[_]); */
        /* exit(1); */

        $PartialScore = _mm256_add_ps($PartialSum, $accum);
        /* auto $cond = _mm256_mul_ps($SumOfCoords, _mm256_min_ps( */
        /*     _mm256_add_ps(_mm256_set1_ps(anMinScore), _mm256_mul_ps(_mm256_set1_ps(NormGreediness), $SumOfCoords)), */
        /*     _mm256_mul_ps(_mm256_set1_ps(NormMinScore), $SumOfCoords))); */
        auto $cond = _mm256_load_ps(thresholds + m);
        auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialScore, $cond, _CMP_LT_OQ));
        int cmpmask = _mm256_movemask_ps($cmp);
        if (cmpmask) [[unlikely]] {
            int bit = __builtin_ctz(cmpmask);
            return $PartialScore[bit] / (m + bit + 1);
        }
        $PartialSum = _mm256_add_ps($PartialSum, _mm256_broadcastss_ps(_mm256_castps256_ps128(_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(_mm256_permute_ps($accum, 0xff)), 0xf)))));
    }

        /* for (int _ = 0; _ < 8; _++) */
        /*      printf("%d %f %f\n", _, $PartialSum[_], $SumOfCoords[_]); */
        /* exit(1); */

    $PartialScore = _mm256_div_ps($PartialScore, $SumOfCoords);
    return $PartialScore[(template_feat_size - 1) & 7];
}
#endif

std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,const std::vector<searchFeat> &search_point)
{
    int template_feat_size =  template_point.size();
    int search_feat_size =  search_point.size();

#ifdef USE_MULTISCORE
    static std::vector<templateFeatSimd> template_point_simd;
    template_point_simd.resize((template_feat_size + 7) & ~7);
    for (int m = 0; m < template_point_simd.size() * 8; m += 8) {
        template_point_simd[m / 8].x = _mm256_setr_epi32(template_point[(m) % template_feat_size].x, template_point[(m + 1) % template_feat_size].x, template_point[(m + 2) % template_feat_size].x, template_point[(m + 3) % template_feat_size].x, template_point[(m + 4) % template_feat_size].x, template_point[(m + 4 + 1) % template_feat_size].x, template_point[(m + 4 + 2) % template_feat_size].x, template_point[(m + 4 + 3) % template_feat_size].x);
        template_point_simd[m / 8].y = _mm256_setr_epi32(template_point[(m) % template_feat_size].y, template_point[(m + 1) % template_feat_size].y, template_point[(m + 2) % template_feat_size].y, template_point[(m + 3) % template_feat_size].y, template_point[(m + 4) % template_feat_size].y, template_point[(m + 4 + 1) % template_feat_size].y, template_point[(m + 4 + 2) % template_feat_size].y, template_point[(m + 4 + 3) % template_feat_size].y);
        template_point_simd[m / 8].dx = _mm_setr_epi16(template_point[(m) % template_feat_size].dx, template_point[(m + 1) % template_feat_size].dx, template_point[(m + 2) % template_feat_size].dx, template_point[(m + 3) % template_feat_size].dx, template_point[(m + 4) % template_feat_size].dx, template_point[(m + 4 + 1) % template_feat_size].dx, template_point[(m + 4 + 2) % template_feat_size].dx, template_point[(m + 4 + 3) % template_feat_size].dx);
        template_point_simd[m / 8].dy = _mm_setr_epi16(template_point[(m) % template_feat_size].dy, template_point[(m + 1) % template_feat_size].dy, template_point[(m + 2) % template_feat_size].dy, template_point[(m + 3) % template_feat_size].dy, template_point[(m + 4) % template_feat_size].dy, template_point[(m + 4 + 1) % template_feat_size].dy, template_point[(m + 4 + 2) % template_feat_size].dy, template_point[(m + 4 + 3) % template_feat_size].dy);
        template_point_simd[m / 8].mag = _mm256_setr_ps(template_point[(m) % template_feat_size].mag, template_point[(m + 1) % template_feat_size].mag, template_point[(m + 2) % template_feat_size].mag, template_point[(m + 3) % template_feat_size].mag, template_point[(m + 4) % template_feat_size].mag, template_point[(m + 4 + 1) % template_feat_size].mag, template_point[(m + 4 + 2) % template_feat_size].mag, template_point[(m + 4 + 3) % template_feat_size].mag);
    }
#endif

    static std::vector<matchResult> results0Deg;
    results0Deg.resize(1920 * 1200);
#ifdef USE_OPENMP
    std::atomic<size_t> results0DegSize = 0;
#else
    size_t results0DegSize = 0;
#endif
    float anMinScore     = 0.4 - 1;
    float NormMinScore   = 0.4 / template_feat_size;
    float NormGreediness = ((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size;

    static std::vector<float> thresholds;
    thresholds.resize(((template_feat_size + 7) & ~7));
    for (int m = 0; m < ((template_feat_size + 7) & ~7); m++) {
        auto SumOfCoords = m + 1;
        thresholds[m] = SumOfCoords * std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords);
    }
    
#ifdef USE_OPENMP
    #pragma omp parallel for schedule(dynamic, 8)
#endif
    for(int i = 0; i < 1920; i++)
    {
        for(int j = 0; j < 1200; j += 8)
        {
            int m;
            auto $PartialSum   = _mm256_setzero_ps();
            #ifdef USE_FIRSTUNROLL
            auto $SumOfCoords  = _mm256_set1_epi32(1);
            #else
            auto $SumOfCoords  = _mm256_set1_epi32(1);
            #endif
            auto $j = _mm256_add_epi32(_mm256_set1_epi32(j), _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7));
            auto $350 = _mm256_set1_epi32(350);
            auto $loopterm = _mm256_setzero_si256();

            #ifdef USE_FIRSTUNROLL
            {
                int curX = i + template_point[0].x;
                if (!(curX < 0 || curX > 230)) [[likely]] {
                    auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[0].y));
                    auto $notok = _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350));

                    int iTx = template_point[0].dx;
                    int iTy = template_point[0].dy;
                    float iTm = template_point[0].mag;

                    /* int offSet = curY * 350 + curX; */
                    auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
                    auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
                    // isx isy isx isy isx isy isx isy
                    auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
                    auto $iSy = _mm256_srai_epi32($iSxy, 16);
                    auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));

                    auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
                                                _mm256_mul_ps(
                                                _mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
                                                _mm256_cvtepi32_ps(
                                                _mm256_add_epi32(
                                                _mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
                                                _mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
                    $PartialSum = _mm256_add_ps($PartialSum, $accum);

                    auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[0]), _CMP_LT_OQ));
                    $loopterm = _mm256_castps_si256($cmp);
                    $SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)));
                }
            }
            if (_mm256_movemask_epi8($loopterm) != -1) {
                int curX = i + template_point[1].x;
                if (!(curX < 0 || curX > 230)) [[likely]] {
                    auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[1].y));
                    auto $notok = _mm256_or_si256($loopterm, _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350)));

                    int iTx = template_point[1].dx;
                    int iTy = template_point[1].dy;
                    float iTm = template_point[1].mag;

                    /* int offSet = curY * 350 + curX; */
                    auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
                    auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
                    // isx isy isx isy isx isy isx isy
                    auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
                    auto $iSy = _mm256_srai_epi32($iSxy, 16);
                    auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));

                    auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
                                                _mm256_mul_ps(
                                                _mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
                                                _mm256_cvtepi32_ps(
                                                _mm256_add_epi32(
                                                _mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
                                                _mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
                    $PartialSum = _mm256_add_ps($PartialSum, $accum);

                    auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[1]), _CMP_LT_OQ));
                    $loopterm = _mm256_or_si256($loopterm, _mm256_castps_si256($cmp));
                    $SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)));
                }
                if (_mm256_movemask_epi8($loopterm) != -1) [[unlikely]] {
                    m = 2;
                    #else
                    m = 0;
                    #endif

                    #ifdef USE_UNROLLM4
                    #pragma GCC unroll 4
                    #endif
                    for(;
                        m < template_feat_size;
                        m++, $SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)))
                    ) {
                        int curX = i + template_point[m].x;
                        if (curX < 0 || curX > 230) [[unlikely]] { continue; }

                        auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[m].y));
                        auto $notok = _mm256_or_si256($loopterm, _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350)));

                        int iTx = template_point[m].dx;
                        int iTy = template_point[m].dy;
                        float iTm = template_point[m].mag;

                        /* int offSet = curY * 350 + curX; */
                        auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
                        auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
                        // isx isy isx isy isx isy isx isy
                        auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
                        auto $iSy = _mm256_srai_epi32($iSxy, 16);
                        auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));

                        auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
                                                    _mm256_mul_ps(
                                                    _mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
                                                    _mm256_cvtepi32_ps(
                                                    _mm256_add_epi32(
                                                    _mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
                                                    _mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
                        $PartialSum = _mm256_add_ps($PartialSum, $accum);

                        auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[m]), _CMP_LT_OQ));
                        $loopterm = _mm256_or_si256($loopterm, _mm256_castps_si256($cmp));
                        #ifndef USE_MULTISCORE
                        if (_mm256_movemask_epi8($loopterm) == -1) {
                            break;
                        }
                        #else
                        auto popcorn = __builtin_popcount(_mm256_movemask_ps(_mm256_castsi256_ps($loopterm)));
                        if ((popcorn == 8 || popcorn >= 7) && !(m & 7)) [[unlikely]] {
                            break;
                        }
                        #endif
                    }
            #ifdef USE_FIRSTUNROLL
                }
            }
            #endif

            auto $PartialScore = _mm256_div_ps($PartialSum, _mm256_cvtepi32_ps($SumOfCoords));
            #ifdef USE_MULTISCORE
            auto mask = _mm256_movemask_ps(_mm256_castsi256_ps($loopterm));
            if (mask != 0xff && m < template_feat_size) {
                auto k = __builtin_ctz(~mask);
                $PartialScore[k] = multiScore(
                    template_point_simd.data(), search_point.data(), thresholds.data(),
                    template_feat_size, search_feat_size, i, j, m, $PartialSum[k]);
            }
            #endif
            auto $PartialScoreAbove = _mm256_cmp_ps($PartialScore, _mm256_set1_ps(0.4f), _CMP_GT_OQ);
            for (int k = 0; k < 8; k++) {
                if($PartialScoreAbove[k]) {
                /* printf("%d %d %f\n", i, j, PartialScore); */
                /*     exit(1); */
#ifdef USE_OPENMP
                    results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j + k, 0, $PartialScore[k]};
#else
                    results0Deg[results0DegSize++] = {i, j + k, 0, $PartialScore[k]};
#endif
                }
            }
        }
    }

#ifdef USE_OPENMP
    results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
#else
    results0Deg.resize(results0DegSize);
#endif
    return results0Deg;
}

int main()
{
    //为了测试函数而生成的随机数：template_point/search_point
    std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
    std::uniform_int_distribution<int> intXDist(-18, 32);
    std::uniform_int_distribution<int> intYDist(-48, 49);
    std::uniform_int_distribution<short> shortDXDist(-348, 349);
    std::uniform_int_distribution<short> shortDYDist(-421, 352);
    std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);

    std::vector<templateFeat> template_point(215);
    for (auto &feat : template_point)
    {
        feat.x = intXDist(gen1);
        feat.y = intYDist(gen2);
        feat.dx = shortDXDist(gen3);
        feat.dy = shortDYDist(gen4);
        feat.mag = floatDist(gen5);
    }

    std::uniform_int_distribution<short> shortDxDist(-460, 460);
    std::uniform_int_distribution<short> shortDyDist(-476, 460);
    std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);

    std::vector<searchFeat> search_point(2304000);
    for (auto &feat : search_point)
    {
        feat.dx = shortDxDist(gen);
        /* static int once = printf("%d\n", feat.dx); */
        feat.dy = shortDyDist(gen);
        feat.mag = float_Dist(gen);
    }

    //运行算法,计算耗时
    auto start = std::chrono::high_resolution_clock::now();
    /* for (int i = 0; i < 100; i++) */
    /*     calSimilarity(template_point,search_point); */
    std::vector<matchResult> const &results0Deg = calSimilarity(template_point,search_point);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;

    //保存匹配结果
    saveDataToTxt("/tmp/match_result.txt",results0Deg);

    return 0;
}

修复了性能提升不大的问题，现在是418ms -> 21ms了。

Sep 01 '23 05:09 archibate