simdtutor 请教小彭老师，我要怎么用SSE优化这个程序呢

https://github.com/Obj4ct/Image 我需要优化RGBYUV.cpp这个程序

这是我自己优化的代码,我想知道哪里出现了问题呢:

#include<base/BMPFile.h>
// RGB -> YUV conversion, in place, using SSE intrinsics (code as posted in the question).
//
// imageData is indexed as bytes; the width * height * 3 bound below treats it as
// 3 bytes per pixel. width and height are the image dimensions.
//
// NOTE(review): this version does not compute a correct per-pixel conversion:
//  - the three loads read overlapping 16-byte windows, not separate R, G and B
//    channels;
//  - _mm_cvtepi32_ps converts four 32-bit integers, so every "component" is
//    really four adjacent bytes reinterpreted as one int32;
//  - each iteration advances 12 bytes but touches bytes i .. i + 23, so the
//    last iterations read and write past the end of imageData.
void RGB2YUV(std::vector<uint8_t> &imageData, int width, int height)

{
    // Despite the name this is a byte count (3 bytes per pixel).
    int numPixels = width * height * 3;
    // Rounded down to a multiple of 4, although the loop below advances by 12.
    int numProcessedPixels = numPixels - (numPixels % 4);

    for (int i = 0; i < numProcessedPixels; i += 12) {
        // Three unaligned 16-byte loads starting at byte offsets i, i + 4 and i + 8.
        __m128i r = _mm_loadu_si128((__m128i *)&imageData[i]);
        __m128i g = _mm_loadu_si128((__m128i *)&imageData[i + 4]);
        __m128i b = _mm_loadu_si128((__m128i *)&imageData[i + 8]);

        // y = 0.299 * r + 0.587 * g + 0.114 * b, evaluated in float on four
        // 32-bit lanes and converted back to 32-bit integers.
        __m128i y = _mm_cvtps_epi32(_mm_add_ps(
            _mm_add_ps(_mm_mul_ps(_mm_set1_ps(0.299f), _mm_cvtepi32_ps(r)), _mm_mul_ps(_mm_set1_ps(0.587f),
            _mm_cvtepi32_ps(g))), _mm_mul_ps(_mm_set1_ps(0.114f), _mm_cvtepi32_ps(b))));

        // u = -0.147 * r - 0.289 * g + 0.436 * b
        __m128i u = _mm_cvtps_epi32(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.147f), _mm_cvtepi32_ps(r)),
                                                          _mm_mul_ps(_mm_set1_ps(-0.289f), _mm_cvtepi32_ps(g))),
                                               _mm_mul_ps(_mm_set1_ps(0.436f), _mm_cvtepi32_ps(b))));

        // v = 0.615 * r - 0.515 * g - 0.100 * b
        __m128i v = _mm_cvtps_epi32(_mm_add_ps(
            _mm_add_ps(_mm_mul_ps(_mm_set1_ps(0.615f), _mm_cvtepi32_ps(r)), _mm_mul_ps(_mm_set1_ps(-0.515f),
            _mm_cvtepi32_ps(g))), _mm_mul_ps(_mm_set1_ps(-0.100f), _mm_cvtepi32_ps(b))));

        // Store the results back over the input. The original comment called
        // these 16-bit integers, but y/u/v each hold four 32-bit integers and
        // every store writes 16 bytes, so the three stores overlap one another.
        _mm_storeu_si128((__m128i *)&imageData[i], y);
        _mm_storeu_si128((__m128i *)&imageData[i + 4], u);
        _mm_storeu_si128((__m128i *)&imageData[i + 8], v);
    }
}

// YUV -> RGB conversion, in place, using SSE intrinsics (code as posted in the question).
//
// imageData is indexed as bytes; width and height are the image dimensions.
//
// NOTE(review): y, u and v are __m128i values but are passed straight to
// _mm_add_ps / _mm_sub_ps / _mm_mul_ps, which take __m128; no integer-to-float
// conversion is performed anywhere in this function. The loads and stores also
// use the same overlapping 16-byte windows at i, i + 4 and i + 8 while the loop
// advances 12 bytes per iteration.
void YUV2RGB(std::vector<uint8_t> &imageData, int width, int height)
{
    // NOTE(review): the bound uses * 2 here, whereas RGB2YUV uses * 3 — confirm
    // which byte count is intended.
    int numPixels = width * height * 2;
    // Rounded down to a multiple of 4, although the loop below advances by 12.
    int numProcessedPixels = numPixels - (numPixels % 4);

    for (int i = 0; i < numProcessedPixels; i += 12) {

        // Three unaligned 16-byte loads starting at byte offsets i, i + 4 and i + 8.
        __m128i y =_mm_loadu_si128((__m128i *)&imageData[i]);
        __m128i u =_mm_loadu_si128((__m128i *)&imageData[i + 4]);

        __m128i v =_mm_loadu_si128((__m128i *)&imageData[i + 8]);

        // Compute the RGB components:
        //   r = y + 1.13983 * v
        //   g = y - 0.39465 * u - 0.5806 * v
        //   b = y + 2.03211 * u
        __m128i r = _mm_cvtps_epi32(_mm_add_ps(y, _mm_mul_ps(_mm_set1_ps(1.13983f), v)));

        __m128i g = _mm_cvtps_epi32(_mm_sub_ps(_mm_sub_ps(y, _mm_mul_ps(_mm_set1_ps(0.39465f), u)),
                                    _mm_mul_ps(_mm_set1_ps(0.5806f), v)));

        __m128i b = _mm_cvtps_epi32(_mm_add_ps(y, _mm_mul_ps(_mm_set1_ps(2.03211f), u)));

        // Store the RGB data back into imageData. The original comment called
        // these 16-bit integers, but _mm_cvtps_epi32 yields four 32-bit
        // integers and every store writes 16 bytes.
        _mm_storeu_si128((__m128i *)&imageData[i], r);
        _mm_storeu_si128((__m128i *)&imageData[i + 4], g);
        _mm_storeu_si128((__m128i *)&imageData[i + 8], b);
    }
}

int main()
{
    MyValue myValue = MYFunction::ReadBMPFile(FILENAME);
    int32_t height = myValue.bmpInfo.GetHeight();
    int32_t width = myValue.bmpInfo.GetWidth();
    int32_t newHeight = height;
    int32_t newWidth = width;
    std::vector<uint8_t> YUVImageData = myValue.imageData;
    std::vector<uint8_t> RGBImageData;
    auto beforeTime = std::chrono::steady_clock::now();
    RGB2YUV(YUVImageData, width, height);
    RGBImageData = YUVImageData;
    YUV2RGB(RGBImageData, newWidth, newHeight);

    auto afterTime = std::chrono::steady_clock::now();

    MYFunction::WriteBMPFile("outputRGBtoYUV.bmp", YUVImageData, myValue.bmp, myValue.bmpInfo);
    MYFunction::WriteBMPFile("outputYUVtoRGB.bmp", RGBImageData, myValue.bmp, myValue.bmpInfo);
    double duration_second = std::chrono::duration<double>(afterTime - beforeTime).count();
    float_t duration_milliseconds = duration_second * 1000;
    std::cout << duration_milliseconds << "毫秒" << std::endl;
    return 0;
}

Oct 26 '23 03:10 Obj4ct

        __m128i r = _mm_loadu_si128((__m128i *)&imageData[i]);
        __m128i g = _mm_loadu_si128((__m128i *)&imageData[i + 4]);
        __m128i b = _mm_loadu_si128((__m128i *)&imageData[i + 8]);

这是错的。这样读取所需的内存排布是RRRRGGGGBBBB，而你实际的内存排布是RGBRGBRGBRGB，需要进行3x4转置后才能保证r寄存器里四个分量分别是四个像素的R分量。

Oct 27 '23 13:10 archibate

而且，你这个是uint8_t的数据，但你却用epi32把数据当作int32_t进行转换，这是错误的。如果有的话，应该用epu8的函数。但是并没有cvtepu8_ps，并且转换成浮点后再做乘法也并不高效，正确的做法是直接把epu8扩张成epu16然后作为定点数计算。

Oct 27 '23 13:10 archibate

RGB和YUV互转是很常见的需求，已经有很多人做过了，请看：https://blog.csdn.net/just_sort/article/details/99545096 所以小彭老师这里稍微偷个懒，只是解读一下他的代码：

// Converts an interleaved 8-bit image into three separate 8-bit planes (Y, U, V)
// using fixed-point arithmetic and SSE intrinsics.
//
// RGB     - source pixels, 3 bytes per pixel read in the order blue, green, red;
//           consecutive rows are Stride bytes apart.
// Y, U, V - destination planes, one byte per pixel, Width bytes per row.
// Width   - pixels per row.
// Height  - number of rows.
// Stride  - source row pitch in bytes.
//
// Each row is processed in blocks of 16 pixels with SIMD; the remaining
// Width % 16 pixels go through the scalar loop at the end of the row.
// Uses _mm_shuffle_epi8 (SSSE3) and _mm_mullo_epi32 / _mm_packus_epi32 (SSE4.1).
void RGBToYUVSSE_1(unsigned char *RGB, unsigned char *Y, unsigned char *U, unsigned char *V, int Width, int Height, int Stride) {
	// Number of fractional bits of the fixed-point weights, and the rounding
	// term (one half) added before shifting back down.
	const int Shift = 13;
	const int HalfV = 1 << (Shift - 1);
	// Note (Xiao Peng): the floating-point coefficients of the YUV conversion are
	// turned into fixed-point numbers (e.g. 1.0 becomes 2^13, 1.5 becomes 2^13 + 2^12).
	// The red weights are derived from the other two so that the Y weights sum to
	// exactly 1 << Shift and the U and V weights each sum to exactly 0.
	const int Y_B_WT = 0.114f * (1 << Shift), Y_G_WT = 0.587f * (1 << Shift), Y_R_WT = (1 << Shift) - Y_B_WT - Y_G_WT;
	const int U_B_WT = 0.436f * (1 << Shift), U_G_WT = -0.28886f * (1 << Shift), U_R_WT = -(U_B_WT + U_G_WT);
	const int V_B_WT = -0.10001 * (1 << Shift), V_G_WT = -0.51499f * (1 << Shift), V_R_WT = -(V_B_WT + V_G_WT);
	// The same weights broadcast to all four 32-bit lanes.
	__m128i Weight_YB = _mm_set1_epi32(Y_B_WT), Weight_YG = _mm_set1_epi32(Y_G_WT), Weight_YR = _mm_set1_epi32(Y_R_WT);
	__m128i Weight_UB = _mm_set1_epi32(U_B_WT), Weight_UG = _mm_set1_epi32(U_G_WT), Weight_UR = _mm_set1_epi32(U_R_WT);
	__m128i Weight_VB = _mm_set1_epi32(V_B_WT), Weight_VG = _mm_set1_epi32(V_G_WT), Weight_VR = _mm_set1_epi32(V_R_WT);
	__m128i C128 = _mm_set1_epi32(128);
	__m128i Half = _mm_set1_epi32(HalfV);
	__m128i Zero = _mm_setzero_si128();
	const int BlockSize = 16, Block = Width / BlockSize; // Note (Xiao Peng): 16 pixels form one block, because a 128-bit SSE register holds 16 uint8_t (unsigned char) values
	for (int YY = 0; YY < Height; YY++) {
		unsigned char *LinePS = RGB + YY * Stride;
		unsigned char *LinePY = Y + YY * Width;
		unsigned char *LinePU = U + YY * Width;
		unsigned char *LinePV = V + YY * Width;
		for (int XX = 0; XX < Block * BlockSize; XX += BlockSize, LinePS += BlockSize * 3) {
			__m128i Src1, Src2, Src3, Blue, Green, Red;

			// 16 pixels * 3 bytes = 48 source bytes, fetched as three 16-byte loads.
			Src1 = _mm_loadu_si128((__m128i *)(LinePS + 0));
			Src2 = _mm_loadu_si128((__m128i *)(LinePS + 16));
			Src3 = _mm_loadu_si128((__m128i *)(LinePS + 32));

			// Note (Xiao Peng): what follows is the author's 16x3 transpose.
			// It turns the interleaved byte order of 16 consecutive pixels
			//   B G R B G R B G R ... (48 bytes)
			// into three registers that suit SIMD processing:
			//   16 x B, 16 x G, 16 x R.
			// A shuffle index of -1 yields a zero byte, so the three partial
			// results for each channel can simply be OR-ed together.

			Blue = _mm_shuffle_epi8(Src1, _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
			Blue = _mm_or_si128(Blue, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1)));
			Blue = _mm_or_si128(Blue, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13)));

			Green = _mm_shuffle_epi8(Src1, _mm_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
			Green = _mm_or_si128(Green, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1)));
			Green = _mm_or_si128(Green, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14)));

			Red = _mm_shuffle_epi8(Src1, _mm_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
			Red = _mm_or_si128(Red, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1)));
			Red = _mm_or_si128(Red, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15)));

			// Note (Xiao Peng): this is the widening of epu8 to epu16; the unpack
			// instructions do not care about signedness, so the equivalent
			// epi8/epi16-named forms are used, interleaving with zero.
			// The bytes of the three channel registers are widened twice (8 -> 16 -> 32
			// bits), giving 12 registers of four ints each, so that the following
			// multiplications cannot overflow.

			__m128i Blue16L = _mm_unpacklo_epi8(Blue, Zero);
			__m128i Blue16H = _mm_unpackhi_epi8(Blue, Zero);
			__m128i Blue32LL = _mm_unpacklo_epi16(Blue16L, Zero);
			__m128i Blue32LH = _mm_unpackhi_epi16(Blue16L, Zero);
			__m128i Blue32HL = _mm_unpacklo_epi16(Blue16H, Zero);
			__m128i Blue32HH = _mm_unpackhi_epi16(Blue16H, Zero);

			__m128i Green16L = _mm_unpacklo_epi8(Green, Zero);
			__m128i Green16H = _mm_unpackhi_epi8(Green, Zero);
			__m128i Green32LL = _mm_unpacklo_epi16(Green16L, Zero);
			__m128i Green32LH = _mm_unpackhi_epi16(Green16L, Zero);
			__m128i Green32HL = _mm_unpacklo_epi16(Green16H, Zero);
			__m128i Green32HH = _mm_unpackhi_epi16(Green16H, Zero);

			__m128i Red16L = _mm_unpacklo_epi8(Red, Zero);
			__m128i Red16H = _mm_unpackhi_epi8(Red, Zero);
			__m128i Red32LL = _mm_unpacklo_epi16(Red16L, Zero);
			__m128i Red32LH = _mm_unpackhi_epi16(Red16L, Zero);
			__m128i Red32HL = _mm_unpacklo_epi16(Red16H, Zero);
			__m128i Red32HH = _mm_unpackhi_epi16(Red16H, Zero);

			// Note (Xiao Peng): this is the core computation, the weighted sum
			// Y = 0.114 * B + 0.587 * G + (1 - 0.114 - 0.587) * R in fixed point.
			// Computes: Y[0 - 15] = (Y_B_WT * Blue[0 - 15] + Y_G_WT * Green[0 - 15] + Y_R_WT * Red[0 - 15] + HalfV) >> Shift;
			__m128i LL_Y = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LL, Weight_YB), _mm_add_epi32(_mm_mullo_epi32(Green32LL, Weight_YG), _mm_mullo_epi32(Red32LL, Weight_YR))), Half), Shift);
			__m128i LH_Y = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LH, Weight_YB), _mm_add_epi32(_mm_mullo_epi32(Green32LH, Weight_YG), _mm_mullo_epi32(Red32LH, Weight_YR))), Half), Shift);
			__m128i HL_Y = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HL, Weight_YB), _mm_add_epi32(_mm_mullo_epi32(Green32HL, Weight_YG), _mm_mullo_epi32(Red32HL, Weight_YR))), Half), Shift);
			__m128i HH_Y = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HH, Weight_YB), _mm_add_epi32(_mm_mullo_epi32(Green32HH, Weight_YG), _mm_mullo_epi32(Red32HH, Weight_YR))), Half), Shift);
			_mm_storeu_si128((__m128i*)(LinePY + XX), _mm_packus_epi16(_mm_packus_epi32(LL_Y, LH_Y), _mm_packus_epi32(HL_Y, HH_Y)));    // Repack the four registers of four ints each into one register of 16 bytes (with unsigned saturation)

			// Computes: U[0 - 15] = ((U_B_WT * Blue[0 - 15] + U_G_WT * Green[0 - 15] + U_R_WT * Red[0 - 15] + HalfV) >> Shift) + 128;
			__m128i LL_U = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LL, Weight_UB), _mm_add_epi32(_mm_mullo_epi32(Green32LL, Weight_UG), _mm_mullo_epi32(Red32LL, Weight_UR))), Half), Shift), C128);
			__m128i LH_U = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LH, Weight_UB), _mm_add_epi32(_mm_mullo_epi32(Green32LH, Weight_UG), _mm_mullo_epi32(Red32LH, Weight_UR))), Half), Shift), C128);
			__m128i HL_U = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HL, Weight_UB), _mm_add_epi32(_mm_mullo_epi32(Green32HL, Weight_UG), _mm_mullo_epi32(Red32HL, Weight_UR))), Half), Shift), C128);
			__m128i HH_U = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HH, Weight_UB), _mm_add_epi32(_mm_mullo_epi32(Green32HH, Weight_UG), _mm_mullo_epi32(Red32HH, Weight_UR))), Half), Shift), C128);
			_mm_storeu_si128((__m128i*)(LinePU + XX), _mm_packus_epi16(_mm_packus_epi32(LL_U, LH_U), _mm_packus_epi32(HL_U, HH_U)));

			// Computes: V[0 - 15] = ((V_B_WT * Blue[0 - 15] + V_G_WT * Green[0 - 15] + V_R_WT * Red[0 - 15] + HalfV) >> Shift) + 128;
			__m128i LL_V = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LL, Weight_VB), _mm_add_epi32(_mm_mullo_epi32(Green32LL, Weight_VG), _mm_mullo_epi32(Red32LL, Weight_VR))), Half), Shift), C128);
			__m128i LH_V = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LH, Weight_VB), _mm_add_epi32(_mm_mullo_epi32(Green32LH, Weight_VG), _mm_mullo_epi32(Red32LH, Weight_VR))), Half), Shift), C128);
			__m128i HL_V = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HL, Weight_VB), _mm_add_epi32(_mm_mullo_epi32(Green32HL, Weight_VG), _mm_mullo_epi32(Red32HL, Weight_VR))), Half), Shift), C128);
			__m128i HH_V = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HH, Weight_VB), _mm_add_epi32(_mm_mullo_epi32(Green32HH, Weight_VG), _mm_mullo_epi32(Red32HH, Weight_VR))), Half), Shift), C128);
			_mm_storeu_si128((__m128i*)(LinePV + XX), _mm_packus_epi16(_mm_packus_epi32(LL_V, LH_V), _mm_packus_epi32(HL_V, HH_V)));
		}
		// Note (Xiao Peng): the output here is written to three separate planes
		// (Y, U and V each at full resolution). If interleaved YUV444 is wanted,
		// another 16x3 transpose back is needed before storing.
		// Scalar tail for the Width % 16 pixels the SIMD loop did not cover.
		// NOTE(review): unlike the packus-based stores above, these assignments
		// to unsigned char do not clamp. The V expression can exceed 255 (for
		// B = 0, G = 0, R = 255 it works out to 285), so the two paths can
		// disagree on such pixels — confirm whether clamping is wanted here.
		for (int XX = Block * BlockSize; XX < Width; XX++, LinePS += 3) {
			int Blue = LinePS[0], Green = LinePS[1], Red = LinePS[2];
			LinePY[XX] = (Y_B_WT * Blue + Y_G_WT * Green + Y_R_WT * Red + HalfV) >> Shift;
			LinePU[XX] = ((U_B_WT * Blue + U_G_WT * Green + U_R_WT * Red + HalfV) >> Shift) + 128;
			LinePV[XX] = ((V_B_WT * Blue + V_G_WT * Green + V_R_WT * Red + HalfV) >> Shift) + 128;
		}
	}
}

Oct 27 '23 13:10 archibate

感谢小彭老师的解答,我在等待您的回复时,修改了代码,把RGB像素连续存储在一起,就是这样

__m128i rgb[4];  // array holding 4 groups of RGB pixels
// Each load starts 3 bytes after the previous one and reads 16 bytes.
for (int j = 0; j < 4; j++) {
     rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i *>(&imageData[i + j * 3]));
}

最外层循环是 for (int i = 0; i < numProcessedPixels; i += 12) 也就是说每12次一个循环

下面是我RGB到YUV的代码,代码存在一个问题,图片可以被正常处理,但是正常处理的范围只有图片的3分之一,也就是说图片并不能完整被处理,以下是这个函数的代码

// RGB -> YUV conversion, in place (second version posted in the thread).
//
// imageData is indexed as bytes, 3 bytes per pixel (see the i + j * 3 indexing
// below); width and height are the image dimensions.
//
// NOTE(review):
//  - numPixels counts pixels, but i is used as a byte index, so only the first
//    width * height bytes (one third of a 3-byte-per-pixel image) are visited;
//  - every _mm_loadu_si128 reads 16 bytes, and the last one of an iteration
//    starts at i + 9, i.e. it reads bytes up to i + 24. Once the loop bound is
//    raised to the full byte count, the final iterations read past the end of
//    imageData;
//  - only the lowest 32-bit lane of each register is ever used
//    (_mm_cvtsi128_si32), so the arithmetic is effectively scalar.

void RGB2YUV(std::vector<uint8_t> &imageData, int width, int height)
{

    // Pixel count (no * 3), rounded down to a multiple of 12.
    int numPixels = width * height;  
    int numProcessedPixels = (numPixels / 12) * 12; 
                                           
    for (int i = 0; i < numProcessedPixels; i += 12) {
        __m128i rgb[4];  // array holding 4 groups of RGB pixels
        // Load 16 bytes starting at each of the 4 pixels of this iteration.
        for (int j = 0; j < 4; j++) {
            rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i *>(&imageData[i + j * 3]));
        }

        // Extract the channel values of the 4 groups: bits 0-7, 8-15 and 16-23
        // of every 32-bit lane become r, g and b respectively.
        __m128i r[4], g[4], b[4];
        for (int j = 0; j < 4; j++) {
            r[j] = _mm_and_si128(rgb[j], _mm_set1_epi32(0xFF));
            g[j] = _mm_and_si128(_mm_srli_epi32(rgb[j], 8), _mm_set1_epi32(0xFF));
            b[j] = _mm_and_si128(_mm_srli_epi32(rgb[j], 16), _mm_set1_epi32(0xFF));
        }

        // Compute the YUV values and store them back into the image data.
        // _mm_cvtsi128_si32 extracts only the lowest 32-bit lane, so each pass
        // handles a single pixel using double arithmetic:
        //   y = 0.299 * r + 0.587 * g + 0.114 * b
        //   u = (b - y) * 0.564 + 128
        //   v = (r - y) * 0.713 + 128
        for (int j = 0; j < 4; j++) {
            __m128i y = _mm_cvtsi32_si128(static_cast<uint8_t>(0.299 * _mm_cvtsi128_si32(r[j]) + 0.587 * _mm_cvtsi128_si32(g[j]) +
                                                               0.114 * _mm_cvtsi128_si32(b[j])));
            __m128i u = _mm_cvtsi32_si128(static_cast<uint8_t>((_mm_cvtsi128_si32(b[j]) - _mm_cvtsi128_si32(y)) * 0.564 + 128));
            __m128i v = _mm_cvtsi32_si128(static_cast<uint8_t>((_mm_cvtsi128_si32(r[j]) - _mm_cvtsi128_si32(y)) * 0.713 + 128));

            imageData[i + j * 3] = _mm_cvtsi128_si32(y);
            imageData[i + j * 3 + 1] = _mm_cvtsi128_si32(u);
            imageData[i + j * 3 + 2] = _mm_cvtsi128_si32(v);

            // Range check.
            // NOTE(review): the elements are uint8_t, so neither comparison
            // can ever be true; the values were already narrowed by the
            // static_cast<uint8_t> above before reaching this point.
            for (int channel = 0; channel < 3; channel++) {
                if (imageData[i + j * 3 + channel] < 0) {
                    imageData[i + j * 3 + channel] = 0;
                }
                else if (imageData[i + j * 3 + channel] > 255) {
                    imageData[i + j * 3 + channel] = 255;
                }
            }
        }
        // Debug output, printed on every loop iteration.
        std::cout << "imageData size :" << imageData.size() << std::endl;
        std::cout << "numProcessedPixels: " << numProcessedPixels << std::endl;
        std::cout << "numPixels: " << numPixels << std::endl;
        std::cout << "line: " << i << std::endl;
    }
}

经过测试发现，如果写成 int numPixels = width * height * 3，asan 会报内存错误；不写 * 3，图片只会处理三分之一；如果写成 * 2.9954（总之就是接近 3 的数，忘记了具体值），则可以处理整个图片。经过输出发现（输入的图片是 512x512，大小还要乘上通道数 3），在 int numPixels = width * height * 3 的情况下：

imageData.size()=786432
numProcessedPixels: 786432,
numPixels: 786432
line: 786396

asan报错: heap-buffer-overflow,错误代码是在

rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i *>(&imageData[i + j * 3]));

我想知道这是什么原因所导致的,是数据类型吗

int numPixels = width * height*2.9954的情况下:

imageData size :786432
numProcessedPixels: 785220
numPixels: 785226
line: 785208

可以看到，当 imageData.size()=numProcessedPixels=numPixels=786432，即这三个变量相等时，报了 heap-buffer-overflow；只有当 numProcessedPixels 和 numPixels 小于 imageData.size() 时才不会报错。

请问小彭老师,这是什么原因呢

Oct 28 '23 04:10 Obj4ct

你这样一个个分别取出rgb分量是不行的，不会提升任何效率，还不如正常写标量代码

无法顺畅的大口呼吸，是活着的最好证明

---原始邮件--- 发件人: @.> 发送时间: 2023年10月28日(周六) 中午12:28 收件人: @.>; 抄送: @.@.>; 主题: Re: [parallel101/simdtutor] 请教小彭老师，我要怎么用SSE优化这个程序呢 (Issue #7)

感谢小彭老师的解答,我在等待您的回复时,修改了代码,把RGB像素连续存储在一起,就是这样 __m128i rgb[4]; // 创建数组存4组RGB像素 for (int j = 0; j < 4; j++) { rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i *>(&imageData[i + j * 3])); }
最外层循环是 for (int i = 0; i < numProcessedPixels; i += 12) 也就是说每12次一个循环

下面是我RGB到YUV的代码,代码存在一个问题,图片可以被正常处理,但是正常处理的范围只有图片的3分之一,也就是说图片并不能完整被处理,以下是这个函数的代码 // RGB到YUV的转换 void RGB2YUV(std::vector<uint8_t> &imageData, int width, int height) { int numPixels = width * height; int numProcessedPixels = (numPixels / 12) * 12; for (int i = 0; i < numProcessedPixels; i += 12) { __m128i rgb[4]; // 创建数组存4组RGB像素 for (int j = 0; j < 4; j++) { rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i >(&imageData[i + j * 3])); } // 分别提取4组RGB通道的值 __m128i r[4], g[4], b[4]; for (int j = 0; j < 4; j++) { r[j] = _mm_and_si128(rgb[j], _mm_set1_epi32(0xFF)); g[j] = _mm_and_si128(_mm_srli_epi32(rgb[j], 8), _mm_set1_epi32(0xFF)); b[j] = _mm_and_si128(_mm_srli_epi32(rgb[j], 16), _mm_set1_epi32(0xFF)); } // 计算YUV值并存储回图像数据 for (int j = 0; j < 4; j++) { __m128i y = _mm_cvtsi32_si128(static_cast<uint8_t>(0.299 * _mm_cvtsi128_si32(r[j]) + 0.587 * _mm_cvtsi128_si32(g[j]) + 0.114 * _mm_cvtsi128_si32(b[j]))); __m128i u = _mm_cvtsi32_si128(static_cast<uint8_t>((_mm_cvtsi128_si32(b[j]) - _mm_cvtsi128_si32(y)) * 0.564 + 128)); __m128i v = _mm_cvtsi32_si128(static_cast<uint8_t>((_mm_cvtsi128_si32(r[j]) - _mm_cvtsi128_si32(y)) * 0.713 + 128)); imageData[i + j * 3] = _mm_cvtsi128_si32(y); imageData[i + j * 3 + 1] = _mm_cvtsi128_si32(u); imageData[i + j * 3 + 2] = _mm_cvtsi128_si32(v); // 范围检测 for (int channel = 0; channel < 3; channel++) { if (imageData[i + j * 3 + channel] < 0) { imageData[i + j * 3 + channel] = 0; } else if (imageData[i + j * 3 + channel] > 255) { imageData[i + j * 3 + channel] = 255; } } } std::cout << "imageData size :" << imageData.size() << std::endl; std::cout << "numProcessedPixels: " << numProcessedPixels << std::endl; std::cout << "numPixels: " << numPixels << std::endl; std::cout << "line: " << i << std::endl; } }
经过测试发现,如果写成int numPixels = width * height * 3,asan会报内存错误. 不写 * 3,图片只会处理三分之一.如果2.9954(总之就是接近3的数,忘记了具体值),可以处理整个图片经过输出发现(输入的图片512x512,大小还要乘上通道数3): int numPixels = width * height * 3的情况下: imageDtata.size()=786432 numProcessedPixels: 786432, numPixels: 786432 line: 786396
asan报错: heap-buffer-overflow,错误代码是在

rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i *>(&imageData[i + j * 3]));

我想知道这是什么原因所导致的,是数据类型吗

int numPixels = width * height*2.9954的情况下: imageData size :786432 numProcessedPixels: 785220 numPixels: 785226 line: 785208
可以看到 imageDtata.size()=numProcessedPixels=numPixels=786432 当这三个变量相等时,报了 heap-buffer-overflow,错误只有当numProcessedPixels和numPixels小于imageData.size()时才不会报错

请问小彭老师,这是什么原因呢

— Reply to this email directly, view it on GitHub, or unsubscribe. You are receiving this because you commented.Message ID: @.***>

Oct 29 '23 18:10 archibate

好的👌🏻，了解了，谢谢老师 @archibate

Oct 30 '23 01:10 Obj4ct