请教小彭老师,我要怎么用SSE优化这个程序呢
https://github.com/Obj4ct/Image 我需要优化RGBYUV.cpp这个程序
这是我自己优化的代码,我想知道哪里出现了问题呢:
#include<base/BMPFile.h>
// RGB -> YUV conversion, applied in place to imageData using SSE intrinsics.
// Every loop step reads 3 x 16 bytes, computes y/u/v and writes 3 x 16 bytes
// back over the same region of imageData.
// NOTE(review): the arithmetic below treats the data as 32-bit integer lanes
// with r, g and b each in their own run of memory; imageData is a vector of
// uint8_t, so the per-statement notes below describe what actually happens.
void RGB2YUV(std::vector<uint8_t> &imageData, int width, int height)
{
// Despite the name this is width * height * 3, i.e. a count of channel samples
// (presumably one uint8_t per channel, 3 per pixel — TODO confirm with the BMP loader).
int numPixels = width * height * 3;
// Rounded down to a multiple of 4, although the loop advances by 12 per step.
int numProcessedPixels = numPixels - (numPixels % 4);
for (int i = 0; i < numProcessedPixels; i += 12) {
// Load rgb.
// Each _mm_loadu_si128 reads 16 bytes, so the three loads overlap each other
// (they start at i, i + 4 and i + 8) and the last one touches bytes up to i + 23.
// NOTE(review): for the final iterations i + 23 lies beyond the range the loop
// bound covers, so these loads can read past the end of imageData.
__m128i r = _mm_loadu_si128((__m128i *)&imageData[i]);
__m128i g = _mm_loadu_si128((__m128i *)&imageData[i + 4]);
__m128i b = _mm_loadu_si128((__m128i *)&imageData[i + 8]);
// _mm_cvtepi32_ps converts four signed 32-bit lanes to float, so each lane used
// below is four neighbouring bytes of imageData packed together, not a single
// 8-bit channel value.
// y = 0.299 * r + 0.587 * g + 0.114 * b
__m128i y = _mm_cvtps_epi32(_mm_add_ps(
_mm_add_ps(_mm_mul_ps(_mm_set1_ps(0.299f), _mm_cvtepi32_ps(r)), _mm_mul_ps(_mm_set1_ps(0.587f),
_mm_cvtepi32_ps(g))), _mm_mul_ps(_mm_set1_ps(0.114f), _mm_cvtepi32_ps(b))));
// u = -0.147 * r - 0.289 * g + 0.436 * b
__m128i u = _mm_cvtps_epi32(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.147f), _mm_cvtepi32_ps(r)),
_mm_mul_ps(_mm_set1_ps(-0.289f), _mm_cvtepi32_ps(g))),
_mm_mul_ps(_mm_set1_ps(0.436f), _mm_cvtepi32_ps(b))));
// v = 0.615 * r - 0.515 * g - 0.100 * b
__m128i v = _mm_cvtps_epi32(_mm_add_ps(
_mm_add_ps(_mm_mul_ps(_mm_set1_ps(0.615f), _mm_cvtepi32_ps(r)), _mm_mul_ps(_mm_set1_ps(-0.515f),
_mm_cvtepi32_ps(g))), _mm_mul_ps(_mm_set1_ps(-0.100f), _mm_cvtepi32_ps(b))));
// Store the results (the original comment calls them 16-bit integers, but
// _mm_cvtps_epi32 yields four 32-bit integers per register).
// The three 16-byte stores overlap exactly like the loads: the u store overwrites
// the upper 12 bytes written for y, the v store overwrites the upper 12 bytes
// written for u, and bytes up to i + 23 are modified even though the loop only
// advances by 12. No narrowing or clamping to 0..255 is applied before storing.
_mm_storeu_si128((__m128i *)&imageData[i], y);
_mm_storeu_si128((__m128i *)&imageData[i + 4], u);
_mm_storeu_si128((__m128i *)&imageData[i + 8], v);
}
}
// YUV -> RGB conversion, applied in place to imageData using SSE intrinsics.
// Mirrors the structure of RGB2YUV: three 16-byte loads, three results,
// three 16-byte stores per loop step.
void YUV2RGB(std::vector<uint8_t> &imageData, int width, int height)
{
// NOTE(review): the factor here is 2, whereas RGB2YUV uses width * height * 3 for
// the same buffer — confirm which size is intended.
int numPixels = width * height * 2;
// Rounded down to a multiple of 4, although the loop advances by 12 per step.
int numProcessedPixels = numPixels - (numPixels % 4);
for (int i = 0; i < numProcessedPixels; i += 12) {
// Each load reads 16 bytes; the three loads overlap (start offsets i, i + 4, i + 8)
// and the last one touches bytes up to i + 23.
__m128i y =_mm_loadu_si128((__m128i *)&imageData[i]);
__m128i u =_mm_loadu_si128((__m128i *)&imageData[i + 4]);
__m128i v =_mm_loadu_si128((__m128i *)&imageData[i + 8]);
// Compute the RGB components:
//   r = y + 1.13983 * v
//   g = y - 0.39465 * u - 0.5806 * v
//   b = y + 2.03211 * u
// NOTE(review): y, u and v are __m128i, but _mm_add_ps / _mm_sub_ps / _mm_mul_ps
// operate on __m128 float vectors and no _mm_cvtepi32_ps conversion is applied
// here (RGB2YUV does apply one). This looks like a type mismatch that should not
// compile without lax vector conversions — confirm.
__m128i r = _mm_cvtps_epi32(_mm_add_ps(y, _mm_mul_ps(_mm_set1_ps(1.13983f), v)));
__m128i g = _mm_cvtps_epi32(_mm_sub_ps(_mm_sub_ps(y, _mm_mul_ps(_mm_set1_ps(0.39465f), u)),
_mm_mul_ps(_mm_set1_ps(0.5806f), v)));
__m128i b = _mm_cvtps_epi32(_mm_add_ps(y, _mm_mul_ps(_mm_set1_ps(2.03211f), u)));
// Store the RGB data back into imageData (the original comment calls them 16-bit
// integers; _mm_cvtps_epi32 yields four 32-bit integers per register).
// The three 16-byte stores overlap: each later store overwrites the upper 12 bytes
// of the previous one, and bytes up to i + 23 are written.
_mm_storeu_si128((__m128i *)&imageData[i], r);
_mm_storeu_si128((__m128i *)&imageData[i + 4], g);
_mm_storeu_si128((__m128i *)&imageData[i + 8], b);
}
}
int main()
{
MyValue myValue = MYFunction::ReadBMPFile(FILENAME);
int32_t height = myValue.bmpInfo.GetHeight();
int32_t width = myValue.bmpInfo.GetWidth();
int32_t newHeight = height;
int32_t newWidth = width;
std::vector<uint8_t> YUVImageData = myValue.imageData;
std::vector<uint8_t> RGBImageData;
auto beforeTime = std::chrono::steady_clock::now();
RGB2YUV(YUVImageData, width, height);
RGBImageData = YUVImageData;
YUV2RGB(RGBImageData, newWidth, newHeight);
auto afterTime = std::chrono::steady_clock::now();
MYFunction::WriteBMPFile("outputRGBtoYUV.bmp", YUVImageData, myValue.bmp, myValue.bmpInfo);
MYFunction::WriteBMPFile("outputYUVtoRGB.bmp", RGBImageData, myValue.bmp, myValue.bmpInfo);
double duration_second = std::chrono::duration<double>(afterTime - beforeTime).count();
float_t duration_milliseconds = duration_second * 1000;
std::cout << duration_milliseconds << "毫秒" << std::endl;
return 0;
}
__m128i r = _mm_loadu_si128((__m128i *)&imageData[i]);
__m128i g = _mm_loadu_si128((__m128i *)&imageData[i + 4]);
__m128i b = _mm_loadu_si128((__m128i *)&imageData[i + 8]);
这是错的。这样读取所需的内存排布是RRRRGGGGBBBB,而你实际的内存排布是RGBRGBRGBRGB,需要进行3x4转置后才能保证r寄存器里四个分量分别是四个像素的R分量。
而且,你这个是uint8_t的数据,但你却用epi32把数据当作int32_t进行转换,这是错误的。如果有的话,应该用epu8的函数。但是并没有cvtepu8_ps,并且转换成浮点后再做乘法也并不高效,正确的做法是直接把epu8扩张成epu16然后作为定点数计算。
RGB和YUV互转是很常见的需求,已经有很多人做过了,请看:https://blog.csdn.net/just_sort/article/details/99545096 所以小彭老师这里稍微偷个懒,只是解读一下他的代码:
// Saturates a 32-bit value to the 0..255 range of an unsigned char, matching
// the saturation that _mm_packus_epi32 / _mm_packus_epi16 apply in the SIMD path.
static inline unsigned char ClampToByte(int Value)
{
    return static_cast<unsigned char>(Value < 0 ? 0 : (Value > 255 ? 255 : Value));
}

// Converts an interleaved 8-bit image to three separate Y, U and V planes using
// fixed-point arithmetic (13 fractional bits).
//
//   RGB    : source pixels, 3 bytes per pixel stored in B, G, R byte order.
//   Y,U,V  : output planes, one byte per pixel each, rows Width bytes apart.
//   Width  : pixels per row.  Height : number of rows.
//   Stride : distance in bytes between the starts of consecutive source rows.
//
// Each row is processed 16 pixels at a time with SSE; the remaining
// (Width % 16) pixels are handled by a scalar loop that computes the same
// formula. U and V are stored with a +128 offset. All results are saturated to
// 0..255 in both paths.
// The SIMD path uses _mm_shuffle_epi8 (SSSE3) and _mm_mullo_epi32 /
// _mm_packus_epi32 (SSE4.1).
void RGBToYUVSSE_1(unsigned char *RGB, unsigned char *Y, unsigned char *U, unsigned char *V, int Width, int Height, int Stride) {
    const int Shift = 13;
    const int HalfV = 1 << (Shift - 1);
    // Note (小彭老师): the floating-point YUV coefficients are converted to fixed point
    // (e.g. 1.0 becomes 2^13, 1.5 becomes 2^13 + 2^12).
    const int Y_B_WT = 0.114f * (1 << Shift), Y_G_WT = 0.587f * (1 << Shift), Y_R_WT = (1 << Shift) - Y_B_WT - Y_G_WT;
    const int U_B_WT = 0.436f * (1 << Shift), U_G_WT = -0.28886f * (1 << Shift), U_R_WT = -(U_B_WT + U_G_WT);
    const int V_B_WT = -0.10001 * (1 << Shift), V_G_WT = -0.51499f * (1 << Shift), V_R_WT = -(V_B_WT + V_G_WT);
    __m128i Weight_YB = _mm_set1_epi32(Y_B_WT), Weight_YG = _mm_set1_epi32(Y_G_WT), Weight_YR = _mm_set1_epi32(Y_R_WT);
    __m128i Weight_UB = _mm_set1_epi32(U_B_WT), Weight_UG = _mm_set1_epi32(U_G_WT), Weight_UR = _mm_set1_epi32(U_R_WT);
    __m128i Weight_VB = _mm_set1_epi32(V_B_WT), Weight_VG = _mm_set1_epi32(V_G_WT), Weight_VR = _mm_set1_epi32(V_R_WT);
    __m128i C128 = _mm_set1_epi32(128);
    __m128i Half = _mm_set1_epi32(HalfV);
    __m128i Zero = _mm_setzero_si128();
    // Note (小彭老师): 16 pixels form one block, because a 128-bit SSE register holds
    // 16 uint8_t (unsigned char) values.
    const int BlockSize = 16, Block = Width / BlockSize;
    for (int YY = 0; YY < Height; YY++) {
        unsigned char *LinePS = RGB + YY * Stride;
        unsigned char *LinePY = Y + YY * Width;
        unsigned char *LinePU = U + YY * Width;
        unsigned char *LinePV = V + YY * Width;
        for (int XX = 0; XX < Block * BlockSize; XX += BlockSize, LinePS += BlockSize * 3) {
            __m128i Src1, Src2, Src3, Blue, Green, Red;
            // 48 source bytes = 16 interleaved pixels.
            Src1 = _mm_loadu_si128((__m128i *)(LinePS + 0));
            Src2 = _mm_loadu_si128((__m128i *)(LinePS + 16));
            Src3 = _mm_loadu_si128((__m128i *)(LinePS + 32));
            // Note (小彭老师): below is the 16x3 transpose.
            // It reorders 16 consecutive pixels from  B G R B G R B G R ... (interleaved)
            // into the SIMD-friendly sequences       B x16, G x16, R x16.
            // A shuffle index of -1 writes zero, so the three partial results can be OR-ed.
            Blue = _mm_shuffle_epi8(Src1, _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
            Blue = _mm_or_si128(Blue, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1)));
            Blue = _mm_or_si128(Blue, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13)));
            Green = _mm_shuffle_epi8(Src1, _mm_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
            Green = _mm_or_si128(Green, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1)));
            Green = _mm_or_si128(Green, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14)));
            Red = _mm_shuffle_epi8(Src1, _mm_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
            Red = _mm_or_si128(Red, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1)));
            Red = _mm_or_si128(Red, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15)));
            // Note (小彭老师): this is the epu8 -> epu16 widening mentioned earlier; unpack
            // instructions do not care about signedness, so the epi16-named variants are equivalent.
            // The bytes of the three registers are expanded into 12 registers of four
            // 32-bit ints each, so that the following multiplications cannot overflow.
            __m128i Blue16L = _mm_unpacklo_epi8(Blue, Zero);
            __m128i Blue16H = _mm_unpackhi_epi8(Blue, Zero);
            __m128i Blue32LL = _mm_unpacklo_epi16(Blue16L, Zero);
            __m128i Blue32LH = _mm_unpackhi_epi16(Blue16L, Zero);
            __m128i Blue32HL = _mm_unpacklo_epi16(Blue16H, Zero);
            __m128i Blue32HH = _mm_unpackhi_epi16(Blue16H, Zero);
            __m128i Green16L = _mm_unpacklo_epi8(Green, Zero);
            __m128i Green16H = _mm_unpackhi_epi8(Green, Zero);
            __m128i Green32LL = _mm_unpacklo_epi16(Green16L, Zero);
            __m128i Green32LH = _mm_unpackhi_epi16(Green16L, Zero);
            __m128i Green32HL = _mm_unpacklo_epi16(Green16H, Zero);
            __m128i Green32HH = _mm_unpackhi_epi16(Green16H, Zero);
            __m128i Red16L = _mm_unpacklo_epi8(Red, Zero);
            __m128i Red16H = _mm_unpackhi_epi8(Red, Zero);
            __m128i Red32LL = _mm_unpacklo_epi16(Red16L, Zero);
            __m128i Red32LH = _mm_unpackhi_epi16(Red16L, Zero);
            __m128i Red32HL = _mm_unpacklo_epi16(Red16H, Zero);
            __m128i Red32HH = _mm_unpackhi_epi16(Red16H, Zero);
            // Note (小彭老师): this is the core weighted-sum computation
            // (0.114 * B + 0.587 * G + 0.299 * R, expressed in fixed point).
            // Y[0 - 15] = (Y_B_WT * Blue[0 - 15] + Y_G_WT * Green[0 - 15] + Y_R_WT * Red[0 - 15] + HalfV) >> Shift;
            __m128i LL_Y = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LL, Weight_YB), _mm_add_epi32(_mm_mullo_epi32(Green32LL, Weight_YG), _mm_mullo_epi32(Red32LL, Weight_YR))), Half), Shift);
            __m128i LH_Y = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LH, Weight_YB), _mm_add_epi32(_mm_mullo_epi32(Green32LH, Weight_YG), _mm_mullo_epi32(Red32LH, Weight_YR))), Half), Shift);
            __m128i HL_Y = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HL, Weight_YB), _mm_add_epi32(_mm_mullo_epi32(Green32HL, Weight_YG), _mm_mullo_epi32(Red32HL, Weight_YR))), Half), Shift);
            __m128i HH_Y = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HH, Weight_YB), _mm_add_epi32(_mm_mullo_epi32(Green32HH, Weight_YG), _mm_mullo_epi32(Red32HH, Weight_YR))), Half), Shift);
            // Repack four registers of four ints into one register of 16 bytes (with unsigned saturation).
            _mm_storeu_si128((__m128i*)(LinePY + XX), _mm_packus_epi16(_mm_packus_epi32(LL_Y, LH_Y), _mm_packus_epi32(HL_Y, HH_Y)));
            // U[0 - 15] = ((U_B_WT * Blue[0 - 15] + U_G_WT * Green[0 - 15] + U_R_WT * Red[0 - 15] + HalfV) >> Shift) + 128;
            __m128i LL_U = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LL, Weight_UB), _mm_add_epi32(_mm_mullo_epi32(Green32LL, Weight_UG), _mm_mullo_epi32(Red32LL, Weight_UR))), Half), Shift), C128);
            __m128i LH_U = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LH, Weight_UB), _mm_add_epi32(_mm_mullo_epi32(Green32LH, Weight_UG), _mm_mullo_epi32(Red32LH, Weight_UR))), Half), Shift), C128);
            __m128i HL_U = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HL, Weight_UB), _mm_add_epi32(_mm_mullo_epi32(Green32HL, Weight_UG), _mm_mullo_epi32(Red32HL, Weight_UR))), Half), Shift), C128);
            __m128i HH_U = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HH, Weight_UB), _mm_add_epi32(_mm_mullo_epi32(Green32HH, Weight_UG), _mm_mullo_epi32(Red32HH, Weight_UR))), Half), Shift), C128);
            _mm_storeu_si128((__m128i*)(LinePU + XX), _mm_packus_epi16(_mm_packus_epi32(LL_U, LH_U), _mm_packus_epi32(HL_U, HH_U)));
            // V[0 - 15] = ((V_B_WT * Blue[0 - 15] + V_G_WT * Green[0 - 15] + V_R_WT * Red[0 - 15] + HalfV) >> Shift) + 128;
            __m128i LL_V = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LL, Weight_VB), _mm_add_epi32(_mm_mullo_epi32(Green32LL, Weight_VG), _mm_mullo_epi32(Red32LL, Weight_VR))), Half), Shift), C128);
            __m128i LH_V = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32LH, Weight_VB), _mm_add_epi32(_mm_mullo_epi32(Green32LH, Weight_VG), _mm_mullo_epi32(Red32LH, Weight_VR))), Half), Shift), C128);
            __m128i HL_V = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HL, Weight_VB), _mm_add_epi32(_mm_mullo_epi32(Green32HL, Weight_VG), _mm_mullo_epi32(Red32HL, Weight_VR))), Half), Shift), C128);
            __m128i HH_V = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(Blue32HH, Weight_VB), _mm_add_epi32(_mm_mullo_epi32(Green32HH, Weight_VG), _mm_mullo_epi32(Red32HH, Weight_VR))), Half), Shift), C128);
            _mm_storeu_si128((__m128i*)(LinePV + XX), _mm_packus_epi16(_mm_packus_epi32(LL_V, LH_V), _mm_packus_epi32(HL_V, HH_V)));
        }
        // Note (小彭老师): the output is written to three separate planes, each with one
        // byte per pixel (a planar layout, not interleaved). If interleaved YUV444 is what
        // you need, apply another 16x3 transpose to the results before storing them.
        //
        // Scalar tail for the last Width % 16 pixels of the row. The results are clamped
        // explicitly: V can reach 157 + 128 = 285 (pure red) or -157 + 128 = -29, which
        // would wrap around when stored into an unsigned char, whereas the SIMD path
        // above saturates such values to 255 / 0.
        for (int XX = Block * BlockSize; XX < Width; XX++, LinePS += 3) {
            int Blue = LinePS[0], Green = LinePS[1], Red = LinePS[2];
            LinePY[XX] = ClampToByte((Y_B_WT * Blue + Y_G_WT * Green + Y_R_WT * Red + HalfV) >> Shift);
            LinePU[XX] = ClampToByte(((U_B_WT * Blue + U_G_WT * Green + U_R_WT * Red + HalfV) >> Shift) + 128);
            LinePV[XX] = ClampToByte(((V_B_WT * Blue + V_G_WT * Green + V_R_WT * Red + HalfV) >> Shift) + 128);
        }
    }
}
感谢小彭老师的解答,我在等待您的回复时,修改了代码,把RGB像素连续存储在一起,就是这样
__m128i rgb[4]; // 创建数组存4组RGB像素
for (int j = 0; j < 4; j++) {
rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i *>(&imageData[i + j * 3]));
}
最外层循环是
for (int i = 0; i < numProcessedPixels; i += 12)
也就是说每12次一个循环
下面是我RGB到YUV的代码,代码存在一个问题,图片可以被正常处理,但是正常处理的范围只有图片的3分之一,也就是说图片并不能完整被处理,以下是这个函数的代码
// RGB -> YUV conversion, applied in place to imageData (second attempt).
// Each outer step covers 12 bytes (4 pixels of 3 bytes); the per-pixel
// arithmetic is done in scalar double precision on values pulled out of SSE
// registers with _mm_cvtsi128_si32.
void RGB2YUV(std::vector<uint8_t> &imageData, int width, int height)
{
// Pixel count (no factor 3). NOTE(review): i below is used as a BYTE index into
// imageData, so bounding it by a pixel count means only the first
// numProcessedPixels bytes are visited — about one third of a 3-byte-per-pixel
// buffer.
int numPixels = width * height;
// Rounded down to a multiple of 12, the step of the loop.
int numProcessedPixels = (numPixels / 12) * 12;
for (int i = 0; i < numProcessedPixels; i += 12) {
__m128i rgb[4]; // array holding 4 groups of RGB pixels
// Each load reads 16 bytes starting at pixel j of this group, i.e. bytes
// i + 3j .. i + 3j + 15, although only the first 3 of them are used later.
// NOTE(review): for j = 3 the load touches byte i + 24; once i + 24 reaches
// imageData.size() this reads past the end of the vector (the
// heap-buffer-overflow reported when the bound is width * height * 3).
for (int j = 0; j < 4; j++) {
rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i *>(&imageData[i + j * 3]));
}
// Extract the R, G and B channel values of the 4 groups.
// In every 32-bit lane: byte 0 -> r, byte 1 -> g, byte 2 -> b.
__m128i r[4], g[4], b[4];
for (int j = 0; j < 4; j++) {
r[j] = _mm_and_si128(rgb[j], _mm_set1_epi32(0xFF));
g[j] = _mm_and_si128(_mm_srli_epi32(rgb[j], 8), _mm_set1_epi32(0xFF));
b[j] = _mm_and_si128(_mm_srli_epi32(rgb[j], 16), _mm_set1_epi32(0xFF));
}
// Compute the YUV values and store them back into the image data.
// _mm_cvtsi128_si32 returns only the lowest 32-bit lane, so just one pixel per
// register is actually used; the other three lanes are ignored.
for (int j = 0; j < 4; j++) {
// y = 0.299 * r + 0.587 * g + 0.114 * b, truncated to uint8_t
__m128i y = _mm_cvtsi32_si128(static_cast<uint8_t>(0.299 * _mm_cvtsi128_si32(r[j]) + 0.587 * _mm_cvtsi128_si32(g[j]) +
0.114 * _mm_cvtsi128_si32(b[j])));
// u = (b - y) * 0.564 + 128
__m128i u = _mm_cvtsi32_si128(static_cast<uint8_t>((_mm_cvtsi128_si32(b[j]) - _mm_cvtsi128_si32(y)) * 0.564 + 128));
// v = (r - y) * 0.713 + 128
__m128i v = _mm_cvtsi32_si128(static_cast<uint8_t>((_mm_cvtsi128_si32(r[j]) - _mm_cvtsi128_si32(y)) * 0.713 + 128));
imageData[i + j * 3] = _mm_cvtsi128_si32(y);
imageData[i + j * 3 + 1] = _mm_cvtsi128_si32(u);
imageData[i + j * 3 + 2] = _mm_cvtsi128_si32(v);
// Range check.
// NOTE(review): the elements of imageData are uint8_t, so neither `< 0` nor
// `> 255` can ever be true; these branches have no effect.
for (int channel = 0; channel < 3; channel++) {
if (imageData[i + j * 3 + channel] < 0) {
imageData[i + j * 3 + channel] = 0;
}
else if (imageData[i + j * 3 + channel] > 255) {
imageData[i + j * 3 + channel] = 255;
}
}
}
// Debug output, printed once per outer iteration (every 12 bytes).
std::cout << "imageData size :" << imageData.size() << std::endl;
std::cout << "numProcessedPixels: " << numProcessedPixels << std::endl;
std::cout << "numPixels: " << numPixels << std::endl;
std::cout << "line: " << i << std::endl;
}
}
经过测试发现,如果写成int numPixels = width * height * 3,asan会报内存错误. 不写 * 3,图片只会处理三分之一.如果*2.9954(总之就是接近3的数,忘记了具体值),可以处理整个图片 经过输出发现(输入的图片512x512,大小还要乘上通道数3): int numPixels = width * height * 3的情况下:
imageData.size()=786432
numProcessedPixels: 786432,
numPixels: 786432
line: 786396
asan报错: heap-buffer-overflow,错误代码是在
rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i *>(&imageData[i + j * 3]));
我想知道这是什么原因所导致的,是数据类型吗
int numPixels = width * height*2.9954的情况下:
imageData size :786432
numProcessedPixels: 785220
numPixels: 785226
line: 785208
可以看到 imageData.size()=numProcessedPixels=numPixels=786432,当这三个变量相等时,报了 heap-buffer-overflow 错误,只有当numProcessedPixels和numPixels小于imageData.size()时才不会报错
请问小彭老师,这是什么原因呢
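你这样一个个分别取出rgb分量是不行的,不会提升任何效率,还不如正常写标量代码。至于 heap-buffer-overflow:_mm_loadu_si128 每次固定读取16个字节,而你每个像素只用到其中3个字节,所以当 i + j * 3 + 15 超过 imageData.size() - 1 时(也就是循环走到缓冲区末尾附近时),读取就越界了;乘2.9954不报错只是因为循环提前结束,末尾的像素并没有被处理。不乘3时只处理三分之一,是因为 i 是字节下标,而 width * height 是像素数。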
无法顺畅的大口呼吸,是活着的最好证明
---原始邮件--- 发件人: @.> 发送时间: 2023年10月28日(周六) 中午12:28 收件人: @.>; 抄送: @.@.>; 主题: Re: [parallel101/simdtutor] 请教小彭老师,我要怎么用SSE优化这个程序呢 (Issue #7)
感谢小彭老师的解答,我在等待您的回复时,修改了代码,把RGB像素连续存储在一起,就是这样
__m128i rgb[4]; // 创建数组存4组RGB像素
for (int j = 0; j < 4; j++) {
rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i *>(&imageData[i + j * 3]));
}
最外层循环是
for (int i = 0; i < numProcessedPixels; i += 12)
也就是说每12次一个循环
下面是我RGB到YUV的代码,代码存在一个问题,图片可以被正常处理,但是正常处理的范围只有图片的3分之一,也就是说图片并不能完整被处理,以下是这个函数的代码
// RGB到YUV的转换
void RGB2YUV(std::vector<uint8_t> &imageData, int width, int height)
{
int numPixels = width * height;
int numProcessedPixels = (numPixels / 12) * 12;
for (int i = 0; i < numProcessedPixels; i += 12) {
__m128i rgb[4]; // 创建数组存4组RGB像素
for (int j = 0; j < 4; j++) {
rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i *>(&imageData[i + j * 3]));
}
// 分别提取4组RGB通道的值
__m128i r[4], g[4], b[4];
for (int j = 0; j < 4; j++) {
r[j] = _mm_and_si128(rgb[j], _mm_set1_epi32(0xFF));
g[j] = _mm_and_si128(_mm_srli_epi32(rgb[j], 8), _mm_set1_epi32(0xFF));
b[j] = _mm_and_si128(_mm_srli_epi32(rgb[j], 16), _mm_set1_epi32(0xFF));
}
// 计算YUV值并存储回图像数据
for (int j = 0; j < 4; j++) {
__m128i y = _mm_cvtsi32_si128(static_cast<uint8_t>(0.299 * _mm_cvtsi128_si32(r[j]) + 0.587 * _mm_cvtsi128_si32(g[j]) +
0.114 * _mm_cvtsi128_si32(b[j])));
__m128i u = _mm_cvtsi32_si128(static_cast<uint8_t>((_mm_cvtsi128_si32(b[j]) - _mm_cvtsi128_si32(y)) * 0.564 + 128));
__m128i v = _mm_cvtsi32_si128(static_cast<uint8_t>((_mm_cvtsi128_si32(r[j]) - _mm_cvtsi128_si32(y)) * 0.713 + 128));
imageData[i + j * 3] = _mm_cvtsi128_si32(y);
imageData[i + j * 3 + 1] = _mm_cvtsi128_si32(u);
imageData[i + j * 3 + 2] = _mm_cvtsi128_si32(v);
// 范围检测
for (int channel = 0; channel < 3; channel++) {
if (imageData[i + j * 3 + channel] < 0) {
imageData[i + j * 3 + channel] = 0;
}
else if (imageData[i + j * 3 + channel] > 255) {
imageData[i + j * 3 + channel] = 255;
}
}
}
std::cout << "imageData size :" << imageData.size() << std::endl;
std::cout << "numProcessedPixels: " << numProcessedPixels << std::endl;
std::cout << "numPixels: " << numPixels << std::endl;
std::cout << "line: " << i << std::endl;
}
}
经过测试发现,如果写成int numPixels = width * height * 3,asan会报内存错误.
不写 * 3,图片只会处理三分之一.如果*2.9954(总之就是接近3的数,忘记了具体值),可以处理整个图片
经过输出发现(输入的图片512x512,大小还要乘上通道数3):
int numPixels = width * height * 3的情况下:
imageData.size()=786432 numProcessedPixels: 786432, numPixels: 786432 line: 786396
asan报错:
heap-buffer-overflow,错误代码是在
rgb[j] = _mm_loadu_si128(reinterpret_cast<__m128i *>(&imageData[i + j * 3]));
我想知道这是什么原因所导致的,是数据类型吗
int numPixels = width * height*2.9954的情况下:
imageData size :786432 numProcessedPixels: 785220 numPixels: 785226 line: 785208
可以看到
imageData.size()=numProcessedPixels=numPixels=786432
当这三个变量相等时,报了
heap-buffer-overflow,错误
只有当numProcessedPixels和numPixels小于imageData.size()时才不会报错
请问小彭老师,这是什么原因呢
— Reply to this email directly, view it on GitHub, or unsubscribe. You are receiving this because you commented.Message ID: @.***>
好的👌🏻,了解了,谢谢老师 @archibate