c++ - 将 RGBA 图像转换为 RGB 图像
问题描述
我尝试将 RGBA 图像转换为 RGB 图像(每个通道 8 位无符号整数)。起初我使用 OpenCV 和以下函数
// Point the matrix's data pointer at the existing buffer ptr1.
m_bufferMat.data = (uchar*) (ptr1);
// Ask OpenCV to convert the matrix in place to type CV_8UC3.
// NOTE(review): the OpenCV documentation describes convertTo()'s type argument
// as selecting the depth only, with the channel count kept from the input, so
// this call presumably does not drop the alpha channel - confirm; cvtColor()
// with an RGBA-to-RGB code is the documented channel-dropping conversion.
m_bufferMat.convertTo(m_bufferMat, CV_8UC3);
但是应用程序的其他部分并不需要 OpenCV,所以我尝试自己转换图像,这样就不必链接和包含 OpenCV 库。我能想到的最快的方法是遍历缓冲区,把每个像素的前 3 个字节复制到另一个缓冲区,如下所示:
// For each of the width * height pixels, copy the first three bytes from the
// input (ptr1) to the output (ptr2) and step over the fourth input byte.
for(int i = 0; i < width * height; i++) {
*(ptr2++) = *(ptr1++);
*(ptr2++) = *(ptr1++);
*(ptr2++) = *(ptr1++);
ptr1++; // skip the fourth (alpha) byte; nothing is written for it
}
但是这样就需要复制数据,速度可能不够快。OpenCV 函数比我自己的函数快 1.5 倍。有人知道为什么吗?我能否实现一种不需要复制的转换?
解决方案
可以做的优化有很多。下面是一个用来试验这些优化的测试程序,其中包含几个示例优化:
#include <intrin.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <iostream>
#include <string>
#include <vector>
// Image dimensions in pixels. Both are volatile, so every use is a real load
// and width * height cannot be folded into a compile-time constant.
// NOTE(review): presumably done so the optimizer cannot specialise the
// benchmarked loops for one fixed size - confirm intent.
volatile int width = 1920;
volatile int height = 1080;
// Input image: 4 bytes per pixel. Filled with random bytes in main().
unsigned char* src = new unsigned char[width * height * 4];
// Output image: 3 bytes per pixel. Written by each conversion function.
unsigned char* dst = new unsigned char[width * height * 3];
// Copy of the output of DefaultFunc(), taken in main(); PrintShortestTime()
// compares dst against it to check each implementation.
unsigned char* refDst = new unsigned char[width * height * 3];
// Baseline RGBA -> RGB conversion used as the timing and correctness reference.
// Copies the first three bytes of every 4-byte pixel in src to dst and skips
// the fourth. The loop bound width * height is evaluated on every iteration;
// since both globals are volatile that is two extra loads per pixel, which is
// exactly what NPreCalculatedFunc() hoists out of the loop.
void DefaultFunc() {
    const unsigned char* in = src;
    unsigned char* out = dst;
    int pixel = 0;
    while (pixel < width * height) {
        out[0] = in[0];
        out[1] = in[1];
        out[2] = in[2];
        in += 4;   // three copied bytes plus the skipped alpha byte
        out += 3;
        ++pixel;
    }
}
// Same byte-by-byte conversion as DefaultFunc(), except that the pixel count
// width * height is read once before the loop instead of on every iteration.
void NPreCalculatedFunc() {
    const unsigned char* in = src;
    unsigned char* out = dst;
    const int pixelCount = width * height;
    for (int remaining = pixelCount; remaining > 0; --remaining) {
        // Copy three colour bytes...
        *out++ = *in++;
        *out++ = *in++;
        *out++ = *in++;
        // ...and step over the fourth byte of the source pixel.
        ++in;
    }
}
// RGBA -> RGB conversion that loads each source pixel as one 32-bit word and
// then writes its three low bytes to dst one at a time.
//
// The word is loaded with memcpy rather than by dereferencing src through an
// unsigned int*: src is an unsigned char array, so reading it through an
// unsigned int lvalue would violate the aliasing rules. A fixed-size memcpy
// like this is normally compiled to a single load.
//
// Extracting bytes with shifts assumes a little-endian target (the first byte
// in memory is the least significant byte of the word).
void ReadFullPixelFunc() {
    static_assert(sizeof(unsigned int) == 4, "one source pixel must fill one unsigned int");
    const unsigned char* in = src;
    unsigned char* out = dst;
    const int pixelCount = width * height;
    for (int i = 0; i < pixelCount; ++i) {
        unsigned int pixel;
        std::memcpy(&pixel, in, sizeof pixel);
        in += sizeof pixel;
        *out++ = static_cast<unsigned char>(pixel & 0xff);
        *out++ = static_cast<unsigned char>((pixel >> 8) & 0xff);
        *out++ = static_cast<unsigned char>((pixel >> 16) & 0xff);
        // Bits 24-31 (the fourth source byte) are dropped.
    }
}
// RGBA -> RGB conversion that works on groups of four pixels: four 32-bit
// source words (16 bytes) are repacked into three 32-bit destination words
// (12 bytes), so both reads and writes are word-sized.
//
// Changes relative to the first version of this routine:
//  * pixels left over when width * height is not a multiple of 4 are now
//    converted by a byte-wise tail loop (previously they were left untouched);
//  * words are moved with memcpy instead of through unsigned int* casts of the
//    unsigned char buffers, which avoids aliasing violations;
//  * a non-positive pixel count simply does nothing.
//
// The shift/mask packing assumes a little-endian target.
void ReadAndWriteFullPixelFunc() {
    static_assert(sizeof(unsigned int) == 4, "one source pixel must fill one unsigned int");
    const unsigned char* in = src;
    unsigned char* out = dst;
    const int pixelCount = width * height;

    for (int group = pixelCount / 4; group > 0; --group) {
        unsigned int px[4];
        std::memcpy(px, in, sizeof px);
        in += sizeof px;

        // Memory layout of the three output words (little-endian):
        //   word 0: R0 G0 B0 R1
        //   word 1: G1 B1 R2 G2
        //   word 2: B2 R3 G3 B3
        const unsigned int packed[3] = {
            (px[0] & 0x00ffffffu) | (px[1] << 24),
            ((px[1] >> 8) & 0xffffu) | (px[2] << 16),
            ((px[2] >> 16) & 0xffu) | (px[3] << 8),
        };
        std::memcpy(out, packed, sizeof packed);
        out += sizeof packed;
    }

    // Tail: at most three pixels that did not fill a whole group.
    for (int rest = pixelCount % 4; rest > 0; --rest) {
        out[0] = in[0];
        out[1] = in[1];
        out[2] = in[2];
        in += 4;
        out += 3;
    }
}
// RGBA -> RGB conversion using 128-bit SSE registers: each iteration loads
// four source pixels (16 bytes), uses a byte shuffle to drop the four alpha
// bytes and pack the twelve colour bytes into the low end of the register,
// then stores those twelve bytes as an 8-byte store plus a 4-byte store.
//
// Changes relative to the first version of this routine:
//  * pixels left over when width * height is not a multiple of 4 are now
//    converted by a byte-wise tail loop (previously they were left untouched);
//  * the unused writeBuf local is gone;
//  * a non-positive pixel count simply does nothing.
void ReadAndWriteFullPixelXmmFunc() {
    const unsigned char* in = src;
    unsigned char* out = dst;
    const int pixelCount = width * height;

    // Shuffle control, listed from byte 15 down to byte 0. Output bytes 0-11
    // take source bytes 0,1,2, 4,5,6, 8,9,10, 12,13,14 (every fourth byte is
    // skipped); a control byte with the high bit set zeroes output bytes 12-15.
    const char zeroed = static_cast<char>(0x80);
    const __m128i reorder = _mm_set_epi8(zeroed, zeroed, zeroed, zeroed,
                                         14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);

    for (int group = pixelCount / 4; group > 0; --group) {
        const __m128i rgba4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));
        in += 16;
        const __m128i rgb4 = _mm_shuffle_epi8(rgba4, reorder);
        _mm_storel_epi64(reinterpret_cast<__m128i*>(out), rgb4);  // packed bytes 0-7
        _mm_storeu_si32(out + 8, _mm_bsrli_si128(rgb4, 8));       // packed bytes 8-11
        out += 12;
    }

    // Tail: at most three pixels that did not fill a whole group.
    for (int rest = pixelCount % 4; rest > 0; --rest) {
        out[0] = in[0];
        out[1] = in[1];
        out[2] = in[2];
        in += 4;
        out += 3;
    }
}
// Runs f 500 times, timing each call with the __rdtsc() counter, and reports
// the fastest run.
//
// f       - conversion routine to benchmark; expected to write its result to dst.
// label   - text printed in front of the measurement.
// refTime - tick count of the baseline implementation; when it is 0 the
//           printed ratio is 1.0.
//
// dst is zeroed before the runs and compared against refDst afterwards; a
// mismatch prints a failure line but the timing is still printed and returned.
// Returns the smallest tick count observed.
unsigned long long PrintShortestTime(std::function<void()> f, const char *label, unsigned long long refTime) {
    // Clear the output so a routine that writes nothing cannot pass the check.
    memset(dst, 0, width * height * 3);

    unsigned long long best = ~0ull;
    for (int run = 0; run < 500; ++run) {
        const auto begin = __rdtsc();
        f();
        const auto elapsed = __rdtsc() - begin;
        if (elapsed < best) {
            best = elapsed;
        }
    }

    const bool matchesReference = memcmp(refDst, dst, width * height * 3) == 0;
    if (!matchesReference) {
        printf("Fail - result does not equal refrence!\n");
    }

    double ratio = 1.0;
    if (refTime) {
        ratio = static_cast<double>(best) / static_cast<double>(refTime);
    }
    printf("%s : %llu clock cycles - %0.3lf x base implementation time\n", label, best, ratio);
    return best;
}
int main() {
for (int i = 0; i < width * height * 4; i++) {
src[i] = rand() & 0xff;
}
DefaultFunc();
memcpy(refDst, dst, width * height * 3);
auto refTime = PrintShortestTime(DefaultFunc, "default, unoptimized", 0);
PrintShortestTime(NPreCalculatedFunc, "n precalculated", refTime);
PrintShortestTime(ReadFullPixelFunc, "n precalculated, reading 1 pixel at a time", refTime);
PrintShortestTime(ReadAndWriteFullPixelFunc, "reading and writing ints at a time", refTime);
PrintShortestTime(ReadAndWriteFullPixelXmmFunc, "with xmm intrinsincs", refTime);
}
在我的机器上,使用 Visual Studio 编译(x64 或 x86),最后一个版本所需的时间大约是基本版本的 0.4 倍:
default, unoptimized : 7511848 clock cycles - 1.000 x base implementation time
n precalculated : 7383696 clock cycles - 0.983 x base implementation time
n precalculated, reading 1 pixel at a time : 7354644 clock cycles - 0.979 x base implementation time
reading and writing ints at a time : 4613816 clock cycles - 0.614 x base implementation time
with xmm intrinsincs : 3036824 clock cycles - 0.404 x base implementation time
通过展开循环,以更大的块写入内存,可能会进一步优化。
推荐阅读
- codeigniter-4 - 我的问题我如何将代码放入模型和控制器 Codeigniter4
- .net - Alpine Docker 缺少 kernel32.dll 库
- excel - 在已查找的另一个值的同一行中查找值?
- c# - IAsyncActionFilter 中的 OnActionExecutionAsync 有时只会命中
- azure - 如何将 Azure 订阅密钥标头添加到 JAX-WS SOAP 消息
- apache-flink - 在源头分配时间戳时,flink如何处理延迟?
- python - 如何在 python 中使用元类来增加或覆盖添加到类中的方法
- retrofit2 - 与协程流一起使用时取消改造请求
- python - 如何在执行语句时每 x 秒打印到 python 控制台经过的时间?
- sql - 每 1 小时运行一次的 Postgres 查询