Files
grd/grd-cuda-avc-utils.cu
2026-02-13 13:06:50 +09:00

179 lines
4.6 KiB
Plaintext

/*
* Copyright (C) 2021 Pascal Nowack
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
* 02111-1307, USA.
*/
/*
* Generate the PTX instructions with:
* clang --cuda-gpu-arch=sm_30 -S src/grd-cuda-avc-utils.cu -o data/grd-cuda-avc-utils_30.ptx --no-cuda-version-check -O3 --cuda-device-only -Wall -Wextra
*
* or
*
* nvcc -arch=compute_30 -ptx grd-cuda-avc-utils.cu -o grd-cuda-avc-utils_30.ptx
*
* Note: This requires CUDA < 11, since support for generating
* Kepler-capable PTX code was removed in CUDA 11.
*/
#include <stdint.h>
extern "C"
{
/*
 * Computes the 8-bit luma value of one RGB pixel.
 *
 * Uses the fixed-point weights 54/256, 183/256 and 18/256 for the red, green
 * and blue channel respectively. The weights add up to 255, so the weighted
 * sum never exceeds 255 * 255 and the shifted result is at most 254.
 *
 * NOTE(review): the weights look like a BT.709 luma approximation - confirm
 * against the consumer of the encoded stream.
 */
__device__ uint8_t
rgb_to_y (uint8_t r,
          uint8_t g,
          uint8_t b)
{
  const int32_t weighted_sum = 54 * r + 183 * g + 18 * b;

  /* Drop the 8 fractional bits of the fixed-point sum */
  return weighted_sum >> 8;
}
/*
 * Computes the 8-bit U chroma value of one RGB pixel.
 *
 * The fixed-point weights (-29, -99, +128, each scaled by 256) add up to
 * zero, so pixels with r == g == b map to the centre value 128. The
 * unbiased value lies in [-128, 127], hence the result lies in [0, 255].
 *
 * The intermediate sum can be negative; the right shift is expected to be
 * an arithmetic shift, i.e. to round towards negative infinity.
 */
__device__ uint8_t
rgb_to_u (uint8_t r,
          uint8_t g,
          uint8_t b)
{
  const int32_t weighted_sum = -29 * r - 99 * g + 128 * b;
  const int32_t unbiased = weighted_sum >> 8;

  /* Move the signed chroma value into the unsigned 8-bit range */
  return unbiased + 128;
}
/*
 * Computes the 8-bit V chroma value of one RGB pixel.
 *
 * The fixed-point weights (+128, -116, -12, each scaled by 256) add up to
 * zero, so pixels with r == g == b map to the centre value 128. The
 * unbiased value lies in [-128, 127], hence the result lies in [0, 255].
 *
 * The intermediate sum can be negative; the right shift is expected to be
 * an arithmetic shift, i.e. to round towards negative infinity.
 */
__device__ uint8_t
rgb_to_v (uint8_t r,
          uint8_t g,
          uint8_t b)
{
  const int32_t weighted_sum = 128 * r - 116 * g - 12 * b;
  const int32_t unbiased = weighted_sum >> 8;

  /* Move the signed chroma value into the unsigned 8-bit range */
  return unbiased + 128;
}
/*
 * Converts one 2x2 area of 32-bit BGRX pixels into four luma samples and one
 * U/V sample pair.
 *
 * Launch layout: a two-dimensional grid where every thread handles the 2x2
 * area at (blockIdx.x * blockDim.x + threadIdx.x,
 *          blockIdx.y * blockDim.y + threadIdx.y), counted in 2x2 units.
 * Threads outside of (aligned_width / 2) x (aligned_height / 2) return
 * without touching memory, so the grid may be larger than the frame.
 *
 * Parameters:
 *  dst_data:       Destination buffer. Luma rows start at offset 0 and are
 *                  aligned_stride bytes apart. The interleaved U/V rows
 *                  start at offset aligned_height * aligned_stride, use the
 *                  same stride, and store U at the even and V at the
 *                  following odd byte.
 *  src_data:       Source pixels, one uint32_t per pixel, with src_width
 *                  pixels per row (no row padding is applied here).
 *  src_width:      Width of the source frame in pixels
 *  src_height:     Height of the source frame in pixels
 *  aligned_width:  Width of the destination area in pixels
 *  aligned_height: Height of the destination area in pixels
 *  aligned_stride: Distance in bytes between two destination rows
 *
 * Pixels of the 2x2 area, which lie outside of the source frame, get a luma
 * value of 0 and contribute 0 to the chroma average. The chroma average is
 * always the channel sum divided by 4, regardless of how many of the four
 * pixels lie inside of the source frame.
 *
 * The thread index is kept in 32 bits and all buffer offsets are calculated
 * in 64 bits. 16-bit variables would wrap for oversized grids or a source
 * width of 65535, and the int-promoted products of two 16-bit values can
 * overflow for large frame dimensions.
 */
__global__ void
convert_2x2_bgrx_area_to_yuv420_nv12 (uint8_t  *dst_data,
                                      uint32_t *src_data,
                                      uint16_t  src_width,
                                      uint16_t  src_height,
                                      uint16_t  aligned_width,
                                      uint16_t  aligned_height,
                                      uint16_t  aligned_stride)
{
  uint8_t *dst_y0, *dst_y1, *dst_y2, *dst_y3, *dst_u, *dst_v;
  const uint32_t *src_u32;
  uint64_t s0, s1, s2, s3;
  uint32_t bgrx;
  int32_t r_a = 0, g_a = 0, b_a = 0;
  uint8_t r, g, b;
  uint32_t x_1x1, y_1x1;
  uint32_t x_2x2, y_2x2;
  bool in_col0, in_col1, in_row0, in_row1;

  x_2x2 = blockIdx.x * blockDim.x + threadIdx.x;
  y_2x2 = blockIdx.y * blockDim.y + threadIdx.y;

  if (x_2x2 >= (uint32_t) (aligned_width >> 1) ||
      y_2x2 >= (uint32_t) (aligned_height >> 1))
    return;

  /*
   *  -------------
   *  | d_0 | d_1 |
   *  -------------
   *  | d_2 | d_3 |
   *  -------------
   */
  s0 = 0;
  s1 = 1;
  s2 = src_width;
  s3 = (uint64_t) src_width + 1;

  x_1x1 = x_2x2 << 1;
  y_1x1 = y_2x2 << 1;

  /* Only dereferenced for pixels, which lie inside of the source frame */
  src_u32 = src_data + (uint64_t) y_1x1 * src_width + x_1x1;

  dst_y0 = dst_data + (uint64_t) y_1x1 * aligned_stride + x_1x1;
  dst_y1 = dst_y0 + 1;
  dst_y2 = dst_y0 + aligned_stride;
  dst_y3 = dst_y2 + 1;
  dst_u = dst_data + (uint64_t) aligned_height * aligned_stride +
          (uint64_t) y_2x2 * aligned_stride + x_1x1;
  dst_v = dst_u + 1;

  in_col0 = x_1x1 < src_width;
  in_col1 = x_1x1 + 1 < src_width;
  in_row0 = y_1x1 < src_height;
  in_row1 = y_1x1 + 1 < src_height;

  /* d_0 */
  if (in_col0 && in_row0)
    {
      bgrx = src_u32[s0];

      /* Blue is the least significant byte, followed by green and red */
      b = (uint8_t) (bgrx & 0xFF);
      g = (uint8_t) ((bgrx >> 8) & 0xFF);
      r = (uint8_t) ((bgrx >> 16) & 0xFF);
      b_a += b;
      g_a += g;
      r_a += r;
      *dst_y0 = rgb_to_y (r, g, b);
    }
  else
    {
      *dst_y0 = 0;
    }

  /* d_1 */
  if (in_col1 && in_row0)
    {
      bgrx = src_u32[s1];

      b = (uint8_t) (bgrx & 0xFF);
      g = (uint8_t) ((bgrx >> 8) & 0xFF);
      r = (uint8_t) ((bgrx >> 16) & 0xFF);
      b_a += b;
      g_a += g;
      r_a += r;
      *dst_y1 = rgb_to_y (r, g, b);
    }
  else
    {
      *dst_y1 = 0;
    }

  /* d_2 */
  if (in_col0 && in_row1)
    {
      bgrx = src_u32[s2];

      b = (uint8_t) (bgrx & 0xFF);
      g = (uint8_t) ((bgrx >> 8) & 0xFF);
      r = (uint8_t) ((bgrx >> 16) & 0xFF);
      b_a += b;
      g_a += g;
      r_a += r;
      *dst_y2 = rgb_to_y (r, g, b);
    }
  else
    {
      *dst_y2 = 0;
    }

  /* d_3 (in_col1 implies in_col0) */
  if (in_col1 && in_row1)
    {
      bgrx = src_u32[s3];

      b = (uint8_t) (bgrx & 0xFF);
      g = (uint8_t) ((bgrx >> 8) & 0xFF);
      r = (uint8_t) ((bgrx >> 16) & 0xFF);
      b_a += b;
      g_a += g;
      r_a += r;
      *dst_y3 = rgb_to_y (r, g, b);
    }
  else
    {
      *dst_y3 = 0;
    }

  /* Average over the whole 2x2 area; each sum is at most 4 * 255 */
  b_a >>= 2;
  g_a >>= 2;
  r_a >>= 2;

  *dst_u = rgb_to_u (r_a, g_a, b_a);
  *dst_v = rgb_to_v (r_a, g_a, b_a);
}
}