Introduction
This article presents an optimized NV12 image scaling program.
There are many different image scaling algorithms. The more complex the algorithm, the better the image quality it
preserves, but the lower its performance. I decided to use the two simplest ones, nearest-neighbor interpolation
and bilinear interpolation, to resize NV12 images.
Background
NV12 is one of the YUV family of formats. Before you read this tip, you need a basic understanding of the format and of what interpolation-based scaling algorithms are.
If you have tried scaling RGBA images before, it will be easier for you to understand how my program works.
NV12 format
An NV12 image is laid out in memory as: YYYYYYYY... UVUV... NV12 is a semi-planar format.
It is also called YUV420sp. There are three components, stored in two planes:
- The length of the Y plane in memory is 'width * height'.
- The U and V components each occupy 'width * height / 4' bytes.
- U and V are interleaved in a single plane that follows the Y plane.
- The Y plane alone is the greyscale image if the U and V data are discarded.
So the total memory length of the image is 'width * height * 3 / 2'. Here is
a clearer sample at 8*4 resolution:
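Logical view:
Y00 Y01 Y02 Y03 Y04 Y05 Y06 Y07
Y10 Y11 Y12 Y13 Y14 Y15 Y16 Y17
Y20 Y21 Y22 Y23 Y24 Y25 Y26 Y27
Y30 Y31 Y32 Y33 Y34 Y35 Y36 Y37
U00 V00 U01 V01 U02 V02 U03 V03
U10 V10 U11 V11 U12 V12 U13 V13
Here width = 8 and height = 4, so:
- ylen = 8*4, ulen = 8*4/4, vlen = 8*4/4.
- total_length = ylen + ulen + vlen = ylen * 3 / 2
Every 2x2 block of four Y values shares
the same U value and V value.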
For example:
- Y00 Y01 Y10 Y11 share U00 and V00
- Y20 Y21 Y30 Y31 share U10 and V10
Algorithms
- Nearest interpolation
srcX = dstX * (srcWidth / dstWidth), srcY = dstY * (srcHeight / dstHeight)
The ratio usually has a fractional part. This algorithm simply rounds the result to an integer coordinate and stores the nearest pixel value from
the source image in the destination image array. The result is therefore not great and usually
shows a strong mosaic (blocky) effect.
- Bilinear interpolation
Bilinear interpolation uses both the fractional part and
the integer part of the source coordinate to calculate the final pixel value from the four surrounding pixels. The fractional part is used as the weight. It smooths sharp edges and removes the mosaic effect.
input: src_nv12_array, src_width, src_height,dest_width,dest_height
output: dst_nv12_array
Code
This is an optimized C implementation. The optimizations are:
- Use the restrict and register keywords
- Use fixed-point arithmetic with shift operations instead of floating-point division
- Move loop-invariant code out of the inner loop
- Prefer bitwise operations to arithmetic operations inside loops
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/stat.h>
typedef unsigned char uint8_t;
/*
 * Scale an NV12 image with nearest-neighbour sampling.
 *
 * src: source image; srcWidth * srcHeight luma bytes followed by the
 *      interleaved UV plane (srcWidth bytes per chroma row).
 * dst: destination buffer with the same layout for dstWidth x dstHeight;
 *      it must not overlap src.
 *
 * Every destination pixel is written, including the right and bottom edges
 * when the destination size is not a multiple of 8. NV12 dimensions are
 * expected to be even; for odd sizes only complete chroma rows and UV pairs
 * are touched, so nothing beyond dstWidth * dstHeight * 3 / 2 bytes is
 * written. NULL buffers or non-positive dimensions make the call a no-op.
 */
void nv12_nearest_scale(uint8_t* __restrict src, uint8_t* __restrict dst,
                        int srcWidth, int srcHeight, int dstWidth, int dstHeight)
{
    if (!src || !dst || srcWidth <= 0 || srcHeight <= 0 ||
        dstWidth <= 0 || dstHeight <= 0) {
        return;
    }

    const unsigned long sw = (unsigned long)srcWidth;
    const unsigned long sh = (unsigned long)srcHeight;
    const unsigned long dw = (unsigned long)dstWidth;
    const unsigned long dh = (unsigned long)dstHeight;

    /* 16.16 fixed-point source step per destination pixel. The +1
     * compensates for the truncating division; the resulting index is
     * clamped below because the extra 1 can overshoot on large upscales. */
    const unsigned long x_step = (sw << 16) / dw + 1;
    const unsigned long y_step = (sh << 16) / dh + 1;

    const uint8_t *src_uv = src + sh * sw;
    uint8_t *dst_uv = dst + dh * dw;

    /* Number of complete chroma rows, and the byte offset of the last
     * complete UV pair inside a source chroma row. */
    const unsigned long src_uv_rows = sh / 2;
    const unsigned long dst_uv_rows = dh / 2;
    const int src_has_chroma = (sw >= 2 && src_uv_rows > 0);
    const unsigned long last_src_pair = src_has_chroma ? (sw & ~1UL) - 2 : 0;

    for (unsigned long y = 0; y < dh; ++y) {
        unsigned long srcy = (y * y_step) >> 16;
        if (srcy >= sh) {
            srcy = sh - 1;
        }

        const uint8_t *src_row = src + srcy * sw;
        uint8_t *dst_row = dst + y * dw;

        /* One chroma row serves two luma rows, so chroma is produced on
         * even destination rows only. */
        const int chroma_row =
            src_has_chroma && (y & 1) == 0 && (y / 2) < dst_uv_rows;
        const uint8_t *src_uv_row = src_uv;
        uint8_t *dst_uv_row = dst_uv;
        if (chroma_row) {
            unsigned long src_uv_y = srcy / 2;
            if (src_uv_y >= src_uv_rows) {
                src_uv_y = src_uv_rows - 1;
            }
            src_uv_row = src_uv + src_uv_y * sw;
            dst_uv_row = dst_uv + (y / 2) * dw;
        }

        for (unsigned long x = 0; x < dw; ++x) {
            unsigned long srcx = (x * x_step) >> 16;
            if (srcx >= sw) {
                srcx = sw - 1;
            }
            dst_row[x] = src_row[srcx];

            /* Copy one interleaved U,V pair per 2x2 luma block. */
            if (chroma_row && (x & 1) == 0 && x + 1 < dw) {
                unsigned long pair = srcx & ~1UL;
                if (pair > last_src_pair) {
                    pair = last_src_pair;
                }
                dst_uv_row[x] = src_uv_row[pair];
                dst_uv_row[x + 1] = src_uv_row[pair + 1];
            }
        }
    }
}
/*
 * Bilinearly resample one 8-bit plane.
 *
 * Each sample consists of `channels` interleaved bytes (1 for luma, 2 for
 * the NV12 UV plane). src_w/src_h and dst_w/dst_h are counted in samples,
 * the strides in bytes. Weights are 8-bit fixed-point fractions; neighbours
 * beyond the right/bottom edge are clamped to the edge sample.
 */
static void nv12_bilinear_plane(const uint8_t *src, int src_w, int src_h,
                                int src_stride, uint8_t *dst, int dst_w,
                                int dst_h, int dst_stride, int channels)
{
    const int x_ratio = (src_w << 8) / dst_w;
    const int y_ratio = (src_h << 8) / dst_h;

    for (int j = 0; j < dst_h; ++j) {
        /* j * y_ratio < src_h << 8, so y0 is always a valid row. */
        const int pos_y = j * y_ratio;
        const int y0 = pos_y >> 8;
        const int fy = pos_y & 0xFF;
        const int y1 = (y0 + 1 < src_h) ? y0 + 1 : y0;

        const uint8_t *row0 = src + (size_t)y0 * (size_t)src_stride;
        const uint8_t *row1 = src + (size_t)y1 * (size_t)src_stride;
        uint8_t *out = dst + (size_t)j * (size_t)dst_stride;

        for (int i = 0; i < dst_w; ++i) {
            const int pos_x = i * x_ratio;
            const int x0 = pos_x >> 8;
            const int fx = pos_x & 0xFF;
            const int x1 = (x0 + 1 < src_w) ? x0 + 1 : x0;

            /* The four weights always sum to 0x10000, so the shifted
             * result stays within 0..255 and needs no clamping. */
            const int w00 = (0x100 - fx) * (0x100 - fy);
            const int w10 = fx * (0x100 - fy);
            const int w01 = (0x100 - fx) * fy;
            const int w11 = fx * fy;

            for (int c = 0; c < channels; ++c) {
                const int p00 = row0[x0 * channels + c];
                const int p10 = row0[x1 * channels + c];
                const int p01 = row1[x0 * channels + c];
                const int p11 = row1[x1 * channels + c];
                const int acc = w00 * p00 + w10 * p10 + w01 * p01 + w11 * p11;
                out[i * channels + c] = (uint8_t)(acc >> 16);
            }
        }
    }
}

/*
 * Scale an NV12 image with bilinear interpolation.
 *
 * src: source image; srcWidth * srcHeight luma bytes followed by the
 *      interleaved UV plane (srcWidth bytes per chroma row).
 * dst: destination buffer with the same layout for dstWidth x dstHeight.
 *
 * The luma plane is resampled at full resolution. The UV plane is resampled
 * as a (width / 2) x (height / 2) image of interleaved U,V samples. NV12
 * dimensions are expected to be even; chroma is skipped when either image
 * has fewer than two rows or columns. NULL buffers or non-positive
 * dimensions make the call a no-op.
 */
void nv12_bilinear_scale (uint8_t* src, uint8_t* dst,
                          int srcWidth, int srcHeight, int dstWidth,int dstHeight)
{
    if (!src || !dst || srcWidth <= 0 || srcHeight <= 0 ||
        dstWidth <= 0 || dstHeight <= 0) {
        return;
    }

    nv12_bilinear_plane(src, srcWidth, srcHeight, srcWidth,
                        dst, dstWidth, dstHeight, dstWidth, 1);

    const int src_cw = srcWidth / 2;
    const int src_ch = srcHeight / 2;
    const int dst_cw = dstWidth / 2;
    const int dst_ch = dstHeight / 2;
    if (src_cw > 0 && src_ch > 0 && dst_cw > 0 && dst_ch > 0) {
        const uint8_t *src_uv = src + (size_t)srcWidth * (size_t)srcHeight;
        uint8_t *dst_uv = dst + (size_t)dstWidth * (size_t)dstHeight;
        nv12_bilinear_plane(src_uv, src_cw, src_ch, srcWidth,
                            dst_uv, dst_cw, dst_ch, dstWidth, 2);
    }
}
/*
 * Validate the arguments and resize an NV12 image with the
 * nearest-neighbour scaler.
 *
 * src/dst: source and destination NV12 buffers (must be non-NULL).
 * sw/sh:   source width and height in pixels (must be positive).
 * dw/dh:   destination width and height in pixels (must be positive).
 *
 * Returns 0 on success, or -1 after printing "params error" when any
 * argument is invalid. Negative dimensions are rejected as well as zero.
 */
int ImageResize(uint8_t * src, uint8_t* dst, int sw,
                int sh,int dw,int dh)
{
    if (src == NULL || dst == NULL ||
        sw <= 0 || sh <= 0 || dw <= 0 || dh <= 0) {
        printf("params error\n");
        return -1;
    }

    nv12_nearest_scale(src, dst, sw, sh, dw, dh);
    return 0;
}
/*
 * Command-line driver:
 *   <Input NV12file> <Output NV12file> <sw> <sh> <dw> <dh>
 *
 * Reads one sw x sh NV12 frame from the input file, scales it to dw x dh,
 * times BENCH_ITERATIONS further scaling passes on the same buffers, prints
 * the measured rate and writes the scaled frame to the output file.
 * Returns 0 on success and -1 on any error.
 */
int main(int argc,char**argv)
{
    enum { BENCH_ITERATIONS = 1000 };

    int rc = -1;
    FILE *inputfp = NULL;
    FILE *outputfp = NULL;
    uint8_t *pInBuffer = NULL;
    uint8_t *pOutBuffer = NULL;
    int sw, sh, dw, dh;
    int i;
    size_t inBytes, outBytes;
    clock_t start, finish;
    float duration, fps;

    if (argc != 7) {
        printf("Input Error!\n");
        printf("Usage : <Input NV12file> <Output NV12file> <sw> <sh> <dw> <dh>\n");
        return -1;
    }

    /* Validate the sizes before touching any file so that bad arguments
     * never truncate an existing output file. */
    sw = atoi(argv[3]);
    sh = atoi(argv[4]);
    dw = atoi(argv[5]);
    dh = atoi(argv[6]);
    if (sw <= 0 || sh <= 0 || dw <= 0 || dh <= 0) {
        fprintf(stderr, "parameter error [sw= %d,sh= %d,dw= %d,dh= %d]\n",sw,sh,dw,dh);
        return -1;
    }

    /* NV12 frame size: full-resolution luma plus half as much chroma. */
    inBytes = (size_t)sw * (size_t)sh * 3 / 2;
    outBytes = (size_t)dw * (size_t)dh * 3 / 2;

    inputfp = fopen(argv[1], "rb");
    if (!inputfp) {
        fprintf(stderr, "fopen failed for input file[%s]\n",argv[1]);
        goto cleanup;
    }
    outputfp = fopen(argv[2], "wb");
    if (!outputfp) {
        fprintf(stderr, "fopen failed for output file[%s]\n",argv[2]);
        goto cleanup;
    }

    pInBuffer = malloc(inBytes);
    pOutBuffer = malloc(outBytes);
    if (!pInBuffer || !pOutBuffer) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    if (fread(pInBuffer, 1, inBytes, inputfp) != inBytes) {
        fprintf(stderr, "short read from input file[%s]\n", argv[1]);
        goto cleanup;
    }

    if (ImageResize(pInBuffer, pOutBuffer, sw, sh, dw, dh) != 0) {
        goto cleanup;
    }

    /* Benchmark with the real frame geometry: the buffers are sized for
     * exactly these dimensions, so no other sizes may be passed here. */
    start = clock();
    for (i = 0; i < BENCH_ITERATIONS; ++i) {
        ImageResize(pInBuffer, pOutBuffer, sw, sh, dw, dh);
    }
    finish = clock();

    duration = (float)(finish - start) / CLOCKS_PER_SEC;
    /* clock() can report zero elapsed ticks for very small images. */
    fps = (duration > 0.0f) ? BENCH_ITERATIONS / duration : 0.0f;
    printf("nv12Scaling:%d*%d-->%d*%d,time cost:%6.2ffps\n",sw,sh,dw,dh,fps);

    if (fwrite(pOutBuffer, 1, outBytes, outputfp) != outBytes) {
        fprintf(stderr, "write failed for output file[%s]\n", argv[2]);
        goto cleanup;
    }
    rc = 0;

cleanup:
    free(pInBuffer);
    free(pOutBuffer);
    if (inputfp) {
        fclose(inputfp);
    }
    /* fclose flushes buffered data, so a failure here means a bad output. */
    if (outputfp && fclose(outputfp) != 0) {
        fprintf(stderr, "close failed for output file[%s]\n", argv[2]);
        rc = -1;
    }
    return rc;
}
Furthermore, I would like to use ARM assembly language to optimize my program, for example
NEON vectorized assembly for an Android project, or simply use the intrinsics of different
CPUs instead. However, translating C code to assembly sometimes requires a huge change (including redesigning the algorithm). This depends on the features of the CPU.