mirror of https://git.ffmpeg.org/ffmpeg.git
whitespace cosmetics
Originally committed as revision 27188 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
This commit is contained in:
parent
7a4d5e174c
commit
4bdc44c7fe
|
@ -24,74 +24,73 @@
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
|
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
|
||||||
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts
|
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts
|
||||||
|
|
||||||
|
|
||||||
The following calculation is used for the conversion:
|
The following calculation is used for the conversion:
|
||||||
|
|
||||||
r = clipz((y-oy)*cy + crv*(v-128))
|
r = clipz((y-oy)*cy + crv*(v-128))
|
||||||
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
|
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
|
||||||
b = clipz((y-oy)*cy + cbu*(u-128))
|
b = clipz((y-oy)*cy + cbu*(u-128))
|
||||||
|
|
||||||
y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
|
y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
|
||||||
|
|
||||||
|
|
||||||
New factorization to eliminate the truncation error which was
|
New factorization to eliminate the truncation error which was
|
||||||
occurring due to the byteop3p.
|
occurring due to the byteop3p.
|
||||||
|
|
||||||
|
|
||||||
1) use the byteop16m to subtract quad bytes we use this in U8 this
|
1) use the byteop16m to subtract quad bytes we use this in U8 this
|
||||||
then so the offsets need to be renormalized to 8bits.
|
then so the offsets need to be renormalized to 8bits.
|
||||||
|
|
||||||
2) scale operands up by a factor of 4 not 8 because Blackfin
|
2) scale operands up by a factor of 4 not 8 because Blackfin
|
||||||
multiplies include a shift.
|
multiplies include a shift.
|
||||||
|
|
||||||
3) compute into the accumulators cy*yx0, cy*yx1
|
3) compute into the accumulators cy*yx0, cy*yx1
|
||||||
|
|
||||||
4) compute each of the linear equations
|
4) compute each of the linear equations
|
||||||
r = clipz((y-oy)*cy + crv*(v-128))
|
r = clipz((y - oy) * cy + crv * (v - 128))
|
||||||
|
|
||||||
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
|
g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
|
||||||
|
|
||||||
b = clipz((y-oy)*cy + cbu*(u-128))
|
b = clipz((y - oy) * cy + cbu * (u - 128))
|
||||||
|
|
||||||
reuse of the accumulators requires that we actually multiply
|
reuse of the accumulators requires that we actually multiply
|
||||||
twice once with addition and the second time with a subtraction.
|
twice once with addition and the second time with a subtraction.
|
||||||
|
|
||||||
because of this we need to compute the equations in the order R B
|
because of this we need to compute the equations in the order R B
|
||||||
then G saving the writes for B in the case of 24/32 bit color
|
then G saving the writes for B in the case of 24/32 bit color
|
||||||
formats.
|
formats.
|
||||||
|
|
||||||
api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
|
api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
|
||||||
int dW, uint32_t *coeffs);
|
int dW, uint32_t *coeffs);
|
||||||
|
|
||||||
A B
|
A B
|
||||||
--- ---
|
--- ---
|
||||||
i2 = cb i3 = cr
|
i2 = cb i3 = cr
|
||||||
i1 = coeff i0 = y
|
i1 = coeff i0 = y
|
||||||
|
|
||||||
Where coeffs have the following layout in memory.
|
Where coeffs have the following layout in memory.
|
||||||
|
|
||||||
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv;
|
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv;
|
||||||
|
|
||||||
coeffs is a pointer to oy.
|
coeffs is a pointer to oy.
|
||||||
|
|
||||||
the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
|
the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
|
||||||
replication is used to simplify the internal algorithms for the dual mac architecture
|
replication is used to simplify the internal algorithms for the dual mac architecture
|
||||||
of BlackFin.
|
of BlackFin.
|
||||||
|
|
||||||
All routines are exported with _ff_bfin_ as a symbol prefix
|
All routines are exported with _ff_bfin_ as a symbol prefix
|
||||||
|
|
||||||
rough performance gain compared against -O3:
|
rough performance gain compared against -O3:
|
||||||
|
|
||||||
2779809/1484290 187.28%
|
2779809/1484290 187.28%
|
||||||
|
|
||||||
which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
|
|
||||||
c/pel for the optimized implementations. Not sure why there is such a
|
|
||||||
huge variation on the reference codes on Blackfin I guess it must have
|
|
||||||
to do with the memory system.
|
|
||||||
|
|
||||||
|
which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
|
||||||
|
c/pel for the optimized implementations. Not sure why there is such a
|
||||||
|
huge variation on the reference codes on Blackfin I guess it must have
|
||||||
|
to do with the memory system.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define mL3 .text
|
#define mL3 .text
|
||||||
|
|
|
@ -21,63 +21,63 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
convert I420 YV12 to RGB in various formats,
|
convert I420 YV12 to RGB in various formats,
|
||||||
it rejects images that are not in 420 formats
|
it rejects images that are not in 420 formats
|
||||||
it rejects images that don't have widths of multiples of 16
|
it rejects images that don't have widths of multiples of 16
|
||||||
it rejects images that don't have heights of multiples of 2
|
it rejects images that don't have heights of multiples of 2
|
||||||
reject defers to C simulation codes.
|
reject defers to C simulation codes.
|
||||||
|
|
||||||
lots of optimizations to be done here
|
lots of optimizations to be done here
|
||||||
|
|
||||||
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
|
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
|
||||||
so we currently use max min to clip
|
so we currently use max min to clip
|
||||||
|
|
||||||
2. the inefficient use of chroma loading needs a bit of brushing up
|
2. the inefficient use of chroma loading needs a bit of brushing up
|
||||||
|
|
||||||
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
|
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
|
||||||
|
|
||||||
|
|
||||||
MODIFIED to calculate coeffs from currently selected color space.
|
MODIFIED to calculate coeffs from currently selected color space.
|
||||||
MODIFIED core to be a macro which you spec the output format.
|
MODIFIED core to be a macro which you spec the output format.
|
||||||
ADDED UYVY conversion which is never called due to some thing in SWSCALE.
|
ADDED UYVY conversion which is never called due to some thing in SWSCALE.
|
||||||
CORRECTED algorithm selection to be strict on input formats.
|
CORRECTED algorithm selection to be strict on input formats.
|
||||||
ADDED runtime detection of altivec.
|
ADDED runtime detection of altivec.
|
||||||
|
|
||||||
ADDED altivec_yuv2packedX vertical scl + RGB converter
|
ADDED altivec_yuv2packedX vertical scl + RGB converter
|
||||||
|
|
||||||
March 27,2004
|
March 27,2004
|
||||||
PERFORMANCE ANALYSIS
|
PERFORMANCE ANALYSIS
|
||||||
|
|
||||||
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
|
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
|
||||||
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
|
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
|
||||||
|
|
||||||
720*480*30 ~10MPS
|
720*480*30 ~10MPS
|
||||||
|
|
||||||
so we have roughly 10clocks per pixel this is too high something has to be wrong.
|
so we have roughly 10clocks per pixel this is too high something has to be wrong.
|
||||||
|
|
||||||
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
|
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
|
||||||
|
|
||||||
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
|
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
|
||||||
guaranteed to have the input video frame it was just decompressed so
|
guaranteed to have the input video frame it was just decompressed so
|
||||||
it probably resides in L1 caches. However we are creating the
|
it probably resides in L1 caches. However we are creating the
|
||||||
output video stream this needs to use the DSTST instruction to
|
output video stream this needs to use the DSTST instruction to
|
||||||
optimize for the cache. We couple this with the fact that we are
|
optimize for the cache. We couple this with the fact that we are
|
||||||
not going to be visiting the input buffer again so we mark it Least
|
not going to be visiting the input buffer again so we mark it Least
|
||||||
Recently Used. This shaves 25% of the processor cycles off.
|
Recently Used. This shaves 25% of the processor cycles off.
|
||||||
|
|
||||||
Now MEMCPY is the largest mips consumer in the system, probably due
|
Now MEMCPY is the largest mips consumer in the system, probably due
|
||||||
to the inefficient X11 stuff.
|
to the inefficient X11 stuff.
|
||||||
|
|
||||||
GL libraries seem to be very slow on this machine 1.33Ghz PB running
|
GL libraries seem to be very slow on this machine 1.33Ghz PB running
|
||||||
Jaguar, this is not the case for my 1Ghz PB. I thought it might be
|
Jaguar, this is not the case for my 1Ghz PB. I thought it might be
|
||||||
a versioning issue, however I have libGL.1.2.dylib for both
|
a versioning issue, however I have libGL.1.2.dylib for both
|
||||||
machines. ((We need to figure this out now))
|
machines. ((We need to figure this out now))
|
||||||
|
|
||||||
GL2 libraries work now with patch for RGB32
|
GL2 libraries work now with patch for RGB32
|
||||||
|
|
||||||
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
|
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
|
||||||
|
|
||||||
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
|
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
Loading…
Reference in New Issue