diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c index d222d3913e..3f0a43ee72 100644 --- a/libavfilter/vf_nlmeans.c +++ b/libavfilter/vf_nlmeans.c @@ -157,6 +157,9 @@ static void compute_safe_ssd_integral_image_c(uint32_t *dst, int dst_linesize_32 { int x, y; + /* SIMD-friendly assumptions allowed here */ + av_assert2(!(w & 0xf) && w >= 16 && h >= 1); + for (y = 0; y < h; y++) { uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1]; @@ -257,9 +260,16 @@ static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32, // to compare the 2 sources pixels const int startx_safe = FFMAX(s1x, s2x); const int starty_safe = FFMAX(s1y, s2y); - const int endx_safe = FFMIN(s1x + w, s2x + w); + const int u_endx_safe = FFMIN(s1x + w, s2x + w); // unaligned const int endy_safe = FFMIN(s1y + h, s2y + h); + // deduce the safe area width and height + const int safe_pw = (u_endx_safe - startx_safe) & ~0xf; + const int safe_ph = endy_safe - starty_safe; + + // adjusted end x position of the safe area after width of the safe area gets aligned + const int endx_safe = startx_safe + safe_pw; + // top part where only one of s1 and s2 is still readable, or none at all compute_unsafe_ssd_integral_image(ii, ii_linesize_32, 0, 0, @@ -273,24 +283,25 @@ static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32, 0, starty_safe, src, linesize, offx, offy, e, w, h, - startx_safe, endy_safe - starty_safe); + startx_safe, safe_ph); // main and safe part of the integral av_assert1(startx_safe - s1x >= 0); av_assert1(startx_safe - s1x < w); av_assert1(starty_safe - s1y >= 0); av_assert1(starty_safe - s1y < h); av_assert1(startx_safe - s2x >= 0); av_assert1(startx_safe - s2x < w); av_assert1(starty_safe - s2y >= 0); av_assert1(starty_safe - s2y < h); - compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32, - src + (starty_safe - s1y) * linesize + (startx_safe - s1x), linesize, - src + (starty_safe - s2y) * linesize + (startx_safe - s2x), linesize, - endx_safe - startx_safe, endy_safe - starty_safe); + if (safe_pw && safe_ph) + compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32, + src + (starty_safe - s1y) * linesize + (startx_safe - s1x), linesize, + src + (starty_safe - s2y) * linesize + (startx_safe - s2x), linesize, + safe_pw, safe_ph); // right part of the integral compute_unsafe_ssd_integral_image(ii, ii_linesize_32, endx_safe, starty_safe, src, linesize, offx, offy, e, w, h, - ii_w - endx_safe, endy_safe - starty_safe); + ii_w - endx_safe, safe_ph); // bottom part where only one of s1 and s2 is still readable, or none at all compute_unsafe_ssd_integral_image(ii, ii_linesize_32,