[FFmpeg-devel] [PATCH 27/34] aarch64: vp9lpf: Use dup+rev16+uzp1 instead of dup+lsr+dup+trn1

Martin Storsjö martin at martin.st
Wed Mar 8 12:01:07 EET 2017


This is one cycle faster in total, and uses three fewer instructions
(9 instead of 12).

Before:
vp9_loop_filter_mix2_v_44_16_neon: 123.2
After:
vp9_loop_filter_mix2_v_44_16_neon: 122.2

This is cherry-picked from libav commit
3bf9c48320f25f3d5557485b0202f22ae60748b0.
---
 libavcodec/aarch64/vp9lpf_neon.S | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index a9eea7f..0878763 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -162,18 +162,15 @@
         dup             v2\sz,  w3        // I
         dup             v3\sz,  w4        // H
 .else
-        dup             v0.8b,  w2        // E
-        dup             v2.8b,  w3        // I
-        dup             v3.8b,  w4        // H
-        lsr             w5,     w2,  #8
-        lsr             w6,     w3,  #8
-        lsr             w7,     w4,  #8
-        dup             v1.8b,  w5        // E
-        dup             v4.8b,  w6        // I
-        dup             v5.8b,  w7        // H
-        trn1            v0.2d,  v0.2d,  v1.2d
-        trn1            v2.2d,  v2.2d,  v4.2d
-        trn1            v3.2d,  v3.2d,  v5.2d
+        dup             v0.8h,  w2        // E
+        dup             v2.8h,  w3        // I
+        dup             v3.8h,  w4        // H
+        rev16           v1.16b, v0.16b    // E
+        rev16           v4.16b, v2.16b    // I
+        rev16           v5.16b, v3.16b    // H
+        uzp1            v0.16b, v0.16b, v1.16b
+        uzp1            v2.16b, v2.16b, v4.16b
+        uzp1            v3.16b, v3.16b, v5.16b
 .endif
 
         uabd            v4\sz,  v20\sz, v21\sz        // abs(p3 - p2)
-- 
2.7.4



More information about the ffmpeg-devel mailing list