[tbb-commits] [tor-browser] 160/311: Bug 1758482 - Update dav1d to new version 28a9c46e1c36540d3276299f2e284ece1d2386be from 2022-02-04T23:02:17.000-03:00. r=media-playback-reviewers, padenot a=dmeehan
gitolite role
git at cupani.torproject.org
Tue Apr 26 15:29:20 UTC 2022
This is an automated email from the git hooks/post-receive script.
pierov pushed a commit to branch geckoview-99.0.1-11.0-1
in repository tor-browser.
commit 0c755a24b73056a9381d7cf3cd2b38281dbe4c43
Author: Jon Bauman <jbauman at mozilla.com>
AuthorDate: Mon Mar 14 17:17:59 2022 +0000
Bug 1758482 - Update dav1d to new version 28a9c46e1c36540d3276299f2e284ece1d2386be from 2022-02-04T23:02:17.000-03:00. r=media-playback-reviewers,padenot a=dmeehan
Normally updatebot would create a revision, allowing me to review it single-handedly, but https://phabricator.services.mozilla.com/D140519 was a failure because of changes updatebot didn't know how to handle, so I just need **someone** to approve the update to get this landed. This has a bit of priority since it's blocking https://bugzilla.mozilla.org/show_bug.cgi?id=1757971, which we want to get uplifted for Fx99.
Differential Revision: https://phabricator.services.mozilla.com/D140921
---
media/libdav1d/README_MOZILLA | 3 +-
media/libdav1d/asm/moz.build | 30 +-
media/libdav1d/moz.build | 6 +-
media/libdav1d/moz.yaml | 4 +-
media/libdav1d/vcs_version.h | 2 +-
media/libdav1d/version.h | 4 +-
third_party/dav1d/NEWS | 29 +
third_party/dav1d/README.md | 12 +-
third_party/dav1d/THANKS.md | 28 +-
third_party/dav1d/include/dav1d/common.h | 5 +
third_party/dav1d/include/dav1d/dav1d.h | 13 +
third_party/dav1d/meson.build | 8 +-
third_party/dav1d/src/arm/32/filmgrain.S | 2039 +++++++++++++++++++
third_party/dav1d/src/arm/32/filmgrain16.S | 2137 ++++++++++++++++++++
third_party/dav1d/src/arm/32/itx.S | 358 ++--
third_party/dav1d/src/arm/64/filmgrain.S | 2010 ++++++++++++++++++
third_party/dav1d/src/arm/64/filmgrain16.S | 1997 ++++++++++++++++++
third_party/dav1d/src/arm/64/itx.S | 282 +--
third_party/dav1d/src/arm/64/looprestoration.S | 4 +
third_party/dav1d/src/arm/64/looprestoration16.S | 4 +
third_party/dav1d/src/arm/asm.S | 115 +-
...ilm_grain_init_tmpl.c => filmgrain_init_tmpl.c} | 2 +-
third_party/dav1d/src/data.c | 13 +-
third_party/dav1d/src/data.h | 1 +
third_party/dav1d/src/decode.c | 20 +-
third_party/dav1d/src/ext/x86/x86inc.asm | 35 +-
third_party/dav1d/src/fg_apply.h | 25 +-
third_party/dav1d/src/fg_apply_tmpl.c | 140 +-
third_party/dav1d/src/filmgrain.h | 86 +
third_party/dav1d/src/filmgrain_tmpl.c | 433 ++++
third_party/dav1d/src/internal.h | 53 +-
third_party/dav1d/src/lib.c | 92 +-
third_party/dav1d/src/meson.build | 26 +-
third_party/dav1d/src/obu.c | 44 +-
third_party/dav1d/src/picture.c | 1 +
third_party/dav1d/src/tables.c | 2 +-
third_party/dav1d/src/thread_task.c | 137 +-
third_party/dav1d/src/thread_task.h | 2 +
third_party/dav1d/src/x86/cdef16_avx2.asm | 8 -
third_party/dav1d/src/x86/cdef16_sse.asm | 8 -
...{film_grain16_avx2.asm => filmgrain16_avx2.asm} | 31 +-
.../{film_grain16_sse.asm => filmgrain16_sse.asm} | 31 +-
.../{film_grain_avx2.asm => filmgrain_avx2.asm} | 48 +-
third_party/dav1d/src/x86/filmgrain_avx512.asm | 1079 ++++++++++
third_party/dav1d/src/x86/filmgrain_common.asm | 46 +
...ilm_grain_init_tmpl.c => filmgrain_init_tmpl.c} | 53 +-
.../x86/{film_grain_sse.asm => filmgrain_sse.asm} | 31 +-
third_party/dav1d/src/x86/ipred16_avx2.asm | 10 +-
third_party/dav1d/src/x86/ipred16_avx512.asm | 833 ++++++++
third_party/dav1d/src/x86/ipred16_sse.asm | 8 -
third_party/dav1d/src/x86/ipred_init_tmpl.c | 2 +-
third_party/dav1d/src/x86/itx16_avx2.asm | 8 -
third_party/dav1d/src/x86/itx16_sse.asm | 8 -
third_party/dav1d/src/x86/itx_avx2.asm | 9 -
third_party/dav1d/src/x86/itx_avx512.asm | 9 -
third_party/dav1d/src/x86/itx_init_tmpl.c | 22 +-
third_party/dav1d/src/x86/itx_sse.asm | 340 ++--
third_party/dav1d/src/x86/loopfilter16_avx2.asm | 8 -
third_party/dav1d/src/x86/loopfilter16_sse.asm | 8 -
.../dav1d/src/x86/looprestoration16_avx2.asm | 8 -
third_party/dav1d/src/x86/mc16_avx2.asm | 8 -
third_party/dav1d/src/x86/mc16_avx512.asm | 8 -
third_party/dav1d/src/x86/mc16_sse.asm | 12 +-
third_party/dav1d/src/x86/mc_avx512.asm | 8 -
third_party/dav1d/tests/checkasm/filmgrain.c | 10 +-
65 files changed, 11853 insertions(+), 1003 deletions(-)
diff --git a/media/libdav1d/README_MOZILLA b/media/libdav1d/README_MOZILLA
index eba613b2969da..bc5c2708696e1 100644
--- a/media/libdav1d/README_MOZILLA
+++ b/media/libdav1d/README_MOZILLA
@@ -39,7 +39,8 @@ The rough steps are:
moz.build which has a condition on CONFIG['CPU_ARCH'].
- Files ending in _tmpl.c may be automatically added to SOURCES, resulting in build failures.
To fix this, the file must be moved to the appropriate bitdepth_basenames list where
- generate_source.py will generate the templates into buildable source files.
+ generate_source.py will generate the templates into buildable source files. In general,
+ all *_tmpl.c files require BITDEPTH to be defined.
- Clone the tag from the dav1d repo and build a stand-alone libdav1d following the steps here:
https://code.videolan.org/videolan/dav1d#compile
- Copy vcs_version.h from the local build/include/vcs_version.h
diff --git a/media/libdav1d/asm/moz.build b/media/libdav1d/asm/moz.build
index 02647876d4904..7530c087ce30a 100644
--- a/media/libdav1d/asm/moz.build
+++ b/media/libdav1d/asm/moz.build
@@ -87,8 +87,14 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/cdef16_avx2.asm', # moved from autovendored
'../../../third_party/dav1d/src/x86/cdef_avx2.asm',
'../../../third_party/dav1d/src/x86/cdef_avx512.asm',
- '../../../third_party/dav1d/src/x86/film_grain16_avx2.asm',
- '../../../third_party/dav1d/src/x86/film_grain_avx2.asm',
+ '../../../third_party/dav1d/src/x86/filmgrain16_avx2.asm',
+ '../../../third_party/dav1d/src/x86/filmgrain_avx2.asm',
+ '../../../third_party/dav1d/src/x86/filmgrain_avx512.asm',
+ # '../../../third_party/dav1d/src/x86/filmgrain_common.asm',
+ # ^ filmgrain_common.asm must *not* be in SOURCES because it's only
+ # used as an %include for other .asm files. Trying to assemble it
+ # will result in a nasm error: parser: instruction expected
+ '../../../third_party/dav1d/src/x86/ipred16_avx512.asm',
'../../../third_party/dav1d/src/x86/ipred_avx2.asm',
'../../../third_party/dav1d/src/x86/ipred_avx512.asm',
'../../../third_party/dav1d/src/x86/itx16_avx2.asm',
@@ -111,8 +117,8 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/cdef16_sse.asm',
'../../../third_party/dav1d/src/x86/cdef_sse.asm',
'../../../third_party/dav1d/src/x86/cpuid.asm',
- '../../../third_party/dav1d/src/x86/film_grain16_sse.asm',
- '../../../third_party/dav1d/src/x86/film_grain_sse.asm',
+ '../../../third_party/dav1d/src/x86/filmgrain16_sse.asm',
+ '../../../third_party/dav1d/src/x86/filmgrain_sse.asm',
'../../../third_party/dav1d/src/x86/ipred16_avx2.asm',
'../../../third_party/dav1d/src/x86/ipred16_sse.asm',
'../../../third_party/dav1d/src/x86/ipred_sse.asm',
@@ -129,10 +135,12 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
]
# BITDEPTH
+ # All the files here should be *_tmpl.c, and they should not appear in SOURCES,
+ # since they require BITDEPTH to be defined
relative_path = '../../../third_party/dav1d/src/x86/'
bitdepth_basenames = [
'cdef_init_tmpl.c',
- 'film_grain_init_tmpl.c',
+ 'filmgrain_init_tmpl.c',
'ipred_init_tmpl.c',
'itx_init_tmpl.c',
'loopfilter_init_tmpl.c',
@@ -167,10 +175,12 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
]
# BITDEPTH c file
+ # All the files here should be *_tmpl.c, and they should not appear in SOURCES,
+ # since they require BITDEPTH to be defined
relative_path = '../../../third_party/dav1d/src/arm/'
bitdepth_basenames = [
'cdef_init_tmpl.c',
- 'film_grain_init_tmpl.c',
+ 'filmgrain_init_tmpl.c',
'ipred_init_tmpl.c',
'itx_init_tmpl.c',
'loopfilter_init_tmpl.c',
@@ -198,8 +208,8 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
'../../../third_party/dav1d/src/arm/64/cdef.S',
'../../../third_party/dav1d/src/arm/64/cdef16.S',
'../../../third_party/dav1d/src/arm/64/cdef_tmpl.S',
- '../../../third_party/dav1d/src/arm/64/film_grain.S',
- '../../../third_party/dav1d/src/arm/64/film_grain16.S',
+ '../../../third_party/dav1d/src/arm/64/filmgrain.S',
+ '../../../third_party/dav1d/src/arm/64/filmgrain16.S',
'../../../third_party/dav1d/src/arm/64/ipred.S',
'../../../third_party/dav1d/src/arm/64/ipred16.S',
# itx.S is used for both 8 and 16 bpc.
@@ -221,8 +231,8 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
'../../../third_party/dav1d/src/arm/32/cdef.S',
'../../../third_party/dav1d/src/arm/32/cdef16.S',
'../../../third_party/dav1d/src/arm/32/cdef_tmpl.S',
- '../../../third_party/dav1d/src/arm/32/film_grain.S',
- '../../../third_party/dav1d/src/arm/32/film_grain16.S',
+ '../../../third_party/dav1d/src/arm/32/filmgrain.S',
+ '../../../third_party/dav1d/src/arm/32/filmgrain16.S',
'../../../third_party/dav1d/src/arm/32/ipred.S',
'../../../third_party/dav1d/src/arm/32/ipred16.S',
'../../../third_party/dav1d/src/arm/32/itx.S',
diff --git a/media/libdav1d/moz.build b/media/libdav1d/moz.build
index c84cef8802b8e..923dca05ebead 100644
--- a/media/libdav1d/moz.build
+++ b/media/libdav1d/moz.build
@@ -103,7 +103,7 @@ EXPORTS.dav1d.src += [
'../../third_party/dav1d/src/data.h',
'../../third_party/dav1d/src/decode.h',
'../../third_party/dav1d/src/dequant_tables.h',
- '../../third_party/dav1d/src/film_grain.h',
+ '../../third_party/dav1d/src/filmgrain.h',
'../../third_party/dav1d/src/getbits.h',
'../../third_party/dav1d/src/intra_edge.h',
'../../third_party/dav1d/src/lf_mask.h',
@@ -123,12 +123,14 @@ EXPORTS.dav1d.src += [
]
# common BITDEPTH 8, 16
+# All the files here should be *_tmpl.c, and they should not appear in SOURCES,
+# since they require BITDEPTH to be defined
relative_path = '../../third_party/dav1d/src/'
bitdepth_basenames = [
'cdef_apply_tmpl.c',
'cdef_tmpl.c',
'fg_apply_tmpl.c',
- 'film_grain_tmpl.c',
+ 'filmgrain_tmpl.c',
'ipred_prepare_tmpl.c',
'ipred_tmpl.c',
'itx_tmpl.c',
diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml
index 2de4cdc699621..623ce9625d4de 100644
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
- release: commit 1f09a9119fb794ab41b1e527d848c2a210ca43d4 (2022-02-04T23:02:17.000-03:00).
+ release: commit 28a9c46e1c36540d3276299f2e284ece1d2386be (2022-03-10T14:49:23.000+00:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
- revision: 1f09a9119fb794ab41b1e527d848c2a210ca43d4
+ revision: 28a9c46e1c36540d3276299f2e284ece1d2386be
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h
index 980b8d307dcaf..a449a2c0360d7 100644
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
-#define DAV1D_VERSION "1f09a9119fb794ab41b1e527d848c2a210ca43d4"
+#define DAV1D_VERSION "0.9.2-170-g28a9c46"
diff --git a/media/libdav1d/version.h b/media/libdav1d/version.h
index a797de5df28c5..5619d66bb76a1 100644
--- a/media/libdav1d/version.h
+++ b/media/libdav1d/version.h
@@ -27,8 +27,8 @@
#ifndef DAV1D_VERSION_H
#define DAV1D_VERSION_H
-#define DAV1D_API_VERSION_MAJOR 9
-#define DAV1D_API_VERSION_MINOR 2
+#define DAV1D_API_VERSION_MAJOR 6
+#define DAV1D_API_VERSION_MINOR 6
#define DAV1D_API_VERSION_PATCH 0
#endif /* DAV1D_VERSION_H */
diff --git a/third_party/dav1d/NEWS b/third_party/dav1d/NEWS
index 2a4c4d00ca637..6158d780487bd 100644
--- a/third_party/dav1d/NEWS
+++ b/third_party/dav1d/NEWS
@@ -1,3 +1,32 @@
+Changes for 1.0.0 'Peregrine falcon':
+-------------------------------------
+
+1.0.0 is a major release of dav1d, adding important features and bug fixes.
+
+It notably changes, in an important way, the way threading works, by adding
+an automatic thread management.
+
+It also adds support for AVX-512 acceleration, and adds speedups to existing x86
+code (from SSE2 to AVX2).
+
+1.0.0 adds new grain API to ease acceleration on the GPU.
+
+Finally, 1.0.0 fixes numerous small bugs that were reported since the beginning
+of the project to have a proper release.
+
+ .''.
+ .''. . *''* :_\/_: .
+ :_\/_: _\(/_ .:.*_\/_* : /\ : .'.:.'.
+ .''.: /\ : ./)\ ':'* /\ * : '..'. -=:o:=-
+ :_\/_:'.:::. ' *''* * '.\'/.' _\(/_'.':'.'
+ : /\ : ::::: *_\/_* -= o =- /)\ ' *
+ '..' ':::' * /\ * .'/.\'. '
+ * *..* :
+ * :
+ * 1.0.0
+
+
+
Changes for 0.9.2 'Golden Eagle':
---------------------------------
diff --git a/third_party/dav1d/README.md b/third_party/dav1d/README.md
index 34a351e247f21..a8a35e1c6fe2d 100644
--- a/third_party/dav1d/README.md
+++ b/third_party/dav1d/README.md
@@ -2,11 +2,13 @@
# dav1d
-**dav1d** is a new **AV1** cross-platform **d**ecoder, open-source, and focused on speed and correctness.
+**dav1d** is an **AV1** cross-platform **d**ecoder, open-source, and focused on speed and correctness.
+
+It is now battle-tested and production-ready and can be used everywhere.
The canonical repository URL for this repo is https://code.videolan.org/videolan/dav1d
-This project is partially funded by the *Alliance for Open Media*/**AOM**.
+This project was partially funded by the *Alliance for Open Media*/**AOM**.
## Goal and Features
@@ -38,11 +40,11 @@ The plan is the following:
9. Make high bit-depth fast on older mobile, by writing asm for ARMv7 chips,
10. Make high bit-depth fast on desktop, by writing asm for AVX2 chips,
11. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips,
+12. Improve threading.
### On-going
-12. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
-13. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
-14. Improve threading.
+13. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
+14. Accelerate for less common architectures, like PPC, SSE2, RISC-V or AVX-512.
### After
15. Use more GPU decoding, when possible.
diff --git a/third_party/dav1d/THANKS.md b/third_party/dav1d/THANKS.md
index 06d879853c3e6..2791e3dcc61bc 100644
--- a/third_party/dav1d/THANKS.md
+++ b/third_party/dav1d/THANKS.md
@@ -16,16 +16,18 @@ The Alliance for Open Media (AOM) for funding this project.
And all the dav1d Authors (git shortlog -sn), including:
-Martin Storsjö, Janne Grunau, Henrik Gramner, Ronald S. Bultje, James Almer,
-Marvin Scholz, Luc Trudeau, Victorien Le Couviour--Tuffet, Jean-Baptiste Kempf,
-Hugo Beauzée-Luyssen, Matthias Dressel, Konstantin Pavlov, David Michael Barr,
-Steve Lhomme, Niklas Haas, B Krishnan Iyer, Francois Cartegnie, Liwei Wang,
-Nathan E. Egge, Derek Buitenhuis, Michael Bradshaw, Raphaël Zumer,
-Xuefeng Jiang, Luca Barbato, Jan Beich, Wan-Teh Chang, Justin Bull, Boyuan Xiao,
-Dale Curtis, Kyle Siefring, Raphael Zumer, Rupert Swarbrick, Thierry Foucu,
-Thomas Daede, Colin Lee, Emmanuel Gil Peyrot, Lynne, Michail Alvanos,
-Nico Weber, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier,
-Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard,
-Mark Shuttleworth, Matthieu Bouron, Nicolas Frattaroli, Pablo Stebler,
-Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvestre Ledru, Timo Gurr,
-Tristan Matthews, Xavier Claessens, Xu Guangxin, kossh1 and skal.
+Martin Storsjö, Henrik Gramner, Ronald S. Bultje, Janne Grunau, James Almer,
+Victorien Le Couviour--Tuffet, Matthias Dressel, Marvin Scholz, Luc Trudeau,
+Jean-Baptiste Kempf, Hugo Beauzée-Luyssen, Niklas Haas, Konstantin Pavlov,
+David Michael Barr, Steve Lhomme, Nathan E. Egge, Kyle Siefring, Raphaël Zumer,
+B Krishnan Iyer, Francois Cartegnie, Liwei Wang, Derek Buitenhuis,
+Michael Bradshaw, Wan-Teh Chang, Xuefeng Jiang, Luca Barbato, Jan Beich,
+Christophe Gisquet, Justin Bull, Boyuan Xiao, Dale Curtis, Emmanuel Gil Peyrot,
+Rupert Swarbrick, Thierry Foucu, Thomas Daede, Colin Lee, Jonathan Wright,
+Lynne, Michail Alvanos, Nico Weber, Salome Thirot, SmilingWolf, Tristan Laurent,
+Vittorio Giovara, Yannis Guyon, André Kempe, Anisse Astier, Anton Mitrofanov,
+Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, Joe Drago,
+Mark Shuttleworth, Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli,
+Pablo Stebler, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvain BERTRAND,
+Sylvestre Ledru, Timo Gurr, Tristan Matthews, Vibhoothi, Xavier Claessens,
+Xu Guangxin, kossh1 and skal
diff --git a/third_party/dav1d/include/dav1d/common.h b/third_party/dav1d/include/dav1d/common.h
index b55e9399f06f6..8685b4f078285 100644
--- a/third_party/dav1d/include/dav1d/common.h
+++ b/third_party/dav1d/include/dav1d/common.h
@@ -78,4 +78,9 @@ typedef struct Dav1dDataProps {
struct Dav1dUserData user_data; ///< user-configurable data, default NULL members
} Dav1dDataProps;
+/**
+ * Release reference to a Dav1dDataProps.
+ */
+DAV1D_API void dav1d_data_props_unref(Dav1dDataProps *props);
+
#endif /* DAV1D_COMMON_H */
diff --git a/third_party/dav1d/include/dav1d/dav1d.h b/third_party/dav1d/include/dav1d/dav1d.h
index 0938156f759d5..fd3b622505b7e 100644
--- a/third_party/dav1d/include/dav1d/dav1d.h
+++ b/third_party/dav1d/include/dav1d/dav1d.h
@@ -274,6 +274,19 @@ enum Dav1dEventFlags {
*/
DAV1D_API int dav1d_get_event_flags(Dav1dContext *c, enum Dav1dEventFlags *flags);
+/**
+ * Retrieve the user-provided metadata associated with the input data packet
+ * for the last decoding error reported to the user, i.e. a negative return
+ * value (not EAGAIN) from dav1d_send_data() or dav1d_get_picture().
+ *
+ * @param c Input decoder instance.
+ * @param out Output Dav1dDataProps. On success, the caller assumes ownership of
+ * the returned reference.
+ *
+ * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
+ */
+DAV1D_API int dav1d_get_decode_error_data_props(Dav1dContext *c, Dav1dDataProps *out);
+
# ifdef __cplusplus
}
# endif
diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build
index 0e63d3bced698..5efb88c5d3fb4 100644
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -30,7 +30,7 @@ project('dav1d', ['c'],
'b_ndebug=if-release'],
meson_version: '>= 0.49.0')
-dav1d_soname_version = '6.4.0'
+dav1d_soname_version = '6.6.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@@ -413,11 +413,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
# check NASM version
if nasm.found()
- nasm_r = run_command(nasm, '-v')
-
- if nasm_r.returncode() != 0
- error('failed running nasm to obtain its version')
- endif
+ nasm_r = run_command(nasm, '-v', check: true)
out = nasm_r.stdout().strip().split()
if out[1].to_lower() == 'version'
diff --git a/third_party/dav1d/src/arm/32/filmgrain.S b/third_party/dav1d/src/arm/32/filmgrain.S
new file mode 100644
index 0000000000000..d1f83efb98e01
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/filmgrain.S
@@ -0,0 +1,2039 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+// Advance the random state held in r2 by \steps bits in a single operation.
+// The feedback value is r ^ (r >> 3) ^ (r >> 12) ^ (r >> 1), masked to the
+// low \steps bits.
+//   shift=1 (default): r2 is shifted right by \steps and the feedback bits
+//                      are inserted at bit (16 - \steps).
+//   shift=0:           r2 is not shifted; the feedback bits are OR'ed in at
+//                      bit 16 and up, leaving the shifting to later code.
+// Clobbers r11, r12 and lr (lr is used as a scratch register here, so callers
+// must have saved their return address).
+.macro increment_seed steps, shift=1
+ lsr r11, r2, #3
+ lsr r12, r2, #12
+ lsr lr, r2, #1
+ eor r11, r2, r11 // (r >> 0) ^ (r >> 3)
+ eor r12, r12, lr // (r >> 12) ^ (r >> 1)
+ eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr r2, r2, #\steps
+.endif
+ and r11, r11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr r2, r2, r11, lsl #(16 - \steps) // *state
+.else
+ orr r2, r2, r11, lsl #16 // *state
+.endif
+.endm
+
+// Extract a \bits-wide unsigned field from the state in r2 into \dest,
+// starting at bit (16 - \bits - \age). r2 itself is left unchanged.
+// A larger \age selects a field located further down in r2.
+.macro read_rand dest, bits, age
+ ubfx \dest, r2, #16 - \bits - \age, #\bits
+.endm
+
+// Extract a \bits-wide unsigned field from r2 starting at bit (17 - \bits)
+// into \dest, then shift the state in r2 right by one bit.
+.macro read_shift_rand dest, bits
+ ubfx \dest, r2, #17 - \bits, #\bits
+ lsr r2, r2, #1
+.endm
+
+// Fetch 8 consecutive 16-bit table entries selected by the random state.
+// special calling convention:
+// r2 holds seed
+// r3 holds dav1d_gaussian_sequence
+// clobbers r11-r12
+// returns in d0-d1
+// The seed is advanced twice by 4 bits (increment_seed 4); after each advance
+// four 11-bit indices are read (ages 3..0) and used as halfword indices into
+// the table at r3. r5/r6 are used as address scratch and are saved/restored
+// here; lr is saved because increment_seed overwrites it.
+function get_gaussian_neon
+ push {r5-r6,lr}
+ increment_seed 4
+ read_rand r5, 11, 3
+ read_rand r6, 11, 2
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[0]}, [r5]
+ read_rand r5, 11, 1
+ vld1.16 {d0[1]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 0
+ increment_seed 4
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[2]}, [r5]
+ read_rand r5, 11, 3
+ vld1.16 {d0[3]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 2
+ vld1.16 {d1[0]}, [r5]
+ add r6, r3, r6, lsl #1
+ read_rand r5, 11, 1
+ vld1.16 {d1[1]}, [r6]
+ read_rand r6, 11, 0
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d1[2]}, [r5]
+ vld1.16 {d1[3]}, [r6]
+ pop {r5-r6,pc}
+endfunc
+
+// Generate one full-width row of grain: 10 * 8 + 2 = 82 entries (GRAIN_WIDTH).
+// Ten calls to get_gaussian_neon produce 8 values each; every batch is passed
+// through vrshl.s16 with the per-lane shift amounts in q15 and narrowed to
+// 8 bytes in \r0 .. \r9. The last 2 values are fetched inline and end up in
+// the low two bytes of \r10; the remaining bytes of \r10 are not meaningful.
+// Expects r2 = seed, r3 = gaussian table base, q15 (and its low half d30) =
+// shift amounts set up by the caller (presumably negative, i.e. a rounding
+// right shift - confirm against the caller).
+// Clobbers r11, r12, lr and q0.
+.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r0, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r1, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r2, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r3, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r4, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r5, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r6, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r7, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r8, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r9, q0
+ // Final two entries of the row: advance the seed by 2 bits only.
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 \r10, q0
+.endm
+
+// Store one row produced by get_grain_row to [r0], post-incrementing r0:
+// 32 + 32 + 16 bytes from \r0 .. \r9, then the low 16 bits (2 bytes) of \r10,
+// i.e. 82 bytes (GRAIN_WIDTH) in total.
+.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+ vst1.16 {\r0, \r1, \r2, \r3}, [r0]!
+ vst1.16 {\r4, \r5, \r6, \r7}, [r0]!
+ vst1.16 {\r8, \r9}, [r0]!
+ vst1.16 {\r10[0]}, [r0]!
+.endm
+
+// Generate one subsampled-width row of grain: 5 * 8 + 4 = 44 entries
+// (SUB_GRAIN_WIDTH). Five calls to get_gaussian_neon fill \r0 .. \r4 with
+// 8 bytes each; the last 4 values are fetched inline after a single 4-bit
+// seed advance and end up in the low four bytes of \r5.
+// Expects r2 = seed, r3 = gaussian table base, q15/d30 = shift amounts set up
+// by the caller. Clobbers r11, r12, lr and q0.
+.macro get_grain_row_44 r0, r1, r2, r3, r4, r5
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r0, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r1, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r2, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r3, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r4, q0
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ // The first value is loaded to all lanes of d0; lanes 1-3 are overwritten
+ // by the following single-lane loads.
+ vld1.16 {d0[]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d0[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ read_rand r12, 11, 0
+ vld1.16 {d0[2]}, [r11]
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[3]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 \r5, q0
+.endm
+
+// Store one row produced by get_grain_row_44 to [r0].
+// The first 32 bytes are stored with post-increment; the next 16 bytes
+// (\r4, \r5) are stored without writeback, and r0 is then advanced by
+// GRAIN_WIDTH-32 so that consecutive rows are GRAIN_WIDTH bytes apart.
+// Note that 48 bytes are written per row although only 44 are generated.
+.macro store_grain_row_44 r0, r1, r2, r3, r4, r5
+ vst1.16 {\r0, \r1, \r2, \r3}, [r0]!
+ vst1.16 {\r4, \r5}, [r0]
+ add r0, r0, #GRAIN_WIDTH-32
+.endm
+
+// Produce two grain entries in the low two bytes of d0.
+// Advances the seed in r2 by 2 bits, reads two 11-bit indices, loads the
+// corresponding halfwords from the table at r3, applies vrshl.s16 with the
+// shift amounts in d30 and narrows to bytes.
+// r11 and lr are saved/restored here; r12 is clobbered.
+function get_grain_2_neon
+ push {r11,lr}
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 d0, q0
+ pop {r11,pc}
+endfunc
+
+// Call get_grain_2_neon and place its result in \dst.
+// The copy is omitted when \dst is d0, where the result already lives.
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, d0
+ vmov \dst, d0
+.endif
+.endm
+
+// Defines function output_lag\n\()_neon for \n = 1, 2 or 3.
+// r1 holds the number of entries to produce
+// r6, r8 and r10 hold the previous output entries
+// q0 holds the vector of produced entries
+// q1 holds the input vector of sums from above
+//
+// Further register usage, as read from the code below:
+//   r3  gaussian table base (indexed with a halfword offset)
+//   r5  upper clamp bound; the lower bound is the constant -128
+//   r7  shift applied to the filter sum, r9 shift applied to the table value
+//   \n == 1: r4 is the coefficient for r6 (the only previous output used);
+//            r8 and r10 are the rounding terms added before the r7 and r9
+//            shifts respectively.
+//   \n == 2: r4 is the coefficient for r8 and r10 the coefficient for r6;
+//            the rounding terms 1 << (r7 - 1) and 1 << (r9 - 1) are computed
+//            into r0 and lr on entry.
+//   \n == 3: r4 packs three signed 8-bit coefficients (bits 0-7 for r10,
+//            8-15 for r8, 16-23 for r6); rounding terms as for \n == 2.
+// Each iteration rotates q0 down by one byte and inserts the new entry as
+// the top byte (d1[7]), and rotates q1 by one 32-bit lane so that the next
+// sum is in d2[0]. r0 is saved/restored; r11 and r12 are clobbered.
+.macro output_lag n
+function output_lag\n\()_neon
+ push {r0, lr}
+.if \n == 1
+ mov lr, #-128
+.else
+ mov r0, #1
+ mov lr, #1
+ sub r7, r7, #1
+ sub r9, r9, #1
+ lsl r0, r0, r7
+ lsl lr, lr, r9
+ add r7, r7, #1
+ add r9, r9, #1
+.endif
+1:
+ read_shift_rand r12, 11
+ vmov.32 r11, d2[0]
+ lsl r12, r12, #1
+ vext.8 q0, q0, q0, #1
+ ldrsh r12, [r3, r12]
+.if \n == 1
+ mla r11, r6, r4, r11 // sum (above) + *coeff * prev output
+ add r6, r11, r8 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, r10
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ cmp r6, r5
+.elseif \n == 2
+ mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r6, r10, r11 // += *coeff * prev output 2
+ mov r8, r6
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ // lr holds a rounding term here; park it on the stack so lr can carry
+ // the lower clamp bound for the comparison below.
+ push {lr}
+ cmp r6, r5
+ mov lr, #-128
+.else
+ // r1-r3 are borrowed as scratch for the unpacked coefficients.
+ push {r1-r3}
+ sbfx r1, r4, #0, #8
+ sbfx r2, r4, #8, #8
+ sbfx r3, r4, #16, #8
+ mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2
+ mla r11, r6, r3, r11 // += *coeff * prev output 3
+ pop {r1-r3}
+ mov r10, r8
+ mov r8, r6
+
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr}
+ cmp r6, r5
+ mov lr, #-128
+.endif
+ // Clamp r6 to [lr, r5] = [-128, r5].
+ it gt
+ movgt r6, r5
+ cmp r6, lr
+ it lt
+ movlt r6, lr
+.if \n >= 2
+ pop {lr}
+.endif
+ subs r1, r1, #1
+ vext.8 q1, q1, q1, #4
+ vmov.8 d1[7], r6
+ bgt 1b
+ pop {r0, pc}
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+// Compute the lag-1 weighted sums of the row above for 16 positions.
+// Inputs:  q0 (d0-d1), q3 (d6-d7) and q1 (d2-d3) hold 16 signed bytes each;
+//          they are multiplied by the byte coefficients in d27, d28 and d29
+//          respectively. As set up by the sum_lag1 macro, q0/q3/q1 are the
+//          row above shifted by -1 / 0 / +1 positions.
+// Outputs: q2, q3, q4, q5 hold the 32-bit sums for positions 0-3, 4-7, 8-11
+//          and 12-15.
+// Leaf function (returns with bx lr); overwrites q0-q5.
+function sum_lag1_above_neon
+ vmull.s8 q2, d6, d28
+ vmull.s8 q3, d7, d28
+ vmull.s8 q4, d0, d27
+ vmull.s8 q5, d1, d27
+
+ vaddl.s16 q0, d4, d8
+ vaddl.s16 q2, d5, d9
+ vaddl.s16 q4, d6, d10
+ vaddl.s16 q5, d7, d11
+
+ vmull.s8 q3, d3, d29
+ vmull.s8 q1, d2, d29
+
+ vaddw.s16 q4, q4, d6
+ vaddw.s16 q5, q5, d7
+ vaddw.s16 q3, q2, d3
+ vaddw.s16 q2, q0, d2
+ bx lr
+endfunc
+
+// Shared body of the sum_<type>_<lag>_<edge>_neon functions.
+// Parameters:
+//   lag        lag1 / lag2 / lag3; selects the sum_<lag>_above_neon and
+//              output_<lag>_neon helpers (lag3 + left edge uses
+//              sum_lag3_left_above_neon instead).
+//   type       y, uv_420, uv_422 or uv_444; selects how luma is loaded from
+//              r11 for the chroma variants.
+//   uv_layout  0 for luma, otherwise 420 / 422 / 444.
+//   edge       left, mid or right.
+//   elems      number of entries produced by this block (16, 15 or 9).
+//   store      when non-zero, q0 is stored to [r0] with post-increment.
+//   uv_coeff   optional byte lane broadcast into d13 as the luma coefficient;
+//              when blank, d13 is used as already set up by the caller.
+// Steps: (1) sums of the rows above into q2-q5, (2) for chroma, load luma
+// from r11 (averaging 2x2 for uv_420, 2x1 for uv_422) and add luma * d13 to
+// the sums, (3) produce the outputs four at a time via output_<lag>_neon.
+// Only some variants emit step (3); the others branch to the labelled tail
+// of the matching y (or uv_420) function, so that tail is shared.
+// The tail ends with pop {r11} / pop {r1, pc}, which pairs with the
+// push {r1, lr} done by the function wrapper before this body.
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
+.ifc \lag\()_\edge, lag3_left
+ bl sum_lag3_left_above_neon
+.else
+ bl sum_\lag\()_above_neon
+.endif
+.ifc \type, uv_420
+ // Two luma rows, 32 bytes each: pairwise add horizontally, add the rows,
+ // then rounding shift by 2 to average each 2x2 group.
+ vpush {q6-q7}
+ add r12, r11, #GRAIN_WIDTH
+ vld1.16 {q0, q1}, [r11]!
+ vld1.16 {q6, q7}, [r12]!
+ vpaddl.s8 q0, q0
+ vpaddl.s8 q1, q1
+ vpaddl.s8 q6, q6
+ vpaddl.s8 q7, q7
+ vadd.i16 q0, q0, q6
+ vadd.i16 q1, q1, q7
+ vpop {q6-q7}
+ vrshrn.s16 d0, q0, #2
+ vrshrn.s16 d1, q1, #2
+.endif
+.ifc \type, uv_422
+ // One luma row: pairwise add horizontally, rounding shift by 1.
+ vld1.8 {q0, q1}, [r11]!
+ vpaddl.s8 q0, q0
+ vpaddl.s8 q1, q1
+ vrshrn.s16 d0, q0, #1
+ vrshrn.s16 d1, q1, #1
+.endif
+.ifc \type, uv_444
+ vld1.8 {q0}, [r11]!
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ vdup.8 d13, \uv_coeff
+.endif
+ // Add luma * coefficient to the 32-bit sums in q2-q5.
+ vmull.s8 q1, d0, d13
+ vmull.s8 q0, d1, d13
+ vaddw.s16 q2, q2, d2
+ vaddw.s16 q3, q3, d3
+ vaddw.s16 q4, q4, d0
+ vaddw.s16 q5, q5, d1
+.endif
+.if \uv_layout && \elems == 16
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 15
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 9
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+ push {r11}
+.ifc \edge, left
+ // Left edge: three entries are taken directly from the table (placed in
+ // bytes 5-7 of d1 after narrowing) and only the fourth entry of the
+ // group goes through output_<lag>_neon (r1 = 1).
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d1[1]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d1[2]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d1[3]}, [r11]
+ lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ vrshl.s16 d1, d1, d30
+ vmovn.i16 d1, q0
+ vext.8 q2, q2, q2, #12
+.ifc \lag, lag3
+ vmov.s8 r10, d1[5]
+.endif
+.ifnc \lag, lag1
+ vmov.s8 r8, d1[6]
+.endif
+ vmov.s8 r6, d1[7]
+
+ vmov q1, q2
+ mov r1, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ vmov q1, q3
+ mov r1, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ vmov q1, q4
+.if \elems == 9
+ // 9-entry block: one more filtered entry, then three entries taken
+ // directly from the table.
+ mov r1, #1
+ bl output_\lag\()_neon
+ lsr r2, r2, #3
+
+ read_rand r11, 11, 2
+ read_rand r12, 11, 1
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ read_rand r11, 11, 0
+ vld1.16 {d2[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[2]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vmovn.i16 d2, q1
+ vext.8 q0, q0, q1, #7
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ vmov q1, q5
+
+.ifc \edge, right
+ // Right edge: three filtered entries, then one entry taken directly
+ // from the table.
+ mov r1, #3
+ bl output_\lag\()_neon
+ read_shift_rand r11, 11
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #1
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+.endif
+.if \store
+ vst1.8 {q0}, [r0]!
+.endif
+ pop {r11}
+ pop {r1, pc}
+.endif
+.endm
+
+// Define function sum_<type>_lag1_<edge>_neon.
+// The function saves r1 and lr and then expands sum_lag_n_body for lag1; the
+// matching pop {r1, pc} is in the tail that the body emits or branches to.
+// store=0: the body does not write q0 to memory; the result is left in q0.
+// \elems defaults to 16; the right-edge variants pass 15 (y, uv_444) or
+// 9 (uv_422, uv_420).
+.macro sum_lag1_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag1_\edge\()_neon
+ push {r1, lr}
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 15
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 15
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 9
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 9
+
+// Invoke sum_<type>_lag1_<edge>_neon for one 16-byte block and put the
+// result in \dst.
+// \left, \mid and \right are q registers holding three consecutive 16-byte
+// blocks of the row above. The macro prepares the inputs expected by
+// sum_lag1_above_neon: q3 = \mid, q0 = \mid shifted by one position towards
+// \left, q1 = \mid shifted by one position towards \right.
+.macro sum_lag1 type, dst, left, mid, right, edge=mid
+ vmov q3, \mid
+ vext.8 q0, \left, \mid, #15
+ vext.8 q1, \mid, \right, #1
+ bl sum_\type\()_lag1_\edge\()_neon
+ vmov \dst, q0
+.endm
+
+// Per-type convenience wrappers around sum_lag1.
+.macro sum_y_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 y, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_444, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_422, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_420, \dst, \left, \mid, \right, \edge
+.endm
+
+
+// Compute the lag-2 weighted sums of the two rows above for 16 positions.
+// Inputs:
+//   r0        current output position; the blocks to the right of it in the
+//             rows two and one above are loaded from r0 - 2*GRAIN_WIDTH + 16
+//             and r0 - GRAIN_WIDTH + 16 into q10 and q13.
+//   q8,  q9   previous and current 16-byte block of the row two above.
+//   q11, q12  previous and current 16-byte block of the row one above.
+//   d28[0-4]  coefficients for offsets -2 .. +2 in the row two above.
+//   d28[5-7], d29[0-1]  coefficients for offsets -2 .. +2 in the row one
+//             above.
+// Outputs: q2-q5 hold the 32-bit sums for positions 0-3, 4-7, 8-11, 12-15.
+// On return the row registers are advanced by one block (q8 <- q9,
+// q9 <- q10, q11 <- q12, q12 <- q13), ready for the next call.
+// Overwrites q0, q1, q6, q7, q10, q13 and r12; lr is saved/restored because
+// it is used as an address register.
+function sum_lag2_above_neon
+ push {lr}
+ sub r12, r0, #2*GRAIN_WIDTH - 16
+ sub lr, r0, #1*GRAIN_WIDTH - 16
+ vld1.8 {q10}, [r12] // load top right
+ vld1.8 {q13}, [lr]
+
+ vext.8 q6, q8, q9, #14 // top left, top mid
+ vdup.8 d14, d28[0]
+ vext.8 q8, q8, q9, #15
+ vdup.8 d15, d28[1]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q2, d0, d12
+ vaddl.s16 q3, d1, d13
+ vaddl.s16 q4, d2, d16
+ vaddl.s16 q5, d3, d17
+
+ vext.8 q6, q9, q10, #1 // top mid, top right
+ vdup.8 d14, d28[3]
+ vext.8 q8, q9, q10, #2
+ vdup.8 d15, d28[4]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vext.8 q6, q11, q12, #14 // top left, top mid
+ vdup.8 d14, d28[5]
+ vext.8 q8, q11, q12, #15
+ vdup.8 d15, d28[6]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vext.8 q6, q12, q13, #1 // top mid, top right
+ vdup.8 d14, d29[0]
+ vext.8 q8, q12, q13, #2
+ vdup.8 d15, d29[1]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ // Centre taps: unshifted current blocks of both rows.
+ vdup.8 d14, d28[2]
+ vdup.8 d15, d28[7]
+
+ vmull.s8 q0, d18, d14
+ vmull.s8 q1, d19, d14
+ vmull.s8 q6, d24, d15
+ vmull.s8 q8, d25, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vmov q8, q9
+ vmov q9, q10
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vmov q11, q12
+ vmov q12, q13
+
+ pop {pc}
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=16 // Emit sum_<type>_lag2_<edge>_neon; the bulk of the body comes from sum_lag_n_body, which is defined outside this view.
+function sum_\type\()_lag2_\edge\()_neon
+ push {r1, lr}
+.ifc \edge, left
+ sub r12, r0, #2*GRAIN_WIDTH // left edge only: seed the mid blocks of rows -2 and -1 from directly above r0
+ sub lr, r0, #1*GRAIN_WIDTH
+ vld1.8 {q9}, [r12] // load the previous block right above
+ vld1.8 {q12}, [lr]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left // Instantiate sum_<type>_lag2_<edge>_neon for each plane layout and edge position.
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 15 // right-edge variants override elems (default 16) with 15 or 9
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 15
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 9
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 9
+
+
+function sum_lag3_left_above_neon // Left-edge entry point: prepares q11/q12 for row -3 without reading at a negative x offset, then continues in sum_lag3_above_neon.
+ // A separate codepath for the left edge, to avoid reading outside
+ // of the edge of the buffer.
+ sub r12, r0, #3*GRAIN_WIDTH // note: no extra -3 here, unlike sum_lag3_above_neon
+ vld1.8 {q11, q12}, [r12]
+ vext.8 q12, q11, q12, #13 // shift the 32 loaded bytes up by 3 lanes so the layout matches a load from x-3
+ vext.8 q11, q11, q11, #13 // the first 3 lanes of q11 are wrapped-around bytes of q11, not data from left of the row
+ b sum_lag3_above_start
+endfunc
+
+function sum_lag3_above_neon // Accumulate 21 taps (7 per row, x-3..x+3) from the three rows above the 16 entries at r0 into q2-q5 (32 bit lanes), using coefficients d26[0] through d28[4].
+ sub r12, r0, #3*GRAIN_WIDTH + 3 // r12 = r0 - 3*GRAIN_WIDTH - 3
+ vld1.8 {q11, q12}, [r12]
+
+sum_lag3_above_start:
+ vdup.8 d20, d26[0] // row -3, taps 0 and 1
+ vext.8 q9, q11, q12, #1
+ vdup.8 d21, d26[1]
+
+ vmull.s8 q0, d22, d20 // 8 bit x 8 bit -> 16 bit products
+ vmull.s8 q1, d23, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vext.8 q8, q11, q12, #2 // row -3, taps 2 and 3
+ vdup.8 d20, d26[2]
+ vext.8 q9, q11, q12, #3
+ vdup.8 d21, d26[3]
+
+ vaddl.s16 q2, d0, d12 // the first pair of taps initialises the 32 bit accumulators q2-q5
+ vaddl.s16 q3, d1, d13
+ vaddl.s16 q4, d2, d14
+ vaddl.s16 q5, d3, d15
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #4 // row -3, taps 4 and 5
+ vdup.8 d20, d26[4]
+ vext.8 q7, q11, q12, #5
+ vdup.8 d21, d26[5]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ sub r12, r0, #2*GRAIN_WIDTH + 3 // r12 = r0 - 2*GRAIN_WIDTH - 3
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #6 // last tap of row -3, extracted before q11/q12 are reloaded
+ vld1.8 {q11, q12}, [r12] // q11/q12 = row -2
+ vdup.8 d20, d26[6]
+ vdup.8 d21, d26[7] // first coefficient of row -2
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d22, d21 // row -2, tap 0 (unshifted)
+ vmull.s8 q7, d23, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #1 // row -2, taps 1 and 2
+ vdup.8 d20, d27[0]
+ vext.8 q7, q11, q12, #2
+ vdup.8 d21, d27[1]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #3 // row -2, taps 3 and 4
+ vdup.8 d20, d27[2]
+ vext.8 q9, q11, q12, #4
+ vdup.8 d21, d27[3]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ sub r12, r0, #1*GRAIN_WIDTH + 3 // r12 = r0 - GRAIN_WIDTH - 3
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #5 // row -2, taps 5 and 6, extracted before the reload below
+ vdup.8 d20, d27[4]
+ vext.8 q7, q11, q12, #6
+ vdup.8 d21, d27[5]
+
+ vld1.8 {q11, q12}, [r12] // q11/q12 = row -1
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vdup.8 d20, d27[6] // row -1, taps 0 and 1
+ vext.8 q9, q11, q12, #1
+ vdup.8 d21, d27[7]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d22, d20
+ vmull.s8 q1, d23, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #2 // row -1, taps 2 and 3
+ vdup.8 d20, d28[0]
+ vext.8 q7, q11, q12, #3
+ vdup.8 d21, d28[1]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #4 // row -1, taps 4 and 5
+ vdup.8 d20, d28[2]
+ vext.8 q9, q11, q12, #5
+ vdup.8 d21, d28[3]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #6 // row -1, tap 6: the single remaining tap
+ vdup.8 d20, d28[4]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+
+ vaddw.s16 q2, q2, d0 // widen and add the unpaired last product
+ vaddw.s16 q3, q3, d1
+ vaddw.s16 q4, q4, d2
+ vaddw.s16 q5, q5, d3
+
+ bx lr
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=16 // Emit sum_<type>_lag3_<edge>_neon; the body comes from sum_lag_n_body, which is defined outside this view.
+function sum_\type\()_lag3_\edge\()_neon
+ push {r1, lr}
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left // Instantiate sum_<type>_lag3_<edge>_neon for each plane layout and edge position.
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 15 // right-edge variants override elems (default 16) with 15 or 9
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 15
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 9
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 9
+
+function generate_grain_rows_neon // Generate and store r1 grain rows, using d16-d26 as the row buffer (get_grain_row / store_grain_row are defined outside this view).
+ push {r11,lr}
+1:
+ get_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
+ subs r1, r1, #1 // row counter; the flags are consumed by the bgt below
+ store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 // presumably leaves the flags intact, since bgt relies on the subs above - confirm
+ bgt 1b
+ pop {r11,pc}
+endfunc
+
+function generate_grain_rows_44_neon // Generate and store r1 rows of the narrower (44 wide, per the name) grain layout, using d16-d21 as the row buffer.
+ push {r11,lr}
+1:
+ get_grain_row_44 d16, d17, d18, d19, d20, d21
+ subs r1, r1, #1 // row counter; the flags are consumed by the bgt below
+ store_grain_row_44 d16, d17, d18, d19, d20, d21 // presumably leaves the flags intact, since bgt relies on the subs above - confirm
+ bgt 1b
+ pop {r11,pc}
+endfunc
+
+function gen_grain_uv_444_lag0_neon // Write 16 grain entries to r0: fresh gaussian grain plus the masked bytes read from r11 scaled by the coefficient in d22. Expects q1 = byte mask, q12 and q15 = negated shift counts.
+ vld1.8 {q3}, [r11]! // 16 bytes from the r11 buffer (presumably luma grain - confirm against the caller), r11 += 16
+ push {r11,lr} // r11 is preserved across the calls below
+ bl get_gaussian_neon
+ vrshl.s16 q8, q0, q15 // rounding shift by q15 (negative count = right shift); q0 is read as the result of the call
+ bl get_gaussian_neon
+ vrshl.s16 q9, q0, q15
+ vqmovn.s16 d0, q8 // saturate to 8 bit: q0 = 16 new grain values
+ vqmovn.s16 d1, q9
+
+ vand q3, q3, q1 // zero the lanes the caller masked out
+ vmull.s8 q2, d6, d22 // coefficient * source
+ vmull.s8 q3, d7, d22
+ vrshl.s16 q2, q2, q12 // rounding shift by q12 (negative count = right shift)
+ vrshl.s16 q3, q3, q12
+ vaddw.s8 q2, q2, d0 // + new grain
+ vaddw.s8 q3, q3, d1
+ vqmovn.s16 d4, q2 // saturate back to signed 8 bit
+ vqmovn.s16 d5, q3
+ vst1.8 {q2}, [r0]! // store 16 entries, r0 += 16
+ pop {r11,pc}
+endfunc
+
+function get_grain_row_44_neon // Callable wrapper around the get_grain_row_44 macro: fills d16-d21, preserving r11 and lr around it.
+ push {r11,lr}
+ get_grain_row_44 d16, d17, d18, d19, d20, d21
+ pop {r11,pc}
+endfunc
+
+function add_uv_420_coeff_lag0_neon // 4:2:0 variant: average 2x2 groups of bytes from the rows at r11 and r12 into 16 bytes in q2, then continue at add_coeff_lag0_start.
+ vld1.16 {q2, q3}, [r11]! // 32 bytes from the first row, r11 += 32
+ vld1.16 {q4, q5}, [r12]! // 32 bytes from the second row, r12 += 32
+ vpaddl.s8 q2, q2 // add horizontally adjacent pairs, widening to 16 bit
+ vpaddl.s8 q3, q3
+ vpaddl.s8 q4, q4
+ vpaddl.s8 q5, q5
+ vadd.i16 q2, q2, q4 // add the two rows
+ vadd.i16 q3, q3, q5
+ vrshrn.s16 d4, q2, #2 // rounded divide by 4, narrowed to 8 bit
+ vrshrn.s16 d5, q3, #2
+ b add_coeff_lag0_start
+endfunc
+
+function add_uv_422_coeff_lag0_neon // 4:2:2 variant: average horizontal pairs of bytes from r11 into 16 bytes, then apply the lag-0 coefficient. Result: 16 grain entries in q2.
+ vld1.16 {q2, q3}, [r11]! // 32 bytes, r11 += 32
+ vpaddl.s8 q2, q2 // add horizontally adjacent pairs, widening to 16 bit
+ vpaddl.s8 q3, q3
+ vrshrn.s16 d4, q2, #1 // rounded divide by 2, narrowed to 8 bit
+ vrshrn.s16 d5, q3, #1
+
+add_coeff_lag0_start: // shared tail, also entered from add_uv_420_coeff_lag0_neon. Expects q0 = new grain, q1 = byte mask, d22 = coefficient, q12 = negated shift count.
+ vand q3, q2, q1 // zero the lanes the caller masked out
+ vmull.s8 q2, d6, d22 // coefficient * averaged source
+ vmull.s8 q3, d7, d22
+ vrshl.s16 q2, q2, q12 // rounding shift by q12 (negative count = right shift)
+ vrshl.s16 q3, q3, q12
+ vaddw.s8 q2, q2, d0 // + new grain
+ vaddw.s8 q3, q3, d1
+ vqmovn.s16 d4, q2 // saturate back to signed 8 bit
+ vqmovn.s16 d5, q3
+ bx lr
+endfunc
+
+.macro gen_grain_82 type // Emit generate_grain_<type>_8bpc_neon for the full-width layouts (instantiated for y and uv_444): common setup, then dispatch on ar_coeff_lag.
+function generate_grain_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+
+.ifc \type, uv_444
+ mov r12, r3 // 4th argument (presumably the uv plane index - confirm against the C prototype)
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH // r11 = 2nd argument + 3 rows: source buffer read by the uv helpers
+ mov r1, r2 // r1 = 3rd argument; used below as the base for the FGD_* fields
+ mul r12, r12, lr // r12 = 28 * 4th argument: byte offset into ar_coeffs_uv
+.endif
+ movrel r3, X(gaussian_sequence)
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add r4, r1, #FGD_AR_COEFFS_Y
+.else
+ add r4, r1, #FGD_AR_COEFFS_UV
+.endif
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2] // table entry for this lag: offset relative to the table
+ vdup.16 q15, r9 // 4 + data->grain_scale_shift
+ add r5, r5, r6 // r5 = address of the lag-specific code path
+ vneg.s16 q15, q15 // negated so that vrshl performs a rounding right shift
+
+.ifc \type, uv_444
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid armv8 deprecated it instruction forms.
+ it eq
+ moveq r10, lr // r10 = 0xb524 when the offset is zero, else 0x49d8
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10 // plane-specific seed
+.endif
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl): // one entry per ar_coeff_lag value 0-3
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, y
+ mov r1, #GRAIN_HEIGHT // luma, lag 0: every row is plain generated grain
+ bl generate_grain_rows_neon
+.else
+
+ mov r1, #3 // the first 3 rows are plain generated grain
+ bl generate_grain_rows_neon
+ mov r1, #GRAIN_HEIGHT-3
+
+ vdup.16 q12, r7 // ar_coeff_shift, negated below for vrshl
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vext.8 q13, q0, q1, #13 // mask for the first block: lanes 0-2 zero, lanes 3-15 all ones
+ vext.8 q14, q1, q0, #1 // mask for the fifth block: lanes 0-14 all ones, lane 15 zero
+ vneg.s16 q12, q12
+
+1:
+ vmov q1, q13
+ bl gen_grain_uv_444_lag0_neon // 16
+ vmov.i8 q1, #255 // no masking for the middle blocks
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 64
+ vmov q1, q14
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 d16 // the last 2 entries of the 82 wide row get plain grain
+ subs r1, r1, #1
+ add r11, r11, #2 // skip the matching 2 source bytes (5*16 + 2 = GRAIN_WIDTH)
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #127 // not used in the code visible here; presumably a clamp bound for sum_lag_n_body - confirm
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb r4, [r4, #1] // ar_coeffs_y[3]
+.else
+ add r4, r4, #2 // r4 now points at coefficient 4
+.endif
+
+ mov r1, #3
+.ifc \type, uv_444
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon // first 3 rows: plain grain; leaves the last row in d16-d26
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ sum_\type\()_lag1 q7, q8, q8, q9, left // q8-q13 hold the previous row on entry; each result register is one that is no longer needed as input
+ sum_\type\()_lag1 q8, q8, q9, q10
+ sum_\type\()_lag1 q9, q9, q10, q11
+ sum_\type\()_lag1 q10, q10, q11, q12
+ sum_\type\()_lag1 q12, q11, q12, q13, right
+ get_grain_2 d26
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26
+ vmov q11, q10 // move the new row back into q8-q12 so it becomes the previous row of the next iteration
+ vmov q10, q9
+ vmov q9, q8
+ vmov q8, q7
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #127 // not used in the code visible here; presumably a clamp bound for sum_lag_n_body - confirm
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2] // coefficient 10, sign extended
+ vmov.s8 r10, d29[3] // coefficient 11, sign extended
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon // 5 blocks of 16 entries per row
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ get_grain_2 d16 // plain grain for the last 2 entries of the row
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #127 // not used in the code visible here; presumably a clamp bound for sum_lag_n_body - confirm
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5] // coefficients 21-23 as unsigned bytes
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8 // pack them into r4: bits 0-7, 8-15 and 16-23
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ vpush {d26} // generate_grain_rows_neon uses d26 as part of its row buffer; keep coefficients 0-7
+ bl generate_grain_rows_neon
+ vpop {d26}
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon // 5 blocks of 16 entries per row
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ get_grain_2 d16 // plain grain for the last 2 entries of the row
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_82 y // emits generate_grain_y_8bpc_neon
+gen_grain_82 uv_444 // emits generate_grain_uv_444_8bpc_neon
+
+.macro set_height dst, type // Load the number of rows left after the first 3 into \dst.
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3 // uv_420 uses the reduced grain height
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type // Move \reg to the start of the next source row, compensating for 3*32 bytes of advance per row (the lag-0 helpers post-increment by 32 three times; the other lags presumably advance likewise - confirm).
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH-(3*32) // net effect: two rows down
+.else
+ sub \reg, \reg, #3*32-GRAIN_WIDTH // net effect: one row down
+.endif
+.endm
+
+.macro gen_grain_44 type // Emit generate_grain_<type>_8bpc_neon for the narrower chroma layouts (instantiated for uv_420 and uv_422): common setup, then dispatch on ar_coeff_lag.
+function generate_grain_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+
+ mov r12, r3 // 4th argument (presumably the uv plane index - confirm against the C prototype)
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH-3 // r11 = 2nd argument + 3 rows - 3 bytes: source buffer read by the uv helpers
+ mov r1, r2 // r1 = 3rd argument; used below as the base for the FGD_* fields
+ mul r12, r12, lr // r12 = 28 * 4th argument: byte offset into ar_coeffs_uv
+
+ movrel r3, X(gaussian_sequence)
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+ add r4, r1, #FGD_AR_COEFFS_UV
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2] // table entry for this lag: offset relative to the table
+ vdup.16 q15, r9 // 4 + data->grain_scale_shift
+ add r5, r5, r6 // r5 = address of the lag-specific code path
+ vneg.s16 q15, q15 // negated so that vrshl performs a rounding right shift
+
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid armv8 deprecated it instruction forms.
+ it eq
+ moveq r10, lr // r10 = 0xb524 when the offset is zero, else 0x49d8
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10 // plane-specific seed
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl): // one entry per ar_coeff_lag value 0-3
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, uv_420
+ vpush {q4-q5} // add_uv_420_coeff_lag0_neon overwrites q4-q5
+.endif
+ mov r1, #3 // the first 3 rows are plain generated grain
+ bl generate_grain_rows_44_neon
+ set_height r1, \type
+
+ vdup.16 q12, r7 // ar_coeff_shift, negated below for vrshl
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vext.8 q13, q0, q1, #13 // mask for the first block: lanes 0-2 zero, lanes 3-15 all ones
+ vext.8 q14, q1, q0, #7 // mask for the last block: lanes 0-8 all ones, lanes 9-15 zero
+ vneg.s16 q12, q12
+
+1:
+ bl get_grain_row_44_neon // new grain for the whole row in q8-q10
+.ifc \type, uv_420
+ add r12, r11, #GRAIN_WIDTH // second source row for the 2x2 average
+.endif
+ vmov q1, q13
+ vmov q0, q8 // q0 = new grain for block 0
+ bl add_\type\()_coeff_lag0_neon
+ vmov.i8 q1, #255 // no masking for the middle block
+ vmov q0, q9
+ vmov q8, q2 // block 0 result
+ bl add_\type\()_coeff_lag0_neon
+ vmov.i8 q1, q14 // copies q14 (mask for the last block) into q1
+ vmov q0, q10
+ vmov q9, q2 // block 1 result
+ bl add_\type\()_coeff_lag0_neon
+ vmov q10, q2 // block 2 result
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ store_grain_row_44 d16, d17, d18, d19, d20, d21
+ bgt 1b
+
+.ifc \type, uv_420
+ vpop {q4-q5}
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #127 // not used in the code visible here; presumably a clamp bound for sum_lag_n_body - confirm
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2]
+ add r4, r4, #2 // r4 now points at coefficient 4
+
+ mov r1, #3
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon // first 3 rows: plain grain; leaves the last row in d16-d21
+
+ set_height r1, \type
+1:
+ sum_\type\()_lag1 q7, q8, q8, q9, left // q8-q10 hold the previous row on entry (q11 is passed as the right neighbour of the last block)
+ sum_\type\()_lag1 q8, q8, q9, q10
+ sum_\type\()_lag1 q10, q9, q10, q11, right
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ store_grain_row_44 d14, d15, d16, d17, d20, d21
+ vmov q9, q8 // move the new row back into q8-q10 so it becomes the previous row of the next iteration
+ vmov q8, q7
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #127 // not used in the code visible here; presumably a clamp bound for sum_lag_n_body - confirm
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2] // coefficient 10, sign extended
+ vmov.s8 r10, d29[3] // coefficient 11, sign extended
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag2_left_neon // 3 blocks per row
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH-48 // next output row; assumes the three calls advanced r0 by 3*16 bytes - confirm in sum_lag_n_body
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #127 // not used in the code visible here; presumably a clamp bound for sum_lag_n_body - confirm
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5] // coefficients 21-23 as unsigned bytes
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8 // pack them into r4: bits 0-7, 8-15 and 16-23
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon // only d16-d21 are passed as its row buffer, so the coefficients in q13/q14 need no saving here
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag3_left_neon // 3 blocks per row
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH-48 // next output row; assumes the three calls advanced r0 by 3*16 bytes - confirm in sum_lag_n_body
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_44 uv_420 // emits generate_grain_uv_420_8bpc_neon
+gen_grain_44 uv_422 // emits generate_grain_uv_422_8bpc_neon
+
+.macro gather_interleaved dst1, dst2, src1, src2, off // Byte table lookup for lanes off, off+2, off+4 and off+6 of two registers: \dstN[i] = byte at r3 + \srcN[i]. Uses r11, r12 and lr as scratch.
+ vmov.u8 r11, \src1[0+\off] // index byte -> core register
+ vmov.u8 r12, \src2[0+\off]
+ add r11, r11, r3 // r3 = table base
+ vmov.u8 lr, \src1[2+\off]
+ add r12, r12, r3
+ vld1.8 {\dst1[0+\off]}, [r11] // load the table entry into the matching lane
+ vmov.u8 r11, \src2[2+\off] // lookups for the two registers are interleaved, cycling through three scratch registers
+ add lr, lr, r3
+ vld1.8 {\dst2[0+\off]}, [r12]
+ vmov.u8 r12, \src1[4+\off]
+ add r11, r11, r3
+ vld1.8 {\dst1[2+\off]}, [lr]
+ vmov.u8 lr, \src2[4+\off]
+ add r12, r12, r3
+ vld1.8 {\dst2[2+\off]}, [r11]
+ vmov.u8 r11, \src1[6+\off]
+ add lr, lr, r3
+ vld1.8 {\dst1[4+\off]}, [r12]
+ vmov.u8 r12, \src2[6+\off]
+ add r11, r11, r3
+ vld1.8 {\dst2[4+\off]}, [lr]
+ add r12, r12, r3
+ vld1.8 {\dst1[6+\off]}, [r11]
+ vld1.8 {\dst2[6+\off]}, [r12]
+.endm
+
+.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4 // Full table lookup for four d registers (32 bytes): \dstN[i] = byte at r3 + \srcN[i].
+ gather_interleaved \dst1, \dst3, \src1, \src3, 0 // even lanes of registers 1 and 3
+ gather_interleaved \dst1, \dst3, \src1, \src3, 1 // odd lanes of registers 1 and 3
+ gather_interleaved \dst2, \dst4, \src2, \src4, 0 // even lanes of registers 2 and 4
+ gather_interleaved \dst2, \dst4, \src2, \src4, 1 // odd lanes of registers 2 and 4
+.endm
+
+function gather32_neon // Look up the 32 index bytes in d0-d3 in the byte table at r3; results in d8-d11. Preserves r11 and r12.
+ push {r11-r12,lr}
+ gather d8, d9, d10, d11, d0, d1, d2, d3
+ pop {r11-r12,pc}
+endfunc
+
+function gather16_neon // Look up the 16 index bytes in d0-d1 in the byte table at r3; results in d8-d9. Preserves r11 and r12.
+ push {r11-r12,lr}
+ gather_interleaved d8, d9, d0, d1, 0 // even lanes
+ gather_interleaved d8, d9, d0, d1, 1 // odd lanes
+ pop {r11-r12,pc}
+endfunc
+
+const overlap_coeffs_0, align=4 // Overlap weights loaded into d24/d25 by fgy and by fguv with sx=0; blended sums are shifted right by 5 afterwards.
+ .byte 27, 17, 0, 0, 0, 0, 0, 0 // d24: per-lane weight of the old (neighbouring block) grain in the x overlap
+ .byte 17, 27, 32, 32, 32, 32, 32, 32 // d25: per-lane weight of the current grain; 32 with old weight 0 leaves a lane unchanged
+endconst
+
+const overlap_coeffs_1, align=4 // Overlap weights loaded into d24/d25 by fguv with sx=1; blended sums are shifted right by 5 afterwards.
+ .byte 23, 0, 0, 0, 0, 0, 0, 0 // d24: per-lane weight of the old (neighbouring block) grain in the x overlap
+ .byte 22, 32, 32, 32, 32, 32, 32, 32 // d25: per-lane weight of the current grain; 32 with old weight 0 leaves a lane unchanged
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy // Split the random value \src into x and y offsets; each is doubled when its axis is not subsampled.
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride // \dst = \src + \stride * \offy + \offx
+ mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx // grain_lut += offx
+.endm
+
+// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type);
+function fgy_32x32_8bpc_neon, export=1 // Entry point: sets up clip limits, overlap weights and the four grain_lut pointers, then jumps into the loop variant selected by type.
+ push {r4-r11,lr}
+ vpush {q4-q7} // 36 + 64 = 100 bytes pushed, so the stack arguments start at sp + 100
+ ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
+ ldrd r6, r7, [sp, #108] // offsets, h
+ ldr r8, [sp, #116] // clip
+ mov r9, #GRAIN_WIDTH // grain_lut stride
+
+ neg r4, r4
+ vdup.16 q13, r4 // -scaling_shift
+ cmp r8, #0
+
+ movrel_local r12, overlap_coeffs_0
+
+ beq 1f // the branch tests clip (cmp above)
+ // clip
+ vmov.i8 q14, #16 // q14 = lower output bound
+ vmov.i8 q15, #235 // q15 = upper output bound
+ b 2f
+1:
+ // no clip
+ vmov.i8 q14, #0
+ vmov.i8 q15, #255
+2:
+
+ vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ add r5, r5, #9 // grain_lut += 9
+ add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r9 // grain_lut += grain_stride
+
+ ldr r10, [r6, #8] // offsets[1][0]
+ calc_offset r10, r4, r10, 0, 0
+ add_offset r4, r10, r4, r5, r9 // r4 = grain_lut pointer for the "old" block (x overlap)
+ ldr r10, [r6, #4] // offsets[0][1]
+ calc_offset r10, r11, r10, 0, 0
+ add_offset r11, r10, r11, r5, r9 // r11 = grain_lut pointer for the "top" block (y overlap)
+ ldr r10, [r6, #12] // offsets[1][1]
+ calc_offset r10, r8, r10, 0, 0
+ add_offset r8, r10, r8, r5, r9 // r8 = grain_lut pointer for the "top old" block
+ ldr r6, [r6] // offsets[0][0]
+ calc_offset r6, lr, r6, 0, 0
+ add_offset r5, r6, lr, r5, r9 // r5 = grain_lut pointer for the current block
+
+ add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx
+ add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+
+ ldr r10, [sp, #120] // type
+ adr r11, L(fgy_loop_tbl)
+
+ tst r10, #1 // bit 0 of type = y overlap; the flags stay live until the beq below (nothing in between sets flags)
+ ldr r10, [r11, r10, lsl #2] // table entry: offset of the selected loop relative to the table
+
+ add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx
+
+ add r11, r11, r10 // r11 = address of the selected loop
+
+ beq 1f
+ // y overlap
+ vdup.8 d14, d24[0] // first overlap row: weight 27 for the top grain
+ vdup.8 d15, d24[1] // and 17 for the current grain
+ mov r10, r7 // backup actual h
+ mov r7, #2 // the overlap loop processes 2 rows
+1:
+ bx r11
+endfunc
+
+function fgy_loop_neon // Row loops for fgy_32x32_8bpc_neon, entered by an indirect jump; each variant returns to the caller of fgy_32x32_8bpc_neon by popping the registers saved there.
+L(fgy_loop_tbl): // indexed by type: bit 0 = y overlap (oy), bit 1 = x overlap (ox)
+ .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
+
+.macro fgy ox, oy
+L(loop_\ox\oy):
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r9 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q2, q3}, [r6], r9 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r8], r9 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r1, :128], r2 // src
+ vld1.8 {q10, q11}, [r5], r9 // grain_lut
+
+.if \ox
+ vmull.s8 q4, d8, d24 // x overlap: old * d24 + current * d25 for the first 8 lanes
+ vmlal.s8 q4, d20, d25
+.endif
+
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24 // same x blend applied to the top row
+ vmlal.s8 q5, d4, d25
+ vqrshrn.s16 d20, q4, #5 // rounded >> 5 with saturation to 8 bit
+ vqrshrn.s16 d4, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d15 // y overlap: current * d15 + top * d14
+ vmull.s8 q5, d21, d15
+ vmull.s8 q8, d22, d15
+ vmull.s8 q9, d23, d15
+ vmlal.s8 q4, d4, d14
+ vmlal.s8 q5, d5, d14
+ vmlal.s8 q8, d6, d14
+ vmlal.s8 q9, d7, d14
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+ vqrshrn.s16 d22, q8, #5
+ vqrshrn.s16 d23, q9, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+
+ bl gather32_neon // d8-d11 = table at r3 indexed by the 32 src bytes in d0-d3
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+
+ vmovl.u8 q2, d8 // scaling
+ vmovl.u8 q3, d9
+ vmovl.u8 q4, d10 // q4 overwrites d8/d9, which were consumed just above
+ vmovl.u8 q5, d11
+
+ vmul.i16 q8, q8, q2 // scaling * grain
+ vmul.i16 q9, q9, q3
+ vmul.i16 q10, q10, q4
+ vmul.i16 q11, q11, q5
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+ vrshl.s16 q10, q10, q13
+ vrshl.s16 q11, q11, q13
+
+ vaddw.u8 q8, q8, d0 // *src + noise
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+
+ vqmovun.s16 d0, q8 // saturate to unsigned 8 bit
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+
+ vmax.u8 q0, q0, q14 // clamp to [q14, q15]
+ vmax.u8 q1, q1, q14
+ vmin.u8 q0, q0, q15
+ vmin.u8 q1, q1, q15
+
+ subs r7, r7, #1
+.if \oy
+ vdup.8 d14, d25[0] // weights for the second overlap row: 17 for top, 27 for current
+ vdup.8 d15, d25[1]
+.endif
+ vst1.8 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r10, #2 // r10 = total h saved by the entry code
+ sub r7, r10, #2 // restore actual remaining h
+ bgt L(loop_\ox\()0) // remaining rows continue in the variant without y overlap
+.endif
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+endfunc
+
+// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type);
+.macro fguv layout, sx, sy // Emit fguv_32x32_<layout>_8bpc_neon; sx/sy are 1 when chroma is subsampled horizontally/vertically. The function does the setup and jumps into a fguv_loop_sx<sx> variant.
+function fguv_32x32_\layout\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7} // 36 + 64 = 100 bytes pushed, so the stack arguments start at sp + 100
+ ldrd r4, r5, [sp, #100] // data, grain_lut
+ ldrd r6, r7, [sp, #108] // luma_row, luma_stride
+ ldrd r8, r9, [sp, #116] // offsets, h
+ ldrd r10, r11, [sp, #124] // uv, is_id
+
+ // !csfl
+ add r10, r4, r10, lsl #2 // + 4*uv
+ add r12, r10, #FGD_UV_LUMA_MULT
+ add lr, r10, #FGD_UV_MULT
+ add r10, r10, #FGD_UV_OFFSET
+ vld1.16 {d4[]}, [r12] // uv_luma_mult
+ vld1.16 {d4[2]}, [r10] // uv_offset
+ vld1.16 {d4[1]}, [lr] // uv_mult
+
+ ldr lr, [r4, #FGD_SCALING_SHIFT]
+ ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ neg lr, lr // -scaling_shift
+
+ cmp r12, #0
+ vdup.16 q13, lr // -scaling_shift
+
+ beq 1f
+ // clip
+ cmp r11, #0
+ vmov.i8 q14, #16 // q14 = lower output bound
+ vmov.i8 q15, #240 // q15 = upper output bound, used when is_id is zero
+ beq 2f
+ // is_id
+ vmov.i8 q15, #235
+ b 2f
+1:
+ // no clip
+ vmov.i8 q14, #0
+ vmov.i8 q15, #255
+2:
+
+ mov r10, #GRAIN_WIDTH // grain_lut stride
+
+ add r5, r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
+.if \sy
+ add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
+ add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r10 // grain_lut += grain_stride
+.endif
+
+ ldr r12, [r8, #8] // offsets[1][0]
+ calc_offset r12, r4, r12, \sx, \sy
+ add_offset r4, r12, r4, r5, r10 // r4 = grain_lut pointer for the "old" block (x overlap)
+
+ ldr r12, [r8, #4] // offsets[0][1]
+ calc_offset r12, lr, r12, \sx, \sy
+ add_offset lr, r12, lr, r5, r10 // lr = grain_lut pointer for the "top" block (y overlap)
+
+ ldr r12, [r8, #12] // offsets[1][1]
+ calc_offset r12, r11, r12, \sx, \sy
+ add_offset r11, r12, r11, r5, r10 // r11 = grain_lut pointer for the "top old" block
+
+ ldr r8, [r8] // offsets[0][0]
+ calc_offset r8, r12, r8, \sx, \sy
+ add_offset r5, r8, r12, r5, r10 // r5 = grain_lut pointer for the current block
+
+ add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+ add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+
+ movrel_local r12, overlap_coeffs_\sx
+ ldr lr, [sp, #132] // type
+
+ vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
+#if CONFIG_THUMB
+ // This uses movrel_local instead of adr above, because the target
+ // can be out of range for adr. But movrel_local leaves the thumb bit
+ // set on COFF (but probably wouldn't if building for thumb on ELF),
+ // thus try to clear the bit for robustness.
+ bic r12, r12, #1
+#endif
+
+ tst lr, #1 // bit 0 of type = y overlap; the flags stay live until the beq below (nothing in between sets flags)
+ ldr lr, [r12, lr, lsl #2] // table entry: offset of the selected loop relative to the table
+
+ add r12, r12, lr // r12 = address of the selected loop
+
+ beq 1f
+ // y overlap
+ sub lr, r9, #(2 >> \sy) // backup remaining h
+ mov r9, #(2 >> \sy) // the overlap loop processes 2 rows, or 1 when vertically subsampled
+
+1:
+
+.if \sy
+ vmov.i8 d6, #23 // y overlap weights for the first row: d6 applies to the top grain, d7 to the current grain
+ vmov.i8 d7, #22
+.else
+ vmov.i8 d6, #27
+ vmov.i8 d7, #17
+.endif
+
+.if \sy
+ add r7, r7, r7 // luma_stride *= 2
+.endif
+
+ bx r12
+endfunc
+.endm
+
+fguv 420, 1, 1 // emits fguv_32x32_420_8bpc_neon (sx=1, sy=1)
+fguv 422, 1, 0 // emits fguv_32x32_422_8bpc_neon (sx=1, sy=0)
+fguv 444, 0, 0 // emits fguv_32x32_444_8bpc_neon (sx=0, sy=0)
+
+function fguv_loop_sx0_neon // Row loops for the fguv entry points with sx=0 (32 pixels per row), entered by an indirect jump; all variants leave through the shared epilogue at label 9.
+L(fguv_loop_sx0_tbl): // indexed by type: bit 0 = y overlap (oy), bit 1 = x overlap (ox), bit 2 = csfl
+ .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr // remaining h saved by the entry code; lr is clobbered by the bl below
+.endif
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q8, q9}, [r8], r10 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r11], r10 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r6, :128], r7 // luma
+ vld1.8 {q10, q11}, [r5], r10 // grain_lut
+
+.if \ox
+ vmull.s8 q4, d8, d24 // x overlap: old * d24 + current * d25 for the first 8 lanes
+ vmlal.s8 q4, d20, d25
+.endif
+
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24 // same x blend applied to the top row
+ vmlal.s8 q5, d16, d25
+ vqrshrn.s16 d20, q4, #5 // rounded >> 5 with saturation to 8 bit
+ vqrshrn.s16 d16, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d7 // y overlap: current * d7 + top * d6
+ vmull.s8 q5, d21, d7
+ vmull.s8 q6, d22, d7
+ vmull.s8 q7, d23, d7
+ vmlal.s8 q4, d16, d6
+ vmlal.s8 q5, d17, d6
+ vmlal.s8 q6, d18, d6
+ vmlal.s8 q7, d19, d6
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+ vqrshrn.s16 d22, q6, #5
+ vqrshrn.s16 d23, q7, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+.if !\csfl
+ vld1.8 {q8, q9}, [r1, :128] // src
+ vmovl.u8 q4, d0 // widen luma into q4-q7
+ vmovl.u8 q5, d1
+ vmovl.u8 q6, d2
+ vmovl.u8 q7, d3
+ vmovl.u8 q0, d16 // widen src into q0, q1, q8, q9
+ vmovl.u8 q1, d17
+ vmovl.u8 q8, d18
+ vmovl.u8 q9, d19
+ vmul.i16 q4, q4, d4[0] // luma * uv_luma_mult
+ vmul.i16 q5, q5, d4[0]
+ vmul.i16 q6, q6, d4[0]
+ vmul.i16 q7, q7, d4[0]
+ vmul.i16 q0, q0, d4[1] // src * uv_mult
+ vmul.i16 q1, q1, d4[1]
+ vmul.i16 q8, q8, d4[1]
+ vmul.i16 q9, q9, d4[1]
+ vqadd.s16 q4, q4, q0 // saturating sum of the two products
+ vqadd.s16 q5, q5, q1
+ vqadd.s16 q6, q6, q8
+ vqadd.s16 q7, q7, q9
+ vdup.16 q0, d4[2] // uv_offset
+ vshr.s16 q4, q4, #6
+ vshr.s16 q5, q5, #6
+ vshr.s16 q6, q6, #6
+ vshr.s16 q7, q7, #6
+ vadd.i16 q4, q4, q0
+ vadd.i16 q5, q5, q0
+ vadd.i16 q6, q6, q0
+ vadd.i16 q7, q7, q0
+ vqmovun.s16 d0, q4 // saturate to unsigned 8 bit: the table indices for the gather
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6
+ vqmovun.s16 d3, q7
+.endif
+
+ bl gather32_neon // d8-d11 = table at r3 indexed by d0-d3 (the luma bytes themselves when csfl is set)
+
+ vld1.8 {q0, q1}, [r1, :128], r2 // src
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10 // q4 overwrites d8/d9, which were consumed just above
+ vmovl.u8 q5, d11
+
+ vmul.i16 q8, q8, q6 // scaling * grain
+ vmul.i16 q9, q9, q7
+ vmul.i16 q10, q10, q4
+ vmul.i16 q11, q11, q5
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+ vrshl.s16 q10, q10, q13
+ vrshl.s16 q11, q11, q13
+
+ vaddw.u8 q8, q8, d0 // *src + noise
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+
+ vqmovun.s16 d0, q8 // saturate to unsigned 8 bit
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+
+ vmax.u8 q0, q0, q14 // clamp to [q14, q15]
+ vmax.u8 q1, q1, q14
+ vmin.u8 q0, q0, q15
+ vmin.u8 q1, q1, q15
+
+ subs r9, r9, #1
+.if \oy
+ vdup.8 d6, d25[0] // weights for the second overlap row: 17 for top, 27 for current
+ vdup.8 d7, d25[1]
+.endif
+
+ vst1.8 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) // remaining rows continue in the variant without y overlap
+.endif
+ b 9f
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9: // shared epilogue: restores the registers pushed by the fguv entry point and returns to its caller
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function fguv_loop_sx1_neon // Row loops for the fguv entry points with sx=1 (16 chroma pixels per row, 32 luma pixels read), entered by an indirect jump; all variants leave through the shared epilogue at label 9.
+L(fguv_loop_sx1_tbl): // indexed by type: bit 0 = y overlap (oy), bit 1 = x overlap (ox), bit 2 = csfl
+ .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr // remaining h saved by the entry code; lr is clobbered by the bl below
+.endif
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q8}, [r8], r10 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r11], r10 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r6, :128], r7 // luma
+ vld1.8 {q10}, [r5], r10 // grain_lut
+ vld1.8 {q11}, [r1, :128], r2 // src
+
+.if \ox
+ vmull.s8 q4, d8, d24 // x overlap: old * d24 + current * d25 for the first 8 lanes
+ vmlal.s8 q4, d20, d25
+.endif
+
+ vpaddl.u8 q0, q0 // sum horizontal luma pairs, widening to 16 bit
+ vpaddl.u8 q1, q1
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24 // same x blend applied to the top row
+ vmlal.s8 q5, d16, d25
+ vqrshrn.s16 d20, q4, #5 // rounded >> 5 with saturation to 8 bit
+ vqrshrn.s16 d16, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d7 // y overlap: current * d7 + top * d6
+ vmull.s8 q5, d21, d7
+ vmlal.s8 q4, d16, d6
+ vmlal.s8 q5, d17, d6
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+.if \csfl
+ vrshrn.u16 d0, q0, #1 // rounded average of each luma pair: used directly as the table indices
+ vrshrn.u16 d1, q1, #1
+.else
+ vrshr.u16 q4, q0, #1 // rounded average of each luma pair, kept at 16 bit
+ vrshr.u16 q5, q1, #1
+ vmovl.u8 q0, d22 // widen src
+ vmovl.u8 q1, d23
+ vmul.i16 q4, q4, d4[0] // luma * uv_luma_mult
+ vmul.i16 q5, q5, d4[0]
+ vmul.i16 q0, q0, d4[1] // src * uv_mult
+ vmul.i16 q1, q1, d4[1]
+ vqadd.s16 q4, q4, q0 // saturating sum of the two products
+ vqadd.s16 q5, q5, q1
+ vdup.16 q0, d4[2] // uv_offset
+ vshr.s16 q4, q4, #6
+ vshr.s16 q5, q5, #6
+ vadd.i16 q4, q4, q0
+ vadd.i16 q5, q5, q0
+ vqmovun.s16 d0, q4 // saturate to unsigned 8 bit: the table indices for the gather
+ vqmovun.s16 d1, q5
+.endif
+
+ bl gather16_neon // d8-d9 = table at r3 indexed by d0-d1
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+
+ vmul.i16 q8, q8, q6 // scaling * grain
+ vmul.i16 q9, q9, q7
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+
+ vaddw.u8 q8, q8, d22 // *src + noise
+ vaddw.u8 q9, q9, d23
+
+ vqmovun.s16 d0, q8 // saturate to unsigned 8 bit
+ vqmovun.s16 d1, q9
+
+ vmax.u8 q0, q0, q14 // clamp to [q14, q15]
+ vmin.u8 q0, q0, q15
+
+ subs r9, r9, #1
+.if \oy
+ vswp d6, d7 // swap the top/current weights for the second overlap row (only reached when 2 overlap rows are processed)
+.endif
+ vst1.8 {q0}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) // remaining rows continue in the variant without y overlap
+.endif
+
+ b 9f
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9: // shared epilogue: restores the registers pushed by the fguv entry point and returns to its caller
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/filmgrain16.S b/third_party/dav1d/src/arm/32/filmgrain16.S
new file mode 100644
index 0000000000000..6c36caceae5a5
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/filmgrain16.S
@@ -0,0 +1,2137 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1 // advance the random state in r2 by steps bits at once; clobbers r11, r12 and lr
+ lsr r11, r2, #3 // r >> 3
+ lsr r12, r2, #12 // r >> 12
+ lsr lr, r2, #1 // r >> 1
+ eor r11, r2, r11 // (r >> 0) ^ (r >> 3)
+ eor r12, r12, lr // (r >> 12) ^ (r >> 1)
+ eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr r2, r2, #\steps // shift=1: drop the lowest steps bits of the state
+.endif
+ and r11, r11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr r2, r2, r11, lsl #(16 - \steps) // *state
+.else
+ orr r2, r2, r11, lsl #16 // *state, left unshifted: the new bits sit above bit 15 and are shifted in later
+.endif
+.endm
+
+.macro read_rand dest, bits, age // dest = bits-wide field of the state in r2, skipping the age newest bits below bit 16
+ ubfx \dest, r2, #16 - \bits - \age, #\bits // unsigned extract starting at bit (16 - bits - age)
+.endm
+
+.macro read_shift_rand dest, bits // read a bits-wide field from the state in r2, then shift the state down by one bit
+ ubfx \dest, r2, #17 - \bits, #\bits // field starts at bit (17 - bits)
+ lsr r2, r2, #1 // consume one bit of state
+.endm
+
+// special calling convention:
+// r2 holds seed (updated in place by two increment_seed 4 steps)
+// r3 holds dav1d_gaussian_sequence
+// clobbers r11-r12 (r5-r6 and lr are saved and restored here)
+// returns in d0-d1: eight 16-bit table entries, one per random 11-bit index
+function get_gaussian_neon
+ push {r5-r6,lr}
+ increment_seed 4 // produce four new state bits; indices for entries 0-3 follow
+ read_rand r5, 11, 3 // index for entry 0
+ read_rand r6, 11, 2 // index for entry 1
+ add r5, r3, r5, lsl #1 // table entries are 16 bits wide, hence index * 2
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[0]}, [r5]
+ read_rand r5, 11, 1 // index for entry 2
+ vld1.16 {d0[1]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 0 // index for entry 3 (read before the state advances again)
+ increment_seed 4 // four more state bits, for entries 4-7
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[2]}, [r5]
+ read_rand r5, 11, 3 // index for entry 4
+ vld1.16 {d0[3]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 2 // index for entry 5
+ vld1.16 {d1[0]}, [r5]
+ add r6, r3, r6, lsl #1
+ read_rand r5, 11, 1 // index for entry 6
+ vld1.16 {d1[1]}, [r6]
+ read_rand r6, 11, 0 // index for entry 7
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d1[2]}, [r5]
+ vld1.16 {d1[3]}, [r6]
+ pop {r5-r6,pc}
+endfunc
+
+function get_grain_2_neon // returns two scaled table entries in d0[0..1]; r2 = seed, r3 = table; clobbers r12
+ push {r11,lr}
+ increment_seed 2 // two new state bits, one per entry
+ read_rand r11, 11, 1 // index for entry 0
+ read_rand r12, 11, 0 // index for entry 1
+ add r11, r3, r11, lsl #1 // 16-bit table entries, hence index * 2
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30 // rounding shift by the per-lane counts in d30 (low half of q15)
+ pop {r11,pc}
+endfunc
+
+.macro get_grain_2 dst // call get_grain_2_neon and leave its result in dst
+ bl get_grain_2_neon
+.ifnc \dst, d0
+ vmov \dst, d0 // the helper returns in d0; copy only when dst is a different register
+.endif
+.endm
+
+function get_grain_4_neon // returns four scaled table entries in d0; r2 = seed, r3 = table; clobbers r12
+ push {r11,lr}
+ increment_seed 4 // four new state bits, one per entry
+ read_rand r11, 11, 3 // index for entry 0
+ read_rand r12, 11, 2 // index for entry 1
+ add r11, r3, r11, lsl #1 // 16-bit table entries, hence index * 2
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ read_rand r11, 11, 1 // index for entry 2
+ vld1.16 {d0[1]}, [r12]
+ read_rand r12, 11, 0 // index for entry 3
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[2]}, [r11]
+ vld1.16 {d0[3]}, [r12]
+ vrshl.s16 d0, d0, d30 // rounding shift by the per-lane counts in d30 (low half of q15)
+ pop {r11,pc}
+endfunc
+
+.macro get_grain_4 dst // call get_grain_4_neon and leave its result in dst
+ bl get_grain_4_neon
+.ifnc \dst, d0
+ vmov \dst, d0 // the helper returns in d0; copy only when dst is a different register
+.endif
+.endm
+
+// r1 holds the number of entries to produce
+// r6 holds the previous output entry (lag 2 also keeps one in r8, lag 3 keeps them in r8 and r10)
+// q0 holds the vector of produced entries
+// q1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon
+ push {r0, lr}
+.if \n == 1
+ mvn lr, r5 // grain_min = ~grain_max
+.else
+ mov r0, #1
+ mov lr, #1
+ sub r7, r7, #1 // temporarily turn r7/r9 into (shift - 1)
+ sub r9, r9, #1
+ lsl r0, r0, r7 // r0 = 1 << (r7 - 1), rounding term for the coefficient sum
+ lsl lr, lr, r9 // lr = 1 << (r9 - 1), rounding term for the table value
+ add r7, r7, #1 // restore the shift amounts
+ add r9, r9, #1
+.endif
+1:
+ read_shift_rand r12, 11 // 11-bit table index; advances the state by one bit
+ vmov.32 r11, d2[0] // next 32-bit sum from above
+ lsl r12, r12, #1 // index * 2: byte offset of a 16-bit entry
+ vext.8 q0, q0, q0, #2 // rotate the output vector down by one 16-bit lane
+ ldrsh r12, [r3, r12] // signed table entry
+.if \n == 1
+ mla r11, r6, r4, r11 // sum (above) + *coeff * prev output
+ add r6, r11, r8 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, r10 // add the rounding term the caller keeps in r10
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ cmp r6, r5
+.elseif \n == 2
+ mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r6, r10, r11 // += *coeff * prev output 2
+ mov r8, r6 // shift the output history
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr} // keep the rounding term; lr is reused as grain_min below
+ cmp r6, r5
+ mvn lr, r5 // grain_min = ~grain_max
+.else
+ push {r1-r3} // borrow r1-r3 to unpack the three coefficients packed in r4
+ sbfx r1, r4, #0, #8
+ sbfx r2, r4, #8, #8
+ sbfx r3, r4, #16, #8
+ mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r8, r2, r11 // += *coeff * prev output 2
+ mla r11, r6, r3, r11 // += *coeff * prev output 3
+ pop {r1-r3}
+ mov r10, r8 // shift the output history
+ mov r8, r6
+
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr} // keep the rounding term; lr is reused as grain_min below
+ cmp r6, r5
+ mvn lr, r5 // grain_min = ~grain_max
+.endif
+ it gt
+ movgt r6, r5 // clamp to grain_max
+ cmp r6, lr
+ it lt
+ movlt r6, lr // clamp to grain_min
+.if \n >= 2
+ pop {lr} // restore the rounding term for the next iteration
+.endif
+ subs r1, r1, #1
+ vext.8 q1, q1, q1, #4 // rotate the next 32-bit sum into lane 0
+ vmov.16 d1[3], r6 // insert the new entry as the top lane of q0
+ bgt 1b
+ pop {r0, pc}
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+function sum_lag1_above_neon // q2/q3 = 32-bit weighted sums of the three neighbours in the row above, for 8 entries at r0
+ sub r12, r0, #1*GRAIN_WIDTH*2 - 16 // one row up, 8 entries (16 bytes) ahead of r0
+ vld1.16 {q10}, [r12] // load top right
+
+ vext.8 q0, q8, q9, #14 // top left, top mid
+ vext.8 q1, q9, q10, #2 // top mid, top right
+
+ vmull.s16 q2, d18, d28 // entries 0-3: row above at x, times d28
+ vmlal.s16 q2, d0, d27 // + row above at x-1, times d27
+ vmlal.s16 q2, d2, d29 // + row above at x+1, times d29
+ vmull.s16 q3, d19, d28 // entries 4-7, same weights
+ vmlal.s16 q3, d1, d27
+ vmlal.s16 q3, d3, d29
+
+ vmov q8, q9 // slide the row-above window for the next 8-entry block
+ vmov q9, q10
+
+ bx lr
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff // shared body of the sum_TYPE_LAG_EDGE_neon functions; the enclosing function has already pushed {r1, lr}
+.ifc \lag\()_\edge, lag3_left
+ bl sum_lag3_left_above_neon
+.else
+ bl sum_\lag\()_above_neon // q2/q3 = 32-bit sums over the rows above
+.endif
+.ifc \type, uv_420
+ vpush {q6-q7}
+ add r12, r11, #GRAIN_WIDTH*2 // the row below the one at r11
+ vld1.16 {q0, q1}, [r11]! // 16 entries from the first row
+ vld1.16 {q6, q7}, [r12]! // 16 entries from the second row
+ vpadd.i16 d0, d0, d1 // add horizontally adjacent pairs
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d12, d12, d13
+ vpadd.i16 d13, d14, d15
+ vadd.i16 q0, q0, q6 // add the two rows
+ vpop {q6-q7}
+ vrshr.s16 q0, q0, #2 // rounded average of each 2x2 group
+.endif
+.ifc \type, uv_422
+ vld1.16 {q0, q1}, [r11]! // 16 entries from one row
+ vpadd.i16 d0, d0, d1 // add horizontally adjacent pairs
+ vpadd.i16 d1, d2, d3
+ vrshr.s16 q0, q0, #1 // rounded average of each pair
+.endif
+.ifc \type, uv_444
+ vld1.16 {q0}, [r11]! // 8 entries, used as they are
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ vdup.8 d13, \uv_coeff // broadcast the 8-bit coefficient...
+ vmovl.s8 q6, d13 // ...and sign-extend it to 16 bits (d13 is the upper half of q6)
+.endif
+ vmlal.s16 q2, d0, d13 // add the weighted value read via r11 to the sums
+ vmlal.s16 q3, d1, d13
+.endif
+.if \uv_layout && \elems == 8
+ b sum_\lag\()_y_\edge\()_start // full 8-entry chroma blocks reuse the tail emitted for y
+.elseif \uv_layout == 444 && \elems == 7
+ b sum_\lag\()_y_\edge\()_start // 444 right edge reuses the y right-edge tail
+.elseif \uv_layout == 422 && \elems == 1
+ b sum_\lag\()_uv_420_\edge\()_start // 422 right edge reuses the 420 right-edge tail
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+ push {r11}
+.if \elems > 4
+.ifc \edge, left
+ increment_seed 4
+ read_rand r11, 11, 3 // the first three entries of the row are plain table values
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d1[1]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d1[2]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d1[3]}, [r11]
+ lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ vrshl.s16 d1, d1, d30 // rounding shift by the per-lane counts in d30
+ vext.8 q2, q2, q2, #12 // rotate so the sum for the fourth entry is in lane 0
+.ifc \lag, lag3
+ vmov.s16 r10, d1[1] // seed the output history with the plain entries
+.endif
+.ifnc \lag, lag1
+ vmov.s16 r8, d1[2]
+.endif
+ vmov.s16 r6, d1[3]
+
+ vmov q1, q2
+ mov r1, #1 // only the fourth entry of this half is filtered
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #4 // entries 0-3
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ vmov q1, q3
+.ifc \edge, right
+ mov r1, #3 // entries 4-6 are filtered
+ bl output_\lag\()_neon
+ read_shift_rand r12, 11 // entry 7 is a plain table value
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r12]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #2 // append it after the seven filtered entries
+.else
+ mov r1, #4 // entries 4-7
+ bl output_\lag\()_neon
+.endif
+.else
+ // elems == 1
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #1
+ bl output_\lag\()_neon
+ lsr r2, r2, #3 // output_lag consumed one bit; shift out the other three of this step
+
+ read_rand r11, 11, 2 // three plain table values follow the single filtered entry
+ read_rand r12, 11, 1
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ read_rand r11, 11, 0
+ vld1.16 {d2[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[2]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #14 // lane 0 = filtered entry, lanes 1-3 = plain values
+.endif
+ vst1.16 {q0}, [r0]! // store 8 entries and advance the output pointer
+ pop {r11}
+ pop {r1, pc} // matches the push {r1, lr} done by the enclosing function
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=8 // emit sum_TYPE_lag1_EDGE_neon, which produces one 8-entry block at r0
+function sum_\type\()_lag1_\edge\()_neon
+ push {r1, lr} // popped again at the end of sum_lag_n_body
+.ifc \edge, left
+ sub r12, r0, #1*GRAIN_WIDTH*2 // same position, one row up
+ vld1.8 {q9}, [r12] // load the previous block right above
+.endif
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 7
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 7
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 1
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 1
+
+
+function sum_lag2_above_neon // q2/q3 = 32-bit weighted sums over the two rows above (x-2..x+2 each), 8-bit weights in d28/d29
+ push {lr}
+ sub r12, r0, #2*GRAIN_WIDTH*2 - 16 // two rows up, 8 entries ahead of r0
+ sub lr, r0, #1*GRAIN_WIDTH*2 - 16 // one row up, 8 entries ahead of r0
+ vld1.16 {q10}, [r12] // load top right
+ vld1.16 {q13}, [lr]
+
+ vdup.8 d10, d28[0]
+ vext.8 q0, q8, q9, #12 // top left, top mid
+ vdup.8 d12, d28[1]
+ vext.8 q1, q8, q9, #14
+ vdup.8 d14, d28[3]
+ vext.8 q4, q9, q10, #2 // top mid, top right
+ vmovl.s8 q5, d10 // sign-extend the broadcast weights to 16 bits
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+
+ vmull.s16 q2, d0, d10 // row -2, x-2
+ vmlal.s16 q2, d2, d12 // row -2, x-1
+ vmlal.s16 q2, d8, d14 // row -2, x+1
+ vmull.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d9, d14
+
+ vdup.8 d10, d28[4]
+ vext.8 q0, q9, q10, #4 // top mid, top right
+ vdup.8 d12, d28[5]
+ vext.8 q1, q11, q12, #12 // top left, top mid
+ vdup.8 d14, d28[6]
+ vext.8 q4, q11, q12, #14
+ vmovl.s8 q5, d10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+
+ vmlal.s16 q2, d0, d10 // row -2, x+2
+ vmlal.s16 q2, d2, d12 // row -1, x-2
+ vmlal.s16 q2, d8, d14 // row -1, x-1
+ vmlal.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d9, d14
+
+ vdup.8 d10, d29[0]
+ vext.8 q0, q12, q13, #2 // top mid, top right
+ vdup.8 d12, d29[1]
+ vext.8 q1, q12, q13, #4
+
+ vdup.8 d14, d28[2]
+ vdup.8 d8, d28[7]
+
+ vmovl.s8 q5, d10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q4, d8
+
+ vmlal.s16 q2, d0, d10 // row -1, x+1
+ vmlal.s16 q2, d2, d12 // row -1, x+2
+ vmlal.s16 q2, d18, d14 // row -2, x
+ vmlal.s16 q2, d24, d8 // row -1, x
+ vmlal.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d19, d14
+ vmlal.s16 q3, d25, d8
+
+ vmov q8, q9 // slide the row -2 window for the next block
+ vmov q9, q10
+
+ vmov q11, q12 // slide the row -1 window for the next block
+ vmov q12, q13
+
+ pop {pc}
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=8 // emit sum_TYPE_lag2_EDGE_neon, which produces one 8-entry block at r0
+function sum_\type\()_lag2_\edge\()_neon
+ push {r1, lr} // popped again at the end of sum_lag_n_body
+.ifc \edge, left
+ sub r12, r0, #2*GRAIN_WIDTH*2 // same position, two rows up
+ sub lr, r0, #1*GRAIN_WIDTH*2 // same position, one row up
+ vld1.16 {q9}, [r12] // load the previous block right above
+ vld1.16 {q12}, [lr]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 7
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 7
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 1
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 1
+
+
+function sum_lag3_left_above_neon
+ // A separate codepath for the left edge, to avoid reading outside
+ // of the edge of the buffer.
+ sub r12, r0, #3*GRAIN_WIDTH*2 // three rows up, without the 3-entry offset to the left
+ vld1.8 {q11, q12}, [r12]
+ vext.8 q12, q11, q12, #10 // q12 = entries 5-12, as the common path expects
+ vext.8 q11, q11, q11, #10 // q11 = rotated: lanes 3-7 hold entries 0-4, lanes 0-2 are filler
+ b sum_lag3_above_start
+endfunc
+
+function sum_lag3_above_neon // q2/q3 = 32-bit weighted sums over the three rows above (x-3..x+3 each), 8-bit weights in d26-d28
+ movw r12, #(3*GRAIN_WIDTH + 3)*2
+ sub r12, r0, r12 // three rows up, three entries to the left
+ vld1.8 {q11, q12}, [r12]
+
+sum_lag3_above_start:
+ vdup.8 d12, d26[0]
+ vext.8 q1, q11, q12, #2 // row -3, x-2
+ vdup.8 d14, d26[1]
+ vext.8 q4, q11, q12, #4 // row -3, x-1
+ vdup.8 d16, d26[2]
+ vext.8 q5, q11, q12, #6 // row -3, x
+ vdup.8 d18, d26[3]
+ vmovl.s8 q6, d12 // sign-extend the broadcast weights to 16 bits
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ movw r12, #(2*GRAIN_WIDTH + 3)*2
+ sub r12, r0, r12 // two rows up, three entries to the left
+
+ vmull.s16 q2, d22, d12 // row -3, x-3 (unshifted load)
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmull.s16 q3, d23, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d26[4]
+ vext.8 q0, q11, q12, #8 // row -3, x+1
+ vdup.8 d14, d26[5]
+ vext.8 q1, q11, q12, #10 // row -3, x+2
+ vdup.8 d16, d26[6]
+ vext.8 q4, q11, q12, #12 // row -3, x+3
+ vld1.8 {q11, q12}, [r12] // q11/q12 now hold row -2
+ vdup.8 d18, d26[7]
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d22, d18 // row -2, x-3 (unshifted load)
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d23, d18
+
+ vdup.8 d12, d27[0]
+ vext.8 q0, q11, q12, #2 // row -2, x-2
+ vdup.8 d14, d27[1]
+ vext.8 q1, q11, q12, #4 // row -2, x-1
+ vdup.8 d16, d27[2]
+ vext.8 q4, q11, q12, #6 // row -2, x
+ vdup.8 d18, d27[3]
+ vext.8 q5, q11, q12, #8 // row -2, x+1
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ sub r12, r0, #(1*GRAIN_WIDTH + 3)*2 // one row up, three entries to the left
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d27[4]
+ vext.8 q0, q11, q12, #10 // row -2, x+2
+ vdup.8 d14, d27[5]
+ vext.8 q1, q11, q12, #12 // row -2, x+3
+ vld1.8 {q11, q12}, [r12] // q11/q12 now hold row -1
+ vdup.8 d16, d27[6]
+ vdup.8 d18, d27[7]
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vext.8 q5, q11, q12, #2 // row -1, x-2
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d22, d16 // row -1, x-3 (unshifted load)
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d23, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d28[0]
+ vext.8 q0, q11, q12, #4 // row -1, x-1
+ vdup.8 d14, d28[1]
+ vext.8 q1, q11, q12, #6 // row -1, x
+ vdup.8 d16, d28[2]
+ vext.8 q4, q11, q12, #8 // row -1, x+1
+ vdup.8 d18, d28[3]
+ vext.8 q5, q11, q12, #10 // row -1, x+2
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d28[4]
+ vext.8 q0, q11, q12, #12 // row -1, x+3
+ vmovl.s8 q6, d12
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q3, d1, d12
+
+ bx lr
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=8 // emit sum_TYPE_lag3_EDGE_neon, which produces one 8-entry block at r0
+function sum_\type\()_lag3_\edge\()_neon
+ push {r1, lr} // popped again at the end of sum_lag_n_body
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 7
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 7
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 1
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 1
+
+function generate_grain_rows_neon // write r1 rows of 82 plain (unfiltered) grain entries at r0; r2 = seed, r3 = table, q15 = shift
+ push {r10-r11,lr}
+1:
+ mov r10, #80 // 80 entries per row come in groups of 8
+2:
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15 // rounding shift by the per-lane counts in q15
+ subs r10, r10, #8
+ vst1.16 {q0}, [r0]!
+ bgt 2b
+ get_grain_2 d0 // the last 2 entries of the row
+ subs r1, r1, #1
+ vst1.32 {d0[0]}, [r0]! // one 32-bit lane = two 16-bit entries
+ bgt 1b
+ pop {r10-r11,pc}
+endfunc
+
+function generate_grain_rows_44_neon // write r1 rows of 44 plain grain entries at r0, keeping the GRAIN_WIDTH row stride
+ push {r10-r11,lr}
+1:
+ mov r10, #40 // 40 entries per row come in groups of 8
+2:
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15 // rounding shift by the per-lane counts in q15
+ subs r10, r10, #8
+ vst1.16 {q0}, [r0]!
+ bgt 2b
+ get_grain_4 d0 // the last 4 entries of the row
+ subs r1, r1, #1
+ vst1.16 {d0}, [r0] // no writeback here
+ add r0, r0, #GRAIN_WIDTH*2-80 // 80 bytes were already stepped over; move to the next row
+ bgt 1b
+ pop {r10-r11,pc}
+endfunc
+
+function gen_grain_uv_444_lag0_neon // store 8 entries at r0: clamp(new grain + round(masked input from r11 * coefficient))
+ vld1.16 {q3}, [r11]! // 8 input entries read via r11
+gen_grain_uv_lag0_8_start: // shared entry: q3 = 8 input values, fresh grain still to be generated
+ push {r11,lr}
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15 // rounding shift by the per-lane counts in q15
+gen_grain_uv_lag0_8_add: // shared entry: q0 = grain, q3 = input values, {r11, lr} already pushed
+ vand q3, q3, q1 // zero the lanes disabled by the mask in q1
+ vmull.s16 q2, d6, d22 // input * coefficient, widened to 32 bits
+ vmull.s16 q3, d7, d22
+ vrshl.s32 q2, q2, q12 // rounding shift by the per-lane counts in q12
+ vrshl.s32 q3, q3, q12
+ vqmovn.s32 d4, q2 // saturating narrow back to 16 bits
+ vqmovn.s32 d5, q3
+ vqadd.s16 q2, q2, q0 // add the grain
+ vmin.s16 q2, q2, q9 // clamp from above by q9
+ vmax.s16 q2, q2, q10 // clamp from below by q10
+ vst1.16 {q2}, [r0]!
+ pop {r11,pc}
+endfunc
+
+function gen_grain_uv_420_lag0_8_neon // q3 = rounded 2x2 averages of 16x2 input entries, then continue in the shared lag0 code
+ add r12, r11, #GRAIN_WIDTH*2 // the row below the one at r11
+ vld1.16 {q2,q3}, [r11]! // 16 entries from the first row
+ vld1.16 {q4,q5}, [r12] // 16 entries from the second row
+ vpadd.i16 d4, d4, d5 // add horizontally adjacent pairs
+ vpadd.i16 d5, d6, d7
+ vpadd.i16 d8, d8, d9
+ vpadd.i16 d9, d10, d11
+ vadd.i16 q2, q2, q4 // add the two rows
+ vrshr.s16 q3, q2, #2 // rounded divide by 4
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_422_lag0_8_neon // q3 = rounded averages of horizontal pairs of 16 input entries, then the shared lag0 code
+ vld1.16 {q2,q3}, [r11]! // 16 entries from one row
+ vpadd.i16 d4, d4, d5 // add horizontally adjacent pairs
+ vpadd.i16 d5, d6, d7
+ vrshr.s16 q3, q2, #1 // rounded divide by 2
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_420_lag0_4_neon // 4-entry variant for the end of a 44-wide row: d6 = 2x2 averages, d0 = 4 new grain entries
+ add r12, r11, #GRAIN_WIDTH*2 // the row below the one at r11
+ vld1.16 {q2}, [r11] // 8 entries from the first row
+ vld1.16 {q0}, [r12] // 8 entries from the second row
+ add r11, r11, #32 // advance by the same 32 bytes as the 8-entry variant
+ vpadd.i16 d4, d4, d5 // add horizontally adjacent pairs
+ vpadd.i16 d0, d0, d1
+ vadd.i16 d4, d4, d0 // add the two rows
+ vrshr.s16 d6, d4, #2 // rounded divide by 4
+ push {r11,lr} // matches the pop at the end of the shared code
+ get_grain_4 d0
+ b gen_grain_uv_lag0_8_add
+endfunc
+
+function gen_grain_uv_422_lag0_4_neon // 4-entry variant for the end of a 44-wide row: d6 = pair averages, d0 = 4 new grain entries
+ vld1.16 {q2}, [r11] // 8 entries from one row
+ add r11, r11, #32 // advance by the same 32 bytes as the 8-entry variant
+ vpadd.i16 d4, d4, d5 // add horizontally adjacent pairs
+ vrshr.s16 d6, d4, #1 // rounded divide by 2
+ push {r11,lr} // matches the pop at the end of the shared code
+ get_grain_4 d0
+ b gen_grain_uv_lag0_8_add
+endfunc
+
+.macro gen_grain_82 type // emit generate_grain_TYPE_16bpc_neon for the 82-wide grain layouts (y and uv_444)
+function generate_grain_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+
+.ifc \type, uv_444
+ ldr r4, [sp, #36] // first stack argument (9 registers = 36 bytes were pushed); its clz gives the bit depth below
+ mov r12, r3 // fourth argument, scaled by 28 below into an offset into the uv coefficients
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH*2 // r11 = second argument + 3 rows; read as input by the sum/lag0 helpers
+ mov r1, r2 // r1 = third argument, the block the FGD_* fields are read from
+ mul r12, r12, lr
+ clz lr, r4
+.else
+ clz lr, r2 // y: the third argument supplies the bit depth
+.endif
+ movrel r3, X(gaussian_sequence)
+ sub lr, lr, #24 // -bitdepth_min_8
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add r4, r1, #FGD_AR_COEFFS_Y
+.else
+ add r4, r1, #FGD_AR_COEFFS_UV
+.endif
+ add r9, r9, lr // grain_scale_shift - bitdepth_min_8
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2] // table entry selected by ar_coeff_lag
+ vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ add r5, r5, r6 // r5 = address of the per-lag code path
+ vneg.s16 q15, q15 // negated, so vrshl with q15 shifts right
+
+.ifc \type, uv_444
+ push {lr}
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid armv8 deprecated it instruction forms.
+ it eq
+ moveq r10, lr // offset 0 selects 0xb524, otherwise 0x49d8
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10 // mix the per-plane constant into the seed
+ pop {lr}
+.endif
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ neg lr, lr // bitdepth_min_8
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift - 1)
+
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, y
+ mov r1, #GRAIN_HEIGHT // y with lag 0: every row is plain grain
+ bl generate_grain_rows_neon
+.else
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ mvn r6, r5 // grain_min = ~grain_max
+
+ mov r1, #3 // the first three rows are plain grain
+ bl generate_grain_rows_neon
+ mov r1, #GRAIN_HEIGHT-3
+
+ vdup.32 q12, r7 // ar_coeff_shift in every 32-bit lane
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vdup.16 q9, r5 // grain_max
+ vdup.16 q10, r6 // grain_min
+ vext.8 q13, q0, q1, #10 // mask for the first block: lanes 0-2 off, lanes 3-7 on
+ vext.8 q14, q1, q0, #2 // mask for the last block: lanes 0-6 on, lane 7 off
+ vneg.s32 q12, q12 // negated, so vrshl with q12 shifts right
+ vmovl.s8 q11, d22 // sign-extend the coefficient to 16 bits
+
+1:
+ vmov q1, q13
+ bl gen_grain_uv_444_lag0_neon // 8
+ vmov.i8 q1, #255
+ bl gen_grain_uv_444_lag0_neon // 16
+ bl gen_grain_uv_444_lag0_neon // 24
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 40
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 56
+ bl gen_grain_uv_444_lag0_neon // 64
+ bl gen_grain_uv_444_lag0_neon // 72
+ vmov q1, q14
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 d16 // the last 2 entries of the row are plain grain
+ subs r1, r1, #1
+ add r11, r11, #4 // skip the matching 2 input entries
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb r4, [r4, #1] // ar_coeffs_y[3]
+.else
+ add r4, r4, #2
+.endif
+
+ mov r1, #3 // the first three rows are plain grain
+.ifc \type, uv_444
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+ vmovl.s8 q13, d27 // sign-extend the three row-above coefficients to 16 bits
+ vmovl.s8 q12, d29
+ vmovl.s8 q14, d28
+ vmov d29, d24 // widened coefficient 2 goes back into d29, next to d27/d28
+.ifc \type, uv_444
+ vmovl.s8 q6, d13 // widened ar_coeffs_uv[4]; sum_lag_n_body reads it from d13
+.endif
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_mid_neon // 48
+ bl sum_\type\()_lag1_mid_neon // 56
+ bl sum_\type\()_lag1_mid_neon // 64
+ bl sum_\type\()_lag1_mid_neon // 72
+ bl sum_\type\()_lag1_right_neon // 80
+ get_grain_2 d16 // the last 2 entries of the row are plain grain
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4 // skip the matching 2 input entries
+.endif
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2] // coefficient 10, used by output_lag2 for the older previous output
+ vmov.s8 r10, d29[3] // coefficient 11, used by output_lag2 for the newest previous output
+
+ mov r1, #3 // the first three rows are plain grain
+ bl generate_grain_rows_neon
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_mid_neon // 48
+ bl sum_\type\()_lag2_mid_neon // 56
+ bl sum_\type\()_lag2_mid_neon // 64
+ bl sum_\type\()_lag2_mid_neon // 72
+ bl sum_\type\()_lag2_right_neon // 80
+ get_grain_2 d16 // the last 2 entries of the row are plain grain
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4 // skip the matching 2 input entries
+.endif
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5] // coefficients 21-23 are packed into r4, one byte each,
+ vmov.u8 r10, d28[6] // and unpacked again with sbfx in output_lag3
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3 // the first three rows are plain grain
+ vpush {d26} // NOTE(review): no visible callee writes d26, so this save/restore looks purely defensive
+ bl generate_grain_rows_neon
+ vpop {d26}
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_mid_neon // 48
+ bl sum_\type\()_lag3_mid_neon // 56
+ bl sum_\type\()_lag3_mid_neon // 64
+ bl sum_\type\()_lag3_mid_neon // 72
+ bl sum_\type\()_lag3_right_neon // 80
+ get_grain_2 d16 // the last 2 entries of the row are plain grain
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4 // skip the matching 2 input entries
+.endif
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type // dst = number of rows left after the first three plain rows
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3 // uv_420 uses the shorter SUB_GRAIN_HEIGHT
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type // after six 32-byte reads, move reg to the start of its next input row
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) // uv_420 consumes two input rows per output row
+.else
+ sub \reg, \reg, #6*32-GRAIN_WIDTH*2 // otherwise one row; written as sub since 6*32 exceeds GRAIN_WIDTH*2
+.endif
+.endm
+
+.macro gen_grain_44 type // emit generate_grain_TYPE_16bpc_neon for the 44-wide grain layouts (uv_420 and uv_422)
+function generate_grain_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+
+ ldr r4, [sp, #36] // first stack argument (9 registers = 36 bytes were pushed); its clz gives the bit depth below
+ mov r12, r3 // fourth argument, scaled by 28 below into an offset into the uv coefficients
+ movw r11, #(3*GRAIN_WIDTH-3)*2
+ mov lr, #28
+ add r11, r1, r11 // r11 = second argument + 3 rows - 3 entries; read as input by the helpers
+ mov r1, r2 // r1 = third argument, the block the FGD_* fields are read from
+ mul r12, r12, lr
+ clz lr, r4
+
+ movrel r3, X(gaussian_sequence)
+ sub lr, lr, #24 // -bitdepth_min_8
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+ add r4, r1, #FGD_AR_COEFFS_UV
+ add r9, r9, lr // grain_scale_shift - bitdepth_min_8
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2] // table entry selected by ar_coeff_lag
+ vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ add r5, r5, r6 // r5 = address of the per-lag code path
+ vneg.s16 q15, q15 // negated, so vrshl with q15 shifts right
+
+ push {lr}
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid armv8 deprecated it instruction forms.
+ it eq
+ moveq r10, lr // offset 0 selects 0xb524, otherwise 0x49d8
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10 // mix the per-plane constant into the seed
+ pop {lr}
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ neg lr, lr // bitdepth_min_8
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift - 1)
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, uv_420
+ vpush {q4-q5} // the uv_420 lag0 helper uses q4-q5 as scratch
+.endif
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ mvn r6, r5 // grain_min = ~grain_max
+
+ mov r1, #3 // the first three rows are plain grain
+ bl generate_grain_rows_44_neon
+ set_height r1, \type
+
+ vdup.32 q12, r7 // ar_coeff_shift in every 32-bit lane
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vdup.16 q9, r5 // grain_max
+ vdup.16 q10, r6 // grain_min
+ vext.8 q13, q0, q1, #10 // mask for the first block: lanes 0-2 off, lanes 3-7 on
+ vext.8 q14, q1, q0, #14 // mask for the last block: only lane 0 on
+ vneg.s32 q12, q12 // negated, so vrshl with q12 shifts right
+ vmovl.s8 q11, d22 // sign-extend the coefficient to 16 bits
+
+1:
+ vmov q1, q13
+ bl gen_grain_\type\()_lag0_8_neon // 8
+ vmov.i8 q1, #255
+ bl gen_grain_\type\()_lag0_8_neon // 16
+ bl gen_grain_\type\()_lag0_8_neon // 24
+ bl gen_grain_\type\()_lag0_8_neon // 32
+ bl gen_grain_\type\()_lag0_8_neon // 40
+ vmov q1, q14
+ bl gen_grain_\type\()_lag0_4_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16 // six 16-byte stores were stepped over; move to the next row
+ bgt 1b
+
+.ifc \type, uv_420
+ vpop {q4-q5}
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2]
+ add r4, r4, #2
+
+ mov r1, #3 // the first three rows are plain grain
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+ vmovl.s8 q13, d27 // sign-extend the three row-above coefficients to 16 bits
+ vmovl.s8 q12, d29
+ vmovl.s8 q14, d28
+ vmov d29, d24 // widened coefficient 2 goes back into d29, next to d27/d28
+ vmovl.s8 q6, d13 // widened ar_coeffs_uv[4]; sum_lag_n_body reads it from d13
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16 // six 16-byte stores were stepped over; move to the next row
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2] // coefficient 10, used by output_lag2 for the older previous output
+ vmov.s8 r10, d29[3] // coefficient 11, used by output_lag2 for the newest previous output
+
+ mov r1, #3 // the first three rows are plain grain
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16 // six 16-byte stores were stepped over; move to the next row
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5] // coefficients 21-23 are packed into r4, one byte each,
+ vmov.u8 r10, d28[6] // and unpacked again with sbfx in output_lag3
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3 // the first three rows are plain grain
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16 // six 16-byte stores were stepped over; move to the next row
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off // byte table lookups at r3: dst1 from 16-bit indices in src1/src2, dst2 from src3/src4; off selects even (0) or odd (1) lanes; clobbers r11, r12, lr
+ vmov.u16 r11, \src1[0+\off]
+ vmov.u16 r12, \src3[0+\off]
+ add r11, r11, r3 // index + table base
+ vmov.u16 lr, \src1[2+\off]
+ add r12, r12, r3
+ vld1.8 {\dst1[0+\off]}, [r11] // loads for dst1 and dst2 are interleaved throughout
+ vmov.u16 r11, \src3[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst2[0+\off]}, [r12]
+ vmov.u16 r12, \src2[0+\off]
+ add r11, r11, r3
+ vld1.8 {\dst1[2+\off]}, [lr]
+ vmov.u16 lr, \src4[0+\off]
+ add r12, r12, r3
+ vld1.8 {\dst2[2+\off]}, [r11]
+ vmov.u16 r11, \src2[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst1[4+\off]}, [r12] // byte lanes 4-7 come from the second index register
+ vmov.u16 r12, \src4[2+\off]
+ add r11, r11, r3
+ vld1.8 {\dst2[4+\off]}, [lr]
+ add r12, r12, r3
+ vld1.8 {\dst1[6+\off]}, [r11]
+ vld1.8 {\dst2[6+\off]}, [r12]
+.endm
+
+.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8 // 32 table lookups: dst1..dst4 get one byte per 16-bit index in src1..src8
+ gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0 // even lanes of dst1 and dst3
+ gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1 // odd lanes of dst1 and dst3
+ gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0 // even lanes of dst2 and dst4
+ gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1 // odd lanes of dst2 and dst4
+.endm
+
+function gather32_neon // d8-d11 = bytes of the table at r3, looked up with the 32 16-bit indices in d0-d7
+ push {r11-r12,lr} // the gather macros clobber r11, r12 and lr
+ gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7
+ pop {r11-r12,pc}
+endfunc
+
+function gather16_neon // d8-d9 = bytes of the table at r3, looked up with the 16 16-bit indices in d0-d3
+ push {r11-r12,lr} // gather_interleaved clobbers r11, r12 and lr
+ gather_interleaved d8, d9, d0, d1, d2, d3, 0 // even byte lanes
+ gather_interleaved d8, d9, d0, d1, d2, d3, 1 // odd byte lanes
+ pop {r11-r12,pc}
+endfunc
+
+const overlap_coeffs_0, align=4 // loaded into d24/d25 by fgy_32x32_16bpc_neon
+ .short 27, 17, 0, 0 // d24: multiplied with the old grain; 27 and 17 are also broadcast as the y overlap weights
+ .short 17, 27, 32, 32 // d25: presumably the matching weights for the new grain - confirm in the loop code
+endconst
+
+const overlap_coeffs_1, align=4 // same layout as overlap_coeffs_0; presumably for subsampled chroma - TODO confirm against its user
+ .short 23, 0, 0, 0
+ .short 22, 32, 32, 32
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy // split the random value src into x/y offsets, doubled per axis when sx/sy is 0
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride // dst = src + stride * offy + 2 * offx
+ mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, lsl #1 // grain_lut += offx (entries are 2 bytes wide)
+.endm
+
+// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+function fgy_32x32_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7} // 36 + 64 bytes pushed, so the stack arguments start at sp + 100
+ ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
+ ldrd r6, r7, [sp, #108] // offsets, h
+ ldr r8, [sp, #116] // clip
+ mov r9, #GRAIN_WIDTH*2 // grain_lut stride
+ ldr r10, [sp, #124] // bitdepth_max
+
+ eor r4, r4, #15 // 15 - scaling_shift
+ vdup.16 q6, r10 // bitdepth_max
+ clz r10, r10
+ vdup.16 q13, r4 // 15 - scaling_shift
+ rsb r10, r10, #24 // bitdepth_min_8
+ cmp r8, #0
+ vdup.16 q12, r10 // bitdepth_min_8
+
+ movrel_local r12, overlap_coeffs_0
+
+ beq 1f
+ // clip
+ vmov.i16 q14, #16
+ vmov.i16 q15, #235
+ vshl.s16 q14, q14, q12 // lower limit = 16 << bitdepth_min_8
+ vshl.s16 q15, q15, q12 // upper limit = 235 << bitdepth_min_8
+ b 2f
+1:
+ // no clip
+ vmov.i16 q14, #0 // lower limit = 0
+ vmov q15, q6 // upper limit = bitdepth_max
+2:
+ vshr.u16 q6, q6, #1 // grain_max
+
+ vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ add r5, r5, #18 // grain_lut += 9
+ add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r9 // grain_lut += grain_stride
+
+ ldr r10, [r6, #8] // offsets[1][0]
+ calc_offset r10, r4, r10, 0, 0
+ add_offset r4, r10, r4, r5, r9 // r4 = grain_lut pointer for offsets[1][0]
+ ldr r10, [r6, #4] // offsets[0][1]
+ calc_offset r10, r11, r10, 0, 0
+ add_offset r11, r10, r11, r5, r9 // r11 = grain_lut pointer for offsets[0][1]
+ ldr r10, [r6, #12] // offsets[1][1]
+ calc_offset r10, r8, r10, 0, 0
+ add_offset r8, r10, r8, r5, r9 // r8 = grain_lut pointer for offsets[1][1]
+ ldr r6, [r6] // offsets[0][0]
+ calc_offset r6, lr, r6, 0, 0
+ add_offset r5, r6, lr, r5, r9 // r5 = grain_lut pointer for offsets[0][0]
+
+ add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx
+ add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+
+ ldr r10, [sp, #120] // type
+ adr r11, L(fgy_loop_tbl)
+
+ tst r10, #1 // bit 0 of type: y overlap (checked by the beq below)
+ ldr r10, [r11, r10, lsl #2] // table entry selected by type
+
+ add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx
+
+ add r11, r11, r10 // r11 = address of the selected loop variant
+
+ beq 1f
+ // y overlap
+ vdup.16 d14, d24[0] // first overlap coefficient in every lane
+ vdup.16 d15, d24[1] // second overlap coefficient in every lane
+ mov r10, r7 // backup actual h
+ mov r7, #2 // presumably the two overlapped rows are processed first - confirm in the loop code
+1:
+ sub r2, r2, #32 // src_stride -= 32
+ sub r9, r9, #32 // grain_stride -= 32
+ bx r11
+endfunc
+
+function fgy_loop_neon // row loops for fgy_32x32_16bpc_neon; entered by bx through the table below, with registers set up by that function
+L(fgy_loop_tbl): // offsets of the four variants relative to this label, indexed by type (ox * 2 + oy)
+ .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
+
+.macro fgy ox, oy // \ox: blend the first 4 grain values with the old grain; \oy: blend the whole row with the top grain
+L(loop_\ox\oy):
+1:
+.if \ox
+ vld1.16 {d0}, [r4], r9 // grain_lut old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r6]! // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r8], r9 // grain_lut top old
+.endif
+.if \oy
+ vld1.16 {q4, q5}, [r6], r9 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r1, :128]! // src
+.endif
+ vld1.16 {q8, q9}, [r5]! // grain_lut
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+.endif
+.if !\oy
+ vmvn.i16 q5, #0xf000 // 0x0fff
+.endif
+ vld1.16 {q10, q11}, [r5], r9 // grain_lut
+
+.if \ox
+ add r4, r4, #32 // r9 was reduced by 32 by the entry code; add it back for this single-load pointer
+ vmull.s16 q0, d0, d24 // old grain * weights from d24
+ vmlal.s16 q0, d16, d25 // + current grain (first 4 values) * weights from d25
+.endif
+
+.if \oy
+.if \ox
+ add r8, r8, #32 // same stride compensation as for r4
+ vmull.s16 q1, d2, d24 // top old grain * weights from d24
+ vmlal.s16 q1, d4, d25 // + top grain (first 4 values) * weights from d25
+ vqrshrn.s32 d16, q0, #5 // rounding >> 5, narrowed back to 16 bit
+ vmvn d0, d12 // grain_min
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d16, d16, d12 // clamp both x-blended results to [grain_min, grain_max]
+ vmin.s16 d4, d4, d12
+ vmax.s16 d16, d16, d0
+ vmax.s16 d4, d4, d0
+.endif
+
+ vmull.s16 q0, d4, d14 // top grain * d14 ...
+ vmull.s16 q1, d5, d14
+ vmull.s16 q2, d6, d14
+ vmull.s16 q3, d7, d14
+ vmlal.s16 q0, d16, d15 // ... + current grain * d15 (values 0-15)
+ vmlal.s16 q1, d17, d15
+ vmlal.s16 q2, d18, d15
+ vmlal.s16 q3, d19, d15
+ vmull.s16 q8, d20, d15 // values 16-31: current grain * d15 ...
+ vmull.s16 q9, d21, d15
+ vmull.s16 q10, d22, d15
+ vmull.s16 q11, d23, d15
+ vmlal.s16 q8, d8, d14 // ... + top grain * d14
+ vmlal.s16 q9, d9, d14
+ vmlal.s16 q10, d10, d14
+ vmlal.s16 q11, d11, d14
+ vmvn q4, q6 // grain_min
+ vqrshrn.s32 d0, q0, #5 // rounding >> 5, narrowed back to 16 bit
+ vqrshrn.s32 d1, q1, #5
+ vqrshrn.s32 d2, q2, #5
+ vqrshrn.s32 d3, q3, #5
+ vqrshrn.s32 d4, q8, #5
+ vqrshrn.s32 d5, q9, #5
+ vqrshrn.s32 d6, q10, #5
+ vqrshrn.s32 d7, q11, #5
+ vmin.s16 q8, q0, q6 // clamp to grain_max; the blended grain ends up in q8-q11
+ vmin.s16 q9, q1, q6
+ vld1.16 {q0, q1}, [r1, :128]! // src
+ vmin.s16 q10, q2, q6
+ vmin.s16 q11, q3, q6
+ vmax.s16 q8, q8, q4 // clamp to grain_min
+ vmax.s16 q9, q9, q4
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+ vmvn.i16 q5, #0xf000 // 0x0fff
+ vmax.s16 q10, q10, q4
+ vmax.s16 q11, q11, q4
+.elseif \ox
+ vmvn d4, d12 // grain_min
+ vqrshrn.s32 d16, q0, #5 // rounding >> 5, narrowed back to 16 bit
+ vld1.16 {q0, q1}, [r1, :128]! // src
+ vmin.s16 d16, d16, d12 // clamp the 4 blended values to [grain_min, grain_max]
+ vmax.s16 d16, d16, d4
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+.endif
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q5
+ vand q1, q1, q5
+ vand q2, q2, q5
+ vand q3, q3, q5
+
+ bl gather32_neon // defined outside this chunk; the code below reads scaling bytes from d8-d11 afterwards. Clobbers lr
+
+.if \ox || \oy
+ vpush {q6-q7} // q6/q7 (grain_max, y weights) are needed by the next row but used as scratch below
+.endif
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+ vshl.u16 q4, q4, q13
+ vshl.u16 q5, q5, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+ vqrdmulh.s16 q10, q10, q4
+ vqrdmulh.s16 q11, q11, q5
+
+.if \ox || \oy
+ vpop {q6-q7}
+.endif
+
+ vqadd.s16 q0, q0, q8 // *src + noise
+ vqadd.s16 q1, q1, q9
+ vqadd.s16 q2, q2, q10
+ vqadd.s16 q3, q3, q11
+
+ vmax.s16 q0, q0, q14 // clamp to the [q14, q15] bounds chosen by the entry code
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+
+ vst1.16 {q0, q1}, [r0, :128]! // dst
+ subs r7, r7, #1 // one row done; the flags are consumed by the bgt below
+.if \oy
+ vdup.16 d14, d25[0] // weights for the second y-overlap row, taken from the second coefficient row
+ vdup.16 d15, d25[1]
+.endif
+ vst1.16 {q2, q3}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r10, #2 // r10 holds the full h saved by the entry code
+ sub r7, r10, #2 // restore actual remaining h
+ bgt L(loop_\ox\()0) // rows remain: continue in the variant without y overlap
+.endif
+ vpop {q4-q7} // undo the vpush/push done in fgy_32x32_16bpc_neon and return to its caller
+ pop {r4-r11,pc}
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+endfunc
+
+// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+.macro fguv layout, sx, sy // emits the chroma entry point for one layout; \sx and \sy are the horizontal/vertical subsampling flags
+function fguv_32x32_\layout\()_16bpc_neon, export=1
+ push {r4-r11,lr} // 9 core registers = 36 bytes
+ vpush {q4-q7} // 64 more bytes, hence the stack arguments below start at sp + 100
+ ldrd r4, r5, [sp, #100] // data, grain_lut
+ ldrd r10, r11, [sp, #124] // uv, is_id
+ ldr r6, [sp, #136] // bitdepth_max
+
+ clz r7, r6 // leading zero count of bitdepth_max
+ rsb r7, r7, #24 // bitdepth_min_8
+
+ // !csfl: per-plane multipliers and offset, read only by the csfl0 loop variants
+ add r10, r4, r10, lsl #2 // + 4*uv
+ add r12, r10, #FGD_UV_LUMA_MULT
+ add lr, r10, #FGD_UV_MULT
+ ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset
+ vld1.16 {d30[]}, [r12] // uv_luma_mult (replicated to all lanes; lanes 1-3 are overwritten below)
+ lsl r10, r10, r7 // uv_offset << bitdepth_min_8
+ vld1.16 {d30[1]}, [lr] // uv_mult
+
+ ldr lr, [r4, #FGD_SCALING_SHIFT]
+ ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ eor lr, lr, #15 // 15 - scaling_shift
+
+ vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8
+
+ cmp r12, #0 // clip to restricted range? the flags are consumed by the beq below
+ vdup.16 q13, lr // 15 - scaling_shift
+
+ beq 1f
+ // clip
+ cmp r11, #0 // is_id? the flags are consumed by the beq 2f below
+ mov r8, #16
+ mov r9, #240
+ lsl r8, r8, r7 // lower bound = 16 << bitdepth_min_8
+ lsl r9, r9, r7 // upper bound = 240 << bitdepth_min_8
+ beq 2f
+ // is_id
+ mov r9, #235
+ lsl r9, r9, r7 // upper bound = 235 << bitdepth_min_8 instead
+ b 2f
+1:
+ // no clip
+ mov r8, #0
+ mov r9, r6 // bitdepth_max
+2:
+ vmov.16 d30[3], r6 // bitdepth_max
+ vdup.16 d31, r8 // clip_min (replicated to all lanes; lanes 1-3 are overwritten below)
+
+ mov r10, #GRAIN_WIDTH*2 // grain_lut stride
+
+.if \sy
+ mov r6, #23 // y overlap weights for the vertically subsampled case
+ mov r7, #22
+.else
+ mov r6, #27 // y overlap weights for the first overlap row
+ mov r7, #17
+.endif
+ vmov.16 d31[1], r9 // clip_max
+
+ ldrd r8, r9, [sp, #116] // offsets, h
+
+ add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
+.if \sy
+ add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
+ add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r10 // grain_lut += grain_stride
+.endif
+ vmov.16 d31[2], r6 // overlap y [0]
+
+ ldr r12, [r8, #8] // offsets[1][0]
+ calc_offset r12, r4, r12, \sx, \sy
+ add_offset r4, r12, r4, r5, r10 // r4: pointer the loops read as "grain_lut old" (after the bx adjustment below)
+
+ ldr r12, [r8, #4] // offsets[0][1]
+ calc_offset r12, lr, r12, \sx, \sy
+ add_offset lr, r12, lr, r5, r10 // lr: base for the "grain_lut top" pointer (moved to r8 below)
+
+ ldr r12, [r8, #12] // offsets[1][1]
+ calc_offset r12, r11, r12, \sx, \sy
+ add_offset r11, r12, r11, r5, r10 // r11: pointer the loops read as "grain_lut top old" (after both adjustments below)
+
+ ldr r8, [r8] // offsets[0][0]
+ calc_offset r8, r12, r8, \sx, \sy
+ add_offset r5, r8, r12, r5, r10 // r5: grain for the current block
+
+ vmov.16 d31[3], r7 // overlap y [1]
+
+ add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+ add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+
+ movrel_local r12, overlap_coeffs_\sx
+ ldr lr, [sp, #132] // type
+ ldrd r6, r7, [sp, #108] // luma_row, luma_stride
+
+ vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
+#if CONFIG_THUMB
+ // This uses movrel_local instead of adr above, because the target
+ // can be out of range for adr. But movrel_local leaves the thumb bit
+ // set on COFF (but probably wouldn't if building for thumb on ELF),
+ // thus try to clear the bit for robustness.
+ bic r12, r12, #1
+#endif
+
+ tst lr, #1 // bit 0 of type selects y overlap; the flags are consumed by the beq below
+ ldr lr, [r12, lr, lsl #2] // table entry for this type (8 entries: csfl, ox, oy)
+
+ add r12, r12, lr // absolute address of the selected loop variant
+
+ beq 1f // no y overlap: keep h in r9 as is
+ // y overlap
+ sub lr, r9, #(2 >> \sy) // backup remaining h
+ mov r9, #(2 >> \sy) // the y-overlap variant runs for 2 rows, or 1 when \sy is set
+
+1:
+.if \sy
+ add r7, r7, r7 // luma_stride *= 2
+.endif
+ sub r7, r7, #32 // luma_stride -= 32
+
+ bx r12 // jump (not call) into fguv_loop_sx0/sx1_neon, which performs the matching vpop/pop and returns
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon // row loops for the fguv entry point with sx = 0; entered by bx through the table below, 32 pixels per row
+L(fguv_loop_sx0_tbl): // offsets of the eight variants relative to this label, indexed by type (csfl * 4 + ox * 2 + oy)
+ .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx0 csfl, ox, oy // \csfl: index scaling by luma only; \ox / \oy: blend with the old / top grain
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ sub r2, r2, #32 // src_stride -= 32
+ sub r10, r10, #32 // grain_stride -= 32
+.if \oy
+ mov r12, lr // remaining h saved by the entry code; moved out of lr because bl below overwrites lr
+.endif
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart): // re-entry point after the y-overlap rows; skips the stride adjustments above
+1:
+.if \ox
+ vld1.16 {d0}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r8]! // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r11], r10 // grain_lut top old
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+.endif
+ vld1.16 {q8, q9}, [r5]! // grain_lut
+.if \oy
+ vld1.16 {q4, q5}, [r8], r10 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.endif
+.if \oy
+ vdup.16 d28, d31[2] // overlap y coeff
+ vdup.16 d29, d31[3] // overlap y coeff
+.endif
+ vld1.16 {q10, q11}, [r5], r10 // grain_lut
+
+.if \ox
+ vdup.16 q7, d30[3] // bitdepth_max
+ add r4, r4, #32 // r10 was reduced by 32 above; add it back for this single-load pointer
+ vmull.s16 q0, d0, d24 // old grain * weights from d24
+ vshr.u16 q7, q7, #1 // grain_max
+ vmlal.s16 q0, d16, d25 // + current grain (first 4 values) * weights from d25
+ vmvn q6, q7 // grain_min
+.endif
+
+.if \oy
+.if \ox
+ add r11, r11, #32 // same stride compensation as for r4
+ vmull.s16 q1, d2, d24 // top old grain * weights from d24
+ vmlal.s16 q1, d4, d25 // + top grain (first 4 values) * weights from d25
+ vqrshrn.s32 d16, q0, #5 // rounding >> 5, narrowed back to 16 bit
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d4, d4, d14 // clamp both x-blended results to [grain_min, grain_max]
+ vmin.s16 d16, d16, d14
+ vmax.s16 d4, d4, d12
+ vmax.s16 d16, d16, d12
+.endif
+
+ vmull.s16 q0, d4, d28 // top grain * d28 ...
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+.if !\ox
+ vdup.16 q7, d30[3] // bitdepth_max
+.endif
+ vmlal.s16 q0, d16, d29 // ... + current grain * d29 (values 0-15)
+ vmlal.s16 q1, d17, d29
+ vmlal.s16 q2, d18, d29
+ vmlal.s16 q3, d19, d29
+.if !\ox
+ vshr.u16 q7, q7, #1 // grain_max
+.endif
+ vmull.s16 q8, d20, d29 // values 16-31: current grain * d29 ...
+ vmull.s16 q9, d21, d29
+ vmull.s16 q10, d22, d29
+ vmull.s16 q11, d23, d29
+.if !\ox
+ vmvn q6, q7 // grain_min
+.endif
+ vmlal.s16 q8, d8, d28 // ... + top grain * d28
+ vmlal.s16 q9, d9, d28
+ vmlal.s16 q10, d10, d28
+ vmlal.s16 q11, d11, d28
+ vqrshrn.s32 d0, q0, #5 // rounding >> 5, narrowed back to 16 bit
+ vqrshrn.s32 d1, q1, #5
+ vqrshrn.s32 d2, q2, #5
+ vqrshrn.s32 d3, q3, #5
+ vqrshrn.s32 d4, q8, #5
+ vqrshrn.s32 d5, q9, #5
+ vqrshrn.s32 d6, q10, #5
+ vqrshrn.s32 d7, q11, #5
+ vmin.s16 q8, q0, q7 // clamp to grain_max; the blended grain ends up in q8-q11
+ vmin.s16 q9, q1, q7
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 q10, q2, q7
+ vmin.s16 q11, q3, q7
+ vmax.s16 q8, q8, q6 // clamp to grain_min
+ vmax.s16 q9, q9, q6
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 q10, q10, q6
+ vmax.s16 q11, q11, q6
+.elseif \ox
+ vqrshrn.s32 d16, q0, #5 // rounding >> 5, narrowed back to 16 bit
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 d16, d16, d14 // clamp the 4 blended values to [grain_min, grain_max]
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 d16, d16, d12
+.endif
+
+.if !\csfl
+ vdup.16 d28, d30[0] // uv_luma_mult
+ vld1.16 {q4, q5}, [r1, :128]! // src
+ vdup.16 d29, d30[1] // uv_mult
+ vmull.s16 q6, d0, d28 // luma * uv_luma_mult (values 0-15) ...
+ vmull.s16 q7, d1, d28
+ vmull.s16 q0, d2, d28
+ vmull.s16 q1, d3, d28
+ vmlal.s16 q6, d8, d29 // ... + src * uv_mult
+ vmlal.s16 q7, d9, d29
+ vmlal.s16 q0, d10, d29
+ vmlal.s16 q1, d11, d29
+ vld1.16 {q4, q5}, [r1, :128] // src
+ sub r1, r1, #32 // rewind r1 to the row start; src is loaded again after the gather
+ vshrn.s32 d12, q6, #6 // >> 6 and narrow: q6/q7 hold values 0-15
+ vshrn.s32 d13, q7, #6
+ vshrn.s32 d14, q0, #6
+ vshrn.s32 d15, q1, #6
+ vmull.s16 q0, d4, d28 // same for values 16-31
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+ vmlal.s16 q0, d8, d29
+ vmlal.s16 q1, d9, d29
+ vmlal.s16 q2, d10, d29
+ vmlal.s16 q3, d11, d29
+ vdup.16 q14, d30[2] // uv_offset
+ vshrn.s32 d0, q0, #6
+ vshrn.s32 d1, q1, #6
+ vshrn.s32 d2, q2, #6
+ vshrn.s32 d3, q3, #6
+ vdup.16 q4, d30[3] // bitdepth_max
+ vmov.i16 q5, #0
+ vadd.i16 q6, q6, q14 // + uv_offset
+ vadd.i16 q7, q7, q14
+ vadd.i16 q2, q0, q14
+ vadd.i16 q3, q1, q14
+ vmin.s16 q0, q6, q4 // clamp to [0, bitdepth_max]; the result in q0-q3 is what gather32_neon sees
+ vmin.s16 q1, q7, q4
+ vmin.s16 q2, q2, q4
+ vmin.s16 q3, q3, q4
+ vmax.s16 q0, q0, q5
+ vmax.s16 q1, q1, q5
+ vmax.s16 q2, q2, q5
+ vmax.s16 q3, q3, q5
+.else
+ vdup.16 q14, d30[3] // bitdepth_max
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q14
+ vand q1, q1, q14
+ vand q2, q2, q14
+ vand q3, q3, q14
+.endif
+
+ bl gather32_neon // defined outside this chunk; the code below reads scaling bytes from d8-d11 afterwards. Clobbers lr
+
+ vld1.16 {q0, q1}, [r1, :128]! // src
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+ vshl.u16 q4, q4, q13
+ vshl.u16 q5, q5, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+ vqrdmulh.s16 q10, q10, q4
+ vqrdmulh.s16 q11, q11, q5
+
+
+ vdup.16 q4, d31[0] // clip_min
+ vdup.16 q5, d31[1] // clip_max
+
+ vqadd.s16 q0, q0, q8 // *src + noise
+ vqadd.s16 q1, q1, q9
+ vqadd.s16 q2, q2, q10
+ vqadd.s16 q3, q3, q11
+
+.if \oy
+ vmov.32 lr, d25[0] // 2 first 16 bit coeffs from overlap x
+.endif
+
+ vmax.s16 q0, q0, q4 // clamp to [clip_min, clip_max]
+ vmax.s16 q1, q1, q4
+ vmax.s16 q2, q2, q4
+ vmax.s16 q3, q3, q4
+ vmin.s16 q0, q0, q5
+ vmin.s16 q1, q1, q5
+ vmin.s16 q2, q2, q5
+ vmin.s16 q3, q3, q5
+
+ vst1.16 {q0, q1}, [r0, :128]! // dst
+
+ subs r9, r9, #1 // one row done; the flags are consumed by the bgt below
+.if \oy
+ vmov.32 d31[1], lr // new coeffs for overlap y
+.endif
+
+ vst1.16 {q2, q3}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0 // any rows left after the y-overlap rows?
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart) // continue in the variant without y overlap
+.endif
+ b 9f // shared epilogue after the last macro expansion
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ vpop {q4-q7} // undo the vpush/push done in the fguv entry point and return to its caller
+ pop {r4-r11,pc}
+endfunc
+
+function fguv_loop_sx1_neon // row loops for the fguv entry points with sx = 1; entered by bx through the table below, 16 pixels per row
+L(fguv_loop_sx1_tbl): // offsets of the eight variants relative to this label, indexed by type (csfl * 4 + ox * 2 + oy)
+ .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx1 csfl, ox, oy // \csfl: index scaling by luma only; \ox / \oy: blend with the old / top grain
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr // remaining h saved by the entry code; moved out of lr because bl below overwrites lr
+.endif
+1:
+.if \ox
+ vld1.16 {d0}, [r4], r10 // grain_lut old
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r11], r10 // grain_lut top old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r8], r10 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+.endif
+ vld1.16 {q8, q9}, [r5], r10 // grain_lut
+.if \oy
+ vdup.16 d28, d31[2] // overlap y coeff
+ vdup.16 d29, d31[3] // overlap y coeff
+.endif
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.endif
+
+.if \ox
+ vdup.16 q7, d30[3] // bitdepth_max
+ vmull.s16 q0, d0, d24 // old grain * weights from d24
+ vshr.u16 q7, q7, #1 // grain_max
+ vmlal.s16 q0, d16, d25 // + current grain (first 4 values) * weights from d25
+ vmvn q6, q7 // grain_min
+.endif
+
+.if \oy
+.if \ox
+ vmull.s16 q1, d2, d24 // top old grain * weights from d24
+ vmlal.s16 q1, d4, d25 // + top grain (first 4 values) * weights from d25
+ vqrshrn.s32 d16, q0, #5 // rounding >> 5, narrowed back to 16 bit
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d4, d4, d14 // clamp both x-blended results to [grain_min, grain_max]
+ vmin.s16 d16, d16, d14
+ vmax.s16 d4, d4, d12
+ vmax.s16 d16, d16, d12
+.endif
+
+ vmull.s16 q0, d4, d28 // top grain * d28 ...
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+.if !\ox
+ vdup.16 q7, d30[3] // bitdepth_max
+.endif
+ vmlal.s16 q0, d16, d29 // ... + current grain * d29
+ vmlal.s16 q1, d17, d29
+ vmlal.s16 q2, d18, d29
+ vmlal.s16 q3, d19, d29
+.if !\ox
+ vshr.u16 q7, q7, #1 // grain_max
+.endif
+ vqrshrn.s32 d16, q0, #5 // rounding >> 5, narrowed back to 16 bit; blended grain in q8-q9
+ vqrshrn.s32 d17, q1, #5
+ vqrshrn.s32 d18, q2, #5
+ vqrshrn.s32 d19, q3, #5
+.if !\ox
+ vmvn q6, q7 // grain_min
+.endif
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 q8, q8, q7 // clamp to [grain_min, grain_max]
+ vmin.s16 q9, q9, q7
+ vmax.s16 q8, q8, q6
+ vmax.s16 q9, q9, q6
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.elseif \ox
+ vqrshrn.s32 d16, q0, #5 // rounding >> 5, narrowed back to 16 bit
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 d16, d16, d14 // clamp the 4 blended values to [grain_min, grain_max]
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 d16, d16, d12
+.endif
+
+ vpadd.i16 d0, d0, d1 // sum horizontally adjacent luma pairs: 32 luma values -> 16 sums in q0-q1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5
+ vpadd.i16 d3, d6, d7
+ vrshr.u16 q0, q0, #1 // rounding >> 1 turns each sum into the pair average
+ vrshr.u16 q1, q1, #1
+.if !\csfl
+ vdup.16 d28, d30[0] // uv_luma_mult
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+ vdup.16 d29, d30[1] // uv_mult
+ vmull.s16 q6, d0, d28 // averaged luma * uv_luma_mult ...
+ vmull.s16 q7, d1, d28
+ vmull.s16 q0, d2, d28
+ vmull.s16 q1, d3, d28
+ vmlal.s16 q6, d4, d29 // ... + src * uv_mult
+ vmlal.s16 q7, d5, d29
+ vmlal.s16 q0, d6, d29
+ vmlal.s16 q1, d7, d29
+ vshrn.s32 d12, q6, #6 // >> 6 and narrow into q6/q7
+ vshrn.s32 d13, q7, #6
+ vshrn.s32 d14, q0, #6
+ vshrn.s32 d15, q1, #6
+ vdup.16 q14, d30[2] // uv_offset
+ vdup.16 q4, d30[3] // bitdepth_max
+ vmov.i16 q5, #0
+ vadd.i16 q6, q6, q14 // + uv_offset
+ vadd.i16 q7, q7, q14
+ vmin.s16 q0, q6, q4 // clamp to [0, bitdepth_max]; the result in q0-q1 is what gather16_neon sees
+ vmin.s16 q1, q7, q4
+ vmax.s16 q0, q0, q5
+ vmax.s16 q1, q1, q5
+.else
+ vdup.16 q14, d30[3] // bitdepth_max
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q14
+ vand q1, q1, q14
+.endif
+
+ bl gather16_neon // the code below reads scaling bytes from d8-d9 afterwards. Clobbers lr
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+
+
+ vdup.16 q4, d31[0] // clip_min
+ vdup.16 q5, d31[1] // clip_max
+
+ vqadd.s16 q0, q2, q8 // *src + noise
+ vqadd.s16 q1, q3, q9
+
+.if \oy
+ // Swap the two last coefficients of d31, place them first in d28
+ vrev64.16 d28, d31
+.endif
+
+ vmax.s16 q0, q0, q4 // clamp to [clip_min, clip_max]
+ vmax.s16 q1, q1, q4
+ vmin.s16 q0, q0, q5
+ vmin.s16 q1, q1, q5
+
+ subs r9, r9, #1 // one row done; the flags are consumed by the bgt below
+.if \oy
+ // Take the first two 16 bit coefficients of d28 and place them at the
+ // end of d31
+ vtrn.32 d31, d28
+.endif
+
+ vst1.16 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0 // any rows left after the y-overlap rows?
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) // continue in the variant without y overlap
+.endif
+
+ b 9f // shared epilogue after the last macro expansion
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ vpop {q4-q7} // undo the vpush/push done in the fguv entry point and return to its caller
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S
index a1aea4139baf5..ceea025e45ce9 100644
--- a/third_party/dav1d/src/arm/32/itx.S
+++ b/third_party/dav1d/src/arm/32/itx.S
@@ -134,9 +134,9 @@ endconst
vmlsl.s16 \d1, \s3, \c1
.endm
-.macro vrshrn_8h d0, d1, s0, s1, shift
- vrshrn.i32 \d0, \s0, \shift
- vrshrn.i32 \d1, \s1, \shift
+.macro vqrshrn_8h d0, d1, s0, s1, shift
+ vqrshrn.s32 \d0, \s0, \shift
+ vqrshrn.s32 \d1, \s1, \shift
.endm
.macro scale_input c, r0, r1, r2 r3, r4, r5, r6, r7
@@ -418,11 +418,11 @@ endfunc
vmull_vmlal q3, \r1, \r3, d0[3], d0[2]
vmull_vmlsl q2, \r1, \r3, d0[2], d0[3]
vmull_vmlal q1, \r0, \r2, d0[0], d0[0]
- vrshrn.i32 d6, q3, #12
- vrshrn.i32 d7, q2, #12
+ vqrshrn.s32 d6, q3, #12
+ vqrshrn.s32 d7, q2, #12
vmull_vmlsl q2, \r0, \r2, d0[0], d0[0]
- vrshrn.i32 d2, q1, #12
- vrshrn.i32 d3, q2, #12
+ vqrshrn.s32 d2, q1, #12
+ vqrshrn.s32 d3, q2, #12
vqadd.s16 \r0, d2, d6
vqsub.s16 \r3, d2, d6
vqadd.s16 \r1, d3, d7
@@ -433,11 +433,11 @@ endfunc
vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2]
vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3]
vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0]
- vrshrn_8h d12, d13, q6, q7, #12
- vrshrn_8h d14, d15, q4, q5, #12
+ vqrshrn_8h d12, d13, q6, q7, #12
+ vqrshrn_8h d14, d15, q4, q5, #12
vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0]
- vrshrn_8h d4, d5, q2, q3, #12
- vrshrn_8h d6, d7, q4, q5, #12
+ vqrshrn_8h d4, d5, q2, q3, #12
+ vqrshrn_8h d6, d7, q4, q5, #12
vqadd.s16 \q0, q2, q6
vqsub.s16 \q3, q2, q6
vqadd.s16 \q1, q3, q7
@@ -478,10 +478,10 @@ endfunc
vadd.s32 q3, q3, q10
vsub.s32 q11, q11, q10
- vrshrn.i32 \o0, q2, #12
- vrshrn.i32 \o2, q1, #12
- vrshrn.i32 \o1, q3, #12
- vrshrn.i32 \o3, q11, #12
+ vqrshrn.s32 \o0, q2, #12
+ vqrshrn.s32 \o2, q1, #12
+ vqrshrn.s32 \o1, q3, #12
+ vqrshrn.s32 \o3, q11, #12
.endm
function inv_adst_4h_x4_neon, export=1
@@ -533,21 +533,21 @@ endfunc
vsub.s32 q4, q4, q2 // out3
vsub.s32 q5, q5, q3
- vrshrn.i32 d20, q10, #12
- vrshrn.i32 d21, q11, #12
+ vqrshrn.s32 d20, q10, #12
+ vqrshrn.s32 d21, q11, #12
- vrshrn.i32 \o0, q8, #12
- vrshrn.i32 \o1, q9, #12
+ vqrshrn.s32 \o0, q8, #12
+ vqrshrn.s32 \o1, q9, #12
.ifc \o4, d18
vmov q9, q10
.endif
- vrshrn.i32 \o2, q6, #12
- vrshrn.i32 \o3, q7, #12
+ vqrshrn.s32 \o2, q6, #12
+ vqrshrn.s32 \o3, q7, #12
- vrshrn.i32 \o6, q4, #12
- vrshrn.i32 \o7, q5, #12
+ vqrshrn.s32 \o6, q4, #12
+ vqrshrn.s32 \o7, q5, #12
.endm
function inv_adst_8h_x4_neon, export=1
@@ -702,11 +702,11 @@ def_fn_4x4 identity, flipadst
vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a
vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a
vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a
- vrshrn_8h \r2, \r3, q2, q3, #12 // t4a
- vrshrn_8h \r14, \r15, q4, q5, #12 // t7a
+ vqrshrn_8h \r2, \r3, q2, q3, #12 // t4a
+ vqrshrn_8h \r14, \r15, q4, q5, #12 // t7a
vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a
- vrshrn_8h \r6, \r7, q6, q7, #12 // t5a
- vrshrn_8h \r10, \r11, q2, q3, #12 // t6a
+ vqrshrn_8h \r6, \r7, q6, q7, #12 // t5a
+ vqrshrn_8h \r10, \r11, q2, q3, #12 // t6a
vqadd.s16 q2, \q1, \q3 // t4
vqsub.s16 \q1, \q1, \q3 // t5a
@@ -715,8 +715,8 @@ def_fn_4x4 identity, flipadst
vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5
vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6
- vrshrn_8h d8, d9, q4, q5, #12 // t5
- vrshrn_8h d10, d11, q6, q7, #12 // t6
+ vqrshrn_8h d8, d9, q4, q5, #12 // t5
+ vqrshrn_8h d10, d11, q6, q7, #12 // t6
vqsub.s16 \q7, \q0, q3 // out7
vqadd.s16 \q0, \q0, q3 // out0
@@ -735,11 +735,11 @@ def_fn_4x4 identity, flipadst
vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a
vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a
vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a
- vrshrn.i32 \r1, q1, #12 // t4a
+ vqrshrn.s32 \r1, q1, #12 // t4a
vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a
- vrshrn.i32 \r7, q2, #12 // t7a
- vrshrn.i32 \r3, q3, #12 // t5a
- vrshrn.i32 \r5, q1, #12 // taa
+ vqrshrn.s32 \r7, q2, #12 // t7a
+ vqrshrn.s32 \r3, q3, #12 // t5a
+ vqrshrn.s32 \r5, q1, #12 // taa
vqadd.s16 d2, \r1, \r3 // t4
vqsub.s16 \r1, \r1, \r3 // t5a
@@ -748,8 +748,8 @@ def_fn_4x4 identity, flipadst
vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5
vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6
- vrshrn.i32 d4, q2, #12 // t5
- vrshrn.i32 d5, q3, #12 // t6
+ vqrshrn.s32 d4, q2, #12 // t5
+ vqrshrn.s32 d5, q3, #12 // t6
vqsub.s16 \r7, \r0, d3 // out7
vqadd.s16 \r0, \r0, d3 // out0
@@ -783,19 +783,19 @@ endfunc
vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1]
vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0]
vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3]
- vrshrn_8h d16, d17, q2, q3, #12 // t0a
- vrshrn_8h d30, d31, q4, q5, #12 // t1a
+ vqrshrn_8h d16, d17, q2, q3, #12 // t0a
+ vqrshrn_8h d30, d31, q4, q5, #12 // t1a
vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2]
vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1]
- vrshrn_8h d20, d21, q6, q7, #12 // t2a
- vrshrn_8h d26, d27, q2, q3, #12 // t3a
+ vqrshrn_8h d20, d21, q6, q7, #12 // t2a
+ vqrshrn_8h d26, d27, q2, q3, #12 // t3a
vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0]
vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3]
- vrshrn_8h d24, d25, q4, q5, #12 // t4a
- vrshrn_8h d22, d23, q6, q7, #12 // t5a
+ vqrshrn_8h d24, d25, q4, q5, #12 // t4a
+ vqrshrn_8h d22, d23, q6, q7, #12 // t5a
vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2]
- vrshrn_8h d28, d29, q2, q3, #12 // t6a
- vrshrn_8h d18, d19, q4, q5, #12 // t7a
+ vqrshrn_8h d28, d29, q2, q3, #12 // t6a
+ vqrshrn_8h d18, d19, q4, q5, #12 // t7a
vqadd.s16 q2, q8, q12 // t0
vqsub.s16 q3, q8, q12 // t4
@@ -810,13 +810,13 @@ endfunc
vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3]
vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2]
- vrshrn_8h d6, d7, q8, q9, #12 // t4a
- vrshrn_8h d10, d11, q12, q13, #12 // t5a
+ vqrshrn_8h d6, d7, q8, q9, #12 // t4a
+ vqrshrn_8h d10, d11, q12, q13, #12 // t5a
vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3]
- vrshrn_8h d14, d15, q14, q15, #12 // t6a
- vrshrn_8h d22, d23, q8, q9, #12 // t7a
+ vqrshrn_8h d14, d15, q14, q15, #12 // t6a
+ vqrshrn_8h d22, d23, q8, q9, #12 // t7a
vqadd.s16 \q0, q2, q6 // out0
vqsub.s16 q2, q2, q6 // t2
@@ -833,11 +833,11 @@ endfunc
vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12)
vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11)
vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10)
- vrshrn_8h d4, d5, q10, q11, #12 // out3
+ vqrshrn_8h d4, d5, q10, q11, #12 // out3
vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13)
- vrshrn_8h d6, d7, q12, q13, #12 // out5
- vrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13)
- vrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11)
+ vqrshrn_8h d6, d7, q12, q13, #12 // out5
+ vqrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13)
+ vqrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11)
vqneg.s16 \q3, q2 // out3
vqneg.s16 \q5, q3 // out5
@@ -850,19 +850,19 @@ endfunc
vmull_vmlal q2, d23, d16, d0[0], d0[1]
vmull_vmlsl q3, d23, d16, d0[1], d0[0]
vmull_vmlal q4, d21, d18, d0[2], d0[3]
- vrshrn.i32 d16, q2, #12 // t0a
- vrshrn.i32 d23, q3, #12 // t1a
+ vqrshrn.s32 d16, q2, #12 // t0a
+ vqrshrn.s32 d23, q3, #12 // t1a
vmull_vmlsl q5, d21, d18, d0[3], d0[2]
vmull_vmlal q6, d19, d20, d1[0], d1[1]
- vrshrn.i32 d18, q4, #12 // t2a
- vrshrn.i32 d21, q5, #12 // t3a
+ vqrshrn.s32 d18, q4, #12 // t2a
+ vqrshrn.s32 d21, q5, #12 // t3a
vmull_vmlsl q7, d19, d20, d1[1], d1[0]
vmull_vmlal q2, d17, d22, d1[2], d1[3]
- vrshrn.i32 d20, q6, #12 // t4a
- vrshrn.i32 d19, q7, #12 // t5a
+ vqrshrn.s32 d20, q6, #12 // t4a
+ vqrshrn.s32 d19, q7, #12 // t5a
vmull_vmlsl q3, d17, d22, d1[3], d1[2]
- vrshrn.i32 d22, q2, #12 // t6a
- vrshrn.i32 d17, q3, #12 // t7a
+ vqrshrn.s32 d22, q2, #12 // t6a
+ vqrshrn.s32 d17, q3, #12 // t7a
vqadd.s16 d4, d16, d20 // t0
vqsub.s16 d5, d16, d20 // t4
@@ -877,13 +877,13 @@ endfunc
vmull_vmlsl q10, d5, d7, d2[2], d2[3]
vmull_vmlsl q11, d19, d9, d2[3], d2[2]
- vrshrn.i32 d5, q8, #12 // t4a
- vrshrn.i32 d7, q10, #12 // t5a
+ vqrshrn.s32 d5, q8, #12 // t4a
+ vqrshrn.s32 d7, q10, #12 // t5a
vmull_vmlal q8, d19, d9, d2[2], d2[3]
- vrshrn.i32 d9, q11, #12 // t6a
- vrshrn.i32 d19, q8, #12 // t7a
+ vqrshrn.s32 d9, q11, #12 // t6a
+ vqrshrn.s32 d19, q8, #12 // t7a
vqadd.s16 \r0, d4, d8 // out0
vqsub.s16 d4, d4, d8 // t2
@@ -900,11 +900,11 @@ endfunc
vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20)
vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19)
vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18)
- vrshrn.i32 d4, q9, #12 // out3
+ vqrshrn.s32 d4, q9, #12 // out3
vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21)
- vrshrn.i32 d5, q10, #12 // out5
- vrshrn.i32 \r2, q9, #12 // out2 (d18 or d21)
- vrshrn.i32 \r4, q4, #12 // out4 (d20 or d19)
+ vqrshrn.s32 d5, q10, #12 // out5
+ vqrshrn.s32 \r2, q9, #12 // out2 (d18 or d21)
+ vqrshrn.s32 \r4, q4, #12 // out4 (d20 or d19)
vqneg.s16 \r3, d4 // out3
vqneg.s16 \r5, d5 // out5
@@ -1122,19 +1122,19 @@ function inv_dct_4h_x16_neon, export=1
vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a
vmull_vmlal q3, d17, d31, d2[1], d2[0] // -> t15a
vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a
- vrshrn.i32 d17, q2, #12 // t8a
- vrshrn.i32 d31, q3, #12 // t15a
+ vqrshrn.s32 d17, q2, #12 // t8a
+ vqrshrn.s32 d31, q3, #12 // t15a
vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a
vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a
- vrshrn.i32 d23, q4, #12 // t9a
- vrshrn.i32 d25, q2, #12 // t14a
+ vqrshrn.s32 d23, q4, #12 // t9a
+ vqrshrn.s32 d25, q2, #12 // t14a
vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a
vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a
- vrshrn.i32 d21, q3, #12 // t10a
- vrshrn.i32 d27, q4, #12 // t13a
+ vqrshrn.s32 d21, q3, #12 // t10a
+ vqrshrn.s32 d27, q4, #12 // t13a
vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a
- vrshrn.i32 d19, q2, #12 // t11a
- vrshrn.i32 d29, q3, #12 // t12a
+ vqrshrn.s32 d19, q2, #12 // t11a
+ vqrshrn.s32 d29, q3, #12 // t12a
idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30
@@ -1149,14 +1149,14 @@ function inv_dct_4h_x16_neon, export=1
vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a
vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a
- vrshrn.i32 d21, q3, #12 // t9a
- vrshrn.i32 d27, q4, #12 // t14a
+ vqrshrn.s32 d21, q3, #12 // t9a
+ vqrshrn.s32 d27, q4, #12 // t14a
vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a
vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a
- vrshrn.i32 d29, q3, #12 // t13a
+ vqrshrn.s32 d29, q3, #12 // t13a
vneg.s32 q4, q4
- vrshrn.i32 d23, q4, #12 // t10a
+ vqrshrn.s32 d23, q4, #12 // t10a
vqsub.s16 d4, d17, d19 // t11a
vqadd.s16 d17, d17, d19 // t8a
@@ -1171,11 +1171,11 @@ function inv_dct_4h_x16_neon, export=1
vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12
vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a
- vrshrn.i32 d6, q3, #12 // t11
- vrshrn.i32 d7, q4, #12 // t12
+ vqrshrn.s32 d6, q3, #12 // t11
+ vqrshrn.s32 d7, q4, #12 // t12
vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a
- vrshrn.i32 d4, q2, #12 // t10a
- vrshrn.i32 d5, q4, #12 // t13a
+ vqrshrn.s32 d4, q2, #12 // t10a
+ vqrshrn.s32 d5, q4, #12 // t13a
vqadd.s16 d8, d16, d31 // out0
vqsub.s16 d31, d16, d31 // out15
@@ -1208,35 +1208,35 @@ endfunc
vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0
vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1
vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2
- vrshrn.i32 d16, q2, #12 // t0
- vrshrn.i32 d31, q3, #12 // t1
+ vqrshrn.s32 d16, q2, #12 // t0
+ vqrshrn.s32 d31, q3, #12 // t1
vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3
vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4
- vrshrn.i32 d18, q4, #12 // t2
- vrshrn.i32 d29, q2, #12 // t3
+ vqrshrn.s32 d18, q4, #12 // t2
+ vqrshrn.s32 d29, q2, #12 // t3
vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5
vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6
- vrshrn.i32 d20, q3, #12 // t4
- vrshrn.i32 d27, q4, #12 // t5
+ vqrshrn.s32 d20, q3, #12 // t4
+ vqrshrn.s32 d27, q4, #12 // t5
vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7
vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8
- vrshrn.i32 d22, q2, #12 // t6
- vrshrn.i32 d25, q3, #12 // t7
+ vqrshrn.s32 d22, q2, #12 // t6
+ vqrshrn.s32 d25, q3, #12 // t7
vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9
vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10
- vrshrn.i32 d23, q4, #12 // t8
- vrshrn.i32 d24, q2, #12 // t9
+ vqrshrn.s32 d23, q4, #12 // t8
+ vqrshrn.s32 d24, q2, #12 // t9
vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11
vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12
- vrshrn.i32 d21, q3, #12 // t10
- vrshrn.i32 d26, q4, #12 // t11
+ vqrshrn.s32 d21, q3, #12 // t10
+ vqrshrn.s32 d26, q4, #12 // t11
vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13
vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14
- vrshrn.i32 d19, q2, #12 // t12
- vrshrn.i32 d28, q3, #12 // t13
+ vqrshrn.s32 d19, q2, #12 // t12
+ vqrshrn.s32 d28, q3, #12 // t13
vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15
- vrshrn.i32 d17, q4, #12 // t14
- vrshrn.i32 d30, q2, #12 // t15
+ vqrshrn.s32 d17, q4, #12 // t14
+ vqrshrn.s32 d30, q2, #12 // t15
vld1.16 {q0}, [r12, :128]
@@ -1260,19 +1260,19 @@ endfunc
vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8
vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9
vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10
- vrshrn.i32 d17, q2, #12 // t8
- vrshrn.i32 d30, q3, #12 // t9
+ vqrshrn.s32 d17, q2, #12 // t8
+ vqrshrn.s32 d30, q3, #12 // t9
vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11
vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12
- vrshrn.i32 d18, q4, #12 // t10
- vrshrn.i32 d29, q2, #12 // t11
+ vqrshrn.s32 d18, q4, #12 // t10
+ vqrshrn.s32 d29, q2, #12 // t11
vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13
vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14
- vrshrn.i32 d27, q3, #12 // t12
- vrshrn.i32 d20, q4, #12 // t13
+ vqrshrn.s32 d27, q3, #12 // t12
+ vqrshrn.s32 d20, q4, #12 // t13
vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15
- vrshrn.i32 d25, q2, #12 // t14
- vrshrn.i32 d22, q3, #12 // t15
+ vqrshrn.s32 d25, q2, #12 // t14
+ vqrshrn.s32 d22, q3, #12 // t15
vqsub.s16 d2, d16, d21 // t4
vqadd.s16 d16, d16, d21 // t0
@@ -1294,19 +1294,19 @@ endfunc
vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a
vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a
vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a
- vrshrn.i32 d22, q2, #12 // t4a
- vrshrn.i32 d25, q3, #12 // t5a
+ vqrshrn.s32 d22, q2, #12 // t4a
+ vqrshrn.s32 d25, q3, #12 // t5a
vmull_vmlal q2, d24, d23, d0[2], d0[3] // -> t7a
vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12
- vrshrn.i32 d24, q4, #12 // t6a
- vrshrn.i32 d23, q2, #12 // t7a
+ vqrshrn.s32 d24, q4, #12 // t6a
+ vqrshrn.s32 d23, q2, #12 // t7a
vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13
vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14
- vrshrn.i32 d17, q3, #12 // t12
+ vqrshrn.s32 d17, q3, #12 // t12
vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15
- vrshrn.i32 d29, q4, #12 // t13
- vrshrn.i32 d30, q2, #12 // t14
- vrshrn.i32 d18, q3, #12 // t15
+ vqrshrn.s32 d29, q4, #12 // t13
+ vqrshrn.s32 d30, q2, #12 // t14
+ vqrshrn.s32 d18, q3, #12 // t15
vqsub.s16 d2, d16, d21 // t2a
.ifc \o0, d16
@@ -1343,21 +1343,21 @@ endfunc
vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24)
vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26)
- vrshrn.i32 d24, q12, #12 // out8
- vrshrn.i32 d4, q2, #12 // out7
- vrshrn.i32 d5, q3, #12 // out5
+ vqrshrn.s32 d24, q12, #12 // out8
+ vqrshrn.s32 d4, q2, #12 // out7
+ vqrshrn.s32 d5, q3, #12 // out5
vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21)
vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
- vrshrn.i32 d26, q4, #12 // out10
+ vqrshrn.s32 d26, q4, #12 // out10
vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
- vrshrn.i32 \o4, q1, #12 // out4
- vrshrn.i32 d7, q3, #12 // out9
- vrshrn.i32 d6, q4, #12 // out11
- vrshrn.i32 \o6, q11, #12 // out6
+ vqrshrn.s32 \o4, q1, #12 // out4
+ vqrshrn.s32 d7, q3, #12 // out9
+ vqrshrn.s32 d6, q4, #12 // out11
+ vqrshrn.s32 \o6, q11, #12 // out6
.ifc \o8, d23
vmov \o8, d24
@@ -1927,35 +1927,35 @@ function inv_dct32_odd_4h_x16_neon, export=1
vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a
vmull_vmlal q3, d16, d31, d0[1], d0[0] // -> t31a
vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a
- vrshrn.i32 d16, q2, #12 // t16a
- vrshrn.i32 d31, q3, #12 // t31a
+ vqrshrn.s32 d16, q2, #12 // t16a
+ vqrshrn.s32 d31, q3, #12 // t31a
vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a
vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a
- vrshrn.i32 d24, q4, #12 // t17a
- vrshrn.i32 d23, q2, #12 // t30a
+ vqrshrn.s32 d24, q4, #12 // t17a
+ vqrshrn.s32 d23, q2, #12 // t30a
vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a
vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a
- vrshrn.i32 d20, q3, #12 // t18a
- vrshrn.i32 d27, q4, #12 // t29a
+ vqrshrn.s32 d20, q3, #12 // t18a
+ vqrshrn.s32 d27, q4, #12 // t29a
vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a
vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a
- vrshrn.i32 d28, q2, #12 // t19a
- vrshrn.i32 d19, q3, #12 // t28a
+ vqrshrn.s32 d28, q2, #12 // t19a
+ vqrshrn.s32 d19, q3, #12 // t28a
vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a
vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a
- vrshrn.i32 d18, q4, #12 // t20a
- vrshrn.i32 d29, q2, #12 // t27a
+ vqrshrn.s32 d18, q4, #12 // t20a
+ vqrshrn.s32 d29, q2, #12 // t27a
vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a
vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a
- vrshrn.i32 d26, q3, #12 // t21a
- vrshrn.i32 d21, q4, #12 // t26a
+ vqrshrn.s32 d26, q3, #12 // t21a
+ vqrshrn.s32 d21, q4, #12 // t26a
vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a
vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a
- vrshrn.i32 d22, q2, #12 // t22a
- vrshrn.i32 d25, q3, #12 // t25a
+ vqrshrn.s32 d22, q2, #12 // t22a
+ vqrshrn.s32 d25, q3, #12 // t25a
vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a
- vrshrn.i32 d30, q4, #12 // t23a
- vrshrn.i32 d17, q2, #12 // t24a
+ vqrshrn.s32 d30, q4, #12 // t23a
+ vqrshrn.s32 d17, q2, #12 // t24a
vld1.16 {q0}, [r12, :128]
@@ -1979,21 +1979,21 @@ function inv_dct32_odd_4h_x16_neon, export=1
vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a
vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a
vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a
- vrshrn.i32 d21, q2, #12 // t17a
- vrshrn.i32 d27, q3, #12 // t30a
+ vqrshrn.s32 d21, q2, #12 // t17a
+ vqrshrn.s32 d27, q3, #12 // t30a
vneg.s32 q4, q4 // -> t18a
vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a
vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a
- vrshrn.i32 d19, q4, #12 // t18a
- vrshrn.i32 d24, q1, #12 // t29a
+ vqrshrn.s32 d19, q4, #12 // t18a
+ vqrshrn.s32 d24, q1, #12 // t29a
vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a
vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a
- vrshrn.i32 d22, q2, #12 // t21a
- vrshrn.i32 d18, q3, #12 // t26a
+ vqrshrn.s32 d22, q2, #12 // t21a
+ vqrshrn.s32 d18, q3, #12 // t26a
vneg.s32 q4, q4 // -> t22a
vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a
- vrshrn.i32 d17, q4, #12 // t22a
- vrshrn.i32 d20, q1, #12 // t25a
+ vqrshrn.s32 d17, q4, #12 // t22a
+ vqrshrn.s32 d20, q1, #12 // t25a
vqsub.s16 d2, d27, d24 // t29
vqadd.s16 d27, d27, d24 // t30
@@ -2015,21 +2015,21 @@ function inv_dct32_odd_4h_x16_neon, export=1
vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a
vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a
vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19
- vrshrn.i32 d18, q2, #12 // t18a
- vrshrn.i32 d25, q3, #12 // t29a
+ vqrshrn.s32 d18, q2, #12 // t18a
+ vqrshrn.s32 d25, q3, #12 // t29a
vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28
vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20
- vrshrn.i32 d29, q4, #12 // t19
- vrshrn.i32 d24, q1, #12 // t28
+ vqrshrn.s32 d29, q4, #12 // t19
+ vqrshrn.s32 d24, q1, #12 // t28
vneg.s32 q2, q2 // -> t20
vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27
vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a
- vrshrn.i32 d26, q2, #12 // t20
- vrshrn.i32 d19, q3, #12 // t27
+ vqrshrn.s32 d26, q2, #12 // t20
+ vqrshrn.s32 d19, q3, #12 // t27
vneg.s32 q4, q4 // -> t21a
vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a
- vrshrn.i32 d20, q4, #12 // t21a
- vrshrn.i32 d28, q1, #12 // t26a
+ vqrshrn.s32 d20, q4, #12 // t21a
+ vqrshrn.s32 d28, q1, #12 // t26a
vqsub.s16 d2, d16, d30 // t23
vqadd.s16 d16, d16, d30 // t16 = out16
@@ -2051,24 +2051,24 @@ function inv_dct32_odd_4h_x16_neon, export=1
vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20
vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27
- vrshrn.i32 d20, q2, #12 // t20
- vrshrn.i32 d22, q3, #12 // t27
+ vqrshrn.s32 d20, q2, #12 // t20
+ vqrshrn.s32 d22, q3, #12 // t27
vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a
vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a
vmov d27, d22 // t27
- vrshrn.i32 d26, q2, #12 // t26a
+ vqrshrn.s32 d26, q2, #12 // t26a
vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22
vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25
- vrshrn.i32 d21, q3, #12 // t21a
- vrshrn.i32 d22, q12, #12 // t22
- vrshrn.i32 d25, q2, #12 // t25
+ vqrshrn.s32 d21, q3, #12 // t21a
+ vqrshrn.s32 d22, q12, #12 // t22
+ vqrshrn.s32 d25, q2, #12 // t25
vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a
vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a
- vrshrn.i32 d23, q2, #12 // t23a
- vrshrn.i32 d24, q3, #12 // t24a
+ vqrshrn.s32 d23, q2, #12 // t23a
+ vqrshrn.s32 d24, q3, #12 // t24a
bx lr
endfunc
@@ -2679,11 +2679,11 @@ function inv_dct64_step1_neon
vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a
vneg.s32 q2, q2 // t34a
vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a
- vrshrn.i32 d26, q2, #12 // t34a
+ vqrshrn.s32 d26, q2, #12 // t34a
vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a
- vrshrn.i32 d29, q3, #12 // t61a
- vrshrn.i32 d25, q4, #12 // t33a
- vrshrn.i32 d30, q2, #12 // t62a
+ vqrshrn.s32 d29, q3, #12 // t61a
+ vqrshrn.s32 d25, q4, #12 // t33a
+ vqrshrn.s32 d30, q2, #12 // t62a
vqadd.s16 d16, d24, d27 // t32a
vqsub.s16 d19, d24, d27 // t35a
@@ -2697,11 +2697,11 @@ function inv_dct64_step1_neon
vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a
vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a
vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60
- vrshrn.i32 d21, q2, #12 // t61a
- vrshrn.i32 d18, q3, #12 // t34a
+ vqrshrn.s32 d21, q2, #12 // t61a
+ vqrshrn.s32 d18, q3, #12 // t34a
vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35
- vrshrn.i32 d20, q4, #12 // t60
- vrshrn.i32 d19, q2, #12 // t35
+ vqrshrn.s32 d20, q4, #12 // t60
+ vqrshrn.s32 d19, q2, #12 // t35
vst1.16 {d16, d17, d18, d19}, [r6, :128]!
vst1.16 {d20, d21, d22, d23}, [r6, :128]!
@@ -2738,12 +2738,12 @@ function inv_dct64_step2_neon
vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a
vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a
vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a
- vrshrn.i32 d25, q2, #12 // t56a
- vrshrn.i32 d27, q3, #12 // t39a
+ vqrshrn.s32 d25, q2, #12 // t56a
+ vqrshrn.s32 d27, q3, #12 // t39a
vneg.s32 q4, q4 // t40a
vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a
- vrshrn.i32 d31, q4, #12 // t40a
- vrshrn.i32 d28, q2, #12 // t55a
+ vqrshrn.s32 d31, q4, #12 // t40a
+ vqrshrn.s32 d28, q2, #12 // t55a
vqadd.s16 d16, d24, d29 // t32a
vqsub.s16 d19, d24, d29 // t47a
@@ -2757,11 +2757,11 @@ function inv_dct64_step2_neon
vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a
vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a
vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47
- vrshrn.i32 d18, q2, #12 // t40a
- vrshrn.i32 d21, q3, #12 // t55a
+ vqrshrn.s32 d18, q2, #12 // t40a
+ vqrshrn.s32 d21, q3, #12 // t55a
vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48
- vrshrn.i32 d19, q4, #12 // t47
- vrshrn.i32 d20, q2, #12 // t48
+ vqrshrn.s32 d19, q4, #12 // t47
+ vqrshrn.s32 d20, q2, #12 // t48
vstr d16, [r6, #2*4*0] // t32a
vstr d17, [r9, #2*4*0] // t39
diff --git a/third_party/dav1d/src/arm/64/filmgrain.S b/third_party/dav1d/src/arm/64/filmgrain.S
new file mode 100644
index 0000000000000..6cdd7ec5fa399
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/filmgrain.S
@@ -0,0 +1,2010 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82 /* bytes per grain row; used below as the row stride of the buffers at x0 and x19 */
+#define GRAIN_HEIGHT 73 /* NOTE(review): presumably the row count of the full-size grain buffer - confirm */
+
+#define SUB_GRAIN_WIDTH 44 /* matches the 44 entries per row that get_grain_row_44 produces */
+#define SUB_GRAIN_HEIGHT 38 /* NOTE(review): presumably the row count of vertically subsampled grain - confirm */
+
+.macro increment_seed steps, shift=1 // advance the random state in w2 by "steps" bits at once; clobbers w11-w13
+ lsr w11, w2, #3
+ lsr w12, w2, #12
+ lsr w13, w2, #1
+ eor w11, w2, w11 // (r >> 0) ^ (r >> 3)
+ eor w12, w12, w13 // (r >> 12) ^ (r >> 1)
+ eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr w2, w2, #\steps // make room for the new bits at the top of the 16-bit state
+.endif
+ and w11, w11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr w2, w2, w11, lsl #(16 - \steps) // *state
+.else
+ orr w2, w2, w11, lsl #16 // *state, unshifted: new bits are parked at bit 16 and up, to be shifted in later one at a time
+.endif
+.endm
+
+.macro read_rand dest, bits, age // dest = "bits"-wide field of the state in x2; does not modify the state
+ ubfx \dest, x2, #16 - \bits - \age, #\bits // age 0 reads the top bits of the 16-bit state; each unit of age moves the field one bit lower
+.endm
+
+.macro read_shift_rand dest, bits // read one value from a state prepared by increment_seed with shift=0, then advance it
+ ubfx \dest, x2, #17 - \bits, #\bits // field ends at bit 16, i.e. it includes the lowest parked bit
+ lsr w2, w2, #1 // consume that bit so the next read sees the following one
+.endm
+
+// special calling convention:
+// w2 holds seed
+// x3 holds dav1d_gaussian_sequence
+// clobbers x11-x15
+// returns in v0.8h
+function get_gaussian_neon // fetch 8 16-bit table entries selected by 8 consecutive random values
+ increment_seed 4 // advance the state by 4 steps at once
+ read_rand x14, 11, 3 // 11-bit table index belonging to the oldest of the 4 steps
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1 // table entries are 16 bit wide, hence index * 2
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0 // newest step; has to be read before the state is advanced again
+ increment_seed 4 // second batch of 4 steps, feeding lanes 4-7
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ read_rand x14, 11, 3
+ ld1 {v0.h}[3], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 2
+ ld1 {v0.h}[4], [x14]
+ add x15, x3, x15, lsl #1
+ read_rand x14, 11, 1
+ ld1 {v0.h}[5], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[6], [x14]
+ ld1 {v0.h}[7], [x15]
+ ret
+endfunc
+
+.macro get_grain_row r0, r1, r2, r3, r4, r5 // produce one 82-entry row of 8-bit grain in r0-r4 (80 entries) and r5 (2 entries)
+ bl get_gaussian_neon // clobbers x30, so the enclosing function must have saved it
+ srshl \r5\().8h, v0.8h, v31.8h // rounding shift by the per-lane amounts in v31 (set by the caller; presumably negative, i.e. a right shift - confirm)
+ xtn \r0\().8b, \r5\().8h // r5 only serves as scratch until the final two entries
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r0\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r1\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r1\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r2\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r2\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r3\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r3\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r4\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r4\().16b, \r5\().8h
+ increment_seed 2 // only two more entries are needed to reach 82
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {\r5\().h}[0], [x14]
+ ld1 {\r5\().h}[1], [x15]
+ srshl v0.4h, \r5\().4h, v31.4h // only lanes 0-1 carry fresh data
+ xtn \r5\().8b, v0.8h // the two valid entries end up in bytes 0-1 of r5
+.endm
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5 // write one 82-byte row to [x0], advancing x0 by 82
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b,\r3\().16b}, [x0], #32
+ st1 {\r4\().16b}, [x0], #16
+ st1 {\r5\().h}[0], [x0], #2 // last two entries; 32 + 32 + 16 + 2 = 82 bytes in total
+.endm
+
+.macro get_grain_row_44 r0, r1, r2 // produce one 44-entry row of 8-bit grain in r0, r1 and the first 12 bytes of r2
+ bl get_gaussian_neon // clobbers x30, so the enclosing function must have saved it
+ srshl \r2\().8h, v0.8h, v31.8h // rounding shift by the per-lane amounts in v31 (set by the caller); r2 is scratch for now
+ xtn \r0\().8b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn2 \r0\().16b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn \r1\().8b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn2 \r1\().16b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn \r2\().8b, \r2\().8h // entries 32-39
+
+ increment_seed 4 // four more entries (40-43), fetched inline instead of a full batch of 8
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ ld1 {v0.h}[3], [x15]
+ srshl v0.4h, v0.4h, v31.4h // the 64-bit form also clears the upper half of v0
+ xtn2 \r2\().16b, v0.8h // bytes 8-11 of r2 are entries 40-43, bytes 12-15 are zero
+.endm
+
+.macro store_grain_row_44 r0, r1, r2 // write one narrow row to [x0] and advance x0 by a full GRAIN_WIDTH stride
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b}, [x0] // no post-increment: 48 bytes are written in total, the last 4 being padding
+ add x0, x0, #GRAIN_WIDTH-32 // 32 bytes were already added by the first store
+.endm
+
+function get_grain_2_neon // produce two 8-bit grain entries in v0.b[0-1]; same register convention as get_gaussian_neon, plus v31
+ increment_seed 2 // advance the state in w2 by two steps
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1 // x3 is the table base, entries are 16 bit wide
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ ld1 {v0.h}[1], [x15]
+ srshl v0.4h, v0.4h, v31.4h // rounding shift by the per-lane amounts in v31 (set by the caller)
+ xtn v0.8b, v0.8h // narrow to bytes; only bytes 0-1 are meaningful
+ ret
+endfunc
+
+.macro get_grain_2 dst // call get_grain_2_neon and deliver its result in dst
+ bl get_grain_2_neon // clobbers x30
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b // the copy is skipped when the caller asked for v0 itself
+.endif
+.endm
+
+// w15 holds the number of entries to produce
+// w14 holds the latest output entry; w16 (lag2, lag3) and w17 (lag3) hold the older ones
+// v0 holds the vector of produced entries
+// v1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon // serial part of the filter: each output depends on the outputs just produced
+1:
+ read_shift_rand x13, 11 // table index for this entry; also advances the state in w2
+ mov w11, v1.s[0] // 32-bit sum of the contributions from the rows above
+ ldrsh w12, [x3, x13, lsl #1] // signed 16-bit entry from the table at x3
+ ext v0.16b, v0.16b, v0.16b, #1 // rotate v0 down one byte so the new entry can go into lane 15
+.if \n == 1
+ madd w11, w14, w4, w11 // sum (above) + *coeff * prev output
+.elseif \n == 2
+ madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w14, w17, w11 // += *coeff * prev output 2 (for lag2, w17 is a coefficient, not an output)
+ mov w16, w14
+.else
+ madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w16, w20, w11 // += *coeff * prev output 2
+ madd w11, w14, w21, w11 // += *coeff * prev output 3
+ mov w17, w16
+ mov w16, w14
+.endif
+ add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
+ add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1)
+ asr w14, w14, w7 // >> ar_coeff_shift
+ asr w12, w12, w9 // >> (4 + grain_scale_shift)
+ add w14, w14, w12
+ cmp w14, w5
+ csel w14, w14, w5, le // clamp to the upper bound in w5
+ cmp w14, w6
+ csel w14, w14, w6, ge // clamp to the lower bound in w6
+ subs w15, w15, #1
+ ext v1.16b, v1.16b, v1.16b, #4 // rotate the next 32-bit sum into lane 0
+ ins v0.b[15], w14
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+function sum_lag1_above_neon // v4-v7 = 16 32-bit sums of three weighted byte vectors; clobbers v0-v3
+ smull v2.8h, v3.8b, v28.8b // v3 (the unshifted row above, see sum_lag1) times the byte weights in v28
+ smull2 v3.8h, v3.16b, v28.16b
+ smull v4.8h, v0.8b, v27.8b // v0 (the row above shifted to the left neighbours) times v27
+ smull2 v5.8h, v0.16b, v27.16b
+ smull v6.8h, v1.8b, v29.8b // v1 (the row above shifted to the right neighbours) times v29
+ smull2 v7.8h, v1.16b, v29.16b
+ saddl v0.4s, v2.4h, v4.4h // widen to 32 bit while adding
+ saddl2 v1.4s, v2.8h, v4.8h
+ saddl v2.4s, v3.4h, v5.4h
+ saddl2 v3.4s, v3.8h, v5.8h
+ saddw v4.4s, v0.4s, v6.4h // entries 0-3
+ saddw2 v5.4s, v1.4s, v6.8h // entries 4-7
+ saddw v6.4s, v2.4s, v7.4h // entries 8-11
+ saddw2 v7.4s, v3.4s, v7.8h // entries 12-15
+ ret
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff // body shared by all sum_*_neon functions; x30 must already be pushed
+ bl sum_\lag\()_above_neon // v4-v7 = 16 32-bit sums from the rows above
+.ifc \type, uv_420
+ add x12, x19, #GRAIN_WIDTH // second source row, one stride below x19 (x19 presumably points into the luma grain - confirm)
+ ld1 {v22.16b, v23.16b}, [x19], #32
+ ld1 {v24.16b, v25.16b}, [x12]
+ saddlp v22.8h, v22.16b // add horizontally adjacent pairs
+ saddlp v23.8h, v23.16b
+ saddlp v24.8h, v24.16b
+ saddlp v25.8h, v25.16b
+ add v22.8h, v22.8h, v24.8h // add the two rows
+ add v23.8h, v23.8h, v25.8h
+ rshrn v0.8b, v22.8h, #2 // rounded average over each 2x2 group
+ rshrn2 v0.16b, v23.8h, #2
+.endif
+.ifc \type, uv_422
+ ld1 {v22.16b, v23.16b}, [x19], #32
+ saddlp v22.8h, v22.16b
+ saddlp v23.8h, v23.16b
+ rshrn v0.8b, v22.8h, #1 // rounded average over each horizontal pair
+ rshrn2 v0.16b, v23.8h, #1
+.endif
+.ifc \type, uv_444
+ ld1 {v0.16b}, [x19], #16 // no subsampling: use the 16 source bytes directly
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ dup v1.16b, \uv_coeff // broadcast the single coefficient byte named by the caller
+ smull v2.8h, v0.8b, v1.8b
+ smull2 v3.8h, v0.16b, v1.16b
+.else
+ smull v2.8h, v0.8b, v30.8b // without uv_coeff, v30 is used as the coefficient vector as is
+ smull2 v3.8h, v0.16b, v30.16b
+.endif
+ saddw v4.4s, v4.4s, v2.4h // add the weighted source values to the sums from above
+ saddw2 v5.4s, v5.4s, v2.8h
+ saddw v6.4s, v6.4s, v3.4h
+ saddw2 v7.4s, v7.4s, v3.8h
+.endif
+.if \uv_layout && \elems == 16 // from here on identical to the y variant, so jump into its tail instead of duplicating it
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 15
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 9
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start: // shared tail, also the branch target of the variants above
+.ifc \edge, left
+ increment_seed 4
+ read_rand x12, 11, 3 // the first three entries of the row are taken from the table without filtering
+ read_rand x13, 11, 2
+ read_rand x14, 11, 1
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v0.h}[5], [x12]
+ ld1 {v0.h}[6], [x13]
+ ld1 {v0.h}[7], [x14]
+ lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ srshl v0.8h, v0.8h, v31.8h
+ xtn2 v0.16b, v0.8h // the three entries land in bytes 13-15
+ ext v4.16b, v4.16b, v4.16b, #12 // rotate the sum of the fourth entry into lane 0
+.ifc \lag, lag3
+ smov w17, v0.b[13] // seed the previous-output registers with the unfiltered entries
+.endif
+.ifnc \lag, lag1
+ smov w16, v0.b[14]
+.endif
+ smov w14, v0.b[15]
+
+ mov v1.16b, v4.16b
+ mov w15, #1 // only one filtered entry completes this group of four
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0 // park 4 new bits; output_lag shifts them in one per entry
+ mov v1.16b, v4.16b
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ mov v1.16b, v5.16b
+ mov w15, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ mov v1.16b, v6.16b
+.if \elems == 9
+ mov w15, #1 // ninth and last filtered entry
+ bl output_\lag\()_neon
+ lsr w2, w2, #3 // shift in the three parked bits that output_lag did not consume
+
+ read_rand x12, 11, 2 // three more entries taken from the table without filtering
+ read_rand x13, 11, 1
+ read_rand x14, 11, 0
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v1.h}[0], [x12]
+ ld1 {v1.h}[1], [x13]
+ ld1 {v1.h}[2], [x14]
+ srshl v1.4h, v1.4h, v31.4h
+ xtn v1.8b, v1.8h
+ ext v0.16b, v0.16b, v1.16b, #7 // bytes 0-8: the 9 filtered entries, bytes 9-11: the 3 unfiltered ones
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ mov v1.16b, v7.16b
+
+.ifc \edge, right
+ mov w15, #3 // 15 filtered entries in total
+ bl output_\lag\()_neon
+ read_shift_rand x15, 11 // the 16th entry is taken from the table without filtering
+ add x15, x3, x15, lsl #1
+ ld1 {v1.h}[0], [x15]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #1 // append the low byte of that entry after the 15 filtered ones
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+.endif
+.if \store
+ st1 {v0.16b}, [x0], #16 // write the 16 finished entries to the output row
+.endif
+ ldr x30, [sp], #16 // pop the return address pushed by the enclosing function
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=16 // emit sum_(type)_lag1_(edge)_neon; elems = number of filtered entries
+function sum_\type\()_lag1_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]! // the body makes nested calls, so the return address is kept on the stack
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 // store=0: the result is returned in v0, not written to [x0]
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 15 // right edge: 15 filtered entries, the last one is unfiltered
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 15
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 9 // narrow rows: 9 filtered entries, then 3 unfiltered ones
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 9
+
+.macro sum_lag1 type, dst, left, mid, right, edge=mid // filter 16 entries with lag 1; left/mid/right are three adjacent 16-byte blocks of the row above
+ mov v3.16b, \mid\().16b // entries directly above
+ ext v0.16b, \left\().16b, \mid\().16b, #15 // entries above, one position to the left
+ ext v1.16b, \mid\().16b, \right\().16b, #1 // entries above, one position to the right
+ bl sum_\type\()_lag1_\edge\()_neon
+ mov \dst\().16b, v0.16b // the lag1 functions return in v0 instead of storing
+.endm
+
+.macro sum_y_lag1 dst, left, mid, right, edge=mid // sum_lag1 with type fixed to y
+ sum_lag1 y, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid // sum_lag1 with type fixed to uv_444
+ sum_lag1 uv_444, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid // sum_lag1 with type fixed to uv_422
+ sum_lag1 uv_422, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid // sum_lag1 with type fixed to uv_420
+ sum_lag1 uv_420, \dst, \left, \mid, \right, \edge
+.endm
+
+
+function sum_lag2_above_neon // v4-v7 = 16 32-bit sums over a 5-wide window in each of the two rows above; coefficients in v30.b[0-9]
+ sub x12, x0, #2*GRAIN_WIDTH - 16 // two rows up, 16 bytes to the right of the current position
+ sub x13, x0, #1*GRAIN_WIDTH - 16 // one row up, 16 bytes to the right
+ ld1 {v18.16b}, [x12] // load top right
+ ld1 {v21.16b}, [x13]
+
+ ext v22.16b, v16.16b, v17.16b, #14 // top left, top mid
+ dup v26.16b, v30.b[0]
+ ext v23.16b, v16.16b, v17.16b, #15
+ dup v27.16b, v30.b[1]
+ ext v0.16b, v17.16b, v18.16b, #1 // top mid, top right
+ dup v28.16b, v30.b[3]
+ ext v1.16b, v17.16b, v18.16b, #2
+ dup v29.16b, v30.b[4]
+
+ smull v2.8h, v22.8b, v26.8b
+ smull2 v3.8h, v22.16b, v26.16b
+ smull v4.8h, v23.8b, v27.8b
+ smull2 v5.8h, v23.16b, v27.16b
+ smull v6.8h, v0.8b, v28.8b
+ smull2 v7.8h, v0.16b, v28.16b
+ smull v0.8h, v1.8b, v29.8b
+ smull2 v1.8h, v1.16b, v29.16b
+ saddl v22.4s, v2.4h, v4.4h
+ saddl2 v23.4s, v2.8h, v4.8h
+ saddl v26.4s, v3.4h, v5.4h
+ saddl2 v27.4s, v3.8h, v5.8h
+ saddl v2.4s, v0.4h, v6.4h
+ saddl2 v3.4s, v0.8h, v6.8h
+ saddl v6.4s, v1.4h, v7.4h
+ saddl2 v7.4s, v1.8h, v7.8h
+ add v4.4s, v22.4s, v2.4s // row -2 so far: the four off-centre taps
+ add v5.4s, v23.4s, v3.4s
+ add v6.4s, v26.4s, v6.4s
+ add v7.4s, v27.4s, v7.4s
+
+ ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid
+ dup v26.16b, v30.b[5]
+ ext v23.16b, v19.16b, v20.16b, #15
+ dup v27.16b, v30.b[6]
+ ext v0.16b, v20.16b, v21.16b, #1 // top mid, top right
+ dup v28.16b, v30.b[8]
+ ext v1.16b, v20.16b, v21.16b, #2
+ dup v29.16b, v30.b[9]
+
+ smull v2.8h, v22.8b, v26.8b
+ smull2 v3.8h, v22.16b, v26.16b
+ smull v22.8h, v23.8b, v27.8b
+ smull2 v23.8h, v23.16b, v27.16b
+ smull v26.8h, v0.8b, v28.8b
+ smull2 v27.8h, v0.16b, v28.16b
+ smull v28.8h, v1.8b, v29.8b
+ smull2 v29.8h, v1.16b, v29.16b
+ saddl v0.4s, v2.4h, v22.4h
+ saddl2 v1.4s, v2.8h, v22.8h
+ saddl v2.4s, v3.4h, v23.4h
+ saddl2 v3.4s, v3.8h, v23.8h
+ saddl v22.4s, v26.4h, v28.4h
+ saddl2 v23.4s, v26.8h, v28.8h
+ saddl v26.4s, v27.4h, v29.4h
+ saddl2 v27.4s, v27.8h, v29.8h
+ add v0.4s, v0.4s, v22.4s // row -1: the four off-centre taps
+ add v1.4s, v1.4s, v23.4s
+ add v2.4s, v2.4s, v26.4s
+ add v3.4s, v3.4s, v27.4s
+ dup v26.16b, v30.b[2] // centre tap of row -2
+ dup v27.16b, v30.b[7] // centre tap of row -1
+ smull v22.8h, v17.8b, v26.8b
+ smull2 v23.8h, v17.16b, v26.16b
+ smull v24.8h, v20.8b, v27.8b
+ smull2 v25.8h, v20.16b, v27.16b
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+
+ mov v16.16b, v17.16b // slide the row -2 window: mid becomes left, right becomes mid
+ mov v17.16b, v18.16b
+
+ saddl v0.4s, v22.4h, v24.4h
+ saddl2 v1.4s, v22.8h, v24.8h
+ saddl v2.4s, v23.4h, v25.4h
+ saddl2 v3.4s, v23.8h, v25.8h
+ mov v19.16b, v20.16b // same slide for the row -1 window
+ mov v20.16b, v21.16b
+ add v4.4s, v4.4s, v0.4s // final sums, entries 0-3 in v4 up to 12-15 in v7
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ ret
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=16 // emit sum_(type)_lag2_(edge)_neon; elems = number of filtered entries
+function sum_\type\()_lag2_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]! // the body makes nested calls, so the return address is kept on the stack
+.ifc \edge, left
+ sub x12, x0, #2*GRAIN_WIDTH // at the start of a row there is no window carried over from a previous call
+ sub x13, x0, #1*GRAIN_WIDTH
+ ld1 {v17.16b}, [x12] // load the previous block right above
+ ld1 {v20.16b}, [x13]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12] // store=1: the result is written to [x0]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 15
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 15
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 9
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 9
+
+
+function sum_lag3_above_neon // v4-v7 = 16 32-bit sums over a 7-wide window in each of the three rows above; coefficients in v29.b[0-15] and v30.b[0-4]
+ sub x11, x0, #3*GRAIN_WIDTH - 16 // three rows up, 16 bytes to the right of the current position
+ sub x12, x0, #2*GRAIN_WIDTH - 16
+ sub x13, x0, #1*GRAIN_WIDTH - 16
+ ld1 {v15.16b}, [x11] // load top right
+ ld1 {v18.16b}, [x12]
+ ld1 {v21.16b}, [x13]
+
+ ext v8.16b, v13.16b, v14.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[0]
+ ext v9.16b, v13.16b, v14.16b, #14
+ dup v23.16b, v29.b[1]
+ ext v10.16b, v13.16b, v14.16b, #15
+ dup v24.16b, v29.b[2]
+ dup v25.16b, v29.b[3] // centre tap of row -3
+ ext v11.16b, v14.16b, v15.16b, #1 // top mid, top right
+ dup v26.16b, v29.b[4]
+ ext v12.16b, v14.16b, v15.16b, #2
+ dup v27.16b, v29.b[5]
+ ext v13.16b, v14.16b, v15.16b, #3
+ dup v28.16b, v29.b[6]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v14.8b, v25.8b
+ smull2 v13.8h, v14.16b, v25.16b
+ add v4.4s, v22.4s, v0.4s // row -3 starts the accumulators v4-v7
+ add v5.4s, v23.4s, v1.4s
+ add v6.4s, v24.4s, v2.4s
+ add v7.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v13.4h
+ saddw2 v7.4s, v7.4s, v13.8h
+
+ ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[7]
+ ext v9.16b, v16.16b, v17.16b, #14
+ dup v23.16b, v29.b[8]
+ ext v10.16b, v16.16b, v17.16b, #15
+ dup v24.16b, v29.b[9]
+ dup v25.16b, v29.b[10] // centre tap of row -2
+ ext v11.16b, v17.16b, v18.16b, #1 // top mid, top right
+ dup v26.16b, v29.b[11]
+ ext v12.16b, v17.16b, v18.16b, #2
+ dup v27.16b, v29.b[12]
+ ext v13.16b, v17.16b, v18.16b, #3
+ dup v28.16b, v29.b[13]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v17.8b, v25.8b
+ smull2 v13.8h, v17.16b, v25.16b
+ add v22.4s, v22.4s, v0.4s
+ add v23.4s, v23.4s, v1.4s
+ add v24.4s, v24.4s, v2.4s
+ add v26.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v22.4s // fold the row -2 contributions into the accumulators
+ add v5.4s, v5.4s, v23.4s
+ add v6.4s, v6.4s, v24.4s
+ add v7.4s, v7.4s, v26.4s
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v13.4h
+ saddw2 v7.4s, v7.4s, v13.8h
+
+ ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[14]
+ ext v9.16b, v19.16b, v20.16b, #14
+ dup v23.16b, v29.b[15]
+ ext v10.16b, v19.16b, v20.16b, #15
+ dup v24.16b, v30.b[0] // the coefficient list continues in v30
+ dup v25.16b, v30.b[1] // centre tap of row -1
+ ext v11.16b, v20.16b, v21.16b, #1 // top mid, top right
+ dup v26.16b, v30.b[2]
+ ext v12.16b, v20.16b, v21.16b, #2
+ dup v27.16b, v30.b[3]
+ ext v13.16b, v20.16b, v21.16b, #3
+ dup v28.16b, v30.b[4]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v20.8b, v25.8b
+ smull2 v19.8h, v20.16b, v25.16b // v19 serves as scratch: its old contents were consumed by the ext above and it is reloaded from v20 below
+ add v22.4s, v22.4s, v0.4s
+ add v23.4s, v23.4s, v1.4s
+ add v24.4s, v24.4s, v2.4s
+ add v26.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v22.4s // fold the row -1 contributions into the accumulators
+ add v5.4s, v5.4s, v23.4s
+ add v6.4s, v6.4s, v24.4s
+ add v7.4s, v7.4s, v26.4s
+ mov v13.16b, v14.16b // slide the row -3 window: mid becomes left, right becomes mid
+ mov v14.16b, v15.16b
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ mov v16.16b, v17.16b // same slide for the row -2 window
+ mov v17.16b, v18.16b
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v19.4h
+ saddw2 v7.4s, v7.4s, v19.8h
+
+ mov v19.16b, v20.16b // same slide for the row -1 window
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=16 // emit sum_(type)_lag3_(edge)_neon; elems = number of filtered entries
+function sum_\type\()_lag3_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]! // the body makes nested calls, so the return address is kept on the stack
+.ifc \edge, left
+ sub x11, x0, #3*GRAIN_WIDTH // at the start of a row there is no window carried over from a previous call
+ sub x12, x0, #2*GRAIN_WIDTH
+ sub x13, x0, #1*GRAIN_WIDTH
+ ld1 {v14.16b}, [x11] // load the previous block right above
+ ld1 {v17.16b}, [x12]
+ ld1 {v20.16b}, [x13]
+.endif
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8] // store=1: the result is written to [x0]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 15
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 15
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 9
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 9
+
+function generate_grain_rows_neon // runs get_grain_row + store_grain_row once per row, w1 = number of rows
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]! // save the link register (restored before ret below)
+1:
+ get_grain_row v16, v17, v18, v19, v20, v21
+ subs w1, w1, #1 // count this row and set the flags for the branch below
+ store_grain_row v16, v17, v18, v19, v20, v21
+ b.gt 1b // consumes the flags from subs, so store_grain_row has to leave them untouched
+ ldr x30, [sp], #16 // restore the link register
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function generate_grain_rows_44_neon // same loop as generate_grain_rows_neon but built on the _44 row macros, w1 = number of rows
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]! // save the link register (restored before ret below)
+1:
+ get_grain_row_44 v16, v17, v18
+ subs w1, w1, #1 // count this row and set the flags for the branch below
+ store_grain_row_44 v16, v17, v18
+ b.gt 1b // consumes the flags from subs, so store_grain_row_44 has to leave them untouched
+ ldr x30, [sp], #16 // restore the link register
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function get_grain_row_neon // callable wrapper around the get_grain_row macro, operating on v16-v21
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]! // save the link register
+ get_grain_row v16, v17, v18, v19, v20, v21
+ ldr x30, [sp], #16 // restore the link register
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function get_grain_row_44_neon // callable wrapper around the get_grain_row_44 macro, operating on v16-v18
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]! // save the link register
+ get_grain_row_44 v16, v17, v18
+ ldr x30, [sp], #16 // restore the link register
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function add_uv_444_coeff_lag0_neon // in: v0 = luma grain bytes, v1 = grain bytes, v27 = coefficient, v28 = shift. out: v2
+add_coeff_lag0_start: // shared tail, the 420 and 422 variants branch here after building v0
+ smull v2.8h, v0.8b, v27.8b // low 8 lanes: v0 * v27, widened to 16 bit
+ smull2 v3.8h, v0.16b, v27.16b // high 8 lanes
+ srshl v2.8h, v2.8h, v28.8h // rounding shift by v28 (the callers in this file pass a negated count, i.e. a right shift)
+ srshl v3.8h, v3.8h, v28.8h
+ saddw v2.8h, v2.8h, v1.8b // add the sign-extended bytes of v1
+ saddw2 v3.8h, v3.8h, v1.16b
+ sqxtn v2.8b, v2.8h // saturating narrow back to signed bytes
+ sqxtn2 v2.16b, v3.8h
+ ret
+endfunc
+
+function add_uv_420_coeff_lag0_neon // in: x19 and x12 = two source rows, v0 = byte mask, v1 = grain bytes. out: v2
+ ld1 {v4.16b, v5.16b}, [x19], #32 // 32 bytes from the row at x19, post-incremented
+ ld1 {v6.16b, v7.16b}, [x12], #32 // 32 bytes from the row at x12, post-incremented
+ saddlp v4.8h, v4.16b // add horizontally adjacent signed byte pairs
+ saddlp v5.8h, v5.16b
+ saddlp v6.8h, v6.16b
+ saddlp v7.8h, v7.16b
+ add v4.8h, v4.8h, v6.8h // add the two rows, giving the sum of each 2x2 group
+ add v5.8h, v5.8h, v7.8h
+ rshrn v4.8b, v4.8h, #2 // rounded divide by 4 and narrow to bytes
+ rshrn2 v4.16b, v5.8h, #2
+ and v0.16b, v4.16b, v0.16b // keep only the lanes enabled in the incoming mask
+ b add_coeff_lag0_start // tail call into the shared multiply/shift/add code, which returns to our caller
+endfunc
+
+function add_uv_422_coeff_lag0_neon // in: x19 = source row, v0 = byte mask, v1 = grain bytes. out: v2
+ ld1 {v4.16b, v5.16b}, [x19], #32 // 32 bytes from the row at x19, post-incremented
+ saddlp v4.8h, v4.16b // add horizontally adjacent signed byte pairs
+ saddlp v5.8h, v5.16b
+ rshrn v4.8b, v4.8h, #1 // rounded divide by 2 and narrow to bytes
+ rshrn2 v4.16b, v5.8h, #1
+ and v0.16b, v4.16b, v0.16b // keep only the lanes enabled in the incoming mask
+ b add_coeff_lag0_start // tail call into the shared multiply/shift/add code, which returns to our caller
+endfunc
+
+.macro gen_grain_82 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]! // 96-byte frame: x30/x19 here, d8-d15 and x20/x21 are stored only on the lag3 path
+
+.ifc \type, uv_444
+ mov w13, w3 // w13 = fourth argument, scaled below into an ar_coeffs_uv byte offset
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH // x19 = second argument + 3 rows, read as the luma grain source in the loops below
+ mov x1, x2 // from here on x1 is the base of the FGD_* field loads, as in the y variant
+ mul w13, w13, w14 // w13 *= 28
+.endif
+ movrel x3, X(gaussian_sequence)
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add x4, x1, #FGD_AR_COEFFS_Y
+.else
+ add x4, x1, #FGD_AR_COEFFS_UV
+.endif
+ adr x16, L(gen_grain_\type\()_tbl) // x16 = address of the jump table at the end of this function
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1] // 16-bit table entry selected by ar_coeff_lag
+ dup v31.8h, w9 // 4 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw // entries hold (table - label), so subtracting yields the label address
+ neg v31.8h, v31.8h // negated, presumably for use as a right-shift count in the get_grain macros - confirm
+
+.ifc \type, uv_444
+ cmp w13, #0 // flags are consumed by the csel below
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne // w11 = 0x49d8 if w13 != 0, else 0xb524
+.endif
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ mov w5, #127 // w5/w6 = 127/-128, presumably signed 8-bit clamp limits for the lag code - confirm
+ mov w6, #-128
+
+.ifc \type, uv_444
+ eor w2, w2, w11 // xor the constant selected above into the seed
+.endif
+
+ br x16 // jump to the lag0..lag3 handler picked above
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, y
+ mov w1, #GRAIN_HEIGHT // lag 0 luma: all rows are generated and stored without further filtering
+ bl generate_grain_rows_neon
+.else
+ dup v28.8h, w7 // ar_coeff_shift in every halfword lane
+ ld1r {v27.16b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ ext v29.16b, v0.16b, v1.16b, #13 // mask: first 3 bytes 0x00, remaining 13 bytes 0xff
+ ext v30.16b, v1.16b, v0.16b, #1 // mask: first 15 bytes 0xff, last byte 0x00
+ neg v28.8h, v28.8h // negative count, so srshl in the helper shifts right with rounding
+
+ mov w1, #3 // the first 3 rows are generated without any luma contribution
+ bl generate_grain_rows_neon
+ mov w1, #GRAIN_HEIGHT-3
+1:
+ ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64 // 64 luma grain bytes
+ bl get_grain_row_neon // runs get_grain_row on v16-v21
+ and v0.16b, v22.16b, v29.16b // zero the luma for the first 3 entries
+ mov v1.16b, v16.16b
+ bl add_uv_444_coeff_lag0_neon
+ mov v0.16b, v23.16b
+ mov v1.16b, v17.16b
+ mov v16.16b, v2.16b // keep the result of the previous helper call
+ bl add_uv_444_coeff_lag0_neon
+ ld1 {v26.16b}, [x19], #16 // 16 more luma grain bytes (80 consumed so far in this row)
+ mov v0.16b, v24.16b
+ mov v1.16b, v18.16b
+ mov v17.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ add x19, x19, #2 // 64 + 16 + 2: x19 advances by 82 bytes per iteration
+ mov v0.16b, v25.16b
+ mov v1.16b, v19.16b
+ mov v18.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ and v0.16b, v26.16b, v30.16b // zero the luma for the last byte of this chunk
+ mov v1.16b, v20.16b
+ mov v19.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ mov v20.16b, v2.16b
+ subs w1, w1, #1 // flags are consumed by b.gt after the store macro
+ store_grain_row v16, v17, v18, v19, v20, v21
+ b.gt 1b
+.endif
+ ldp x30, x19, [sp], #96 // release the frame allocated at function entry
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.16b}, [x4], #1 // ar_coeffs_y[0]
+ ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1]
+ ld1r {v29.16b}, [x4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb w4, [x4, #1] // ar_coeffs_y[3]
+.else
+ add x4, x4, #2 // x4 now points at coefficient [4]
+.endif
+
+ mov w1, #3 // first 3 rows: plain generated grain
+.ifc \type, uv_444
+ ld1r {v30.16b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ sum_\type\()_lag1 v22, v16, v16, v17, left
+ sum_\type\()_lag1 v23, v16, v17, v18
+ sum_\type\()_lag1 v24, v17, v18, v19
+ sum_\type\()_lag1 v25, v18, v19, v20
+ sum_\type\()_lag1 v20, v19, v20, v21, right
+ get_grain_2 v21
+ subs w1, w1, #1 // flags are consumed by b.gt at the end of the loop
+.ifc \type, uv_444
+ add x19, x19, #2 // NOTE(review): presumably completes the luma row advance to GRAIN_WIDTH after the sum macros - confirm
+.endif
+ store_grain_row v22, v23, v24, v25, v20, v21
+ mov v16.16b, v22.16b // carry the row just produced over as input to the next iteration
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ mov v19.16b, v25.16b
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96 // release the frame allocated at function entry
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10] // sign-extended coefficient [10]
+ smov w17, v30.b[11] // sign-extended coefficient [11]
+
+ mov w1, #3 // first 3 rows: plain generated grain
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ get_grain_2 v16
+ subs w1, w1, #1 // flags are consumed by b.gt at the end of the loop
+.ifc \type, uv_444
+ add x19, x19, #2 // NOTE(review): presumably completes the luma row advance to GRAIN_WIDTH - confirm
+.endif
+ st1 {v16.h}[0], [x0], #2 // store the 2 bytes from get_grain_2 and advance the output pointer
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96 // release the frame allocated at function entry
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+ stp d8, d9, [sp, #16] // this path stores d8-d15 and x20/x21 into the frame and reloads them before returning
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5] // sign-extended coefficient [21] (byte 5 of the second 16-byte register)
+ smov w20, v30.b[6] // sign-extended coefficient [22]
+ smov w21, v30.b[7] // sign-extended coefficient [23]
+
+ mov w1, #3 // first 3 rows: plain generated grain
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ get_grain_2 v16
+ subs w1, w1, #1 // flags are consumed by b.gt at the end of the loop
+.ifc \type, uv_444
+ add x19, x19, #2 // NOTE(review): presumably completes the luma row advance to GRAIN_WIDTH - confirm
+.endif
+ st1 {v16.h}[0], [x0], #2 // store the 2 bytes from get_grain_2 and advance the output pointer
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80] // reload everything stored at the top of this path
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96 // release the frame allocated at function entry
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl): // jump table indexed by ar_coeff_lag, each entry is (table address - handler address)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_82 y // emits generate_grain_y_8bpc_neon
+gen_grain_82 uv_444 // emits generate_grain_uv_444_8bpc_neon
+
+.macro set_height dst, type
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3 // uv_420 uses the subsampled height, minus the 3 rows generated up front
+.else
+ mov \dst, #GRAIN_HEIGHT-3 // all other types use the full height, minus the 3 rows generated up front
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH-(3*32) // correction by 3*32 (the lag0 helpers post-increment by 32, three times per row) then two rows down
+.else
+ sub \reg, \reg, #3*32-GRAIN_WIDTH // same 3*32 correction, net effect is one row down
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]! // 96-byte frame: x30/x19 here, d8-d15 and x20/x21 are stored only on the lag3 path
+
+ mov w13, w3 // w13 = fourth argument, scaled below into an ar_coeffs_uv byte offset
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH-3 // x19 = second argument + 3 rows - 3 bytes, read as the luma grain source below
+ mov x1, x2 // from here on x1 is the base of the FGD_* field loads
+ mul w13, w13, w14 // w13 *= 28
+
+ movrel x3, X(gaussian_sequence)
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+ add x4, x1, #FGD_AR_COEFFS_UV
+ adr x16, L(gen_grain_\type\()_tbl) // x16 = address of the jump table at the end of this function
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1] // 16-bit table entry selected by ar_coeff_lag
+ dup v31.8h, w9 // 4 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw // entries hold (table - label), so subtracting yields the label address
+ neg v31.8h, v31.8h // negated, presumably for use as a right-shift count in the get_grain macros - confirm
+
+ cmp w13, #0 // flags are consumed by the csel below
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne // w11 = 0x49d8 if w13 != 0, else 0xb524
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ mov w5, #127 // w5/w6 = 127/-128, presumably signed 8-bit clamp limits for the lag code - confirm
+ mov w6, #-128
+
+ eor w2, w2, w11 // xor the constant selected above into the seed
+
+ br x16 // jump to the lag0..lag3 handler picked above
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.8h, w7 // ar_coeff_shift in every halfword lane
+ ld1r {v27.16b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ ext v29.16b, v0.16b, v1.16b, #13 // mask: first 3 bytes 0x00, remaining 13 bytes 0xff
+ ext v30.16b, v1.16b, v0.16b, #7 // mask: first 9 bytes 0xff, remaining 7 bytes 0x00
+ neg v28.8h, v28.8h // negative count, so srshl in the helper shifts right with rounding
+
+ mov w1, #3 // the first 3 rows are generated without any luma contribution
+ bl generate_grain_rows_44_neon
+ set_height w1, \type
+1:
+ bl get_grain_row_44_neon // runs get_grain_row_44 on v16-v18
+.ifc \type, uv_420
+ add x12, x19, #GRAIN_WIDTH // second luma row, read by add_uv_420_coeff_lag0_neon
+.endif
+ mov v0.16b, v29.16b // left chunk: mask off the first 3 entries
+ mov v1.16b, v16.16b
+ bl add_\type\()_coeff_lag0_neon
+ movi v0.16b, #255 // middle chunk: all lanes enabled
+ mov v1.16b, v17.16b
+ mov v16.16b, v2.16b // keep the result of the previous helper call
+ bl add_\type\()_coeff_lag0_neon
+ mov v0.16b, v30.16b // right chunk: only the first 9 entries enabled
+ mov v1.16b, v18.16b
+ mov v17.16b, v2.16b
+ bl add_\type\()_coeff_lag0_neon
+ mov v18.16b, v2.16b
+ subs w1, w1, #1 // flags are consumed by b.gt after the macros below
+ increment_y_ptr x19, \type
+ store_grain_row_44 v16, v17, v18
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96 // release the frame allocated at function entry
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.16b}, [x4], #1 // ar_coeffs_uv[0]
+ ld1r {v28.16b}, [x4], #1 // ar_coeffs_uv[1]
+ ld1r {v29.16b}, [x4] // ar_coeffs_uv[2]
+ add x4, x4, #2 // x4 now points at ar_coeffs_uv[4]
+
+ mov w1, #3 // first 3 rows: plain generated grain
+ ld1r {v30.16b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ sum_\type\()_lag1 v20, v16, v16, v17, left
+ sum_\type\()_lag1 v21, v16, v17, v18
+ sum_\type\()_lag1 v18, v17, v18, v18, right
+ subs w1, w1, #1 // flags are consumed by b.gt at the end of the loop
+ increment_y_ptr x19, \type
+ store_grain_row_44 v20, v21, v18
+ mov v16.16b, v20.16b // carry the row just produced over as input to the next iteration
+ mov v17.16b, v21.16b
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96 // release the frame allocated at function entry
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10] // sign-extended ar_coeffs_uv[10]
+ smov w17, v30.b[11] // sign-extended ar_coeffs_uv[11]
+
+ mov w1, #3 // first 3 rows: plain generated grain
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ subs w1, w1, #1 // flags are consumed by b.gt at the end of the loop
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH-48 // NOTE(review): presumably the three calls above advanced x0 by 48, so this lands on the next row - confirm
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96 // release the frame allocated at function entry
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ldr q29, [x4] // ar_coeffs_uv[0-15]
+ ldr q30, [x4, #16] // ar_coeffs_uv[16-24]
+ stp d8, d9, [sp, #16] // this path stores d8-d15 and x20/x21 into the frame and reloads them before returning
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5] // sign-extended ar_coeffs_uv[21]
+ smov w20, v30.b[6] // sign-extended ar_coeffs_uv[22]
+ smov w21, v30.b[7] // sign-extended ar_coeffs_uv[23]
+
+ mov w1, #3 // first 3 rows: plain generated grain
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ subs w1, w1, #1 // flags are consumed by b.gt at the end of the loop
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH-48 // NOTE(review): presumably the three calls above advanced x0 by 48, so this lands on the next row - confirm
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80] // reload everything stored at the top of this path
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96 // release the frame allocated at function entry
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl): // jump table indexed by ar_coeff_lag, each entry is (table address - handler address)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_44 uv_420 // emits generate_grain_uv_420_8bpc_neon
+gen_grain_44 uv_422 // emits generate_grain_uv_422_8bpc_neon
+
+.macro gather_interleaved dst1, dst2, src1, src2, off
+ umov w14, \src1[0+\off] // byte lane -> index. This macro does 8 table lookups and uses w14-w17 as scratch
+ umov w15, \src2[8+\off]
+ umov w16, \src1[2+\off]
+ add x14, x14, x3 // x3 = table base, so x14 = address of table[index]
+ umov w17, \src2[10+\off]
+ add x15, x15, x3
+ ld1 {\dst1}[0+\off], [x14] // write the looked-up byte into the same lane number the index came from
+ umov w14, \src1[4+\off] // scratch registers are reused as soon as their load has been issued
+ add x16, x16, x3
+ ld1 {\dst2}[8+\off], [x15]
+ umov w15, \src2[12+\off]
+ add x17, x17, x3
+ ld1 {\dst1}[2+\off], [x16]
+ umov w16, \src1[6+\off]
+ add x14, x14, x3
+ ld1 {\dst2}[10+\off], [x17]
+ umov w17, \src2[14+\off]
+ add x15, x15, x3
+ ld1 {\dst1}[4+\off], [x14]
+ add x16, x16, x3
+ ld1 {\dst2}[12+\off], [x15]
+ add x17, x17, x3
+ ld1 {\dst1}[6+\off], [x16]
+ ld1 {\dst2}[14+\off], [x17]
+.endm
+
+.macro gather dst1, dst2, src1, src2
+ gather_interleaved \dst1, \dst2, \src1, \src2, 0 // even lanes: 0-6 of src1 -> dst1, 8-14 of src2 -> dst2
+ gather_interleaved \dst2, \dst1, \src2, \src1, 0 // even lanes: 0-6 of src2 -> dst2, 8-14 of src1 -> dst1
+ gather_interleaved \dst1, \dst2, \src1, \src2, 1 // odd lanes, same pairing as the first call
+ gather_interleaved \dst2, \dst1, \src2, \src1, 1 // odd lanes, same pairing as the second call. All 32 lanes are now filled
+.endm
+
+function gather32_neon // 32 byte-table lookups: v4/v5 = bytes at x3 + each byte of v0/v1. Uses w14-w17 as scratch
+ gather v4.b, v5.b, v0.b, v1.b
+ ret
+endfunc
+
+function gather16_neon // 16 byte-table lookups: v4 = bytes at x3 + each byte of v0. Uses w14-w17 as scratch, overwrites v5
+ gather_interleaved v4.b, v5.b, v0.b, v0.b, 0 // even lanes: 0-6 land in v4, 8-14 land in v5
+ gather_interleaved v4.b, v5.b, v0.b, v0.b, 1 // odd lanes: 1-7 land in v4, 9-15 land in v5
+ ins v4.d[1], v5.d[1] // merge: upper 8 bytes of v5 become the upper 8 bytes of v4
+ ret
+endfunc
+
+const overlap_coeffs_0, align=4
+ .byte 27, 17, 0, 0, 0, 0, 0, 0 // loaded into v27: multiplied with the old grain in the x-overlap code
+ .byte 17, 27, 32, 32, 32, 32, 32, 32 // loaded into v28: multiplied with the current grain, 32 paired with 0 leaves a lane unchanged after the rounding >> 5
+endconst
+
+const overlap_coeffs_1, align=4
+ .byte 23, 0, 0, 0, 0, 0, 0, 0 // loaded into v27 when sx is 1: multiplied with the old grain, only lane 0 is blended
+ .byte 22, 32, 32, 32, 32, 32, 32, 32 // loaded into v28 when sx is 1: multiplied with the current grain, 32 paired with 0 leaves a lane unchanged after the rounding >> 5
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm // note: offy is written before src is read for offx, so offy must not be the same register as src
+
+.macro add_offset dst, offx, offy, src, stride
+ madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, uxtw // grain_lut += offx
+.endm // dst may be the same register as offy or src, both are read by the madd before dst is written
+
+// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type);
+function fgy_32x32_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]! // pushed here, popped by the return sequence inside fgy_loop_neon
+ ldr w11, [x6, #8] // offsets[1][0]
+ ldr w13, [x6, #4] // offsets[0][1]
+ ldr w15, [x6, #12] // offsets[1][1]
+ ldr w6, [x6] // offsets[0][0]
+ ldr w8, [sp, #16] // clip
+ mov x9, #GRAIN_WIDTH // grain_lut stride
+
+ neg w4, w4 // negate so that srshl in the loop shifts right with rounding
+ dup v29.8h, w4 // -scaling_shift
+
+ movrel x16, overlap_coeffs_0 // x16 is only a temporary here, it is overwritten by calc_offset below
+
+ cbz w8, 1f // clip == 0: use the full 0..255 range
+ // clip
+ movi v30.16b, #16 // lower bound applied with umax in the loop
+ movi v31.16b, #235 // upper bound applied with umin in the loop
+ b 2f
+1:
+ // no clip
+ movi v30.16b, #0
+ movi v31.16b, #255
+2:
+
+ ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
+
+ add x5, x5, #9 // grain_lut += 9
+ add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x9 // grain_lut += grain_stride
+
+ calc_offset w11, w12, w11, 0, 0
+ calc_offset w13, w14, w13, 0, 0
+ calc_offset w15, w16, w15, 0, 0
+ calc_offset w6, w10, w6, 0, 0
+
+ add_offset x12, w11, x12, x5, x9
+ add_offset x14, w13, x14, x5, x9
+ add_offset x16, w15, x16, x5, x9
+ add_offset x5, w6, x10, x5, x9
+
+ ldr w11, [sp, #24] // type
+ adr x13, L(fgy_loop_tbl) // jump table located at the end of fgy_loop_neon
+
+ add x4, x12, #32 // grain_lut += BLOCK_SIZE * bx
+ add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+
+ tst w11, #1 // bit 0 of type selects the y-overlap loops (the odd table entries)
+ ldrh w11, [x13, w11, uxtw #1] // 16-bit table entry selected by type
+
+ add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x8, x8, #32 // grain_lut += BLOCK_SIZE * bx
+
+ sub x11, x13, w11, uxtw // entries hold (table - label), so subtracting yields the loop address
+
+ b.eq 1f // flags are still those of the tst above, none of the instructions in between set them
+ // y overlap
+ dup v6.16b, v27.b[0] // first overlap row: v6 = overlap_coeffs_0[0], multiplied with the top grain
+ dup v7.16b, v27.b[1] // first overlap row: v7 = overlap_coeffs_0[1], multiplied with the current grain
+ mov w10, w7 // backup actual h
+ mov w7, #2 // the overlap loop runs for 2 rows first
+1:
+ br x11 // continue in fgy_loop_neon, which also restores x30 and returns to our caller
+endfunc
+
+function fgy_loop_neon // loop bodies for fgy_32x32_8bpc_neon, entered via br with the registers it set up
+.macro fgy ox, oy
+L(loop_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.16b, v1.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x9 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b, v23.16b}, [x6], x9 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x8], x9 // grain_lut top old
+.endif
+ ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut
+
+ bl gather32_neon // v4/v5 = scaling[src pixel] for all 32 pixels (x3 = scaling)
+
+.if \ox
+ smull v20.8h, v20.8b, v27.8b // old grain * first overlap_coeffs row
+ smlal v20.8h, v18.8b, v28.8b // + current grain * second overlap_coeffs row
+.endif
+
+.if \oy
+.if \ox
+ smull v21.8h, v21.8b, v27.8b // same horizontal blend for the top row
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5 // rounded >> 5 with saturation back to bytes
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+
+.if \ox
+ smull v16.8h, v20.8b, v7.8b // first 8 pixels use the horizontally blended grain
+.else
+ smull v16.8h, v18.8b, v7.8b // current grain * v7
+.endif
+ smull2 v17.8h, v18.16b, v7.16b
+ smull v18.8h, v19.8b, v7.8b
+ smull2 v19.8h, v19.16b, v7.16b
+.if \ox
+ smlal v16.8h, v21.8b, v6.8b // + blended top grain * v6
+.else
+ smlal v16.8h, v22.8b, v6.8b // + top grain * v6
+.endif
+ smlal2 v17.8h, v22.16b, v6.16b
+ smlal v18.8h, v23.8b, v6.8b
+ smlal2 v19.8h, v23.16b, v6.16b
+ sqrshrn v22.8b, v16.8h, #5 // rounded >> 5 with saturation, vertically blended grain ends up in v22/v23
+ sqrshrn2 v22.16b, v17.8h, #5
+ sqrshrn v23.8b, v18.8h, #5
+ sqrshrn2 v23.16b, v19.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+ sxtl v18.8h, v23.8b
+ sxtl2 v19.8h, v23.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5 // finish the horizontal blend (rounded >> 5)
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ sxtl v16.8h, v20.8b // first 8 pixels come from the blended grain
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+ uxtl v4.8h, v5.8b
+ uxtl2 v5.8h, v5.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+ mul v18.8h, v18.8h, v4.8h
+ mul v19.8h, v19.8h, v5.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+ srshl v18.8h, v18.8h, v29.8h
+ srshl v19.8h, v19.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v0.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v0.16b
+ uaddw v18.8h, v18.8h, v1.8b
+ uaddw2 v19.8h, v19.8h, v1.16b
+
+ sqxtun v0.8b, v16.8h // saturate to unsigned 8 bit
+ sqxtun2 v0.16b, v17.8h
+ sqxtun v1.8b, v18.8h
+ sqxtun2 v1.16b, v19.8h
+
+ umax v0.16b, v0.16b, v30.16b // clamp to the lower bound chosen by the caller
+ umax v1.16b, v1.16b, v30.16b
+ umin v0.16b, v0.16b, v31.16b // clamp to the upper bound chosen by the caller
+ umin v1.16b, v1.16b, v31.16b
+
+ subs w7, w7, #1 // flags are consumed by b.gt after the store
+.if \oy
+ dup v6.16b, v28.b[0] // weights for the second overlap row
+ dup v7.16b, v28.b[1]
+.endif
+ st1 {v0.16b, v1.16b}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w10, #2 // more rows than the 2 overlap rows?
+ sub w7, w10, #2 // restore actual remaining h
+ b.gt L(loop_\ox\()0) // continue with the loop variant without y overlap
+.endif
+ ldr x30, [sp], #16 // pops the x30 pushed by fgy_32x32_8bpc_neon
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+
+L(fgy_loop_tbl): // indexed by type: bit 0 = oy, bit 1 = ox. Each entry is (table address - loop address)
+ .hword L(fgy_loop_tbl) - L(loop_00)
+ .hword L(fgy_loop_tbl) - L(loop_01)
+ .hword L(fgy_loop_tbl) - L(loop_10)
+ .hword L(fgy_loop_tbl) - L(loop_11)
+endfunc
+
+// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-32]! // 32-byte frame, released at label 9 of the fguv_loop_sx* functions
+ str d8, [sp, #16] // d8 is stored in the frame and reloaded at label 9 of the loop functions
+ ldp x8, x9, [sp, #32] // offsets, h
+ ldp x10, x11, [sp, #48] // uv, is_id
+
+ ldr w13, [x4, #FGD_SCALING_SHIFT]
+ ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ neg w13, w13 // -scaling_shift
+
+ // !csfl
+ add x10, x4, x10, lsl #2 // + 4*uv
+ add x14, x10, #FGD_UV_LUMA_MULT
+ add x15, x10, #FGD_UV_MULT
+ add x10, x10, #FGD_UV_OFFSET
+ ld1 {v8.h}[0], [x14] // uv_luma_mult
+ ld1r {v24.8h}, [x10] // uv_offset
+ ld1 {v8.h}[1], [x15] // uv_mult
+
+ dup v29.8h, w13 // -scaling_shift
+
+ cbz w12, 1f // clip_to_restricted_range == 0: use the full 0..255 range
+ // clip
+ movi v30.16b, #16 // lower bound applied with umax in the loop
+ movi v31.16b, #240 // upper bound applied with umin in the loop
+ cbz w11, 2f // is_id == 0: keep 240
+ // is_id
+ movi v31.16b, #235
+ b 2f
+1:
+ // no clip
+ movi v30.16b, #0
+ movi v31.16b, #255
+2:
+
+ ldr w12, [x8, #8] // offsets[1][0]
+ ldr w14, [x8, #4] // offsets[0][1]
+ ldr w16, [x8, #12] // offsets[1][1]
+ ldr w8, [x8] // offsets[0][0]
+
+ mov x10, #GRAIN_WIDTH // grain_lut stride
+
+ add x5, x5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
+.if \sy
+ add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride
+ add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x10 // grain_lut += grain_stride
+.endif
+
+ calc_offset w12, w13, w12, \sx, \sy
+ calc_offset w14, w15, w14, \sx, \sy
+ calc_offset w16, w17, w16, \sx, \sy
+ calc_offset w8, w11, w8, \sx, \sy
+
+ add_offset x13, w12, x13, x5, x10
+ add_offset x15, w14, x15, x5, x10
+ add_offset x17, w16, x17, x5, x10
+ add_offset x5, w8, x11, x5, x10
+
+ add x4, x13, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+ add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x11, x11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+
+ ldr w13, [sp, #64] // type
+
+ movrel x16, overlap_coeffs_\sx
+ adr x14, L(fguv_loop_sx\sx\()_tbl)
+
+ ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
+ tst w13, #1 // bit 0 of type selects the y-overlap loops (the odd table entries)
+ ldrh w13, [x14, w13, uxtw #1] // 16-bit table entry selected by type (8 entries, bit 2 selects the csfl variants)
+
+ b.eq 1f // flags are still those of the tst above, ldrh does not change them
+ // y overlap
+ sub w12, w9, #(2 >> \sy) // backup remaining h
+ mov w9, #(2 >> \sy)
+
+1:
+ sub x13, x14, w13, uxtw // entries hold (table - label), so subtracting yields the loop address
+
+.if \sy
+ movi v25.16b, #23 // vertical overlap weights for the single overlap row: v25 for the top grain, v26 for the current grain
+ movi v26.16b, #22
+.else
+ movi v25.16b, #27 // vertical overlap weights for the first of two overlap rows: v25 for the top grain, v26 for the current grain
+ movi v26.16b, #17
+.endif
+
+.if \sy
+ add x7, x7, x7 // luma_stride *= 2
+.endif
+
+ br x13 // continue in the fguv_loop_sx* function, which also restores d8/x30 and returns to our caller
+endfunc
+.endm
+
+fguv 420, 1, 1 // layout, sx, sy
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon // loop bodies for the fguv entry point with sx = 0, entered via br with the registers it set up
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.16b, v1.16b}, [x6], x7 // luma
+ ld1 {v6.16b, v7.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b, v23.16b}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v18.16b, v19.16b}, [x5], x10 // grain_lut
+
+.if !\csfl
+ uxtl v2.8h, v0.8b // widen luma to 16 bit
+ uxtl2 v3.8h, v0.16b
+ uxtl v4.8h, v1.8b
+ uxtl2 v5.8h, v1.16b
+ uxtl v0.8h, v6.8b // widen src to 16 bit
+ uxtl2 v1.8h, v6.16b
+ uxtl v16.8h, v7.8b
+ uxtl2 v17.8h, v7.16b
+ mul v2.8h, v2.8h, v8.h[0] // luma * uv_luma_mult
+ mul v3.8h, v3.8h, v8.h[0]
+ mul v4.8h, v4.8h, v8.h[0]
+ mul v5.8h, v5.8h, v8.h[0]
+ mul v0.8h, v0.8h, v8.h[1] // src * uv_mult
+ mul v1.8h, v1.8h, v8.h[1]
+ mul v16.8h, v16.8h, v8.h[1]
+ mul v17.8h, v17.8h, v8.h[1]
+ sqadd v2.8h, v2.8h, v0.8h // saturating sum of the two products
+ sqadd v3.8h, v3.8h, v1.8h
+ sqadd v4.8h, v4.8h, v16.8h
+ sqadd v5.8h, v5.8h, v17.8h
+ sshr v2.8h, v2.8h, #6 // arithmetic >> 6
+ sshr v3.8h, v3.8h, #6
+ sshr v4.8h, v4.8h, #6
+ sshr v5.8h, v5.8h, #6
+ add v2.8h, v2.8h, v24.8h // + uv_offset
+ add v3.8h, v3.8h, v24.8h
+ add v4.8h, v4.8h, v24.8h
+ add v5.8h, v5.8h, v24.8h
+ sqxtun v0.8b, v2.8h // saturate to unsigned bytes, used as the scaling index below
+ sqxtun2 v0.16b, v3.8h
+ sqxtun v1.8b, v4.8h
+ sqxtun2 v1.16b, v5.8h
+.endif
+
+ bl gather32_neon // v4/v5 = scaling[v0/v1 byte] (x3 = scaling), with csfl the index is the luma itself
+
+.if \ox
+ smull v20.8h, v20.8b, v27.8b // old grain * first overlap_coeffs row
+ smlal v20.8h, v18.8b, v28.8b // + current grain * second overlap_coeffs row
+.endif
+
+.if \oy
+.if \ox
+ smull v21.8h, v21.8b, v27.8b // same horizontal blend for the top row
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5 // rounded >> 5 with saturation back to bytes
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+
+.if \ox
+ smull v16.8h, v20.8b, v26.8b // first 8 pixels use the horizontally blended grain
+.else
+ smull v16.8h, v18.8b, v26.8b // current grain * v26
+.endif
+ smull2 v17.8h, v18.16b, v26.16b
+ smull v18.8h, v19.8b, v26.8b
+ smull2 v19.8h, v19.16b, v26.16b
+.if \ox
+ smlal v16.8h, v21.8b, v25.8b // + blended top grain * v25
+.else
+ smlal v16.8h, v22.8b, v25.8b // + top grain * v25
+.endif
+ smlal2 v17.8h, v22.16b, v25.16b
+ smlal v18.8h, v23.8b, v25.8b
+ smlal2 v19.8h, v23.16b, v25.16b
+ sqrshrn v22.8b, v16.8h, #5 // rounded >> 5 with saturation, vertically blended grain ends up in v22/v23
+ sqrshrn2 v22.16b, v17.8h, #5
+ sqrshrn v23.8b, v18.8h, #5
+ sqrshrn2 v23.16b, v19.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+ sxtl v18.8h, v23.8b
+ sxtl2 v19.8h, v23.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5 // finish the horizontal blend (rounded >> 5)
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ sxtl v16.8h, v20.8b // first 8 pixels come from the blended grain
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+ uxtl v4.8h, v5.8b
+ uxtl2 v5.8h, v5.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+ mul v18.8h, v18.8h, v4.8h
+ mul v19.8h, v19.8h, v5.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+ srshl v18.8h, v18.8h, v29.8h
+ srshl v19.8h, v19.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v6.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v6.16b
+ uaddw v18.8h, v18.8h, v7.8b
+ uaddw2 v19.8h, v19.8h, v7.16b
+
+ sqxtun v0.8b, v16.8h // saturate to unsigned 8 bit
+ sqxtun2 v0.16b, v17.8h
+ sqxtun v1.8b, v18.8h
+ sqxtun2 v1.16b, v19.8h
+
+ umax v0.16b, v0.16b, v30.16b // clamp to the lower bound chosen by the caller
+ umax v1.16b, v1.16b, v30.16b
+ umin v0.16b, v0.16b, v31.16b // clamp to the upper bound chosen by the caller
+ umin v1.16b, v1.16b, v31.16b
+
+ subs w9, w9, #1 // flags are consumed by b.gt after the store
+.if \oy
+ dup v25.16b, v28.b[0] // weights for the second overlap row
+ dup v26.16b, v28.b[1]
+.endif
+ st1 {v0.16b, v1.16b}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0 // any rows left after the overlap rows?
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) // continue with the loop variant without y overlap
+.endif
+ b 9f // shared epilogue after the last macro instance
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ ldr d8, [sp, #16] // reload d8 stored by the fguv entry function
+ ldr x30, [sp], #32 // pop the frame pushed by the fguv entry function
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx0_tbl): // indexed by type: bit 0 = oy, bit 1 = ox, bit 2 = csfl. Each entry is (table address - loop address)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
+endfunc
+
+function fguv_loop_sx1_neon // loop bodies for the fguv entry points with sx = 1, entered via br with the registers they set up
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.16b, v1.16b}, [x6], x7 // luma
+ ld1 {v6.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v18.16b}, [x5], x10 // grain_lut
+
+ uaddlp v2.8h, v0.16b // sum of each horizontal luma pair (32 luma -> 16 values)
+ uaddlp v3.8h, v1.16b
+.if \csfl
+ rshrn v0.8b, v2.8h, #1 // rounded average of each pair as bytes, used directly as the scaling index
+ rshrn2 v0.16b, v3.8h, #1
+.else
+ urshr v2.8h, v2.8h, #1 // rounded average of each pair, kept at 16 bit
+ urshr v3.8h, v3.8h, #1
+ uxtl v0.8h, v6.8b // widen src to 16 bit
+ uxtl2 v1.8h, v6.16b
+ mul v2.8h, v2.8h, v8.h[0] // luma * uv_luma_mult
+ mul v3.8h, v3.8h, v8.h[0]
+ mul v0.8h, v0.8h, v8.h[1] // src * uv_mult
+ mul v1.8h, v1.8h, v8.h[1]
+ sqadd v2.8h, v2.8h, v0.8h // saturating sum of the two products
+ sqadd v3.8h, v3.8h, v1.8h
+ sshr v2.8h, v2.8h, #6 // arithmetic >> 6
+ sshr v3.8h, v3.8h, #6
+ add v2.8h, v2.8h, v24.8h // + uv_offset
+ add v3.8h, v3.8h, v24.8h
+ sqxtun v0.8b, v2.8h // saturate to unsigned bytes, used as the scaling index below
+ sqxtun2 v0.16b, v3.8h
+.endif
+
+ bl gather16_neon // v4 = scaling[v0 byte] for 16 pixels (x3 = scaling), v5 is overwritten
+
+.if \ox
+ smull v20.8h, v20.8b, v27.8b // old grain * first overlap_coeffs row
+ smlal v20.8h, v18.8b, v28.8b // + current grain * second overlap_coeffs row
+.endif
+
+.if \oy
+.if \ox
+ smull v21.8h, v21.8b, v27.8b // same horizontal blend for the top row
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5 // rounded >> 5 with saturation back to bytes
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+
+.if \ox
+ smull v16.8h, v20.8b, v26.8b // first 8 pixels use the horizontally blended grain
+.else
+ smull v16.8h, v18.8b, v26.8b // current grain * v26
+.endif
+ smull2 v17.8h, v18.16b, v26.16b
+.if \ox
+ smlal v16.8h, v21.8b, v25.8b // + blended top grain * v25
+.else
+ smlal v16.8h, v22.8b, v25.8b // + top grain * v25
+.endif
+ smlal2 v17.8h, v22.16b, v25.16b
+ sqrshrn v22.8b, v16.8h, #5 // rounded >> 5 with saturation, vertically blended grain ends up in v22
+ sqrshrn2 v22.16b, v17.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5 // finish the horizontal blend (rounded >> 5)
+ sxtl2 v17.8h, v18.16b
+ sxtl v16.8h, v20.8b // first 8 pixels come from the blended grain
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v6.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v6.16b
+
+ sqxtun v0.8b, v16.8h // saturate to unsigned 8 bit
+ sqxtun2 v0.16b, v17.8h
+
+ umax v0.16b, v0.16b, v30.16b // clamp to the lower bound chosen by the caller
+ umin v0.16b, v0.16b, v31.16b // clamp to the upper bound chosen by the caller
+
+.if \oy
+ mov v16.16b, v25.16b // v16 is free again here and serves as the temporary for swapping v25 and v26
+.endif
+ subs w9, w9, #1 // flags are consumed by b.gt after the store
+.if \oy
+ mov v25.16b, v26.16b // swap the two vertical weights for the next overlap row
+ mov v26.16b, v16.16b
+.endif
+ st1 {v0.16b}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0 // any rows left after the overlap rows?
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) // continue with the loop variant without y overlap
+.endif
+
+ b 9f // shared epilogue after the last macro instance
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ ldr d8, [sp, #16] // reload d8 stored by the fguv entry function
+ ldr x30, [sp], #32 // pop the frame pushed by the fguv entry function
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx1_tbl): // indexed by type: bit 0 = oy, bit 1 = ox, bit 2 = csfl. Each entry is (table address - loop address)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/filmgrain16.S b/third_party/dav1d/src/arm/64/filmgrain16.S
new file mode 100644
index 0000000000000..7c4ff6dda9435
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/filmgrain16.S
@@ -0,0 +1,1997 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1 // advance the random seed in w2 by `steps` bits; clobbers w11-w13
+ lsr w11, w2, #3
+ lsr w12, w2, #12
+ lsr w13, w2, #1
+ eor w11, w2, w11 // (r >> 0) ^ (r >> 3)
+ eor w12, w12, w13 // (r >> 12) ^ (r >> 1)
+ eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr w2, w2, #\steps // shift=1: discard the `steps` lowest seed bits right away
+.endif
+ and w11, w11, #((1 << \steps) - 1) // bit (one new feedback bit per step)
+.if \shift
+ orr w2, w2, w11, lsl #(16 - \steps) // *state: new bits land at the top of the 16-bit seed
+.else
+ orr w2, w2, w11, lsl #16 // *state: shift=0 parks the new bits above bit 15; the caller shifts them in later
+.endif
+.endm
+
+.macro read_rand dest, bits, age // dest = `bits`-wide field of the seed register x2; does not modify the seed
+ ubfx \dest, x2, #16 - \bits - \age, #\bits // field starts at bit (16 - bits - age)
+.endm
+
+.macro read_shift_rand dest, bits // read one random value, then consume one seed bit
+ ubfx \dest, x2, #17 - \bits, #\bits // field starts at bit (17 - bits): one higher than read_rand with age 0
+ lsr w2, w2, #1 // shift the seed down by one bit
+.endm
+
+// special calling convention:
+// w2 holds seed
+// x3 holds dav1d_gaussian_sequence
+// clobbers x11-x15
+// returns in v0.8h
+function get_gaussian_neon // loads 8 table entries selected by 11-bit random indices; no rounding shift applied here
+ increment_seed 4 // four new seed bits, enough for lanes 0-3
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1 // table entries are 16 bit wide, hence index * 2
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0
+ increment_seed 4 // next four seed bits, for lanes 4-7 (x15 was read before the update)
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ read_rand x14, 11, 3
+ ld1 {v0.h}[3], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 2
+ ld1 {v0.h}[4], [x14]
+ add x15, x3, x15, lsl #1
+ read_rand x14, 11, 1
+ ld1 {v0.h}[5], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[6], [x14]
+ ld1 {v0.h}[7], [x15]
+ ret
+endfunc
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5 // store five full vectors plus one halfword through x0 (post-incremented)
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b,\r3\().16b}, [x0], #32
+ st1 {\r4\().16b}, [x0], #16
+ st1 {\r5\().h}[0], [x0], #2 // 32 + 32 + 16 + 2 = 82 bytes written in total
+.endm
+
+function get_grain_2_neon // two grain values in v0.h[0-1]; same register convention as get_gaussian_neon (w2 seed, x3 table)
+ increment_seed 2
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ ld1 {v0.h}[1], [x15]
+ srshl v0.4h, v0.4h, v31.4h // rounding shift by v31; callers in this file load v31 with a negated (right) shift amount
+ ret
+endfunc
+
+.macro get_grain_2 dst // call get_grain_2_neon and place the result in `dst`; overwrites x30 via bl
+ bl get_grain_2_neon
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b // copy only needed when dst is not v0 itself
+.endif
+.endm
+
+function get_grain_4_neon // four grain values in v0.4h; same register convention as get_gaussian_neon (w2 seed, x3 table)
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ ld1 {v0.h}[3], [x15]
+ srshl v0.4h, v0.4h, v31.4h // rounding shift by v31; callers in this file load v31 with a negated (right) shift amount
+ ret
+endfunc
+
+.macro get_grain_4 dst // call get_grain_4_neon and place the result in `dst`; overwrites x30 via bl
+ bl get_grain_4_neon
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b // copy only needed when dst is not v0 itself
+.endif
+.endm
+
+// w15 holds the number of entries to produce
+// w14, w16 and w17 hold the previous output entries
+// v0 holds the vector of produced entries
+// v1 holds the input vector of sums from above
+.macro output_lag n // defines output_lag<n>_neon: the serial part of the filter, one output entry per loop iteration
+function output_lag\n\()_neon
+1:
+ read_shift_rand x13, 11 // random table index; consumes one seed bit
+ mov w11, v1.s[0] // next 32-bit sum of the rows above
+ ldrsh w12, [x3, x13, lsl #1] // raw 16-bit noise value from the table at x3
+ ext v0.16b, v0.16b, v0.16b, #2 // rotate v0 by one lane; lane 7 is overwritten below
+.if \n == 1
+ madd w11, w14, w4, w11 // sum (above) + *coeff * prev output
+.elseif \n == 2
+ madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w14, w17, w11 // += *coeff * prev output 2
+ mov w16, w14
+.else
+ madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w16, w20, w11 // += *coeff * prev output 2
+ madd w11, w14, w21, w11 // += *coeff * prev output 3
+ mov w17, w16
+ mov w16, w14
+.endif
+ add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
+ add w12, w12, w10 // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr w14, w14, w7 // >> ar_coeff_shift
+ asr w12, w12, w9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add w14, w14, w12 // filtered sum + scaled noise
+ cmp w14, w5
+ csel w14, w14, w5, le // clamp to the upper limit in w5
+ cmp w14, w6
+ csel w14, w14, w6, ge // clamp to the lower limit in w6
+ subs w15, w15, #1
+ ext v1.16b, v1.16b, v1.16b, #4 // rotate the next sum into v1.s[0]
+ ins v0.h[7], w14 // newest output goes into the top lane
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+function sum_lag1_above_neon // v4/v5 = 32-bit weighted sums of the row above for the 8 entries at x0; clobbers x12, v0, v1, v18
+ sub x12, x0, #1*GRAIN_WIDTH*2 - 16 // one row up, 8 entries to the right of x0
+ ld1 {v18.8h}, [x12] // load top right
+
+ ext v0.16b, v16.16b, v17.16b, #14 // top left, top mid
+ ext v1.16b, v17.16b, v18.16b, #2 // top mid, top right
+
+ smull v4.4s, v17.4h, v28.4h // v27/v28/v29 weight the left/centre/right neighbour respectively
+ smlal v4.4s, v0.4h, v27.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smull2 v5.4s, v17.8h, v28.8h
+ smlal2 v5.4s, v0.8h, v27.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+
+ mov v16.16b, v17.16b // slide the window: v16/v17 become left/mid for the next call
+ mov v17.16b, v18.16b
+
+ ret
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff // body shared by all sum_<type>_<lag>_<edge>_neon functions
+ bl sum_\lag\()_above_neon // v4/v5 = sums over the rows above
+.ifc \type, uv_420
+ add x12, x19, #GRAIN_WIDTH*2 // second source row, one grain row below x19
+ ld1 {v22.8h, v23.8h}, [x19], #32
+ ld1 {v24.8h, v25.8h}, [x12]
+ addp v22.8h, v22.8h, v23.8h // horizontal pair sums of each row
+ addp v23.8h, v24.8h, v25.8h
+ add v22.8h, v22.8h, v23.8h
+ srshr v0.8h, v22.8h, #2 // rounded average of each 2x2 group
+.endif
+.ifc \type, uv_422
+ ld1 {v22.8h, v23.8h}, [x19], #32
+ addp v22.8h, v22.8h, v23.8h
+ srshr v0.8h, v22.8h, #1 // rounded average of each horizontal pair
+.endif
+.ifc \type, uv_444
+ ld1 {v0.8h}, [x19], #16 // used as is, no averaging
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ dup v1.8b, \uv_coeff // widen the single byte coefficient to 16 bit lanes
+ sxtl v1.8h, v1.8b
+ smlal v4.4s, v0.4h, v1.4h
+ smlal2 v5.4s, v0.8h, v1.8h
+.else
+ smlal v4.4s, v0.4h, v30.4h // no uv_coeff argument given: the weight is expected in v30 already
+ smlal2 v5.4s, v0.8h, v30.8h
+.endif
+.endif
+.if \uv_layout && \elems == 8 // from here on several variants are identical, so they jump into one shared copy
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 7
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 1
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start: // shared entry point targeted by the branches above
+.if \elems > 4
+.ifc \edge, left
+ increment_seed 4
+ read_rand x12, 11, 3
+ read_rand x13, 11, 2
+ read_rand x14, 11, 1
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v0.h}[5], [x12] // three plain noise values; they end up in lanes 0-2 after the five rotations below
+ ld1 {v0.h}[6], [x13]
+ ld1 {v0.h}[7], [x14]
+ lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ srshl v0.8h, v0.8h, v31.8h
+ ext v4.16b, v4.16b, v4.16b, #12 // bring the fourth sum into v4.s[0]; only that entry is filtered in this half
+.ifc \lag, lag3
+ smov w17, v0.h[5] // seed the previous-output registers with the unfiltered values
+.endif
+.ifnc \lag, lag1
+ smov w16, v0.h[6]
+.endif
+ smov w14, v0.h[7]
+
+ mov v1.16b, v4.16b
+ mov w15, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ mov v1.16b, v4.16b
+ mov w15, #4 // filter all four entries of the low half
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ mov v1.16b, v5.16b // sums for the high half
+.ifc \edge, right
+ mov w15, #3 // only three filtered entries, followed by one plain noise value
+ bl output_\lag\()_neon
+ read_shift_rand x15, 11
+ add x15, x3, x15, lsl #1
+ ld1 {v1.h}[0], [x15]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #2 // shift v0 down one lane and append the noise value as lane 7
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+.else
+ // elems == 1
+ increment_seed 4, shift=0
+ mov v1.16b, v4.16b
+ mov w15, #1
+ bl output_\lag\()_neon
+ lsr w2, w2, #3 // consume the remaining three of the four seed bits
+
+ read_rand x12, 11, 2
+ read_rand x13, 11, 1
+ read_rand x14, 11, 0
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v1.h}[0], [x12]
+ ld1 {v1.h}[1], [x13]
+ ld1 {v1.h}[2], [x14]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #14 // lane 0 = the filtered entry, lanes 1-3 = the three noise values
+.endif
+ st1 {v0.8h}, [x0], #16 // always stores a full vector of 8 entries
+ ldr x30, [sp], #16 // pops the x30 pushed by the sum_lagN_func wrapper that expands this macro
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=8 // defines sum_<type>_lag1_<edge>_neon
+function sum_\type\()_lag1_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]! // popped again at the end of sum_lag_n_body
+.ifc \edge, left
+ sub x12, x0, #1*GRAIN_WIDTH*2
+ ld1 {v17.8h}, [x12] // load the previous block right above
+.endif
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 7 // last argument: number of filtered entries in the final block of a row
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 7
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 1
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 1
+
+
+function sum_lag2_above_neon // v4/v5 = 32-bit weighted sums over the two rows above x0, byte coefficients taken from v30
+ sub x12, x0, #2*GRAIN_WIDTH*2 - 16 // two rows up, 8 entries to the right
+ sub x13, x0, #1*GRAIN_WIDTH*2 - 16 // one row up, 8 entries to the right
+ ld1 {v18.8h}, [x12] // load top right
+ ld1 {v21.8h}, [x13]
+
+ dup v26.8b, v30.b[0] // row -2 (v16/v17/v18): coefficients 0,1 left of centre, 3,4 right of centre
+ ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid
+ dup v27.8b, v30.b[1]
+ ext v23.16b, v16.16b, v17.16b, #14
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v30.b[3]
+ ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right
+ sxtl v27.8h, v27.8b
+ dup v29.8b, v30.b[4]
+ ext v1.16b, v17.16b, v18.16b, #4
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+
+ smull v4.4s, v22.4h, v26.4h
+ smlal v4.4s, v23.4h, v27.4h
+ smlal v4.4s, v0.4h, v28.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smull2 v5.4s, v22.8h, v26.8h
+ smlal2 v5.4s, v23.8h, v27.8h
+ smlal2 v5.4s, v0.8h, v28.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+
+ dup v26.16b, v30.b[5] // row -1 (v19/v20/v21): coefficients 5,6 left of centre, 8,9 right of centre
+ ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid
+ dup v27.16b, v30.b[6]
+ ext v23.16b, v19.16b, v20.16b, #14
+ sxtl v26.8h, v26.8b
+ dup v28.16b, v30.b[8]
+ ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right
+ sxtl v27.8h, v27.8b
+ dup v29.16b, v30.b[9]
+ ext v1.16b, v20.16b, v21.16b, #4
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+
+ smlal v4.4s, v22.4h, v26.4h
+ smlal v4.4s, v23.4h, v27.4h
+ smlal v4.4s, v0.4h, v28.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smlal2 v5.4s, v22.8h, v26.8h
+ smlal2 v5.4s, v23.8h, v27.8h
+ smlal2 v5.4s, v0.8h, v28.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+
+ dup v26.16b, v30.b[2] // centre column: coefficient 2 for row -2, coefficient 7 for row -1
+ dup v27.16b, v30.b[7]
+ sxtl v26.8h, v26.8b
+ sxtl v27.8h, v27.8b
+
+ smlal v4.4s, v17.4h, v26.4h
+ smlal v4.4s, v20.4h, v27.4h
+ smlal2 v5.4s, v17.8h, v26.8h
+ smlal2 v5.4s, v20.8h, v27.8h
+ mov v16.16b, v17.16b // slide both row windows one block to the right for the next call
+ mov v17.16b, v18.16b
+
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=8 // defines sum_<type>_lag2_<edge>_neon
+function sum_\type\()_lag2_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]! // popped again at the end of sum_lag_n_body
+.ifc \edge, left
+ sub x12, x0, #2*GRAIN_WIDTH*2
+ sub x13, x0, #1*GRAIN_WIDTH*2
+ ld1 {v17.8h}, [x12] // load the previous block right above
+ ld1 {v20.8h}, [x13]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12] // byte 12 of v30 is the weight applied to the x19 input
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 7 // last argument: number of filtered entries in the final block of a row
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 7
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 1
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 1
+
+
+function sum_lag3_above_neon // v4/v5 = 32-bit weighted sums over the three rows above x0; coefficients in v29 and v30; writes v8-v15
+ sub x11, x0, #3*GRAIN_WIDTH*2 - 16 // three rows up, 8 entries to the right
+ sub x12, x0, #2*GRAIN_WIDTH*2 - 16
+ sub x13, x0, #1*GRAIN_WIDTH*2 - 16
+ ld1 {v15.8h}, [x11] // load top right
+ ld1 {v18.8h}, [x12]
+ ld1 {v21.8h}, [x13]
+
+ dup v22.8b, v29.b[0] // row -3 (v13/v14/v15): v29 bytes 0-6, byte 3 being the centre tap
+ ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[1]
+ ext v9.16b, v13.16b, v14.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v29.b[2]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v29.b[3]
+ ext v10.16b, v13.16b, v14.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v29.b[4]
+ ext v11.16b, v14.16b, v15.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v29.b[5]
+ ext v12.16b, v14.16b, v15.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v29.b[6]
+ ext v13.16b, v14.16b, v15.16b, #6 // v13 reused as scratch from here; reloaded from v14 at the end
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smull v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v14.4h, v25.4h // centre tap
+ smull2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v14.8h, v25.8h
+
+ dup v22.8b, v29.b[7] // row -2 (v16/v17/v18): v29 bytes 7-13, byte 10 being the centre tap
+ ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[8]
+ ext v9.16b, v16.16b, v17.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v29.b[9]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v29.b[10]
+ ext v10.16b, v16.16b, v17.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v29.b[11]
+ ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v29.b[12]
+ ext v12.16b, v17.16b, v18.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v29.b[13]
+ ext v13.16b, v17.16b, v18.16b, #6
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smlal v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v17.4h, v25.4h // centre tap
+ smlal2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v17.8h, v25.8h
+
+ dup v22.8b, v29.b[14] // row -1 (v19/v20/v21): v29 bytes 14-15 then v30 bytes 0-4, v30 byte 1 being the centre tap
+ ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[15]
+ ext v9.16b, v19.16b, v20.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v30.b[0]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v30.b[1]
+ ext v10.16b, v19.16b, v20.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v30.b[2]
+ ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v30.b[3]
+ ext v12.16b, v20.16b, v21.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v30.b[4]
+ ext v13.16b, v20.16b, v21.16b, #6
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smlal v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v20.4h, v25.4h // centre tap
+ mov v16.16b, v17.16b // slide the row -2 window one block to the right
+ mov v17.16b, v18.16b
+ smlal2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v20.8h, v25.8h
+
+ mov v13.16b, v14.16b // slide the row -3 window (this also undoes the scratch use of v13)
+ mov v14.16b, v15.16b
+
+ mov v19.16b, v20.16b // slide the row -1 window
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=8 // defines sum_<type>_lag3_<edge>_neon
+function sum_\type\()_lag3_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]! // popped again at the end of sum_lag_n_body
+.ifc \edge, left
+ sub x11, x0, #3*GRAIN_WIDTH*2
+ sub x12, x0, #2*GRAIN_WIDTH*2
+ sub x13, x0, #1*GRAIN_WIDTH*2
+ ld1 {v14.8h}, [x11] // load the previous block right above
+ ld1 {v17.8h}, [x12]
+ ld1 {v20.8h}, [x13]
+.endif
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8] // byte 8 of v30 is the weight applied to the x19 input
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 7 // last argument: number of filtered entries in the final block of a row
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 7
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 1
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 1
+
+function generate_grain_rows_neon // write w1 rows of 82 unfiltered grain entries at x0; w2 = seed, x3 = table, v31 = shift
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ mov w16, #80 // entries produced by the vector loop, 8 per iteration
+2:
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h // rounding shift by the amount held in v31
+ subs w16, w16, #8
+ st1 {v0.8h}, [x0], #16
+ b.gt 2b
+ get_grain_2 v0 // the last two entries of the row
+ subs w1, w1, #1
+ st1 {v0.s}[0], [x0], #4 // one 32-bit lane = two 16-bit entries
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function generate_grain_rows_44_neon // write w1 rows of 44 unfiltered grain entries at x0, rows GRAIN_WIDTH entries apart
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ mov w16, #40 // entries produced by the vector loop, 8 per iteration
+2:
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h // rounding shift by the amount held in v31
+ subs w16, w16, #8
+ st1 {v0.8h}, [x0], #16
+ b.gt 2b
+ get_grain_4 v0 // the last four entries of the row
+ subs w1, w1, #1
+ st1 {v0.4h}, [x0] // no post-increment here; the add below moves to the next row
+ add x0, x0, #GRAIN_WIDTH*2-80 // 80 bytes were already advanced by the loop
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function gen_grain_uv_444_lag0_neon // 8 output entries = fresh grain + scaled input from x19; v1 = lane mask set by the caller
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v4.8h}, [x19], #16 // 8 input entries, used without averaging
+gen_grain_uv_lag0_8_start: // entry for the 420/422 8-entry variants: v4 prepared, x30 already pushed
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h
+gen_grain_uv_lag0_8_add: // entry for the 4-entry variants: v0 and v4 prepared, x30 already pushed
+ and v4.16b, v4.16b, v1.16b // zero the input in lanes the mask excludes
+ smull v2.4s, v4.4h, v27.4h // input * coefficient (v27)
+ smull2 v3.4s, v4.8h, v27.8h
+ srshl v2.4s, v2.4s, v28.4s // rounding shift by v28 (callers store a negated shift there)
+ srshl v3.4s, v3.4s, v28.4s
+ sqxtn v2.4h, v2.4s
+ sqxtn2 v2.8h, v3.4s
+ sqadd v2.8h, v2.8h, v0.8h // add the fresh grain
+ smin v2.8h, v2.8h, v25.8h // clamp to the upper limit in v25
+ smax v2.8h, v2.8h, v26.8h // clamp to the lower limit in v26
+ st1 {v2.8h}, [x0], #16
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function gen_grain_uv_420_lag0_8_neon // average 2x2 groups of the input at x19 into v4, then continue in the shared 444 code
+ AARCH64_SIGN_LINK_REGISTER
+ add x12, x19, #GRAIN_WIDTH*2 // second input row
+ str x30, [sp, #-16]!
+ ld1 {v16.8h, v17.8h}, [x19], #32 // 16 entries from each of the two rows
+ ld1 {v18.8h, v19.8h}, [x12]
+ addp v16.8h, v16.8h, v17.8h // horizontal pair sums
+ addp v17.8h, v18.8h, v19.8h
+ add v16.8h, v16.8h, v17.8h
+ srshr v4.8h, v16.8h, #2 // rounded average of four entries
+ b gen_grain_uv_lag0_8_start // tail jump; that code pops x30 and returns
+endfunc
+
+function gen_grain_uv_422_lag0_8_neon // average horizontal pairs of the input at x19 into v4, then continue in the shared 444 code
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.8h, v17.8h}, [x19], #32 // 16 entries from a single row
+ addp v16.8h, v16.8h, v17.8h
+ srshr v4.8h, v16.8h, #1 // rounded average of two entries
+ b gen_grain_uv_lag0_8_start // tail jump; that code pops x30 and returns
+endfunc
+
+function gen_grain_uv_420_lag0_4_neon // 4-entry variant used for the last block of a row
+ add x12, x19, #GRAIN_WIDTH*2 // second input row
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.4h, v17.4h}, [x19] // only 8 entries per row are read here
+ ld1 {v18.4h, v19.4h}, [x12]
+ add x19, x19, #32 // but x19 advances by 32 bytes like the 8-entry variants do
+ addp v16.4h, v16.4h, v17.4h
+ addp v17.4h, v18.4h, v19.4h
+ add v16.4h, v16.4h, v17.4h
+ srshr v4.4h, v16.4h, #2 // rounded average of four entries
+ get_grain_4 v0 // four fresh grain values instead of eight
+ b gen_grain_uv_lag0_8_add // tail jump past the grain generation; that code pops x30 and returns
+endfunc
+
+function gen_grain_uv_422_lag0_4_neon // 4-entry variant used for the last block of a row
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.4h, v17.4h}, [x19] // only 8 entries are read here
+ add x19, x19, #32 // but x19 advances by 32 bytes like the 8-entry variants do
+ addp v16.4h, v16.4h, v17.4h
+ srshr v4.4h, v16.4h, #1 // rounded average of two entries
+ get_grain_4 v0 // four fresh grain values instead of eight
+ b gen_grain_uv_lag0_8_add // tail jump past the grain generation; that code pops x30 and returns
+endfunc
+
+.macro gen_grain_82 type // defines generate_grain_<type>_16bpc_neon for full-width (82 entry) rows: type = y or uv_444
+function generate_grain_\type\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]! // 96-byte frame; the d8-d15 and x20/x21 slots are only filled on the lag3 path
+
+.ifc \type, uv_444
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH*2 // x19 = incoming x1 buffer advanced by three rows
+ mov x1, x2 // from here on x1 is the base for the FGD_* field loads
+ mul w13, w13, w14 // w13 = incoming w3 * 28
+ clz w15, w4
+.else
+ clz w15, w2
+.endif
+ movrel x3, X(gaussian_sequence)
+ sub w15, w15, #24 // -bitdepth_min_8
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add x4, x1, #FGD_AR_COEFFS_Y
+.else
+ add x4, x1, #FGD_AR_COEFFS_UV
+.endif
+ add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1] // table entry for this lag: distance from the table back to the handler
+ dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw // x16 = address of the handler
+ neg v31.8h, v31.8h // negated, so that srshl by v31 is a rounding right shift
+
+.ifc \type, uv_444
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne // 0x49d8 if w13 is nonzero, 0xb524 otherwise
+.endif
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ neg w15, w15 // bitdepth_min_8
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift - 1)
+ mov w5, #128
+ lsl w5, w5, w15 // 128 << bitdepth_min_8
+ neg w6, w5 // -(128 << bitdepth_min_8)
+ sub w5, w5, #1 // (128 << bitdepth_min_8) - 1
+
+.ifc \type, uv_444
+ eor w2, w2, w11 // mix the constant selected above into the seed
+.endif
+
+ br x16 // dispatch to one of the lag handlers below
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, y
+ mov w1, #GRAIN_HEIGHT // no filtering at all: every row is plain grain
+ bl generate_grain_rows_neon
+.else
+ dup v28.4s, w7
+ ld1r {v27.8b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ dup v25.8h, w5 // upper clamp limit
+ dup v26.8h, w6 // lower clamp limit
+ ext v29.16b, v0.16b, v1.16b, #10 // mask for the first block: lanes 0-2 cleared, lanes 3-7 set
+ ext v30.16b, v1.16b, v0.16b, #2 // mask for the last block: lanes 0-6 set, lane 7 cleared
+ neg v28.4s, v28.4s // -ar_coeff_shift, for a rounding right shift via srshl
+ sxtl v27.8h, v27.8b
+
+ mov w1, #3 // the first three rows are plain grain
+ bl generate_grain_rows_neon
+ mov w1, #GRAIN_HEIGHT-3
+1:
+ mov v1.16b, v29.16b
+ bl gen_grain_uv_444_lag0_neon // 8
+ movi v1.16b, #255 // middle blocks use every lane
+ bl gen_grain_uv_444_lag0_neon // 16
+ bl gen_grain_uv_444_lag0_neon // 24
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 40
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 56
+ bl gen_grain_uv_444_lag0_neon // 64
+ bl gen_grain_uv_444_lag0_neon // 72
+ mov v1.16b, v30.16b
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 v16 // entries 80 and 81 are plain grain
+ subs w1, w1, #1
+ add x19, x19, #4 // skip the matching two input entries
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+.endif
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0]
+ ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1]
+ ld1r {v29.8b}, [x4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb w4, [x4, #1] // ar_coeffs_y[3]
+.else
+ add x4, x4, #2 // x4 now points at coefficient 4
+.endif
+
+ mov w1, #3
+.ifc \type, uv_444
+ ld1r {v30.8b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon // the first three rows are plain grain
+ sxtl v27.8h, v27.8b // widen the byte coefficients to 16 bit
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+.ifc \type, uv_444
+ sxtl v30.8h, v30.8b
+.endif
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_mid_neon // 48
+ bl sum_\type\()_lag1_mid_neon // 56
+ bl sum_\type\()_lag1_mid_neon // 64
+ bl sum_\type\()_lag1_mid_neon // 72
+ bl sum_\type\()_lag1_right_neon // 80
+ get_grain_2 v16 // entries 80 and 81 are plain grain
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4 // skip the matching two input entries
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10] // the two same-row coefficients, used by output_lag2_neon
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_neon // the first three rows are plain grain
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_mid_neon // 48
+ bl sum_\type\()_lag2_mid_neon // 56
+ bl sum_\type\()_lag2_mid_neon // 64
+ bl sum_\type\()_lag2_mid_neon // 72
+ bl sum_\type\()_lag2_right_neon // 80
+ get_grain_2 v16 // entries 80 and 81 are plain grain
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4 // skip the matching two input entries
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+ stp d8, d9, [sp, #16] // sum_lag3_above_neon writes v8-v15, so save d8-d15 first
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80] // x20/x21 hold two of the same-row coefficients below
+
+ smov w4, v30.b[5] // the three same-row coefficients, used by output_lag3_neon
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_neon // the first three rows are plain grain
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_mid_neon // 48
+ bl sum_\type\()_lag3_mid_neon // 56
+ bl sum_\type\()_lag3_mid_neon // 64
+ bl sum_\type\()_lag3_mid_neon // 72
+ bl sum_\type\()_lag3_right_neon // 80
+ get_grain_2 v16 // entries 80 and 81 are plain grain
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4 // skip the matching two input entries
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl): // 16-bit backward distances from this label, indexed by the lag value
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type // dst = number of rows left after the first three: subsampled height for uv_420, full height otherwise
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type // move the input pointer to its next row after 6*32 bytes were consumed from the current one
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) // uv_420 uses two input rows per output row
+.else
+ sub \reg, \reg, #6*32-GRAIN_WIDTH*2 // one row forward; written as a sub because the net step is negative
+.endif
+.endm
+
+.macro gen_grain_44 type // defines generate_grain_<type>_16bpc_neon for 44-entry rows: type = uv_420 or uv_422
+function generate_grain_\type\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]! // 96-byte frame; the d8-d15 and x20/x21 slots are only filled on the lag3 path
+
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #(3*GRAIN_WIDTH-3)*2 // x19 = incoming x1 buffer advanced by three rows minus three entries
+ mov x1, x2 // from here on x1 is the base for the FGD_* field loads
+ mul w13, w13, w14 // w13 = incoming w3 * 28
+ clz w15, w4
+
+ movrel x3, X(gaussian_sequence)
+ sub w15, w15, #24 // -bitdepth_min_8
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+ add x4, x1, #FGD_AR_COEFFS_UV
+ add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1] // table entry for this lag: distance from the table back to the handler
+ dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw // x16 = address of the handler
+ neg v31.8h, v31.8h // negated, so that srshl by v31 is a rounding right shift
+
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne // 0x49d8 if w13 is nonzero, 0xb524 otherwise
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ neg w15, w15 // bitdepth_min_8
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift - 1)
+ mov w5, #128
+ lsl w5, w5, w15 // 128 << bitdepth_min_8
+ neg w6, w5 // -(128 << bitdepth_min_8)
+ sub w5, w5, #1 // (128 << bitdepth_min_8) - 1
+
+ eor w2, w2, w11 // mix the constant selected above into the seed
+
+ br x16 // dispatch to one of the lag handlers below
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.4s, w7
+ ld1r {v27.8b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ dup v25.8h, w5 // upper clamp limit
+ dup v26.8h, w6 // lower clamp limit
+ ext v29.16b, v0.16b, v1.16b, #10 // mask for the first block: lanes 0-2 cleared, lanes 3-7 set
+ ext v30.16b, v1.16b, v0.16b, #14 // mask for the last block: only lane 0 set
+ neg v28.4s, v28.4s // -ar_coeff_shift, for a rounding right shift via srshl
+ sxtl v27.8h, v27.8b
+
+ mov w1, #3 // the first three rows are plain grain
+ bl generate_grain_rows_44_neon
+ set_height w1, \type
+1:
+ mov v1.16b, v29.16b
+ bl gen_grain_\type\()_lag0_8_neon // 8
+ movi v1.16b, #255 // middle blocks use every lane
+ bl gen_grain_\type\()_lag0_8_neon // 16
+ bl gen_grain_\type\()_lag0_8_neon // 24
+ bl gen_grain_\type\()_lag0_8_neon // 32
+ bl gen_grain_\type\()_lag0_8_neon // 40
+ mov v1.16b, v30.16b
+ bl gen_grain_\type\()_lag0_4_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16 // six 16-byte stores were made; step to the start of the next row
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.8b}, [x4], #1 // ar_coeffs_uv[0]
+ ld1r {v28.8b}, [x4], #1 // ar_coeffs_uv[1]
+ ld1r {v29.8b}, [x4] // ar_coeffs_uv[2]
+ add x4, x4, #2 // x4 now points at coefficient 4
+
+ mov w1, #3
+ ld1r {v30.8b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon // the first three rows are plain grain
+
+ sxtl v27.8h, v27.8b // widen the byte coefficients to 16 bit
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+ sxtl v30.8h, v30.8b
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16 // six 16-byte stores were made; step to the start of the next row
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10] // the two same-row coefficients, used by output_lag2_neon
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon // the first three rows are plain grain
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16 // six 16-byte stores were made; step to the start of the next row
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ldr q29, [x4] // ar_coeffs_uv[0-15]
+ ldr q30, [x4, #16] // ar_coeffs_uv[16-24]
+ stp d8, d9, [sp, #16] // sum_lag3_above_neon writes v8-v15, so save d8-d15 first
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80] // x20/x21 hold two of the same-row coefficients below
+
+ smov w4, v30.b[5] // the three same-row coefficients, used by output_lag3_neon
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon // the first three rows are plain grain
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16 // six 16-byte stores were made; step to the start of the next row
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl): // 16-bit backward distances from this label, indexed by the lag value
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, off // byte table lookups at x3: even lanes src1 -> dst1, odd lanes src2 -> dst2
+ umov w14, \src1[0] // clobbers x14-x17; loads are interleaved with the index extraction
+ umov w15, \src2[1]
+ umov w16, \src1[2]
+ add x14, x14, x3 // address = table base (x3) + index
+ umov w17, \src2[3]
+ add x15, x15, x3
+ ld1 {\dst1}[0+\off], [x14] // `off` selects which group of 8 destination lanes is filled
+ umov w14, \src1[4]
+ add x16, x16, x3
+ ld1 {\dst2}[1+\off], [x15]
+ umov w15, \src2[5]
+ add x17, x17, x3
+ ld1 {\dst1}[2+\off], [x16]
+ umov w16, \src1[6]
+ add x14, x14, x3
+ ld1 {\dst2}[3+\off], [x17]
+ umov w17, \src2[7]
+ add x15, x15, x3
+ ld1 {\dst1}[4+\off], [x14]
+ add x16, x16, x3
+ ld1 {\dst2}[5+\off], [x15]
+ add x17, x17, x3
+ ld1 {\dst1}[6+\off], [x16]
+ ld1 {\dst2}[7+\off], [x17]
+.endm
+
+.macro gather dst1, dst2, src1, src2, src3, src4 // 32 lookups: dst1 <- table[src1, src2], dst2 <- table[src3, src4]
+ gather_interleaved \dst1, \dst2, \src1, \src3, 0 // even lanes of src1 -> dst1, odd lanes of src3 -> dst2
+ gather_interleaved \dst2, \dst1, \src3, \src1, 0 // the remaining lanes of the low halves
+ gather_interleaved \dst1, \dst2, \src2, \src4, 8 // same pattern for the high 8 bytes
+ gather_interleaved \dst2, \dst1, \src4, \src2, 8
+.endm
+
+function gather32_neon // v6/v7 = 32 bytes looked up at x3 using the halfword indices in v0-v3; clobbers x14-x17
+ gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
+ ret
+endfunc
+
+function gather16_neon // v6 = 16 bytes looked up at x3 using the halfword indices in v0-v1; clobbers x14-x17 and v7
+ gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
+ gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
+ ins v6.d[1], v7.d[0] // append the 8 results for v1 after the 8 results for v0
+ ret
+endfunc
+
+const overlap_coeffs_0, align=4 // loaded as {v27.4h, v28.4h} by fgy_32x32_16bpc_neon
+ .short 27, 17, 0, 0 // first vector; lanes 0 and 1 are broadcast into v8/v9 for the y overlap
+ .short 17, 27, 32, 32 // second vector
+endconst
+
+const overlap_coeffs_1, align=4 // NOTE(review): not referenced in the visible part of the file; presumably the counterpart of overlap_coeffs_0 for subsampled planes - confirm
+ .short 23, 0, 0, 0
+ .short 22, 32, 32, 32
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy // split a random value into x/y offsets; each is doubled when its sx/sy flag is 0
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride // dst = src + stride * offy + 2 * offx
+ madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, uxtw #1 // grain_lut += offx
+.endm
+
+// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+function fgy_32x32_16bpc_neon, export=1 // setup only: prepares constants and pointers, then jumps to a loop selected by `type`
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-80]! // 80-byte frame: x30 at offset 0, d8-d14 from offset 16
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ str d14, [sp, #64]
+ eor w4, w4, #15 // 15 - scaling_shift
+ ldr w11, [x6, #8] // offsets[1][0]
+ ldr w13, [x6, #4] // offsets[0][1]
+ ldr w15, [x6, #12] // offsets[1][1]
+ ldr w10, [sp, #96] // bitdepth_max
+ ldr w6, [x6] // offsets[0][0]
+ dup v26.8h, w10 // bitdepth_max
+ clz w10, w10
+ ldr w8, [sp, #80] // clip
+ sub w10, w10, #24 // -bitdepth_min_8
+ mov x9, #GRAIN_WIDTH*2 // grain_lut stride
+ neg w10, w10 // bitdepth_min_8
+
+ dup v29.8h, w4 // 15 - scaling_shift
+ dup v27.8h, w10 // bitdepth_min_8
+
+ movrel x16, overlap_coeffs_0
+
+ cbz w8, 1f
+ // clip
+ movi v30.8h, #16
+ movi v31.8h, #235
+ sshl v30.8h, v30.8h, v27.8h // lower limit: 16 << bitdepth_min_8
+ sshl v31.8h, v31.8h, v27.8h // upper limit: 235 << bitdepth_min_8
+ b 2f
+1:
+ // no clip
+ movi v30.8h, #0
+ mov v31.16b, v26.16b // bitdepth_max
+2:
+
+ ushr v26.8h, v26.8h, #1 // grain_max
+ not v25.16b, v26.16b // grain_min
+
+ ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
+
+ add x5, x5, #18 // grain_lut += 9
+ add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x9 // grain_lut += grain_stride
+
+ calc_offset w11, w12, w11, 0, 0 // offx in the first register, offy in the second, for each of the four offsets
+ calc_offset w13, w14, w13, 0, 0
+ calc_offset w15, w16, w15, 0, 0
+ calc_offset w6, w10, w6, 0, 0
+
+ add_offset x12, w11, x12, x5, x9 // pointer for offsets[1][0]
+ add_offset x14, w13, x14, x5, x9 // pointer for offsets[0][1]
+ add_offset x16, w15, x16, x5, x9 // pointer for offsets[1][1]
+ add_offset x5, w6, x10, x5, x9 // pointer for offsets[0][0]; done last because it overwrites the base in x5
+
+ ldr w11, [sp, #88] // type
+ adr x13, L(fgy_loop_tbl)
+
+ add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx
+ add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+
+ tst w11, #1 // bit 0 of type selects the y overlap; the flags survive until the b.eq below
+ ldrh w11, [x13, w11, uxtw #1] // table entry for this type: distance from the table back to the loop
+
+ add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx
+
+ sub x11, x13, w11, uxtw // x11 = address of the selected loop
+
+ b.eq 1f
+ // y overlap
+ dup v8.8h, v27.h[0]
+ dup v9.8h, v27.h[1]
+ mov w10, w7 // backup actual h
+ mov w7, #2 // run two rows first with overlap enabled
+1:
+ br x11 // the frame set up above is still live; the target is outside this function
+endfunc
+
+function fgy_loop_neon // Row loops entered with br from fgy_32x32_16bpc_neon, one expansion per (ox, oy) combination, plus the jump table.
+.macro fgy ox, oy // ox, oy: nonzero enables the x overlap and y overlap blending code respectively
+L(loop_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src
+.if \ox
+ ld1 {v20.4h}, [x4], x9 // grain_lut old
+.endif
+.if \oy
+ ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v14.4h}, [x8], x9 // grain_lut top old
+.endif
+ mvni v4.8h, #0xf0, lsl #8 // 0x0fff
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v4.16b
+ and v1.16b, v1.16b, v4.16b
+ and v2.16b, v2.16b, v4.16b
+ and v3.16b, v3.16b, v4.16b
+ bl gather32_neon // v6, v7 = bytes looked up with the masked src values; x30 was already saved by the entry function
+
+.if \ox
+ smull v20.4s, v20.4h, v27.4h // x overlap: old * first coefficient row
+ smlal v20.4s, v16.4h, v28.4h // + current * second coefficient row
+.endif
+
+.if \oy
+.if \ox
+ smull v14.4s, v14.4h, v27.4h // same x blend for the top row: top old * first coefficient row
+ smlal v14.4s, v21.4h, v28.4h // + top * second coefficient row
+ sqrshrn v20.4h, v20.4s, #5 // rounding shift right by 5 with saturating narrow
+ sqrshrn v14.4h, v14.4s, #5
+ smin v20.4h, v20.4h, v26.4h // clamp to [grain_min, grain_max]
+ smin v14.4h, v14.4h, v26.4h
+ smax v20.4h, v20.4h, v25.4h
+ smax v14.4h, v14.4h, v25.4h
+.endif
+
+.if \ox
+ smull v10.4s, v20.4h, v9.4h // y blend, first 4 lanes: use the x-blended current grain
+.else
+ smull v10.4s, v16.4h, v9.4h // y blend: current grain * v9
+.endif
+ smull2 v11.4s, v16.8h, v9.8h
+ smull v12.4s, v17.4h, v9.4h
+ smull2 v13.4s, v17.8h, v9.8h
+ smull v16.4s, v18.4h, v9.4h
+ smull2 v17.4s, v18.8h, v9.8h
+ smull v18.4s, v19.4h, v9.4h
+ smull2 v19.4s, v19.8h, v9.8h
+.if \ox
+ smlal v10.4s, v14.4h, v8.4h // first 4 lanes: add the x-blended top grain * v8
+.else
+ smlal v10.4s, v21.4h, v8.4h // add top grain * v8
+.endif
+ smlal2 v11.4s, v21.8h, v8.8h
+ smlal v12.4s, v22.4h, v8.4h
+ smlal2 v13.4s, v22.8h, v8.8h
+ smlal v16.4s, v23.4h, v8.4h
+ smlal2 v17.4s, v23.8h, v8.8h
+ smlal v18.4s, v24.4h, v8.4h
+ smlal2 v19.4s, v24.8h, v8.8h
+ sqrshrn v10.4h, v10.4s, #5 // rounding shift right by 5, narrowing back to 16 bit
+ sqrshrn2 v10.8h, v11.4s, #5
+ sqrshrn v11.4h, v12.4s, #5
+ sqrshrn2 v11.8h, v13.4s, #5
+ sqrshrn v12.4h, v16.4s, #5
+ sqrshrn2 v12.8h, v17.4s, #5
+ sqrshrn v13.4h, v18.4s, #5
+ sqrshrn2 v13.8h, v19.4s, #5
+ smin v16.8h, v10.8h, v26.8h // clamp to [grain_min, grain_max]; the blended grain ends up in v16-v19
+ smin v17.8h, v11.8h, v26.8h
+ smin v18.8h, v12.8h, v26.8h
+ smin v19.8h, v13.8h, v26.8h
+ smax v16.8h, v16.8h, v25.8h
+ smax v17.8h, v17.8h, v25.8h
+ smax v18.8h, v18.8h, v25.8h
+ smax v19.8h, v19.8h, v25.8h
+.endif
+
+ uxtl v4.8h, v6.8b // scaling
+.if \ox && !\oy
+ sqrshrn v20.4h, v20.4s, #5 // x-only overlap: finish the blend, interleaved with the scaling widening
+.endif
+ uxtl2 v5.8h, v6.16b
+.if \ox && !\oy
+ smin v20.4h, v20.4h, v26.4h
+.endif
+ uxtl v6.8h, v7.8b
+.if \ox && !\oy
+ smax v20.4h, v20.4h, v25.4h
+.endif
+ uxtl2 v7.8h, v7.16b
+.if \ox && !\oy
+ ins v16.d[0], v20.d[0] // replace the first 4 grain values with the blended ones
+.endif
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+ ushl v6.8h, v6.8h, v29.8h
+ ushl v7.8h, v7.8h, v29.8h
+
+ sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15)
+ sqrdmulh v21.8h, v17.8h, v5.8h
+ sqrdmulh v22.8h, v18.8h, v6.8h
+ sqrdmulh v23.8h, v19.8h, v7.8h
+
+ usqadd v0.8h, v20.8h // *src + noise
+ usqadd v1.8h, v21.8h
+ usqadd v2.8h, v22.8h
+ usqadd v3.8h, v23.8h
+
+ umax v0.8h, v0.8h, v30.8h // clamp to the [v30, v31] range chosen by the entry function
+ umax v1.8h, v1.8h, v30.8h
+ umax v2.8h, v2.8h, v30.8h
+ umax v3.8h, v3.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+
+ subs w7, w7, #1 // row counter; the flags are consumed by b.gt below
+.if \oy
+ dup v8.8h, v28.h[0] // weights for the next overlap row come from the second coefficient row
+ dup v9.8h, v28.h[1]
+.endif
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w10, #2 // is the real height larger than the 2 overlap rows just done?
+ sub w7, w10, #2 // restore actual remaining h
+ b.gt L(loop_\ox\()0) // continue in the variant without y overlap
+.endif
+ ldr d14, [sp, #64] // epilogue: undo the frame pushed by fgy_32x32_16bpc_neon
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+
+L(fgy_loop_tbl):
+ .hword L(fgy_loop_tbl) - L(loop_00) // indexed by type: bit 0 = oy, bit 1 = ox; each entry is table address minus label
+ .hword L(fgy_loop_tbl) - L(loop_01)
+ .hword L(fgy_loop_tbl) - L(loop_10)
+ .hword L(fgy_loop_tbl) - L(loop_11)
+endfunc
+
+// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+.macro fguv layout, sx, sy // Emits the exported entry point for one layout; sx, sy are 1 when that axis uses the subsampled code paths.
+function fguv_32x32_\layout\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-80]! // 80-byte frame for x30 and d8-d15; released at label 9 in the loop functions
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+
+ ldp x8, x9, [sp, #80] // offsets, h
+ ldp x10, x11, [sp, #96] // uv, is_id
+ ldr w16, [sp, #120] // bitdepth_max
+
+ ldr w13, [x4, #FGD_SCALING_SHIFT]
+ ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ dup v23.8h, w16 // bitdepth_max
+ clz w16, w16
+ eor w13, w13, #15 // 15 - scaling_shift
+ sub w16, w16, #24 // -bitdepth_min_8
+
+ // !csfl: v8, v9 and v24 loaded below are only read by the csfl == 0 loop variants
+ add x10, x4, x10, lsl #2 // + 4*uv
+ add x14, x10, #FGD_UV_LUMA_MULT
+ add x15, x10, #FGD_UV_MULT
+ add x10, x10, #FGD_UV_OFFSET
+ neg w16, w16 // bitdepth_min_8
+ ld1r {v8.8h}, [x14] // uv_luma_mult
+ ld1r {v24.8h}, [x10] // uv_offset
+ ld1r {v9.8h}, [x15] // uv_mult
+
+ dup v29.8h, w13 // 15 - scaling_shift
+ dup v27.8h, w16 // bitdepth_min_8
+
+ cbz w12, 1f
+ // clip: output range is [16, 240] << bitdepth_min_8
+ movi v30.8h, #16
+ movi v31.8h, #240
+ sshl v30.8h, v30.8h, v27.8h
+ sshl v31.8h, v31.8h, v27.8h
+ cbz w11, 2f
+ // is_id: upper bound becomes 235 << bitdepth_min_8
+ movi v31.8h, #235
+ sshl v31.8h, v31.8h, v27.8h
+ b 2f
+1:
+ // no clip: output range is [0, bitdepth_max]
+ movi v30.8h, #0
+ mov v31.16b, v23.16b // bitdepth_max
+2:
+
+ ushr v15.8h, v23.8h, #1 // grain_max
+ sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8
+ not v14.16b, v15.16b // grain_min
+
+ ldr w12, [x8, #8] // offsets[1][0]
+ ldr w14, [x8, #4] // offsets[0][1]
+ ldr w16, [x8, #12] // offsets[1][1]
+ ldr w8, [x8] // offsets[0][0]
+
+ mov x10, #GRAIN_WIDTH*2 // grain_lut stride
+
+ add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
+.if \sy
+ add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride
+ add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x10 // grain_lut += grain_stride
+.endif
+
+ calc_offset w12, w13, w12, \sx, \sy // offsets[1][0] -> offx in w12, offy in w13
+ calc_offset w14, w15, w14, \sx, \sy // offsets[0][1] -> offx in w14, offy in w15
+ calc_offset w16, w17, w16, \sx, \sy // offsets[1][1] -> offx in w16, offy in w17
+ calc_offset w8, w11, w8, \sx, \sy // offsets[0][0] -> offx in w8, offy in w11
+
+ add_offset x13, w12, x13, x5, x10 // all four pointers are computed from the same x5 base
+ add_offset x15, w14, x15, x5, x10
+ add_offset x17, w16, x17, x5, x10
+ add_offset x5, w8, x11, x5, x10 // x5 itself is updated last: grain_lut pointer for the current block
+
+ add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+ add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+
+ ldr w13, [sp, #112] // type
+
+ movrel x16, overlap_coeffs_\sx
+ adr x14, L(fguv_loop_sx\sx\()_tbl)
+
+ ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs (v27 held bitdepth_min_8 until here and is reused)
+ tst w13, #1 // bit 0 of type selects y overlap; the flags are consumed by b.eq below
+ ldrh w13, [x14, w13, uxtw #1] // 16-bit table entry for this type
+
+ b.eq 1f
+ // y overlap
+ sub w12, w9, #(2 >> \sy) // backup remaining h
+ mov w9, #(2 >> \sy) // number of overlap rows: 2, or 1 when sy is set
+
+1:
+ sub x13, x14, w13, uxtw // entries store (table - label), so label = table - entry
+
+.if \sy
+ movi v25.8h, #23 // y overlap weight multiplied with the top grain in the loop
+ movi v26.8h, #22 // y overlap weight multiplied with the current grain in the loop
+.else
+ movi v25.8h, #27 // first overlap row: weight multiplied with the top grain
+ movi v26.8h, #17 // first overlap row: weight multiplied with the current grain
+.endif
+
+.if \sy
+ add x7, x7, x7 // luma_stride *= 2
+.endif
+
+ br x13 // jump into fguv_loop_sx0_neon or fguv_loop_sx1_neon; that code restores the frame and returns
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon // Row loops for the fguv entry point with sx == 0 (32 pixels per row), one expansion per (csfl, ox, oy), plus the jump table.
+.macro fguv_loop_sx0 csfl, ox, oy // csfl: nonzero uses luma directly as the lookup index; ox, oy: nonzero enables x or y overlap blending
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+ ld1 {v4.4h}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v5.4h}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut
+
+.if \ox
+ smull v4.4s, v4.4h, v27.4h // x overlap: old * first coefficient row
+ smlal v4.4s, v16.4h, v28.4h // + current * second coefficient row
+.endif
+
+.if \oy
+.if \ox
+ smull v5.4s, v5.4h, v27.4h // same x blend for the top row: top old * first coefficient row
+ smlal v5.4s, v0.4h, v28.4h // + top * second coefficient row
+ sqrshrn v4.4h, v4.4s, #5 // rounding shift right by 5 with saturating narrow
+ sqrshrn v5.4h, v5.4s, #5
+ smin v4.4h, v4.4h, v15.4h // clamp to [grain_min, grain_max]
+ smin v5.4h, v5.4h, v15.4h
+ smax v4.4h, v4.4h, v14.4h
+ smax v5.4h, v5.4h, v14.4h
+ ins v16.d[0], v4.d[0] // put the x-blended values into the first 4 lanes of current
+ ins v0.d[0], v5.d[0] // and of top, before the y blend
+.endif
+
+ smull v6.4s, v16.4h, v26.4h // y blend: current grain * v26
+ smull2 v7.4s, v16.8h, v26.8h
+ smull v10.4s, v17.4h, v26.4h
+ smull2 v11.4s, v17.8h, v26.8h
+ smull v16.4s, v18.4h, v26.4h
+ smull2 v17.4s, v18.8h, v26.8h
+ smull v18.4s, v19.4h, v26.4h
+ smull2 v19.4s, v19.8h, v26.8h
+ smlal v6.4s, v0.4h, v25.4h // + top grain * v25
+ smlal2 v7.4s, v0.8h, v25.8h
+ smlal v10.4s, v1.4h, v25.4h
+ smlal2 v11.4s, v1.8h, v25.8h
+ smlal v16.4s, v2.4h, v25.4h
+ smlal2 v17.4s, v2.8h, v25.8h
+ smlal v18.4s, v3.4h, v25.4h
+ smlal2 v19.4s, v3.8h, v25.8h
+ sqrshrn v6.4h, v6.4s, #5 // rounding shift right by 5, narrowing back to 16 bit
+ sqrshrn2 v6.8h, v7.4s, #5
+ sqrshrn v7.4h, v10.4s, #5
+ sqrshrn2 v7.8h, v11.4s, #5
+ sqrshrn v10.4h, v16.4s, #5
+ sqrshrn2 v10.8h, v17.4s, #5
+ sqrshrn v11.4h, v18.4s, #5
+ sqrshrn2 v11.8h, v19.4s, #5
+.endif
+
+.if \ox && !\oy
+ sqrshrn v4.4h, v4.4s, #5 // x-only overlap: finish the blend
+ smin v4.4h, v4.4h, v15.4h
+.endif
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma (v0-v3 no longer hold the top grain here)
+.if \oy
+ smin v16.8h, v6.8h, v15.8h // clamp to [grain_min, grain_max]; the blended grain ends up in v16-v19
+ smin v17.8h, v7.8h, v15.8h
+ smin v18.8h, v10.8h, v15.8h
+ smin v19.8h, v11.8h, v15.8h
+ smax v16.8h, v16.8h, v14.8h
+ smax v17.8h, v17.8h, v14.8h
+ smax v18.8h, v18.8h, v14.8h
+ smax v19.8h, v19.8h, v14.8h
+.endif
+
+.if \ox && !\oy
+ smax v4.4h, v4.4h, v14.4h
+.endif
+ ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src
+.if \ox && !\oy
+ ins v16.d[0], v4.d[0] // replace the first 4 grain values with the blended ones
+.endif
+
+.if !\csfl
+ smull v4.4s, v0.4h, v8.4h // luma * uv_luma_mult
+ smull2 v5.4s, v0.8h, v8.8h
+ smull v6.4s, v1.4h, v8.4h
+ smull2 v7.4s, v1.8h, v8.8h
+ smull v0.4s, v2.4h, v8.4h
+ smull2 v1.4s, v2.8h, v8.8h
+ smull v2.4s, v3.4h, v8.4h
+ smull2 v3.4s, v3.8h, v8.8h
+ smlal v4.4s, v10.4h, v9.4h // + src * uv_mult
+ smlal2 v5.4s, v10.8h, v9.8h
+ smlal v6.4s, v11.4h, v9.4h
+ smlal2 v7.4s, v11.8h, v9.8h
+ smlal v0.4s, v12.4h, v9.4h
+ smlal2 v1.4s, v12.8h, v9.8h
+ smlal v2.4s, v13.4h, v9.4h
+ smlal2 v3.4s, v13.8h, v9.8h
+ shrn v4.4h, v4.4s, #6 // >> 6, narrowing without rounding or saturation
+ shrn2 v4.8h, v5.4s, #6
+ shrn v5.4h, v6.4s, #6
+ shrn2 v5.8h, v7.4s, #6
+ shrn v6.4h, v0.4s, #6
+ shrn2 v6.8h, v1.4s, #6
+ shrn v7.4h, v2.4s, #6
+ shrn2 v7.8h, v3.4s, #6
+ add v0.8h, v4.8h, v24.8h // + (uv_offset << bitdepth_min_8)
+ add v1.8h, v5.8h, v24.8h
+ add v2.8h, v6.8h, v24.8h
+ add v3.8h, v7.8h, v24.8h
+ movi v20.8h, #0
+ smin v0.8h, v0.8h, v23.8h // clamp the lookup index to [0, bitdepth_max]
+ smin v1.8h, v1.8h, v23.8h
+ smin v2.8h, v2.8h, v23.8h
+ smin v3.8h, v3.8h, v23.8h
+ smax v0.8h, v0.8h, v20.8h
+ smax v1.8h, v1.8h, v20.8h
+ smax v2.8h, v2.8h, v20.8h
+ smax v3.8h, v3.8h, v20.8h
+.else
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v23.16b
+ and v1.16b, v1.16b, v23.16b
+ and v2.16b, v2.16b, v23.16b
+ and v3.16b, v3.16b, v23.16b
+.endif
+
+ bl gather32_neon // v6, v7 = bytes looked up with the indices in v0-v3; x30 was already saved by the entry function
+
+ uxtl v4.8h, v6.8b // scaling
+ uxtl2 v5.8h, v6.16b
+ uxtl v6.8h, v7.8b
+ uxtl2 v7.8h, v7.16b
+
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+ ushl v6.8h, v6.8h, v29.8h
+ ushl v7.8h, v7.8h, v29.8h
+
+ sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15)
+ sqrdmulh v17.8h, v17.8h, v5.8h
+ sqrdmulh v18.8h, v18.8h, v6.8h
+ sqrdmulh v19.8h, v19.8h, v7.8h
+
+ usqadd v10.8h, v16.8h // *src + noise
+ usqadd v11.8h, v17.8h
+ usqadd v12.8h, v18.8h
+ usqadd v13.8h, v19.8h
+
+ umax v0.8h, v10.8h, v30.8h // clamp to the [v30, v31] range chosen by the entry function
+ umax v1.8h, v11.8h, v30.8h
+ umax v2.8h, v12.8h, v30.8h
+ umax v3.8h, v13.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+
+ subs w9, w9, #1 // row counter; the flags are consumed by b.gt below
+.if \oy
+ dup v25.8h, v28.h[0] // weights for the next overlap row come from the second coefficient row
+ dup v26.8h, v28.h[1]
+.endif
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0 // any rows left after the overlap rows?
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) // continue in the variant without y overlap
+.endif
+ b 9f // shared epilogue after the last expansion
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ ldp d14, d15, [sp, #64] // epilogue: undo the frame pushed by the fguv entry point
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx0_tbl):
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) // indexed by type: bit 0 = oy, bit 1 = ox, bit 2 = csfl
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
+endfunc
+
+function fguv_loop_sx1_neon // Row loops for the fguv entry points with sx == 1 (16 pixels per row), one expansion per (csfl, ox, oy), plus the jump table.
+.macro fguv_loop_sx1 csfl, ox, oy // csfl: nonzero uses the averaged luma directly as the lookup index; ox, oy: nonzero enables x or y overlap blending
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+ ld1 {v18.4h}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v19.4h}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut
+
+.if \ox
+ smull v18.4s, v18.4h, v27.4h // x overlap: old * first coefficient row
+ smlal v18.4s, v16.4h, v28.4h // + current * second coefficient row
+.endif
+
+.if \oy
+.if \ox
+ smull v19.4s, v19.4h, v27.4h // same x blend for the top row: top old * first coefficient row
+ smlal v19.4s, v20.4h, v28.4h // + top * second coefficient row
+ sqrshrn v18.4h, v18.4s, #5 // rounding shift right by 5 with saturating narrow
+ sqrshrn v19.4h, v19.4s, #5
+ smin v18.4h, v18.4h, v15.4h // clamp to [grain_min, grain_max]
+ smin v19.4h, v19.4h, v15.4h
+ smax v18.4h, v18.4h, v14.4h
+ smax v19.4h, v19.4h, v14.4h
+ ins v16.d[0], v18.d[0] // put the x-blended values into the first 4 lanes of current
+ ins v20.d[0], v19.d[0] // and of top, before the y blend
+.endif
+
+ smull v0.4s, v16.4h, v26.4h // y blend: current grain * v26
+ smull2 v1.4s, v16.8h, v26.8h
+ smull v2.4s, v17.4h, v26.4h
+ smull2 v3.4s, v17.8h, v26.8h
+ smlal v0.4s, v20.4h, v25.4h // + top grain * v25
+ smlal2 v1.4s, v20.8h, v25.8h
+ smlal v2.4s, v21.4h, v25.4h
+ smlal2 v3.4s, v21.8h, v25.8h
+ sqrshrn v16.4h, v0.4s, #5 // rounding shift right by 5, narrowing back to 16 bit
+ sqrshrn2 v16.8h, v1.4s, #5
+ sqrshrn v17.4h, v2.4s, #5
+ sqrshrn2 v17.8h, v3.4s, #5
+.endif
+
+.if \ox && !\oy
+ sqrshrn v18.4h, v18.4s, #5 // x-only overlap: finish the blend
+ smin v18.4h, v18.4h, v15.4h
+.endif
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
+.if \oy
+ smin v16.8h, v16.8h, v15.8h // clamp to [grain_min, grain_max]
+ smin v17.8h, v17.8h, v15.8h
+ smax v16.8h, v16.8h, v14.8h
+ smax v17.8h, v17.8h, v14.8h
+.endif
+
+.if \ox && !\oy
+ smax v18.4h, v18.4h, v14.4h
+.endif
+ ld1 {v10.8h, v11.8h}, [x1], x2 // src
+.if \ox && !\oy
+ ins v16.d[0], v18.d[0] // replace the first 4 grain values with the blended ones
+.endif
+ addp v0.8h, v0.8h, v1.8h // sum each horizontal pair of luma values: 32 loaded values become 16
+ addp v1.8h, v2.8h, v3.8h
+ urshr v0.8h, v0.8h, #1 // rounding halving of the pair sums
+ urshr v1.8h, v1.8h, #1
+.if !\csfl
+ smull v2.4s, v0.4h, v8.4h // averaged luma * uv_luma_mult
+ smull2 v3.4s, v0.8h, v8.8h
+ smull v0.4s, v1.4h, v8.4h
+ smull2 v1.4s, v1.8h, v8.8h
+ smlal v2.4s, v10.4h, v9.4h // + src * uv_mult
+ smlal2 v3.4s, v10.8h, v9.8h
+ smlal v0.4s, v11.4h, v9.4h
+ smlal2 v1.4s, v11.8h, v9.8h
+ shrn v2.4h, v2.4s, #6 // >> 6, narrowing without rounding or saturation
+ shrn2 v2.8h, v3.4s, #6
+ shrn v3.4h, v0.4s, #6
+ shrn2 v3.8h, v1.4s, #6
+ add v0.8h, v2.8h, v24.8h // + (uv_offset << bitdepth_min_8)
+ add v1.8h, v3.8h, v24.8h
+ movi v2.8h, #0
+ smin v0.8h, v0.8h, v23.8h // clamp the lookup index to [0, bitdepth_max]
+ smin v1.8h, v1.8h, v23.8h
+ smax v0.8h, v0.8h, v2.8h
+ smax v1.8h, v1.8h, v2.8h
+.else
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v23.16b
+ and v1.16b, v1.16b, v23.16b
+.endif
+
+ bl gather16_neon // v6 = 16 bytes looked up with the indices in v0 and v1; x30 was already saved by the entry function
+
+ uxtl v4.8h, v6.8b // scaling
+ uxtl2 v5.8h, v6.16b
+
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+
+ sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15)
+ sqrdmulh v17.8h, v17.8h, v5.8h
+
+ usqadd v10.8h, v16.8h // *src + noise
+ usqadd v11.8h, v17.8h
+
+ umax v0.8h, v10.8h, v30.8h // clamp to the [v30, v31] range chosen by the entry function
+ umax v1.8h, v11.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+
+.if \oy
+ mov v16.16b, v25.16b // swap v25 and v26 for the next overlap row; v16 is free here and used as the temporary
+.endif
+ subs w9, w9, #1 // row counter; the flags are consumed by b.gt below
+.if \oy
+ mov v25.16b, v26.16b
+ mov v26.16b, v16.16b
+.endif
+ st1 {v0.8h, v1.8h}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0 // any rows left after the overlap rows?
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) // continue in the variant without y overlap
+.endif
+
+ b 9f // shared epilogue after the last expansion
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ ldp d14, d15, [sp, #64] // epilogue: undo the frame pushed by the fguv entry point
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx1_tbl):
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) // indexed by type: bit 0 = oy, bit 1 = ox, bit 2 = csfl
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S
index ec932af0ce66d..c9650e9d544b7 100644
--- a/third_party/dav1d/src/arm/64/itx.S
+++ b/third_party/dav1d/src/arm/64/itx.S
@@ -133,10 +133,10 @@ endconst
.endif
.endm
-.macro rshrn_sz d0, s0, s1, shift, sz
- rshrn \d0\().4h, \s0\().4s, \shift
+.macro sqrshrn_sz d0, s0, s1, shift, sz // Narrow 32-bit lanes to 16 bit with a saturating rounding right shift; replaces the rshrn-based rshrn_sz.
+ sqrshrn \d0\().4h, \s0\().4s, \shift // low half of d0 from s0
.ifc \sz, .8h
- rshrn2 \d0\().8h, \s1\().4s, \shift
+ sqrshrn2 \d0\().8h, \s1\().4s, \shift // high half of d0 from s1, only when sz is .8h
.endif
.endm
@@ -438,11 +438,11 @@ endfunc
smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz
smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz
smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz
- rshrn_sz v6, v6, v7, #12, \sz
- rshrn_sz v7, v4, v5, #12, \sz
+ sqrshrn_sz v6, v6, v7, #12, \sz
+ sqrshrn_sz v7, v4, v5, #12, \sz
smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz
- rshrn_sz v2, v2, v3, #12, \sz
- rshrn_sz v3, v4, v5, #12, \sz
+ sqrshrn_sz v2, v2, v3, #12, \sz
+ sqrshrn_sz v3, v4, v5, #12, \sz
sqadd \r0\sz, v2\sz, v6\sz
sqsub \r3\sz, v2\sz, v6\sz
sqadd \r1\sz, v3\sz, v7\sz
@@ -714,11 +714,11 @@ def_fn_4x4 identity, flipadst
smull_smlsl v2, v3, \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a
smull_smlal v4, v5, \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a
smull_smlsl v6, v7, \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a
- rshrn_sz \r1, v2, v3, #12, \sz // t4a
- rshrn_sz \r7, v4, v5, #12, \sz // t7a
+ sqrshrn_sz \r1, v2, v3, #12, \sz // t4a
+ sqrshrn_sz \r7, v4, v5, #12, \sz // t7a
smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a
- rshrn_sz \r3, v6, v7, #12, \sz // t5a
- rshrn_sz \r5, v2, v3, #12, \sz // t6a
+ sqrshrn_sz \r3, v6, v7, #12, \sz // t5a
+ sqrshrn_sz \r5, v2, v3, #12, \sz // t6a
sqadd v2\sz, \r1\sz, \r3\sz // t4
sqsub \r1\sz, \r1\sz, \r3\sz // t5a
@@ -727,8 +727,8 @@ def_fn_4x4 identity, flipadst
smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5
smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6
- rshrn_sz v4, v4, v5, #12, \sz // t5
- rshrn_sz v5, v6, v7, #12, \sz // t6
+ sqrshrn_sz v4, v4, v5, #12, \sz // t5
+ sqrshrn_sz v5, v6, v7, #12, \sz // t6
sqsub \r7\sz, \r0\sz, v3\sz // out7
sqadd \r0\sz, \r0\sz, v3\sz // out0
@@ -762,19 +762,19 @@ endfunc
smull_smlal v2, v3, v23, v16, v0.h[0], v0.h[1], \sz
smull_smlsl v4, v5, v23, v16, v0.h[1], v0.h[0], \sz
smull_smlal v6, v7, v21, v18, v0.h[2], v0.h[3], \sz
- rshrn_sz v16, v2, v3, #12, \sz // t0a
- rshrn_sz v23, v4, v5, #12, \sz // t1a
+ sqrshrn_sz v16, v2, v3, #12, \sz // t0a
+ sqrshrn_sz v23, v4, v5, #12, \sz // t1a
smull_smlsl v2, v3, v21, v18, v0.h[3], v0.h[2], \sz
smull_smlal v4, v5, v19, v20, v0.h[4], v0.h[5], \sz
- rshrn_sz v18, v6, v7, #12, \sz // t2a
- rshrn_sz v21, v2, v3, #12, \sz // t3a
+ sqrshrn_sz v18, v6, v7, #12, \sz // t2a
+ sqrshrn_sz v21, v2, v3, #12, \sz // t3a
smull_smlsl v6, v7, v19, v20, v0.h[5], v0.h[4], \sz
smull_smlal v2, v3, v17, v22, v0.h[6], v0.h[7], \sz
- rshrn_sz v20, v4, v5, #12, \sz // t4a
- rshrn_sz v19, v6, v7, #12, \sz // t5a
+ sqrshrn_sz v20, v4, v5, #12, \sz // t4a
+ sqrshrn_sz v19, v6, v7, #12, \sz // t5a
smull_smlsl v4, v5, v17, v22, v0.h[7], v0.h[6], \sz
- rshrn_sz v22, v2, v3, #12, \sz // t6a
- rshrn_sz v17, v4, v5, #12, \sz // t7a
+ sqrshrn_sz v22, v2, v3, #12, \sz // t6a
+ sqrshrn_sz v17, v4, v5, #12, \sz // t7a
sqadd v2\sz, v16\sz, v20\sz // t0
sqsub v3\sz, v16\sz, v20\sz // t4
@@ -789,13 +789,13 @@ endfunc
smull_smlsl v20, v21, v3, v5, v1.h[2], v1.h[3], \sz
smull_smlsl v22, v23, v19, v7, v1.h[3], v1.h[2], \sz
- rshrn_sz v3, v16, v17, #12, \sz // t4a
- rshrn_sz v5, v20, v21, #12, \sz // t5a
+ sqrshrn_sz v3, v16, v17, #12, \sz // t4a
+ sqrshrn_sz v5, v20, v21, #12, \sz // t5a
smull_smlal v16, v17, v19, v7, v1.h[2], v1.h[3], \sz
- rshrn_sz v7, v22, v23, #12, \sz // t6a
- rshrn_sz v19, v16, v17, #12, \sz // t7a
+ sqrshrn_sz v7, v22, v23, #12, \sz // t6a
+ sqrshrn_sz v19, v16, v17, #12, \sz // t7a
sqadd \o0\()\sz, v2\sz, v6\sz // out0
sqsub v2\sz, v2\sz, v6\sz // t2
@@ -812,11 +812,11 @@ endfunc
smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20)
smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19)
smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18)
- rshrn_sz v2, v18, v19, #12, \sz // out3
+ sqrshrn_sz v2, v18, v19, #12, \sz // out3
smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21)
- rshrn_sz v3, v20, v21, #12, \sz // out5
- rshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21)
- rshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19)
+ sqrshrn_sz v3, v20, v21, #12, \sz // out5
+ sqrshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21)
+ sqrshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19)
sqneg \o3\()\sz, v2\sz // out3
sqneg \o5\()\sz, v3\sz // out5
@@ -1033,19 +1033,19 @@ def_fns_48 8, 4
smull_smlsl v2, v3, v17, v31, v1.h[0], v1.h[1], \sz // -> t8a
smull_smlal v4, v5, v17, v31, v1.h[1], v1.h[0], \sz // -> t15a
smull_smlsl v6, v7, v25, v23, v1.h[2], v1.h[3], \sz // -> t9a
- rshrn_sz v17, v2, v3, #12, \sz // t8a
- rshrn_sz v31, v4, v5, #12, \sz // t15a
+ sqrshrn_sz v17, v2, v3, #12, \sz // t8a
+ sqrshrn_sz v31, v4, v5, #12, \sz // t15a
smull_smlal v2, v3, v25, v23, v1.h[3], v1.h[2], \sz // -> t14a
smull_smlsl v4, v5, v21, v27, v1.h[4], v1.h[5], \sz // -> t10a
- rshrn_sz v23, v6, v7, #12, \sz // t9a
- rshrn_sz v25, v2, v3, #12, \sz // t14a
+ sqrshrn_sz v23, v6, v7, #12, \sz // t9a
+ sqrshrn_sz v25, v2, v3, #12, \sz // t14a
smull_smlal v6, v7, v21, v27, v1.h[5], v1.h[4], \sz // -> t13a
smull_smlsl v2, v3, v29, v19, v1.h[6], v1.h[7], \sz // -> t11a
- rshrn_sz v21, v4, v5, #12, \sz // t10a
- rshrn_sz v27, v6, v7, #12, \sz // t13a
+ sqrshrn_sz v21, v4, v5, #12, \sz // t10a
+ sqrshrn_sz v27, v6, v7, #12, \sz // t13a
smull_smlal v4, v5, v29, v19, v1.h[7], v1.h[6], \sz // -> t12a
- rshrn_sz v19, v2, v3, #12, \sz // t11a
- rshrn_sz v29, v4, v5, #12, \sz // t12a
+ sqrshrn_sz v19, v2, v3, #12, \sz // t11a
+ sqrshrn_sz v29, v4, v5, #12, \sz // t12a
sqsub v2\sz, v17\sz, v23\sz // t9
sqadd v17\sz, v17\sz, v23\sz // t8
@@ -1058,17 +1058,17 @@ def_fns_48 8, 4
smull_smlsl v4, v5, v3, v2, v0.h[2], v0.h[3], \sz // -> t9a
smull_smlal v6, v7, v3, v2, v0.h[3], v0.h[2], \sz // -> t14a
- rshrn_sz v21, v4, v5, #12, \sz // t9a
- rshrn_sz v27, v6, v7, #12, \sz // t14a
+ sqrshrn_sz v21, v4, v5, #12, \sz // t9a
+ sqrshrn_sz v27, v6, v7, #12, \sz // t14a
smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
- rshrn_sz v29, v4, v5, #12, \sz // t13a
+ sqrshrn_sz v29, v4, v5, #12, \sz // t13a
neg v6.4s, v6.4s
.ifc \sz, .8h
neg v7.4s, v7.4s
.endif
- rshrn_sz v23, v6, v7, #12, \sz // t10a
+ sqrshrn_sz v23, v6, v7, #12, \sz // t10a
sqsub v2\sz, v17\sz, v19\sz // t11a
sqadd v17\sz, v17\sz, v19\sz // t8a
@@ -1083,11 +1083,11 @@ def_fns_48 8, 4
smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12
smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
- rshrn_sz v4, v4, v5, #12, \sz // t11
- rshrn_sz v5, v6, v7, #12, \sz // t12
+ sqrshrn_sz v4, v4, v5, #12, \sz // t11
+ sqrshrn_sz v5, v6, v7, #12, \sz // t12
smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a
- rshrn_sz v2, v2, v3, #12, \sz // t10a
- rshrn_sz v3, v6, v7, #12, \sz // t13a
+ sqrshrn_sz v2, v2, v3, #12, \sz // t10a
+ sqrshrn_sz v3, v6, v7, #12, \sz // t13a
sqadd v6\sz, v16\sz, v31\sz // out0
sqsub v31\sz, v16\sz, v31\sz // out15
@@ -1132,35 +1132,35 @@ endfunc
smull_smlal v2, v3, v31, v16, v0.h[0], v0.h[1], \sz // -> t0
smull_smlsl v4, v5, v31, v16, v0.h[1], v0.h[0], \sz // -> t1
smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t2
- rshrn_sz v16, v2, v3, #12, \sz // t0
- rshrn_sz v31, v4, v5, #12, \sz // t1
+ sqrshrn_sz v16, v2, v3, #12, \sz // t0
+ sqrshrn_sz v31, v4, v5, #12, \sz // t1
smull_smlsl v2, v3, v29, v18, v0.h[3], v0.h[2], \sz // -> t3
smull_smlal v4, v5, v27, v20, v0.h[4], v0.h[5], \sz // -> t4
- rshrn_sz v18, v6, v7, #12, \sz // t2
- rshrn_sz v29, v2, v3, #12, \sz // t3
+ sqrshrn_sz v18, v6, v7, #12, \sz // t2
+ sqrshrn_sz v29, v2, v3, #12, \sz // t3
smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t5
smull_smlal v2, v3, v25, v22, v0.h[6], v0.h[7], \sz // -> t6
- rshrn_sz v20, v4, v5, #12, \sz // t4
- rshrn_sz v27, v6, v7, #12, \sz // t5
+ sqrshrn_sz v20, v4, v5, #12, \sz // t4
+ sqrshrn_sz v27, v6, v7, #12, \sz // t5
smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t7
smull_smlal v6, v7, v23, v24, v1.h[0], v1.h[1], \sz // -> t8
- rshrn_sz v22, v2, v3, #12, \sz // t6
- rshrn_sz v25, v4, v5, #12, \sz // t7
+ sqrshrn_sz v22, v2, v3, #12, \sz // t6
+ sqrshrn_sz v25, v4, v5, #12, \sz // t7
smull_smlsl v2, v3, v23, v24, v1.h[1], v1.h[0], \sz // -> t9
smull_smlal v4, v5, v21, v26, v1.h[2], v1.h[3], \sz // -> t10
- rshrn_sz v23, v6, v7, #12, \sz // t8
- rshrn_sz v24, v2, v3, #12, \sz // t9
+ sqrshrn_sz v23, v6, v7, #12, \sz // t8
+ sqrshrn_sz v24, v2, v3, #12, \sz // t9
smull_smlsl v6, v7, v21, v26, v1.h[3], v1.h[2], \sz // -> t11
smull_smlal v2, v3, v19, v28, v1.h[4], v1.h[5], \sz // -> t12
- rshrn_sz v21, v4, v5, #12, \sz // t10
- rshrn_sz v26, v6, v7, #12, \sz // t11
+ sqrshrn_sz v21, v4, v5, #12, \sz // t10
+ sqrshrn_sz v26, v6, v7, #12, \sz // t11
smull_smlsl v4, v5, v19, v28, v1.h[5], v1.h[4], \sz // -> t13
smull_smlal v6, v7, v17, v30, v1.h[6], v1.h[7], \sz // -> t14
- rshrn_sz v19, v2, v3, #12, \sz // t12
- rshrn_sz v28, v4, v5, #12, \sz // t13
+ sqrshrn_sz v19, v2, v3, #12, \sz // t12
+ sqrshrn_sz v28, v4, v5, #12, \sz // t13
smull_smlsl v2, v3, v17, v30, v1.h[7], v1.h[6], \sz // -> t15
- rshrn_sz v17, v6, v7, #12, \sz // t14
- rshrn_sz v30, v2, v3, #12, \sz // t15
+ sqrshrn_sz v17, v6, v7, #12, \sz // t14
+ sqrshrn_sz v30, v2, v3, #12, \sz // t15
ld1 {v0.8h}, [x16]
@@ -1184,19 +1184,19 @@ endfunc
smull_smlal v4, v5, v2, v3, v0.h[5], v0.h[4], \sz // -> t8
smull_smlsl v6, v7, v2, v3, v0.h[4], v0.h[5], \sz // -> t9
smull_smlal v2, v3, v18, v29, v0.h[7], v0.h[6], \sz // -> t10
- rshrn_sz v17, v4, v5, #12, \sz // t8
- rshrn_sz v30, v6, v7, #12, \sz // t9
+ sqrshrn_sz v17, v4, v5, #12, \sz // t8
+ sqrshrn_sz v30, v6, v7, #12, \sz // t9
smull_smlsl v4, v5, v18, v29, v0.h[6], v0.h[7], \sz // -> t11
smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t12
- rshrn_sz v18, v2, v3, #12, \sz // t10
- rshrn_sz v29, v4, v5, #12, \sz // t11
+ sqrshrn_sz v18, v2, v3, #12, \sz // t10
+ sqrshrn_sz v29, v4, v5, #12, \sz // t11
smull_smlal v2, v3, v27, v20, v0.h[4], v0.h[5], \sz // -> t13
smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t14
- rshrn_sz v27, v6, v7, #12, \sz // t12
- rshrn_sz v20, v2, v3, #12, \sz // t13
+ sqrshrn_sz v27, v6, v7, #12, \sz // t12
+ sqrshrn_sz v20, v2, v3, #12, \sz // t13
smull_smlal v6, v7, v25, v22, v0.h[6], v0.h[7], \sz // -> t15
- rshrn_sz v25, v4, v5, #12, \sz // t14
- rshrn_sz v22, v6, v7, #12, \sz // t15
+ sqrshrn_sz v25, v4, v5, #12, \sz // t14
+ sqrshrn_sz v22, v6, v7, #12, \sz // t15
sqsub v2\sz, v16\sz, v21\sz // t4
sqadd v16\sz, v16\sz, v21\sz // t0
@@ -1218,19 +1218,19 @@ endfunc
smull_smlal v4, v5, v2, v3, v0.h[3], v0.h[2], \sz // -> t4a
smull_smlsl v6, v7, v2, v3, v0.h[2], v0.h[3], \sz // -> t5a
smull_smlsl v2, v3, v24, v23, v0.h[3], v0.h[2], \sz // -> t6a
- rshrn_sz v22, v4, v5, #12, \sz // t4a
- rshrn_sz v25, v6, v7, #12, \sz // t5a
+ sqrshrn_sz v22, v4, v5, #12, \sz // t4a
+ sqrshrn_sz v25, v6, v7, #12, \sz // t5a
smull_smlal v4, v5, v24, v23, v0.h[2], v0.h[3], \sz // -> t7a
smull_smlal v6, v7, v17, v30, v0.h[3], v0.h[2], \sz // -> t12
- rshrn_sz v24, v2, v3, #12, \sz // t6a
- rshrn_sz v23, v4, v5, #12, \sz // t7a
+ sqrshrn_sz v24, v2, v3, #12, \sz // t6a
+ sqrshrn_sz v23, v4, v5, #12, \sz // t7a
smull_smlsl v2, v3, v17, v30, v0.h[2], v0.h[3], \sz // -> t13
smull_smlsl v4, v5, v29, v18, v0.h[3], v0.h[2], \sz // -> t14
- rshrn_sz v17, v6, v7, #12, \sz // t12
+ sqrshrn_sz v17, v6, v7, #12, \sz // t12
smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t15
- rshrn_sz v29, v2, v3, #12, \sz // t13
- rshrn_sz v30, v4, v5, #12, \sz // t14
- rshrn_sz v18, v6, v7, #12, \sz // t15
+ sqrshrn_sz v29, v2, v3, #12, \sz // t13
+ sqrshrn_sz v30, v4, v5, #12, \sz // t14
+ sqrshrn_sz v18, v6, v7, #12, \sz // t15
sqsub v2\sz, v16\sz, v21\sz // t2a
.ifc \o0, v16
@@ -1267,21 +1267,21 @@ endfunc
smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)
- rshrn_sz v24, v24, v25, #12, \sz // out8
- rshrn_sz v4, v4, v5, #12, \sz // out7
- rshrn_sz v5, v6, v7, #12, \sz // out5
+ sqrshrn_sz v24, v24, v25, #12, \sz // out8
+ sqrshrn_sz v4, v4, v5, #12, \sz // out7
+ sqrshrn_sz v5, v6, v7, #12, \sz // out5
smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
- rshrn_sz v26, v6, v7, #12, \sz // out10
+ sqrshrn_sz v26, v6, v7, #12, \sz // out10
smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)
- rshrn_sz \o4, v2, v3, #12, \sz // out4
- rshrn_sz v6, v6, v7, #12, \sz // out11
- rshrn_sz v7, v21, v25, #12, \sz // out9
- rshrn_sz \o6, v22, v23, #12, \sz // out6
+ sqrshrn_sz \o4, v2, v3, #12, \sz // out4
+ sqrshrn_sz v6, v6, v7, #12, \sz // out11
+ sqrshrn_sz v7, v21, v25, #12, \sz // out9
+ sqrshrn_sz \o6, v22, v23, #12, \sz // out6
.ifc \o8, v23
mov \o8\szb, v24\szb
@@ -1860,35 +1860,35 @@ function inv_dct32_odd_8h_x16_neon, export=1
smull_smlsl v2, v3, v16, v31, v0.h[0], v0.h[1], .8h // -> t16a
smull_smlal v4, v5, v16, v31, v0.h[1], v0.h[0], .8h // -> t31a
smull_smlsl v6, v7, v24, v23, v0.h[2], v0.h[3], .8h // -> t17a
- rshrn_sz v16, v2, v3, #12, .8h // t16a
- rshrn_sz v31, v4, v5, #12, .8h // t31a
+ sqrshrn_sz v16, v2, v3, #12, .8h // t16a
+ sqrshrn_sz v31, v4, v5, #12, .8h // t31a
smull_smlal v2, v3, v24, v23, v0.h[3], v0.h[2], .8h // -> t30a
smull_smlsl v4, v5, v20, v27, v0.h[4], v0.h[5], .8h // -> t18a
- rshrn_sz v24, v6, v7, #12, .8h // t17a
- rshrn_sz v23, v2, v3, #12, .8h // t30a
+ sqrshrn_sz v24, v6, v7, #12, .8h // t17a
+ sqrshrn_sz v23, v2, v3, #12, .8h // t30a
smull_smlal v6, v7, v20, v27, v0.h[5], v0.h[4], .8h // -> t29a
smull_smlsl v2, v3, v28, v19, v0.h[6], v0.h[7], .8h // -> t19a
- rshrn_sz v20, v4, v5, #12, .8h // t18a
- rshrn_sz v27, v6, v7, #12, .8h // t29a
+ sqrshrn_sz v20, v4, v5, #12, .8h // t18a
+ sqrshrn_sz v27, v6, v7, #12, .8h // t29a
smull_smlal v4, v5, v28, v19, v0.h[7], v0.h[6], .8h // -> t28a
smull_smlsl v6, v7, v18, v29, v1.h[0], v1.h[1], .8h // -> t20a
- rshrn_sz v28, v2, v3, #12, .8h // t19a
- rshrn_sz v19, v4, v5, #12, .8h // t28a
+ sqrshrn_sz v28, v2, v3, #12, .8h // t19a
+ sqrshrn_sz v19, v4, v5, #12, .8h // t28a
smull_smlal v2, v3, v18, v29, v1.h[1], v1.h[0], .8h // -> t27a
smull_smlsl v4, v5, v26, v21, v1.h[2], v1.h[3], .8h // -> t21a
- rshrn_sz v18, v6, v7, #12, .8h // t20a
- rshrn_sz v29, v2, v3, #12, .8h // t27a
+ sqrshrn_sz v18, v6, v7, #12, .8h // t20a
+ sqrshrn_sz v29, v2, v3, #12, .8h // t27a
smull_smlal v6, v7, v26, v21, v1.h[3], v1.h[2], .8h // -> t26a
smull_smlsl v2, v3, v22, v25, v1.h[4], v1.h[5], .8h // -> t22a
- rshrn_sz v26, v4, v5, #12, .8h // t21a
- rshrn_sz v21, v6, v7, #12, .8h // t26a
+ sqrshrn_sz v26, v4, v5, #12, .8h // t21a
+ sqrshrn_sz v21, v6, v7, #12, .8h // t26a
smull_smlal v4, v5, v22, v25, v1.h[5], v1.h[4], .8h // -> t25a
smull_smlsl v6, v7, v30, v17, v1.h[6], v1.h[7], .8h // -> t23a
- rshrn_sz v22, v2, v3, #12, .8h // t22a
- rshrn_sz v25, v4, v5, #12, .8h // t25a
+ sqrshrn_sz v22, v2, v3, #12, .8h // t22a
+ sqrshrn_sz v25, v4, v5, #12, .8h // t25a
smull_smlal v2, v3, v30, v17, v1.h[7], v1.h[6], .8h // -> t24a
- rshrn_sz v30, v6, v7, #12, .8h // t23a
- rshrn_sz v17, v2, v3, #12, .8h // t24a
+ sqrshrn_sz v30, v6, v7, #12, .8h // t23a
+ sqrshrn_sz v17, v2, v3, #12, .8h // t24a
ld1 {v0.8h}, [x16]
@@ -1912,23 +1912,23 @@ function inv_dct32_odd_8h_x16_neon, export=1
smull_smlsl v4, v5, v3, v2, v0.h[4], v0.h[5], .8h // -> t17a
smull_smlal v6, v7, v3, v2, v0.h[5], v0.h[4], .8h // -> t30a
smull_smlal v2, v3, v19, v24, v0.h[5], v0.h[4], .8h // -> t18a
- rshrn_sz v21, v4, v5, #12, .8h // t17a
- rshrn_sz v27, v6, v7, #12, .8h // t30a
+ sqrshrn_sz v21, v4, v5, #12, .8h // t17a
+ sqrshrn_sz v27, v6, v7, #12, .8h // t30a
neg v2.4s, v2.4s // -> t18a
neg v3.4s, v3.4s // -> t18a
smull_smlsl v4, v5, v19, v24, v0.h[4], v0.h[5], .8h // -> t29a
smull_smlsl v6, v7, v22, v18, v0.h[6], v0.h[7], .8h // -> t21a
- rshrn_sz v19, v2, v3, #12, .8h // t18a
- rshrn_sz v24, v4, v5, #12, .8h // t29a
+ sqrshrn_sz v19, v2, v3, #12, .8h // t18a
+ sqrshrn_sz v24, v4, v5, #12, .8h // t29a
smull_smlal v2, v3, v22, v18, v0.h[7], v0.h[6], .8h // -> t26a
smull_smlal v4, v5, v17, v20, v0.h[7], v0.h[6], .8h // -> t22a
- rshrn_sz v22, v6, v7, #12, .8h // t21a
- rshrn_sz v18, v2, v3, #12, .8h // t26a
+ sqrshrn_sz v22, v6, v7, #12, .8h // t21a
+ sqrshrn_sz v18, v2, v3, #12, .8h // t26a
neg v4.4s, v4.4s // -> t22a
neg v5.4s, v5.4s // -> t22a
smull_smlsl v6, v7, v17, v20, v0.h[6], v0.h[7], .8h // -> t25a
- rshrn_sz v17, v4, v5, #12, .8h // t22a
- rshrn_sz v20, v6, v7, #12, .8h // t25a
+ sqrshrn_sz v17, v4, v5, #12, .8h // t22a
+ sqrshrn_sz v20, v6, v7, #12, .8h // t25a
sqsub v2.8h, v27.8h, v24.8h // t29
sqadd v27.8h, v27.8h, v24.8h // t30
@@ -1950,23 +1950,23 @@ function inv_dct32_odd_8h_x16_neon, export=1
smull_smlsl v4, v5, v2, v3, v0.h[2], v0.h[3], .8h // -> t18a
smull_smlal v6, v7, v2, v3, v0.h[3], v0.h[2], .8h // -> t29a
smull_smlsl v2, v3, v29, v24, v0.h[2], v0.h[3], .8h // -> t19
- rshrn_sz v18, v4, v5, #12, .8h // t18a
- rshrn_sz v25, v6, v7, #12, .8h // t29a
+ sqrshrn_sz v18, v4, v5, #12, .8h // t18a
+ sqrshrn_sz v25, v6, v7, #12, .8h // t29a
smull_smlal v4, v5, v29, v24, v0.h[3], v0.h[2], .8h // -> t28
smull_smlal v6, v7, v26, v19, v0.h[3], v0.h[2], .8h // -> t20
- rshrn_sz v29, v2, v3, #12, .8h // t19
- rshrn_sz v24, v4, v5, #12, .8h // t28
+ sqrshrn_sz v29, v2, v3, #12, .8h // t19
+ sqrshrn_sz v24, v4, v5, #12, .8h // t28
neg v6.4s, v6.4s // -> t20
neg v7.4s, v7.4s // -> t20
smull_smlsl v2, v3, v26, v19, v0.h[2], v0.h[3], .8h // -> t27
smull_smlal v4, v5, v20, v28, v0.h[3], v0.h[2], .8h // -> t21a
- rshrn_sz v26, v6, v7, #12, .8h // t20
- rshrn_sz v19, v2, v3, #12, .8h // t27
+ sqrshrn_sz v26, v6, v7, #12, .8h // t20
+ sqrshrn_sz v19, v2, v3, #12, .8h // t27
neg v4.4s, v4.4s // -> t21a
neg v5.4s, v5.4s // -> t21a
smull_smlsl v6, v7, v20, v28, v0.h[2], v0.h[3], .8h // -> t26a
- rshrn_sz v20, v4, v5, #12, .8h // t21a
- rshrn_sz v28, v6, v7, #12, .8h // t26a
+ sqrshrn_sz v20, v4, v5, #12, .8h // t21a
+ sqrshrn_sz v28, v6, v7, #12, .8h // t26a
sqsub v2.8h, v16.8h, v30.8h // t23
sqadd v16.8h, v16.8h, v30.8h // t16 = out16
@@ -1988,24 +1988,24 @@ function inv_dct32_odd_8h_x16_neon, export=1
smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20
smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27
- rshrn_sz v20, v4, v5, #12, .8h // t20
- rshrn_sz v22, v6, v7, #12, .8h // t27
+ sqrshrn_sz v20, v4, v5, #12, .8h // t20
+ sqrshrn_sz v22, v6, v7, #12, .8h // t27
smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
mov v27.16b, v22.16b // t27
- rshrn_sz v26, v4, v5, #12, .8h // t26a
+ sqrshrn_sz v26, v4, v5, #12, .8h // t26a
smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25
- rshrn_sz v21, v6, v7, #12, .8h // t21a
- rshrn_sz v22, v24, v25, #12, .8h // t22
- rshrn_sz v25, v4, v5, #12, .8h // t25
+ sqrshrn_sz v21, v6, v7, #12, .8h // t21a
+ sqrshrn_sz v22, v24, v25, #12, .8h // t22
+ sqrshrn_sz v25, v4, v5, #12, .8h // t25
smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a
smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a
- rshrn_sz v23, v4, v5, #12, .8h // t23a
- rshrn_sz v24, v6, v7, #12, .8h // t24a
+ sqrshrn_sz v23, v4, v5, #12, .8h // t23a
+ sqrshrn_sz v24, v6, v7, #12, .8h // t24a
ret
endfunc
@@ -2594,11 +2594,11 @@ function inv_dct64_step1_neon
neg v2.4s, v2.4s // t34a
neg v3.4s, v3.4s // t34a
smull_smlsl v6, v7, v30, v25, v1.h[1], v1.h[0], .8h // -> t33a
- rshrn_sz v26, v2, v3, #12, .8h // t34a
+ sqrshrn_sz v26, v2, v3, #12, .8h // t34a
smull_smlal v2, v3, v30, v25, v1.h[0], v1.h[1], .8h // -> t62a
- rshrn_sz v29, v4, v5, #12, .8h // t61a
- rshrn_sz v25, v6, v7, #12, .8h // t33a
- rshrn_sz v30, v2, v3, #12, .8h // t62a
+ sqrshrn_sz v29, v4, v5, #12, .8h // t61a
+ sqrshrn_sz v25, v6, v7, #12, .8h // t33a
+ sqrshrn_sz v30, v2, v3, #12, .8h // t62a
sqadd v16.8h, v24.8h, v27.8h // t32a
sqsub v19.8h, v24.8h, v27.8h // t35a
@@ -2612,11 +2612,11 @@ function inv_dct64_step1_neon
smull_smlal v2, v3, v21, v18, v1.h[2], v1.h[3], .8h // -> t61a
smull_smlsl v4, v5, v21, v18, v1.h[3], v1.h[2], .8h // -> t34a
smull_smlal v6, v7, v20, v19, v1.h[2], v1.h[3], .8h // -> t60
- rshrn_sz v21, v2, v3, #12, .8h // t61a
- rshrn_sz v18, v4, v5, #12, .8h // t34a
+ sqrshrn_sz v21, v2, v3, #12, .8h // t61a
+ sqrshrn_sz v18, v4, v5, #12, .8h // t34a
smull_smlsl v2, v3, v20, v19, v1.h[3], v1.h[2], .8h // -> t35
- rshrn_sz v20, v6, v7, #12, .8h // t60
- rshrn_sz v19, v2, v3, #12, .8h // t35
+ sqrshrn_sz v20, v6, v7, #12, .8h // t60
+ sqrshrn_sz v19, v2, v3, #12, .8h // t35
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
@@ -2653,13 +2653,13 @@ function inv_dct64_step2_neon
smull_smlal v2, v3, v27, v25, v0.h[3], v0.h[2], .8h // -> t56a
smull_smlsl v4, v5, v27, v25, v0.h[2], v0.h[3], .8h // -> t39a
smull_smlal v6, v7, v31, v28, v0.h[3], v0.h[2], .8h // -> t40a
- rshrn_sz v25, v2, v3, #12, .8h // t56a
- rshrn_sz v27, v4, v5, #12, .8h // t39a
+ sqrshrn_sz v25, v2, v3, #12, .8h // t56a
+ sqrshrn_sz v27, v4, v5, #12, .8h // t39a
neg v6.4s, v6.4s // t40a
neg v7.4s, v7.4s // t40a
smull_smlsl v2, v3, v31, v28, v0.h[2], v0.h[3], .8h // -> t55a
- rshrn_sz v31, v6, v7, #12, .8h // t40a
- rshrn_sz v28, v2, v3, #12, .8h // t55a
+ sqrshrn_sz v31, v6, v7, #12, .8h // t40a
+ sqrshrn_sz v28, v2, v3, #12, .8h // t55a
sqadd v16.8h, v24.8h, v29.8h // t32a
sqsub v19.8h, v24.8h, v29.8h // t47a
@@ -2673,11 +2673,11 @@ function inv_dct64_step2_neon
smull_smlsl v2, v3, v21, v18, v0.h[0], v0.h[0], .8h // -> t40a
smull_smlal v4, v5, v21, v18, v0.h[0], v0.h[0], .8h // -> t55a
smull_smlsl v6, v7, v20, v19, v0.h[0], v0.h[0], .8h // -> t47
- rshrn_sz v18, v2, v3, #12, .8h // t40a
- rshrn_sz v21, v4, v5, #12, .8h // t55a
+ sqrshrn_sz v18, v2, v3, #12, .8h // t40a
+ sqrshrn_sz v21, v4, v5, #12, .8h // t55a
smull_smlal v2, v3, v20, v19, v0.h[0], v0.h[0], .8h // -> t48
- rshrn_sz v19, v6, v7, #12, .8h // t47
- rshrn_sz v20, v2, v3, #12, .8h // t48
+ sqrshrn_sz v19, v6, v7, #12, .8h // t47
+ sqrshrn_sz v20, v2, v3, #12, .8h // t48
str q16, [x6, #2*8*0] // t32a
str q17, [x9, #2*8*0] // t39
diff --git a/third_party/dav1d/src/arm/64/looprestoration.S b/third_party/dav1d/src/arm/64/looprestoration.S
index 778448a0f3f02..a598b72b03951 100644
--- a/third_party/dav1d/src/arm/64/looprestoration.S
+++ b/third_party/dav1d/src/arm/64/looprestoration.S
@@ -50,6 +50,7 @@ endconst
// const int16_t filter[2][8],
// const enum LrEdgeFlags edges);
function wiener_filter7_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v0.8h, v1.8h}, [x6]
@@ -121,6 +122,7 @@ L(v1_7):
mov sp, x29
ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
ret
L(no_top_7):
@@ -538,6 +540,7 @@ endfunc
// const int16_t filter[2][8],
// const enum LrEdgeFlags edges);
function wiener_filter5_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v0.8h, v1.8h}, [x6]
@@ -598,6 +601,7 @@ L(end_5):
mov sp, x29
ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
ret
L(no_top_5):
diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S
index fcb4f84e7ef88..8954e604cf559 100644
--- a/third_party/dav1d/src/arm/64/looprestoration16.S
+++ b/third_party/dav1d/src/arm/64/looprestoration16.S
@@ -52,6 +52,7 @@ endconst
// const int bitdepth_max);
function wiener_filter7_16bpc_neon, export=1
ldr w8, [sp]
+ AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp, #-32]!
stp d8, d9, [sp, #16]
mov x29, sp
@@ -137,6 +138,7 @@ L(v1_7):
mov sp, x29
ldp d8, d9, [sp, #16]
ldp x29, x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
ret
L(no_top_7):
@@ -595,6 +597,7 @@ endfunc
// const int bitdepth_max);
function wiener_filter5_16bpc_neon, export=1
ldr w8, [sp]
+ AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp, #-32]!
stp d8, d9, [sp, #16]
mov x29, sp
@@ -669,6 +672,7 @@ L(end_5):
mov sp, x29
ldp d8, d9, [sp, #16]
ldp x29, x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
ret
L(no_top_5):
diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S
index 017c89c9117d7..d1083c6b561b5 100644
--- a/third_party/dav1d/src/arm/asm.S
+++ b/third_party/dav1d/src/arm/asm.S
@@ -34,10 +34,78 @@
#define x18 do_not_use_x18
#define w18 do_not_use_w18
-/* Support macros for the Armv8.5-A Branch Target Identification feature which
- * requires emitting a .note.gnu.property section with the appropriate
- * architecture-dependent feature bits set.
- * Read more: "ELF for the Arm® 64-bit Architecture"
+/* Support macros for
+ * - Armv8.3-A Pointer Authentication and
+ * - Armv8.5-A Branch Target Identification
+ * features which require emitting a .note.gnu.property section with the
+ * appropriate architecture-dependent feature bits set.
+ *
+ * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
+ * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
+ * used immediately before saving the LR register (x30) to the stack.
+ * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
+ * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
+ * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
+ * have the same value at the two points. For example:
+ *
+ * .global f
+ * f:
+ * AARCH64_SIGN_LINK_REGISTER
+ * stp x29, x30, [sp, #-96]!
+ * mov x29, sp
+ * ...
+ * ldp x29, x30, [sp], #96
+ * AARCH64_VALIDATE_LINK_REGISTER
+ * ret
+ *
+ * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
+ * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
+ * indirect call target. In particular, all symbols exported from a file must
+ * begin with one of these macros. For example, a leaf function that does not
+ * save LR can instead use |AARCH64_VALID_CALL_TARGET|:
+ *
+ * .globl return_zero
+ * return_zero:
+ * AARCH64_VALID_CALL_TARGET
+ * mov x0, #0
+ * ret
+ *
+ * A non-leaf function which does not immediately save LR may need both macros
+ * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
+ * may jump to an alternate implementation before setting up the stack:
+ *
+ * .globl with_early_jump
+ * with_early_jump:
+ * AARCH64_VALID_CALL_TARGET
+ * cmp x0, #128
+ * b.lt .Lwith_early_jump_128
+ * AARCH64_SIGN_LINK_REGISTER
+ * stp x29, x30, [sp, #-96]!
+ * mov x29, sp
+ * ...
+ * ldp x29, x30, [sp], #96
+ * AARCH64_VALIDATE_LINK_REGISTER
+ * ret
+ *
+ * .Lwith_early_jump_128:
+ * ...
+ * ret
+ *
+ * These annotations are only required with indirect calls. Private symbols that
+ * are only the target of direct calls do not require annotations. Also note
+ * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
+ * indirect jumps (BR). Indirect jumps in assembly are supported through
+ * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and
+ * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
+ *
+ * Although not necessary, it is safe to use these macros in 32-bit ARM
+ * assembly. This may be used to simplify dual 32-bit and 64-bit files.
+ *
+ * References:
+ * - "ELF for the Arm® 64-bit Architecture"
+ * https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
+ * - "Providing protection for complex software"
+ * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
*/
#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification
@@ -51,7 +119,32 @@
#define AARCH64_VALID_JUMP_TARGET
#endif
-#if (GNU_PROPERTY_AARCH64_BTI != 0)
+#if defined(__ARM_FEATURE_PAC_DEFAULT)
+
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A
+#define AARCH64_SIGN_LINK_REGISTER paciasp
+#define AARCH64_VALIDATE_LINK_REGISTER autiasp
+#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+#else
+#error Pointer authentication defines no valid key!
+#endif
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions
+#error Authentication of leaf functions is enabled but not supported in dav1d!
+#endif
+#define GNU_PROPERTY_AARCH64_PAC (1 << 1)
+
+#else /* __ARM_FEATURE_PAC_DEFAULT */
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER
+#define AARCH64_VALIDATE_LINK_REGISTER
+
+#endif /* !__ARM_FEATURE_PAC_DEFAULT */
+
+
+#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
.pushsection .note.gnu.property, "a"
.balign 8
.long 4
@@ -60,11 +153,11 @@
.asciz "GNU"
.long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
.long 4
- .long GNU_PROPERTY_AARCH64_BTI
+ .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
.long 0
.popsection
-#endif
-#endif
+#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */
+#endif /* ARCH_AARCH64 */
#if ARCH_ARM
.syntax unified
@@ -74,7 +167,7 @@
.eabi_attribute 10, 0 // suppress Tag_FP_arch
.eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
.section .note.GNU-stack,"",%progbits // Mark stack as non-executable
-#endif
+#endif /* __ELF__ */
#ifdef _WIN32
#define CONFIG_THUMB 1
@@ -89,8 +182,8 @@
#else
#define A
#define T @
-#endif
-#endif
+#endif /* CONFIG_THUMB */
+#endif /* ARCH_ARM */
#if !defined(PIC)
#if defined(__PIC__)
diff --git a/third_party/dav1d/src/arm/film_grain_init_tmpl.c b/third_party/dav1d/src/arm/filmgrain_init_tmpl.c
similarity index 99%
rename from third_party/dav1d/src/arm/film_grain_init_tmpl.c
rename to third_party/dav1d/src/arm/filmgrain_init_tmpl.c
index 3a416020532f5..2156047d02ad6 100644
--- a/third_party/dav1d/src/arm/film_grain_init_tmpl.c
+++ b/third_party/dav1d/src/arm/filmgrain_init_tmpl.c
@@ -28,7 +28,7 @@
*/
#include "src/cpu.h"
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
#include "asm-offsets.h"
CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED);
diff --git a/third_party/dav1d/src/data.c b/third_party/dav1d/src/data.c
index 4c7bf82715c48..fa6165ec72179 100644
--- a/third_party/dav1d/src/data.c
+++ b/third_party/dav1d/src/data.c
@@ -116,11 +116,17 @@ void dav1d_data_props_copy(Dav1dDataProps *const dst,
void dav1d_data_props_set_defaults(Dav1dDataProps *const props) {
assert(props != NULL);
+ memset(props, 0, sizeof(*props));
props->timestamp = INT64_MIN;
- props->duration = 0;
props->offset = -1;
- props->user_data.data = NULL;
- props->user_data.ref = NULL;
+}
+
+void dav1d_data_props_unref_internal(Dav1dDataProps *const props) {
+ validate_input(props != NULL);
+
+ struct Dav1dRef *user_data_ref = props->user_data.ref;
+ dav1d_data_props_set_defaults(props);
+ dav1d_ref_dec(&user_data_ref);
}
void dav1d_data_unref_internal(Dav1dData *const buf) {
@@ -132,5 +138,6 @@ void dav1d_data_unref_internal(Dav1dData *const buf) {
dav1d_ref_dec(&buf->ref);
}
memset(buf, 0, sizeof(*buf));
+ dav1d_data_props_set_defaults(&buf->m);
dav1d_ref_dec(&user_data_ref);
}
diff --git a/third_party/dav1d/src/data.h b/third_party/dav1d/src/data.h
index 5b07021c532ef..b34c1db702ca3 100644
--- a/third_party/dav1d/src/data.h
+++ b/third_party/dav1d/src/data.h
@@ -51,5 +51,6 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *buf,
void *cookie),
void *cookie);
void dav1d_data_unref_internal(Dav1dData *buf);
+void dav1d_data_props_unref_internal(Dav1dDataProps *props);
#endif /* DAV1D_SRC_DATA_H */
diff --git a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c
index cec91e9fb3009..bd13014947e3b 100644
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@@ -42,7 +42,7 @@
#include "src/decode.h"
#include "src/dequant_tables.h"
#include "src/env.h"
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
#include "src/log.h"
#include "src/qm.h"
#include "src/recon.h"
@@ -1242,6 +1242,7 @@ static int decode_b(Dav1dTaskContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
+ b->uv_angle = 0;
if (b->uv_mode == CFL_PRED) {
#define SIGN(a) (!!(a) + ((a) > 0))
const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
@@ -1274,8 +1275,6 @@ static int decode_b(Dav1dTaskContext *const t,
uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
b->uv_angle = angle - 3;
- } else {
- b->uv_angle = 0;
}
}
@@ -3231,8 +3230,6 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
if (ret < 0) goto error;
}
- retval = DAV1D_ERR(EINVAL);
-
// setup dequant tables
init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
if (f->frame_hdr->quant.qm)
@@ -3356,7 +3353,7 @@ error:
int dav1d_decode_frame_main(Dav1dFrameContext *const f) {
const Dav1dContext *const c = f->c;
- int retval = DAV1D_ERR(ENOMEM);
+ int retval = DAV1D_ERR(EINVAL);
assert(f->c->n_tc == 1);
@@ -3500,7 +3497,13 @@ int dav1d_submit_frame(Dav1dContext *const c) {
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
- if (out_delayed->p.data[0]) {
+ const int error = f->task_thread.retval;
+ if (error) {
+ f->task_thread.retval = 0;
+ c->cached_error = error;
+ dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+ dav1d_thread_picture_unref(out_delayed);
+ } else if (out_delayed->p.data[0]) {
const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
memory_order_relaxed);
if ((out_delayed->visible || c->output_invisible_frames) &&
@@ -3842,7 +3845,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
dav1d_ref_dec(&c->refs[i].refmvs);
}
}
- return res;
+ goto error;
}
} else {
dav1d_task_frame_init(f);
@@ -3869,6 +3872,7 @@ error:
dav1d_ref_dec(&f->mvs_ref);
dav1d_ref_dec(&f->seq_hdr_ref);
dav1d_ref_dec(&f->frame_hdr_ref);
+ dav1d_data_props_copy(&c->cached_error_props, &c->in.m);
for (int i = 0; i < f->n_tile_data; i++)
dav1d_data_unref_internal(&f->tile[i].data);
diff --git a/third_party/dav1d/src/ext/x86/x86inc.asm b/third_party/dav1d/src/ext/x86/x86inc.asm
index c9f2d6398cde1..68b1f74f4bb02 100644
--- a/third_party/dav1d/src/ext/x86/x86inc.asm
+++ b/third_party/dav1d/src/ext/x86/x86inc.asm
@@ -1,7 +1,7 @@
;*****************************************************************************
;* x86inc.asm: x86 abstraction layer
;*****************************************************************************
-;* Copyright (C) 2005-2021 x264 project
+;* Copyright (C) 2005-2022 x264 project
;*
;* Authors: Loren Merritt <lorenm at u.washington.edu>
;* Henrik Gramner <henrik at gramner.com>
@@ -238,6 +238,16 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%endif
%endmacro
+; Repeats an instruction/operation for multiple arguments.
+; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3"
+%macro REPX 2-* ; operation, args
+ %xdefine %%f(x) %1
+ %rep %0 - 1
+ %rotate 1
+ %%f(%1)
+ %endrep
+%endmacro
+
%macro PUSH 1
push %1
%ifidn rstk, rsp
@@ -1342,7 +1352,20 @@ INIT_XMM
%1 %6, __src2
%endif
%elif %0 >= 9
- __instr %6, %7, %8, %9
+ %if avx_enabled && __sizeofreg >= 16 && %4 == 1
+ %ifnnum regnumof%7
+ %if %3
+ vmovaps %6, %7
+ %else
+ vmovdqa %6, %7
+ %endif
+ __instr %6, %6, %8, %9
+ %else
+ __instr %6, %7, %8, %9
+ %endif
+ %else
+ __instr %6, %7, %8, %9
+ %endif
%elif %0 == 8
%if avx_enabled && __sizeofreg >= 16 && %4 == 0
%xdefine __src1 %7
@@ -1379,7 +1402,7 @@ INIT_XMM
%else
vmovdqa %6, %7
%endif
- __instr %6, %8
+ __instr %6, %6, %8
%else
__instr %6, __src1, __src2
%endif
@@ -1448,8 +1471,8 @@ AVX_INSTR andpd, sse2, 1, 0, 1
AVX_INSTR andps, sse, 1, 0, 1
AVX_INSTR blendpd, sse4, 1, 1, 0
AVX_INSTR blendps, sse4, 1, 1, 0
-AVX_INSTR blendvpd, sse4 ; can't be emulated
-AVX_INSTR blendvps, sse4 ; can't be emulated
+AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
+AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR cmpeqpd, sse2, 1, 0, 1
AVX_INSTR cmpeqps, sse, 1, 0, 1
AVX_INSTR cmpeqsd, sse2, 1, 0, 0
@@ -1582,7 +1605,7 @@ AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
-AVX_INSTR pblendvb, sse4 ; can't be emulated
+AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR pblendw, sse4, 0, 1, 0
AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
diff --git a/third_party/dav1d/src/fg_apply.h b/third_party/dav1d/src/fg_apply.h
index 779549bb2d020..be6685d8018bc 100644
--- a/third_party/dav1d/src/fg_apply.h
+++ b/third_party/dav1d/src/fg_apply.h
@@ -32,10 +32,27 @@
#include "common/bitdepth.h"
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
-bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp,
- Dav1dPicture *const out,
- const Dav1dPicture *const in);
+#ifdef BITDEPTH
+# define array_decl(type, name, sz) type name sz
+#else
+# define array_decl(type, name, sz) void *name
+#endif
+
+bitfn_decls(void dav1d_apply_grain,
+ const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out, const Dav1dPicture *const in);
+bitfn_decls(void dav1d_prep_grain,
+ const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out, const Dav1dPicture *const in,
+ array_decl(uint8_t, scaling, [3][SCALING_SIZE]),
+ array_decl(entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]));
+bitfn_decls(void dav1d_apply_grain_row,
+ const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out, const Dav1dPicture *const in,
+ array_decl(const uint8_t, scaling, [3][SCALING_SIZE]),
+ array_decl(const entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]),
+ const int row);
#endif /* DAV1D_SRC_FG_APPLY_H */
diff --git a/third_party/dav1d/src/fg_apply_tmpl.c b/third_party/dav1d/src/fg_apply_tmpl.c
index c254a3dffa4f8..ee14db9a4cea6 100644
--- a/third_party/dav1d/src/fg_apply_tmpl.c
+++ b/third_party/dav1d/src/fg_apply_tmpl.c
@@ -30,13 +30,13 @@
#include <stdint.h>
+#include "dav1d/common.h"
#include "dav1d/picture.h"
-#include "common.h"
#include "common/intops.h"
#include "common/bitdepth.h"
-#include "fg_apply.h"
+#include "src/fg_apply.h"
static void generate_scaling(const int bitdepth,
const uint8_t points[][2], const int num,
@@ -44,14 +44,15 @@ static void generate_scaling(const int bitdepth,
{
#if BITDEPTH == 8
const int shift_x = 0;
+ const int scaling_size = SCALING_SIZE;
#else
+ assert(bitdepth > 8);
const int shift_x = bitdepth - 8;
-#endif
const int scaling_size = 1 << bitdepth;
+#endif
// Fill up the preceding entries with the initial value
- for (int i = 0; i < points[0][0] << shift_x; i++)
- scaling[i] = points[0][1];
+ memset(scaling, points[0][1], points[0][0] << shift_x);
// Linearly interpolate the values in the middle
for (int i = 0; i < num - 1; i++) {
@@ -61,16 +62,17 @@ static void generate_scaling(const int bitdepth,
const int ey = points[i+1][1];
const int dx = ex - bx;
const int dy = ey - by;
+ assert(dx > 0);
const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
- for (int x = 0; x < dx; x++) {
- const int v = by + ((x * delta + 0x8000) >> 16);
- scaling[(bx + x) << shift_x] = v;
+ for (int x = 0, d = 0x8000; x < dx; x++) {
+ scaling[(bx + x) << shift_x] = by + (d >> 16);
+ d += delta;
}
}
// Fill up the remaining entries with the final value
- for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
- scaling[i] = points[num - 1][1];
+ const int n = points[num - 1][0] << shift_x;
+ memset(&scaling[n], points[num - 1][1], scaling_size - n);
#if BITDEPTH != 8
const int pad = 1 << shift_x, rnd = pad >> 1;
@@ -80,8 +82,9 @@ static void generate_scaling(const int bitdepth,
const int dx = ex - bx;
for (int x = 0; x < dx; x += pad) {
const int range = scaling[bx + x + pad] - scaling[bx + x];
- for (int n = 1; n < pad; n++) {
- scaling[bx + x + n] = scaling[bx + x] + ((range * n + rnd) >> shift_x);
+ for (int n = 1, r = rnd; n < pad; n++) {
+ r += range;
+ scaling[bx + x + n] = scaling[bx + x] + (r >> shift_x);
}
}
}
@@ -89,14 +92,13 @@ static void generate_scaling(const int bitdepth,
}
#ifndef UNIT_TEST
-void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
- Dav1dPicture *const out,
- const Dav1dPicture *const in)
+void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out,
+ const Dav1dPicture *const in,
+ uint8_t scaling[3][SCALING_SIZE],
+ entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH])
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
-
- ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]);
- uint8_t scaling[3][SCALING_SIZE];
#if BITDEPTH != 8
const int bitdepth_max = (1 << out->p.bpc) - 1;
#endif
@@ -150,60 +152,86 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
memcpy(out->data[2], in->data[2], sz);
}
}
+}
+void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out,
+ const Dav1dPicture *const in,
+ const uint8_t scaling[3][SCALING_SIZE],
+ const entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH],
+ const int row)
+{
// Synthesize grain for the affected planes
- const int rows = (out->p.h + 31) >> 5;
+ const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int cpw = (out->p.w + ss_x) >> ss_x;
const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
- for (int row = 0; row < rows; row++) {
- pixel *const luma_src =
- ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
-
- if (data->num_y_points) {
- const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
- dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
- luma_src, out->stride[0], data,
- out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
- }
+ pixel *const luma_src =
+ ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
+#if BITDEPTH != 8
+ const int bitdepth_max = (1 << out->p.bpc) - 1;
+#endif
- if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
- !data->chroma_scaling_from_luma)
- {
- continue;
- }
+ if (data->num_y_points) {
+ const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
+ dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
+ luma_src, out->stride[0], data,
+ out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
+ }
- const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
+ if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
+ !data->chroma_scaling_from_luma)
+ {
+ return;
+ }
- // extend padding pixels
- if (out->p.w & ss_x) {
- pixel *ptr = luma_src;
- for (int y = 0; y < bh; y++) {
- ptr[out->p.w] = ptr[out->p.w - 1];
- ptr += PXSTRIDE(in->stride[0]) << ss_y;
- }
+ const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
+
+ // extend padding pixels
+ if (out->p.w & ss_x) {
+ pixel *ptr = luma_src;
+ for (int y = 0; y < bh; y++) {
+ ptr[out->p.w] = ptr[out->p.w - 1];
+ ptr += PXSTRIDE(in->stride[0]) << ss_y;
}
+ }
- const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
- if (data->chroma_scaling_from_luma) {
- for (int pl = 0; pl < 2; pl++)
+ const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
+ if (data->chroma_scaling_from_luma) {
+ for (int pl = 0; pl < 2; pl++)
+ dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
+ ((const pixel *) in->data[1 + pl]) + uv_off,
+ in->stride[1], data, cpw,
+ scaling[0], grain_lut[1 + pl],
+ bh, row, luma_src, in->stride[0],
+ pl, is_id HIGHBD_TAIL_SUFFIX);
+ } else {
+ for (int pl = 0; pl < 2; pl++)
+ if (data->num_uv_points[pl])
dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
((const pixel *) in->data[1 + pl]) + uv_off,
in->stride[1], data, cpw,
- scaling[0], grain_lut[1 + pl],
+ scaling[1 + pl], grain_lut[1 + pl],
bh, row, luma_src, in->stride[0],
pl, is_id HIGHBD_TAIL_SUFFIX);
- } else {
- for (int pl = 0; pl < 2; pl++)
- if (data->num_uv_points[pl])
- dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
- ((const pixel *) in->data[1 + pl]) + uv_off,
- in->stride[1], data, cpw,
- scaling[1 + pl], grain_lut[1 + pl],
- bh, row, luma_src, in->stride[0],
- pl, is_id HIGHBD_TAIL_SUFFIX);
- }
}
}
+
+void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out,
+ const Dav1dPicture *const in)
+{
+ ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]);
+#if ARCH_X86_64 && BITDEPTH == 8
+ ALIGN_STK_64(uint8_t, scaling, 3,[SCALING_SIZE]);
+#else
+ uint8_t scaling[3][SCALING_SIZE];
+#endif
+ const int rows = (out->p.h + 31) >> 5;
+
+ bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut);
+ for (int row = 0; row < rows; row++)
+ bitfn(dav1d_apply_grain_row)(dsp, out, in, scaling, grain_lut, row);
+}
#endif
diff --git a/third_party/dav1d/src/filmgrain.h b/third_party/dav1d/src/filmgrain.h
new file mode 100644
index 0000000000000..d953542a82a73
--- /dev/null
+++ b/third_party/dav1d/src/filmgrain.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_FILM_GRAIN_H
+#define DAV1D_SRC_FILM_GRAIN_H
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+#define GRAIN_WIDTH 82 /* row length of every grain LUT (entry buf[][GRAIN_WIDTH]) */
+#define GRAIN_HEIGHT 73 /* number of rows filled by the luma grain generator */
+#define BLOCK_SIZE 32 /* edge length, in luma pixels, of the blocks grain is applied in */
+#if !defined(BITDEPTH) || BITDEPTH == 8
+#define SCALING_SIZE 256 /* length of one scaling[] table when BITDEPTH is 8 or undefined */
+typedef int8_t entry; /* grain LUT element type when BITDEPTH is 8 or undefined */
+#else
+#define SCALING_SIZE 4096 /* length of one scaling[] table for the other bit depths */
+typedef int16_t entry; /* grain LUT element type for the other bit depths */
+#endif
+
+#define decl_generate_grain_y_fn(name) \
+void (name)(entry buf[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX)
+typedef decl_generate_grain_y_fn(*generate_grain_y_fn); // pointer to a luma grain generator
+
+#define decl_generate_grain_uv_fn(name) \
+void (name)(entry buf[][GRAIN_WIDTH], \
+ const entry buf_y[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX)
+typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn); // pointer to a chroma grain generator; buf_y is the luma grain it reads
+
+#define decl_fgy_32x32xn_fn(name) \
+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
+ const Dav1dFilmGrainData *data, \
+ size_t pw, const uint8_t scaling[SCALING_SIZE], \
+ const entry grain_lut[][GRAIN_WIDTH], \
+ int bh, int row_num HIGHBD_DECL_SUFFIX)
+typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn); // pointer to a luma grain application routine (note: pw is size_t here)
+
+#define decl_fguv_32x32xn_fn(name) \
+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
+ const Dav1dFilmGrainData *data, int pw, \
+ const uint8_t scaling[SCALING_SIZE], \
+ const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
+ const pixel *luma_row, ptrdiff_t luma_stride, \
+ int uv_pl, int is_id HIGHBD_DECL_SUFFIX)
+typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn); // pointer to a chroma grain application routine (note: pw is int here)
+
+typedef struct Dav1dFilmGrainDSPContext { // function table for film grain generation and application
+ generate_grain_y_fn generate_grain_y;
+ generate_grain_uv_fn generate_grain_uv[3]; // indexed by pixel layout - 1 (I420, I422, I444)
+
+ fgy_32x32xn_fn fgy_32x32xn;
+ fguv_32x32xn_fn fguv_32x32xn[3]; // indexed by pixel layout - 1 (I420, I422, I444)
+} Dav1dFilmGrainDSPContext;
+
+bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c); // installs the C routines, then calls an arch-specific init when built with asm
+bitfn_decls(void dav1d_film_grain_dsp_init_arm, Dav1dFilmGrainDSPContext *c);
+bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c);
+
+#endif /* DAV1D_SRC_FILM_GRAIN_H */
diff --git a/third_party/dav1d/src/filmgrain_tmpl.c b/third_party/dav1d/src/filmgrain_tmpl.c
new file mode 100644
index 0000000000000..883c5cbb7b9a8
--- /dev/null
+++ b/third_party/dav1d/src/filmgrain_tmpl.c
@@ -0,0 +1,433 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/filmgrain.h"
+#include "src/tables.h"
+
+#define SUB_GRAIN_WIDTH 44 /* chroma grain width used when subx is set */
+#define SUB_GRAIN_HEIGHT 38 /* chroma grain height used when suby is set */
+
+static inline int get_random_number(const int bits, unsigned *const state) { // advances the shift-register state in *state and returns `bits` bits of it
+ const int r = *state;
+ unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; // feedback bit: XOR of state bits 0, 1, 3 and 12
+ *state = (r >> 1) | (bit << 15); // shift right by one and insert the feedback bit at bit 15
+
+ return (*state >> (16 - bits)) & ((1 << bits) - 1); // bits (16 - bits)..15 of the updated state
+}
+
+static inline int round2(const int x, const uint64_t shift) { // right shift with rounding
+ return (x + ((1 << shift) >> 1)) >> shift; // adds half of (1 << shift) first; the bias is 0 when shift is 0
+}
+
+static void generate_grain_y_c(entry buf[][GRAIN_WIDTH], // fills buf with luma grain: pseudo-random fill followed by a filter pass
+ const Dav1dFilmGrainData *const data
+ HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ unsigned seed = data->seed;
+ const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; // down-scaling applied to each table value
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; // clip range for filtered grain values
+
+ for (int y = 0; y < GRAIN_HEIGHT; y++) { // pass 1: every cell gets a scaled entry of dav1d_gaussian_sequence
+ for (int x = 0; x < GRAIN_WIDTH; x++) {
+ const int value = get_random_number(11, &seed); // 11-bit table index
+ buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
+ }
+ }
+
+ const int ar_pad = 3; // border rows/columns that pass 2 leaves unfiltered
+ const int ar_lag = data->ar_coeff_lag;
+
+ for (int y = ar_pad; y < GRAIN_HEIGHT; y++) { // pass 2: add a weighted sum of already-filtered neighbours
+ for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
+ const int8_t *coeff = data->ar_coeffs_y;
+ int sum = 0;
+ for (int dy = -ar_lag; dy <= 0; dy++) {
+ for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+ if (!dx && !dy) // reached the current sample: only rows above and samples to its left contribute
+ break;
+ sum += *(coeff++) * buf[y + dy][x + dx];
+ }
+ }
+
+ const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+ buf[y][x] = iclip(grain, grain_min, grain_max);
+ }
+ }
+}
+
+static NOINLINE void
+generate_grain_uv_c(entry buf[][GRAIN_WIDTH], // fills buf with chroma grain for plane uv; subx/suby select the subsampled size
+ const entry buf_y[][GRAIN_WIDTH],
+ const Dav1dFilmGrainData *const data, const intptr_t uv,
+ const int subx, const int suby HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524); // per-plane seed: XOR constant chosen by uv
+ const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; // clip range for filtered grain values
+
+ const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH;
+ const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
+
+ for (int y = 0; y < chromaH; y++) { // pass 1: every cell gets a scaled entry of dav1d_gaussian_sequence
+ for (int x = 0; x < chromaW; x++) {
+ const int value = get_random_number(11, &seed);
+ buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
+ }
+ }
+
+ const int ar_pad = 3; // border rows/columns that pass 2 leaves unfiltered
+ const int ar_lag = data->ar_coeff_lag;
+
+ for (int y = ar_pad; y < chromaH; y++) { // pass 2: weighted sum of chroma neighbours plus an optional luma term
+ for (int x = ar_pad; x < chromaW - ar_pad; x++) {
+ const int8_t *coeff = data->ar_coeffs_uv[uv];
+ int sum = 0;
+ for (int dy = -ar_lag; dy <= 0; dy++) {
+ for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+ // For the final (current) pixel, we need to add in the
+ // contribution from the luma grain texture
+ if (!dx && !dy) {
+ if (!data->num_y_points) // no luma term when num_y_points is 0
+ break;
+ int luma = 0;
+ const int lumaX = ((x - ar_pad) << subx) + ar_pad; // first luma grain column covered by this chroma sample
+ const int lumaY = ((y - ar_pad) << suby) + ar_pad; // first luma grain row covered by this chroma sample
+ for (int i = 0; i <= suby; i++) {
+ for (int j = 0; j <= subx; j++) {
+ luma += buf_y[lumaY + i][lumaX + j];
+ }
+ }
+ luma = round2(luma, subx + suby); // rounded average of the 1, 2 or 4 luma grain values summed above
+ sum += luma * (*coeff); // the coefficient after the neighbour weights scales the luma term
+ break;
+ }
+
+ sum += *(coeff++) * buf[y + dy][x + dx];
+ }
+ }
+
+ const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+ buf[y][x] = iclip(grain, grain_min, grain_max);
+ }
+ }
+}
+
+#define gnuv_ss_fn(nm, ss_x, ss_y) \
+static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \
+ generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \
+}
+
+gnuv_ss_fn(420, 1, 1); // generate_grain_uv_420_c: subx = 1, suby = 1
+gnuv_ss_fn(422, 1, 0); // generate_grain_uv_422_c: subx = 1, suby = 0
+gnuv_ss_fn(444, 0, 0); // generate_grain_uv_444_c: subx = 0, suby = 0
+
+// samples from the correct block of a grain LUT, while taking into account the
+// offsets provided by the offsets cache
+static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
+ const int offsets[2][2], const int subx, const int suby,
+ const int bx, const int by, const int x, const int y) // bx/by: 0 = current block, 1 = previous column/row block
+{
+ const int randval = offsets[bx][by]; // random value cached for the selected block
+ const int offx = 3 + (2 >> subx) * (3 + (randval >> 4)); // bits 4 and up pick the horizontal LUT offset
+ const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF)); // low 4 bits pick the vertical LUT offset
+ return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by] // previous blocks are read one block size further into the LUT
+ [offx + x + (BLOCK_SIZE >> subx) * bx];
+}
+
+static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row, // adds luma grain to one row of blocks, pw pixels wide and bh lines high
+ const ptrdiff_t stride,
+ const Dav1dFilmGrainData *const data, const size_t pw,
+ const uint8_t scaling[SCALING_SIZE],
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int bh, const int row_num HIGHBD_DECL_SUFFIX)
+{
+ const int rows = 1 + (data->overlap_flag && row_num > 0); // 2 when the previous block row is blended in, else 1
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; // clip range for blended grain values
+
+ int min_value, max_value; // clip range for output pixels
+ if (data->clip_to_restricted_range) {
+ min_value = 16 << bitdepth_min_8;
+ max_value = 235 << bitdepth_min_8;
+ } else {
+ min_value = 0;
+ max_value = BITDEPTH_MAX;
+ }
+
+ // seed[0] contains the current row, seed[1] contains the previous
+ unsigned seed[2];
+ for (int i = 0; i < rows; i++) {
+ seed[i] = data->seed;
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; // mix the row number into the high byte
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); // mix the row number into the low byte
+ }
+
+ assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+
+ int offsets[2 /* col offset */][2 /* row offset */];
+
+ // process this row in BLOCK_SIZE^2 blocks
+ for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
+ const int bw = imin(BLOCK_SIZE, (int) pw - bx); // width of this block; the last one may be narrower
+
+ if (data->overlap_flag && bx) {
+ // shift previous offsets left
+ for (int i = 0; i < rows; i++)
+ offsets[1][i] = offsets[0][i];
+ }
+
+ // update current offsets
+ for (int i = 0; i < rows; i++)
+ offsets[0][i] = get_random_number(8, &seed[i]);
+
+ // x/y block offsets to compensate for overlapped regions
+ const int ystart = data->overlap_flag && row_num ? imin(2, bh) : 0;
+ const int xstart = data->overlap_flag && bx ? imin(2, bw) : 0;
+
+ static const int w[2][2] = { { 27, 17 }, { 17, 27 } }; // blend weights per overlap position: { old, new }
+
+#define add_noise_y(x, y, grain) \
+ const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \
+ pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \
+ const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
+ *dst = iclip(*src + noise, min_value, max_value);
+
+ for (int y = ystart; y < bh; y++) {
+ // Non-overlapped image region (straightforward)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ add_noise_y(x, y, grain);
+ }
+
+ // Special case for overlapped column
+ for (int x = 0; x < xstart; x++) {
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y); // grain of the block to the left
+ grain = round2(old * w[x][0] + grain * w[x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_y(x, y, grain);
+ }
+ }
+
+ for (int y = 0; y < ystart; y++) {
+ // Special case for overlapped row (sans corner)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y); // grain of the block above
+ grain = round2(old * w[y][0] + grain * w[y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_y(x, y, grain);
+ }
+
+ // Special case for doubly-overlapped corner
+ for (int x = 0; x < xstart; x++) {
+ // Blend the top pixel with the top left block
+ int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
+ int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y);
+ top = round2(old * w[x][0] + top * w[x][1], 5);
+ top = iclip(top, grain_min, grain_max);
+
+ // Blend the current pixel with the left block
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
+ grain = round2(old * w[x][0] + grain * w[x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+
+ // Mix the two rows together and apply grain
+ grain = round2(top * w[y][0] + grain * w[y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_y(x, y, grain);
+ }
+ }
+ }
+}
+
+static NOINLINE void
+fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row, // adds chroma grain to one row of blocks; the scaling index is derived from luma
+ const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
+ const int pw, const uint8_t scaling[SCALING_SIZE],
+ const entry grain_lut[][GRAIN_WIDTH], const int bh,
+ const int row_num, const pixel *const luma_row,
+ const ptrdiff_t luma_stride, const int uv, const int is_id,
+ const int sx, const int sy HIGHBD_DECL_SUFFIX) // sx/sy: horizontal/vertical subsampling shifts
+{
+ const int rows = 1 + (data->overlap_flag && row_num > 0); // 2 when the previous block row is blended in, else 1
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; // clip range for blended grain values
+
+ int min_value, max_value; // clip range for output pixels
+ if (data->clip_to_restricted_range) {
+ min_value = 16 << bitdepth_min_8;
+ max_value = (is_id ? 235 : 240) << bitdepth_min_8; // upper limit is 235 when is_id is set, 240 otherwise
+ } else {
+ min_value = 0;
+ max_value = BITDEPTH_MAX;
+ }
+
+ // seed[0] contains the current row, seed[1] contains the previous
+ unsigned seed[2];
+ for (int i = 0; i < rows; i++) {
+ seed[i] = data->seed;
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; // mix the row number into the high byte
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); // mix the row number into the low byte
+ }
+
+ assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+
+ int offsets[2 /* col offset */][2 /* row offset */];
+
+ // process this row in BLOCK_SIZE^2 blocks (subsampled)
+ for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
+ const int bw = imin(BLOCK_SIZE >> sx, pw - bx); // width of this block; the last one may be narrower
+ if (data->overlap_flag && bx) {
+ // shift previous offsets left
+ for (int i = 0; i < rows; i++)
+ offsets[1][i] = offsets[0][i];
+ }
+
+ // update current offsets
+ for (int i = 0; i < rows; i++)
+ offsets[0][i] = get_random_number(8, &seed[i]);
+
+ // x/y block offsets to compensate for overlapped regions
+ const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0;
+ const int xstart = data->overlap_flag && bx ? imin(2 >> sx, bw) : 0;
+
+ static const int w[2 /* sub */][2 /* off */][2] = { // { old, new } blend weights; row [1] is used when the axis is subsampled
+ { { 27, 17 }, { 17, 27 } },
+ { { 23, 22 } },
+ };
+
+#define add_noise_uv(x, y, grain) \
+ const int lx = (bx + x) << sx; \
+ const int ly = y << sy; \
+ const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \
+ pixel avg = luma[0]; \
+ if (sx) \
+ avg = (avg + luma[1] + 1) >> 1; \
+ const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
+ pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
+ int val = avg; \
+ if (!data->chroma_scaling_from_luma) { \
+ const int combined = avg * data->uv_luma_mult[uv] + \
+ *src * data->uv_mult[uv]; \
+ val = iclip_pixel( (combined >> 6) + \
+ (data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \
+ } \
+ const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
+ *dst = iclip(*src + noise, min_value, max_value);
+
+ for (int y = ystart; y < bh; y++) {
+ // Non-overlapped image region (straightforward)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ add_noise_uv(x, y, grain);
+ }
+
+ // Special case for overlapped column
+ for (int x = 0; x < xstart; x++) {
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); // grain of the block to the left
+ grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_uv(x, y, grain);
+ }
+ }
+
+ for (int y = 0; y < ystart; y++) {
+ // Special case for overlapped row (sans corner)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); // grain of the block above
+ grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_uv(x, y, grain);
+ }
+
+ // Special case for doubly-overlapped corner
+ for (int x = 0; x < xstart; x++) {
+ // Blend the top pixel with the top left block
+ int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
+ int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y);
+ top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5);
+ top = iclip(top, grain_min, grain_max);
+
+ // Blend the current pixel with the left block
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
+ grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+
+ // Mix the two rows together and apply to image
+ grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_uv(x, y, grain);
+ }
+ }
+ }
+}
+
+#define fguv_ss_fn(nm, ss_x, ss_y) \
+static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \
+ fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \
+ row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \
+ HIGHBD_TAIL_SUFFIX); \
+}
+
+fguv_ss_fn(420, 1, 1); // fguv_32x32xn_420_c: sx = 1, sy = 1
+fguv_ss_fn(422, 1, 0); // fguv_32x32xn_422_c: sx = 1, sy = 0
+fguv_ss_fn(444, 0, 0); // fguv_32x32xn_444_c: sx = 0, sy = 0
+
+COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) { // fills the function table with the C implementations from this file
+ c->generate_grain_y = generate_grain_y_c;
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c; // chroma tables are indexed by layout - 1
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;
+
+ c->fgy_32x32xn = fgy_32x32xn_c;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ bitfn(dav1d_film_grain_dsp_init_arm)(c); // NOTE(review): presumably replaces entries with optimized versions - confirm in the arch init
+#elif ARCH_X86
+ bitfn(dav1d_film_grain_dsp_init_x86)(c); // NOTE(review): presumably replaces entries with optimized versions - confirm in the arch init
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/internal.h b/third_party/dav1d/src/internal.h
index 8edb80f21ce3e..eceda98eca4da 100644
--- a/third_party/dav1d/src/internal.h
+++ b/third_party/dav1d/src/internal.h
@@ -43,7 +43,7 @@ typedef struct Dav1dTask Dav1dTask;
#include "src/cdf.h"
#include "src/data.h"
#include "src/env.h"
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
#include "src/intra_edge.h"
#include "src/ipred.h"
#include "src/itx.h"
@@ -73,6 +73,22 @@ struct Dav1dTileGroup {
int start, end;
};
+enum TaskType { // declared ahead of struct Dav1dContext, whose task_thread.delayed_fg member stores an enum TaskType
+ DAV1D_TASK_TYPE_INIT,
+ DAV1D_TASK_TYPE_INIT_CDF,
+ DAV1D_TASK_TYPE_TILE_ENTROPY,
+ DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
+ DAV1D_TASK_TYPE_TILE_RECONSTRUCTION,
+ DAV1D_TASK_TYPE_DEBLOCK_COLS,
+ DAV1D_TASK_TYPE_DEBLOCK_ROWS,
+ DAV1D_TASK_TYPE_CDEF,
+ DAV1D_TASK_TYPE_SUPER_RESOLUTION,
+ DAV1D_TASK_TYPE_LOOP_RESTORATION,
+ DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS,
+ DAV1D_TASK_TYPE_FG_PREP, // NOTE(review): presumably the film grain preparation step - confirm in thread_task.c
+ DAV1D_TASK_TYPE_FG_APPLY, // NOTE(review): presumably the per-row film grain application step - confirm in thread_task.c
+};
+
struct Dav1dContext {
Dav1dFrameContext *fc;
unsigned n_fc;
@@ -123,6 +139,24 @@ struct Dav1dContext {
// See src/thread_task.c:reset_task_cur().
atomic_uint reset_task_cur;
atomic_int cond_signaled;
+ struct {
+ int exec;
+ pthread_cond_t cond;
+ const Dav1dPicture *in;
+ Dav1dPicture *out;
+ enum TaskType type;
+ atomic_int progress[2]; /* [0]=started, [1]=completed */
+ union {
+ struct {
+ ALIGN(int8_t grain_lut_8bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16);
+ ALIGN(uint8_t scaling_8bpc[3][256], 64);
+ };
+ struct {
+ ALIGN(int16_t grain_lut_16bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16);
+ ALIGN(uint8_t scaling_16bpc[3][4096], 64);
+ };
+ };
+ } delayed_fg;
int inited;
} task_thread;
@@ -155,6 +189,7 @@ struct Dav1dContext {
int operating_point;
unsigned operating_point_idc;
int all_layers;
+ int max_spatial_id;
unsigned frame_size_limit;
int strict_std_compliance;
int output_invisible_frames;
@@ -162,26 +197,14 @@ struct Dav1dContext {
int drain;
enum PictureFlags frame_flags;
enum Dav1dEventFlags event_flags;
+ Dav1dDataProps cached_error_props;
+ int cached_error;
Dav1dLogger logger;
Dav1dMemPool *picture_pool;
};
-enum TaskType {
- DAV1D_TASK_TYPE_INIT,
- DAV1D_TASK_TYPE_INIT_CDF,
- DAV1D_TASK_TYPE_TILE_ENTROPY,
- DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
- DAV1D_TASK_TYPE_TILE_RECONSTRUCTION,
- DAV1D_TASK_TYPE_DEBLOCK_COLS,
- DAV1D_TASK_TYPE_DEBLOCK_ROWS,
- DAV1D_TASK_TYPE_CDEF,
- DAV1D_TASK_TYPE_SUPER_RESOLUTION,
- DAV1D_TASK_TYPE_LOOP_RESTORATION,
- DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS,
-};
-
struct Dav1dTask {
unsigned frame_idx; // frame thread id
enum TaskType type; // task work
diff --git a/third_party/dav1d/src/lib.c b/third_party/dav1d/src/lib.c
index 6b50a536786d2..b21a735964f23 100644
--- a/third_party/dav1d/src/lib.c
+++ b/third_party/dav1d/src/lib.c
@@ -120,7 +120,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
pthread_attr_setstacksize(&thread_attr, stack_size);
- Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32);
+ Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 64);
if (!c) goto error;
memset(c, 0, sizeof(*c));
@@ -134,6 +134,8 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
c->output_invisible_frames = s->output_invisible_frames;
c->inloop_filters = s->inloop_filters;
+ dav1d_data_props_set_defaults(&c->cached_error_props);
+
if (dav1d_mem_pool_init(&c->seq_hdr_pool) ||
dav1d_mem_pool_init(&c->frame_hdr_pool) ||
dav1d_mem_pool_init(&c->segmap_pool) ||
@@ -197,6 +199,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
pthread_mutex_destroy(&c->task_thread.lock);
goto error;
}
+ if (pthread_cond_init(&c->task_thread.delayed_fg.cond, NULL)) {
+ pthread_cond_destroy(&c->task_thread.cond);
+ pthread_mutex_destroy(&c->task_thread.lock);
+ goto error;
+ }
c->task_thread.cur = c->n_fc;
atomic_init(&c->task_thread.reset_task_cur, UINT_MAX);
atomic_init(&c->task_thread.cond_signaled, 0);
@@ -317,7 +324,8 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out)
{
int res = 0;
- Dav1dThreadPicture *const in = c->all_layers ? &c->out : &c->cache;
+ Dav1dThreadPicture *const in = (c->all_layers || !c->max_spatial_id)
+ ? &c->out : &c->cache;
if (!c->apply_grain || !has_grain(&in->p)) {
dav1d_picture_move_ref(out, &in->p);
dav1d_thread_picture_unref(in);
@@ -327,18 +335,17 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out)
res = dav1d_apply_grain(c, out, &in->p);
dav1d_thread_picture_unref(in);
end:
- if (!c->all_layers && c->out.p.data[0]) {
+ if (!c->all_layers && c->max_spatial_id && c->out.p.data[0]) {
dav1d_thread_picture_move_ref(in, &c->out);
}
return res;
}
static int output_picture_ready(Dav1dContext *const c, const int drain) {
- if (!c->all_layers) {
+ if (c->cached_error) return 1;
+ if (!c->all_layers && c->max_spatial_id) {
if (c->out.p.data[0] && c->cache.p.data[0]) {
- const unsigned spatial_mask = c->operating_point_idc >> 8;
- const int max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0;
- if (max_spatial_id == c->cache.p.frame_hdr->spatial_id ||
+ if (c->max_spatial_id == c->cache.p.frame_hdr->spatial_id ||
c->out.flags & PICTURE_FLAG_NEW_TEMPORAL_UNIT)
return 1;
dav1d_thread_picture_unref(&c->cache);
@@ -377,6 +384,13 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
if (++c->frame_thread.next == c->n_fc)
c->frame_thread.next = 0;
pthread_mutex_unlock(&c->task_thread.lock);
+ const int error = f->task_thread.retval;
+ if (error) {
+ f->task_thread.retval = 0;
+ dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+ dav1d_thread_picture_unref(out_delayed);
+ return error;
+ }
if (out_delayed->p.data[0]) {
const unsigned progress =
atomic_load_explicit(&out_delayed->progress[1],
@@ -457,6 +471,12 @@ int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
if (res < 0)
return res;
+ if (c->cached_error) {
+ const int res = c->cached_error;
+ c->cached_error = 0;
+ return res;
+ }
+
if (output_picture_ready(c, c->n_fc == 1))
return output_image(c, out);
@@ -479,33 +499,43 @@ int dav1d_apply_grain(Dav1dContext *const c, Dav1dPicture *const out,
}
int res = dav1d_picture_alloc_copy(c, out, in->p.w, in);
- if (res < 0) {
- dav1d_picture_unref_internal(out);
- return res;
- }
+ if (res < 0) goto error;
- switch (out->p.bpc) {
+ if (c->n_tc > 1) {
+ dav1d_task_delayed_fg(c, out, in);
+ } else {
+ switch (out->p.bpc) {
#if CONFIG_8BPC
- case 8:
- dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
- break;
+ case 8:
+ dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
+ break;
#endif
#if CONFIG_16BPC
- case 10:
- case 12:
- dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
- break;
+ case 10:
+ case 12:
+ dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
+ break;
#endif
- default:
- assert(0);
+ default: abort();
+ }
}
return 0;
+
+error:
+ dav1d_picture_unref_internal(out);
+ return res;
}
void dav1d_flush(Dav1dContext *const c) {
dav1d_data_unref_internal(&c->in);
+ if (c->out.p.data[0])
+ dav1d_thread_picture_unref(&c->out);
+ if (c->cache.p.data[0])
+ dav1d_thread_picture_unref(&c->cache);
+
c->drain = 0;
+ c->cached_error = 0;
for (int i = 0; i < 8; i++) {
if (c->refs[i].p.p.data[0])
@@ -525,6 +555,8 @@ void dav1d_flush(Dav1dContext *const c) {
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);
+ dav1d_data_props_unref_internal(&c->cached_error_props);
+
if (c->n_fc == 1 && c->n_tc == 1) return;
atomic_store(c->flush, 1);
@@ -556,6 +588,7 @@ void dav1d_flush(Dav1dContext *const c) {
Dav1dFrameContext *const f = &c->fc[next];
dav1d_decode_frame_exit(f, -1);
f->n_tile_data = 0;
+ f->task_thread.retval = 0;
Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0]) {
dav1d_thread_picture_unref(out_delayed);
@@ -592,6 +625,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
pthread_cond_destroy(&pf->task_thread.td.cond);
pthread_mutex_destroy(&pf->task_thread.td.lock);
}
+ pthread_cond_destroy(&ttd->delayed_fg.cond);
pthread_cond_destroy(&ttd->cond);
pthread_mutex_destroy(&ttd->lock);
}
@@ -631,7 +665,6 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
dav1d_free_aligned(f->lf.lr_line_buf);
}
dav1d_free_aligned(c->fc);
- dav1d_data_unref_internal(&c->in);
if (c->n_fc > 1 && c->frame_thread.out_delayed) {
for (unsigned n = 0; n < c->n_fc; n++)
if (c->frame_thread.out_delayed[n].p.data[0])
@@ -674,6 +707,17 @@ int dav1d_get_event_flags(Dav1dContext *const c, enum Dav1dEventFlags *const fla
return 0;
}
+int dav1d_get_decode_error_data_props(Dav1dContext *const c, Dav1dDataProps *const out) { // copies the cached error props into *out and resets the cached copy
+ validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL)); // NOTE(review): presumably returns the given error when the check fails - confirm the macro
+ validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+
+ dav1d_data_props_unref_internal(out); // release whatever *out held before it is overwritten
+ *out = c->cached_error_props; // plain struct copy
+ dav1d_data_props_set_defaults(&c->cached_error_props); // the context copy goes back to defaults, so a second call yields default props
+
+ return 0;
+}
+
void dav1d_picture_unref(Dav1dPicture *const p) {
dav1d_picture_unref_internal(p);
}
@@ -706,3 +750,7 @@ int dav1d_data_wrap_user_data(Dav1dData *const buf,
void dav1d_data_unref(Dav1dData *const buf) {
dav1d_data_unref_internal(buf);
}
+
+void dav1d_data_props_unref(Dav1dDataProps *const props) { // thin wrapper around the _internal implementation
+ dav1d_data_props_unref_internal(props);
+}
diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build
index 9095f0ba97a94..b06aee6d70d84 100644
--- a/third_party/dav1d/src/meson.build
+++ b/third_party/dav1d/src/meson.build
@@ -58,7 +58,7 @@ libdav1d_tmpl_sources = files(
'cdef_apply_tmpl.c',
'cdef_tmpl.c',
'fg_apply_tmpl.c',
- 'film_grain_tmpl.c',
+ 'filmgrain_tmpl.c',
'ipred_prepare_tmpl.c',
'ipred_tmpl.c',
'itx_tmpl.c',
@@ -96,7 +96,7 @@ if is_asm_enabled
)
libdav1d_tmpl_sources += files(
'arm/cdef_init_tmpl.c',
- 'arm/film_grain_init_tmpl.c',
+ 'arm/filmgrain_init_tmpl.c',
'arm/ipred_init_tmpl.c',
'arm/itx_init_tmpl.c',
'arm/loopfilter_init_tmpl.c',
@@ -116,7 +116,7 @@ if is_asm_enabled
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'arm/64/cdef.S',
- 'arm/64/film_grain.S',
+ 'arm/64/filmgrain.S',
'arm/64/ipred.S',
'arm/64/loopfilter.S',
'arm/64/looprestoration.S',
@@ -127,7 +127,7 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
'arm/64/cdef16.S',
- 'arm/64/film_grain16.S',
+ 'arm/64/filmgrain16.S',
'arm/64/ipred16.S',
'arm/64/itx16.S',
'arm/64/loopfilter16.S',
@@ -147,7 +147,7 @@ if is_asm_enabled
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'arm/32/cdef.S',
- 'arm/32/film_grain.S',
+ 'arm/32/filmgrain.S',
'arm/32/ipred.S',
'arm/32/loopfilter.S',
'arm/32/looprestoration.S',
@@ -158,7 +158,7 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
'arm/32/cdef16.S',
- 'arm/32/film_grain16.S',
+ 'arm/32/filmgrain16.S',
'arm/32/ipred16.S',
'arm/32/itx16.S',
'arm/32/loopfilter16.S',
@@ -183,7 +183,7 @@ if is_asm_enabled
libdav1d_tmpl_sources += files(
'x86/cdef_init_tmpl.c',
- 'x86/film_grain_init_tmpl.c',
+ 'x86/filmgrain_init_tmpl.c',
'x86/ipred_init_tmpl.c',
'x86/itx_init_tmpl.c',
'x86/loopfilter_init_tmpl.c',
@@ -206,16 +206,17 @@ if is_asm_enabled
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'x86/cdef_avx512.asm',
+ 'x86/filmgrain_avx512.asm',
'x86/ipred_avx512.asm',
'x86/itx_avx512.asm',
'x86/loopfilter_avx512.asm',
'x86/looprestoration_avx512.asm',
'x86/mc_avx512.asm',
- 'x86/mc_avx2.asm',
- 'x86/film_grain_avx2.asm',
+ 'x86/filmgrain_avx2.asm',
'x86/ipred_avx2.asm',
'x86/loopfilter_avx2.asm',
- 'x86/film_grain_sse.asm',
+ 'x86/mc_avx2.asm',
+ 'x86/filmgrain_sse.asm',
'x86/ipred_sse.asm',
'x86/loopfilter_sse.asm',
'x86/looprestoration_sse.asm',
@@ -225,17 +226,18 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
+ 'x86/ipred16_avx512.asm',
'x86/looprestoration16_avx512.asm',
'x86/mc16_avx512.asm',
'x86/cdef16_avx2.asm',
- 'x86/film_grain16_avx2.asm',
+ 'x86/filmgrain16_avx2.asm',
'x86/ipred16_avx2.asm',
'x86/itx16_avx2.asm',
'x86/loopfilter16_avx2.asm',
'x86/looprestoration16_avx2.asm',
'x86/mc16_avx2.asm',
'x86/cdef16_sse.asm',
- 'x86/film_grain16_sse.asm',
+ 'x86/filmgrain16_sse.asm',
'x86/ipred16_sse.asm',
'x86/itx16_sse.asm',
'x86/loopfilter16_sse.asm',
diff --git a/third_party/dav1d/src/obu.c b/third_party/dav1d/src/obu.c
index dee6e13de913d..7df6850a8c392 100644
--- a/third_party/dav1d/src/obu.c
+++ b/third_party/dav1d/src/obu.c
@@ -135,15 +135,18 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
op->initial_display_delay = dav1d_get_bits(gb, 4) + 1;
}
}
- const int op_idx =
- c->operating_point < hdr->num_operating_points ? c->operating_point : 0;
- c->operating_point_idc = hdr->operating_points[op_idx].idc;
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-operating-points: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
}
+ const int op_idx =
+ c->operating_point < hdr->num_operating_points ? c->operating_point : 0;
+ c->operating_point_idc = hdr->operating_points[op_idx].idc;
+ const unsigned spatial_mask = c->operating_point_idc >> 8;
+ c->max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0;
+
hdr->width_n_bits = dav1d_get_bits(gb, 4) + 1;
hdr->height_n_bits = dav1d_get_bits(gb, 4) + 1;
hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1;
@@ -383,7 +386,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
if (seqhdr->frame_id_numbers_present) {
hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->existing_frame_idx].p.p.frame_hdr;
- if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) return DAV1D_ERR(EINVAL);
+ if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) goto error;
}
return 0;
}
@@ -767,7 +770,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
// segmentation data from the reference frame.
assert(hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
- if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
+ if (!c->refs[pri_ref].p.p.frame_hdr) goto error;
hdr->segmentation.seg_data =
c->refs[pri_ref].p.p.frame_hdr->segmentation.seg_data;
}
@@ -829,7 +832,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
} else {
const int ref = hdr->refidx[hdr->primary_ref_frame];
- if (!c->refs[ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
+ if (!c->refs[ref].p.p.frame_hdr) goto error;
hdr->loopfilter.mode_ref_deltas =
c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas;
}
@@ -932,7 +935,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
int off_after = -1;
int off_before_idx, off_after_idx;
for (int i = 0; i < 7; i++) {
- if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
+ if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error;
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc);
@@ -960,7 +963,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
unsigned off_before2 = 0xFFFFFFFFU;
int off_before2_idx;
for (int i = 0; i < 7; i++) {
- if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
+ if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error;
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
if (get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before) < 0) {
@@ -1015,7 +1018,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
ref_gmv = &dav1d_default_wm_params;
} else {
const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
- if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
+ if (!c->refs[pri_ref].p.p.frame_hdr) goto error;
ref_gmv = &c->refs[pri_ref].p.p.frame_hdr->gmv[i];
}
int32_t *const mat = hdr->gmv[i].matrix;
@@ -1245,11 +1248,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
memset(seq_hdr, 0, sizeof(*seq_hdr));
if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) {
dav1d_ref_dec(&ref);
- return res;
+ goto error;
}
if (check_for_overrun(c, &gb, init_bit_pos, len)) {
dav1d_ref_dec(&ref);
- return DAV1D_ERR(EINVAL);
+ goto error;
}
// If we have read a sequence header which is different from
// the old one, this is a new video sequence and can't use any
@@ -1307,7 +1310,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
c->frame_hdr->spatial_id = spatial_id;
if ((res = parse_frame_hdr(c, &gb)) < 0) {
c->frame_hdr = NULL;
- return res;
+ goto error;
}
for (int n = 0; n < c->n_tile_data; n++)
dav1d_data_unref_internal(&c->tile[n].data);
@@ -1319,7 +1322,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
dav1d_get_bits(&gb, 1);
if (check_for_overrun(c, &gb, init_bit_pos, len)) {
c->frame_hdr = NULL;
- return DAV1D_ERR(EINVAL);
+ goto error;
}
}
@@ -1360,7 +1363,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
// Align to the next byte boundary and check for overrun.
dav1d_bytealign_get_bits(&gb);
if (check_for_overrun(c, &gb, init_bit_pos, len))
- return DAV1D_ERR(EINVAL);
+ goto error;
// The current bit position is a multiple of 8 (because we
// just aligned it) and less than 8*pkt_bytelen because
// otherwise the overrun check would have fired.
@@ -1547,7 +1550,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
if (c->seq_hdr && c->frame_hdr) {
if (c->frame_hdr->show_existing_frame) {
- if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return DAV1D_ERR(EINVAL);
+ if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) goto error;
if (c->n_fc == 1) {
dav1d_thread_picture_ref(&c->out,
&c->refs[c->frame_hdr->existing_frame_idx].p);
@@ -1574,7 +1577,13 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
- if (out_delayed->p.data[0]) {
+ const int error = f->task_thread.retval;
+ if (error) {
+ c->cached_error = error;
+ f->task_thread.retval = 0;
+ dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+ dav1d_thread_picture_unref(out_delayed);
+ } else if (out_delayed->p.data[0]) {
const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
memory_order_relaxed);
if ((out_delayed->visible || c->output_invisible_frames) &&
@@ -1613,7 +1622,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
c->frame_hdr = NULL;
} else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) {
if (!c->n_tile_data)
- return DAV1D_ERR(EINVAL);
+ goto error;
if ((res = dav1d_submit_frame(c)) < 0)
return res;
assert(!c->n_tile_data);
@@ -1625,6 +1634,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
return len + init_byte_pos;
error:
+ dav1d_data_props_copy(&c->cached_error_props, &in->m);
dav1d_log(c, "Error parsing OBU data\n");
return DAV1D_ERR(EINVAL);
}
diff --git a/third_party/dav1d/src/picture.c b/third_party/dav1d/src/picture.c
index 461c9d0522b37..bebc4dd9c17b1 100644
--- a/third_party/dav1d/src/picture.c
+++ b/third_party/dav1d/src/picture.c
@@ -283,6 +283,7 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) {
dav1d_ref_dec(&p->itut_t35_ref);
}
memset(p, 0, sizeof(*p));
+ dav1d_data_props_set_defaults(&p->m);
}
void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
diff --git a/third_party/dav1d/src/tables.c b/third_party/dav1d/src/tables.c
index 92b8c4b5a8dcc..9752f15c40df2 100644
--- a/third_party/dav1d/src/tables.c
+++ b/third_party/dav1d/src/tables.c
@@ -756,7 +756,7 @@ const uint16_t dav1d_dr_intra_derivative[44] = {
[1*idx+32] = f4, [1*idx+40] = f5, \
[1*idx+48] = f6
#endif
-const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
+const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 64) = {
{
F( 0, -6, 10, 0, 0, 0, 12, 0 ),
F( 1, -5, 2, 10, 0, 0, 9, 0 ),
diff --git a/third_party/dav1d/src/thread_task.c b/third_party/dav1d/src/thread_task.c
index bcda3182894a7..53aa41e5c8ad7 100644
--- a/third_party/dav1d/src/thread_task.c
+++ b/third_party/dav1d/src/thread_task.c
@@ -30,6 +30,7 @@
#include "common/frame.h"
#include "src/thread_task.h"
+#include "src/fg_apply.h"
// This function resets the cur pointer to the first frame theoretically
// executable after a task completed (ie. each time we update some progress or
@@ -281,6 +282,22 @@ void dav1d_task_frame_init(Dav1dFrameContext *const f) {
insert_task(f, t, 1);
}
+void dav1d_task_delayed_fg(Dav1dContext *const c, Dav1dPicture *const out,
+ const Dav1dPicture *const in)
+{
+ struct TaskThreadData *const ttd = &c->task_thread;
+ ttd->delayed_fg.in = in;
+ ttd->delayed_fg.out = out;
+ ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_PREP;
+ atomic_init(&ttd->delayed_fg.progress[0], 0);
+ atomic_init(&ttd->delayed_fg.progress[1], 0);
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 1;
+ pthread_cond_signal(&ttd->cond);
+ pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock);
+ pthread_mutex_unlock(&ttd->lock);
+}
+
static inline int ensure_progress(struct TaskThreadData *const ttd,
Dav1dFrameContext *const f,
Dav1dTask *const t, const enum TaskType type,
@@ -352,18 +369,104 @@ static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
return 0;
}
-static inline void abort_frame(Dav1dFrameContext *const f) {
- atomic_store(&f->task_thread.error, 1);
+static inline void abort_frame(Dav1dFrameContext *const f, const int error) {
+ atomic_store(&f->task_thread.error, error == DAV1D_ERR(EINVAL) ? 1 : -1);
f->task_thread.task_counter = 0;
f->task_thread.done[0] = 1;
f->task_thread.done[1] = 1;
atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
- dav1d_decode_frame_exit(f, -1);
+ dav1d_decode_frame_exit(f, error);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
+static inline void delayed_fg_task(const Dav1dContext *const c,
+ struct TaskThreadData *const ttd)
+{
+ const Dav1dPicture *const in = ttd->delayed_fg.in;
+ Dav1dPicture *const out = ttd->delayed_fg.out;
+#if CONFIG_16BPC
+ int off;
+ if (out->p.bpc != 8)
+ off = (out->p.bpc >> 1) - 4;
+#endif
+ switch (ttd->delayed_fg.type) {
+ case DAV1D_TASK_TYPE_FG_PREP:
+ ttd->delayed_fg.exec = 0;
+ if (atomic_load(&ttd->cond_signaled))
+ pthread_cond_signal(&ttd->cond);
+ pthread_mutex_unlock(&ttd->lock);
+ switch (out->p.bpc) {
+#if CONFIG_8BPC
+ case 8:
+ dav1d_prep_grain_8bpc(&c->dsp[0].fg, out, in,
+ ttd->delayed_fg.scaling_8bpc,
+ ttd->delayed_fg.grain_lut_8bpc);
+ break;
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ dav1d_prep_grain_16bpc(&c->dsp[off].fg, out, in,
+ ttd->delayed_fg.scaling_16bpc,
+ ttd->delayed_fg.grain_lut_16bpc);
+ break;
+#endif
+ default: abort();
+ }
+ ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_APPLY;
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 1;
+ // fall-through
+ case DAV1D_TASK_TYPE_FG_APPLY:;
+ int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
+ pthread_mutex_unlock(&ttd->lock);
+ int progmax = (out->p.h + 31) >> 5;
+ fg_apply_loop:
+ if (row + 1 < progmax)
+ pthread_cond_signal(&ttd->cond);
+ else if (row + 1 >= progmax) {
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 0;
+ if (row >= progmax) goto end_add;
+ pthread_mutex_unlock(&ttd->lock);
+ }
+ switch (out->p.bpc) {
+#if CONFIG_8BPC
+ case 8:
+ dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in,
+ ttd->delayed_fg.scaling_8bpc,
+ ttd->delayed_fg.grain_lut_8bpc, row);
+ break;
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in,
+ ttd->delayed_fg.scaling_16bpc,
+ ttd->delayed_fg.grain_lut_16bpc, row);
+ break;
+#endif
+ default: abort();
+ }
+ row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
+ int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
+ if (row < progmax) goto fg_apply_loop;
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 0;
+ end_add:
+ done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
+ progmax = atomic_load(&ttd->delayed_fg.progress[0]);
+ // signal for completion only once the last runner reaches this
+ if (done < progmax)
+ break;
+ pthread_cond_signal(&ttd->delayed_fg.cond);
+ break;
+ default: abort();
+ }
+}
+
void *dav1d_worker_task(void *data) {
Dav1dTaskContext *const tc = data;
const Dav1dContext *const c = tc->c;
@@ -373,11 +476,15 @@ void *dav1d_worker_task(void *data) {
pthread_mutex_lock(&ttd->lock);
for (;;) {
- Dav1dFrameContext *f;
- Dav1dTask *t, *prev_t = NULL;
if (tc->task_thread.die) break;
if (atomic_load(c->flush)) goto park;
- if (c->n_fc > 1) { // run init tasks first
+ if (ttd->delayed_fg.exec) { // run delayed film grain first
+ delayed_fg_task(c, ttd);
+ continue;
+ }
+ Dav1dFrameContext *f;
+ Dav1dTask *t, *prev_t = NULL;
+ if (c->n_fc > 1) { // run init tasks second
for (unsigned i = 0; i < c->n_fc; i++) {
const unsigned first = atomic_load(&ttd->first);
f = &c->fc[(first + i) % c->n_fc];
@@ -395,7 +502,7 @@ void *dav1d_worker_task(void *data) {
}
}
}
- while (ttd->cur < c->n_fc) {
+ while (ttd->cur < c->n_fc) { // run decoding tasks last
const unsigned first = atomic_load(&ttd->first);
f = &c->fc[(first + ttd->cur) % c->n_fc];
prev_t = f->task_thread.task_cur_prev;
@@ -497,7 +604,7 @@ void *dav1d_worker_task(void *data) {
int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1;
if (res || p1 == TILE_ERROR) {
pthread_mutex_lock(&ttd->lock);
- abort_frame(f);
+ abort_frame(f, res ? res : DAV1D_ERR(EINVAL));
} else if (!res) {
t->type = DAV1D_TASK_TYPE_INIT_CDF;
if (p1) goto found_unlocked;
@@ -509,7 +616,7 @@ void *dav1d_worker_task(void *data) {
}
case DAV1D_TASK_TYPE_INIT_CDF: {
assert(c->n_fc > 1);
- int res = -1;
+ int res = DAV1D_ERR(EINVAL);
if (!atomic_load(&f->task_thread.error))
res = dav1d_decode_frame_init_cdf(f);
pthread_mutex_lock(&ttd->lock);
@@ -523,19 +630,19 @@ void *dav1d_worker_task(void *data) {
if (res) {
// memory allocation failed
f->task_thread.done[2 - p] = 1;
- atomic_store(&f->task_thread.error, 1);
+ atomic_store(&f->task_thread.error, -1);
f->task_thread.task_counter -= f->sbh +
f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR);
if (p == 2 && f->task_thread.done[1]) {
assert(!f->task_thread.task_counter);
- dav1d_decode_frame_exit(f, -1);
+ dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM));
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
}
}
- } else abort_frame(f);
+ } else abort_frame(f, res);
reset_task_cur(c, ttd, t->frame_idx);
f->task_thread.init_done = 1;
continue;
@@ -588,7 +695,8 @@ void *dav1d_worker_task(void *data) {
if (!--f->task_thread.task_counter && f->task_thread.done[0] &&
(!uses_2pass || f->task_thread.done[1]))
{
- dav1d_decode_frame_exit(f, error ? -1 : 0);
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
@@ -703,7 +811,8 @@ void *dav1d_worker_task(void *data) {
if (!--f->task_thread.task_counter &&
f->task_thread.done[0] && (!uses_2pass || f->task_thread.done[1]))
{
- dav1d_decode_frame_exit(f, error ? -1 : 0);
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
diff --git a/third_party/dav1d/src/thread_task.h b/third_party/dav1d/src/thread_task.h
index 0ff2228bed7c5..257da1a470c70 100644
--- a/third_party/dav1d/src/thread_task.h
+++ b/third_party/dav1d/src/thread_task.h
@@ -39,6 +39,8 @@
int dav1d_task_create_tile_sbrow(Dav1dFrameContext *f, int pass, int cond_signal);
void dav1d_task_frame_init(Dav1dFrameContext *f);
+void dav1d_task_delayed_fg(Dav1dContext *c, Dav1dPicture *out, const Dav1dPicture *in);
+
void *dav1d_worker_task(void *data);
int dav1d_decode_frame_init(Dav1dFrameContext *f);
diff --git a/third_party/dav1d/src/x86/cdef16_avx2.asm b/third_party/dav1d/src/x86/cdef16_avx2.asm
index 9e2c7b361e0b1..4c8d3bca4377b 100644
--- a/third_party/dav1d/src/x86/cdef16_avx2.asm
+++ b/third_party/dav1d/src/x86/cdef16_avx2.asm
@@ -59,14 +59,6 @@ cextern cdef_dir_8bpc_avx2.main
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%macro CDEF_FILTER 2 ; w, h
DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp
movifnidn prid, r5m
diff --git a/third_party/dav1d/src/x86/cdef16_sse.asm b/third_party/dav1d/src/x86/cdef16_sse.asm
index 03736b422c0b9..1bd67ace64d09 100644
--- a/third_party/dav1d/src/x86/cdef16_sse.asm
+++ b/third_party/dav1d/src/x86/cdef16_sse.asm
@@ -64,14 +64,6 @@ cextern shufw_6543210x
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%if ARCH_X86_32
DECLARE_REG_TMP 5, 3
%elif WIN64
diff --git a/third_party/dav1d/src/x86/film_grain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
similarity index 98%
rename from third_party/dav1d/src/x86/film_grain16_avx2.asm
rename to third_party/dav1d/src/x86/filmgrain16_avx2.asm
index 1ece90b68433b..5c2e3868a349a 100644
--- a/third_party/dav1d/src/x86/film_grain16_avx2.asm
+++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
@@ -25,6 +25,7 @@
%include "config.asm"
%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
%if ARCH_X86_64
@@ -81,38 +82,8 @@ JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3
-struc FGData
- .seed: resd 1
- .num_y_points: resd 1
- .y_points: resb 14 * 2
- .chroma_scaling_from_luma: resd 1
- .num_uv_points: resd 2
- .uv_points: resb 2 * 10 * 2
- .scaling_shift: resd 1
- .ar_coeff_lag: resd 1
- .ar_coeffs_y: resb 24
- .ar_coeffs_uv: resb 2 * 28 ; includes padding
- .ar_coeff_shift: resq 1
- .grain_scale_shift: resd 1
- .uv_mult: resd 2
- .uv_luma_mult: resd 2
- .uv_offset: resd 2
- .overlap_flag: resd 1
- .clip_to_restricted_range: resd 1
-endstruc
-
-cextern gaussian_sequence
-
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
INIT_YMM avx2
diff --git a/third_party/dav1d/src/x86/film_grain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm
similarity index 99%
rename from third_party/dav1d/src/x86/film_grain16_sse.asm
rename to third_party/dav1d/src/x86/filmgrain16_sse.asm
index 3f86e7d9a5907..6b0daaac0ba35 100644
--- a/third_party/dav1d/src/x86/film_grain16_sse.asm
+++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm
@@ -25,6 +25,7 @@
%include "config.asm"
%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
SECTION_RODATA 16
pd_16: times 4 dd 16
@@ -66,38 +67,8 @@ JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
-struc FGData
- .seed: resd 1
- .num_y_points: resd 1
- .y_points: resb 14 * 2
- .chroma_scaling_from_luma: resd 1
- .num_uv_points: resd 2
- .uv_points: resb 2 * 10 * 2
- .scaling_shift: resd 1
- .ar_coeff_lag: resd 1
- .ar_coeffs_y: resb 24
- .ar_coeffs_uv: resb 2 * 28 ; includes padding
- .ar_coeff_shift: resq 1
- .grain_scale_shift: resd 1
- .uv_mult: resd 2
- .uv_luma_mult: resd 2
- .uv_offset: resd 2
- .overlap_flag: resd 1
- .clip_to_restricted_range: resd 1
-endstruc
-
-cextern gaussian_sequence
-
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%if ARCH_X86_32
%undef base
%define PIC_ptr(a) base+a
diff --git a/third_party/dav1d/src/x86/film_grain_avx2.asm b/third_party/dav1d/src/x86/filmgrain_avx2.asm
similarity index 98%
rename from third_party/dav1d/src/x86/film_grain_avx2.asm
rename to third_party/dav1d/src/x86/filmgrain_avx2.asm
index dda43a9baa579..7da8105dfacd0 100644
--- a/third_party/dav1d/src/x86/film_grain_avx2.asm
+++ b/third_party/dav1d/src/x86/filmgrain_avx2.asm
@@ -25,6 +25,7 @@
%include "config.asm"
%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
%if ARCH_X86_64
@@ -74,38 +75,8 @@ JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
-struc FGData
- .seed: resd 1
- .num_y_points: resd 1
- .y_points: resb 14 * 2
- .chroma_scaling_from_luma: resd 1
- .num_uv_points: resd 2
- .uv_points: resb 2 * 10 * 2
- .scaling_shift: resd 1
- .ar_coeff_lag: resd 1
- .ar_coeffs_y: resb 24
- .ar_coeffs_uv: resb 2 * 28 ; includes padding
- .ar_coeff_shift: resq 1
- .grain_scale_shift: resd 1
- .uv_mult: resd 2
- .uv_luma_mult: resd 2
- .uv_offset: resd 2
- .overlap_flag: resd 1
- .clip_to_restricted_range: resd 1
-endstruc
-
-cextern gaussian_sequence
-
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
INIT_YMM avx2
cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
%define base r4-generate_grain_y_8bpc_avx2_table
@@ -1097,9 +1068,6 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
jne .loop_x_hv_overlap
jmp .loop_x_h_overlap
-.end:
- RET
-
.vertical_overlap:
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
unused, sby, see, overlap
@@ -1214,7 +1182,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
jmp .loop_y
.end_y_v_overlap:
add wq, 32
- jge .end_hv
+ jge .end
lea srcq, [src_bakq+wq]
; since fg_dataq.overlap is guaranteed to be set, we never jump
@@ -1334,7 +1302,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
add wq, 32
lea srcq, [src_bakq+wq]
jl .loop_x_hv_overlap
-.end_hv:
+.end:
RET
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
@@ -1691,9 +1659,6 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
jne %%loop_x_hv_overlap
jmp %%loop_x_h_overlap
-%%end:
- RET
-
%%vertical_overlap:
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
sby, see, overlap, unused1, unused2, lstride
@@ -1887,7 +1852,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%%end_y_v_overlap:
add wq, 32>>%2
- jge %%end_hv
+ jge %%end
mov srcq, r11mp
mov dstq, r12mp
lea lumaq, [r14+wq*(1+%2)]
@@ -2116,15 +2081,14 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%%end_y_hv_overlap:
add wq, 32>>%2
- jge %%end_hv
+ jge %%end
mov srcq, r11mp
mov dstq, r12mp
lea lumaq, [r14+wq*(1+%2)]
add srcq, wq
add dstq, wq
jmp %%loop_x_hv_overlap
-
-%%end_hv:
+%%end:
RET
%endmacro
diff --git a/third_party/dav1d/src/x86/filmgrain_avx512.asm b/third_party/dav1d/src/x86/filmgrain_avx512.asm
new file mode 100644
index 0000000000000..6d277464a5976
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_avx512.asm
@@ -0,0 +1,1079 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+pb_even: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
+ db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
+pb_odd: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
+ db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
+interleave_hl: db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7
+pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32
+pb_23_22_0_32: db 23, 22, 0, 32, 0, 32, 0, 32
+pb_27_17: times 2 db 27, 17
+pb_23_22: times 2 db 23, 22
+pw_8: times 2 dw 8
+pw_1024: times 2 dw 1024
+pb_17_27: times 2 db 17, 27
+fg_max: times 4 db 255
+ times 4 db 240
+ times 4 db 235
+fg_min: times 4 db 0
+ times 4 db 16
+noise_rnd: times 2 dw 128
+ times 2 dw 64
+ times 2 dw 32
+ times 2 dw 16
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, see, overlap
+%define base r11-fg_min
+ lea r11, [fg_min]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+ mov r12, 0x0000000f0000000f ; h_overlap mask
+ mova m0, [scalingq+64*0]
+ mova m1, [scalingq+64*1]
+ mova m2, [scalingq+64*2]
+ mova m3, [scalingq+64*3]
+ kmovq k1, r12
+ vbroadcasti32x4 m4, [base+interleave_hl]
+ vpbroadcastd ym16, [base+pb_27_17]
+ vpbroadcastd m12, [base+pb_17_27]
+ vpbroadcastd m6, [base+noise_rnd+r6*4-32]
+ test sbyd, sbyd
+ setnz r6b
+ vpbroadcastd m7, [base+fg_min+r7*4]
+ vpbroadcastd m8, [base+fg_max+r7*8]
+ pxor m5, m5
+ vpbroadcastd m9, [base+pw_1024]
+ vpbroadcastq m10, [base+pb_27_17_17_27]
+ vmovdqa64 m12{k1}, m16
+ test r6b, overlapb
+ jnz .v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offxd, [offyq+offxq*2+829] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, overlap
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y:
+ mova ym18, [srcq+strideq*0]
+ vinserti32x8 m18, [srcq+strideq*1], 1
+ movu ym21, [grain_lutq+offxyq-82]
+ vinserti32x8 m21, [grain_lutq+offxyq+ 0], 1
+ mova m19, m0
+ vpmovb2m k2, m18
+ punpcklbw m16, m18, m5
+ punpckhbw m17, m18, m5
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+ punpcklbw m20, m5, m21 ; grain
+ punpckhbw m21, m5
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7
+ pminub m16, m8
+ mova [dstq+srcq], ym16
+ add srcq, strideq
+ vextracti32x8 [dstq+srcq], m16, 1
+ add srcq, strideq
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+ test overlapd, overlapd
+ jz .loop_x
+ test sbyd, sbyd
+ jnz .hv_overlap
+
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, left_offxy
+
+ rorx offyd, seed, 8
+ mov left_offxyd, offxd ; previous column's offy*stride
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offxd, [offyq+offxq*2+829] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ movu ym20, [grain_lutq+offxyq-82]
+ vinserti32x8 m20, [grain_lutq+offxyq+ 0], 1
+ movd xm21, [grain_lutq+left_offxyq-50]
+ vinserti32x4 m21, [grain_lutq+left_offxyq+32], 2
+ mova ym18, [srcq+strideq*0]
+ vinserti32x8 m18, [srcq+strideq*1], 1
+ mova m19, m0
+ punpcklbw m21, m20
+ vpmovb2m k2, m18
+ punpcklbw m16, m18, m5
+ punpckhbw m17, m18, m5
+ pmaddubsw m21, m10, m21
+ vpermt2b m19, m18, m1
+ vpermi2b m18, m2, m3
+ pmulhrsw m21, m9
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ punpckhbw m18, m20, m5
+ pshufb m19, m4
+ packsswb m20{k1}, m21, m21
+ punpcklbw m20, m5, m20 ; grain
+ pmaddubsw m18, m19, m18
+ pmaddubsw m19, m20
+ add grain_lutq, 82*2
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ paddw m17, m18
+ paddw m16, m19
+ packuswb m16, m17
+ pmaxub m16, m7
+ pminub m16, m8
+ mova [dstq+srcq], ym16
+ add srcq, strideq
+ vextracti32x8 [dstq+srcq], m16, 1
+ add srcq, strideq
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+ test sbyd, sbyd
+ jnz .hv_overlap
+ jmp .loop_x_h_overlap
+
+.v_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
+ h, sby, see, overlap
+
+ movzx r6d, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, r6d, 173 * 0x00010001
+ imul r6d, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add r6d, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and r6d, 0xff00ff00
+ xor seed, r7d
+ xor seed, r6d ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offxd, [offyq+offxq*2+0x10001*829+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, overlap, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+ movu ym16, [grain_lutq+offxyq-82]
+ vinserti32x8 m16, [grain_lutq+offxyq+ 0], 1
+ movu ym21, [grain_lutq+top_offxyq-82]
+ vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1
+ mova ym18, [srcq+strideq*0]
+ vinserti32x8 m18, [srcq+strideq*1], 1
+ mova m19, m0
+ punpcklbw m20, m21, m16
+ punpckhbw m21, m16
+ vpmovb2m k2, m18
+ pmaddubsw m20, m12, m20
+ pmaddubsw m21, m12, m21
+ punpcklbw m16, m18, m5
+ punpckhbw m17, m18, m5
+ vpermt2b m19, m18, m1
+ vpermi2b m18, m2, m3
+ pmulhrsw m20, m9
+ pmulhrsw m21, m9
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ packsswb m20, m21
+ punpcklbw m18, m5, m20 ; grain
+ punpckhbw m20, m5
+ pmaddubsw m18, m19, m18
+ pmaddubsw m19, m20
+ add grain_lutq, 82*2
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7
+ pminub m16, m8
+ mova [dstq+srcq], ym16
+ add srcq, strideq
+ vextracti32x8 [dstq+srcq], m16, 1
+ add srcq, strideq
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+ ; to .v_overlap, and instead always fall-through to h+v overlap
+.hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, left_offxy, top_offxy, topleft_offxy
+
+ mov topleft_offxyd, top_offxyd
+ rorx offyd, seed, 8
+ mov left_offxyd, offxd
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offxd, [offyq+offxq*2+0x10001*829+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+ movu ym19, [grain_lutq+offxyq-82]
+ vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1
+ movd xm16, [grain_lutq+left_offxyq-50]
+ vinserti32x4 m16, [grain_lutq+left_offxyq+32], 2
+ movu ym21, [grain_lutq+top_offxyq-82]
+ vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1
+ movd xm17, [grain_lutq+topleft_offxyq-50]
+ vinserti32x4 m17, [grain_lutq+topleft_offxyq+32], 2
+ mova ym18, [srcq+strideq*0]
+ vinserti32x8 m18, [srcq+strideq*1], 1
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m16, m19
+ punpcklbw m17, m21
+ pmaddubsw m16, m10, m16
+ pmaddubsw m17, m10, m17
+ punpckhbw m20, m21, m19
+ vpmovb2m k2, m18
+ pmulhrsw m16, m9
+ pmulhrsw m17, m9
+ packsswb m19{k1}, m16, m16
+ packsswb m21{k1}, m17, m17
+ ; followed by v interpolation (top | cur -> cur)
+ punpcklbw m21, m19
+ mova m19, m0
+ pmaddubsw m20, m12, m20
+ pmaddubsw m21, m12, m21
+ punpcklbw m16, m18, m5
+ punpckhbw m17, m18, m5
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+ pmulhrsw m20, m9
+ pmulhrsw m21, m9
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ packsswb m21, m20
+ punpcklbw m20, m5, m21
+ punpckhbw m21, m5
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7
+ pminub m16, m8
+ mova [dstq+srcq], ym16
+ add srcq, strideq
+ vextracti32x8 [dstq+srcq], m16, 1
+ add srcq, strideq
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ lea srcq, [src_bakq+wq]
+ jl .hv_overlap
+.end:
+ RET
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver -> defines fguv_32x32xn_i<name>_8bpc
+cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
+ scaling, grain_lut, h, sby, luma, \
+ overlap, uv_pl, is_id, _, stride3
+ lea r11, [fg_min] ; r11 anchors the base+<const> loads below (the `base` define is outside this macro)
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r9d, is_idm
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+%if %2 ; ss_hor
+ mov r12, 0x000f000f000f000f ; h_overlap mask
+ vpbroadcastq m10, [base+pb_23_22_0_32] ; m10 = h_overlap blend coefficients
+ lea stride3q, [strideq*3] ; stride3 uses the extra register requested by 14+%2
+%else
+ mov r12, 0x0000000f0000000f ; h_overlap mask
+ vpbroadcastq m10, [base+pb_27_17_17_27] ; m10 = h_overlap blend coefficients
+%endif
+ mova m0, [scalingq+64*0] ; m0-m3 = the 256-byte scaling table, kept in registers
+ mova m1, [scalingq+64*1]
+ mova m2, [scalingq+64*2]
+ mova m3, [scalingq+64*3]
+ kmovq k1, r12 ; as a byte mask: first 4 bytes of every 16-byte (ss_hor) or 32-byte row
+ vbroadcasti32x4 m4, [base+interleave_hl]
+ vpbroadcastd m6, [base+noise_rnd+r6*4-32] ; noise rounding multiplier, selected by scaling_shift
+ vpbroadcastd m7, [base+fg_min+r7*4] ; lower clamp, selected by clip_to_restricted_range
+ shlx r7d, r7d, r9d ; r7 = clip_to_restricted_range << is_id
+ vpbroadcastd m8, [base+fg_max+r7*4] ; upper clamp
+ test sbyd, sbyd
+ setnz r7b ; r7b = (sby != 0), tested against overlap in the loop macro
+ vpbroadcastd m9, [base+pw_1024]
+ mova m11, [base+pb_even]
+ mova m12, [base+pb_odd]
+ pxor m5, m5 ; m5 = 0, used for byte<->word unpacking
+ mov r5, r10mp ; lstride
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl ; nonzero: use the second loop instance (no uv_mult/uv_offset blend)
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
+ h, sby, see, overlap, uv_pl, _, _, stride3
+%if %1 ; not-csfl: set up the luma/chroma blend constants
+ mov r6d, uv_plm
+ vpbroadcastd m16, [base+pw_8] ; used as the pshufb control below
+ vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4] ; 16 bytes from uv_mult[uv_pl]; uv_luma_mult[uv_pl] sits 8 bytes in
+ vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4] ; uv_offset[uv_pl] in every word
+ pshufb m14, m16 ; uv_luma_mult, uv_mult
+%endif
+ test r7b, overlapb ; r7b = (sby != 0) from the prologue
+ jnz %%v_overlap ; overlap enabled and sby != 0: blend with the row above
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew ; keep 16 bits: low byte from sby*173+105, high byte from sby*37+178
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, overlap, _, _, _, stride3
+
+ mov lumaq, r9mp
+ lea r11, [srcq+wq] ; r11/r12/r13 = src/dst/luma pointers advanced by the width
+ lea r12, [dstq+wq]
+ lea r13, [lumaq+wq*(1+%2)] ; luma advances twice as fast when ss_hor
+ mov r11mp, r11 ; saved via r11mp/r12mp, reloaded at the end of every column
+ mov r12mp, r12
+ neg wq ; w now counts up from -width to 0
+
+%%loop_x: ; one column per iteration, no overlap blending
+ rorx r6, seeq, 1 ; r6d = seed >> 1 (assuming the seed fits in 16 bits)
+ or seed, 0xeff4
+ test seeb, seeh ; PF = even parity of seed bits 0, 1, 3 and 12
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12 ; 64-bit rotate: bits 12+ land at bit 0, bits 0..11 go above bit 31
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, overlap, _, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y: ; 2 rows per iteration (4 when ss_hor); also entered from %%v_overlap after its blended rows
+ mova ym18, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1 ; next luma row used (every other row when ss_ver)
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%if %2 ; ss_hor
+ mova ym20, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m20, [lumaq+lstrideq*(1<<%3)], 1
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova xm17, [srcq+strideq*0] ; m17 = four 16-byte chroma rows
+ movu xm21, [grain_lutq+offxyq+82*0] ; m21 = four 16-byte grain rows (grain_lut row pitch is 82)
+ vinserti128 ym17, [srcq+strideq*1], 1
+ vinserti128 ym21, [grain_lutq+offxyq+82*1], 1
+ mova m19, m11
+ vpermi2b m19, m18, m20 ; luma bytes picked by pb_even
+ vpermt2b m18, m12, m20 ; luma bytes picked by pb_odd
+ vinserti32x4 m17, [srcq+strideq*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ pavgb m18, m19 ; rounded average of the two picks (presumably horizontal luma pairs - confirm pb_even/pb_odd)
+ vinserti32x4 m17, [srcq+stride3q ], 3
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+%else
+ mova ym17, [srcq+strideq*0] ; m17 = two 32-byte chroma rows
+ vinserti32x8 m17, [srcq+strideq*1], 1
+ movu ym21, [grain_lutq+offxyq+82*0] ; m21 = two 32-byte grain rows
+ vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1
+%endif
+ lea srcq, [srcq+strideq*(2<<%2)]
+%if %1 ; not-csfl
+ punpckhbw m19, m18, m17
+ punpcklbw m18, m17 ; { luma, chroma }
+ pmaddubsw m19, m14 ; luma*uv_luma_mult + chroma*uv_mult
+ pmaddubsw m18, m14
+ psraw m19, 6
+ psraw m18, 6
+ paddw m19, m15 ; + uv_offset
+ paddw m18, m15
+ packuswb m18, m19 ; saturate back to unsigned bytes
+%endif
+ mova m19, m0
+ vpmovb2m k2, m18 ; k2 = bytes whose top bit is set (index >= 128)
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+ punpcklbw m20, m5, m21 ; grain
+ punpckhbw m21, m5
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2<<%2 ; advance by the rows consumed this iteration
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ punpcklbw m16, m17, m5 ; chroma
+ punpckhbw m17, m5
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7 ; clamp to [fg_min, fg_max] entries chosen in the prologue
+ pminub m16, m8
+%if %2 ; ss_hor
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+stride3q ], m16, 3
+%else
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+%endif
+ lea dstq, [dstq+strideq*(2<<%2)]
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2 ; next column; w reaches 0 after the last one
+ jge %%end
+ mov srcq, r11mp ; rebuild the row-0 pointers for the new column
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ test overlapd, overlapd
+ jz %%loop_x ; no overlap: plain column loop
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap ; sby != 0, e.g. this column was started in %%v_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap: ; one column per iteration; its leading pixels are blended with the previous column's grain
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh ; PF = even parity of seed bits 0, 1, 3 and 12
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, left_offxy, _, _, _, stride3
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, left_offxy, _, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap: ; 2 rows per iteration (4 when ss_hor); also entered from %%hv_overlap after its blended rows
+ ; src
+%if %2 ; ss_hor
+ mova ym18, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova ym20, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m20, [lumaq+lstrideq*(1<<%3)], 1
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova xm17, [srcq+strideq*0] ; m17 = four 16-byte chroma rows
+ vinserti128 ym17, [srcq+strideq*1], 1
+ mova m19, m11
+ vpermi2b m19, m18, m20 ; luma bytes picked by pb_even
+ vpermt2b m18, m12, m20 ; luma bytes picked by pb_odd
+ vinserti32x4 m17, [srcq+strideq*2], 2
+ pavgb m18, m19 ; rounded average of the two picks
+ vinserti32x4 m17, [srcq+stride3q ], 3
+%else
+ mova ym18, [lumaq+lstrideq*0]
+ vinserti32x8 m18, [lumaq+lstrideq*1], 1
+ mova ym17, [srcq+strideq*0] ; m17 = two 32-byte chroma rows
+ vinserti32x8 m17, [srcq+strideq*1], 1
+ lea lumaq, [lumaq+lstrideq*2]
+%endif
+ lea srcq, [srcq+strideq*(2<<%2)]
+%if %1 ; not-csfl
+ punpckhbw m19, m18, m17
+ punpcklbw m18, m17 ; { luma, chroma }
+ pmaddubsw m19, m14 ; luma*uv_luma_mult + chroma*uv_mult
+ pmaddubsw m18, m14
+ psraw m19, 6
+ psraw m18, 6
+ paddw m19, m15 ; + uv_offset
+ paddw m18, m15
+ packuswb m18, m19
+%endif
+ mova m19, m0
+ vpmovb2m k2, m18 ; k2 = bytes whose top bit is set (index >= 128)
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+%if %2 ; ss_hor
+ movu xm20, [grain_lutq+offxyq +82*0] ; m20 = current grain rows
+ movd xm18, [grain_lutq+left_offxyq+82*0] ; m18 = 4 grain bytes per row just past the previous column's block
+ vinserti32x4 ym20, [grain_lutq+offxyq +82*1], 1
+ vinserti32x4 ym18, [grain_lutq+left_offxyq+82*1], 1
+ vinserti32x4 m20, [grain_lutq+offxyq +82*2], 2
+ vinserti32x4 m18, [grain_lutq+left_offxyq+82*2], 2
+ vinserti32x4 m20, [grain_lutq+offxyq +82*3], 3
+ vinserti32x4 m18, [grain_lutq+left_offxyq+82*3], 3
+%else
+ movu ym20, [grain_lutq+offxyq + 0] ; m20 = current grain rows
+ movd xm18, [grain_lutq+left_offxyq+ 0] ; m18 = 4 grain bytes per row just past the previous column's block
+ vinserti32x8 m20, [grain_lutq+offxyq +82], 1
+ vinserti32x4 m18, [grain_lutq+left_offxyq+82], 2
+%endif
+ punpcklbw m18, m20 ; { left, cur } byte pairs
+ pmaddubsw m18, m10, m18 ; weighted sum with the h_overlap coefficients
+ punpckhbw m21, m20, m5 ; high halves of the grain rows, taken before the blended write-back
+ pshufb m19, m4
+ pmulhrsw m18, m9 ; rounding shift (round(x / 32) if pw_1024 holds 1024)
+ vpacksswb m20{k1}, m18, m18 ; write the blended bytes over the masked leading bytes only
+ punpcklbw m20, m5, m20
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2<<%2 ; advance by the rows consumed this iteration
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ punpcklbw m16, m17, m5 ; chroma
+ punpckhbw m17, m5
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7 ; clamp to [fg_min, fg_max] entries chosen in the prologue
+ pminub m16, m8
+%if %2 ; ss_hor
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+stride3q ], m16, 3
+%else
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+%endif
+ lea dstq, [dstq+strideq*(2<<%2)]
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2 ; next column; w reaches 0 after the last one
+ jge %%end
+ mov srcq, r11mp ; rebuild the row-0 pointers for the new column
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap ; sby != 0: the next column needs the vertical blend as well
+ jmp %%loop_x_h_overlap
+
+%%v_overlap: ; first column when overlap is enabled and sby != 0: blend the first rows with the grain of the row above
+ DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
+ _, sby, see, overlap, _, _, _, stride3
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001 ; replicate the base seed into both 16-bit halves
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188 ; low halves use 188 = 105-173 and 141 = 178-37 (mod 256): the same formula for sby-1
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+%if %3 ; ss_ver
+ vpbroadcastd m13, [base+pb_23_22]
+ kxnorw k3, k3, k3 ; v_overlap mask
+%elif %2 ; ss_hor only
+ vbroadcasti32x8 m13, [base+pb_27_17]
+ kxnord k3, k3, k3 ; v_overlap mask: 32 bits set, i.e. the first two 16-byte rows
+ pshufd m13, m13, q0000 ; 8x27_17, 8x17_27
+%else
+ vpbroadcastd ym16, [base+pb_27_17]
+ vpbroadcastd m13, [base+pb_17_27]
+ vmovdqa64 m13{k1}, m16 ; k1 as a qword mask selects qwords 0..3: row 0 = pb_27_17, row 1 = pb_17_27
+%endif
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, overlap, top_offxy, _, _, stride3
+
+ mov lumaq, r9mp
+ lea r11, [srcq+wq] ; r11/r12/r13 = src/dst/luma pointers advanced by the width
+ lea r12, [dstq+wq]
+ lea r13, [lumaq+wq*(1<<%2)]
+ mov r11mp, r11 ; saved via r11mp/r12mp, reloaded at the end of every column
+ mov r12mp, r12
+ neg wq ; w now counts up from -width to 0
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0x000f000f
+ and offxd, 0x000f000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy; the top half additionally skips (32>>ss_ver) rows of 82
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, overlap, top_offxy, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw ; split the packed offsets: low half = top, high half = current
+ shr offxyd, 16
+
+%if %2 ; ss_hor
+ mova ym18, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova ym20, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m20, [lumaq+lstrideq*(1<<%3)], 1
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova xm17, [srcq+strideq*0] ; m17 = four 16-byte chroma rows
+ vinserti128 ym17, [srcq+strideq*1], 1
+ mova m19, m11
+ vpermi2b m19, m18, m20 ; luma bytes picked by pb_even
+ vpermt2b m18, m12, m20 ; luma bytes picked by pb_odd
+ vinserti32x4 m17, [srcq+strideq*2], 2
+ pavgb m18, m19 ; rounded average of the two picks
+ vinserti32x4 m17, [srcq+stride3q ], 3
+%else
+ mova ym18, [lumaq+lstrideq*0]
+ vinserti32x8 m18, [lumaq+lstrideq*1], 1
+ mova ym17, [srcq+strideq*0] ; m17 = two 32-byte chroma rows
+ vinserti32x8 m17, [srcq+strideq*1], 1
+ lea lumaq, [lumaq+lstrideq*2]
+%endif
+ lea srcq, [srcq+strideq*(2<<%2)]
+%if %1 ; not-csfl
+ punpckhbw m19, m18, m17
+ punpcklbw m18, m17 ; { luma, chroma }
+ pmaddubsw m19, m14 ; luma*uv_luma_mult + chroma*uv_mult
+ pmaddubsw m18, m14
+ psraw m19, 6
+ psraw m18, 6
+ paddw m19, m15 ; + uv_offset
+ paddw m18, m15
+ packuswb m18, m19
+%endif
+ mova m19, m0
+ vpmovb2m k2, m18 ; k2 = bytes whose top bit is set (index >= 128)
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+%if %3 ; ss_ver: one blended row
+ movu xm21, [grain_lutq+offxyq+82*0]
+ movu xm16, [grain_lutq+top_offxyq+82*0]
+ punpcklbw xm20, xm16, xm21 ; { top, cur } byte pairs
+ punpckhbw xm16, xm21
+ pmaddubsw xm20, xm13, xm20
+ pmaddubsw xm16, xm13, xm16
+ ; only interpolate first line, insert remaining line unmodified
+ vbroadcasti128 ym21, [grain_lutq+offxyq+82*1]
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ pmulhrsw xm20, xm9
+ pmulhrsw xm16, xm9
+ vpacksswb m21{k3}, m20, m16 ; k3 limits the write to the first 16 bytes (row 0)
+%elif %2 ; ss_hor only: two blended rows
+ movu xm21, [grain_lutq+offxyq+82*0]
+ vinserti128 ym21, [grain_lutq+offxyq+82*1], 1
+ movu xm16, [grain_lutq+top_offxyq+82*0]
+ vinserti32x4 ym16, [grain_lutq+top_offxyq+82*1], 1
+ punpcklbw ym20, ym16, ym21 ; { top, cur } byte pairs
+ punpckhbw ym16, ym21
+ pmaddubsw ym20, ym13, ym20
+ pmaddubsw ym16, ym13, ym16
+ vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ pmulhrsw ym20, ym9
+ pmulhrsw ym16, ym9
+ packsswb m21{k3}, m20, m16 ; k3 limits the write to the first 32 bytes (rows 0-1)
+%else
+ movu ym16, [grain_lutq+offxyq+82*0]
+ vinserti32x8 m16, [grain_lutq+offxyq+82*1], 1
+ movu ym20, [grain_lutq+top_offxyq+82*0]
+ vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1
+ punpcklbw m21, m20, m16 ; { top, cur } byte pairs
+ punpckhbw m20, m16
+ pmaddubsw m21, m13, m21
+ pmaddubsw m20, m13, m20
+ pmulhrsw m21, m9
+ pmulhrsw m20, m9
+ packsswb m21, m20
+%endif
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ punpcklbw m20, m5, m21
+ punpckhbw m21, m5
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2<<%2 ; advance by the rows consumed this iteration
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ punpcklbw m16, m17, m5 ; chroma
+ punpckhbw m17, m5
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7 ; clamp to [fg_min, fg_max] entries chosen in the prologue
+ pminub m16, m8
+%if %2 ; ss_hor
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+stride3q ], m16, 3
+%else
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+%endif
+ lea dstq, [dstq+strideq*(2<<%2)]
+ sub hb, 2<<%2
+ jg %%loop_y ; remaining rows need no vertical blend: continue in the plain row loop
+ add wq, 32>>%2 ; only reached when the blended rows were the whole block height
+ jge %%end
+ mov srcq, r11mp ; rebuild the row-0 pointers for the new column
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+
+%%hv_overlap: ; column start with both overlaps: blend left/top-left horizontally, then top/current vertically
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
+
+ lea topleft_offxyd, [top_offxyq+(32>>%2)] ; previous column's top offset, one column width further
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offset, one column width further
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0x000f000f
+ and offxd, 0x000f000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw ; split the packed offsets: low half = top, high half = current
+ shr offxyd, 16
+
+%if %2 ; ss_hor
+ movu xm21, [grain_lutq+offxyq+82*0] ; m21 = current grain rows
+ movd xm16, [grain_lutq+left_offxyq+82*0] ; m16 = 4 left grain bytes per row
+ vinserti128 ym21, [grain_lutq+offxyq+82*1], 1
+ vinserti128 ym16, [grain_lutq+left_offxyq+82*1], 1
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*3], 3
+ movd xm18, [grain_lutq+topleft_offxyq+82*0] ; m18 = top-left grain bytes
+ movu xm20, [grain_lutq+top_offxyq] ; m20 = top grain rows
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m16, m21 ; { left, cur } byte pairs
+%if %3 ; ss_ver: only one top row is needed
+ punpcklbw xm18, xm20
+%else
+ vinserti128 ym18, [grain_lutq+topleft_offxyq+82*1], 1
+ vinserti128 ym20, [grain_lutq+top_offxyq+82*1], 1
+ punpcklbw ym18, ym20
+%endif
+ punpcklqdq m16, m18 ; low qword = { left, cur } pairs, high qword = { topleft, top } pairs
+ pmaddubsw m16, m10, m16
+ pmulhrsw m16, m9
+ packsswb m16, m16 ; bytes 0-3 = blended cur, bytes 4-7 = blended top (per 16-byte lane)
+%if %3
+ vpalignr xm20{k1}, xm16, xm16, 4 ; rotate the blended top bytes down to bytes 0-3 and merge them into top
+%else
+ vpalignr ym20{k1}, ym16, ym16, 4 ; rotate the blended top bytes down to bytes 0-3 and merge them into top
+%endif
+ vmovdqu8 m21{k1}, m16 ; merge the blended cur bytes into the current rows
+%else
+ movu ym21, [grain_lutq+offxyq+82*0] ; m21 = current grain rows
+ vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1
+ movd xm16, [grain_lutq+left_offxyq+82*0] ; m16 = 4 left grain bytes per row
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*1], 2
+ movu ym20, [grain_lutq+top_offxyq+82*0] ; m20 = top grain rows
+ vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1
+ movd xm18, [grain_lutq+topleft_offxyq+82*0] ; m18 = top-left grain bytes
+ vinserti32x4 m18, [grain_lutq+topleft_offxyq+82*1], 2
+ punpcklbw m16, m21 ; { left, cur } byte pairs
+ punpcklbw m18, m20 ; { topleft, top } byte pairs
+ punpcklqdq m16, m18
+ pmaddubsw m16, m10, m16
+ pmulhrsw m16, m9
+ packsswb m16, m16 ; bytes 0-3 = blended cur, bytes 4-7 = blended top (per 16-byte lane)
+ vpalignr m20{k1}, m16, m16, 4 ; merge the blended top bytes into top
+ vmovdqu8 m21{k1}, m16 ; merge the blended cur bytes into the current rows
+%endif
+%if %2 ; ss_hor
+ mova ym18, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova ym16, [lumaq+lstrideq*(0<<%3)] ; m16 takes the second luma pair here because m20 still holds the top grain
+ vinserti32x8 m16, [lumaq+lstrideq*(1<<%3)], 1
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova xm17, [srcq+strideq*0] ; m17 = four 16-byte chroma rows
+ vinserti128 ym17, [srcq+strideq*1], 1
+ mova m19, m11
+ vpermi2b m19, m18, m16 ; luma bytes picked by pb_even
+ vpermt2b m18, m12, m16 ; luma bytes picked by pb_odd
+ vinserti32x4 m17, [srcq+strideq*2], 2
+ pavgb m18, m19 ; rounded average of the two picks
+ vinserti32x4 m17, [srcq+stride3q ], 3
+%else
+ mova ym18, [lumaq+lstrideq*0]
+ vinserti32x8 m18, [lumaq+lstrideq*1], 1
+ mova ym17, [srcq+strideq*0] ; m17 = two 32-byte chroma rows
+ vinserti32x8 m17, [srcq+strideq*1], 1
+ lea lumaq, [lumaq+lstrideq*2]
+%endif
+ lea srcq, [srcq+strideq*(2<<%2)]
+%if %1 ; not-csfl
+ punpckhbw m19, m18, m17
+ punpcklbw m18, m17 ; { luma, chroma }
+ pmaddubsw m19, m14 ; luma*uv_luma_mult + chroma*uv_mult
+ pmaddubsw m18, m14
+ psraw m19, 6
+ psraw m18, 6
+ paddw m19, m15 ; + uv_offset
+ paddw m18, m15
+ packuswb m18, m19
+%endif
+ mova m19, m0
+ vpmovb2m k2, m18 ; k2 = bytes whose top bit is set (index >= 128)
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+ ; followed by v interpolation (top | cur -> cur)
+%if %3 ; ss_ver: one blended row
+ punpcklbw xm16, xm20, xm21
+ punpckhbw xm20, xm21
+ pmaddubsw xm16, xm13, xm16
+ pmaddubsw xm20, xm13, xm20
+ pmulhrsw xm16, xm9
+ pmulhrsw xm20, xm9
+ vpacksswb m21{k3}, m16, m20 ; k3 keeps the rows below the blended ones untouched
+%elif %2 ; ss_hor only: two blended rows
+ punpcklbw ym16, ym20, ym21
+ punpckhbw ym20, ym21
+ pmaddubsw ym16, ym13, ym16
+ pmaddubsw ym20, ym13, ym20
+ pmulhrsw ym16, ym9
+ pmulhrsw ym20, ym9
+ vpacksswb m21{k3}, m16, m20 ; k3 keeps the rows below the blended ones untouched
+%else
+ punpcklbw m16, m20, m21
+ punpckhbw m20, m21
+ pmaddubsw m16, m13, m16
+ pmaddubsw m20, m13, m20
+ pmulhrsw m16, m9
+ pmulhrsw m20, m9
+ packsswb m21, m16, m20
+%endif
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ punpcklbw m20, m5, m21
+ punpckhbw m21, m5
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2<<%2 ; advance by the rows consumed this iteration
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ punpcklbw m16, m17, m5 ; chroma
+ punpckhbw m17, m5
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7 ; clamp to [fg_min, fg_max] entries chosen in the prologue
+ pminub m16, m8
+%if %2 ; ss_hor
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+stride3q ], m16, 3
+%else
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+%endif
+ lea dstq, [dstq+strideq*(2<<%2)]
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap ; remaining rows only need the horizontal blend
+ add wq, 32>>%2 ; only reached when the blended rows were the whole block height
+ jge %%end
+ mov srcq, r11mp ; rebuild the row-0 pointers for the new column
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ jmp %%hv_overlap
+%%end:
+ RET
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3 ; not-csfl instance: blends luma/chroma with uv_luma_mult/uv_mult/uv_offset before the scaling lookup
+.csfl: ; target of the prologue's jump when chroma_scaling_from_luma != 0
+ %%FGUV_32x32xN_LOOP 0, %2, %3 ; csfl instance: the scaling lookup is indexed by the luma value directly
+%endmacro
+
+FGUV_FN 420, 1, 1 ; ss_hor=1, ss_ver=1
+FGUV_FN 422, 1, 0 ; ss_hor=1, ss_ver=0
+FGUV_FN 444, 0, 0 ; ss_hor=0, ss_ver=0
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/filmgrain_common.asm b/third_party/dav1d/src/x86/filmgrain_common.asm
new file mode 100644
index 0000000000000..74f7044e666cb
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_common.asm
@@ -0,0 +1,46 @@
+; Copyright © 2019-2022, VideoLAN and dav1d authors
+; Copyright © 2019-2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+struc FGData ; field offsets of the film grain parameter block, used as [fg_dataq+FGData.<field>]
+ .seed: resd 1 ; XORed into the per-row seed by the fguv kernels
+ .num_y_points: resd 1
+ .y_points: resb 14 * 2
+ .chroma_scaling_from_luma: resd 1 ; only its low byte is tested (cmp byte ..., 0) by the fguv kernels
+ .num_uv_points: resd 2
+ .uv_points: resb 2 * 10 * 2
+ .scaling_shift: resd 1
+ .ar_coeff_lag: resd 1
+ .ar_coeffs_y: resb 24
+ .ar_coeffs_uv: resb 2 * 28 ; includes padding
+ .ar_coeff_shift: resq 1
+ .grain_scale_shift: resd 1
+ .uv_mult: resd 2 ; indexed with uv_pl*4 by the fguv kernels
+ .uv_luma_mult: resd 2 ; must stay 8 bytes after uv_mult: the avx512 fguv kernel takes both from one 16-byte load
+ .uv_offset: resd 2 ; indexed with uv_pl*4 by the fguv kernels
+ .overlap_flag: resd 1
+ .clip_to_restricted_range: resd 1
+endstruc ; NOTE(review): layout presumably has to mirror the C-side film grain data struct - confirm before changing either
+
+cextern gaussian_sequence
diff --git a/third_party/dav1d/src/x86/film_grain_init_tmpl.c b/third_party/dav1d/src/x86/filmgrain_init_tmpl.c
similarity index 67%
rename from third_party/dav1d/src/x86/film_grain_init_tmpl.c
rename to third_party/dav1d/src/x86/filmgrain_init_tmpl.c
index 606ea3cb56cff..0b783d10d3c45 100644
--- a/third_party/dav1d/src/x86/film_grain_init_tmpl.c
+++ b/third_party/dav1d/src/x86/filmgrain_init_tmpl.c
@@ -1,6 +1,6 @@
/*
- * Copyright © 2018-2021, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2018-2022, VideoLAN and dav1d authors
+ * Copyright © 2018-2022, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -26,25 +26,21 @@
*/
#include "src/cpu.h"
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
-decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ssse3));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ssse3));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ssse3));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ssse3));
-decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ssse3));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ssse3));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ssse3));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ssse3));
+#define decl_fg_fns(ext) \
+decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \
+decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext)) /* declares all eight film grain entry points for one ISA suffix; no trailing ';' - the use site supplies it, e.g. decl_fg_fns(avx2); */
-decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, avx2));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, avx2));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, avx2));
-decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, avx2));
-decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, avx2));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, avx2));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, avx2));
-decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, avx2));
+decl_fg_fns(ssse3);
+decl_fg_fns(avx2);
+decl_fg_fns(avx512icl);
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -68,11 +64,20 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
- if (flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER) return;
+ if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
+ }
- c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
- c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
- c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
- c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
+#if BITDEPTH == 8
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl);
+#endif
#endif
}
diff --git a/third_party/dav1d/src/x86/film_grain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm
similarity index 99%
rename from third_party/dav1d/src/x86/film_grain_sse.asm
rename to third_party/dav1d/src/x86/filmgrain_sse.asm
index 20334591a93db..0172f987607a7 100644
--- a/third_party/dav1d/src/x86/film_grain_sse.asm
+++ b/third_party/dav1d/src/x86/filmgrain_sse.asm
@@ -25,6 +25,7 @@
%include "config.asm"
%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
SECTION_RODATA
@@ -66,38 +67,8 @@ JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
-struc FGData
- .seed: resd 1
- .num_y_points: resd 1
- .y_points: resb 14 * 2
- .chroma_scaling_from_luma: resd 1
- .num_uv_points: resd 2
- .uv_points: resb 2 * 10 * 2
- .scaling_shift: resd 1
- .ar_coeff_lag: resd 1
- .ar_coeffs_y: resb 24
- .ar_coeffs_uv: resb 2 * 28 ; includes padding
- .ar_coeff_shift: resq 1
- .grain_scale_shift: resd 1
- .uv_mult: resd 2
- .uv_luma_mult: resd 2
- .uv_offset: resd 2
- .overlap_flag: resd 1
- .clip_to_restricted_range: resd 1
-endstruc
-
-cextern gaussian_sequence
-
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%if ARCH_X86_32
%define PIC_ptr(a) base+a
%else
diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm
index e6d4faddee8ae..72300c2a4cd07 100644
--- a/third_party/dav1d/src/x86/ipred16_avx2.asm
+++ b/third_party/dav1d/src/x86/ipred16_avx2.asm
@@ -26,7 +26,7 @@
%include "config.asm"
%include "ext/x86/x86inc.asm"
-SECTION_RODATA 32
+SECTION_RODATA 64
%macro SMOOTH_WEIGHTS 1-*
const smooth_weights_1d_16bpc ; sm_weights[] << 7
@@ -134,14 +134,6 @@ cextern filter_intra_taps
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
INIT_YMM avx2
cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
movifnidn hd, hm
diff --git a/third_party/dav1d/src/x86/ipred16_avx512.asm b/third_party/dav1d/src/x86/ipred16_avx512.asm
new file mode 100644
index 0000000000000..4a1b060bd5f67
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_avx512.asm
@@ -0,0 +1,833 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+ipred_shuf: db 14, 15, 14, 15, 0, 1, 2, 3, 6, 7, 6, 7, 0, 1, 2, 3 ; pshufb controls: even dwords (read with movsldup) broadcast one word per qword, odd dwords (movshdup) replicate one dword per lane
+ db 10, 11, 10, 11, 8, 9, 10, 11, 2, 3, 2, 3, 8, 9, 10, 11 ; after movsldup the four lanes select words 7|3, 5|1, 6|2, 4|0 (low|high qword)
+ db 12, 13, 12, 13, 4, 5, 6, 7, 4, 5, 4, 5, 4, 5, 6, 7
+ db 8, 9, 8, 9, 12, 13, 14, 15, 0, 1, 0, 1, 12, 13, 14, 15
+smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 ; vpermb/vpermt2b control picking bytes 1-2 of every dword, i.e. (dword >> 8) as a word
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+ db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 ; indices 64..127 address the second source register of vpermt2b
+ db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
+pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 ; vpermb control pairing byte i (low byte) with byte i+32 (high byte) in each word
+ db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+ db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
+ db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5 ; vpermb gather patterns for ipred_filter; the two 8-byte rows are split apart with movsldup/movshdup
+ times 4 db 10, 11, 12, 13, 2, 3, -1, -1
+filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7 ; second gather pattern, applied to m0 in .main8 and .end
+ times 4 db 26, 27, 28, 29, 14, 15, -1, -1
+filter_permC: dd 8 ; vpermt2q indices, logically "dq 8, 10, 1, 11, 0, 9": each index is the low dword of a qword, the high dwords are reused for the constants interleaved below
+pw_1: times 2 dw 1
+ dd 10
+filter_rnd: dd 32 ; [filter_rnd+0*8]: rounding constant loaded when is_12bpc == 0
+ dd 1
+ dd 8 ; [filter_rnd+1*8]: rounding constant loaded when is_12bpc == 1
+ dd 11
+filter_shift: times 2 dw 6 ; [filter_shift+0*8]: per-word right shift when is_12bpc == 0 (the taps were pre-shifted left by 2)
+ dd 0
+ times 2 dw 4 ; [filter_shift+1*8]: per-word right shift when is_12bpc == 1
+ dd 9
+
+%macro JMP_TABLE 3-* ; function name, ISA suffix, labels...: emits <name>_<suffix>_table, biased by -2 entries so that tzcnt(width) indexes it directly (w4 -> index 2)
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4) ; 32-bit offset of the function-local label relative to the biased table symbol
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64
+
+cextern smooth_weights_1d_16bpc
+cextern smooth_weights_2d_16bpc
+cextern filter_intra_taps
+
+SECTION .text
+
+%macro PAETH 3 ; top, signed_ldiff (top - topleft), ldiff (its absolute value); expects left in m2 and topleft in m3, returns the prediction in m0, clobbers m1/k1/k2
+ paddw m0, m%2, m2 ; base = left + top - topleft
+ psubw m1, m0, m3 ; tldiff (base - topleft, made absolute below)
+ psubw m0, m%1 ; tdiff (base - top, made absolute below)
+ pabsw m1, m1
+ pabsw m0, m0
+ pcmpgtw k1, m0, m1 ; k1 = tdiff > tldiff
+ pminsw m0, m1 ; min(tdiff, tldiff)
+ pcmpgtw k2, m%3, m0 ; k2 = ldiff > min(tdiff, tldiff)
+ vpblendmw m0{k1}, m%1, m3 ; topleft where k1 is set, top elsewhere
+ vpblendmw m0{k2}, m2, m0 ; keep the top/topleft choice where k2 is set, left elsewhere
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h ; Paeth intra prediction on 16-bit pixels; one code path per block width, selected through the jump table
+%define base r6-ipred_paeth_16bpc_avx512icl_table
+ lea r6, [ipred_paeth_16bpc_avx512icl_table]
+ tzcnt wd, wm ; log2(width) = jump table index
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4] ; signed 32-bit offset of the .wN label
+ vpbroadcastw m3, [tlq] ; topleft
+ add wq, r6 ; offset + table address = jump target
+ jmp wq
+.w4:
+ vpbroadcastq m4, [tlq+2] ; top
+ movsldup m7, [base+ipred_shuf]
+ lea r6, [strideq*3] ; r6 is free again once the jump has been taken
+ psubw m5, m4, m3 ; top - topleft (signed ldiff)
+ pabsw m6, m5 ; ldiff
+.w4_loop:
+ sub tlq, 16 ; step back over eight left pixels
+ vbroadcasti32x4 m2, [tlq]
+ pshufb m2, m7 ; left
+ PAETH 4, 5, 6
+ vextracti32x4 xmm1, m0, 2
+ vextracti32x4 xmm2, ym0, 1
+ vextracti32x4 xmm3, m0, 3
+ movq [dstq+strideq*0], xm0 ; low qwords hold the first four rows
+ movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*2], xmm2
+ movq [dstq+r6 ], xmm3
+ sub hd, 8
+ jl .w4_end ; fewer than eight rows were left: skip the second group of four
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0 ; high qwords hold the next four rows
+ movhps [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xmm2
+ movhps [dstq+r6 ], xmm3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop ; flags are still those of "sub hd, 8" (lea/movhps leave them untouched)
+.w4_end:
+ RET
+.w8:
+ vbroadcasti32x4 m4, [tlq+2] ; top (8 pixels in every 128-bit lane)
+ movsldup m7, [base+ipred_shuf]
+ lea r6, [strideq*3]
+ psubw m5, m4, m3 ; top - topleft
+ pabsw m6, m5 ; ldiff
+.w8_loop:
+ sub tlq, 8 ; four left pixels -> four rows per iteration
+ vpbroadcastq m2, [tlq]
+ pshufb m2, m7 ; one left pixel broadcast per 128-bit lane
+ PAETH 4, 5, 6
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+r6 ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x8 m4, [tlq+2] ; top (16 pixels in both 256-bit halves)
+ movsldup m7, [base+ipred_shuf]
+ psubw m5, m4, m3 ; top - topleft
+ pabsw m6, m5 ; ldiff
+.w16_loop:
+ sub tlq, 4 ; two left pixels -> two rows per iteration
+ vpbroadcastd m2, [tlq]
+ pshufb m2, m7 ; one left pixel per 256-bit half
+ PAETH 4, 5, 6
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+ movu m4, [tlq+2] ; top (32 pixels)
+ psubw m5, m4, m3 ; top - topleft
+ pabsw m6, m5 ; ldiff
+.w32_loop:
+ sub tlq, 2 ; one left pixel -> one row per iteration
+ vpbroadcastw m2, [tlq]
+ PAETH 4, 5, 6
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+.w64:
+ movu m4, [tlq+ 2] ; top, pixels 0-31
+ movu m7, [tlq+66] ; top, pixels 32-63
+ psubw m5, m4, m3
+ psubw m8, m7, m3
+ pabsw m6, m5
+ pabsw m9, m8
+.w64_loop:
+ sub tlq, 2 ; one left pixel -> one row per iteration
+ vpbroadcastw m2, [tlq]
+ PAETH 4, 5, 6
+ mova [dstq+64*0], m0 ; left half of the row
+ PAETH 7, 8, 9
+ mova [dstq+64*1], m0 ; right half of the row
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 ; vertical smooth prediction: every row is bottom + pmulhrsw(top - bottom, weight[row])
+%define base r6-$$
+ lea r6, [$$]
+ tzcnt wd, wm ; log2(width) = jump table index
+ mov hd, hm
+ movsxd wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4]
+ lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] ; indexed below with hq*2 where hq runs from -h to 0, i.e. from table + h*2 bytes upward
+ neg hq ; hq counts up from -h towards zero
+ vpbroadcastw m6, [tlq+hq*2] ; bottom
+ lea wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq]
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vpbroadcastq m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w4_loop:
+ vbroadcasti32x4 m3, [weightsq+hq*2] ; eight row weights
+ pshufb m3, m4 ; one weight broadcast per qword
+ pmulhrsw m3, m5
+ paddw m3, m6
+ vextracti32x4 xmm0, m3, 3
+ vextracti32x4 xmm1, ym3, 1
+ vextracti32x4 xmm2, m3, 2
+ movhps [dstq+strideq*0], xmm0 ; high qwords hold the first four rows
+ movhps [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xmm2
+ movhps [dstq+stride3q ], xm3
+ add hq, 8
+ jg .end ; fewer than eight rows were left: skip the second group of four
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xmm0 ; low qwords hold the next four rows
+ movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*2], xmm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ jl .w4_loop ; flags are still those of "add hq, 8"
+.end:
+ RET
+.w8:
+ vbroadcasti32x4 m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w8_loop:
+ vpbroadcastq m0, [weightsq+hq*2] ; four row weights
+ pshufb m0, m4 ; one weight per 128-bit lane
+ pmulhrsw m0, m5
+ paddw m0, m6
+ vextracti32x4 [dstq+strideq*0], m0, 3
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+.w16:
+ vbroadcasti32x8 m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w16_loop:
+ vpbroadcastd m0, [weightsq+hq*2+0] ; weights for rows 0-1
+ vpbroadcastd m1, [weightsq+hq*2+4] ; weights for rows 2-3
+ pshufb m0, m4 ; one weight per 256-bit half
+ pshufb m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ paddw m0, m6
+ paddw m1, m6
+ vextracti32x8 [dstq+strideq*0], m0, 1
+ mova [dstq+strideq*1], ym0
+ vextracti32x8 [dstq+strideq*2], m1, 1
+ mova [dstq+stride3q ], ym1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w16_loop
+ RET
+.w32:
+ movu m5, [tlq+2] ; top
+ psubw m5, m6 ; top - bottom
+.w32_loop:
+ vpbroadcastw m0, [weightsq+hq*2+0] ; one weight per row, four rows per iteration
+ vpbroadcastw m1, [weightsq+hq*2+2]
+ vpbroadcastw m2, [weightsq+hq*2+4]
+ vpbroadcastw m3, [weightsq+hq*2+6]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w32_loop
+ RET
+.w64:
+ movu m4, [tlq+ 2] ; top, pixels 0-31
+ movu m5, [tlq+66] ; top, pixels 32-63
+ psubw m4, m6 ; top - bottom
+ psubw m5, m6
+.w64_loop:
+ vpbroadcastw m1, [weightsq+hq*2+0] ; weight of the first row
+ vpbroadcastw m3, [weightsq+hq*2+2] ; weight of the second row
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m5
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m5
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ mova [dstq+strideq*1+64*0], m2
+ mova [dstq+strideq*1+64*1], m3
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w64_loop
+ RET
+
+cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3 ; horizontal smooth prediction: every pixel is right + pmulhrsw(left[row] - right, weight[column])
+ lea r6, [$$] ; "base" (r6-$$) is still defined from ipred_smooth_v_16bpc above
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m6, [tlq+wq*2] ; right
+ tzcnt wd, wd ; log2(width) = jump table index
+ add hd, hd ; h*2 = size of the left column in bytes
+ movsxd wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4]
+ sub tlq, hq ; tlq now points h pixels below the original tl; rows are addressed as tlq+hq-N
+ lea stride3q, [strideq*3]
+ lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
+ jmp wq
+.w4:
+ movsldup m4, [base+ipred_shuf]
+ vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2] ; the four column weights for w == 4
+.w4_loop:
+ vbroadcasti32x4 m0, [tlq+hq-16] ; left
+ pshufb m0, m4 ; one left pixel broadcast per qword
+ psubw m0, m6 ; left - right
+ pmulhrsw m0, m5
+ paddw m0, m6
+ vextracti32x4 xmm1, m0, 2
+ vextracti32x4 xmm2, ym0, 1
+ vextracti32x4 xmm3, m0, 3
+ movq [dstq+strideq*0], xm0 ; low qwords hold the first four rows
+ movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*2], xmm2
+ movq [dstq+stride3q ], xmm3
+ sub hd, 8*2
+ jl .end ; fewer than eight rows were left: skip the second group of four
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0 ; high qwords hold the next four rows
+ movhps [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xmm2
+ movhps [dstq+stride3q ], xmm3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop ; flags are still those of "sub hd, 8*2"
+.end:
+ RET
+.w8:
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2] ; the eight column weights for w == 8
+.w8_loop:
+ vpbroadcastq m0, [tlq+hq-8] ; left
+ pshufb m0, m4 ; one left pixel per 128-bit lane
+ psubw m0, m6 ; left - right
+ pmulhrsw m0, m5
+ paddw m0, m6
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w8_loop
+ RET
+.w16:
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2] ; the sixteen column weights for w == 16
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4] ; left pixels of rows 0-1
+ vpbroadcastd m1, [tlq+hq-8] ; left pixels of rows 2-3
+ pshufb m0, m4 ; one left pixel per 256-bit half
+ pshufb m1, m4
+ psubw m0, m6 ; left - right
+ psubw m1, m6
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ paddw m0, m6
+ paddw m1, m6
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w16_loop
+ RET
+.w32:
+ movu m5, [base+smooth_weights_1d_16bpc+32*2] ; the 32 column weights for w == 32
+.w32_loop:
+ vpbroadcastq m3, [tlq+hq-8] ; four left pixels
+ punpcklwd m3, m3 ; duplicate each pixel so that every dword holds one pixel twice
+ psubw m3, m6 ; left - right
+ pshufd m0, m3, q3333 ; the pixel at the highest address belongs to the first row
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w32_loop
+ RET
+.w64:
+ movu m4, [base+smooth_weights_1d_16bpc+64*2] ; column weights 0-31 for w == 64
+ movu m5, [base+smooth_weights_1d_16bpc+64*3] ; column weights 32-63
+.w64_loop:
+ vpbroadcastw m1, [tlq+hq-2] ; left pixel of the first row
+ vpbroadcastw m3, [tlq+hq-4] ; left pixel of the second row
+ psubw m1, m6 ; left - right
+ psubw m3, m6
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m5
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m5
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ mova [dstq+strideq*1+64*0], m2
+ mova [dstq+strideq*1+64*1], m3
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3 ; 2-D smooth prediction: dword sums of (top,bottom)*v_weights + (left,right)*h_weights, then a rounded shift
+ lea r6, [$$] ; "base" (r6-$$) is still defined from ipred_smooth_v_16bpc above
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m13, [tlq+wq*2] ; right
+ tzcnt wd, wd ; log2(width) = jump table index
+ add hd, hd ; h*2 = size of the left column in bytes
+ movsxd wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4]
+ mov r5d, 0x55555555 ; bit pattern selecting every even word
+ sub tlq, hq ; tlq now points h pixels below the original tl
+ mova m14, [base+smooth_perm]
+ kmovd k1, r5d ; k1: word mask, even words
+ vpbroadcastw m0, [tlq] ; bottom
+ mov r5, 0x3333333333333333 ; same selection expressed as a byte mask (both bytes of every even word)
+ pxor m15, m15 ; zero operand for the pavgw rounding step
+ lea wq, [base+ipred_smooth_16bpc_avx512icl_table+wq]
+ kmovq k2, r5 ; k2: byte mask, even words
+ lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2] ; hq is 2*h here, so this is table + h*4 bytes
+ jmp wq
+.w4:
+ vpbroadcastq m5, [tlq+hq+2] ; top (tlq+hq is the original tl)
+ movshdup m3, [base+ipred_shuf] ; dword-replicating shuffle for the vertical weights
+ movsldup m4, [base+ipred_shuf] ; word-broadcasting shuffle for the left pixels
+ vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4] ; horizontal weight pairs for w == 4
+ lea stride3q, [strideq*3]
+ punpcklwd m5, m0 ; top, bottom
+.w4_loop:
+ vbroadcasti32x4 m0, [v_weightsq] ; vertical weight pairs of four rows
+ vpbroadcastq m2, [tlq+hq-8] ; four left pixels
+ mova m1, m13
+ pshufb m0, m3
+ pmaddwd m0, m5 ; vertical contribution as dword sums
+ pshufb m1{k2}, m2, m4 ; left, right
+ vpdpwssd m0, m1, m6 ; accumulate the horizontal contribution
+ vpermb m0, m14, m0 ; keep bytes 1-2 of each dword sum (>> 8)
+ pavgw ym0, ym15 ; (x + 1) >> 1 against zero completes the rounded shift
+ vextracti32x4 xmm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xmm1
+ lea dstq, [dstq+strideq*4]
+ add v_weightsq, 4*4
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8:
+ vbroadcasti32x4 ym5, [tlq+hq+2] ; top
+ movshdup m6, [base+ipred_shuf]
+ movsldup m7, [base+ipred_shuf]
+ pmovzxwd m5, ym5 ; spread the top pixels into the even words
+ vbroadcasti32x8 m8, [base+smooth_weights_2d_16bpc+8*4] ; horizontal weight pairs for w == 8
+ lea stride3q, [strideq*3]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+.w8_loop:
+ vpbroadcastq m0, [v_weightsq+0] ; vertical weight pairs of rows 0-1
+ vpbroadcastq m1, [v_weightsq+8] ; vertical weight pairs of rows 2-3
+ vpbroadcastd m3, [tlq+hq-4] ; left pixels of rows 0-1
+ vpbroadcastd m4, [tlq+hq-8] ; left pixels of rows 2-3
+ pshufb m0, m6
+ pmaddwd m0, m5
+ pshufb m1, m6
+ pmaddwd m1, m5
+ mova m2, m13
+ pshufb m2{k2}, m3, m7 ; left, right
+ mova m3, m13
+ pshufb m3{k2}, m4, m7
+ vpdpwssd m0, m2, m8
+ vpdpwssd m1, m3, m8
+ add v_weightsq, 4*4
+ vpermt2b m0, m14, m1 ; bytes 1-2 of every dword sum from both registers (>> 8)
+ pavgw m0, m15 ; rounding >> 1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w8_loop
+ RET
+.w16:
+ pmovzxwd m5, [tlq+hq+2] ; top pixels spread into the even words
+ mova m6, [base+smooth_weights_2d_16bpc+16*4] ; horizontal weight pairs for w == 16
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+.w16_loop:
+ vpbroadcastd m0, [v_weightsq+0] ; vertical weight pair of the first row
+ vpbroadcastd m1, [v_weightsq+4] ; vertical weight pair of the second row
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ mova m2, m13
+ vpbroadcastw m2{k1}, [tlq+hq-2] ; left, right
+ mova m3, m13
+ vpbroadcastw m3{k1}, [tlq+hq-4]
+ vpdpwssd m0, m2, m6
+ vpdpwssd m1, m3, m6
+ add v_weightsq, 2*4
+ vpermt2b m0, m14, m1 ; bytes 1-2 of every dword sum from both rows (>> 8)
+ pavgw m0, m15 ; rounding >> 1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxwd m5, [tlq+hq+ 2] ; top pixels 0-15 spread into the even words
+ pmovzxwd m6, [tlq+hq+34] ; top pixels 16-31
+ mova m7, [base+smooth_weights_2d_16bpc+32*4] ; horizontal weight pairs, columns 0-15
+ mova m8, [base+smooth_weights_2d_16bpc+32*6] ; horizontal weight pairs, columns 16-31
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+ vpblendmw m6{k1}, m0, m6
+.w32_loop:
+ vpbroadcastd m2, [v_weightsq+0] ; vertical weight pair of the first row
+ vpbroadcastd m3, [v_weightsq+4] ; vertical weight pair of the second row
+ pmaddwd m0, m5, m2
+ pmaddwd m2, m6
+ pmaddwd m1, m5, m3
+ pmaddwd m3, m6
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
+ vpdpwssd m0, m4, m7
+ vpdpwssd m2, m4, m8
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-4]
+ vpdpwssd m1, m4, m7
+ vpdpwssd m3, m4, m8
+ add v_weightsq, 2*4
+ vpermt2b m0, m14, m2 ; merge both halves of the first row (>> 8)
+ vpermt2b m1, m14, m3 ; merge both halves of the second row
+ pavgw m0, m15 ; rounding >> 1
+ pavgw m1, m15
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxwd m5, [tlq+hq+ 2] ; top pixels in four groups of 16, spread into the even words
+ pmovzxwd m6, [tlq+hq+34]
+ pmovzxwd m7, [tlq+hq+66]
+ pmovzxwd m8, [tlq+hq+98]
+ mova m9, [base+smooth_weights_2d_16bpc+64*4] ; horizontal weight pairs, 16 columns per register (m9-m12)
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+ mova m10, [base+smooth_weights_2d_16bpc+64*5]
+ vpblendmw m6{k1}, m0, m6
+ mova m11, [base+smooth_weights_2d_16bpc+64*6]
+ vpblendmw m7{k1}, m0, m7
+ mova m12, [base+smooth_weights_2d_16bpc+64*7]
+ vpblendmw m8{k1}, m0, m8
+.w64_loop:
+ vpbroadcastd m3, [v_weightsq] ; vertical weight pair of this row
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
+ pmaddwd m0, m5, m3
+ pmaddwd m2, m6, m3
+ pmaddwd m1, m7, m3
+ pmaddwd m3, m8
+ vpdpwssd m0, m4, m9
+ vpdpwssd m2, m4, m10
+ vpdpwssd m1, m4, m11
+ vpdpwssd m3, m4, m12
+ add v_weightsq, 1*4
+ vpermt2b m0, m14, m2 ; columns 0-31 (>> 8)
+ vpermt2b m1, m14, m3 ; columns 32-63
+ pavgw m0, m15 ; rounding >> 1
+ pavgw m1, m15
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ sub hd, 1*2
+ jg .w64_loop
+ RET
+
+cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 ; palette prediction: every byte at idx selects one 16-bit entry of the table at pal
+ lea r6, [pal_pred_16bpc_avx512icl_table]
+ tzcnt wd, wm ; log2(width) = jump table index
+ mova m2, [pal_pred_perm] ; only used by the w16/w32/w64 paths
+ movsxd wq, [r6+wq*4]
+ mova xm3, [palq] ; 16 bytes = eight 16-bit palette entries
+ movifnidn hd, hm
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ pmovzxbw ym0, [idxq] ; 16 indices (four rows of four) widened to words
+ add idxq, 16
+ vpermw ym0, ym0, ym3 ; palette lookup
+ vextracti32x4 xmm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xmm1
+ movhps [dstq+stride3q ], xmm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ pmovzxbw m0, [idxq] ; 32 indices (four rows of eight) widened to words
+ add idxq, 32
+ vpermw m0, m0, m3 ; palette lookup
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ vpermb m1, m2, [idxq] ; 64 indices; each word now holds idx[i] (low byte) and idx[i+32] (high byte)
+ add idxq, 64
+ vpermw m0, m1, m3 ; lookup for indices 0-31 (vpermw only uses the low index bits of each word)
+ psrlw m1, 8 ; move indices 32-63 into the low bytes
+ vpermw m1, m1, m3 ; lookup for indices 32-63
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ vpermb m1, m2, [idxq] ; 64 indices = two rows of 32
+ add idxq, 64
+ vpermw m0, m1, m3 ; first row
+ psrlw m1, 8
+ vpermw m1, m1, m3 ; second row
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+.w64:
+ vpermb m1, m2, [idxq] ; 64 indices = one row
+ add idxq, 64
+ vpermw m0, m1, m3 ; columns 0-31
+ psrlw m1, 8
+ vpermw m1, m1, m3 ; columns 32-63
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order which
+; increases parallelism compared to doing things row by row.
+; w4 | w8  | w16     | w32
+; 1  | 1 2 | 1 2 5 6 | 1 2 5 6 9 a d e
+; 2  | 2 3 | 2 3 6 7 | 2 3 6 7 a b e f
+; 3  | 3 4 | 3 4 7 8 | 3 4 7 8 b c f g
+; 4  | 4 5 | 4 5 8 9 | 4 5 8 9 c d g h
+
+cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top ; filter intra prediction; w4 runs one 4x2 block per step, wider blocks run two per step (.main8)
+%define base r6-$$
+ lea r6, [$$]
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6 ; filter index * 64 = byte offset into filter_intra_taps
+ movifnidn hd, hm
+ movu xm0, [tlq-6] ; initial neighbor pixels around the top-left corner
+ pmovsxbw m7, [base+filter_intra_taps+filterq+32*0] ; first 32 taps, sign-extended to words
+ pmovsxbw m8, [base+filter_intra_taps+filterq+32*1] ; second 32 taps
+ mov r5d, r8m ; bitdepth_max
+ movsldup m9, [base+filter_permA]
+ movshdup m10, [base+filter_permA]
+ shr r5d, 11 ; is_12bpc
+ jnz .12bpc
+ psllw m7, 2 ; upshift multipliers so that packusdw
+ psllw m8, 2 ; will perform clipping for free
+.12bpc:
+ vpbroadcastd m5, [base+filter_rnd+r5*8] ; rounding constant matching the tap scale chosen above
+ vpbroadcastd m6, [base+filter_shift+r5*8] ; per-word right shift matching the tap scale
+ sub wd, 8
+ jl .w4 ; width 4 takes the simple one-block-per-step path
+.w8:
+ call .main4 ; first 4x2 block of the first column
+ movsldup m11, [filter_permB]
+ lea r5d, [hq*2+2]
+ movshdup m12, [filter_permB]
+ lea topq, [tlq+2]
+ mova m13, [filter_permC]
+ sub hd, 4
+ vinserti32x4 ym0, [topq], 1 ; a0 b0 t0 t1
+ sub tlq, r5 ; rebase tlq so that the left pixels are addressed as tlq+hq*2 in the loop
+%if WIN64
+ push r7 ; r7/r8 lie outside the 7 GPRs requested from cglobal, so they are saved by hand on WIN64
+ push r8
+%endif
+ mov r7, dstq ; remember the top of the current 8-pixel wide column
+ mov r8d, hd ; and the row counter, both restored for every further column
+.w8_loop:
+ movlps xm4, xm0, [tlq+hq*2] ; merge the next left pixels into the low half
+ call .main8
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jge .w8_loop
+ test wd, wd ; wd == 0 here means the block is only 8 pixels wide
+ jz .end
+ mov r2d, 0x0d ; r2 (tl) is reused as scratch: further columns take their left neighbors from dst
+ kmovb k1, r2d
+ lea r2, [strideq*3]
+.w16:
+ movd xmm0, [r7+strideq*1+12] ; pixels already written by the previous column serve as left neighbors
+ vpblendd xmm0, [topq+8], 0x0e ; t1 t2
+ pinsrw xm4, xmm0, [r7+strideq*0+14], 2
+ call .main8
+ add r7, 16 ; advance to the next 8-pixel wide column
+ vinserti32x4 ym0, [topq+16], 1 ; a2 b2 t2 t3
+ mov hd, r8d ; restart the row counter
+ mov dstq, r7
+ add topq, 16
+.w16_loop:
+ movd xmm1, [dstq+strideq*2-4] ; left neighbors read back from the previous column
+ punpcklwd xm4, xmm1, xmm0
+ movd xmm0, [dstq+r2-4]
+ shufps xm4{k1}, xmm0, xm0, q3210
+ call .main8
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jge .w16_loop
+ sub wd, 8
+ jg .w16
+.end:
+ vpermb m2, m11, m0 ; tail step: same arithmetic as the m0/filter_permB half of .main8
+ mova ym1, ym5 ; start the accumulator from the rounding constant
+ vpdpwssd m1, m2, m7
+ vpermb m2, m12, m0
+ vpdpwssd m1, m2, m8
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ vextracti32x8 ym2, m1, 1
+ paddd ym1, ym2 ; add the two partial sums
+ packusdw ym1, ym1 ; clip
+ vpsrlvw ym1, ym6
+ vpermt2q m0, m13, m1
+ vextracti32x4 [dstq+strideq*0], m0, 2
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ RET
+.w4_loop:
+ movlps xm0, [tlq-10] ; next left pixels
+ lea dstq, [dstq+strideq*2]
+ sub tlq, 4 ; two rows consumed per step
+.w4:
+ call .main4
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.main4:
+ vpermb m2, m9, m0 ; gather neighbor pixels for the first tap set; result of this routine is one 4x2 block in xm0
+ mova ym1, ym5 ; start the accumulator from the rounding constant
+ vpdpwssd m1, m2, m7
+ vpermb m0, m10, m0 ; gather neighbor pixels for the second tap set
+ vpdpwssd m1, m0, m8
+ vextracti32x8 ym0, m1, 1
+ paddd ym0, ym1 ; add the two partial sums
+ vextracti32x4 xm1, ym0, 1
+ packusdw xm0, xm1 ; clip
+ vpsrlvw xm0, xm6
+ ret
+ALIGN function_align
+.main8:
+ vpermb m3, m11, m0 ; block fed from m0 (filter_permB pattern), accumulated in m2
+ mova ym2, ym5
+ vpdpwssd m2, m3, m7
+ vpermb m3, m9, m4 ; block fed from m4 (filter_permA pattern), accumulated in m1
+ mova ym1, ym5
+ vpdpwssd m1, m3, m7
+ vpermb m3, m12, m0
+ vpdpwssd m2, m3, m8
+ vpermb m3, m10, m4
+ vpdpwssd m1, m3, m8
+ vextracti32x8 ym4, m2, 1
+ vextracti32x8 ym3, m1, 1
+ paddd ym2, ym4 ; add the partial sums of each block
+ paddd ym1, ym3
+ packusdw ym1, ym2 ; clip
+ vpsrlvw ym1, ym6
+ vpermt2q m0, m13, m1 ; c0 d0 b0 b1 a0 a1
+ vextracti32x4 [dstq+strideq*0], m0, 2 ; stores two rows at dstq; m0 also carries state into the next call
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ ret
+
+%endif
diff --git a/third_party/dav1d/src/x86/ipred16_sse.asm b/third_party/dav1d/src/x86/ipred16_sse.asm
index eaa56b67bcf05..07ea9567e1ab9 100644
--- a/third_party/dav1d/src/x86/ipred16_sse.asm
+++ b/third_party/dav1d/src/x86/ipred16_sse.asm
@@ -70,14 +70,6 @@ cextern filter_intra_taps
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
INIT_XMM ssse3
cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
LEA r5, ipred_dc_left_16bpc_ssse3_table
diff --git a/third_party/dav1d/src/x86/ipred_init_tmpl.c b/third_party/dav1d/src/x86/ipred_init_tmpl.c
index 3f1a3493c2551..0ba0a41088001 100644
--- a/third_party/dav1d/src/x86/ipred_init_tmpl.c
+++ b/third_party/dav1d/src/x86/ipred_init_tmpl.c
@@ -134,6 +134,7 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl);
init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
+#endif
init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
@@ -142,5 +143,4 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
c->pal_pred = BF(dav1d_pal_pred, avx512icl);
#endif
-#endif
}
diff --git a/third_party/dav1d/src/x86/itx16_avx2.asm b/third_party/dav1d/src/x86/itx16_avx2.asm
index 071ecbc33a268..c580944c7bbf5 100644
--- a/third_party/dav1d/src/x86/itx16_avx2.asm
+++ b/third_party/dav1d/src/x86/itx16_avx2.asm
@@ -145,14 +145,6 @@ cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
%macro WRAP_XMM 1+
diff --git a/third_party/dav1d/src/x86/itx16_sse.asm b/third_party/dav1d/src/x86/itx16_sse.asm
index fa8724691f3ae..4fb30ef4e7a6f 100644
--- a/third_party/dav1d/src/x86/itx16_sse.asm
+++ b/third_party/dav1d/src/x86/itx16_sse.asm
@@ -174,14 +174,6 @@ tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
%define m(x) m_suffix(x, SUFFIX)
diff --git a/third_party/dav1d/src/x86/itx_avx2.asm b/third_party/dav1d/src/x86/itx_avx2.asm
index 9cd66443482e8..092c842786dfe 100644
--- a/third_party/dav1d/src/x86/itx_avx2.asm
+++ b/third_party/dav1d/src/x86/itx_avx2.asm
@@ -132,15 +132,6 @@ SECTION .text
; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
%define o_base deint_shuf + 128
%define o(x) (r6 - (o_base) + (x))
-
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
; flags: 1 = swap, 2 = interleave, 4: coef_regs
diff --git a/third_party/dav1d/src/x86/itx_avx512.asm b/third_party/dav1d/src/x86/itx_avx512.asm
index 32b68b1548367..7d01bccb4f52c 100644
--- a/third_party/dav1d/src/x86/itx_avx512.asm
+++ b/third_party/dav1d/src/x86/itx_avx512.asm
@@ -242,15 +242,6 @@ SECTION .text
%define o_base int8_permA+64*18
%define o(x) (r5 - (o_base) + (x))
-
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
diff --git a/third_party/dav1d/src/x86/itx_init_tmpl.c b/third_party/dav1d/src/x86/itx_init_tmpl.c
index 251d77e4fa1ee..467d38293209d 100644
--- a/third_party/dav1d/src/x86/itx_init_tmpl.c
+++ b/third_party/dav1d/src/x86/itx_init_tmpl.c
@@ -278,7 +278,27 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
if (bpc > 10) return;
-#if BITDEPTH == 16
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, avx2);
+ assign_itx16_fn(R, 4, 8, avx2);
+ assign_itx16_fn(R, 4, 16, avx2);
+ assign_itx16_fn(R, 8, 4, avx2);
+ assign_itx16_fn( , 8, 8, avx2);
+ assign_itx16_fn(R, 8, 16, avx2);
+ assign_itx2_fn (R, 8, 32, avx2);
+ assign_itx16_fn(R, 16, 4, avx2);
+ assign_itx16_fn(R, 16, 8, avx2);
+ assign_itx12_fn( , 16, 16, avx2);
+ assign_itx2_fn (R, 16, 32, avx2);
+ assign_itx1_fn (R, 16, 64, avx2);
+ assign_itx2_fn (R, 32, 8, avx2);
+ assign_itx2_fn (R, 32, 16, avx2);
+ assign_itx2_fn ( , 32, 32, avx2);
+ assign_itx1_fn (R, 32, 64, avx2);
+ assign_itx1_fn (R, 64, 16, avx2);
+ assign_itx1_fn (R, 64, 32, avx2);
+ assign_itx1_fn ( , 64, 64, avx2);
+#elif BITDEPTH == 16
assign_itx16_bpc_fn( , 4, 4, 10, avx2);
assign_itx16_bpc_fn(R, 4, 8, 10, avx2);
assign_itx16_bpc_fn(R, 4, 16, 10, avx2);
diff --git a/third_party/dav1d/src/x86/itx_sse.asm b/third_party/dav1d/src/x86/itx_sse.asm
index 7cbd9c3f3be59..ec7e3a52f4676 100644
--- a/third_party/dav1d/src/x86/itx_sse.asm
+++ b/third_party/dav1d/src/x86/itx_sse.asm
@@ -142,14 +142,6 @@ pw_m301x8: times 8 dw -301*8
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
%if ARCH_X86_64
@@ -2388,7 +2380,7 @@ INV_TXFM_8X16_FN identity, identity
cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+16*1, 32, 1
mov r3, tx2q
- lea tx2q, [o(m(iidentity_8x16_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
mova [rsp+gprsize+16*1], m6
jmp m(idct_8x8_internal_8bpc).pass1_end3
@@ -2400,7 +2392,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal_8bpc).pass1_end3
.pass2:
- lea tx2q, [o(m(iidentity_8x16_internal_8bpc).end1)]
+ lea tx2q, [o(.end1)]
.end:
mova [rsp+gprsize+16*0], m7
@@ -2456,7 +2448,7 @@ cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+16*1, 32, 1
call .main
mov r3, tx2q
- lea tx2q, [o(m(idct_16x8_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end:
@@ -2467,7 +2459,7 @@ cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass2:
- lea tx2q, [o(m(idct_16x8_internal_8bpc).end)]
+ lea tx2q, [o(.end)]
lea r3, [dstq+8]
jmp m(idct_8x8_internal_8bpc).pass2_main
@@ -2595,7 +2587,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
call .main
call .main_pass1_end
mov r3, tx2q
- lea tx2q, [o(m(iadst_16x8_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
jmp m(iadst_8x8_internal_8bpc).pass1_end
.pass1_end:
@@ -2606,7 +2598,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(iadst_8x8_internal_8bpc).pass1_end
.pass2:
- lea tx2q, [o(m(iadst_16x8_internal_8bpc).end)]
+ lea tx2q, [o(.end)]
lea r3, [dstq+8]
jmp m(iadst_8x8_internal_8bpc).pass2_main
@@ -2880,7 +2872,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
mov r3, tx2q
- lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
jmp m(iflipadst_8x8_internal_8bpc).pass1_end
.pass1_end:
@@ -2891,7 +2883,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(iflipadst_8x8_internal_8bpc).pass1_end
.pass2:
- lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).end)]
+ lea tx2q, [o(.end)]
lea r3, [dstq+8]
jmp m(iflipadst_8x8_internal_8bpc).pass2_main
@@ -2914,7 +2906,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m6, [coeffq-16*3]
mova m7, [coeffq-16*1]
mov r3, tx2q
- lea tx2q, [o(m(iidentity_16x8_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
.pass1:
mova m0, [o(pw_2896x8)]
@@ -2972,7 +2964,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp .pass1
.pass2:
- lea tx2q, [o(m(iidentity_16x8_internal_8bpc).end)]
+ lea tx2q, [o(.end)]
lea r3, [dstq+8]
jmp m(iidentity_8x8_internal_8bpc).end
@@ -3010,7 +3002,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+16*3, 64
call m(idct_16x8_internal_8bpc).main
mov r3, tx2q
- lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
mova m7, [o(pw_8192)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
@@ -3018,7 +3010,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_8ROWS coeffq+16*17, 32
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
mova m7, [o(pw_8192)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
@@ -3029,7 +3021,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_7ROWS rsp+gprsize+16*3, 16
LOAD_8ROWS coeffq+16*2, 64
call m(idct_16x8_internal_8bpc).main
- lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end2)]
+ lea tx2q, [o(.pass1_end2)]
mova m7, [o(pw_8192)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
@@ -3042,13 +3034,13 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass2:
- lea tx2q, [o(m(idct_16x16_internal_8bpc).end)]
+ lea tx2q, [o(.end)]
jmp m(idct_8x16_internal_8bpc).pass2_pre
.end:
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_16x16_internal_8bpc).end1)]
+ lea tx2q, [o(.end1)]
mov dstq, r3
lea r3, [dstq+8]
jmp m(idct_8x8_internal_8bpc).end
@@ -3136,7 +3128,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
call m(iadst_16x8_internal_8bpc).main_pass1_end
mov r3, tx2q
- lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
mova m7, [o(pw_8192)]
jmp m(iadst_8x8_internal_8bpc).pass1_end1
@@ -3144,7 +3136,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_8ROWS coeffq+16*17, 32
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
mova m7, [o(pw_8192)]
jmp m(iadst_8x8_internal_8bpc).pass1_end1
@@ -3154,7 +3146,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
call m(iadst_16x8_internal_8bpc).main
call m(iadst_16x8_internal_8bpc).main_pass1_end
- lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end2)]
+ lea tx2q, [o(.pass1_end2)]
mova m7, [o(pw_8192)]
jmp m(iadst_8x8_internal_8bpc).pass1_end1
@@ -3167,13 +3159,13 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(iadst_8x8_internal_8bpc).pass1_end1
.pass2:
- lea tx2q, [o(m(iadst_16x16_internal_8bpc).end)]
+ lea tx2q, [o(.end)]
jmp m(iadst_8x16_internal_8bpc).pass2_pre
.end:
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(iadst_16x16_internal_8bpc).end1)]
+ lea tx2q, [o(.end1)]
mov dstq, r3
lea r3, [dstq+8]
jmp m(iadst_8x8_internal_8bpc).end
@@ -3211,7 +3203,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
call m(iadst_16x8_internal_8bpc).main_pass1_end
mov r3, tx2q
- lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
mova m7, [o(pw_m8192)]
jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
@@ -3219,7 +3211,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_8ROWS coeffq+16*1, 32
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
mova m7, [o(pw_m8192)]
jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
@@ -3233,7 +3225,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_8ROWS coeffq+16*0, 32
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end2)]
+ lea tx2q, [o(.pass1_end2)]
mova m7, [o(pw_m8192)]
jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
@@ -3246,14 +3238,14 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
.pass2:
- lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end)]
+ lea tx2q, [o(.end)]
lea r3, [dstq+8]
jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
.end:
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end1)]
+ lea tx2q, [o(.end1)]
lea dstq, [dstq+strideq*2]
jmp m(iflipadst_8x8_internal_8bpc).end
@@ -3276,7 +3268,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
mova [rsp+gprsize+16*5], m6
mova [rsp+gprsize+16*6], m7
- lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end2)]
+ lea tx2q, [o(.end2)]
mov dstq, r3
jmp m(iflipadst_8x16_internal_8bpc).pass2_main
@@ -3300,7 +3292,7 @@ INV_TXFM_16X16_FN identity, identity
cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
add coeffq, 16*17
mov r3, tx2q
- lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
.pass1:
mova m6, [o(pw_1697x16)]
@@ -3321,13 +3313,13 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end:
SAVE_8ROWS coeffq, 32
sub coeffq, 16
- lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
jmp .pass1
.pass1_end1:
SAVE_8ROWS coeffq, 32
sub coeffq, 15*16
- lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end2)]
+ lea tx2q, [o(.pass1_end2)]
jmp .pass1
.pass1_end2:
@@ -3338,7 +3330,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass2:
lea r3, [dstq+8]
- lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end1)]
+ lea tx2q, [o(.end1)]
.end:
mova [rsp+gprsize+16*0], m7
@@ -3361,7 +3353,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.end1:
LOAD_8ROWS coeffq+16*1, 32
- lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end2)]
+ lea tx2q, [o(.end2)]
lea dstq, [dstq+strideq*2]
jmp .end
@@ -3371,7 +3363,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
add coeffq, 32*8
LOAD_8ROWS coeffq, 32
- lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end3)]
+ lea tx2q, [o(.end3)]
mov dstq, r3
jmp .end
@@ -3403,7 +3395,7 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
pshuflw m0, m0, q0000
punpcklwd m0, m0
mov r3d, 8
- lea tx2q, [o(m(inv_txfm_add_dct_dct_8x32_8bpc).end)]
+ lea tx2q, [o(.end)]
jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
.end:
@@ -3412,14 +3404,13 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
- %undef cmp
cmp eobd, 106
jle .fast
LOAD_8ROWS coeffq+16*3, 64
call m(idct_8x8_internal_8bpc).main
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1)]
+ lea tx2q, [o(.pass1)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1:
@@ -3434,7 +3425,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+16*2, 64
call m(idct_8x8_internal_8bpc).main
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_1)]
+ lea tx2q, [o(.pass1_1)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_1:
@@ -3451,7 +3442,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+16*1, 64
call m(idct_8x8_internal_8bpc).main
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end:
@@ -3466,7 +3457,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+16*0, 64
call m(idct_8x8_internal_8bpc).main
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end1:
@@ -3514,11 +3505,11 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
call .main
.pass2:
- lea r3, [o(m(idct_8x32_internal_8bpc).end6)]
+ lea r3, [o(.end6)]
.end:
mova [rsp+gprsize+16*0 ], m7
- lea tx2q, [o(m(idct_8x32_internal_8bpc).end2)]
+ lea tx2q, [o(.end2)]
.end1:
pxor m7, m7
@@ -3530,21 +3521,21 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp tx2q
.end2:
- lea tx2q, [o(m(idct_8x32_internal_8bpc).end3)]
+ lea tx2q, [o(.end3)]
jmp m(idct_8x8_internal_8bpc).end
.end3:
LOAD_8ROWS rsp+gprsize+16*11, 16
mova [rsp+gprsize+16*0 ], m7
lea dstq, [dstq+strideq*2]
- lea tx2q, [o(m(idct_8x32_internal_8bpc).end4)]
+ lea tx2q, [o(.end4)]
jmp m(idct_8x8_internal_8bpc).end
.end4:
LOAD_8ROWS rsp+gprsize+16*19, 16
mova [rsp+gprsize+16*0 ], m7
lea dstq, [dstq+strideq*2]
- lea tx2q, [o(m(idct_8x32_internal_8bpc).end5)]
+ lea tx2q, [o(.end5)]
jmp m(idct_8x8_internal_8bpc).end
.end5:
@@ -3883,7 +3874,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
movd m2, [o(pw_8192)]
mov [coeffq], eobd
mov r3d, 8
- lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
+ lea tx2q, [o(.end)]
.body:
pmulhrsw m0, m2
@@ -3919,7 +3910,6 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
- %undef cmp
LOAD_8ROWS coeffq+16*0, 64
call m(idct_8x8_internal_8bpc).main
SAVE_7ROWS rsp+gprsize+16*3, 16
@@ -3958,55 +3948,55 @@ cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass2:
mova [rsp+gprsize+16*0 ], m7
- lea tx2q, [o(m(idct_32x8_internal_8bpc).end)]
+ lea tx2q, [o(.end)]
jmp m(idct_8x32_internal_8bpc).end1
.end:
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_32x8_internal_8bpc).end1)]
+ lea tx2q, [o(.end1)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.end1:
lea r3, [dstq+8]
- lea tx2q, [o(m(idct_32x8_internal_8bpc).end2)]
+ lea tx2q, [o(.end2)]
jmp m(idct_8x8_internal_8bpc).pass2_main
.end2:
LOAD_8ROWS rsp+gprsize+16*11, 16
mova [rsp+gprsize+16*0 ], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_32x8_internal_8bpc).end3)]
+ lea tx2q, [o(.end3)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.end3:
mov dstq, r3
add r3, 8
- lea tx2q, [o(m(idct_32x8_internal_8bpc).end4)]
+ lea tx2q, [o(.end4)]
jmp m(idct_8x8_internal_8bpc).pass2_main
.end4:
LOAD_8ROWS rsp+gprsize+16*19, 16
mova [rsp+gprsize+16*0 ], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_32x8_internal_8bpc).end5)]
+ lea tx2q, [o(.end5)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.end5:
mov dstq, r3
add r3, 8
- lea tx2q, [o(m(idct_32x8_internal_8bpc).end6)]
+ lea tx2q, [o(.end6)]
jmp m(idct_8x8_internal_8bpc).pass2_main
.end6:
LOAD_8ROWS rsp+gprsize+16*27, 16
mova [rsp+gprsize+16*0 ], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_32x8_internal_8bpc).end7)]
+ lea tx2q, [o(.end7)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.end7:
mov dstq, r3
- lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
+ lea tx2q, [o(.end8)]
jmp m(idct_8x8_internal_8bpc).pass2_main
.end8:
@@ -4085,6 +4075,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
test eobd, eobd
jz .dconly
call m(idct_16x32_internal_8bpc)
+.end:
RET
.dconly:
@@ -4094,28 +4085,24 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
mov [coeffq], eobd
pmulhrsw m0, m1
mov r2d, 16
- lea tx2q, [o(m(inv_txfm_add_dct_dct_16x32_8bpc).end)]
+ lea tx2q, [o(.end)]
jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
-.end:
- RET
cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
- %undef cmp
-
LOAD_8ROWS coeffq+16*1, 128, 1
call m(idct_8x8_internal_8bpc).main
SAVE_7ROWS rsp+gprsize+16*3, 16
LOAD_8ROWS coeffq+16*5, 128, 1
call m(idct_16x8_internal_8bpc).main
- lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end:
SAVE_8ROWS coeffq+16*33, 64 ;in8~in15
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end1:
@@ -4132,14 +4119,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_7ROWS rsp+gprsize+16*3, 16
LOAD_8ROWS coeffq+16*4, 128, 1
call m(idct_16x8_internal_8bpc).main
- lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end2)]
+ lea tx2q, [o(.pass1_end2)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end2:
SAVE_8ROWS coeffq+16*32, 64 ;in0~in7
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end3)]
+ lea tx2q, [o(.pass1_end3)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end3:
@@ -4182,14 +4169,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_7ROWS rsp+gprsize+16*3, 16
LOAD_8ROWS coeffq+16*6, 128, 1
call m(idct_16x8_internal_8bpc).main
- lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end4)]
+ lea tx2q, [o(.pass1_end4)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end4:
SAVE_8ROWS coeffq+16*34, 64 ;in16~in23
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end5)]
+ lea tx2q, [o(.pass1_end5)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end5:
@@ -4207,14 +4194,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_7ROWS rsp+gprsize+16*3, 16
LOAD_8ROWS coeffq+16*7, 128, 1
call m(idct_16x8_internal_8bpc).main
- lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end6)]
+ lea tx2q, [o(.pass1_end6)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end6:
SAVE_8ROWS coeffq+16*35, 64 ;in24~in31
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end7)]
+ lea tx2q, [o(.pass1_end7)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end7:
@@ -4246,7 +4233,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
mov [rsp+gprsize*1+16*35], eobd
lea r3, [dstq+8]
mov [rsp+gprsize*2+16*35], r3
- lea r3, [o(m(idct_16x32_internal_8bpc).end)]
+ lea r3, [o(.end)]
jmp m(idct_8x32_internal_8bpc).end
.end:
@@ -4296,7 +4283,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_8ROWS rsp+gprsize+16*11, 16
call m(idct_8x32_internal_8bpc).main_fast
- jmp .end1
+ jmp m(idct_8x32_internal_8bpc).pass2
.full1:
mova m4, [coeffq+16*2 ] ;in16
@@ -4337,12 +4324,9 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
mova [rsp+gprsize+16*34], m7 ;in31
call m(idct_8x32_internal_8bpc).main
-
-.end1:
jmp m(idct_8x32_internal_8bpc).pass2
-
cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
LEA r5, $$
@@ -4390,10 +4374,8 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
- %undef cmp
-
add coeffq, 16
- lea r3, [o(m(idct_32x16_internal_8bpc).pass1_end1)]
+ lea r3, [o(.pass1_end1)]
.pass1:
LOAD_8ROWS coeffq+16*0, 128, 1
call m(idct_8x8_internal_8bpc).main
@@ -4434,28 +4416,28 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_8ROWS coeffq+16*0, 32
LOAD_8ROWS rsp+gprsize+16*11, 16
mova [rsp+gprsize+16*0 ], m7
- lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end2)]
+ lea tx2q, [o(.pass1_end2)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end2:
SAVE_8ROWS coeffq+16*16, 32
LOAD_8ROWS rsp+gprsize+16*19, 16
mova [rsp+gprsize+16*0 ], m7
- lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end3)]
+ lea tx2q, [o(.pass1_end3)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end3:
SAVE_8ROWS coeffq+16*32, 32
LOAD_8ROWS rsp+gprsize+16*27, 16
mova [rsp+gprsize+16*0 ], m7
- lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end4)]
+ lea tx2q, [o(.pass1_end4)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end4:
SAVE_8ROWS coeffq+16*48, 32
sub coeffq, 16
- lea r3, [o(m(idct_32x16_internal_8bpc).end)]
+ lea r3, [o(.end)]
jmp .pass1
.end:
@@ -4463,8 +4445,6 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
- %undef cmp
-
mov r4d, eobd
cmp eobd, 43 ;if (eob > 43)
sbb r3d, r3d ; iteration_count++
@@ -4528,8 +4508,6 @@ cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, c
cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
- %undef cmp
-
mov r4d, 12 ;0100b
mov r5d, 136 ;1000 1000b
cmp eobd, 44 ;if (eob > 43)
@@ -4608,8 +4586,6 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
- %undef cmp
-
mov r4d, 2
sub eobd, 136
mov [rsp+gprsize*1+16*35], eobd
@@ -4684,7 +4660,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end:
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end1:
@@ -4692,7 +4668,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*11, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end2)]
+ lea tx2q, [o(.pass1_end2)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end2:
@@ -4700,7 +4676,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*19, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end3)]
+ lea tx2q, [o(.pass1_end3)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end3:
@@ -4708,7 +4684,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*27, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end4)]
+ lea tx2q, [o(.pass1_end4)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end4:
@@ -4722,7 +4698,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass2:
mov coeffq, [rsp+gprsize*2+16*35]
mov r3d, 4
- lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
+ lea tx2q, [o(.pass2_end)]
.pass2_loop:
mov [rsp+gprsize*3+16*35], r3d
@@ -4818,11 +4794,11 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp tx2q
.pass2_end:
- lea r3, [o(m(idct_32x32_internal_8bpc).pass2_end1)]
+ lea r3, [o(.pass2_end1)]
jmp m(idct_8x32_internal_8bpc).end
.pass2_end1:
- lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
+ lea tx2q, [o(.pass2_end)]
add coeffq, 16*32
mov dstq, [rsp+gprsize*2+16*35]
mov r3d, [rsp+gprsize*3+16*35]
@@ -4833,8 +4809,6 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
- %undef cmp
-
mov r4d, 2
cmp eobd, 136
mov r3d, 4
@@ -4895,8 +4869,8 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
%endif
test eobd, eobd
jz .dconly
-
call m(idct_16x64_internal_8bpc)
+.end:
RET
.dconly:
@@ -4905,16 +4879,11 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
movd m2, [o(pw_8192)]
mov [coeffq], eobd
mov r2d, 32
- lea tx2q, [o(m(inv_txfm_add_dct_dct_16x64_8bpc).end)]
+ lea tx2q, [o(.end)]
jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
-.end:
- RET
-
cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
- %undef cmp
-
mov r4d, 2
sub eobd, 151
mov [rsp+gprsize*1+16*67], eobd
@@ -4934,7 +4903,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+64*1, 64*2
call m(idct_16x8_internal_8bpc).main
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end:
@@ -4942,7 +4911,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end1:
@@ -4956,7 +4925,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
mov r3d, 2
lea r4, [dstq+8]
mov [rsp+gprsize*2+16*67], r4
- lea r4, [o(m(idct_16x64_internal_8bpc).end1)]
+ lea r4, [o(.end1)]
.pass2_loop:
mov [rsp+gprsize*3+16*67], r3d
@@ -5082,23 +5051,47 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.end1:
LOAD_8ROWS rsp+gprsize+16*35, 16
lea dstq, [dstq+strideq*2]
- add rsp, 16*32
- lea r3, [o(m(idct_16x64_internal_8bpc).end2)]
- jmp m(idct_8x32_internal_8bpc).end
-
-.end2:
- add coeffq, 16*32
- sub rsp, 16*32
-
+ lea r3, [rsp+16*32+gprsize]
+ call .write
mov dstq, [rsp+gprsize*2+16*67]
mov r3d, [rsp+gprsize*3+16*67]
lea r4, [dstq+8]
mov [rsp+gprsize*2+16*67], r4
- lea r4, [o(m(idct_16x64_internal_8bpc).end1)]
+ lea r4, [o(.end1)]
dec r3d
jg .pass2_loop
ret
+.write:
+ mova [r3+16*0], m7
+ mov r4, -16*32
+ pxor m7, m7
+ sub coeffq, r4
+.zero_loop:
+ mova [coeffq+r4+16*0], m7
+ mova [coeffq+r4+16*1], m7
+ add r4, 16*2
+ jl .zero_loop
+ call .write_main2
+ LOAD_8ROWS r3+16*11, 16
+ call .write_main
+ LOAD_8ROWS r3+16*19, 16
+ call .write_main
+ LOAD_8ROWS r3+16*27, 16
+.write_main:
+ mova [r3+16*0], m7
+.write_main2:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [r3+16*0]
+ mova [r3+16*2], m5
+ mova [r3+16*1], m6
+ mova [r3+16*0], m7
+ WRITE_8X4 0, 1, 2, 3, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ ret
ALIGN function_align
@@ -5765,7 +5758,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eo
movd m2, [o(pw_8192)]
mov [coeffq], eobd
mov r3d, 16
- lea tx2q, [o(m(inv_txfm_add_dct_dct_64x16_8bpc).end)]
+ lea tx2q, [o(.end)]
.body:
pmulhrsw m0, m2
@@ -5895,7 +5888,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end:
@@ -5903,7 +5896,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*11, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end1:
@@ -5911,7 +5904,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*19, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end2)]
+ lea tx2q, [o(.pass1_end2)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end2:
@@ -5919,7 +5912,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*27, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end3)]
+ lea tx2q, [o(.pass1_end3)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end3:
@@ -5927,7 +5920,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*35, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end4)]
+ lea tx2q, [o(.pass1_end4)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end4:
@@ -5935,7 +5928,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*43, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end5)]
+ lea tx2q, [o(.pass1_end5)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end5:
@@ -5943,7 +5936,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*51, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end6)]
+ lea tx2q, [o(.pass1_end6)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end6:
@@ -5951,7 +5944,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*59, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end7)]
+ lea tx2q, [o(.pass1_end7)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end7:
@@ -5979,14 +5972,14 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
call m(idct_16x8_internal_8bpc).main
mov r3, dstq
- lea tx2q, [o(m(idct_64x16_internal_8bpc).end)]
+ lea tx2q, [o(.end)]
lea dstq, [dstq+strideq*8]
jmp m(idct_8x8_internal_8bpc).end
.end:
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_64x16_internal_8bpc).end1)]
+ lea tx2q, [o(.end1)]
mov dstq, r3
jmp m(idct_8x8_internal_8bpc).end
@@ -6016,14 +6009,14 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
call m(idct_16x8_internal_8bpc).main
mov r3, dstq
- lea tx2q, [o(m(idct_64x16_internal_8bpc).end2)]
+ lea tx2q, [o(.end2)]
lea dstq, [dstq+strideq*8]
jmp m(idct_8x8_internal_8bpc).end
.end2:
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_64x16_internal_8bpc).end3)]
+ lea tx2q, [o(.end3)]
mov dstq, r3
jmp m(idct_8x8_internal_8bpc).end
@@ -6045,8 +6038,8 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
%endif
test eobd, eobd
jz .dconly
-
call m(idct_32x64_internal_8bpc)
+.end:
RET
.dconly:
@@ -6056,16 +6049,11 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
mov [coeffq], eobd
pmulhrsw m0, m1
mov r3d, 64
- lea tx2q, [o(m(inv_txfm_add_dct_dct_32x64_8bpc).end)]
+ lea tx2q, [o(.end)]
jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
-.end:
- RET
-
cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
- %undef cmp
-
mov r4d, 2
sub eobd, 136
mov [rsp+gprsize*1+16*67], eobd
@@ -6133,28 +6121,28 @@ cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1_end:
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end1:
SAVE_8ROWS coeffq+64*0, 64
LOAD_8ROWS rsp+gprsize+16*11, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end2)]
+ lea tx2q, [o(.pass1_end2)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end2:
SAVE_8ROWS coeffq+64*8, 64
LOAD_8ROWS rsp+gprsize+16*19, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end3)]
+ lea tx2q, [o(.pass1_end3)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end3:
SAVE_8ROWS coeffq+64*16, 64
LOAD_8ROWS rsp+gprsize+16*27, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end4)]
+ lea tx2q, [o(.pass1_end4)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end4:
@@ -6179,8 +6167,8 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo
%endif
test eobd, eobd
jz .dconly
-
call m(idct_64x32_internal_8bpc)
+.end:
RET
.dconly:
@@ -6190,15 +6178,11 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo
pmulhrsw m0, m1
mov [coeffq], eobd
mov r3d, 32
- lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]
+ lea tx2q, [o(.end)]
jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
-.end:
- RET
cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
- %undef cmp
-
mov r4d, 2
sub eobd, 136
mov [rsp+gprsize*1+16*67], eobd
@@ -6266,56 +6250,56 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end:
SAVE_8ROWS coeffq+64*0, 64
LOAD_8ROWS rsp+gprsize+16*11, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end1:
SAVE_8ROWS coeffq+64*8, 64
LOAD_8ROWS rsp+gprsize+16*19, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end2)]
+ lea tx2q, [o(.pass1_end2)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end2:
SAVE_8ROWS coeffq+64*16, 64
LOAD_8ROWS rsp+gprsize+16*27, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end3)]
+ lea tx2q, [o(.pass1_end3)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end3:
SAVE_8ROWS coeffq+64*24, 64
LOAD_8ROWS rsp+gprsize+16*35, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end4)]
+ lea tx2q, [o(.pass1_end4)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end4:
SAVE_8ROWS dstq+64*0, 64
LOAD_8ROWS rsp+gprsize+16*43, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end5)]
+ lea tx2q, [o(.pass1_end5)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end5:
SAVE_8ROWS dstq+64*8, 64
LOAD_8ROWS rsp+gprsize+16*51, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end6)]
+ lea tx2q, [o(.pass1_end6)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end6:
SAVE_8ROWS dstq+64*16, 64
LOAD_8ROWS rsp+gprsize+16*59, 16
mova [rsp+gprsize+16*0], m7
- lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end7)]
+ lea tx2q, [o(.pass1_end7)]
jmp m(idct_8x8_internal_8bpc).pass1_end
.pass1_end7:
@@ -6332,17 +6316,17 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
mov eobd, [rsp+gprsize*1+16*67]
lea dstq, [dstq+32]
mov [rsp+gprsize*1+16*35], eobd
- lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)]
+ lea tx2q, [o(.pass2_end)]
mov r3d, 4
jmp m(idct_32x32_internal_8bpc).pass2_loop
.pass2_end:
mova [rsp+gprsize+16*0], m7
- lea r3, [o(m(idct_64x32_internal_8bpc).pass2_end1)]
+ lea r3, [o(.pass2_end1)]
jmp m(idct_8x32_internal_8bpc).end2
.pass2_end1:
- lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)]
+ lea tx2q, [o(.pass2_end)]
add coeffq, 16*32
mov dstq, [rsp+gprsize*2+16*35]
mov r3d, [rsp+gprsize*3+16*35]
@@ -6377,8 +6361,6 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo
jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
- %undef cmp
-
mov r5d, 4
mov r4d, 2
sub eobd, 136
@@ -6448,7 +6430,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*3, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end)]
+ lea tx2q, [o(.pass1_end)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end:
@@ -6456,7 +6438,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*11, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end1)]
+ lea tx2q, [o(.pass1_end1)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end1:
@@ -6464,7 +6446,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*19, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end2)]
+ lea tx2q, [o(.pass1_end2)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end2:
@@ -6472,7 +6454,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*27, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end3)]
+ lea tx2q, [o(.pass1_end3)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end3:
@@ -6480,7 +6462,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*35, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end4)]
+ lea tx2q, [o(.pass1_end4)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end4:
@@ -6488,7 +6470,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*43, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end5)]
+ lea tx2q, [o(.pass1_end5)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end5:
@@ -6496,7 +6478,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*51, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end6)]
+ lea tx2q, [o(.pass1_end6)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end6:
@@ -6504,7 +6486,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS rsp+gprsize+16*59, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
- lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end7)]
+ lea tx2q, [o(.pass1_end7)]
jmp m(idct_8x8_internal_8bpc).pass1_end1
.pass1_end7:
@@ -6522,26 +6504,20 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
mov r3d, 4
lea r4, [dstq+8]
mov [rsp+gprsize*2+16*67], r4
- lea r4, [o(m(idct_64x64_internal_8bpc).pass2_end)]
+ lea r4, [o(.pass2_end)]
jmp m(idct_16x64_internal_8bpc).pass2_loop
.pass2_end:
LOAD_8ROWS rsp+gprsize+16*35, 16
lea dstq, [dstq+strideq*2]
- add rsp, 16*32
+ lea r3, [rsp+16*32+gprsize]
mova [rsp+gprsize+16*0], m7
- lea r3, [o(m(idct_64x64_internal_8bpc).pass2_end1)]
- jmp m(idct_8x32_internal_8bpc).end2
-
-.pass2_end1:
- add coeffq, 16*32
- sub rsp, 16*32
-
+ call m(idct_16x64_internal_8bpc).write
mov dstq, [rsp+gprsize*2+16*67]
mov r3d, [rsp+gprsize*3+16*67]
lea r4, [dstq+8]
mov [rsp+gprsize*2+16*67], r4
- lea r4, [o(m(idct_64x64_internal_8bpc).pass2_end)]
+ lea r4, [o(.pass2_end)]
dec r3d
jg m(idct_16x64_internal_8bpc).pass2_loop
diff --git a/third_party/dav1d/src/x86/loopfilter16_avx2.asm b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
index 0c8618655c46d..361ccc3b883ce 100644
--- a/third_party/dav1d/src/x86/loopfilter16_avx2.asm
+++ b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
@@ -49,14 +49,6 @@ pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
; in: out:
; mm%1 a b c d a e i m
; mm%2 e f g h b f j n
diff --git a/third_party/dav1d/src/x86/loopfilter16_sse.asm b/third_party/dav1d/src/x86/loopfilter16_sse.asm
index 3ec3fd81fe3f0..c486b57a2113a 100644
--- a/third_party/dav1d/src/x86/loopfilter16_sse.asm
+++ b/third_party/dav1d/src/x86/loopfilter16_sse.asm
@@ -106,14 +106,6 @@ ASSERT ARCH_X86_32
%endif
%endmacro
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%macro SPLATD 2
movd %1, %2
pshufd %1, %1, q0000
diff --git a/third_party/dav1d/src/x86/looprestoration16_avx2.asm b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
index 98f51d8f1e5a7..ef25c28474416 100644
--- a/third_party/dav1d/src/x86/looprestoration16_avx2.asm
+++ b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
@@ -66,14 +66,6 @@ cextern sgr_x_by_x_avx2
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
INIT_YMM avx2
diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm
index 3dacfe66a6cf7..8b2ec4fa91fa7 100644
--- a/third_party/dav1d/src/x86/mc16_avx2.asm
+++ b/third_party/dav1d/src/x86/mc16_avx2.asm
@@ -202,14 +202,6 @@ cextern resize_filter
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f({%1})
-%endrep
-%endmacro
-
INIT_XMM avx2
cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
mov mxyd, r6m ; mx
diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm
index c2ea090b0c5ba..e83b18ad969cb 100644
--- a/third_party/dav1d/src/x86/mc16_avx512.asm
+++ b/third_party/dav1d/src/x86/mc16_avx512.asm
@@ -254,14 +254,6 @@ cextern resize_filter
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%if WIN64
DECLARE_REG_TMP 4
%else
diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm
index 6435bd083cb18..fde8e372a3fed 100644
--- a/third_party/dav1d/src/x86/mc16_sse.asm
+++ b/third_party/dav1d/src/x86/mc16_sse.asm
@@ -166,14 +166,6 @@ cextern resize_filter
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%if UNIX64
DECLARE_REG_TMP 7
%else
@@ -4799,9 +4791,7 @@ INIT_XMM ssse3
psrad m6, hsh_mem
packssdw m11, m6 ; 7 8
%if ARCH_X86_64
- ; fixme a bug in x86inc.asm forces us to explicitly load m9
- mova m9, [stk+0x40]
- shufps m9, m11, q1032 ; 6 7
+ shufps m9, [stk+0x40], m11, q1032 ; 6 7
mova m0, [stk+0x00]
mova [stk+0x40], m11
%else
diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm
index fb55449f335a8..eb3ca1c427da5 100644
--- a/third_party/dav1d/src/x86/mc_avx512.asm
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
@@ -303,14 +303,6 @@ BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
SECTION .text
-%macro REPX 2-*
- %xdefine %%f(x) %1
-%rep %0 - 1
- %rotate 1
- %%f(%1)
-%endrep
-%endmacro
-
%macro WRAP_YMM 1+
INIT_YMM cpuname
%1
diff --git a/third_party/dav1d/tests/checkasm/filmgrain.c b/third_party/dav1d/tests/checkasm/filmgrain.c
index a44a3ac422cb3..ff7ffc36c68a6 100644
--- a/third_party/dav1d/tests/checkasm/filmgrain.c
+++ b/third_party/dav1d/tests/checkasm/filmgrain.c
@@ -30,7 +30,7 @@
#include <string.h>
#include "src/levels.h"
-#include "src/film_grain.h"
+#include "src/filmgrain.h"
#define UNIT_TEST 1
#include "src/fg_apply_tmpl.c"
@@ -155,6 +155,7 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 16,);
+ ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,);
fg_data[0].seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
@@ -163,7 +164,6 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
const int bitdepth_max = 0xff;
#endif
- uint8_t scaling[SCALING_SIZE];
entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
fg_data[0].grain_scale_shift = rnd() & 3;
fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
@@ -267,6 +267,7 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
BITDEPTH, ss_name[layout_idx], csfl))
{
ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,);
+ ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,);
fg_data[0].seed = rnd() & 0xFFFF;
@@ -278,7 +279,6 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
const int uv_pl = rnd() & 1;
const int is_identity = rnd() & 1;
- uint8_t scaling[SCALING_SIZE];
entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
fg_data[0].grain_scale_shift = rnd() & 3;
fg_data[0].ar_coeff_shift = (rnd() & 3) + 6;
@@ -368,7 +368,7 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
checkasm_check_pixel_padded_align(c_dst, stride,
a_dst, stride,
w, h, "dst",
- 32 >> ss_x, 2);
+ 32 >> ss_x, 4);
}
}
@@ -380,7 +380,7 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max;
}
}
- bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16,
+ bench_new(a_dst, src, stride, fg_data, 64 >> ss_x, scaling, grain_lut[1], 32 >> ss_y,
1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
}
}
--
To stop receiving notification emails like this one, please contact
the administrator of this repository.
More information about the tbb-commits
mailing list